From 064af42167bf4fc9aaea2702d80ce08074b889c0 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Wed, 8 Jul 2009 13:19:16 -0700 Subject: [PATCH 1/1] Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45. --- .gitignore | 43 + COPYING | 29 + CodingStyle | 504 +++ INSTALL | 514 +++ Makefile.am | 74 + README | 74 + acinclude.m4 | 195 + boot.sh | 2 + configure.ac | 89 + datapath/.gitignore | 7 + datapath/Makefile.am | 12 + datapath/Modules.mk | 32 + datapath/actions.c | 421 +++ datapath/actions.h | 18 + datapath/brc_procfs.c | 185 + datapath/brc_procfs.h | 11 + datapath/brc_sysfs.h | 25 + datapath/brc_sysfs_dp.c | 532 +++ datapath/brc_sysfs_if.c | 334 ++ datapath/brcompat.c | 519 +++ datapath/compat.h | 17 + datapath/datapath.c | 1611 ++++++++ datapath/datapath.h | 139 + datapath/dp_dev.c | 210 ++ datapath/dp_dev.h | 27 + datapath/dp_notify.c | 29 + datapath/flow.c | 301 ++ datapath/flow.h | 49 + datapath/linux-2.6/.gitignore | 20 + datapath/linux-2.6/Kbuild.in | 34 + datapath/linux-2.6/Makefile.in | 9 + datapath/linux-2.6/Makefile.main.in | 82 + datapath/linux-2.6/Modules.mk | 50 + datapath/linux-2.6/compat-2.6/compat26.h | 37 + .../linux-2.6/compat-2.6/genetlink-brcompat.c | 20 + .../compat-2.6/genetlink-openvswitch.c | 22 + .../compat-2.6/include/asm-generic/bug.h | 19 + .../compat-2.6/include/linux/cpumask.h | 11 + .../linux-2.6/compat-2.6/include/linux/dmi.h | 114 + .../linux-2.6/compat-2.6/include/linux/err.h | 21 + .../linux-2.6/compat-2.6/include/linux/icmp.h | 13 + .../compat-2.6/include/linux/if_arp.h | 15 + .../linux-2.6/compat-2.6/include/linux/ip.h | 18 + .../linux-2.6/compat-2.6/include/linux/ipv6.h | 13 + .../compat-2.6/include/linux/jiffies.h | 26 + .../compat-2.6/include/linux/kernel.h | 9 + .../compat-2.6/include/linux/lockdep.h | 450 +++ .../linux-2.6/compat-2.6/include/linux/log2.h | 17 + .../compat-2.6/include/linux/mutex.h | 59 + .../compat-2.6/include/linux/netdevice.h | 35 + .../include/linux/netfilter_bridge.h | 24 + .../compat-2.6/include/linux/netfilter_ipv4.h | 19 + .../compat-2.6/include/linux/netlink.h | 24 + .../compat-2.6/include/linux/percpu.h | 10 + .../compat-2.6/include/linux/random.h | 17 + .../compat-2.6/include/linux/rculist.h | 12 + .../compat-2.6/include/linux/rtnetlink.h | 29 + .../compat-2.6/include/linux/skbuff.h | 170 + .../linux-2.6/compat-2.6/include/linux/tcp.h | 18 + .../compat-2.6/include/linux/timer.h | 96 + .../compat-2.6/include/linux/types.h | 14 + .../linux-2.6/compat-2.6/include/linux/udp.h | 13 + .../compat-2.6/include/linux/workqueue.h | 42 + .../compat-2.6/include/net/checksum.h | 16 + .../compat-2.6/include/net/genetlink.h | 123 + .../compat-2.6/include/net/netlink.h | 22 + datapath/linux-2.6/compat-2.6/random32.c | 144 + datapath/linux-2.6/compat-2.6/veth.c | 537 +++ .../config/config-linux-2.6.23-rc9-kvm | 1408 +++++++ datapath/table.c | 240 ++ debian/.gitignore | 19 + debian/automake.mk | 50 + debian/changelog | 5 + debian/commands/reconfigure | 128 + debian/commands/update | 4 + debian/compat | 1 + debian/control | 143 + debian/control.modules.in | 20 + debian/copyright | 21 + debian/corekeeper.cron.daily | 5 + debian/corekeeper.init | 63 + debian/dirs | 2 + debian/openvswitch-common.dirs | 1 + debian/openvswitch-common.install | 3 + debian/openvswitch-common.manpages | 2 + debian/openvswitch-controller.README.Debian | 12 + debian/openvswitch-controller.default | 29 + debian/openvswitch-controller.dirs | 1 + debian/openvswitch-controller.init | 269 ++ debian/openvswitch-controller.install | 1 + debian/openvswitch-controller.manpages | 1 + debian/openvswitch-controller.postinst | 52 + ...atapath-module-_KVERS_.postinst.modules.in | 25 + .../openvswitch-datapath-source.README.Debian | 31 + debian/openvswitch-datapath-source.copyright | 15 + debian/openvswitch-datapath-source.dirs | 1 + debian/openvswitch-datapath-source.install | 6 + debian/openvswitch-monitor.default | 27 + debian/openvswitch-monitor.dirs | 1 + debian/openvswitch-monitor.init | 174 + debian/openvswitch-monitor.install | 1 + debian/openvswitch-pki-server.apache2 | 1 + debian/openvswitch-pki-server.dirs | 1 + debian/openvswitch-pki-server.install | 1 + debian/openvswitch-pki-server.postinst | 44 + debian/openvswitch-pki.postinst | 41 + debian/openvswitch-switch-config.dirs | 1 + debian/openvswitch-switch-config.install | 1 + debian/openvswitch-switch-config.manpages | 1 + debian/openvswitch-switch-config.overrides | 1 + debian/openvswitch-switch-config.templates | 228 ++ debian/openvswitch-switch.README.Debian | 18 + debian/openvswitch-switch.dirs | 2 + debian/openvswitch-switch.init | 428 +++ debian/openvswitch-switch.install | 7 + debian/openvswitch-switch.logrotate | 11 + debian/openvswitch-switch.manpages | 5 + debian/openvswitch-switch.postinst | 51 + debian/openvswitch-switch.postrm | 43 + debian/openvswitch-switch.template | 165 + debian/openvswitch-switchui.copyright | 33 + debian/openvswitch-switchui.default | 35 + debian/openvswitch-switchui.dirs | 3 + debian/openvswitch-switchui.init | 210 ++ debian/openvswitch-switchui.install | 2 + debian/openvswitch-wdt.default | 24 + debian/openvswitch-wdt.dirs | 2 + debian/openvswitch-wdt.init | 176 + debian/openvswitch-wdt.install | 1 + debian/ovs-switch-setup | 615 +++ debian/ovs-switch-setup.8 | 41 + debian/po/POTFILES.in | 1 + debian/po/templates.pot | 522 +++ debian/rules | 145 + extras/ezio/automake.mk | 49 + extras/ezio/byteq.c | 216 ++ extras/ezio/byteq.h | 57 + extras/ezio/ezio-term.c | 1060 ++++++ extras/ezio/ezio.c | 243 ++ extras/ezio/ezio.h | 96 + extras/ezio/ezio3.ti | 21 + extras/ezio/ovs-switchui.c | 3026 +++++++++++++++ extras/ezio/terminal.c | 833 +++++ extras/ezio/terminal.h | 41 + extras/ezio/tty.c | 404 ++ extras/ezio/tty.h | 39 + extras/ezio/vt-dummy.c | 40 + extras/ezio/vt-linux.c | 139 + extras/ezio/vt.h | 33 + include/.gitignore | 2 + include/automake.mk | 2 + include/openflow/automake.mk | 4 + include/openflow/nicira-ext.h | 109 + include/openflow/openflow-mgmt.h | 194 + include/openflow/openflow.h | 796 ++++ include/openvswitch/automake.mk | 4 + include/openvswitch/brcompat-netlink.h | 56 + include/openvswitch/datapath-protocol.h | 287 ++ lib/.gitignore | 4 + lib/automake.mk | 184 + lib/backtrace.c | 106 + lib/backtrace.h | 31 + lib/bitmap.c | 55 + lib/bitmap.h | 82 + lib/cfg.c | 1182 ++++++ lib/cfg.h | 98 + lib/classifier.c | 832 +++++ lib/classifier.h | 149 + lib/command-line.c | 49 + lib/command-line.h | 25 + lib/common.man | 7 + lib/compiler.h | 30 + lib/coverage-counters.h | 25 + lib/coverage-scan.pl | 47 + lib/coverage.c | 163 + lib/coverage.h | 58 + lib/csum.c | 98 + lib/csum.h | 31 + lib/daemon.c | 294 ++ lib/daemon.h | 51 + lib/daemon.man | 21 + lib/dh1024.pem | 10 + lib/dh2048.pem | 12 + lib/dh4096.pem | 18 + lib/dhcp-client.c | 1073 ++++++ lib/dhcp-client.h | 56 + lib/dhcp.c | 825 ++++ lib/dhcp.h | 262 ++ lib/dhparams.h | 10 + lib/dirs.h | 25 + lib/dpif.c | 1060 ++++++ lib/dpif.h | 102 + lib/dpif.man | 16 + lib/dynamic-string.c | 261 ++ lib/dynamic-string.h | 62 + lib/fatal-signal.c | 253 ++ lib/fatal-signal.h | 39 + lib/fault.c | 73 + lib/fault.h | 23 + lib/flow.c | 280 ++ lib/flow.h | 101 + lib/hash.c | 85 + lib/hash.h | 71 + lib/hmap.c | 145 + lib/hmap.h | 254 ++ lib/leak-checker.c | 244 ++ lib/leak-checker.h | 41 + lib/leak-checker.man | 7 + lib/learning-switch.c | 644 ++++ lib/learning-switch.h | 33 + lib/list.c | 158 + lib/list.h | 71 + lib/mac-learning.c | 285 ++ lib/mac-learning.h | 37 + lib/netdev.c | 1556 ++++++++ lib/netdev.h | 116 + lib/netlink-protocol.h | 141 + lib/netlink.c | 1077 ++++++ lib/netlink.h | 127 + lib/odp-util.c | 133 + lib/odp-util.h | 82 + lib/ofp-print.c | 1473 ++++++++ lib/ofp-print.h | 44 + lib/ofpbuf.c | 288 ++ lib/ofpbuf.h | 73 + lib/packets.h | 274 ++ lib/pcap.c | 163 + lib/pcap.h | 30 + lib/poll-loop.c | 267 ++ lib/poll-loop.h | 55 + lib/port-array.c | 183 + lib/port-array.h | 95 + lib/process.c | 417 +++ lib/process.h | 48 + lib/queue.c | 120 + lib/queue.h | 34 + lib/random.c | 90 + lib/random.h | 30 + lib/rconn.c | 959 +++++ lib/rconn.h | 104 + lib/sat-math.h | 46 + lib/sha1.c | 394 ++ lib/sha1.h | 74 + lib/shash.c | 93 + lib/shash.h | 42 + lib/signals.c | 128 + lib/signals.h | 27 + lib/socket-util.c | 343 ++ lib/socket-util.h | 39 + lib/stp.c | 1226 ++++++ lib/stp.h | 103 + lib/svec.c | 381 ++ lib/svec.h | 60 + lib/tag.c | 82 + lib/tag.h | 134 + lib/timeval.c | 305 ++ lib/timeval.h | 51 + lib/type-props.h | 32 + lib/unixctl.c | 592 +++ lib/unixctl.h | 44 + lib/util.c | 296 ++ lib/util.h | 117 + lib/valgrind.h | 26 + lib/vconn-provider.h | 170 + lib/vconn-ssl.c | 1197 ++++++ lib/vconn-ssl.h | 54 + lib/vconn-stream.c | 346 ++ lib/vconn-stream.h | 35 + lib/vconn-tcp.c | 186 + lib/vconn-unix.c | 118 + lib/vconn.c | 1405 +++++++ lib/vconn.h | 128 + lib/vlog-modules.def | 65 + lib/vlog.c | 711 ++++ lib/vlog.h | 192 + lib/vlog.man | 44 + lib/xtoxll.h | 34 + m4/nx-build.m4 | 53 + m4/openvswitch.m4 | 210 ++ secchan/.gitignore | 4 + secchan/automake.mk | 42 + secchan/commands/automake.mk | 3 + secchan/commands/reboot | 3 + secchan/discovery.c | 270 ++ secchan/discovery.h | 38 + secchan/executer.c | 519 +++ secchan/executer.h | 33 + secchan/fail-open.c | 140 + secchan/fail-open.h | 38 + secchan/in-band.c | 358 ++ secchan/in-band.h | 37 + secchan/main.c | 565 +++ secchan/netflow.c | 328 ++ secchan/netflow.h | 33 + secchan/ofproto.c | 3305 +++++++++++++++++ secchan/ofproto.h | 109 + secchan/pinsched.c | 284 ++ secchan/pinsched.h | 35 + secchan/pktbuf.c | 150 + secchan/pktbuf.h | 34 + secchan/secchan.8.in | 463 +++ secchan/status.c | 229 ++ secchan/status.h | 45 + soexpand.pl | 26 + tests/.gitignore | 10 + tests/automake.mk | 56 + tests/flowgen.pl | 224 ++ tests/test-classifier.c | 977 +++++ tests/test-dhcp-client.c | 189 + tests/test-flows.c | 76 + tests/test-flows.sh | 9 + tests/test-hash.c | 155 + tests/test-hmap.c | 281 ++ tests/test-list.c | 159 + tests/test-stp-ieee802.1d-1998 | 12 + tests/test-stp-ieee802.1d-2004-fig17.4 | 31 + tests/test-stp-ieee802.1d-2004-fig17.6 | 14 + tests/test-stp-ieee802.1d-2004-fig17.7 | 17 + tests/test-stp-iol-io-1.1 | 25 + tests/test-stp-iol-io-1.2 | 14 + tests/test-stp-iol-io-1.4 | 13 + tests/test-stp-iol-io-1.5 | 40 + tests/test-stp-iol-op-1.1 | 7 + tests/test-stp-iol-op-1.4 | 8 + tests/test-stp-iol-op-3.1 | 11 + tests/test-stp-iol-op-3.3 | 11 + tests/test-stp-iol-op-3.4 | 11 + tests/test-stp.c | 648 ++++ tests/test-stp.sh | 7 + tests/test-type-props.c | 41 + third-party/.gitignore | 2 + third-party/README | 35 + third-party/automake.mk | 3 + third-party/ofp-tcpdump.patch | 109 + utilities/.gitignore | 22 + utilities/automake.mk | 74 + utilities/nlmon.c | 90 + utilities/ovs-appctl.8.in | 166 + utilities/ovs-appctl.c | 221 ++ utilities/ovs-cfg-mod.8.in | 101 + utilities/ovs-cfg-mod.c | 239 ++ utilities/ovs-controller.8.in | 132 + utilities/ovs-controller.c | 323 ++ utilities/ovs-discover.8.in | 118 + utilities/ovs-discover.c | 405 ++ utilities/ovs-dpctl.8.in | 166 + utilities/ovs-dpctl.c | 552 +++ utilities/ovs-kill.8.in | 60 + utilities/ovs-kill.c | 210 ++ utilities/ovs-monitor | 128 + utilities/ovs-ofctl.8.in | 489 +++ utilities/ovs-ofctl.c | 1278 +++++++ utilities/ovs-parse-leaks.in | 285 ++ utilities/ovs-pki-cgi.in | 41 + utilities/ovs-pki.8.in | 323 ++ utilities/ovs-pki.in | 582 +++ utilities/ovs-wdt.c | 263 ++ vswitchd/.gitignore | 7 + vswitchd/automake.mk | 40 + vswitchd/bridge.c | 3058 +++++++++++++++ vswitchd/bridge.h | 43 + vswitchd/mgmt.c | 679 ++++ vswitchd/mgmt.h | 36 + vswitchd/ovs-brcompatd.8.in | 49 + vswitchd/ovs-brcompatd.c | 766 ++++ vswitchd/ovs-vswitchd.8.in | 87 + vswitchd/ovs-vswitchd.c | 255 ++ vswitchd/ovs-vswitchd.conf.5.in | 642 ++++ vswitchd/ovs-vswitchd.h | 32 + vswitchd/port.c | 68 + vswitchd/port.h | 33 + vswitchd/proc-net-compat.c | 344 ++ vswitchd/proc-net-compat.h | 51 + vswitchd/xenserver.c | 90 + vswitchd/xenserver.h | 32 + xenserver/README | 78 + xenserver/automake.mk | 19 + xenserver/etc_init.d_vswitch | 302 ++ xenserver/etc_init.d_vswitch-xapi-update | 71 + xenserver/etc_logrotate.d_vswitch | 14 + xenserver/etc_profile.d_vswitch.sh | 56 + xenserver/etc_sysconfig_vswitch.example | 79 + .../etc_xapi.d_plugins_vswitch-cfg-update | 123 + xenserver/etc_xensource_scripts_vif | 130 + ...pt_xensource_libexec_interface-reconfigure | 1572 ++++++++ ...xsconsole_plugins-base_XSFeatureVSwitch.py | 296 ++ xenserver/vswitch-xen.spec | 310 ++ 387 files changed, 75535 insertions(+) create mode 100644 .gitignore create mode 100644 COPYING create mode 100644 CodingStyle create mode 100644 INSTALL create mode 100644 Makefile.am create mode 100644 README create mode 100644 acinclude.m4 create mode 100755 boot.sh create mode 100644 configure.ac create mode 100644 datapath/.gitignore create mode 100644 datapath/Makefile.am create mode 100644 datapath/Modules.mk create mode 100644 datapath/actions.c create mode 100644 datapath/actions.h create mode 100644 datapath/brc_procfs.c create mode 100644 datapath/brc_procfs.h create mode 100644 datapath/brc_sysfs.h create mode 100644 datapath/brc_sysfs_dp.c create mode 100644 datapath/brc_sysfs_if.c create mode 100644 datapath/brcompat.c create mode 100644 datapath/compat.h create mode 100644 datapath/datapath.c create mode 100644 datapath/datapath.h create mode 100644 datapath/dp_dev.c create mode 100644 datapath/dp_dev.h create mode 100644 datapath/dp_notify.c create mode 100644 datapath/flow.c create mode 100644 datapath/flow.h create mode 100644 datapath/linux-2.6/.gitignore create mode 100644 datapath/linux-2.6/Kbuild.in create mode 100644 datapath/linux-2.6/Makefile.in create mode 100644 datapath/linux-2.6/Makefile.main.in create mode 100644 datapath/linux-2.6/Modules.mk create mode 100644 datapath/linux-2.6/compat-2.6/compat26.h create mode 100644 datapath/linux-2.6/compat-2.6/genetlink-brcompat.c create mode 100644 datapath/linux-2.6/compat-2.6/genetlink-openvswitch.c create mode 100644 datapath/linux-2.6/compat-2.6/include/asm-generic/bug.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/cpumask.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/dmi.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/err.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/icmp.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/if_arp.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/ip.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/ipv6.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/jiffies.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/kernel.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/lockdep.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/log2.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/mutex.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/netdevice.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/netfilter_bridge.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/netfilter_ipv4.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/netlink.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/percpu.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/random.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/rculist.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/rtnetlink.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/skbuff.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/tcp.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/timer.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/types.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/udp.h create mode 100644 datapath/linux-2.6/compat-2.6/include/linux/workqueue.h create mode 100644 datapath/linux-2.6/compat-2.6/include/net/checksum.h create mode 100644 datapath/linux-2.6/compat-2.6/include/net/genetlink.h create mode 100644 datapath/linux-2.6/compat-2.6/include/net/netlink.h create mode 100644 datapath/linux-2.6/compat-2.6/random32.c create mode 100644 datapath/linux-2.6/compat-2.6/veth.c create mode 100644 datapath/linux-2.6/config/config-linux-2.6.23-rc9-kvm create mode 100644 datapath/table.c create mode 100644 debian/.gitignore create mode 100644 debian/automake.mk create mode 100644 debian/changelog create mode 100755 debian/commands/reconfigure create mode 100755 debian/commands/update create mode 100644 debian/compat create mode 100644 debian/control create mode 100644 debian/control.modules.in create mode 100644 debian/copyright create mode 100755 debian/corekeeper.cron.daily create mode 100755 debian/corekeeper.init create mode 100644 debian/dirs create mode 100644 debian/openvswitch-common.dirs create mode 100644 debian/openvswitch-common.install create mode 100644 debian/openvswitch-common.manpages create mode 100644 debian/openvswitch-controller.README.Debian create mode 100644 debian/openvswitch-controller.default create mode 100644 debian/openvswitch-controller.dirs create mode 100755 debian/openvswitch-controller.init create mode 100644 debian/openvswitch-controller.install create mode 100644 debian/openvswitch-controller.manpages create mode 100755 debian/openvswitch-controller.postinst create mode 100755 debian/openvswitch-datapath-module-_KVERS_.postinst.modules.in create mode 100644 debian/openvswitch-datapath-source.README.Debian create mode 100644 debian/openvswitch-datapath-source.copyright create mode 100644 debian/openvswitch-datapath-source.dirs create mode 100644 debian/openvswitch-datapath-source.install create mode 100644 debian/openvswitch-monitor.default create mode 100644 debian/openvswitch-monitor.dirs create mode 100755 debian/openvswitch-monitor.init create mode 100644 debian/openvswitch-monitor.install create mode 100644 debian/openvswitch-pki-server.apache2 create mode 100644 debian/openvswitch-pki-server.dirs create mode 100644 debian/openvswitch-pki-server.install create mode 100755 debian/openvswitch-pki-server.postinst create mode 100755 debian/openvswitch-pki.postinst create mode 100644 debian/openvswitch-switch-config.dirs create mode 100644 debian/openvswitch-switch-config.install create mode 100644 debian/openvswitch-switch-config.manpages create mode 100644 debian/openvswitch-switch-config.overrides create mode 100644 debian/openvswitch-switch-config.templates create mode 100644 debian/openvswitch-switch.README.Debian create mode 100644 debian/openvswitch-switch.dirs create mode 100755 debian/openvswitch-switch.init create mode 100644 debian/openvswitch-switch.install create mode 100644 debian/openvswitch-switch.logrotate create mode 100644 debian/openvswitch-switch.manpages create mode 100755 debian/openvswitch-switch.postinst create mode 100755 debian/openvswitch-switch.postrm create mode 100644 debian/openvswitch-switch.template create mode 100644 debian/openvswitch-switchui.copyright create mode 100644 debian/openvswitch-switchui.default create mode 100644 debian/openvswitch-switchui.dirs create mode 100755 debian/openvswitch-switchui.init create mode 100644 debian/openvswitch-switchui.install create mode 100644 debian/openvswitch-wdt.default create mode 100644 debian/openvswitch-wdt.dirs create mode 100755 debian/openvswitch-wdt.init create mode 100644 debian/openvswitch-wdt.install create mode 100755 debian/ovs-switch-setup create mode 100644 debian/ovs-switch-setup.8 create mode 100644 debian/po/POTFILES.in create mode 100644 debian/po/templates.pot create mode 100755 debian/rules create mode 100644 extras/ezio/automake.mk create mode 100644 extras/ezio/byteq.c create mode 100644 extras/ezio/byteq.h create mode 100644 extras/ezio/ezio-term.c create mode 100644 extras/ezio/ezio.c create mode 100644 extras/ezio/ezio.h create mode 100644 extras/ezio/ezio3.ti create mode 100644 extras/ezio/ovs-switchui.c create mode 100644 extras/ezio/terminal.c create mode 100644 extras/ezio/terminal.h create mode 100644 extras/ezio/tty.c create mode 100644 extras/ezio/tty.h create mode 100644 extras/ezio/vt-dummy.c create mode 100644 extras/ezio/vt-linux.c create mode 100644 extras/ezio/vt.h create mode 100644 include/.gitignore create mode 100644 include/automake.mk create mode 100644 include/openflow/automake.mk create mode 100644 include/openflow/nicira-ext.h create mode 100644 include/openflow/openflow-mgmt.h create mode 100644 include/openflow/openflow.h create mode 100644 include/openvswitch/automake.mk create mode 100644 include/openvswitch/brcompat-netlink.h create mode 100644 include/openvswitch/datapath-protocol.h create mode 100644 lib/.gitignore create mode 100644 lib/automake.mk create mode 100644 lib/backtrace.c create mode 100644 lib/backtrace.h create mode 100644 lib/bitmap.c create mode 100644 lib/bitmap.h create mode 100644 lib/cfg.c create mode 100644 lib/cfg.h create mode 100644 lib/classifier.c create mode 100644 lib/classifier.h create mode 100644 lib/command-line.c create mode 100644 lib/command-line.h create mode 100644 lib/common.man create mode 100644 lib/compiler.h create mode 100644 lib/coverage-counters.h create mode 100755 lib/coverage-scan.pl create mode 100644 lib/coverage.c create mode 100644 lib/coverage.h create mode 100644 lib/csum.c create mode 100644 lib/csum.h create mode 100644 lib/daemon.c create mode 100644 lib/daemon.h create mode 100644 lib/daemon.man create mode 100644 lib/dh1024.pem create mode 100644 lib/dh2048.pem create mode 100644 lib/dh4096.pem create mode 100644 lib/dhcp-client.c create mode 100644 lib/dhcp-client.h create mode 100644 lib/dhcp.c create mode 100644 lib/dhcp.h create mode 100644 lib/dhparams.h create mode 100644 lib/dirs.h create mode 100644 lib/dpif.c create mode 100644 lib/dpif.h create mode 100644 lib/dpif.man create mode 100644 lib/dynamic-string.c create mode 100644 lib/dynamic-string.h create mode 100644 lib/fatal-signal.c create mode 100644 lib/fatal-signal.h create mode 100644 lib/fault.c create mode 100644 lib/fault.h create mode 100644 lib/flow.c create mode 100644 lib/flow.h create mode 100644 lib/hash.c create mode 100644 lib/hash.h create mode 100644 lib/hmap.c create mode 100644 lib/hmap.h create mode 100644 lib/leak-checker.c create mode 100644 lib/leak-checker.h create mode 100644 lib/leak-checker.man create mode 100644 lib/learning-switch.c create mode 100644 lib/learning-switch.h create mode 100644 lib/list.c create mode 100644 lib/list.h create mode 100644 lib/mac-learning.c create mode 100644 lib/mac-learning.h create mode 100644 lib/netdev.c create mode 100644 lib/netdev.h create mode 100644 lib/netlink-protocol.h create mode 100644 lib/netlink.c create mode 100644 lib/netlink.h create mode 100644 lib/odp-util.c create mode 100644 lib/odp-util.h create mode 100644 lib/ofp-print.c create mode 100644 lib/ofp-print.h create mode 100644 lib/ofpbuf.c create mode 100644 lib/ofpbuf.h create mode 100644 lib/packets.h create mode 100644 lib/pcap.c create mode 100644 lib/pcap.h create mode 100644 lib/poll-loop.c create mode 100644 lib/poll-loop.h create mode 100644 lib/port-array.c create mode 100644 lib/port-array.h create mode 100644 lib/process.c create mode 100644 lib/process.h create mode 100644 lib/queue.c create mode 100644 lib/queue.h create mode 100644 lib/random.c create mode 100644 lib/random.h create mode 100644 lib/rconn.c create mode 100644 lib/rconn.h create mode 100644 lib/sat-math.h create mode 100644 lib/sha1.c create mode 100644 lib/sha1.h create mode 100644 lib/shash.c create mode 100644 lib/shash.h create mode 100644 lib/signals.c create mode 100644 lib/signals.h create mode 100644 lib/socket-util.c create mode 100644 lib/socket-util.h create mode 100644 lib/stp.c create mode 100644 lib/stp.h create mode 100644 lib/svec.c create mode 100644 lib/svec.h create mode 100644 lib/tag.c create mode 100644 lib/tag.h create mode 100644 lib/timeval.c create mode 100644 lib/timeval.h create mode 100644 lib/type-props.h create mode 100644 lib/unixctl.c create mode 100644 lib/unixctl.h create mode 100644 lib/util.c create mode 100644 lib/util.h create mode 100644 lib/valgrind.h create mode 100644 lib/vconn-provider.h create mode 100644 lib/vconn-ssl.c create mode 100644 lib/vconn-ssl.h create mode 100644 lib/vconn-stream.c create mode 100644 lib/vconn-stream.h create mode 100644 lib/vconn-tcp.c create mode 100644 lib/vconn-unix.c create mode 100644 lib/vconn.c create mode 100644 lib/vconn.h create mode 100644 lib/vlog-modules.def create mode 100644 lib/vlog.c create mode 100644 lib/vlog.h create mode 100644 lib/vlog.man create mode 100644 lib/xtoxll.h create mode 100644 m4/nx-build.m4 create mode 100644 m4/openvswitch.m4 create mode 100644 secchan/.gitignore create mode 100644 secchan/automake.mk create mode 100644 secchan/commands/automake.mk create mode 100755 secchan/commands/reboot create mode 100644 secchan/discovery.c create mode 100644 secchan/discovery.h create mode 100644 secchan/executer.c create mode 100644 secchan/executer.h create mode 100644 secchan/fail-open.c create mode 100644 secchan/fail-open.h create mode 100644 secchan/in-band.c create mode 100644 secchan/in-band.h create mode 100644 secchan/main.c create mode 100644 secchan/netflow.c create mode 100644 secchan/netflow.h create mode 100644 secchan/ofproto.c create mode 100644 secchan/ofproto.h create mode 100644 secchan/pinsched.c create mode 100644 secchan/pinsched.h create mode 100644 secchan/pktbuf.c create mode 100644 secchan/pktbuf.h create mode 100644 secchan/secchan.8.in create mode 100644 secchan/status.c create mode 100644 secchan/status.h create mode 100755 soexpand.pl create mode 100644 tests/.gitignore create mode 100644 tests/automake.mk create mode 100755 tests/flowgen.pl create mode 100644 tests/test-classifier.c create mode 100644 tests/test-dhcp-client.c create mode 100644 tests/test-flows.c create mode 100755 tests/test-flows.sh create mode 100644 tests/test-hash.c create mode 100644 tests/test-hmap.c create mode 100644 tests/test-list.c create mode 100644 tests/test-stp-ieee802.1d-1998 create mode 100644 tests/test-stp-ieee802.1d-2004-fig17.4 create mode 100644 tests/test-stp-ieee802.1d-2004-fig17.6 create mode 100644 tests/test-stp-ieee802.1d-2004-fig17.7 create mode 100644 tests/test-stp-iol-io-1.1 create mode 100644 tests/test-stp-iol-io-1.2 create mode 100644 tests/test-stp-iol-io-1.4 create mode 100644 tests/test-stp-iol-io-1.5 create mode 100644 tests/test-stp-iol-op-1.1 create mode 100644 tests/test-stp-iol-op-1.4 create mode 100644 tests/test-stp-iol-op-3.1 create mode 100644 tests/test-stp-iol-op-3.3 create mode 100644 tests/test-stp-iol-op-3.4 create mode 100644 tests/test-stp.c create mode 100755 tests/test-stp.sh create mode 100644 tests/test-type-props.c create mode 100644 third-party/.gitignore create mode 100644 third-party/README create mode 100644 third-party/automake.mk create mode 100644 third-party/ofp-tcpdump.patch create mode 100644 utilities/.gitignore create mode 100644 utilities/automake.mk create mode 100644 utilities/nlmon.c create mode 100644 utilities/ovs-appctl.8.in create mode 100644 utilities/ovs-appctl.c create mode 100644 utilities/ovs-cfg-mod.8.in create mode 100644 utilities/ovs-cfg-mod.c create mode 100644 utilities/ovs-controller.8.in create mode 100644 utilities/ovs-controller.c create mode 100644 utilities/ovs-discover.8.in create mode 100644 utilities/ovs-discover.c create mode 100644 utilities/ovs-dpctl.8.in create mode 100644 utilities/ovs-dpctl.c create mode 100644 utilities/ovs-kill.8.in create mode 100644 utilities/ovs-kill.c create mode 100755 utilities/ovs-monitor create mode 100644 utilities/ovs-ofctl.8.in create mode 100644 utilities/ovs-ofctl.c create mode 100755 utilities/ovs-parse-leaks.in create mode 100755 utilities/ovs-pki-cgi.in create mode 100644 utilities/ovs-pki.8.in create mode 100755 utilities/ovs-pki.in create mode 100644 utilities/ovs-wdt.c create mode 100644 vswitchd/.gitignore create mode 100644 vswitchd/automake.mk create mode 100644 vswitchd/bridge.c create mode 100644 vswitchd/bridge.h create mode 100644 vswitchd/mgmt.c create mode 100644 vswitchd/mgmt.h create mode 100644 vswitchd/ovs-brcompatd.8.in create mode 100644 vswitchd/ovs-brcompatd.c create mode 100644 vswitchd/ovs-vswitchd.8.in create mode 100644 vswitchd/ovs-vswitchd.c create mode 100644 vswitchd/ovs-vswitchd.conf.5.in create mode 100644 vswitchd/ovs-vswitchd.h create mode 100644 vswitchd/port.c create mode 100644 vswitchd/port.h create mode 100644 vswitchd/proc-net-compat.c create mode 100644 vswitchd/proc-net-compat.h create mode 100644 vswitchd/xenserver.c create mode 100644 vswitchd/xenserver.h create mode 100644 xenserver/README create mode 100644 xenserver/automake.mk create mode 100755 xenserver/etc_init.d_vswitch create mode 100755 xenserver/etc_init.d_vswitch-xapi-update create mode 100644 xenserver/etc_logrotate.d_vswitch create mode 100644 xenserver/etc_profile.d_vswitch.sh create mode 100644 xenserver/etc_sysconfig_vswitch.example create mode 100755 xenserver/etc_xapi.d_plugins_vswitch-cfg-update create mode 100755 xenserver/etc_xensource_scripts_vif create mode 100755 xenserver/opt_xensource_libexec_interface-reconfigure create mode 100644 xenserver/usr_lib_xsconsole_plugins-base_XSFeatureVSwitch.py create mode 100644 xenserver/vswitch-xen.spec diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..480615ac --- /dev/null +++ b/.gitignore @@ -0,0 +1,43 @@ +#*# +*.a +*.d +*.ko +*.la +*.lo +*.loT +*.mod.c +*.o +*.o +*.pyc +*.so +*~ +.#* +.*.cmd +.*.swp +.deps +.libs +.tmp_versions +/Makefile +/Makefile.in +/aclocal.m4 +/autom4te.cache +/build-arch-stamp +/build-aux +/build-indep-stamp +/compile +/config.guess +/config.h +/config.h.in +/config.log +/config.status +/config.sub +/configure +/configure-stamp +/depcomp +/install-sh +/missing +/stamp-h1 +Module.symvers +TAGS +cscope.* +tags diff --git a/COPYING b/COPYING new file mode 100644 index 00000000..c4a9fb2f --- /dev/null +++ b/COPYING @@ -0,0 +1,29 @@ +This file is a summary of the licensing of files in this distribution. +Some files may be marked specifically with a different license, in +which case that license applies to the file in question. + +Files under the debian, doc, include, lib, m4, secchan, tests, +third-party, and utilities directories are licensed under the ISC +license: + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +Files under the datapath directory are licensed under the GNU General +Public License, version 2. + +Files under the extras and vswitchd directories are licensed under the +GNU General Public License, version 3 or later. + +Files under the xenserver directory are licensed on a file-by-file +basis. Some files are under an uncertain license that may not be +DFSG-compliant or GPL-compatible. Refer to each file for details. diff --git a/CodingStyle b/CodingStyle new file mode 100644 index 00000000..126b45a8 --- /dev/null +++ b/CodingStyle @@ -0,0 +1,504 @@ + Open vSwitch Coding Style + ========================= + +This file describes the coding style used in most C files in the Open +vSwitch distribution. However, Linux kernel code datapath directory +follows the Linux kernel's established coding conventions. + +BASICS + + Limit lines to 79 characters. + + Use form feeds (control+L) to divide long source files into logical +pieces. A form feed should appear as the only character on a line. + + Do not use tabs for indentation. + + Avoid trailing spaces on lines. + + +NAMING + + Use names that explain the purpose of a function or object. + + Use underscores to separate words in an identifier: multi_word_name. + + Use lowercase for most names. Use uppercase for macros, macro +parameters, and members of enumerations. + + Give arrays names that are plural. + + Pick a unique name prefix (ending with an underscore) for each +module, and apply that prefix to all of that module's externally +visible names. Names of macro parameters, struct and union members, +and parameters in function prototypes are not considered externally +visible for this purpose. + + Do not use names that begin with _. If you need a name for +"internal use only", use __ as a suffix instead of a prefix. + + Avoid negative names: "found" is a better name than "not_found". + + In names, a "size" is a count of bytes, a "length" is a count of +characters. A buffer has size, but a string has length. The length +of a string does not include the null terminator, but the size of the +buffer that contains the string does. + + +COMMENTS + + Comments should be written as full sentences that start with a +capital letter and end with a period. Put two spaces between +sentences. + + Write block comments as shown below. You may put the /* and */ on +the same line as comment text if you prefer. + + /* + * We redirect stderr to /dev/null because we often want to remove all + * traffic control configuration on a port so its in a known state. If + * this done when there is no such configuration, tc complains, so we just + * always ignore it. + */ + + Each function and each variable declared outside a function, and +each struct, union, and typedef declaration should be preceded by a +comment. See FUNCTION DEFINITIONS below for function comment +guidelines. + + Each struct and union member should each have an inline comment that +explains its meaning. structs and unions with many members should be +additionally divided into logical groups of members by block comments, +e.g.: + + /* An event that will wake the following call to poll_block(). */ + struct poll_waiter { + /* Set when the waiter is created. */ + struct list node; /* Element in global waiters list. */ + int fd; /* File descriptor. */ + short int events; /* Events to wait for (POLLIN, POLLOUT). */ + poll_fd_func *function; /* Callback function, if any, or null. */ + void *aux; /* Argument to callback function. */ + struct backtrace *backtrace; /* Event that created waiter, or null. */ + + /* Set only when poll_block() is called. */ + struct pollfd *pollfd; /* Pointer to element of the pollfds array + (null if added from a callback). */ + }; + + Use XXX or FIXME comments to mark code that needs work. + + Don't use // comments. + + Don't comment out or #if 0 out code. Just remove it. The code that +was there will still be in version control history. + + +FUNCTIONS + + Put the return type, function name, and the braces that surround the +function's code on separate lines, all starting in column 0. + + Before each function definition, write a comment that describes the +function's purpose, including each parameter, the return value, and +side effects. References to argument names should be given in +single-quotes, e.g. 'arg'. The comment should not include the +function name, nor need it follow any formal structure. The comment +does not need to describe how a function does its work, unless this +information is needed to use the function correctly (this is often +better done with comments *inside* the function). + + Simple static functions do not need a comment. + + Within a file, non-static functions should come first, in the order +that they are declared in the header file, followed by static +functions. Static functions should be in one or more separate pages +(separated by form feed characters) in logical groups. A commonly +useful way to divide groups is by "level", with high-level functions +first, followed by groups of progressively lower-level functions. +This makes it easy for the program's reader to see the top-down +structure by reading from top to bottom. + + All function declarations and definitions should include a +prototype. Empty parentheses, e.g. "int foo();", do not include a +prototype (they state that the function's parameters are unknown); +write "void" in parentheses instead, e.g. "int foo(void);". + + Prototypes for static functions should either all go at the top of +the file, separated into groups by blank lines, or they should appear +at the top of each page of functions. Don't comment individual +prototypes, but a comment on each group of prototypes is often +appropriate. + + In the absence of good reasons for another order, the following +parameter order is preferred. One notable exception is that data +parameters and their corresponding size parameters should be paired. + + 1. The primary object being manipulated, if any (equivalent to the + "this" pointer in C++). + 2. Input-only parameters. + 3. Input/output parameters. + 4. Output-only parameters. + 5. Status parameter. + + Example: + + /* Stores the features supported by 'netdev' into each of '*current', + * '*advertised', '*supported', and '*peer' that are non-null. Each value + * is a bitmap of "enum ofp_port_features" bits, in host byte order. + * Returns 0 if successful, otherwise a positive errno value. On failure, + * all of the passed-in values are set to 0. */ + int + netdev_get_features(struct netdev *netdev, + uint32_t *current, uint32_t *advertised, + uint32_t *supported, uint32_t *peer) + { + ... + } + + +FUNCTION PROTOTYPES + + Put the return type and function name on the same line in a function +prototype: + + static const struct option_class *get_option_class(int code); + + + Omit parameter names from function prototypes when the names do not +give useful information, e.g.: + + int netdev_get_mtu(const struct netdev *); + + +STATEMENTS + + Indent each level of code with 4 spaces. Use BSD-style brace +placement: + + if (a()) { + b(); + d(); + } + + Put a space between "if", "while", "for", etc. and the expressions +that follow them. + + Enclose single statements in braces: + + if (a > b) { + return a; + } else { + return b; + } + + Use comments and blank lines to divide long functions into logical +groups of statements. + + Avoid assignments inside "if" and "while" conditions. + + Do not put gratuitous parentheses around the expression in a return +statement, that is, write "return 0;" and not "return(0);" + + Write only one statement per line. + + Indent "switch" statements like this: + + switch (conn->state) { + case S_RECV: + error = run_connection_input(conn); + break; + + case S_PROCESS: + error = 0; + break; + + case S_SEND: + error = run_connection_output(conn); + break; + + default: + NOT_REACHED(); + } + + "switch" statements with very short, uniform cases may use an +abbreviated style: + + switch (code) { + case 200: return "OK"; + case 201: return "Created"; + case 202: return "Accepted"; + case 204: return "No Content"; + default: return "Unknown"; + } + + Use "for (;;)" to write an infinite loop. + + In an if/else construct where one branch is the "normal" or "common" +case and the other branch is the "uncommon" or "error" case, put the +common case after the "if", not the "else". This is a form of +documentation. It also places the most important code in sequential +order without forcing the reader to visually skip past less important +details. (Some compilers also assume that the "if" branch is the more +common case, so this can be a real form of optimization as well.) + + +MACROS + + Don't define an object-like macro if an enum can be used instead. + + Don't define a function-like macro if a "static inline" function +can be used instead. + + If a macro's definition contains multiple statements, enclose them +with "do { ... } while (0)" to allow them to work properly in all +syntactic circumstances. + + Do use macros to eliminate the need to update different parts of a +single file in parallel, e.g. a list of enums and an array that gives +the name of each enum. For example: + + /* Logging importance levels. */ + #define VLOG_LEVELS \ + VLOG_LEVEL(EMER, LOG_ALERT) \ + VLOG_LEVEL(ERR, LOG_ERR) \ + VLOG_LEVEL(WARN, LOG_WARNING) \ + VLOG_LEVEL(INFO, LOG_NOTICE) \ + VLOG_LEVEL(DBG, LOG_DEBUG) + enum vlog_level { + #define VLOG_LEVEL(NAME, SYSLOG_LEVEL) VLL_##NAME, + VLOG_LEVELS + #undef VLOG_LEVEL + VLL_N_LEVELS + }; + + /* Name for each logging level. */ + static const char *level_names[VLL_N_LEVELS] = { + #define VLOG_LEVEL(NAME, SYSLOG_LEVEL) #NAME, + VLOG_LEVELS + #undef VLOG_LEVEL + }; + + +SOURCE FILES + + Each source file should state its license in a comment at the very +top, followed by a comment explaining the purpose of the code that is +in that file. The comment should explain how the code in the file +relates to code in other files. The goal is to allow a programmer to +quickly figure out where a given module fits into the larger system. + + The first non-comment line in a .c source file should be: + + #include + +#include directives should appear in the following order: + + 1. #include + + 2. The module's own headers, if any. Including this before any + other header (besides ) ensures that the module's + header file is self-contained (see HEADER FILES) below. + + 3. Standard C library headers and other system headers, preferably + in alphabetical order. (Occasionally one encounters a set of + system headers that must be included in a particular order, in + which case that order must take precedence.) + + 4. Open vSwitch headers, in alphabetical order. Use "", not <>, + to specify Open vSwitch header names. + + +HEADER FILES + + Each header file should start with its license, as described under +SOURCE FILES above, followed by a "header guard" to make the header +file idempotent, like so: + + #ifndef NETDEV_H + #define NETDEV_H 1 + + ... + + #endif /* netdev.h */ + + Header files should be self-contained; that is, they should #include +whatever additional headers are required, without requiring the client +to #include them for it. + + Don't define the members of a struct or union in a header file, +unless client code is actually intended to access them directly or if +the definition is otherwise actually needed (e.g. inline functions +defined in the header need them). + + Similarly, don't #include a header file just for the declaration of +a struct or union tag (e.g. just for "struct ;"). Just declare +the tag yourself. This reduces the number of header file +dependencies. + + +TYPES + + Use typedefs sparingly. Code is clearer if the actual type is +visible at the point of declaration. Do not, in general, declare a +typedef for a struct, union, or enum. Do not declare a typedef for a +pointer type, because this can be very confusing to the reader. + + A function type is a good use for a typedef because it can clarify +code. The type should be a function type, not a pointer-to-function +type. That way, the typedef name can be used to declare function +prototypes. (It cannot be used for function definitions, because that +is explicitly prohibited by C89 and C99.) + + You may assume that "char" is exactly 8 bits and that "int" and +"long" are at least 32 bits. + + Don't assume that "long" is big enough to hold a pointer. If you +need to cast a pointer to an integer, use "intptr_t" or "uintptr_t" +from . + + Use the int_t and uint_t types from for exact-width +integer types. Use the PRId, PRIu, and PRIx macros from + for formatting them with printf() and related functions. + + Use %zu to format size_t with printf(). + + Use bit-fields sparingly. Do not use bit-fields for layout of +network protocol fields or in other circumstances where the exact +format is important. + + Declare bit-fields to be type "unsigned int" or "signed int". Do +*not* declare bit-fields of type "int": C89 allows these to be either +signed or unsigned according to the compiler's whim. (A 1-bit +bit-field of type "int" may have a range of -1...0!) Do not declare +bit-fields of type _Bool or enum or any other type, because these are +not portable. + + Try to order structure members such that they pack well on a system +with 2-byte "short", 4-byte "int", and 4- or 8-byte "long" and pointer +types. Prefer clear organization over size optimization unless you +are convinced there is a size or speed benefit. + + Pointer declarators bind to the variable name, not the type name. +Write "int *x", not "int* x" and definitely not "int * x". + + +EXPRESSIONS + + Put one space on each side of infix binary and ternary operators: + + * / % + + - + << >> + < <= > >= + == != + & + ^ + | + && + || + ?: + = += -= *= /= %= &= ^= |= <<= >>= + + Avoid comma operators. + + Do not put any white space around postfix, prefix, or grouping +operators: + + () [] -> . + ! ~ ++ -- + - * & + +Exception 1: Put a space after (but not before) the "sizeof" keyword. +Exception 2: Put a space between the () used in a cast and the +expression whose type is cast: (void *) 0. + + Break long lines before binary operators and the ternary operators ? +and :, rather than after them, e.g. + + if (first_long_condition() || second_long_condition() + || third_long_condition()) + +and + + return (out_port != VIGP_CONTROL_PATH + ? alpheus_output_port(dp, skb, out_port) + : alpheus_output_control(dp, skb, fwd_save_skb(skb), + VIGR_ACTION)); + + + Do not parenthesize the operands of && and || unless operator +precedence makes it necessary, or unless the operands are themselves +expressions that use && and ||. Thus: + + if (!isdigit(s[0]) || !isdigit(s[1]) || !isdigit(s[2])) { + printf("string %s does not start with 3-digit code\n", s); + } + +but + + if (rule && (!best || rule->priority > best->priority)) { + best = rule; + } + + Do parenthesize a subexpression that must be split across more than +one line, e.g.: + + *idxp = ((l1_idx << PORT_ARRAY_L1_SHIFT) + | (l2_idx << PORT_ARRAY_L2_SHIFT) + | (l3_idx << PORT_ARRAY_L3_SHIFT)); + + Try to avoid casts. Don't cast the return value of malloc(). + + The "sizeof" operator is unique among C operators in that it accepts +two very different kinds of operands: an expression or a type. In +general, prefer to specify an expression, e.g. "int *x = +xmalloc(sizeof *x);". When the operand of sizeof is an expression, +there is no need to parenthesize that operand, and please don't. + + Use the ARRAY_SIZE macro from lib/util.h to calculate the number of +elements in an array. + + When using a relational operator like "<" or "==", put an expression +or variable argument on the left and a constant argument on the +right, e.g. "x == 0", *not* "0 == x". + + +BLANK LINES + + Put one blank line between top-level definitions of functions and +global variables. + + +C DIALECT + + Try to avoid using GCC extensions where possible. + + Some C99 extensions are OK: + + * Flexible array members (e.g. struct { int foo[]; }). + + * "static inline" functions (but no other forms of "inline", for + which GCC and C99 have differing interpretations). + + * "long long" + + * and . + + * bool and , but don't assume that bool or _Bool can + only take on the values 0 or 1, because this behavior can't be + simulated on C89 compilers. + + Don't use other C99 extensions, and especially: + + * Don't use // comments. + + * Don't use designated initializers (e.g. don't write "struct foo + foo = {.a = 1};" or "int a[] = {[2] = 5};"). + + * Don't mix declarations and code within a block. + + * Don't use declarations in iteration statements (e.g. don't write + "for (int i = 0; i < 10; i++)"). + + * Don't put a trailing comma in an enum declaration (e.g. don't + write "enum { x = 1, };"). diff --git a/INSTALL b/INSTALL new file mode 100644 index 00000000..994e8d32 --- /dev/null +++ b/INSTALL @@ -0,0 +1,514 @@ + Open vSwitch Installation Instructions + +This document describes how to build, install, and execute +Open vSwitch. + +Open vSwitch implements an Ethernet switch with MAC learning that may +be configured with any of the following features: + + * NIC bonding with automatic fail-over and source MAC-based TX + load balancing ("SLB"). + + * 802.1Q VLAN support. + + * Port mirroring, with optional VLAN tagging. + + * NetFlow v5 flow logging. + + * Connectivity to an external OpenFlow controller, such as + NOX. + +The current version of this distribution requires a kernel module to +be built and loaded. An (optional) entirely userspace switch is on +the roadmap for future versions. + +The distribution also contains a number of related utilities. + +Build Methods +============= + +There are two principal ways to build and install this distribution: + + - Using "configure" and "make" in the ordinary way. See + Building Conventionally below for detailed instructions. + + - As a set of Debian packages. Refer to Building Debian + Packages, below, for instructions. + +Base Prerequisites +------------------ + +Regardless of how it is built, Open vSwitch has a common set of +prerequisites. To compile the userspace programs in the OpenFlow +reference distribution, you will need the following software: + + - A make program, e.g. GNU make + (http://www.gnu.org/software/make/). BSD make should also work. + + - The GNU C compiler (http://gcc.gnu.org/). We generally test + with version 4.2 or 4.3. + + - libssl, from OpenSSL (http://www.openssl.org/), is optional but + recommended if you plan to connect the Open vSwitch to an + OpenFlow controller. libssl is required to establish + confidentiality and authenticity in the connections from an + Open vSwitch to an OpenFlow controller. To enable, configure + with --enable-ssl=yes. + +To compile the kernel module, you must also install the following: + + - A supported Linux kernel version. Please refer to README for a + list of supported versions. + + The OpenFlow datapath requires bridging support (CONFIG_BRIDGE) + to be built as a kernel module. (This is common in kernels + provided by Linux distributions.) The bridge module must not be + loaded or in use. If the bridge module is running (check with + "lsmod | grep bridge"), you must remove it ("rmmod bridge") + before starting the datapath. + + - To build a kernel module, you need the same version of GCC that + was used to build that kernel (usually version 4.0 or later). + + - A kernel build directory corresponding to the Linux kernel image + the module is to run on. Under Debian and Ubuntu, for example, + each linux-image package containing a kernel binary has a + corresponding linux-headers package with the required build + infrastructure. + +If you are working from a Git tree or snapshot (instead of from a +distribution tarball), or if you modify the Open vSwitch build system, +you will also need the following software: + + - Autoconf version 2.60 or later (http://www.gnu.org/software/autoconf). + + - Automake version 1.10 or later (http://www.gnu.org/software/automake). + + - pkg-config (http://pkg-config.freedesktop.org/wiki/). We test + with version 0.22. + +Debian Prerequisites +-------------------- + +To build Debian packages from the Open vSwitch distribution, you will +need to install a number of Debian packages in addition to the base +prerequisites listed above. These additional prerequisites may be +found listed as "Build-Depends" in debian/control in the source tree. +To check that they are installed, first install the dpkg-dev package, +then run dpkg-checkbuilddeps from the top level of the OpenFlow source +tree. + +To build Debian packages without being root, also install the +"fakeroot" package. + +Building Conventionally +======================= + +This section explains how to build and install the Open vSwitch +distribution in the ordinary way using "configure" and "make". + +0. Check that you have installed all the prerequisites listed above in + the Base Prerequisites section. + +1. In the top source directory, configure the package by running the + configure script. You can usually invoke configure without any + arguments: + + % ./configure + + To use a specific C compiler for compiling OpenFlow user programs, + also specify it on the configure command line, like so: + + % ./configure CC=gcc-4.2 + + To build the Linux kernel module, so that you can run the + kernel-based switch, pass the location of the kernel build + directory on --with-l26. For example, to build for a running + instance of Linux 2.6: + + % ./configure --with-l26=/lib/modules/`uname -r`/build + + If you wish to build the kernel module for an architecture other + than the architecture of the machine used for the build, you may + specify the kernel architecture string using the KARCH variable + when invoking the configure script. For example, to build for MIPS + with Linux 2.6: + + % ./configure --with-l26=/path/to/linux-2.6 KARCH=mips + + The configure script accepts a number of other options and honors + additional environment variables. For a full list, invoke + configure with the --help option. + +2. Run make in the top source directory: + + % make + + The following main binaries will be built: + + - Virtual switch daemon: vswitchd/ovs-vswitchd + + - Bridge compatibility daemon: vswitchd/ovs-brcompatd + + - Datapath administration utility: utilities/ovs-dpctl. + + Some less important binaries will be built also: + + - Runtime configuration utility: utilities/ovs-appctl. + + - Simple OpenFlow controller: utilities/ovs-controller. + + - Secure channel executable: secchan/secchan. + + - Miscellaneous utilities: utilities/ovs-discover, + utilities/ovs-kill. + + - ANSI terminal support for EZIO 16x2 LCD panel: + extras/ezio/ezio-term (only if the proper libraries are + installed). + + - Switch monitoring UI for small text displays: + extras/ezio/ovs-switchui (only if the proper libraries are + installed). + + - Tests: various binaries in tests/. + + If you passed --with-l26 to configure, "make" will also build the + following kernel modules: + + - datapath/linux-2.6/brcompat_mod.ko + + - datapath/linux-2.6/openflow_mod.ko + +3. Run "make install" to install the executables and manpages into the + running system, by default under /usr/local. + +4. If you built kernel modules, you may load them with "insmod", e.g.: + + % insmod datapath/linux-2.6/openflow_mod.ko + + The insmod program must be run as root. You may need to specify a + full path to insmod, e.g. /sbin/insmod. To verify that the modules + have been loaded, run "/sbin/lsmod" and check that openflow_mod is + listed. + +5. Test the virtuaal switch, as described under Testing the Virtual +Switch below. + +Building Debian Packages +======================== + +Follow these instructions to build Debian packages for OpenFlow. + +0. Check that you have installed all the prerequisites listed above in + the Base Prerequisites and Debian Prerequisites sections above. + +1. In the top source directory, run the following command, as root: + + % dpkg-buildpackage + + Alternatively, if you installed the "fakeroot" package, you may run + dpkg-buildpackage as an ordinary user with the following syntax: + + % dpkg-buildpackage -rfakeroot + + The following packages will be built in the directory above the + source tree: + + - openflow-controller: The OpenFlow controller. Depends on + openflow-pki (see below). + + - openflow-switch: Install this package on a machine that acts + as an OpenFlow kernel switch. + + - openflow-datapath-source: Source code for OpenFlow's Linux + kernel module. + + - openflow-pki: Public-key infrastructure for OpenFlow. Install + this package on a machine that acts as an OpenFlow PKI server + (see "Establishing a Public Key Infrastructure" below). + + - openflow-common: Files and utilities required by more than one + of the above packages. + +2. To set up an OpenFlow controller, install the openflow-controller + package and its dependencies. You may configure it by editing + /etc/default/openflow-controller, e.g. to enable non-SSL + connections, which are disabled by default. If you change the + default settings, you will need to restart the controller by + running: + + % /etc/init.d/openflow-controller restart + +3. To set up an OpenFlow switch, install the openflow-switch package + and its dependencies. If it is to be a kernel-based switch, also + install openflow-datapath-source, then follow the instructions in + /usr/share/doc/openflow-datapath-source/README.Debian to build and + install the kernel module. + + You may configure the switch one of the following ways: + + - Completely by hand, as described under the Testing section + below. + + For the userspace datapath-based switch, this is the only + supported form of configuration. + + - By editing /etc/default/openflow-switch. You must at least + configure some network devices, by uncommenting NETDEVS and + adding the appropriate devices to the list, e.g. NETDEVS="eth0 + eth1". + + After you edit this file, you will need to start the switch by + running: + + % /etc/init.d/openflow-switch restart + + This form of configuration is not supported for the userspace + datapath-based switch. + + - By running the ovs-switch-setup program. This interactive + program will walk you through all the steps of configuring an + OpenFlow switch, including configuration of SSL certificates. + Run it without arguments, as root: + + % ovs-switch-setup + + This form of configuration is not supported for the userspace + datapath-based switch. + +Installation +============ + +This section explains how to install Open vSwitch in a network with one +controller and one or more switches, each of which runs on a separate +machine. Before you begin, you must decide on one of two ways for +each switch to reach the controller over the network: + + - Use a "control network" that is completely separate from the + "data network" to be controlled ("out-of-band control"). The + location of the controller must be configured manually in this + case. + + - Use the same network for control and for data ("in-band + control"). When in-band control is used, the location of the + controller may be configured manually or discovered + automatically. We will assume manual configuration here; + please refer to secchan(8) for instructions on setting up + controller discovery. + +Controller Setup +---------------- + +On the machine that is to be the OpenFlow controller, start the +"ovs-controller" program listening for connections from switches on +TCP port 6633 (the default), as shown below. + + # ovs-controller -v ptcp: + +(See ovs-controller(8) for more details) + +Make sure the machine hosting the controller is reachable by the +switch. + +Testing the Virtual Switch +-------------------------- + +The Open vSwitch kernel module must be loaded, as described under +"Building Conventionally", before it may be used. + +0. The commands below must run as root, so log in as root, or use a + program such as "su" to become root temporarily. + +1. Create a datapath instance. The command below creates a datapath + identified as dp0 (see ovs-dpctl(8) for more detailed usage + information). + + # ovs-dpctl add-dp dp0 + + (dp0 is the first datapath within a host. openvswitch_mod supports + multiple datapaths within the same host, which would be identified + as dp1, dp2, etc.) + + Creating datapath dp0 creates a new network device, also named dp0. + This network device, called the datapath's "local port", will be + bridged to the physical switch ports by the secchan, for use in + in-band control. + +2. Use ovs-dpctl to attach the datapath to physical interfaces on the + machine. Say, for example, you want to create a trivial 2-port + switch using interfaces eth1 and eth2, you would issue the + following commands: + + # ovs-dpctl add-if dp0 eth1 + # ovs-dpctl add-if dp0 eth2 + + You can verify that the interfaces were successfully added by asking + ovs-dpctl to print the current status of datapath dp0: + + # ovs-dpctl show dp0 + +3. Arrange so that the switch can reach the controller over the + network. + + - If you are using out-of-band control, at this point make sure + that the switch machine can reach the controller over the + network. + + - If you are using in-band control, then at this point you must + configure the dp0 network device created in step 1. This + device is not yet bridged to any physical network (because + secchan does that, and it is not yet running), so the next + step depends on whether connectivity is required to configure + the device's IP address: + + * If the switch has a static IP address, you may configure + its IP address now, e.g.: + + # ifconfig dp0 192.168.1.1 + + * If the switch does not have a static IP address, e.g. its + IP address is obtained dynamically via DHCP, then proceed + to step 4. The DHCP client will not be able to contact + the DHCP server until the secure channel has started up. + + - If you are using in-band control with controller discovery, no + configuration is required at this point. You may proceed to + step 4. + +4. Run secchan to start the secure channel connecting the datapath to + a remote controller. If the controller is running on host + 192.168.1.2 port 6633 (the default port), the secchan invocation + would look like this: + + # secchan dp0 tcp:192.168.1.2 + + - If you are using in-band control with controller discovery, omit + the second argument to the secchan command. + + - If you are using out-of-band control, add --out-of-band to the + command line. + +5. If you are using in-band control with manual configuration, and the + switch obtains its IP address dynamically, then you may now obtain + the switch's IP address, e.g. by invoking a DHCP client. The + secure channel will only be able to connect to the controller after + an IP address has been obtained. + +6. The secure channel should connect to the controller within a few + seconds. It may take a little longer if controller discovery is in + use, because the switch must then also obtain its own IP address + and the controller's location via DHCP. + +Configuration +============= + +Secure operation over SSL +------------------------- + +The instructions above set up Open vSwitch for operation over a +plaintext TCP connection. Production use of Open vSwitch should use +SSL[*] to ensure confidentiality and authenticity of traffic among +switches and controllers. The source must be configured with +--enable-ssl=yes to build with SSL support. + +To use SSL with Open vSwitch, you must set up a public-key infrastructure +(PKI) including a pair of certificate authorities (CAs), one for +controllers and one for switches. If you have an established PKI, +Open vSwitch can use it directly. Otherwise, refer to "Establishing a +Public Key Infrastructure" below. + +To configure the controller to listen for SSL connections on port 6633 +(the default), invoke it as follows: + + # ovs-controller -v pssl: --private-key=PRIVKEY --certificate=CERT \ + --ca-cert=CACERT + +where PRIVKEY is a file containing the controller's private key, CERT +is a file containing the controller CA's certificate for the +controller's public key, and CACERT is a file containing the root +certificate for the switch CA. If, for example, your PKI was created +with the instructions below, then the invocation would look like: + + # ovs-controller -v pssl: --private-key=ctl-privkey.pem \ + --certificate=ctl-cert.pem --ca-cert=pki/switchca/cacert.pem + +To configure a switch to connect to a controller running on port 6633 +(the default) on host 192.168.1.2 over SSL, invoke secchan as follows: + + # secchan -v DATAPATH ssl:192.168.1.2 --private-key=PRIVKEY \ + --certificate=CERT --ca-cert=CACERT + +where DATAPATH is the datapath to connect to (e.g. dp0 or +unix:/var/run/dp0.sock), PRIVKEY is a file containing the switch's +private key, CERT is a file containing the switch CA's certificate for +the switch's public key, and CACERT is a file containing the root +certificate for the controller CA. If, for example, your PKI was +created with the instructions below, then the invocation would look +like: + + # secchan -v DATAPATH ssl:192.168.1.2 --private-key=sc-privkey.pem \ + --certificate=sc-cert.pem --ca-cert=pki/controllerca/cacert.pem + +[*] To be specific, Open vSwitch uses TLS version 1.0 or later (TLSv1), as + specified by RFC 2246, which is very similar to SSL version 3.0. + TLSv1 was released in January 1999, so all current software and + hardware should implement it. + +Establishing a Public Key Infrastructure +---------------------------------------- + +If you do not have a PKI, the ovs-pki script included with Open vSwitch +can help. To create an initial PKI structure, invoke it as: + % ovs-pki init +which will create and populate a new PKI directory. The default +location for the PKI directory depends on how the Open vSwitch tree was +configured (to see the configured default, look for the --dir option +description in the output of "ovs-pki --help"). + +The pki directory contains two important subdirectories. The +controllerca subdirectory contains controller certificate authority +related files, including the following: + + - cacert.pem: Root certificate for the controller certificate + authority. This file must be provided to secchan with the + --ca-cert option to enable it to authenticate valid controllers. + + - private/cakey.pem: Private signing key for the controller + certificate authority. This file must be kept secret. There is + no need for switches or controllers to have a copy of it. + +The switchca subdirectory contains switch certificate authority +related files, analogous to those in the controllerca subdirectory: + + - cacert.pem: Root certificate for the switch certificate + authority. This file must be provided to the controller program + with the --ca-cert option to enable it to authenticate valid + switches. + + - private/cakey.pem: Private signing key for the switch + certificate authority. This file must be kept secret. There is + no need for switches or controllers to have a copy of it. + +After you create the initial structure, you can create keys and +certificates for switches and controllers with ovs-pki. To create a +controller private key and certificate in files named ctl-privkey.pem +and ctl-cert.pem, for example, you could run: + % ovs-pki req+sign ctl controller +ctl-privkey.pem and ctl-cert.pem would need to be copied to the +controller for its use at runtime (they could then be deleted from +their original locations). The --private-key and --certificate +options of ovs-controller, respectively, would point to these files. + +Analogously, to create a switch private key and certificate in files +named sc-privkey.pem and sc-cert.pem, for example, you could run: + % ovs-pki req+sign sc switch +sc-privkey.pem and sc-cert.pem would need to be copied to the switch +for its use at runtime (they could then be deleted from their original +locations). The --private-key and --certificate options, +respectively, of secchan would point to these files. + +Bug Reporting +------------- + +Please report problems to ovs-bugs@openvswitch.org. diff --git a/Makefile.am b/Makefile.am new file mode 100644 index 00000000..18108cf0 --- /dev/null +++ b/Makefile.am @@ -0,0 +1,74 @@ +AUTOMAKE_OPTIONS = foreign subdir-objects +ACLOCAL_AMFLAGS = -I m4 +SUBDIRS = datapath + +if ENABLE_USERSPACE +if HAVE_DPKG_BUILDPACKAGE +distcheck-hook: + cd $(srcdir) && dpkg-buildpackage -rfakeroot -us -uc + cd $(srcdir) && fakeroot ./debian/rules clean +else +distcheck-hook: +endif + +AM_CPPFLAGS = $(SSL_CFLAGS) +AM_CPPFLAGS += $(NCURSES_CFLAGS) +AM_CPPFLAGS += $(PCRE_CFLAGS) +AM_CPPFLAGS += -I $(top_srcdir)/include +AM_CPPFLAGS += -I $(top_srcdir)/lib + +AM_CFLAGS = -Wstrict-prototypes + +if NDEBUG +AM_CPPFLAGS += -DNDEBUG +AM_CFLAGS += -fomit-frame-pointer +else +AM_LDFLAGS = -export-dynamic +endif + +CLEANFILES = +DISTCLEANFILES = +EXTRA_DIST = +TESTS = +TESTS_ENVIRONMENT = +bin_PROGRAMS = +sbin_PROGRAMS = +bin_SCRIPTS = +dist_commands_DATA = +dist_man_MANS = +dist_pkgdata_SCRIPTS = +dist_sbin_SCRIPTS = +man_MANS = +noinst_HEADERS = +noinst_LIBRARIES = +noinst_PROGRAMS = +noinst_SCRIPTS = + +EXTRA_DIST += soexpand.pl + +ro_c = echo '/* -*- mode: c; buffer-read-only: t -*- */' + +SUFFIXES = .in +.in: + $(PERL) $(srcdir)/soexpand.pl -I$(srcdir) < $< | \ + sed -e 's,[@]LOGDIR[@],$(LOGDIR),g' \ + -e 's,[@]PKIDIR[@],$(PKIDIR),g' \ + -e 's,[@]RUNDIR[@],$(RUNDIR),g' \ + -e 's,[@]pkgdatadir[@],$(pkgdatadir),g' \ + -e 's,[@]PERL[@],$(PERL),g' > $@ + +include lib/automake.mk +include secchan/automake.mk +include utilities/automake.mk +include tests/automake.mk +include include/automake.mk +include third-party/automake.mk +include debian/automake.mk +include vswitchd/automake.mk +include xenserver/automake.mk +if HAVE_CURSES +if HAVE_PCRE +include extras/ezio/automake.mk +endif +endif +endif # ENABLE_USERSPACE diff --git a/README b/README new file mode 100644 index 00000000..8991e4c8 --- /dev/null +++ b/README @@ -0,0 +1,74 @@ + Open vSwitch + +What is Open vSwitch? +--------------------- + +Open vSwitch is an Ethernet switch for virtual servers with the +following features: + + * NIC bonding with automatic fail-over and source MAC-based TX + load balancing ("SLB"). + + * 802.1Q VLAN support. + + * Port mirroring, with optional VLAN tagging. + + * NetFlow v5 flow logging. + + * Connectivity to an external OpenFlow controller, such as + NOX. + +What's here? +------------ + +The most important components of this distribution are: + + - A Linux kernel module for flow-based switching, in the + datapath directory. + + - ovs-vswitchd, a daemon that implements the virtual switch. + + - ovs-dpctl, a tool for configuring the kernel module and + controlling OpenFlow switches. + +This distribution includes some additional software as well: + + - secchan, a program that implements a simple OpenFlow switch + (without the special features provided by ovs-vswitchd) using + the same kernel module as ovs-vswitchd. + + - ovs-controller, a simple OpenFlow switch + + - ovs-ofctl, a utility for querying and controlling OpenFlow + switches and controllers. + + - vlog-appctl, a utility that can control Open vSwitch daemons, + adjusting their logging levels among other uses. + + - ovs-pki, a utility for creating and managing the public-key + infrastructure for OpenFlow switches. + + - A patch to tcpdump that enables it to parse OpenFlow + messages. + +For installation instructions, read INSTALL. Each userspace program +is also accompanied by a manpage. + +Platform support +---------------- + +Our primary test environment is Debian GNU/Linux. Ports to other +platforms are welcome. Please contact us with portability-related bug +reports or patches. + +The testing of the kernel module has focused on version 2.6.18 from +Xen and version 2.6.26 from kernel.org. Linux 2.6 releases from +2.6.15 onward should also work. + +GCC is the expected compiler. + +Contact +------- + +ovs-bugs@openvswitch.org +http://openvswitch.org/ diff --git a/acinclude.m4 b/acinclude.m4 new file mode 100644 index 00000000..498196b3 --- /dev/null +++ b/acinclude.m4 @@ -0,0 +1,195 @@ +# -*- autoconf -*- + +# Copyright (c) 2008, 2009 Nicira Networks. +# +# Permission to use, copy, modify, and/or distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +dnl Checks for --disable-userspace. +AC_DEFUN([OVS_CHECK_USERSPACE], + [AC_ARG_ENABLE( + [userspace], + [AC_HELP_STRING([--disable-userspace], + [Disable building userspace components.])], + [case "${enableval}" in + (yes) build_userspace=true ;; + (no) build_userspace=false ;; + (*) AC_MSG_ERROR([bad value ${enableval} for --enable-userspace]) ;; + esac], + [build_userspace=true]) + AM_CONDITIONAL([ENABLE_USERSPACE], [$build_userspace])]) + +dnl OVS_CHECK_LINUX(OPTION, VERSION, VARIABLE, CONDITIONAL) +dnl +dnl Configure linux kernel source tree +AC_DEFUN([OVS_CHECK_LINUX], [ + AC_ARG_WITH([$1], + [AC_HELP_STRING([--with-$1=/path/to/linux-$2], + [Specify the linux $2 kernel sources])], + [path="$withval"], [path=])dnl + if test -n "$path"; then + path=`eval echo "$path"` + + AC_MSG_CHECKING([for $path directory]) + if test -d "$path"; then + AC_MSG_RESULT([yes]) + $3=$path + AC_SUBST($3) + else + AC_MSG_RESULT([no]) + AC_ERROR([source dir $path doesn't exist]) + fi + + AC_MSG_CHECKING([for $path kernel version]) + patchlevel=`sed -n 's/^PATCHLEVEL = //p' "$path/Makefile"` + sublevel=`sed -n 's/^SUBLEVEL = //p' "$path/Makefile"` + AC_MSG_RESULT([2.$patchlevel.$sublevel]) + if test "2.$patchlevel" != '$2'; then + AC_ERROR([Linux kernel source in $path is not version $2]) + fi + if ! test -e "$path"/include/linux/version.h || \ + ! test -e "$path"/include/linux/autoconf.h; then + AC_MSG_ERROR([Linux kernel source in $path is not configured]) + fi + m4_if($2, [2.6], [OVS_CHECK_LINUX26_COMPAT]) + fi + AM_CONDITIONAL($4, test -n "$path") +]) + +dnl OVS_GREP_IFELSE(FILE, REGEX, IF-MATCH, IF-NO-MATCH) +dnl +dnl Greps FILE for REGEX. If it matches, runs IF-MATCH, otherwise IF-NO-MATCH. +AC_DEFUN([OVS_GREP_IFELSE], [ + AC_MSG_CHECKING([whether $2 matches in $1]) + grep '$2' $1 >/dev/null 2>&1 + status=$? + case $status in + 0) + AC_MSG_RESULT([yes]) + $3 + ;; + 1) + AC_MSG_RESULT([no]) + $4 + ;; + *) + AC_MSG_ERROR([grep exited with status $status]) + ;; + esac +]) + +dnl OVS_DEFINE(NAME) +dnl +dnl Defines NAME to 1 in kcompat.h. +AC_DEFUN([OVS_DEFINE], [ + echo '#define $1 1' >> datapath/linux-2.6/kcompat.h.new +]) + +AC_DEFUN([OVS_CHECK_VETH], [ + AC_MSG_CHECKING([whether to build veth module]) + if test "$sublevel" = 18; then + AC_MSG_RESULT([yes]) + AC_SUBST([BUILD_VETH], 1) + else + AC_MSG_RESULT([no]) + fi +]) + +AC_DEFUN([OVS_CHECK_LOG2_H], [ + AC_MSG_CHECKING([for $KSRC26/include/linux/log2.h]) + if test -e $KSRC26/include/linux/log2.h; then + AC_MSG_RESULT([yes]) + OVS_DEFINE([HAVE_LOG2_H]) + else + AC_MSG_RESULT([no]) + fi +]) + +dnl OVS_CHECK_LINUX26_COMPAT +dnl +dnl Runs various Autoconf checks on the Linux 2.6 kernel source in +dnl the directory in $KSRC26. +AC_DEFUN([OVS_CHECK_LINUX26_COMPAT], [ + rm -f datapath/linux-2.6/kcompat.h.new + mkdir -p datapath/linux-2.6 + : > datapath/linux-2.6/kcompat.h.new + OVS_GREP_IFELSE([$KSRC26/include/linux/skbuff.h], [skb_transport_header], + [OVS_DEFINE([HAVE_SKBUFF_HEADER_HELPERS])]) + OVS_GREP_IFELSE([$KSRC26/include/linux/skbuff.h], [raw], + [OVS_DEFINE([HAVE_MAC_RAW])]) + OVS_GREP_IFELSE([$KSRC26/include/linux/skbuff.h], + [skb_copy_from_linear_data_offset], + [OVS_DEFINE([HAVE_SKB_COPY_FROM_LINEAR_DATA_OFFSET])]) + OVS_GREP_IFELSE([$KSRC26/include/net/netlink.h], [NLA_NUL_STRING], + [OVS_DEFINE([HAVE_NLA_NUL_STRING])]) + OVS_GREP_IFELSE([$KSRC26/include/linux/err.h], [ERR_CAST], + [OVS_DEFINE([HAVE_ERR_CAST])]) + OVS_CHECK_LOG2_H + OVS_CHECK_VETH + if cmp -s datapath/linux-2.6/kcompat.h.new \ + datapath/linux-2.6/kcompat.h >/dev/null 2>&1; then + rm datapath/linux-2.6/kcompat.h.new + else + mv datapath/linux-2.6/kcompat.h.new datapath/linux-2.6/kcompat.h + fi +]) + +dnl Checks for net/if_packet.h. +AC_DEFUN([OVS_CHECK_IF_PACKET], + [AC_CHECK_HEADER([net/if_packet.h], + [HAVE_IF_PACKET=yes], + [HAVE_IF_PACKET=no]) + AM_CONDITIONAL([HAVE_IF_PACKET], [test "$HAVE_IF_PACKET" = yes]) + if test "$HAVE_IF_PACKET" = yes; then + AC_DEFINE([HAVE_IF_PACKET], [1], + [Define to 1 if net/if_packet.h is available.]) + fi]) + +dnl Checks for dpkg-buildpackage. If this is available then we check +dnl that the Debian packaging is functional at "make distcheck" time. +AC_DEFUN([OVS_CHECK_DPKG_BUILDPACKAGE], + [AC_CHECK_PROG([HAVE_DPKG_BUILDPACKAGE], [dpkg-buildpackage], [yes], [no]) + AM_CONDITIONAL([HAVE_DPKG_BUILDPACKAGE], + [test $HAVE_DPKG_BUILDPACKAGE = yes])]) + +dnl ---------------------------------------------------------------------- +dnl These macros are from GNU PSPP, with the following original license: +dnl Copyright (C) 2005, 2006, 2007 Free Software Foundation, Inc. +dnl This file is free software; the Free Software Foundation +dnl gives unlimited permission to copy and/or distribute it, +dnl with or without modifications, as long as this notice is preserved. + +dnl OVS_CHECK_CC_OPTION([OPTION], [ACTION-IF-ACCEPTED], [ACTION-IF-REJECTED]) +dnl Check whether the given C compiler OPTION is accepted. +dnl If so, execute ACTION-IF-ACCEPTED, otherwise ACTION-IF-REJECTED. +AC_DEFUN([OVS_CHECK_CC_OPTION], +[ + m4_define([ovs_cv_name], [ovs_cv_[]m4_translit([$1], [-], [_])])dnl + AC_CACHE_CHECK([whether $CC accepts $1], [ovs_cv_name], + [ovs_save_CFLAGS="$CFLAGS" + CFLAGS="$CFLAGS $1" + AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,)], [ovs_cv_name[]=yes], [ovs_cv_name[]=no]) + CFLAGS="$ovs_save_CFLAGS"]) + if test $ovs_cv_name = yes; then + m4_if([$2], [], [;], [$2]) + else + m4_if([$3], [], [:], [$3]) + fi +]) + +dnl OVS_ENABLE_OPTION([OPTION]) +dnl Check whether the given C compiler OPTION is accepted. +dnl If so, add it to CFLAGS. +dnl Example: OVS_ENABLE_OPTION([-Wdeclaration-after-statement]) +AC_DEFUN([OVS_ENABLE_OPTION], + [OVS_CHECK_CC_OPTION([$1], [CFLAGS="$CFLAGS $1"])]) +dnl ---------------------------------------------------------------------- diff --git a/boot.sh b/boot.sh new file mode 100755 index 00000000..05dd3599 --- /dev/null +++ b/boot.sh @@ -0,0 +1,2 @@ +#! /bin/sh +autoreconf --install --force diff --git a/configure.ac b/configure.ac new file mode 100644 index 00000000..a557f0f4 --- /dev/null +++ b/configure.ac @@ -0,0 +1,89 @@ +# Copyright (c) 2008, 2009 Nicira Networks +# +# Permission to use, copy, modify, and/or distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +AC_PREREQ(2.60) +AC_INIT(openvswitch, 0.90.0, ovs-bugs@openvswitch.org) +NX_BUILDNR +AC_CONFIG_SRCDIR([datapath/datapath.c]) +AC_CONFIG_MACRO_DIR([m4]) +AC_CONFIG_AUX_DIR([build-aux]) +AC_CONFIG_HEADERS([config.h]) +AM_INIT_AUTOMAKE + +AC_PROG_CC +AM_PROG_CC_C_O +AC_PROG_CPP +AC_PROG_RANLIB +AC_PROG_MKDIR_P + +AC_ARG_VAR([PERL], [path to Perl interpreter]) +AC_PATH_PROG([PERL], perl, no) +if test "$PERL" = no; then + AC_MSG_ERROR([Perl interpreter not found in $PATH or $PERL.]) +fi + +AC_USE_SYSTEM_EXTENSIONS +AC_C_BIGENDIAN +AC_SYS_LARGEFILE + +OVS_CHECK_USERSPACE +OVS_CHECK_NDEBUG +OVS_CHECK_NETLINK +OVS_CHECK_OPENSSL +OVS_CHECK_LOGDIR +OVS_CHECK_CURSES +OVS_CHECK_LINUX_VT_H +OVS_CHECK_PCRE +OVS_CHECK_IF_PACKET +OVS_CHECK_DPKG_BUILDPACKAGE + +if $build_userspace; then + OVS_CHECK_PKIDIR + OVS_CHECK_RUNDIR + OVS_CHECK_MALLOC_HOOKS + OVS_CHECK_VALGRIND + OVS_CHECK_TTY_LOCK_DIR + OVS_CHECK_SOCKET_LIBS + OVS_CHECK_FAULT_LIBS + + AC_CHECK_FUNCS([strsignal]) + + OVS_ENABLE_OPTION([-Wall]) + OVS_ENABLE_OPTION([-Wno-sign-compare]) + OVS_ENABLE_OPTION([-Wpointer-arith]) + OVS_ENABLE_OPTION([-Wdeclaration-after-statement]) + OVS_ENABLE_OPTION([-Wformat-security]) + OVS_ENABLE_OPTION([-Wswitch-enum]) + OVS_ENABLE_OPTION([-Wunused-parameter]) + OVS_ENABLE_OPTION([-Wstrict-aliasing]) + OVS_ENABLE_OPTION([-Wbad-function-cast]) + OVS_ENABLE_OPTION([-Wcast-align]) + OVS_ENABLE_OPTION([-Wstrict-prototypes]) + OVS_ENABLE_OPTION([-Wold-style-definition]) + OVS_ENABLE_OPTION([-Wmissing-prototypes]) + OVS_ENABLE_OPTION([-Wmissing-field-initializers]) + OVS_ENABLE_OPTION([-Wno-override-init]) +fi + +AC_ARG_VAR(KARCH, [Kernel Architecture String]) +AC_SUBST(KARCH) +OVS_CHECK_LINUX(l26, 2.6, KSRC26, L26_ENABLED) + +AC_CONFIG_FILES([Makefile +datapath/Makefile +datapath/linux-2.6/Kbuild +datapath/linux-2.6/Makefile +datapath/linux-2.6/Makefile.main]) + +AC_OUTPUT diff --git a/datapath/.gitignore b/datapath/.gitignore new file mode 100644 index 00000000..5a59a0d3 --- /dev/null +++ b/datapath/.gitignore @@ -0,0 +1,7 @@ +/Makefile +/Makefile.in +*.cmd +*.ko +*.mod.c +Module.symvers + diff --git a/datapath/Makefile.am b/datapath/Makefile.am new file mode 100644 index 00000000..71e2dc48 --- /dev/null +++ b/datapath/Makefile.am @@ -0,0 +1,12 @@ +SUBDIRS = +if L26_ENABLED +SUBDIRS += linux-2.6 +endif + +EXTRA_DIST = $(dist_headers) $(dist_sources) + +# Suppress warnings about GNU extensions in Modules.mk files. +AUTOMAKE_OPTIONS = -Wno-portability + +include Modules.mk +include linux-2.6/Modules.mk diff --git a/datapath/Modules.mk b/datapath/Modules.mk new file mode 100644 index 00000000..1b5de4ab --- /dev/null +++ b/datapath/Modules.mk @@ -0,0 +1,32 @@ +# Some modules should be built and distributed, e.g. openvswitch. +# +# Some modules should be distributed but not built, e.g. we do not build +# veth if the kernel in question already has it. +# +# Some modules should be built but not distributed, e.g. third-party +# hwtable modules. +both_modules = openvswitch +build_modules = $(both_modules) # Modules to build +dist_modules = $(both_modules) # Modules to distribute + +openvswitch_sources = \ + actions.c \ + datapath.c \ + dp_dev.c \ + dp_notify.c \ + flow.c \ + table.c + +openvswitch_headers = \ + actions.h \ + compat.h \ + datapath.h \ + dp_dev.h \ + flow.h + +dist_sources = $(foreach module,$(dist_modules),$($(module)_sources)) +dist_headers = $(foreach module,$(dist_modules),$($(module)_headers)) +build_sources = $(foreach module,$(build_modules),$($(module)_sources)) +build_headers = $(foreach module,$(build_modules),$($(module)_headers)) +build_links = $(notdir $(build_sources)) +build_objects = $(notdir $(patsubst %.c,%.o,$(build_sources))) diff --git a/datapath/actions.c b/datapath/actions.c new file mode 100644 index 00000000..30b840cb --- /dev/null +++ b/datapath/actions.c @@ -0,0 +1,421 @@ +/* + * Distributed under the terms of the GNU GPL version 2. + * Copyright (c) 2007, 2008, 2009 Nicira Networks. + */ + +/* Functions for executing flow actions. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "datapath.h" +#include "dp_dev.h" +#include "actions.h" +#include "openvswitch/datapath-protocol.h" + +struct sk_buff * +make_writable(struct sk_buff *skb, gfp_t gfp) +{ + if (skb_shared(skb) || skb_cloned(skb)) { + struct sk_buff *nskb = skb_copy(skb, gfp); + if (nskb) { + kfree_skb(skb); + return nskb; + } + } else { + unsigned int hdr_len = (skb_transport_offset(skb) + + sizeof(struct tcphdr)); + if (pskb_may_pull(skb, min(hdr_len, skb->len))) + return skb; + } + kfree_skb(skb); + return NULL; +} + + +static struct sk_buff * +vlan_pull_tag(struct sk_buff *skb) +{ + struct vlan_ethhdr *vh = vlan_eth_hdr(skb); + struct ethhdr *eh; + + + /* Verify we were given a vlan packet */ + if (vh->h_vlan_proto != htons(ETH_P_8021Q)) + return skb; + + memmove(skb->data + VLAN_HLEN, skb->data, 2 * VLAN_ETH_ALEN); + + eh = (struct ethhdr *)skb_pull(skb, VLAN_HLEN); + + skb->protocol = eh->h_proto; + skb->mac_header += VLAN_HLEN; + + return skb; +} + + +static struct sk_buff * +modify_vlan_tci(struct datapath *dp, struct sk_buff *skb, + struct odp_flow_key *key, const union odp_action *a, + int n_actions, gfp_t gfp) +{ + u16 tci, mask; + + if (a->type == ODPAT_SET_VLAN_VID) { + tci = ntohs(a->vlan_vid.vlan_vid); + mask = VLAN_VID_MASK; + key->dl_vlan = htons(tci & mask); + } else { + tci = a->vlan_pcp.vlan_pcp << 13; + mask = VLAN_PCP_MASK; + } + + skb = make_writable(skb, gfp); + if (!skb) + return ERR_PTR(-ENOMEM); + + if (skb->protocol == htons(ETH_P_8021Q)) { + /* Modify vlan id, but maintain other TCI values */ + struct vlan_ethhdr *vh = vlan_eth_hdr(skb); + vh->h_vlan_TCI = htons((ntohs(vh->h_vlan_TCI) & ~mask) | tci); + } else { + /* Add vlan header */ + + /* Set up checksumming pointers for checksum-deferred packets + * on Xen. Otherwise, dev_queue_xmit() will try to do this + * when we send the packet out on the wire, and it will fail at + * that point because skb_checksum_setup() will not look inside + * an 802.1Q header. */ + skb_checksum_setup(skb); + + /* GSO is not implemented for packets with an 802.1Q header, so + * we have to do segmentation before we add that header. + * + * GSO does work with hardware-accelerated VLAN tagging, but we + * can't use hardware-accelerated VLAN tagging since it + * requires the device to have a VLAN group configured (with + * e.g. vconfig(8)) and we don't do that. + * + * Having to do this here may be a performance loss, since we + * can't take advantage of TSO hardware support, although it + * does not make a measurable network performance difference + * for 1G Ethernet. Fixing that would require patching the + * kernel (either to add GSO support to the VLAN protocol or to + * support hardware-accelerated VLAN tagging without VLAN + * groups configured). */ + if (skb_is_gso(skb)) { + struct sk_buff *segs; + + segs = skb_gso_segment(skb, 0); + kfree_skb(skb); + if (unlikely(IS_ERR(segs))) + return ERR_CAST(segs); + + do { + struct sk_buff *nskb = segs->next; + int err; + + segs->next = NULL; + + segs = __vlan_put_tag(segs, tci); + err = -ENOMEM; + if (segs) { + struct odp_flow_key segkey = *key; + err = execute_actions(dp, segs, + &segkey, a + 1, + n_actions - 1, + gfp); + } + + if (unlikely(err)) { + while ((segs = nskb)) { + nskb = segs->next; + segs->next = NULL; + kfree_skb(segs); + } + return ERR_PTR(err); + } + + segs = nskb; + } while (segs->next); + + skb = segs; + } + + /* The hardware-accelerated version of vlan_put_tag() works + * only for a device that has a VLAN group configured (with + * e.g. vconfig(8)), so call the software-only version + * __vlan_put_tag() directly instead. + */ + skb = __vlan_put_tag(skb, tci); + if (!skb) + return ERR_PTR(-ENOMEM); + } + + return skb; +} + +static struct sk_buff *strip_vlan(struct sk_buff *skb, + struct odp_flow_key *key, gfp_t gfp) +{ + skb = make_writable(skb, gfp); + if (skb) { + vlan_pull_tag(skb); + key->dl_vlan = htons(ODP_VLAN_NONE); + } + return skb; +} + +static struct sk_buff *set_dl_addr(struct sk_buff *skb, + const struct odp_action_dl_addr *a, + gfp_t gfp) +{ + skb = make_writable(skb, gfp); + if (skb) { + struct ethhdr *eh = eth_hdr(skb); + memcpy(a->type == ODPAT_SET_DL_SRC ? eh->h_source : eh->h_dest, + a->dl_addr, ETH_ALEN); + } + return skb; +} + +/* Updates 'sum', which is a field in 'skb''s data, given that a 4-byte field + * covered by the sum has been changed from 'from' to 'to'. If set, + * 'pseudohdr' indicates that the field is in the TCP or UDP pseudo-header. + * Based on nf_proto_csum_replace4. */ +static void update_csum(__sum16 *sum, struct sk_buff *skb, + __be32 from, __be32 to, int pseudohdr) +{ + __be32 diff[] = { ~from, to }; + if (skb->ip_summed != CHECKSUM_PARTIAL) { + *sum = csum_fold(csum_partial((char *)diff, sizeof(diff), + ~csum_unfold(*sum))); + if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) + skb->csum = ~csum_partial((char *)diff, sizeof(diff), + ~skb->csum); + } else if (pseudohdr) + *sum = ~csum_fold(csum_partial((char *)diff, sizeof(diff), + csum_unfold(*sum))); +} + +static struct sk_buff *set_nw_addr(struct sk_buff *skb, + struct odp_flow_key *key, + const struct odp_action_nw_addr *a, + gfp_t gfp) +{ + if (key->dl_type != htons(ETH_P_IP)) + return skb; + + skb = make_writable(skb, gfp); + if (skb) { + struct iphdr *nh = ip_hdr(skb); + u32 *f = a->type == ODPAT_SET_NW_SRC ? &nh->saddr : &nh->daddr; + u32 old = *f; + u32 new = a->nw_addr; + + if (key->nw_proto == IPPROTO_TCP) { + struct tcphdr *th = tcp_hdr(skb); + update_csum(&th->check, skb, old, new, 1); + } else if (key->nw_proto == IPPROTO_UDP) { + struct udphdr *th = udp_hdr(skb); + update_csum(&th->check, skb, old, new, 1); + } + update_csum(&nh->check, skb, old, new, 0); + *f = new; + } + return skb; +} + +static struct sk_buff * +set_tp_port(struct sk_buff *skb, struct odp_flow_key *key, + const struct odp_action_tp_port *a, + gfp_t gfp) +{ + int check_ofs; + + if (key->dl_type != htons(ETH_P_IP)) + return skb; + + if (key->nw_proto == IPPROTO_TCP) + check_ofs = offsetof(struct tcphdr, check); + else if (key->nw_proto == IPPROTO_UDP) + check_ofs = offsetof(struct udphdr, check); + else + return skb; + + skb = make_writable(skb, gfp); + if (skb) { + struct udphdr *th = udp_hdr(skb); + u16 *f = a->type == ODPAT_SET_TP_SRC ? &th->source : &th->dest; + u16 old = *f; + u16 new = a->tp_port; + update_csum((u16*)((u8*)skb->data + check_ofs), + skb, old, new, 1); + *f = new; + } + return skb; +} + +static inline unsigned packet_length(const struct sk_buff *skb) +{ + unsigned length = skb->len - ETH_HLEN; + if (skb->protocol == htons(ETH_P_8021Q)) + length -= VLAN_HLEN; + return length; +} + +int dp_xmit_skb(struct sk_buff *skb) +{ + struct datapath *dp = skb->dev->br_port->dp; + int len = skb->len; + + if (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb)) { + printk(KERN_WARNING "%s: dropped over-mtu packet: %d > %d\n", + dp_name(dp), packet_length(skb), skb->dev->mtu); + kfree_skb(skb); + return -E2BIG; + } + + dev_queue_xmit(skb); + + return len; +} + +static void +do_output(struct datapath *dp, struct sk_buff *skb, int out_port) +{ + struct net_bridge_port *p; + struct net_device *dev; + + if (!skb) + goto error; + + p = dp->ports[out_port]; + if (!p) + goto error; + + dev = skb->dev = p->dev; + if (is_dp_dev(dev)) + dp_dev_recv(dev, skb); + else + dp_xmit_skb(skb); + return; + +error: + kfree_skb(skb); +} + +/* Never consumes 'skb'. Returns a port that 'skb' should be sent to, -1 if + * none. */ +static int output_group(struct datapath *dp, __u16 group, + struct sk_buff *skb, gfp_t gfp) +{ + struct dp_port_group *g = rcu_dereference(dp->groups[group]); + int prev_port = -1; + int i; + + if (!g) + return -1; + for (i = 0; i < g->n_ports; i++) { + struct net_bridge_port *p = dp->ports[g->ports[i]]; + if (!p || skb->dev == p->dev) + continue; + if (prev_port != -1) { + struct sk_buff *clone = skb_clone(skb, gfp); + if (!clone) + return -1; + do_output(dp, clone, prev_port); + } + prev_port = p->port_no; + } + return prev_port; +} + +static int +output_control(struct datapath *dp, struct sk_buff *skb, u32 arg, gfp_t gfp) +{ + skb = skb_clone(skb, gfp); + if (!skb) + return -ENOMEM; + return dp_output_control(dp, skb, _ODPL_ACTION_NR, arg); +} + +/* Execute a list of actions against 'skb'. */ +int execute_actions(struct datapath *dp, struct sk_buff *skb, + struct odp_flow_key *key, + const union odp_action *a, int n_actions, + gfp_t gfp) +{ + /* Every output action needs a separate clone of 'skb', but the common + * case is just a single output action, so that doing a clone and + * then freeing the original skbuff is wasteful. So the following code + * is slightly obscure just to avoid that. */ + int prev_port = -1; + int err = 0; + for (; n_actions > 0; a++, n_actions--) { + WARN_ON_ONCE(skb_shared(skb)); + if (prev_port != -1) { + do_output(dp, skb_clone(skb, gfp), prev_port); + prev_port = -1; + } + + switch (a->type) { + case ODPAT_OUTPUT: + prev_port = a->output.port; + break; + + case ODPAT_OUTPUT_GROUP: + prev_port = output_group(dp, a->output_group.group, + skb, gfp); + break; + + case ODPAT_CONTROLLER: + err = output_control(dp, skb, a->controller.arg, gfp); + if (err) { + kfree_skb(skb); + return err; + } + break; + + case ODPAT_SET_VLAN_VID: + case ODPAT_SET_VLAN_PCP: + skb = modify_vlan_tci(dp, skb, key, a, n_actions, gfp); + if (IS_ERR(skb)) + return PTR_ERR(skb); + break; + + case ODPAT_STRIP_VLAN: + skb = strip_vlan(skb, key, gfp); + break; + + case ODPAT_SET_DL_SRC: + case ODPAT_SET_DL_DST: + skb = set_dl_addr(skb, &a->dl_addr, gfp); + break; + + case ODPAT_SET_NW_SRC: + case ODPAT_SET_NW_DST: + skb = set_nw_addr(skb, key, &a->nw_addr, gfp); + break; + + case ODPAT_SET_TP_SRC: + case ODPAT_SET_TP_DST: + skb = set_tp_port(skb, key, &a->tp_port, gfp); + break; + } + if (!skb) + return -ENOMEM; + } + if (prev_port != -1) + do_output(dp, skb, prev_port); + else + kfree_skb(skb); + return err; +} diff --git a/datapath/actions.h b/datapath/actions.h new file mode 100644 index 00000000..410e3ba7 --- /dev/null +++ b/datapath/actions.h @@ -0,0 +1,18 @@ +#ifndef ACTIONS_H +#define ACTIONS_H 1 + +#include + +struct datapath; +struct sk_buff; +struct odp_flow_key; +union odp_action; + +struct sk_buff *make_writable(struct sk_buff *, gfp_t gfp); +int dp_xmit_skb(struct sk_buff *); +int execute_actions(struct datapath *dp, struct sk_buff *skb, + struct odp_flow_key *key, + const union odp_action *, int n_actions, + gfp_t gfp); + +#endif /* actions.h */ diff --git a/datapath/brc_procfs.c b/datapath/brc_procfs.c new file mode 100644 index 00000000..733e9a94 --- /dev/null +++ b/datapath/brc_procfs.c @@ -0,0 +1,185 @@ +#include +#include +#include +#include +#include +#include +#include "openvswitch/brcompat-netlink.h" + +/* This code implements a Generic Netlink command BRC_GENL_C_SET_PROC that can + * be used to add, modify, and delete arbitrary files in selected + * subdirectories of /proc. It's a horrible kluge prompted by the need to + * simulate certain /proc/net/vlan and /proc/net/bonding files for software + * that wants to read them, and with any luck it will go away eventually. + * + * The implementation is a kluge too. In particular, we want to release the + * strings copied into the 'data' members of proc_dir_entry when the + * proc_dir_entry structures are freed, but there doesn't appear to be a way to + * hook that, so instead we have to rely on being the only entity modifying the + * directories in question. + */ + +static int brc_seq_show(struct seq_file *seq, void *unused) +{ + seq_puts(seq, seq->private); + return 0; +} + +static int brc_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, brc_seq_show, PDE(inode)->data); +} + +static struct file_operations brc_fops = { + .owner = THIS_MODULE, + .open = brc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct proc_dir_entry *proc_vlan_dir; +static struct proc_dir_entry *proc_bonding_dir; + +struct proc_dir_entry *brc_lookup_entry(struct proc_dir_entry *de, const char *name) +{ + int namelen = strlen(name); + for (de = de->subdir; de; de = de->next) { + if (de->namelen != namelen) + continue; + if (!memcmp(name, de->name, de->namelen)) + return de; + } + return NULL; +} + +static struct proc_dir_entry *brc_open_dir(const char *dir_name, + struct proc_dir_entry *parent, + struct proc_dir_entry **dirp) +{ + if (!*dirp) { + struct proc_dir_entry *dir; + if (brc_lookup_entry(parent, dir_name)) { + printk(KERN_WARNING "%s proc directory exists, can't " + "simulate--probably its real module is " + "loaded\n", dir_name); + return NULL; + } + dir = *dirp = proc_mkdir(dir_name, parent); + } + return *dirp; +} + +/* Maximum length of the BRC_GENL_A_PROC_DIR and BRC_GENL_A_PROC_NAME strings. + * If we could depend on supporting NLA_NUL_STRING and the .len member in + * Generic Netlink policy, then we could just put this in brc_genl_policy (and + * simplify brc_genl_set_proc() below too), but upstream 2.6.18 does not have + * either. */ +#define BRC_NAME_LEN_MAX 32 + +int brc_genl_set_proc(struct sk_buff *skb, struct genl_info *info) +{ + struct proc_dir_entry *dir, *entry; + const char *dir_name, *name; + char *data; + + if (!info->attrs[BRC_GENL_A_PROC_DIR] || + VERIFY_NUL_STRING(info->attrs[BRC_GENL_A_PROC_DIR]) || + !info->attrs[BRC_GENL_A_PROC_NAME] || + VERIFY_NUL_STRING(info->attrs[BRC_GENL_A_PROC_NAME]) || + (info->attrs[BRC_GENL_A_PROC_DATA] && + VERIFY_NUL_STRING(info->attrs[BRC_GENL_A_PROC_DATA]))) + return -EINVAL; + + dir_name = nla_data(info->attrs[BRC_GENL_A_PROC_DIR]); + name = nla_data(info->attrs[BRC_GENL_A_PROC_NAME]); + if (strlen(dir_name) > BRC_NAME_LEN_MAX || + strlen(name) > BRC_NAME_LEN_MAX) + return -EINVAL; + + if (!strcmp(dir_name, "net/vlan")) + dir = brc_open_dir("vlan", proc_net, &proc_vlan_dir); + else if (!strcmp(dir_name, "net/bonding")) + dir = brc_open_dir("bonding", proc_net, &proc_bonding_dir); + else + return -EINVAL; + if (!dir) { + /* Probably failed because the module that really implements + * the function in question is loaded and already owns the + * directory in question.*/ + return -EBUSY; + } + + entry = brc_lookup_entry(dir, name); + if (!info->attrs[BRC_GENL_A_PROC_DATA]) { + if (!entry) + return -ENOENT; + + data = entry->data; + remove_proc_entry(name, dir); + if (brc_lookup_entry(dir, name)) + return -EBUSY; /* Shouldn't happen */ + + kfree(data); + } else { + data = kstrdup(nla_data(info->attrs[BRC_GENL_A_PROC_DATA]), + GFP_KERNEL); + if (!data) + return -ENOMEM; + + if (entry) { + char *old_data = entry->data; + entry->data = data; + kfree(old_data); + return 0; + } + + entry = create_proc_entry(name, S_IFREG|S_IRUSR|S_IWUSR, dir); + if (!entry) { + kfree(data); + return -ENOBUFS; + } + entry->proc_fops = &brc_fops; + entry->data = data; + } + return 0; +} + +static void kill_proc_dir(const char *dir_name, + struct proc_dir_entry *parent, + struct proc_dir_entry *dir) +{ + if (!dir) + return; + for (;;) { + struct proc_dir_entry *e; + char *data; + char name[BRC_NAME_LEN_MAX + 1]; + + e = dir->subdir; + if (!e) + break; + + if (e->namelen >= sizeof name) { + /* Can't happen: we prevent adding names this long by + * limiting the BRC_GENL_A_PROC_NAME string to + * BRC_NAME_LEN_MAX bytes. */ + WARN_ON(1); + break; + } + strcpy(name, e->name); + + data = e->data; + e->data = NULL; + kfree(data); + + remove_proc_entry(name, dir); + } + remove_proc_entry(dir_name, parent); +} + +void brc_procfs_exit(void) +{ + kill_proc_dir("vlan", proc_net, proc_vlan_dir); + kill_proc_dir("bonding", proc_net, proc_bonding_dir); +} diff --git a/datapath/brc_procfs.h b/datapath/brc_procfs.h new file mode 100644 index 00000000..93e21cfb --- /dev/null +++ b/datapath/brc_procfs.h @@ -0,0 +1,11 @@ +#ifndef BRC_PROCFS_H +#define BRC_PROCFS_H 1 + +struct sk_buff; +struct genl_info; + +void brc_procfs_exit(void); +int brc_genl_set_proc(struct sk_buff *skb, struct genl_info *info); + +#endif /* brc_procfs.h */ + diff --git a/datapath/brc_sysfs.h b/datapath/brc_sysfs.h new file mode 100644 index 00000000..0c72fb22 --- /dev/null +++ b/datapath/brc_sysfs.h @@ -0,0 +1,25 @@ +#ifndef BRC_SYSFS_H +#define BRC_SYSFS_H 1 + +struct datapath; +struct net_bridge_port; + +/* brc_sysfs_dp.c */ +int brc_sysfs_add_dp(struct datapath *dp); +int brc_sysfs_del_dp(struct datapath *dp); + +/* brc_sysfs_if.c */ +int brc_sysfs_add_if(struct net_bridge_port *p); +int brc_sysfs_del_if(struct net_bridge_port *p); + +#include +#if LINUX_VERSION_CODE == KERNEL_VERSION(2,6,18) +#define SUPPORT_SYSFS 1 +#else +/* We only support sysfs on Linux 2.6.18 because that's the only place we + * really need it (on Xen, for brcompat) and it's a big pain to try to support + * multiple versions. */ +#endif + +#endif /* brc_sysfs.h */ + diff --git a/datapath/brc_sysfs_dp.c b/datapath/brc_sysfs_dp.c new file mode 100644 index 00000000..fc02f279 --- /dev/null +++ b/datapath/brc_sysfs_dp.c @@ -0,0 +1,532 @@ +#include + +/* + * Sysfs attributes of bridge for Open vSwitch + * + * This has been shamelessly copied from the kernel sources. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "brc_sysfs.h" +#include "datapath.h" +#include "dp_dev.h" + +#ifdef SUPPORT_SYSFS +#define to_dev(obj) container_of(obj, struct device, kobj) + +/* Hack to attempt to build on more platforms. */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,21) +#define to_kobj(d) &(d)->class_dev.kobj +#define BRC_DEVICE_ATTR CLASS_DEVICE_ATTR +#else +#define to_kobj(d) &(d)->dev.kobj +#define BRC_DEVICE_ATTR DEVICE_ATTR +#endif + +/* + * Common code for storing bridge parameters. + */ +static ssize_t store_bridge_parm(struct class_device *d, + const char *buf, size_t len, + void (*set)(struct datapath *, unsigned long)) +{ + struct datapath *dp = dp_dev_get_dp(to_net_dev(d)); + char *endp; + unsigned long val; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + val = simple_strtoul(buf, &endp, 0); + if (endp == buf) + return -EINVAL; + +#if 0 + spin_lock_bh(&br->lock); + (*set)(br, val); + spin_unlock_bh(&br->lock); +#else + /* xxx We use a default value of 0 for all fields. If the caller is + * xxx attempting to set the value to our default, just silently + * xxx ignore the request. + */ + if (val != 0) { + printk("%s: xxx writing dp parms not supported yet!\n", + dp_name(dp)); + } +#endif + return len; +} + + +static ssize_t show_forward_delay(struct class_device *d, + char *buf) +{ +#if 0 + struct datapath *dp = dp_dev_get_dp(to_net_dev(d)); + return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay)); +#else + return sprintf(buf, "%d\n", 0); +#endif +} + +static void set_forward_delay(struct datapath *dp, unsigned long val) +{ +#if 0 + unsigned long delay = clock_t_to_jiffies(val); + br->forward_delay = delay; + if (br_is_root_bridge(br)) + br->bridge_forward_delay = delay; +#else + printk("%s: xxx attempt to set_forward_delay()\n", dp_name(dp)); +#endif +} + +static ssize_t store_forward_delay(struct class_device *d, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, set_forward_delay); +} +static BRC_DEVICE_ATTR(forward_delay, S_IRUGO | S_IWUSR, + show_forward_delay, store_forward_delay); + +static ssize_t show_hello_time(struct class_device *d, char *buf) +{ +#if 0 + return sprintf(buf, "%lu\n", + jiffies_to_clock_t(to_bridge(d)->hello_time)); +#else + return sprintf(buf, "%d\n", 0); +#endif +} + +static void set_hello_time(struct datapath *dp, unsigned long val) +{ +#if 0 + unsigned long t = clock_t_to_jiffies(val); + br->hello_time = t; + if (br_is_root_bridge(br)) + br->bridge_hello_time = t; +#else + printk("%s: xxx attempt to set_hello_time()\n", dp_name(dp)); +#endif +} + +static ssize_t store_hello_time(struct class_device *d, + const char *buf, + size_t len) +{ + return store_bridge_parm(d, buf, len, set_hello_time); +} +static BRC_DEVICE_ATTR(hello_time, S_IRUGO | S_IWUSR, show_hello_time, + store_hello_time); + +static ssize_t show_max_age(struct class_device *d, + char *buf) +{ +#if 0 + return sprintf(buf, "%lu\n", + jiffies_to_clock_t(to_bridge(d)->max_age)); +#else + return sprintf(buf, "%d\n", 0); +#endif +} + +static void set_max_age(struct datapath *dp, unsigned long val) +{ +#if 0 + unsigned long t = clock_t_to_jiffies(val); + br->max_age = t; + if (br_is_root_bridge(br)) + br->bridge_max_age = t; +#else + printk("%s: xxx attempt to set_max_age()\n", dp_name(dp)); +#endif +} + +static ssize_t store_max_age(struct class_device *d, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, set_max_age); +} +static BRC_DEVICE_ATTR(max_age, S_IRUGO | S_IWUSR, show_max_age, store_max_age); + +static ssize_t show_ageing_time(struct class_device *d, + char *buf) +{ +#if 0 + struct datapath *dp = dp_dev_get_dp(to_net_dev(d)); + return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->ageing_time)); +#else + return sprintf(buf, "%d\n", 0); +#endif +} + +static void set_ageing_time(struct datapath *dp, unsigned long val) +{ +#if 0 + br->ageing_time = clock_t_to_jiffies(val); +#else + printk("%s: xxx attempt to set_ageing_time()\n", dp_name(dp)); +#endif +} + +static ssize_t store_ageing_time(struct class_device *d, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, set_ageing_time); +} +static BRC_DEVICE_ATTR(ageing_time, S_IRUGO | S_IWUSR, show_ageing_time, + store_ageing_time); + +static ssize_t show_stp_state(struct class_device *d, + char *buf) +{ +#if 0 + struct datapath *dp = dp_dev_get_dp(to_net_dev(d)); + return sprintf(buf, "%d\n", br->stp_enabled); +#else + return sprintf(buf, "%d\n", 0); +#endif +} + + +static ssize_t store_stp_state(struct class_device *d, + const char *buf, + size_t len) +{ + struct datapath *dp = dp_dev_get_dp(to_net_dev(d)); +#if 0 + char *endp; + unsigned long val; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + val = simple_strtoul(buf, &endp, 0); + if (endp == buf) + return -EINVAL; + + rtnl_lock(); + br_stp_set_enabled(br, val); + rtnl_unlock(); +#else + printk("%s: xxx attempt to set_stp_state()\n", dp_name(dp)); +#endif + + return len; +} +static BRC_DEVICE_ATTR(stp_state, S_IRUGO | S_IWUSR, show_stp_state, + store_stp_state); + +static ssize_t show_priority(struct class_device *d, + char *buf) +{ +#if 0 + struct datapath *dp = dp_dev_get_dp(to_net_dev(d)); + return sprintf(buf, "%d\n", + (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]); +#else + return sprintf(buf, "%d\n", 0); +#endif +} + +static void set_priority(struct datapath *dp, unsigned long val) +{ +#if 0 + br_stp_set_bridge_priority(br, (u16) val); +#else + printk("%s: xxx attempt to set_priority()\n", dp_name(dp)); +#endif +} + +static ssize_t store_priority(struct class_device *d, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, set_priority); +} +static BRC_DEVICE_ATTR(priority, S_IRUGO | S_IWUSR, show_priority, store_priority); + +static ssize_t show_root_id(struct class_device *d, + char *buf) +{ +#if 0 + return br_show_bridge_id(buf, &to_bridge(d)->designated_root); +#else + return sprintf(buf, "0000.010203040506\n"); +#endif +} +static BRC_DEVICE_ATTR(root_id, S_IRUGO, show_root_id, NULL); + +static ssize_t show_bridge_id(struct class_device *d, + char *buf) +{ + struct datapath *dp = dp_dev_get_dp(to_net_dev(d)); + const unsigned char *addr = dp->ports[ODPP_LOCAL]->dev->dev_addr; + + /* xxx Do we need a lock of some sort? */ + return sprintf(buf, "%.2x%.2x.%.2x%.2x%.2x%.2x%.2x%.2x\n", + 0, 0, addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]); +} +static BRC_DEVICE_ATTR(bridge_id, S_IRUGO, show_bridge_id, NULL); + +static ssize_t show_root_port(struct class_device *d, + char *buf) +{ +#if 0 + return sprintf(buf, "%d\n", to_bridge(d)->root_port); +#else + return sprintf(buf, "%d\n", 0); +#endif +} +static BRC_DEVICE_ATTR(root_port, S_IRUGO, show_root_port, NULL); + +static ssize_t show_root_path_cost(struct class_device *d, + char *buf) +{ +#if 0 + return sprintf(buf, "%d\n", to_bridge(d)->root_path_cost); +#else + return sprintf(buf, "%d\n", 0); +#endif +} +static BRC_DEVICE_ATTR(root_path_cost, S_IRUGO, show_root_path_cost, NULL); + +static ssize_t show_topology_change(struct class_device *d, + char *buf) +{ +#if 0 + return sprintf(buf, "%d\n", to_bridge(d)->topology_change); +#else + return sprintf(buf, "%d\n", 0); +#endif +} +static BRC_DEVICE_ATTR(topology_change, S_IRUGO, show_topology_change, NULL); + +static ssize_t show_topology_change_detected(struct class_device *d, + char *buf) +{ +#if 0 + struct datapath *dp = dp_dev_get_dp(to_net_dev(d)); + return sprintf(buf, "%d\n", br->topology_change_detected); +#else + return sprintf(buf, "%d\n", 0); +#endif +} +static BRC_DEVICE_ATTR(topology_change_detected, S_IRUGO, + show_topology_change_detected, NULL); + +static ssize_t show_hello_timer(struct class_device *d, + char *buf) +{ +#if 0 + struct datapath *dp = dp_dev_get_dp(to_net_dev(d)); + return sprintf(buf, "%ld\n", br_timer_value(&br->hello_timer)); +#else + return sprintf(buf, "%d\n", 0); +#endif +} +static BRC_DEVICE_ATTR(hello_timer, S_IRUGO, show_hello_timer, NULL); + +static ssize_t show_tcn_timer(struct class_device *d, + char *buf) +{ +#if 0 + struct datapath *dp = dp_dev_get_dp(to_net_dev(d)); + return sprintf(buf, "%ld\n", br_timer_value(&br->tcn_timer)); +#else + return sprintf(buf, "%d\n", 0); +#endif +} +static BRC_DEVICE_ATTR(tcn_timer, S_IRUGO, show_tcn_timer, NULL); + +static ssize_t show_topology_change_timer(struct class_device *d, + char *buf) +{ +#if 0 + struct datapath *dp = dp_dev_get_dp(to_net_dev(d)); + return sprintf(buf, "%ld\n", br_timer_value(&br->topology_change_timer)); +#else + return sprintf(buf, "%d\n", 0); +#endif +} +static BRC_DEVICE_ATTR(topology_change_timer, S_IRUGO, show_topology_change_timer, + NULL); + +static ssize_t show_gc_timer(struct class_device *d, + char *buf) +{ +#if 0 + struct datapath *dp = dp_dev_get_dp(to_net_dev(d)); + return sprintf(buf, "%ld\n", br_timer_value(&br->gc_timer)); +#else + return sprintf(buf, "%d\n", 0); +#endif +} +static BRC_DEVICE_ATTR(gc_timer, S_IRUGO, show_gc_timer, NULL); + +static ssize_t show_group_addr(struct class_device *d, + char *buf) +{ +#if 0 + struct datapath *dp = dp_dev_get_dp(to_net_dev(d)); + return sprintf(buf, "%x:%x:%x:%x:%x:%x\n", + br->group_addr[0], br->group_addr[1], + br->group_addr[2], br->group_addr[3], + br->group_addr[4], br->group_addr[5]); +#else + return sprintf(buf, "00:01:02:03:04:05\n"); +#endif +} + +static ssize_t store_group_addr(struct class_device *d, + const char *buf, size_t len) +{ + struct datapath *dp = dp_dev_get_dp(to_net_dev(d)); +#if 0 + unsigned new_addr[6]; + int i; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (sscanf(buf, "%x:%x:%x:%x:%x:%x", + &new_addr[0], &new_addr[1], &new_addr[2], + &new_addr[3], &new_addr[4], &new_addr[5]) != 6) + return -EINVAL; + + /* Must be 01:80:c2:00:00:0X */ + for (i = 0; i < 5; i++) + if (new_addr[i] != br_group_address[i]) + return -EINVAL; + + if (new_addr[5] & ~0xf) + return -EINVAL; + + if (new_addr[5] == 1 /* 802.3x Pause address */ + || new_addr[5] == 2 /* 802.3ad Slow protocols */ + || new_addr[5] == 3) /* 802.1X PAE address */ + return -EINVAL; + + spin_lock_bh(&br->lock); + for (i = 0; i < 6; i++) + br->group_addr[i] = new_addr[i]; + spin_unlock_bh(&br->lock); +#else + printk("%s: xxx attempt to store_group_addr()\n", dp_name(dp)); +#endif + return len; +} + +static BRC_DEVICE_ATTR(group_addr, S_IRUGO | S_IWUSR, + show_group_addr, store_group_addr); + +static struct attribute *bridge_attrs[] = { + &class_device_attr_forward_delay.attr, + &class_device_attr_hello_time.attr, + &class_device_attr_max_age.attr, + &class_device_attr_ageing_time.attr, + &class_device_attr_stp_state.attr, + &class_device_attr_priority.attr, + &class_device_attr_bridge_id.attr, + &class_device_attr_root_id.attr, + &class_device_attr_root_path_cost.attr, + &class_device_attr_root_port.attr, + &class_device_attr_topology_change.attr, + &class_device_attr_topology_change_detected.attr, + &class_device_attr_hello_timer.attr, + &class_device_attr_tcn_timer.attr, + &class_device_attr_topology_change_timer.attr, + &class_device_attr_gc_timer.attr, + &class_device_attr_group_addr.attr, + NULL +}; + +static struct attribute_group bridge_group = { + .name = SYSFS_BRIDGE_ATTR, + .attrs = bridge_attrs, +}; + +/* + * Add entries in sysfs onto the existing network class device + * for the bridge. + * Adds a attribute group "bridge" containing tuning parameters. + * Sub directory to hold links to interfaces. + * + * Note: the ifobj exists only to be a subdirectory + * to hold links. The ifobj exists in the same data structure + * as its parent the bridge so reference counting works. + */ +int brc_sysfs_add_dp(struct datapath *dp) +{ + struct kobject *kobj = to_kobj(dp->ports[ODPP_LOCAL]->dev); + int err; + + err = sysfs_create_group(kobj, &bridge_group); + if (err) { + pr_info("%s: can't create group %s/%s\n", + __func__, dp_name(dp), bridge_group.name); + goto out1; + } + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) + kobject_set_name(&dp->ifobj, SYSFS_BRIDGE_PORT_SUBDIR); + dp->ifobj.ktype = NULL; + dp->ifobj.kset = NULL; + dp->ifobj.parent = kobj; + + err = kobject_register(&dp->ifobj); + if (err) { + pr_info("%s: can't add kobject (directory) %s/%s\n", + __FUNCTION__, dp_name(dp), dp->ifobj.name); + goto out2; + } +#else + br->ifobj = kobject_create_and_add(SYSFS_BRIDGE_PORT_SUBDIR, kobj); + if (!br->ifobj) { + pr_info("%s: can't add kobject (directory) %s/%s\n", + __func__, dp_name(dp), SYSFS_BRIDGE_PORT_SUBDIR); + goto out2; + } +#endif + return 0; + + out2: + sysfs_remove_group(kobj, &bridge_group); + out1: + return err; +} + +int brc_sysfs_del_dp(struct datapath *dp) +{ + struct kobject *kobj = to_kobj(dp->ports[ODPP_LOCAL]->dev); + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) + kobject_unregister(&dp->ifobj); +#else + kobject_put(dp->ifobj); +#endif + sysfs_remove_group(kobj, &bridge_group); + + return 0; +} +#else /* !SUPPORT_SYSFS */ +int brc_sysfs_add_dp(struct datapath *dp) { return 0; } +int brc_sysfs_del_dp(struct datapath *dp) { return 0; } +int brc_sysfs_add_if(struct net_bridge_port *p) { return 0; } +int brc_sysfs_del_if(struct net_bridge_port *p) +{ + dev_put(p->dev); + kfree(p); + return 0; +} +#endif /* !SUPPORT_SYSFS */ diff --git a/datapath/brc_sysfs_if.c b/datapath/brc_sysfs_if.c new file mode 100644 index 00000000..20bb109b --- /dev/null +++ b/datapath/brc_sysfs_if.c @@ -0,0 +1,334 @@ +/* + * Sysfs attributes of bridge ports for Open vSwitch + * + * This has been shamelessly copied from the kernel sources. + */ + +#include +#include +#include +#include +#include +#include +#include "brc_sysfs.h" +#include "datapath.h" + +#ifdef SUPPORT_SYSFS + +struct brport_attribute { + struct attribute attr; + ssize_t (*show)(struct net_bridge_port *, char *); + ssize_t (*store)(struct net_bridge_port *, unsigned long); +}; + +#define BRPORT_ATTR(_name,_mode,_show,_store) \ +struct brport_attribute brport_attr_##_name = { \ + .attr = {.name = __stringify(_name), \ + .mode = _mode, \ + .owner = THIS_MODULE, }, \ + .show = _show, \ + .store = _store, \ +}; + +static ssize_t show_path_cost(struct net_bridge_port *p, char *buf) +{ +#if 0 + return sprintf(buf, "%d\n", p->path_cost); +#else + return sprintf(buf, "%d\n", 0); +#endif +} +static ssize_t store_path_cost(struct net_bridge_port *p, unsigned long v) +{ +#if 0 + br_stp_set_path_cost(p, v); +#endif + return 0; +} +static BRPORT_ATTR(path_cost, S_IRUGO | S_IWUSR, + show_path_cost, store_path_cost); + +static ssize_t show_priority(struct net_bridge_port *p, char *buf) +{ +#if 0 + return sprintf(buf, "%d\n", p->priority); +#else + return sprintf(buf, "%d\n", 0); +#endif +} +static ssize_t store_priority(struct net_bridge_port *p, unsigned long v) +{ +#if 0 + if (v >= (1<<(16-BR_PORT_BITS))) + return -ERANGE; + br_stp_set_port_priority(p, v); +#endif + return 0; +} +static BRPORT_ATTR(priority, S_IRUGO | S_IWUSR, + show_priority, store_priority); + +static ssize_t show_designated_root(struct net_bridge_port *p, char *buf) +{ +#if 0 + return br_show_bridge_id(buf, &p->designated_root); +#else + return sprintf(buf, "0000.010203040506\n"); +#endif +} +static BRPORT_ATTR(designated_root, S_IRUGO, show_designated_root, NULL); + +static ssize_t show_designated_bridge(struct net_bridge_port *p, char *buf) +{ +#if 0 + return br_show_bridge_id(buf, &p->designated_bridge); +#else + return sprintf(buf, "0000.060504030201\n"); +#endif +} +static BRPORT_ATTR(designated_bridge, S_IRUGO, show_designated_bridge, NULL); + +static ssize_t show_designated_port(struct net_bridge_port *p, char *buf) +{ +#if 0 + return sprintf(buf, "%d\n", p->designated_port); +#else + return sprintf(buf, "%d\n", 0); +#endif +} +static BRPORT_ATTR(designated_port, S_IRUGO, show_designated_port, NULL); + +static ssize_t show_designated_cost(struct net_bridge_port *p, char *buf) +{ +#if 0 + return sprintf(buf, "%d\n", p->designated_cost); +#else + return sprintf(buf, "%d\n", 0); +#endif +} +static BRPORT_ATTR(designated_cost, S_IRUGO, show_designated_cost, NULL); + +static ssize_t show_port_id(struct net_bridge_port *p, char *buf) +{ +#if 0 + return sprintf(buf, "0x%x\n", p->port_id); +#else + return sprintf(buf, "0x%x\n", 0); +#endif +} +static BRPORT_ATTR(port_id, S_IRUGO, show_port_id, NULL); + +static ssize_t show_port_no(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "0x%x\n", p->port_no); +} + +static BRPORT_ATTR(port_no, S_IRUGO, show_port_no, NULL); + +static ssize_t show_change_ack(struct net_bridge_port *p, char *buf) +{ +#if 0 + return sprintf(buf, "%d\n", p->topology_change_ack); +#else + return sprintf(buf, "%d\n", 0); +#endif +} +static BRPORT_ATTR(change_ack, S_IRUGO, show_change_ack, NULL); + +static ssize_t show_config_pending(struct net_bridge_port *p, char *buf) +{ +#if 0 + return sprintf(buf, "%d\n", p->config_pending); +#else + return sprintf(buf, "%d\n", 0); +#endif +} +static BRPORT_ATTR(config_pending, S_IRUGO, show_config_pending, NULL); + +static ssize_t show_port_state(struct net_bridge_port *p, char *buf) +{ +#if 0 + return sprintf(buf, "%d\n", p->state); +#else + return sprintf(buf, "%d\n", 0); +#endif +} +static BRPORT_ATTR(state, S_IRUGO, show_port_state, NULL); + +static ssize_t show_message_age_timer(struct net_bridge_port *p, + char *buf) +{ +#if 0 + return sprintf(buf, "%ld\n", br_timer_value(&p->message_age_timer)); +#else + return sprintf(buf, "%d\n", 0); +#endif +} +static BRPORT_ATTR(message_age_timer, S_IRUGO, show_message_age_timer, NULL); + +static ssize_t show_forward_delay_timer(struct net_bridge_port *p, + char *buf) +{ +#if 0 + return sprintf(buf, "%ld\n", br_timer_value(&p->forward_delay_timer)); +#else + return sprintf(buf, "%d\n", 0); +#endif +} +static BRPORT_ATTR(forward_delay_timer, S_IRUGO, show_forward_delay_timer, NULL); + +static ssize_t show_hold_timer(struct net_bridge_port *p, + char *buf) +{ +#if 0 + return sprintf(buf, "%ld\n", br_timer_value(&p->hold_timer)); +#else + return sprintf(buf, "%d\n", 0); +#endif +} +static BRPORT_ATTR(hold_timer, S_IRUGO, show_hold_timer, NULL); + +static struct brport_attribute *brport_attrs[] = { + &brport_attr_path_cost, + &brport_attr_priority, + &brport_attr_port_id, + &brport_attr_port_no, + &brport_attr_designated_root, + &brport_attr_designated_bridge, + &brport_attr_designated_port, + &brport_attr_designated_cost, + &brport_attr_state, + &brport_attr_change_ack, + &brport_attr_config_pending, + &brport_attr_message_age_timer, + &brport_attr_forward_delay_timer, + &brport_attr_hold_timer, + NULL +}; + +#define to_brport_attr(_at) container_of(_at, struct brport_attribute, attr) +#define to_brport(obj) container_of(obj, struct net_bridge_port, kobj) + +static ssize_t brport_show(struct kobject * kobj, + struct attribute * attr, char * buf) +{ + struct brport_attribute * brport_attr = to_brport_attr(attr); + struct net_bridge_port * p = to_brport(kobj); + + return brport_attr->show(p, buf); +} + +static ssize_t brport_store(struct kobject * kobj, + struct attribute * attr, + const char * buf, size_t count) +{ + struct net_bridge_port * p = to_brport(kobj); +#if 0 + struct brport_attribute * brport_attr = to_brport_attr(attr); + char *endp; + unsigned long val; +#endif + ssize_t ret = -EINVAL; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + +#if 0 + val = simple_strtoul(buf, &endp, 0); + if (endp != buf) { + rtnl_lock(); + if (p->dev && p->br && brport_attr->store) { + spin_lock_bh(&p->br->lock); + ret = brport_attr->store(p, val); + spin_unlock_bh(&p->br->lock); + if (ret == 0) + ret = count; + } + rtnl_unlock(); + } +#else + printk("%s: xxx writing port parms not supported yet!\n", + dp_name(p->dp)); +#endif + return ret; +} + +struct sysfs_ops brport_sysfs_ops = { + .show = brport_show, + .store = brport_store, +}; + +static void release_nbp(struct kobject *kobj) +{ + struct net_bridge_port *p + = container_of(kobj, struct net_bridge_port, kobj); + kfree(p); +} + +struct kobj_type brport_ktype = { + .sysfs_ops = &brport_sysfs_ops, + .release = release_nbp +}; + +/* + * Add sysfs entries to ethernet device added to a bridge. + * Creates a brport subdirectory with bridge attributes. + * Puts symlink in bridge's brport subdirectory + */ +int brc_sysfs_add_if(struct net_bridge_port *p) +{ + struct datapath *dp = p->dp; + struct brport_attribute **a; + int err; + + kobject_init(&p->kobj); + kobject_set_name(&p->kobj, SYSFS_BRIDGE_PORT_ATTR); + p->kobj.ktype = &brport_ktype; + p->kobj.kset = NULL; + p->kobj.parent = &(p->dev->class_dev.kobj); + + err = kobject_add(&p->kobj); + if (err) + goto err_put; + + err = sysfs_create_link(&p->kobj, + &dp->ports[ODPP_LOCAL]->dev->class_dev.kobj, + SYSFS_BRIDGE_PORT_LINK); + if (err) + goto err_del; + + for (a = brport_attrs; *a; ++a) { + err = sysfs_create_file(&p->kobj, &((*a)->attr)); + if (err) + goto err_del; + } + + err = sysfs_create_link(&dp->ifobj, &p->kobj, p->dev->name); + if (err) + goto err_del; + + kobject_uevent(&p->kobj, KOBJ_ADD); + + return err; + +err_del: + kobject_del(&p->kobj); +err_put: + kobject_put(&p->kobj); + return err; +} + +int brc_sysfs_del_if(struct net_bridge_port *p) +{ + struct net_device *dev = p->dev; + + kobject_uevent(&p->kobj, KOBJ_REMOVE); + kobject_del(&p->kobj); + + dev_put(dev); + + kobject_put(&p->kobj); + + return 0; +} +#endif /* SUPPORT_SYSFS */ diff --git a/datapath/brcompat.c b/datapath/brcompat.c new file mode 100644 index 00000000..2e437ccd --- /dev/null +++ b/datapath/brcompat.c @@ -0,0 +1,519 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "compat.h" +#include "openvswitch/brcompat-netlink.h" +#include "brc_procfs.h" +#include "brc_sysfs.h" +#include "datapath.h" +#include "dp_dev.h" + +static struct genl_family brc_genl_family; +static struct genl_multicast_group brc_mc_group; + +/* Time to wait for ovs-vswitchd to respond to a datapath action, in + * jiffies. */ +#define BRC_TIMEOUT (HZ * 5) + +/* Mutex to serialize ovs-brcompatd callbacks. (Some callbacks naturally hold + * br_ioctl_mutex, others hold rtnl_lock, but we can't take the former + * ourselves and we don't want to hold the latter over a potentially long + * period of time.) */ +static DEFINE_MUTEX(brc_serial); + +/* Userspace communication. */ +static DEFINE_SPINLOCK(brc_lock); /* Ensure atomic access to these vars. */ +static DECLARE_COMPLETION(brc_done); /* Userspace signaled operation done? */ +static int brc_err; /* Error code from userspace. */ +static u32 brc_seq; /* Sequence number for current op. */ + +static int brc_send_command(const char *bridge, const char *port, int op); + +static int +get_dp_ifindices(int *indices, int num) +{ + int i, index = 0; + + rcu_read_lock(); + for (i=0; i < ODP_MAX && index < num; i++) { + struct datapath *dp = get_dp(i); + if (!dp) + continue; + indices[index++] = dp->ports[ODPP_LOCAL]->dev->ifindex; + } + rcu_read_unlock(); + + return index; +} + +static void +get_port_ifindices(struct datapath *dp, int *ifindices, int num) +{ + struct net_bridge_port *p; + + rcu_read_lock(); + list_for_each_entry_rcu (p, &dp->port_list, node) { + if (p->port_no < num) + ifindices[p->port_no] = p->dev->ifindex; + } + rcu_read_unlock(); +} + +static int brc_add_del_bridge(char __user *uname, int add) +{ + char name[IFNAMSIZ]; + + if (copy_from_user(name, uname, IFNAMSIZ)) + return -EFAULT; + + name[IFNAMSIZ - 1] = 0; + return brc_send_command(name, NULL, + add ? BRC_GENL_C_DP_ADD : BRC_GENL_C_DP_DEL); +} + +static int brc_get_bridges(int __user *uindices, int n) +{ + int *indices; + int ret; + + if (n >= 2048) + return -ENOMEM; + + indices = kcalloc(n, sizeof(int), GFP_KERNEL); + if (indices == NULL) + return -ENOMEM; + + n = get_dp_ifindices(indices, n); + + ret = copy_to_user(uindices, indices, n * sizeof(int)) ? -EFAULT : n; + + kfree(indices); + return ret; +} + +/* Legacy deviceless bridge ioctl's. Called with br_ioctl_mutex. */ +static int +old_deviceless(void __user *uarg) +{ + unsigned long args[3]; + + if (copy_from_user(args, uarg, sizeof(args))) + return -EFAULT; + + switch (args[0]) { + case BRCTL_GET_BRIDGES: + return brc_get_bridges((int __user *)args[1], args[2]); + + case BRCTL_ADD_BRIDGE: + return brc_add_del_bridge((void __user *)args[1], 1); + case BRCTL_DEL_BRIDGE: + return brc_add_del_bridge((void __user *)args[1], 0); + } + + return -EOPNOTSUPP; +} + +/* Called with the br_ioctl_mutex. */ +static int +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) +brc_ioctl_deviceless_stub(unsigned int cmd, void __user *uarg) +#else +brc_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg) +#endif +{ + switch (cmd) { + case SIOCGIFBR: + case SIOCSIFBR: + return old_deviceless(uarg); + + case SIOCBRADDBR: + return brc_add_del_bridge(uarg, 1); + case SIOCBRDELBR: + return brc_add_del_bridge(uarg, 0); + } + + return -EOPNOTSUPP; +} + +static int +brc_add_del_port(struct net_device *dev, int port_ifindex, int add) +{ + struct net_device *port; + char dev_name[IFNAMSIZ], port_name[IFNAMSIZ]; + int err; + + port = __dev_get_by_index(&init_net, port_ifindex); + if (!port) + return -EINVAL; + + /* Save name of dev and port because there's a race between the + * rtnl_unlock() and the brc_send_command(). */ + strcpy(dev_name, dev->name); + strcpy(port_name, port->name); + + rtnl_unlock(); + err = brc_send_command(dev_name, port_name, + add ? BRC_GENL_C_PORT_ADD : BRC_GENL_C_PORT_DEL); + rtnl_lock(); + + return err; +} + +static int +brc_get_bridge_info(struct net_device *dev, struct __bridge_info __user *ub) +{ + struct __bridge_info b; + u64 id = 0; + int i; + + memset(&b, 0, sizeof(struct __bridge_info)); + + for (i=0; idev_addr[i] << (8*(ETH_ALEN-1 - i)); + b.bridge_id = cpu_to_be64(id); + b.stp_enabled = 0; + + if (copy_to_user(ub, &b, sizeof(struct __bridge_info))) + return -EFAULT; + + return 0; +} + +static int +brc_get_port_list(struct net_device *dev, int __user *uindices, int num) +{ + struct dp_dev *dp_dev = netdev_priv(dev); + struct datapath *dp = dp_dev->dp; + int *indices; + + if (num < 0) + return -EINVAL; + if (num == 0) + num = 256; + if (num > DP_MAX_PORTS) + num = DP_MAX_PORTS; + + indices = kcalloc(num, sizeof(int), GFP_KERNEL); + if (indices == NULL) + return -ENOMEM; + + get_port_ifindices(dp, indices, num); + if (copy_to_user(uindices, indices, num * sizeof(int))) + num = -EFAULT; + kfree(indices); + return num; +} + +/* Legacy ioctl's through SIOCDEVPRIVATE. Called with rtnl_lock. */ +static int +old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) +{ + unsigned long args[4]; + + if (copy_from_user(args, rq->ifr_data, sizeof(args))) + return -EFAULT; + + switch (args[0]) { + case BRCTL_ADD_IF: + return brc_add_del_port(dev, args[1], 1); + case BRCTL_DEL_IF: + return brc_add_del_port(dev, args[1], 0); + + case BRCTL_GET_BRIDGE_INFO: + return brc_get_bridge_info(dev, (struct __bridge_info __user *)args[1]); + + case BRCTL_GET_PORT_LIST: + return brc_get_port_list(dev, (int __user *)args[1], args[2]); + } + + return -EOPNOTSUPP; +} + +/* Called with the rtnl_lock. */ +static int +brc_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) +{ + int err; + + switch (cmd) { + case SIOCDEVPRIVATE: + err = old_dev_ioctl(dev, rq, cmd); + break; + + case SIOCBRADDIF: + return brc_add_del_port(dev, rq->ifr_ifindex, 1); + case SIOCBRDELIF: + return brc_add_del_port(dev, rq->ifr_ifindex, 0); + + default: + err = -EOPNOTSUPP; + break; + } + + return err; +} + + +static struct genl_family brc_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = BRC_GENL_FAMILY_NAME, + .version = 1, + .maxattr = BRC_GENL_A_MAX, +}; + +static int brc_genl_query(struct sk_buff *skb, struct genl_info *info) +{ + int err = -EINVAL; + struct sk_buff *ans_skb; + void *data; + + ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!ans_skb) + return -ENOMEM; + + data = genlmsg_put_reply(ans_skb, info, &brc_genl_family, + 0, BRC_GENL_C_QUERY_MC); + if (data == NULL) { + err = -ENOMEM; + goto err; + } + NLA_PUT_U32(ans_skb, BRC_GENL_A_MC_GROUP, brc_mc_group.id); + + genlmsg_end(ans_skb, data); + return genlmsg_reply(ans_skb, info); + +err: +nla_put_failure: + kfree_skb(ans_skb); + return err; +} + +static struct genl_ops brc_genl_ops_query_dp = { + .cmd = BRC_GENL_C_QUERY_MC, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privelege. */ + .policy = NULL, + .doit = brc_genl_query, + .dumpit = NULL +}; + +/* Attribute policy: what each attribute may contain. */ +static struct nla_policy brc_genl_policy[BRC_GENL_A_MAX + 1] = { + [BRC_GENL_A_ERR_CODE] = { .type = NLA_U32 }, + [BRC_GENL_A_PROC_DIR] = { .type = NLA_NUL_STRING }, + [BRC_GENL_A_PROC_NAME] = { .type = NLA_NUL_STRING }, + [BRC_GENL_A_PROC_DATA] = { .type = NLA_NUL_STRING }, +}; + +static int +brc_genl_dp_result(struct sk_buff *skb, struct genl_info *info) +{ + unsigned long int flags; + int err; + + if (!info->attrs[BRC_GENL_A_ERR_CODE]) + return -EINVAL; + + spin_lock_irqsave(&brc_lock, flags); + if (brc_seq == info->snd_seq) { + brc_err = nla_get_u32(info->attrs[BRC_GENL_A_ERR_CODE]); + complete(&brc_done); + err = 0; + } else { + err = -ESTALE; + } + spin_unlock_irqrestore(&brc_lock, flags); + + return err; +} + +static struct genl_ops brc_genl_ops_dp_result = { + .cmd = BRC_GENL_C_DP_RESULT, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privelege. */ + .policy = brc_genl_policy, + .doit = brc_genl_dp_result, + .dumpit = NULL +}; + +static struct genl_ops brc_genl_ops_set_proc = { + .cmd = BRC_GENL_C_SET_PROC, + .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privelege. */ + .policy = brc_genl_policy, + .doit = brc_genl_set_proc, + .dumpit = NULL +}; + +static int brc_send_command(const char *bridge, const char *port, int op) +{ + unsigned long int flags; + struct sk_buff *skb; + void *data; + int error; + + mutex_lock(&brc_serial); + + /* Increment sequence number first, so that we ignore any replies + * to stale requests. */ + spin_lock_irqsave(&brc_lock, flags); + brc_seq++; + INIT_COMPLETION(brc_done); + spin_unlock_irqrestore(&brc_lock, flags); + + /* Compose message. */ + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + error = -ENOMEM; + if (skb == NULL) + goto exit_unlock; + data = genlmsg_put(skb, 0, brc_seq, &brc_genl_family, 0, op); + + NLA_PUT_STRING(skb, BRC_GENL_A_DP_NAME, bridge); + if (port) + NLA_PUT_STRING(skb, BRC_GENL_A_PORT_NAME, port); + + genlmsg_end(skb, data); + + /* Send message. */ + error = genlmsg_multicast(skb, 0, brc_mc_group.id, GFP_KERNEL); + if (error < 0) + goto exit_unlock; + + /* Wait for reply. */ + error = -ETIMEDOUT; + if (!wait_for_completion_timeout(&brc_done, BRC_TIMEOUT)) + goto exit_unlock; + + error = -brc_err; + goto exit_unlock; + +nla_put_failure: + kfree_skb(skb); +exit_unlock: + mutex_unlock(&brc_serial); + return error; +} + +int brc_add_dp(struct datapath *dp) +{ + if (!try_module_get(THIS_MODULE)) + return -ENODEV; +#ifdef SUPPORT_SYSFS + brc_sysfs_add_dp(dp); +#endif + + return 0; +} + +int brc_del_dp(struct datapath *dp) +{ +#ifdef SUPPORT_SYSFS + brc_sysfs_del_dp(dp); +#endif + module_put(THIS_MODULE); + + return 0; +} + +static int +__init brc_init(void) +{ + int i; + int err; + + printk("Open vSwitch Bridge Compatibility, built "__DATE__" "__TIME__"\n"); + + rcu_read_lock(); + for (i=0; i + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) + +#include "compat26.h" + +#else + +#include "compat24.h" + +#endif + + +#endif /* compat.h */ diff --git a/datapath/datapath.c b/datapath/datapath.c new file mode 100644 index 00000000..015edc4b --- /dev/null +++ b/datapath/datapath.c @@ -0,0 +1,1611 @@ +/* + * Distributed under the terms of the GNU GPL version 2. + * Copyright (c) 2007, 2008, 2009 Nicira Networks. + */ + +/* Functions for managing the dp interface/device. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "openvswitch/datapath-protocol.h" +#include "datapath.h" +#include "actions.h" +#include "dp_dev.h" +#include "flow.h" + +#include "compat.h" + + +int (*dp_ioctl_hook)(struct net_device *dev, struct ifreq *rq, int cmd); +EXPORT_SYMBOL(dp_ioctl_hook); + +int (*dp_add_dp_hook)(struct datapath *dp); +EXPORT_SYMBOL(dp_add_dp_hook); + +int (*dp_del_dp_hook)(struct datapath *dp); +EXPORT_SYMBOL(dp_del_dp_hook); + +int (*dp_add_if_hook)(struct net_bridge_port *p); +EXPORT_SYMBOL(dp_add_if_hook); + +int (*dp_del_if_hook)(struct net_bridge_port *p); +EXPORT_SYMBOL(dp_del_if_hook); + +/* Datapaths. Protected on the read side by rcu_read_lock, on the write side + * by dp_mutex. dp_mutex is almost completely redundant with genl_mutex + * maintained by the Generic Netlink code, but the timeout path needs mutual + * exclusion too. + * + * dp_mutex nests inside the RTNL lock: if you need both you must take the RTNL + * lock first. + * + * It is safe to access the datapath and net_bridge_port structures with just + * dp_mutex. + */ +static struct datapath *dps[ODP_MAX]; +static DEFINE_MUTEX(dp_mutex); + +/* Number of milliseconds between runs of the maintenance thread. */ +#define MAINT_SLEEP_MSECS 1000 + +static int new_nbp(struct datapath *, struct net_device *, int port_no); + +/* Must be called with rcu_read_lock or dp_mutex. */ +struct datapath *get_dp(int dp_idx) +{ + if (dp_idx < 0 || dp_idx >= ODP_MAX) + return NULL; + return rcu_dereference(dps[dp_idx]); +} +EXPORT_SYMBOL_GPL(get_dp); + +struct datapath *get_dp_locked(int dp_idx) +{ + struct datapath *dp; + + mutex_lock(&dp_mutex); + dp = get_dp(dp_idx); + if (dp) + mutex_lock(&dp->mutex); + mutex_unlock(&dp_mutex); + return dp; +} + +static inline size_t br_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + + nla_total_size(4) /* IFLA_MASTER */ + + nla_total_size(4) /* IFLA_MTU */ + + nla_total_size(4) /* IFLA_LINK */ + + nla_total_size(1); /* IFLA_OPERSTATE */ +} + +static int dp_fill_ifinfo(struct sk_buff *skb, + const struct net_bridge_port *port, + int event, unsigned int flags) +{ + const struct datapath *dp = port->dp; + const struct net_device *dev = port->dev; + struct ifinfomsg *hdr; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(skb, 0, 0, event, sizeof(*hdr), flags); + if (nlh == NULL) + return -EMSGSIZE; + + hdr = nlmsg_data(nlh); + hdr->ifi_family = AF_BRIDGE; + hdr->__ifi_pad = 0; + hdr->ifi_type = dev->type; + hdr->ifi_index = dev->ifindex; + hdr->ifi_flags = dev_get_flags(dev); + hdr->ifi_change = 0; + + NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name); + NLA_PUT_U32(skb, IFLA_MASTER, dp->ports[ODPP_LOCAL]->dev->ifindex); + NLA_PUT_U32(skb, IFLA_MTU, dev->mtu); +#ifdef IFLA_OPERSTATE + NLA_PUT_U8(skb, IFLA_OPERSTATE, + netif_running(dev) ? dev->operstate : IF_OPER_DOWN); +#endif + + if (dev->addr_len) + NLA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); + + if (dev->ifindex != dev->iflink) + NLA_PUT_U32(skb, IFLA_LINK, dev->iflink); + + return nlmsg_end(skb, nlh); + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static void dp_ifinfo_notify(int event, struct net_bridge_port *port) +{ + struct net *net = dev_net(port->dev); + struct sk_buff *skb; + int err = -ENOBUFS; + + skb = nlmsg_new(br_nlmsg_size(), GFP_KERNEL); + if (skb == NULL) + goto errout; + + err = dp_fill_ifinfo(skb, port, event, 0); + if (err < 0) { + /* -EMSGSIZE implies BUG in br_nlmsg_size() */ + WARN_ON(err == -EMSGSIZE); + kfree_skb(skb); + goto errout; + } + err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); +errout: + if (err < 0) + rtnl_set_sk_err(net, RTNLGRP_LINK, err); +} + +static int create_dp(int dp_idx, const char __user *devnamep) +{ + struct net_device *dp_dev; + char devname[IFNAMSIZ]; + struct datapath *dp; + int err; + int i; + + if (devnamep) { + err = -EFAULT; + if (strncpy_from_user(devname, devnamep, IFNAMSIZ - 1) < 0) + goto err; + devname[IFNAMSIZ - 1] = '\0'; + } else { + snprintf(devname, sizeof devname, "of%d", dp_idx); + } + + rtnl_lock(); + mutex_lock(&dp_mutex); + err = -ENODEV; + if (!try_module_get(THIS_MODULE)) + goto err_unlock; + + /* Exit early if a datapath with that number already exists. + * (We don't use -EEXIST because that's ambiguous with 'devname' + * conflicting with an existing network device name.) */ + err = -EBUSY; + if (get_dp(dp_idx)) + goto err_put_module; + + err = -ENOMEM; + dp = kzalloc(sizeof *dp, GFP_KERNEL); + if (dp == NULL) + goto err_put_module; + + mutex_init(&dp->mutex); + dp->dp_idx = dp_idx; + for (i = 0; i < DP_N_QUEUES; i++) + skb_queue_head_init(&dp->queues[i]); + init_waitqueue_head(&dp->waitqueue); + + /* Setup our datapath device */ + dp_dev = dp_dev_create(dp, devname, ODPP_LOCAL); + err = PTR_ERR(dp_dev); + if (IS_ERR(dp_dev)) + goto err_free_dp; + + err = -ENOMEM; + rcu_assign_pointer(dp->table, dp_table_create(DP_L1_SIZE)); + if (!dp->table) + goto err_destroy_dp_dev; + INIT_LIST_HEAD(&dp->port_list); + + err = new_nbp(dp, dp_dev, ODPP_LOCAL); + if (err) + goto err_destroy_table; + + dp->drop_frags = 0; + dp->stats_percpu = alloc_percpu(struct dp_stats_percpu); + if (!dp->stats_percpu) + goto err_destroy_local_port; + + rcu_assign_pointer(dps[dp_idx], dp); + mutex_unlock(&dp_mutex); + rtnl_unlock(); + + if (dp_add_dp_hook) + dp_add_dp_hook(dp); + + return 0; + +err_destroy_local_port: + dp_del_port(dp->ports[ODPP_LOCAL], NULL); +err_destroy_table: + dp_table_destroy(dp->table, 0); +err_destroy_dp_dev: + dp_dev_destroy(dp_dev); +err_free_dp: + kfree(dp); +err_put_module: + module_put(THIS_MODULE); +err_unlock: + mutex_unlock(&dp_mutex); + rtnl_unlock(); +err: + return err; +} + +static void do_destroy_dp(struct datapath *dp, struct list_head *dp_devs) +{ + struct net_bridge_port *p, *n; + int i; + + if (dp_del_dp_hook) + dp_del_dp_hook(dp); + + /* Drop references to DP. */ + list_for_each_entry_safe (p, n, &dp->port_list, node) + dp_del_port(p, dp_devs); + + rcu_assign_pointer(dps[dp->dp_idx], NULL); + synchronize_rcu(); + + /* Wait until no longer in use, then destroy it. */ + synchronize_rcu(); + dp_table_destroy(dp->table, 1); + for (i = 0; i < DP_N_QUEUES; i++) + skb_queue_purge(&dp->queues[i]); + for (i = 0; i < DP_MAX_GROUPS; i++) + kfree(dp->groups[i]); + free_percpu(dp->stats_percpu); + kfree(dp); + module_put(THIS_MODULE); +} + +static int destroy_dp(int dp_idx) +{ + struct dp_dev *dp_dev, *next; + struct datapath *dp; + LIST_HEAD(dp_devs); + int err; + + rtnl_lock(); + mutex_lock(&dp_mutex); + dp = get_dp(dp_idx); + err = -ENODEV; + if (!dp) + goto err_unlock; + + do_destroy_dp(dp, &dp_devs); + err = 0; + +err_unlock: + mutex_unlock(&dp_mutex); + rtnl_unlock(); + list_for_each_entry_safe (dp_dev, next, &dp_devs, list) + free_netdev(dp_dev->dev); + return err; +} + +/* Called with RTNL lock and dp_mutex. */ +static int new_nbp(struct datapath *dp, struct net_device *dev, int port_no) +{ + struct net_bridge_port *p; + + if (dev->br_port != NULL) + return -EBUSY; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + dev_set_promiscuity(dev, 1); + dev_hold(dev); + p->port_no = port_no; + p->dp = dp; + p->dev = dev; + if (!is_dp_dev(dev)) + rcu_assign_pointer(dev->br_port, p); + else { + /* It would make sense to assign dev->br_port here too, but + * that causes packets received on internal ports to get caught + * in dp_frame_hook(). In turn dp_frame_hook() can reject them + * back to network stack, but that's a waste of time. */ + } + rcu_assign_pointer(dp->ports[port_no], p); + list_add_rcu(&p->node, &dp->port_list); + dp->n_ports++; + + dp_ifinfo_notify(RTM_NEWLINK, p); + + return 0; +} + +static int add_port(int dp_idx, struct odp_port __user *portp) +{ + struct net_device *dev; + struct datapath *dp; + struct odp_port port; + int port_no; + int err; + + err = -EFAULT; + if (copy_from_user(&port, portp, sizeof port)) + goto out; + port.devname[IFNAMSIZ - 1] = '\0'; + port_no = port.port; + + err = -EINVAL; + if (port_no < 0 || port_no >= DP_MAX_PORTS) + goto out; + + rtnl_lock(); + dp = get_dp_locked(dp_idx); + err = -ENODEV; + if (!dp) + goto out_unlock_rtnl; + + err = -EEXIST; + if (dp->ports[port_no]) + goto out_unlock_dp; + + if (!(port.flags & ODP_PORT_INTERNAL)) { + err = -ENODEV; + dev = dev_get_by_name(&init_net, port.devname); + if (!dev) + goto out_unlock_dp; + + err = -EINVAL; + if (dev->flags & IFF_LOOPBACK || dev->type != ARPHRD_ETHER || + is_dp_dev(dev)) + goto out_put; + } else { + dev = dp_dev_create(dp, port.devname, port_no); + err = PTR_ERR(dev); + if (IS_ERR(dev)) + goto out_unlock_dp; + dev_hold(dev); + } + + err = new_nbp(dp, dev, port_no); + if (err) + goto out_put; + + if (dp_add_if_hook) + dp_add_if_hook(dp->ports[port_no]); + +out_put: + dev_put(dev); +out_unlock_dp: + mutex_unlock(&dp->mutex); +out_unlock_rtnl: + rtnl_unlock(); +out: + return err; +} + +int dp_del_port(struct net_bridge_port *p, struct list_head *dp_devs) +{ + ASSERT_RTNL(); + +#ifdef SUPPORT_SYSFS + if (p->port_no != ODPP_LOCAL && dp_del_if_hook) + sysfs_remove_link(&p->dp->ifobj, p->dev->name); +#endif + dp_ifinfo_notify(RTM_DELLINK, p); + + p->dp->n_ports--; + + if (is_dp_dev(p->dev)) { + /* Make sure that no packets arrive from now on, since + * dp_dev_xmit() will try to find itself through + * p->dp->ports[], and we're about to set that to null. */ + netif_tx_disable(p->dev); + } + + /* First drop references to device. */ + dev_set_promiscuity(p->dev, -1); + list_del_rcu(&p->node); + rcu_assign_pointer(p->dp->ports[p->port_no], NULL); + rcu_assign_pointer(p->dev->br_port, NULL); + + /* Then wait until no one is still using it, and destroy it. */ + synchronize_rcu(); + + if (is_dp_dev(p->dev)) { + dp_dev_destroy(p->dev); + if (dp_devs) { + struct dp_dev *dp_dev = dp_dev_priv(p->dev); + list_add(&dp_dev->list, dp_devs); + } + } + if (p->port_no != ODPP_LOCAL && dp_del_if_hook) { + dp_del_if_hook(p); + } else { + dev_put(p->dev); + kfree(p); + } + + return 0; +} + +static int del_port(int dp_idx, int port_no) +{ + struct dp_dev *dp_dev, *next; + struct net_bridge_port *p; + struct datapath *dp; + LIST_HEAD(dp_devs); + int err; + + err = -EINVAL; + if (port_no < 0 || port_no >= DP_MAX_PORTS || port_no == ODPP_LOCAL) + goto out; + + rtnl_lock(); + dp = get_dp_locked(dp_idx); + err = -ENODEV; + if (!dp) + goto out_unlock_rtnl; + + p = dp->ports[port_no]; + err = -ENOENT; + if (!p) + goto out_unlock_dp; + + err = dp_del_port(p, &dp_devs); + +out_unlock_dp: + mutex_unlock(&dp->mutex); +out_unlock_rtnl: + rtnl_unlock(); +out: + list_for_each_entry_safe (dp_dev, next, &dp_devs, list) + free_netdev(dp_dev->dev); + return err; +} + +/* Must be called with rcu_read_lock. */ +static void +do_port_input(struct net_bridge_port *p, struct sk_buff *skb) +{ + /* Make our own copy of the packet. Otherwise we will mangle the + * packet for anyone who came before us (e.g. tcpdump via AF_PACKET). + * (No one comes after us, since we tell handle_bridge() that we took + * the packet.) */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + return; + + /* Push the Ethernet header back on. */ + skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + dp_process_received_packet(skb, p); +} + +/* Must be called with rcu_read_lock and with bottom-halves disabled. */ +void dp_process_received_packet(struct sk_buff *skb, struct net_bridge_port *p) +{ + struct datapath *dp = p->dp; + struct dp_stats_percpu *stats; + struct odp_flow_key key; + struct sw_flow *flow; + + WARN_ON_ONCE(skb_shared(skb)); + WARN_ON_ONCE(skb->destructor); + + /* BHs are off so we don't have to use get_cpu()/put_cpu() here. */ + stats = percpu_ptr(dp->stats_percpu, smp_processor_id()); + + if (flow_extract(skb, p ? p->port_no : ODPP_NONE, &key)) { + if (dp->drop_frags) { + kfree_skb(skb); + stats->n_frags++; + return; + } + } + + flow = dp_table_lookup(rcu_dereference(dp->table), &key); + if (flow) { + struct sw_flow_actions *acts = rcu_dereference(flow->sf_acts); + flow_used(flow, skb); + execute_actions(dp, skb, &key, acts->actions, acts->n_actions, + GFP_ATOMIC); + stats->n_hit++; + } else { + stats->n_missed++; + dp_output_control(dp, skb, _ODPL_MISS_NR, 0); + } +} + +/* + * Used as br_handle_frame_hook. (Cannot run bridge at the same time, even on + * different set of devices!) + */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) +/* Called with rcu_read_lock and bottom-halves disabled. */ +static struct sk_buff *dp_frame_hook(struct net_bridge_port *p, + struct sk_buff *skb) +{ + do_port_input(p, skb); + return NULL; +} +#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) +/* Called with rcu_read_lock and bottom-halves disabled. */ +static int dp_frame_hook(struct net_bridge_port *p, struct sk_buff **pskb) +{ + do_port_input(p, *pskb); + return 1; +} +#else +#error +#endif + +#ifdef CONFIG_XEN +/* This code is copied verbatim from net/dev/core.c in Xen's + * linux-2.6.18-92.1.10.el5.xs5.0.0.394.644. We can't call those functions + * directly because they aren't exported. */ +static int skb_pull_up_to(struct sk_buff *skb, void *ptr) +{ + if (ptr < (void *)skb->tail) + return 1; + if (__pskb_pull_tail(skb, + ptr - (void *)skb->data - skb_headlen(skb))) { + return 1; + } else { + return 0; + } +} + +int skb_checksum_setup(struct sk_buff *skb) +{ + if (skb->proto_csum_blank) { + if (skb->protocol != htons(ETH_P_IP)) + goto out; + if (!skb_pull_up_to(skb, skb->nh.iph + 1)) + goto out; + skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl; + switch (skb->nh.iph->protocol) { + case IPPROTO_TCP: + skb->csum = offsetof(struct tcphdr, check); + break; + case IPPROTO_UDP: + skb->csum = offsetof(struct udphdr, check); + break; + default: + if (net_ratelimit()) + printk(KERN_ERR "Attempting to checksum a non-" + "TCP/UDP packet, dropping a protocol" + " %d packet", skb->nh.iph->protocol); + goto out; + } + if (!skb_pull_up_to(skb, skb->h.raw + skb->csum + 2)) + goto out; + skb->ip_summed = CHECKSUM_HW; + skb->proto_csum_blank = 0; + } + return 0; +out: + return -EPROTO; +} +#endif + +int +dp_output_control(struct datapath *dp, struct sk_buff *skb, int queue_no, + u32 arg) +{ + struct dp_stats_percpu *stats; + struct sk_buff_head *queue; + int port_no; + int err; + + WARN_ON_ONCE(skb_shared(skb)); + BUG_ON(queue_no != _ODPL_MISS_NR && queue_no != _ODPL_ACTION_NR); + + queue = &dp->queues[queue_no]; + err = -ENOBUFS; + if (skb_queue_len(queue) >= DP_MAX_QUEUE_LEN) + goto err_kfree_skb; + + /* If a checksum-deferred packet is forwarded to the controller, + * correct the pointers and checksum. This happens on a regular basis + * only on Xen (the CHECKSUM_HW case), on which VMs can pass up packets + * that do not have their checksum computed. We also implement it for + * the non-Xen case, but it is difficult to trigger or test this case + * there, hence the WARN_ON_ONCE(). + */ + err = skb_checksum_setup(skb); + if (err) + goto err_kfree_skb; +#ifndef CHECKSUM_HW + if (skb->ip_summed == CHECKSUM_PARTIAL) { + WARN_ON_ONCE(1); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) + /* Until 2.6.22, the start of the transport header was also the + * start of data to be checksummed. Linux 2.6.22 introduced + * the csum_start field for this purpose, but we should point + * the transport header to it anyway for backward + * compatibility, as dev_queue_xmit() does even in 2.6.28. */ + skb_set_transport_header(skb, skb->csum_start - + skb_headroom(skb)); +#endif + err = skb_checksum_help(skb); + if (err) + goto err_kfree_skb; + } +#else + if (skb->ip_summed == CHECKSUM_HW) { + err = skb_checksum_help(skb, 0); + if (err) + goto err_kfree_skb; + } +#endif + + /* Break apart GSO packets into their component pieces. Otherwise + * userspace may try to stuff a 64kB packet into a 1500-byte MTU. */ + if (skb_is_gso(skb)) { + struct sk_buff *nskb = skb_gso_segment(skb, 0); + if (nskb) { + kfree_skb(skb); + skb = nskb; + if (unlikely(IS_ERR(skb))) { + err = PTR_ERR(skb); + goto err; + } + } else { + /* XXX This case might not be possible. It's hard to + * tell from the skb_gso_segment() code and comment. */ + } + } + + /* Figure out port number. */ + port_no = ODPP_LOCAL; + if (skb->dev) { + if (skb->dev->br_port) + port_no = skb->dev->br_port->port_no; + else if (is_dp_dev(skb->dev)) + port_no = dp_dev_priv(skb->dev)->port_no; + } + + /* Append each packet to queue. There will be only one packet unless + * we broke up a GSO packet above. */ + do { + struct odp_msg *header; + struct sk_buff *nskb = skb->next; + skb->next = NULL; + + err = skb_cow(skb, sizeof *header); + if (err) { + while (nskb) { + kfree_skb(skb); + skb = nskb; + nskb = skb->next; + } + goto err_kfree_skb; + } + + header = (struct odp_msg*)__skb_push(skb, sizeof *header); + header->type = queue_no; + header->length = skb->len; + header->port = port_no; + header->reserved = 0; + header->arg = arg; + skb_queue_tail(queue, skb); + + skb = nskb; + } while (skb); + + wake_up_interruptible(&dp->waitqueue); + return 0; + +err_kfree_skb: + kfree_skb(skb); +err: + stats = percpu_ptr(dp->stats_percpu, get_cpu()); + stats->n_lost++; + put_cpu(); + + return err; +} + +static int flush_flows(struct datapath *dp) +{ + dp->n_flows = 0; + return dp_table_flush(dp); +} + +static int validate_actions(const struct sw_flow_actions *actions) +{ + unsigned int i; + + for (i = 0; i < actions->n_actions; i++) { + const union odp_action *a = &actions->actions[i]; + switch (a->type) { + case ODPAT_OUTPUT: + if (a->output.port >= DP_MAX_PORTS) + return -EINVAL; + break; + + case ODPAT_OUTPUT_GROUP: + if (a->output_group.group >= DP_MAX_GROUPS) + return -EINVAL; + break; + + case ODPAT_SET_VLAN_VID: + if (a->vlan_vid.vlan_vid & htons(~VLAN_VID_MASK)) + return -EINVAL; + break; + + case ODPAT_SET_VLAN_PCP: + if (a->vlan_pcp.vlan_pcp & ~VLAN_PCP_MASK) + return -EINVAL; + break; + + default: + if (a->type >= ODPAT_N_ACTIONS) + return -EOPNOTSUPP; + break; + } + } + + return 0; +} + +static struct sw_flow_actions *get_actions(const struct odp_flow *flow) +{ + struct sw_flow_actions *actions; + int error; + + actions = flow_actions_alloc(flow->n_actions); + error = PTR_ERR(actions); + if (IS_ERR(actions)) + goto error; + + error = -EFAULT; + if (copy_from_user(actions->actions, flow->actions, + flow->n_actions * sizeof(union odp_action))) + goto error_free_actions; + error = validate_actions(actions); + if (error) + goto error_free_actions; + + return actions; + +error_free_actions: + kfree(actions); +error: + return ERR_PTR(error); +} + +static void get_stats(struct sw_flow *flow, struct odp_flow_stats *stats) +{ + if (flow->used.tv_sec) { + stats->used_sec = flow->used.tv_sec; + stats->used_nsec = flow->used.tv_nsec; + } else { + stats->used_sec = 0; + stats->used_nsec = 0; + } + stats->n_packets = flow->packet_count; + stats->n_bytes = flow->byte_count; + stats->ip_tos = flow->ip_tos; + stats->tcp_flags = flow->tcp_flags; +} + +static void clear_stats(struct sw_flow *flow) +{ + flow->used.tv_sec = flow->used.tv_nsec = 0; + flow->tcp_flags = 0; + flow->ip_tos = 0; + flow->packet_count = 0; + flow->byte_count = 0; +} + +static int put_flow(struct datapath *dp, struct odp_flow_put __user *ufp) +{ + struct odp_flow_put uf; + struct sw_flow *flow, **bucket; + struct dp_table *table; + struct odp_flow_stats stats; + int error; + + error = -EFAULT; + if (copy_from_user(&uf, ufp, sizeof(struct odp_flow_put))) + goto error; + uf.flow.key.reserved = 0; + +retry: + table = rcu_dereference(dp->table); + bucket = dp_table_lookup_for_insert(table, &uf.flow.key); + if (!bucket) { + /* No such flow, and the slots where it could go are full. */ + error = uf.flags & ODPPF_CREATE ? -EXFULL : -ENOENT; + goto error; + } else if (!*bucket) { + /* No such flow, but we found an available slot for it. */ + struct sw_flow_actions *acts; + + error = -ENOENT; + if (!(uf.flags & ODPPF_CREATE)) + goto error; + + /* Expand table, if necessary, to make room. */ + if (dp->n_flows * 4 >= table->n_buckets && + table->n_buckets < DP_MAX_BUCKETS) { + error = dp_table_expand(dp); + if (error) + goto error; + + /* The bucket's location has changed. Try again. */ + goto retry; + } + + /* Allocate flow. */ + error = -ENOMEM; + flow = kmem_cache_alloc(flow_cache, GFP_KERNEL); + if (flow == NULL) + goto error; + flow->key = uf.flow.key; + spin_lock_init(&flow->lock); + clear_stats(flow); + + /* Obtain actions. */ + acts = get_actions(&uf.flow); + error = PTR_ERR(acts); + if (IS_ERR(acts)) + goto error_free_flow; + rcu_assign_pointer(flow->sf_acts, acts); + + /* Put flow in bucket. */ + rcu_assign_pointer(*bucket, flow); + dp->n_flows++; + memset(&stats, 0, sizeof(struct odp_flow_stats)); + } else { + /* We found a matching flow. */ + struct sw_flow *flow = *rcu_dereference(bucket); + struct sw_flow_actions *old_acts, *new_acts; + unsigned long int flags; + + /* Bail out if we're not allowed to modify an existing flow. */ + error = -EEXIST; + if (!(uf.flags & ODPPF_MODIFY)) + goto error; + + /* Swap actions. */ + new_acts = get_actions(&uf.flow); + error = PTR_ERR(new_acts); + if (IS_ERR(new_acts)) + goto error; + old_acts = rcu_dereference(flow->sf_acts); + if (old_acts->n_actions != new_acts->n_actions || + memcmp(old_acts->actions, new_acts->actions, + sizeof(union odp_action) * old_acts->n_actions)) { + rcu_assign_pointer(flow->sf_acts, new_acts); + flow_deferred_free_acts(old_acts); + } else { + kfree(new_acts); + } + + /* Fetch stats, then clear them if necessary. */ + spin_lock_irqsave(&flow->lock, flags); + get_stats(flow, &stats); + if (uf.flags & ODPPF_ZERO_STATS) + clear_stats(flow); + spin_unlock_irqrestore(&flow->lock, flags); + } + + /* Copy stats to userspace. */ + if (__copy_to_user(&ufp->flow.stats, &stats, + sizeof(struct odp_flow_stats))) + return -EFAULT; + return 0; + +error_free_flow: + kmem_cache_free(flow_cache, flow); +error: + return error; +} + +static int put_actions(const struct sw_flow *flow, struct odp_flow __user *ufp) +{ + union odp_action __user *actions; + struct sw_flow_actions *sf_acts; + u32 n_actions; + + if (__get_user(actions, &ufp->actions) || + __get_user(n_actions, &ufp->n_actions)) + return -EFAULT; + + if (!n_actions) + return 0; + if (ufp->n_actions > INT_MAX / sizeof(union odp_action)) + return -EINVAL; + + sf_acts = rcu_dereference(flow->sf_acts); + if (__put_user(sf_acts->n_actions, &ufp->n_actions) || + (actions && copy_to_user(actions, sf_acts->actions, + sizeof(union odp_action) * + min(sf_acts->n_actions, n_actions)))) + return -EFAULT; + + return 0; +} + +static int answer_query(struct sw_flow *flow, struct odp_flow __user *ufp) +{ + struct odp_flow_stats stats; + unsigned long int flags; + + spin_lock_irqsave(&flow->lock, flags); + get_stats(flow, &stats); + spin_unlock_irqrestore(&flow->lock, flags); + + if (__copy_to_user(&ufp->stats, &stats, sizeof(struct odp_flow_stats))) + return -EFAULT; + return put_actions(flow, ufp); +} + +static int del_or_query_flow(struct datapath *dp, + struct odp_flow __user *ufp, + unsigned int cmd) +{ + struct dp_table *table = rcu_dereference(dp->table); + struct odp_flow uf; + struct sw_flow *flow; + int error; + + error = -EFAULT; + if (copy_from_user(&uf, ufp, sizeof uf)) + goto error; + uf.key.reserved = 0; + + flow = dp_table_lookup(table, &uf.key); + error = -ENOENT; + if (!flow) + goto error; + + if (cmd == ODP_FLOW_DEL) { + /* XXX redundant lookup */ + error = dp_table_delete(table, flow); + if (error) + goto error; + + /* XXX These statistics might lose a few packets, since other + * CPUs can be using this flow. We used to synchronize_rcu() + * to make sure that we get completely accurate stats, but that + * blows our performance, badly. */ + dp->n_flows--; + error = answer_query(flow, ufp); + flow_deferred_free(flow); + } else { + error = answer_query(flow, ufp); + } + +error: + return error; +} + +static int query_multiple_flows(struct datapath *dp, + const struct odp_flowvec *flowvec) +{ + struct dp_table *table = rcu_dereference(dp->table); + int i; + for (i = 0; i < flowvec->n_flows; i++) { + struct __user odp_flow *ufp = &flowvec->flows[i]; + struct odp_flow uf; + struct sw_flow *flow; + int error; + + if (__copy_from_user(&uf, ufp, sizeof uf)) + return -EFAULT; + uf.key.reserved = 0; + + flow = dp_table_lookup(table, &uf.key); + if (!flow) + error = __clear_user(&ufp->stats, sizeof ufp->stats); + else + error = answer_query(flow, ufp); + if (error) + return -EFAULT; + } + return flowvec->n_flows; +} + +struct list_flows_cbdata { + struct odp_flow __user *uflows; + int n_flows; + int listed_flows; +}; + +static int list_flow(struct sw_flow *flow, void *cbdata_) +{ + struct list_flows_cbdata *cbdata = cbdata_; + struct odp_flow __user *ufp = &cbdata->uflows[cbdata->listed_flows++]; + int error; + + if (__copy_to_user(&ufp->key, &flow->key, sizeof flow->key)) + return -EFAULT; + error = answer_query(flow, ufp); + if (error) + return error; + + if (cbdata->listed_flows >= cbdata->n_flows) + return cbdata->listed_flows; + return 0; +} + +static int list_flows(struct datapath *dp, const struct odp_flowvec *flowvec) +{ + struct list_flows_cbdata cbdata; + int error; + + if (!flowvec->n_flows) + return 0; + + cbdata.uflows = flowvec->flows; + cbdata.n_flows = flowvec->n_flows; + cbdata.listed_flows = 0; + error = dp_table_foreach(rcu_dereference(dp->table), + list_flow, &cbdata); + return error ? error : cbdata.listed_flows; +} + +static int do_flowvec_ioctl(struct datapath *dp, unsigned long argp, + int (*function)(struct datapath *, + const struct odp_flowvec *)) +{ + struct odp_flowvec __user *uflowvec; + struct odp_flowvec flowvec; + int retval; + + uflowvec = (struct odp_flowvec __user *)argp; + if (!access_ok(VERIFY_WRITE, uflowvec, sizeof *uflowvec) || + copy_from_user(&flowvec, uflowvec, sizeof flowvec)) + return -EFAULT; + + if (flowvec.n_flows > INT_MAX / sizeof(struct odp_flow)) + return -EINVAL; + + if (!access_ok(VERIFY_WRITE, flowvec.flows, + flowvec.n_flows * sizeof(struct odp_flow))) + return -EFAULT; + + retval = function(dp, &flowvec); + return (retval < 0 ? retval + : retval == flowvec.n_flows ? 0 + : __put_user(retval, &uflowvec->n_flows)); +} + +static int do_execute(struct datapath *dp, const struct odp_execute *executep) +{ + struct odp_execute execute; + struct odp_flow_key key; + struct sk_buff *skb; + struct sw_flow_actions *actions; + int err; + + err = -EFAULT; + if (copy_from_user(&execute, executep, sizeof execute)) + goto error; + + err = -EINVAL; + if (execute.length < ETH_HLEN || execute.length > 65535) + goto error; + + err = -ENOMEM; + actions = flow_actions_alloc(execute.n_actions); + if (!actions) + goto error; + + err = -EFAULT; + if (copy_from_user(actions->actions, execute.actions, + execute.n_actions * sizeof *execute.actions)) + goto error_free_actions; + + err = validate_actions(actions); + if (err) + goto error_free_actions; + + err = -ENOMEM; + skb = alloc_skb(execute.length, GFP_KERNEL); + if (!skb) + goto error_free_actions; + if (execute.in_port < DP_MAX_PORTS) { + struct net_bridge_port *p = dp->ports[execute.in_port]; + if (p) + skb->dev = p->dev; + } + + err = -EFAULT; + if (copy_from_user(skb_put(skb, execute.length), execute.data, + execute.length)) + goto error_free_skb; + + flow_extract(skb, execute.in_port, &key); + err = execute_actions(dp, skb, &key, actions->actions, + actions->n_actions, GFP_KERNEL); + kfree(actions); + return err; + +error_free_skb: + kfree_skb(skb); +error_free_actions: + kfree(actions); +error: + return err; +} + +static int +get_dp_stats(struct datapath *dp, struct odp_stats __user *statsp) +{ + struct odp_stats stats; + int i; + + stats.n_flows = dp->n_flows; + stats.cur_capacity = rcu_dereference(dp->table)->n_buckets * 2; + stats.max_capacity = DP_MAX_BUCKETS * 2; + stats.n_ports = dp->n_ports; + stats.max_ports = DP_MAX_PORTS; + stats.max_groups = DP_MAX_GROUPS; + stats.n_frags = stats.n_hit = stats.n_missed = stats.n_lost = 0; + for_each_possible_cpu(i) { + const struct dp_stats_percpu *s; + s = percpu_ptr(dp->stats_percpu, i); + stats.n_frags += s->n_frags; + stats.n_hit += s->n_hit; + stats.n_missed += s->n_missed; + stats.n_lost += s->n_lost; + } + stats.max_miss_queue = DP_MAX_QUEUE_LEN; + stats.max_action_queue = DP_MAX_QUEUE_LEN; + return copy_to_user(statsp, &stats, sizeof stats) ? -EFAULT : 0; +} + +static int +put_port(const struct net_bridge_port *p, struct odp_port __user *uop) +{ + struct odp_port op; + memset(&op, 0, sizeof op); + strncpy(op.devname, p->dev->name, sizeof op.devname); + op.port = p->port_no; + op.flags = is_dp_dev(p->dev) ? ODP_PORT_INTERNAL : 0; + return copy_to_user(uop, &op, sizeof op) ? -EFAULT : 0; +} + +static int +query_port(struct datapath *dp, struct odp_port __user *uport) +{ + struct odp_port port; + + if (copy_from_user(&port, uport, sizeof port)) + return -EFAULT; + if (port.devname[0]) { + struct net_bridge_port *p; + struct net_device *dev; + int err; + + port.devname[IFNAMSIZ - 1] = '\0'; + + dev = dev_get_by_name(&init_net, port.devname); + if (!dev) + return -ENODEV; + + p = dev->br_port; + if (!p && is_dp_dev(dev)) { + struct dp_dev *dp_dev = dp_dev_priv(dev); + if (dp_dev->dp == dp) + p = dp->ports[dp_dev->port_no]; + } + err = p && p->dp == dp ? put_port(p, uport) : -ENOENT; + dev_put(dev); + + return err; + } else { + if (port.port >= DP_MAX_PORTS) + return -EINVAL; + if (!dp->ports[port.port]) + return -ENOENT; + return put_port(dp->ports[port.port], uport); + } +} + +static int +list_ports(struct datapath *dp, struct odp_portvec __user *pvp) +{ + struct odp_portvec pv; + struct net_bridge_port *p; + int idx; + + if (copy_from_user(&pv, pvp, sizeof pv)) + return -EFAULT; + + idx = 0; + if (pv.n_ports) { + list_for_each_entry_rcu (p, &dp->port_list, node) { + if (put_port(p, &pv.ports[idx])) + return -EFAULT; + if (idx++ >= pv.n_ports) + break; + } + } + return put_user(idx, &pvp->n_ports); +} + +/* RCU callback for freeing a dp_port_group */ +static void free_port_group(struct rcu_head *rcu) +{ + struct dp_port_group *g = container_of(rcu, struct dp_port_group, rcu); + kfree(g); +} + +static int +set_port_group(struct datapath *dp, const struct odp_port_group __user *upg) +{ + struct odp_port_group pg; + struct dp_port_group *new_group, *old_group; + int error; + + error = -EFAULT; + if (copy_from_user(&pg, upg, sizeof pg)) + goto error; + + error = -EINVAL; + if (pg.n_ports > DP_MAX_PORTS || pg.group >= DP_MAX_GROUPS) + goto error; + + error = -ENOMEM; + new_group = kmalloc(sizeof *new_group + sizeof(u16) * pg.n_ports, + GFP_KERNEL); + if (!new_group) + goto error; + + new_group->n_ports = pg.n_ports; + error = -EFAULT; + if (copy_from_user(new_group->ports, pg.ports, + sizeof(u16) * pg.n_ports)) + goto error_free; + + old_group = rcu_dereference(dp->groups[pg.group]); + rcu_assign_pointer(dp->groups[pg.group], new_group); + if (old_group) + call_rcu(&old_group->rcu, free_port_group); + return 0; + +error_free: + kfree(new_group); +error: + return error; +} + +static int +get_port_group(struct datapath *dp, struct odp_port_group *upg) +{ + struct odp_port_group pg; + struct dp_port_group *g; + u16 n_copy; + + if (copy_from_user(&pg, upg, sizeof pg)) + return -EFAULT; + + if (pg.group >= DP_MAX_GROUPS) + return -EINVAL; + + g = dp->groups[pg.group]; + n_copy = g ? min_t(int, g->n_ports, pg.n_ports) : 0; + if (n_copy && copy_to_user(pg.ports, g->ports, n_copy * sizeof(u16))) + return -EFAULT; + + if (put_user(g ? g->n_ports : 0, &upg->n_ports)) + return -EFAULT; + + return 0; +} + +static long openvswitch_ioctl(struct file *f, unsigned int cmd, + unsigned long argp) +{ + int dp_idx = iminor(f->f_dentry->d_inode); + struct datapath *dp; + int drop_frags, listeners, port_no; + int err; + + /* Handle commands with special locking requirements up front. */ + switch (cmd) { + case ODP_DP_CREATE: + return create_dp(dp_idx, (char __user *)argp); + + case ODP_DP_DESTROY: + return destroy_dp(dp_idx); + + case ODP_PORT_ADD: + return add_port(dp_idx, (struct odp_port __user *)argp); + + case ODP_PORT_DEL: + err = get_user(port_no, (int __user *)argp); + if (err) + break; + return del_port(dp_idx, port_no); + } + + dp = get_dp_locked(dp_idx); + if (!dp) + return -ENODEV; + + switch (cmd) { + case ODP_DP_STATS: + err = get_dp_stats(dp, (struct odp_stats __user *)argp); + break; + + case ODP_GET_DROP_FRAGS: + err = put_user(dp->drop_frags, (int __user *)argp); + break; + + case ODP_SET_DROP_FRAGS: + err = get_user(drop_frags, (int __user *)argp); + if (err) + break; + err = -EINVAL; + if (drop_frags != 0 && drop_frags != 1) + break; + dp->drop_frags = drop_frags; + err = 0; + break; + + case ODP_GET_LISTEN_MASK: + err = put_user((int)f->private_data, (int __user *)argp); + break; + + case ODP_SET_LISTEN_MASK: + err = get_user(listeners, (int __user *)argp); + if (err) + break; + err = -EINVAL; + if (listeners & ~ODPL_ALL) + break; + err = 0; + f->private_data = (void*)listeners; + break; + + case ODP_PORT_QUERY: + err = query_port(dp, (struct odp_port __user *)argp); + break; + + case ODP_PORT_LIST: + err = list_ports(dp, (struct odp_portvec __user *)argp); + break; + + case ODP_PORT_GROUP_SET: + err = set_port_group(dp, (struct odp_port_group __user *)argp); + break; + + case ODP_PORT_GROUP_GET: + err = get_port_group(dp, (struct odp_port_group __user *)argp); + break; + + case ODP_FLOW_FLUSH: + err = flush_flows(dp); + break; + + case ODP_FLOW_PUT: + err = put_flow(dp, (struct odp_flow_put __user *)argp); + break; + + case ODP_FLOW_DEL: + case ODP_FLOW_GET: + err = del_or_query_flow(dp, (struct odp_flow __user *)argp, + cmd); + break; + + case ODP_FLOW_GET_MULTIPLE: + err = do_flowvec_ioctl(dp, argp, query_multiple_flows); + break; + + case ODP_FLOW_LIST: + err = do_flowvec_ioctl(dp, argp, list_flows); + break; + + case ODP_EXECUTE: + err = do_execute(dp, (struct odp_execute __user *)argp); + break; + + default: + err = -ENOIOCTLCMD; + break; + } + mutex_unlock(&dp->mutex); + return err; +} + +static int dp_has_packet_of_interest(struct datapath *dp, int listeners) +{ + int i; + for (i = 0; i < DP_N_QUEUES; i++) { + if (listeners & (1 << i) && !skb_queue_empty(&dp->queues[i])) + return 1; + } + return 0; +} + +ssize_t openvswitch_read(struct file *f, char __user *buf, size_t nbytes, + loff_t *ppos) +{ + int listeners = (int) f->private_data; + int dp_idx = iminor(f->f_dentry->d_inode); + struct datapath *dp = get_dp(dp_idx); + struct sk_buff *skb; + struct iovec __user iov; + size_t copy_bytes; + int retval; + + if (!dp) + return -ENODEV; + + if (nbytes == 0 || !listeners) + return 0; + + for (;;) { + int i; + + for (i = 0; i < DP_N_QUEUES; i++) { + if (listeners & (1 << i)) { + skb = skb_dequeue(&dp->queues[i]); + if (skb) + goto success; + } + } + + if (f->f_flags & O_NONBLOCK) { + retval = -EAGAIN; + goto error; + } + + wait_event_interruptible(dp->waitqueue, + dp_has_packet_of_interest(dp, + listeners)); + + if (signal_pending(current)) { + retval = -ERESTARTSYS; + goto error; + } + } +success: + copy_bytes = min(skb->len, nbytes); + iov.iov_base = buf; + iov.iov_len = copy_bytes; + retval = skb_copy_datagram_iovec(skb, 0, &iov, iov.iov_len); + if (!retval) + retval = copy_bytes; + kfree_skb(skb); + +error: + return retval; +} + +static unsigned int openvswitch_poll(struct file *file, poll_table *wait) +{ + int dp_idx = iminor(file->f_dentry->d_inode); + struct datapath *dp = get_dp(dp_idx); + unsigned int mask; + + if (dp) { + mask = 0; + poll_wait(file, &dp->waitqueue, wait); + if (dp_has_packet_of_interest(dp, (int)file->private_data)) + mask |= POLLIN | POLLRDNORM; + } else { + mask = POLLIN | POLLRDNORM | POLLHUP; + } + return mask; +} + +struct file_operations openvswitch_fops = { + /* XXX .aio_read = openvswitch_aio_read, */ + .read = openvswitch_read, + .poll = openvswitch_poll, + .unlocked_ioctl = openvswitch_ioctl, + /* XXX .fasync = openvswitch_fasync, */ +}; + +static int major; +static struct llc_sap *dp_stp_sap; + +static int dp_stp_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + /* We don't really care about STP packets, we just listen for them for + * mutual exclusion with the bridge module, so this just discards + * them. */ + kfree_skb(skb); + return 0; +} + +static int __init dp_init(void) +{ + int err; + + printk("Open vSwitch %s, built "__DATE__" "__TIME__"\n", VERSION BUILDNR); + + /* Register to receive STP packets because the bridge module also + * attempts to do so. Since there can only be a single listener for a + * given protocol, this provides mutual exclusion against the bridge + * module, preventing both of them from being loaded at the same + * time. */ + dp_stp_sap = llc_sap_open(LLC_SAP_BSPAN, dp_stp_rcv); + if (!dp_stp_sap) { + printk(KERN_ERR "openvswitch: can't register sap for STP (probably the bridge module is loaded)\n"); + return -EADDRINUSE; + } + + err = flow_init(); + if (err) + goto error; + + err = register_netdevice_notifier(&dp_device_notifier); + if (err) + goto error_flow_exit; + + major = register_chrdev(0, "openvswitch", &openvswitch_fops); + if (err < 0) + goto error_unreg_notifier; + + /* Hook into callback used by the bridge to intercept packets. + * Parasites we are. */ + br_handle_frame_hook = dp_frame_hook; + + return 0; + +error_unreg_notifier: + unregister_netdevice_notifier(&dp_device_notifier); +error_flow_exit: + flow_exit(); +error: + return err; +} + +static void dp_cleanup(void) +{ + rcu_barrier(); + unregister_chrdev(major, "openvswitch"); + unregister_netdevice_notifier(&dp_device_notifier); + flow_exit(); + br_handle_frame_hook = NULL; + llc_sap_put(dp_stp_sap); +} + +module_init(dp_init); +module_exit(dp_cleanup); + +MODULE_DESCRIPTION("Open vSwitch switching datapath"); +MODULE_LICENSE("GPL"); diff --git a/datapath/datapath.h b/datapath/datapath.h new file mode 100644 index 00000000..102b27f3 --- /dev/null +++ b/datapath/datapath.h @@ -0,0 +1,139 @@ +/* Interface exported by openvswitch_mod. */ + +#ifndef DATAPATH_H +#define DATAPATH_H 1 + +#include +#include +#include +#include +#include +#include +#include +#include "flow.h" +#include "brc_sysfs.h" + +struct sk_buff; + +/* Mask for the priority bits in a vlan header. If we ever merge upstream + * then this should go into include/linux/if_vlan.h. */ +#define VLAN_PCP_MASK 0xe000 + +#define DP_MAX_PORTS 256 +#define DP_MAX_GROUPS 16 + +#define DP_L2_BITS (PAGE_SHIFT - ilog2(sizeof(struct sw_flow*))) +#define DP_L2_SIZE (1 << DP_L2_BITS) +#define DP_L2_SHIFT 0 + +#define DP_L1_BITS (PAGE_SHIFT - ilog2(sizeof(struct sw_flow**))) +#define DP_L1_SIZE (1 << DP_L1_BITS) +#define DP_L1_SHIFT DP_L2_BITS + +#define DP_MAX_BUCKETS (DP_L1_SIZE * DP_L2_SIZE) + +struct dp_table { + unsigned int n_buckets; + struct sw_flow ***flows[2]; + struct rcu_head rcu; +}; + +#define DP_N_QUEUES 2 +#define DP_MAX_QUEUE_LEN 100 + +struct dp_stats_percpu { + u64 n_frags; + u64 n_hit; + u64 n_missed; + u64 n_lost; +}; + +struct dp_port_group { + struct rcu_head rcu; + int n_ports; + u16 ports[]; +}; + +struct datapath { + struct mutex mutex; + int dp_idx; + +#ifdef SUPPORT_SYSFS + struct kobject ifobj; +#endif + + int drop_frags; + + /* Queued data. */ + struct sk_buff_head queues[DP_N_QUEUES]; + wait_queue_head_t waitqueue; + + /* Flow table. */ + unsigned int n_flows; + struct dp_table *table; + + /* Port groups. */ + struct dp_port_group *groups[DP_MAX_GROUPS]; + + /* Switch ports. */ + unsigned int n_ports; + struct net_bridge_port *ports[DP_MAX_PORTS]; + struct list_head port_list; /* All ports, including local_port. */ + + /* Stats. */ + struct dp_stats_percpu *stats_percpu; +}; + +struct net_bridge_port { + u16 port_no; + struct datapath *dp; + struct net_device *dev; +#ifdef SUPPORT_SYSFS + struct kobject kobj; +#endif + struct list_head node; /* Element in datapath.ports. */ +}; + +extern struct notifier_block dp_device_notifier; +extern int (*dp_ioctl_hook)(struct net_device *dev, struct ifreq *rq, int cmd); +extern int (*dp_add_dp_hook)(struct datapath *dp); +extern int (*dp_del_dp_hook)(struct datapath *dp); +extern int (*dp_add_if_hook)(struct net_bridge_port *p); +extern int (*dp_del_if_hook)(struct net_bridge_port *p); + +/* Flow table. */ +struct dp_table *dp_table_create(unsigned int n_buckets); +void dp_table_destroy(struct dp_table *, int free_flows); +struct sw_flow *dp_table_lookup(struct dp_table *, const struct odp_flow_key *); +struct sw_flow **dp_table_lookup_for_insert(struct dp_table *, const struct odp_flow_key *); +int dp_table_delete(struct dp_table *, struct sw_flow *); +int dp_table_expand(struct datapath *); +int dp_table_flush(struct datapath *); +int dp_table_foreach(struct dp_table *table, + int (*callback)(struct sw_flow *flow, void *aux), + void *aux); + +void dp_process_received_packet(struct sk_buff *, struct net_bridge_port *); +int dp_del_port(struct net_bridge_port *, struct list_head *); +int dp_output_port(struct datapath *, struct sk_buff *, int out_port, + int ignore_no_fwd); +int dp_output_control(struct datapath *, struct sk_buff *, int, u32 arg); +void dp_set_origin(struct datapath *, u16, struct sk_buff *); + +struct datapath *get_dp(int dp_idx); + +static inline const char *dp_name(const struct datapath *dp) +{ + return dp->ports[ODPP_LOCAL]->dev->name; +} + +#ifdef CONFIG_XEN +int skb_checksum_setup(struct sk_buff *skb); +#else +static inline int skb_checksum_setup(struct sk_buff *skb) +{ + return 0; +} +#endif + +#endif /* datapath.h */ diff --git a/datapath/dp_dev.c b/datapath/dp_dev.c new file mode 100644 index 00000000..8a749dbc --- /dev/null +++ b/datapath/dp_dev.c @@ -0,0 +1,210 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "dp_dev.h" + +struct datapath *dp_dev_get_dp(struct net_device *netdev) +{ + return dp_dev_priv(netdev)->dp; +} +EXPORT_SYMBOL(dp_dev_get_dp); + +static struct net_device_stats *dp_dev_get_stats(struct net_device *netdev) +{ + struct dp_dev *dp_dev = dp_dev_priv(netdev); + return &dp_dev->stats; +} + +int dp_dev_recv(struct net_device *netdev, struct sk_buff *skb) +{ + struct dp_dev *dp_dev = dp_dev_priv(netdev); + int len; + len = skb->len; + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, netdev); + if (in_interrupt()) + netif_rx(skb); + else + netif_rx_ni(skb); + netdev->last_rx = jiffies; + dp_dev->stats.rx_packets++; + dp_dev->stats.rx_bytes += len; + return len; +} + +static int dp_dev_mac_addr(struct net_device *dev, void *p) +{ + struct sockaddr *addr = p; + + if (!is_valid_ether_addr(addr->sa_data)) + return -EADDRNOTAVAIL; + memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); + return 0; +} + +static int dp_dev_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + struct dp_dev *dp_dev = dp_dev_priv(netdev); + + /* By orphaning 'skb' we will screw up socket accounting slightly, but + * the effect is limited to the device queue length. If we don't + * do this, then the sk_buff will be destructed eventually, but it is + * harder to predict when. */ + skb_orphan(skb); + + /* We are going to modify 'skb', by sticking it on &dp_dev->xmit_queue, + * so we need to have our own clone. (At any rate, fwd_port_input() + * will need its own clone, so there's no benefit to queuing any other + * way.) */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + return 0; + + dp_dev->stats.tx_packets++; + dp_dev->stats.tx_bytes += skb->len; + + if (skb_queue_len(&dp_dev->xmit_queue) >= netdev->tx_queue_len) { + /* Queue overflow. Stop transmitter. */ + netif_stop_queue(netdev); + + /* We won't see all dropped packets individually, so overrun + * error is appropriate. */ + dp_dev->stats.tx_fifo_errors++; + } + skb_queue_tail(&dp_dev->xmit_queue, skb); + netdev->trans_start = jiffies; + + schedule_work(&dp_dev->xmit_work); + + return 0; +} + +static void dp_dev_do_xmit(struct work_struct *work) +{ + struct dp_dev *dp_dev = container_of(work, struct dp_dev, xmit_work); + struct datapath *dp = dp_dev->dp; + struct sk_buff *skb; + + while ((skb = skb_dequeue(&dp_dev->xmit_queue)) != NULL) { + skb_reset_mac_header(skb); + rcu_read_lock_bh(); + dp_process_received_packet(skb, dp->ports[dp_dev->port_no]); + rcu_read_unlock_bh(); + } + netif_wake_queue(dp_dev->dev); +} + +static int dp_dev_open(struct net_device *netdev) +{ + netif_start_queue(netdev); + return 0; +} + +static int dp_dev_stop(struct net_device *netdev) +{ + netif_stop_queue(netdev); + return 0; +} + +static void dp_getinfo(struct net_device *netdev, struct ethtool_drvinfo *info) +{ + struct dp_dev *dp_dev = dp_dev_priv(netdev); + strcpy(info->driver, "openvswitch"); + sprintf(info->bus_info, "%d", dp_dev->dp->dp_idx); +} + +static struct ethtool_ops dp_ethtool_ops = { + .get_drvinfo = dp_getinfo, + .get_link = ethtool_op_get_link, + .get_sg = ethtool_op_get_sg, + .get_tx_csum = ethtool_op_get_tx_csum, + .get_tso = ethtool_op_get_tso, +}; + +static void +do_setup(struct net_device *netdev) +{ + ether_setup(netdev); + + netdev->do_ioctl = dp_ioctl_hook; + netdev->get_stats = dp_dev_get_stats; + netdev->hard_start_xmit = dp_dev_xmit; + netdev->open = dp_dev_open; + SET_ETHTOOL_OPS(netdev, &dp_ethtool_ops); + netdev->stop = dp_dev_stop; + netdev->tx_queue_len = 100; + netdev->set_mac_address = dp_dev_mac_addr; + + netdev->flags = IFF_BROADCAST | IFF_MULTICAST; + + random_ether_addr(netdev->dev_addr); + + /* Set the OUI to the Nicira one. */ + netdev->dev_addr[0] = 0x00; + netdev->dev_addr[1] = 0x23; + netdev->dev_addr[2] = 0x20; + + /* Set the top bits to indicate random Nicira address. */ + netdev->dev_addr[3] |= 0xc0; +} + +/* Create a datapath device associated with 'dp'. If 'dp_name' is null, + * the device name will be of the form 'of'. Returns the new device or + * an error code. + * + * Called with RTNL lock and dp_mutex. */ +struct net_device *dp_dev_create(struct datapath *dp, const char *dp_name, int port_no) +{ + struct dp_dev *dp_dev; + struct net_device *netdev; + char dev_name[IFNAMSIZ]; + int err; + + if (dp_name) { + if (strlen(dp_name) >= IFNAMSIZ) + return ERR_PTR(-EINVAL); + strncpy(dev_name, dp_name, sizeof(dev_name)); + } else + snprintf(dev_name, sizeof dev_name, "of%d", dp->dp_idx); + + netdev = alloc_netdev(sizeof(struct dp_dev), dev_name, do_setup); + if (!netdev) + return ERR_PTR(-ENOMEM); + + err = register_netdevice(netdev); + if (err) { + free_netdev(netdev); + return ERR_PTR(err); + } + + dp_dev = dp_dev_priv(netdev); + dp_dev->dp = dp; + dp_dev->port_no = port_no; + dp_dev->dev = netdev; + skb_queue_head_init(&dp_dev->xmit_queue); + INIT_WORK(&dp_dev->xmit_work, dp_dev_do_xmit); + return netdev; +} + +/* Called with RTNL lock and dp_mutex.*/ +void dp_dev_destroy(struct net_device *netdev) +{ + struct dp_dev *dp_dev = dp_dev_priv(netdev); + + netif_tx_disable(netdev); + synchronize_net(); + skb_queue_purge(&dp_dev->xmit_queue); + unregister_netdevice(netdev); +} + +int is_dp_dev(struct net_device *netdev) +{ + return netdev->open == dp_dev_open; +} +EXPORT_SYMBOL(is_dp_dev); diff --git a/datapath/dp_dev.h b/datapath/dp_dev.h new file mode 100644 index 00000000..84874390 --- /dev/null +++ b/datapath/dp_dev.h @@ -0,0 +1,27 @@ +#ifndef DP_DEV_H +#define DP_DEV_H 1 + +struct dp_dev { + struct datapath *dp; + int port_no; + + struct net_device *dev; + struct net_device_stats stats; + struct sk_buff_head xmit_queue; + struct work_struct xmit_work; + + struct list_head list; +}; + +static inline struct dp_dev *dp_dev_priv(struct net_device *netdev) +{ + return netdev_priv(netdev); +} + +struct net_device *dp_dev_create(struct datapath *, const char *, int port_no); +void dp_dev_destroy(struct net_device *); +int dp_dev_recv(struct net_device *, struct sk_buff *); +int is_dp_dev(struct net_device *); +struct datapath *dp_dev_get_dp(struct net_device *); + +#endif /* dp_dev.h */ diff --git a/datapath/dp_notify.c b/datapath/dp_notify.c new file mode 100644 index 00000000..56d5c3c9 --- /dev/null +++ b/datapath/dp_notify.c @@ -0,0 +1,29 @@ +/* + * Distributed under the terms of the GNU GPL version 2. + * Copyright (c) 2007, 2008, 2009 Nicira Networks. + */ + +/* Handle changes to managed devices */ + +#include + +#include "datapath.h" + + +static int dp_device_event(struct notifier_block *unused, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + struct net_bridge_port *p = dev->br_port; + if (event == NETDEV_UNREGISTER && p) { + struct datapath *dp = p->dp; + mutex_lock(&dp->mutex); + dp_del_port(p, NULL); + mutex_unlock(&dp->mutex); + } + return NOTIFY_DONE; +} + +struct notifier_block dp_device_notifier = { + .notifier_call = dp_device_event +}; diff --git a/datapath/flow.c b/datapath/flow.c new file mode 100644 index 00000000..b24c242c --- /dev/null +++ b/datapath/flow.c @@ -0,0 +1,301 @@ +/* + * Distributed under the terms of the GNU GPL version 2. + * Copyright (c) 2007, 2008, 2009 Nicira Networks. + */ + +#include "flow.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "compat.h" + +struct kmem_cache *flow_cache; + +static inline int iphdr_ok(struct sk_buff *skb) +{ + int nh_ofs = skb_network_offset(skb); + if (skb->len >= nh_ofs + sizeof(struct iphdr)) { + int ip_len = ip_hdrlen(skb); + return (ip_len >= sizeof(struct iphdr) + && pskb_may_pull(skb, nh_ofs + ip_len)); + } + return 0; +} + +static inline int tcphdr_ok(struct sk_buff *skb) +{ + int th_ofs = skb_transport_offset(skb); + if (pskb_may_pull(skb, th_ofs + sizeof(struct tcphdr))) { + int tcp_len = tcp_hdrlen(skb); + return (tcp_len >= sizeof(struct tcphdr) + && skb->len >= th_ofs + tcp_len); + } + return 0; +} + +static inline int udphdr_ok(struct sk_buff *skb) +{ + int th_ofs = skb_transport_offset(skb); + return pskb_may_pull(skb, th_ofs + sizeof(struct udphdr)); +} + +static inline int icmphdr_ok(struct sk_buff *skb) +{ + int th_ofs = skb_transport_offset(skb); + return pskb_may_pull(skb, th_ofs + sizeof(struct icmphdr)); +} + +#define TCP_FLAGS_OFFSET 13 +#define TCP_FLAG_MASK 0x3f + +static inline struct ovs_tcphdr *ovs_tcp_hdr(const struct sk_buff *skb) +{ + return (struct ovs_tcphdr *)skb_transport_header(skb); +} + +void flow_used(struct sw_flow *flow, struct sk_buff *skb) +{ + unsigned long flags; + u8 tcp_flags = 0; + + if (flow->key.dl_type == htons(ETH_P_IP) && iphdr_ok(skb)) { + struct iphdr *nh = ip_hdr(skb); + flow->ip_tos = nh->tos; + if (flow->key.nw_proto == IPPROTO_TCP && tcphdr_ok(skb)) { + u8 *tcp = (u8 *)tcp_hdr(skb); + tcp_flags = *(tcp + TCP_FLAGS_OFFSET) & TCP_FLAG_MASK; + } + } + + spin_lock_irqsave(&flow->lock, flags); + getnstimeofday(&flow->used); + flow->packet_count++; + flow->byte_count += skb->len; + flow->tcp_flags |= tcp_flags; + spin_unlock_irqrestore(&flow->lock, flags); +} + +struct sw_flow_actions *flow_actions_alloc(size_t n_actions) +{ + struct sw_flow_actions *sfa; + + if (n_actions > (PAGE_SIZE - sizeof *sfa) / sizeof(union odp_action)) + return ERR_PTR(-EINVAL); + + sfa = kmalloc(sizeof *sfa + n_actions * sizeof(union odp_action), + GFP_KERNEL); + if (!sfa) + return ERR_PTR(-ENOMEM); + + sfa->n_actions = n_actions; + return sfa; +} + + +/* Frees 'flow' immediately. */ +void flow_free(struct sw_flow *flow) +{ + if (unlikely(!flow)) + return; + kfree(flow->sf_acts); + kmem_cache_free(flow_cache, flow); +} + +/* RCU callback used by flow_deferred_free. */ +static void rcu_free_flow_callback(struct rcu_head *rcu) +{ + struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu); + flow_free(flow); +} + +/* Schedules 'flow' to be freed after the next RCU grace period. + * The caller must hold rcu_read_lock for this to be sensible. */ +void flow_deferred_free(struct sw_flow *flow) +{ + call_rcu(&flow->rcu, rcu_free_flow_callback); +} + +/* RCU callback used by flow_deferred_free_acts. */ +static void rcu_free_acts_callback(struct rcu_head *rcu) +{ + struct sw_flow_actions *sf_acts = container_of(rcu, + struct sw_flow_actions, rcu); + kfree(sf_acts); +} + +/* Schedules 'sf_acts' to be freed after the next RCU grace period. + * The caller must hold rcu_read_lock for this to be sensible. */ +void flow_deferred_free_acts(struct sw_flow_actions *sf_acts) +{ + call_rcu(&sf_acts->rcu, rcu_free_acts_callback); +} + +#define SNAP_OUI_LEN 3 + +struct eth_snap_hdr +{ + struct ethhdr eth; + u8 dsap; /* Always 0xAA */ + u8 ssap; /* Always 0xAA */ + u8 ctrl; + u8 oui[SNAP_OUI_LEN]; + u16 ethertype; +} __attribute__ ((packed)); + +static int is_snap(const struct eth_snap_hdr *esh) +{ + return (esh->dsap == LLC_SAP_SNAP + && esh->ssap == LLC_SAP_SNAP + && !memcmp(esh->oui, "\0\0\0", 3)); +} + +/* Parses the Ethernet frame in 'skb', which was received on 'in_port', + * and initializes 'key' to match. Returns 1 if 'skb' contains an IP + * fragment, 0 otherwise. */ +int flow_extract(struct sk_buff *skb, u16 in_port, struct odp_flow_key *key) +{ + struct ethhdr *eth; + struct eth_snap_hdr *esh; + int retval = 0; + int nh_ofs; + + memset(key, 0, sizeof *key); + key->dl_vlan = htons(ODP_VLAN_NONE); + key->in_port = in_port; + + if (skb->len < sizeof *eth) + return 0; + if (!pskb_may_pull(skb, skb->len >= 64 ? 64 : skb->len)) { + return 0; + } + + skb_reset_mac_header(skb); + eth = eth_hdr(skb); + esh = (struct eth_snap_hdr *) eth; + nh_ofs = sizeof *eth; + if (likely(ntohs(eth->h_proto) >= ODP_DL_TYPE_ETH2_CUTOFF)) + key->dl_type = eth->h_proto; + else if (skb->len >= sizeof *esh && is_snap(esh)) { + key->dl_type = esh->ethertype; + nh_ofs = sizeof *esh; + } else { + key->dl_type = htons(ODP_DL_TYPE_NOT_ETH_TYPE); + if (skb->len >= nh_ofs + sizeof(struct llc_pdu_un)) { + nh_ofs += sizeof(struct llc_pdu_un); + } + } + + /* Check for a VLAN tag */ + if (key->dl_type == htons(ETH_P_8021Q) && + skb->len >= nh_ofs + sizeof(struct vlan_hdr)) { + struct vlan_hdr *vh = (struct vlan_hdr*)(skb->data + nh_ofs); + key->dl_type = vh->h_vlan_encapsulated_proto; + key->dl_vlan = vh->h_vlan_TCI & htons(VLAN_VID_MASK); + nh_ofs += sizeof(struct vlan_hdr); + } + memcpy(key->dl_src, eth->h_source, ETH_ALEN); + memcpy(key->dl_dst, eth->h_dest, ETH_ALEN); + skb_set_network_header(skb, nh_ofs); + + /* Network layer. */ + if (key->dl_type == htons(ETH_P_IP) && iphdr_ok(skb)) { + struct iphdr *nh = ip_hdr(skb); + int th_ofs = nh_ofs + nh->ihl * 4; + key->nw_src = nh->saddr; + key->nw_dst = nh->daddr; + key->nw_proto = nh->protocol; + skb_set_transport_header(skb, th_ofs); + + /* Transport layer. */ + if (!(nh->frag_off & htons(IP_MF | IP_OFFSET))) { + if (key->nw_proto == IPPROTO_TCP) { + if (tcphdr_ok(skb)) { + struct tcphdr *tcp = tcp_hdr(skb); + key->tp_src = tcp->source; + key->tp_dst = tcp->dest; + } else { + /* Avoid tricking other code into + * thinking that this packet has an L4 + * header. */ + key->nw_proto = 0; + } + } else if (key->nw_proto == IPPROTO_UDP) { + if (udphdr_ok(skb)) { + struct udphdr *udp = udp_hdr(skb); + key->tp_src = udp->source; + key->tp_dst = udp->dest; + } else { + /* Avoid tricking other code into + * thinking that this packet has an L4 + * header. */ + key->nw_proto = 0; + } + } else if (key->nw_proto == IPPROTO_ICMP) { + if (icmphdr_ok(skb)) { + struct icmphdr *icmp = icmp_hdr(skb); + /* The ICMP type and code fields use the 16-bit + * transport port fields, so we need to store them + * in 16-bit network byte order. */ + key->tp_src = htons(icmp->type); + key->tp_dst = htons(icmp->code); + } else { + /* Avoid tricking other code into + * thinking that this packet has an L4 + * header. */ + key->nw_proto = 0; + } + } + } else { + retval = 1; + } + } else { + skb_reset_transport_header(skb); + } + return retval; +} + +/* Initializes the flow module. + * Returns zero if successful or a negative error code. */ +int flow_init(void) +{ + flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0, + 0, NULL); + if (flow_cache == NULL) + return -ENOMEM; + + return 0; +} + +/* Uninitializes the flow module. */ +void flow_exit(void) +{ + kmem_cache_destroy(flow_cache); +} + +void print_flow(const struct odp_flow_key *key) +{ +#define MAC_FMT "%02x:%02x:%02x:%02x:%02x:%02x" +#define MAC_ARG(x) ((u8*)(x))[0],((u8*)(x))[1],((u8*)(x))[2],((u8*)(x))[3],((u8*)(x))[4],((u8*)(x))[5] + printk("port%04x:vlan%d mac"MAC_FMT"->"MAC_FMT" " + "type%04x proto%d ip%x->%x port%d->%d\n", + key->in_port, ntohs(key->dl_vlan), + MAC_ARG(key->dl_src), MAC_ARG(key->dl_dst), + ntohs(key->dl_type), key->nw_proto, + key->nw_src, key->nw_dst, + ntohs(key->tp_src), ntohs(key->tp_dst)); +} diff --git a/datapath/flow.h b/datapath/flow.h new file mode 100644 index 00000000..55efede1 --- /dev/null +++ b/datapath/flow.h @@ -0,0 +1,49 @@ +#ifndef FLOW_H +#define FLOW_H 1 + +#include +#include +#include +#include +#include + +#include "openvswitch/datapath-protocol.h" + +struct sk_buff; + +struct sw_flow_actions { + struct rcu_head rcu; + unsigned int n_actions; + union odp_action actions[]; +}; + +struct sw_flow { + struct rcu_head rcu; + struct odp_flow_key key; + struct sw_flow_actions *sf_acts; + + struct timespec used; /* Last used time. */ + + u8 ip_tos; /* IP TOS value. */ + + spinlock_t lock; /* Lock for values below. */ + u64 packet_count; /* Number of packets matched. */ + u64 byte_count; /* Number of bytes matched. */ + u8 tcp_flags; /* Union of seen TCP flags. */ +}; + +extern struct kmem_cache *flow_cache; + +struct sw_flow_actions *flow_actions_alloc(size_t n_actions); +void flow_free(struct sw_flow *); +void flow_deferred_free(struct sw_flow *); +void flow_deferred_free_acts(struct sw_flow_actions *); +int flow_extract(struct sk_buff *, u16 in_port, struct odp_flow_key *); +void flow_used(struct sw_flow *, struct sk_buff *); + +void print_flow(const struct odp_flow_key *); + +int flow_init(void); +void flow_exit(void); + +#endif /* flow.h */ diff --git a/datapath/linux-2.6/.gitignore b/datapath/linux-2.6/.gitignore new file mode 100644 index 00000000..af5821a2 --- /dev/null +++ b/datapath/linux-2.6/.gitignore @@ -0,0 +1,20 @@ +/Kbuild +/Makefile +/Makefile.main +/actions.c +/brcompat.c +/brc_sysfs_dp.c +/brc_sysfs_if.c +/datapath.c +/dp_dev.c +/dp_notify.c +/flow.c +/genetlink-brcompat.c +/genetlink-openvswitch.c +/kcompat.h +/linux-2.6 +/modules.order +/random32.c +/table.c +/tmp +/veth.c diff --git a/datapath/linux-2.6/Kbuild.in b/datapath/linux-2.6/Kbuild.in new file mode 100644 index 00000000..f08eb9c5 --- /dev/null +++ b/datapath/linux-2.6/Kbuild.in @@ -0,0 +1,34 @@ +# -*- makefile -*- +export builddir = @abs_builddir@ +export srcdir = @abs_srcdir@ +export top_srcdir = @abs_top_srcdir@ +export VERSION = @VERSION@ +export BUILDNR = @BUILDNR@ + +include $(srcdir)/../Modules.mk +include $(srcdir)/Modules.mk + +EXTRA_CFLAGS := -DVERSION=\"$(VERSION)\" +EXTRA_CFLAGS += -I$(srcdir)/.. +EXTRA_CFLAGS += -I$(builddir)/.. +EXTRA_CFLAGS += -I$(top_srcdir)/include +ifeq '$(BUILDNR)' '0' +EXTRA_CFLAGS += -DBUILDNR=\"\" +else +EXTRA_CFLAGS += -DBUILDNR=\"+build$(BUILDNR)\" +endif +EXTRA_CFLAGS += -g +EXTRA_CFLAGS += -include $(builddir)/kcompat.h + +# These include directories have to go before -I$(KSRC)/include. +# NOSTDINC_FLAGS just happens to be a variable that goes in the +# right place, even though it's conceptually incorrect. +NOSTDINC_FLAGS += -I$(srcdir)/compat-2.6 -I$(srcdir)/compat-2.6/include + +obj-m := $(patsubst %,%_mod.o,$(build_modules)) + +define module_template +$(1)_mod-y = $$(notdir $$(patsubst %.c,%.o,$($(1)_sources))) +endef + +$(foreach module,$(build_modules),$(eval $(call module_template,$(module)))) diff --git a/datapath/linux-2.6/Makefile.in b/datapath/linux-2.6/Makefile.in new file mode 100644 index 00000000..efc1663e --- /dev/null +++ b/datapath/linux-2.6/Makefile.in @@ -0,0 +1,9 @@ +ifeq ($(KERNELRELEASE),) +# We're being called directly by running make in this directory. +include Makefile.main +else +# We're being included by the Linux kernel build system +include Kbuild +endif + + diff --git a/datapath/linux-2.6/Makefile.main.in b/datapath/linux-2.6/Makefile.main.in new file mode 100644 index 00000000..0005ec4f --- /dev/null +++ b/datapath/linux-2.6/Makefile.main.in @@ -0,0 +1,82 @@ +# -*- makefile -*- +export builddir = @abs_builddir@ +export srcdir = @abs_srcdir@ +export top_srcdir = @abs_top_srcdir@ +export KSRC = @KSRC26@ +export VERSION = @VERSION@ +export BUILD_VETH = @BUILD_VETH@ + +include $(srcdir)/../Modules.mk +include $(srcdir)/Modules.mk + +default: $(build_links) + +$(foreach s,$(sort $(foreach m,$(build_modules),$($(m)_sources))), \ + $(eval $(notdir $(s)): ; ln -s $(srcdir)/../$(s) $@)) + +distclean: clean + rm -f kcompat.h +distdir: clean +install: +all: default +check: all +clean: + rm -f *.o *.ko *_mod.* Module.symvers *.cmd kcompat.h.new + for d in $(build_links); do if test -h $$d; then rm $$d; fi; done + +ifneq ($(KSRC),) + +ifeq (/lib/modules/$(shell uname -r)/source, $(KSRC)) + KOBJ := /lib/modules/$(shell uname -r)/build +else + KOBJ := $(KSRC) +endif + +ifneq ($(shell grep -c 'PATCHLEVEL = 6' $(KSRC)/Makefile),1) + $(error Linux kernel source in $(KSRC) not 2.6) +endif + +VERSION_FILE := $(KOBJ)/include/linux/version.h +ifeq (,$(wildcard $(VERSION_FILE))) + $(error Linux kernel source not configured - missing version.h) +endif + +CONFIG_FILE := $(KSRC)/include/linux/autoconf.h +ifeq (,$(wildcard $(CONFIG_FILE))) + $(error Linux kernel source not configured - missing autoconf.h) +endif + +default: + $(MAKE) -C $(KSRC) M=$(builddir) modules +endif + +# Much of the kernel build system in this file is derived from Intel's +# e1000 distribution, with the following license: + +################################################################################ +# +# Intel PRO/1000 Linux driver +# Copyright(c) 1999 - 2007, 2009 Intel Corporation. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms and conditions of the GNU General Public License, +# version 2, as published by the Free Software Foundation. +# +# This program is distributed in the hope it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +# more details. +# +# You should have received a copy of the GNU General Public License along with +# this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. +# +# The full GNU General Public License is included in this distribution in +# the file called "COPYING". +# +# Contact Information: +# Linux NICS +# e1000-devel Mailing List +# Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 +# +################################################################################ diff --git a/datapath/linux-2.6/Modules.mk b/datapath/linux-2.6/Modules.mk new file mode 100644 index 00000000..bbc4c72f --- /dev/null +++ b/datapath/linux-2.6/Modules.mk @@ -0,0 +1,50 @@ +openvswitch_sources += \ + linux-2.6/compat-2.6/genetlink-openvswitch.c \ + linux-2.6/compat-2.6/random32.c +openvswitch_headers += \ + linux-2.6/compat-2.6/compat26.h \ + linux-2.6/compat-2.6/include/asm-generic/bug.h \ + linux-2.6/compat-2.6/include/linux/dmi.h \ + linux-2.6/compat-2.6/include/linux/err.h \ + linux-2.6/compat-2.6/include/linux/icmp.h \ + linux-2.6/compat-2.6/include/linux/if_arp.h \ + linux-2.6/compat-2.6/include/linux/ip.h \ + linux-2.6/compat-2.6/include/linux/ipv6.h \ + linux-2.6/compat-2.6/include/linux/jiffies.h \ + linux-2.6/compat-2.6/include/linux/kernel.h \ + linux-2.6/compat-2.6/include/linux/log2.h \ + linux-2.6/compat-2.6/include/linux/lockdep.h \ + linux-2.6/compat-2.6/include/linux/mutex.h \ + linux-2.6/compat-2.6/include/linux/netdevice.h \ + linux-2.6/compat-2.6/include/linux/netfilter_bridge.h \ + linux-2.6/compat-2.6/include/linux/netfilter_ipv4.h \ + linux-2.6/compat-2.6/include/linux/netlink.h \ + linux-2.6/compat-2.6/include/linux/percpu.h \ + linux-2.6/compat-2.6/include/linux/random.h \ + linux-2.6/compat-2.6/include/linux/rculist.h \ + linux-2.6/compat-2.6/include/linux/rtnetlink.h \ + linux-2.6/compat-2.6/include/linux/skbuff.h \ + linux-2.6/compat-2.6/include/linux/tcp.h \ + linux-2.6/compat-2.6/include/linux/timer.h \ + linux-2.6/compat-2.6/include/linux/types.h \ + linux-2.6/compat-2.6/include/linux/udp.h \ + linux-2.6/compat-2.6/include/linux/workqueue.h \ + linux-2.6/compat-2.6/include/net/checksum.h \ + linux-2.6/compat-2.6/include/net/genetlink.h \ + linux-2.6/compat-2.6/include/net/netlink.h + +both_modules += brcompat +brcompat_sources = \ + linux-2.6/compat-2.6/genetlink-brcompat.c \ + brcompat.c \ + brc_procfs.c \ + brc_sysfs_dp.c \ + brc_sysfs_if.c +brcompat_headers = \ + brc_procfs.h \ + brc_sysfs.h + +dist_modules += veth +build_modules += $(if $(BUILD_VETH),veth) +veth_sources = linux-2.6/compat-2.6/veth.c +veth_headers = diff --git a/datapath/linux-2.6/compat-2.6/compat26.h b/datapath/linux-2.6/compat-2.6/compat26.h new file mode 100644 index 00000000..61448d63 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/compat26.h @@ -0,0 +1,37 @@ +#ifndef __COMPAT26_H +#define __COMPAT26_H 1 + +#include + +#if defined(CONFIG_PREEMPT) && LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,21) +#error "CONFIG_PREEMPT is broken with 2.6.x before 2.6.21--see commit 4498121ca3, \"[NET]: Handle disabled preemption in gfp_any()\"" +#endif + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) +/*---------------------------------------------------------------------------- + * In 2.6.24, a namespace argument became required for dev_get_by_name. */ + +#define dev_get_by_name(net, name) \ + dev_get_by_name((name)) + +#define dev_get_by_index(net, ifindex) \ + dev_get_by_index((ifindex)) + +#define __dev_get_by_name(net, name) \ + __dev_get_by_name((name)) + +#define __dev_get_by_index(net, ifindex) \ + __dev_get_by_index((ifindex)) + +#endif /* linux kernel <= 2.6.23 */ + + +#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,22) +/*---------------------------------------------------------------------------- + * In 2.6.23, the last argument was dropped from kmem_cache_create. */ +#define kmem_cache_create(n, s, a, f, c) \ + kmem_cache_create((n), (s), (a), (f), (c), NULL) + +#endif /* linux kernel <= 2.6.22 */ + +#endif /* compat26.h */ diff --git a/datapath/linux-2.6/compat-2.6/genetlink-brcompat.c b/datapath/linux-2.6/compat-2.6/genetlink-brcompat.c new file mode 100644 index 00000000..c43b3ce4 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/genetlink-brcompat.c @@ -0,0 +1,20 @@ +#include "net/genetlink.h" + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) + +/* We fix grp->id to 32 so that it doesn't collide with any of the multicast + * groups selected by openvswitch_mod, which uses groups 16 through 31. + * Collision isn't fatal--multicast listeners should check that the family is + * the one that they want and discard others--but it wastes time and memory to + * receive unwanted messages. */ +int genl_register_mc_group(struct genl_family *family, + struct genl_multicast_group *grp) +{ + grp->id = 32; + grp->family = family; + + return 0; +} + +#endif /* kernel < 2.6.23 */ diff --git a/datapath/linux-2.6/compat-2.6/genetlink-openvswitch.c b/datapath/linux-2.6/compat-2.6/genetlink-openvswitch.c new file mode 100644 index 00000000..9e09215f --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/genetlink-openvswitch.c @@ -0,0 +1,22 @@ +#include "net/genetlink.h" + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) + +/* We use multicast groups 16 through 31 to avoid colliding with the multicast + * group selected by brcompat_mod, which uses groups 32. Collision isn't + * fatal--multicast listeners should check that the family is the one that they + * want and discard others--but it wastes time and memory to receive unwanted + * messages. */ +int genl_register_mc_group(struct genl_family *family, + struct genl_multicast_group *grp) +{ + /* This code is called single-threaded. */ + static unsigned int next_id = 0; + grp->id = next_id++ % 16 + 16; + grp->family = family; + + return 0; +} + +#endif /* kernel < 2.6.23 */ diff --git a/datapath/linux-2.6/compat-2.6/include/asm-generic/bug.h b/datapath/linux-2.6/compat-2.6/include/asm-generic/bug.h new file mode 100644 index 00000000..1d9b3140 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/asm-generic/bug.h @@ -0,0 +1,19 @@ +#ifndef __ASM_GENERIC_BUG_WRAPPER_H +#define __ASM_GENERIC_BUG_WRAPPER_H + +#include_next + +#ifndef WARN_ON_ONCE +#define WARN_ON_ONCE(condition) ({ \ + static int __warned; \ + int __ret_warn_once = !!(condition); \ + \ + if (unlikely(__ret_warn_once) && !__warned) { \ + WARN_ON(1); \ + __warned = 1; \ + } \ + unlikely(__ret_warn_once); \ +}) +#endif + +#endif diff --git a/datapath/linux-2.6/compat-2.6/include/linux/cpumask.h b/datapath/linux-2.6/compat-2.6/include/linux/cpumask.h new file mode 100644 index 00000000..48c73aa8 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/cpumask.h @@ -0,0 +1,11 @@ +#ifndef __LINUX_CPUMASK_WRAPPER_H +#define __LINUX_CPUMASK_WRAPPER_H + +#include_next + +/* for_each_cpu was renamed for_each_possible_cpu in 2.6.18. */ +#ifndef for_each_possible_cpu +#define for_each_possible_cpu for_each_cpu +#endif + +#endif /* linux/cpumask.h wrapper */ diff --git a/datapath/linux-2.6/compat-2.6/include/linux/dmi.h b/datapath/linux-2.6/compat-2.6/include/linux/dmi.h new file mode 100644 index 00000000..52916fec --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/dmi.h @@ -0,0 +1,114 @@ +#ifndef __LINUX_DMI_WRAPPER_H +#define __LINUX_DMI_WRAPPER_H 1 + +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23) + +#include_next + +#else /* linux version >= 2.6.23 */ + +#ifndef __DMI_H__ +#define __DMI_H__ + +#include + +enum dmi_field { + DMI_NONE, + DMI_BIOS_VENDOR, + DMI_BIOS_VERSION, + DMI_BIOS_DATE, + DMI_SYS_VENDOR, + DMI_PRODUCT_NAME, + DMI_PRODUCT_VERSION, + DMI_PRODUCT_SERIAL, + DMI_PRODUCT_UUID, + DMI_BOARD_VENDOR, + DMI_BOARD_NAME, + DMI_BOARD_VERSION, + DMI_BOARD_SERIAL, + DMI_BOARD_ASSET_TAG, + DMI_CHASSIS_VENDOR, + DMI_CHASSIS_TYPE, + DMI_CHASSIS_VERSION, + DMI_CHASSIS_SERIAL, + DMI_CHASSIS_ASSET_TAG, + DMI_STRING_MAX, +}; + +enum dmi_device_type { + DMI_DEV_TYPE_ANY = 0, + DMI_DEV_TYPE_OTHER, + DMI_DEV_TYPE_UNKNOWN, + DMI_DEV_TYPE_VIDEO, + DMI_DEV_TYPE_SCSI, + DMI_DEV_TYPE_ETHERNET, + DMI_DEV_TYPE_TOKENRING, + DMI_DEV_TYPE_SOUND, + DMI_DEV_TYPE_IPMI = -1, + DMI_DEV_TYPE_OEM_STRING = -2 +}; + +struct dmi_header { + u8 type; + u8 length; + u16 handle; +}; + +/* + * DMI callbacks for problem boards + */ +struct dmi_strmatch { + u8 slot; + char *substr; +}; + +struct dmi_system_id { + int (*callback)(struct dmi_system_id *); + const char *ident; + struct dmi_strmatch matches[4]; + void *driver_data; +}; + +#define DMI_MATCH(a, b) { a, b } + +struct dmi_device { + struct list_head list; + int type; + const char *name; + void *device_data; /* Type specific data */ +}; + +/* No CONFIG_DMI before 2.6.16 */ +#if defined(CONFIG_DMI) || defined(CONFIG_X86_32) + +extern int dmi_check_system(struct dmi_system_id *list); +extern char * dmi_get_system_info(int field); +extern struct dmi_device * dmi_find_device(int type, const char *name, + struct dmi_device *from); +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) +extern void dmi_scan_machine(void); +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,17) +extern int dmi_get_year(int field); +#endif +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,19) +extern int dmi_name_in_vendors(char *str); +#endif + +#else + +static inline int dmi_check_system(struct dmi_system_id *list) { return 0; } +static inline char * dmi_get_system_info(int field) { return NULL; } +static inline struct dmi_device * dmi_find_device(int type, const char *name, + struct dmi_device *from) { return NULL; } +static inline int dmi_get_year(int year) { return 0; } +static inline int dmi_name_in_vendors(char *s) { return 0; } + +#endif + +#endif /* __DMI_H__ */ + +#endif /* linux kernel < 2.6.22 */ + +#endif diff --git a/datapath/linux-2.6/compat-2.6/include/linux/err.h b/datapath/linux-2.6/compat-2.6/include/linux/err.h new file mode 100644 index 00000000..50faf2a1 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/err.h @@ -0,0 +1,21 @@ +#ifndef __LINUX_ERR_WRAPPER_H +#define __LINUX_ERR_WRAPPER_H 1 + +#include_next + +#ifndef HAVE_ERR_CAST +/** + * ERR_CAST - Explicitly cast an error-valued pointer to another pointer type + * @ptr: The pointer to cast. + * + * Explicitly cast an error-valued pointer to another pointer type in such a + * way as to make it clear that's what's going on. + */ +static inline void *ERR_CAST(const void *ptr) +{ + /* cast away the const */ + return (void *) ptr; +} +#endif /* HAVE_ERR_CAST */ + +#endif diff --git a/datapath/linux-2.6/compat-2.6/include/linux/icmp.h b/datapath/linux-2.6/compat-2.6/include/linux/icmp.h new file mode 100644 index 00000000..89b354e4 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/icmp.h @@ -0,0 +1,13 @@ +#ifndef __LINUX_ICMP_WRAPPER_H +#define __LINUX_ICMP_WRAPPER_H 1 + +#include_next + +#ifndef HAVE_SKBUFF_HEADER_HELPERS +static inline struct icmphdr *icmp_hdr(const struct sk_buff *skb) +{ + return (struct icmphdr *)skb_transport_header(skb); +} +#endif + +#endif diff --git a/datapath/linux-2.6/compat-2.6/include/linux/if_arp.h b/datapath/linux-2.6/compat-2.6/include/linux/if_arp.h new file mode 100644 index 00000000..e48d6ba0 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/if_arp.h @@ -0,0 +1,15 @@ +#ifndef __LINUX_IF_ARP_WRAPPER_H +#define __LINUX_IF_ARP_WRAPPER_H 1 + +#include_next + +#ifndef HAVE_SKBUFF_HEADER_HELPERS +#include + +static inline struct arphdr *arp_hdr(const struct sk_buff *skb) +{ + return (struct arphdr *)skb_network_header(skb); +} +#endif /* !HAVE_SKBUFF_HEADER_HELPERS */ + +#endif diff --git a/datapath/linux-2.6/compat-2.6/include/linux/ip.h b/datapath/linux-2.6/compat-2.6/include/linux/ip.h new file mode 100644 index 00000000..36765396 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/ip.h @@ -0,0 +1,18 @@ +#ifndef __LINUX_IP_WRAPPER_H +#define __LINUX_IP_WRAPPER_H 1 + +#include_next + +#ifndef HAVE_SKBUFF_HEADER_HELPERS +static inline struct iphdr *ip_hdr(const struct sk_buff *skb) +{ + return (struct iphdr *)skb_network_header(skb); +} + +static inline unsigned int ip_hdrlen(const struct sk_buff *skb) +{ + return ip_hdr(skb)->ihl * 4; +} +#endif /* !HAVE_SKBUFF_HEADER_HELPERS */ + +#endif diff --git a/datapath/linux-2.6/compat-2.6/include/linux/ipv6.h b/datapath/linux-2.6/compat-2.6/include/linux/ipv6.h new file mode 100644 index 00000000..25a5431a --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/ipv6.h @@ -0,0 +1,13 @@ +#ifndef __LINUX_IPV6_WRAPPER_H +#define __LINUX_IPV6_WRAPPER_H 1 + +#include_next + +#ifndef HAVE_SKBUFF_HEADER_HELPERS +static inline struct ipv6hdr *ipv6_hdr(const struct sk_buff *skb) +{ + return (struct ipv6hdr *)skb_network_header(skb); +} +#endif + +#endif diff --git a/datapath/linux-2.6/compat-2.6/include/linux/jiffies.h b/datapath/linux-2.6/compat-2.6/include/linux/jiffies.h new file mode 100644 index 00000000..3286e634 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/jiffies.h @@ -0,0 +1,26 @@ +#ifndef __LINUX_JIFFIES_WRAPPER_H +#define __LINUX_JIFFIES_WRAPPER_H 1 + +#include_next + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) + +/* Same as above, but does so with platform independent 64bit types. + * These must be used when utilizing jiffies_64 (i.e. return value of + * get_jiffies_64() */ +#define time_after64(a,b) \ + (typecheck(__u64, a) && \ + typecheck(__u64, b) && \ + ((__s64)(b) - (__s64)(a) < 0)) +#define time_before64(a,b) time_after64(b,a) + +#define time_after_eq64(a,b) \ + (typecheck(__u64, a) && \ + typecheck(__u64, b) && \ + ((__s64)(a) - (__s64)(b) >= 0)) +#define time_before_eq64(a,b) time_after_eq64(b,a) + +#endif /* linux kernel < 2.6.19 */ + +#endif diff --git a/datapath/linux-2.6/compat-2.6/include/linux/kernel.h b/datapath/linux-2.6/compat-2.6/include/linux/kernel.h new file mode 100644 index 00000000..9459155d --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/kernel.h @@ -0,0 +1,9 @@ +#ifndef __KERNEL_H_WRAPPER +#define __KERNEL_H_WRAPPER 1 + +#include_next +#ifndef HAVE_LOG2_H +#include +#endif + +#endif /* linux/kernel.h */ diff --git a/datapath/linux-2.6/compat-2.6/include/linux/lockdep.h b/datapath/linux-2.6/compat-2.6/include/linux/lockdep.h new file mode 100644 index 00000000..1c839423 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/lockdep.h @@ -0,0 +1,450 @@ +/* + * Runtime locking correctness validator + * + * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * + * see Documentation/lockdep-design.txt for more details. + */ +#ifndef __LINUX_LOCKDEP_WRAPPER_H +#define __LINUX_LOCKDEP_WRAPPER_H + +#include_next + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18) + +struct task_struct; +struct lockdep_map; + +#ifdef CONFIG_LOCKDEP + +#include +#include +#include +#include + +/* + * Lock-class usage-state bits: + */ +enum lock_usage_bit +{ + LOCK_USED = 0, + LOCK_USED_IN_HARDIRQ, + LOCK_USED_IN_SOFTIRQ, + LOCK_ENABLED_SOFTIRQS, + LOCK_ENABLED_HARDIRQS, + LOCK_USED_IN_HARDIRQ_READ, + LOCK_USED_IN_SOFTIRQ_READ, + LOCK_ENABLED_SOFTIRQS_READ, + LOCK_ENABLED_HARDIRQS_READ, + LOCK_USAGE_STATES +}; + +/* + * Usage-state bitmasks: + */ +#define LOCKF_USED (1 << LOCK_USED) +#define LOCKF_USED_IN_HARDIRQ (1 << LOCK_USED_IN_HARDIRQ) +#define LOCKF_USED_IN_SOFTIRQ (1 << LOCK_USED_IN_SOFTIRQ) +#define LOCKF_ENABLED_HARDIRQS (1 << LOCK_ENABLED_HARDIRQS) +#define LOCKF_ENABLED_SOFTIRQS (1 << LOCK_ENABLED_SOFTIRQS) + +#define LOCKF_ENABLED_IRQS (LOCKF_ENABLED_HARDIRQS | LOCKF_ENABLED_SOFTIRQS) +#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) + +#define LOCKF_USED_IN_HARDIRQ_READ (1 << LOCK_USED_IN_HARDIRQ_READ) +#define LOCKF_USED_IN_SOFTIRQ_READ (1 << LOCK_USED_IN_SOFTIRQ_READ) +#define LOCKF_ENABLED_HARDIRQS_READ (1 << LOCK_ENABLED_HARDIRQS_READ) +#define LOCKF_ENABLED_SOFTIRQS_READ (1 << LOCK_ENABLED_SOFTIRQS_READ) + +#define LOCKF_ENABLED_IRQS_READ \ + (LOCKF_ENABLED_HARDIRQS_READ | LOCKF_ENABLED_SOFTIRQS_READ) +#define LOCKF_USED_IN_IRQ_READ \ + (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) + +#define MAX_LOCKDEP_SUBCLASSES 8UL + +/* + * Lock-classes are keyed via unique addresses, by embedding the + * lockclass-key into the kernel (or module) .data section. (For + * static locks we use the lock address itself as the key.) + */ +struct lockdep_subclass_key { + char __one_byte; +} __attribute__ ((__packed__)); + +struct lock_class_key { + struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES]; +}; + +/* + * The lock-class itself: + */ +struct lock_class { + /* + * class-hash: + */ + struct list_head hash_entry; + + /* + * global list of all lock-classes: + */ + struct list_head lock_entry; + + struct lockdep_subclass_key *key; + unsigned int subclass; + + /* + * IRQ/softirq usage tracking bits: + */ + unsigned long usage_mask; + struct stack_trace usage_traces[LOCK_USAGE_STATES]; + + /* + * These fields represent a directed graph of lock dependencies, + * to every node we attach a list of "forward" and a list of + * "backward" graph nodes. + */ + struct list_head locks_after, locks_before; + + /* + * Generation counter, when doing certain classes of graph walking, + * to ensure that we check one node only once: + */ + unsigned int version; + + /* + * Statistics counter: + */ + unsigned long ops; + + const char *name; + int name_version; + +#ifdef CONFIG_LOCK_STAT + unsigned long contention_point[4]; +#endif +}; + +#ifdef CONFIG_LOCK_STAT +struct lock_time { + s64 min; + s64 max; + s64 total; + unsigned long nr; +}; + +enum bounce_type { + bounce_acquired_write, + bounce_acquired_read, + bounce_contended_write, + bounce_contended_read, + nr_bounce_types, + + bounce_acquired = bounce_acquired_write, + bounce_contended = bounce_contended_write, +}; + +struct lock_class_stats { + unsigned long contention_point[4]; + struct lock_time read_waittime; + struct lock_time write_waittime; + struct lock_time read_holdtime; + struct lock_time write_holdtime; + unsigned long bounces[nr_bounce_types]; +}; + +struct lock_class_stats lock_stats(struct lock_class *class); +void clear_lock_stats(struct lock_class *class); +#endif + +/* + * Map the lock object (the lock instance) to the lock-class object. + * This is embedded into specific lock instances: + */ +struct lockdep_map { + struct lock_class_key *key; + struct lock_class *class_cache; + const char *name; +#ifdef CONFIG_LOCK_STAT + int cpu; +#endif +}; + +/* + * Every lock has a list of other locks that were taken after it. + * We only grow the list, never remove from it: + */ +struct lock_list { + struct list_head entry; + struct lock_class *class; + struct stack_trace trace; + int distance; +}; + +/* + * We record lock dependency chains, so that we can cache them: + */ +struct lock_chain { + struct list_head entry; + u64 chain_key; +}; + +struct held_lock { + /* + * One-way hash of the dependency chain up to this point. We + * hash the hashes step by step as the dependency chain grows. + * + * We use it for dependency-caching and we skip detection + * passes and dependency-updates if there is a cache-hit, so + * it is absolutely critical for 100% coverage of the validator + * to have a unique key value for every unique dependency path + * that can occur in the system, to make a unique hash value + * as likely as possible - hence the 64-bit width. + * + * The task struct holds the current hash value (initialized + * with zero), here we store the previous hash value: + */ + u64 prev_chain_key; + struct lock_class *class; + unsigned long acquire_ip; + struct lockdep_map *instance; + +#ifdef CONFIG_LOCK_STAT + u64 waittime_stamp; + u64 holdtime_stamp; +#endif + /* + * The lock-stack is unified in that the lock chains of interrupt + * contexts nest ontop of process context chains, but we 'separate' + * the hashes by starting with 0 if we cross into an interrupt + * context, and we also keep do not add cross-context lock + * dependencies - the lock usage graph walking covers that area + * anyway, and we'd just unnecessarily increase the number of + * dependencies otherwise. [Note: hardirq and softirq contexts + * are separated from each other too.] + * + * The following field is used to detect when we cross into an + * interrupt context: + */ + int irq_context; + int trylock; + int read; + int check; + int hardirqs_off; +}; + +/* + * Initialization, self-test and debugging-output methods: + */ +extern void lockdep_init(void); +extern void lockdep_info(void); +extern void lockdep_reset(void); +extern void lockdep_reset_lock(struct lockdep_map *lock); +extern void lockdep_free_key_range(void *start, unsigned long size); + +extern void lockdep_off(void); +extern void lockdep_on(void); + +/* + * These methods are used by specific locking variants (spinlocks, + * rwlocks, mutexes and rwsems) to pass init/acquire/release events + * to lockdep: + */ + +extern void lockdep_init_map(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, int subclass); + +/* + * Reinitialize a lock key - for cases where there is special locking or + * special initialization of locks so that the validator gets the scope + * of dependencies wrong: they are either too broad (they need a class-split) + * or they are too narrow (they suffer from a false class-split): + */ +#define lockdep_set_class(lock, key) \ + lockdep_init_map(&(lock)->dep_map, #key, key, 0) +#define lockdep_set_class_and_name(lock, key, name) \ + lockdep_init_map(&(lock)->dep_map, name, key, 0) +#define lockdep_set_class_and_subclass(lock, key, sub) \ + lockdep_init_map(&(lock)->dep_map, #key, key, sub) +#define lockdep_set_subclass(lock, sub) \ + lockdep_init_map(&(lock)->dep_map, #lock, \ + (lock)->dep_map.key, sub) + +/* + * Acquire a lock. + * + * Values for "read": + * + * 0: exclusive (write) acquire + * 1: read-acquire (no recursion allowed) + * 2: read-acquire with same-instance recursion allowed + * + * Values for check: + * + * 0: disabled + * 1: simple checks (freeing, held-at-exit-time, etc.) + * 2: full validation + */ +extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, unsigned long ip); + +extern void lock_release(struct lockdep_map *lock, int nested, + unsigned long ip); + +# define INIT_LOCKDEP .lockdep_recursion = 0, + +#define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0) + +#else /* !LOCKDEP */ + +static inline void lockdep_off(void) +{ +} + +static inline void lockdep_on(void) +{ +} + +# define lock_acquire(l, s, t, r, c, i) do { } while (0) +# define lock_release(l, n, i) do { } while (0) +# define lockdep_init() do { } while (0) +# define lockdep_info() do { } while (0) +# define lockdep_init_map(lock, name, key, sub) do { (void)(key); } while (0) +# define lockdep_set_class(lock, key) do { (void)(key); } while (0) +# define lockdep_set_class_and_name(lock, key, name) \ + do { (void)(key); } while (0) +#define lockdep_set_class_and_subclass(lock, key, sub) \ + do { (void)(key); } while (0) +#define lockdep_set_subclass(lock, sub) do { } while (0) + +# define INIT_LOCKDEP +# define lockdep_reset() do { debug_locks = 1; } while (0) +# define lockdep_free_key_range(start, size) do { } while (0) +/* + * The class key takes no space if lockdep is disabled: + */ +struct lock_class_key { }; + +#define lockdep_depth(tsk) (0) + +#endif /* !LOCKDEP */ + +#ifdef CONFIG_LOCK_STAT + +extern void lock_contended(struct lockdep_map *lock, unsigned long ip); +extern void lock_acquired(struct lockdep_map *lock); + +#define LOCK_CONTENDED(_lock, try, lock) \ +do { \ + if (!try(_lock)) { \ + lock_contended(&(_lock)->dep_map, _RET_IP_); \ + lock(_lock); \ + } \ + lock_acquired(&(_lock)->dep_map); \ +} while (0) + +#else /* CONFIG_LOCK_STAT */ + +#define lock_contended(lockdep_map, ip) do {} while (0) +#define lock_acquired(lockdep_map) do {} while (0) + +#define LOCK_CONTENDED(_lock, try, lock) \ + lock(_lock) + +#endif /* CONFIG_LOCK_STAT */ + +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_GENERIC_HARDIRQS) +extern void early_init_irq_lock_class(void); +#else +static inline void early_init_irq_lock_class(void) +{ +} +#endif + +#ifdef CONFIG_TRACE_IRQFLAGS +extern void early_boot_irqs_off(void); +extern void early_boot_irqs_on(void); +extern void print_irqtrace_events(struct task_struct *curr); +#else +static inline void early_boot_irqs_off(void) +{ +} +static inline void early_boot_irqs_on(void) +{ +} +static inline void print_irqtrace_events(struct task_struct *curr) +{ +} +#endif + +/* + * For trivial one-depth nesting of a lock-class, the following + * global define can be used. (Subsystems with multiple levels + * of nesting should define their own lock-nesting subclasses.) + */ +#define SINGLE_DEPTH_NESTING 1 + +/* + * Map the dependency ops to NOP or to real lockdep ops, depending + * on the per lock-class debug mode: + */ + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# ifdef CONFIG_PROVE_LOCKING +# define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, i) +# else +# define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, i) +# endif +# define spin_release(l, n, i) lock_release(l, n, i) +#else +# define spin_acquire(l, s, t, i) do { } while (0) +# define spin_release(l, n, i) do { } while (0) +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# ifdef CONFIG_PROVE_LOCKING +# define rwlock_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, i) +# define rwlock_acquire_read(l, s, t, i) lock_acquire(l, s, t, 2, 2, i) +# else +# define rwlock_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, i) +# define rwlock_acquire_read(l, s, t, i) lock_acquire(l, s, t, 2, 1, i) +# endif +# define rwlock_release(l, n, i) lock_release(l, n, i) +#else +# define rwlock_acquire(l, s, t, i) do { } while (0) +# define rwlock_acquire_read(l, s, t, i) do { } while (0) +# define rwlock_release(l, n, i) do { } while (0) +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# ifdef CONFIG_PROVE_LOCKING +# define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, i) +# else +# define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, i) +# endif +# define mutex_release(l, n, i) lock_release(l, n, i) +#else +# define mutex_acquire(l, s, t, i) do { } while (0) +# define mutex_release(l, n, i) do { } while (0) +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# ifdef CONFIG_PROVE_LOCKING +# define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, i) +# define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 2, i) +# else +# define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, i) +# define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 1, i) +# endif +# define rwsem_release(l, n, i) lock_release(l, n, i) +#else +# define rwsem_acquire(l, s, t, i) do { } while (0) +# define rwsem_acquire_read(l, s, t, i) do { } while (0) +# define rwsem_release(l, n, i) do { } while (0) +#endif + +#endif /* linux kernel < 2.6.18 */ + +#endif /* __LINUX_LOCKDEP_WRAPPER_H */ diff --git a/datapath/linux-2.6/compat-2.6/include/linux/log2.h b/datapath/linux-2.6/compat-2.6/include/linux/log2.h new file mode 100644 index 00000000..69abae5e --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/log2.h @@ -0,0 +1,17 @@ +#ifndef __LINUX_LOG2_WRAPPER +#define __LINUX_LOG2_WRAPPER + +#ifdef HAVE_LOG2_H +#include_next +#else +/* This is very stripped down because log2.h has far too many dependencies. */ + +extern __attribute__((const, noreturn)) +int ____ilog2_NaN(void); + +#define ilog2(n) ((n) == 4 ? 2 : \ + (n) == 8 ? 3 : \ + ____ilog2_NaN()) +#endif + +#endif diff --git a/datapath/linux-2.6/compat-2.6/include/linux/mutex.h b/datapath/linux-2.6/compat-2.6/include/linux/mutex.h new file mode 100644 index 00000000..93dfa3b2 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/mutex.h @@ -0,0 +1,59 @@ +#ifndef __LINUX_MUTEX_WRAPPER_H +#define __LINUX_MUTEX_WRAPPER_H + + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16) + +#include + +struct mutex { + struct semaphore sema; +}; + +#define mutex_init(mutex) init_MUTEX(&(mutex)->sema) +#define mutex_destroy(mutex) do { } while (0) + +#define __MUTEX_INITIALIZER(name) \ + __SEMAPHORE_INITIALIZER(name,1) + +#define DEFINE_MUTEX(mutexname) \ + struct mutex mutexname = { __MUTEX_INITIALIZER(mutexname.sema) } + +/* + * See kernel/mutex.c for detailed documentation of these APIs. + * Also see Documentation/mutex-design.txt. + */ +static inline void mutex_lock(struct mutex *lock) +{ + down(&lock->sema); +} + +static inline int mutex_lock_interruptible(struct mutex *lock) +{ + return down_interruptible(&lock->sema); +} + +#define mutex_lock_nested(lock, subclass) mutex_lock(lock) +#define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock) + +/* + * NOTE: mutex_trylock() follows the spin_trylock() convention, + * not the down_trylock() convention! + */ +static inline int mutex_trylock(struct mutex *lock) +{ + return !down_trylock(&lock->sema); +} + +static inline void mutex_unlock(struct mutex *lock) +{ + up(&lock->sema); +} +#else + +#include_next + +#endif /* linux version < 2.6.16 */ + +#endif diff --git a/datapath/linux-2.6/compat-2.6/include/linux/netdevice.h b/datapath/linux-2.6/compat-2.6/include/linux/netdevice.h new file mode 100644 index 00000000..32e1735d --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/netdevice.h @@ -0,0 +1,35 @@ +#ifndef __LINUX_NETDEVICE_WRAPPER_H +#define __LINUX_NETDEVICE_WRAPPER_H 1 + +#include_next + +struct net; + +#ifndef to_net_dev +#define to_net_dev(class) container_of(class, struct net_device, class_dev) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) +static inline +struct net *dev_net(const struct net_device *dev) +{ + return NULL; +} +#endif /* linux kernel < 2.6.26 */ + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) +#define proc_net init_net.proc_net +#endif + +#ifndef for_each_netdev +/* Linux before 2.6.22 didn't have for_each_netdev at all. */ +#define for_each_netdev(net, d) for (d = dev_base; d; d = d->next) +#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) +/* Linux 2.6.24 added a network namespace pointer to the macro. */ +#undef for_each_netdev +#define for_each_netdev(net,d) list_for_each_entry(d, &dev_base_head, dev_list) +#endif + + + +#endif diff --git a/datapath/linux-2.6/compat-2.6/include/linux/netfilter_bridge.h b/datapath/linux-2.6/compat-2.6/include/linux/netfilter_bridge.h new file mode 100644 index 00000000..1c8183c8 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/netfilter_bridge.h @@ -0,0 +1,24 @@ +#ifndef __LINUX_NETFILTER_BRIDGE_WRAPPER_H +#define __LINUX_NETFILTER_BRIDGE_WRAPPER_H + +#include_next + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) + +#include +#include + +static inline unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb) +{ + switch (skb->protocol) { + case __constant_htons(ETH_P_8021Q): + return VLAN_HLEN; + default: + return 0; + } +} + +#endif /* linux version < 2.6.22 */ + +#endif diff --git a/datapath/linux-2.6/compat-2.6/include/linux/netfilter_ipv4.h b/datapath/linux-2.6/compat-2.6/include/linux/netfilter_ipv4.h new file mode 100644 index 00000000..ed8a5d94 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/netfilter_ipv4.h @@ -0,0 +1,19 @@ +#ifndef __LINUX_NETFILTER_IPV4_WRAPPER_H +#define __LINUX_NETFILTER_IPV4_WRAPPER_H 1 + +#include_next + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) + +#ifdef __KERNEL__ + +#define NF_INET_PRE_ROUTING NF_IP_PRE_ROUTING +#define NF_INET_POST_ROUTING NF_IP_POST_ROUTING +#define NF_INET_FORWARD NF_IP_FORWARD + +#endif /* __KERNEL__ */ + +#endif /* linux kernel < 2.6.25 */ + +#endif diff --git a/datapath/linux-2.6/compat-2.6/include/linux/netlink.h b/datapath/linux-2.6/compat-2.6/include/linux/netlink.h new file mode 100644 index 00000000..c5f83bd0 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/netlink.h @@ -0,0 +1,24 @@ +#ifndef __LINUX_NETLINK_WRAPPER_H +#define __LINUX_NETLINK_WRAPPER_H 1 + +#include +#include_next +#include + +#include + +#ifndef NLMSG_DEFAULT_SIZE +#define NLMSG_DEFAULT_SIZE (NLMSG_GOODSIZE - NLMSG_HDRLEN) +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) +#define nlmsg_new(s, f) nlmsg_new_proper((s), (f)) +static inline struct sk_buff *nlmsg_new_proper(int size, gfp_t flags) +{ + return alloc_skb(size, flags); +} + +#endif /* linux kernel < 2.6.19 */ + + +#endif diff --git a/datapath/linux-2.6/compat-2.6/include/linux/percpu.h b/datapath/linux-2.6/compat-2.6/include/linux/percpu.h new file mode 100644 index 00000000..0f68bb25 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/percpu.h @@ -0,0 +1,10 @@ +#ifndef __LINUX_PERCPU_H_WRAPPER +#define __LINUX_PERCPU_H_WRAPPER 1 + +#include_next + +#ifndef percpu_ptr +#define percpu_ptr per_cpu_ptr +#endif + +#endif /* linux/percpu.h wrapper */ diff --git a/datapath/linux-2.6/compat-2.6/include/linux/random.h b/datapath/linux-2.6/compat-2.6/include/linux/random.h new file mode 100644 index 00000000..4e4932c9 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/random.h @@ -0,0 +1,17 @@ +#ifndef __LINUX_RANDOM_WRAPPER_H +#define __LINUX_RANDOM_WRAPPER_H 1 + +#include_next + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) + +#ifdef __KERNEL__ +u32 random32(void); +void srandom32(u32 seed); +#endif /* __KERNEL__ */ + +#endif /* linux kernel < 2.6.19 */ + + +#endif diff --git a/datapath/linux-2.6/compat-2.6/include/linux/rculist.h b/datapath/linux-2.6/compat-2.6/include/linux/rculist.h new file mode 100644 index 00000000..4164c0e9 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/rculist.h @@ -0,0 +1,12 @@ +#ifndef __LINUX_RCULIST_WRAPPER_H +#define __LINUX_RCULIST_WRAPPER_H + +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26) +#include_next +#else +/* Prior to 2.6.26, the contents of rculist.h were part of list.h. */ +#include +#endif + +#endif diff --git a/datapath/linux-2.6/compat-2.6/include/linux/rtnetlink.h b/datapath/linux-2.6/compat-2.6/include/linux/rtnetlink.h new file mode 100644 index 00000000..8bc51560 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/rtnetlink.h @@ -0,0 +1,29 @@ +#ifndef __RTNETLINK_WRAPPER_H +#define __RTNETLINK_WRAPPER_H 1 + +#include_next + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) +static inline int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, + u32 group, struct nlmsghdr *nlh, gfp_t flags) +{ + BUG_ON(nlh); /* not implemented */ + if (group) { + /* errors reported via destination sk->sk_err */ + nlmsg_multicast(rtnl, skb, 0, group); + } + return 0; +} + +static inline void rtnl_set_sk_err(struct net *net, u32 group, int error) +{ + netlink_set_err(rtnl, 0, group, error); +} +#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) +#define rtnl_notify(skb, net, pid, group, nlh, flags) \ + ((void) (net), rtnl_notify(skb, pid, group, nlh, flags)) +#define rtnl_set_sk_err(net, group, error) \ + ((void) (net), rtnl_set_sk_err(group, error)) +#endif /* linux kernel < 2.6.25 */ + +#endif /* linux/rtnetlink.h wrapper */ diff --git a/datapath/linux-2.6/compat-2.6/include/linux/skbuff.h b/datapath/linux-2.6/compat-2.6/include/linux/skbuff.h new file mode 100644 index 00000000..666ef850 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/skbuff.h @@ -0,0 +1,170 @@ +#ifndef __LINUX_SKBUFF_WRAPPER_H +#define __LINUX_SKBUFF_WRAPPER_H 1 + +#include_next + +#include + +#ifndef HAVE_SKB_COPY_FROM_LINEAR_DATA_OFFSET +static inline void skb_copy_from_linear_data_offset(const struct sk_buff *skb, + const int offset, void *to, + const unsigned int len) +{ + memcpy(to, skb->data + offset, len); +} + +static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb, + const int offset, + const void *from, + const unsigned int len) +{ + memcpy(skb->data + offset, from, len); +} + +#endif /* !HAVE_SKB_COPY_FROM_LINEAR_DATA_OFFSET */ + +/* + * The networking layer reserves some headroom in skb data (via + * dev_alloc_skb). This is used to avoid having to reallocate skb data when + * the header has to grow. In the default case, if the header has to grow + * 16 bytes or less we avoid the reallocation. + * + * Unfortunately this headroom changes the DMA alignment of the resulting + * network packet. As for NET_IP_ALIGN, this unaligned DMA is expensive + * on some architectures. An architecture can override this value, + * perhaps setting it to a cacheline in size (since that will maintain + * cacheline alignment of the DMA). It must be a power of 2. + * + * Various parts of the networking layer expect at least 16 bytes of + * headroom, you should not reduce this. + */ +#ifndef NET_SKB_PAD +#define NET_SKB_PAD 16 +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) +static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom, + int cloned) +{ + int delta = 0; + + if (headroom < NET_SKB_PAD) + headroom = NET_SKB_PAD; + if (headroom > skb_headroom(skb)) + delta = headroom - skb_headroom(skb); + + if (delta || cloned) + return pskb_expand_head(skb, ALIGN(delta, NET_SKB_PAD), 0, + GFP_ATOMIC); + return 0; +} + +static inline int skb_cow_head(struct sk_buff *skb, unsigned int headroom) +{ + return __skb_cow(skb, headroom, skb_header_cloned(skb)); +} +#endif /* linux < 2.6.23 */ + + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17) +/* Emulate Linux 2.6.17 and later behavior, in which kfree_skb silently ignores + * null pointer arguments. */ +#define kfree_skb(skb) kfree_skb_maybe_null(skb) +static inline void kfree_skb_maybe_null(struct sk_buff *skb) +{ + if (likely(skb != NULL)) + (kfree_skb)(skb); +} +#endif + + +#ifndef CHECKSUM_PARTIAL +/* Note that CHECKSUM_PARTIAL is not implemented, but this allows us to at + * least test against it: see update_csum() in forward.c. */ +#define CHECKSUM_PARTIAL 3 +#endif +#ifndef CHECKSUM_COMPLETE +#define CHECKSUM_COMPLETE CHECKSUM_HW +#endif + +#ifdef HAVE_MAC_RAW +#define mac_header mac.raw +#define network_header nh.raw +#endif + +#ifndef HAVE_SKBUFF_HEADER_HELPERS +static inline unsigned char *skb_transport_header(const struct sk_buff *skb) +{ + return skb->h.raw; +} + +static inline void skb_reset_transport_header(struct sk_buff *skb) +{ + skb->h.raw = skb->data; +} + +static inline void skb_set_transport_header(struct sk_buff *skb, + const int offset) +{ + skb->h.raw = skb->data + offset; +} + +static inline unsigned char *skb_network_header(const struct sk_buff *skb) +{ + return skb->nh.raw; +} + +static inline void skb_set_network_header(struct sk_buff *skb, const int offset) +{ + skb->nh.raw = skb->data + offset; +} + +static inline unsigned char *skb_mac_header(const struct sk_buff *skb) +{ + return skb->mac.raw; +} + +static inline void skb_reset_mac_header(struct sk_buff *skb) +{ + skb->mac_header = skb->data; +} + +static inline void skb_set_mac_header(struct sk_buff *skb, const int offset) +{ + skb->mac.raw = skb->data + offset; +} + +static inline int skb_transport_offset(const struct sk_buff *skb) +{ + return skb_transport_header(skb) - skb->data; +} + +static inline int skb_network_offset(const struct sk_buff *skb) +{ + return skb_network_header(skb) - skb->data; +} + +static inline void skb_copy_to_linear_data(struct sk_buff *skb, + const void *from, + const unsigned int len) +{ + memcpy(skb->data, from, len); +} +#endif /* !HAVE_SKBUFF_HEADER_HELPERS */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18) +#warning "TSO/UFO not supported on kernels earlier than 2.6.18" + +static inline int skb_is_gso(const struct sk_buff *skb) +{ + return 0; +} + +static inline struct sk_buff *skb_gso_segment(struct sk_buff *skb, + int features) +{ + return NULL; +} +#endif /* before 2.6.18 */ + +#endif diff --git a/datapath/linux-2.6/compat-2.6/include/linux/tcp.h b/datapath/linux-2.6/compat-2.6/include/linux/tcp.h new file mode 100644 index 00000000..6fad1933 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/tcp.h @@ -0,0 +1,18 @@ +#ifndef __LINUX_TCP_WRAPPER_H +#define __LINUX_TCP_WRAPPER_H 1 + +#include_next + +#ifndef HAVE_SKBUFF_HEADER_HELPERS +static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb) +{ + return (struct tcphdr *)skb_transport_header(skb); +} + +static inline unsigned int tcp_hdrlen(const struct sk_buff *skb) +{ + return tcp_hdr(skb)->doff * 4; +} +#endif /* !HAVE_SKBUFF_HEADER_HELPERS */ + +#endif diff --git a/datapath/linux-2.6/compat-2.6/include/linux/timer.h b/datapath/linux-2.6/compat-2.6/include/linux/timer.h new file mode 100644 index 00000000..6c3a9b0f --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/timer.h @@ -0,0 +1,96 @@ +#ifndef __LINUX_TIMER_WRAPPER_H +#define __LINUX_TIMER_WRAPPER_H 1 + +#include_next + +#include + +#ifndef RHEL_RELEASE_VERSION +#define RHEL_RELEASE_VERSION(X,Y) ( 0 ) +#endif +#if ((LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)) && \ + (!defined(RHEL_RELEASE_CODE) || \ + (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(5,1)))) + +extern unsigned long volatile jiffies; + +/** + * __round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies() rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the @j parameter. + */ +static inline unsigned long __round_jiffies(unsigned long j, int cpu) +{ + int rem; + unsigned long original = j; + + /* + * We don't want all cpus firing their timers at once hitting the + * same lock or cachelines, so we skew each extra cpu with an extra + * 3 jiffies. This 3 jiffies came originally from the mm/ code which + * already did this. + * The skew is done by adding 3*cpunr, then round, then subtract this + * extra offset again. + */ + j += cpu * 3; + + rem = j % HZ; + + /* + * If the target jiffie is just after a whole second (which can happen + * due to delays of the timer irq, long irq off times etc etc) then + * we should round down to the whole second, not up. Use 1/4th second + * as cutoff for this rounding as an extreme upper bound for this. + */ + if (rem < HZ/4) /* round down */ + j = j - rem; + else /* round up */ + j = j - rem + HZ; + + /* now that we have rounded, subtract the extra skew again */ + j -= cpu * 3; + + if (j <= jiffies) /* rounding ate our timeout entirely; */ + return original; + return j; +} + + +/** + * round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * + * round_jiffies() rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the @j parameter. + */ +static inline unsigned long round_jiffies(unsigned long j) +{ + return __round_jiffies(j, 0); // FIXME +} + +#endif /* linux kernel < 2.6.20 */ + +#endif diff --git a/datapath/linux-2.6/compat-2.6/include/linux/types.h b/datapath/linux-2.6/compat-2.6/include/linux/types.h new file mode 100644 index 00000000..c1f375eb --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/types.h @@ -0,0 +1,14 @@ +#ifndef __LINUX_TYPES_WRAPPER_H +#define __LINUX_TYPES_WRAPPER_H 1 + +#include_next + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) + +typedef __u16 __bitwise __sum16; +typedef __u32 __bitwise __wsum; + +#endif /* linux kernel < 2.6.20 */ + +#endif diff --git a/datapath/linux-2.6/compat-2.6/include/linux/udp.h b/datapath/linux-2.6/compat-2.6/include/linux/udp.h new file mode 100644 index 00000000..6fe4721b --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/udp.h @@ -0,0 +1,13 @@ +#ifndef __LINUX_UDP_WRAPPER_H +#define __LINUX_UDP_WRAPPER_H 1 + +#include_next + +#ifndef HAVE_SKBUFF_HEADER_HELPERS +static inline struct udphdr *udp_hdr(const struct sk_buff *skb) +{ + return (struct udphdr *)skb_transport_header(skb); +} +#endif /* HAVE_SKBUFF_HEADER_HELPERS */ + +#endif diff --git a/datapath/linux-2.6/compat-2.6/include/linux/workqueue.h b/datapath/linux-2.6/compat-2.6/include/linux/workqueue.h new file mode 100644 index 00000000..1ac3b6ec --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/linux/workqueue.h @@ -0,0 +1,42 @@ +#ifndef __LINUX_WORKQUEUE_WRAPPER_H +#define __LINUX_WORKQUEUE_WRAPPER_H 1 + +#include_next + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) + +#ifdef __KERNEL__ +/* + * initialize a work-struct's func and data pointers: + */ +#undef PREPARE_WORK +#define PREPARE_WORK(_work, _func) \ + do { \ + (_work)->func = (void(*)(void*)) _func; \ + (_work)->data = _work; \ + } while (0) + +/* + * initialize all of a work-struct: + */ +#undef INIT_WORK +#define INIT_WORK(_work, _func) \ + do { \ + INIT_LIST_HEAD(&(_work)->entry); \ + (_work)->pending = 0; \ + PREPARE_WORK((_work), (_func)); \ + init_timer(&(_work)->timer); \ + } while (0) + +#endif /* __KERNEL__ */ + +#endif /* linux kernel < 2.6.20 */ + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) +/* There is no equivalent to cancel_work_sync() so just flush all + * pending work. */ +#define cancel_work_sync(_work) flush_scheduled_work() +#endif + +#endif diff --git a/datapath/linux-2.6/compat-2.6/include/net/checksum.h b/datapath/linux-2.6/compat-2.6/include/net/checksum.h new file mode 100644 index 00000000..c64c6bd0 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/net/checksum.h @@ -0,0 +1,16 @@ +#ifndef __NET_CHECKSUM_WRAPPER_H +#define __NET_CHECKSUM_WRAPPER_H 1 + +#include_next + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) + +static inline __wsum csum_unfold(__sum16 n) +{ + return (__force __wsum)n; +} + +#endif /* linux kernel < 2.6.20 */ + +#endif /* checksum.h */ diff --git a/datapath/linux-2.6/compat-2.6/include/net/genetlink.h b/datapath/linux-2.6/compat-2.6/include/net/genetlink.h new file mode 100644 index 00000000..57a47316 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/net/genetlink.h @@ -0,0 +1,123 @@ +#ifndef __NET_GENERIC_NETLINK_WRAPPER_H +#define __NET_GENERIC_NETLINK_WRAPPER_H 1 + + +#include +#include_next + +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) + +#include + +/*---------------------------------------------------------------------------- + * In 2.6.23, registering of multicast groups was added. Our compatability + * layer just supports registering a single group, since that's all we + * need. + */ + +/** + * struct genl_multicast_group - generic netlink multicast group + * @name: name of the multicast group, names are per-family + * @id: multicast group ID, assigned by the core, to use with + * genlmsg_multicast(). + * @list: list entry for linking + * @family: pointer to family, need not be set before registering + */ +struct genl_multicast_group +{ + struct genl_family *family; /* private */ + struct list_head list; /* private */ + char name[GENL_NAMSIZ]; + u32 id; +}; + +int genl_register_mc_group(struct genl_family *family, + struct genl_multicast_group *grp); +#endif /* linux kernel < 2.6.23 */ + + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) +/** + * genlmsg_msg_size - length of genetlink message not including padding + * @payload: length of message payload + */ +static inline int genlmsg_msg_size(int payload) +{ + return GENL_HDRLEN + payload; +} + +/** + * genlmsg_total_size - length of genetlink message including padding + * @payload: length of message payload + */ +static inline int genlmsg_total_size(int payload) +{ + return NLMSG_ALIGN(genlmsg_msg_size(payload)); +} + +#define genlmsg_multicast(s, p, g, f) \ + genlmsg_multicast_flags((s), (p), (g), (f)) + +static inline int genlmsg_multicast_flags(struct sk_buff *skb, u32 pid, + unsigned int group, gfp_t flags) +{ + int err; + + NETLINK_CB(skb).dst_group = group; + + err = netlink_broadcast(genl_sock, skb, pid, group, flags); + if (err > 0) + err = 0; + + return err; +} +#endif /* linux kernel < 2.6.19 */ + + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) + +#define genlmsg_put(skb, p, seq, fam, flg, c) \ + genlmsg_put((skb), (p), (seq), (fam)->id, (fam)->hdrsize, \ + (flg), (c), (fam)->version) + +/** + * genlmsg_put_reply - Add generic netlink header to a reply message + * @skb: socket buffer holding the message + * @info: receiver info + * @family: generic netlink family + * @flags: netlink message flags + * @cmd: generic netlink command + * + * Returns pointer to user specific header + */ +static inline void *genlmsg_put_reply(struct sk_buff *skb, + struct genl_info *info, struct genl_family *family, + int flags, u8 cmd) +{ + return genlmsg_put(skb, info->snd_pid, info->snd_seq, family, + flags, cmd); +} + +/** + * genlmsg_reply - reply to a request + * @skb: netlink message to be sent back + * @info: receiver information + */ +static inline int genlmsg_reply(struct sk_buff *skb, struct genl_info *info) +{ + return genlmsg_unicast(skb, info->snd_pid); +} + +/** + * genlmsg_new - Allocate a new generic netlink message + * @payload: size of the message payload + * @flags: the type of memory to allocate. + */ +static inline struct sk_buff *genlmsg_new(size_t payload, gfp_t flags) +{ + return nlmsg_new(genlmsg_total_size(payload), flags); +} +#endif /* linux kernel < 2.6.20 */ + +#endif /* genetlink.h */ diff --git a/datapath/linux-2.6/compat-2.6/include/net/netlink.h b/datapath/linux-2.6/compat-2.6/include/net/netlink.h new file mode 100644 index 00000000..e0d594d7 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/include/net/netlink.h @@ -0,0 +1,22 @@ +#ifndef __NET_NETLINK_WRAPPER_H +#define __NET_NETLINK_WRAPPER_H 1 + +#include_next + +#ifndef HAVE_NLA_NUL_STRING +#define NLA_NUL_STRING NLA_STRING + +static inline int VERIFY_NUL_STRING(struct nlattr *attr) +{ + return (!attr || (nla_len(attr) + && memchr(nla_data(attr), '\0', nla_len(attr))) + ? 0 : EINVAL); +} +#else +static inline int VERIFY_NUL_STRING(struct nlattr *attr) +{ + return 0; +} +#endif /* !HAVE_NLA_NUL_STRING */ + +#endif /* net/netlink.h */ diff --git a/datapath/linux-2.6/compat-2.6/random32.c b/datapath/linux-2.6/compat-2.6/random32.c new file mode 100644 index 00000000..b0dd2a32 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/random32.c @@ -0,0 +1,144 @@ +#include +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) + +/* + This is a maximally equidistributed combined Tausworthe generator + based on code from GNU Scientific Library 1.5 (30 Jun 2004) + + x_n = (s1_n ^ s2_n ^ s3_n) + + s1_{n+1} = (((s1_n & 4294967294) <<12) ^ (((s1_n <<13) ^ s1_n) >>19)) + s2_{n+1} = (((s2_n & 4294967288) << 4) ^ (((s2_n << 2) ^ s2_n) >>25)) + s3_{n+1} = (((s3_n & 4294967280) <<17) ^ (((s3_n << 3) ^ s3_n) >>11)) + + The period of this generator is about 2^88. + + From: P. L'Ecuyer, "Maximally Equidistributed Combined Tausworthe + Generators", Mathematics of Computation, 65, 213 (1996), 203--213. + + This is available on the net from L'Ecuyer's home page, + + http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme.ps + ftp://ftp.iro.umontreal.ca/pub/simulation/lecuyer/papers/tausme.ps + + There is an erratum in the paper "Tables of Maximally + Equidistributed Combined LFSR Generators", Mathematics of + Computation, 68, 225 (1999), 261--269: + http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme2.ps + + ... the k_j most significant bits of z_j must be non- + zero, for each j. (Note: this restriction also applies to the + computer code given in [4], but was mistakenly not mentioned in + that paper.) + + This affects the seeding procedure by imposing the requirement + s1 > 1, s2 > 7, s3 > 15. + +*/ + +#include +#include +#include +#include +#include + +#include "compat26.h" + +struct rnd_state { + u32 s1, s2, s3; +}; + +static struct rnd_state net_rand_state[NR_CPUS]; + +static u32 __random32(struct rnd_state *state) +{ +#define TAUSWORTHE(s,a,b,c,d) ((s&c)<>b) + + state->s1 = TAUSWORTHE(state->s1, 13, 19, 4294967294UL, 12); + state->s2 = TAUSWORTHE(state->s2, 2, 25, 4294967288UL, 4); + state->s3 = TAUSWORTHE(state->s3, 3, 11, 4294967280UL, 17); + + return (state->s1 ^ state->s2 ^ state->s3); +} + +static void __set_random32(struct rnd_state *state, unsigned long s) +{ + if (s == 0) + s = 1; /* default seed is 1 */ + +#define LCG(n) (69069 * n) + state->s1 = LCG(s); + state->s2 = LCG(state->s1); + state->s3 = LCG(state->s2); + + /* "warm it up" */ + __random32(state); + __random32(state); + __random32(state); + __random32(state); + __random32(state); + __random32(state); +} + +/** + * random32 - pseudo random number generator + * + * A 32 bit pseudo-random number is generated using a fast + * algorithm suitable for simulation. This algorithm is NOT + * considered safe for cryptographic use. + */ +u32 random32(void) +{ + return __random32(&net_rand_state[smp_processor_id()]); +} + +/** + * srandom32 - add entropy to pseudo random number generator + * @seed: seed value + * + * Add some additional seeding to the random32() pool. + * Note: this pool is per cpu so it only affects current CPU. + */ +void srandom32(u32 entropy) +{ + struct rnd_state *state = &net_rand_state[smp_processor_id()]; + __set_random32(state, state->s1 ^ entropy); +} + +static int __init random32_reseed(void); + +/* + * Generate some initially weak seeding values to allow + * to start the random32() engine. + */ +int __init random32_init(void) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) { + struct rnd_state *state = &net_rand_state[i]; + __set_random32(state, i + jiffies); + } + random32_reseed(); + return 0; +} + +/* + * Generate better values after random number generator + * is fully initalized. + */ +static int __init random32_reseed(void) +{ + int i; + unsigned long seed; + + for (i = 0; i < NR_CPUS; i++) { + struct rnd_state *state = &net_rand_state[i]; + + get_random_bytes(&seed, sizeof(seed)); + __set_random32(state, seed); + } + return 0; +} + +#endif /* kernel < 2.6.19 */ diff --git a/datapath/linux-2.6/compat-2.6/veth.c b/datapath/linux-2.6/compat-2.6/veth.c new file mode 100644 index 00000000..3cda3365 --- /dev/null +++ b/datapath/linux-2.6/compat-2.6/veth.c @@ -0,0 +1,537 @@ +/* veth driver port to Linux 2.6.18 */ + +/* + * drivers/net/veth.c + * + * Copyright (C) 2007, 2009 OpenVZ http://openvz.org, SWsoft Inc + * + * Author: Pavel Emelianov + * Ethtool interface from: Eric W. Biederman + * + */ + +#include +#include +#include +#include + +#include +#include + +#define DRV_NAME "veth" +#define DRV_VERSION "1.0" + +struct veth_net_stats { + unsigned long rx_packets; + unsigned long tx_packets; + unsigned long rx_bytes; + unsigned long tx_bytes; + unsigned long tx_dropped; +}; + +struct veth_priv { + struct net_device *peer; + struct net_device *dev; + struct list_head list; + struct veth_net_stats *stats; + unsigned ip_summed; + struct net_device_stats dev_stats; +}; + +static LIST_HEAD(veth_list); + +/* + * ethtool interface + */ + +static struct { + const char string[ETH_GSTRING_LEN]; +} ethtool_stats_keys[] = { + { "peer_ifindex" }, +}; + +static int veth_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) +{ + cmd->supported = 0; + cmd->advertising = 0; + cmd->speed = SPEED_10000; + cmd->duplex = DUPLEX_FULL; + cmd->port = PORT_TP; + cmd->phy_address = 0; + cmd->transceiver = XCVR_INTERNAL; + cmd->autoneg = AUTONEG_DISABLE; + cmd->maxtxpkt = 0; + cmd->maxrxpkt = 0; + return 0; +} + +static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) +{ + strcpy(info->driver, DRV_NAME); + strcpy(info->version, DRV_VERSION); + strcpy(info->fw_version, "N/A"); +} + +static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf) +{ + switch(stringset) { + case ETH_SS_STATS: + memcpy(buf, ðtool_stats_keys, sizeof(ethtool_stats_keys)); + break; + } +} + +static void veth_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, u64 *data) +{ + struct veth_priv *priv; + + priv = netdev_priv(dev); + data[0] = priv->peer->ifindex; +} + +static u32 veth_get_rx_csum(struct net_device *dev) +{ + struct veth_priv *priv; + + priv = netdev_priv(dev); + return priv->ip_summed == CHECKSUM_UNNECESSARY; +} + +static int veth_set_rx_csum(struct net_device *dev, u32 data) +{ + struct veth_priv *priv; + + priv = netdev_priv(dev); + priv->ip_summed = data ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE; + return 0; +} + +static u32 veth_get_tx_csum(struct net_device *dev) +{ + return (dev->features & NETIF_F_NO_CSUM) != 0; +} + +static int veth_set_tx_csum(struct net_device *dev, u32 data) +{ + if (data) + dev->features |= NETIF_F_NO_CSUM; + else + dev->features &= ~NETIF_F_NO_CSUM; + return 0; +} + +static struct ethtool_ops veth_ethtool_ops = { + .get_settings = veth_get_settings, + .get_drvinfo = veth_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_rx_csum = veth_get_rx_csum, + .set_rx_csum = veth_set_rx_csum, + .get_tx_csum = veth_get_tx_csum, + .set_tx_csum = veth_set_tx_csum, + .get_sg = ethtool_op_get_sg, + .set_sg = ethtool_op_set_sg, + .get_strings = veth_get_strings, + .get_ethtool_stats = veth_get_ethtool_stats, +}; + +/* + * xmit + */ + +static int veth_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct net_device *rcv = NULL; + struct veth_priv *priv, *rcv_priv; + struct veth_net_stats *stats; + int length, cpu; + + skb_orphan(skb); + + priv = netdev_priv(dev); + rcv = priv->peer; + rcv_priv = netdev_priv(rcv); + + cpu = smp_processor_id(); + stats = per_cpu_ptr(priv->stats, cpu); + + if (!(rcv->flags & IFF_UP)) + goto outf; + + skb->dev = rcv; + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, rcv); + if (dev->features & NETIF_F_NO_CSUM) + skb->ip_summed = rcv_priv->ip_summed; + + dst_release(skb->dst); + skb->dst = NULL; + secpath_reset(skb); + nf_reset(skb); + + length = skb->len; + + stats->tx_bytes += length; + stats->tx_packets++; + + stats = per_cpu_ptr(rcv_priv->stats, cpu); + stats->rx_bytes += length; + stats->rx_packets++; + + netif_rx(skb); + return 0; + +outf: + kfree_skb(skb); + stats->tx_dropped++; + return 0; +} + +/* + * general routines + */ + +static struct net_device_stats *veth_get_stats(struct net_device *dev) +{ + struct veth_priv *priv; + struct net_device_stats *dev_stats; + int cpu; + struct veth_net_stats *stats; + + priv = netdev_priv(dev); + dev_stats = &priv->dev_stats; + + dev_stats->rx_packets = 0; + dev_stats->tx_packets = 0; + dev_stats->rx_bytes = 0; + dev_stats->tx_bytes = 0; + dev_stats->tx_dropped = 0; + + for_each_online_cpu(cpu) { + stats = per_cpu_ptr(priv->stats, cpu); + + dev_stats->rx_packets += stats->rx_packets; + dev_stats->tx_packets += stats->tx_packets; + dev_stats->rx_bytes += stats->rx_bytes; + dev_stats->tx_bytes += stats->tx_bytes; + dev_stats->tx_dropped += stats->tx_dropped; + } + + return dev_stats; +} + +static int veth_open(struct net_device *dev) +{ + struct veth_priv *priv; + + priv = netdev_priv(dev); + if (priv->peer == NULL) + return -ENOTCONN; + + if (priv->peer->flags & IFF_UP) { + netif_carrier_on(dev); + netif_carrier_on(priv->peer); + } + return 0; +} + +static int veth_dev_init(struct net_device *dev) +{ + struct veth_net_stats *stats; + struct veth_priv *priv; + + stats = alloc_percpu(struct veth_net_stats); + if (stats == NULL) + return -ENOMEM; + + priv = netdev_priv(dev); + priv->stats = stats; + return 0; +} + +static void veth_dev_free(struct net_device *dev) +{ + struct veth_priv *priv; + + priv = netdev_priv(dev); + free_percpu(priv->stats); + free_netdev(dev); +} + +static void veth_setup(struct net_device *dev) +{ + ether_setup(dev); + + dev->hard_start_xmit = veth_xmit; + dev->get_stats = veth_get_stats; + dev->open = veth_open; + dev->ethtool_ops = &veth_ethtool_ops; + dev->features |= NETIF_F_LLTX; + dev->init = veth_dev_init; + dev->destructor = veth_dev_free; +} + +static void veth_change_state(struct net_device *dev) +{ + struct net_device *peer; + struct veth_priv *priv; + + priv = netdev_priv(dev); + peer = priv->peer; + + if (netif_carrier_ok(peer)) { + if (!netif_carrier_ok(dev)) + netif_carrier_on(dev); + } else { + if (netif_carrier_ok(dev)) + netif_carrier_off(dev); + } +} + +static int veth_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + if (dev->open != veth_open) + goto out; + + switch (event) { + case NETDEV_CHANGE: + veth_change_state(dev); + break; + } +out: + return NOTIFY_DONE; +} + +static struct notifier_block veth_notifier_block __read_mostly = { + .notifier_call = veth_device_event, +}; + +/* + * netlink interface + */ + +static int veth_newlink(const char *devname, const char *peername) +{ + int err; + const char *names[2]; + struct net_device *devs[2]; + int i; + + names[0] = devname; + names[1] = peername; + devs[0] = devs[1] = NULL; + + for (i = 0; i < 2; i++) { + struct net_device *dev; + + err = -ENOMEM; + devs[i] = alloc_netdev(sizeof(struct veth_priv), + names[i], veth_setup); + if (!devs[i]) { + goto err; + } + + dev = devs[i]; + + if (strchr(dev->name, '%')) { + err = dev_alloc_name(dev, dev->name); + if (err < 0) + goto err; + } + random_ether_addr(dev->dev_addr); + + err = register_netdevice(dev); + if (err < 0) + goto err; + + netif_carrier_off(dev); + } + + /* + * tie the devices together + */ + + for (i = 0; i < 2; i++) { + struct veth_priv *priv = netdev_priv(devs[i]); + priv->dev = devs[i]; + priv->peer = devs[!i]; + if (!i) + list_add(&priv->list, &veth_list); + else + INIT_LIST_HEAD(&priv->list); + } + return 0; + +err: + for (i = 0; i < 2; i++) { + if (devs[i]) { + if (devs[i]->reg_state != NETREG_UNINITIALIZED) + unregister_netdevice(devs[i]); + else + free_netdev(devs[i]); + } + } + return err; +} + +static void veth_dellink(struct net_device *dev) +{ + struct veth_priv *priv; + struct net_device *peer; + + priv = netdev_priv(dev); + peer = priv->peer; + + if (!list_empty(&priv->list)) + list_del(&priv->list); + + priv = netdev_priv(peer); + if (!list_empty(&priv->list)) + list_del(&priv->list); + + unregister_netdevice(dev); + unregister_netdevice(peer); +} + +/* + * sysfs + */ + +/* + * "show" function for the veth_pairs attribute. + * The class parameter is ignored. + */ +static ssize_t veth_show_veth_pairs(struct class *cls, char *buffer) +{ + int res = 0; + struct veth_priv *priv; + + list_for_each_entry(priv, &veth_list, list) { + if (res > (PAGE_SIZE - (IFNAMSIZ * 2 + 1))) { + /* not enough space for another interface name */ + if ((PAGE_SIZE - res) > 10) + res = PAGE_SIZE - 10; + res += sprintf(buffer + res, "++more++"); + break; + } + res += sprintf(buffer + res, "%s,%s ", + priv->dev->name, priv->peer->name); + } + res += sprintf(buffer + res, "\n"); + res++; + return res; +} + +/* + * "store" function for the veth_pairs attribute. This is what + * creates and deletes veth pairs. + * + * The class parameter is ignored. + * + */ +static ssize_t veth_store_veth_pairs(struct class *cls, const char *buffer, + size_t count) +{ + int c = *buffer++; + int retval; + printk("1\n"); + if (c == '+') { + char devname[IFNAMSIZ + 1] = ""; + char peername[IFNAMSIZ + 1] = ""; + char *comma = strchr(buffer, ','); + printk("2\n"); + if (!comma) + goto err_no_cmd; + strncat(devname, buffer, + min_t(int, sizeof devname, comma - buffer)); + strncat(peername, comma + 1, + min_t(int, sizeof peername, strcspn(comma + 1, "\n"))); + printk("3 '%s' '%s'\n", devname, peername); + if (!dev_valid_name(devname) || !dev_valid_name(peername)) + goto err_no_cmd; + printk("4\n"); + rtnl_lock(); + retval = veth_newlink(devname, peername); + rtnl_unlock(); + return retval ? retval : count; + } else if (c == '-') { + struct net_device *dev; + + rtnl_lock(); + dev = dev_get_by_name(buffer); + if (!dev) + retval = -ENODEV; + else if (dev->init != veth_dev_init) + retval = -EINVAL; + else { + veth_dellink(dev); + retval = count; + } + rtnl_unlock(); + + return retval; + } + +err_no_cmd: + printk(KERN_ERR DRV_NAME ": no command found in veth_pairs. Use +ifname,peername or -ifname.\n"); + return -EPERM; +} + +/* class attribute for veth_pairs file. This ends up in /sys/class/net */ +static CLASS_ATTR(veth_pairs, S_IWUSR | S_IRUGO, + veth_show_veth_pairs, veth_store_veth_pairs); + +static struct class *netdev_class; + +/* + * Initialize sysfs. This sets up the veth_pairs file in + * /sys/class/net. + */ +int veth_create_sysfs(void) +{ + struct net_device *dev = dev_get_by_name("lo"); + if (!dev) + return -ESRCH; + netdev_class = dev->class_dev.class; + if (!netdev_class) + return -ENODEV; + + return class_create_file(netdev_class, &class_attr_veth_pairs); +} + +/* + * Remove /sys/class/net/veth_pairs. + */ +void veth_destroy_sysfs(void) +{ + class_remove_file(netdev_class, &class_attr_veth_pairs); +} + + + +/* + * init/fini + */ + +static __init int veth_init(void) +{ + int retval = veth_create_sysfs(); + if (retval) + return retval; + register_netdevice_notifier(&veth_notifier_block); + return 0; +} + +static __exit void veth_exit(void) +{ + unregister_netdevice_notifier(&veth_notifier_block); +} + +module_init(veth_init); +module_exit(veth_exit); + +MODULE_DESCRIPTION("Virtual Ethernet Tunnel"); +MODULE_LICENSE("GPL v2"); diff --git a/datapath/linux-2.6/config/config-linux-2.6.23-rc9-kvm b/datapath/linux-2.6/config/config-linux-2.6.23-rc9-kvm new file mode 100644 index 00000000..f287cf72 --- /dev/null +++ b/datapath/linux-2.6/config/config-linux-2.6.23-rc9-kvm @@ -0,0 +1,1408 @@ +# +# Automatically generated make config: don't edit +# Linux kernel version: 2.6.23-rc9 +# Fri Oct 19 15:08:37 2007 +# +CONFIG_X86_32=y +CONFIG_GENERIC_TIME=y +CONFIG_GENERIC_CMOS_UPDATE=y +CONFIG_CLOCKSOURCE_WATCHDOG=y +CONFIG_GENERIC_CLOCKEVENTS=y +CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y +CONFIG_LOCKDEP_SUPPORT=y +CONFIG_STACKTRACE_SUPPORT=y +CONFIG_SEMAPHORE_SLEEPERS=y +CONFIG_X86=y +CONFIG_MMU=y +CONFIG_ZONE_DMA=y +CONFIG_QUICKLIST=y +CONFIG_GENERIC_ISA_DMA=y +CONFIG_GENERIC_IOMAP=y +CONFIG_GENERIC_BUG=y +CONFIG_GENERIC_HWEIGHT=y +CONFIG_ARCH_MAY_HAVE_PC_FDC=y +CONFIG_DMI=y +CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" + +# +# General setup +# +CONFIG_EXPERIMENTAL=y +CONFIG_LOCK_KERNEL=y +CONFIG_INIT_ENV_ARG_LIMIT=32 +CONFIG_LOCALVERSION="" +# CONFIG_LOCALVERSION_AUTO is not set +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +CONFIG_SYSVIPC_SYSCTL=y +CONFIG_POSIX_MQUEUE=y +CONFIG_BSD_PROCESS_ACCT=y +# CONFIG_BSD_PROCESS_ACCT_V3 is not set +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +# CONFIG_USER_NS is not set +# CONFIG_AUDIT is not set +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_LOG_BUF_SHIFT=14 +# CONFIG_CPUSETS is not set +CONFIG_SYSFS_DEPRECATED=y +CONFIG_RELAY=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_INITRAMFS_SOURCE="" +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_SYSCTL=y +# CONFIG_EMBEDDED is not set +CONFIG_UID16=y +CONFIG_SYSCTL_SYSCALL=y +CONFIG_KALLSYMS=y +CONFIG_KALLSYMS_ALL=y +CONFIG_KALLSYMS_EXTRA_PASS=y +CONFIG_HOTPLUG=y +CONFIG_PRINTK=y +CONFIG_BUG=y +CONFIG_ELF_CORE=y +CONFIG_BASE_FULL=y +CONFIG_FUTEX=y +CONFIG_ANON_INODES=y +CONFIG_EPOLL=y +CONFIG_SIGNALFD=y +CONFIG_EVENTFD=y +CONFIG_SHMEM=y +CONFIG_VM_EVENT_COUNTERS=y +CONFIG_SLAB=y +# CONFIG_SLUB is not set +# CONFIG_SLOB is not set +CONFIG_RT_MUTEXES=y +# CONFIG_TINY_SHMEM is not set +CONFIG_BASE_SMALL=0 +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +# CONFIG_MODULE_FORCE_UNLOAD is not set +# CONFIG_MODVERSIONS is not set +# CONFIG_MODULE_SRCVERSION_ALL is not set +CONFIG_KMOD=y +CONFIG_STOP_MACHINE=y +CONFIG_BLOCK=y +CONFIG_LBD=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_LSF=y +# CONFIG_BLK_DEV_BSG is not set + +# +# IO Schedulers +# +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_AS=y +CONFIG_IOSCHED_DEADLINE=y +CONFIG_IOSCHED_CFQ=y +CONFIG_DEFAULT_AS=y +# CONFIG_DEFAULT_DEADLINE is not set +# CONFIG_DEFAULT_CFQ is not set +# CONFIG_DEFAULT_NOOP is not set +CONFIG_DEFAULT_IOSCHED="anticipatory" + +# +# Processor type and features +# +# CONFIG_TICK_ONESHOT is not set +# CONFIG_NO_HZ is not set +# CONFIG_HIGH_RES_TIMERS is not set +CONFIG_SMP=y +CONFIG_X86_PC=y +# CONFIG_X86_ELAN is not set +# CONFIG_X86_VOYAGER is not set +# CONFIG_X86_NUMAQ is not set +# CONFIG_X86_SUMMIT is not set +# CONFIG_X86_BIGSMP is not set +# CONFIG_X86_VISWS is not set +# CONFIG_X86_GENERICARCH is not set +# CONFIG_X86_ES7000 is not set +# CONFIG_PARAVIRT is not set +# CONFIG_M386 is not set +CONFIG_M486=y +# CONFIG_M586 is not set +# CONFIG_M586TSC is not set +# CONFIG_M586MMX is not set +# CONFIG_M686 is not set +# CONFIG_MPENTIUMII is not set +# CONFIG_MPENTIUMIII is not set +# CONFIG_MPENTIUMM is not set +# CONFIG_MCORE2 is not set +# CONFIG_MPENTIUM4 is not set +# CONFIG_MK6 is not set +# CONFIG_MK7 is not set +# CONFIG_MK8 is not set +# CONFIG_MCRUSOE is not set +# CONFIG_MEFFICEON is not set +# CONFIG_MWINCHIPC6 is not set +# CONFIG_MWINCHIP2 is not set +# CONFIG_MWINCHIP3D is not set +# CONFIG_MGEODEGX1 is not set +# CONFIG_MGEODE_LX is not set +# CONFIG_MCYRIXIII is not set +# CONFIG_MVIAC3_2 is not set +# CONFIG_MVIAC7 is not set +CONFIG_X86_GENERIC=y +CONFIG_X86_CMPXCHG=y +CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_XADD=y +CONFIG_RWSEM_XCHGADD_ALGORITHM=y +# CONFIG_ARCH_HAS_ILOG2_U32 is not set +# CONFIG_ARCH_HAS_ILOG2_U64 is not set +CONFIG_GENERIC_CALIBRATE_DELAY=y +CONFIG_X86_PPRO_FENCE=y +CONFIG_X86_F00F_BUG=y +CONFIG_X86_WP_WORKS_OK=y +CONFIG_X86_INVLPG=y +CONFIG_X86_BSWAP=y +CONFIG_X86_POPAD_OK=y +CONFIG_X86_ALIGNMENT_16=y +CONFIG_X86_INTEL_USERCOPY=y +CONFIG_X86_MINIMUM_CPU_FAMILY=4 +# CONFIG_HPET_TIMER is not set +CONFIG_NR_CPUS=8 +# CONFIG_SCHED_SMT is not set +CONFIG_SCHED_MC=y +CONFIG_PREEMPT_NONE=y +# CONFIG_PREEMPT_VOLUNTARY is not set +# CONFIG_PREEMPT is not set +CONFIG_PREEMPT_BKL=y +CONFIG_X86_LOCAL_APIC=y +CONFIG_X86_IO_APIC=y +# CONFIG_X86_MCE is not set +CONFIG_VM86=y +# CONFIG_TOSHIBA is not set +# CONFIG_I8K is not set +# CONFIG_X86_REBOOTFIXUPS is not set +# CONFIG_MICROCODE is not set +# CONFIG_X86_MSR is not set +# CONFIG_X86_CPUID is not set + +# +# Firmware Drivers +# +# CONFIG_EDD is not set +# CONFIG_DELL_RBU is not set +# CONFIG_DCDBAS is not set +CONFIG_DMIID=y +# CONFIG_NOHIGHMEM is not set +CONFIG_HIGHMEM4G=y +# CONFIG_HIGHMEM64G is not set +CONFIG_PAGE_OFFSET=0xC0000000 +CONFIG_HIGHMEM=y +CONFIG_ARCH_FLATMEM_ENABLE=y +CONFIG_ARCH_SPARSEMEM_ENABLE=y +CONFIG_ARCH_SELECT_MEMORY_MODEL=y +CONFIG_ARCH_POPULATES_NODE_MAP=y +CONFIG_SELECT_MEMORY_MODEL=y +CONFIG_FLATMEM_MANUAL=y +# CONFIG_DISCONTIGMEM_MANUAL is not set +# CONFIG_SPARSEMEM_MANUAL is not set +CONFIG_FLATMEM=y +CONFIG_FLAT_NODE_MEM_MAP=y +CONFIG_SPARSEMEM_STATIC=y +CONFIG_SPLIT_PTLOCK_CPUS=4 +# CONFIG_RESOURCES_64BIT is not set +CONFIG_ZONE_DMA_FLAG=1 +CONFIG_BOUNCE=y +CONFIG_NR_QUICK=1 +CONFIG_VIRT_TO_BUS=y +# CONFIG_HIGHPTE is not set +# CONFIG_MATH_EMULATION is not set +# CONFIG_MTRR is not set +CONFIG_IRQBALANCE=y +CONFIG_SECCOMP=y +# CONFIG_HZ_100 is not set +CONFIG_HZ_250=y +# CONFIG_HZ_300 is not set +# CONFIG_HZ_1000 is not set +CONFIG_HZ=250 +# CONFIG_KEXEC is not set +# CONFIG_CRASH_DUMP is not set +CONFIG_PHYSICAL_START=0x100000 +# CONFIG_RELOCATABLE is not set +CONFIG_PHYSICAL_ALIGN=0x100000 +CONFIG_HOTPLUG_CPU=y +CONFIG_COMPAT_VDSO=y +CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y + +# +# Power management options (ACPI, APM) +# +CONFIG_PM=y +# CONFIG_PM_LEGACY is not set +# CONFIG_PM_DEBUG is not set +CONFIG_PM_SLEEP_SMP=y +CONFIG_PM_SLEEP=y +CONFIG_SUSPEND_SMP_POSSIBLE=y +CONFIG_SUSPEND=y +CONFIG_HIBERNATION_SMP_POSSIBLE=y +# CONFIG_HIBERNATION is not set +# CONFIG_ACPI is not set +CONFIG_APM=y +# CONFIG_APM_IGNORE_USER_SUSPEND is not set +# CONFIG_APM_DO_ENABLE is not set +# CONFIG_APM_CPU_IDLE is not set +# CONFIG_APM_DISPLAY_BLANK is not set +# CONFIG_APM_ALLOW_INTS is not set +# CONFIG_APM_REAL_MODE_POWER_OFF is not set + +# +# CPU Frequency scaling +# +# CONFIG_CPU_FREQ is not set + +# +# Bus options (PCI, PCMCIA, EISA, MCA, ISA) +# +CONFIG_PCI=y +# CONFIG_PCI_GOBIOS is not set +# CONFIG_PCI_GOMMCONFIG is not set +# CONFIG_PCI_GODIRECT is not set +CONFIG_PCI_GOANY=y +CONFIG_PCI_BIOS=y +CONFIG_PCI_DIRECT=y +# CONFIG_PCIEPORTBUS is not set +CONFIG_ARCH_SUPPORTS_MSI=y +# CONFIG_PCI_MSI is not set +# CONFIG_PCI_DEBUG is not set +CONFIG_HT_IRQ=y +CONFIG_ISA_DMA_API=y +CONFIG_ISA=y +# CONFIG_EISA is not set +# CONFIG_MCA is not set +# CONFIG_SCx200 is not set + +# +# PCCARD (PCMCIA/CardBus) support +# +# CONFIG_PCCARD is not set +# CONFIG_HOTPLUG_PCI is not set + +# +# Executable file formats +# +CONFIG_BINFMT_ELF=y +# CONFIG_BINFMT_AOUT is not set +CONFIG_BINFMT_MISC=m + +# +# Networking +# +CONFIG_NET=y + +# +# Networking options +# +CONFIG_PACKET=y +CONFIG_PACKET_MMAP=y +CONFIG_UNIX=y +CONFIG_XFRM=y +CONFIG_XFRM_USER=m +CONFIG_XFRM_SUB_POLICY=y +CONFIG_XFRM_MIGRATE=y +CONFIG_NET_KEY=m +CONFIG_NET_KEY_MIGRATE=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_ASK_IP_FIB_HASH=y +# CONFIG_IP_FIB_TRIE is not set +CONFIG_IP_FIB_HASH=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +# CONFIG_IP_ROUTE_VERBOSE is not set +# CONFIG_IP_PNP is not set +CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE=m +CONFIG_NET_IPGRE_BROADCAST=y +CONFIG_IP_MROUTE=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +# CONFIG_ARPD is not set +CONFIG_SYN_COOKIES=y +CONFIG_INET_AH=m +CONFIG_INET_ESP=m +CONFIG_INET_IPCOMP=m +CONFIG_INET_XFRM_TUNNEL=m +CONFIG_INET_TUNNEL=m +CONFIG_INET_XFRM_MODE_TRANSPORT=m +CONFIG_INET_XFRM_MODE_TUNNEL=m +CONFIG_INET_XFRM_MODE_BEET=m +CONFIG_INET_DIAG=y +CONFIG_INET_TCP_DIAG=y +CONFIG_TCP_CONG_ADVANCED=y +CONFIG_TCP_CONG_BIC=m +CONFIG_TCP_CONG_CUBIC=y +CONFIG_TCP_CONG_WESTWOOD=m +CONFIG_TCP_CONG_HTCP=m +CONFIG_TCP_CONG_HSTCP=m +CONFIG_TCP_CONG_HYBLA=m +CONFIG_TCP_CONG_VEGAS=m +CONFIG_TCP_CONG_SCALABLE=m +CONFIG_TCP_CONG_LP=m +CONFIG_TCP_CONG_VENO=m +CONFIG_TCP_CONG_YEAH=m +CONFIG_TCP_CONG_ILLINOIS=m +# CONFIG_DEFAULT_BIC is not set +CONFIG_DEFAULT_CUBIC=y +# CONFIG_DEFAULT_HTCP is not set +# CONFIG_DEFAULT_VEGAS is not set +# CONFIG_DEFAULT_WESTWOOD is not set +# CONFIG_DEFAULT_RENO is not set +CONFIG_DEFAULT_TCP_CONG="cubic" +CONFIG_TCP_MD5SIG=y +# CONFIG_IP_VS is not set +CONFIG_IPV6=m +CONFIG_IPV6_PRIVACY=y +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +# CONFIG_IPV6_OPTIMISTIC_DAD is not set +CONFIG_INET6_AH=m +CONFIG_INET6_ESP=m +CONFIG_INET6_IPCOMP=m +# CONFIG_IPV6_MIP6 is not set +CONFIG_INET6_XFRM_TUNNEL=m +CONFIG_INET6_TUNNEL=m +CONFIG_INET6_XFRM_MODE_TRANSPORT=m +CONFIG_INET6_XFRM_MODE_TUNNEL=m +CONFIG_INET6_XFRM_MODE_BEET=m +# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set +CONFIG_IPV6_SIT=m +CONFIG_IPV6_TUNNEL=m +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_SUBTREES=y +CONFIG_NETWORK_SECMARK=y +CONFIG_NETFILTER=y +# CONFIG_NETFILTER_DEBUG is not set +CONFIG_BRIDGE_NETFILTER=y + +# +# Core Netfilter Configuration +# +CONFIG_NETFILTER_NETLINK=m +CONFIG_NETFILTER_NETLINK_QUEUE=m +CONFIG_NETFILTER_NETLINK_LOG=m +CONFIG_NF_CONNTRACK_ENABLED=m +CONFIG_NF_CONNTRACK=m +CONFIG_NF_CT_ACCT=y +CONFIG_NF_CONNTRACK_MARK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CT_PROTO_GRE=m +CONFIG_NF_CT_PROTO_SCTP=m +# CONFIG_NF_CT_PROTO_UDPLITE is not set +CONFIG_NF_CONNTRACK_AMANDA=m +CONFIG_NF_CONNTRACK_FTP=m +CONFIG_NF_CONNTRACK_H323=m +CONFIG_NF_CONNTRACK_IRC=m +CONFIG_NF_CONNTRACK_NETBIOS_NS=m +CONFIG_NF_CONNTRACK_PPTP=m +CONFIG_NF_CONNTRACK_SANE=m +CONFIG_NF_CONNTRACK_SIP=m +CONFIG_NF_CONNTRACK_TFTP=m +CONFIG_NF_CT_NETLINK=m +CONFIG_NETFILTER_XTABLES=m +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m +# CONFIG_NETFILTER_XT_TARGET_CONNMARK is not set +# CONFIG_NETFILTER_XT_TARGET_DSCP is not set +CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_NFLOG=m +# CONFIG_NETFILTER_XT_TARGET_NOTRACK is not set +# CONFIG_NETFILTER_XT_TARGET_TRACE is not set +CONFIG_NETFILTER_XT_TARGET_SECMARK=m +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m +CONFIG_NETFILTER_XT_TARGET_TCPMSS=m +CONFIG_NETFILTER_XT_MATCH_COMMENT=m +CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m +# CONFIG_NETFILTER_XT_MATCH_CONNLIMIT is not set +CONFIG_NETFILTER_XT_MATCH_CONNMARK=m +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NETFILTER_XT_MATCH_DCCP=m +CONFIG_NETFILTER_XT_MATCH_DSCP=m +CONFIG_NETFILTER_XT_MATCH_ESP=m +CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_LENGTH=m +CONFIG_NETFILTER_XT_MATCH_LIMIT=m +CONFIG_NETFILTER_XT_MATCH_MAC=m +CONFIG_NETFILTER_XT_MATCH_MARK=m +CONFIG_NETFILTER_XT_MATCH_POLICY=m +CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m +# CONFIG_NETFILTER_XT_MATCH_PHYSDEV is not set +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m +CONFIG_NETFILTER_XT_MATCH_QUOTA=m +CONFIG_NETFILTER_XT_MATCH_REALM=m +CONFIG_NETFILTER_XT_MATCH_SCTP=m +CONFIG_NETFILTER_XT_MATCH_STATE=m +CONFIG_NETFILTER_XT_MATCH_STATISTIC=m +CONFIG_NETFILTER_XT_MATCH_STRING=m +CONFIG_NETFILTER_XT_MATCH_TCPMSS=m +# CONFIG_NETFILTER_XT_MATCH_U32 is not set +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m + +# +# IP: Netfilter Configuration +# +CONFIG_NF_CONNTRACK_IPV4=m +CONFIG_NF_CONNTRACK_PROC_COMPAT=y +# CONFIG_IP_NF_QUEUE is not set +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_IPRANGE=m +CONFIG_IP_NF_MATCH_TOS=m +CONFIG_IP_NF_MATCH_RECENT=m +CONFIG_IP_NF_MATCH_ECN=m +CONFIG_IP_NF_MATCH_AH=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_MATCH_OWNER=m +CONFIG_IP_NF_MATCH_ADDRTYPE=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_TARGET_LOG=m +CONFIG_IP_NF_TARGET_ULOG=m +CONFIG_NF_NAT=m +CONFIG_NF_NAT_NEEDED=y +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_TARGET_REDIRECT=m +CONFIG_IP_NF_TARGET_NETMAP=m +CONFIG_IP_NF_TARGET_SAME=m +CONFIG_NF_NAT_SNMP_BASIC=m +CONFIG_NF_NAT_PROTO_GRE=m +CONFIG_NF_NAT_FTP=m +CONFIG_NF_NAT_IRC=m +CONFIG_NF_NAT_TFTP=m +CONFIG_NF_NAT_AMANDA=m +CONFIG_NF_NAT_PPTP=m +CONFIG_NF_NAT_H323=m +CONFIG_NF_NAT_SIP=m +CONFIG_IP_NF_MANGLE=m +CONFIG_IP_NF_TARGET_TOS=m +CONFIG_IP_NF_TARGET_ECN=m +CONFIG_IP_NF_TARGET_TTL=m +CONFIG_IP_NF_TARGET_CLUSTERIP=m +CONFIG_IP_NF_RAW=m +CONFIG_IP_NF_ARPTABLES=m +CONFIG_IP_NF_ARPFILTER=m +CONFIG_IP_NF_ARP_MANGLE=m + +# +# IPv6: Netfilter Configuration (EXPERIMENTAL) +# +CONFIG_NF_CONNTRACK_IPV6=m +# CONFIG_IP6_NF_QUEUE is not set +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_RT=m +CONFIG_IP6_NF_MATCH_OPTS=m +CONFIG_IP6_NF_MATCH_FRAG=m +CONFIG_IP6_NF_MATCH_HL=m +CONFIG_IP6_NF_MATCH_OWNER=m +CONFIG_IP6_NF_MATCH_IPV6HEADER=m +CONFIG_IP6_NF_MATCH_AH=m +CONFIG_IP6_NF_MATCH_MH=m +CONFIG_IP6_NF_MATCH_EUI64=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_LOG=m +CONFIG_IP6_NF_TARGET_REJECT=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_TARGET_HL=m +CONFIG_IP6_NF_RAW=m + +# +# DECnet: Netfilter Configuration +# +# CONFIG_DECNET_NF_GRABULATOR is not set + +# +# Bridge: Netfilter Configuration +# +# CONFIG_BRIDGE_NF_EBTABLES is not set +CONFIG_IP_DCCP=m +CONFIG_INET_DCCP_DIAG=m +CONFIG_IP_DCCP_ACKVEC=y + +# +# DCCP CCIDs Configuration (EXPERIMENTAL) +# +CONFIG_IP_DCCP_CCID2=m +# CONFIG_IP_DCCP_CCID2_DEBUG is not set +CONFIG_IP_DCCP_CCID3=m +CONFIG_IP_DCCP_TFRC_LIB=m +# CONFIG_IP_DCCP_CCID3_DEBUG is not set +CONFIG_IP_DCCP_CCID3_RTO=100 + +# +# DCCP Kernel Hacking +# +# CONFIG_IP_DCCP_DEBUG is not set +CONFIG_IP_SCTP=m +# CONFIG_SCTP_DBG_MSG is not set +# CONFIG_SCTP_DBG_OBJCNT is not set +# CONFIG_SCTP_HMAC_NONE is not set +# CONFIG_SCTP_HMAC_SHA1 is not set +CONFIG_SCTP_HMAC_MD5=y +CONFIG_TIPC=m +CONFIG_TIPC_ADVANCED=y +CONFIG_TIPC_ZONES=3 +CONFIG_TIPC_CLUSTERS=1 +CONFIG_TIPC_NODES=255 +CONFIG_TIPC_SLAVE_NODES=0 +CONFIG_TIPC_PORTS=8191 +CONFIG_TIPC_LOG=0 +# CONFIG_TIPC_DEBUG is not set +CONFIG_ATM=m +CONFIG_ATM_CLIP=m +# CONFIG_ATM_CLIP_NO_ICMP is not set +CONFIG_ATM_LANE=m +# CONFIG_ATM_MPOA is not set +CONFIG_ATM_BR2684=m +CONFIG_ATM_BR2684_IPFILTER=y +CONFIG_BRIDGE=m +CONFIG_VLAN_8021Q=m +CONFIG_DECNET=m +# CONFIG_DECNET_ROUTER is not set +CONFIG_LLC=m +CONFIG_LLC2=m +CONFIG_IPX=m +CONFIG_IPX_INTERN=y +CONFIG_ATALK=m +CONFIG_DEV_APPLETALK=m +# CONFIG_LTPC is not set +# CONFIG_COPS is not set +CONFIG_IPDDP=m +CONFIG_IPDDP_ENCAP=y +CONFIG_IPDDP_DECAP=y +CONFIG_X25=m +CONFIG_LAPB=m +CONFIG_ECONET=m +CONFIG_ECONET_AUNUDP=y +CONFIG_ECONET_NATIVE=y +CONFIG_WAN_ROUTER=m + +# +# QoS and/or fair queueing +# +CONFIG_NET_SCHED=y +CONFIG_NET_SCH_FIFO=y + +# +# Queueing/Scheduling +# +CONFIG_NET_SCH_CBQ=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_HFSC=m +CONFIG_NET_SCH_ATM=m +CONFIG_NET_SCH_PRIO=m +# CONFIG_NET_SCH_RR is not set +CONFIG_NET_SCH_RED=m +CONFIG_NET_SCH_SFQ=m +CONFIG_NET_SCH_TEQL=m +CONFIG_NET_SCH_TBF=m +CONFIG_NET_SCH_GRED=m +CONFIG_NET_SCH_DSMARK=m +CONFIG_NET_SCH_NETEM=m +CONFIG_NET_SCH_INGRESS=m + +# +# Classification +# +CONFIG_NET_CLS=y +CONFIG_NET_CLS_BASIC=m +CONFIG_NET_CLS_TCINDEX=m +CONFIG_NET_CLS_ROUTE4=m +CONFIG_NET_CLS_ROUTE=y +CONFIG_NET_CLS_FW=m +CONFIG_NET_CLS_U32=m +CONFIG_CLS_U32_PERF=y +CONFIG_CLS_U32_MARK=y +CONFIG_NET_CLS_RSVP=m +CONFIG_NET_CLS_RSVP6=m +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_STACK=32 +CONFIG_NET_EMATCH_CMP=m +CONFIG_NET_EMATCH_NBYTE=m +CONFIG_NET_EMATCH_U32=m +CONFIG_NET_EMATCH_META=m +CONFIG_NET_EMATCH_TEXT=m +CONFIG_NET_CLS_ACT=y +CONFIG_NET_ACT_POLICE=m +CONFIG_NET_ACT_GACT=m +CONFIG_GACT_PROB=y +CONFIG_NET_ACT_MIRRED=m +CONFIG_NET_ACT_IPT=m +CONFIG_NET_ACT_PEDIT=m +CONFIG_NET_ACT_SIMP=m +# CONFIG_NET_CLS_POLICE is not set +CONFIG_NET_CLS_IND=y + +# +# Network testing +# +# CONFIG_NET_PKTGEN is not set +# CONFIG_HAMRADIO is not set +# CONFIG_IRDA is not set +# CONFIG_BT is not set +CONFIG_AF_RXRPC=m +# CONFIG_AF_RXRPC_DEBUG is not set +CONFIG_RXKAD=m +CONFIG_FIB_RULES=y + +# +# Wireless +# +# CONFIG_CFG80211 is not set +# CONFIG_WIRELESS_EXT is not set +# CONFIG_MAC80211 is not set +# CONFIG_IEEE80211 is not set +# CONFIG_RFKILL is not set +# CONFIG_NET_9P is not set + +# +# Device Drivers +# + +# +# Generic Driver Options +# +CONFIG_STANDALONE=y +CONFIG_PREVENT_FIRMWARE_BUILD=y +# CONFIG_FW_LOADER is not set +# CONFIG_DEBUG_DRIVER is not set +# CONFIG_DEBUG_DEVRES is not set +# CONFIG_SYS_HYPERVISOR is not set +CONFIG_CONNECTOR=m +# CONFIG_MTD is not set +CONFIG_PARPORT=m +CONFIG_PARPORT_PC=m +# CONFIG_PARPORT_SERIAL is not set +# CONFIG_PARPORT_PC_FIFO is not set +# CONFIG_PARPORT_PC_SUPERIO is not set +# CONFIG_PARPORT_GSC is not set +# CONFIG_PARPORT_AX88796 is not set +# CONFIG_PARPORT_1284 is not set +# CONFIG_PNP is not set +CONFIG_BLK_DEV=y +# CONFIG_BLK_DEV_FD is not set +# CONFIG_BLK_DEV_XD is not set +# CONFIG_PARIDE is not set +# CONFIG_BLK_CPQ_DA is not set +# CONFIG_BLK_CPQ_CISS_DA is not set +# CONFIG_BLK_DEV_DAC960 is not set +# CONFIG_BLK_DEV_UMEM is not set +# CONFIG_BLK_DEV_COW_COMMON is not set +CONFIG_BLK_DEV_LOOP=m +CONFIG_BLK_DEV_CRYPTOLOOP=m +CONFIG_BLK_DEV_NBD=m +# CONFIG_BLK_DEV_SX8 is not set +CONFIG_BLK_DEV_RAM=m +CONFIG_BLK_DEV_RAM_COUNT=16 +CONFIG_BLK_DEV_RAM_SIZE=4096 +CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 +# CONFIG_CDROM_PKTCDVD is not set +# CONFIG_ATA_OVER_ETH is not set +CONFIG_MISC_DEVICES=y +# CONFIG_IBM_ASM is not set +# CONFIG_PHANTOM is not set +# CONFIG_EEPROM_93CX6 is not set +# CONFIG_SGI_IOC4 is not set +# CONFIG_TIFM_CORE is not set +CONFIG_IDE=y +CONFIG_BLK_DEV_IDE=y + +# +# Please see Documentation/ide.txt for help/info on IDE drives +# +# CONFIG_BLK_DEV_IDE_SATA is not set +# CONFIG_BLK_DEV_HD_IDE is not set +CONFIG_BLK_DEV_IDEDISK=y +# CONFIG_IDEDISK_MULTI_MODE is not set +CONFIG_BLK_DEV_IDECD=y +# CONFIG_BLK_DEV_IDETAPE is not set +# CONFIG_BLK_DEV_IDEFLOPPY is not set +# CONFIG_IDE_TASK_IOCTL is not set +CONFIG_IDE_PROC_FS=y + +# +# IDE chipset support/bugfixes +# +CONFIG_IDE_GENERIC=y +# CONFIG_BLK_DEV_CMD640 is not set +CONFIG_BLK_DEV_IDEPCI=y +# CONFIG_IDEPCI_SHARE_IRQ is not set +CONFIG_IDEPCI_PCIBUS_ORDER=y +# CONFIG_BLK_DEV_OFFBOARD is not set +# CONFIG_BLK_DEV_GENERIC is not set +# CONFIG_BLK_DEV_OPTI621 is not set +# CONFIG_BLK_DEV_RZ1000 is not set +# CONFIG_BLK_DEV_IDEDMA_PCI is not set +# CONFIG_IDE_ARM is not set +# CONFIG_IDE_CHIPSETS is not set +# CONFIG_BLK_DEV_IDEDMA is not set +# CONFIG_BLK_DEV_HD is not set + +# +# SCSI device support +# +# CONFIG_RAID_ATTRS is not set +# CONFIG_SCSI is not set +# CONFIG_SCSI_DMA is not set +# CONFIG_SCSI_NETLINK is not set +# CONFIG_ATA is not set +# CONFIG_MD is not set + +# +# Fusion MPT device support +# +# CONFIG_FUSION is not set + +# +# IEEE 1394 (FireWire) support +# +# CONFIG_FIREWIRE is not set +# CONFIG_IEEE1394 is not set +# CONFIG_I2O is not set +# CONFIG_MACINTOSH_DRIVERS is not set +CONFIG_NETDEVICES=y +# CONFIG_NETDEVICES_MULTIQUEUE is not set +# CONFIG_IFB is not set +CONFIG_DUMMY=m +# CONFIG_BONDING is not set +# CONFIG_MACVLAN is not set +# CONFIG_EQUALIZER is not set +CONFIG_TUN=m +# CONFIG_ARCNET is not set +# CONFIG_PHYLIB is not set +CONFIG_NET_ETHERNET=y +CONFIG_MII=y +# CONFIG_HAPPYMEAL is not set +# CONFIG_SUNGEM is not set +# CONFIG_CASSINI is not set +# CONFIG_NET_VENDOR_3COM is not set +# CONFIG_LANCE is not set +# CONFIG_NET_VENDOR_SMC is not set +# CONFIG_NET_VENDOR_RACAL is not set +# CONFIG_NET_TULIP is not set +# CONFIG_AT1700 is not set +# CONFIG_DEPCA is not set +# CONFIG_HP100 is not set +# CONFIG_NET_ISA is not set +CONFIG_NET_PCI=y +CONFIG_PCNET32=y +# CONFIG_PCNET32_NAPI is not set +# CONFIG_AMD8111_ETH is not set +# CONFIG_ADAPTEC_STARFIRE is not set +# CONFIG_AC3200 is not set +# CONFIG_APRICOT is not set +# CONFIG_B44 is not set +# CONFIG_FORCEDETH is not set +# CONFIG_CS89x0 is not set +# CONFIG_DGRS is not set +# CONFIG_EEPRO100 is not set +# CONFIG_E100 is not set +# CONFIG_FEALNX is not set +# CONFIG_NATSEMI is not set +CONFIG_NE2K_PCI=y +CONFIG_8139CP=y +# CONFIG_8139TOO is not set +# CONFIG_SIS900 is not set +# CONFIG_EPIC100 is not set +# CONFIG_SUNDANCE is not set +# CONFIG_TLAN is not set +# CONFIG_VIA_RHINE is not set +# CONFIG_SC92031 is not set +# CONFIG_NET_POCKET is not set +# CONFIG_NETDEV_1000 is not set +# CONFIG_NETDEV_10000 is not set +# CONFIG_TR is not set + +# +# Wireless LAN +# +# CONFIG_WLAN_PRE80211 is not set +# CONFIG_WLAN_80211 is not set +# CONFIG_WAN is not set +CONFIG_ATM_DRIVERS=y +# CONFIG_ATM_DUMMY is not set +# CONFIG_ATM_TCP is not set +# CONFIG_ATM_LANAI is not set +# CONFIG_ATM_ENI is not set +# CONFIG_ATM_FIRESTREAM is not set +# CONFIG_ATM_ZATM is not set +# CONFIG_ATM_NICSTAR is not set +# CONFIG_ATM_IDT77252 is not set +# CONFIG_ATM_AMBASSADOR is not set +# CONFIG_ATM_HORIZON is not set +# CONFIG_ATM_IA is not set +# CONFIG_ATM_FORE200E_MAYBE is not set +# CONFIG_ATM_HE is not set +# CONFIG_FDDI is not set +CONFIG_HIPPI=y +# CONFIG_ROADRUNNER is not set +# CONFIG_PLIP is not set +# CONFIG_PPP is not set +# CONFIG_SLIP is not set +# CONFIG_SHAPER is not set +# CONFIG_NETCONSOLE is not set +# CONFIG_NETPOLL is not set +# CONFIG_NET_POLL_CONTROLLER is not set +# CONFIG_ISDN is not set +# CONFIG_PHONE is not set + +# +# Input device support +# +CONFIG_INPUT=y +# CONFIG_INPUT_FF_MEMLESS is not set +# CONFIG_INPUT_POLLDEV is not set + +# +# Userland interfaces +# +CONFIG_INPUT_MOUSEDEV=y +CONFIG_INPUT_MOUSEDEV_PSAUX=y +CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 +CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 +# CONFIG_INPUT_JOYDEV is not set +# CONFIG_INPUT_TSDEV is not set +# CONFIG_INPUT_EVDEV is not set +# CONFIG_INPUT_EVBUG is not set + +# +# Input Device Drivers +# +CONFIG_INPUT_KEYBOARD=y +CONFIG_KEYBOARD_ATKBD=y +# CONFIG_KEYBOARD_SUNKBD is not set +# CONFIG_KEYBOARD_LKKBD is not set +# CONFIG_KEYBOARD_XTKBD is not set +# CONFIG_KEYBOARD_NEWTON is not set +# CONFIG_KEYBOARD_STOWAWAY is not set +CONFIG_INPUT_MOUSE=y +CONFIG_MOUSE_PS2=y +CONFIG_MOUSE_PS2_ALPS=y +CONFIG_MOUSE_PS2_LOGIPS2PP=y +CONFIG_MOUSE_PS2_SYNAPTICS=y +CONFIG_MOUSE_PS2_LIFEBOOK=y +CONFIG_MOUSE_PS2_TRACKPOINT=y +# CONFIG_MOUSE_PS2_TOUCHKIT is not set +# CONFIG_MOUSE_SERIAL is not set +# CONFIG_MOUSE_APPLETOUCH is not set +# CONFIG_MOUSE_INPORT is not set +# CONFIG_MOUSE_LOGIBM is not set +# CONFIG_MOUSE_PC110PAD is not set +# CONFIG_MOUSE_VSXXXAA is not set +# CONFIG_INPUT_JOYSTICK is not set +# CONFIG_INPUT_TABLET is not set +# CONFIG_INPUT_TOUCHSCREEN is not set +# CONFIG_INPUT_MISC is not set + +# +# Hardware I/O ports +# +CONFIG_SERIO=y +CONFIG_SERIO_I8042=y +CONFIG_SERIO_SERPORT=y +# CONFIG_SERIO_CT82C710 is not set +# CONFIG_SERIO_PARKBD is not set +# CONFIG_SERIO_PCIPS2 is not set +CONFIG_SERIO_LIBPS2=y +# CONFIG_SERIO_RAW is not set +# CONFIG_GAMEPORT is not set + +# +# Character devices +# +CONFIG_VT=y +CONFIG_VT_CONSOLE=y +CONFIG_HW_CONSOLE=y +# CONFIG_VT_HW_CONSOLE_BINDING is not set +# CONFIG_SERIAL_NONSTANDARD is not set + +# +# Serial drivers +# +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_FIX_EARLYCON_MEM=y +CONFIG_SERIAL_8250_PCI=y +CONFIG_SERIAL_8250_NR_UARTS=4 +CONFIG_SERIAL_8250_RUNTIME_UARTS=4 +# CONFIG_SERIAL_8250_EXTENDED is not set + +# +# Non-8250 serial port support +# +CONFIG_SERIAL_CORE=y +CONFIG_SERIAL_CORE_CONSOLE=y +# CONFIG_SERIAL_JSM is not set +CONFIG_UNIX98_PTYS=y +CONFIG_LEGACY_PTYS=y +CONFIG_LEGACY_PTY_COUNT=256 +# CONFIG_PRINTER is not set +# CONFIG_PPDEV is not set +# CONFIG_TIPAR is not set +# CONFIG_IPMI_HANDLER is not set +CONFIG_WATCHDOG=y +CONFIG_WATCHDOG_NOWAYOUT=y + +# +# Watchdog Device Drivers +# +CONFIG_SOFT_WATCHDOG=y +# CONFIG_ACQUIRE_WDT is not set +# CONFIG_ADVANTECH_WDT is not set +# CONFIG_ALIM1535_WDT is not set +# CONFIG_ALIM7101_WDT is not set +# CONFIG_SC520_WDT is not set +# CONFIG_EUROTECH_WDT is not set +# CONFIG_IB700_WDT is not set +# CONFIG_IBMASR is not set +# CONFIG_WAFER_WDT is not set +# CONFIG_I6300ESB_WDT is not set +# CONFIG_ITCO_WDT is not set +# CONFIG_SC1200_WDT is not set +# CONFIG_PC87413_WDT is not set +# CONFIG_60XX_WDT is not set +# CONFIG_SBC8360_WDT is not set +# CONFIG_CPU5_WDT is not set +# CONFIG_SMSC37B787_WDT is not set +# CONFIG_W83627HF_WDT is not set +# CONFIG_W83697HF_WDT is not set +# CONFIG_W83877F_WDT is not set +# CONFIG_W83977F_WDT is not set +# CONFIG_MACHZ_WDT is not set +# CONFIG_SBC_EPX_C3_WATCHDOG is not set + +# +# ISA-based Watchdog Cards +# +# CONFIG_PCWATCHDOG is not set +# CONFIG_MIXCOMWD is not set +# CONFIG_WDT is not set + +# +# PCI-based Watchdog Cards +# +# CONFIG_PCIPCWATCHDOG is not set +# CONFIG_WDTPCI is not set +# CONFIG_HW_RANDOM is not set +# CONFIG_NVRAM is not set +# CONFIG_RTC is not set +# CONFIG_GEN_RTC is not set +# CONFIG_DTLK is not set +# CONFIG_R3964 is not set +# CONFIG_APPLICOM is not set +# CONFIG_SONYPI is not set +# CONFIG_AGP is not set +# CONFIG_DRM is not set +# CONFIG_MWAVE is not set +# CONFIG_PC8736x_GPIO is not set +# CONFIG_NSC_GPIO is not set +# CONFIG_CS5535_GPIO is not set +CONFIG_RAW_DRIVER=m +CONFIG_MAX_RAW_DEVS=256 +# CONFIG_HANGCHECK_TIMER is not set +# CONFIG_TCG_TPM is not set +# CONFIG_TELCLOCK is not set +CONFIG_DEVPORT=y +# CONFIG_I2C is not set + +# +# SPI support +# +# CONFIG_SPI is not set +# CONFIG_SPI_MASTER is not set +# CONFIG_W1 is not set +# CONFIG_POWER_SUPPLY is not set +CONFIG_HWMON=y +# CONFIG_HWMON_VID is not set +# CONFIG_SENSORS_ABITUGURU is not set +# CONFIG_SENSORS_ABITUGURU3 is not set +# CONFIG_SENSORS_K8TEMP is not set +# CONFIG_SENSORS_F71805F is not set +# CONFIG_SENSORS_CORETEMP is not set +# CONFIG_SENSORS_IT87 is not set +# CONFIG_SENSORS_PC87360 is not set +# CONFIG_SENSORS_PC87427 is not set +# CONFIG_SENSORS_SIS5595 is not set +# CONFIG_SENSORS_SMSC47M1 is not set +# CONFIG_SENSORS_SMSC47B397 is not set +# CONFIG_SENSORS_VIA686A is not set +# CONFIG_SENSORS_VT1211 is not set +# CONFIG_SENSORS_VT8231 is not set +# CONFIG_SENSORS_W83627HF is not set +# CONFIG_SENSORS_W83627EHF is not set +# CONFIG_SENSORS_HDAPS is not set +# CONFIG_SENSORS_APPLESMC is not set +# CONFIG_HWMON_DEBUG_CHIP is not set + +# +# Multifunction device drivers +# +# CONFIG_MFD_SM501 is not set + +# +# Multimedia devices +# +# CONFIG_VIDEO_DEV is not set +# CONFIG_DVB_CORE is not set +# CONFIG_DAB is not set + +# +# Graphics support +# +# CONFIG_BACKLIGHT_LCD_SUPPORT is not set + +# +# Display device support +# +# CONFIG_DISPLAY_SUPPORT is not set +# CONFIG_VGASTATE is not set +CONFIG_VIDEO_OUTPUT_CONTROL=m +# CONFIG_FB is not set + +# +# Console display driver support +# +CONFIG_VGA_CONSOLE=y +# CONFIG_VGACON_SOFT_SCROLLBACK is not set +# CONFIG_VIDEO_SELECT is not set +# CONFIG_MDA_CONSOLE is not set +CONFIG_DUMMY_CONSOLE=y + +# +# Sound +# +# CONFIG_SOUND is not set +CONFIG_HID_SUPPORT=y +# CONFIG_HID is not set +CONFIG_USB_SUPPORT=y +CONFIG_USB_ARCH_HAS_HCD=y +CONFIG_USB_ARCH_HAS_OHCI=y +CONFIG_USB_ARCH_HAS_EHCI=y +# CONFIG_USB is not set + +# +# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' +# + +# +# USB Gadget Support +# +# CONFIG_USB_GADGET is not set +# CONFIG_MMC is not set +# CONFIG_NEW_LEDS is not set +# CONFIG_INFINIBAND is not set +# CONFIG_EDAC is not set +# CONFIG_RTC_CLASS is not set + +# +# DMA Engine support +# +# CONFIG_DMA_ENGINE is not set + +# +# DMA Clients +# + +# +# DMA Devices +# +# CONFIG_AUXDISPLAY is not set +CONFIG_VIRTUALIZATION=y +# CONFIG_KVM is not set + +# +# Userspace I/O +# +# CONFIG_UIO is not set + +# +# File systems +# +# CONFIG_EXT2_FS is not set +CONFIG_EXT3_FS=y +CONFIG_EXT3_FS_XATTR=y +CONFIG_EXT3_FS_POSIX_ACL=y +CONFIG_EXT3_FS_SECURITY=y +# CONFIG_EXT4DEV_FS is not set +CONFIG_JBD=y +# CONFIG_JBD_DEBUG is not set +CONFIG_FS_MBCACHE=y +# CONFIG_REISERFS_FS is not set +# CONFIG_JFS_FS is not set +CONFIG_FS_POSIX_ACL=y +# CONFIG_XFS_FS is not set +# CONFIG_GFS2_FS is not set +# CONFIG_OCFS2_FS is not set +# CONFIG_MINIX_FS is not set +CONFIG_ROMFS_FS=m +CONFIG_INOTIFY=y +CONFIG_INOTIFY_USER=y +# CONFIG_QUOTA is not set +CONFIG_DNOTIFY=y +# CONFIG_AUTOFS_FS is not set +# CONFIG_AUTOFS4_FS is not set +# CONFIG_FUSE_FS is not set +CONFIG_GENERIC_ACL=y + +# +# CD-ROM/DVD Filesystems +# +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_UDF_FS=y +CONFIG_UDF_NLS=y + +# +# DOS/FAT/NT Filesystems +# +# CONFIG_MSDOS_FS is not set +# CONFIG_VFAT_FS is not set +# CONFIG_NTFS_FS is not set + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +CONFIG_PROC_KCORE=y +CONFIG_PROC_SYSCTL=y +CONFIG_SYSFS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +# CONFIG_HUGETLBFS is not set +# CONFIG_HUGETLB_PAGE is not set +CONFIG_RAMFS=y +CONFIG_CONFIGFS_FS=m + +# +# Miscellaneous filesystems +# +# CONFIG_ADFS_FS is not set +# CONFIG_AFFS_FS is not set +# CONFIG_ECRYPT_FS is not set +# CONFIG_HFS_FS is not set +# CONFIG_HFSPLUS_FS is not set +# CONFIG_BEFS_FS is not set +# CONFIG_BFS_FS is not set +# CONFIG_EFS_FS is not set +CONFIG_CRAMFS=m +# CONFIG_VXFS_FS is not set +# CONFIG_HPFS_FS is not set +# CONFIG_QNX4FS_FS is not set +# CONFIG_SYSV_FS is not set +# CONFIG_UFS_FS is not set + +# +# Network File Systems +# +CONFIG_NFS_FS=y +CONFIG_NFS_V3=y +# CONFIG_NFS_V3_ACL is not set +# CONFIG_NFS_V4 is not set +# CONFIG_NFS_DIRECTIO is not set +# CONFIG_NFSD is not set +CONFIG_LOCKD=y +CONFIG_LOCKD_V4=y +CONFIG_NFS_COMMON=y +CONFIG_SUNRPC=y +# CONFIG_SUNRPC_BIND34 is not set +# CONFIG_RPCSEC_GSS_KRB5 is not set +# CONFIG_RPCSEC_GSS_SPKM3 is not set +# CONFIG_SMB_FS is not set +# CONFIG_CIFS is not set +# CONFIG_NCP_FS is not set +# CONFIG_CODA_FS is not set +# CONFIG_AFS_FS is not set + +# +# Partition Types +# +# CONFIG_PARTITION_ADVANCED is not set +CONFIG_MSDOS_PARTITION=y + +# +# Native Language Support +# +CONFIG_NLS=y +CONFIG_NLS_DEFAULT="iso8859-1" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_737=m +CONFIG_NLS_CODEPAGE_775=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_CODEPAGE_852=m +CONFIG_NLS_CODEPAGE_855=m +CONFIG_NLS_CODEPAGE_857=m +CONFIG_NLS_CODEPAGE_860=m +CONFIG_NLS_CODEPAGE_861=m +CONFIG_NLS_CODEPAGE_862=m +CONFIG_NLS_CODEPAGE_863=m +CONFIG_NLS_CODEPAGE_864=m +CONFIG_NLS_CODEPAGE_865=m +CONFIG_NLS_CODEPAGE_866=m +CONFIG_NLS_CODEPAGE_869=m +CONFIG_NLS_CODEPAGE_936=m +CONFIG_NLS_CODEPAGE_950=m +CONFIG_NLS_CODEPAGE_932=m +CONFIG_NLS_CODEPAGE_949=m +CONFIG_NLS_CODEPAGE_874=m +CONFIG_NLS_ISO8859_8=m +CONFIG_NLS_CODEPAGE_1250=m +CONFIG_NLS_CODEPAGE_1251=m +CONFIG_NLS_ASCII=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_ISO8859_2=m +CONFIG_NLS_ISO8859_3=m +CONFIG_NLS_ISO8859_4=m +CONFIG_NLS_ISO8859_5=m +CONFIG_NLS_ISO8859_6=m +CONFIG_NLS_ISO8859_7=m +CONFIG_NLS_ISO8859_9=m +CONFIG_NLS_ISO8859_13=m +CONFIG_NLS_ISO8859_14=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_KOI8_R=m +CONFIG_NLS_KOI8_U=m +CONFIG_NLS_UTF8=m + +# +# Distributed Lock Manager +# +# CONFIG_DLM is not set +CONFIG_INSTRUMENTATION=y +# CONFIG_PROFILING is not set +# CONFIG_KPROBES is not set + +# +# Kernel hacking +# +CONFIG_TRACE_IRQFLAGS_SUPPORT=y +# CONFIG_PRINTK_TIME is not set +# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_MAGIC_SYSRQ=y +# CONFIG_UNUSED_SYMBOLS is not set +CONFIG_DEBUG_FS=y +# CONFIG_HEADERS_CHECK is not set +CONFIG_DEBUG_KERNEL=y +# CONFIG_DEBUG_SHIRQ is not set +CONFIG_DETECT_SOFTLOCKUP=y +CONFIG_SCHED_DEBUG=y +# CONFIG_SCHEDSTATS is not set +# CONFIG_TIMER_STATS is not set +CONFIG_DEBUG_SLAB=y +CONFIG_DEBUG_SLAB_LEAK=y +CONFIG_DEBUG_RT_MUTEXES=y +CONFIG_DEBUG_PI_LIST=y +# CONFIG_RT_MUTEX_TESTER is not set +CONFIG_DEBUG_SPINLOCK=y +CONFIG_DEBUG_MUTEXES=y +CONFIG_DEBUG_LOCK_ALLOC=y +CONFIG_PROVE_LOCKING=y +CONFIG_LOCKDEP=y +CONFIG_LOCK_STAT=y +# CONFIG_DEBUG_LOCKDEP is not set +CONFIG_TRACE_IRQFLAGS=y +CONFIG_DEBUG_SPINLOCK_SLEEP=y +# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +CONFIG_STACKTRACE=y +CONFIG_DEBUG_KOBJECT=y +CONFIG_DEBUG_HIGHMEM=y +CONFIG_DEBUG_BUGVERBOSE=y +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_VM=y +CONFIG_DEBUG_LIST=y +CONFIG_FRAME_POINTER=y +CONFIG_FORCED_INLINING=y +CONFIG_RCU_TORTURE_TEST=m +# CONFIG_FAULT_INJECTION is not set +CONFIG_EARLY_PRINTK=y +CONFIG_DEBUG_STACKOVERFLOW=y +# CONFIG_DEBUG_STACK_USAGE is not set +CONFIG_DEBUG_PAGEALLOC=y +CONFIG_DEBUG_RODATA=y +CONFIG_4KSTACKS=y +CONFIG_X86_FIND_SMP_CONFIG=y +CONFIG_X86_MPPARSE=y +CONFIG_DOUBLEFAULT=y + +# +# Security options +# +CONFIG_KEYS=y +# CONFIG_KEYS_DEBUG_PROC_KEYS is not set +# CONFIG_SECURITY is not set +CONFIG_CRYPTO=y +CONFIG_CRYPTO_ALGAPI=y +CONFIG_CRYPTO_BLKCIPHER=m +CONFIG_CRYPTO_HASH=m +CONFIG_CRYPTO_MANAGER=m +CONFIG_CRYPTO_HMAC=m +# CONFIG_CRYPTO_XCBC is not set +CONFIG_CRYPTO_NULL=m +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_SHA1=m +CONFIG_CRYPTO_SHA256=m +# CONFIG_CRYPTO_SHA512 is not set +# CONFIG_CRYPTO_WP512 is not set +# CONFIG_CRYPTO_TGR192 is not set +CONFIG_CRYPTO_GF128MUL=m +# CONFIG_CRYPTO_ECB is not set +CONFIG_CRYPTO_CBC=m +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_LRW=m +# CONFIG_CRYPTO_CRYPTD is not set +CONFIG_CRYPTO_DES=m +CONFIG_CRYPTO_FCRYPT=m +# CONFIG_CRYPTO_BLOWFISH is not set +# CONFIG_CRYPTO_TWOFISH is not set +# CONFIG_CRYPTO_TWOFISH_586 is not set +# CONFIG_CRYPTO_SERPENT is not set +CONFIG_CRYPTO_AES=m +# CONFIG_CRYPTO_AES_586 is not set +# CONFIG_CRYPTO_CAST5 is not set +# CONFIG_CRYPTO_CAST6 is not set +CONFIG_CRYPTO_TEA=m +# CONFIG_CRYPTO_ARC4 is not set +# CONFIG_CRYPTO_KHAZAD is not set +# CONFIG_CRYPTO_ANUBIS is not set +CONFIG_CRYPTO_DEFLATE=m +# CONFIG_CRYPTO_MICHAEL_MIC is not set +# CONFIG_CRYPTO_CRC32C is not set +# CONFIG_CRYPTO_CAMELLIA is not set +# CONFIG_CRYPTO_TEST is not set +CONFIG_CRYPTO_HW=y +# CONFIG_CRYPTO_DEV_PADLOCK is not set +# CONFIG_CRYPTO_DEV_GEODE is not set + +# +# Library routines +# +CONFIG_BITREVERSE=y +CONFIG_CRC_CCITT=m +CONFIG_CRC16=m +CONFIG_CRC_ITU_T=m +CONFIG_CRC32=y +# CONFIG_CRC7 is not set +CONFIG_LIBCRC32C=m +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=m +CONFIG_TEXTSEARCH=y +CONFIG_TEXTSEARCH_KMP=m +CONFIG_TEXTSEARCH_BM=m +CONFIG_TEXTSEARCH_FSM=m +CONFIG_PLIST=y +CONFIG_HAS_IOMEM=y +CONFIG_HAS_IOPORT=y +CONFIG_HAS_DMA=y +CONFIG_GENERIC_HARDIRQS=y +CONFIG_GENERIC_IRQ_PROBE=y +CONFIG_GENERIC_PENDING_IRQ=y +CONFIG_X86_SMP=y +CONFIG_X86_HT=y +CONFIG_X86_BIOS_REBOOT=y +CONFIG_X86_TRAMPOLINE=y +CONFIG_KTIME_SCALAR=y diff --git a/datapath/table.c b/datapath/table.c new file mode 100644 index 00000000..c0885b70 --- /dev/null +++ b/datapath/table.c @@ -0,0 +1,240 @@ +#include "flow.h" +#include "datapath.h" + +#include +#include +#include +#include +#include +#include +#include + +static void free_table(struct sw_flow ***flows, unsigned int n_buckets, + int free_flows) +{ + unsigned int i; + + for (i = 0; i < n_buckets >> DP_L1_BITS; i++) { + struct sw_flow **l2 = flows[i]; + if (free_flows) { + unsigned int j; + for (j = 0; j < DP_L1_SIZE; j++) { + if (l2[j]) + flow_free(l2[j]); + } + } + free_page((unsigned long)l2); + } + kfree(flows); +} + +static struct sw_flow ***alloc_table(unsigned int n_buckets) +{ + struct sw_flow ***flows; + unsigned int i; + + flows = kmalloc((n_buckets >> DP_L1_BITS) * sizeof(struct sw_flow**), + GFP_KERNEL); + if (!flows) + return NULL; + for (i = 0; i < n_buckets >> DP_L1_BITS; i++) { + flows[i] = (struct sw_flow **)get_zeroed_page(GFP_KERNEL); + if (!flows[i]) { + free_table(flows, i << DP_L1_BITS, 0); + return NULL; + } + } + return flows; +} + +struct dp_table *dp_table_create(unsigned int n_buckets) +{ + struct dp_table *table; + + table = kzalloc(sizeof *table, GFP_KERNEL); + if (!table) + goto err; + + table->n_buckets = n_buckets; + table->flows[0] = alloc_table(n_buckets); + if (!table[0].flows) + goto err_free_tables; + + table->flows[1] = alloc_table(n_buckets); + if (!table->flows[1]) + goto err_free_flows0; + + return table; + +err_free_flows0: + free_table(table->flows[0], table->n_buckets, 0); +err_free_tables: + kfree(table); +err: + return NULL; +} + +void dp_table_destroy(struct dp_table *table, int free_flows) +{ + int i; + for (i = 0; i < 2; i++) + free_table(table->flows[i], table->n_buckets, free_flows); + kfree(table); +} + +static struct sw_flow **find_bucket(struct dp_table *table, + struct sw_flow ***flows, u32 hash) +{ + unsigned int l1 = (hash & (table->n_buckets - 1)) >> DP_L1_SHIFT; + unsigned int l2 = hash & ((1 << DP_L2_BITS) - 1); + return &flows[l1][l2]; +} + +static struct sw_flow *lookup_table(struct dp_table *table, + struct sw_flow ***flows, u32 hash, + const struct odp_flow_key *key) +{ + struct sw_flow **bucket = find_bucket(table, flows, hash); + struct sw_flow *flow = rcu_dereference(*bucket); + if (flow && !memcmp(&flow->key, key, sizeof(struct odp_flow_key))) + return flow; + return NULL; +} + +static u32 flow_hash0(const struct odp_flow_key *key) +{ + return jhash2((u32*)key, sizeof *key / sizeof(u32), 0xaaaaaaaa); +} + +static u32 flow_hash1(const struct odp_flow_key *key) +{ + return jhash2((u32*)key, sizeof *key / sizeof(u32), 0x55555555); +} + +static void find_buckets(struct dp_table *table, + const struct odp_flow_key *key, + struct sw_flow **buckets[2]) +{ + buckets[0] = find_bucket(table, table->flows[0], flow_hash0(key)); + buckets[1] = find_bucket(table, table->flows[1], flow_hash1(key)); +} + +struct sw_flow *dp_table_lookup(struct dp_table *table, + const struct odp_flow_key *key) +{ + struct sw_flow *flow; + flow = lookup_table(table, table->flows[0], flow_hash0(key), key); + if (!flow) + flow = lookup_table(table, table->flows[1], + flow_hash1(key), key); + return flow; +} + +int dp_table_foreach(struct dp_table *table, + int (*callback)(struct sw_flow *flow, void *aux), + void *aux) +{ + unsigned int i, j, k; + for (i = 0; i < 2; i++) { + for (j = 0; j < table->n_buckets >> DP_L1_BITS; j++) { + struct sw_flow **l2 = table->flows[i][j]; + for (k = 0; k < DP_L1_SIZE; k++) { + struct sw_flow *flow = rcu_dereference(l2[k]); + if (flow) { + int error = callback(flow, aux); + if (error) + return error; + } + } + } + } + return 0; +} + +static int insert_flow(struct sw_flow *flow, void *new_table_) +{ + struct dp_table *new_table = new_table_; + struct sw_flow **buckets[2]; + int i; + + find_buckets(new_table, &flow->key, buckets); + for (i = 0; i < 2; i++) { + if (!*buckets[i]) { + rcu_assign_pointer(*buckets[i], flow); + return 0; + } + } + WARN_ON_ONCE(1); + return 0; +} + +static void dp_free_table_rcu(struct rcu_head *rcu) +{ + struct dp_table *table = container_of(rcu, struct dp_table, rcu); + dp_table_destroy(table, 0); +} + +int dp_table_expand(struct datapath *dp) +{ + struct dp_table *old_table = rcu_dereference(dp->table); + struct dp_table *new_table = dp_table_create(old_table->n_buckets * 2); + if (!new_table) + return -ENOMEM; + dp_table_foreach(old_table, insert_flow, new_table); + rcu_assign_pointer(dp->table, new_table); + call_rcu(&old_table->rcu, dp_free_table_rcu); + return 0; +} + +static void dp_free_table_and_flows_rcu(struct rcu_head *rcu) +{ + struct dp_table *table = container_of(rcu, struct dp_table, rcu); + dp_table_destroy(table, 1); +} + +int dp_table_flush(struct datapath *dp) +{ + struct dp_table *old_table = rcu_dereference(dp->table); + struct dp_table *new_table = dp_table_create(DP_L1_SIZE); + if (!new_table) + return -ENOMEM; + rcu_assign_pointer(dp->table, new_table); + call_rcu(&old_table->rcu, dp_free_table_and_flows_rcu); + return 0; +} + +struct sw_flow ** +dp_table_lookup_for_insert(struct dp_table *table, + const struct odp_flow_key *target) +{ + struct sw_flow **buckets[2]; + struct sw_flow **empty_bucket = NULL; + int i; + + find_buckets(table, target, buckets); + for (i = 0; i < 2; i++) { + struct sw_flow *f = rcu_dereference(*buckets[i]); + if (f) { + if (!memcmp(&f->key, target, sizeof(struct odp_flow_key))) + return buckets[i]; + } else if (!empty_bucket) + empty_bucket = buckets[i]; + } + return empty_bucket; +} + +int dp_table_delete(struct dp_table *table, struct sw_flow *target) +{ + struct sw_flow **buckets[2]; + int i; + + find_buckets(table, &target->key, buckets); + for (i = 0; i < 2; i++) { + struct sw_flow *flow = rcu_dereference(*buckets[i]); + if (flow == target) { + rcu_assign_pointer(*buckets[i], NULL); + return 0; + } + } + return -ENOENT; +} diff --git a/debian/.gitignore b/debian/.gitignore new file mode 100644 index 00000000..2053c5c1 --- /dev/null +++ b/debian/.gitignore @@ -0,0 +1,19 @@ +*.debhelper +*.debhelper.log +*.substvars +/control +/corekeeper +/files +/nicira-switch +/openvswitch +/openvswitch-common +/openvswitch-common.copyright +/openvswitch-controller +/openvswitch-datapath-source +/openvswitch-dbg +/openvswitch-monitor +/openvswitch-pki +/openvswitch-pki-server +/openvswitch-switch +/openvswitch-switch-config +/openvswitch-switch.copyright diff --git a/debian/automake.mk b/debian/automake.mk new file mode 100644 index 00000000..813987e7 --- /dev/null +++ b/debian/automake.mk @@ -0,0 +1,50 @@ +EXTRA_DIST += \ + debian/changelog \ + debian/commands/reconfigure \ + debian/commands/update \ + debian/compat \ + debian/control \ + debian/control.modules.in \ + debian/copyright \ + debian/corekeeper.cron.daily \ + debian/corekeeper.init \ + debian/dirs \ + debian/ovs-switch-setup \ + debian/ovs-switch-setup.8 \ + debian/openvswitch-common.dirs \ + debian/openvswitch-common.install \ + debian/openvswitch-common.manpages \ + debian/openvswitch-controller.README.Debian \ + debian/openvswitch-controller.default \ + debian/openvswitch-controller.dirs \ + debian/openvswitch-controller.init \ + debian/openvswitch-controller.install \ + debian/openvswitch-controller.manpages \ + debian/openvswitch-controller.postinst \ + debian/openvswitch-datapath-module-_KVERS_.postinst.modules.in \ + debian/openvswitch-datapath-source.README.Debian \ + debian/openvswitch-datapath-source.copyright \ + debian/openvswitch-datapath-source.dirs \ + debian/openvswitch-datapath-source.install \ + debian/openvswitch-pki-server.apache2 \ + debian/openvswitch-pki-server.dirs \ + debian/openvswitch-pki-server.install \ + debian/openvswitch-pki-server.postinst \ + debian/openvswitch-pki.postinst \ + debian/openvswitch-switch-config.dirs \ + debian/openvswitch-switch-config.install \ + debian/openvswitch-switch-config.manpages \ + debian/openvswitch-switch-config.overrides \ + debian/openvswitch-switch-config.templates \ + debian/openvswitch-switch.README.Debian \ + debian/openvswitch-switch.dirs \ + debian/openvswitch-switch.init \ + debian/openvswitch-switch.install \ + debian/openvswitch-switch.logrotate \ + debian/openvswitch-switch.manpages \ + debian/openvswitch-switch.postinst \ + debian/openvswitch-switch.postrm \ + debian/openvswitch-switch.template \ + debian/po/POTFILES.in \ + debian/po/templates.pot \ + debian/rules diff --git a/debian/changelog b/debian/changelog new file mode 100644 index 00000000..4aa1f90b --- /dev/null +++ b/debian/changelog @@ -0,0 +1,5 @@ +openvswitch (0.90.0) unstable; urgency=low + + * Development version. + + -- Open vSwitch developers Mon, 19 Nov 2007 14:57:52 -0800 diff --git a/debian/commands/reconfigure b/debian/commands/reconfigure new file mode 100755 index 00000000..dc493a18 --- /dev/null +++ b/debian/commands/reconfigure @@ -0,0 +1,128 @@ +#! /usr/bin/perl + +use POSIX; +use strict; +use warnings; + +my $default = '/etc/default/openvswitch-switch'; + +my (%config) = load_config($default); +if (@ARGV) { + foreach my $arg (@ARGV) { + my ($key, $value) = $arg =~ /^([^=]+)=(.*)/ + or die "bad argument '$arg'\n"; + if ($value ne '') { + $config{$key} = $value; + } else { + delete $config{$key}; + } + } + save_config($default, %config); +} +print "$_=$config{$_}\n" foreach sort(keys(%config)); + +sub load_config { + my ($file) = @_; + + # Get the list of the variables that the shell sets automatically. + my (%auto_vars) = read_vars("set -a && env"); + + # Get the variables from $default. + my (%config) = read_vars("set -a && . '$default' && env"); + + # Subtract. + delete @config{keys %auto_vars}; + + return %config; +} + +sub read_vars { + my ($cmd) = @_; + local @ENV; + if (!open(VARS, '-|', $cmd)) { + print STDERR "$cmd: failed to execute: $!\n"; + return (); + } + my (%config); + while () { + my ($var, $value) = /^([^=]+)=(.*)$/ or next; + $config{$var} = $value; + } + close(VARS); + return %config; +} + +sub shell_escape { + local $_ = $_[0]; + if ($_ eq '') { + return '""'; + } elsif (m&^[-a-zA-Z0-9:./%^_+,]*$&) { + return $_; + } else { + s/'/'\\''/; + return "'$_'"; + } +} + +sub shell_assign { + my ($var, $value) = @_; + return $var . '=' . shell_escape($value); +} + +sub save_config { + my ($file, %config) = @_; + my (@lines); + if (open(FILE, '<', $file)) { + @lines = ; + chomp @lines; + close(FILE); + } + + # Replace all existing variable assignments. + for (my ($i) = 0; $i <= $#lines; $i++) { + local $_ = $lines[$i]; + my ($var, $value) = /^\s*([^=#]+)=(.*)$/ or next; + if (exists($config{$var})) { + $lines[$i] = shell_assign($var, $config{$var}); + delete $config{$var}; + } else { + $lines[$i] = "#$lines[$i]"; + } + } + + # Find a place to put any remaining variable assignments. + VAR: + for my $var (keys(%config)) { + my $assign = shell_assign($var, $config{$var}); + + # Replace the last commented-out variable assignment to $var, if any. + for (my ($i) = $#lines; $i >= 0; $i--) { + local $_ = $lines[$i]; + if (/^\s*#\s*$var=/) { + $lines[$i] = $assign; + next VAR; + } + } + + # Find a place to add the var: after the final commented line + # just after a line that contains "$var:". + for (my ($i) = 0; $i <= $#lines; $i++) { + if ($lines[$i] =~ /^\s*#\s*$var:/) { + for (my ($j) = $i + 1; $j <= $#lines; $j++) { + if ($lines[$j] !~ /^\s*#/) { + splice(@lines, $j, 0, $assign); + next VAR; + } + } + } + } + + # Just append it. + push(@lines, $assign); + } + + open(NEWFILE, '>', "$file.tmp") or die "$file.tmp: create: $!\n"; + print NEWFILE join('', map("$_\n", @lines)); + close(NEWFILE); + rename("$file.tmp", $file) or die "$file.tmp: rename to $file: $!\n"; +} diff --git a/debian/commands/update b/debian/commands/update new file mode 100755 index 00000000..545e3c23 --- /dev/null +++ b/debian/commands/update @@ -0,0 +1,4 @@ +#! /bin/sh +set -e +apt-get update -qy +apt-get upgrade -qy diff --git a/debian/compat b/debian/compat new file mode 100644 index 00000000..7ed6ff82 --- /dev/null +++ b/debian/compat @@ -0,0 +1 @@ +5 diff --git a/debian/control b/debian/control new file mode 100644 index 00000000..09eda114 --- /dev/null +++ b/debian/control @@ -0,0 +1,143 @@ +Source: openvswitch +Section: net +Priority: extra +Maintainer: Open vSwitch developers +Build-Depends: debhelper (>= 5), autoconf (>= 2.60), automake1.10, libssl-dev, pkg-config (>= 0.21), po-debconf, bzip2, openssl, libncurses5-dev, libpcre3-dev +Standards-Version: 3.7.3 + +Package: openvswitch-datapath-source +Architecture: all +Depends: module-assistant, bzip2, debhelper (>= 5.0.37) +Suggests: openvswitch-switch +Description: Source code for Open vSwitch datapath Linux module + This package provides the Open vSwitch datapath module source code + that is needed by openvswitch-switch. The kernel module can be built + from it using module-assistant or make-kpkg. README.Debian in this + package provides further instructions. + . + Open vSwitch is a software-based Ethernet switch targeted at virtual + servers. + +Package: openvswitch-common +Architecture: any +Depends: ${shlibs:Depends}, openssl +Description: Open vSwitch common components + openvswitch-common provides components required by both openvswitch-switch + and openvswitch-controller. + . + Open vSwitch is a software-based Ethernet switch targeted at virtual + servers. + +Package: openvswitch-switch +Architecture: any +Suggests: openvswitch-datapath-module +Depends: ${shlibs:Depends}, ${misc:Depends}, openvswitch-common, dhcp3-client, module-init-tools, dmidecode, procps, debianutils +Description: Open vSwitch switch implementations + openvswitch-switch provides the userspace components and utilities for + the Open vSwitch kernel-based switch. + . + Open vSwitch is a software-based Ethernet switch targeted at virtual + servers. + +Package: openvswitch-switch-config +Architecture: any +Depends: ${shlibs:Depends}, ${misc:Depends}, openvswitch-switch, libwww-perl, libdigest-sha1-perl +Description: Open vSwitch switch implementations + openvswitch-switch-config provides a utility for interactively configuring + the Open vSwitch switch provided in the openvswitch-switch package. + . + Open vSwitch is a software-based Ethernet switch targeted at virtual + servers. + +Package: openvswitch-switchui +Architecture: any +Recommends: openvswitch-switch +Depends: ${shlibs:Depends}, ${misc:Depends}, console-tools +Description: Monitoring utility for OpenFlow switches + The ovs-switchui utility included in this package provides a + "front-panel display" to allow administrators to view the status of + an OpenFlow switch at a glance. + . + The ezio-term utility, also included, provides a VT100-compatible + terminal interface for EZIO3 (aka MTB-134) 16x2 LCD displays found on + server appliances made by Portwell. It allows ovs-switchui to work + with such displays. + +Package: openvswitch-pki +Architecture: all +Depends: ${shlibs:Depends}, ${misc:Depends}, openvswitch-common +Description: Open vSwitch public key infrastructure + openvswitch-pki provides PKI (public key infrastructure) support for + Open vSwitch switches and controllers, reducing the risk of + man-in-the-middle attacks on the Open vSwitch network infrastructure. + . + Open vSwitch is a software-based Ethernet switch targeted at virtual + servers. + +Package: openvswitch-pki-server +Architecture: all +Depends: ${shlibs:Depends}, ${misc:Depends}, ${perl:Depends}, openvswitch-pki, apache2 +Description: Open vSwitch public key infrastructure (HTTP server support) + openvswitch-pki-server provides HTTP access to the Open vSwitch PKI (public + key infrastructure) maintained on the local machine by the + openvswitch-pki package. This HTTP access is needed for secure and + convenient OpenFlow switch setup using the ovs-switch-setup program + in the openvswitch-switch package. + . + Open vSwitch is a software-based Ethernet switch targeted at virtual + servers. + +Package: openvswitch-controller +Architecture: any +Depends: ${shlibs:Depends}, openvswitch-common, openvswitch-pki +Description: Open vSwitch controller implementation + The Open vSwitch controller enables OpenFlow switches that connect to it + to act as MAC-learning Ethernet switches. + . + Open vSwitch is a software-based Ethernet switch targeted at virtual + servers. + +Package: corekeeper +Architecture: all +Depends: tmpreaper +Description: Core file centralizer and reaper + The corekeeper package configures the system to dump all core files to + /var/log/core. It also deletes core files older than 7 days. + +Package: openvswitch-dbg +Architecture: any +Depends: ${shlibs:Depends} +Description: Debug symbols for Open vSwitch packages + This package contains the debug symbols for all the other openvswitch-* + packages. Install it to debug one of them or to examine a core dump + produced by one of them. + +Package: openvswitch-monitor +Architecture: any +Recommends: openvswitch-switch +Depends: ${shlibs:Depends}, ${misc:Depends} +Description: Monitor utility for Open vSwitch switches + The ovs-monitor utility included in this package monitors the secure + channel and datapath. If either become unresponsive, the switch is + rebooted. + +Package: openvswitch-wdt +Architecture: any +Recommends: openvswitch-switch +Depends: ${shlibs:Depends}, ${misc:Depends} +Description: Watchdog utility for Open vSwitch switches + The ovs-wdt program included in this package manages the hardware + watchdog timer in switches based on the Portwell NAR-5520 hardware. + +Package: nicira-switch +Architecture: all +Depends: + openvswitch-common (= ${source:Version}), + openvswitch-switch (= ${source:Version}), + openvswitch-switchui (= ${source:Version}), + openvswitch-datapath-module (= ${source:Version}), + corekeeper, openvswitch-monitor, openvswitch-wdt +Description: Metapackage for installing a Nicira Open vSwitch switch + Installing this package will install everything needed for a Nicira + Portwell-based Open vSwitch switch, including monitoring and the switch UI. + diff --git a/debian/control.modules.in b/debian/control.modules.in new file mode 100644 index 00000000..4da85b40 --- /dev/null +++ b/debian/control.modules.in @@ -0,0 +1,20 @@ +Source: openvswitch +Section: net +Priority: extra +Maintainer: Open vSwitch developers +Build-Depends: debhelper (>= 5.0.37) +Standards-Version: 3.7.3 + +Package: openvswitch-datapath-module-_KVERS_ +Architecture: any +Recommends: kernel-image-_KVERS_, openvswitch-switch +Provides: openvswitch-datapath-module +Description: Open vSwitch Linux datapath kernel module + This package contains the Open vSwitch loadable datapath kernel modules for + the kernel-image-_KVERS_ package. + . + If you compiled a custom kernel, you will most likely need to compile + a custom version of this module as well. The + openvswitch-datapath-source package has been provided for this + purpose. Refer to README.Debian provided in that package for further + instructions. diff --git a/debian/copyright b/debian/copyright new file mode 100644 index 00000000..0f89e828 --- /dev/null +++ b/debian/copyright @@ -0,0 +1,21 @@ +Upstream Authors: + + Nicira Networks + +Copyright: + + Copyright (C) 2008 Nicira Networks. + +License: + + Permission to use, copy, modify, and/or distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. diff --git a/debian/corekeeper.cron.daily b/debian/corekeeper.cron.daily new file mode 100755 index 00000000..badc192d --- /dev/null +++ b/debian/corekeeper.cron.daily @@ -0,0 +1,5 @@ +#! /bin/sh + +PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin + +tmpreaper 7d --mtime --all /var/log/core diff --git a/debian/corekeeper.init b/debian/corekeeper.init new file mode 100755 index 00000000..27d62a12 --- /dev/null +++ b/debian/corekeeper.init @@ -0,0 +1,63 @@ +#!/bin/sh +# +# Example init.d script with LSB support. +# +# Please read this init.d carefully and modify the sections to +# adjust it to the program you want to run. +# +# Copyright (c) 2007 Javier Fernandez-Sanguino +# +# This is free software; you may redistribute it and/or modify +# it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2, +# or (at your option) any later version. +# +# This is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License with +# the Debian operating system, in /usr/share/common-licenses/GPL; if +# not, write to the Free Software Foundation, Inc., 59 Temple Place, +# Suite 330, Boston, MA 02111-1307 USA +# +### BEGIN INIT INFO +# Provides: corekeeper +# Required-Start: +# Required-Stop: +# Should-Start: $syslog +# Should-Stop: +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Configure core file dump location +### END INIT INFO + +PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin + +. /lib/lsb/init-functions + +set -e + +case "$1" in + start) + log_daemon_msg "Initializing core dump location..." + if echo "/var/log/core/core.%e.%t" > /proc/sys/kernel/core_pattern + then + log_progress_msg "success" + log_end_msg 0 + exit 0 + else + log_end_msg 1 + exit 1 + fi + ;; + stop|restart|force-reload|status|reload) + exit 0 + ;; + *) + N=/etc/init.d/$NAME + echo "Usage: $N {start|stop|restart|force-reload|status}" >&2 + exit 1 + ;; +esac diff --git a/debian/dirs b/debian/dirs new file mode 100644 index 00000000..ca882bbb --- /dev/null +++ b/debian/dirs @@ -0,0 +1,2 @@ +usr/bin +usr/sbin diff --git a/debian/openvswitch-common.dirs b/debian/openvswitch-common.dirs new file mode 100644 index 00000000..be9ed2f0 --- /dev/null +++ b/debian/openvswitch-common.dirs @@ -0,0 +1 @@ +var/log/openvswitch diff --git a/debian/openvswitch-common.install b/debian/openvswitch-common.install new file mode 100644 index 00000000..1967ccc1 --- /dev/null +++ b/debian/openvswitch-common.install @@ -0,0 +1,3 @@ +_debian/utilities/ovs-appctl usr/sbin +_debian/utilities/ovs-parse-leaks usr/bin +_debian/utilities/ovs-pki usr/sbin diff --git a/debian/openvswitch-common.manpages b/debian/openvswitch-common.manpages new file mode 100644 index 00000000..99c48bd0 --- /dev/null +++ b/debian/openvswitch-common.manpages @@ -0,0 +1,2 @@ +_debian/utilities/ovs-appctl.8 +_debian/utilities/ovs-pki.8 diff --git a/debian/openvswitch-controller.README.Debian b/debian/openvswitch-controller.README.Debian new file mode 100644 index 00000000..18819a79 --- /dev/null +++ b/debian/openvswitch-controller.README.Debian @@ -0,0 +1,12 @@ +README.Debian for openvswitch-controller +------------------------------------- + +* To (re)configure the controller, edit /etc/default/openvswitch-controller + and run "/etc/init.d/openvswitch-controller restart". + +* To enable OpenFlow switches to automatically discover the location + of the controller, you must install and configure a DHCP server. + The secchan(8) manpage (found in the openvswitch-switch package) gives + a working example configuration file for the ISC DHCP server. + + -- Ben Pfaff , Mon, 11 May 2009 13:26:38 -0700 diff --git a/debian/openvswitch-controller.default b/debian/openvswitch-controller.default new file mode 100644 index 00000000..1d9f9261 --- /dev/null +++ b/debian/openvswitch-controller.default @@ -0,0 +1,29 @@ +# This is a POSIX shell fragment -*- sh -*- + +# LISTEN: What OpenFlow connection methods should the controller listen on? +# +# This is a space-delimited list of connection methods: +# +# * "pssl:[PORT]": Listen for SSL connections on the specified PORT +# (default: 6633). The private key, certificate, and CA certificate +# must be specified below. +# +# * "pctp:[PORT]": Listen for TCP connections on the specified PORT +# (default: 6633). Not recommended for security reasons. +# +LISTEN="pssl:" + +# PRIVKEY: Name of file containing controller's private key. +# Required if SSL enabled. +PRIVKEY=/etc/openvswitch-controller/privkey.pem + +# CERT: Name of file containing certificate for private key. +# Required if SSL enabled. +CERT=/etc/openvswitch-controller/cert.pem + +# CACERT: Name of file containing switch CA certificate. +# Required if SSL enabled. +CACERT=/etc/openvswitch-controller/cacert.pem + +# Additional options to pass to controller, e.g. "--hub" +DAEMON_OPTS="" diff --git a/debian/openvswitch-controller.dirs b/debian/openvswitch-controller.dirs new file mode 100644 index 00000000..4ada77c6 --- /dev/null +++ b/debian/openvswitch-controller.dirs @@ -0,0 +1 @@ +etc/openvswitch-controller diff --git a/debian/openvswitch-controller.init b/debian/openvswitch-controller.init new file mode 100755 index 00000000..ee9c44d5 --- /dev/null +++ b/debian/openvswitch-controller.init @@ -0,0 +1,269 @@ +#!/bin/sh +# +# Copyright (c) 2007, 2009 Javier Fernandez-Sanguino +# +# This is free software; you may redistribute it and/or modify +# it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2, +# or (at your option) any later version. +# +# This is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License with +# the Debian operating system, in /usr/share/common-licenses/GPL; if +# not, write to the Free Software Foundation, Inc., 59 Temple Place, +# Suite 330, Boston, MA 02111-1307 USA +# +### BEGIN INIT INFO +# Provides: openvswitch-controller +# Required-Start: $network $local_fs +# Required-Stop: +# Should-Start: $named +# Should-Stop: +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Open vSwitch controller +### END INIT INFO + +PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin + +DAEMON=/usr/sbin/controller # Introduce the server's location here +NAME=ovs-controller # Introduce the short server's name here +DESC=ovs-controller # Introduce a short description here +LOGDIR=/var/log/openvswitch # Log directory to use + +PIDFILE=/var/run/$NAME.pid + +test -x $DAEMON || exit 0 + +. /lib/lsb/init-functions + +# Default options, these can be overriden by the information +# at /etc/default/$NAME +DAEMON_OPTS="" # Additional options given to the server + +DODTIME=10 # Time to wait for the server to die, in seconds + # If this value is set too low you might not + # let some servers to die gracefully and + # 'restart' will not work + +LOGFILE=$LOGDIR/$NAME.log # Server logfile +#DAEMONUSER= # User to run the daemons as. If this value + # is set start-stop-daemon will chuid the server + +# Include defaults if available +default=/etc/default/openvswitch-controller +if [ -f $default ] ; then + . $default +fi + +# Check that the user exists (if we set a user) +# Does the user exist? +if [ -n "$DAEMONUSER" ] ; then + if getent passwd | grep -q "^$DAEMONUSER:"; then + # Obtain the uid and gid + DAEMONUID=`getent passwd |grep "^$DAEMONUSER:" | awk -F : '{print $3}'` + DAEMONGID=`getent passwd |grep "^$DAEMONUSER:" | awk -F : '{print $4}'` + else + log_failure_msg "The user $DAEMONUSER, required to run $NAME does not exist." + exit 1 + fi +fi + + +set -e + +running_pid() { +# Check if a given process pid's cmdline matches a given name + pid=$1 + name=$2 + [ -z "$pid" ] && return 1 + [ ! -d /proc/$pid ] && return 1 + cmd=`cat /proc/$pid/cmdline | tr "\000" "\n"|head -n 1 |cut -d : -f 1` + # Is this the expected server + [ "$cmd" != "$name" ] && return 1 + return 0 +} + +running() { +# Check if the process is running looking at /proc +# (works for all users) + + # No pidfile, probably no daemon present + [ ! -f "$PIDFILE" ] && return 1 + pid=`cat $PIDFILE` + running_pid $pid $DAEMON || return 1 + return 0 +} + +start_server() { + if [ -z "$LISTEN" ]; then + echo "$default: No connection methods configured, controller disabled" >&2 + exit 0 + fi + + SSL_OPTS= + case $LISTEN in + *ssl*) + : ${PRIVKEY:=/etc/openvswitch-controller/privkey.pem} + : ${CERT:=/etc/openvswitch-controller/cert.pem} + : ${CACERT:=/etc/openvswitch-controller/cacert.pem} + if test ! -e "$PRIVKEY" || test ! -e "$CERT" || + test ! -e "$CACERT"; then + if test ! -e "$PRIVKEY"; then + echo "$PRIVKEY: private key missing" >&2 + fi + if test ! -e "$CERT"; then + echo "$CERT: certificate for private key missing" >&2 + fi + if test ! -e "$CACERT"; then + echo "$CACERT: CA certificate missing" >&2 + fi + exit 1 + fi + SSL_OPTS="--private-key=$PRIVKEY --certificate=$CERT --ca-cert=$CACERT" + ;; + esac + +# Start the process using the wrapper + if [ -z "$DAEMONUSER" ] ; then + start-stop-daemon --start --pidfile $PIDFILE \ + --exec $DAEMON -- --detach --pidfile=$PIDFILE \ + $LISTEN $DAEMON_OPTS $SSL_OPTS + errcode=$? + else +# if we are using a daemonuser then change the user id + start-stop-daemon --start --quiet --pidfile $PIDFILE \ + --chuid $DAEMONUSER --exec $DAEMON -- \ + --detach --pidfile=$PIDFILE $LISTEN $DAEMON_OPTS \ + $SSL_OPTS + errcode=$? + fi + return $errcode +} + +stop_server() { +# Stop the process using the wrapper + if [ -z "$DAEMONUSER" ] ; then + start-stop-daemon --stop --quiet --pidfile $PIDFILE \ + --exec $DAEMON + errcode=$? + else +# if we are using a daemonuser then look for process that match + start-stop-daemon --stop --quiet --pidfile $PIDFILE \ + --user $DAEMONUSER --exec $DAEMON + errcode=$? + fi + + return $errcode +} + +reload_server() { + [ ! -f "$PIDFILE" ] && return 1 + pid=`cat $PIDFILE` # This is the daemon's pid + # Send a SIGHUP + kill -1 $pid + return $? +} + +force_stop() { +# Force the process to die killing it manually + [ ! -e "$PIDFILE" ] && return + if running ; then + kill -15 $pid + # Is it really dead? + sleep "$DIETIME"s + if running ; then + kill -9 $pid + sleep "$DIETIME"s + if running ; then + echo "Cannot kill $NAME (pid=$pid)!" + exit 1 + fi + fi + fi + rm -f $PIDFILE +} + + +case "$1" in + start) + log_daemon_msg "Starting $DESC " "$NAME" + # Check if it's running first + if running ; then + log_progress_msg "apparently already running" + log_end_msg 0 + exit 0 + fi + if start_server && running ; then + # It's ok, the server started and is running + log_end_msg 0 + else + # Either we could not start it or it is not running + # after we did + # NOTE: Some servers might die some time after they start, + # this code does not try to detect this and might give + # a false positive (use 'status' for that) + log_end_msg 1 + fi + ;; + stop) + log_daemon_msg "Stopping $DESC" "$NAME" + if running ; then + # Only stop the server if we see it running + stop_server + log_end_msg $? + else + # If it's not running don't do anything + log_progress_msg "apparently not running" + log_end_msg 0 + exit 0 + fi + ;; + force-stop) + # First try to stop gracefully the program + $0 stop + if running; then + # If it's still running try to kill it more forcefully + log_daemon_msg "Stopping (force) $DESC" "$NAME" + force_stop + log_end_msg $? + fi + ;; + restart|force-reload) + log_daemon_msg "Restarting $DESC" "$NAME" + stop_server + # Wait some sensible amount, some server need this + [ -n "$DIETIME" ] && sleep $DIETIME + start_server + running + log_end_msg $? + ;; + status) + + log_daemon_msg "Checking status of $DESC" "$NAME" + if running ; then + log_progress_msg "running" + log_end_msg 0 + else + log_progress_msg "apparently not running" + log_end_msg 1 + exit 1 + fi + ;; + # Use this if the daemon cannot reload + reload) + log_warning_msg "Reloading $NAME daemon: not implemented, as the daemon" + log_warning_msg "cannot re-read the config file (use restart)." + ;; + *) + N=/etc/init.d/$NAME + echo "Usage: $N {start|stop|force-stop|restart|force-reload|status}" >&2 + exit 1 + ;; +esac + +exit 0 diff --git a/debian/openvswitch-controller.install b/debian/openvswitch-controller.install new file mode 100644 index 00000000..7d0edbbe --- /dev/null +++ b/debian/openvswitch-controller.install @@ -0,0 +1 @@ +_debian/utilities/ovs-controller usr/sbin diff --git a/debian/openvswitch-controller.manpages b/debian/openvswitch-controller.manpages new file mode 100644 index 00000000..6a9911e1 --- /dev/null +++ b/debian/openvswitch-controller.manpages @@ -0,0 +1 @@ +_debian/utilities/ovs-controller.8 diff --git a/debian/openvswitch-controller.postinst b/debian/openvswitch-controller.postinst new file mode 100755 index 00000000..51acfb1a --- /dev/null +++ b/debian/openvswitch-controller.postinst @@ -0,0 +1,52 @@ +#!/bin/sh +# postinst script for openvswitch-controller +# +# see: dh_installdeb(1) + +set -e + +# summary of how this script can be called: +# * `configure' +# * `abort-upgrade' +# * `abort-remove' `in-favour' +# +# * `abort-remove' +# * `abort-deconfigure' `in-favour' +# `removing' +# +# for details, see http://www.debian.org/doc/debian-policy/ or +# the debian-policy package + + +case "$1" in + configure) + cd /etc/openvswitch-controller + if ! test -e cacert.pem; then + ln -s /usr/share/openvswitch/pki/switchca/cacert.pem cacert.pem + fi + if ! test -e privkey.pem || ! test -e cert.pem; then + oldumask=$(umask) + umask 077 + ovs-pki req+sign tmp controller >/dev/null + mv tmp-privkey.pem privkey.pem + mv tmp-cert.pem cert.pem + mv tmp-req.pem req.pem + chmod go+r cert.pem req.pem + umask $oldumask + fi + ;; + + abort-upgrade|abort-remove|abort-deconfigure) + ;; + + *) + echo "postinst called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +#DEBHELPER# + +exit 0 + + diff --git a/debian/openvswitch-datapath-module-_KVERS_.postinst.modules.in b/debian/openvswitch-datapath-module-_KVERS_.postinst.modules.in new file mode 100755 index 00000000..02683008 --- /dev/null +++ b/debian/openvswitch-datapath-module-_KVERS_.postinst.modules.in @@ -0,0 +1,25 @@ +#!/bin/sh +# postinst script for #PACKAGE# +# +# see: dh_installdeb(1) + +set -e + +depmod -a + +#DEBHELPER# + +# If the switch is running, restart it. This ensures that we are using the +# latest kernel module, because the init script will unload and reload the +# module. +# +# (Ideally we'd only want to do this if this package corresponds to the +# running kernel, but I don't know a reliable way to check.) +INIT=/etc/init.d/openvswitch-switch +if test -x $INIT && $INIT status; then + $INIT restart || true +fi + +exit 0 + + diff --git a/debian/openvswitch-datapath-source.README.Debian b/debian/openvswitch-datapath-source.README.Debian new file mode 100644 index 00000000..73bba7a1 --- /dev/null +++ b/debian/openvswitch-datapath-source.README.Debian @@ -0,0 +1,31 @@ +Open vSwitch for Debian +---------------------- + +* How do I build this module the Debian way? + + - Building with module-assistant: + + $ module-assistant auto-install openvswitch + or + $ m-a a-i openvswitch + + If kernel source or headers are in a non-standard directory, add + the option -k /path/to/kernel/source with the correct path. + + - Building with make-kpkg + + $ cd /usr/src/ + $ tar jxvf openvswitch.tar.bz2 + $ cd /usr/src/kernel-source-2.6.9 + $ make-kpkg --added-modules=openvswitch modules + + - Building without make-kpkg + + $ cd /usr/src/ + $ tar jxvf openvswitch.tar.bz2 + $ cd modules/openvswitch + $ fakeroot debian/rules kdist_image + + If you run this as root, fakeroot is not needed. + + -- Ben Pfaff , Mon, 11 May 2009 13:27:50 -0700 diff --git a/debian/openvswitch-datapath-source.copyright b/debian/openvswitch-datapath-source.copyright new file mode 100644 index 00000000..32cba237 --- /dev/null +++ b/debian/openvswitch-datapath-source.copyright @@ -0,0 +1,15 @@ +Upstream Authors: + + Nicira Networks + +Copyright: + + Copyright (C) 2008 Nicira Networks + +License: + + Files in the datapath/ and its sub-directories are covered under the GNU + General Public License Version 2. + + On Debian systems, the complete text of the GNU General + Public License can be found in `/usr/share/common-licenses/GPL'. diff --git a/debian/openvswitch-datapath-source.dirs b/debian/openvswitch-datapath-source.dirs new file mode 100644 index 00000000..e5a7d6b0 --- /dev/null +++ b/debian/openvswitch-datapath-source.dirs @@ -0,0 +1 @@ +usr/src/modules/openvswitch-datapath/debian diff --git a/debian/openvswitch-datapath-source.install b/debian/openvswitch-datapath-source.install new file mode 100644 index 00000000..d1acc894 --- /dev/null +++ b/debian/openvswitch-datapath-source.install @@ -0,0 +1,6 @@ +debian/changelog usr/src/modules/openvswitch-datapath/debian +debian/control usr/src/modules/openvswitch-datapath/debian +debian/compat usr/src/modules/openvswitch-datapath/debian +debian/*.modules.in usr/src/modules/openvswitch-datapath/debian +debian/rules usr/src/modules/openvswitch-datapath/debian +_debian/openvswitch.tar.gz usr/src/modules/openvswitch-datapath diff --git a/debian/openvswitch-monitor.default b/debian/openvswitch-monitor.default new file mode 100644 index 00000000..f0c356e8 --- /dev/null +++ b/debian/openvswitch-monitor.default @@ -0,0 +1,27 @@ +# This is a POSIX shell fragment -*- sh -*- + +# To configure the Open vSwitch monitor package, modify the following. +# Afterward, the monitor will be configured automatically at boot time. +# It can be started immediately with +# /etc/init.d/openvswitch-monitor start + +# Defaults for initscript +# sourced by /etc/init.d/openvswitch-monitor +# installed at /etc/default/openvswitch-monitor by the maintainer scripts + +# THRESHOLD: The number of failed attempts the monitor should make until +# it reboots the system. A value of zero disables the monitor. +THRESHOLD=3 + +# INTERVAL: The number of seconds to wait between probing secchan and +# the datapath. +INTERVAL=1 + +# LOG_FILE: File to log messages related to monitoring. +LOG_FILE="/var/log/openvswitch/monitor" + +# SWITCH_VCONN: The vconn used to connect to the switch (secchan). +# The secchan must be configured to listen to this vconn. The default +# here set is also listened to by default by the openvswitch-switch +# package, so ordinarily there is no need to modify this. +SWITCH_VCONN="/var/run/secchan.mgmt" diff --git a/debian/openvswitch-monitor.dirs b/debian/openvswitch-monitor.dirs new file mode 100644 index 00000000..236670a2 --- /dev/null +++ b/debian/openvswitch-monitor.dirs @@ -0,0 +1 @@ +usr/sbin diff --git a/debian/openvswitch-monitor.init b/debian/openvswitch-monitor.init new file mode 100755 index 00000000..8c7e1ad0 --- /dev/null +++ b/debian/openvswitch-monitor.init @@ -0,0 +1,174 @@ +#!/bin/sh +# +# Example init.d script with LSB support. +# +# Please read this init.d carefully and modify the sections to +# adjust it to the program you want to run. +# +# Copyright (c) 2007, 2009 Javier Fernandez-Sanguino +# +# This is free software; you may redistribute it and/or modify +# it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2, +# or (at your option) any later version. +# +# This is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License with +# the Debian operating system, in /usr/share/common-licenses/GPL; if +# not, write to the Free Software Foundation, Inc., 59 Temple Place, +# Suite 330, Boston, MA 02111-1307 USA +# +### BEGIN INIT INFO +# Provides: openvswitch-monitor +# Required-Start: $network $local_fs +# Required-Stop: +# Should-Start: $named $syslog openvswitch-switch +# Should-Stop: +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Open vSwitch switch monitor +### END INIT INFO + +PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin + +DAEMON=/usr/sbin/ovs-monitor +NAME=openvswitch-monitor +DESC="Open vSwitch switch monitor" + +PIDFILE=/var/run/$NAME.pid + +test -x $DAEMON || exit 0 + +. /lib/lsb/init-functions + +# Default options, these can be overriden by the information +# at /etc/default/$NAME +DAEMON_OPTS="" # Additional options given to the daemon + +DODTIME=10 # Time to wait for the daemon to die, in seconds + # If this value is set too low you might not + # let some daemons to die gracefully and + # 'restart' will not work + +# Include defaults if available +if [ -f /etc/default/$NAME ] ; then + . /etc/default/$NAME +fi + +set -e + +running_pid() { +# Check if a given process pid's cmdline matches a given name + pid=$1 + name=$2 + [ -z "$pid" ] && return 1 + [ ! -d /proc/$pid ] && return 1 + return 0 +} + +running() { +# Check if the process is running looking at /proc +# (works for all users) + + # No pidfile, probably no daemon present + [ ! -f "$PIDFILE" ] && return 1 + pid=`cat $PIDFILE` + running_pid $pid $DAEMON || return 1 + return 0 +} + +start_daemon() { +# Start the process using the wrapper + if test $THRESHOLD != 0; then + start-stop-daemon --start --quiet -m --background --pidfile $PIDFILE \ + --exec $DAEMON -- -c $THRESHOLD -i $INTERVAL -l $LOG_FILE \ + -s $SWITCH_VCONN $DAEMON_OPTS + fi + + # Wait up to 3 seconds for the daemon to start. + for i in 1 2 3; do + if running; then + break + fi + sleep 1 + done +} + +stop_daemon() { + start-stop-daemon -o --stop --pidfile $PIDFILE + rm $PIDFILE +} + +case "$1" in + start) + log_daemon_msg "Starting $DESC " "$NAME" + # Check if it's running first + if running ; then + log_progress_msg "apparently already running" + log_end_msg 0 + exit 0 + fi + if start_daemon && running ; then + # It's ok, the daemon started and is running + log_end_msg 0 + else + # Either we could not start it or it is not running + # after we did + # NOTE: Some daemons might die some time after they start, + # this code does not try to detect this and might give + # a false positive (use 'status' for that) + log_end_msg 1 + fi + ;; + stop) + log_daemon_msg "Stopping $DESC" "$NAME" + if running ; then + # Only stop the daemon if we see it running + stop_daemon + log_end_msg $? + else + # If it's not running don't do anything + log_progress_msg "apparently not running" + log_end_msg 0 + exit 0 + fi + ;; + restart|force-reload) + log_daemon_msg "Restarting $DESC" "$NAME" + if running ; then + stop_daemon + # Wait some sensible amount, some daemons need this + [ -n "$DIETIME" ] && sleep $DIETIME + fi + start_daemon + running + log_end_msg $? + ;; + status) + log_daemon_msg "Checking status of $DESC" "$NAME" + if running ; then + log_progress_msg "running" + log_end_msg 0 + else + log_progress_msg "apparently not running" + log_end_msg 1 + exit 1 + fi + ;; + # Use this if the daemon cannot reload + reload) + log_warning_msg "Reloading $NAME daemon: not implemented, as the daemon" + log_warning_msg "cannot re-read the config file (use restart)." + ;; + *) + N=/etc/init.d/$NAME + echo "Usage: $N {start|stop|restart|force-reload|status}" >&2 + exit 1 + ;; +esac + +exit 0 diff --git a/debian/openvswitch-monitor.install b/debian/openvswitch-monitor.install new file mode 100644 index 00000000..9fc601a8 --- /dev/null +++ b/debian/openvswitch-monitor.install @@ -0,0 +1 @@ +utilities/ovs-monitor usr/sbin diff --git a/debian/openvswitch-pki-server.apache2 b/debian/openvswitch-pki-server.apache2 new file mode 100644 index 00000000..d0bc8ba9 --- /dev/null +++ b/debian/openvswitch-pki-server.apache2 @@ -0,0 +1 @@ +Alias /openvswitch/pki/ /usr/share/openvswitch/pki/ diff --git a/debian/openvswitch-pki-server.dirs b/debian/openvswitch-pki-server.dirs new file mode 100644 index 00000000..7307777b --- /dev/null +++ b/debian/openvswitch-pki-server.dirs @@ -0,0 +1 @@ +etc/apache2/sites-available diff --git a/debian/openvswitch-pki-server.install b/debian/openvswitch-pki-server.install new file mode 100644 index 00000000..5af75da0 --- /dev/null +++ b/debian/openvswitch-pki-server.install @@ -0,0 +1 @@ +_debian/utilities/ovs-pki-cgi usr/lib/cgi-bin diff --git a/debian/openvswitch-pki-server.postinst b/debian/openvswitch-pki-server.postinst new file mode 100755 index 00000000..d161a98a --- /dev/null +++ b/debian/openvswitch-pki-server.postinst @@ -0,0 +1,44 @@ +#!/bin/sh +# postinst script for openflow +# +# see: dh_installdeb(1) + +set -e + +# summary of how this script can be called: +# * `configure' +# * `abort-upgrade' +# * `abort-remove' `in-favour' +# +# * `abort-remove' +# * `abort-deconfigure' `in-favour' +# `removing' +# +# for details, see http://www.debian.org/doc/debian-policy/ or +# the debian-policy package + +case "$1" in + configure) + # Enable site under Apache. + a2ensite openflow-pki >/dev/null + if command -v invoke-rc.d >/dev/null 2>&1; then + invoke-rc.d apache2 force-reload || : + else + [ -x /etc/init.d/apache2 ] && /etc/init.d/apache2 force-reload || : + fi + ;; + + abort-upgrade|abort-remove|abort-deconfigure) + ;; + + *) + echo "postinst called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +#DEBHELPER# + +exit 0 + + diff --git a/debian/openvswitch-pki.postinst b/debian/openvswitch-pki.postinst new file mode 100755 index 00000000..a75a314f --- /dev/null +++ b/debian/openvswitch-pki.postinst @@ -0,0 +1,41 @@ +#!/bin/sh +# postinst script for openvswitch +# +# see: dh_installdeb(1) + +set -e + +# summary of how this script can be called: +# * `configure' +# * `abort-upgrade' +# * `abort-remove' `in-favour' +# +# * `abort-remove' +# * `abort-deconfigure' `in-favour' +# `removing' +# +# for details, see http://www.debian.org/doc/debian-policy/ or +# the debian-policy package + +case "$1" in + configure) + # Create certificate authorities. + if test ! -d /usr/share/openvswitch/pki; then + ovs-pki init + fi + ;; + + abort-upgrade|abort-remove|abort-deconfigure) + ;; + + *) + echo "postinst called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +#DEBHELPER# + +exit 0 + + diff --git a/debian/openvswitch-switch-config.dirs b/debian/openvswitch-switch-config.dirs new file mode 100644 index 00000000..881ded8a --- /dev/null +++ b/debian/openvswitch-switch-config.dirs @@ -0,0 +1 @@ +/usr/share/lintian/overrides diff --git a/debian/openvswitch-switch-config.install b/debian/openvswitch-switch-config.install new file mode 100644 index 00000000..c8cbf17f --- /dev/null +++ b/debian/openvswitch-switch-config.install @@ -0,0 +1 @@ +debian/ovs-switch-setup usr/sbin diff --git a/debian/openvswitch-switch-config.manpages b/debian/openvswitch-switch-config.manpages new file mode 100644 index 00000000..0e122793 --- /dev/null +++ b/debian/openvswitch-switch-config.manpages @@ -0,0 +1 @@ +debian/ovs-switch-setup.8 diff --git a/debian/openvswitch-switch-config.overrides b/debian/openvswitch-switch-config.overrides new file mode 100644 index 00000000..4ac77aba --- /dev/null +++ b/debian/openvswitch-switch-config.overrides @@ -0,0 +1 @@ +debconf-is-not-a-registry diff --git a/debian/openvswitch-switch-config.templates b/debian/openvswitch-switch-config.templates new file mode 100644 index 00000000..24bf0352 --- /dev/null +++ b/debian/openvswitch-switch-config.templates @@ -0,0 +1,228 @@ +Template: openvswitch-switch/netdevs +Type: multiselect +_Choices: ${choices} +_Description: OpenFlow switch network devices: + Choose the network devices that should become part of the OpenFlow + switch. At least two devices must be selected for this machine to be + a useful switch. Unselecting all network devices will disable the + OpenFlow switch entirely. + . + The network devices that you select should not be configured with IP + or IPv6 addresses, even if the switch contacts the controller over + one of the selected network devices. This is because a running + OpenFlow switch takes over network devices at a low level: they + become part of the switch and cannot be used for other purposes. + +Template: openvswitch-switch/no-netdevs +Type: error +_Description: No network devices were selected. + No network devices were selected for inclusion in the OpenFlow switch. + The switch will be disabled. + +Template: openvswitch-switch/configured-netdevs +Type: note +_Description: Some Network Devices Have IP or IPv6 Addresses + The following network devices selected to be part of the OpenFlow switch + have IP or IPv6 addresses configured: + . + ${configured-netdevs} + . + This is usually a mistake, even if the switch contacts the controller over + one of the selected network devices. This is because a running + OpenFlow switch takes over network devices at a low level: they + become part of the switch and cannot be used for other purposes. + . + If this is an unintentional mistake, move back and fix the selection, + or de-configure the IP or IPv6 from these network devices. + +Template: openvswitch-switch/mode +Type: select +_Choices: discovery, in-band, out-of-band +Default: discovery +_Description: Switch-to-controller access method: + The OpenFlow switch must be able to contact the OpenFlow controller over + the network. It can do so in one of three ways: + . + discovery: A single network is used for OpenFlow traffic and other + data traffic; that is, the switch contacts the controller over one of + the network devices selected as OpenFlow switch network devices in + the previous question. The switch automatically determines the + location of the controller using a DHCP request with an + OpenFlow-specific vendor option. This is the most common case. + . + in-band: As above, but the location of the controller is manually + configured. + . + out-of-band: OpenFlow traffic uses a network separate from the data traffic + that it controls. If this is the case, the control network must already + be configured on a network device other than one of those selected as + an OpenFlow switch netdev in the previous question. + +Template: openvswitch-switch/discover +Type: note +_Description: Preparing to discover controller. + The setup program will now attempt to discover the OpenFlow controller. + Controller discovery may take up to 30 seconds. Please be patient. + . + See secchan(8) for instructions on how to configure a DHCP server for + controller discovery. + +Template: openvswitch-switch/discovery-failure +Type: error +_Description: Controller discovery failed. + The controller's location could not be determined automatically. + . + Ensure that the OpenFlow DHCP server is properly configured. See + secchan(8) for instructions on how to configure a DHCP server for + controller discovery. + +Template: openvswitch-switch/discovery-success +Type: boolean +Default: true +_Description: Use discovered settings? + Controller discovery obtained the following settings: + . + Controller location: ${controller-vconn} + . + PKI URL: ${pki-uri} + . + Please verify that these settings are correct. + +Template: openvswitch-switch/switch-ip +Type: string +Default: dhcp +_Description: Switch IP address: + For in-band communication with the controller, the OpenFlow switch must + be able to determine its own IP address. Its IP address may be configured + statically or dynamically. + . + For static configuration, specify the switch's IP address as a string. + . + For dynamic configuration with DHCP (the most common case), specify "dhcp". + Configuration with DHCP will only work reliably if the network topology + allows the switch to contact the DHCP server before it connects to the + OpenFlow controller. + +Template: openvswitch-switch/switch-ip-error +Type: error +_Description: The switch IP address is invalid. + The switch IP address must specified as "dhcp" or a valid IP address in + dotted-octet form (e.g. "1.2.3.4"). + +Template: openvswitch-switch/controller-vconn +Type: string +_Description: Controller location: + Specify how the OpenFlow switch should connect to the OpenFlow controller. + The value should be in form "ssl:HOST[:PORT]" to connect to the controller + over SSL (recommended for security) or "tcp:HOST[:PORT]" to connect over + cleartext TCP. + +Template: openvswitch-switch/controller-vconn-error +Type: error +_Description: The controller location is invalid. + The controller location must be specifed as "ssl:HOST[:PORT]" to + connect to the controller over SSL (recommended for security) or + "tcp:HOST[:PORT]" to connect over cleartext TCP. + +Template: openvswitch-switch/pki-uri +Type: string +_Description: OpenFlow PKI server host name or URL: + Specify a URL to the OpenFlow public key infrastructure (PKI). If a + host name or IP address is specified in place of a URL, then + http:///openvswitch/pki/ will be used, + where is the specified host name or IP address. + . + The OpenFlow PKI is usually on the same machine as the OpenFlow + controller. + . + The setup process will connect to the OpenFlow PKI server over + HTTP, using the system's configured default HTTP proxy (if any). + +Template: openvswitch-switch/fetch-cacert-failed +Type: error +_Description: The switch CA certificate could not be retrieved. + Retrieval of ${url} failed, with the following status: "${error}". + . + Ensure that the OpenFlow PKI server is correctly configured and + available at ${pki-uri}. If the system is configured to use an HTTP + proxy, also make sure that the HTTP proxy is available and that the + PKI server can be reached through it. + +Template: openvswitch-switch/verify-controller-ca +Type: select +_Choices: yes, no +Default: yes +_Description: Is ${fingerprint} the controller CA's fingerprint? + If a man-in-the-middle attack is possible in your network + environment, check that the controller CA's fingerprint is really + ${fingerprint}. Answer "yes" if it matches, "no" if + there is a discrepancy. + . + If a man-in-the-middle attack is not a concern, there is no need to + verify the fingerprint. Simply answer "yes". + +Template: openvswitch-switch/send-cert-req +Type: select +_Choices: yes, no +Default: yes +_Description: Send certificate request to switch CA? + Before it can connect to the controller over SSL, the OpenFlow + switch's key must be signed by the switch certificate authority (CA) + located on the OpenFlow PKI server, which is usually collocated with + the OpenFlow controller. A signing request can be sent to the PKI + server now. + . + Answer "yes" to send a signing request to the switch CA now. This is + ordinarily the correct choice. There is no harm in sending a given + signing request more than once. + . + Answer "no" to skip sending a signing request to the switch CA. + Unless the request has already been sent to the switch CA, manual + sending of the request and signing will be necessary. + +Template: openvswitch-switch/send-cert-req-failed +Type: error +_Description: The certificate request could not be sent. + Posting to ${url} failed, with the following status: "${error}". + . + Ensure that the OpenFlow PKI server is correctly configured and + available at ${pki-uri}. + +Template: openvswitch-switch/fetch-switch-cert +Type: select +_Choices: yes, no +_Description: Fetch signed switch certificate from PKI server? + Before it can connect to the controller over SSL, the OpenFlow + switch's key must be signed by the switch certificate authority (CA) + located on the OpenFlow PKI server, which is usually collocated with + the OpenFlow controller. + . + At this point, a signing request has been sent to the switch CA (or + sending a request has been manually skipped), but the signed + certificate has not yet been retrieved. Manual action may need to be + taken at the PKI server to approve the signing request. + . + Answer "yes" to attempt to retrieve the signed switch certificate + from the switch CA. If the switch certificate request has been + signed at the PKI server, this is the correct choice. + . + Answer "no" to postpone switch configuration. The configuration + process must be restarted later, when the switch certificate request + has been signed. + +Template: openvswitch-switch/fetch-switch-cert-failed +Type: error +_Description: Signed switch certificate could not be retrieved. + The signed switch certificate could not be retrieved from the switch + CA: retrieval of ${url} failed, with the following status: "${error}". + . + This probably indicates that the switch's certificate request has not + yet been signed. If this is the problem, it may be fixed by signing + the certificate request at ${pki-uri}, then trying to fetch the + signed switch certificate again. + +Template: openvswitch-switch/complete +Type: note +_Description: OpenFlow Switch Setup Finished + Setup of this OpenFlow switch is finished. Complete the setup procedure + to enable the switch. diff --git a/debian/openvswitch-switch.README.Debian b/debian/openvswitch-switch.README.Debian new file mode 100644 index 00000000..eb504f65 --- /dev/null +++ b/debian/openvswitch-switch.README.Debian @@ -0,0 +1,18 @@ +README.Debian for openvswitch-switch +--------------------------------- + +* The switch must be configured before it can be used. To configure + it interactively, install the openvswitch-switch-config package and run + the ovs-switch-setup program. Alternatively, edit + /etc/default/openvswitch-switch by hand, then start the switch manually + with "/etc/init.d/openvswitch-switch start". + +* To use the Linux kernel-based switch implementation, you will need + to build and install the Open vSwitch kernel module. To do so, install + the openvswitch-datapath-source package, then follow the instructions + given in /usr/share/doc/openvswitch-datapath-source/README.Debian + +* This package does not yet support the userspace datapath-based + switch implementation. + + -- Ben Pfaff , Mon, 11 May 2009 13:29:43 -0700 diff --git a/debian/openvswitch-switch.dirs b/debian/openvswitch-switch.dirs new file mode 100644 index 00000000..b4a52873 --- /dev/null +++ b/debian/openvswitch-switch.dirs @@ -0,0 +1,2 @@ +/etc/openvswitch-switch +/usr/share/openvswitch/switch diff --git a/debian/openvswitch-switch.init b/debian/openvswitch-switch.init new file mode 100755 index 00000000..b238f72e --- /dev/null +++ b/debian/openvswitch-switch.init @@ -0,0 +1,428 @@ +#! /bin/sh +# +# /etc/init.d/openvswitch-switch +# +# Written by Miquel van Smoorenburg . +# Modified for Debian by Ian Murdock . +# Further changes by Javier Fernandez-Sanguino +# Modified for openvswitch-switch. +# +# Version: @(#)skeleton 1.9 26-Feb-2001 miquels@cistron.nl +# +### BEGIN INIT INFO +# Provides: openvswitch-switch +# Required-Start: $network $named $remote_fs $syslog +# Required-Stop: +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Open vSwitch switch +### END INIT INFO + +PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin +DAEMON=/usr/sbin/secchan +NAME=secchan +DESC=secchan + +test -x $DAEMON || exit 0 + +NICIRA_OUI="002320" + +LOGDIR=/var/log/openvswitch +PIDFILE=/var/run/$NAME.pid +DHCLIENT_PIDFILE=/var/run/dhclient.of0.pid +DODTIME=1 # Time to wait for the server to die, in seconds + # If this value is set too low you might not + # let some servers to die gracefully and + # 'restart' will not work + +# Include secchan defaults if available +unset NETDEVS +unset MODE +unset SWITCH_IP +unset CONTROLLER +unset PRIVKEY +unset CERT +unset CACERT +unset CACERT_MODE +unset MGMT_VCONNS +unset COMMANDS +unset DAEMON_OPTS +unset CORE_LIMIT +unset DATAPATH_ID +default=/etc/default/openvswitch-switch +if [ -f $default ] ; then + . $default +fi + +set -e + +running_pid() +{ + # Check if a given process pid's cmdline matches a given name + pid=$1 + name=$2 + [ -z "$pid" ] && return 1 + [ ! -d /proc/$pid ] && return 1 + cmd=`cat /proc/$pid/cmdline | tr "\000" "\n"|head -n 1 |cut -d : -f 1` + # Is this the expected child? + case $cmd in + $name|*/$name) + return 0 + ;; + *) + return 1 + ;; + esac +} + +running() +{ +# Check if the process is running looking at /proc +# (works for all users) + + # No pidfile, probably no daemon present + [ ! -f "$PIDFILE" ] && return 1 + # Obtain the pid and check it against the binary name + pid=`cat $PIDFILE` + running_pid $pid $NAME || return 1 + return 0 +} + +force_stop() { +# Forcefully kill the process + [ ! -f "$PIDFILE" ] && return + if running ; then + kill -15 $pid + # Is it really dead? + [ -n "$DODTIME" ] && sleep "$DODTIME"s + if running ; then + kill -9 $pid + [ -n "$DODTIME" ] && sleep "$DODTIME"s + if running ; then + echo "Cannot kill $NAME (pid=$pid)!" + exit 1 + fi + fi + fi + rm -f $PIDFILE + return 0 +} + +must_succeed() { + echo -n "$1: " + shift + if "$@"; then + echo "success." + else + echo " ERROR." + exit 1 + fi +} + +check_op() { + echo -n "$1: " + shift + if "$@"; then + echo "success." + else + echo " ERROR." + fi +} + +configure_ssl() { + if (test "$CACERT_MODE" != secure && test "$CACERT_MODE" != bootstrap) \ + || test ! -e "$PRIVKEY" || test ! -e "$CERT" \ + || (test ! -e "$CACERT" && test "$CACERT_MODE" != bootstrap); then + if test "$CACERT_MODE" != secure && test "$CACERT_MODE" != bootstrap + then + echo "CACERT_MODE is not set to 'secure' or 'bootstrap'" + fi + if test ! -e "$PRIVKEY"; then + echo "$PRIVKEY: private key missing" >&2 + fi + if test ! -e "$CERT"; then + echo "$CERT: certificate for private key missing" >&2 + fi + if test ! -e "$CACERT" && test "$CACERT_MODE" != bootstrap; then + echo "$CACERT: CA certificate missing (and CA certificate bootstrapping not enabled)" >&2 + fi + echo "Run ovs-switch-setup (in the openvswitch-switch-config package) or edit /etc/default/openvswitch-switch to configure" >&2 + if test "$MODE" = discovery; then + echo "You may also delete or rename $PRIVKEY to disable SSL requirement" >&2 + fi + exit 1 + fi + + SSL_OPTS="--private-key=$PRIVKEY --certificate=$CERT" + if test ! -e "$CACERT" && test "$CACERT_MODE" = bootstrap; then + SSL_OPTS="$SSL_OPTS --bootstrap-ca-cert=$CACERT" + else + SSL_OPTS="$SSL_OPTS --ca-cert=$CACERT" + fi +} + +check_int_var() { + eval value=\$$1 + if test -n "$value"; then + if expr "X$value" : 'X[0-9][0-9]*$' > /dev/null 2>&1; then + if test $value -lt $2; then + echo "warning: The $1 option may not be set to a value below $2, treating as $2" >&2 + eval $1=$2 + fi + else + echo "warning: The $1 option must be set to a number, ignoring" >&2 + unset $1 + fi + fi +} + +check_new_option() { + case $DAEMON_OPTS in + *$1*) + echo "warning: The $1 option in DAEMON_OPTS may now be set with the $2 variable in $default. The setting in DAEMON_OPTS will override the $2 variable, which will prevent the switch UI from configuring $1." >&2 + ;; + esac +} + +case "$1" in + start) + if test -z "$NETDEVS"; then + echo "$default: No network devices configured, switch disabled" >&2 + echo "Run ovs-switch-setup (in the openvswitch-switch-config package) or edit /etc/default/openvswitch-switch to configure" >&2 + exit 0 + fi + if test "$MODE" = discovery; then + unset CONTROLLER + elif test "$MODE" = in-band || test "$MODE" = out-of-band; then + if test -z "$CONTROLLER"; then + echo "$default: No controller configured and not configured for discovery, switch disabled" >&2 + echo "Run ovs-switch-setup (in the openvswitch-switch-config package) or edit /etc/default/openvswitch-switch to configure" >&2 + exit 0 + fi + else + echo "$default: MODE must set to 'discovery', 'in-band', or 'out-of-band'" >&2 + echo "Run ovs-switch-setup (in the openvswitch-switch-config package) or edit /etc/default/openvswitch-switch to configure" >&2 + exit 1 + fi + : ${PRIVKEY:=/etc/openvswitch-switch/of0-privkey.pem} + : ${CERT:=/etc/openvswitch-switch/of0-cert.pem} + : ${CACERT:=/etc/openvswitch-switch/cacert.pem} + case $CONTROLLER in + '') + # Discovery mode. + if test -e "$PRIVKEY"; then + configure_ssl + fi + ;; + tcp:*) + ;; + ssl:*) + configure_ssl + ;; + *) + echo "$default: CONTROLLER must be in the form 'ssl:HOST[:PORT]' or 'tcp:HOST[:PORT]' when not in discovery mode" >&2 + echo "Run ovs-switch-setup (in the openvswitch-switch-config package) or edit /etc/default/openvswitch-switch to configure" >&2 + exit 1 + esac + case $DISCONNECTED_MODE in + ''|switch|drop) ;; + *) echo "$default: warning: DISCONNECTED_MODE is not 'switch' or 'drop'" >&2 ;; + esac + + check_int_var RATE_LIMIT 100 + check_int_var INACTIVITY_PROBE 5 + check_int_var MAX_BACKOFF 1 + + check_new_option --fail DISCONNECTED_MODE + check_new_option --stp STP + check_new_option --rate-limit RATE_LIMIT + check_new_option --inactivity INACTIVITY_PROBE + check_new_option --max-backoff MAX_BACKOFF + case $DAEMON_OPTS in + *--rate-limit*) + echo "$default: --rate-limit may now be set with RATE_LIMIT" >&2 + esac + + echo -n "Loading openvswitch_mod: " + if grep -q '^openvswitch_mod$' /proc/modules; then + echo "already loaded, nothing to do." + elif modprobe openvswitch_mod; then + echo "success." + else + echo "ERROR." + echo "openvswitch_mod has probably not been built for this kernel." + if ! test -d /usr/share/doc/openvswitch-datapath-source; then + echo "Install the openvswitch-datapath-source package, then read" + echo "/usr/share/doc/openvswitch-datapath-source/README.Debian" + else + echo "For instructions, read" + echo "/usr/share/doc/openvswitch-datapath-source/README.Debian" + fi + exit 1 + fi + + for netdev in $NETDEVS; do + check_op "Removing IP address from $netdev" ifconfig $netdev 0.0.0.0 + done + + must_succeed "Creating datapath" ovs-dpctl add-dp of0 $NETDEVS + + xx='[0-9abcdefABCDEF][0-9abcdefABCDEF]' + case $DATAPATH_ID in + '') + # Check if the DMI System UUID contains a Nicira mac address + # that should be used for this datapath. The UUID is assumed + # to be RFC 4122 compliant. + DMIDECODE=`which dmidecode` + if [ -n $DMIDECODE ]; then + UUID_MAC=`$DMIDECODE -s system-uuid | cut -d'-' -f 5` + case $UUID_MAC in + $NICIRA_OUI*) + ifconfig of0 down + must_succeed "Setting of0 MAC address to $UUID_MAC" ifconfig of0 hw ether $UUID_MAC + ifconfig of0 up + ;; + esac + fi + ;; + $xx:$xx:$xx:$xx:$xx:$xx) + ifconfig of0 down + must_succeed "Setting of0 MAC address to $DATAPATH_ID" ifconfig of0 hw ether $DATAPATH_ID + ifconfig of0 up + ;; + *) + echo "DATAPATH_ID is not a valid MAC address in the form XX:XX:XX:XX:XX:XX, ignoring" >&2 + ;; + esac + + if test "$MODE" = in-band; then + if test "$SWITCH_IP" = dhcp; then + must_succeed "Temporarily disabling of0" ifconfig of0 down + else + COMMAND="ifconfig of0 $SWITCH_IP" + if test -n "$SWITCH_NETMASK"; then + COMMAND="$COMMAND netmask $SWITCH_NETMASK" + fi + must_succeed "Configuring of0: $COMMAND" $COMMAND + if test -n "$SWITCH_GATEWAY"; then + # This can fail because the route already exists, + # so we don't insist that it succeed. + COMMAND="route add default gw $SWITCH_GATEWAY" + check_op "Adding default route: $COMMAND" $COMMAND + fi + fi + else + must_succeed "Disabling of0" ifconfig of0 down + fi + + if test -n "$CORE_LIMIT"; then + check_op "Setting core limit to $CORE_LIMIT" ulimit -c "$CORE_LIMIT" + fi + + # Compose secchan options. + set -- + set -- "$@" --verbose=ANY:console:emer --verbose=ANY:syslog:err + set -- "$@" --log-file + set -- "$@" --detach --pidfile=$PIDFILE + for vconn in $MGMT_VCONNS; do + set -- "$@" --listen="$vconn" + done + if test -n "$COMMANDS"; then + set -- "$@" --command-acl="$COMMANDS" + fi + case $STP in + yes) set -- "$@" --stp ;; + no) set -- "$@" --no-stp ;; + esac + case $DISCONNECTED_MODE in + switch) set -- "$@" --fail=open ;; + drop) set -- "$@" --fail=closed ;; + esac + if test -n "$RATE_LIMIT"; then + set -- "$@" --rate-limit=$RATE_LIMIT + fi + if test -n "$INACTIVITY_PROBE"; then + set -- "$@" --inactivity-probe=$INACTIVITY_PROBE + fi + if test -n "$MAX_BACKOFF"; then + set -- "$@" --max-backoff=$MAX_BACKOFF + fi + set -- "$@" $SSL_OPTS $DAEMON_OPTS + if test "$MODE" = out-of-band; then + set -- "$@" --out-of-band + fi + set -- "$@" of0 "$CONTROLLER" + echo -n "Starting $DESC: " + start-stop-daemon --start --quiet --pidfile $PIDFILE \ + --exec $DAEMON -- "$@" + if running; then + echo "$NAME." + else + echo " ERROR." + fi + + if test "$MODE" = in-band && test "$SWITCH_IP" = dhcp; then + echo -n "Starting dhclient on of0: " + start-stop-daemon --start --quiet --pidfile $DHCLIENT_PIDFILE \ + --exec /sbin/dhclient -- -q -pf $DHCLIENT_PIDFILE of0 + if running; then + echo "dhclient." + else + echo " ERROR." + fi + fi + ;; + stop) + if test -e /var/run/dhclient.of0.pid; then + echo -n "Stopping dhclient on of0: " + start-stop-daemon --stop --quiet --oknodo \ + --pidfile $DHCLIENT_PIDFILE --exec /sbin/dhclient + echo "dhclient." + fi + + echo -n "Stopping $DESC: " + start-stop-daemon --stop --quiet --oknodo --pidfile $PIDFILE \ + --exec $DAEMON + echo "$NAME." + + check_op "Deleting datapath" ovs-dpctl del-dp of0 + check_op "Unloading kernel module" modprobe -r openvswitch_mod + ;; + force-stop) + echo -n "Forcefully stopping $DESC: " + force_stop + if ! running; then + echo "$NAME." + else + echo " ERROR." + fi + ;; + reload) + ;; + force-reload) + start-stop-daemon --stop --test --quiet --pidfile \ + $PIDFILE --exec $DAEMON \ + && $0 restart \ + || exit 0 + ;; + restart) + $0 stop || true + $0 start + ;; + status) + echo -n "$NAME is " + if running ; then + echo "running" + else + echo " not running." + exit 1 + fi + ;; + *) + N=/etc/init.d/$NAME + echo "Usage: $N {start|stop|restart|force-reload|status|force-stop}" >&2 + exit 1 + ;; +esac + +exit 0 diff --git a/debian/openvswitch-switch.install b/debian/openvswitch-switch.install new file mode 100644 index 00000000..9fddacf0 --- /dev/null +++ b/debian/openvswitch-switch.install @@ -0,0 +1,7 @@ +_debian/secchan/secchan usr/sbin +_debian/utilities/ovs-dpctl usr/sbin +_debian/utilities/ovs-discover usr/sbin +_debian/utilities/ovs-kill usr/sbin +_debian/utilities/ovs-ofctl usr/sbin +debian/openvswitch/usr/share/openvswitch/commands/* usr/share/openvswitch/commands +debian/commands/* usr/share/openvswitch/commands diff --git a/debian/openvswitch-switch.logrotate b/debian/openvswitch-switch.logrotate new file mode 100644 index 00000000..41394e86 --- /dev/null +++ b/debian/openvswitch-switch.logrotate @@ -0,0 +1,11 @@ +/var/log/openvswitch/secchan.log { + daily + compress + create 640 root adm + delaycompress + missingok + rotate 30 + postrotate + ovs-appctl --target /var/run/secchan.pid --reopen + endscript +} diff --git a/debian/openvswitch-switch.manpages b/debian/openvswitch-switch.manpages new file mode 100644 index 00000000..f789eba9 --- /dev/null +++ b/debian/openvswitch-switch.manpages @@ -0,0 +1,5 @@ +_debian/secchan/secchan.8 +_debian/utilities/ovs-discover.8 +_debian/utilities/ovs-dpctl.8 +_debian/utilities/ovs-kill.8 +_debian/utilities/ovs-ofctl.8 diff --git a/debian/openvswitch-switch.postinst b/debian/openvswitch-switch.postinst new file mode 100755 index 00000000..74b52ba9 --- /dev/null +++ b/debian/openvswitch-switch.postinst @@ -0,0 +1,51 @@ +#!/bin/sh +# postinst script for openvswitch-switch +# +# see: dh_installdeb(1) + +set -e + +# summary of how this script can be called: +# * `configure' +# * `abort-upgrade' +# * `abort-remove' `in-favour' +# +# * `abort-remove' +# * `abort-deconfigure' `in-favour' +# `removing' +# +# for details, see http://www.debian.org/doc/debian-policy/ or +# the debian-policy package + + +case "$1" in + configure) + DEFAULT=/etc/default/openvswitch-switch + TEMPLATE=/usr/share/openvswitch/switch/default.template + if ! test -e $DEFAULT; then + cp $TEMPLATE $DEFAULT + else + for var in $(awk -F'[ :]' '/^# [_A-Z0-9]+:/{print $2}' $TEMPLATE) + do + if ! grep $var $DEFAULT >/dev/null 2>&1; then + echo >> $DEFAULT + sed -n "/$var:/,/$var=/p" $TEMPLATE >> $DEFAULT + fi + done + fi + ;; + + abort-upgrade|abort-remove|abort-deconfigure) + ;; + + *) + echo "postinst called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +#DEBHELPER# + +exit 0 + + diff --git a/debian/openvswitch-switch.postrm b/debian/openvswitch-switch.postrm new file mode 100755 index 00000000..19e8ebe4 --- /dev/null +++ b/debian/openvswitch-switch.postrm @@ -0,0 +1,43 @@ +#!/bin/sh +# postrm script for openvswitch-switch +# +# see: dh_installdeb(1) + +set -e + +# summary of how this script can be called: +# * `remove' +# * `purge' +# * `upgrade' +# * `failed-upgrade' +# * `abort-install' +# * `abort-install' +# * `abort-upgrade' +# * `disappear' +# +# for details, see http://www.debian.org/doc/debian-policy/ or +# the debian-policy package + + +case "$1" in + purge) + rm -f /etc/default/openvswitch-switch + ;; + + remove|upgrade|failed-upgrade|abort-install|abort-upgrade|disappear) + ;; + + *) + echo "postrm called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +# dh_installdeb will replace this with shell code automatically +# generated by other debhelper scripts. + +#DEBHELPER# + +exit 0 + + diff --git a/debian/openvswitch-switch.template b/debian/openvswitch-switch.template new file mode 100644 index 00000000..7fe0e15c --- /dev/null +++ b/debian/openvswitch-switch.template @@ -0,0 +1,165 @@ +# This is a POSIX shell fragment -*- sh -*- + +# To configure the secure channel, fill in the following properly and +# uncomment them. Afterward, the secure channel will come up +# automatically at boot time. It can be started immediately with +# /etc/init.d/openvswitch-switch start +# Alternatively, use the ovs-switch-setup program (from the +# openvswitch-switch-config package) to do everything automatically. + +# NETDEVS: Which network devices should the OpenFlow switch include? +# +# List the network devices that should become part of the OpenFlow +# switch, separated by spaces. At least two devices must be selected +# for this machine to be a useful switch. Unselecting all network +# devices will disable the OpenFlow switch entirely. +# +# The network devices that you select should not be configured with IP +# or IPv6 addresses, even if the switch contacts the controller over +# one of the selected network devices. This is because a running +# Open vSwitch switch takes over network devices at a low level: they +# become part of the switch and cannot be used for other purposes. +#NETDEVS="" + +# MODE: The OpenFlow switch has three modes that determine how it +# reaches the controller: +# +# * in-band with discovery: A single network is used for OpenFlow +# traffic and other data traffic; that is, the switch contacts the +# controller over one of the network devices selected as OpenFlow +# switch ports. The switch automatically determines the location of +# the controller using a DHCP request with an OpenFlow-specific +# vendor option. This is the most common case. +# +# * in-band: As above, but the location of the controller is manually +# configured. +# +# * out-of-band: OpenFlow traffic uses a network separate from the +# data traffic that it controls. If this is the case, the control +# network must already be configured on a network device other than +# one of those selected as an Open vSwitch switch port in the previous +# question. +# +# Set MODE to 'discovery', 'in-band', or 'out-of-band' for these +# respective cases. +MODE=discovery + +# SWITCH_IP: In 'in-band' mode, the switch's IP address may be +# configured statically or dynamically: +# +# * For static configuration, specify the switch's IP address as a +# string. In this case you may also set SWITCH_NETMASK and +# SWITCH_GATEWAY appropriately (see below). +# +# * For dynamic configuration with DHCP (the most common case), +# specify "dhcp". Configuration with DHCP will only work reliably +# if the network topology allows the switch to contact the DHCP +# server before it connects to the OpenFlow controller. +# +# This setting has no effect unless MODE is set to 'in-band'. +SWITCH_IP=dhcp + +# SWITCH_NETMASK: IP netmask to use in 'in-band' mode when the switch +# IP address is not 'dhcp'. +#SWITCH_NETMASK=255.255.255.0 + +# SWITCH_GATEWAY: IP gateway to use in 'in-band' mode when the switch +# IP address is not 'dhcp'. +#SWITCH_GATEWAY=192.168.1.1 + +# CONTROLLER: Location of controller. +# One of the following formats: +# tcp:HOST[:PORT] via TCP to PORT (default: 6633) on HOST +# ssl:HOST[:PORT] via SSL to PORT (default: 6633) on HOST +# The default below assumes that the controller is running locally. +# This setting has no effect when MODE is set to 'discovery'. +#CONTROLLER="tcp:127.0.0.1" + +# PRIVKEY: Name of file containing switch's private key. +# Required if SSL enabled. +#PRIVKEY=/etc/openvswitch-switch/of0-privkey.pem + +# CERT: Name of file containing certificate for private key. +# Required if SSL enabled. +#CERT=/etc/openvswitch-switch/of0-cert.pem + +# CACERT: Name of file containing controller CA certificate. +# Required if SSL enabled. +#CACERT=/etc/openvswitch-switch/cacert.pem + +# CACERT_MODE: Two modes are available: +# +# * secure: The controller CA certificate named in CACERT above must exist. +# (You must copy it manually from the PKI server or another trusted source.) +# +# * bootstrap: If the controller CA certificate named in CACERT above does +# not exist, the switch will obtain it from the controller the first time +# it connects and save a copy to the file named in CACERT. This is insecure, +# in the same way that initial connections with ssh are insecure, but +# it is convenient. +# +# Set CACERT_MODE to 'secure' or 'bootstrap' for these respective cases. +#CACERT_MODE=secure + +# MGMT_VCONNS: List of vconns (space-separated) on which secchan +# should listen for management connections from ovs-ofctl, etc. +# openvswitch-switchui by default connects to +# unix:/var/run/secchan.mgmt, so do not disable this if you want to +# use openvswitch-switchui. +MGMT_VCONNS="punix:/var/run/secchan.mgmt" + +# COMMANDS: Access control list for the commands that can be executed +# remotely over the OpenFlow protocol, as a comma-separated list of +# shell glob patterns. Negative patterns (beginning with !) act as a +# blacklist. To be executable, a command name must match one positive +# pattern and not match any negative patterns. +#COMMANDS="reboot,update" + +# DISCONNECTED_MODE: Switch behavior when attempts to connect to the +# controller repeatedly fail, either 'switch', to act as an L2 switch +# in this case, or 'drop', to drop all packets (except those necessary +# to connect to the controller). If unset, the default is 'drop'. +#DISCONNECTED_MODE=switch + +# STP: Enable or disabled 802.1D-1998 Spanning Tree Protocol. Set to +# 'yes' to enable STP, 'no' to disable it. If unset, secchan's +# current default is 'no' (but this may change in the future). +#STP=no + +# RATE_LIMIT: Maximum number of received frames, that do not match any +# existing switch flow, to forward up to the controller per second. +# The valid range is 100 and up. If unset, this rate will not be +# limited. +#RATE_LIMIT=1000 + +# INACTIVITY_PROBE: The maximum number of seconds of inactivity on the +# controller connection before secchan sends an inactivity probe +# message to the controller. The valid range is 5 and up. If unset, +# secchan defaults to 15 seconds. +#INACTIVITY_PROBE=5 + +# MAX_BACKOFF: The maximum time that secchan will wait between +# attempts to connect to the controller. The valid range is 1 and up. +# If unset, secchan defaults to 15 seconds. +#MAX_BACKOFF=15 + +# DAEMON_OPTS: Additional options to pass to secchan, e.g. "--fail=open" +DAEMON_OPTS="" + +# CORE_LIMIT: Maximum size for core dumps. +# +# Leaving this unset will use the system default. Setting it to 0 +# will disable core dumps. Setting it to "unlimited" will dump all +# core files regardless of size. +#CORE_LIMIT=unlimited + +# DATAPATH_ID: Identifier for this switch. +# +# By default, the switch checks if the DMI System UUID contains a Nicira +# mac address to use as a datapath ID. If not, then the switch generates +# a new, random datapath ID every time it starts up. By setting this +# value, the supplied datapath ID will always be used. +# +# Set DATAPATH_ID to a MAC address in the form XX:XX:XX:XX:XX:XX where each +# X is a hexadecimal digit (0-9 or a-f). +#DATAPATH_ID=XX:XX:XX:XX:XX:XX diff --git a/debian/openvswitch-switchui.copyright b/debian/openvswitch-switchui.copyright new file mode 100644 index 00000000..ab7cac59 --- /dev/null +++ b/debian/openvswitch-switchui.copyright @@ -0,0 +1,33 @@ +Upstream Authors: + + Nicira Networks, Inc. + +Copyright: + + Copyright (c) 2008, 2009 Nicira Networks, Inc. + +License: + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + + In addition, as a special exception, Nicira Networks gives + permission to link the code of its release of ovs-vswitchd with + the OpenSSL project's "OpenSSL" library (or with modified versions + of it that use the same license as the "OpenSSL" library), and + distribute the linked executables. You must obey the GNU General + Public License in all respects for all of the code used other than + "OpenSSL". If you modify this file, you may extend this exception + to your version of the file, but you are not obligated to do so. + If you do not wish to do so, delete this exception statement from + your version. diff --git a/debian/openvswitch-switchui.default b/debian/openvswitch-switchui.default new file mode 100644 index 00000000..6cdbf7a5 --- /dev/null +++ b/debian/openvswitch-switchui.default @@ -0,0 +1,35 @@ +# This is a POSIX shell fragment -*- sh -*- + +# To configure the switch monitor, modify the following. Afterward, +# the secure channel will come up automatically at boot time. It can +# be restarted immediately with +# /etc/init.d/openvswitch-switchui start + +# Defaults for initscript +# sourced by /etc/init.d/openvswitch-switchui +# installed at /etc/default/openvswitch-switchui by the maintainer scripts + +# SWITCH_VCONN: The vconn used to connect to the switch (secchan). +# The secchan must be configured to listen to this vconn. The default +# here set is also listened to by default by the openvswitch-switch +# package, so ordinarily there is no need to modify this. +SWITCH_VCONN="unix:/var/run/secchan.mgmt" + +# EZIO3_DEVICE: To display the switch monitor on an EZIO3 (aka +# MTB-134) 16x2 LCD displays found on server appliances made by +# Portwell, set this to the EZIO3 serial device and uncomment it. +#EZIO3_DEVICE="/dev/ttyS1" + +# OPENVT: When EZIO3_DEVICE is unset, this specifies the command under +# which to run ovs-switchui. The default value of "/usr/bin/openvt" +# causes ovs-switchui to run on a new, otherwise empty virtual +# console. +# +# The value must be a command name without arguments. Use a wrapper +# script to provide arguments if you need them. +# +# When EZIO3_DEVICE is set, this variable has no effect. +OPENVT="/usr/bin/openvt" + +# DAEMON_OPTS: Additional options to pass to ovs-switchui. +DAEMON_OPTS="" diff --git a/debian/openvswitch-switchui.dirs b/debian/openvswitch-switchui.dirs new file mode 100644 index 00000000..4dced02c --- /dev/null +++ b/debian/openvswitch-switchui.dirs @@ -0,0 +1,3 @@ +usr/bin +usr/sbin +usr/share/terminfo diff --git a/debian/openvswitch-switchui.init b/debian/openvswitch-switchui.init new file mode 100755 index 00000000..7a02c5ea --- /dev/null +++ b/debian/openvswitch-switchui.init @@ -0,0 +1,210 @@ +#!/bin/sh +# +# Example init.d script with LSB support. +# +# Please read this init.d carefully and modify the sections to +# adjust it to the program you want to run. +# +# Copyright (c) 2007, 2009 Javier Fernandez-Sanguino +# +# This is free software; you may redistribute it and/or modify +# it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2, +# or (at your option) any later version. +# +# This is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License with +# the Debian operating system, in /usr/share/common-licenses/GPL; if +# not, write to the Free Software Foundation, Inc., 59 Temple Place, +# Suite 330, Boston, MA 02111-1307 USA +# +### BEGIN INIT INFO +# Provides: openvswitch-switchui +# Required-Start: $network $local_fs +# Required-Stop: +# Should-Start: $named $syslog openvswitch-switch +# Should-Stop: +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Open vSwitch switch monitor +### END INIT INFO + +PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin + +DAEMON=/usr/bin/ovs-switchui +NAME=openvswitch-switchui +DESC="Open vSwitch switch monitor" + +PIDFILE=/var/run/$NAME.pid + +test -x $DAEMON || exit 0 + +. /lib/lsb/init-functions + +# Default options, these can be overriden by the information +# at /etc/default/$NAME +DAEMON_OPTS="" # Additional options given to the server + +DODTIME=10 # Time to wait for the server to die, in seconds + # If this value is set too low you might not + # let some servers to die gracefully and + # 'restart' will not work + +# Include defaults if available +if [ -f /etc/default/$NAME ] ; then + . /etc/default/$NAME +fi + +set -e + +running_pid() { +# Check if a given process pid's cmdline matches a given name + pid=$1 + name=$2 + [ -z "$pid" ] && return 1 + [ ! -d /proc/$pid ] && return 1 + return 0 +} + +running() { +# Check if the process is running looking at /proc +# (works for all users) + + # No pidfile, probably no daemon present + [ ! -f "$PIDFILE" ] && return 1 + pid=`cat $PIDFILE` + running_pid $pid $DAEMON || return 1 + return 0 +} + +start_server() { +# Start the process using the wrapper + if test -n "$EZIO3_DEVICE"; then + # Make ezio-term detach and create the pidfile. + WRAPPER="/usr/sbin/ezio-term" + WRAPPER_OPTS="--detach --pidfile=$PIDFILE --ezio=$EZIO3_DEVICE --input=vt" + else + # openvt will detach, so instead make ovs-switchui make the pidfile. + WRAPPER=$OPENVT + WRAPPER_OPTS="" + DAEMON_OPTS="--pidfile=$PIDFILE" + fi + start-stop-daemon --start --quiet --pidfile $PIDFILE \ + --exec $WRAPPER -- $WRAPPER_OPTS -- $DAEMON $DAEMON_OPTS \ + --log-file $SWITCH_VCONN + + # Wait up to 3 seconds for the daemon to start. + for i in 1 2 3; do + if running; then + break + fi + sleep 1 + done +} + +stop_server() { + ovs-kill $PIDFILE +} + +force_stop() { +# Force the process to die killing it manually + [ ! -e "$PIDFILE" ] && return + if running ; then + kill -15 $pid + # Is it really dead? + sleep "$DIETIME"s + if running ; then + kill -9 $pid + sleep "$DIETIME"s + if running ; then + echo "Cannot kill $NAME (pid=$pid)!" + exit 1 + fi + fi + fi + rm -f $PIDFILE +} + + +case "$1" in + start) + log_daemon_msg "Starting $DESC " "$NAME" + # Check if it's running first + if running ; then + log_progress_msg "apparently already running" + log_end_msg 0 + exit 0 + fi + if start_server && running ; then + # It's ok, the server started and is running + log_end_msg 0 + else + # Either we could not start it or it is not running + # after we did + # NOTE: Some servers might die some time after they start, + # this code does not try to detect this and might give + # a false positive (use 'status' for that) + log_end_msg 1 + fi + ;; + stop) + log_daemon_msg "Stopping $DESC" "$NAME" + if running ; then + # Only stop the server if we see it running + stop_server + log_end_msg $? + else + # If it's not running don't do anything + log_progress_msg "apparently not running" + log_end_msg 0 + exit 0 + fi + ;; + force-stop) + # First try to stop gracefully the program + $0 stop + if running; then + # If it's still running try to kill it more forcefully + log_daemon_msg "Stopping (force) $DESC" "$NAME" + force_stop + log_end_msg $? + fi + ;; + restart|force-reload) + log_daemon_msg "Restarting $DESC" "$NAME" + stop_server + # Wait some sensible amount, some server need this + [ -n "$DIETIME" ] && sleep $DIETIME + start_server + running + log_end_msg $? + ;; + status) + + log_daemon_msg "Checking status of $DESC" "$NAME" + if running ; then + log_progress_msg "running" + log_end_msg 0 + else + log_progress_msg "apparently not running" + log_end_msg 1 + exit 1 + fi + ;; + # Use this if the daemon cannot reload + reload) + log_warning_msg "Reloading $NAME daemon: not implemented, as the daemon" + log_warning_msg "cannot re-read the config file (use restart)." + ;; + *) + N=/etc/init.d/$NAME + echo "Usage: $N {start|stop|force-stop|restart|force-reload|status}" >&2 + exit 1 + ;; +esac + +exit 0 diff --git a/debian/openvswitch-switchui.install b/debian/openvswitch-switchui.install new file mode 100644 index 00000000..f2872c83 --- /dev/null +++ b/debian/openvswitch-switchui.install @@ -0,0 +1,2 @@ +_debian/extras/ezio/ezio-term usr/sbin +_debian/extras/ezio/ovs-switchui usr/bin diff --git a/debian/openvswitch-wdt.default b/debian/openvswitch-wdt.default new file mode 100644 index 00000000..35625d45 --- /dev/null +++ b/debian/openvswitch-wdt.default @@ -0,0 +1,24 @@ +# This is a POSIX shell fragment -*- sh -*- + +# To configure the Open vSwitch reliability packages, modify the following. +# Afterward, the watchdog timer and oops handling will be configured +# automatically at boot time. It can be started immediately with +# /etc/init.d/openvswitch-wdt start + +# Defaults for initscript +# sourced by /etc/init.d/openvswitch-wdt +# installed at /etc/default/openvswitch-wdt by the maintainer scripts + +# OOPS_REBOOT_TIME: The number of seconds the system should wait until it +# reboots when the kernel oops. A value of zero causes the system to +# wait forever. +OOPS_REBOOT_TIME=1 + +# WDT_TIMEOUT: The number of seconds the watchdog timer should wait until +# it reboots the system when it hasn't received a keep-alive. A value +# of zero disables the watchdog timer. +WDT_TIMEOUT=30 + +# WDT_INTERVAL: The number of seconds to wait between sending keep-alive +# messages to the watchdog timer. +WDT_INTERVAL=1 diff --git a/debian/openvswitch-wdt.dirs b/debian/openvswitch-wdt.dirs new file mode 100644 index 00000000..ca882bbb --- /dev/null +++ b/debian/openvswitch-wdt.dirs @@ -0,0 +1,2 @@ +usr/bin +usr/sbin diff --git a/debian/openvswitch-wdt.init b/debian/openvswitch-wdt.init new file mode 100755 index 00000000..b1c0ec5e --- /dev/null +++ b/debian/openvswitch-wdt.init @@ -0,0 +1,176 @@ +#!/bin/sh +# +# Example init.d script with LSB support. +# +# Please read this init.d carefully and modify the sections to +# adjust it to the program you want to run. +# +# Copyright (c) 2007, 2009 Javier Fernandez-Sanguino +# +# This is free software; you may redistribute it and/or modify +# it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2, +# or (at your option) any later version. +# +# This is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License with +# the Debian operating system, in /usr/share/common-licenses/GPL; if +# not, write to the Free Software Foundation, Inc., 59 Temple Place, +# Suite 330, Boston, MA 02111-1307 USA +# +### BEGIN INIT INFO +# Provides: openvswitch-wdt +# Required-Start: $network $local_fs +# Required-Stop: +# Should-Start: $named $syslog openvswitch-switch +# Should-Stop: +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Open vSwitch switch watchdog +### END INIT INFO + +PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin + +DAEMON=/usr/sbin/ovs-wdt +NAME=openvswitch-wdt +DESC="Open vSwitch switch watchdog" + +PIDFILE=/var/run/$NAME.pid + +test -x $DAEMON || exit 0 + +. /lib/lsb/init-functions + +# Default options, these can be overriden by the information +# at /etc/default/$NAME +DAEMON_OPTS="" # Additional options given to the daemon + +DODTIME=10 # Time to wait for the daemon to die, in seconds + # If this value is set too low you might not + # let some daemons to die gracefully and + # 'restart' will not work + +# Include defaults if available +if [ -f /etc/default/$NAME ] ; then + . /etc/default/$NAME +fi + +set -e + +running_pid() { +# Check if a given process pid's cmdline matches a given name + pid=$1 + name=$2 + [ -z "$pid" ] && return 1 + [ ! -d /proc/$pid ] && return 1 + return 0 +} + +running() { +# Check if the process is running looking at /proc +# (works for all users) + + # No pidfile, probably no daemon present + [ ! -f "$PIDFILE" ] && return 1 + pid=`cat $PIDFILE` + running_pid $pid $DAEMON || return 1 + return 0 +} + +start_daemon() { +# Start the process using the wrapper + if test $WDT_TIMEOUT != 0; then + start-stop-daemon --start --quiet -m --background --pidfile $PIDFILE \ + --exec $DAEMON -- --timeout=$WDT_TIMEOUT --interval=$WDT_INTERVAL $DAEMON_OPTS + fi + + # Wait up to 3 seconds for the daemon to start. + for i in 1 2 3; do + if running; then + break + fi + sleep 1 + done + + echo $OOPS_REBOOT_TIME > /proc/sys/kernel/panic + echo 1 > /proc/sys/kernel/panic_on_oops +} + +stop_daemon() { + start-stop-daemon -o --stop --pidfile $PIDFILE + rm $PIDFILE +} + +case "$1" in + start) + log_daemon_msg "Starting $DESC " "$NAME" + # Check if it's running first + if running ; then + log_progress_msg "apparently already running" + log_end_msg 0 + exit 0 + fi + if start_daemon && running ; then + # It's ok, the daemon started and is running + log_end_msg 0 + else + # Either we could not start it or it is not running + # after we did + # NOTE: Some daemons might die some time after they start, + # this code does not try to detect this and might give + # a false positive (use 'status' for that) + log_end_msg 1 + fi + ;; + stop) + log_daemon_msg "Stopping $DESC" "$NAME" + if running ; then + # Only stop the daemon if we see it running + stop_daemon + log_end_msg $? + else + # If it's not running don't do anything + log_progress_msg "apparently not running" + log_end_msg 0 + exit 0 + fi + ;; + restart|force-reload) + log_daemon_msg "Restarting $DESC" "$NAME" + if running ; then + stop_daemon + # Wait some sensible amount, some daemons need this + [ -n "$DIETIME" ] && sleep $DIETIME + fi + start_daemon + running + log_end_msg $? + ;; + status) + log_daemon_msg "Checking status of $DESC" "$NAME" + if running ; then + log_progress_msg "running" + log_end_msg 0 + else + log_progress_msg "apparently not running" + log_end_msg 1 + exit 1 + fi + ;; + # Use this if the daemon cannot reload + reload) + log_warning_msg "Reloading $NAME daemon: not implemented, as the daemon" + log_warning_msg "cannot re-read the config file (use restart)." + ;; + *) + N=/etc/init.d/$NAME + echo "Usage: $N {start|stop|restart|force-reload|status}" >&2 + exit 1 + ;; +esac + +exit 0 diff --git a/debian/openvswitch-wdt.install b/debian/openvswitch-wdt.install new file mode 100644 index 00000000..80a04e13 --- /dev/null +++ b/debian/openvswitch-wdt.install @@ -0,0 +1 @@ +_debian/utilities/ovs-wdt usr/sbin diff --git a/debian/ovs-switch-setup b/debian/ovs-switch-setup new file mode 100755 index 00000000..7a720727 --- /dev/null +++ b/debian/ovs-switch-setup @@ -0,0 +1,615 @@ +#! /usr/bin/perl + +use POSIX; +use Debconf::Client::ConfModule ':all'; +use HTTP::Request; +use LWP::UserAgent; +use Digest::SHA1 'sha1_hex'; +use strict; +use warnings; + +# XXX should support configuring SWITCH_NETMASK and SWITCH_GATEWAY +# when the mode is in-band. + +my $debconf_owner = 'openvswitch-switch'; + +my $default = '/etc/default/openvswitch-switch'; +my $template = '/usr/share/openvswitch/switch/default.template'; +my $etc = '/etc/openvswitch-switch'; +my $rundir = '/var/run'; +my $privkey_file = "$etc/of0-privkey.pem"; +my $req_file = "$etc/of0-req.pem"; +my $cert_file = "$etc/of0-cert.pem"; +my $cacert_file = "$etc/cacert.pem"; +my $ovs_discover_pidfile = "$rundir/ovs-discover.pid"; + +my $ua = LWP::UserAgent->new; +$ua->timeout(10); +$ua->env_proxy; + +system("/etc/init.d/openvswitch-switch stop 1>&2"); +kill_ovs_discover(); + +version('2.0'); +capb('backup'); +title('Open vSwitch Switch Setup'); + +my (%netdevs) = find_netdevs(); +db_subst('netdevs', 'choices', + join(', ', map($netdevs{$_}, sort(keys(%netdevs))))); +db_set('netdevs', join(', ', grep(!/IP/, values(%netdevs)))); + +my %oldconfig; +if (-e $default) { + %oldconfig = load_config($default); + + my (%map) = + (NETDEVS => sub { + db_set('netdevs', join(', ', map($netdevs{$_}, + grep(exists $netdevs{$_}, split)))) + }, + MODE => sub { + db_set('mode', + $_ eq 'in-band' || $_ eq 'out-of-band' ? $_ : 'discovery') + }, + SWITCH_IP => sub { db_set('switch-ip', $_) }, + CONTROLLER => sub { db_set('controller-vconn', $_) }, + PRIVKEY => sub { $privkey_file = $_ }, + CERT => sub { $cert_file = $_ }, + CACERT => sub { $cacert_file = $_ }, + ); + + for my $key (keys(%map)) { + local $_ = $oldconfig{$key}; + &{$map{$key}}() if defined && !/^\s*$/; + } +} elsif (-e $template) { + %oldconfig = load_config($template); +} + +my $cacert_preverified = -e $cacert_file; +my ($req, $req_fingerprint); + +my %options; + +my (@states) = + (sub { + # User backed up from first dialog box. + exit(10); + }, + sub { + # Prompt for ports to include in switch. + db_input('netdevs'); + return; + }, + sub { + # Validate the chosen ports. + my (@netdevs) = split(', ', db_get('netdevs')); + if (!@netdevs) { + # No ports chosen. Disable switch. + db_input('no-netdevs'); + return 'prev' if db_go(); + return 'done'; + } elsif (my (@conf_netdevs) = grep(/IP/, @netdevs)) { + # Point out that some ports have configured IP addresses. + db_subst('configured-netdevs', 'configured-netdevs', + join(', ', @conf_netdevs)); + db_input('configured-netdevs'); + return; + } else { + # Otherwise proceed. + return 'skip'; + } + }, + sub { + # Discovery or in-band or out-of-band controller? + db_input('mode'); + return; + }, + sub { + return 'skip' if db_get('mode') ne 'discovery'; + for (;;) { + # Notify user that we are going to do discovery. + db_input('discover'); + return 'prev' if db_go(); + print STDERR "Please wait up to 30 seconds for discovery...\n"; + + # Make sure that there's no running discovery process. + kill_ovs_discover(); + + # Do discovery. + %options = (); + open(DISCOVER, '-|', 'ovs-discover --timeout=30 --pidfile ' + . join(' ', netdev_names())); + while () { + chomp; + if (my ($name, $value) = /^([^=]+)=(.*)$/) { + if ($value =~ /^"(.*)"$/) { + $value = $1; + $value =~ s/\\([0-7][0-7][0-7])/chr($1)/ge; + } else { + $value =~ s/^(0x[[:xdigit:]]+)$/hex($1)/e; + $value = '' if $value eq 'empty'; + next if $value eq 'null'; # Shouldn't happen. + } + $options{$name} = $value; + } + last if /^$/; + } + + # Check results. + my $vconn = $options{'ovs-controller-vconn'}; + my $pki_uri = $options{'ovs-pki-uri'}; + return 'next' + if (defined($vconn) + && is_valid_vconn($vconn) + && (!is_ssl_vconn($vconn) || defined($pki_uri))); + + # Try again? + kill_ovs_discover(); + db_input('discovery-failure'); + db_go(); + } + }, + sub { + return 'skip' if db_get('mode') ne 'discovery'; + + my $vconn = $options{'ovs-controller-vconn'}; + my $pki_uri = $options{'ovs-pki-uri'}; + db_subst('discovery-success', 'controller-vconn', $vconn); + db_subst('discovery-success', + 'pki-uri', is_ssl_vconn($vconn) ? $pki_uri : "no PKI in use"); + db_input('discovery-success'); + return 'prev' if db_go(); + db_set('controller-vconn', $vconn); + db_set('pki-uri', $pki_uri); + return 'next'; + }, + sub { + return 'skip' if db_get('mode') ne 'in-band'; + for (;;) { + db_input('switch-ip'); + return 'prev' if db_go(); + + my $ip = db_get('switch-ip'); + return 'next' if $ip =~ /^dhcp|\d+\.\d+.\d+.\d+$/i; + + db_input('switch-ip-error'); + db_go(); + } + }, + sub { + return 'skip' if db_get('mode') eq 'discovery'; + for (;;) { + my $old_vconn = db_get('controller-vconn'); + db_input('controller-vconn'); + return 'prev' if db_go(); + + my $vconn = db_get('controller-vconn'); + if (is_valid_vconn($vconn)) { + if ($old_vconn ne $vconn || db_get('pki-uri') eq '') { + db_set('pki-uri', pki_host_to_uri($2)); + } + return 'next'; + } + + db_input('controller-vconn-error'); + db_go(); + } + }, + sub { + return 'skip' if !ssl_enabled(); + + if (! -e $privkey_file) { + my $old_umask = umask(077); + run_cmd("ovs-pki req $etc/of0 >&2 2>/dev/null"); + chmod(0644, $req_file) or die "$req_file: chmod: $!\n"; + umask($old_umask); + } + + if (! -e $cert_file) { + open(REQ, '<', $req_file) or die "$req_file: open: $!\n"; + $req = join('', ); + close(REQ); + $req_fingerprint = sha1_hex($req); + } + return 'skip'; + }, + sub { + return 'skip' if !ssl_enabled(); + return 'skip' if -e $cacert_file && -e $cert_file; + + db_input('pki-uri'); + return 'prev' if db_go(); + return; + }, + sub { + return 'skip' if !ssl_enabled(); + return 'skip' if -e $cacert_file; + + my $pki_uri = db_get('pki-uri'); + if ($pki_uri !~ /:/) { + $pki_uri = pki_host_to_uri($pki_uri); + } else { + # Trim trailing slashes. + $pki_uri =~ s%/+$%%; + } + db_set('pki-uri', $pki_uri); + + my $url = "$pki_uri/controllerca/cacert.pem"; + my $response = $ua->get($url, ':content_file' => $cacert_file); + if ($response->is_success) { + return 'next'; + } + + db_subst('fetch-cacert-failed', 'url', $url); + db_subst('fetch-cacert-failed', 'error', $response->status_line); + db_subst('fetch-cacert-failed', 'pki-uri', $pki_uri); + db_input('fetch-cacert-failed'); + db_go(); + return 'prev'; + }, + sub { + return 'skip' if !ssl_enabled(); + return 'skip' if -e $cert_file; + + for (;;) { + db_set('send-cert-req', 'yes'); + db_input('send-cert-req'); + return 'prev' if db_go(); + return 'next' if db_get('send-cert-req') eq 'no'; + + my $pki_uri = db_get('pki-uri'); + my ($pki_base_uri) = $pki_uri =~ m%^([^/]+://[^/]+)/%; + my $url = "$pki_base_uri/cgi-bin/ovs-pki-cgi"; + my $response = $ua->post($url, {'type' => 'switch', + 'req' => $req}); + return 'next' if $response->is_success; + + db_subst('send-cert-req-failed', 'url', $url); + db_subst('send-cert-req-failed', 'error', + $response->status_line); + db_subst('send-cert-req-failed', 'pki-uri', $pki_uri); + db_input('send-cert-req-failed'); + db_go(); + } + }, + sub { + return 'skip' if !ssl_enabled(); + return 'skip' if $cacert_preverified; + + my ($cacert_fingerprint) = x509_fingerprint($cacert_file); + db_subst('verify-controller-ca', 'fingerprint', $cacert_fingerprint); + db_input('verify-controller-ca'); + return 'prev' if db_go(); + return 'next' if db_get('verify-controller-ca') eq 'yes'; + unlink($cacert_file); + return 'prev'; + }, + sub { + return 'skip' if !ssl_enabled(); + return 'skip' if -e $cert_file; + + for (;;) { + db_set('fetch-switch-cert', 'yes'); + db_input('fetch-switch-cert'); + return 'prev' if db_go(); + exit(1) if db_get('fetch-switch-cert') eq 'no'; + + my $pki_uri = db_get('pki-uri'); + my $url = "$pki_uri/switchca/certs/$req_fingerprint-cert.pem"; + my $response = $ua->get($url, ':content_file' => $cert_file); + if ($response->is_success) { + return 'next'; + } + + db_subst('fetch-switch-cert-failed', 'url', $url); + db_subst('fetch-switch-cert-failed', 'error', + $response->status_line); + db_subst('fetch-switch-cert-failed', 'pki-uri', $pki_uri); + db_input('fetch-switch-cert-failed'); + db_go(); + } + }, + sub { + db_input('complete'); + db_go(); + return; + }, + sub { + return 'done'; + }, +); + +my $state = 1; +my $direction = 1; +for (;;) { + my $ret = &{$states[$state]}(); + $ret = db_go() ? 'prev' : 'next' if !defined $ret; + if ($ret eq 'next') { + $direction = 1; + } elsif ($ret eq 'prev') { + $direction = -1; + } elsif ($ret eq 'skip') { + # Nothing to do. + } elsif ($ret eq 'done') { + last; + } else { + die "unknown ret $ret"; + } + $state += $direction; +} + +my %config = %oldconfig; +$config{NETDEVS} = join(' ', netdev_names()); +$config{MODE} = db_get('mode'); +if (db_get('mode') eq 'in-band') { + $config{SWITCH_IP} = db_get('switch-ip'); +} +if (db_get('mode') ne 'discovery') { + $config{CONTROLLER} = db_get('controller-vconn'); +} +$config{PRIVKEY} = $privkey_file; +$config{CERT} = $cert_file; +$config{CACERT} = $cacert_file; +save_config($default, %config); + +dup2(2, 1); # Get stdout back. +kill_ovs_discover(); +system("/etc/init.d/openvswitch-switch start"); + +sub ssl_enabled { + return is_ssl_vconn(db_get('controller-vconn')); +} + +sub db_subst { + my ($question, $key, $value) = @_; + $question = "$debconf_owner/$question"; + my ($ret, $seen) = subst($question, $key, $value); + if ($ret && $ret != 30) { + die "Error substituting $value for $key in debconf question " + . "$question: $seen"; + } +} + +sub db_set { + my ($question, $value) = @_; + $question = "$debconf_owner/$question"; + my ($ret, $seen) = set($question, $value); + if ($ret && $ret != 30) { + die "Error setting debconf question $question to $value: $seen"; + } +} + +sub db_get { + my ($question) = @_; + $question = "$debconf_owner/$question"; + my ($ret, $seen) = get($question); + if ($ret) { + die "Error getting debconf question $question answer: $seen"; + } + return $seen; +} + +sub db_fset { + my ($question, $flag, $value) = @_; + $question = "$debconf_owner/$question"; + my ($ret, $seen) = fset($question, $flag, $value); + if ($ret && $ret != 30) { + die "Error setting debconf question $question flag $flag to $value: " + . "$seen"; + } +} + +sub db_fget { + my ($question, $flag) = @_; + $question = "$debconf_owner/$question"; + my ($ret, $seen) = fget($question, $flag); + if ($ret) { + die "Error getting debconf question $question flag $flag: $seen"; + } + return $seen; +} + +sub db_input { + my ($question) = @_; + db_fset($question, "seen", "false"); + + $question = "$debconf_owner/$question"; + my ($ret, $seen) = input('high', $question); + if ($ret && $ret != 30) { + die "Error requesting debconf question $question: $seen"; + } + return $ret; +} + +sub db_go { + my ($ret, $seen) = go(); + if (!defined($ret)) { + exit(1); # Cancel button was pushed. + } + if ($ret && $ret != 30) { + die "Error asking debconf questions: $seen"; + } + return $ret; +} + +sub run_cmd { + my ($cmd) = @_; + return if system($cmd) == 0; + + if ($? == -1) { + die "$cmd: failed to execute: $!\n"; + } elsif ($? & 127) { + die sprintf("$cmd: child died with signal %d, %s coredump\n", + ($? & 127), ($? & 128) ? 'with' : 'without'); + } else { + die sprintf("$cmd: child exited with value %d\n", $? >> 8); + } +} + +sub x509_fingerprint { + my ($file) = @_; + my $cmd = "openssl x509 -noout -in $file -fingerprint"; + open(OPENSSL, '-|', $cmd) or die "$cmd: failed to execute: $!\n"; + my $line = ; + close(OPENSSL); + my ($fingerprint) = $line =~ /SHA1 Fingerprint=(.*)/; + return $line if !defined $fingerprint; + $fingerprint =~ s/://g; + return $fingerprint; +} + +sub find_netdevs { + my ($netdev, %netdevs); + open(IFCONFIG, "/sbin/ifconfig -a|") or die "ifconfig failed: $!"; + while () { + if (my ($nd) = /^([^\s]+)/) { + $netdev = $nd; + $netdevs{$netdev} = "$netdev"; + if (my ($hwaddr) = /HWaddr (\S+)/) { + $netdevs{$netdev} .= " (MAC: $hwaddr)"; + } + } elsif (my ($ip4) = /^\s*inet addr:(\S+)/) { + $netdevs{$netdev} .= " (IP: $ip4)"; + } elsif (my ($ip6) = /^\s*inet6 addr:(\S+)/) { + $netdevs{$netdev} .= " (IPv6: $ip6)"; + } + } + foreach my $nd (keys(%netdevs)) { + delete $netdevs{$nd} if $nd eq 'lo' || $nd =~ /^wmaster/; + } + close(IFCONFIG); + return %netdevs; +} + +sub load_config { + my ($file) = @_; + + # Get the list of the variables that the shell sets automatically. + my (%auto_vars) = read_vars("set -a && env"); + + # Get the variables from $default. + my (%config) = read_vars("set -a && . '$default' && env"); + + # Subtract. + delete @config{keys %auto_vars}; + + return %config; +} + +sub read_vars { + my ($cmd) = @_; + local @ENV; + if (!open(VARS, '-|', $cmd)) { + print STDERR "$cmd: failed to execute: $!\n"; + return (); + } + my (%config); + while () { + my ($var, $value) = /^([^=]+)=(.*)$/ or next; + $config{$var} = $value; + } + close(VARS); + return %config; +} + +sub shell_escape { + local $_ = $_[0]; + if ($_ eq '') { + return '""'; + } elsif (m&^[-a-zA-Z0-9:./%^_+,]*$&) { + return $_; + } else { + s/'/'\\''/; + return "'$_'"; + } +} + +sub shell_assign { + my ($var, $value) = @_; + return $var . '=' . shell_escape($value); +} + +sub save_config { + my ($file, %config) = @_; + my (@lines); + if (open(FILE, '<', $file)) { + @lines = ; + chomp @lines; + close(FILE); + } + + # Replace all existing variable assignments. + for (my ($i) = 0; $i <= $#lines; $i++) { + local $_ = $lines[$i]; + my ($var, $value) = /^\s*([^=#]+)=(.*)$/ or next; + if (exists($config{$var})) { + $lines[$i] = shell_assign($var, $config{$var}); + delete $config{$var}; + } else { + $lines[$i] = "#$lines[$i]"; + } + } + + # Find a place to put any remaining variable assignments. + VAR: + for my $var (keys(%config)) { + my $assign = shell_assign($var, $config{$var}); + + # Replace the last commented-out variable assignment to $var, if any. + for (my ($i) = $#lines; $i >= 0; $i--) { + local $_ = $lines[$i]; + if (/^\s*#\s*$var=/) { + $lines[$i] = $assign; + next VAR; + } + } + + # Find a place to add the var: after the final commented line + # just after a line that contains "$var:". + for (my ($i) = 0; $i <= $#lines; $i++) { + if ($lines[$i] =~ /^\s*#\s*$var:/) { + for (my ($j) = $i + 1; $j <= $#lines; $j++) { + if ($lines[$j] !~ /^\s*#/) { + splice(@lines, $j, 0, $assign); + next VAR; + } + } + } + } + + # Just append it. + push(@lines, $assign); + } + + open(NEWFILE, '>', "$file.tmp") or die "$file.tmp: create: $!\n"; + print NEWFILE join('', map("$_\n", @lines)); + close(NEWFILE); + rename("$file.tmp", $file) or die "$file.tmp: rename to $file: $!\n"; +} + +sub pki_host_to_uri { + my ($pki_host) = @_; + return "http://$pki_host/openvswitch/pki"; +} + +sub kill_ovs_discover { + # Delegate this to a subprocess because there is no portable way + # to invoke fcntl(F_GETLK) from Perl. + system("ovs-kill --force $ovs_discover_pidfile"); +} + +sub netdev_names { + return map(/^(\S+)/, split(', ', db_get('netdevs'))); +} + +sub is_valid_vconn { + my ($vconn) = @_; + return scalar($vconn =~ /^(tcp|ssl):([^:]+)(:.*)?/); +} + +sub is_ssl_vconn { + my ($vconn) = @_; + return scalar($vconn =~ /^ssl:/); +} diff --git a/debian/ovs-switch-setup.8 b/debian/ovs-switch-setup.8 new file mode 100644 index 00000000..696ad365 --- /dev/null +++ b/debian/ovs-switch-setup.8 @@ -0,0 +1,41 @@ +.TH ovs-switch-setup 8 "June 2008" "Open vSwitch" "Open vSwitch Manual" + +.SH NAME +ovs\-switch\-setup \- interactive setup for Open vSwitch switch + +.SH SYNOPSIS +.B ovs\-switch\-setup + +.SH DESCRIPTION +The \fBovs\-switch\-setup\fR program is an interactive program that +assists the system administrator in configuring an Open vSwitch switch, +including the underlying public key infrastructure (PKI). + +.SH OPTIONS +ovs\-switch\-setup does not accept any command-line options. + +.SH FILES +.IP /etc/default/openvswitch-switch +Main configuration file for Open vSwitch switch. + +.IP /etc/openvswitch-switch/cacert.pem +Default location of CA certificate for OpenFlow controllers. + +.IP /etc/openvswitch-switch/of0-cert.pem +Default location of certificate for the Open vSwitch switch's private key. + +.IP /etc/openvswitch-switch/of0-privkey.pem +Default location of the Open vSwitch switch's private key. This file +should be readable only by \fBroot\fR. + +.IP /etc/openvswitch-switch/of0-req.pem +Default location of certificate request for the Open vSwitch switch's +certificate. This file is not used after the signed certificate +(typically \fB/etc/openvswitch-switch/of0-cert.pem\fR, above) has been +obtained from the OpenFlow PKI server. + +.SH "SEE ALSO" + +.BR ovs\-dpctl (8), +.BR ovs-pki (8), +.BR secchan (8) diff --git a/debian/po/POTFILES.in b/debian/po/POTFILES.in new file mode 100644 index 00000000..865bf94c --- /dev/null +++ b/debian/po/POTFILES.in @@ -0,0 +1 @@ +[type: gettext/rfc822deb] openvswitch-switch-config.templates diff --git a/debian/po/templates.pot b/debian/po/templates.pot new file mode 100644 index 00000000..119e5587 --- /dev/null +++ b/debian/po/templates.pot @@ -0,0 +1,522 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER +# This file is distributed under the same license as the PACKAGE package. +# FIRST AUTHOR , YEAR. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"Report-Msgid-Bugs-To: ovs-dev@openvswitch.org\n" +"POT-Creation-Date: 2009-05-11 13:38-0700\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=CHARSET\n" +"Content-Transfer-Encoding: 8bit\n" + +#. Type: multiselect +#. Choices +#: ../openvswitch-switch-config.templates:1001 +msgid "${choices}" +msgstr "" + +#. Type: multiselect +#. Description +#: ../openvswitch-switch-config.templates:1002 +msgid "OpenFlow switch network devices:" +msgstr "" + +#. Type: multiselect +#. Description +#: ../openvswitch-switch-config.templates:1002 +msgid "" +"Choose the network devices that should become part of the OpenFlow switch. " +"At least two devices must be selected for this machine to be a useful " +"switch. Unselecting all network devices will disable the OpenFlow switch " +"entirely." +msgstr "" + +#. Type: multiselect +#. Description +#: ../openvswitch-switch-config.templates:1002 +msgid "" +"The network devices that you select should not be configured with IP or IPv6 " +"addresses, even if the switch contacts the controller over one of the " +"selected network devices. This is because a running OpenFlow switch takes " +"over network devices at a low level: they become part of the switch and " +"cannot be used for other purposes." +msgstr "" + +#. Type: error +#. Description +#: ../openvswitch-switch-config.templates:2001 +msgid "No network devices were selected." +msgstr "" + +#. Type: error +#. Description +#: ../openvswitch-switch-config.templates:2001 +msgid "" +"No network devices were selected for inclusion in the OpenFlow switch. The " +"switch will be disabled." +msgstr "" + +#. Type: note +#. Description +#: ../openvswitch-switch-config.templates:3001 +msgid "Some Network Devices Have IP or IPv6 Addresses" +msgstr "" + +#. Type: note +#. Description +#: ../openvswitch-switch-config.templates:3001 +msgid "" +"The following network devices selected to be part of the OpenFlow switch " +"have IP or IPv6 addresses configured:" +msgstr "" + +#. Type: note +#. Description +#: ../openvswitch-switch-config.templates:3001 +msgid "${configured-netdevs}" +msgstr "" + +#. Type: note +#. Description +#: ../openvswitch-switch-config.templates:3001 +msgid "" +"This is usually a mistake, even if the switch contacts the controller over " +"one of the selected network devices. This is because a running OpenFlow " +"switch takes over network devices at a low level: they become part of the " +"switch and cannot be used for other purposes." +msgstr "" + +#. Type: note +#. Description +#: ../openvswitch-switch-config.templates:3001 +msgid "" +"If this is an unintentional mistake, move back and fix the selection, or de-" +"configure the IP or IPv6 from these network devices." +msgstr "" + +#. Type: select +#. Choices +#: ../openvswitch-switch-config.templates:4001 +msgid "discovery, in-band, out-of-band" +msgstr "" + +#. Type: select +#. Description +#: ../openvswitch-switch-config.templates:4002 +msgid "Switch-to-controller access method:" +msgstr "" + +#. Type: select +#. Description +#: ../openvswitch-switch-config.templates:4002 +msgid "" +"The OpenFlow switch must be able to contact the OpenFlow controller over the " +"network. It can do so in one of three ways:" +msgstr "" + +#. Type: select +#. Description +#: ../openvswitch-switch-config.templates:4002 +msgid "" +"discovery: A single network is used for OpenFlow traffic and other data " +"traffic; that is, the switch contacts the controller over one of the network " +"devices selected as OpenFlow switch network devices in the previous " +"question. The switch automatically determines the location of the " +"controller using a DHCP request with an OpenFlow-specific vendor option. " +"This is the most common case." +msgstr "" + +#. Type: select +#. Description +#: ../openvswitch-switch-config.templates:4002 +msgid "" +"in-band: As above, but the location of the controller is manually configured." +msgstr "" + +#. Type: select +#. Description +#: ../openvswitch-switch-config.templates:4002 +msgid "" +"out-of-band: OpenFlow traffic uses a network separate from the data traffic " +"that it controls. If this is the case, the control network must already be " +"configured on a network device other than one of those selected as an " +"OpenFlow switch netdev in the previous question." +msgstr "" + +#. Type: note +#. Description +#: ../openvswitch-switch-config.templates:5001 +msgid "Preparing to discover controller." +msgstr "" + +#. Type: note +#. Description +#: ../openvswitch-switch-config.templates:5001 +msgid "" +"The setup program will now attempt to discover the OpenFlow controller. " +"Controller discovery may take up to 30 seconds. Please be patient." +msgstr "" + +#. Type: note +#. Description +#: ../openvswitch-switch-config.templates:5001 +msgid "" +"See secchan(8) for instructions on how to configure a DHCP server for " +"controller discovery." +msgstr "" + +#. Type: error +#. Description +#: ../openvswitch-switch-config.templates:6001 +msgid "Controller discovery failed." +msgstr "" + +#. Type: error +#. Description +#: ../openvswitch-switch-config.templates:6001 +msgid "The controller's location could not be determined automatically." +msgstr "" + +#. Type: error +#. Description +#: ../openvswitch-switch-config.templates:6001 +msgid "" +"Ensure that the OpenFlow DHCP server is properly configured. See secchan(8) " +"for instructions on how to configure a DHCP server for controller discovery." +msgstr "" + +#. Type: boolean +#. Description +#: ../openvswitch-switch-config.templates:7001 +msgid "Use discovered settings?" +msgstr "" + +#. Type: boolean +#. Description +#: ../openvswitch-switch-config.templates:7001 +msgid "Controller discovery obtained the following settings:" +msgstr "" + +#. Type: boolean +#. Description +#: ../openvswitch-switch-config.templates:7001 +msgid "Controller location: ${controller-vconn}" +msgstr "" + +#. Type: boolean +#. Description +#: ../openvswitch-switch-config.templates:7001 +msgid "PKI URL: ${pki-uri}" +msgstr "" + +#. Type: boolean +#. Description +#: ../openvswitch-switch-config.templates:7001 +msgid "Please verify that these settings are correct." +msgstr "" + +#. Type: string +#. Description +#: ../openvswitch-switch-config.templates:8001 +msgid "Switch IP address:" +msgstr "" + +#. Type: string +#. Description +#: ../openvswitch-switch-config.templates:8001 +msgid "" +"For in-band communication with the controller, the OpenFlow switch must be " +"able to determine its own IP address. Its IP address may be configured " +"statically or dynamically." +msgstr "" + +#. Type: string +#. Description +#: ../openvswitch-switch-config.templates:8001 +msgid "For static configuration, specify the switch's IP address as a string." +msgstr "" + +#. Type: string +#. Description +#: ../openvswitch-switch-config.templates:8001 +msgid "" +"For dynamic configuration with DHCP (the most common case), specify \"dhcp" +"\". Configuration with DHCP will only work reliably if the network topology " +"allows the switch to contact the DHCP server before it connects to the " +"OpenFlow controller." +msgstr "" + +#. Type: error +#. Description +#: ../openvswitch-switch-config.templates:9001 +msgid "The switch IP address is invalid." +msgstr "" + +#. Type: error +#. Description +#: ../openvswitch-switch-config.templates:9001 +msgid "" +"The switch IP address must specified as \"dhcp\" or a valid IP address in " +"dotted-octet form (e.g. \"1.2.3.4\")." +msgstr "" + +#. Type: string +#. Description +#: ../openvswitch-switch-config.templates:10001 +msgid "Controller location:" +msgstr "" + +#. Type: string +#. Description +#: ../openvswitch-switch-config.templates:10001 +msgid "" +"Specify how the OpenFlow switch should connect to the OpenFlow controller. " +"The value should be in form \"ssl:HOST[:PORT]\" to connect to the controller " +"over SSL (recommended for security) or \"tcp:HOST[:PORT]\" to connect over " +"cleartext TCP." +msgstr "" + +#. Type: error +#. Description +#: ../openvswitch-switch-config.templates:11001 +msgid "The controller location is invalid." +msgstr "" + +#. Type: error +#. Description +#: ../openvswitch-switch-config.templates:11001 +msgid "" +"The controller location must be specifed as \"ssl:HOST[:PORT]\" to connect " +"to the controller over SSL (recommended for security) or \"tcp:HOST[:PORT]\" " +"to connect over cleartext TCP." +msgstr "" + +#. Type: string +#. Description +#: ../openvswitch-switch-config.templates:12001 +msgid "OpenFlow PKI server host name or URL:" +msgstr "" + +#. Type: string +#. Description +#: ../openvswitch-switch-config.templates:12001 +msgid "" +"Specify a URL to the OpenFlow public key infrastructure (PKI). If a host " +"name or IP address is specified in place of a URL, then http:///" +"openvswitch/pki/ will be used, where is the specified host name or IP " +"address." +msgstr "" + +#. Type: string +#. Description +#: ../openvswitch-switch-config.templates:12001 +msgid "" +"The OpenFlow PKI is usually on the same machine as the OpenFlow controller." +msgstr "" + +#. Type: string +#. Description +#: ../openvswitch-switch-config.templates:12001 +msgid "" +"The setup process will connect to the OpenFlow PKI server over HTTP, using " +"the system's configured default HTTP proxy (if any)." +msgstr "" + +#. Type: error +#. Description +#: ../openvswitch-switch-config.templates:13001 +msgid "The switch CA certificate could not be retrieved." +msgstr "" + +#. Type: error +#. Description +#: ../openvswitch-switch-config.templates:13001 +msgid "Retrieval of ${url} failed, with the following status: \"${error}\"." +msgstr "" + +#. Type: error +#. Description +#: ../openvswitch-switch-config.templates:13001 +msgid "" +"Ensure that the OpenFlow PKI server is correctly configured and available at " +"${pki-uri}. If the system is configured to use an HTTP proxy, also make " +"sure that the HTTP proxy is available and that the PKI server can be reached " +"through it." +msgstr "" + +#. Type: select +#. Choices +#. Type: select +#. Choices +#. Type: select +#. Choices +#: ../openvswitch-switch-config.templates:14001 +#: ../openvswitch-switch-config.templates:15001 +#: ../openvswitch-switch-config.templates:17001 +msgid "yes, no" +msgstr "" + +#. Type: select +#. Description +#: ../openvswitch-switch-config.templates:14002 +msgid "Is ${fingerprint} the controller CA's fingerprint?" +msgstr "" + +#. Type: select +#. Description +#: ../openvswitch-switch-config.templates:14002 +msgid "" +"If a man-in-the-middle attack is possible in your network environment, check " +"that the controller CA's fingerprint is really ${fingerprint}. Answer \"yes" +"\" if it matches, \"no\" if there is a discrepancy." +msgstr "" + +#. Type: select +#. Description +#: ../openvswitch-switch-config.templates:14002 +msgid "" +"If a man-in-the-middle attack is not a concern, there is no need to verify " +"the fingerprint. Simply answer \"yes\"." +msgstr "" + +#. Type: select +#. Description +#: ../openvswitch-switch-config.templates:15002 +msgid "Send certificate request to switch CA?" +msgstr "" + +#. Type: select +#. Description +#: ../openvswitch-switch-config.templates:15002 +msgid "" +"Before it can connect to the controller over SSL, the OpenFlow switch's key " +"must be signed by the switch certificate authority (CA) located on the " +"OpenFlow PKI server, which is usually collocated with the OpenFlow " +"controller. A signing request can be sent to the PKI server now." +msgstr "" + +#. Type: select +#. Description +#: ../openvswitch-switch-config.templates:15002 +msgid "" +"Answer \"yes\" to send a signing request to the switch CA now. This is " +"ordinarily the correct choice. There is no harm in sending a given signing " +"request more than once." +msgstr "" + +#. Type: select +#. Description +#: ../openvswitch-switch-config.templates:15002 +msgid "" +"Answer \"no\" to skip sending a signing request to the switch CA. Unless the " +"request has already been sent to the switch CA, manual sending of the " +"request and signing will be necessary." +msgstr "" + +#. Type: error +#. Description +#: ../openvswitch-switch-config.templates:16001 +msgid "The certificate request could not be sent." +msgstr "" + +#. Type: error +#. Description +#: ../openvswitch-switch-config.templates:16001 +msgid "Posting to ${url} failed, with the following status: \"${error}\"." +msgstr "" + +#. Type: error +#. Description +#: ../openvswitch-switch-config.templates:16001 +msgid "" +"Ensure that the OpenFlow PKI server is correctly configured and available at " +"${pki-uri}." +msgstr "" + +#. Type: select +#. Description +#: ../openvswitch-switch-config.templates:17002 +msgid "Fetch signed switch certificate from PKI server?" +msgstr "" + +#. Type: select +#. Description +#: ../openvswitch-switch-config.templates:17002 +msgid "" +"Before it can connect to the controller over SSL, the OpenFlow switch's key " +"must be signed by the switch certificate authority (CA) located on the " +"OpenFlow PKI server, which is usually collocated with the OpenFlow " +"controller." +msgstr "" + +#. Type: select +#. Description +#: ../openvswitch-switch-config.templates:17002 +msgid "" +"At this point, a signing request has been sent to the switch CA (or sending " +"a request has been manually skipped), but the signed certificate has not yet " +"been retrieved. Manual action may need to be taken at the PKI server to " +"approve the signing request." +msgstr "" + +#. Type: select +#. Description +#: ../openvswitch-switch-config.templates:17002 +msgid "" +"Answer \"yes\" to attempt to retrieve the signed switch certificate from the " +"switch CA. If the switch certificate request has been signed at the PKI " +"server, this is the correct choice." +msgstr "" + +#. Type: select +#. Description +#: ../openvswitch-switch-config.templates:17002 +msgid "" +"Answer \"no\" to postpone switch configuration. The configuration process " +"must be restarted later, when the switch certificate request has been signed." +msgstr "" + +#. Type: error +#. Description +#: ../openvswitch-switch-config.templates:18001 +msgid "Signed switch certificate could not be retrieved." +msgstr "" + +#. Type: error +#. Description +#: ../openvswitch-switch-config.templates:18001 +msgid "" +"The signed switch certificate could not be retrieved from the switch CA: " +"retrieval of ${url} failed, with the following status: \"${error}\"." +msgstr "" + +#. Type: error +#. Description +#: ../openvswitch-switch-config.templates:18001 +msgid "" +"This probably indicates that the switch's certificate request has not yet " +"been signed. If this is the problem, it may be fixed by signing the " +"certificate request at ${pki-uri}, then trying to fetch the signed switch " +"certificate again." +msgstr "" + +#. Type: note +#. Description +#: ../openvswitch-switch-config.templates:19001 +msgid "OpenFlow Switch Setup Finished" +msgstr "" + +#. Type: note +#. Description +#: ../openvswitch-switch-config.templates:19001 +msgid "" +"Setup of this OpenFlow switch is finished. Complete the setup procedure to " +"enable the switch." +msgstr "" diff --git a/debian/rules b/debian/rules new file mode 100755 index 00000000..707fe8b7 --- /dev/null +++ b/debian/rules @@ -0,0 +1,145 @@ +#!/usr/bin/make -f +# -*- makefile -*- +# Sample debian/rules that uses debhelper. +# +# This file was originally written by Joey Hess and Craig Small. +# As a special exception, when this file is copied by dh-make into a +# dh-make output file, you may use that output file without restriction. +# This special exception was added by Craig Small in version 0.37 of dh-make. +# +# Modified to make a template file for a multi-binary package with separated +# build-arch and build-indep targets by Bill Allombert 2001 + +# Uncomment this to turn on verbose mode. +#export DH_VERBOSE=1 + +# This has to be exported to make some magic below work. +export DH_OPTIONS + +# prefix of the target package name +PACKAGE=openvswitch-datapath-module +# modifieable for experiments or debugging m-a +MA_DIR ?= /usr/share/modass +# load generic variable handling +-include $(MA_DIR)/include/generic.make +# load default rules +-include $(MA_DIR)/include/common-rules.make + +DATAPATH_CONFIGURE_OPTS = + +# Official build number. Leave set to 0 if not an official build. +BUILD_NUMBER = 0 + +configure: configure-stamp +configure-stamp: + dh_testdir + test -e configure || ./boot.sh + test -d _debian || mkdir _debian + cd _debian && ( \ + test -e Makefile || \ + ../configure --prefix=/usr --localstatedir=/var --enable-ssl \ + --with-build-number=$(BUILD_NUMBER) \ + $(DATAPATH_CONFIGURE_OPTS)) + touch configure-stamp + +#Architecture +build: build-arch build-indep + +build-arch: build-arch-stamp +build-arch-stamp: configure-stamp + $(MAKE) -C _debian + touch $@ + +build-indep: build-indep-stamp +build-indep-stamp: configure-stamp + $(MAKE) -C _debian dist distdir=openvswitch + touch $@ + +clean: + dh_testdir + dh_testroot + rm -f build-arch-stamp build-indep-stamp configure-stamp + rm -rf _debian + [ ! -f Makefile ] || $(MAKE) distclean + dh_clean + debconf-updatepo + +kdist_clean: + dh_clean + rm -rf openvswitch + +kdist_config: prep-deb-files + +binary-modules: DSTDIR = $(CURDIR)/debian/$(PKGNAME)/lib/modules/$(KVERS) +binary-modules: prep-deb-files + dh_testdir + dh_testroot + dh_clean -k + tar xzf openvswitch.tar.gz + cd openvswitch && ./configure --with-l26=$(KSRC) $(DATAPATH_CONFIGURE_OPTS) --with-build-number=$(BUILD_NUMBER) + cd openvswitch && $(MAKE) -C datapath/linux-2.6 + install -d -m755 $(DSTDIR) + install -m644 openvswitch/datapath/linux-2.6/*_mod.ko $(DSTDIR)/ + dh_installdocs + dh_installchangelogs + dh_compress + dh_fixperms + dh_installdeb + dh_gencontrol + dh_md5sums + dh_builddeb --destdir=$(DEB_DESTDIR) + +install: install-indep install-arch +install-indep: build-indep + dh_testdir + dh_testroot + dh_clean -k -i + dh_installdirs -i + dh_install -i + cd debian/openvswitch-datapath-source/usr/src && tar -c modules | bzip2 -9 > openvswitch-datapath.tar.bz2 && rm -rf modules + install -m644 debian/openvswitch-pki-server.apache2 debian/openvswitch-pki-server/etc/apache2/sites-available/openvswitch-pki + install -m1777 -d debian/corekeeper/var/log/core + +install-arch: build-arch + dh_testdir + dh_testroot + dh_clean -k -s + dh_installdirs -s + $(MAKE) -C _debian DESTDIR=$(CURDIR)/debian/openvswitch install + cp debian/openvswitch-switch-config.overrides debian/openvswitch-switch-config/usr/share/lintian/overrides/openvswitch-switch-config + cp debian/openvswitch-switch.template debian/openvswitch-switch/usr/share/openvswitch/switch/default.template + dh_install -s + env TERMINFO=debian/openvswitch-switchui/usr/share/terminfo tic -x extras/ezio/ezio3.ti + +# Must not depend on anything. This is to be called by +# binary-arch/binary-indep +# in another 'make' thread. +binary-common: + dh_testdir + dh_testroot + dh_installchangelogs + dh_installdocs + dh_installexamples + dh_installdebconf + dh_installlogrotate + dh_installinit + dh_installcron + dh_installman + dh_link + dh_strip --dbg-package=openvswitch-dbg + dh_compress + dh_fixperms -X var/log/core + dh_perl + dh_makeshlibs + dh_installdeb + dh_shlibdeps + dh_gencontrol + dh_md5sums + dh_builddeb +binary-indep: install-indep + $(MAKE) -f debian/rules DH_OPTIONS=-i binary-common +binary-arch: install-arch + $(MAKE) -f debian/rules DH_OPTIONS=-s binary-common + +binary: binary-arch binary-indep +.PHONY: build clean binary-indep binary-arch binary install install-indep install-arch configure diff --git a/extras/ezio/automake.mk b/extras/ezio/automake.mk new file mode 100644 index 00000000..2aeaa644 --- /dev/null +++ b/extras/ezio/automake.mk @@ -0,0 +1,49 @@ +# Copyright (C) 2008, 2009 Nicira Networks, Inc. +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty provided the copyright +# notice and this notice are preserved. This file is offered as-is, +# without warranty of any kind. + +EXTRA_DIST += extras/ezio/ezio3.ti +install-data-hook: + @echo tic -x $(srcdir)/extras/ezio/ezio3.ti + @if ! tic -x $(srcdir)/extras/ezio/ezio3.ti; then \ + echo "-----------------------------------------------------------"; \ + echo "Failed to install ezio3 terminfo file. The ezio-term"; \ + echo "program will not work until it has been installed."; \ + echo "Probably, you need to install the 'tic' program from"; \ + echo "ncurses, e.g. using a command like:"; \ + echo " apt-get install ncurses-bin"; \ + echo "and then re-run \"make install\""; \ + echo "-----------------------------------------------------------"; \ + exit 1; \ + fi + +bin_PROGRAMS += extras/ezio/ezio-term +extras_ezio_ezio_term_SOURCES = \ + extras/ezio/byteq.c \ + extras/ezio/byteq.h \ + extras/ezio/ezio-term.c \ + extras/ezio/ezio.c \ + extras/ezio/ezio.h \ + extras/ezio/terminal.c \ + extras/ezio/terminal.h \ + extras/ezio/tty.c \ + extras/ezio/tty.h \ + extras/ezio/vt.h +if HAVE_LINUX_VT_H +extras_ezio_ezio_term_SOURCES += extras/ezio/vt-linux.c +else +extras_ezio_ezio_term_SOURCES += extras/ezio/vt-dummy.c +endif +extras_ezio_ezio_term_LDADD = lib/libopenvswitch.a $(NCURSES_LIBS) + +bin_PROGRAMS += extras/ezio/ovs-switchui +extras_ezio_ovs_switchui_SOURCES = extras/ezio/ovs-switchui.c +extras_ezio_ovs_switchui_LDADD = \ + lib/libopenvswitch.a \ + $(NCURSES_LIBS) \ + $(PCRE_LIBS) \ + $(SSL_LIBS) \ + -lm diff --git a/extras/ezio/byteq.c b/extras/ezio/byteq.c new file mode 100644 index 00000000..31d48aad --- /dev/null +++ b/extras/ezio/byteq.c @@ -0,0 +1,216 @@ +/* Copyright (c) 2008, 2009 Nicira Networks, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + * + */ + +#include +#include "extras/ezio/byteq.h" +#include +#include +#include +#include +#include "util.h" + +/* The queue size must be a power of 2. */ +BUILD_ASSERT_DECL(!(BYTEQ_SIZE & (BYTEQ_SIZE - 1))); + +static uint8_t *head(struct byteq *); +static int headroom(const struct byteq *); +static void advance_head(struct byteq *, unsigned int n); +static int tailroom(const struct byteq *); +static const uint8_t *tail(const struct byteq *); +static void advance_tail(struct byteq *, unsigned int n); + +/* Initializes 'q' as empty. */ +void +byteq_init(struct byteq *q) +{ + q->head = q->tail = 0; +} + +/* Returns the number of bytes current queued in 'q'. */ +int +byteq_used(const struct byteq *q) +{ + return q->head - q->tail; +} + +/* Returns the number of bytes that can be added to 'q' without overflow. */ +int +byteq_avail(const struct byteq *q) +{ + return BYTEQ_SIZE - byteq_used(q); +} + +/* Returns true if no bytes are queued in 'q', + * false if at least one byte is queued. */ +bool +byteq_is_empty(const struct byteq *q) +{ + return !byteq_used(q); +} + +/* Returns true if 'q' has no room to queue additional bytes, + * false if 'q' has room for at least one more byte. */ +bool +byteq_is_full(const struct byteq *q) +{ + return !byteq_avail(q); +} + +/* Adds 'c' at the head of 'q', which must not be full. */ +void +byteq_put(struct byteq *q, uint8_t c) +{ + assert(!byteq_is_full(q)); + *head(q) = c; + q->head++; +} + +/* Adds the 'n' bytes in 'p' at the head of 'q', which must have at least 'n' + * bytes of free space. */ +void +byteq_putn(struct byteq *q, const void *p_, size_t n) +{ + const uint8_t *p = p_; + assert(byteq_avail(q) >= n); + while (n > 0) { + size_t chunk = MIN(n, headroom(q)); + memcpy(head(q), p, chunk); + advance_head(q, chunk); + p += chunk; + n -= chunk; + } +} + +/* Appends null-terminated string 's' to the head of 'q', which must have + * enough space. The null terminator is not added to 'q'. */ +void +byteq_put_string(struct byteq *q, const char *s) +{ + byteq_putn(q, s, strlen(s)); +} + +/* Removes a byte from the tail of 'q' and returns it. 'q' must not be + * empty. */ +uint8_t +byteq_get(struct byteq *q) +{ + uint8_t c; + assert(!byteq_is_empty(q)); + c = *tail(q); + q->tail++; + return c; +} + +/* Writes as much of 'q' as possible to 'fd'. Returns 0 if 'q' is fully + * drained by the write, otherwise a positive errno value (e.g. EAGAIN if a + * socket or tty buffer filled up). */ +int +byteq_write(struct byteq *q, int fd) +{ + while (!byteq_is_empty(q)) { + ssize_t n = write(fd, tail(q), tailroom(q)); + if (n > 0) { + advance_tail(q, n); + } else { + assert(n < 0); + return errno; + } + } + return 0; +} + +/* Reads as much possible from 'fd' into 'q'. Returns 0 if 'q' is completely + * filled up by the read, EOF if end-of-file was reached before 'q' was filled, + * and otherwise a positive errno value (e.g. EAGAIN if a socket or tty buffer + * was drained). */ +int +byteq_read(struct byteq *q, int fd) +{ + while (!byteq_is_full(q)) { + ssize_t n = read(fd, head(q), headroom(q)); + if (n > 0) { + advance_head(q, n); + } else { + return !n ? EOF : errno; + } + } + return 0; +} + +/* Returns the number of contiguous bytes of in-use space starting at the tail + * of 'q'. */ +static int +tailroom(const struct byteq *q) +{ + int used = byteq_used(q); + int tail_to_end = BYTEQ_SIZE - (q->tail & (BYTEQ_SIZE - 1)); + return MIN(used, tail_to_end); +} + +/* Returns the first in-use byte of 'q', the point at which data is removed + * from 'q'. */ +static const uint8_t * +tail(const struct byteq *q) +{ + return &q->buffer[q->tail & (BYTEQ_SIZE - 1)]; +} + +/* Removes 'n' bytes from the tail of 'q', which must have at least 'n' bytes + * of tailroom. */ +static void +advance_tail(struct byteq *q, unsigned int n) +{ + assert(tailroom(q) >= n); + q->tail += n; +} + +/* Returns the byte after the last in-use byte of 'q', the point at which new + * data will be added to 'q'. */ +static uint8_t * +head(struct byteq *q) +{ + return &q->buffer[q->head & (BYTEQ_SIZE - 1)]; +} + +/* Returns the number of contiguous bytes of free space starting at the head + * of 'q'. */ +static int +headroom(const struct byteq *q) +{ + int avail = byteq_avail(q); + int head_to_end = BYTEQ_SIZE - (q->head & (BYTEQ_SIZE - 1)); + return MIN(avail, head_to_end); +} + +/* Adds to 'q' the 'n' bytes after the last currently in-use byte of 'q'. 'q' + * must have at least 'n' bytes of headroom. */ +static void +advance_head(struct byteq *q, unsigned int n) +{ + assert(headroom(q) >= n); + q->head += n; +} diff --git a/extras/ezio/byteq.h b/extras/ezio/byteq.h new file mode 100644 index 00000000..4397f6aa --- /dev/null +++ b/extras/ezio/byteq.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2008, 2009 Nicira Networks, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + * + */ + +#ifndef BYTEQ_H +#define BYTEQ_H 1 + +#include +#include +#include + +/* Maximum number of bytes in a byteq. */ +#define BYTEQ_SIZE 512 + +/* General-purpose circular queue of bytes. */ +struct byteq { + uint8_t buffer[BYTEQ_SIZE]; /* Circular queue. */ + unsigned int head; /* Head of queue. */ + unsigned int tail; /* Chases the head. */ +}; + +void byteq_init(struct byteq *); +int byteq_used(const struct byteq *); +int byteq_avail(const struct byteq *); +bool byteq_is_empty(const struct byteq *); +bool byteq_is_full(const struct byteq *); +void byteq_put(struct byteq *, uint8_t c); +void byteq_putn(struct byteq *, const void *, size_t n); +void byteq_put_string(struct byteq *, const char *); +uint8_t byteq_get(struct byteq *); +int byteq_write(struct byteq *, int fd); +int byteq_read(struct byteq *, int fd); + +#endif /* byteq.h */ diff --git a/extras/ezio/ezio-term.c b/extras/ezio/ezio-term.c new file mode 100644 index 00000000..c2177add --- /dev/null +++ b/extras/ezio/ezio-term.c @@ -0,0 +1,1060 @@ +/* Copyright (c) 2008, 2009 Nicira Networks, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + * + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "command-line.h" +#include "extras/ezio/byteq.h" +#include "extras/ezio/tty.h" +#include "extras/ezio/vt.h" +#include "daemon.h" +#include "ezio.h" +#include "poll-loop.h" +#include "socket-util.h" +#include "terminal.h" +#include "timeval.h" +#include "util.h" + +#define THIS_MODULE VLM_ezio_term +#include "vlog.h" + +/* EZIO button status. */ +enum btn_status { + BTN_UP = 1 << 0, + BTN_DOWN = 1 << 1, + BTN_ENTER = 1 << 2, + BTN_ESC = 1 << 3 +}; + +/* -e, --ezio: EZIO3 serial device file. */ +static char *ezio_dev = "/dev/ttyS1"; + +/* -i, --input: Terminal from which to accept additional keyboard input. */ +static char *input_dev = NULL; + +struct inputdev; +static int inputdev_open(const char *name, struct inputdev **); +static void inputdev_close(struct inputdev *); +static int inputdev_run(struct inputdev *, struct byteq *); +static void inputdev_update(struct inputdev *, const struct ezio *); +static void inputdev_wait(struct inputdev *); + +static struct scanner *scanner_create(void); +static void scanner_destroy(struct scanner *); +static void scanner_run(struct scanner *, struct ezio *); +static void scanner_wait(struct scanner *); +static void scanner_left(struct scanner *, struct ezio *); +static void scanner_right(struct scanner *, struct ezio *); + +static struct updater *updater_create(void); +static void updater_destroy(struct updater *); +static int updater_run(struct updater *, const struct ezio *shadow, + int ezio_fd); +static void updater_wait(struct updater *, int ezio_fd); +enum btn_status updater_get_buttons(struct updater *); +bool updater_has_buttons(const struct updater *); + +static void handle_buttons(struct updater *, struct scanner *, + struct byteq *, struct ezio *); + +static void usage(void) NO_RETURN; +static void parse_options(int argc, char *argv[]); + +int +main(int argc, char *argv[]) +{ + struct terminal *terminal; + struct updater *updater; + struct scanner *scanner; + struct inputdev *inputdev; + struct byteq inputq; + struct ezio ezio; + int ezio_fd, pty_fd, dummy_fd; + int retval; + int i; + + set_program_name(argv[0]); + time_init(); + vlog_init(); + parse_options(argc, argv); + signal(SIGPIPE, SIG_IGN); + + argc -= optind; + argv += optind; + + /* Make sure that the ezio3 terminfo entry is available. */ + dummy_fd = open("/dev/null", O_RDWR); + if (dummy_fd >= 0) { + if (setupterm("ezio3", dummy_fd, &retval) == ERR) { + if (retval == 0) { + ovs_fatal(0, "Missing terminfo entry for ezio3. " + "Did you run \"make install\"?"); + } else { + ovs_fatal(0, "Missing terminfo database. Is ncurses " + "properly installed?"); + } + } + del_curterm(cur_term); + close(dummy_fd); + } else { + ovs_error(errno, "failed to open /dev/null"); + } + + /* Lock serial port. */ + retval = tty_lock(ezio_dev); + if (retval) { + ovs_fatal(retval, "%s: lock failed", ezio_dev); + } + + /* Open EZIO and configure as 2400 bps, N-8-1, in raw mode. */ + ezio_fd = open(ezio_dev, O_RDWR | O_NOCTTY); + if (ezio_fd < 0) { + ovs_fatal(errno, "%s: open", ezio_dev); + } + retval = tty_set_raw_mode(ezio_fd, B2400); + if (retval) { + ovs_fatal(retval, "%s: failed to configure tty parameters", ezio_dev); + } + + /* Open keyboard device for input. */ + if (input_dev) { + retval = inputdev_open(input_dev, &inputdev); + if (retval) { + ovs_fatal(retval, "%s: failed to open input device", input_dev); + } + } else { + inputdev = NULL; + } + + /* Open pty master. */ + pty_fd = tty_open_master_pty(); + if (pty_fd < 0) { + ovs_fatal(-pty_fd, "failed to open master pty"); + } + tty_set_window_size(pty_fd, 2, 40); + + /* Start child process. */ + if (argc < 1) { + char *child_argv[2]; + + child_argv[0] = getenv("SHELL"); + if (!child_argv[0]) { + child_argv[0] = "/bin/sh"; + } + child_argv[1] = NULL; + retval = tty_fork_child(pty_fd, child_argv); + } else { + retval = tty_fork_child(pty_fd, argv); + } + if (retval) { + ovs_fatal(retval, "failed to fork child process"); + } + + die_if_already_running(); + daemonize(); + + terminal = terminal_create(); + updater = updater_create(); + scanner = scanner_create(); + ezio_init(&ezio); + for (i = 0; i < 8; i++) { + ezio_set_default_icon(&ezio, i); + } + byteq_init(&inputq); + for (;;) { + /* Get button presses and keyboard input into inputq, then push the + * inputq to the pty. */ + handle_buttons(updater, scanner, &inputq, &ezio); + if (inputdev) { + retval = inputdev_run(inputdev, &inputq); + if (retval) { + VLOG_ERR("error reading from input device: %s", + strerror(retval)); + inputdev_close(inputdev); + inputdev = NULL; + } + } + retval = byteq_write(&inputq, pty_fd); + if (retval && retval != EAGAIN) { + VLOG_ERR("error passing through input: %s", + retval == EOF ? "end of file" : strerror(retval)); + } + + /* Process data from pty in terminal emulator. */ + retval = terminal_run(terminal, &ezio, pty_fd); + if (retval) { + VLOG_ERR("error reading from terminal: %s", + retval == EOF ? "end of file" : strerror(retval)); + break; + } + + /* Scroll left and right through text. */ + scanner_run(scanner, &ezio); + + /* Update the display to match what should be shown. */ + retval = updater_run(updater, &ezio, ezio_fd); + if (retval) { + VLOG_ERR("error writing to ezio: %s", + retval == EOF ? "end of file" : strerror(retval)); + break; + } + if (inputdev) { + inputdev_update(inputdev, &ezio); + } + + /* Wait for something to happen. */ + terminal_wait(terminal, pty_fd); + scanner_wait(scanner); + if (updater_has_buttons(updater)) { + poll_immediate_wake(); + } + updater_wait(updater, ezio_fd); + if (!byteq_is_empty(&inputq)) { + poll_fd_wait(pty_fd, POLLOUT); + } + if (inputdev) { + inputdev_wait(inputdev); + } + poll_block(); + } + terminal_destroy(terminal); + updater_destroy(updater); + scanner_destroy(scanner); + + return 0; +} + +static void +send_keys(struct byteq *q, const char *s) +{ + size_t n = strlen(s); + if (byteq_avail(q) >= n) { + byteq_putn(q, s, n); + } +} + +static void +handle_buttons(struct updater *up, struct scanner *s, + struct byteq *q, struct ezio *ezio) +{ + while (updater_has_buttons(up)) { + int btns = updater_get_buttons(up); + switch (btns) { + case BTN_UP: + send_keys(q, "\x1b\x5b\x41"); /* Up arrow. */ + break; + + case BTN_UP | BTN_ESC: + send_keys(q, "\x1b[5~"); /* Page up. */ + break; + + case BTN_DOWN: + send_keys(q, "\x1b\x5b\x42"); /* Down arrow. */ + break; + + case BTN_DOWN | BTN_ESC: + send_keys(q, "\x1b[6~"); /* Page down. */ + break; + + case BTN_ENTER: + send_keys(q, "\r"); + break; + + case BTN_ESC: + send_keys(q, "\x7f"); + break; + + case BTN_UP | BTN_DOWN: + scanner_left(s, ezio); + break; + + case BTN_ESC | BTN_ENTER: + scanner_right(s, ezio); + break; + + case BTN_UP | BTN_DOWN | BTN_ENTER | BTN_ESC: + send_keys(q, "\x04"); /* End of file. */ + break; + + case BTN_UP | BTN_ENTER | BTN_ESC: + send_keys(q, "y"); + break; + + case BTN_DOWN | BTN_ENTER | BTN_ESC: + send_keys(q, "n"); + break; + } + } +} + +/* EZIO screen updater. */ + +/* EZIO command codes. */ +#define EZIO_CMD 0xfe /* Command prefix byte. */ +#define EZIO_CLEAR 0x01 /* Clear screen. */ +#define EZIO_HOME 0x02 /* Move to (0, 0). */ +#define EZIO_READ 0x06 /* Poll keyboard. */ + +#define EZIO_ENTRY_MODE 0x04 /* Set entry mode: */ +#define EZIO_LTOR_MODE 0x02 /* ...left-to-right (vs. r-to-l). */ +#define EZIO_SHIFT_MODE 0x01 /* ...scroll with output (vs. don't). */ + +#define EZIO_DISPLAY_MODE 0x08 /* Set display mode: */ +#define EZIO_ENABLE_DISPLAY 0x04 /* ...turn on display (vs. blank). */ +#define EZIO_SHOW_CURSOR 0x02 /* ...show cursor (vs. hide). */ +#define EZIO_BLOCK_CURSOR 0x01 /* ...block cursor (vs. underline). */ + +#define EZIO_INIT 0x28 /* Initialize EZIO. */ + +#define EZIO_MOVE_CURSOR 0x80 /* Set cursor position. */ +#define EZIO_COL_SHIFT 0 /* Shift count for column (0-based). */ +#define EZIO_ROW_SHIFT 6 /* Shift count for row (0-based). */ + +#define EZIO_DEFINE_ICON 0x40 /* Define icon. */ +#define EZIO_ICON_SHIFT 3 /* Shift count for icon number (0-7). */ + +#define EZIO_SCROLL_LEFT 0x18 /* Scroll display left 1 position. */ +#define EZIO_SCROLL_RIGHT 0x1c /* Scroll display right 1 position. */ +#define EZIO_CURSOR_LEFT 0x10 /* Move cursor left 1 position. */ +#define EZIO_CURSOR_RIGHT 0x14 /* Move cursor right 1 position. */ + +/* Rate limiting: the EZIO runs at 2400 bps, which is 240 bytes per second. + * Kernel tty buffers, on the other hand, tend to be at least 4 kB. That + * means that, if we keep the kernel buffer filled, then the queued data will + * be 4,096 kB / 240 bytes/s ~= 17 seconds ahead of what is actually + * displayed. This is not a happy situation. So we rate-limit with a token + * bucket. + * + * The parameters below work out as: (6 tokens/ms * 1000 ms) / (25 + * tokens/byte) = 240 bytes/s. */ +#define UP_TOKENS_PER_MS 6 /* Tokens acquired per millisecond. */ +#define UP_BUCKET_SIZE (6 * 100) /* Capacity of the token bukect. */ +#define UP_TOKENS_PER_BYTE 25 /* Tokens required to output a byte. */ + +struct updater { + /* Current state of EZIO device. */ + struct ezio visible; + + /* Output state. */ + struct byteq obuf; /* Output being sent to serial port. */ + int tokens; /* Token bucket content. */ + long long int last_fill; /* Last time we increased 'tokens'.*/ + bool up_to_date; /* Does visible state match shadow state? */ + + /* Input state. */ + struct byteq ibuf; /* Queued button pushes. */ + long long int last_poll; /* Last time we sent a button poll request. */ + enum btn_status last_status; /* Last received button status. */ + long long int last_change; /* Time when status most recently changed. */ + int repeat_count; /* Autorepeat count. */ + bool releasing; /* Waiting for button release? */ +}; + +static void send_command(struct updater *, uint8_t command); +static void recv_button_state(struct updater *, enum btn_status status); +static int range(int value, int min, int max); +static void send_command(struct updater *, uint8_t command); +static void set_cursor_position(struct updater *, int x, int y); +static bool icons_differ(const struct ezio *, const struct ezio *, int *idx); +static void update_char(struct updater *, const struct ezio *, int x, int y); +static void update_cursor_status(struct updater *, const struct ezio *); + +/* Creates and returns a new updater. */ +static struct updater * +updater_create(void) +{ + struct updater *up = xmalloc(sizeof *up); + ezio_init(&up->visible); + byteq_init(&up->obuf); + up->tokens = UP_BUCKET_SIZE; + up->last_fill = time_msec(); + byteq_init(&up->ibuf); + up->last_poll = LLONG_MIN; + up->last_status = 0; + up->last_change = time_msec(); + up->releasing = false; + send_command(up, EZIO_INIT); + send_command(up, EZIO_INIT); + send_command(up, EZIO_CLEAR); + send_command(up, EZIO_HOME); + return up; +} + +/* Destroys updater 'up. */ +static void +updater_destroy(struct updater *up) +{ + free(up); +} + +/* Sends EZIO commands over file descriptor 'ezio_fd' to the EZIO represented + * by updater 'up', to make the EZIO display the contents of 'shadow'. + * Rate-limiting can cause the update to be only partial, but the next call to + * updater_run() will resume the update. + * + * Returns 0 if successful, otherwise a positive errno value. */ +static int +updater_run(struct updater *up, const struct ezio *shadow, int ezio_fd) +{ + uint8_t c; + while (read(ezio_fd, &c, 1) > 0) { + if ((c & 0xf0) == 0xb0) { + recv_button_state(up, ~c & 0x0f); + } + } + + up->up_to_date = false; + for (;;) { + struct ezio *visible = &up->visible; + int idx, x, y; + int retval; + + /* Flush the buffer out to the EZIO device. */ + retval = byteq_write(&up->obuf, ezio_fd); + if (retval == EAGAIN) { + return 0; + } else if (retval) { + VLOG_WARN("error writing ezio: %s", strerror(retval)); + return retval; + } + + /* Make sure we have some tokens before we write anything more. */ + if (up->tokens <= 0) { + long long int now = time_msec(); + if (now > up->last_fill) { + up->tokens += (now - up->last_fill) * UP_TOKENS_PER_MS; + up->last_fill = now; + if (up->tokens > UP_BUCKET_SIZE) { + up->tokens = UP_BUCKET_SIZE; + } + } + if (up->tokens <= 0) { + /* Still out of tokens. */ + return 0; + } + } + + /* Consider what else we might want to send. */ + if (time_msec() >= up->last_poll + 100) { + /* Send a button-read command. */ + send_command(up, EZIO_READ); + up->last_poll = time_msec(); + } else if (visible->show_cursor && !shadow->show_cursor) { + /* Turn off the cursor. */ + update_cursor_status(up, shadow); + } else if (icons_differ(shadow, visible, &idx)) { + /* Update the icons. */ + send_command(up, EZIO_DEFINE_ICON + (idx << EZIO_ICON_SHIFT)); + byteq_putn(&up->obuf, &shadow->icons[idx][0], 8); + set_cursor_position(up, shadow->x, shadow->y); + memcpy(visible->icons[idx], shadow->icons[idx], 8); + } else if (visible->x_ofs != shadow->x_ofs) { + /* Scroll to the correct horizontal position. */ + if (visible->x_ofs < shadow->x_ofs) { + send_command(up, EZIO_SCROLL_LEFT); + visible->x_ofs++; + } else { + send_command(up, EZIO_SCROLL_RIGHT); + visible->x_ofs--; + } + } else if (ezio_chars_differ(shadow, visible, shadow->x_ofs, + shadow->x_ofs + 16, &x, &y)) { + /* Update the visible region. */ + update_char(up, shadow, x, y); + } else if (ezio_chars_differ(shadow, visible, 0, 40, &x, &y)) { + /* Update the off-screen region. */ + update_char(up, shadow, x, y); + } else if ((visible->x != shadow->x || visible->y != shadow->y) + && shadow->show_cursor) { + /* Update the cursor position. (This has to follow updating the + * display content, because updating display content changes the + * cursor position.) */ + set_cursor_position(up, shadow->x, shadow->y); + } else if (visible->show_cursor != shadow->show_cursor + || visible->blink_cursor != shadow->blink_cursor) { + /* Update the cursor type. */ + update_cursor_status(up, shadow); + } else { + /* We're fully up-to-date. */ + up->up_to_date = true; + return 0; + } + up->tokens -= UP_TOKENS_PER_BYTE * byteq_used(&up->obuf); + } +} + +/* Calls poll-loop functions that will cause poll_block() to wake up when + * updater_run() has work to do. */ +static void +updater_wait(struct updater *up, int ezio_fd) +{ + if (!byteq_is_empty(&up->obuf)) { + poll_fd_wait(ezio_fd, POLLOUT); + } else if (up->tokens <= 0) { + poll_timer_wait((-up->tokens / UP_TOKENS_PER_MS) + 1); + } else if (!up->up_to_date) { + poll_immediate_wake(); + } + + if (!up->last_status && time_msec() - up->last_change > 100) { + /* No button presses in a while. Sleep longer. */ + poll_timer_wait(100); + } else { + poll_timer_wait(50); + } +} + +/* Returns a button or buttons that were pushed. Must not be called if + * updater_has_buttons() would return false. One or more BTN_* flags will be + * set in the return value. */ +enum btn_status +updater_get_buttons(struct updater *up) +{ + return byteq_get(&up->ibuf); +} + +/* Any buttons pushed? */ +bool +updater_has_buttons(const struct updater *up) +{ + return !byteq_is_empty(&up->ibuf); +} + +/* Adds 'btns' to the queue of pushed buttons */ +static void +buttons_pushed(struct updater *up, enum btn_status btns) +{ + if (!byteq_is_full(&up->ibuf)) { + byteq_put(&up->ibuf, btns); + } +} + +/* Updates the buttons-pushed queue based on the current button 'status'. */ +static void +recv_button_state(struct updater *up, enum btn_status status) +{ + /* Calculate milliseconds since button status last changed. */ + long long int stable_msec; + if (status != up->last_status) { + up->last_change = time_msec(); + stable_msec = 0; + } else { + stable_msec = time_msec() - up->last_change; + } + + if (up->releasing) { + if (!status) { + up->releasing = false; + } + } else if (up->last_status) { + if (!(status & up->last_status)) { + /* Button(s) were pushed and released. */ + if (!up->repeat_count) { + buttons_pushed(up, up->last_status); + } + } else if (stable_msec >= 150 && !up->repeat_count) { + /* Buttons have been stable for a while, so push them once. */ + buttons_pushed(up, status); + up->repeat_count++; + } else if (stable_msec >= 1000) { + /* Autorepeat 10/second after 1 second hold time. */ + int n = (stable_msec - 1000) / 100 + 1; + while (up->repeat_count < n) { + buttons_pushed(up, status); + up->repeat_count++; + } + } else if ((status & up->last_status) == up->last_status) { + /* More buttons pushed than at last poll. */ + } else { + /* Some, but not all, buttons were released. Ignore the buttons + * until all are released. */ + up->releasing = true; + } + } + if (!status) { + up->repeat_count = 0; + } + up->last_status = status; +} + +static int +range(int value, int min, int max) +{ + return value < min ? min : value > max ? max : value; +} + +static void +send_command(struct updater *up, uint8_t command) +{ + byteq_put(&up->obuf, EZIO_CMD); + byteq_put(&up->obuf, command); +} + +/* Moves the cursor to 0-based position (x, y). Updates 'up->visible' to + * reflect the change. */ +static void +set_cursor_position(struct updater *up, int x, int y) +{ + int command = EZIO_MOVE_CURSOR; + command |= range(x, 0, 39) << EZIO_COL_SHIFT; + command |= range(y, 0, 1) << EZIO_ROW_SHIFT; + send_command(up, command); + up->visible.x = x; + up->visible.y = y; +} + +/* If any of the icons differ from 'a' to 'b', returns true and sets '*idx' to + * the index of the first icon that differs. Otherwise, returns false. */ +static bool +icons_differ(const struct ezio *a, const struct ezio *b, int *idx) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(a->icons); i++) { + if (memcmp(&a->icons[i], &b->icons[i], sizeof a->icons[i])) { + *idx = i; + return true; + } + } + return false; +} + +/* Queues commands in 'up''s output buffer to update the character at 0-based + * position (x,y) to match the character that 'shadow' has there. Updates + * 'up->visible' to reflect the change. */ +static void +update_char(struct updater *up, const struct ezio *shadow, int x, int y) +{ + if (x != up->visible.x || y != up->visible.y) { + set_cursor_position(up, x, y); + } + byteq_put(&up->obuf, shadow->chars[y][x]); + up->visible.chars[y][x] = shadow->chars[y][x]; + up->visible.x++; +} + +/* Queues commands in 'up''s output buffer to change the EZIO's cursor shape to + * match that in 'shadow'. Updates 'up->visible' to reflect the change. */ +static void +update_cursor_status(struct updater *up, const struct ezio *shadow) +{ + uint8_t command = EZIO_DISPLAY_MODE | EZIO_ENABLE_DISPLAY; + if (shadow->show_cursor) { + command |= EZIO_SHOW_CURSOR; + if (shadow->blink_cursor) { + command |= EZIO_BLOCK_CURSOR; + } + } + send_command(up, command); + up->visible.show_cursor = shadow->show_cursor; + up->visible.blink_cursor = shadow->blink_cursor; +} + +/* An input device, such as a tty. */ + +struct inputdev { + /* Input. */ + int fd; /* File descriptor. */ + + /* State for mirroring the EZIO display to the device. */ + bool is_tty; /* We only attempt to mirror to ttys. */ + struct byteq outq; /* Output queue. */ + struct ezio visible; /* Data that we have displayed. */ +}; + +/* Opens 'name' as a input device. If successful, returns 0 and stores a + * pointer to the input device in '*devp'. On failure, returns a positive + * errno value. */ +static int +inputdev_open(const char *name, struct inputdev **devp) +{ + struct inputdev *dev; + int retval; + int fd; + + *devp = NULL; + if (!strcmp(name, "vt")) { + fd = vt_open(O_RDWR | O_NOCTTY); + if (fd < 0) { + return -fd; + } + } else if (!strcmp(name, "-")) { + fd = dup(STDIN_FILENO); + if (fd < 0) { + return errno; + } + } else { + fd = open(name, O_RDWR | O_NOCTTY); + if (fd < 0) { + return errno; + } + } + + retval = tty_set_raw_mode(fd, B0); + if (retval) { + close(fd); + VLOG_WARN("%s: failed to configure tty parameters: %s", + name, strerror(retval)); + return retval; + } + + dev = xmalloc(sizeof *dev); + dev->fd = fd; + dev->is_tty = isatty(fd); + byteq_init(&dev->outq); + ezio_init(&dev->visible); + *devp = dev; + return 0; +} + +/* Closes and destroys input device 'dev'. */ +static void +inputdev_close(struct inputdev *dev) +{ + if (dev) { + close(dev->fd); + free(dev); + } +} + +/* Reads input from 'dev' into 'q'. Returns 0 if successful, otherwise a + * positive errno value. */ +static int +inputdev_run(struct inputdev *dev, struct byteq *q) +{ + int retval = byteq_read(q, dev->fd); + return retval == EAGAIN ? 0 : retval; +} + +/* Dumps data from 'dev''s output queue to the underlying file descriptor, + * updating the tty screen display. */ +static void +flush_inputdev(struct inputdev *dev) +{ + int retval = byteq_write(&dev->outq, dev->fd); + if (retval && retval != EAGAIN) { + VLOG_WARN("error writing input device, " + "disabling further output"); + dev->is_tty = false; + } +} + +/* Updates the tty screen display on 'dev' to match 'e'. */ +static void +inputdev_update(struct inputdev *dev, const struct ezio *e) +{ + struct byteq *q = &dev->outq; + int x, y; + + if (!dev->is_tty) { + return; + } + + flush_inputdev(dev); + if (!byteq_is_empty(q)) { + return; + } + + if (!ezio_chars_differ(e, &dev->visible, 0, 40, &x, &y) + && e->x == dev->visible.x + && e->y == dev->visible.y + && e->x_ofs == dev->visible.x_ofs + && e->show_cursor == dev->visible.show_cursor) { + return; + } + dev->visible = *e; + + byteq_put_string(q, "\033[H\033[2J"); /* Clear screen. */ + for (y = 0; y < 4; y++) { + byteq_put(q, "+||+"[y]); + for (x = 0; x < 40; x++) { + int c; + if (x == e->x_ofs) { + byteq_put(q, '['); + } + c = y == 0 || y == 3 ? '-' : e->chars[y - 1][x]; + if (c == 6) { + c = '\\'; + } else if (c == 7) { + c = '~'; + } else if (c < 0x20 || c > 0x7d) { + c = '?'; + } + byteq_put(q, c); + if (x == e->x_ofs + 15) { + byteq_put(q, ']'); + } + } + byteq_put(q, "+||+"[y]); + byteq_put(q, '\r'); + byteq_put(q, '\n'); + } + if (e->show_cursor) { + int x = range(e->x, 0, 39) + 2 + (e->x >= e->x_ofs) + (e->x > e->x_ofs + 15); + int y = range(e->y, 0, 1) + 2; + char cup[16]; + sprintf(cup, "\033[%d;%dH", y, x); /* Position cursor. */ + byteq_put_string(q, cup); + } + flush_inputdev(dev); +} + +/* Calls poll-loop functions that will cause poll_block() to wake up when + * inputdev_run() has work to do. */ +static void +inputdev_wait(struct inputdev *dev) +{ + int flags = POLLIN; + if (dev->is_tty && !byteq_is_empty(&dev->outq)) { + flags |= POLLOUT; + } + poll_fd_wait(dev->fd, flags); +} + +/* Scrolls the display left and right automatically to display all the + * content. */ + +enum scanner_state { + SCANNER_LEFT, /* Moving left. */ + SCANNER_RIGHT /* Moving right. */ +}; + +struct scanner { + enum scanner_state state; /* Current state. */ + int wait; /* No. of cycles to pause before continuing. */ + long long int last_move; /* Last time the state machine ran. */ +}; + +static void find_min_max(struct ezio *, int *min, int *max); + +static struct scanner * +scanner_create(void) +{ + struct scanner *s = xmalloc(sizeof *s); + s->state = SCANNER_RIGHT; + s->wait = 0; + s->last_move = LLONG_MIN; + return s; +} + +static void +scanner_destroy(struct scanner *s) +{ + free(s); +} + +static void +scanner_run(struct scanner *s, struct ezio *ezio) +{ + long long int now = time_msec(); + if (now >= s->last_move + 750) { + s->last_move = now; + if (s->wait) { + s->wait--; + } else { + int min, max; + + find_min_max(ezio, &min, &max); + if (max - min + 1 <= 16) { + ezio->x_ofs = min; + return; + } + + switch (s->state) { + case SCANNER_RIGHT: + if (ezio->x_ofs + 15 < max) { + ezio->x_ofs++; + } else { + s->state = SCANNER_LEFT; + s->wait = 1; + } + break; + + case SCANNER_LEFT: + if (ezio->x_ofs > min) { + ezio->x_ofs--; + } else { + s->state = SCANNER_RIGHT; + s->wait = 1; + } + break; + } + } + } +} + +static void +scanner_wait(struct scanner *s) +{ + long long int now = time_msec(); + long long int expires = s->last_move + 750; + if (now >= expires) { + poll_immediate_wake(); + } else { + poll_timer_wait(expires - now); + } + +} + +static void +scanner_left(struct scanner *s, struct ezio *ezio) +{ + s->wait = 7; + if (ezio->x_ofs > 0) { + ezio->x_ofs--; + } +} + +static void +scanner_right(struct scanner *s, struct ezio *ezio) +{ + s->wait = 7; + if (ezio->x_ofs < 40 - 16) { + ezio->x_ofs++; + } +} + +static void +find_min_max(struct ezio *ezio, int *min, int *max) +{ + int x; + + *min = 0; + for (x = 0; x < 40; x++) { + if (ezio->chars[0][x] != ' ' || ezio->chars[1][x] != ' ') { + *min = x; + break; + } + } + + *max = 15; + for (x = 39; x >= 0; x--) { + if (ezio->chars[0][x] != ' ' || ezio->chars[1][x] != ' ') { + *max = x; + break; + } + } + + if (ezio->show_cursor) { + if (ezio->x < *min) { + *min = ezio->x; + } + if (ezio->x > *max) { + *max = ezio->x; + } + } +} + +static void +parse_options(int argc, char *argv[]) +{ + enum { + OPT_DUMMY = UCHAR_MAX + 1, + VLOG_OPTION_ENUMS + }; + static struct option long_options[] = { + {"ezio3", required_argument, 0, 'e'}, + {"input", required_argument, 0, 'i'}, + {"verbose", optional_argument, 0, 'v'}, + {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'V'}, + DAEMON_LONG_OPTIONS, + VLOG_LONG_OPTIONS, + {0, 0, 0, 0}, + }; + char *short_options = long_options_to_short_options(long_options); + + for (;;) { + int c; + + c = getopt_long(argc, argv, short_options, long_options, NULL); + if (c == -1) { + break; + } + + switch (c) { + case 'e': + ezio_dev = optarg; + break; + + case 'i': + input_dev = optarg ? optarg : "-"; + break; + + case 'h': + usage(); + + case 'V': + OVS_PRINT_VERSION(0, 0); + exit(EXIT_SUCCESS); + + DAEMON_OPTION_HANDLERS + VLOG_OPTION_HANDLERS + + case '?': + exit(EXIT_FAILURE); + + default: + abort(); + } + } + free(short_options); +} + +static void +usage(void) +{ + printf("%s: EZIO3 terminal front-end\n" + "Provides a front-end to a 16x2 EZIO3 LCD display that makes\n" + "it look more like a conventional terminal\n" + "usage: %s [OPTIONS] [-- COMMAND [ARG...]]\n" + "where COMMAND is a command to run with stdin, stdout, and\n" + "stderr directed to the EZIO3 display.\n" + "\nSettings (defaults in parentheses):\n" + " -e, --ezio=TTY set EZIO3 serial device (/dev/ttyS1)\n" + " -i, --input=TERMINAL also read input from TERMINAL;\n" + " specify - for stdin, or vt to allocate\n" + " and switch to a free virtual terminal\n" + "\nOther options:\n" + " -v, --verbose=MODULE:FACILITY:LEVEL configure logging levels\n" + " -v, --verbose set maximum verbosity level\n" + " -h, --help display this help message\n" + " -V, --version display version information\n", + program_name, program_name); + exit(EXIT_SUCCESS); +} diff --git a/extras/ezio/ezio.c b/extras/ezio/ezio.c new file mode 100644 index 00000000..6024766e --- /dev/null +++ b/extras/ezio/ezio.c @@ -0,0 +1,243 @@ +/* Copyright (c) 2008, 2009 Nicira Networks, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + * + */ + +#include +#include "ezio.h" +#include +#include +#include +#include "util.h" + +static void remove_elements(uint8_t *p, size_t n_elems, size_t elem_size, + int pos, int n_del); +static void insert_elements(uint8_t *p, size_t n_elems, size_t elem_size, + int pos, int n_insert); +static int range(int value, int min, int max); + +void +ezio_init(struct ezio *e) +{ + memset(e->icons, 0, sizeof e->icons); + ezio_clear(e); + e->x_ofs = 0; + e->show_cursor = true; + e->blink_cursor = false; +} + +void +ezio_set_icon(struct ezio *e, int idx, + int row0, int row1, int row2, int row3, + int row4, int row5, int row6, int row7) +{ + e->icons[idx][0] = row0; + e->icons[idx][1] = row1; + e->icons[idx][2] = row2; + e->icons[idx][3] = row3; + e->icons[idx][4] = row4; + e->icons[idx][5] = row5; + e->icons[idx][6] = row6; + e->icons[idx][7] = row7; +} + +void +ezio_set_default_icon(struct ezio *e, int idx) +{ + uint8_t *icon; + + assert(idx >= 0 && idx < 8); + icon = e->icons[idx]; + if (idx == 6) { + ezio_set_icon(e, idx, + e_____, + eX____, + e_X___, + e__X__, + e___X_, + e____X, + e_____, + e_____); + } else if (idx == 7) { + ezio_set_icon(e, idx, + e_____, + e_____, + e_X___, + eX_X_X, + eX_X_X, + e___X_, + e_____, + e_____); + } else { + ezio_set_icon(e, idx, + e_____, + e_____, + e_____, + e_____, + e_____, + e_____, + e_____, + e_____); + } +} + +void +ezio_clear(struct ezio *e) +{ + memset(e->chars, ' ', sizeof e->chars); + e->x = e->y = 0; +} + +void +ezio_put_char(struct ezio *e, int x, int y, uint8_t c) +{ + assert(x >= 0 && x <= 39); + assert(y >= 0 && y <= 1); + e->chars[y][x] = c != 0xfe ? c : 0xff; +} + +void +ezio_line_feed(struct ezio *e) +{ + if (++e->y >= 2) { + e->y = 1; + ezio_scroll_up(e, 1); + } +} + +void +ezio_newline(struct ezio *e) +{ + e->x = 0; + ezio_line_feed(e); +} + +void +ezio_delete_char(struct ezio *e, int x, int y, int n) +{ + remove_elements(&e->chars[y][0], 40, 1, x, n); +} + +void +ezio_delete_line(struct ezio *e, int y, int n) +{ + remove_elements(e->chars[0], 2, 40, y, n); +} + +void +ezio_insert_char(struct ezio *e, int x, int y, int n) +{ + insert_elements(&e->chars[y][0], 40, 1, x, n); +} + +void +ezio_insert_line(struct ezio *e, int y, int n) +{ + insert_elements(&e->chars[0][0], 2, 40, y, n); +} + +void +ezio_scroll_left(struct ezio *e, int n) +{ + int y; + for (y = 0; y < 2; y++) { + ezio_delete_char(e, 0, y, n); + } +} + +void +ezio_scroll_right(struct ezio *e, int n) +{ + int y; + + for (y = 0; y < 2; y++) { + ezio_insert_char(e, 0, y, n); + } +} + +void +ezio_scroll_up(struct ezio *e, int n) +{ + ezio_delete_line(e, 0, n); +} + +void +ezio_scroll_down(struct ezio *e, int n) +{ + ezio_insert_line(e, 0, n); +} + +bool +ezio_chars_differ(const struct ezio *a, const struct ezio *b, int x0, int x1, + int *xp, int *yp) +{ + int x, y; + + x0 = range(x0, 0, 39); + x1 = range(x1, 1, 40); + for (y = 0; y < 2; y++) { + for (x = x0; x < x1; x++) { + if (a->chars[y][x] != b->chars[y][x]) { + *xp = x; + *yp = y; + return true; + } + } + } + return false; +} + +static void +remove_elements(uint8_t *p, size_t n_elems, size_t elem_size, + int pos, int n_del) +{ + if (pos >= 0 && pos < n_elems) { + n_del = MIN(n_del, n_elems - pos); + memmove(p + elem_size * pos, + p + elem_size * (pos + n_del), + elem_size * (n_elems - pos - n_del)); + memset(p + elem_size * (n_elems - n_del), ' ', n_del * elem_size); + } +} + +static void +insert_elements(uint8_t *p, size_t n_elems, size_t elem_size, + int pos, int n_insert) +{ + if (pos >= 0 && pos < n_elems) { + n_insert = MIN(n_insert, n_elems - pos); + memmove(p + elem_size * (pos + n_insert), + p + elem_size * pos, + elem_size * (n_elems - pos - n_insert)); + memset(p + elem_size * pos, ' ', n_insert * elem_size); + } +} + +static int +range(int value, int min, int max) +{ + return value < min ? min : value > max ? max : value; +} + diff --git a/extras/ezio/ezio.h b/extras/ezio/ezio.h new file mode 100644 index 00000000..1308ec30 --- /dev/null +++ b/extras/ezio/ezio.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2008, 2009 Nicira Networks, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + * + */ + +#ifndef EZIO_H +#define EZIO_H 1 + +#include +#include + +/* Constants for visual representation of a row in an EZIO icon. */ +#define e_____ 0x00 +#define e____X 0x01 +#define e___X_ 0x02 +#define e___XX 0x03 +#define e__X__ 0x04 +#define e__X_X 0x05 +#define e__XX_ 0x06 +#define e__XXX 0x07 +#define e_X___ 0x08 +#define e_X__X 0x09 +#define e_X_X_ 0x0a +#define e_X_XX 0x0b +#define e_XX__ 0x0c +#define e_XX_X 0x0d +#define e_XXX_ 0x0e +#define e_XXXX 0x0f +#define eX____ 0x10 +#define eX___X 0x11 +#define eX__X_ 0x12 +#define eX__XX 0x13 +#define eX_X__ 0x14 +#define eX_X_X 0x15 +#define eX_XX_ 0x16 +#define eX_XXX 0x17 +#define eXX___ 0x18 +#define eXX__X 0x19 +#define eXX_X_ 0x1a +#define eXX_XX 0x1b +#define eXXX__ 0x1c +#define eXXX_X 0x1d +#define eXXXX_ 0x1e +#define eXXXXX 0x1f + +struct ezio { + uint8_t icons[8][8]; + uint8_t chars[2][40]; + int x, y, x_ofs; + bool show_cursor; + bool blink_cursor; +}; + +void ezio_init(struct ezio *); +void ezio_set_icon(struct ezio *, int idx, + int row0, int row1, int row2, int row3, + int row4, int row5, int row6, int row7); +void ezio_set_default_icon(struct ezio *, int idx); +void ezio_clear(struct ezio *); +void ezio_put_char(struct ezio *, int x, int y, uint8_t c); +void ezio_line_feed(struct ezio *); +void ezio_newline(struct ezio *); +void ezio_delete_char(struct ezio *, int x, int y, int n); +void ezio_delete_line(struct ezio *, int y, int n); +void ezio_insert_char(struct ezio *, int x, int y, int n); +void ezio_insert_line(struct ezio *, int y, int n); +void ezio_scroll_left(struct ezio *, int n); +void ezio_scroll_right(struct ezio *, int n); +void ezio_scroll_up(struct ezio *, int n); +void ezio_scroll_down(struct ezio *, int n); +bool ezio_chars_differ(const struct ezio *, const struct ezio *, + int x0, int x1, int *xp, int *yp); + +#endif /* ezio.h */ diff --git a/extras/ezio/ezio3.ti b/extras/ezio/ezio3.ti new file mode 100644 index 00000000..0bbcb398 --- /dev/null +++ b/extras/ezio/ezio3.ti @@ -0,0 +1,21 @@ +# Copyright (C) 2008, 2009 Nicira Networks, Inc. +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty. This file is offered +# as-is, without warranty of any kind. + +ezio3|16x2 EZIO3 LCD display, + cols#40, lines#2, it#8, am, xenl, npc, + bel=, clear=\E[H\E[J, cr=^M, + cub=\E[%p1%dD, cub1=^H, cud=\E[%p1%dB, cud1=^J, + cuf=\E[%p1%dC, cuf1=\E[C$<2>, + cup=\E[%i%p1%d;%p2%dH, cuu=\E[%p1%dA, + cuu1=\E[A, ed=\E[J, el=\E[K, el1=\E[1K, + home=\E[H, ht=^I, ind=^J, kbs=^H, + kcub1=\E[D, kcud1=\E[B, kcuf1=\E[C, kcuu1=\E[A, + civis=\E[1r, cnorm=\E[2r, cvvis=\E[3r, + ri=\EM, rs2=\Ec, rmacs=^O, smacs=^N, + dico=\E[%p1%d;%p2%d;%p3%d;%p4%d;%p5%d;%p6%d;%p7%d;%p8%d;%p9%dp, + cico=\E[%p1%dq, + acsc=}\355\,\177+\176~\245f\337{\367, + diff --git a/extras/ezio/ovs-switchui.c b/extras/ezio/ovs-switchui.c new file mode 100644 index 00000000..6fbf2523 --- /dev/null +++ b/extras/ezio/ovs-switchui.c @@ -0,0 +1,3026 @@ +/* Copyright (c) 2008, 2009 Nicira Networks, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "command-line.h" +#include "daemon.h" +#include "dynamic-string.h" +#include "ezio.h" +#include "fatal-signal.h" +#include "netdev.h" +#include "ofpbuf.h" +#include "openflow/nicira-ext.h" +#include "openflow/openflow.h" +#include "packets.h" +#include "poll-loop.h" +#include "process.h" +#include "random.h" +#include "rconn.h" +#include "socket-util.h" +#include "svec.h" +#include "timeval.h" +#include "util.h" +#include "vconn.h" +#include "xtoxll.h" + +#define THIS_MODULE VLM_switchui +#include "vlog.h" + +static void parse_options(int argc, char *argv[]); +static void usage(void); + +static void initialize_terminal(void); +static void restore_terminal(void *aux); + +enum priority { + P_STATUS = 5, + P_PROGRESS = 10, + P_WARNING = 15, + P_ERROR = 20, + P_FATAL = 25 +}; + +struct message; +static void emit(struct message **, enum priority, const char *, ...) + PRINTF_FORMAT(3, 4); +static void emit_function(struct message **, enum priority, + void (*function)(void *aux), void *aux); +static int shown(struct message **); +static void clear_messages(void); +static bool empty_message(const struct message *); +static struct message *best_message(void); +static struct message *next_message(struct message *); +static struct message *prev_message(struct message *); +static void put_message(const struct message *); +static void message_shown(struct message *); +static void age_messages(void); + +struct pair { + char *name; + char *value; +}; + +struct dict { + struct pair *pairs; + size_t n, max; +}; + +static void dict_init(struct dict *); +static void dict_add(struct dict *, const char *name, const char *value); +static void dict_add_nocopy(struct dict *, char *name, char *value); +static void dict_delete(struct dict *, const char *name); +static void dict_parse(struct dict *, const char *data, size_t nbytes); +static void dict_free(struct dict *); +static bool dict_lookup(const struct dict *, + const char *name, const char **value); +static int dict_get_int(const struct dict *, const char *name, int def); +static bool dict_get_bool(const struct dict *, const char *name, bool def); +static const char *dict_get_string(const struct dict *, + const char *name, const char *def); +static uint32_t dict_get_ip(const struct dict *, const char *name); + +static void addf(const char *format, ...) PRINTF_FORMAT(1, 2); + +static void fetch_status(struct rconn *, struct dict *, long long int timeout); +static bool parse_reply(void *, struct dict *, uint32_t xid); +static void compose_messages(const struct dict *, struct rconn *rconn); + +static void show_flows(struct rconn *); +static void show_dpid_ip(struct rconn *, const struct dict *); +static void show_secchan_state(const struct dict *); +static void show_fail_open_state(const struct dict *); +static void show_discovery_state(const struct dict *); +static void show_remote_state(const struct dict *); +static void show_data_rates(struct rconn *, const struct dict *); + +static void init_reboot_notifier(void); +static bool show_reboot_state(void); + +static void show_string(const char *string); +static void block_until(long long timeout); +static void menu(const struct dict *); +static void drain_keyboard_buffer(void); + +static const char *progress(void); + +int +main(int argc, char *argv[]) +{ + struct rconn *rconn; + struct message *msg; + int countdown = 5; + bool user_selected; + bool debug_mode; + + /* Tracking keystroke repeat counts. */ + int last_key = 0; + long long int last_key_time = 0; + int repeat_count = 0; + + set_program_name(argv[0]); + time_init(); + vlog_init(); + parse_options(argc, argv); + signal(SIGPIPE, SIG_IGN); + vlog_set_levels(VLM_ANY_MODULE, VLF_CONSOLE, VLL_EMER); + init_reboot_notifier(); + + argc -= optind; + argv += optind; + if (argc != 1) { + ovs_fatal(0, "exactly one non-option argument required; " + "use --help for help"); + } + + rconn = rconn_new(argv[0], 5, 5); + + die_if_already_running(); + daemonize(); + + initialize_terminal(); + fatal_signal_add_hook(restore_terminal, NULL, true); + + msg = NULL; + countdown = 0; + user_selected = false; + debug_mode = false; + for (;;) { + struct dict dict; + long long timeout = time_msec() + 1000; + + clear_messages(); + + dict_init(&dict); + fetch_status(rconn, &dict, timeout); + dict_add(&dict, "debug", debug_mode ? "true" : "false"); + compose_messages(&dict, rconn); + + if (countdown) { + if (!empty_message(msg)) { + countdown--; + } else { + msg = user_selected ? next_message(msg) : best_message(); + countdown = 5; + } + } else { + msg = best_message(); + countdown = 5; + user_selected = false; + } + if (!user_selected) { + message_shown(msg); + } + + do { + for (;;) { + int c = getch(); + if (c == ERR) { + break; + } + + if (c != last_key || time_msec() > last_key_time + 250) { + repeat_count = 0; + } + last_key = c; + last_key_time = time_msec(); + repeat_count++; + + if (c == KEY_DOWN || c == KEY_UP) { + msg = (c == KEY_DOWN ? next_message(msg) + : prev_message(msg)); + countdown = 5; + user_selected = true; + } else if (c == '\r' || c == '\n') { + countdown = 60; + user_selected = true; + if (repeat_count >= 20) { + debug_mode = !debug_mode; + show_string(debug_mode + ? "Debug Mode\nEnabled" + : "Debug Mode\nDisabled"); + } + } else if (c == '\b' || c == '\x7f' || + c == '\x1b' || c == KEY_BACKSPACE || c == KEY_DC) { + menu(&dict); + drain_keyboard_buffer(); + break; + } + } + + erase(); + curs_set(0); + move(0, 0); + put_message(msg); + refresh(); + + poll_fd_wait(STDIN_FILENO, POLLIN); + poll_timer_wait(timeout - time_msec()); + poll_block(); + } while (time_msec() < timeout); + age_messages(); + dict_free(&dict); + } + + return 0; +} + +static void +compose_messages(const struct dict *dict, struct rconn *rconn) +{ + if (!show_reboot_state()) { + show_flows(rconn); + show_dpid_ip(rconn, dict); + show_secchan_state(dict); + show_fail_open_state(dict); + show_discovery_state(dict); + show_remote_state(dict); + show_data_rates(rconn, dict); + } +} + +struct put_flows_data { + struct rconn *rconn; + uint32_t xid; + uint32_t flow_count; + bool got_reply; +}; + +static void +parse_flow_reply(void *data, struct put_flows_data *pfd) +{ + struct ofp_header *oh; + struct ofp_stats_reply *rpy; + struct ofp_aggregate_stats_reply *asr; + const size_t min_size = sizeof *rpy + sizeof *asr; + + oh = data; + if (ntohs(oh->length) < min_size) { + VLOG_WARN("reply is too short (%"PRIu16")", ntohs(oh->length)); + return; + } + if (oh->xid != pfd->xid) { + VLOG_WARN("xid 0x%08"PRIx32" != expected 0x%08"PRIx32, + oh->xid, pfd->xid); + return; + } + if (oh->type != OFPT_STATS_REPLY) { + VLOG_WARN("reply is wrong type %"PRIu8, oh->type); + return; + } + + rpy = data; + if (rpy->type != htons(OFPST_AGGREGATE)) { + VLOG_WARN("reply has wrong stat type ID %08"PRIx16, rpy->type); + return; + } + + asr = (struct ofp_aggregate_stats_reply *) rpy->body; + pfd->flow_count = ntohl(asr->flow_count); + pfd->got_reply = true; +} + +static bool +have_icons(void) +{ + const char *dico = tigetstr("dico"); + return dico && dico != (const char *) -1; +} + +static void +set_icon(int num, int r0, int r1, int r2, int r3, int r4, int r5, int r6, + int r7) +{ + if (have_icons()) { + putp(tparm(tigetstr("dico"), num, r0, r1, r2, r3, r4, r5, r6, r7)); + } +} + +static void +set_repeated_icon(int num, int row) +{ + set_icon(num, row, row, row, row, row, row, row, row); +} + +#if 0 +static void +set_brick_icon(int num, int n_solid) +{ + const static int rows[6] = {_____, X____, XX___, XXX__, XXXX_, XXXXX}; + set_repeated_icon(num, rows[n_solid < 0 ? 0 + : n_solid > 5 ? 5 + : n_solid]); +} +#endif + +static int +icon_char(int num, int alternate) +{ + return have_icons() ? 0x80 | num | A_ALTCHARSET : alternate; +} + +static void +put_icon(int num, char alternate) +{ + addch(icon_char(num, alternate)); +} + +#if 0 +static void +bar_graph(int n_chars, int n_pixels) +{ + int i; + + if (n_pixels < 0) { + n_pixels = 0; + } else if (n_pixels > n_chars * 5) { + n_pixels = n_chars * 5; + } + + if (n_pixels > 5) { + set_brick_icon(0, 5); + for (i = 0; i < n_pixels / 5; i++) { + put_icon(0, "#"); + } + } + if (n_pixels % 5) { + set_brick_icon(1, n_pixels % 5); + put_icon(1, "#"); + } +} +#endif + +static void +put_flows(void *pfd_) +{ + struct put_flows_data *pfd = pfd_; + static struct rconn_packet_counter *counter; + char host[64]; + + if (!counter) { + counter = rconn_packet_counter_create(); + } + + if (!pfd->xid) { + struct ofp_stats_request *rq; + struct ofp_aggregate_stats_request *asr; + struct ofpbuf *b; + + pfd->xid = random_uint32(); + rq = make_openflow_xid(sizeof *rq, OFPT_STATS_REQUEST, + pfd->xid, &b); + rq->type = htons(OFPST_AGGREGATE); + rq->flags = htons(0); + asr = ofpbuf_put_uninit(b, sizeof *asr); + memset(asr, 0, sizeof *asr); + asr->match.wildcards = htonl(OFPFW_ALL); + asr->table_id = 0xff; + asr->out_port = htons(OFPP_NONE); + update_openflow_length(b); + rconn_send_with_limit(pfd->rconn, b, counter, 10); + } + + if (!pfd->got_reply) { + int i; + + rconn_run(pfd->rconn); + for (i = 0; i < 50; i++) { + struct ofpbuf *b; + + b = rconn_recv(pfd->rconn); + if (!b) { + break; + } + + parse_flow_reply(b->data, pfd); + ofpbuf_delete(b); + if (pfd->got_reply) { + break; + } + } + } + + gethostname(host, sizeof host); + host[sizeof host - 1] = '\0'; + if (strlen(host) + 6 <= 16) { + addf("Host: %s\n", host); + } else { + addf("%s\n", host); + } + if (pfd->got_reply) { + addf("Flows: %"PRIu32, pfd->flow_count); + } + + if (!pfd->got_reply) { + rconn_run_wait(pfd->rconn); + rconn_recv_wait(pfd->rconn); + } +} + +static void +show_flows(struct rconn *rconn) +{ + static struct message *m; + static struct put_flows_data pfd; + + memset(&pfd, 0, sizeof pfd); + pfd.rconn = rconn; + emit_function(&m, P_STATUS, put_flows, &pfd); + +} + +struct put_dpid_ip_data { + struct rconn *rconn; + uint32_t xid; + uint64_t dpid; + char ip[16]; + bool got_reply; +}; + +static void +parse_dp_reply(void *data, struct put_dpid_ip_data *pdid) +{ + struct ofp_switch_features *osf; + struct ofp_header *oh; + + oh = data; + if (ntohs(oh->length) < sizeof *osf) { + VLOG_WARN("reply is too short (%"PRIu16")", ntohs(oh->length)); + return; + } + if (oh->xid != pdid->xid) { + VLOG_WARN("xid 0x%08"PRIx32" != expected 0x%08"PRIx32, + oh->xid, pdid->xid); + return; + } + if (oh->type != OFPT_FEATURES_REPLY) { + VLOG_WARN("reply is wrong type %"PRIu8, oh->type); + return; + } + + osf = data; + pdid->dpid = ntohll(osf->datapath_id); + pdid->got_reply = true; +} + +static void +put_dpid_id(void *pdid_) +{ + struct put_dpid_ip_data *pdid = pdid_; + static struct rconn_packet_counter *counter; + + if (!counter) { + counter = rconn_packet_counter_create(); + } + + if (!pdid->xid) { + struct ofp_header *oh; + struct ofpbuf *b; + + pdid->xid = random_uint32(); + oh = make_openflow_xid(sizeof *oh, OFPT_FEATURES_REQUEST, + pdid->xid, &b); + rconn_send_with_limit(pdid->rconn, b, counter, 10); + } + + if (!pdid->got_reply) { + int i; + + rconn_run(pdid->rconn); + for (i = 0; i < 50; i++) { + struct ofpbuf *b; + + b = rconn_recv(pdid->rconn); + if (!b) { + break; + } + + parse_dp_reply(b->data, pdid); + ofpbuf_delete(b); + if (pdid->got_reply) { + break; + } + } + } + + addf("DP: "); + if (pdid->got_reply) { + addf("%012"PRIx64, pdid->dpid); + } + addf("\nIP: %s", pdid->ip); + + if (!pdid->got_reply) { + rconn_run_wait(pdid->rconn); + rconn_recv_wait(pdid->rconn); + } +} + +static void +show_dpid_ip(struct rconn *rconn, const struct dict *dict) +{ + static struct message *m; + static struct put_dpid_ip_data pdid; + const char *is_connected, *local_ip; + + dict_lookup(dict, "local.is-connected", &is_connected); + dict_lookup(dict, "in-band.local-ip", &local_ip); + if (!is_connected && !local_ip) { + /* If we're not connected to the datapath and don't have a local IP, + * then we won't have anything useful to show anyhow. */ + return; + } + + memset(&pdid, 0, sizeof pdid); + pdid.rconn = rconn; + ovs_strlcpy(pdid.ip, local_ip ? local_ip : "", sizeof pdid.ip); + emit_function(&m, P_STATUS, put_dpid_id, &pdid); +} + +static size_t +dict_find(const struct dict *dict, const char *name) +{ + size_t i; + + for (i = 0; i < dict->n; i++) { + const struct pair *p = &dict->pairs[i]; + if (!strcmp(p->name, name)) { + return i; + } + } + + return SIZE_MAX; +} + +static bool +dict_lookup(const struct dict *dict, const char *name, const char **value) +{ + size_t idx = dict_find(dict, name); + if (idx != SIZE_MAX) { + *value = dict->pairs[idx].value; + return true; + } else { + *value = NULL; + return false; + } +} + +static const char * +dict_get(const struct dict *dict, const char *name) +{ + const char *value; + return dict_lookup(dict, name, &value) ? value : NULL; +} + +static int +dict_get_int(const struct dict *dict, const char *name, int def) +{ + const char *value; + return dict_lookup(dict, name, &value) ? atoi(value) : def; +} + +static bool +dict_get_bool(const struct dict *dict, const char *name, bool def) +{ + const char *value; + if (dict_lookup(dict, name, &value)) { + if (!strcmp(value, "true")) { + return true; + } + if (!strcmp(value, "false")) { + return false; + } + } + return def; +} + +static const char * +dict_get_string(const struct dict *dict, const char *name, const char *def) +{ + const char *value; + return dict_lookup(dict, name, &value) ? value : def; +} + +static uint32_t +dict_get_ip(const struct dict *dict, const char *name) +{ + struct in_addr in; + return (inet_aton(dict_get_string(dict, name, ""), &in) ? in.s_addr + : htonl(0)); +} + +static void +addf(const char *format, ...) +{ + char buf[128]; + va_list args; + + va_start(args, format); + vsnprintf(buf, sizeof buf, format, args); + va_end(args); + + addstr(buf); +} + +static void +show_secchan_state(const struct dict *dict) +{ + static struct message *msg; + const char *is_connected; + + if (!dict_lookup(dict, "remote.is-connected", &is_connected)) { + /* Secchan not running or not responding. */ + emit(&msg, P_ERROR, "Switch disabled"); + } +} + +static const char * +discovery_state_label(const char *name) +{ + static struct dict *states; + if (!states) { + states = xmalloc(sizeof *states); + dict_init(states); + dict_add(states, "INIT", "Init"); + dict_add(states, "INIT_REBOOT", "Init"); + dict_add(states, "REBOOTING", "Init"); + dict_add(states, "SELECTING", "Searching"); + dict_add(states, "REQUESTING", "Requesting"); + dict_add(states, "BOUND", "Got"); + dict_add(states, "RENEWING", "Renewing"); + dict_add(states, "REBINDING", "Rebinding"); + dict_add(states, "RELEASED", "Released"); + } + return dict_get_string(states, name, "Error"); +} + +static void +show_discovery_state(const struct dict *dict) +{ + static struct message *m_bound, *m_other; + struct message **m; + const char *state, *ip; + enum priority priority; + int state_elapsed; + + state = dict_get_string(dict, "discovery.state", NULL); + if (!state) { + return; + } + ip = dict_get_string(dict, "discovery.ip", NULL); + state_elapsed = dict_get_int(dict, "discovery.state-elapsed", 0); + + if (!strcmp(state, "BOUND")) { + m = &m_bound; + priority = P_STATUS; + } else { + m = &m_other; + priority = P_PROGRESS; + } + emit(m, priority, "Discovery %s\n%s", + progress(), discovery_state_label(state)); + if (ip) { + emit(m, priority, " %s", ip); + } +} + +static void +human_time(int seconds, char *buf, size_t size) +{ + const char *sign = ""; + if (seconds < 0) { + sign = "-"; + seconds = seconds == INT_MIN ? INT_MAX : -seconds; + } + + if (seconds <= 60) { + snprintf(buf, size, "%s%d s", sign, seconds); + } else if (seconds <= 60 * 60) { + snprintf(buf, size, "%s%d min", sign, seconds / 60); + } else if (seconds <= 60 * 60 * 24 * 2) { + snprintf(buf, size, "%s%d h", sign, seconds / 60 / 60); + } else { + snprintf(buf, size, "%s%d days", sign, seconds / 60 / 60 / 24); + } +} + +static void +show_fail_open_state(const struct dict *dict) +{ + static struct message *m; + int cur_duration, trigger_duration; + + if (!dict_get_bool(dict, "fail-open.triggered", false)) { + return; + } + trigger_duration = dict_get_int(dict, "fail-open.trigger-duration", 0); + cur_duration = dict_get_int(dict, "fail-open.current-duration", 0); + if (shown(&m) < 5) { + emit(&m, P_WARNING, "Failed open %s\nafter %d secs", + progress(), trigger_duration); + } else { + char buf[16]; + human_time(cur_duration - trigger_duration, buf, sizeof buf); + emit(&m, P_WARNING, "In fail open for\n%s now %s", buf, progress()); + } +} + +static const char * +progress(void) +{ + return "..." + (3 - (unsigned int) time_now() % 4); +} + +static void +show_remote_state(const struct dict *dict) +{ + bool debug_mode = dict_get_bool(dict, "debug", false); + const char *state, *is_connected; + + state = dict_get_string(dict, "remote.state", NULL); + if (!state) { + return; + } + is_connected = dict_get_string(dict, "remote.is-connected", "false"); + if (!strcmp(is_connected, "true")) { + if (debug_mode) { + static struct message *m_connected; + char buf[16]; + human_time(dict_get_int(dict, "remote.last-connection", 0), + buf, sizeof buf); + emit(&m_connected, P_STATUS, + "Connected for\nlast %s %s", buf, progress()); + } + + if (!strcmp(state, "IDLE")) { + static struct message *m_idle; + emit(&m_idle, P_PROGRESS, "Sent idle probe"); + } + + if (debug_mode) { + const char *name = dict_get_string(dict, "remote.name", NULL); + if (name) { + static struct message *m_name; + emit(&m_name, P_STATUS, "Connected to\n%s", name); + } + } + } else { + int elapsed, backoff; + const char *name, *error; + + elapsed = dict_get_int(dict, "remote.state-elapsed", 0); + backoff = dict_get_int(dict, "remote.backoff", 0); + name = dict_get_string(dict, "remote.name", "unknown"); + state = dict_get_string(dict, "remote.state", "VOID"); + error = dict_get_string(dict, "remote.last-connect-error", NULL); + if (!strcmp(state, "VOID")) { + static struct message *m; + emit(&m, P_PROGRESS, "Controller not\nfound"); + } else if (!strcmp(state, "BACKOFF")) { + static struct message *m[3]; + char buf[16]; + + if (error) { + emit(&m[0], P_PROGRESS, "Connect failed:\n%s", error); + } + emit(&m[2], P_STATUS, "Last connected\n%s ago", buf); + emit(&m[1], P_PROGRESS, + "Disconnected\nReconnect in %d", backoff - elapsed); + human_time(dict_get_int(dict, "remote.last-connection", 0), + buf, sizeof buf); + } else if (!strcmp(state, "CONNECTING")) { + static struct message *m; + emit(&m, P_PROGRESS, "Connecting %s\n%s", progress(), name); + } + } +} + +static void +fetch_status(struct rconn *rconn, struct dict *dict, long long timeout) +{ + static struct rconn_packet_counter *counter; + static uint32_t xid; + struct nicira_header *rq; + struct ofpbuf *b; + int retval; + + if (!counter) { + counter = rconn_packet_counter_create(); + } + if (!xid) { + xid = random_uint32(); + } + + rq = make_openflow_xid(sizeof *rq, OFPT_VENDOR, ++xid, &b); + rq->vendor = htonl(NX_VENDOR_ID); + rq->subtype = htonl(NXT_STATUS_REQUEST); + retval = rconn_send_with_limit(rconn, b, counter, 10); + if (retval) { + /* continue into the loop so that we pause for a while */ + } + + while (time_msec() < timeout) { + int i; + + rconn_run(rconn); + + for (i = 0; i < 50; i++) { + struct ofpbuf *b; + bool got_reply; + + b = rconn_recv(rconn); + if (!b) { + break; + } + + got_reply = parse_reply(b->data, dict, xid); + ofpbuf_delete(b); + if (got_reply) { + return; + } + } + + rconn_run_wait(rconn); + rconn_recv_wait(rconn); + poll_timer_wait(timeout - time_msec()); + poll_block(); + } +} + +static bool +parse_reply(void *data, struct dict *dict, uint32_t xid) +{ + struct ofp_header *oh; + struct nicira_header *rpy; + + oh = data; + if (ntohs(oh->length) < sizeof *rpy) { + VLOG_WARN("reply is too short (%"PRIu16")", ntohs(oh->length)); + return false; + } + if (oh->xid != xid) { + VLOG_WARN("xid 0x%08"PRIx32" != expected 0x%08"PRIx32, oh->xid, xid); + return false; + } + if (oh->type != OFPT_VENDOR) { + VLOG_WARN("reply is wrong type %"PRIu8, oh->type); + return false; + } + + rpy = data; + if (rpy->vendor != htonl(NX_VENDOR_ID)) { + VLOG_WARN("reply has wrong vendor ID %08"PRIx32, rpy->vendor); + return false; + } + if (rpy->subtype != htonl(NXT_STATUS_REPLY)) { + VLOG_WARN("reply has wrong subtype %08"PRIx32, rpy->subtype); + return false; + } + + dict_parse(dict, (const char *) (rpy + 1), + ntohs(oh->length) - sizeof *rpy); + return true; +} + +static void +dict_parse(struct dict *dict, const char *data, size_t nbytes) +{ + char *save_ptr = NULL; + char *copy, *name; + + copy = xmemdup0(data, nbytes); + for (name = strtok_r(copy, "=", &save_ptr); name; + name = strtok_r(NULL, "=", &save_ptr)) + { + char *value = strtok_r(NULL, "\n", &save_ptr); + if (!value) { + break; + } + dict_add(dict, name, value); + } + free(copy); +} + +static void +dict_init(struct dict *dict) +{ + dict->n = 0; + dict->max = 16; + dict->pairs = xmalloc(sizeof *dict->pairs * dict->max); +} + +static void +dict_add(struct dict *dict, const char *name, const char *value) +{ + dict_add_nocopy(dict, xstrdup(name), xstrdup(value)); +} + +static void +dict_add_nocopy(struct dict *dict, char *name, char *value) +{ + struct pair *p; + + if (dict->n >= dict->max) { + dict->max *= 2; + dict->pairs = xrealloc(dict->pairs, sizeof *dict->pairs * dict->max); + } + p = &dict->pairs[dict->n++]; + p->name = name; + p->value = value; +} + +static void +dict_delete(struct dict *dict, const char *name) +{ + size_t idx; + while ((idx = dict_find(dict, name)) != SIZE_MAX) { + struct pair *pair = &dict->pairs[idx]; + free(pair->name); + free(pair->value); + dict->pairs[idx] = dict->pairs[--dict->n]; + } +} + +static void +dict_free(struct dict *dict) +{ + if (dict) { + size_t i; + + for (i = 0; i < dict->n; i++) { + free(dict->pairs[i].name); + free(dict->pairs[i].value); + } + free(dict->pairs); + } +} + +static void +initialize_terminal(void) +{ + initscr(); + cbreak(); + noecho(); + nonl(); + intrflush(stdscr, FALSE); + keypad(stdscr, TRUE); + nodelay(stdscr, TRUE); + typeahead(-1); + scrollok(stdscr, TRUE); +} + +static void +restore_terminal(void *aux UNUSED) +{ + endwin(); +} + +struct byte_count { + long long int when; + uint64_t tx_bytes; +}; + +struct show_rates_data { + struct rconn *rconn; + uint32_t xid; + struct byte_count prev, now; + bool got_reply; +}; + +static void +parse_port_reply(void *data, struct show_rates_data *rates) +{ + struct ofp_header *oh; + struct ofp_stats_reply *rpy; + struct ofp_port_stats *ops; + size_t n_ports; + size_t i; + + oh = data; + if (ntohs(oh->length) < sizeof *rpy) { + VLOG_WARN("reply is too short (%"PRIu16")", ntohs(oh->length)); + return; + } + if (oh->xid != rates->xid) { + VLOG_WARN("xid 0x%08"PRIx32" != expected 0x%08"PRIx32, + oh->xid, rates->xid); + return; + } + if (oh->type != OFPT_STATS_REPLY) { + VLOG_WARN("reply is wrong type %"PRIu8, oh->type); + return; + } + + rpy = data; + if (rpy->type != htons(OFPST_PORT)) { + VLOG_WARN("reply has wrong stat type ID %08"PRIx16, rpy->type); + return; + } + + n_ports = ((ntohs(oh->length) - offsetof(struct ofp_stats_reply, body)) + / sizeof *ops); + ops = (struct ofp_port_stats *) rpy->body; + rates->prev = rates->now; + rates->now.when = time_msec(); + rates->now.tx_bytes = UINT64_MAX; + for (i = 0; i < n_ports; i++, ops++) { + if (ops->tx_bytes != htonll(UINT64_MAX)) { + if (rates->now.tx_bytes == UINT64_MAX) { + rates->now.tx_bytes = 0; + } + rates->now.tx_bytes += ntohll(ops->tx_bytes); + } + } + rates->got_reply = true; +} + +static void +dump_graph(const bool graph[80]) +{ + signed char icons[32]; + int n_icons = 3; + int i; + + memset(icons, -1, sizeof icons); + for (i = 0; i < 16; i++) { + uint8_t row; + int j; + + row = 0; + for (j = 0; j < 5; j++) { + row = (row << 1) | graph[i * 5 + j]; + } + if (!row) { + addch(' '); + continue; + } + + if (icons[row] < 0) { + if (n_icons >= 8) { + addch('X'); + continue; + } + set_repeated_icon(n_icons, row); + icons[row] = n_icons++; + } + put_icon(icons[row], row == 0x1f ? '#' : ' '); + } +} + +static void +do_show_data_rates(void *rates_) +{ + struct show_rates_data *rates = rates_; + static struct rconn_packet_counter *counter; + bool graph[80]; + + if (!counter) { + counter = rconn_packet_counter_create(); + } + if (!rates->xid) { + struct ofp_stats_request *rq; + struct ofpbuf *b; + + rates->xid = random_uint32(); + rq = make_openflow_xid(sizeof *rq, OFPT_STATS_REQUEST, + rates->xid, &b); + rq->type = htons(OFPST_PORT); + rq->flags = htons(0); + rconn_send_with_limit(rates->rconn, b, counter, 10); + } + + if (!rates->got_reply) { + int i; + + rconn_run(rates->rconn); + for (i = 0; i < 50; i++) { + struct ofpbuf *b; + + b = rconn_recv(rates->rconn); + if (!b) { + break; + } + + parse_port_reply(b->data, rates); + ofpbuf_delete(b); + if (rates->got_reply) { + break; + } + } + } + + set_icon(0, + e_____, + e_____, + e_____, + e__X__, + e__X__, + e__X_X, + e__XX_, + e__X_X); + set_icon(1, + e_____, + e_____, + e_____, + eX___X, + eXX_XX, + eX_X_X, + eX___X, + eX___X); + set_icon(2, + e_____, + e_____, + e_____, + e_XXX_, + eX____, + eX_XXX, + eX___X, + e_XXX_); + + memset(graph, 0, sizeof graph); + graph[24] = 1; + graph[48] = 1; + graph[72] = 1; + + addstr("TX: "); + put_icon(0, 'k'); + addstr(" "); + put_icon(1, 'M'); + addstr(" "); + put_icon(2, 'G'); + addch('\n'); + + if (rates->now.tx_bytes != UINT64_MAX + && rates->prev.tx_bytes != UINT64_MAX + && rates->now.when - rates->prev.when > 500 + && time_msec() - rates->now.when < 2000) + { + uint64_t bits = (rates->now.tx_bytes - rates->prev.tx_bytes) * 8; + uint64_t msecs = rates->now.when - rates->prev.when; + double bps = (double) bits * 1000.0 / msecs; + int pixels = bps > 0 ? log(bps) / log(10.0) * 8 + .5 : 0; + if (pixels < 0) { + pixels = 0; + } else if (pixels > 80) { + pixels = 80; + } + memset(graph, 1, pixels); + } + + dump_graph(graph); + + if (!rates->got_reply) { + rconn_run_wait(rates->rconn); + rconn_recv_wait(rates->rconn); + } +} + +static void +show_data_rates(struct rconn *rconn, const struct dict *dict) +{ + static struct message *m; + static struct show_rates_data rates; + const char *is_connected, *local_ip; + static bool inited = false; + + dict_lookup(dict, "local.is-connected", &is_connected); + dict_lookup(dict, "in-band.local-ip", &local_ip); + if (!is_connected && !local_ip) { + /* If we're not connected to the datapath and don't have a local IP, + * then we won't have anything useful to show anyhow. */ + return; + } + + rates.rconn = rconn; + rates.xid = 0; + rates.got_reply = false; + if (!inited) { + rates.now.tx_bytes = UINT64_MAX; + rates.prev.tx_bytes = UINT64_MAX; + inited = true; + } + emit_function(&m, P_STATUS, do_show_data_rates, &rates); +} + +struct message { + /* Content. */ + void (*function)(void *aux); + void *aux; + char string[128]; + + size_t index; + enum priority priority; + int age; + int shown; +}; + +static struct message **messages; +static size_t n_messages, allocated_messages; + +static struct message * +allocate_message(struct message **msgp) +{ + if (!*msgp) { + /* Allocate and initialize message. */ + *msgp = xcalloc(1, sizeof **msgp); + (*msgp)->index = n_messages; + + /* Add to list of messages. */ + if (n_messages >= allocated_messages) { + allocated_messages = 2 * allocated_messages + 1; + messages = xrealloc(messages, + sizeof *messages * allocated_messages); + } + messages[n_messages++] = *msgp; + } + return *msgp; +} + +static void +emit(struct message **msgp, enum priority priority, const char *format, ...) +{ + struct message *msg = allocate_message(msgp); + va_list args; + size_t length; + + msg->priority = priority; + + va_start(args, format); + length = strlen(msg->string); + vsnprintf(msg->string + length, sizeof msg->string - length, format, args); + va_end(args); +} + +static void +emit_function(struct message **msgp, enum priority priority, + void (*function)(void *aux), void *aux) +{ + struct message *msg = allocate_message(msgp); + msg->priority = priority; + msg->function = function; + msg->aux = aux; +} + +static int +shown(struct message **msgp) +{ + struct message *msg = allocate_message(msgp); + return msg->shown; +} + +static void +clear_messages(void) +{ + size_t i; + + for (i = 0; i < n_messages; i++) { + struct message *msg = messages[i]; + msg->string[0] = '\0'; + msg->function = NULL; + } +} + +static struct message * +best_message(void) +{ + struct message *best_msg; + int best_score; + size_t i; + + best_score = INT_MIN; + best_msg = NULL; + for (i = 0; i < n_messages; i++) { + struct message *msg = messages[i]; + int score; + + if (empty_message(msg)) { + continue; + } + + score = msg->priority; + if (!msg->shown) { + score += msg->age; + } else { + score -= msg->shown; + } + if (score > best_score) { + best_score = score; + best_msg = msg; + } + } + return best_msg; +} + +static void +message_shown(struct message *msg) +{ + if (msg && msg->shown++ > 3600) { + msg->shown = 0; + } +} + +static bool +empty_message(const struct message *msg) +{ + return !msg || (!msg->string[0] && !msg->function); +} + +static struct message *get_message(size_t index) +{ + assert(index <= n_messages || index == SIZE_MAX); + return (index < n_messages ? messages[index] + : index == SIZE_MAX ? messages[n_messages - 1] + : messages[0]); +} + +static struct message * +next_message(struct message *msg) +{ + struct message *p; + + for (p = get_message(msg->index + 1); p != msg; + p = get_message(p->index + 1)) { + if (!empty_message(p)) { + break; + } + } + return p; +} + +static struct message * +prev_message(struct message *msg) +{ + struct message *p; + + for (p = get_message(msg->index - 1); p != msg; + p = get_message(p->index - 1)) { + if (!empty_message(p)) { + break; + } + } + return p; +} + +static void +put_message(const struct message *m) +{ + if (m->string[0]) { + addstr(m->string); + } else if (m->function) { + m->function(m->aux); + } +} + +static void +age_messages(void) +{ + size_t i; + int load; + + load = 0; + for (i = 0; i < n_messages; i++) { + struct message *msg = messages[i]; + if (!empty_message(msg)) { + load++; + } + } + + for (i = 0; i < n_messages; i++) { + struct message *msg = messages[i]; + if (empty_message(msg)) { + msg->age = msg->shown = 0; + } else { + if (msg->age && msg->age % 60 == 0) { + msg->shown -= MAX(0, 5 - (load + 6) / 12); + if (msg->shown < 0) { + msg->shown = 0; + } + } + if (msg->age++ > 3600) { + msg->age = 0; + } + } + } +} + +/* Set by SIGUSR1 handler. */ +static volatile sig_atomic_t sigusr1_triggered; + +/* The time after which we stop indicating that the switch is rebooting. + * (This is just in case the reboot fails.) */ +static time_t reboot_deadline = TIME_MIN; + +static void sigusr1_handler(int); + +static void +init_reboot_notifier(void) +{ + signal(SIGUSR1, sigusr1_handler); +} + +static void +sigusr1_handler(int signr UNUSED) +{ + sigusr1_triggered = true; +} + +static bool +show_reboot_state(void) +{ + if (sigusr1_triggered) { + reboot_deadline = time_now() + 30; + sigusr1_triggered = false; + } + if (time_now() < reboot_deadline) { + static struct message *msg; + emit(&msg, P_FATAL, "Rebooting"); + return true; + } + return false; +} + +struct menu_item { + char *text; + void (*f)(const struct dict *); + int id; + bool enabled; + int toggle; +}; + +struct menu { + struct menu_item **items; + size_t n_items, allocated_items; +}; + +static void menu_init(struct menu *); +static void menu_free(struct menu *); +static struct menu_item *menu_add_item(struct menu *, const char *text, ...) + PRINTF_FORMAT(2, 3); +static int menu_show(const struct menu *, int start, bool select); + +static void cmd_shell(const struct dict *); +static void cmd_show_version(const struct dict *); +static void cmd_configure(const struct dict *); +static void cmd_setup_pki(const struct dict *); +static void cmd_browse_status(const struct dict *); +static void cmd_show_motto(const struct dict *); + +static void +menu_init(struct menu *menu) +{ + memset(menu, 0, sizeof *menu); +} + +static void +menu_free(struct menu *menu) +{ + size_t i; + + for (i = 0; i < menu->n_items; i++) { + struct menu_item *item = menu->items[i]; + free(item->text); + free(item); + } + free(menu->items); +} + +static struct menu_item * +menu_add_item(struct menu *menu, const char *text, ...) +{ + struct menu_item *item; + va_list args; + + if (menu->n_items >= menu->allocated_items) { + menu->allocated_items = 2 * menu->allocated_items + 1; + menu->items = xrealloc(menu->items, + sizeof *menu->items * menu->allocated_items); + } + item = menu->items[menu->n_items++] = xmalloc(sizeof *item); + va_start(args, text); + item->text = xvasprintf(text, args); + va_end(args); + item->f = NULL; + item->id = -1; + item->enabled = true; + item->toggle = -1; + return item; +} + +static void +menu(const struct dict *dict) +{ + bool debug_mode = dict_get_bool(dict, "debug", false); + struct menu menu; + int choice; + + menu_init(&menu); + menu_add_item(&menu, "Exit"); + menu_add_item(&menu, "Show Version")->f = cmd_show_version; + menu_add_item(&menu, "Configure")->f = cmd_configure; + menu_add_item(&menu, "Setup PKI")->f = cmd_setup_pki; + if (debug_mode) { + menu_add_item(&menu, "Browse Status")->f = cmd_browse_status; + menu_add_item(&menu, "Shell")->f = cmd_shell; + menu_add_item(&menu, "Show Motto")->f = cmd_show_motto; + } + + choice = menu_show(&menu, 0, true); + if (choice >= 0) { + void (*f)(const struct dict *) = menu.items[choice]->f; + if (f) { + (f)(dict); + } + } + + menu_free(&menu); +} + +static int +menu_show(const struct menu *menu, int start, bool select) +{ + long long int adjust = LLONG_MAX; + int min = 0, max = MAX(menu->n_items - 2, 0); + int pos, selection; + set_icon(0, + eXX___, + eXXX__, + eXXXX_, + eXXXXX, + eXXXX_, + eXXX__, + eXX___, + e_____); + set_icon(1, + eXXXXX, + eX___X, + eX___X, + eX___X, + eX___X, + eX___X, + eXXXXX, + e_____); + set_icon(2, + eXXXXX, + eX___X, + eXX_XX, + eX_X_X, + eXX_XX, + eX___X, + eXXXXX, + e_____); + if (menu->n_items) { + pos = MIN(menu->n_items - 1, MAX(0, start)); + selection = pos; + } else { + pos = 0; + selection = -1; + } + for (;;) { + int key; + + while ((key = getch()) != ERR) { + switch (key) { + case KEY_UP: + if (select && selection > 0) { + selection--; + if (selection >= pos) { + break; + } + } + if (pos >= min) { + pos--; + } + break; + + case KEY_DOWN: + if (select && selection < menu->n_items - 1) { + selection++; + if (selection <= pos + 1) { + break; + } + } + if (pos <= max) { + pos++; + } + break; + + case '\r': case '\n': + if (select && selection >= 0 && selection < menu->n_items) { + struct menu_item *item = menu->items[selection]; + if (!item->enabled) { + show_string("Item disabled"); + break; + } else if (item->toggle >= 0) { + item->toggle = !item->toggle; + break; + } + } + return selection; + + case '\b': case '\x7f': case '\x1b': + case KEY_BACKSPACE: case KEY_DC: + return -1; + } + adjust = time_msec() + 1000; + } + if (time_msec() >= adjust && menu->n_items > 1) { + if (pos < min) { + pos = min; + } else if (pos > max) { + pos = max; + } + } + + erase(); + curs_set(0); + move(0, 0); + if (!menu->n_items) { + addstr("[Empty]"); + } else { + int idx; + for (idx = pos; idx < pos + 2; idx++) { + size_t width = 40; + + if (select) { + width--; + if (selection == idx) { + put_icon(0, '>'); + } else { + addch(' '); + } + } + + if (idx < 0) { + addstr("[Top]"); + } else if (idx >= menu->n_items) { + addstr("[Bottom]"); + } else { + const struct menu_item *item = menu->items[idx]; + size_t length = strlen(item->text); + if (!item->enabled) { + width -= 2; + addch('('); + } + if (item->toggle >= 0) { + if (have_icons()) { + addch(icon_char(item->toggle ? 2 : 1, 0)); + width--; + } else { + addstr(item->toggle ? "[X]" : "[ ]"); + width -= 3; + } + } + addnstr(item->text, MIN(width, length)); + if (!item->enabled) { + addch(')'); + } + } + if (idx == pos) { + addch('\n'); + } + } + } + refresh(); + + if (pos < min || pos > max) { + poll_timer_wait(adjust - time_msec()); + } + poll_fd_wait(STDIN_FILENO, POLLIN); + poll_block(); + } +} + +static int +menu_show2(const struct menu *menu, int start, bool select) +{ + int pos; + if (menu->n_items) { + pos = MIN(menu->n_items - 1, MAX(0, start)); + } else { + pos = -1; + } + set_icon(0, + e__X__, + e_XXX_, + eXXXXX, + e__X__, + e__X__, + e__X__, + e__X__, + e__X__); + set_icon(1, + e__X__, + e__X__, + e__X__, + e__X__, + e__X__, + eXXXXX, + e_XXX_, + e__X__); + for (;;) { + int key; + + while ((key = getch()) != ERR) { + switch (key) { + case KEY_UP: + if (pos > 0) { + pos--; + } + break; + + case KEY_DOWN: + if (menu->n_items > 0 && pos < menu->n_items - 1) { + pos++; + } + break; + + case '\r': case '\n': + if (select && !menu->items[pos]->enabled) { + show_string("Item disabled"); + break; + } + return pos; + + case '\b': case '\x7f': case '\x1b': + case KEY_BACKSPACE: case KEY_DC: + return -1; + } + } + + erase(); + curs_set(0); + move(0, 0); + if (pos == -1) { + addstr("[Empty]"); + } else { + const struct menu_item *item = menu->items[pos]; + const char *line1 = item->text; + size_t len1 = strcspn(line1, "\n"); + const char *line2 = line1[len1] ? &line1[len1 + 1] : ""; + size_t len2 = strcspn(line2, "\n"); + size_t width = 39 - 2 * !item->enabled; + + /* First line. */ + addch(pos > 0 ? icon_char(0, '^') : ' '); + if (!item->enabled && len1) { + addch('('); + } + addnstr(line1, MIN(len1, width)); + if (!item->enabled && len1) { + addch(')'); + } + addch('\n'); + + /* Second line. */ + addch(pos < menu->n_items - 1 ? icon_char(1, 'V') : ' '); + if (!item->enabled && len2) { + addch('('); + } + addnstr(line2, MIN(len2, width)); + if (!item->enabled && len2) { + addch(')'); + } + } + refresh(); + + poll_fd_wait(STDIN_FILENO, POLLIN); + poll_block(); + } +} + +static bool +yesno(const char *title, bool def) +{ + bool answer = def; + + set_icon(0, + eXX___, + eXXX__, + eXXXX_, + eXXXXX, + eXXXX_, + eXXX__, + eXX___, + e_____); + + for (;;) { + int key; + + while ((key = getch()) != ERR) { + switch (key) { + case KEY_UP: + case KEY_DOWN: + case KEY_LEFT: + case KEY_RIGHT: + answer = !answer; + break; + + case 'y': case 'Y': + answer = true; + break; + + case 'n': case 'N': + answer = false; + break; + + case '\r': case '\n': + return answer; + } + } + + erase(); + curs_set(0); + move(0, 0); + addstr(title); + + move(0, 12); + addch(answer ? icon_char(0, '>') : ' '); + addstr("Yes"); + + move(1, 12); + addch(!answer ? icon_char(0, '>') : ' '); + addstr("No"); + + refresh(); + + poll_fd_wait(STDIN_FILENO, POLLIN); + poll_block(); + } +} + +static void +cmd_show_version(const struct dict *dict UNUSED) +{ + show_string(VERSION BUILDNR); +} + +static void +cmd_browse_status(const struct dict *dict) +{ + struct menu menu; + size_t i; + + menu_init(&menu); + for (i = 0; i < dict->n; i++) { + const struct pair *p = &dict->pairs[i]; + menu_add_item(&menu, "%s = %s", p->name, p->value); + } + menu_show(&menu, 0, false); + menu_free(&menu); +} + +static void +cmd_shell(const struct dict *dict UNUSED) +{ + const char *home; + + erase(); + refresh(); + endwin(); + + printf("Type ^D to exit\n"); + fflush(stdout); + + putenv("PS1=#"); + putenv("PS2=>"); + putenv("PS3=?"); + putenv("PS4=+"); + home = getenv("HOME"); + if (home) { + chdir(home); + } + system("/bin/sh"); + initialize_terminal(); +} + +static void +cmd_show_motto(const struct dict *dict UNUSED) +{ + show_string("\"Just Add Ice\""); +} + +static void +show_string(const char *string) +{ + VLOG_INFO("%s", string); + erase(); + curs_set(0); + move(0, 0); + addstr(string); + refresh(); + block_until(time_msec() + 5000); +} + +static void +block_until(long long timeout) +{ + while (timeout > time_msec()) { + poll_timer_wait(timeout - time_msec()); + poll_block(); + } + drain_keyboard_buffer(); +} + +static void +drain_keyboard_buffer(void) +{ + while (getch() != ERR) { + continue; + } +} + +static int +read_vars(const char *cmd, struct dict *dict) +{ + struct ds ds; + FILE *stream; + int status; + + stream = popen(cmd, "r"); + if (!stream) { + VLOG_ERR("popen(\"%s\") failed: %s", cmd, strerror(errno)); + return errno; + } + + dict_init(dict); + ds_init(&ds); + while (!ds_get_line(&ds, stream)) { + const char *s = ds_cstr(&ds); + const char *equals = strchr(s, '='); + if (equals) { + dict_add_nocopy(dict, + xmemdup0(s, equals - s), xstrdup(equals + 1)); + } + } + status = pclose(stream); + if (status) { + char *msg = process_status_msg(status); + VLOG_ERR("pclose(\"%s\") reported subprocess failure: %s", + cmd, msg); + free(msg); + dict_free(dict); + return ECHILD; + } + return 0; +} + +static bool +run_and_report_failure(char **argv, const char *title) +{ + int null_fds[3] = {0, 1, 2}; + int status; + int retval; + char *s; + + s = process_escape_args(argv); + VLOG_INFO("starting subprocess: %s", s); + free(s); + + retval = process_run(argv, NULL, 0, null_fds, 3, &status); + if (retval) { + char *s = xasprintf("%s:\n%s", title, strerror(retval)); + show_string(s); + free(s); + return false; + } else if (status) { + char *msg = process_status_msg(status); + char *s = xasprintf("%s:\n%s", title, msg); + show_string(s); + free(msg); + free(s); + return false; + } else { + VLOG_INFO("subprocess exited with status 0"); + return true; + } +} + +static int +do_load_config(const char *file_name, struct dict *dict) +{ + struct dict auto_vars; + int retval; + char *cmd; + size_t i; + + /* Get the list of the variables that the shell sets automatically. */ + retval = read_vars("set -a && env", &auto_vars); + if (retval) { + return retval; + } + + /* Get the variables from 'file_name'. */ + cmd = xasprintf("set -a && . '%s' && env", file_name); + retval = read_vars(cmd, dict); + free(cmd); + if (retval) { + dict_free(&auto_vars); + return retval; + } + + /* Subtract. */ + for (i = 0; i < auto_vars.n; i++) { + dict_delete(dict, auto_vars.pairs[i].name); + } + dict_free(&auto_vars); + return 0; +} + +static bool +load_config(struct dict *dict) +{ + static const char default_file[] = "/etc/default/openflow-switch"; + int retval = do_load_config(default_file, dict); + if (!retval) { + return true; + } else { + char *s = xasprintf("Cfg load failed:\n%s", strerror(retval)); + show_string(s); + free(s); + return false; + } +} + +static bool +save_config(const struct svec *settings) +{ + struct svec argv; + size_t i; + bool ok; + + VLOG_INFO("Saving configuration:"); + for (i = 0; i < settings->n; i++) { + VLOG_INFO("%s", settings->names[i]); + } + + svec_init(&argv); + svec_add(&argv, "/usr/share/openvswitch/commands/reconfigure"); + svec_append(&argv, settings); + svec_terminate(&argv); + ok = run_and_report_failure(argv.names, "Save failed"); + if (ok) { + long long int timeout = time_msec() + 5000; + + erase(); + curs_set(0); + move(0, 0); + addstr("Saved.\nRestarting..."); + refresh(); + + svec_clear(&argv); + svec_add(&argv, "/bin/sh"); + svec_add(&argv, "-c"); + svec_add(&argv, + "/etc/init.d/openflow-switch restart >/dev/null 2>&1"); + svec_terminate(&argv); + + ok = run_and_report_failure(argv.names, "Restart failed"); + if (ok) { + block_until(timeout); + } + } + svec_destroy(&argv); + + if (ok) { + VLOG_INFO("Save completed successfully"); + } else { + VLOG_WARN("Save failed"); + } + return ok; +} + +static int +match(pcre *re, const char *string, int length) +{ + int ovec[999]; + int retval; + + retval = pcre_exec(re, NULL, string, length, 0, PCRE_PARTIAL, + ovec, ARRAY_SIZE(ovec)); + if (retval >= 0) { + if (ovec[0] >= 0 && ovec[1] >= length) { + /* 're' matched all of 'string'. */ + return 0; + } else { + /* 're' matched the initial part of 'string' but not all of it. */ + return PCRE_ERROR_NOMATCH; + } + } else { + return retval; + } +} + +static void +figure_choices(pcre *re, struct ds *s, int pos, struct ds *choices) +{ + struct ds tmp; + int retval; + char c; + + ds_clear(choices); + + /* See whether the current string is a complete match. */ + if (!match(re, s->string, pos)) { + ds_put_char(choices, '\n'); + } + + /* Then try all the other possibilities. */ + ds_init(&tmp); + ds_put_buffer(&tmp, s->string, pos); + for (c = 0x20; c < 0x7f; c++) { + ds_put_char(&tmp, c); + retval = match(re, tmp.string, pos + 1); + if (retval == PCRE_ERROR_PARTIAL || !retval) { + ds_put_char(choices, c); + } + tmp.length--; + } + ds_destroy(&tmp); + + if (!choices->length) { + ds_put_char(choices, '\n'); + } +} + +static void +figure_completion(pcre *re, struct ds *s) +{ + for (;;) { + int found = -1; + int c; + + /* See whether the current string is a complete match. */ + if (!match(re, s->string, s->length)) { + return; + } + for (c = 0x20; c < 0x7f; c++) { + int retval; + + ds_put_char(s, c); + retval = match(re, s->string, s->length); + s->length--; + + if (retval == PCRE_ERROR_PARTIAL || !retval) { + if (found != -1) { + return; + } + found = c; + } + } + if (found == -1) { + return; + } + ds_put_char(s, found); + } +} + +#define OCTET_RE "([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])" +#define IP_RE "("OCTET_RE"\\."OCTET_RE"\\."OCTET_RE"\\."OCTET_RE")" +#define PORT_RE \ + "([0-9]|" \ + "[1-9][0-9]|" \ + "[1-9][0-9][0-9]|" \ + "[1-9][0-9][0-9][0-9]|" \ + "[1-5][0-9][0-9][0-9][0-9]|" \ + "6[1-4][0-9][0-9][0-9]|" \ + "65[1-4][0-9][0-9]|" \ + "655[1-2][0-9]|" \ + "6553[1-5])" +#define XOCTET_RE "[0-9A-F][0-9A-F]" +#define MAC_RE \ + XOCTET_RE":"XOCTET_RE":"XOCTET_RE":"\ + XOCTET_RE":"XOCTET_RE":"XOCTET_RE +#define NUM100_TO_99999_RE \ + "([1-9][0-9][0-9]|" \ + "[1-9][0-9][0-9][0-9]|" \ + "[1-9][0-9][0-9][0-9][0-9])" +#define NUM5_TO_99999_RE \ + "([5-9]|" \ + "[1-9][0-9]|" \ + "[1-9][0-9][0-9]|" \ + "[1-9][0-9][0-9][0-9]|" \ + "[1-9][0-9][0-9][0-9][0-9])" +#define NUM1_TO_99999_RE \ + "([1-9]|" \ + "[1-9][0-9]|" \ + "[1-9][0-9][0-9]|" \ + "[1-9][0-9][0-9][0-9]|" \ + "[1-9][0-9][0-9][0-9][0-9])" + +static char * +prompt(const char *prompt, const char *initial, const char *pattern) +{ + struct ds ds; + int pos, chidx; + struct ds choices; + const char *error; + int erroffset; + pcre *re; + int retval; + int okpartial; + char *p; + + set_icon(0, + e____X, + e____X, + e__X_X, + e_X__X, + eXXXXX, + e_X___, + e__X__, + e_____); + + re = pcre_compile(pattern, PCRE_ANCHORED, &error, &erroffset, NULL); + if (!re) { + VLOG_ERR("PCRE error for pattern \"%s\" at offset %d: %s", + pattern, erroffset, error); + return xstrdup(initial); + } + + retval = pcre_fullinfo(re, NULL, PCRE_INFO_OKPARTIAL, &okpartial); + assert(!retval); + assert(okpartial); + + pos = 0; + ds_init(&ds); + ds_put_cstr(&ds, initial); + ds_init(&choices); + figure_choices(re, &ds, pos, &choices); + p = memchr(choices.string, initial[0], choices.length); + chidx = p ? p - choices.string : 0; + for (;;) { + int c, key; + + while ((key = getch()) != ERR) { + switch (key) { + case KEY_UP: + if (choices.length > 1) { + if (++chidx >= choices.length) { + chidx = 0; + } + ds.string[pos] = choices.string[chidx]; + ds_truncate(&ds, pos + 1); + figure_completion(re, &ds); + } + break; + + case KEY_DOWN: + if (choices.length > 1) { + if (--chidx < 0) { + chidx = choices.length - 1; + } + ds.string[pos] = choices.string[chidx]; + ds_truncate(&ds, pos + 1); + figure_completion(re, &ds); + } + break; + + case '\r': case '\n': + if (choices.string[chidx] == '\n') { + ds_truncate(&ds, pos); + return ds_cstr(&ds); + } else { + if (pos >= ds.length) { + pos++; + ds_put_char(&ds, choices.string[chidx]); + figure_choices(re, &ds, pos, &choices); + chidx = 0; + figure_completion(re, &ds); + } else { + pos = ds.length; + figure_choices(re, &ds, pos, &choices); + chidx = 0; + figure_completion(re, &ds); + } + } + break; + + case '\f': + ds_truncate(&ds, pos + 1); + figure_choices(re, &ds, pos, &choices); + chidx = 0; + break; + + case '\b': case '\x7f': case '\x1b': + case KEY_BACKSPACE: case KEY_DC: + if (pos) { + pos--; + } else { + return xstrdup(initial); + } + figure_choices(re, &ds, pos, &choices); + chidx = 0; + if (pos < ds.length) { + p = memchr(choices.string, ds.string[pos], + choices.length); + if (p) { + chidx = p - choices.string; + } + } + break; + + default: + if (key >= 0x20 && key < 0x7f) { + /* Check whether 'key' is valid and toggle case if + * necessary. */ + if (!memchr(choices.string, key, choices.length)) { + if (memchr(choices.string, toupper(key), + choices.length)) { + key = toupper(key); + } else if (memchr(choices.string, tolower(key), + choices.length)) { + key = tolower(key); + } else { + break; + } + } + + /* Insert 'key' and advance the position. */ + if (pos >= ds.length) { + ds_put_char(&ds, key); + } else { + ds.string[pos] = key; + } + pos++; + + if (choices.string[chidx] != key) { + ds_truncate(&ds, pos); + } + figure_choices(re, &ds, pos, &choices); + chidx = 0; + if (pos < ds.length) { + p = memchr(choices.string, ds.string[pos], + choices.length); + if (p) { + chidx = p - choices.string; + } + } + figure_completion(re, &ds); + } + } + } + + erase(); + curs_set(1); + move(0, 0); + addnstr(prompt, MIN(40, strlen(prompt))); + + c = choices.string[chidx]; + move(1, 0); + addstr(ds_cstr(&ds)); + move(1, pos); + if (c == '\n') { + put_icon(0, '$'); + } else { + addch(c); + } + move(1, pos); + refresh(); + + poll_fd_wait(STDIN_FILENO, POLLIN); + poll_block(); + } +} + +static void +prompt_ip(const char *title, uint32_t *ip) +{ + char *in = xasprintf(IP_FMT, IP_ARGS(ip)); + char *out = prompt(title, in, "^"IP_RE"$"); + *ip = inet_addr(out); + free(in); + free(out); +} + +static void +abbreviate_netdevs(const struct svec *netdevs, struct ds *abbrev) +{ + size_t i; + + ds_init(abbrev); + for (i = 0; i < netdevs->n; ) { + size_t i_len = strlen(netdevs->names[i]); + size_t j; + + for (j = i + 1; j < netdevs->n; j++) { + size_t j_len = strlen(netdevs->names[j]); + if (!i_len || !j_len || i_len != j_len + || memcmp(netdevs->names[i], netdevs->names[j], i_len - 1)) { + break; + } + } + + if (abbrev->length) { + ds_put_char(abbrev, ' '); + } + if (j - i == 1) { + ds_put_cstr(abbrev, netdevs->names[i]); + } else { + size_t k; + + ds_put_buffer(abbrev, netdevs->names[i], i_len - 1); + ds_put_char(abbrev, '['); + for (k = i; k < j; k++) { + ds_put_char(abbrev, netdevs->names[k][i_len - 1]); + } + ds_put_char(abbrev, ']'); + } + i = j; + } +} + +static void +choose_netdevs(struct svec *choices) +{ + struct svec netdevs; + struct menu menu; + size_t i; + + netdev_enumerate(&netdevs); + svec_sort(&netdevs); + + menu_init(&menu); + menu_add_item(&menu, "Exit"); + for (i = 0; i < netdevs.n; i++) { + const char *name = netdevs.names[i]; + struct menu_item *item; + struct netdev *netdev; + int retval; + + if (!strncmp(name, "wmaster", strlen("wmaster")) + || !strncmp(name, "of", strlen("of")) + || !strcmp(name, "lo")) { + continue; + } + + retval = netdev_open(name, NETDEV_ETH_TYPE_NONE, &netdev); + if (!retval) { + bool exclude = netdev_get_in4(netdev, NULL); + netdev_close(netdev); + if (exclude) { + continue; + } + } + + item = menu_add_item(&menu, "%s", name); + item->toggle = svec_contains(choices, name); + } + if (menu.n_items > 1) { + menu_show(&menu, 0, true); + } else { + show_string("No available\nbridge ports"); + } + + svec_clear(choices); + for (i = 0; i < menu.n_items; i++) { + struct menu_item *item = menu.items[i]; + if (item->toggle > 0) { + svec_add(choices, item->text); + } + } + + menu_free(&menu); +} + +static bool +is_datapath_id_in_dmi(void) +{ + FILE *dmidecode; + char line[256]; + bool is_in_dmi; + + dmidecode = popen("dmidecode -s system-uuid", "r"); + if (!dmidecode) { + return false; + } + is_in_dmi = fgets(line, sizeof line, dmidecode) && strstr(line, "-002320"); + fclose(dmidecode); + return is_in_dmi; +} + +struct switch_config { + struct svec netdevs; + enum { DISCOVERY, IN_BAND } mode; + uint32_t switch_ip; + uint32_t switch_mask; + uint32_t switch_gw; + enum { FAIL_DROP, FAIL_SWITCH } disconnected; + bool stp; + int rate_limit; + int inactivity_probe; + int max_backoff; + char *controller_vconn; + char *datapath_id; +}; + +static const char * +disconnected_string(int value) +{ +#define FAIL_SWITCH_STRING "Switch packets" +#define FAIL_DROP_STRING "Drop packets" + return value == FAIL_SWITCH ? FAIL_SWITCH_STRING : FAIL_DROP_STRING; +} + +static void +cmd_configure(const struct dict *dict UNUSED) +{ + bool debug_mode = dict_get_bool(dict, "debug", false); + struct dict config_dict; + struct switch_config config; + int start; + + if (!load_config(&config_dict)) { + return; + } + svec_init(&config.netdevs); + svec_parse_words(&config.netdevs, + dict_get_string(&config_dict, "NETDEVS", "")); + config.mode = (!strcmp(dict_get_string(&config_dict, "MODE", "discovery"), + "in-band") ? IN_BAND : DISCOVERY); + config.switch_ip = dict_get_ip(&config_dict, "SWITCH_IP"); + config.switch_mask = dict_get_ip(&config_dict, "SWITCH_NETMASK"); + config.switch_gw = dict_get_ip(&config_dict, "SWITCH_GATEWAY"); + config.controller_vconn = xstrdup(dict_get_string(&config_dict, + "CONTROLLER", "")); + config.disconnected = (!strcmp(dict_get_string(&config_dict, + "DISCONNECTED_MODE", ""), + "switch") + ? FAIL_SWITCH : FAIL_DROP); + config.stp = !strcmp(dict_get_string(&config_dict, "stp", ""), "yes"); + config.rate_limit = dict_get_int(&config_dict, "RATE_LIMIT", -1); + config.inactivity_probe = dict_get_int(&config_dict, "INACTIVITY_PROBE", + -1); + config.max_backoff = dict_get_int(&config_dict, "MAX_BACKOFF", -1); + if (is_datapath_id_in_dmi()) { + config.datapath_id = xstrdup("DMI"); + } else { + const char *dpid = dict_get(&config_dict, "DATAPATH_ID"); + if (dpid) { + struct ds ds = DS_EMPTY_INITIALIZER; + const char *cp; + for (cp = dpid; *cp != '\0'; cp++) { + if (*cp != ':') { + ds_put_char(&ds, toupper((unsigned char) *cp)); + } + } + config.datapath_id = ds_cstr(&ds); + } else { + config.datapath_id = xstrdup("Random"); + } + } + dict_free(&config_dict); + + start = 0; + while (start != -1) { + enum { + MENU_EXIT, + MENU_NETDEVS, + MENU_MODE, + MENU_IP, + MENU_NETMASK, + MENU_GATEWAY, + MENU_CONTROLLER, + MENU_DISCONNECTED_MODE, + MENU_DATAPATH_ID, + MENU_STP, + MENU_RATE_LIMIT, + MENU_INACTIVITY_PROBE, + MENU_MAX_BACKOFF, + }; + + struct ds ports; + struct menu_item *item; + struct menu menu; + char *in, *out; + uint32_t ip; + + menu_init(&menu); + + /* Exit. */ + item = menu_add_item(&menu, "Exit"); + item->id = MENU_EXIT; + + /* Bridge Ports. */ + abbreviate_netdevs(&config.netdevs, &ports); + item = menu_add_item(&menu, "Bridge Ports:\n%s", ds_cstr(&ports)); + item->id = MENU_NETDEVS; + ds_destroy(&ports); + + /* Mode. */ + item = menu_add_item(&menu, "Mode:\n%s", + (config.mode == DISCOVERY + ? "Discovery" : "In-Band")); + item->id = MENU_MODE; + + /* IP address. */ + if (config.switch_ip == htonl(0)) { + item = menu_add_item(&menu, "Switch IP Addr:\nDHCP"); + } else { + item = menu_add_item(&menu, "Switch IP Addr:\n"IP_FMT, + IP_ARGS(&config.switch_ip)); + } + item->id = MENU_IP; + item->enabled = config.mode == IN_BAND; + + /* Netmask. */ + item = menu_add_item(&menu, "Switch Netmask:\n"IP_FMT, + IP_ARGS(&config.switch_mask)); + item->id = MENU_NETMASK; + item->enabled = config.mode == IN_BAND && config.switch_ip != htonl(0); + + /* Gateway. */ + item = menu_add_item(&menu, "Switch Gateway:\n"IP_FMT, + IP_ARGS(&config.switch_gw)); + item->id = MENU_GATEWAY; + item->enabled = config.mode == IN_BAND && config.switch_ip != htonl(0); + + /* Controller. */ + item = menu_add_item(&menu, "Controller:\n%s", + config.controller_vconn); + item->id = MENU_CONTROLLER; + item->enabled = config.mode == IN_BAND; + + /* Disconnected mode. */ + item = menu_add_item(&menu, "If disconnected:\n%s\n", + disconnected_string(config.disconnected)); + item->id = MENU_DISCONNECTED_MODE; + + /* Datapath ID. */ + item = menu_add_item(&menu, "Datapath ID:\n%s", config.datapath_id); + item->id = MENU_DATAPATH_ID; + item->enabled = strcmp(config.datapath_id, "DMI"); + + /* Spanning tree protocol. */ + if (debug_mode) { + item = menu_add_item(&menu, "802.1D-1998 STP:\n%s", + config.stp ? "Enabled" : "Disabled"); + item->id = MENU_STP; + } + + /* Rate-limiting. */ + if (debug_mode) { + if (config.rate_limit < 0) { + item = menu_add_item(&menu, "Ctlr rate limit:\nDisabled"); + } else { + item = menu_add_item(&menu, "Ctlr rate limit:\n%d/s", + config.rate_limit); + } + item->id = MENU_RATE_LIMIT; + } + + /* Inactivity probe. */ + if (debug_mode) { + if (config.inactivity_probe < 0) { + item = menu_add_item(&menu, "Activity probe:\nDefault"); + } else { + item = menu_add_item(&menu, "Activity probe:\n%d s", + config.inactivity_probe); + } + item->id = MENU_INACTIVITY_PROBE; + } + + /* Max backoff. */ + if (debug_mode) { + if (config.max_backoff < 0) { + item = menu_add_item(&menu, "Max backoff:\nDefault"); + } else { + item = menu_add_item(&menu, "Max backoff:\n%d s", + config.max_backoff); + } + item->id = MENU_MAX_BACKOFF; + } + + start = menu_show2(&menu, start, true); + menu_free(&menu); + + in = out = NULL; + switch (start) { + case MENU_EXIT: + start = -1; + break; + + case MENU_NETDEVS: + choose_netdevs(&config.netdevs); + break; + + case MENU_MODE: + out = prompt("Mode:", + config.mode == DISCOVERY ? "Discovery" : "In-Band", + "^(Discovery|In-Band)$"); + config.mode = !strcmp(out, "Discovery") ? DISCOVERY : IN_BAND; + free(out); + break; + + case MENU_IP: + in = (config.switch_ip == htonl(0) ? xstrdup("DHCP") + : xasprintf(IP_FMT, IP_ARGS(&config.switch_ip))); + out = prompt("Switch IP:", in, "^(DHCP|"IP_RE")$"); + ip = strcmp(out, "DHCP") ? inet_addr(out) : htonl(0); + free(in); + free(out); + if (ip != config.switch_ip) { + config.switch_ip = ip; + if (ip != htonl(0)) { + uint32_t mask = guess_netmask(ip); + if (mask) { + config.switch_mask = mask; + config.switch_gw = (ip & mask) | htonl(1); + } + } + } + break; + + case MENU_NETMASK: + prompt_ip("Switch Netmask:", &config.switch_mask); + break; + + case MENU_GATEWAY: + prompt_ip("Switch Gateway:", &config.switch_gw); + break; + + case MENU_CONTROLLER: + out = prompt("Controller:", config.controller_vconn, + "^(tcp|ssl):"IP_RE"(:"PORT_RE")?$"); + free(config.controller_vconn); + config.controller_vconn = out; + break; + + case MENU_DISCONNECTED_MODE: + out = prompt("If disconnected", + disconnected_string(config.disconnected), + "^("FAIL_DROP_STRING"|"FAIL_SWITCH_STRING")$"); + config.disconnected = (!strcmp(out, FAIL_DROP_STRING) + ? FAIL_DROP : FAIL_SWITCH); + free(out); + break; + + case MENU_DATAPATH_ID: + out = prompt("Datapath ID:", config.datapath_id, + "^Random|"MAC_RE"$"); + free(config.datapath_id); + config.datapath_id = out; + break; + + case MENU_STP: + out = prompt("802.1D-1998 STP:", + config.stp ? "Enabled" : "Disabled", + "^(Enabled|Disabled)$"); + config.stp = !strcmp(out, "Enabled"); + free(out); + break; + + case MENU_RATE_LIMIT: + in = (config.rate_limit < 0 + ? xstrdup("Disabled") + : xasprintf("%d/s", config.rate_limit)); + out = prompt("Ctlr rate limit:", in, + "^(Disabled|("NUM100_TO_99999_RE")/s)$"); + free(in); + config.rate_limit = isdigit(out[0]) ? atoi(out) : -1; + free(out); + break; + + case MENU_INACTIVITY_PROBE: + in = (config.inactivity_probe < 0 + ? xstrdup("Default") + : xasprintf("%d s", config.inactivity_probe)); + out = prompt("Activity probe:", in, + "^(Default|("NUM5_TO_99999_RE") s)$"); + free(in); + config.inactivity_probe = isdigit(out[0]) ? atoi(out) : -1; + free(out); + break; + + case MENU_MAX_BACKOFF: + in = (config.max_backoff < 0 + ? xstrdup("Default") + : xasprintf("%d s", config.max_backoff)); + out = prompt("Max backoff:", in, + "^(Default|("NUM1_TO_99999_RE") s)$"); + free(in); + config.max_backoff = isdigit(out[0]) ? atoi(out) : -1; + free(out); + break; + } + } + + if (yesno("Save\nChanges?", false)) { + struct svec set; + char *netdevs; + + svec_init(&set); + netdevs = svec_join(&config.netdevs, " ", ""); + svec_add_nocopy(&set, xasprintf("NETDEVS=%s", netdevs)); + free(netdevs); + svec_add(&set, + config.mode == IN_BAND ? "MODE=in-band" : "MODE=discovery"); + if (config.mode == IN_BAND) { + if (config.switch_ip == htonl(0)) { + svec_add(&set, "SWITCH_IP=dhcp"); + } else { + svec_add_nocopy(&set, xasprintf("SWITCH_IP="IP_FMT, + IP_ARGS(&config.switch_ip))); + svec_add_nocopy(&set, + xasprintf("SWITCH_NETMASK="IP_FMT, + IP_ARGS(&config.switch_mask))); + svec_add_nocopy(&set, xasprintf("SWITCH_GATEWAY="IP_FMT, + IP_ARGS(&config.switch_gw))); + svec_add_nocopy(&set, xasprintf("CONTROLLER=%s", + config.controller_vconn)); + } + } + svec_add(&set, (config.disconnected == FAIL_DROP + ? "DISCONNECTED_MODE=drop" + : "DISCONNECTED_MODE=switch")); + svec_add_nocopy(&set, xasprintf("STP=%s", config.stp ? "yes" : "no")); + if (config.rate_limit < 0) { + svec_add(&set, "RATE_LIMIT="); + } else { + svec_add_nocopy(&set, + xasprintf("RATE_LIMIT=%d", config.rate_limit)); + } + if (config.inactivity_probe < 0) { + svec_add(&set, "INACTIVITY_PROBE="); + } else { + svec_add_nocopy(&set, xasprintf("INACTIVITY_PROBE=%d", + config.inactivity_probe)); + } + if (config.max_backoff < 0) { + svec_add(&set, "MAX_BACKOFF="); + } else { + svec_add_nocopy(&set, xasprintf("MAX_BACKOFF=%d", + config.max_backoff)); + } + save_config(&set); + svec_destroy(&set); + } + + svec_destroy(&config.netdevs); + free(config.controller_vconn); + free(config.datapath_id); +} + +static void +cmd_setup_pki(const struct dict *dict UNUSED) +{ + static const char def_privkey_file[] + = "/etc/openflow-switch/of0-privkey.pem"; + static const char def_cert_file[] = "/etc/openflow-switch/of0-cert.pem"; + static const char def_cacert_file[] = "/etc/openflow-switch/cacert.pem"; + struct dict config_dict; + const char *privkey_file, *cert_file, *cacert_file; + bool bootstrap; + struct stat s; + struct svec set; + bool has_keys; + + if (!load_config(&config_dict)) { + return; + } + privkey_file = dict_get_string(&config_dict, "PRIVKEY", def_privkey_file); + cert_file = dict_get_string(&config_dict, "CERT", def_cert_file); + cacert_file = dict_get_string(&config_dict, "CACERT", def_cacert_file); + bootstrap = !strcmp(dict_get_string(&config_dict, "CACERT_MODE", "secure"), + "bootstrap"); + + has_keys = !stat(privkey_file, &s) && !stat(cert_file, &s); + if (!has_keys + ? yesno("Generate\nkeys?", true) + : yesno("Generate\nnew keys?", false)) { + struct svec argv; + bool ok; + + privkey_file = def_privkey_file; + cert_file = def_cert_file; + + svec_init(&argv); + svec_parse_words(&argv, "sh -c 'cd /etc/openflow-switch " + "&& ovs-pki --force req of0" + "&& ovs-pki --force self-sign of0'"); + svec_terminate(&argv); + ok = run_and_report_failure(argv.names, "Key gen failed"); + svec_destroy(&argv); + if (!ok) { + return; + } + has_keys = true; + } + if (!has_keys) { + return; + } + + if (stat(cacert_file, &s) && errno == ENOENT) { + bootstrap = yesno("Bootstrap\nCA cert?", bootstrap); + } else if (yesno("Replace\nCA cert?", false)) { + unlink(cacert_file); + bootstrap = true; + } + + svec_init(&set); + svec_add_nocopy(&set, xasprintf("PRIVKEY=%s", privkey_file)); + svec_add_nocopy(&set, xasprintf("CERT=%s", cert_file)); + svec_add_nocopy(&set, xasprintf("CACERT=%s", cacert_file)); + svec_add_nocopy(&set, xasprintf("CACERT_MODE=%s", + bootstrap ? "bootstrap" : "secure")); + save_config(&set); + svec_destroy(&set); +} + +static void +parse_options(int argc, char *argv[]) +{ + enum { + OPT_DUMMY = UCHAR_MAX + 1, + VLOG_OPTION_ENUMS + }; + static struct option long_options[] = { + {"verbose", optional_argument, 0, 'v'}, + {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'V'}, + DAEMON_LONG_OPTIONS, + VLOG_LONG_OPTIONS, + {0, 0, 0, 0}, + }; + char *short_options = long_options_to_short_options(long_options); + + for (;;) { + int c; + + c = getopt_long(argc, argv, short_options, long_options, NULL); + if (c == -1) { + break; + } + + switch (c) { + case 'h': + usage(); + + case 'V': + OVS_PRINT_VERSION(OFP_VERSION, OFP_VERSION); + exit(EXIT_SUCCESS); + + VLOG_OPTION_HANDLERS + DAEMON_OPTION_HANDLERS + + case '?': + exit(EXIT_FAILURE); + + default: + abort(); + } + } + free(short_options); +} + +static void +usage(void) +{ + printf("%s: OpenFlow switch monitoring user interface\n" + "usage: %s [OPTIONS] SWITCH\n" + "where SWITCH is an active OpenFlow connection method.\n", + program_name, program_name); + vconn_usage(true, false, false); + printf("\nOptions:\n" + " -v, --verbose=MODULE:FACILITY:LEVEL configure logging levels\n" + " -v, --verbose set maximum verbosity level\n" + " -h, --help display this help message\n" + " -V, --version display version information\n"); + exit(EXIT_SUCCESS); +} diff --git a/extras/ezio/terminal.c b/extras/ezio/terminal.c new file mode 100644 index 00000000..eacf0af0 --- /dev/null +++ b/extras/ezio/terminal.c @@ -0,0 +1,833 @@ +/* Copyright (c) 2008, 2009 Nicira Networks, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + * + */ + +#include +#include "terminal.h" +#include +#include +#include +#include +#include +#include +#include +#include "dynamic-string.h" +#include "ezio.h" +#include "poll-loop.h" +#include "util.h" + +#define THIS_MODULE VLM_terminal +#include "vlog.h" + +/* UTF-8 decoding. */ +static struct utf8_reader *utf8_reader_create(void); +static void utf8_reader_destroy(struct utf8_reader *); +static int utf8_reader_read(struct utf8_reader *, uint8_t c); + +/* ANSI escape sequence decoding. */ +struct ansi_sequence { + int n_args; +#define ANSI_MAX_ARGS 16 + int args[ANSI_MAX_ARGS]; + int function; +}; + +static struct ansi_decoder *ansi_decoder_create(void); +static void ansi_decoder_destroy(struct ansi_decoder *); +static int ansi_decoder_put(struct ansi_decoder *, uint8_t c); +static const struct ansi_sequence *ansi_decoder_get(struct ansi_decoder *); + +/* Terminal emulation. */ +struct terminal { + struct ansi_decoder *ansi; + struct utf8_reader *utf8; + enum { EZIO, UTF8 } encoding; +}; + +static void recv_byte(struct terminal *term, struct ezio *ezio, uint8_t c); + +struct terminal * +terminal_create(void) +{ + struct terminal *term = xmalloc(sizeof *term); + term->ansi = ansi_decoder_create(); + term->utf8 = utf8_reader_create(); + term->encoding = UTF8; + return term; +} + +void +terminal_destroy(struct terminal *term) +{ + if (term) { + utf8_reader_destroy(term->utf8); + ansi_decoder_destroy(term->ansi); + free(term); + } +} + +int +terminal_run(struct terminal *term, struct ezio *ezio, int input_fd) +{ + char input[512]; + int n; + + n = read(input_fd, input, sizeof input); + if (n > 0) { + int i; + + for (i = 0; i < n; i++) { + recv_byte(term, ezio, input[i]); + } + return 0; + } else { + return !n ? EOF : errno == EAGAIN ? 0 : errno; + } +} + +void +terminal_wait(struct terminal *term UNUSED, int input_fd) +{ + poll_fd_wait(input_fd, POLLIN); +} + +static void recv_ansi_sequence(const struct ansi_sequence *, struct ezio *); +static void recv_control(uint8_t c, struct ezio *); +static void recv_character(uint8_t byte, struct ezio *); +static int unicode_to_ezio(uint16_t unicode); +static int default_arg(int value, int default_value); +static int range(int value, int min, int max); +static void clear_elements(uint8_t *p, size_t size, int pos, int clear_type); +static void define_icon(struct ezio *e, const int *args); +static void clear_icon(struct ezio *e, int icon_nr); +static void set_cursor(struct ezio *e, int visibility); + +static void +recv_byte(struct terminal *term, struct ezio *ezio, uint8_t c) +{ + int retval; + + /* Decode and interpret ANSI escape sequences. */ + retval = ansi_decoder_put(term->ansi, c); + if (retval <= 0) { + if (retval < 0) { + recv_ansi_sequence(ansi_decoder_get(term->ansi), ezio); + return; + } + return; + } + + /* Encoding selection. */ + if (c == 0x0e) { + /* Shift Out. */ + term->encoding = EZIO; + return; + } else if (c == 0x0f) { + /* Shift In. */ + term->encoding = UTF8; + return; + } + + if (term->encoding == UTF8) { + int unicode, ezchar; + + /* Convert UTF-8 input to Unicode code point. */ + unicode = utf8_reader_read(term->utf8, c); + if (unicode < 0) { + return; + } + + /* Convert Unicode code point to EZIO encoding. */ + ezchar = unicode_to_ezio(unicode); + if (ezchar >= 0) { + if (ezchar & 0xff00) { + recv_character(ezchar >> 8, ezio); + } + recv_character(ezchar, ezio); + } else if (unicode < 0x100) { + recv_control(unicode, ezio); + } else { + /* Unsupported Unicode code point. */ + return; + } + } else { + if (c >= 0x80 && c < 0x87) { + c &= 0x07; + } + if (c != 0xfe) { + recv_character(c, ezio); + } + } +} + +static void +log_ansi_sequence(const struct ansi_sequence *seq, struct ezio *e) +{ + struct sequence { + int function; + const char *name; + }; + static const struct sequence sequences[] = { + {0x5a, "CBT: Cursor Backward Tabulation"}, + {0x47, "CHA: Cursor Character Absolute"}, + {0x49, "CHT: Cursor Forward Tabulation"}, + {0x45, "CNL: Cursor Next Line"}, + {0x46, "CPL: Cursor Preceding Line"}, + {0x44, "CUB: Cursor Left"}, + {0x42, "CUD: Cursor Down"}, + {0x43, "CUF: Cursor Right"}, + {0x48, "CUP: Cursor Position"}, + {0x41, "CUU: Cursor Up"}, + {0x50, "DCH: Delete Character"}, + {0x4d, "DL: Delete Line"}, + {0x58, "ECH: Erase Character"}, + {0x4a, "ED: Erase in Page"}, + {0x4b, "EL: Erase in Line"}, + {0x40, "ICH: Insert Character"}, + {0x4c, "IL: Insert Line"}, + {0x4500, "NEL: Next Line"}, + {0x4d00, "RI: Reverse Line Feed"}, + {0x6300, "RIS: Reset to Initial State"}, + {0x54, "SD: Scroll Down"}, + {0x240, "SL: Scroll Left"}, + {0x241, "SR: Scroll Right"}, + {0x53, "SU: Scroll Up"}, + {0x70, "DICO: Define Icon"}, + {0x71, "CICO: Clear Icon"}, + {0x72, "Set cursor visibility"}, + }; + const struct sequence *s; + struct ds ds; + int i; + + ds_init(&ds); + for (s = sequences; s < &sequences[ARRAY_SIZE(sequences)]; s++) { + if (s->function == seq->function) { + ds_put_cstr(&ds, s->name); + goto found; + } + } + ds_put_format(&ds, "0x%02x", s->function); + if (s->function < 0x100) { + ds_put_format(&ds, "(%02d/%02d)", s->function / 16, s->function % 16); + } + +found: + for (i = 0; i < seq->n_args; i++) { + ds_put_format(&ds, ", %d", seq->args[i]); + } + VLOG_DBG("%s (cursor:%d,%d)", ds_cstr(&ds), e->x, e->y); + ds_destroy(&ds); +} + +static void +recv_ansi_sequence(const struct ansi_sequence *seq, struct ezio *e) +{ +#define ARG1(DEFAULT) default_arg(seq->args[0], DEFAULT) +#define ARG2(DEFAULT) default_arg(seq->args[1], DEFAULT) + if (VLOG_IS_DBG_ENABLED()) { + log_ansi_sequence(seq, e); + } + switch (seq->function) { + case 0x5a: /* CBT: Cursor Backward Tabulation. */ + e->x = 8 * (e->x / 8 - ARG1(1)); + break; + case 0x47: /* CHA: Cursor Character Absolute. */ + e->x = ARG1(1) - 1; + break; + case 0x49: /* CHT: Cursor Forward Tabulation. */ + e->x = 8 * (e->x / 8 + ARG1(1)); + break; + case 0x45: /* CNL: Cursor Next Line. */ + e->x = 0; + e->y += ARG1(1); + break; + case 0x46: /* CPL: Cursor Preceding Line. */ + e->x = 0; + e->y -= ARG1(1); + break; + case 0x44: /* CUB: Cursor Left. */ + e->x -= ARG1(1); + break; + case 0x42: /* CUD: Cursor Down. */ + e->y += ARG1(1); + break; + case 0x43: /* CUF: Cursor Right. */ + e->x += ARG1(1); + break; + case 0x48: /* CUP: Cursor Position. */ + e->y = ARG1(1) - 1; + e->x = ARG2(1) - 1; + break; + case 0x41: /* CUU: Cursor Up. */ + e->y -= ARG1(1); + break; + case 0x50: /* DCH: Delete Character. */ + ezio_delete_char(e, e->x, e->y, ARG1(1)); + break; + case 0x4d: /* DL: Delete Line. */ + ezio_delete_line(e, e->y, ARG1(1)); + break; + case 0x58: /* ECH: Erase Character. */ + memset(&e->chars[e->y][e->x], ' ', MIN(ARG1(1), 40 - e->x)); + break; + case 0x4a: /* ED: Erase in Page. */ + clear_elements(&e->chars[0][0], 2 * 40, e->x + 40 * e->y, ARG1(0)); + break; + case 0x4b: /* EL: Erase in Line. */ + clear_elements(&e->chars[e->y][0], 40, e->x, ARG1(0)); + break; + case 0x40: /* ICH: Insert Character. */ + ezio_insert_char(e, e->x, e->y, ARG1(1)); + break; + case 0x4c: /* IL: Insert Line. */ + ezio_insert_line(e, e->y, ARG1(1)); + break; + case 0x4500: /* NEL: Next Line. */ + e->x = 0; + e->y++; + break; + case 0x4d00: /* RI: Reverse Line Feed. */ + e->y--; + break; + case 0x6300: /* RIS: Reset to Initial State. */ + ezio_init(e); + break; + case 0x54: /* SD: Scroll Down. */ + ezio_scroll_down(e, ARG1(1)); + break; + case 0x240: /* SL: Scroll Left. */ + ezio_scroll_left(e, ARG1(1)); + break; + case 0x241: /* SR: Scroll Right. */ + ezio_scroll_right(e, ARG1(1)); + break; + case 0x53: /* SU: Scroll Up. */ + ezio_scroll_up(e, ARG1(1)); + break; + + /* Private sequences. */ + case 0x70: /* DICO: Define Icon. */ + define_icon(e, seq->args); + break; + case 0x71: /* CICO: Clear Icon. */ + clear_icon(e, ARG1(0)); + break; + case 0x72: /* Set cursor visibility. */ + set_cursor(e, ARG1(1)); + break; + } + e->x = range(e->x, 0, 40); + e->y = range(e->y, 0, 1); + VLOG_DBG("cursor:%d,%d", e->x, e->y); +} + +static void +recv_control(uint8_t c, struct ezio *e) +{ + switch (c) { + case '\b': + if (e->x > 0) { + --e->x; + } + break; + + case '\t': + e->x = ROUND_UP(e->x + 1, 8); + if (e->x > 40) { + ezio_newline(e); + } + break; + + case '\n': + ezio_line_feed(e); + break; + + case '\f': + ezio_clear(e); + break; + + case '\r': + e->x = 0; + break; + + default: + VLOG_DBG("Unhandled control character 0x%02"PRIx8, c); + } +} + +static void +recv_character(uint8_t byte, struct ezio *e) +{ + if (e->x >= 40) { + ezio_newline(e); + } + ezio_put_char(e, e->x++, e->y, byte); +} + +static int +default_arg(int value, int default_value) +{ + return value >= 0 ? value : default_value; +} + +static int +range(int value, int min, int max) +{ + return value < min ? min : value > max ? max : value; +} + +static void +clear_elements(uint8_t *p, size_t size, int pos, int clear_type) +{ + switch (clear_type) { + case 0: + /* Clear from 'pos' to end. */ + memset(p + pos, ' ', size - pos); + break; + case 1: + /* Clear from beginning to 'pos'. */ + memset(p, ' ', pos + 1); + break; + case 2: + /* Clear all. */ + memset(p, ' ', size); + break; + } +} + +static void +define_icon(struct ezio *e, const int *args) +{ + int icon_nr; + int row; + + icon_nr = args[0]; + if (icon_nr < 0 || icon_nr > 7) { + return; + } + + for (row = 0; row < 8; row++) { + e->icons[icon_nr][row] = default_arg(args[row + 1], 0) & 0x1f; + } +} + +static void +clear_icon(struct ezio *e, int icon_nr) +{ + if (icon_nr >= 0 && icon_nr <= 7) { + ezio_set_default_icon(e, icon_nr); + } +} + +static void +set_cursor(struct ezio *e, int visibility) +{ + switch (visibility) { + case 1: + e->show_cursor = e->blink_cursor = false; + break; + case 2: + e->show_cursor = true; + e->blink_cursor = false; + break; + case 3: + e->show_cursor = e->blink_cursor = true; + break; + } +} + +static int +unicode_to_ezio(uint16_t unicode) +{ + switch (unicode) { + /* Most ASCII characters map one-to-one. */ + case 0x0020 ... 0x005b: + case 0x005d ... 0x007d: + return unicode; + + /* A few ASCII characters have to be simulated with icons. */ + case 0x005c: return 0x06; /* BACKSLASH */ + case 0x007e: return 0x07; /* TILDE */ + + /* EZIO extended characters equivalents in Unicode - Japanese. */ + case 0x00a5: return '\\'; /* YEN SIGN */ + case 0x3002: return 0xa1; /* IDEOGRAPHIC FULL STOP */ + case 0x300c: return 0xa2; /* LEFT CORNER BRACKET */ + case 0x300d: return 0xa3; /* RIGHT CORNER BRACKET */ + case 0x3001: return 0xa4; /* IDEOGRAPHIC COMMA */ + case 0x30fb: return 0xa5; /* KATAKANA MIDDLE DOT */ + case 0x30f2: return 0xa6; /* KATAKANA LETTER WO */ + case 0x30a1: return 0xa7; /* KATAKANA LETTER SMALL A */ + case 0x30a3: return 0xa8; /* KATAKANA LETTER SMALL I */ + case 0x30a5: return 0xa9; /* KATAKANA LETTER SMALL U */ + case 0x30a7: return 0xaa; /* KATAKANA LETTER SMALL E */ + case 0x30a9: return 0xab; /* KATAKANA LETTER SMALL O */ + case 0x30e3: return 0xac; /* KATAKANA LETTER SMALL YA */ + case 0x30e5: return 0xad; /* KATAKANA LETTER SMALL YU */ + case 0x30e7: return 0xae; /* KATAKANA LETTER SMALL YO */ + case 0x30c3: return 0xaf; /* KATAKANA LETTER SMALL TU = SMALL TSU */ + case 0x30fc: return 0xb0; /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */ + case 0x30a2: return 0xb1; /* KATAKANA LETTER A */ + case 0x30a4: return 0xb2; /* KATAKANA LETTER I */ + case 0x30a6: return 0xb3; /* KATAKANA LETTER U */ + case 0x30a8: return 0xb4; /* KATAKANA LETTER E */ + case 0x30aa: return 0xb5; /* KATAKANA LETTER O */ + case 0x30ab: return 0xb6; /* KATAKANA LETTER KA */ + case 0x30ac: return 0xb6de; /* KATAKANA LETTER GA */ + case 0x30ad: return 0xb7; /* KATAKANA LETTER KI */ + case 0x30ae: return 0xb7de; /* KATAKANA LETTER GI */ + case 0x30af: return 0xb8; /* KATAKANA LETTER KU */ + case 0x30b0: return 0xb8de; /* KATAKANA LETTER GU */ + case 0x30b1: return 0xb9; /* KATAKANA LETTER KE */ + case 0x30b2: return 0xb9de; /* KATAKANA LETTER GE */ + case 0x30b3: return 0xba; /* KATAKANA LETTER KO */ + case 0x30b4: return 0xbade; /* KATAKANA LETTER GO */ + case 0x30b5: return 0xbb; /* KATAKANA LETTER SA */ + case 0x30b6: return 0xbbde; /* KATAKANA LETTER ZA */ + case 0x30b7: return 0xbc; /* KATAKANA LETTER SI = SHI */ + case 0x30b8: return 0xbcde; /* KATAKANA LETTER ZI = JI */ + case 0x30b9: return 0xbd; /* KATAKANA LETTER SU */ + case 0x30ba: return 0xbdde; /* KATAKANA LETTER ZU */ + case 0x30bb: return 0xbe; /* KATAKANA LETTER SE */ + case 0x30bc: return 0xbede; /* KATAKANA LETTER ZE */ + case 0x30bd: return 0xbf; /* KATAKANA LETTER SO */ + case 0x30be: return 0xbfde; /* KATAKANA LETTER ZO */ + case 0x30bf: return 0xc0; /* KATAKANA LETTER TA */ + case 0x30c0: return 0xc0de; /* KATAKANA LETTER DA */ + case 0x30c1: return 0xc1; /* KATAKANA LETTER TI = CHI */ + case 0x30c2: return 0xc1de; /* KATAKANA LETTER DI = JI */ + case 0x30c4: return 0xc2; /* KATAKANA LETTER TU = TSU */ + case 0x30c5: return 0xc2de; /* KATAKANA LETTER DU = ZU */ + case 0x30c6: return 0xc3; /* KATAKANA LETTER TE */ + case 0x30c7: return 0xc3de; /* KATAKANA LETTER DE */ + case 0x30c8: return 0xc4; /* KATAKANA LETTER TO */ + case 0x30c9: return 0xc4de; /* KATAKANA LETTER DO */ + case 0x30ca: return 0xc5; /* KATAKANA LETTER NA */ + case 0x30cb: return 0xc6; /* KATAKANA LETTER NI */ + case 0x30cc: return 0xc7; /* KATAKANA LETTER NU */ + case 0x30cd: return 0xc8; /* KATAKANA LETTER NE */ + case 0x30ce: return 0xc9; /* KATAKANA LETTER NO */ + case 0x30cf: return 0xca; /* KATAKANA LETTER HA */ + case 0x30d0: return 0xcade; /* KATAKANA LETTER BA */ + case 0x30d1: return 0xcadf; /* KATAKANA LETTER PA */ + case 0x30d2: return 0xcb; /* KATAKANA LETTER HI */ + case 0x30d3: return 0xcbde; /* KATAKANA LETTER BI */ + case 0x30d4: return 0xcbdf; /* KATAKANA LETTER PI */ + case 0x30d5: return 0xcc; /* KATAKANA LETTER HU = FU */ + case 0x30d6: return 0xccde; /* KATAKANA LETTER BU */ + case 0x30d7: return 0xccdf; /* KATAKANA LETTER PU */ + case 0x30d8: return 0xcd; /* KATAKANA LETTER HE */ + case 0x30d9: return 0xcdde; /* KATAKANA LETTER BE */ + case 0x30da: return 0xcddf; /* KATAKANA LETTER PE */ + case 0x30db: return 0xce; /* KATAKANA LETTER HO */ + case 0x30dc: return 0xcede; /* KATAKANA LETTER BO */ + case 0x30dd: return 0xcedf; /* KATAKANA LETTER PO */ + case 0x30de: return 0xcf; /* KATAKANA LETTER MA */ + case 0x30df: return 0xd0; /* KATAKANA LETTER MI */ + case 0x30e0: return 0xd1; /* KATAKANA LETTER MU */ + case 0x30e1: return 0xd2; /* KATAKANA LETTER ME */ + case 0x30e2: return 0xd3; /* KATAKANA LETTER MO */ + case 0x30e4: return 0xd4; /* KATAKANA LETTER YA */ + case 0x30e6: return 0xd5; /* KATAKANA LETTER YU */ + case 0x30e8: return 0xd6; /* KATAKANA LETTER YO */ + case 0x30e9: return 0xd7; /* KATAKANA LETTER RA */ + case 0x30ea: return 0xd8; /* KATAKANA LETTER RI */ + case 0x30eb: return 0xd9; /* KATAKANA LETTER RU */ + case 0x30ec: return 0xda; /* KATAKANA LETTER RE */ + case 0x30ed: return 0xdb; /* KATAKANA LETTER RO */ + case 0x30ef: return 0xdc; /* KATAKANA LETTER WA */ + case 0x30f3: return 0xdd; /* KATAKANA LETTER N */ + case 0x30f4: return 0xb3de; /* KATAKANA LETTER VU */ + case 0x30f7: return 0xdcde; /* KATAKANA LETTER VA */ + case 0x3099: return 0xde; /* COMBINING KATAKANA-HIRAGANA VOICED SOUND + * MARK */ + case 0x309a: return 0xdf; /* COMBINING KATAKANA-HIRAGANA SEMI-VOICED + * SOUND MARK */ + case 0x309b: return 0xde; /* KATAKANA-HIRAGANA VOICED SOUND MARK */ + case 0x309c: return 0xdf; /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */ + + /* EZIO extended characters equivalents in Unicode - other. */ + case 0x2192: return 0x7e; /* RIGHTWARDS ARROW */ + case 0x2190: return 0x7f; /* LEFTWARDS ARROW */ + case 0x03b1: return 0xe0; /* GREEK SMALL LETTER ALPHA */ + case 0x00e4: return 0xe1; /* LATIN SMALL LETTER A WITH DIAERESIS */ + case 0x03b2: return 0xe2; /* GREEK SMALL LETTER BETA */ + case 0x03b5: return 0xe3; /* GREEK SMALL LETTER EPSILON */ + case 0x03bc: return 0xe4; /* GREEK SMALL LETTER MU */ + case 0x03c6: return 0xe5; /* GREEK SMALL LETTER PHI */ + case 0x03c1: return 0xe6; /* GREEK SMALL LETTER RHO */ + /* 0xe7 is 'g'. */ + case 0x221a: return 0xe8; /* SQUARE ROOT = radical sign */ + /* 0xe9 is an unrecognizable symbol. */ + /* 0xea is 'j'. */ + /* 0xeb is an unrecognizable symbol.*/ + case 0x00a2: return 0xec; /* CENT SIGN */ + case 0x00a3: return 0xed; /* POUND SIGN */ + case 0x00f1: return 0xee; /* LATIN SMALL LETTER N WITH TILDE */ + case 0x00f6: return 0xef; /* LATIN SMALL LETTER O WITH DIAERESIS */ + /* 0xf0 is 'p'. */ + /* 0xf1 is 'q'. */ + case 0x03b8: return 0xf2; /* GREEK SMALL LETTER THETA */ + case 0x221e: return 0xf3; /* INFINITY */ + case 0x03a9: return 0xf4; /* GREEK CAPITAL LETTER OMEGA */ + case 0x00fc: return 0xf5; /* LATIN SMALL LETTER U WITH DIAERESIS */ + case 0x03a3: return 0xf6; /* GREEK CAPITAL LETTER SIGMA */ + case 0x03c0: return 0xf7; /* GREEK SMALL LETTER PI */ + /* 0xf8 is x-macron (the sample mean). */ + /* 0xf9 is 'y'. */ + case 0x5343: return 0xfa; /* thousand */ + case 0x4e07: return 0xfb; /* ten thousand */ + case 0x5186: return 0xfc; /* yen */ + case 0x00f7: return 0xfd; /* DIVISION SIGN */ + case 0x2588: return 0xff; /* FULL BLOCK = solid */ + + /* EZIO icons (from the Unicode Private Use corporate subarea). */ + case 0xf8f8: return 0x00; + case 0xf8f9: return 0x01; + case 0xf8fa: return 0x02; + case 0xf8fb: return 0x03; + case 0xf8fc: return 0x04; + case 0xf8fd: return 0x05; + case 0xf8fe: return 0x06; + case 0xf8ff: return 0x07; + + /* No mappings for anything else. */ + default: return -1; + } +} + +/* UTF-8 decoder. */ + +#define UTF_STATES \ + UTF_STATE(UTF8_INIT, 0x00, 0xf4, UTF8_INIT) \ + UTF_STATE(UTF8_3, 0x80, 0xbf, UTF8_2) \ + UTF_STATE(UTF8_2, 0x80, 0xbf, UTF8_1) \ + UTF_STATE(UTF8_1, 0x80, 0xbf, UTF8_INIT) \ + UTF_STATE(UTF8_E0, 0xa0, 0xbf, UTF8_1) \ + UTF_STATE(UTF8_ED, 0x80, 0x9f, UTF8_1) \ + UTF_STATE(UTF8_F0, 0x90, 0xbf, UTF8_INIT) \ + UTF_STATE(UTF8_F4, 0x80, 0x8f, UTF8_INIT) + +enum state { +#define UTF_STATE(NAME, MIN, MAX, NEXT) NAME, + UTF_STATES +#undef UTF_STATE +}; + +struct state_info { + uint8_t min, max; + enum state next; +}; + +static const struct state_info states[] = { +#define UTF_STATE(NAME, MIN, MAX, NEXT) {MIN, MAX, NEXT}, + UTF_STATES +#undef UTF_STATE +}; + +struct utf8_reader { + int cp; + enum state state; +}; + +struct utf8_reader * +utf8_reader_create(void) +{ + struct utf8_reader *r = xmalloc(sizeof *r); + r->state = UTF8_INIT; + return r; +} + +void +utf8_reader_destroy(struct utf8_reader *r) +{ + free(r); +} + +int +utf8_reader_read(struct utf8_reader *r, uint8_t c) +{ + const struct state_info *s = &states[r->state]; + if (c >= s->min && c <= s->max) { + if (r->state == UTF8_INIT) { + if (c < 0x80) { + return c; + } else if (c >= 0xc2 && c <= 0xdf) { + r->cp = c & 0x1f; + r->state = UTF8_1; + return -1; + } else if (c >= 0xe0 && c <= 0xef) { + r->cp = c & 0x0f; + r->state = c == 0xe0 ? UTF8_E0 : c == 0xed ? UTF8_ED : UTF8_2; + return -1; + } else if (c >= 0xf0 && c <= 0xf4) { + r->cp = c & 0x07; + r->state = c == 0xf0 ? UTF8_F0 : c == 0xf4 ? UTF8_F4 : UTF8_3; + return -1; + } + } else { + r->cp = (r->cp << 6) | (c & 0x3f); + r->state = s->next; + return r->state == UTF8_INIT ? r->cp : -1; + } + } + + /* Invalid UTF-8 sequence. Return the Unicode general substitute + * REPLACEMENT CHARACTER. */ + r->state = UTF8_INIT; + return 0xfffd; +} + +/* ANSI control sequence decoder. */ + +/* States are named for what we are looking for in that state. */ +enum ansi_state { + ANSI_ESC, /* Looking for ESC. */ + ANSI_CSI, /* Looking for [ (to complete CSI). */ + ANSI_PARAMETER, /* Looking for parameter. */ + ANSI_INTERMEDIATE, /* Looking for intermediate byte. */ + ANSI_FINAL, /* Looking for final byte. */ + ANSI_COMPLETE /* Got an entire escape sequence. */ +}; + +struct ansi_decoder { + enum ansi_state state; + struct ansi_sequence seq; + int c; +}; + +struct ansi_decoder * +ansi_decoder_create(void) +{ + struct ansi_decoder *d = xmalloc(sizeof *d); + d->state = ANSI_ESC; + return d; +} + +void +ansi_decoder_destroy(struct ansi_decoder *d) +{ + free(d); +} + +int +ansi_decoder_put(struct ansi_decoder *d, uint8_t c) +{ + if (c == 27) { + /* Escape always starts a new escape sequence, aborting an incomplete + * one if necessary. */ + if (d->state != ANSI_ESC) { + VLOG_DBG("Unexpected escape inside escape sequence"); + } + d->state = ANSI_CSI; + return 0; + } + + switch (d->state) { + case ANSI_ESC: + return 1; + + case ANSI_CSI: + if (c == '[') { + d->state = ANSI_PARAMETER; + d->seq.n_args = 0; + d->seq.function = 0; + } else if (c >= 0x40 && c <= 0x5f) { + d->state = ANSI_COMPLETE; + d->seq.n_args = 0; + d->seq.function = 0; + d->seq.function = c << 8; + return -1; + } else { + d->state = ANSI_ESC; + } + break; + + case ANSI_PARAMETER: + if (c >= '0' && c <= '9') { + int *arg; + if (d->seq.n_args == 0) { + d->seq.args[d->seq.n_args++] = 0; + } else if (d->seq.n_args > ANSI_MAX_ARGS) { + break; + } + arg = &d->seq.args[d->seq.n_args - 1]; + if (*arg == -1) { + *arg = 0; + } + *arg = *arg * 10 + (c - '0'); + break; + } else if (c == ';') { + if (d->seq.n_args < ANSI_MAX_ARGS) { + d->seq.args[d->seq.n_args] = -1; + } + d->seq.n_args++; + break; + } + d->state = ANSI_INTERMEDIATE; + /* Fall through. */ + + case ANSI_INTERMEDIATE: + if (c >= 0x20 && c <= 0x2f) { + d->seq.function = d->seq.function * 16 + (c - 0x20); + break; + } + d->state = ANSI_FINAL; + /* Fall through. */ + + case ANSI_FINAL: + if (c >= 0x40 && c <= 0x7e) { + d->seq.function = d->seq.function * 256 + c; + d->state = ANSI_COMPLETE; + return -1; + } else { + /* Invalid sequence. */ + d->state = ANSI_ESC; + } + break; + + case ANSI_COMPLETE: + NOT_REACHED(); + } + return 0; +} + +const struct ansi_sequence * +ansi_decoder_get(struct ansi_decoder *d) +{ + assert(d->state == ANSI_COMPLETE); + d->state = ANSI_ESC; + if (d->seq.n_args < ANSI_MAX_ARGS) { + int i; + for (i = d->seq.n_args; i < ANSI_MAX_ARGS; i++) { + d->seq.args[i] = -1; + } + } else { + d->seq.n_args = ANSI_MAX_ARGS; + } + return &d->seq; +} diff --git a/extras/ezio/terminal.h b/extras/ezio/terminal.h new file mode 100644 index 00000000..1ae5c479 --- /dev/null +++ b/extras/ezio/terminal.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2008, 2009 Nicira Networks, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + * + */ + +#ifndef TERMINAL_H +#define TERMINAL_H 1 + +#include +#include + +struct ezio; + +struct terminal *terminal_create(void); +void terminal_destroy(struct terminal *); +int terminal_run(struct terminal *, struct ezio *, int input_fd); +void terminal_wait(struct terminal *, int input_fd); + +#endif /* terminal.h */ diff --git a/extras/ezio/tty.c b/extras/ezio/tty.c new file mode 100644 index 00000000..ce788f28 --- /dev/null +++ b/extras/ezio/tty.c @@ -0,0 +1,404 @@ +/* Copyright (c) 2008, 2009 Nicira Networks, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + * + */ + +#include +#include "extras/ezio/tty.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "fatal-signal.h" +#include "socket-util.h" +#include "util.h" + +#define THIS_MODULE VLM_tty +#include "vlog.h" + +/* Get major() and minor() macros. */ +#if MAJOR_IN_MKDEV +# include +#elif MAJOR_IN_SYSMACROS +# include +#else +# include +# ifndef major +# define major(dev) (((dev) >> 8) & 0xff) +# define minor(dev) ((dev) & 0xff) +# endif +#endif + +static int +fcntl_lock(int fd) +{ + struct flock l; + memset(&l, 0, sizeof l); + l.l_type = F_WRLCK; + l.l_whence = SEEK_SET; + l.l_start = 0; + l.l_len = 0; + return fcntl(fd, F_SETLK, &l) == -1 ? errno : 0; +} + +static int +remove_lockfile(const char *name) +{ + char buffer[BUFSIZ]; + ssize_t n; + pid_t pid; + int fd; + + /* Remove existing lockfile. */ + fd = open(name, O_RDWR); + if (fd < 0) { + if (errno == ENOENT) { + return 0; + } else { + VLOG_ERR("%s: open: %s", name, strerror(errno)); + return errno; + } + } + + /* Read lockfile. */ + n = read(fd, buffer, sizeof buffer - 1); + if (n < 0) { + int error = errno; + VLOG_ERR("%s: read: %s", name, strerror(error)); + close(fd); + return error; + } + buffer[n] = '\0'; + if (n == 4 && memchr(buffer, '\0', n)) { + int32_t x; + memcpy(&x, buffer, sizeof x); + pid = x; + } else if (n >= 0) { + pid = strtol(buffer, NULL, 10); + } + if (pid <= 0) { + close(fd); + VLOG_WARN("%s: format not recognized, treating as locked.", name); + return EACCES; + } + + /* Is lockfile fresh? */ + if (strstr(buffer, "fcntl")) { + int retval = fcntl_lock(fd); + if (retval) { + close(fd); + VLOG_ERR("%s: device is locked (via fcntl): %s", + name, strerror(retval)); + return retval; + } else { + VLOG_WARN("%s: removing stale lockfile (checked via fcntl)", name); + } + } else { + if (!(kill(pid, 0) < 0 && errno == ESRCH)) { + close(fd); + VLOG_ERR("%s: device is locked (without fcntl)", name); + return EACCES; + } else { + VLOG_WARN("%s: removing stale lockfile (without fcntl)", name); + } + } + close(fd); + + /* Remove stale lockfile. */ + if (unlink(name)) { + VLOG_ERR("%s: unlink: %s", name, strerror(errno)); + return errno; + } + return 0; +} + +static int +create_lockfile(const char *name) +{ + const char *username; + char buffer[BUFSIZ]; + struct passwd *pwd; + mode_t old_umask; + uid_t uid; + int fd; + + /* Create file. */ + old_umask = umask(022); + fd = open(name, O_WRONLY | O_CREAT | O_EXCL, 0666); + if (fd < 0) { + int error = errno; + VLOG_ERR("%s: create: %s", name, strerror(error)); + umask(old_umask); + return error; + } + umask(old_umask); + + /* Lock file. */ + if (fcntl_lock(fd)) { + int error = errno; + close(fd); + VLOG_ERR("%s: cannot lock: %s", name, strerror(error)); + return error; + } + + /* Write to file. */ + uid = getuid(); + pwd = getpwuid(uid); + username = pwd ? pwd->pw_name : "unknown"; + snprintf(buffer, sizeof buffer, "%10ld %s %.20s fcntl\n", + (long int) getpid(), program_name, username); + if (write(fd, buffer, strlen(buffer)) != strlen(buffer)) { + int error = errno; + VLOG_ERR("%s: write: %s", name, strerror(error)); + close(fd); + unlink(name); + return error; + } + + /* We intentionally do not close 'fd', to avoid releasing the fcntl lock. + * The asssumption here is that we never unlock a tty. */ + fatal_signal_add_file_to_unlink(name); + + return 0; +} + +static int +do_lock(char *name) +{ + int retval = remove_lockfile(name); + if (!retval) { + retval = create_lockfile(name); + } + free(name); + return retval; +} + +int +tty_lock(const char *dev_name) +{ + struct stat s; + char *name; + int retval; + + /* Check that the lockfile directory exists. */ + if (stat(TTY_LOCK_DIR, &s)) { + VLOG_ERR("%s: stat: %s", TTY_LOCK_DIR, strerror(errno)); + return errno; + } + + /* First lock by device number. */ + if (stat(dev_name, &s)) { + VLOG_ERR("%s: stat: %s", dev_name, strerror(errno)); + return errno; + } + retval = do_lock(xasprintf("%s/LK.%03d.%03d.%03d", TTY_LOCK_DIR, + major(s.st_dev), + major(s.st_rdev), minor(s.st_rdev))); + if (retval) { + return retval; + } + + /* Then lock by device name. */ + if (!strncmp(dev_name, "/dev/", 5)) { + char *cp; + + name = xasprintf("%s/%s", TTY_LOCK_DIR, dev_name + 5); + for (cp = name + strlen(dev_name) + 1; *cp; cp++) { + if (*cp == '/') { + *cp = '_'; + } + } + } else { + char *slash = strrchr(dev_name, '/'); + name = xasprintf ("%s/%s", TTY_LOCK_DIR, slash ? slash + 1 : dev_name); + } + return do_lock(name); +} + +struct saved_termios { + int fd; + struct termios tios; +}; + +static void +restore_termios(void *s_) +{ + struct saved_termios *s = s_; + tcsetattr(s->fd, TCSAFLUSH, &s->tios); +} + +int +tty_set_raw_mode(int fd, speed_t speed) +{ + if (isatty(fd)) { + struct termios tios; + struct saved_termios *s; + + if (tcgetattr(fd, &tios) < 0) { + return errno; + } + + s = xmalloc(sizeof *s); + s->fd = dup(fd); + if (s->fd < 0) { + int error = errno; + VLOG_WARN("dup failed: %s", strerror(error)); + free(s); + return errno; + } + s->tios = tios; + fatal_signal_add_hook(restore_termios, s, true); + + tios.c_iflag &= ~(IGNBRK | BRKINT | PARMRK | ISTRIP + | INLCR | IGNCR | ICRNL | IXON); + tios.c_oflag &= ~OPOST; + tios.c_lflag &= ~(ECHO | ECHONL | ICANON | ISIG | IEXTEN); + tios.c_cflag &= ~(CSIZE | PARENB); + tios.c_cflag |= CS8; + if (speed != B0) { + cfsetispeed(&tios, speed); + cfsetospeed(&tios, speed); + } + if (tcsetattr(fd, TCSAFLUSH, &tios) < 0) { + return errno; + } + } + return set_nonblocking(fd); +} + +int +tty_open_master_pty(void) +{ + int retval; + int fd; + + fd = posix_openpt(O_RDWR | O_NOCTTY); + if (fd < 0) { + int error = errno; + VLOG_WARN("posix_openpt failed: %s", strerror(error)); + close(fd); + return -error; + } + + if (grantpt(fd) < 0) { + int error = errno; + VLOG_WARN("grantpt failed: %s", strerror(error)); + close(fd); + return -error; + } + + if (unlockpt(fd) < 0) { + int error = errno; + VLOG_WARN("unlockpt failed: %s", strerror(error)); + close(fd); + return -error; + } + + retval = set_nonblocking(fd); + if (retval) { + VLOG_WARN("set_nonblocking failed: %s", strerror(retval)); + close(fd); + return retval; + } + + return fd; +} + +int +tty_fork_child(int master_fd, char *argv[]) +{ + int retval = fork(); + if (!retval) { + char *slave_name; + int slave_fd; + int fd; + + /* Running in child process. */ + fatal_signal_fork(); + + /* Open pty slave as controlling terminal. */ + setsid(); + slave_name = ptsname(master_fd); + if (slave_name == NULL) { + ovs_fatal(errno, "ptsname"); + } + slave_fd = open(slave_name, O_RDWR); + if (isastream(slave_fd) + && (ioctl(slave_fd, I_PUSH, "ptem") < 0 + || ioctl(slave_fd, I_PUSH, "ldterm") < 0)) { + ovs_fatal(errno, "STREAMS ioctl"); + } + + /* Make pty slave stdin, stdout. */ + if (dup2(slave_fd, STDIN_FILENO) < 0 + || dup2(slave_fd, STDOUT_FILENO) < 0 + || dup2(slave_fd, STDERR_FILENO) < 0) { + ovs_fatal(errno, "dup2"); + } + + /* Close other file descriptors. */ + for (fd = 3; fd < 20; fd++) { + close(fd); + } + + /* Set terminal type. */ + setenv("TERM", "ezio3", true); + + /* Invoke subprocess. */ + execvp(argv[0], argv); + ovs_fatal(errno, "execvp"); + } else if (retval > 0) { + /* Running in parent process. */ + return 0; + } else { + /* Fork failed. */ + VLOG_WARN("fork failed: %s", strerror(errno)); + return errno; + } +} + +int +tty_set_window_size(int fd UNUSED, int rows UNUSED, int columns UNUSED) +{ +#ifdef TIOCGWINSZ + struct winsize win; + win.ws_row = rows; + win.ws_col = columns; + win.ws_xpixel = 0; + win.ws_ypixel = 0; + if (ioctl(fd, TIOCSWINSZ, &win) == -1) { + return errno; + } +#else +#error +#endif + return 0; +} diff --git a/extras/ezio/tty.h b/extras/ezio/tty.h new file mode 100644 index 00000000..7500df55 --- /dev/null +++ b/extras/ezio/tty.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2008, 2009 Nicira Networks, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + * + */ + +#ifndef TTY_H +#define TTY_H 1 + +#include + +int tty_lock(const char *dev_name); +int tty_set_raw_mode(int fd, speed_t); +int tty_open_master_pty(void); +int tty_fork_child(int master_fd, char *argv[]); +int tty_set_window_size(int fd, int n_rows, int n_columns); + +#endif /* tty.h */ diff --git a/extras/ezio/vt-dummy.c b/extras/ezio/vt-dummy.c new file mode 100644 index 00000000..f36d3114 --- /dev/null +++ b/extras/ezio/vt-dummy.c @@ -0,0 +1,40 @@ +/* Copyright (c) 2008, 2009 Nicira Networks, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + * + */ + +#include +#include "extras/ezio/vt.h" +#include + +#define THIS_MODULE VLM_vt +#include "vlog.h" + +int +vt_open(int open_flags) +{ + VLOG_ERR("no virtual terminal support on this platform"); + return -ENOSYS; +} diff --git a/extras/ezio/vt-linux.c b/extras/ezio/vt-linux.c new file mode 100644 index 00000000..f502c9e1 --- /dev/null +++ b/extras/ezio/vt-linux.c @@ -0,0 +1,139 @@ +/* Copyright (c) 2008, 2009 Nicira Networks, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + * + */ + +#include +#include "extras/ezio/vt.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include "util.h" + +#define THIS_MODULE VLM_vt +#include "vlog.h" + +static bool get_console_fd(int *fd); + +int +vt_open(int open_flags) +{ + int console_fd, vt_fd; + char name[16]; + int vt; + + if (!get_console_fd(&console_fd)) { + return -EACCES; + } + + /* Deallocate all unused virtual terminals, so that we don't proliferate an + * excess of empty ones over multiple runs. */ + if (ioctl(console_fd, VT_DISALLOCATE, 0) < 0) { + VLOG_WARN("failed to deallocate empty virtual terminals: %s", + strerror(errno)); + } + + /* Find a unused virtual terminal. */ + if (ioctl(console_fd, VT_OPENQRY, &vt) < 0) { + int error = errno; + VLOG_ERR("failed to find a free virtual terminal: %s", + strerror(error)); + close(console_fd); + return -error; + } + + /* Open virtual terminal. */ + sprintf(name, "/dev/tty%d", vt); + vt_fd = open(name, open_flags); + if (vt_fd < 0) { + int error = errno; + VLOG_ERR("failed to open %s: %s", name, strerror(error)); + close(console_fd); + return -error; + } + + /* Activate virtual terminal. */ + if (ioctl(console_fd, VT_ACTIVATE, vt) < 0 + || ioctl(console_fd, VT_WAITACTIVE, vt) < 0) { + int error = errno; + VLOG_ERR("could not activate virtual terminal %d: %s", + vt, strerror(error)); + close(console_fd); + close(vt_fd); + return -error; + } + + /* Success. */ + VLOG_DBG("allocated virtual terminal %d (%s)", vt, name); + close(console_fd); + return vt_fd; +} + +static bool +is_console(int fd) +{ + uint8_t type = 0; + return !ioctl(fd, KDGKBTYPE, &type) && (type == KB_101 || type == KB_84); +} + +static bool +open_console(const char *name, int *fdp) +{ + *fdp = open(name, O_RDWR | O_NOCTTY); + if (*fdp >= 0) { + if (is_console(*fdp)) { + return true; + } + close(*fdp); + } + return false; +} + +static bool +get_console_fd(int *fdp) +{ + int fd; + + if (open_console("/dev/tty", fdp) + || open_console("/dev/tty0", fdp) + || open_console("/dev/console", fdp)) { + return true; + } + for (fd = 0; fd < 3; fd++) { + if (is_console(fd)) { + *fdp = dup(fd); + if (*fdp >= 0) { + return true; + } + } + } + VLOG_ERR("unable to obtain a file descriptor for the console"); + return false; +} diff --git a/extras/ezio/vt.h b/extras/ezio/vt.h new file mode 100644 index 00000000..2aafe941 --- /dev/null +++ b/extras/ezio/vt.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2008, 2009 Nicira Networks, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + * + */ + +#ifndef VT_H +#define VT_H 1 + +int vt_open(int open_flags); + +#endif /* vt.h */ diff --git a/include/.gitignore b/include/.gitignore new file mode 100644 index 00000000..b336cc7c --- /dev/null +++ b/include/.gitignore @@ -0,0 +1,2 @@ +/Makefile +/Makefile.in diff --git a/include/automake.mk b/include/automake.mk new file mode 100644 index 00000000..a1316c4a --- /dev/null +++ b/include/automake.mk @@ -0,0 +1,2 @@ +include include/openflow/automake.mk +include include/openvswitch/automake.mk diff --git a/include/openflow/automake.mk b/include/openflow/automake.mk new file mode 100644 index 00000000..d4731550 --- /dev/null +++ b/include/openflow/automake.mk @@ -0,0 +1,4 @@ +noinst_HEADERS += \ + include/openflow/openflow-mgmt.h \ + include/openflow/nicira-ext.h \ + include/openflow/openflow.h diff --git a/include/openflow/nicira-ext.h b/include/openflow/nicira-ext.h new file mode 100644 index 00000000..176e0310 --- /dev/null +++ b/include/openflow/nicira-ext.h @@ -0,0 +1,109 @@ +/* + * Distributed under the terms of the GNU GPL version 2. + * Copyright (c) 2008, 2009 Nicira Networks + */ + +#ifndef OPENFLOW_NICIRA_EXT_H +#define OPENFLOW_NICIRA_EXT_H 1 + +#include "openflow/openflow.h" + +#define NICIRA_OUI_STR "002320" + +/* The following vendor extensions, proposed by Nicira Networks, are not yet + * ready for standardization (and may never be), so they are not included in + * openflow.h. */ + +#define NX_VENDOR_ID 0x00002320 + +enum nicira_type { + /* Switch status request. The request body is an ASCII string that + * specifies a prefix of the key names to include in the output; if it is + * the null string, then all key-value pairs are included. */ + NXT_STATUS_REQUEST, + + /* Switch status reply. The reply body is an ASCII string of key-value + * pairs in the form "key=value\n". */ + NXT_STATUS_REPLY, + + /* Configure an action. Most actions do not require configuration + * beyond that supplied in the actual action call. */ + NXT_ACT_SET_CONFIG, + + /* Get configuration of action. */ + NXT_ACT_GET_CONFIG, + + /* Remote command execution. The request body is a sequence of strings + * delimited by null bytes. The first string is a command name. + * Subsequent strings are command arguments. */ + NXT_COMMAND_REQUEST, + + /* Remote command execution reply, sent when the command's execution + * completes. The reply body is struct nx_command_reply. */ + NXT_COMMAND_REPLY, + + /* No longer used. */ + NXT_FLOW_END_CONFIG__OBSOLETE, + + /* No longer used. */ + NXT_FLOW_END__OBSOLETE, + + /* Management protocol. See "openflow-mgmt.h". */ + NXT_MGMT, +}; + +struct nicira_header { + struct ofp_header header; + uint32_t vendor; /* NX_VENDOR_ID. */ + uint32_t subtype; /* One of NXT_* above. */ +}; +OFP_ASSERT(sizeof(struct nicira_header) == sizeof(struct ofp_vendor_header) + 4); + + +enum nx_action_subtype { + NXAST_SNAT__OBSOLETE, /* No longer used. */ + NXAST_RESUBMIT /* Throw against flow table again. */ +}; + +/* Action structure for NXAST_RESUBMIT. */ +struct nx_action_resubmit { + uint16_t type; /* OFPAT_VENDOR. */ + uint16_t len; /* Length is 8. */ + uint32_t vendor; /* NX_VENDOR_ID. */ + uint16_t subtype; /* NXAST_RESUBMIT. */ + uint16_t in_port; /* New in_port for checking flow table. */ + uint8_t pad[4]; +}; +OFP_ASSERT(sizeof(struct nx_action_resubmit) == 16); + +/* Header for Nicira-defined actions. */ +struct nx_action_header { + uint16_t type; /* OFPAT_VENDOR. */ + uint16_t len; /* Length is 8. */ + uint32_t vendor; /* NX_VENDOR_ID. */ + uint16_t subtype; /* NXAST_*. */ + uint8_t pad[6]; +}; +OFP_ASSERT(sizeof(struct nx_action_header) == 16); + +/* Status bits for NXT_COMMAND_REPLY. */ +enum { + NXT_STATUS_EXITED = 1 << 31, /* Exited normally. */ + NXT_STATUS_SIGNALED = 1 << 30, /* Exited due to signal. */ + NXT_STATUS_UNKNOWN = 1 << 29, /* Exited for unknown reason. */ + NXT_STATUS_COREDUMP = 1 << 28, /* Exited with core dump. */ + NXT_STATUS_ERROR = 1 << 27, /* Command could not be executed. */ + NXT_STATUS_STARTED = 1 << 26, /* Command was started. */ + NXT_STATUS_EXITSTATUS = 0xff, /* Exit code mask if NXT_STATUS_EXITED. */ + NXT_STATUS_TERMSIG = 0xff, /* Signal number if NXT_STATUS_SIGNALED. */ +}; + +/* NXT_COMMAND_REPLY. */ +struct nx_command_reply { + struct nicira_header nxh; + uint32_t status; /* Status bits defined above. */ + /* Followed by any number of bytes of process output. */ +}; +OFP_ASSERT(sizeof(struct nx_command_reply) == 20); + +#endif /* openflow/nicira-ext.h */ diff --git a/include/openflow/openflow-mgmt.h b/include/openflow/openflow-mgmt.h new file mode 100644 index 00000000..1b4f1a0c --- /dev/null +++ b/include/openflow/openflow-mgmt.h @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef OPENFLOW_OPENFLOW_MGMT_H +#define OPENFLOW_OPENFLOW_MGMT_H 1 + +#include "openflow/nicira-ext.h" + +enum ofmp_type { + OFMPT_CAPABILITY_REQUEST, + OFMPT_CAPABILITY_REPLY, + OFMPT_RESOURCES_REQUEST, + OFMPT_RESOURCES_UPDATE, + OFMPT_CONFIG_REQUEST, + OFMPT_CONFIG_UPDATE, + OFMPT_CONFIG_UPDATE_ACK, + OFMPT_ERROR +}; + +/* Header on all OpenFlow management packets. */ +struct ofmp_header { + struct nicira_header header; + uint16_t type; /* One of OFMPT_* above. */ + uint8_t pad[2]; +}; +OFP_ASSERT(sizeof(struct ofmp_header) == sizeof(struct nicira_header) + 4); + + +/* Generic TLV header. */ +struct ofmp_tlv { + uint16_t type; /* Type of value (one of OFMPTLV_*). */ + uint16_t len; /* Length of TLV (includes this header). */ + uint8_t data[0]; /* Value of data as defined by type and length. */ +}; +OFP_ASSERT(sizeof(struct ofmp_tlv) == 4); + +/* Universal TLV terminator. Used to indicate end of TLV list. */ +struct ofmp_tlv_end { + uint16_t type; /* Type is 0. */ + uint16_t len; /* Length is 4. */ +}; +OFP_ASSERT(sizeof(struct ofmp_tlv_end) == 4); + + +/* Bitmask of capability description styles. */ +enum ofmp_capability_format { + OFMPCAF_SIMPLE = 0 << 0, /* "ovs-vswitchd.conf" style. */ +}; + +/* Body of capbility request. + * + * OFMPT_CAPABILITY_REQUEST (controller -> switch) */ +struct ofmp_capability_request { + struct ofmp_header header; + uint32_t format; /* One of OFMPCAF_*. */ +}; +OFP_ASSERT(sizeof(struct ofmp_capability_request) == 24); + +/* Body of reply to capability request. + * + * OFMPT_CAPABILITY_REPLY (switch -> controller). */ +struct ofmp_capability_reply { + struct ofmp_header header; + uint32_t format; /* One of OFMPCAF_*. */ + uint64_t mgmt_id; /* Management ID. */ + uint8_t data[0]; +}; +OFP_ASSERT(sizeof(struct ofmp_capability_reply) == 32); + + +/* Resource TLV for datapath description. */ +struct ofmptsr_dp { + uint16_t type; /* OFMPTSR_DP. */ + uint16_t len; /* 28. */ + uint8_t pad[4]; + uint64_t dp_id; /* Datapath ID. */ + uint8_t name[OFP_MAX_PORT_NAME_LEN]; /* Null-terminated name. */ +}; +OFP_ASSERT(sizeof(struct ofmptsr_dp) == 32); + +/* TLV types for switch resource descriptions. */ +enum ofmp_switch_resources { + OFMPTSR_END = 0, /* Terminator. */ + OFMPTSR_DP, /* Datapath. */ +}; + +/* Body of resources request. + * + * OFMPT_RESOURCES_REQUEST (controller -> switch) */ +struct ofmp_resources_request { + struct ofmp_header header; +}; + +/* Body of capbility update. Sent in response to a resources request or + * sent asynchronously when resources change on the switch. + * + * OFMPT_RESOURCES_UPDATE (switch -> controller) */ +struct ofmp_resources_update { + struct ofmp_header header; + uint8_t data[0]; +}; +OFP_ASSERT(sizeof(struct ofmp_resources_update) == 20); + + +/* Bitmask of capability description styles. */ +enum ofmp_config_format { + OFMPCOF_SIMPLE = 0 << 0, /* "ovs-vswitchd.conf" style. */ +}; + +#define CONFIG_COOKIE_LEN 20 + +/* Body of configuration request. + * + * OFMPT_CONFIG_REQUEST (controller -> switch) */ +struct ofmp_config_request { + struct ofmp_header header; + uint32_t format; /* One of OFMPCOF_*. */ +}; +OFP_ASSERT(sizeof(struct ofmp_config_request) == 24); + +/* Body of configuration update. Sent in response to a configuration + * request from the controller. May be sent asynchronously by either + * the controller or switch to modify configuration or notify of + * changes, respectively. If sent by the controller, the switch must + * respond with a OFMPT_CONFIG_UPDATE_ACK. + * + * OFMPT_CONFIG_UPDATE (switch <-> controller) */ +struct ofmp_config_update { + struct ofmp_header header; + uint32_t format; /* One of OFMPCOF_*. */ + uint8_t cookie[CONFIG_COOKIE_LEN]; /* Cookie of config attempting to be + * replaced by this update. */ + uint8_t data[0]; +}; +OFP_ASSERT(sizeof(struct ofmp_config_update) == 44); + +/* Bitmask of configuration update ack flags. */ +enum ofmp_config_update_ack_flags { + OFMPCUAF_SUCCESS = 1 << 0, /* Config succeeded. */ +}; + +/* Body of configuration update ack. Sent in response to a configuration + * udpate request. + * + * OFMPT_CONFIG_UPDATE_ACK (switch -> controller) */ +struct ofmp_config_update_ack { + struct ofmp_header header; + uint32_t format; /* One of OFMPCOF_*. */ + uint32_t flags; /* One of OFMPCUAF_*. */ + uint8_t cookie[CONFIG_COOKIE_LEN]; /* Cookie of current configuration + * being used in the switch. */ +}; +OFP_ASSERT(sizeof(struct ofmp_config_update_ack) == 48); + +/* Values for 'type' in ofmp_error_msg. */ +enum ofmp_error_type { + OFMPET_BAD_CONFIG /* Problem with configuration. */ +}; + +/* ofmp_error_msg 'code' values for OFMPET_BAD_CONFIG. 'data' contains + * at least the first 64 bytes of the failed request. */ +enum ofmp_bad_config_code { + OFMPBCC_BUSY, /* Config updating, try again. */ + OFMPBCC_OLD_COOKIE, /* Config has changed. */ +}; + +/* Body of error message. May be sent by either the switch or the + * controller to indicate some error condition. + * + * OFMPT_ERROR (switch <-> controller) */ +struct ofmp_error_msg { + struct ofmp_header header; + + uint16_t type; /* One of OFMPET_*. */ + uint16_t code; /* Code depending on 'type'. */ + uint8_t data[0]; /* Variable-length data. Interpreted based + on the type and code. */ +}; +OFP_ASSERT(sizeof(struct ofmp_error_msg) == 24); + +#endif /* openflow/openflow-mgmt.h */ diff --git a/include/openflow/openflow.h b/include/openflow/openflow.h new file mode 100644 index 00000000..cd96c6ad --- /dev/null +++ b/include/openflow/openflow.h @@ -0,0 +1,796 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* OpenFlow: protocol between controller and datapath. */ + +#ifndef OPENFLOW_OPENFLOW_H +#define OPENFLOW_OPENFLOW_H 1 + +#ifdef __KERNEL__ +#include +#else +#include +#endif + +#ifdef SWIG +#define OFP_ASSERT(EXPR) /* SWIG can't handle OFP_ASSERT. */ +#elif !defined(__cplusplus) +/* Build-time assertion for use in a declaration context. */ +#define OFP_ASSERT(EXPR) \ + extern int (*build_assert(void))[ sizeof(struct { \ + unsigned int build_assert_failed : (EXPR) ? 1 : -1; })] +#else /* __cplusplus */ +#include +#define OFP_ASSERT BOOST_STATIC_ASSERT +#endif /* __cplusplus */ + +#ifndef SWIG +#define OFP_PACKED __attribute__((packed)) +#else +#define OFP_PACKED /* SWIG doesn't understand __attribute. */ +#endif + +/* The most significant bit being set in the version field indicates an + * experimental OpenFlow version. + */ +#define OFP_VERSION 0x97 + +#define OFP_MAX_TABLE_NAME_LEN 32 +#define OFP_MAX_PORT_NAME_LEN 16 + +#define OFP_TCP_PORT 6633 +#define OFP_SSL_PORT 6633 + +#define OFP_ETH_ALEN 6 /* Bytes in an Ethernet address. */ + +/* Port numbering. Physical ports are numbered starting from 0. */ +enum ofp_port { + /* Maximum number of physical switch ports. */ + OFPP_MAX = 0xff00, + + /* Fake output "ports". */ + OFPP_IN_PORT = 0xfff8, /* Send the packet out the input port. This + virtual port must be explicitly used + in order to send back out of the input + port. */ + OFPP_TABLE = 0xfff9, /* Perform actions in flow table. + NB: This can only be the destination + port for packet-out messages. */ + OFPP_NORMAL = 0xfffa, /* Process with normal L2/L3 switching. */ + OFPP_FLOOD = 0xfffb, /* All physical ports except input port and + those disabled by STP. */ + OFPP_ALL = 0xfffc, /* All physical ports except input port. */ + OFPP_CONTROLLER = 0xfffd, /* Send to controller. */ + OFPP_LOCAL = 0xfffe, /* Local openflow "port". */ + OFPP_NONE = 0xffff /* Not associated with a physical port. */ +}; + +enum ofp_type { + /* Immutable messages. */ + OFPT_HELLO, /* Symmetric message */ + OFPT_ERROR, /* Symmetric message */ + OFPT_ECHO_REQUEST, /* Symmetric message */ + OFPT_ECHO_REPLY, /* Symmetric message */ + OFPT_VENDOR, /* Symmetric message */ + + /* Switch configuration messages. */ + OFPT_FEATURES_REQUEST, /* Controller/switch message */ + OFPT_FEATURES_REPLY, /* Controller/switch message */ + OFPT_GET_CONFIG_REQUEST, /* Controller/switch message */ + OFPT_GET_CONFIG_REPLY, /* Controller/switch message */ + OFPT_SET_CONFIG, /* Controller/switch message */ + + /* Asynchronous messages. */ + OFPT_PACKET_IN, /* Async message */ + OFPT_FLOW_EXPIRED, /* Async message */ + OFPT_PORT_STATUS, /* Async message */ + + /* Controller command messages. */ + OFPT_PACKET_OUT, /* Controller/switch message */ + OFPT_FLOW_MOD, /* Controller/switch message */ + OFPT_PORT_MOD, /* Controller/switch message */ + + /* Statistics messages. */ + OFPT_STATS_REQUEST, /* Controller/switch message */ + OFPT_STATS_REPLY /* Controller/switch message */ +}; + +/* Header on all OpenFlow packets. */ +struct ofp_header { + uint8_t version; /* OFP_VERSION. */ + uint8_t type; /* One of the OFPT_ constants. */ + uint16_t length; /* Length including this ofp_header. */ + uint32_t xid; /* Transaction id associated with this packet. + Replies use the same id as was in the request + to facilitate pairing. */ +}; +OFP_ASSERT(sizeof(struct ofp_header) == 8); + +/* OFPT_HELLO. This message has an empty body, but implementations must + * ignore any data included in the body, to allow for future extensions. */ +struct ofp_hello { + struct ofp_header header; +}; + +#define OFP_DEFAULT_MISS_SEND_LEN 128 + +enum ofp_config_flags { + /* Tells datapath to notify the controller of expired flow entries. */ + OFPC_SEND_FLOW_EXP = 1 << 0, + + /* Handling of IP fragments. */ + OFPC_FRAG_NORMAL = 0 << 1, /* No special handling for fragments. */ + OFPC_FRAG_DROP = 1 << 1, /* Drop fragments. */ + OFPC_FRAG_REASM = 2 << 1, /* Reassemble (only if OFPC_IP_REASM set). */ + OFPC_FRAG_MASK = 3 << 1 +}; + +/* Switch configuration. */ +struct ofp_switch_config { + struct ofp_header header; + uint16_t flags; /* OFPC_* flags. */ + uint16_t miss_send_len; /* Max bytes of new flow that datapath should + send to the controller. */ +}; +OFP_ASSERT(sizeof(struct ofp_switch_config) == 12); + +/* Capabilities supported by the datapath. */ +enum ofp_capabilities { + OFPC_FLOW_STATS = 1 << 0, /* Flow statistics. */ + OFPC_TABLE_STATS = 1 << 1, /* Table statistics. */ + OFPC_PORT_STATS = 1 << 2, /* Port statistics. */ + OFPC_STP = 1 << 3, /* 802.1d spanning tree. */ + OFPC_MULTI_PHY_TX = 1 << 4, /* Supports transmitting through multiple + physical interfaces */ + OFPC_IP_REASM = 1 << 5 /* Can reassemble IP fragments. */ +}; + +/* Flags to indicate behavior of the physical port. These flags are + * used in ofp_phy_port to describe the current configuration. They are + * used in the ofp_port_mod message to configure the port's behavior. + */ +enum ofp_port_config { + OFPPC_PORT_DOWN = 1 << 0, /* Port is administratively down. */ + + OFPPC_NO_STP = 1 << 1, /* Disable 802.1D spanning tree on port. */ + OFPPC_NO_RECV = 1 << 2, /* Drop most packets received on port. */ + OFPPC_NO_RECV_STP = 1 << 3, /* Drop received 802.1D STP packets. */ + OFPPC_NO_FLOOD = 1 << 4, /* Do not include this port when flooding. */ + OFPPC_NO_FWD = 1 << 5, /* Drop packets forwarded to port. */ + OFPPC_NO_PACKET_IN = 1 << 6 /* Do not send packet-in msgs for port. */ +}; + +/* Current state of the physical port. These are not configurable from + * the controller. + */ +enum ofp_port_state { + OFPPS_LINK_DOWN = 1 << 0, /* No physical link present. */ + + /* The OFPPS_STP_* bits have no effect on switch operation. The + * controller must adjust OFPPC_NO_RECV, OFPPC_NO_FWD, and + * OFPPC_NO_PACKET_IN appropriately to fully implement an 802.1D spanning + * tree. */ + OFPPS_STP_LISTEN = 0 << 8, /* Not learning or relaying frames. */ + OFPPS_STP_LEARN = 1 << 8, /* Learning but not relaying frames. */ + OFPPS_STP_FORWARD = 2 << 8, /* Learning and relaying frames. */ + OFPPS_STP_BLOCK = 3 << 8, /* Not part of spanning tree. */ + OFPPS_STP_MASK = 3 << 8 /* Bit mask for OFPPS_STP_* values. */ +}; + +/* Features of physical ports available in a datapath. */ +enum ofp_port_features { + OFPPF_10MB_HD = 1 << 0, /* 10 Mb half-duplex rate support. */ + OFPPF_10MB_FD = 1 << 1, /* 10 Mb full-duplex rate support. */ + OFPPF_100MB_HD = 1 << 2, /* 100 Mb half-duplex rate support. */ + OFPPF_100MB_FD = 1 << 3, /* 100 Mb full-duplex rate support. */ + OFPPF_1GB_HD = 1 << 4, /* 1 Gb half-duplex rate support. */ + OFPPF_1GB_FD = 1 << 5, /* 1 Gb full-duplex rate support. */ + OFPPF_10GB_FD = 1 << 6, /* 10 Gb full-duplex rate support. */ + OFPPF_COPPER = 1 << 7, /* Copper medium. */ + OFPPF_FIBER = 1 << 8, /* Fiber medium. */ + OFPPF_AUTONEG = 1 << 9, /* Auto-negotiation. */ + OFPPF_PAUSE = 1 << 10, /* Pause. */ + OFPPF_PAUSE_ASYM = 1 << 11 /* Asymmetric pause. */ +}; + +/* Description of a physical port */ +struct ofp_phy_port { + uint16_t port_no; + uint8_t hw_addr[OFP_ETH_ALEN]; + uint8_t name[OFP_MAX_PORT_NAME_LEN]; /* Null-terminated */ + + uint32_t config; /* Bitmap of OFPPC_* flags. */ + uint32_t state; /* Bitmap of OFPPS_* flags. */ + + /* Bitmaps of OFPPF_* that describe features. All bits zeroed if + * unsupported or unavailable. */ + uint32_t curr; /* Current features. */ + uint32_t advertised; /* Features being advertised by the port. */ + uint32_t supported; /* Features supported by the port. */ + uint32_t peer; /* Features advertised by peer. */ +}; +OFP_ASSERT(sizeof(struct ofp_phy_port) == 48); + +/* Switch features. */ +struct ofp_switch_features { + struct ofp_header header; + uint64_t datapath_id; /* Datapath unique ID. Only the lower 48-bits + are meaningful. */ + + uint32_t n_buffers; /* Max packets buffered at once. */ + + uint8_t n_tables; /* Number of tables supported by datapath. */ + uint8_t pad[3]; /* Align to 64-bits. */ + + /* Features. */ + uint32_t capabilities; /* Bitmap of support "ofp_capabilities". */ + uint32_t actions; /* Bitmap of supported "ofp_action_type"s. */ + + /* Port info.*/ + struct ofp_phy_port ports[0]; /* Port definitions. The number of ports + is inferred from the length field in + the header. */ +}; +OFP_ASSERT(sizeof(struct ofp_switch_features) == 32); + +/* What changed about the physical port */ +enum ofp_port_reason { + OFPPR_ADD, /* The port was added. */ + OFPPR_DELETE, /* The port was removed. */ + OFPPR_MODIFY /* Some attribute of the port has changed. */ +}; + +/* A physical port has changed in the datapath */ +struct ofp_port_status { + struct ofp_header header; + uint8_t reason; /* One of OFPPR_*. */ + uint8_t pad[7]; /* Align to 64-bits. */ + struct ofp_phy_port desc; +}; +OFP_ASSERT(sizeof(struct ofp_port_status) == 64); + +/* Modify behavior of the physical port */ +struct ofp_port_mod { + struct ofp_header header; + uint16_t port_no; + uint8_t hw_addr[OFP_ETH_ALEN]; /* The hardware address is not + configurable. This is used to + sanity-check the request, so it must + be the same as returned in an + ofp_phy_port struct. */ + + uint32_t config; /* Bitmap of OFPPC_* flags. */ + uint32_t mask; /* Bitmap of OFPPC_* flags to be changed. */ + + uint32_t advertise; /* Bitmap of "ofp_port_features"s. Zero all + bits to prevent any action taking place. */ + uint8_t pad[4]; /* Pad to 64-bits. */ +}; +OFP_ASSERT(sizeof(struct ofp_port_mod) == 32); + +/* Why is this packet being sent to the controller? */ +enum ofp_packet_in_reason { + OFPR_NO_MATCH, /* No matching flow. */ + OFPR_ACTION /* Action explicitly output to controller. */ +}; + +/* Packet received on port (datapath -> controller). */ +struct ofp_packet_in { + struct ofp_header header; + uint32_t buffer_id; /* ID assigned by datapath. */ + uint16_t total_len; /* Full length of frame. */ + uint16_t in_port; /* Port on which frame was received. */ + uint8_t reason; /* Reason packet is being sent (one of OFPR_*) */ + uint8_t pad; + uint8_t data[0]; /* Ethernet frame, halfway through 32-bit word, + so the IP header is 32-bit aligned. The + amount of data is inferred from the length + field in the header. Because of padding, + offsetof(struct ofp_packet_in, data) == + sizeof(struct ofp_packet_in) - 2. */ +}; +OFP_ASSERT(sizeof(struct ofp_packet_in) == 20); + +enum ofp_action_type { + OFPAT_OUTPUT, /* Output to switch port. */ + OFPAT_SET_VLAN_VID, /* Set the 802.1q VLAN id. */ + OFPAT_SET_VLAN_PCP, /* Set the 802.1q priority. */ + OFPAT_STRIP_VLAN, /* Strip the 802.1q header. */ + OFPAT_SET_DL_SRC, /* Ethernet source address. */ + OFPAT_SET_DL_DST, /* Ethernet destination address. */ + OFPAT_SET_NW_SRC, /* IP source address. */ + OFPAT_SET_NW_DST, /* IP destination address. */ + OFPAT_SET_TP_SRC, /* TCP/UDP source port. */ + OFPAT_SET_TP_DST, /* TCP/UDP destination port. */ + OFPAT_VENDOR = 0xffff +}; + +/* Action structure for OFPAT_OUTPUT, which sends packets out 'port'. + * When the 'port' is the OFPP_CONTROLLER, 'max_len' indicates the max + * number of bytes to send. A 'max_len' of zero means the entire packet + * should be sent. */ +struct ofp_action_output { + uint16_t type; /* OFPAT_OUTPUT. */ + uint16_t len; /* Length is 8. */ + uint16_t port; /* Output port. */ + uint16_t max_len; /* Max length to send to controller. */ +}; +OFP_ASSERT(sizeof(struct ofp_action_output) == 8); + +/* The VLAN id is 12 bits, so we can use the entire 16 bits to indicate + * special conditions. All ones is used to match that no VLAN id was + * set. */ +#define OFP_VLAN_NONE 0xffff + +/* Action structure for OFPAT_SET_VLAN_VID. */ +struct ofp_action_vlan_vid { + uint16_t type; /* OFPAT_SET_VLAN_VID. */ + uint16_t len; /* Length is 8. */ + uint16_t vlan_vid; /* VLAN id. */ + uint8_t pad[2]; +}; +OFP_ASSERT(sizeof(struct ofp_action_vlan_vid) == 8); + +/* Action structure for OFPAT_SET_VLAN_PCP. */ +struct ofp_action_vlan_pcp { + uint16_t type; /* OFPAT_SET_VLAN_PCP. */ + uint16_t len; /* Length is 8. */ + uint8_t vlan_pcp; /* VLAN priority. */ + uint8_t pad[3]; +}; +OFP_ASSERT(sizeof(struct ofp_action_vlan_vid) == 8); + +/* Action structure for OFPAT_SET_DL_SRC/DST. */ +struct ofp_action_dl_addr { + uint16_t type; /* OFPAT_SET_DL_SRC/DST. */ + uint16_t len; /* Length is 16. */ + uint8_t dl_addr[OFP_ETH_ALEN]; /* Ethernet address. */ + uint8_t pad[6]; +}; +OFP_ASSERT(sizeof(struct ofp_action_dl_addr) == 16); + +/* Action structure for OFPAT_SET_NW_SRC/DST. */ +struct ofp_action_nw_addr { + uint16_t type; /* OFPAT_SET_TW_SRC/DST. */ + uint16_t len; /* Length is 8. */ + uint32_t nw_addr; /* IP address. */ +}; +OFP_ASSERT(sizeof(struct ofp_action_nw_addr) == 8); + +/* Action structure for OFPAT_SET_TP_SRC/DST. */ +struct ofp_action_tp_port { + uint16_t type; /* OFPAT_SET_TP_SRC/DST. */ + uint16_t len; /* Length is 8. */ + uint16_t tp_port; /* TCP/UDP port. */ + uint8_t pad[2]; +}; +OFP_ASSERT(sizeof(struct ofp_action_tp_port) == 8); + +/* Action header for OFPAT_VENDOR. The rest of the body is vendor-defined. */ +struct ofp_action_vendor_header { + uint16_t type; /* OFPAT_VENDOR. */ + uint16_t len; /* Length is a multiple of 8. */ + uint32_t vendor; /* Vendor ID, which takes the same form + as in "struct ofp_vendor_header". */ +}; +OFP_ASSERT(sizeof(struct ofp_action_vendor_header) == 8); + +/* Action header that is common to all actions. The length includes the + * header and any padding used to make the action 64-bit aligned. + * NB: The length of an action *must* always be a multiple of eight. */ +struct ofp_action_header { + uint16_t type; /* One of OFPAT_*. */ + uint16_t len; /* Length of action, including this + header. This is the length of action, + including any padding to make it + 64-bit aligned. */ + uint8_t pad[4]; +}; +OFP_ASSERT(sizeof(struct ofp_action_header) == 8); + +union ofp_action { + uint16_t type; + struct ofp_action_header header; + struct ofp_action_vendor_header vendor; + struct ofp_action_output output; + struct ofp_action_vlan_vid vlan_vid; + struct ofp_action_vlan_pcp vlan_pcp; + struct ofp_action_nw_addr nw_addr; + struct ofp_action_tp_port tp_port; +}; +OFP_ASSERT(sizeof(union ofp_action) == 8); + +/* Send packet (controller -> datapath). */ +struct ofp_packet_out { + struct ofp_header header; + uint32_t buffer_id; /* ID assigned by datapath (-1 if none). */ + uint16_t in_port; /* Packet's input port (OFPP_NONE if none). */ + uint16_t actions_len; /* Size of action array in bytes. */ + struct ofp_action_header actions[0]; /* Actions. */ + /* uint8_t data[0]; */ /* Packet data. The length is inferred + from the length field in the header. + (Only meaningful if buffer_id == -1.) */ +}; +OFP_ASSERT(sizeof(struct ofp_packet_out) == 16); + +enum ofp_flow_mod_command { + OFPFC_ADD, /* New flow. */ + OFPFC_MODIFY, /* Modify all matching flows. */ + OFPFC_MODIFY_STRICT, /* Modify entry strictly matching wildcards */ + OFPFC_DELETE, /* Delete all matching flows. */ + OFPFC_DELETE_STRICT /* Strictly match wildcards and priority. */ +}; + +/* Flow wildcards. */ +enum ofp_flow_wildcards { + OFPFW_IN_PORT = 1 << 0, /* Switch input port. */ + OFPFW_DL_VLAN = 1 << 1, /* VLAN. */ + OFPFW_DL_SRC = 1 << 2, /* Ethernet source address. */ + OFPFW_DL_DST = 1 << 3, /* Ethernet destination address. */ + OFPFW_DL_TYPE = 1 << 4, /* Ethernet frame type. */ + OFPFW_NW_PROTO = 1 << 5, /* IP protocol. */ + OFPFW_TP_SRC = 1 << 6, /* TCP/UDP source port. */ + OFPFW_TP_DST = 1 << 7, /* TCP/UDP destination port. */ + + /* IP source address wildcard bit count. 0 is exact match, 1 ignores the + * LSB, 2 ignores the 2 least-significant bits, ..., 32 and higher wildcard + * the entire field. This is the *opposite* of the usual convention where + * e.g. /24 indicates that 8 bits (not 24 bits) are wildcarded. */ + OFPFW_NW_SRC_SHIFT = 8, + OFPFW_NW_SRC_BITS = 6, + OFPFW_NW_SRC_MASK = ((1 << OFPFW_NW_SRC_BITS) - 1) << OFPFW_NW_SRC_SHIFT, + OFPFW_NW_SRC_ALL = 32 << OFPFW_NW_SRC_SHIFT, + + /* IP destination address wildcard bit count. Same format as source. */ + OFPFW_NW_DST_SHIFT = 14, + OFPFW_NW_DST_BITS = 6, + OFPFW_NW_DST_MASK = ((1 << OFPFW_NW_DST_BITS) - 1) << OFPFW_NW_DST_SHIFT, + OFPFW_NW_DST_ALL = 32 << OFPFW_NW_DST_SHIFT, + + /* Wildcard all fields. */ + OFPFW_ALL = ((1 << 20) - 1) +}; + +/* The wildcards for ICMP type and code fields use the transport source + * and destination port fields, respectively. */ +#define OFPFW_ICMP_TYPE OFPFW_TP_SRC +#define OFPFW_ICMP_CODE OFPFW_TP_DST + +/* Values below this cutoff are 802.3 packets and the two bytes + * following MAC addresses are used as a frame length. Otherwise, the + * two bytes are used as the Ethernet type. + */ +#define OFP_DL_TYPE_ETH2_CUTOFF 0x0600 + +/* Value of dl_type to indicate that the frame does not include an + * Ethernet type. + */ +#define OFP_DL_TYPE_NOT_ETH_TYPE 0x05ff + +/* The VLAN id is 12-bits, so we can use the entire 16 bits to indicate + * special conditions. All ones indicates that no VLAN id was set. + */ +#define OFP_VLAN_NONE 0xffff + +/* Fields to match against flows */ +struct ofp_match { + uint32_t wildcards; /* Wildcard fields. */ + uint16_t in_port; /* Input switch port. */ + uint8_t dl_src[OFP_ETH_ALEN]; /* Ethernet source address. */ + uint8_t dl_dst[OFP_ETH_ALEN]; /* Ethernet destination address. */ + uint16_t dl_vlan; /* Input VLAN. */ + uint16_t dl_type; /* Ethernet frame type. */ + uint8_t nw_proto; /* IP protocol. */ + uint8_t pad; /* Align to 32-bits. */ + uint32_t nw_src; /* IP source address. */ + uint32_t nw_dst; /* IP destination address. */ + uint16_t tp_src; /* TCP/UDP source port. */ + uint16_t tp_dst; /* TCP/UDP destination port. */ +}; +OFP_ASSERT(sizeof(struct ofp_match) == 36); + +/* The match fields for ICMP type and code use the transport source and + * destination port fields, respectively. */ +#define icmp_type tp_src +#define icmp_code tp_dst + +/* Value used in "idle_timeout" and "hard_timeout" to indicate that the entry + * is permanent. */ +#define OFP_FLOW_PERMANENT 0 + +/* By default, choose a priority in the middle. */ +#define OFP_DEFAULT_PRIORITY 0x8000 + +/* Flow setup and teardown (controller -> datapath). */ +struct ofp_flow_mod { + struct ofp_header header; + struct ofp_match match; /* Fields to match */ + + /* Flow actions. */ + uint16_t command; /* One of OFPFC_*. */ + uint16_t idle_timeout; /* Idle time before discarding (seconds). */ + uint16_t hard_timeout; /* Max time before discarding (seconds). */ + uint16_t priority; /* Priority level of flow entry. */ + uint32_t buffer_id; /* Buffered packet to apply to (or -1). + Not meaningful for OFPFC_DELETE*. */ + uint16_t out_port; /* For OFPFC_DELETE* commands, require + matching entries to include this as an + output port. A value of OFPP_NONE + indicates no restriction. */ + uint8_t pad[2]; /* Align to 32-bits. */ + uint32_t reserved; /* Reserved for future use. */ + struct ofp_action_header actions[0]; /* The action length is inferred + from the length field in the + header. */ +}; +OFP_ASSERT(sizeof(struct ofp_flow_mod) == 64); + +/* Why did this flow expire? */ +enum ofp_flow_expired_reason { + OFPER_IDLE_TIMEOUT, /* Flow idle time exceeded idle_timeout. */ + OFPER_HARD_TIMEOUT /* Time exceeded hard_timeout. */ +}; + +/* Flow expiration (datapath -> controller). */ +struct ofp_flow_expired { + struct ofp_header header; + struct ofp_match match; /* Description of fields. */ + + uint16_t priority; /* Priority level of flow entry. */ + uint8_t reason; /* One of OFPER_*. */ + uint8_t pad[1]; /* Align to 32-bits. */ + + uint32_t duration; /* Time flow was alive in seconds. */ + uint8_t pad2[4]; /* Align to 64-bits. */ + uint64_t packet_count; + uint64_t byte_count; +}; +OFP_ASSERT(sizeof(struct ofp_flow_expired) == 72); + +/* Values for 'type' in ofp_error_message. These values are immutable: they + * will not change in future versions of the protocol (although new values may + * be added). */ +enum ofp_error_type { + OFPET_HELLO_FAILED, /* Hello protocol failed. */ + OFPET_BAD_REQUEST, /* Request was not understood. */ + OFPET_BAD_ACTION, /* Error in action description. */ + OFPET_FLOW_MOD_FAILED, /* Problem modifying flow entry. */ + OFPET_PORT_MOD_FAILED /* OFPT_PORT_MOD failed. */ +}; + +/* ofp_error_msg 'code' values for OFPET_HELLO_FAILED. 'data' contains an + * ASCII text string that may give failure details. */ +enum ofp_hello_failed_code { + OFPHFC_INCOMPATIBLE /* No compatible version. */ +}; + +/* ofp_error_msg 'code' values for OFPET_BAD_REQUEST. 'data' contains at least + * the first 64 bytes of the failed request. */ +enum ofp_bad_request_code { + OFPBRC_BAD_VERSION, /* ofp_header.version not supported. */ + OFPBRC_BAD_TYPE, /* ofp_header.type not supported. */ + OFPBRC_BAD_STAT, /* ofp_stats_request.type not supported. */ + OFPBRC_BAD_VENDOR, /* Vendor not supported (in ofp_vendor_header + * or ofp_stats_request or ofp_stats_reply). */ + OFPBRC_BAD_SUBTYPE, /* Vendor subtype not supported. */ + OFPBRC_BAD_LENGTH, /* Wrong request length for type. */ + OFPBRC_BUFFER_EMPTY, /* Specified buffer has already been used. */ + OFPBRC_BAD_COOKIE /* Specified buffer does not exist. */ +}; + +/* ofp_error_msg 'code' values for OFPET_BAD_ACTION. 'data' contains at least + * the first 64 bytes of the failed request. */ +enum ofp_bad_action_code { + OFPBAC_BAD_TYPE, /* Unknown action type. */ + OFPBAC_BAD_LEN, /* Length problem in actions. */ + OFPBAC_BAD_VENDOR, /* Unknown vendor id specified. */ + OFPBAC_BAD_VENDOR_TYPE, /* Unknown action type for vendor id. */ + OFPBAC_BAD_OUT_PORT, /* Problem validating output action. */ + OFPBAC_BAD_ARGUMENT, /* Bad action argument. */ + OFPBAC_TOO_MANY /* Can't handle this many actions. */ +}; + +/* ofp_error_msg 'code' values for OFPET_FLOW_MOD_FAILED. 'data' contains + * at least the first 64 bytes of the failed request. */ +enum ofp_flow_mod_failed_code { + OFPFMFC_ALL_TABLES_FULL, /* Flow not added because of full tables. */ + OFPFMFC_BAD_COMMAND /* Unknown command. */ +}; + +/* ofp_error_msg 'code' values for OFPET_PORT_MOD_FAILED. 'data' contains + * at least the first 64 bytes of the failed request. */ +enum ofp_port_mod_failed_code { + OFPPMFC_BAD_PORT, /* Specified port does not exist. */ + OFPPMFC_BAD_HW_ADDR, /* Specified hardware address is wrong. */ +}; + +/* OFPT_ERROR: Error message (datapath -> controller). */ +struct ofp_error_msg { + struct ofp_header header; + + uint16_t type; + uint16_t code; + uint8_t data[0]; /* Variable-length data. Interpreted based + on the type and code. */ +}; +OFP_ASSERT(sizeof(struct ofp_error_msg) == 12); + +enum ofp_stats_types { + /* Description of this OpenFlow switch. + * The request body is empty. + * The reply body is struct ofp_desc_stats. */ + OFPST_DESC, + + /* Individual flow statistics. + * The request body is struct ofp_flow_stats_request. + * The reply body is an array of struct ofp_flow_stats. */ + OFPST_FLOW, + + /* Aggregate flow statistics. + * The request body is struct ofp_aggregate_stats_request. + * The reply body is struct ofp_aggregate_stats_reply. */ + OFPST_AGGREGATE, + + /* Flow table statistics. + * The request body is empty. + * The reply body is an array of struct ofp_table_stats. */ + OFPST_TABLE, + + /* Physical port statistics. + * The request body is empty. + * The reply body is an array of struct ofp_port_stats. */ + OFPST_PORT, + + /* Vendor extension. + * The request and reply bodies begin with a 32-bit vendor ID, which takes + * the same form as in "struct ofp_vendor_header". The request and reply + * bodies are otherwise vendor-defined. */ + OFPST_VENDOR = 0xffff +}; + +struct ofp_stats_request { + struct ofp_header header; + uint16_t type; /* One of the OFPST_* constants. */ + uint16_t flags; /* OFPSF_REQ_* flags (none yet defined). */ + uint8_t body[0]; /* Body of the request. */ +}; +OFP_ASSERT(sizeof(struct ofp_stats_request) == 12); + +enum ofp_stats_reply_flags { + OFPSF_REPLY_MORE = 1 << 0 /* More replies to follow. */ +}; + +struct ofp_stats_reply { + struct ofp_header header; + uint16_t type; /* One of the OFPST_* constants. */ + uint16_t flags; /* OFPSF_REPLY_* flags. */ + uint8_t body[0]; /* Body of the reply. */ +}; +OFP_ASSERT(sizeof(struct ofp_stats_reply) == 12); + +#define DESC_STR_LEN 256 +#define SERIAL_NUM_LEN 32 +/* Body of reply to OFPST_DESC request. Each entry is a NULL-terminated + * ASCII string. */ +struct ofp_desc_stats { + char mfr_desc[DESC_STR_LEN]; /* Manufacturer description. */ + char hw_desc[DESC_STR_LEN]; /* Hardware description. */ + char sw_desc[DESC_STR_LEN]; /* Software description. */ + char serial_num[SERIAL_NUM_LEN]; /* Serial number. */ +}; +OFP_ASSERT(sizeof(struct ofp_desc_stats) == 800); + +/* Body for ofp_stats_request of type OFPST_FLOW. */ +struct ofp_flow_stats_request { + struct ofp_match match; /* Fields to match. */ + uint8_t table_id; /* ID of table to read (from ofp_table_stats) + or 0xff for all tables. */ + uint8_t pad; /* Align to 32 bits. */ + uint16_t out_port; /* Require matching entries to include this + as an output port. A value of OFPP_NONE + indicates no restriction. */ +}; +OFP_ASSERT(sizeof(struct ofp_flow_stats_request) == 40); + +/* Body of reply to OFPST_FLOW request. */ +struct ofp_flow_stats { + uint16_t length; /* Length of this entry. */ + uint8_t table_id; /* ID of table flow came from. */ + uint8_t pad; + struct ofp_match match; /* Description of fields. */ + uint32_t duration; /* Time flow has been alive in seconds. */ + uint16_t priority; /* Priority of the entry. Only meaningful + when this is not an exact-match entry. */ + uint16_t idle_timeout; /* Number of seconds idle before expiration. */ + uint16_t hard_timeout; /* Number of seconds before expiration. */ + uint16_t pad2[3]; /* Pad to 64 bits. */ + uint64_t packet_count; /* Number of packets in flow. */ + uint64_t byte_count; /* Number of bytes in flow. */ + struct ofp_action_header actions[0]; /* Actions. */ +}; +OFP_ASSERT(sizeof(struct ofp_flow_stats) == 72); + +/* Body for ofp_stats_request of type OFPST_AGGREGATE. */ +struct ofp_aggregate_stats_request { + struct ofp_match match; /* Fields to match. */ + uint8_t table_id; /* ID of table to read (from ofp_table_stats) + or 0xff for all tables. */ + uint8_t pad; /* Align to 32 bits. */ + uint16_t out_port; /* Require matching entries to include this + as an output port. A value of OFPP_NONE + indicates no restriction. */ +}; +OFP_ASSERT(sizeof(struct ofp_aggregate_stats_request) == 40); + +/* Body of reply to OFPST_AGGREGATE request. */ +struct ofp_aggregate_stats_reply { + uint64_t packet_count; /* Number of packets in flows. */ + uint64_t byte_count; /* Number of bytes in flows. */ + uint32_t flow_count; /* Number of flows. */ + uint8_t pad[4]; /* Align to 64 bits. */ +}; +OFP_ASSERT(sizeof(struct ofp_aggregate_stats_reply) == 24); + +/* Body of reply to OFPST_TABLE request. */ +struct ofp_table_stats { + uint8_t table_id; /* Identifier of table. Lower numbered tables + are consulted first. */ + uint8_t pad[3]; /* Align to 32-bits. */ + char name[OFP_MAX_TABLE_NAME_LEN]; + uint32_t wildcards; /* Bitmap of OFPFW_* wildcards that are + supported by the table. */ + uint32_t max_entries; /* Max number of entries supported. */ + uint32_t active_count; /* Number of active entries. */ + uint64_t lookup_count; /* Number of packets looked up in table. */ + uint64_t matched_count; /* Number of packets that hit table. */ +}; +OFP_ASSERT(sizeof(struct ofp_table_stats) == 64); + +/* Body of reply to OFPST_PORT request. If a counter is unsupported, set + * the field to all ones. */ +struct ofp_port_stats { + uint16_t port_no; + uint8_t pad[6]; /* Align to 64-bits. */ + uint64_t rx_packets; /* Number of received packets. */ + uint64_t tx_packets; /* Number of transmitted packets. */ + uint64_t rx_bytes; /* Number of received bytes. */ + uint64_t tx_bytes; /* Number of transmitted bytes. */ + uint64_t rx_dropped; /* Number of packets dropped by RX. */ + uint64_t tx_dropped; /* Number of packets dropped by TX. */ + uint64_t rx_errors; /* Number of receive errors. This is a super-set + of receive errors and should be great than or + equal to the sum of all rx_*_err values. */ + uint64_t tx_errors; /* Number of transmit errors. This is a super-set + of transmit errors. */ + uint64_t rx_frame_err; /* Number of frame alignment errors. */ + uint64_t rx_over_err; /* Number of packets with RX overrun. */ + uint64_t rx_crc_err; /* Number of CRC errors. */ + uint64_t collisions; /* Number of collisions. */ +}; +OFP_ASSERT(sizeof(struct ofp_port_stats) == 104); + +/* Vendor extension. */ +struct ofp_vendor_header { + struct ofp_header header; /* Type OFPT_VENDOR. */ + uint32_t vendor; /* Vendor ID: + * - MSB 0: low-order bytes are IEEE OUI. + * - MSB != 0: defined by OpenFlow + * consortium. */ + /* Vendor-defined arbitrary additional data. */ +}; +OFP_ASSERT(sizeof(struct ofp_vendor_header) == 12); + +#endif /* openflow/openflow.h */ diff --git a/include/openvswitch/automake.mk b/include/openvswitch/automake.mk new file mode 100644 index 00000000..889a21f5 --- /dev/null +++ b/include/openvswitch/automake.mk @@ -0,0 +1,4 @@ +noinst_HEADERS += \ + include/openvswitch/brcompat-netlink.h \ + include/openvswitch/datapath-protocol.h + diff --git a/include/openvswitch/brcompat-netlink.h b/include/openvswitch/brcompat-netlink.h new file mode 100644 index 00000000..016ddfa5 --- /dev/null +++ b/include/openvswitch/brcompat-netlink.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef OPENVSWITCH_BRCOMPAT_NETLINK_H +#define OPENVSWITCH_BRCOMPAT_NETLINK_H 1 + +#define BRC_GENL_FAMILY_NAME "brcompat" + +/* Attributes that can be attached to the datapath's netlink messages. */ +enum { + BRC_GENL_A_UNSPEC, + BRC_GENL_A_DP_NAME, /* Datapath name. */ + BRC_GENL_A_PORT_NAME, /* Interface name. */ + BRC_GENL_A_ERR_CODE, /* Positive error code. */ + BRC_GENL_A_MC_GROUP, /* Generic netlink multicast group. */ + BRC_GENL_A_PROC_DIR, /* Name of subdirectory in /proc. */ + BRC_GENL_A_PROC_NAME, /* Name of file in /proc. */ + BRC_GENL_A_PROC_DATA, /* Contents of file in /proc. */ + + __BRC_GENL_A_MAX, + BRC_GENL_A_MAX = __BRC_GENL_A_MAX - 1 +}; + +/* Commands that can be executed on the datapath's netlink interface. */ +enum brc_genl_command { + BRC_GENL_C_UNSPEC, + + /* + * "K:" messages are sent by the kernel to userspace. + * "U:" messages are sent by userspace to the kernel. + */ + BRC_GENL_C_DP_ADD, /* K: Datapath created. */ + BRC_GENL_C_DP_DEL, /* K: Datapath destroyed. */ + BRC_GENL_C_DP_RESULT, /* U: Return code from ovs-brcompatd. */ + BRC_GENL_C_PORT_ADD, /* K: Port added to datapath. */ + BRC_GENL_C_PORT_DEL, /* K: Port removed from datapath. */ + BRC_GENL_C_QUERY_MC, /* U: Get multicast group for brcompat. */ + BRC_GENL_C_SET_PROC, /* U: Set contents of file in /proc. */ + + __BRC_GENL_C_MAX, + BRC_GENL_C_MAX = __BRC_GENL_C_MAX - 1 +}; +#endif /* openvswitch/brcompat-netlink.h */ diff --git a/include/openvswitch/datapath-protocol.h b/include/openvswitch/datapath-protocol.h new file mode 100644 index 00000000..537dd595 --- /dev/null +++ b/include/openvswitch/datapath-protocol.h @@ -0,0 +1,287 @@ +/* + * Copyright (c) 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* Protocol between secchan and datapath. */ + +#ifndef OPENVSWITCH_DATAPATH_PROTOCOL_H +#define OPENVSWITCH_DATAPATH_PROTOCOL_H 1 + +#ifdef __KERNEL__ +#include +#else +#include +#endif +#include + +#define ODP_MAX 256 /* Maximum number of datapaths. */ + +#define ODP_DP_CREATE _IO('O', 0) +#define ODP_DP_DESTROY _IO('O', 1) +#define ODP_DP_STATS _IOW('O', 2, struct odp_stats) + +#define ODP_GET_DROP_FRAGS _IOW('O', 3, int) +#define ODP_SET_DROP_FRAGS _IOR('O', 4, int) + +#define ODP_GET_LISTEN_MASK _IOW('O', 5, int) +#define ODP_SET_LISTEN_MASK _IOR('O', 6, int) + +#define ODP_PORT_ADD _IOR('O', 7, struct odp_port) +#define ODP_PORT_DEL _IOR('O', 8, int) +#define ODP_PORT_QUERY _IOWR('O', 9, struct odp_port) +#define ODP_PORT_LIST _IOWR('O', 10, struct odp_portvec) + +#define ODP_PORT_GROUP_SET _IOR('O', 11, struct odp_port_group) +#define ODP_PORT_GROUP_GET _IOWR('O', 12, struct odp_port_group) + +#define ODP_FLOW_GET _IOWR('O', 13, struct odp_flow) +#define ODP_FLOW_GET_MULTIPLE _IOWR('O', 14, struct odp_flowvec) +#define ODP_FLOW_LIST _IOWR('O', 15, struct odp_flowvec) + +#define ODP_FLOW_FLUSH _IO('O', 16) +#define ODP_FLOW_PUT _IOWR('O', 17, struct odp_flow) +#define ODP_FLOW_DEL _IOWR('O', 18, struct odp_flow) + +#define ODP_EXECUTE _IOR('O', 19, struct odp_execute) + +struct odp_stats { + /* Flows. */ + __u32 n_flows; /* Number of flows in flow table. */ + __u32 cur_capacity; /* Current flow table capacity. */ + __u32 max_capacity; /* Maximum expansion of flow table capacity. */ + + /* Ports. */ + __u32 n_ports; /* Current number of ports. */ + __u32 max_ports; /* Maximum supported number of ports. */ + __u16 max_groups; /* Maximum number of port groups. */ + __u16 reserved; + + /* Lookups. */ + __u64 n_frags; /* Number of dropped IP fragments. */ + __u64 n_hit; /* Number of flow table matches. */ + __u64 n_missed; /* Number of flow table misses. */ + __u64 n_lost; /* Number of misses not sent to userspace. */ + + /* Queues. */ + __u16 max_miss_queue; /* Max length of ODPL_MISS queue. */ + __u16 max_action_queue; /* Max length of ODPL_ACTION queue. */ +}; + +/* Logical ports. */ +#define ODPP_LOCAL ((__u16)0) +#define ODPP_NONE ((__u16)-1) + +/* Listening channels. */ +#define _ODPL_MISS_NR 0 /* Packet missed in flow table. */ +#define ODPL_MISS (1 << _ODPL_MISS_NR) +#define _ODPL_ACTION_NR 1 /* Packet output to ODPP_CONTROLLER. */ +#define ODPL_ACTION (1 << _ODPL_ACTION_NR) +#define ODPL_ALL (ODPL_MISS | ODPL_ACTION) + +/* Format of messages read from datapath fd. */ +struct odp_msg { + __u32 type; /* _ODPL_MISS_NR or _ODPL_ACTION_NR. */ + __u32 length; /* Message length, including header. */ + __u16 port; /* Port on which frame was received. */ + __u16 reserved; + __u32 arg; /* Argument value specified in action. */ + /* Followed by packet data. */ +}; + +#define ODP_PORT_INTERNAL (1 << 0) /* This port is simulated. */ +struct odp_port { + char devname[16]; /* IFNAMSIZ */ + __u16 port; + __u16 flags; + __u32 reserved2; +}; + +struct odp_portvec { + struct odp_port *ports; + int n_ports; +}; + +struct odp_port_group { + __u16 *ports; + __u16 n_ports; /* Number of ports. */ + __u16 group; /* Group number. */ +}; + +struct odp_flow_stats { + __u64 n_packets; /* Number of matched packets. */ + __u64 n_bytes; /* Number of matched bytes. */ + __u64 used_sec; /* Time last used. */ + __u32 used_nsec; + __u8 tcp_flags; + __u8 ip_tos; + __u16 reserved; +}; + +struct odp_flow_key { + __be32 nw_src; /* IP source address. */ + __be32 nw_dst; /* IP destination address. */ + __u16 in_port; /* Input switch port. */ + __be16 dl_vlan; /* Input VLAN. */ + __be16 dl_type; /* Ethernet frame type. */ + __be16 tp_src; /* TCP/UDP source port. */ + __be16 tp_dst; /* TCP/UDP destination port. */ + __u8 dl_src[ETH_ALEN]; /* Ethernet source address. */ + __u8 dl_dst[ETH_ALEN]; /* Ethernet destination address. */ + __u8 nw_proto; /* IP protocol. */ + __u8 reserved; /* Pad to 64 bits. */ +}; + +struct odp_flow { + struct odp_flow_stats stats; + struct odp_flow_key key; + union odp_action *actions; + __u32 n_actions; +}; + +/* Flags for ODP_FLOW_PUT. */ +#define ODPPF_CREATE (1 << 0) /* Allow creating a new flow. */ +#define ODPPF_MODIFY (1 << 1) /* Allow modifying an existing flow. */ +#define ODPPF_ZERO_STATS (1 << 2) /* Zero the stats of an existing flow. */ + +/* ODP_FLOW_PUT argument. */ +struct odp_flow_put { + struct odp_flow flow; + __u32 flags; +}; + +struct odp_flowvec { + struct odp_flow *flows; + int n_flows; +}; + +/* The VLAN id is 12 bits, so we can use the entire 16 bits to indicate + * special conditions. All ones is used to match that no VLAN id was + * set. */ +#define ODP_VLAN_NONE 0xffff + +/* Action types. */ +#define ODPAT_OUTPUT 0 /* Output to switch port. */ +#define ODPAT_OUTPUT_GROUP 1 /* Output to all ports in group. */ +#define ODPAT_CONTROLLER 2 /* Send copy to controller. */ +#define ODPAT_SET_VLAN_VID 3 /* Set the 802.1q VLAN id. */ +#define ODPAT_SET_VLAN_PCP 4 /* Set the 802.1q priority. */ +#define ODPAT_STRIP_VLAN 5 /* Strip the 802.1q header. */ +#define ODPAT_SET_DL_SRC 6 /* Ethernet source address. */ +#define ODPAT_SET_DL_DST 7 /* Ethernet destination address. */ +#define ODPAT_SET_NW_SRC 8 /* IP source address. */ +#define ODPAT_SET_NW_DST 9 /* IP destination address. */ +#define ODPAT_SET_TP_SRC 10 /* TCP/UDP source port. */ +#define ODPAT_SET_TP_DST 11 /* TCP/UDP destination port. */ +#define ODPAT_N_ACTIONS 12 + +struct odp_action_output { + __u16 type; /* ODPAT_OUTPUT. */ + __u16 port; /* Output port. */ + __u16 reserved1; + __u16 reserved2; +}; + +struct odp_action_output_group { + __u16 type; /* ODPAT_OUTPUT_GROUP. */ + __u16 group; /* Group number. */ + __u16 reserved1; + __u16 reserved2; +}; + +struct odp_action_controller { + __u16 type; /* ODPAT_OUTPUT_CONTROLLER. */ + __u16 reserved; + __u32 arg; /* Copied to struct odp_msg 'arg' member. */ +}; + +/* Action structure for ODPAT_SET_VLAN_VID. */ +struct odp_action_vlan_vid { + __u16 type; /* ODPAT_SET_VLAN_VID. */ + __be16 vlan_vid; /* VLAN id. */ + __u16 reserved1; + __u16 reserved2; +}; + +/* Action structure for ODPAT_SET_VLAN_PCP. */ +struct odp_action_vlan_pcp { + __u16 type; /* ODPAT_SET_VLAN_PCP. */ + __u8 vlan_pcp; /* VLAN priority. */ + __u8 reserved1; + __u16 reserved2; + __u16 reserved3; +}; + +/* Action structure for ODPAT_SET_DL_SRC/DST. */ +struct odp_action_dl_addr { + __u16 type; /* ODPAT_SET_DL_SRC/DST. */ + __u8 dl_addr[ETH_ALEN]; /* Ethernet address. */ +}; + +/* Action structure for ODPAT_SET_NW_SRC/DST. */ +struct odp_action_nw_addr { + __u16 type; /* ODPAT_SET_TW_SRC/DST. */ + __u16 reserved; + __be32 nw_addr; /* IP address. */ +}; + +/* Action structure for ODPAT_SET_TP_SRC/DST. */ +struct odp_action_tp_port { + __u16 type; /* ODPAT_SET_TP_SRC/DST. */ + __be16 tp_port; /* TCP/UDP port. */ + __u16 reserved1; + __u16 reserved2; +}; + +union odp_action { + __u16 type; + struct odp_action_output output; + struct odp_action_output_group output_group; + struct odp_action_controller controller; + struct odp_action_vlan_vid vlan_vid; + struct odp_action_vlan_pcp vlan_pcp; + struct odp_action_dl_addr dl_addr; + struct odp_action_nw_addr nw_addr; + struct odp_action_tp_port tp_port; +}; + +struct odp_execute { + __u16 in_port; + __u16 reserved1; + __u32 reserved2; + + union odp_action *actions; + __u32 n_actions; + + const void *data; + __u32 length; +}; + +/* Values below this cutoff are 802.3 packets and the two bytes + * following MAC addresses are used as a frame length. Otherwise, the + * two bytes are used as the Ethernet type. + */ +#define ODP_DL_TYPE_ETH2_CUTOFF 0x0600 + +/* Value of dl_type to indicate that the frame does not include an + * Ethernet type. + */ +#define ODP_DL_TYPE_NOT_ETH_TYPE 0x05ff + +/* The VLAN id is 12-bits, so we can use the entire 16 bits to indicate + * special conditions. All ones indicates that no VLAN id was set. + */ +#define ODP_VLAN_NONE 0xffff + +#endif /* openvswitch/datapath-protocol.h */ diff --git a/lib/.gitignore b/lib/.gitignore new file mode 100644 index 00000000..6a3f65ce --- /dev/null +++ b/lib/.gitignore @@ -0,0 +1,4 @@ +/Makefile +/Makefile.in +/dhparams.c +/coverage-counters.c diff --git a/lib/automake.mk b/lib/automake.mk new file mode 100644 index 00000000..4d2924b6 --- /dev/null +++ b/lib/automake.mk @@ -0,0 +1,184 @@ +noinst_LIBRARIES += lib/libopenvswitch.a + +lib_libopenvswitch_a_SOURCES = \ + lib/backtrace.c \ + lib/backtrace.h \ + lib/bitmap.c \ + lib/bitmap.h \ + lib/cfg.c \ + lib/cfg.h \ + lib/classifier.c \ + lib/classifier.h \ + lib/command-line.c \ + lib/command-line.h \ + lib/compiler.h \ + lib/coverage.c \ + lib/coverage.h \ + lib/coverage-counters.c \ + lib/coverage-counters.h \ + lib/csum.c \ + lib/csum.h \ + lib/daemon.c \ + lib/daemon.h \ + lib/dhcp-client.c \ + lib/dhcp-client.h \ + lib/dhcp.c \ + lib/dhcp.h \ + lib/dhparams.h \ + lib/dirs.c \ + lib/dirs.h \ + lib/dynamic-string.c \ + lib/dynamic-string.h \ + lib/fatal-signal.c \ + lib/fatal-signal.h \ + lib/fault.c \ + lib/fault.h \ + lib/flow.c \ + lib/flow.h \ + lib/hash.c \ + lib/hash.h \ + lib/hmap.c \ + lib/hmap.h \ + lib/leak-checker.c \ + lib/leak-checker.h \ + lib/learning-switch.c \ + lib/learning-switch.h \ + lib/list.c \ + lib/list.h \ + lib/mac-learning.c \ + lib/mac-learning.h \ + lib/netdev.c \ + lib/netdev.h \ + lib/odp-util.c \ + lib/odp-util.h \ + lib/ofp-print.c \ + lib/ofp-print.h \ + lib/ofpbuf.c \ + lib/ofpbuf.h \ + lib/packets.h \ + lib/pcap.c \ + lib/pcap.h \ + lib/poll-loop.c \ + lib/poll-loop.h \ + lib/port-array.c \ + lib/port-array.h \ + lib/process.c \ + lib/process.h \ + lib/queue.c \ + lib/queue.h \ + lib/random.c \ + lib/random.h \ + lib/rconn.c \ + lib/rconn.h \ + lib/sat-math.h \ + lib/sha1.c \ + lib/sha1.h \ + lib/shash.c \ + lib/shash.h \ + lib/signals.c \ + lib/signals.h \ + lib/socket-util.c \ + lib/socket-util.h \ + lib/stp.c \ + lib/stp.h \ + lib/svec.c \ + lib/svec.h \ + lib/tag.c \ + lib/tag.h \ + lib/timeval.c \ + lib/timeval.h \ + lib/type-props.h \ + lib/unixctl.c \ + lib/unixctl.h \ + lib/util.c \ + lib/util.h \ + lib/valgrind.h \ + lib/vconn-provider.h \ + lib/vconn-ssl.h \ + lib/vconn-stream.c \ + lib/vconn-stream.h \ + lib/vconn-tcp.c \ + lib/vconn-unix.c \ + lib/vconn.c \ + lib/vconn.h \ + lib/vlog-modules.def \ + lib/vlog.c \ + lib/vlog.h \ + lib/xtoxll.h + +if HAVE_NETLINK +lib_libopenvswitch_a_SOURCES += \ + lib/dpif.c \ + lib/dpif.h \ + lib/netlink-protocol.h \ + lib/netlink.c \ + lib/netlink.h +endif + +if HAVE_OPENSSL +lib_libopenvswitch_a_SOURCES += \ + lib/vconn-ssl.c +nodist_lib_libopenvswitch_a_SOURCES = lib/dhparams.c +lib/dhparams.c: lib/dh1024.pem lib/dh2048.pem lib/dh4096.pem + (echo '#include "lib/dhparams.h"' && \ + openssl dhparam -C -in $(srcdir)/lib/dh1024.pem -noout && \ + openssl dhparam -C -in $(srcdir)/lib/dh2048.pem -noout && \ + openssl dhparam -C -in $(srcdir)/lib/dh4096.pem -noout) \ + | sed 's/\(get_dh[0-9]*\)()/\1(void)/' > lib/dhparams.c.tmp + mv lib/dhparams.c.tmp lib/dhparams.c +endif + +EXTRA_DIST += \ + lib/dh1024.pem \ + lib/dh2048.pem \ + lib/dh4096.pem \ + lib/dhparams.h + +EXTRA_DIST += \ + lib/common.man \ + lib/daemon.man \ + lib/dpif.man \ + lib/leak-checker.man \ + lib/vlog.man + + +CLEANFILES += lib/dirs.c +lib/dirs.c: Makefile + ($(ro_c) && \ + echo 'const char ovs_pkgdatadir[] = "$(pkgdatadir)";' && \ + echo 'const char ovs_rundir[] = "@RUNDIR@";' && \ + echo 'const char ovs_logdir[] = "@LOGDIR@";' && \ + echo 'const char ovs_bindir[] = "$(bindir)";') > lib/dirs.c.tmp + mv lib/dirs.c.tmp lib/dirs.c + +install-data-local: + $(MKDIR_P) $(DESTDIR)$(RUNDIR) + $(MKDIR_P) $(DESTDIR)$(PKIDIR) + $(MKDIR_P) $(DESTDIR)$(LOGDIR) + +# All the source files that have coverage counters. +COVERAGE_FILES = \ + lib/cfg.c \ + lib/dpif.c \ + lib/flow.c \ + lib/hmap.c \ + lib/mac-learning.c \ + lib/netdev.c \ + lib/netlink.c \ + lib/odp-util.c \ + lib/poll-loop.c \ + lib/process.c \ + lib/rconn.c \ + lib/timeval.c \ + lib/unixctl.c \ + lib/util.c \ + lib/vconn.c \ + secchan/ofproto.c \ + secchan/pktbuf.c \ + vswitchd/bridge.c \ + vswitchd/mgmt.c \ + vswitchd/ovs-brcompatd.c +lib/coverage-counters.c: $(COVERAGE_FILES) lib/coverage-scan.pl + (cd $(srcdir) && $(PERL) lib/coverage-scan.pl $(COVERAGE_FILES)) > $@.tmp + mv $@.tmp $@ +EXTRA_DIST += lib/coverage-scan.pl diff --git a/lib/backtrace.c b/lib/backtrace.c new file mode 100644 index 00000000..07060390 --- /dev/null +++ b/lib/backtrace.c @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "backtrace.h" +#include +#include +#include +#include +#include "compiler.h" + +#define THIS_MODULE VLM_backtrace +#include "vlog.h" + +static uintptr_t UNUSED +get_max_stack(void) +{ + static const char file_name[] = "/proc/self/maps"; + char line[1024]; + int line_number; + FILE *f; + + f = fopen(file_name, "r"); + if (f == NULL) { + VLOG_WARN("opening %s failed: %s", file_name, strerror(errno)); + return -1; + } + + for (line_number = 1; fgets(line, sizeof line, f); line_number++) { + if (strstr(line, "[stack]")) { + uintptr_t end; + if (sscanf(line, "%*"SCNxPTR"-%"SCNxPTR, &end) != 1) { + VLOG_WARN("%s:%d: parse error", file_name, line_number); + continue; + } + fclose(f); + return end; + } + } + fclose(f); + + VLOG_WARN("%s: no stack found", file_name); + return -1; +} + +static uintptr_t +stack_high(void) +{ + static uintptr_t high; + if (!high) { + high = get_max_stack(); + } + return high; +} + +static uintptr_t +stack_low(void) +{ +#ifdef __i386__ + uintptr_t low; + asm("movl %%esp,%0" : "=g" (low)); + return low; +#else + /* This causes a warning in GCC that cannot be disabled, so use it only on + * non-x86. */ + int dummy; + return (uintptr_t) &dummy; +#endif +} + +static bool +in_stack(void *p) +{ + uintptr_t address = (uintptr_t) p; + return address >= stack_low() && address < stack_high(); +} + +void +backtrace_capture(struct backtrace *backtrace) +{ + void **frame; + size_t n; + + n = 0; + for (frame = __builtin_frame_address(1); + frame != NULL && in_stack(frame) && frame[0] != NULL + && n < BACKTRACE_MAX_FRAMES; + frame = frame[0]) + { + backtrace->frames[n++] = (uintptr_t) frame[1]; + } + backtrace->n_frames = n; +} diff --git a/lib/backtrace.h b/lib/backtrace.h new file mode 100644 index 00000000..49b250c7 --- /dev/null +++ b/lib/backtrace.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef BACKTRACE_H +#define BACKTRACE_H 1 + +#include + +#define BACKTRACE_MAX_FRAMES 31 + +struct backtrace { + int n_frames; + uintptr_t frames[BACKTRACE_MAX_FRAMES]; +}; + +void backtrace_capture(struct backtrace *); + +#endif /* backtrace.h */ diff --git a/lib/bitmap.c b/lib/bitmap.c new file mode 100644 index 00000000..cab7e6d2 --- /dev/null +++ b/lib/bitmap.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "bitmap.h" +#include + +/* Sets 'count' consecutive bits in 'bitmap', starting at bit offset 'start', + * to 'value'. */ +void +bitmap_set_multiple(unsigned long *bitmap, size_t start, size_t count, + bool value) +{ + for (; count && start % BITMAP_ULONG_BITS; count--) { + bitmap_set(bitmap, start++, value); + } + for (; count >= BITMAP_ULONG_BITS; count -= BITMAP_ULONG_BITS) { + *bitmap_unit__(bitmap, start) = -(unsigned long) value; + start += BITMAP_ULONG_BITS; + } + for (; count; count--) { + bitmap_set(bitmap, start++, value); + } +} + +/* Compares the 'n' bits in bitmaps 'a' and 'b'. Returns true if all bits are + * equal, false otherwise. */ +bool +bitmap_equal(const unsigned long *a, const unsigned long *b, size_t n) +{ + size_t i; + + if (memcmp(a, b, n / BITMAP_ULONG_BITS * sizeof(unsigned long))) { + return false; + } + for (i = ROUND_DOWN(n, BITMAP_ULONG_BITS); i < n; i++) { + if (bitmap_is_set(a, i) != bitmap_is_set(b, i)) { + return false; + } + } + return true; +} diff --git a/lib/bitmap.h b/lib/bitmap.h new file mode 100644 index 00000000..a2d9a50d --- /dev/null +++ b/lib/bitmap.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef BITMAP_H +#define BITMAP_H 1 + +#include +#include +#include "util.h" + +#define BITMAP_ULONG_BITS (sizeof(unsigned long) * CHAR_BIT) + +static inline unsigned long * +bitmap_unit__(const unsigned long *bitmap, size_t offset) +{ + return (unsigned long *) &bitmap[offset / BITMAP_ULONG_BITS]; +} + +static inline unsigned long +bitmap_bit__(size_t offset) +{ + return 1UL << (offset % BITMAP_ULONG_BITS); +} + +static inline unsigned long * +bitmap_allocate(size_t n_bits) +{ + return xcalloc(1, ROUND_UP(n_bits, BITMAP_ULONG_BITS)); +} + +static inline void +bitmap_free(unsigned long *bitmap) +{ + free(bitmap); +} + +static inline bool +bitmap_is_set(const unsigned long *bitmap, size_t offset) +{ + return (*bitmap_unit__(bitmap, offset) & bitmap_bit__(offset)) != 0; +} + +static inline void +bitmap_set1(unsigned long *bitmap, size_t offset) +{ + *bitmap_unit__(bitmap, offset) |= bitmap_bit__(offset); +} + +static inline void +bitmap_set0(unsigned long *bitmap, size_t offset) +{ + *bitmap_unit__(bitmap, offset) &= ~bitmap_bit__(offset); +} + +static inline void +bitmap_set(unsigned long *bitmap, size_t offset, bool value) +{ + if (value) { + bitmap_set1(bitmap, offset); + } else { + bitmap_set0(bitmap, offset); + } +} + +void bitmap_set_multiple(unsigned long *, size_t start, size_t count, + bool value); +bool bitmap_equal(const unsigned long *, const unsigned long *, size_t n); + +#endif /* bitmap.h */ diff --git a/lib/cfg.c b/lib/cfg.c new file mode 100644 index 00000000..b4d1d591 --- /dev/null +++ b/lib/cfg.c @@ -0,0 +1,1182 @@ +/* Copyright (c) 2008, 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + */ + +#include +#include "cfg.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "coverage.h" +#include "dynamic-string.h" +#include "ofpbuf.h" +#include "packets.h" +#include "svec.h" +#include "timeval.h" +#include "util.h" + +#define THIS_MODULE VLM_cfg +#include "vlog.h" + +/* XXX This file really needs a unit test! For a while, cfg_get_string(0, + * "bridge.a.controller") would return the value of + * "bridge.a.controller.in-band", if it existed, and I'm really not certain + * that the fix didn't break other things. */ + +/* Configuration file name. */ +static char *cfg_name; + +/* Put the temporary file in the same directory as cfg_name, so that + * they are guaranteed to be in the same file system and therefore we can + * rename() tmp_name over cfg_name. */ +static char *tmp_name; + +/* Lock information. */ +static char *lock_name; +static int lock_fd = -1; + +/* Flag to indicate whether local modifications have been made. */ +static bool dirty; + +static uint8_t cfg_cookie[CFG_COOKIE_LEN]; + +/* Current configuration. Maintained in sorted order. */ +static struct svec cfg = SVEC_EMPTY_INITIALIZER; + +static bool has_double_dot(const char *key, size_t len); +static bool is_valid_key(const char *key, size_t len, + const char *file_name, int line_number, + const char *id); +static char *parse_section(const char *file_name, int line_number, + const char *); +static void parse_setting(const char *file_name, int line_number, + const char *section, const char *); +static int compare_key(const char *a, const char *b); +static char **find_key_le(const char *key); +static char **find_key_ge(const char *key); +static char *find_key(const char *); +static bool parse_mac(const char *, uint8_t mac[6]); +static bool parse_dpid(const char *, uint64_t *); +static bool is_key(const char *); +static bool is_int(const char *); +static bool is_bool(const char *); +static const char *extract_value(const char *key); +static const char *get_nth_value(int idx, const char *key); +static bool is_type(const char *s, enum cfg_flags); + +#define CC_ALPHA "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" +#define CC_DIGIT "0123456789" +#define CC_ALNUM CC_ALPHA CC_DIGIT +#define CC_SPACE " \t\r\n\v" + +#define CC_FILE_NAME CC_ALNUM "._-" +#define CC_KEY CC_ALNUM "._-@$:+" + +/* Sets 'file_name' as the configuration file read by cfg_read(). Returns 0 on + * success, otherwise a positive errno value if 'file_name' cannot be opened. + * + * This function does not actually read the named file or directory. Use + * cfg_read() to (re)read all the configuration files. */ +int +cfg_set_file(const char *file_name) +{ + const char *slash; + int fd; + + if (cfg_name) { + assert(lock_fd < 0); + free(cfg_name); + free(lock_name); + free(tmp_name); + cfg_name = lock_name = tmp_name = NULL; + } + + /* Make sure that we can open this file for reading. */ + fd = open(file_name, O_RDONLY); + if (fd < 0) { + return errno; + } + close(fd); + + cfg_name = xstrdup(file_name); + + /* Put the temporary file in the same directory as cfg_name, so that they + * are guaranteed to be in the same file system, to guarantee that + * rename(tmp_name, cfg_name) will work. */ + tmp_name = xasprintf("%s.~tmp~", file_name); + + /* Put the lock file in the same directory as cfg_name, but prefixed by + * a dot so as not to garner administrator interest. */ + slash = strrchr(file_name, '/'); + if (slash) { + lock_name = xasprintf("%.*s/.%s.~lock~", + slash - file_name, file_name, slash + 1); + } else { + lock_name = xasprintf(".%s.~lock~", file_name); + } + + VLOG_INFO("using \"%s\" as configuration file, \"%s\" as lock file", + file_name, lock_name); + return 0; +} + +static int +update_cookie(void) +{ + int i; + SHA1Context context; + + if (SHA1Reset(&context) != shaSuccess) { + return -1; + } + for (i = 0; i < cfg.n; i++) { + if (SHA1Input(&context, (uint8_t *)cfg.names[i], + strlen(cfg.names[i])) != shaSuccess) { + return -1; + } + SHA1Input(&context, (uint8_t *)"\n", 1); + } + if (SHA1Result(&context, cfg_cookie) != shaSuccess) { + return -1; + } + + return 0; +} + +/* Reads all of the configuration files or directories that have been added + * with cfg_add_file(), merges their content. Any previous configuration is + * replaced. Returns 0 if successful, otherwise a positive errno value. */ +int +cfg_read(void) +{ + struct svec old_cfg; + struct ds ds; + FILE *file; + char *section; + int line_number; + + + if (!cfg_name) { + return ENODEV; + } + + /* Save old configuration data and clear the active configuration. */ + svec_init(&old_cfg); + svec_swap(&old_cfg, &cfg); + + /* Read new configuration. */ + VLOG_DBG("reading configuration from %s", cfg_name); + + file = fopen(cfg_name, "r"); + if (!file) { + VLOG_ERR("failed to open \"%s\": %s", cfg_name, strerror(errno)); + return errno; + } + + ds_init(&ds); + section = NULL; + line_number = 0; + while (!ds_get_line(&ds, file)) { + const char *s = ds_cstr(&ds); + size_t indent = strspn(s, CC_SPACE); + + line_number++; + s += indent; + if (*s == '#' || *s == '\0') { + /* Ignore comments and lines that contain only white space. */ + } else if (*s == '[') { + if (!indent) { + free(section); + section = parse_section(cfg_name, line_number, s); + } else { + VLOG_ERR("%s:%d: ignoring indented section header", + cfg_name, line_number); + } + } else if (indent && !section) { + VLOG_ERR("%s:%d: ignoring indented line outside any section", + cfg_name, line_number); + } else { + if (!indent) { + free(section); + section = NULL; + } + parse_setting(cfg_name, line_number, section, s); + } + } + ds_destroy(&ds); + free(section); + + svec_sort(&cfg); + svec_terminate(&cfg); + update_cookie(); + + fclose(file); + + if (VLOG_IS_DBG_ENABLED()) { + struct svec removed, added; + size_t i; + + svec_diff(&old_cfg, &cfg, &removed, NULL, &added); + if (removed.n || added.n) { + VLOG_DBG("configuration changes:"); + for (i = 0; i < removed.n; i++) { + VLOG_DBG("-%s", removed.names[i]); + } + for (i = 0; i < added.n; i++) { + VLOG_DBG("+%s", added.names[i]); + } + } else { + VLOG_DBG("configuration unchanged"); + } + svec_destroy(&added); + svec_destroy(&removed); + } + svec_destroy(&old_cfg); + + dirty = false; + + return 0; +} + +/* Fills 'svec' with the entire configuration file. */ +void +cfg_get_all(struct svec *svec) +{ + svec_clear(svec); + svec_append(svec, &cfg); +} + +int +cfg_get_cookie(uint8_t *cookie) +{ + if (dirty) { + update_cookie(); + } + + memcpy(cookie, cfg_cookie, sizeof(cfg_cookie)); + return 0; +} + +void +cfg_unlock(void) +{ + if (lock_fd != -1) { + COVERAGE_INC(cfg_unlock); + close(lock_fd); + lock_fd = -1; + } +} + +static int +open_lockfile(const char *name) +{ + for (;;) { + /* Try to open an existing lock file. */ + int fd = open(name, O_RDWR); + if (fd >= 0) { + return fd; + } else if (errno != ENOENT) { + VLOG_WARN("%s: failed to open lock file: %s", + name, strerror(errno)); + return -errno; + } + + /* Try to create a new lock file. */ + VLOG_INFO("%s: lock file does not exist, creating", name); + fd = open(name, O_RDWR | O_CREAT | O_EXCL, 0600); + if (fd >= 0) { + return fd; + } else if (errno != EEXIST) { + VLOG_WARN("%s: failed to create lock file: %s", + name, strerror(errno)); + return -errno; + } + + /* Someone else created the lock file. Try again. */ + } +} + +static int +try_lock(int fd, bool block) +{ + struct flock l; + memset(&l, 0, sizeof l); + l.l_type = F_WRLCK; + l.l_whence = SEEK_SET; + l.l_start = 0; + l.l_len = 0; + return fcntl(fd, block ? F_SETLKW : F_SETLK, &l) == -1 ? errno : 0; +} + +/* Locks the configuration file against modification by other processes and + * re-reads it from disk. + * + * The 'timeout' specifies the maximum number of milliseconds to wait for the + * config file to become free. Use 0 to avoid waiting or INT_MAX to wait + * forever. + * + * Returns 0 on success, otherwise a positive errno value. */ +int +cfg_lock(uint8_t *cookie, int timeout) +{ + long long int start = time_msec(); + long long int elapsed = 0; + int fd; + uint8_t curr_cookie[CFG_COOKIE_LEN]; + + assert(lock_fd < 0); + COVERAGE_INC(cfg_lock); + for (;;) { + int error; + + /* Open lock file. */ + fd = open_lockfile(lock_name); + if (fd < 0) { + return -fd; + } + + /* Try to lock it. This will block (if 'timeout' > 0). */ + error = try_lock(fd, timeout > 0); + time_refresh(); + elapsed = time_msec() - start; + if (!error) { + /* Success! */ + break; + } + + /* Lock failed. Close the lock file and reopen it on the next + * iteration, just in case someone deletes it underneath us (even + * though that should not happen). */ + close(fd); + if (error != EINTR) { + /* Hard error, give up. */ + COVERAGE_INC(cfg_lock_error); + VLOG_WARN("%s: failed to lock file " + "(after %lld ms, with %d-ms timeout): %s", + lock_name, elapsed, timeout, strerror(error)); + return error; + } + + /* Probably, the periodic timer set up by time_init() woke up us. Just + * check whether it's time to give up. */ + if (timeout != INT_MAX && elapsed >= timeout) { + COVERAGE_INC(cfg_lock_timeout); + VLOG_WARN("%s: giving up on lock file after %lld ms", + lock_name, elapsed); + return ETIMEDOUT; + } + COVERAGE_INC(cfg_lock_retry); + } + if (elapsed) { + VLOG_WARN("%s: waited %lld ms for lock file", lock_name, elapsed); + } + lock_fd = fd; + + cfg_read(); + + if (cookie) { + cfg_get_cookie(curr_cookie); + + if (memcmp(curr_cookie, cookie, sizeof *curr_cookie)) { + /* Configuration has changed, so reject. */ + cfg_unlock(); + return EINVAL; + } + } + + return 0; +} + +static int +do_write_config(const void *data, size_t len) +{ + FILE *file; + int error; + + file = fopen(tmp_name, "w"); + if (file == NULL) { + VLOG_WARN("could not open %s for writing: %s", + tmp_name, strerror(errno)); + return errno; + } + + fwrite(data, 1, len, file); + + /* This is essentially equivalent to: + * error = ferror(file) || fflush(file) || fclose(file); + * but it doesn't short-circuit, so that it always closes 'file'. */ + error = ferror(file); + error = fflush(file) || error; + error = fclose(file) || error; + if (error) { + VLOG_WARN("problem writing to %s: %s", tmp_name, strerror(errno)); + return errno; + } + + if (rename(tmp_name, cfg_name) < 0) { + VLOG_WARN("could not rename %s to %s: %s", + tmp_name, cfg_name, strerror(errno)); + return errno; + } + + dirty = false; + + return 0; +} + +/* Write the current configuration into the configuration file. Returns 0 if + * successful, otherwise a negative errno value. */ +int +cfg_write(void) +{ + char *content; + int retval; + + svec_sort(&cfg); + content = (cfg.n + ? svec_join(&cfg, "\n", "\n") + : xstrdup("# This file intentionally left blank.\n")); + retval = do_write_config(content, strlen(content)); + free(content); + + return retval; +} + +int +cfg_write_data(uint8_t *data, size_t len) +{ + int retval = do_write_config(data, len); + if (!retval) { + cfg_read(); + } + return retval; +} + +/* Returns true if the configuration has changed since the last time it was + * read or written. */ +bool +cfg_is_dirty(void) +{ + return dirty; +} + +void +cfg_buf_put(struct ofpbuf *buffer) +{ + int i; + + for (i = 0; i < cfg.n; i++) { + ofpbuf_put(buffer, cfg.names[i], strlen(cfg.names[i])); + ofpbuf_put(buffer, "\n", 1); + } +} + +/* Formats the printf()-style format string in the parameter 'format', which + * must be the function's last parameter, into string variable 'dst'. The + * function is responsible for freeing 'dst'. */ +#define FORMAT_KEY(FORMAT, DST) \ + do { \ + va_list args__; \ + va_start(args__, FORMAT); \ + (DST) = xvasprintf(FORMAT, args__); \ + va_end(args__); \ + } while (0) + +/* Returns true if the configuration includes a key named 'key'. */ +bool +cfg_has(const char *key_, ...) +{ + char *key; + bool retval; + + FORMAT_KEY(key_, key); + retval = find_key(key) != NULL; + free(key); + return retval; +} + +bool +cfg_is_valid(enum cfg_flags flags, const char *key_, ...) +{ + char *key, **first, **last, **p; + size_t n; + bool retval; + + FORMAT_KEY(key_, key); + first = find_key_le(key); + last = find_key_ge(key); + n = last - first; + retval = ((!(flags & CFG_REQUIRED) || n) + && (!(flags & CFG_MULTIPLE) || n <= 1)); + for (p = first; retval && p < last; p++) { + retval = is_type(strchr(*p, '=') + 1, flags); + } + free(key); + return retval; +} + +/* Returns true if the configuration includes at least one key whose name + * begins with 'section' followed by a dot. */ +bool +cfg_has_section(const char *section_, ...) +{ + struct ds section; + bool retval = false; + va_list args; + char **p; + + ds_init(§ion); + va_start(args, section_); + ds_put_format_valist(§ion, section_, args); + ds_put_char(§ion, '.'); + va_end(args); + + for (p = cfg.names; *p; p++) { /* XXX this is inefficient */ + if (!strncmp(section.string, *p, section.length)) { + retval = true; + break; + } + } + + ds_destroy(§ion); + return retval; +} + +/* Returns the number of values for the given 'key'. The return value is 0 if + * no values exist for 'key'. */ +int +cfg_count(const char *key_, ...) +{ + char *key; + int retval; + + FORMAT_KEY(key_, key); + retval = find_key_ge(key) - find_key_le(key); + free(key); + return retval; +} + +/* Fills 'svec' with all of the immediate subsections of 'section'. For + * example, if 'section' is "bridge" and keys bridge.a, bridge.b, bridge.b.c, + * and bridge.c.x.y.z exist, then 'svec' would be initialized to a, b, and + * c. The caller must first initialize 'svec'. */ +void +cfg_get_subsections(struct svec *svec, const char *section_, ...) +{ + struct ds section; + va_list args; + char **p; + + ds_init(§ion); + va_start(args, section_); + ds_put_format_valist(§ion, section_, args); + ds_put_char(§ion, '.'); + va_end(args); + + svec_clear(svec); + for (p = cfg.names; *p; p++) { /* XXX this is inefficient */ + if (!strncmp(section.string, *p, section.length)) { + const char *ss = *p + section.length; + size_t ss_len = strcspn(ss, ".="); + svec_add_nocopy(svec, xmemdup0(ss, ss_len)); + } + } + svec_unique(svec); + ds_destroy(§ion); +} + +void +cfg_add_entry(const char *entry_, ...) +{ + char *entry; + + FORMAT_KEY(entry_, entry); + svec_add_nocopy(&cfg, entry); + svec_sort(&cfg); + svec_terminate(&cfg); + dirty = true; +} + +void +cfg_del_entry(const char *entry_, ...) +{ + char *entry; + + FORMAT_KEY(entry_, entry); + svec_del(&cfg, entry); + svec_terminate(&cfg); + free(entry); + dirty = true; +} + +void +cfg_del_section(const char *section_, ...) +{ + struct ds section; + va_list args; + char **p; + + ds_init(§ion); + va_start(args, section_); + ds_put_format_valist(§ion, section_, args); + ds_put_char(§ion, '.'); + va_end(args); + + for (p = cfg.names; *p; p++) { + if (!strncmp(section.string, *p, section.length)) { + free(*p); + *p = NULL; + } + } + svec_compact(&cfg); + svec_terminate(&cfg); + + ds_destroy(§ion); + dirty = true; +} + +void +cfg_del_match(const char *pattern_, ...) +{ + bool matched = false; + char *pattern; + char **p; + + FORMAT_KEY(pattern_, pattern); + + for (p = cfg.names; *p; p++) { + if (!fnmatch(pattern, *p, 0)) { + free(*p); + *p = NULL; + matched = true; + } + } + if (matched) { + svec_compact(&cfg); + svec_terminate(&cfg); + dirty = true; + } + + free(pattern); +} + +/* Fills 'svec' with all of the key-value pairs that have sections that + * begin with 'section'. The caller must first initialize 'svec'. */ +void +cfg_get_section(struct svec *svec, const char *section_, ...) +{ + struct ds section; + va_list args; + char **p; + + ds_init(§ion); + va_start(args, section_); + ds_put_format_valist(§ion, section_, args); + ds_put_char(§ion, '.'); + va_end(args); + + for (p = cfg.names; *p; p++) { /* XXX this is inefficient */ + if (!strncmp(section.string, *p, section.length)) { + svec_add(svec, *p); + } + } + ds_destroy(§ion); +} + +/* Returns the value numbered 'idx' of 'key'. Returns a null pointer if 'idx' + * is greater than or equal to cfg_count(key). The caller must not modify or + * free the returned string or retain its value beyond the next call to + * cfg_read(). */ +const char * +cfg_get_string(int idx, const char *key_, ...) +{ + const char *retval; + char *key; + + FORMAT_KEY(key_, key); + retval = get_nth_value(idx, key); + free(key); + return retval; +} + +/* Returns the value numbered 'idx' of 'key'. Returns a null pointer if 'idx' + * is greater than or equal to cfg_count(key) or if the value 'idx' of 'key' is + * not a valid key. The caller must not modify or free the returned string or + * retain its value beyond the next call to cfg_read(). */ +const char * +cfg_get_key(int idx, const char *key_, ...) +{ + const char *value, *retval; + char *key; + + FORMAT_KEY(key_, key); + value = get_nth_value(idx, key); + retval = value && is_key(value) ? value : NULL; + free(key); + return retval; +} + +/* Returns the value numbered 'idx' of 'key', converted to an integer. Returns + * 0 if 'idx' is greater than or equal to cfg_count(key) or if the value 'idx' + * of 'key' is not a valid integer. */ +int +cfg_get_int(int idx, const char *key_, ...) +{ + const char *value; + int retval; + char *key; + + FORMAT_KEY(key_, key); + value = get_nth_value(idx, key); + retval = value && is_int(value) ? atoi(value) : 0; + free(key); + return retval; +} + +/* Returns the value numbered 'idx' of 'key', converted to a boolean value. + * Returns false if 'idx' is greater than or equal to cfg_count(key) or if the + * value 'idx' of 'key' is not a valid boolean. */ +bool +cfg_get_bool(int idx, const char *key_, ...) +{ + const char *value; + bool retval; + char *key; + + FORMAT_KEY(key_, key); + value = get_nth_value(idx, key); + retval = value && is_bool(value) ? !strcmp(value, "true") : false; + free(key); + return retval; +} + +/* Returns the value numbered 'idx' of 'key', converted to an IP address in + * network byte order. Returns 0 if 'idx' is greater than or equal to + * cfg_count(key) or if the value 'idx' of 'key' is not a valid IP address (as + * determined by inet_aton()). */ +uint32_t +cfg_get_ip(int idx, const char *key_, ...) +{ + struct in_addr addr; + const char *value; + char *key; + + FORMAT_KEY(key_, key); + value = get_nth_value(idx, key); + if (!value || !inet_aton(value, &addr)) { + addr.s_addr = htonl(0); + } + free(key); + return addr.s_addr; +} + +/* Returns the value numbered 'idx' of 'key', converted to an MAC address in + * host byte order. Returns 0 if 'idx' is greater than or equal to + * cfg_count(key) or if the value 'idx' of 'key' is not a valid MAC address in + * the format "##:##:##:##:##:##". */ +uint64_t +cfg_get_mac(int idx, const char *key_, ...) +{ + uint8_t mac[ETH_ADDR_LEN]; + const char *value; + char *key; + + FORMAT_KEY(key_, key); + value = get_nth_value(idx, key); + if (!value || !parse_mac(value, mac)) { + memset(mac, 0, sizeof mac); + } + free(key); + return eth_addr_to_uint64(mac); +} + +/* Returns the value numbered 'idx' of 'key', parsed as an datapath ID. + * Returns 0 if 'idx' is greater than or equal to cfg_count(key) or if the + * value 'idx' of 'key' is not a valid datapath ID consisting of exactly 12 + * hexadecimal digits. */ +uint64_t +cfg_get_dpid(int idx, const char *key_, ...) +{ + uint64_t dpid; + const char *value; + char *key; + + FORMAT_KEY(key_, key); + value = get_nth_value(idx, key); + if (!value || !parse_dpid(value, &dpid)) { + dpid = 0; + } + free(key); + return dpid; +} + +/* Returns the value numbered 'idx' of 'key', converted to an integer. Returns + * -1 if 'idx' is greater than or equal to cfg_count(key) or if the value 'idx' + * of 'key' is not a valid integer between 0 and 4095. */ +int +cfg_get_vlan(int idx, const char *key_, ...) +{ + const char *value; + int retval; + char *key; + + FORMAT_KEY(key_, key); + value = get_nth_value(idx, key); + if (value && is_int(value)) { + retval = atoi(value); + if (retval < 0 || retval > 4095) { + retval = -1; + } + } else { + retval = -1; + } + free(key); + return retval; +} + +/* Fills 'svec' with all of the string values of 'key'. The caller must + * first initialize 'svec'. */ +void +cfg_get_all_strings(struct svec *svec, const char *key_, ...) +{ + char **p, **q; + char *key; + + FORMAT_KEY(key_, key); + svec_clear(svec); + for (p = find_key_le(key), q = find_key_ge(key); p < q; p++) { + svec_add(svec, extract_value(*p)); + } + free(key); +} + +/* Fills 'svec' with all of the values of 'key' that are valid keys. + * Values of 'key' that are not valid keys are omitted. The caller + * must first initialize 'svec'. */ +void +cfg_get_all_keys(struct svec *svec, const char *key_, ...) +{ + char **p, **q; + char *key; + + FORMAT_KEY(key_, key); + svec_clear(svec); + for (p = find_key_le(key), q = find_key_ge(key); p < q; p++) { + const char *value = extract_value(*p); + if (is_key(value)) { + svec_add(svec, value); + } + } + free(key); +} + +static bool +has_double_dot(const char *key, size_t len) +{ + if (len >= 2) { + size_t i; + + for (i = 0; i < len - 1; i++) { + if (key[i] == '.' && key[i + 1] == '.') { + return true; + } + } + } + return false; +} + +static bool +is_valid_key(const char *key, size_t len, + const char *file_name, int line_number, const char *id) +{ + if (!len) { + VLOG_ERR("%s:%d: missing %s name", file_name, line_number, id); + return false; + } else if (key[0] == '.') { + VLOG_ERR("%s:%d: %s name \"%.*s\" begins with invalid character '.'", + file_name, line_number, id, (int) len, key); + return false; + } else if (key[len - 1] == '.') { + VLOG_ERR("%s:%d: %s name \"%.*s\" ends with invalid character '.'", + file_name, line_number, id, (int) len, key); + return false; + } else if (has_double_dot(key, len)) { + VLOG_ERR("%s:%d: %s name \"%.*s\" contains '..', which is not allowed", + file_name, line_number, id, (int) len, key); + return false; + } else { + return true; + } +} + +static char * +parse_section(const char *file_name, int line_number, const char *s) +{ + struct ds section; + size_t len; + + ds_init(§ion); + + /* Skip [ and any white space. */ + s++; + s += strspn(s, CC_SPACE); + + /* Obtain the section name. */ + len = strspn(s, CC_KEY); + if (!is_valid_key(s, len, file_name, line_number, "section")) { + goto error; + } + ds_put_buffer(§ion, s, len); + s += len; + + /* Obtain the subsection name, if any. */ + s += strspn(s, CC_SPACE); + if (*s == '"') { + s++; + len = strspn(s, CC_KEY); + if (!is_valid_key(s, len, file_name, line_number, "subsection")) { + goto error; + } + ds_put_char(§ion, '.'); + ds_put_buffer(§ion, s, len); + s += len; + if (*s != '"') { + VLOG_ERR("%s:%d: missing '\"' following subsection name", + file_name, line_number); + goto error; + } + s++; + s += strspn(s, CC_SPACE); + } + + /* Check for ]. */ + if (*s != ']') { + VLOG_ERR("%s:%d: missing ']' following section name", + file_name, line_number); + goto error; + } + s++; + s += strspn(s, CC_SPACE); + if (*s != '\0') { + VLOG_ERR("%s:%d: trailing garbage following ']'", + file_name, line_number); + goto error; + } + + return ds_cstr(§ion); + +error: + ds_destroy(§ion); + return NULL; +} + +static void +parse_setting(const char *file_name, int line_number, const char *section, + const char *s) +{ + struct ds key = DS_EMPTY_INITIALIZER; + struct ds value = DS_EMPTY_INITIALIZER; + size_t len; + + if (section) { + ds_put_format(&key, "%s.", section); + } + + /* Obtain the key. */ + len = strspn(s, CC_KEY); + if (!len) { + VLOG_ERR("%s:%d: missing key name", file_name, line_number); + goto done; + } + if (!is_valid_key(s, len, file_name, line_number, "key")) { + goto done; + } + ds_put_buffer(&key, s, len); + s += len; + + /* Skip the '='. */ + s += strspn(s, CC_SPACE); + if (*s != '=') { + VLOG_ERR("%s:%d: missing '=' following key", file_name, line_number); + goto done; + } + s++; + s += strspn(s, CC_SPACE); + + /* Obtain the value. */ + ds_put_cstr(&value, s); + while (value.length > 0 && strchr(CC_SPACE, ds_last(&value))) { + value.length--; + } + + /* Add the setting. */ + svec_add_nocopy(&cfg, xasprintf("%s=%s", ds_cstr(&key), ds_cstr(&value))); + +done: + ds_destroy(&key); + ds_destroy(&value); +} + +static int +compare_key(const char *a, const char *b) +{ + for (;;) { + int ac = *a == '\0' || *a == '=' ? INT_MAX : *a; + int bc = *b == '\0' || *b == '=' ? INT_MAX : *b; + if (ac != bc) { + return ac < bc ? -1 : 1; + } else if (ac == INT_MAX) { + return 0; + } + a++; + b++; + } +} + +/* Returns the address of the greatest configuration string with a key less + * than or equal to 'key'. Returns the address of the null terminator if all + * configuration strings are greater than 'key'. */ +static char ** +find_key_le(const char *key) +{ + int low = 0; + int len = cfg.n; + while (len > 0) { + int half = len >> 1; + int middle = low + half; + if (compare_key(cfg.names[middle], key) < 0) { + low = middle + 1; + len -= half + 1; + } else { + len = half; + } + } + return &cfg.names[low]; +} + +/* Returns the address of the least configuration string with a key greater + * than or equal to 'key'. Returns the address of the null terminator if all + * configuration strings are less than 'key'. */ +static char ** +find_key_ge(const char *key) +{ + int low = 0; + int len = cfg.n; + while (len > 0) { + int half = len >> 1; + int middle = low + half; + if (compare_key(cfg.names[middle], key) > 0) { + len = half; + } else { + low = middle + 1; + len -= half + 1; + } + } + return &cfg.names[low]; +} + +static char * +find_key(const char *key) +{ + char **p = find_key_le(key); + return p < &cfg.names[cfg.n] && !compare_key(*p, key) ? *p : NULL; +} + +static bool +parse_mac(const char *s, uint8_t mac[6]) +{ + return sscanf(s, "%"SCNx8":%"SCNx8":%"SCNx8":%"SCNx8":%"SCNx8":%"SCNx8, + &mac[0], &mac[1], &mac[2], &mac[3], &mac[4], &mac[5]) == 6; +} + +static bool +parse_dpid(const char *s, uint64_t *dpid) +{ + if (strlen(s) == 12 && strspn(s, "0123456789abcdefABCDEF") == 12) { + *dpid = strtoll(s, NULL, 16); + return true; + } else { + return false; + } +} + +static bool +is_key(const char *s) +{ + /* XXX needs to check the same things as is_valid_key() too. */ + return *s && s[strspn(s, CC_KEY)] == '\0'; +} + +static bool +is_int(const char *s) +{ + return *s && s[strspn(s, CC_DIGIT)] == '\0'; +} + +static bool +is_bool(const char *s) +{ + return !strcmp(s, "true") || !strcmp(s, "false"); +} + +static const char * +extract_value(const char *key) +{ + const char *p = strchr(key, '='); + return p ? p + 1 : NULL; +} + +static const char * +get_nth_value(int idx, const char *key) +{ + char **p = find_key_le(key); + char **q = find_key_ge(key); + return idx < q - p ? extract_value(p[idx]) : NULL; +} + +static bool +is_type(const char *s, enum cfg_flags flags) +{ + uint8_t mac[ETH_ADDR_LEN]; + struct in_addr addr; + uint64_t dpid; + + return (flags & CFG_STRING + || (flags & CFG_KEY && is_key(s)) + || (flags & CFG_INT && is_int(s)) + || (flags & CFG_BOOL && is_bool(s)) + || (flags & CFG_IP && inet_aton(s, &addr)) + || (flags & CFG_MAC && parse_mac(s, mac)) + || (flags & CFG_DPID && parse_dpid(s, &dpid))); +} diff --git a/lib/cfg.h b/lib/cfg.h new file mode 100644 index 00000000..85ad66f9 --- /dev/null +++ b/lib/cfg.h @@ -0,0 +1,98 @@ +/* Copyright (c) 2008, 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + */ + + +#ifndef VSWITCHD_CFG_H +#define VSWITCHD_CFG_H 1 + +#include +#include +#include +#include "compiler.h" +#include "sha1.h" + +struct svec; +struct ofpbuf; + +int cfg_set_file(const char *file_name); +int cfg_read(void); +int cfg_lock(uint8_t *cookie, int timeout); +void cfg_unlock(void); +int cfg_write(void); +int cfg_write_data(uint8_t *data, size_t len); +bool cfg_is_dirty(void); + +void cfg_get_all(struct svec *); + +#define CFG_COOKIE_LEN SHA1HashSize +int cfg_get_cookie(uint8_t *cookie); + +void cfg_buf_put(struct ofpbuf *buffer); +void cfg_get_subsections(struct svec *, const char *, ...) PRINTF_FORMAT(2, 3); + +enum cfg_flags { + /* Types allowed. */ + CFG_STRING = 1 << 0, /* Arbitrary content. */ + CFG_KEY = 1 << 0, /* Valid key name. */ + CFG_INT = 1 << 2, /* Integer value. */ + CFG_BOOL = 1 << 3, /* Boolean. */ + CFG_IP = 1 << 4, /* IPv4 address. */ + CFG_MAC = 1 << 5, /* MAC address. */ + CFG_VLAN = 1 << 6, /* Integer in range 0...4095. */ + CFG_DPID = 1 << 7, /* 12 hexadecimal digits. */ + + /* Number allowed. */ + CFG_REQUIRED = 1 << 8, /* At least one value allowed. */ + CFG_MULTIPLE = 1 << 9 /* More than one value allowed. */ +}; +void cfg_register(const char *key_spec, enum cfg_flags); + +void cfg_add_entry(const char *key, ...) PRINTF_FORMAT(1, 2); +void cfg_del_entry(const char *key, ...) PRINTF_FORMAT(1, 2); +void cfg_del_section(const char *key, ...) PRINTF_FORMAT(1, 2); +void cfg_del_match(const char *pattern, ...) PRINTF_FORMAT(1, 2); +void cfg_get_section(struct svec *svec, const char *key, ...) + PRINTF_FORMAT(2, 3); + +bool cfg_has(const char *key, ...) PRINTF_FORMAT(1, 2); +bool cfg_is_valid(enum cfg_flags, const char *key, ...) PRINTF_FORMAT(2, 3); +bool cfg_has_section(const char *key, ...) PRINTF_FORMAT(1, 2); +int cfg_count(const char *key, ...) PRINTF_FORMAT(1, 2); + +const char *cfg_get_string(int idx, const char *key, ...) PRINTF_FORMAT(2, 3); +const char *cfg_get_key(int idx, const char *key, ...) PRINTF_FORMAT(2, 3); +int cfg_get_int(int idx, const char *key, ...) PRINTF_FORMAT(2, 3); +bool cfg_get_bool(int idx, const char *key, ...) PRINTF_FORMAT(2, 3); +uint32_t cfg_get_ip(int idx, const char *key, ...) PRINTF_FORMAT(2, 3); +uint64_t cfg_get_mac(int idx, const char *key, ...) PRINTF_FORMAT(2, 3); +int cfg_get_vlan(int idx, const char *key, ...) PRINTF_FORMAT(2, 3); +uint64_t cfg_get_dpid(int idx, const char *key, ...) PRINTF_FORMAT(2, 3); + +void cfg_get_all_strings(struct svec *, const char *key, ...) + PRINTF_FORMAT(2, 3); +void cfg_get_all_keys(struct svec *, const char *key, ...) PRINTF_FORMAT(2, 3); + +#endif /* vswitchd/cfg.h */ diff --git a/lib/classifier.c b/lib/classifier.c new file mode 100644 index 00000000..11223a81 --- /dev/null +++ b/lib/classifier.c @@ -0,0 +1,832 @@ +/* + * Copyright (c) 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "classifier.h" +#include +#include +#include +#include "flow.h" +#include "hash.h" + +const struct cls_field cls_fields[CLS_N_FIELDS + 1] = { +#define CLS_FIELD(WILDCARDS, MEMBER, NAME) \ + { offsetof(flow_t, MEMBER), \ + sizeof ((flow_t *)0)->MEMBER, \ + WILDCARDS, \ + #NAME }, + CLS_FIELDS +#undef CLS_FIELD + { sizeof(flow_t), 0, 0, "exact" }, +}; + +static uint32_t hash_fields(const flow_t *, int table_idx); +static bool equal_fields(const flow_t *, const flow_t *, int table_idx); + +static int table_idx_from_wildcards(uint32_t wildcards); +static struct cls_rule *table_insert(struct hmap *, struct cls_rule *); +static struct cls_rule *insert_exact_rule(struct classifier *, + struct cls_rule *); +static struct cls_bucket *find_bucket(struct hmap *, size_t hash, + const struct cls_rule *); +static struct cls_rule *search_table(const struct hmap *table, int field_idx, + const struct cls_rule *); +static struct cls_rule *search_exact_table(const struct classifier *, + size_t hash, const flow_t *); +static bool rules_match_1wild(const struct cls_rule *fixed, + const struct cls_rule *wild, int field_idx); + +/* Converts the flow in 'flow' into a cls_rule in 'rule', with the given + * 'wildcards' and 'priority'.*/ +void +cls_rule_from_flow(struct cls_rule *rule, const flow_t *flow, + uint32_t wildcards, unsigned int priority) +{ + assert(flow->reserved == 0); + rule->flow = *flow; + flow_wildcards_init(&rule->wc, wildcards); + rule->priority = priority; + rule->table_idx = table_idx_from_wildcards(rule->wc.wildcards); +} + +/* Converts the ofp_match in 'match' into a cls_rule in 'rule', with the given + * 'priority'. */ +void +cls_rule_from_match(struct cls_rule *rule, const struct ofp_match *match, + unsigned int priority) +{ + uint32_t wildcards; + flow_from_match(&rule->flow, &wildcards, match); + flow_wildcards_init(&rule->wc, wildcards); + rule->priority = rule->wc.wildcards ? priority : UINT16_MAX; + rule->table_idx = table_idx_from_wildcards(rule->wc.wildcards); +} + +/* Prints cls_rule 'rule', for debugging. + * + * (The output could be improved and expanded, but this was good enough to + * debug the classifier.) */ +void +cls_rule_print(const struct cls_rule *rule) +{ + printf("wildcards=%x priority=%u ", rule->wc.wildcards, rule->priority); + flow_print(stdout, &rule->flow); + putc('\n', stdout); +} + +/* Adjusts pointers around 'old', which must be in classifier 'cls', to + * compensate for it having been moved in memory to 'new' (e.g. due to + * realloc()). + * + * This function cannot be realized in all possible flow classifier + * implementations, so we will probably have to change the interface if we + * change the implementation. Shouldn't be a big deal though. */ +void +cls_rule_moved(struct classifier *cls, struct cls_rule *old, + struct cls_rule *new) +{ + if (old != new) { + if (new->wc.wildcards) { + list_moved(&new->node.list); + } else { + hmap_moved(&cls->exact_table, &old->node.hmap, &new->node.hmap); + } + } +} + +/* Replaces 'old', which must be in classifier 'cls', by 'new' (e.g. due to + * realloc()); that is, after calling this function 'new' will be in 'cls' in + * place of 'old'. + * + * 'new' and 'old' must be exactly the same: wildcard the same fields, have the + * same fixed values for non-wildcarded fields, and have the same priority. + * + * The caller takes ownership of 'old' and is thus responsible for freeing it, + * etc., as necessary. + * + * This function cannot be realized in all possible flow classifier + * implementations, so we will probably have to change the interface if we + * change the implementation. Shouldn't be a big deal though. */ +void +cls_rule_replace(struct classifier *cls, const struct cls_rule *old, + struct cls_rule *new) +{ + assert(old != new); + assert(old->wc.wildcards == new->wc.wildcards); + assert(old->priority == new->priority); + + if (new->wc.wildcards) { + list_replace(&new->node.list, &old->node.list); + } else { + hmap_replace(&cls->exact_table, &old->node.hmap, &new->node.hmap); + } +} + +/* Initializes 'cls' as a classifier that initially contains no classification + * rules. */ +void +classifier_init(struct classifier *cls) +{ + int i; + + cls->n_rules = 0; + for (i = 0; i < ARRAY_SIZE(cls->tables); i++) { + hmap_init(&cls->tables[i]); + } + hmap_init(&cls->exact_table); +} + +/* Destroys 'cls'. Rules within 'cls', if any, are not freed; this is the + * caller's responsibility. */ +void +classifier_destroy(struct classifier *cls) +{ + if (cls) { + struct cls_bucket *bucket, *next_bucket; + struct hmap *tbl; + + for (tbl = &cls->tables[0]; tbl < &cls->tables[CLS_N_FIELDS]; tbl++) { + HMAP_FOR_EACH_SAFE (bucket, next_bucket, + struct cls_bucket, hmap_node, tbl) { + free(bucket); + } + hmap_destroy(tbl); + } + hmap_destroy(&cls->exact_table); + } +} + +/* Returns true if 'cls' does not contain any classification rules, false + * otherwise. */ +bool +classifier_is_empty(const struct classifier *cls) +{ + return cls->n_rules == 0; +} + +/* Returns the number of rules in 'classifier'. */ +int +classifier_count(const struct classifier *cls) +{ + return cls->n_rules; +} + +/* Returns the number of rules in 'classifier' that have no wildcards. */ +int +classifier_count_exact(const struct classifier *cls) +{ + return hmap_count(&cls->exact_table); +} + +/* Inserts 'rule' into 'cls'. Transfers ownership of 'rule' to 'cls'. + * + * If 'cls' already contains an identical rule (including wildcards, values of + * fixed fields, and priority), replaces the old rule by 'rule' and returns the + * rule that was replaced. The caller takes ownership of the returned rule and + * is thus responsible for freeing it, etc., as necessary. + * + * Returns NULL if 'cls' does not contain a rule with an identical key, after + * inserting the new rule. In this case, no rules are displaced by the new + * rule, even rules that cannot have any effect because the new rule matches a + * superset of their flows and has higher priority. */ +struct cls_rule * +classifier_insert(struct classifier *cls, struct cls_rule *rule) +{ + struct cls_rule *old; + assert((rule->wc.wildcards == 0) == (rule->table_idx == CLS_F_IDX_EXACT)); + old = (rule->wc.wildcards + ? table_insert(&cls->tables[rule->table_idx], rule) + : insert_exact_rule(cls, rule)); + if (!old) { + cls->n_rules++; + } + return old; +} + +/* Inserts 'rule' into 'cls'. Transfers ownership of 'rule' to 'cls'. + * + * 'rule' must be an exact-match rule (rule->wc.wildcards must be 0) and 'cls' + * must not contain any rule with an identical key. */ +void +classifier_insert_exact(struct classifier *cls, struct cls_rule *rule) +{ + hmap_insert(&cls->exact_table, &rule->node.hmap, + flow_hash(&rule->flow, 0)); + cls->n_rules++; +} + +/* Removes 'rule' from 'cls'. It is caller's responsibility to free 'rule', if + * this is desirable. */ +void +classifier_remove(struct classifier *cls, struct cls_rule *rule) +{ + if (rule->wc.wildcards) { + /* Remove 'rule' from bucket. If that empties the bucket, remove the + * bucket from its table. */ + struct hmap *table = &cls->tables[rule->table_idx]; + struct list *rules = list_remove(&rule->node.list); + if (list_is_empty(rules)) { + /* This code is a little tricky. list_remove() returns the list + * element just after the one removed. Since the list is now + * empty, this will be the address of the 'rules' member of the + * bucket that was just emptied, so pointer arithmetic (via + * CONTAINER_OF) can find that bucket. */ + struct cls_bucket *bucket; + bucket = CONTAINER_OF(rules, struct cls_bucket, rules); + hmap_remove(table, &bucket->hmap_node); + free(bucket); + } + } else { + /* Remove 'rule' from cls->exact_table. */ + hmap_remove(&cls->exact_table, &rule->node.hmap); + } + cls->n_rules--; +} + +/* Finds and returns the highest-priority rule in 'cls' that matches 'flow'. + * Returns a null pointer if no rules in 'cls' match 'flow'. If multiple rules + * of equal priority match 'flow', returns one arbitrarily. + * + * (When multiple rules of equal priority happen to fall into the same bucket, + * rules added more recently take priority over rules added less recently, but + * this is subject to change and should not be depended upon.) */ +struct cls_rule * +classifier_lookup(const struct classifier *cls, const flow_t *flow) +{ + struct cls_rule *rule = classifier_lookup_exact(cls, flow); + if (!rule) { + rule = classifier_lookup_wild(cls, flow); + } + return rule; +} + +struct cls_rule * +classifier_lookup_exact(const struct classifier *cls, const flow_t *flow) +{ + return (!hmap_is_empty(&cls->exact_table) + ? search_exact_table(cls, flow_hash(flow, 0), flow) + : NULL); +} + +struct cls_rule * +classifier_lookup_wild(const struct classifier *cls, const flow_t *flow) +{ + struct cls_rule *best = NULL; + if (cls->n_rules > hmap_count(&cls->exact_table)) { + struct cls_rule target; + int i; + + cls_rule_from_flow(&target, flow, 0, 0); + for (i = 0; i < CLS_N_FIELDS; i++) { + struct cls_rule *rule = search_table(&cls->tables[i], i, &target); + if (rule && (!best || rule->priority > best->priority)) { + best = rule; + } + } + } + return best; +} + +struct cls_rule * +classifier_find_rule_exactly(const struct classifier *cls, + const flow_t *target, uint32_t wildcards, + unsigned int priority) +{ + struct cls_bucket *bucket; + int table_idx; + uint32_t hash; + + if (!wildcards) { + /* Ignores 'priority'. */ + return search_exact_table(cls, flow_hash(target, 0), target); + } + + assert(wildcards == (wildcards & OFPFW_ALL)); + table_idx = table_idx_from_wildcards(wildcards); + hash = hash_fields(target, table_idx); + HMAP_FOR_EACH_WITH_HASH (bucket, struct cls_bucket, hmap_node, hash, + &cls->tables[table_idx]) { + if (equal_fields(&bucket->fixed, target, table_idx)) { + struct cls_rule *pos; + LIST_FOR_EACH (pos, struct cls_rule, node.list, &bucket->rules) { + if (pos->priority < priority) { + return NULL; + } else if (pos->priority == priority && + pos->wc.wildcards == wildcards && + flow_equal(target, &pos->flow)) { + return pos; + } + } + } + } + return NULL; +} + +/* Ignores target->priority. + * + * 'callback' is allowed to delete the rule that is passed as its argument, but + * it must not delete (or move) any other rules in 'cls' that are in the same + * table as the argument rule. Two rules are in the same table if their + * cls_rule structs have the same table_idx; as a special case, a rule with + * wildcards and an exact-match rule will never be in the same table. */ +void +classifier_for_each_match(const struct classifier *cls, + const struct cls_rule *target, + int include, cls_cb_func *callback, void *aux) +{ + if (include & CLS_INC_WILD) { + const struct hmap *table; + + for (table = &cls->tables[0]; table < &cls->tables[CLS_N_FIELDS]; + table++) { + struct cls_bucket *bucket, *next_bucket; + + HMAP_FOR_EACH_SAFE (bucket, next_bucket, + struct cls_bucket, hmap_node, table) { + /* XXX there is a bit of room for optimization here based on + * rejecting entire buckets on their fixed fields, but it will + * only be worthwhile for big buckets (which we hope we won't + * get anyway, but...) */ + struct cls_rule *prev_rule, *rule; + + /* We can't just use LIST_FOR_EACH_SAFE here because, if the + * callback deletes the last rule in the bucket, then the + * bucket itself will be destroyed. The bucket contains the + * list head so that's a use-after-free error. */ + prev_rule = NULL; + LIST_FOR_EACH (rule, struct cls_rule, node.list, + &bucket->rules) { + if (rules_match_1wild(rule, target, 0)) { + if (prev_rule) { + callback(prev_rule, aux); + } + prev_rule = rule; + } + } + if (prev_rule) { + callback(prev_rule, aux); + } + } + } + } + + if (include & CLS_INC_EXACT) { + if (target->wc.wildcards) { + struct cls_rule *rule, *next_rule; + + HMAP_FOR_EACH_SAFE (rule, next_rule, struct cls_rule, node.hmap, + &cls->exact_table) { + if (rules_match_1wild(rule, target, 0)) { + callback(rule, aux); + } + } + } else { + /* Optimization: there can be at most one match in the exact + * table. */ + size_t hash = flow_hash(&target->flow, 0); + struct cls_rule *rule = search_exact_table(cls, hash, + &target->flow); + if (rule) { + callback(rule, aux); + } + } + } +} + +/* 'callback' is allowed to delete the rule that is passed as its argument, but + * it must not delete (or move) any other rules in 'cls' that are in the same + * table as the argument rule. Two rules are in the same table if their + * cls_rule structs have the same table_idx; as a special case, a rule with + * wildcards and an exact-match rule will never be in the same table. */ +void +classifier_for_each(const struct classifier *cls, int include, + void (*callback)(struct cls_rule *, void *aux), + void *aux) +{ + if (include & CLS_INC_WILD) { + const struct hmap *tbl; + + for (tbl = &cls->tables[0]; tbl < &cls->tables[CLS_N_FIELDS]; tbl++) { + struct cls_bucket *bucket, *next_bucket; + + HMAP_FOR_EACH_SAFE (bucket, next_bucket, + struct cls_bucket, hmap_node, tbl) { + struct cls_rule *prev_rule, *rule; + + /* We can't just use LIST_FOR_EACH_SAFE here because, if the + * callback deletes the last rule in the bucket, then the + * bucket itself will be destroyed. The bucket contains the + * list head so that's a use-after-free error. */ + prev_rule = NULL; + LIST_FOR_EACH (rule, struct cls_rule, node.list, + &bucket->rules) { + if (prev_rule) { + callback(prev_rule, aux); + } + prev_rule = rule; + } + if (prev_rule) { + callback(prev_rule, aux); + } + } + } + } + + if (include & CLS_INC_EXACT) { + struct cls_rule *rule, *next_rule; + + HMAP_FOR_EACH_SAFE (rule, next_rule, + struct cls_rule, node.hmap, &cls->exact_table) { + callback(rule, aux); + } + } +} + +static struct cls_bucket *create_bucket(struct hmap *, size_t hash, + const flow_t *fixed); +static struct cls_rule *bucket_insert(struct cls_bucket *, struct cls_rule *); + +static inline bool equal_bytes(const void *, const void *, size_t n); + +/* Returns a hash computed across the fields in 'flow' whose field indexes + * (CLS_F_IDX_*) are less than 'table_idx'. (If 'table_idx' is + * CLS_F_IDX_EXACT, hashes all the fields in 'flow'). */ +static uint32_t +hash_fields(const flow_t *flow, int table_idx) +{ + /* I just know I'm going to hell for writing code this way. + * + * GCC generates pretty good code here, with only a single taken + * conditional jump per execution. Now the question is, would we be better + * off marking this function ALWAYS_INLINE and writing a wrapper that + * switches on the value of 'table_idx' to get rid of all the conditional + * jumps entirely (except for one in the wrapper)? Honestly I really, + * really hope that it doesn't matter in practice. + * + * We could do better by calculating hashes incrementally, instead of + * starting over from the top each time. But that would be even uglier. */ + uint32_t a, b, c; + uint32_t tmp[3]; + size_t n; + + a = b = c = 0xdeadbeef + table_idx; + n = 0; + +#define CLS_FIELD(WILDCARDS, MEMBER, NAME) \ + if (table_idx == CLS_F_IDX_##NAME) { \ + /* Done. */ \ + memset((uint8_t *) tmp + n, 0, sizeof tmp - n); \ + goto finish; \ + } else { \ + const size_t size = sizeof flow->MEMBER; \ + const uint8_t *p1 = (const uint8_t *) &flow->MEMBER; \ + const size_t p1_size = MIN(sizeof tmp - n, size); \ + const uint8_t *p2 = p1 + p1_size; \ + const size_t p2_size = size - p1_size; \ + \ + /* Append to 'tmp' as much data as will fit. */ \ + memcpy((uint8_t *) tmp + n, p1, p1_size); \ + n += p1_size; \ + \ + /* If 'tmp' is full, mix. */ \ + if (n == sizeof tmp) { \ + a += tmp[0]; \ + b += tmp[1]; \ + c += tmp[2]; \ + HASH_MIX(a, b, c); \ + n = 0; \ + } \ + \ + /* Append to 'tmp' any data that didn't fit. */ \ + memcpy(tmp, p2, p2_size); \ + n += p2_size; \ + } + CLS_FIELDS +#undef CLS_FIELD + +finish: + a += tmp[0]; + b += tmp[1]; + c += tmp[2]; + HASH_FINAL(a, b, c); + return c; +} + +/* Compares the fields in 'a' and 'b' whose field indexes (CLS_F_IDX_*) are + * less than 'table_idx'. (If 'table_idx' is CLS_F_IDX_EXACT, compares all the + * fields in 'a' and 'b'). + * + * Returns true if all the compared fields are equal, false otherwise. */ +static bool +equal_fields(const flow_t *a, const flow_t *b, int table_idx) +{ + /* XXX The generated code could be better here. */ +#define CLS_FIELD(WILDCARDS, MEMBER, NAME) \ + if (table_idx == CLS_F_IDX_##NAME) { \ + return true; \ + } else if (!equal_bytes(&a->MEMBER, &b->MEMBER, sizeof a->MEMBER)) { \ + return false; \ + } + CLS_FIELDS +#undef CLS_FIELD + + return true; +} + +static int +table_idx_from_wildcards(uint32_t wildcards) +{ + if (!wildcards) { + return CLS_F_IDX_EXACT; + } +#define CLS_FIELD(WILDCARDS, MEMBER, NAME) \ + if (wildcards & WILDCARDS) { \ + return CLS_F_IDX_##NAME; \ + } + CLS_FIELDS +#undef CLS_FIELD + NOT_REACHED(); +} + +/* Inserts 'rule' into 'table'. Returns the rule, if any, that was displaced + * in favor of 'rule'. */ +static struct cls_rule * +table_insert(struct hmap *table, struct cls_rule *rule) +{ + struct cls_bucket *bucket; + size_t hash; + + hash = hash_fields(&rule->flow, rule->table_idx); + bucket = find_bucket(table, hash, rule); + if (!bucket) { + bucket = create_bucket(table, hash, &rule->flow); + } + + return bucket_insert(bucket, rule); +} + +/* Inserts 'rule' into 'bucket', given that 'field' is the first wildcarded + * field in 'rule'. + * + * Returns the rule, if any, that was displaced in favor of 'rule'. */ +static struct cls_rule * +bucket_insert(struct cls_bucket *bucket, struct cls_rule *rule) +{ + struct cls_rule *pos; + LIST_FOR_EACH (pos, struct cls_rule, node.list, &bucket->rules) { + if (pos->priority <= rule->priority) { + if (pos->priority == rule->priority + && pos->wc.wildcards == rule->wc.wildcards + && rules_match_1wild(pos, rule, rule->table_idx)) + { + list_replace(&rule->node.list, &pos->node.list); + return pos; + } + break; + } + } + list_insert(&pos->node.list, &rule->node.list); + return NULL; +} + +static struct cls_rule * +insert_exact_rule(struct classifier *cls, struct cls_rule *rule) +{ + struct cls_rule *old_rule; + size_t hash; + + hash = flow_hash(&rule->flow, 0); + old_rule = search_exact_table(cls, hash, &rule->flow); + if (old_rule) { + hmap_remove(&cls->exact_table, &old_rule->node.hmap); + } + hmap_insert(&cls->exact_table, &rule->node.hmap, hash); + return old_rule; +} + +/* Returns the bucket in 'table' that has the given 'hash' and the same fields + * as 'rule->flow' (up to 'rule->table_idx'), or a null pointer if no bucket + * matches. */ +static struct cls_bucket * +find_bucket(struct hmap *table, size_t hash, const struct cls_rule *rule) +{ + struct cls_bucket *bucket; + HMAP_FOR_EACH_WITH_HASH (bucket, struct cls_bucket, hmap_node, hash, + table) { + if (equal_fields(&bucket->fixed, &rule->flow, rule->table_idx)) { + return bucket; + } + } + return NULL; +} + +/* Creates a bucket and inserts it in 'table' with the given 'hash' and 'fixed' + * values. Returns the new bucket. */ +static struct cls_bucket * +create_bucket(struct hmap *table, size_t hash, const flow_t *fixed) +{ + struct cls_bucket *bucket = xmalloc(sizeof *bucket); + list_init(&bucket->rules); + bucket->fixed = *fixed; + hmap_insert(table, &bucket->hmap_node, hash); + return bucket; +} + +/* Returns true if the 'n' bytes in 'a' and 'b' are equal, false otherwise. */ +static inline bool ALWAYS_INLINE +equal_bytes(const void *a, const void *b, size_t n) +{ +#ifdef __i386__ + /* For some reason GCC generates stupid code for memcmp() of small + * constant integer lengths. Help it out. + * + * This function is always inlined, and it is always called with 'n' as a + * compile-time constant, so the switch statement gets optimized out and + * this whole function just expands to an instruction or two. */ + switch (n) { + case 1: + return *(uint8_t *) a == *(uint8_t *) b; + + case 2: + return *(uint16_t *) a == *(uint16_t *) b; + + case 4: + return *(uint32_t *) a == *(uint32_t *) b; + + case 6: + return (*(uint32_t *) a == *(uint32_t *) b + && ((uint16_t *) a)[2] == ((uint16_t *) b)[2]); + + default: + abort(); + } +#else + /* I hope GCC is smarter on your platform. */ + return !memcmp(a, b, n); +#endif +} + +/* Returns the 32-bit unsigned integer at 'p'. */ +static inline uint32_t +read_uint32(const void *p) +{ + /* GCC optimizes this into a single machine instruction on x86. */ + uint32_t x; + memcpy(&x, p, sizeof x); + return x; +} + +/* Compares the specified field in 'a' and 'b'. Returns true if the fields are + * equal, or if the ofp_match wildcard bits in 'wildcards' are set such that + * non-equal values may be ignored. 'nw_src_mask' and 'nw_dst_mask' must be + * those that would be set for 'wildcards' by cls_rule_set_masks(). + * + * The compared field is the one with wildcard bit or bits 'field_wc', offset + * 'rule_ofs' within cls_rule's "fields" member, and length 'len', in bytes. */ +static inline bool ALWAYS_INLINE +field_matches(const flow_t *a_, const flow_t *b_, + uint32_t wildcards, uint32_t nw_src_mask, uint32_t nw_dst_mask, + uint32_t field_wc, int ofs, int len) +{ + /* This function is always inlined, and it is always called with 'field_wc' + * as a compile-time constant, so the "if" conditionals here generate no + * code. */ + const void *a = (const uint8_t *) a_ + ofs; + const void *b = (const uint8_t *) b_ + ofs; + if (!(field_wc & (field_wc - 1))) { + /* Handle all the single-bit wildcard cases. */ + return wildcards & field_wc || equal_bytes(a, b, len); + } else if (field_wc == OFPFW_NW_SRC_MASK || + field_wc == OFPFW_NW_DST_MASK) { + uint32_t a_ip = read_uint32(a); + uint32_t b_ip = read_uint32(b); + uint32_t mask = (field_wc == OFPFW_NW_SRC_MASK + ? nw_src_mask : nw_dst_mask); + return ((a_ip ^ b_ip) & mask) == 0; + } else { + abort(); + } +} + +/* Returns true if 'a' and 'b' match, ignoring fields for which the wildcards + * in 'wildcards' are set. 'nw_src_mask' and 'nw_dst_mask' must be those that + * would be set for 'wildcards' by cls_rule_set_masks(). 'field_idx' is the + * index of the first field to be compared; fields before 'field_idx' are + * assumed to match. (Always returns true if 'field_idx' is CLS_N_FIELDS.) */ +static bool +rules_match(const struct cls_rule *a, const struct cls_rule *b, + uint32_t wildcards, uint32_t nw_src_mask, uint32_t nw_dst_mask, + int field_idx) +{ + /* This is related to Duff's device (see + * http://en.wikipedia.org/wiki/Duff's_device). */ + switch (field_idx) { +#define CLS_FIELD(WILDCARDS, MEMBER, NAME) \ + case CLS_F_IDX_##NAME: \ + if (!field_matches(&a->flow, &b->flow, \ + wildcards, nw_src_mask, nw_dst_mask, \ + WILDCARDS, offsetof(flow_t, MEMBER), \ + sizeof a->flow.MEMBER)) { \ + return false; \ + } \ + /* Fall though */ + CLS_FIELDS +#undef CLS_FIELD + } + return true; +} + +/* Returns true if 'fixed' and 'wild' match. All fields in 'fixed' must have + * fixed values; 'wild' may contain wildcards. + * + * 'field_idx' is the index of the first field to be compared; fields before + * 'field_idx' are assumed to match. Always returns true if 'field_idx' is + * CLS_N_FIELDS. */ +static bool +rules_match_1wild(const struct cls_rule *fixed, const struct cls_rule *wild, + int field_idx) +{ + return rules_match(fixed, wild, wild->wc.wildcards, wild->wc.nw_src_mask, + wild->wc.nw_dst_mask, field_idx); +} + +/* Searches 'bucket' for a rule that matches 'target'. Returns the + * highest-priority match, if one is found, or a null pointer if there is no + * match. + * + * 'field_idx' must be the index of the first wildcarded field in 'bucket'. */ +static struct cls_rule * +search_bucket(struct cls_bucket *bucket, int field_idx, + const struct cls_rule *target) +{ + struct cls_rule *pos; + + if (!equal_fields(&bucket->fixed, &target->flow, field_idx)) { + return NULL; + } + + LIST_FOR_EACH (pos, struct cls_rule, node.list, &bucket->rules) { + if (rules_match_1wild(target, pos, field_idx)) { + return pos; + } + } + return NULL; +} + +/* Searches 'table' for a rule that matches 'target'. Returns the + * highest-priority match, if one is found, or a null pointer if there is no + * match. + * + * 'field_idx' must be the index of the first wildcarded field in 'table'. */ +static struct cls_rule * +search_table(const struct hmap *table, int field_idx, + const struct cls_rule *target) +{ + struct cls_bucket *bucket; + + switch (hmap_count(table)) { + /* In these special cases there's no need to hash. */ + case 0: + return NULL; + case 1: + bucket = CONTAINER_OF(hmap_first(table), struct cls_bucket, hmap_node); + return search_bucket(bucket, field_idx, target); + } + + HMAP_FOR_EACH_WITH_HASH (bucket, struct cls_bucket, hmap_node, + hash_fields(&target->flow, field_idx), table) { + struct cls_rule *rule = search_bucket(bucket, field_idx, target); + if (rule) { + return rule; + } + } + return NULL; +} + +static struct cls_rule * +search_exact_table(const struct classifier *cls, size_t hash, + const flow_t *target) +{ + struct cls_rule *rule; + + HMAP_FOR_EACH_WITH_HASH (rule, struct cls_rule, node.hmap, + hash, &cls->exact_table) { + if (flow_equal(&rule->flow, target)) { + return rule; + } + } + return NULL; +} diff --git a/lib/classifier.h b/lib/classifier.h new file mode 100644 index 00000000..a632fb02 --- /dev/null +++ b/lib/classifier.h @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef CLASSIFIER_H +#define CLASSIFIER_H 1 + +/* Flow classifier. + * + * This flow classifier assumes that we can arrange the fields in a flow in an + * order such that the set of wildcarded fields in a rule tend to fall toward + * the end of the ordering. That is, if field F is wildcarded, then all the + * fields after F tend to be wildcarded as well. If this assumption is + * violated, then the classifier will still classify flows correctly, but its + * performance will suffer. + */ + +#include "flow.h" +#include "hmap.h" +#include "list.h" +#include "openflow/openflow.h" + +/* Number of bytes of fields in a rule. */ +#define CLS_N_BYTES 31 + +/* Fields in a rule. + * + * This definition sets the ordering of fields, which is important for + * performance (see above). To adjust the ordering, change the order of the + * lines. */ +#define CLS_FIELDS \ + /* flow_t all-caps */ \ + /* wildcard bit(s) member name name */ \ + /* ----------------- ----------- -------- */ \ + CLS_FIELD(OFPFW_IN_PORT, in_port, IN_PORT) \ + CLS_FIELD(OFPFW_DL_VLAN, dl_vlan, DL_VLAN) \ + CLS_FIELD(OFPFW_DL_SRC, dl_src, DL_SRC) \ + CLS_FIELD(OFPFW_DL_DST, dl_dst, DL_DST) \ + CLS_FIELD(OFPFW_DL_TYPE, dl_type, DL_TYPE) \ + CLS_FIELD(OFPFW_NW_SRC_MASK, nw_src, NW_SRC) \ + CLS_FIELD(OFPFW_NW_DST_MASK, nw_dst, NW_DST) \ + CLS_FIELD(OFPFW_NW_PROTO, nw_proto, NW_PROTO) \ + CLS_FIELD(OFPFW_TP_SRC, tp_src, TP_SRC) \ + CLS_FIELD(OFPFW_TP_DST, tp_dst, TP_DST) + +/* Field indexes. + * + * (These are also indexed into struct classifier's 'tables' array.) */ +enum { +#define CLS_FIELD(WILDCARDS, MEMBER, NAME) CLS_F_IDX_##NAME, + CLS_FIELDS +#undef CLS_FIELD + CLS_F_IDX_EXACT, /* Exact-match table. */ + CLS_N_FIELDS = CLS_F_IDX_EXACT +}; + +/* Field information. */ +struct cls_field { + int ofs; /* Offset in flow_t. */ + int len; /* Length in bytes. */ + uint32_t wildcards; /* OFPFW_* bit or bits for this field. */ + const char *name; /* Name (for debugging). */ +}; +extern const struct cls_field cls_fields[CLS_N_FIELDS + 1]; + +/* A flow classifier. */ +struct classifier { + int n_rules; /* Sum of hmap_count() over tables[]. */ + struct hmap tables[CLS_N_FIELDS]; /* Contain cls_bucket elements. */ + struct hmap exact_table; /* Contain cls_rule elements. */ +}; + +/* A group of rules with the same fixed values for initial fields. */ +struct cls_bucket { + struct hmap_node hmap_node; /* Within struct classifier 'tables'. */ + struct list rules; /* In order from highest to lowest priority. */ + flow_t fixed; /* Values for fixed fields. */ +}; + +/* A flow classification rule. + * + * Use cls_rule_from_flow() or cls_rule_from_match() to initialize a cls_rule + * or you will almost certainly not initialize 'table_idx' correctly, with + * disastrous results! */ +struct cls_rule { + union { + struct list list; /* Within struct cls_bucket 'rules'. */ + struct hmap_node hmap; /* Within struct classifier 'exact_table'. */ + } node; + flow_t flow; /* All field values. */ + struct flow_wildcards wc; /* Wildcards for fields. */ + unsigned int priority; /* Larger numbers are higher priorities. */ + unsigned int table_idx; /* Index into struct classifier 'tables'. */ +}; + +void cls_rule_from_flow(struct cls_rule *, const flow_t *, uint32_t wildcards, + unsigned int priority); +void cls_rule_from_match(struct cls_rule *, const struct ofp_match *, + unsigned int priority); +void cls_rule_print(const struct cls_rule *); +void cls_rule_moved(struct classifier *, + struct cls_rule *old, struct cls_rule *new); +void cls_rule_replace(struct classifier *, const struct cls_rule *old, + struct cls_rule *new); + +void classifier_init(struct classifier *); +void classifier_destroy(struct classifier *); +bool classifier_is_empty(const struct classifier *); +int classifier_count(const struct classifier *); +int classifier_count_exact(const struct classifier *); +struct cls_rule *classifier_insert(struct classifier *, struct cls_rule *); +void classifier_insert_exact(struct classifier *, struct cls_rule *); +void classifier_remove(struct classifier *, struct cls_rule *); +struct cls_rule *classifier_lookup(const struct classifier *, const flow_t *); +struct cls_rule *classifier_lookup_wild(const struct classifier *, + const flow_t *); +struct cls_rule *classifier_lookup_exact(const struct classifier *, + const flow_t *); + +typedef void cls_cb_func(struct cls_rule *, void *aux); + +enum { + CLS_INC_EXACT = 1 << 0, /* Include exact-match flows? */ + CLS_INC_WILD = 1 << 1, /* Include flows with wildcards? */ + CLS_INC_ALL = CLS_INC_EXACT | CLS_INC_WILD +}; +void classifier_for_each(const struct classifier *, int include, + cls_cb_func *, void *aux); +void classifier_for_each_match(const struct classifier *, + const struct cls_rule *, + int include, cls_cb_func *, void *aux); +struct cls_rule *classifier_find_rule_exactly(const struct classifier *, + const flow_t *target, + uint32_t wildcards, + unsigned int priority); + +#endif /* classifier.h */ diff --git a/lib/command-line.c b/lib/command-line.c new file mode 100644 index 00000000..e0bd8811 --- /dev/null +++ b/lib/command-line.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "command-line.h" +#include +#include +#include "util.h" +#include "vlog.h" + +/* Given the GNU-style long options in 'options', returns a string that may be + * passed to getopt() with the corresponding short options. The caller is + * responsible for freeing the string. */ +char * +long_options_to_short_options(const struct option options[]) +{ + char short_options[UCHAR_MAX * 3 + 1]; + char *p = short_options; + + for (; options->name; options++) { + const struct option *o = options; + if (o->flag == NULL && o->val > 0 && o->val <= UCHAR_MAX) { + *p++ = o->val; + if (o->has_arg == required_argument) { + *p++ = ':'; + } else if (o->has_arg == optional_argument) { + *p++ = ':'; + *p++ = ':'; + } + } + } + *p = '\0'; + + return xstrdup(short_options); +} + diff --git a/lib/command-line.h b/lib/command-line.h new file mode 100644 index 00000000..f58a3360 --- /dev/null +++ b/lib/command-line.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef COMMAND_LINE_H +#define COMMAND_LINE_H 1 + +/* Utilities for command-line parsing. */ + +struct option; +char *long_options_to_short_options(const struct option *options); + +#endif /* command-line.h */ diff --git a/lib/common.man b/lib/common.man new file mode 100644 index 00000000..84e81c2c --- /dev/null +++ b/lib/common.man @@ -0,0 +1,7 @@ +.TP +\fB-h\fR, \fB--help\fR +Prints a brief help message to the console. + +.TP +\fB-V\fR, \fB--version\fR +Prints version information to the console. diff --git a/lib/compiler.h b/lib/compiler.h new file mode 100644 index 00000000..a08678b2 --- /dev/null +++ b/lib/compiler.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef COMPILER_H +#define COMPILER_H 1 + +#define NO_RETURN __attribute__((__noreturn__)) +#define UNUSED __attribute__((__unused__)) +#define PACKED __attribute__((__packed__)) +#define PRINTF_FORMAT(FMT, ARG1) __attribute__((__format__(printf, FMT, ARG1))) +#define STRFTIME_FORMAT(FMT) __attribute__((__format__(__strftime__, FMT, 0))) +#define MALLOC_LIKE __attribute__((__malloc__)) +#define ALWAYS_INLINE __attribute__((always_inline)) +#define likely(x) __builtin_expect((x),1) +#define unlikely(x) __builtin_expect((x),0) + +#endif /* compiler.h */ diff --git a/lib/coverage-counters.h b/lib/coverage-counters.h new file mode 100644 index 00000000..fa0cf73d --- /dev/null +++ b/lib/coverage-counters.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef COVERAGE_COUNTERS_H +#define COVERAGE_COUNTERS_H 1 + +#include + +extern struct coverage_counter *coverage_counters[]; +extern size_t coverage_n_counters; + +#endif /* coverage.h */ diff --git a/lib/coverage-scan.pl b/lib/coverage-scan.pl new file mode 100755 index 00000000..886223c7 --- /dev/null +++ b/lib/coverage-scan.pl @@ -0,0 +1,47 @@ +# Copyright (c) 2009 Nicira Networks. +# +# Permission to use, copy, modify, and/or distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +use strict; +use warnings; + +my %counters; +while (<>) { + my ($counter) = /^\s*COVERAGE_(?:INC|ADD)\s*\(\s*([a-zA-Z_][a-zA-Z_0-9]*)/ + or next; + push (@{$counters{$counter}}, "$ARGV:$."); +} continue { + # This magic resets $. from one file to the next. See "perldoc -f eof". + close ARGV if eof; +} + +print < +#include "coverage.h" +#include "util.h" + +EOF + +for my $counter (sort(keys(%counters))) { + my $locations = join(', ', @{$counters{$counter}}); + print < +#include "coverage.h" +#include +#include +#include "coverage-counters.h" +#include "dynamic-string.h" +#include "hash.h" +#include "util.h" + +#define THIS_MODULE VLM_coverage +#include "vlog.h" + +static unsigned int epoch; + +/* Sorts coverage counters in descending order by count, within equal counts + * alphabetically by name. */ +static int +compare_coverage_counters(const void *a_, const void *b_) +{ + const struct coverage_counter *const *ap = a_; + const struct coverage_counter *const *bp = b_; + const struct coverage_counter *a = *ap; + const struct coverage_counter *b = *bp; + if (a->count != b->count) { + return a->count < b->count ? 1 : -1; + } else { + return strcmp(a->name, b->name); + } +} + +static uint32_t +coverage_hash(void) +{ + struct coverage_counter **c; + uint32_t hash = 0; + int n_groups, i; + + /* Sort coverage counters into groups with equal counts. */ + c = xmalloc(coverage_n_counters * sizeof *c); + for (i = 0; i < coverage_n_counters; i++) { + c[i] = coverage_counters[i]; + } + qsort(c, coverage_n_counters, sizeof *c, compare_coverage_counters); + + /* Hash the names in each group along with the rank. */ + n_groups = 0; + for (i = 0; i < coverage_n_counters; ) { + int j; + + if (!c[i]->count) { + break; + } + n_groups++; + hash = hash_int(i, hash); + for (j = i; j < coverage_n_counters; j++) { + if (c[j]->count != c[i]->count) { + break; + } + hash = hash_string(c[j]->name, hash); + } + i = j; + } + + free(c); + + return hash_int(n_groups, hash); +} + +static bool +coverage_hit(uint32_t hash) +{ + enum { HIT_BITS = 1024, BITS_PER_WORD = 32 }; + static uint32_t hit[HIT_BITS / BITS_PER_WORD]; + BUILD_ASSERT_DECL(IS_POW2(HIT_BITS)); + + unsigned int bit_index = hash & (HIT_BITS - 1); + unsigned int word_index = bit_index / BITS_PER_WORD; + unsigned int word_mask = 1u << (bit_index % BITS_PER_WORD); + + if (hit[word_index] & word_mask) { + return true; + } else { + hit[word_index] |= word_mask; + return false; + } +} + +static void +coverage_log_counter(enum vlog_level level, const struct coverage_counter *c) +{ + VLOG(level, "%-24s %5u / %9llu", c->name, c->count, c->count + c->total); +} + +/* Logs the coverage counters at the given vlog 'level'. */ +void +coverage_log(enum vlog_level level) +{ + size_t n_never_hit; + uint32_t hash; + size_t i; + + if (!vlog_is_enabled(THIS_MODULE, level)) { + return; + } + + hash = coverage_hash(); + if (coverage_hit(hash)) { + VLOG(level, "Skipping details of duplicate event coverage for " + "hash=%08"PRIx32" in epoch %u", hash, epoch); + return; + } + + n_never_hit = 0; + VLOG(level, "Event coverage (epoch %u/entire run), hash=%08"PRIx32":", + epoch, hash); + for (i = 0; i < coverage_n_counters; i++) { + struct coverage_counter *c = coverage_counters[i]; + if (c->count) { + coverage_log_counter(level, c); + } + } + for (i = 0; i < coverage_n_counters; i++) { + struct coverage_counter *c = coverage_counters[i]; + if (!c->count) { + if (c->total) { + coverage_log_counter(level, c); + } else { + n_never_hit++; + } + } + } + VLOG(level, "%zu events never hit", n_never_hit); +} + +/* Advances to the next epoch of coverage, resetting all the counters to 0. */ +void +coverage_clear(void) +{ + size_t i; + + epoch++; + for (i = 0; i < coverage_n_counters; i++) { + struct coverage_counter *c = coverage_counters[i]; + c->total += c->count; + c->count = 0; + } +} diff --git a/lib/coverage.h b/lib/coverage.h new file mode 100644 index 00000000..98b6a85b --- /dev/null +++ b/lib/coverage.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef COVERAGE_H +#define COVERAGE_H 1 + +/* This file implements a simple form of coverage instrumentation. Points in + * source code that are of interest must be explicitly annotated with + * COVERAGE_INC. The coverage counters may be logged at any time with + * coverage_log(). + * + * This form of coverage instrumentation is intended to be so lightweight that + * it can be enabled in production builds. It is obviously not a substitute + * for traditional coverage instrumentation with e.g. "gcov", but it is still + * a useful debugging tool. */ + +#include "vlog.h" + +/* A coverage counter. */ +struct coverage_counter { + const char *name; /* Textual name. */ + unsigned int count; /* Count within the current epoch. */ + unsigned long long int total; /* Total count over all epochs. */ +}; + +/* Increments the counter with the given NAME. Coverage counters need not be + * declared explicitly, but when you add the first coverage counter to a given + * file, you must also add that file to COVERAGE_FILES in lib/automake.mk. */ +#define COVERAGE_INC(NAME) \ + do { \ + extern struct coverage_counter NAME##_count; \ + NAME##_count.count++; \ + } while (0) + +/* Adds AMOUNT to the coverage counter with the given NAME. */ +#define COVERAGE_ADD(NAME, AMOUNT) \ + do { \ + extern struct coverage_counter NAME##_count; \ + NAME##_count.count += AMOUNT; \ + } while (0) + +void coverage_log(enum vlog_level); +void coverage_clear(void); + +#endif /* coverage.h */ diff --git a/lib/csum.c b/lib/csum.c new file mode 100644 index 00000000..5266be6d --- /dev/null +++ b/lib/csum.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "csum.h" + +/* Returns the IP checksum of the 'n' bytes in 'data'. */ +uint16_t +csum(const void *data, size_t n) +{ + return csum_finish(csum_continue(0, data, n)); +} + +/* Adds the 16 bits in 'new' to the partial IP checksum 'partial' and returns + * the updated checksum. (To start a new checksum, pass 0 for 'partial'. To + * obtain the finished checksum, pass the return value to csum_finish().) */ +uint32_t +csum_add16(uint32_t partial, uint16_t new) +{ + return partial + new; +} + +/* Adds the 32 bits in 'new' to the partial IP checksum 'partial' and returns + * the updated checksum. (To start a new checksum, pass 0 for 'partial'. To + * obtain the finished checksum, pass the return value to csum_finish().) */ +uint32_t +csum_add32(uint32_t partial, uint32_t new) +{ + return partial + (new >> 16) + (new & 0xffff); +} + + +/* Adds the 'n' bytes in 'data' to the partial IP checksum 'partial' and + * returns the updated checksum. (To start a new checksum, pass 0 for + * 'partial'. To obtain the finished checksum, pass the return value to + * csum_finish().) */ +uint32_t +csum_continue(uint32_t partial, const void *data_, size_t n) +{ + const uint16_t *data = data_; + + for (; n > 1; n -= 2) { + partial = csum_add16(partial, *data++); + } + if (n) { + partial += *(uint8_t *) data; + } + return partial; +} + +/* Returns the IP checksum corresponding to 'partial', which is a value updated + * by some combination of csum_add16(), csum_add32(), and csum_continue(). */ +uint16_t +csum_finish(uint32_t partial) +{ + return ~((partial & 0xffff) + (partial >> 16)); +} + +/* Returns the new checksum for a packet in which the checksum field previously + * contained 'old_csum' and in which a field that contained 'old_u16' was + * changed to contain 'new_u16'. */ +uint16_t +recalc_csum16(uint16_t old_csum, uint16_t old_u16, uint16_t new_u16) +{ + /* Ones-complement arithmetic is endian-independent, so this code does not + * use htons() or ntohs(). + * + * See RFC 1624 for formula and explanation. */ + uint16_t hc_complement = ~old_csum; + uint16_t m_complement = ~old_u16; + uint16_t m_prime = new_u16; + uint32_t sum = hc_complement + m_complement + m_prime; + uint16_t hc_prime_complement = sum + (sum >> 16); + return ~hc_prime_complement; +} + +/* Returns the new checksum for a packet in which the checksum field previously + * contained 'old_csum' and in which a field that contained 'old_u32' was + * changed to contain 'new_u32'. */ +uint16_t +recalc_csum32(uint16_t old_csum, uint32_t old_u32, uint32_t new_u32) +{ + return recalc_csum16(recalc_csum16(old_csum, old_u32, new_u32), + old_u32 >> 16, new_u32 >> 16); +} diff --git a/lib/csum.h b/lib/csum.h new file mode 100644 index 00000000..25de1ac4 --- /dev/null +++ b/lib/csum.h @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef CSUM_H +#define CSUM_H 1 + +#include +#include + +uint16_t csum(const void *, size_t); +uint32_t csum_add16(uint32_t partial, uint16_t); +uint32_t csum_add32(uint32_t partial, uint32_t); +uint32_t csum_continue(uint32_t partial, const void *, size_t); +uint16_t csum_finish(uint32_t partial); +uint16_t recalc_csum16(uint16_t old_csum, uint16_t old_u16, uint16_t new_u16); +uint16_t recalc_csum32(uint16_t old_csum, uint32_t old_u32, uint32_t new_u32); + +#endif /* csum.h */ diff --git a/lib/daemon.c b/lib/daemon.c new file mode 100644 index 00000000..8b176238 --- /dev/null +++ b/lib/daemon.c @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "daemon.h" +#include +#include +#include +#include +#include +#include "fatal-signal.h" +#include "dirs.h" +#include "util.h" + +#define THIS_MODULE VLM_daemon +#include "vlog.h" + +/* Should we run in the background? */ +static bool detach; + +/* Name of pidfile (null if none). */ +static char *pidfile; + +/* Create pidfile even if one already exists and is locked? */ +static bool force; + +/* Returns the file name that would be used for a pidfile if 'name' were + * provided to set_pidfile(). The caller must free the returned string. */ +char * +make_pidfile_name(const char *name) +{ + return (!name ? xasprintf("%s/%s.pid", ovs_rundir, program_name) + : *name == '/' ? xstrdup(name) + : xasprintf("%s/%s", ovs_rundir, name)); +} + +/* Sets up a following call to daemonize() to create a pidfile named 'name'. + * If 'name' begins with '/', then it is treated as an absolute path. + * Otherwise, it is taken relative to RUNDIR, which is $(prefix)/var/run by + * default. + * + * If 'name' is null, then program_name followed by ".pid" is used. */ +void +set_pidfile(const char *name) +{ + free(pidfile); + pidfile = make_pidfile_name(name); +} + +/* Returns an absolute path to the configured pidfile, or a null pointer if no + * pidfile is configured. The caller must not modify or free the returned + * string. */ +const char * +get_pidfile(void) +{ + return pidfile; +} + +/* Normally, die_if_already_running() will terminate the program with a message + * if a locked pidfile already exists. If this function is called, + * die_if_already_running() will merely log a warning. */ +void +ignore_existing_pidfile(void) +{ + force = true; +} + +/* Sets up a following call to daemonize() to detach from the foreground + * session, running this process in the background. */ +void +set_detach(void) +{ + detach = true; +} + +/* If a pidfile has been configured and that pidfile already exists and is + * locked by a running process, returns the pid of the running process. + * Otherwise, returns 0. */ +static pid_t +already_running(void) +{ + pid_t pid = 0; + if (pidfile) { + int fd = open(pidfile, O_RDWR); + if (fd >= 0) { + struct flock lck; + lck.l_type = F_WRLCK; + lck.l_whence = SEEK_SET; + lck.l_start = 0; + lck.l_len = 0; + if (fcntl(fd, F_GETLK, &lck) != -1 && lck.l_type != F_UNLCK) { + pid = lck.l_pid; + } + close(fd); + } + } + return pid; +} + +/* If a locked pidfile exists, issue a warning message and, unless + * ignore_existing_pidfile() has been called, terminate the program. */ +void +die_if_already_running(void) +{ + pid_t pid = already_running(); + if (pid) { + if (!force) { + ovs_fatal(0, "%s: already running as pid %ld", + get_pidfile(), (long int) pid); + } else { + VLOG_WARN("%s: %s already running as pid %ld", + get_pidfile(), program_name, (long int) pid); + } + } +} + +/* If a pidfile has been configured, creates it and stores the running process' + * pid init. Ensures that the pidfile will be deleted when the process + * exits. */ +static void +make_pidfile(void) +{ + if (pidfile) { + /* Create pidfile via temporary file, so that observers never see an + * empty pidfile or an unlocked pidfile. */ + long int pid = getpid(); + char *tmpfile; + int fd; + + tmpfile = xasprintf("%s.tmp%ld", pidfile, pid); + fatal_signal_add_file_to_unlink(tmpfile); + fd = open(tmpfile, O_CREAT | O_WRONLY | O_TRUNC, 0666); + if (fd >= 0) { + struct flock lck; + lck.l_type = F_WRLCK; + lck.l_whence = SEEK_SET; + lck.l_start = 0; + lck.l_len = 0; + if (fcntl(fd, F_SETLK, &lck) != -1) { + char *text = xasprintf("%ld\n", pid); + if (write(fd, text, strlen(text)) == strlen(text)) { + fatal_signal_add_file_to_unlink(pidfile); + if (rename(tmpfile, pidfile) < 0) { + VLOG_ERR("failed to rename \"%s\" to \"%s\": %s", + tmpfile, pidfile, strerror(errno)); + fatal_signal_remove_file_to_unlink(pidfile); + close(fd); + } else { + /* Keep 'fd' open to retain the lock. */ + } + free(text); + } else { + VLOG_ERR("%s: write failed: %s", tmpfile, strerror(errno)); + close(fd); + } + } else { + VLOG_ERR("%s: fcntl failed: %s", tmpfile, strerror(errno)); + close(fd); + } + } else { + VLOG_ERR("%s: create failed: %s", tmpfile, strerror(errno)); + } + fatal_signal_remove_file_to_unlink(tmpfile); + free(tmpfile); + } + free(pidfile); + pidfile = NULL; +} + +/* If configured with set_pidfile() or set_detach(), creates the pid file and + * detaches from the foreground session. */ +void +daemonize(void) +{ + if (detach) { + char c = 0; + int fds[2]; + if (pipe(fds) < 0) { + ovs_fatal(errno, "pipe failed"); + } + + switch (fork()) { + default: + /* Parent process: wait for child to create pidfile, then exit. */ + close(fds[1]); + fatal_signal_fork(); + if (read(fds[0], &c, 1) != 1) { + ovs_fatal(errno, "daemon child failed to signal startup"); + } + exit(0); + + case 0: + /* Child process. */ + close(fds[0]); + make_pidfile(); + write(fds[1], &c, 1); + close(fds[1]); + setsid(); + chdir("/"); + break; + + case -1: + /* Error. */ + ovs_fatal(errno, "could not fork"); + break; + } + } else { + make_pidfile(); + } +} + +void +daemon_usage(void) +{ + printf( + "\nDaemon options:\n" + " -D, --detach run in background as daemon\n" + " -P, --pidfile[=FILE] create pidfile (default: %s/%s.pid)\n" + " -f, --force with -P, start even if already running\n", + ovs_rundir, program_name); +} + +/* Opens and reads a PID from 'pidfile'. Returns the nonnegative PID if + * successful, otherwise a negative errno value. */ +pid_t +read_pidfile(const char *pidfile) +{ + char line[128]; + struct flock lck; + FILE *file; + int error; + + file = fopen(pidfile, "r"); + if (!file) { + error = errno; + VLOG_WARN("%s: open: %s", pidfile, strerror(error)); + goto error; + } + + lck.l_type = F_WRLCK; + lck.l_whence = SEEK_SET; + lck.l_start = 0; + lck.l_len = 0; + if (fcntl(fileno(file), F_GETLK, &lck)) { + error = errno; + VLOG_WARN("%s: fcntl: %s", pidfile, strerror(error)); + goto error; + } + if (lck.l_type == F_UNLCK) { + error = ESRCH; + VLOG_WARN("%s: pid file is not locked", pidfile); + goto error; + } + + if (!fgets(line, sizeof line, file)) { + if (ferror(file)) { + error = errno; + VLOG_WARN("%s: read: %s", pidfile, strerror(error)); + } else { + error = ESRCH; + VLOG_WARN("%s: read: unexpected end of file", pidfile); + } + goto error; + } + + if (lck.l_pid != strtoul(line, NULL, 10)) { + error = ESRCH; + VLOG_WARN("l_pid (%ld) != %s pid (%s)", + (long int) lck.l_pid, pidfile, line); + goto error; + } + + fclose(file); + return lck.l_pid; + +error: + if (file) { + fclose(file); + } + return -error; +} diff --git a/lib/daemon.h b/lib/daemon.h new file mode 100644 index 00000000..7781dd06 --- /dev/null +++ b/lib/daemon.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef DAEMON_H +#define DAEMON_H 1 + +#include +#include + +#define DAEMON_LONG_OPTIONS \ + {"detach", no_argument, 0, 'D'}, \ + {"force", no_argument, 0, 'f'}, \ + {"pidfile", optional_argument, 0, 'P'} + +#define DAEMON_OPTION_HANDLERS \ + case 'D': \ + set_detach(); \ + break; \ + \ + case 'P': \ + set_pidfile(optarg); \ + break; \ + \ + case 'f': \ + ignore_existing_pidfile(); \ + break; + +char *make_pidfile_name(const char *name); +void set_pidfile(const char *name); +const char *get_pidfile(void); +void set_detach(void); +void daemonize(void); +void die_if_already_running(void); +void ignore_existing_pidfile(void); +void daemon_usage(void); +pid_t read_pidfile(const char *name); + +#endif /* daemon.h */ diff --git a/lib/daemon.man b/lib/daemon.man new file mode 100644 index 00000000..4ab65680 --- /dev/null +++ b/lib/daemon.man @@ -0,0 +1,21 @@ +.TP +\fB-P\fR[\fIpidfile\fR], \fB--pidfile\fR[\fB=\fIpidfile\fR] +Causes a file (by default, \fB\*(PN.pid\fR) to be created indicating +the PID of the running process. If \fIpidfile\fR is not specified, or +if it does not begin with \fB/\fR, then it is created in +\fB@RUNDIR@\fR. + +.TP +\fB-f\fR, \fB--force\fR +By default, when \fB-P\fR or \fB--pidfile\fR is specified and the +specified pidfile already exists and is locked by a running process, +\fB\*(PN\fR refuses to start. Specify \fB-f\fR or \fB--force\fR +to cause it to instead overwrite the pidfile. + +When \fB-P\fR or \fB--pidfile\fR is not specified, this option has no +effect. + +.TP +\fB-D\fR, \fB--detach\fR +Causes \fB\*(PN\fR to detach itself from the foreground session and +run as a background process. diff --git a/lib/dh1024.pem b/lib/dh1024.pem new file mode 100644 index 00000000..6eaeca9b --- /dev/null +++ b/lib/dh1024.pem @@ -0,0 +1,10 @@ +-----BEGIN DH PARAMETERS----- +MIGHAoGBAPSI/VhOSdvNILSd5JEHNmszbDgNRR0PfIizHHxbLY7288kjwEPwpVsY +jY67VYy4XTjTNP18F1dDox0YbN4zISy1Kv884bEpQBgRjXyEpwpy1obEAxnIByl6 +ypUM2Zafq9AKUJsCRtMIPWakXUGfnHy9iUsiGSa6q6Jew1XpL3jHAgEC +-----END DH PARAMETERS----- + +These are the 1024 bit DH parameters from "Assigned Number for SKIP Protocols" +(http://www.skip-vpn.org/spec/numbers.html). +See there for how they were generated. +Note that g is not a generator, but this is not a problem since p is a safe prime. diff --git a/lib/dh2048.pem b/lib/dh2048.pem new file mode 100644 index 00000000..dcd0b8d0 --- /dev/null +++ b/lib/dh2048.pem @@ -0,0 +1,12 @@ +-----BEGIN DH PARAMETERS----- +MIIBCAKCAQEA9kJXtwh/CBdyorrWqULzBej5UxE5T7bxbrlLOCDaAadWoxTpj0BV +89AHxstDqZSt90xkhkn4DIO9ZekX1KHTUPj1WV/cdlJPPT2N286Z4VeSWc39uK50 +T8X8dryDxUcwYc58yWb/Ffm7/ZFexwGq01uejaClcjrUGvC/RgBYK+X0iP1YTknb +zSC0neSRBzZrM2w4DUUdD3yIsxx8Wy2O9vPJI8BD8KVbGI2Ou1WMuF040zT9fBdX +Q6MdGGzeMyEstSr/POGxKUAYEY18hKcKctaGxAMZyAcpesqVDNmWn6vQClCbAkbT +CD1mpF1Bn5x8vYlLIhkmuquiXsNV6TILOwIBAg== +-----END DH PARAMETERS----- + +These are the 2048 bit DH parameters from "Assigned Number for SKIP Protocols" +(http://www.skip-vpn.org/spec/numbers.html). +See there for how they were generated. diff --git a/lib/dh4096.pem b/lib/dh4096.pem new file mode 100644 index 00000000..1b35ad8e --- /dev/null +++ b/lib/dh4096.pem @@ -0,0 +1,18 @@ +-----BEGIN DH PARAMETERS----- +MIICCAKCAgEA+hRyUsFN4VpJ1O8JLcCo/VWr19k3BCgJ4uk+d+KhehjdRqNDNyOQ +l/MOyQNQfWXPeGKmOmIig6Ev/nm6Nf9Z2B1h3R4hExf+zTiHnvVPeRBhjdQi81rt +Xeoh6TNrSBIKIHfUJWBh3va0TxxjQIs6IZOLeVNRLMqzeylWqMf49HsIXqbcokUS +Vt1BkvLdW48j8PPv5DsKRN3tloTxqDJGo9tKvj1Fuk74A+Xda1kNhB7KFlqMyN98 +VETEJ6c7KpfOo30mnK30wqw3S8OtaIR/maYX72tGOno2ehFDkq3pnPtEbD2CScxc +alJC+EL7RPk5c/tgeTvCngvc1KZn92Y//EI7G9tPZtylj2b56sHtMftIoYJ9+ODM +sccD5Piz/rejE3Ome8EOOceUSCYAhXn8b3qvxVI1ddd1pED6FHRhFvLrZxFvBEM9 +ERRMp5QqOaHJkM+Dxv8Cj6MqrCbfC4u+ZErxodzuusgDgvZiLF22uxMZbobFWyte +OvOzKGtwcTqO/1wV5gKkzu1ZVswVUQd5Gg8lJicwqRWyyNRczDDoG9jVDxmogKTH +AaqLulO7R8Ifa1SwF2DteSGVtgWEN8gDpN3RBmmPTDngyF2DHb5qmpnznwtFKdTL +KWbuHn491xNO25CQWMtem80uKw+pTnisBRF/454n1Jnhub144YRBoN8CAQI= +-----END DH PARAMETERS----- + +These are the 4096 bit DH parameters from "Assigned Number for SKIP Protocols" +(http://www.skip-vpn.org/spec/numbers.html). +See there for how they were generated. +Note that g is not a generator, but this is not a problem since p is a safe prime. diff --git a/lib/dhcp-client.c b/lib/dhcp-client.c new file mode 100644 index 00000000..225be3de --- /dev/null +++ b/lib/dhcp-client.c @@ -0,0 +1,1073 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "dhcp-client.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "csum.h" +#include "dhcp.h" +#include "dynamic-string.h" +#include "flow.h" +#include "netdev.h" +#include "ofpbuf.h" +#include "poll-loop.h" +#include "sat-math.h" +#include "timeval.h" + +#define THIS_MODULE VLM_dhcp_client +#include "vlog.h" + +#define DHCLIENT_STATES \ + DHCLIENT_STATE(INIT, 1 << 0) \ + DHCLIENT_STATE(INIT_REBOOT, 1 << 1) \ + DHCLIENT_STATE(REBOOTING, 1 << 2) \ + DHCLIENT_STATE(SELECTING, 1 << 3) \ + DHCLIENT_STATE(REQUESTING, 1 << 4) \ + DHCLIENT_STATE(BOUND, 1 << 5) \ + DHCLIENT_STATE(RENEWING, 1 << 6) \ + DHCLIENT_STATE(REBINDING, 1 << 7) \ + DHCLIENT_STATE(RELEASED, 1 << 8) +enum dhclient_state { +#define DHCLIENT_STATE(NAME, VALUE) S_##NAME = VALUE, + DHCLIENT_STATES +#undef DHCLIENT_STATE +}; + +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 60); + +static const char * +state_name(enum dhclient_state state) +{ + switch (state) { +#define DHCLIENT_STATE(NAME, VALUE) case S_##NAME: return #NAME; + DHCLIENT_STATES +#undef DHCLIENT_STATE + } + return "***ERROR***"; +} + +struct dhclient { + /* Configuration. */ + struct netdev *netdev; + + void (*modify_request)(struct dhcp_msg *, void *aux); + bool (*validate_offer)(const struct dhcp_msg *, void *aux); + void *aux; + + /* DHCP state. */ + enum dhclient_state state; + unsigned int state_entered; /* When we transitioned to this state. */ + uint32_t xid; /* In host byte order. */ + uint32_t ipaddr, netmask, router; + uint32_t server_ip; + struct dhcp_msg *binding; + bool changed; + + unsigned int retransmit, delay; /* Used by send_reliably(). */ + unsigned int max_timeout; + + unsigned int init_delay; /* Used by S_INIT. */ + + time_t lease_expiration; + unsigned int bound_timeout; + unsigned int renewing_timeout; + unsigned int rebinding_timeout; + + /* Used by dhclient_run() and dhclient_wait() */ + unsigned int min_timeout; + int received; + + /* Set when we send out a DHCPDISCOVER message. */ + uint32_t secs; + + struct ds s; +}; + +/* Minimum acceptable lease time, in seconds. */ +#define MIN_ACCEPTABLE_LEASE 15 + +static void state_transition(struct dhclient *, enum dhclient_state); +static unsigned int elapsed_in_this_state(const struct dhclient *cli); +static bool timeout(struct dhclient *, unsigned int secs); + +static void dhclient_msg_init(struct dhclient *, enum dhcp_msg_type, + struct dhcp_msg *); +static void send_reliably(struct dhclient *cli, + void (*make_packet)(struct dhclient *, + struct dhcp_msg *)); +static bool do_receive_msg(struct dhclient *, struct dhcp_msg *); +static void do_send_msg(struct dhclient *, const struct dhcp_msg *); +static bool receive_ack(struct dhclient *); + +static unsigned int fuzz(unsigned int x, int max_fuzz); +static unsigned int calc_t2(unsigned int lease); +static unsigned int calc_t1(unsigned int lease, unsigned int t2); + +static unsigned int clamp(unsigned int x, unsigned int min, unsigned int max); + +/* Creates a new DHCP client to configure the network device 'netdev_name' + * (e.g. "eth0"). + * + * If 'modify_request' is non-null, then each DHCP message to discover or + * request an address will be passed to it (along with auxiliary data 'aux'). + * It may then add any desired options to the message for transmission. + * + * If 'validate_offer' is non-null, then each DHCP message that offers an + * address will be passed to it (along with auxiliary data 'aux') for + * validation: if it returns true, the address will accepted; otherwise, it + * will be rejected. + * + * The DHCP client will not start advertising for an IP address until + * dhclient_init() is called. + * + * If successful, returns 0 and sets '*cli' to the new DHCP client. Otherwise, + * returns a positive errno value and sets '*cli' to a null pointer. */ +int +dhclient_create(const char *netdev_name, + void (*modify_request)(struct dhcp_msg *, void *aux), + bool (*validate_offer)(const struct dhcp_msg *, void *aux), + void *aux, struct dhclient **cli_) +{ + struct dhclient *cli; + struct netdev *netdev; + int error; + + *cli_ = NULL; + + error = netdev_open(netdev_name, ETH_TYPE_IP, &netdev); + /* XXX install socket filter to catch only DHCP packets. */ + if (error) { + VLOG_ERR("could not open %s network device: %s", + netdev_name, strerror(error)); + return error; + } + + error = netdev_turn_flags_on(netdev, NETDEV_UP, false); + if (error) { + VLOG_ERR("could not bring %s device up: %s", + netdev_name, strerror(error)); + netdev_close(netdev); + return error; + } + + cli = xcalloc(1, sizeof *cli); + cli->modify_request = modify_request; + cli->validate_offer = validate_offer; + cli->aux = aux; + cli->netdev = netdev; + cli->state = S_RELEASED; + cli->state_entered = time_now(); + cli->xid = random_uint32(); + cli->ipaddr = 0; + cli->server_ip = 0; + cli->retransmit = cli->delay = 0; + cli->max_timeout = 64; + cli->min_timeout = 1; + ds_init(&cli->s); + cli->changed = true; + *cli_ = cli; + return 0; +} + +/* Sets the maximum amount of timeout that 'cli' will wait for a reply from + * the DHCP server before retransmitting, in seconds, to 'max_timeout'. The + * default is 64 seconds. */ +void +dhclient_set_max_timeout(struct dhclient *cli, unsigned int max_timeout) +{ + cli->max_timeout = MAX(2, max_timeout); +} + +/* Destroys 'cli' and frees all related resources. */ +void +dhclient_destroy(struct dhclient *cli) +{ + if (cli) { + dhcp_msg_uninit(cli->binding); + free(cli->binding); + netdev_close(cli->netdev); + ds_destroy(&cli->s); + free(cli); + } +} + +/* Returns the network device in use by 'cli'. The caller must not destroy + * the returned device. */ +struct netdev * +dhclient_get_netdev(struct dhclient *cli) +{ + return cli->netdev; +} + +/* Forces 'cli' into a (re)initialization state, in which no address is bound + * but the client is advertising to obtain one. If 'requested_ip' is nonzero, + * then the client will attempt to re-bind to that IP address; otherwise, it + * will not ask for any particular address. */ +void +dhclient_init(struct dhclient *cli, uint32_t requested_ip) +{ + state_transition(cli, requested_ip ? S_INIT_REBOOT : S_INIT); + cli->ipaddr = requested_ip; + cli->min_timeout = 0; + cli->init_delay = 0; +} + +/* Forces 'cli' to release its bound IP address (if any). The client will not + * advertise for a new address until dhclient_init() is called again. */ +void +dhclient_release(struct dhclient *cli) +{ + if (dhclient_is_bound(cli)) { + struct dhcp_msg msg; + dhclient_msg_init(cli, DHCPRELEASE, &msg); + msg.ciaddr = cli->ipaddr; + do_send_msg(cli, &msg); + dhcp_msg_uninit(&msg); + } + state_transition(cli, S_RELEASED); + cli->min_timeout = UINT_MAX; +} + +static void +do_force_renew(struct dhclient *cli, int deadline) +{ + time_t now = time_now(); + unsigned int lease_left = sat_sub(cli->lease_expiration, now); + if (lease_left <= deadline) { + if (cli->state & (S_RENEWING | S_REBINDING)) { + return; + } + deadline = lease_left; + } + if (cli->state & (S_BOUND | S_RENEWING)) { + state_transition(cli, S_RENEWING); + cli->renewing_timeout = deadline * 3 / 4; + cli->rebinding_timeout = deadline * 1 / 4; + } else { + state_transition(cli, S_REBINDING); + cli->rebinding_timeout = deadline; + } + cli->min_timeout = 0; +} + +/* Forces 'cli' to attempt to renew the lease its current IP address (if any) + * within 'deadline' seconds. If the deadline is not met, then the client + * gives up its IP address binding and re-starts the DHCP process. */ +void +dhclient_force_renew(struct dhclient *cli, int deadline) +{ + /* Drain the receive queue so that we know that any DHCPACK we process is + * freshly received. */ + netdev_drain(cli->netdev); + + switch (cli->state) { + case S_INIT: + case S_INIT_REBOOT: + case S_REBOOTING: + case S_SELECTING: + case S_REQUESTING: + break; + + case S_BOUND: + case S_RENEWING: + case S_REBINDING: + do_force_renew(cli, deadline); + break; + + case S_RELEASED: + dhclient_init(cli, 0); + break; + } +} + +/* Returns true if 'cli' is bound to an IP address, false otherwise. */ +bool +dhclient_is_bound(const struct dhclient *cli) +{ + return cli->state & (S_BOUND | S_RENEWING | S_REBINDING); +} + +/* Returns true if 'cli' has changed from bound to unbound, or vice versa, at + * least once since the last time this function was called. */ +bool +dhclient_changed(struct dhclient *cli) +{ + bool changed = cli->changed; + cli->changed = 0; + return changed; +} + +/* Returns 'cli''s current state, as a string. The caller must not modify or + * free the string. */ +const char * +dhclient_get_state(const struct dhclient *cli) +{ + return state_name(cli->state); +} + +/* Returns the number of seconds spent so far in 'cli''s current state. */ +unsigned int +dhclient_get_state_elapsed(const struct dhclient *cli) +{ + return elapsed_in_this_state(cli); +} + +/* If 'cli' is bound, returns the number of seconds remaining in its lease; + * otherwise, returns 0. */ +unsigned int +dhclient_get_lease_remaining(const struct dhclient *cli) +{ + if (dhclient_is_bound(cli)) { + time_t now = time_now(); + return cli->lease_expiration > now ? cli->lease_expiration - now : 0; + } else { + return 0; + } +} + +/* If 'cli' is bound to an IP address, returns that IP address; otherwise, + * returns 0. */ +uint32_t +dhclient_get_ip(const struct dhclient *cli) +{ + return dhclient_is_bound(cli) ? cli->ipaddr : 0; +} + +/* If 'cli' is bound to an IP address, returns the netmask for that IP address; + * otherwise, returns 0. */ +uint32_t +dhclient_get_netmask(const struct dhclient *cli) +{ + return dhclient_is_bound(cli) ? cli->netmask : 0; +} + +/* If 'cli' is bound to an IP address and 'cli' has a default gateway, returns + * that default gateway; otherwise, returns 0. */ +uint32_t +dhclient_get_router(const struct dhclient *cli) +{ + return dhclient_is_bound(cli) ? cli->router : 0; +} + +/* If 'cli' is bound to an IP address, returns the DHCP message that was + * received to obtain that IP address (so that the caller can obtain additional + * options from it). Otherwise, returns a null pointer. */ +const struct dhcp_msg * +dhclient_get_config(const struct dhclient *cli) +{ + return dhclient_is_bound(cli) ? cli->binding : NULL; +} + +/* Configures the network device backing 'cli' to the network address and other + * parameters obtained via DHCP. If no address is bound on 'cli', removes any + * configured address from 'cli'. + * + * To use a dhclient as a regular DHCP client that binds and unbinds from IP + * addresses in the usual fashion, call this function after dhclient_run() if + * anything has changed, like so: + * + * dhclient_run(cli); + * if (dhclient_changed(cli)) { + * dhclient_configure_netdev(cli); + * } + * + */ +int +dhclient_configure_netdev(struct dhclient *cli) +{ + struct in_addr addr = { dhclient_get_ip(cli) }; + struct in_addr mask = { dhclient_get_netmask(cli) }; + struct in_addr router = { dhclient_get_router(cli) }; + int error; + + error = netdev_set_in4(cli->netdev, addr, mask); + if (error) { + VLOG_ERR("could not set %s address "IP_FMT"/"IP_FMT": %s", + netdev_get_name(cli->netdev), + IP_ARGS(&addr.s_addr), IP_ARGS(&mask.s_addr), + strerror(error)); + } + + if (!error && router.s_addr) { + error = netdev_add_router(router); + if (error) { + VLOG_ERR("failed to add default route to "IP_FMT" on %s: %s", + IP_ARGS(&router), netdev_get_name(cli->netdev), + strerror(error)); + } + } + + return error; +} + +/* If 'cli' is bound and the binding includes DNS domain parameters, updates + * /etc/resolv.conf will be updated to match the received parameters. Returns + * 0 if successful, otherwise a positive errno value. */ +int +dhclient_update_resolv_conf(struct dhclient *cli) +{ + uint32_t dns_server; + char *domain_name; + bool has_domain_name; + char new_name[128]; + FILE *old, *new; + int i; + + if (!dhclient_is_bound(cli)) { + return 0; + } + if (!dhcp_msg_get_ip(cli->binding, DHCP_CODE_DNS_SERVER, 0, &dns_server)) { + VLOG_DBG("binding does not include any DNS servers"); + return 0; + } + + sprintf(new_name, "/etc/resolv.conf.tmp%ld", (long int) getpid()); + new = fopen(new_name, "w"); + if (!new) { + VLOG_WARN("%s: create: %s", new_name, strerror(errno)); + return errno; + } + + domain_name = dhcp_msg_get_string(cli->binding, DHCP_CODE_DOMAIN_NAME); + has_domain_name = domain_name != NULL; + if (domain_name) { + if (strspn(domain_name, "-_.0123456789abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == strlen(domain_name)) { + fprintf(new, "domain %s\n", domain_name); + } else { + VLOG_WARN("ignoring invalid domain name %s", domain_name); + has_domain_name = false; + } + } else { + VLOG_DBG("binding does not include domain name"); + } + free(domain_name); + + for (i = 0; dhcp_msg_get_ip(cli->binding, DHCP_CODE_DNS_SERVER, + i, &dns_server); i++) { + fprintf(new, "nameserver "IP_FMT"\n", IP_ARGS(&dns_server)); + } + + old = fopen("/etc/resolv.conf", "r"); + if (old) { + char line[128]; + + while (fgets(line, sizeof line, old)) { + char *kw = xmemdup0(line, strcspn(line, " \t\r\n")); + if (strcmp(kw, "nameserver") + && (!has_domain_name + || (strcmp(kw, "domain") && strcmp(kw, "search")))) { + fputs(line, new); + } + free(kw); + } + fclose(old); + } else { + VLOG_DBG("/etc/resolv.conf: open: %s", strerror(errno)); + } + + if (fclose(new) < 0) { + VLOG_WARN("%s: close: %s", new_name, strerror(errno)); + return errno; + } + + if (rename(new_name, "/etc/resolv.conf") < 0) { + VLOG_WARN("failed to rename %s to /etc/resolv.conf: %s", + new_name, strerror(errno)); + return errno; + } + + return 0; +} + +/* DHCP protocol. */ + +static void +make_dhcpdiscover(struct dhclient *cli, struct dhcp_msg *msg) +{ + cli->secs = elapsed_in_this_state(cli); + dhclient_msg_init(cli, DHCPDISCOVER, msg); + if (cli->ipaddr) { + dhcp_msg_put_ip(msg, DHCP_CODE_REQUESTED_IP, cli->ipaddr); + } +} + +static void +make_dhcprequest(struct dhclient *cli, struct dhcp_msg *msg) +{ + dhclient_msg_init(cli, DHCPREQUEST, msg); + msg->ciaddr = dhclient_get_ip(cli); + if (cli->state == S_REQUESTING) { + dhcp_msg_put_ip(msg, DHCP_CODE_SERVER_IDENTIFIER, cli->server_ip); + } + dhcp_msg_put_ip(msg, DHCP_CODE_REQUESTED_IP, cli->ipaddr); +} + +static void +do_init(struct dhclient *cli, enum dhclient_state next_state) +{ + if (!cli->init_delay) { + cli->init_delay = fuzz(2, 1); + } + if (timeout(cli, cli->init_delay)) { + state_transition(cli, next_state); + } +} + +static void +dhclient_run_INIT(struct dhclient *cli) +{ + do_init(cli, S_SELECTING); +} + +static void +dhclient_run_INIT_REBOOT(struct dhclient *cli) +{ + do_init(cli, S_REBOOTING); +} + +static void +dhclient_run_REBOOTING(struct dhclient *cli) +{ + send_reliably(cli, make_dhcprequest); + if (!receive_ack(cli) && timeout(cli, 60)) { + state_transition(cli, S_INIT); + } +} + +static bool +dhcp_receive(struct dhclient *cli, unsigned int msgs, struct dhcp_msg *msg) +{ + while (do_receive_msg(cli, msg)) { + if (msg->type > 31 || !((1u << msg->type) & msgs)) { + VLOG_DBG_RL(&rl, "received unexpected %s in %s state: %s", + dhcp_type_name(msg->type), state_name(cli->state), + dhcp_msg_to_string(msg, false, &cli->s)); + } else if (msg->xid != cli->xid) { + VLOG_DBG_RL(&rl, + "ignoring %s with xid != %08"PRIx32" in %s state: %s", + dhcp_type_name(msg->type), msg->xid, + state_name(cli->state), + dhcp_msg_to_string(msg, false, &cli->s)); + } else { + return true; + } + dhcp_msg_uninit(msg); + } + return false; +} + +static bool +validate_offered_options(struct dhclient *cli, const struct dhcp_msg *msg) +{ + uint32_t lease, netmask; + if (!dhcp_msg_get_secs(msg, DHCP_CODE_LEASE_TIME, 0, &lease)) { + VLOG_WARN_RL(&rl, "%s lacks lease time: %s", dhcp_type_name(msg->type), + dhcp_msg_to_string(msg, false, &cli->s)); + } else if (!dhcp_msg_get_ip(msg, DHCP_CODE_SUBNET_MASK, 0, &netmask)) { + VLOG_WARN_RL(&rl, "%s lacks netmask: %s", dhcp_type_name(msg->type), + dhcp_msg_to_string(msg, false, &cli->s)); + } else if (lease < MIN_ACCEPTABLE_LEASE) { + VLOG_WARN_RL(&rl, "Ignoring %s with %"PRIu32"-second lease time: %s", + dhcp_type_name(msg->type), lease, + dhcp_msg_to_string(msg, false, &cli->s)); + } else if (cli->validate_offer && !cli->validate_offer(msg, cli->aux)) { + VLOG_DBG_RL(&rl, "client validation hook refused offer: %s", + dhcp_msg_to_string(msg, false, &cli->s)); + } else { + return true; + } + return false; +} + +static void +dhclient_run_SELECTING(struct dhclient *cli) +{ + struct dhcp_msg msg; + + send_reliably(cli, make_dhcpdiscover); + if (cli->server_ip && timeout(cli, 60)) { + cli->server_ip = 0; + state_transition(cli, S_INIT); + } + for (; dhcp_receive(cli, 1u << DHCPOFFER, &msg); dhcp_msg_uninit(&msg)) { + if (!validate_offered_options(cli, &msg)) { + continue; + } + if (!dhcp_msg_get_ip(&msg, DHCP_CODE_SERVER_IDENTIFIER, + 0, &cli->server_ip)) { + VLOG_WARN_RL(&rl, "DHCPOFFER lacks server identifier: %s", + dhcp_msg_to_string(&msg, false, &cli->s)); + continue; + } + + VLOG_DBG_RL(&rl, "accepting DHCPOFFER: %s", + dhcp_msg_to_string(&msg, false, &cli->s)); + cli->ipaddr = msg.yiaddr; + state_transition(cli, S_REQUESTING); + break; + } +} + +static bool +same_binding(const struct dhcp_msg *old, const struct dhcp_msg *new) +{ + static const int codes[] = { + DHCP_CODE_SUBNET_MASK, + DHCP_CODE_ROUTER, + DHCP_CODE_DNS_SERVER, + DHCP_CODE_HOST_NAME, + DHCP_CODE_DOMAIN_NAME, + DHCP_CODE_IP_TTL, + DHCP_CODE_MTU, + DHCP_CODE_BROADCAST_ADDRESS, + DHCP_CODE_STATIC_ROUTE, + DHCP_CODE_ARP_CACHE_TIMEOUT, + DHCP_CODE_ETHERNET_ENCAPSULATION, + DHCP_CODE_TCP_TTL, + DHCP_CODE_SERVER_IDENTIFIER, + DHCP_CODE_OFP_CONTROLLER_VCONN, + DHCP_CODE_OFP_PKI_URI, + }; + int i; + bool same = true; + + if (old->yiaddr != new->yiaddr) { + VLOG_WARN("DHCP binding changed IP address from "IP_FMT" to "IP_FMT, + IP_ARGS(&old->yiaddr), IP_ARGS(&new->yiaddr)); + same = false; + } + for (i = 0; i < ARRAY_SIZE(codes); i++) { + int code = codes[i]; + const struct dhcp_option *old_opt = &old->options[code]; + const struct dhcp_option *new_opt = &new->options[code]; + if (!dhcp_option_equals(old_opt, new_opt)) { + struct ds old_string = DS_EMPTY_INITIALIZER; + struct ds new_string = DS_EMPTY_INITIALIZER; + VLOG_WARN("DHCP binding changed option from %s to %s", + dhcp_option_to_string(old_opt, code, &old_string), + dhcp_option_to_string(new_opt, code, &new_string)); + ds_destroy(&old_string); + ds_destroy(&new_string); + same = false; + } + } + return same; +} + +static bool +receive_ack(struct dhclient *cli) +{ + struct dhcp_msg msg; + + if (!dhcp_receive(cli, (1u << DHCPACK) | (1u << DHCPNAK), &msg)) { + return false; + } else if (msg.type == DHCPNAK) { + dhcp_msg_uninit(&msg); + state_transition(cli, S_INIT); + return true; + } else if (!validate_offered_options(cli, &msg)) { + dhcp_msg_uninit(&msg); + return false; + } else { + uint32_t lease = 0, t1 = 0, t2 = 0; + + if (cli->binding) { + if (!same_binding(cli->binding, &msg)) { + cli->changed = true; + } + dhcp_msg_uninit(cli->binding); + } else { + cli->binding = xmalloc(sizeof *cli->binding); + } + dhcp_msg_copy(cli->binding, &msg); + + dhcp_msg_get_secs(&msg, DHCP_CODE_LEASE_TIME, 0, &lease); + dhcp_msg_get_secs(&msg, DHCP_CODE_T1, 0, &t1); + dhcp_msg_get_secs(&msg, DHCP_CODE_T2, 0, &t2); + assert(lease >= MIN_ACCEPTABLE_LEASE); + + if (!t2 || t2 >= lease) { + t2 = calc_t2(lease); + } + if (!t1 || t1 >= t2) { + t1 = calc_t1(lease, t2); + } + + cli->lease_expiration = sat_add(time_now(), lease); + cli->bound_timeout = t1; + cli->renewing_timeout = t2 - t1; + cli->rebinding_timeout = lease - t2; + + cli->ipaddr = msg.yiaddr; + dhcp_msg_get_ip(&msg, DHCP_CODE_SUBNET_MASK, 0, &cli->netmask); + if (!dhcp_msg_get_ip(&msg, DHCP_CODE_ROUTER, 0, &cli->router)) { + cli->router = INADDR_ANY; + } + state_transition(cli, S_BOUND); + VLOG_DBG("Bound: %s", dhcp_msg_to_string(&msg, false, &cli->s)); + return true; + } +} + +static void +dhclient_run_REQUESTING(struct dhclient *cli) +{ + send_reliably(cli, make_dhcprequest); + if (!receive_ack(cli) && timeout(cli, 60)) { + state_transition(cli, S_INIT); + } +} + +static void +dhclient_run_BOUND(struct dhclient *cli) +{ + if (timeout(cli, cli->bound_timeout)) { + state_transition(cli, S_RENEWING); + } +} + +static void +dhclient_run_RENEWING(struct dhclient *cli) +{ + send_reliably(cli, make_dhcprequest); + if (!receive_ack(cli) && timeout(cli, cli->renewing_timeout)) { + state_transition(cli, S_REBINDING); + } +} + +static void +dhclient_run_REBINDING(struct dhclient *cli) +{ + send_reliably(cli, make_dhcprequest); + if (!receive_ack(cli) && timeout(cli, cli->rebinding_timeout)) { + state_transition(cli, S_INIT); + } +} + +static void +dhclient_run_RELEASED(struct dhclient *cli UNUSED) +{ + /* Nothing to do. */ +} + +/* Processes the DHCP protocol for 'cli'. */ +void +dhclient_run(struct dhclient *cli) +{ + int old_state; + do { + old_state = cli->state; + cli->min_timeout = UINT_MAX; + cli->received = 0; + switch (cli->state) { +#define DHCLIENT_STATE(NAME, VALUE) \ + case S_##NAME: dhclient_run_##NAME(cli); break; + DHCLIENT_STATES +#undef DHCLIENT_STATE + default: + NOT_REACHED(); + } + } while (cli->state != old_state); +} + +/* Sets up poll timeouts to wake up the poll loop when 'cli' needs to do some + * work. */ +void +dhclient_wait(struct dhclient *cli) +{ + if (cli->min_timeout != UINT_MAX) { + time_t now = time_now(); + unsigned int wake = sat_add(cli->state_entered, cli->min_timeout); + if (wake <= now) { + poll_immediate_wake(); + } else { + poll_timer_wait(sat_mul(sat_sub(wake, now), 1000)); + } + } + /* Reset timeout to 1 second. This will have no effect ordinarily, because + * dhclient_run() will typically set it back to a higher value. If, + * however, the caller fails to call dhclient_run() before its next call to + * dhclient_wait() we won't potentially block forever. */ + cli->min_timeout = 1; + + if (cli->state & (S_SELECTING | S_REQUESTING | S_RENEWING | S_REBINDING)) { + netdev_recv_wait(cli->netdev); + } +} + +static void +state_transition(struct dhclient *cli, enum dhclient_state state) +{ + bool was_bound = dhclient_is_bound(cli); + bool am_bound; + if (cli->state != state) { + VLOG_DBG("entering %s", state_name(state)); + cli->state = state; + } + cli->state_entered = time_now(); + cli->retransmit = cli->delay = 0; + am_bound = dhclient_is_bound(cli); + if (was_bound != am_bound) { + cli->changed = true; + if (am_bound) { + assert(cli->binding != NULL); + VLOG_INFO("%s: obtained address "IP_FMT", netmask "IP_FMT, + netdev_get_name(cli->netdev), + IP_ARGS(&cli->ipaddr), IP_ARGS(&cli->netmask)); + if (cli->router) { + VLOG_INFO("%s: obtained default gateway "IP_FMT, + netdev_get_name(cli->netdev), IP_ARGS(&cli->router)); + } + } else { + dhcp_msg_uninit(cli->binding); + free(cli->binding); + cli->binding = NULL; + + VLOG_INFO("%s: network address unbound", + netdev_get_name(cli->netdev)); + } + } + if (cli->state & (S_SELECTING | S_REQUESTING | S_REBOOTING)) { + netdev_drain(cli->netdev); + } +} + +static void +send_reliably(struct dhclient *cli, + void (*make_packet)(struct dhclient *, struct dhcp_msg *)) +{ + if (timeout(cli, cli->retransmit)) { + struct dhcp_msg msg; + make_packet(cli, &msg); + if (cli->modify_request) { + cli->modify_request(&msg, cli->aux); + } + do_send_msg(cli, &msg); + cli->delay = MIN(cli->max_timeout, MAX(4, cli->delay * 2)); + cli->retransmit += fuzz(cli->delay, 1); + timeout(cli, cli->retransmit); + dhcp_msg_uninit(&msg); + } +} + +static void +dhclient_msg_init(struct dhclient *cli, enum dhcp_msg_type type, + struct dhcp_msg *msg) +{ + dhcp_msg_init(msg); + msg->op = DHCP_BOOTREQUEST; + msg->xid = cli->xid; + msg->secs = cli->secs; + msg->type = type; + memcpy(msg->chaddr, netdev_get_etheraddr(cli->netdev), ETH_ADDR_LEN); +} + +/* If time goes backward this returns a large number, which makes it look like + * we've been in the current state a very long time. That's probably + * fine for that corner case--we'll just expire our lease, etc., and try to + * get a new one. */ +static unsigned int +elapsed_in_this_state(const struct dhclient *cli) +{ + return time_now() - cli->state_entered; +} + +static bool +timeout(struct dhclient *cli, unsigned int secs) +{ + cli->min_timeout = MIN(cli->min_timeout, secs); + return time_now() >= sat_add(cli->state_entered, secs); +} + +static bool +do_receive_msg(struct dhclient *cli, struct dhcp_msg *msg) +{ + struct ofpbuf b; + + ofpbuf_init(&b, netdev_get_mtu(cli->netdev) + VLAN_ETH_HEADER_LEN); + for (; cli->received < 50; cli->received++) { + const struct ip_header *ip; + const struct dhcp_header *dhcp; + flow_t flow; + int error; + + ofpbuf_clear(&b); + error = netdev_recv(cli->netdev, &b); + if (error) { + goto drained; + } + + flow_extract(&b, 0, &flow); + if (flow.dl_type != htons(ETH_TYPE_IP) + || flow.nw_proto != IP_TYPE_UDP + || flow.tp_dst != htons(68) + || !(eth_addr_is_broadcast(flow.dl_dst) + || eth_addr_equals(flow.dl_dst, + netdev_get_etheraddr(cli->netdev)))) { + continue; + } + + ip = b.l3; + if (IP_IS_FRAGMENT(ip->ip_frag_off)) { + /* We don't do reassembly. */ + VLOG_WARN_RL(&rl, "ignoring fragmented DHCP datagram"); + continue; + } + + dhcp = b.l7; + if (!dhcp) { + VLOG_WARN_RL(&rl, "ignoring DHCP datagram with missing payload"); + continue; + } + + ofpbuf_pull(&b, (char *)b.l7 - (char*)b.data); + error = dhcp_parse(msg, &b); + if (!error) { + if (VLOG_IS_DBG_ENABLED()) { + VLOG_DBG_RL(&rl, "received %s", + dhcp_msg_to_string(msg, false, &cli->s)); + } else { + VLOG_INFO_RL(&rl, "received %s", dhcp_type_name(msg->type)); + } + ofpbuf_uninit(&b); + return true; + } + } + netdev_drain(cli->netdev); +drained: + ofpbuf_uninit(&b); + return false; +} + +static void +do_send_msg(struct dhclient *cli, const struct dhcp_msg *msg) +{ + struct ofpbuf b; + struct eth_header eh; + struct ip_header nh; + struct udp_header th; + uint32_t udp_csum; + int error; + + ofpbuf_init(&b, ETH_TOTAL_MAX); + ofpbuf_reserve(&b, ETH_HEADER_LEN + IP_HEADER_LEN + UDP_HEADER_LEN); + + dhcp_assemble(msg, &b); + + memcpy(eh.eth_src, netdev_get_etheraddr(cli->netdev), ETH_ADDR_LEN); + memcpy(eh.eth_dst, eth_addr_broadcast, ETH_ADDR_LEN); + eh.eth_type = htons(ETH_TYPE_IP); + + nh.ip_ihl_ver = IP_IHL_VER(5, IP_VERSION); + nh.ip_tos = 0; + nh.ip_tot_len = htons(IP_HEADER_LEN + UDP_HEADER_LEN + b.size); + /* We can't guarantee uniqueness of ip_id versus the host's, screwing up + * fragment reassembly, so prevent fragmentation and use an all-zeros + * ip_id. RFC 791 doesn't say we can do this, but Linux does the same + * thing for DF packets, so it must not screw anything up. */ + nh.ip_id = 0; + nh.ip_frag_off = htons(IP_DONT_FRAGMENT); + nh.ip_ttl = 64; + nh.ip_proto = IP_TYPE_UDP; + nh.ip_csum = 0; + nh.ip_src = dhclient_get_ip(cli); + /* XXX need to use UDP socket for nonzero server IPs so that we can get + * routing table support. + * + * if (...have server IP and in appropriate state...) { + * nh.ip_dst = cli->server_ip; + * } else { + * nh.ip_dst = INADDR_BROADCAST; + * } + */ + nh.ip_dst = INADDR_BROADCAST; + nh.ip_csum = csum(&nh, sizeof nh); + + th.udp_src = htons(66); + th.udp_dst = htons(67); + th.udp_len = htons(UDP_HEADER_LEN + b.size); + th.udp_csum = 0; + udp_csum = csum_add32(0, nh.ip_src); + udp_csum = csum_add32(udp_csum, nh.ip_dst); + udp_csum = csum_add16(udp_csum, IP_TYPE_UDP << 8); + udp_csum = csum_add16(udp_csum, th.udp_len); + udp_csum = csum_continue(udp_csum, &th, sizeof th); + th.udp_csum = csum_finish(csum_continue(udp_csum, b.data, b.size)); + + ofpbuf_push(&b, &th, sizeof th); + ofpbuf_push(&b, &nh, sizeof nh); + ofpbuf_push(&b, &eh, sizeof eh); + + /* Don't try to send the frame if it's too long for an Ethernet frame. We + * disregard the network device's actual MTU because we don't want the + * frame to have to be discarded or fragmented if it travels over a regular + * Ethernet at some point. 1500 bytes should be enough for anyone. */ + if (b.size <= ETH_TOTAL_MAX) { + if (VLOG_IS_DBG_ENABLED()) { + VLOG_DBG("sending %s", dhcp_msg_to_string(msg, false, &cli->s)); + } else { + VLOG_INFO("sending %s", dhcp_type_name(msg->type)); + } + error = netdev_send(cli->netdev, &b); + if (error) { + VLOG_ERR("send failed on %s: %s", + netdev_get_name(cli->netdev), strerror(error)); + } + } else { + VLOG_ERR("cannot send %zu-byte Ethernet frame", b.size); + } + + ofpbuf_uninit(&b); +} + +static unsigned int +fuzz(unsigned int x, int max_fuzz) +{ + /* Generate number in range [-max_fuzz, +max_fuzz]. */ + int fuzz = random_range(max_fuzz * 2 + 1) - max_fuzz; + unsigned int y = x + fuzz; + return fuzz >= 0 ? (y >= x ? y : UINT_MAX) : (y <= x ? y : 0); +} + +static unsigned int +clamp(unsigned int x, unsigned int min, unsigned int max) +{ + return x < min ? min : x > max ? max : x; +} + +static unsigned int +calc_t2(unsigned int lease) +{ + unsigned int base = lease * 0.875; + return lease >= 60 ? clamp(fuzz(base, 10), 0, lease - 1) : base; +} + +static unsigned int +calc_t1(unsigned int lease, unsigned int t2) +{ + unsigned int base = lease / 2; + return lease >= 60 ? clamp(fuzz(base, 10), 0, t2 - 1) : base; +} diff --git a/lib/dhcp-client.h b/lib/dhcp-client.h new file mode 100644 index 00000000..b37cac61 --- /dev/null +++ b/lib/dhcp-client.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef DHCP_CLIENT_H +#define DHCP_CLIENT_H 1 + +#include +#include + +struct dhclient; +struct dhcp_msg; +struct netdev; +int dhclient_create(const char *netdev, + void (*modify_request)(struct dhcp_msg *, void *aux), + bool (*validate_offer)(const struct dhcp_msg *, void *aux), + void *aux, struct dhclient **); +void dhclient_set_max_timeout(struct dhclient *, unsigned int max_timeout); +void dhclient_destroy(struct dhclient *); + +struct netdev *dhclient_get_netdev(struct dhclient *); + +void dhclient_init(struct dhclient *, uint32_t requested_ip); +void dhclient_release(struct dhclient *); +void dhclient_force_renew(struct dhclient *, int deadline); +bool dhclient_is_bound(const struct dhclient *); +bool dhclient_changed(struct dhclient *); + +const char *dhclient_get_state(const struct dhclient *); +unsigned int dhclient_get_state_elapsed(const struct dhclient *); +unsigned int dhclient_get_lease_remaining(const struct dhclient *); + +uint32_t dhclient_get_ip(const struct dhclient *); +uint32_t dhclient_get_netmask(const struct dhclient *); +uint32_t dhclient_get_router(const struct dhclient *); +const struct dhcp_msg *dhclient_get_config(const struct dhclient *); + +int dhclient_configure_netdev(struct dhclient *); +int dhclient_update_resolv_conf(struct dhclient *); + +void dhclient_run(struct dhclient *); +void dhclient_wait(struct dhclient *); + +#endif /* dhcp-client.h */ diff --git a/lib/dhcp.c b/lib/dhcp.c new file mode 100644 index 00000000..b7a1f1f0 --- /dev/null +++ b/lib/dhcp.c @@ -0,0 +1,825 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "dhcp.h" +#include +#include +#include +#include +#include +#include +#include "dynamic-string.h" +#include "ofpbuf.h" + +#define THIS_MODULE VLM_dhcp +#include "vlog.h" + +/* Information about a DHCP argument type. */ +struct arg_type { + const char *name; /* Name. */ + size_t size; /* Number of bytes per argument. */ +}; + +static struct arg_type types[] = { +#define DHCP_ARG(NAME, SIZE) [DHCP_ARG_##NAME] = {#NAME, SIZE}, + DHCP_ARGS +#undef DHCP_ARG +}; + +/* Information about a DHCP option. */ +struct option_class { + const char *name; /* Name. */ + enum dhcp_arg_type type; /* Argument type. */ + size_t min_args; /* Minimum number of arguments. */ + size_t max_args; /* Maximum number of arguments. */ +}; + +static const struct option_class * +get_option_class(int code) +{ + static struct option_class classes[DHCP_N_OPTIONS]; + static bool init = false; + if (!init) { + int i; + + init = true; +#define DHCP_OPT(NAME, CODE, TYPE, MIN, MAX) \ + classes[CODE].name = #NAME; \ + classes[CODE].type = DHCP_ARG_##TYPE; \ + classes[CODE].min_args = MIN; \ + classes[CODE].max_args = MAX; + DHCP_OPTS +#undef DHCP_OPT + + for (i = 0; i < DHCP_N_OPTIONS; i++) { + if (!classes[i].name) { + classes[i].name = xasprintf("option-%d", i); + classes[i].type = DHCP_ARG_UINT8; + classes[i].min_args = 0; + classes[i].max_args = SIZE_MAX; + } + } + } + assert(code >= 0 && code < DHCP_N_OPTIONS); + return &classes[code]; +} + +/* A single (bad) DHCP message can in theory dump out many, many log messages, + * especially at high logging levels, so the burst size is set quite high + * here to avoid missing useful information. */ +struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 600); + +static void copy_data(struct dhcp_msg *); + +const char * +dhcp_type_name(enum dhcp_msg_type type) +{ + switch (type) { +#define DHCP_MSG(NAME, VALUE) case NAME: return #NAME; + DHCP_MSGS +#undef DHCP_MSG + } + return "<>"; +} + +/* Initializes 'msg' as a DHCP message. The message should be freed with + * dhcp_msg_uninit() when it is no longer needed. */ +void +dhcp_msg_init(struct dhcp_msg *msg) +{ + memset(msg, 0, sizeof *msg); +} + +/* Frees the contents of 'msg'. The caller is responsible for freeing 'msg', + * if necessary. */ +void +dhcp_msg_uninit(struct dhcp_msg *msg) +{ + if (msg) { + free(msg->data); + } +} + +/* Initializes 'dst' as a copy of 'src'. 'dst' (and 'src') should be freed + * with dhcp_msg_uninit() when it is no longer needed. */ +void +dhcp_msg_copy(struct dhcp_msg *dst, const struct dhcp_msg *src) +{ + *dst = *src; + dst->data_allocated = src->data_used; + dst->data_used = 0; + dst->data = xmalloc(dst->data_allocated); + copy_data(dst); +} + +static void +prealloc_data(struct dhcp_msg *msg, size_t n) +{ + size_t needed = msg->data_used + n; + if (needed > msg->data_allocated) { + uint8_t *old_data = msg->data; + msg->data_allocated = MAX(needed * 2, 64); + msg->data = xmalloc(msg->data_allocated); + if (old_data) { + copy_data(msg); + free(old_data); + } + } +} + +static void * +append_data(struct dhcp_msg *msg, const void *data, size_t n) +{ + uint8_t *p = &msg->data[msg->data_used]; + memcpy(p, data, n); + msg->data_used += n; + return p; +} + +static void +copy_data(struct dhcp_msg *msg) +{ + int code; + + msg->data_used = 0; + for (code = 0; code < DHCP_N_OPTIONS; code++) { + struct dhcp_option *opt = &msg->options[code]; + if (opt->data) { + assert(msg->data_used + opt->n <= msg->data_allocated); + opt->data = append_data(msg, opt->data, opt->n); + } + } +} + +/* Appends the 'n' bytes in 'data' to the DHCP option in 'msg' represented by + * 'code' (which must be in the range 0...DHCP_N_OPTIONS). */ +void +dhcp_msg_put(struct dhcp_msg *msg, int code, + const void *data, size_t n) +{ + struct dhcp_option *opt; + if (code == DHCP_CODE_PAD || code == DHCP_CODE_END) { + return; + } + + opt = &msg->options[code]; + prealloc_data(msg, n + opt->n); + if (opt->n) { + if (&msg->data[msg->data_used - opt->n] != opt->data) { + opt->data = append_data(msg, opt->data, opt->n); + } + append_data(msg, data, n); + } else { + opt->data = append_data(msg, data, n); + } + opt->n += n; +} + +/* Appends the boolean value 'b', as a octet with value 0 (false) or 1 (true), + * to the DHCP option in 'msg' represented by 'code' (which must be in the + * range 0...DHCP_N_OPTIONS). */ +void +dhcp_msg_put_bool(struct dhcp_msg *msg, int code, bool b_) +{ + char b = !!b_; + dhcp_msg_put(msg, code, &b, 1); +} + +/* Appends the number of seconds 'secs', as a 32-bit number in network byte + * order, to the DHCP option in 'msg' represented by 'code' (which must be in + * the range 0...DHCP_N_OPTIONS). */ +void +dhcp_msg_put_secs(struct dhcp_msg *msg, int code, uint32_t secs_) +{ + uint32_t secs = htonl(secs_); + dhcp_msg_put(msg, code, &secs, sizeof secs); +} + +/* Appends the IP address 'ip', as a 32-bit number in network byte order, to + * the DHCP option in 'msg' represented by 'code' (which must be in the range + * 0...DHCP_N_OPTIONS). */ +void +dhcp_msg_put_ip(struct dhcp_msg *msg, int code, uint32_t ip) +{ + dhcp_msg_put(msg, code, &ip, sizeof ip); +} + +/* Appends the ASCII string 'string', to the DHCP option in 'msg' represented + * by 'code' (which must be in the range 0...DHCP_N_OPTIONS). */ +void +dhcp_msg_put_string(struct dhcp_msg *msg, int code, const char *string) +{ + dhcp_msg_put(msg, code, string, strlen(string)); +} + +/* Appends octet 'x' to DHCP option in 'msg' represented by 'code' (which must + * be in the range 0...DHCP_N_OPTIONS). */ +void +dhcp_msg_put_uint8(struct dhcp_msg *msg, int code, uint8_t x) +{ + dhcp_msg_put(msg, code, &x, sizeof x); +} + +/* Appends the 'n' octets in 'data' to DHCP option in 'msg' represented by + * 'code' (which must be in the range 0...DHCP_N_OPTIONS). */ +void dhcp_msg_put_uint8_array(struct dhcp_msg *msg, int code, + const uint8_t data[], size_t n) +{ + dhcp_msg_put(msg, code, data, n); +} + +/* Appends the 16-bit value in 'x', in network byte order, to DHCP option in + * 'msg' represented by 'code' (which must be in the range + * 0...DHCP_N_OPTIONS). */ +void +dhcp_msg_put_uint16(struct dhcp_msg *msg, int code, uint16_t x_) +{ + uint16_t x = htons(x_); + dhcp_msg_put(msg, code, &x, sizeof x); +} + + +/* Appends the 'n' 16-bit values in 'data', in network byte order, to DHCP + * option in 'msg' represented by 'code' (which must be in the range + * 0...DHCP_N_OPTIONS). */ +void +dhcp_msg_put_uint16_array(struct dhcp_msg *msg, int code, + const uint16_t data[], size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) { + dhcp_msg_put_uint16(msg, code, data[i]); + } +} + +/* Returns a pointer to the 'size' bytes starting at byte offset 'offset' in + * the DHCP option in 'msg' represented by 'code' (which must be in the range + * 0...DHCP_N_OPTIONS). If the option has fewer than 'offset + size' bytes, + * returns a null pointer. */ +const void * +dhcp_msg_get(const struct dhcp_msg *msg, int code, + size_t offset, size_t size) +{ + const struct dhcp_option *opt = &msg->options[code]; + return offset + size <= opt->n ? (const char *) opt->data + offset : NULL; +} + +/* Stores in '*out' the boolean value at byte offset 'offset' in the DHCP + * option in 'msg' represented by 'code' (which must be in the range + * 0...DHCP_N_OPTIONS). Returns true if successful, false if the option has + * fewer than 'offset + 1' bytes. */ +bool +dhcp_msg_get_bool(const struct dhcp_msg *msg, int code, size_t offset, + bool *out) +{ + const uint8_t *uint8 = dhcp_msg_get(msg, code, offset, sizeof *uint8); + if (uint8) { + *out = *uint8 != 0; + return true; + } else { + return false; + } +} + +/* Stores in '*out' the 32-bit count of seconds at offset 'offset' (in + * 4-byte increments) in the DHCP option in 'msg' represented by 'code' + * (which must be in the range 0...DHCP_N_OPTIONS). The value is converted to + * native byte order. Returns true if successful, false if the option has + * fewer than '4 * (offset + 1)' bytes. */ +bool +dhcp_msg_get_secs(const struct dhcp_msg *msg, int code, size_t offset, + uint32_t *out) +{ + const uint32_t *uint32 = dhcp_msg_get(msg, code, offset * sizeof *uint32, + sizeof *uint32); + if (uint32) { + *out = ntohl(*uint32); + return true; + } else { + return false; + } +} + +/* Stores in '*out' the IP address at offset 'offset' (in 4-byte increments) in + * the DHCP option in 'msg' represented by 'code' (which must be in the range + * 0...DHCP_N_OPTIONS). The IP address is stored in network byte order. + * Returns true if successful, false if the option has fewer than '4 * (offset + * + 1)' bytes. */ +bool +dhcp_msg_get_ip(const struct dhcp_msg *msg, int code, + size_t offset, uint32_t *out) +{ + const uint32_t *uint32 = dhcp_msg_get(msg, code, offset * sizeof *uint32, + sizeof *uint32); + if (uint32) { + *out = *uint32; + return true; + } else { + return false; + } +} + +/* Returns the string in the DHCP option in 'msg' represented by 'code' (which + * must be in the range 0...DHCP_N_OPTIONS). The caller is responsible for + * freeing the string with free(). + * + * If 'msg' has no option represented by 'code', returns a null pointer. (If + * the option was specified but had no content, then an empty string is + * returned, not a null pointer.) */ +char * +dhcp_msg_get_string(const struct dhcp_msg *msg, int code) +{ + const struct dhcp_option *opt = &msg->options[code]; + return opt->data ? xmemdup0(opt->data, opt->n) : NULL; +} + +/* Stores in '*out' the octet at byte offset 'offset' in the DHCP option in + * 'msg' represented by 'code' (which must be in the range 0...DHCP_N_OPTIONS). + * Returns true if successful, false if the option has fewer than 'offset + 1' + * bytes. */ +bool +dhcp_msg_get_uint8(const struct dhcp_msg *msg, int code, + size_t offset, uint8_t *out) +{ + const uint8_t *uint8 = dhcp_msg_get(msg, code, offset, sizeof *uint8); + if (uint8) { + *out = *uint8; + return true; + } else { + return false; + } +} + +/* Stores in '*out' the 16-bit value at offset 'offset' (in 2-byte units) in + * the DHCP option in 'msg' represented by 'code' (which must be in the range + * 0...DHCP_N_OPTIONS). The value is converted to native byte order. Returns + * true if successful, false if the option has fewer than '2 * (offset + 1)' + * bytes. */ +bool +dhcp_msg_get_uint16(const struct dhcp_msg *msg, int code, + size_t offset, uint16_t *out) +{ + const uint16_t *uint16 = dhcp_msg_get(msg, code, offset * sizeof *uint16, + sizeof *uint16); + if (uint16) { + *out = ntohs(*uint16); + return true; + } else { + return false; + } +} + +/* Appends a string representing 'duration' seconds to 'ds'. */ +static void +put_duration(struct ds *ds, unsigned int duration) +{ + if (duration) { + if (duration >= 86400) { + ds_put_format(ds, "%ud", duration / 86400); + duration %= 86400; + } + if (duration >= 3600) { + ds_put_format(ds, "%uh", duration / 3600); + duration %= 3600; + } + if (duration >= 60) { + ds_put_format(ds, "%umin", duration / 60); + duration %= 60; + } + if (duration > 0) { + ds_put_format(ds, "%us", duration); + } + } else { + ds_put_cstr(ds, "0s"); + } +} + +/* Appends a string representation of 'opt', which has the given 'code', to + * 'ds'. */ +const char * +dhcp_option_to_string(const struct dhcp_option *opt, int code, struct ds *ds) +{ + const struct option_class *class = get_option_class(code); + const struct arg_type *type = &types[class->type]; + size_t offset; + const char *cp; + + for (cp = class->name; *cp; cp++) { + unsigned char c = *cp; + ds_put_char(ds, c == '_' ? '-' : tolower(c)); + } + ds_put_char(ds, '='); + + if (!opt->data || !opt->n) { + ds_put_cstr(ds, opt->data ? "empty" : "null"); + return ds_cstr(ds); + } + + if (class->type == DHCP_ARG_STRING) { + ds_put_char(ds, '"'); + ds_put_printable(ds, opt->data, opt->n); + ds_put_char(ds, '"'); + return ds_cstr(ds); + } + for (offset = 0; offset + type->size <= opt->n; offset += type->size) { + const void *p = (const char *) opt->data + offset; + const uint8_t *uint8 = p; + const uint32_t *uint32 = p; + const uint16_t *uint16 = p; + + if (offset && class->type != DHCP_ARG_STRING) { + ds_put_cstr(ds, class->type == DHCP_ARG_UINT8 ? ":" : ", "); + } + switch (class->type) { + case DHCP_ARG_FIXED: + NOT_REACHED(); + case DHCP_ARG_IP: + ds_put_format(ds, IP_FMT, IP_ARGS(uint32)); + break; + case DHCP_ARG_UINT8: + ds_put_format(ds, "%02"PRIx8, *uint8); + break; + case DHCP_ARG_UINT16: + ds_put_format(ds, "%"PRIu16, ntohs(*uint16)); + break; + case DHCP_ARG_UINT32: + ds_put_format(ds, "%"PRIu32, ntohl(*uint32)); + break; + case DHCP_ARG_SECS: + put_duration(ds, ntohl(*uint32)); + break; + case DHCP_ARG_STRING: + NOT_REACHED(); + case DHCP_ARG_BOOLEAN: + if (*uint8 == 0) { + ds_put_cstr(ds, "false"); + } else if (*uint8 == 1) { + ds_put_cstr(ds, "true"); + } else { + ds_put_format(ds, "**%"PRIu8"**", *uint8); + } + break; + } + } + if (offset != opt->n) { + if (offset) { + ds_put_cstr(ds, ", "); + } + ds_put_cstr(ds, "**leftovers:"); + for (; offset < opt->n; offset++) { + const void *p = (const char *) opt->data + offset; + const uint8_t *uint8 = p; + ds_put_format(ds, " %"PRIu8, *uint8); + } + ds_put_cstr(ds, "**"); + } + return ds_cstr(ds); +} + +/* Returns true if 'a' and 'b' have the same content, false otherwise. */ +bool +dhcp_option_equals(const struct dhcp_option *a, const struct dhcp_option *b) +{ + return ((a->data != NULL) == (b->data != NULL) + && a->n == b->n + && !memcmp(a->data, b->data, a->n)); +} + +/* Replaces 'ds' by a string representation of 'msg'. If 'multiline' is + * false, 'ds' receives a single-line representation of 'msg', otherwise a + * multiline representation. */ +const char * +dhcp_msg_to_string(const struct dhcp_msg *msg, bool multiline, struct ds *ds) +{ + char separator = multiline ? '\n' : ' '; + int code; + + ds_clear(ds); + ds_put_format(ds, "op=%s", + (msg->op == DHCP_BOOTREQUEST ? "request" + : msg->op == DHCP_BOOTREPLY ? "reply" + : "error")); + ds_put_format(ds, "%ctype=%s", separator, dhcp_type_name(msg->type)); + ds_put_format(ds, "%cxid=0x%08"PRIx32, separator, msg->xid); + ds_put_format(ds, "%csecs=", separator); + put_duration(ds, msg->secs); + if (msg->flags) { + ds_put_format(ds, "%cflags=", separator); + if (msg->flags & DHCP_FLAGS_BROADCAST) { + ds_put_cstr(ds, "[BROADCAST]"); + } + if (msg->flags & DHCP_FLAGS_MBZ) { + ds_put_format(ds, "[0x%04"PRIx16"]", msg->flags & DHCP_FLAGS_MBZ); + } + } + if (msg->ciaddr) { + ds_put_format(ds, "%cciaddr="IP_FMT, separator, IP_ARGS(&msg->ciaddr)); + } + if (msg->yiaddr) { + ds_put_format(ds, "%cyiaddr="IP_FMT, separator, IP_ARGS(&msg->yiaddr)); + } + if (msg->siaddr) { + ds_put_format(ds, "%csiaddr="IP_FMT, separator, IP_ARGS(&msg->siaddr)); + } + if (msg->giaddr) { + ds_put_format(ds, "%cgiaddr="IP_FMT, separator, IP_ARGS(&msg->giaddr)); + } + ds_put_format(ds, "%cchaddr="ETH_ADDR_FMT, + separator, ETH_ADDR_ARGS(msg->chaddr)); + + for (code = 0; code < DHCP_N_OPTIONS; code++) { + const struct dhcp_option *opt = &msg->options[code]; + if (opt->data) { + ds_put_char(ds, separator); + dhcp_option_to_string(opt, code, ds); + } + } + if (multiline) { + ds_put_char(ds, separator); + } + return ds_cstr(ds); +} + +static void +parse_options(struct dhcp_msg *msg, const char *name, void *data, size_t size, + int option_offset) +{ + struct ofpbuf b; + + b.data = data; + b.size = size; + for (;;) { + uint8_t *code, *len; + void *payload; + + code = ofpbuf_try_pull(&b, 1); + if (!code || *code == DHCP_CODE_END) { + break; + } else if (*code == DHCP_CODE_PAD) { + continue; + } + + len = ofpbuf_try_pull(&b, 1); + if (!len) { + VLOG_DBG_RL(&rl, "reached end of %s expecting length byte", name); + break; + } + + payload = ofpbuf_try_pull(&b, *len); + if (!payload) { + VLOG_DBG_RL(&rl, "expected %"PRIu8" bytes of option-%"PRIu8" " + "payload with only %zu bytes of %s left", + *len, *code, b.size, name); + break; + } + dhcp_msg_put(msg, *code + option_offset, payload, *len); + } +} + +static void +validate_options(struct dhcp_msg *msg) +{ + int code; + + for (code = 0; code < DHCP_N_OPTIONS; code++) { + struct dhcp_option *opt = &msg->options[code]; + const struct option_class *class = get_option_class(code); + struct arg_type *type = &types[class->type]; + if (opt->data) { + size_t n_elems = opt->n / type->size; + size_t remainder = opt->n % type->size; + bool ok = true; + if (remainder) { + VLOG_DBG_RL(&rl, "%s option has %zu %zu-byte %s arguments " + "with %zu bytes left over", + class->name, n_elems, type->size, + type->name, remainder); + ok = false; + } + if (n_elems < class->min_args || n_elems > class->max_args) { + VLOG_DBG_RL(&rl, "%s option has %zu %zu-byte %s arguments but " + "between %zu and %zu are required", + class->name, n_elems, type->size, type->name, + class->min_args, class->max_args); + ok = false; + } + if (!ok) { + struct ds ds = DS_EMPTY_INITIALIZER; + VLOG_DBG_RL(&rl, "%s option contains: %s", class->name, + dhcp_option_to_string(opt, code, &ds)); + ds_destroy(&ds); + + opt->n = 0; + opt->data = NULL; + } + } + } +} + +/* Attempts to parse 'b' as a DHCP message. If successful, initializes '*msg' + * to the parsed message and returns 0. Otherwise, returns a positive errno + * value and '*msg' is indeterminate. */ +int +dhcp_parse(struct dhcp_msg *msg, const struct ofpbuf *b_) +{ + struct ofpbuf b = *b_; + struct dhcp_header *dhcp; + uint32_t *cookie; + uint8_t type; + char *vendor_class; + + dhcp = ofpbuf_try_pull(&b, sizeof *dhcp); + if (!dhcp) { + VLOG_DBG_RL(&rl, "buffer too small for DHCP header (%zu bytes)", + b.size); + goto error; + } + + if (dhcp->op != DHCP_BOOTREPLY && dhcp->op != DHCP_BOOTREQUEST) { + VLOG_DBG_RL(&rl, "invalid DHCP op (%"PRIu8")", dhcp->op); + goto error; + } + if (dhcp->htype != ARP_HRD_ETHERNET) { + VLOG_DBG_RL(&rl, "invalid DHCP htype (%"PRIu8")", dhcp->htype); + goto error; + } + if (dhcp->hlen != ETH_ADDR_LEN) { + VLOG_DBG_RL(&rl, "invalid DHCP hlen (%"PRIu8")", dhcp->hlen); + goto error; + } + + dhcp_msg_init(msg); + msg->op = dhcp->op; + msg->xid = ntohl(dhcp->xid); + msg->secs = ntohs(dhcp->secs); + msg->flags = ntohs(dhcp->flags); + msg->ciaddr = dhcp->ciaddr; + msg->yiaddr = dhcp->yiaddr; + msg->siaddr = dhcp->siaddr; + msg->giaddr = dhcp->giaddr; + memcpy(msg->chaddr, dhcp->chaddr, ETH_ADDR_LEN); + + cookie = ofpbuf_try_pull(&b, sizeof cookie); + if (cookie) { + if (ntohl(*cookie) == DHCP_OPTS_COOKIE) { + uint8_t overload; + + parse_options(msg, "options", b.data, b.size, 0); + if (dhcp_msg_get_uint8(msg, DHCP_CODE_OPTION_OVERLOAD, + 0, &overload)) { + if (overload & 1) { + parse_options(msg, "file", dhcp->file, sizeof dhcp->file, + 0); + } + if (overload & 2) { + parse_options(msg, "sname", + dhcp->sname, sizeof dhcp->sname, 0); + } + } + } else { + VLOG_DBG_RL(&rl, "bad DHCP options cookie: %08"PRIx32, + ntohl(*cookie)); + } + } else { + VLOG_DBG_RL(&rl, "DHCP packet has no options"); + } + + vendor_class = dhcp_msg_get_string(msg, DHCP_CODE_VENDOR_CLASS); + if (vendor_class && !strcmp(vendor_class, "OpenFlow")) { + parse_options(msg, "vendor-specific", + msg->options[DHCP_CODE_VENDOR_SPECIFIC].data, + msg->options[DHCP_CODE_VENDOR_SPECIFIC].n, + DHCP_VENDOR_OFS); + } + free(vendor_class); + + validate_options(msg); + if (!dhcp_msg_get_uint8(msg, DHCP_CODE_DHCP_MSG_TYPE, 0, &type)) { + VLOG_DBG_RL(&rl, "missing DHCP message type"); + dhcp_msg_uninit(msg); + goto error; + } + msg->type = type; + return 0; + +error: + if (VLOG_IS_DBG_ENABLED()) { + struct ds ds; + + ds_init(&ds); + ds_put_hex_dump(&ds, b_->data, b_->size, 0, true); + VLOG_DBG_RL(&rl, "invalid DHCP message dump:\n%s", ds_cstr(&ds)); + + ds_clear(&ds); + dhcp_msg_to_string(msg, false, &ds); + VLOG_DBG_RL(&rl, "partially dissected DHCP message: %s", ds_cstr(&ds)); + + ds_destroy(&ds); + } + return EPROTO; +} + +static void +put_option_chunk(struct ofpbuf *b, uint8_t code, void *data, size_t n) +{ + uint8_t header[2]; + + assert(n < 256); + header[0] = code; + header[1] = n; + ofpbuf_put(b, header, sizeof header); + ofpbuf_put(b, data, n); +} + +static void +put_option(struct ofpbuf *b, uint8_t code, void *data, size_t n) +{ + if (data) { + if (n) { + /* Divide the data into chunks of 255 bytes or less. Make + * intermediate chunks multiples of 8 bytes in case the + * recipient validates a chunk at a time instead of the + * concatenated value. */ + uint8_t *p = data; + while (n) { + size_t chunk = n > 255 ? 248 : n; + put_option_chunk(b, code, p, chunk); + p += chunk; + n -= chunk; + } + } else { + /* Option should be present but carry no data. */ + put_option_chunk(b, code, NULL, 0); + } + } +} + +/* Appends to 'b' the DHCP message represented by 'msg'. */ +void +dhcp_assemble(const struct dhcp_msg *msg, struct ofpbuf *b) +{ + const uint8_t end = DHCP_CODE_END; + uint32_t cookie = htonl(DHCP_OPTS_COOKIE); + struct ofpbuf vnd_data; + struct dhcp_header dhcp; + int i; + + memset(&dhcp, 0, sizeof dhcp); + dhcp.op = msg->op; + dhcp.htype = ARP_HRD_ETHERNET; + dhcp.hlen = ETH_ADDR_LEN; + dhcp.hops = 0; + dhcp.xid = htonl(msg->xid); + dhcp.secs = htons(msg->secs); + dhcp.flags = htons(msg->flags); + dhcp.ciaddr = msg->ciaddr; + dhcp.yiaddr = msg->yiaddr; + dhcp.siaddr = msg->siaddr; + dhcp.giaddr = msg->giaddr; + memcpy(dhcp.chaddr, msg->chaddr, ETH_ADDR_LEN); + ofpbuf_put(b, &dhcp, sizeof dhcp); + ofpbuf_put(b, &cookie, sizeof cookie); + + /* Put DHCP message type first. (The ordering is not required but it + * seems polite.) */ + if (msg->type) { + uint8_t type = msg->type; + put_option(b, DHCP_CODE_DHCP_MSG_TYPE, &type, 1); + } + + /* Put the standard options. */ + for (i = 0; i < DHCP_VENDOR_OFS; i++) { + const struct dhcp_option *option = &msg->options[i]; + put_option(b, i, option->data, option->n); + } + + /* Assemble vendor specific option and put it. */ + ofpbuf_init(&vnd_data, 0); + for (i = DHCP_VENDOR_OFS; i < DHCP_N_OPTIONS; i++) { + const struct dhcp_option *option = &msg->options[i]; + put_option(&vnd_data, i - DHCP_VENDOR_OFS, option->data, option->n); + } + if (vnd_data.size) { + put_option(b, DHCP_CODE_VENDOR_SPECIFIC, vnd_data.data, vnd_data.size); + } + ofpbuf_uninit(&vnd_data); + + /* Put end-of-options option. */ + ofpbuf_put(b, &end, sizeof end); +} + diff --git a/lib/dhcp.h b/lib/dhcp.h new file mode 100644 index 00000000..eaeb6120 --- /dev/null +++ b/lib/dhcp.h @@ -0,0 +1,262 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef DHCP_H +#define DHCP_H 1 + +#include +#include "packets.h" +#include "util.h" + +struct ds; +struct ofpbuf; + +/* Values for 'op' field. */ +#define DHCP_BOOTREQUEST 1 /* Message sent by DHCP client. */ +#define DHCP_BOOTREPLY 2 /* Message sent by DHCP server. */ + +/* Bits in 'flags' field. */ +#define DHCP_FLAGS_BROADCAST 0x8000 /* Server must broadcast all replies. */ +#define DHCP_FLAGS_MBZ 0x7fff /* Must be zero. */ + +/* First four bytes of 'options' field. */ +#define DHCP_OPTS_COOKIE 0x63825363 + +#define DHCP_HEADER_LEN 236 +struct dhcp_header { + uint8_t op; /* DHCP_BOOTREQUEST or DHCP_BOOTREPLY. */ + uint8_t htype; /* ARP_HRD_ETHERNET (typically). */ + uint8_t hlen; /* ETH_ADDR_LEN (typically). */ + uint8_t hops; /* Hop count; set to 0 by client. */ + uint32_t xid; /* Transaction ID. */ + uint16_t secs; /* Since client started address acquisition. */ + uint16_t flags; /* DHCP_FLAGS_*. */ + uint32_t ciaddr; /* Client IP, if it has a lease for one. */ + uint32_t yiaddr; /* Client ("your") IP address. */ + uint32_t siaddr; /* Next server IP address. */ + uint32_t giaddr; /* Relay agent IP address. */ + uint8_t chaddr[16]; /* Client hardware address. */ + char sname[64]; /* Optional server host name. */ + char file[128]; /* Boot file name. */ + /* Followed by variable-length options field. */ +}; +BUILD_ASSERT_DECL(DHCP_HEADER_LEN == sizeof(struct dhcp_header)); + +#define DHCP_ARGS \ + DHCP_ARG(FIXED, 0) /* Fixed-length option (PAD and END only). */ \ + DHCP_ARG(IP, 4) /* IP addresses. */ \ + DHCP_ARG(SECS, 4) /* 32-bit duration in seconds. */ \ + DHCP_ARG(STRING, 1) /* NVT string, optionally null-terminated. */ \ + DHCP_ARG(UINT8, 1) /* 8-bit unsigned integer. */ \ + DHCP_ARG(UINT16, 2) /* 16-bit unsigned integer. */ \ + DHCP_ARG(UINT32, 4) /* 32-bit unsigned integer. */ \ + DHCP_ARG(BOOLEAN, 1) /* Boolean octet (0 or 1). */ + +/* DHCP option argument types. */ +enum dhcp_arg_type { +#define DHCP_ARG(NAME, SIZE) DHCP_ARG_##NAME, + DHCP_ARGS +#undef DHCP_ARG +}; + +#define DHCP_MSGS \ + DHCP_MSG(DHCPDISCOVER, 1) /* Client->server: What IPs are available? */ \ + DHCP_MSG(DHCPOFFER, 2) /* Server->client: This IP is available. */ \ + DHCP_MSG(DHCPREQUEST, 3) /* Client->server: I want that IP. */ \ + DHCP_MSG(DHCPDECLINE, 4) /* Client->server: That IP is in use!. */ \ + DHCP_MSG(DHCPACK, 5) /* Server->client: You can have that IP. */ \ + DHCP_MSG(DHCPNAK, 6) /* Server->client: You can't have that IP. */ \ + DHCP_MSG(DHCPRELEASE, 7) /* Client->server: I'm done with this IP. */ \ + DHCP_MSG(DCHPINFORM, 8) /* Client->server: I'm using this IP. */ + +/* DHCP message type (this is the argument for the DHCP_MSG_TYPE option). */ +enum dhcp_msg_type { +#define DHCP_MSG(NAME, VALUE) NAME = VALUE, + DHCP_MSGS +#undef DHCP_MSG +}; +const char *dhcp_type_name(enum dhcp_msg_type); + +/* DHCP allows for 256 standardized options and 256 vendor-specific options. + * We put them in a single array, with the standard options at the + * beginning. */ +#define DHCP_N_OPTIONS 512 +#define DHCP_VENDOR_OFS 256 + +/* DHCP options. */ +#define DHCP_OPTS \ + /* arg min max */ \ + /* name code type args args */ \ + DHCP_OPT(PAD, 0, FIXED, 0, 0) \ + DHCP_OPT(END, 255, FIXED, 0, 0) \ + DHCP_OPT(SUBNET_MASK, 1, IP, 1, 1) \ + DHCP_OPT(TIME_OFFSET, 2, SECS, 1, 1) \ + DHCP_OPT(ROUTER, 3, IP, 1, SIZE_MAX) \ + /* Time Server Option is obsolete. */ \ + /* Name Server Option is obsolete. */ \ + DHCP_OPT(DNS_SERVER, 6, IP, 1, SIZE_MAX) \ + /* Log Server Option is obsolete. */ \ + /* Cookie Server Option is obsolete. */ \ + DHCP_OPT(LPR_SERVER, 9, IP, 1, SIZE_MAX) \ + /* Impress Server Option is obsolete. */ \ + /* Resource Location Server Option is obsolete. */ \ + DHCP_OPT(HOST_NAME, 12, STRING, 1, SIZE_MAX) \ + DHCP_OPT(BOOT_FILE_SIZE, 13, UINT16, 1, 1) \ + /* Merit Dump File option is obsolete. */ \ + DHCP_OPT(DOMAIN_NAME, 15, STRING, 1, SIZE_MAX) \ + /* Swap Server option is obsolete. */ \ + DHCP_OPT(ROOT_PATH, 17, STRING, 1, SIZE_MAX) \ + DHCP_OPT(EXTENSIONS_PATH, 18, STRING, 1, SIZE_MAX) \ + DHCP_OPT(IP_FORWARDING, 19, BOOLEAN, 1, 1) \ + DHCP_OPT(SOURCE_ROUTING, 20, BOOLEAN, 1, 1) \ + DHCP_OPT(POLICY_FILTER, 21, IP, 2, SIZE_MAX) \ + DHCP_OPT(MAX_DGRAM_REASSEMBLY, 22, UINT16, 1, 1) \ + DHCP_OPT(IP_TTL, 23, UINT8, 1, 1) \ + DHCP_OPT(PATH_MTU_TIMEOUT, 24, SECS, 1, 1) \ + DHCP_OPT(PATH_MTU_PLATEAU, 25, UINT16, 2, SIZE_MAX) \ + DHCP_OPT(MTU, 26, UINT16, 1, 1) \ + DHCP_OPT(ALL_SUBNETS_ARE_LOCAL, 27, BOOLEAN, 1, 1) \ + DHCP_OPT(BROADCAST_ADDRESS, 28, IP, 1, 1) \ + DHCP_OPT(PERFORM_MASK_DISCOVERY, 29, BOOLEAN, 1, 1) \ + DHCP_OPT(MASK_SUPPLIER, 30, BOOLEAN, 1, 1) \ + DHCP_OPT(PERFORM_ROUTER_DISCOVERY, 31, BOOLEAN, 1, 1) \ + DHCP_OPT(ROUTER_SOLICITATION, 32, IP, 1, 1) \ + DHCP_OPT(STATIC_ROUTE, 33, IP, 2, SIZE_MAX) \ + /* Trailer Encapsulation Option is obsolete. */ \ + DHCP_OPT(ARP_CACHE_TIMEOUT, 35, SECS, 1, 1) \ + DHCP_OPT(ETHERNET_ENCAPSULATION, 36, BOOLEAN, 1, 1) \ + DHCP_OPT(TCP_TTL, 37, UINT8, 1, 1) \ + DHCP_OPT(TCP_KEEPALIVE_INTERVAL, 38, SECS, 1, 1) \ + DHCP_OPT(TCP_KEEPALIVE_GARBAGE, 39, BOOLEAN, 1, 1) \ + DHCP_OPT(NIS_DOMAIN, 40, STRING, 1, SIZE_MAX) \ + DHCP_OPT(NIS_SERVERS, 41, IP, 1, SIZE_MAX) \ + DHCP_OPT(NTP_SERVERS, 42, IP, 1, SIZE_MAX) \ + DHCP_OPT(VENDOR_SPECIFIC, 43, UINT8, 1, SIZE_MAX) \ + DHCP_OPT(NETBIOS_NS, 44, IP, 1, SIZE_MAX) \ + DHCP_OPT(NETBIOS_DDS, 45, IP, 1, SIZE_MAX) \ + DHCP_OPT(NETBIOS_NODE_TYPE, 46, UINT8, 1, 1) \ + DHCP_OPT(NETBIOS_SCOPE, 47, STRING, 1, SIZE_MAX) \ + DHCP_OPT(X_FONT_SERVER, 48, IP, 1, SIZE_MAX) \ + DHCP_OPT(XDM, 49, IP, 1, SIZE_MAX) \ + DHCP_OPT(NISPLUS_DOMAIN, 64, STRING, 1, SIZE_MAX) \ + DHCP_OPT(NISPLUS_SERVERS, 65, IP, 1, SIZE_MAX) \ + DHCP_OPT(MOBILE_IP_HOME_AGENT, 68, IP, 0, SIZE_MAX) \ + DHCP_OPT(SMTP_SERVER, 69, IP, 1, SIZE_MAX) \ + DHCP_OPT(POP3_SERVER, 70, IP, 1, SIZE_MAX) \ + DHCP_OPT(NNTP_SERVER, 71, IP, 1, SIZE_MAX) \ + DHCP_OPT(WWW_SERVER, 72, IP, 1, SIZE_MAX) \ + DHCP_OPT(FINGER_SERVER, 73, IP, 1, SIZE_MAX) \ + DHCP_OPT(IRC_SERVER, 74, IP, 1, SIZE_MAX) \ + /* StreetTalk Server Option is obsolete. */ \ + /* StreetTalk Directory Assistance Server Option is obsolete. */ \ + DHCP_OPT(REQUESTED_IP, 50, IP, 1, 1) \ + DHCP_OPT(LEASE_TIME, 51, SECS, 1, 1) \ + DHCP_OPT(OPTION_OVERLOAD, 52, UINT8, 1, 1) \ + DHCP_OPT(TFTP_SERVER, 66, STRING, 1, SIZE_MAX) \ + DHCP_OPT(BOOTFILE_NAME, 67, STRING, 1, SIZE_MAX) \ + DHCP_OPT(DHCP_MSG_TYPE, 53, UINT8, 1, 1) \ + DHCP_OPT(SERVER_IDENTIFIER, 54, IP, 1, 1) \ + DHCP_OPT(PARAMETER_REQUEST_LIST, 55, UINT8, 1, SIZE_MAX) \ + DHCP_OPT(MESSAGE, 56, STRING, 1, SIZE_MAX) \ + DHCP_OPT(MAX_DHCP_MSG_SIZE, 57, UINT16, 1, 1) \ + DHCP_OPT(T1, 58, SECS, 1, 1) \ + DHCP_OPT(T2, 59, SECS, 1, 1) \ + DHCP_OPT(VENDOR_CLASS, 60, STRING, 1, SIZE_MAX) \ + DHCP_OPT(CLIENT_ID, 61, UINT8, 2, SIZE_MAX) \ + DHCP_VNDOPT(OFP_CONTROLLER_VCONN, 1, STRING, 1, SIZE_MAX) \ + DHCP_VNDOPT(OFP_PKI_URI, 2, STRING, 1, SIZE_MAX) + +/* Shorthand for defining vendor options (used above). */ +#define DHCP_VNDOPT(NAME, CODE, ARG, MIN, MAX) \ + DHCP_OPT(NAME, (CODE) + DHCP_VENDOR_OFS, ARG, MIN, MAX) + +/* DHCP option codes. */ +enum { +#define DHCP_OPT(NAME, VALUE, ARGTYPE, MIN_ARGS, MAX_ARGS) \ + DHCP_CODE_##NAME = VALUE, +DHCP_OPTS +#undef DHCP_OPT +}; + +/* The contents of a DHCP option. + * + * DHCP options can (rarely) be present but lack content. To represent such an + * option, 'n' is 0 and 'data' is non-null (but does not point to anything + * useful). */ +struct dhcp_option { + size_t n; /* Number of bytes of data. */ + void *data; /* Data. */ +}; + +const char *dhcp_option_to_string(const struct dhcp_option *, int code, + struct ds *); +bool dhcp_option_equals(const struct dhcp_option *, + const struct dhcp_option *); + +/* Abstracted DHCP protocol message, to make them easier to manipulate than + * through raw protocol buffers. */ +struct dhcp_msg { + /* For use by calling code. */ + uint8_t op; /* DHCP_BOOTREQUEST or DHCP_BOOTREPLY. */ + uint32_t xid; /* Transaction ID. */ + uint16_t secs; /* Since client started address acquisition. */ + uint16_t flags; /* DHCP_FLAGS_*. */ + uint32_t ciaddr; /* Client IP, if it has a lease for one. */ + uint32_t yiaddr; /* Client ("your") IP address. */ + uint32_t siaddr; /* Next server IP address. */ + uint32_t giaddr; /* Relay agent IP address. */ + uint8_t chaddr[ETH_ADDR_LEN]; /* Client hardware address. */ + enum dhcp_msg_type type; /* DHCP_CODE_DHCP_MSG_TYPE option argument. */ + struct dhcp_option options[DHCP_N_OPTIONS]; /* Indexed by option code. */ + + /* For direct use only by dhcp_msg_*() functions. */ + uint8_t *data; + size_t data_used, data_allocated; +}; + +void dhcp_msg_init(struct dhcp_msg *); +void dhcp_msg_uninit(struct dhcp_msg *); +void dhcp_msg_copy(struct dhcp_msg *, const struct dhcp_msg *); +void dhcp_msg_put(struct dhcp_msg *, int code, const void *, size_t); +void dhcp_msg_put_bool(struct dhcp_msg *, int code, bool); +void dhcp_msg_put_secs(struct dhcp_msg *, int code, uint32_t); +void dhcp_msg_put_ip(struct dhcp_msg *, int code, uint32_t); +void dhcp_msg_put_string(struct dhcp_msg *, int code, const char *); +void dhcp_msg_put_uint8(struct dhcp_msg *, int code, uint8_t); +void dhcp_msg_put_uint8_array(struct dhcp_msg *, int code, + const uint8_t[], size_t n); +void dhcp_msg_put_uint16(struct dhcp_msg *, int code, uint16_t); +void dhcp_msg_put_uint16_array(struct dhcp_msg *, int code, + const uint16_t[], size_t n); +const void *dhcp_msg_get(const struct dhcp_msg *, int code, size_t offset, + size_t size); +bool dhcp_msg_get_bool(const struct dhcp_msg *, int code, + size_t offset, bool *); +bool dhcp_msg_get_secs(const struct dhcp_msg *, int code, + size_t offset, uint32_t *); +bool dhcp_msg_get_ip(const struct dhcp_msg *, int code, + size_t offset, uint32_t *); +char *dhcp_msg_get_string(const struct dhcp_msg *, int code); +bool dhcp_msg_get_uint8(const struct dhcp_msg *, int code, + size_t offset, uint8_t *); +bool dhcp_msg_get_uint16(const struct dhcp_msg *, int code, + size_t offset, uint16_t *); +const char *dhcp_msg_to_string(const struct dhcp_msg *, bool multiline, + struct ds *); +int dhcp_parse(struct dhcp_msg *, const struct ofpbuf *); +void dhcp_assemble(const struct dhcp_msg *, struct ofpbuf *); + +#endif /* dhcp.h */ diff --git a/lib/dhparams.h b/lib/dhparams.h new file mode 100644 index 00000000..0377bb94 --- /dev/null +++ b/lib/dhparams.h @@ -0,0 +1,10 @@ +#ifndef DHPARAMS_H +#define DHPARAMS_H 1 + +#include + +DH *get_dh1024(void); +DH *get_dh2048(void); +DH *get_dh4096(void); + +#endif /* dhparams.h */ diff --git a/lib/dirs.h b/lib/dirs.h new file mode 100644 index 00000000..c90c013f --- /dev/null +++ b/lib/dirs.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef DIRS_H +#define DIRS_H 1 + +extern const char ovs_pkgdatadir[]; /* /usr/local/share/openvswitch */ +extern const char ovs_rundir[]; /* /usr/local/var/run */ +extern const char ovs_logdir[]; /* /usr/local/var/log */ +extern const char ovs_bindir[]; /* /usr/local/bin */ + +#endif /* dirs.h */ diff --git a/lib/dpif.c b/lib/dpif.c new file mode 100644 index 00000000..73ad5df0 --- /dev/null +++ b/lib/dpif.c @@ -0,0 +1,1060 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "dpif.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "coverage.h" +#include "dynamic-string.h" +#include "flow.h" +#include "netlink.h" +#include "odp-util.h" +#include "ofp-print.h" +#include "ofpbuf.h" +#include "packets.h" +#include "poll-loop.h" +#include "util.h" +#include "valgrind.h" + +#include "vlog.h" +#define THIS_MODULE VLM_dpif + +/* Rate limit for individual messages going to or from the datapath, output at + * DBG level. This is very high because, if these are enabled, it is because + * we really need to see them. */ +static struct vlog_rate_limit dpmsg_rl = VLOG_RATE_LIMIT_INIT(600, 600); + +/* Not really much point in logging many dpif errors. */ +static struct vlog_rate_limit error_rl = VLOG_RATE_LIMIT_INIT(9999, 5); + +static int get_minor_from_name(const char *name, unsigned int *minor); +static int name_to_minor(const char *name, unsigned int *minor); +static int lookup_minor(const char *name, unsigned int *minor); +static int open_by_minor(unsigned int minor, struct dpif *); +static int make_openvswitch_device(unsigned int minor, char **fnp); +static void check_rw_odp_flow(struct odp_flow *); + +int +dpif_open(const char *name, struct dpif *dpif) +{ + int listen_mask; + int error; + + dpif->fd = -1; + + error = name_to_minor(name, &dpif->minor); + if (error) { + return error; + } + + error = open_by_minor(dpif->minor, dpif); + if (error) { + return error; + } + + /* We can open the device, but that doesn't mean that it's been created. + * If it hasn't been, then any command other than ODP_DP_CREATE will + * return ENODEV. Try something innocuous. */ + listen_mask = 0; /* Make Valgrind happy. */ + if (ioctl(dpif->fd, ODP_GET_LISTEN_MASK, &listen_mask)) { + error = errno; + if (error != ENODEV) { + VLOG_WARN("dp%u: probe returned unexpected error: %s", + dpif->minor, strerror(error)); + } + dpif_close(dpif); + return error; + } + return 0; +} + +void +dpif_close(struct dpif *dpif) +{ + if (dpif) { + close(dpif->fd); + dpif->fd = -1; + } +} + +static int +do_ioctl(const struct dpif *dpif, int cmd, const char *cmd_name, + const void *arg) +{ + int error = ioctl(dpif->fd, cmd, arg) ? errno : 0; + if (cmd_name) { + if (error) { + VLOG_WARN_RL(&error_rl, "dp%u: ioctl(%s) failed (%s)", + dpif->minor, cmd_name, strerror(error)); + } else { + VLOG_DBG_RL(&dpmsg_rl, "dp%u: ioctl(%s): success", + dpif->minor, cmd_name); + } + } + return error; +} + +int +dpif_create(const char *name, struct dpif *dpif) +{ + unsigned int minor; + int error; + + if (!get_minor_from_name(name, &minor)) { + /* Minor was specified in 'name', go ahead and create it. */ + error = open_by_minor(minor, dpif); + if (error) { + return error; + } + + if (!strncmp(name, "nl:", 3)) { + char devname[128]; + sprintf(devname, "of%u", minor); + error = ioctl(dpif->fd, ODP_DP_CREATE, devname) < 0 ? errno : 0; + } else { + error = ioctl(dpif->fd, ODP_DP_CREATE, name) < 0 ? errno : 0; + } + if (error) { + dpif_close(dpif); + } + return error; + } else { + for (minor = 0; minor < ODP_MAX; minor++) { + error = open_by_minor(minor, dpif); + if (error) { + return error; + } + + error = ioctl(dpif->fd, ODP_DP_CREATE, name) < 0 ? errno : 0; + if (!error) { + return 0; + } + dpif_close(dpif); + if (error != EBUSY) { + return error; + } + } + return ENOBUFS; + } +} + +int +dpif_get_name(struct dpif *dpif, char *name, size_t name_size) +{ + struct odp_port port; + int error; + + assert(name_size > 0); + *name = '\0'; + + error = dpif_port_query_by_number(dpif, ODPP_LOCAL, &port); + if (!error) { + ovs_strlcpy(name, port.devname, name_size); + } + return error; +} + +int +dpif_delete(struct dpif *dpif) +{ + COVERAGE_INC(dpif_destroy); + return do_ioctl(dpif, ODP_DP_DESTROY, "ODP_DP_DESTROY", NULL); +} + +int +dpif_get_dp_stats(const struct dpif *dpif, struct odp_stats *stats) +{ + memset(stats, 0, sizeof *stats); + return do_ioctl(dpif, ODP_DP_STATS, "ODP_DP_STATS", stats); +} + +int +dpif_get_drop_frags(const struct dpif *dpif, bool *drop_frags) +{ + int tmp; + int error = do_ioctl(dpif, ODP_GET_DROP_FRAGS, "ODP_GET_DROP_FRAGS", &tmp); + *drop_frags = error ? tmp & 1 : false; + return error; +} + +int +dpif_set_drop_frags(struct dpif *dpif, bool drop_frags) +{ + int tmp = drop_frags; + return do_ioctl(dpif, ODP_SET_DROP_FRAGS, "ODP_SET_DROP_FRAGS", &tmp); +} + +int +dpif_get_listen_mask(const struct dpif *dpif, int *listen_mask) +{ + int error = do_ioctl(dpif, ODP_GET_LISTEN_MASK, "ODP_GET_LISTEN_MASK", + listen_mask); + if (error) { + *listen_mask = 0; + } + return error; +} + +int +dpif_set_listen_mask(struct dpif *dpif, int listen_mask) +{ + return do_ioctl(dpif, ODP_SET_LISTEN_MASK, "ODP_SET_LISTEN_MASK", + &listen_mask); +} + +int +dpif_purge(struct dpif *dpif) +{ + struct odp_stats stats; + unsigned int i; + int error; + + COVERAGE_INC(dpif_purge); + + error = dpif_get_dp_stats(dpif, &stats); + if (error) { + return error; + } + + for (i = 0; i < stats.max_miss_queue + stats.max_action_queue; i++) { + struct ofpbuf *buf; + error = dpif_recv(dpif, &buf); + if (error) { + return error == EAGAIN ? 0 : error; + } + ofpbuf_delete(buf); + } + return 0; +} + +int +dpif_port_add(struct dpif *dpif, const char *devname, uint16_t port_no, + uint16_t flags) +{ + struct odp_port port; + + COVERAGE_INC(dpif_port_add); + memset(&port, 0, sizeof port); + strncpy(port.devname, devname, sizeof port.devname); + port.port = port_no; + port.flags = flags; + if (!ioctl(dpif->fd, ODP_PORT_ADD, &port)) { + VLOG_DBG_RL(&dpmsg_rl, "dp%u: added %s as port %"PRIu16, + dpif->minor, devname, port_no); + return 0; + } else { + VLOG_WARN_RL(&error_rl, "dp%u: failed to add %s as port " + "%"PRIu16": %s", dpif->minor, devname, port_no, + strerror(errno)); + return errno; + } +} + +int +dpif_port_del(struct dpif *dpif, uint16_t port_no) +{ + int tmp = port_no; + COVERAGE_INC(dpif_port_del); + return do_ioctl(dpif, ODP_PORT_DEL, "ODP_PORT_DEL", &tmp); +} + +int +dpif_port_query_by_number(const struct dpif *dpif, uint16_t port_no, + struct odp_port *port) +{ + memset(port, 0, sizeof *port); + port->port = port_no; + if (!ioctl(dpif->fd, ODP_PORT_QUERY, port)) { + VLOG_DBG_RL(&dpmsg_rl, "dp%u: port %"PRIu16" is device %s", + dpif->minor, port_no, port->devname); + return 0; + } else { + VLOG_WARN_RL(&error_rl, "dp%u: failed to query port %"PRIu16": %s", + dpif->minor, port_no, strerror(errno)); + return errno; + } +} + +int +dpif_port_query_by_name(const struct dpif *dpif, const char *devname, + struct odp_port *port) +{ + memset(port, 0, sizeof *port); + strncpy(port->devname, devname, sizeof port->devname); + if (!ioctl(dpif->fd, ODP_PORT_QUERY, port)) { + VLOG_DBG_RL(&dpmsg_rl, "dp%u: device %s is on port %"PRIu16, + dpif->minor, devname, port->port); + return 0; + } else { + VLOG_WARN_RL(&error_rl, "dp%u: failed to query port %s: %s", + dpif->minor, devname, strerror(errno)); + return errno; + } +} + +int +dpif_port_list(const struct dpif *dpif, + struct odp_port **ports, size_t *n_ports) +{ + struct odp_portvec pv; + struct odp_stats stats; + int error; + + do { + error = dpif_get_dp_stats(dpif, &stats); + if (error) { + goto error; + } + + *ports = xcalloc(1, stats.n_ports * sizeof **ports); + pv.ports = *ports; + pv.n_ports = stats.n_ports; + error = do_ioctl(dpif, ODP_PORT_LIST, "ODP_PORT_LIST", &pv); + if (error) { + free(*ports); + goto error; + } + } while (pv.n_ports != stats.n_ports); + *n_ports = pv.n_ports; + return 0; + +error: + *ports = NULL; + *n_ports = 0; + return error; +} + +int +dpif_port_group_set(struct dpif *dpif, uint16_t group, + const uint16_t ports[], size_t n_ports) +{ + struct odp_port_group pg; + + COVERAGE_INC(dpif_port_group_set); + assert(n_ports <= UINT16_MAX); + pg.group = group; + pg.ports = (uint16_t *) ports; + pg.n_ports = n_ports; + return do_ioctl(dpif, ODP_PORT_GROUP_SET, "ODP_PORT_GROUP_SET", &pg); +} + +/* Careful: '*n_out' can be greater than 'n_ports' on return, if 'n_ports' is + * less than the number of ports in 'group'. */ +int +dpif_port_group_get(const struct dpif *dpif, uint16_t group, + uint16_t ports[], size_t n_ports, size_t *n_out) +{ + struct odp_port_group pg; + int error; + + assert(n_ports <= UINT16_MAX); + pg.group = group; + pg.ports = ports; + pg.n_ports = n_ports; + error = do_ioctl(dpif, ODP_PORT_GROUP_GET, "ODP_PORT_GROUP_GET", &pg); + *n_out = error ? 0 : pg.n_ports; + return error; +} + +int +dpif_flow_flush(struct dpif *dpif) +{ + COVERAGE_INC(dpif_flow_flush); + return do_ioctl(dpif, ODP_FLOW_FLUSH, "ODP_FLOW_FLUSH", NULL); +} + +static enum vlog_level +flow_message_log_level(int error) +{ + return error ? VLL_WARN : VLL_DBG; +} + +static bool +should_log_flow_message(int error) +{ + return !vlog_should_drop(THIS_MODULE, flow_message_log_level(error), + error ? &error_rl : &dpmsg_rl); +} + +static void +log_flow_message(const struct dpif *dpif, int error, + const char *operation, + const flow_t *flow, const struct odp_flow_stats *stats, + const union odp_action *actions, size_t n_actions) +{ + struct ds ds = DS_EMPTY_INITIALIZER; + ds_put_format(&ds, "dp%u: ", dpif->minor); + if (error) { + ds_put_cstr(&ds, "failed to "); + } + ds_put_format(&ds, "%s ", operation); + if (error) { + ds_put_format(&ds, "(%s) ", strerror(error)); + } + flow_format(&ds, flow); + if (stats) { + ds_put_cstr(&ds, ", "); + format_odp_flow_stats(&ds, stats); + } + if (actions || n_actions) { + ds_put_cstr(&ds, ", actions:"); + format_odp_actions(&ds, actions, n_actions); + } + vlog(THIS_MODULE, flow_message_log_level(error), "%s", ds_cstr(&ds)); + ds_destroy(&ds); +} + +static int +do_flow_ioctl(const struct dpif *dpif, int cmd, struct odp_flow *flow, + const char *operation, bool show_stats) +{ + int error = do_ioctl(dpif, cmd, NULL, flow); + if (error && show_stats) { + flow->n_actions = 0; + } + if (should_log_flow_message(error)) { + log_flow_message(dpif, error, operation, &flow->key, + show_stats && !error ? &flow->stats : NULL, + flow->actions, flow->n_actions); + } + return error; +} + +int +dpif_flow_put(struct dpif *dpif, struct odp_flow_put *put) +{ + int error = do_ioctl(dpif, ODP_FLOW_PUT, NULL, put); + COVERAGE_INC(dpif_flow_put); + if (should_log_flow_message(error)) { + struct ds operation = DS_EMPTY_INITIALIZER; + ds_put_cstr(&operation, "put"); + if (put->flags & ODPPF_CREATE) { + ds_put_cstr(&operation, "[create]"); + } + if (put->flags & ODPPF_MODIFY) { + ds_put_cstr(&operation, "[modify]"); + } + if (put->flags & ODPPF_ZERO_STATS) { + ds_put_cstr(&operation, "[zero]"); + } +#define ODPPF_ALL (ODPPF_CREATE | ODPPF_MODIFY | ODPPF_ZERO_STATS) + if (put->flags & ~ODPPF_ALL) { + ds_put_format(&operation, "[%x]", put->flags & ~ODPPF_ALL); + } + log_flow_message(dpif, error, ds_cstr(&operation), &put->flow.key, + !error ? &put->flow.stats : NULL, + put->flow.actions, put->flow.n_actions); + ds_destroy(&operation); + } + return error; +} + +int +dpif_flow_del(struct dpif *dpif, struct odp_flow *flow) +{ + COVERAGE_INC(dpif_flow_del); + check_rw_odp_flow(flow); + memset(&flow->stats, 0, sizeof flow->stats); + return do_flow_ioctl(dpif, ODP_FLOW_DEL, flow, "delete flow", true); +} + +int +dpif_flow_get(const struct dpif *dpif, struct odp_flow *flow) +{ + COVERAGE_INC(dpif_flow_query); + check_rw_odp_flow(flow); + memset(&flow->stats, 0, sizeof flow->stats); + return do_flow_ioctl(dpif, ODP_FLOW_GET, flow, "get flow", true); +} + +int +dpif_flow_get_multiple(const struct dpif *dpif, + struct odp_flow flows[], size_t n) +{ + struct odp_flowvec fv; + size_t i; + + COVERAGE_ADD(dpif_flow_query_multiple, n); + fv.flows = flows; + fv.n_flows = n; + for (i = 0; i < n; i++) { + check_rw_odp_flow(&flows[i]); + } + return do_ioctl(dpif, ODP_FLOW_GET_MULTIPLE, "ODP_FLOW_GET_MULTIPLE", + &fv); +} + +int +dpif_flow_list(const struct dpif *dpif, struct odp_flow flows[], size_t n, + size_t *n_out) +{ + struct odp_flowvec fv; + uint32_t i; + int error; + + COVERAGE_INC(dpif_flow_query_list); + fv.flows = flows; + fv.n_flows = n; + if (RUNNING_ON_VALGRIND) { + memset(flows, 0, n * sizeof *flows); + } else { + for (i = 0; i < n; i++) { + flows[i].actions = NULL; + flows[i].n_actions = 0; + } + } + error = do_ioctl(dpif, ODP_FLOW_LIST, NULL, &fv); + if (error) { + *n_out = 0; + VLOG_WARN_RL(&error_rl, "dp%u: flow list failed (%s)", + dpif->minor, strerror(error)); + } else { + COVERAGE_ADD(dpif_flow_query_list_n, fv.n_flows); + *n_out = fv.n_flows; + VLOG_DBG_RL(&dpmsg_rl, "dp%u: listed %zu flows", dpif->minor, *n_out); + } + return error; +} + +int +dpif_flow_list_all(const struct dpif *dpif, + struct odp_flow **flowsp, size_t *np) +{ + struct odp_stats stats; + struct odp_flow *flows; + size_t n_flows; + int error; + + *flowsp = NULL; + *np = 0; + + error = dpif_get_dp_stats(dpif, &stats); + if (error) { + return error; + } + + flows = xmalloc(sizeof *flows * stats.n_flows); + error = dpif_flow_list(dpif, flows, stats.n_flows, &n_flows); + if (error) { + free(flows); + return error; + } + + if (stats.n_flows != n_flows) { + VLOG_WARN_RL(&error_rl, "dp%u: datapath stats reported %"PRIu32" " + "flows but flow listing reported %zu", + dpif->minor, stats.n_flows, n_flows); + } + *flowsp = flows; + *np = n_flows; + return 0; +} + +int +dpif_execute(struct dpif *dpif, uint16_t in_port, + const union odp_action actions[], size_t n_actions, + const struct ofpbuf *buf) +{ + int error; + + COVERAGE_INC(dpif_execute); + if (n_actions > 0) { + struct odp_execute execute; + memset(&execute, 0, sizeof execute); + execute.in_port = in_port; + execute.actions = (union odp_action *) actions; + execute.n_actions = n_actions; + execute.data = buf->data; + execute.length = buf->size; + error = do_ioctl(dpif, ODP_EXECUTE, NULL, &execute); + } else { + error = 0; + } + + if (!(error ? VLOG_DROP_WARN(&error_rl) : VLOG_DROP_DBG(&dpmsg_rl))) { + struct ds ds = DS_EMPTY_INITIALIZER; + char *packet = ofp_packet_to_string(buf->data, buf->size, buf->size); + ds_put_format(&ds, "dp%u: execute ", dpif->minor); + format_odp_actions(&ds, actions, n_actions); + if (error) { + ds_put_format(&ds, " failed (%s)", strerror(error)); + } + ds_put_format(&ds, " on packet %s", packet); + vlog(THIS_MODULE, error ? VLL_WARN : VLL_DBG, "%s", ds_cstr(&ds)); + ds_destroy(&ds); + free(packet); + } + return error; +} + +int +dpif_recv(struct dpif *dpif, struct ofpbuf **bufp) +{ + struct ofpbuf *buf; + int retval; + int error; + + buf = ofpbuf_new(65536); + retval = read(dpif->fd, ofpbuf_tail(buf), ofpbuf_tailroom(buf)); + if (retval < 0) { + error = errno; + if (error != EAGAIN) { + VLOG_WARN_RL(&error_rl, "dp%u: read failed: %s", + dpif->minor, strerror(error)); + } + } else if (retval >= sizeof(struct odp_msg)) { + struct odp_msg *msg = buf->data; + if (msg->length <= retval) { + buf->size += retval; + if (VLOG_IS_DBG_ENABLED()) { + void *payload = msg + 1; + size_t length = buf->size - sizeof *msg; + char *s = ofp_packet_to_string(payload, length, length); + VLOG_DBG_RL(&dpmsg_rl, "dp%u: received %s message of length " + "%zu on port %"PRIu16": %s", dpif->minor, + (msg->type == _ODPL_MISS_NR ? "miss" + : msg->type == _ODPL_ACTION_NR ? "action" + : ""), + msg->length - sizeof(struct odp_msg), + msg->port, s); + free(s); + } + *bufp = buf; + COVERAGE_INC(dpif_recv); + return 0; + } else { + VLOG_WARN_RL(&error_rl, "dp%u: discarding message truncated " + "from %zu bytes to %d", + dpif->minor, msg->length, retval); + error = ERANGE; + } + } else if (!retval) { + VLOG_WARN_RL(&error_rl, "dp%u: unexpected end of file", dpif->minor); + error = EPROTO; + } else { + VLOG_WARN_RL(&error_rl, + "dp%u: discarding too-short message (%d bytes)", + dpif->minor, retval); + error = ERANGE; + } + + *bufp = NULL; + ofpbuf_delete(buf); + return error; +} + +void +dpif_recv_wait(struct dpif *dpif) +{ + poll_fd_wait(dpif->fd, POLLIN); +} + +struct dpifmon { + struct dpif dpif; + struct nl_sock *sock; + int local_ifindex; +}; + +int +dpifmon_create(const char *datapath_name, struct dpifmon **monp) +{ + struct dpifmon *mon; + char local_name[IFNAMSIZ]; + int error; + + mon = *monp = xmalloc(sizeof *mon); + + error = dpif_open(datapath_name, &mon->dpif); + if (error) { + goto error; + } + error = dpif_get_name(&mon->dpif, local_name, sizeof local_name); + if (error) { + goto error_close_dpif; + } + + mon->local_ifindex = if_nametoindex(local_name); + if (!mon->local_ifindex) { + error = errno; + VLOG_WARN("could not get ifindex of %s device: %s", + local_name, strerror(errno)); + goto error_close_dpif; + } + + error = nl_sock_create(NETLINK_ROUTE, RTNLGRP_LINK, 0, 0, &mon->sock); + if (error) { + VLOG_WARN("could not create rtnetlink socket: %s", strerror(error)); + goto error_close_dpif; + } + + return 0; + +error_close_dpif: + dpif_close(&mon->dpif); +error: + free(mon); + *monp = NULL; + return error; +} + +void +dpifmon_destroy(struct dpifmon *mon) +{ + if (mon) { + dpif_close(&mon->dpif); + nl_sock_destroy(mon->sock); + } +} + +int +dpifmon_poll(struct dpifmon *mon, char **devnamep) +{ + static struct vlog_rate_limit slow_rl = VLOG_RATE_LIMIT_INIT(1, 5); + static const struct nl_policy rtnlgrp_link_policy[] = { + [IFLA_IFNAME] = { .type = NL_A_STRING }, + [IFLA_MASTER] = { .type = NL_A_U32, .optional = true }, + }; + struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)]; + struct ofpbuf *buf; + int error; + + *devnamep = NULL; +again: + error = nl_sock_recv(mon->sock, &buf, false); + switch (error) { + case 0: + if (!nl_policy_parse(buf, NLMSG_HDRLEN + sizeof(struct ifinfomsg), + rtnlgrp_link_policy, + attrs, ARRAY_SIZE(rtnlgrp_link_policy))) { + VLOG_WARN_RL(&slow_rl, "received bad rtnl message"); + error = ENOBUFS; + } else { + const char *devname = nl_attr_get_string(attrs[IFLA_IFNAME]); + bool for_us; + + if (attrs[IFLA_MASTER]) { + uint32_t master_ifindex = nl_attr_get_u32(attrs[IFLA_MASTER]); + for_us = master_ifindex == mon->local_ifindex; + } else { + /* It's for us if that device is one of our ports. This is + * open-coded instead of using dpif_port_query_by_name() to + * avoid logging a warning on failure. */ + struct odp_port port; + memset(&port, 0, sizeof port); + strncpy(port.devname, devname, sizeof port.devname); + for_us = !ioctl(mon->dpif.fd, ODP_PORT_QUERY, &port); + } + + if (!for_us) { + /* Not for us, try again. */ + ofpbuf_delete(buf); + COVERAGE_INC(dpifmon_poll_false_wakeup); + goto again; + } + COVERAGE_INC(dpifmon_poll_changed); + *devnamep = xstrdup(devname); + } + ofpbuf_delete(buf); + break; + + case EAGAIN: + /* Nothing to do. */ + break; + + case ENOBUFS: + VLOG_WARN_RL(&slow_rl, "dpifmon socket overflowed"); + break; + + default: + VLOG_WARN_RL(&slow_rl, "error on dpifmon socket: %s", strerror(error)); + break; + } + return error; +} + +void +dpifmon_run(struct dpifmon *mon UNUSED) +{ + /* Nothing to do in this implementation. */ +} + +void +dpifmon_wait(struct dpifmon *mon) +{ + nl_sock_wait(mon->sock, POLLIN); +} + +static int get_openvswitch_major(void); +static int get_major(const char *target, int default_major); + +static int +lookup_minor(const char *name, unsigned int *minor) +{ + struct ethtool_drvinfo drvinfo; + struct ifreq ifr; + int error; + int sock; + + *minor = -1; + sock = socket(AF_INET, SOCK_DGRAM, 0); + if (sock < 0) { + VLOG_WARN("socket(AF_INET) failed: %s", strerror(errno)); + error = errno; + goto error; + } + + memset(&ifr, 0, sizeof ifr); + strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name); + ifr.ifr_data = (caddr_t) &drvinfo; + + memset(&drvinfo, 0, sizeof drvinfo); + drvinfo.cmd = ETHTOOL_GDRVINFO; + if (ioctl(sock, SIOCETHTOOL, &ifr)) { + VLOG_WARN("ioctl(SIOCETHTOOL) failed: %s", strerror(errno)); + error = errno; + goto error_close_sock; + } + + if (strcmp(drvinfo.driver, "openvswitch")) { + VLOG_WARN("%s is not an openvswitch device", name); + error = EOPNOTSUPP; + goto error_close_sock; + } + + if (!isdigit(drvinfo.bus_info[0])) { + VLOG_WARN("%s ethtool info does not contain an openvswitch minor", + name); + error = EPROTOTYPE; + goto error_close_sock; + } + + *minor = atoi(drvinfo.bus_info); + close(sock); + return 0; + +error_close_sock: + close(sock); +error: + return error; +} + +static int +make_openvswitch_device(unsigned int minor, char **fnp) +{ + dev_t dev = makedev(get_openvswitch_major(), minor); + const char dirname[] = "/dev/net"; + struct stat s; + char fn[128]; + + *fnp = NULL; + sprintf(fn, "%s/dp%d", dirname, minor); + if (!stat(fn, &s)) { + if (!S_ISCHR(s.st_mode)) { + VLOG_WARN_RL(&error_rl, "%s is not a character device, fixing", + fn); + } else if (s.st_rdev != dev) { + VLOG_WARN_RL(&error_rl, + "%s is device %u:%u instead of %u:%u, fixing", + fn, major(s.st_rdev), minor(s.st_rdev), + major(dev), minor(dev)); + } else { + goto success; + } + if (unlink(fn)) { + VLOG_WARN_RL(&error_rl, "%s: unlink failed (%s)", + fn, strerror(errno)); + return errno; + } + } else if (errno == ENOENT) { + if (stat(dirname, &s)) { + if (errno == ENOENT) { + if (mkdir(dirname, 0755)) { + VLOG_WARN_RL(&error_rl, "%s: mkdir failed (%s)", + dirname, strerror(errno)); + return errno; + } + } else { + VLOG_WARN_RL(&error_rl, "%s: stat failed (%s)", + dirname, strerror(errno)); + return errno; + } + } + } else { + VLOG_WARN_RL(&error_rl, "%s: stat failed (%s)", fn, strerror(errno)); + return errno; + } + + /* The device needs to be created. */ + if (mknod(fn, S_IFCHR | 0700, dev)) { + VLOG_WARN_RL(&error_rl, + "%s: creating character device %u:%u failed (%s)", + fn, major(dev), minor(dev), strerror(errno)); + return errno; + } + +success: + *fnp = xstrdup(fn); + return 0; +} + + +static int +get_openvswitch_major(void) +{ + static unsigned int openvswitch_major; + if (!openvswitch_major) { + enum { DEFAULT_MAJOR = 248 }; + openvswitch_major = get_major("openvswitch", DEFAULT_MAJOR); + } + return openvswitch_major; +} + +static int +get_major(const char *target, int default_major) +{ + const char fn[] = "/proc/devices"; + char line[128]; + FILE *file; + int ln; + + file = fopen(fn, "r"); + if (!file) { + VLOG_ERR("opening %s failed (%s)", fn, strerror(errno)); + goto error; + } + + for (ln = 1; fgets(line, sizeof line, file); ln++) { + char name[64]; + int major; + + if (!strncmp(line, "Character", 9) || line[0] == '\0') { + /* Nothing to do. */ + } else if (!strncmp(line, "Block", 5)) { + /* We only want character devices, so skip the rest of the file. */ + break; + } else if (sscanf(line, "%d %63s", &major, name)) { + if (!strcmp(name, target)) { + fclose(file); + return major; + } + } else { + static bool warned; + if (!warned) { + VLOG_WARN("%s:%d: syntax error", fn, ln); + } + warned = true; + } + } + + VLOG_ERR("%s: %s major not found (is the module loaded?), using " + "default major %d", fn, target, default_major); +error: + VLOG_INFO("using default major %d for %s", default_major, target); + return default_major; +} + +static int +name_to_minor(const char *name, unsigned int *minor) +{ + if (!get_minor_from_name(name, minor)) { + return 0; + } + return lookup_minor(name, minor); +} + +static int +get_minor_from_name(const char *name, unsigned int *minor) +{ + if (!strncmp(name, "dp", 2) && isdigit(name[2])) { + *minor = atoi(name + 2); + return 0; + } else if (!strncmp(name, "nl:", 3) && isdigit(name[3])) { + /* This is for compatibility only and will be dropped. */ + *minor = atoi(name + 3); + return 0; + } else { + return EINVAL; + } +} + +static int +open_by_minor(unsigned int minor, struct dpif *dpif) +{ + int error; + char *fn; + int fd; + + dpif->minor = -1; + dpif->fd = -1; + error = make_openvswitch_device(minor, &fn); + if (error) { + return error; + } + + fd = open(fn, O_RDONLY | O_NONBLOCK); + if (fd < 0) { + error = errno; + VLOG_WARN("%s: open failed (%s)", fn, strerror(error)); + free(fn); + return error; + } + + free(fn); + dpif->minor = minor; + dpif->fd = fd; + return 0; +} + +/* There is a tendency to construct odp_flow objects on the stack and to + * forget to properly initialize their "actions" and "n_actions" members. + * When this happens, we get memory corruption because the kernel + * writes through the random pointer that is in the "actions" member. + * + * This function attempts to combat the problem by: + * + * - Forcing a segfault if "actions" points to an invalid region (instead + * of just getting back EFAULT, which can be easily missed in the log). + * + * - Storing a distinctive value that is likely to cause an + * easy-to-identify error later if it is dereferenced, etc. + * + * - Triggering a warning on uninitialized memory from Valgrind if + * "actions" or "n_actions" was not initialized. + */ +static void +check_rw_odp_flow(struct odp_flow *flow) +{ + if (flow->n_actions) { + memset(&flow->actions[0], 0xcc, sizeof flow->actions[0]); + } +} diff --git a/lib/dpif.h b/lib/dpif.h new file mode 100644 index 00000000..22f83555 --- /dev/null +++ b/lib/dpif.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + + +#ifndef DPIF_H +#define DPIF_H 1 + +/* Operations for the datapath running in the local kernel. The interface can + * generalize to multiple types of local datapaths, but the implementation only + * supports the openflow kernel module. */ + +#include "openvswitch/datapath-protocol.h" +#include +#include +#include + +struct ofpbuf; + +/* A datapath interface. Opaque. */ +struct dpif { + unsigned int minor; /* For use in error messages. */ + int fd; +}; + +int dpif_open(const char *name, struct dpif *); +int dpif_create(const char *name, struct dpif *); +void dpif_close(struct dpif *); + +static inline unsigned int dpif_id(const struct dpif *dpif); +int dpif_get_name(struct dpif *, char *name, size_t name_size); + +int dpif_delete(struct dpif *); + +int dpif_get_dp_stats(const struct dpif *, struct odp_stats *); +int dpif_get_drop_frags(const struct dpif *, bool *drop_frags); +int dpif_set_drop_frags(struct dpif *, bool drop_frags); + +int dpif_get_listen_mask(const struct dpif *, int *listen_mask); +int dpif_set_listen_mask(struct dpif *, int listen_mask); +int dpif_purge(struct dpif *); + +int dpif_port_add(struct dpif *, const char *devname, uint16_t port_no, + uint16_t flags); +int dpif_port_del(struct dpif *, uint16_t port_no); +int dpif_port_query_by_number(const struct dpif *, uint16_t port_no, + struct odp_port *); +int dpif_port_query_by_name(const struct dpif *, const char *devname, + struct odp_port *); +int dpif_port_list(const struct dpif *, struct odp_port **, size_t *n_ports); + +int dpif_port_group_set(struct dpif *, uint16_t group, + const uint16_t ports[], size_t n_ports); +int dpif_port_group_get(const struct dpif *, uint16_t group, + uint16_t ports[], size_t n_ports, size_t *n_out); + +int dpif_flow_flush(struct dpif *); +int dpif_flow_put(struct dpif *, struct odp_flow_put *); +int dpif_flow_del(struct dpif *, struct odp_flow *); +int dpif_flow_get(const struct dpif *, struct odp_flow *); +int dpif_flow_get_multiple(const struct dpif *, struct odp_flow[], size_t n); +int dpif_flow_list(const struct dpif *, struct odp_flow[], size_t n, + size_t *n_out); +int dpif_flow_list_all(const struct dpif *, + struct odp_flow **flowsp, size_t *np); + +int dpif_execute(struct dpif *, uint16_t in_port, + const union odp_action[], size_t n_actions, + const struct ofpbuf *); + +int dpif_recv(struct dpif *, struct ofpbuf **); +void dpif_recv_wait(struct dpif *); + +static inline unsigned int +dpif_id(const struct dpif *dpif) +{ + return dpif->minor; +} + +struct dpifmon; + +int dpifmon_create(const char *datapath_name, struct dpifmon **); +void dpifmon_destroy(struct dpifmon *); + +int dpifmon_poll(struct dpifmon *, char **devnamep); + +void dpifmon_run(struct dpifmon *); +void dpifmon_wait(struct dpifmon *); + +#endif /* dpif.h */ diff --git a/lib/dpif.man b/lib/dpif.man new file mode 100644 index 00000000..72175b0a --- /dev/null +++ b/lib/dpif.man @@ -0,0 +1,16 @@ +.RS +.TP +\fBdp\fIN\fR +Datapath number \fIN\fR, where \fIN\fR is a number between 0 and 255, +inclusive. + +.TP +\fIname\fR +The name of the network device associated with the datapath's local +port. (\fB\*(PN\fR internally converts this into a datapath number, +as above.) + +.TP +\fBnl:\fIN\fR +This is an obsolete synonym for \fBdp\fIN\fR. +.RE diff --git a/lib/dynamic-string.c b/lib/dynamic-string.c new file mode 100644 index 00000000..6214c8a1 --- /dev/null +++ b/lib/dynamic-string.c @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "dynamic-string.h" +#include +#include +#include +#include +#include "timeval.h" +#include "util.h" + +void +ds_init(struct ds *ds) +{ + ds->string = NULL; + ds->length = 0; + ds->allocated = 0; +} + +void +ds_clear(struct ds *ds) +{ + ds->length = 0; +} + +void +ds_truncate(struct ds *ds, size_t new_length) +{ + if (ds->length > new_length) { + ds->length = new_length; + ds->string[new_length] = '\0'; + } +} + +void +ds_reserve(struct ds *ds, size_t min_length) +{ + if (min_length > ds->allocated || !ds->string) { + ds->allocated += MAX(min_length, ds->allocated); + ds->allocated = MAX(8, ds->allocated); + ds->string = xrealloc(ds->string, ds->allocated + 1); + } +} + +char * +ds_put_uninit(struct ds *ds, size_t n) +{ + ds_reserve(ds, ds->length + n); + ds->length += n; + ds->string[ds->length] = '\0'; + return &ds->string[ds->length - n]; +} + +void +ds_put_char(struct ds *ds, char c) +{ + *ds_put_uninit(ds, 1) = c; +} + +void +ds_put_char_multiple(struct ds *ds, char c, size_t n) +{ + memset(ds_put_uninit(ds, n), c, n); +} + +void +ds_put_buffer(struct ds *ds, const char *s, size_t n) +{ + memcpy(ds_put_uninit(ds, n), s, n); +} + +void +ds_put_cstr(struct ds *ds, const char *s) +{ + size_t s_len = strlen(s); + memcpy(ds_put_uninit(ds, s_len), s, s_len); +} + +void +ds_put_format(struct ds *ds, const char *format, ...) +{ + va_list args; + + va_start(args, format); + ds_put_format_valist(ds, format, args); + va_end(args); +} + +void +ds_put_format_valist(struct ds *ds, const char *format, va_list args_) +{ + va_list args; + size_t available; + int needed; + + va_copy(args, args_); + available = ds->string ? ds->allocated - ds->length + 1 : 0; + needed = vsnprintf(&ds->string[ds->length], available, format, args); + va_end(args); + + if (needed < available) { + ds->length += needed; + } else { + size_t available; + + ds_reserve(ds, ds->length + needed); + + va_copy(args, args_); + available = ds->allocated - ds->length + 1; + needed = vsnprintf(&ds->string[ds->length], available, format, args); + va_end(args); + + assert(needed < available); + ds->length += needed; + } +} + +void +ds_put_printable(struct ds *ds, const char *s, size_t n) +{ + ds_reserve(ds, ds->length + n); + while (n-- > 0) { + unsigned char c = *s++; + if (c < 0x20 || c > 0x7e || c == '\\' || c == '"') { + ds_put_format(ds, "\\%03o", (int) c); + } else { + ds_put_char(ds, c); + } + } +} + +void +ds_put_strftime(struct ds *ds, const char *template, const struct tm *tm) +{ + if (!tm) { + time_t now = time_now(); + tm = localtime(&now); + } + for (;;) { + size_t avail = ds->string ? ds->allocated - ds->length + 1 : 0; + size_t used = strftime(&ds->string[ds->length], avail, template, tm); + if (used) { + ds->length += used; + return; + } + ds_reserve(ds, ds->length + (avail < 32 ? 64 : 2 * avail)); + } +} + +int +ds_get_line(struct ds *ds, FILE *file) +{ + ds_clear(ds); + for (;;) { + int c = getc(file); + if (c == EOF) { + return ds->length ? 0 : EOF; + } else if (c == '\n') { + return 0; + } else { + ds_put_char(ds, c); + } + } +} + +char * +ds_cstr(struct ds *ds) +{ + if (!ds->string) { + ds_reserve(ds, 0); + } + ds->string[ds->length] = '\0'; + return ds->string; +} + +void +ds_destroy(struct ds *ds) +{ + free(ds->string); +} + +/* Writes the 'size' bytes in 'buf' to 'string' as hex bytes arranged 16 per + * line. Numeric offsets are also included, starting at 'ofs' for the first + * byte in 'buf'. If 'ascii' is true then the corresponding ASCII characters + * are also rendered alongside. */ +void +ds_put_hex_dump(struct ds *ds, const void *buf_, size_t size, + uintptr_t ofs, bool ascii) +{ + const uint8_t *buf = buf_; + const size_t per_line = 16; /* Maximum bytes per line. */ + + while (size > 0) + { + size_t start, end, n; + size_t i; + + /* Number of bytes on this line. */ + start = ofs % per_line; + end = per_line; + if (end - start > size) + end = start + size; + n = end - start; + + /* Print line. */ + ds_put_format(ds, "%08jx ", (uintmax_t) ROUND_DOWN(ofs, per_line)); + for (i = 0; i < start; i++) + ds_put_format(ds, " "); + for (; i < end; i++) + ds_put_format(ds, "%02hhx%c", + buf[i - start], i == per_line / 2 - 1? '-' : ' '); + if (ascii) + { + for (; i < per_line; i++) + ds_put_format(ds, " "); + ds_put_format(ds, "|"); + for (i = 0; i < start; i++) + ds_put_format(ds, " "); + for (; i < end; i++) { + int c = buf[i - start]; + ds_put_char(ds, c >= 32 && c < 127 ? c : '.'); + } + for (; i < per_line; i++) + ds_put_format(ds, " "); + ds_put_format(ds, "|"); + } + ds_put_format(ds, "\n"); + + ofs += n; + buf += n; + size -= n; + } +} + +int +ds_last(const struct ds *ds) +{ + return ds->length > 0 ? (unsigned char) ds->string[ds->length - 1] : EOF; +} + +void +ds_chomp(struct ds *ds, int c) +{ + if (ds->length > 0 && ds->string[ds->length - 1] == (char) c) { + ds->string[--ds->length] = '\0'; + } +} diff --git a/lib/dynamic-string.h b/lib/dynamic-string.h new file mode 100644 index 00000000..30da5fad --- /dev/null +++ b/lib/dynamic-string.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef DYNAMIC_STRING_H +#define DYNAMIC_STRING_H 1 + +#include +#include +#include +#include +#include +#include "compiler.h" + +struct tm; + +struct ds { + char *string; /* Null-terminated string. */ + size_t length; /* Bytes used, not including null terminator. */ + size_t allocated; /* Bytes allocated, not including null terminator. */ +}; + +#define DS_EMPTY_INITIALIZER { NULL, 0, 0 } + +void ds_init(struct ds *); +void ds_clear(struct ds *); +void ds_truncate(struct ds *, size_t new_length); +void ds_reserve(struct ds *, size_t min_length); +char *ds_put_uninit(struct ds *, size_t n); +void ds_put_char(struct ds *, char); +void ds_put_char_multiple(struct ds *, char, size_t n); +void ds_put_buffer(struct ds *, const char *, size_t n); +void ds_put_cstr(struct ds *, const char *); +void ds_put_format(struct ds *, const char *, ...) PRINTF_FORMAT(2, 3); +void ds_put_format_valist(struct ds *, const char *, va_list) + PRINTF_FORMAT(2, 0); +void ds_put_printable(struct ds *, const char *, size_t); +void ds_put_strftime(struct ds *, const char *, const struct tm *) + STRFTIME_FORMAT(2); +void ds_put_hex_dump(struct ds *ds, const void *buf_, size_t size, + uintptr_t ofs, bool ascii); +int ds_get_line(struct ds *, FILE *); + +char *ds_cstr(struct ds *); +void ds_destroy(struct ds *); + +int ds_last(const struct ds *); +void ds_chomp(struct ds *, int c); + +#endif /* dynamic-string.h */ diff --git a/lib/fatal-signal.c b/lib/fatal-signal.c new file mode 100644 index 00000000..ccb7fe45 --- /dev/null +++ b/lib/fatal-signal.c @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#include +#include "fatal-signal.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include "util.h" + +/* Signals to catch. */ +static const int fatal_signals[] = { SIGTERM, SIGINT, SIGHUP, SIGALRM }; + +/* Signals to catch as a sigset_t. */ +static sigset_t fatal_signal_set; + +/* Hooks to call upon catching a signal */ +struct hook { + void (*func)(void *aux); + void *aux; + bool run_at_exit; +}; +#define MAX_HOOKS 32 +static struct hook hooks[MAX_HOOKS]; +static size_t n_hooks; + +/* Number of nesting signal blockers. */ +static int block_level = 0; + +/* Signal mask saved by outermost signal blocker. */ +static sigset_t saved_signal_mask; + +/* Disabled by fatal_signal_fork()? */ +static bool disabled; + +static void call_sigprocmask(int how, sigset_t* new_set, sigset_t* old_set); +static void atexit_handler(void); +static void call_hooks(int sig_nr); + +/* Registers 'hook' to be called when a process termination signal is raised. + * If 'run_at_exit' is true, 'hook' is also called during normal process + * termination, e.g. when exit() is called or when main() returns. */ +void +fatal_signal_add_hook(void (*func)(void *aux), void *aux, bool run_at_exit) +{ + fatal_signal_block(); + assert(n_hooks < MAX_HOOKS); + hooks[n_hooks].func = func; + hooks[n_hooks].aux = aux; + hooks[n_hooks].run_at_exit = run_at_exit; + n_hooks++; + fatal_signal_unblock(); +} + +/* Blocks program termination signals until fatal_signal_unblock() is called. + * May be called multiple times with nesting; if so, fatal_signal_unblock() + * must be called the same number of times to unblock signals. + * + * This is needed while adjusting a data structure that will be accessed by a + * fatal signal hook, so that the hook is not invoked while the data structure + * is in an inconsistent state. */ +void +fatal_signal_block(void) +{ + static bool inited = false; + if (!inited) { + size_t i; + + inited = true; + sigemptyset(&fatal_signal_set); + for (i = 0; i < ARRAY_SIZE(fatal_signals); i++) { + int sig_nr = fatal_signals[i]; + struct sigaction old_sa; + + sigaddset(&fatal_signal_set, sig_nr); + if (sigaction(sig_nr, NULL, &old_sa)) { + ovs_fatal(errno, "sigaction"); + } + if (old_sa.sa_handler == SIG_DFL + && signal(sig_nr, fatal_signal_handler) == SIG_ERR) { + ovs_fatal(errno, "signal"); + } + } + atexit(atexit_handler); + } + + if (++block_level == 1) { + call_sigprocmask(SIG_BLOCK, &fatal_signal_set, &saved_signal_mask); + } +} + +/* Unblocks program termination signals blocked by fatal_signal_block() is + * called. If multiple calls to fatal_signal_block() are nested, + * fatal_signal_unblock() must be called the same number of times to unblock + * signals. */ +void +fatal_signal_unblock(void) +{ + assert(block_level > 0); + if (--block_level == 0) { + call_sigprocmask(SIG_SETMASK, &saved_signal_mask, NULL); + } +} + +/* Handles fatal signal number 'sig_nr'. + * + * Ordinarily this is the actual signal handler. When other code needs to + * handle one of our signals, however, it can register for that signal and, if + * and when necessary, call this function to do fatal signal processing for it + * and terminate the process. Currently only timeval.c does this, for SIGALRM. + * (It is not important whether the other code sets up its signal handler + * before or after this file, because this file will only set up a signal + * handler in the case where the signal has its default handling.) */ +void +fatal_signal_handler(int sig_nr) +{ + call_hooks(sig_nr); + + /* Re-raise the signal with the default handling so that the program + * termination status reflects that we were killed by this signal */ + signal(sig_nr, SIG_DFL); + raise(sig_nr); +} + +static void +atexit_handler(void) +{ + if (!disabled) { + call_hooks(0); + } +} + +static void +call_hooks(int sig_nr) +{ + static volatile sig_atomic_t recurse = 0; + if (!recurse) { + size_t i; + + recurse = 1; + + for (i = 0; i < n_hooks; i++) { + struct hook *h = &hooks[i]; + if (sig_nr || h->run_at_exit) { + h->func(h->aux); + } + } + } +} + +static char **files; +static size_t n_files, max_files; + +static void unlink_files(void *aux); +static void do_unlink_files(void); + +/* Registers 'file' to be unlinked when the program terminates via exit() or a + * fatal signal. */ +void +fatal_signal_add_file_to_unlink(const char *file) +{ + static bool added_hook = false; + if (!added_hook) { + added_hook = true; + fatal_signal_add_hook(unlink_files, NULL, true); + } + + fatal_signal_block(); + if (n_files >= max_files) { + files = x2nrealloc(files, &max_files, sizeof *files); + } + files[n_files++] = xstrdup(file); + fatal_signal_unblock(); +} + +/* Unregisters 'file' from being unlinked when the program terminates via + * exit() or a fatal signal. */ +void +fatal_signal_remove_file_to_unlink(const char *file) +{ + size_t i; + + fatal_signal_block(); + for (i = 0; i < n_files; i++) { + if (!strcmp(files[i], file)) { + free(files[i]); + files[i] = files[--n_files]; + break; + } + } + fatal_signal_unblock(); +} + +static void +unlink_files(void *aux UNUSED) +{ + do_unlink_files(); +} + +static void +do_unlink_files(void) +{ + size_t i; + + for (i = 0; i < n_files; i++) { + unlink(files[i]); + } +} + +/* Disables the fatal signal hook mechanism. Following a fork, one of the + * resulting processes can call this function to allow it to terminate without + * triggering fatal signal processing or removing files. Fatal signal + * processing is still enabled in the other process. */ +void +fatal_signal_fork(void) +{ + size_t i; + + disabled = true; + + for (i = 0; i < ARRAY_SIZE(fatal_signals); i++) { + int sig_nr = fatal_signals[i]; + if (signal(sig_nr, SIG_DFL) == SIG_IGN) { + signal(sig_nr, SIG_IGN); + } + } +} + +static void +call_sigprocmask(int how, sigset_t* new_set, sigset_t* old_set) +{ + int error = sigprocmask(how, new_set, old_set); + if (error) { + fprintf(stderr, "sigprocmask: %s\n", strerror(errno)); + } +} diff --git a/lib/fatal-signal.h b/lib/fatal-signal.h new file mode 100644 index 00000000..a92b10dd --- /dev/null +++ b/lib/fatal-signal.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef FATAL_SIGNAL_H +#define FATAL_SIGNAL_H 1 + +#include + +/* Basic interface. */ +void fatal_signal_add_hook(void (*)(void *aux), void *aux, bool run_at_exit); +void fatal_signal_block(void); +void fatal_signal_unblock(void); +void fatal_signal_fork(void); + +/* Convenience functions for unlinking files upon termination. + * + * These functions also unlink the files upon normal process termination via + * exit(). */ +void fatal_signal_add_file_to_unlink(const char *); +void fatal_signal_remove_file_to_unlink(const char *); + +/* Interface for other code that catches one of our signals and needs to pass + * it through. */ +void fatal_signal_handler(int sig_nr); + +#endif /* fatal-signal.h */ diff --git a/lib/fault.c b/lib/fault.c new file mode 100644 index 00000000..6a0ff143 --- /dev/null +++ b/lib/fault.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "fault.h" +#include +#include +#include +#include +#include +#include +#include "util.h" + +#include "vlog.h" +#define THIS_MODULE VLM_fault + +static void +fault_handler(int sig_nr) +{ + VLOG_EMER("Caught signal %d.", sig_nr); + log_backtrace(); + fflush(stdout); + fflush(stderr); + + signal(sig_nr, SIG_DFL); + raise(sig_nr); +} + +void +log_backtrace(void) +{ + /* During the loop: + + frame[0] points to the next frame. + frame[1] points to the return address. */ + void **frame; + for (frame = __builtin_frame_address(0); + frame != NULL && frame[0] != NULL; + frame = frame[0]) { + Dl_info addrinfo; + if (!dladdr(frame[1], &addrinfo) || !addrinfo.dli_sname) { + fprintf(stderr, " 0x%08"PRIxPTR"\n", (uintptr_t) frame[1]); + } else { + fprintf(stderr, " 0x%08"PRIxPTR" (%s+0x%x)\n", + (uintptr_t) frame[1], addrinfo.dli_sname, + (char *) frame[1] - (char *) addrinfo.dli_saddr); + } + } + fflush(stderr); +} + +void +register_fault_handlers(void) +{ + signal(SIGABRT, fault_handler); + signal(SIGBUS, fault_handler); + signal(SIGFPE, fault_handler); + signal(SIGILL, fault_handler); + signal(SIGSEGV, fault_handler); +} diff --git a/lib/fault.h b/lib/fault.h new file mode 100644 index 00000000..fa984d40 --- /dev/null +++ b/lib/fault.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef FAULT_H +#define FAULT_H 1 + +void register_fault_handlers(void); +void log_backtrace(void); + +#endif /* fault.h */ diff --git a/lib/flow.c b/lib/flow.c new file mode 100644 index 00000000..e55f4fe3 --- /dev/null +++ b/lib/flow.c @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#include +#include +#include "flow.h" +#include +#include +#include +#include +#include "coverage.h" +#include "dynamic-string.h" +#include "hash.h" +#include "ofpbuf.h" +#include "openflow/openflow.h" +#include "openvswitch/datapath-protocol.h" +#include "packets.h" + +#include "vlog.h" +#define THIS_MODULE VLM_flow + +static struct ip_header * +pull_ip(struct ofpbuf *packet) +{ + if (packet->size >= IP_HEADER_LEN) { + struct ip_header *ip = packet->data; + int ip_len = IP_IHL(ip->ip_ihl_ver) * 4; + if (ip_len >= IP_HEADER_LEN && packet->size >= ip_len) { + return ofpbuf_pull(packet, ip_len); + } + } + return NULL; +} + +static struct tcp_header * +pull_tcp(struct ofpbuf *packet) +{ + if (packet->size >= TCP_HEADER_LEN) { + struct tcp_header *tcp = packet->data; + int tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4; + if (tcp_len >= TCP_HEADER_LEN && packet->size >= tcp_len) { + return ofpbuf_pull(packet, tcp_len); + } + } + return NULL; +} + +static struct udp_header * +pull_udp(struct ofpbuf *packet) +{ + return ofpbuf_try_pull(packet, UDP_HEADER_LEN); +} + +static struct icmp_header * +pull_icmp(struct ofpbuf *packet) +{ + return ofpbuf_try_pull(packet, ICMP_HEADER_LEN); +} + +static struct eth_header * +pull_eth(struct ofpbuf *packet) +{ + return ofpbuf_try_pull(packet, ETH_HEADER_LEN); +} + +static struct vlan_header * +pull_vlan(struct ofpbuf *packet) +{ + return ofpbuf_try_pull(packet, VLAN_HEADER_LEN); +} + +/* Returns 1 if 'packet' is an IP fragment, 0 otherwise. */ +int +flow_extract(struct ofpbuf *packet, uint16_t in_port, flow_t *flow) +{ + struct ofpbuf b = *packet; + struct eth_header *eth; + int retval = 0; + + COVERAGE_INC(flow_extract); + + memset(flow, 0, sizeof *flow); + flow->dl_vlan = htons(OFP_VLAN_NONE); + flow->in_port = in_port; + + packet->l2 = b.data; + packet->l3 = NULL; + packet->l4 = NULL; + packet->l7 = NULL; + + eth = pull_eth(&b); + if (eth) { + if (ntohs(eth->eth_type) >= OFP_DL_TYPE_ETH2_CUTOFF) { + /* This is an Ethernet II frame */ + flow->dl_type = eth->eth_type; + } else { + /* This is an 802.2 frame */ + struct llc_header *llc = ofpbuf_at(&b, 0, sizeof *llc); + struct snap_header *snap = ofpbuf_at(&b, sizeof *llc, + sizeof *snap); + if (llc == NULL) { + return 0; + } + if (snap + && llc->llc_dsap == LLC_DSAP_SNAP + && llc->llc_ssap == LLC_SSAP_SNAP + && llc->llc_cntl == LLC_CNTL_SNAP + && !memcmp(snap->snap_org, SNAP_ORG_ETHERNET, + sizeof snap->snap_org)) { + flow->dl_type = snap->snap_type; + ofpbuf_pull(&b, LLC_SNAP_HEADER_LEN); + } else { + flow->dl_type = htons(OFP_DL_TYPE_NOT_ETH_TYPE); + ofpbuf_pull(&b, sizeof(struct llc_header)); + } + } + + /* Check for a VLAN tag */ + if (flow->dl_type == htons(ETH_TYPE_VLAN)) { + struct vlan_header *vh = pull_vlan(&b); + if (vh) { + flow->dl_type = vh->vlan_next_type; + flow->dl_vlan = vh->vlan_tci & htons(VLAN_VID_MASK); + } + } + memcpy(flow->dl_src, eth->eth_src, ETH_ADDR_LEN); + memcpy(flow->dl_dst, eth->eth_dst, ETH_ADDR_LEN); + + packet->l3 = b.data; + if (flow->dl_type == htons(ETH_TYPE_IP)) { + const struct ip_header *nh = pull_ip(&b); + if (nh) { + flow->nw_src = nh->ip_src; + flow->nw_dst = nh->ip_dst; + flow->nw_proto = nh->ip_proto; + packet->l4 = b.data; + if (!IP_IS_FRAGMENT(nh->ip_frag_off)) { + if (flow->nw_proto == IP_TYPE_TCP) { + const struct tcp_header *tcp = pull_tcp(&b); + if (tcp) { + flow->tp_src = tcp->tcp_src; + flow->tp_dst = tcp->tcp_dst; + packet->l7 = b.data; + } else { + /* Avoid tricking other code into thinking that + * this packet has an L4 header. */ + flow->nw_proto = 0; + } + } else if (flow->nw_proto == IP_TYPE_UDP) { + const struct udp_header *udp = pull_udp(&b); + if (udp) { + flow->tp_src = udp->udp_src; + flow->tp_dst = udp->udp_dst; + packet->l7 = b.data; + } else { + /* Avoid tricking other code into thinking that + * this packet has an L4 header. */ + flow->nw_proto = 0; + } + } else if (flow->nw_proto == IP_TYPE_ICMP) { + const struct icmp_header *icmp = pull_icmp(&b); + if (icmp) { + flow->icmp_type = htons(icmp->icmp_type); + flow->icmp_code = htons(icmp->icmp_code); + packet->l7 = b.data; + } else { + /* Avoid tricking other code into thinking that + * this packet has an L4 header. */ + flow->nw_proto = 0; + } + } + } else { + retval = 1; + } + } + } + } + return retval; +} + +/* Extracts the flow stats for a packet. The 'flow' and 'packet' + * arguments must have been initialized through a call to flow_extract(). + */ +void +flow_extract_stats(const flow_t *flow, struct ofpbuf *packet, + struct odp_flow_stats *stats) +{ + memset(stats, '\0', sizeof(*stats)); + + if ((flow->dl_type == htons(ETH_TYPE_IP)) && packet->l4) { + struct ip_header *ip = packet->l3; + stats->ip_tos = ip->ip_tos; + if ((flow->nw_proto == IP_TYPE_TCP) && packet->l7) { + struct tcp_header *tcp = packet->l4; + stats->tcp_flags = TCP_FLAGS(tcp->tcp_ctl); + } + } + + stats->n_bytes = packet->size; + stats->n_packets = 1; +} + +void +flow_to_match(const flow_t *flow, uint32_t wildcards, struct ofp_match *match) +{ + match->wildcards = htonl(wildcards); + match->in_port = htons(flow->in_port == ODPP_LOCAL ? OFPP_LOCAL + : flow->in_port); + match->dl_vlan = flow->dl_vlan; + memcpy(match->dl_src, flow->dl_src, ETH_ADDR_LEN); + memcpy(match->dl_dst, flow->dl_dst, ETH_ADDR_LEN); + match->dl_type = flow->dl_type; + match->nw_src = flow->nw_src; + match->nw_dst = flow->nw_dst; + match->nw_proto = flow->nw_proto; + match->tp_src = flow->tp_src; + match->tp_dst = flow->tp_dst; + match->pad = 0; +} + +void +flow_from_match(flow_t *flow, uint32_t *wildcards, + const struct ofp_match *match) +{ + if (wildcards) { + *wildcards = ntohl(match->wildcards); + } + flow->nw_src = match->nw_src; + flow->nw_dst = match->nw_dst; + flow->in_port = (match->in_port == htons(OFPP_LOCAL) ? ODPP_LOCAL + : ntohs(match->in_port)); + flow->dl_vlan = match->dl_vlan; + flow->dl_type = match->dl_type; + flow->tp_src = match->tp_src; + flow->tp_dst = match->tp_dst; + memcpy(flow->dl_src, match->dl_src, ETH_ADDR_LEN); + memcpy(flow->dl_dst, match->dl_dst, ETH_ADDR_LEN); + flow->nw_proto = match->nw_proto; + flow->reserved = 0; +} + +char * +flow_to_string(const flow_t *flow) +{ + struct ds ds = DS_EMPTY_INITIALIZER; + flow_format(&ds, flow); + return ds_cstr(&ds); +} + +void +flow_format(struct ds *ds, const flow_t *flow) +{ + ds_put_format(ds, "port%04x:vlan%d mac"ETH_ADDR_FMT"->"ETH_ADDR_FMT" " + "type%04x proto%"PRId8" ip"IP_FMT"->"IP_FMT" port%d->%d", + flow->in_port, ntohs(flow->dl_vlan), + ETH_ADDR_ARGS(flow->dl_src), ETH_ADDR_ARGS(flow->dl_dst), + ntohs(flow->dl_type), flow->nw_proto, + IP_ARGS(&flow->nw_src), IP_ARGS(&flow->nw_dst), + ntohs(flow->tp_src), ntohs(flow->tp_dst)); +} + +void +flow_print(FILE *stream, const flow_t *flow) +{ + char *s = flow_to_string(flow); + fputs(s, stream); + free(s); +} diff --git a/lib/flow.h b/lib/flow.h new file mode 100644 index 00000000..c7ec7701 --- /dev/null +++ b/lib/flow.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#ifndef FLOW_H +#define FLOW_H 1 + +#include +#include +#include +#include +#include "openflow/openflow.h" +#include "hash.h" +#include "openflow/openflow.h" +#include "openvswitch/datapath-protocol.h" +#include "util.h" + +struct ds; +struct ofp_match; +struct ofpbuf; + +typedef struct odp_flow_key flow_t; + +int flow_extract(struct ofpbuf *, uint16_t in_port, flow_t *); +void flow_extract_stats(const flow_t *flow, struct ofpbuf *packet, + struct odp_flow_stats *stats); +void flow_to_match(const flow_t *, uint32_t wildcards, struct ofp_match *); +void flow_from_match(flow_t *, uint32_t *wildcards, const struct ofp_match *); +char *flow_to_string(const flow_t *); +void flow_format(struct ds *, const flow_t *); +void flow_print(FILE *, const flow_t *); +static inline int flow_compare(const flow_t *, const flow_t *); +static inline bool flow_equal(const flow_t *, const flow_t *); +static inline size_t flow_hash(const flow_t *, uint32_t basis); + +static inline int +flow_compare(const flow_t *a, const flow_t *b) +{ + return memcmp(a, b, sizeof *a); +} + +static inline bool +flow_equal(const flow_t *a, const flow_t *b) +{ + return !flow_compare(a, b); +} + +static inline size_t +flow_hash(const flow_t *flow, uint32_t basis) +{ + BUILD_ASSERT_DECL(!(sizeof *flow % sizeof(uint32_t))); + return hash_words((const uint32_t *) flow, + sizeof *flow / sizeof(uint32_t), basis); +} + +/* Information on wildcards for a flow, as a supplement to flow_t. */ +struct flow_wildcards { + uint32_t wildcards; /* enum ofp_flow_wildcards (in host order). */ + uint32_t nw_src_mask; /* 1-bit in each significant nw_src bit. */ + uint32_t nw_dst_mask; /* 1-bit in each significant nw_dst bit. */ +}; + +/* Given the wildcard bit count in bits 'shift' through 'shift + 5' (inclusive) + * of 'wildcards', returns a 32-bit bit mask with a 1 in each bit that must + * match and a 0 in each bit that is wildcarded. + * + * The bits in 'wildcards' are in the format used in enum ofp_flow_wildcards: 0 + * is exact match, 1 ignores the LSB, 2 ignores the 2 least-significant bits, + * ..., 32 and higher wildcard the entire field. This is the *opposite* of the + * usual convention where e.g. /24 indicates that 8 bits (not 24 bits) are + * wildcarded. + * + * 'wildcards' is in host byte order. The return value is in network byte + * order. */ +static inline uint32_t +flow_nw_bits_to_mask(uint32_t wildcards, int shift) +{ + wildcards = (wildcards >> shift) & 0x3f; + return wildcards < 32 ? htonl(~((1u << wildcards) - 1)) : 0; +} + +static inline void +flow_wildcards_init(struct flow_wildcards *wc, uint32_t wildcards) +{ + wc->wildcards = wildcards & OFPFW_ALL; + wc->nw_src_mask = flow_nw_bits_to_mask(wc->wildcards, OFPFW_NW_SRC_SHIFT); + wc->nw_dst_mask = flow_nw_bits_to_mask(wc->wildcards, OFPFW_NW_DST_SHIFT); +} + +#endif /* flow.h */ diff --git a/lib/hash.c b/lib/hash.c new file mode 100644 index 00000000..af8f21bb --- /dev/null +++ b/lib/hash.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#include +#include "hash.h" +#include + +/* Returns the hash of the 'n' 32-bit words at 'p', starting from 'basis'. + * 'p' must be properly aligned. */ +uint32_t +hash_words(const uint32_t *p, size_t n, uint32_t basis) +{ + uint32_t a, b, c; + + a = b = c = 0xdeadbeef + (((uint32_t) n) << 2) + basis; + + while (n > 3) { + a += p[0]; + b += p[1]; + c += p[2]; + HASH_MIX(a, b, c); + n -= 3; + p += 3; + } + + switch (n) { + case 3: + c += p[2]; + /* fall through */ + case 2: + b += p[1]; + /* fall through */ + case 1: + a += p[0]; + HASH_FINAL(a, b, c); + /* fall through */ + case 0: + break; + } + return c; +} + +/* Returns the hash of the 'n' bytes at 'p', starting from 'basis'. */ +uint32_t +hash_bytes(const void *p_, size_t n, uint32_t basis) +{ + const uint8_t *p = p_; + uint32_t a, b, c; + uint32_t tmp[3]; + + a = b = c = 0xdeadbeef + n + basis; + + while (n >= sizeof tmp) { + memcpy(tmp, p, sizeof tmp); + a += tmp[0]; + b += tmp[1]; + c += tmp[2]; + HASH_MIX(a, b, c); + n -= sizeof tmp; + p += sizeof tmp; + } + + if (n) { + tmp[0] = tmp[1] = tmp[2] = 0; + memcpy(tmp, p, n); + a += tmp[0]; + b += tmp[1]; + c += tmp[2]; + HASH_FINAL(a, b, c); + } + + return c; +} diff --git a/lib/hash.h b/lib/hash.h new file mode 100644 index 00000000..39894d9a --- /dev/null +++ b/lib/hash.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#ifndef HASH_H +#define HASH_H 1 + +#include +#include +#include + +/* This is the public domain lookup3 hash by Bob Jenkins from + * http://burtleburtle.net/bob/c/lookup3.c, modified for style. */ + +#define HASH_ROT(x, k) (((x) << (k)) | ((x) >> (32 - (k)))) + +#define HASH_MIX(a, b, c) \ + do { \ + a -= c; a ^= HASH_ROT(c, 4); c += b; \ + b -= a; b ^= HASH_ROT(a, 6); a += c; \ + c -= b; c ^= HASH_ROT(b, 8); b += a; \ + a -= c; a ^= HASH_ROT(c, 16); c += b; \ + b -= a; b ^= HASH_ROT(a, 19); a += c; \ + c -= b; c ^= HASH_ROT(b, 4); b += a; \ + } while (0) + +#define HASH_FINAL(a, b, c) \ + do { \ + c ^= b; c -= HASH_ROT(b, 14); \ + a ^= c; a -= HASH_ROT(c, 11); \ + b ^= a; b -= HASH_ROT(a, 25); \ + c ^= b; c -= HASH_ROT(b, 16); \ + a ^= c; a -= HASH_ROT(c, 4); \ + b ^= a; b -= HASH_ROT(a, 14); \ + c ^= b; c -= HASH_ROT(b, 24); \ + } while (0) + +uint32_t hash_words(const uint32_t *, size_t n_word, uint32_t basis); +uint32_t hash_bytes(const void *, size_t n_bytes, uint32_t basis); + +static inline uint32_t hash_string(const char *s, uint32_t basis) +{ + return hash_bytes(s, strlen(s), basis); +} + +/* This is Bob Jenkins' integer hash from + * http://burtleburtle.net/bob/hash/integer.html, modified for style. */ +static inline uint32_t hash_int(uint32_t x, uint32_t basis) +{ + x -= x << 6; + x ^= x >> 17; + x -= x << 9; + x ^= x << 4; + x -= x << 3; + x ^= x << 10; + x ^= x >> 15; + return x + basis; +} + +#endif /* hash.h */ diff --git a/lib/hmap.c b/lib/hmap.c new file mode 100644 index 00000000..ea08ab80 --- /dev/null +++ b/lib/hmap.c @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "hmap.h" +#include +#include +#include "coverage.h" +#include "util.h" + +/* Initializes 'hmap' as an empty hash table. */ +void +hmap_init(struct hmap *hmap) +{ + hmap->buckets = &hmap->one; + hmap->one = NULL; + hmap->mask = 0; + hmap->n = 0; +} + +/* Frees memory reserved by 'hmap'. It is the client's responsibility to free + * the nodes themselves, if necessary. */ +void +hmap_destroy(struct hmap *hmap) +{ + if (hmap && hmap->buckets != &hmap->one) { + free(hmap->buckets); + } +} + +/* Exchanges hash maps 'a' and 'b'. */ +void +hmap_swap(struct hmap *a, struct hmap *b) +{ + struct hmap tmp = *a; + *a = *b; + *b = tmp; + if (a->buckets == &b->one) { + a->buckets = &a->one; + } + if (b->buckets == &a->one) { + b->buckets = &b->one; + } +} + +static void +resize(struct hmap *hmap, size_t new_mask) +{ + struct hmap tmp; + size_t i; + + assert(!(new_mask & (new_mask + 1))); + assert(new_mask != SIZE_MAX); + + hmap_init(&tmp); + if (new_mask) { + tmp.buckets = xmalloc(sizeof *tmp.buckets * (new_mask + 1)); + tmp.mask = new_mask; + for (i = 0; i <= tmp.mask; i++) { + tmp.buckets[i] = NULL; + } + } + for (i = 0; i <= hmap->mask; i++) { + struct hmap_node *node, *next; + int count = 0; + for (node = hmap->buckets[i]; node; node = next) { + next = node->next; + hmap_insert_fast(&tmp, node, node->hash); + count++; + } + if (count > 5) { + COVERAGE_INC(hmap_pathological); + } + } + hmap_swap(hmap, &tmp); + hmap_destroy(&tmp); +} + +static size_t +calc_mask(size_t capacity) +{ + size_t mask = capacity / 2; + mask |= mask >> 1; + mask |= mask >> 2; + mask |= mask >> 4; + mask |= mask >> 8; + mask |= mask >> 16; +#if SIZE_MAX > UINT32_MAX + mask |= mask >> 32; +#endif + + /* If we need to dynamically allocate buckets we might as well allocate at + * least 4 of them. */ + mask |= (mask & 1) << 1; + + return mask; +} + +/* Expands 'hmap', if necessary, to optimize the performance of searches. */ +void +hmap_expand(struct hmap *hmap) +{ + size_t new_mask = calc_mask(hmap->n); + if (new_mask > hmap->mask) { + COVERAGE_INC(hmap_expand); + resize(hmap, new_mask); + } +} + +/* Shrinks 'hmap', if necessary, to optimize the performance of iteration. */ +void +hmap_shrink(struct hmap *hmap) +{ + size_t new_mask = calc_mask(hmap->n); + if (new_mask < hmap->mask) { + COVERAGE_INC(hmap_shrink); + resize(hmap, new_mask); + } +} + +/* Expands 'hmap', if necessary, to optimize the performance of searches when + * it has up to 'n' elements. (But iteration will be slow in a hash map whose + * allocated capacity is much higher than its current number of nodes.) */ +void +hmap_reserve(struct hmap *hmap, size_t n) +{ + size_t new_mask = calc_mask(n); + if (new_mask > hmap->mask) { + COVERAGE_INC(hmap_reserve); + resize(hmap, new_mask); + } +} diff --git a/lib/hmap.h b/lib/hmap.h new file mode 100644 index 00000000..7f4b00fc --- /dev/null +++ b/lib/hmap.h @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef HMAP_H +#define HMAP_H 1 + +#include +#include +#include "util.h" + +/* A hash map node, to be embedded inside the data structure being mapped. */ +struct hmap_node { + size_t hash; /* Hash value. */ + struct hmap_node *next; /* Next in linked list. */ +}; + +/* Returns the hash value embedded in 'node'. */ +static inline size_t hmap_node_hash(const struct hmap_node *node) +{ + return node->hash; +} + +/* A hash map. */ +struct hmap { + struct hmap_node **buckets; + struct hmap_node *one; + size_t mask; + size_t n; +}; + +/* Initializer for an empty hash map. */ +#define HMAP_INITIALIZER(HMAP) { &(HMAP)->one, NULL, 0, 0 } + +/* Initialization. */ +void hmap_init(struct hmap *); +void hmap_destroy(struct hmap *); +void hmap_swap(struct hmap *a, struct hmap *b); +static inline size_t hmap_count(const struct hmap *); +static inline bool hmap_is_empty(const struct hmap *); + +/* Adjusting capacity. */ +void hmap_expand(struct hmap *); +void hmap_shrink(struct hmap *); +void hmap_reserve(struct hmap *, size_t capacity); + +/* Insertion and deletion. */ +static inline void hmap_insert_fast(struct hmap *, + struct hmap_node *, size_t hash); +static inline void hmap_insert(struct hmap *, struct hmap_node *, size_t hash); +static inline void hmap_remove(struct hmap *, struct hmap_node *); +static inline void hmap_moved(struct hmap *, + struct hmap_node *, struct hmap_node *); +static inline void hmap_replace(struct hmap *, const struct hmap_node *old, + struct hmap_node *new); + +/* Search. */ +#define HMAP_FOR_EACH_WITH_HASH(NODE, STRUCT, MEMBER, HASH, HMAP) \ + for ((NODE) = CONTAINER_OF(hmap_first_with_hash(HMAP, HASH), \ + STRUCT, MEMBER); \ + &(NODE)->MEMBER != NULL; \ + (NODE) = CONTAINER_OF(hmap_next_with_hash(&(NODE)->MEMBER), \ + STRUCT, MEMBER)) + +static inline struct hmap_node *hmap_first_with_hash(const struct hmap *, + size_t hash); +static inline struct hmap_node *hmap_next_with_hash(const struct hmap_node *); + +/* Iteration. + * + * The _SAFE version is needed when NODE may be freed. It is not needed when + * NODE may be removed from the hash map but its members remain accessible and + * intact. */ +#define HMAP_FOR_EACH(NODE, STRUCT, MEMBER, HMAP) \ + for ((NODE) = CONTAINER_OF(hmap_first(HMAP), STRUCT, MEMBER); \ + &(NODE)->MEMBER != NULL; \ + (NODE) = CONTAINER_OF(hmap_next(HMAP, &(NODE)->MEMBER), \ + STRUCT, MEMBER)) + +#define HMAP_FOR_EACH_SAFE(NODE, NEXT, STRUCT, MEMBER, HMAP) \ + for ((NODE) = CONTAINER_OF(hmap_first(HMAP), STRUCT, MEMBER); \ + (&(NODE)->MEMBER != NULL \ + ? (NEXT) = CONTAINER_OF(hmap_next(HMAP, &(NODE)->MEMBER), \ + STRUCT, MEMBER), 1 \ + : 0); \ + (NODE) = (NEXT)) + +static inline struct hmap_node *hmap_first(const struct hmap *); +static inline struct hmap_node *hmap_next(const struct hmap *, + const struct hmap_node *); + +/* Returns the number of nodes currently in 'hmap'. */ +static inline size_t +hmap_count(const struct hmap *hmap) +{ + return hmap->n; +} + +/* Returns true if 'hmap' currently contains no nodes, + * false otherwise. */ +static inline bool +hmap_is_empty(const struct hmap *hmap) +{ + return hmap->n == 0; +} + +/* Inserts 'node', with the given 'hash', into 'hmap'. 'hmap' is never + * expanded automatically. */ +static inline void +hmap_insert_fast(struct hmap *hmap, struct hmap_node *node, size_t hash) +{ + struct hmap_node **bucket = &hmap->buckets[hash & hmap->mask]; + node->hash = hash; + node->next = *bucket; + *bucket = node; + hmap->n++; +} + +/* Inserts 'node', with the given 'hash', into 'hmap', and expands 'hmap' if + * necessary to optimize search performance. */ +static inline void +hmap_insert(struct hmap *hmap, struct hmap_node *node, size_t hash) +{ + hmap_insert_fast(hmap, node, hash); + if (hmap->n / 2 > hmap->mask) { + hmap_expand(hmap); + } +} + +/* Removes 'node' from 'hmap'. Does not shrink the hash table; call + * hmap_shrink() directly if desired. */ +static inline void +hmap_remove(struct hmap *hmap, struct hmap_node *node) +{ + struct hmap_node **bucket = &hmap->buckets[node->hash & hmap->mask]; + while (*bucket != node) { + bucket = &(*bucket)->next; + } + *bucket = node->next; + hmap->n--; +} + +/* Adjusts 'hmap' to compensate for 'old_node' having moved position in memory + * to 'node' (e.g. due to realloc()). */ +static inline void +hmap_moved(struct hmap *hmap, + struct hmap_node *old_node, struct hmap_node *node) +{ + struct hmap_node **bucket = &hmap->buckets[node->hash & hmap->mask]; + while (*bucket != old_node) { + bucket = &(*bucket)->next; + } + *bucket = node; +} + +/* Puts 'new' in the position in 'hmap' currently occupied by 'old'. The 'new' + * node must hash to the same value as 'old'. The client is responsible for + * ensuring that the replacement does not violate any client-imposed + * invariants (e.g. uniqueness of keys within a map). + * + * Afterward, 'old' is not part of 'hmap', and the client is responsible for + * freeing it (if this is desirable). */ +static inline void +hmap_replace(struct hmap *hmap, + const struct hmap_node *old, struct hmap_node *new) +{ + struct hmap_node **bucket = &hmap->buckets[old->hash & hmap->mask]; + while (*bucket != old) { + bucket = &(*bucket)->next; + } + *bucket = new; + new->hash = old->hash; +} + +static inline struct hmap_node * +hmap_next_with_hash__(const struct hmap_node *node, size_t hash) +{ + while (node != NULL && node->hash != hash) { + node = node->next; + } + return (struct hmap_node *) node; +} + +/* Returns the first node in 'hmap' with the given 'hash', or a null pointer if + * no nodes have that hash value. */ +static inline struct hmap_node * +hmap_first_with_hash(const struct hmap *hmap, size_t hash) +{ + return hmap_next_with_hash__(hmap->buckets[hash & hmap->mask], hash); +} + +/* Returns the next node in the same hash map as 'node' with the same hash + * value, or a null pointer if no more nodes have that hash value. + * + * If the hash map has been reallocated since 'node' was visited, some nodes + * may be skipped; if new nodes with the same hash value have been added, they + * will be skipped. (Removing 'node' from the hash map does not prevent + * calling this function, since node->next is preserved, although freeing + * 'node' of course does.) */ +static inline struct hmap_node * +hmap_next_with_hash(const struct hmap_node *node) +{ + return hmap_next_with_hash__(node->next, node->hash); +} + +static inline struct hmap_node * +hmap_next__(const struct hmap *hmap, size_t start) +{ + size_t i; + for (i = start; i <= hmap->mask; i++) { + struct hmap_node *node = hmap->buckets[i]; + if (node) { + return node; + } + } + return NULL; +} + +/* Returns the first node in 'hmap', in arbitrary order, or a null pointer if + * 'hmap' is empty. */ +static inline struct hmap_node * +hmap_first(const struct hmap *hmap) +{ + return hmap_next__(hmap, 0); +} + +/* Returns the next node in 'hmap' following 'node', in arbitrary order, or a + * null pointer if 'node' is the last node in 'hmap'. + * + * If the hash map has been reallocated since 'node' was visited, some nodes + * may be skipped or visited twice. (Removing 'node' from the hash map does + * not prevent calling this function, since node->next is preserved, although + * freeing 'node' of course does.) */ +static inline struct hmap_node * +hmap_next(const struct hmap *hmap, const struct hmap_node *node) +{ + return (node->next + ? node->next + : hmap_next__(hmap, (node->hash & hmap->mask) + 1)); +} + +#endif /* hmap.h */ diff --git a/lib/leak-checker.c b/lib/leak-checker.c new file mode 100644 index 00000000..a25fa71d --- /dev/null +++ b/lib/leak-checker.c @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "leak-checker.h" +#include +#include "backtrace.h" + +#define THIS_MODULE VLM_leak_checker +#include "vlog.h" + +#ifndef HAVE_MALLOC_HOOKS +void +leak_checker_start(const char *file_name UNUSED) +{ + VLOG_WARN("not enabling leak checker because the libc in use does not " + "have the required hooks"); +} + +void +leak_checker_set_limit(off_t max_size UNUSED) +{ +} + +void +leak_checker_claim(const void *p UNUSED) +{ +} + +void +leak_checker_usage(void) +{ + printf(" --check-leaks=FILE (accepted but ignored in this build)\n"); +} +#else /* HAVE_MALLOC_HOOKS */ +#include +#include +#include +#include + +typedef void *malloc_hook_type(size_t, const void *); +typedef void *realloc_hook_type(void *, size_t, const void *); +typedef void free_hook_type(void *, const void *); + +struct hooks { + malloc_hook_type *malloc_hook_func; + realloc_hook_type *realloc_hook_func; + free_hook_type *free_hook_func; +}; + +static malloc_hook_type hook_malloc; +static realloc_hook_type hook_realloc; +static free_hook_type hook_free; + +static struct hooks libc_hooks; +static const struct hooks our_hooks = { hook_malloc, hook_realloc, hook_free }; + +static FILE *file; +static off_t limit = 10 * 1000 * 1000; + +static void +get_hooks(struct hooks *hooks) +{ + hooks->malloc_hook_func = __malloc_hook; + hooks->realloc_hook_func = __realloc_hook; + hooks->free_hook_func = __free_hook; +} + +static void +set_hooks(const struct hooks *hooks) +{ + __malloc_hook = hooks->malloc_hook_func; + __realloc_hook = hooks->realloc_hook_func; + __free_hook = hooks->free_hook_func; +} + +void +leak_checker_start(const char *file_name) +{ + if (!file) { + file = fopen(file_name, "w"); + if (!file) { + VLOG_WARN("failed to create \"%s\": %s", + file_name, strerror(errno)); + return; + } + setvbuf(file, NULL, _IOLBF, 0); + VLOG_WARN("enabled memory leak logging to \"%s\"", file_name); + get_hooks(&libc_hooks); + set_hooks(&our_hooks); + } +} + +void +leak_checker_stop(void) +{ + if (file) { + fclose(file); + file = NULL; + set_hooks(&libc_hooks); + VLOG_WARN("disabled memory leak logging"); + } +} + +void +leak_checker_set_limit(off_t limit_) +{ + limit = limit_; +} + +void +leak_checker_usage(void) +{ + printf(" --check-leaks=FILE log malloc and free calls to FILE\n"); +} + +static void PRINTF_FORMAT(1, 2) +log_callers(const char *format, ...) +{ + struct backtrace backtrace; + va_list args; + int i; + + va_start(args, format); + vfprintf(file, format, args); + va_end(args); + + putc(':', file); + backtrace_capture(&backtrace); + for (i = 0; i < backtrace.n_frames; i++) { + fprintf(file, " 0x%x", backtrace.frames[i]); + } + putc('\n', file); +} + +static void +reset_hooks(void) +{ + static int count; + + if (file) { + if (ferror(file)) { + VLOG_WARN("error writing leak checker log file"); + leak_checker_stop(); + return; + } + + if (count++ >= 100 && limit) { + struct stat s; + count = 0; + if (fstat(fileno(file), &s) < 0) { + VLOG_WARN("cannot fstat leak checker log file: %s", + strerror(errno)); + leak_checker_stop(); + return; + } + if (s.st_size > limit) { + VLOG_WARN("leak checker log file size exceeded limit"); + leak_checker_stop(); + return; + } + } + } + if (file) { + set_hooks(&our_hooks); + } +} + +static void * +hook_malloc(size_t size, const void *caller UNUSED) +{ + void *p; + + set_hooks(&libc_hooks); + p = malloc(size); + get_hooks(&libc_hooks); + + log_callers("malloc(%zu) -> %p", size, p); + + reset_hooks(); + return p; +} + +void +leak_checker_claim(const void *p) +{ + if (!file) { + return; + } + + if (p) { + set_hooks(&libc_hooks); + log_callers("claim(%p)", p); + reset_hooks(); + } +} + +static void +hook_free(void *p, const void *caller UNUSED) +{ + if (!p) { + return; + } + + set_hooks(&libc_hooks); + free(p); + get_hooks(&libc_hooks); + + log_callers("free(%p)", p); + + reset_hooks(); +} + +static void * +hook_realloc(void *p, size_t size, const void *caller UNUSED) +{ + void *q; + + set_hooks(&libc_hooks); + q = realloc(p, size); + get_hooks(&libc_hooks); + + if (p != q) { + log_callers("realloc(%p, %zu) -> %p", p, size, q); + } + + reset_hooks(); + + return q; +} +#endif /* HAVE_MALLOC_HOOKS */ diff --git a/lib/leak-checker.h b/lib/leak-checker.h new file mode 100644 index 00000000..c2259dab --- /dev/null +++ b/lib/leak-checker.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef LEAK_CHECKER_H +#define LEAK_CHECKER_H 1 + +#include + +#define LEAK_CHECKER_OPTION_ENUMS \ + OPT_CHECK_LEAKS, \ + OPT_LEAK_LIMIT +#define LEAK_CHECKER_LONG_OPTIONS \ + {"check-leaks", required_argument, 0, OPT_CHECK_LEAKS}, \ + {"leak-limit", required_argument, 0, OPT_LEAK_LIMIT} +#define LEAK_CHECKER_OPTION_HANDLERS \ + case OPT_CHECK_LEAKS: \ + leak_checker_start(optarg); \ + break; \ + case OPT_LEAK_LIMIT: \ + leak_checker_set_limit(atol(optarg)); \ + break; +void leak_checker_start(const char *file_name); +void leak_checker_set_limit(off_t limit); +void leak_checker_stop(void); +void leak_checker_claim(const void *); +void leak_checker_usage(void); + +#endif /* leak-checker.h */ diff --git a/lib/leak-checker.man b/lib/leak-checker.man new file mode 100644 index 00000000..7b376e1a --- /dev/null +++ b/lib/leak-checker.man @@ -0,0 +1,7 @@ +.TP +\fB--check-leaks=\fIfile\fR +. +Logs information about memory allocation and deallocation to +\fIfile\fR, to allow for debugging memory leaks in \fB\*(PN\fR. This +option slows down \fB\*(PN\fR considerably, so it should only be used +when a memory leak is suspected. diff --git a/lib/learning-switch.c b/lib/learning-switch.c new file mode 100644 index 00000000..efc58386 --- /dev/null +++ b/lib/learning-switch.c @@ -0,0 +1,644 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "learning-switch.h" + +#include +#include +#include +#include +#include + +#include "flow.h" +#include "mac-learning.h" +#include "ofpbuf.h" +#include "ofp-print.h" +#include "openflow/openflow.h" +#include "poll-loop.h" +#include "queue.h" +#include "rconn.h" +#include "stp.h" +#include "timeval.h" +#include "vconn.h" +#include "xtoxll.h" + +#define THIS_MODULE VLM_learning_switch +#include "vlog.h" + +enum port_state { + P_DISABLED = 1 << 0, + P_LISTENING = 1 << 1, + P_LEARNING = 1 << 2, + P_FORWARDING = 1 << 3, + P_BLOCKING = 1 << 4 +}; + +struct lswitch { + /* If nonnegative, the switch sets up flows that expire after the given + * number of seconds (or never expire, if the value is OFP_FLOW_PERMANENT). + * Otherwise, the switch processes every packet. */ + int max_idle; + + unsigned long long int datapath_id; + uint32_t capabilities; + time_t last_features_request; + struct mac_learning *ml; /* NULL to act as hub instead of switch. */ + + /* Number of outgoing queued packets on the rconn. */ + struct rconn_packet_counter *queued; + + /* Spanning tree protocol implementation. + * + * We implement STP states by, whenever a port's STP state changes, + * querying all the flows on the switch and then deleting any of them that + * are inappropriate for a port's STP state. */ + long long int next_query; /* Next time at which to query all flows. */ + long long int last_query; /* Last time we sent a query. */ + long long int last_reply; /* Last time we received a query reply. */ + unsigned int port_states[STP_MAX_PORTS]; + uint32_t query_xid; /* XID used for query. */ + int n_flows, n_no_recv, n_no_send; +}; + +/* The log messages here could actually be useful in debugging, so keep the + * rate limit relatively high. */ +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(30, 300); + +static void queue_tx(struct lswitch *, struct rconn *, struct ofpbuf *); +static void send_features_request(struct lswitch *, struct rconn *); +static void schedule_query(struct lswitch *, long long int delay); +static bool may_learn(const struct lswitch *, uint16_t port_no); +static bool may_recv(const struct lswitch *, uint16_t port_no, + bool any_actions); +static bool may_send(const struct lswitch *, uint16_t port_no); + +typedef void packet_handler_func(struct lswitch *, struct rconn *, void *); +static packet_handler_func process_switch_features; +static packet_handler_func process_packet_in; +static packet_handler_func process_echo_request; +static packet_handler_func process_port_status; +static packet_handler_func process_phy_port; +static packet_handler_func process_stats_reply; + +/* Creates and returns a new learning switch. + * + * If 'learn_macs' is true, the new switch will learn the ports on which MAC + * addresses appear. Otherwise, the new switch will flood all packets. + * + * If 'max_idle' is nonnegative, the new switch will set up flows that expire + * after the given number of seconds (or never expire, if 'max_idle' is + * OFP_FLOW_PERMANENT). Otherwise, the new switch will process every packet. + * + * 'rconn' is used to send out an OpenFlow features request. */ +struct lswitch * +lswitch_create(struct rconn *rconn, bool learn_macs, int max_idle) +{ + struct lswitch *sw; + size_t i; + + sw = xcalloc(1, sizeof *sw); + sw->max_idle = max_idle; + sw->datapath_id = 0; + sw->last_features_request = time_now() - 1; + sw->ml = learn_macs ? mac_learning_create() : NULL; + sw->queued = rconn_packet_counter_create(); + sw->next_query = LLONG_MIN; + sw->last_query = LLONG_MIN; + sw->last_reply = LLONG_MIN; + for (i = 0; i < STP_MAX_PORTS; i++) { + sw->port_states[i] = P_DISABLED; + } + send_features_request(sw, rconn); + return sw; +} + +/* Destroys 'sw'. */ +void +lswitch_destroy(struct lswitch *sw) +{ + if (sw) { + mac_learning_destroy(sw->ml); + rconn_packet_counter_destroy(sw->queued); + free(sw); + } +} + +/* Takes care of necessary 'sw' activity, except for receiving packets (which + * the caller must do). */ +void +lswitch_run(struct lswitch *sw, struct rconn *rconn) +{ + long long int now = time_msec(); + + if (sw->ml) { + mac_learning_run(sw->ml, NULL); + } + + /* If we're waiting for more replies, keeping waiting for up to 10 s. */ + if (sw->last_reply != LLONG_MIN) { + if (now - sw->last_reply > 10000) { + VLOG_ERR_RL(&rl, "%012llx: No more flow stat replies last 10 s", + sw->datapath_id); + sw->last_reply = LLONG_MIN; + sw->last_query = LLONG_MIN; + schedule_query(sw, 0); + } else { + return; + } + } + + /* If we're waiting for any reply at all, keep waiting for up to 10 s. */ + if (sw->last_query != LLONG_MIN) { + if (now - sw->last_query > 10000) { + VLOG_ERR_RL(&rl, "%012llx: No flow stat replies in last 10 s", + sw->datapath_id); + sw->last_query = LLONG_MIN; + schedule_query(sw, 0); + } else { + return; + } + } + + /* If it's time to send another query, do so. */ + if (sw->next_query != LLONG_MIN && now >= sw->next_query) { + sw->next_query = LLONG_MIN; + if (!rconn_is_connected(rconn)) { + schedule_query(sw, 1000); + } else { + struct ofp_stats_request *osr; + struct ofp_flow_stats_request *ofsr; + struct ofpbuf *b; + int error; + + VLOG_DBG("%012llx: Sending flow stats request to implement STP", + sw->datapath_id); + + sw->last_query = now; + sw->query_xid = random_uint32(); + sw->n_flows = 0; + sw->n_no_recv = 0; + sw->n_no_send = 0; + osr = make_openflow_xid(sizeof *osr + sizeof *ofsr, + OFPT_STATS_REQUEST, sw->query_xid, &b); + osr->type = htons(OFPST_FLOW); + osr->flags = htons(0); + ofsr = (struct ofp_flow_stats_request *) osr->body; + ofsr->match.wildcards = htonl(OFPFW_ALL); + ofsr->table_id = 0xff; + ofsr->out_port = htons(OFPP_NONE); + + error = rconn_send(rconn, b, NULL); + if (error) { + VLOG_WARN_RL(&rl, "%012llx: sending flow stats request " + "failed: %s", sw->datapath_id, strerror(error)); + ofpbuf_delete(b); + schedule_query(sw, 1000); + } + } + } +} + +static void +wait_timeout(long long int started) +{ + long long int now = time_msec(); + long long int timeout = 10000 - (now - started); + if (timeout <= 0) { + poll_immediate_wake(); + } else { + poll_timer_wait(timeout); + } +} + +void +lswitch_wait(struct lswitch *sw) +{ + if (sw->ml) { + mac_learning_wait(sw->ml); + } + + if (sw->last_reply != LLONG_MIN) { + wait_timeout(sw->last_reply); + } else if (sw->last_query != LLONG_MIN) { + wait_timeout(sw->last_query); + } +} + +/* Processes 'msg', which should be an OpenFlow received on 'rconn', according + * to the learning switch state in 'sw'. The most likely result of processing + * is that flow-setup and packet-out OpenFlow messages will be sent out on + * 'rconn'. */ +void +lswitch_process_packet(struct lswitch *sw, struct rconn *rconn, + const struct ofpbuf *msg) +{ + struct processor { + uint8_t type; + size_t min_size; + packet_handler_func *handler; + }; + static const struct processor processors[] = { + { + OFPT_ECHO_REQUEST, + sizeof(struct ofp_header), + process_echo_request + }, + { + OFPT_FEATURES_REPLY, + sizeof(struct ofp_switch_features), + process_switch_features + }, + { + OFPT_PACKET_IN, + offsetof(struct ofp_packet_in, data), + process_packet_in + }, + { + OFPT_PORT_STATUS, + sizeof(struct ofp_port_status), + process_port_status + }, + { + OFPT_STATS_REPLY, + offsetof(struct ofp_stats_reply, body), + process_stats_reply + }, + { + OFPT_FLOW_EXPIRED, + sizeof(struct ofp_flow_expired), + NULL + }, + }; + const size_t n_processors = ARRAY_SIZE(processors); + const struct processor *p; + struct ofp_header *oh; + + oh = msg->data; + if (sw->datapath_id == 0 + && oh->type != OFPT_ECHO_REQUEST + && oh->type != OFPT_FEATURES_REPLY) { + send_features_request(sw, rconn); + return; + } + + for (p = processors; p < &processors[n_processors]; p++) { + if (oh->type == p->type) { + if (msg->size < p->min_size) { + VLOG_WARN_RL(&rl, "%012llx: %s: too short (%zu bytes) for " + "type %"PRIu8" (min %zu)", sw->datapath_id, + rconn_get_name(rconn), msg->size, oh->type, + p->min_size); + return; + } + if (p->handler) { + (p->handler)(sw, rconn, msg->data); + } + return; + } + } + if (VLOG_IS_DBG_ENABLED()) { + char *p = ofp_to_string(msg->data, msg->size, 2); + VLOG_DBG_RL(&rl, "%012llx: OpenFlow packet ignored: %s", + sw->datapath_id, p); + free(p); + } +} + +static void +send_features_request(struct lswitch *sw, struct rconn *rconn) +{ + time_t now = time_now(); + if (now >= sw->last_features_request + 1) { + struct ofpbuf *b; + struct ofp_switch_config *osc; + + /* Send OFPT_FEATURES_REQUEST. */ + make_openflow(sizeof(struct ofp_header), OFPT_FEATURES_REQUEST, &b); + queue_tx(sw, rconn, b); + + /* Send OFPT_SET_CONFIG. */ + osc = make_openflow(sizeof *osc, OFPT_SET_CONFIG, &b); + osc->flags = htons(OFPC_SEND_FLOW_EXP); + osc->miss_send_len = htons(OFP_DEFAULT_MISS_SEND_LEN); + queue_tx(sw, rconn, b); + + sw->last_features_request = now; + } +} + +static void +queue_tx(struct lswitch *sw, struct rconn *rconn, struct ofpbuf *b) +{ + int retval = rconn_send_with_limit(rconn, b, sw->queued, 10); + if (retval && retval != ENOTCONN) { + if (retval == EAGAIN) { + VLOG_INFO_RL(&rl, "%012llx: %s: tx queue overflow", + sw->datapath_id, rconn_get_name(rconn)); + } else { + VLOG_WARN_RL(&rl, "%012llx: %s: send: %s", + sw->datapath_id, rconn_get_name(rconn), + strerror(retval)); + } + } +} + +static void +schedule_query(struct lswitch *sw, long long int delay) +{ + long long int now = time_msec(); + if (sw->next_query == LLONG_MIN || sw->next_query > now + delay) { + sw->next_query = now + delay; + } +} + +static void +process_switch_features(struct lswitch *sw, struct rconn *rconn, void *osf_) +{ + struct ofp_switch_features *osf = osf_; + size_t n_ports = ((ntohs(osf->header.length) + - offsetof(struct ofp_switch_features, ports)) + / sizeof *osf->ports); + size_t i; + + sw->datapath_id = ntohll(osf->datapath_id); + sw->capabilities = ntohl(osf->capabilities); + for (i = 0; i < n_ports; i++) { + process_phy_port(sw, rconn, &osf->ports[i]); + } + if (sw->capabilities & OFPC_STP) { + schedule_query(sw, 1000); + } +} + +static void +process_packet_in(struct lswitch *sw, struct rconn *rconn, void *opi_) +{ + struct ofp_packet_in *opi = opi_; + uint16_t in_port = ntohs(opi->in_port); + uint16_t out_port = OFPP_FLOOD; + + size_t pkt_ofs, pkt_len; + struct ofpbuf pkt; + flow_t flow; + + /* Extract flow data from 'opi' into 'flow'. */ + pkt_ofs = offsetof(struct ofp_packet_in, data); + pkt_len = ntohs(opi->header.length) - pkt_ofs; + pkt.data = opi->data; + pkt.size = pkt_len; + flow_extract(&pkt, in_port, &flow); + + if (may_learn(sw, in_port) && sw->ml) { + if (mac_learning_learn(sw->ml, flow.dl_src, 0, in_port)) { + VLOG_DBG_RL(&rl, "%012llx: learned that "ETH_ADDR_FMT" is on " + "port %"PRIu16, sw->datapath_id, + ETH_ADDR_ARGS(flow.dl_src), in_port); + } + } + + if (eth_addr_is_reserved(flow.dl_src)) { + goto drop_it; + } + + if (!may_recv(sw, in_port, false)) { + /* STP prevents receiving anything on this port. */ + goto drop_it; + } + + if (sw->ml) { + int learned_port = mac_learning_lookup(sw->ml, flow.dl_dst, 0); + if (learned_port >= 0 && may_send(sw, learned_port)) { + out_port = learned_port; + } + } + + if (in_port == out_port) { + /* Don't send out packets on their input ports. */ + goto drop_it; + } else if (sw->max_idle >= 0 && (!sw->ml || out_port != OFPP_FLOOD)) { + /* The output port is known, or we always flood everything, so add a + * new flow. */ + queue_tx(sw, rconn, make_add_simple_flow(&flow, ntohl(opi->buffer_id), + out_port, sw->max_idle)); + + /* If the switch didn't buffer the packet, we need to send a copy. */ + if (ntohl(opi->buffer_id) == UINT32_MAX) { + queue_tx(sw, rconn, + make_unbuffered_packet_out(&pkt, in_port, out_port)); + } + } else { + /* We don't know that MAC, or we don't set up flows. Send along the + * packet without setting up a flow. */ + struct ofpbuf *b; + if (ntohl(opi->buffer_id) == UINT32_MAX) { + b = make_unbuffered_packet_out(&pkt, in_port, out_port); + } else { + b = make_buffered_packet_out(ntohl(opi->buffer_id), + in_port, out_port); + } + queue_tx(sw, rconn, b); + } + return; + +drop_it: + if (sw->max_idle >= 0) { + /* Set up a flow to drop packets. */ + queue_tx(sw, rconn, make_add_flow(&flow, ntohl(opi->buffer_id), + sw->max_idle, 0)); + } else { + /* Just drop the packet, since we don't set up flows at all. + * XXX we should send a packet_out with no actions if buffer_id != + * UINT32_MAX, to avoid clogging the kernel buffers. */ + } + return; +} + +static void +process_echo_request(struct lswitch *sw, struct rconn *rconn, void *rq_) +{ + struct ofp_header *rq = rq_; + queue_tx(sw, rconn, make_echo_reply(rq)); +} + +static void +process_port_status(struct lswitch *sw, struct rconn *rconn, void *ops_) +{ + struct ofp_port_status *ops = ops_; + process_phy_port(sw, rconn, &ops->desc); +} + +static void +process_phy_port(struct lswitch *sw, struct rconn *rconn UNUSED, void *opp_) +{ + const struct ofp_phy_port *opp = opp_; + uint16_t port_no = ntohs(opp->port_no); + if (sw->capabilities & OFPC_STP && port_no < STP_MAX_PORTS) { + uint32_t config = ntohl(opp->config); + uint32_t state = ntohl(opp->state); + unsigned int *port_state = &sw->port_states[port_no]; + unsigned int new_port_state; + + if (!(config & (OFPPC_NO_STP | OFPPC_PORT_DOWN)) + && !(state & OFPPS_LINK_DOWN)) + { + switch (state & OFPPS_STP_MASK) { + case OFPPS_STP_LISTEN: + new_port_state = P_LISTENING; + break; + case OFPPS_STP_LEARN: + new_port_state = P_LEARNING; + break; + case OFPPS_STP_FORWARD: + new_port_state = P_FORWARDING; + break; + case OFPPS_STP_BLOCK: + new_port_state = P_BLOCKING; + break; + default: + new_port_state = P_DISABLED; + break; + } + } else { + new_port_state = P_FORWARDING; + } + if (*port_state != new_port_state) { + *port_state = new_port_state; + schedule_query(sw, 1000); + } + } +} + +static unsigned int +get_port_state(const struct lswitch *sw, uint16_t port_no) +{ + return (port_no >= STP_MAX_PORTS || !(sw->capabilities & OFPC_STP) + ? P_FORWARDING + : sw->port_states[port_no]); +} + +static bool +may_learn(const struct lswitch *sw, uint16_t port_no) +{ + return get_port_state(sw, port_no) & (P_LEARNING | P_FORWARDING); +} + +static bool +may_recv(const struct lswitch *sw, uint16_t port_no, bool any_actions) +{ + unsigned int state = get_port_state(sw, port_no); + return !(any_actions + ? state & (P_DISABLED | P_LISTENING | P_BLOCKING) + : state & (P_DISABLED | P_LISTENING | P_BLOCKING | P_LEARNING)); +} + +static bool +may_send(const struct lswitch *sw, uint16_t port_no) +{ + return get_port_state(sw, port_no) & P_FORWARDING; +} + +static void +process_flow_stats(struct lswitch *sw, struct rconn *rconn, + const struct ofp_flow_stats *ofs) +{ + const char *end = (char *) ofs + ntohs(ofs->length); + bool delete = false; + + /* Decide to delete the flow if it matches on an STP-disabled physical + * port. But don't delete it if the flow just drops all received packets, + * because that's a perfectly reasonable thing to do for disabled physical + * ports. */ + if (!(ofs->match.wildcards & htonl(OFPFW_IN_PORT))) { + if (!may_recv(sw, ntohs(ofs->match.in_port), + end > (char *) ofs->actions)) { + delete = true; + sw->n_no_recv++; + } + } + + /* Decide to delete the flow if it forwards to an STP-disabled physical + * port. */ + if (!delete) { + const struct ofp_action_header *a; + size_t len; + + for (a = ofs->actions; (char *) a < end; a += len / 8) { + len = ntohs(a->len); + if (len > end - (char *) a) { + VLOG_DBG_RL(&rl, "%012llx: action exceeds available space " + "(%zu > %td)", + sw->datapath_id, len, end - (char *) a); + break; + } else if (len % 8) { + VLOG_DBG_RL(&rl, "%012llx: action length (%zu) not multiple " + "of 8 bytes", sw->datapath_id, len); + break; + } + + if (a->type == htons(OFPAT_OUTPUT)) { + struct ofp_action_output *oao = (struct ofp_action_output *) a; + if (!may_send(sw, ntohs(oao->port))) { + delete = true; + sw->n_no_send++; + break; + } + } + } + } + + /* Delete the flow. */ + if (delete) { + struct ofp_flow_mod *ofm; + struct ofpbuf *b; + + ofm = make_openflow(offsetof(struct ofp_flow_mod, actions), + OFPT_FLOW_MOD, &b); + ofm->match = ofs->match; + ofm->command = OFPFC_DELETE_STRICT; + rconn_send(rconn, b, NULL); + } +} + +static void +process_stats_reply(struct lswitch *sw, struct rconn *rconn, void *osr_) +{ + struct ofp_stats_reply *osr = osr_; + struct flow_stats_iterator i; + const struct ofp_flow_stats *fs; + + if (sw->last_query == LLONG_MIN + || osr->type != htons(OFPST_FLOW) + || osr->header.xid != sw->query_xid) { + return; + } + for (fs = flow_stats_first(&i, osr); fs; fs = flow_stats_next(&i)) { + sw->n_flows++; + process_flow_stats(sw, rconn, fs); + } + if (!(osr->flags & htons(OFPSF_REPLY_MORE))) { + VLOG_DBG("%012llx: Deleted %d of %d received flows to " + "implement STP, %d because of no-recv, %d because of " + "no-send", sw->datapath_id, + sw->n_no_recv + sw->n_no_send, sw->n_flows, + sw->n_no_recv, sw->n_no_send); + sw->last_query = LLONG_MIN; + sw->last_reply = LLONG_MIN; + } else { + sw->last_reply = time_msec(); + } +} + diff --git a/lib/learning-switch.h b/lib/learning-switch.h new file mode 100644 index 00000000..5f2f36ec --- /dev/null +++ b/lib/learning-switch.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef LEARNING_SWITCH_H +#define LEARNING_SWITCH_H 1 + +#include + +struct ofpbuf; +struct rconn; + +struct lswitch *lswitch_create(struct rconn *, bool learn_macs, int max_idle); +void lswitch_run(struct lswitch *, struct rconn *); +void lswitch_wait(struct lswitch *); +void lswitch_destroy(struct lswitch *); +void lswitch_process_packet(struct lswitch *, struct rconn *, + const struct ofpbuf *); + + +#endif /* learning-switch.h */ diff --git a/lib/list.c b/lib/list.c new file mode 100644 index 00000000..087a5f44 --- /dev/null +++ b/lib/list.c @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#include +#include "list.h" +#include + +/* Initializes 'list' as an empty list. */ +void +list_init(struct list *list) +{ + list->next = list->prev = list; +} + +/* Inserts 'elem' just before 'before'. */ +void +list_insert(struct list *before, struct list *elem) +{ + elem->prev = before->prev; + elem->next = before; + before->prev->next = elem; + before->prev = elem; +} + +/* Removes elements 'first' though 'last' (exclusive) from their current list, + then inserts them just before 'before'. */ +void +list_splice(struct list *before, struct list *first, struct list *last) +{ + if (first == last) + return; + last = last->prev; + + /* Cleanly remove 'first'...'last' from its current list. */ + first->prev->next = last->next; + last->next->prev = first->prev; + + /* Splice 'first'...'last' into new list. */ + first->prev = before->prev; + last->next = before; + before->prev->next = first; + before->prev = last; +} + +/* Inserts 'elem' at the beginning of 'list', so that it becomes the front in + 'list'. */ +void +list_push_front(struct list *list, struct list *elem) +{ + list_insert(list->next, elem); +} + +/* Inserts 'elem' at the end of 'list', so that it becomes the back in + * 'list'. */ +void +list_push_back(struct list *list, struct list *elem) +{ + list_insert(list, elem); +} + +/* Puts 'elem' in the position currently occupied by 'position'. + * Afterward, 'position' is not part of a list. */ +void +list_replace(struct list *element, const struct list *position) +{ + element->next = position->next; + element->next->prev = element; + element->prev = position->prev; + element->prev->next = element; +} + +/* Adjusts pointers around 'list' to compensate for 'list' having been moved + * around in memory (e.g. as a consequence of realloc()). */ +void +list_moved(struct list *list) +{ + list->prev->next = list->next->prev = list; +} + +/* Removes 'elem' from its list and returns the element that followed it. + Undefined behavior if 'elem' is not in a list. */ +struct list * +list_remove(struct list *elem) +{ + elem->prev->next = elem->next; + elem->next->prev = elem->prev; + return elem->next; +} + +/* Removes the front element from 'list' and returns it. Undefined behavior if + 'list' is empty before removal. */ +struct list * +list_pop_front(struct list *list) +{ + struct list *front = list->next; + list_remove(front); + return front; +} + +/* Removes the back element from 'list' and returns it. + Undefined behavior if 'list' is empty before removal. */ +struct list * +list_pop_back(struct list *list) +{ + struct list *back = list->prev; + list_remove(back); + return back; +} + +/* Returns the front element in 'list'. + Undefined behavior if 'list' is empty. */ +struct list * +list_front(struct list *list) +{ + assert(!list_is_empty(list)); + return list->next; +} + +/* Returns the back element in 'list'. + Undefined behavior if 'list' is empty. */ +struct list * +list_back(struct list *list) +{ + assert(!list_is_empty(list)); + return list->prev; +} + +/* Returns the number of elements in 'list'. + Runs in O(n) in the number of elements. */ +size_t +list_size(const struct list *list) +{ + const struct list *e; + size_t cnt = 0; + + for (e = list->next; e != list; e = e->next) + cnt++; + return cnt; +} + +/* Returns true if 'list' is empty, false otherwise. */ +bool +list_is_empty(const struct list *list) +{ + return list->next == list; +} diff --git a/lib/list.h b/lib/list.h new file mode 100644 index 00000000..a421ad51 --- /dev/null +++ b/lib/list.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#ifndef LIST_H +#define LIST_H 1 + +/* Doubly linked list. */ + +#include +#include +#include "util.h" + +/* Doubly linked list head or element. */ +struct list + { + struct list *prev; /* Previous list element. */ + struct list *next; /* Next list element. */ + }; + +#define LIST_INITIALIZER(LIST) { LIST, LIST } + +void list_init(struct list *); + +/* List insertion. */ +void list_insert(struct list *, struct list *); +void list_splice(struct list *before, struct list *first, struct list *last); +void list_push_front(struct list *, struct list *); +void list_push_back(struct list *, struct list *); +void list_replace(struct list *, const struct list *); +void list_moved(struct list *); + +/* List removal. */ +struct list *list_remove(struct list *); +struct list *list_pop_front(struct list *); +struct list *list_pop_back(struct list *); + +/* List elements. */ +struct list *list_front(struct list *); +struct list *list_back(struct list *); + +/* List properties. */ +size_t list_size(const struct list *); +bool list_is_empty(const struct list *); + +#define LIST_FOR_EACH(ITER, STRUCT, MEMBER, LIST) \ + for (ITER = CONTAINER_OF((LIST)->next, STRUCT, MEMBER); \ + &(ITER)->MEMBER != (LIST); \ + ITER = CONTAINER_OF((ITER)->MEMBER.next, STRUCT, MEMBER)) +#define LIST_FOR_EACH_REVERSE(ITER, STRUCT, MEMBER, LIST) \ + for (ITER = CONTAINER_OF((LIST)->prev, STRUCT, MEMBER); \ + &(ITER)->MEMBER != (LIST); \ + ITER = CONTAINER_OF((ITER)->MEMBER.prev, STRUCT, MEMBER)) +#define LIST_FOR_EACH_SAFE(ITER, NEXT, STRUCT, MEMBER, LIST) \ + for (ITER = CONTAINER_OF((LIST)->next, STRUCT, MEMBER); \ + (NEXT = CONTAINER_OF((ITER)->MEMBER.next, STRUCT, MEMBER), \ + &(ITER)->MEMBER != (LIST)); \ + ITER = NEXT) + +#endif /* list.h */ diff --git a/lib/mac-learning.c b/lib/mac-learning.c new file mode 100644 index 00000000..f0366808 --- /dev/null +++ b/lib/mac-learning.c @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "mac-learning.h" + +#include +#include +#include + +#include "coverage.h" +#include "hash.h" +#include "list.h" +#include "poll-loop.h" +#include "tag.h" +#include "timeval.h" +#include "util.h" + +#define THIS_MODULE VLM_mac_learning +#include "vlog.h" + +#define MAC_HASH_BITS 10 +#define MAC_HASH_MASK (MAC_HASH_SIZE - 1) +#define MAC_HASH_SIZE (1u << MAC_HASH_BITS) + +#define MAC_MAX 1024 + +/* A MAC learning table entry. */ +struct mac_entry { + struct list hash_node; /* Element in a mac_learning 'table' list. */ + struct list lru_node; /* Element in 'lrus' or 'free' list. */ + time_t expires; /* Expiration time. */ + uint8_t mac[ETH_ADDR_LEN]; /* Known MAC address. */ + uint16_t vlan; /* VLAN tag. */ + int port; /* Port on which MAC was most recently seen. */ + tag_type tag; /* Tag for this learning entry. */ +}; + +/* MAC learning table. */ +struct mac_learning { + struct list free; /* Not-in-use entries. */ + struct list lrus; /* In-use entries, least recently used at the + front, most recently used at the back. */ + struct list table[MAC_HASH_SIZE]; /* Hash table. */ + struct mac_entry entries[MAC_MAX]; /* All entries. */ + uint32_t secret; /* Secret for */ +}; + +static uint32_t +mac_table_hash(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan) +{ + return hash_bytes(mac, ETH_ADDR_LEN, vlan); +} + +static struct mac_entry * +mac_entry_from_lru_node(struct list *list) +{ + return CONTAINER_OF(list, struct mac_entry, lru_node); +} + +/* Returns a tag that represents that 'mac' is on an unknown port in 'vlan'. + * (When we learn where 'mac' is in 'vlan', this allows flows that were + * flooded to be revalidated.) */ +static tag_type +make_unknown_mac_tag(const struct mac_learning *ml, + const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan) +{ + uint32_t h = hash_int(ml->secret, mac_table_hash(mac, vlan)); + return tag_create_deterministic(h); +} + +static struct list * +mac_table_bucket(const struct mac_learning *ml, + const uint8_t mac[ETH_ADDR_LEN], + uint16_t vlan) +{ + uint32_t hash = mac_table_hash(mac, vlan); + const struct list *list = &ml->table[hash & MAC_HASH_BITS]; + return (struct list *) list; +} + +static struct mac_entry * +search_bucket(struct list *bucket, const uint8_t mac[ETH_ADDR_LEN], + uint16_t vlan) +{ + struct mac_entry *e; + LIST_FOR_EACH (e, struct mac_entry, hash_node, bucket) { + if (eth_addr_equals(e->mac, mac) && e->vlan == vlan) { + return e; + } + } + return NULL; +} + +/* If the LRU list is not empty, stores the least-recently-used entry in '*e' + * and returns true. Otherwise, if the LRU list is empty, stores NULL in '*e' + * and return false. */ +static bool +get_lru(struct mac_learning *ml, struct mac_entry **e) +{ + if (!list_is_empty(&ml->lrus)) { + *e = mac_entry_from_lru_node(ml->lrus.next); + return true; + } else { + *e = NULL; + return false; + } +} + +/* Removes 'e' from the 'ml' hash table. 'e' must not already be on the free + * list. */ +static void +free_mac_entry(struct mac_learning *ml, struct mac_entry *e) +{ + list_remove(&e->hash_node); + list_remove(&e->lru_node); + list_push_front(&ml->free, &e->lru_node); +} + +/* Creates and returns a new MAC learning table. */ +struct mac_learning * +mac_learning_create(void) +{ + struct mac_learning *ml; + int i; + + ml = xmalloc(sizeof *ml); + list_init(&ml->lrus); + list_init(&ml->free); + for (i = 0; i < MAC_HASH_SIZE; i++) { + list_init(&ml->table[i]); + } + for (i = 0; i < MAC_MAX; i++) { + struct mac_entry *s = &ml->entries[i]; + list_push_front(&ml->free, &s->lru_node); + } + ml->secret = random_uint32(); + return ml; +} + +/* Destroys MAC learning table 'ml'. */ +void +mac_learning_destroy(struct mac_learning *ml) +{ + free(ml); +} + +/* Attempts to make 'ml' learn from the fact that a frame from 'src_mac' was + * just observed arriving from 'src_port' on the given 'vlan'. + * + * Returns nonzero if we actually learned something from this, zero if it just + * confirms what we already knew. The nonzero return value is the tag of flows + * that now need revalidation. + * + * The 'vlan' parameter is used to maintain separate per-VLAN learning tables. + * Specify 0 if this behavior is undesirable. */ +tag_type +mac_learning_learn(struct mac_learning *ml, + const uint8_t src_mac[ETH_ADDR_LEN], uint16_t vlan, + uint16_t src_port) +{ + struct mac_entry *e; + struct list *bucket; + + if (eth_addr_is_multicast(src_mac)) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(30, 30); + VLOG_DBG_RL(&rl, "multicast packet source "ETH_ADDR_FMT, + ETH_ADDR_ARGS(src_mac)); + return 0; + } + + bucket = mac_table_bucket(ml, src_mac, vlan); + e = search_bucket(bucket, src_mac, vlan); + if (!e) { + if (!list_is_empty(&ml->free)) { + e = mac_entry_from_lru_node(ml->free.next); + } else { + e = mac_entry_from_lru_node(ml->lrus.next); + list_remove(&e->hash_node); + } + memcpy(e->mac, src_mac, ETH_ADDR_LEN); + list_push_front(bucket, &e->hash_node); + e->port = -1; + e->vlan = vlan; + e->tag = make_unknown_mac_tag(ml, src_mac, vlan); + } + + /* Make the entry most-recently-used. */ + list_remove(&e->lru_node); + list_push_back(&ml->lrus, &e->lru_node); + e->expires = time_now() + 60; + + /* Did we learn something? */ + if (e->port != src_port) { + tag_type old_tag = e->tag; + e->port = src_port; + e->tag = tag_create_random(); + COVERAGE_INC(mac_learning_learned); + return old_tag; + } + return 0; +} + +/* Looks up MAC 'dst' for VLAN 'vlan' in 'ml'. Returns the port on which a + * frame destined for 'dst' should be sent, -1 if unknown. */ +int +mac_learning_lookup(const struct mac_learning *ml, + const uint8_t dst[ETH_ADDR_LEN], uint16_t vlan) +{ + tag_type tag = 0; + return mac_learning_lookup_tag(ml, dst, vlan, &tag); +} + +/* Looks up MAC 'dst' for VLAN 'vlan' in 'ml'. Returns the port on which a + * frame destined for 'dst' should be sent, -1 if unknown. + * + * Adds to '*tag' (which the caller must have initialized) the tag that should + * be attached to any flow created based on the return value, if any, to allow + * those flows to be revalidated when the MAC learning entry changes. */ +int +mac_learning_lookup_tag(const struct mac_learning *ml, + const uint8_t dst[ETH_ADDR_LEN], uint16_t vlan, + tag_type *tag) +{ + if (eth_addr_is_multicast(dst)) { + return -1; + } else { + struct mac_entry *e = search_bucket(mac_table_bucket(ml, dst, vlan), + dst, vlan); + if (e) { + *tag |= e->tag; + return e->port; + } else { + *tag |= make_unknown_mac_tag(ml, dst, vlan); + return -1; + } + } +} + +/* Expires all the mac-learning entries in 'ml'. The tags in 'ml' are + * discarded, so the client is responsible for revalidating any flows that + * depend on 'ml', if necessary. */ +void +mac_learning_flush(struct mac_learning *ml) +{ + struct mac_entry *e; + while (get_lru(ml, &e)){ + free_mac_entry(ml, e); + } +} + +void +mac_learning_run(struct mac_learning *ml, struct tag_set *set) +{ + struct mac_entry *e; + while (get_lru(ml, &e) && time_now() >= e->expires) { + COVERAGE_INC(mac_learning_expired); + if (set) { + tag_set_add(set, e->tag); + } + free_mac_entry(ml, e); + } +} + +void +mac_learning_wait(struct mac_learning *ml) +{ + if (!list_is_empty(&ml->lrus)) { + struct mac_entry *e = mac_entry_from_lru_node(ml->lrus.next); + poll_timer_wait((e->expires - time_now()) * 1000); + } +} diff --git a/lib/mac-learning.h b/lib/mac-learning.h new file mode 100644 index 00000000..fc1d6204 --- /dev/null +++ b/lib/mac-learning.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef MAC_LEARNING_H +#define MAC_LEARNING_H 1 + +#include "packets.h" +#include "tag.h" + +struct mac_learning *mac_learning_create(void); +void mac_learning_destroy(struct mac_learning *); +tag_type mac_learning_learn(struct mac_learning *, + const uint8_t src[ETH_ADDR_LEN], uint16_t vlan, + uint16_t src_port); +int mac_learning_lookup(const struct mac_learning *, + const uint8_t dst[ETH_ADDR_LEN], uint16_t vlan); +int mac_learning_lookup_tag(const struct mac_learning *, + const uint8_t dst[ETH_ADDR_LEN], + uint16_t vlan, tag_type *tag); +void mac_learning_flush(struct mac_learning *); +void mac_learning_run(struct mac_learning *, struct tag_set *); +void mac_learning_wait(struct mac_learning *); + +#endif /* mac-learning.h */ diff --git a/lib/netdev.c b/lib/netdev.c new file mode 100644 index 00000000..7fd070eb --- /dev/null +++ b/lib/netdev.c @@ -0,0 +1,1556 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "netdev.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "coverage.h" +#include "dynamic-string.h" +#include "fatal-signal.h" +#include "list.h" +#include "netlink.h" +#include "ofpbuf.h" +#include "openflow/openflow.h" +#include "packets.h" +#include "poll-loop.h" +#include "socket-util.h" +#include "svec.h" + +/* linux/if.h defines IFF_LOWER_UP, net/if.h doesn't. + * net/if.h defines if_nameindex(), linux/if.h doesn't. + * We can't include both headers, so define IFF_LOWER_UP ourselves. */ +#ifndef IFF_LOWER_UP +#define IFF_LOWER_UP 0x10000 +#endif + +/* These were introduced in Linux 2.6.14, so they might be missing if we have + * old headers. */ +#ifndef ADVERTISED_Pause +#define ADVERTISED_Pause (1 << 13) +#endif +#ifndef ADVERTISED_Asym_Pause +#define ADVERTISED_Asym_Pause (1 << 14) +#endif + +#define THIS_MODULE VLM_netdev +#include "vlog.h" + +struct netdev { + struct list node; + char *name; + + /* File descriptors. For ordinary network devices, the two fds below are + * the same; for tap devices, they differ. */ + int netdev_fd; /* Network device. */ + int tap_fd; /* TAP character device, if any, otherwise the + * network device. */ + + /* Cached network device information. */ + int ifindex; /* -1 if not known. */ + uint8_t etheraddr[ETH_ADDR_LEN]; + struct in6_addr in6; + int speed; + int mtu; + int txqlen; + int hwaddr_family; + + int save_flags; /* Initial device flags. */ + int changed_flags; /* Flags that we changed. */ +}; + +/* Policy for RTNLGRP_LINK messages. + * + * There are *many* more fields in these messages, but currently we only care + * about interface names. */ +static const struct nl_policy rtnlgrp_link_policy[] = { + [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false }, + [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true, + .min_len = sizeof(struct rtnl_link_stats) }, +}; + +/* All open network devices. */ +static struct list netdev_list = LIST_INITIALIZER(&netdev_list); + +/* An AF_INET socket (used for ioctl operations). */ +static int af_inet_sock = -1; + +/* NETLINK_ROUTE socket. */ +static struct nl_sock *rtnl_sock; + +/* Can we use RTM_GETLINK to get network device statistics? (In pre-2.6.19 + * kernels, this was only available if wireless extensions were enabled.) */ +static bool use_netlink_stats; + +/* This is set pretty low because we probably won't learn anything from the + * additional log messages. */ +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); + +static void init_netdev(void); +static int do_open_netdev(const char *name, int ethertype, int tap_fd, + struct netdev **netdev_); +static int restore_flags(struct netdev *netdev); +static int get_flags(const char *netdev_name, int *flagsp); +static int set_flags(const char *netdev_name, int flags); +static int do_get_ifindex(const char *netdev_name); +static int get_ifindex(const struct netdev *, int *ifindexp); +static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN], + int *hwaddr_familyp); +static int set_etheraddr(const char *netdev_name, int hwaddr_family, + const uint8_t[ETH_ADDR_LEN]); + +/* Obtains the IPv6 address for 'name' into 'in6'. */ +static void +get_ipv6_address(const char *name, struct in6_addr *in6) +{ + FILE *file; + char line[128]; + + file = fopen("/proc/net/if_inet6", "r"); + if (file == NULL) { + /* This most likely indicates that the host doesn't have IPv6 support, + * so it's not really a failure condition.*/ + *in6 = in6addr_any; + return; + } + + while (fgets(line, sizeof line, file)) { + uint8_t *s6 = in6->s6_addr; + char ifname[16 + 1]; + +#define X8 "%2"SCNx8 + if (sscanf(line, " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 + "%*x %*x %*x %*x %16s\n", + &s6[0], &s6[1], &s6[2], &s6[3], + &s6[4], &s6[5], &s6[6], &s6[7], + &s6[8], &s6[9], &s6[10], &s6[11], + &s6[12], &s6[13], &s6[14], &s6[15], + ifname) == 17 + && !strcmp(name, ifname)) + { + fclose(file); + return; + } + } + *in6 = in6addr_any; + + fclose(file); +} + +static int +do_ethtool(struct netdev *netdev, struct ethtool_cmd *ecmd, + int cmd, const char *cmd_name) +{ + struct ifreq ifr; + + memset(&ifr, 0, sizeof ifr); + strncpy(ifr.ifr_name, netdev->name, sizeof ifr.ifr_name); + ifr.ifr_data = (caddr_t) ecmd; + + ecmd->cmd = cmd; + COVERAGE_INC(netdev_ethtool); + if (ioctl(netdev->netdev_fd, SIOCETHTOOL, &ifr) == 0) { + return 0; + } else { + if (errno != EOPNOTSUPP) { + VLOG_WARN_RL(&rl, "ethtool command %s on network device %s " + "failed: %s", cmd_name, netdev->name, + strerror(errno)); + } else { + /* The device doesn't support this operation. That's pretty + * common, so there's no point in logging anything. */ + } + return errno; + } +} + +static int +do_get_features(struct netdev *netdev, + uint32_t *current, uint32_t *advertised, + uint32_t *supported, uint32_t *peer) +{ + struct ethtool_cmd ecmd; + int error; + + *current = 0; + *supported = 0; + *advertised = 0; + *peer = 0; + + memset(&ecmd, 0, sizeof ecmd); + error = do_ethtool(netdev, &ecmd, ETHTOOL_GSET, "ETHTOOL_GSET"); + if (error) { + return error; + } + + if (ecmd.supported & SUPPORTED_10baseT_Half) { + *supported |= OFPPF_10MB_HD; + } + if (ecmd.supported & SUPPORTED_10baseT_Full) { + *supported |= OFPPF_10MB_FD; + } + if (ecmd.supported & SUPPORTED_100baseT_Half) { + *supported |= OFPPF_100MB_HD; + } + if (ecmd.supported & SUPPORTED_100baseT_Full) { + *supported |= OFPPF_100MB_FD; + } + if (ecmd.supported & SUPPORTED_1000baseT_Half) { + *supported |= OFPPF_1GB_HD; + } + if (ecmd.supported & SUPPORTED_1000baseT_Full) { + *supported |= OFPPF_1GB_FD; + } + if (ecmd.supported & SUPPORTED_10000baseT_Full) { + *supported |= OFPPF_10GB_FD; + } + if (ecmd.supported & SUPPORTED_TP) { + *supported |= OFPPF_COPPER; + } + if (ecmd.supported & SUPPORTED_FIBRE) { + *supported |= OFPPF_FIBER; + } + if (ecmd.supported & SUPPORTED_Autoneg) { + *supported |= OFPPF_AUTONEG; + } + if (ecmd.supported & SUPPORTED_Pause) { + *supported |= OFPPF_PAUSE; + } + if (ecmd.supported & SUPPORTED_Asym_Pause) { + *supported |= OFPPF_PAUSE_ASYM; + } + + /* Set the advertised features */ + if (ecmd.advertising & ADVERTISED_10baseT_Half) { + *advertised |= OFPPF_10MB_HD; + } + if (ecmd.advertising & ADVERTISED_10baseT_Full) { + *advertised |= OFPPF_10MB_FD; + } + if (ecmd.advertising & ADVERTISED_100baseT_Half) { + *advertised |= OFPPF_100MB_HD; + } + if (ecmd.advertising & ADVERTISED_100baseT_Full) { + *advertised |= OFPPF_100MB_FD; + } + if (ecmd.advertising & ADVERTISED_1000baseT_Half) { + *advertised |= OFPPF_1GB_HD; + } + if (ecmd.advertising & ADVERTISED_1000baseT_Full) { + *advertised |= OFPPF_1GB_FD; + } + if (ecmd.advertising & ADVERTISED_10000baseT_Full) { + *advertised |= OFPPF_10GB_FD; + } + if (ecmd.advertising & ADVERTISED_TP) { + *advertised |= OFPPF_COPPER; + } + if (ecmd.advertising & ADVERTISED_FIBRE) { + *advertised |= OFPPF_FIBER; + } + if (ecmd.advertising & ADVERTISED_Autoneg) { + *advertised |= OFPPF_AUTONEG; + } + if (ecmd.advertising & ADVERTISED_Pause) { + *advertised |= OFPPF_PAUSE; + } + if (ecmd.advertising & ADVERTISED_Asym_Pause) { + *advertised |= OFPPF_PAUSE_ASYM; + } + + /* Set the current features */ + if (ecmd.speed == SPEED_10) { + *current = (ecmd.duplex) ? OFPPF_10MB_FD : OFPPF_10MB_HD; + } + else if (ecmd.speed == SPEED_100) { + *current = (ecmd.duplex) ? OFPPF_100MB_FD : OFPPF_100MB_HD; + } + else if (ecmd.speed == SPEED_1000) { + *current = (ecmd.duplex) ? OFPPF_1GB_FD : OFPPF_1GB_HD; + } + else if (ecmd.speed == SPEED_10000) { + *current = OFPPF_10GB_FD; + } + + if (ecmd.port == PORT_TP) { + *current |= OFPPF_COPPER; + } + else if (ecmd.port == PORT_FIBRE) { + *current |= OFPPF_FIBER; + } + + if (ecmd.autoneg) { + *current |= OFPPF_AUTONEG; + } + return 0; +} + +/* Opens the network device named 'name' (e.g. "eth0") and returns zero if + * successful, otherwise a positive errno value. On success, sets '*netdevp' + * to the new network device, otherwise to null. + * + * 'ethertype' may be a 16-bit Ethernet protocol value in host byte order to + * capture frames of that type received on the device. It may also be one of + * the 'enum netdev_pseudo_ethertype' values to receive frames in one of those + * categories. */ +int +netdev_open(const char *name, int ethertype, struct netdev **netdevp) +{ + if (!strncmp(name, "tap:", 4)) { + return netdev_open_tap(name + 4, netdevp); + } else { + return do_open_netdev(name, ethertype, -1, netdevp); + } +} + +/* Opens a TAP virtual network device. If 'name' is a nonnull, non-empty + * string, attempts to assign that name to the TAP device (failing if the name + * is already in use); otherwise, a name is automatically assigned. Returns + * zero if successful, otherwise a positive errno value. On success, sets + * '*netdevp' to the new network device, otherwise to null. */ +int +netdev_open_tap(const char *name, struct netdev **netdevp) +{ + static const char tap_dev[] = "/dev/net/tun"; + struct ifreq ifr; + int error; + int tap_fd; + + tap_fd = open(tap_dev, O_RDWR); + if (tap_fd < 0) { + ovs_error(errno, "opening \"%s\" failed", tap_dev); + return errno; + } + + memset(&ifr, 0, sizeof ifr); + ifr.ifr_flags = IFF_TAP | IFF_NO_PI; + if (name) { + strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name); + } + if (ioctl(tap_fd, TUNSETIFF, &ifr) < 0) { + int error = errno; + ovs_error(error, "ioctl(TUNSETIFF) on \"%s\" failed", tap_dev); + close(tap_fd); + return error; + } + + error = set_nonblocking(tap_fd); + if (error) { + ovs_error(error, "set_nonblocking on \"%s\" failed", tap_dev); + close(tap_fd); + return error; + } + + error = do_open_netdev(ifr.ifr_name, NETDEV_ETH_TYPE_NONE, tap_fd, + netdevp); + if (error) { + close(tap_fd); + } + return error; +} + +static int +do_open_netdev(const char *name, int ethertype, int tap_fd, + struct netdev **netdev_) +{ + int netdev_fd; + struct sockaddr_ll sll; + struct ifreq ifr; + int ifindex = -1; + uint8_t etheraddr[ETH_ADDR_LEN]; + struct in6_addr in6; + int mtu; + int txqlen; + int hwaddr_family; + int error; + struct netdev *netdev; + + init_netdev(); + *netdev_ = NULL; + COVERAGE_INC(netdev_open); + + /* Create raw socket. */ + netdev_fd = socket(PF_PACKET, SOCK_RAW, + htons(ethertype == NETDEV_ETH_TYPE_NONE ? 0 + : ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL + : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2 + : ethertype)); + if (netdev_fd < 0) { + return errno; + } + + if (ethertype != NETDEV_ETH_TYPE_NONE) { + /* Set non-blocking mode. */ + error = set_nonblocking(netdev_fd); + if (error) { + goto error_already_set; + } + + /* Get ethernet device index. */ + ifindex = do_get_ifindex(name); + if (ifindex < 0) { + return -ifindex; + } + + /* Bind to specific ethernet device. */ + memset(&sll, 0, sizeof sll); + sll.sll_family = AF_PACKET; + sll.sll_ifindex = ifindex; + if (bind(netdev_fd, (struct sockaddr *) &sll, sizeof sll) < 0) { + VLOG_ERR("bind to %s failed: %s", name, strerror(errno)); + goto error; + } + + /* Between the socket() and bind() calls above, the socket receives all + * packets of the requested type on all system interfaces. We do not + * want to receive that data, but there is no way to avoid it. So we + * must now drain out the receive queue. */ + error = drain_rcvbuf(netdev_fd); + if (error) { + goto error_already_set; + } + } + + /* Get MAC address. */ + error = get_etheraddr(name, etheraddr, &hwaddr_family); + if (error) { + goto error_already_set; + } + + /* Get MTU. */ + strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name); + if (ioctl(netdev_fd, SIOCGIFMTU, &ifr) < 0) { + VLOG_ERR("ioctl(SIOCGIFMTU) on %s device failed: %s", + name, strerror(errno)); + goto error; + } + mtu = ifr.ifr_mtu; + + /* Get TX queue length. */ + if (ioctl(netdev_fd, SIOCGIFTXQLEN, &ifr) < 0) { + VLOG_ERR("ioctl(SIOCGIFTXQLEN) on %s device failed: %s", + name, strerror(errno)); + goto error; + } + txqlen = ifr.ifr_qlen; + + get_ipv6_address(name, &in6); + + /* Allocate network device. */ + netdev = xmalloc(sizeof *netdev); + netdev->name = xstrdup(name); + netdev->ifindex = ifindex; + netdev->txqlen = txqlen; + netdev->hwaddr_family = hwaddr_family; + netdev->netdev_fd = netdev_fd; + netdev->tap_fd = tap_fd < 0 ? netdev_fd : tap_fd; + memcpy(netdev->etheraddr, etheraddr, sizeof etheraddr); + netdev->mtu = mtu; + netdev->in6 = in6; + + /* Save flags to restore at close or exit. */ + error = get_flags(netdev->name, &netdev->save_flags); + if (error) { + goto error_already_set; + } + netdev->changed_flags = 0; + fatal_signal_block(); + list_push_back(&netdev_list, &netdev->node); + fatal_signal_unblock(); + + /* Success! */ + *netdev_ = netdev; + return 0; + +error: + error = errno; +error_already_set: + close(netdev_fd); + if (tap_fd >= 0) { + close(tap_fd); + } + return error; +} + +/* Closes and destroys 'netdev'. */ +void +netdev_close(struct netdev *netdev) +{ + if (netdev) { + /* Bring down interface and drop promiscuous mode, if we brought up + * the interface or enabled promiscuous mode. */ + int error; + fatal_signal_block(); + error = restore_flags(netdev); + list_remove(&netdev->node); + fatal_signal_unblock(); + if (error) { + VLOG_WARN("failed to restore network device flags on %s: %s", + netdev->name, strerror(error)); + } + + /* Free. */ + free(netdev->name); + close(netdev->netdev_fd); + if (netdev->netdev_fd != netdev->tap_fd) { + close(netdev->tap_fd); + } + free(netdev); + } +} + +/* Pads 'buffer' out with zero-bytes to the minimum valid length of an + * Ethernet packet, if necessary. */ +static void +pad_to_minimum_length(struct ofpbuf *buffer) +{ + if (buffer->size < ETH_TOTAL_MIN) { + ofpbuf_put_zeros(buffer, ETH_TOTAL_MIN - buffer->size); + } +} + +/* Attempts to receive a packet from 'netdev' into 'buffer', which the caller + * must have initialized with sufficient room for the packet. The space + * required to receive any packet is ETH_HEADER_LEN bytes, plus VLAN_HEADER_LEN + * bytes, plus the device's MTU (which may be retrieved via netdev_get_mtu()). + * (Some devices do not allow for a VLAN header, in which case VLAN_HEADER_LEN + * need not be included.) + * + * If a packet is successfully retrieved, returns 0. In this case 'buffer' is + * guaranteed to contain at least ETH_TOTAL_MIN bytes. Otherwise, returns a + * positive errno value. Returns EAGAIN immediately if no packet is ready to + * be returned. + */ +int +netdev_recv(struct netdev *netdev, struct ofpbuf *buffer) +{ + ssize_t n_bytes; + + assert(buffer->size == 0); + assert(ofpbuf_tailroom(buffer) >= ETH_TOTAL_MIN); + do { + n_bytes = read(netdev->tap_fd, + ofpbuf_tail(buffer), ofpbuf_tailroom(buffer)); + } while (n_bytes < 0 && errno == EINTR); + if (n_bytes < 0) { + if (errno != EAGAIN) { + VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s", + strerror(errno), netdev->name); + } + return errno; + } else { + COVERAGE_INC(netdev_received); + buffer->size += n_bytes; + + /* When the kernel internally sends out an Ethernet frame on an + * interface, it gives us a copy *before* padding the frame to the + * minimum length. Thus, when it sends out something like an ARP + * request, we see a too-short frame. So pad it out to the minimum + * length. */ + pad_to_minimum_length(buffer); + return 0; + } +} + +/* Registers with the poll loop to wake up from the next call to poll_block() + * when a packet is ready to be received with netdev_recv() on 'netdev'. */ +void +netdev_recv_wait(struct netdev *netdev) +{ + poll_fd_wait(netdev->tap_fd, POLLIN); +} + +/* Discards all packets waiting to be received from 'netdev'. */ +int +netdev_drain(struct netdev *netdev) +{ + if (netdev->tap_fd != netdev->netdev_fd) { + drain_fd(netdev->tap_fd, netdev->txqlen); + return 0; + } else { + return drain_rcvbuf(netdev->netdev_fd); + } +} + +/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a positive + * errno value. Returns EAGAIN without blocking if the packet cannot be queued + * immediately. Returns EMSGSIZE if a partial packet was transmitted or if + * the packet is too big or too small to transmit on the device. + * + * The caller retains ownership of 'buffer' in all cases. + * + * The kernel maintains a packet transmission queue, so the caller is not + * expected to do additional queuing of packets. */ +int +netdev_send(struct netdev *netdev, const struct ofpbuf *buffer) +{ + ssize_t n_bytes; + + do { + n_bytes = write(netdev->tap_fd, buffer->data, buffer->size); + } while (n_bytes < 0 && errno == EINTR); + + if (n_bytes < 0) { + /* The Linux AF_PACKET implementation never blocks waiting for room + * for packets, instead returning ENOBUFS. Translate this into EAGAIN + * for the caller. */ + if (errno == ENOBUFS) { + return EAGAIN; + } else if (errno != EAGAIN) { + VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s", + netdev->name, strerror(errno)); + } + return errno; + } else if (n_bytes != buffer->size) { + VLOG_WARN_RL(&rl, + "send partial Ethernet packet (%d bytes of %zu) on %s", + (int) n_bytes, buffer->size, netdev->name); + return EMSGSIZE; + } else { + COVERAGE_INC(netdev_sent); + return 0; + } +} + +/* Registers with the poll loop to wake up from the next call to poll_block() + * when the packet transmission queue has sufficient room to transmit a packet + * with netdev_send(). + * + * The kernel maintains a packet transmission queue, so the client is not + * expected to do additional queuing of packets. Thus, this function is + * unlikely to ever be used. It is included for completeness. */ +void +netdev_send_wait(struct netdev *netdev) +{ + if (netdev->tap_fd == netdev->netdev_fd) { + poll_fd_wait(netdev->tap_fd, POLLOUT); + } else { + /* TAP device always accepts packets.*/ + poll_immediate_wake(); + } +} + +/* Attempts to set 'netdev''s MAC address to 'mac'. Returns 0 if successful, + * otherwise a positive errno value. */ +int +netdev_set_etheraddr(struct netdev *netdev, const uint8_t mac[ETH_ADDR_LEN]) +{ + int error = set_etheraddr(netdev->name, netdev->hwaddr_family, mac); + if (!error) { + memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN); + } + return error; +} + +int +netdev_nodev_set_etheraddr(const char *name, const uint8_t mac[ETH_ADDR_LEN]) +{ + init_netdev(); + return set_etheraddr(name, ARPHRD_ETHER, mac); +} + +/* Returns a pointer to 'netdev''s MAC address. The caller must not modify or + * free the returned buffer. */ +const uint8_t * +netdev_get_etheraddr(const struct netdev *netdev) +{ + return netdev->etheraddr; +} + +/* Returns the name of the network device that 'netdev' represents, + * e.g. "eth0". The caller must not modify or free the returned string. */ +const char * +netdev_get_name(const struct netdev *netdev) +{ + return netdev->name; +} + +/* Returns the maximum size of transmitted (and received) packets on 'netdev', + * in bytes, not including the hardware header; thus, this is typically 1500 + * bytes for Ethernet devices. */ +int +netdev_get_mtu(const struct netdev *netdev) +{ + return netdev->mtu; +} + +/* Stores the features supported by 'netdev' into each of '*current', + * '*advertised', '*supported', and '*peer' that are non-null. Each value is a + * bitmap of "enum ofp_port_features" bits, in host byte order. Returns 0 if + * successful, otherwise a positive errno value. On failure, all of the + * passed-in values are set to 0. */ +int +netdev_get_features(struct netdev *netdev, + uint32_t *current, uint32_t *advertised, + uint32_t *supported, uint32_t *peer) +{ + uint32_t dummy[4]; + return do_get_features(netdev, + current ? current : &dummy[0], + advertised ? advertised : &dummy[1], + supported ? supported : &dummy[2], + peer ? peer : &dummy[3]); +} + +int +netdev_set_advertisements(struct netdev *netdev, uint32_t advertise) +{ + struct ethtool_cmd ecmd; + int error; + + memset(&ecmd, 0, sizeof ecmd); + error = do_ethtool(netdev, &ecmd, ETHTOOL_GSET, "ETHTOOL_GSET"); + if (error) { + return error; + } + + ecmd.advertising = 0; + if (advertise & OFPPF_10MB_HD) { + ecmd.advertising |= ADVERTISED_10baseT_Half; + } + if (advertise & OFPPF_10MB_FD) { + ecmd.advertising |= ADVERTISED_10baseT_Full; + } + if (advertise & OFPPF_100MB_HD) { + ecmd.advertising |= ADVERTISED_100baseT_Half; + } + if (advertise & OFPPF_100MB_FD) { + ecmd.advertising |= ADVERTISED_100baseT_Full; + } + if (advertise & OFPPF_1GB_HD) { + ecmd.advertising |= ADVERTISED_1000baseT_Half; + } + if (advertise & OFPPF_1GB_FD) { + ecmd.advertising |= ADVERTISED_1000baseT_Full; + } + if (advertise & OFPPF_10GB_FD) { + ecmd.advertising |= ADVERTISED_10000baseT_Full; + } + if (advertise & OFPPF_COPPER) { + ecmd.advertising |= ADVERTISED_TP; + } + if (advertise & OFPPF_FIBER) { + ecmd.advertising |= ADVERTISED_FIBRE; + } + if (advertise & OFPPF_AUTONEG) { + ecmd.advertising |= ADVERTISED_Autoneg; + } + if (advertise & OFPPF_PAUSE) { + ecmd.advertising |= ADVERTISED_Pause; + } + if (advertise & OFPPF_PAUSE_ASYM) { + ecmd.advertising |= ADVERTISED_Asym_Pause; + } + return do_ethtool(netdev, &ecmd, ETHTOOL_SSET, "ETHTOOL_SSET"); +} + +/* If 'netdev' has an assigned IPv4 address, sets '*in4' to that address (if + * 'in4' is non-null) and returns true. Otherwise, returns false. */ +bool +netdev_get_in4(const struct netdev *netdev, struct in_addr *in4) +{ + struct ifreq ifr; + struct in_addr ip = { INADDR_ANY }; + + strncpy(ifr.ifr_name, netdev->name, sizeof ifr.ifr_name); + ifr.ifr_addr.sa_family = AF_INET; + COVERAGE_INC(netdev_get_in4); + if (ioctl(af_inet_sock, SIOCGIFADDR, &ifr) == 0) { + struct sockaddr_in *sin = (struct sockaddr_in *) &ifr.ifr_addr; + ip = sin->sin_addr; + } else { + VLOG_DBG_RL(&rl, "%s: ioctl(SIOCGIFADDR) failed: %s", + netdev->name, strerror(errno)); + } + if (in4) { + *in4 = ip; + } + return ip.s_addr != INADDR_ANY; +} + +static void +make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr) +{ + struct sockaddr_in sin; + memset(&sin, 0, sizeof sin); + sin.sin_family = AF_INET; + sin.sin_addr = addr; + sin.sin_port = 0; + + memset(sa, 0, sizeof *sa); + memcpy(sa, &sin, sizeof sin); +} + +static int +do_set_addr(struct netdev *netdev, int sock, + int ioctl_nr, const char *ioctl_name, struct in_addr addr) +{ + struct ifreq ifr; + int error; + + strncpy(ifr.ifr_name, netdev->name, sizeof ifr.ifr_name); + make_in4_sockaddr(&ifr.ifr_addr, addr); + COVERAGE_INC(netdev_set_in4); + error = ioctl(sock, ioctl_nr, &ifr) < 0 ? errno : 0; + if (error) { + VLOG_WARN("ioctl(%s): %s", ioctl_name, strerror(error)); + } + return error; +} + +/* Assigns 'addr' as 'netdev''s IPv4 address and 'mask' as its netmask. If + * 'addr' is INADDR_ANY, 'netdev''s IPv4 address is cleared. Returns a + * positive errno value. */ +int +netdev_set_in4(struct netdev *netdev, struct in_addr addr, struct in_addr mask) +{ + int error; + + error = do_set_addr(netdev, af_inet_sock, + SIOCSIFADDR, "SIOCSIFADDR", addr); + if (!error && addr.s_addr != INADDR_ANY) { + error = do_set_addr(netdev, af_inet_sock, + SIOCSIFNETMASK, "SIOCSIFNETMASK", mask); + } + return error; +} + +/* Adds 'router' as a default IP gateway. */ +int +netdev_add_router(struct in_addr router) +{ + struct in_addr any = { INADDR_ANY }; + struct rtentry rt; + int error; + + memset(&rt, 0, sizeof rt); + make_in4_sockaddr(&rt.rt_dst, any); + make_in4_sockaddr(&rt.rt_gateway, router); + make_in4_sockaddr(&rt.rt_genmask, any); + rt.rt_flags = RTF_UP | RTF_GATEWAY; + COVERAGE_INC(netdev_add_router); + error = ioctl(af_inet_sock, SIOCADDRT, &rt) < 0 ? errno : 0; + if (error) { + VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error)); + } + return error; +} + +/* If 'netdev' has an assigned IPv6 address, sets '*in6' to that address (if + * 'in6' is non-null) and returns true. Otherwise, returns false. */ +bool +netdev_get_in6(const struct netdev *netdev, struct in6_addr *in6) +{ + if (in6) { + *in6 = netdev->in6; + } + return memcmp(&netdev->in6, &in6addr_any, sizeof netdev->in6) != 0; +} + +/* Obtains the current flags for 'netdev' and stores them into '*flagsp'. + * Returns 0 if successful, otherwise a positive errno value. On failure, + * stores 0 into '*flagsp'. */ +int +netdev_get_flags(const struct netdev *netdev, enum netdev_flags *flagsp) +{ + return netdev_nodev_get_flags(netdev->name, flagsp); +} + +static int +nd_to_iff_flags(enum netdev_flags nd) +{ + int iff = 0; + if (nd & NETDEV_UP) { + iff |= IFF_UP; + } + if (nd & NETDEV_PROMISC) { + iff |= IFF_PROMISC; + } + return iff; +} + +/* On 'netdev', turns off the flags in 'off' and then turns on the flags in + * 'on'. If 'permanent' is true, the changes will persist; otherwise, they + * will be reverted when 'netdev' is closed or the program exits. Returns 0 if + * successful, otherwise a positive errno value. */ +static int +do_update_flags(struct netdev *netdev, enum netdev_flags off, + enum netdev_flags on, bool permanent) +{ + int old_flags, new_flags; + int error; + + error = get_flags(netdev->name, &old_flags); + if (error) { + return error; + } + + new_flags = (old_flags & ~nd_to_iff_flags(off)) | nd_to_iff_flags(on); + if (!permanent) { + netdev->changed_flags |= new_flags ^ old_flags; + } + if (new_flags != old_flags) { + error = set_flags(netdev->name, new_flags); + } + return error; +} + +/* Sets the flags for 'netdev' to 'flags'. + * If 'permanent' is true, the changes will persist; otherwise, they + * will be reverted when 'netdev' is closed or the program exits. + * Returns 0 if successful, otherwise a positive errno value. */ +int +netdev_set_flags(struct netdev *netdev, enum netdev_flags flags, + bool permanent) +{ + return do_update_flags(netdev, -1, flags, permanent); +} + +/* Turns on the specified 'flags' on 'netdev'. + * If 'permanent' is true, the changes will persist; otherwise, they + * will be reverted when 'netdev' is closed or the program exits. + * Returns 0 if successful, otherwise a positive errno value. */ +int +netdev_turn_flags_on(struct netdev *netdev, enum netdev_flags flags, + bool permanent) +{ + return do_update_flags(netdev, 0, flags, permanent); +} + +/* Turns off the specified 'flags' on 'netdev'. + * If 'permanent' is true, the changes will persist; otherwise, they + * will be reverted when 'netdev' is closed or the program exits. + * Returns 0 if successful, otherwise a positive errno value. */ +int +netdev_turn_flags_off(struct netdev *netdev, enum netdev_flags flags, + bool permanent) +{ + return do_update_flags(netdev, flags, 0, permanent); +} + +/* Looks up the ARP table entry for 'ip' on 'netdev'. If one exists and can be + * successfully retrieved, it stores the corresponding MAC address in 'mac' and + * returns 0. Otherwise, it returns a positive errno value; in particular, + * ENXIO indicates that there is not ARP table entry for 'ip' on 'netdev'. */ +int +netdev_arp_lookup(const struct netdev *netdev, + uint32_t ip, uint8_t mac[ETH_ADDR_LEN]) +{ + struct arpreq r; + struct sockaddr_in *pa; + int retval; + + memset(&r, 0, sizeof r); + pa = (struct sockaddr_in *) &r.arp_pa; + pa->sin_family = AF_INET; + pa->sin_addr.s_addr = ip; + pa->sin_port = 0; + r.arp_ha.sa_family = ARPHRD_ETHER; + r.arp_flags = 0; + strncpy(r.arp_dev, netdev->name, sizeof r.arp_dev); + COVERAGE_INC(netdev_arp_lookup); + retval = ioctl(af_inet_sock, SIOCGARP, &r) < 0 ? errno : 0; + if (!retval) { + memcpy(mac, r.arp_ha.sa_data, ETH_ADDR_LEN); + } else if (retval != ENXIO) { + VLOG_WARN_RL(&rl, "%s: could not look up ARP entry for "IP_FMT": %s", + netdev->name, IP_ARGS(&ip), strerror(retval)); + } + return retval; +} + +static int +get_stats_via_netlink(int ifindex, struct netdev_stats *stats) +{ + struct ofpbuf request; + struct ofpbuf *reply; + struct ifinfomsg *ifi; + const struct rtnl_link_stats *rtnl_stats; + struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)]; + int error; + + ofpbuf_init(&request, 0); + nl_msg_put_nlmsghdr(&request, rtnl_sock, sizeof *ifi, + RTM_GETLINK, NLM_F_REQUEST); + ifi = ofpbuf_put_zeros(&request, sizeof *ifi); + ifi->ifi_family = PF_UNSPEC; + ifi->ifi_index = ifindex; + error = nl_sock_transact(rtnl_sock, &request, &reply); + ofpbuf_uninit(&request); + if (error) { + return error; + } + + if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg), + rtnlgrp_link_policy, + attrs, ARRAY_SIZE(rtnlgrp_link_policy))) { + ofpbuf_delete(reply); + return EPROTO; + } + + if (!attrs[IFLA_STATS]) { + VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats"); + return EPROTO; + } + + rtnl_stats = nl_attr_get(attrs[IFLA_STATS]); + stats->rx_packets = rtnl_stats->rx_packets; + stats->tx_packets = rtnl_stats->tx_packets; + stats->rx_bytes = rtnl_stats->rx_bytes; + stats->tx_bytes = rtnl_stats->tx_bytes; + stats->rx_errors = rtnl_stats->rx_errors; + stats->tx_errors = rtnl_stats->tx_errors; + stats->rx_dropped = rtnl_stats->rx_dropped; + stats->tx_dropped = rtnl_stats->tx_dropped; + stats->multicast = rtnl_stats->multicast; + stats->collisions = rtnl_stats->collisions; + stats->rx_length_errors = rtnl_stats->rx_length_errors; + stats->rx_over_errors = rtnl_stats->rx_over_errors; + stats->rx_crc_errors = rtnl_stats->rx_crc_errors; + stats->rx_frame_errors = rtnl_stats->rx_frame_errors; + stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors; + stats->rx_missed_errors = rtnl_stats->rx_missed_errors; + stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors; + stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors; + stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors; + stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors; + stats->tx_window_errors = rtnl_stats->tx_window_errors; + + return 0; +} + +static int +get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats) +{ + static const char fn[] = "/proc/net/dev"; + char line[1024]; + FILE *stream; + int ln; + + stream = fopen(fn, "r"); + if (!stream) { + VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(errno)); + return errno; + } + + ln = 0; + while (fgets(line, sizeof line, stream)) { + if (++ln >= 3) { + char devname[16]; +#define X64 "%"SCNu64 + if (sscanf(line, + " %15[^:]:" + X64 X64 X64 X64 X64 X64 X64 "%*u" + X64 X64 X64 X64 X64 X64 X64 "%*u", + devname, + &stats->rx_bytes, + &stats->rx_packets, + &stats->rx_errors, + &stats->rx_dropped, + &stats->rx_fifo_errors, + &stats->rx_frame_errors, + &stats->multicast, + &stats->tx_bytes, + &stats->tx_packets, + &stats->tx_errors, + &stats->tx_dropped, + &stats->tx_fifo_errors, + &stats->collisions, + &stats->tx_carrier_errors) != 15) { + VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln); + } else if (!strcmp(devname, netdev_name)) { + stats->rx_length_errors = UINT64_MAX; + stats->rx_over_errors = UINT64_MAX; + stats->rx_crc_errors = UINT64_MAX; + stats->rx_missed_errors = UINT64_MAX; + stats->tx_aborted_errors = UINT64_MAX; + stats->tx_heartbeat_errors = UINT64_MAX; + stats->tx_window_errors = UINT64_MAX; + fclose(stream); + return 0; + } + } + } + VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name); + fclose(stream); + return ENODEV; +} + +int +netdev_get_carrier(const struct netdev *netdev, bool *carrier) +{ + char line[8]; + int retval; + int error; + char *fn; + int fd; + + *carrier = false; + + fn = xasprintf("/sys/class/net/%s/carrier", netdev->name); + fd = open(fn, O_RDONLY); + if (fd < 0) { + error = errno; + VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error)); + goto exit; + } + + retval = read(fd, line, sizeof line); + if (retval < 0) { + error = errno; + if (error == EINVAL) { + /* This is the normal return value when we try to check carrier if + * the network device is not up. */ + } else { + VLOG_WARN_RL(&rl, "%s: read failed: %s", fn, strerror(error)); + } + goto exit_close; + } else if (retval == 0) { + error = EPROTO; + VLOG_WARN_RL(&rl, "%s: unexpected end of file", fn); + goto exit_close; + } + + if (line[0] != '0' && line[0] != '1') { + error = EPROTO; + VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)", fn, line[0]); + goto exit_close; + } + *carrier = line[0] != '0'; + error = 0; + +exit_close: + close(fd); +exit: + free(fn); + return error; +} + +int +netdev_get_stats(const struct netdev *netdev, struct netdev_stats *stats) +{ + int error; + + COVERAGE_INC(netdev_get_stats); + if (use_netlink_stats) { + int ifindex; + + error = get_ifindex(netdev, &ifindex); + if (!error) { + error = get_stats_via_netlink(ifindex, stats); + } + } else { + error = get_stats_via_proc(netdev->name, stats); + } + + if (error) { + memset(stats, 0xff, sizeof *stats); + } + return error; +} + +#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress" +#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %dkbit burst %dk mtu 65535 drop flowid :1" +/* We redirect stderr to /dev/null because we often want to remove all + * traffic control configuration on a port so its in a known state. If + * this done when there is no such configuration, tc complains, so we just + * always ignore it. + */ +#define POLICE_DEL_CMD "/sbin/tc qdisc del dev %s handle ffff: ingress 2>/dev/null" + +/* Attempts to set input rate limiting (policing) policy. */ +int +netdev_nodev_set_policing(const char *netdev_name, uint32_t kbits_rate, + uint32_t kbits_burst) +{ + char command[1024]; + + init_netdev(); + + COVERAGE_INC(netdev_set_policing); + if (kbits_rate) { + if (!kbits_burst) { + /* Default to 10 kilobits if not specified. */ + kbits_burst = 10; + } + + /* xxx This should be more careful about only adding if it + * xxx actually exists, as opposed to always deleting it. */ + snprintf(command, sizeof(command), POLICE_DEL_CMD, netdev_name); + if (system(command) == -1) { + VLOG_WARN_RL(&rl, "%s: problem removing policing", netdev_name); + } + + snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name); + if (system(command) != 0) { + VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name); + return -1; + } + + snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name, + kbits_rate, kbits_burst); + if (system(command) != 0) { + VLOG_WARN_RL(&rl, "%s: problem configuring policing", + netdev_name); + return -1; + } + } else { + snprintf(command, sizeof(command), POLICE_DEL_CMD, netdev_name); + if (system(command) == -1) { + VLOG_WARN_RL(&rl, "%s: problem removing policing", netdev_name); + } + } + + return 0; +} + +int +netdev_set_policing(struct netdev *netdev, uint32_t kbits_rate, + uint32_t kbits_burst) +{ + return netdev_nodev_set_policing(netdev->name, kbits_rate, kbits_burst); +} + +/* Initializes 'svec' with a list of the names of all known network devices. */ +void +netdev_enumerate(struct svec *svec) +{ + struct if_nameindex *names; + + svec_init(svec); + names = if_nameindex(); + if (names) { + size_t i; + + for (i = 0; names[i].if_name != NULL; i++) { + svec_add(svec, names[i].if_name); + } + if_freenameindex(names); + } else { + VLOG_WARN("could not obtain list of network device names: %s", + strerror(errno)); + } +} + +/* Obtains the current flags for the network device named 'netdev_name' and + * stores them into '*flagsp'. Returns 0 if successful, otherwise a positive + * errno value. On error, stores 0 into '*flagsp'. + * + * If only device flags are needed, this is more efficient than calling + * netdev_open(), netdev_get_flags(), netdev_close(). */ +int +netdev_nodev_get_flags(const char *netdev_name, enum netdev_flags *flagsp) +{ + int error, flags; + + init_netdev(); + + *flagsp = 0; + error = get_flags(netdev_name, &flags); + if (error) { + return error; + } + + if (flags & IFF_UP) { + *flagsp |= NETDEV_UP; + } + if (flags & IFF_PROMISC) { + *flagsp |= NETDEV_PROMISC; + } + return 0; +} + +int +netdev_nodev_get_etheraddr(const char *netdev_name, uint8_t mac[6]) +{ + init_netdev(); + + return get_etheraddr(netdev_name, mac, NULL); +} + +/* If 'netdev_name' is the name of a VLAN network device (e.g. one created with + * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device + * and returns 0. Otherwise returns a errno value (specifically ENOENT if + * 'netdev_name' is the name of a network device that is not a VLAN device) and + * sets '*vlan_vid' to -1. */ +int +netdev_get_vlan_vid(const char *netdev_name, int *vlan_vid) +{ + struct ds line = DS_EMPTY_INITIALIZER; + FILE *stream = NULL; + int error; + char *fn; + + COVERAGE_INC(netdev_get_vlan_vid); + fn = xasprintf("/proc/net/vlan/%s", netdev_name); + stream = fopen(fn, "r"); + if (!stream) { + error = errno; + goto done; + } + + if (ds_get_line(&line, stream)) { + if (ferror(stream)) { + error = errno; + VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(errno)); + } else { + error = EPROTO; + VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn); + } + goto done; + } + + if (!sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid)) { + error = EPROTO; + VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"", + fn, ds_cstr(&line)); + goto done; + } + + error = 0; + +done: + free(fn); + if (stream) { + fclose(stream); + } + ds_destroy(&line); + if (error) { + *vlan_vid = -1; + } + return error; +} + +static void restore_all_flags(void *aux); + +/* Set up a signal hook to restore network device flags on program + * termination. */ +static void +init_netdev(void) +{ + static bool inited; + if (!inited) { + int ifindex; + int error; + + inited = true; + + fatal_signal_add_hook(restore_all_flags, NULL, true); + + af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0); + if (af_inet_sock < 0) { + ovs_fatal(errno, "socket(AF_INET)"); + } + + error = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &rtnl_sock); + if (error) { + ovs_fatal(error, "socket(AF_NETLINK, NETLINK_ROUTE)"); + } + + /* Decide on the netdev_get_stats() implementation to use. Netlink is + * preferable, so if that works, we'll use it. */ + ifindex = do_get_ifindex("lo"); + if (ifindex < 0) { + VLOG_WARN("failed to get ifindex for lo, " + "obtaining netdev stats from proc"); + use_netlink_stats = false; + } else { + struct netdev_stats stats; + error = get_stats_via_netlink(ifindex, &stats); + if (!error) { + VLOG_DBG("obtaining netdev stats via rtnetlink"); + use_netlink_stats = true; + } else { + VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats " + "via proc (you are probably running a pre-2.6.19 " + "kernel)", strerror(error)); + use_netlink_stats = false; + } + } + } +} + +/* Restore the network device flags on 'netdev' to those that were active + * before we changed them. Returns 0 if successful, otherwise a positive + * errno value. + * + * To avoid reentry, the caller must ensure that fatal signals are blocked. */ +static int +restore_flags(struct netdev *netdev) +{ + struct ifreq ifr; + int restore_flags; + + /* Get current flags. */ + strncpy(ifr.ifr_name, netdev->name, sizeof ifr.ifr_name); + COVERAGE_INC(netdev_get_flags); + if (ioctl(netdev->netdev_fd, SIOCGIFFLAGS, &ifr) < 0) { + return errno; + } + + /* Restore flags that we might have changed, if necessary. */ + restore_flags = netdev->changed_flags & (IFF_PROMISC | IFF_UP); + if ((ifr.ifr_flags ^ netdev->save_flags) & restore_flags) { + ifr.ifr_flags &= ~restore_flags; + ifr.ifr_flags |= netdev->save_flags & restore_flags; + COVERAGE_INC(netdev_set_flags); + if (ioctl(netdev->netdev_fd, SIOCSIFFLAGS, &ifr) < 0) { + return errno; + } + } + + return 0; +} + +/* Retores all the flags on all network devices that we modified. Called from + * a signal handler, so it does not attempt to report error conditions. */ +static void +restore_all_flags(void *aux UNUSED) +{ + struct netdev *netdev; + LIST_FOR_EACH (netdev, struct netdev, node, &netdev_list) { + restore_flags(netdev); + } +} + +static int +get_flags(const char *netdev_name, int *flags) +{ + struct ifreq ifr; + strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name); + COVERAGE_INC(netdev_get_flags); + if (ioctl(af_inet_sock, SIOCGIFFLAGS, &ifr) < 0) { + VLOG_ERR("ioctl(SIOCGIFFLAGS) on %s device failed: %s", + netdev_name, strerror(errno)); + return errno; + } + *flags = ifr.ifr_flags; + return 0; +} + +static int +set_flags(const char *netdev_name, int flags) +{ + struct ifreq ifr; + strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name); + ifr.ifr_flags = flags; + COVERAGE_INC(netdev_set_flags); + if (ioctl(af_inet_sock, SIOCSIFFLAGS, &ifr) < 0) { + VLOG_ERR("ioctl(SIOCSIFFLAGS) on %s device failed: %s", + netdev_name, strerror(errno)); + return errno; + } + return 0; +} + +static int +do_get_ifindex(const char *netdev_name) +{ + struct ifreq ifr; + + strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name); + COVERAGE_INC(netdev_get_ifindex); + if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) { + VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s", + netdev_name, strerror(errno)); + return -errno; + } + return ifr.ifr_ifindex; +} + +static int +get_ifindex(const struct netdev *netdev, int *ifindexp) +{ + *ifindexp = 0; + if (netdev->ifindex < 0) { + int ifindex = do_get_ifindex(netdev->name); + if (ifindex < 0) { + return -ifindex; + } + ((struct netdev *) netdev)->ifindex = ifindex; + } + *ifindexp = netdev->ifindex; + return 0; +} + +static int +get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN], + int *hwaddr_familyp) +{ + struct ifreq ifr; + + memset(&ifr, 0, sizeof ifr); + strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name); + COVERAGE_INC(netdev_get_hwaddr); + if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) { + VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s", + netdev_name, strerror(errno)); + return errno; + } + if (hwaddr_familyp) { + int hwaddr_family = ifr.ifr_hwaddr.sa_family; + *hwaddr_familyp = hwaddr_family; + if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) { + VLOG_WARN("%s device has unknown hardware address family %d", + netdev_name, hwaddr_family); + } + } + memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN); + return 0; +} + +static int +set_etheraddr(const char *netdev_name, int hwaddr_family, + const uint8_t mac[ETH_ADDR_LEN]) +{ + struct ifreq ifr; + + memset(&ifr, 0, sizeof ifr); + strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name); + ifr.ifr_hwaddr.sa_family = hwaddr_family; + memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN); + COVERAGE_INC(netdev_set_hwaddr); + if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) { + VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s", + netdev_name, strerror(errno)); + return errno; + } + return 0; +} diff --git a/lib/netdev.h b/lib/netdev.h new file mode 100644 index 00000000..63462c56 --- /dev/null +++ b/lib/netdev.h @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef NETDEV_H +#define NETDEV_H 1 + +#include +#include +#include + +/* Generic interface to network devices. + * + * Currently, there is a single implementation of this interface that supports + * Linux. The interface should be generic enough to be implementable on other + * operating systems as well. */ + +struct ofpbuf; +struct in_addr; +struct in6_addr; +struct svec; + +enum netdev_flags { + NETDEV_UP = 0x0001, /* Device enabled? */ + NETDEV_PROMISC = 0x0002 /* Promiscuous mode? */ +}; + +enum netdev_pseudo_ethertype { + NETDEV_ETH_TYPE_NONE = -128, /* Receive no frames. */ + NETDEV_ETH_TYPE_ANY, /* Receive all frames. */ + NETDEV_ETH_TYPE_802_2 /* Receive all IEEE 802.2 frames. */ +}; + +struct netdev_stats { + uint64_t rx_packets; /* Total packets received. */ + uint64_t tx_packets; /* Total packets transmitted. */ + uint64_t rx_bytes; /* Total bytes received. */ + uint64_t tx_bytes; /* Total bytes transmitted. */ + uint64_t rx_errors; /* Bad packets received. */ + uint64_t tx_errors; /* Packet transmit problems. */ + uint64_t rx_dropped; /* No buffer space. */ + uint64_t tx_dropped; /* No buffer space. */ + uint64_t multicast; /* Multicast packets received. */ + uint64_t collisions; + + /* Detailed receive errors. */ + uint64_t rx_length_errors; + uint64_t rx_over_errors; /* Receiver ring buff overflow. */ + uint64_t rx_crc_errors; /* Recved pkt with crc error. */ + uint64_t rx_frame_errors; /* Recv'd frame alignment error. */ + uint64_t rx_fifo_errors; /* Recv'r fifo overrun . */ + uint64_t rx_missed_errors; /* Receiver missed packet. */ + + /* Detailed transmit errors. */ + uint64_t tx_aborted_errors; + uint64_t tx_carrier_errors; + uint64_t tx_fifo_errors; + uint64_t tx_heartbeat_errors; + uint64_t tx_window_errors; +}; + +struct netdev; + +int netdev_open(const char *name, int ethertype, struct netdev **); +int netdev_open_tap(const char *name, struct netdev **); +void netdev_close(struct netdev *); + +int netdev_recv(struct netdev *, struct ofpbuf *); +void netdev_recv_wait(struct netdev *); +int netdev_drain(struct netdev *); +int netdev_send(struct netdev *, const struct ofpbuf *); +void netdev_send_wait(struct netdev *); +int netdev_set_etheraddr(struct netdev *, const uint8_t mac[6]); +const uint8_t *netdev_get_etheraddr(const struct netdev *); +const char *netdev_get_name(const struct netdev *); +int netdev_get_mtu(const struct netdev *); +int netdev_get_features(struct netdev *, + uint32_t *current, uint32_t *advertised, + uint32_t *supported, uint32_t *peer); +int netdev_set_advertisements(struct netdev *, uint32_t advertise); +bool netdev_get_in4(const struct netdev *, struct in_addr *); +int netdev_set_in4(struct netdev *, struct in_addr addr, struct in_addr mask); +int netdev_add_router(struct in_addr router); +bool netdev_get_in6(const struct netdev *, struct in6_addr *); +int netdev_get_flags(const struct netdev *, enum netdev_flags *); +int netdev_set_flags(struct netdev *, enum netdev_flags, bool permanent); +int netdev_turn_flags_on(struct netdev *, enum netdev_flags, bool permanent); +int netdev_turn_flags_off(struct netdev *, enum netdev_flags, bool permanent); +int netdev_arp_lookup(const struct netdev *, uint32_t ip, uint8_t mac[6]); +int netdev_get_carrier(const struct netdev *, bool *carrier); +int netdev_get_stats(const struct netdev *, struct netdev_stats *); +int netdev_set_policing(struct netdev *, uint32_t kbits_rate, + uint32_t kbits_burst); + +void netdev_enumerate(struct svec *); +int netdev_nodev_get_flags(const char *netdev_name, enum netdev_flags *); +int netdev_nodev_set_etheraddr(const char *name, const uint8_t mac[6]); +int netdev_nodev_get_etheraddr(const char *netdev_name, uint8_t mac[6]); +int netdev_nodev_set_policing(const char *netdev_name, uint32_t kbits_rate, + uint32_t kbits_burst); + +int netdev_get_vlan_vid(const char *netdev_name, int *vlan_vid); + +#endif /* netdev.h */ diff --git a/lib/netlink-protocol.h b/lib/netlink-protocol.h new file mode 100644 index 00000000..92694e8b --- /dev/null +++ b/lib/netlink-protocol.h @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef NETLINK_PROTOCOL_H +#define NETLINK_PROTOCOL_H 1 + +/* Netlink protocol definitions. + * + * These definitions are equivalent to those in the Linux 2.6 kernel headers, + * without requiring those headers to be available. */ + +#include +#include +#include "util.h" + +#define NETLINK_GENERIC 16 + +struct sockaddr_nl { + sa_family_t nl_family; + unsigned short int nl_pad; + uint32_t nl_pid; + uint32_t nl_groups; +}; +BUILD_ASSERT_DECL(sizeof(struct sockaddr_nl) == 12); + +/* nlmsg_flags bits. */ +#define NLM_F_REQUEST 0x001 +#define NLM_F_MULTI 0x002 +#define NLM_F_ACK 0x004 +#define NLM_F_ECHO 0x008 + +#define NLM_F_ROOT 0x100 +#define NLM_F_MATCH 0x200 +#define NLM_F_ATOMIC 0x400 +#define NLM_F_DUMP (NLM_F_ROOT | NLM_F_MATCH) + +/* nlmsg_type values. */ +#define NLMSG_NOOP 1 +#define NLMSG_ERROR 2 +#define NLMSG_DONE 3 +#define NLMSG_OVERRUN 4 + +#define NLMSG_MIN_TYPE 0x10 + +struct nlmsghdr { + uint32_t nlmsg_len; + uint16_t nlmsg_type; + uint16_t nlmsg_flags; + uint32_t nlmsg_seq; + uint32_t nlmsg_pid; +}; +BUILD_ASSERT_DECL(sizeof(struct nlmsghdr) == 16); + +#define NLMSG_ALIGNTO 4 +#define NLMSG_ALIGN(SIZE) ROUND_UP(SIZE, NLMSG_ALIGNTO) +#define NLMSG_HDRLEN ((int) NLMSG_ALIGN(sizeof(struct nlmsghdr))) + +struct nlmsgerr +{ + int error; + struct nlmsghdr msg; +}; +BUILD_ASSERT_DECL(sizeof(struct nlmsgerr) == 20); + +#define NETLINK_ADD_MEMBERSHIP 1 +#define NETLINK_DROP_MEMBERSHIP 2 +#define NETLINK_PKTINFO 3 + +struct genlmsghdr { + uint8_t cmd; + uint8_t version; + uint16_t reserved; +}; +BUILD_ASSERT_DECL(sizeof(struct genlmsghdr) == 4); + +#define GENL_HDRLEN NLMSG_ALIGN(sizeof(struct genlmsghdr)) + +struct nlattr { + uint16_t nla_len; + uint16_t nla_type; +}; +BUILD_ASSERT_DECL(sizeof(struct nlattr) == 4); + +#define NLA_ALIGNTO 4 +#define NLA_ALIGN(SIZE) ROUND_UP(SIZE, NLA_ALIGNTO) +#define NLA_HDRLEN ((int) NLA_ALIGN(sizeof(struct nlattr))) + +#define GENL_MIN_ID NLMSG_MIN_TYPE +#define GENL_MAX_ID 1023 + +#define GENL_ID_CTRL NLMSG_MIN_TYPE + +enum { + CTRL_CMD_UNSPEC, + CTRL_CMD_NEWFAMILY, + CTRL_CMD_DELFAMILY, + CTRL_CMD_GETFAMILY, + CTRL_CMD_NEWOPS, + CTRL_CMD_DELOPS, + CTRL_CMD_GETOPS, + __CTRL_CMD_MAX, +}; + +#define CTRL_CMD_MAX (__CTRL_CMD_MAX - 1) + +enum { + CTRL_ATTR_UNSPEC, + CTRL_ATTR_FAMILY_ID, + CTRL_ATTR_FAMILY_NAME, + CTRL_ATTR_VERSION, + CTRL_ATTR_HDRSIZE, + CTRL_ATTR_MAXATTR, + CTRL_ATTR_OPS, + __CTRL_ATTR_MAX, +}; + +#define CTRL_ATTR_MAX (__CTRL_ATTR_MAX - 1) + +enum { + CTRL_ATTR_OP_UNSPEC, + CTRL_ATTR_OP_ID, + CTRL_ATTR_OP_FLAGS, + __CTRL_ATTR_OP_MAX, +}; + +#define CTRL_ATTR_OP_MAX (__CTRL_ATTR_OP_MAX - 1) + +#endif /* netlink-protocol.h */ diff --git a/lib/netlink.c b/lib/netlink.c new file mode 100644 index 00000000..bc1956a6 --- /dev/null +++ b/lib/netlink.c @@ -0,0 +1,1077 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "netlink.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include "coverage.h" +#include "dynamic-string.h" +#include "netlink-protocol.h" +#include "ofpbuf.h" +#include "poll-loop.h" +#include "timeval.h" +#include "util.h" + +#include "vlog.h" +#define THIS_MODULE VLM_netlink + +/* Linux header file confusion causes this to be undefined. */ +#ifndef SOL_NETLINK +#define SOL_NETLINK 270 +#endif + +/* A single (bad) Netlink message can in theory dump out many, many log + * messages, so the burst size is set quite high here to avoid missing useful + * information. Also, at high logging levels we log *all* Netlink messages. */ +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 600); + +static void log_nlmsg(const char *function, int error, + const void *message, size_t size); + +/* Netlink sockets. */ + +struct nl_sock +{ + int fd; + uint32_t pid; +}; + +/* Next nlmsghdr sequence number. + * + * This implementation uses sequence numbers that are unique process-wide, to + * avoid a hypothetical race: send request, close socket, open new socket that + * reuses the old socket's PID value, send request on new socket, receive reply + * from kernel to old socket but with same PID and sequence number. (This race + * could be avoided other ways, e.g. by preventing PIDs from being quickly + * reused). */ +static uint32_t next_seq; + +static int alloc_pid(uint32_t *); +static void free_pid(uint32_t); + +/* Creates a new netlink socket for the given netlink 'protocol' + * (NETLINK_ROUTE, NETLINK_GENERIC, ...). Returns 0 and sets '*sockp' to the + * new socket if successful, otherwise returns a positive errno value. + * + * If 'multicast_group' is nonzero, the new socket subscribes to the specified + * netlink multicast group. (A netlink socket may listen to an arbitrary + * number of multicast groups, but so far we only need one at a time.) + * + * Nonzero 'so_sndbuf' or 'so_rcvbuf' override the kernel default send or + * receive buffer size, respectively. + */ +int +nl_sock_create(int protocol, int multicast_group, + size_t so_sndbuf, size_t so_rcvbuf, struct nl_sock **sockp) +{ + struct nl_sock *sock; + struct sockaddr_nl local, remote; + int retval = 0; + + if (next_seq == 0) { + /* Pick initial sequence number. */ + next_seq = getpid() ^ time_now(); + } + + *sockp = NULL; + sock = malloc(sizeof *sock); + if (sock == NULL) { + return ENOMEM; + } + + sock->fd = socket(AF_NETLINK, SOCK_RAW, protocol); + if (sock->fd < 0) { + VLOG_ERR("fcntl: %s", strerror(errno)); + goto error; + } + + retval = alloc_pid(&sock->pid); + if (retval) { + goto error; + } + + if (so_sndbuf != 0 + && setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF, + &so_sndbuf, sizeof so_sndbuf) < 0) { + VLOG_ERR("setsockopt(SO_SNDBUF,%zu): %s", so_sndbuf, strerror(errno)); + goto error_free_pid; + } + + if (so_rcvbuf != 0 + && setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF, + &so_rcvbuf, sizeof so_rcvbuf) < 0) { + VLOG_ERR("setsockopt(SO_RCVBUF,%zu): %s", so_rcvbuf, strerror(errno)); + goto error_free_pid; + } + + /* Bind local address as our selected pid. */ + memset(&local, 0, sizeof local); + local.nl_family = AF_NETLINK; + local.nl_pid = sock->pid; + if (multicast_group > 0 && multicast_group <= 32) { + /* This method of joining multicast groups is supported by old kernels, + * but it only allows 32 multicast groups per protocol. */ + local.nl_groups |= 1ul << (multicast_group - 1); + } + if (bind(sock->fd, (struct sockaddr *) &local, sizeof local) < 0) { + VLOG_ERR("bind(%"PRIu32"): %s", sock->pid, strerror(errno)); + goto error_free_pid; + } + + /* Bind remote address as the kernel (pid 0). */ + memset(&remote, 0, sizeof remote); + remote.nl_family = AF_NETLINK; + remote.nl_pid = 0; + if (connect(sock->fd, (struct sockaddr *) &remote, sizeof remote) < 0) { + VLOG_ERR("connect(0): %s", strerror(errno)); + goto error_free_pid; + } + + /* Older kernel headers failed to define this macro. We want our programs + * to support the newer kernel features even if compiled with older + * headers, so define it ourselves in such a case. */ +#ifndef NETLINK_ADD_MEMBERSHIP +#define NETLINK_ADD_MEMBERSHIP 1 +#endif + + /* This method of joining multicast groups is only supported by newish + * kernels, but it allows for an arbitrary number of multicast groups. */ + if (multicast_group > 32 + && setsockopt(sock->fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, + &multicast_group, sizeof multicast_group) < 0) { + VLOG_ERR("setsockopt(NETLINK_ADD_MEMBERSHIP,%d): %s", + multicast_group, strerror(errno)); + goto error_free_pid; + } + + *sockp = sock; + return 0; + +error_free_pid: + free_pid(sock->pid); +error: + if (retval == 0) { + retval = errno; + if (retval == 0) { + retval = EINVAL; + } + } + if (sock->fd >= 0) { + close(sock->fd); + } + free(sock); + return retval; +} + +/* Destroys netlink socket 'sock'. */ +void +nl_sock_destroy(struct nl_sock *sock) +{ + if (sock) { + close(sock->fd); + free_pid(sock->pid); + free(sock); + } +} + +/* Tries to send 'msg', which must contain a Netlink message, to the kernel on + * 'sock'. nlmsg_len in 'msg' will be finalized to match msg->size before the + * message is sent. + * + * Returns 0 if successful, otherwise a positive errno value. If + * 'wait' is true, then the send will wait until buffer space is ready; + * otherwise, returns EAGAIN if the 'sock' send buffer is full. */ +int +nl_sock_send(struct nl_sock *sock, const struct ofpbuf *msg, bool wait) +{ + int error; + + nl_msg_nlmsghdr(msg)->nlmsg_len = msg->size; + do { + int retval; + retval = send(sock->fd, msg->data, msg->size, wait ? 0 : MSG_DONTWAIT); + error = retval < 0 ? errno : 0; + } while (error == EINTR); + log_nlmsg(__func__, error, msg->data, msg->size); + if (!error) { + COVERAGE_INC(netlink_sent); + } + return error; +} + +/* Tries to send the 'n_iov' chunks of data in 'iov' to the kernel on 'sock' as + * a single Netlink message. (The message must be fully formed and not require + * finalization of its nlmsg_len field.) + * + * Returns 0 if successful, otherwise a positive errno value. If 'wait' is + * true, then the send will wait until buffer space is ready; otherwise, + * returns EAGAIN if the 'sock' send buffer is full. */ +int +nl_sock_sendv(struct nl_sock *sock, const struct iovec iov[], size_t n_iov, + bool wait) +{ + struct msghdr msg; + int error; + + COVERAGE_INC(netlink_send); + memset(&msg, 0, sizeof msg); + msg.msg_iov = (struct iovec *) iov; + msg.msg_iovlen = n_iov; + do { + int retval; + retval = sendmsg(sock->fd, &msg, wait ? 0 : MSG_DONTWAIT); + error = retval < 0 ? errno : 0; + } while (error == EINTR); + if (error != EAGAIN) { + log_nlmsg(__func__, error, iov[0].iov_base, iov[0].iov_len); + if (!error) { + COVERAGE_INC(netlink_sent); + } + } + return error; +} + +/* Tries to receive a netlink message from the kernel on 'sock'. If + * successful, stores the received message into '*bufp' and returns 0. The + * caller is responsible for destroying the message with ofpbuf_delete(). On + * failure, returns a positive errno value and stores a null pointer into + * '*bufp'. + * + * If 'wait' is true, nl_sock_recv waits for a message to be ready; otherwise, + * returns EAGAIN if the 'sock' receive buffer is empty. */ +int +nl_sock_recv(struct nl_sock *sock, struct ofpbuf **bufp, bool wait) +{ + uint8_t tmp; + ssize_t bufsize = 2048; + ssize_t nbytes, nbytes2; + struct ofpbuf *buf; + struct nlmsghdr *nlmsghdr; + struct iovec iov; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_iov = &iov, + .msg_iovlen = 1, + .msg_control = NULL, + .msg_controllen = 0, + .msg_flags = 0 + }; + + buf = ofpbuf_new(bufsize); + *bufp = NULL; + +try_again: + /* Attempt to read the message. We don't know the size of the data + * yet, so we take a guess at 2048. If we're wrong, we keep trying + * and doubling the buffer size each time. + */ + nlmsghdr = ofpbuf_put_uninit(buf, bufsize); + iov.iov_base = nlmsghdr; + iov.iov_len = bufsize; + do { + nbytes = recvmsg(sock->fd, &msg, (wait ? 0 : MSG_DONTWAIT) | MSG_PEEK); + } while (nbytes < 0 && errno == EINTR); + if (nbytes < 0) { + ofpbuf_delete(buf); + return errno; + } + if (msg.msg_flags & MSG_TRUNC) { + COVERAGE_INC(netlink_recv_retry); + bufsize *= 2; + ofpbuf_reinit(buf, bufsize); + goto try_again; + } + buf->size = nbytes; + + /* We successfully read the message, so recv again to clear the queue */ + iov.iov_base = &tmp; + iov.iov_len = 1; + do { + nbytes2 = recvmsg(sock->fd, &msg, MSG_DONTWAIT); + } while (nbytes2 < 0 && errno == EINTR); + if (nbytes2 < 0) { + if (errno == ENOBUFS) { + /* The kernel is notifying us that a message it tried to send to us + * was dropped. We have to pass this along to the caller in case + * it wants to retry a request. So kill the buffer, which we can + * re-read next time. */ + COVERAGE_INC(netlink_overflow); + ofpbuf_delete(buf); + return ENOBUFS; + } else { + VLOG_ERR_RL(&rl, "failed to remove nlmsg from socket: %s\n", + strerror(errno)); + } + } + if (nbytes < sizeof *nlmsghdr + || nlmsghdr->nlmsg_len < sizeof *nlmsghdr + || nlmsghdr->nlmsg_len > nbytes) { + VLOG_ERR_RL(&rl, "received invalid nlmsg (%zd bytes < %d)", + bufsize, NLMSG_HDRLEN); + ofpbuf_delete(buf); + return EPROTO; + } + *bufp = buf; + log_nlmsg(__func__, 0, buf->data, buf->size); + COVERAGE_INC(netlink_received); + return 0; +} + +/* Sends 'request' to the kernel via 'sock' and waits for a response. If + * successful, stores the reply into '*replyp' and returns 0. The caller is + * responsible for destroying the reply with ofpbuf_delete(). On failure, + * returns a positive errno value and stores a null pointer into '*replyp'. + * + * The caller is responsible for destroying 'request'. + * + * Bare Netlink is an unreliable transport protocol. This function layers + * reliable delivery and reply semantics on top of bare Netlink. + * + * In Netlink, sending a request to the kernel is reliable enough, because the + * kernel will tell us if the message cannot be queued (and we will in that + * case put it on the transmit queue and wait until it can be delivered). + * + * Receiving the reply is the real problem: if the socket buffer is full when + * the kernel tries to send the reply, the reply will be dropped. However, the + * kernel sets a flag that a reply has been dropped. The next call to recv + * then returns ENOBUFS. We can then re-send the request. + * + * Caveats: + * + * 1. Netlink depends on sequence numbers to match up requests and + * replies. The sender of a request supplies a sequence number, and + * the reply echos back that sequence number. + * + * This is fine, but (1) some kernel netlink implementations are + * broken, in that they fail to echo sequence numbers and (2) this + * function will drop packets with non-matching sequence numbers, so + * that only a single request can be usefully transacted at a time. + * + * 2. Resending the request causes it to be re-executed, so the request + * needs to be idempotent. + */ +int +nl_sock_transact(struct nl_sock *sock, + const struct ofpbuf *request, struct ofpbuf **replyp) +{ + uint32_t seq = nl_msg_nlmsghdr(request)->nlmsg_seq; + struct nlmsghdr *nlmsghdr; + struct ofpbuf *reply; + int retval; + + *replyp = NULL; + + /* Ensure that we get a reply even if this message doesn't ordinarily call + * for one. */ + nl_msg_nlmsghdr(request)->nlmsg_flags |= NLM_F_ACK; + +send: + retval = nl_sock_send(sock, request, true); + if (retval) { + return retval; + } + +recv: + retval = nl_sock_recv(sock, &reply, true); + if (retval) { + if (retval == ENOBUFS) { + COVERAGE_INC(netlink_overflow); + VLOG_DBG_RL(&rl, "receive buffer overflow, resending request"); + goto send; + } else { + return retval; + } + } + nlmsghdr = nl_msg_nlmsghdr(reply); + if (seq != nlmsghdr->nlmsg_seq) { + VLOG_DBG_RL(&rl, "ignoring seq %"PRIu32" != expected %"PRIu32, + nl_msg_nlmsghdr(reply)->nlmsg_seq, seq); + ofpbuf_delete(reply); + goto recv; + } + if (nl_msg_nlmsgerr(reply, &retval)) { + ofpbuf_delete(reply); + if (retval) { + VLOG_DBG_RL(&rl, "received NAK error=%d (%s)", + retval, strerror(retval)); + } + return retval != EAGAIN ? retval : EPROTO; + } + + *replyp = reply; + return 0; +} + +/* Causes poll_block() to wake up when any of the specified 'events' (which is + * a OR'd combination of POLLIN, POLLOUT, etc.) occur on 'sock'. */ +void +nl_sock_wait(const struct nl_sock *sock, short int events) +{ + poll_fd_wait(sock->fd, events); +} + +/* Netlink messages. */ + +/* Returns the nlmsghdr at the head of 'msg'. + * + * 'msg' must be at least as large as a nlmsghdr. */ +struct nlmsghdr * +nl_msg_nlmsghdr(const struct ofpbuf *msg) +{ + return ofpbuf_at_assert(msg, 0, NLMSG_HDRLEN); +} + +/* Returns the genlmsghdr just past 'msg''s nlmsghdr. + * + * Returns a null pointer if 'msg' is not large enough to contain an nlmsghdr + * and a genlmsghdr. */ +struct genlmsghdr * +nl_msg_genlmsghdr(const struct ofpbuf *msg) +{ + return ofpbuf_at(msg, NLMSG_HDRLEN, GENL_HDRLEN); +} + +/* If 'buffer' is a NLMSG_ERROR message, stores 0 in '*errorp' if it is an ACK + * message, otherwise a positive errno value, and returns true. If 'buffer' is + * not an NLMSG_ERROR message, returns false. + * + * 'msg' must be at least as large as a nlmsghdr. */ +bool +nl_msg_nlmsgerr(const struct ofpbuf *msg, int *errorp) +{ + if (nl_msg_nlmsghdr(msg)->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *err = ofpbuf_at(msg, NLMSG_HDRLEN, sizeof *err); + int code = EPROTO; + if (!err) { + VLOG_ERR_RL(&rl, "received invalid nlmsgerr (%zd bytes < %zd)", + msg->size, NLMSG_HDRLEN + sizeof *err); + } else if (err->error <= 0 && err->error > INT_MIN) { + code = -err->error; + } + if (errorp) { + *errorp = code; + } + return true; + } else { + return false; + } +} + +/* Ensures that 'b' has room for at least 'size' bytes plus netlink padding at + * its tail end, reallocating and copying its data if necessary. */ +void +nl_msg_reserve(struct ofpbuf *msg, size_t size) +{ + ofpbuf_prealloc_tailroom(msg, NLMSG_ALIGN(size)); +} + +/* Puts a nlmsghdr at the beginning of 'msg', which must be initially empty. + * Uses the given 'type' and 'flags'. 'sock' is used to obtain a PID and + * sequence number for proper routing of replies. 'expected_payload' should be + * an estimate of the number of payload bytes to be supplied; if the size of + * the payload is unknown a value of 0 is acceptable. + * + * 'type' is ordinarily an enumerated value specific to the Netlink protocol + * (e.g. RTM_NEWLINK, for NETLINK_ROUTE protocol). For Generic Netlink, 'type' + * is the family number obtained via nl_lookup_genl_family(). + * + * 'flags' is a bit-mask that indicates what kind of request is being made. It + * is often NLM_F_REQUEST indicating that a request is being made, commonly + * or'd with NLM_F_ACK to request an acknowledgement. + * + * nl_msg_put_genlmsghdr is more convenient for composing a Generic Netlink + * message. */ +void +nl_msg_put_nlmsghdr(struct ofpbuf *msg, struct nl_sock *sock, + size_t expected_payload, uint32_t type, uint32_t flags) +{ + struct nlmsghdr *nlmsghdr; + + assert(msg->size == 0); + + nl_msg_reserve(msg, NLMSG_HDRLEN + expected_payload); + nlmsghdr = nl_msg_put_uninit(msg, NLMSG_HDRLEN); + nlmsghdr->nlmsg_len = 0; + nlmsghdr->nlmsg_type = type; + nlmsghdr->nlmsg_flags = flags; + nlmsghdr->nlmsg_seq = ++next_seq; + nlmsghdr->nlmsg_pid = sock->pid; +} + +/* Puts a nlmsghdr and genlmsghdr at the beginning of 'msg', which must be + * initially empty. 'sock' is used to obtain a PID and sequence number for + * proper routing of replies. 'expected_payload' should be an estimate of the + * number of payload bytes to be supplied; if the size of the payload is + * unknown a value of 0 is acceptable. + * + * 'family' is the family number obtained via nl_lookup_genl_family(). + * + * 'flags' is a bit-mask that indicates what kind of request is being made. It + * is often NLM_F_REQUEST indicating that a request is being made, commonly + * or'd with NLM_F_ACK to request an acknowledgement. + * + * 'cmd' is an enumerated value specific to the Generic Netlink family + * (e.g. CTRL_CMD_NEWFAMILY for the GENL_ID_CTRL family). + * + * 'version' is a version number specific to the family and command (often 1). + * + * nl_msg_put_nlmsghdr should be used to compose Netlink messages that are not + * Generic Netlink messages. */ +void +nl_msg_put_genlmsghdr(struct ofpbuf *msg, struct nl_sock *sock, + size_t expected_payload, int family, uint32_t flags, + uint8_t cmd, uint8_t version) +{ + struct genlmsghdr *genlmsghdr; + + nl_msg_put_nlmsghdr(msg, sock, GENL_HDRLEN + expected_payload, + family, flags); + assert(msg->size == NLMSG_HDRLEN); + genlmsghdr = nl_msg_put_uninit(msg, GENL_HDRLEN); + genlmsghdr->cmd = cmd; + genlmsghdr->version = version; + genlmsghdr->reserved = 0; +} + +/* Appends the 'size' bytes of data in 'p', plus Netlink padding if needed, to + * the tail end of 'msg'. Data in 'msg' is reallocated and copied if + * necessary. */ +void +nl_msg_put(struct ofpbuf *msg, const void *data, size_t size) +{ + memcpy(nl_msg_put_uninit(msg, size), data, size); +} + +/* Appends 'size' bytes of data, plus Netlink padding if needed, to the tail + * end of 'msg', reallocating and copying its data if necessary. Returns a + * pointer to the first byte of the new data, which is left uninitialized. */ +void * +nl_msg_put_uninit(struct ofpbuf *msg, size_t size) +{ + size_t pad = NLMSG_ALIGN(size) - size; + char *p = ofpbuf_put_uninit(msg, size + pad); + if (pad) { + memset(p + size, 0, pad); + } + return p; +} + +/* Appends a Netlink attribute of the given 'type' and room for 'size' bytes of + * data as its payload, plus Netlink padding if needed, to the tail end of + * 'msg', reallocating and copying its data if necessary. Returns a pointer to + * the first byte of data in the attribute, which is left uninitialized. */ +void * +nl_msg_put_unspec_uninit(struct ofpbuf *msg, uint16_t type, size_t size) +{ + size_t total_size = NLA_HDRLEN + size; + struct nlattr* nla = nl_msg_put_uninit(msg, total_size); + assert(NLA_ALIGN(total_size) <= UINT16_MAX); + nla->nla_len = total_size; + nla->nla_type = type; + return nla + 1; +} + +/* Appends a Netlink attribute of the given 'type' and the 'size' bytes of + * 'data' as its payload, to the tail end of 'msg', reallocating and copying + * its data if necessary. Returns a pointer to the first byte of data in the + * attribute, which is left uninitialized. */ +void +nl_msg_put_unspec(struct ofpbuf *msg, uint16_t type, + const void *data, size_t size) +{ + memcpy(nl_msg_put_unspec_uninit(msg, type, size), data, size); +} + +/* Appends a Netlink attribute of the given 'type' and no payload to 'msg'. + * (Some Netlink protocols use the presence or absence of an attribute as a + * Boolean flag.) */ +void +nl_msg_put_flag(struct ofpbuf *msg, uint16_t type) +{ + nl_msg_put_unspec(msg, type, NULL, 0); +} + +/* Appends a Netlink attribute of the given 'type' and the given 8-bit 'value' + * to 'msg'. */ +void +nl_msg_put_u8(struct ofpbuf *msg, uint16_t type, uint8_t value) +{ + nl_msg_put_unspec(msg, type, &value, sizeof value); +} + +/* Appends a Netlink attribute of the given 'type' and the given 16-bit 'value' + * to 'msg'. */ +void +nl_msg_put_u16(struct ofpbuf *msg, uint16_t type, uint16_t value) +{ + nl_msg_put_unspec(msg, type, &value, sizeof value); +} + +/* Appends a Netlink attribute of the given 'type' and the given 32-bit 'value' + * to 'msg'. */ +void +nl_msg_put_u32(struct ofpbuf *msg, uint16_t type, uint32_t value) +{ + nl_msg_put_unspec(msg, type, &value, sizeof value); +} + +/* Appends a Netlink attribute of the given 'type' and the given 64-bit 'value' + * to 'msg'. */ +void +nl_msg_put_u64(struct ofpbuf *msg, uint16_t type, uint64_t value) +{ + nl_msg_put_unspec(msg, type, &value, sizeof value); +} + +/* Appends a Netlink attribute of the given 'type' and the given + * null-terminated string 'value' to 'msg'. */ +void +nl_msg_put_string(struct ofpbuf *msg, uint16_t type, const char *value) +{ + nl_msg_put_unspec(msg, type, value, strlen(value) + 1); +} + +/* Appends a Netlink attribute of the given 'type' and the given buffered + * netlink message in 'nested_msg' to 'msg'. The nlmsg_len field in + * 'nested_msg' is finalized to match 'nested_msg->size'. */ +void +nl_msg_put_nested(struct ofpbuf *msg, + uint16_t type, struct ofpbuf *nested_msg) +{ + nl_msg_nlmsghdr(nested_msg)->nlmsg_len = nested_msg->size; + nl_msg_put_unspec(msg, type, nested_msg->data, nested_msg->size); +} + +/* Returns the first byte in the payload of attribute 'nla'. */ +const void * +nl_attr_get(const struct nlattr *nla) +{ + assert(nla->nla_len >= NLA_HDRLEN); + return nla + 1; +} + +/* Returns the number of bytes in the payload of attribute 'nla'. */ +size_t +nl_attr_get_size(const struct nlattr *nla) +{ + assert(nla->nla_len >= NLA_HDRLEN); + return nla->nla_len - NLA_HDRLEN; +} + +/* Asserts that 'nla''s payload is at least 'size' bytes long, and returns the + * first byte of the payload. */ +const void * +nl_attr_get_unspec(const struct nlattr *nla, size_t size) +{ + assert(nla->nla_len >= NLA_HDRLEN + size); + return nla + 1; +} + +/* Returns true if 'nla' is nonnull. (Some Netlink protocols use the presence + * or absence of an attribute as a Boolean flag.) */ +bool +nl_attr_get_flag(const struct nlattr *nla) +{ + return nla != NULL; +} + +#define NL_ATTR_GET_AS(NLA, TYPE) \ + (*(TYPE*) nl_attr_get_unspec(nla, sizeof(TYPE))) + +/* Returns the 8-bit value in 'nla''s payload. + * + * Asserts that 'nla''s payload is at least 1 byte long. */ +uint8_t +nl_attr_get_u8(const struct nlattr *nla) +{ + return NL_ATTR_GET_AS(nla, uint8_t); +} + +/* Returns the 16-bit value in 'nla''s payload. + * + * Asserts that 'nla''s payload is at least 2 bytes long. */ +uint16_t +nl_attr_get_u16(const struct nlattr *nla) +{ + return NL_ATTR_GET_AS(nla, uint16_t); +} + +/* Returns the 32-bit value in 'nla''s payload. + * + * Asserts that 'nla''s payload is at least 4 bytes long. */ +uint32_t +nl_attr_get_u32(const struct nlattr *nla) +{ + return NL_ATTR_GET_AS(nla, uint32_t); +} + +/* Returns the 64-bit value in 'nla''s payload. + * + * Asserts that 'nla''s payload is at least 8 bytes long. */ +uint64_t +nl_attr_get_u64(const struct nlattr *nla) +{ + return NL_ATTR_GET_AS(nla, uint64_t); +} + +/* Returns the null-terminated string value in 'nla''s payload. + * + * Asserts that 'nla''s payload contains a null-terminated string. */ +const char * +nl_attr_get_string(const struct nlattr *nla) +{ + assert(nla->nla_len > NLA_HDRLEN); + assert(memchr(nl_attr_get(nla), '\0', nla->nla_len - NLA_HDRLEN) != NULL); + return nl_attr_get(nla); +} + +/* Default minimum and maximum payload sizes for each type of attribute. */ +static const size_t attr_len_range[][2] = { + [0 ... N_NL_ATTR_TYPES - 1] = { 0, SIZE_MAX }, + [NL_A_U8] = { 1, 1 }, + [NL_A_U16] = { 2, 2 }, + [NL_A_U32] = { 4, 4 }, + [NL_A_U64] = { 8, 8 }, + [NL_A_STRING] = { 1, SIZE_MAX }, + [NL_A_FLAG] = { 0, SIZE_MAX }, + [NL_A_NESTED] = { NLMSG_HDRLEN, SIZE_MAX }, +}; + +/* Parses the 'msg' starting at the given 'nla_offset' as a sequence of Netlink + * attributes. 'policy[i]', for 0 <= i < n_attrs, specifies how the attribute + * with nla_type == i is parsed; a pointer to attribute i is stored in + * attrs[i]. Returns true if successful, false on failure. + * + * If the Netlink attributes in 'msg' follow a Netlink header and a Generic + * Netlink header, then 'nla_offset' should be NLMSG_HDRLEN + GENL_HDRLEN. */ +bool +nl_policy_parse(const struct ofpbuf *msg, size_t nla_offset, + const struct nl_policy policy[], + struct nlattr *attrs[], size_t n_attrs) +{ + void *p, *tail; + size_t n_required; + size_t i; + + n_required = 0; + for (i = 0; i < n_attrs; i++) { + attrs[i] = NULL; + + assert(policy[i].type < N_NL_ATTR_TYPES); + if (policy[i].type != NL_A_NO_ATTR + && policy[i].type != NL_A_FLAG + && !policy[i].optional) { + n_required++; + } + } + + p = ofpbuf_at(msg, nla_offset, 0); + if (p == NULL) { + VLOG_DBG_RL(&rl, "missing headers in nl_policy_parse"); + return false; + } + tail = ofpbuf_tail(msg); + + while (p < tail) { + size_t offset = (char*)p - (char*)msg->data; + struct nlattr *nla = p; + size_t len, aligned_len; + uint16_t type; + + /* Make sure its claimed length is plausible. */ + if (nla->nla_len < NLA_HDRLEN) { + VLOG_DBG_RL(&rl, "%zu: attr shorter than NLA_HDRLEN (%"PRIu16")", + offset, nla->nla_len); + return false; + } + len = nla->nla_len - NLA_HDRLEN; + aligned_len = NLA_ALIGN(len); + if (aligned_len > (char*)tail - (char*)p) { + VLOG_DBG_RL(&rl, "%zu: attr %"PRIu16" aligned data len (%zu) " + "> bytes left (%tu)", + offset, nla->nla_type, aligned_len, + (char*)tail - (char*)p); + return false; + } + + type = nla->nla_type; + if (type < n_attrs && policy[type].type != NL_A_NO_ATTR) { + const struct nl_policy *p = &policy[type]; + size_t min_len, max_len; + + /* Validate length and content. */ + min_len = p->min_len ? p->min_len : attr_len_range[p->type][0]; + max_len = p->max_len ? p->max_len : attr_len_range[p->type][1]; + if (len < min_len || len > max_len) { + VLOG_DBG_RL(&rl, "%zu: attr %"PRIu16" length %zu not in " + "allowed range %zu...%zu", + offset, type, len, min_len, max_len); + return false; + } + if (p->type == NL_A_STRING) { + if (((char *) nla)[nla->nla_len - 1]) { + VLOG_DBG_RL(&rl, "%zu: attr %"PRIu16" lacks null at end", + offset, type); + return false; + } + if (memchr(nla + 1, '\0', len - 1) != NULL) { + VLOG_DBG_RL(&rl, "%zu: attr %"PRIu16" has bad length", + offset, type); + return false; + } + } + if (!p->optional && attrs[type] == NULL) { + assert(n_required > 0); + --n_required; + } + attrs[type] = nla; + } else { + /* Skip attribute type that we don't care about. */ + } + p = (char*)p + NLA_ALIGN(nla->nla_len); + } + if (n_required) { + VLOG_DBG_RL(&rl, "%zu required attrs missing", n_required); + return false; + } + return true; +} + +/* Miscellaneous. */ + +static const struct nl_policy family_policy[CTRL_ATTR_MAX + 1] = { + [CTRL_ATTR_FAMILY_ID] = {.type = NL_A_U16}, +}; + +static int do_lookup_genl_family(const char *name) +{ + struct nl_sock *sock; + struct ofpbuf request, *reply; + struct nlattr *attrs[ARRAY_SIZE(family_policy)]; + int retval; + + retval = nl_sock_create(NETLINK_GENERIC, 0, 0, 0, &sock); + if (retval) { + return -retval; + } + + ofpbuf_init(&request, 0); + nl_msg_put_genlmsghdr(&request, sock, 0, GENL_ID_CTRL, NLM_F_REQUEST, + CTRL_CMD_GETFAMILY, 1); + nl_msg_put_string(&request, CTRL_ATTR_FAMILY_NAME, name); + retval = nl_sock_transact(sock, &request, &reply); + ofpbuf_uninit(&request); + if (retval) { + nl_sock_destroy(sock); + return -retval; + } + + if (!nl_policy_parse(reply, NLMSG_HDRLEN + GENL_HDRLEN, + family_policy, attrs, ARRAY_SIZE(family_policy))) { + nl_sock_destroy(sock); + ofpbuf_delete(reply); + return -EPROTO; + } + + retval = nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]); + if (retval == 0) { + retval = -EPROTO; + } + nl_sock_destroy(sock); + ofpbuf_delete(reply); + return retval; +} + +/* If '*number' is 0, translates the given Generic Netlink family 'name' to a + * number and stores it in '*number'. If successful, returns 0 and the caller + * may use '*number' as the family number. On failure, returns a positive + * errno value and '*number' caches the errno value. */ +int +nl_lookup_genl_family(const char *name, int *number) +{ + if (*number == 0) { + *number = do_lookup_genl_family(name); + assert(*number != 0); + } + return *number > 0 ? 0 : -*number; +} + +/* Netlink PID. + * + * Every Netlink socket must be bound to a unique 32-bit PID. By convention, + * programs that have a single Netlink socket use their Unix process ID as PID, + * and programs with multiple Netlink sockets add a unique per-socket + * identifier in the bits above the Unix process ID. + * + * The kernel has Netlink PID 0. + */ + +/* Parameters for how many bits in the PID should come from the Unix process ID + * and how many unique per-socket. */ +#define SOCKET_BITS 10 +#define MAX_SOCKETS (1u << SOCKET_BITS) + +#define PROCESS_BITS (32 - SOCKET_BITS) +#define MAX_PROCESSES (1u << PROCESS_BITS) +#define PROCESS_MASK ((uint32_t) (MAX_PROCESSES - 1)) + +/* Bit vector of unused socket identifiers. */ +static uint32_t avail_sockets[ROUND_UP(MAX_SOCKETS, 32)]; + +/* Allocates and returns a new Netlink PID. */ +static int +alloc_pid(uint32_t *pid) +{ + int i; + + for (i = 0; i < MAX_SOCKETS; i++) { + if ((avail_sockets[i / 32] & (1u << (i % 32))) == 0) { + avail_sockets[i / 32] |= 1u << (i % 32); + *pid = (getpid() & PROCESS_MASK) | (i << PROCESS_BITS); + return 0; + } + } + VLOG_ERR("netlink pid space exhausted"); + return ENOBUFS; +} + +/* Makes the specified 'pid' available for reuse. */ +static void +free_pid(uint32_t pid) +{ + int sock = pid >> PROCESS_BITS; + assert(avail_sockets[sock / 32] & (1u << (sock % 32))); + avail_sockets[sock / 32] &= ~(1u << (sock % 32)); +} + +static void +nlmsghdr_to_string(const struct nlmsghdr *h, struct ds *ds) +{ + struct nlmsg_flag { + unsigned int bits; + const char *name; + }; + static const struct nlmsg_flag flags[] = { + { NLM_F_REQUEST, "REQUEST" }, + { NLM_F_MULTI, "MULTI" }, + { NLM_F_ACK, "ACK" }, + { NLM_F_ECHO, "ECHO" }, + { NLM_F_DUMP, "DUMP" }, + { NLM_F_ROOT, "ROOT" }, + { NLM_F_MATCH, "MATCH" }, + { NLM_F_ATOMIC, "ATOMIC" }, + }; + const struct nlmsg_flag *flag; + uint16_t flags_left; + + ds_put_format(ds, "nl(len:%"PRIu32", type=%"PRIu16, + h->nlmsg_len, h->nlmsg_type); + if (h->nlmsg_type == NLMSG_NOOP) { + ds_put_cstr(ds, "(no-op)"); + } else if (h->nlmsg_type == NLMSG_ERROR) { + ds_put_cstr(ds, "(error)"); + } else if (h->nlmsg_type == NLMSG_DONE) { + ds_put_cstr(ds, "(done)"); + } else if (h->nlmsg_type == NLMSG_OVERRUN) { + ds_put_cstr(ds, "(overrun)"); + } else if (h->nlmsg_type < NLMSG_MIN_TYPE) { + ds_put_cstr(ds, "(reserved)"); + } else { + ds_put_cstr(ds, "(family-defined)"); + } + ds_put_format(ds, ", flags=%"PRIx16, h->nlmsg_flags); + flags_left = h->nlmsg_flags; + for (flag = flags; flag < &flags[ARRAY_SIZE(flags)]; flag++) { + if ((flags_left & flag->bits) == flag->bits) { + ds_put_format(ds, "[%s]", flag->name); + flags_left &= ~flag->bits; + } + } + if (flags_left) { + ds_put_format(ds, "[OTHER:%"PRIx16"]", flags_left); + } + ds_put_format(ds, ", seq=%"PRIx32", pid=%"PRIu32"(%d:%d))", + h->nlmsg_seq, h->nlmsg_pid, + (int) (h->nlmsg_pid & PROCESS_MASK), + (int) (h->nlmsg_pid >> PROCESS_BITS)); +} + +static char * +nlmsg_to_string(const struct ofpbuf *buffer) +{ + struct ds ds = DS_EMPTY_INITIALIZER; + const struct nlmsghdr *h = ofpbuf_at(buffer, 0, NLMSG_HDRLEN); + if (h) { + nlmsghdr_to_string(h, &ds); + if (h->nlmsg_type == NLMSG_ERROR) { + const struct nlmsgerr *e; + e = ofpbuf_at(buffer, NLMSG_HDRLEN, + NLMSG_ALIGN(sizeof(struct nlmsgerr))); + if (e) { + ds_put_format(&ds, " error(%d", e->error); + if (e->error < 0) { + ds_put_format(&ds, "(%s)", strerror(-e->error)); + } + ds_put_cstr(&ds, ", in-reply-to("); + nlmsghdr_to_string(&e->msg, &ds); + ds_put_cstr(&ds, "))"); + } else { + ds_put_cstr(&ds, " error(truncated)"); + } + } else if (h->nlmsg_type == NLMSG_DONE) { + int *error = ofpbuf_at(buffer, NLMSG_HDRLEN, sizeof *error); + if (error) { + ds_put_format(&ds, " done(%d", *error); + if (*error < 0) { + ds_put_format(&ds, "(%s)", strerror(-*error)); + } + ds_put_cstr(&ds, ")"); + } else { + ds_put_cstr(&ds, " done(truncated)"); + } + } + } else { + ds_put_cstr(&ds, "nl(truncated)"); + } + return ds.string; +} + +static void +log_nlmsg(const char *function, int error, + const void *message, size_t size) +{ + struct ofpbuf buffer; + char *nlmsg; + + if (!VLOG_IS_DBG_ENABLED()) { + return; + } + + buffer.data = (void *) message; + buffer.size = size; + nlmsg = nlmsg_to_string(&buffer); + VLOG_DBG_RL(&rl, "%s (%s): %s", function, strerror(error), nlmsg); + free(nlmsg); +} + diff --git a/lib/netlink.h b/lib/netlink.h new file mode 100644 index 00000000..64452e9f --- /dev/null +++ b/lib/netlink.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef NETLINK_H +#define NETLINK_H 1 + +/* Netlink interface. + * + * Netlink is a datagram-based network protocol primarily for communication + * between user processes and the kernel, and mainly on Linux. Netlink is + * specified in RFC 3549, "Linux Netlink as an IP Services Protocol". + * + * Netlink is not suitable for use in physical networks of heterogeneous + * machines because host byte order is used throughout. */ + +#include +#include +#include + +struct ofpbuf; +struct nl_sock; +struct nlattr; + +/* Netlink sockets. */ + +int nl_sock_create(int protocol, int multicast_group, + size_t so_sndbuf, size_t so_rcvbuf, + struct nl_sock **); +void nl_sock_destroy(struct nl_sock *); + +int nl_sock_send(struct nl_sock *, const struct ofpbuf *, bool wait); +int nl_sock_sendv(struct nl_sock *sock, const struct iovec iov[], size_t n_iov, + bool wait); +int nl_sock_recv(struct nl_sock *, struct ofpbuf **, bool wait); +int nl_sock_transact(struct nl_sock *, const struct ofpbuf *request, + struct ofpbuf **reply); + +void nl_sock_wait(const struct nl_sock *, short int events); + +/* Netlink messages. */ + +/* Accessing headers and data. */ +struct nlmsghdr *nl_msg_nlmsghdr(const struct ofpbuf *); +struct genlmsghdr *nl_msg_genlmsghdr(const struct ofpbuf *); +bool nl_msg_nlmsgerr(const struct ofpbuf *, int *error); +void nl_msg_reserve(struct ofpbuf *, size_t); + +/* Appending headers and raw data. */ +void nl_msg_put_nlmsghdr(struct ofpbuf *, struct nl_sock *, + size_t expected_payload, + uint32_t type, uint32_t flags); +void nl_msg_put_genlmsghdr(struct ofpbuf *, struct nl_sock *, + size_t expected_payload, int family, uint32_t flags, + uint8_t cmd, uint8_t version); +void nl_msg_put(struct ofpbuf *, const void *, size_t); +void *nl_msg_put_uninit(struct ofpbuf *, size_t); + +/* Appending attributes. */ +void *nl_msg_put_unspec_uninit(struct ofpbuf *, uint16_t type, size_t); +void nl_msg_put_unspec(struct ofpbuf *, uint16_t type, const void *, size_t); +void nl_msg_put_flag(struct ofpbuf *, uint16_t type); +void nl_msg_put_u8(struct ofpbuf *, uint16_t type, uint8_t value); +void nl_msg_put_u16(struct ofpbuf *, uint16_t type, uint16_t value); +void nl_msg_put_u32(struct ofpbuf *, uint16_t type, uint32_t value); +void nl_msg_put_u64(struct ofpbuf *, uint16_t type, uint64_t value); +void nl_msg_put_string(struct ofpbuf *, uint16_t type, const char *value); +void nl_msg_put_nested(struct ofpbuf *, uint16_t type, struct ofpbuf *); + +/* Netlink attribute types. */ +enum nl_attr_type +{ + NL_A_NO_ATTR = 0, + NL_A_UNSPEC, + NL_A_U8, + NL_A_U16, + NL_A_U32, + NL_A_U64, + NL_A_STRING, + NL_A_FLAG, + NL_A_NESTED, + N_NL_ATTR_TYPES +}; + +/* Netlink attribute parsing. */ +const void *nl_attr_get(const struct nlattr *); +size_t nl_attr_get_size(const struct nlattr *); +const void *nl_attr_get_unspec(const struct nlattr *, size_t size); +bool nl_attr_get_flag(const struct nlattr *); +uint8_t nl_attr_get_u8(const struct nlattr *); +uint16_t nl_attr_get_u16(const struct nlattr *); +uint32_t nl_attr_get_u32(const struct nlattr *); +uint64_t nl_attr_get_u64(const struct nlattr *); +const char *nl_attr_get_string(const struct nlattr *); + +/* Netlink attribute policy. + * + * Specifies how to parse a single attribute from a Netlink message payload. + */ +struct nl_policy +{ + enum nl_attr_type type; + size_t min_len, max_len; + bool optional; +}; + +bool nl_policy_parse(const struct ofpbuf *, size_t offset, + const struct nl_policy[], + struct nlattr *[], size_t n_attrs); + +/* Miscellaneous. */ + +int nl_lookup_genl_family(const char *name, int *number); + +#endif /* netlink.h */ diff --git a/lib/odp-util.c b/lib/odp-util.c new file mode 100644 index 00000000..d32697b7 --- /dev/null +++ b/lib/odp-util.c @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "odp-util.h" +#include +#include +#include +#include "coverage.h" +#include "dynamic-string.h" +#include "flow.h" +#include "packets.h" +#include "timeval.h" +#include "util.h" + +union odp_action * +odp_actions_add(struct odp_actions *actions, uint16_t type) +{ + union odp_action *a; + if (actions->n_actions < MAX_ODP_ACTIONS) { + a = &actions->actions[actions->n_actions++]; + } else { + COVERAGE_INC(odp_overflow); + actions->n_actions = MAX_ODP_ACTIONS + 1; + a = &actions->actions[MAX_ODP_ACTIONS - 1]; + } + memset(a, 0, sizeof *a); + a->type = type; + return a; +} + +void +format_odp_action(struct ds *ds, const union odp_action *a) +{ + switch (a->type) { + case ODPAT_OUTPUT: + ds_put_format(ds, "%"PRIu16, a->output.port); + break; + case ODPAT_OUTPUT_GROUP: + ds_put_format(ds, "g%"PRIu16, a->output_group.group); + break; + case ODPAT_CONTROLLER: + ds_put_format(ds, "ctl(%"PRIu32")", a->controller.arg); + break; + case ODPAT_SET_VLAN_VID: + ds_put_format(ds, "set_vlan(%"PRIu16")", ntohs(a->vlan_vid.vlan_vid)); + break; + case ODPAT_SET_VLAN_PCP: + ds_put_format(ds, "set_vlan_pcp(%"PRIu8")", a->vlan_pcp.vlan_pcp); + break; + case ODPAT_STRIP_VLAN: + ds_put_format(ds, "strip_vlan"); + break; + case ODPAT_SET_DL_SRC: + ds_put_format(ds, "set_dl_src("ETH_ADDR_FMT")", + ETH_ADDR_ARGS(a->dl_addr.dl_addr)); + break; + case ODPAT_SET_DL_DST: + ds_put_format(ds, "set_dl_dst("ETH_ADDR_FMT")", + ETH_ADDR_ARGS(a->dl_addr.dl_addr)); + break; + case ODPAT_SET_NW_SRC: + ds_put_format(ds, "set_nw_src("IP_FMT")", + IP_ARGS(&a->nw_addr.nw_addr)); + break; + case ODPAT_SET_NW_DST: + ds_put_format(ds, "set_nw_dst("IP_FMT")", + IP_ARGS(&a->nw_addr.nw_addr)); + break; + case ODPAT_SET_TP_SRC: + ds_put_format(ds, "set_tp_src(%"PRIu16")", ntohs(a->tp_port.tp_port)); + break; + case ODPAT_SET_TP_DST: + ds_put_format(ds, "set_tp_dst(%"PRIu16")", ntohs(a->tp_port.tp_port)); + break; + default: + ds_put_format(ds, "***bad action %"PRIu16"***", a->type); + break; + } +} + +void +format_odp_actions(struct ds *ds, const union odp_action *actions, + size_t n_actions) +{ + size_t i; + for (i = 0; i < n_actions; i++) { + if (i) { + ds_put_char(ds, ','); + } + format_odp_action(ds, &actions[i]); + } + if (!n_actions) { + ds_put_cstr(ds, "drop"); + } +} + +void +format_odp_flow_stats(struct ds *ds, const struct odp_flow_stats *s) +{ + ds_put_format(ds, "packets:%"PRIu64", bytes:%"PRIu64", used:", + s->n_packets, s->n_bytes); + if (s->used_sec) { + long long int used = s->used_sec * 1000 + s->used_nsec / 1000000; + ds_put_format(ds, "%.3fs", (time_msec() - used) / 1000.0); + } else { + ds_put_format(ds, "never"); + } +} + +void +format_odp_flow(struct ds *ds, const struct odp_flow *f) +{ + flow_format(ds, &f->key); + ds_put_cstr(ds, ", "); + format_odp_flow_stats(ds, &f->stats); + ds_put_cstr(ds, ", actions:"); + format_odp_actions(ds, f->actions, f->n_actions); +} + diff --git a/lib/odp-util.h b/lib/odp-util.h new file mode 100644 index 00000000..77d7f9a5 --- /dev/null +++ b/lib/odp-util.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef ODP_UTIL_H +#define ODP_UTIL_H 1 + +#include +#include +#include "openflow/openflow.h" +#include "openvswitch/datapath-protocol.h" + +struct ds; + +/* The kernel datapaths limits actions to those that fit in a single page of + * memory, so there is no point in allocating more than that. */ +enum { MAX_ODP_ACTIONS = 4096 / sizeof(union odp_action) }; + +struct odp_actions { + size_t n_actions; + union odp_action actions[MAX_ODP_ACTIONS]; +}; + +static inline void +odp_actions_init(struct odp_actions *actions) +{ + actions->n_actions = 0; +} + +union odp_action *odp_actions_add(struct odp_actions *actions, uint16_t type); + +static inline bool +odp_actions_overflow(const struct odp_actions *actions) +{ + return actions->n_actions > MAX_ODP_ACTIONS; +} + +static inline uint16_t +ofp_port_to_odp_port(uint16_t ofp_port) +{ + switch (ofp_port) { + case OFPP_LOCAL: + return ODPP_LOCAL; + case OFPP_NONE: + return ODPP_NONE; + default: + return ofp_port; + } +} + +static inline uint16_t +odp_port_to_ofp_port(uint16_t odp_port) +{ + switch (odp_port) { + case ODPP_LOCAL: + return OFPP_LOCAL; + case ODPP_NONE: + return OFPP_NONE; + default: + return odp_port; + } +} + +void format_odp_action(struct ds *, const union odp_action *); +void format_odp_actions(struct ds *, const union odp_action *actions, + size_t n_actions); +void format_odp_flow_stats(struct ds *, const struct odp_flow_stats *); +void format_odp_flow(struct ds *, const struct odp_flow *); + +#endif /* odp-util.h */ diff --git a/lib/ofp-print.c b/lib/ofp-print.c new file mode 100644 index 00000000..0c7980e8 --- /dev/null +++ b/lib/ofp-print.c @@ -0,0 +1,1473 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "ofp-print.h" +#include "xtoxll.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "compiler.h" +#include "dynamic-string.h" +#include "flow.h" +#include "ofpbuf.h" +#include "openflow/openflow.h" +#include "openflow/nicira-ext.h" +#include "packets.h" +#include "pcap.h" +#include "util.h" + +static void ofp_print_port_name(struct ds *string, uint16_t port); +static void ofp_print_match(struct ds *, const struct ofp_match *, + int verbosity); + +/* Returns a string that represents the contents of the Ethernet frame in the + * 'len' bytes starting at 'data' to 'stream' as output by tcpdump. + * 'total_len' specifies the full length of the Ethernet frame (of which 'len' + * bytes were captured). + * + * The caller must free the returned string. + * + * This starts and kills a tcpdump subprocess so it's quite expensive. */ +char * +ofp_packet_to_string(const void *data, size_t len, size_t total_len UNUSED) +{ + struct ds ds = DS_EMPTY_INITIALIZER; + struct ofpbuf buf; + + char command[128]; + FILE *pcap; + FILE *tcpdump; + int status; + int c; + + buf.data = (void *) data; + buf.size = len; + + pcap = tmpfile(); + if (!pcap) { + ovs_error(errno, "tmpfile"); + return xstrdup(""); + } + pcap_write_header(pcap); + pcap_write(pcap, &buf); + fflush(pcap); + if (ferror(pcap)) { + ovs_error(errno, "error writing temporary file"); + } + rewind(pcap); + + snprintf(command, sizeof command, "/usr/sbin/tcpdump -e -n -r /dev/fd/%d 2>/dev/null", + fileno(pcap)); + tcpdump = popen(command, "r"); + fclose(pcap); + if (!tcpdump) { + ovs_error(errno, "exec(\"%s\")", command); + return xstrdup(""); + } + + while ((c = getc(tcpdump)) != EOF) { + ds_put_char(&ds, c); + } + + status = pclose(tcpdump); + if (WIFEXITED(status)) { + if (WEXITSTATUS(status)) + ovs_error(0, "tcpdump exited with status %d", WEXITSTATUS(status)); + } else if (WIFSIGNALED(status)) { + ovs_error(0, "tcpdump exited with signal %d", WTERMSIG(status)); + } + return ds_cstr(&ds); +} + +/* Pretty-print the OFPT_PACKET_IN packet of 'len' bytes at 'oh' to 'stream' + * at the given 'verbosity' level. */ +static void +ofp_packet_in(struct ds *string, const void *oh, size_t len, int verbosity) +{ + const struct ofp_packet_in *op = oh; + size_t data_len; + + ds_put_format(string, " total_len=%"PRIu16" in_port=", + ntohs(op->total_len)); + ofp_print_port_name(string, ntohs(op->in_port)); + + if (op->reason == OFPR_ACTION) + ds_put_cstr(string, " (via action)"); + else if (op->reason != OFPR_NO_MATCH) + ds_put_format(string, " (***reason %"PRIu8"***)", op->reason); + + data_len = len - offsetof(struct ofp_packet_in, data); + ds_put_format(string, " data_len=%zu", data_len); + if (htonl(op->buffer_id) == UINT32_MAX) { + ds_put_format(string, " (unbuffered)"); + if (ntohs(op->total_len) != data_len) + ds_put_format(string, " (***total_len != data_len***)"); + } else { + ds_put_format(string, " buffer=0x%08"PRIx32, ntohl(op->buffer_id)); + if (ntohs(op->total_len) < data_len) + ds_put_format(string, " (***total_len < data_len***)"); + } + ds_put_char(string, '\n'); + + if (verbosity > 0) { + flow_t flow; + struct ofpbuf packet; + struct ofp_match match; + packet.data = (void *) op->data; + packet.size = data_len; + flow_extract(&packet, ntohs(op->in_port), &flow); + flow_to_match(&flow, 0, &match); + ofp_print_match(string, &match, verbosity); + ds_put_char(string, '\n'); + } + if (verbosity > 1) { + char *packet = ofp_packet_to_string(op->data, data_len, + ntohs(op->total_len)); + ds_put_cstr(string, packet); + free(packet); + } +} + +static void ofp_print_port_name(struct ds *string, uint16_t port) +{ + const char *name; + switch (port) { + case OFPP_IN_PORT: + name = "IN_PORT"; + break; + case OFPP_TABLE: + name = "TABLE"; + break; + case OFPP_NORMAL: + name = "NORMAL"; + break; + case OFPP_FLOOD: + name = "FLOOD"; + break; + case OFPP_ALL: + name = "ALL"; + break; + case OFPP_CONTROLLER: + name = "CONTROLLER"; + break; + case OFPP_LOCAL: + name = "LOCAL"; + break; + case OFPP_NONE: + name = "NONE"; + break; + default: + ds_put_format(string, "%"PRIu16, port); + return; + } + ds_put_cstr(string, name); +} + +static void +ofp_print_nx_action(struct ds *string, const struct nx_action_header *nah) +{ + switch (ntohs(nah->subtype)) { + case NXAST_RESUBMIT: { + const struct nx_action_resubmit *nar = (struct nx_action_resubmit *)nah; + ds_put_format(string, "resubmit:"); + ofp_print_port_name(string, ntohs(nar->in_port)); + break; + } + + default: + ds_put_format(string, "***unknown Nicira action:%d***\n", + ntohs(nah->subtype)); + } +} + +static int +ofp_print_action(struct ds *string, const struct ofp_action_header *ah, + size_t actions_len) +{ + uint16_t type; + size_t len; + + struct openflow_action { + size_t min_size; + size_t max_size; + }; + + const struct openflow_action of_actions[] = { + [OFPAT_OUTPUT] = { + sizeof(struct ofp_action_output), + sizeof(struct ofp_action_output), + }, + [OFPAT_SET_VLAN_VID] = { + sizeof(struct ofp_action_vlan_vid), + sizeof(struct ofp_action_vlan_vid), + }, + [OFPAT_SET_VLAN_PCP] = { + sizeof(struct ofp_action_vlan_pcp), + sizeof(struct ofp_action_vlan_pcp), + }, + [OFPAT_STRIP_VLAN] = { + sizeof(struct ofp_action_header), + sizeof(struct ofp_action_header), + }, + [OFPAT_SET_DL_SRC] = { + sizeof(struct ofp_action_dl_addr), + sizeof(struct ofp_action_dl_addr), + }, + [OFPAT_SET_DL_DST] = { + sizeof(struct ofp_action_dl_addr), + sizeof(struct ofp_action_dl_addr), + }, + [OFPAT_SET_NW_SRC] = { + sizeof(struct ofp_action_nw_addr), + sizeof(struct ofp_action_nw_addr), + }, + [OFPAT_SET_NW_DST] = { + sizeof(struct ofp_action_nw_addr), + sizeof(struct ofp_action_nw_addr), + }, + [OFPAT_SET_TP_SRC] = { + sizeof(struct ofp_action_tp_port), + sizeof(struct ofp_action_tp_port), + }, + [OFPAT_SET_TP_DST] = { + sizeof(struct ofp_action_tp_port), + sizeof(struct ofp_action_tp_port), + } + /* OFPAT_VENDOR is not here, since it would blow up the array size. */ + }; + + if (actions_len < sizeof *ah) { + ds_put_format(string, "***action array too short for next action***\n"); + return -1; + } + + type = ntohs(ah->type); + len = ntohs(ah->len); + if (actions_len < len) { + ds_put_format(string, "***truncated action %"PRIu16"***\n", type); + return -1; + } + + if ((len % 8) != 0) { + ds_put_format(string, + "***action %"PRIu16" length not a multiple of 8***\n", + type); + return -1; + } + + if (type < ARRAY_SIZE(of_actions)) { + const struct openflow_action *act = &of_actions[type]; + if ((len < act->min_size) || (len > act->max_size)) { + ds_put_format(string, + "***action %"PRIu16" wrong length: %zu***\n", type, len); + return -1; + } + } + + switch (type) { + case OFPAT_OUTPUT: { + struct ofp_action_output *oa = (struct ofp_action_output *)ah; + uint16_t port = ntohs(oa->port); + if (port < OFPP_MAX) { + ds_put_format(string, "output:%"PRIu16, port); + } else { + ofp_print_port_name(string, port); + if (port == OFPP_CONTROLLER) { + if (oa->max_len) { + ds_put_format(string, ":%"PRIu16, ntohs(oa->max_len)); + } else { + ds_put_cstr(string, ":all"); + } + } + } + break; + } + + case OFPAT_SET_VLAN_VID: { + struct ofp_action_vlan_vid *va = (struct ofp_action_vlan_vid *)ah; + ds_put_format(string, "mod_vlan_vid:%"PRIu16, ntohs(va->vlan_vid)); + break; + } + + case OFPAT_SET_VLAN_PCP: { + struct ofp_action_vlan_pcp *va = (struct ofp_action_vlan_pcp *)ah; + ds_put_format(string, "mod_vlan_pcp:%"PRIu8, va->vlan_pcp); + break; + } + + case OFPAT_STRIP_VLAN: + ds_put_cstr(string, "strip_vlan"); + break; + + case OFPAT_SET_DL_SRC: { + struct ofp_action_dl_addr *da = (struct ofp_action_dl_addr *)ah; + ds_put_format(string, "mod_dl_src:"ETH_ADDR_FMT, + ETH_ADDR_ARGS(da->dl_addr)); + break; + } + + case OFPAT_SET_DL_DST: { + struct ofp_action_dl_addr *da = (struct ofp_action_dl_addr *)ah; + ds_put_format(string, "mod_dl_dst:"ETH_ADDR_FMT, + ETH_ADDR_ARGS(da->dl_addr)); + break; + } + + case OFPAT_SET_NW_SRC: { + struct ofp_action_nw_addr *na = (struct ofp_action_nw_addr *)ah; + ds_put_format(string, "mod_nw_src:"IP_FMT, IP_ARGS(&na->nw_addr)); + break; + } + + case OFPAT_SET_NW_DST: { + struct ofp_action_nw_addr *na = (struct ofp_action_nw_addr *)ah; + ds_put_format(string, "mod_nw_dst:"IP_FMT, IP_ARGS(&na->nw_addr)); + break; + } + + case OFPAT_SET_TP_SRC: { + struct ofp_action_tp_port *ta = (struct ofp_action_tp_port *)ah; + ds_put_format(string, "mod_tp_src:%d", ntohs(ta->tp_port)); + break; + } + + case OFPAT_SET_TP_DST: { + struct ofp_action_tp_port *ta = (struct ofp_action_tp_port *)ah; + ds_put_format(string, "mod_tp_dst:%d", ntohs(ta->tp_port)); + break; + } + + case OFPAT_VENDOR: { + struct ofp_action_vendor_header *avh + = (struct ofp_action_vendor_header *)ah; + if (len < sizeof *avh) { + ds_put_format(string, "***ofpat_vendor truncated***\n"); + return -1; + } + if (avh->vendor == htonl(NX_VENDOR_ID)) { + ofp_print_nx_action(string, (struct nx_action_header *)avh); + } else { + ds_put_format(string, "vendor action:0x%x", ntohl(avh->vendor)); + } + break; + } + + default: + ds_put_format(string, "(decoder %"PRIu16" not implemented)", type); + break; + } + + return len; +} + +static void +ofp_print_actions(struct ds *string, const struct ofp_action_header *action, + size_t actions_len) +{ + uint8_t *p = (uint8_t *)action; + int len = 0; + + ds_put_cstr(string, "actions="); + if (!actions_len) { + ds_put_cstr(string, "drop"); + } + while (actions_len > 0) { + if (len) { + ds_put_cstr(string, ","); + } + len = ofp_print_action(string, (struct ofp_action_header *)p, + actions_len); + if (len < 0) { + return; + } + p += len; + actions_len -= len; + } +} + +/* Pretty-print the OFPT_PACKET_OUT packet of 'len' bytes at 'oh' to 'string' + * at the given 'verbosity' level. */ +static void ofp_packet_out(struct ds *string, const void *oh, size_t len, + int verbosity) +{ + const struct ofp_packet_out *opo = oh; + size_t actions_len = ntohs(opo->actions_len); + + ds_put_cstr(string, " in_port="); + ofp_print_port_name(string, ntohs(opo->in_port)); + + ds_put_format(string, " actions_len=%zu ", actions_len); + if (actions_len > (ntohs(opo->header.length) - sizeof *opo)) { + ds_put_format(string, "***packet too short for action length***\n"); + return; + } + ofp_print_actions(string, opo->actions, actions_len); + + if (ntohl(opo->buffer_id) == UINT32_MAX) { + int data_len = len - sizeof *opo - actions_len; + ds_put_format(string, " data_len=%d", data_len); + if (verbosity > 0 && len > sizeof *opo) { + char *packet = ofp_packet_to_string( + (uint8_t *)opo->actions + actions_len, data_len, data_len); + ds_put_char(string, '\n'); + ds_put_cstr(string, packet); + free(packet); + } + } else { + ds_put_format(string, " buffer=0x%08"PRIx32, ntohl(opo->buffer_id)); + } + ds_put_char(string, '\n'); +} + +/* qsort comparison function. */ +static int +compare_ports(const void *a_, const void *b_) +{ + const struct ofp_phy_port *a = a_; + const struct ofp_phy_port *b = b_; + uint16_t ap = ntohs(a->port_no); + uint16_t bp = ntohs(b->port_no); + + return ap < bp ? -1 : ap > bp; +} + +static void ofp_print_port_features(struct ds *string, uint32_t features) +{ + if (features == 0) { + ds_put_cstr(string, "Unsupported\n"); + return; + } + if (features & OFPPF_10MB_HD) { + ds_put_cstr(string, "10MB-HD "); + } + if (features & OFPPF_10MB_FD) { + ds_put_cstr(string, "10MB-FD "); + } + if (features & OFPPF_100MB_HD) { + ds_put_cstr(string, "100MB-HD "); + } + if (features & OFPPF_100MB_FD) { + ds_put_cstr(string, "100MB-FD "); + } + if (features & OFPPF_1GB_HD) { + ds_put_cstr(string, "1GB-HD "); + } + if (features & OFPPF_1GB_FD) { + ds_put_cstr(string, "1GB-FD "); + } + if (features & OFPPF_10GB_FD) { + ds_put_cstr(string, "10GB-FD "); + } + if (features & OFPPF_COPPER) { + ds_put_cstr(string, "COPPER "); + } + if (features & OFPPF_FIBER) { + ds_put_cstr(string, "FIBER "); + } + if (features & OFPPF_AUTONEG) { + ds_put_cstr(string, "AUTO_NEG "); + } + if (features & OFPPF_PAUSE) { + ds_put_cstr(string, "AUTO_PAUSE "); + } + if (features & OFPPF_PAUSE_ASYM) { + ds_put_cstr(string, "AUTO_PAUSE_ASYM "); + } + ds_put_char(string, '\n'); +} + +static void +ofp_print_phy_port(struct ds *string, const struct ofp_phy_port *port) +{ + uint8_t name[OFP_MAX_PORT_NAME_LEN]; + int j; + + memcpy(name, port->name, sizeof name); + for (j = 0; j < sizeof name - 1; j++) { + if (!isprint(name[j])) { + break; + } + } + name[j] = '\0'; + + ds_put_char(string, ' '); + ofp_print_port_name(string, ntohs(port->port_no)); + ds_put_format(string, "(%s): addr:"ETH_ADDR_FMT", config: %#x, state:%#x\n", + name, ETH_ADDR_ARGS(port->hw_addr), ntohl(port->config), + ntohl(port->state)); + if (port->curr) { + ds_put_format(string, " current: "); + ofp_print_port_features(string, ntohl(port->curr)); + } + if (port->advertised) { + ds_put_format(string, " advertised: "); + ofp_print_port_features(string, ntohl(port->advertised)); + } + if (port->supported) { + ds_put_format(string, " supported: "); + ofp_print_port_features(string, ntohl(port->supported)); + } + if (port->peer) { + ds_put_format(string, " peer: "); + ofp_print_port_features(string, ntohl(port->peer)); + } +} + +/* Pretty-print the struct ofp_switch_features of 'len' bytes at 'oh' to + * 'string' at the given 'verbosity' level. */ +static void +ofp_print_switch_features(struct ds *string, const void *oh, size_t len, + int verbosity UNUSED) +{ + const struct ofp_switch_features *osf = oh; + struct ofp_phy_port *port_list; + int n_ports; + int i; + + ds_put_format(string, " ver:0x%x, dpid:%"PRIx64"\n", + osf->header.version, ntohll(osf->datapath_id)); + ds_put_format(string, "n_tables:%d, n_buffers:%d\n", osf->n_tables, + ntohl(osf->n_buffers)); + ds_put_format(string, "features: capabilities:%#x, actions:%#x\n", + ntohl(osf->capabilities), ntohl(osf->actions)); + + if (ntohs(osf->header.length) >= sizeof *osf) { + len = MIN(len, ntohs(osf->header.length)); + } + n_ports = (len - sizeof *osf) / sizeof *osf->ports; + + port_list = xmemdup(osf->ports, len - sizeof *osf); + qsort(port_list, n_ports, sizeof *port_list, compare_ports); + for (i = 0; i < n_ports; i++) { + ofp_print_phy_port(string, &port_list[i]); + } + free(port_list); +} + +/* Pretty-print the struct ofp_switch_config of 'len' bytes at 'oh' to 'string' + * at the given 'verbosity' level. */ +static void +ofp_print_switch_config(struct ds *string, const void *oh, size_t len UNUSED, + int verbosity UNUSED) +{ + const struct ofp_switch_config *osc = oh; + uint16_t flags; + + flags = ntohs(osc->flags); + if (flags & OFPC_SEND_FLOW_EXP) { + flags &= ~OFPC_SEND_FLOW_EXP; + ds_put_format(string, " (sending flow expirations)"); + } + if (flags) { + ds_put_format(string, " ***unknown flags 0x%04"PRIx16"***", flags); + } + + ds_put_format(string, " miss_send_len=%"PRIu16"\n", ntohs(osc->miss_send_len)); +} + +static void print_wild(struct ds *string, const char *leader, int is_wild, + int verbosity, const char *format, ...) + __attribute__((format(printf, 5, 6))); + +static void print_wild(struct ds *string, const char *leader, int is_wild, + int verbosity, const char *format, ...) +{ + if (is_wild && verbosity < 2) { + return; + } + ds_put_cstr(string, leader); + if (!is_wild) { + va_list args; + + va_start(args, format); + ds_put_format_valist(string, format, args); + va_end(args); + } else { + ds_put_char(string, '*'); + } + ds_put_char(string, ','); +} + +static void +print_ip_netmask(struct ds *string, const char *leader, uint32_t ip, + uint32_t wild_bits, int verbosity) +{ + if (wild_bits >= 32 && verbosity < 2) { + return; + } + ds_put_cstr(string, leader); + if (wild_bits < 32) { + ds_put_format(string, IP_FMT, IP_ARGS(&ip)); + if (wild_bits) { + ds_put_format(string, "/%d", 32 - wild_bits); + } + } else { + ds_put_char(string, '*'); + } + ds_put_char(string, ','); +} + +static void +ofp_print_match(struct ds *f, const struct ofp_match *om, int verbosity) +{ + char *s = ofp_match_to_string(om, verbosity); + ds_put_cstr(f, s); + free(s); +} + +char * +ofp_match_to_string(const struct ofp_match *om, int verbosity) +{ + struct ds f = DS_EMPTY_INITIALIZER; + uint32_t w = ntohl(om->wildcards); + bool skip_type = false; + bool skip_proto = false; + + if (!(w & OFPFW_DL_TYPE)) { + skip_type = true; + if (om->dl_type == htons(ETH_TYPE_IP)) { + if (!(w & OFPFW_NW_PROTO)) { + skip_proto = true; + if (om->nw_proto == IP_TYPE_ICMP) { + ds_put_cstr(&f, "icmp,"); + } else if (om->nw_proto == IP_TYPE_TCP) { + ds_put_cstr(&f, "tcp,"); + } else if (om->nw_proto == IP_TYPE_UDP) { + ds_put_cstr(&f, "udp,"); + } else { + ds_put_cstr(&f, "ip,"); + skip_proto = false; + } + } else { + ds_put_cstr(&f, "ip,"); + } + } else if (om->dl_type == htons(ETH_TYPE_ARP)) { + ds_put_cstr(&f, "arp,"); + } else { + skip_type = false; + } + } + print_wild(&f, "in_port=", w & OFPFW_IN_PORT, verbosity, + "%d", ntohs(om->in_port)); + print_wild(&f, "dl_vlan=", w & OFPFW_DL_VLAN, verbosity, + "0x%04x", ntohs(om->dl_vlan)); + print_wild(&f, "dl_src=", w & OFPFW_DL_SRC, verbosity, + ETH_ADDR_FMT, ETH_ADDR_ARGS(om->dl_src)); + print_wild(&f, "dl_dst=", w & OFPFW_DL_DST, verbosity, + ETH_ADDR_FMT, ETH_ADDR_ARGS(om->dl_dst)); + if (!skip_type) { + print_wild(&f, "dl_type=", w & OFPFW_DL_TYPE, verbosity, + "0x%04x", ntohs(om->dl_type)); + } + print_ip_netmask(&f, "nw_src=", om->nw_src, + (w & OFPFW_NW_SRC_MASK) >> OFPFW_NW_SRC_SHIFT, verbosity); + print_ip_netmask(&f, "nw_dst=", om->nw_dst, + (w & OFPFW_NW_DST_MASK) >> OFPFW_NW_DST_SHIFT, verbosity); + if (!skip_proto) { + print_wild(&f, "nw_proto=", w & OFPFW_NW_PROTO, verbosity, + "%u", om->nw_proto); + } + if (om->nw_proto == IP_TYPE_ICMP) { + print_wild(&f, "icmp_type=", w & OFPFW_ICMP_TYPE, verbosity, + "%d", ntohs(om->icmp_type)); + print_wild(&f, "icmp_code=", w & OFPFW_ICMP_CODE, verbosity, + "%d", ntohs(om->icmp_code)); + } else { + print_wild(&f, "tp_src=", w & OFPFW_TP_SRC, verbosity, + "%d", ntohs(om->tp_src)); + print_wild(&f, "tp_dst=", w & OFPFW_TP_DST, verbosity, + "%d", ntohs(om->tp_dst)); + } + return ds_cstr(&f); +} + +/* Pretty-print the OFPT_FLOW_MOD packet of 'len' bytes at 'oh' to 'string' + * at the given 'verbosity' level. */ +static void +ofp_print_flow_mod(struct ds *string, const void *oh, size_t len, + int verbosity) +{ + const struct ofp_flow_mod *ofm = oh; + + ofp_print_match(string, &ofm->match, verbosity); + switch (ntohs(ofm->command)) { + case OFPFC_ADD: + ds_put_cstr(string, " ADD: "); + break; + case OFPFC_MODIFY: + ds_put_cstr(string, " MOD: "); + break; + case OFPFC_MODIFY_STRICT: + ds_put_cstr(string, " MOD_STRICT: "); + break; + case OFPFC_DELETE: + ds_put_cstr(string, " DEL: "); + break; + case OFPFC_DELETE_STRICT: + ds_put_cstr(string, " DEL_STRICT: "); + break; + default: + ds_put_format(string, " cmd:%d ", ntohs(ofm->command)); + } + ds_put_format(string, "idle:%d hard:%d pri:%d buf:%#x", + ntohs(ofm->idle_timeout), ntohs(ofm->hard_timeout), + ofm->match.wildcards ? ntohs(ofm->priority) : (uint16_t)-1, + ntohl(ofm->buffer_id)); + ofp_print_actions(string, ofm->actions, + len - offsetof(struct ofp_flow_mod, actions)); + ds_put_char(string, '\n'); +} + +/* Pretty-print the OFPT_FLOW_EXPIRED packet of 'len' bytes at 'oh' to 'string' + * at the given 'verbosity' level. */ +static void +ofp_print_flow_expired(struct ds *string, const void *oh, size_t len UNUSED, + int verbosity) +{ + const struct ofp_flow_expired *ofe = oh; + + ofp_print_match(string, &ofe->match, verbosity); + ds_put_cstr(string, " reason="); + switch (ofe->reason) { + case OFPER_IDLE_TIMEOUT: + ds_put_cstr(string, "idle"); + break; + case OFPER_HARD_TIMEOUT: + ds_put_cstr(string, "hard"); + break; + default: + ds_put_format(string, "**%"PRIu8"**", ofe->reason); + break; + } + ds_put_format(string, + " pri%"PRIu16" secs%"PRIu32" pkts%"PRIu64" bytes%"PRIu64"\n", + ofe->match.wildcards ? ntohs(ofe->priority) : (uint16_t)-1, + ntohl(ofe->duration), ntohll(ofe->packet_count), + ntohll(ofe->byte_count)); +} + +static void +ofp_print_port_mod(struct ds *string, const void *oh, size_t len UNUSED, + int verbosity UNUSED) +{ + const struct ofp_port_mod *opm = oh; + + ds_put_format(string, "port: %d: addr:"ETH_ADDR_FMT", config: %#x, mask:%#x\n", + ntohs(opm->port_no), ETH_ADDR_ARGS(opm->hw_addr), + ntohl(opm->config), ntohl(opm->mask)); + ds_put_format(string, " advertise: "); + if (opm->advertise) { + ofp_print_port_features(string, ntohl(opm->advertise)); + } else { + ds_put_format(string, "UNCHANGED\n"); + } +} + +struct error_type { + int type; + int code; + const char *name; +}; + +static const struct error_type error_types[] = { +#define ERROR_TYPE(TYPE) {TYPE, -1, #TYPE} +#define ERROR_CODE(TYPE, CODE) {TYPE, CODE, #CODE} + ERROR_TYPE(OFPET_HELLO_FAILED), + ERROR_CODE(OFPET_HELLO_FAILED, OFPHFC_INCOMPATIBLE), + + ERROR_TYPE(OFPET_BAD_REQUEST), + ERROR_CODE(OFPET_BAD_REQUEST, OFPBRC_BAD_VERSION), + ERROR_CODE(OFPET_BAD_REQUEST, OFPBRC_BAD_TYPE), + ERROR_CODE(OFPET_BAD_REQUEST, OFPBRC_BAD_STAT), + ERROR_CODE(OFPET_BAD_REQUEST, OFPBRC_BAD_VERSION), + + ERROR_TYPE(OFPET_BAD_ACTION), + ERROR_CODE(OFPET_BAD_ACTION, OFPBAC_BAD_TYPE), + ERROR_CODE(OFPET_BAD_ACTION, OFPBAC_BAD_LEN), + ERROR_CODE(OFPET_BAD_ACTION, OFPBAC_BAD_VENDOR), + ERROR_CODE(OFPET_BAD_ACTION, OFPBAC_BAD_VENDOR_TYPE), + ERROR_CODE(OFPET_BAD_ACTION, OFPBAC_BAD_OUT_PORT), + + ERROR_TYPE(OFPET_FLOW_MOD_FAILED), + ERROR_CODE(OFPET_FLOW_MOD_FAILED, OFPFMFC_ALL_TABLES_FULL) +}; +#define N_ERROR_TYPES ARRAY_SIZE(error_types) + +static const char * +lookup_error_type(int type) +{ + const struct error_type *t; + + for (t = error_types; t < &error_types[N_ERROR_TYPES]; t++) { + if (t->type == type && t->code == -1) { + return t->name; + } + } + return "?"; +} + +static const char * +lookup_error_code(int type, int code) +{ + const struct error_type *t; + + for (t = error_types; t < &error_types[N_ERROR_TYPES]; t++) { + if (t->type == type && t->code == code) { + return t->name; + } + } + return "?"; +} + +/* Pretty-print the OFPT_ERROR packet of 'len' bytes at 'oh' to 'string' + * at the given 'verbosity' level. */ +static void +ofp_print_error_msg(struct ds *string, const void *oh, size_t len, + int verbosity UNUSED) +{ + const struct ofp_error_msg *oem = oh; + int type = ntohs(oem->type); + int code = ntohs(oem->code); + char *s; + + ds_put_format(string, " type%d(%s) code%d(%s) payload:\n", + type, lookup_error_type(type), + code, lookup_error_code(type, code)); + + switch (type) { + case OFPET_HELLO_FAILED: + ds_put_printable(string, (char *) oem->data, len - sizeof *oem); + break; + + case OFPET_BAD_REQUEST: + s = ofp_to_string(oem->data, len - sizeof *oem, 1); + ds_put_cstr(string, s); + free(s); + break; + + default: + ds_put_hex_dump(string, oem->data, len - sizeof *oem, 0, true); + break; + } +} + +/* Pretty-print the OFPT_PORT_STATUS packet of 'len' bytes at 'oh' to 'string' + * at the given 'verbosity' level. */ +static void +ofp_print_port_status(struct ds *string, const void *oh, size_t len UNUSED, + int verbosity UNUSED) +{ + const struct ofp_port_status *ops = oh; + + if (ops->reason == OFPPR_ADD) { + ds_put_format(string, " ADD:"); + } else if (ops->reason == OFPPR_DELETE) { + ds_put_format(string, " DEL:"); + } else if (ops->reason == OFPPR_MODIFY) { + ds_put_format(string, " MOD:"); + } + + ofp_print_phy_port(string, &ops->desc); +} + +static void +ofp_desc_stats_reply(struct ds *string, const void *body, size_t len UNUSED, + int verbosity UNUSED) +{ + const struct ofp_desc_stats *ods = body; + + ds_put_format(string, "Manufacturer: %s\n", ods->mfr_desc); + ds_put_format(string, "Hardware: %s\n", ods->hw_desc); + ds_put_format(string, "Software: %s\n", ods->sw_desc); + ds_put_format(string, "Serial Num: %s\n", ods->serial_num); +} + +static void +ofp_flow_stats_request(struct ds *string, const void *oh, size_t len UNUSED, + int verbosity) +{ + const struct ofp_flow_stats_request *fsr = oh; + + if (fsr->table_id == 0xff) { + ds_put_format(string, " table_id=any, "); + } else { + ds_put_format(string, " table_id=%"PRIu8", ", fsr->table_id); + } + + ofp_print_match(string, &fsr->match, verbosity); +} + +static void +ofp_flow_stats_reply(struct ds *string, const void *body_, size_t len, + int verbosity) +{ + const char *body = body_; + const char *pos = body; + for (;;) { + const struct ofp_flow_stats *fs; + ptrdiff_t bytes_left = body + len - pos; + size_t length; + + if (bytes_left < sizeof *fs) { + if (bytes_left != 0) { + ds_put_format(string, " ***%td leftover bytes at end***", + bytes_left); + } + break; + } + + fs = (const void *) pos; + length = ntohs(fs->length); + if (length < sizeof *fs) { + ds_put_format(string, " ***length=%zu shorter than minimum %zu***", + length, sizeof *fs); + break; + } else if (length > bytes_left) { + ds_put_format(string, + " ***length=%zu but only %td bytes left***", + length, bytes_left); + break; + } else if ((length - sizeof *fs) % sizeof fs->actions[0]) { + ds_put_format(string, + " ***length=%zu has %zu bytes leftover in " + "final action***", + length, + (length - sizeof *fs) % sizeof fs->actions[0]); + break; + } + + ds_put_format(string, " duration=%"PRIu32"s, ", ntohl(fs->duration)); + ds_put_format(string, "table_id=%"PRIu8", ", fs->table_id); + ds_put_format(string, "priority=%"PRIu16", ", + fs->match.wildcards ? ntohs(fs->priority) : (uint16_t)-1); + ds_put_format(string, "n_packets=%"PRIu64", ", + ntohll(fs->packet_count)); + ds_put_format(string, "n_bytes=%"PRIu64", ", ntohll(fs->byte_count)); + if (fs->idle_timeout != htons(OFP_FLOW_PERMANENT)) { + ds_put_format(string, "idle_timeout=%"PRIu16",", + ntohs(fs->idle_timeout)); + } + if (fs->hard_timeout != htons(OFP_FLOW_PERMANENT)) { + ds_put_format(string, "hard_timeout=%"PRIu16",", + ntohs(fs->hard_timeout)); + } + ofp_print_match(string, &fs->match, verbosity); + ofp_print_actions(string, fs->actions, length - sizeof *fs); + ds_put_char(string, '\n'); + + pos += length; + } +} + +static void +ofp_aggregate_stats_request(struct ds *string, const void *oh, + size_t len UNUSED, int verbosity) +{ + const struct ofp_aggregate_stats_request *asr = oh; + + if (asr->table_id == 0xff) { + ds_put_format(string, " table_id=any, "); + } else { + ds_put_format(string, " table_id=%"PRIu8", ", asr->table_id); + } + + ofp_print_match(string, &asr->match, verbosity); +} + +static void +ofp_aggregate_stats_reply(struct ds *string, const void *body_, + size_t len UNUSED, int verbosity UNUSED) +{ + const struct ofp_aggregate_stats_reply *asr = body_; + + ds_put_format(string, " packet_count=%"PRIu64, ntohll(asr->packet_count)); + ds_put_format(string, " byte_count=%"PRIu64, ntohll(asr->byte_count)); + ds_put_format(string, " flow_count=%"PRIu32, ntohl(asr->flow_count)); +} + +static void print_port_stat(struct ds *string, const char *leader, + uint64_t stat, int more) +{ + ds_put_cstr(string, leader); + if (stat != -1) { + ds_put_format(string, "%"PRIu64, stat); + } else { + ds_put_char(string, '?'); + } + if (more) { + ds_put_cstr(string, ", "); + } else { + ds_put_cstr(string, "\n"); + } +} + +static void +ofp_port_stats_reply(struct ds *string, const void *body, size_t len, + int verbosity) +{ + const struct ofp_port_stats *ps = body; + size_t n = len / sizeof *ps; + ds_put_format(string, " %zu ports\n", n); + if (verbosity < 1) { + return; + } + + for (; n--; ps++) { + ds_put_format(string, " port %2"PRIu16": ", ntohs(ps->port_no)); + + ds_put_cstr(string, "rx "); + print_port_stat(string, "pkts=", ntohll(ps->rx_packets), 1); + print_port_stat(string, "bytes=", ntohll(ps->rx_bytes), 1); + print_port_stat(string, "drop=", ntohll(ps->rx_dropped), 1); + print_port_stat(string, "errs=", ntohll(ps->rx_errors), 1); + print_port_stat(string, "frame=", ntohll(ps->rx_frame_err), 1); + print_port_stat(string, "over=", ntohll(ps->rx_over_err), 1); + print_port_stat(string, "crc=", ntohll(ps->rx_crc_err), 0); + + ds_put_cstr(string, " tx "); + print_port_stat(string, "pkts=", ntohll(ps->tx_packets), 1); + print_port_stat(string, "bytes=", ntohll(ps->tx_bytes), 1); + print_port_stat(string, "drop=", ntohll(ps->tx_dropped), 1); + print_port_stat(string, "errs=", ntohll(ps->tx_errors), 1); + print_port_stat(string, "coll=", ntohll(ps->collisions), 0); + } +} + +static void +ofp_table_stats_reply(struct ds *string, const void *body, size_t len, + int verbosity) +{ + const struct ofp_table_stats *ts = body; + size_t n = len / sizeof *ts; + ds_put_format(string, " %zu tables\n", n); + if (verbosity < 1) { + return; + } + + for (; n--; ts++) { + char name[OFP_MAX_TABLE_NAME_LEN + 1]; + strncpy(name, ts->name, sizeof name); + name[OFP_MAX_TABLE_NAME_LEN] = '\0'; + + ds_put_format(string, " %d: %-8s: ", ts->table_id, name); + ds_put_format(string, "wild=0x%05"PRIx32", ", ntohl(ts->wildcards)); + ds_put_format(string, "max=%6"PRIu32", ", ntohl(ts->max_entries)); + ds_put_format(string, "active=%"PRIu32"\n", ntohl(ts->active_count)); + ds_put_cstr(string, " "); + ds_put_format(string, "lookup=%"PRIu64", ", + ntohll(ts->lookup_count)); + ds_put_format(string, "matched=%"PRIu64"\n", + ntohll(ts->matched_count)); + } +} + +static void +vendor_stat(struct ds *string, const void *body, size_t len, + int verbosity UNUSED) +{ + ds_put_format(string, " vendor=%08"PRIx32, ntohl(*(uint32_t *) body)); + ds_put_format(string, " %zu bytes additional data", + len - sizeof(uint32_t)); +} + +enum stats_direction { + REQUEST, + REPLY +}; + +static void +print_stats(struct ds *string, int type, const void *body, size_t body_len, + int verbosity, enum stats_direction direction) +{ + struct stats_msg { + size_t min_body, max_body; + void (*printer)(struct ds *, const void *, size_t len, int verbosity); + }; + + struct stats_type { + int type; + const char *name; + struct stats_msg request; + struct stats_msg reply; + }; + + static const struct stats_type stats_types[] = { + { + OFPST_DESC, + "description", + { 0, 0, NULL }, + { 0, SIZE_MAX, ofp_desc_stats_reply }, + }, + { + OFPST_FLOW, + "flow", + { sizeof(struct ofp_flow_stats_request), + sizeof(struct ofp_flow_stats_request), + ofp_flow_stats_request }, + { 0, SIZE_MAX, ofp_flow_stats_reply }, + }, + { + OFPST_AGGREGATE, + "aggregate", + { sizeof(struct ofp_aggregate_stats_request), + sizeof(struct ofp_aggregate_stats_request), + ofp_aggregate_stats_request }, + { sizeof(struct ofp_aggregate_stats_reply), + sizeof(struct ofp_aggregate_stats_reply), + ofp_aggregate_stats_reply }, + }, + { + OFPST_TABLE, + "table", + { 0, 0, NULL }, + { 0, SIZE_MAX, ofp_table_stats_reply }, + }, + { + OFPST_PORT, + "port", + { 0, 0, NULL, }, + { 0, SIZE_MAX, ofp_port_stats_reply }, + }, + { + OFPST_VENDOR, + "vendor-specific", + { sizeof(uint32_t), SIZE_MAX, vendor_stat }, + { sizeof(uint32_t), SIZE_MAX, vendor_stat }, + }, + { + -1, + "unknown", + { 0, 0, NULL, }, + { 0, 0, NULL, }, + }, + }; + + const struct stats_type *s; + const struct stats_msg *m; + + if (type >= ARRAY_SIZE(stats_types) || !stats_types[type].name) { + ds_put_format(string, " ***unknown type %d***", type); + return; + } + for (s = stats_types; s->type >= 0; s++) { + if (s->type == type) { + break; + } + } + ds_put_format(string, " type=%d(%s)\n", type, s->name); + + m = direction == REQUEST ? &s->request : &s->reply; + if (body_len < m->min_body || body_len > m->max_body) { + ds_put_format(string, " ***body_len=%zu not in %zu...%zu***", + body_len, m->min_body, m->max_body); + return; + } + if (m->printer) { + m->printer(string, body, body_len, verbosity); + } +} + +static void +ofp_stats_request(struct ds *string, const void *oh, size_t len, int verbosity) +{ + const struct ofp_stats_request *srq = oh; + + if (srq->flags) { + ds_put_format(string, " ***unknown flags 0x%04"PRIx16"***", + ntohs(srq->flags)); + } + + print_stats(string, ntohs(srq->type), srq->body, + len - offsetof(struct ofp_stats_request, body), + verbosity, REQUEST); +} + +static void +ofp_stats_reply(struct ds *string, const void *oh, size_t len, int verbosity) +{ + const struct ofp_stats_reply *srp = oh; + + ds_put_cstr(string, " flags="); + if (!srp->flags) { + ds_put_cstr(string, "none"); + } else { + uint16_t flags = ntohs(srp->flags); + if (flags & OFPSF_REPLY_MORE) { + ds_put_cstr(string, "[more]"); + flags &= ~OFPSF_REPLY_MORE; + } + if (flags) { + ds_put_format(string, "[***unknown flags 0x%04"PRIx16"***]", flags); + } + } + + print_stats(string, ntohs(srp->type), srp->body, + len - offsetof(struct ofp_stats_reply, body), + verbosity, REPLY); +} + +static void +ofp_echo(struct ds *string, const void *oh, size_t len, int verbosity) +{ + const struct ofp_header *hdr = oh; + + ds_put_format(string, " %zu bytes of payload\n", len - sizeof *hdr); + if (verbosity > 1) { + ds_put_hex_dump(string, hdr, len - sizeof *hdr, 0, true); + } +} + +struct openflow_packet { + uint8_t type; + const char *name; + size_t min_size; + void (*printer)(struct ds *, const void *, size_t len, int verbosity); +}; + +static const struct openflow_packet packets[] = { + { + OFPT_HELLO, + "hello", + sizeof (struct ofp_header), + NULL, + }, + { + OFPT_FEATURES_REQUEST, + "features_request", + sizeof (struct ofp_header), + NULL, + }, + { + OFPT_FEATURES_REPLY, + "features_reply", + sizeof (struct ofp_switch_features), + ofp_print_switch_features, + }, + { + OFPT_GET_CONFIG_REQUEST, + "get_config_request", + sizeof (struct ofp_header), + NULL, + }, + { + OFPT_GET_CONFIG_REPLY, + "get_config_reply", + sizeof (struct ofp_switch_config), + ofp_print_switch_config, + }, + { + OFPT_SET_CONFIG, + "set_config", + sizeof (struct ofp_switch_config), + ofp_print_switch_config, + }, + { + OFPT_PACKET_IN, + "packet_in", + offsetof(struct ofp_packet_in, data), + ofp_packet_in, + }, + { + OFPT_PACKET_OUT, + "packet_out", + sizeof (struct ofp_packet_out), + ofp_packet_out, + }, + { + OFPT_FLOW_MOD, + "flow_mod", + sizeof (struct ofp_flow_mod), + ofp_print_flow_mod, + }, + { + OFPT_FLOW_EXPIRED, + "flow_expired", + sizeof (struct ofp_flow_expired), + ofp_print_flow_expired, + }, + { + OFPT_PORT_MOD, + "port_mod", + sizeof (struct ofp_port_mod), + ofp_print_port_mod, + }, + { + OFPT_PORT_STATUS, + "port_status", + sizeof (struct ofp_port_status), + ofp_print_port_status + }, + { + OFPT_ERROR, + "error_msg", + sizeof (struct ofp_error_msg), + ofp_print_error_msg, + }, + { + OFPT_STATS_REQUEST, + "stats_request", + sizeof (struct ofp_stats_request), + ofp_stats_request, + }, + { + OFPT_STATS_REPLY, + "stats_reply", + sizeof (struct ofp_stats_reply), + ofp_stats_reply, + }, + { + OFPT_ECHO_REQUEST, + "echo_request", + sizeof (struct ofp_header), + ofp_echo, + }, + { + OFPT_ECHO_REPLY, + "echo_reply", + sizeof (struct ofp_header), + ofp_echo, + }, + { + OFPT_VENDOR, + "vendor", + sizeof (struct ofp_vendor_header), + NULL, + }, +}; + +/* Composes and returns a string representing the OpenFlow packet of 'len' + * bytes at 'oh' at the given 'verbosity' level. 0 is a minimal amount of + * verbosity and higher numbers increase verbosity. The caller is responsible + * for freeing the string. */ +char * +ofp_to_string(const void *oh_, size_t len, int verbosity) +{ + struct ds string = DS_EMPTY_INITIALIZER; + const struct ofp_header *oh = oh_; + const struct openflow_packet *pkt; + + if (len < sizeof(struct ofp_header)) { + ds_put_cstr(&string, "OpenFlow packet too short:\n"); + ds_put_hex_dump(&string, oh, len, 0, true); + return ds_cstr(&string); + } else if (oh->version != OFP_VERSION) { + ds_put_format(&string, "Bad OpenFlow version %"PRIu8":\n", oh->version); + ds_put_hex_dump(&string, oh, len, 0, true); + return ds_cstr(&string); + } + + for (pkt = packets; ; pkt++) { + if (pkt >= &packets[ARRAY_SIZE(packets)]) { + ds_put_format(&string, "Unknown OpenFlow packet type %"PRIu8":\n", + oh->type); + ds_put_hex_dump(&string, oh, len, 0, true); + return ds_cstr(&string); + } else if (oh->type == pkt->type) { + break; + } + } + + ds_put_format(&string, "%s (xid=0x%"PRIx32"):", pkt->name, oh->xid); + + if (ntohs(oh->length) > len) + ds_put_format(&string, " (***truncated to %zu bytes from %"PRIu16"***)", + len, ntohs(oh->length)); + else if (ntohs(oh->length) < len) { + ds_put_format(&string, " (***only uses %"PRIu16" bytes out of %zu***)\n", + ntohs(oh->length), len); + len = ntohs(oh->length); + } + + if (len < pkt->min_size) { + ds_put_format(&string, " (***length=%zu < min_size=%zu***)\n", + len, pkt->min_size); + } else if (!pkt->printer) { + if (len > sizeof *oh) { + ds_put_format(&string, " length=%"PRIu16" (decoder not implemented)\n", + ntohs(oh->length)); + } + } else { + pkt->printer(&string, oh, len, verbosity); + } + if (verbosity >= 3) { + ds_put_hex_dump(&string, oh, len, 0, true); + } + if (string.string[string.length - 1] != '\n') { + ds_put_char(&string, '\n'); + } + return ds_cstr(&string); +} + +/* Returns the name for the specified OpenFlow message type as a string, + * e.g. "OFPT_FEATURES_REPLY". If no name is known, the string returned is a + * hex number, e.g. "0x55". + * + * The caller must free the returned string when it is no longer needed. */ +char * +ofp_message_type_to_string(uint8_t type) +{ + struct ds s = DS_EMPTY_INITIALIZER; + const struct openflow_packet *pkt; + for (pkt = packets; ; pkt++) { + if (pkt >= &packets[ARRAY_SIZE(packets)]) { + ds_put_format(&s, "0x%02"PRIx8, type); + break; + } else if (type == pkt->type) { + const char *p; + + ds_put_cstr(&s, "OFPT_"); + for (p = pkt->name; *p; p++) { + ds_put_char(&s, toupper((unsigned char) *p)); + } + break; + } + } + return ds_cstr(&s); +} + +static void +print_and_free(FILE *stream, char *string) +{ + fputs(string, stream); + free(string); +} + +/* Pretty-print the OpenFlow packet of 'len' bytes at 'oh' to 'stream' at the + * given 'verbosity' level. 0 is a minimal amount of verbosity and higher + * numbers increase verbosity. */ +void +ofp_print(FILE *stream, const void *oh, size_t len, int verbosity) +{ + print_and_free(stream, ofp_to_string(oh, len, verbosity)); +} + +/* Dumps the contents of the Ethernet frame in the 'len' bytes starting at + * 'data' to 'stream' using tcpdump. 'total_len' specifies the full length of + * the Ethernet frame (of which 'len' bytes were captured). + * + * This starts and kills a tcpdump subprocess so it's quite expensive. */ +void +ofp_print_packet(FILE *stream, const void *data, size_t len, size_t total_len) +{ + print_and_free(stream, ofp_packet_to_string(data, len, total_len)); +} diff --git a/lib/ofp-print.h b/lib/ofp-print.h new file mode 100644 index 00000000..21fb749b --- /dev/null +++ b/lib/ofp-print.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* OpenFlow protocol pretty-printer. */ + +#ifndef OFP_PRINT_H +#define OFP_PRINT_H 1 + +#include +#include + +struct ofp_flow_mod; +struct ofp_match; + +#ifdef __cplusplus +extern "C" { +#endif + +void ofp_print(FILE *, const void *, size_t, int verbosity); +void ofp_print_packet(FILE *stream, const void *data, size_t len, size_t total_len); + +char *ofp_to_string(const void *, size_t, int verbosity); +char *ofp_match_to_string(const struct ofp_match *, int verbosity); +char *ofp_packet_to_string(const void *data, size_t len, size_t total_len); +char *ofp_message_type_to_string(uint8_t type); + +#ifdef __cplusplus +} +#endif + +#endif /* ofp-print.h */ diff --git a/lib/ofpbuf.c b/lib/ofpbuf.c new file mode 100644 index 00000000..ed326fd5 --- /dev/null +++ b/lib/ofpbuf.c @@ -0,0 +1,288 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "ofpbuf.h" +#include +#include +#include +#include "util.h" + +/* Initializes 'b' as an empty ofpbuf that contains the 'allocated' bytes of + * memory starting at 'base'. + * + * 'base' should ordinarily be the first byte of a region obtained from + * malloc(), but in circumstances where it can be guaranteed that 'b' will + * never need to be expanded or freed, it can be a pointer into arbitrary + * memory. */ +void +ofpbuf_use(struct ofpbuf *b, void *base, size_t allocated) +{ + b->base = b->data = base; + b->allocated = allocated; + b->size = 0; + b->l2 = b->l3 = b->l4 = b->l7 = NULL; + b->next = NULL; + b->private = NULL; +} + +/* Initializes 'b' as an empty ofpbuf with an initial capacity of 'size' + * bytes. */ +void +ofpbuf_init(struct ofpbuf *b, size_t size) +{ + ofpbuf_use(b, size ? xmalloc(size) : NULL, size); +} + +/* Frees memory that 'b' points to. */ +void +ofpbuf_uninit(struct ofpbuf *b) +{ + if (b) { + free(b->base); + } +} + +/* Frees memory that 'b' points to and allocates a new ofpbuf */ +void +ofpbuf_reinit(struct ofpbuf *b, size_t size) +{ + ofpbuf_uninit(b); + ofpbuf_init(b, size); +} + +/* Creates and returns a new ofpbuf with an initial capacity of 'size' + * bytes. */ +struct ofpbuf * +ofpbuf_new(size_t size) +{ + struct ofpbuf *b = xmalloc(sizeof *b); + ofpbuf_init(b, size); + return b; +} + +struct ofpbuf * +ofpbuf_clone(const struct ofpbuf *buffer) +{ + return ofpbuf_clone_data(buffer->data, buffer->size); +} + +struct ofpbuf * +ofpbuf_clone_data(const void *data, size_t size) +{ + struct ofpbuf *b = ofpbuf_new(size); + ofpbuf_put(b, data, size); + return b; +} + +/* Frees memory that 'b' points to, as well as 'b' itself. */ +void +ofpbuf_delete(struct ofpbuf *b) +{ + if (b) { + ofpbuf_uninit(b); + free(b); + } +} + +/* Returns the number of bytes of headroom in 'b', that is, the number of bytes + * of unused space in ofpbuf 'b' before the data that is in use. (Most + * commonly, the data in a ofpbuf is at its beginning, and thus the ofpbuf's + * headroom is 0.) */ +size_t +ofpbuf_headroom(struct ofpbuf *b) +{ + return (char*)b->data - (char*)b->base; +} + +/* Returns the number of bytes that may be appended to the tail end of ofpbuf + * 'b' before the ofpbuf must be reallocated. */ +size_t +ofpbuf_tailroom(struct ofpbuf *b) +{ + return (char*)ofpbuf_end(b) - (char*)ofpbuf_tail(b); +} + +/* Ensures that 'b' has room for at least 'size' bytes at its tail end, + * reallocating and copying its data if necessary. */ +void +ofpbuf_prealloc_tailroom(struct ofpbuf *b, size_t size) +{ + if (size > ofpbuf_tailroom(b)) { + size_t new_allocated = b->allocated + MAX(size, 64); + void *new_base = xmalloc(new_allocated); + uintptr_t base_delta = (char*)new_base - (char*)b->base; + memcpy(new_base, b->base, b->allocated); + free(b->base); + b->base = new_base; + b->allocated = new_allocated; + b->data = (char*)b->data + base_delta; + if (b->l2) { + b->l2 = (char*)b->l2 + base_delta; + } + if (b->l3) { + b->l3 = (char*)b->l3 + base_delta; + } + if (b->l4) { + b->l4 = (char*)b->l4 + base_delta; + } + if (b->l7) { + b->l7 = (char*)b->l7 + base_delta; + } + } +} + +void +ofpbuf_prealloc_headroom(struct ofpbuf *b, size_t size) +{ + assert(size <= ofpbuf_headroom(b)); +} + +/* Trims the size of 'b' to fit its actual content. */ +void +ofpbuf_trim(struct ofpbuf *b) +{ + /* XXX These could be supported, but the current client doesn't care. */ + assert(b->data == b->base); + assert(b->l2 == NULL && b->l3 == NULL && b->l4 == NULL && b->l7 == NULL); + if (b->allocated > b->size) { + b->base = b->data = xrealloc(b->base, b->size); + b->allocated = b->size; + } +} + +/* Appends 'size' bytes of data to the tail end of 'b', reallocating and + * copying its data if necessary. Returns a pointer to the first byte of the + * new data, which is left uninitialized. */ +void * +ofpbuf_put_uninit(struct ofpbuf *b, size_t size) +{ + void *p; + ofpbuf_prealloc_tailroom(b, size); + p = ofpbuf_tail(b); + b->size += size; + return p; +} + +/* Appends 'size' zeroed bytes to the tail end of 'b'. Data in 'b' is + * reallocated and copied if necessary. Returns a pointer to the first byte of + * the data's location in the ofpbuf. */ +void * +ofpbuf_put_zeros(struct ofpbuf *b, size_t size) +{ + void *dst = ofpbuf_put_uninit(b, size); + memset(dst, 0, size); + return dst; +} + +/* Appends the 'size' bytes of data in 'p' to the tail end of 'b'. Data in 'b' + * is reallocated and copied if necessary. Returns a pointer to the first + * byte of the data's location in the ofpbuf. */ +void * +ofpbuf_put(struct ofpbuf *b, const void *p, size_t size) +{ + void *dst = ofpbuf_put_uninit(b, size); + memcpy(dst, p, size); + return dst; +} + +/* Reserves 'size' bytes of headroom so that they can be later allocated with + * ofpbuf_push_uninit() without reallocating the ofpbuf. */ +void +ofpbuf_reserve(struct ofpbuf *b, size_t size) +{ + assert(!b->size); + ofpbuf_prealloc_tailroom(b, size); + b->data = (char*)b->data + size; +} + +void * +ofpbuf_push_uninit(struct ofpbuf *b, size_t size) +{ + ofpbuf_prealloc_headroom(b, size); + b->data = (char*)b->data - size; + b->size += size; + return b->data; +} + +void * +ofpbuf_push(struct ofpbuf *b, const void *p, size_t size) +{ + void *dst = ofpbuf_push_uninit(b, size); + memcpy(dst, p, size); + return dst; +} + +/* If 'b' contains at least 'offset + size' bytes of data, returns a pointer to + * byte 'offset'. Otherwise, returns a null pointer. */ +void * +ofpbuf_at(const struct ofpbuf *b, size_t offset, size_t size) +{ + return offset + size <= b->size ? (char *) b->data + offset : NULL; +} + +/* Returns a pointer to byte 'offset' in 'b', which must contain at least + * 'offset + size' bytes of data. */ +void * +ofpbuf_at_assert(const struct ofpbuf *b, size_t offset, size_t size) +{ + assert(offset + size <= b->size); + return ((char *) b->data) + offset; +} + +/* Returns the byte following the last byte of data in use in 'b'. */ +void * +ofpbuf_tail(const struct ofpbuf *b) +{ + return (char *) b->data + b->size; +} + +/* Returns the byte following the last byte allocated for use (but not + * necessarily in use) by 'b'. */ +void * +ofpbuf_end(const struct ofpbuf *b) +{ + return (char *) b->base + b->allocated; +} + +/* Clears any data from 'b'. */ +void +ofpbuf_clear(struct ofpbuf *b) +{ + b->data = b->base; + b->size = 0; +} + +/* Removes 'size' bytes from the head end of 'b', which must contain at least + * 'size' bytes of data. Returns the first byte of data removed. */ +void * +ofpbuf_pull(struct ofpbuf *b, size_t size) +{ + void *data = b->data; + assert(b->size >= size); + b->data = (char*)b->data + size; + b->size -= size; + return data; +} + +/* If 'b' has at least 'size' bytes of data, removes that many bytes from the + * head end of 'b' and returns the first byte removed. Otherwise, returns a + * null pointer without modifying 'b'. */ +void * +ofpbuf_try_pull(struct ofpbuf *b, size_t size) +{ + return b->size >= size ? ofpbuf_pull(b, size) : NULL; +} diff --git a/lib/ofpbuf.h b/lib/ofpbuf.h new file mode 100644 index 00000000..a68c2800 --- /dev/null +++ b/lib/ofpbuf.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef OFPBUF_H +#define OFPBUF_H 1 + +#include + +/* Buffer for holding arbitrary data. An ofpbuf is automatically reallocated + * as necessary if it grows too large for the available memory. */ +struct ofpbuf { + void *base; /* First byte of area malloc()'d area. */ + size_t allocated; /* Number of bytes allocated. */ + + void *data; /* First byte actually in use. */ + size_t size; /* Number of bytes in use. */ + + void *l2; /* Link-level header. */ + void *l3; /* Network-level header. */ + void *l4; /* Transport-level header. */ + void *l7; /* Application data. */ + + struct ofpbuf *next; /* Next in a list of ofpbufs. */ + void *private; /* Private pointer for use by owner. */ +}; + +void ofpbuf_use(struct ofpbuf *, void *, size_t); + +void ofpbuf_init(struct ofpbuf *, size_t); +void ofpbuf_uninit(struct ofpbuf *); +void ofpbuf_reinit(struct ofpbuf *, size_t); + +struct ofpbuf *ofpbuf_new(size_t); +struct ofpbuf *ofpbuf_clone(const struct ofpbuf *); +struct ofpbuf *ofpbuf_clone_data(const void *, size_t); +void ofpbuf_delete(struct ofpbuf *); + +void *ofpbuf_at(const struct ofpbuf *, size_t offset, size_t size); +void *ofpbuf_at_assert(const struct ofpbuf *, size_t offset, size_t size); +void *ofpbuf_tail(const struct ofpbuf *); +void *ofpbuf_end(const struct ofpbuf *); + +void *ofpbuf_put_uninit(struct ofpbuf *, size_t); +void *ofpbuf_put_zeros(struct ofpbuf *, size_t); +void *ofpbuf_put(struct ofpbuf *, const void *, size_t); +void ofpbuf_reserve(struct ofpbuf *, size_t); +void *ofpbuf_push_uninit(struct ofpbuf *b, size_t); +void *ofpbuf_push(struct ofpbuf *b, const void *, size_t); + +size_t ofpbuf_headroom(struct ofpbuf *); +size_t ofpbuf_tailroom(struct ofpbuf *); +void ofpbuf_prealloc_headroom(struct ofpbuf *, size_t); +void ofpbuf_prealloc_tailroom(struct ofpbuf *, size_t); +void ofpbuf_trim(struct ofpbuf *); + +void ofpbuf_clear(struct ofpbuf *); +void *ofpbuf_pull(struct ofpbuf *, size_t); +void *ofpbuf_try_pull(struct ofpbuf *, size_t); + +#endif /* ofpbuf.h */ diff --git a/lib/packets.h b/lib/packets.h new file mode 100644 index 00000000..c1ae621e --- /dev/null +++ b/lib/packets.h @@ -0,0 +1,274 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#ifndef PACKETS_H +#define PACKETS_H 1 + +#include +#include +#include "compiler.h" +#include "random.h" +#include "util.h" + +#define ETH_ADDR_LEN 6 + +static const uint8_t eth_addr_broadcast[ETH_ADDR_LEN] UNUSED + = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + +static inline bool eth_addr_is_broadcast(const uint8_t ea[6]) +{ + return (ea[0] & ea[1] & ea[2] & ea[3] & ea[4] & ea[5]) == 0xff; +} + +/* Returns true if 'ea' is an Ethernet address used for virtual interfaces + * under XenServer. Generally the actual Ethernet address is FE:FF:FF:FF:FF:FF + * but it can be FE:FE:FE:FE:FE:FE in some cases. */ +static inline bool eth_addr_is_vif(const uint8_t ea[6]) +{ + return ea[0] == 0xfe && (ea[1] & ea[2] & ea[3] & ea[4] & ea[5]) >= 0xfe; +} + +static inline bool eth_addr_is_multicast(const uint8_t ea[6]) +{ + return ea[0] & 1; +} +static inline bool eth_addr_is_local(const uint8_t ea[6]) +{ + return ea[0] & 2; +} +static inline bool eth_addr_is_zero(const uint8_t ea[6]) +{ + return !(ea[0] | ea[1] | ea[2] | ea[3] | ea[4] | ea[5]); +} +static inline bool eth_addr_equals(const uint8_t a[ETH_ADDR_LEN], + const uint8_t b[ETH_ADDR_LEN]) +{ + return !memcmp(a, b, ETH_ADDR_LEN); +} +static inline uint64_t eth_addr_to_uint64(const uint8_t ea[ETH_ADDR_LEN]) +{ + return (((uint64_t) ea[0] << 40) + | ((uint64_t) ea[1] << 32) + | ((uint64_t) ea[2] << 24) + | ((uint64_t) ea[3] << 16) + | ((uint64_t) ea[4] << 8) + | ea[5]); +} +static inline void eth_addr_from_uint64(uint64_t x, uint8_t ea[ETH_ADDR_LEN]) +{ + ea[0] = x >> 40; + ea[1] = x >> 32; + ea[2] = x >> 24; + ea[3] = x >> 16; + ea[4] = x >> 8; + ea[5] = x; +} +static inline void eth_addr_mark_random(uint8_t ea[ETH_ADDR_LEN]) +{ + ea[0] &= ~1; /* Unicast. */ + ea[0] |= 2; /* Private. */ +} +static inline void eth_addr_random(uint8_t ea[ETH_ADDR_LEN]) +{ + random_bytes(ea, ETH_ADDR_LEN); + eth_addr_mark_random(ea); +} +/* Returns true if 'ea' is a reserved multicast address, that a bridge must + * never forward, false otherwise. */ +static inline bool eth_addr_is_reserved(const uint8_t ea[ETH_ADDR_LEN]) +{ + return (ea[0] == 0x01 + && ea[1] == 0x80 + && ea[2] == 0xc2 + && ea[3] == 0x00 + && ea[4] == 0x00 + && (ea[5] & 0xf0) == 0x00); +} + +#define ETH_ADDR_FMT \ + "%02"PRIx8":%02"PRIx8":%02"PRIx8":%02"PRIx8":%02"PRIx8":%02"PRIx8 +#define ETH_ADDR_ARGS(ea) \ + (ea)[0], (ea)[1], (ea)[2], (ea)[3], (ea)[4], (ea)[5] + +#define ETH_TYPE_IP 0x0800 +#define ETH_TYPE_ARP 0x0806 +#define ETH_TYPE_VLAN 0x8100 + +#define ETH_HEADER_LEN 14 +#define ETH_PAYLOAD_MIN 46 +#define ETH_PAYLOAD_MAX 1500 +#define ETH_TOTAL_MIN (ETH_HEADER_LEN + ETH_PAYLOAD_MIN) +#define ETH_TOTAL_MAX (ETH_HEADER_LEN + ETH_PAYLOAD_MAX) +#define ETH_VLAN_TOTAL_MAX (ETH_HEADER_LEN + VLAN_HEADER_LEN + ETH_PAYLOAD_MAX) +struct eth_header { + uint8_t eth_dst[ETH_ADDR_LEN]; + uint8_t eth_src[ETH_ADDR_LEN]; + uint16_t eth_type; +} __attribute__((packed)); +BUILD_ASSERT_DECL(ETH_HEADER_LEN == sizeof(struct eth_header)); + +#define LLC_DSAP_SNAP 0xaa +#define LLC_SSAP_SNAP 0xaa +#define LLC_CNTL_SNAP 3 + +#define LLC_HEADER_LEN 3 +struct llc_header { + uint8_t llc_dsap; + uint8_t llc_ssap; + uint8_t llc_cntl; +} __attribute__((packed)); +BUILD_ASSERT_DECL(LLC_HEADER_LEN == sizeof(struct llc_header)); + +#define SNAP_ORG_ETHERNET "\0\0" /* The compiler adds a null byte, so + sizeof(SNAP_ORG_ETHERNET) == 3. */ +#define SNAP_HEADER_LEN 5 +struct snap_header { + uint8_t snap_org[3]; + uint16_t snap_type; +} __attribute__((packed)); +BUILD_ASSERT_DECL(SNAP_HEADER_LEN == sizeof(struct snap_header)); + +#define LLC_SNAP_HEADER_LEN (LLC_HEADER_LEN + SNAP_HEADER_LEN) +struct llc_snap_header { + struct llc_header llc; + struct snap_header snap; +} __attribute__((packed)); +BUILD_ASSERT_DECL(LLC_SNAP_HEADER_LEN == sizeof(struct llc_snap_header)); + +#define VLAN_VID_MASK 0x0fff +#define VLAN_PCP_MASK 0xe000 + +#define VLAN_HEADER_LEN 4 +struct vlan_header { + uint16_t vlan_tci; /* Lowest 12 bits are VLAN ID. */ + uint16_t vlan_next_type; +}; +BUILD_ASSERT_DECL(VLAN_HEADER_LEN == sizeof(struct vlan_header)); + +#define VLAN_ETH_HEADER_LEN (ETH_HEADER_LEN + VLAN_HEADER_LEN) +struct vlan_eth_header { + uint8_t veth_dst[ETH_ADDR_LEN]; + uint8_t veth_src[ETH_ADDR_LEN]; + uint16_t veth_type; /* Always htons(ETH_TYPE_VLAN). */ + uint16_t veth_tci; /* Lowest 12 bits are VLAN ID. */ + uint16_t veth_next_type; +} __attribute__((packed)); +BUILD_ASSERT_DECL(VLAN_ETH_HEADER_LEN == sizeof(struct vlan_eth_header)); + +/* The "(void) (ip)[0]" below has no effect on the value, since it's the first + * argument of a comma expression, but it makes sure that 'ip' is a pointer. + * This is useful since a common mistake is to pass an integer instead of a + * pointer to IP_ARGS. */ +#define IP_FMT "%"PRIu8".%"PRIu8".%"PRIu8".%"PRIu8 +#define IP_ARGS(ip) \ + ((void) (ip)[0], ((uint8_t *) ip)[0]), \ + ((uint8_t *) ip)[1], \ + ((uint8_t *) ip)[2], \ + ((uint8_t *) ip)[3] + +#define IP_VER(ip_ihl_ver) ((ip_ihl_ver) >> 4) +#define IP_IHL(ip_ihl_ver) ((ip_ihl_ver) & 15) +#define IP_IHL_VER(ihl, ver) (((ver) << 4) | (ihl)) + +#define IP_TYPE_ICMP 1 +#define IP_TYPE_TCP 6 +#define IP_TYPE_UDP 17 + +#define IP_VERSION 4 + +#define IP_DONT_FRAGMENT 0x4000 /* Don't fragment. */ +#define IP_MORE_FRAGMENTS 0x2000 /* More fragments. */ +#define IP_FRAG_OFF_MASK 0x1fff /* Fragment offset. */ +#define IP_IS_FRAGMENT(ip_frag_off) \ + ((ip_frag_off) & htons(IP_MORE_FRAGMENTS | IP_FRAG_OFF_MASK)) + +#define IP_HEADER_LEN 20 +struct ip_header { + uint8_t ip_ihl_ver; + uint8_t ip_tos; + uint16_t ip_tot_len; + uint16_t ip_id; + uint16_t ip_frag_off; + uint8_t ip_ttl; + uint8_t ip_proto; + uint16_t ip_csum; + uint32_t ip_src; + uint32_t ip_dst; +}; +BUILD_ASSERT_DECL(IP_HEADER_LEN == sizeof(struct ip_header)); + +#define ICMP_HEADER_LEN 4 +struct icmp_header { + uint8_t icmp_type; + uint8_t icmp_code; + uint16_t icmp_csum; +}; +BUILD_ASSERT_DECL(ICMP_HEADER_LEN == sizeof(struct icmp_header)); + +#define UDP_HEADER_LEN 8 +struct udp_header { + uint16_t udp_src; + uint16_t udp_dst; + uint16_t udp_len; + uint16_t udp_csum; +}; +BUILD_ASSERT_DECL(UDP_HEADER_LEN == sizeof(struct udp_header)); + +#define TCP_FIN 0x01 +#define TCP_SYN 0x02 +#define TCP_RST 0x04 +#define TCP_PSH 0x08 +#define TCP_ACK 0x10 +#define TCP_URG 0x20 + +#define TCP_FLAGS(tcp_ctl) (htons(tcp_ctl) & 0x003f) +#define TCP_OFFSET(tcp_ctl) (htons(tcp_ctl) >> 12) + +#define TCP_HEADER_LEN 20 +struct tcp_header { + uint16_t tcp_src; + uint16_t tcp_dst; + uint32_t tcp_seq; + uint32_t tcp_ack; + uint16_t tcp_ctl; + uint16_t tcp_winsz; + uint16_t tcp_csum; + uint16_t tcp_urg; +}; +BUILD_ASSERT_DECL(TCP_HEADER_LEN == sizeof(struct tcp_header)); + +#define ARP_HRD_ETHERNET 1 +#define ARP_PRO_IP 0x0800 +#define ARP_OP_REQUEST 1 +#define ARP_OP_REPLY 2 + +#define ARP_ETH_HEADER_LEN 28 +struct arp_eth_header { + /* Generic members. */ + uint16_t ar_hrd; /* Hardware type. */ + uint16_t ar_pro; /* Protocol type. */ + uint8_t ar_hln; /* Hardware address length. */ + uint8_t ar_pln; /* Protocol address length. */ + uint16_t ar_op; /* Opcode. */ + + /* Ethernet+IPv4 specific members. */ + uint8_t ar_sha[ETH_ADDR_LEN]; /* Sender hardware address. */ + uint32_t ar_spa; /* Sender protocol address. */ + uint8_t ar_tha[ETH_ADDR_LEN]; /* Target hardware address. */ + uint32_t ar_tpa; /* Target protocol address. */ +} __attribute__((packed)); +BUILD_ASSERT_DECL(ARP_ETH_HEADER_LEN == sizeof(struct arp_eth_header)); + +#endif /* packets.h */ diff --git a/lib/pcap.c b/lib/pcap.c new file mode 100644 index 00000000..b2cca761 --- /dev/null +++ b/lib/pcap.c @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "pcap.h" +#include +#include +#include +#include +#include "compiler.h" +#include "ofpbuf.h" + +#define THIS_MODULE VLM_pcap +#include "vlog.h" + +struct pcap_hdr { + uint32_t magic_number; /* magic number */ + uint16_t version_major; /* major version number */ + uint16_t version_minor; /* minor version number */ + int32_t thiszone; /* GMT to local correction */ + uint32_t sigfigs; /* accuracy of timestamps */ + uint32_t snaplen; /* max length of captured packets */ + uint32_t network; /* data link type */ +} PACKED; + +struct pcaprec_hdr { + uint32_t ts_sec; /* timestamp seconds */ + uint32_t ts_usec; /* timestamp microseconds */ + uint32_t incl_len; /* number of octets of packet saved in file */ + uint32_t orig_len; /* actual length of packet */ +} PACKED; + +FILE * +pcap_open(const char *file_name, const char *mode) +{ + FILE *file; + + assert(!strcmp(mode, "rb") || !strcmp(mode, "wb")); + + file = fopen(file_name, mode); + if (file == NULL) { + VLOG_WARN("%s: failed to open pcap file for %s", + file_name, mode[0] == 'r' ? "reading" : "writing"); + return NULL; + } + + if (mode[0] == 'r') { + if (!pcap_read_header(file)) { + fclose(file); + return NULL; + } + } else { + pcap_write_header(file); + } + return file; +} + +int +pcap_read_header(FILE *file) +{ + struct pcap_hdr ph; + if (fread(&ph, sizeof ph, 1, file) != 1) { + int error = ferror(file) ? errno : EOF; + VLOG_WARN("failed to read pcap header: %s", + error > 0 ? strerror(error) : "end of file"); + return error; + } + if (ph.magic_number != 0xa1b2c3d4 && ph.magic_number != 0xd4c3b2a1) { + VLOG_WARN("bad magic 0x%08"PRIx32" reading pcap file " + "(expected 0xa1b2c3d4 or 0xd4c3b2a1)", ph.magic_number); + return EPROTO; + } + return 0; +} + +void +pcap_write_header(FILE *file) +{ + /* The pcap reader is responsible for figuring out endianness based on the + * magic number, so the lack of htonX calls here is intentional. */ + struct pcap_hdr ph; + ph.magic_number = 0xa1b2c3d4; + ph.version_major = 2; + ph.version_minor = 4; + ph.thiszone = 0; + ph.sigfigs = 0; + ph.snaplen = 1518; + ph.network = 1; /* Ethernet */ + fwrite(&ph, sizeof ph, 1, file); +} + +int +pcap_read(FILE *file, struct ofpbuf **bufp) +{ + struct pcaprec_hdr prh; + struct ofpbuf *buf; + void *data; + size_t len; + + *bufp = NULL; + + /* Read header. */ + if (fread(&prh, sizeof prh, 1, file) != 1) { + int error = ferror(file) ? errno : EOF; + VLOG_WARN("failed to read pcap record header: %s", + error > 0 ? strerror(error) : "end of file"); + return error; + } + + /* Calculate length. */ + len = prh.incl_len; + if (len > 0xffff) { + uint32_t swapped_len = (((len & 0xff000000) >> 24) | + ((len & 0x00ff0000) >> 8) | + ((len & 0x0000ff00) << 8) | + ((len & 0x000000ff) << 24)); + if (swapped_len > 0xffff) { + VLOG_WARN("bad packet length %"PRIu32" or %"PRIu32" " + "reading pcap file", + len, swapped_len); + return EPROTO; + } + len = swapped_len; + } + + /* Read packet. */ + buf = ofpbuf_new(len); + data = ofpbuf_put_uninit(buf, len); + if (fread(data, len, 1, file) != 1) { + int error = ferror(file) ? errno : EOF; + VLOG_WARN("failed to read pcap packet: %s", + error > 0 ? strerror(error) : "end of file"); + ofpbuf_delete(buf); + return error; + } + *bufp = buf; + return 0; +} + +void +pcap_write(FILE *file, struct ofpbuf *buf) +{ + struct pcaprec_hdr prh; + prh.ts_sec = 0; + prh.ts_usec = 0; + prh.incl_len = buf->size; + prh.orig_len = buf->size; + fwrite(&prh, sizeof prh, 1, file); + fwrite(buf->data, buf->size, 1, file); +} diff --git a/lib/pcap.h b/lib/pcap.h new file mode 100644 index 00000000..dd7deb45 --- /dev/null +++ b/lib/pcap.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef PCAP_H +#define PCAP_H 1 + +#include + +struct ofpbuf; + +FILE *pcap_open(const char *file_name, const char *mode); +int pcap_read_header(FILE *); +void pcap_write_header(FILE *); +int pcap_read(FILE *, struct ofpbuf **); +void pcap_write(FILE *, struct ofpbuf *); + +#endif /* dhcp.h */ diff --git a/lib/poll-loop.c b/lib/poll-loop.c new file mode 100644 index 00000000..d6dbdb85 --- /dev/null +++ b/lib/poll-loop.c @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "poll-loop.h" +#include +#include +#include +#include +#include +#include "backtrace.h" +#include "coverage.h" +#include "dynamic-string.h" +#include "list.h" +#include "timeval.h" + +#define THIS_MODULE VLM_poll_loop +#include "vlog.h" + +/* An event that will wake the following call to poll_block(). */ +struct poll_waiter { + /* Set when the waiter is created. */ + struct list node; /* Element in global waiters list. */ + int fd; /* File descriptor. */ + short int events; /* Events to wait for (POLLIN, POLLOUT). */ + poll_fd_func *function; /* Callback function, if any, or null. */ + void *aux; /* Argument to callback function. */ + struct backtrace *backtrace; /* Optionally, event that created waiter. */ + + /* Set only when poll_block() is called. */ + struct pollfd *pollfd; /* Pointer to element of the pollfds array + (null if added from a callback). */ +}; + +/* All active poll waiters. */ +static struct list waiters = LIST_INITIALIZER(&waiters); + +/* Number of elements in the waiters list. */ +static size_t n_waiters; + +/* Max time to wait in next call to poll_block(), in milliseconds, or -1 to + * wait forever. */ +static int timeout = -1; + +/* Backtrace of 'timeout''s registration, if debugging is enabled. */ +static struct backtrace timeout_backtrace; + +/* Callback currently running, to allow verifying that poll_cancel() is not + * being called on a running callback. */ +#ifndef NDEBUG +static struct poll_waiter *running_cb; +#endif + +static struct poll_waiter *new_waiter(int fd, short int events); + +/* Registers 'fd' as waiting for the specified 'events' (which should be POLLIN + * or POLLOUT or POLLIN | POLLOUT). The following call to poll_block() will + * wake up when 'fd' becomes ready for one or more of the requested events. + * + * The event registration is one-shot: only the following call to poll_block() + * is affected. The event will need to be re-registered after poll_block() is + * called if it is to persist. */ +struct poll_waiter * +poll_fd_wait(int fd, short int events) +{ + COVERAGE_INC(poll_fd_wait); + return new_waiter(fd, events); +} + +/* Causes the following call to poll_block() to block for no more than 'msec' + * milliseconds. If 'msec' is nonpositive, the following call to poll_block() + * will not block at all. + * + * The timer registration is one-shot: only the following call to poll_block() + * is affected. The timer will need to be re-registered after poll_block() is + * called if it is to persist. */ +void +poll_timer_wait(int msec) +{ + if (timeout < 0 || msec < timeout) { + timeout = MAX(0, msec); + if (VLOG_IS_DBG_ENABLED()) { + backtrace_capture(&timeout_backtrace); + } + } +} + +/* Causes the following call to poll_block() to wake up immediately, without + * blocking. */ +void +poll_immediate_wake(void) +{ + poll_timer_wait(0); +} + +static void PRINTF_FORMAT(2, 3) +log_wakeup(const struct backtrace *backtrace, const char *format, ...) +{ + struct ds ds; + va_list args; + + ds_init(&ds); + va_start(args, format); + ds_put_format_valist(&ds, format, args); + va_end(args); + + if (backtrace) { + int i; + + ds_put_char(&ds, ':'); + for (i = 0; i < backtrace->n_frames; i++) { + ds_put_format(&ds, " 0x%x", backtrace->frames[i]); + } + } + VLOG_DBG("%s", ds_cstr(&ds)); + ds_destroy(&ds); +} + +/* Blocks until one or more of the events registered with poll_fd_wait() + * occurs, or until the minimum duration registered with poll_timer_wait() + * elapses, or not at all if poll_immediate_wake() has been called. + * + * Also executes any autonomous subroutines registered with poll_fd_callback(), + * if their file descriptors have become ready. */ +void +poll_block(void) +{ + static struct pollfd *pollfds; + static size_t max_pollfds; + + struct poll_waiter *pw; + struct list *node; + int n_pollfds; + int retval; + + assert(!running_cb); + if (max_pollfds < n_waiters) { + max_pollfds = n_waiters; + pollfds = xrealloc(pollfds, max_pollfds * sizeof *pollfds); + } + + n_pollfds = 0; + LIST_FOR_EACH (pw, struct poll_waiter, node, &waiters) { + pw->pollfd = &pollfds[n_pollfds]; + pollfds[n_pollfds].fd = pw->fd; + pollfds[n_pollfds].events = pw->events; + pollfds[n_pollfds].revents = 0; + n_pollfds++; + } + + if (!timeout) { + COVERAGE_INC(poll_zero_timeout); + } + retval = time_poll(pollfds, n_pollfds, timeout); + if (retval < 0) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_ERR_RL(&rl, "poll: %s", strerror(-retval)); + } else if (!retval && VLOG_IS_DBG_ENABLED()) { + log_wakeup(&timeout_backtrace, "%d-ms timeout", timeout); + } + + for (node = waiters.next; node != &waiters; ) { + pw = CONTAINER_OF(node, struct poll_waiter, node); + if (!pw->pollfd || !pw->pollfd->revents) { + if (pw->function) { + node = node->next; + continue; + } + } else { + if (VLOG_IS_DBG_ENABLED()) { + log_wakeup(pw->backtrace, "%s%s%s%s%s on fd %d", + pw->pollfd->revents & POLLIN ? "[POLLIN]" : "", + pw->pollfd->revents & POLLOUT ? "[POLLOUT]" : "", + pw->pollfd->revents & POLLERR ? "[POLLERR]" : "", + pw->pollfd->revents & POLLHUP ? "[POLLHUP]" : "", + pw->pollfd->revents & POLLNVAL ? "[POLLNVAL]" : "", + pw->fd); + } + + if (pw->function) { +#ifndef NDEBUG + running_cb = pw; +#endif + pw->function(pw->fd, pw->pollfd->revents, pw->aux); +#ifndef NDEBUG + running_cb = NULL; +#endif + } + } + node = node->next; + poll_cancel(pw); + } + + timeout = -1; + timeout_backtrace.n_frames = 0; +} + +/* Registers 'function' to be called with argument 'aux' by poll_block() when + * 'fd' becomes ready for one of the events in 'events', which should be POLLIN + * or POLLOUT or POLLIN | POLLOUT. + * + * The callback registration persists until the event actually occurs. At that + * point, it is automatically de-registered. The callback function must + * re-register the event by calling poll_fd_callback() again within the + * callback, if it wants to be called back again later. */ +struct poll_waiter * +poll_fd_callback(int fd, short int events, poll_fd_func *function, void *aux) +{ + struct poll_waiter *pw = new_waiter(fd, events); + pw->function = function; + pw->aux = aux; + return pw; +} + +/* Cancels the file descriptor event registered with poll_fd_wait() or + * poll_fd_callback(). 'pw' must be the struct poll_waiter returned by one of + * those functions. + * + * An event registered with poll_fd_wait() may be canceled from its time of + * registration until the next call to poll_block(). At that point, the event + * is automatically canceled by the system and its poll_waiter is freed. + * + * An event registered with poll_fd_callback() may be canceled from its time of + * registration until its callback is actually called. At that point, the + * event is automatically canceled by the system and its poll_waiter is + * freed. */ +void +poll_cancel(struct poll_waiter *pw) +{ + if (pw) { + assert(pw != running_cb); + list_remove(&pw->node); + free(pw->backtrace); + free(pw); + n_waiters--; + } +} + +/* Creates and returns a new poll_waiter for 'fd' and 'events'. */ +static struct poll_waiter * +new_waiter(int fd, short int events) +{ + struct poll_waiter *waiter = xcalloc(1, sizeof *waiter); + assert(fd >= 0); + waiter->fd = fd; + waiter->events = events; + if (VLOG_IS_DBG_ENABLED()) { + waiter->backtrace = xmalloc(sizeof *waiter->backtrace); + backtrace_capture(waiter->backtrace); + } + list_push_back(&waiters, &waiter->node); + n_waiters++; + return waiter; +} diff --git a/lib/poll-loop.h b/lib/poll-loop.h new file mode 100644 index 00000000..188b4f19 --- /dev/null +++ b/lib/poll-loop.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* High-level wrapper around the "poll" system call. + * + * Intended usage is for the program's main loop to go about its business + * servicing whatever events it needs to. Then, when it runs out of immediate + * tasks, it calls each subordinate module's "wait" function, which in turn + * calls one (or more) of the functions poll_fd_wait(), poll_immediate_wake(), + * and poll_timer_wait() to register to be awakened when the appropriate event + * occurs. Then the main loop calls poll_block(), which blocks until one of + * the registered events happens. + * + * There is also some support for autonomous subroutines that are executed by + * poll_block() when a file descriptor becomes ready. To prevent these + * routines from starving if events are continuously ready, the application + * should bound the amount of work it does between poll_block() calls. */ + +#ifndef POLL_LOOP_H +#define POLL_LOOP_H 1 + +#include + +struct poll_waiter; + +/* Schedule events to wake up the following poll_block(). */ +struct poll_waiter *poll_fd_wait(int fd, short int events); +void poll_timer_wait(int msec); +void poll_immediate_wake(void); + +/* Wait until an event occurs. */ +void poll_block(void); + +/* Autonomous function callbacks. */ +typedef void poll_fd_func(int fd, short int revents, void *aux); +struct poll_waiter *poll_fd_callback(int fd, short int events, + poll_fd_func *, void *aux); + +/* Cancel a file descriptor callback or event. */ +void poll_cancel(struct poll_waiter *); + +#endif /* poll-loop.h */ diff --git a/lib/port-array.c b/lib/port-array.c new file mode 100644 index 00000000..87bb2168 --- /dev/null +++ b/lib/port-array.c @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "port-array.h" +#include + +static struct port_array_l2 l2_sentinel; +static struct port_array_l3 l3_sentinel; +static bool inited; + +/* Initializes 'pa' as an empty port_array. */ +void +port_array_init(struct port_array *pa) +{ + size_t i; + if (!inited) { + inited = true; + for (i = 0; i < PORT_ARRAY_L2_SIZE; i++) { + l2_sentinel.l2[i] = &l3_sentinel; + } + } + for (i = 0; i < PORT_ARRAY_L1_SIZE; i++) { + pa->l1[i] = &l2_sentinel; + } +} + +/* Frees all the memory allocated for 'pa'. It is the client's responsibility + * to free memory that 'pa' elements point to. */ +void +port_array_destroy(struct port_array *pa) +{ + unsigned int l1_idx; + + for (l1_idx = 0; l1_idx < PORT_ARRAY_L1_SIZE; l1_idx++) { + struct port_array_l2 *l2 = pa->l1[l1_idx]; + + if (l2 != &l2_sentinel) { + unsigned int l2_idx; + + for (l2_idx = 0; l2_idx < PORT_ARRAY_L2_SIZE; l2_idx++) { + struct port_array_l3 *l3 = l2->l2[l2_idx]; + if (l3 != &l3_sentinel) { + free(l3); + } + } + free(l2); + } + } +} + +/* Clears all elements of 'pa' to null pointers. */ +void +port_array_clear(struct port_array *pa) +{ + port_array_destroy(pa); + port_array_init(pa); +} + +/* Sets 'pa' element numbered 'idx' to 'p'. */ +void +port_array_set(struct port_array *pa, uint16_t idx, void *p) +{ + struct port_array_l2 **l2p, *l2; + struct port_array_l3 **l3p, *l3; + + /* Traverse level 1. */ + l2p = &pa->l1[PORT_ARRAY_L1(idx)]; + if (*l2p == &l2_sentinel) { + *l2p = xmemdup(&l2_sentinel, sizeof l2_sentinel); + } + l2 = *l2p; + + /* Traverse level 2. */ + l3p = &l2->l2[PORT_ARRAY_L2(idx)]; + if (*l3p == &l3_sentinel) { + *l3p = xmemdup(&l3_sentinel, sizeof l3_sentinel); + } + l3 = *l3p; + + /* Set level 3. */ + l3->l3[PORT_ARRAY_L3(idx)] = p; +} + +static void * +next(const struct port_array *pa, unsigned int *idxp) +{ + unsigned int idx = *idxp; + + /* Using shift-right directly here, instead of PORT_ARRAY_L1(idx), ensures + * that with an initially too-big value of '*idxp' we will skip the outer + * loop and return NULL. */ + unsigned int l1_idx = idx >> PORT_ARRAY_L1_SHIFT; + unsigned int l2_idx = PORT_ARRAY_L2(idx); + unsigned int l3_idx = PORT_ARRAY_L3(idx); + while (l1_idx < PORT_ARRAY_L1_SIZE) { + struct port_array_l2 *l2 = pa->l1[l1_idx]; + if (l2 != &l2_sentinel) { + while (l2_idx < PORT_ARRAY_L2_SIZE) { + struct port_array_l3 *l3 = l2->l2[l2_idx]; + if (l3 != &l3_sentinel) { + while (l3_idx < PORT_ARRAY_L3_SIZE) { + void *p = l3->l3[l3_idx]; + if (p) { + *idxp = ((l1_idx << PORT_ARRAY_L1_SHIFT) + | (l2_idx << PORT_ARRAY_L2_SHIFT) + | (l3_idx << PORT_ARRAY_L3_SHIFT)); + return p; + } + l3_idx++; + } + } + l2_idx++; + l3_idx = 0; + } + } + l1_idx++; + l2_idx = 0; + l3_idx = 0; + } + *idxp = PORT_ARRAY_SIZE; + return NULL; +} + +/* Returns the value of the lowest-numbered non-empty element of 'pa', and sets + * '*idxp' to that element's index. If 'pa' is entirely empty, returns a null + * pointer and sets '*idxp' to 65536. */ +void * +port_array_first(const struct port_array *pa, unsigned int *idxp) +{ + *idxp = 0; + return next(pa, idxp); +} + +/* Returns the value of the lowest-numbered non-empty element of 'pa' greater + * than the initial value of '*idxp', and sets '*idxp' to that element's index. + * If 'pa' contains no non-empty elements with indexes greater than the initial + * value of '*idxp', returns a null pointer and sets '*idxp' to 65536. */ +void * +port_array_next(const struct port_array *pa, unsigned int *idxp) +{ + ++*idxp; + return next(pa, idxp); +} + +/* Returns the number of non-null elements of 'pa'. */ +unsigned int +port_array_count(const struct port_array *pa) +{ + unsigned int l1_idx, l2_idx, l3_idx; + unsigned int count; + + count = 0; + for (l1_idx = 0; l1_idx < PORT_ARRAY_L1_SIZE; l1_idx++) { + struct port_array_l2 *l2 = pa->l1[l1_idx]; + if (l2 != &l2_sentinel) { + for (l2_idx = 0; l2_idx < PORT_ARRAY_L2_SIZE; l2_idx++) { + struct port_array_l3 *l3 = l2->l2[l2_idx]; + if (l3 != &l3_sentinel) { + for (l3_idx = 0; l3_idx < PORT_ARRAY_L3_SIZE; l3_idx++) { + if (l3->l3[l3_idx]) { + count++; + } + } + } + } + } + } + return count; +} diff --git a/lib/port-array.h b/lib/port-array.h new file mode 100644 index 00000000..e9b3cf11 --- /dev/null +++ b/lib/port-array.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef PORT_ARRAY_H +#define PORT_ARRAY_H 1 + +#include +#include "openflow/openflow.h" +#include "util.h" + +static inline uint16_t +port_array_extract_bits__(uint16_t data, int start, int count) +{ + return (data >> start) & ((1u << count) - 1); +} + +/* Level 1: most-significant bits. */ +#define PORT_ARRAY_L1_BITS 5 +#define PORT_ARRAY_L1_SHIFT (PORT_ARRAY_L3_BITS + PORT_ARRAY_L2_BITS) +#define PORT_ARRAY_L1_SIZE (1u << PORT_ARRAY_L1_BITS) +#define PORT_ARRAY_L1(IDX) \ + port_array_extract_bits__(IDX, PORT_ARRAY_L1_SHIFT, PORT_ARRAY_L1_BITS) + +/* Level 2: middle bits. */ +#define PORT_ARRAY_L2_BITS 5 +#define PORT_ARRAY_L2_SHIFT PORT_ARRAY_L3_BITS +#define PORT_ARRAY_L2_SIZE (1u << PORT_ARRAY_L2_BITS) +#define PORT_ARRAY_L2(IDX) \ + port_array_extract_bits__(IDX, PORT_ARRAY_L2_SHIFT, PORT_ARRAY_L2_BITS) + +/* Level 3: least-significant bits. */ +#define PORT_ARRAY_L3_BITS 6 +#define PORT_ARRAY_L3_SHIFT 0 +#define PORT_ARRAY_L3_SIZE (1u << PORT_ARRAY_L3_BITS) +#define PORT_ARRAY_L3(IDX) \ + port_array_extract_bits__(IDX, PORT_ARRAY_L3_SHIFT, PORT_ARRAY_L3_BITS) + +#define PORT_ARRAY_SIZE (1u << (PORT_ARRAY_L1_BITS \ + + PORT_ARRAY_L2_BITS \ + + PORT_ARRAY_L3_BITS)) + +BUILD_ASSERT_DECL(PORT_ARRAY_SIZE > 0xffff); + +/* A "sparse array" of up to 65536 elements (numbered 0...65535), implemented + * as a 3-level trie. Most efficient when the elements are clustered + * together. */ +struct port_array { + struct port_array_l2 *l1[1u << PORT_ARRAY_L1_BITS]; +}; + +struct port_array_l2 { + struct port_array_l3 *l2[1u << PORT_ARRAY_L2_BITS]; +}; + +struct port_array_l3 { + void *l3[1u << PORT_ARRAY_L3_BITS]; +}; + +/* Returns the value of the element numbered 'idx' in 'pa', or a null pointer + * if no element numbered 'idx' has been set. */ +static inline void * +port_array_get(const struct port_array *pa, uint16_t idx) +{ + unsigned int l1_idx = PORT_ARRAY_L1(idx); + unsigned int l2_idx = PORT_ARRAY_L2(idx); + unsigned int l3_idx = PORT_ARRAY_L3(idx); + return pa->l1[l1_idx]->l2[l2_idx]->l3[l3_idx]; +} + +void port_array_init(struct port_array *); +void port_array_destroy(struct port_array *); +void port_array_clear(struct port_array *); +void port_array_set(struct port_array *, uint16_t idx, void *); +void *port_array_first(const struct port_array *, unsigned int *); +void *port_array_next(const struct port_array *, unsigned int *); +unsigned int port_array_count(const struct port_array *); + +#define PORT_ARRAY_FOR_EACH(DATA, ARRAY, PORT_NO) \ + for ((DATA) = port_array_first(ARRAY, &(PORT_NO)); (DATA) != NULL; \ + (DATA) = port_array_next(ARRAY, &(PORT_NO))) + +#endif /* port-array.h */ diff --git a/lib/process.c b/lib/process.c new file mode 100644 index 00000000..79b5659f --- /dev/null +++ b/lib/process.c @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "process.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "coverage.h" +#include "dynamic-string.h" +#include "list.h" +#include "poll-loop.h" +#include "socket-util.h" +#include "util.h" + +#define THIS_MODULE VLM_process +#include "vlog.h" + +struct process { + struct list node; + char *name; + pid_t pid; + + /* Modified by signal handler. */ + volatile bool exited; + volatile int status; +}; + +/* Pipe used to signal child termination. */ +static int fds[2]; + +/* All processes. */ +static struct list all_processes = LIST_INITIALIZER(&all_processes); + +static void block_sigchld(sigset_t *); +static void unblock_sigchld(const sigset_t *); +static void sigchld_handler(int signr UNUSED); +static bool is_member(int x, const int *array, size_t); + +/* Initializes the process subsystem (if it is not already initialized). Calls + * exit() if initialization fails. + * + * Calling this function is optional; it will be called automatically by + * process_start() if necessary. Calling it explicitly allows the client to + * prevent the process from exiting at an unexpected time. */ +void +process_init(void) +{ + static bool inited; + struct sigaction sa; + + if (inited) { + return; + } + inited = true; + + /* Create notification pipe. */ + if (pipe(fds)) { + ovs_fatal(errno, "could not create pipe"); + } + set_nonblocking(fds[0]); + set_nonblocking(fds[1]); + + /* Set up child termination signal handler. */ + memset(&sa, 0, sizeof sa); + sa.sa_handler = sigchld_handler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_NOCLDSTOP | SA_RESTART; + if (sigaction(SIGCHLD, &sa, NULL)) { + ovs_fatal(errno, "sigaction(SIGCHLD) failed"); + } +} + +char * +process_escape_args(char **argv) +{ + struct ds ds = DS_EMPTY_INITIALIZER; + char **argp; + for (argp = argv; *argp; argp++) { + const char *arg = *argp; + const char *p; + if (argp != argv) { + ds_put_char(&ds, ' '); + } + if (arg[strcspn(arg, " \t\r\n\v\\")]) { + ds_put_char(&ds, '"'); + for (p = arg; *p; p++) { + if (*p == '\\' || *p == '\"') { + ds_put_char(&ds, '\\'); + } + ds_put_char(&ds, *p); + } + ds_put_char(&ds, '"'); + } else { + ds_put_cstr(&ds, arg); + } + } + return ds_cstr(&ds); +} + +/* Starts a subprocess with the arguments in the null-terminated argv[] array. + * argv[0] is used as the name of the process. Searches the PATH environment + * variable to find the program to execute. + * + * All file descriptors are closed before executing the subprocess, except for + * fds 0, 1, and 2 and the 'n_keep_fds' fds listed in 'keep_fds'. Also, any of + * the 'n_null_fds' fds listed in 'null_fds' are replaced by /dev/null. + * + * Returns 0 if successful, otherwise a positive errno value indicating the + * error. If successful, '*pp' is assigned a new struct process that may be + * used to query the process's status. On failure, '*pp' is set to NULL. */ +int +process_start(char **argv, + const int keep_fds[], size_t n_keep_fds, + const int null_fds[], size_t n_null_fds, + struct process **pp) +{ + sigset_t oldsigs; + char *binary; + pid_t pid; + + *pp = NULL; + process_init(); + COVERAGE_INC(process_start); + + if (VLOG_IS_DBG_ENABLED()) { + char *args = process_escape_args(argv); + VLOG_DBG("starting subprocess: %s", args); + free(args); + } + + /* execvp() will search PATH too, but the error in that case is more + * obscure, since it is only reported post-fork. */ + binary = process_search_path(argv[0]); + if (!binary) { + VLOG_ERR("%s not found in PATH", argv[0]); + return ENOENT; + } + free(binary); + + block_sigchld(&oldsigs); + pid = fork(); + if (pid < 0) { + unblock_sigchld(&oldsigs); + VLOG_WARN("fork failed: %s", strerror(errno)); + return errno; + } else if (pid) { + /* Running in parent process. */ + struct process *p; + const char *slash; + + p = xcalloc(1, sizeof *p); + p->pid = pid; + slash = strrchr(argv[0], '/'); + p->name = xstrdup(slash ? slash + 1 : argv[0]); + p->exited = false; + + list_push_back(&all_processes, &p->node); + unblock_sigchld(&oldsigs); + + *pp = p; + return 0; + } else { + /* Running in child process. */ + int fd_max = get_max_fds(); + int fd; + + unblock_sigchld(&oldsigs); + for (fd = 0; fd < fd_max; fd++) { + if (is_member(fd, null_fds, n_null_fds)) { + int nullfd = open("/dev/null", O_RDWR); + dup2(nullfd, fd); + close(nullfd); + } else if (fd >= 3 && !is_member(fd, keep_fds, n_keep_fds)) { + close(fd); + } + } + execvp(argv[0], argv); + fprintf(stderr, "execvp(\"%s\") failed: %s\n", + argv[0], strerror(errno)); + _exit(1); + } +} + +/* Destroys process 'p'. */ +void +process_destroy(struct process *p) +{ + if (p) { + sigset_t oldsigs; + + block_sigchld(&oldsigs); + list_remove(&p->node); + unblock_sigchld(&oldsigs); + + free(p->name); + free(p); + } +} + +/* Sends signal 'signr' to process 'p'. Returns 0 if successful, otherwise a + * positive errno value. */ +int +process_kill(const struct process *p, int signr) +{ + return (p->exited ? ESRCH + : !kill(p->pid, signr) ? 0 + : errno); +} + +/* Returns the pid of process 'p'. */ +pid_t +process_pid(const struct process *p) +{ + return p->pid; +} + +/* Returns the name of process 'p' (the name passed to process_start() with any + * leading directories stripped). */ +const char * +process_name(const struct process *p) +{ + return p->name; +} + +/* Returns true if process 'p' has exited, false otherwise. */ +bool +process_exited(struct process *p) +{ + if (p->exited) { + return true; + } else { + char buf[_POSIX_PIPE_BUF]; + read(fds[0], buf, sizeof buf); + return false; + } +} + +/* Returns process 'p''s exit status, as reported by waitpid(2). + * process_status(p) may be called only after process_exited(p) has returned + * true. */ +int +process_status(const struct process *p) +{ + assert(p->exited); + return p->status; +} + +int +process_run(char **argv, + const int keep_fds[], size_t n_keep_fds, + const int null_fds[], size_t n_null_fds, + int *status) +{ + struct process *p; + int retval; + + COVERAGE_INC(process_run); + retval = process_start(argv, keep_fds, n_keep_fds, null_fds, n_null_fds, + &p); + if (retval) { + *status = 0; + return retval; + } + + while (!process_exited(p)) { + process_wait(p); + poll_block(); + } + *status = process_status(p); + process_destroy(p); + return 0; +} + +/* Given 'status', which is a process status in the form reported by waitpid(2) + * and returned by process_status(), returns a string describing how the + * process terminated. The caller is responsible for freeing the string when + * it is no longer needed. */ +char * +process_status_msg(int status) +{ + struct ds ds = DS_EMPTY_INITIALIZER; + if (WIFEXITED(status)) { + ds_put_format(&ds, "exit status %d", WEXITSTATUS(status)); + } else if (WIFSIGNALED(status) || WIFSTOPPED(status)) { + int signr = WIFSIGNALED(status) ? WTERMSIG(status) : WSTOPSIG(status); + const char *name = NULL; +#ifdef HAVE_STRSIGNAL + name = strsignal(signr); +#endif + ds_put_format(&ds, "%s by signal %d", + WIFSIGNALED(status) ? "killed" : "stopped", signr); + if (name) { + ds_put_format(&ds, " (%s)", name); + } + } else { + ds_put_format(&ds, "terminated abnormally (%x)", status); + } + if (WCOREDUMP(status)) { + ds_put_cstr(&ds, ", core dumped"); + } + return ds_cstr(&ds); +} + +/* Causes the next call to poll_block() to wake up when process 'p' has + * exited. */ +void +process_wait(struct process *p) +{ + if (p->exited) { + poll_immediate_wake(); + } else { + poll_fd_wait(fds[0], POLLIN); + } +} + +char * +process_search_path(const char *name) +{ + char *save_ptr = NULL; + char *path, *dir; + struct stat s; + + if (strchr(name, '/') || !getenv("PATH")) { + return stat(name, &s) == 0 ? xstrdup(name) : NULL; + } + + path = xstrdup(getenv("PATH")); + for (dir = strtok_r(path, ":", &save_ptr); dir; + dir = strtok_r(NULL, ":", &save_ptr)) { + char *file = xasprintf("%s/%s", dir, name); + if (stat(file, &s) == 0) { + free(path); + return file; + } + free(file); + } + free(path); + return NULL; +} + +static void +sigchld_handler(int signr UNUSED) +{ + struct process *p; + + COVERAGE_INC(process_sigchld); + LIST_FOR_EACH (p, struct process, node, &all_processes) { + if (!p->exited) { + int retval, status; + do { + retval = waitpid(p->pid, &status, WNOHANG); + } while (retval == -1 && errno == EINTR); + if (retval == p->pid) { + p->exited = true; + p->status = status; + } else if (retval < 0) { + /* XXX We want to log something but we're in a signal + * handler. */ + p->exited = true; + p->status = -1; + } + } + } + write(fds[1], "", 1); +} + +static bool +is_member(int x, const int *array, size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) { + if (array[i] == x) { + return true; + } + } + return false; +} + +static void +block_sigchld(sigset_t *oldsigs) +{ + sigset_t sigchld; + sigemptyset(&sigchld); + sigaddset(&sigchld, SIGCHLD); + if (sigprocmask(SIG_BLOCK, &sigchld, oldsigs)) { + ovs_fatal(errno, "sigprocmask"); + } +} + +static void +unblock_sigchld(const sigset_t *oldsigs) +{ + if (sigprocmask(SIG_SETMASK, oldsigs, NULL)) { + ovs_fatal(errno, "sigprocmask"); + } +} diff --git a/lib/process.h b/lib/process.h new file mode 100644 index 00000000..d4aba3ae --- /dev/null +++ b/lib/process.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef PROCESS_H +#define PROCESS_H 1 + +#include +#include + +struct process; +void process_init(void); +char *process_escape_args(char **argv); +int process_start(char **argv, + const int *keep_fds, size_t n_keep_fds, + const int *null_fds, size_t n_null_fds, + struct process **); +void process_destroy(struct process *); +int process_kill(const struct process *, int signr); + +int process_run(char **argv, + const int *keep_fds, size_t n_keep_fds, + const int *null_fds, size_t n_null_fds, + int *status); + +pid_t process_pid(const struct process *); +const char *process_name(const struct process *); +bool process_exited(struct process *); +int process_status(const struct process *); +char *process_status_msg(int); + +void process_wait(struct process *); + +char *process_search_path(const char *); + +#endif /* process.h */ diff --git a/lib/queue.c b/lib/queue.c new file mode 100644 index 00000000..2e4c7ca6 --- /dev/null +++ b/lib/queue.c @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "queue.h" +#include +#include "compiler.h" +#include "leak-checker.h" +#include "ofpbuf.h" + +static void check_queue(struct ovs_queue *q); + +/* Initializes 'q' as an empty packet queue. */ +void +queue_init(struct ovs_queue *q) +{ + q->n = 0; + q->head = NULL; + q->tail = NULL; +} + +/* Destroys 'q' and all of the packets that it contains. */ +void +queue_destroy(struct ovs_queue *q) +{ + struct ofpbuf *cur, *next; + for (cur = q->head; cur != NULL; cur = next) { + next = cur->next; + ofpbuf_delete(cur); + } +} + +/* Removes and destroys all of the packets in 'q', rendering it empty. */ +void +queue_clear(struct ovs_queue *q) +{ + queue_destroy(q); + queue_init(q); +} + +/* Advances the first packet in 'q' from 'q->head' to 'next', which should be + * the second packet in the queue. + * + * The odd, unsafe interface here allows the first packet in the queue to be + * passed to a function for possible consumption (and destruction) and only + * dropped from the queue if that function actually accepts it. */ +void +queue_advance_head(struct ovs_queue *q, struct ofpbuf *next) +{ + assert(q->n); + assert(q->head); + q->head = next; + if (q->head == NULL) { + q->tail = NULL; + } + q->n--; +} + +/* Appends 'b' to the tail of 'q'. */ +void +queue_push_tail(struct ovs_queue *q, struct ofpbuf *b) +{ + check_queue(q); + leak_checker_claim(b); + + b->next = NULL; + if (q->n++) { + q->tail->next = b; + } else { + q->head = b; + } + q->tail = b; + + check_queue(q); +} + +/* Removes the first buffer from 'q', which must not be empty, and returns + * it. The caller must free the buffer (with ofpbuf_delete()) when it is no + * longer needed. */ +struct ofpbuf * +queue_pop_head(struct ovs_queue *q) +{ + struct ofpbuf *head = q->head; + queue_advance_head(q, head->next); + return head; +} + +/* Checks the internal integrity of 'q'. For use in debugging. */ +static void +check_queue(struct ovs_queue *q UNUSED) +{ +#if 0 + struct ofpbuf *iter; + size_t n; + + assert(q->n == 0 + ? q->head == NULL && q->tail == NULL + : q->head != NULL && q->tail != NULL); + + n = 0; + for (iter = q->head; iter != NULL; iter = iter->next) { + n++; + assert((iter->next != NULL) == (iter != q->tail)); + } + assert(n == q->n); +#endif +} diff --git a/lib/queue.h b/lib/queue.h new file mode 100644 index 00000000..597d5394 --- /dev/null +++ b/lib/queue.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef QUEUE_H +#define QUEUE_H 1 + +/* Packet queue. */ +struct ovs_queue { + int n; /* Number of queued packets. */ + struct ofpbuf *head; /* First queued packet, null if n == 0. */ + struct ofpbuf *tail; /* Last queued packet, null if n == 0. */ +}; + +void queue_init(struct ovs_queue *); +void queue_destroy(struct ovs_queue *); +void queue_clear(struct ovs_queue *); +void queue_advance_head(struct ovs_queue *, struct ofpbuf *next); +void queue_push_tail(struct ovs_queue *, struct ofpbuf *); +struct ofpbuf *queue_pop_head(struct ovs_queue *); + +#endif /* queue.h */ diff --git a/lib/random.c b/lib/random.c new file mode 100644 index 00000000..96713c50 --- /dev/null +++ b/lib/random.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "random.h" + +#include +#include +#include + +#include "util.h" + +void +random_init(void) +{ + static bool inited = false; + if (!inited) { + struct timeval tv; + inited = true; + if (gettimeofday(&tv, NULL) < 0) { + ovs_fatal(errno, "gettimeofday"); + } + srand(tv.tv_sec ^ tv.tv_usec); + } +} + +void +random_bytes(void *p_, size_t n) +{ + uint8_t *p = p_; + random_init(); + while (n--) { + *p++ = rand(); + } +} + +uint8_t +random_uint8(void) +{ + random_init(); + return rand(); +} + +uint16_t +random_uint16(void) +{ + if (RAND_MAX >= UINT16_MAX) { + random_init(); + return rand(); + } else { + uint16_t x; + random_bytes(&x, sizeof x); + return x; + } +} + +uint32_t +random_uint32(void) +{ + if (RAND_MAX >= UINT32_MAX) { + random_init(); + return rand(); + } else if (RAND_MAX == INT32_MAX) { + random_init(); + return rand() | ((rand() & 1u) << 31); + } else { + uint32_t x; + random_bytes(&x, sizeof x); + return x; + } +} + +int +random_range(int max) +{ + return random_uint32() % max; +} diff --git a/lib/random.h b/lib/random.h new file mode 100644 index 00000000..bba76933 --- /dev/null +++ b/lib/random.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef RANDOM_H +#define RANDOM_H 1 + +#include +#include + +void random_init(void); +void random_bytes(void *, size_t); +uint8_t random_uint8(void); +uint16_t random_uint16(void); +uint32_t random_uint32(void); +int random_range(int max); + +#endif /* random.h */ diff --git a/lib/rconn.c b/lib/rconn.c new file mode 100644 index 00000000..ead8d2d6 --- /dev/null +++ b/lib/rconn.c @@ -0,0 +1,959 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "rconn.h" +#include +#include +#include +#include +#include +#include "coverage.h" +#include "ofpbuf.h" +#include "openflow/openflow.h" +#include "poll-loop.h" +#include "sat-math.h" +#include "timeval.h" +#include "util.h" +#include "vconn.h" + +#define THIS_MODULE VLM_rconn +#include "vlog.h" + +#define STATES \ + STATE(VOID, 1 << 0) \ + STATE(BACKOFF, 1 << 1) \ + STATE(CONNECTING, 1 << 2) \ + STATE(ACTIVE, 1 << 3) \ + STATE(IDLE, 1 << 4) +enum state { +#define STATE(NAME, VALUE) S_##NAME = VALUE, + STATES +#undef STATE +}; + +static const char * +state_name(enum state state) +{ + switch (state) { +#define STATE(NAME, VALUE) case S_##NAME: return #NAME; + STATES +#undef STATE + } + return "***ERROR***"; +} + +/* A reliable connection to an OpenFlow switch or controller. + * + * See the large comment in rconn.h for more information. */ +struct rconn { + enum state state; + time_t state_entered; + + struct vconn *vconn; + char *name; + bool reliable; + + struct ovs_queue txq; + + int backoff; + int max_backoff; + time_t backoff_deadline; + time_t last_received; + time_t last_connected; + unsigned int packets_sent; + unsigned int seqno; + + /* In S_ACTIVE and S_IDLE, probably_admitted reports whether we believe + * that the peer has made a (positive) admission control decision on our + * connection. If we have not yet been (probably) admitted, then the + * connection does not reset the timer used for deciding whether the switch + * should go into fail-open mode. + * + * last_admitted reports the last time we believe such a positive admission + * control decision was made. */ + bool probably_admitted; + time_t last_admitted; + + /* These values are simply for statistics reporting, not used directly by + * anything internal to the rconn (or the secchan for that matter). */ + unsigned int packets_received; + unsigned int n_attempted_connections, n_successful_connections; + time_t creation_time; + unsigned long int total_time_connected; + + /* If we can't connect to the peer, it could be for any number of reasons. + * Usually, one would assume it is because the peer is not running or + * because the network is partitioned. But it could also be because the + * network topology has changed, in which case the upper layer will need to + * reassess it (in particular, obtain a new IP address via DHCP and find + * the new location of the controller). We set this flag when we suspect + * that this could be the case. */ + bool questionable_connectivity; + time_t last_questioned; + + /* Throughout this file, "probe" is shorthand for "inactivity probe". + * When nothing has been received from the peer for a while, we send out + * an echo request as an inactivity probe packet. We should receive back + * a response. */ + int probe_interval; /* Secs of inactivity before sending probe. */ + + /* Messages sent or received are copied to the monitor connections. */ +#define MAX_MONITORS 8 + struct vconn *monitors[8]; + size_t n_monitors; +}; + +static unsigned int elapsed_in_this_state(const struct rconn *); +static unsigned int timeout(const struct rconn *); +static bool timed_out(const struct rconn *); +static void state_transition(struct rconn *, enum state); +static int try_send(struct rconn *); +static int reconnect(struct rconn *); +static void disconnect(struct rconn *, int error); +static void flush_queue(struct rconn *); +static void question_connectivity(struct rconn *); +static void copy_to_monitor(struct rconn *, const struct ofpbuf *); +static bool is_connected_state(enum state); +static bool is_admitted_msg(const struct ofpbuf *); + +/* Creates a new rconn, connects it (reliably) to 'name', and returns it. */ +struct rconn * +rconn_new(const char *name, int inactivity_probe_interval, int max_backoff) +{ + struct rconn *rc = rconn_create(inactivity_probe_interval, max_backoff); + rconn_connect(rc, name); + return rc; +} + +/* Creates a new rconn, connects it (unreliably) to 'vconn', and returns it. */ +struct rconn * +rconn_new_from_vconn(const char *name, struct vconn *vconn) +{ + struct rconn *rc = rconn_create(60, 0); + rconn_connect_unreliably(rc, name, vconn); + return rc; +} + +/* Creates and returns a new rconn. + * + * 'probe_interval' is a number of seconds. If the interval passes once + * without an OpenFlow message being received from the peer, the rconn sends + * out an "echo request" message. If the interval passes again without a + * message being received, the rconn disconnects and re-connects to the peer. + * Setting 'probe_interval' to 0 disables this behavior. + * + * 'max_backoff' is the maximum number of seconds between attempts to connect + * to the peer. The actual interval starts at 1 second and doubles on each + * failure until it reaches 'max_backoff'. If 0 is specified, the default of + * 60 seconds is used. */ +struct rconn * +rconn_create(int probe_interval, int max_backoff) +{ + struct rconn *rc = xcalloc(1, sizeof *rc); + + rc->state = S_VOID; + rc->state_entered = time_now(); + + rc->vconn = NULL; + rc->name = xstrdup("void"); + rc->reliable = false; + + queue_init(&rc->txq); + + rc->backoff = 0; + rc->max_backoff = max_backoff ? max_backoff : 60; + rc->backoff_deadline = TIME_MIN; + rc->last_received = time_now(); + rc->last_connected = time_now(); + rc->seqno = 0; + + rc->packets_sent = 0; + + rc->probably_admitted = false; + rc->last_admitted = time_now(); + + rc->packets_received = 0; + rc->n_attempted_connections = 0; + rc->n_successful_connections = 0; + rc->creation_time = time_now(); + rc->total_time_connected = 0; + + rc->questionable_connectivity = false; + rc->last_questioned = time_now(); + + rconn_set_probe_interval(rc, probe_interval); + + rc->n_monitors = 0; + + return rc; +} + +void +rconn_set_max_backoff(struct rconn *rc, int max_backoff) +{ + rc->max_backoff = MAX(1, max_backoff); + if (rc->state == S_BACKOFF && rc->backoff > max_backoff) { + rc->backoff = max_backoff; + if (rc->backoff_deadline > time_now() + max_backoff) { + rc->backoff_deadline = time_now() + max_backoff; + } + } +} + +int +rconn_get_max_backoff(const struct rconn *rc) +{ + return rc->max_backoff; +} + +void +rconn_set_probe_interval(struct rconn *rc, int probe_interval) +{ + rc->probe_interval = probe_interval ? MAX(5, probe_interval) : 0; +} + +int +rconn_get_probe_interval(const struct rconn *rc) +{ + return rc->probe_interval; +} + +int +rconn_connect(struct rconn *rc, const char *name) +{ + rconn_disconnect(rc); + free(rc->name); + rc->name = xstrdup(name); + rc->reliable = true; + return reconnect(rc); +} + +void +rconn_connect_unreliably(struct rconn *rc, + const char *name, struct vconn *vconn) +{ + assert(vconn != NULL); + rconn_disconnect(rc); + free(rc->name); + rc->name = xstrdup(name); + rc->reliable = false; + rc->vconn = vconn; + rc->last_connected = time_now(); + state_transition(rc, S_ACTIVE); +} + +/* If 'rc' is connected, forces it to drop the connection and reconnect. */ +void +rconn_reconnect(struct rconn *rc) +{ + if (rc->state & (S_ACTIVE | S_IDLE)) { + disconnect(rc, 0); + } +} + +void +rconn_disconnect(struct rconn *rc) +{ + if (rc->state != S_VOID) { + if (rc->vconn) { + vconn_close(rc->vconn); + rc->vconn = NULL; + } + free(rc->name); + rc->name = xstrdup("void"); + rc->reliable = false; + + rc->backoff = 0; + rc->backoff_deadline = TIME_MIN; + + state_transition(rc, S_VOID); + } +} + +/* Disconnects 'rc' and frees the underlying storage. */ +void +rconn_destroy(struct rconn *rc) +{ + if (rc) { + size_t i; + + free(rc->name); + vconn_close(rc->vconn); + flush_queue(rc); + queue_destroy(&rc->txq); + for (i = 0; i < rc->n_monitors; i++) { + vconn_close(rc->monitors[i]); + } + free(rc); + } +} + +static unsigned int +timeout_VOID(const struct rconn *rc UNUSED) +{ + return UINT_MAX; +} + +static void +run_VOID(struct rconn *rc UNUSED) +{ + /* Nothing to do. */ +} + +static int +reconnect(struct rconn *rc) +{ + int retval; + + VLOG_INFO("%s: connecting...", rc->name); + rc->n_attempted_connections++; + retval = vconn_open(rc->name, OFP_VERSION, &rc->vconn); + if (!retval) { + rc->backoff_deadline = time_now() + rc->backoff; + state_transition(rc, S_CONNECTING); + } else { + VLOG_WARN("%s: connection failed (%s)", rc->name, strerror(retval)); + rc->backoff_deadline = TIME_MAX; /* Prevent resetting backoff. */ + disconnect(rc, 0); + } + return retval; +} + +static unsigned int +timeout_BACKOFF(const struct rconn *rc) +{ + return rc->backoff; +} + +static void +run_BACKOFF(struct rconn *rc) +{ + if (timed_out(rc)) { + reconnect(rc); + } +} + +static unsigned int +timeout_CONNECTING(const struct rconn *rc) +{ + return MAX(1, rc->backoff); +} + +static void +run_CONNECTING(struct rconn *rc) +{ + int retval = vconn_connect(rc->vconn); + if (!retval) { + VLOG_INFO("%s: connected", rc->name); + rc->n_successful_connections++; + state_transition(rc, S_ACTIVE); + rc->last_connected = rc->state_entered; + } else if (retval != EAGAIN) { + VLOG_INFO("%s: connection failed (%s)", rc->name, strerror(retval)); + disconnect(rc, retval); + } else if (timed_out(rc)) { + VLOG_INFO("%s: connection timed out", rc->name); + rc->backoff_deadline = TIME_MAX; /* Prevent resetting backoff. */ + disconnect(rc, 0); + } +} + +static void +do_tx_work(struct rconn *rc) +{ + if (!rc->txq.n) { + return; + } + while (rc->txq.n > 0) { + int error = try_send(rc); + if (error) { + break; + } + } + if (!rc->txq.n) { + poll_immediate_wake(); + } +} + +static unsigned int +timeout_ACTIVE(const struct rconn *rc) +{ + if (rc->probe_interval) { + unsigned int base = MAX(rc->last_received, rc->state_entered); + unsigned int arg = base + rc->probe_interval - rc->state_entered; + return arg; + } + return UINT_MAX; +} + +static void +run_ACTIVE(struct rconn *rc) +{ + if (timed_out(rc)) { + unsigned int base = MAX(rc->last_received, rc->state_entered); + VLOG_DBG("%s: idle %u seconds, sending inactivity probe", + rc->name, (unsigned int) (time_now() - base)); + + /* Ordering is important here: rconn_send() can transition to BACKOFF, + * and we don't want to transition back to IDLE if so, because then we + * can end up queuing a packet with vconn == NULL and then *boom*. */ + state_transition(rc, S_IDLE); + rconn_send(rc, make_echo_request(), NULL); + return; + } + + do_tx_work(rc); +} + +static unsigned int +timeout_IDLE(const struct rconn *rc) +{ + return rc->probe_interval; +} + +static void +run_IDLE(struct rconn *rc) +{ + if (timed_out(rc)) { + question_connectivity(rc); + VLOG_ERR("%s: no response to inactivity probe after %u " + "seconds, disconnecting", + rc->name, elapsed_in_this_state(rc)); + disconnect(rc, 0); + } else { + do_tx_work(rc); + } +} + +/* Performs whatever activities are necessary to maintain 'rc': if 'rc' is + * disconnected, attempts to (re)connect, backing off as necessary; if 'rc' is + * connected, attempts to send packets in the send queue, if any. */ +void +rconn_run(struct rconn *rc) +{ + int old_state; + do { + old_state = rc->state; + switch (rc->state) { +#define STATE(NAME, VALUE) case S_##NAME: run_##NAME(rc); break; + STATES +#undef STATE + default: + NOT_REACHED(); + } + } while (rc->state != old_state); +} + +/* Causes the next call to poll_block() to wake up when rconn_run() should be + * called on 'rc'. */ +void +rconn_run_wait(struct rconn *rc) +{ + unsigned int timeo = timeout(rc); + if (timeo != UINT_MAX) { + unsigned int expires = sat_add(rc->state_entered, timeo); + unsigned int remaining = sat_sub(expires, time_now()); + poll_timer_wait(sat_mul(remaining, 1000)); + } + + if ((rc->state & (S_ACTIVE | S_IDLE)) && rc->txq.n) { + vconn_wait(rc->vconn, WAIT_SEND); + } +} + +/* Attempts to receive a packet from 'rc'. If successful, returns the packet; + * otherwise, returns a null pointer. The caller is responsible for freeing + * the packet (with ofpbuf_delete()). */ +struct ofpbuf * +rconn_recv(struct rconn *rc) +{ + if (rc->state & (S_ACTIVE | S_IDLE)) { + struct ofpbuf *buffer; + int error = vconn_recv(rc->vconn, &buffer); + if (!error) { + copy_to_monitor(rc, buffer); + if (is_admitted_msg(buffer) + || time_now() - rc->last_connected >= 30) { + rc->probably_admitted = true; + rc->last_admitted = time_now(); + } + rc->last_received = time_now(); + rc->packets_received++; + if (rc->state == S_IDLE) { + state_transition(rc, S_ACTIVE); + } + return buffer; + } else if (error != EAGAIN) { + disconnect(rc, error); + } + } + return NULL; +} + +/* Causes the next call to poll_block() to wake up when a packet may be ready + * to be received by vconn_recv() on 'rc'. */ +void +rconn_recv_wait(struct rconn *rc) +{ + if (rc->vconn) { + vconn_wait(rc->vconn, WAIT_RECV); + } +} + +/* Sends 'b' on 'rc'. Returns 0 if successful (in which case 'b' is + * destroyed), or ENOTCONN if 'rc' is not currently connected (in which case + * the caller retains ownership of 'b'). + * + * If 'counter' is non-null, then 'counter' will be incremented while the + * packet is in flight, then decremented when it has been sent (or discarded + * due to disconnection). Because 'b' may be sent (or discarded) before this + * function returns, the caller may not be able to observe any change in + * 'counter'. + * + * There is no rconn_send_wait() function: an rconn has a send queue that it + * takes care of sending if you call rconn_run(), which will have the side + * effect of waking up poll_block(). */ +int +rconn_send(struct rconn *rc, struct ofpbuf *b, + struct rconn_packet_counter *counter) +{ + if (rconn_is_connected(rc)) { + COVERAGE_INC(rconn_queued); + copy_to_monitor(rc, b); + b->private = counter; + if (counter) { + rconn_packet_counter_inc(counter); + } + queue_push_tail(&rc->txq, b); + + /* If the queue was empty before we added 'b', try to send some + * packets. (But if the queue had packets in it, it's because the + * vconn is backlogged and there's no point in stuffing more into it + * now. We'll get back to that in rconn_run().) */ + if (rc->txq.n == 1) { + try_send(rc); + } + return 0; + } else { + return ENOTCONN; + } +} + +/* Sends 'b' on 'rc'. Increments 'counter' while the packet is in flight; it + * will be decremented when it has been sent (or discarded due to + * disconnection). Returns 0 if successful, EAGAIN if 'counter->n' is already + * at least as large as 'queue_limit', or ENOTCONN if 'rc' is not currently + * connected. Regardless of return value, 'b' is destroyed. + * + * Because 'b' may be sent (or discarded) before this function returns, the + * caller may not be able to observe any change in 'counter'. + * + * There is no rconn_send_wait() function: an rconn has a send queue that it + * takes care of sending if you call rconn_run(), which will have the side + * effect of waking up poll_block(). */ +int +rconn_send_with_limit(struct rconn *rc, struct ofpbuf *b, + struct rconn_packet_counter *counter, int queue_limit) +{ + int retval; + retval = counter->n >= queue_limit ? EAGAIN : rconn_send(rc, b, counter); + if (retval) { + COVERAGE_INC(rconn_overflow); + ofpbuf_delete(b); + } + return retval; +} + +/* Returns the total number of packets successfully sent on the underlying + * vconn. A packet is not counted as sent while it is still queued in the + * rconn, only when it has been successfuly passed to the vconn. */ +unsigned int +rconn_packets_sent(const struct rconn *rc) +{ + return rc->packets_sent; +} + +/* Adds 'vconn' to 'rc' as a monitoring connection, to which all messages sent + * and received on 'rconn' will be copied. 'rc' takes ownership of 'vconn'. */ +void +rconn_add_monitor(struct rconn *rc, struct vconn *vconn) +{ + if (rc->n_monitors < ARRAY_SIZE(rc->monitors)) { + VLOG_INFO("new monitor connection from %s", vconn_get_name(vconn)); + rc->monitors[rc->n_monitors++] = vconn; + } else { + VLOG_DBG("too many monitor connections, discarding %s", + vconn_get_name(vconn)); + vconn_close(vconn); + } +} + +/* Returns 'rc''s name (the 'name' argument passed to rconn_new()). */ +const char * +rconn_get_name(const struct rconn *rc) +{ + return rc->name; +} + +/* Returns true if 'rconn' is connected or in the process of reconnecting, + * false if 'rconn' is disconnected and will not reconnect on its own. */ +bool +rconn_is_alive(const struct rconn *rconn) +{ + return rconn->state != S_VOID; +} + +/* Returns true if 'rconn' is connected, false otherwise. */ +bool +rconn_is_connected(const struct rconn *rconn) +{ + return is_connected_state(rconn->state); +} + +/* Returns 0 if 'rconn' is connected. Otherwise, if 'rconn' is in a "failure + * mode" (that is, it is not connected), returns the number of seconds that it + * has been in failure mode, ignoring any times that it connected but the + * controller's admission control policy caused it to be quickly + * disconnected. */ +int +rconn_failure_duration(const struct rconn *rconn) +{ + return rconn_is_connected(rconn) ? 0 : time_now() - rconn->last_admitted; +} + +/* Returns the IP address of the peer, or 0 if the peer is not connected over + * an IP-based protocol or if its IP address is not known. */ +uint32_t +rconn_get_ip(const struct rconn *rconn) +{ + return rconn->vconn ? vconn_get_ip(rconn->vconn) : 0; +} + +/* If 'rconn' can't connect to the peer, it could be for any number of reasons. + * Usually, one would assume it is because the peer is not running or because + * the network is partitioned. But it could also be because the network + * topology has changed, in which case the upper layer will need to reassess it + * (in particular, obtain a new IP address via DHCP and find the new location + * of the controller). When this appears that this might be the case, this + * function returns true. It also clears the questionability flag and prevents + * it from being set again for some time. */ +bool +rconn_is_connectivity_questionable(struct rconn *rconn) +{ + bool questionable = rconn->questionable_connectivity; + rconn->questionable_connectivity = false; + return questionable; +} + +/* Returns the total number of packets successfully received by the underlying + * vconn. */ +unsigned int +rconn_packets_received(const struct rconn *rc) +{ + return rc->packets_received; +} + +/* Returns a string representing the internal state of 'rc'. The caller must + * not modify or free the string. */ +const char * +rconn_get_state(const struct rconn *rc) +{ + return state_name(rc->state); +} + +/* Returns the number of connection attempts made by 'rc', including any + * ongoing attempt that has not yet succeeded or failed. */ +unsigned int +rconn_get_attempted_connections(const struct rconn *rc) +{ + return rc->n_attempted_connections; +} + +/* Returns the number of successful connection attempts made by 'rc'. */ +unsigned int +rconn_get_successful_connections(const struct rconn *rc) +{ + return rc->n_successful_connections; +} + +/* Returns the time at which the last successful connection was made by + * 'rc'. */ +time_t +rconn_get_last_connection(const struct rconn *rc) +{ + return rc->last_connected; +} + +/* Returns the time at which 'rc' was created. */ +time_t +rconn_get_creation_time(const struct rconn *rc) +{ + return rc->creation_time; +} + +/* Returns the approximate number of seconds that 'rc' has been connected. */ +unsigned long int +rconn_get_total_time_connected(const struct rconn *rc) +{ + return (rc->total_time_connected + + (rconn_is_connected(rc) ? elapsed_in_this_state(rc) : 0)); +} + +/* Returns the current amount of backoff, in seconds. This is the amount of + * time after which the rconn will transition from BACKOFF to CONNECTING. */ +int +rconn_get_backoff(const struct rconn *rc) +{ + return rc->backoff; +} + +/* Returns the number of seconds spent in this state so far. */ +unsigned int +rconn_get_state_elapsed(const struct rconn *rc) +{ + return elapsed_in_this_state(rc); +} + +/* Returns 'rc''s current connection sequence number, a number that changes + * every time that 'rconn' connects or disconnects. */ +unsigned int +rconn_get_connection_seqno(const struct rconn *rc) +{ + return rc->seqno; +} + +struct rconn_packet_counter * +rconn_packet_counter_create(void) +{ + struct rconn_packet_counter *c = xmalloc(sizeof *c); + c->n = 0; + c->ref_cnt = 1; + return c; +} + +void +rconn_packet_counter_destroy(struct rconn_packet_counter *c) +{ + if (c) { + assert(c->ref_cnt > 0); + if (!--c->ref_cnt && !c->n) { + free(c); + } + } +} + +void +rconn_packet_counter_inc(struct rconn_packet_counter *c) +{ + c->n++; +} + +void +rconn_packet_counter_dec(struct rconn_packet_counter *c) +{ + assert(c->n > 0); + if (!--c->n && !c->ref_cnt) { + free(c); + } +} + +/* Tries to send a packet from 'rc''s send buffer. Returns 0 if successful, + * otherwise a positive errno value. */ +static int +try_send(struct rconn *rc) +{ + int retval = 0; + struct ofpbuf *next = rc->txq.head->next; + struct rconn_packet_counter *counter = rc->txq.head->private; + retval = vconn_send(rc->vconn, rc->txq.head); + if (retval) { + if (retval != EAGAIN) { + disconnect(rc, retval); + } + return retval; + } + COVERAGE_INC(rconn_sent); + rc->packets_sent++; + if (counter) { + rconn_packet_counter_dec(counter); + } + queue_advance_head(&rc->txq, next); + return 0; +} + +/* Disconnects 'rc'. 'error' is used only for logging purposes. If it is + * nonzero, then it should be EOF to indicate the connection was closed by the + * peer in a normal fashion or a positive errno value. */ +static void +disconnect(struct rconn *rc, int error) +{ + if (rc->reliable) { + time_t now = time_now(); + + if (rc->state & (S_CONNECTING | S_ACTIVE | S_IDLE)) { + if (error > 0) { + VLOG_WARN("%s: connection dropped (%s)", + rc->name, strerror(error)); + } else if (error == EOF) { + if (rc->reliable) { + VLOG_INFO("%s: connection closed by peer", rc->name); + } + } else { + VLOG_INFO("%s: connection dropped", rc->name); + } + vconn_close(rc->vconn); + rc->vconn = NULL; + flush_queue(rc); + } + + if (now >= rc->backoff_deadline) { + rc->backoff = 1; + } else { + rc->backoff = MIN(rc->max_backoff, MAX(1, 2 * rc->backoff)); + VLOG_INFO("%s: waiting %d seconds before reconnect\n", + rc->name, rc->backoff); + } + rc->backoff_deadline = now + rc->backoff; + state_transition(rc, S_BACKOFF); + if (now - rc->last_connected > 60) { + question_connectivity(rc); + } + } else { + rconn_disconnect(rc); + } +} + +/* Drops all the packets from 'rc''s send queue and decrements their queue + * counts. */ +static void +flush_queue(struct rconn *rc) +{ + if (!rc->txq.n) { + return; + } + while (rc->txq.n > 0) { + struct ofpbuf *b = queue_pop_head(&rc->txq); + struct rconn_packet_counter *counter = b->private; + if (counter) { + rconn_packet_counter_dec(counter); + } + COVERAGE_INC(rconn_discarded); + ofpbuf_delete(b); + } + poll_immediate_wake(); +} + +static unsigned int +elapsed_in_this_state(const struct rconn *rc) +{ + return time_now() - rc->state_entered; +} + +static unsigned int +timeout(const struct rconn *rc) +{ + switch (rc->state) { +#define STATE(NAME, VALUE) case S_##NAME: return timeout_##NAME(rc); + STATES +#undef STATE + default: + NOT_REACHED(); + } +} + +static bool +timed_out(const struct rconn *rc) +{ + return time_now() >= sat_add(rc->state_entered, timeout(rc)); +} + +static void +state_transition(struct rconn *rc, enum state state) +{ + rc->seqno += (rc->state == S_ACTIVE) != (state == S_ACTIVE); + if (is_connected_state(state) && !is_connected_state(rc->state)) { + rc->probably_admitted = false; + } + if (rconn_is_connected(rc)) { + rc->total_time_connected += elapsed_in_this_state(rc); + } + VLOG_DBG("%s: entering %s", rc->name, state_name(state)); + rc->state = state; + rc->state_entered = time_now(); +} + +static void +question_connectivity(struct rconn *rc) +{ + time_t now = time_now(); + if (now - rc->last_questioned > 60) { + rc->questionable_connectivity = true; + rc->last_questioned = now; + } +} + +static void +copy_to_monitor(struct rconn *rc, const struct ofpbuf *b) +{ + struct ofpbuf *clone = NULL; + int retval; + size_t i; + + for (i = 0; i < rc->n_monitors; ) { + struct vconn *vconn = rc->monitors[i]; + + if (!clone) { + clone = ofpbuf_clone(b); + } + retval = vconn_send(vconn, clone); + if (!retval) { + clone = NULL; + } else if (retval != EAGAIN) { + VLOG_DBG("%s: closing monitor connection to %s: %s", + rconn_get_name(rc), vconn_get_name(vconn), + strerror(retval)); + rc->monitors[i] = rc->monitors[--rc->n_monitors]; + continue; + } + i++; + } + ofpbuf_delete(clone); +} + +static bool +is_connected_state(enum state state) +{ + return (state & (S_ACTIVE | S_IDLE)) != 0; +} + +static bool +is_admitted_msg(const struct ofpbuf *b) +{ + struct ofp_header *oh = b->data; + uint8_t type = oh->type; + return !(type < 32 + && (1u << type) & ((1u << OFPT_HELLO) | + (1u << OFPT_ERROR) | + (1u << OFPT_ECHO_REQUEST) | + (1u << OFPT_ECHO_REPLY) | + (1u << OFPT_VENDOR) | + (1u << OFPT_FEATURES_REQUEST) | + (1u << OFPT_FEATURES_REPLY) | + (1u << OFPT_GET_CONFIG_REQUEST) | + (1u << OFPT_GET_CONFIG_REPLY) | + (1u << OFPT_SET_CONFIG))); +} diff --git a/lib/rconn.h b/lib/rconn.h new file mode 100644 index 00000000..837bc538 --- /dev/null +++ b/lib/rconn.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef RCONN_H +#define RCONN_H 1 + +#include "queue.h" +#include +#include +#include + +/* A wrapper around vconn that provides queuing and optionally reliability. + * + * An rconn maintains a message transmission queue of bounded length specified + * by the caller. The rconn does not guarantee reliable delivery of + * queued messages: all queued messages are dropped when reconnection becomes + * necessary. + * + * An rconn optionally provides reliable communication, in this sense: the + * rconn will re-connect, with exponential backoff, when the underlying vconn + * disconnects. + */ + +struct vconn; +struct rconn_packet_counter; + +struct rconn *rconn_new(const char *name, + int inactivity_probe_interval, int max_backoff); +struct rconn *rconn_new_from_vconn(const char *name, struct vconn *); +struct rconn *rconn_create(int inactivity_probe_interval, int max_backoff); + +void rconn_set_max_backoff(struct rconn *, int max_backoff); +int rconn_get_max_backoff(const struct rconn *); +void rconn_set_probe_interval(struct rconn *, int inactivity_probe_interval); +int rconn_get_probe_interval(const struct rconn *); + +int rconn_connect(struct rconn *, const char *name); +void rconn_connect_unreliably(struct rconn *, + const char *name, struct vconn *vconn); +void rconn_reconnect(struct rconn *); +void rconn_disconnect(struct rconn *); +void rconn_destroy(struct rconn *); + +void rconn_run(struct rconn *); +void rconn_run_wait(struct rconn *); +struct ofpbuf *rconn_recv(struct rconn *); +void rconn_recv_wait(struct rconn *); +int rconn_send(struct rconn *, struct ofpbuf *, struct rconn_packet_counter *); +int rconn_send_with_limit(struct rconn *, struct ofpbuf *, + struct rconn_packet_counter *, int queue_limit); +unsigned int rconn_packets_sent(const struct rconn *); +unsigned int rconn_packets_received(const struct rconn *); + +void rconn_add_monitor(struct rconn *, struct vconn *); + +const char *rconn_get_name(const struct rconn *); +bool rconn_is_alive(const struct rconn *); +bool rconn_is_connected(const struct rconn *); +int rconn_failure_duration(const struct rconn *); +bool rconn_is_connectivity_questionable(struct rconn *); + +uint32_t rconn_get_ip(const struct rconn *); + +const char *rconn_get_state(const struct rconn *); +unsigned int rconn_get_attempted_connections(const struct rconn *); +unsigned int rconn_get_successful_connections(const struct rconn *); +time_t rconn_get_last_connection(const struct rconn *); +time_t rconn_get_creation_time(const struct rconn *); +unsigned long int rconn_get_total_time_connected(const struct rconn *); +int rconn_get_backoff(const struct rconn *); +unsigned int rconn_get_state_elapsed(const struct rconn *); +unsigned int rconn_get_connection_seqno(const struct rconn *); + +/* Counts the number of packets queued into an rconn by a given source. */ +struct rconn_packet_counter { + int n; /* Number of packets queued. */ + int ref_cnt; /* Number of owners. */ +}; + +struct rconn_packet_counter *rconn_packet_counter_create(void); +void rconn_packet_counter_destroy(struct rconn_packet_counter *); +void rconn_packet_counter_inc(struct rconn_packet_counter *); +void rconn_packet_counter_dec(struct rconn_packet_counter *); + +static inline int +rconn_packet_counter_read(const struct rconn_packet_counter *counter) +{ + return counter->n; +} + +#endif /* rconn.h */ diff --git a/lib/sat-math.h b/lib/sat-math.h new file mode 100644 index 00000000..84ff51ac --- /dev/null +++ b/lib/sat-math.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef SAT_MATH_H +#define SAT_MATH_H 1 + +#include +#include + +/* Saturating addition: overflow yields UINT_MAX. */ +static inline unsigned int +sat_add(unsigned int x, unsigned int y) +{ + return x + y >= x ? x + y : UINT_MAX; +} + +/* Saturating subtraction: underflow yields 0. */ +static inline unsigned int +sat_sub(unsigned int x, unsigned int y) +{ + return x >= y ? x - y : 0; +} + +/* Saturating multiplication: overflow yields UINT_MAX. */ +static inline unsigned int +sat_mul(unsigned int x, unsigned int y) +{ + return (!y ? 0 + : x <= UINT_MAX / y ? x * y + : UINT_MAX); +} + +#endif /* sat-math.h */ diff --git a/lib/sha1.c b/lib/sha1.c new file mode 100644 index 00000000..5fc763f8 --- /dev/null +++ b/lib/sha1.c @@ -0,0 +1,394 @@ +/* + * sha1.c + * + * Description: + * This file implements the Secure Hashing Algorithm 1 as + * defined in FIPS PUB 180-1 published April 17, 1995. + * + * The SHA-1, produces a 160-bit message digest for a given + * data stream. It should take about 2**n steps to find a + * message with the same digest as a given message and + * 2**(n/2) to find any two messages with the same digest, + * when n is the digest size in bits. Therefore, this + * algorithm can serve as a means of providing a + * "fingerprint" for a message. + * + * Portability Issues: + * SHA-1 is defined in terms of 32-bit "words". This code + * uses (included via "sha1.h" to define 32 and 8 + * bit unsigned integer types. If your C compiler does not + * support 32 bit unsigned integers, this code is not + * appropriate. + * + * Caveats: + * SHA-1 is designed to work with messages less than 2^64 bits + * long. Although SHA-1 allows a message digest to be generated + * for messages of any number of bits less than 2^64, this + * implementation only works with messages with a length that is + * a multiple of the size of an 8-bit character. + * + */ + +#include "sha1.h" + +/* + * Define the SHA1 circular left shift macro + */ +#define SHA1CircularShift(bits,word) \ + (((word) << (bits)) | ((word) >> (32-(bits)))) + +/* Local Function Prototyptes */ +void SHA1PadMessage(SHA1Context *); +void SHA1ProcessMessageBlock(SHA1Context *); + +/* + * SHA1Reset + * + * Description: + * This function will initialize the SHA1Context in preparation + * for computing a new SHA1 message digest. + * + * Parameters: + * context: [in/out] + * The context to reset. + * + * Returns: + * sha Error Code. + * + */ +int SHA1Reset(SHA1Context *context) +{ + if (!context) + { + return shaNull; + } + + context->Length_Low = 0; + context->Length_High = 0; + context->Message_Block_Index = 0; + + context->Intermediate_Hash[0] = 0x67452301; + context->Intermediate_Hash[1] = 0xEFCDAB89; + context->Intermediate_Hash[2] = 0x98BADCFE; + context->Intermediate_Hash[3] = 0x10325476; + context->Intermediate_Hash[4] = 0xC3D2E1F0; + + context->Computed = 0; + context->Corrupted = 0; + + return shaSuccess; +} + +/* + * SHA1Result + * + * Description: + * This function will return the 160-bit message digest into the + * Message_Digest array provided by the caller. + * NOTE: The first octet of hash is stored in the 0th element, + * the last octet of hash in the 19th element. + * + * Parameters: + * context: [in/out] + * The context to use to calculate the SHA-1 hash. + * Message_Digest: [out] + * Where the digest is returned. + * + * Returns: + * sha Error Code. + * + */ +int SHA1Result( SHA1Context *context, + uint8_t Message_Digest[SHA1HashSize]) +{ + int i; + + if (!context || !Message_Digest) + { + return shaNull; + } + + if (context->Corrupted) + { + return context->Corrupted; + } + + if (!context->Computed) + { + SHA1PadMessage(context); + for(i=0; i<64; ++i) + { + /* message may be sensitive, clear it out */ + context->Message_Block[i] = 0; + } + context->Length_Low = 0; /* and clear length */ + context->Length_High = 0; + context->Computed = 1; + } + + for(i = 0; i < SHA1HashSize; ++i) + { + Message_Digest[i] = context->Intermediate_Hash[i>>2] + >> 8 * ( 3 - ( i & 0x03 ) ); + } + + return shaSuccess; +} + +/* + * SHA1Input + * + * Description: + * This function accepts an array of octets as the next portion + * of the message. + * + * Parameters: + * context: [in/out] + * The SHA context to update + * message_array: [in] + * An array of characters representing the next portion of + * the message. + * length: [in] + * The length of the message in message_array + * + * Returns: + * sha Error Code. + * + */ +int SHA1Input( SHA1Context *context, + const uint8_t *message_array, + unsigned length) +{ + if (!length) + { + return shaSuccess; + } + + if (!context || !message_array) + { + return shaNull; + } + + if (context->Computed) + { + context->Corrupted = shaStateError; + return shaStateError; + } + + if (context->Corrupted) + { + return context->Corrupted; + } + while(length-- && !context->Corrupted) + { + context->Message_Block[context->Message_Block_Index++] = + (*message_array & 0xFF); + + context->Length_Low += 8; + if (context->Length_Low == 0) + { + context->Length_High++; + if (context->Length_High == 0) + { + /* Message is too long */ + context->Corrupted = 1; + } + } + + if (context->Message_Block_Index == 64) + { + SHA1ProcessMessageBlock(context); + } + + message_array++; + } + + return shaSuccess; +} + +/* + * SHA1ProcessMessageBlock + * + * Description: + * This function will process the next 512 bits of the message + * stored in the Message_Block array. + * + * Parameters: + * None. + * + * Returns: + * Nothing. + * + * Comments: + * Many of the variable names in this code, especially the + * single character names, were used because those were the + * names used in the publication. + * + * + */ +void SHA1ProcessMessageBlock(SHA1Context *context) +{ + const uint32_t K[] = { /* Constants defined in SHA-1 */ + 0x5A827999, + 0x6ED9EBA1, + 0x8F1BBCDC, + 0xCA62C1D6 + }; + int t; /* Loop counter */ + uint32_t temp; /* Temporary word value */ + uint32_t W[80]; /* Word sequence */ + uint32_t A, B, C, D, E; /* Word buffers */ + + /* + * Initialize the first 16 words in the array W + */ + for(t = 0; t < 16; t++) + { + W[t] = context->Message_Block[t * 4] << 24; + W[t] |= context->Message_Block[t * 4 + 1] << 16; + W[t] |= context->Message_Block[t * 4 + 2] << 8; + W[t] |= context->Message_Block[t * 4 + 3]; + } + + for(t = 16; t < 80; t++) + { + W[t] = SHA1CircularShift(1,W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]); + } + + A = context->Intermediate_Hash[0]; + B = context->Intermediate_Hash[1]; + C = context->Intermediate_Hash[2]; + D = context->Intermediate_Hash[3]; + E = context->Intermediate_Hash[4]; + + for(t = 0; t < 20; t++) + { + temp = SHA1CircularShift(5,A) + + ((B & C) | ((~B) & D)) + E + W[t] + K[0]; + E = D; + D = C; + C = SHA1CircularShift(30,B); + B = A; + A = temp; + } + + for(t = 20; t < 40; t++) + { + temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[1]; + E = D; + D = C; + C = SHA1CircularShift(30,B); + B = A; + A = temp; + } + + for(t = 40; t < 60; t++) + { + temp = SHA1CircularShift(5,A) + + ((B & C) | (B & D) | (C & D)) + E + W[t] + K[2]; + E = D; + D = C; + C = SHA1CircularShift(30,B); + B = A; + A = temp; + } + + for(t = 60; t < 80; t++) + { + temp = SHA1CircularShift(5,A) + (B ^ C ^ D) + E + W[t] + K[3]; + E = D; + D = C; + C = SHA1CircularShift(30,B); + B = A; + A = temp; + } + + context->Intermediate_Hash[0] += A; + context->Intermediate_Hash[1] += B; + context->Intermediate_Hash[2] += C; + context->Intermediate_Hash[3] += D; + context->Intermediate_Hash[4] += E; + + context->Message_Block_Index = 0; +} + + +/* + * SHA1PadMessage + * + * Description: + * According to the standard, the message must be padded to an even + * 512 bits. The first padding bit must be a '1'. The last 64 + * bits represent the length of the original message. All bits in + * between should be 0. This function will pad the message + * according to those rules by filling the Message_Block array + * accordingly. It will also call the ProcessMessageBlock function + * provided appropriately. When it returns, it can be assumed that + * the message digest has been computed. + * + * Parameters: + * context: [in/out] + * The context to pad + * ProcessMessageBlock: [in] + * The appropriate SHA*ProcessMessageBlock function + * Returns: + * Nothing. + * + */ + +void SHA1PadMessage(SHA1Context *context) +{ + /* + * Check to see if the current message block is too small to hold + * the initial padding bits and length. If so, we will pad the + * block, process it, and then continue padding into a second + * block. + */ + if (context->Message_Block_Index > 55) + { + context->Message_Block[context->Message_Block_Index++] = 0x80; + while(context->Message_Block_Index < 64) + { + context->Message_Block[context->Message_Block_Index++] = 0; + } + + SHA1ProcessMessageBlock(context); + + while(context->Message_Block_Index < 56) + { + context->Message_Block[context->Message_Block_Index++] = 0; + } + } + else + { + context->Message_Block[context->Message_Block_Index++] = 0x80; + while(context->Message_Block_Index < 56) + { + context->Message_Block[context->Message_Block_Index++] = 0; + } + } + + /* + * Store the message length as the last 8 octets + */ + context->Message_Block[56] = context->Length_High >> 24; + context->Message_Block[57] = context->Length_High >> 16; + context->Message_Block[58] = context->Length_High >> 8; + context->Message_Block[59] = context->Length_High; + context->Message_Block[60] = context->Length_Low >> 24; + context->Message_Block[61] = context->Length_Low >> 16; + context->Message_Block[62] = context->Length_Low >> 8; + context->Message_Block[63] = context->Length_Low; + + SHA1ProcessMessageBlock(context); +} + +void +SHA1Bytes(const void *data, unsigned int n, + uint8_t Message_Digest[SHA1HashSize]) +{ + SHA1Context ctx; + SHA1Reset(&ctx); + SHA1Input(&ctx, data, n); + SHA1Result(&ctx, Message_Digest); +} diff --git a/lib/sha1.h b/lib/sha1.h new file mode 100644 index 00000000..382cf320 --- /dev/null +++ b/lib/sha1.h @@ -0,0 +1,74 @@ +/* + * sha1.h + * + * Description: + * This is the header file for code which implements the Secure + * Hashing Algorithm 1 as defined in FIPS PUB 180-1 published + * April 17, 1995. + * + * Many of the variable names in this code, especially the + * single character names, were used because those were the names + * used in the publication. + * + * Please read the file sha1.c for more information. + * + */ +#ifndef _SHA1_H_ +#define _SHA1_H_ + +#include +/* + * If you do not have the ISO standard stdint.h header file, then you + * must typdef the following: + * name meaning + * uint32_t unsigned 32 bit integer + * uint8_t unsigned 8 bit integer (i.e., unsigned char) + * int_least16_t integer of >= 16 bits + * + */ + +#ifndef _SHA_enum_ +#define _SHA_enum_ +enum +{ + shaSuccess = 0, + shaNull, /* Null pointer parameter */ + shaInputTooLong, /* input data too long */ + shaStateError /* called Input after Result */ +}; +#endif +#define SHA1HashSize 20 + +/* + * This structure will hold context information for the SHA-1 + * hashing operation + */ +typedef struct SHA1Context +{ + uint32_t Intermediate_Hash[SHA1HashSize/4]; /* Message Digest */ + + uint32_t Length_Low; /* Message length in bits */ + uint32_t Length_High; /* Message length in bits */ + + /* Index into message block array */ + int_least16_t Message_Block_Index; + uint8_t Message_Block[64]; /* 512-bit message blocks */ + + int Computed; /* Is the digest computed? */ + int Corrupted; /* Is the message digest corrupted? */ +} SHA1Context; + +/* + * Function Prototypes + */ +int SHA1Reset( SHA1Context *); +int SHA1Input( SHA1Context *, + const uint8_t *, + unsigned int); +int SHA1Result( SHA1Context *, + uint8_t Message_Digest[SHA1HashSize]); + +void SHA1Bytes(const void *data, unsigned int n, + uint8_t Message_Digest[SHA1HashSize]); + +#endif diff --git a/lib/shash.c b/lib/shash.c new file mode 100644 index 00000000..8f97f776 --- /dev/null +++ b/lib/shash.c @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "shash.h" +#include +#include "hash.h" + +static size_t +hash_name(const char *name) +{ + return hash_string(name, 0); +} + +void +shash_init(struct shash *sh) +{ + hmap_init(&sh->map); +} + +void +shash_destroy(struct shash *sh) +{ + if (sh) { + shash_clear(sh); + } +} + +void +shash_clear(struct shash *sh) +{ + struct shash_node *node, *next; + + HMAP_FOR_EACH_SAFE (node, next, struct shash_node, node, &sh->map) { + hmap_remove(&sh->map, &node->node); + free(node->name); + free(node); + } +} + +/* It is the caller's responsible to avoid duplicate names, if that is + * desirable. */ +void +shash_add(struct shash *sh, const char *name, void *data) +{ + struct shash_node *node = xmalloc(sizeof *node); + node->name = xstrdup(name); + node->data = data; + hmap_insert(&sh->map, &node->node, hash_name(name)); +} + +void +shash_delete(struct shash *sh, struct shash_node *node) +{ + hmap_remove(&sh->map, &node->node); + free(node->name); + free(node); +} + +/* If there are duplicates, returns a random element. */ +struct shash_node * +shash_find(const struct shash *sh, const char *name) +{ + struct shash_node *node; + + HMAP_FOR_EACH_WITH_HASH (node, struct shash_node, node, + hash_name(name), &sh->map) { + if (!strcmp(node->name, name)) { + return node; + } + } + return NULL; +} + +void * +shash_find_data(const struct shash *sh, const char *name) +{ + struct shash_node *node = shash_find(sh, name); + return node ? node->data : NULL; +} diff --git a/lib/shash.h b/lib/shash.h new file mode 100644 index 00000000..ee3fb5f5 --- /dev/null +++ b/lib/shash.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef SHASH_H +#define SHASH_H 1 + +#include "hmap.h" + +struct shash_node { + struct hmap_node node; + char *name; + void *data; +}; + +struct shash { + struct hmap map; +}; + +#define SHASH_INITIALIZER(SHASH) { HMAP_INITIALIZER(&(SHASH)->map) } + +void shash_init(struct shash *); +void shash_destroy(struct shash *); +void shash_clear(struct shash *); +void shash_add(struct shash *, const char *, void *); +void shash_delete(struct shash *, struct shash_node *); +struct shash_node *shash_find(const struct shash *, const char *); +void *shash_find_data(const struct shash *, const char *); + +#endif /* shash.h */ diff --git a/lib/signals.c b/lib/signals.c new file mode 100644 index 00000000..26eebcdb --- /dev/null +++ b/lib/signals.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "signals.h" +#include +#include +#include +#include +#include +#include "poll-loop.h" +#include "socket-util.h" +#include "util.h" + +#if defined(_NSIG) +#define N_SIGNALS _NSIG +#elif defined(NSIG) +#define N_SIGNALS NSIG +#else +/* We could try harder to get the maximum signal number, but in practice we + * only care about SIGHUP, which is normally signal 1 anyway. */ +#define N_SIGNALS 32 +#endif + +struct signal { + int signr; +}; + +static volatile sig_atomic_t signaled[N_SIGNALS]; + +static int fds[2]; + +static void signal_handler(int signr); + +/* Initializes the signals subsystem (if it is not already initialized). Calls + * exit() if initialization fails. + * + * Calling this function is optional; it will be called automatically by + * signal_start() if necessary. Calling it explicitly allows the client to + * prevent the process from exiting at an unexpected time. */ +void +signal_init(void) +{ + static bool inited; + if (!inited) { + inited = true; + if (pipe(fds)) { + ovs_fatal(errno, "could not create pipe"); + } + set_nonblocking(fds[0]); + set_nonblocking(fds[1]); + } +} + +/* Sets up a handler for 'signr' and returns a structure that represents it. + * + * Only one handler for a given signal may be registered at a time. */ +struct signal * +signal_register(int signr) +{ + struct sigaction sa; + struct signal *s; + + signal_init(); + + /* Set up signal handler. */ + assert(signr >= 1 && signr < N_SIGNALS); + memset(&sa, 0, sizeof sa); + sa.sa_handler = signal_handler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_RESTART; + if (sigaction(signr, &sa, NULL)) { + ovs_fatal(errno, "sigaction(%d) failed", signr); + } + + /* Return structure. */ + s = xmalloc(sizeof *s); + s->signr = signr; + return s; +} + +/* Returns true if signal 's' has been received since the last call to this + * function with argument 's'. */ +bool +signal_poll(struct signal *s) +{ + char buf[_POSIX_PIPE_BUF]; + read(fds[0], buf, sizeof buf); + if (signaled[s->signr]) { + signaled[s->signr] = 0; + return true; + } + return false; +} + +/* Causes the next call to poll_block() to wake up when signal_poll(s) would + * return true. */ +void +signal_wait(struct signal *s) +{ + if (signaled[s->signr]) { + poll_immediate_wake(); + } else { + poll_fd_wait(fds[0], POLLIN); + } +} + +static void +signal_handler(int signr) +{ + if (signr >= 1 && signr < N_SIGNALS) { + write(fds[1], "", 1); + signaled[signr] = true; + } +} diff --git a/lib/signals.h b/lib/signals.h new file mode 100644 index 00000000..7f25f662 --- /dev/null +++ b/lib/signals.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef SIGNALS_H +#define SIGNALS_H 1 + +#include + +void signal_init(void); +struct signal *signal_register(int signr); +bool signal_poll(struct signal *); +void signal_wait(struct signal *); + +#endif /* signals.h */ diff --git a/lib/socket-util.c b/lib/socket-util.c new file mode 100644 index 00000000..3d290e88 --- /dev/null +++ b/lib/socket-util.c @@ -0,0 +1,343 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "socket-util.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "fatal-signal.h" +#include "util.h" + +#include "vlog.h" +#define THIS_MODULE VLM_socket_util + +/* Sets 'fd' to non-blocking mode. Returns 0 if successful, otherwise a + * positive errno value. */ +int +set_nonblocking(int fd) +{ + int flags = fcntl(fd, F_GETFL, 0); + if (flags != -1) { + if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) != -1) { + return 0; + } else { + VLOG_ERR("fcntl(F_SETFL) failed: %s", strerror(errno)); + return errno; + } + } else { + VLOG_ERR("fcntl(F_GETFL) failed: %s", strerror(errno)); + return errno; + } +} + +/* Returns the maximum valid FD value, plus 1. */ +int +get_max_fds(void) +{ + static int max_fds = -1; + if (max_fds < 0) { + struct rlimit r; + if (!getrlimit(RLIMIT_NOFILE, &r) + && r.rlim_cur != RLIM_INFINITY + && r.rlim_cur != RLIM_SAVED_MAX + && r.rlim_cur != RLIM_SAVED_CUR) { + max_fds = r.rlim_cur; + } else { + VLOG_WARN("failed to obtain fd limit, defaulting to 1024"); + max_fds = 1024; + } + } + return max_fds; +} + +/* Translates 'host_name', which may be a DNS name or an IP address, into a + * numeric IP address in '*addr'. Returns 0 if successful, otherwise a + * positive errno value. */ +int +lookup_ip(const char *host_name, struct in_addr *addr) +{ + if (!inet_aton(host_name, addr)) { + struct hostent *he = gethostbyname(host_name); + if (he == NULL) { + struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_ERR_RL(&rl, "gethostbyname(%s): %s", host_name, + (h_errno == HOST_NOT_FOUND ? "host not found" + : h_errno == TRY_AGAIN ? "try again" + : h_errno == NO_RECOVERY ? "non-recoverable error" + : h_errno == NO_ADDRESS ? "no address" + : "unknown error")); + return ENOENT; + } + addr->s_addr = *(uint32_t *) he->h_addr; + } + return 0; +} + +/* Returns the error condition associated with socket 'fd' and resets the + * socket's error status. */ +int +get_socket_error(int fd) +{ + int error; + socklen_t len = sizeof(error); + if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &error, &len) < 0) { + struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 10); + error = errno; + VLOG_ERR_RL(&rl, "getsockopt(SO_ERROR): %s", strerror(error)); + } + return error; +} + +int +check_connection_completion(int fd) +{ + struct pollfd pfd; + int retval; + + pfd.fd = fd; + pfd.events = POLLOUT; + do { + retval = poll(&pfd, 1, 0); + } while (retval < 0 && errno == EINTR); + if (retval == 1) { + return get_socket_error(fd); + } else if (retval < 0) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 10); + VLOG_ERR_RL(&rl, "poll: %s", strerror(errno)); + return errno; + } else { + return EAGAIN; + } +} + +/* Drain all the data currently in the receive queue of a datagram socket (and + * possibly additional data). There is no way to know how many packets are in + * the receive queue, but we do know that the total number of bytes queued does + * not exceed the receive buffer size, so we pull packets until none are left + * or we've read that many bytes. */ +int +drain_rcvbuf(int fd) +{ + socklen_t rcvbuf_len; + size_t rcvbuf; + + rcvbuf_len = sizeof rcvbuf; + if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, &rcvbuf_len) < 0) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 10); + VLOG_ERR_RL(&rl, "getsockopt(SO_RCVBUF) failed: %s", strerror(errno)); + return errno; + } + while (rcvbuf > 0) { + /* In Linux, specifying MSG_TRUNC in the flags argument causes the + * datagram length to be returned, even if that is longer than the + * buffer provided. Thus, we can use a 1-byte buffer to discard the + * incoming datagram and still be able to account how many bytes were + * removed from the receive buffer. + * + * On other Unix-like OSes, MSG_TRUNC has no effect in the flags + * argument. */ +#ifdef __linux__ +#define BUFFER_SIZE 1 +#else +#define BUFFER_SIZE 2048 +#endif + char buffer[BUFFER_SIZE]; + ssize_t n_bytes = recv(fd, buffer, sizeof buffer, + MSG_TRUNC | MSG_DONTWAIT); + if (n_bytes <= 0 || n_bytes >= rcvbuf) { + break; + } + rcvbuf -= n_bytes; + } + return 0; +} + +/* Reads and discards up to 'n' datagrams from 'fd', stopping as soon as no + * more data can be immediately read. ('fd' should therefore be in + * non-blocking mode.)*/ +void +drain_fd(int fd, size_t n_packets) +{ + for (; n_packets > 0; n_packets--) { + /* 'buffer' only needs to be 1 byte long in most circumstances. This + * size is defensive against the possibility that we someday want to + * use a Linux tap device without TUN_NO_PI, in which case a buffer + * smaller than sizeof(struct tun_pi) will give EINVAL on read. */ + char buffer[128]; + if (read(fd, buffer, sizeof buffer) <= 0) { + break; + } + } +} + +/* Stores in '*un' a sockaddr_un that refers to file 'name'. Stores in + * '*un_len' the size of the sockaddr_un. */ +static void +make_sockaddr_un(const char *name, struct sockaddr_un* un, socklen_t *un_len) +{ + un->sun_family = AF_UNIX; + strncpy(un->sun_path, name, sizeof un->sun_path); + un->sun_path[sizeof un->sun_path - 1] = '\0'; + *un_len = (offsetof(struct sockaddr_un, sun_path) + + strlen (un->sun_path) + 1); +} + +/* Creates a Unix domain socket in the given 'style' (either SOCK_DGRAM or + * SOCK_STREAM) that is bound to '*bind_path' (if 'bind_path' is non-null) and + * connected to '*connect_path' (if 'connect_path' is non-null). If 'nonblock' + * is true, the socket is made non-blocking. If 'passcred' is true, the socket + * is configured to receive SCM_CREDENTIALS control messages. + * + * Returns the socket's fd if successful, otherwise a negative errno value. */ +int +make_unix_socket(int style, bool nonblock, bool passcred UNUSED, + const char *bind_path, const char *connect_path) +{ + int error; + int fd; + + fd = socket(PF_UNIX, style, 0); + if (fd < 0) { + return -errno; + } + + /* Set nonblocking mode right away, if we want it. This prevents blocking + * in connect(), if connect_path != NULL. (In turn, that's a corner case: + * it will only happen if style is SOCK_STREAM or SOCK_SEQPACKET, and only + * if a backlog of un-accepted connections has built up in the kernel.) */ + if (nonblock) { + int flags = fcntl(fd, F_GETFL, 0); + if (flags == -1) { + goto error; + } + if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) { + goto error; + } + } + + if (bind_path) { + struct sockaddr_un un; + socklen_t un_len; + make_sockaddr_un(bind_path, &un, &un_len); + if (unlink(un.sun_path) && errno != ENOENT) { + VLOG_WARN("unlinking \"%s\": %s\n", un.sun_path, strerror(errno)); + } + fatal_signal_add_file_to_unlink(bind_path); + if (bind(fd, (struct sockaddr*) &un, un_len) + || fchmod(fd, S_IRWXU)) { + goto error; + } + } + + if (connect_path) { + struct sockaddr_un un; + socklen_t un_len; + make_sockaddr_un(connect_path, &un, &un_len); + if (connect(fd, (struct sockaddr*) &un, un_len) + && errno != EINPROGRESS) { + goto error; + } + } + +#ifdef SCM_CREDENTIALS + if (passcred) { + int enable = 1; + if (setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &enable, sizeof(enable))) { + goto error; + } + } +#endif + + return fd; + +error: + if (bind_path) { + fatal_signal_remove_file_to_unlink(bind_path); + } + error = errno; + close(fd); + return -error; +} + +int +get_unix_name_len(socklen_t sun_len) +{ + return (sun_len >= offsetof(struct sockaddr_un, sun_path) + ? sun_len - offsetof(struct sockaddr_un, sun_path) + : 0); +} + +uint32_t +guess_netmask(uint32_t ip) +{ + ip = ntohl(ip); + return ((ip >> 31) == 0 ? htonl(0xff000000) /* Class A */ + : (ip >> 30) == 2 ? htonl(0xffff0000) /* Class B */ + : (ip >> 29) == 6 ? htonl(0xffffff00) /* Class C */ + : htonl(0)); /* ??? */ +} + +int +read_fully(int fd, void *p_, size_t size, size_t *bytes_read) +{ + uint8_t *p = p_; + + *bytes_read = 0; + while (size > 0) { + ssize_t retval = read(fd, p, size); + if (retval > 0) { + *bytes_read += retval; + size -= retval; + p += retval; + } else if (retval == 0) { + return EOF; + } else if (errno != EINTR) { + return errno; + } + } + return 0; +} + +int +write_fully(int fd, const void *p_, size_t size, size_t *bytes_written) +{ + const uint8_t *p = p_; + + *bytes_written = 0; + while (size > 0) { + ssize_t retval = write(fd, p, size); + if (retval > 0) { + *bytes_written += retval; + size -= retval; + p += retval; + } else if (retval == 0) { + VLOG_WARN("write returned 0"); + return EPROTO; + } else if (errno != EINTR) { + return errno; + } + } + return 0; +} diff --git a/lib/socket-util.h b/lib/socket-util.h new file mode 100644 index 00000000..bdfb3dcb --- /dev/null +++ b/lib/socket-util.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef SOCKET_UTIL_H +#define SOCKET_UTIL_H 1 + +#include +#include +#include + +int set_nonblocking(int fd); +int get_max_fds(void); +int lookup_ip(const char *host_name, struct in_addr *address); +int get_socket_error(int sock); +int check_connection_completion(int fd); +int drain_rcvbuf(int fd); +void drain_fd(int fd, size_t n_packets); +int make_unix_socket(int style, bool nonblock, bool passcred, + const char *bind_path, const char *connect_path); +int get_unix_name_len(socklen_t sun_len); +uint32_t guess_netmask(uint32_t ip); + +int read_fully(int fd, void *, size_t, size_t *bytes_read); +int write_fully(int fd, const void *, size_t, size_t *bytes_written); + +#endif /* socket-util.h */ diff --git a/lib/stp.c b/lib/stp.c new file mode 100644 index 00000000..59f5c615 --- /dev/null +++ b/lib/stp.c @@ -0,0 +1,1226 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* Based on sample implementation in 802.1D-1998. Above copyright and license + * applies to all modifications. */ + +#include "stp.h" +#include +#include +#include +#include +#include "ofpbuf.h" +#include "packets.h" +#include "util.h" +#include "xtoxll.h" + +#include "vlog.h" +#define THIS_MODULE VLM_stp + +/* Ethernet address used as the destination for STP frames. */ +const uint8_t stp_eth_addr[ETH_ADDR_LEN] += { 0x01, 0x80, 0xC2, 0x00, 0x00, 0x01 }; + +#define STP_PROTOCOL_ID 0x0000 +#define STP_PROTOCOL_VERSION 0x00 +#define STP_TYPE_CONFIG 0x00 +#define STP_TYPE_TCN 0x80 + +struct stp_bpdu_header { + uint16_t protocol_id; /* STP_PROTOCOL_ID. */ + uint8_t protocol_version; /* STP_PROTOCOL_VERSION. */ + uint8_t bpdu_type; /* One of STP_TYPE_*. */ +} __attribute__((packed)); +BUILD_ASSERT_DECL(sizeof(struct stp_bpdu_header) == 4); + +enum stp_config_bpdu_flags { + STP_CONFIG_TOPOLOGY_CHANGE_ACK = 0x80, + STP_CONFIG_TOPOLOGY_CHANGE = 0x01 +}; + +struct stp_config_bpdu { + struct stp_bpdu_header header; /* Type STP_TYPE_CONFIG. */ + uint8_t flags; /* STP_CONFIG_* flags. */ + uint64_t root_id; /* 8.5.1.1: Bridge believed to be root. */ + uint32_t root_path_cost; /* 8.5.1.2: Cost of path to root. */ + uint64_t bridge_id; /* 8.5.1.3: ID of transmitting bridge. */ + uint16_t port_id; /* 8.5.1.4: Port transmitting the BPDU. */ + uint16_t message_age; /* 8.5.1.5: Age of BPDU at tx time. */ + uint16_t max_age; /* 8.5.1.6: Timeout for received data. */ + uint16_t hello_time; /* 8.5.1.7: Time between BPDU generation. */ + uint16_t forward_delay; /* 8.5.1.8: State progression delay. */ +} __attribute__((packed)); +BUILD_ASSERT_DECL(sizeof(struct stp_config_bpdu) == 35); + +struct stp_tcn_bpdu { + struct stp_bpdu_header header; /* Type STP_TYPE_TCN. */ +} __attribute__((packed)); +BUILD_ASSERT_DECL(sizeof(struct stp_tcn_bpdu) == 4); + +struct stp_timer { + bool active; /* Timer in use? */ + int value; /* Current value of timer, counting up. */ +}; + +struct stp_port { + struct stp *stp; + int port_id; /* 8.5.5.1: Unique port identifier. */ + enum stp_state state; /* 8.5.5.2: Current state. */ + int path_cost; /* 8.5.5.3: Cost of tx/rx on this port. */ + stp_identifier designated_root; /* 8.5.5.4. */ + int designated_cost; /* 8.5.5.5: Path cost to root on port. */ + stp_identifier designated_bridge; /* 8.5.5.6. */ + int designated_port; /* 8.5.5.7: Port to send config msgs on. */ + bool topology_change_ack; /* 8.5.5.8: Flag for next config BPDU. */ + bool config_pending; /* 8.5.5.9: Send BPDU when hold expires? */ + bool change_detection_enabled; /* 8.5.5.10: Detect topology changes? */ + + struct stp_timer message_age_timer; /* 8.5.6.1: Age of received info. */ + struct stp_timer forward_delay_timer; /* 8.5.6.2: State change timer. */ + struct stp_timer hold_timer; /* 8.5.6.3: BPDU rate limit timer. */ + + bool state_changed; +}; + +struct stp { + /* Static bridge data. */ + char *name; /* Human-readable name for log messages. */ + stp_identifier bridge_id; /* 8.5.3.7: This bridge. */ + int max_age; /* 8.5.3.4: Time to drop received data. */ + int hello_time; /* 8.5.3.5: Time between sending BPDUs. */ + int forward_delay; /* 8.5.3.6: Delay between state changes. */ + int bridge_max_age; /* 8.5.3.8: max_age when we're root. */ + int bridge_hello_time; /* 8.5.3.9: hello_time as root. */ + int bridge_forward_delay; /* 8.5.3.10: forward_delay as root. */ + int rq_max_age; /* User-requested max age, in ms. */ + int rq_hello_time; /* User-requested hello time, in ms. */ + int rq_forward_delay; /* User-requested forward delay, in ms. */ + int elapsed_remainder; /* Left-over msecs from last stp_tick(). */ + + /* Dynamic bridge data. */ + stp_identifier designated_root; /* 8.5.3.1: Bridge believed to be root. */ + unsigned int root_path_cost; /* 8.5.3.2: Cost of path to root. */ + struct stp_port *root_port; /* 8.5.3.3: Lowest cost port to root. */ + bool topology_change_detected; /* 8.5.3.11: Detected a topology change? */ + bool topology_change; /* 8.5.3.12: Received topology change? */ + + /* Bridge timers. */ + struct stp_timer hello_timer; /* 8.5.4.1: Hello timer. */ + struct stp_timer tcn_timer; /* 8.5.4.2: Topology change timer. */ + struct stp_timer topology_change_timer; /* 8.5.4.3. */ + + /* Ports. */ + struct stp_port ports[STP_MAX_PORTS]; + + /* Interface to client. */ + struct stp_port *first_changed_port; + void (*send_bpdu)(struct ofpbuf *bpdu, int port_no, void *aux); + void *aux; +}; + +#define FOR_EACH_ENABLED_PORT(PORT, STP) \ + for ((PORT) = stp_next_enabled_port((STP), (STP)->ports); \ + (PORT); \ + (PORT) = stp_next_enabled_port((STP), (PORT) + 1)) +static struct stp_port * +stp_next_enabled_port(const struct stp *stp, const struct stp_port *port) +{ + for (; port < &stp->ports[ARRAY_SIZE(stp->ports)]; port++) { + if (port->state != STP_DISABLED) { + return (struct stp_port *) port; + } + } + return NULL; +} + +#define MESSAGE_AGE_INCREMENT 1 + +static void stp_transmit_config(struct stp_port *); +static bool stp_supersedes_port_info(const struct stp_port *, + const struct stp_config_bpdu *); +static void stp_record_config_information(struct stp_port *, + const struct stp_config_bpdu *); +static void stp_record_config_timeout_values(struct stp *, + const struct stp_config_bpdu *); +static bool stp_is_designated_port(const struct stp_port *); +static void stp_config_bpdu_generation(struct stp *); +static void stp_transmit_tcn(struct stp *); +static void stp_configuration_update(struct stp *); +static bool stp_supersedes_root(const struct stp_port *root, + const struct stp_port *); +static void stp_root_selection(struct stp *); +static void stp_designated_port_selection(struct stp *); +static void stp_become_designated_port(struct stp_port *); +static void stp_port_state_selection(struct stp *); +static void stp_make_forwarding(struct stp_port *); +static void stp_make_blocking(struct stp_port *); +static void stp_set_port_state(struct stp_port *, enum stp_state); +static void stp_topology_change_detection(struct stp *); +static void stp_topology_change_acknowledged(struct stp *); +static void stp_acknowledge_topology_change(struct stp_port *); +static void stp_received_config_bpdu(struct stp *, struct stp_port *, + const struct stp_config_bpdu *); +static void stp_received_tcn_bpdu(struct stp *, struct stp_port *); +static void stp_hello_timer_expiry(struct stp *); +static void stp_message_age_timer_expiry(struct stp_port *); +static bool stp_is_designated_for_some_port(const struct stp *); +static void stp_forward_delay_timer_expiry(struct stp_port *); +static void stp_tcn_timer_expiry(struct stp *); +static void stp_topology_change_timer_expiry(struct stp *); +static void stp_hold_timer_expiry(struct stp_port *); +static void stp_initialize_port(struct stp_port *, enum stp_state); +static void stp_become_root_bridge(struct stp *); +static void stp_update_bridge_timers(struct stp *); + +static int clamp(int x, int min, int max); +static int ms_to_timer(int ms); +static int ms_to_timer_remainder(int ms); +static int timer_to_ms(int timer); +static void stp_start_timer(struct stp_timer *, int value); +static void stp_stop_timer(struct stp_timer *); +static bool stp_timer_expired(struct stp_timer *, int elapsed, int timeout); + +static void stp_send_bpdu(struct stp_port *, const void *, size_t); + +/* Creates and returns a new STP instance that initially has no ports enabled. + * + * 'bridge_id' should be a 48-bit MAC address as returned by + * eth_addr_to_uint64(). 'bridge_id' may also have a priority value in its top + * 16 bits; if those bits are set to 0, STP_DEFAULT_BRIDGE_PRIORITY is used. + * (This priority may be changed with stp_set_bridge_priority().) + * + * When the bridge needs to send out a BPDU, it calls 'send_bpdu'. This + * callback may be called from stp_tick() or stp_received_bpdu(). The + * arguments to 'send_bpdu' are an STP BPDU encapsulated in + */ +struct stp * +stp_create(const char *name, stp_identifier bridge_id, + void (*send_bpdu)(struct ofpbuf *bpdu, int port_no, void *aux), + void *aux) +{ + struct stp *stp; + struct stp_port *p; + + stp = xcalloc(1, sizeof *stp); + stp->name = xstrdup(name); + stp->bridge_id = bridge_id; + if (!(stp->bridge_id >> 48)) { + stp->bridge_id |= (uint64_t) STP_DEFAULT_BRIDGE_PRIORITY << 48; + } + + stp->rq_max_age = 6000; + stp->rq_hello_time = 2000; + stp->rq_forward_delay = 4000; + stp_update_bridge_timers(stp); + stp->max_age = stp->bridge_max_age; + stp->hello_time = stp->bridge_hello_time; + stp->forward_delay = stp->bridge_forward_delay; + + stp->designated_root = stp->bridge_id; + stp->root_path_cost = 0; + stp->root_port = NULL; + stp->topology_change_detected = false; + stp->topology_change = false; + + stp_stop_timer(&stp->tcn_timer); + stp_stop_timer(&stp->topology_change_timer); + stp_start_timer(&stp->hello_timer, 0); + + stp->send_bpdu = send_bpdu; + stp->aux = aux; + + stp->first_changed_port = &stp->ports[ARRAY_SIZE(stp->ports)]; + for (p = stp->ports; p < &stp->ports[ARRAY_SIZE(stp->ports)]; p++) { + p->stp = stp; + p->port_id = (stp_port_no(p) + 1) | (STP_DEFAULT_PORT_PRIORITY << 8); + p->path_cost = 19; /* Recommended default for 100 Mb/s link. */ + stp_initialize_port(p, STP_DISABLED); + } + return stp; +} + +/* Destroys 'stp'. */ +void +stp_destroy(struct stp *stp) +{ + free(stp); +} + +/* Runs 'stp' given that 'ms' milliseconds have passed. */ +void +stp_tick(struct stp *stp, int ms) +{ + struct stp_port *p; + int elapsed; + + /* Convert 'ms' to STP timer ticks. Preserve any leftover milliseconds + * from previous stp_tick() calls so that we don't lose STP ticks when we + * are called too frequently. */ + ms = clamp(ms, 0, INT_MAX - 1000) + stp->elapsed_remainder; + elapsed = ms_to_timer(ms); + stp->elapsed_remainder = ms_to_timer_remainder(ms); + if (!elapsed) { + return; + } + + if (stp_timer_expired(&stp->hello_timer, elapsed, stp->hello_time)) { + stp_hello_timer_expiry(stp); + } + if (stp_timer_expired(&stp->tcn_timer, elapsed, stp->bridge_hello_time)) { + stp_tcn_timer_expiry(stp); + } + if (stp_timer_expired(&stp->topology_change_timer, elapsed, + stp->max_age + stp->forward_delay)) { + stp_topology_change_timer_expiry(stp); + } + FOR_EACH_ENABLED_PORT (p, stp) { + if (stp_timer_expired(&p->message_age_timer, elapsed, stp->max_age)) { + stp_message_age_timer_expiry(p); + } + } + FOR_EACH_ENABLED_PORT (p, stp) { + if (stp_timer_expired(&p->forward_delay_timer, elapsed, + stp->forward_delay)) { + stp_forward_delay_timer_expiry(p); + } + if (stp_timer_expired(&p->hold_timer, elapsed, ms_to_timer(1000))) { + stp_hold_timer_expiry(p); + } + } +} + +static void +set_bridge_id(struct stp *stp, stp_identifier new_bridge_id) +{ + if (new_bridge_id != stp->bridge_id) { + bool root; + struct stp_port *p; + + root = stp_is_root_bridge(stp); + FOR_EACH_ENABLED_PORT (p, stp) { + if (stp_is_designated_port(p)) { + p->designated_bridge = new_bridge_id; + } + } + stp->bridge_id = new_bridge_id; + stp_configuration_update(stp); + stp_port_state_selection(stp); + if (stp_is_root_bridge(stp) && !root) { + stp_become_root_bridge(stp); + } + } +} + +void +stp_set_bridge_id(struct stp *stp, stp_identifier bridge_id) +{ + const uint64_t mac_bits = (UINT64_C(1) << 48) - 1; + const uint64_t pri_bits = ~mac_bits; + set_bridge_id(stp, (stp->bridge_id & pri_bits) | (bridge_id & mac_bits)); +} + +void +stp_set_bridge_priority(struct stp *stp, uint16_t new_priority) +{ + const uint64_t mac_bits = (UINT64_C(1) << 48) - 1; + set_bridge_id(stp, ((stp->bridge_id & mac_bits) + | ((uint64_t) new_priority << 48))); +} + +/* Sets the desired hello time for 'stp' to 'ms', in milliseconds. The actual + * hello time is clamped to the range of 1 to 10 seconds and subject to the + * relationship (bridge_max_age >= 2 * (bridge_hello_time + 1 s)). The bridge + * hello time is only used when 'stp' is the root bridge. */ +void +stp_set_hello_time(struct stp *stp, int ms) +{ + stp->rq_hello_time = ms; + stp_update_bridge_timers(stp); +} + +/* Sets the desired max age for 'stp' to 'ms', in milliseconds. The actual max + * age is clamped to the range of 6 to 40 seconds and subject to the + * relationships (2 * (bridge_forward_delay - 1 s) >= bridge_max_age) and + * (bridge_max_age >= 2 * (bridge_hello_time + 1 s)). The bridge max age is + * only used when 'stp' is the root bridge. */ +void +stp_set_max_age(struct stp *stp, int ms) +{ + stp->rq_max_age = ms; + stp_update_bridge_timers(stp); +} + +/* Sets the desired forward delay for 'stp' to 'ms', in milliseconds. The + * actual forward delay is clamped to the range of 4 to 30 seconds and subject + * to the relationship (2 * (bridge_forward_delay - 1 s) >= bridge_max_age). + * The bridge forward delay is only used when 'stp' is the root bridge. */ +void +stp_set_forward_delay(struct stp *stp, int ms) +{ + stp->rq_forward_delay = ms; + stp_update_bridge_timers(stp); +} + +/* Returns the name given to 'stp' in the call to stp_create(). */ +const char * +stp_get_name(const struct stp *stp) +{ + return stp->name; +} + +/* Returns the bridge ID for 'stp'. */ +stp_identifier +stp_get_bridge_id(const struct stp *stp) +{ + return stp->bridge_id; +} + +/* Returns the bridge ID of the bridge currently believed to be the root. */ +stp_identifier +stp_get_designated_root(const struct stp *stp) +{ + return stp->designated_root; +} + +/* Returns true if 'stp' believes itself to the be root of the spanning tree, + * false otherwise. */ +bool +stp_is_root_bridge(const struct stp *stp) +{ + return stp->bridge_id == stp->designated_root; +} + +/* Returns the cost of the path from 'stp' to the root of the spanning tree. */ +int +stp_get_root_path_cost(const struct stp *stp) +{ + return stp->root_path_cost; +} + +/* Returns the bridge hello time, in ms. The returned value is not necessarily + * the value passed to stp_set_hello_time(): it is clamped to the valid range + * and quantized to the STP timer resolution. */ +int +stp_get_hello_time(const struct stp *stp) +{ + return timer_to_ms(stp->bridge_hello_time); +} + +/* Returns the bridge max age, in ms. The returned value is not necessarily + * the value passed to stp_set_max_age(): it is clamped to the valid range, + * quantized to the STP timer resolution, and adjusted to match the constraints + * due to the hello time. */ +int +stp_get_max_age(const struct stp *stp) +{ + return timer_to_ms(stp->bridge_max_age); +} + +/* Returns the bridge forward delay, in ms. The returned value is not + * necessarily the value passed to stp_set_forward_delay(): it is clamped to + * the valid range, quantized to the STP timer resolution, and adjusted to + * match the constraints due to the forward delay. */ +int +stp_get_forward_delay(const struct stp *stp) +{ + return timer_to_ms(stp->bridge_forward_delay); +} + +/* Returns the port in 'stp' with index 'port_no', which must be between 0 and + * STP_MAX_PORTS. */ +struct stp_port * +stp_get_port(struct stp *stp, int port_no) +{ + assert(port_no >= 0 && port_no < ARRAY_SIZE(stp->ports)); + return &stp->ports[port_no]; +} + +/* Returns the port connecting 'stp' to the root bridge, or a null pointer if + * there is no such port. */ +struct stp_port * +stp_get_root_port(struct stp *stp) +{ + return stp->root_port; +} + +/* Finds a port whose state has changed. If successful, stores the port whose + * state changed in '*portp' and returns true. If no port has changed, stores + * NULL in '*portp' and returns false. */ +bool +stp_get_changed_port(struct stp *stp, struct stp_port **portp) +{ + struct stp_port *end = &stp->ports[ARRAY_SIZE(stp->ports)]; + struct stp_port *p; + + for (p = stp->first_changed_port; p < end; p++) { + if (p->state_changed) { + p->state_changed = false; + stp->first_changed_port = p + 1; + *portp = p; + return true; + } + } + stp->first_changed_port = end; + *portp = NULL; + return false; +} + +/* Returns the name for the given 'state' (for use in debugging and log + * messages). */ +const char * +stp_state_name(enum stp_state state) +{ + switch (state) { + case STP_DISABLED: + return "disabled"; + case STP_LISTENING: + return "listening"; + case STP_LEARNING: + return "learning"; + case STP_FORWARDING: + return "forwarding"; + case STP_BLOCKING: + return "blocking"; + default: + NOT_REACHED(); + } +} + +/* Returns true if 'state' is one in which packets received on a port should + * be forwarded, false otherwise. + * + * Returns true if 'state' is STP_DISABLED, since presumably in that case the + * port should still work, just not have STP applied to it. */ +bool +stp_forward_in_state(enum stp_state state) +{ + return (state & (STP_DISABLED | STP_FORWARDING)) != 0; +} + +/* Returns true if 'state' is one in which MAC learning should be done on + * packets received on a port, false otherwise. + * + * Returns true if 'state' is STP_DISABLED, since presumably in that case the + * port should still work, just not have STP applied to it. */ +bool +stp_learn_in_state(enum stp_state state) +{ + return (state & (STP_DISABLED | STP_LEARNING | STP_FORWARDING)) != 0; +} + +/* Notifies the STP entity that bridge protocol data unit 'bpdu', which is + * 'bpdu_size' bytes in length, was received on port 'p'. + * + * This function may call the 'send_bpdu' function provided to stp_create(). */ +void +stp_received_bpdu(struct stp_port *p, const void *bpdu, size_t bpdu_size) +{ + struct stp *stp = p->stp; + const struct stp_bpdu_header *header; + + if (p->state == STP_DISABLED) { + return; + } + + if (bpdu_size < sizeof(struct stp_bpdu_header)) { + VLOG_WARN("%s: received runt %zu-byte BPDU", stp->name, bpdu_size); + return; + } + + header = bpdu; + if (header->protocol_id != htons(STP_PROTOCOL_ID)) { + VLOG_WARN("%s: received BPDU with unexpected protocol ID %"PRIu16, + stp->name, ntohs(header->protocol_id)); + return; + } + if (header->protocol_version != STP_PROTOCOL_VERSION) { + VLOG_DBG("%s: received BPDU with unexpected protocol version %"PRIu8, + stp->name, header->protocol_version); + } + + switch (header->bpdu_type) { + case STP_TYPE_CONFIG: + if (bpdu_size < sizeof(struct stp_config_bpdu)) { + VLOG_WARN("%s: received config BPDU with invalid size %zu", + stp->name, bpdu_size); + return; + } + stp_received_config_bpdu(stp, p, bpdu); + break; + + case STP_TYPE_TCN: + if (bpdu_size != sizeof(struct stp_tcn_bpdu)) { + VLOG_WARN("%s: received TCN BPDU with invalid size %zu", + stp->name, bpdu_size); + return; + } + stp_received_tcn_bpdu(stp, p); + break; + + default: + VLOG_WARN("%s: received BPDU of unexpected type %"PRIu8, + stp->name, header->bpdu_type); + return; + } +} + +/* Returns the STP entity in which 'p' is nested. */ +struct stp * +stp_port_get_stp(struct stp_port *p) +{ + return p->stp; +} + +/* Returns the index of port 'p' within its bridge. */ +int +stp_port_no(const struct stp_port *p) +{ + struct stp *stp = p->stp; + assert(p >= stp->ports && p < &stp->ports[ARRAY_SIZE(stp->ports)]); + return p - stp->ports; +} + +/* Returns the state of port 'p'. */ +enum stp_state +stp_port_get_state(const struct stp_port *p) +{ + return p->state; +} + +/* Disables STP on port 'p'. */ +void +stp_port_disable(struct stp_port *p) +{ + struct stp *stp = p->stp; + if (p->state != STP_DISABLED) { + bool root = stp_is_root_bridge(stp); + stp_become_designated_port(p); + stp_set_port_state(p, STP_DISABLED); + p->topology_change_ack = false; + p->config_pending = false; + stp_stop_timer(&p->message_age_timer); + stp_stop_timer(&p->forward_delay_timer); + stp_configuration_update(stp); + stp_port_state_selection(stp); + if (stp_is_root_bridge(stp) && !root) { + stp_become_root_bridge(stp); + } + } +} + +/* Enables STP on port 'p'. The port will initially be in "blocking" state. */ +void +stp_port_enable(struct stp_port *p) +{ + if (p->state == STP_DISABLED) { + stp_initialize_port(p, STP_BLOCKING); + stp_port_state_selection(p->stp); + } +} + +/* Sets the priority of port 'p' to 'new_priority'. Lower numerical values + * are interpreted as higher priorities. */ +void +stp_port_set_priority(struct stp_port *p, uint8_t new_priority) +{ + uint16_t new_port_id = (p->port_id & 0xff) | (new_priority << 8); + if (p->port_id != new_port_id) { + struct stp *stp = p->stp; + if (stp_is_designated_port(p)) { + p->designated_port = new_port_id; + } + p->port_id = new_port_id; + if (stp->bridge_id == p->designated_bridge + && p->port_id < p->designated_port) { + stp_become_designated_port(p); + stp_port_state_selection(stp); + } + } +} + +/* Sets the path cost of port 'p' to 'path_cost'. Lower values are generally + * used to indicate faster links. Use stp_port_set_speed() to automatically + * generate a default path cost from a link speed. */ +void +stp_port_set_path_cost(struct stp_port *p, uint16_t path_cost) +{ + if (p->path_cost != path_cost) { + struct stp *stp = p->stp; + p->path_cost = path_cost; + stp_configuration_update(stp); + stp_port_state_selection(stp); + } +} + +/* Sets the path cost of port 'p' based on 'speed' (measured in Mb/s). */ +void +stp_port_set_speed(struct stp_port *p, unsigned int speed) +{ + stp_port_set_path_cost(p, (speed >= 10000 ? 2 /* 10 Gb/s. */ + : speed >= 1000 ? 4 /* 1 Gb/s. */ + : speed >= 100 ? 19 /* 100 Mb/s. */ + : speed >= 16 ? 62 /* 16 Mb/s. */ + : speed >= 10 ? 100 /* 10 Mb/s. */ + : speed >= 4 ? 250 /* 4 Mb/s. */ + : 19)); /* 100 Mb/s (guess). */ +} + +/* Enables topology change detection on port 'p'. */ +void +stp_port_enable_change_detection(struct stp_port *p) +{ + p->change_detection_enabled = true; +} + +/* Disables topology change detection on port 'p'. */ +void +stp_port_disable_change_detection(struct stp_port *p) +{ + p->change_detection_enabled = false; +} + +static void +stp_transmit_config(struct stp_port *p) +{ + struct stp *stp = p->stp; + bool root = stp_is_root_bridge(stp); + if (!root && !stp->root_port) { + return; + } + if (p->hold_timer.active) { + p->config_pending = true; + } else { + struct stp_config_bpdu config; + memset(&config, 0, sizeof config); + config.header.protocol_id = htons(STP_PROTOCOL_ID); + config.header.protocol_version = STP_PROTOCOL_VERSION; + config.header.bpdu_type = STP_TYPE_CONFIG; + config.flags = 0; + if (p->topology_change_ack) { + config.flags |= htons(STP_CONFIG_TOPOLOGY_CHANGE_ACK); + } + if (stp->topology_change) { + config.flags |= htons(STP_CONFIG_TOPOLOGY_CHANGE); + } + config.root_id = htonll(stp->designated_root); + config.root_path_cost = htonl(stp->root_path_cost); + config.bridge_id = htonll(stp->bridge_id); + config.port_id = htons(p->port_id); + if (root) { + config.message_age = htons(0); + } else { + config.message_age = htons(stp->root_port->message_age_timer.value + + MESSAGE_AGE_INCREMENT); + } + config.max_age = htons(stp->max_age); + config.hello_time = htons(stp->hello_time); + config.forward_delay = htons(stp->forward_delay); + if (ntohs(config.message_age) < stp->max_age) { + p->topology_change_ack = false; + p->config_pending = false; + stp_send_bpdu(p, &config, sizeof config); + stp_start_timer(&p->hold_timer, 0); + } + } +} + +static bool +stp_supersedes_port_info(const struct stp_port *p, + const struct stp_config_bpdu *config) +{ + if (ntohll(config->root_id) != p->designated_root) { + return ntohll(config->root_id) < p->designated_root; + } else if (ntohl(config->root_path_cost) != p->designated_cost) { + return ntohl(config->root_path_cost) < p->designated_cost; + } else if (ntohll(config->bridge_id) != p->designated_bridge) { + return ntohll(config->bridge_id) < p->designated_bridge; + } else { + return (ntohll(config->bridge_id) != p->stp->bridge_id + || ntohs(config->port_id) <= p->designated_port); + } +} + +static void +stp_record_config_information(struct stp_port *p, + const struct stp_config_bpdu *config) +{ + p->designated_root = ntohll(config->root_id); + p->designated_cost = ntohl(config->root_path_cost); + p->designated_bridge = ntohll(config->bridge_id); + p->designated_port = ntohs(config->port_id); + stp_start_timer(&p->message_age_timer, ntohs(config->message_age)); +} + +static void +stp_record_config_timeout_values(struct stp *stp, + const struct stp_config_bpdu *config) +{ + stp->max_age = ntohs(config->max_age); + stp->hello_time = ntohs(config->hello_time); + stp->forward_delay = ntohs(config->forward_delay); + stp->topology_change = config->flags & htons(STP_CONFIG_TOPOLOGY_CHANGE); +} + +static bool +stp_is_designated_port(const struct stp_port *p) +{ + return (p->designated_bridge == p->stp->bridge_id + && p->designated_port == p->port_id); +} + +static void +stp_config_bpdu_generation(struct stp *stp) +{ + struct stp_port *p; + + FOR_EACH_ENABLED_PORT (p, stp) { + if (stp_is_designated_port(p)) { + stp_transmit_config(p); + } + } +} + +static void +stp_transmit_tcn(struct stp *stp) +{ + struct stp_port *p = stp->root_port; + struct stp_tcn_bpdu tcn_bpdu; + if (!p) { + return; + } + tcn_bpdu.header.protocol_id = htons(STP_PROTOCOL_ID); + tcn_bpdu.header.protocol_version = STP_PROTOCOL_VERSION; + tcn_bpdu.header.bpdu_type = STP_TYPE_TCN; + stp_send_bpdu(p, &tcn_bpdu, sizeof tcn_bpdu); +} + +static void +stp_configuration_update(struct stp *stp) +{ + stp_root_selection(stp); + stp_designated_port_selection(stp); +} + +static bool +stp_supersedes_root(const struct stp_port *root, const struct stp_port *p) +{ + int p_cost = p->designated_cost + p->path_cost; + int root_cost = root->designated_cost + root->path_cost; + + if (p->designated_root != root->designated_root) { + return p->designated_root < root->designated_root; + } else if (p_cost != root_cost) { + return p_cost < root_cost; + } else if (p->designated_bridge != root->designated_bridge) { + return p->designated_bridge < root->designated_bridge; + } else if (p->designated_port != root->designated_port) { + return p->designated_port < root->designated_port; + } else { + return p->port_id < root->port_id; + } +} + +static void +stp_root_selection(struct stp *stp) +{ + struct stp_port *p, *root; + + root = NULL; + FOR_EACH_ENABLED_PORT (p, stp) { + if (stp_is_designated_port(p) + || p->designated_root >= stp->bridge_id) { + continue; + } + if (root && !stp_supersedes_root(root, p)) { + continue; + } + root = p; + } + stp->root_port = root; + if (!root) { + stp->designated_root = stp->bridge_id; + stp->root_path_cost = 0; + } else { + stp->designated_root = root->designated_root; + stp->root_path_cost = root->designated_cost + root->path_cost; + } +} + +static void +stp_designated_port_selection(struct stp *stp) +{ + struct stp_port *p; + + FOR_EACH_ENABLED_PORT (p, stp) { + if (stp_is_designated_port(p) + || p->designated_root != stp->designated_root + || stp->root_path_cost < p->designated_cost + || (stp->root_path_cost == p->designated_cost + && (stp->bridge_id < p->designated_bridge + || (stp->bridge_id == p->designated_bridge + && p->port_id <= p->designated_port)))) + { + stp_become_designated_port(p); + } + } +} + +static void +stp_become_designated_port(struct stp_port *p) +{ + struct stp *stp = p->stp; + p->designated_root = stp->designated_root; + p->designated_cost = stp->root_path_cost; + p->designated_bridge = stp->bridge_id; + p->designated_port = p->port_id; +} + +static void +stp_port_state_selection(struct stp *stp) +{ + struct stp_port *p; + + FOR_EACH_ENABLED_PORT (p, stp) { + if (p == stp->root_port) { + p->config_pending = false; + p->topology_change_ack = false; + stp_make_forwarding(p); + } else if (stp_is_designated_port(p)) { + stp_stop_timer(&p->message_age_timer); + stp_make_forwarding(p); + } else { + p->config_pending = false; + p->topology_change_ack = false; + stp_make_blocking(p); + } + } +} + +static void +stp_make_forwarding(struct stp_port *p) +{ + if (p->state == STP_BLOCKING) { + stp_set_port_state(p, STP_LISTENING); + stp_start_timer(&p->forward_delay_timer, 0); + } +} + +static void +stp_make_blocking(struct stp_port *p) +{ + if (!(p->state & (STP_DISABLED | STP_BLOCKING))) { + if (p->state & (STP_FORWARDING | STP_LEARNING)) { + if (p->change_detection_enabled) { + stp_topology_change_detection(p->stp); + } + } + stp_set_port_state(p, STP_BLOCKING); + stp_stop_timer(&p->forward_delay_timer); + } +} + +static void +stp_set_port_state(struct stp_port *p, enum stp_state state) +{ + if (state != p->state && !p->state_changed) { + p->state_changed = true; + if (p < p->stp->first_changed_port) { + p->stp->first_changed_port = p; + } + } + p->state = state; +} + +static void +stp_topology_change_detection(struct stp *stp) +{ + if (stp_is_root_bridge(stp)) { + stp->topology_change = true; + stp_start_timer(&stp->topology_change_timer, 0); + } else if (!stp->topology_change_detected) { + stp_transmit_tcn(stp); + stp_start_timer(&stp->tcn_timer, 0); + } + stp->topology_change_detected = true; +} + +static void +stp_topology_change_acknowledged(struct stp *stp) +{ + stp->topology_change_detected = false; + stp_stop_timer(&stp->tcn_timer); +} + +static void +stp_acknowledge_topology_change(struct stp_port *p) +{ + p->topology_change_ack = true; + stp_transmit_config(p); +} + +void +stp_received_config_bpdu(struct stp *stp, struct stp_port *p, + const struct stp_config_bpdu *config) +{ + if (ntohs(config->message_age) >= ntohs(config->max_age)) { + VLOG_WARN("%s: received config BPDU with message age (%u) greater " + "than max age (%u)", + stp->name, + ntohs(config->message_age), ntohs(config->max_age)); + return; + } + if (p->state != STP_DISABLED) { + bool root = stp_is_root_bridge(stp); + if (stp_supersedes_port_info(p, config)) { + stp_record_config_information(p, config); + stp_configuration_update(stp); + stp_port_state_selection(stp); + if (!stp_is_root_bridge(stp) && root) { + stp_stop_timer(&stp->hello_timer); + if (stp->topology_change_detected) { + stp_stop_timer(&stp->topology_change_timer); + stp_transmit_tcn(stp); + stp_start_timer(&stp->tcn_timer, 0); + } + } + if (p == stp->root_port) { + stp_record_config_timeout_values(stp, config); + stp_config_bpdu_generation(stp); + if (config->flags & htons(STP_CONFIG_TOPOLOGY_CHANGE_ACK)) { + stp_topology_change_acknowledged(stp); + } + } + } else if (stp_is_designated_port(p)) { + stp_transmit_config(p); + } + } +} + +void +stp_received_tcn_bpdu(struct stp *stp, struct stp_port *p) +{ + if (p->state != STP_DISABLED) { + if (stp_is_designated_port(p)) { + stp_topology_change_detection(stp); + stp_acknowledge_topology_change(p); + } + } +} + +static void +stp_hello_timer_expiry(struct stp *stp) +{ + stp_config_bpdu_generation(stp); + stp_start_timer(&stp->hello_timer, 0); +} + +static void +stp_message_age_timer_expiry(struct stp_port *p) +{ + struct stp *stp = p->stp; + bool root = stp_is_root_bridge(stp); + stp_become_designated_port(p); + stp_configuration_update(stp); + stp_port_state_selection(stp); + if (stp_is_root_bridge(stp) && !root) { + stp->max_age = stp->bridge_max_age; + stp->hello_time = stp->bridge_hello_time; + stp->forward_delay = stp->bridge_forward_delay; + stp_topology_change_detection(stp); + stp_stop_timer(&stp->tcn_timer); + stp_config_bpdu_generation(stp); + stp_start_timer(&stp->hello_timer, 0); + } +} + +static bool +stp_is_designated_for_some_port(const struct stp *stp) +{ + const struct stp_port *p; + + FOR_EACH_ENABLED_PORT (p, stp) { + if (p->designated_bridge == stp->bridge_id) { + return true; + } + } + return false; +} + +static void +stp_forward_delay_timer_expiry(struct stp_port *p) +{ + if (p->state == STP_LISTENING) { + stp_set_port_state(p, STP_LEARNING); + stp_start_timer(&p->forward_delay_timer, 0); + } else if (p->state == STP_LEARNING) { + stp_set_port_state(p, STP_FORWARDING); + if (stp_is_designated_for_some_port(p->stp)) { + if (p->change_detection_enabled) { + stp_topology_change_detection(p->stp); + } + } + } +} + +static void +stp_tcn_timer_expiry(struct stp *stp) +{ + stp_transmit_tcn(stp); + stp_start_timer(&stp->tcn_timer, 0); +} + +static void +stp_topology_change_timer_expiry(struct stp *stp) +{ + stp->topology_change_detected = false; + stp->topology_change = false; +} + +static void +stp_hold_timer_expiry(struct stp_port *p) +{ + if (p->config_pending) { + stp_transmit_config(p); + } +} + +static void +stp_initialize_port(struct stp_port *p, enum stp_state state) +{ + assert(state & (STP_DISABLED | STP_BLOCKING)); + stp_become_designated_port(p); + stp_set_port_state(p, state); + p->topology_change_ack = false; + p->config_pending = false; + p->change_detection_enabled = true; + stp_stop_timer(&p->message_age_timer); + stp_stop_timer(&p->forward_delay_timer); + stp_stop_timer(&p->hold_timer); +} + +static void +stp_become_root_bridge(struct stp *stp) +{ + stp->max_age = stp->bridge_max_age; + stp->hello_time = stp->bridge_hello_time; + stp->forward_delay = stp->bridge_forward_delay; + stp_topology_change_detection(stp); + stp_stop_timer(&stp->tcn_timer); + stp_config_bpdu_generation(stp); + stp_start_timer(&stp->hello_timer, 0); +} + +static void +stp_start_timer(struct stp_timer *timer, int value) +{ + timer->value = value; + timer->active = true; +} + +static void +stp_stop_timer(struct stp_timer *timer) +{ + timer->active = false; +} + +static bool +stp_timer_expired(struct stp_timer *timer, int elapsed, int timeout) +{ + if (timer->active) { + timer->value += elapsed; + if (timer->value >= timeout) { + timer->active = false; + return true; + } + } + return false; +} + +/* Returns the number of whole STP timer ticks in 'ms' milliseconds. There + * are 256 STP timer ticks per second. */ +static int +ms_to_timer(int ms) +{ + return ms * 0x100 / 1000; +} + +/* Returns the number of leftover milliseconds when 'ms' is converted to STP + * timer ticks. */ +static int +ms_to_timer_remainder(int ms) +{ + return ms * 0x100 % 1000; +} + +/* Returns the number of whole milliseconds in 'timer' STP timer ticks. There + * are 256 STP timer ticks per second. */ +static int +timer_to_ms(int timer) +{ + return timer * 1000 / 0x100; +} + +static int +clamp(int x, int min, int max) +{ + return x < min ? min : x > max ? max : x; +} + +static void +stp_update_bridge_timers(struct stp *stp) +{ + int ht, ma, fd; + + ht = clamp(stp->rq_hello_time, 1000, 10000); + ma = clamp(stp->rq_max_age, MAX(2 * (ht + 1000), 6000), 40000); + fd = clamp(stp->rq_forward_delay, ma / 2 + 1000, 30000); + + stp->bridge_hello_time = ms_to_timer(ht); + stp->bridge_max_age = ms_to_timer(ma); + stp->bridge_forward_delay = ms_to_timer(fd); + + if (stp_is_root_bridge(stp)) { + stp->max_age = stp->bridge_max_age; + stp->hello_time = stp->bridge_hello_time; + stp->forward_delay = stp->bridge_forward_delay; + } +} + +static void +stp_send_bpdu(struct stp_port *p, const void *bpdu, size_t bpdu_size) +{ + struct eth_header *eth; + struct llc_header *llc; + struct ofpbuf *pkt; + + /* Skeleton. */ + pkt = ofpbuf_new(ETH_HEADER_LEN + LLC_HEADER_LEN + bpdu_size); + pkt->l2 = eth = ofpbuf_put_zeros(pkt, sizeof *eth); + llc = ofpbuf_put_zeros(pkt, sizeof *llc); + pkt->l3 = ofpbuf_put(pkt, bpdu, bpdu_size); + + /* 802.2 header. */ + memcpy(eth->eth_dst, stp_eth_addr, ETH_ADDR_LEN); + /* p->stp->send_bpdu() must fill in source address. */ + eth->eth_type = htons(pkt->size - ETH_HEADER_LEN); + + /* LLC header. */ + llc->llc_dsap = STP_LLC_DSAP; + llc->llc_ssap = STP_LLC_SSAP; + llc->llc_cntl = STP_LLC_CNTL; + + p->stp->send_bpdu(pkt, stp_port_no(p), p->stp->aux); +} diff --git a/lib/stp.h b/lib/stp.h new file mode 100644 index 00000000..f29ac003 --- /dev/null +++ b/lib/stp.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef STP_H +#define STP_H 1 + +/* This is an implementation of Spanning Tree Protocol as described in IEEE + * 802.1D-1998, clauses 8 and 9. Section numbers refer to this standard. */ + +#include +#include +#include "compiler.h" +#include "util.h" + +struct ofpbuf; + +/* Ethernet address used as the destination for STP frames. */ +extern const uint8_t stp_eth_addr[6]; + +/* LLC field values used for STP frames. */ +#define STP_LLC_SSAP 0x42 +#define STP_LLC_DSAP 0x42 +#define STP_LLC_CNTL 0x03 + +/* Bridge and port priorities that should be used by default. */ +#define STP_DEFAULT_BRIDGE_PRIORITY 32768 +#define STP_DEFAULT_PORT_PRIORITY 128 + +/* Bridge identifier. Top 16 bits are a priority value (numerically lower + * values are higher priorities). Bottom 48 bits are MAC address of bridge. */ +typedef uint64_t stp_identifier; + +/* Basic STP functionality. */ +#define STP_MAX_PORTS 255 +struct stp *stp_create(const char *name, stp_identifier bridge_id, + void (*send_bpdu)(struct ofpbuf *bpdu, int port_no, + void *aux), + void *aux); +void stp_destroy(struct stp *); +void stp_tick(struct stp *, int ms); +void stp_set_bridge_id(struct stp *, stp_identifier bridge_id); +void stp_set_bridge_priority(struct stp *, uint16_t new_priority); +void stp_set_hello_time(struct stp *, int ms); +void stp_set_max_age(struct stp *, int ms); +void stp_set_forward_delay(struct stp *, int ms); + +/* STP properties. */ +const char *stp_get_name(const struct stp *); +stp_identifier stp_get_bridge_id(const struct stp *); +stp_identifier stp_get_designated_root(const struct stp *); +bool stp_is_root_bridge(const struct stp *); +int stp_get_root_path_cost(const struct stp *); +int stp_get_hello_time(const struct stp *); +int stp_get_max_age(const struct stp *); +int stp_get_forward_delay(const struct stp *); + +/* Obtaining STP ports. */ +struct stp_port *stp_get_port(struct stp *, int port_no); +struct stp_port *stp_get_root_port(struct stp *); +bool stp_get_changed_port(struct stp *, struct stp_port **portp); + +/* State of an STP port. + * + * A port is in exactly one state at any given time, but distinct bits are used + * for states to allow testing for more than one state with a bit mask. */ +enum stp_state { + STP_DISABLED = 1 << 0, /* 8.4.5: Disabled by management. */ + STP_LISTENING = 1 << 1, /* 8.4.2: Not learning or relaying frames. */ + STP_LEARNING = 1 << 2, /* 8.4.3: Learning but not relaying frames. */ + STP_FORWARDING = 1 << 3, /* 8.4.4: Learning and relaying frames. */ + STP_BLOCKING = 1 << 4 /* 8.4.1: Initial boot state. */ +}; +const char *stp_state_name(enum stp_state); +bool stp_forward_in_state(enum stp_state); +bool stp_learn_in_state(enum stp_state); + +void stp_received_bpdu(struct stp_port *, const void *bpdu, size_t bpdu_size); + +struct stp *stp_port_get_stp(struct stp_port *); +int stp_port_no(const struct stp_port *); +enum stp_state stp_port_get_state(const struct stp_port *); +void stp_port_enable(struct stp_port *); +void stp_port_disable(struct stp_port *); +void stp_port_set_priority(struct stp_port *, uint8_t new_priority); +void stp_port_set_path_cost(struct stp_port *, uint16_t path_cost); +void stp_port_set_speed(struct stp_port *, unsigned int speed); +void stp_port_enable_change_detection(struct stp_port *); +void stp_port_disable_change_detection(struct stp_port *); + +#endif /* stp.h */ diff --git a/lib/svec.c b/lib/svec.c new file mode 100644 index 00000000..4f8968d1 --- /dev/null +++ b/lib/svec.c @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "svec.h" +#include +#include +#include +#include +#include "dynamic-string.h" +#include "util.h" + +#define THIS_MODULE VLM_svec +#include "vlog.h" + +void +svec_init(struct svec *svec) +{ + svec->names = NULL; + svec->n = 0; + svec->allocated = 0; +} + +void +svec_clone(struct svec *svec, const struct svec *other) +{ + svec_init(svec); + svec_append(svec, other); +} + +void +svec_destroy(struct svec *svec) +{ + svec_clear(svec); + free(svec->names); +} + +void +svec_clear(struct svec *svec) +{ + size_t i; + + for (i = 0; i < svec->n; i++) { + free(svec->names[i]); + } + svec->n = 0; +} + +void +svec_add(struct svec *svec, const char *name) +{ + svec_add_nocopy(svec, xstrdup(name)); +} + +void +svec_del(struct svec *svec, const char *name) +{ + size_t offset; + + offset = svec_find(svec, name); + if (offset != SIZE_MAX) { + free(svec->names[offset]); + memmove(&svec->names[offset], &svec->names[offset + 1], + sizeof *svec->names * (svec->n - offset - 1)); + svec->n--; + } +} + +static void +svec_expand(struct svec *svec) +{ + if (svec->n >= svec->allocated) { + svec->names = x2nrealloc(svec->names, &svec->allocated, + sizeof *svec->names); + } +} + +void +svec_add_nocopy(struct svec *svec, char *name) +{ + svec_expand(svec); + svec->names[svec->n++] = name; +} + +void +svec_append(struct svec *svec, const struct svec *other) +{ + size_t i; + for (i = 0; i < other->n; i++) { + svec_add(svec, other->names[i]); + } +} + +void +svec_terminate(struct svec *svec) +{ + svec_expand(svec); + svec->names[svec->n] = NULL; +} + +static int +compare_strings(const void *a_, const void *b_) +{ + char *const *a = a_; + char *const *b = b_; + return strcmp(*a, *b); +} + +void +svec_sort(struct svec *svec) +{ + qsort(svec->names, svec->n, sizeof *svec->names, compare_strings); +} + +void +svec_sort_unique(struct svec *svec) +{ + svec_sort(svec); + svec_unique(svec); +} + +void +svec_unique(struct svec *svec) +{ + assert(svec_is_sorted(svec)); + if (svec->n > 1) { + /* This algorithm is lazy and sub-optimal, but it's "obviously correct" + * and asymptotically optimal . */ + struct svec tmp; + size_t i; + + svec_init(&tmp); + svec_add(&tmp, svec->names[0]); + for (i = 1; i < svec->n; i++) { + if (strcmp(svec->names[i - 1], svec->names[i])) { + svec_add(&tmp, svec->names[i]); + } + } + svec_swap(&tmp, svec); + svec_destroy(&tmp); + } +} + +void +svec_compact(struct svec *svec) +{ + size_t i, j; + + for (i = j = 0; i < svec->n; i++) { + if (svec->names[i] != NULL) { + svec->names[j++] = svec->names[i]; + } + } + svec->n = j; +} + +void +svec_diff(const struct svec *a, const struct svec *b, + struct svec *a_only, struct svec *both, struct svec *b_only) +{ + size_t i, j; + + assert(svec_is_sorted(a)); + assert(svec_is_sorted(b)); + if (a_only) { + svec_init(a_only); + } + if (both) { + svec_init(both); + } + if (b_only) { + svec_init(b_only); + } + for (i = j = 0; i < a->n && j < b->n; ) { + int cmp = strcmp(a->names[i], b->names[j]); + if (cmp < 0) { + if (a_only) { + svec_add(a_only, a->names[i]); + } + i++; + } else if (cmp > 0) { + if (b_only) { + svec_add(b_only, b->names[j]); + } + j++; + } else { + if (both) { + svec_add(both, a->names[i]); + } + i++; + j++; + } + } + if (a_only) { + for (; i < a->n; i++) { + svec_add(a_only, a->names[i]); + } + } + if (b_only) { + for (; j < b->n; j++) { + svec_add(b_only, b->names[j]); + } + } +} + +bool +svec_contains(const struct svec *svec, const char *name) +{ + return svec_find(svec, name) != SIZE_MAX; +} + +size_t +svec_find(const struct svec *svec, const char *name) +{ + char **p; + + assert(svec_is_sorted(svec)); + p = bsearch(&name, svec->names, svec->n, sizeof *svec->names, + compare_strings); + return p ? p - svec->names : SIZE_MAX; +} + +bool +svec_is_sorted(const struct svec *svec) +{ + size_t i; + + for (i = 1; i < svec->n; i++) { + if (strcmp(svec->names[i - 1], svec->names[i]) > 0) { + return false; + } + } + return true; +} + +bool +svec_is_unique(const struct svec *svec) +{ + return svec_get_duplicate(svec) == NULL; +} + +const char * +svec_get_duplicate(const struct svec *svec) +{ + assert(svec_is_sorted(svec)); + if (svec->n > 1) { + size_t i; + for (i = 1; i < svec->n; i++) { + if (!strcmp(svec->names[i - 1], svec->names[i])) { + return svec->names[i]; + } + } + } + return NULL; +} + +void +svec_swap(struct svec *a, struct svec *b) +{ + struct svec tmp = *a; + *a = *b; + *b = tmp; +} + +void +svec_print(const struct svec *svec, const char *title) +{ + size_t i; + + printf("%s:\n", title); + for (i = 0; i < svec->n; i++) { + printf("\"%s\"\n", svec->names[i]); + } +} + +/* Breaks 'words' into words at white space, respecting shell-like quoting + * conventions, and appends the words to 'svec'. */ +void +svec_parse_words(struct svec *svec, const char *words) +{ + struct ds word = DS_EMPTY_INITIALIZER; + const char *p, *q; + + for (p = words; *p != '\0'; p = q) { + int quote = 0; + + while (isspace((unsigned char) *p)) { + p++; + } + if (*p == '\0') { + break; + } + + ds_clear(&word); + for (q = p; *q != '\0'; q++) { + if (*q == quote) { + quote = 0; + } else if (*q == '\'' || *q == '"') { + quote = *q; + } else if (*q == '\\' && (!quote || quote == '"')) { + q++; + if (*q == '\0') { + VLOG_WARN("%s: ends in trailing backslash", words); + break; + } + ds_put_char(&word, *q); + } else if (isspace((unsigned char) *q) && !quote) { + q++; + break; + } else { + ds_put_char(&word, *q); + } + } + svec_add(svec, ds_cstr(&word)); + if (quote) { + VLOG_WARN("%s: word ends inside quoted string", words); + } + } + ds_destroy(&word); +} + +bool +svec_equal(const struct svec *a, const struct svec *b) +{ + size_t i; + + if (a->n != b->n) { + return false; + } + for (i = 0; i < a->n; i++) { + if (strcmp(a->names[i], b->names[i])) { + return false; + } + } + return true; +} + +char * +svec_join(const struct svec *svec, + const char *delimiter, const char *terminator) +{ + struct ds ds; + size_t i; + + ds_init(&ds); + for (i = 0; i < svec->n; i++) { + if (i) { + ds_put_cstr(&ds, delimiter); + } + ds_put_cstr(&ds, svec->names[i]); + } + ds_put_cstr(&ds, terminator); + return ds_cstr(&ds); +} + +const char * +svec_back(const struct svec *svec) +{ + assert(svec->n); + return svec->names[svec->n - 1]; +} + +void +svec_pop_back(struct svec *svec) +{ + assert(svec->n); + free(svec->names[--svec->n]); +} diff --git a/lib/svec.h b/lib/svec.h new file mode 100644 index 00000000..4865d2f2 --- /dev/null +++ b/lib/svec.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef SVEC_H +#define SVEC_H 1 + +#include +#include + +struct svec { + char **names; + size_t n; + size_t allocated; +}; + +#define SVEC_EMPTY_INITIALIZER { NULL, 0, 0 } + +void svec_init(struct svec *); +void svec_clone(struct svec *, const struct svec *); +void svec_destroy(struct svec *); +void svec_clear(struct svec *); +void svec_add(struct svec *, const char *); +void svec_add_nocopy(struct svec *, char *); +void svec_del(struct svec *, const char *); +void svec_append(struct svec *, const struct svec *); +void svec_terminate(struct svec *); +void svec_sort(struct svec *); +void svec_sort_unique(struct svec *); +void svec_unique(struct svec *); +void svec_compact(struct svec *); +void svec_diff(const struct svec *a, const struct svec *b, + struct svec *a_only, struct svec *both, struct svec *b_only); +bool svec_contains(const struct svec *, const char *); +size_t svec_find(const struct svec *, const char *); +bool svec_is_sorted(const struct svec *); +bool svec_is_unique(const struct svec *); +const char *svec_get_duplicate(const struct svec *); +void svec_swap(struct svec *a, struct svec *b); +void svec_print(const struct svec *svec, const char *title); +void svec_parse_words(struct svec *svec, const char *words); +bool svec_equal(const struct svec *, const struct svec *); +char *svec_join(const struct svec *, + const char *delimiter, const char *terminator); +const char *svec_back(const struct svec *); +void svec_pop_back(struct svec *); + +#endif /* svec.h */ diff --git a/lib/tag.c b/lib/tag.c new file mode 100644 index 00000000..8a4ee892 --- /dev/null +++ b/lib/tag.c @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "tag.h" +#include +#include "random.h" +#include "type-props.h" +#include "util.h" + +#define N_TAG_BITS (CHAR_BIT * sizeof(tag_type)) +BUILD_ASSERT_DECL(IS_POW2(N_TAG_BITS)); + +#define LOG2_N_TAG_BITS (N_TAG_BITS == 32 ? 5 : N_TAG_BITS == 64 ? 6 : 0) +BUILD_ASSERT_DECL(LOG2_N_TAG_BITS > 0); + +/* Returns a randomly selected tag. */ +tag_type +tag_create_random(void) +{ + int x, y; + do { + uint16_t r = random_uint16(); + x = r & (N_TAG_BITS - 1); + y = r >> (16 - LOG2_N_TAG_BITS); + } while (x == y); + return (1u << x) | (1u << y); +} + +/* Returns a tag deterministically generated from 'seed'. + * + * 'seed' should have data in all of its bits; if it has data only in its + * low-order bits then the resulting tags will be poorly distributed. Use a + * hash function such as hash_bytes() to generate 'seed' if necessary. */ +tag_type +tag_create_deterministic(uint32_t seed) +{ + int x = seed & (N_TAG_BITS - 1); + int y = (seed >> LOG2_N_TAG_BITS) % 31; + y += y >= x; + return (1u << x) | (1u << y); +} + +/* Initializes 'set' as an empty tag set. */ +void +tag_set_init(struct tag_set *set) +{ + memset(set, 0, sizeof *set); +} + +/* Adds 'tag' to 'set'. */ +void +tag_set_add(struct tag_set *set, tag_type tag) +{ + if (tag && (!tag_is_valid(tag) || !tag_set_intersects(set, tag))) { + /* XXX We could do better by finding the set member to which we would + * add the fewest number of 1-bits. This would reduce the amount of + * ambiguity, since e.g. three 1-bits match 3 * 2 / 2 = 3 unique tags + * whereas four 1-bits match 4 * 3 / 2 = 6 unique tags. */ + tag_type *t = &set->tags[set->n++ % TAG_SET_SIZE]; + *t |= tag; + if (*t == TYPE_MAXIMUM(tag_type)) { + set->tags[0] = *t; + } + + set->total |= tag; + } +} + diff --git a/lib/tag.h b/lib/tag.h new file mode 100644 index 00000000..2002e5a3 --- /dev/null +++ b/lib/tag.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef TAG_H +#define TAG_H 1 + +#include +#include +#include +#include "util.h" + +/* + * Tagging support. + * + * A 'tag' represents an arbitrary category. Currently, tags are used to + * represent categories of flows and in particular the dependencies for a flow + * switching decision. For example, if a flow's output port is based on + * knowledge that source MAC 00:02:e3:0f:80:a4 is on eth0, then a tag that + * represents that dependency is attached to that flow in the flowtracking hash + * table. + * + * As this example shows, the universe of possible categories is very large, + * and even the number of categories that are in use at a given time can be + * very large. This means that keeping track of category membership via + * conventional means (lists, bitmaps, etc.) is likely to be expensive. + * + * Tags are actually implemented via a "superimposed coding", as discussed in + * Knuth TAOCP v.3 section 6.5 "Retrieval on Secondary Keys". A tag is an + * unsigned integer in which exactly 2 bits are set to 1 and the rest set to 0. + * For 32-bit integers (as currently used) there are 32 * 31 / 2 = 496 unique + * tags; for 64-bit integers there are 64 * 63 / 2 = 2,016. + * + * Because there is a small finite number of unique tags, tags must collide + * after some number of them have been created. In practice we generally + * create tags by choosing bits randomly. + * + * The key property of tags is that we can combine them without increasing the + * amount of data required using bitwise-OR, since the result has the 1-bits + * from both tags set. The necessary tradeoff is that the result is even more + * ambiguous: if combining two tags yields a value with 4 bits set to 1, then + * the result value will test as having 4 * 3 / 2 = 6 unique tags, not just the + * two tags that we combined. + * + * The upshot is this: a value that is the bitwise-OR combination of a number + * of tags will always include the tags that were combined, but it may contain + * any number of additional tags as well. This is acceptable for flowtracking, + * since we want to be sure that we catch every flow that needs to be + * revalidated, but it is OK if we revalidate a few extra flows as well. + * + * If we combine too many tags, then the result will have every bit set, so + * that it will test as including every tag. Fortunately, this is not a big + * problem for us: although there are many flows overall, each individual flow + * belongs only to a small number of categories. + */ + +/* Represents a tag, or the combination of 0 or more tags. */ +typedef uint32_t tag_type; + +tag_type tag_create_random(void); +tag_type tag_create_deterministic(uint32_t seed); +static inline bool tag_intersects(tag_type, tag_type); +static inline bool tag_is_valid(tag_type); + +/* Returns true if 'a' and 'b' have at least one tag in common, + * false if their set of tags is disjoint. . */ +static inline bool +tag_intersects(tag_type a, tag_type b) +{ + tag_type x = a & b; + return (x & (x - 1)) != 0; +} + +/* Returns true if 'tag' is a valid tag, that is, if exactly two bits are set + * to 1 and the rest to 0. Otherwise, returns false. */ +static inline bool +tag_is_valid(tag_type tag) +{ + tag_type x = tag & (tag - 1); + tag_type y = x & (x - 1); + return x && !y; +} + +/* + * A tag set accumulates tags with reduced ambiguity compared to a single tag. + * The flow tracking uses tag sets to keep track of tags that need to + * revalidated after a number of packets have been processed. + */ +#define TAG_SET_SIZE 4 +struct tag_set { + tag_type total; + tag_type tags[TAG_SET_SIZE]; + unsigned int n; +}; + +void tag_set_init(struct tag_set *); +void tag_set_add(struct tag_set *, tag_type); +static inline bool tag_set_is_empty(const struct tag_set *); +static inline bool tag_set_intersects(const struct tag_set *, tag_type); + +/* Returns true if 'set' will match no tags at all, + * false if it will match at least one tag. */ +static inline bool +tag_set_is_empty(const struct tag_set *set) +{ + return !set->n; +} + +/* Returns true if any of the tags in 'tags' are also in 'set', + * false if the intersection is empty. */ +static inline bool +tag_set_intersects(const struct tag_set *set, tag_type tags) +{ + BUILD_ASSERT_DECL(TAG_SET_SIZE == 4); + return (tag_intersects(set->total, tags) + && (tag_intersects(set->tags[0], tags) + || tag_intersects(set->tags[1], tags) + || tag_intersects(set->tags[2], tags) + || tag_intersects(set->tags[3], tags))); +} + +#endif /* tag.h */ diff --git a/lib/timeval.c b/lib/timeval.c new file mode 100644 index 00000000..b76993f9 --- /dev/null +++ b/lib/timeval.c @@ -0,0 +1,305 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "timeval.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include "coverage.h" +#include "fatal-signal.h" +#include "util.h" + +#include "vlog.h" +#define THIS_MODULE VLM_timeval + +/* Initialized? */ +static bool inited; + +/* Has a timer tick occurred? */ +static volatile sig_atomic_t tick; + +/* The current time, as of the last refresh. */ +static struct timeval now; + +/* Time at which to die with SIGALRM (if not TIME_MIN). */ +static time_t deadline = TIME_MIN; + +static void sigalrm_handler(int); +static void refresh_if_ticked(void); +static time_t time_add(time_t, time_t); +static void block_sigalrm(sigset_t *); +static void unblock_sigalrm(const sigset_t *); +static void log_poll_interval(long long int last_wakeup, + const struct rusage *last_rusage); +static long long int timeval_to_msec(const struct timeval *); + +/* Initializes the timetracking module. */ +void +time_init(void) +{ + struct sigaction sa; + struct itimerval itimer; + + if (inited) { + return; + } + + inited = true; + gettimeofday(&now, NULL); + tick = false; + + /* Set up signal handler. */ + memset(&sa, 0, sizeof sa); + sa.sa_handler = sigalrm_handler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_RESTART; + if (sigaction(SIGALRM, &sa, NULL)) { + ovs_fatal(errno, "sigaction(SIGALRM) failed"); + } + + /* Set up periodic timer. */ + itimer.it_interval.tv_sec = 0; + itimer.it_interval.tv_usec = TIME_UPDATE_INTERVAL * 1000; + itimer.it_value = itimer.it_interval; + if (setitimer(ITIMER_REAL, &itimer, NULL)) { + ovs_fatal(errno, "setitimer failed"); + } +} + +/* Forces a refresh of the current time from the kernel. It is not usually + * necessary to call this function, since the time will be refreshed + * automatically at least every TIME_UPDATE_INTERVAL milliseconds. */ +void +time_refresh(void) +{ + gettimeofday(&now, NULL); + tick = false; +} + +/* Returns the current time, in seconds. */ +time_t +time_now(void) +{ + refresh_if_ticked(); + return now.tv_sec; +} + +/* Returns the current time, in ms (within TIME_UPDATE_INTERVAL ms). */ +long long int +time_msec(void) +{ + refresh_if_ticked(); + return timeval_to_msec(&now); +} + +/* Stores the current time, accurate within TIME_UPDATE_INTERVAL ms, into + * '*tv'. */ +void +time_timeval(struct timeval *tv) +{ + refresh_if_ticked(); + *tv = now; +} + +/* Configures the program to die with SIGALRM 'secs' seconds from now, if + * 'secs' is nonzero, or disables the feature if 'secs' is zero. */ +void +time_alarm(unsigned int secs) +{ + sigset_t oldsigs; + + time_init(); + block_sigalrm(&oldsigs); + deadline = secs ? time_add(time_now(), secs) : TIME_MIN; + unblock_sigalrm(&oldsigs); +} + +/* Like poll(), except: + * + * - On error, returns a negative error code (instead of setting errno). + * + * - If interrupted by a signal, retries automatically until the original + * 'timeout' expires. (Because of this property, this function will + * never return -EINTR.) + * + * - As a side effect, refreshes the current time (like time_refresh()). + */ +int +time_poll(struct pollfd *pollfds, int n_pollfds, int timeout) +{ + static long long int last_wakeup; + static struct rusage last_rusage; + long long int start; + sigset_t oldsigs; + bool blocked; + int retval; + + time_refresh(); + log_poll_interval(last_wakeup, &last_rusage); + coverage_clear(); + start = time_msec(); + blocked = false; + for (;;) { + int time_left; + if (timeout > 0) { + long long int elapsed = time_msec() - start; + time_left = timeout >= elapsed ? timeout - elapsed : 0; + } else { + time_left = timeout; + } + + retval = poll(pollfds, n_pollfds, time_left); + if (retval < 0) { + retval = -errno; + } + time_refresh(); + if (retval != -EINTR) { + break; + } + + if (!blocked && deadline == TIME_MIN) { + block_sigalrm(&oldsigs); + blocked = true; + } + } + if (blocked) { + unblock_sigalrm(&oldsigs); + } + last_wakeup = time_msec(); + getrusage(RUSAGE_SELF, &last_rusage); + return retval; +} + +/* Returns the sum of 'a' and 'b', with saturation on overflow or underflow. */ +static time_t +time_add(time_t a, time_t b) +{ + return (a >= 0 + ? (b > TIME_MAX - a ? TIME_MAX : a + b) + : (b < TIME_MIN - a ? TIME_MIN : a + b)); +} + +static void +sigalrm_handler(int sig_nr) +{ + tick = true; + if (deadline != TIME_MIN && time(0) > deadline) { + fatal_signal_handler(sig_nr); + } +} + +static void +refresh_if_ticked(void) +{ + assert(inited); + if (tick) { + time_refresh(); + } +} + +static void +block_sigalrm(sigset_t *oldsigs) +{ + sigset_t sigalrm; + sigemptyset(&sigalrm); + sigaddset(&sigalrm, SIGALRM); + if (sigprocmask(SIG_BLOCK, &sigalrm, oldsigs)) { + ovs_fatal(errno, "sigprocmask"); + } +} + +static void +unblock_sigalrm(const sigset_t *oldsigs) +{ + if (sigprocmask(SIG_SETMASK, oldsigs, NULL)) { + ovs_fatal(errno, "sigprocmask"); + } +} + +static long long int +timeval_to_msec(const struct timeval *tv) +{ + return (long long int) tv->tv_sec * 1000 + tv->tv_usec / 1000; +} + +static long long int +timeval_diff_msec(const struct timeval *a, const struct timeval *b) +{ + return timeval_to_msec(a) - timeval_to_msec(b); +} + +static void +log_poll_interval(long long int last_wakeup, const struct rusage *last_rusage) +{ + static unsigned int mean_interval; /* In 16ths of a millisecond. */ + static unsigned int n_samples; + + long long int now; + unsigned int interval; /* In 16ths of a millisecond. */ + + /* Compute interval from last wakeup to now in 16ths of a millisecond, + * capped at 10 seconds (16000 in this unit). */ + now = time_msec(); + interval = MIN(10000, now - last_wakeup) << 4; + + /* Warn if we took too much time between polls. */ + if (n_samples > 10 && interval > mean_interval * 8) { + struct rusage rusage; + + getrusage(RUSAGE_SELF, &rusage); + VLOG_WARN("%u ms poll interval (%lld ms user, %lld ms system) " + "is over %u times the weighted mean interval %u ms " + "(%u samples)", + (interval + 8) / 16, + timeval_diff_msec(&rusage.ru_utime, &last_rusage->ru_utime), + timeval_diff_msec(&rusage.ru_stime, &last_rusage->ru_stime), + interval / mean_interval, + (mean_interval + 8) / 16, n_samples); + if (rusage.ru_minflt > last_rusage->ru_minflt + || rusage.ru_majflt > last_rusage->ru_majflt) { + VLOG_WARN("faults: %ld minor, %ld major", + rusage.ru_minflt - last_rusage->ru_minflt, + rusage.ru_majflt - last_rusage->ru_majflt); + } + if (rusage.ru_inblock > last_rusage->ru_inblock + || rusage.ru_oublock > last_rusage->ru_oublock) { + VLOG_WARN("disk: %ld reads, %ld writes", + rusage.ru_inblock - last_rusage->ru_inblock, + rusage.ru_oublock - last_rusage->ru_oublock); + } + if (rusage.ru_nvcsw > last_rusage->ru_nvcsw + || rusage.ru_nivcsw > last_rusage->ru_nivcsw) { + VLOG_WARN("context switches: %ld voluntary, %ld involuntary", + rusage.ru_nvcsw - last_rusage->ru_nvcsw, + rusage.ru_nivcsw - last_rusage->ru_nivcsw); + } + coverage_log(VLL_WARN); + } + + /* Update exponentially weighted moving average. With these parameters, a + * given value decays to 1% of its value in about 100 time steps. */ + if (n_samples++) { + mean_interval = (mean_interval * 122 + interval * 6 + 64) / 128; + } else { + mean_interval = interval; + } +} diff --git a/lib/timeval.h b/lib/timeval.h new file mode 100644 index 00000000..8e10ad36 --- /dev/null +++ b/lib/timeval.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef TIMEVAL_H +#define TIMEVAL_H 1 + +#include +#include "type-props.h" +#include "util.h" + +struct pollfd; +struct timeval; + +/* POSIX allows floating-point time_t, but we don't support it. */ +BUILD_ASSERT_DECL(TYPE_IS_INTEGER(time_t)); + +/* We do try to cater to unsigned time_t, but I want to know about it if we + * ever encounter such a platform. */ +BUILD_ASSERT_DECL(TYPE_IS_SIGNED(time_t)); + +#define TIME_MAX TYPE_MAXIMUM(time_t) +#define TIME_MIN TYPE_MINIMUM(time_t) + +/* Interval between updates to the time reported by time_gettimeofday(), in ms. + * This should not be adjusted much below 10 ms or so with the current + * implementation, or too much time will be wasted in signal handlers and calls + * to time(0). */ +#define TIME_UPDATE_INTERVAL 100 + +void time_init(void); +void time_refresh(void); +time_t time_now(void); +long long int time_msec(void); +void time_timeval(struct timeval *); +void time_alarm(unsigned int secs); +int time_poll(struct pollfd *, int n_pollfds, int timeout); + +#endif /* timeval.h */ diff --git a/lib/type-props.h b/lib/type-props.h new file mode 100644 index 00000000..c718cf12 --- /dev/null +++ b/lib/type-props.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef TYPE_PROPS_H +#define TYPE_PROPS_H 1 + +#include + +#define TYPE_IS_INTEGER(TYPE) ((TYPE) 1.5 == (TYPE) 1) +#define TYPE_IS_SIGNED(TYPE) ((TYPE) 0 > (TYPE) -1) +#define TYPE_VALUE_BITS(TYPE) (sizeof(TYPE) * CHAR_BIT - TYPE_IS_SIGNED(TYPE)) +#define TYPE_MINIMUM(TYPE) (TYPE_IS_SIGNED(TYPE) \ + ? ~(TYPE)0 << TYPE_VALUE_BITS(TYPE) \ + : 0) +#define TYPE_MAXIMUM(TYPE) (TYPE_IS_SIGNED(TYPE) \ + ? ~(~(TYPE)0 << TYPE_VALUE_BITS(TYPE)) \ + : (TYPE)-1) + +#endif /* type-props.h */ diff --git a/lib/unixctl.c b/lib/unixctl.c new file mode 100644 index 00000000..42b6eeff --- /dev/null +++ b/lib/unixctl.c @@ -0,0 +1,592 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "unixctl.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "coverage.h" +#include "dirs.h" +#include "dynamic-string.h" +#include "fatal-signal.h" +#include "list.h" +#include "ofpbuf.h" +#include "poll-loop.h" +#include "shash.h" +#include "socket-util.h" +#include "util.h" + +#ifndef SCM_CREDENTIALS +#include +#endif + +#define THIS_MODULE VLM_unixctl +#include "vlog.h" + +struct unixctl_command { + void (*cb)(struct unixctl_conn *, const char *args); +}; + +struct unixctl_conn { + struct list node; + int fd; + + enum { S_RECV, S_PROCESS, S_SEND } state; + struct ofpbuf in; + struct ds out; + size_t out_pos; +}; + +/* Server for control connection. */ +struct unixctl_server { + char *path; + int fd; + struct list conns; +}; + +/* Client for control connection. */ +struct unixctl_client { + char *connect_path; + char *bind_path; + FILE *stream; +}; + +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); + +static struct shash commands = SHASH_INITIALIZER(&commands); + +static void +unixctl_help(struct unixctl_conn *conn, const char *args UNUSED) +{ + struct ds ds = DS_EMPTY_INITIALIZER; + struct shash_node *node; + + ds_put_cstr(&ds, "The available commands are:\n"); + HMAP_FOR_EACH (node, struct shash_node, node, &commands.map) { + ds_put_format(&ds, "\t%s\n", node->name); + } + unixctl_command_reply(conn, 214, ds_cstr(&ds)); + ds_destroy(&ds); +} + +void +unixctl_command_register(const char *name, + void (*cb)(struct unixctl_conn *, const char *args)) +{ + struct unixctl_command *command; + + assert(!shash_find_data(&commands, name) + || shash_find_data(&commands, name) == cb); + command = xmalloc(sizeof *command); + command->cb = cb; + shash_add(&commands, name, command); +} + +static const char * +translate_reply_code(int code) +{ + switch (code) { + case 200: return "OK"; + case 201: return "Created"; + case 202: return "Accepted"; + case 204: return "No Content"; + case 211: return "System Status"; + case 214: return "Help"; + case 400: return "Bad Request"; + case 401: return "Unauthorized"; + case 403: return "Forbidden"; + case 404: return "Not Found"; + case 500: return "Internal Server Error"; + case 501: return "Invalid Argument"; + case 503: return "Service Unavailable"; + default: return "Unknown"; + } +} + +void +unixctl_command_reply(struct unixctl_conn *conn, + int code, const char *body) +{ + struct ds *out = &conn->out; + + COVERAGE_INC(unixctl_replied); + assert(conn->state == S_PROCESS); + conn->state = S_SEND; + conn->out_pos = 0; + + ds_clear(out); + ds_put_format(out, "%03d %s\n", code, translate_reply_code(code)); + if (body) { + const char *p; + for (p = body; *p != '\0'; ) { + size_t n = strcspn(p, "\n"); + + if (*p == '.') { + ds_put_char(out, '.'); + } + ds_put_buffer(out, p, n); + ds_put_char(out, '\n'); + p += n; + if (*p == '\n') { + p++; + } + } + } + ds_put_cstr(out, ".\n"); +} + +/* Creates a unixctl server listening on 'path', which may be: + * + * - NULL, in which case /..ctl is used. + * + * - A name that does not start with '/', in which case it is put in + * . + * + * - An absolute path (starting with '/') that gives the exact name of + * the Unix domain socket to listen on. + * + * A program that (optionally) daemonizes itself should call this function + * *after* daemonization, so that the socket name contains the pid of the + * daemon instead of the pid of the program that exited. (Otherwise, + * "ovs-appctl --target .pid" will fail.) + * + * Returns 0 if successful, otherwise a positive errno value. If successful, + * sets '*serverp' to the new unixctl_server, otherwise to NULL. */ +int +unixctl_server_create(const char *path, struct unixctl_server **serverp) +{ + struct unixctl_server *server; + int error; + + unixctl_command_register("help", unixctl_help); + + server = xmalloc(sizeof *server); + list_init(&server->conns); + + if (path) { + if (path[0] == '/') { + server->path = xstrdup(path); + } else { + server->path = xasprintf("%s/%s", ovs_rundir, path); + } + } else { + server->path = xasprintf("%s/%s.%ld.ctl", ovs_rundir, + program_name, (long int) getpid()); + } + + server->fd = make_unix_socket(SOCK_STREAM, true, false, server->path, + NULL); + if (server->fd < 0) { + error = -server->fd; + fprintf(stderr, "Could not initialize control socket %s (%s)\n", + server->path, strerror(error)); + goto error; + } + + if (chmod(server->path, S_IRUSR | S_IWUSR) < 0) { + error = errno; + fprintf(stderr, "Failed to chmod control socket %s (%s)\n", + server->path, strerror(error)); + goto error; + } + + if (listen(server->fd, 10) < 0) { + error = errno; + fprintf(stderr, "Failed to listen on control socket %s (%s)\n", + server->path, strerror(error)); + goto error; + } + + *serverp = server; + return 0; + +error: + if (server->fd >= 0) { + close(server->fd); + } + free(server->path); + free(server); + *serverp = NULL; + return error; +} + +static void +new_connection(struct unixctl_server *server, int fd) +{ + struct unixctl_conn *conn; + + set_nonblocking(fd); + + conn = xmalloc(sizeof *conn); + list_push_back(&server->conns, &conn->node); + conn->fd = fd; + conn->state = S_RECV; + ofpbuf_init(&conn->in, 128); + ds_init(&conn->out); + conn->out_pos = 0; +} + +static int +run_connection_output(struct unixctl_conn *conn) +{ + while (conn->out_pos < conn->out.length) { + size_t bytes_written; + int error; + + error = write_fully(conn->fd, conn->out.string + conn->out_pos, + conn->out.length - conn->out_pos, &bytes_written); + conn->out_pos += bytes_written; + if (error) { + return error; + } + } + conn->state = S_RECV; + return 0; +} + +static void +process_command(struct unixctl_conn *conn, char *s) +{ + struct unixctl_command *command; + size_t name_len; + char *name, *args; + + COVERAGE_INC(unixctl_received); + conn->state = S_PROCESS; + + name = s; + name_len = strcspn(name, " "); + args = name + name_len; + args += strspn(args, " "); + name[name_len] = '\0'; + + command = shash_find_data(&commands, name); + if (command) { + command->cb(conn, args); + } else { + char *msg = xasprintf("\"%s\" is not a valid command", name); + unixctl_command_reply(conn, 400, msg); + free(msg); + } +} + +static int +run_connection_input(struct unixctl_conn *conn) +{ + for (;;) { + size_t bytes_read; + char *newline; + int error; + + newline = memchr(conn->in.data, '\n', conn->in.size); + if (newline) { + char *command = conn->in.data; + size_t n = newline - command + 1; + + if (n > 0 && newline[-1] == '\r') { + newline--; + } + *newline = '\0'; + + process_command(conn, command); + + ofpbuf_pull(&conn->in, n); + if (!conn->in.size) { + ofpbuf_clear(&conn->in); + } + return 0; + } + + ofpbuf_prealloc_tailroom(&conn->in, 128); + error = read_fully(conn->fd, ofpbuf_tail(&conn->in), + ofpbuf_tailroom(&conn->in), &bytes_read); + conn->in.size += bytes_read; + if (conn->in.size > 65536) { + VLOG_WARN_RL(&rl, "excess command length, killing connection"); + return EPROTO; + } + if (error) { + if (error == EAGAIN || error == EWOULDBLOCK) { + if (!bytes_read) { + return EAGAIN; + } + } else { + if (error != EOF || conn->in.size != 0) { + VLOG_WARN_RL(&rl, "read failed: %s", + (error == EOF + ? "connection dropped mid-command" + : strerror(error))); + } + return error; + } + } + } +} + +static int +run_connection(struct unixctl_conn *conn) +{ + int old_state; + do { + int error; + + old_state = conn->state; + switch (conn->state) { + case S_RECV: + error = run_connection_input(conn); + break; + + case S_PROCESS: + error = 0; + break; + + case S_SEND: + error = run_connection_output(conn); + break; + + default: + NOT_REACHED(); + } + if (error) { + return error; + } + } while (conn->state != old_state); + return 0; +} + +static void +kill_connection(struct unixctl_conn *conn) +{ + list_remove(&conn->node); + ofpbuf_uninit(&conn->in); + ds_destroy(&conn->out); + close(conn->fd); + free(conn); +} + +void +unixctl_server_run(struct unixctl_server *server) +{ + struct unixctl_conn *conn, *next; + int i; + + for (i = 0; i < 10; i++) { + int fd = accept(server->fd, NULL, NULL); + if (fd < 0) { + if (errno != EAGAIN && errno != EWOULDBLOCK) { + VLOG_WARN_RL(&rl, "accept failed: %s", strerror(errno)); + } + break; + } + new_connection(server, fd); + } + + LIST_FOR_EACH_SAFE (conn, next, + struct unixctl_conn, node, &server->conns) { + int error = run_connection(conn); + if (error && error != EAGAIN) { + kill_connection(conn); + } + } +} + +void +unixctl_server_wait(struct unixctl_server *server) +{ + struct unixctl_conn *conn; + + poll_fd_wait(server->fd, POLLIN); + LIST_FOR_EACH (conn, struct unixctl_conn, node, &server->conns) { + if (conn->state == S_RECV) { + poll_fd_wait(conn->fd, POLLIN); + } else if (conn->state == S_SEND) { + poll_fd_wait(conn->fd, POLLOUT); + } + } +} + +/* Destroys 'server' and stops listening for connections. */ +void +unixctl_server_destroy(struct unixctl_server *server) +{ + if (server) { + struct unixctl_conn *conn, *next; + + LIST_FOR_EACH_SAFE (conn, next, + struct unixctl_conn, node, &server->conns) { + kill_connection(conn); + } + + close(server->fd); + unlink(server->path); + fatal_signal_remove_file_to_unlink(server->path); + free(server->path); + free(server); + } +} + +/* Connects to a Vlog server socket. 'path' should be the name of a Vlog + * server socket. If it does not start with '/', it will be prefixed with + * ovs_rundir (e.g. /var/run). + * + * Returns 0 if successful, otherwise a positive errno value. If successful, + * sets '*clientp' to the new unixctl_client, otherwise to NULL. */ +int +unixctl_client_create(const char *path, struct unixctl_client **clientp) +{ + static int counter; + struct unixctl_client *client; + int error; + int fd = -1; + + /* Determine location. */ + client = xmalloc(sizeof *client); + if (path[0] == '/') { + client->connect_path = xstrdup(path); + } else { + client->connect_path = xasprintf("%s/%s", ovs_rundir, path); + } + client->bind_path = xasprintf("/tmp/vlog.%ld.%d", + (long int) getpid(), counter++); + + /* Open socket. */ + fd = make_unix_socket(SOCK_STREAM, false, false, + client->bind_path, client->connect_path); + if (fd < 0) { + error = -fd; + goto error; + } + + /* Bind socket to stream. */ + client->stream = fdopen(fd, "r+"); + if (!client->stream) { + error = errno; + VLOG_WARN("%s: fdopen failed (%s)", + client->connect_path, strerror(error)); + goto error; + } + *clientp = client; + return 0; + +error: + if (fd >= 0) { + close(fd); + } + free(client->connect_path); + free(client->bind_path); + free(client); + *clientp = NULL; + return error; +} + +/* Destroys 'client'. */ +void +unixctl_client_destroy(struct unixctl_client *client) +{ + if (client) { + unlink(client->bind_path); + fatal_signal_remove_file_to_unlink(client->bind_path); + free(client->bind_path); + free(client->connect_path); + fclose(client->stream); + free(client); + } +} + +/* Sends 'request' to the server socket and waits for a reply. Returns 0 if + * successful, otherwise to a positive errno value. If successful, sets + * '*reply' to the reply, which the caller must free, otherwise to NULL. */ +int +unixctl_client_transact(struct unixctl_client *client, + const char *request, + int *reply_code, char **reply_body) +{ + struct ds line = DS_EMPTY_INITIALIZER; + struct ds reply = DS_EMPTY_INITIALIZER; + int error; + + /* Send 'request' to server. Add a new-line if 'request' didn't end in + * one. */ + fputs(request, client->stream); + if (request[0] == '\0' || request[strlen(request) - 1] != '\n') { + putc('\n', client->stream); + } + if (ferror(client->stream)) { + VLOG_WARN("error sending request to %s: %s", + client->connect_path, strerror(errno)); + return errno; + } + + /* Wait for response. */ + *reply_code = -1; + for (;;) { + const char *s; + + error = ds_get_line(&line, client->stream); + if (error) { + VLOG_WARN("error reading reply from %s: %s", + client->connect_path, + (error == EOF ? "unexpected end of file" + : strerror(error))); + goto error; + } + + s = ds_cstr(&line); + if (*reply_code == -1) { + if (!isdigit(s[0]) || !isdigit(s[1]) || !isdigit(s[2])) { + VLOG_WARN("reply from %s does not start with 3-digit code", + client->connect_path); + error = EPROTO; + goto error; + } + sscanf(s, "%3d", reply_code); + } else { + if (s[0] == '.') { + if (s[1] == '\0') { + break; + } + s++; + } + ds_put_cstr(&reply, s); + ds_put_char(&reply, '\n'); + } + } + *reply_body = ds_cstr(&reply); + ds_destroy(&line); + return 0; + +error: + ds_destroy(&line); + ds_destroy(&reply); + *reply_code = 0; + *reply_body = NULL; + return error == EOF ? EPROTO : error; +} + +/* Returns the path of the server socket to which 'client' is connected. The + * caller must not modify or free the returned string. */ +const char * +unixctl_client_target(const struct unixctl_client *client) +{ + return client->connect_path; +} diff --git a/lib/unixctl.h b/lib/unixctl.h new file mode 100644 index 00000000..338eecfe --- /dev/null +++ b/lib/unixctl.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef UNIXCTL_H +#define UNIXCTL_H 1 + +/* Server for Unix domain socket control connection. */ +struct unixctl_server; +int unixctl_server_create(const char *path, struct unixctl_server **); +void unixctl_server_run(struct unixctl_server *); +void unixctl_server_wait(struct unixctl_server *); +void unixctl_server_destroy(struct unixctl_server *); + +/* Client for Unix domain socket control connection. */ +struct unixctl_client; +int unixctl_client_create(const char *path, struct unixctl_client **); +void unixctl_client_destroy(struct unixctl_client *); +int unixctl_client_transact(struct unixctl_client *, + const char *request, + int *reply_code, char **reply_body); +const char *unixctl_client_target(const struct unixctl_client *); + +/* Command registration. */ +struct unixctl_conn; +void unixctl_command_register(const char *name, + void (*cb)(struct unixctl_conn *, + const char *args)); +void unixctl_command_reply(struct unixctl_conn *, int code, + const char *body); + +#endif /* unixctl.h */ diff --git a/lib/util.c b/lib/util.c new file mode 100644 index 00000000..edc24b36 --- /dev/null +++ b/lib/util.c @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "util.h" +#include +#include +#include +#include +#include +#include "coverage.h" + +const char *program_name; + +void +out_of_memory(void) +{ + ovs_fatal(0, "virtual memory exhausted"); +} + +void * +xcalloc(size_t count, size_t size) +{ + void *p = count && size ? calloc(count, size) : malloc(1); + COVERAGE_INC(util_xalloc); + if (p == NULL) { + out_of_memory(); + } + return p; +} + +void * +xmalloc(size_t size) +{ + void *p = malloc(size ? size : 1); + COVERAGE_INC(util_xalloc); + if (p == NULL) { + out_of_memory(); + } + return p; +} + +void * +xrealloc(void *p, size_t size) +{ + p = realloc(p, size ? size : 1); + COVERAGE_INC(util_xalloc); + if (p == NULL) { + out_of_memory(); + } + return p; +} + +void * +xmemdup(const void *p_, size_t size) +{ + void *p = xmalloc(size); + memcpy(p, p_, size); + return p; +} + +char * +xmemdup0(const char *p_, size_t length) +{ + char *p = xmalloc(length + 1); + memcpy(p, p_, length); + p[length] = '\0'; + return p; +} + +char * +xstrdup(const char *s) +{ + return xmemdup0(s, strlen(s)); +} + +char * +xvasprintf(const char *format, va_list args) +{ + va_list args2; + size_t needed; + char *s; + + va_copy(args2, args); + needed = vsnprintf(NULL, 0, format, args); + + s = xmalloc(needed + 1); + + vsnprintf(s, needed + 1, format, args2); + va_end(args2); + + return s; +} + +void * +x2nrealloc(void *p, size_t *n, size_t s) +{ + *n = *n == 0 ? 1 : 2 * *n; + return xrealloc(p, *n * s); +} + +char * +xasprintf(const char *format, ...) +{ + va_list args; + char *s; + + va_start(args, format); + s = xvasprintf(format, args); + va_end(args); + + return s; +} + +void +ovs_strlcpy(char *dst, const char *src, size_t size) +{ + if (size > 0) { + size_t n = strlen(src); + size_t n_copy = MIN(n, size - 1); + memcpy(dst, src, n_copy); + dst[n_copy] = '\0'; + } +} + +void +ovs_fatal(int err_no, const char *format, ...) +{ + va_list args; + + fprintf(stderr, "%s: ", program_name); + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + if (err_no != 0) + fprintf(stderr, " (%s)", strerror(err_no)); + putc('\n', stderr); + + exit(EXIT_FAILURE); +} + +void +ovs_error(int err_no, const char *format, ...) +{ + int save_errno = errno; + va_list args; + + fprintf(stderr, "%s: ", program_name); + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + if (err_no != 0) + fprintf(stderr, " (%s)", strerror(err_no)); + putc('\n', stderr); + + errno = save_errno; +} + +/* Sets program_name based on 'argv0'. Should be called at the beginning of + * main(), as "set_program_name(argv[0]);". */ +void set_program_name(const char *argv0) +{ + const char *slash = strrchr(argv0, '/'); + program_name = slash ? slash + 1 : argv0; +} + +/* Print the version information for the program. */ +void +ovs_print_version(char *date, char *time, + uint8_t min_ofp, uint8_t max_ofp) +{ + printf("%s (Open vSwitch) "VERSION BUILDNR"\n", program_name); + printf("Compiled %s %s\n", date, time); + if (min_ofp || max_ofp) { + printf("OpenFlow versions %#x:%#x\n", min_ofp, max_ofp); + } +} + +/* Writes the 'size' bytes in 'buf' to 'stream' as hex bytes arranged 16 per + * line. Numeric offsets are also included, starting at 'ofs' for the first + * byte in 'buf'. If 'ascii' is true then the corresponding ASCII characters + * are also rendered alongside. */ +void +ovs_hex_dump(FILE *stream, const void *buf_, size_t size, + uintptr_t ofs, bool ascii) +{ + const uint8_t *buf = buf_; + const size_t per_line = 16; /* Maximum bytes per line. */ + + while (size > 0) + { + size_t start, end, n; + size_t i; + + /* Number of bytes on this line. */ + start = ofs % per_line; + end = per_line; + if (end - start > size) + end = start + size; + n = end - start; + + /* Print line. */ + fprintf(stream, "%08jx ", (uintmax_t) ROUND_DOWN(ofs, per_line)); + for (i = 0; i < start; i++) + fprintf(stream, " "); + for (; i < end; i++) + fprintf(stream, "%02hhx%c", + buf[i - start], i == per_line / 2 - 1? '-' : ' '); + if (ascii) + { + for (; i < per_line; i++) + fprintf(stream, " "); + fprintf(stream, "|"); + for (i = 0; i < start; i++) + fprintf(stream, " "); + for (; i < end; i++) { + int c = buf[i - start]; + putc(c >= 32 && c < 127 ? c : '.', stream); + } + for (; i < per_line; i++) + fprintf(stream, " "); + fprintf(stream, "|"); + } + fprintf(stream, "\n"); + + ofs += n; + buf += n; + size -= n; + } +} + +bool +str_to_int(const char *s, int base, int *i) +{ + long long ll; + bool ok = str_to_llong(s, base, &ll); + *i = ll; + return ok; +} + +bool +str_to_long(const char *s, int base, long *li) +{ + long long ll; + bool ok = str_to_llong(s, base, &ll); + *li = ll; + return ok; +} + +bool +str_to_llong(const char *s, int base, long long *x) +{ + int save_errno = errno; + char *tail; + errno = 0; + *x = strtoll(s, &tail, base); + if (errno == EINVAL || errno == ERANGE || tail == s || *tail != '\0') { + errno = save_errno; + *x = 0; + return false; + } else { + errno = save_errno; + return true; + } +} + +bool +str_to_uint(const char *s, int base, unsigned int *u) +{ + return str_to_int(s, base, (int *) u); +} + +bool +str_to_ulong(const char *s, int base, unsigned long *ul) +{ + return str_to_long(s, base, (long *) ul); +} + +bool +str_to_ullong(const char *s, int base, unsigned long long *ull) +{ + return str_to_llong(s, base, (long long *) ull); +} diff --git a/lib/util.h b/lib/util.h new file mode 100644 index 00000000..87e216b8 --- /dev/null +++ b/lib/util.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef UTIL_H +#define UTIL_H 1 + +#include +#include +#include +#include +#include +#include +#include "compiler.h" + +#ifndef va_copy +#ifdef __va_copy +#define va_copy __va_copy +#else +#define va_copy(dst, src) ((dst) = (src)) +#endif +#endif + +#ifndef __cplusplus +/* Build-time assertion building block. */ +#define BUILD_ASSERT__(EXPR) \ + sizeof(struct { unsigned int build_assert_failed : (EXPR) ? 1 : -1; }) + +/* Build-time assertion for use in a statement context. */ +#define BUILD_ASSERT(EXPR) (void) BUILD_ASSERT__(EXPR) + +/* Build-time assertion for use in a declaration context. */ +#define BUILD_ASSERT_DECL(EXPR) \ + extern int (*build_assert(void))[BUILD_ASSERT__(EXPR)] +#else /* __cplusplus */ +#include +#define BUILD_ASSERT BOOST_STATIC_ASSERT +#define BUILD_ASSERT_DECL BOOST_STATIC_ASSERT +#endif /* __cplusplus */ + +extern const char *program_name; + +#define ARRAY_SIZE(ARRAY) (sizeof ARRAY / sizeof *ARRAY) +#define ROUND_UP(X, Y) (((X) + ((Y) - 1)) / (Y) * (Y)) +#define ROUND_DOWN(X, Y) ((X) / (Y) * (Y)) +#define IS_POW2(X) ((X) && !((X) & ((X) - 1))) + +#ifndef MIN +#define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) +#endif + +#ifndef MAX +#define MAX(X, Y) ((X) > (Y) ? (X) : (Y)) +#endif + +#define NOT_REACHED() abort() +#define NOT_IMPLEMENTED() abort() +#define NOT_TESTED() ((void) 0) /* XXX should print a message. */ + +/* Given POINTER, the address of the given MEMBER in a STRUCT object, returns + the STRUCT object. */ +#define CONTAINER_OF(POINTER, STRUCT, MEMBER) \ + ((STRUCT *) ((char *) (POINTER) - offsetof (STRUCT, MEMBER))) + +#ifdef __cplusplus +extern "C" { +#endif + +void set_program_name(const char *); + +void ovs_print_version(char *date, char *time, + uint8_t min_ofp, uint8_t max_ofp); +#define OVS_PRINT_VERSION(min_ofp, max_ofp) \ + ovs_print_version(__DATE__, __TIME__, (min_ofp), (max_ofp)) + +void out_of_memory(void) NO_RETURN; +void *xmalloc(size_t) MALLOC_LIKE; +void *xcalloc(size_t, size_t) MALLOC_LIKE; +void *xrealloc(void *, size_t); +void *xmemdup(const void *, size_t) MALLOC_LIKE; +char *xmemdup0(const char *, size_t) MALLOC_LIKE; +char *xstrdup(const char *) MALLOC_LIKE; +char *xasprintf(const char *format, ...) PRINTF_FORMAT(1, 2) MALLOC_LIKE; +char *xvasprintf(const char *format, va_list) PRINTF_FORMAT(1, 0) MALLOC_LIKE; +void *x2nrealloc(void *p, size_t *n, size_t s); + +void ovs_strlcpy(char *dst, const char *src, size_t size); + +void ovs_fatal(int err_no, const char *format, ...) + PRINTF_FORMAT(2, 3) NO_RETURN; +void ovs_error(int err_no, const char *format, ...) PRINTF_FORMAT(2, 3); +void ovs_hex_dump(FILE *, const void *, size_t, uintptr_t offset, bool ascii); + +bool str_to_int(const char *, int base, int *); +bool str_to_long(const char *, int base, long *); +bool str_to_llong(const char *, int base, long long *); +bool str_to_uint(const char *, int base, unsigned int *); +bool str_to_ulong(const char *, int base, unsigned long *); +bool str_to_ullong(const char *, int base, unsigned long long *); + +#ifdef __cplusplus +} +#endif + +#endif /* util.h */ diff --git a/lib/valgrind.h b/lib/valgrind.h new file mode 100644 index 00000000..e15a7a70 --- /dev/null +++ b/lib/valgrind.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef VALGRIND_H +#define VALGRIND_H 1 + +#ifdef HAVE_VALGRIND_VALGRIND_H +#include +#else +#define RUNNING_ON_VALGRIND 0 +#endif + +#endif /* valgrind.h */ diff --git a/lib/vconn-provider.h b/lib/vconn-provider.h new file mode 100644 index 00000000..239d19e9 --- /dev/null +++ b/lib/vconn-provider.h @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef VCONN_PROVIDER_H +#define VCONN_PROVIDER_H 1 + +/* Provider interface to vconns, which provide a virtual connection to an + * OpenFlow device. */ + +#include +#include "vconn.h" + +/* Active virtual connection to an OpenFlow device. */ + +/* Active virtual connection to an OpenFlow device. + * + * This structure should be treated as opaque by vconn implementations. */ +struct vconn { + struct vconn_class *class; + int state; + int error; + int min_version; + int version; + uint32_t ip; + char *name; + bool reconnectable; +}; + +void vconn_init(struct vconn *, struct vconn_class *, int connect_status, + uint32_t ip, const char *name, bool reconnectable); +static inline void vconn_assert_class(const struct vconn *vconn, + const struct vconn_class *class) +{ + assert(vconn->class == class); +} + +struct vconn_class { + /* Prefix for connection names, e.g. "nl", "tcp". */ + const char *name; + + /* Attempts to connect to an OpenFlow device. 'name' is the full + * connection name provided by the user, e.g. "tcp:1.2.3.4". This name is + * useful for error messages but must not be modified. + * + * 'suffix' is a copy of 'name' following the colon and may be modified. + * + * Returns 0 if successful, otherwise a positive errno value. If + * successful, stores a pointer to the new connection in '*vconnp'. + * + * The open function must not block waiting for a connection to complete. + * If the connection cannot be completed immediately, it should return + * EAGAIN (not EINPROGRESS, as returned by the connect system call) and + * continue the connection in the background. */ + int (*open)(const char *name, char *suffix, struct vconn **vconnp); + + /* Closes 'vconn' and frees associated memory. */ + void (*close)(struct vconn *vconn); + + /* Tries to complete the connection on 'vconn'. If 'vconn''s connection is + * complete, returns 0 if the connection was successful or a positive errno + * value if it failed. If the connection is still in progress, returns + * EAGAIN. + * + * The connect function must not block waiting for the connection to + * complete; instead, it should return EAGAIN immediately. */ + int (*connect)(struct vconn *vconn); + + /* Tries to receive an OpenFlow message from 'vconn'. If successful, + * stores the received message into '*msgp' and returns 0. The caller is + * responsible for destroying the message with ofpbuf_delete(). On + * failure, returns a positive errno value and stores a null pointer into + * '*msgp'. + * + * If the connection has been closed in the normal fashion, returns EOF. + * + * The recv function must not block waiting for a packet to arrive. If no + * packets have been received, it should return EAGAIN. */ + int (*recv)(struct vconn *vconn, struct ofpbuf **msgp); + + /* Tries to queue 'msg' for transmission on 'vconn'. If successful, + * returns 0, in which case ownership of 'msg' is transferred to the vconn. + * Success does not guarantee that 'msg' has been or ever will be delivered + * to the peer, only that it has been queued for transmission. + * + * Returns a positive errno value on failure, in which case the caller + * retains ownership of 'msg'. + * + * The send function must not block. If 'msg' cannot be immediately + * accepted for transmission, it should return EAGAIN. */ + int (*send)(struct vconn *vconn, struct ofpbuf *msg); + + /* Arranges for the poll loop to wake up when 'vconn' is ready to take an + * action of the given 'type'. */ + void (*wait)(struct vconn *vconn, enum vconn_wait_type type); +}; + +/* Passive virtual connection to an OpenFlow device. + * + * This structure should be treated as opaque by vconn implementations. */ +struct pvconn { + struct pvconn_class *class; + char *name; +}; + +void pvconn_init(struct pvconn *, struct pvconn_class *, const char *name); +static inline void pvconn_assert_class(const struct pvconn *pvconn, + const struct pvconn_class *class) +{ + assert(pvconn->class == class); +} + +struct pvconn_class { + /* Prefix for connection names, e.g. "ptcp", "pssl". */ + const char *name; + + /* Attempts to start listening for OpenFlow connections. 'name' is the + * full connection name provided by the user, e.g. "ptcp:1234". This name + * is useful for error messages but must not be modified. + * + * 'suffix' is a copy of 'name' following the colon and may be modified. + * + * Returns 0 if successful, otherwise a positive errno value. If + * successful, stores a pointer to the new connection in '*pvconnp'. + * + * The listen function must not block. If the connection cannot be + * completed immediately, it should return EAGAIN (not EINPROGRESS, as + * returned by the connect system call) and continue the connection in the + * background. */ + int (*listen)(const char *name, char *suffix, struct pvconn **pvconnp); + + /* Closes 'pvconn' and frees associated memory. */ + void (*close)(struct pvconn *pvconn); + + /* Tries to accept a new connection on 'pvconn'. If successful, stores the + * new connection in '*new_vconnp' and returns 0. Otherwise, returns a + * positive errno value. + * + * The accept function must not block waiting for a connection. If no + * connection is ready to be accepted, it should return EAGAIN. */ + int (*accept)(struct pvconn *pvconn, struct vconn **new_vconnp); + + /* Arranges for the poll loop to wake up when a connection is ready to be + * accepted on 'pvconn'. */ + void (*wait)(struct pvconn *pvconn); +}; + +/* Active and passive vconn classes. */ +extern struct vconn_class tcp_vconn_class; +extern struct pvconn_class ptcp_pvconn_class; +extern struct vconn_class unix_vconn_class; +extern struct pvconn_class punix_pvconn_class; +#ifdef HAVE_OPENSSL +extern struct vconn_class ssl_vconn_class; +extern struct pvconn_class pssl_pvconn_class; +#endif + +#endif /* vconn-provider.h */ diff --git a/lib/vconn-ssl.c b/lib/vconn-ssl.c new file mode 100644 index 00000000..20bfb979 --- /dev/null +++ b/lib/vconn-ssl.c @@ -0,0 +1,1197 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "vconn-ssl.h" +#include "dhparams.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dynamic-string.h" +#include "leak-checker.h" +#include "ofpbuf.h" +#include "openflow/openflow.h" +#include "packets.h" +#include "poll-loop.h" +#include "socket-util.h" +#include "socket-util.h" +#include "util.h" +#include "vconn-provider.h" +#include "vconn.h" + +#include "vlog.h" +#define THIS_MODULE VLM_vconn_ssl + +/* Active SSL. */ + +enum ssl_state { + STATE_TCP_CONNECTING, + STATE_SSL_CONNECTING +}; + +enum session_type { + CLIENT, + SERVER +}; + +struct ssl_vconn +{ + struct vconn vconn; + enum ssl_state state; + int connect_error; + enum session_type type; + int fd; + SSL *ssl; + struct ofpbuf *rxbuf; + struct ofpbuf *txbuf; + struct poll_waiter *tx_waiter; + + /* rx_want and tx_want record the result of the last call to SSL_read() + * and SSL_write(), respectively: + * + * - If the call reported that data needed to be read from the file + * descriptor, the corresponding member is set to SSL_READING. + * + * - If the call reported that data needed to be written to the file + * descriptor, the corresponding member is set to SSL_WRITING. + * + * - Otherwise, the member is set to SSL_NOTHING, indicating that the + * call completed successfully (or with an error) and that there is no + * need to block. + * + * These are needed because there is no way to ask OpenSSL what a data read + * or write would require without giving it a buffer to receive into or + * data to send, respectively. (Note that the SSL_want() status is + * overwritten by each SSL_read() or SSL_write() call, so we can't rely on + * its value.) + * + * A single call to SSL_read() or SSL_write() can perform both reading + * and writing and thus invalidate not one of these values but actually + * both. Consider this situation, for example: + * + * - SSL_write() blocks on a read, so tx_want gets SSL_READING. + * + * - SSL_read() laters succeeds reading from 'fd' and clears out the + * whole receive buffer, so rx_want gets SSL_READING. + * + * - Client calls vconn_wait(WAIT_RECV) and vconn_wait(WAIT_SEND) and + * blocks. + * + * - Now we're stuck blocking until the peer sends us data, even though + * SSL_write() could now succeed, which could easily be a deadlock + * condition. + * + * On the other hand, we can't reset both tx_want and rx_want on every call + * to SSL_read() or SSL_write(), because that would produce livelock, + * e.g. in this situation: + * + * - SSL_write() blocks, so tx_want gets SSL_READING or SSL_WRITING. + * + * - SSL_read() blocks, so rx_want gets SSL_READING or SSL_WRITING, + * but tx_want gets reset to SSL_NOTHING. + * + * - Client calls vconn_wait(WAIT_RECV) and vconn_wait(WAIT_SEND) and + * blocks. + * + * - Client wakes up immediately since SSL_NOTHING in tx_want indicates + * that no blocking is necessary. + * + * The solution we adopt here is to set tx_want to SSL_NOTHING after + * calling SSL_read() only if the SSL state of the connection changed, + * which indicates that an SSL-level renegotiation made some progress, and + * similarly for rx_want and SSL_write(). This prevents both the + * deadlock and livelock situations above. + */ + int rx_want, tx_want; +}; + +/* SSL context created by ssl_init(). */ +static SSL_CTX *ctx; + +/* Required configuration. */ +static bool has_private_key, has_certificate, has_ca_cert; + +/* Ordinarily, we require a CA certificate for the peer to be locally + * available. 'has_ca_cert' is true when this is the case, and neither of the + * following variables matter. + * + * We can, however, bootstrap the CA certificate from the peer at the beginning + * of our first connection then use that certificate on all subsequent + * connections, saving it to a file for use in future runs also. In this case, + * 'has_ca_cert' is false, 'bootstrap_ca_cert' is true, and 'ca_cert_file' + * names the file to be saved. */ +static bool bootstrap_ca_cert; +static char *ca_cert_file; + +/* Who knows what can trigger various SSL errors, so let's throttle them down + * quite a bit. */ +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 25); + +static int ssl_init(void); +static int do_ssl_init(void); +static bool ssl_wants_io(int ssl_error); +static void ssl_close(struct vconn *); +static void ssl_clear_txbuf(struct ssl_vconn *); +static int interpret_ssl_error(const char *function, int ret, int error, + int *want); +static void ssl_tx_poll_callback(int fd, short int revents, void *vconn_); +static DH *tmp_dh_callback(SSL *ssl, int is_export UNUSED, int keylength); +static void log_ca_cert(const char *file_name, X509 *cert); + +static short int +want_to_poll_events(int want) +{ + switch (want) { + case SSL_NOTHING: + NOT_REACHED(); + + case SSL_READING: + return POLLIN; + + case SSL_WRITING: + return POLLOUT; + + default: + NOT_REACHED(); + } +} + +static int +new_ssl_vconn(const char *name, int fd, enum session_type type, + enum ssl_state state, const struct sockaddr_in *sin, + struct vconn **vconnp) +{ + struct ssl_vconn *sslv; + SSL *ssl = NULL; + int on = 1; + int retval; + + /* Check for all the needful configuration. */ + retval = 0; + if (!has_private_key) { + VLOG_ERR("Private key must be configured to use SSL"); + retval = ENOPROTOOPT; + } + if (!has_certificate) { + VLOG_ERR("Certificate must be configured to use SSL"); + retval = ENOPROTOOPT; + } + if (!has_ca_cert && !bootstrap_ca_cert) { + VLOG_ERR("CA certificate must be configured to use SSL"); + retval = ENOPROTOOPT; + } + if (!SSL_CTX_check_private_key(ctx)) { + VLOG_ERR("Private key does not match certificate public key: %s", + ERR_error_string(ERR_get_error(), NULL)); + retval = ENOPROTOOPT; + } + if (retval) { + goto error; + } + + /* Disable Nagle. */ + retval = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &on, sizeof on); + if (retval) { + VLOG_ERR("%s: setsockopt(TCP_NODELAY): %s", name, strerror(errno)); + retval = errno; + goto error; + } + + /* Create and configure OpenSSL stream. */ + ssl = SSL_new(ctx); + if (ssl == NULL) { + VLOG_ERR("SSL_new: %s", ERR_error_string(ERR_get_error(), NULL)); + retval = ENOPROTOOPT; + goto error; + } + if (SSL_set_fd(ssl, fd) == 0) { + VLOG_ERR("SSL_set_fd: %s", ERR_error_string(ERR_get_error(), NULL)); + retval = ENOPROTOOPT; + goto error; + } + if (bootstrap_ca_cert && type == CLIENT) { + SSL_set_verify(ssl, SSL_VERIFY_NONE, NULL); + } + + /* Create and return the ssl_vconn. */ + sslv = xmalloc(sizeof *sslv); + vconn_init(&sslv->vconn, &ssl_vconn_class, EAGAIN, sin->sin_addr.s_addr, + name, true); + sslv->state = state; + sslv->type = type; + sslv->fd = fd; + sslv->ssl = ssl; + sslv->rxbuf = NULL; + sslv->txbuf = NULL; + sslv->tx_waiter = NULL; + sslv->rx_want = sslv->tx_want = SSL_NOTHING; + *vconnp = &sslv->vconn; + return 0; + +error: + if (ssl) { + SSL_free(ssl); + } + close(fd); + return retval; +} + +static struct ssl_vconn * +ssl_vconn_cast(struct vconn *vconn) +{ + vconn_assert_class(vconn, &ssl_vconn_class); + return CONTAINER_OF(vconn, struct ssl_vconn, vconn); +} + +static int +ssl_open(const char *name, char *suffix, struct vconn **vconnp) +{ + char *save_ptr, *host_name, *port_string; + struct sockaddr_in sin; + int retval; + int fd; + + retval = ssl_init(); + if (retval) { + return retval; + } + + /* Glibc 2.7 has a bug in strtok_r when compiling with optimization that + * can cause segfaults here: + * http://sources.redhat.com/bugzilla/show_bug.cgi?id=5614. + * Using "::" instead of the obvious ":" works around it. */ + host_name = strtok_r(suffix, "::", &save_ptr); + port_string = strtok_r(NULL, "::", &save_ptr); + if (!host_name) { + ovs_error(0, "%s: bad peer name format", name); + return EAFNOSUPPORT; + } + + memset(&sin, 0, sizeof sin); + sin.sin_family = AF_INET; + if (lookup_ip(host_name, &sin.sin_addr)) { + return ENOENT; + } + sin.sin_port = htons(port_string && *port_string ? atoi(port_string) + : OFP_SSL_PORT); + + /* Create socket. */ + fd = socket(AF_INET, SOCK_STREAM, 0); + if (fd < 0) { + VLOG_ERR("%s: socket: %s", name, strerror(errno)); + return errno; + } + retval = set_nonblocking(fd); + if (retval) { + close(fd); + return retval; + } + + /* Connect socket. */ + retval = connect(fd, (struct sockaddr *) &sin, sizeof sin); + if (retval < 0) { + if (errno == EINPROGRESS) { + return new_ssl_vconn(name, fd, CLIENT, STATE_TCP_CONNECTING, + &sin, vconnp); + } else { + int error = errno; + VLOG_ERR("%s: connect: %s", name, strerror(error)); + close(fd); + return error; + } + } else { + return new_ssl_vconn(name, fd, CLIENT, STATE_SSL_CONNECTING, + &sin, vconnp); + } +} + +static int +do_ca_cert_bootstrap(struct vconn *vconn) +{ + struct ssl_vconn *sslv = ssl_vconn_cast(vconn); + STACK_OF(X509) *chain; + X509 *ca_cert; + FILE *file; + int error; + int fd; + + chain = SSL_get_peer_cert_chain(sslv->ssl); + if (!chain || !sk_X509_num(chain)) { + VLOG_ERR("could not bootstrap CA cert: no certificate presented by " + "peer"); + return EPROTO; + } + ca_cert = sk_X509_value(chain, sk_X509_num(chain) - 1); + + /* Check that 'ca_cert' is self-signed. Otherwise it is not a CA + * certificate and we should not attempt to use it as one. */ + error = X509_check_issued(ca_cert, ca_cert); + if (error) { + VLOG_ERR("could not bootstrap CA cert: obtained certificate is " + "not self-signed (%s)", + X509_verify_cert_error_string(error)); + if (sk_X509_num(chain) < 2) { + VLOG_ERR("only one certificate was received, so probably the peer " + "is not configured to send its CA certificate"); + } + return EPROTO; + } + + fd = open(ca_cert_file, O_CREAT | O_EXCL | O_WRONLY, 0444); + if (fd < 0) { + VLOG_ERR("could not bootstrap CA cert: creating %s failed: %s", + ca_cert_file, strerror(errno)); + return errno; + } + + file = fdopen(fd, "w"); + if (!file) { + int error = errno; + VLOG_ERR("could not bootstrap CA cert: fdopen failed: %s", + strerror(error)); + unlink(ca_cert_file); + return error; + } + + if (!PEM_write_X509(file, ca_cert)) { + VLOG_ERR("could not bootstrap CA cert: PEM_write_X509 to %s failed: " + "%s", ca_cert_file, ERR_error_string(ERR_get_error(), NULL)); + fclose(file); + unlink(ca_cert_file); + return EIO; + } + + if (fclose(file)) { + int error = errno; + VLOG_ERR("could not bootstrap CA cert: writing %s failed: %s", + ca_cert_file, strerror(error)); + unlink(ca_cert_file); + return error; + } + + VLOG_INFO("successfully bootstrapped CA cert to %s", ca_cert_file); + log_ca_cert(ca_cert_file, ca_cert); + bootstrap_ca_cert = false; + has_ca_cert = true; + + /* SSL_CTX_add_client_CA makes a copy of ca_cert's relevant data. */ + SSL_CTX_add_client_CA(ctx, ca_cert); + + /* SSL_CTX_use_certificate() takes ownership of the certificate passed in. + * 'ca_cert' is owned by sslv->ssl, so we need to duplicate it. */ + ca_cert = X509_dup(ca_cert); + if (!ca_cert) { + out_of_memory(); + } + if (SSL_CTX_load_verify_locations(ctx, ca_cert_file, NULL) != 1) { + VLOG_ERR("SSL_CTX_load_verify_locations: %s", + ERR_error_string(ERR_get_error(), NULL)); + return EPROTO; + } + VLOG_INFO("killing successful connection to retry using CA cert"); + return EPROTO; +} + +static int +ssl_connect(struct vconn *vconn) +{ + struct ssl_vconn *sslv = ssl_vconn_cast(vconn); + int retval; + + switch (sslv->state) { + case STATE_TCP_CONNECTING: + retval = check_connection_completion(sslv->fd); + if (retval) { + return retval; + } + sslv->state = STATE_SSL_CONNECTING; + /* Fall through. */ + + case STATE_SSL_CONNECTING: + retval = (sslv->type == CLIENT + ? SSL_connect(sslv->ssl) : SSL_accept(sslv->ssl)); + if (retval != 1) { + int error = SSL_get_error(sslv->ssl, retval); + if (retval < 0 && ssl_wants_io(error)) { + return EAGAIN; + } else { + int unused; + interpret_ssl_error((sslv->type == CLIENT ? "SSL_connect" + : "SSL_accept"), retval, error, &unused); + shutdown(sslv->fd, SHUT_RDWR); + return EPROTO; + } + } else if (bootstrap_ca_cert) { + return do_ca_cert_bootstrap(vconn); + } else if ((SSL_get_verify_mode(sslv->ssl) + & (SSL_VERIFY_NONE | SSL_VERIFY_PEER)) + != SSL_VERIFY_PEER) { + /* Two or more SSL connections completed at the same time while we + * were in bootstrap mode. Only one of these can finish the + * bootstrap successfully. The other one(s) must be rejected + * because they were not verified against the bootstrapped CA + * certificate. (Alternatively we could verify them against the CA + * certificate, but that's more trouble than it's worth. These + * connections will succeed the next time they retry, assuming that + * they have a certificate against the correct CA.) */ + VLOG_ERR("rejecting SSL connection during bootstrap race window"); + return EPROTO; + } else { + return 0; + } + } + + NOT_REACHED(); +} + +static void +ssl_close(struct vconn *vconn) +{ + struct ssl_vconn *sslv = ssl_vconn_cast(vconn); + poll_cancel(sslv->tx_waiter); + ssl_clear_txbuf(sslv); + ofpbuf_delete(sslv->rxbuf); + SSL_free(sslv->ssl); + close(sslv->fd); + free(sslv); +} + +static int +interpret_ssl_error(const char *function, int ret, int error, + int *want) +{ + *want = SSL_NOTHING; + + switch (error) { + case SSL_ERROR_NONE: + VLOG_ERR_RL(&rl, "%s: unexpected SSL_ERROR_NONE", function); + break; + + case SSL_ERROR_ZERO_RETURN: + VLOG_ERR_RL(&rl, "%s: unexpected SSL_ERROR_ZERO_RETURN", function); + break; + + case SSL_ERROR_WANT_READ: + *want = SSL_READING; + return EAGAIN; + + case SSL_ERROR_WANT_WRITE: + *want = SSL_WRITING; + return EAGAIN; + + case SSL_ERROR_WANT_CONNECT: + VLOG_ERR_RL(&rl, "%s: unexpected SSL_ERROR_WANT_CONNECT", function); + break; + + case SSL_ERROR_WANT_ACCEPT: + VLOG_ERR_RL(&rl, "%s: unexpected SSL_ERROR_WANT_ACCEPT", function); + break; + + case SSL_ERROR_WANT_X509_LOOKUP: + VLOG_ERR_RL(&rl, "%s: unexpected SSL_ERROR_WANT_X509_LOOKUP", + function); + break; + + case SSL_ERROR_SYSCALL: { + int queued_error = ERR_get_error(); + if (queued_error == 0) { + if (ret < 0) { + int status = errno; + VLOG_WARN_RL(&rl, "%s: system error (%s)", + function, strerror(status)); + return status; + } else { + VLOG_WARN_RL(&rl, "%s: unexpected SSL connection close", + function); + return EPROTO; + } + } else { + VLOG_WARN_RL(&rl, "%s: %s", + function, ERR_error_string(queued_error, NULL)); + break; + } + } + + case SSL_ERROR_SSL: { + int queued_error = ERR_get_error(); + if (queued_error != 0) { + VLOG_WARN_RL(&rl, "%s: %s", + function, ERR_error_string(queued_error, NULL)); + } else { + VLOG_ERR_RL(&rl, "%s: SSL_ERROR_SSL without queued error", + function); + } + break; + } + + default: + VLOG_ERR_RL(&rl, "%s: bad SSL error code %d", function, error); + break; + } + return EIO; +} + +static int +ssl_recv(struct vconn *vconn, struct ofpbuf **bufferp) +{ + struct ssl_vconn *sslv = ssl_vconn_cast(vconn); + struct ofpbuf *rx; + size_t want_bytes; + int old_state; + ssize_t ret; + + if (sslv->rxbuf == NULL) { + sslv->rxbuf = ofpbuf_new(1564); + } + rx = sslv->rxbuf; + +again: + if (sizeof(struct ofp_header) > rx->size) { + want_bytes = sizeof(struct ofp_header) - rx->size; + } else { + struct ofp_header *oh = rx->data; + size_t length = ntohs(oh->length); + if (length < sizeof(struct ofp_header)) { + VLOG_ERR_RL(&rl, "received too-short ofp_header (%zu bytes)", + length); + return EPROTO; + } + want_bytes = length - rx->size; + if (!want_bytes) { + *bufferp = rx; + sslv->rxbuf = NULL; + return 0; + } + } + ofpbuf_prealloc_tailroom(rx, want_bytes); + + /* Behavior of zero-byte SSL_read is poorly defined. */ + assert(want_bytes > 0); + + old_state = SSL_get_state(sslv->ssl); + ret = SSL_read(sslv->ssl, ofpbuf_tail(rx), want_bytes); + if (old_state != SSL_get_state(sslv->ssl)) { + sslv->tx_want = SSL_NOTHING; + if (sslv->tx_waiter) { + poll_cancel(sslv->tx_waiter); + ssl_tx_poll_callback(sslv->fd, POLLIN, vconn); + } + } + sslv->rx_want = SSL_NOTHING; + + if (ret > 0) { + rx->size += ret; + if (ret == want_bytes) { + if (rx->size > sizeof(struct ofp_header)) { + *bufferp = rx; + sslv->rxbuf = NULL; + return 0; + } else { + goto again; + } + } + return EAGAIN; + } else { + int error = SSL_get_error(sslv->ssl, ret); + if (error == SSL_ERROR_ZERO_RETURN) { + /* Connection closed (EOF). */ + if (rx->size) { + VLOG_WARN_RL(&rl, "SSL_read: unexpected connection close"); + return EPROTO; + } else { + return EOF; + } + } else { + return interpret_ssl_error("SSL_read", ret, error, &sslv->rx_want); + } + } +} + +static void +ssl_clear_txbuf(struct ssl_vconn *sslv) +{ + ofpbuf_delete(sslv->txbuf); + sslv->txbuf = NULL; + sslv->tx_waiter = NULL; +} + +static void +ssl_register_tx_waiter(struct vconn *vconn) +{ + struct ssl_vconn *sslv = ssl_vconn_cast(vconn); + sslv->tx_waiter = poll_fd_callback(sslv->fd, + want_to_poll_events(sslv->tx_want), + ssl_tx_poll_callback, vconn); +} + +static int +ssl_do_tx(struct vconn *vconn) +{ + struct ssl_vconn *sslv = ssl_vconn_cast(vconn); + + for (;;) { + int old_state = SSL_get_state(sslv->ssl); + int ret = SSL_write(sslv->ssl, sslv->txbuf->data, sslv->txbuf->size); + if (old_state != SSL_get_state(sslv->ssl)) { + sslv->rx_want = SSL_NOTHING; + } + sslv->tx_want = SSL_NOTHING; + if (ret > 0) { + ofpbuf_pull(sslv->txbuf, ret); + if (sslv->txbuf->size == 0) { + return 0; + } + } else { + int ssl_error = SSL_get_error(sslv->ssl, ret); + if (ssl_error == SSL_ERROR_ZERO_RETURN) { + VLOG_WARN_RL(&rl, "SSL_write: connection closed"); + return EPIPE; + } else { + return interpret_ssl_error("SSL_write", ret, ssl_error, + &sslv->tx_want); + } + } + } +} + +static void +ssl_tx_poll_callback(int fd UNUSED, short int revents UNUSED, void *vconn_) +{ + struct vconn *vconn = vconn_; + struct ssl_vconn *sslv = ssl_vconn_cast(vconn); + int error = ssl_do_tx(vconn); + if (error != EAGAIN) { + ssl_clear_txbuf(sslv); + } else { + ssl_register_tx_waiter(vconn); + } +} + +static int +ssl_send(struct vconn *vconn, struct ofpbuf *buffer) +{ + struct ssl_vconn *sslv = ssl_vconn_cast(vconn); + + if (sslv->txbuf) { + return EAGAIN; + } else { + int error; + + sslv->txbuf = buffer; + error = ssl_do_tx(vconn); + switch (error) { + case 0: + ssl_clear_txbuf(sslv); + return 0; + case EAGAIN: + leak_checker_claim(buffer); + ssl_register_tx_waiter(vconn); + return 0; + default: + sslv->txbuf = NULL; + return error; + } + } +} + +static void +ssl_wait(struct vconn *vconn, enum vconn_wait_type wait) +{ + struct ssl_vconn *sslv = ssl_vconn_cast(vconn); + + switch (wait) { + case WAIT_CONNECT: + if (vconn_connect(vconn) != EAGAIN) { + poll_immediate_wake(); + } else { + switch (sslv->state) { + case STATE_TCP_CONNECTING: + poll_fd_wait(sslv->fd, POLLOUT); + break; + + case STATE_SSL_CONNECTING: + /* ssl_connect() called SSL_accept() or SSL_connect(), which + * set up the status that we test here. */ + poll_fd_wait(sslv->fd, + want_to_poll_events(SSL_want(sslv->ssl))); + break; + + default: + NOT_REACHED(); + } + } + break; + + case WAIT_RECV: + if (sslv->rx_want != SSL_NOTHING) { + poll_fd_wait(sslv->fd, want_to_poll_events(sslv->rx_want)); + } else { + poll_immediate_wake(); + } + break; + + case WAIT_SEND: + if (!sslv->txbuf) { + /* We have room in our tx queue. */ + poll_immediate_wake(); + } else { + /* The call to ssl_tx_poll_callback() will wake us up. */ + } + break; + + default: + NOT_REACHED(); + } +} + +struct vconn_class ssl_vconn_class = { + "ssl", /* name */ + ssl_open, /* open */ + ssl_close, /* close */ + ssl_connect, /* connect */ + ssl_recv, /* recv */ + ssl_send, /* send */ + ssl_wait, /* wait */ +}; + +/* Passive SSL. */ + +struct pssl_pvconn +{ + struct pvconn pvconn; + int fd; +}; + +struct pvconn_class pssl_pvconn_class; + +static struct pssl_pvconn * +pssl_pvconn_cast(struct pvconn *pvconn) +{ + pvconn_assert_class(pvconn, &pssl_pvconn_class); + return CONTAINER_OF(pvconn, struct pssl_pvconn, pvconn); +} + +static int +pssl_open(const char *name, char *suffix, struct pvconn **pvconnp) +{ + struct sockaddr_in sin; + struct pssl_pvconn *pssl; + int retval; + int fd; + unsigned int yes = 1; + + retval = ssl_init(); + if (retval) { + return retval; + } + + /* Create socket. */ + fd = socket(AF_INET, SOCK_STREAM, 0); + if (fd < 0) { + int error = errno; + VLOG_ERR("%s: socket: %s", name, strerror(error)); + return error; + } + + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof yes) < 0) { + int error = errno; + VLOG_ERR("%s: setsockopt(SO_REUSEADDR): %s", name, strerror(errno)); + return error; + } + + memset(&sin, 0, sizeof sin); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(atoi(suffix) ? atoi(suffix) : OFP_SSL_PORT); + retval = bind(fd, (struct sockaddr *) &sin, sizeof sin); + if (retval < 0) { + int error = errno; + VLOG_ERR("%s: bind: %s", name, strerror(error)); + close(fd); + return error; + } + + retval = listen(fd, 10); + if (retval < 0) { + int error = errno; + VLOG_ERR("%s: listen: %s", name, strerror(error)); + close(fd); + return error; + } + + retval = set_nonblocking(fd); + if (retval) { + close(fd); + return retval; + } + + pssl = xmalloc(sizeof *pssl); + pvconn_init(&pssl->pvconn, &pssl_pvconn_class, name); + pssl->fd = fd; + *pvconnp = &pssl->pvconn; + return 0; +} + +static void +pssl_close(struct pvconn *pvconn) +{ + struct pssl_pvconn *pssl = pssl_pvconn_cast(pvconn); + close(pssl->fd); + free(pssl); +} + +static int +pssl_accept(struct pvconn *pvconn, struct vconn **new_vconnp) +{ + struct pssl_pvconn *pssl = pssl_pvconn_cast(pvconn); + struct sockaddr_in sin; + socklen_t sin_len = sizeof sin; + char name[128]; + int new_fd; + int error; + + new_fd = accept(pssl->fd, &sin, &sin_len); + if (new_fd < 0) { + int error = errno; + if (error != EAGAIN) { + VLOG_DBG_RL(&rl, "accept: %s", strerror(error)); + } + return error; + } + + error = set_nonblocking(new_fd); + if (error) { + close(new_fd); + return error; + } + + sprintf(name, "ssl:"IP_FMT, IP_ARGS(&sin.sin_addr)); + if (sin.sin_port != htons(OFP_SSL_PORT)) { + sprintf(strchr(name, '\0'), ":%"PRIu16, ntohs(sin.sin_port)); + } + return new_ssl_vconn(name, new_fd, SERVER, STATE_SSL_CONNECTING, &sin, + new_vconnp); +} + +static void +pssl_wait(struct pvconn *pvconn) +{ + struct pssl_pvconn *pssl = pssl_pvconn_cast(pvconn); + poll_fd_wait(pssl->fd, POLLIN); +} + +struct pvconn_class pssl_pvconn_class = { + "pssl", + pssl_open, + pssl_close, + pssl_accept, + pssl_wait, +}; + +/* + * Returns true if OpenSSL error is WANT_READ or WANT_WRITE, indicating that + * OpenSSL is requesting that we call it back when the socket is ready for read + * or writing, respectively. + */ +static bool +ssl_wants_io(int ssl_error) +{ + return (ssl_error == SSL_ERROR_WANT_WRITE + || ssl_error == SSL_ERROR_WANT_READ); +} + +static int +ssl_init(void) +{ + static int init_status = -1; + if (init_status < 0) { + init_status = do_ssl_init(); + assert(init_status >= 0); + } + return init_status; +} + +static int +do_ssl_init(void) +{ + SSL_METHOD *method; + + SSL_library_init(); + SSL_load_error_strings(); + + method = TLSv1_method(); + if (method == NULL) { + VLOG_ERR("TLSv1_method: %s", ERR_error_string(ERR_get_error(), NULL)); + return ENOPROTOOPT; + } + + ctx = SSL_CTX_new(method); + if (ctx == NULL) { + VLOG_ERR("SSL_CTX_new: %s", ERR_error_string(ERR_get_error(), NULL)); + return ENOPROTOOPT; + } + SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3); + SSL_CTX_set_tmp_dh_callback(ctx, tmp_dh_callback); + SSL_CTX_set_mode(ctx, SSL_MODE_ENABLE_PARTIAL_WRITE); + SSL_CTX_set_mode(ctx, SSL_MODE_ACCEPT_MOVING_WRITE_BUFFER); + SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER | SSL_VERIFY_FAIL_IF_NO_PEER_CERT, + NULL); + + return 0; +} + +static DH * +tmp_dh_callback(SSL *ssl UNUSED, int is_export UNUSED, int keylength) +{ + struct dh { + int keylength; + DH *dh; + DH *(*constructor)(void); + }; + + static struct dh dh_table[] = { + {1024, NULL, get_dh1024}, + {2048, NULL, get_dh2048}, + {4096, NULL, get_dh4096}, + }; + + struct dh *dh; + + for (dh = dh_table; dh < &dh_table[ARRAY_SIZE(dh_table)]; dh++) { + if (dh->keylength == keylength) { + if (!dh->dh) { + dh->dh = dh->constructor(); + if (!dh->dh) { + ovs_fatal(ENOMEM, "out of memory constructing " + "Diffie-Hellman parameters"); + } + } + return dh->dh; + } + } + VLOG_ERR_RL(&rl, "no Diffie-Hellman parameters for key length %d", + keylength); + return NULL; +} + +/* Returns true if SSL is at least partially configured. */ +bool +vconn_ssl_is_configured(void) +{ + return has_private_key || has_certificate || has_ca_cert; +} + +void +vconn_ssl_set_private_key_file(const char *file_name) +{ + if (ssl_init()) { + return; + } + if (SSL_CTX_use_PrivateKey_file(ctx, file_name, SSL_FILETYPE_PEM) != 1) { + VLOG_ERR("SSL_use_PrivateKey_file: %s", + ERR_error_string(ERR_get_error(), NULL)); + return; + } + has_private_key = true; +} + +void +vconn_ssl_set_certificate_file(const char *file_name) +{ + if (ssl_init()) { + return; + } + if (SSL_CTX_use_certificate_chain_file(ctx, file_name) != 1) { + VLOG_ERR("SSL_use_certificate_file: %s", + ERR_error_string(ERR_get_error(), NULL)); + return; + } + has_certificate = true; +} + +/* Reads the X509 certificate or certificates in file 'file_name'. On success, + * stores the address of the first element in an array of pointers to + * certificates in '*certs' and the number of certificates in the array in + * '*n_certs', and returns 0. On failure, stores a null pointer in '*certs', 0 + * in '*n_certs', and returns a positive errno value. + * + * The caller is responsible for freeing '*certs'. */ +static int +read_cert_file(const char *file_name, X509 ***certs, size_t *n_certs) +{ + FILE *file; + size_t allocated_certs = 0; + + *certs = NULL; + *n_certs = 0; + + file = fopen(file_name, "r"); + if (!file) { + VLOG_ERR("failed to open %s for reading: %s", + file_name, strerror(errno)); + return errno; + } + + for (;;) { + X509 *certificate; + int c; + + /* Read certificate from file. */ + certificate = PEM_read_X509(file, NULL, NULL, NULL); + if (!certificate) { + size_t i; + + VLOG_ERR("PEM_read_X509 failed reading %s: %s", + file_name, ERR_error_string(ERR_get_error(), NULL)); + for (i = 0; i < *n_certs; i++) { + X509_free((*certs)[i]); + } + free(*certs); + *certs = NULL; + *n_certs = 0; + return EIO; + } + + /* Add certificate to array. */ + if (*n_certs >= allocated_certs) { + *certs = x2nrealloc(*certs, &allocated_certs, sizeof **certs); + } + (*certs)[(*n_certs)++] = certificate; + + /* Are there additional certificates in the file? */ + do { + c = getc(file); + } while (isspace(c)); + if (c == EOF) { + break; + } + ungetc(c, file); + } + fclose(file); + return 0; +} + + +/* Sets 'file_name' as the name of a file containing one or more X509 + * certificates to send to the peer. Typical use in OpenFlow is to send the CA + * certificate to the peer, which enables a switch to pick up the controller's + * CA certificate on its first connection. */ +void +vconn_ssl_set_peer_ca_cert_file(const char *file_name) +{ + X509 **certs; + size_t n_certs; + size_t i; + + if (ssl_init()) { + return; + } + + if (!read_cert_file(file_name, &certs, &n_certs)) { + for (i = 0; i < n_certs; i++) { + if (SSL_CTX_add_extra_chain_cert(ctx, certs[i]) != 1) { + VLOG_ERR("SSL_CTX_add_extra_chain_cert: %s", + ERR_error_string(ERR_get_error(), NULL)); + } + } + free(certs); + } +} + +/* Logs fingerprint of CA certificate 'cert' obtained from 'file_name'. */ +static void +log_ca_cert(const char *file_name, X509 *cert) +{ + unsigned char digest[EVP_MAX_MD_SIZE]; + unsigned int n_bytes; + struct ds fp; + char *subject; + + ds_init(&fp); + if (!X509_digest(cert, EVP_sha1(), digest, &n_bytes)) { + ds_put_cstr(&fp, ""); + } else { + unsigned int i; + for (i = 0; i < n_bytes; i++) { + if (i) { + ds_put_char(&fp, ':'); + } + ds_put_format(&fp, "%02hhx", digest[i]); + } + } + subject = X509_NAME_oneline(X509_get_subject_name(cert), NULL, 0); + VLOG_INFO("Trusting CA cert from %s (%s) (fingerprint %s)", file_name, + subject ? subject : "", ds_cstr(&fp)); + free(subject); + ds_destroy(&fp); +} + +/* Sets 'file_name' as the name of the file from which to read the CA + * certificate used to verify the peer within SSL connections. If 'bootstrap' + * is false, the file must exist. If 'bootstrap' is false, then the file is + * read if it is exists; if it does not, then it will be created from the CA + * certificate received from the peer on the first SSL connection. */ +void +vconn_ssl_set_ca_cert_file(const char *file_name, bool bootstrap) +{ + X509 **certs; + size_t n_certs; + struct stat s; + + if (ssl_init()) { + return; + } + + if (bootstrap && stat(file_name, &s) && errno == ENOENT) { + bootstrap_ca_cert = true; + ca_cert_file = xstrdup(file_name); + } else if (!read_cert_file(file_name, &certs, &n_certs)) { + size_t i; + + /* Set up list of CAs that the server will accept from the client. */ + for (i = 0; i < n_certs; i++) { + /* SSL_CTX_add_client_CA makes a copy of the relevant data. */ + if (SSL_CTX_add_client_CA(ctx, certs[i]) != 1) { + VLOG_ERR("failed to add client certificate %d from %s: %s", + i, file_name, + ERR_error_string(ERR_get_error(), NULL)); + } else { + log_ca_cert(file_name, certs[i]); + } + X509_free(certs[i]); + } + + /* Set up CAs for OpenSSL to trust in verifying the peer's + * certificate. */ + if (SSL_CTX_load_verify_locations(ctx, file_name, NULL) != 1) { + VLOG_ERR("SSL_CTX_load_verify_locations: %s", + ERR_error_string(ERR_get_error(), NULL)); + return; + } + + has_ca_cert = true; + } +} diff --git a/lib/vconn-ssl.h b/lib/vconn-ssl.h new file mode 100644 index 00000000..f7ff130b --- /dev/null +++ b/lib/vconn-ssl.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#ifndef VCONN_SSL_H +#define VCONN_SSL_H 1 + +#include + +#ifdef HAVE_OPENSSL +bool vconn_ssl_is_configured(void); +void vconn_ssl_set_private_key_file(const char *file_name); +void vconn_ssl_set_certificate_file(const char *file_name); +void vconn_ssl_set_ca_cert_file(const char *file_name, bool bootstrap); +void vconn_ssl_set_peer_ca_cert_file(const char *file_name); + +#define VCONN_SSL_LONG_OPTIONS \ + {"private-key", required_argument, 0, 'p'}, \ + {"certificate", required_argument, 0, 'c'}, \ + {"ca-cert", required_argument, 0, 'C'}, + +#define VCONN_SSL_OPTION_HANDLERS \ + case 'p': \ + vconn_ssl_set_private_key_file(optarg); \ + break; \ + \ + case 'c': \ + vconn_ssl_set_certificate_file(optarg); \ + break; \ + \ + case 'C': \ + vconn_ssl_set_ca_cert_file(optarg, false); \ + break; +#else /* !HAVE_OPENSSL */ +static inline bool vconn_ssl_is_configured(void) +{ + return false; +} +#define VCONN_SSL_LONG_OPTIONS +#define VCONN_SSL_OPTION_HANDLERS +#endif /* !HAVE_OPENSSL */ + +#endif /* vconn-ssl.h */ diff --git a/lib/vconn-stream.c b/lib/vconn-stream.c new file mode 100644 index 00000000..468c112c --- /dev/null +++ b/lib/vconn-stream.c @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "vconn-stream.h" +#include +#include +#include +#include +#include +#include +#include +#include "leak-checker.h" +#include "ofpbuf.h" +#include "openflow/openflow.h" +#include "poll-loop.h" +#include "socket-util.h" +#include "util.h" +#include "vconn-provider.h" +#include "vconn.h" + +#include "vlog.h" +#define THIS_MODULE VLM_vconn_stream + +/* Active stream socket vconn. */ + +struct stream_vconn +{ + struct vconn vconn; + int fd; + struct ofpbuf *rxbuf; + struct ofpbuf *txbuf; + struct poll_waiter *tx_waiter; +}; + +static struct vconn_class stream_vconn_class; + +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 25); + +static void stream_clear_txbuf(struct stream_vconn *); + +int +new_stream_vconn(const char *name, int fd, int connect_status, + uint32_t ip, bool reconnectable, struct vconn **vconnp) +{ + struct stream_vconn *s; + + s = xmalloc(sizeof *s); + vconn_init(&s->vconn, &stream_vconn_class, connect_status, ip, name, + reconnectable); + s->fd = fd; + s->txbuf = NULL; + s->tx_waiter = NULL; + s->rxbuf = NULL; + *vconnp = &s->vconn; + return 0; +} + +static struct stream_vconn * +stream_vconn_cast(struct vconn *vconn) +{ + vconn_assert_class(vconn, &stream_vconn_class); + return CONTAINER_OF(vconn, struct stream_vconn, vconn); +} + +static void +stream_close(struct vconn *vconn) +{ + struct stream_vconn *s = stream_vconn_cast(vconn); + poll_cancel(s->tx_waiter); + stream_clear_txbuf(s); + ofpbuf_delete(s->rxbuf); + close(s->fd); + free(s); +} + +static int +stream_connect(struct vconn *vconn) +{ + struct stream_vconn *s = stream_vconn_cast(vconn); + return check_connection_completion(s->fd); +} + +static int +stream_recv(struct vconn *vconn, struct ofpbuf **bufferp) +{ + struct stream_vconn *s = stream_vconn_cast(vconn); + struct ofpbuf *rx; + size_t want_bytes; + ssize_t retval; + + if (s->rxbuf == NULL) { + s->rxbuf = ofpbuf_new(1564); + } + rx = s->rxbuf; + +again: + if (sizeof(struct ofp_header) > rx->size) { + want_bytes = sizeof(struct ofp_header) - rx->size; + } else { + struct ofp_header *oh = rx->data; + size_t length = ntohs(oh->length); + if (length < sizeof(struct ofp_header)) { + VLOG_ERR_RL(&rl, "received too-short ofp_header (%zu bytes)", + length); + return EPROTO; + } + want_bytes = length - rx->size; + if (!want_bytes) { + *bufferp = rx; + s->rxbuf = NULL; + return 0; + } + } + ofpbuf_prealloc_tailroom(rx, want_bytes); + + retval = read(s->fd, ofpbuf_tail(rx), want_bytes); + if (retval > 0) { + rx->size += retval; + if (retval == want_bytes) { + if (rx->size > sizeof(struct ofp_header)) { + *bufferp = rx; + s->rxbuf = NULL; + return 0; + } else { + goto again; + } + } + return EAGAIN; + } else if (retval == 0) { + if (rx->size) { + VLOG_ERR_RL(&rl, "connection dropped mid-packet"); + return EPROTO; + } else { + return EOF; + } + } else { + return errno; + } +} + +static void +stream_clear_txbuf(struct stream_vconn *s) +{ + ofpbuf_delete(s->txbuf); + s->txbuf = NULL; + s->tx_waiter = NULL; +} + +static void +stream_do_tx(int fd UNUSED, short int revents UNUSED, void *vconn_) +{ + struct vconn *vconn = vconn_; + struct stream_vconn *s = stream_vconn_cast(vconn); + ssize_t n = write(s->fd, s->txbuf->data, s->txbuf->size); + if (n < 0) { + if (errno != EAGAIN) { + VLOG_ERR_RL(&rl, "send: %s", strerror(errno)); + stream_clear_txbuf(s); + return; + } + } else if (n > 0) { + ofpbuf_pull(s->txbuf, n); + if (!s->txbuf->size) { + stream_clear_txbuf(s); + return; + } + } + s->tx_waiter = poll_fd_callback(s->fd, POLLOUT, stream_do_tx, vconn); +} + +static int +stream_send(struct vconn *vconn, struct ofpbuf *buffer) +{ + struct stream_vconn *s = stream_vconn_cast(vconn); + ssize_t retval; + + if (s->txbuf) { + return EAGAIN; + } + + retval = write(s->fd, buffer->data, buffer->size); + if (retval == buffer->size) { + ofpbuf_delete(buffer); + return 0; + } else if (retval >= 0 || errno == EAGAIN) { + leak_checker_claim(buffer); + s->txbuf = buffer; + if (retval > 0) { + ofpbuf_pull(buffer, retval); + } + s->tx_waiter = poll_fd_callback(s->fd, POLLOUT, stream_do_tx, vconn); + return 0; + } else { + return errno; + } +} + +static void +stream_wait(struct vconn *vconn, enum vconn_wait_type wait) +{ + struct stream_vconn *s = stream_vconn_cast(vconn); + switch (wait) { + case WAIT_CONNECT: + poll_fd_wait(s->fd, POLLOUT); + break; + + case WAIT_SEND: + if (!s->txbuf) { + poll_fd_wait(s->fd, POLLOUT); + } else { + /* Nothing to do: need to drain txbuf first. */ + } + break; + + case WAIT_RECV: + poll_fd_wait(s->fd, POLLIN); + break; + + default: + NOT_REACHED(); + } +} + +static struct vconn_class stream_vconn_class = { + "stream", /* name */ + NULL, /* open */ + stream_close, /* close */ + stream_connect, /* connect */ + stream_recv, /* recv */ + stream_send, /* send */ + stream_wait, /* wait */ +}; + +/* Passive stream socket vconn. */ + +struct pstream_pvconn +{ + struct pvconn pvconn; + int fd; + int (*accept_cb)(int fd, const struct sockaddr *, size_t sa_len, + struct vconn **); +}; + +static struct pvconn_class pstream_pvconn_class; + +static struct pstream_pvconn * +pstream_pvconn_cast(struct pvconn *pvconn) +{ + pvconn_assert_class(pvconn, &pstream_pvconn_class); + return CONTAINER_OF(pvconn, struct pstream_pvconn, pvconn); +} + +int +new_pstream_pvconn(const char *name, int fd, + int (*accept_cb)(int fd, const struct sockaddr *, + size_t sa_len, struct vconn **), + struct pvconn **pvconnp) +{ + struct pstream_pvconn *ps; + int retval; + + retval = set_nonblocking(fd); + if (retval) { + close(fd); + return retval; + } + + if (listen(fd, 10) < 0) { + int error = errno; + VLOG_ERR("%s: listen: %s", name, strerror(error)); + close(fd); + return error; + } + + ps = xmalloc(sizeof *ps); + pvconn_init(&ps->pvconn, &pstream_pvconn_class, name); + ps->fd = fd; + ps->accept_cb = accept_cb; + *pvconnp = &ps->pvconn; + return 0; +} + +static void +pstream_close(struct pvconn *pvconn) +{ + struct pstream_pvconn *ps = pstream_pvconn_cast(pvconn); + close(ps->fd); + free(ps); +} + +static int +pstream_accept(struct pvconn *pvconn, struct vconn **new_vconnp) +{ + struct pstream_pvconn *ps = pstream_pvconn_cast(pvconn); + struct sockaddr_storage ss; + socklen_t ss_len = sizeof ss; + int new_fd; + int retval; + + new_fd = accept(ps->fd, (struct sockaddr *) &ss, &ss_len); + if (new_fd < 0) { + int retval = errno; + if (retval != EAGAIN) { + VLOG_DBG_RL(&rl, "accept: %s", strerror(retval)); + } + return retval; + } + + retval = set_nonblocking(new_fd); + if (retval) { + close(new_fd); + return retval; + } + + return ps->accept_cb(new_fd, (const struct sockaddr *) &ss, ss_len, + new_vconnp); +} + +static void +pstream_wait(struct pvconn *pvconn) +{ + struct pstream_pvconn *ps = pstream_pvconn_cast(pvconn); + poll_fd_wait(ps->fd, POLLIN); +} + +static struct pvconn_class pstream_pvconn_class = { + "pstream", + NULL, + pstream_close, + pstream_accept, + pstream_wait +}; diff --git a/lib/vconn-stream.h b/lib/vconn-stream.h new file mode 100644 index 00000000..a5aa2dd9 --- /dev/null +++ b/lib/vconn-stream.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef VCONN_STREAM_H +#define VCONN_STREAM_H 1 + +#include +#include +#include + +struct vconn; +struct pvconn; +struct sockaddr; + +int new_stream_vconn(const char *name, int fd, int connect_status, + uint32_t ip, bool reconnectable, struct vconn **vconnp); +int new_pstream_pvconn(const char *name, int fd, + int (*accept_cb)(int fd, const struct sockaddr *, + size_t sa_len, struct vconn **), + struct pvconn **pvconnp); + +#endif /* vconn-stream.h */ diff --git a/lib/vconn-tcp.c b/lib/vconn-tcp.c new file mode 100644 index 00000000..081ac26d --- /dev/null +++ b/lib/vconn-tcp.c @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "vconn.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include "packets.h" +#include "socket-util.h" +#include "util.h" +#include "openflow/openflow.h" +#include "vconn-provider.h" +#include "vconn-stream.h" + +#include "vlog.h" +#define THIS_MODULE VLM_vconn_tcp + +/* Active TCP. */ + +static int +new_tcp_vconn(const char *name, int fd, int connect_status, + const struct sockaddr_in *sin, struct vconn **vconnp) +{ + int on = 1; + int retval; + + retval = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &on, sizeof on); + if (retval) { + VLOG_ERR("%s: setsockopt(TCP_NODELAY): %s", name, strerror(errno)); + close(fd); + return errno; + } + + return new_stream_vconn(name, fd, connect_status, sin->sin_addr.s_addr, + true, vconnp); +} + +static int +tcp_open(const char *name, char *suffix, struct vconn **vconnp) +{ + char *save_ptr; + const char *host_name; + const char *port_string; + struct sockaddr_in sin; + int retval; + int fd; + + /* Glibc 2.7 has a bug in strtok_r when compiling with optimization that + * can cause segfaults here: + * http://sources.redhat.com/bugzilla/show_bug.cgi?id=5614. + * Using "::" instead of the obvious ":" works around it. */ + host_name = strtok_r(suffix, "::", &save_ptr); + port_string = strtok_r(NULL, "::", &save_ptr); + if (!host_name) { + ovs_error(0, "%s: bad peer name format", name); + return EAFNOSUPPORT; + } + + memset(&sin, 0, sizeof sin); + sin.sin_family = AF_INET; + if (lookup_ip(host_name, &sin.sin_addr)) { + return ENOENT; + } + sin.sin_port = htons(port_string ? atoi(port_string) : OFP_TCP_PORT); + + fd = socket(AF_INET, SOCK_STREAM, 0); + if (fd < 0) { + VLOG_ERR("%s: socket: %s", name, strerror(errno)); + return errno; + } + + retval = set_nonblocking(fd); + if (retval) { + close(fd); + return retval; + } + + retval = connect(fd, (struct sockaddr *) &sin, sizeof sin); + if (retval < 0) { + if (errno == EINPROGRESS) { + return new_tcp_vconn(name, fd, EAGAIN, &sin, vconnp); + } else { + int error = errno; + VLOG_ERR("%s: connect: %s", name, strerror(error)); + close(fd); + return error; + } + } else { + return new_tcp_vconn(name, fd, 0, &sin, vconnp); + } +} + +struct vconn_class tcp_vconn_class = { + "tcp", /* name */ + tcp_open, /* open */ + NULL, /* close */ + NULL, /* connect */ + NULL, /* recv */ + NULL, /* send */ + NULL, /* wait */ +}; + +/* Passive TCP. */ + +static int ptcp_accept(int fd, const struct sockaddr *sa, size_t sa_len, + struct vconn **vconnp); + +static int +ptcp_open(const char *name, char *suffix, struct pvconn **pvconnp) +{ + struct sockaddr_in sin; + int retval; + int fd; + unsigned int yes = 1; + + fd = socket(AF_INET, SOCK_STREAM, 0); + if (fd < 0) { + VLOG_ERR("%s: socket: %s", name, strerror(errno)); + return errno; + } + + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof yes) < 0) { + VLOG_ERR("%s: setsockopt(SO_REUSEADDR): %s", name, strerror(errno)); + return errno; + } + + memset(&sin, 0, sizeof sin); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(atoi(suffix) ? atoi(suffix) : OFP_TCP_PORT); + retval = bind(fd, (struct sockaddr *) &sin, sizeof sin); + if (retval < 0) { + int error = errno; + VLOG_ERR("%s: bind: %s", name, strerror(error)); + close(fd); + return error; + } + + return new_pstream_pvconn("ptcp", fd, ptcp_accept, pvconnp); +} + +static int +ptcp_accept(int fd, const struct sockaddr *sa, size_t sa_len, + struct vconn **vconnp) +{ + const struct sockaddr_in *sin = (const struct sockaddr_in *) sa; + char name[128]; + + if (sa_len == sizeof(struct sockaddr_in) && sin->sin_family == AF_INET) { + sprintf(name, "tcp:"IP_FMT, IP_ARGS(&sin->sin_addr)); + if (sin->sin_port != htons(OFP_TCP_PORT)) { + sprintf(strchr(name, '\0'), ":%"PRIu16, ntohs(sin->sin_port)); + } + } else { + strcpy(name, "tcp"); + } + return new_tcp_vconn(name, fd, 0, sin, vconnp); +} + +struct pvconn_class ptcp_pvconn_class = { + "ptcp", + ptcp_open, + NULL, + NULL, + NULL +}; + diff --git a/lib/vconn-unix.c b/lib/vconn-unix.c new file mode 100644 index 00000000..54b5e23d --- /dev/null +++ b/lib/vconn-unix.c @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "vconn.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ofpbuf.h" +#include "openflow/openflow.h" +#include "packets.h" +#include "poll-loop.h" +#include "socket-util.h" +#include "util.h" +#include "vconn-provider.h" +#include "vconn-stream.h" + +#include "vlog.h" +#define THIS_MODULE VLM_vconn_unix + +/* Active UNIX socket. */ + +/* Number of unix sockets created so far, to ensure binding path uniqueness. */ +static int n_unix_sockets; + +static int +unix_open(const char *name, char *suffix, struct vconn **vconnp) +{ + const char *connect_path = suffix; + char bind_path[128]; + int fd; + + sprintf(bind_path, "/tmp/vconn-unix.%ld.%d", + (long int) getpid(), n_unix_sockets++); + fd = make_unix_socket(SOCK_STREAM, true, false, bind_path, connect_path); + if (fd < 0) { + VLOG_ERR("%s: connection to %s failed: %s", + bind_path, connect_path, strerror(-fd)); + return -fd; + } + + return new_stream_vconn(name, fd, check_connection_completion(fd), + 0, true, vconnp); +} + +struct vconn_class unix_vconn_class = { + "unix", /* name */ + unix_open, /* open */ + NULL, /* close */ + NULL, /* connect */ + NULL, /* recv */ + NULL, /* send */ + NULL, /* wait */ +}; + +/* Passive UNIX socket. */ + +static int punix_accept(int fd, const struct sockaddr *sa, size_t sa_len, + struct vconn **vconnp); + +static int +punix_open(const char *name UNUSED, char *suffix, struct pvconn **pvconnp) +{ + int fd; + + fd = make_unix_socket(SOCK_STREAM, true, true, suffix, NULL); + if (fd < 0) { + VLOG_ERR("%s: binding failed: %s", suffix, strerror(errno)); + return errno; + } + + return new_pstream_pvconn("punix", fd, punix_accept, pvconnp); +} + +static int +punix_accept(int fd, const struct sockaddr *sa, size_t sa_len, + struct vconn **vconnp) +{ + const struct sockaddr_un *sun = (const struct sockaddr_un *) sa; + int name_len = get_unix_name_len(sa_len); + char name[128]; + + if (name_len > 0) { + snprintf(name, sizeof name, "unix:%.*s", name_len, sun->sun_path); + } else { + strcpy(name, "unix"); + } + return new_stream_vconn(name, fd, 0, 0, true, vconnp); +} + +struct pvconn_class punix_pvconn_class = { + "punix", + punix_open, + NULL, + NULL, + NULL +}; + diff --git a/lib/vconn.c b/lib/vconn.c new file mode 100644 index 00000000..e4afb22e --- /dev/null +++ b/lib/vconn.c @@ -0,0 +1,1405 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "vconn-provider.h" +#include +#include +#include +#include +#include +#include +#include +#include "coverage.h" +#include "dynamic-string.h" +#include "flow.h" +#include "ofp-print.h" +#include "ofpbuf.h" +#include "openflow/nicira-ext.h" +#include "openflow/openflow.h" +#include "packets.h" +#include "poll-loop.h" +#include "random.h" +#include "util.h" + +#define THIS_MODULE VLM_vconn +#include "vlog.h" + +/* State of an active vconn.*/ +enum vconn_state { + /* This is the ordinary progression of states. */ + VCS_CONNECTING, /* Underlying vconn is not connected. */ + VCS_SEND_HELLO, /* Waiting to send OFPT_HELLO message. */ + VCS_RECV_HELLO, /* Waiting to receive OFPT_HELLO message. */ + VCS_CONNECTED, /* Connection established. */ + + /* These states are entered only when something goes wrong. */ + VCS_SEND_ERROR, /* Sending OFPT_ERROR message. */ + VCS_DISCONNECTED /* Connection failed or connection closed. */ +}; + +static struct vconn_class *vconn_classes[] = { + &tcp_vconn_class, + &unix_vconn_class, +#ifdef HAVE_OPENSSL + &ssl_vconn_class, +#endif +}; + +static struct pvconn_class *pvconn_classes[] = { + &ptcp_pvconn_class, + &punix_pvconn_class, +#ifdef HAVE_OPENSSL + &pssl_pvconn_class, +#endif +}; + +/* Rate limit for individual OpenFlow messages going over the vconn, output at + * DBG level. This is very high because, if these are enabled, it is because + * we really need to see them. */ +static struct vlog_rate_limit ofmsg_rl = VLOG_RATE_LIMIT_INIT(600, 600); + +/* Rate limit for OpenFlow message parse errors. These always indicate a bug + * in the peer and so there's not much point in showing a lot of them. */ +static struct vlog_rate_limit bad_ofmsg_rl = VLOG_RATE_LIMIT_INIT(1, 5); + +static int do_recv(struct vconn *, struct ofpbuf **); +static int do_send(struct vconn *, struct ofpbuf *); + +/* Check the validity of the vconn class structures. */ +static void +check_vconn_classes(void) +{ +#ifndef NDEBUG + size_t i; + + for (i = 0; i < ARRAY_SIZE(vconn_classes); i++) { + struct vconn_class *class = vconn_classes[i]; + assert(class->name != NULL); + assert(class->open != NULL); + if (class->close || class->recv || class->send || class->wait) { + assert(class->close != NULL); + assert(class->recv != NULL); + assert(class->send != NULL); + assert(class->wait != NULL); + } else { + /* This class delegates to another one. */ + } + } + + for (i = 0; i < ARRAY_SIZE(pvconn_classes); i++) { + struct pvconn_class *class = pvconn_classes[i]; + assert(class->name != NULL); + assert(class->listen != NULL); + if (class->close || class->accept || class->wait) { + assert(class->close != NULL); + assert(class->accept != NULL); + assert(class->wait != NULL); + } else { + /* This class delegates to another one. */ + } + } +#endif +} + +/* Prints information on active (if 'active') and passive (if 'passive') + * connection methods supported by the vconn. If 'bootstrap' is true, also + * advertises options to bootstrap the CA certificate. */ +void +vconn_usage(bool active, bool passive, bool bootstrap UNUSED) +{ + /* Really this should be implemented via callbacks into the vconn + * providers, but that seems too heavy-weight to bother with at the + * moment. */ + + printf("\n"); + if (active) { + printf("Active OpenFlow connection methods:\n"); + printf(" tcp:HOST[:PORT] " + "PORT (default: %d) on remote TCP HOST\n", OFP_TCP_PORT); +#ifdef HAVE_OPENSSL + printf(" ssl:HOST[:PORT] " + "SSL PORT (default: %d) on remote HOST\n", OFP_SSL_PORT); +#endif + printf(" unix:FILE Unix domain socket named FILE\n"); + } + + if (passive) { + printf("Passive OpenFlow connection methods:\n"); + printf(" ptcp:[PORT] " + "listen to TCP PORT (default: %d)\n", + OFP_TCP_PORT); +#ifdef HAVE_OPENSSL + printf(" pssl:[PORT] " + "listen for SSL on PORT (default: %d)\n", + OFP_SSL_PORT); +#endif + printf(" punix:FILE " + "listen on Unix domain socket FILE\n"); + } + +#ifdef HAVE_OPENSSL + printf("PKI configuration (required to use SSL):\n" + " -p, --private-key=FILE file with private key\n" + " -c, --certificate=FILE file with certificate for private key\n" + " -C, --ca-cert=FILE file with peer CA certificate\n"); + if (bootstrap) { + printf(" --bootstrap-ca-cert=FILE file with peer CA certificate " + "to read or create\n"); + } +#endif +} + +/* Attempts to connect to an OpenFlow device. 'name' is a connection name in + * the form "TYPE:ARGS", where TYPE is an active vconn class's name and ARGS + * are vconn class-specific. + * + * The vconn will automatically negotiate an OpenFlow protocol version + * acceptable to both peers on the connection. The version negotiated will be + * no lower than 'min_version' and no higher than OFP_VERSION. + * + * Returns 0 if successful, otherwise a positive errno value. If successful, + * stores a pointer to the new connection in '*vconnp', otherwise a null + * pointer. */ +int +vconn_open(const char *name, int min_version, struct vconn **vconnp) +{ + size_t prefix_len; + size_t i; + + COVERAGE_INC(vconn_open); + check_vconn_classes(); + + *vconnp = NULL; + prefix_len = strcspn(name, ":"); + if (prefix_len == strlen(name)) { + return EAFNOSUPPORT; + } + for (i = 0; i < ARRAY_SIZE(vconn_classes); i++) { + struct vconn_class *class = vconn_classes[i]; + if (strlen(class->name) == prefix_len + && !memcmp(class->name, name, prefix_len)) { + struct vconn *vconn; + char *suffix_copy = xstrdup(name + prefix_len + 1); + int retval = class->open(name, suffix_copy, &vconn); + free(suffix_copy); + if (!retval) { + assert(vconn->state != VCS_CONNECTING + || vconn->class->connect); + vconn->min_version = min_version; + *vconnp = vconn; + } + return retval; + } + } + return EAFNOSUPPORT; +} + +int +vconn_open_block(const char *name, int min_version, struct vconn **vconnp) +{ + struct vconn *vconn; + int error; + + error = vconn_open(name, min_version, &vconn); + while (error == EAGAIN) { + vconn_connect_wait(vconn); + poll_block(); + error = vconn_connect(vconn); + assert(error != EINPROGRESS); + } + if (error) { + vconn_close(vconn); + *vconnp = NULL; + } else { + *vconnp = vconn; + } + return error; +} + +/* Closes 'vconn'. */ +void +vconn_close(struct vconn *vconn) +{ + if (vconn != NULL) { + char *name = vconn->name; + (vconn->class->close)(vconn); + free(name); + } +} + +/* Returns the name of 'vconn', that is, the string passed to vconn_open(). */ +const char * +vconn_get_name(const struct vconn *vconn) +{ + return vconn->name; +} + +/* Returns the IP address of the peer, or 0 if the peer is not connected over + * an IP-based protocol or if its IP address is not yet known. */ +uint32_t +vconn_get_ip(const struct vconn *vconn) +{ + return vconn->ip; +} + +static void +vcs_connecting(struct vconn *vconn) +{ + int retval = (vconn->class->connect)(vconn); + assert(retval != EINPROGRESS); + if (!retval) { + vconn->state = VCS_SEND_HELLO; + } else if (retval != EAGAIN) { + vconn->state = VCS_DISCONNECTED; + vconn->error = retval; + } +} + +static void +vcs_send_hello(struct vconn *vconn) +{ + struct ofpbuf *b; + int retval; + + make_openflow(sizeof(struct ofp_header), OFPT_HELLO, &b); + retval = do_send(vconn, b); + if (!retval) { + vconn->state = VCS_RECV_HELLO; + } else { + ofpbuf_delete(b); + if (retval != EAGAIN) { + vconn->state = VCS_DISCONNECTED; + vconn->error = retval; + } + } +} + +static void +vcs_recv_hello(struct vconn *vconn) +{ + struct ofpbuf *b; + int retval; + + retval = do_recv(vconn, &b); + if (!retval) { + struct ofp_header *oh = b->data; + + if (oh->type == OFPT_HELLO) { + if (b->size > sizeof *oh) { + struct ds msg = DS_EMPTY_INITIALIZER; + ds_put_format(&msg, "%s: extra-long hello:\n", vconn->name); + ds_put_hex_dump(&msg, b->data, b->size, 0, true); + VLOG_WARN_RL(&bad_ofmsg_rl, "%s", ds_cstr(&msg)); + ds_destroy(&msg); + } + + vconn->version = MIN(OFP_VERSION, oh->version); + if (vconn->version < vconn->min_version) { + VLOG_WARN_RL(&bad_ofmsg_rl, + "%s: version negotiation failed: we support " + "versions 0x%02x to 0x%02x inclusive but peer " + "supports no later than version 0x%02"PRIx8, + vconn->name, vconn->min_version, OFP_VERSION, + oh->version); + vconn->state = VCS_SEND_ERROR; + } else { + VLOG_DBG("%s: negotiated OpenFlow version 0x%02x " + "(we support versions 0x%02x to 0x%02x inclusive, " + "peer no later than version 0x%02"PRIx8")", + vconn->name, vconn->version, vconn->min_version, + OFP_VERSION, oh->version); + vconn->state = VCS_CONNECTED; + } + ofpbuf_delete(b); + return; + } else { + char *s = ofp_to_string(b->data, b->size, 1); + VLOG_WARN_RL(&bad_ofmsg_rl, + "%s: received message while expecting hello: %s", + vconn->name, s); + free(s); + retval = EPROTO; + ofpbuf_delete(b); + } + } + + if (retval != EAGAIN) { + vconn->state = VCS_DISCONNECTED; + vconn->error = retval; + } +} + +static void +vcs_send_error(struct vconn *vconn) +{ + struct ofp_error_msg *error; + struct ofpbuf *b; + char s[128]; + int retval; + + snprintf(s, sizeof s, "We support versions 0x%02x to 0x%02x inclusive but " + "you support no later than version 0x%02"PRIx8".", + vconn->min_version, OFP_VERSION, vconn->version); + error = make_openflow(sizeof *error, OFPT_ERROR, &b); + error->type = htons(OFPET_HELLO_FAILED); + error->code = htons(OFPHFC_INCOMPATIBLE); + ofpbuf_put(b, s, strlen(s)); + update_openflow_length(b); + retval = do_send(vconn, b); + if (retval) { + ofpbuf_delete(b); + } + if (retval != EAGAIN) { + vconn->state = VCS_DISCONNECTED; + vconn->error = retval ? retval : EPROTO; + } +} + +/* Tries to complete the connection on 'vconn', which must be an active + * vconn. If 'vconn''s connection is complete, returns 0 if the connection + * was successful or a positive errno value if it failed. If the + * connection is still in progress, returns EAGAIN. */ +int +vconn_connect(struct vconn *vconn) +{ + enum vconn_state last_state; + + assert(vconn->min_version >= 0); + do { + last_state = vconn->state; + switch (vconn->state) { + case VCS_CONNECTING: + vcs_connecting(vconn); + break; + + case VCS_SEND_HELLO: + vcs_send_hello(vconn); + break; + + case VCS_RECV_HELLO: + vcs_recv_hello(vconn); + break; + + case VCS_CONNECTED: + return 0; + + case VCS_SEND_ERROR: + vcs_send_error(vconn); + break; + + case VCS_DISCONNECTED: + return vconn->error; + + default: + NOT_REACHED(); + } + } while (vconn->state != last_state); + + return EAGAIN; +} + +/* Tries to receive an OpenFlow message from 'vconn', which must be an active + * vconn. If successful, stores the received message into '*msgp' and returns + * 0. The caller is responsible for destroying the message with + * ofpbuf_delete(). On failure, returns a positive errno value and stores a + * null pointer into '*msgp'. On normal connection close, returns EOF. + * + * vconn_recv will not block waiting for a packet to arrive. If no packets + * have been received, it returns EAGAIN immediately. */ +int +vconn_recv(struct vconn *vconn, struct ofpbuf **msgp) +{ + int retval = vconn_connect(vconn); + if (!retval) { + retval = do_recv(vconn, msgp); + } + return retval; +} + +static int +do_recv(struct vconn *vconn, struct ofpbuf **msgp) +{ + int retval; + +again: + retval = (vconn->class->recv)(vconn, msgp); + if (!retval) { + struct ofp_header *oh; + + COVERAGE_INC(vconn_received); + if (VLOG_IS_DBG_ENABLED()) { + char *s = ofp_to_string((*msgp)->data, (*msgp)->size, 1); + VLOG_DBG_RL(&ofmsg_rl, "%s: received: %s", vconn->name, s); + free(s); + } + + oh = ofpbuf_at_assert(*msgp, 0, sizeof *oh); + if (oh->version != vconn->version + && oh->type != OFPT_HELLO + && oh->type != OFPT_ERROR + && oh->type != OFPT_ECHO_REQUEST + && oh->type != OFPT_ECHO_REPLY + && oh->type != OFPT_VENDOR) + { + if (vconn->version < 0) { + if (oh->type == OFPT_PACKET_IN + || oh->type == OFPT_FLOW_EXPIRED + || oh->type == OFPT_PORT_STATUS) { + /* The kernel datapath is stateless and doesn't really + * support version negotiation, so it can end up sending + * these asynchronous message before version negotiation + * is complete. Just ignore them. + * + * (After we move OFPT_PORT_STATUS messages from the kernel + * into secchan, we won't get those here, since secchan + * does proper version negotiation.) */ + ofpbuf_delete(*msgp); + goto again; + } + VLOG_ERR_RL(&bad_ofmsg_rl, + "%s: received OpenFlow message type %"PRIu8" " + "before version negotiation complete", + vconn->name, oh->type); + } else { + VLOG_ERR_RL(&bad_ofmsg_rl, + "%s: received OpenFlow version 0x%02"PRIx8" " + "!= expected %02x", + vconn->name, oh->version, vconn->version); + } + ofpbuf_delete(*msgp); + retval = EPROTO; + } + } + if (retval) { + *msgp = NULL; + } + return retval; +} + +/* Tries to queue 'msg' for transmission on 'vconn', which must be an active + * vconn. If successful, returns 0, in which case ownership of 'msg' is + * transferred to the vconn. Success does not guarantee that 'msg' has been or + * ever will be delivered to the peer, only that it has been queued for + * transmission. + * + * Returns a positive errno value on failure, in which case the caller + * retains ownership of 'msg'. + * + * vconn_send will not block. If 'msg' cannot be immediately accepted for + * transmission, it returns EAGAIN immediately. */ +int +vconn_send(struct vconn *vconn, struct ofpbuf *msg) +{ + int retval = vconn_connect(vconn); + if (!retval) { + retval = do_send(vconn, msg); + } + return retval; +} + +static int +do_send(struct vconn *vconn, struct ofpbuf *msg) +{ + int retval; + + assert(msg->size >= sizeof(struct ofp_header)); + assert(((struct ofp_header *) msg->data)->length == htons(msg->size)); + if (!VLOG_IS_DBG_ENABLED()) { + COVERAGE_INC(vconn_sent); + retval = (vconn->class->send)(vconn, msg); + } else { + char *s = ofp_to_string(msg->data, msg->size, 1); + retval = (vconn->class->send)(vconn, msg); + if (retval != EAGAIN) { + VLOG_DBG_RL(&ofmsg_rl, "%s: sent (%s): %s", + vconn->name, strerror(retval), s); + } + free(s); + } + return retval; +} + +/* Same as vconn_send, except that it waits until 'msg' can be transmitted. */ +int +vconn_send_block(struct vconn *vconn, struct ofpbuf *msg) +{ + int retval; + while ((retval = vconn_send(vconn, msg)) == EAGAIN) { + vconn_send_wait(vconn); + poll_block(); + } + return retval; +} + +/* Same as vconn_recv, except that it waits until a message is received. */ +int +vconn_recv_block(struct vconn *vconn, struct ofpbuf **msgp) +{ + int retval; + while ((retval = vconn_recv(vconn, msgp)) == EAGAIN) { + vconn_recv_wait(vconn); + poll_block(); + } + return retval; +} + +/* Waits until a message with a transaction ID matching 'xid' is recived on + * 'vconn'. Returns 0 if successful, in which case the reply is stored in + * '*replyp' for the caller to examine and free. Otherwise returns a positive + * errno value, or EOF, and sets '*replyp' to null. + * + * 'request' is always destroyed, regardless of the return value. */ +int +vconn_recv_xid(struct vconn *vconn, uint32_t xid, struct ofpbuf **replyp) +{ + for (;;) { + uint32_t recv_xid; + struct ofpbuf *reply; + int error; + + error = vconn_recv_block(vconn, &reply); + if (error) { + *replyp = NULL; + return error; + } + recv_xid = ((struct ofp_header *) reply->data)->xid; + if (xid == recv_xid) { + *replyp = reply; + return 0; + } + + VLOG_DBG_RL(&bad_ofmsg_rl, "%s: received reply with xid %08"PRIx32 + " != expected %08"PRIx32, vconn->name, recv_xid, xid); + ofpbuf_delete(reply); + } +} + +/* Sends 'request' to 'vconn' and blocks until it receives a reply with a + * matching transaction ID. Returns 0 if successful, in which case the reply + * is stored in '*replyp' for the caller to examine and free. Otherwise + * returns a positive errno value, or EOF, and sets '*replyp' to null. + * + * 'request' is always destroyed, regardless of the return value. */ +int +vconn_transact(struct vconn *vconn, struct ofpbuf *request, + struct ofpbuf **replyp) +{ + uint32_t send_xid = ((struct ofp_header *) request->data)->xid; + int error; + + *replyp = NULL; + error = vconn_send_block(vconn, request); + if (error) { + ofpbuf_delete(request); + } + return error ? error : vconn_recv_xid(vconn, send_xid, replyp); +} + +void +vconn_wait(struct vconn *vconn, enum vconn_wait_type wait) +{ + assert(wait == WAIT_CONNECT || wait == WAIT_RECV || wait == WAIT_SEND); + + switch (vconn->state) { + case VCS_CONNECTING: + wait = WAIT_CONNECT; + break; + + case VCS_SEND_HELLO: + case VCS_SEND_ERROR: + wait = WAIT_SEND; + break; + + case VCS_RECV_HELLO: + wait = WAIT_RECV; + break; + + case VCS_CONNECTED: + break; + + case VCS_DISCONNECTED: + poll_immediate_wake(); + return; + } + (vconn->class->wait)(vconn, wait); +} + +void +vconn_connect_wait(struct vconn *vconn) +{ + vconn_wait(vconn, WAIT_CONNECT); +} + +void +vconn_recv_wait(struct vconn *vconn) +{ + vconn_wait(vconn, WAIT_RECV); +} + +void +vconn_send_wait(struct vconn *vconn) +{ + vconn_wait(vconn, WAIT_SEND); +} + +/* Attempts to start listening for OpenFlow connections. 'name' is a + * connection name in the form "TYPE:ARGS", where TYPE is an passive vconn + * class's name and ARGS are vconn class-specific. + * + * Returns 0 if successful, otherwise a positive errno value. If successful, + * stores a pointer to the new connection in '*pvconnp', otherwise a null + * pointer. */ +int +pvconn_open(const char *name, struct pvconn **pvconnp) +{ + size_t prefix_len; + size_t i; + + check_vconn_classes(); + + *pvconnp = NULL; + prefix_len = strcspn(name, ":"); + if (prefix_len == strlen(name)) { + return EAFNOSUPPORT; + } + for (i = 0; i < ARRAY_SIZE(pvconn_classes); i++) { + struct pvconn_class *class = pvconn_classes[i]; + if (strlen(class->name) == prefix_len + && !memcmp(class->name, name, prefix_len)) { + char *suffix_copy = xstrdup(name + prefix_len + 1); + int retval = class->listen(name, suffix_copy, pvconnp); + free(suffix_copy); + if (retval) { + *pvconnp = NULL; + } + return retval; + } + } + return EAFNOSUPPORT; +} + +/* Returns the name that was used to open 'pvconn'. The caller must not + * modify or free the name. */ +const char * +pvconn_get_name(const struct pvconn *pvconn) +{ + return pvconn->name; +} + +/* Closes 'pvconn'. */ +void +pvconn_close(struct pvconn *pvconn) +{ + if (pvconn != NULL) { + char *name = pvconn->name; + (pvconn->class->close)(pvconn); + free(name); + } +} + +/* Tries to accept a new connection on 'pvconn'. If successful, stores the new + * connection in '*new_vconn' and returns 0. Otherwise, returns a positive + * errno value. + * + * The new vconn will automatically negotiate an OpenFlow protocol version + * acceptable to both peers on the connection. The version negotiated will be + * no lower than 'min_version' and no higher than OFP_VERSION. + * + * pvconn_accept() will not block waiting for a connection. If no connection + * is ready to be accepted, it returns EAGAIN immediately. */ +int +pvconn_accept(struct pvconn *pvconn, int min_version, struct vconn **new_vconn) +{ + int retval = (pvconn->class->accept)(pvconn, new_vconn); + if (retval) { + *new_vconn = NULL; + } else { + assert((*new_vconn)->state != VCS_CONNECTING + || (*new_vconn)->class->connect); + (*new_vconn)->min_version = min_version; + } + return retval; +} + +void +pvconn_wait(struct pvconn *pvconn) +{ + (pvconn->class->wait)(pvconn); +} + +/* XXX we should really use consecutive xids to avoid probabilistic + * failures. */ +static inline uint32_t +alloc_xid(void) +{ + return random_uint32(); +} + +/* Allocates and stores in '*bufferp' a new ofpbuf with a size of + * 'openflow_len', starting with an OpenFlow header with the given 'type' and + * an arbitrary transaction id. Allocated bytes beyond the header, if any, are + * zeroed. + * + * The caller is responsible for freeing '*bufferp' when it is no longer + * needed. + * + * The OpenFlow header length is initially set to 'openflow_len'; if the + * message is later extended, the length should be updated with + * update_openflow_length() before sending. + * + * Returns the header. */ +void * +make_openflow(size_t openflow_len, uint8_t type, struct ofpbuf **bufferp) +{ + *bufferp = ofpbuf_new(openflow_len); + return put_openflow_xid(openflow_len, type, alloc_xid(), *bufferp); +} + +/* Allocates and stores in '*bufferp' a new ofpbuf with a size of + * 'openflow_len', starting with an OpenFlow header with the given 'type' and + * transaction id 'xid'. Allocated bytes beyond the header, if any, are + * zeroed. + * + * The caller is responsible for freeing '*bufferp' when it is no longer + * needed. + * + * The OpenFlow header length is initially set to 'openflow_len'; if the + * message is later extended, the length should be updated with + * update_openflow_length() before sending. + * + * Returns the header. */ +void * +make_openflow_xid(size_t openflow_len, uint8_t type, uint32_t xid, + struct ofpbuf **bufferp) +{ + *bufferp = ofpbuf_new(openflow_len); + return put_openflow_xid(openflow_len, type, xid, *bufferp); +} + +/* Appends 'openflow_len' bytes to 'buffer', starting with an OpenFlow header + * with the given 'type' and an arbitrary transaction id. Allocated bytes + * beyond the header, if any, are zeroed. + * + * The OpenFlow header length is initially set to 'openflow_len'; if the + * message is later extended, the length should be updated with + * update_openflow_length() before sending. + * + * Returns the header. */ +void * +put_openflow(size_t openflow_len, uint8_t type, struct ofpbuf *buffer) +{ + return put_openflow_xid(openflow_len, type, alloc_xid(), buffer); +} + +/* Appends 'openflow_len' bytes to 'buffer', starting with an OpenFlow header + * with the given 'type' and an transaction id 'xid'. Allocated bytes beyond + * the header, if any, are zeroed. + * + * The OpenFlow header length is initially set to 'openflow_len'; if the + * message is later extended, the length should be updated with + * update_openflow_length() before sending. + * + * Returns the header. */ +void * +put_openflow_xid(size_t openflow_len, uint8_t type, uint32_t xid, + struct ofpbuf *buffer) +{ + struct ofp_header *oh; + + assert(openflow_len >= sizeof *oh); + assert(openflow_len <= UINT16_MAX); + + oh = ofpbuf_put_uninit(buffer, openflow_len); + oh->version = OFP_VERSION; + oh->type = type; + oh->length = htons(openflow_len); + oh->xid = xid; + memset(oh + 1, 0, openflow_len - sizeof *oh); + return oh; +} + +/* Updates the 'length' field of the OpenFlow message in 'buffer' to + * 'buffer->size'. */ +void +update_openflow_length(struct ofpbuf *buffer) +{ + struct ofp_header *oh = ofpbuf_at_assert(buffer, 0, sizeof *oh); + oh->length = htons(buffer->size); +} + +struct ofpbuf * +make_flow_mod(uint16_t command, const flow_t *flow, size_t actions_len) +{ + struct ofp_flow_mod *ofm; + size_t size = sizeof *ofm + actions_len; + struct ofpbuf *out = ofpbuf_new(size); + ofm = ofpbuf_put_zeros(out, sizeof *ofm); + ofm->header.version = OFP_VERSION; + ofm->header.type = OFPT_FLOW_MOD; + ofm->header.length = htons(size); + ofm->match.wildcards = htonl(0); + ofm->match.in_port = htons(flow->in_port == ODPP_LOCAL ? OFPP_LOCAL + : flow->in_port); + memcpy(ofm->match.dl_src, flow->dl_src, sizeof ofm->match.dl_src); + memcpy(ofm->match.dl_dst, flow->dl_dst, sizeof ofm->match.dl_dst); + ofm->match.dl_vlan = flow->dl_vlan; + ofm->match.dl_type = flow->dl_type; + ofm->match.nw_src = flow->nw_src; + ofm->match.nw_dst = flow->nw_dst; + ofm->match.nw_proto = flow->nw_proto; + ofm->match.tp_src = flow->tp_src; + ofm->match.tp_dst = flow->tp_dst; + ofm->command = htons(command); + return out; +} + +struct ofpbuf * +make_add_flow(const flow_t *flow, uint32_t buffer_id, + uint16_t idle_timeout, size_t actions_len) +{ + struct ofpbuf *out = make_flow_mod(OFPFC_ADD, flow, actions_len); + struct ofp_flow_mod *ofm = out->data; + ofm->idle_timeout = htons(idle_timeout); + ofm->hard_timeout = htons(OFP_FLOW_PERMANENT); + ofm->buffer_id = htonl(buffer_id); + return out; +} + +struct ofpbuf * +make_del_flow(const flow_t *flow) +{ + struct ofpbuf *out = make_flow_mod(OFPFC_DELETE_STRICT, flow, 0); + struct ofp_flow_mod *ofm = out->data; + ofm->out_port = htons(OFPP_NONE); + return out; +} + +struct ofpbuf * +make_add_simple_flow(const flow_t *flow, + uint32_t buffer_id, uint16_t out_port, + uint16_t idle_timeout) +{ + struct ofp_action_output *oao; + struct ofpbuf *buffer = make_add_flow(flow, buffer_id, idle_timeout, + sizeof *oao); + oao = ofpbuf_put_zeros(buffer, sizeof *oao); + oao->type = htons(OFPAT_OUTPUT); + oao->len = htons(sizeof *oao); + oao->port = htons(out_port); + return buffer; +} + +struct ofpbuf * +make_packet_out(const struct ofpbuf *packet, uint32_t buffer_id, + uint16_t in_port, + const struct ofp_action_header *actions, size_t n_actions) +{ + size_t actions_len = n_actions * sizeof *actions; + struct ofp_packet_out *opo; + size_t size = sizeof *opo + actions_len + (packet ? packet->size : 0); + struct ofpbuf *out = ofpbuf_new(size); + + opo = ofpbuf_put_uninit(out, sizeof *opo); + opo->header.version = OFP_VERSION; + opo->header.type = OFPT_PACKET_OUT; + opo->header.length = htons(size); + opo->header.xid = htonl(0); + opo->buffer_id = htonl(buffer_id); + opo->in_port = htons(in_port == ODPP_LOCAL ? OFPP_LOCAL : in_port); + opo->actions_len = htons(actions_len); + ofpbuf_put(out, actions, actions_len); + if (packet) { + ofpbuf_put(out, packet->data, packet->size); + } + return out; +} + +struct ofpbuf * +make_unbuffered_packet_out(const struct ofpbuf *packet, + uint16_t in_port, uint16_t out_port) +{ + struct ofp_action_output action; + action.type = htons(OFPAT_OUTPUT); + action.len = htons(sizeof action); + action.port = htons(out_port); + return make_packet_out(packet, UINT32_MAX, in_port, + (struct ofp_action_header *) &action, 1); +} + +struct ofpbuf * +make_buffered_packet_out(uint32_t buffer_id, + uint16_t in_port, uint16_t out_port) +{ + struct ofp_action_output action; + action.type = htons(OFPAT_OUTPUT); + action.len = htons(sizeof action); + action.port = htons(out_port); + return make_packet_out(NULL, buffer_id, in_port, + (struct ofp_action_header *) &action, 1); +} + +/* Creates and returns an OFPT_ECHO_REQUEST message with an empty payload. */ +struct ofpbuf * +make_echo_request(void) +{ + struct ofp_header *rq; + struct ofpbuf *out = ofpbuf_new(sizeof *rq); + rq = ofpbuf_put_uninit(out, sizeof *rq); + rq->version = OFP_VERSION; + rq->type = OFPT_ECHO_REQUEST; + rq->length = htons(sizeof *rq); + rq->xid = 0; + return out; +} + +/* Creates and returns an OFPT_ECHO_REPLY message matching the + * OFPT_ECHO_REQUEST message in 'rq'. */ +struct ofpbuf * +make_echo_reply(const struct ofp_header *rq) +{ + size_t size = ntohs(rq->length); + struct ofpbuf *out = ofpbuf_new(size); + struct ofp_header *reply = ofpbuf_put(out, rq, size); + reply->type = OFPT_ECHO_REPLY; + return out; +} + +static int +check_message_type(uint8_t got_type, uint8_t want_type) +{ + if (got_type != want_type) { + char *want_type_name = ofp_message_type_to_string(want_type); + char *got_type_name = ofp_message_type_to_string(got_type); + VLOG_WARN_RL(&bad_ofmsg_rl, + "received bad message type %s (expected %s)", + got_type_name, want_type_name); + free(want_type_name); + free(got_type_name); + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_TYPE); + } + return 0; +} + +/* Checks that 'msg' has type 'type' and that it is exactly 'size' bytes long. + * Returns 0 if the checks pass, otherwise an OpenFlow error code (produced + * with ofp_mkerr()). */ +int +check_ofp_message(const struct ofp_header *msg, uint8_t type, size_t size) +{ + size_t got_size; + int error; + + error = check_message_type(msg->type, type); + if (error) { + return error; + } + + got_size = ntohs(msg->length); + if (got_size != size) { + char *type_name = ofp_message_type_to_string(type); + VLOG_WARN_RL(&bad_ofmsg_rl, + "received %s message of length %"PRIu16" (expected %zu)", + type_name, got_size, size); + free(type_name); + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH); + } + + return 0; +} + +/* Checks that 'msg' has type 'type' and that 'msg' is 'size' plus a + * nonnegative integer multiple of 'array_elt_size' bytes long. Returns 0 if + * the checks pass, otherwise an OpenFlow error code (produced with + * ofp_mkerr()). + * + * If 'n_array_elts' is nonnull, then '*n_array_elts' is set to the number of + * 'array_elt_size' blocks in 'msg' past the first 'min_size' bytes, when + * successful. */ +int +check_ofp_message_array(const struct ofp_header *msg, uint8_t type, + size_t min_size, size_t array_elt_size, + size_t *n_array_elts) +{ + size_t got_size; + int error; + + assert(array_elt_size); + + error = check_message_type(msg->type, type); + if (error) { + return error; + } + + got_size = ntohs(msg->length); + if (got_size < min_size) { + char *type_name = ofp_message_type_to_string(type); + VLOG_WARN_RL(&bad_ofmsg_rl, "received %s message of length %"PRIu16" " + "(expected at least %zu)", + type_name, got_size, min_size); + free(type_name); + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH); + } + if ((got_size - min_size) % array_elt_size) { + char *type_name = ofp_message_type_to_string(type); + VLOG_WARN_RL(&bad_ofmsg_rl, + "received %s message of bad length %"PRIu16": the " + "excess over %zu (%zu) is not evenly divisible by %zu " + "(remainder is %zu)", + type_name, got_size, min_size, got_size - min_size, + array_elt_size, (got_size - min_size) % array_elt_size); + free(type_name); + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH); + } + if (n_array_elts) { + *n_array_elts = (got_size - min_size) / array_elt_size; + } + return 0; +} + +int +check_ofp_packet_out(const struct ofp_header *oh, struct ofpbuf *data, + int *n_actionsp, int max_ports) +{ + const struct ofp_packet_out *opo; + unsigned int actions_len, n_actions; + size_t extra; + int error; + + *n_actionsp = 0; + error = check_ofp_message_array(oh, OFPT_PACKET_OUT, + sizeof *opo, 1, &extra); + if (error) { + return error; + } + opo = (const struct ofp_packet_out *) oh; + + actions_len = ntohs(opo->actions_len); + if (actions_len > extra) { + VLOG_WARN_RL(&bad_ofmsg_rl, "packet-out claims %zu bytes of actions " + "but message has room for only %zu bytes", + actions_len, extra); + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH); + } + if (actions_len % sizeof(union ofp_action)) { + VLOG_WARN_RL(&bad_ofmsg_rl, "packet-out claims %zu bytes of actions, " + "which is not a multiple of %zu", + actions_len, sizeof(union ofp_action)); + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH); + } + + n_actions = actions_len / sizeof(union ofp_action); + error = validate_actions((const union ofp_action *) opo->actions, + n_actions, max_ports); + if (error) { + return error; + } + + data->data = (void *) &opo->actions[n_actions]; + data->size = extra - actions_len; + *n_actionsp = n_actions; + return 0; +} + +const struct ofp_flow_stats * +flow_stats_first(struct flow_stats_iterator *iter, + const struct ofp_stats_reply *osr) +{ + iter->pos = osr->body; + iter->end = osr->body + (ntohs(osr->header.length) + - offsetof(struct ofp_stats_reply, body)); + return flow_stats_next(iter); +} + +const struct ofp_flow_stats * +flow_stats_next(struct flow_stats_iterator *iter) +{ + ptrdiff_t bytes_left = iter->end - iter->pos; + const struct ofp_flow_stats *fs; + size_t length; + + if (bytes_left < sizeof *fs) { + if (bytes_left != 0) { + VLOG_WARN_RL(&bad_ofmsg_rl, + "%td leftover bytes in flow stats reply", bytes_left); + } + return NULL; + } + + fs = (const void *) iter->pos; + length = ntohs(fs->length); + if (length < sizeof *fs) { + VLOG_WARN_RL(&bad_ofmsg_rl, "flow stats length %zu is shorter than " + "min %zu", length, sizeof *fs); + return NULL; + } else if (length > bytes_left) { + VLOG_WARN_RL(&bad_ofmsg_rl, "flow stats length %zu but only %td " + "bytes left", length, bytes_left); + return NULL; + } else if ((length - sizeof *fs) % sizeof fs->actions[0]) { + VLOG_WARN_RL(&bad_ofmsg_rl, "flow stats length %zu has %zu bytes " + "left over in final action", length, + (length - sizeof *fs) % sizeof fs->actions[0]); + return NULL; + } + iter->pos += length; + return fs; +} + +/* Alignment of ofp_actions. */ +#define ACTION_ALIGNMENT 8 + +static int +check_action_exact_len(const union ofp_action *a, unsigned int len, + unsigned int required_len) +{ + if (len != required_len) { + VLOG_DBG_RL(&bad_ofmsg_rl, + "action %u has invalid length %"PRIu16" (must be %u)\n", + a->type, ntohs(a->header.len), required_len); + return ofp_mkerr(OFPET_BAD_ACTION, OFPBAC_BAD_LEN); + } + return 0; +} + +static int +check_action_port(int port, int max_ports) +{ + switch (port) { + case OFPP_IN_PORT: + case OFPP_TABLE: + case OFPP_NORMAL: + case OFPP_FLOOD: + case OFPP_ALL: + case OFPP_CONTROLLER: + case OFPP_LOCAL: + return 0; + + default: + if (port >= 0 && port < max_ports) { + return 0; + } + VLOG_WARN_RL(&bad_ofmsg_rl, "unknown output port %x", port); + return ofp_mkerr(OFPET_BAD_ACTION, OFPBAC_BAD_OUT_PORT); + } +} + +static int +check_nicira_action(const union ofp_action *a, unsigned int len) +{ + const struct nx_action_header *nah; + + if (len < 16) { + VLOG_DBG_RL(&bad_ofmsg_rl, + "Nicira vendor action only %u bytes", len); + return ofp_mkerr(OFPET_BAD_ACTION, OFPBAC_BAD_LEN); + } + nah = (const struct nx_action_header *) a; + + switch (ntohs(nah->subtype)) { + case NXAST_RESUBMIT: + return check_action_exact_len(a, len, 16); + default: + return ofp_mkerr(OFPET_BAD_ACTION, OFPBAC_BAD_VENDOR_TYPE); + } +} + +static int +check_action(const union ofp_action *a, unsigned int len, int max_ports) +{ + int error; + + switch (a->type) { + case OFPAT_OUTPUT: + error = check_action_port(ntohs(a->output.port), max_ports); + if (error) { + return error; + } + return check_action_exact_len(a, len, 8); + + case OFPAT_SET_VLAN_VID: + case OFPAT_SET_VLAN_PCP: + case OFPAT_STRIP_VLAN: + case OFPAT_SET_NW_SRC: + case OFPAT_SET_NW_DST: + case OFPAT_SET_TP_SRC: + case OFPAT_SET_TP_DST: + return check_action_exact_len(a, len, 8); + + case OFPAT_SET_DL_SRC: + case OFPAT_SET_DL_DST: + return check_action_exact_len(a, len, 16); + + case OFPAT_VENDOR: + if (a->vendor.vendor == htonl(NX_VENDOR_ID)) { + return check_nicira_action(a, len); + } else { + return ofp_mkerr(OFPET_BAD_ACTION, OFPBAC_BAD_VENDOR); + } + break; + + default: + VLOG_WARN_RL(&bad_ofmsg_rl, "unknown action type %"PRIu16, a->type); + return ofp_mkerr(OFPET_BAD_ACTION, OFPBAC_BAD_TYPE); + } + + if (!len) { + VLOG_DBG_RL(&bad_ofmsg_rl, "action has invalid length 0"); + return ofp_mkerr(OFPET_BAD_ACTION, OFPBAC_BAD_LEN); + } + if (len % ACTION_ALIGNMENT) { + VLOG_DBG_RL(&bad_ofmsg_rl, "action length %u is not a multiple of %d", + len, ACTION_ALIGNMENT); + return ofp_mkerr(OFPET_BAD_ACTION, OFPBAC_BAD_LEN); + } + return 0; +} + +int +validate_actions(const union ofp_action *actions, size_t n_actions, + int max_ports) +{ + const union ofp_action *a; + + for (a = actions; a < &actions[n_actions]; ) { + unsigned int len = ntohs(a->header.len); + unsigned int n_slots = len / ACTION_ALIGNMENT; + unsigned int slots_left = &actions[n_actions] - a; + int error; + + if (n_slots > slots_left) { + VLOG_DBG_RL(&bad_ofmsg_rl, + "action requires %u slots but only %td remain", + n_slots, slots_left); + return ofp_mkerr(OFPET_BAD_ACTION, OFPBAC_BAD_LEN); + } + error = check_action(a, len, max_ports); + if (error) { + return error; + } + a += n_slots; + } + return 0; +} + +/* The set of actions must either come from a trusted source or have been + * previously validated with validate_actions(). */ +const union ofp_action * +actions_first(struct actions_iterator *iter, + const union ofp_action *oa, size_t n_actions) +{ + iter->pos = oa; + iter->end = oa + n_actions; + return actions_next(iter); +} + +const union ofp_action * +actions_next(struct actions_iterator *iter) +{ + if (iter->pos < iter->end) { + const union ofp_action *a = iter->pos; + unsigned int len = ntohs(a->header.len); + iter->pos += len / ACTION_ALIGNMENT; + return a; + } else { + return NULL; + } +} + +void +normalize_match(struct ofp_match *m) +{ + enum { OFPFW_NW = OFPFW_NW_SRC_MASK | OFPFW_NW_DST_MASK | OFPFW_NW_PROTO }; + enum { OFPFW_TP = OFPFW_TP_SRC | OFPFW_TP_DST }; + uint32_t wc; + + wc = ntohl(m->wildcards) & OFPFW_ALL; + if (wc & OFPFW_DL_TYPE) { + m->dl_type = 0; + + /* Can't sensibly m on network or transport headers if the + * data link type is unknown. */ + wc |= OFPFW_NW | OFPFW_TP; + m->nw_src = m->nw_dst = m->nw_proto = 0; + m->tp_src = m->tp_dst = 0; + } else if (m->dl_type == htons(ETH_TYPE_IP)) { + if (wc & OFPFW_NW_PROTO) { + m->nw_proto = 0; + + /* Can't sensibly m on transport headers if the network + * protocol is unknown. */ + wc |= OFPFW_TP; + m->tp_src = m->tp_dst = 0; + } else if (m->nw_proto == IPPROTO_TCP || + m->nw_proto == IPPROTO_UDP || + m->nw_proto == IPPROTO_ICMP) { + if (wc & OFPFW_TP_SRC) { + m->tp_src = 0; + } + if (wc & OFPFW_TP_DST) { + m->tp_dst = 0; + } + } else { + /* Transport layer fields will always be extracted as zeros, so we + * can do an exact-m on those values. */ + wc &= ~OFPFW_TP; + m->tp_src = m->tp_dst = 0; + } + if (wc & OFPFW_NW_SRC_MASK) { + m->nw_src &= flow_nw_bits_to_mask(wc, OFPFW_NW_SRC_SHIFT); + } + if (wc & OFPFW_NW_DST_MASK) { + m->nw_dst &= flow_nw_bits_to_mask(wc, OFPFW_NW_DST_SHIFT); + } + } else { + /* Network and transport layer fields will always be extracted as + * zeros, so we can do an exact-m on those values. */ + wc &= ~(OFPFW_NW | OFPFW_TP); + m->nw_proto = m->nw_src = m->nw_dst = 0; + m->tp_src = m->tp_dst = 0; + } + if (wc & OFPFW_DL_SRC) { + memset(m->dl_src, 0, sizeof m->dl_src); + } + if (wc & OFPFW_DL_DST) { + memset(m->dl_dst, 0, sizeof m->dl_dst); + } + m->wildcards = htonl(wc); +} + +void +vconn_init(struct vconn *vconn, struct vconn_class *class, int connect_status, + uint32_t ip, const char *name, bool reconnectable) +{ + vconn->class = class; + vconn->state = (connect_status == EAGAIN ? VCS_CONNECTING + : !connect_status ? VCS_SEND_HELLO + : VCS_DISCONNECTED); + vconn->error = connect_status; + vconn->version = -1; + vconn->min_version = -1; + vconn->ip = ip; + vconn->name = xstrdup(name); + vconn->reconnectable = reconnectable; +} + +void +pvconn_init(struct pvconn *pvconn, struct pvconn_class *class, + const char *name) +{ + pvconn->class = class; + pvconn->name = xstrdup(name); +} diff --git a/lib/vconn.h b/lib/vconn.h new file mode 100644 index 00000000..e17f9e24 --- /dev/null +++ b/lib/vconn.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef VCONN_H +#define VCONN_H 1 + +#include +#include +#include +#include + +#include "flow.h" + +struct ofpbuf; +struct ofp_action_header; +struct ofp_header; +struct ofp_match; +struct ofp_stats_reply; +struct pvconn; +struct vconn; + +void vconn_usage(bool active, bool passive, bool bootstrap); + +/* Active vconns: virtual connections to OpenFlow devices. */ +int vconn_open(const char *name, int min_version, struct vconn **); +void vconn_close(struct vconn *); +const char *vconn_get_name(const struct vconn *); +uint32_t vconn_get_ip(const struct vconn *); +int vconn_connect(struct vconn *); +int vconn_recv(struct vconn *, struct ofpbuf **); +int vconn_send(struct vconn *, struct ofpbuf *); +int vconn_recv_xid(struct vconn *, uint32_t xid, struct ofpbuf **); +int vconn_transact(struct vconn *, struct ofpbuf *, struct ofpbuf **); + +int vconn_open_block(const char *name, int min_version, struct vconn **); +int vconn_send_block(struct vconn *, struct ofpbuf *); +int vconn_recv_block(struct vconn *, struct ofpbuf **); + +enum vconn_wait_type { + WAIT_CONNECT, + WAIT_RECV, + WAIT_SEND +}; +void vconn_wait(struct vconn *, enum vconn_wait_type); +void vconn_connect_wait(struct vconn *); +void vconn_recv_wait(struct vconn *); +void vconn_send_wait(struct vconn *); + +/* Passive vconns: virtual listeners for incoming OpenFlow connections. */ +int pvconn_open(const char *name, struct pvconn **); +const char *pvconn_get_name(const struct pvconn *); +void pvconn_close(struct pvconn *); +int pvconn_accept(struct pvconn *, int min_version, struct vconn **); +void pvconn_wait(struct pvconn *); + +/* OpenFlow protocol utility functions. */ +void *make_openflow(size_t openflow_len, uint8_t type, struct ofpbuf **); +void *make_openflow_xid(size_t openflow_len, uint8_t type, + uint32_t xid, struct ofpbuf **); +void *put_openflow(size_t openflow_len, uint8_t type, struct ofpbuf *); +void *put_openflow_xid(size_t openflow_len, uint8_t type, uint32_t xid, + struct ofpbuf *); +void update_openflow_length(struct ofpbuf *); +struct ofpbuf *make_flow_mod(uint16_t command, const flow_t *, + size_t actions_len); +struct ofpbuf *make_add_flow(const flow_t *, uint32_t buffer_id, + uint16_t max_idle, size_t actions_len); +struct ofpbuf *make_del_flow(const flow_t *); +struct ofpbuf *make_add_simple_flow(const flow_t *, + uint32_t buffer_id, uint16_t out_port, + uint16_t max_idle); +struct ofpbuf *make_packet_out(const struct ofpbuf *packet, uint32_t buffer_id, + uint16_t in_port, + const struct ofp_action_header *, + size_t n_actions); +struct ofpbuf *make_buffered_packet_out(uint32_t buffer_id, + uint16_t in_port, uint16_t out_port); +struct ofpbuf *make_unbuffered_packet_out(const struct ofpbuf *packet, + uint16_t in_port, uint16_t out_port); +struct ofpbuf *make_echo_request(void); +struct ofpbuf *make_echo_reply(const struct ofp_header *rq); +int check_ofp_message(const struct ofp_header *, uint8_t type, size_t size); +int check_ofp_message_array(const struct ofp_header *, uint8_t type, + size_t size, size_t array_elt_size, + size_t *n_array_elts); +int check_ofp_packet_out(const struct ofp_header *, struct ofpbuf *data, + int *n_actions, int max_ports); + +struct flow_stats_iterator { + const uint8_t *pos, *end; +}; +const struct ofp_flow_stats *flow_stats_first(struct flow_stats_iterator *, + const struct ofp_stats_reply *); +const struct ofp_flow_stats *flow_stats_next(struct flow_stats_iterator *); + +struct actions_iterator { + const union ofp_action *pos, *end; +}; +const union ofp_action *actions_first(struct actions_iterator *, + const union ofp_action *, + size_t n_actions); +const union ofp_action *actions_next(struct actions_iterator *); +int validate_actions(const union ofp_action *, size_t n_actions, + int max_ports); + +void normalize_match(struct ofp_match *); + +static inline int +ofp_mkerr(uint16_t type, uint16_t code) +{ + assert(type > 0 && type <= 0x7fff); + return (type << 16) | code; +} + +#endif /* vconn.h */ diff --git a/lib/vlog-modules.def b/lib/vlog-modules.def new file mode 100644 index 00000000..833c20a4 --- /dev/null +++ b/lib/vlog-modules.def @@ -0,0 +1,65 @@ +/* Modules that can emit log messages. */ +VLOG_MODULE(backtrace) +VLOG_MODULE(brcompatd) +VLOG_MODULE(bridge) +VLOG_MODULE(chain) +VLOG_MODULE(cfg) +VLOG_MODULE(cfg_mod) +VLOG_MODULE(controller) +VLOG_MODULE(coverage) +VLOG_MODULE(ctlpath) +VLOG_MODULE(daemon) +VLOG_MODULE(datapath) +VLOG_MODULE(dhcp) +VLOG_MODULE(dhcp_client) +VLOG_MODULE(discovery) +VLOG_MODULE(dpif) +VLOG_MODULE(dpctl) +VLOG_MODULE(executer) +VLOG_MODULE(ezio_term) +VLOG_MODULE(fail_open) +VLOG_MODULE(fault) +VLOG_MODULE(flow) +VLOG_MODULE(in_band) +VLOG_MODULE(leak_checker) +VLOG_MODULE(learning_switch) +VLOG_MODULE(mac_learning) +VLOG_MODULE(mgmt) +VLOG_MODULE(netdev) +VLOG_MODULE(netflow) +VLOG_MODULE(netlink) +VLOG_MODULE(ofctl) +VLOG_MODULE(ovs_discover) +VLOG_MODULE(ofproto) +VLOG_MODULE(pktbuf) +VLOG_MODULE(pcap) +VLOG_MODULE(poll_loop) +VLOG_MODULE(port_watcher) +VLOG_MODULE(proc_net_compat) +VLOG_MODULE(process) +VLOG_MODULE(secchan) +VLOG_MODULE(rconn) +VLOG_MODULE(stp) +VLOG_MODULE(stp_secchan) +VLOG_MODULE(stats) +VLOG_MODULE(status) +VLOG_MODULE(svec) +VLOG_MODULE(switch) +VLOG_MODULE(terminal) +VLOG_MODULE(timeval) +VLOG_MODULE(tty) +VLOG_MODULE(socket_util) +VLOG_MODULE(switchui) +VLOG_MODULE(unixctl) +VLOG_MODULE(vconn_tcp) +VLOG_MODULE(vconn_ssl) +VLOG_MODULE(vconn_stream) +VLOG_MODULE(vconn_unix) +VLOG_MODULE(vconn) +VLOG_MODULE(vlog) +VLOG_MODULE(wcelim) +VLOG_MODULE(vswitchd) +VLOG_MODULE(vt) +VLOG_MODULE(xenserver) + +#undef VLOG_MODULE diff --git a/lib/vlog.c b/lib/vlog.c new file mode 100644 index 00000000..408cc74a --- /dev/null +++ b/lib/vlog.c @@ -0,0 +1,711 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "vlog.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dirs.h" +#include "dynamic-string.h" +#include "sat-math.h" +#include "timeval.h" +#include "unixctl.h" +#include "util.h" + +#define THIS_MODULE VLM_vlog + +/* Name for each logging level. */ +static const char *level_names[VLL_N_LEVELS] = { +#define VLOG_LEVEL(NAME, SYSLOG_LEVEL) #NAME, + VLOG_LEVELS +#undef VLOG_LEVEL +}; + +/* Syslog value for each logging level. */ +static int syslog_levels[VLL_N_LEVELS] = { +#define VLOG_LEVEL(NAME, SYSLOG_LEVEL) SYSLOG_LEVEL, + VLOG_LEVELS +#undef VLOG_LEVEL +}; + +/* Name for each logging module */ +static const char *module_names[VLM_N_MODULES] = { +#define VLOG_MODULE(NAME) #NAME, +#include "vlog-modules.def" +#undef VLOG_MODULE +}; + +/* Information about each facility. */ +struct facility { + const char *name; /* Name. */ + char *pattern; /* Current pattern. */ + bool default_pattern; /* Whether current pattern is the default. */ +}; +static struct facility facilities[VLF_N_FACILITIES] = { +#define VLOG_FACILITY(NAME, PATTERN) {#NAME, PATTERN, true}, + VLOG_FACILITIES +#undef VLOG_FACILITY +}; + +/* Current log levels. */ +static int levels[VLM_N_MODULES][VLF_N_FACILITIES]; + +/* For fast checking whether we're logging anything for a given module and + * level.*/ +enum vlog_level min_vlog_levels[VLM_N_MODULES]; + +/* Time at which vlog was initialized, in milliseconds. */ +static long long int boot_time; + +/* VLF_FILE configuration. */ +static char *log_file_name; +static FILE *log_file; + +static void format_log_message(enum vlog_module, enum vlog_level, + enum vlog_facility, unsigned int msg_num, + const char *message, va_list, struct ds *) + PRINTF_FORMAT(5, 0); + +/* Searches the 'n_names' in 'names'. Returns the index of a match for + * 'target', or 'n_names' if no name matches. */ +static size_t +search_name_array(const char *target, const char **names, size_t n_names) +{ + size_t i; + + for (i = 0; i < n_names; i++) { + assert(names[i]); + if (!strcasecmp(names[i], target)) { + break; + } + } + return i; +} + +/* Returns the name for logging level 'level'. */ +const char * +vlog_get_level_name(enum vlog_level level) +{ + assert(level < VLL_N_LEVELS); + return level_names[level]; +} + +/* Returns the logging level with the given 'name', or VLL_N_LEVELS if 'name' + * is not the name of a logging level. */ +enum vlog_level +vlog_get_level_val(const char *name) +{ + return search_name_array(name, level_names, ARRAY_SIZE(level_names)); +} + +/* Returns the name for logging facility 'facility'. */ +const char * +vlog_get_facility_name(enum vlog_facility facility) +{ + assert(facility < VLF_N_FACILITIES); + return facilities[facility].name; +} + +/* Returns the logging facility named 'name', or VLF_N_FACILITIES if 'name' is + * not the name of a logging facility. */ +enum vlog_facility +vlog_get_facility_val(const char *name) +{ + size_t i; + + for (i = 0; i < VLF_N_FACILITIES; i++) { + if (!strcasecmp(facilities[i].name, name)) { + break; + } + } + return i; +} + +/* Returns the name for logging module 'module'. */ +const char *vlog_get_module_name(enum vlog_module module) +{ + assert(module < VLM_N_MODULES); + return module_names[module]; +} + +/* Returns the logging module named 'name', or VLM_N_MODULES if 'name' is not + * the name of a logging module. */ +enum vlog_module +vlog_get_module_val(const char *name) +{ + return search_name_array(name, module_names, ARRAY_SIZE(module_names)); +} + +/* Returns the current logging level for the given 'module' and 'facility'. */ +enum vlog_level +vlog_get_level(enum vlog_module module, enum vlog_facility facility) +{ + assert(module < VLM_N_MODULES); + assert(facility < VLF_N_FACILITIES); + return levels[module][facility]; +} + +static void +update_min_level(enum vlog_module module) +{ + enum vlog_level min_level = VLL_EMER; + enum vlog_facility facility; + + for (facility = 0; facility < VLF_N_FACILITIES; facility++) { + if (log_file || facility != VLF_FILE) { + min_level = MAX(min_level, levels[module][facility]); + } + } + min_vlog_levels[module] = min_level; +} + +static void +set_facility_level(enum vlog_facility facility, enum vlog_module module, + enum vlog_level level) +{ + assert(facility >= 0 && facility < VLF_N_FACILITIES); + assert(level < VLL_N_LEVELS); + + if (module == VLM_ANY_MODULE) { + for (module = 0; module < VLM_N_MODULES; module++) { + levels[module][facility] = level; + update_min_level(module); + } + } else { + levels[module][facility] = level; + update_min_level(module); + } +} + +/* Sets the logging level for the given 'module' and 'facility' to 'level'. */ +void +vlog_set_levels(enum vlog_module module, enum vlog_facility facility, + enum vlog_level level) +{ + assert(facility < VLF_N_FACILITIES || facility == VLF_ANY_FACILITY); + if (facility == VLF_ANY_FACILITY) { + for (facility = 0; facility < VLF_N_FACILITIES; facility++) { + set_facility_level(facility, module, level); + } + } else { + set_facility_level(facility, module, level); + } +} + +static void +do_set_pattern(enum vlog_facility facility, const char *pattern) +{ + struct facility *f = &facilities[facility]; + if (!f->default_pattern) { + free(f->pattern); + } else { + f->default_pattern = false; + } + f->pattern = xstrdup(pattern); +} + +/* Sets the pattern for the given 'facility' to 'pattern'. */ +void +vlog_set_pattern(enum vlog_facility facility, const char *pattern) +{ + assert(facility < VLF_N_FACILITIES || facility == VLF_ANY_FACILITY); + if (facility == VLF_ANY_FACILITY) { + for (facility = 0; facility < VLF_N_FACILITIES; facility++) { + do_set_pattern(facility, pattern); + } + } else { + do_set_pattern(facility, pattern); + } +} + +/* Returns the name of the log file used by VLF_FILE, or a null pointer if no + * log file has been set. (A non-null return value does not assert that the + * named log file is in use: if vlog_set_log_file() or vlog_reopen_log_file() + * fails, it still sets the log file name.) */ +const char * +vlog_get_log_file(void) +{ + return log_file_name; +} + +/* Sets the name of the log file used by VLF_FILE to 'file_name', or to the + * default file name if 'file_name' is null. Returns 0 if successful, + * otherwise a positive errno value. */ +int +vlog_set_log_file(const char *file_name) +{ + char *old_log_file_name; + enum vlog_module module; + int error; + + /* Close old log file. */ + if (log_file) { + VLOG_INFO("closing log file"); + fclose(log_file); + log_file = NULL; + } + + /* Update log file name and free old name. The ordering is important + * because 'file_name' might be 'log_file_name' or some suffix of it. */ + old_log_file_name = log_file_name; + log_file_name = (file_name + ? xstrdup(file_name) + : xasprintf("%s/%s.log", ovs_logdir, program_name)); + free(old_log_file_name); + file_name = NULL; /* Might have been freed. */ + + /* Open new log file and update min_levels[] to reflect whether we actually + * have a log_file. */ + log_file = fopen(log_file_name, "a"); + for (module = 0; module < VLM_N_MODULES; module++) { + update_min_level(module); + } + + /* Log success or failure. */ + if (!log_file) { + VLOG_WARN("failed to open %s for logging: %s", + log_file_name, strerror(errno)); + error = errno; + } else { + VLOG_INFO("opened log file %s", log_file_name); + error = 0; + } + + return error; +} + +/* Closes and then attempts to re-open the current log file. (This is useful + * just after log rotation, to ensure that the new log file starts being used.) + * Returns 0 if successful, otherwise a positive errno value. */ +int +vlog_reopen_log_file(void) +{ + return log_file_name ? vlog_set_log_file(log_file_name) : 0; +} + +/* Set debugging levels: + * + * mod[:facility[:level]] mod2[:facility[:level]] ... + * + * Return null if successful, otherwise an error message that the caller must + * free(). + */ +char * +vlog_set_levels_from_string(const char *s_) +{ + char *save_ptr; + char *s = xstrdup(s_); + char *module, *facility; + + for (module = strtok_r(s, ": \t", &save_ptr); module != NULL; + module = strtok_r(NULL, ": \t", &save_ptr)) { + enum vlog_module e_module; + enum vlog_facility e_facility; + + facility = strtok_r(NULL, ":", &save_ptr); + + if (!facility || !strcmp(facility, "ANY")) { + e_facility = VLF_ANY_FACILITY; + } else { + e_facility = vlog_get_facility_val(facility); + if (e_facility >= VLF_N_FACILITIES) { + char *msg = xasprintf("unknown facility \"%s\"", facility); + free(s); + return msg; + } + } + + if (!strcmp(module, "PATTERN")) { + vlog_set_pattern(e_facility, save_ptr); + break; + } else { + char *level; + enum vlog_level e_level; + + if (!strcmp(module, "ANY")) { + e_module = VLM_ANY_MODULE; + } else { + e_module = vlog_get_module_val(module); + if (e_module >= VLM_N_MODULES) { + char *msg = xasprintf("unknown module \"%s\"", module); + free(s); + return msg; + } + } + + level = strtok_r(NULL, ":", &save_ptr); + e_level = level ? vlog_get_level_val(level) : VLL_DBG; + if (e_level >= VLL_N_LEVELS) { + char *msg = xasprintf("unknown level \"%s\"", level); + free(s); + return msg; + } + + vlog_set_levels(e_module, e_facility, e_level); + } + } + free(s); + return NULL; +} + +/* If 'arg' is null, configure maximum verbosity. Otherwise, sets + * configuration according to 'arg' (see vlog_set_levels_from_string()). */ +void +vlog_set_verbosity(const char *arg) +{ + if (arg) { + char *msg = vlog_set_levels_from_string(arg); + if (msg) { + ovs_fatal(0, "processing \"%s\": %s", arg, msg); + } + } else { + vlog_set_levels(VLM_ANY_MODULE, VLF_ANY_FACILITY, VLL_DBG); + } +} + +static void +vlog_unixctl_set(struct unixctl_conn *conn, const char *args) +{ + char *msg = vlog_set_levels_from_string(args); + unixctl_command_reply(conn, msg ? 501 : 202, msg); + free(msg); +} + +static void +vlog_unixctl_list(struct unixctl_conn *conn, const char *args UNUSED) +{ + char *msg = vlog_get_levels(); + unixctl_command_reply(conn, 200, msg); + free(msg); +} + +static void +vlog_unixctl_reopen(struct unixctl_conn *conn, const char *args UNUSED) +{ + if (log_file_name) { + int error = vlog_reopen_log_file(); + if (error) { + unixctl_command_reply(conn, 503, strerror(errno)); + } else { + unixctl_command_reply(conn, 202, NULL); + } + } else { + unixctl_command_reply(conn, 403, "Logging to file not configured"); + } +} + +/* Initializes the logging subsystem. */ +void +vlog_init(void) +{ + time_t now; + + openlog(program_name, LOG_NDELAY, LOG_DAEMON); + vlog_set_levels(VLM_ANY_MODULE, VLF_ANY_FACILITY, VLL_INFO); + + boot_time = time_msec(); + now = time_now(); + if (now < 0) { + struct tm tm; + char s[128]; + + localtime_r(&now, &tm); + strftime(s, sizeof s, "%a, %d %b %Y %H:%M:%S %z", &tm); + VLOG_ERR("current time is negative: %s (%ld)", s, (long int) now); + } + + unixctl_command_register("vlog/set", vlog_unixctl_set); + unixctl_command_register("vlog/list", vlog_unixctl_list); + unixctl_command_register("vlog/reopen", vlog_unixctl_reopen); +} + +/* Closes the logging subsystem. */ +void +vlog_exit(void) +{ + closelog(); +} + +/* Print the current logging level for each module. */ +char * +vlog_get_levels(void) +{ + struct ds s = DS_EMPTY_INITIALIZER; + enum vlog_module module; + + ds_put_format(&s, " console syslog file\n"); + ds_put_format(&s, " ------- ------ ------\n"); + + for (module = 0; module < VLM_N_MODULES; module++) { + ds_put_format(&s, "%-16s %4s %4s %4s\n", + vlog_get_module_name(module), + vlog_get_level_name(vlog_get_level(module, VLF_CONSOLE)), + vlog_get_level_name(vlog_get_level(module, VLF_SYSLOG)), + vlog_get_level_name(vlog_get_level(module, VLF_FILE))); + } + + return ds_cstr(&s); +} + +/* Returns true if a log message emitted for the given 'module' and 'level' + * would cause some log output, false if that module and level are completely + * disabled. */ +bool +vlog_is_enabled(enum vlog_module module, enum vlog_level level) +{ + return min_vlog_levels[module] >= level; +} + +static const char * +fetch_braces(const char *p, const char *def, char *out, size_t out_size) +{ + if (*p == '{') { + size_t n = strcspn(p + 1, "}"); + size_t n_copy = MIN(n, out_size - 1); + memcpy(out, p + 1, n_copy); + out[n_copy] = '\0'; + p += n + 2; + } else { + ovs_strlcpy(out, def, out_size); + } + return p; +} + +static void +format_log_message(enum vlog_module module, enum vlog_level level, + enum vlog_facility facility, unsigned int msg_num, + const char *message, va_list args_, struct ds *s) +{ + char tmp[128]; + va_list args; + const char *p; + + ds_clear(s); + for (p = facilities[facility].pattern; *p != '\0'; ) { + enum { LEFT, RIGHT } justify = RIGHT; + int pad = '0'; + size_t length, field, used; + + if (*p != '%') { + ds_put_char(s, *p++); + continue; + } + + p++; + if (*p == '-') { + justify = LEFT; + p++; + } + if (*p == '0') { + pad = '0'; + p++; + } + field = 0; + while (isdigit(*p)) { + field = (field * 10) + (*p - '0'); + p++; + } + + length = s->length; + switch (*p++) { + case 'A': + ds_put_cstr(s, program_name); + break; + case 'c': + p = fetch_braces(p, "", tmp, sizeof tmp); + ds_put_cstr(s, vlog_get_module_name(module)); + break; + case 'd': + p = fetch_braces(p, "%Y-%m-%d %H:%M:%S", tmp, sizeof tmp); + ds_put_strftime(s, tmp, NULL); + break; + case 'm': + /* Format user-supplied log message and trim trailing new-lines. */ + length = s->length; + va_copy(args, args_); + ds_put_format_valist(s, message, args); + va_end(args); + while (s->length > length && s->string[s->length - 1] == '\n') { + s->length--; + } + break; + case 'N': + ds_put_format(s, "%u", msg_num); + break; + case 'n': + ds_put_char(s, '\n'); + break; + case 'p': + ds_put_cstr(s, vlog_get_level_name(level)); + break; + case 'P': + ds_put_format(s, "%ld", (long int) getpid()); + break; + case 'r': + ds_put_format(s, "%lld", time_msec() - boot_time); + break; + default: + ds_put_char(s, p[-1]); + break; + } + used = s->length - length; + if (used < field) { + size_t n_pad = field - used; + if (justify == RIGHT) { + ds_put_uninit(s, n_pad); + memmove(&s->string[length + n_pad], &s->string[length], used); + memset(&s->string[length], pad, n_pad); + } else { + ds_put_char_multiple(s, pad, n_pad); + } + } + } +} + +/* Writes 'message' to the log at the given 'level' and as coming from the + * given 'module'. + * + * Guaranteed to preserve errno. */ +void +vlog_valist(enum vlog_module module, enum vlog_level level, + const char *message, va_list args) +{ + bool log_to_console = levels[module][VLF_CONSOLE] >= level; + bool log_to_syslog = levels[module][VLF_SYSLOG] >= level; + bool log_to_file = levels[module][VLF_FILE] >= level && log_file; + if (log_to_console || log_to_syslog || log_to_file) { + int save_errno = errno; + static unsigned int msg_num; + struct ds s; + + ds_init(&s); + ds_reserve(&s, 1024); + msg_num++; + + if (log_to_console) { + format_log_message(module, level, VLF_CONSOLE, msg_num, + message, args, &s); + ds_put_char(&s, '\n'); + fputs(ds_cstr(&s), stderr); + } + + if (log_to_syslog) { + int syslog_level = syslog_levels[level]; + char *save_ptr = NULL; + char *line; + + format_log_message(module, level, VLF_SYSLOG, msg_num, + message, args, &s); + for (line = strtok_r(s.string, "\n", &save_ptr); line; + line = strtok_r(NULL, "\n", &save_ptr)) { + syslog(syslog_level, "%s", line); + } + } + + if (log_to_file) { + format_log_message(module, level, VLF_FILE, msg_num, + message, args, &s); + ds_put_char(&s, '\n'); + fputs(ds_cstr(&s), log_file); + fflush(log_file); + } + + ds_destroy(&s); + errno = save_errno; + } +} + +void +vlog(enum vlog_module module, enum vlog_level level, const char *message, ...) +{ + va_list args; + + va_start(args, message); + vlog_valist(module, level, message, args); + va_end(args); +} + +bool +vlog_should_drop(enum vlog_module module, enum vlog_level level, + struct vlog_rate_limit *rl) +{ + if (!vlog_is_enabled(module, level)) { + return true; + } + + if (rl->tokens < VLOG_MSG_TOKENS) { + time_t now = time_now(); + if (rl->last_fill > now) { + /* Last filled in the future? Time must have gone backward, or + * 'rl' has not been used before. */ + rl->tokens = rl->burst; + } else if (rl->last_fill < now) { + unsigned int add = sat_mul(rl->rate, now - rl->last_fill); + unsigned int tokens = sat_add(rl->tokens, add); + rl->tokens = MIN(tokens, rl->burst); + rl->last_fill = now; + } + if (rl->tokens < VLOG_MSG_TOKENS) { + if (!rl->n_dropped) { + rl->first_dropped = now; + } + rl->n_dropped++; + return true; + } + } + rl->tokens -= VLOG_MSG_TOKENS; + + if (rl->n_dropped) { + vlog(module, level, + "Dropped %u log messages in last %u seconds " + "due to excessive rate", + rl->n_dropped, (unsigned int) (time_now() - rl->first_dropped)); + rl->n_dropped = 0; + } + return false; +} + +void +vlog_rate_limit(enum vlog_module module, enum vlog_level level, + struct vlog_rate_limit *rl, const char *message, ...) +{ + if (!vlog_should_drop(module, level, rl)) { + va_list args; + + va_start(args, message); + vlog_valist(module, level, message, args); + va_end(args); + } +} + +void +vlog_usage(void) +{ + printf("\nLogging options:\n" + " -v, --verbose=MODULE[:FACILITY[:LEVEL]] set logging levels\n" + " -v, --verbose set maximum verbosity level\n" + " --log-file[=FILE] enable logging to specified FILE\n" + " (default: %s/%s.log)\n", + ovs_logdir, program_name); +} diff --git a/lib/vlog.h b/lib/vlog.h new file mode 100644 index 00000000..87036151 --- /dev/null +++ b/lib/vlog.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef VLOG_H +#define VLOG_H 1 + +#include +#include +#include +#include +#include "util.h" + +/* Logging importance levels. */ +#define VLOG_LEVELS \ + VLOG_LEVEL(EMER, LOG_ALERT) \ + VLOG_LEVEL(ERR, LOG_ERR) \ + VLOG_LEVEL(WARN, LOG_WARNING) \ + VLOG_LEVEL(INFO, LOG_NOTICE) \ + VLOG_LEVEL(DBG, LOG_DEBUG) +enum vlog_level { +#define VLOG_LEVEL(NAME, SYSLOG_LEVEL) VLL_##NAME, + VLOG_LEVELS +#undef VLOG_LEVEL + VLL_N_LEVELS +}; + +const char *vlog_get_level_name(enum vlog_level); +enum vlog_level vlog_get_level_val(const char *name); + +/* Facilities that we can log to. */ +#define VLOG_FACILITIES \ + VLOG_FACILITY(SYSLOG, "%05N|%c|%p|%m") \ + VLOG_FACILITY(CONSOLE, "%d{%b %d %H:%M:%S}|%05N|%c|%p|%m") \ + VLOG_FACILITY(FILE, "%d{%b %d %H:%M:%S}|%05N|%c|%p|%m") +enum vlog_facility { +#define VLOG_FACILITY(NAME, PATTERN) VLF_##NAME, + VLOG_FACILITIES +#undef VLOG_FACILITY + VLF_N_FACILITIES, + VLF_ANY_FACILITY = -1 +}; + +const char *vlog_get_facility_name(enum vlog_facility); +enum vlog_facility vlog_get_facility_val(const char *name); + +/* VLM_ constant for each vlog module. */ +enum vlog_module { +#define VLOG_MODULE(NAME) VLM_##NAME, +#include "vlog-modules.def" + VLM_N_MODULES, + VLM_ANY_MODULE = -1 +}; + +const char *vlog_get_module_name(enum vlog_module); +enum vlog_module vlog_get_module_val(const char *name); + +/* Rate-limiter for log messages. */ +struct vlog_rate_limit { + /* Configuration settings. */ + unsigned int rate; /* Tokens per second. */ + unsigned int burst; /* Max cumulative tokens credit. */ + + /* Current status. */ + unsigned int tokens; /* Current number of tokens. */ + time_t last_fill; /* Last time tokens added. */ + time_t first_dropped; /* Time first message was dropped. */ + unsigned int n_dropped; /* Number of messages dropped. */ +}; + +/* Number of tokens to emit a message. We add 'rate' tokens per second, which + * is 60 times the unit used for 'rate', thus 60 tokens are required to emit + * one message. */ +#define VLOG_MSG_TOKENS 60 + +/* Initializer for a struct vlog_rate_limit, to set up a maximum rate of RATE + * messages per minute and a maximum burst size of BURST messages. */ +#define VLOG_RATE_LIMIT_INIT(RATE, BURST) \ + { \ + RATE, /* rate */ \ + (MIN(BURST, UINT_MAX / VLOG_MSG_TOKENS) \ + * VLOG_MSG_TOKENS), /* burst */ \ + 0, /* tokens */ \ + 0, /* last_fill */ \ + 0, /* first_dropped */ \ + 0, /* n_dropped */ \ + } + +/* Configuring how each module logs messages. */ +enum vlog_level vlog_get_level(enum vlog_module, enum vlog_facility); +void vlog_set_levels(enum vlog_module, enum vlog_facility, enum vlog_level); +char *vlog_set_levels_from_string(const char *); +char *vlog_get_levels(void); +bool vlog_is_enabled(enum vlog_module, enum vlog_level); +bool vlog_should_drop(enum vlog_module, enum vlog_level, + struct vlog_rate_limit *); +void vlog_set_verbosity(const char *arg); + +/* Configuring log facilities. */ +void vlog_set_pattern(enum vlog_facility, const char *pattern); +const char *vlog_get_log_file(void); +int vlog_set_log_file(const char *file_name); +int vlog_reopen_log_file(void); + +/* Function for actual logging. */ +void vlog_init(void); +void vlog_exit(void); +void vlog(enum vlog_module, enum vlog_level, const char *format, ...) + __attribute__((format(printf, 3, 4))); +void vlog_valist(enum vlog_module, enum vlog_level, const char *, va_list) + __attribute__((format(printf, 3, 0))); +void vlog_rate_limit(enum vlog_module, enum vlog_level, + struct vlog_rate_limit *, const char *, ...) + __attribute__((format(printf, 4, 5))); + +/* Convenience macros. To use these, define THIS_MODULE as a macro that + * expands to the module used by the current source file, e.g. + * #include "vlog.h" + * #define THIS_MODULE VLM_netlink + * Guaranteed to preserve errno. + */ +#define VLOG_EMER(...) VLOG(VLL_EMER, __VA_ARGS__) +#define VLOG_ERR(...) VLOG(VLL_ERR, __VA_ARGS__) +#define VLOG_WARN(...) VLOG(VLL_WARN, __VA_ARGS__) +#define VLOG_INFO(...) VLOG(VLL_INFO, __VA_ARGS__) +#define VLOG_DBG(...) VLOG(VLL_DBG, __VA_ARGS__) + +/* More convenience macros, for testing whether a given level is enabled in + * THIS_MODULE. When constructing a log message is expensive, this enables it + * to be skipped. */ +#define VLOG_IS_EMER_ENABLED() true +#define VLOG_IS_ERR_ENABLED() vlog_is_enabled(THIS_MODULE, VLL_EMER) +#define VLOG_IS_WARN_ENABLED() vlog_is_enabled(THIS_MODULE, VLL_WARN) +#define VLOG_IS_INFO_ENABLED() vlog_is_enabled(THIS_MODULE, VLL_INFO) +#define VLOG_IS_DBG_ENABLED() vlog_is_enabled(THIS_MODULE, VLL_DBG) + +/* Convenience macros for rate-limiting. + * Guaranteed to preserve errno. + */ +#define VLOG_ERR_RL(RL, ...) VLOG_RL(RL, VLL_ERR, __VA_ARGS__) +#define VLOG_WARN_RL(RL, ...) VLOG_RL(RL, VLL_WARN, __VA_ARGS__) +#define VLOG_INFO_RL(RL, ...) VLOG_RL(RL, VLL_INFO, __VA_ARGS__) +#define VLOG_DBG_RL(RL, ...) VLOG_RL(RL, VLL_DBG, __VA_ARGS__) + +#define VLOG_DROP_ERR(RL) vlog_should_drop(THIS_MODULE, VLL_ERR, RL) +#define VLOG_DROP_WARN(RL) vlog_should_drop(THIS_MODULE, VLL_WARN, RL) +#define VLOG_DROP_INFO(RL) vlog_should_drop(THIS_MODULE, VLL_INFO, RL) +#define VLOG_DROP_DBG(RL) vlog_should_drop(THIS_MODULE, VLL_DBG, RL) + +/* Command line processing. */ +#define VLOG_OPTION_ENUMS OPT_LOG_FILE +#define VLOG_LONG_OPTIONS \ + {"verbose", optional_argument, 0, 'v'}, \ + {"log-file", optional_argument, 0, OPT_LOG_FILE} +#define VLOG_OPTION_HANDLERS \ + case 'v': \ + vlog_set_verbosity(optarg); \ + break; \ + case OPT_LOG_FILE: \ + vlog_set_log_file(optarg); \ + break; +void vlog_usage(void); + +/* Implementation details. */ +#define VLOG(LEVEL, ...) \ + do { \ + if (min_vlog_levels[THIS_MODULE] >= LEVEL) { \ + vlog(THIS_MODULE, LEVEL, __VA_ARGS__); \ + } \ + } while (0) +#define VLOG_RL(RL, LEVEL, ...) \ + do { \ + if (min_vlog_levels[THIS_MODULE] >= LEVEL) { \ + vlog_rate_limit(THIS_MODULE, LEVEL, RL, __VA_ARGS__); \ + } \ + } while (0) +extern enum vlog_level min_vlog_levels[VLM_N_MODULES]; + + +#endif /* vlog.h */ diff --git a/lib/vlog.man b/lib/vlog.man new file mode 100644 index 00000000..0bd8a26e --- /dev/null +++ b/lib/vlog.man @@ -0,0 +1,44 @@ +.TP +\fB-v\fImodule\fR[\fB:\fIfacility\fR[\fB:\fIlevel\fR]], \fB--verbose=\fImodule\fR[\fB:\fIfacility\fR[\fB:\fIlevel\fR]] + +Sets the logging level for \fImodule\fR in \fIfacility\fR to +\fIlevel\fR: + +.RS +.IP \(bu +\fImodule\fR may be any valid module name (as displayed by the +\fB--list\fR action on \fBovs-appctl\fR(8)), or the special name +\fBANY\fR to set the logging levels for all modules. + +.IP \(bu +\fIfacility\fR may be \fBsyslog\fR, \fBconsole\fR, or \fBfile\fR to +set the levels for logging to the system log, the console, or a file +respectively, or \fBANY\fR to set the logging levels for both +facilities. If it is omitted, \fIfacility\fR defaults to \fBANY\fR. + +Regardless of the log levels set for \fBfile\fR, logging to a file +will not take place unless \fB--log-file\fR is also specified (see +below). + +.IP \(bu +\fIlevel\fR must be one of \fBemer\fR, \fBerr\fR, \fBwarn\fR, +\fBinfo\fR, or +\fBdbg\fR, designating the minimum severity of a message for it to be +logged. If it is omitted, \fIlevel\fR defaults to \fBdbg\fR. +.RE + +.TP +\fB-v\fR, \fB--verbose\fR +Sets the maximum logging verbosity level, equivalent to +\fB--verbose=ANY:ANY:dbg\fR. + +.TP +\fB-vPATTERN:\fIfacility\fB:\fIpattern\fR, \fB--verbose=PATTERN:\fIfacility\fB:\fIpattern\fR +Sets the log pattern for \fIfacility\fR to \fIpattern\fR. Refer to +\fBovs-appctl\fR(8) for a description of the valid syntax for \fIpattern\fR. + +.TP +\fB--log-file\fR[\fB=\fIfile\fR] +Enables logging to a file. If \fIfile\fR is specified, then it is +used as the exact name for the log file. The default log file name +used if \fIfile\fR is omitted is \fB@LOGDIR@/\*(PN.log\fR. diff --git a/lib/xtoxll.h b/lib/xtoxll.h new file mode 100644 index 00000000..853a8063 --- /dev/null +++ b/lib/xtoxll.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2008 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#ifndef XTOXLL_H +#define XTOXLL_H 1 + +#include +#include + +static inline uint64_t +htonll(uint64_t n) +{ + return htonl(1) == 1 ? n : ((uint64_t) htonl(n) << 32) | htonl(n >> 32); +} + +static inline uint64_t +ntohll(uint64_t n) +{ + return htonl(1) == 1 ? n : ((uint64_t) ntohl(n) << 32) | ntohl(n >> 32); +} + +#endif /* xtoxll.h */ diff --git a/m4/nx-build.m4 b/m4/nx-build.m4 new file mode 100644 index 00000000..ed2a5897 --- /dev/null +++ b/m4/nx-build.m4 @@ -0,0 +1,53 @@ +# -*- autoconf -*- + +# Copyright (c) 2008 Nicira Networks. +# +# Permission to use, copy, modify, and/or distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +dnl NX_BUILDNR +dnl +dnl If --with-build-number=NUMBER is used, substitutes a Makefile +dnl variable BUILDNR with NUMBER, and sets a C preprocessor variable +dnl BUILDNR to "+buildNUMBER". +dnl +dnl Otherwise, if --with-build-number is not used, substitutes BUILDNR +dnl with 0 and sets C preprocessor variable BUILDNR to "". +AC_DEFUN([NX_BUILDNR], + [AC_ARG_WITH( + [build-number], + [AS_HELP_STRING([--with-build-number=NUMBER], + [Official build number (default is none)])]) + AC_MSG_CHECKING([build number]) + case $with_build_number in # ( + [[0-9]] | \ + [[0-9]][[0-9]] | \ + [[0-9]][[0-9]][[0-9]] | \ + [[0-9]][[0-9]][[0-9]][[0-9]] | \ + [[0-9]][[0-9]][[0-9]][[0-9]][[0-9]]) + BUILDNR=$with_build_number + buildnr='"+build'$BUILDNR'"' + AC_MSG_RESULT([$with_build_number]) + ;; # ( + ''|no) + BUILDNR=0 + buildnr='""' + AC_MSG_RESULT([none]) + ;; # ( + *) + AC_MSG_ERROR([invalid build number $with_build_number]) + ;; + esac + AC_SUBST([BUILDNR]) + AC_DEFINE_UNQUOTED([BUILDNR], [$buildnr], + [Official build number as a VERSION suffix string, e.g. "+build123", + or "" if this is not an official build.])]) diff --git a/m4/openvswitch.m4 b/m4/openvswitch.m4 new file mode 100644 index 00000000..7b23516a --- /dev/null +++ b/m4/openvswitch.m4 @@ -0,0 +1,210 @@ +# -*- autoconf -*- + +# Copyright (c) 2008, 2009 Nicira Networks. +# +# Permission to use, copy, modify, and/or distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +dnl Checks for --enable-ndebug and defines NDEBUG if it is specified. +AC_DEFUN([OVS_CHECK_NDEBUG], + [AC_ARG_ENABLE( + [ndebug], + [AC_HELP_STRING([--enable-ndebug], + [Disable debugging features for max performance])], + [case "${enableval}" in + (yes) ndebug=true ;; + (no) ndebug=false ;; + (*) AC_MSG_ERROR([bad value ${enableval} for --enable-ndebug]) ;; + esac], + [ndebug=false]) + AM_CONDITIONAL([NDEBUG], [test x$ndebug = xtrue])]) + +dnl Checks for Netlink support. +AC_DEFUN([OVS_CHECK_NETLINK], + [AC_CHECK_HEADER([linux/netlink.h], + [HAVE_NETLINK=yes], + [HAVE_NETLINK=no], + [#include + #include + ]) + AM_CONDITIONAL([HAVE_NETLINK], [test "$HAVE_NETLINK" = yes]) + if test "$HAVE_NETLINK" = yes; then + AC_DEFINE([HAVE_NETLINK], [1], + [Define to 1 if Netlink protocol is available.]) + fi]) + +dnl Checks for OpenSSL, if --enable-ssl is passed in. +AC_DEFUN([OVS_CHECK_OPENSSL], + [AC_ARG_ENABLE( + [ssl], + [AC_HELP_STRING([--enable-ssl], + [Enable ssl support (requires libssl)])], + [case "${enableval}" in + (yes) ssl=true ;; + (no) ssl=false ;; + (*) AC_MSG_ERROR([bad value ${enableval} for --enable-ssl]) ;; + esac], + [ssl=false]) + + if test "$ssl" = true; then + dnl Make sure that pkg-config is installed. + m4_pattern_forbid([PKG_CHECK_MODULES]) + PKG_CHECK_MODULES([SSL], [libssl], + [HAVE_OPENSSL=yes], + [HAVE_OPENSSL=no + AC_MSG_WARN([Cannot find libssl: + + $SSL_PKG_ERRORS + + OpenFlow connections over SSL will not be supported.])]) + + fi + AM_CONDITIONAL([HAVE_OPENSSL], [test "$HAVE_OPENSSL" = yes]) + if test "$HAVE_OPENSSL" = yes; then + AC_DEFINE([HAVE_OPENSSL], [1], [Define to 1 if OpenSSL is installed.]) + fi]) + +dnl Checks for libraries needed by lib/fault.c. +AC_DEFUN([OVS_CHECK_FAULT_LIBS], + [AC_CHECK_LIB([dl], [dladdr], [FAULT_LIBS=-ldl]) + AC_SUBST([FAULT_LIBS])]) + +dnl Checks for libraries needed by lib/socket-util.c. +AC_DEFUN([OVS_CHECK_SOCKET_LIBS], + [AC_CHECK_LIB([socket], [connect]) + AC_SEARCH_LIBS([gethostbyname], [resolv], [RESOLVER_LIBS=-lresolv])]) + +dnl Checks for the directory in which to store the PKI. +AC_DEFUN([OVS_CHECK_PKIDIR], + [AC_ARG_WITH( + [pkidir], + AC_HELP_STRING([--with-pkidir=DIR], + [PKI hierarchy directory [[DATADIR/openvswitch/pki]]]), + [PKIDIR=$withval], + [PKIDIR='${pkgdatadir}/pki']) + AC_SUBST([PKIDIR])]) + +dnl Checks for the directory in which to store pidfiles. +AC_DEFUN([OVS_CHECK_RUNDIR], + [AC_ARG_WITH( + [rundir], + AC_HELP_STRING([--with-rundir=DIR], + [directory used for pidfiles [[LOCALSTATEDIR/run]]]), + [RUNDIR=$withval], + [RUNDIR='${localstatedir}/run']) + AC_SUBST([RUNDIR])]) + +dnl Checks for the directory in which to store logs. +AC_DEFUN([OVS_CHECK_LOGDIR], + [AC_ARG_WITH( + [logdir], + AC_HELP_STRING([--with-logdir=DIR], + [directory used for logs [[LOCALSTATEDIR/log/PACKAGE]]]), + [LOGDIR=$withval], + [LOGDIR='${localstatedir}/log/${PACKAGE}']) + AC_SUBST([LOGDIR])]) + +dnl Checks for __malloc_hook, etc., supported by glibc. +AC_DEFUN([OVS_CHECK_MALLOC_HOOKS], + [AC_CACHE_CHECK( + [whether libc supports hooks for malloc and related functions], + [ovs_cv_malloc_hooks], + [AC_COMPILE_IFELSE( + [AC_LANG_PROGRAM( + [#include + ], + [(void) __malloc_hook; + (void) __realloc_hook; + (void) __free_hook;])], + [ovs_cv_malloc_hooks=yes], + [ovs_cv_malloc_hooks=no])]) + if test $ovs_cv_malloc_hooks = yes; then + AC_DEFINE([HAVE_MALLOC_HOOKS], [1], + [Define to 1 if you have __malloc_hook, __realloc_hook, and + __free_hook in .]) + fi]) + +dnl Checks for valgrind/valgrind.h. +AC_DEFUN([OVS_CHECK_VALGRIND], + [AC_CHECK_HEADERS([valgrind/valgrind.h])]) + +dnl Searches for a directory to put lockfiles for tty devices. +dnl Defines C preprocessor variable TTY_LOCK_DIR to a quoted string +dnl for that directory. +AC_DEFUN([OVS_CHECK_TTY_LOCK_DIR], + [AC_CACHE_CHECK([directory used for serial device lockfiles], + [ovs_cv_path_tty_locks], + [# This list of candidate directories is from minicom. + ovs_cv_path_tty_locks=none + for dir in /etc/locks /var/lock /usr/spool/locks \ + /var/spool/locks /var/spool/lock \ + /usr/spool/uucp /var/spool/uucp /var/run; do + if test -d $dir; then + ovs_cv_path_tty_locks=$dir + break + fi + done]) + if test "$ovs_cv_path_tty_locks" = none; then + AC_MSG_ERROR([cannot find a directory for tty locks]) + fi + AC_DEFINE_UNQUOTED([TTY_LOCK_DIR], "$ovs_cv_path_tty_locks", + [Directory used for serial device lockfiles])]) + +dnl The following check is adapted from GNU PSPP. +dnl It searches for the ncurses library. If it finds it, it sets +dnl HAVE_CURSES to yes and sets NCURSES_LIBS and NCURSES_CFLAGS +dnl appropriate. Otherwise, it sets HAVE_CURSES to no. +AC_DEFUN([OVS_CHECK_CURSES], + [if test "$cross_compiling" != yes; then + AC_CHECK_PROGS([NCURSES_CONFIG], [ncurses5-config ncurses8-config]) + fi + if test "$NCURSES_CONFIG" = ""; then + AC_SEARCH_LIBS([tgetent], [ncurses], + [AC_CHECK_HEADERS([term.h curses.h], + [HAVE_CURSES=yes], + [HAVE_CURSES=no])]) + else + save_cflags=$CFLAGS + CFLAGS="$CFLAGS $($NCURSES_CONFIG --cflags)" + AC_CHECK_HEADERS([term.h curses.h], + [HAVE_CURSES=yes], + [HAVE_CURSES=no]) + CFLAGS=$save_cflags + if test "$HAVE_CURSES" = yes; then + NCURSES_LIBS=$($NCURSES_CONFIG --libs) + NCURSES_CFLAGS=$($NCURSES_CONFIG --cflags) + AC_SUBST(NCURSES_CFLAGS) + AC_SUBST(NCURSES_LIBS) + fi + fi + AM_CONDITIONAL([HAVE_CURSES], [test "$HAVE_CURSES" = yes])]) + +dnl Checks for linux/vt.h. +AC_DEFUN([OVS_CHECK_LINUX_VT_H], + [AC_CHECK_HEADER([linux/vt.h], + [HAVE_LINUX_VT_H=yes], + [HAVE_LINUX_VT_H=no]) + AM_CONDITIONAL([HAVE_LINUX_VT_H], [test "$HAVE_LINUX_VT_H" = yes]) + if test "$HAVE_LINUX_VT_H" = yes; then + AC_DEFINE([HAVE_LINUX_VT_H], [1], + [Define to 1 if linux/vt.h is available.]) + fi]) + +dnl Checks for libpcre. +AC_DEFUN([OVS_CHECK_PCRE], + [dnl Make sure that pkg-config is installed. + m4_pattern_forbid([PKG_CHECK_MODULES]) + PKG_CHECK_MODULES([PCRE], [libpcre], [HAVE_PCRE=yes], [HAVE_PCRE=no]) + AM_CONDITIONAL([HAVE_PCRE], [test "$HAVE_PCRE" = yes]) + if test "$HAVE_PCRE" = yes; then + AC_DEFINE([HAVE_PCRE], [1], [Define to 1 if libpcre is installed.]) + fi]) diff --git a/secchan/.gitignore b/secchan/.gitignore new file mode 100644 index 00000000..ada65665 --- /dev/null +++ b/secchan/.gitignore @@ -0,0 +1,4 @@ +/Makefile +/Makefile.in +/secchan +/secchan.8 diff --git a/secchan/automake.mk b/secchan/automake.mk new file mode 100644 index 00000000..d6bf1b0c --- /dev/null +++ b/secchan/automake.mk @@ -0,0 +1,42 @@ +# Copyright (C) 2009 Nicira Networks, Inc. +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty provided the copyright +# notice and this notice are preserved. This file is offered as-is, +# without warranty of any kind. + +bin_PROGRAMS += secchan/secchan +man_MANS += secchan/secchan.8 + +secchan_secchan_SOURCES = secchan/main.c +secchan_secchan_LDADD = \ + secchan/libsecchan.a \ + lib/libopenvswitch.a \ + $(FAULT_LIBS) \ + $(SSL_LIBS) + +noinst_LIBRARIES += secchan/libsecchan.a +secchan_libsecchan_a_SOURCES = \ + secchan/discovery.c \ + secchan/discovery.h \ + secchan/executer.c \ + secchan/executer.h \ + secchan/fail-open.c \ + secchan/fail-open.h \ + secchan/in-band.c \ + secchan/in-band.h \ + secchan/netflow.c \ + secchan/netflow.h \ + secchan/ofproto.c \ + secchan/ofproto.h \ + secchan/pktbuf.c \ + secchan/pktbuf.h \ + secchan/pinsched.c \ + secchan/pinsched.h \ + secchan/status.c \ + secchan/status.h + +EXTRA_DIST += secchan/secchan.8.in +DISTCLEANFILES += secchan/secchan.8 + +include secchan/commands/automake.mk diff --git a/secchan/commands/automake.mk b/secchan/commands/automake.mk new file mode 100644 index 00000000..cbe44d8c --- /dev/null +++ b/secchan/commands/automake.mk @@ -0,0 +1,3 @@ +commandsdir = ${pkgdatadir}/commands +dist_commands_SCRIPTS = \ + secchan/commands/reboot diff --git a/secchan/commands/reboot b/secchan/commands/reboot new file mode 100755 index 00000000..42fd10c1 --- /dev/null +++ b/secchan/commands/reboot @@ -0,0 +1,3 @@ +#! /bin/sh +ovs-kill --force --signal=USR1 ovs-switchui.pid +reboot diff --git a/secchan/discovery.c b/secchan/discovery.c new file mode 100644 index 00000000..06de6f07 --- /dev/null +++ b/secchan/discovery.c @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "discovery.h" +#include +#include +#include +#include +#include +#include +#include "dhcp-client.h" +#include "dhcp.h" +#include "dpif.h" +#include "netdev.h" +#include "openflow/openflow.h" +#include "packets.h" +#include "status.h" +#include "vconn-ssl.h" + +#define THIS_MODULE VLM_discovery +#include "vlog.h" + +struct discovery { + char *re; + bool update_resolv_conf; + regex_t *regex; + struct dhclient *dhcp; + int n_changes; + struct status_category *ss_cat; +}; + +static void modify_dhcp_request(struct dhcp_msg *, void *aux); +static bool validate_dhcp_offer(const struct dhcp_msg *, void *aux); + +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 60); + +static void +discovery_status_cb(struct status_reply *sr, void *d_) +{ + struct discovery *d = d_; + + status_reply_put(sr, "accept-remote=%s", d->re); + status_reply_put(sr, "n-changes=%d", d->n_changes); + if (d->dhcp) { + status_reply_put(sr, "state=%s", dhclient_get_state(d->dhcp)); + status_reply_put(sr, "state-elapsed=%u", + dhclient_get_state_elapsed(d->dhcp)); + if (dhclient_is_bound(d->dhcp)) { + uint32_t ip = dhclient_get_ip(d->dhcp); + uint32_t netmask = dhclient_get_netmask(d->dhcp); + uint32_t router = dhclient_get_router(d->dhcp); + + const struct dhcp_msg *cfg = dhclient_get_config(d->dhcp); + uint32_t dns_server; + char *domain_name; + int i; + + status_reply_put(sr, "ip="IP_FMT, IP_ARGS(&ip)); + status_reply_put(sr, "netmask="IP_FMT, IP_ARGS(&netmask)); + if (router) { + status_reply_put(sr, "router="IP_FMT, IP_ARGS(&router)); + } + + for (i = 0; dhcp_msg_get_ip(cfg, DHCP_CODE_DNS_SERVER, i, + &dns_server); + i++) { + status_reply_put(sr, "dns%d="IP_FMT, i, IP_ARGS(&dns_server)); + } + + domain_name = dhcp_msg_get_string(cfg, DHCP_CODE_DOMAIN_NAME); + if (domain_name) { + status_reply_put(sr, "domain=%s", domain_name); + free(domain_name); + } + + status_reply_put(sr, "lease-remaining=%u", + dhclient_get_lease_remaining(d->dhcp)); + } + } +} + +int +discovery_create(const char *re, bool update_resolv_conf, + struct dpif *dpif, struct switch_status *ss, + struct discovery **discoveryp) +{ + struct discovery *d; + char local_name[IF_NAMESIZE]; + int error; + + d = xcalloc(1, sizeof *d); + + /* Controller regular expression. */ + error = discovery_set_accept_controller_re(d, re); + if (error) { + goto error_free; + } + d->update_resolv_conf = update_resolv_conf; + + /* Initialize DHCP client. */ + error = dpif_get_name(dpif, local_name, sizeof local_name); + if (error) { + VLOG_ERR("failed to query datapath local port: %s", strerror(error)); + goto error_regfree; + } + error = dhclient_create(local_name, modify_dhcp_request, + validate_dhcp_offer, d, &d->dhcp); + if (error) { + VLOG_ERR("failed to initialize DHCP client: %s", strerror(error)); + goto error_regfree; + } + dhclient_set_max_timeout(d->dhcp, 3); + dhclient_init(d->dhcp, 0); + + d->ss_cat = switch_status_register(ss, "discovery", + discovery_status_cb, d); + + *discoveryp = d; + return 0; + +error_regfree: + regfree(d->regex); + free(d->regex); +error_free: + free(d); + *discoveryp = 0; + return error; +} + +void +discovery_destroy(struct discovery *d) +{ + if (d) { + free(d->re); + regfree(d->regex); + free(d->regex); + dhclient_destroy(d->dhcp); + switch_status_unregister(d->ss_cat); + free(d); + } +} + +void +discovery_set_update_resolv_conf(struct discovery *d, + bool update_resolv_conf) +{ + d->update_resolv_conf = update_resolv_conf; +} + +int +discovery_set_accept_controller_re(struct discovery *d, const char *re_) +{ + regex_t *regex; + int error; + char *re; + + re = (!re_ ? xstrdup(vconn_ssl_is_configured() ? "^ssl:.*" : ".*") + : re_[0] == '^' ? xstrdup(re_) : xasprintf("^%s", re_)); + regex = xmalloc(sizeof *regex); + error = regcomp(regex, re, REG_NOSUB | REG_EXTENDED); + if (error) { + size_t length = regerror(error, regex, NULL, 0); + char *buffer = xmalloc(length); + regerror(error, regex, buffer, length); + VLOG_WARN("%s: %s", re, buffer); + free(regex); + free(re); + return EINVAL; + } else { + if (d->regex) { + regfree(d->regex); + free(d->regex); + } + free(d->re); + + d->regex = regex; + d->re = re; + return 0; + } +} + +void +discovery_question_connectivity(struct discovery *d) +{ + if (d->dhcp) { + dhclient_force_renew(d->dhcp, 15); + } +} + +bool +discovery_run(struct discovery *d, char **controller_name) +{ + if (!d->dhcp) { + *controller_name = NULL; + return true; + } + + dhclient_run(d->dhcp); + if (!dhclient_changed(d->dhcp)) { + return false; + } + + dhclient_configure_netdev(d->dhcp); + if (d->update_resolv_conf) { + dhclient_update_resolv_conf(d->dhcp); + } + + if (dhclient_is_bound(d->dhcp)) { + *controller_name = dhcp_msg_get_string(dhclient_get_config(d->dhcp), + DHCP_CODE_OFP_CONTROLLER_VCONN); + VLOG_INFO("%s: discovered controller", *controller_name); + d->n_changes++; + } else { + *controller_name = NULL; + if (d->n_changes) { + VLOG_INFO("discovered controller no longer available"); + d->n_changes++; + } + } + return true; +} + +void +discovery_wait(struct discovery *d) +{ + if (d->dhcp) { + dhclient_wait(d->dhcp); + } +} + +static void +modify_dhcp_request(struct dhcp_msg *msg, void *aux UNUSED) +{ + dhcp_msg_put_string(msg, DHCP_CODE_VENDOR_CLASS, "OpenFlow"); +} + +static bool +validate_dhcp_offer(const struct dhcp_msg *msg, void *d_) +{ + const struct discovery *d = d_; + char *vconn_name; + bool accept; + + vconn_name = dhcp_msg_get_string(msg, DHCP_CODE_OFP_CONTROLLER_VCONN); + if (!vconn_name) { + VLOG_WARN_RL(&rl, "rejecting DHCP offer missing controller vconn"); + return false; + } + accept = !regexec(d->regex, vconn_name, 0, NULL, 0); + if (!accept) { + VLOG_WARN_RL(&rl, "rejecting controller vconn that fails to match %s", + d->re); + } + free(vconn_name); + return accept; +} diff --git a/secchan/discovery.h b/secchan/discovery.h new file mode 100644 index 00000000..cf8a925f --- /dev/null +++ b/secchan/discovery.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef DISCOVERY_H +#define DISCOVERY_H 1 + +#include + +struct dpif; +struct discovery; +struct settings; +struct switch_status; + +int discovery_create(const char *accept_controller_re, bool update_resolv_conf, + struct dpif *, struct switch_status *, + struct discovery **); +void discovery_destroy(struct discovery *); +void discovery_set_update_resolv_conf(struct discovery *, + bool update_resolv_conf); +int discovery_set_accept_controller_re(struct discovery *, const char *re); +void discovery_question_connectivity(struct discovery *); +bool discovery_run(struct discovery *, char **controller_name); +void discovery_wait(struct discovery *); + +#endif /* discovery.h */ diff --git a/secchan/executer.c b/secchan/executer.c new file mode 100644 index 00000000..304df56c --- /dev/null +++ b/secchan/executer.c @@ -0,0 +1,519 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "executer.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dirs.h" +#include "dynamic-string.h" +#include "fatal-signal.h" +#include "openflow/nicira-ext.h" +#include "ofpbuf.h" +#include "openflow/openflow.h" +#include "poll-loop.h" +#include "rconn.h" +#include "socket-util.h" +#include "util.h" +#include "vconn.h" + +#define THIS_MODULE VLM_executer +#include "vlog.h" + +#define MAX_CHILDREN 8 + +struct child { + /* Information about child process. */ + char *name; /* argv[0] passed to child. */ + pid_t pid; /* Child's process ID. */ + + /* For sending a reply to the controller when the child dies. */ + struct rconn *rconn; + uint32_t xid; /* Transaction ID used by controller. */ + + /* We read up to MAX_OUTPUT bytes of output and send them back to the + * controller when the child dies. */ +#define MAX_OUTPUT 4096 + int output_fd; /* FD from which to read child's output. */ + uint8_t *output; /* Output data. */ + size_t output_size; /* Number of bytes of output data so far. */ +}; + +struct executer { + /* Settings. */ + char *command_acl; /* Command white/blacklist, as shell globs. */ + char *command_dir; /* Directory that contains commands. */ + + /* Children. */ + struct child children[MAX_CHILDREN]; + size_t n_children; +}; + +/* File descriptors for waking up when a child dies. */ +static int signal_fds[2]; + +/* File descriptor for /dev/null. */ +static int null_fd = -1; + +static void send_child_status(struct rconn *, uint32_t xid, uint32_t status, + const void *data, size_t size); +static void send_child_message(struct rconn *, uint32_t xid, uint32_t status, + const char *message); + +/* Returns true if 'cmd' is allowed by 'acl', which is a command-separated + * access control list in the format described for --command-acl in + * secchan(8). */ +static bool +executer_is_permitted(const char *acl_, const char *cmd) +{ + char *acl, *save_ptr, *pattern; + bool allowed, denied; + + /* Verify that 'cmd' consists only of alphanumerics plus _ or -. */ + if (cmd[strspn(cmd, "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-")] != '\0') { + VLOG_WARN("rejecting command name \"%s\" that contain forbidden " + "characters", cmd); + return false; + } + + /* Check 'cmd' against 'acl'. */ + acl = xstrdup(acl_); + save_ptr = acl; + allowed = denied = false; + while ((pattern = strsep(&save_ptr, ",")) != NULL && !denied) { + if (pattern[0] != '!' && !fnmatch(pattern, cmd, 0)) { + allowed = true; + } else if (pattern[0] == '!' && !fnmatch(pattern + 1, cmd, 0)) { + denied = true; + } + } + free(acl); + + /* Check the command white/blacklisted state. */ + if (allowed && !denied) { + VLOG_INFO("permitting command execution: \"%s\" is whitelisted", cmd); + } else if (allowed && denied) { + VLOG_WARN("denying command execution: \"%s\" is both blacklisted " + "and whitelisted", cmd); + } else if (!allowed) { + VLOG_WARN("denying command execution: \"%s\" is not whitelisted", cmd); + } else if (denied) { + VLOG_WARN("denying command execution: \"%s\" is blacklisted", cmd); + } + return allowed && !denied; +} + +int +executer_handle_request(struct executer *e, struct rconn *rconn, + struct nicira_header *request) +{ + char **argv; + char *args; + char *exec_file = NULL; + int max_fds; + struct stat s; + size_t args_size; + size_t argc; + size_t i; + pid_t pid; + int output_fds[2]; + + /* Verify limit on children not exceeded. + * XXX should probably kill children when the connection drops? */ + if (e->n_children >= MAX_CHILDREN) { + send_child_message(rconn, request->header.xid, NXT_STATUS_ERROR, + "too many child processes"); + return 0; + } + + /* Copy argument buffer, adding a null terminator at the end. Now every + * argument is null-terminated, instead of being merely null-delimited. */ + args_size = ntohs(request->header.length) - sizeof *request; + args = xmemdup0((const void *) (request + 1), args_size); + + /* Count arguments. */ + argc = 0; + for (i = 0; i <= args_size; i++) { + argc += args[i] == '\0'; + } + + /* Set argv[*] to point to each argument. */ + argv = xmalloc((argc + 1) * sizeof *argv); + argv[0] = args; + for (i = 1; i < argc; i++) { + argv[i] = strchr(argv[i - 1], '\0') + 1; + } + argv[argc] = NULL; + + /* Check permissions. */ + if (!executer_is_permitted(e->command_acl, argv[0])) { + send_child_message(rconn, request->header.xid, NXT_STATUS_ERROR, + "command not allowed"); + goto done; + } + + /* Find the executable. */ + exec_file = xasprintf("%s/%s", e->command_dir, argv[0]); + if (stat(exec_file, &s)) { + VLOG_WARN("failed to stat \"%s\": %s", exec_file, strerror(errno)); + send_child_message(rconn, request->header.xid, NXT_STATUS_ERROR, + "command not allowed"); + goto done; + } + if (!S_ISREG(s.st_mode)) { + VLOG_WARN("\"%s\" is not a regular file", exec_file); + send_child_message(rconn, request->header.xid, NXT_STATUS_ERROR, + "command not allowed"); + goto done; + } + argv[0] = exec_file; + + /* Arrange to capture output. */ + if (pipe(output_fds)) { + VLOG_WARN("pipe failed: %s", strerror(errno)); + send_child_message(rconn, request->header.xid, NXT_STATUS_ERROR, + "internal error (pipe)"); + goto done; + } + + pid = fork(); + if (!pid) { + /* Running in child. + * XXX should run in new process group so that we can signal all + * subprocesses at once? Would also want to catch fatal signals and + * kill them at the same time though. */ + fatal_signal_fork(); + dup2(null_fd, 0); + dup2(output_fds[1], 1); + dup2(null_fd, 2); + max_fds = get_max_fds(); + for (i = 3; i < max_fds; i++) { + close(i); + } + if (chdir(e->command_dir)) { + printf("could not change directory to \"%s\": %s", + e->command_dir, strerror(errno)); + exit(EXIT_FAILURE); + } + execv(argv[0], argv); + printf("failed to start \"%s\": %s\n", argv[0], strerror(errno)); + exit(EXIT_FAILURE); + } else if (pid > 0) { + /* Running in parent. */ + struct child *child; + + VLOG_INFO("started \"%s\" subprocess", argv[0]); + send_child_status(rconn, request->header.xid, NXT_STATUS_STARTED, + NULL, 0); + child = &e->children[e->n_children++]; + child->name = xstrdup(argv[0]); + child->pid = pid; + child->rconn = rconn; + child->xid = request->header.xid; + child->output_fd = output_fds[0]; + child->output = xmalloc(MAX_OUTPUT); + child->output_size = 0; + set_nonblocking(output_fds[0]); + close(output_fds[1]); + } else { + VLOG_WARN("fork failed: %s", strerror(errno)); + send_child_message(rconn, request->header.xid, NXT_STATUS_ERROR, + "internal error (fork)"); + close(output_fds[0]); + close(output_fds[1]); + } + +done: + free(exec_file); + free(args); + free(argv); + return 0; +} + +static void +send_child_status(struct rconn *rconn, uint32_t xid, uint32_t status, + const void *data, size_t size) +{ + if (rconn) { + struct nx_command_reply *r; + struct ofpbuf *buffer; + + r = make_openflow_xid(sizeof *r, OFPT_VENDOR, xid, &buffer); + r->nxh.vendor = htonl(NX_VENDOR_ID); + r->nxh.subtype = htonl(NXT_COMMAND_REPLY); + r->status = htonl(status); + ofpbuf_put(buffer, data, size); + update_openflow_length(buffer); + if (rconn_send(rconn, buffer, NULL)) { + ofpbuf_delete(buffer); + } + } +} + +static void +send_child_message(struct rconn *rconn, uint32_t xid, uint32_t status, + const char *message) +{ + send_child_status(rconn, xid, status, message, strlen(message)); +} + +/* 'child' died with 'status' as its return code. Deal with it. */ +static void +child_terminated(struct child *child, int status) +{ + struct ds ds; + uint32_t ofp_status; + + /* Log how it terminated. */ + ds_init(&ds); + if (WIFEXITED(status)) { + ds_put_format(&ds, "normally with status %d", WEXITSTATUS(status)); + } else if (WIFSIGNALED(status)) { + const char *name = NULL; +#ifdef HAVE_STRSIGNAL + name = strsignal(WTERMSIG(status)); +#endif + ds_put_format(&ds, "by signal %d", WTERMSIG(status)); + if (name) { + ds_put_format(&ds, " (%s)", name); + } + } + if (WCOREDUMP(status)) { + ds_put_cstr(&ds, " (core dumped)"); + } + VLOG_INFO("child process \"%s\" with pid %ld terminated %s", + child->name, (long int) child->pid, ds_cstr(&ds)); + ds_destroy(&ds); + + /* Send a status message back to the controller that requested the + * command. */ + if (WIFEXITED(status)) { + ofp_status = WEXITSTATUS(status) | NXT_STATUS_EXITED; + } else if (WIFSIGNALED(status)) { + ofp_status = WTERMSIG(status) | NXT_STATUS_SIGNALED; + } else { + ofp_status = NXT_STATUS_UNKNOWN; + } + if (WCOREDUMP(status)) { + ofp_status |= NXT_STATUS_COREDUMP; + } + send_child_status(child->rconn, child->xid, ofp_status, + child->output, child->output_size); +} + +/* Read output from 'child' and append it to its output buffer. */ +static void +poll_child(struct child *child) +{ + ssize_t n; + + if (child->output_fd < 0) { + return; + } + + do { + n = read(child->output_fd, child->output + child->output_size, + MAX_OUTPUT - child->output_size); + } while (n < 0 && errno == EINTR); + if (n > 0) { + child->output_size += n; + if (child->output_size < MAX_OUTPUT) { + return; + } + } else if (n < 0 && errno == EAGAIN) { + return; + } + close(child->output_fd); + child->output_fd = -1; +} + +void +executer_run(struct executer *e) +{ + char buffer[MAX_CHILDREN]; + size_t i; + + if (!e->n_children) { + return; + } + + /* Read output from children. */ + for (i = 0; i < e->n_children; i++) { + struct child *child = &e->children[i]; + poll_child(child); + } + + /* If SIGCHLD was received, reap dead children. */ + if (read(signal_fds[0], buffer, sizeof buffer) <= 0) { + return; + } + for (;;) { + int status; + pid_t pid; + + /* Get dead child in 'pid' and its return code in 'status'. */ + pid = waitpid(WAIT_ANY, &status, WNOHANG); + if (pid < 0 && errno == EINTR) { + continue; + } else if (pid <= 0) { + return; + } + + /* Find child with given 'pid' and drop it from the list. */ + for (i = 0; i < e->n_children; i++) { + struct child *child = &e->children[i]; + if (child->pid == pid) { + poll_child(child); + child_terminated(child, status); + free(child->name); + free(child->output); + *child = e->children[--e->n_children]; + goto found; + } + } + VLOG_WARN("child with unknown pid %ld terminated", (long int) pid); + found:; + } + +} + +void +executer_wait(struct executer *e) +{ + if (e->n_children) { + size_t i; + + /* Wake up on SIGCHLD. */ + poll_fd_wait(signal_fds[0], POLLIN); + + /* Wake up when we get output from a child. */ + for (i = 0; i < e->n_children; i++) { + struct child *child = &e->children[i]; + if (child->output_fd >= 0) { + poll_fd_wait(child->output_fd, POLLIN); + } + } + } +} + +void +executer_rconn_closing(struct executer *e, struct rconn *rconn) +{ + size_t i; + + /* If any of our children was connected to 'r', then disconnect it so we + * don't try to reference a dead connection when the process terminates + * later. + * XXX kill the children started by 'r'? */ + for (i = 0; i < e->n_children; i++) { + if (e->children[i].rconn == rconn) { + e->children[i].rconn = NULL; + } + } +} + +static void +sigchld_handler(int signr UNUSED) +{ + write(signal_fds[1], "", 1); +} + +int +executer_create(const char *command_acl, const char *command_dir, + struct executer **executerp) +{ + struct executer *e; + struct sigaction sa; + + *executerp = NULL; + if (null_fd == -1) { + /* Create pipe for notifying us that SIGCHLD was invoked. */ + if (pipe(signal_fds)) { + VLOG_ERR("pipe failed: %s", strerror(errno)); + return errno; + } + set_nonblocking(signal_fds[0]); + set_nonblocking(signal_fds[1]); + + /* Open /dev/null. */ + null_fd = open("/dev/null", O_RDWR); + if (null_fd < 0) { + int error = errno; + VLOG_ERR("could not open /dev/null: %s", strerror(error)); + close(signal_fds[0]); + close(signal_fds[1]); + return error; + } + } + + /* Set up signal handler. */ + memset(&sa, 0, sizeof sa); + sa.sa_handler = sigchld_handler; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_NOCLDSTOP | SA_RESTART; + if (sigaction(SIGCHLD, &sa, NULL)) { + VLOG_ERR("sigaction(SIGCHLD) failed: %s", strerror(errno)); + return errno; + } + + e = xcalloc(1, sizeof *e); + e->command_acl = xstrdup(command_acl); + e->command_dir = (command_dir + ? xstrdup(command_dir) + : xasprintf("%s/commands", ovs_pkgdatadir)); + e->n_children = 0; + *executerp = e; + return 0; +} + +void +executer_destroy(struct executer *e) +{ + if (e) { + size_t i; + + free(e->command_acl); + free(e->command_dir); + for (i = 0; i < e->n_children; i++) { + struct child *child = &e->children[i]; + + free(child->name); + kill(child->pid, SIGHUP); + /* We don't own child->rconn. */ + free(child->output); + free(child); + } + free(e); + } +} + +void +executer_set_acl(struct executer *e, const char *acl, const char *dir) +{ + free(e->command_acl); + e->command_acl = xstrdup(acl); + free(e->command_dir); + e->command_dir = xstrdup(dir); +} diff --git a/secchan/executer.h b/secchan/executer.h new file mode 100644 index 00000000..a663367b --- /dev/null +++ b/secchan/executer.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef EXECUTER_H +#define EXECUTER_H 1 + +struct executer; +struct nicira_header; +struct rconn; + +int executer_create(const char *acl, const char *dir, struct executer **); +void executer_set_acl(struct executer *, const char *acl, const char *dir); +void executer_destroy(struct executer *); +void executer_run(struct executer *); +void executer_wait(struct executer *); +void executer_rconn_closing(struct executer *, struct rconn *); +int executer_handle_request(struct executer *, struct rconn *, + struct nicira_header *); + +#endif /* executer.h */ diff --git a/secchan/fail-open.c b/secchan/fail-open.c new file mode 100644 index 00000000..1ef989a0 --- /dev/null +++ b/secchan/fail-open.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "fail-open.h" +#include +#include +#include "flow.h" +#include "mac-learning.h" +#include "odp-util.h" +#include "ofproto.h" +#include "rconn.h" +#include "status.h" +#include "timeval.h" + +#define THIS_MODULE VLM_fail_open +#include "vlog.h" + +struct fail_open { + struct ofproto *ofproto; + struct rconn *controller; + int trigger_duration; + int last_disconn_secs; + struct status_category *ss_cat; +}; + +/* Causes the switch to enter or leave fail-open mode, if appropriate. */ +void +fail_open_run(struct fail_open *fo) +{ + int disconn_secs = rconn_failure_duration(fo->controller); + bool open = disconn_secs >= fo->trigger_duration; + if (open != (fo->last_disconn_secs != 0)) { + if (!open) { + flow_t flow; + + VLOG_WARN("No longer in fail-open mode"); + fo->last_disconn_secs = 0; + + memset(&flow, 0, sizeof flow); + ofproto_delete_flow(fo->ofproto, &flow, OFPFW_ALL, 70000); + } else { + VLOG_WARN("Could not connect to controller for %d seconds, " + "failing open", disconn_secs); + fo->last_disconn_secs = disconn_secs; + + /* Flush all OpenFlow and datapath flows. We will set up our + * fail-open rule from fail_open_flushed() when + * ofproto_flush_flows() calls back to us. */ + ofproto_flush_flows(fo->ofproto); + } + } else if (open && disconn_secs > fo->last_disconn_secs + 60) { + VLOG_INFO("Still in fail-open mode after %d seconds disconnected " + "from controller", disconn_secs); + fo->last_disconn_secs = disconn_secs; + } +} + +void +fail_open_wait(struct fail_open *fo UNUSED) +{ + /* Nothing to do. */ +} + +void +fail_open_flushed(struct fail_open *fo) +{ + int disconn_secs = rconn_failure_duration(fo->controller); + bool open = disconn_secs >= fo->trigger_duration; + if (open) { + union ofp_action action; + flow_t flow; + + /* Set up a flow that matches every packet and directs them to + * OFPP_NORMAL. */ + memset(&action, 0, sizeof action); + action.type = htons(OFPAT_OUTPUT); + action.output.len = htons(sizeof action); + action.output.port = htons(OFPP_NORMAL); + memset(&flow, 0, sizeof flow); + ofproto_add_flow(fo->ofproto, &flow, OFPFW_ALL, 70000, + &action, 1, 0); + } +} + +static void +fail_open_status_cb(struct status_reply *sr, void *fo_) +{ + struct fail_open *fo = fo_; + int cur_duration = rconn_failure_duration(fo->controller); + + status_reply_put(sr, "trigger-duration=%d", fo->trigger_duration); + status_reply_put(sr, "current-duration=%d", cur_duration); + status_reply_put(sr, "triggered=%s", + cur_duration >= fo->trigger_duration ? "true" : "false"); +} + +struct fail_open * +fail_open_create(struct ofproto *ofproto, + int trigger_duration, struct switch_status *switch_status, + struct rconn *controller) +{ + struct fail_open *fo = xmalloc(sizeof *fo); + fo->ofproto = ofproto; + fo->controller = controller; + fo->trigger_duration = trigger_duration; + fo->last_disconn_secs = 0; + fo->ss_cat = switch_status_register(switch_status, "fail-open", + fail_open_status_cb, fo); + return fo; +} + +void +fail_open_set_trigger_duration(struct fail_open *fo, int trigger_duration) +{ + fo->trigger_duration = trigger_duration; +} + +void +fail_open_destroy(struct fail_open *fo) +{ + if (fo) { + /* We don't own fo->controller. */ + switch_status_unregister(fo->ss_cat); + free(fo); + } +} diff --git a/secchan/fail-open.h b/secchan/fail-open.h new file mode 100644 index 00000000..8e01da2d --- /dev/null +++ b/secchan/fail-open.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef FAIL_OPEN_H +#define FAIL_OPEN_H 1 + +#include +#include +#include "flow.h" + +struct fail_open; +struct ofproto; +struct rconn; +struct switch_status; + +struct fail_open *fail_open_create(struct ofproto *, int trigger_duration, + struct switch_status *, + struct rconn *controller); +void fail_open_set_trigger_duration(struct fail_open *, int trigger_duration); +void fail_open_destroy(struct fail_open *); +void fail_open_wait(struct fail_open *); +void fail_open_run(struct fail_open *); +void fail_open_flushed(struct fail_open *); + +#endif /* fail-open.h */ diff --git a/secchan/in-band.c b/secchan/in-band.c new file mode 100644 index 00000000..51bf9ab4 --- /dev/null +++ b/secchan/in-band.c @@ -0,0 +1,358 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "in-band.h" +#include +#include +#include +#include +#include +#include "dpif.h" +#include "flow.h" +#include "mac-learning.h" +#include "netdev.h" +#include "odp-util.h" +#include "ofp-print.h" +#include "ofproto.h" +#include "ofpbuf.h" +#include "openflow/openflow.h" +#include "packets.h" +#include "poll-loop.h" +#include "rconn.h" +#include "status.h" +#include "timeval.h" +#include "vconn.h" + +#define THIS_MODULE VLM_in_band +#include "vlog.h" + +#define IB_BASE_PRIORITY 18181800 + +enum { + IBR_FROM_LOCAL_PORT, /* Sent by secure channel. */ + IBR_TO_LOCAL_PORT, /* Sent to secure channel. */ + IBR_ARP_FROM_CTL, /* ARP from the controller. */ + IBR_TO_CTL_OFP_SRC, /* To controller, OpenFlow source port. */ + IBR_TO_CTL_OFP_DST, /* To controller, OpenFlow dest port. */ + IBR_FROM_CTL_OFP_SRC, /* From controller, OpenFlow source port. */ + IBR_FROM_CTL_OFP_DST, /* From controller, OpenFlow dest port. */ +#if OFP_TCP_PORT != OFP_SSL_PORT +#error Need to support separate TCP and SSL flows. +#endif + N_IB_RULES +}; + +struct ib_rule { + bool installed; + flow_t flow; + uint32_t wildcards; + unsigned int priority; +}; + +struct in_band { + struct ofproto *ofproto; + struct netdev *netdev; + struct rconn *controller; + struct status_category *ss_cat; + + /* Keeping track of controller's MAC address. */ + uint32_t ip; /* Current IP, 0 if unknown. */ + uint32_t last_ip; /* Last known IP, 0 if never known. */ + uint8_t mac[ETH_ADDR_LEN]; /* Current MAC, 0 if unknown. */ + uint8_t last_mac[ETH_ADDR_LEN]; /* Last known MAC, 0 if never known */ + time_t next_refresh; /* Next time to refresh MAC address. */ + + /* Keeping track of the local port's MAC address. */ + uint8_t local_mac[ETH_ADDR_LEN]; /* Current MAC. */ + time_t next_local_refresh; /* Next time to refresh MAC address. */ + + /* Rules that we set up. */ + struct ib_rule rules[N_IB_RULES]; +}; + +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 60); + +static const uint8_t * +get_controller_mac(struct in_band *ib) +{ + time_t now = time_now(); + uint32_t ip; + + ip = rconn_get_ip(ib->controller); + if (ip != ib->ip || now >= ib->next_refresh) { + bool have_mac; + + ib->ip = ip; + + /* Look up MAC address. */ + memset(ib->mac, 0, sizeof ib->mac); + if (ib->ip) { + int retval = netdev_arp_lookup(ib->netdev, ib->ip, ib->mac); + if (retval) { + VLOG_DBG_RL(&rl, "cannot look up controller hw address " + "("IP_FMT"): %s", + IP_ARGS(&ib->ip), strerror(retval)); + } + } + have_mac = !eth_addr_is_zero(ib->mac); + + /* Log changes in IP, MAC addresses. */ + if (ib->ip && ib->ip != ib->last_ip) { + VLOG_DBG("controller IP address changed from "IP_FMT + " to "IP_FMT, IP_ARGS(&ib->last_ip), IP_ARGS(&ib->ip)); + ib->last_ip = ib->ip; + } + if (have_mac && memcmp(ib->last_mac, ib->mac, ETH_ADDR_LEN)) { + VLOG_DBG("controller MAC address changed from "ETH_ADDR_FMT" to " + ETH_ADDR_FMT, + ETH_ADDR_ARGS(ib->last_mac), ETH_ADDR_ARGS(ib->mac)); + memcpy(ib->last_mac, ib->mac, ETH_ADDR_LEN); + } + + /* Schedule next refresh. + * + * If we have an IP address but not a MAC address, then refresh + * quickly, since we probably will get a MAC address soon (via ARP). + * Otherwise, we can afford to wait a little while. */ + ib->next_refresh = now + (!ib->ip || have_mac ? 10 : 1); + } + return !eth_addr_is_zero(ib->mac) ? ib->mac : NULL; +} + +static const uint8_t * +get_local_mac(struct in_band *ib) +{ + time_t now = time_now(); + if (now >= ib->next_local_refresh) { + uint8_t ea[ETH_ADDR_LEN]; + if (!netdev_nodev_get_etheraddr(netdev_get_name(ib->netdev), ea)) { + memcpy(ib->local_mac, ea, ETH_ADDR_LEN); + } + ib->next_local_refresh = now + 1; + } + return !eth_addr_is_zero(ib->local_mac) ? ib->local_mac : NULL; +} + +static void +in_band_status_cb(struct status_reply *sr, void *in_band_) +{ + struct in_band *in_band = in_band_; + struct in_addr local_ip; + const uint8_t *local_mac; + uint32_t controller_ip; + const uint8_t *controller_mac; + + if (netdev_get_in4(in_band->netdev, &local_ip)) { + status_reply_put(sr, "local-ip="IP_FMT, IP_ARGS(&local_ip.s_addr)); + } + local_mac = get_local_mac(in_band); + if (local_mac) { + status_reply_put(sr, "local-mac="ETH_ADDR_FMT, + ETH_ADDR_ARGS(local_mac)); + } + + controller_ip = rconn_get_ip(in_band->controller); + if (controller_ip) { + status_reply_put(sr, "controller-ip="IP_FMT, + IP_ARGS(&controller_ip)); + } + controller_mac = get_controller_mac(in_band); + if (controller_mac) { + status_reply_put(sr, "controller-mac="ETH_ADDR_FMT, + ETH_ADDR_ARGS(controller_mac)); + } +} + +static void +drop_flow(struct in_band *in_band, int rule_idx) +{ + struct ib_rule *rule = &in_band->rules[rule_idx]; + + if (rule->installed) { + rule->installed = false; + ofproto_delete_flow(in_band->ofproto, &rule->flow, rule->wildcards, + rule->priority); + } +} + +/* out_port and fixed_fields are assumed never to change. */ +static void +setup_flow(struct in_band *in_band, int rule_idx, const flow_t *flow, + uint32_t fixed_fields, uint16_t out_port) +{ + struct ib_rule *rule = &in_band->rules[rule_idx]; + + if (!rule->installed || memcmp(flow, &rule->flow, sizeof *flow)) { + union ofp_action action; + + drop_flow(in_band, rule_idx); + + rule->installed = true; + rule->flow = *flow; + rule->wildcards = OFPFW_ALL & ~fixed_fields; + rule->priority = IB_BASE_PRIORITY + (N_IB_RULES - rule_idx); + + action.type = htons(OFPAT_OUTPUT); + action.output.len = htons(sizeof action); + action.output.port = htons(out_port); + action.output.max_len = htons(0); + ofproto_add_flow(in_band->ofproto, &rule->flow, rule->wildcards, + rule->priority, &action, 1, 0); + } +} + +void +in_band_run(struct in_band *in_band) +{ + const uint8_t *controller_mac; + const uint8_t *local_mac; + flow_t flow; + + if (time_now() < MIN(in_band->next_refresh, in_band->next_local_refresh)) { + return; + } + controller_mac = get_controller_mac(in_band); + local_mac = get_local_mac(in_band); + + /* Switch traffic sent by the secure channel. */ + memset(&flow, 0, sizeof flow); + flow.in_port = ODPP_LOCAL; + setup_flow(in_band, IBR_FROM_LOCAL_PORT, &flow, OFPFW_IN_PORT, + OFPP_NORMAL); + + /* Deliver traffic sent to the secure channel to the local port. */ + if (local_mac) { + memset(&flow, 0, sizeof flow); + memcpy(flow.dl_dst, local_mac, ETH_ADDR_LEN); + setup_flow(in_band, IBR_TO_LOCAL_PORT, &flow, OFPFW_DL_DST, + OFPP_NORMAL); + } else { + drop_flow(in_band, IBR_TO_LOCAL_PORT); + } + + if (controller_mac) { + /* Switch ARP requests sent by the controller. (OFPP_NORMAL will "do + * the right thing" regarding VLANs here.) */ + memset(&flow, 0, sizeof flow); + flow.dl_type = htons(ETH_TYPE_ARP); + memcpy(flow.dl_dst, eth_addr_broadcast, ETH_ADDR_LEN); + memcpy(flow.dl_src, controller_mac, ETH_ADDR_LEN); + setup_flow(in_band, IBR_ARP_FROM_CTL, &flow, + OFPFW_DL_TYPE | OFPFW_DL_DST | OFPFW_DL_SRC, + OFPP_NORMAL); + + /* OpenFlow traffic to or from the controller. + * + * (A given field's value is completely ignored if it is wildcarded, + * which is why we can get away with using a single 'flow' in each + * case here.) */ + memset(&flow, 0, sizeof flow); + flow.dl_type = htons(ETH_TYPE_IP); + memcpy(flow.dl_src, controller_mac, ETH_ADDR_LEN); + memcpy(flow.dl_dst, controller_mac, ETH_ADDR_LEN); + flow.nw_proto = IP_TYPE_TCP; + flow.tp_src = htons(OFP_TCP_PORT); + flow.tp_dst = htons(OFP_TCP_PORT); + setup_flow(in_band, IBR_TO_CTL_OFP_SRC, &flow, + (OFPFW_DL_TYPE | OFPFW_DL_DST | OFPFW_NW_PROTO + | OFPFW_TP_SRC), OFPP_NORMAL); + setup_flow(in_band, IBR_TO_CTL_OFP_DST, &flow, + (OFPFW_DL_TYPE | OFPFW_DL_DST | OFPFW_NW_PROTO + | OFPFW_TP_DST), OFPP_NORMAL); + setup_flow(in_band, IBR_FROM_CTL_OFP_SRC, &flow, + (OFPFW_DL_TYPE | OFPFW_DL_SRC | OFPFW_NW_PROTO + | OFPFW_TP_SRC), OFPP_NORMAL); + setup_flow(in_band, IBR_FROM_CTL_OFP_DST, &flow, + (OFPFW_DL_TYPE | OFPFW_DL_SRC | OFPFW_NW_PROTO + | OFPFW_TP_DST), OFPP_NORMAL); + } else { + drop_flow(in_band, IBR_ARP_FROM_CTL); + drop_flow(in_band, IBR_TO_CTL_OFP_DST); + drop_flow(in_band, IBR_TO_CTL_OFP_SRC); + drop_flow(in_band, IBR_FROM_CTL_OFP_DST); + drop_flow(in_band, IBR_FROM_CTL_OFP_SRC); + } +} + +void +in_band_wait(struct in_band *in_band) +{ + time_t now = time_now(); + time_t wakeup = MIN(in_band->next_refresh, in_band->next_local_refresh); + if (wakeup > now) { + poll_timer_wait((wakeup - now) * 1000); + } else { + poll_immediate_wake(); + } +} + +void +in_band_flushed(struct in_band *in_band) +{ + int i; + + for (i = 0; i < N_IB_RULES; i++) { + in_band->rules[i].installed = false; + } +} + +int +in_band_create(struct ofproto *ofproto, + struct dpif *dpif, struct switch_status *ss, + struct rconn *controller, struct in_band **in_bandp) +{ + struct in_band *in_band; + struct netdev *netdev; + char local_name[IF_NAMESIZE]; + int error; + + *in_bandp = NULL; + error = dpif_get_name(dpif, local_name, sizeof local_name); + if (error) { + return error; + } + + error = netdev_open(local_name, NETDEV_ETH_TYPE_NONE, &netdev); + if (error) { + VLOG_ERR("failed to open %s network device: %s", + local_name, strerror(error)); + return error; + } + + in_band = xcalloc(1, sizeof *in_band); + in_band->ofproto = ofproto; + in_band->netdev = netdev; + in_band->controller = controller; + in_band->ss_cat = switch_status_register(ss, "in-band", + in_band_status_cb, in_band); + in_band->next_refresh = TIME_MIN; + in_band->next_local_refresh = TIME_MIN; + + *in_bandp = in_band; + return 0; +} + +void +in_band_destroy(struct in_band *in_band) +{ + if (in_band) { + netdev_close(in_band->netdev); + switch_status_unregister(in_band->ss_cat); + /* We don't own the rconn. */ + } +} + diff --git a/secchan/in-band.h b/secchan/in-band.h new file mode 100644 index 00000000..8673d2ad --- /dev/null +++ b/secchan/in-band.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef IN_BAND_H +#define IN_BAND_H 1 + +#include "flow.h" + +struct dpif; +struct in_band; +struct ofproto; +struct rconn; +struct secchan; +struct settings; +struct switch_status; + +int in_band_create(struct ofproto *, struct dpif *, struct switch_status *, + struct rconn *controller, struct in_band **); +void in_band_destroy(struct in_band *); +void in_band_run(struct in_band *); +void in_band_wait(struct in_band *); +void in_band_flushed(struct in_band *); + +#endif /* in-band.h */ diff --git a/secchan/main.c b/secchan/main.c new file mode 100644 index 00000000..ca385766 --- /dev/null +++ b/secchan/main.c @@ -0,0 +1,565 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "command-line.h" +#include "compiler.h" +#include "daemon.h" +#include "dirs.h" +#include "discovery.h" +#include "dpif.h" +#include "fail-open.h" +#include "fault.h" +#include "in-band.h" +#include "leak-checker.h" +#include "list.h" +#include "netdev.h" +#include "ofpbuf.h" +#include "ofproto.h" +#include "openflow/openflow.h" +#include "packets.h" +#include "poll-loop.h" +#include "rconn.h" +#include "status.h" +#include "svec.h" +#include "timeval.h" +#include "unixctl.h" +#include "util.h" +#include "vconn-ssl.h" +#include "vconn.h" + +#include "vlog.h" +#define THIS_MODULE VLM_secchan + +/* Behavior when the connection to the controller fails. */ +enum fail_mode { + FAIL_OPEN, /* Act as learning switch. */ + FAIL_CLOSED /* Drop all packets. */ +}; + +/* Settings that may be configured by the user. */ +struct ofsettings { + /* Overall mode of operation. */ + bool discovery; /* Discover the controller automatically? */ + bool in_band; /* Connect to controller in-band? */ + + /* Datapath. */ + uint64_t datapath_id; /* Datapath ID. */ + const char *dp_name; /* Name of local datapath. */ + + /* Description strings. */ + const char *mfr_desc; /* Manufacturer. */ + const char *hw_desc; /* Hardware. */ + const char *sw_desc; /* Software version. */ + const char *serial_desc; /* Serial number. */ + + /* Related vconns and network devices. */ + const char *controller_name; /* Controller (if not discovery mode). */ + struct svec listeners; /* Listen for management connections. */ + struct svec snoops; /* Listen for controller snooping conns. */ + + /* Failure behavior. */ + enum fail_mode fail_mode; /* Act as learning switch if no controller? */ + int max_idle; /* Idle time for flows in fail-open mode. */ + int probe_interval; /* # seconds idle before sending echo request. */ + int max_backoff; /* Max # seconds between connection attempts. */ + + /* Packet-in rate-limiting. */ + int rate_limit; /* Tokens added to bucket per second. */ + int burst_limit; /* Maximum number token bucket size. */ + + /* Discovery behavior. */ + const char *accept_controller_re; /* Controller vconns to accept. */ + bool update_resolv_conf; /* Update /etc/resolv.conf? */ + + /* Spanning tree protocol. */ + bool enable_stp; + + /* Remote command execution. */ + char *command_acl; /* Command white/blacklist, as shell globs. */ + char *command_dir; /* Directory that contains commands. */ + + /* Management. */ + uint64_t mgmt_id; /* Management ID. */ + + /* NetFlow. */ + struct svec netflow; /* NetFlow targets. */ +}; + +static void parse_options(int argc, char *argv[], struct ofsettings *); +static void usage(void) NO_RETURN; + +int +main(int argc, char *argv[]) +{ + struct unixctl_server *unixctl; + struct ofproto *ofproto; + struct ofsettings s; + int error; + + set_program_name(argv[0]); + register_fault_handlers(); + time_init(); + vlog_init(); + parse_options(argc, argv, &s); + signal(SIGPIPE, SIG_IGN); + + die_if_already_running(); + daemonize(); + + /* Start listening for ovs-appctl requests. */ + error = unixctl_server_create(NULL, &unixctl); + if (error) { + ovs_fatal(error, "Could not listen for unixctl connections"); + } + + VLOG_INFO("Open vSwitch version %s", VERSION BUILDNR); + VLOG_INFO("OpenFlow protocol version 0x%02x", OFP_VERSION); + + /* Start OpenFlow processing. */ + error = ofproto_create(s.dp_name, NULL, NULL, &ofproto); + if (error) { + ovs_fatal(error, "could not initialize openflow switch"); + } + error = ofproto_set_in_band(ofproto, s.in_band); + if (error) { + ovs_fatal(error, "failed to configure in-band control"); + } + error = ofproto_set_discovery(ofproto, s.discovery, s.accept_controller_re, + s.update_resolv_conf); + if (error) { + ovs_fatal(error, "failed to configure controller discovery"); + } + if (s.datapath_id) { + ofproto_set_datapath_id(ofproto, s.datapath_id); + } + if (s.mgmt_id) { + ofproto_set_mgmt_id(ofproto, s.mgmt_id); + } + ofproto_set_desc(ofproto, s.mfr_desc, s.hw_desc, s.sw_desc, s.serial_desc); + error = ofproto_set_listeners(ofproto, &s.listeners); + if (error) { + ovs_fatal(error, "failed to configure management connections"); + } + error = ofproto_set_snoops(ofproto, &s.snoops); + if (error) { + ovs_fatal(error, + "failed to configure controller snooping connections"); + } + error = ofproto_set_netflow(ofproto, &s.netflow, 0, 0, false); + if (error) { + ovs_fatal(error, "failed to configure NetFlow collectors"); + } + ofproto_set_failure(ofproto, s.fail_mode == FAIL_OPEN); + ofproto_set_probe_interval(ofproto, s.probe_interval); + ofproto_set_max_backoff(ofproto, s.max_backoff); + ofproto_set_rate_limit(ofproto, s.rate_limit, s.burst_limit); + error = ofproto_set_stp(ofproto, s.enable_stp); + if (error) { + ovs_fatal(error, "failed to configure STP"); + } + error = ofproto_set_remote_execution(ofproto, s.command_acl, + s.command_dir); + if (error) { + ovs_fatal(error, "failed to configure remote command execution"); + } + if (!s.discovery) { + error = ofproto_set_controller(ofproto, s.controller_name); + if (error) { + ovs_fatal(error, "failed to configure controller"); + } + } + + while (ofproto_is_alive(ofproto)) { + error = ofproto_run(ofproto); + if (error) { + ovs_fatal(error, "unrecoverable datapath error"); + } + unixctl_server_run(unixctl); + + ofproto_wait(ofproto); + unixctl_server_wait(unixctl); + poll_block(); + } + + return 0; +} + +/* User interface. */ + +static void +parse_options(int argc, char *argv[], struct ofsettings *s) +{ + enum { + OPT_DATAPATH_ID = UCHAR_MAX + 1, + OPT_MANUFACTURER, + OPT_HARDWARE, + OPT_SOFTWARE, + OPT_SERIAL, + OPT_ACCEPT_VCONN, + OPT_NO_RESOLV_CONF, + OPT_BR_NAME, + OPT_FAIL_MODE, + OPT_INACTIVITY_PROBE, + OPT_MAX_IDLE, + OPT_MAX_BACKOFF, + OPT_SNOOP, + OPT_RATE_LIMIT, + OPT_BURST_LIMIT, + OPT_BOOTSTRAP_CA_CERT, + OPT_STP, + OPT_NO_STP, + OPT_OUT_OF_BAND, + OPT_IN_BAND, + OPT_COMMAND_ACL, + OPT_COMMAND_DIR, + OPT_NETFLOW, + OPT_MGMT_ID, + VLOG_OPTION_ENUMS, + LEAK_CHECKER_OPTION_ENUMS + }; + static struct option long_options[] = { + {"datapath-id", required_argument, 0, OPT_DATAPATH_ID}, + {"manufacturer", required_argument, 0, OPT_MANUFACTURER}, + {"hardware", required_argument, 0, OPT_HARDWARE}, + {"software", required_argument, 0, OPT_SOFTWARE}, + {"serial", required_argument, 0, OPT_SERIAL}, + {"accept-vconn", required_argument, 0, OPT_ACCEPT_VCONN}, + {"no-resolv-conf", no_argument, 0, OPT_NO_RESOLV_CONF}, + {"config", required_argument, 0, 'F'}, + {"br-name", required_argument, 0, OPT_BR_NAME}, + {"fail", required_argument, 0, OPT_FAIL_MODE}, + {"inactivity-probe", required_argument, 0, OPT_INACTIVITY_PROBE}, + {"max-idle", required_argument, 0, OPT_MAX_IDLE}, + {"max-backoff", required_argument, 0, OPT_MAX_BACKOFF}, + {"listen", required_argument, 0, 'l'}, + {"snoop", required_argument, 0, OPT_SNOOP}, + {"rate-limit", optional_argument, 0, OPT_RATE_LIMIT}, + {"burst-limit", required_argument, 0, OPT_BURST_LIMIT}, + {"stp", no_argument, 0, OPT_STP}, + {"no-stp", no_argument, 0, OPT_NO_STP}, + {"out-of-band", no_argument, 0, OPT_OUT_OF_BAND}, + {"in-band", no_argument, 0, OPT_IN_BAND}, + {"command-acl", required_argument, 0, OPT_COMMAND_ACL}, + {"command-dir", required_argument, 0, OPT_COMMAND_DIR}, + {"netflow", required_argument, 0, OPT_NETFLOW}, + {"mgmt-id", required_argument, 0, OPT_MGMT_ID}, + {"verbose", optional_argument, 0, 'v'}, + {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'V'}, + DAEMON_LONG_OPTIONS, + VLOG_LONG_OPTIONS, + LEAK_CHECKER_LONG_OPTIONS, +#ifdef HAVE_OPENSSL + VCONN_SSL_LONG_OPTIONS + {"bootstrap-ca-cert", required_argument, 0, OPT_BOOTSTRAP_CA_CERT}, +#endif + {0, 0, 0, 0}, + }; + char *short_options = long_options_to_short_options(long_options); + + /* Set defaults that we can figure out before parsing options. */ + s->datapath_id = 0; + s->mfr_desc = NULL; + s->hw_desc = NULL; + s->sw_desc = NULL; + s->serial_desc = NULL; + svec_init(&s->listeners); + svec_init(&s->snoops); + s->fail_mode = FAIL_OPEN; + s->max_idle = 0; + s->probe_interval = 0; + s->max_backoff = 15; + s->update_resolv_conf = true; + s->rate_limit = 0; + s->burst_limit = 0; + s->accept_controller_re = NULL; + s->enable_stp = false; + s->in_band = true; + s->command_acl = ""; + s->command_dir = NULL; + svec_init(&s->netflow); + s->mgmt_id = 0; + for (;;) { + int c; + + c = getopt_long(argc, argv, short_options, long_options, NULL); + if (c == -1) { + break; + } + + switch (c) { + case OPT_DATAPATH_ID: + if (strlen(optarg) != 12 + || strspn(optarg, "0123456789abcdefABCDEF") != 12) { + ovs_fatal(0, "argument to --datapath-id must be " + "exactly 12 hex digits"); + } + s->datapath_id = strtoll(optarg, NULL, 16); + if (!s->datapath_id) { + ovs_fatal(0, "argument to --datapath-id must be nonzero"); + } + break; + + case OPT_MANUFACTURER: + s->mfr_desc = optarg; + break; + + case OPT_HARDWARE: + s->hw_desc = optarg; + break; + + case OPT_SOFTWARE: + s->sw_desc = optarg; + break; + + case OPT_SERIAL: + s->serial_desc = optarg; + break; + + case OPT_ACCEPT_VCONN: + s->accept_controller_re = optarg; + break; + + case OPT_NO_RESOLV_CONF: + s->update_resolv_conf = false; + break; + + case OPT_FAIL_MODE: + if (!strcmp(optarg, "open")) { + s->fail_mode = FAIL_OPEN; + } else if (!strcmp(optarg, "closed")) { + s->fail_mode = FAIL_CLOSED; + } else { + ovs_fatal(0, "-f or --fail argument must be \"open\" " + "or \"closed\""); + } + break; + + case OPT_INACTIVITY_PROBE: + s->probe_interval = atoi(optarg); + if (s->probe_interval < 5) { + ovs_fatal(0, "--inactivity-probe argument must be at least 5"); + } + break; + + case OPT_MAX_IDLE: + if (!strcmp(optarg, "permanent")) { + s->max_idle = OFP_FLOW_PERMANENT; + } else { + s->max_idle = atoi(optarg); + if (s->max_idle < 1 || s->max_idle > 65535) { + ovs_fatal(0, "--max-idle argument must be between 1 and " + "65535 or the word 'permanent'"); + } + } + break; + + case OPT_MAX_BACKOFF: + s->max_backoff = atoi(optarg); + if (s->max_backoff < 1) { + ovs_fatal(0, "--max-backoff argument must be at least 1"); + } else if (s->max_backoff > 3600) { + s->max_backoff = 3600; + } + break; + + case OPT_RATE_LIMIT: + if (optarg) { + s->rate_limit = atoi(optarg); + if (s->rate_limit < 1) { + ovs_fatal(0, "--rate-limit argument must be at least 1"); + } + } else { + s->rate_limit = 1000; + } + break; + + case OPT_BURST_LIMIT: + s->burst_limit = atoi(optarg); + if (s->burst_limit < 1) { + ovs_fatal(0, "--burst-limit argument must be at least 1"); + } + break; + + case OPT_STP: + s->enable_stp = true; + break; + + case OPT_NO_STP: + s->enable_stp = false; + break; + + case OPT_OUT_OF_BAND: + s->in_band = false; + break; + + case OPT_IN_BAND: + s->in_band = true; + break; + + case OPT_COMMAND_ACL: + s->command_acl = (s->command_acl[0] + ? xasprintf("%s,%s", s->command_acl, optarg) + : optarg); + break; + + case OPT_COMMAND_DIR: + s->command_dir = optarg; + break; + + case OPT_NETFLOW: + svec_add(&s->netflow, optarg); + break; + + case OPT_MGMT_ID: + if (strlen(optarg) != 12 + || strspn(optarg, "0123456789abcdefABCDEF") != 12) { + ovs_fatal(0, "argument to --mgmt-id must be " + "exactly 12 hex digits"); + } + s->mgmt_id = strtoll(optarg, NULL, 16); + if (!s->mgmt_id) { + ovs_fatal(0, "argument to --mgmt-id must be nonzero"); + } + break; + + case 'l': + svec_add(&s->listeners, optarg); + break; + + case OPT_SNOOP: + svec_add(&s->snoops, optarg); + break; + + case 'h': + usage(); + + case 'V': + OVS_PRINT_VERSION(OFP_VERSION, OFP_VERSION); + exit(EXIT_SUCCESS); + + DAEMON_OPTION_HANDLERS + + VLOG_OPTION_HANDLERS + + LEAK_CHECKER_OPTION_HANDLERS + +#ifdef HAVE_OPENSSL + VCONN_SSL_OPTION_HANDLERS + + case OPT_BOOTSTRAP_CA_CERT: + vconn_ssl_set_ca_cert_file(optarg, true); + break; +#endif + + case '?': + exit(EXIT_FAILURE); + + default: + abort(); + } + } + free(short_options); + + argc -= optind; + argv += optind; + if (argc < 1 || argc > 2) { + ovs_fatal(0, "need one or two non-option arguments; " + "use --help for usage"); + } + + /* Local and remote vconns. */ + s->dp_name = argv[0]; + s->controller_name = argc > 1 ? xstrdup(argv[1]) : NULL; + + /* Set accept_controller_regex. */ + if (!s->accept_controller_re) { + s->accept_controller_re = vconn_ssl_is_configured() ? "^ssl:.*" : ".*"; + } + + /* Mode of operation. */ + s->discovery = s->controller_name == NULL; + if (s->discovery && !s->in_band) { + ovs_fatal(0, "Cannot perform discovery with out-of-band control"); + } + + /* Rate limiting. */ + if (s->rate_limit && s->rate_limit < 100) { + VLOG_WARN("Rate limit set to unusually low value %d", s->rate_limit); + } +} + +static void +usage(void) +{ + printf("%s: an OpenFlow switch implementation.\n" + "usage: %s [OPTIONS] DATAPATH [CONTROLLER]\n" + "DATAPATH is a local datapath (e.g. \"dp0\").\n" + "CONTROLLER is an active OpenFlow connection method; if it is\n" + "omitted, then secchan performs controller discovery.\n", + program_name, program_name); + vconn_usage(true, true, true); + printf("\nOpenFlow options:\n" + " -d, --datapath-id=ID Use ID as the OpenFlow switch ID\n" + " (ID must consist of 12 hex digits)\n" + " --mgmt-id=ID Use ID as the management ID\n" + " (ID must consist of 12 hex digits)\n" + " --manufacturer=MFR Identify manufacturer as MFR\n" + " --hardware=HW Identify hardware as HW\n" + " --software=SW Identify software as SW\n" + " --serial=SERIAL Identify serial number as SERIAL\n" + "\nController discovery options:\n" + " --accept-vconn=REGEX accept matching discovered controllers\n" + " --no-resolv-conf do not update /etc/resolv.conf\n" + "\nNetworking options:\n" + " --fail=open|closed when controller connection fails:\n" + " closed: drop all packets\n" + " open (default): act as learning switch\n" + " --inactivity-probe=SECS time between inactivity probes\n" + " --max-idle=SECS max idle for flows set up by secchan\n" + " --max-backoff=SECS max time between controller connection\n" + " attempts (default: 15 seconds)\n" + " -l, --listen=METHOD allow management connections on METHOD\n" + " (a passive OpenFlow connection method)\n" + " --snoop=METHOD allow controller snooping on METHOD\n" + " (a passive OpenFlow connection method)\n" + " --out-of-band controller connection is out-of-band\n" + " --netflow=HOST:PORT configure NetFlow output target\n" + "\nRate-limiting of \"packet-in\" messages to the controller:\n" + " --rate-limit[=PACKETS] max rate, in packets/s (default: 1000)\n" + " --burst-limit=BURST limit on packet credit for idle time\n" + "\nRemote command execution options:\n" + " --command-acl=[!]GLOB[,[!]GLOB...] set allowed/denied commands\n" + " --command-dir=DIR set command dir (default: %s/commands)\n", + ovs_pkgdatadir); + daemon_usage(); + vlog_usage(); + printf("\nOther options:\n" + " -h, --help display this help message\n" + " -V, --version display version information\n"); + leak_checker_usage(); + exit(EXIT_SUCCESS); +} diff --git a/secchan/netflow.c b/secchan/netflow.c new file mode 100644 index 00000000..99f3eea4 --- /dev/null +++ b/secchan/netflow.c @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "netflow.h" +#include +#include +#include +#include +#include "cfg.h" +#include "flow.h" +#include "netflow.h" +#include "ofpbuf.h" +#include "ofproto.h" +#include "packets.h" +#include "socket-util.h" +#include "svec.h" +#include "timeval.h" +#include "util.h" +#include "xtoxll.h" + +#define THIS_MODULE VLM_netflow +#include "vlog.h" + +#define NETFLOW_V5_VERSION 5 + +/* Every NetFlow v5 message contains the header that follows. This is + * followed by up to thirty records that describe a terminating flow. + * We only send a single record per NetFlow message. + */ +struct netflow_v5_header { + uint16_t version; /* NetFlow version is 5. */ + uint16_t count; /* Number of records in this message. */ + uint32_t sysuptime; /* System uptime in milliseconds. */ + uint32_t unix_secs; /* Number of seconds since Unix epoch. */ + uint32_t unix_nsecs; /* Number of residual nanoseconds + after epoch seconds. */ + uint32_t flow_seq; /* Number of flows since sending + messages began. */ + uint8_t engine_type; /* Engine type. */ + uint8_t engine_id; /* Engine id. */ + uint16_t sampling_interval; /* Set to zero. */ +}; +BUILD_ASSERT_DECL(sizeof(struct netflow_v5_header) == 24); + +/* A NetFlow v5 description of a terminating flow. It is preceded by a + * NetFlow v5 header. + */ +struct netflow_v5_record { + uint32_t src_addr; /* Source IP address. */ + uint32_t dst_addr; /* Destination IP address. */ + uint32_t nexthop; /* IP address of next hop. Set to 0. */ + uint16_t input; /* Input interface index. */ + uint16_t output; /* Output interface index. */ + uint32_t packet_count; /* Number of packets. */ + uint32_t byte_count; /* Number of bytes. */ + uint32_t init_time; /* Value of sysuptime on first packet. */ + uint32_t used_time; /* Value of sysuptime on last packet. */ + + /* The 'src_port' and 'dst_port' identify the source and destination + * port, respectively, for TCP and UDP. For ICMP, the high-order + * byte identifies the type and low-order byte identifies the code + * in the 'dst_port' field. */ + uint16_t src_port; + uint16_t dst_port; + + uint8_t pad1; + uint8_t tcp_flags; /* Union of seen TCP flags. */ + uint8_t ip_proto; /* IP protocol. */ + uint8_t ip_tos; /* IP TOS value. */ + uint16_t src_as; /* Source AS ID. Set to 0. */ + uint16_t dst_as; /* Destination AS ID. Set to 0. */ + uint8_t src_mask; /* Source mask bits. Set to 0. */ + uint8_t dst_mask; /* Destination mask bits. Set to 0. */ + uint8_t pad[2]; +}; +BUILD_ASSERT_DECL(sizeof(struct netflow_v5_record) == 48); + +struct netflow { + uint8_t engine_type; /* Value of engine_type to use. */ + uint8_t engine_id; /* Value of engine_id to use. */ + long long int boot_time; /* Time when netflow_create() was called. */ + int *fds; /* Sockets for NetFlow collectors. */ + size_t n_fds; /* Number of Netflow collectors. */ + bool add_id_to_iface; /* Put the 7 least signficiant bits of + * 'engine_id' into the most signficant + * bits of the interface fields. */ + uint32_t netflow_cnt; /* Flow sequence number for NetFlow. */ + struct ofpbuf packet; /* NetFlow packet being accumulated. */ +}; + +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + +static int +open_collector(char *dst) +{ + char *save_ptr; + const char *host_name; + const char *port_string; + struct sockaddr_in sin; + int retval; + int fd; + + /* Glibc 2.7 has a bug in strtok_r when compiling with optimization that + * can cause segfaults here: + * http://sources.redhat.com/bugzilla/show_bug.cgi?id=5614. + * Using "::" instead of the obvious ":" works around it. */ + host_name = strtok_r(dst, "::", &save_ptr); + port_string = strtok_r(NULL, "::", &save_ptr); + if (!host_name) { + ovs_error(0, "%s: bad peer name format", dst); + return -EAFNOSUPPORT; + } + if (!port_string) { + ovs_error(0, "%s: bad port format", dst); + return -EAFNOSUPPORT; + } + + memset(&sin, 0, sizeof sin); + sin.sin_family = AF_INET; + if (lookup_ip(host_name, &sin.sin_addr)) { + return -ENOENT; + } + sin.sin_port = htons(atoi(port_string)); + + fd = socket(AF_INET, SOCK_DGRAM, 0); + if (fd < 0) { + VLOG_ERR("%s: socket: %s", dst, strerror(errno)); + return -errno; + } + + retval = set_nonblocking(fd); + if (retval) { + close(fd); + return -retval; + } + + retval = connect(fd, (struct sockaddr *) &sin, sizeof sin); + if (retval < 0) { + int error = errno; + VLOG_ERR("%s: connect: %s", dst, strerror(error)); + close(fd); + return -error; + } + + return fd; +} + +void +netflow_expire(struct netflow *nf, const struct ofexpired *expired) +{ + struct netflow_v5_header *nf_hdr; + struct netflow_v5_record *nf_rec; + struct timeval now; + + /* NetFlow only reports on IP packets. */ + if (expired->flow.dl_type != htons(ETH_TYPE_IP)) { + return; + } + + time_timeval(&now); + + if (!nf->packet.size) { + nf_hdr = ofpbuf_put_zeros(&nf->packet, sizeof *nf_hdr); + nf_hdr->version = htons(NETFLOW_V5_VERSION); + nf_hdr->count = htons(0); + nf_hdr->sysuptime = htonl(time_msec() - nf->boot_time); + nf_hdr->unix_secs = htonl(now.tv_sec); + nf_hdr->unix_nsecs = htonl(now.tv_usec * 1000); + nf_hdr->flow_seq = htonl(nf->netflow_cnt++); + nf_hdr->engine_type = nf->engine_type; + nf_hdr->engine_id = nf->engine_id; + nf_hdr->sampling_interval = htons(0); + } + + nf_hdr = nf->packet.data; + nf_hdr->count = htons(ntohs(nf_hdr->count) + 1); + + nf_rec = ofpbuf_put_zeros(&nf->packet, sizeof *nf_rec); + nf_rec->src_addr = expired->flow.nw_src; + nf_rec->dst_addr = expired->flow.nw_dst; + nf_rec->nexthop = htons(0); + if (nf->add_id_to_iface) { + uint16_t iface = (nf->engine_id & 0x7f) << 9; + nf_rec->input = htons(iface | (expired->flow.in_port & 0x1ff)); + nf_rec->output = htons(iface); + printf("input: %x\n", ntohs(nf_rec->input)); + } else { + nf_rec->input = htons(expired->flow.in_port); + nf_rec->output = htons(0); + } + nf_rec->packet_count = htonl(MIN(expired->packet_count, UINT32_MAX)); + nf_rec->byte_count = htonl(MIN(expired->byte_count, UINT32_MAX)); + nf_rec->init_time = htonl(expired->created - nf->boot_time); + nf_rec->used_time = htonl(MAX(expired->created, expired->used) + - nf->boot_time); + if (expired->flow.nw_proto == IP_TYPE_ICMP) { + /* In NetFlow, the ICMP type and code are concatenated and + * placed in the 'dst_port' field. */ + uint8_t type = ntohs(expired->flow.tp_src); + uint8_t code = ntohs(expired->flow.tp_dst); + nf_rec->src_port = htons(0); + nf_rec->dst_port = htons((type << 8) | code); + } else { + nf_rec->src_port = expired->flow.tp_src; + nf_rec->dst_port = expired->flow.tp_dst; + } + nf_rec->tcp_flags = expired->tcp_flags; + nf_rec->ip_proto = expired->flow.nw_proto; + nf_rec->ip_tos = expired->ip_tos; + + /* NetFlow messages are limited to 30 records. A length of 1400 + * bytes guarantees that the limit is not exceeded. */ + if (nf->packet.size >= 1400) { + netflow_run(nf); + } +} + +void +netflow_run(struct netflow *nf) +{ + size_t i; + + if (!nf->packet.size) { + return; + } + + for (i = 0; i < nf->n_fds; i++) { + if (send(nf->fds[i], nf->packet.data, nf->packet.size, 0) == -1) { + VLOG_WARN_RL(&rl, "netflow message send failed: %s", + strerror(errno)); + } + } + nf->packet.size = 0; +} + +static void +clear_collectors(struct netflow *nf) +{ + size_t i; + + for (i = 0; i < nf->n_fds; i++) { + close(nf->fds[i]); + } + free(nf->fds); + nf->fds = NULL; + nf->n_fds = 0; +} + +int +netflow_set_collectors(struct netflow *nf, const struct svec *collectors_) +{ + struct svec collectors; + int error = 0; + size_t i; + + clear_collectors(nf); + + svec_clone(&collectors, collectors_); + svec_sort_unique(&collectors); + + nf->fds = xmalloc(sizeof *nf->fds * collectors.n); + for (i = 0; i < collectors.n; i++) { + const char *name = collectors.names[i]; + char *tmpname = xstrdup(name); + int fd = open_collector(tmpname); + free(tmpname); + if (fd >= 0) { + nf->fds[nf->n_fds++] = fd; + } else { + VLOG_WARN("couldn't open connection to collector (%s), " + "ignoring %s\n", strerror(-fd), name); + if (!error) { + error = -fd; + } + } + } + + svec_destroy(&collectors); + return error; +} + +void +netflow_set_engine(struct netflow *nf, uint8_t engine_type, + uint8_t engine_id, bool add_id_to_iface) +{ + nf->engine_type = engine_type; + nf->engine_id = engine_id; + nf->add_id_to_iface = add_id_to_iface; +} + +struct netflow * +netflow_create(void) +{ + struct netflow *nf = xmalloc(sizeof *nf); + nf->engine_type = 0; + nf->engine_id = 0; + nf->boot_time = time_msec(); + nf->fds = NULL; + nf->n_fds = 0; + nf->add_id_to_iface = false; + nf->netflow_cnt = 0; + ofpbuf_init(&nf->packet, 1500); + return nf; +} + +void +netflow_destroy(struct netflow *nf) +{ + if (nf) { + ofpbuf_uninit(&nf->packet); + clear_collectors(nf); + free(nf); + } +} diff --git a/secchan/netflow.h b/secchan/netflow.h new file mode 100644 index 00000000..f37d6eff --- /dev/null +++ b/secchan/netflow.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef NETFLOW_H +#define NETFLOW_H 1 + +#include "flow.h" + +struct ofexpired; +struct svec; + +struct netflow *netflow_create(void); +void netflow_destroy(struct netflow *); +int netflow_set_collectors(struct netflow *, const struct svec *collectors); +void netflow_set_engine(struct netflow *nf, uint8_t engine_type, + uint8_t engine_id, bool add_id_to_iface); +void netflow_expire(struct netflow *, const struct ofexpired *); +void netflow_run(struct netflow *); + +#endif /* netflow.h */ diff --git a/secchan/ofproto.c b/secchan/ofproto.c new file mode 100644 index 00000000..8220fb8d --- /dev/null +++ b/secchan/ofproto.c @@ -0,0 +1,3305 @@ +/* + * Copyright (c) 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "ofproto.h" +#include +#include +#include +#include +#include +#include +#include "classifier.h" +#include "coverage.h" +#include "discovery.h" +#include "dpif.h" +#include "executer.h" +#include "fail-open.h" +#include "in-band.h" +#include "mac-learning.h" +#include "netdev.h" +#include "netflow.h" +#include "odp-util.h" +#include "ofp-print.h" +#include "ofpbuf.h" +#include "openflow/nicira-ext.h" +#include "openflow/openflow.h" +#include "openflow/openflow-mgmt.h" +#include "openvswitch/datapath-protocol.h" +#include "packets.h" +#include "pinsched.h" +#include "pktbuf.h" +#include "poll-loop.h" +#include "port-array.h" +#include "rconn.h" +#include "shash.h" +#include "status.h" +#include "stp.h" +#include "svec.h" +#include "tag.h" +#include "timeval.h" +#include "vconn.h" +#include "vconn-ssl.h" +#include "xtoxll.h" + +#define THIS_MODULE VLM_ofproto +#include "vlog.h" + +enum { + DP_GROUP_FLOOD = 0, + DP_GROUP_ALL = 1 +}; + +enum { + TABLEID_HASH = 0, + TABLEID_CLASSIFIER = 1 +}; + +struct ofport { + struct netdev *netdev; + struct ofp_phy_port opp; /* In host byte order. */ +}; + +static void ofport_free(struct ofport *); +static void hton_ofp_phy_port(struct ofp_phy_port *); + +static int xlate_actions(const union ofp_action *in, size_t n_in, + const flow_t *flow, struct ofproto *ofproto, + const struct ofpbuf *packet, + struct odp_actions *out, tag_type *tags, + bool *may_setup_flow); + +struct rule { + struct cls_rule cr; + + uint16_t idle_timeout; /* In seconds from time of last use. */ + uint16_t hard_timeout; /* In seconds from time of creation. */ + long long int used; /* Last-used time (0 if never used). */ + long long int created; /* Creation time. */ + uint64_t packet_count; /* Number of packets received. */ + uint64_t byte_count; /* Number of bytes received. */ + uint64_t accounted_bytes; /* Number of bytes passed to account_cb. */ + uint8_t tcp_flags; /* Bitwise-OR of all TCP flags seen. */ + uint8_t ip_tos; /* Last-seen IP type-of-service. */ + tag_type tags; /* Tags (set only by hooks). */ + + /* If 'super' is non-NULL, this rule is a subrule, that is, it is an + * exact-match rule (having cr.wc.wildcards of 0) generated from the + * wildcard rule 'super'. In this case, 'list' is an element of the + * super-rule's list. + * + * If 'super' is NULL, this rule is a super-rule, and 'list' is the head of + * a list of subrules. A super-rule with no wildcards (where + * cr.wc.wildcards is 0) will never have any subrules. */ + struct rule *super; + struct list list; + + /* OpenFlow actions. + * + * A subrule has no actions (it uses the super-rule's actions). */ + int n_actions; + union ofp_action *actions; + + /* Datapath actions. + * + * A super-rule with wildcard fields never has ODP actions (since the + * datapath only supports exact-match flows). */ + bool installed; /* Installed in datapath? */ + bool may_install; /* True ordinarily; false if actions must + * be reassessed for every packet. */ + int n_odp_actions; + union odp_action *odp_actions; +}; + +static inline bool +rule_is_hidden(const struct rule *rule) +{ + /* Subrules are merely an implementation detail, so hide them from the + * controller. */ + if (rule->super != NULL) { + return true; + } + + /* Rules with priority higher than UINT16_MAX are set up by secchan itself + * (e.g. by in-band control) and are intentionally hidden from the + * controller. */ + if (rule->cr.priority > UINT16_MAX) { + return true; + } + + return false; +} + +static struct rule *rule_create(struct rule *super, const union ofp_action *, + size_t n_actions, uint16_t idle_timeout, + uint16_t hard_timeout); +static void rule_free(struct rule *); +static void rule_destroy(struct ofproto *, struct rule *); +static struct rule *rule_from_cls_rule(const struct cls_rule *); +static void rule_insert(struct ofproto *, struct rule *, + struct ofpbuf *packet, uint16_t in_port); +static void rule_remove(struct ofproto *, struct rule *); +static bool rule_make_actions(struct ofproto *, struct rule *, + const struct ofpbuf *packet); +static void rule_install(struct ofproto *, struct rule *, + struct rule *displaced_rule); +static void rule_uninstall(struct ofproto *, struct rule *); +static void rule_post_uninstall(struct ofproto *, struct rule *); + +struct ofconn { + struct list node; + struct rconn *rconn; + struct pktbuf *pktbuf; + bool send_flow_exp; + int miss_send_len; + + struct rconn_packet_counter *packet_in_counter; + + /* Number of OpenFlow messages queued as replies to OpenFlow requests, and + * the maximum number before we stop reading OpenFlow requests. */ +#define OFCONN_REPLY_MAX 100 + struct rconn_packet_counter *reply_counter; +}; + +static struct ofconn *ofconn_create(struct ofproto *, struct rconn *); +static void ofconn_destroy(struct ofconn *, struct ofproto *); +static void ofconn_run(struct ofconn *, struct ofproto *); +static void ofconn_wait(struct ofconn *); +static void queue_tx(struct ofpbuf *msg, const struct ofconn *ofconn, + struct rconn_packet_counter *counter); + +struct ofproto { + /* Settings. */ + uint64_t datapath_id; /* Datapath ID. */ + uint64_t fallback_dpid; /* Datapath ID if no better choice found. */ + uint64_t mgmt_id; /* Management channel identifier. */ + char *manufacturer; /* Manufacturer. */ + char *hardware; /* Hardware. */ + char *software; /* Software version. */ + char *serial; /* Serial number. */ + + /* Datapath. */ + struct dpif dpif; + struct dpifmon *dpifmon; + struct port_array ports; /* Index is ODP port nr; ofport->opp.port_no is + * OFP port nr. */ + struct shash port_by_name; + uint32_t max_ports; + + /* Configuration. */ + struct switch_status *switch_status; + struct status_category *ss_cat; + struct in_band *in_band; + struct discovery *discovery; + struct fail_open *fail_open; + struct pinsched *miss_sched, *action_sched; + struct executer *executer; + struct netflow *netflow; + + /* Flow table. */ + struct classifier cls; + bool need_revalidate; + long long int next_expiration; + struct tag_set revalidate_set; + + /* OpenFlow connections. */ + struct list all_conns; + struct ofconn *controller; + struct pvconn **listeners; + size_t n_listeners; + struct pvconn **snoops; + size_t n_snoops; + + /* Hooks for ovs-vswitchd. */ + const struct ofhooks *ofhooks; + void *aux; + + /* Used by default ofhooks. */ + struct mac_learning *ml; +}; + +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + +static const struct ofhooks default_ofhooks; + +static uint64_t pick_datapath_id(struct dpif *, uint64_t fallback_dpid); +static uint64_t pick_fallback_dpid(void); +static void send_packet_in_miss(struct ofpbuf *, void *ofproto); +static void send_packet_in_action(struct ofpbuf *, void *ofproto); +static void update_used(struct ofproto *); +static void update_stats(struct rule *, const struct odp_flow_stats *); +static void expire_rule(struct cls_rule *, void *ofproto); +static bool revalidate_rule(struct ofproto *p, struct rule *rule); +static void revalidate_cb(struct cls_rule *rule_, void *p_); + +static void handle_odp_msg(struct ofproto *, struct ofpbuf *); + +static void handle_openflow(struct ofconn *, struct ofproto *, + struct ofpbuf *); + +static void refresh_port_group(struct ofproto *, unsigned int group); +static void update_port(struct ofproto *, const char *devname); +static int init_ports(struct ofproto *); +static void reinit_ports(struct ofproto *); + +int +ofproto_create(const char *datapath, const struct ofhooks *ofhooks, void *aux, + struct ofproto **ofprotop) +{ + struct dpifmon *dpifmon; + struct odp_stats stats; + struct ofproto *p; + struct dpif dpif; + int error; + + *ofprotop = NULL; + + /* Connect to datapath and start listening for messages. */ + error = dpif_open(datapath, &dpif); + if (error) { + VLOG_ERR("failed to open datapath %s: %s", datapath, strerror(error)); + return error; + } + error = dpif_get_dp_stats(&dpif, &stats); + if (error) { + VLOG_ERR("failed to obtain stats for datapath %s: %s", + datapath, strerror(error)); + dpif_close(&dpif); + return error; + } + error = dpif_set_listen_mask(&dpif, ODPL_MISS | ODPL_ACTION); + if (error) { + VLOG_ERR("failed to listen on datapath %s: %s", + datapath, strerror(error)); + dpif_close(&dpif); + return error; + } + dpif_flow_flush(&dpif); + dpif_purge(&dpif); + + /* Start monitoring datapath ports for status changes. */ + error = dpifmon_create(datapath, &dpifmon); + if (error) { + VLOG_ERR("failed to starting monitoring datapath %s: %s", + datapath, strerror(error)); + dpif_close(&dpif); + return error; + } + + /* Initialize settings. */ + p = xcalloc(1, sizeof *p); + p->fallback_dpid = pick_fallback_dpid(); + p->datapath_id = pick_datapath_id(&dpif, p->fallback_dpid); + VLOG_INFO("using datapath ID %012"PRIx64, p->datapath_id); + p->manufacturer = xstrdup("Nicira Networks, Inc."); + p->hardware = xstrdup("Reference Implementation"); + p->software = xstrdup(VERSION BUILDNR); + p->serial = xstrdup("None"); + + /* Initialize datapath. */ + p->dpif = dpif; + p->dpifmon = dpifmon; + port_array_init(&p->ports); + shash_init(&p->port_by_name); + p->max_ports = stats.max_ports; + + /* Initialize submodules. */ + p->switch_status = switch_status_create(p); + p->in_band = NULL; + p->discovery = NULL; + p->fail_open = NULL; + p->miss_sched = p->action_sched = NULL; + p->executer = NULL; + p->netflow = NULL; + + /* Initialize flow table. */ + classifier_init(&p->cls); + p->need_revalidate = false; + p->next_expiration = time_msec() + 1000; + tag_set_init(&p->revalidate_set); + + /* Initialize OpenFlow connections. */ + list_init(&p->all_conns); + p->controller = ofconn_create(p, rconn_create(15, 15)); + p->controller->pktbuf = pktbuf_create(); + p->controller->miss_send_len = OFP_DEFAULT_MISS_SEND_LEN; + p->listeners = NULL; + p->n_listeners = 0; + p->snoops = NULL; + p->n_snoops = 0; + + /* Initialize hooks. */ + if (ofhooks) { + p->ofhooks = ofhooks; + p->aux = aux; + p->ml = NULL; + } else { + p->ofhooks = &default_ofhooks; + p->aux = p; + p->ml = mac_learning_create(); + } + + /* Register switch status category. */ + p->ss_cat = switch_status_register(p->switch_status, "remote", + rconn_status_cb, p->controller->rconn); + + /* Almost done... */ + error = init_ports(p); + if (error) { + ofproto_destroy(p); + return error; + } + + *ofprotop = p; + return 0; +} + +void +ofproto_set_datapath_id(struct ofproto *p, uint64_t datapath_id) +{ + uint64_t old_dpid = p->datapath_id; + p->datapath_id = (datapath_id + ? datapath_id + : pick_datapath_id(&p->dpif, p->fallback_dpid)); + if (p->datapath_id != old_dpid) { + VLOG_INFO("datapath ID changed to %012"PRIx64, p->datapath_id); + rconn_reconnect(p->controller->rconn); + } +} + +void +ofproto_set_mgmt_id(struct ofproto *p, uint64_t mgmt_id) +{ + p->mgmt_id = mgmt_id; +} + +void +ofproto_set_probe_interval(struct ofproto *p, int probe_interval) +{ + probe_interval = probe_interval ? MAX(probe_interval, 5) : 0; + rconn_set_probe_interval(p->controller->rconn, probe_interval); + if (p->fail_open) { + int trigger_duration = probe_interval ? probe_interval * 3 : 15; + fail_open_set_trigger_duration(p->fail_open, trigger_duration); + } +} + +void +ofproto_set_max_backoff(struct ofproto *p, int max_backoff) +{ + rconn_set_max_backoff(p->controller->rconn, max_backoff); +} + +void +ofproto_set_desc(struct ofproto *p, + const char *manufacturer, const char *hardware, + const char *software, const char *serial) +{ + if (manufacturer) { + free(p->manufacturer); + p->manufacturer = xstrdup(manufacturer); + } + if (hardware) { + free(p->hardware); + p->hardware = xstrdup(hardware); + } + if (software) { + free(p->software); + p->software = xstrdup(software); + } + if (serial) { + free(p->serial); + p->serial = xstrdup(serial); + } +} + +int +ofproto_set_in_band(struct ofproto *p, bool in_band) +{ + if (in_band != (p->in_band != NULL)) { + if (in_band) { + return in_band_create(p, &p->dpif, p->switch_status, + p->controller->rconn, &p->in_band); + } else { + ofproto_set_discovery(p, false, NULL, true); + in_band_destroy(p->in_band); + p->in_band = NULL; + } + rconn_reconnect(p->controller->rconn); + } + return 0; +} + +int +ofproto_set_discovery(struct ofproto *p, bool discovery, + const char *re, bool update_resolv_conf) +{ + if (discovery != (p->discovery != NULL)) { + if (discovery) { + int error = ofproto_set_in_band(p, true); + if (error) { + return error; + } + error = discovery_create(re, update_resolv_conf, + &p->dpif, p->switch_status, + &p->discovery); + if (error) { + return error; + } + } else { + discovery_destroy(p->discovery); + p->discovery = NULL; + } + rconn_disconnect(p->controller->rconn); + } else if (discovery) { + discovery_set_update_resolv_conf(p->discovery, update_resolv_conf); + return discovery_set_accept_controller_re(p->discovery, re); + } + return 0; +} + +int +ofproto_set_controller(struct ofproto *ofproto, const char *controller) +{ + if (ofproto->discovery) { + return EINVAL; + } else if (controller) { + if (strcmp(rconn_get_name(ofproto->controller->rconn), controller)) { + return rconn_connect(ofproto->controller->rconn, controller); + } else { + return 0; + } + } else { + rconn_disconnect(ofproto->controller->rconn); + return 0; + } +} + +static int +set_pvconns(struct pvconn ***pvconnsp, size_t *n_pvconnsp, + const struct svec *svec) +{ + struct pvconn **pvconns = *pvconnsp; + size_t n_pvconns = *n_pvconnsp; + int retval = 0; + size_t i; + + for (i = 0; i < n_pvconns; i++) { + pvconn_close(pvconns[i]); + } + free(pvconns); + + pvconns = xmalloc(svec->n * sizeof *pvconns); + n_pvconns = 0; + for (i = 0; i < svec->n; i++) { + const char *name = svec->names[i]; + struct pvconn *pvconn; + int error; + + error = pvconn_open(name, &pvconn); + if (!error) { + pvconns[n_pvconns++] = pvconn; + } else { + VLOG_ERR("failed to listen on %s: %s", name, strerror(error)); + if (!retval) { + retval = error; + } + } + } + + *pvconnsp = pvconns; + *n_pvconnsp = n_pvconns; + + return retval; +} + +int +ofproto_set_listeners(struct ofproto *ofproto, const struct svec *listeners) +{ + return set_pvconns(&ofproto->listeners, &ofproto->n_listeners, listeners); +} + +int +ofproto_set_snoops(struct ofproto *ofproto, const struct svec *snoops) +{ + return set_pvconns(&ofproto->snoops, &ofproto->n_snoops, snoops); +} + +int +ofproto_set_netflow(struct ofproto *ofproto, const struct svec *collectors, + uint8_t engine_type, uint8_t engine_id, bool add_id_to_iface) +{ + if (collectors && collectors->n) { + if (!ofproto->netflow) { + ofproto->netflow = netflow_create(); + } + netflow_set_engine(ofproto->netflow, engine_type, engine_id, + add_id_to_iface); + return netflow_set_collectors(ofproto->netflow, collectors); + } else { + netflow_destroy(ofproto->netflow); + ofproto->netflow = NULL; + return 0; + } +} + +void +ofproto_set_failure(struct ofproto *ofproto, bool fail_open) +{ + if (fail_open) { + struct rconn *rconn = ofproto->controller->rconn; + int trigger_duration = rconn_get_probe_interval(rconn) * 3; + if (!ofproto->fail_open) { + ofproto->fail_open = fail_open_create(ofproto, trigger_duration, + ofproto->switch_status, + rconn); + } else { + fail_open_set_trigger_duration(ofproto->fail_open, + trigger_duration); + } + } else { + fail_open_destroy(ofproto->fail_open); + ofproto->fail_open = NULL; + } +} + +void +ofproto_set_rate_limit(struct ofproto *ofproto, + int rate_limit, int burst_limit) +{ + if (rate_limit > 0) { + if (!ofproto->miss_sched) { + ofproto->miss_sched = pinsched_create(rate_limit, burst_limit, + ofproto->switch_status); + ofproto->action_sched = pinsched_create(rate_limit, burst_limit, + NULL); + } else { + pinsched_set_limits(ofproto->miss_sched, rate_limit, burst_limit); + pinsched_set_limits(ofproto->action_sched, + rate_limit, burst_limit); + } + } else { + pinsched_destroy(ofproto->miss_sched); + ofproto->miss_sched = NULL; + pinsched_destroy(ofproto->action_sched); + ofproto->action_sched = NULL; + } +} + +int +ofproto_set_stp(struct ofproto *ofproto UNUSED, bool enable_stp) +{ + /* XXX */ + if (enable_stp) { + VLOG_WARN("STP is not yet implemented"); + return EINVAL; + } else { + return 0; + } +} + +int +ofproto_set_remote_execution(struct ofproto *ofproto, const char *command_acl, + const char *command_dir) +{ + if (command_acl) { + if (!ofproto->executer) { + return executer_create(command_acl, command_dir, + &ofproto->executer); + } else { + executer_set_acl(ofproto->executer, command_acl, command_dir); + } + } else { + executer_destroy(ofproto->executer); + ofproto->executer = NULL; + } + return 0; +} + +uint64_t +ofproto_get_datapath_id(const struct ofproto *ofproto) +{ + return ofproto->datapath_id; +} + +int +ofproto_get_probe_interval(const struct ofproto *ofproto) +{ + return rconn_get_probe_interval(ofproto->controller->rconn); +} + +int +ofproto_get_max_backoff(const struct ofproto *ofproto) +{ + return rconn_get_max_backoff(ofproto->controller->rconn); +} + +bool +ofproto_get_in_band(const struct ofproto *ofproto) +{ + return ofproto->in_band != NULL; +} + +bool +ofproto_get_discovery(const struct ofproto *ofproto) +{ + return ofproto->discovery != NULL; +} + +const char * +ofproto_get_controller(const struct ofproto *ofproto) +{ + return rconn_get_name(ofproto->controller->rconn); +} + +void +ofproto_get_listeners(const struct ofproto *ofproto, struct svec *listeners) +{ + size_t i; + + for (i = 0; i < ofproto->n_listeners; i++) { + svec_add(listeners, pvconn_get_name(ofproto->listeners[i])); + } +} + +void +ofproto_get_snoops(const struct ofproto *ofproto, struct svec *snoops) +{ + size_t i; + + for (i = 0; i < ofproto->n_snoops; i++) { + svec_add(snoops, pvconn_get_name(ofproto->snoops[i])); + } +} + +void +ofproto_destroy(struct ofproto *p) +{ + struct ofconn *ofconn, *next_ofconn; + struct ofport *ofport; + unsigned int port_no; + size_t i; + + if (!p) { + return; + } + + ofproto_flush_flows(p); + classifier_destroy(&p->cls); + + LIST_FOR_EACH_SAFE (ofconn, next_ofconn, struct ofconn, node, + &p->all_conns) { + ofconn_destroy(ofconn, p); + } + + dpif_close(&p->dpif); + dpifmon_destroy(p->dpifmon); + PORT_ARRAY_FOR_EACH (ofport, &p->ports, port_no) { + ofport_free(ofport); + } + shash_destroy(&p->port_by_name); + + switch_status_destroy(p->switch_status); + in_band_destroy(p->in_band); + discovery_destroy(p->discovery); + fail_open_destroy(p->fail_open); + pinsched_destroy(p->miss_sched); + pinsched_destroy(p->action_sched); + executer_destroy(p->executer); + netflow_destroy(p->netflow); + + switch_status_unregister(p->ss_cat); + + for (i = 0; i < p->n_listeners; i++) { + pvconn_close(p->listeners[i]); + } + free(p->listeners); + + for (i = 0; i < p->n_snoops; i++) { + pvconn_close(p->snoops[i]); + } + free(p->snoops); + + mac_learning_destroy(p->ml); + + free(p); +} + +int +ofproto_run(struct ofproto *p) +{ + int error = ofproto_run1(p); + if (!error) { + error = ofproto_run2(p, false); + } + return error; +} + +int +ofproto_run1(struct ofproto *p) +{ + struct ofconn *ofconn, *next_ofconn; + char *devname; + int error; + int i; + + for (i = 0; i < 50; i++) { + struct ofpbuf *buf; + int error; + + error = dpif_recv(&p->dpif, &buf); + if (error) { + if (error == ENODEV) { + /* Someone destroyed the datapath behind our back. The caller + * better destroy us and give up, because we're just going to + * spin from here on out. */ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_ERR_RL(&rl, "dp%u: datapath was destroyed externally", + dpif_id(&p->dpif)); + return ENODEV; + } + break; + } + + handle_odp_msg(p, buf); + } + + while ((error = dpifmon_poll(p->dpifmon, &devname)) != EAGAIN) { + if (error == ENOBUFS) { + reinit_ports(p); + } else if (!error) { + update_port(p, devname); + free(devname); + } + } + + if (p->in_band) { + in_band_run(p->in_band); + } + if (p->discovery) { + char *controller_name; + if (rconn_is_connectivity_questionable(p->controller->rconn)) { + discovery_question_connectivity(p->discovery); + } + if (discovery_run(p->discovery, &controller_name)) { + if (controller_name) { + rconn_connect(p->controller->rconn, controller_name); + } else { + rconn_disconnect(p->controller->rconn); + } + } + } + if (p->fail_open) { + fail_open_run(p->fail_open); + } + pinsched_run(p->miss_sched, send_packet_in_miss, p); + pinsched_run(p->action_sched, send_packet_in_action, p); + if (p->executer) { + executer_run(p->executer); + } + + LIST_FOR_EACH_SAFE (ofconn, next_ofconn, struct ofconn, node, + &p->all_conns) { + ofconn_run(ofconn, p); + } + + for (i = 0; i < p->n_listeners; i++) { + struct vconn *vconn; + int retval; + + retval = pvconn_accept(p->listeners[i], OFP_VERSION, &vconn); + if (!retval) { + ofconn_create(p, rconn_new_from_vconn("passive", vconn)); + } else if (retval != EAGAIN) { + VLOG_WARN_RL(&rl, "accept failed (%s)", strerror(retval)); + } + } + + for (i = 0; i < p->n_snoops; i++) { + struct vconn *vconn; + int retval; + + retval = pvconn_accept(p->snoops[i], OFP_VERSION, &vconn); + if (!retval) { + rconn_add_monitor(p->controller->rconn, vconn); + } else if (retval != EAGAIN) { + VLOG_WARN_RL(&rl, "accept failed (%s)", strerror(retval)); + } + } + + if (time_msec() >= p->next_expiration) { + COVERAGE_INC(ofproto_expiration); + p->next_expiration = time_msec() + 1000; + update_used(p); + + classifier_for_each(&p->cls, CLS_INC_ALL, expire_rule, p); + + /* Let the hook know that we're at a stable point: all outstanding data + * in existing flows has been accounted to the account_cb. Thus, the + * hook can now reasonably do operations that depend on having accurate + * flow volume accounting (currently, that's just bond rebalancing). */ + if (p->ofhooks->account_checkpoint_cb) { + p->ofhooks->account_checkpoint_cb(p->aux); + } + } + + if (p->netflow) { + netflow_run(p->netflow); + } + + return 0; +} + +struct revalidate_cbdata { + struct ofproto *ofproto; + bool revalidate_all; /* Revalidate all exact-match rules? */ + bool revalidate_subrules; /* Revalidate all exact-match subrules? */ + struct tag_set revalidate_set; /* Set of tags to revalidate. */ +}; + +int +ofproto_run2(struct ofproto *p, bool revalidate_all) +{ + if (p->need_revalidate || revalidate_all + || !tag_set_is_empty(&p->revalidate_set)) { + struct revalidate_cbdata cbdata; + cbdata.ofproto = p; + cbdata.revalidate_all = revalidate_all; + cbdata.revalidate_subrules = p->need_revalidate; + cbdata.revalidate_set = p->revalidate_set; + tag_set_init(&p->revalidate_set); + COVERAGE_INC(ofproto_revalidate); + classifier_for_each(&p->cls, CLS_INC_EXACT, revalidate_cb, &cbdata); + p->need_revalidate = false; + } + + return 0; +} + +void +ofproto_wait(struct ofproto *p) +{ + struct ofconn *ofconn; + size_t i; + + dpif_recv_wait(&p->dpif); + dpifmon_wait(p->dpifmon); + LIST_FOR_EACH (ofconn, struct ofconn, node, &p->all_conns) { + ofconn_wait(ofconn); + } + if (p->in_band) { + in_band_wait(p->in_band); + } + if (p->discovery) { + discovery_wait(p->discovery); + } + if (p->fail_open) { + fail_open_wait(p->fail_open); + } + pinsched_wait(p->miss_sched); + pinsched_wait(p->action_sched); + if (p->executer) { + executer_wait(p->executer); + } + if (!tag_set_is_empty(&p->revalidate_set)) { + poll_immediate_wake(); + } + if (p->need_revalidate) { + /* Shouldn't happen, but if it does just go around again. */ + VLOG_DBG_RL(&rl, "need revalidate in ofproto_wait_cb()"); + poll_immediate_wake(); + } else if (p->next_expiration != LLONG_MAX) { + poll_timer_wait(p->next_expiration - time_msec()); + } + for (i = 0; i < p->n_listeners; i++) { + pvconn_wait(p->listeners[i]); + } + for (i = 0; i < p->n_snoops; i++) { + pvconn_wait(p->snoops[i]); + } +} + +void +ofproto_revalidate(struct ofproto *ofproto, tag_type tag) +{ + tag_set_add(&ofproto->revalidate_set, tag); +} + +struct tag_set * +ofproto_get_revalidate_set(struct ofproto *ofproto) +{ + return &ofproto->revalidate_set; +} + +bool +ofproto_is_alive(const struct ofproto *p) +{ + return p->discovery || rconn_is_alive(p->controller->rconn); +} + +int +ofproto_send_packet(struct ofproto *p, const flow_t *flow, + const union ofp_action *actions, size_t n_actions, + const struct ofpbuf *packet) +{ + struct odp_actions odp_actions; + int error; + + error = xlate_actions(actions, n_actions, flow, p, packet, &odp_actions, + NULL, NULL); + if (error) { + return error; + } + + /* XXX Should we translate the dpif_execute() errno value into an OpenFlow + * error code? */ + dpif_execute(&p->dpif, flow->in_port, odp_actions.actions, + odp_actions.n_actions, packet); + return 0; +} + +void +ofproto_add_flow(struct ofproto *p, + const flow_t *flow, uint32_t wildcards, unsigned int priority, + const union ofp_action *actions, size_t n_actions, + int idle_timeout) +{ + struct rule *rule; + rule = rule_create(NULL, actions, n_actions, + idle_timeout >= 0 ? idle_timeout : 5 /* XXX */, 0); + cls_rule_from_flow(&rule->cr, flow, wildcards, priority); + rule_insert(p, rule, NULL, 0); +} + +void +ofproto_delete_flow(struct ofproto *ofproto, const flow_t *flow, + uint32_t wildcards, unsigned int priority) +{ + struct rule *rule; + + rule = rule_from_cls_rule(classifier_find_rule_exactly(&ofproto->cls, + flow, wildcards, + priority)); + if (rule) { + rule_remove(ofproto, rule); + } +} + +static void +destroy_rule(struct cls_rule *rule_, void *ofproto_) +{ + struct rule *rule = rule_from_cls_rule(rule_); + struct ofproto *ofproto = ofproto_; + + /* Mark the flow as not installed, even though it might really be + * installed, so that rule_remove() doesn't bother trying to uninstall it. + * There is no point in uninstalling it individually since we are about to + * blow away all the flows with dpif_flow_flush(). */ + rule->installed = false; + + rule_remove(ofproto, rule); +} + +void +ofproto_flush_flows(struct ofproto *ofproto) +{ + COVERAGE_INC(ofproto_flush); + classifier_for_each(&ofproto->cls, CLS_INC_ALL, destroy_rule, ofproto); + dpif_flow_flush(&ofproto->dpif); + if (ofproto->in_band) { + in_band_flushed(ofproto->in_band); + } + if (ofproto->fail_open) { + fail_open_flushed(ofproto->fail_open); + } +} + +static void +reinit_ports(struct ofproto *p) +{ + struct svec devnames; + struct ofport *ofport; + unsigned int port_no; + struct odp_port *odp_ports; + size_t n_odp_ports; + size_t i; + + svec_init(&devnames); + PORT_ARRAY_FOR_EACH (ofport, &p->ports, port_no) { + svec_add (&devnames, (char *) ofport->opp.name); + } + dpif_port_list(&p->dpif, &odp_ports, &n_odp_ports); + for (i = 0; i < n_odp_ports; i++) { + svec_add (&devnames, odp_ports[i].devname); + } + free(odp_ports); + + svec_sort_unique(&devnames); + for (i = 0; i < devnames.n; i++) { + update_port(p, devnames.names[i]); + } + svec_destroy(&devnames); +} + +static void +refresh_port_group(struct ofproto *p, unsigned int group) +{ + uint16_t *ports; + size_t n_ports; + struct ofport *port; + unsigned int port_no; + + assert(group == DP_GROUP_ALL || group == DP_GROUP_FLOOD); + + ports = xmalloc(port_array_count(&p->ports) * sizeof *ports); + n_ports = 0; + PORT_ARRAY_FOR_EACH (port, &p->ports, port_no) { + if (group == DP_GROUP_ALL || !(port->opp.config & OFPPC_NO_FLOOD)) { + ports[n_ports++] = port_no; + } + } + dpif_port_group_set(&p->dpif, group, ports, n_ports); + free(ports); +} + +static void +refresh_port_groups(struct ofproto *p) +{ + refresh_port_group(p, DP_GROUP_FLOOD); + refresh_port_group(p, DP_GROUP_ALL); +} + +static struct ofport * +make_ofport(const struct odp_port *odp_port) +{ + enum netdev_flags flags; + struct ofport *ofport; + struct netdev *netdev; + bool carrier; + int error; + + error = netdev_open(odp_port->devname, NETDEV_ETH_TYPE_NONE, &netdev); + if (error) { + VLOG_WARN_RL(&rl, "ignoring port %s (%"PRIu16") because netdev %s " + "cannot be opened (%s)", + odp_port->devname, odp_port->port, + odp_port->devname, strerror(error)); + return NULL; + } + + ofport = xmalloc(sizeof *ofport); + ofport->netdev = netdev; + ofport->opp.port_no = odp_port_to_ofp_port(odp_port->port); + memcpy(ofport->opp.hw_addr, netdev_get_etheraddr(netdev), ETH_ALEN); + memcpy(ofport->opp.name, odp_port->devname, + MIN(sizeof ofport->opp.name, sizeof odp_port->devname)); + ofport->opp.name[sizeof ofport->opp.name - 1] = '\0'; + + netdev_get_flags(netdev, &flags); + ofport->opp.config = flags & NETDEV_UP ? 0 : OFPPC_PORT_DOWN; + + netdev_get_carrier(netdev, &carrier); + ofport->opp.state = carrier ? 0 : OFPPS_LINK_DOWN; + + netdev_get_features(netdev, + &ofport->opp.curr, &ofport->opp.advertised, + &ofport->opp.supported, &ofport->opp.peer); + return ofport; +} + +static bool +ofport_conflicts(const struct ofproto *p, const struct odp_port *odp_port) +{ + if (port_array_get(&p->ports, odp_port->port)) { + VLOG_WARN_RL(&rl, "ignoring duplicate port %"PRIu16" in datapath", + odp_port->port); + return true; + } else if (shash_find(&p->port_by_name, odp_port->devname)) { + VLOG_WARN_RL(&rl, "ignoring duplicate device %s in datapath", + odp_port->devname); + return true; + } else { + return false; + } +} + +static int +ofport_equal(const struct ofport *a_, const struct ofport *b_) +{ + const struct ofp_phy_port *a = &a_->opp; + const struct ofp_phy_port *b = &b_->opp; + + BUILD_ASSERT_DECL(sizeof *a == 48); /* Detect ofp_phy_port changes. */ + return (a->port_no == b->port_no + && !memcmp(a->hw_addr, b->hw_addr, sizeof a->hw_addr) + && !strcmp((char *) a->name, (char *) b->name) + && a->state == b->state + && a->config == b->config + && a->curr == b->curr + && a->advertised == b->advertised + && a->supported == b->supported + && a->peer == b->peer); +} + +static void +send_port_status(struct ofproto *p, const struct ofport *ofport, + uint8_t reason) +{ + /* XXX Should limit the number of queued port status change messages. */ + struct ofconn *ofconn; + LIST_FOR_EACH (ofconn, struct ofconn, node, &p->all_conns) { + struct ofp_port_status *ops; + struct ofpbuf *b; + + ops = make_openflow_xid(sizeof *ops, OFPT_PORT_STATUS, 0, &b); + ops->reason = reason; + ops->desc = ofport->opp; + hton_ofp_phy_port(&ops->desc); + queue_tx(b, ofconn, NULL); + } + if (p->ofhooks->port_changed_cb) { + p->ofhooks->port_changed_cb(reason, &ofport->opp, p->aux); + } +} + +static void +ofport_install(struct ofproto *p, struct ofport *ofport) +{ + port_array_set(&p->ports, ofp_port_to_odp_port(ofport->opp.port_no), + ofport); + shash_add(&p->port_by_name, (char *) ofport->opp.name, ofport); +} + +static void +ofport_remove(struct ofproto *p, struct ofport *ofport) +{ + port_array_set(&p->ports, ofp_port_to_odp_port(ofport->opp.port_no), NULL); + shash_delete(&p->port_by_name, + shash_find(&p->port_by_name, (char *) ofport->opp.name)); +} + +static void +ofport_free(struct ofport *ofport) +{ + if (ofport) { + netdev_close(ofport->netdev); + free(ofport); + } +} + +static void +update_port(struct ofproto *p, const char *devname) +{ + struct odp_port odp_port; + struct ofport *ofport; + int error; + + COVERAGE_INC(ofproto_update_port); + ofport = shash_find_data(&p->port_by_name, devname); + error = dpif_port_query_by_name(&p->dpif, devname, &odp_port); + if (!error) { + if (!ofport) { + /* New port. */ + if (!ofport_conflicts(p, &odp_port)) { + ofport = make_ofport(&odp_port); + if (ofport) { + ofport_install(p, ofport); + send_port_status(p, ofport, OFPPR_ADD); + } + } + } else { + /* Modified port. */ + struct ofport *new_ofport = make_ofport(&odp_port); + if (!new_ofport) { + return; + } + + new_ofport->opp.config &= OFPPC_PORT_DOWN; + new_ofport->opp.config |= ofport->opp.config & ~OFPPC_PORT_DOWN; + if (ofport_equal(ofport, new_ofport)) { + /* False alarm--no change. */ + ofport_free(new_ofport); + } else { + ofport_remove(p, ofport); + ofport_install(p, new_ofport); + ofport_free(ofport); + send_port_status(p, new_ofport, OFPPR_MODIFY); + } + } + } else if (error == ENOENT || error == ENODEV) { + /* Deleted port. */ + if (ofport) { + send_port_status(p, ofport, OFPPR_DELETE); + ofport_remove(p, ofport); + ofport_free(ofport); + } + } else { + VLOG_WARN_RL(&rl, "dpif_port_query_by_name returned unexpected error " + "%s", strerror(error)); + return; + } + refresh_port_groups(p); +} + +static int +init_ports(struct ofproto *p) +{ + struct odp_port *ports; + size_t n_ports; + size_t i; + int error; + + error = dpif_port_list(&p->dpif, &ports, &n_ports); + if (error) { + return error; + } + + for (i = 0; i < n_ports; i++) { + const struct odp_port *odp_port = &ports[i]; + if (!ofport_conflicts(p, odp_port)) { + struct ofport *ofport = make_ofport(odp_port); + if (ofport) { + ofport_install(p, ofport); + } + } + } + free(ports); + refresh_port_groups(p); + return 0; +} + +static struct ofconn * +ofconn_create(struct ofproto *p, struct rconn *rconn) +{ + struct ofconn *ofconn = xmalloc(sizeof *ofconn); + list_push_back(&p->all_conns, &ofconn->node); + ofconn->rconn = rconn; + ofconn->pktbuf = NULL; + ofconn->send_flow_exp = false; + ofconn->miss_send_len = 0; + ofconn->packet_in_counter = rconn_packet_counter_create (); + ofconn->reply_counter = rconn_packet_counter_create (); + return ofconn; +} + +static void +ofconn_destroy(struct ofconn *ofconn, struct ofproto *p) +{ + if (p->executer) { + executer_rconn_closing(p->executer, ofconn->rconn); + } + + list_remove(&ofconn->node); + rconn_destroy(ofconn->rconn); + rconn_packet_counter_destroy(ofconn->packet_in_counter); + rconn_packet_counter_destroy(ofconn->reply_counter); + pktbuf_destroy(ofconn->pktbuf); + free(ofconn); +} + +static void +ofconn_run(struct ofconn *ofconn, struct ofproto *p) +{ + int iteration; + + rconn_run(ofconn->rconn); + + if (rconn_packet_counter_read (ofconn->reply_counter) < OFCONN_REPLY_MAX) { + /* Limit the number of iterations to prevent other tasks from + * starving. */ + for (iteration = 0; iteration < 50; iteration++) { + struct ofpbuf *of_msg = rconn_recv(ofconn->rconn); + if (!of_msg) { + break; + } + handle_openflow(ofconn, p, of_msg); + ofpbuf_delete(of_msg); + } + } + + if (ofconn != p->controller && !rconn_is_alive(ofconn->rconn)) { + ofconn_destroy(ofconn, p); + } +} + +static void +ofconn_wait(struct ofconn *ofconn) +{ + rconn_run_wait(ofconn->rconn); + if (rconn_packet_counter_read (ofconn->reply_counter) < OFCONN_REPLY_MAX) { + rconn_recv_wait(ofconn->rconn); + } else { + COVERAGE_INC(ofproto_ofconn_stuck); + } +} + +/* Caller is responsible for initializing the 'cr' member of the returned + * rule. */ +static struct rule * +rule_create(struct rule *super, + const union ofp_action *actions, size_t n_actions, + uint16_t idle_timeout, uint16_t hard_timeout) +{ + struct rule *rule = xcalloc(1, sizeof *rule); + rule->idle_timeout = idle_timeout; + rule->hard_timeout = hard_timeout; + rule->used = rule->created = time_msec(); + rule->super = super; + if (super) { + list_push_back(&super->list, &rule->list); + } else { + list_init(&rule->list); + } + rule->n_actions = n_actions; + rule->actions = xmemdup(actions, n_actions * sizeof *actions); + return rule; +} + +static struct rule * +rule_from_cls_rule(const struct cls_rule *cls_rule) +{ + return cls_rule ? CONTAINER_OF(cls_rule, struct rule, cr) : NULL; +} + +static void +rule_free(struct rule *rule) +{ + free(rule->actions); + free(rule->odp_actions); + free(rule); +} + +/* Destroys 'rule'. If 'rule' is a subrule, also removes it from its + * super-rule's list of subrules. If 'rule' is a super-rule, also iterates + * through all of its subrules and revalidates them, destroying any that no + * longer has a super-rule (which is probably all of them). + * + * Before calling this function, the caller must make have removed 'rule' from + * the classifier. If 'rule' is an exact-match rule, the caller is also + * responsible for ensuring that it has been uninstalled from the datapath. */ +static void +rule_destroy(struct ofproto *ofproto, struct rule *rule) +{ + if (!rule->super) { + struct rule *subrule, *next; + LIST_FOR_EACH_SAFE (subrule, next, struct rule, list, &rule->list) { + revalidate_rule(ofproto, subrule); + } + } else { + list_remove(&rule->list); + } + rule_free(rule); +} + +static bool +rule_has_out_port(const struct rule *rule, uint16_t out_port) +{ + const union ofp_action *oa; + struct actions_iterator i; + + if (out_port == htons(OFPP_NONE)) { + return true; + } + for (oa = actions_first(&i, rule->actions, rule->n_actions); oa; + oa = actions_next(&i)) { + if (oa->type == htons(OFPAT_OUTPUT) && oa->output.port == out_port) { + return true; + } + } + return false; +} + +/* Executes the actions indicated by 'rule' on 'packet', which is in flow + * 'flow' and is considered to have arrived on ODP port 'in_port'. + * + * The flow that 'packet' actually contains does not need to actually match + * 'rule'; the actions in 'rule' will be applied to it either way. Likewise, + * the packet and byte counters for 'rule' will be credited for the packet sent + * out whether or not the packet actually matches 'rule'. + * + * If 'rule' is an exact-match rule and 'flow' actually equals the rule's flow, + * the caller must already have accurately composed ODP actions for it given + * 'packet' using rule_make_actions(). If 'rule' is a wildcard rule, or if + * 'rule' is an exact-match rule but 'flow' is not the rule's flow, then this + * function will compose a set of ODP actions based on 'rule''s OpenFlow + * actions and apply them to 'packet'. */ +static void +rule_execute(struct ofproto *ofproto, struct rule *rule, + struct ofpbuf *packet, const flow_t *flow) +{ + const union odp_action *actions; + size_t n_actions; + struct odp_actions a; + + /* Grab or compose the ODP actions. + * + * The special case for an exact-match 'rule' where 'flow' is not the + * rule's flow is important to avoid, e.g., sending a packet out its input + * port simply because the ODP actions were composed for the wrong + * scenario. */ + if (rule->cr.wc.wildcards || !flow_equal(flow, &rule->cr.flow)) { + struct rule *super = rule->super ? rule->super : rule; + if (xlate_actions(super->actions, super->n_actions, flow, ofproto, + packet, &a, NULL, 0)) { + return; + } + actions = a.actions; + n_actions = a.n_actions; + } else { + actions = rule->odp_actions; + n_actions = rule->n_odp_actions; + } + + /* Execute the ODP actions. */ + if (!dpif_execute(&ofproto->dpif, flow->in_port, + actions, n_actions, packet)) { + struct odp_flow_stats stats; + flow_extract_stats(flow, packet, &stats); + update_stats(rule, &stats); + rule->used = time_msec(); + } +} + +static void +rule_insert(struct ofproto *p, struct rule *rule, struct ofpbuf *packet, + uint16_t in_port) +{ + struct rule *displaced_rule; + + /* Insert the rule in the classifier. */ + displaced_rule = rule_from_cls_rule(classifier_insert(&p->cls, &rule->cr)); + if (!rule->cr.wc.wildcards) { + rule_make_actions(p, rule, packet); + } + + /* Send the packet and credit it to the rule. */ + if (packet) { + flow_t flow; + flow_extract(packet, in_port, &flow); + rule_execute(p, rule, packet, &flow); + } + + /* Install the rule in the datapath only after sending the packet, to + * avoid packet reordering. */ + if (rule->cr.wc.wildcards) { + COVERAGE_INC(ofproto_add_wc_flow); + p->need_revalidate = true; + } else { + rule_install(p, rule, displaced_rule); + } + + /* Free the rule that was displaced, if any. */ + if (displaced_rule) { + rule_destroy(p, displaced_rule); + } +} + +static struct rule * +rule_create_subrule(struct ofproto *ofproto, struct rule *rule, + const flow_t *flow) +{ + struct rule *subrule = rule_create(rule, NULL, 0, + rule->idle_timeout, rule->hard_timeout); + COVERAGE_INC(ofproto_subrule_create); + cls_rule_from_flow(&subrule->cr, flow, 0, + (rule->cr.priority <= UINT16_MAX ? UINT16_MAX + : rule->cr.priority)); + classifier_insert_exact(&ofproto->cls, &subrule->cr); + + return subrule; +} + +static void +rule_remove(struct ofproto *ofproto, struct rule *rule) +{ + if (rule->cr.wc.wildcards) { + COVERAGE_INC(ofproto_del_wc_flow); + ofproto->need_revalidate = true; + } else { + rule_uninstall(ofproto, rule); + } + classifier_remove(&ofproto->cls, &rule->cr); + rule_destroy(ofproto, rule); +} + +/* Returns true if the actions changed, false otherwise. */ +static bool +rule_make_actions(struct ofproto *p, struct rule *rule, + const struct ofpbuf *packet) +{ + const struct rule *super; + struct odp_actions a; + size_t actions_len; + + assert(!rule->cr.wc.wildcards); + + super = rule->super ? rule->super : rule; + rule->tags = 0; + xlate_actions(super->actions, super->n_actions, &rule->cr.flow, p, + packet, &a, &rule->tags, &rule->may_install); + + actions_len = a.n_actions * sizeof *a.actions; + if (rule->n_odp_actions != a.n_actions + || memcmp(rule->odp_actions, a.actions, actions_len)) { + COVERAGE_INC(ofproto_odp_unchanged); + free(rule->odp_actions); + rule->n_odp_actions = a.n_actions; + rule->odp_actions = xmemdup(a.actions, actions_len); + return true; + } else { + return false; + } +} + +static int +do_put_flow(struct ofproto *ofproto, struct rule *rule, int flags, + struct odp_flow_put *put) +{ + memset(&put->flow.stats, 0, sizeof put->flow.stats); + put->flow.key = rule->cr.flow; + put->flow.actions = rule->odp_actions; + put->flow.n_actions = rule->n_odp_actions; + put->flags = flags; + return dpif_flow_put(&ofproto->dpif, put); +} + +static void +rule_install(struct ofproto *p, struct rule *rule, struct rule *displaced_rule) +{ + assert(!rule->cr.wc.wildcards); + + if (rule->may_install) { + struct odp_flow_put put; + if (!do_put_flow(p, rule, + ODPPF_CREATE | ODPPF_MODIFY | ODPPF_ZERO_STATS, + &put)) { + rule->installed = true; + if (displaced_rule) { + update_stats(rule, &put.flow.stats); + rule_post_uninstall(p, displaced_rule); + } + } + } else if (displaced_rule) { + rule_uninstall(p, displaced_rule); + } +} + +static void +rule_reinstall(struct ofproto *ofproto, struct rule *rule) +{ + if (rule->installed) { + struct odp_flow_put put; + COVERAGE_INC(ofproto_dp_missed); + do_put_flow(ofproto, rule, ODPPF_CREATE | ODPPF_MODIFY, &put); + } else { + rule_install(ofproto, rule, NULL); + } +} + +static void +rule_update_actions(struct ofproto *ofproto, struct rule *rule) +{ + bool actions_changed = rule_make_actions(ofproto, rule, NULL); + if (rule->may_install) { + if (rule->installed) { + if (actions_changed) { + /* XXX should really do rule_post_uninstall() for the *old* set + * of actions, and distinguish the old stats from the new. */ + struct odp_flow_put put; + do_put_flow(ofproto, rule, ODPPF_CREATE | ODPPF_MODIFY, &put); + } + } else { + rule_install(ofproto, rule, NULL); + } + } else { + rule_uninstall(ofproto, rule); + } +} + +static void +rule_account(struct ofproto *ofproto, struct rule *rule, uint64_t extra_bytes) +{ + uint64_t total_bytes = rule->byte_count + extra_bytes; + + if (ofproto->ofhooks->account_flow_cb + && total_bytes > rule->accounted_bytes) + { + ofproto->ofhooks->account_flow_cb( + &rule->cr.flow, rule->odp_actions, rule->n_odp_actions, + total_bytes - rule->accounted_bytes, ofproto->aux); + rule->accounted_bytes = total_bytes; + } +} + +static void +rule_uninstall(struct ofproto *p, struct rule *rule) +{ + assert(!rule->cr.wc.wildcards); + if (rule->installed) { + struct odp_flow odp_flow; + + odp_flow.key = rule->cr.flow; + odp_flow.actions = NULL; + odp_flow.n_actions = 0; + if (!dpif_flow_del(&p->dpif, &odp_flow)) { + update_stats(rule, &odp_flow.stats); + } + rule->installed = false; + + rule_post_uninstall(p, rule); + } +} + +static void +rule_post_uninstall(struct ofproto *ofproto, struct rule *rule) +{ + struct rule *super = rule->super; + + rule_account(ofproto, rule, 0); + if (ofproto->netflow) { + struct ofexpired expired; + expired.flow = rule->cr.flow; + expired.packet_count = rule->packet_count; + expired.byte_count = rule->byte_count; + expired.used = rule->used; + expired.created = rule->created; + expired.tcp_flags = rule->tcp_flags; + expired.ip_tos = rule->ip_tos; + netflow_expire(ofproto->netflow, &expired); + } + if (super) { + super->packet_count += rule->packet_count; + super->byte_count += rule->byte_count; + super->tcp_flags |= rule->tcp_flags; + if (rule->packet_count) { + super->ip_tos = rule->ip_tos; + } + } + + /* Reset counters to prevent double counting if the rule ever gets + * reinstalled. */ + rule->packet_count = 0; + rule->byte_count = 0; + rule->accounted_bytes = 0; + rule->tcp_flags = 0; + rule->ip_tos = 0; +} + +static void +queue_tx(struct ofpbuf *msg, const struct ofconn *ofconn, + struct rconn_packet_counter *counter) +{ + update_openflow_length(msg); + if (rconn_send(ofconn->rconn, msg, counter)) { + ofpbuf_delete(msg); + } +} + +static void +send_error(const struct ofconn *ofconn, const struct ofp_header *oh, + int error, const void *data, size_t len) +{ + struct ofpbuf *buf; + struct ofp_error_msg *oem; + + if (!(error >> 16)) { + VLOG_WARN_RL(&rl, "not sending bad error code %d to controller", + error); + return; + } + + COVERAGE_INC(ofproto_error); + oem = make_openflow_xid(len + sizeof *oem, OFPT_ERROR, + oh ? oh->xid : 0, &buf); + oem->type = htons((unsigned int) error >> 16); + oem->code = htons(error & 0xffff); + memcpy(oem->data, data, len); + queue_tx(buf, ofconn, ofconn->reply_counter); +} + +static void +send_error_oh(const struct ofconn *ofconn, const struct ofp_header *oh, + int error) +{ + size_t oh_length = ntohs(oh->length); + send_error(ofconn, oh, error, oh, MIN(oh_length, 64)); +} + +static void +hton_ofp_phy_port(struct ofp_phy_port *opp) +{ + opp->port_no = htons(opp->port_no); + opp->config = htonl(opp->config); + opp->state = htonl(opp->state); + opp->curr = htonl(opp->curr); + opp->advertised = htonl(opp->advertised); + opp->supported = htonl(opp->supported); + opp->peer = htonl(opp->peer); +} + +static int +handle_echo_request(struct ofconn *ofconn, struct ofp_header *oh) +{ + struct ofp_header *rq = oh; + queue_tx(make_echo_reply(rq), ofconn, ofconn->reply_counter); + return 0; +} + +static int +handle_features_request(struct ofproto *p, struct ofconn *ofconn, + struct ofp_header *oh) +{ + struct ofp_switch_features *osf; + struct ofpbuf *buf; + unsigned int port_no; + struct ofport *port; + + osf = make_openflow_xid(sizeof *osf, OFPT_FEATURES_REPLY, oh->xid, &buf); + osf->datapath_id = htonll(p->datapath_id); + osf->n_buffers = htonl(pktbuf_capacity()); + osf->n_tables = 2; + osf->capabilities = htonl(OFPC_FLOW_STATS | OFPC_TABLE_STATS | + OFPC_PORT_STATS | OFPC_MULTI_PHY_TX); + osf->actions = htonl((1u << OFPAT_OUTPUT) | + (1u << OFPAT_SET_VLAN_VID) | + (1u << OFPAT_SET_VLAN_PCP) | + (1u << OFPAT_STRIP_VLAN) | + (1u << OFPAT_SET_DL_SRC) | + (1u << OFPAT_SET_DL_DST) | + (1u << OFPAT_SET_NW_SRC) | + (1u << OFPAT_SET_NW_DST) | + (1u << OFPAT_SET_TP_SRC) | + (1u << OFPAT_SET_TP_DST)); + + PORT_ARRAY_FOR_EACH (port, &p->ports, port_no) { + hton_ofp_phy_port(ofpbuf_put(buf, &port->opp, sizeof port->opp)); + } + + queue_tx(buf, ofconn, ofconn->reply_counter); + return 0; +} + +static int +handle_get_config_request(struct ofproto *p, struct ofconn *ofconn, + struct ofp_header *oh) +{ + struct ofpbuf *buf; + struct ofp_switch_config *osc; + uint16_t flags; + bool drop_frags; + + /* Figure out flags. */ + dpif_get_drop_frags(&p->dpif, &drop_frags); + flags = drop_frags ? OFPC_FRAG_DROP : OFPC_FRAG_NORMAL; + if (ofconn->send_flow_exp) { + flags |= OFPC_SEND_FLOW_EXP; + } + + /* Send reply. */ + osc = make_openflow_xid(sizeof *osc, OFPT_GET_CONFIG_REPLY, oh->xid, &buf); + osc->flags = htons(flags); + osc->miss_send_len = htons(ofconn->miss_send_len); + queue_tx(buf, ofconn, ofconn->reply_counter); + + return 0; +} + +static int +handle_set_config(struct ofproto *p, struct ofconn *ofconn, + struct ofp_switch_config *osc) +{ + uint16_t flags; + int error; + + error = check_ofp_message(&osc->header, OFPT_SET_CONFIG, sizeof *osc); + if (error) { + return error; + } + flags = ntohs(osc->flags); + + ofconn->send_flow_exp = (flags & OFPC_SEND_FLOW_EXP) != 0; + + if (ofconn == p->controller) { + switch (flags & OFPC_FRAG_MASK) { + case OFPC_FRAG_NORMAL: + dpif_set_drop_frags(&p->dpif, false); + break; + case OFPC_FRAG_DROP: + dpif_set_drop_frags(&p->dpif, true); + break; + default: + VLOG_WARN_RL(&rl, "requested bad fragment mode (flags=%"PRIx16")", + osc->flags); + break; + } + } + + if ((ntohs(osc->miss_send_len) != 0) != (ofconn->miss_send_len != 0)) { + if (ntohs(osc->miss_send_len) != 0) { + ofconn->pktbuf = pktbuf_create(); + } else { + pktbuf_destroy(ofconn->pktbuf); + } + } + + ofconn->miss_send_len = ntohs(osc->miss_send_len); + + return 0; +} + +static void +add_output_group_action(struct odp_actions *actions, uint16_t group) +{ + odp_actions_add(actions, ODPAT_OUTPUT_GROUP)->output_group.group = group; +} + +static void +add_controller_action(struct odp_actions *actions, + const struct ofp_action_output *oao) +{ + union odp_action *a = odp_actions_add(actions, ODPAT_CONTROLLER); + a->controller.arg = oao->max_len ? ntohs(oao->max_len) : UINT32_MAX; +} + +struct action_xlate_ctx { + /* Input. */ + const flow_t *flow; /* Flow to which these actions correspond. */ + int recurse; /* Recursion level, via xlate_table_action. */ + struct ofproto *ofproto; + const struct ofpbuf *packet; /* The packet corresponding to 'flow', or a + * null pointer if we are revalidating + * without a packet to refer to. */ + + /* Output. */ + struct odp_actions *out; /* Datapath actions. */ + tag_type *tags; /* Tags associated with OFPP_NORMAL actions. */ + bool may_setup_flow; /* True ordinarily; false if the actions must + * be reassessed for every packet. */ +}; + +static void do_xlate_actions(const union ofp_action *in, size_t n_in, + struct action_xlate_ctx *ctx); + +static void +add_output_action(struct action_xlate_ctx *ctx, uint16_t port) +{ + const struct ofport *ofport = port_array_get(&ctx->ofproto->ports, port); + if (!ofport || !(ofport->opp.config & OFPPC_NO_FWD)) { + odp_actions_add(ctx->out, ODPAT_OUTPUT)->output.port = port; + } +} + +static struct rule * +lookup_valid_rule(struct ofproto *ofproto, const flow_t *flow) +{ + struct rule *rule; + rule = rule_from_cls_rule(classifier_lookup(&ofproto->cls, flow)); + + /* The rule we found might not be valid, since we could be in need of + * revalidation. If it is not valid, don't return it. */ + if (rule + && rule->super + && ofproto->need_revalidate + && !revalidate_rule(ofproto, rule)) { + COVERAGE_INC(ofproto_invalidated); + return NULL; + } + + return rule; +} + +static void +xlate_table_action(struct action_xlate_ctx *ctx, uint16_t in_port) +{ + if (!ctx->recurse) { + struct rule *rule; + flow_t flow; + + flow = *ctx->flow; + flow.in_port = in_port; + + rule = lookup_valid_rule(ctx->ofproto, &flow); + if (rule) { + if (rule->super) { + rule = rule->super; + } + + ctx->recurse++; + do_xlate_actions(rule->actions, rule->n_actions, ctx); + ctx->recurse--; + } + } +} + +static void +xlate_output_action(struct action_xlate_ctx *ctx, + const struct ofp_action_output *oao) +{ + uint16_t odp_port; + + switch (ntohs(oao->port)) { + case OFPP_IN_PORT: + add_output_action(ctx, ctx->flow->in_port); + break; + case OFPP_TABLE: + xlate_table_action(ctx, ctx->flow->in_port); + break; + case OFPP_NORMAL: + if (!ctx->ofproto->ofhooks->normal_cb(ctx->flow, ctx->packet, + ctx->out, ctx->tags, + ctx->ofproto->aux)) { + COVERAGE_INC(ofproto_uninstallable); + ctx->may_setup_flow = false; + } + break; + case OFPP_FLOOD: + add_output_group_action(ctx->out, DP_GROUP_FLOOD); + break; + case OFPP_ALL: + add_output_group_action(ctx->out, DP_GROUP_ALL); + break; + case OFPP_CONTROLLER: + add_controller_action(ctx->out, oao); + break; + case OFPP_LOCAL: + add_output_action(ctx, ODPP_LOCAL); + break; + default: + odp_port = ofp_port_to_odp_port(ntohs(oao->port)); + if (odp_port != ctx->flow->in_port) { + add_output_action(ctx, odp_port); + } + break; + } +} + +static void +xlate_nicira_action(struct action_xlate_ctx *ctx, + const struct nx_action_header *nah) +{ + const struct nx_action_resubmit *nar; + int subtype = ntohs(nah->subtype); + + assert(nah->vendor == htonl(NX_VENDOR_ID)); + switch (subtype) { + case NXAST_RESUBMIT: + nar = (const struct nx_action_resubmit *) nah; + xlate_table_action(ctx, ofp_port_to_odp_port(ntohs(nar->in_port))); + break; + + default: + VLOG_DBG_RL(&rl, "unknown Nicira action type %"PRIu16, subtype); + break; + } +} + +static void +do_xlate_actions(const union ofp_action *in, size_t n_in, + struct action_xlate_ctx *ctx) +{ + struct actions_iterator iter; + const union ofp_action *ia; + const struct ofport *port; + + port = port_array_get(&ctx->ofproto->ports, ctx->flow->in_port); + if (port && port->opp.config & (OFPPC_NO_RECV | OFPPC_NO_RECV_STP) && + port->opp.config & (eth_addr_equals(ctx->flow->dl_dst, stp_eth_addr) + ? OFPPC_NO_RECV_STP : OFPPC_NO_RECV)) { + /* Drop this flow. */ + return; + } + + for (ia = actions_first(&iter, in, n_in); ia; ia = actions_next(&iter)) { + uint16_t type = ntohs(ia->type); + union odp_action *oa; + + switch (type) { + case OFPAT_OUTPUT: + xlate_output_action(ctx, &ia->output); + break; + + case OFPAT_SET_VLAN_VID: + oa = odp_actions_add(ctx->out, ODPAT_SET_VLAN_VID); + oa->vlan_vid.vlan_vid = ia->vlan_vid.vlan_vid; + break; + + case OFPAT_SET_VLAN_PCP: + oa = odp_actions_add(ctx->out, ODPAT_SET_VLAN_PCP); + oa->vlan_pcp.vlan_pcp = ia->vlan_pcp.vlan_pcp; + break; + + case OFPAT_STRIP_VLAN: + odp_actions_add(ctx->out, ODPAT_STRIP_VLAN); + break; + + case OFPAT_SET_DL_SRC: + oa = odp_actions_add(ctx->out, ODPAT_SET_DL_SRC); + memcpy(oa->dl_addr.dl_addr, + ((struct ofp_action_dl_addr *) ia)->dl_addr, ETH_ADDR_LEN); + break; + + case OFPAT_SET_DL_DST: + oa = odp_actions_add(ctx->out, ODPAT_SET_DL_DST); + memcpy(oa->dl_addr.dl_addr, + ((struct ofp_action_dl_addr *) ia)->dl_addr, ETH_ADDR_LEN); + break; + + case OFPAT_SET_NW_SRC: + oa = odp_actions_add(ctx->out, ODPAT_SET_NW_SRC); + oa->nw_addr.nw_addr = ia->nw_addr.nw_addr; + break; + + case OFPAT_SET_TP_SRC: + oa = odp_actions_add(ctx->out, ODPAT_SET_TP_SRC); + oa->tp_port.tp_port = ia->tp_port.tp_port; + break; + + case OFPAT_VENDOR: + xlate_nicira_action(ctx, (const struct nx_action_header *) ia); + break; + + default: + VLOG_DBG_RL(&rl, "unknown action type %"PRIu16, type); + break; + } + } +} + +static int +xlate_actions(const union ofp_action *in, size_t n_in, + const flow_t *flow, struct ofproto *ofproto, + const struct ofpbuf *packet, + struct odp_actions *out, tag_type *tags, bool *may_setup_flow) +{ + tag_type no_tags = 0; + struct action_xlate_ctx ctx; + COVERAGE_INC(ofproto_ofp2odp); + odp_actions_init(out); + ctx.flow = flow; + ctx.recurse = 0; + ctx.ofproto = ofproto; + ctx.packet = packet; + ctx.out = out; + ctx.tags = tags ? tags : &no_tags; + ctx.may_setup_flow = true; + do_xlate_actions(in, n_in, &ctx); + if (may_setup_flow) { + *may_setup_flow = ctx.may_setup_flow; + } + if (odp_actions_overflow(out)) { + odp_actions_init(out); + return ofp_mkerr(OFPET_BAD_ACTION, OFPBAC_TOO_MANY); + } + return 0; +} + +static int +handle_packet_out(struct ofproto *p, struct ofconn *ofconn, + struct ofp_header *oh) +{ + struct ofp_packet_out *opo; + struct ofpbuf payload, *buffer; + struct odp_actions actions; + int n_actions; + uint16_t in_port; + flow_t flow; + int error; + + error = check_ofp_packet_out(oh, &payload, &n_actions, p->max_ports); + if (error) { + return error; + } + opo = (struct ofp_packet_out *) oh; + + COVERAGE_INC(ofproto_packet_out); + if (opo->buffer_id != htonl(UINT32_MAX)) { + error = pktbuf_retrieve(ofconn->pktbuf, ntohl(opo->buffer_id), + &buffer, &in_port); + if (error) { + return error; + } + payload = *buffer; + } else { + buffer = NULL; + } + + flow_extract(&payload, ofp_port_to_odp_port(ntohs(opo->in_port)), &flow); + error = xlate_actions((const union ofp_action *) opo->actions, n_actions, + &flow, p, &payload, &actions, NULL, NULL); + if (error) { + return error; + } + + dpif_execute(&p->dpif, flow.in_port, actions.actions, actions.n_actions, + &payload); + ofpbuf_delete(buffer); + + return 0; +} + +static void +update_port_config(struct ofproto *p, struct ofport *port, + uint32_t config, uint32_t mask) +{ + mask &= config ^ port->opp.config; + if (mask & OFPPC_PORT_DOWN) { + if (config & OFPPC_PORT_DOWN) { + netdev_turn_flags_off(port->netdev, NETDEV_UP, true); + } else { + netdev_turn_flags_on(port->netdev, NETDEV_UP, true); + } + } +#define REVALIDATE_BITS (OFPPC_NO_RECV | OFPPC_NO_RECV_STP | OFPPC_NO_FWD) + if (mask & REVALIDATE_BITS) { + COVERAGE_INC(ofproto_costly_flags); + port->opp.config ^= mask & REVALIDATE_BITS; + p->need_revalidate = true; + } +#undef REVALIDATE_BITS + if (mask & OFPPC_NO_FLOOD) { + port->opp.config ^= OFPPC_NO_FLOOD; + refresh_port_group(p, DP_GROUP_FLOOD); + } + if (mask & OFPPC_NO_PACKET_IN) { + port->opp.config ^= OFPPC_NO_PACKET_IN; + } +} + +static int +handle_port_mod(struct ofproto *p, struct ofp_header *oh) +{ + const struct ofp_port_mod *opm; + struct ofport *port; + int error; + + error = check_ofp_message(oh, OFPT_PORT_MOD, sizeof *opm); + if (error) { + return error; + } + opm = (struct ofp_port_mod *) oh; + + port = port_array_get(&p->ports, + ofp_port_to_odp_port(ntohs(opm->port_no))); + if (!port) { + return ofp_mkerr(OFPET_PORT_MOD_FAILED, OFPPMFC_BAD_PORT); + } else if (memcmp(port->opp.hw_addr, opm->hw_addr, OFP_ETH_ALEN)) { + return ofp_mkerr(OFPET_PORT_MOD_FAILED, OFPPMFC_BAD_HW_ADDR); + } else { + update_port_config(p, port, ntohl(opm->config), ntohl(opm->mask)); + if (opm->advertise) { + netdev_set_advertisements(port->netdev, ntohl(opm->advertise)); + } + } + return 0; +} + +static struct ofpbuf * +make_stats_reply(uint32_t xid, uint16_t type, size_t body_len) +{ + struct ofp_stats_reply *osr; + struct ofpbuf *msg; + + msg = ofpbuf_new(MIN(sizeof *osr + body_len, UINT16_MAX)); + osr = put_openflow_xid(sizeof *osr, OFPT_STATS_REPLY, xid, msg); + osr->type = type; + osr->flags = htons(0); + return msg; +} + +static struct ofpbuf * +start_stats_reply(const struct ofp_stats_request *request, size_t body_len) +{ + return make_stats_reply(request->header.xid, request->type, body_len); +} + +static void * +append_stats_reply(size_t nbytes, struct ofconn *ofconn, struct ofpbuf **msgp) +{ + struct ofpbuf *msg = *msgp; + assert(nbytes <= UINT16_MAX - sizeof(struct ofp_stats_reply)); + if (nbytes + msg->size > UINT16_MAX) { + struct ofp_stats_reply *reply = msg->data; + reply->flags = htons(OFPSF_REPLY_MORE); + *msgp = make_stats_reply(reply->header.xid, reply->type, nbytes); + queue_tx(msg, ofconn, ofconn->reply_counter); + } + return ofpbuf_put_uninit(*msgp, nbytes); +} + +static int +handle_desc_stats_request(struct ofproto *p, struct ofconn *ofconn, + struct ofp_stats_request *request) +{ + struct ofp_desc_stats *ods; + struct ofpbuf *msg; + + msg = start_stats_reply(request, sizeof *ods); + ods = append_stats_reply(sizeof *ods, ofconn, &msg); + strncpy(ods->mfr_desc, p->manufacturer, sizeof ods->mfr_desc); + strncpy(ods->hw_desc, p->hardware, sizeof ods->hw_desc); + strncpy(ods->sw_desc, p->software, sizeof ods->sw_desc); + strncpy(ods->serial_num, p->serial, sizeof ods->serial_num); + queue_tx(msg, ofconn, ofconn->reply_counter); + + return 0; +} + +static void +count_subrules(struct cls_rule *cls_rule, void *n_subrules_) +{ + struct rule *rule = rule_from_cls_rule(cls_rule); + int *n_subrules = n_subrules_; + + if (rule->super) { + (*n_subrules)++; + } +} + +static int +handle_table_stats_request(struct ofproto *p, struct ofconn *ofconn, + struct ofp_stats_request *request) +{ + struct ofp_table_stats *ots; + struct ofpbuf *msg; + struct odp_stats dpstats; + int n_exact, n_subrules, n_wild; + + msg = start_stats_reply(request, sizeof *ots * 2); + + /* Count rules of various kinds. */ + n_subrules = 0; + classifier_for_each(&p->cls, CLS_INC_EXACT, count_subrules, &n_subrules); + n_exact = classifier_count_exact(&p->cls) - n_subrules; + n_wild = classifier_count(&p->cls) - classifier_count_exact(&p->cls); + + /* Hash table. */ + dpif_get_dp_stats(&p->dpif, &dpstats); + ots = append_stats_reply(sizeof *ots, ofconn, &msg); + memset(ots, 0, sizeof *ots); + ots->table_id = TABLEID_HASH; + strcpy(ots->name, "hash"); + ots->wildcards = htonl(0); + ots->max_entries = htonl(dpstats.max_capacity); + ots->active_count = htonl(n_exact); + ots->lookup_count = htonll(dpstats.n_frags + dpstats.n_hit + + dpstats.n_missed); + ots->matched_count = htonll(dpstats.n_hit); /* XXX */ + + /* Classifier table. */ + ots = append_stats_reply(sizeof *ots, ofconn, &msg); + memset(ots, 0, sizeof *ots); + ots->table_id = TABLEID_CLASSIFIER; + strcpy(ots->name, "classifier"); + ots->wildcards = htonl(OFPFW_ALL); + ots->max_entries = htonl(65536); + ots->active_count = htonl(n_wild); + ots->lookup_count = htonll(0); /* XXX */ + ots->matched_count = htonll(0); /* XXX */ + + queue_tx(msg, ofconn, ofconn->reply_counter); + return 0; +} + +static int +handle_port_stats_request(struct ofproto *p, struct ofconn *ofconn, + struct ofp_stats_request *request) +{ + struct ofp_port_stats *ops; + struct ofpbuf *msg; + struct ofport *port; + unsigned int port_no; + + msg = start_stats_reply(request, sizeof *ops * 16); + PORT_ARRAY_FOR_EACH (port, &p->ports, port_no) { + struct netdev_stats stats; + + /* Intentionally ignore return value, since errors will set 'stats' to + * all-1s, which is correct for OpenFlow, and netdev_get_stats() will + * log errors. */ + netdev_get_stats(port->netdev, &stats); + + ops = append_stats_reply(sizeof *ops, ofconn, &msg); + ops->port_no = htons(odp_port_to_ofp_port(port_no)); + memset(ops->pad, 0, sizeof ops->pad); + ops->rx_packets = htonll(stats.rx_packets); + ops->tx_packets = htonll(stats.tx_packets); + ops->rx_bytes = htonll(stats.rx_bytes); + ops->tx_bytes = htonll(stats.tx_bytes); + ops->rx_dropped = htonll(stats.rx_dropped); + ops->tx_dropped = htonll(stats.tx_dropped); + ops->rx_errors = htonll(stats.rx_errors); + ops->tx_errors = htonll(stats.tx_errors); + ops->rx_frame_err = htonll(stats.rx_frame_errors); + ops->rx_over_err = htonll(stats.rx_over_errors); + ops->rx_crc_err = htonll(stats.rx_crc_errors); + ops->collisions = htonll(stats.collisions); + } + + queue_tx(msg, ofconn, ofconn->reply_counter); + return 0; +} + +struct flow_stats_cbdata { + struct ofproto *ofproto; + struct ofconn *ofconn; + uint16_t out_port; + struct ofpbuf *msg; +}; + +static void +query_stats(struct ofproto *p, struct rule *rule, + uint64_t *packet_countp, uint64_t *byte_countp) +{ + uint64_t packet_count, byte_count; + struct rule *subrule; + struct odp_flow *odp_flows; + size_t n_odp_flows; + + n_odp_flows = rule->cr.wc.wildcards ? list_size(&rule->list) : 1; + odp_flows = xcalloc(1, n_odp_flows * sizeof *odp_flows); + if (rule->cr.wc.wildcards) { + size_t i = 0; + LIST_FOR_EACH (subrule, struct rule, list, &rule->list) { + odp_flows[i++].key = subrule->cr.flow; + } + } else { + odp_flows[0].key = rule->cr.flow; + } + + packet_count = rule->packet_count; + byte_count = rule->byte_count; + if (!dpif_flow_get_multiple(&p->dpif, odp_flows, n_odp_flows)) { + size_t i; + for (i = 0; i < n_odp_flows; i++) { + struct odp_flow *odp_flow = &odp_flows[i]; + packet_count += odp_flow->stats.n_packets; + byte_count += odp_flow->stats.n_bytes; + } + } + free(odp_flows); + + *packet_countp = packet_count; + *byte_countp = byte_count; +} + +static void +flow_stats_cb(struct cls_rule *rule_, void *cbdata_) +{ + struct rule *rule = rule_from_cls_rule(rule_); + struct flow_stats_cbdata *cbdata = cbdata_; + struct ofp_flow_stats *ofs; + uint64_t packet_count, byte_count; + size_t act_len, len; + + if (rule_is_hidden(rule) || !rule_has_out_port(rule, cbdata->out_port)) { + return; + } + + act_len = sizeof *rule->actions * rule->n_actions; + len = offsetof(struct ofp_flow_stats, actions) + act_len; + + query_stats(cbdata->ofproto, rule, &packet_count, &byte_count); + + ofs = append_stats_reply(len, cbdata->ofconn, &cbdata->msg); + ofs->length = htons(len); + ofs->table_id = rule->cr.wc.wildcards ? TABLEID_CLASSIFIER : TABLEID_HASH; + ofs->pad = 0; + flow_to_match(&rule->cr.flow, rule->cr.wc.wildcards, &ofs->match); + ofs->duration = htonl((time_msec() - rule->created) / 1000); + ofs->priority = htons(rule->cr.priority); + ofs->idle_timeout = htons(rule->idle_timeout); + ofs->hard_timeout = htons(rule->hard_timeout); + memset(ofs->pad2, 0, sizeof ofs->pad2); + ofs->packet_count = htonll(packet_count); + ofs->byte_count = htonll(byte_count); + memcpy(ofs->actions, rule->actions, act_len); +} + +static int +table_id_to_include(uint8_t table_id) +{ + return (table_id == TABLEID_HASH ? CLS_INC_EXACT + : table_id == TABLEID_CLASSIFIER ? CLS_INC_WILD + : table_id == 0xff ? CLS_INC_ALL + : 0); +} + +static int +handle_flow_stats_request(struct ofproto *p, struct ofconn *ofconn, + const struct ofp_stats_request *osr, + size_t arg_size) +{ + struct ofp_flow_stats_request *fsr; + struct flow_stats_cbdata cbdata; + struct cls_rule target; + + if (arg_size != sizeof *fsr) { + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH); + } + fsr = (struct ofp_flow_stats_request *) osr->body; + + COVERAGE_INC(ofproto_flows_req); + cbdata.ofproto = p; + cbdata.ofconn = ofconn; + cbdata.out_port = fsr->out_port; + cbdata.msg = start_stats_reply(osr, 1024); + cls_rule_from_match(&target, &fsr->match, 0); + classifier_for_each_match(&p->cls, &target, + table_id_to_include(fsr->table_id), + flow_stats_cb, &cbdata); + queue_tx(cbdata.msg, ofconn, ofconn->reply_counter); + return 0; +} + +struct aggregate_stats_cbdata { + struct ofproto *ofproto; + uint16_t out_port; + uint64_t packet_count; + uint64_t byte_count; + uint32_t n_flows; +}; + +static void +aggregate_stats_cb(struct cls_rule *rule_, void *cbdata_) +{ + struct rule *rule = rule_from_cls_rule(rule_); + struct aggregate_stats_cbdata *cbdata = cbdata_; + uint64_t packet_count, byte_count; + + if (rule_is_hidden(rule) || !rule_has_out_port(rule, cbdata->out_port)) { + return; + } + + query_stats(cbdata->ofproto, rule, &packet_count, &byte_count); + + cbdata->packet_count += packet_count; + cbdata->byte_count += byte_count; + cbdata->n_flows++; +} + +static int +handle_aggregate_stats_request(struct ofproto *p, struct ofconn *ofconn, + const struct ofp_stats_request *osr, + size_t arg_size) +{ + struct ofp_aggregate_stats_request *asr; + struct ofp_aggregate_stats_reply *reply; + struct aggregate_stats_cbdata cbdata; + struct cls_rule target; + struct ofpbuf *msg; + + if (arg_size != sizeof *asr) { + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH); + } + asr = (struct ofp_aggregate_stats_request *) osr->body; + + COVERAGE_INC(ofproto_agg_request); + cbdata.ofproto = p; + cbdata.out_port = asr->out_port; + cbdata.packet_count = 0; + cbdata.byte_count = 0; + cbdata.n_flows = 0; + cls_rule_from_match(&target, &asr->match, 0); + classifier_for_each_match(&p->cls, &target, + table_id_to_include(asr->table_id), + aggregate_stats_cb, &cbdata); + + msg = start_stats_reply(osr, sizeof *reply); + reply = append_stats_reply(sizeof *reply, ofconn, &msg); + reply->flow_count = htonl(cbdata.n_flows); + reply->packet_count = htonll(cbdata.packet_count); + reply->byte_count = htonll(cbdata.byte_count); + queue_tx(msg, ofconn, ofconn->reply_counter); + return 0; +} + +static int +handle_stats_request(struct ofproto *p, struct ofconn *ofconn, + struct ofp_header *oh) +{ + struct ofp_stats_request *osr; + size_t arg_size; + int error; + + error = check_ofp_message_array(oh, OFPT_STATS_REQUEST, sizeof *osr, + 1, &arg_size); + if (error) { + return error; + } + osr = (struct ofp_stats_request *) oh; + + switch (ntohs(osr->type)) { + case OFPST_DESC: + return handle_desc_stats_request(p, ofconn, osr); + + case OFPST_FLOW: + return handle_flow_stats_request(p, ofconn, osr, arg_size); + + case OFPST_AGGREGATE: + return handle_aggregate_stats_request(p, ofconn, osr, arg_size); + + case OFPST_TABLE: + return handle_table_stats_request(p, ofconn, osr); + + case OFPST_PORT: + return handle_port_stats_request(p, ofconn, osr); + + case OFPST_VENDOR: + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_VENDOR); + + default: + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_STAT); + } +} + +static long long int +msec_from_nsec(uint64_t sec, uint32_t nsec) +{ + return !sec ? 0 : sec * 1000 + nsec / 1000000; +} + +static void +update_time(struct rule *rule, const struct odp_flow_stats *stats) +{ + long long int used = msec_from_nsec(stats->used_sec, stats->used_nsec); + if (used > rule->used) { + rule->used = used; + } +} + +static void +update_stats(struct rule *rule, const struct odp_flow_stats *stats) +{ + update_time(rule, stats); + rule->packet_count += stats->n_packets; + rule->byte_count += stats->n_bytes; + rule->tcp_flags |= stats->tcp_flags; + if (stats->n_packets) { + rule->ip_tos = stats->ip_tos; + } +} + +static int +add_flow(struct ofproto *p, struct ofconn *ofconn, + struct ofp_flow_mod *ofm, size_t n_actions) +{ + struct ofpbuf *packet; + struct rule *rule; + uint16_t in_port; + int error; + + rule = rule_create(NULL, (const union ofp_action *) ofm->actions, + n_actions, ntohs(ofm->idle_timeout), + ntohs(ofm->hard_timeout)); + cls_rule_from_match(&rule->cr, &ofm->match, ntohs(ofm->priority)); + + packet = NULL; + error = 0; + if (ofm->buffer_id != htonl(UINT32_MAX)) { + error = pktbuf_retrieve(ofconn->pktbuf, ntohl(ofm->buffer_id), + &packet, &in_port); + } + + rule_insert(p, rule, packet, in_port); + ofpbuf_delete(packet); + return error; +} + +static int +modify_flow(struct ofproto *p, const struct ofp_flow_mod *ofm, + size_t n_actions, uint16_t command, struct rule *rule) +{ + if (rule_is_hidden(rule)) { + return 0; + } + + if (command == OFPFC_DELETE) { + rule_remove(p, rule); + } else { + size_t actions_len = n_actions * sizeof *rule->actions; + + if (n_actions == rule->n_actions + && !memcmp(ofm->actions, rule->actions, actions_len)) + { + return 0; + } + + free(rule->actions); + rule->actions = xmemdup(ofm->actions, actions_len); + rule->n_actions = n_actions; + + if (rule->cr.wc.wildcards) { + COVERAGE_INC(ofproto_mod_wc_flow); + p->need_revalidate = true; + } else { + rule_update_actions(p, rule); + } + } + + return 0; +} + +static int +modify_flows_strict(struct ofproto *p, const struct ofp_flow_mod *ofm, + size_t n_actions, uint16_t command) +{ + struct rule *rule; + uint32_t wildcards; + flow_t flow; + + flow_from_match(&flow, &wildcards, &ofm->match); + rule = rule_from_cls_rule(classifier_find_rule_exactly( + &p->cls, &flow, wildcards, + ntohs(ofm->priority))); + + if (rule) { + if (command == OFPFC_DELETE + && ofm->out_port != htons(OFPP_NONE) + && !rule_has_out_port(rule, ofm->out_port)) { + return 0; + } + + modify_flow(p, ofm, n_actions, command, rule); + } + return 0; +} + +struct modify_flows_cbdata { + struct ofproto *ofproto; + const struct ofp_flow_mod *ofm; + uint16_t out_port; + size_t n_actions; + uint16_t command; +}; + +static void +modify_flows_cb(struct cls_rule *rule_, void *cbdata_) +{ + struct rule *rule = rule_from_cls_rule(rule_); + struct modify_flows_cbdata *cbdata = cbdata_; + + if (cbdata->out_port != htons(OFPP_NONE) + && !rule_has_out_port(rule, cbdata->out_port)) { + return; + } + + modify_flow(cbdata->ofproto, cbdata->ofm, cbdata->n_actions, + cbdata->command, rule); +} + +static int +modify_flows_loose(struct ofproto *p, const struct ofp_flow_mod *ofm, + size_t n_actions, uint16_t command) +{ + struct modify_flows_cbdata cbdata; + struct cls_rule target; + + cbdata.ofproto = p; + cbdata.ofm = ofm; + cbdata.out_port = (command == OFPFC_DELETE ? ofm->out_port + : htons(OFPP_NONE)); + cbdata.n_actions = n_actions; + cbdata.command = command; + + cls_rule_from_match(&target, &ofm->match, 0); + + classifier_for_each_match(&p->cls, &target, CLS_INC_ALL, + modify_flows_cb, &cbdata); + return 0; +} + +static int +handle_flow_mod(struct ofproto *p, struct ofconn *ofconn, + struct ofp_flow_mod *ofm) +{ + size_t n_actions; + int error; + + error = check_ofp_message_array(&ofm->header, OFPT_FLOW_MOD, sizeof *ofm, + sizeof *ofm->actions, &n_actions); + if (error) { + return error; + } + + normalize_match(&ofm->match); + if (!ofm->match.wildcards) { + ofm->priority = htons(UINT16_MAX); + } + + error = validate_actions((const union ofp_action *) ofm->actions, + n_actions, p->max_ports); + if (error) { + return error; + } + + switch (ntohs(ofm->command)) { + case OFPFC_ADD: + return add_flow(p, ofconn, ofm, n_actions); + + case OFPFC_MODIFY: + return modify_flows_loose(p, ofm, n_actions, OFPFC_MODIFY); + + case OFPFC_MODIFY_STRICT: + return modify_flows_strict(p, ofm, n_actions, OFPFC_MODIFY); + + case OFPFC_DELETE: + return modify_flows_loose(p, ofm, n_actions, OFPFC_DELETE); + + case OFPFC_DELETE_STRICT: + return modify_flows_strict(p, ofm, n_actions, OFPFC_DELETE); + + default: + return ofp_mkerr(OFPET_FLOW_MOD_FAILED, OFPFMFC_BAD_COMMAND); + } +} + +static void +send_capability_reply(struct ofproto *p, struct ofconn *ofconn, uint32_t xid) +{ + struct ofmp_capability_reply *ocr; + struct ofpbuf *b; + char capabilities[] = "com.nicira.mgmt.manager=false\n"; + + ocr = make_openflow_xid(sizeof(*ocr), OFPT_VENDOR, xid, &b); + ocr->header.header.vendor = htonl(NX_VENDOR_ID); + ocr->header.header.subtype = htonl(NXT_MGMT); + ocr->header.type = htons(OFMPT_CAPABILITY_REPLY); + + ocr->format = htonl(OFMPCOF_SIMPLE); + ocr->mgmt_id = htonll(p->mgmt_id); + + ofpbuf_put(b, capabilities, strlen(capabilities)); + + queue_tx(b, ofconn, ofconn->reply_counter); +} + +static int +handle_ofmp(struct ofproto *p, struct ofconn *ofconn, + struct ofmp_header *ofmph) +{ + size_t msg_len = ntohs(ofmph->header.header.length); + if (msg_len < sizeof(*ofmph)) { + VLOG_WARN_RL(&rl, "dropping short managment message: %d\n", msg_len); + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH); + } + + if (ofmph->type == htons(OFMPT_CAPABILITY_REQUEST)) { + struct ofmp_capability_request *ofmpcr; + + if (msg_len < sizeof(struct ofmp_capability_request)) { + VLOG_WARN_RL(&rl, "dropping short capability request: %d\n", + msg_len); + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH); + } + + ofmpcr = (struct ofmp_capability_request *)ofmph; + if (ofmpcr->format != htonl(OFMPCAF_SIMPLE)) { + /* xxx Find a better type than bad subtype */ + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_SUBTYPE); + } + + send_capability_reply(p, ofconn, ofmph->header.header.xid); + return 0; + } else { + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_SUBTYPE); + } +} + +static int +handle_vendor(struct ofproto *p, struct ofconn *ofconn, void *msg) +{ + struct ofp_vendor_header *ovh = msg; + struct nicira_header *nh; + + if (ntohs(ovh->header.length) < sizeof(struct ofp_vendor_header)) { + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH); + } + if (ovh->vendor != htonl(NX_VENDOR_ID)) { + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_VENDOR); + } + if (ntohs(ovh->header.length) < sizeof(struct nicira_header)) { + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH); + } + + nh = msg; + switch (ntohl(nh->subtype)) { + case NXT_STATUS_REQUEST: + return switch_status_handle_request(p->switch_status, ofconn->rconn, + msg); + + case NXT_ACT_SET_CONFIG: + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_SUBTYPE); /* XXX */ + + case NXT_ACT_GET_CONFIG: + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_SUBTYPE); /* XXX */ + + case NXT_COMMAND_REQUEST: + if (p->executer) { + return executer_handle_request(p->executer, ofconn->rconn, msg); + } + break; + + case NXT_MGMT: + return handle_ofmp(p, ofconn, msg); + } + + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_SUBTYPE); +} + +static void +handle_openflow(struct ofconn *ofconn, struct ofproto *p, + struct ofpbuf *ofp_msg) +{ + struct ofp_header *oh = ofp_msg->data; + int error; + + COVERAGE_INC(ofproto_recv_openflow); + switch (oh->type) { + case OFPT_ECHO_REQUEST: + error = handle_echo_request(ofconn, oh); + break; + + case OFPT_ECHO_REPLY: + error = 0; + break; + + case OFPT_FEATURES_REQUEST: + error = handle_features_request(p, ofconn, oh); + break; + + case OFPT_GET_CONFIG_REQUEST: + error = handle_get_config_request(p, ofconn, oh); + break; + + case OFPT_SET_CONFIG: + error = handle_set_config(p, ofconn, ofp_msg->data); + break; + + case OFPT_PACKET_OUT: + error = handle_packet_out(p, ofconn, ofp_msg->data); + break; + + case OFPT_PORT_MOD: + error = handle_port_mod(p, oh); + break; + + case OFPT_FLOW_MOD: + error = handle_flow_mod(p, ofconn, ofp_msg->data); + break; + + case OFPT_STATS_REQUEST: + error = handle_stats_request(p, ofconn, oh); + break; + + case OFPT_VENDOR: + error = handle_vendor(p, ofconn, ofp_msg->data); + break; + + default: + if (VLOG_IS_WARN_ENABLED()) { + char *s = ofp_to_string(oh, ntohs(oh->length), 2); + VLOG_DBG_RL(&rl, "OpenFlow message ignored: %s", s); + free(s); + } + error = ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_TYPE); + break; + } + + if (error) { + send_error_oh(ofconn, ofp_msg->data, error); + } +} + +static void +handle_odp_msg(struct ofproto *p, struct ofpbuf *packet) +{ + struct odp_msg *msg = packet->data; + uint16_t in_port = odp_port_to_ofp_port(msg->port); + struct rule *rule; + struct ofpbuf payload; + flow_t flow; + + /* Handle controller actions. */ + if (msg->type == _ODPL_ACTION_NR) { + COVERAGE_INC(ofproto_ctlr_action); + pinsched_send(p->action_sched, in_port, packet, + send_packet_in_action, p); + return; + } + + payload.data = msg + 1; + payload.size = msg->length - sizeof *msg; + flow_extract(&payload, msg->port, &flow); + + rule = lookup_valid_rule(p, &flow); + if (!rule) { + /* Don't send a packet-in if OFPPC_NO_PACKET_IN asserted. */ + struct ofport *port = port_array_get(&p->ports, msg->port); + if (port) { + if (port->opp.config & OFPPC_NO_PACKET_IN) { + COVERAGE_INC(ofproto_no_packet_in); + /* XXX install 'drop' flow entry */ + ofpbuf_delete(packet); + return; + } + } else { + VLOG_WARN_RL(&rl, "packet-in on unknown port %"PRIu16, msg->port); + } + + COVERAGE_INC(ofproto_packet_in); + pinsched_send(p->miss_sched, in_port, packet, send_packet_in_miss, p); + return; + } + + if (rule->cr.wc.wildcards) { + rule = rule_create_subrule(p, rule, &flow); + rule_make_actions(p, rule, packet); + } else { + if (!rule->may_install) { + /* The rule is not installable, that is, we need to process every + * packet, so process the current packet and set its actions into + * 'subrule'. */ + rule_make_actions(p, rule, packet); + } else { + /* XXX revalidate rule if it needs it */ + } + } + + rule_execute(p, rule, &payload, &flow); + rule_reinstall(p, rule); + ofpbuf_delete(packet); +} + +static void +revalidate_cb(struct cls_rule *sub_, void *cbdata_) +{ + struct rule *sub = rule_from_cls_rule(sub_); + struct revalidate_cbdata *cbdata = cbdata_; + + if (cbdata->revalidate_all + || (cbdata->revalidate_subrules && sub->super) + || (tag_set_intersects(&cbdata->revalidate_set, sub->tags))) { + revalidate_rule(cbdata->ofproto, sub); + } +} + +static bool +revalidate_rule(struct ofproto *p, struct rule *rule) +{ + const flow_t *flow = &rule->cr.flow; + + COVERAGE_INC(ofproto_revalidate_rule); + if (rule->super) { + struct rule *super; + super = rule_from_cls_rule(classifier_lookup_wild(&p->cls, flow)); + if (!super) { + rule_remove(p, rule); + return false; + } else if (super != rule->super) { + COVERAGE_INC(ofproto_revalidate_moved); + list_remove(&rule->list); + list_push_back(&super->list, &rule->list); + rule->super = super; + rule->hard_timeout = super->hard_timeout; + rule->idle_timeout = super->idle_timeout; + rule->created = super->created; + rule->used = 0; + } + } + + rule_update_actions(p, rule); + return true; +} + +static struct ofpbuf * +compose_flow_exp(const struct rule *rule, long long int now, uint8_t reason) +{ + struct ofp_flow_expired *ofe; + struct ofpbuf *buf; + + ofe = make_openflow(sizeof *ofe, OFPT_FLOW_EXPIRED, &buf); + flow_to_match(&rule->cr.flow, rule->cr.wc.wildcards, &ofe->match); + ofe->priority = htons(rule->cr.priority); + ofe->reason = reason; + ofe->duration = (now - rule->created) / 1000; + ofe->packet_count = rule->packet_count; + ofe->byte_count = rule->byte_count; + + return buf; +} + +static void +send_flow_exp(struct ofproto *p, struct rule *rule, + long long int now, uint8_t reason) +{ + struct ofconn *ofconn; + struct ofconn *prev; + struct ofpbuf *buf; + + /* We limit the maximum number of queued flow expirations it by accounting + * them under the counter for replies. That works because preventing + * OpenFlow requests from being processed also prevents new flows from + * being added (and expiring). (It also prevents processing OpenFlow + * requests that would not add new flows, so it is imperfect.) */ + + prev = NULL; + LIST_FOR_EACH (ofconn, struct ofconn, node, &p->all_conns) { + if (ofconn->send_flow_exp && rconn_is_connected(ofconn->rconn)) { + if (prev) { + queue_tx(ofpbuf_clone(buf), prev, ofconn->reply_counter); + } else { + buf = compose_flow_exp(rule, now, reason); + } + prev = ofconn; + } + } + if (prev) { + queue_tx(buf, prev, ofconn->reply_counter); + } +} + +static void +uninstall_idle_flow(struct ofproto *ofproto, struct rule *rule) +{ + assert(rule->installed); + assert(!rule->cr.wc.wildcards); + + if (rule->super) { + rule_remove(ofproto, rule); + } else { + rule_uninstall(ofproto, rule); + } +} + +static void +expire_rule(struct cls_rule *cls_rule, void *p_) +{ + struct ofproto *p = p_; + struct rule *rule = rule_from_cls_rule(cls_rule); + long long int hard_expire, idle_expire, expire, now; + + hard_expire = (rule->hard_timeout + ? rule->created + rule->hard_timeout * 1000 + : LLONG_MAX); + idle_expire = (rule->idle_timeout + && (rule->super || list_is_empty(&rule->list)) + ? rule->used + rule->idle_timeout * 1000 + : LLONG_MAX); + expire = MIN(hard_expire, idle_expire); + if (expire == LLONG_MAX) { + if (rule->installed && time_msec() >= rule->used + 5000) { + uninstall_idle_flow(p, rule); + } + return; + } + + now = time_msec(); + if (now < expire) { + if (rule->installed && now >= rule->used + 5000) { + uninstall_idle_flow(p, rule); + } + return; + } + + COVERAGE_INC(ofproto_expired); + if (rule->cr.wc.wildcards) { + /* Update stats. (This code will be a no-op if the rule expired + * due to an idle timeout, because in that case the rule has no + * subrules left.) */ + struct rule *subrule, *next; + LIST_FOR_EACH_SAFE (subrule, next, struct rule, list, &rule->list) { + rule_remove(p, subrule); + } + } + + send_flow_exp(p, rule, now, + (now >= hard_expire + ? OFPER_HARD_TIMEOUT : OFPER_IDLE_TIMEOUT)); + rule_remove(p, rule); +} + +static void +update_used(struct ofproto *p) +{ + struct odp_flow *flows; + size_t n_flows; + size_t i; + int error; + + error = dpif_flow_list_all(&p->dpif, &flows, &n_flows); + if (error) { + return; + } + + for (i = 0; i < n_flows; i++) { + struct odp_flow *f = &flows[i]; + struct rule *rule; + + rule = rule_from_cls_rule( + classifier_find_rule_exactly(&p->cls, &f->key, 0, UINT16_MAX)); + if (!rule || !rule->installed) { + COVERAGE_INC(ofproto_unexpected_rule); + dpif_flow_del(&p->dpif, f); + continue; + } + + update_time(rule, &f->stats); + rule_account(p, rule, f->stats.n_bytes); + } + free(flows); +} + +static void +do_send_packet_in(struct ofconn *ofconn, uint32_t buffer_id, + const struct ofpbuf *packet, int send_len) +{ + struct ofp_packet_in *opi; + struct ofpbuf payload, *buf; + struct odp_msg *msg; + + msg = packet->data; + payload.data = msg + 1; + payload.size = msg->length - sizeof *msg; + + send_len = MIN(send_len, payload.size); + buf = ofpbuf_new(sizeof *opi + send_len); + opi = put_openflow_xid(offsetof(struct ofp_packet_in, data), + OFPT_PACKET_IN, 0, buf); + opi->buffer_id = htonl(buffer_id); + opi->total_len = htons(payload.size); + opi->in_port = htons(odp_port_to_ofp_port(msg->port)); + opi->reason = msg->type == _ODPL_ACTION_NR ? OFPR_ACTION : OFPR_NO_MATCH; + ofpbuf_put(buf, payload.data, MIN(send_len, payload.size)); + update_openflow_length(buf); + rconn_send_with_limit(ofconn->rconn, buf, ofconn->packet_in_counter, 100); +} + +static void +send_packet_in_action(struct ofpbuf *packet, void *p_) +{ + struct ofproto *p = p_; + struct ofconn *ofconn; + struct odp_msg *msg; + + msg = packet->data; + LIST_FOR_EACH (ofconn, struct ofconn, node, &p->all_conns) { + if (ofconn == p->controller || ofconn->miss_send_len) { + do_send_packet_in(ofconn, UINT32_MAX, packet, msg->arg); + } + } + ofpbuf_delete(packet); +} + +static void +send_packet_in_miss(struct ofpbuf *packet, void *p_) +{ + struct ofproto *p = p_; + struct ofconn *ofconn; + struct ofpbuf payload; + struct odp_msg *msg; + + msg = packet->data; + payload.data = msg + 1; + payload.size = msg->length - sizeof *msg; + LIST_FOR_EACH (ofconn, struct ofconn, node, &p->all_conns) { + if (ofconn->miss_send_len) { + uint32_t buffer_id = pktbuf_save(ofconn->pktbuf, &payload, + msg->port); + int send_len = (buffer_id != UINT32_MAX ? ofconn->miss_send_len + : UINT32_MAX); + do_send_packet_in(ofconn, buffer_id, packet, send_len); + } + } + ofpbuf_delete(packet); +} + +static uint64_t +pick_datapath_id(struct dpif *dpif, uint64_t fallback_dpid) +{ + char local_name[IF_NAMESIZE]; + uint8_t ea[ETH_ADDR_LEN]; + int error; + + error = dpif_get_name(dpif, local_name, sizeof local_name); + if (!error) { + error = netdev_nodev_get_etheraddr(local_name, ea); + if (!error) { + return eth_addr_to_uint64(ea); + } + VLOG_WARN("could not get MAC address for %s (%s)", + local_name, strerror(error)); + } + + return fallback_dpid; +} + +static uint64_t +pick_fallback_dpid(void) +{ + uint8_t ea[ETH_ADDR_LEN]; + eth_addr_random(ea); + ea[0] = 0x00; /* Set Nicira OUI. */ + ea[1] = 0x23; + ea[2] = 0x20; + return eth_addr_to_uint64(ea); +} + +static bool +default_normal_ofhook_cb(const flow_t *flow, const struct ofpbuf *packet, + struct odp_actions *actions, tag_type *tags, + void *ofproto_) +{ + struct ofproto *ofproto = ofproto_; + int out_port; + + /* Drop frames for reserved multicast addresses. */ + if (eth_addr_is_reserved(flow->dl_dst)) { + return true; + } + + /* Learn source MAC (but don't try to learn from revalidation). */ + if (packet != NULL) { + tag_type rev_tag = mac_learning_learn(ofproto->ml, flow->dl_src, + 0, flow->in_port); + if (rev_tag) { + /* The log messages here could actually be useful in debugging, + * so keep the rate limit relatively high. */ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(30, 300); + VLOG_DBG_RL(&rl, "learned that "ETH_ADDR_FMT" is on port %"PRIu16, + ETH_ADDR_ARGS(flow->dl_src), flow->in_port); + ofproto_revalidate(ofproto, rev_tag); + } + } + + /* Determine output port. */ + out_port = mac_learning_lookup_tag(ofproto->ml, flow->dl_dst, 0, tags); + if (out_port < 0) { + add_output_group_action(actions, DP_GROUP_FLOOD); + } else if (out_port != flow->in_port) { + odp_actions_add(actions, ODPAT_OUTPUT)->output.port = out_port; + } else { + /* Drop. */ + } + + return true; +} + +static const struct ofhooks default_ofhooks = { + NULL, + default_normal_ofhook_cb, + NULL, + NULL +}; diff --git a/secchan/ofproto.h b/secchan/ofproto.h new file mode 100644 index 00000000..6272d279 --- /dev/null +++ b/secchan/ofproto.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef OFPROTO_H +#define OFPROTO_H 1 + +#include +#include +#include +#include "flow.h" +#include "tag.h" + +struct odp_actions; +struct ofhooks; +struct ofproto; +struct svec; + +struct ofexpired { + flow_t flow; + uint64_t packet_count; /* Packets from *expired* subrules. */ + uint64_t byte_count; /* Bytes from *expired* subrules. */ + long long int used; /* Last-used time (0 if never used). */ + long long int created; /* Creation time. */ + uint8_t tcp_flags; /* Bitwise-OR of all TCP flags seen. */ + uint8_t ip_tos; /* Last-seen IP type-of-service. */ +}; + +int ofproto_create(const char *datapath, const struct ofhooks *, void *aux, + struct ofproto **ofprotop); +void ofproto_destroy(struct ofproto *); +int ofproto_run(struct ofproto *); +int ofproto_run1(struct ofproto *); +int ofproto_run2(struct ofproto *, bool revalidate_all); +void ofproto_wait(struct ofproto *); +bool ofproto_is_alive(const struct ofproto *); + +/* Configuration. */ +void ofproto_set_datapath_id(struct ofproto *, uint64_t datapath_id); +void ofproto_set_mgmt_id(struct ofproto *, uint64_t mgmt_id); +void ofproto_set_probe_interval(struct ofproto *, int probe_interval); +void ofproto_set_max_backoff(struct ofproto *, int max_backoff); +void ofproto_set_desc(struct ofproto *, + const char *manufacturer, const char *hardware, + const char *software, const char *serial); +int ofproto_set_in_band(struct ofproto *, bool in_band); +int ofproto_set_discovery(struct ofproto *, bool discovery, + const char *accept_controller_re, + bool update_resolv_conf); +int ofproto_set_controller(struct ofproto *, const char *controller); +int ofproto_set_listeners(struct ofproto *, const struct svec *listeners); +int ofproto_set_snoops(struct ofproto *, const struct svec *snoops); +int ofproto_set_netflow(struct ofproto *, const struct svec *collectors, + uint8_t engine_type, uint8_t engine_id, bool add_id_to_iface); +void ofproto_set_failure(struct ofproto *, bool fail_open); +void ofproto_set_rate_limit(struct ofproto *, int rate_limit, int burst_limit); +int ofproto_set_stp(struct ofproto *, bool enable_stp); +int ofproto_set_remote_execution(struct ofproto *, const char *command_acl, + const char *command_dir); + +/* Configuration querying. */ +uint64_t ofproto_get_datapath_id(const struct ofproto *); +int ofproto_get_probe_interval(const struct ofproto *); +int ofproto_get_max_backoff(const struct ofproto *); +bool ofproto_get_in_band(const struct ofproto *); +bool ofproto_get_discovery(const struct ofproto *); +const char *ofproto_get_controller(const struct ofproto *); +void ofproto_get_listeners(const struct ofproto *, struct svec *); +void ofproto_get_snoops(const struct ofproto *, struct svec *); + +/* Functions for use by ofproto implementation modules, not by clients. */ +int ofproto_send_packet(struct ofproto *, const flow_t *, + const union ofp_action *, size_t n_actions, + const struct ofpbuf *); +void ofproto_add_flow(struct ofproto *, const flow_t *, uint32_t wildcards, + unsigned int priority, + const union ofp_action *, size_t n_actions, + int idle_timeout); +void ofproto_delete_flow(struct ofproto *, const flow_t *, uint32_t wildcards, + unsigned int priority); +void ofproto_flush_flows(struct ofproto *); + +/* Hooks for ovs-vswitchd. */ +struct ofhooks { + void (*port_changed_cb)(enum ofp_port_reason, const struct ofp_phy_port *, + void *aux); + bool (*normal_cb)(const flow_t *, const struct ofpbuf *packet, + struct odp_actions *, tag_type *, void *aux); + void (*account_flow_cb)(const flow_t *, const union odp_action *, + size_t n_actions, unsigned long long int n_bytes, + void *aux); + void (*account_checkpoint_cb)(void *aux); +}; +void ofproto_revalidate(struct ofproto *, tag_type); +struct tag_set *ofproto_get_revalidate_set(struct ofproto *); + +#endif /* ofproto.h */ diff --git a/secchan/pinsched.c b/secchan/pinsched.c new file mode 100644 index 00000000..910b7a2e --- /dev/null +++ b/secchan/pinsched.c @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "pinsched.h" +#include +#include +#include "ofpbuf.h" +#include "openflow/openflow.h" +#include "poll-loop.h" +#include "port-array.h" +#include "queue.h" +#include "random.h" +#include "rconn.h" +#include "status.h" +#include "timeval.h" +#include "vconn.h" + +struct pinsched { + /* Client-supplied parameters. */ + int rate_limit; /* Packets added to bucket per second. */ + int burst_limit; /* Maximum token bucket size, in packets. */ + + /* One queue per physical port. */ + struct port_array queues; /* Array of "struct ovs_queue *". */ + int n_queued; /* Sum over queues[*].n. */ + unsigned int last_tx_port; /* Last port checked in round-robin. */ + + /* Token bucket. + * + * It costs 1000 tokens to send a single packet_in message. A single token + * per message would be more straightforward, but this choice lets us avoid + * round-off error in refill_bucket()'s calculation of how many tokens to + * add to the bucket, since no division step is needed. */ + long long int last_fill; /* Time at which we last added tokens. */ + int tokens; /* Current number of tokens. */ + + /* Transmission queue. */ + int n_txq; /* No. of packets waiting in rconn for tx. */ + + /* Statistics reporting. */ + unsigned long long n_normal; /* # txed w/o rate limit queuing. */ + unsigned long long n_limited; /* # queued for rate limiting. */ + unsigned long long n_queue_dropped; /* # dropped due to queue overflow. */ + + /* Switch status. */ + struct status_category *ss_cat; +}; + +static struct ofpbuf * +dequeue_packet(struct pinsched *ps, struct ovs_queue *q, + unsigned int port_no) +{ + struct ofpbuf *packet = queue_pop_head(q); + if (!q->n) { + free(q); + port_array_set(&ps->queues, port_no, NULL); + } + ps->n_queued--; + return packet; +} + +/* Drop a packet from the longest queue in 'ps'. */ +static void +drop_packet(struct pinsched *ps) +{ + struct ovs_queue *longest; /* Queue currently selected as longest. */ + int n_longest; /* # of queues of same length as 'longest'. */ + unsigned int longest_port_no; + unsigned int port_no; + struct ovs_queue *q; + + ps->n_queue_dropped++; + + longest = port_array_first(&ps->queues, &port_no); + longest_port_no = port_no; + n_longest = 1; + while ((q = port_array_next(&ps->queues, &port_no)) != NULL) { + if (longest->n < q->n) { + longest = q; + n_longest = 1; + } else if (longest->n == q->n) { + n_longest++; + + /* Randomly select one of the longest queues, with a uniform + * distribution (Knuth algorithm 3.4.2R). */ + if (!random_range(n_longest)) { + longest = q; + longest_port_no = port_no; + } + } + } + + /* FIXME: do we want to pop the tail instead? */ + ofpbuf_delete(dequeue_packet(ps, longest, longest_port_no)); +} + +/* Remove and return the next packet to transmit (in round-robin order). */ +static struct ofpbuf * +get_tx_packet(struct pinsched *ps) +{ + struct ovs_queue *q = port_array_next(&ps->queues, &ps->last_tx_port); + if (!q) { + q = port_array_first(&ps->queues, &ps->last_tx_port); + } + return dequeue_packet(ps, q, ps->last_tx_port); +} + +/* Add tokens to the bucket based on elapsed time. */ +static void +refill_bucket(struct pinsched *ps) +{ + long long int now = time_msec(); + long long int tokens = (now - ps->last_fill) * ps->rate_limit + ps->tokens; + if (tokens >= 1000) { + ps->last_fill = now; + ps->tokens = MIN(tokens, ps->burst_limit * 1000); + } +} + +/* Attempts to remove enough tokens from 'ps' to transmit a packet. Returns + * true if successful, false otherwise. (In the latter case no tokens are + * removed.) */ +static bool +get_token(struct pinsched *ps) +{ + if (ps->tokens >= 1000) { + ps->tokens -= 1000; + return true; + } else { + return false; + } +} + +void +pinsched_send(struct pinsched *ps, uint16_t port_no, + struct ofpbuf *packet, pinsched_tx_cb *cb, void *aux) +{ + if (!ps) { + cb(packet, aux); + } else if (!ps->n_queued && get_token(ps)) { + /* In the common case where we are not constrained by the rate limit, + * let the packet take the normal path. */ + ps->n_normal++; + cb(packet, aux); + } else { + /* Otherwise queue it up for the periodic callback to drain out. */ + struct ovs_queue *q; + + /* We are called with a buffer obtained from dpif_recv() that has much + * more allocated space than actual content most of the time. Since + * we're going to store the packet for some time, free up that + * otherwise wasted space. */ + ofpbuf_trim(packet); + + if (ps->n_queued >= ps->burst_limit) { + drop_packet(ps); + } + q = port_array_get(&ps->queues, port_no); + if (!q) { + q = xmalloc(sizeof *q); + queue_init(q); + port_array_set(&ps->queues, port_no, q); + } + queue_push_tail(q, packet); + ps->n_queued++; + ps->n_limited++; + } +} + +static void +pinsched_status_cb(struct status_reply *sr, void *ps_) +{ + struct pinsched *ps = ps_; + + status_reply_put(sr, "normal=%llu", ps->n_normal); + status_reply_put(sr, "limited=%llu", ps->n_limited); + status_reply_put(sr, "queue-dropped=%llu", ps->n_queue_dropped); +} + +void +pinsched_run(struct pinsched *ps, pinsched_tx_cb *cb, void *aux) +{ + if (ps) { + int i; + + /* Drain some packets out of the bucket if possible, but limit the + * number of iterations to allow other code to get work done too. */ + refill_bucket(ps); + for (i = 0; ps->n_queued && get_token(ps) && i < 50; i++) { + cb(get_tx_packet(ps), aux); + } + } +} + +void +pinsched_wait(struct pinsched *ps) +{ + if (ps && ps->n_queued) { + if (ps->tokens >= 1000) { + /* We can transmit more packets as soon as we're called again. */ + poll_immediate_wake(); + } else { + /* We have to wait for the bucket to re-fill. We could calculate + * the exact amount of time here for increased smoothness. */ + poll_timer_wait(TIME_UPDATE_INTERVAL / 2); + } + } +} + +/* Creates and returns a scheduler for sending packet-in messages. */ +struct pinsched * +pinsched_create(int rate_limit, int burst_limit, struct switch_status *ss) +{ + struct pinsched *ps; + + ps = xcalloc(1, sizeof *ps); + port_array_init(&ps->queues); + ps->n_queued = 0; + ps->last_tx_port = PORT_ARRAY_SIZE; + ps->last_fill = time_msec(); + ps->tokens = rate_limit * 100; + ps->n_txq = 0; + ps->n_normal = 0; + ps->n_limited = 0; + ps->n_queue_dropped = 0; + pinsched_set_limits(ps, rate_limit, burst_limit); + + if (ss) { + ps->ss_cat = switch_status_register(ss, "rate-limit", + pinsched_status_cb, ps); + } + + return ps; +} + +void +pinsched_destroy(struct pinsched *ps) +{ + if (ps) { + struct ovs_queue *queue; + unsigned int port_no; + + PORT_ARRAY_FOR_EACH (queue, &ps->queues, port_no) { + queue_destroy(queue); + free(queue); + } + port_array_destroy(&ps->queues); + switch_status_unregister(ps->ss_cat); + free(ps); + } +} + +void +pinsched_set_limits(struct pinsched *ps, int rate_limit, int burst_limit) +{ + if (rate_limit <= 0) { + rate_limit = 1000; + } + if (burst_limit <= 0) { + burst_limit = rate_limit / 4; + } + burst_limit = MAX(burst_limit, 1); + burst_limit = MIN(burst_limit, INT_MAX / 1000); + + ps->rate_limit = rate_limit; + ps->burst_limit = burst_limit; + while (ps->n_queued > burst_limit) { + drop_packet(ps); + } +} diff --git a/secchan/pinsched.h b/secchan/pinsched.h new file mode 100644 index 00000000..e439450f --- /dev/null +++ b/secchan/pinsched.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef PINSCHED_H +#define PINSCHED_H_H 1 + +#include + +struct ofpbuf; +struct switch_status; + +typedef void pinsched_tx_cb(struct ofpbuf *, void *aux); +struct pinsched *pinsched_create(int rate_limit, int burst_limit, + struct switch_status *); +void pinsched_set_limits(struct pinsched *, int rate_limit, int burst_limit); +void pinsched_destroy(struct pinsched *); +void pinsched_send(struct pinsched *, uint16_t port_no, struct ofpbuf *, + pinsched_tx_cb *, void *aux); +void pinsched_run(struct pinsched *, pinsched_tx_cb *, void *aux); +void pinsched_wait(struct pinsched *); + +#endif /* pinsched.h */ diff --git a/secchan/pktbuf.c b/secchan/pktbuf.c new file mode 100644 index 00000000..126c1314 --- /dev/null +++ b/secchan/pktbuf.c @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "pktbuf.h" +#include +#include +#include "coverage.h" +#include "ofpbuf.h" +#include "timeval.h" +#include "util.h" +#include "vconn.h" + +#define THIS_MODULE VLM_pktbuf +#include "vlog.h" + +/* Buffers are identified by a 32-bit opaque ID. We divide the ID + * into a buffer number (low bits) and a cookie (high bits). The buffer number + * is an index into an array of buffers. The cookie distinguishes between + * different packets that have occupied a single buffer. Thus, the more + * buffers we have, the lower-quality the cookie... */ +#define PKTBUF_BITS 8 +#define PKTBUF_MASK (PKTBUF_CNT - 1) +#define PKTBUF_CNT (1u << PKTBUF_BITS) + +#define COOKIE_BITS (32 - PKTBUF_BITS) +#define COOKIE_MAX ((1u << COOKIE_BITS) - 1) + +#define OVERWRITE_MSECS 5000 + +struct packet { + struct ofpbuf *buffer; + uint32_t cookie; + long long int timeout; + uint16_t in_port; +}; + +struct pktbuf { + struct packet packets[PKTBUF_CNT]; + unsigned int buffer_idx; +}; + +int +pktbuf_capacity(void) +{ + return PKTBUF_CNT; +} + +struct pktbuf * +pktbuf_create(void) +{ + return xcalloc(1, sizeof *pktbuf_create()); +} + +void +pktbuf_destroy(struct pktbuf *pb) +{ + if (pb) { + size_t i; + + for (i = 0; i < PKTBUF_CNT; i++) { + ofpbuf_delete(pb->packets[i].buffer); + } + free(pb); + } +} + +uint32_t +pktbuf_save(struct pktbuf *pb, struct ofpbuf *buffer, uint16_t in_port) +{ + struct packet *p = &pb->packets[pb->buffer_idx]; + pb->buffer_idx = (pb->buffer_idx + 1) & PKTBUF_MASK; + if (p->buffer) { + if (time_msec() < p->timeout) { + return UINT32_MAX; + } + ofpbuf_delete(p->buffer); + } + + /* Don't use maximum cookie value since all-1-bits ID is special. */ + if (++p->cookie >= COOKIE_MAX) { + p->cookie = 0; + } + p->buffer = ofpbuf_clone(buffer); + p->timeout = time_msec() + OVERWRITE_MSECS; + p->in_port = in_port; + return (p - pb->packets) | (p->cookie << PKTBUF_BITS); +} + +int +pktbuf_retrieve(struct pktbuf *pb, uint32_t id, struct ofpbuf **bufferp, + uint16_t *in_port) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 20); + struct packet *p; + int error; + + if (!pb) { + VLOG_WARN_RL(&rl, "attempt to send buffered packet via connection " + "without buffers"); + return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_COOKIE); + } + + p = &pb->packets[id & PKTBUF_MASK]; + if (p->cookie == id >> PKTBUF_BITS) { + struct ofpbuf *buffer = p->buffer; + if (buffer) { + *bufferp = buffer; + *in_port = p->in_port; + p->buffer = NULL; + COVERAGE_INC(pktbuf_retrieved); + return 0; + } else { + COVERAGE_INC(pktbuf_reuse_error); + VLOG_WARN_RL(&rl, "attempt to reuse buffer %08"PRIx32, id); + error = ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BUFFER_EMPTY); + } + } else { + COVERAGE_INC(pktbuf_bad_cookie); + VLOG_WARN_RL(&rl, "cookie mismatch: %08"PRIx32" != %08"PRIx32, + id, (id & PKTBUF_MASK) | (p->cookie << PKTBUF_BITS)); + error = ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_COOKIE); + } + *bufferp = NULL; + *in_port = -1; + return error; +} + +void +pktbuf_discard(struct pktbuf *pb, uint32_t id) +{ + struct packet *p = &pb->packets[id & PKTBUF_MASK]; + if (p->cookie == id >> PKTBUF_BITS) { + ofpbuf_delete(p->buffer); + p->buffer = NULL; + } +} diff --git a/secchan/pktbuf.h b/secchan/pktbuf.h new file mode 100644 index 00000000..5ff7cf06 --- /dev/null +++ b/secchan/pktbuf.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef PKTBUF_H +#define PKTBUF_H 1 + +#include + +struct pktbuf; +struct ofpbuf; + +int pktbuf_capacity(void); + +struct pktbuf *pktbuf_create(void); +void pktbuf_destroy(struct pktbuf *); +uint32_t pktbuf_save(struct pktbuf *, struct ofpbuf *buffer, uint16_t in_port); +int pktbuf_retrieve(struct pktbuf *, uint32_t id, struct ofpbuf **bufferp, + uint16_t *in_port); +void pktbuf_discard(struct pktbuf *, uint32_t id); + +#endif /* pktbuf.h */ diff --git a/secchan/secchan.8.in b/secchan/secchan.8.in new file mode 100644 index 00000000..b40842a9 --- /dev/null +++ b/secchan/secchan.8.in @@ -0,0 +1,463 @@ +.TH secchan 8 "March 2009" "Open vSwitch" "Open vSwitch Manual" +.ds PN secchan + +.SH NAME +secchan \- OpenFlow switch implementation + +.SH SYNOPSIS +.B secchan +[\fIoptions\fR] \fIdatapath\fR [\fIcontroller\fR] + +.SH DESCRIPTION +The \fBsecchan\fR program implements an OpenFlow switch using a +flow-based datapath. \fBsecchan\fR connects to an OpenFlow controller +over TCP or SSL. + +The mandatory \fIdatapath\fR argument argument specifies the local datapath +to relay. It takes one of the following forms: + +.so lib/dpif.man + +.PP +The optional \fIcontroller\fR argument specifies how to connect to +the OpenFlow controller. It takes one of the following forms: + +.RS +.TP +\fBssl:\fIhost\fR[\fB:\fIport\fR] +The specified SSL \fIport\fR (default: 6633) on the given remote +\fIhost\fR. The \fB--private-key\fR, \fB--certificate\fR, and +\fB--ca-cert\fR options are mandatory when this form is used. + +.TP +\fBtcp:\fIhost\fR[\fB:\fIport\fR] +The specified TCP \fIport\fR (default: 6633) on the given remote +\fIhost\fR. + +.TP +\fBunix:\fIfile\fR +The Unix domain server socket named \fIfile\fR. +.RE + +.PP +If \fIcontroller\fR is omitted, \fBsecchan\fR attempts to discover the +location of the controller automatically (see below). + +.SS "Contacting the Controller" +The OpenFlow switch must be able to contact the OpenFlow controller +over the network. It can do so in one of two ways: + +.IP out-of-band +In this configuration, OpenFlow traffic uses a network separate from +the data traffic that it controls, that is, the switch does not use +any of the network devices added to the datapath with \fBovs\-dpctl +add\-if\fR in its communication with the controller. + +To use \fBsecchan\fR in a network with out-of-band control, specify +\fB--out-of-band\fR on the \fBsecchan\fR command line. The control +network must be configured separately, before or after \fBsecchan\fR +is started. + +.IP in-band +In this configuration, a single network is used for OpenFlow traffic +and other data traffic, that is, the switch contacts the controller +over one of the network devices added to the datapath with \fBovs\-dpctl +add\-if\fR. This configuration is often more convenient than +out-of-band control, because it is not necessary to maintain two +independent networks. + +In-band control is the default for \fBsecchan\fR, so no special +command-line option is required. + +With in-band control, the location of the controller can be configured +manually or discovered automatically: + +.RS +.IP "controller discovery" +To make \fBsecchan\fR discover the location of the controller +automatically, do not specify the location of the controller on the +\fBsecchan\fR command line. + +In this mode, \fBsecchan\fR will broadcast a DHCP request with vendor +class identifier \fBOpenFlow\fR across the network devices added to +the datapath with \fBovs\-dpctl add\-if\fR. It will accept any valid DHCP +reply that has the same vendor class identifier and includes a +vendor-specific option with code 1 whose contents are a string +specifying the location of the controller in the same format used on +the \fBsecchan\fR command line (e.g. \fBssl:192.168.0.1\fR). + +The DHCP reply may also, optionally, include a vendor-specific option +with code 2 whose contents are a string specifying the URI to the base +of the OpenFlow PKI (e.g. \fBhttp://192.168.0.1/openflow/pki\fR). +This URI is used only for bootstrapping the OpenFlow PKI at initial +switch setup; \fBsecchan\fR does not use it at all. + +The following ISC DHCP server configuration file assigns the IP +address range 192.168.0.20 through 192.168.0.30 to OpenFlow switches +that follow the switch protocol and addresses 192.168.0.1 through +192.168.0.10 to all other DHCP clients: + +default-lease-time 600; +.br +max-lease-time 7200; +.br +option space openflow; +.br +option openflow.controller-vconn code 1 = text; +.br +option openflow.pki-uri code 2 = text; +.br +class "OpenFlow" { +.br + match if option vendor-class-identifier = "OpenFlow"; +.br + vendor-option-space openflow; +.br + option openflow.controller-vconn "tcp:192.168.0.10"; +.br + option openflow.pki-uri "http://192.168.0.10/openflow/pki"; +.br + option vendor-class-identifier "OpenFlow"; +.br +} +.br +subnet 192.168.0.0 netmask 255.255.255.0 { +.br + pool { +.br + allow members of "OpenFlow"; +.br + range 192.168.0.20 192.168.0.30; +.br + } +.br + pool { +.br + deny members of "OpenFlow"; +.br + range 192.168.0.1 192.168.0.10; +.br + } +.br +} +.br + +.IP "manual configuration" +To configure in-band control manually, specify the location of the +controller on the \fBsecchan\fR command line as the \fIcontroller\fR +argument. You must also configure the network device for the OpenFlow +``local port'' to allow \fBsecchan\fR to connect to that controller. +The OpenFlow local port is a virtual network port that \fBsecchan\fR +bridges to the physical switch ports. The name of the local port for +a given \fIdatapath\fR may be seen by running \fBovs\-dpctl show +\fIdatapath\fR; the local port is listed as port 0 in \fBshow\fR's +output. + +.IP +Before \fBsecchan\fR starts, the local port network device is not +bridged to any physical network, so the next step depends on whether +connectivity is required to configure the device's IP address. If the +switch has a static IP address, you may configure its IP address now +with a command such as +.B ifconfig of0 192.168.1.1 +and then invoke \fBsecchan\fR. + +On the other hand, if the switch does not have a static IP address, +e.g. it obtains its IP address dynamically via DHCP, the DHCP client +will not be able to contact the DHCP server until the secure channel +has started up. Thus, start \fBsecchan\fR without configuring +the local port network device, and start the DHCP client afterward. +.RE + +.SH OPTIONS +.SS "Controller Discovery Options" +.TP +\fB--accept-vconn=\fIregex\fR +When \fBsecchan\fR performs controller discovery (see \fBContacting +the Controller\fR, above, for more information about controller +discovery), it validates the controller location obtained via DHCP +with a POSIX extended regular expression. Only controllers whose +names match the regular expression will be accepted. + +The default regular expression is \fBssl:.*\fR (meaning that only SSL +controller connections will be accepted) when any of the SSL +configuration options \fB--private-key\fR, \fB--certificate\fR, or +\fB--ca-cert\fR is specified. The default is \fB.*\fR otherwise +(meaning that any controller will be accepted). + +The \fIregex\fR is implicitly anchored at the beginning of the +controller location string, as if it begins with \fB^\fR. + +When controller discovery is not performed, this option has no effect. + +.TP +\fB--no-resolv-conf\fR +When \fBsecchan\fR performs controller discovery (see \fBContacting +the Controller\fR, above, for more information about controller +discovery), by default it overwrites the system's +\fB/etc/resolv.conf\fR with domain information and DNS servers +obtained via DHCP. If the location of the controller is specified +using a hostname, rather than an IP address, and the network's DNS +servers ever change, this behavior is essential. But because it also +interferes with any administrator or process that manages +\fB/etc/resolv.conf\fR, when this option is specified, \fBsecchan\fR +will not modify \fB/etc/resolv.conf\fR. + +\fBsecchan\fR will only modify \fBresolv.conf\fR if the DHCP response +that it receives specifies one or more DNS servers. + +When controller discovery is not performed, this option has no effect. + +.SS "Networking Options" +.TP +\fB--datapath-id=\fIdpid\fR +Sets \fIdpid\fR, which must consist of exactly 12 hexadecimal digits, +as the datapath ID that the switch will use to identify itself to the +OpenFlow controller. + +If this option is omitted, the default datapath ID is taken from the +Ethernet address of the datapath's local port (which is typically +randomly generated). + +.TP +\fB--mgmt-id=\fImgmtid\fR +Sets \fImgmtid\fR, which must consist of exactly 12 hexadecimal +digits, as the switch's management ID. + +If this option is omitted, the management ID defaults to 0, signaling +to the controller that management is supported but not configured. + +.TP +\fB--fail=\fR[\fBopen\fR|\fBclosed\fR] +The controller is, ordinarily, responsible for setting up all flows on +the OpenFlow switch. Thus, if the connection to the controller fails, +no new network connections can be set up. If the connection to the +controller stays down long enough, no packets can pass through the +switch at all. + +If this option is set to \fBopen\fR (the default), \fBsecchan\fR will +take over responsibility for setting up flows in the local datapath +when no message has been received from the controller for three times +the inactivity probe interval (see below), or 45 seconds by default. +In this ``fail open'' mode, \fBsecchan\fR causes the datapath to act +like an ordinary MAC-learning switch. \fBsecchan\fR will continue to +retry connection to the controller in the background and, when the +connection succeeds, it discontinues its fail-open behavior. + +If this option is set to \fBclosed\fR, then \fBsecchan\fR will not +set up flows on its own when the controller connection fails. + +.TP +\fB--inactivity-probe=\fIsecs\fR +When the secure channel is connected to the controller, the secure +channel waits for a message to be received from the controller for +\fIsecs\fR seconds before it sends a inactivity probe to the +controller. After sending the inactivity probe, if no response is +received for an additional \fIsecs\fR seconds, the secure channel +assumes that the connection has been broken and attempts to reconnect. +The default is 15 seconds, and the minimum value is 5 seconds. + +When fail-open mode is configured, changing the inactivity probe +interval also changes the interval before entering fail-open mode (see +above). + +.TP +\fB--max-idle=\fIsecs\fR|\fBpermanent\fR +Sets \fIsecs\fR as the number of seconds that a flow set up by the +secure channel will remain in the switch's flow table without any +matching packets being seen. If \fBpermanent\fR is specified, which +is not recommended, flows set up by the secure channel will never +expire. The default is 15 seconds. + +Most flows are set up by the OpenFlow controller, not by the secure +channel. This option affects only the following flows, which the +secure channel sets up itself: + +.RS +.IP \(bu +When \fB--fail=open\fR is specified, flows set up when the secure +channel has not been able to contact the controller for the configured +fail-open delay. + +.IP \(bu +When in-band control is in use, flows set up to bootstrap contacting +the controller (see \fBContacting the Controller\fR, above, for +more information about in-band control). +.RE + +.IP +As a result, when both \fB--fail=closed\fR and \fB--out-of-band\fR are +specified, this option has no effect. + +.TP +\fB--max-backoff=\fIsecs\fR +Sets the maximum time between attempts to connect to the controller to +\fIsecs\fR, which must be at least 1. The actual interval between +connection attempts starts at 1 second and doubles on each failing +attempt until it reaches the maximum. The default maximum backoff +time is 15 seconds. + +.TP +\fB-l\fR, \fB--listen=\fImethod\fR +Configures the switch to additionally listen for incoming OpenFlow +connections for switch management with \fBovs\-ofctl\fR. The \fImethod\fR +must be given as one of the passive OpenFlow connection methods listed +below. This option may be specified multiple times to listen to +multiple connection methods. + +.RS +.TP +\fBpssl:\fR[\fIport\fR] +Listens for SSL connections on \fIport\fR (default: 6633). The +\fB--private-key\fR, \fB--certificate\fR, and \fB--ca-cert\fR options +are mandatory when this form is used. + +.TP +\fBptcp:\fR[\fIport\fR] +Listens for TCP connections on \fIport\fR (default: 6633). + +.TP +\fBpunix:\fIfile\fR +Listens for connections on Unix domain server socket named \fIfile\fR. +.RE + +.TP +\fB--snoop=\fImethod\fR +Configures the switch to additionally listen for incoming OpenFlow +connections for controller connection snooping. The \fImethod\fR must +be given as one of the passive OpenFlow connection methods listed +under the \fB--listen\fR option above. This option may be specified +multiple times to listen to multiple connection methods. + +If \fBovs\-ofctl monitor\fR is used to connect to \fImethod\fR specified on +\fB--snoop\fR, it will display all the OpenFlow messages traveling +between the switch and its controller on the primary OpenFlow +connection. This can be useful for debugging switch and controller +problems. + +.TP +\fB--in-band\fR, \fB--out-of-band\fR +Configures \fBsecchan\fR to operate in in-band or out-of-band control +mode (see \fBContacting the Controller\fR above). When neither option +is given, the default is in-band control. + +.TP +\fB--netflow=\fIhost\fB:\fIport\fR +Configures the given UDP \fIport\fR on the specified IP \fIhost\fR as +a recipient of NetFlow messages for expired flows. + +This option may be specified multiple times to configure additional +NetFlow collectors. + +.SS "Rate-Limiting Options" + +These options configure how the switch applies a ``token bucket'' to +limit the rate at which packets in unknown flows are forwarded to an +OpenFlow controller for flow-setup processing. This feature prevents +a single OpenFlow switch from overwhelming a controller. + +.TP +\fB--rate-limit\fR[\fB=\fIrate\fR] +. +Limits the maximum rate at which packets will be forwarded to the +OpenFlow controller to \fIrate\fR packets per second. If \fIrate\fR +is not specified then the default of 1,000 packets per second is used. + +If \fB--rate-limit\fR is not used, then the switch does not limit the +rate at which packets are forwarded to the controller. + +.TP +\fB--burst-limit=\fIburst\fR +. +Sets the maximum number of unused packet credits that the switch will +allow to accumulate during time in which no packets are being +forwarded to the OpenFlow controller to \fIburst\fR (measured in +packets). The default \fIburst\fR is one-quarter of the \fIrate\fR +specified on \fB--rate-limit\fR. + +This option takes effect only when \fB--rate-limit\fR is also specified. + +.SS "Remote Command Execution Options" + +.TP +\fB--command-acl=\fR[\fB!\fR]\fIglob\fR[\fB,\fR[\fB!\fR]\fIglob\fR...] +Configures the commands that remote OpenFlow connections are allowed +to invoke using (e.g.) \fBovs\-ofctl execute\fR. The argument is a +comma-separated sequence of shell glob patterns. A glob pattern +specified without a leading \fB!\fR is a ``whitelist'' that specifies +a set of commands that are that may be invoked, whereas a pattern that +does begin with \fB!\fR is a ``blacklist'' that specifies commands +that may not be invoked. To be permitted, a command name must be +whitelisted and must not be blacklisted; +e.g. \fB--command-acl=up*,!upgrade\fR would allow any command whose name +begins with \fBup\fR except for the command named \fBupgrade\fR. +Command names that include characters other than upper- and lower-case +English letters, digits, and the underscore and hyphen characters are +unconditionally disallowed. + +When the whitelist and blacklist permit a command name, \fBsecchan\fR +looks for a program with the same name as the command in the commands +directory (see below). Other directories are not searched. + +.TP +\fB--command-dir=\fIdirectory\fR +Sets the directory searched for remote command execution to +\fBdirectory\fR. The default directory is +\fB@pkgdatadir@/commands\fR. + +.SS "Daemon Options" +.so lib/daemon.man + +.SS "Public Key Infrastructure Options" + +.TP +\fB-p\fR, \fB--private-key=\fIprivkey.pem\fR +Specifies a PEM file containing the private key used as the switch's +identity for SSL connections to the controller. + +.TP +\fB-c\fR, \fB--certificate=\fIcert.pem\fR +Specifies a PEM file containing a certificate, signed by the +controller's certificate authority (CA), that certifies the switch's +private key to identify a trustworthy switch. + +.TP +\fB-C\fR, \fB--ca-cert=\fIcacert.pem\fR +Specifies a PEM file containing the CA certificate used to verify that +the switch is connected to a trustworthy controller. + +.TP +\fB--bootstrap-ca-cert=\fIcacert.pem\fR +When \fIcacert.pem\fR exists, this option has the same effect as +\fB-C\fR or \fB--ca-cert\fR. If it does not exist, then \fBsecchan\fR +will attempt to obtain the CA certificate from the controller on its +first SSL connection and save it to the named PEM file. If it is +successful, it will immediately drop the connection and reconnect, and +from then on all SSL connections must be authenticated by a +certificate signed by the CA certificate thus obtained. + +\fBThis option exposes the SSL connection to a man-in-the-middle +attack obtaining the initial CA certificate\fR, but it may be useful +for bootstrapping. + +This option is only useful if the controller sends its CA certificate +as part of the SSL certificate chain. The SSL protocol does not +require the controller to send the CA certificate, but +\fBcontroller\fR(8) can be configured to do so with the +\fB--peer-ca-cert\fR option. + +.SS "Logging Options" +.so lib/vlog.man +.SS "Other Options" +.so lib/common.man +.so lib/leak-checker.man + +.SH "SEE ALSO" + +.BR ovs\-appctl (8), +.BR ovs\-controller (8), +.BR ovs\-discover (8), +.BR ovs\-dpctl (8), +.BR ovs\-ofctl (8), +.BR ovs\-pki (8), +.BR ovs\-vswitchd.conf (5) diff --git a/secchan/status.c b/secchan/status.c new file mode 100644 index 00000000..cf67d51c --- /dev/null +++ b/secchan/status.c @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "status.h" +#include +#include +#include +#include +#include +#include "dynamic-string.h" +#include "list.h" +#include "ofpbuf.h" +#include "ofproto.h" +#include "openflow/nicira-ext.h" +#include "rconn.h" +#include "svec.h" +#include "timeval.h" +#include "vconn.h" + +#define THIS_MODULE VLM_status +#include "vlog.h" + +struct status_category { + struct list node; + char *name; + void (*cb)(struct status_reply *, void *aux); + void *aux; +}; + +struct switch_status { + time_t booted; + struct status_category *config_cat; + struct status_category *switch_cat; + struct list categories; +}; + +struct status_reply { + struct status_category *category; + struct ds request; + struct ds output; +}; + +int +switch_status_handle_request(struct switch_status *ss, struct rconn *rconn, + struct nicira_header *request) +{ + struct status_category *c; + struct nicira_header *reply; + struct status_reply sr; + struct ofpbuf *b; + int retval; + + sr.request.string = (void *) (request + 1); + sr.request.length = ntohs(request->header.length) - sizeof *request; + ds_init(&sr.output); + LIST_FOR_EACH (c, struct status_category, node, &ss->categories) { + if (!memcmp(c->name, sr.request.string, + MIN(strlen(c->name), sr.request.length))) { + sr.category = c; + c->cb(&sr, c->aux); + } + } + reply = make_openflow_xid(sizeof *reply + sr.output.length, + OFPT_VENDOR, request->header.xid, &b); + reply->vendor = htonl(NX_VENDOR_ID); + reply->subtype = htonl(NXT_STATUS_REPLY); + memcpy(reply + 1, sr.output.string, sr.output.length); + retval = rconn_send(rconn, b, NULL); + if (retval && retval != EAGAIN) { + VLOG_WARN("send failed (%s)", strerror(retval)); + } + ds_destroy(&sr.output); + return 0; +} + +void +rconn_status_cb(struct status_reply *sr, void *rconn_) +{ + struct rconn *rconn = rconn_; + time_t now = time_now(); + + status_reply_put(sr, "name=%s", rconn_get_name(rconn)); + status_reply_put(sr, "state=%s", rconn_get_state(rconn)); + status_reply_put(sr, "backoff=%d", rconn_get_backoff(rconn)); + status_reply_put(sr, "is-connected=%s", + rconn_is_connected(rconn) ? "true" : "false"); + status_reply_put(sr, "sent-msgs=%u", rconn_packets_sent(rconn)); + status_reply_put(sr, "received-msgs=%u", rconn_packets_received(rconn)); + status_reply_put(sr, "attempted-connections=%u", + rconn_get_attempted_connections(rconn)); + status_reply_put(sr, "successful-connections=%u", + rconn_get_successful_connections(rconn)); + status_reply_put(sr, "last-connection=%ld", + (long int) (now - rconn_get_last_connection(rconn))); + status_reply_put(sr, "time-connected=%lu", + rconn_get_total_time_connected(rconn)); + status_reply_put(sr, "state-elapsed=%u", rconn_get_state_elapsed(rconn)); +} + +static void +config_status_cb(struct status_reply *sr, void *ofproto_) +{ + const struct ofproto *ofproto = ofproto_; + struct svec listeners; + int probe_interval, max_backoff; + size_t i; + + svec_init(&listeners); + ofproto_get_listeners(ofproto, &listeners); + for (i = 0; i < listeners.n; i++) { + status_reply_put(sr, "management%zu=%s", i, listeners.names[i]); + } + svec_destroy(&listeners); + + probe_interval = ofproto_get_probe_interval(ofproto); + if (probe_interval) { + status_reply_put(sr, "probe-interval=%d", probe_interval); + } + + max_backoff = ofproto_get_max_backoff(ofproto); + if (max_backoff) { + status_reply_put(sr, "max-backoff=%d", max_backoff); + } +} + +static void +switch_status_cb(struct status_reply *sr, void *ss_) +{ + struct switch_status *ss = ss_; + time_t now = time_now(); + + status_reply_put(sr, "now=%ld", (long int) now); + status_reply_put(sr, "uptime=%ld", (long int) (now - ss->booted)); + status_reply_put(sr, "pid=%ld", (long int) getpid()); +} + +struct switch_status * +switch_status_create(const struct ofproto *ofproto) +{ + struct switch_status *ss = xcalloc(1, sizeof *ss); + ss->booted = time_now(); + list_init(&ss->categories); + ss->config_cat = switch_status_register(ss, "config", config_status_cb, + (void *) ofproto); + ss->switch_cat = switch_status_register(ss, "switch", switch_status_cb, + ss); + return ss; +} + +void +switch_status_destroy(struct switch_status *ss) +{ + if (ss) { + /* Orphan any remaining categories, so that unregistering them later + * won't write to bad memory. */ + struct status_category *c, *next; + LIST_FOR_EACH_SAFE (c, next, + struct status_category, node, &ss->categories) { + list_init(&c->node); + } + switch_status_unregister(ss->config_cat); + switch_status_unregister(ss->switch_cat); + free(ss); + } +} + +struct status_category * +switch_status_register(struct switch_status *ss, + const char *category, + status_cb_func *cb, void *aux) +{ + struct status_category *c = xmalloc(sizeof *c); + c->cb = cb; + c->aux = aux; + c->name = xstrdup(category); + list_push_back(&ss->categories, &c->node); + return c; +} + +void +switch_status_unregister(struct status_category *c) +{ + if (c) { + if (!list_is_empty(&c->node)) { + list_remove(&c->node); + } + free(c->name); + free(c); + } +} + +void +status_reply_put(struct status_reply *sr, const char *content, ...) +{ + size_t old_length = sr->output.length; + size_t added; + va_list args; + + /* Append the status reply to the output. */ + ds_put_format(&sr->output, "%s.", sr->category->name); + va_start(args, content); + ds_put_format_valist(&sr->output, content, args); + va_end(args); + if (ds_last(&sr->output) != '\n') { + ds_put_char(&sr->output, '\n'); + } + + /* Drop what we just added if it doesn't match the request. */ + added = sr->output.length - old_length; + if (added < sr->request.length + || memcmp(&sr->output.string[old_length], + sr->request.string, sr->request.length)) { + ds_truncate(&sr->output, old_length); + } +} diff --git a/secchan/status.h b/secchan/status.h new file mode 100644 index 00000000..e7b73068 --- /dev/null +++ b/secchan/status.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef STATUS_H +#define STATUS_H 1 + +#include "compiler.h" + +struct nicira_header; +struct rconn; +struct secchan; +struct ofproto; +struct status_reply; + +struct switch_status *switch_status_create(const struct ofproto *); +void switch_status_destroy(struct switch_status *); + +int switch_status_handle_request(struct switch_status *, struct rconn *, + struct nicira_header *); + +typedef void status_cb_func(struct status_reply *, void *aux); +struct status_category *switch_status_register(struct switch_status *, + const char *category, + status_cb_func *, void *aux); +void switch_status_unregister(struct status_category *); + +void status_reply_put(struct status_reply *, const char *, ...) + PRINTF_FORMAT(2, 3); + +void rconn_status_cb(struct status_reply *, void *rconn_); + +#endif /* status.h */ diff --git a/soexpand.pl b/soexpand.pl new file mode 100755 index 00000000..4e130056 --- /dev/null +++ b/soexpand.pl @@ -0,0 +1,26 @@ +use strict; +use warnings; +use Getopt::Long; + +my ($exit_code) = 0; +my (@include_dirs); +Getopt::Long::Configure ("bundling"); +GetOptions("I|include=s" => \@include_dirs) or exit(1); +@include_dirs = ('.') if !@include_dirs; +OUTER: while () { + if (my ($name) = /^\.so (\S+)$/) { + foreach my $dir (@include_dirs, '.') { + if (open(INNER, "$dir/$name")) { + while () { + print $_; + } + close(INNER); + next OUTER; + } + } + print STDERR "$name not found in: ", join(' ', @include_dirs), "\n"; + $exit_code = 1; + } + print $_; +} +exit $exit_code; diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 00000000..11125035 --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1,10 @@ +/Makefile +/Makefile.in +/test-classifier +/test-dhcp-client +/test-flows +/test-hash +/test-hmap +/test-list +/test-stp +/test-type-props diff --git a/tests/automake.mk b/tests/automake.mk new file mode 100644 index 00000000..0508144f --- /dev/null +++ b/tests/automake.mk @@ -0,0 +1,56 @@ +TESTS += tests/test-classifier +noinst_PROGRAMS += tests/test-classifier +tests_test_classifier_SOURCES = tests/test-classifier.c +tests_test_classifier_LDADD = lib/libopenvswitch.a + +TESTS += tests/test-flows.sh +noinst_PROGRAMS += tests/test-flows +tests_test_flows_SOURCES = tests/test-flows.c +tests_test_flows_LDADD = lib/libopenvswitch.a +dist_check_SCRIPTS = tests/test-flows.sh tests/flowgen.pl + +TESTS += tests/test-hash +noinst_PROGRAMS += tests/test-hash +tests_test_hash_SOURCES = tests/test-hash.c +tests_test_hash_LDADD = lib/libopenvswitch.a + +TESTS += tests/test-hmap +noinst_PROGRAMS += tests/test-hmap +tests_test_hmap_SOURCES = tests/test-hmap.c +tests_test_hmap_LDADD = lib/libopenvswitch.a + +TESTS += tests/test-list +noinst_PROGRAMS += tests/test-list +tests_test_list_SOURCES = tests/test-list.c +tests_test_list_LDADD = lib/libopenvswitch.a + +TESTS += tests/test-type-props +noinst_PROGRAMS += tests/test-type-props +tests_test_type_props_SOURCES = tests/test-type-props.c + +noinst_PROGRAMS += tests/test-dhcp-client +tests_test_dhcp_client_SOURCES = tests/test-dhcp-client.c +tests_test_dhcp_client_LDADD = lib/libopenvswitch.a $(FAULT_LIBS) + +TESTS += tests/test-stp.sh +EXTRA_DIST += tests/test-stp.sh +noinst_PROGRAMS += tests/test-stp + +tests_test_stp_SOURCES = tests/test-stp.c +tests_test_stp_LDADD = lib/libopenvswitch.a +stp_files = \ + tests/test-stp-ieee802.1d-1998 \ + tests/test-stp-ieee802.1d-2004-fig17.4 \ + tests/test-stp-ieee802.1d-2004-fig17.6 \ + tests/test-stp-ieee802.1d-2004-fig17.7 \ + tests/test-stp-iol-op-1.1 \ + tests/test-stp-iol-op-1.4 \ + tests/test-stp-iol-op-3.1 \ + tests/test-stp-iol-op-3.3 \ + tests/test-stp-iol-io-1.1 \ + tests/test-stp-iol-io-1.2 \ + tests/test-stp-iol-io-1.4 \ + tests/test-stp-iol-io-1.5 +TESTS_ENVIRONMENT += stp_files='$(stp_files)' + +EXTRA_DIST += $(stp_files) diff --git a/tests/flowgen.pl b/tests/flowgen.pl new file mode 100755 index 00000000..6325f1fe --- /dev/null +++ b/tests/flowgen.pl @@ -0,0 +1,224 @@ +#! /usr/bin/perl + +use strict; +use warnings; + +open(FLOWS, ">&=3");# or die "failed to open fd 3 for writing: $!\n"; +open(PACKETS, ">&=4");# or die "failed to open fd 4 for writing: $!\n"; + +# Print pcap file header. +print PACKETS pack('NnnNNNN', + 0xa1b2c3d4, # magic number + 2, # major version + 4, # minor version + 0, # time zone offset + 0, # time stamp accuracy + 1518, # snaplen + 1); # Ethernet + +output(DL_HEADER => '802.2'); + +for my $dl_header qw(802.2+SNAP Ethernet) { + my %a = (DL_HEADER => $dl_header); + for my $dl_vlan qw(none zero nonzero) { + my %b = (%a, DL_VLAN => $dl_vlan); + + # Non-IP case. + output(%b, DL_TYPE => 'non-ip'); + + for my $ip_options qw(no yes) { + my %c = (%b, DL_TYPE => 'ip', IP_OPTIONS => $ip_options); + for my $ip_fragment qw(no first middle last) { + my %d = (%c, IP_FRAGMENT => $ip_fragment); + for my $tp_proto qw(TCP TCP+options UDP ICMP other) { + output(%d, TP_PROTO => $tp_proto); + } + } + } + } +} + +sub output { + my (%attrs) = @_; + + # Compose flow. + my (%flow); + $flow{DL_SRC} = "00:02:e3:0f:80:a4"; + $flow{DL_DST} = "00:1a:92:40:ac:05"; + $flow{NW_PROTO} = 0; + $flow{NW_SRC} = '0.0.0.0'; + $flow{NW_DST} = '0.0.0.0'; + $flow{TP_SRC} = 0; + $flow{TP_DST} = 0; + if (defined($attrs{DL_VLAN})) { + my (%vlan_map) = ('none' => 0xffff, + 'zero' => 0, + 'nonzero' => 0x0123); + $flow{DL_VLAN} = $vlan_map{$attrs{DL_VLAN}}; + } else { + $flow{DL_VLAN} = 0xffff; # OFP_VLAN_NONE + } + if ($attrs{DL_HEADER} eq '802.2') { + $flow{DL_TYPE} = 0x5ff; # OFP_DL_TYPE_NOT_ETH_TYPE + } elsif ($attrs{DL_TYPE} eq 'ip') { + $flow{DL_TYPE} = 0x0800; # ETH_TYPE_IP + $flow{NW_SRC} = '10.0.2.15'; + $flow{NW_DST} = '192.168.1.20'; + if ($attrs{TP_PROTO} eq 'other') { + $flow{NW_PROTO} = 42; + } elsif ($attrs{TP_PROTO} eq 'TCP' || + $attrs{TP_PROTO} eq 'TCP+options') { + $flow{NW_PROTO} = 6; # IP_TYPE_TCP + $flow{TP_SRC} = 6667; + $flow{TP_DST} = 9998; + } elsif ($attrs{TP_PROTO} eq 'UDP') { + $flow{NW_PROTO} = 17; # IP_TYPE_UDP + $flow{TP_SRC} = 1112; + $flow{TP_DST} = 2223; + } elsif ($attrs{TP_PROTO} eq 'ICMP') { + $flow{NW_PROTO} = 1; # IP_TYPE_ICMP + $flow{TP_SRC} = 8; # echo request + $flow{TP_DST} = 0; # code + } else { + die; + } + if ($attrs{IP_FRAGMENT} ne 'no') { + $flow{TP_SRC} = $flow{TP_DST} = 0; + } + } elsif ($attrs{DL_TYPE} eq 'non-ip') { + $flow{DL_TYPE} = 0x5678; + } else { + die; + } + + # Compose packet. + my $packet = ''; + $packet .= pack_ethaddr($flow{DL_DST}); + $packet .= pack_ethaddr($flow{DL_SRC}); + $packet .= pack('n', 0) if $attrs{DL_HEADER} =~ /^802.2/; + if ($attrs{DL_HEADER} eq '802.2') { + $packet .= pack('CCC', 0x42, 0x42, 0x03); # LLC for 802.1D STP. + } else { + if ($attrs{DL_HEADER} eq '802.2+SNAP') { + $packet .= pack('CCC', 0xaa, 0xaa, 0x03); # LLC for SNAP. + $packet .= pack('CCC', 0, 0, 0); # SNAP OUI. + } + if ($attrs{DL_VLAN} ne 'none') { + $packet .= pack('nn', 0x8100, $flow{DL_VLAN}); + } + $packet .= pack('n', $flow{DL_TYPE}); + if ($attrs{DL_TYPE} eq 'ip') { + my $ip = pack('CCnnnCCnNN', + (4 << 4) | 5, # version, hdrlen + 0, # type of service + 0, # total length (filled in later) + 65432, # id + 0, # frag offset + 64, # ttl + $flow{NW_PROTO}, # protocol + 0, # checksum + 0x0a00020f, # source + 0xc0a80114); # dest + if ($attrs{IP_OPTIONS} eq 'yes') { + substr($ip, 0, 1) = pack('C', (4 << 4) | 8); + $ip .= pack('CCnnnCCCx', + 130, # type + 11, # length + 0x6bc5, # top secret + 0xabcd, + 0x1234, + 1, + 2, + 3); + } + if ($attrs{IP_FRAGMENT} ne 'no') { + my (%frag_map) = ('first' => 0x2000, # more frags, ofs 0 + 'middle' => 0x2111, # more frags, ofs 0x888 + 'last' => 0x0222); # last frag, ofs 0x1110 + substr($ip, 6, 2) + = pack('n', $frag_map{$attrs{IP_FRAGMENT}}); + } + + if ($attrs{TP_PROTO} =~ '^TCP') { + my $tcp = pack('nnNNnnnn', + $flow{TP_SRC}, # source port + $flow{TP_DST}, # dest port + 87123455, # seqno + 712378912, # ackno + (5 << 12) | 0x02 | 0x10, # hdrlen, SYN, ACK + 5823, # window size + 18923, # checksum + 12893); # urgent pointer + if ($attrs{TP_PROTO} eq 'TCP+options') { + substr($tcp, 12, 2) = pack('n', (6 << 12) | 0x02 | 0x10); + $tcp .= pack('CCn', 2, 4, 1975); # MSS option + } + $tcp .= 'payload'; + $ip .= $tcp; + } elsif ($attrs{TP_PROTO} eq 'UDP') { + my $len = 15; + my $udp = pack('nnnn', $flow{TP_SRC}, $flow{TP_DST}, $len, 0); + $udp .= chr($len) while length($udp) < $len; + $ip .= $udp; + } elsif ($attrs{TP_PROTO} eq 'ICMP') { + $ip .= pack('CCnnn', + 8, # echo request + 0, # code + 0, # checksum + 736, # identifier + 931); # sequence number + } elsif ($attrs{TP_PROTO} eq 'other') { + $ip .= 'other header'; + } else { + die; + } + + substr($ip, 2, 2) = pack('n', length($ip)); + $packet .= $ip; + } + } + substr($packet, 12, 2) = pack('n', length($packet)) + if $attrs{DL_HEADER} =~ /^802.2/; + + print join(' ', map("$_=$attrs{$_}", keys(%attrs))), "\n"; + print join(' ', map("$_=$flow{$_}", keys(%flow))), "\n"; + print "\n"; + + print FLOWS pack('Nn', + 0, # wildcards + 1); # in_port + print FLOWS pack_ethaddr($flow{DL_SRC}); + print FLOWS pack_ethaddr($flow{DL_DST}); + print FLOWS pack('nnCxNNnn', + $flow{DL_VLAN}, + $flow{DL_TYPE}, + $flow{NW_PROTO}, + inet_aton($flow{NW_SRC}), + inet_aton($flow{NW_DST}), + $flow{TP_SRC}, + $flow{TP_DST}); + + print PACKETS pack('NNNN', + 0, # timestamp seconds + 0, # timestamp microseconds + length($packet), # bytes saved + length($packet)), # total length + $packet; +} + +sub pack_ethaddr { + local ($_) = @_; + my $xx = '([0-9a-fA-F][0-9a-fA-F])'; + my (@octets) = /$xx:$xx:$xx:$xx:$xx:$xx/; + @octets == 6 or die $_; + my ($out) = ''; + $out .= pack('C', hex($_)) foreach @octets; + return $out; +} + +sub inet_aton { + local ($_) = @_; + my ($a, $b, $c, $d) = /^(\d+)\.(\d+)\.(\d+)\.(\d+)$/; + defined $d or die $_; + return ($a << 24) | ($b << 16) | ($c << 8) | $d; +} diff --git a/tests/test-classifier.c b/tests/test-classifier.c new file mode 100644 index 00000000..309c4dd6 --- /dev/null +++ b/tests/test-classifier.c @@ -0,0 +1,977 @@ +/* + * Copyright (c) 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* "White box" tests for classifier. + * + * With very few exceptions, these tests obtain complete coverage of every + * basic block and every branch in the classifier implementation, e.g. a clean + * report from "gcov -b". (Covering the exceptions would require finding + * collisions in the hash function used for flow data, etc.) + * + * This test should receive a clean report from "valgrind --leak-check=full": + * it frees every heap block that it allocates. + */ + +#include +#include +#include "classifier.h" +#include +#include +#include "flow.h" +#include +#include "packets.h" + +#undef NDEBUG +#include + +struct test_rule { + int aux; /* Auxiliary data. */ + struct cls_rule cls_rule; /* Classifier rule data. */ +}; + +static struct test_rule * +test_rule_from_cls_rule(const struct cls_rule *rule) +{ + return rule ? CONTAINER_OF(rule, struct test_rule, cls_rule) : NULL; +} + +/* Trivial (linear) classifier. */ +struct tcls { + size_t n_rules; + size_t allocated_rules; + struct test_rule **rules; +}; + +static void +tcls_init(struct tcls *tcls) +{ + tcls->n_rules = 0; + tcls->allocated_rules = 0; + tcls->rules = NULL; +} + +static void +tcls_destroy(struct tcls *tcls) +{ + if (tcls) { + size_t i; + + for (i = 0; i < tcls->n_rules; i++) { + free(tcls->rules[i]); + } + free(tcls->rules); + } +} + +static int +tcls_count_exact(const struct tcls *tcls) +{ + int n_exact; + size_t i; + + n_exact = 0; + for (i = 0; i < tcls->n_rules; i++) { + n_exact += tcls->rules[i]->cls_rule.wc.wildcards == 0; + } + return n_exact; +} + +static bool +tcls_is_empty(const struct tcls *tcls) +{ + return tcls->n_rules == 0; +} + +static struct test_rule * +tcls_insert(struct tcls *tcls, const struct test_rule *rule) +{ + size_t i; + + assert(rule->cls_rule.wc.wildcards || rule->cls_rule.priority == UINT_MAX); + for (i = 0; i < tcls->n_rules; i++) { + const struct cls_rule *pos = &tcls->rules[i]->cls_rule; + if (pos->priority == rule->cls_rule.priority + && pos->wc.wildcards == rule->cls_rule.wc.wildcards + && flow_equal(&pos->flow, &rule->cls_rule.flow)) { + /* Exact match. + * XXX flow_equal should ignore wildcarded fields */ + free(tcls->rules[i]); + tcls->rules[i] = xmemdup(rule, sizeof *rule); + return tcls->rules[i]; + } else if (pos->priority <= rule->cls_rule.priority) { + break; + } + } + + if (tcls->n_rules >= tcls->allocated_rules) { + tcls->rules = x2nrealloc(tcls->rules, &tcls->allocated_rules, + sizeof *tcls->rules); + } + if (i != tcls->n_rules) { + memmove(&tcls->rules[i + 1], &tcls->rules[i], + sizeof *tcls->rules * (tcls->n_rules - i)); + } + tcls->rules[i] = xmemdup(rule, sizeof *rule); + tcls->n_rules++; + return tcls->rules[i]; +} + +static void +tcls_remove(struct tcls *cls, const struct test_rule *rule) +{ + size_t i; + + for (i = 0; i < cls->n_rules; i++) { + struct test_rule *pos = cls->rules[i]; + if (pos == rule) { + free(pos); + memmove(&cls->rules[i], &cls->rules[i + 1], + sizeof *cls->rules * (cls->n_rules - i - 1)); + cls->n_rules--; + return; + } + } + NOT_REACHED(); +} + +static uint32_t +read_uint32(const void *p) +{ + uint32_t x; + memcpy(&x, p, sizeof x); + return x; +} + +static bool +match(const struct cls_rule *wild, const flow_t *fixed) +{ + int f_idx; + + for (f_idx = 0; f_idx < CLS_N_FIELDS; f_idx++) { + const struct cls_field *f = &cls_fields[f_idx]; + void *wild_field = (char *) &wild->flow + f->ofs; + void *fixed_field = (char *) fixed + f->ofs; + + if ((wild->wc.wildcards & f->wildcards) == f->wildcards || + !memcmp(wild_field, fixed_field, f->len)) { + /* Definite match. */ + continue; + } + + if (wild->wc.wildcards & f->wildcards) { + uint32_t test = read_uint32(wild_field); + uint32_t ip = read_uint32(fixed_field); + int shift = (f_idx == CLS_F_IDX_NW_SRC + ? OFPFW_NW_SRC_SHIFT : OFPFW_NW_DST_SHIFT); + uint32_t mask = flow_nw_bits_to_mask(wild->wc.wildcards, shift); + if (!((test ^ ip) & mask)) { + continue; + } + } + + return false; + } + return true; +} + +static struct cls_rule * +tcls_lookup(const struct tcls *cls, const flow_t *flow, int include) +{ + size_t i; + + for (i = 0; i < cls->n_rules; i++) { + struct test_rule *pos = cls->rules[i]; + uint32_t wildcards = pos->cls_rule.wc.wildcards; + if (include & (wildcards ? CLS_INC_WILD : CLS_INC_EXACT) + && match(&pos->cls_rule, flow)) { + return &pos->cls_rule; + } + } + return NULL; +} + +static void +tcls_delete_matches(struct tcls *cls, + const struct cls_rule *target, + int include) +{ + size_t i; + + for (i = 0; i < cls->n_rules; ) { + struct test_rule *pos = cls->rules[i]; + uint32_t wildcards = pos->cls_rule.wc.wildcards; + if (include & (wildcards ? CLS_INC_WILD : CLS_INC_EXACT) + && match(target, &pos->cls_rule.flow)) { + tcls_remove(cls, pos); + } else { + i++; + } + } +} + +#ifdef WORDS_BIGENDIAN +#define HTONL(VALUE) ((uint32_t) (VALUE)) +#define HTONS(VALUE) ((uint32_t) (VALUE)) +#else +#define HTONL(VALUE) (((((uint32_t) (VALUE)) & 0x000000ff) << 24) | \ + ((((uint32_t) (VALUE)) & 0x0000ff00) << 8) | \ + ((((uint32_t) (VALUE)) & 0x00ff0000) >> 8) | \ + ((((uint32_t) (VALUE)) & 0xff000000) >> 24)) +#define HTONS(VALUE) (((((uint16_t) (VALUE)) & 0xff00) >> 8) | \ + ((((uint16_t) (VALUE)) & 0x00ff) << 8)) +#endif + +static uint32_t nw_src_values[] = { HTONL(0xc0a80001), + HTONL(0xc0a04455) }; +static uint32_t nw_dst_values[] = { HTONL(0xc0a80002), + HTONL(0xc0a04455) }; +static uint16_t in_port_values[] = { HTONS(1), HTONS(OFPP_LOCAL) }; +static uint16_t dl_vlan_values[] = { HTONS(101), HTONS(0) }; +static uint16_t dl_type_values[] = { HTONS(ETH_TYPE_IP), HTONS(ETH_TYPE_ARP) }; +static uint16_t tp_src_values[] = { HTONS(49362), HTONS(80) }; +static uint16_t tp_dst_values[] = { HTONS(6667), HTONS(22) }; +static uint8_t dl_src_values[][6] = { { 0x00, 0x02, 0xe3, 0x0f, 0x80, 0xa4 }, + { 0x5e, 0x33, 0x7f, 0x5f, 0x1e, 0x99 } }; +static uint8_t dl_dst_values[][6] = { { 0x4a, 0x27, 0x71, 0xae, 0x64, 0xc1 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff } }; +static uint8_t nw_proto_values[] = { IP_TYPE_TCP, IP_TYPE_ICMP }; + +static void *values[CLS_N_FIELDS][2]; + +static void +init_values(void) +{ + values[CLS_F_IDX_IN_PORT][0] = &in_port_values[0]; + values[CLS_F_IDX_IN_PORT][1] = &in_port_values[1]; + + values[CLS_F_IDX_DL_VLAN][0] = &dl_vlan_values[0]; + values[CLS_F_IDX_DL_VLAN][1] = &dl_vlan_values[1]; + + values[CLS_F_IDX_DL_SRC][0] = dl_src_values[0]; + values[CLS_F_IDX_DL_SRC][1] = dl_src_values[1]; + + values[CLS_F_IDX_DL_DST][0] = dl_dst_values[0]; + values[CLS_F_IDX_DL_DST][1] = dl_dst_values[1]; + + values[CLS_F_IDX_DL_TYPE][0] = &dl_type_values[0]; + values[CLS_F_IDX_DL_TYPE][1] = &dl_type_values[1]; + + values[CLS_F_IDX_NW_SRC][0] = &nw_src_values[0]; + values[CLS_F_IDX_NW_SRC][1] = &nw_src_values[1]; + + values[CLS_F_IDX_NW_DST][0] = &nw_dst_values[0]; + values[CLS_F_IDX_NW_DST][1] = &nw_dst_values[1]; + + values[CLS_F_IDX_NW_PROTO][0] = &nw_proto_values[0]; + values[CLS_F_IDX_NW_PROTO][1] = &nw_proto_values[1]; + + values[CLS_F_IDX_TP_SRC][0] = &tp_src_values[0]; + values[CLS_F_IDX_TP_SRC][1] = &tp_src_values[1]; + + values[CLS_F_IDX_TP_DST][0] = &tp_dst_values[0]; + values[CLS_F_IDX_TP_DST][1] = &tp_dst_values[1]; +} + +#define N_NW_SRC_VALUES ARRAY_SIZE(nw_src_values) +#define N_NW_DST_VALUES ARRAY_SIZE(nw_dst_values) +#define N_IN_PORT_VALUES ARRAY_SIZE(in_port_values) +#define N_DL_VLAN_VALUES ARRAY_SIZE(dl_vlan_values) +#define N_DL_TYPE_VALUES ARRAY_SIZE(dl_type_values) +#define N_TP_SRC_VALUES ARRAY_SIZE(tp_src_values) +#define N_TP_DST_VALUES ARRAY_SIZE(tp_dst_values) +#define N_DL_SRC_VALUES ARRAY_SIZE(dl_src_values) +#define N_DL_DST_VALUES ARRAY_SIZE(dl_dst_values) +#define N_NW_PROTO_VALUES ARRAY_SIZE(nw_proto_values) + +#define N_FLOW_VALUES (N_NW_SRC_VALUES * \ + N_NW_DST_VALUES * \ + N_IN_PORT_VALUES * \ + N_DL_VLAN_VALUES * \ + N_DL_TYPE_VALUES * \ + N_TP_SRC_VALUES * \ + N_TP_DST_VALUES * \ + N_DL_SRC_VALUES * \ + N_DL_DST_VALUES * \ + N_NW_PROTO_VALUES) + +static unsigned int +get_value(unsigned int *x, unsigned n_values) +{ + unsigned int rem = *x % n_values; + *x /= n_values; + return rem; +} + +static struct cls_rule * +lookup_with_include_bits(const struct classifier *cls, + const flow_t *flow, int include) +{ + switch (include) { + case CLS_INC_WILD: + return classifier_lookup_wild(cls, flow); + case CLS_INC_EXACT: + return classifier_lookup_exact(cls, flow); + case CLS_INC_WILD | CLS_INC_EXACT: + return classifier_lookup(cls, flow); + default: + abort(); + } +} + +static void +compare_classifiers(struct classifier *cls, struct tcls *tcls) +{ + unsigned int i; + + assert(classifier_count(cls) == tcls->n_rules); + assert(classifier_count_exact(cls) == tcls_count_exact(tcls)); + for (i = 0; i < N_FLOW_VALUES; i++) { + struct cls_rule *cr0, *cr1; + flow_t flow; + unsigned int x; + int include; + + x = i; + flow.nw_src = nw_src_values[get_value(&x, N_NW_SRC_VALUES)]; + flow.nw_dst = nw_dst_values[get_value(&x, N_NW_DST_VALUES)]; + flow.in_port = in_port_values[get_value(&x, N_IN_PORT_VALUES)]; + flow.dl_vlan = dl_vlan_values[get_value(&x, N_DL_VLAN_VALUES)]; + flow.dl_type = dl_type_values[get_value(&x, N_DL_TYPE_VALUES)]; + flow.tp_src = tp_src_values[get_value(&x, N_TP_SRC_VALUES)]; + flow.tp_dst = tp_dst_values[get_value(&x, N_TP_DST_VALUES)]; + memcpy(flow.dl_src, dl_src_values[get_value(&x, N_DL_SRC_VALUES)], + ETH_ADDR_LEN); + memcpy(flow.dl_dst, dl_dst_values[get_value(&x, N_DL_DST_VALUES)], + ETH_ADDR_LEN); + flow.nw_proto = nw_proto_values[get_value(&x, N_NW_PROTO_VALUES)]; + flow.reserved = 0; + + for (include = 1; include <= 3; include++) { + cr0 = lookup_with_include_bits(cls, &flow, include); + cr1 = tcls_lookup(tcls, &flow, include); + assert((cr0 == NULL) == (cr1 == NULL)); + if (cr0 != NULL) { + const struct test_rule *tr0 = test_rule_from_cls_rule(cr0); + const struct test_rule *tr1 = test_rule_from_cls_rule(cr1); + + assert(flow_equal(&cr0->flow, &cr1->flow)); + assert(cr0->wc.wildcards == cr1->wc.wildcards); + assert(cr0->priority == cr1->priority); + /* Skip nw_src_mask and nw_dst_mask, because they are derived + * members whose values are used only for optimization. */ + assert(tr0->aux == tr1->aux); + } + } + } +} + +static void +free_rule(struct cls_rule *cls_rule, void *cls) +{ + classifier_remove(cls, cls_rule); + free(test_rule_from_cls_rule(cls_rule)); +} + +static void +destroy_classifier(struct classifier *cls) +{ + classifier_for_each(cls, CLS_INC_ALL, free_rule, cls); + classifier_destroy(cls); +} + +static void +check_tables(const struct classifier *cls, + int n_tables, int n_buckets, int n_rules) +{ + int found_tables = 0; + int found_buckets = 0; + int found_rules = 0; + int i; + + BUILD_ASSERT(CLS_N_FIELDS == ARRAY_SIZE(cls->tables)); + for (i = 0; i < CLS_N_FIELDS; i++) { + const struct cls_bucket *bucket; + if (!hmap_is_empty(&cls->tables[i])) { + found_tables++; + } + HMAP_FOR_EACH (bucket, struct cls_bucket, hmap_node, &cls->tables[i]) { + found_buckets++; + assert(!list_is_empty(&bucket->rules)); + found_rules += list_size(&bucket->rules); + } + } + + if (!hmap_is_empty(&cls->exact_table)) { + found_tables++; + found_buckets++; + found_rules += hmap_count(&cls->exact_table); + } + + assert(n_tables == -1 || found_tables == n_tables); + assert(n_rules == -1 || found_rules == n_rules); + assert(n_buckets == -1 || found_buckets == n_buckets); +} + +static struct test_rule * +make_rule(int wc_fields, unsigned int priority, int value_pat) +{ + const struct cls_field *f; + struct test_rule *rule; + uint32_t wildcards; + flow_t flow; + + wildcards = 0; + memset(&flow, 0, sizeof flow); + for (f = &cls_fields[0]; f < &cls_fields[CLS_N_FIELDS]; f++) { + int f_idx = f - cls_fields; + if (wc_fields & (1u << f_idx)) { + wildcards |= f->wildcards; + } else { + int value_idx = (value_pat & (1u << f_idx)) != 0; + memcpy((char *) &flow + f->ofs, values[f_idx][value_idx], f->len); + } + } + + rule = xcalloc(1, sizeof *rule); + cls_rule_from_flow(&rule->cls_rule, &flow, wildcards, + !wildcards ? UINT_MAX : priority); + return rule; +} + +static void +shuffle(unsigned int *p, size_t n) +{ + for (; n > 1; n--, p++) { + unsigned int *q = &p[rand() % n]; + unsigned int tmp = *p; + *p = *q; + *q = tmp; + } +} + +/* Tests an empty classifier. */ +static void +test_empty(void) +{ + struct classifier cls; + struct tcls tcls; + + classifier_init(&cls); + tcls_init(&tcls); + assert(classifier_is_empty(&cls)); + assert(tcls_is_empty(&tcls)); + compare_classifiers(&cls, &tcls); + classifier_destroy(&cls); + tcls_destroy(&tcls); +} + +/* Destroys a null classifier. */ +static void +test_destroy_null(void) +{ + classifier_destroy(NULL); +} + +/* Tests classification with one rule at a time. */ +static void +test_single_rule(void) +{ + unsigned int wc_fields; /* Hilarious. */ + + for (wc_fields = 0; wc_fields < (1u << CLS_N_FIELDS); wc_fields++) { + struct classifier cls; + struct test_rule *rule, *tcls_rule; + struct tcls tcls; + + rule = make_rule(wc_fields, + hash_bytes(&wc_fields, sizeof wc_fields, 0), 0); + + classifier_init(&cls); + tcls_init(&tcls); + + tcls_rule = tcls_insert(&tcls, rule); + if (wc_fields) { + assert(!classifier_insert(&cls, &rule->cls_rule)); + } else { + classifier_insert_exact(&cls, &rule->cls_rule); + } + check_tables(&cls, 1, 1, 1); + compare_classifiers(&cls, &tcls); + + classifier_remove(&cls, &rule->cls_rule); + tcls_remove(&tcls, tcls_rule); + assert(classifier_is_empty(&cls)); + assert(tcls_is_empty(&tcls)); + compare_classifiers(&cls, &tcls); + + free(rule); + classifier_destroy(&cls); + tcls_destroy(&tcls); + } +} + +/* Tests replacing one rule by another. */ +static void +test_rule_replacement(void) +{ + unsigned int wc_fields; + + for (wc_fields = 0; wc_fields < (1u << CLS_N_FIELDS); wc_fields++) { + struct classifier cls; + struct test_rule *rule1, *tcls_rule1; + struct test_rule *rule2, *tcls_rule2; + struct tcls tcls; + + rule1 = make_rule(wc_fields, OFP_DEFAULT_PRIORITY, UINT_MAX); + rule2 = make_rule(wc_fields, OFP_DEFAULT_PRIORITY, UINT_MAX); + rule2->aux += 5; + rule2->aux += 5; + + classifier_init(&cls); + tcls_init(&tcls); + tcls_rule1 = tcls_insert(&tcls, rule1); + assert(!classifier_insert(&cls, &rule1->cls_rule)); + check_tables(&cls, 1, 1, 1); + compare_classifiers(&cls, &tcls); + tcls_destroy(&tcls); + + tcls_init(&tcls); + tcls_rule2 = tcls_insert(&tcls, rule2); + assert(test_rule_from_cls_rule( + classifier_insert(&cls, &rule2->cls_rule)) == rule1); + free(rule1); + check_tables(&cls, 1, 1, 1); + compare_classifiers(&cls, &tcls); + tcls_destroy(&tcls); + destroy_classifier(&cls); + } +} + +static int +table_mask(int table) +{ + return ((1u << CLS_N_FIELDS) - 1) & ~((1u << table) - 1); +} + +static int +random_wcf_in_table(int table, int seed) +{ + int wc_fields = (1u << table) | hash_int(seed, 0); + return wc_fields & table_mask(table); +} + +/* Tests classification with two rules at a time that fall into the same + * bucket. */ +static void +test_two_rules_in_one_bucket(void) +{ + int table, rel_pri, wcf_pat, value_pat; + + for (table = 0; table <= CLS_N_FIELDS; table++) { + for (rel_pri = -1; rel_pri <= +1; rel_pri++) { + for (wcf_pat = 0; wcf_pat < 4; wcf_pat++) { + int n_value_pats = table == CLS_N_FIELDS - 1 ? 1 : 2; + for (value_pat = 0; value_pat < n_value_pats; value_pat++) { + struct test_rule *rule1, *tcls_rule1; + struct test_rule *rule2, *tcls_rule2; + struct test_rule *displaced_rule; + struct classifier cls; + struct tcls tcls; + unsigned int pri1, pri2; + int wcf1, wcf2; + + if (table != CLS_F_IDX_EXACT) { + /* We can use identical priorities in this test because + * the classifier always chooses the rule added later + * for equal-priority rules that fall into the same + * bucket. */ + pri1 = table * 257 + 50; + pri2 = pri1 + rel_pri; + + wcf1 = (wcf_pat & 1 + ? random_wcf_in_table(table, pri1) + : 1u << table); + wcf2 = (wcf_pat & 2 + ? random_wcf_in_table(table, pri2) + : 1u << table); + if (value_pat) { + wcf1 &= ~(1u << (CLS_N_FIELDS - 1)); + wcf2 &= ~(1u << (CLS_N_FIELDS - 1)); + } + } else { + /* This classifier always puts exact-match rules at + * maximum priority. */ + pri1 = pri2 = UINT_MAX; + + /* No wildcard fields. */ + wcf1 = wcf2 = 0; + } + + rule1 = make_rule(wcf1, pri1, 0); + rule2 = make_rule(wcf2, pri2, + value_pat << (CLS_N_FIELDS - 1)); + + classifier_init(&cls); + tcls_init(&tcls); + + tcls_rule1 = tcls_insert(&tcls, rule1); + tcls_rule2 = tcls_insert(&tcls, rule2); + assert(!classifier_insert(&cls, &rule1->cls_rule)); + displaced_rule = test_rule_from_cls_rule( + classifier_insert(&cls, &rule2->cls_rule)); + if (wcf1 != wcf2 || pri1 != pri2 || value_pat) { + assert(!displaced_rule); + + check_tables(&cls, 1, 1, 2); + compare_classifiers(&cls, &tcls); + + classifier_remove(&cls, &rule1->cls_rule); + tcls_remove(&tcls, tcls_rule1); + check_tables(&cls, 1, 1, 1); + compare_classifiers(&cls, &tcls); + } else { + assert(displaced_rule == rule1); + check_tables(&cls, 1, 1, 1); + compare_classifiers(&cls, &tcls); + } + free(rule1); + + classifier_remove(&cls, &rule2->cls_rule); + tcls_remove(&tcls, tcls_rule2); + compare_classifiers(&cls, &tcls); + free(rule2); + + destroy_classifier(&cls); + tcls_destroy(&tcls); + } + } + } + } +} + +/* Tests classification with two rules at a time that fall into the same + * table but different buckets. */ +static void +test_two_rules_in_one_table(void) +{ + int table, rel_pri, wcf_pat; + + /* Skip tables 0 and CLS_F_IDX_EXACT because they have one bucket. */ + for (table = 1; table < CLS_N_FIELDS; table++) { + for (rel_pri = -1; rel_pri <= +1; rel_pri++) { + for (wcf_pat = 0; wcf_pat < 5; wcf_pat++) { + struct test_rule *rule1, *tcls_rule1; + struct test_rule *rule2, *tcls_rule2; + struct classifier cls; + struct tcls tcls; + unsigned int pri1, pri2; + int wcf1, wcf2; + int value_mask, value_pat1, value_pat2; + int i; + + /* We can use identical priorities in this test because the + * classifier always chooses the rule added later for + * equal-priority rules that fall into the same table. */ + pri1 = table * 257 + 50; + pri2 = pri1 + rel_pri; + + if (wcf_pat & 4) { + wcf1 = wcf2 = random_wcf_in_table(table, pri1); + } else { + wcf1 = (wcf_pat & 1 + ? random_wcf_in_table(table, pri1) + : 1u << table); + wcf2 = (wcf_pat & 2 + ? random_wcf_in_table(table, pri2) + : 1u << table); + } + + /* Generate value patterns that will put the two rules into + * different buckets. */ + value_mask = ((1u << table) - 1); + value_pat1 = hash_int(pri1, 1) & value_mask; + i = 0; + do { + value_pat2 = (hash_int(pri2, i++) & value_mask); + } while (value_pat1 == value_pat2); + rule1 = make_rule(wcf1, pri1, value_pat1); + rule2 = make_rule(wcf2, pri2, value_pat2); + + classifier_init(&cls); + tcls_init(&tcls); + + tcls_rule1 = tcls_insert(&tcls, rule1); + tcls_rule2 = tcls_insert(&tcls, rule2); + assert(!classifier_insert(&cls, &rule1->cls_rule)); + assert(!classifier_insert(&cls, &rule2->cls_rule)); + check_tables(&cls, 1, 2, 2); + compare_classifiers(&cls, &tcls); + + classifier_remove(&cls, &rule1->cls_rule); + tcls_remove(&tcls, tcls_rule1); + check_tables(&cls, 1, 1, 1); + compare_classifiers(&cls, &tcls); + free(rule1); + + classifier_remove(&cls, &rule2->cls_rule); + tcls_remove(&tcls, tcls_rule2); + compare_classifiers(&cls, &tcls); + free(rule2); + + classifier_destroy(&cls); + tcls_destroy(&tcls); + } + } + } +} + +/* Tests classification with two rules at a time that fall into different + * tables. */ +static void +test_two_rules_in_different_tables(void) +{ + int table1, table2, rel_pri, wcf_pat; + + for (table1 = 0; table1 < CLS_N_FIELDS; table1++) { + for (table2 = table1 + 1; table2 <= CLS_N_FIELDS; table2++) { + for (rel_pri = 0; rel_pri < 2; rel_pri++) { + for (wcf_pat = 0; wcf_pat < 4; wcf_pat++) { + struct test_rule *rule1, *tcls_rule1; + struct test_rule *rule2, *tcls_rule2; + struct classifier cls; + struct tcls tcls; + unsigned int pri1, pri2; + int wcf1, wcf2; + + /* We must use unique priorities in this test because the + * classifier makes the rule choice undefined for rules of + * equal priority that fall into different tables. (In + * practice, lower-numbered tables win.) */ + pri1 = table1 * 257 + 50; + pri2 = rel_pri ? pri1 - 1 : pri1 + 1; + + wcf1 = (wcf_pat & 1 + ? random_wcf_in_table(table1, pri1) + : 1u << table1); + wcf2 = (wcf_pat & 2 + ? random_wcf_in_table(table2, pri2) + : 1u << table2); + + if (table2 == CLS_F_IDX_EXACT) { + pri2 = UINT16_MAX; + wcf2 = 0; + } + + rule1 = make_rule(wcf1, pri1, 0); + rule2 = make_rule(wcf2, pri2, 0); + + classifier_init(&cls); + tcls_init(&tcls); + + tcls_rule1 = tcls_insert(&tcls, rule1); + tcls_rule2 = tcls_insert(&tcls, rule2); + assert(!classifier_insert(&cls, &rule1->cls_rule)); + assert(!classifier_insert(&cls, &rule2->cls_rule)); + check_tables(&cls, 2, 2, 2); + compare_classifiers(&cls, &tcls); + + classifier_remove(&cls, &rule1->cls_rule); + tcls_remove(&tcls, tcls_rule1); + check_tables(&cls, 1, 1, 1); + compare_classifiers(&cls, &tcls); + free(rule1); + + classifier_remove(&cls, &rule2->cls_rule); + tcls_remove(&tcls, tcls_rule2); + compare_classifiers(&cls, &tcls); + free(rule2); + + classifier_destroy(&cls); + tcls_destroy(&tcls); + } + } + } + } +} + +/* Tests classification with many rules at a time that fall into the same + * bucket but have unique priorities (and various wildcards). */ +static void +test_many_rules_in_one_bucket(void) +{ + enum { MAX_RULES = 50 }; + int iteration, table; + + for (iteration = 0; iteration < 3; iteration++) { + for (table = 0; table <= CLS_N_FIELDS; table++) { + unsigned int priorities[MAX_RULES]; + struct classifier cls; + struct tcls tcls; + int i; + + srand(hash_int(table, iteration)); + for (i = 0; i < MAX_RULES; i++) { + priorities[i] = i * 129; + } + shuffle(priorities, ARRAY_SIZE(priorities)); + + classifier_init(&cls); + tcls_init(&tcls); + + for (i = 0; i < MAX_RULES; i++) { + struct test_rule *rule; + unsigned int priority = priorities[i]; + int wcf; + + wcf = random_wcf_in_table(table, priority); + rule = make_rule(wcf, priority, + table == CLS_F_IDX_EXACT ? i : 1234); + tcls_insert(&tcls, rule); + assert(!classifier_insert(&cls, &rule->cls_rule)); + check_tables(&cls, 1, 1, i + 1); + compare_classifiers(&cls, &tcls); + } + + destroy_classifier(&cls); + tcls_destroy(&tcls); + } + } +} + +/* Tests classification with many rules at a time that fall into the same + * table but random buckets. */ +static void +test_many_rules_in_one_table(void) +{ + enum { MAX_RULES = 50 }; + int iteration, table; + + for (iteration = 0; iteration < 3; iteration++) { + for (table = 0; table < CLS_N_FIELDS; table++) { + unsigned int priorities[MAX_RULES]; + struct classifier cls; + struct tcls tcls; + int i; + + srand(hash_int(table, iteration)); + for (i = 0; i < MAX_RULES; i++) { + priorities[i] = i * 129; + } + shuffle(priorities, ARRAY_SIZE(priorities)); + + classifier_init(&cls); + tcls_init(&tcls); + + for (i = 0; i < MAX_RULES; i++) { + struct test_rule *rule; + unsigned int priority = priorities[i]; + int wcf; + + wcf = random_wcf_in_table(table, priority); + rule = make_rule(wcf, priority, hash_int(priority, 1)); + tcls_insert(&tcls, rule); + assert(!classifier_insert(&cls, &rule->cls_rule)); + check_tables(&cls, 1, -1, i + 1); + compare_classifiers(&cls, &tcls); + } + + destroy_classifier(&cls); + tcls_destroy(&tcls); + } + } +} + +/* Tests classification with many rules at a time that fall into random buckets + * in random tables. */ +static void +test_many_rules_in_different_tables(void) +{ + enum { MAX_RULES = 50 }; + int iteration; + + for (iteration = 0; iteration < 30; iteration++) { + unsigned int priorities[MAX_RULES]; + struct classifier cls; + struct tcls tcls; + int i; + + srand(iteration); + for (i = 0; i < MAX_RULES; i++) { + priorities[i] = i * 129; + } + shuffle(priorities, ARRAY_SIZE(priorities)); + + classifier_init(&cls); + tcls_init(&tcls); + + for (i = 0; i < MAX_RULES; i++) { + struct test_rule *rule; + unsigned int priority = priorities[i]; + int table = rand() % (CLS_N_FIELDS + 1); + int wcf = random_wcf_in_table(table, rand()); + int value_pat = rand() & ((1u << CLS_N_FIELDS) - 1); + rule = make_rule(wcf, priority, value_pat); + tcls_insert(&tcls, rule); + assert(!classifier_insert(&cls, &rule->cls_rule)); + check_tables(&cls, -1, -1, i + 1); + compare_classifiers(&cls, &tcls); + } + + while (!classifier_is_empty(&cls)) { + struct test_rule *rule = xmemdup(tcls.rules[rand() % tcls.n_rules], + sizeof(struct test_rule)); + int include = rand() % 2 ? CLS_INC_WILD : CLS_INC_EXACT; + include |= (rule->cls_rule.wc.wildcards + ? CLS_INC_WILD : CLS_INC_EXACT); + classifier_for_each_match(&cls, &rule->cls_rule, include, + free_rule, &cls); + tcls_delete_matches(&tcls, &rule->cls_rule, include); + compare_classifiers(&cls, &tcls); + free(rule); + } + putchar('.'); + fflush(stdout); + + destroy_classifier(&cls); + tcls_destroy(&tcls); + } +} + +static void +run_test(void (*function)(void)) +{ + function(); + putchar('.'); + fflush(stdout); +} + +int +main(void) +{ + init_values(); + run_test(test_empty); + run_test(test_destroy_null); + run_test(test_single_rule); + run_test(test_rule_replacement); + run_test(test_two_rules_in_one_bucket); + run_test(test_two_rules_in_one_table); + run_test(test_two_rules_in_different_tables); + run_test(test_many_rules_in_one_bucket); + run_test(test_many_rules_in_one_table); + run_test(test_many_rules_in_different_tables); + putchar('\n'); + return 0; +} diff --git a/tests/test-dhcp-client.c b/tests/test-dhcp-client.c new file mode 100644 index 00000000..2fee3fc1 --- /dev/null +++ b/tests/test-dhcp-client.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include "dhcp-client.h" +#include +#include +#include +#include +#include "command-line.h" +#include "dhcp.h" +#include "fatal-signal.h" +#include "fault.h" +#include "poll-loop.h" +#include "util.h" +#include "vlog.h" + +/* --request-ip: IP address to request from server. If zero, then do not + * request a specific IP address. */ +static struct in_addr request_ip; + +/* --vendor-class: Vendor class string to include in request. If null, no + * vendor class string is included. */ +static const char *vendor_class; + +/* --no-resolv-conf: Update /etc/resolv.conf to match DHCP reply? */ +static bool update_resolv_conf = true; + +static void parse_options(int argc, char *argv[]); +static void usage(void); +static void release(void *cli_); +static void modify_dhcp_request(struct dhcp_msg *, void *aux); + +int +main(int argc, char *argv[]) +{ + struct dhclient *cli; + int error; + + set_program_name(argv[0]); + register_fault_handlers(); + vlog_init(); + parse_options(argc, argv); + + argc -= optind; + argv += optind; + if (argc != 1) { + ovs_fatal(0, "exactly one non-option argument required; " + "use --help for help"); + } + + error = dhclient_create(argv[0], modify_dhcp_request, NULL, NULL, &cli); + if (error) { + ovs_fatal(error, "dhclient_create failed"); + } + dhclient_init(cli, request_ip.s_addr); + fatal_signal_add_hook(release, cli, true); + + for (;;) { + fatal_signal_block(); + dhclient_run(cli); + if (dhclient_changed(cli)) { + dhclient_configure_netdev(cli); + if (update_resolv_conf) { + dhclient_update_resolv_conf(cli); + } + } + dhclient_wait(cli); + fatal_signal_unblock(); + poll_block(); + } +} + +static void +release(void *cli_) +{ + struct dhclient *cli = cli_; + dhclient_release(cli); + if (dhclient_changed(cli)) { + dhclient_configure_netdev(cli); + } +} + +static void +modify_dhcp_request(struct dhcp_msg *msg, void *aux UNUSED) +{ + if (vendor_class) { + dhcp_msg_put_string(msg, DHCP_CODE_VENDOR_CLASS, vendor_class); + } +} + +static void +parse_options(int argc, char *argv[]) +{ + enum { + OPT_REQUEST_IP = UCHAR_MAX + 1, + OPT_VENDOR_CLASS, + OPT_NO_RESOLV_CONF + }; + static struct option long_options[] = { + {"request-ip", required_argument, 0, OPT_REQUEST_IP }, + {"vendor-class", required_argument, 0, OPT_VENDOR_CLASS }, + {"no-resolv-conf", no_argument, 0, OPT_NO_RESOLV_CONF}, + {"verbose", optional_argument, 0, 'v'}, + {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'V'}, + {0, 0, 0, 0}, + }; + char *short_options = long_options_to_short_options(long_options); + + for (;;) { + int c; + + c = getopt_long(argc, argv, short_options, long_options, NULL); + if (c == -1) { + break; + } + + switch (c) { + case OPT_REQUEST_IP: + if (!inet_aton(optarg, &request_ip)) { + ovs_fatal(0, + "--request-ip argument is not a valid IP address"); + } + break; + + case OPT_VENDOR_CLASS: + vendor_class = optarg; + break; + + case OPT_NO_RESOLV_CONF: + update_resolv_conf = false; + break; + + case 'h': + usage(); + + case 'V': + printf("%s %s compiled "__DATE__" "__TIME__"\n", + program_name, VERSION BUILDNR); + exit(EXIT_SUCCESS); + + case 'v': + vlog_set_verbosity(optarg); + break; + + case '?': + exit(EXIT_FAILURE); + + default: + abort(); + } + } + free(short_options); +} + +static void +usage(void) +{ + printf("%s: standalone program for testing Open vSwitch DHCP client.\n" + "usage: %s [OPTIONS] NETDEV\n" + "where NETDEV is a network device (e.g. eth0).\n" + "\nDHCP options:\n" + " --request-ip=IP request specified IP address (default:\n" + " do not request a specific IP)\n" + " --vendor-class=STRING use STRING as vendor class (default:\n" + " none); use OpenFlow to imitate secchan\n" + " --no-resolv-conf do not update /etc/resolv.conf\n", + program_name, program_name); + vlog_usage(); + printf("\nOther options:\n" + " -h, --help display this help message\n" + " -V, --version display version information\n"); + exit(EXIT_SUCCESS); +} + diff --git a/tests/test-flows.c b/tests/test-flows.c new file mode 100644 index 00000000..663c5a6d --- /dev/null +++ b/tests/test-flows.c @@ -0,0 +1,76 @@ +#include +#include "flow.h" +#include +#include +#include +#include "openflow/openflow.h" +#include "timeval.h" +#include "ofpbuf.h" +#include "ofp-print.h" +#include "pcap.h" +#include "util.h" +#include "vlog.h" + +#undef NDEBUG +#include + +int +main(int argc UNUSED, char *argv[]) +{ + struct ofp_match expected_match; + FILE *flows, *pcap; + int retval; + int n = 0, errors = 0; + + set_program_name(argv[0]); + time_init(); + vlog_init(); + + flows = stdin; + pcap = fdopen(3, "rb"); + if (!pcap) { + ovs_fatal(errno, "failed to open fd 3 for reading"); + } + + retval = pcap_read_header(pcap); + if (retval) { + ovs_fatal(retval > 0 ? retval : 0, "reading pcap header failed"); + } + + while (fread(&expected_match, sizeof expected_match, 1, flows)) { + struct ofpbuf *packet; + struct ofp_match extracted_match; + flow_t flow; + + n++; + + retval = pcap_read(pcap, &packet); + if (retval == EOF) { + ovs_fatal(0, "unexpected end of file reading pcap file"); + } else if (retval) { + ovs_fatal(retval, "error reading pcap file"); + } + + flow_extract(packet, 1, &flow); + flow_to_match(&flow, 0, &extracted_match); + + if (memcmp(&expected_match, &extracted_match, sizeof expected_match)) { + char *exp_s = ofp_match_to_string(&expected_match, 2); + char *got_s = ofp_match_to_string(&extracted_match, 2); + errors++; + printf("mismatch on packet #%d (1-based).\n", n); + printf("Packet:\n"); + ofp_print_packet(stdout, packet->data, packet->size, packet->size); + printf("Expected flow:\n%s\n", exp_s); + printf("Actually extracted flow:\n%s\n", got_s); + printf("\n"); + free(exp_s); + free(got_s); + } + + ofpbuf_delete(packet); + } + printf("checked %d packets, %d errors\n", n, errors); + return errors != 0; +} + diff --git a/tests/test-flows.sh b/tests/test-flows.sh new file mode 100755 index 00000000..0d38ad78 --- /dev/null +++ b/tests/test-flows.sh @@ -0,0 +1,9 @@ +#! /bin/sh -e +srcdir=`cd $srcdir && pwd` +trap 'rm -f flows$$ pcap$$ out$$' 0 1 2 13 15 +cd tests +"$srcdir"/tests/flowgen.pl >/dev/null 3>flows$$ 4>pcap$$ +./test-flows out$$ || true +diff -u - out$$ < +#include +#include +#include +#include +#include "hash.h" + +#undef NDEBUG +#include + +static void +set_bit(uint32_t array[3], int bit) +{ + assert(bit >= 0 && bit <= 96); + memset(array, 0, sizeof(uint32_t) * 3); + if (bit < 96) { + array[bit / 32] = UINT32_C(1) << (bit % 32); + } +} + +static uint32_t +hash_words_cb(uint32_t input) +{ + return hash_words(&input, 1, 0); +} + +static uint32_t +hash_int_cb(uint32_t input) +{ + return hash_int(input, 0); +} + +static void +check_word_hash(uint32_t (*hash)(uint32_t), const char *name, + int min_unique) +{ + int i, j; + + for (i = 0; i <= 32; i++) { + uint32_t in1 = i < 32 ? UINT32_C(1) << i : 0; + for (j = i + 1; j <= 32; j++) { + uint32_t in2 = j < 32 ? UINT32_C(1) << j : 0; + uint32_t out1 = hash(in1); + uint32_t out2 = hash(in2); + const uint32_t unique_mask = (UINT32_C(1) << min_unique) - 1; + int ofs; + for (ofs = 0; ofs < 32 - min_unique; ofs++) { + uint32_t bits1 = (out1 >> ofs) & unique_mask; + uint32_t bits2 = (out2 >> ofs) & unique_mask; + if (bits1 == bits2) { + printf("Partial collision for '%s':\n", name); + printf("%s(%08"PRIx32") = %08"PRIx32"\n", name, in1, out1); + printf("%s(%08"PRIx32") = %08"PRIx32"\n", name, in2, out2); + printf("%d bits of output starting at bit %d " + "are both 0x%"PRIx32"\n", min_unique, ofs, bits1); + exit(1); + } + } + } + } +} + +int +main(void) +{ + int i, j; + + /* Check that all hashes computed with hash_words with one 1-bit (or no + * 1-bits) set within a single 32-bit word have different values in all + * 11-bit consecutive runs. + * + * Given a random distribution, the probability of at least one collision + * in any set of 11 bits is approximately + * + * 1 - ((2**11 - 1)/2**11)**C(33,2) + * == 1 - (2047/2048)**528 + * =~ 0.22 + * + * There are 21 ways to pick 11 consecutive bits in a 32-bit word, so if we + * assumed independence then the chance of having no collisions in any of + * those 11-bit runs would be (1-0.22)**21 =~ .0044. Obviously + * independence must be a bad assumption :-) + */ + check_word_hash(hash_words_cb, "hash_words", 11); + + /* Check that all hash functions of with one 1-bit (or no 1-bits) set + * within three 32-bit words have different values in their lowest 12 + * bits. + * + * Given a random distribution, the probability of at least one collision + * in 12 bits is approximately + * + * 1 - ((2**12 - 1)/2**12)**C(97,2) + * == 1 - (4095/4096)**4656 + * =~ 0.68 + * + * so we are doing pretty well to not have any collisions in 12 bits. + */ + for (i = 0; i <= 96; i++) { + for (j = i + 1; j <= 96; j++) { + uint32_t in1[3], in2[3]; + uint32_t out1, out2; + const int min_unique = 12; + const uint32_t unique_mask = (UINT32_C(1) << min_unique) - 1; + + set_bit(in1, i); + set_bit(in2, j); + out1 = hash_words(in1, 3, 0); + out2 = hash_words(in2, 3, 0); + if ((out1 & unique_mask) == (out2 & unique_mask)) { + printf("Partial collision:\n"); + printf("hash(1 << %d) == %08"PRIx32"\n", i, out1); + printf("hash(1 << %d) == %08"PRIx32"\n", j, out2); + printf("The low-order %d bits of output are both " + "0x%"PRIx32"\n", min_unique, out1 & unique_mask); + exit(1); + } + } + } + + /* Check that all hashes computed with hash_int with one 1-bit (or no + * 1-bits) set within a single 32-bit word have different values in all + * 14-bit consecutive runs. + * + * Given a random distribution, the probability of at least one collision + * in any set of 14 bits is approximately + * + * 1 - ((2**14 - 1)/2**14)**C(33,2) + * == 1 - (16,383/16,834)**528 + * =~ 0.031 + * + * There are 18 ways to pick 14 consecutive bits in a 32-bit word, so if we + * assumed independence then the chance of having no collisions in any of + * those 14-bit runs would be (1-0.03)**18 =~ 0.56. This seems reasonable. + */ + check_word_hash(hash_int_cb, "hash_int", 14); + + return 0; +} diff --git a/tests/test-hmap.c b/tests/test-hmap.c new file mode 100644 index 00000000..684a5bae --- /dev/null +++ b/tests/test-hmap.c @@ -0,0 +1,281 @@ +/* A non-exhaustive test for some of the functions and macros declared in + * hmap.h. */ + +#include +#include "hmap.h" +#include +#include "hash.h" +#include "util.h" + +#undef NDEBUG +#include + +/* Sample hmap element. */ +struct element { + int value; + struct hmap_node node; +}; + +typedef size_t hash_func(int value); + +static int +compare_ints(const void *a_, const void *b_) +{ + const int *a = a_; + const int *b = b_; + return *a < *b ? -1 : *a > *b; +} + +/* Verifies that 'hmap' contains exactly the 'n' values in 'values'. */ +static void +check_hmap(struct hmap *hmap, const int values[], size_t n, + hash_func *hash) +{ + int *sort_values, *hmap_values; + struct element *e; + size_t i; + + /* Check that all the values are there in iteration. */ + sort_values = xmalloc(sizeof *sort_values * n); + hmap_values = xmalloc(sizeof *sort_values * n); + + i = 0; + HMAP_FOR_EACH (e, struct element, node, hmap) { + assert(i < n); + hmap_values[i++] = e->value; + } + assert(i == n); + + memcpy(sort_values, values, sizeof *sort_values * n); + qsort(sort_values, n, sizeof *sort_values, compare_ints); + qsort(hmap_values, n, sizeof *hmap_values, compare_ints); + + for (i = 0; i < n; i++) { + assert(sort_values[i] == hmap_values[i]); + } + + free(hmap_values); + free(sort_values); + + /* Check that all the values are there in lookup. */ + for (i = 0; i < n; i++) { + size_t count = 0; + + HMAP_FOR_EACH_WITH_HASH (e, struct element, node, + hash(values[i]), hmap) { + count += e->value == values[i]; + } + assert(count == 1); + } + + /* Check counters. */ + assert(hmap_is_empty(hmap) == !n); + assert(hmap_count(hmap) == n); +} + +/* Puts the 'n' values in 'values' into 'elements', and then puts those + * elements into 'hmap'. */ +static void +make_hmap(struct hmap *hmap, struct element elements[], + int values[], size_t n, hash_func *hash) +{ + size_t i; + + hmap_init(hmap); + for (i = 0; i < n; i++) { + elements[i].value = i; + hmap_insert(hmap, &elements[i].node, hash(elements[i].value)); + values[i] = i; + } +} + +static void +shuffle(int *p, size_t n) +{ + for (; n > 1; n--, p++) { + int *q = &p[rand() % n]; + int tmp = *p; + *p = *q; + *q = tmp; + } +} + +#if 0 +/* Prints the values in 'hmap', plus 'name' as a title. */ +static void +print_hmap(const char *name, struct hmap *hmap) +{ + struct element *e; + + printf("%s:", name); + HMAP_FOR_EACH (e, struct element, node, hmap) { + printf(" %d(%zu)", e->value, e->node.hash & hmap->mask); + } + printf("\n"); +} + +/* Prints the 'n' values in 'values', plus 'name' as a title. */ +static void +print_ints(const char *name, const int *values, size_t n) +{ + size_t i; + + printf("%s:", name); + for (i = 0; i < n; i++) { + printf(" %d", values[i]); + } + printf("\n"); +} +#endif + +static size_t +identity_hash(int value) +{ + return value; +} + +static size_t +good_hash(int value) +{ + return hash_int(value, 0x1234abcd); +} + +static size_t +constant_hash(int value UNUSED) +{ + return 123; +} + +/* Tests basic hmap insertion and deletion. */ +static void +test_hmap_insert_delete(hash_func *hash) +{ + enum { N_ELEMS = 100 }; + + struct element elements[N_ELEMS]; + int values[N_ELEMS]; + struct hmap hmap; + size_t i; + + hmap_init(&hmap); + for (i = 0; i < N_ELEMS; i++) { + elements[i].value = i; + hmap_insert(&hmap, &elements[i].node, hash(i)); + values[i] = i; + check_hmap(&hmap, values, i + 1, hash); + } + shuffle(values, N_ELEMS); + for (i = 0; i < N_ELEMS; i++) { + hmap_remove(&hmap, &elements[values[i]].node); + check_hmap(&hmap, values + (i + 1), N_ELEMS - (i + 1), hash); + } + hmap_destroy(&hmap); +} + +/* Tests basic hmap_reserve() and hmap_shrink(). */ +static void +test_hmap_reserve_shrink(hash_func *hash) +{ + enum { N_ELEMS = 32 }; + + size_t i; + + for (i = 0; i < N_ELEMS; i++) { + struct element elements[N_ELEMS]; + int values[N_ELEMS]; + struct hmap hmap; + size_t j; + + hmap_init(&hmap); + hmap_reserve(&hmap, i); + for (j = 0; j < N_ELEMS; j++) { + elements[j].value = j; + hmap_insert(&hmap, &elements[j].node, hash(j)); + values[j] = j; + check_hmap(&hmap, values, j + 1, hash); + } + shuffle(values, N_ELEMS); + for (j = 0; j < N_ELEMS; j++) { + hmap_remove(&hmap, &elements[values[j]].node); + hmap_shrink(&hmap); + check_hmap(&hmap, values + (j + 1), N_ELEMS - (j + 1), hash); + } + hmap_destroy(&hmap); + } +} + +/* Tests that HMAP_FOR_EACH_SAFE properly allows for deletion of the current + * element of a hmap. */ +static void +test_hmap_for_each_safe(hash_func *hash) +{ + enum { MAX_ELEMS = 10 }; + size_t n; + unsigned long int pattern; + + for (n = 0; n <= MAX_ELEMS; n++) { + for (pattern = 0; pattern < 1ul << n; pattern++) { + struct element elements[MAX_ELEMS]; + int values[MAX_ELEMS]; + struct hmap hmap; + struct element *e, *next; + size_t n_remaining; + int i; + + make_hmap(&hmap, elements, values, n, hash); + + i = 0; + n_remaining = n; + HMAP_FOR_EACH_SAFE (e, next, struct element, node, &hmap) { + assert(i < n); + if (pattern & (1ul << e->value)) { + size_t j; + hmap_remove(&hmap, &e->node); + for (j = 0; ; j++) { + assert(j < n_remaining); + if (values[j] == e->value) { + values[j] = values[--n_remaining]; + break; + } + } + } + check_hmap(&hmap, values, n_remaining, hash); + i++; + } + assert(i == n); + + for (i = 0; i < n; i++) { + if (pattern & (1ul << i)) { + n_remaining++; + } + } + assert(n == n_remaining); + + hmap_destroy(&hmap); + } + } +} + +static void +run_test(void (*function)(hash_func *)) +{ + hash_func *hash_funcs[] = { identity_hash, good_hash, constant_hash }; + size_t i; + + for (i = 0; i < ARRAY_SIZE(hash_funcs); i++) { + function(hash_funcs[i]); + printf("."); + fflush(stdout); + } +} + +int +main(void) +{ + run_test(test_hmap_insert_delete); + run_test(test_hmap_for_each_safe); + run_test(test_hmap_reserve_shrink); + printf("\n"); + return 0; +} + diff --git a/tests/test-list.c b/tests/test-list.c new file mode 100644 index 00000000..62857be9 --- /dev/null +++ b/tests/test-list.c @@ -0,0 +1,159 @@ +/* A non-exhaustive test for some of the functions and macros declared in + * list.h. */ + +#include +#include "list.h" +#include + +#undef NDEBUG +#include + +/* Sample list element. */ +struct element { + int value; + struct list node; +}; + +/* Puts the 'n' values in 'values' into 'elements', and then puts those + * elements in order into 'list'. */ +static void +make_list(struct list *list, struct element elements[], + int values[], size_t n) +{ + size_t i; + + list_init(list); + for (i = 0; i < n; i++) { + elements[i].value = i; + list_push_back(list, &elements[i].node); + values[i] = i; + } +} + +/* Verifies that 'list' contains exactly the 'n' values in 'values', in the + * specified order. */ +static void +check_list(struct list *list, const int values[], size_t n) +{ + struct element *e; + size_t i; + + i = 0; + LIST_FOR_EACH (e, struct element, node, list) { + assert(i < n); + assert(e->value == values[i]); + i++; + } + assert(&e->node == list); + assert(i == n); + + i = 0; + LIST_FOR_EACH_REVERSE (e, struct element, node, list) { + assert(i < n); + assert(e->value == values[n - i - 1]); + i++; + } + assert(&e->node == list); + assert(i == n); + + assert(list_is_empty(list) == !n); + assert(list_size(list) == n); +} + +#if 0 +/* Prints the values in 'list', plus 'name' as a title. */ +static void +print_list(const char *name, struct list *list) +{ + struct element *e; + + printf("%s:", name); + LIST_FOR_EACH (e, struct element, node, list) { + printf(" %d", e->value); + } + printf("\n"); +} +#endif + +/* Tests basic list construction. */ +static void +test_list_construction(void) +{ + enum { MAX_ELEMS = 100 }; + size_t n; + + for (n = 0; n <= MAX_ELEMS; n++) { + struct element elements[MAX_ELEMS]; + int values[MAX_ELEMS]; + struct list list; + + make_list(&list, elements, values, n); + check_list(&list, values, n); + } +} + +/* Tests that LIST_FOR_EACH_SAFE properly allows for deletion of the current + * element of a list. */ +static void +test_list_for_each_safe(void) +{ + enum { MAX_ELEMS = 10 }; + size_t n; + unsigned long int pattern; + + for (n = 0; n <= MAX_ELEMS; n++) { + for (pattern = 0; pattern < 1ul << n; pattern++) { + struct element elements[MAX_ELEMS]; + int values[MAX_ELEMS]; + struct list list; + struct element *e, *next; + size_t values_idx, n_remaining; + int i; + + make_list(&list, elements, values, n); + + i = 0; + values_idx = 0; + n_remaining = n; + LIST_FOR_EACH_SAFE (e, next, struct element, node, &list) { + assert(i < n); + if (pattern & (1ul << i)) { + list_remove(&e->node); + n_remaining--; + memmove(&values[values_idx], &values[values_idx + 1], + sizeof *values * (n_remaining - values_idx)); + } else { + values_idx++; + } + check_list(&list, values, n_remaining); + i++; + } + assert(i == n); + assert(&e->node == &list); + + for (i = 0; i < n; i++) { + if (pattern & (1ul << i)) { + n_remaining++; + } + } + assert(n == n_remaining); + } + } +} + +static void +run_test(void (*function)(void)) +{ + function(); + printf("."); +} + +int +main(void) +{ + run_test(test_list_construction); + run_test(test_list_for_each_safe); + printf("\n"); + return 0; +} + diff --git a/tests/test-stp-ieee802.1d-1998 b/tests/test-stp-ieee802.1d-1998 new file mode 100644 index 00000000..f1982a03 --- /dev/null +++ b/tests/test-stp-ieee802.1d-1998 @@ -0,0 +1,12 @@ +# This is the STP example from IEEE 802.1D-1998. +bridge 0 0x42 = a b +bridge 1 0x97 = c:5 a d:5 +bridge 2 0x45 = b e +bridge 3 0x57 = b:5 e:5 +bridge 4 0x83 = a:5 e:5 +run 1000 +check 0 = root +check 1 = F F:10 F +check 2 = F:10 B +check 3 = F:5 F +check 4 = F:5 B diff --git a/tests/test-stp-ieee802.1d-2004-fig17.4 b/tests/test-stp-ieee802.1d-2004-fig17.4 new file mode 100644 index 00000000..1f708630 --- /dev/null +++ b/tests/test-stp-ieee802.1d-2004-fig17.4 @@ -0,0 +1,31 @@ +# This is the STP example from IEEE 802.1D-2004 figures 17.4 and 17.5. +bridge 0 0x111 = a b e c +bridge 1 0x222 = a b d f +bridge 2 0x333 = c d l j h g +bridge 3 0x444 = e f n m k i +bridge 4 0x555 = g i 0 0 +bridge 5 0x666 = h k 0 0 +bridge 6 0x777 = j m 0 0 +bridge 7 0x888 = l n 0 0 +run 1000 +check 0 = root +check 1 = F:10 B F F +check 2 = F:10 B F F F F +check 3 = F:10 B F F F F +check 4 = F:20 B F F +check 5 = F:20 B F F +check 6 = F:20 B F F +check 7 = F:20 B F F + +# Now connect two ports of bridge 7 to the same LAN. +bridge 7 = l n o o +# Same results except for bridge 7: +run 1000 +check 0 = root +check 1 = F:10 B F F +check 2 = F:10 B F F F F +check 3 = F:10 B F F F F +check 4 = F:20 B F F +check 5 = F:20 B F F +check 6 = F:20 B F F +check 7 = F:20 B F B diff --git a/tests/test-stp-ieee802.1d-2004-fig17.6 b/tests/test-stp-ieee802.1d-2004-fig17.6 new file mode 100644 index 00000000..6ed59177 --- /dev/null +++ b/tests/test-stp-ieee802.1d-2004-fig17.6 @@ -0,0 +1,14 @@ +# This is the STP example from IEEE 802.1D-2004 figure 17.6. +bridge 0 0x111 = a b l +bridge 1 0x222 = b c d +bridge 2 0x333 = d e f +bridge 3 0x444 = f g h +bridge 4 0x555 = j h i +bridge 5 0x666 = l j k +run 1000 +check 0 = root +check 1 = F:10 F F +check 2 = F:20 F F +check 3 = F:30 F B +check 4 = F:20 F F +check 5 = F:10 F F diff --git a/tests/test-stp-ieee802.1d-2004-fig17.7 b/tests/test-stp-ieee802.1d-2004-fig17.7 new file mode 100644 index 00000000..daa0cdf2 --- /dev/null +++ b/tests/test-stp-ieee802.1d-2004-fig17.7 @@ -0,0 +1,17 @@ +# This is the STP example from IEEE 802.1D-2004 figure 17.7. +bridge 0 0xaa = b +bridge 1 0x111 = a b d f h g e c +bridge 2 0x222 = g h j l n m k i +run 1000 +check 0 = root +check 1 = F F:10 F F F F F F +check 2 = B F:20 F F F F F F + +# This is not the port priority change described in that figure, +# but I don't understand what port priority change would cause +# that change. +bridge 2 = g X j l n m k i +run 1000 +check 0 = root +check 1 = F F:10 F F F F F F +check 2 = F:20 D F F F F F F diff --git a/tests/test-stp-iol-io-1.1 b/tests/test-stp-iol-io-1.1 new file mode 100644 index 00000000..186d6c4c --- /dev/null +++ b/tests/test-stp-iol-io-1.1 @@ -0,0 +1,25 @@ +# This test file approximates the following test from "Bridge +# Functions Consortium Spanning Tree Interoperability Test Suite +# Version 1.5": +# STP.io.1.1: Link Failure +bridge 0 0x111 = a b c +bridge 1 0x222 = a b c +run 1000 +check 0 = root +check 1 = F:10 B B +bridge 1 = 0 _ _ +run 1000 +check 0 = root +check 1 = F F:10 B +bridge 1 = X _ _ +run 1000 +check 0 = root +check 1 = D F:10 B +bridge 1 = _ 0 _ +run 1000 +check 0 = root +check 1 = D F F:10 +bridge 1 = _ X _ +run 1000 +check 0 = root +check 1 = D D F:10 diff --git a/tests/test-stp-iol-io-1.2 b/tests/test-stp-iol-io-1.2 new file mode 100644 index 00000000..285bbd88 --- /dev/null +++ b/tests/test-stp-iol-io-1.2 @@ -0,0 +1,14 @@ +# This test file approximates the following test from "Bridge +# Functions Consortium Spanning Tree Interoperability Test Suite +# Version 1.5": +# STP.io.1.2: Repeated Network +bridge 0 0x111 = a a +bridge 1 0x222 = a a +run 1000 +check 0 = rootid:0x111 F B +check 1 = rootid:0x111 F:10 B +bridge 1 = a^0x90 _ +run 1000 +check 0 = rootid:0x111 F B +check 1 = rootid:0x111 B F:10 + diff --git a/tests/test-stp-iol-io-1.4 b/tests/test-stp-iol-io-1.4 new file mode 100644 index 00000000..0065aaf5 --- /dev/null +++ b/tests/test-stp-iol-io-1.4 @@ -0,0 +1,13 @@ +# This test file approximates the following test from "Bridge +# Functions Consortium Spanning Tree Interoperability Test Suite +# Version 1.5": +# STP.io.1.4: Network Initialization +bridge 0 0x111 = a b c +bridge 1 0x222 = b d e +bridge 2 0x333 = a d f +bridge 3 0x444 = c e f +run 1000 +check 0 = root +check 1 = F:10 F F +check 2 = F:10 B F +check 3 = F:10 B B diff --git a/tests/test-stp-iol-io-1.5 b/tests/test-stp-iol-io-1.5 new file mode 100644 index 00000000..285d29de --- /dev/null +++ b/tests/test-stp-iol-io-1.5 @@ -0,0 +1,40 @@ +# This test file approximates the following test from "Bridge +# Functions Consortium Spanning Tree Interoperability Test Suite +# Version 1.5": +# STP.io.1.5: Topology Change +bridge 0 0x111 = a b d c +bridge 1 0x222 = a b f e +bridge 2 0x333 = c d g h +bridge 3 0x444 = e f g h +run 1000 +check 0 = root +check 1 = F:10 B F F +check 2 = B F:10 F F +check 3 = B F:20 B B +bridge 1^0x7000 +run 1000 +check 0 = F:10 B F F +check 1 = root +check 2 = B F:20 B B +check 3 = B F:10 F F +bridge 2^0x6000 +run 1000 +check 0 = F F B F:10 +check 1 = F:20 B B B +check 2 = root +check 3 = F F F:10 B +bridge 3^0x5000 +run 1000 +check 0 = B B B F:20 +check 1 = F F B F:10 +check 2 = F F F:10 B +check 3 = root +bridge 0^0x4000 +bridge 1^0x4001 +bridge 2^0x4002 +bridge 3^0x4003 +run 1000 +check 0 = root +check 1 = F:10 B F F +check 2 = B F:10 F F +check 3 = B F:20 B B diff --git a/tests/test-stp-iol-op-1.1 b/tests/test-stp-iol-op-1.1 new file mode 100644 index 00000000..8432bf36 --- /dev/null +++ b/tests/test-stp-iol-op-1.1 @@ -0,0 +1,7 @@ +# This test file approximates the following tests from "Bridge +# Functions Consortium Spanning Tree Protocol Operations Test Suite +# Version 2.3": +# Test STP.op.1.1 ­ Root ID Initialized to Bridge ID +# Test STP.op.1.2 ­ Root Path Cost Initialized to Zero +bridge 0 0x123 = +check 0 = root diff --git a/tests/test-stp-iol-op-1.4 b/tests/test-stp-iol-op-1.4 new file mode 100644 index 00000000..6a121164 --- /dev/null +++ b/tests/test-stp-iol-op-1.4 @@ -0,0 +1,8 @@ +# This test file approximates the following test from "Bridge +# Functions Consortium Spanning Tree Protocol Operations Test Suite +# Version 2.3": +# Test STP.op.1.4 ­ All Ports Initialized to Designated Ports +bridge 0 0x123 = a b c d e f +check 0 = Li Li Li Li Li Li +run 1000 +check 0 = F F F F F F diff --git a/tests/test-stp-iol-op-3.1 b/tests/test-stp-iol-op-3.1 new file mode 100644 index 00000000..3e1099cb --- /dev/null +++ b/tests/test-stp-iol-op-3.1 @@ -0,0 +1,11 @@ +# This test file approximates the following test from "Bridge +# Functions Consortium Spanning Tree Protocol Operations Test Suite +# Version 2.3": +# Test STP.op.3.1 ­ Root Bridge Selection: Root ID Values +bridge 0 0x111 = a +bridge 1 0x222 = a +check 0 = rootid:0x111 Li +check 1 = rootid:0x222 Li +run 1000 +check 0 = rootid:0x111 root +check 1 = rootid:0x111 F:10 diff --git a/tests/test-stp-iol-op-3.3 b/tests/test-stp-iol-op-3.3 new file mode 100644 index 00000000..2bcd45e1 --- /dev/null +++ b/tests/test-stp-iol-op-3.3 @@ -0,0 +1,11 @@ +# This test file approximates the following test from "Bridge +# Functions Consortium Spanning Tree Protocol Operations Test Suite +# Version 2.3": +# Test STP.op.3.3 ­ Root Bridge Selection: Bridge ID Values +bridge 0 0x333^0x6000 = a +bridge 1 0x222^0x7000 = b +bridge 2 0x111 = a b +run 1000 +check 0 = rootid:0x333^0x6000 root +check 1 = rootid:0x333^0x6000 F:20 +check 2 = rootid:0x333^0x6000 F:10 F diff --git a/tests/test-stp-iol-op-3.4 b/tests/test-stp-iol-op-3.4 new file mode 100644 index 00000000..2bcd45e1 --- /dev/null +++ b/tests/test-stp-iol-op-3.4 @@ -0,0 +1,11 @@ +# This test file approximates the following test from "Bridge +# Functions Consortium Spanning Tree Protocol Operations Test Suite +# Version 2.3": +# Test STP.op.3.3 ­ Root Bridge Selection: Bridge ID Values +bridge 0 0x333^0x6000 = a +bridge 1 0x222^0x7000 = b +bridge 2 0x111 = a b +run 1000 +check 0 = rootid:0x333^0x6000 root +check 1 = rootid:0x333^0x6000 F:20 +check 2 = rootid:0x333^0x6000 F:10 F diff --git a/tests/test-stp.c b/tests/test-stp.c new file mode 100644 index 00000000..07336815 --- /dev/null +++ b/tests/test-stp.c @@ -0,0 +1,648 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "stp.h" +#include +#include +#include +#include +#include +#include +#include "ofpbuf.h" +#include "packets.h" + +struct bpdu { + int port_no; + void *data; + size_t size; +}; + +struct bridge { + struct test_case *tc; + int id; + bool reached; + + struct stp *stp; + + struct lan *ports[STP_MAX_PORTS]; + int n_ports; + +#define RXQ_SIZE 16 + struct bpdu rxq[RXQ_SIZE]; + int rxq_head, rxq_tail; +}; + +struct lan_conn { + struct bridge *bridge; + int port_no; +}; + +struct lan { + struct test_case *tc; + const char *name; + bool reached; + struct lan_conn conns[16]; + int n_conns; +}; + +struct test_case { + struct bridge *bridges[16]; + int n_bridges; + struct lan *lans[26]; + int n_lans; +}; + +static const char *file_name; +static int line_number; +static char line[128]; +static char *pos, *token; +static int n_warnings; + +static struct test_case * +new_test_case(void) +{ + struct test_case *tc = xmalloc(sizeof *tc); + tc->n_bridges = 0; + tc->n_lans = 0; + return tc; +} + +static void +send_bpdu(struct ofpbuf *pkt, int port_no, void *b_) +{ + struct bridge *b = b_; + struct lan *lan; + + assert(port_no < b->n_ports); + lan = b->ports[port_no]; + if (lan) { + const void *data = pkt->l3; + size_t size = (char *) ofpbuf_tail(pkt) - (char *) data; + int i; + + for (i = 0; i < lan->n_conns; i++) { + struct lan_conn *conn = &lan->conns[i]; + if (conn->bridge != b || conn->port_no != port_no) { + struct bridge *dst = conn->bridge; + struct bpdu *bpdu = &dst->rxq[dst->rxq_head++ % RXQ_SIZE]; + assert(dst->rxq_head - dst->rxq_tail <= RXQ_SIZE); + bpdu->data = xmemdup(data, size); + bpdu->size = size; + bpdu->port_no = conn->port_no; + } + } + } + ofpbuf_delete(pkt); +} + +static struct bridge * +new_bridge(struct test_case *tc, int id) +{ + struct bridge *b = xmalloc(sizeof *b); + char name[16]; + b->tc = tc; + b->id = id; + snprintf(name, sizeof name, "stp%x", id); + b->stp = stp_create(name, id, send_bpdu, b); + assert(tc->n_bridges < ARRAY_SIZE(tc->bridges)); + b->n_ports = 0; + b->rxq_head = b->rxq_tail = 0; + tc->bridges[tc->n_bridges++] = b; + return b; +} + +static struct lan * +new_lan(struct test_case *tc, const char *name) +{ + struct lan *lan = xmalloc(sizeof *lan); + lan->tc = tc; + lan->name = xstrdup(name); + lan->n_conns = 0; + assert(tc->n_lans < ARRAY_SIZE(tc->lans)); + tc->lans[tc->n_lans++] = lan; + return lan; +} + +static void +reconnect_port(struct bridge *b, int port_no, struct lan *new_lan) +{ + struct lan *old_lan; + int j; + + assert(port_no < b->n_ports); + old_lan = b->ports[port_no]; + if (old_lan == new_lan) { + return; + } + + /* Disconnect from old_lan. */ + if (old_lan) { + for (j = 0; j < old_lan->n_conns; j++) { + struct lan_conn *c = &old_lan->conns[j]; + if (c->bridge == b && c->port_no == port_no) { + memmove(c, c + 1, sizeof *c * (old_lan->n_conns - j - 1)); + old_lan->n_conns--; + break; + } + } + } + + /* Connect to new_lan. */ + b->ports[port_no] = new_lan; + if (new_lan) { + int conn_no = new_lan->n_conns++; + assert(conn_no < ARRAY_SIZE(new_lan->conns)); + new_lan->conns[conn_no].bridge = b; + new_lan->conns[conn_no].port_no = port_no; + } +} + +static void +new_port(struct bridge *b, struct lan *lan, int path_cost) +{ + int port_no = b->n_ports++; + struct stp_port *p = stp_get_port(b->stp, port_no); + assert(port_no < ARRAY_SIZE(b->ports)); + b->ports[port_no] = NULL; + stp_port_set_path_cost(p, path_cost); + stp_port_enable(p); + reconnect_port(b, port_no, lan); +} + +static void +dump(struct test_case *tc) +{ + int i; + + for (i = 0; i < tc->n_bridges; i++) { + struct bridge *b = tc->bridges[i]; + struct stp *stp = b->stp; + int j; + + printf("%s:", stp_get_name(stp)); + if (stp_is_root_bridge(stp)) { + printf(" root"); + } + printf("\n"); + for (j = 0; j < b->n_ports; j++) { + struct stp_port *p = stp_get_port(stp, j); + enum stp_state state = stp_port_get_state(p); + + printf("\tport %d", j); + if (b->ports[j]) { + printf(" (lan %s)", b->ports[j]->name); + } else { + printf(" (disconnected)"); + } + printf(": %s", stp_state_name(state)); + if (p == stp_get_root_port(stp)) { + printf(" (root port, root_path_cost=%u)", stp_get_root_path_cost(stp)); + } + printf("\n"); + } + } +} + +static void dump_lan_tree(struct test_case *, struct lan *, int level); + +static void +dump_bridge_tree(struct test_case *tc, struct bridge *b, int level) +{ + int i; + + if (b->reached) { + return; + } + b->reached = true; + for (i = 0; i < level; i++) { + printf("\t"); + } + printf("%s\n", stp_get_name(b->stp)); + for (i = 0; i < b->n_ports; i++) { + struct lan *lan = b->ports[i]; + struct stp_port *p = stp_get_port(b->stp, i); + if (stp_port_get_state(p) == STP_FORWARDING && lan) { + dump_lan_tree(tc, lan, level + 1); + } + } +} + +static void +dump_lan_tree(struct test_case *tc, struct lan *lan, int level) +{ + int i; + + if (lan->reached) { + return; + } + lan->reached = true; + for (i = 0; i < level; i++) { + printf("\t"); + } + printf("%s\n", lan->name); + for (i = 0; i < lan->n_conns; i++) { + struct bridge *b = lan->conns[i].bridge; + dump_bridge_tree(tc, b, level + 1); + } +} + +static void +tree(struct test_case *tc) +{ + int i; + + for (i = 0; i < tc->n_bridges; i++) { + struct bridge *b = tc->bridges[i]; + b->reached = false; + } + for (i = 0; i < tc->n_lans; i++) { + struct lan *lan = tc->lans[i]; + lan->reached = false; + } + for (i = 0; i < tc->n_bridges; i++) { + struct bridge *b = tc->bridges[i]; + struct stp *stp = b->stp; + if (stp_is_root_bridge(stp)) { + dump_bridge_tree(tc, b, 0); + } + } +} + +static void +simulate(struct test_case *tc, int granularity) +{ + int time; + + for (time = 0; time < 1000 * 180; time += granularity) { + int round_trips; + int i; + + for (i = 0; i < tc->n_bridges; i++) { + stp_tick(tc->bridges[i]->stp, granularity); + } + for (round_trips = 0; round_trips < granularity; round_trips++) { + bool any = false; + for (i = 0; i < tc->n_bridges; i++) { + struct bridge *b = tc->bridges[i]; + for (; b->rxq_tail != b->rxq_head; b->rxq_tail++) { + struct bpdu *bpdu = &b->rxq[b->rxq_tail % RXQ_SIZE]; + stp_received_bpdu(stp_get_port(b->stp, bpdu->port_no), + bpdu->data, bpdu->size); + any = true; + } + } + if (!any) { + break; + } + } + } +} + +static void +err(const char *message, ...) + PRINTF_FORMAT(1, 2) + NO_RETURN; + +static void +err(const char *message, ...) +{ + va_list args; + + fprintf(stderr, "%s:%d:%td: ", file_name, line_number, pos - line); + va_start(args, message); + vfprintf(stderr, message, args); + va_end(args); + putc('\n', stderr); + + exit(EXIT_FAILURE); +} + +static void +warn(const char *message, ...) + PRINTF_FORMAT(1, 2); + +static void +warn(const char *message, ...) +{ + va_list args; + + fprintf(stderr, "%s:%d: ", file_name, line_number); + va_start(args, message); + vfprintf(stderr, message, args); + va_end(args); + putc('\n', stderr); + + n_warnings++; +} + +static bool +get_token(void) +{ + char *start; + + while (isspace((unsigned char) *pos)) { + pos++; + } + if (*pos == '\0') { + token = NULL; + return false; + } + + start = pos; + if (isalpha((unsigned char) *pos)) { + while (isalpha((unsigned char) *++pos)) { + continue; + } + } else if (isdigit((unsigned char) *pos)) { + if (*pos == '0' && (pos[1] == 'x' || pos[1] == 'X')) { + pos += 2; + while (isxdigit((unsigned char) *pos)) { + pos++; + } + } else { + while (isdigit((unsigned char) *++pos)) { + continue; + } + } + } else { + pos++; + } + + free(token); + token = xmemdup0(start, pos - start); + return true; +} + +static bool +get_int(int *intp) +{ + char *save_pos = pos; + if (token && isdigit((unsigned char) *token)) { + *intp = strtol(token, NULL, 0); + get_token(); + return true; + } else { + pos = save_pos; + return false; + } +} + +static bool +match(const char *want) +{ + if (token && !strcmp(want, token)) { + get_token(); + return true; + } else { + return false; + } +} + +static int +must_get_int(void) +{ + int x; + if (!get_int(&x)) { + err("expected integer"); + } + return x; +} + +static void +must_match(const char *want) +{ + if (!match(want)) { + err("expected \"%s\"", want); + } +} + +int +main(int argc, char *argv[]) +{ + struct test_case *tc; + FILE *input_file; + int i; + + if (argc != 2) { + ovs_fatal(0, "usage: test-stp INPUT.STP\n"); + } + file_name = argv[1]; + + input_file = fopen(file_name, "r"); + if (!input_file) { + ovs_fatal(errno, "error opening \"%s\"", file_name); + } + + tc = new_test_case(); + for (i = 0; i < 26; i++) { + char name[2]; + name[0] = 'a' + i; + name[1] = '\0'; + new_lan(tc, name); + } + + for (line_number = 1; fgets(line, sizeof line, input_file); + line_number++) + { + char *newline, *hash; + + newline = strchr(line, '\n'); + if (newline) { + *newline = '\0'; + } + hash = strchr(line, '#'); + if (hash) { + *hash = '\0'; + } + + pos = line; + if (!get_token()) { + continue; + } + if (match("bridge")) { + struct bridge *bridge; + int bridge_no, port_no; + + bridge_no = must_get_int(); + if (bridge_no < tc->n_bridges) { + bridge = tc->bridges[bridge_no]; + } else if (bridge_no == tc->n_bridges) { + bridge = new_bridge(tc, must_get_int()); + } else { + err("bridges must be numbered consecutively from 0"); + } + if (match("^")) { + stp_set_bridge_priority(bridge->stp, must_get_int()); + } + + if (match("=")) { + for (port_no = 0; port_no < STP_MAX_PORTS; port_no++) { + struct stp_port *p = stp_get_port(bridge->stp, port_no); + if (!token || match("X")) { + stp_port_disable(p); + } else if (match("_")) { + /* Nothing to do. */ + } else { + struct lan *lan; + int path_cost; + + if (!strcmp(token, "0")) { + lan = NULL; + } else if (strlen(token) == 1 && islower(*token)) { + lan = tc->lans[*token - 'a']; + } else { + err("%s is not a valid LAN name " + "(0 or a lowercase letter)", token); + } + get_token(); + + path_cost = match(":") ? must_get_int() : 10; + if (port_no < bridge->n_ports) { + stp_port_set_path_cost(p, path_cost); + stp_port_enable(p); + reconnect_port(bridge, port_no, lan); + } else if (port_no == bridge->n_ports) { + new_port(bridge, lan, path_cost); + } else { + err("ports must be numbered consecutively"); + } + if (match("^")) { + stp_port_set_priority(p, must_get_int()); + } + } + } + } + } else if (match("run")) { + simulate(tc, must_get_int()); + } else if (match("dump")) { + dump(tc); + } else if (match("tree")) { + tree(tc); + } else if (match("check")) { + struct bridge *b; + struct stp *stp; + int bridge_no, port_no; + + bridge_no = must_get_int(); + if (bridge_no >= tc->n_bridges) { + err("no bridge numbered %d", bridge_no); + } + b = tc->bridges[bridge_no]; + stp = b->stp; + + must_match("="); + + if (match("rootid")) { + uint64_t rootid; + must_match(":"); + rootid = must_get_int(); + if (match("^")) { + rootid |= (uint64_t) must_get_int() << 48; + } else { + rootid |= UINT64_C(0x8000) << 48; + } + if (stp_get_designated_root(stp) != rootid) { + warn("%s: root %"PRIx64", not %"PRIx64, + stp_get_name(stp), stp_get_designated_root(stp), + rootid); + } + } + + if (match("root")) { + if (stp_get_root_path_cost(stp)) { + warn("%s: root path cost of root is %u but should be 0", + stp_get_name(stp), stp_get_root_path_cost(stp)); + } + if (!stp_is_root_bridge(stp)) { + warn("%s: root is %"PRIx64", not %"PRIx64, + stp_get_name(stp), + stp_get_designated_root(stp), stp_get_bridge_id(stp)); + } + for (port_no = 0; port_no < b->n_ports; port_no++) { + struct stp_port *p = stp_get_port(stp, port_no); + enum stp_state state = stp_port_get_state(p); + if (!(state & (STP_DISABLED | STP_FORWARDING))) { + warn("%s: root port %d in state %s", + stp_get_name(b->stp), port_no, + stp_state_name(state)); + } + } + } else { + for (port_no = 0; port_no < STP_MAX_PORTS; port_no++) { + struct stp_port *p = stp_get_port(stp, port_no); + enum stp_state state; + if (token == NULL || match("D")) { + state = STP_DISABLED; + } else if (match("B")) { + state = STP_BLOCKING; + } else if (match("Li")) { + state = STP_LISTENING; + } else if (match("Le")) { + state = STP_LEARNING; + } else if (match("F")) { + state = STP_FORWARDING; + } else if (match("_")) { + continue; + } else { + err("unknown port state %s", token); + } + if (stp_port_get_state(p) != state) { + warn("%s port %d: state is %s but should be %s", + stp_get_name(stp), port_no, + stp_state_name(stp_port_get_state(p)), + stp_state_name(state)); + } + if (state == STP_FORWARDING) { + struct stp_port *root_port = stp_get_root_port(stp); + if (match(":")) { + int root_path_cost = must_get_int(); + if (p != root_port) { + warn("%s: port %d is not the root port", + stp_get_name(stp), port_no); + if (!root_port) { + warn("%s: (there is no root port)", + stp_get_name(stp)); + } else { + warn("%s: (port %d is the root port)", + stp_get_name(stp), + stp_port_no(root_port)); + } + } else if (root_path_cost + != stp_get_root_path_cost(stp)) { + warn("%s: root path cost is %u, should be %d", + stp_get_name(stp), + stp_get_root_path_cost(stp), + root_path_cost); + } + } else if (p == root_port) { + warn("%s: port %d is the root port but " + "not expected to be", + stp_get_name(stp), port_no); + } + } + } + } + if (n_warnings) { + exit(EXIT_FAILURE); + } + } + if (get_token()) { + err("trailing garbage on line"); + } + } + + return 0; +} diff --git a/tests/test-stp.sh b/tests/test-stp.sh new file mode 100755 index 00000000..fd6acf54 --- /dev/null +++ b/tests/test-stp.sh @@ -0,0 +1,7 @@ +#! /bin/sh +set -e +progress= +for d in ${stp_files}; do + echo "Testing $d..." + $SUPERVISOR ./tests/test-stp ${srcdir}/$d +done diff --git a/tests/test-type-props.c b/tests/test-type-props.c new file mode 100644 index 00000000..67dabae8 --- /dev/null +++ b/tests/test-type-props.c @@ -0,0 +1,41 @@ +#include +#include "type-props.h" +#include +#include + +#define MUST_SUCCEED(EXPRESSION) \ + if (!(EXPRESSION)) { \ + fprintf(stderr, "%s:%d: %s failed\n", \ + __FILE__, __LINE__, #EXPRESSION); \ + exit(EXIT_FAILURE); \ + } + +#define TEST_TYPE(type, minimum, maximum, is_signed) \ + MUST_SUCCEED(TYPE_IS_INTEGER(type)); \ + MUST_SUCCEED(TYPE_IS_SIGNED(type) == is_signed); \ + MUST_SUCCEED(TYPE_MAXIMUM(type) == maximum); \ + MUST_SUCCEED(TYPE_MINIMUM(type) == minimum); + +int +main (void) +{ + TEST_TYPE(char, CHAR_MIN, CHAR_MAX, (CHAR_MIN < 0)); + + TEST_TYPE(signed char, SCHAR_MIN, SCHAR_MAX, 1); + TEST_TYPE(short int, SHRT_MIN, SHRT_MAX, 1); + TEST_TYPE(int, INT_MIN, INT_MAX, 1); + TEST_TYPE(long int, LONG_MIN, LONG_MAX, 1); + TEST_TYPE(long long int, LLONG_MIN, LLONG_MAX, 1); + + TEST_TYPE(unsigned char, 0, UCHAR_MAX, 0); + TEST_TYPE(unsigned short int, 0, USHRT_MAX, 0); + TEST_TYPE(unsigned int, 0, UINT_MAX, 0); + TEST_TYPE(unsigned long int, 0, ULONG_MAX, 0); + TEST_TYPE(unsigned long long int, 0, ULLONG_MAX, 0); + + MUST_SUCCEED(!(TYPE_IS_INTEGER(float))); + MUST_SUCCEED(!(TYPE_IS_INTEGER(double))); + MUST_SUCCEED(!(TYPE_IS_INTEGER(long double))); + + return 0; +} diff --git a/third-party/.gitignore b/third-party/.gitignore new file mode 100644 index 00000000..b336cc7c --- /dev/null +++ b/third-party/.gitignore @@ -0,0 +1,2 @@ +/Makefile +/Makefile.in diff --git a/third-party/README b/third-party/README new file mode 100644 index 00000000..15f4d647 --- /dev/null +++ b/third-party/README @@ -0,0 +1,35 @@ +This directory contains third-party software that may be useful for +debugging. + +tcpdump +------- +The "ofp-tcpdump.patch" patch adds the ability to parse OpenFlow +messages to tcpdump. These instructions assume that tcpdump 3.9.8 +is going to be used, but it should work with other versions that are not +substantially different. To begin, download tcpdump and apply the +patch: + + wget http://www.tcpdump.org/release/tcpdump-3.9.8.tar.gz + tar xzf tcpdump-3.9.8.tar.gz + ln -s tcpdump-3.9.8 tcpdump + patch -p0 < ofp-tcpdump.patch + +Then build the new version of tcpdump: + + cd tcpdump + ./configure + make + +Clearly, tcpdump can only parse unencrypted packets, so you will need to +connect the controller and datapath using plain TCP. To look at the +traffic, tcpdump will be started in a manner similar to the following: + + sudo ./tcpdump -s0 -i eth0 port 6633 + +The "-s0" flag indicates that tcpdump should capture the entire packet. +If the OpenFlow message is not received in its entirety, "[|openflow]" will +be printed instead of the OpenFlow message contents. + +The verbosity of the output may be increased by adding additional "-v" +flags. If "-vvv" is used, the raw OpenFlow data is also printed in +hex and ASCII. diff --git a/third-party/automake.mk b/third-party/automake.mk new file mode 100644 index 00000000..02636bb5 --- /dev/null +++ b/third-party/automake.mk @@ -0,0 +1,3 @@ +EXTRA_DIST += \ + third-party/README \ + third-party/ofp-tcpdump.patch diff --git a/third-party/ofp-tcpdump.patch b/third-party/ofp-tcpdump.patch new file mode 100644 index 00000000..d9a23cb1 --- /dev/null +++ b/third-party/ofp-tcpdump.patch @@ -0,0 +1,109 @@ +--- tcpdump/interface.h 2007-06-13 18:03:20.000000000 -0700 ++++ tcpdump/interface.h 2008-04-15 18:28:55.000000000 -0700 +@@ -148,7 +148,8 @@ + + extern const char *dnaddr_string(u_short); + +-extern void error(const char *, ...) ++#define error(fmt, args...) tcpdump_error(fmt, ## args) ++extern void tcpdump_error(const char *, ...) + __attribute__((noreturn, format (printf, 1, 2))); + extern void warning(const char *, ...) __attribute__ ((format (printf, 1, 2))); + +@@ -176,6 +177,7 @@ + extern void hex_print_with_offset(const char *, const u_char *, u_int, u_int); + extern void hex_print(const char *, const u_char *, u_int); + extern void telnet_print(const u_char *, u_int); ++extern void openflow_print(const u_char *, u_int); + extern int ether_encap_print(u_short, const u_char *, u_int, u_int, u_short *); + extern int llc_print(const u_char *, u_int, u_int, const u_char *, + const u_char *, u_short *); +--- tcpdump/Makefile.in 2007-09-25 18:59:52.000000000 -0700 ++++ tcpdump/Makefile.in 2009-05-11 15:59:28.000000000 -0700 +@@ -49,10 +49,10 @@ + CFLAGS = $(CCOPT) $(DEFS) $(INCLS) + + # Standard LDFLAGS +-LDFLAGS = @LDFLAGS@ ++LDFLAGS = @LDFLAGS@ -L../../lib + + # Standard LIBS +-LIBS = @LIBS@ ++LIBS = @LIBS@ -lopenvswitch + + INSTALL = @INSTALL@ + INSTALL_PROGRAM = @INSTALL_PROGRAM@ +@@ -87,7 +87,8 @@ + print-slow.c print-snmp.c print-stp.c print-sunatm.c print-sunrpc.c \ + print-symantec.c print-syslog.c print-tcp.c print-telnet.c print-tftp.c \ + print-timed.c print-token.c print-udp.c print-vjc.c print-vrrp.c \ +- print-wb.c print-zephyr.c setsignal.c tcpdump.c util.c ++ print-wb.c print-zephyr.c setsignal.c tcpdump.c util.c \ ++ print-openflow.c + + LOCALSRC = @LOCALSRC@ + GENSRC = version.c +--- tcpdump/print-openflow.c 1969-12-31 16:00:00.000000000 -0800 ++++ tcpdump/print-openflow.c 2009-05-11 15:38:41.000000000 -0700 +@@ -0,0 +1,40 @@ ++/* Copyright (C) 2007, 2008, 2009 Nicira Networks. ++ * ++ * Permission to use, copy, modify, and/or distribute this software for any ++ * purpose with or without fee is hereby granted, provided that the above ++ * copyright notice and this permission notice appear in all copies. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES ++ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF ++ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ++ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES ++ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ++ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF ++ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ++ */ ++ * ++ ++#ifdef HAVE_CONFIG_H ++#include "config.h" ++#endif ++ ++#include ++ ++#include "interface.h" ++#include "../../include/openflow/openflow.h" ++#include "../../lib/ofp-print.h" ++ ++void ++openflow_print(const u_char *sp, u_int length) ++{ ++ const struct ofp_header *ofp = (struct ofp_header *)sp; ++ ++ if (!TTEST2(*sp, ntohs(ofp->length))) ++ goto trunc; ++ ++ ofp_print(stdout, sp, length, vflag); ++ return; ++ ++trunc: ++ printf("[|openflow]"); ++} +--- tcpdump/print-tcp.c 2006-09-19 12:07:57.000000000 -0700 ++++ tcpdump/print-tcp.c 2009-05-11 15:38:25.000000000 -0700 +@@ -52,6 +52,8 @@ + + #include "nameser.h" + ++#include "../../include/openflow/openflow.h" ++ + #ifdef HAVE_LIBCRYPTO + #include + +@@ -680,7 +682,8 @@ + } + else if (length > 0 && (sport == LDP_PORT || dport == LDP_PORT)) { + ldp_print(bp, length); +- } ++ } else if (sport == OFP_TCP_PORT || dport == OFP_TCP_PORT) ++ openflow_print(bp, length); + } + return; + bad: diff --git a/utilities/.gitignore b/utilities/.gitignore new file mode 100644 index 00000000..32a7f2eb --- /dev/null +++ b/utilities/.gitignore @@ -0,0 +1,22 @@ +/Makefile +/Makefile.in +/nlmon +/ovs-appctl +/ovs-appctl.8 +/ovs-cfg-mod +/ovs-cfg-mod.8 +/ovs-controller +/ovs-controller.8 +/ovs-discover +/ovs-discover.8 +/ovs-dpctl +/ovs-dpctl.8 +/ovs-kill +/ovs-kill.8 +/ovs-ofctl +/ovs-ofctl.8 +/ovs-parse-leaks +/ovs-pki +/ovs-pki-cgi +/ovs-pki.8 +/ovs-wdt diff --git a/utilities/automake.mk b/utilities/automake.mk new file mode 100644 index 00000000..97b827ac --- /dev/null +++ b/utilities/automake.mk @@ -0,0 +1,74 @@ +bin_PROGRAMS += \ + utilities/ovs-appctl \ + utilities/ovs-cfg-mod \ + utilities/ovs-controller \ + utilities/ovs-discover \ + utilities/ovs-dpctl \ + utilities/ovs-kill \ + utilities/ovs-ofctl \ + utilities/ovs-wdt +noinst_PROGRAMS += utilities/nlmon +bin_SCRIPTS += utilities/ovs-pki +noinst_SCRIPTS += utilities/ovs-pki-cgi utilities/ovs-parse-leaks +dist_sbin_SCRIPTS += utilities/ovs-monitor + +EXTRA_DIST += \ + utilities/ovs-appctl.8.in \ + utilities/ovs-cfg-mod.8.in \ + utilities/ovs-controller.8.in \ + utilities/ovs-discover.8.in \ + utilities/ovs-dpctl.8.in \ + utilities/ovs-kill.8.in \ + utilities/ovs-ofctl.8.in \ + utilities/ovs-parse-leaks.in \ + utilities/ovs-pki-cgi.in \ + utilities/ovs-pki.8.in \ + utilities/ovs-pki.in +DISTCLEANFILES += \ + utilities/ovs-appctl.8 \ + utilities/ovs-cfg-mod.8 \ + utilities/ovs-controller.8 \ + utilities/ovs-discover.8 \ + utilities/ovs-dpctl.8 \ + utilities/ovs-kill.8 \ + utilities/ovs-ofctl.8 \ + utilities/ovs-parse-leaks \ + utilities/ovs-pki \ + utilities/ovs-pki.8 \ + utilities/ovs-pki-cgi + +man_MANS += \ + utilities/ovs-appctl.8 \ + utilities/ovs-cfg-mod.8 \ + utilities/ovs-controller.8 \ + utilities/ovs-discover.8 \ + utilities/ovs-dpctl.8 \ + utilities/ovs-kill.8 \ + utilities/ovs-ofctl.8 \ + utilities/ovs-pki.8 + +utilities_ovs_appctl_SOURCES = utilities/ovs-appctl.c +utilities_ovs_appctl_LDADD = lib/libopenvswitch.a + +utilities_ovs_cfg_mod_SOURCES = utilities/ovs-cfg-mod.c +utilities_ovs_cfg_mod_LDADD = lib/libopenvswitch.a + +utilities_ovs_controller_SOURCES = utilities/ovs-controller.c +utilities_ovs_controller_LDADD = lib/libopenvswitch.a $(FAULT_LIBS) $(SSL_LIBS) + +utilities_ovs_discover_SOURCES = utilities/ovs-discover.c +utilities_ovs_discover_LDADD = lib/libopenvswitch.a + +utilities_ovs_dpctl_SOURCES = utilities/ovs-dpctl.c +utilities_ovs_dpctl_LDADD = lib/libopenvswitch.a $(FAULT_LIBS) + +utilities_ovs_kill_SOURCES = utilities/ovs-kill.c +utilities_ovs_kill_LDADD = lib/libopenvswitch.a + +utilities_ovs_ofctl_SOURCES = utilities/ovs-ofctl.c +utilities_ovs_ofctl_LDADD = lib/libopenvswitch.a $(FAULT_LIBS) $(SSL_LIBS) + +utilities_ovs_wdt_SOURCES = utilities/ovs-wdt.c + +utilities_nlmon_SOURCES = utilities/nlmon.c +utilities_nlmon_LDADD = lib/libopenvswitch.a diff --git a/utilities/nlmon.c b/utilities/nlmon.c new file mode 100644 index 00000000..eb1be60a --- /dev/null +++ b/utilities/nlmon.c @@ -0,0 +1,90 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "netlink.h" +#include "ofpbuf.h" +#include "poll-loop.h" +#include "timeval.h" +#include "util.h" +#include "vlog.h" + +static const struct nl_policy rtnlgrp_link_policy[] = { + [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false }, + [IFLA_MASTER] = { .type = NL_A_U32, .optional = true }, +}; + +int +main(int argc UNUSED, char *argv[]) +{ + struct nl_sock *sock; + int error; + + set_program_name(argv[0]); + time_init(); + vlog_init(); + vlog_set_levels(VLM_ANY_MODULE, VLF_ANY_FACILITY, VLL_DBG); + + error = nl_sock_create(NETLINK_ROUTE, RTNLGRP_LINK, 0, 0, &sock); + if (error) { + ovs_fatal(error, "could not create rtnetlink socket"); + } + + for (;;) { + struct ofpbuf *buf; + + error = nl_sock_recv(sock, &buf, false); + if (error == EAGAIN) { + /* Nothing to do. */ + } else if (error == ENOBUFS) { + ovs_error(0, "network monitor socket overflowed"); + } else if (error) { + ovs_fatal(error, "error on network monitor socket"); + } else { + struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)]; + struct nlmsghdr *nlh; + struct ifinfomsg *iim; + + nlh = ofpbuf_at(buf, 0, NLMSG_HDRLEN); + iim = ofpbuf_at(buf, NLMSG_HDRLEN, sizeof *iim); + if (!iim) { + ovs_error(0, "received bad rtnl message (no ifinfomsg)"); + ofpbuf_delete(buf); + continue; + } + + if (!nl_policy_parse(buf, NLMSG_HDRLEN + sizeof(struct ifinfomsg), + rtnlgrp_link_policy, + attrs, ARRAY_SIZE(rtnlgrp_link_policy))) { + ovs_error(0, "received bad rtnl message (policy)"); + ofpbuf_delete(buf); + continue; + } + printf("netdev %s changed (%s):\n", + nl_attr_get_string(attrs[IFLA_IFNAME]), + (nlh->nlmsg_type == RTM_NEWLINK ? "RTM_NEWLINK" + : nlh->nlmsg_type == RTM_DELLINK ? "RTM_DELLINK" + : nlh->nlmsg_type == RTM_GETLINK ? "RTM_GETLINK" + : nlh->nlmsg_type == RTM_SETLINK ? "RTM_SETLINK" + : "other")); + if (attrs[IFLA_MASTER]) { + uint32_t idx = nl_attr_get_u32(attrs[IFLA_MASTER]); + char ifname[IFNAMSIZ]; + if (!if_indextoname(idx, ifname)) { + strcpy(ifname, "unknown"); + } + printf("\tmaster=%"PRIu32" (%s)\n", idx, ifname); + } + ofpbuf_delete(buf); + } + + nl_sock_wait(sock, POLLIN); + poll_block(); + } +} + diff --git a/utilities/ovs-appctl.8.in b/utilities/ovs-appctl.8.in new file mode 100644 index 00000000..9bf97fd2 --- /dev/null +++ b/utilities/ovs-appctl.8.in @@ -0,0 +1,166 @@ +.\" -*- nroff -*- +.de IQ +. br +. ns +. IP "\\$1" +.. +.TH ovs\-appctl 8 "April 2009" "Open vSwitch" "Open vSwitch Manual" +.ds PN ovs\-appctl + +.SH NAME +ovs\-appctl \- utility for configuring running Open vSwitch daemons + +.SH SYNOPSIS +\fBovs\-appctl\fR [\fB-h\fR | \fB--help\fR] [\fItarget\fR...] [\fIaction\fR...] +.sp 1 +The available \fItarget\fR options are: +.br +[\fB-t\fR \fIpid\fR | \fB--target=\fIpid\fR] +.sp 1 +The available \fIaction\fR options are: +.br +[\fB-l\fR | \fB--list\fR] [\fB-s\fR +\fImodule\fR[\fB:\fIfacility\fR[\fB:\fIlevel\fR]] | +\fB--set=\fImodule\fR[\fB:\fIfacility\fR[\fB:\fIlevel\fR]]] +[\fB-r\fR | \fB--reopen\fR] +[\fB-e\fR | \fB--execute=\fIcommand\fR] + +.SH DESCRIPTION +The \fBovs\-appctl\fR program connects to one or more running +Open vSwitch daemons (such as \fBovs\-vswitchd\fR(8)), as specified by the +user, and sends them commands to query or modify their behavior. +Its primary purpose is currently to adjust daemons' logging levels. + +\fBovs\-appctl\fR applies one or more actions to each of one or more +target processes. Targets may be specified using: + +.IP "\fB-t \fIsocket\fR" +.IQ "\fB--target=\fIsocket\fR" +The specified \fIsocket\fR must be the name of a Unix domain socket +for a \fBovs\-appctl\fR-controllable process. If \fIsocket\fR does not +begin with \fB/\fR, it is treated as relative to \fB@RUNDIR@\fR. + +Each Open vSwitch daemon by default creates a socket named +\fB@RUNDIR@/\fIprogram\fB.\fIpid\fB.ctl\fR, where \fIprogram\fR is +the program's name (such as \fBovs\-vswitchd\fR) and \fIpid\fR is the +daemon's PID. + +.PP +The available actions are: + +.IP "\fB-l\fR" +.IQ "\fB--list\fR" +Print the list of known logging modules and their current levels to +stdout. + +.IP "\fB-s\fR \fImodule\fR[\fB:\fIfacility\fR[\fB:\fIlevel\fR]]" +.IQ "\fB--set=\fImodule\fR[\fB:\fIfacility\fR[\fB:\fIlevel\fR]]" + +Sets the logging level for \fImodule\fR in \fIfacility\fR to +\fIlevel\fR. The \fImodule\fR may be any valid module name (as +displayed by the \fB--list\fR option) or the special name \fBANY\fR to +set the logging levels for all modules. The \fIfacility\fR may be +\fBsyslog\fR or \fBconsole\fR to set the levels for logging to the +system log or to the console, respectively, or \fBANY\fR to set the +logging levels for both facilities. If it is omitted, +\fIfacility\fR defaults to \fBANY\fR. The \fIlevel\fR must be one of +\fBemer\fR, \fBerr\fR, \fBwarn\fR, \fBinfo\fR, or \fBdbg\fR, designating the +minimum severity of a message for it to be logged. If it is omitted, +\fIlevel\fR defaults to \fBdbg\fR. + +.IP "\fB-s PATTERN:\fIfacility\fB:\fIpattern\fR" +.IQ "\fB--set=PATTERN:\fIfacility\fB:\fIpattern\fR" + +Sets the log pattern for \fIfacility\fR to \fIpattern\fR. Each time a +message is logged to \fIfacility\fR, \fIpattern\fR determines the +message's formatting. Most characters in \fIpattern\fR are copied +literally to the log, but special escapes beginning with \fB%\fR are +expanded as follows: + +.RS +.IP \fB%A\fR +The name of the application logging the message, e.g. \fBsecchan\fR. + +.IP \fB%c\fR +The name of the module (as shown by \fBovs\-appctl --list\fR) logging +the message. + +.IP \fB%d\fR +The current date and time in ISO 8601 format (YYYY-MM-DD HH:MM:SS). + +.IP \fB%d{\fIformat\fB}\fR +The current date and time in the specified \fIformat\fR, which takes +the same format as the \fItemplate\fR argument to \fBstrftime\fR(3). + +.IP \fB%m\fR +The message being logged. + +.IP \fB%N\fR +A serial number for this message within this run of the program, as a +decimal number. The first message a program logs has serial number 1, +the second one has serial number 2, and so on. + +.IP \fB%n\fR +A new-line. + +.IP \fB%p\fR +The level at which the message is logged, e.g. \fBDBG\fR. + +.IP \fB%P\fR +The program's process ID (pid), as a decimal number. + +.IP \fB%r\fR +The number of milliseconds elapsed from the start of the application +to the time the message was logged. + +.IP \fB%%\fR +A literal \fB%\fR. +.RE + +.IP +A few options may appear between the \fB%\fR and the format specifier +character, in this order: + +.RS +.IP \fB-\fR +Left justify the escape's expansion within its field width. Right +justification is the default. + +.IP \fB0\fR +Pad the field to the field width with \fB0\fRs. Padding with spaces +is the default. + +.IP \fIwidth\fR +A number specifies the minimum field width. If the escape expands to +fewer characters than \fIwidth\fR then it is padded to fill the field +width. (A field wider than \fIwidth\fR is not truncated to fit.) +.RE + +.IP +The default pattern for console output is \fB%d{%b %d +%H:%M:%S}|%05N|%c|%p|%m\fR; for syslog output, \fB%05N|%c|%p|%m\fR. + +.IP \fB-r\fR +.IQ \fB--reopen\fR +Causes the target application to close and reopen its log file. (This +is useful after rotating log files, to cause a new log file to be +used.) + +This has no effect if the target application was not invoked with the +\fB--log-file\fR option. + +.IP "\fB-e \fIcommand\fR" +.IQ "\fB--execute=\fIcommand\fR" +Passes the specified \fIcommand\fR literally to the target application +and prints its response to stdout, if successful, or to stderr if an +error occurs. Use \fB-e help\fR to print a list of available commands. + +.SH OPTIONS + +.so lib/common.man + +.SH "SEE ALSO" + +.BR ovs\-controller (8), +.BR ovs\-dpctl (8), +.BR secchan (8) diff --git a/utilities/ovs-appctl.c b/utilities/ovs-appctl.c new file mode 100644 index 00000000..eb544452 --- /dev/null +++ b/utilities/ovs-appctl.c @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#include +#include "vlog.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "command-line.h" +#include "compiler.h" +#include "timeval.h" +#include "unixctl.h" +#include "util.h" + +static void +usage(char *prog_name, int exit_code) +{ + printf("Usage: %s [TARGET] [ACTION...]\n" + "Targets:\n" + " -t, --target=TARGET Path to Unix domain socket\n" + "Actions:\n" + " -l, --list List current settings\n" + " -s, --set=MODULE[:FACILITY[:LEVEL]]\n" + " Set MODULE and FACILITY log level to LEVEL\n" + " MODULE may be any valid module name or 'ANY'\n" + " FACILITY may be 'syslog', 'console', 'file', or 'ANY' (default)\n" + " LEVEL may be 'emer', 'err', 'warn', 'info', or 'dbg' (default)\n" + " -r, --reopen Make the program reopen its log file\n" + " -e, --execute=COMMAND Execute control COMMAND and print its output\n" + "Other options:\n" + " -h, --help Print this helpful information\n" + " -V, --version Display version information\n", + prog_name); + exit(exit_code); +} + +static char * +transact(struct unixctl_client *client, const char *request, bool *ok) +{ + int code; + char *reply; + int error = unixctl_client_transact(client, request, &code, &reply); + if (error) { + fprintf(stderr, "%s: transaction error: %s\n", + unixctl_client_target(client), strerror(error)); + *ok = false; + return xstrdup(""); + } else { + if (code / 100 != 2) { + fprintf(stderr, "%s: server returned reply code %03d\n", + unixctl_client_target(client), code); + } + return reply; + } +} + +static void +transact_ack(struct unixctl_client *client, const char *request, bool *ok) +{ + free(transact(client, request, ok)); +} + +static void +execute_command(struct unixctl_client *client, const char *request, bool *ok) +{ + int code; + char *reply; + int error = unixctl_client_transact(client, request, &code, &reply); + if (error) { + fprintf(stderr, "%s: transaction error: %s\n", + unixctl_client_target(client), strerror(error)); + *ok = false; + } else { + if (code / 100 != 2) { + fprintf(stderr, "%s: server returned reply code %03d\n", + unixctl_client_target(client), code); + fputs(reply, stderr); + *ok = false; + } else { + fputs(reply, stdout); + } + } +} + +static void +add_target(struct unixctl_client ***clients, size_t *n_clients, + const char *path, bool *ok) +{ + struct unixctl_client *client; + int error = unixctl_client_create(path, &client); + if (error) { + fprintf(stderr, "Error connecting to \"%s\": %s\n", + path, strerror(error)); + *ok = false; + } else { + *clients = xrealloc(*clients, sizeof *clients * (*n_clients + 1)); + (*clients)[*n_clients] = client; + ++*n_clients; + } +} + +int main(int argc, char *argv[]) +{ + static const struct option long_options[] = { + /* Target options must come first. */ + {"target", required_argument, NULL, 't'}, + {"help", no_argument, NULL, 'h'}, + {"version", no_argument, NULL, 'V'}, + + /* Action options come afterward. */ + {"list", no_argument, NULL, 'l'}, + {"set", required_argument, NULL, 's'}, + {"reopen", no_argument, NULL, 'r'}, + {"execute", required_argument, NULL, 'e'}, + {0, 0, 0, 0}, + }; + char *short_options; + + /* Determine targets. */ + bool ok = true; + int n_actions = 0; + struct unixctl_client **clients = NULL; + size_t n_clients = 0; + + set_program_name(argv[0]); + time_init(); + + short_options = long_options_to_short_options(long_options); + for (;;) { + int option; + size_t i; + + option = getopt_long(argc, argv, short_options, long_options, NULL); + if (option == -1) { + break; + } + if (!strchr("thV", option) && n_clients == 0) { + ovs_fatal(0, "no targets specified (use --help for help)"); + } else { + ++n_actions; + } + switch (option) { + case 't': + add_target(&clients, &n_clients, optarg, &ok); + break; + + case 'l': + for (i = 0; i < n_clients; i++) { + struct unixctl_client *client = clients[i]; + char *reply; + + printf("%s:\n", unixctl_client_target(client)); + reply = transact(client, "vlog/list", &ok); + fputs(reply, stdout); + free(reply); + } + break; + + case 's': + for (i = 0; i < n_clients; i++) { + struct unixctl_client *client = clients[i]; + char *request = xasprintf("vlog/set %s", optarg); + transact_ack(client, request, &ok); + free(request); + } + break; + + case 'r': + for (i = 0; i < n_clients; i++) { + struct unixctl_client *client = clients[i]; + char *request = xstrdup("vlog/reopen"); + transact_ack(client, request, &ok); + free(request); + } + break; + + case 'e': + for (i = 0; i < n_clients; i++) { + execute_command(clients[i], optarg, &ok); + } + break; + + case 'h': + usage(argv[0], EXIT_SUCCESS); + break; + + case 'V': + OVS_PRINT_VERSION(0, 0); + exit(EXIT_SUCCESS); + + case '?': + exit(EXIT_FAILURE); + + default: + NOT_REACHED(); + } + } + if (!n_actions) { + fprintf(stderr, + "warning: no actions specified (use --help for help)\n"); + } + exit(ok ? 0 : 1); +} diff --git a/utilities/ovs-cfg-mod.8.in b/utilities/ovs-cfg-mod.8.in new file mode 100644 index 00000000..5b96f288 --- /dev/null +++ b/utilities/ovs-cfg-mod.8.in @@ -0,0 +1,101 @@ +.\" -*- nroff -*- +.de IQ +. br +. ns +. IP "\\$1" +.. +.TH ovs\-cfg\-mod 8 "June 2009" "Open vSwitch" "Open vSwitch Manual" +.ds PN ovs\-cfg\-mod +. +.SH NAME +ovs\-cfg\-mod \- Safely manage a ovs\-vswitchd.conf\-style configuration file +. +.SH SYNOPSIS +\fB ovs\-cfg\-mod \fR[\fB\-T \fItimeout\fR] \fB\-F \fIfile\fR +[\fIaction\fR] [\fIaction\fR...\fR] +. +.SH DESCRIPTION +A program for managing a \fovs\-vswitchd.conf\fR(5)\-style configuration +file. \fBovs\-cfg\-mod\fR uses the same locking mechanisms as +\fBovs\-vswitchd\fR and its related utilities. This allows it to be +run safely on ``live'' configurations. +. +.SH OPTIONS +.SS "Specifying the Configuration File" +. +.IP "\fB\-T\fR \fItimeout\fR +.IQ "\fB\-\-timeout=\fItimeout\fR +By default, \fBovs\-cfg\-mod\fR will wait forever to lock the +configuration file specified on \fB\-F\fR or \fB\-\-config\-file\fR. This +option makes \fBovs\-cfg\-mod\fR wait no more than \fItimeout\fR +milliseconds to obtain the lock, after which it exits unsuccessfully. +. +If it is present, this option must be specified before \fB\-F\fR or +\fB\-\-config\-file\fR. +. +.IP "\fB\-F\fR \fIfile\fR" +.IQ "\fB\-\-config\-file=\fIfile\fR" +Use \fIfile\fR as the configuration file to query or modify. +. +This option is required. It must be specified before any action +options. +. +.SS "Specifying Actions" +A series of one or more action options may follow the configuration +file options. These are executed in the order provided and under a +single lock instance, so they appear atomic to external viewers of +\fIfile\fR. +. +As discussed in \fBovs\-vswitchd.conf\fR(5), each line in the +configuration file consists of a key\-value pair. Actions generally +take either a \fIkey\fR or \fIentry\fR argument. A \fIkey\fR is a +dot\-separated description of a configuration option. A \fIentry\fR is +a key\-value pair, separated by the \fB=\fR sign. +. +The following actions are supported: +. +.IP "\fB\-a\fR \fIentry\fR" +.IQ "\fB\-\-add=\fIentry\fR" +Add \fIentry\fR to \fIfile\fR. Please note that duplicates are +allowed, so if a unique key is required, a delete must be done first. +. +.IP "\fB\-d\fR \fIentry\fR" +.IQ "\fB\-\-del\-entry=\fIentry\fR" +Delete \fIentry\fR from \fIfile\fR. Deletes only the first entry +that matches \fIentry\fR. +. +.IP "\fB\-D\fR \fIkey\fR" +.IQ "\fB\-\-del\-section=\fIkey\fR" +Delete section \fIkey\fR from \fIfile\fR. +. +.IP "\fB\-\-del\-match=\fIpattern\fR" +Deletes every entry that matches the given shell glob \fIpattern\fR. +For example, \fB\-\-del\-match=bridge.*.port=*\fR deletes all the ports +from every bridge, and \fB\-\-del\-match=bonding.bond0.*\fR is equivalent +to \fB\-\-del\-section=bonding.bond0\fR. +. +.IP "\fB\-q\fR \fIkey\fR" +.IQ "\fB\-\-query=\fIkey\fR" +Queries \fIfile\fR for entries that match \fIkey\fR. Each matching +value is printed on a separate line. Duplicates will be printed +multiple times. +. +.IP "\fB\-c\fR" +.IQ "\fB\-\-changes\fR" +Logs all of the changes made to the configuration file in a ``unified +diff''\-like format. Only actual changes are logged, so that if, for +example, a \fB\-\-del\-match\fR action did not match any key\-value pairs, +then nothing will be logged due to that action. Furthermore, only the +net effects of changes are logged: if a key\-value pair was deleted and +then an identical key\-value pair was added back, then nothing would be +logged due to those changes. +. +This action logs changes that have taken effect at the point where it +is inserted. Thus, if it is given before any other action, it will +not log any changes. If \fB\-\-changes\fR is given more than once, +instances after the first log only the changes since the previous +instance. +. +.SH "SEE ALSO" +.BR ovs\-vswitchd (8), +.BR ovs\-vswitchd.conf (5) diff --git a/utilities/ovs-cfg-mod.c b/utilities/ovs-cfg-mod.c new file mode 100644 index 00000000..53ebd00a --- /dev/null +++ b/utilities/ovs-cfg-mod.c @@ -0,0 +1,239 @@ +/* Copyright (c) 2008, 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + */ +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "cfg.h" +#include "command-line.h" +#include "svec.h" +#include "timeval.h" +#include "util.h" + +#define THIS_MODULE VLM_cfg_mod +#include "vlog.h" + +/* Configuration when we first read the configuration file. */ +static struct svec orig_cfg = SVEC_EMPTY_INITIALIZER; + +static void +usage(char *prog_name, int exit_code) +{ + printf("Usage: %s --config-file=FILE ACTIONS\n" + "\nConfig:\n" + " -T, --timeout=MS wait at most MS milliseconds for lock\n" + " -F, --config-file=FILE use configuration FILE\n" + "\nActions:\n" + " -a, --add=ENTRY add ENTRY\n" + " -d, --del-entry=ENTRY delete ENTRY\n" + " -D, --del-section=KEY delete section matching KEY\n" + " --del-match=PATTERN delete entries matching shell PATTERN\n" + " -q, --query=KEY return all entries matching KEY\n" + " -c, --log-changes log changes up to this point\n" + "\nOther options:\n" + " -h, --help display this help message\n" + " -V, --version display version information\n", + prog_name); + exit(exit_code); +} + +static void +open_config(char *config_file, int timeout) +{ + int error; + + error = cfg_set_file(config_file); + if (error) { + ovs_fatal(error, "failed to add configuration file \"%s\"", + config_file); + } + + error = cfg_lock(NULL, timeout); + if (error) { + ovs_fatal(error, "could not lock configuration file\n"); + } + + cfg_get_all(&orig_cfg); +} + +static void +print_vals(char *key) +{ + struct svec vals; + int i; + + svec_init(&vals); + cfg_get_all_strings(&vals, "%s", key); + + for (i=0; i UCHAR_MAX || !strchr("FhVv?", option)) + && config_set == false) { + ovs_fatal(0, "no config file specified (use --help for help)"); + } + + switch (option) { + case 'T': + if (config_set) { + ovs_fatal(0, "--timeout or -T must be specified " + "before --file or -F"); + } + timeout = atoi(optarg); + break; + + case 'F': + open_config(optarg, timeout); + config_set = true; + break; + + case 'a': + cfg_add_entry("%s", optarg); + break; + + case 'd': + cfg_del_entry("%s", optarg); + break; + + case 'D': + cfg_del_section("%s", optarg); + break; + + case OPT_DEL_MATCH: + cfg_del_match("%s", optarg); + break; + + case 'q': + print_vals(optarg); + break; + + case 'c': + log_diffs(); + break; + + case 'h': + usage(argv[0], EXIT_SUCCESS); + break; + + case 'V': + OVS_PRINT_VERSION(0, 0); + exit(EXIT_SUCCESS); + + case 'v': + vlog_set_verbosity(optarg); + break; + + case '?': + exit(EXIT_FAILURE); + + default: + NOT_REACHED(); + } + } + free(short_options); + + if (optind != argc) { + ovs_fatal(0, "non-option arguments not accepted " + "(use --help for help)"); + } + + if (cfg_is_dirty()) { + cfg_write(); + } + cfg_unlock(); + + exit(0); +} diff --git a/utilities/ovs-controller.8.in b/utilities/ovs-controller.8.in new file mode 100644 index 00000000..31c7a865 --- /dev/null +++ b/utilities/ovs-controller.8.in @@ -0,0 +1,132 @@ +.TH ovs\-controller 8 "March 2009" "Open vSwitch" "Open vSwitch Manual" +.ds PN ovs\-controller + +.SH NAME +ovs\-controller \- simple OpenFlow controller reference implementation + +.SH SYNOPSIS +.B ovs\-controller +[\fIoptions\fR] \fImethod\fR \fB[\fImethod\fR]\&... + +.SH DESCRIPTION +\fBovs\-controller\fR manages any number of remote switches over OpenFlow +protocol, causing them to function as L2 MAC-learning switches or hub. + +\fBovs\-controller\fR controls one or more OpenFlow switches, specified as +one or more of the following OpenFlow connection methods: + +.TP +\fBpssl:\fR[\fIport\fR] +Listens for SSL connections from remote OpenFlow switches on +\fIport\fR (default: 6633). The \fB--private-key\fR, +\fB--certificate\fR, and \fB--ca-cert\fR options are mandatory when +this form is used. + +.TP +\fBptcp:\fR[\fIport\fR] +Listens for TCP connections from remote OpenFlow switches on +\fIport\fR (default: 6633). + +.TP +\fBpunix:\fIfile\fR +Listens for connections from OpenFlow switches on the Unix domain +server socket named \fIfile\fR. + +.TP +\fBssl:\fIhost\fR[\fB:\fIport\fR] +The specified SSL \fIport\fR (default: 6633) on the given remote +\fIhost\fR. The \fB--private-key\fR, \fB--certificate\fR, and +\fB--ca-cert\fR options are mandatory when this form is used. + +.TP +\fBtcp:\fIhost\fR[\fB:\fIport\fR] +The specified TCP \fIport\fR (default: 6633) on the given remote +\fIhost\fR. + +.TP +\fBunix:\fIfile\fR +The Unix domain server socket named \fIfile\fR. + +.SH OPTIONS +.TP +\fB-p\fR, \fB--private-key=\fIprivkey.pem\fR +Specifies a PEM file containing the private key used as the switch's +identity for SSL connections to the controller. + +.TP +\fB-c\fR, \fB--certificate=\fIcert.pem\fR +Specifies a PEM file containing a certificate, signed by the +controller's certificate authority (CA), that certifies the switch's +private key to identify a trustworthy switch. + +.TP +\fB-C\fR, \fB--ca-cert=\fIswitch-cacert.pem\fR +Specifies a PEM file containing the CA certificate used to verify that +the switch is connected to a trustworthy controller. + +.TP +\fB--peer-ca-cert=\fIcontroller-cacert.pem\fR +Specifies a PEM file that contains one or more additional certificates +to send to switches. \fIcontroller-cacert.pem\fR should be the CA +certificate used to sign the controller's own certificate (the +certificate specified on \fB-c\fR or \fB--certificate\fR). + +This option is not useful in normal operation, because the switch must +already have the controller CA certificate for it to have any +confidence in the controller's identity. However, this option allows +a newly installed switch to obtain the controller CA certificate on +first boot using, e.g., the \fB--bootstrap-ca-cert\fR option to +\fBsecchan\fR(8). + +.IP "\fB-n\fR, \fB--noflow\fR" +By default, \fBovs\-controller\fR sets up a flow in each OpenFlow switch +whenever it receives a packet whose destination is known due through +MAC learning. This option disables flow setup, so that every packet +in the network passes through the controller. + +This option is most useful for debugging. It reduces switching +performance, so it should not be used in production. + +.TP +\fB--max-idle=\fIsecs\fR|\fBpermanent\fR +Sets \fIsecs\fR as the number of seconds that a flow set up by the +controller will remain in the switch's flow table without any matching +packets being seen. If \fBpermanent\fR is specified, which is not +recommended, flows will never expire. The default is 60 seconds. + +This option affects only flows set up by the OpenFlow controller. In +some configurations, the switch can set up some flows +on its own. To set the idle time for those flows, pass +\fB--max-idle\fR to \fBsecchan\fR (on the switch). + +This option has no effect when \fB-n\fR (or \fB--noflow\fR) is in use +(because the controller does not set up flows in that case). + +.IP "\fB-H\fR, \fB--hub\fR" +By default, the controller acts as an L2 MAC-learning switch. This +option changes its behavior to that of a hub that floods packets on +all but the incoming port. + +If \fB-H\fR (or \fB--hub\fR) and \fB-n\fR (or \fB--noflow\fR) are used +together, then the cumulative effect is that every packet passes +through the controller and every packet is flooded. + +This option is most useful for debugging. It reduces switching +performance, so it should not be used in production. + +.so lib/daemon.man +.so lib/vlog.man +.so lib/common.man + +.SH EXAMPLES + +.TP +To bind locally to port 6633 (the default) and wait for incoming connections from OpenFlow switches: + +.B % ovs\-controller ptcp: + +.SH "SEE ALSO" + +.BR secchan (8), +.BR ovs\-appctl (8), +.BR ovs\-dpctl (8) diff --git a/utilities/ovs-controller.c b/utilities/ovs-controller.c new file mode 100644 index 00000000..423ce195 --- /dev/null +++ b/utilities/ovs-controller.c @@ -0,0 +1,323 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include "command-line.h" +#include "compiler.h" +#include "daemon.h" +#include "fault.h" +#include "learning-switch.h" +#include "ofpbuf.h" +#include "openflow/openflow.h" +#include "poll-loop.h" +#include "rconn.h" +#include "timeval.h" +#include "unixctl.h" +#include "util.h" +#include "vconn-ssl.h" +#include "vconn.h" + +#include "vlog.h" +#define THIS_MODULE VLM_controller + +#define MAX_SWITCHES 16 +#define MAX_LISTENERS 16 + +struct switch_ { + struct lswitch *lswitch; + struct rconn *rconn; +}; + +/* Learn the ports on which MAC addresses appear? */ +static bool learn_macs = true; + +/* Set up flows? (If not, every packet is processed at the controller.) */ +static bool setup_flows = true; + +/* --max-idle: Maximum idle time, in seconds, before flows expire. */ +static int max_idle = 60; + +static int do_switching(struct switch_ *); +static void new_switch(struct switch_ *, struct vconn *, const char *name); +static void parse_options(int argc, char *argv[]); +static void usage(void) NO_RETURN; + +int +main(int argc, char *argv[]) +{ + struct unixctl_server *unixctl; + struct switch_ switches[MAX_SWITCHES]; + struct pvconn *listeners[MAX_LISTENERS]; + int n_switches, n_listeners; + int retval; + int i; + + set_program_name(argv[0]); + register_fault_handlers(); + time_init(); + vlog_init(); + parse_options(argc, argv); + signal(SIGPIPE, SIG_IGN); + + if (argc - optind < 1) { + ovs_fatal(0, "at least one vconn argument required; " + "use --help for usage"); + } + + n_switches = n_listeners = 0; + for (i = optind; i < argc; i++) { + const char *name = argv[i]; + struct vconn *vconn; + int retval; + + retval = vconn_open(name, OFP_VERSION, &vconn); + if (!retval) { + if (n_switches >= MAX_SWITCHES) { + ovs_fatal(0, "max %d switch connections", n_switches); + } + new_switch(&switches[n_switches++], vconn, name); + continue; + } else if (retval == EAFNOSUPPORT) { + struct pvconn *pvconn; + retval = pvconn_open(name, &pvconn); + if (!retval) { + if (n_listeners >= MAX_LISTENERS) { + ovs_fatal(0, "max %d passive connections", n_listeners); + } + listeners[n_listeners++] = pvconn; + } + } + if (retval) { + VLOG_ERR("%s: connect: %s", name, strerror(retval)); + } + } + if (n_switches == 0 && n_listeners == 0) { + ovs_fatal(0, "no active or passive switch connections"); + } + + die_if_already_running(); + daemonize(); + + retval = unixctl_server_create(NULL, &unixctl); + if (retval) { + ovs_fatal(retval, "Could not listen for unixctl connections"); + } + + while (n_switches > 0 || n_listeners > 0) { + int iteration; + int i; + + /* Accept connections on listening vconns. */ + for (i = 0; i < n_listeners && n_switches < MAX_SWITCHES; ) { + struct vconn *new_vconn; + int retval; + + retval = pvconn_accept(listeners[i], OFP_VERSION, &new_vconn); + if (!retval || retval == EAGAIN) { + if (!retval) { + new_switch(&switches[n_switches++], new_vconn, "tcp"); + } + i++; + } else { + pvconn_close(listeners[i]); + listeners[i] = listeners[--n_listeners]; + } + } + + /* Do some switching work. Limit the number of iterations so that + * callbacks registered with the poll loop don't starve. */ + for (iteration = 0; iteration < 50; iteration++) { + bool progress = false; + for (i = 0; i < n_switches; ) { + struct switch_ *this = &switches[i]; + int retval = do_switching(this); + if (!retval || retval == EAGAIN) { + if (!retval) { + progress = true; + } + i++; + } else { + rconn_destroy(this->rconn); + lswitch_destroy(this->lswitch); + switches[i] = switches[--n_switches]; + } + } + if (!progress) { + break; + } + } + for (i = 0; i < n_switches; i++) { + struct switch_ *this = &switches[i]; + lswitch_run(this->lswitch, this->rconn); + } + + unixctl_server_run(unixctl); + + /* Wait for something to happen. */ + if (n_switches < MAX_SWITCHES) { + for (i = 0; i < n_listeners; i++) { + pvconn_wait(listeners[i]); + } + } + for (i = 0; i < n_switches; i++) { + struct switch_ *sw = &switches[i]; + rconn_run_wait(sw->rconn); + rconn_recv_wait(sw->rconn); + lswitch_wait(sw->lswitch); + } + unixctl_server_wait(unixctl); + poll_block(); + } + + return 0; +} + +static void +new_switch(struct switch_ *sw, struct vconn *vconn, const char *name) +{ + sw->rconn = rconn_new_from_vconn(name, vconn); + sw->lswitch = lswitch_create(sw->rconn, learn_macs, + setup_flows ? max_idle : -1); +} + +static int +do_switching(struct switch_ *sw) +{ + unsigned int packets_sent; + struct ofpbuf *msg; + + packets_sent = rconn_packets_sent(sw->rconn); + + msg = rconn_recv(sw->rconn); + if (msg) { + lswitch_process_packet(sw->lswitch, sw->rconn, msg); + ofpbuf_delete(msg); + } + rconn_run(sw->rconn); + + return (!rconn_is_alive(sw->rconn) ? EOF + : rconn_packets_sent(sw->rconn) != packets_sent ? 0 + : EAGAIN); +} + +static void +parse_options(int argc, char *argv[]) +{ + enum { + OPT_MAX_IDLE = UCHAR_MAX + 1, + OPT_PEER_CA_CERT, + VLOG_OPTION_ENUMS + }; + static struct option long_options[] = { + {"hub", no_argument, 0, 'H'}, + {"noflow", no_argument, 0, 'n'}, + {"max-idle", required_argument, 0, OPT_MAX_IDLE}, + {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'V'}, + DAEMON_LONG_OPTIONS, + VLOG_LONG_OPTIONS, +#ifdef HAVE_OPENSSL + VCONN_SSL_LONG_OPTIONS + {"peer-ca-cert", required_argument, 0, OPT_PEER_CA_CERT}, +#endif + {0, 0, 0, 0}, + }; + char *short_options = long_options_to_short_options(long_options); + + for (;;) { + int indexptr; + int c; + + c = getopt_long(argc, argv, short_options, long_options, &indexptr); + if (c == -1) { + break; + } + + switch (c) { + case 'H': + learn_macs = false; + break; + + case 'n': + setup_flows = false; + break; + + case OPT_MAX_IDLE: + if (!strcmp(optarg, "permanent")) { + max_idle = OFP_FLOW_PERMANENT; + } else { + max_idle = atoi(optarg); + if (max_idle < 1 || max_idle > 65535) { + ovs_fatal(0, "--max-idle argument must be between 1 and " + "65535 or the word 'permanent'"); + } + } + break; + + case 'h': + usage(); + + case 'V': + OVS_PRINT_VERSION(OFP_VERSION, OFP_VERSION); + exit(EXIT_SUCCESS); + + VLOG_OPTION_HANDLERS + DAEMON_OPTION_HANDLERS + +#ifdef HAVE_OPENSSL + VCONN_SSL_OPTION_HANDLERS + + case OPT_PEER_CA_CERT: + vconn_ssl_set_peer_ca_cert_file(optarg); + break; +#endif + + case '?': + exit(EXIT_FAILURE); + + default: + abort(); + } + } + free(short_options); +} + +static void +usage(void) +{ + printf("%s: OpenFlow controller\n" + "usage: %s [OPTIONS] METHOD\n" + "where METHOD is any OpenFlow connection method.\n", + program_name, program_name); + vconn_usage(true, true, false); + daemon_usage(); + vlog_usage(); + printf("\nOther options:\n" + " -H, --hub act as hub instead of learning switch\n" + " -n, --noflow pass traffic, but don't add flows\n" + " --max-idle=SECS max idle time for new flows\n" + " -h, --help display this help message\n" + " -V, --version display version information\n"); + exit(EXIT_SUCCESS); +} diff --git a/utilities/ovs-discover.8.in b/utilities/ovs-discover.8.in new file mode 100644 index 00000000..d38ce9ee --- /dev/null +++ b/utilities/ovs-discover.8.in @@ -0,0 +1,118 @@ +.TH ovs\-discover 8 "May 2008" "Open vSwitch" "Open vSwitch Manual" +.ds PN ovs\-discover + +.SH NAME +ovs\-discover \- controller discovery utility + +.SH SYNOPSIS +.B ovs\-discover +[\fIoptions\fR] \fInetdev\fR [\fInetdev\fR...] + +.SH DESCRIPTION +The \fBovs\-discover\fR program attempts to discover the location of +an OpenFlow controller on one of the network devices listed on the +command line. It repeatedly broadcasts a DHCP request with vendor +class identifier \fBOpenFlow\fR on each network device until it +receives an acceptable DHCP response. It will accept any valid DHCP +reply that has the same vendor class identifier and includes a +vendor-specific option with code 1 whose contents are a string +specifying the location of the controller in the same format used on +the \fBsecchan\fR command line (e.g. \fBssl:192.168.0.1\fR). + +When \fBovs\-discover\fR receives an acceptable response, it prints +the details of the response on \fBstdout\fR. Then, by default, it +configures the network device on which the response was received with +the received IP address, netmask, and default gateway, and detaches +itself to the background. + +.SH OPTIONS +.TP +\fB--accept-vconn=\fIregex\fR +By default, \fBovs\-discover\fR accepts any controller location +advertised over DHCP. With this option, only controllers whose names +match POSIX extended regular expression \fIregex\fR will be accepted. +Specifying \fBssl:.*\fR for \fIregex\fR, for example, would cause only +SSL controller connections to be accepted. + +The \fIregex\fR is implicitly anchored at the beginning of the +controller location string, as if it begins with \fB^\fR. + +.TP +\fB--exit-without-bind\fR +By default, \fBovs\-discover\fR binds the network device that receives +the first acceptable response to the IP address received over DHCP. +With this option, the configuration of the network device is not +changed at all, except to bring it up if it is initially down, and +\fBovs\-discover\fR will exit immediately after it receives an +acceptable DHCP response. + +This option is mutually exclusive with \fB--exit-after-bind\fR and +\fB--no-detach\fR. + +.TP +\fB--exit-after-bind\fR +By default, after it receives an acceptable DHCP response, +\fBovs\-discover\fR detaches itself from the foreground session and +runs in the background maintaining the DHCP lease as necessary. With +this option, \fBovs\-discover\fR will exit immediately after it +receives an acceptable DHCP response and configures the network device +with the received IP address. The address obtained via DHCP could +therefore be used past the expiration of its lease. + +This option is mutually exclusive with \fB--exit-without-bind\fR and +\fB--no-detach\fR. + +.TP +\fB--no-detach\fR +By default, \fBovs\-discover\fR runs in the foreground until it obtains +an acceptable DHCP response, then it detaches itself from the +foreground session and run as a background process. This option +prevents \fBovs\-discover\fR from detaching, causing it to run in the +foreground even after it obtains a DHCP response. + +This option is mutually exclusive with \fB--exit-without-bind\fR and +\fB--exit-after-bind\fR. + +.TP +\fB-P\fR[\fIpidfile\fR], \fB--pidfile\fR[\fB=\fIpidfile\fR] +Causes a file (by default, \fBovs\-discover.pid\fR) to be created indicating +the PID of the running process. If \fIpidfile\fR is not specified, or +if it does not begin with \fB/\fR, then it is created in +\fB@RUNDIR@\fR. + +The \fIpidfile\fR is created when \fBovs\-discover\fR detaches, so +this this option has no effect when one of \fB--exit-without-bind\fR, +\fB--exit-after-bind\fR, or \fB--no-detach\fR is also given. + +.TP +\fB-f\fR, \fB--force\fR +By default, when \fB-P\fR or \fB--pidfile\fR is specified and the +specified pidfile already exists and is locked by a running process, +\fBcontroller\fR refuses to start. Specify \fB-f\fR or \fB--force\fR +to cause it to instead overwrite the pidfile. + +When \fB-P\fR or \fB--pidfile\fR is not specified, this option has no +effect. + +.so lib/vlog.man +.so lib/common.man + +.SH BUGS + +If the network devices specified on the command line have been added +to an Open vSwitch datapath with \fBovs\-dpctl add\-if\fR, then controller +discovery will fail because \fBovs\-discover\fR will not be able to +see DHCP responses, even though tools such as \fBtcpdump\fR(8) and +\fBwireshark\fR(1) can see them on the wire. This is because of the +structure of the Linux kernel networking stack, which hands packets +first to programs that listen for all arriving packets, then to +Open vSwitch, then to programs that listen for a specific kind of packet. +Open vSwitch consumes all the packets handed to it, so tools like +\fBtcpdump\fR that look at all packets will see packets arriving on +Open vSwitch interfaces, but \fRovs\-discover\fR, which listens only for +arriving IP packets, will not. + +.SH "SEE ALSO" + +.BR secchan (8), +.BR ovs-pki (8) diff --git a/utilities/ovs-discover.c b/utilities/ovs-discover.c new file mode 100644 index 00000000..b664321f --- /dev/null +++ b/utilities/ovs-discover.c @@ -0,0 +1,405 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "command-line.h" +#include "daemon.h" +#include "dhcp-client.h" +#include "dhcp.h" +#include "dirs.h" +#include "dynamic-string.h" +#include "fatal-signal.h" +#include "netdev.h" +#include "poll-loop.h" +#include "timeval.h" +#include "unixctl.h" +#include "util.h" + +#include "vlog.h" +#define THIS_MODULE VLM_ovs_discover + +struct iface { + const char *name; + struct dhclient *dhcp; +}; + +/* The interfaces that we serve. */ +static struct iface *ifaces; +static int n_ifaces; + +/* --accept-vconn: Regular expression specifying the class of controller vconns + * that we will accept during autodiscovery. */ +static const char *accept_controller_re = ".*"; +static regex_t accept_controller_regex; + +/* --exit-without-bind: Exit after discovering the controller, without binding + * the network device to an IP address? */ +static bool exit_without_bind; + +/* --exit-after-bind: Exit after discovering the controller, after binding the + * network device to an IP address? */ +static bool exit_after_bind; + +static bool iface_init(struct iface *, const char *netdev_name); +static void release_ifaces(void *aux UNUSED); + +static void parse_options(int argc, char *argv[]); +static void usage(void) NO_RETURN; + +static void modify_dhcp_request(struct dhcp_msg *, void *aux); +static bool validate_dhcp_offer(const struct dhcp_msg *, void *aux); + +int +main(int argc, char *argv[]) +{ + struct unixctl_server *unixctl; + int retval; + int i; + + set_program_name(argv[0]); + time_init(); + vlog_init(); + parse_options(argc, argv); + + argc -= optind; + argv += optind; + if (argc < 1) { + ovs_fatal(0, "need at least one non-option argument; " + "use --help for usage"); + } + + ifaces = xmalloc(argc * sizeof *ifaces); + n_ifaces = 0; + for (i = 0; i < argc; i++) { + if (iface_init(&ifaces[n_ifaces], argv[i])) { + n_ifaces++; + } + } + if (!n_ifaces) { + ovs_fatal(0, "failed to initialize any DHCP clients"); + } + + for (i = 0; i < n_ifaces; i++) { + struct iface *iface = &ifaces[i]; + dhclient_init(iface->dhcp, 0); + } + fatal_signal_add_hook(release_ifaces, NULL, true); + + retval = regcomp(&accept_controller_regex, accept_controller_re, + REG_NOSUB | REG_EXTENDED); + if (retval) { + size_t length = regerror(retval, &accept_controller_regex, NULL, 0); + char *buffer = xmalloc(length); + regerror(retval, &accept_controller_regex, buffer, length); + ovs_fatal(0, "%s: %s", accept_controller_re, buffer); + } + + retval = unixctl_server_create(NULL, &unixctl); + if (retval) { + ovs_fatal(retval, "Could not listen for unixctl connections"); + } + + die_if_already_running(); + + signal(SIGPIPE, SIG_IGN); + for (;;) { + fatal_signal_block(); + for (i = 0; i < n_ifaces; i++) { + struct iface *iface = &ifaces[i]; + dhclient_run(iface->dhcp); + if (dhclient_changed(iface->dhcp)) { + bool is_bound = dhclient_is_bound(iface->dhcp); + int j; + + /* Configure network device. */ + if (!exit_without_bind) { + dhclient_configure_netdev(iface->dhcp); + dhclient_update_resolv_conf(iface->dhcp); + } + + if (is_bound) { + static bool detached = false; + struct ds ds; + + /* Disable timeout, since discovery was successful. */ + time_alarm(0); + + /* Print discovered parameters. */ + ds_init(&ds); + dhcp_msg_to_string(dhclient_get_config(iface->dhcp), + true, &ds); + fputs(ds_cstr(&ds), stdout); + putchar('\n'); + fflush(stdout); + ds_destroy(&ds); + + /* Exit if the user requested it. */ + if (exit_without_bind) { + VLOG_DBG("exiting because of successful binding on %s " + "and --exit-without-bind specified", + iface->name); + exit(0); + } + if (exit_after_bind) { + VLOG_DBG("exiting because of successful binding on %s " + "and --exit-after-bind specified", + iface->name); + exit(0); + } + + /* Detach into background, if we haven't already. */ + if (!detached) { + detached = true; + daemonize(); + } + } + + /* We only want an address on a single one of our interfaces. + * So: if we have an address on this interface, stop looking + * for one on the others; if we don't have an address on this + * interface, start looking everywhere. */ + for (j = 0; j < n_ifaces; j++) { + struct iface *if2 = &ifaces[j]; + if (iface != if2) { + if (is_bound) { + dhclient_release(if2->dhcp); + } else { + dhclient_init(if2->dhcp, 0); + } + } + } + } + } + unixctl_server_run(unixctl); + for (i = 0; i < n_ifaces; i++) { + struct iface *iface = &ifaces[i]; + dhclient_wait(iface->dhcp); + } + unixctl_server_wait(unixctl); + fatal_signal_unblock(); + poll_block(); + } + + return 0; +} + +static bool +iface_init(struct iface *iface, const char *netdev_name) +{ + int retval; + + iface->name = netdev_name; + iface->dhcp = NULL; + + if (exit_after_bind) { + /* Bring this interface up permanently, so that the bound address + * persists past program termination. */ + struct netdev *netdev; + + retval = netdev_open(iface->name, NETDEV_ETH_TYPE_NONE, &netdev); + if (retval) { + ovs_error(retval, "Could not open %s device", iface->name); + return false; + } + retval = netdev_turn_flags_on(netdev, NETDEV_UP, true); + if (retval) { + ovs_error(retval, "Could not bring %s device up", iface->name); + return false; + } + netdev_close(netdev); + } + + retval = dhclient_create(iface->name, modify_dhcp_request, + validate_dhcp_offer, NULL, &iface->dhcp); + if (retval) { + ovs_error(retval, "%s: failed to initialize DHCP client", iface->name); + return false; + } + + return true; +} + +static void +release_ifaces(void *aux UNUSED) +{ + int i; + + for (i = 0; i < n_ifaces; i++) { + struct dhclient *dhcp = ifaces[i].dhcp; + dhclient_release(dhcp); + if (dhclient_changed(dhcp)) { + dhclient_configure_netdev(dhcp); + } + } +} + +static void +modify_dhcp_request(struct dhcp_msg *msg, void *aux UNUSED) +{ + dhcp_msg_put_string(msg, DHCP_CODE_VENDOR_CLASS, "OpenFlow"); +} + +static bool +validate_dhcp_offer(const struct dhcp_msg *msg, void *aux UNUSED) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 60); + char *vconn_name; + bool accept; + + vconn_name = dhcp_msg_get_string(msg, DHCP_CODE_OFP_CONTROLLER_VCONN); + if (!vconn_name) { + VLOG_WARN_RL(&rl, "rejecting DHCP offer missing controller vconn"); + return false; + } + accept = !regexec(&accept_controller_regex, vconn_name, 0, NULL, 0); + free(vconn_name); + return accept; +} + +static void +parse_options(int argc, char *argv[]) +{ + enum { + OPT_ACCEPT_VCONN = UCHAR_MAX + 1, + OPT_EXIT_WITHOUT_BIND, + OPT_EXIT_AFTER_BIND, + OPT_NO_DETACH, + }; + static struct option long_options[] = { + {"accept-vconn", required_argument, 0, OPT_ACCEPT_VCONN}, + {"exit-without-bind", no_argument, 0, OPT_EXIT_WITHOUT_BIND}, + {"exit-after-bind", no_argument, 0, OPT_EXIT_AFTER_BIND}, + {"no-detach", no_argument, 0, OPT_NO_DETACH}, + {"timeout", required_argument, 0, 't'}, + {"pidfile", optional_argument, 0, 'P'}, + {"force", no_argument, 0, 'f'}, + {"verbose", optional_argument, 0, 'v'}, + {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'V'}, + {0, 0, 0, 0}, + }; + char *short_options = long_options_to_short_options(long_options); + bool detach_after_bind = true; + + for (;;) { + unsigned long int timeout; + int c; + + c = getopt_long(argc, argv, short_options, long_options, NULL); + if (c == -1) { + break; + } + + switch (c) { + case OPT_ACCEPT_VCONN: + accept_controller_re = (optarg[0] == '^' + ? optarg + : xasprintf("^%s", optarg)); + break; + + case OPT_EXIT_WITHOUT_BIND: + exit_without_bind = true; + break; + + case OPT_EXIT_AFTER_BIND: + exit_after_bind = true; + break; + + case OPT_NO_DETACH: + detach_after_bind = false; + break; + + case 'P': + set_pidfile(optarg); + break; + + case 'f': + ignore_existing_pidfile(); + break; + + case 't': + timeout = strtoul(optarg, NULL, 10); + if (timeout <= 0) { + ovs_fatal(0, "value %s on -t or --timeout is not at least 1", + optarg); + } else { + time_alarm(timeout); + } + signal(SIGALRM, SIG_DFL); + break; + + case 'h': + usage(); + + case 'V': + OVS_PRINT_VERSION(0, 0); + exit(EXIT_SUCCESS); + + case 'v': + vlog_set_verbosity(optarg); + break; + + case '?': + exit(EXIT_FAILURE); + + default: + abort(); + } + } + free(short_options); + + if ((exit_without_bind + exit_after_bind + !detach_after_bind) > 1) { + ovs_fatal(0, "--exit-without-bind, --exit-after-bind, and --no-detach " + "are mutually exclusive"); + } + if (detach_after_bind) { + set_detach(); + } +} + +static void +usage(void) +{ + printf("%s: a tool for discovering OpenFlow controllers.\n" + "usage: %s [OPTIONS] NETDEV [NETDEV...]\n" + "where each NETDEV is a network device on which to perform\n" + "controller discovery.\n" + "\nOrdinarily, ovs-discover runs in the foreground until it\n" + "obtains an IP address and discovers an OpenFlow controller via\n" + "DHCP, then it prints information about the controller to stdout\n" + "and detaches to the background to maintain the IP address lease.\n" + "\nNetworking options:\n" + " --accept-vconn=REGEX accept matching discovered controllers\n" + " --exit-without-bind exit after discovery, without binding\n" + " --exit-after-bind exit after discovery, after binding\n" + " --no-detach do not detach after discovery\n", + program_name, program_name); + vlog_usage(); + printf("\nOther options:\n" + " -t, --timeout=SECS give up discovery after SECS seconds\n" + " -P, --pidfile[=FILE] create pidfile (default: %s/%s.pid)\n" + " -f, --force with -P, start even if already running\n" + " -h, --help display this help message\n" + " -V, --version display version information\n", + ovs_rundir, program_name); + exit(EXIT_SUCCESS); +} diff --git a/utilities/ovs-dpctl.8.in b/utilities/ovs-dpctl.8.in new file mode 100644 index 00000000..652ebb13 --- /dev/null +++ b/utilities/ovs-dpctl.8.in @@ -0,0 +1,166 @@ +.TH ovs\-dpctl 8 "March 2009" "Open vSwitch" "Open vSwitch Manual" +.ds PN ovs\-dpctl + +.SH NAME +ovs\-dpctl \- administer Open vSwitch datapaths + +.SH SYNOPSIS +.B ovs\-dpctl +[\fIoptions\fR] \fIcommand \fR[\fIswitch\fR] [\fIargs\fR\&...] + +.SH DESCRIPTION + +The \fBovs\-dpctl\fR program can create, modify, and delete Open vSwitch +datapaths. A single machine may host up to 256 datapaths (numbered 0 +to 255). + +A newly created datapath is associated with only one network device, a +virtual network device sometimes called the datapath's ``local port''. +A newly created datapath is not, however, associated with any of the +host's other network devices. To intercept and process traffic on a +given network device, use the \fBadd\-if\fR command to explicitly add +that network device to the datapath. + +Do not use \fBovs\-dpctl\fR commands to modify datapaths if +\fBovs\-vswitchd\fR(8) is in use. Instead, modify the +\fBovs\-vswitchd\fR configuration file and send \fBSIGHUP\fR to the +\fBovs\-vswitchd\fR process. + +.PP +Most \fBovs\-dpctl\fR commands that work with datapaths take an argument +that specifies the name of the datapath, in one of the following +forms: + +.so lib/dpif.man + +.PP +The following commands manage datapaths. + +.TP +\fBadd\-dp \fIdp\fR [\fInetdev\fR...] + +Creates datapath \fIdp\fR. The name of the new datapath's local port +depends on how \fIdp\fR is specified: if it takes the form +\fBdp\fIN\fR, the local port will be named \fBdp\fIN\fR; if \fIdp\fR +is \fBnl:\fI, the local port will be named \fBof\fIN\fR; otherwise, +the local port's name will be \fIdp\fR. + +This will fail if the host already has 256 datapaths, if a network +device with the same name as the new datapath's local port already +exists, or if \fIdp\fR is given in the form \fBdp\fIN\fR or +\fBnl:\fIN\fR and a datapath numbered \fIN\fR already exists. + +If \fInetdev\fRs are specified, \fBovs\-dpctl\fR adds them to the datapath. + +.TP +\fBdel\-dp \fIdp\fR +Deletes datapath \fIdp\fR. If \fIdp\fR is associated with any network +devices, they are automatically removed. + +.TP +\fBadd\-if \fIdp netdev\fR[\fIoption\fR...]... +Adds each \fInetdev\fR to the set of network devices datapath +\fIdp\fR monitors, where \fIdp\fR is the name of an existing +datapath, and \fInetdev\fR is the name of one of the host's +network devices, e.g. \fBeth0\fR. Once a network device has been added +to a datapath, the datapath has complete ownership of the network device's +traffic and the network device appears silent to the rest of the +system. + +A \fInetdev\fR may be followed by a comma-separated list of options. +The following options are currently supported: + +.RS +.IP "\fBport=\fIportno\fR" +Specifies \fIportno\fR (a number between 1 and 255) as the port number +at which \fInetdev\fR will be attached. By default, \fBadd\-if\fR +automatically selects the lowest available port number. + +.IP "\fBinternal\fR" +Instead of attaching an existing \fInetdev\fR, creates an internal +port (analogous to the local port) with that name. +.RE + +.TP +\fBdel\-if \fIdp netdev\fR... +Removes each \fInetdev\fR from the list of network devices datapath +\fIdp\fR monitors. + +.TP +\fBshow \fR[\fIdp\fR...] +Prints a summary of configured datapaths, including their datapath +numbers and a list of ports connected to each datapath. (The local +port is identified as port 0.) + +If one or more datapaths are specified, information on only those +datapaths are displayed. Otherwise, \fBovs\-dpctl\fR displays information +about all configured datapaths. + +.IP "\fBdump-flows \fIdp\fR" +Prints to the console all flow entries in datapath \fIdp\fR's +flow table. + +This command is primarily useful for debugging Open vSwitch. The flow +table entries that it displays are not +OpenFlow flow entries. Instead, they are different and considerably +simpler flows maintained by the Open vSwitch kernel module. + +.IP "\fBdel-flows \fIdp\fR" +Deletes all flow entries from datapath \fIdp\fR's flow table. + +This command is primarily useful for debugging Open vSwitch. As +discussed in \fBdump-flows\fR, these entries are +not OpenFlow flow entries. By deleting them, the process that set them +up may be confused about their disappearance. + +.IP "\fBdump-groups \fIdp\fR" +Prints to the console the sets of port groups maintained by datapath +\fIdp\fR. Ordinarily there are at least 2 port groups in a datapath +that \fBsecchan\fR or \fBvswitch\fR is controlling: group 0 contains +all ports except those disabled by STP, and group 1 contains all +ports. Additional groups might be used in the future. + +This command is primarily useful for debugging Open vSwitch. OpenFlow +does not have a concept of port groups. + +.SH OPTIONS +.TP +\fB-t\fR, \fB--timeout=\fIsecs\fR +Limits \fBovs\-dpctl\fR runtime to approximately \fIsecs\fR seconds. If +the timeout expires, \fBovs\-dpctl\fR will exit with a \fBSIGALRM\fR +signal. + +.so lib/vlog.man +.so lib/common.man + +.SH EXAMPLES + +A typical \fBovs\-dpctl\fR command sequence for controlling an +Open vSwitch kernel module: + +.TP +\fBovs\-dpctl add\-dp dp0\fR +Creates datapath number 0. + +.TP +\fBovs\-dpctl add\-if dp0 eth0 eth1\fR +Adds two network devices to the new datapath. + +.PP +At this point one would ordinarily start \fBsecchan\fR(8) on +\fBdp0\fR, transforming \fBdp0\fR into an OpenFlow switch. Then, when +the switch and the datapath is no longer needed: + +.TP +\fBovs\-dpctl del\-if dp0 eth0 eth1\fR +Removes network devices from the datapath. + +.TP +\fBovs\-dpctl del\-dp dp0\fR +Deletes the datapath. + +.SH "SEE ALSO" + +.BR secchan (8), +.BR ovs\-appctl (8), +.BR ovs\-vswitchd (8) diff --git a/utilities/ovs-dpctl.c b/utilities/ovs-dpctl.c new file mode 100644 index 00000000..ccddd2f2 --- /dev/null +++ b/utilities/ovs-dpctl.c @@ -0,0 +1,552 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "command-line.h" +#include "compiler.h" +#include "dirs.h" +#include "dpif.h" +#include "dynamic-string.h" +#include "netdev.h" +#include "odp-util.h" +#include "timeval.h" +#include "util.h" + +#include "vlog.h" +#define THIS_MODULE VLM_dpctl + +struct command { + const char *name; + int min_args; + int max_args; + void (*handler)(int argc, char *argv[]); +}; + +static struct command all_commands[]; + +static void usage(void) NO_RETURN; +static void parse_options(int argc, char *argv[]); + +int main(int argc, char *argv[]) +{ + struct command *p; + + set_program_name(argv[0]); + time_init(); + vlog_init(); + parse_options(argc, argv); + signal(SIGPIPE, SIG_IGN); + + argc -= optind; + argv += optind; + if (argc < 1) + ovs_fatal(0, "missing command name; use --help for help"); + + for (p = all_commands; p->name != NULL; p++) { + if (!strcmp(p->name, argv[0])) { + int n_arg = argc - 1; + if (n_arg < p->min_args) + ovs_fatal(0, "'%s' command requires at least %d arguments", + p->name, p->min_args); + else if (n_arg > p->max_args) + ovs_fatal(0, "'%s' command takes at most %d arguments", + p->name, p->max_args); + else { + p->handler(argc, argv); + if (ferror(stdout)) { + ovs_fatal(0, "write to stdout failed"); + } + if (ferror(stderr)) { + ovs_fatal(0, "write to stderr failed"); + } + exit(0); + } + } + } + ovs_fatal(0, "unknown command '%s'; use --help for help", argv[0]); + + return 0; +} + +static void +parse_options(int argc, char *argv[]) +{ + static struct option long_options[] = { + {"timeout", required_argument, 0, 't'}, + {"verbose", optional_argument, 0, 'v'}, + {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'V'}, + {0, 0, 0, 0}, + }; + char *short_options = long_options_to_short_options(long_options); + + for (;;) { + unsigned long int timeout; + int c; + + c = getopt_long(argc, argv, short_options, long_options, NULL); + if (c == -1) { + break; + } + + switch (c) { + case 't': + timeout = strtoul(optarg, NULL, 10); + if (timeout <= 0) { + ovs_fatal(0, "value %s on -t or --timeout is not at least 1", + optarg); + } else { + time_alarm(timeout); + } + break; + + case 'h': + usage(); + + case 'V': + OVS_PRINT_VERSION(0, 0); + exit(EXIT_SUCCESS); + + case 'v': + vlog_set_verbosity(optarg); + break; + + case '?': + exit(EXIT_FAILURE); + + default: + abort(); + } + } + free(short_options); +} + +static void +usage(void) +{ + printf("%s: Open vSwitch datapath management utility\n" + "usage: %s [OPTIONS] COMMAND [ARG...]\n" + " add-dp DP [IFACE...] add new datapath DP (with IFACEs)\n" + " del-dp DP delete local datapath DP\n" + " add-if DP IFACE... add each IFACE as a port on DP\n" + " del-if DP IFACE... delete each IFACE from DP\n" + " show show basic info on all datapaths\n" + " show DP... show basic info on each DP\n" + " dump-flows DP display flows in DP\n" + " del-flows DP delete all flows from DP\n" + " dump-groups DP display port groups in DP\n", + program_name, program_name); + vlog_usage(); + printf("\nOther options:\n" + " -t, --timeout=SECS give up after SECS seconds\n" + " -h, --help display this help message\n" + " -V, --version display version information\n"); + exit(EXIT_SUCCESS); +} + +static void run(int retval, const char *message, ...) + PRINTF_FORMAT(2, 3); + +static void run(int retval, const char *message, ...) +{ + if (retval) { + va_list args; + + fprintf(stderr, "%s: ", program_name); + va_start(args, message); + vfprintf(stderr, message, args); + va_end(args); + if (retval == EOF) { + fputs(": unexpected end of file\n", stderr); + } else { + fprintf(stderr, ": %s\n", strerror(retval)); + } + + exit(EXIT_FAILURE); + } +} + +static void do_add_if(int argc, char *argv[]); + +static int if_up(const char *netdev_name) +{ + struct netdev *netdev; + int retval; + + retval = netdev_open(netdev_name, NETDEV_ETH_TYPE_NONE, &netdev); + if (!retval) { + retval = netdev_turn_flags_on(netdev, NETDEV_UP, true); + netdev_close(netdev); + } + return retval; +} + +static void +do_add_dp(int argc UNUSED, char *argv[]) +{ + struct dpif dpif; + run(dpif_create(argv[1], &dpif), "add_dp"); + dpif_close(&dpif); + if (argc > 2) { + do_add_if(argc, argv); + } +} + +static void +do_del_dp(int argc UNUSED, char *argv[]) +{ + struct dpif dpif; + run(dpif_open(argv[1], &dpif), "opening datapath"); + run(dpif_delete(&dpif), "del_dp"); + dpif_close(&dpif); +} + +static int +compare_ports(const void *a_, const void *b_) +{ + const struct odp_port *a = a_; + const struct odp_port *b = b_; + return a->port < b->port ? -1 : a->port > b->port; +} + +static void +query_ports(struct dpif *dpif, struct odp_port **ports, size_t *n_ports) +{ + run(dpif_port_list(dpif, ports, n_ports), "listing ports"); + qsort(*ports, *n_ports, sizeof **ports, compare_ports); +} + +static uint16_t +get_free_port(struct dpif *dpif) +{ + struct odp_port *ports; + size_t n_ports; + int port_no; + + query_ports(dpif, &ports, &n_ports); + for (port_no = 0; port_no <= UINT16_MAX; port_no++) { + size_t i; + for (i = 0; i < n_ports; i++) { + if (ports[i].port == port_no) { + goto next_portno; + } + } + free(ports); + return port_no; + + next_portno: ; + } + ovs_fatal(0, "no free datapath ports"); +} + +static void +do_add_if(int argc UNUSED, char *argv[]) +{ + bool failure = false; + struct dpif dpif; + int i; + + run(dpif_open(argv[1], &dpif), "opening datapath"); + for (i = 2; i < argc; i++) { + char *save_ptr = NULL; + char *devname, *suboptions; + int port = -1; + int flags = 0; + int error; + + devname = strtok_r(argv[i], ",,", &save_ptr); + if (!devname) { + ovs_error(0, "%s is not a valid network device name", argv[i]); + continue; + } + + suboptions = strtok_r(NULL, "", &save_ptr); + if (suboptions) { + enum { + AP_PORT, + AP_INTERNAL + }; + static char *options[] = { + "port", + "internal" + }; + + while (*suboptions != '\0') { + char *value; + + switch (getsubopt(&suboptions, options, &value)) { + case AP_PORT: + if (!value) { + ovs_error(0, "'port' suboption requires a value"); + } + port = atoi(value); + break; + + case AP_INTERNAL: + flags |= ODP_PORT_INTERNAL; + break; + + default: + ovs_error(0, "unknown suboption '%s'", value); + break; + } + } + } + if (port < 0) { + port = get_free_port(&dpif); + } + + error = dpif_port_add(&dpif, devname, port, flags); + if (error) { + ovs_error(error, "adding %s as port %"PRIu16" of %s failed", + devname, port, argv[1]); + failure = true; + } else if (if_up(devname)) { + failure = true; + } + } + dpif_close(&dpif); + if (failure) { + exit(EXIT_FAILURE); + } +} + +static bool +get_port_number(struct dpif *dpif, const char *name, uint16_t *port) +{ + struct odp_port *ports; + size_t n_ports; + size_t i; + + query_ports(dpif, &ports, &n_ports); + for (i = 0; i < n_ports; i++) { + if (!strcmp(name, ports[i].devname)) { + *port = ports[i].port; + free(ports); + return true; + } + } + free(ports); + ovs_error(0, "no port named %s", name); + return false; +} + +static void +do_del_if(int argc UNUSED, char *argv[]) +{ + bool failure = false; + struct dpif dpif; + int i; + + run(dpif_open(argv[1], &dpif), "opening datapath"); + for (i = 2; i < argc; i++) { + const char *name = argv[i]; + uint16_t port; + int error; + + if (!name[strspn(name, "0123456789")]) { + port = atoi(name); + } else if (!get_port_number(&dpif, name, &port)) { + failure = true; + continue; + } + + error = dpif_port_del(&dpif, port); + if (error) { + ovs_error(error, "deleting port %s from %s failed", name, argv[1]); + failure = true; + } + } + dpif_close(&dpif); + if (failure) { + exit(EXIT_FAILURE); + } +} + +static void +show_dpif(struct dpif *dpif) +{ + struct odp_port *ports; + struct odp_stats stats; + size_t n_ports; + size_t i; + + printf("dp%u:\n", dpif_id(dpif)); + if (!dpif_get_dp_stats(dpif, &stats)) { + printf("\tflows: cur:%"PRIu32", soft-max:%"PRIu32", " + "hard-max:%"PRIu32"\n", + stats.n_flows, stats.cur_capacity, stats.max_capacity); + printf("\tports: cur:%"PRIu32", max:%"PRIu32"\n", + stats.n_ports, stats.max_ports); + printf("\tgroups: max:%"PRIu16"\n", stats.max_groups); + printf("\tlookups: frags:%"PRIu64", hit:%"PRIu64", missed:%"PRIu64", " + "lost:%"PRIu64"\n", + stats.n_frags, stats.n_hit, stats.n_missed, stats.n_lost); + printf("\tqueues: max-miss:%"PRIu16", max-action:%"PRIu16"\n", + stats.max_miss_queue, stats.max_action_queue); + } + query_ports(dpif, &ports, &n_ports); + for (i = 0; i < n_ports; i++) { + printf("\tport %u: %s", ports[i].port, ports[i].devname); + if (ports[i].flags & ODP_PORT_INTERNAL) { + printf(" (internal)"); + } + printf("\n"); + } + free(ports); + dpif_close(dpif); +} + +static void +do_show(int argc UNUSED, char *argv[]) +{ + bool failure = false; + if (argc > 1) { + int i; + for (i = 1; i < argc; i++) { + const char *name = argv[i]; + struct dpif dpif; + int error; + + error = dpif_open(name, &dpif); + if (!error) { + show_dpif(&dpif); + } else { + ovs_error(error, "opening datapath %s failed", name); + failure = true; + } + } + } else { + unsigned int i; + for (i = 0; i < ODP_MAX; i++) { + char name[128]; + struct dpif dpif; + int error; + + sprintf(name, "dp%u", i); + error = dpif_open(name, &dpif); + if (!error) { + show_dpif(&dpif); + } else if (error != ENODEV) { + ovs_error(error, "opening datapath %s failed", name); + failure = true; + } + } + } + if (failure) { + exit(EXIT_FAILURE); + } +} + +static void +do_dump_flows(int argc UNUSED, char *argv[]) +{ + struct odp_flow *flows; + struct dpif dpif; + size_t n_flows; + struct ds ds; + size_t i; + + run(dpif_open(argv[1], &dpif), "opening datapath"); + run(dpif_flow_list_all(&dpif, &flows, &n_flows), "listing all flows"); + + ds_init(&ds); + for (i = 0; i < n_flows; i++) { + struct odp_flow *f = &flows[i]; + enum { MAX_ACTIONS = 4096 / sizeof(union odp_action) }; + union odp_action actions[MAX_ACTIONS]; + + f->actions = actions; + f->n_actions = MAX_ACTIONS; + dpif_flow_get(&dpif, f); + + ds_clear(&ds); + format_odp_flow(&ds, f); + printf("%s\n", ds_cstr(&ds)); + } + ds_destroy(&ds); + dpif_close(&dpif); +} + +static void +do_del_flows(int argc UNUSED, char *argv[]) +{ + struct dpif dpif; + + run(dpif_open(argv[1], &dpif), "opening datapath"); + run(dpif_flow_flush(&dpif), "deleting all flows"); + dpif_close(&dpif); +} + +static void +do_dump_groups(int argc UNUSED, char *argv[]) +{ + struct odp_stats stats; + struct dpif dpif; + unsigned int i; + + run(dpif_open(argv[1], &dpif), "opening datapath"); + run(dpif_get_dp_stats(&dpif, &stats), "get datapath stats"); + for (i = 0; i < stats.max_groups; i++) { + uint16_t ports[UINT16_MAX]; + size_t n_ports; + + if (!dpif_port_group_get(&dpif, i, ports, + ARRAY_SIZE(ports), &n_ports) && n_ports) { + size_t j; + + printf("group %u:", i); + for (j = 0; j < n_ports; j++) { + printf(" %"PRIu16, ports[j]); + } + printf("\n"); + } + } + dpif_close(&dpif); +} + +static void +do_help(int argc UNUSED, char *argv[] UNUSED) +{ + usage(); +} + +static struct command all_commands[] = { + { "add-dp", 1, INT_MAX, do_add_dp }, + { "del-dp", 1, 1, do_del_dp }, + { "add-if", 2, INT_MAX, do_add_if }, + { "del-if", 2, INT_MAX, do_del_if }, + { "show", 0, INT_MAX, do_show }, + { "dump-flows", 1, 1, do_dump_flows }, + { "del-flows", 1, 1, do_del_flows }, + { "dump-groups", 1, 1, do_dump_groups }, + { "help", 0, INT_MAX, do_help }, + { NULL, 0, 0, NULL }, +}; diff --git a/utilities/ovs-kill.8.in b/utilities/ovs-kill.8.in new file mode 100644 index 00000000..af4ec987 --- /dev/null +++ b/utilities/ovs-kill.8.in @@ -0,0 +1,60 @@ +.TH ovs\-kill 8 "May 2008" "Open vSwitch" "Open vSwitch Manual" +.ds PN ovs\-kill + +.SH NAME +ovs\-kill \- kills processes given their pidfiles + +.SH SYNOPSIS +.B ovs\-kill +[\fIoptions\fR] \fIpidfile\fR [\fIpidfile\fR...] + +.SH DESCRIPTION +The \fBovs\-kill\fR program reads each \fIpidfile\fR specified on the +command line and sends a signal to the program associated with it, if +any. It reads one line of text from \fIpidfile\fR, which must contain +the PID of the process to kill as a text string. It then uses +\fBfcntl\fR(2) to verify that a process with the PID from the file +owns a lock on \fIpidfile\fR before it sends the signal. + +A \fIpidfile\fR whose name begins with \fB/\fR is used literally. +Otherwise, \fB@RUNDIR@/\fR is prefixed. + +This program exists for use by \fBovs\-switch\-setup\fR, which cannot +easily implement its functionality since Perl has no portable +interface to \fBfcntl\fR-based file locking. + +.SH OPTIONS +.TP +\fB-s \fInumber\fR|\fIname\fR, \fB\-\^\-signal=\fInumber\fR|\fIname\fR +Sets the signal to be sent to each process. Signals may be given by +number (e.g. \fB1\fR) or by name (e.g. \fBHUP\fR or \fBSIGHUP\fR). +By default, \fBSIGTERM\fR is sent. + +.TP +\fB-f\fR, \fB\-\^\-force\fR +Causes \fBovs\-kill\fR to ignore all errors without printing a message +to \fBstderr\fR, and to exit with return code 0. + +.so lib/common.man + +.SH "EXIT CODE" + +Without \fB-f\fR or \fB\-\^\-force\fR, \fBovs\-kill\fR exits with +status 0 if at least one \fIpidfile\fR was given and the process +represented by every \fIpidfile\fR was signaled successfully, +otherwise with status 1. + +With \fB-f\fR or \fB\-\^\-force\fR, \fBovs\-kill\fR always exits with +status 0. + +.SH BUGS + +There is a race between verifying the lock on \fIpidfile\fR and +actually killing the process. + +\fBovs\-kill\fR does not wait for the signaled processes to die before +exiting. + +.SH "SEE ALSO" + +.BR ovs\-switch\-setup (8) diff --git a/utilities/ovs-kill.c b/utilities/ovs-kill.c new file mode 100644 index 00000000..f30bf0b9 --- /dev/null +++ b/utilities/ovs-kill.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "command-line.h" +#include "daemon.h" +#include "timeval.h" +#include "util.h" +#include "vlog.h" + +/* -s, --signal: signal to send. */ +static int sig_nr = SIGTERM; + +/* -f, --force: ignore errors. */ +static bool force; + +static void cond_error(int err_no, const char *, ...) PRINTF_FORMAT(2, 3); + +static void parse_options(int argc, char *argv[]); +static void usage(void); + +int +main(int argc, char *argv[]) +{ + bool ok = true; + int i; + + set_program_name(argv[0]); + time_init(); + vlog_init(); + parse_options(argc, argv); + + argc -= optind; + argv += optind; + if (argc < 1) { + if (!force) { + ovs_fatal(0, "need at least one non-option argument; " + "use --help for usage"); + } + } + + for (i = 0; i < argc; i++) { + char *pidfile; + pid_t pid; + + pidfile = make_pidfile_name(argv[i]); + pid = read_pidfile(pidfile); + if (pid >= 0) { + if (kill(pid, sig_nr) < 0) { + cond_error(errno, "%s: kill(%ld)", pidfile, (long int) pid); + } + } else { + cond_error(-pid, "could not read %s", pidfile); + } + free(pidfile); + } + + return ok || force ? EXIT_SUCCESS : EXIT_FAILURE; +} + +static void +parse_options(int argc, char *argv[]) +{ + static struct option long_options[] = { + {"signal", required_argument, 0, 's'}, + {"force", no_argument, 0, 'f'}, + {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'V'}, + {0, 0, 0, 0}, + }; + char *short_options = long_options_to_short_options(long_options); + + for (;;) { + int c; + + c = getopt_long(argc, argv, short_options, long_options, NULL); + if (c == -1) { + break; + } + + switch (c) { + case 's': + if (atoi(optarg) || !strcmp(optarg, "0")) { + sig_nr = atoi(optarg); + } else { + struct signal_name { + const char *name; + int number; + }; + + static const struct signal_name signals[] = { +#define SIGNAL(NAME) { #NAME, NAME } + SIGNAL(SIGABRT), + SIGNAL(SIGALRM), + SIGNAL(SIGBUS), + SIGNAL(SIGCHLD), + SIGNAL(SIGCONT), + SIGNAL(SIGFPE), + SIGNAL(SIGHUP), + SIGNAL(SIGILL), + SIGNAL(SIGINT), + SIGNAL(SIGKILL), + SIGNAL(SIGPIPE), + SIGNAL(SIGQUIT), + SIGNAL(SIGSEGV), + SIGNAL(SIGSTOP), + SIGNAL(SIGTERM), + SIGNAL(SIGTSTP), + SIGNAL(SIGTTIN), + SIGNAL(SIGTTOU), + SIGNAL(SIGUSR1), + SIGNAL(SIGUSR2), +#ifdef SIGPOLL + SIGNAL(SIGPOLL), +#endif + SIGNAL(SIGPROF), + SIGNAL(SIGSYS), + SIGNAL(SIGTRAP), + SIGNAL(SIGURG), + SIGNAL(SIGVTALRM), + SIGNAL(SIGXCPU), + SIGNAL(SIGXFSZ), +#undef SIGNAL + }; + int i; + + for (i = 0; i < ARRAY_SIZE(signals); i++) { + const struct signal_name *s = &signals[i]; + if (!strcmp(optarg, s->name) + || !strcmp(optarg, s->name + 3)) { + sig_nr = s->number; + goto got_name; + } + } + ovs_fatal(0, "unknown signal \"%s\"", optarg); + got_name: ; + } + break; + + case 'f': + force = true; + break; + + case 'h': + usage(); + + case 'V': + OVS_PRINT_VERSION(0, 0); + exit(EXIT_SUCCESS); + + case '?': + exit(EXIT_FAILURE); + + default: + abort(); + } + } + free(short_options); +} + +static void +usage(void) +{ + printf("%s: kills a program using a pidfile\n" + "usage: %s [OPTIONS] PIDFILE [PIDFILE...]\n" + "where PIDFILE is a pidfile created by an Open vSwitch daemon.\n" + "\nOptions:\n" + " -s, --signal=NUMBER|NAME signal to send (default: TERM)\n" + " -f, --force ignore errors\n" + " -h, --help display this help message\n" + " -V, --version display version information\n", + program_name, program_name); + exit(EXIT_SUCCESS); +} + +static void +cond_error(int err_no, const char *format, ...) +{ + if (!force) { + va_list args; + + fprintf(stderr, "%s: ", program_name); + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + if (err_no != 0) + fprintf(stderr, " (%s)", strerror(err_no)); + putc('\n', stderr); + } +} diff --git a/utilities/ovs-monitor b/utilities/ovs-monitor new file mode 100755 index 00000000..4e098612 --- /dev/null +++ b/utilities/ovs-monitor @@ -0,0 +1,128 @@ +#!/bin/sh + +# Copyright (C) 2008, 2009 Nicira Networks, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin + +SECCHAN_PID=/var/run/secchan.pid +SECCHAN_SOCK=/var/run/secchan.mgmt +LOG_FILE=/var/log/openflow/monitor +INTERVAL=1 +FAIL_THRESH=3 + +usage() { + echo usage: $0 options + echo + echo "OPTIONS:" + echo " -h Show this message" + echo " -p PID file for secchan (default: $SECCHAN_PID)" + echo " -s Unix socket for secchan (default: $SECCHAN_SOCK)" + echo " -l File to log messages (default: $LOG_FILE)" + echo " -i Interval to send probes in seconds (default: $INTERVAL)" + echo " -c Number of failed probes before reboot (default: $FAIL_THRESH)" +} + +log() { + echo `date +"%b %d %X"`:$1 + echo `date +"%b %d %X"`:$1 >> $LOG_FILE +} + + +while getopts "hp:s:l:i:c:" OPTION; do + case $OPTION in + h) + usage + exit 1 + ;; + + p) + SECCHAN_PID=$OPTARG + ;; + + s) + SECCHAN_SOCK=$OPTARG + ;; + + l) + LOG_FILE=$OPTARG + ;; + + i) + INTERVAL=$OPTARG + ;; + + c) + FAIL_THRESH=$OPTARG + ;; + + *) + echo "Unknown option: ${OPTION}" + esac +done + + +if [ ! -f $SECCHAN_PID ]; then + log "No secchan pid file: ${SECCHAN_PID}" + echo "No secchan pid file: ${SECCHAN_PID}" +fi + +if [ ! -S $SECCHAN_SOCK ]; then + log "No secchan sock file: ${SECCHAN_SOCK}" + echo "No secchan sock file: ${SECCHAN_SOCK}" +fi + +if [ ! -d `dirname $LOG_FILE` ]; then + mkdir -p `dirname $LOG_FILE` +fi + +let DP_DOWN=0 +let SECCHAN_DOWN=0 +log "===== Starting Monitor ====" +while `/bin/true`; do + # Only check for liveness if the secchan's PID file exists. The PID + # file is removed when secchan is brought down gracefully. + if [ -f $SECCHAN_PID ]; then + pid=`cat $SECCHAN_PID` + if [ -d /proc/$pid ]; then + # Check if the secchan and datapath still can communicate + if [ -S $SECCHAN_SOCK ]; then + ovs-ofctl probe -t 2 unix:$SECCHAN_SOCK + if [ $? -ne 0 ]; then + log "datapath probe failed" + let DP_DOWN++ + else + let DP_DOWN=0 + fi + fi + let SECCHAN_DOWN=0 + else + log "secchan probe failed" + let SECCHAN_DOWN++ + fi + fi + + if [ $SECCHAN_DOWN -ge $FAIL_THRESH ]; then + log "Failed to probe secchan after ${SECCHAN_DOWN} tries...rebooting!" + reboot + fi + + if [ $DP_DOWN -ge $FAIL_THRESH ]; then + log "Failed to probe datapath after ${DP_DOWN} tries...rebooting!" + reboot + fi + + sleep $INTERVAL +done diff --git a/utilities/ovs-ofctl.8.in b/utilities/ovs-ofctl.8.in new file mode 100644 index 00000000..3a9c305f --- /dev/null +++ b/utilities/ovs-ofctl.8.in @@ -0,0 +1,489 @@ +.TH ovs\-ofctl 8 "March 2009" "Open vSwitch" "Open vSwitch Manual" +.ds PN ovs\-ofctl + +.SH NAME +ovs\-ofctl \- administer OpenFlow switches + +.SH SYNOPSIS +.B ovs\-ofctl +[\fIoptions\fR] \fIcommand \fR[\fIswitch\fR] [\fIargs\fR\&...] + +.SH DESCRIPTION +The +.B ovs\-ofctl +program is a command line tool for monitoring and administering +OpenFlow switches. It can also show the current state of an OpenFlow +switch, including features, configuration, and table entries. + +.SS "OpenFlow Switch Management Commands" + +These commands allow \fBovs\-ofctl\fR to monitor and administer an OpenFlow +switch. It is able to show the current state of a switch, including +features, configuration, and table entries. + +Most of these commands take an argument that specifies the method for +connecting to an OpenFlow switch. The following connection methods +are supported: + +.RS +.TP +\fBssl:\fIhost\fR[\fB:\fIport\fR] +The specified SSL \fIport\fR (default: 6633) on the given remote +\fIhost\fR. The \fB--private-key\fR, \fB--certificate\fR, and +\fB--ca-cert\fR options are mandatory when this form is used. + +.TP +\fBtcp:\fIhost\fR[\fB:\fIport\fR] +The specified TCP \fIport\fR (default: 6633) on the given remote +\fIhost\fR. + +.TP +\fBunix:\fIfile\fR +The Unix domain server socket named \fIfile\fR. + +.IP "\fIfile\fR" +This is short for \fBunix:\fIfile\fR, as long as \fIfile\fR does not +contain a colon. + +.IP \fIdp\fR +This is short for \fBunix:@RUNDIR@/\fIdp\fB.mgmt\fR, as long as +\fIdp\fR does not contain a colon. +.RE + +.TP +\fBshow \fIswitch\fR +Prints to the console information on \fIswitch\fR, including +information on its flow tables and ports. + +.TP +\fBstatus \fIswitch\fR [\fIkey\fR] +Prints to the console a series of key-value pairs that report the +status of \fIswitch\fR. If \fIkey\fR is specified, only the key-value +pairs whose key names begin with \fIkey\fR are printed. If \fIkey\fR is +omitted, all key-value pairs are printed. + +.TP +\fBdump-tables \fIswitch\fR +Prints to the console statistics for each of the flow tables used by +\fIswitch\fR. + +.TP +\fBdump-ports \fIswitch\fR +Prints to the console statistics for each of the network devices +associated with \fIswitch\fR. + +.TP +\fBmod-port \fIswitch\fR \fInetdev\fR \fIaction\fR +Modify characteristics of an interface monitored by \fIswitch\fR. +\fInetdev\fR can be referred to by its OpenFlow assigned port number or +the device name, e.g. \fBeth0\fR. The \fIaction\fR may be any one of the +following: + +.RS +.IP \fBup\fR +Enables the interface. This is equivalent to ``ifconfig up'' on a Unix +system. + +.IP \fBdown\fR +Disables the interface. This is equivalent to ``ifconfig down'' on a Unix +system. + +.IP \fBflood\fR +When a \fIflood\fR action is specified, traffic will be sent out this +interface. This is the default posture for monitored ports. + +.IP \fBnoflood\fR +When a \fIflood\fR action is specified, traffic will not be sent out +this interface. This is primarily useful to prevent loops when a +spanning tree protocol is not in use. + +.RE + +.TP +\fBdump-flows \fIswitch \fR[\fIflows\fR] +Prints to the console all flow entries in \fIswitch\fR's +tables that match \fIflows\fR. If \fIflows\fR is omitted, all flows +in the switch are retrieved. See \fBFlow Syntax\fR, below, for the +syntax of \fIflows\fR. The output format is described in +\fBTable Entry Output\fR. + +.TP +\fBdump-aggregate \fIswitch \fR[\fIflows\fR] +Prints to the console aggregate statistics for flows in +\fIswitch\fR's tables that match \fIflows\fR. If \fIflows\fR is omitted, +the statistics are aggregated across all flows in the switch's flow +tables. See \fBFlow Syntax\fR, below, for the syntax of \fIflows\fR. +The output format is descrbed in \fBTable Entry Output\fR. + +.TP +\fBadd-flow \fIswitch flow\fR +Add the flow entry as described by \fIflow\fR to the \fIswitch\fR's +tables. The flow entry is in the format described in \fBFlow Syntax\fR, +below. + +.TP +\fBadd-flows \fIswitch file\fR +Add flow entries as described in \fIfile\fR to \fIswitch\fR's +tables. Each line in \fIfile\fR is a flow entry in the format +described in \fBFlow Syntax\fR, below. + +.TP +\fBmod-flows \fIswitch flow\fR +Modify the actions in entries from the \fIswitch\fR's tables +that match \fIflow\fR. When invoked with the \fB--strict\fR option, +wildcards are not treated as active for matching purposes. See +\fBFlow Syntax\fR, below, for the syntax of \fIflows\fR. + +.TP +\fBdel-flows \fIswitch \fR[\fIflow\fR] +Deletes entries from the \fIswitch\fR's tables that match +\fIflow\fR. When invoked with the \fB--strict\fR option, wildcards are +not treated as active for matching purposes. If \fIflow\fR is +omitted and the \fB--strict\fR option is not used, all flows in the +switch's tables are removed. See \fBFlow Syntax\fR, below, for the +syntax of \fIflows\fR. + +.TP +\fBmonitor \fIswitch\fR [\fImiss-len\fR [\fIsend-exp]] +Connects to \fIswitch\fR and prints to the console all OpenFlow +messages received. Usually, \fIswitch\fR should specify a connection +named on \fBsecchan\fR(8)'s \fB-l\fR or \fB--listen\fR command line +option. + +If \fImiss-len\fR is provided, \fBovs\-ofctl\fR sends an OpenFlow ``set +configuration'' message at connection setup time that requests +\fImiss-len\fR bytes of each packet that misses the flow table. The +OpenFlow reference implementation not send these messages to the +\fBovs\-ofctl monitor\fR client connection unless a nonzero value is +specified on this argument. + +If \fIsend-exp\fR is specified as \fB1\fR, \fBovs\-ofctl\fR will also +request to be sent flow expiration messages. If this argument is +omitted, or \fB0\fR is specified, then \fRovs\-ofctl\fR will not request +flow expirations. + +This command may be useful for debugging switch or controller +implementations. + +.TP +\fBexecute \fIswitch command \fR[\fIarg\fR...] +Sends a request to \fIswitch\fR to execute \fIcommand\fR along with +each \fIarg\fR, if any, then waits for the command to complete and +reports its completion status on \fBstderr\fR and its output, if any, +on \fBstdout\fR. The set of available commands and their argument is +switch-dependent. (This command uses a Nicira extension to OpenFlow +that may not be available on all switches.) + +.SS "OpenFlow Switch and Controller Commands" + +The following commands, like those in the previous section, may be +applied to OpenFlow switches, using any of the connection methods +described in that section. Unlike those commands, these may also be +applied to OpenFlow controllers. + +.TP +\fBprobe \fItarget\fR +Sends a single OpenFlow echo-request message to \fItarget\fR and waits +for the response. With the \fB-t\fR or \fB--timeout\fR option, this +command can test whether an OpenFlow switch or controller is up and +running. + +.TP +\fBping \fItarget \fR[\fIn\fR] +Sends a series of 10 echo request packets to \fItarget\fR and times +each reply. The echo request packets consist of an OpenFlow header +plus \fIn\fR bytes (default: 64) of randomly generated payload. This +measures the latency of individual requests. + +.TP +\fBbenchmark \fItarget n count\fR +Sends \fIcount\fR echo request packets that each consist of an +OpenFlow header plus \fIn\fR bytes of payload and waits for each +response. Reports the total time required. This is a measure of the +maximum bandwidth to \fItarget\fR for round-trips of \fIn\fR-byte +messages. + +.SS "Flow Syntax" + +Some \fBovs\-ofctl\fR commands accept an argument that describes a flow or +flows. Such flow descriptions comprise a series +\fIfield\fB=\fIvalue\fR assignments, separated by commas or white +space. (Embedding spaces into a flow description normally requires +quoting to prevent the shell from breaking the description into +multiple arguments.) + +The following field assignments describe how a flow matches a packet. +If any of these assignments is omitted from the flow syntax, the field +is treated as a wildcard; thus, if all of them are omitted, the +resulting flow matches all packets. The string \fB*\fR or \fBANY\fR +may be specified to explicitly mark any of these fields as a wildcard. +(\fB*\fR should be quoted to protect it from shell expansion.) + +.IP \fBin_port=\fIport_no\fR +Matches physical port \fIport_no\fR. Switch ports are numbered as +displayed by \fBovs\-ofctl show\fR. + +.IP \fBdl_vlan=\fIvlan\fR +Matches IEEE 802.1q virtual LAN tag \fIvlan\fR. Specify \fB0xffff\fR +as \fIvlan\fR to match packets that are not tagged with a virtual LAN; +otherwise, specify a number between 0 and 4095, inclusive, as the +12-bit VLAN ID to match. + +.IP \fBdl_src=\fImac\fR +Matches Ethernet source address \fImac\fR, which is specified as 6 pairs +of hexadecimal digits delimited by colons (e.g. \fB00:0A:E4:25:6B:B0\fR). + +.IP \fBdl_dst=\fImac\fR +Matches Ethernet destination address \fImac\fR. + +.IP \fBdl_type=\fIethertype\fR +Matches Ethernet protocol type \fIethertype\fR, which is specified as an +integer between 0 and 65535, inclusive, either in decimal or as a +hexadecimal number prefixed by \fB0x\fR (e.g. \fB0x0806\fR to match ARP +packets). + +.IP \fBnw_src=\fIip\fR[\fB/\fInetmask\fR] +Matches IPv4 source address \fIip\fR, which may be specified as an +IP address or host name (e.g. \fB192.168.1.1\fR or +\fBwww.example.com\fR). The optional \fInetmask\fR allows restricting a +match to an IPv4 address prefix. The netmask may be specified as a dotted +quad (e.g. \fB192.168.1.0/255.255.255.0\fR) or as a CIDR block +(e.g. \fB192.168.1.0/24\fR). + +.IP \fBnw_dst=\fIip\fR[\fB/\fInetmask\fR] +Matches IPv4 destination address \fIip\fR. + +.IP \fBnw_proto=\fIproto\fR +Matches IP protocol type \fIproto\fR, which is specified as a decimal +number between 0 and 255, inclusive (e.g. 6 to match TCP packets). + +.IP \fBtp_src=\fIport\fR +Matches UDP or TCP source port \fIport\fR, which is specified as a decimal +number between 0 and 65535, inclusive (e.g. 80 to match packets originating +from a HTTP server). + +.IP \fBtp_dst=\fIport\fR +Matches UDP or TCP destination port \fIport\fR. + +.IP \fBicmp_type=\fItype\fR +Matches ICMP message with \fItype\fR, which is specified as a decimal +number between 0 and 255, inclusive. + +.IP \fBicmp_code=\fIcode\fR +Matches ICMP messages with \fIcode\fR. + +.PP +The following shorthand notations are also available: + +.IP \fBip\fR +Same as \fBdl_type=0x0800\fR. + +.IP \fBicmp\fR +Same as \fBdl_type=0x0800,nw_proto=1\fR. + +.IP \fBtcp\fR +Same as \fBdl_type=0x0800,nw_proto=6\fR. + +.IP \fBudp\fR +Same as \fBdl_type=0x0800,nw_proto=17\fR. + +.IP \fBarp\fR +Same as \fBdl_type=0x0806\fR. + +.PP +The \fBadd-flow\fR and \fBadd-flows\fR commands require an additional field: + +.IP \fBactions=\fR[\fItarget\fR][\fB,\fItarget\fR...]\fR +Specifies a comma-separated list of actions to take on a packet when the +flow entry matches. If no \fItarget\fR is specified, then packets +matching the flow are dropped. The \fItarget\fR may be a decimal port +number designating the physical port on which to output the packet, or one +of the following keywords: + +.RS +.IP \fBoutput\fR:\fIport\fR +Outputs the packet on the port specified by \fIport\fR. + +.IP \fBnormal\fR +Subjects the packet to the device's normal L2/L3 processing. (This +action is not implemented by all OpenFlow switches.) + +.IP \fBflood\fR +Outputs the packet on all switch physical ports other than the port on +which it was received and any ports on which flooding is disabled +(typically, these would be ports disabled by the IEEE 802.1D spanning +tree protocol). + +.IP \fBall\fR +Outputs the packet on all switch physical ports other than the port on +which it was received. + +.IP \fBcontroller\fR:\fImax_len\fR +Sends the packet to the OpenFlow controller as a ``packet in'' +message. If \fImax_len\fR is a number, then it specifies the maximum +number of bytes that should be sent. If \fImax_len\fR is \fBALL\fR or +omitted, then the entire packet is sent. + +.IP \fBlocal\fR +Outputs the packet on the ``local port,'' which corresponds to the +\fBof\fIn\fR network device (see \fBCONTACTING THE CONTROLLER\fR in +\fBsecchan\fR(8) for information on the \fBof\fIn\fR network device). + +.IP \fBdrop\fR +Discards the packet, so no further processing or forwarding takes place. +If a drop action is used, no other actions may be specified. + +.IP \fBmod_vlan_vid\fR:\fIvlan_vid\fR +Modifies the VLAN id on a packet. The VLAN tag is added or modified +as necessary to match the value specified. If the VLAN tag is added, +a priority of zero is used (see the \fBmod_vlan_pcp\fR action to set +this). + +.IP \fBmod_vlan_pcp\fR:\fIvlan_pcp\fR +Modifies the VLAN priority on a packet. The VLAN tag is added or modified +as necessary to match the value specified. Valid values are between 0 +(lowest) and 7 (highest). If the VLAN tag is added, a vid of zero is used +(see the \fBmod_vlan_vid\fR action to set this). + +.IP \fBstrip_vlan\fR +Strips the VLAN tag from a packet if it is present. + +.IP \fBmod_dl_src\fB:\fImac\fR +Sets the source Ethernet address to \fImac\fR. + +.IP \fBmod_dl_dst\fB:\fImac\fR +Sets the destination Ethernet address to \fImac\fR. +.RE + +.IP +(The OpenFlow protocol supports other actions that \fBovs\-ofctl\fR does +not yet expose to the user.) + +.PP +The \fBadd-flow\fR, \fBadd-flows\fR, and \fBdel-flows\fR commands +support an additional optional field: + +.IP \fBpriority=\fIvalue\fR +The priority at which a wildcarded entry will match in comparison to +others. \fIvalue\fR is a number between 0 and 65535, inclusive. A higher +\fIvalue\fR will match before a lower one. An exact-match entry will always +have priority over an entry containing wildcards, so it has an implicit +priority value of 65535. When adding a flow, if the field is not specified, +the flow's priority will default to 32768. + +.PP +The \fBadd-flow\fR and \fBadd-flows\fR commands support additional +optional fields: + +.TP +\fBidle_timeout=\fIseconds\fR +Causes the flow to expire after the given number of seconds of +inactivity. A value of 0 prevents a flow from expiring due to +inactivity. The default is 60 seconds. + +.IP \fBhard_timeout=\fIseconds\fR +Causes the flow to expire after the given number of seconds, +regardless of activity. A value of 0 (the default) gives the flow no +hard expiration deadline. + +.PP +The \fBdump-flows\fR, \fBdump-aggregate\fR, \fBdel-flow\fR +and \fBdel-flows\fR commands support one additional optional field: + +.TP +\fBout_port=\fIport\fR +If set, a matching flow must include an output action to \fIport\fR. + +.PP +The \fBdump-flows\fR and \fBdump-aggregate\fR commands support an +additional optional field: + +.IP \fBtable=\fInumber\fR +If specified, limits the flows about which statistics are gathered to +those in the table with the given \fInumber\fR. Tables are numbered +as shown by the \fBdump-tables\fR command. + +If this field is not specified, or if \fInumber\fR is given as +\fB255\fR, statistics are gathered about flows from all tables. + +.SS "Table Entry Output" + +The \fBdump-tables\fR and \fBdump-aggregate\fR commands print information +about the entries in a datapath's tables. Each line of output is a +unique flow entry, which begins with some common information: + +.IP \fBduration\fR +The number of seconds the entry has been in the table. + +.IP \fBtable_id\fR +The table that contains the flow. When a packet arrives, the switch +begins searching for an entry at the lowest numbered table. Tables are +numbered as shown by the \fBdump-tables\fR command. + +.IP \fBpriority\fR +The priority of the entry in relation to other entries within the same +table. A higher value will match before a lower one. + +.IP \fBn_packets\fR +The number of packets that have matched the entry. + +.IP \fBn_bytes\fR +The total number of bytes from packets that have matched the entry. + +.PP +The rest of the line consists of a description of the flow entry as +described in \fBFlow Syntax\fR, above. + + +.SH OPTIONS +.TP +\fB--strict\fR +Uses strict matching when running flow modification commands. + +.TP +\fB-t\fR, \fB--timeout=\fIsecs\fR +Limits \fBovs\-ofctl\fR runtime to approximately \fIsecs\fR seconds. If +the timeout expires, \fBovs\-ofctl\fR will exit with a \fBSIGALRM\fR +signal. + +.TP +\fB-p\fR, \fB--private-key=\fIprivkey.pem\fR +Specifies a PEM file containing the private key used as the +identity for SSL connections to a switch. + +.TP +\fB-c\fR, \fB--certificate=\fIcert.pem\fR +Specifies a PEM file containing a certificate, signed by the +controller's certificate authority (CA), that certifies the +private key to identify a trustworthy controller. + +.TP +\fB-C\fR, \fB--ca-cert=\fIcacert.pem\fR +Specifies a PEM file containing the CA certificate used to verify that +a switch is trustworthy. + +.so lib/vlog.man +.so lib/common.man + +.SH EXAMPLES + +The following examples assume that an OpenFlow switch on the local +host has been configured to listen for management connections on a +Unix domain socket named \fB@RUNDIR@/openflow.sock\fR, e.g. by +specifying \fB--listen=punix:@RUNDIR@/openflow.sock\fR on the +\fBsecchan\fR(8) command line. + +.TP +\fBovs\-ofctl dump-tables unix:@RUNDIR@/openflow.sock\fR +Prints out the switch's table stats. (This is more interesting after +some traffic has passed through.) + +.TP +\fBovs\-ofctl dump-flows unix:@RUNDIR@/openflow.sock\fR +Prints the flow entries in the switch. + +.SH "SEE ALSO" + +.BR ovs\-appctl (8), +.BR ovs\-controller (8), +.BR ovs\-vswitchd (8) diff --git a/utilities/ovs-ofctl.c b/utilities/ovs-ofctl.c new file mode 100644 index 00000000..8b343956 --- /dev/null +++ b/utilities/ovs-ofctl.c @@ -0,0 +1,1278 @@ +/* + * Copyright (c) 2008, 2009 Nicira Networks. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "command-line.h" +#include "compiler.h" +#include "dirs.h" +#include "dpif.h" +#include "dynamic-string.h" +#include "netdev.h" +#include "netlink.h" +#include "odp-util.h" +#include "ofp-print.h" +#include "ofpbuf.h" +#include "openflow/nicira-ext.h" +#include "openflow/openflow.h" +#include "packets.h" +#include "random.h" +#include "socket-util.h" +#include "timeval.h" +#include "util.h" +#include "vconn-ssl.h" +#include "vconn.h" + +#include "vlog.h" +#define THIS_MODULE VLM_ofctl + +#define DEFAULT_IDLE_TIMEOUT 60 + +#define MOD_PORT_CMD_UP "up" +#define MOD_PORT_CMD_DOWN "down" +#define MOD_PORT_CMD_FLOOD "flood" +#define MOD_PORT_CMD_NOFLOOD "noflood" + + +/* Settings that may be configured by the user. */ +struct settings { + bool strict; /* Use strict matching for flow mod commands */ +}; + +struct command { + const char *name; + int min_args; + int max_args; + void (*handler)(const struct settings *, int argc, char *argv[]); +}; + +static struct command all_commands[]; + +static void usage(void) NO_RETURN; +static void parse_options(int argc, char *argv[], struct settings *); + +int main(int argc, char *argv[]) +{ + struct settings s; + struct command *p; + + set_program_name(argv[0]); + time_init(); + vlog_init(); + parse_options(argc, argv, &s); + signal(SIGPIPE, SIG_IGN); + + argc -= optind; + argv += optind; + if (argc < 1) + ovs_fatal(0, "missing command name; use --help for help"); + + for (p = all_commands; p->name != NULL; p++) { + if (!strcmp(p->name, argv[0])) { + int n_arg = argc - 1; + if (n_arg < p->min_args) + ovs_fatal(0, "'%s' command requires at least %d arguments", + p->name, p->min_args); + else if (n_arg > p->max_args) + ovs_fatal(0, "'%s' command takes at most %d arguments", + p->name, p->max_args); + else { + p->handler(&s, argc, argv); + if (ferror(stdout)) { + ovs_fatal(0, "write to stdout failed"); + } + if (ferror(stderr)) { + ovs_fatal(0, "write to stderr failed"); + } + exit(0); + } + } + } + ovs_fatal(0, "unknown command '%s'; use --help for help", argv[0]); + + return 0; +} + +static void +parse_options(int argc, char *argv[], struct settings *s) +{ + enum { + OPT_STRICT = UCHAR_MAX + 1 + }; + static struct option long_options[] = { + {"timeout", required_argument, 0, 't'}, + {"verbose", optional_argument, 0, 'v'}, + {"strict", no_argument, 0, OPT_STRICT}, + {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'V'}, + VCONN_SSL_LONG_OPTIONS + {0, 0, 0, 0}, + }; + char *short_options = long_options_to_short_options(long_options); + + /* Set defaults that we can figure out before parsing options. */ + s->strict = false; + + for (;;) { + unsigned long int timeout; + int c; + + c = getopt_long(argc, argv, short_options, long_options, NULL); + if (c == -1) { + break; + } + + switch (c) { + case 't': + timeout = strtoul(optarg, NULL, 10); + if (timeout <= 0) { + ovs_fatal(0, "value %s on -t or --timeout is not at least 1", + optarg); + } else { + time_alarm(timeout); + } + break; + + case 'h': + usage(); + + case 'V': + OVS_PRINT_VERSION(OFP_VERSION, OFP_VERSION); + exit(EXIT_SUCCESS); + + case 'v': + vlog_set_verbosity(optarg); + break; + + case OPT_STRICT: + s->strict = true; + break; + + VCONN_SSL_OPTION_HANDLERS + + case '?': + exit(EXIT_FAILURE); + + default: + abort(); + } + } + free(short_options); +} + +static void +usage(void) +{ + printf("%s: OpenFlow switch management utility\n" + "usage: %s [OPTIONS] COMMAND [ARG...]\n" + "\nFor OpenFlow switches:\n" + " show SWITCH show OpenFlow information\n" + " status SWITCH [KEY] report statistics (about KEY)\n" + " dump-desc SWITCH print switch description\n" + " dump-tables SWITCH print table stats\n" + " mod-port SWITCH IFACE ACT modify port behavior\n" + " dump-ports SWITCH print port statistics\n" + " dump-flows SWITCH print all flow entries\n" + " dump-flows SWITCH FLOW print matching FLOWs\n" + " dump-aggregate SWITCH print aggregate flow statistics\n" + " dump-aggregate SWITCH FLOW print aggregate stats for FLOWs\n" + " add-flow SWITCH FLOW add flow described by FLOW\n" + " add-flows SWITCH FILE add flows from FILE\n" + " mod-flows SWITCH FLOW modify actions of matching FLOWs\n" + " del-flows SWITCH [FLOW] delete matching FLOWs\n" + " monitor SWITCH MISSLEN EXP print packets received from SWITCH\n" + " execute SWITCH CMD [ARG...] execute CMD with ARGS on SWITCH\n" + "\nFor OpenFlow switches and controllers:\n" + " probe VCONN probe whether VCONN is up\n" + " ping VCONN [N] latency of N-byte echos\n" + " benchmark VCONN N COUNT bandwidth of COUNT N-byte echos\n" + "where each SWITCH is an active OpenFlow connection method.\n", + program_name, program_name); + vconn_usage(true, false, false); + vlog_usage(); + printf("\nOther options:\n" + " --strict use strict match for flow commands\n" + " -t, --timeout=SECS give up after SECS seconds\n" + " -h, --help display this help message\n" + " -V, --version display version information\n"); + exit(EXIT_SUCCESS); +} + +static void run(int retval, const char *message, ...) + PRINTF_FORMAT(2, 3); + +static void run(int retval, const char *message, ...) +{ + if (retval) { + va_list args; + + fprintf(stderr, "%s: ", program_name); + va_start(args, message); + vfprintf(stderr, message, args); + va_end(args); + if (retval == EOF) { + fputs(": unexpected end of file\n", stderr); + } else { + fprintf(stderr, ": %s\n", strerror(retval)); + } + + exit(EXIT_FAILURE); + } +} + +/* Generic commands. */ + +static void +open_vconn(const char *name, struct vconn **vconnp) +{ + struct dpif dpif; + struct stat s; + + if (strstr(name, ":")) { + run(vconn_open_block(name, OFP_VERSION, vconnp), + "connecting to %s", name); + } else if (!stat(name, &s) && S_ISSOCK(s.st_mode)) { + char *vconn_name = xasprintf("unix:%s", name); + VLOG_INFO("connecting to %s", vconn_name); + run(vconn_open_block(vconn_name, OFP_VERSION, vconnp), + "connecting to %s", vconn_name); + free(vconn_name); + } else if (!dpif_open(name, &dpif)) { + char dpif_name[IF_NAMESIZE + 1]; + char *socket_name; + char *vconn_name; + + run(dpif_get_name(&dpif, dpif_name, sizeof dpif_name), + "obtaining name of %s", dpif_name); + dpif_close(&dpif); + if (strcmp(dpif_name, name)) { + VLOG_INFO("datapath %s is named %s", name, dpif_name); + } + + socket_name = xasprintf("%s/%s.mgmt", ovs_rundir, dpif_name); + if (stat(socket_name, &s)) { + ovs_fatal(errno, "cannot connect to %s: stat failed on %s", + name, socket_name); + } else if (!S_ISSOCK(s.st_mode)) { + ovs_fatal(0, "cannot connect to %s: %s is not a socket", + name, socket_name); + } + + vconn_name = xasprintf("unix:%s", socket_name); + VLOG_INFO("connecting to %s", vconn_name); + run(vconn_open_block(vconn_name, OFP_VERSION, vconnp), + "connecting to %s", vconn_name); + free(socket_name); + free(vconn_name); + } else { + ovs_fatal(0, "%s is not a valid connection method", name); + } +} + +static void * +alloc_stats_request(size_t body_len, uint16_t type, struct ofpbuf **bufferp) +{ + struct ofp_stats_request *rq; + rq = make_openflow((offsetof(struct ofp_stats_request, body) + + body_len), OFPT_STATS_REQUEST, bufferp); + rq->type = htons(type); + rq->flags = htons(0); + return rq->body; +} + +static void +send_openflow_buffer(struct vconn *vconn, struct ofpbuf *buffer) +{ + update_openflow_length(buffer); + run(vconn_send_block(vconn, buffer), "failed to send packet to switch"); +} + +static void +dump_transaction(const char *vconn_name, struct ofpbuf *request) +{ + struct vconn *vconn; + struct ofpbuf *reply; + + update_openflow_length(request); + open_vconn(vconn_name, &vconn); + run(vconn_transact(vconn, request, &reply), "talking to %s", vconn_name); + ofp_print(stdout, reply->data, reply->size, 1); + vconn_close(vconn); +} + +static void +dump_trivial_transaction(const char *vconn_name, uint8_t request_type) +{ + struct ofpbuf *request; + make_openflow(sizeof(struct ofp_header), request_type, &request); + dump_transaction(vconn_name, request); +} + +static void +dump_stats_transaction(const char *vconn_name, struct ofpbuf *request) +{ + uint32_t send_xid = ((struct ofp_header *) request->data)->xid; + struct vconn *vconn; + bool done = false; + + open_vconn(vconn_name, &vconn); + send_openflow_buffer(vconn, request); + while (!done) { + uint32_t recv_xid; + struct ofpbuf *reply; + + run(vconn_recv_block(vconn, &reply), "OpenFlow packet receive failed"); + recv_xid = ((struct ofp_header *) reply->data)->xid; + if (send_xid == recv_xid) { + struct ofp_stats_reply *osr; + + ofp_print(stdout, reply->data, reply->size, 1); + + osr = ofpbuf_at(reply, 0, sizeof *osr); + done = !osr || !(ntohs(osr->flags) & OFPSF_REPLY_MORE); + } else { + VLOG_DBG("received reply with xid %08"PRIx32" " + "!= expected %08"PRIx32, recv_xid, send_xid); + } + ofpbuf_delete(reply); + } + vconn_close(vconn); +} + +static void +dump_trivial_stats_transaction(const char *vconn_name, uint8_t stats_type) +{ + struct ofpbuf *request; + alloc_stats_request(0, stats_type, &request); + dump_stats_transaction(vconn_name, request); +} + +static void +do_show(const struct settings *s UNUSED, int argc UNUSED, char *argv[]) +{ + dump_trivial_transaction(argv[1], OFPT_FEATURES_REQUEST); + dump_trivial_transaction(argv[1], OFPT_GET_CONFIG_REQUEST); +} + +static void +do_status(const struct settings *s UNUSED, int argc, char *argv[]) +{ + struct nicira_header *request, *reply; + struct vconn *vconn; + struct ofpbuf *b; + + request = make_openflow(sizeof *request, OFPT_VENDOR, &b); + request->vendor = htonl(NX_VENDOR_ID); + request->subtype = htonl(NXT_STATUS_REQUEST); + if (argc > 2) { + ofpbuf_put(b, argv[2], strlen(argv[2])); + update_openflow_length(b); + } + open_vconn(argv[1], &vconn); + run(vconn_transact(vconn, b, &b), "talking to %s", argv[1]); + vconn_close(vconn); + + if (b->size < sizeof *reply) { + ovs_fatal(0, "short reply (%zu bytes)", b->size); + } + reply = b->data; + if (reply->header.type != OFPT_VENDOR + || reply->vendor != ntohl(NX_VENDOR_ID) + || reply->subtype != ntohl(NXT_STATUS_REPLY)) { + ofp_print(stderr, b->data, b->size, 2); + ovs_fatal(0, "bad reply"); + } + + fwrite(reply + 1, b->size - sizeof *reply, 1, stdout); +} + +static void +do_dump_desc(const struct settings *s UNUSED, int argc UNUSED, char *argv[]) +{ + dump_trivial_stats_transaction(argv[1], OFPST_DESC); +} + +static void +do_dump_tables(const struct settings *s UNUSED, int argc UNUSED, char *argv[]) +{ + dump_trivial_stats_transaction(argv[1], OFPST_TABLE); +} + + +static uint32_t +str_to_u32(const char *str) +{ + char *tail; + uint32_t value; + + errno = 0; + value = strtoul(str, &tail, 0); + if (errno == EINVAL || errno == ERANGE || *tail) { + ovs_fatal(0, "invalid numeric format %s", str); + } + return value; +} + +static void +str_to_mac(const char *str, uint8_t mac[6]) +{ + if (sscanf(str, "%"SCNx8":%"SCNx8":%"SCNx8":%"SCNx8":%"SCNx8":%"SCNx8, + &mac[0], &mac[1], &mac[2], &mac[3], &mac[4], &mac[5]) != 6) { + ovs_fatal(0, "invalid mac address %s", str); + } +} + +static uint32_t +str_to_ip(const char *str_, uint32_t *ip) +{ + char *str = xstrdup(str_); + char *save_ptr = NULL; + const char *name, *netmask; + struct in_addr in_addr; + int n_wild, retval; + + name = strtok_r(str, "//", &save_ptr); + retval = name ? lookup_ip(name, &in_addr) : EINVAL; + if (retval) { + ovs_fatal(0, "%s: could not convert to IP address", str); + } + *ip = in_addr.s_addr; + + netmask = strtok_r(NULL, "//", &save_ptr); + if (netmask) { + uint8_t o[4]; + if (sscanf(netmask, "%"SCNu8".%"SCNu8".%"SCNu8".%"SCNu8, + &o[0], &o[1], &o[2], &o[3]) == 4) { + uint32_t nm = (o[0] << 24) | (o[1] << 16) | (o[2] << 8) | o[3]; + int i; + + /* Find first 1-bit. */ + for (i = 0; i < 32; i++) { + if (nm & (1u << i)) { + break; + } + } + n_wild = i; + + /* Verify that the rest of the bits are 1-bits. */ + for (; i < 32; i++) { + if (!(nm & (1u << i))) { + ovs_fatal(0, "%s: %s is not a valid netmask", + str, netmask); + } + } + } else { + int prefix = atoi(netmask); + if (prefix <= 0 || prefix > 32) { + ovs_fatal(0, "%s: network prefix bits not between 1 and 32", + str); + } + n_wild = 32 - prefix; + } + } else { + n_wild = 0; + } + + free(str); + return n_wild; +} + +static void * +put_action(struct ofpbuf *b, size_t size, uint16_t type) +{ + struct ofp_action_header *ah = ofpbuf_put_zeros(b, size); + ah->type = htons(type); + ah->len = htons(size); + return ah; +} + +static struct ofp_action_output * +put_output_action(struct ofpbuf *b, uint16_t port) +{ + struct ofp_action_output *oao = put_action(b, sizeof *oao, OFPAT_OUTPUT); + oao->port = htons(port); + return oao; +} + +static void +put_dl_addr_action(struct ofpbuf *b, uint16_t type, const char *addr) +{ + struct ofp_action_dl_addr *oada = put_action(b, sizeof *oada, type); + str_to_mac(addr, oada->dl_addr); +} + + +static bool +parse_port_name(const char *name, uint16_t *port) +{ + struct pair { + const char *name; + uint16_t value; + }; + static const struct pair pairs[] = { +#define DEF_PAIR(NAME) {#NAME, OFPP_##NAME} + DEF_PAIR(IN_PORT), + DEF_PAIR(TABLE), + DEF_PAIR(NORMAL), + DEF_PAIR(FLOOD), + DEF_PAIR(ALL), + DEF_PAIR(CONTROLLER), + DEF_PAIR(LOCAL), + DEF_PAIR(NONE), +#undef DEF_PAIR + }; + static const int n_pairs = ARRAY_SIZE(pairs); + size_t i; + + for (i = 0; i < n_pairs; i++) { + if (!strcasecmp(name, pairs[i].name)) { + *port = pairs[i].value; + return true; + } + } + return false; +} + +static void +str_to_action(char *str, struct ofpbuf *b) +{ + char *act, *arg; + char *saveptr = NULL; + bool drop = false; + int n_actions; + + for (act = strtok_r(str, ", \t\r\n", &saveptr), n_actions = 0; act; + act = strtok_r(NULL, ", \t\r\n", &saveptr), n_actions++) + { + uint16_t port; + + if (drop) { + ovs_fatal(0, "Drop actions must not be followed by other actions"); + } + + /* Arguments are separated by colons */ + arg = strchr(act, ':'); + if (arg) { + *arg = '\0'; + arg++; + } + + if (!strcasecmp(act, "mod_vlan_vid")) { + struct ofp_action_vlan_vid *va; + va = put_action(b, sizeof *va, OFPAT_SET_VLAN_VID); + va->vlan_vid = htons(str_to_u32(arg)); + } else if (!strcasecmp(act, "mod_vlan_pcp")) { + struct ofp_action_vlan_pcp *va; + va = put_action(b, sizeof *va, OFPAT_SET_VLAN_PCP); + va->vlan_pcp = str_to_u32(arg); + } else if (!strcasecmp(act, "strip_vlan")) { + struct ofp_action_header *ah; + ah = put_action(b, sizeof *ah, OFPAT_STRIP_VLAN); + ah->type = htons(OFPAT_STRIP_VLAN); + } else if (!strcasecmp(act, "mod_dl_src")) { + put_dl_addr_action(b, OFPAT_SET_DL_SRC, arg); + } else if (!strcasecmp(act, "mod_dl_dst")) { + put_dl_addr_action(b, OFPAT_SET_DL_DST, arg); + } else if (!strcasecmp(act, "output")) { + put_output_action(b, str_to_u32(arg)); + } else if (!strcasecmp(act, "drop")) { + /* A drop action in OpenFlow occurs by just not setting + * an action. */ + drop = true; + if (n_actions) { + ovs_fatal(0, "Drop actions must not be preceded by other " + "actions"); + } + } else if (!strcasecmp(act, "CONTROLLER")) { + struct ofp_action_output *oao; + oao = put_output_action(b, OFPP_CONTROLLER); + + /* Unless a numeric argument is specified, we send the whole + * packet to the controller. */ + if (arg && (strspn(act, "0123456789") == strlen(act))) { + oao->max_len = htons(str_to_u32(arg)); + } + } else if (parse_port_name(act, &port)) { + put_output_action(b, port); + } else if (strspn(act, "0123456789") == strlen(act)) { + put_output_action(b, str_to_u32(act)); + } else { + ovs_fatal(0, "Unknown action: %s", act); + } + } +} + +struct protocol { + const char *name; + uint16_t dl_type; + uint8_t nw_proto; +}; + +static bool +parse_protocol(const char *name, const struct protocol **p_out) +{ + static const struct protocol protocols[] = { + { "ip", ETH_TYPE_IP, 0 }, + { "arp", ETH_TYPE_ARP, 0 }, + { "icmp", ETH_TYPE_IP, IP_TYPE_ICMP }, + { "tcp", ETH_TYPE_IP, IP_TYPE_TCP }, + { "udp", ETH_TYPE_IP, IP_TYPE_UDP }, + }; + const struct protocol *p; + + for (p = protocols; p < &protocols[ARRAY_SIZE(protocols)]; p++) { + if (!strcmp(p->name, name)) { + *p_out = p; + return true; + } + } + *p_out = NULL; + return false; +} + +struct field { + const char *name; + uint32_t wildcard; + enum { F_U8, F_U16, F_MAC, F_IP } type; + size_t offset, shift; +}; + +static bool +parse_field(const char *name, const struct field **f_out) +{ +#define F_OFS(MEMBER) offsetof(struct ofp_match, MEMBER) + static const struct field fields[] = { + { "in_port", OFPFW_IN_PORT, F_U16, F_OFS(in_port), 0 }, + { "dl_vlan", OFPFW_DL_VLAN, F_U16, F_OFS(dl_vlan), 0 }, + { "dl_src", OFPFW_DL_SRC, F_MAC, F_OFS(dl_src), 0 }, + { "dl_dst", OFPFW_DL_DST, F_MAC, F_OFS(dl_dst), 0 }, + { "dl_type", OFPFW_DL_TYPE, F_U16, F_OFS(dl_type), 0 }, + { "nw_src", OFPFW_NW_SRC_MASK, F_IP, + F_OFS(nw_src), OFPFW_NW_SRC_SHIFT }, + { "nw_dst", OFPFW_NW_DST_MASK, F_IP, + F_OFS(nw_dst), OFPFW_NW_DST_SHIFT }, + { "nw_proto", OFPFW_NW_PROTO, F_U8, F_OFS(nw_proto), 0 }, + { "tp_src", OFPFW_TP_SRC, F_U16, F_OFS(tp_src), 0 }, + { "tp_dst", OFPFW_TP_DST, F_U16, F_OFS(tp_dst), 0 }, + { "icmp_type", OFPFW_ICMP_TYPE, F_U16, F_OFS(icmp_type), 0 }, + { "icmp_code", OFPFW_ICMP_CODE, F_U16, F_OFS(icmp_code), 0 } + }; + const struct field *f; + + for (f = fields; f < &fields[ARRAY_SIZE(fields)]; f++) { + if (!strcmp(f->name, name)) { + *f_out = f; + return true; + } + } + *f_out = NULL; + return false; +} + +static void +str_to_flow(char *string, struct ofp_match *match, struct ofpbuf *actions, + uint8_t *table_idx, uint16_t *out_port, uint16_t *priority, + uint16_t *idle_timeout, uint16_t *hard_timeout) +{ + char *save_ptr = NULL; + char *name; + uint32_t wildcards; + + if (table_idx) { + *table_idx = 0xff; + } + if (out_port) { + *out_port = OFPP_NONE; + } + if (priority) { + *priority = OFP_DEFAULT_PRIORITY; + } + if (idle_timeout) { + *idle_timeout = DEFAULT_IDLE_TIMEOUT; + } + if (hard_timeout) { + *hard_timeout = OFP_FLOW_PERMANENT; + } + if (actions) { + char *act_str = strstr(string, "action"); + if (!act_str) { + ovs_fatal(0, "must specify an action"); + } + *(act_str-1) = '\0'; + + act_str = strchr(act_str, '='); + if (!act_str) { + ovs_fatal(0, "must specify an action"); + } + + act_str++; + + str_to_action(act_str, actions); + } + memset(match, 0, sizeof *match); + wildcards = OFPFW_ALL; + for (name = strtok_r(string, "=, \t\r\n", &save_ptr); name; + name = strtok_r(NULL, "=, \t\r\n", &save_ptr)) { + const struct protocol *p; + + if (parse_protocol(name, &p)) { + wildcards &= ~OFPFW_DL_TYPE; + match->dl_type = htons(p->dl_type); + if (p->nw_proto) { + wildcards &= ~OFPFW_NW_PROTO; + match->nw_proto = p->nw_proto; + } + } else { + const struct field *f; + char *value; + + value = strtok_r(NULL, ", \t\r\n", &save_ptr); + if (!value) { + ovs_fatal(0, "field %s missing value", name); + } + + if (table_idx && !strcmp(name, "table")) { + *table_idx = atoi(value); + } else if (out_port && !strcmp(name, "out_port")) { + *out_port = atoi(value); + } else if (priority && !strcmp(name, "priority")) { + *priority = atoi(value); + } else if (idle_timeout && !strcmp(name, "idle_timeout")) { + *idle_timeout = atoi(value); + } else if (hard_timeout && !strcmp(name, "hard_timeout")) { + *hard_timeout = atoi(value); + } else if (parse_field(name, &f)) { + void *data = (char *) match + f->offset; + if (!strcmp(value, "*") || !strcmp(value, "ANY")) { + wildcards |= f->wildcard; + } else { + wildcards &= ~f->wildcard; + if (f->wildcard == OFPFW_IN_PORT + && parse_port_name(value, (uint16_t *) data)) { + /* Nothing to do. */ + } else if (f->type == F_U8) { + *(uint8_t *) data = str_to_u32(value); + } else if (f->type == F_U16) { + *(uint16_t *) data = htons(str_to_u32(value)); + } else if (f->type == F_MAC) { + str_to_mac(value, data); + } else if (f->type == F_IP) { + wildcards |= str_to_ip(value, data) << f->shift; + } else { + NOT_REACHED(); + } + } + } else { + ovs_fatal(0, "unknown keyword %s", name); + } + } + } + match->wildcards = htonl(wildcards); +} + +static void +do_dump_flows(const struct settings *s UNUSED, int argc, char *argv[]) +{ + struct ofp_flow_stats_request *req; + uint16_t out_port; + struct ofpbuf *request; + + req = alloc_stats_request(sizeof *req, OFPST_FLOW, &request); + str_to_flow(argc > 2 ? argv[2] : "", &req->match, NULL, + &req->table_id, &out_port, NULL, NULL, NULL); + memset(&req->pad, 0, sizeof req->pad); + req->out_port = htons(out_port); + + dump_stats_transaction(argv[1], request); +} + +static void +do_dump_aggregate(const struct settings *s UNUSED, int argc, char *argv[]) +{ + struct ofp_aggregate_stats_request *req; + struct ofpbuf *request; + uint16_t out_port; + + req = alloc_stats_request(sizeof *req, OFPST_AGGREGATE, &request); + str_to_flow(argc > 2 ? argv[2] : "", &req->match, NULL, + &req->table_id, &out_port, NULL, NULL, NULL); + memset(&req->pad, 0, sizeof req->pad); + req->out_port = htons(out_port); + + dump_stats_transaction(argv[1], request); +} + +static void +do_add_flow(const struct settings *s UNUSED, int argc UNUSED, char *argv[]) +{ + struct vconn *vconn; + struct ofpbuf *buffer; + struct ofp_flow_mod *ofm; + uint16_t priority, idle_timeout, hard_timeout; + struct ofp_match match; + + /* Parse and send. str_to_flow() will expand and reallocate the data in + * 'buffer', so we can't keep pointers to across the str_to_flow() call. */ + make_openflow(sizeof *ofm, OFPT_FLOW_MOD, &buffer); + str_to_flow(argv[2], &match, buffer, + NULL, NULL, &priority, &idle_timeout, &hard_timeout); + ofm = buffer->data; + ofm->match = match; + ofm->command = htons(OFPFC_ADD); + ofm->idle_timeout = htons(idle_timeout); + ofm->hard_timeout = htons(hard_timeout); + ofm->buffer_id = htonl(UINT32_MAX); + ofm->priority = htons(priority); + ofm->reserved = htonl(0); + + open_vconn(argv[1], &vconn); + send_openflow_buffer(vconn, buffer); + vconn_close(vconn); +} + +static void +do_add_flows(const struct settings *s UNUSED, int argc UNUSED, char *argv[]) +{ + struct vconn *vconn; + FILE *file; + char line[1024]; + + file = fopen(argv[2], "r"); + if (file == NULL) { + ovs_fatal(errno, "%s: open", argv[2]); + } + + open_vconn(argv[1], &vconn); + while (fgets(line, sizeof line, file)) { + struct ofpbuf *buffer; + struct ofp_flow_mod *ofm; + uint16_t priority, idle_timeout, hard_timeout; + struct ofp_match match; + + char *comment; + + /* Delete comments. */ + comment = strchr(line, '#'); + if (comment) { + *comment = '\0'; + } + + /* Drop empty lines. */ + if (line[strspn(line, " \t\n")] == '\0') { + continue; + } + + /* Parse and send. str_to_flow() will expand and reallocate the data + * in 'buffer', so we can't keep pointers to across the str_to_flow() + * call. */ + ofm = make_openflow(sizeof *ofm, OFPT_FLOW_MOD, &buffer); + str_to_flow(line, &match, buffer, + NULL, NULL, &priority, &idle_timeout, &hard_timeout); + ofm = buffer->data; + ofm->match = match; + ofm->command = htons(OFPFC_ADD); + ofm->idle_timeout = htons(idle_timeout); + ofm->hard_timeout = htons(hard_timeout); + ofm->buffer_id = htonl(UINT32_MAX); + ofm->priority = htons(priority); + ofm->reserved = htonl(0); + + send_openflow_buffer(vconn, buffer); + } + vconn_close(vconn); + fclose(file); +} + +static void +do_mod_flows(const struct settings *s, int argc UNUSED, char *argv[]) +{ + uint16_t priority, idle_timeout, hard_timeout; + struct vconn *vconn; + struct ofpbuf *buffer; + struct ofp_flow_mod *ofm; + + /* Parse and send. */ + ofm = make_openflow(sizeof *ofm, OFPT_FLOW_MOD, &buffer); + str_to_flow(argv[2], &ofm->match, buffer, + NULL, NULL, &priority, &idle_timeout, &hard_timeout); + if (s->strict) { + ofm->command = htons(OFPFC_MODIFY_STRICT); + } else { + ofm->command = htons(OFPFC_MODIFY); + } + ofm->idle_timeout = htons(idle_timeout); + ofm->hard_timeout = htons(hard_timeout); + ofm->buffer_id = htonl(UINT32_MAX); + ofm->priority = htons(priority); + ofm->reserved = htonl(0); + + open_vconn(argv[1], &vconn); + send_openflow_buffer(vconn, buffer); + vconn_close(vconn); +} + +static void do_del_flows(const struct settings *s, int argc, char *argv[]) +{ + struct vconn *vconn; + uint16_t priority; + uint16_t out_port; + struct ofpbuf *buffer; + struct ofp_flow_mod *ofm; + + /* Parse and send. */ + ofm = make_openflow(sizeof *ofm, OFPT_FLOW_MOD, &buffer); + str_to_flow(argc > 2 ? argv[2] : "", &ofm->match, NULL, NULL, + &out_port, &priority, NULL, NULL); + if (s->strict) { + ofm->command = htons(OFPFC_DELETE_STRICT); + } else { + ofm->command = htons(OFPFC_DELETE); + } + ofm->idle_timeout = htons(0); + ofm->hard_timeout = htons(0); + ofm->buffer_id = htonl(UINT32_MAX); + ofm->out_port = htons(out_port); + ofm->priority = htons(priority); + ofm->reserved = htonl(0); + + open_vconn(argv[1], &vconn); + send_openflow_buffer(vconn, buffer); + vconn_close(vconn); +} + +static void +do_monitor(const struct settings *s UNUSED, int argc UNUSED, char *argv[]) +{ + struct vconn *vconn; + + open_vconn(argv[1], &vconn); + if (argc > 2) { + int miss_send_len = atoi(argv[2]); + int send_flow_exp = argc > 3 ? atoi(argv[3]) : 0; + struct ofp_switch_config *osc; + struct ofpbuf *buf; + + osc = make_openflow(sizeof *osc, OFPT_SET_CONFIG, &buf); + osc->flags = htons(send_flow_exp ? OFPC_SEND_FLOW_EXP : 0); + osc->miss_send_len = htons(miss_send_len); + send_openflow_buffer(vconn, buf); + } + for (;;) { + struct ofpbuf *b; + run(vconn_recv_block(vconn, &b), "vconn_recv"); + ofp_print(stderr, b->data, b->size, 2); + ofpbuf_delete(b); + } +} + +static void +do_dump_ports(const struct settings *s UNUSED, int argc UNUSED, char *argv[]) +{ + dump_trivial_stats_transaction(argv[1], OFPST_PORT); +} + +static void +do_probe(const struct settings *s UNUSED, int argc UNUSED, char *argv[]) +{ + struct ofpbuf *request; + struct vconn *vconn; + struct ofpbuf *reply; + + make_openflow(sizeof(struct ofp_header), OFPT_ECHO_REQUEST, &request); + open_vconn(argv[1], &vconn); + run(vconn_transact(vconn, request, &reply), "talking to %s", argv[1]); + if (reply->size != sizeof(struct ofp_header)) { + ovs_fatal(0, "reply does not match request"); + } + ofpbuf_delete(reply); + vconn_close(vconn); +} + +static void +do_mod_port(const struct settings *s UNUSED, int argc UNUSED, char *argv[]) +{ + struct ofpbuf *request, *reply; + struct ofp_switch_features *osf; + struct ofp_port_mod *opm; + struct vconn *vconn; + char *endptr; + int n_ports; + int port_idx; + int port_no; + + + /* Check if the argument is a port index. Otherwise, treat it as + * the port name. */ + port_no = strtol(argv[2], &endptr, 10); + if (port_no == 0 && endptr == argv[2]) { + port_no = -1; + } + + /* Send a "Features Request" to get the information we need in order + * to modify the port. */ + make_openflow(sizeof(struct ofp_header), OFPT_FEATURES_REQUEST, &request); + open_vconn(argv[1], &vconn); + run(vconn_transact(vconn, request, &reply), "talking to %s", argv[1]); + + osf = reply->data; + n_ports = (reply->size - sizeof *osf) / sizeof *osf->ports; + + for (port_idx = 0; port_idx < n_ports; port_idx++) { + if (port_no != -1) { + /* Check argument as a port index */ + if (osf->ports[port_idx].port_no == htons(port_no)) { + break; + } + } else { + /* Check argument as an interface name */ + if (!strncmp((char *)osf->ports[port_idx].name, argv[2], + sizeof osf->ports[0].name)) { + break; + } + + } + } + if (port_idx == n_ports) { + ovs_fatal(0, "couldn't find monitored port: %s", argv[2]); + } + + opm = make_openflow(sizeof(struct ofp_port_mod), OFPT_PORT_MOD, &request); + opm->port_no = osf->ports[port_idx].port_no; + memcpy(opm->hw_addr, osf->ports[port_idx].hw_addr, sizeof opm->hw_addr); + opm->config = htonl(0); + opm->mask = htonl(0); + opm->advertise = htonl(0); + + printf("modifying port: %s\n", osf->ports[port_idx].name); + + if (!strncasecmp(argv[3], MOD_PORT_CMD_UP, sizeof MOD_PORT_CMD_UP)) { + opm->mask |= htonl(OFPPC_PORT_DOWN); + } else if (!strncasecmp(argv[3], MOD_PORT_CMD_DOWN, + sizeof MOD_PORT_CMD_DOWN)) { + opm->mask |= htonl(OFPPC_PORT_DOWN); + opm->config |= htonl(OFPPC_PORT_DOWN); + } else if (!strncasecmp(argv[3], MOD_PORT_CMD_FLOOD, + sizeof MOD_PORT_CMD_FLOOD)) { + opm->mask |= htonl(OFPPC_NO_FLOOD); + } else if (!strncasecmp(argv[3], MOD_PORT_CMD_NOFLOOD, + sizeof MOD_PORT_CMD_NOFLOOD)) { + opm->mask |= htonl(OFPPC_NO_FLOOD); + opm->config |= htonl(OFPPC_NO_FLOOD); + } else { + ovs_fatal(0, "unknown mod-port command '%s'", argv[3]); + } + + send_openflow_buffer(vconn, request); + + ofpbuf_delete(reply); + vconn_close(vconn); +} + +static void +do_ping(const struct settings *s UNUSED, int argc, char *argv[]) +{ + size_t max_payload = 65535 - sizeof(struct ofp_header); + unsigned int payload; + struct vconn *vconn; + int i; + + payload = argc > 2 ? atoi(argv[2]) : 64; + if (payload > max_payload) { + ovs_fatal(0, "payload must be between 0 and %zu bytes", max_payload); + } + + open_vconn(argv[1], &vconn); + for (i = 0; i < 10; i++) { + struct timeval start, end; + struct ofpbuf *request, *reply; + struct ofp_header *rq_hdr, *rpy_hdr; + + rq_hdr = make_openflow(sizeof(struct ofp_header) + payload, + OFPT_ECHO_REQUEST, &request); + random_bytes(rq_hdr + 1, payload); + + gettimeofday(&start, NULL); + run(vconn_transact(vconn, ofpbuf_clone(request), &reply), "transact"); + gettimeofday(&end, NULL); + + rpy_hdr = reply->data; + if (reply->size != request->size + || memcmp(rpy_hdr + 1, rq_hdr + 1, payload) + || rpy_hdr->xid != rq_hdr->xid + || rpy_hdr->type != OFPT_ECHO_REPLY) { + printf("Reply does not match request. Request:\n"); + ofp_print(stdout, request, request->size, 2); + printf("Reply:\n"); + ofp_print(stdout, reply, reply->size, 2); + } + printf("%d bytes from %s: xid=%08"PRIx32" time=%.1f ms\n", + reply->size - sizeof *rpy_hdr, argv[1], rpy_hdr->xid, + (1000*(double)(end.tv_sec - start.tv_sec)) + + (.001*(end.tv_usec - start.tv_usec))); + ofpbuf_delete(request); + ofpbuf_delete(reply); + } + vconn_close(vconn); +} + +static void +do_benchmark(const struct settings *s UNUSED, int argc UNUSED, char *argv[]) +{ + size_t max_payload = 65535 - sizeof(struct ofp_header); + struct timeval start, end; + unsigned int payload_size, message_size; + struct vconn *vconn; + double duration; + int count; + int i; + + payload_size = atoi(argv[2]); + if (payload_size > max_payload) { + ovs_fatal(0, "payload must be between 0 and %zu bytes", max_payload); + } + message_size = sizeof(struct ofp_header) + payload_size; + + count = atoi(argv[3]); + + printf("Sending %d packets * %u bytes (with header) = %u bytes total\n", + count, message_size, count * message_size); + + open_vconn(argv[1], &vconn); + gettimeofday(&start, NULL); + for (i = 0; i < count; i++) { + struct ofpbuf *request, *reply; + struct ofp_header *rq_hdr; + + rq_hdr = make_openflow(message_size, OFPT_ECHO_REQUEST, &request); + memset(rq_hdr + 1, 0, payload_size); + run(vconn_transact(vconn, request, &reply), "transact"); + ofpbuf_delete(reply); + } + gettimeofday(&end, NULL); + vconn_close(vconn); + + duration = ((1000*(double)(end.tv_sec - start.tv_sec)) + + (.001*(end.tv_usec - start.tv_usec))); + printf("Finished in %.1f ms (%.0f packets/s) (%.0f bytes/s)\n", + duration, count / (duration / 1000.0), + count * message_size / (duration / 1000.0)); +} + +static void +do_execute(const struct settings *s UNUSED, int argc, char *argv[]) +{ + struct vconn *vconn; + struct ofpbuf *request; + struct nicira_header *nicira; + struct nx_command_reply *ncr; + uint32_t xid; + int i; + + nicira = make_openflow(sizeof *nicira, OFPT_VENDOR, &request); + xid = nicira->header.xid; + nicira->vendor = htonl(NX_VENDOR_ID); + nicira->subtype = htonl(NXT_COMMAND_REQUEST); + ofpbuf_put(request, argv[2], strlen(argv[2])); + for (i = 3; i < argc; i++) { + ofpbuf_put_zeros(request, 1); + ofpbuf_put(request, argv[i], strlen(argv[i])); + } + update_openflow_length(request); + + open_vconn(argv[1], &vconn); + run(vconn_send_block(vconn, request), "send"); + + for (;;) { + struct ofpbuf *reply; + uint32_t status; + + run(vconn_recv_xid(vconn, xid, &reply), "recv_xid"); + if (reply->size < sizeof *ncr) { + ovs_fatal(0, "reply is too short (%zu bytes < %zu bytes)", + reply->size, sizeof *ncr); + } + ncr = reply->data; + if (ncr->nxh.header.type != OFPT_VENDOR + || ncr->nxh.vendor != htonl(NX_VENDOR_ID) + || ncr->nxh.subtype != htonl(NXT_COMMAND_REPLY)) { + ovs_fatal(0, "reply is invalid"); + } + + status = ntohl(ncr->status); + if (status & NXT_STATUS_STARTED) { + /* Wait for a second reply. */ + continue; + } else if (status & NXT_STATUS_EXITED) { + fprintf(stderr, "process terminated normally with exit code %d", + status & NXT_STATUS_EXITSTATUS); + } else if (status & NXT_STATUS_SIGNALED) { + fprintf(stderr, "process terminated by signal %d", + status & NXT_STATUS_TERMSIG); + } else if (status & NXT_STATUS_ERROR) { + fprintf(stderr, "error executing command"); + } else { + fprintf(stderr, "process terminated for unknown reason"); + } + if (status & NXT_STATUS_COREDUMP) { + fprintf(stderr, " (core dumped)"); + } + putc('\n', stderr); + + fwrite(ncr + 1, reply->size - sizeof *ncr, 1, stdout); + break; + } +} + +static void +do_help(const struct settings *s UNUSED, int argc UNUSED, char *argv[] UNUSED) +{ + usage(); +} + +static struct command all_commands[] = { + { "show", 1, 1, do_show }, + { "status", 1, 2, do_status }, + { "monitor", 1, 3, do_monitor }, + { "dump-desc", 1, 1, do_dump_desc }, + { "dump-tables", 1, 1, do_dump_tables }, + { "dump-flows", 1, 2, do_dump_flows }, + { "dump-aggregate", 1, 2, do_dump_aggregate }, + { "add-flow", 2, 2, do_add_flow }, + { "add-flows", 2, 2, do_add_flows }, + { "mod-flows", 2, 2, do_mod_flows }, + { "del-flows", 1, 2, do_del_flows }, + { "dump-ports", 1, 1, do_dump_ports }, + { "mod-port", 3, 3, do_mod_port }, + { "probe", 1, 1, do_probe }, + { "ping", 1, 2, do_ping }, + { "benchmark", 3, 3, do_benchmark }, + { "execute", 2, INT_MAX, do_execute }, + { "help", 0, INT_MAX, do_help }, + { NULL, 0, 0, NULL }, +}; diff --git a/utilities/ovs-parse-leaks.in b/utilities/ovs-parse-leaks.in new file mode 100755 index 00000000..4e06847c --- /dev/null +++ b/utilities/ovs-parse-leaks.in @@ -0,0 +1,285 @@ +#! @PERL@ + +use strict; +use warnings; + +if (grep($_ eq '--help', @ARGV)) { + print < 1; +die "$0: $ARGV[0] does not exist" if @ARGV > 0 && ! -e $ARGV[0]; + +our ($binary); +our ($a2l) = search_path("addr2line"); +my ($no_syms) = "symbols will not be translated"; +if (!@ARGV) { + print "no binary specified; $no_syms\n"; +} elsif (! -e $ARGV[0]) { + print "$ARGV[0] does not exist; $no_syms"; +} elsif (!defined($a2l)) { + print "addr2line not found in PATH; $no_syms"; +} else { + $binary = $ARGV[0]; +} + +our ($objdump) = search_path("objdump"); +print "objdump not found; dynamic library symbols will not be translated\n" + if !defined($objdump); + +our %blocks; +our @segments; +while () { + my $ptr = "((?:0x)?[0-9a-fA-F]+|\\(nil\\))"; + my $callers = ":((?: $ptr)+)"; + if (/^malloc\((\d+)\) -> $ptr$callers$/) { + allocated($., $2, $1, $3); + } elsif (/^claim\($ptr\)$callers$/) { + claimed($., $1, $2); + } elsif (/realloc\($ptr, (\d+)\) -> $ptr$callers$/) { + my ($callers) = $4; + freed($., $1, $callers); + allocated($., $3, $2, $callers); + } elsif (/^free\($ptr\)$callers$/) { + freed($., $1, $2); + } elsif (/^segment: $ptr-$ptr $ptr [-r][-w][-x][sp] (.*)/) { + add_segment(hex($1), hex($2), hex($3), $4); + } else { + print "stdin:$.: syntax error\n"; + } +} +if (%blocks) { + my $n_blocks = scalar(keys(%blocks)); + my $n_bytes = 0; + $n_bytes += $_->{SIZE} foreach values(%blocks); + print "$n_bytes bytes in $n_blocks blocks not freed at end of run\n"; + my %blocks_by_callers; + foreach my $block (values(%blocks)) { + my ($trimmed_callers) = trim_callers($block->{CALLERS}); + push (@{$blocks_by_callers{$trimmed_callers}}, $block); + } + foreach my $callers (sort {@{$b} <=> @{$a}} (values(%blocks_by_callers))) { + $n_blocks = scalar(@{$callers}); + $n_bytes = 0; + $n_bytes += $_->{SIZE} foreach @{$callers}; + print "$n_bytes bytes in these $n_blocks blocks were not freed:\n"; + my $i = 0; + my $max = 5; + foreach my $block (sort {$a->{LINE} <=> $b->{LINE}} (@{$callers})) { + printf "\t%d-byte block at 0x%08x allocated on stdin:%d\n", + $block->{SIZE}, $block->{BASE}, $block->{LINE}; + last if $i++ > $max; + } + print "\t...and ", $n_blocks - $max, " others...\n" + if $n_blocks > $max; + print "The blocks listed above were allocated by:\n"; + print_callers("\t", ${$callers}[0]->{CALLERS}); + } +} +sub interp_pointer { + my ($s_ptr) = @_; + return $s_ptr eq '(nil)' ? 0 : hex($s_ptr); +} + +sub allocated { + my ($line, $s_base, $size, $callers) = @_; + my ($base) = interp_pointer($s_base); + return if !$base; + my ($info) = {LINE => $line, + BASE => $base, + SIZE => $size, + CALLERS => $callers}; + if (exists($blocks{$base})) { + print "In-use address returned by allocator:\n"; + print "\tInitial allocation:\n"; + print_block("\t\t", $blocks{$base}); + print "\tNew allocation:\n"; + print_block("\t\t", $info); + } + $blocks{$base} = $info; +} + +sub claimed { + my ($line, $s_base, $callers) = @_; + my ($base) = interp_pointer($s_base); + return if !$base; + if (exists($blocks{$base})) { + $blocks{$base}{LINE} = $line; + $blocks{$base}{CALLERS} = $callers; + } else { + printf "Claim asserted on not-in-use block 0x%08x by:\n", $base; + print_callers('', $callers); + } +} + +sub freed { + my ($line, $s_base, $callers) = @_; + my ($base) = interp_pointer($s_base); + return if !$base; + + if (!delete($blocks{$base})) { + printf "Bad free of not-allocated address 0x%08x on stdin:%d by:\n", $base, $line; + print_callers('', $callers); + } +} + +sub print_block { + my ($prefix, $info) = @_; + printf '%s%d-byte block at 0x%08x allocated on stdin:%d by:' . "\n", + $prefix, $info->{SIZE}, $info->{BASE}, $info->{LINE}; + print_callers($prefix, $info->{CALLERS}); +} + +sub print_callers { + my ($prefix, $callers) = @_; + foreach my $pc (split(' ', $callers)) { + print "$prefix\t", lookup_pc($pc), "\n"; + } +} + +our (%cache); +sub lookup_pc { + my ($s_pc) = @_; + if (defined($binary)) { + my ($pc) = hex($s_pc); + my ($output) = "$s_pc: "; + if (!exists($cache{$pc})) { + open(A2L, "$a2l -fe $binary --demangle $s_pc|"); + chomp(my $function = ); + chomp(my $line = ); + close(A2L); + if ($function eq '??') { + ($function, $line) = lookup_pc_by_segment($pc); + } + $line =~ s/^(\.\.\/)*//; + $line = "..." . substr($line, -25) if length($line) > 28; + $cache{$pc} = "$s_pc: $function ($line)"; + } + return $cache{$pc}; + } else { + return "$s_pc"; + } +} + +sub trim_callers { + my ($in) = @_; + my (@out); + foreach my $pc (split(' ', $in)) { + my $xlated = lookup_pc($pc); + if ($xlated =~ /\?\?/) { + push(@out, "...") if !@out || $out[$#out] ne '...'; + } else { + push(@out, $pc); + } + } + return join(' ', @out); +} + +sub search_path { + my ($target) = @_; + for my $dir (split (':', $ENV{PATH})) { + my ($file) = "$dir/$target"; + return $file if -e $file; + } + return undef; +} + +sub add_segment { + my ($vm_start, $vm_end, $vm_pgoff, $file) = @_; + for (my $i = 0; $i <= $#segments; $i++) { + my ($s) = $segments[$i]; + next if $vm_end <= $s->{START} || $vm_start >= $s->{END}; + if ($vm_start <= $s->{START} && $vm_end >= $s->{END}) { + splice(@segments, $i, 1); + --$i; + } else { + $s->{START} = $vm_end if $vm_end > $s->{START}; + $s->{END} = $vm_start if $vm_start <= $s->{END}; + } + } + push(@segments, {START => $vm_start, + END => $vm_end, + PGOFF => $vm_pgoff, + FILE => $file}); + @segments = sort { $a->{START} <=> $b->{START} } @segments; +} + +sub binary_search { + my ($array, $value) = @_; + my $l = 0; + my $r = $#{$array}; + while ($l <= $r) { + my $m = int(($l + $r) / 2); + my $e = $array->[$m]; + if ($value < $e->{START}) { + $r = $m - 1; + } elsif ($value >= $e->{END}) { + $l = $m + 1; + } else { + return $e; + } + } + return undef; +} + +sub read_sections { + my ($file) = @_; + my (@sections); + open(OBJDUMP, "$objdump -h $file|"); + while () { + my $ptr = "([0-9a-fA-F]+)"; + my ($name, $size, $vma, $lma, $file_off) + = /^\s*\d+\s+(\S+)\s+$ptr\s+$ptr\s+$ptr\s+$ptr/ + or next; + push(@sections, {START => hex($file_off), + END => hex($file_off) + hex($size), + NAME => $name}); + } + close(OBJDUMP); + return [sort { $a->{START} <=> $b->{START} } @sections ]; +} + +our %file_to_sections; +sub segment_to_section { + my ($file, $file_offset) = @_; + if (!defined($file_to_sections{$file})) { + $file_to_sections{$file} = read_sections($file); + } + return binary_search($file_to_sections{$file}, $file_offset); +} + +sub address_to_segment { + my ($pc) = @_; + return binary_search(\@segments, $pc); +} + +sub lookup_pc_by_segment { + return ('??', 0) if !defined($objdump); + + my ($pc) = @_; + my ($segment) = address_to_segment($pc); + return ('??', 0) if !defined($segment) || $segment->{FILE} eq ''; + + my ($file_offset) = $pc - $segment->{START} + $segment->{PGOFF}; + my ($section) = segment_to_section($segment->{FILE}, $file_offset); + return ('??', 0) if !defined($section); + + my ($section_offset) = $file_offset - $section->{START}; + open(A2L, sprintf("%s -fe %s --demangle --section=$section->{NAME} 0x%x|", + $a2l, $segment->{FILE}, $section_offset)); + chomp(my $function = ); + chomp(my $line = ); + close(A2L); + + return ($function, $line); +} + +# Local Variables: +# mode: perl +# End: diff --git a/utilities/ovs-pki-cgi.in b/utilities/ovs-pki-cgi.in new file mode 100755 index 00000000..837b3f92 --- /dev/null +++ b/utilities/ovs-pki-cgi.in @@ -0,0 +1,41 @@ +#! @PERL@ + +use CGI; +use Digest::SHA1; +use Fcntl; + +$CGI::POST_MAX = 65536; # Limit POSTs to 64 kB. + +use strict; +use warnings; + +my $pkidir = '@PKIDIR@'; +my $q = new CGI; + +die unless $q->request_method() eq 'POST'; + +my $type = $q->param('type'); +die unless defined $type; +die unless $type eq 'switch' or $type eq 'controller'; + +my $req = $q->param('req'); +die unless defined $req; +die unless $req =~ /^-----BEGIN CERTIFICATE REQUEST-----$/m; +die unless $req =~ /^-----END CERTIFICATE REQUEST-----$/m; + +my $digest = Digest::SHA1::sha1_hex($req); +my $incoming = "$pkidir/${type}ca/incoming"; +my $dst = "$incoming/$digest-req.pem"; + +sysopen(REQUEST, "$dst.tmp", O_RDWR | O_CREAT | O_EXCL, 0600) + or die "sysopen $dst.tmp: $!"; +print REQUEST $req; +close(REQUEST) or die "close $dst.tmp: $!"; + +rename("$dst.tmp", $dst) or die "rename $dst.tmp to $dst: $!"; + +print $q->header('text/html', '204 No response'); + +# Local Variables: +# mode: perl +# End: diff --git a/utilities/ovs-pki.8.in b/utilities/ovs-pki.8.in new file mode 100644 index 00000000..35bf0ea4 --- /dev/null +++ b/utilities/ovs-pki.8.in @@ -0,0 +1,323 @@ +.TH ovs\-pki 8 "May 2008" "Open vSwitch" "Open vSwitch Manual" + +.SH NAME +ovs\-pki \- OpenFlow public key infrastructure management utility + +.SH SYNOPSIS +\fBovs\-pki\fR [\fIOPTIONS\fR] \fICOMMAND\fR [\fIARGS\fR] +.sp +Stand\-alone commands with their arguments: +.br +\fBovs\-pki\fR \fBinit\fR +.br +\fBovs\-pki\fR \fBreq\fR \fINAME\fR +.br +\fBovs\-pki\fR \fBsign\fR \fINAME\fR [\fITYPE\fR] +.br +\fBovs\-pki\fR \fBreq+sign\fR \fINAME\fR [\fITYPE\fR] +.br +\fBovs\-pki\fR \fBverify\fR \fINAME\fR [\fITYPE\fR] +.br +\fBovs\-pki\fR \fBfingerprint\fR \fIFILE\fR +.br +\fBovs\-pki\fR \self-sign\fR \fINAME\fR +.sp +The following additional commands manage an online PKI: +.br +\fBovs\-pki\fR \fBls\fR [\fIPREFIX\fR] [\fITYPE\fR] +.br +\fBovs\-pki\fR \fBflush\fR [\fITYPE\fR] +.br +\fBovs\-pki\fR \fBreject\fR \fIPREFIX\fR [\fITYPE\fR] +.br +\fBovs\-pki\fR \fBapprove\fR \fIPREFIX\fR [\fITYPE\fR] +.br +\fBovs\-pki\fR \fBprompt\fR [\fITYPE\fR] +.br +\fBovs\-pki\fR \fBexpire\fR [\fIAGE\fR] +.sp +Each \fITYPE\fR above is a certificate type, either \fBswitch\fR +(default) or \fBcontroller\fR. +.sp +The available options are: +.br +[\fB\-k\fR \fItype\fR | \fB\-\^\-key=\fItype\fR] +[\fB\-B\fR \fInbits\fR | \fB\-\^\-bits=\fInbits\fR] +[\fB\-D\fR \fIfile\fR | \fB\-\^\-dsaparam=\fIfile\fR] +[\fB\-b\fR | \fB\-\^\-batch\fR] +[\fB\-f\fR | \fB\-\^\-force\fR] +[\fB\-d\fR \fIdir\fR | \fB\-\^\-dir=\fR\fIdir\fR] +[\fB\-l\fR \fIfile\fR | \fB\-\^\-log=\fIfile\fR] +[\fB\-h\fR | \fB\-\^\-help\fR] +.br +Some options do not apply to every command. + +.SH DESCRIPTION +The \fBovs\-pki\fR program sets up and manages a public key +infrastructure for use with OpenFlow. It is intended to be a simple +interface for organizations that do not have an established public key +infrastructure. Other PKI tools can substitute for or supplement the +use of \fBovs\-pki\fR. + +\fBovs\-pki\fR uses \fBopenssl\fR(1) for certificate management and key +generation. + +.SH "OFFLINE COMMANDS" + +The following \fBovs\-pki\fR commands support manual PKI +administration: + +.TP +\fBinit\fR +Initializes a new PKI (by default in directory \fB@PKIDIR@\fR) and populates +it with a pair of certificate authorities for controllers and +switches. + +This command should ideally be run on a high\-security machine separate +from any OpenFlow controller or switch, called the CA machine. The +files \fBpki/controllerca/cacert.pem\fR and +\fBpki/switchca/cacert.pem\fR that it produces will need to be copied +over to the OpenFlow switches and controllers, respectively. Their +contents may safely be made public. + +By default, \fBovs\-pki\fR generates 2048\-bit RSA keys. The \fB\-B\fR +or \fB\-\^\-bits\fR option (see below) may be used to override the key +length. The \fB\-k dsa\fR or \fB\-\^\-key=dsa\fR option may be used to use +DSA in place of RSA. If DSA is selected, the \fBdsaparam.pem\fR file +generated in the new PKI hierarchy must be copied to any machine on +which the \fBreq\fR command (see below) will be executed. Its +contents may safely be made public. + +Other files generated by \fBinit\fR may remain on the CA machine. +The files \fBpki/controllerca/private/cakey.pem\fR and +\fBpki/switchca/private/cakey.pem\fR have particularly sensitive +contents that should not be exposed. + +.TP +\fBreq\fR \fINAME\fR +Generates a new private key named \fINAME\fR\fB\-privkey.pem\fR and +corresponding certificate request named \fINAME\fR\fB\-req.pem\fR. +The private key can be intended for use by a switch or a controller. + +This command should ideally be run on the switch or controller that +will use the private key to identify itself. The file +\fINAME\fR\fB\-req.pem\fR must be copied to the CA machine for signing +with the \fBsign\fR command (below). + +This command will output a fingerprint to stdout as its final step. +Write down the fingerprint and take it to the CA machine before +continuing with the \fBsign\fR step. + +When RSA keys are in use (as is the default), \fBreq\fR, unlike the +rest of \fBovs\-pki\fR's commands, does not need access to a PKI +hierarchy created by \fBovs\-pki init\fR. The \fB\-B\fR or +\fB\-\^\-bits\fR option (see below) may be used to specify the number of +bits in the generated RSA key. + +When DSA keys are used (as specified with \fB\-\^\-key=dsa\fR), \fBreq\fR +needs access to the \fBdsaparam.pem\fR file created as part of the PKI +hierarchy (but not to other files in that tree). By default, +\fBovs\-pki\fR looks for this file in \fB@PKIDIR@/dsaparam.pem\fR, but +the \fB\-D\fR or \fB\-\^\-dsaparam\fR option (see below) may be used to +specify an alternate location. + +\fINAME\fR\fB\-privkey.pem\fR has sensitive contents that should not be +exposed. \fINAME\fR\fB\-req.pem\fR may be safely made public. + +.TP +\fBsign\fR \fINAME\fR [\fITYPE\fR] +Signs the certificate request named \fINAME\fR\fB\-req.pem\fR that was +produced in the previous step, producing a certificate named +\fINAME\fR\fB\-cert.pem\fR. \fITYPE\fR, either \fBswitch\fR (default) or +\fBcontroller\fR, indicates the use for which the key is being +certified. + +This command must be run on the CA machine. + +The command will output a fingerprint to stdout and request that you +verify that it is the same fingerprint output by the \fBreq\fR +command. This ensures that the request being signed is the same one +produced by \fBreq\fR. (The \fB\-b\fR or \fB\-\^\-batch\fR option +suppresses the verification step.) + +The file \fINAME\fR\fB\-cert.pem\fR will need to be copied back to the +switch or controller for which it is intended. Its contents may +safely be made public. + +.TP +\fBreq+sign\fR \fINAME\fR [\fITYPE\fR] +Combines the \fBreq\fR and \fBsign\fR commands into a single step, +outputting all the files produced by each. The +\fINAME\fR\fB\-privkey.pem\fR and \fINAME\fR\fB\-cert.pem\fR files must +be copied securely to the switch or controller. +\fINAME\fR\fB\-privkey.pem\fR has sensitive contents and must not be +exposed in transit. Afterward, it should be deleted from the CA +machine. + +This combined method is, theoretically, less secure than the +individual steps performed separately on two different machines, +because there is additional potential for exposure of the private +key. However, it is also more convenient. + +.TP +\fBverify\fR \fINAME\fR [\fITYPE\fR] +Verifies that \fINAME\fR\fB\-cert.pem\fR is a valid certificate for the +given \fITYPE\fR of use, either \fBswitch\fR (default) or +\fBcontroller\fR. If the certificate is valid for this use, it prints +the message ``\fINAME\fR\fB\-cert.pem\fR: OK''; otherwise, it prints an +error message. + +.TP +\fBfingerprint\fR \fIFILE\fR +Prints the fingerprint for \fIFILE\fR. If \fIFILE\fR is a +certificate, then this is the SHA\-1 digest of the DER encoded version +of the certificate; otherwise, it is the SHA\-1 digest of the entire +file. + +.TP +\fBself-sign\fR \fINAME\fR +Signs the certificate request named \fINAME\fB\-req.pem\fR using the +private key \fINAME\fB-privkey.pem\fR, producing a self-signed +certificate named \fINAMEfB\-cert.pem\fR. The input files should have +been produced with \fBovs\-pki req\fR. + +Some controllers accept such self-signed certificates. + +.SH "ONLINE COMMANDS" + +An OpenFlow PKI can be administered online, in conjunction with +.BR ovs\-pki\-cgi (8) +and a web server such as Apache: + +.IP \(bu +The web server exports the contents of the PKI via HTTP. All files in +a PKI hierarchy files may be made public, except for the files +\fBpki/controllerca/private/cakey.pem\fR and +\fBpki/switchca/private/cakey.pem\fR, which must not be exposed. + +.IP \(bu +\fBovs\-pki\-cgi\fR allows newly generated certificate requests for +controllers and switches to be uploaded into the +\fBpki/controllerca/incoming\fR and \fBpki/switchca/incoming\fR +directories, respectively. Uploaded certificate requests are stored +in those directories under names of the form +\fIFINGERPRINT\fB\-req.pem\fR, which \fIFINGERPRINT\fR is the SHA\-1 +hash of the file. + +.IP \(bu +These \fBovs\-pki\fR commands allow incoming certificate requests to +be approved or rejected, in a form are suitable for use by humans or +other software. + +.PP +The following \fBovs\-pki\fR commands support online administration: + +.TP +\fBovs\-pki\fR \fBls\fR [\fIPREFIX\fR] [\fITYPE\fR] +Lists all of the incoming certificate requests of the given \fITYPE\fR +(either \fBswitch\fR, the default, or \fBcontroller\fR). If +\fIPREFIX\fR, which must be at least 4 characters long, is specified, +it causes the list to be limited to files whose names begin with +\fIPREFIX\fR. This is useful, for example, to avoid typing in an +entire fingerprint when checking that a specific certificate request +has been received. + +.TP +\fBovs\-pki\fR \fBflush\fR [\fITYPE\fR] +Deletes all certificate requests of the given \fITYPE\fR. + +.TP +\fBovs\-pki\fR \fBreject\fR \fIPREFIX\fR [\fITYPE\fR] +Rejects the certificate request whose name begins with \fIPREFIX\fR, +which must be at least 4 characters long, of the given type (either +\fBswitch\fR, the default, or \fBcontroller\fR). \fIPREFIX\fR must +match exactly one certificate request; its purpose is to allow the +user to type fewer characters, not to match multiple certificate +requests. + +.TP +\fBovs\-pki\fR \fBapprove\fR \fIPREFIX\fR [\fITYPE\fR] +Approves the certificate request whose name begins with \fIPREFIX\fR, +which must be at least 4 characters long, of the given \fITYPE\fR +(either \fBswitch\fR, the default, or \fBcontroller\fR). \fIPREFIX\fR +must match exactly one certificate request; its purpose is to allow +the user to type fewer characters, not to match multiple certificate +requests. + +The command will output a fingerprint to stdout and request that you +verify that it is correct. (The \fB\-b\fR or \fB\-\^\-batch\fR option +suppresses the verification step.) + +.TP +\fBovs\-pki\fR \fBprompt\fR [\fITYPE\fR] +Prompts the user for each incoming certificate request of the given +\fITYPE\fR (either \fBswitch\fR, the default, or \fBcontroller\fR). +Based on the certificate request's fingerprint, the user is given the +option of approving, rejecting, or skipping the certificate request. + +.TP +\fBovs\-pki\fR \fBexpire\fR [\fIAGE\fR] + +Rejects all the incoming certificate requests, of either type, that is +older than \fIAGE\fR, which must in one of the forms \fIN\fBs\fR, +\fIN\fBmin\fR, \fIN\fBh\fR, \fIN\fBday\fR. The default is \fB1day\fR. + +.SH OPTIONS +.TP +\fB\-k\fR \fItype\fR | \fB\-\^\-key=\fItype\fR +For the \fBinit\fR command, sets the public key algorithm to use for +the new PKI hierarchy. For the \fBreq\fR and \fBreq+sign\fR commands, +sets the public key algorithm to use for the key to be generated, +which must match the value specified on \fBinit\fR. With other +commands, the value has no effect. + +The \fItype\fR may be \fBrsa\fR (the default) or \fBdsa\fR. + +.TP +\fB\-B\fR \fInbits\fR | \fB\-\^\-bits=\fInbits\fR +Sets the number of bits in the key to be generated. When RSA keys are +in use, this option affects only the \fBinit\fR, \fBreq\fR, and +\fBreq+sign\fR commands, and the same value should be given each time. +With DSA keys are in use, this option affects only the \fBinit\fR +command. + +The value must be at least 1024. The default is 2048. + +.TP +\fB\-D\fR \fIfile\fR | \fB\-\^\-dsaparam=\fIfile\fR +Specifies an alternate location for the \fBdsaparam.pem\fR file +required by the \fBreq\fR and \fBreq+sign\fR commands. This option +affects only these commands, and only when DSA keys are used. + +The default is \fBdsaparam.pem\fR under the PKI hierarchy. + +.TP +\fB\-b\fR | \fB\-\^\-batch\fR +Suppresses the interactive verification of fingerprints that the +\fBsign\fR and \fBapprove\fR commands by default require. + +.TP +\fB\-d\fR \fIdir\fR | \fB\-\^\-dir=\fR\fIdir\fR +Specifies the location of the PKI hierarchy to be used or created by +the command (default: \fB@PKIDIR@\fR). All commands, except \fBreq\fR, +need access to a PKI hierarchy. + +.TP +\fB\-f\fR | \fB\-\^\-force\fR +By default, \fBovs\-pki\fR will not overwrite existing files or +directories. This option overrides this behavior. + +.TP +\fB\-l\fR \fIfile\fR | \fB\-\^\-log=\fIfile\fR +Sets the log file to \fIfile\fR. Default: +\fB@LOGDIR@/ovs\-pki.log\fR. + +.TP +\fB\-h\fR | \fB\-\^\-help\fR +Prints a help usage message and exits. + +.SH "SEE ALSO" + +.BR controller (8), +.BR ovs\-pki\-cgi (8), +.BR secchan (8) diff --git a/utilities/ovs-pki.in b/utilities/ovs-pki.in new file mode 100755 index 00000000..15ac17b9 --- /dev/null +++ b/utilities/ovs-pki.in @@ -0,0 +1,582 @@ +#! /bin/sh + +set -e + +pkidir='@PKIDIR@' +command= +prev= +force=no +batch=no +log='@LOGDIR@/ovs-pki.log' +keytype=rsa +bits=2048 +for option; do + # This option-parsing mechanism borrowed from a Autoconf-generated + # configure script under the following license: + + # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, + # 2002, 2003, 2004, 2005, 2006, 2009 Free Software Foundation, Inc. + # This configure script is free software; the Free Software Foundation + # gives unlimited permission to copy, distribute and modify it. + + # If the previous option needs an argument, assign it. + if test -n "$prev"; then + eval $prev=\$option + prev= + continue + fi + case $option in + *=*) optarg=`expr "X$option" : '[^=]*=\(.*\)'` ;; + *) optarg=yes ;; + esac + + case $dashdash$option in + --) + dashdash=yes ;; + -h|--help) + cat <&2 + exit 1 + ;; + *) + if test -z "$command"; then + command=$option + elif test -z "${arg1+set}"; then + arg1=$option + elif test -z "${arg2+set}"; then + arg2=$option + else + echo "$option: only two arguments may be specified" >&2 + exit 1 + fi + ;; + esac + shift +done +if test -n "$prev"; then + option=--`echo $prev | sed 's/_/-/g'` + { echo "$as_me: error: missing argument to $option" >&2 + { (exit 1); exit 1; }; } +fi +if test -z "$command"; then + echo "$0: missing command name; use --help for help" >&2 + exit 1 +fi +if test "$keytype" != rsa && test "$keytype" != dsa; then + echo "$0: argument to -k or --key must be rsa or dsa" + exit 1 +fi +if test "$bits" -lt 1024; then + echo "$0: argument to -B or --bits must be at least 1024" + exit 1 +fi +if test -z "$dsaparam"; then + dsaparam=$pkidir/dsaparam.pem +fi +case $log in + /*) ;; + *) $log="$PWD/$log" ;; +esac + +if test "$command" = "init"; then + if test -e "$pkidir" && test "$force" != "yes"; then + echo "$0: $pkidir already exists and --force not specified" >&2 + exit 1 + fi + + if test ! -d "$pkidir"; then + mkdir -p "$pkidir" + fi + cd "$pkidir" + exec 3>>$log + + if test $keytype = dsa && test ! -e dsaparam.pem; then + echo "Generating DSA parameters, please wait..." >&2 + openssl dsaparam -out dsaparam.pem $bits 1>&3 2>&3 + fi + + # Create the CAs. + for ca in controllerca switchca; do + echo "Creating $ca..." >&2 + oldpwd=$PWD + mkdir -p $ca + cd $ca + + mkdir -p certs crl newcerts + mkdir -p -m 0700 private + mkdir -p -m 0733 incoming + touch index.txt + test -e crlnumber || echo 01 > crlnumber + test -e serial || echo 01 > serial + + # Put DSA parameters in directory. + if test $keytype = dsa && test ! -e dsaparam.pem; then + cp ../dsaparam.pem . + fi + + # Write CA configuration file. + if test ! -e ca.cnf; then + sed "s/@ca@/$ca/g" > ca.cnf <<'EOF' +[ req ] +prompt = no +distinguished_name = req_distinguished_name + +[ req_distinguished_name ] +C = US +ST = CA +L = Palo Alto +O = Open vSwitch +OU = @ca@ +CN = Open vSwitch @ca@ CA Certificate + +[ ca ] +default_ca = the_ca + +[ the_ca ] +dir = . # top dir +database = $dir/index.txt # index file. +new_certs_dir = $dir/newcerts # new certs dir +certificate = $dir/cacert.pem # The CA cert +serial = $dir/serial # serial no file +private_key = $dir/private/cakey.pem# CA private key +RANDFILE = $dir/private/.rand # random number file +default_days = 365 # how long to certify for +default_crl_days= 30 # how long before next CRL +default_md = md5 # md to use +policy = policy # default policy +email_in_dn = no # Don't add the email into cert DN +name_opt = ca_default # Subject name display option +cert_opt = ca_default # Certificate display option +copy_extensions = none # Don't copy extensions from request + +# For the CA policy +[ policy ] +countryName = optional +stateOrProvinceName = optional +organizationName = match +organizationalUnitName = optional +commonName = supplied +emailAddress = optional +EOF + fi + + # Create certificate authority. + if test $keytype = dsa; then + newkey=dsa:dsaparam.pem + else + newkey=rsa:$bits + fi + openssl req -config ca.cnf -nodes \ + -newkey $newkey -keyout private/cakey.pem -out careq.pem \ + 1>&3 2>&3 + openssl ca -config ca.cnf -create_serial -out cacert.pem \ + -days 1095 -batch -keyfile private/cakey.pem -selfsign \ + -infiles careq.pem 1>&3 2>&3 + chmod 0700 private/cakey.pem + + cd "$oldpwd" + done + exit 0 +fi + +one_arg() { + if test -z "$arg1" || test -n "$arg2"; then + echo "$0: $command must have exactly one argument; use --help for help" >&2 + exit 1 + fi +} + +zero_or_one_args() { + if test -n "$arg2"; then + echo "$0: $command must have zero or one arguments; use --help for help" >&2 + exit 1 + fi +} + +one_or_two_args() { + if test -z "$arg1"; then + echo "$0: $command must have one or two arguments; use --help for help" >&2 + exit 1 + fi +} + +must_not_exist() { + if test -e "$1" && test "$force" != "yes"; then + echo "$0: $1 already exists and --force not supplied" >&2 + exit 1 + fi +} + +resolve_prefix() { + test -n "$type" || exit 123 # Forgot to call check_type? + + case $1 in + ????*) + ;; + *) + echo "Prefix $arg1 is too short (less than 4 hex digits)" + exit 0 + ;; + esac + + fingerprint=$(cd "$pkidir/${type}ca/incoming" && echo "$1"*-req.pem | sed 's/-req\.pem$//') + case $fingerprint in + "${1}*") + echo "No certificate requests matching $1" + exit 1 + ;; + *" "*) + echo "$1 matches more than one certificate request:" + echo $fingerprint | sed 's/ /\ +/g' + exit 1 + ;; + *) + # Nothing to do. + ;; + esac + req="$pkidir/${type}ca/incoming/$fingerprint-req.pem" + cert="$pkidir/${type}ca/certs/$fingerprint-cert.pem" +} + +make_tmpdir() { + TMP=/tmp/ovs-pki.tmp$$ + rm -rf $TMP + trap "rm -rf $TMP" 0 + mkdir -m 0700 $TMP +} + +fingerprint() { + local file=$1 + local name=${1-$2} + local date=$(date -r $file) + local fingerprint + if grep -q -e '-BEGIN CERTIFICATE-' "$file"; then + fingerprint=$(openssl x509 -noout -in "$file" -fingerprint | + sed 's/SHA1 Fingerprint=//' | tr -d ':') + else + fingerprint=$(sha1sum "$file" | awk '{print $1}') + fi + printf "$name\\t$date\\n" + case $file in + $fingerprint*) + printf "\\t(correct fingerprint in filename)\\n" + ;; + *) + printf "\\tfingerprint $fingerprint\\n" + ;; + esac +} + +verify_fingerprint() { + fingerprint "$@" + if test $batch != yes; then + echo "Does fingerprint match? (yes/no)" + read answer + if test "$answer" != yes; then + echo "Match failure, aborting" >&2 + exit 1 + fi + fi +} + +check_type() { + if test x = x"$1"; then + type=switch + elif test "$1" = switch || test "$1" = controller; then + type=$1 + else + echo "$0: type argument must be 'switch' or 'controller'" >&2 + exit 1 + fi +} + +parse_age() { + number=$(echo $1 | sed 's/^\([0-9]\+\)\([[:alpha:]]\+\)/\1/') + unit=$(echo $1 | sed 's/^\([0-9]\+\)\([[:alpha:]]\+\)/\2/') + case $unit in + s) + factor=1 + ;; + min) + factor=60 + ;; + h) + factor=3600 + ;; + day) + factor=86400 + ;; + *) + echo "$1: age not in the form Ns, Nmin, Nh, Nday (e.g. 1day)" >&2 + exit 1 + ;; + esac + echo $(($number * $factor)) +} + +must_exist() { + if test ! -e "$1"; then + echo "$0: $1 does not exist" >&2 + exit 1 + fi +} + +pkidir_must_exist() { + if test ! -e "$pkidir"; then + echo "$0: $pkidir does not exist (need to run 'init' or use '--dir'?)" >&2 + exit 1 + elif test ! -d "$pkidir"; then + echo "$0: $pkidir is not a directory" >&2 + exit 1 + fi +} + +make_request() { + must_not_exist "$arg1-privkey.pem" + must_not_exist "$arg1-req.pem" + make_tmpdir + cat > "$TMP/req.cnf" <&3 2>&3 +} + +sign_request() { + must_exist "$1" + must_not_exist "$2" + pkidir_must_exist + + (cd "$pkidir/${type}ca" && + openssl ca -config ca.cnf -batch -in /dev/stdin) \ + < "$1" > "$2.tmp$$" 2>&3 + mv "$2.tmp$$" "$2" +} + +glob() { + local files=$(echo $1) + if test "$files" != "$1"; then + echo "$files" + fi +} + +exec 3>>$log || true +if test "$command" = req; then + one_arg + + make_request "$arg1" + fingerprint "$arg1-req.pem" +elif test "$command" = sign; then + one_or_two_args + check_type "$arg2" + verify_fingerprint "$arg1-req.pem" + + sign_request "$arg1-req.pem" "$arg2-cert.pem" +elif test "$command" = req+sign; then + one_or_two_args + check_type "$arg2" + + pkidir_must_exist + make_request "$arg1" + sign_request "$arg1-req.pem" "$arg1-cert.pem" + fingerprint "$arg1-req.pem" +elif test "$command" = verify; then + one_or_two_args + must_exist "$arg1-cert.pem" + check_type "$arg2" + + pkidir_must_exist + openssl verify -CAfile "$pkidir/${type}ca/cacert.pem" "$arg1-cert.pem" +elif test "$command" = fingerprint; then + one_arg + + fingerprint "$arg1" +elif test "$command" = self-sign; then + one_arg + must_exist "$arg1-req.pem" + must_exist "$arg1-privkey.pem" + must_not_exist "$arg1-cert.pem" + + openssl x509 -in "$arg1-req.pem" -out "$arg1-cert.pem" \ + -signkey "$arg1-privkey.pem" -req -text 2>&3 +elif test "$command" = ls; then + check_type "$arg2" + + cd "$pkidir/${type}ca/incoming" + for file in $(glob "$arg1*-req.pem"); do + fingerprint $file + done +elif test "$command" = flush; then + check_type "$arg1" + + rm -f "$pkidir/${type}ca/incoming/"* +elif test "$command" = reject; then + one_or_two_args + check_type "$arg2" + resolve_prefix "$arg1" + + rm -f "$req" +elif test "$command" = approve; then + one_or_two_args + check_type "$arg2" + resolve_prefix "$arg1" + + make_tmpdir + cp "$req" "$TMP/$req" + verify_fingerprint "$TMP/$req" + sign_request "$TMP/$req" + rm -f "$req" "$TMP/$req" +elif test "$command" = prompt; then + zero_or_one_args + check_type "$arg1" + + make_tmpdir + cd "$pkidir/${type}ca/incoming" + for req in $(glob "*-req.pem"); do + cp "$req" "$TMP/$req" + + cert=$(echo "$pkidir/${type}ca/certs/$req" | + sed 's/-req.pem/-cert.pem/') + if test -f $cert; then + echo "Request $req already approved--dropping duplicate request" + rm -f "$req" "$TMP/$req" + continue + fi + + echo + echo + fingerprint "$TMP/$req" "$req" + printf "Disposition for this request (skip/approve/reject)? " + read answer + case $answer in + approve) + echo "Approving $req" + sign_request "$TMP/$req" "$cert" + rm -f "$req" "$TMP/$req" + ;; + r*) + echo "Rejecting $req" + rm -f "$req" "$TMP/$req" + ;; + *) + echo "Skipping $req" + ;; + esac + done +elif test "$command" = expire; then + zero_or_one_args + cutoff=$(($(date +%s) - $(parse_age ${arg1-1day}))) + for type in switch controller; do + cd "$pkidir/${type}ca/incoming" || exit 1 + for file in $(glob "*"); do + time=$(date -r "$file" +%s) + if test "$time" -lt "$cutoff"; then + rm -f "$file" + fi + done + done +else + echo "$0: $command command unknown; use --help for help" >&2 + exit 1 +fi diff --git a/utilities/ovs-wdt.c b/utilities/ovs-wdt.c new file mode 100644 index 00000000..3c5d797c --- /dev/null +++ b/utilities/ovs-wdt.c @@ -0,0 +1,263 @@ +/* Copyright (c) 2008, 2009 Nicira Networks, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Default values for the interval and timer. In seconds. */ +#define DEFAULT_INTERVAL 1 +#define DEFAULT_TIMEOUT 30 + +int fd = -1; + +/* The WDT is automatically enabled when /dev/watchdog is opened. If we + * do not send the magic value to the device first before exiting, the + * system will reboot. This function allows the program to exit without + * causing a reboot. + */ +static void +cleanup(void) +{ + if (fd == -1) { + return; + } + + /* Writing the magic value "V" to the device is an indication that + * the device is about to be closed. This causes the watchdog to be + * disabled after the call to close. + */ + if (write(fd, "V", 1) != 1) { + fprintf(stderr, "Couldn't write magic val: %d\n", errno); + return; + } + close(fd); + fd = -1; +} + + +/* If we receive a SIGINT, cleanup first, which will disable the + * watchdog timer. + */ +static void +sighandler(int signum) +{ + cleanup(); + signal(signum, SIG_DFL); + raise(signum); +} + +static void +setup_signal(void) +{ + struct sigaction action; + + action.sa_handler = sighandler; + sigemptyset(&action.sa_mask); + action.sa_flags = 0; + + if (sigaction(SIGINT, &action, NULL) != 0) { + fprintf(stderr, "Problem setting up SIGINT handler...\n"); + } + if (sigaction(SIGTERM, &action, NULL) != 0) { + fprintf(stderr, "Problem setting up SIGTERM handler...\n"); + } +} + + +/* Print information on the WDT hardware */ +static void +print_wdt_info(void) +{ + struct watchdog_info ident; + + if (ioctl(fd, WDIOC_GETSUPPORT, &ident) == -1) { + fprintf(stderr, "Couldn't get version: %d\n", errno); + cleanup(); + exit(-1); + } + printf("identity: %s, ver: %d, opt: %#x\n", ident.identity, + ident.firmware_version, ident.options); +} + + +static void +print_help(char *progname) +{ + printf("%s: Watchdog timer utility\n", progname); + printf("usage: %s [OPTIONS]\n\n", progname); + printf("Options:\n"); + printf(" -t, --timeout=SECS expiration time of WDT (default: %d)\n", + DEFAULT_TIMEOUT); + printf(" -i, --interval=SECS interval to send keep-alives (default: %d)\n", + DEFAULT_INTERVAL); + printf(" -d, --disable disable the WDT and exit\n"); + printf(" -h, --help display this help message\n"); + printf(" -v, --verbose enable verbose printing\n"); + printf(" -V, --version display version information of WDT and exit\n"); +} + + +int main(int argc, char *argv[]) +{ + int arg; + int optc; + int verbose = 0; + int interval = DEFAULT_INTERVAL; + int timeout = DEFAULT_TIMEOUT; + static struct option const longopts[] = + { + {"timeout", required_argument, NULL, 't'}, + {"interval", required_argument, NULL, 'i'}, + {"disable", no_argument, NULL, 'd'}, + {"help", no_argument, NULL, 'h'}, + {"verbose", no_argument, NULL, 'v'}, + {"version", no_argument, NULL, 'V'}, + {0, 0, 0, 0} + }; + + setup_signal(); + + fd = open("/dev/watchdog", O_RDWR); + if (fd == -1) { + fprintf(stderr, "Couldn't open watchdog device: %s\n", strerror(errno)); + exit(-1); + } + + while ((optc = getopt_long(argc, argv, "t:i:dh?vV", longopts, NULL)) != -1) { + switch (optc) { + case 't': + timeout = strtol(optarg, NULL, 10); + if (!timeout) { + fprintf(stderr, "Invalid timeout: %s\n", optarg); + goto error; + } + break; + + case 'i': + interval = strtol(optarg, NULL, 10); + if (!interval) { + fprintf(stderr, "Invalid interval: %s\n", optarg); + goto error; + } + break; + + case 'd': + arg = WDIOS_DISABLECARD; + if (ioctl(fd, WDIOC_SETOPTIONS, &arg) == -1) { + fprintf(stderr, "Couldn't disable: %d\n", errno); + goto error; + } + cleanup(); + exit(0); + break; + + case 'h': + print_help(argv[0]); + cleanup(); + exit(0); + break; + + case 'v': + verbose = 1; + break; + + case 'V': + print_wdt_info(); + cleanup(); + exit(0); + break; + + default: + print_help(argv[0]); + goto error; + break; + } + } + + argc -= optind; + argv += optind; + + /* Sanity-check the arguments */ + if (argc != 0) { + fprintf(stderr, "Illegal argument: %s\n", argv[0]); + goto error; + } + + if (verbose) { + print_wdt_info(); + printf("timeout: %d, interval: %d\n", timeout, interval); + } + + /* Prevent the interval being greater than the timeout, since it + * will always cause a reboot. + */ + if (interval > timeout) { + fprintf(stderr, "Interval greater than timeout: %d > %d\n", + interval, timeout); + goto error; + } + + /* Always set the timeout */ + if (ioctl(fd, WDIOC_SETTIMEOUT, &timeout) == -1) { + fprintf(stderr, "Couldn't set timeout: %d\n", errno); + goto error; + } + + /* Loop and send a keep-alive every "interval" seconds */ + while (1) { + if (verbose) { + if (ioctl(fd, WDIOC_GETTIMELEFT, &arg) == -1) { + fprintf(stderr, "Couldn't get time left: %d\n", errno); + goto error; + } + printf("Sending keep alive, time remaining: %d\n", arg); + } + + /* Send a keep-alive. The argument is ignored */ + if (ioctl(fd, WDIOC_KEEPALIVE, &arg) == -1) { + fprintf(stderr, "Couldn't keepalive: %d\n", errno); + goto error; + } + + sleep(interval); + } + + /* Never directly reached... */ +error: + cleanup(); + exit(-1); +} diff --git a/vswitchd/.gitignore b/vswitchd/.gitignore new file mode 100644 index 00000000..01d57ae7 --- /dev/null +++ b/vswitchd/.gitignore @@ -0,0 +1,7 @@ +/Makefile +/Makefile.in +/ovs-brcompatd +/ovs-brcompatd.8 +/ovs-vswitchd +/ovs-vswitchd.8 +/ovs-vswitchd.conf.5 diff --git a/vswitchd/automake.mk b/vswitchd/automake.mk new file mode 100644 index 00000000..6883731e --- /dev/null +++ b/vswitchd/automake.mk @@ -0,0 +1,40 @@ +sbin_PROGRAMS += vswitchd/ovs-vswitchd vswitchd/ovs-brcompatd +man_MANS += \ + vswitchd/ovs-vswitchd.conf.5 \ + vswitchd/ovs-vswitchd.8 \ + vswitchd/ovs-brcompatd.8 +DISTCLEANFILES += \ + vswitchd/ovs-vswitchd.conf.5 \ + vswitchd/ovs-vswitchd.8 \ + vswitchd/ovs-brcompatd.8 + +vswitchd_ovs_vswitchd_SOURCES = \ + vswitchd/bridge.c \ + vswitchd/bridge.h \ + vswitchd/mgmt.c \ + vswitchd/mgmt.h \ + vswitchd/port.c \ + vswitchd/port.h \ + vswitchd/proc-net-compat.c \ + vswitchd/proc-net-compat.h \ + vswitchd/ovs-vswitchd.c \ + vswitchd/ovs-vswitchd.h \ + vswitchd/xenserver.c \ + vswitchd/xenserver.h +vswitchd_ovs_vswitchd_LDADD = \ + secchan/libsecchan.a \ + lib/libopenvswitch.a \ + $(FAULT_LIBS) \ + $(SSL_LIBS) + +vswitchd_ovs_brcompatd_SOURCES = \ + vswitchd/ovs-brcompatd.c + +vswitchd_ovs_brcompatd_LDADD = \ + lib/libopenvswitch.a \ + $(FAULT_LIBS) + +EXTRA_DIST += \ + vswitchd/ovs-vswitchd.conf.5.in \ + vswitchd/ovs-vswitchd.8.in \ + vswitchd/ovs-brcompatd.8.in diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c new file mode 100644 index 00000000..cfd4dcf7 --- /dev/null +++ b/vswitchd/bridge.c @@ -0,0 +1,3058 @@ +/* Copyright (c) 2008, 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + * + */ + +#include +#include "bridge.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bitmap.h" +#include "cfg.h" +#include "coverage.h" +#include "dirs.h" +#include "dpif.h" +#include "dynamic-string.h" +#include "flow.h" +#include "hash.h" +#include "list.h" +#include "mac-learning.h" +#include "netdev.h" +#include "odp-util.h" +#include "ofp-print.h" +#include "ofpbuf.h" +#include "poll-loop.h" +#include "port-array.h" +#include "proc-net-compat.h" +#include "process.h" +#include "secchan/ofproto.h" +#include "socket-util.h" +#include "stp.h" +#include "svec.h" +#include "timeval.h" +#include "util.h" +#include "vconn.h" +#include "vconn-ssl.h" +#include "xenserver.h" +#include "xtoxll.h" + +#define THIS_MODULE VLM_bridge +#include "vlog.h" + +struct dst { + uint16_t vlan; + uint16_t dp_ifidx; +}; + +extern uint64_t mgmt_id; + +struct iface { + struct port *port; /* Containing port. */ + size_t port_ifidx; /* Index within containing port. */ + + char *name; /* Host network device name. */ + int dp_ifidx; /* Index within kernel datapath. */ + + uint8_t mac[ETH_ADDR_LEN]; /* Ethernet address (all zeros if unknowns). */ + + tag_type tag; /* Tag associated with this interface. */ + bool enabled; /* May be chosen for flows? */ + long long delay_expires; /* Time after which 'enabled' may change. */ +}; + +#define BOND_MASK 0xff +struct bond_entry { + int iface_idx; /* Index of assigned iface, or -1 if none. */ + uint64_t tx_bytes; /* Count of bytes recently transmitted. */ + tag_type iface_tag; /* Tag associated with iface_idx. */ +}; + +#define MAX_MIRRORS 32 +typedef uint32_t mirror_mask_t; +#define MIRROR_MASK_C(X) UINT32_C(X) +BUILD_ASSERT_DECL(sizeof(mirror_mask_t) * CHAR_BIT >= MAX_MIRRORS); +struct mirror { + struct bridge *bridge; + size_t idx; + char *name; + + /* Selection criteria. */ + struct svec src_ports; + struct svec dst_ports; + int *vlans; + size_t n_vlans; + + /* Output. */ + struct port *out_port; + int out_vlan; +}; + +#define FLOOD_PORT ((struct port *) 1) /* The 'flood' output port. */ +struct port { + struct bridge *bridge; + size_t port_idx; + int vlan; /* -1=trunk port, else a 12-bit VLAN ID. */ + unsigned long *trunks; /* Bitmap of trunked VLANs, if 'vlan' == -1. */ + char *name; + + /* An ordinary bridge port has 1 interface. + * A bridge port for bonding has at least 2 interfaces. */ + struct iface **ifaces; + size_t n_ifaces, allocated_ifaces; + + /* Bonding info. */ + struct bond_entry *bond_hash; /* An array of (BOND_MASK + 1) elements. */ + int active_iface; /* Ifidx on which bcasts accepted, or -1. */ + tag_type active_iface_tag; /* Tag for bcast flows. */ + tag_type no_ifaces_tag; /* Tag for flows when all ifaces disabled. */ + int updelay, downdelay; /* Delay before iface goes up/down, in ms. */ + + /* Port mirroring info. */ + mirror_mask_t src_mirrors; /* Mirrors triggered when packet received. */ + mirror_mask_t dst_mirrors; /* Mirrors triggered when packet sent. */ + bool is_mirror_output_port; /* Does port mirroring send frames here? */ + + /* Spanning tree info. */ + enum stp_state stp_state; /* Always STP_FORWARDING if STP not in use. */ + tag_type stp_state_tag; /* Tag for STP state change. */ +}; + +#define DP_MAX_PORTS 255 +struct bridge { + struct list node; /* Node in global list of bridges. */ + char *name; /* User-specified arbitrary name. */ + struct mac_learning *ml; /* MAC learning table, or null not to learn. */ + bool sent_config_request; /* Successfully sent config request? */ + uint8_t default_ea[ETH_ADDR_LEN]; /* Default MAC. */ + + /* Support for remote controllers. */ + char *controller; /* NULL if there is no remote controller; + * "discover" to do controller discovery; + * otherwise a vconn name. */ + + /* OpenFlow switch processing. */ + struct ofproto *ofproto; /* OpenFlow switch. */ + + /* Kernel datapath information. */ + struct dpif dpif; /* Kernel datapath. */ + struct port_array ifaces; /* Indexed by kernel datapath port number. */ + + /* Bridge ports. */ + struct port **ports; + size_t n_ports, allocated_ports; + + /* Bonding. */ + bool has_bonded_ports; + long long int bond_next_rebalance; + + /* Flow tracking. */ + bool flush; + + /* Flow statistics gathering. */ + time_t next_stats_request; + + /* Port mirroring. */ + struct mirror *mirrors[MAX_MIRRORS]; + + /* Spanning tree. */ + struct stp *stp; + long long int stp_last_tick; +}; + +/* List of all bridges. */ +static struct list all_bridges = LIST_INITIALIZER(&all_bridges); + +/* Maximum number of datapaths. */ +enum { DP_MAX = 256 }; + +static struct bridge *bridge_create(const char *name); +static void bridge_destroy(struct bridge *); +static struct bridge *bridge_lookup(const char *name); +static int bridge_run_one(struct bridge *); +static void bridge_reconfigure_one(struct bridge *); +static void bridge_reconfigure_controller(struct bridge *); +static void bridge_get_all_ifaces(const struct bridge *, struct svec *ifaces); +static void bridge_fetch_dp_ifaces(struct bridge *); +static void bridge_flush(struct bridge *); +static void bridge_pick_local_hw_addr(struct bridge *, + uint8_t ea[ETH_ADDR_LEN], + const char **devname); +static uint64_t bridge_pick_datapath_id(struct bridge *, + const uint8_t bridge_ea[ETH_ADDR_LEN], + const char *devname); +static uint64_t dpid_from_hash(const void *, size_t nbytes); + +static void bond_run(struct bridge *); +static void bond_wait(struct bridge *); +static void bond_rebalance_port(struct port *); + +static void port_create(struct bridge *, const char *name); +static void port_reconfigure(struct port *); +static void port_destroy(struct port *); +static struct port *port_lookup(const struct bridge *, const char *name); +static struct port *port_from_dp_ifidx(const struct bridge *, + uint16_t dp_ifidx); +static void port_update_bond_compat(struct port *); +static void port_update_vlan_compat(struct port *); + +static void mirror_create(struct bridge *, const char *name); +static void mirror_destroy(struct mirror *); +static void mirror_reconfigure(struct bridge *); +static void mirror_reconfigure_one(struct mirror *); +static bool vlan_is_mirrored(const struct mirror *, int vlan); + +static void brstp_reconfigure(struct bridge *); +static void brstp_adjust_timers(struct bridge *); +static void brstp_run(struct bridge *); +static void brstp_wait(struct bridge *); + +static void iface_create(struct port *, const char *name); +static void iface_destroy(struct iface *); +static struct iface *iface_lookup(const struct bridge *, const char *name); +static struct iface *iface_from_dp_ifidx(const struct bridge *, + uint16_t dp_ifidx); + +/* Hooks into ofproto processing. */ +static struct ofhooks bridge_ofhooks; + +/* Public functions. */ + +/* Adds the name of each interface used by a bridge, including local and + * internal ports, to 'svec'. */ +void +bridge_get_ifaces(struct svec *svec) +{ + struct bridge *br, *next; + size_t i, j; + + LIST_FOR_EACH_SAFE (br, next, struct bridge, node, &all_bridges) { + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + + for (j = 0; j < port->n_ifaces; j++) { + struct iface *iface = port->ifaces[j]; + if (iface->dp_ifidx < 0) { + VLOG_ERR("%s interface not in dp%u, ignoring", + iface->name, dpif_id(&br->dpif)); + } else { + if (iface->dp_ifidx != ODPP_LOCAL) { + svec_add(svec, iface->name); + } + } + } + } + } +} + +/* The caller must already have called cfg_read(). */ +void +bridge_init(void) +{ + int retval; + int i; + + for (i = 0; i < DP_MAX; i++) { + struct dpif dpif; + char devname[16]; + + sprintf(devname, "dp%d", i); + retval = dpif_open(devname, &dpif); + if (!retval) { + char dpif_name[IF_NAMESIZE]; + if (dpif_get_name(&dpif, dpif_name, sizeof dpif_name) + || !cfg_has("bridge.%s.port", dpif_name)) { + dpif_delete(&dpif); + } + dpif_close(&dpif); + } else if (retval != ENODEV) { + VLOG_ERR("failed to delete datapath dp%d: %s", + i, strerror(retval)); + } + } + + bridge_reconfigure(); +} + +#ifdef HAVE_OPENSSL +static bool +config_string_change(const char *key, char **valuep) +{ + const char *value = cfg_get_string(0, "%s", key); + if (value && (!*valuep || strcmp(value, *valuep))) { + free(*valuep); + *valuep = xstrdup(value); + return true; + } else { + return false; + } +} + +static void +bridge_configure_ssl(void) +{ + /* XXX SSL should be configurable on a per-bridge basis. + * XXX should be possible to de-configure SSL. */ + static char *private_key_file; + static char *certificate_file; + static char *cacert_file; + + if (config_string_change("ssl.private-key", &private_key_file)) { + vconn_ssl_set_private_key_file(private_key_file); + } + + if (config_string_change("ssl.certificate", &certificate_file)) { + vconn_ssl_set_certificate_file(certificate_file); + } + + if (config_string_change("ssl.ca-cert", &cacert_file)) { + vconn_ssl_set_ca_cert_file(cacert_file, + cfg_get_bool(0, "ssl.bootstrap-ca-cert")); + } +} +#endif + +void +bridge_reconfigure(void) +{ + struct svec old_br, new_br, raw_new_br; + struct bridge *br, *next; + size_t i, j; + + COVERAGE_INC(bridge_reconfigure); + + /* Collect old bridges. */ + svec_init(&old_br); + LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { + svec_add(&old_br, br->name); + } + + /* Collect new bridges. */ + svec_init(&raw_new_br); + cfg_get_subsections(&raw_new_br, "bridge"); + svec_init(&new_br); + for (i = 0; i < raw_new_br.n; i++) { + const char *name = raw_new_br.names[i]; + if ((!strncmp(name, "dp", 2) && isdigit(name[2])) || + (!strncmp(name, "nl:", 3) && isdigit(name[3]))) { + VLOG_ERR("%s is not a valid bridge name (bridges may not be " + "named \"dp\" or \"nl:\" followed by a digit)", name); + } else { + svec_add(&new_br, name); + } + } + svec_destroy(&raw_new_br); + + /* Get rid of deleted bridges and add new bridges. */ + svec_sort(&old_br); + svec_sort(&new_br); + assert(svec_is_unique(&old_br)); + assert(svec_is_unique(&new_br)); + LIST_FOR_EACH_SAFE (br, next, struct bridge, node, &all_bridges) { + if (!svec_contains(&new_br, br->name)) { + bridge_destroy(br); + } + } + for (i = 0; i < new_br.n; i++) { + const char *name = new_br.names[i]; + if (!svec_contains(&old_br, name)) { + bridge_create(name); + } + } + svec_destroy(&old_br); + svec_destroy(&new_br); + +#ifdef HAVE_OPENSSL + /* Configure SSL. */ + bridge_configure_ssl(); +#endif + + /* Reconfigure all bridges. */ + LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { + bridge_reconfigure_one(br); + } + + /* Add and delete ports on all datapaths. + * + * The kernel will reject any attempt to add a given port to a datapath if + * that port already belongs to a different datapath, so we must do all + * port deletions before any port additions. */ + LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { + struct odp_port *dpif_ports; + size_t n_dpif_ports; + struct svec want_ifaces; + + dpif_port_list(&br->dpif, &dpif_ports, &n_dpif_ports); + bridge_get_all_ifaces(br, &want_ifaces); + for (i = 0; i < n_dpif_ports; i++) { + const struct odp_port *p = &dpif_ports[i]; + if (!svec_contains(&want_ifaces, p->devname) + && strcmp(p->devname, br->name)) { + int retval = dpif_port_del(&br->dpif, p->port); + if (retval) { + VLOG_ERR("failed to remove %s interface from dp%u: %s", + p->devname, dpif_id(&br->dpif), strerror(retval)); + } + } + } + svec_destroy(&want_ifaces); + free(dpif_ports); + } + LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { + struct odp_port *dpif_ports; + size_t n_dpif_ports; + struct svec cur_ifaces, want_ifaces, add_ifaces; + int next_port_no; + + dpif_port_list(&br->dpif, &dpif_ports, &n_dpif_ports); + svec_init(&cur_ifaces); + for (i = 0; i < n_dpif_ports; i++) { + svec_add(&cur_ifaces, dpif_ports[i].devname); + } + free(dpif_ports); + svec_sort_unique(&cur_ifaces); + bridge_get_all_ifaces(br, &want_ifaces); + svec_diff(&want_ifaces, &cur_ifaces, &add_ifaces, NULL, NULL); + + next_port_no = 1; + for (i = 0; i < add_ifaces.n; i++) { + const char *if_name = add_ifaces.names[i]; + for (;;) { + int internal = cfg_get_bool(0, "iface.%s.internal", if_name); + int error = dpif_port_add(&br->dpif, if_name, next_port_no++, + internal ? ODP_PORT_INTERNAL : 0); + if (error != EEXIST) { + if (next_port_no >= 256) { + VLOG_ERR("ran out of valid port numbers on dp%u", + dpif_id(&br->dpif)); + goto out; + } + if (error) { + VLOG_ERR("failed to add %s interface to dp%u: %s", + if_name, dpif_id(&br->dpif), strerror(error)); + } + break; + } + } + } + out: + svec_destroy(&cur_ifaces); + svec_destroy(&want_ifaces); + svec_destroy(&add_ifaces); + } + LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { + uint8_t ea[8]; + uint64_t dpid; + struct iface *local_iface = NULL; + const char *devname; + uint8_t engine_type = br->dpif.minor; + uint8_t engine_id = br->dpif.minor; + bool add_id_to_iface = false; + struct svec nf_hosts; + + + bridge_fetch_dp_ifaces(br); + for (i = 0; i < br->n_ports; ) { + struct port *port = br->ports[i]; + + for (j = 0; j < port->n_ifaces; ) { + struct iface *iface = port->ifaces[j]; + if (iface->dp_ifidx < 0) { + VLOG_ERR("%s interface not in dp%u, dropping", + iface->name, dpif_id(&br->dpif)); + iface_destroy(iface); + } else { + if (iface->dp_ifidx == ODPP_LOCAL) { + local_iface = iface; + } + VLOG_DBG("dp%u has interface %s on port %d", + dpif_id(&br->dpif), iface->name, iface->dp_ifidx); + j++; + } + } + if (!port->n_ifaces) { + VLOG_ERR("%s port has no interfaces, dropping", port->name); + port_destroy(port); + continue; + } + i++; + } + + /* Pick local port hardware address, datapath ID. */ + bridge_pick_local_hw_addr(br, ea, &devname); + if (local_iface) { + int error = netdev_nodev_set_etheraddr(local_iface->name, ea); + if (error) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_ERR_RL(&rl, "bridge %s: failed to set bridge " + "Ethernet address: %s", + br->name, strerror(error)); + } + } + + dpid = bridge_pick_datapath_id(br, ea, devname); + ofproto_set_datapath_id(br->ofproto, dpid); + + /* Set NetFlow configuration on this bridge. */ + if (cfg_has("netflow.%s.engine-type", br->name)) { + engine_type = cfg_get_int(0, "netflow.%s.engine-type", + br->name); + } + if (cfg_has("netflow.%s.engine-id", br->name)) { + engine_id = cfg_get_int(0, "netflow.%s.engine-id", br->name); + } + if (cfg_has("netflow.%s.add-id-to-iface", br->name)) { + add_id_to_iface = cfg_get_bool(0, "netflow.%s.add-id-to-iface", + br->name); + } + if (add_id_to_iface && engine_id > 0x7f) { + VLOG_WARN("bridge %s: netflow port mangling may conflict with " + "another vswitch, choose an engine id less than 128", + br->name); + } + if (add_id_to_iface && br->n_ports > 0x1ff) { + VLOG_WARN("bridge %s: netflow port mangling will conflict with " + "another port when 512 or more ports are used", + br->name); + } + svec_init(&nf_hosts); + cfg_get_all_keys(&nf_hosts, "netflow.%s.host", br->name); + if (ofproto_set_netflow(br->ofproto, &nf_hosts, engine_type, + engine_id, add_id_to_iface)) { + VLOG_ERR("bridge %s: problem setting netflow collectors", + br->name); + } + + /* Update the controller and related settings. It would be more + * straightforward to call this from bridge_reconfigure_one(), but we + * can't do it there for two reasons. First, and most importantly, at + * that point we don't know the dp_ifidx of any interfaces that have + * been added to the bridge (because we haven't actually added them to + * the datapath). Second, at that point we haven't set the datapath ID + * yet; when a controller is configured, resetting the datapath ID will + * immediately disconnect from the controller, so it's better to set + * the datapath ID before the controller. */ + bridge_reconfigure_controller(br); + } + LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + port_update_vlan_compat(port); + } + } + LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { + brstp_reconfigure(br); + } +} + +static void +bridge_pick_local_hw_addr(struct bridge *br, uint8_t ea[ETH_ADDR_LEN], + const char **devname) +{ + uint64_t requested_ea; + size_t i, j; + int error; + + *devname = NULL; + + /* Did the user request a particular MAC? */ + requested_ea = cfg_get_mac(0, "bridge.%s.mac", br->name); + if (requested_ea) { + eth_addr_from_uint64(requested_ea, ea); + if (eth_addr_is_multicast(ea)) { + VLOG_ERR("bridge %s: cannot set MAC address to multicast " + "address "ETH_ADDR_FMT, br->name, ETH_ADDR_ARGS(ea)); + } else if (eth_addr_is_zero(ea)) { + VLOG_ERR("bridge %s: cannot set MAC address to zero", br->name); + } else { + return; + } + } + + /* Otherwise choose the minimum MAC address among all of the interfaces. + * (Xen uses FE:FF:FF:FF:FF:FF for virtual interfaces so this will get the + * MAC of the physical interface in such an environment.) */ + memset(ea, 0xff, sizeof ea); + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + if (port->is_mirror_output_port) { + continue; + } + for (j = 0; j < port->n_ifaces; j++) { + struct iface *iface = port->ifaces[j]; + uint8_t iface_ea[ETH_ADDR_LEN]; + if (iface->dp_ifidx == ODPP_LOCAL + || cfg_get_bool(0, "iface.%s.internal", iface->name)) { + continue; + } + error = netdev_nodev_get_etheraddr(iface->name, iface_ea); + if (!error) { + if (!eth_addr_is_multicast(iface_ea) && + !eth_addr_is_reserved(iface_ea) && + !eth_addr_is_zero(iface_ea) && + memcmp(iface_ea, ea, ETH_ADDR_LEN) < 0) { + memcpy(ea, iface_ea, ETH_ADDR_LEN); + *devname = iface->name; + } + } else { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_ERR_RL(&rl, "failed to obtain Ethernet address of %s: %s", + iface->name, strerror(error)); + } + } + } + if (eth_addr_is_multicast(ea) || eth_addr_is_vif(ea)) { + memcpy(ea, br->default_ea, ETH_ADDR_LEN); + *devname = NULL; + VLOG_WARN("bridge %s: using default bridge Ethernet " + "address "ETH_ADDR_FMT, br->name, ETH_ADDR_ARGS(ea)); + } else { + VLOG_DBG("bridge %s: using bridge Ethernet address "ETH_ADDR_FMT, + br->name, ETH_ADDR_ARGS(ea)); + } +} + +/* Choose and returns the datapath ID for bridge 'br' given that the bridge + * Ethernet address is 'bridge_ea'. If 'bridge_ea' is the Ethernet address of + * a network device, then that network device's name must be passed in as + * 'devname'; if 'bridge_ea' was derived some other way, then 'devname' must be + * passed in as a null pointer. */ +static uint64_t +bridge_pick_datapath_id(struct bridge *br, + const uint8_t bridge_ea[ETH_ADDR_LEN], + const char *devname) +{ + /* + * The procedure for choosing a bridge MAC address will, in the most + * ordinary case, also choose a unique MAC that we can use as a datapath + * ID. In some special cases, though, multiple bridges will end up with + * the same MAC address. This is OK for the bridges, but it will confuse + * the OpenFlow controller, because each datapath needs a unique datapath + * ID. + * + * Datapath IDs must be unique. It is also very desirable that they be + * stable from one run to the next, so that policy set on a datapath + * "sticks". + */ + uint64_t dpid; + + dpid = cfg_get_dpid(0, "bridge.%s.datapath-id", br->name); + if (dpid) { + return dpid; + } + + if (devname) { + int vlan; + if (!netdev_get_vlan_vid(devname, &vlan)) { + /* + * A bridge whose MAC address is taken from a VLAN network device + * (that is, a network device created with vconfig(8) or similar + * tool) will have the same MAC address as a bridge on the VLAN + * device's physical network device. + * + * Handle this case by hashing the physical network device MAC + * along with the VLAN identifier. + */ + uint8_t buf[ETH_ADDR_LEN + 2]; + memcpy(buf, bridge_ea, ETH_ADDR_LEN); + buf[ETH_ADDR_LEN] = vlan >> 8; + buf[ETH_ADDR_LEN + 1] = vlan; + return dpid_from_hash(buf, sizeof buf); + } else { + /* + * Assume that this bridge's MAC address is unique, since it + * doesn't fit any of the cases we handle specially. + */ + } + } else { + /* + * A purely internal bridge, that is, one that has no non-virtual + * network devices on it at all, is more difficult because it has no + * natural unique identifier at all. + * + * When the host is a XenServer, we handle this case by hashing the + * host's UUID with the name of the bridge. Names of bridges are + * persistent across XenServer reboots, although they can be reused if + * an internal network is destroyed and then a new one is later + * created, so this is fairly effective. + * + * When the host is not a XenServer, we punt by using a random MAC + * address on each run. + */ + const char *host_uuid = xenserver_get_host_uuid(); + if (host_uuid) { + char *combined = xasprintf("%s,%s", host_uuid, br->name); + dpid = dpid_from_hash(combined, strlen(combined)); + free(combined); + return dpid; + } + } + + return eth_addr_to_uint64(bridge_ea); +} + +static uint64_t +dpid_from_hash(const void *data, size_t n) +{ + uint8_t hash[SHA1HashSize]; + + BUILD_ASSERT_DECL(sizeof hash >= ETH_ADDR_LEN); + SHA1Bytes(data, n, hash); + eth_addr_mark_random(hash); + return eth_addr_to_uint64(hash); +} + +int +bridge_run(void) +{ + struct bridge *br, *next; + int retval; + + retval = 0; + LIST_FOR_EACH_SAFE (br, next, struct bridge, node, &all_bridges) { + int error = bridge_run_one(br); + if (error) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_ERR_RL(&rl, "bridge %s: datapath was destroyed externally, " + "forcing reconfiguration", br->name); + if (!retval) { + retval = error; + } + } + } + return retval; +} + +void +bridge_wait(void) +{ + struct bridge *br; + + LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { + ofproto_wait(br->ofproto); + if (br->controller) { + continue; + } + + if (br->ml) { + mac_learning_wait(br->ml); + } + bond_wait(br); + brstp_wait(br); + } +} + +/* Forces 'br' to revalidate all of its flows. This is appropriate when 'br''s + * configuration changes. */ +static void +bridge_flush(struct bridge *br) +{ + COVERAGE_INC(bridge_flush); + br->flush = true; + if (br->ml) { + mac_learning_flush(br->ml); + } +} + +/* Bridge reconfiguration functions. */ + +static struct bridge * +bridge_create(const char *name) +{ + struct bridge *br; + int error; + + assert(!bridge_lookup(name)); + br = xcalloc(1, sizeof *br); + + error = dpif_create(name, &br->dpif); + if (error == EEXIST) { + error = dpif_open(name, &br->dpif); + if (error) { + VLOG_ERR("datapath %s already exists but cannot be opened: %s", + name, strerror(error)); + free(br); + return NULL; + } + dpif_flow_flush(&br->dpif); + } else if (error) { + VLOG_ERR("failed to create datapath %s: %s", name, strerror(error)); + free(br); + return NULL; + } + + error = ofproto_create(name, &bridge_ofhooks, br, &br->ofproto); + if (error) { + VLOG_ERR("failed to create switch %s: %s", name, strerror(error)); + dpif_delete(&br->dpif); + dpif_close(&br->dpif); + free(br); + return NULL; + } + + br->name = xstrdup(name); + br->ml = mac_learning_create(); + br->sent_config_request = false; + eth_addr_random(br->default_ea); + + port_array_init(&br->ifaces); + + br->flush = false; + br->bond_next_rebalance = time_msec() + 10000; + + list_push_back(&all_bridges, &br->node); + + VLOG_INFO("created bridge %s on dp%u", br->name, dpif_id(&br->dpif)); + + return br; +} + +static void +bridge_destroy(struct bridge *br) +{ + if (br) { + int error; + + while (br->n_ports > 0) { + port_destroy(br->ports[br->n_ports - 1]); + } + list_remove(&br->node); + error = dpif_delete(&br->dpif); + if (error && error != ENOENT) { + VLOG_ERR("failed to delete dp%u: %s", + dpif_id(&br->dpif), strerror(error)); + } + dpif_close(&br->dpif); + ofproto_destroy(br->ofproto); + free(br->controller); + mac_learning_destroy(br->ml); + port_array_destroy(&br->ifaces); + free(br->ports); + free(br->name); + free(br); + } +} + +static struct bridge * +bridge_lookup(const char *name) +{ + struct bridge *br; + + LIST_FOR_EACH (br, struct bridge, node, &all_bridges) { + if (!strcmp(br->name, name)) { + return br; + } + } + return NULL; +} + +bool +bridge_exists(const char *name) +{ + return bridge_lookup(name) ? true : false; +} + +uint64_t +bridge_get_datapathid(const char *name) +{ + struct bridge *br = bridge_lookup(name); + return br ? ofproto_get_datapath_id(br->ofproto) : 0; +} + +static int +bridge_run_one(struct bridge *br) +{ + int error; + + error = ofproto_run1(br->ofproto); + if (error) { + return error; + } + + if (br->ml) { + mac_learning_run(br->ml, ofproto_get_revalidate_set(br->ofproto)); + } + bond_run(br); + brstp_run(br); + + error = ofproto_run2(br->ofproto, br->flush); + br->flush = false; + + return error; +} + +static const char * +bridge_get_controller(const struct bridge *br) +{ + const char *controller; + + controller = cfg_get_string(0, "bridge.%s.controller", br->name); + if (!controller) { + controller = cfg_get_string(0, "mgmt.controller"); + } + return controller && controller[0] ? controller : NULL; +} + +static void +bridge_reconfigure_one(struct bridge *br) +{ + struct svec old_ports, new_ports, ifaces; + struct svec listeners, old_listeners; + struct svec snoops, old_snoops; + size_t i, j; + + /* Collect old ports. */ + svec_init(&old_ports); + for (i = 0; i < br->n_ports; i++) { + svec_add(&old_ports, br->ports[i]->name); + } + svec_sort(&old_ports); + assert(svec_is_unique(&old_ports)); + + /* Collect new ports. */ + svec_init(&new_ports); + cfg_get_all_keys(&new_ports, "bridge.%s.port", br->name); + svec_sort(&new_ports); + if (bridge_get_controller(br) && !svec_contains(&new_ports, br->name)) { + svec_add(&new_ports, br->name); + svec_sort(&new_ports); + } + if (!svec_is_unique(&new_ports)) { + VLOG_WARN("bridge %s: %s specified twice as bridge port", + br->name, svec_get_duplicate(&new_ports)); + svec_unique(&new_ports); + } + + ofproto_set_mgmt_id(br->ofproto, mgmt_id); + + /* Get rid of deleted ports and add new ports. */ + for (i = 0; i < br->n_ports; ) { + struct port *port = br->ports[i]; + if (!svec_contains(&new_ports, port->name)) { + port_destroy(port); + } else { + i++; + } + } + for (i = 0; i < new_ports.n; i++) { + const char *name = new_ports.names[i]; + if (!svec_contains(&old_ports, name)) { + port_create(br, name); + } + } + svec_destroy(&old_ports); + svec_destroy(&new_ports); + + /* Reconfigure all ports. */ + for (i = 0; i < br->n_ports; i++) { + port_reconfigure(br->ports[i]); + } + + /* Check and delete duplicate interfaces. */ + svec_init(&ifaces); + for (i = 0; i < br->n_ports; ) { + struct port *port = br->ports[i]; + for (j = 0; j < port->n_ifaces; ) { + struct iface *iface = port->ifaces[j]; + if (svec_contains(&ifaces, iface->name)) { + VLOG_ERR("bridge %s: %s interface is on multiple ports, " + "removing from %s", + br->name, iface->name, port->name); + iface_destroy(iface); + } else { + svec_add(&ifaces, iface->name); + svec_sort(&ifaces); + j++; + } + } + if (!port->n_ifaces) { + VLOG_ERR("%s port has no interfaces, dropping", port->name); + port_destroy(port); + } else { + i++; + } + } + svec_destroy(&ifaces); + + /* Delete all flows if we're switching from connected to standalone or vice + * versa. (XXX Should we delete all flows if we are switching from one + * controller to another?) */ + + /* Configure OpenFlow management listeners. */ + svec_init(&listeners); + cfg_get_all_strings(&listeners, "bridge.%s.openflow.listeners", br->name); + if (!listeners.n) { + svec_add_nocopy(&listeners, xasprintf("punix:%s/%s.mgmt", + ovs_rundir, br->name)); + } else if (listeners.n == 1 && !strcmp(listeners.names[0], "none")) { + svec_clear(&listeners); + } + svec_sort_unique(&listeners); + + svec_init(&old_listeners); + ofproto_get_listeners(br->ofproto, &old_listeners); + svec_sort_unique(&old_listeners); + + if (!svec_equal(&listeners, &old_listeners)) { + ofproto_set_listeners(br->ofproto, &listeners); + } + svec_destroy(&listeners); + svec_destroy(&old_listeners); + + /* Configure OpenFlow controller connection snooping. */ + svec_init(&snoops); + cfg_get_all_strings(&snoops, "bridge.%s.openflow.snoops", br->name); + if (!snoops.n) { + svec_add_nocopy(&snoops, xasprintf("punix:%s/%s.snoop", + ovs_rundir, br->name)); + } else if (snoops.n == 1 && !strcmp(snoops.names[0], "none")) { + svec_clear(&snoops); + } + svec_sort_unique(&snoops); + + svec_init(&old_snoops); + ofproto_get_snoops(br->ofproto, &old_snoops); + svec_sort_unique(&old_snoops); + + if (!svec_equal(&snoops, &old_snoops)) { + ofproto_set_snoops(br->ofproto, &snoops); + } + svec_destroy(&snoops); + svec_destroy(&old_snoops); + + mirror_reconfigure(br); +} + +static void +bridge_reconfigure_controller(struct bridge *br) +{ + char *pfx = xasprintf("bridge.%s.controller", br->name); + const char *controller; + + controller = bridge_get_controller(br); + if ((br->controller != NULL) != (controller != NULL)) { + ofproto_flush_flows(br->ofproto); + } + free(br->controller); + br->controller = controller ? xstrdup(controller) : NULL; + + if (controller) { + const char *fail_mode; + int max_backoff, probe; + int rate_limit, burst_limit; + + if (!strcmp(controller, "discover")) { + ofproto_set_discovery(br->ofproto, true, + cfg_get_string(0, "%s.accept-regex", pfx), + cfg_get_bool(0, "%s.update-resolv.conf", + pfx)); + } else { + struct netdev *netdev; + bool in_band; + int error; + + in_band = (!cfg_is_valid(CFG_BOOL | CFG_REQUIRED, + "%s.in-band", pfx) + || cfg_get_bool(0, "%s.in-band", pfx)); + ofproto_set_discovery(br->ofproto, false, NULL, NULL); + ofproto_set_in_band(br->ofproto, in_band); + + error = netdev_open(br->name, NETDEV_ETH_TYPE_NONE, &netdev); + if (!error) { + if (cfg_is_valid(CFG_IP | CFG_REQUIRED, "%s.ip", pfx)) { + struct in_addr ip, mask, gateway; + ip.s_addr = cfg_get_ip(0, "%s.ip", pfx); + mask.s_addr = cfg_get_ip(0, "%s.netmask", pfx); + gateway.s_addr = cfg_get_ip(0, "%s.gateway", pfx); + + netdev_turn_flags_on(netdev, NETDEV_UP, true); + if (!mask.s_addr) { + mask.s_addr = guess_netmask(ip.s_addr); + } + if (!netdev_set_in4(netdev, ip, mask)) { + VLOG_INFO("bridge %s: configured IP address "IP_FMT", " + "netmask "IP_FMT, + br->name, IP_ARGS(&ip.s_addr), + IP_ARGS(&mask.s_addr)); + } + + if (gateway.s_addr) { + if (!netdev_add_router(gateway)) { + VLOG_INFO("bridge %s: configured gateway "IP_FMT, + br->name, IP_ARGS(&gateway.s_addr)); + } + } + } + netdev_close(netdev); + } + } + + fail_mode = cfg_get_string(0, "%s.fail-mode", pfx); + if (!fail_mode) { + fail_mode = cfg_get_string(0, "mgmt.fail-mode"); + } + ofproto_set_failure(br->ofproto, + (!fail_mode + || !strcmp(fail_mode, "standalone") + || !strcmp(fail_mode, "open"))); + + probe = cfg_get_int(0, "%s.inactivity-probe", pfx); + ofproto_set_probe_interval(br->ofproto, + probe ? probe : cfg_get_int(0, "mgmt.inactivity-probe")); + + max_backoff = cfg_get_int(0, "%s.max-backoff", pfx); + if (!max_backoff) { + max_backoff = cfg_get_int(0, "mgmt.max-backoff"); + if (!max_backoff) { + max_backoff = 15; + } + } + ofproto_set_max_backoff(br->ofproto, max_backoff); + + rate_limit = cfg_get_int(0, "%s.rate-limit", pfx); + if (!rate_limit) { + rate_limit = cfg_get_int(0, "mgmt.rate-limit"); + } + burst_limit = cfg_get_int(0, "%s.burst-limit", pfx); + if (!burst_limit) { + burst_limit = cfg_get_int(0, "mgmt.burst-limit"); + } + ofproto_set_rate_limit(br->ofproto, rate_limit, burst_limit); + + ofproto_set_stp(br->ofproto, cfg_get_bool(0, "%s.stp", pfx)); + + if (cfg_has("%s.commands.acl", pfx)) { + struct svec command_acls; + char *command_acl; + + svec_init(&command_acls); + cfg_get_all_strings(&command_acls, "%s.commands.acl", pfx); + command_acl = svec_join(&command_acls, ",", ""); + + ofproto_set_remote_execution(br->ofproto, command_acl, + cfg_get_string(0, "%s.commands.dir", + pfx)); + + svec_destroy(&command_acls); + free(command_acl); + } else { + ofproto_set_remote_execution(br->ofproto, NULL, NULL); + } + } else { + union ofp_action action; + flow_t flow; + + /* Set up a flow that matches every packet and directs them to + * OFPP_NORMAL (which goes to us). */ + memset(&action, 0, sizeof action); + action.type = htons(OFPAT_OUTPUT); + action.output.len = htons(sizeof action); + action.output.port = htons(OFPP_NORMAL); + memset(&flow, 0, sizeof flow); + ofproto_add_flow(br->ofproto, &flow, OFPFW_ALL, 0, + &action, 1, 0); + + ofproto_set_in_band(br->ofproto, false); + ofproto_set_max_backoff(br->ofproto, 1); + ofproto_set_probe_interval(br->ofproto, 5); + ofproto_set_failure(br->ofproto, false); + ofproto_set_stp(br->ofproto, false); + } + free(pfx); + + ofproto_set_controller(br->ofproto, br->controller); +} + +static void +bridge_get_all_ifaces(const struct bridge *br, struct svec *ifaces) +{ + size_t i, j; + + svec_init(ifaces); + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + for (j = 0; j < port->n_ifaces; j++) { + struct iface *iface = port->ifaces[j]; + svec_add(ifaces, iface->name); + } + } + svec_sort(ifaces); + assert(svec_is_unique(ifaces)); +} + +/* For robustness, in case the administrator moves around datapath ports behind + * our back, we re-check all the datapath port numbers here. + * + * This function will set the 'dp_ifidx' members of interfaces that have + * disappeared to -1, so only call this function from a context where those + * 'struct iface's will be removed from the bridge. Otherwise, the -1 + * 'dp_ifidx'es will cause trouble later when we try to send them to the + * datapath, which doesn't support UINT16_MAX+1 ports. */ +static void +bridge_fetch_dp_ifaces(struct bridge *br) +{ + struct odp_port *dpif_ports; + size_t n_dpif_ports; + size_t i, j; + + /* Reset all interface numbers. */ + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + for (j = 0; j < port->n_ifaces; j++) { + struct iface *iface = port->ifaces[j]; + iface->dp_ifidx = -1; + } + } + port_array_clear(&br->ifaces); + + dpif_port_list(&br->dpif, &dpif_ports, &n_dpif_ports); + for (i = 0; i < n_dpif_ports; i++) { + struct odp_port *p = &dpif_ports[i]; + struct iface *iface = iface_lookup(br, p->devname); + if (iface) { + if (iface->dp_ifidx >= 0) { + VLOG_WARN("dp%u reported interface %s twice", + dpif_id(&br->dpif), p->devname); + } else if (iface_from_dp_ifidx(br, p->port)) { + VLOG_WARN("dp%u reported interface %"PRIu16" twice", + dpif_id(&br->dpif), p->port); + } else { + port_array_set(&br->ifaces, p->port, iface); + iface->dp_ifidx = p->port; + } + } + } + free(dpif_ports); +} + +/* Bridge packet processing functions. */ + +static struct bond_entry * +lookup_bond_entry(const struct port *port, const uint8_t mac[ETH_ADDR_LEN]) +{ + size_t h = hash_bytes(mac, ETH_ADDR_LEN, 0); + return &port->bond_hash[h & BOND_MASK]; +} + +static int +bond_choose_iface(const struct port *port) +{ + size_t i; + for (i = 0; i < port->n_ifaces; i++) { + if (port->ifaces[i]->enabled) { + return i; + } + } + return -1; +} + +static bool +choose_output_iface(const struct port *port, const flow_t *flow, + uint16_t *dp_ifidx, tag_type *tags) +{ + struct iface *iface; + + assert(port->n_ifaces); + if (port->n_ifaces == 1) { + iface = port->ifaces[0]; + } else { + struct bond_entry *e = lookup_bond_entry(port, flow->dl_src); + if (e->iface_idx < 0 || e->iface_idx >= port->n_ifaces + || !port->ifaces[e->iface_idx]->enabled) { + /* XXX select interface properly. The current interface selection + * is only good for testing the rebalancing code. */ + e->iface_idx = bond_choose_iface(port); + if (e->iface_idx < 0) { + *tags |= port->no_ifaces_tag; + return false; + } + e->iface_tag = tag_create_random(); + } + *tags |= e->iface_tag; + iface = port->ifaces[e->iface_idx]; + } + *dp_ifidx = iface->dp_ifidx; + *tags |= iface->tag; /* Currently only used for bonding. */ + return true; +} + +static void +bond_link_status_update(struct iface *iface, bool carrier) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); + struct port *port = iface->port; + + if ((carrier == iface->enabled) == (iface->delay_expires == LLONG_MAX)) { + /* Nothing to do. */ + return; + } + VLOG_INFO_RL(&rl, "interface %s: carrier %s", + iface->name, carrier ? "detected" : "dropped"); + if (carrier == iface->enabled) { + iface->delay_expires = LLONG_MAX; + VLOG_INFO_RL(&rl, "interface %s: will not be %s", + iface->name, carrier ? "disabled" : "enabled"); + } else { + int delay = carrier ? port->updelay : port->downdelay; + iface->delay_expires = time_msec() + delay; + if (delay) { + VLOG_INFO_RL(&rl, + "interface %s: will be %s if it stays %s for %d ms", + iface->name, + carrier ? "enabled" : "disabled", + carrier ? "up" : "down", + delay); + } + } +} + +static void +bond_choose_active_iface(struct port *port) +{ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); + + port->active_iface = bond_choose_iface(port); + port->active_iface_tag = tag_create_random(); + if (port->active_iface >= 0) { + VLOG_INFO_RL(&rl, "port %s: active interface is now %s", + port->name, port->ifaces[port->active_iface]->name); + } else { + VLOG_WARN_RL(&rl, "port %s: all ports disabled, no active interface", + port->name); + } +} + +static void +bond_run(struct bridge *br) +{ + size_t i, j; + + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + if (port->n_ifaces < 2) { + continue; + } + for (j = 0; j < port->n_ifaces; j++) { + struct iface *iface = port->ifaces[j]; + if (time_msec() >= iface->delay_expires) { + iface->delay_expires = LLONG_MAX; + iface->enabled = !iface->enabled; + VLOG_WARN("interface %s: %s", + iface->name, + iface->enabled ? "enabled" : "disabled"); + if (!iface->enabled) { + ofproto_revalidate(br->ofproto, iface->tag); + if (iface->port_ifidx == port->active_iface) { + ofproto_revalidate(br->ofproto, + port->active_iface_tag); + bond_choose_active_iface(port); + } + } else { + if (port->active_iface < 0) { + ofproto_revalidate(br->ofproto, port->no_ifaces_tag); + bond_choose_active_iface(port); + } + iface->tag = tag_create_random(); + } + } + } + } +} + +static void +bond_wait(struct bridge *br) +{ + size_t i, j; + + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + if (port->n_ifaces < 2) { + continue; + } + for (j = 0; j < port->n_ifaces; j++) { + struct iface *iface = port->ifaces[j]; + if (iface->delay_expires != LLONG_MAX) { + poll_timer_wait(iface->delay_expires - time_msec()); + } + } + } +} + +static bool +set_dst(struct dst *p, const flow_t *flow, + const struct port *in_port, const struct port *out_port, + tag_type *tags) +{ + /* STP handling. + * + * XXX This uses too many tags: any broadcast flow will get one tag per + * destination port, and thus a broadcast on a switch of any size is likely + * to have all tag bits set. We should figure out a way to be smarter. + * + * This is OK when STP is disabled, because stp_state_tag is 0 then. */ + *tags |= out_port->stp_state_tag; + if (!(out_port->stp_state & (STP_DISABLED | STP_FORWARDING))) { + return false; + } + + p->vlan = (out_port->vlan >= 0 ? OFP_VLAN_NONE + : in_port->vlan >= 0 ? in_port->vlan + : ntohs(flow->dl_vlan)); + return choose_output_iface(out_port, flow, &p->dp_ifidx, tags); +} + +static void +swap_dst(struct dst *p, struct dst *q) +{ + struct dst tmp = *p; + *p = *q; + *q = tmp; +} + +/* Moves all the dsts with vlan == 'vlan' to the front of the 'n_dsts' in + * 'dsts'. (This may help performance by reducing the number of VLAN changes + * that we push to the datapath. We could in fact fully sort the array by + * vlan, but in most cases there are at most two different vlan tags so that's + * possibly overkill.) */ +static void +partition_dsts(struct dst *dsts, size_t n_dsts, int vlan) +{ + struct dst *first = dsts; + struct dst *last = dsts + n_dsts; + + while (first != last) { + /* Invariants: + * - All dsts < first have vlan == 'vlan'. + * - All dsts >= last have vlan != 'vlan'. + * - first < last. */ + while (first->vlan == vlan) { + if (++first == last) { + return; + } + } + + /* Same invariants, plus one additional: + * - first->vlan != vlan. + */ + while (last[-1].vlan != vlan) { + if (--last == first) { + return; + } + } + + /* Same invariants, plus one additional: + * - last[-1].vlan == vlan.*/ + swap_dst(first++, --last); + } +} + +static int +mirror_mask_ffs(mirror_mask_t mask) +{ + BUILD_ASSERT_DECL(sizeof(unsigned int) >= sizeof(mask)); + return ffs(mask); +} + +static bool +dst_is_duplicate(const struct dst *dsts, size_t n_dsts, + const struct dst *test) +{ + size_t i; + for (i = 0; i < n_dsts; i++) { + if (dsts[i].vlan == test->vlan && dsts[i].dp_ifidx == test->dp_ifidx) { + return true; + } + } + return false; +} + +static bool +port_trunks_vlan(const struct port *port, uint16_t vlan) +{ + return port->vlan < 0 && bitmap_is_set(port->trunks, vlan); +} + +static bool +port_includes_vlan(const struct port *port, uint16_t vlan) +{ + return vlan == port->vlan || port_trunks_vlan(port, vlan); +} + +static size_t +compose_dsts(const struct bridge *br, const flow_t *flow, uint16_t vlan, + const struct port *in_port, const struct port *out_port, + struct dst dsts[], tag_type *tags) +{ + mirror_mask_t mirrors = in_port->src_mirrors; + struct dst *dst = dsts; + size_t i; + + *tags |= in_port->stp_state_tag; + if (out_port == FLOOD_PORT) { + /* XXX use ODP_FLOOD if no vlans or bonding. */ + /* XXX even better, define each VLAN as a datapath port group */ + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + if (port != in_port && port_includes_vlan(port, vlan) + && !port->is_mirror_output_port + && set_dst(dst, flow, in_port, port, tags)) { + mirrors |= port->dst_mirrors; + dst++; + } + } + } else if (out_port && set_dst(dst, flow, in_port, out_port, tags)) { + mirrors |= out_port->dst_mirrors; + dst++; + } + + while (mirrors) { + struct mirror *m = br->mirrors[mirror_mask_ffs(mirrors) - 1]; + if (!m->n_vlans || vlan_is_mirrored(m, vlan)) { + if (m->out_port) { + if (set_dst(dst, flow, in_port, m->out_port, tags) + && !dst_is_duplicate(dsts, dst - dsts, dst)) { + dst++; + } + } else { + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + if (port_includes_vlan(port, m->out_vlan) + && set_dst(dst, flow, in_port, port, tags) + && !dst_is_duplicate(dsts, dst - dsts, dst)) + { + if (port->vlan < 0) { + dst->vlan = m->out_vlan; + } + if (dst->dp_ifidx == flow->in_port + && dst->vlan == vlan) { + /* Don't send out input port on same VLAN. */ + continue; + } + dst++; + } + } + } + } + mirrors &= mirrors - 1; + } + + partition_dsts(dsts, dst - dsts, ntohs(flow->dl_vlan)); + return dst - dsts; +} + +static void UNUSED +print_dsts(const struct dst *dsts, size_t n) +{ + for (; n--; dsts++) { + printf(">p%"PRIu16, dsts->dp_ifidx); + if (dsts->vlan != OFP_VLAN_NONE) { + printf("v%"PRIu16, dsts->vlan); + } + } +} + +static void +compose_actions(struct bridge *br, const flow_t *flow, uint16_t vlan, + const struct port *in_port, const struct port *out_port, + tag_type *tags, struct odp_actions *actions) +{ + struct dst dsts[DP_MAX_PORTS * (MAX_MIRRORS + 1)]; + size_t n_dsts; + const struct dst *p; + uint16_t cur_vlan; + + n_dsts = compose_dsts(br, flow, vlan, in_port, out_port, dsts, tags); + + cur_vlan = ntohs(flow->dl_vlan); + for (p = dsts; p < &dsts[n_dsts]; p++) { + union odp_action *a; + if (p->vlan != cur_vlan) { + if (p->vlan == OFP_VLAN_NONE) { + odp_actions_add(actions, ODPAT_STRIP_VLAN); + } else { + a = odp_actions_add(actions, ODPAT_SET_VLAN_VID); + a->vlan_vid.vlan_vid = htons(p->vlan); + } + cur_vlan = p->vlan; + } + a = odp_actions_add(actions, ODPAT_OUTPUT); + a->output.port = p->dp_ifidx; + } +} + +static bool +is_bcast_arp_reply(const flow_t *flow, const struct ofpbuf *packet) +{ + struct arp_eth_header *arp = (struct arp_eth_header *) packet->data; + return (flow->dl_type == htons(ETH_TYPE_ARP) + && eth_addr_is_broadcast(flow->dl_dst) + && packet->size >= sizeof(struct arp_eth_header) + && arp->ar_op == ARP_OP_REQUEST); +} + +/* If the composed actions may be applied to any packet in the given 'flow', + * returns true. Otherwise, the actions should only be applied to 'packet', or + * not at all, if 'packet' was NULL. */ +static bool +process_flow(struct bridge *br, const flow_t *flow, + const struct ofpbuf *packet, struct odp_actions *actions, + tag_type *tags) +{ + struct iface *in_iface; + struct port *in_port; + struct port *out_port = NULL; /* By default, drop the packet/flow. */ + int vlan; + + /* Find the interface and port structure for the received packet. */ + in_iface = iface_from_dp_ifidx(br, flow->in_port); + if (!in_iface) { + /* No interface? Something fishy... */ + if (packet != NULL) { + /* Odd. A few possible reasons here: + * + * - We deleted an interface but there are still a few packets + * queued up from it. + * + * - Someone externally added an interface (e.g. with "ovs-dpctl + * add-if") that we don't know about. + * + * - Packet arrived on the local port but the local port is not + * one of our bridge ports. + */ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + + VLOG_WARN_RL(&rl, "bridge %s: received packet on unknown " + "interface %"PRIu16, br->name, flow->in_port); + } + + /* Return without adding any actions, to drop packets on this flow. */ + return true; + } + in_port = in_iface->port; + + /* Figure out what VLAN this packet belongs to. + * + * Note that dl_vlan of 0 and of OFP_VLAN_NONE both mean that the packet + * belongs to VLAN 0, so we should treat both cases identically. (In the + * former case, the packet has an 802.1Q header that specifies VLAN 0, + * presumably to allow a priority to be specified. In the latter case, the + * packet does not have any 802.1Q header.) */ + vlan = ntohs(flow->dl_vlan); + if (vlan == OFP_VLAN_NONE) { + vlan = 0; + } + if (in_port->vlan >= 0) { + if (vlan) { + /* XXX support double tagging? */ + if (packet != NULL) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_WARN_RL(&rl, "bridge %s: dropping VLAN %"PRIu16" tagged " + "packet received on port %s configured with " + "implicit VLAN %"PRIu16, + br->name, ntohs(flow->dl_vlan), + in_port->name, in_port->vlan); + } + goto done; + } + vlan = in_port->vlan; + } else { + if (!port_includes_vlan(in_port, vlan)) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_WARN_RL(&rl, "bridge %s: dropping VLAN %d tagged " + "packet received on port %s not configured for " + "trunking VLAN %d", + br->name, vlan, in_port->name, vlan); + goto done; + } + } + + /* Drop frames for ports that STP wants entirely killed (both for + * forwarding and for learning). Later, after we do learning, we'll drop + * the frames that STP wants to do learning but not forwarding on. */ + if (in_port->stp_state & (STP_LISTENING | STP_BLOCKING)) { + goto done; + } + + /* Drop frames for reserved multicast addresses. */ + if (eth_addr_is_reserved(flow->dl_dst)) { + goto done; + } + + /* Drop frames on ports reserved for mirroring. */ + if (in_port->is_mirror_output_port) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + VLOG_WARN_RL(&rl, "bridge %s: dropping packet received on port %s, " + "which is reserved exclusively for mirroring", + br->name, in_port->name); + goto done; + } + + /* Drop multicast and broadcast packets on inactive bonded interfaces, to + * avoid receiving duplicates. */ + if (in_port->n_ifaces > 1 && eth_addr_is_multicast(flow->dl_dst)) { + *tags |= in_port->active_iface_tag; + if (in_port->active_iface != in_iface->port_ifidx) { + goto done; + } + } + + /* MAC learning. */ + out_port = FLOOD_PORT; + if (br->ml) { + int out_port_idx; + bool may_learn; + + if (!packet) { + /* Don't try to learn from revalidation. */ + may_learn = false; + } else if (in_port->n_ifaces > 1) { + /* If the packet arrived on a bonded port, don't learn from it + * unless we haven't learned any port at all for that address + * (because we probably sent the packet on one bonded interface and + * got it back on the other). Broadcast ARP replies are an + * exception to this rule: the host has moved to another switch. */ + int src_idx = mac_learning_lookup(br->ml, flow->dl_src, vlan); + may_learn = (src_idx < 0 + || src_idx == in_port->port_idx + || is_bcast_arp_reply(flow, packet)); + } else { + may_learn = true; + } + + /* Learn source MAC. */ + if (may_learn) { + tag_type rev_tag = mac_learning_learn(br->ml, flow->dl_src, + vlan, in_port->port_idx); + if (rev_tag) { + /* The log messages here could actually be useful in debugging, + * so keep the rate limit relatively high. */ + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(30, + 300); + VLOG_DBG_RL(&rl, "bridge %s: learned that "ETH_ADDR_FMT" is " + "on port %s in VLAN %d", + br->name, ETH_ADDR_ARGS(flow->dl_src), + in_port->name, vlan); + ofproto_revalidate(br->ofproto, rev_tag); + } + } + + /* Determine output port. */ + out_port_idx = mac_learning_lookup_tag(br->ml, flow->dl_dst, vlan, + tags); + if (out_port_idx >= 0 && out_port_idx < br->n_ports) { + out_port = br->ports[out_port_idx]; + } + } + + /* Don't send packets out their input ports. Don't forward frames that STP + * wants us to discard. */ + if (in_port == out_port || in_port->stp_state == STP_LEARNING) { + out_port = NULL; + } + +done: + compose_actions(br, flow, vlan, in_port, out_port, tags, actions); + + /* + * We send out only a single packet, instead of setting up a flow, if the + * packet is an ARP directed to broadcast that arrived on a bonded + * interface. In such a situation ARP requests and replies must be handled + * differently, but OpenFlow unfortunately can't distinguish them. + */ + return (in_port->n_ifaces < 2 + || flow->dl_type != htons(ETH_TYPE_ARP) + || !eth_addr_is_broadcast(flow->dl_dst)); +} + +/* Careful: 'opp' is in host byte order and opp->port_no is an OFP port + * number. */ +static void +bridge_port_changed_ofhook_cb(enum ofp_port_reason reason, + const struct ofp_phy_port *opp, + void *br_) +{ + struct bridge *br = br_; + struct iface *iface; + struct port *port; + + iface = iface_from_dp_ifidx(br, ofp_port_to_odp_port(opp->port_no)); + if (!iface) { + return; + } + port = iface->port; + + if (reason == OFPPR_DELETE) { + VLOG_WARN("bridge %s: interface %s deleted unexpectedly", + br->name, iface->name); + iface_destroy(iface); + if (!port->n_ifaces) { + VLOG_WARN("bridge %s: port %s has no interfaces, dropping", + br->name, port->name); + port_destroy(port); + } + + bridge_flush(br); + } else { + memcpy(iface->mac, opp->hw_addr, ETH_ADDR_LEN); + if (port->n_ifaces > 1) { + bool up = !(opp->state & OFPPS_LINK_DOWN); + bond_link_status_update(iface, up); + port_update_bond_compat(port); + } + } +} + +static bool +bridge_normal_ofhook_cb(const flow_t *flow, const struct ofpbuf *packet, + struct odp_actions *actions, tag_type *tags, void *br_) +{ + struct bridge *br = br_; + +#if 0 + if (flow->dl_type == htons(OFP_DL_TYPE_NOT_ETH_TYPE) + && eth_addr_equals(flow->dl_dst, stp_eth_addr)) { + brstp_receive(br, flow, payload); + return true; + } +#endif + + COVERAGE_INC(bridge_process_flow); + return process_flow(br, flow, packet, actions, tags); +} + +static void +bridge_account_flow_ofhook_cb(const flow_t *flow, + const union odp_action *actions, + size_t n_actions, unsigned long long int n_bytes, + void *br_) +{ + struct bridge *br = br_; + const union odp_action *a; + + if (!br->has_bonded_ports) { + return; + } + + for (a = actions; a < &actions[n_actions]; a++) { + if (a->type == ODPAT_OUTPUT) { + struct port *port = port_from_dp_ifidx(br, a->output.port); + if (port && port->n_ifaces >= 2) { + struct bond_entry *e = lookup_bond_entry(port, flow->dl_src); + e->tx_bytes += n_bytes; + } + } + } +} + +static void +bridge_account_checkpoint_ofhook_cb(void *br_) +{ + struct bridge *br = br_; + size_t i; + + if (!br->has_bonded_ports) { + return; + } + + /* The current ofproto implementation calls this callback at least once a + * second, so this timer implementation is sufficient. */ + if (time_msec() < br->bond_next_rebalance) { + return; + } + br->bond_next_rebalance = time_msec() + 10000; + + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + if (port->n_ifaces > 1) { + bond_rebalance_port(port); + } + } +} + +static struct ofhooks bridge_ofhooks = { + bridge_port_changed_ofhook_cb, + bridge_normal_ofhook_cb, + bridge_account_flow_ofhook_cb, + bridge_account_checkpoint_ofhook_cb, +}; + +/* Statistics for a single interface on a bonded port, used for load-based + * bond rebalancing. */ +struct slave_balance { + struct iface *iface; /* The interface. */ + uint64_t tx_bytes; /* Sum of hashes[*]->tx_bytes. */ + + /* All the "bond_entry"s that are assigned to this interface, in order of + * increasing tx_bytes. */ + struct bond_entry **hashes; + size_t n_hashes; +}; + +/* Sorts pointers to pointers to bond_entries in ascending order by the + * interface to which they are assigned, and within a single interface in + * ascending order of bytes transmitted. */ +static int +compare_bond_entries(const void *a_, const void *b_) +{ + const struct bond_entry *const *ap = a_; + const struct bond_entry *const *bp = b_; + const struct bond_entry *a = *ap; + const struct bond_entry *b = *bp; + if (a->iface_idx != b->iface_idx) { + return a->iface_idx > b->iface_idx ? 1 : -1; + } else if (a->tx_bytes != b->tx_bytes) { + return a->tx_bytes > b->tx_bytes ? 1 : -1; + } else { + return 0; + } +} + +/* Sorts slave_balances so that enabled ports come first, and otherwise in + * *descending* order by number of bytes transmitted. */ +static int +compare_slave_balance(const void *a_, const void *b_) +{ + const struct slave_balance *a = a_; + const struct slave_balance *b = b_; + if (a->iface->enabled != b->iface->enabled) { + return a->iface->enabled ? -1 : 1; + } else if (a->tx_bytes != b->tx_bytes) { + return a->tx_bytes > b->tx_bytes ? -1 : 1; + } else { + return 0; + } +} + +static void +swap_bals(struct slave_balance *a, struct slave_balance *b) +{ + struct slave_balance tmp = *a; + *a = *b; + *b = tmp; +} + +/* Restores the 'n_bals' slave_balance structures in 'bals' to sorted order + * given that 'p' (and only 'p') might be in the wrong location. + * + * This function invalidates 'p', since it might now be in a different memory + * location. */ +static void +resort_bals(struct slave_balance *p, + struct slave_balance bals[], size_t n_bals) +{ + if (n_bals > 1) { + for (; p > bals && p->tx_bytes > p[-1].tx_bytes; p--) { + swap_bals(p, p - 1); + } + for (; p < &bals[n_bals - 1] && p->tx_bytes < p[1].tx_bytes; p++) { + swap_bals(p, p + 1); + } + } +} + +static void +log_bals(const struct slave_balance *bals, size_t n_bals, struct port *port) +{ + if (VLOG_IS_DBG_ENABLED()) { + struct ds ds = DS_EMPTY_INITIALIZER; + const struct slave_balance *b; + + for (b = bals; b < bals + n_bals; b++) { + size_t i; + + if (b > bals) { + ds_put_char(&ds, ','); + } + ds_put_format(&ds, " %s %"PRIu64"kB", + b->iface->name, b->tx_bytes / 1024); + + if (!b->iface->enabled) { + ds_put_cstr(&ds, " (disabled)"); + } + if (b->n_hashes > 0) { + ds_put_cstr(&ds, " ("); + for (i = 0; i < b->n_hashes; i++) { + const struct bond_entry *e = b->hashes[i]; + if (i > 0) { + ds_put_cstr(&ds, " + "); + } + ds_put_format(&ds, "h%td: %"PRIu64"kB", + e - port->bond_hash, e->tx_bytes / 1024); + } + ds_put_cstr(&ds, ")"); + } + } + VLOG_DBG("bond %s:%s", port->name, ds_cstr(&ds)); + ds_destroy(&ds); + } +} + +/* Shifts 'hash' from 'from' to 'to' within 'port'. */ +static void +bond_shift_load(struct slave_balance *from, struct slave_balance *to, + struct bond_entry *hash) +{ + struct port *port = from->iface->port; + uint64_t delta = hash->tx_bytes; + + VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %td) " + "from %s to %s (now carrying %"PRIu64"kB and " + "%"PRIu64"kB load, respectively)", + port->name, delta / 1024, hash - port->bond_hash, + from->iface->name, to->iface->name, + (from->tx_bytes - delta) / 1024, + (to->tx_bytes + delta) / 1024); + + /* Delete element from from->hashes. + * + * We don't bother to add the element to to->hashes because not only would + * it require more work, the only purpose it would be to allow that hash to + * be migrated to another slave in this rebalancing run, and there is no + * point in doing that. */ + if (from->hashes[0] == hash) { + from->hashes++; + } else { + int i = hash - from->hashes[0]; + memmove(from->hashes + i, from->hashes + i + 1, + (from->n_hashes - (i + 1)) * sizeof *from->hashes); + } + from->n_hashes--; + + /* Shift load away from 'from' to 'to'. */ + from->tx_bytes -= delta; + to->tx_bytes += delta; + + /* Arrange for flows to be revalidated. */ + ofproto_revalidate(port->bridge->ofproto, hash->iface_tag); + hash->iface_idx = to->iface->port_ifidx; + hash->iface_tag = tag_create_random(); + +} + +static void +bond_rebalance_port(struct port *port) +{ + struct slave_balance bals[DP_MAX_PORTS]; + size_t n_bals; + struct bond_entry *hashes[BOND_MASK + 1]; + struct slave_balance *b, *from, *to; + struct bond_entry *e; + size_t i; + + /* Sets up 'bals' to describe each of the port's interfaces, sorted in + * descending order of tx_bytes, so that bals[0] represents the most + * heavily loaded slave and bals[n_bals - 1] represents the least heavily + * loaded slave. + * + * The code is a bit tricky: to avoid dynamically allocating a 'hashes' + * array for each slave_balance structure, we sort our local array of + * hashes in order by slave, so that all of the hashes for a given slave + * become contiguous in memory, and then we point each 'hashes' members of + * a slave_balance structure to the start of a contiguous group. */ + n_bals = port->n_ifaces; + for (b = bals; b < &bals[n_bals]; b++) { + b->iface = port->ifaces[b - bals]; + b->tx_bytes = 0; + b->hashes = NULL; + b->n_hashes = 0; + } + for (i = 0; i <= BOND_MASK; i++) { + hashes[i] = &port->bond_hash[i]; + } + qsort(hashes, BOND_MASK + 1, sizeof *hashes, compare_bond_entries); + for (i = 0; i <= BOND_MASK; i++) { + e = hashes[i]; + if (e->iface_idx >= 0 && e->iface_idx < port->n_ifaces) { + b = &bals[e->iface_idx]; + b->tx_bytes += e->tx_bytes; + if (!b->hashes) { + b->hashes = &hashes[i]; + } + b->n_hashes++; + } + } + qsort(bals, n_bals, sizeof *bals, compare_slave_balance); + log_bals(bals, n_bals, port); + + /* Discard slaves that aren't enabled (which were sorted to the back of the + * array earlier). */ + while (!bals[n_bals - 1].iface->enabled) { + n_bals--; + if (!n_bals) { + return; + } + } + + /* Shift load from the most-loaded slaves to the least-loaded slaves. */ + to = &bals[n_bals - 1]; + for (from = bals; from < to; ) { + uint64_t overload = from->tx_bytes - to->tx_bytes; + if (overload < to->tx_bytes >> 5 || overload < 100000) { + /* The extra load on 'from' (and all less-loaded slaves), compared + * to that of 'to' (the least-loaded slave), is less than ~3%, or + * it is less than ~1Mbps. No point in rebalancing. */ + break; + } else if (from->n_hashes == 1) { + /* 'from' only carries a single MAC hash, so we can't shift any + * load away from it, even though we want to. */ + from++; + } else { + /* 'from' is carrying significantly more load than 'to', and that + * load is split across at least two different hashes. Pick a hash + * to migrate to 'to' (the least-loaded slave), given that doing so + * must not cause 'to''s load to exceed 'from''s load. + * + * The sort order we use means that we prefer to shift away the + * smallest hashes instead of the biggest ones. There is little + * reason behind this decision; we could use the opposite sort + * order to shift away big hashes ahead of small ones. */ + size_t i; + + for (i = 0; i < from->n_hashes; i++) { + uint64_t delta = from->hashes[i]->tx_bytes; + if (to->tx_bytes + delta < from->tx_bytes - delta) { + break; + } + } + if (i < from->n_hashes) { + bond_shift_load(from, to, from->hashes[i]); + + /* Re-sort 'bals'. Note that this may make 'from' and 'to' + * point to different slave_balance structures. It is only + * valid to do these two operations in a row at all because we + * know that 'from' will not move past 'to' and vice versa. */ + resort_bals(from, bals, n_bals); + resort_bals(to, bals, n_bals); + } else { + from++; + } + } + } + + /* Implement exponentially weighted moving average. A weight of 1/2 causes + * historical data to decay to <1% in 7 rebalancing runs. */ + for (e = &port->bond_hash[0]; e <= &port->bond_hash[BOND_MASK]; e++) { + e->tx_bytes /= 2; + } +} + +/* Port functions. */ + +static void +port_create(struct bridge *br, const char *name) +{ + struct port *port; + + port = xcalloc(1, sizeof *port); + port->bridge = br; + port->port_idx = br->n_ports; + port->vlan = -1; + port->trunks = NULL; + port->name = xstrdup(name); + port->active_iface = -1; + port->stp_state = STP_DISABLED; + port->stp_state_tag = 0; + + if (br->n_ports >= br->allocated_ports) { + br->ports = x2nrealloc(br->ports, &br->allocated_ports, + sizeof *br->ports); + } + br->ports[br->n_ports++] = port; + + VLOG_INFO("created port %s on bridge %s", port->name, br->name); + bridge_flush(br); +} + +static void +port_reconfigure(struct port *port) +{ + bool bonded = cfg_has_section("bonding.%s", port->name); + struct svec old_ifaces, new_ifaces; + unsigned long *trunks; + int vlan; + size_t i; + + /* Collect old and new interfaces. */ + svec_init(&old_ifaces); + svec_init(&new_ifaces); + for (i = 0; i < port->n_ifaces; i++) { + svec_add(&old_ifaces, port->ifaces[i]->name); + } + svec_sort(&old_ifaces); + if (bonded) { + cfg_get_all_keys(&new_ifaces, "bonding.%s.slave", port->name); + if (!new_ifaces.n) { + VLOG_ERR("port %s: no interfaces specified for bonded port", + port->name); + } else if (new_ifaces.n == 1) { + VLOG_WARN("port %s: only 1 interface specified for bonded port", + port->name); + } + + port->updelay = cfg_get_int(0, "bonding.%s.updelay", port->name); + if (port->updelay < 0) { + port->updelay = 0; + } + port->downdelay = cfg_get_int(0, "bonding.%s.downdelay", port->name); + if (port->downdelay < 0) { + port->downdelay = 0; + } + } else { + svec_init(&new_ifaces); + svec_add(&new_ifaces, port->name); + } + + /* Get rid of deleted interfaces and add new interfaces. */ + for (i = 0; i < port->n_ifaces; i++) { + struct iface *iface = port->ifaces[i]; + if (!svec_contains(&new_ifaces, iface->name)) { + iface_destroy(iface); + } else { + i++; + } + } + for (i = 0; i < new_ifaces.n; i++) { + const char *name = new_ifaces.names[i]; + if (!svec_contains(&old_ifaces, name)) { + iface_create(port, name); + } + } + + /* Get VLAN tag. */ + vlan = -1; + if (cfg_has("vlan.%s.tag", port->name)) { + if (!bonded) { + vlan = cfg_get_vlan(0, "vlan.%s.tag", port->name); + if (vlan >= 0 && vlan <= 4095) { + VLOG_DBG("port %s: assigning VLAN tag %d", port->name, vlan); + } + } else { + /* It's possible that bonded, VLAN-tagged ports make sense. Maybe + * they even work as-is. But they have not been tested. */ + VLOG_WARN("port %s: VLAN tags not supported on bonded ports", + port->name); + } + } + if (port->vlan != vlan) { + port->vlan = vlan; + bridge_flush(port->bridge); + } + + /* Get trunked VLANs. */ + trunks = NULL; + if (vlan < 0) { + size_t n_trunks, n_errors; + size_t i; + + trunks = bitmap_allocate(4096); + n_trunks = cfg_count("vlan.%s.trunks", port->name); + n_errors = 0; + for (i = 0; i < n_trunks; i++) { + int trunk = cfg_get_vlan(i, "vlan.%s.trunks", port->name); + if (trunk >= 0) { + bitmap_set1(trunks, trunk); + } else { + n_errors++; + } + } + if (n_errors) { + VLOG_ERR("port %s: invalid values for %zu trunk VLANs", + port->name, n_trunks); + } + if (n_errors == n_trunks) { + if (n_errors) { + VLOG_ERR("port %s: no valid trunks, trunking all VLANs", + port->name); + } + bitmap_set_multiple(trunks, 0, 4096, 1); + } + } else { + if (cfg_has("vlan.%s.trunks", port->name)) { + VLOG_ERR("ignoring vlan.%s.trunks in favor of vlan.%s.vlan", + port->name, port->name); + } + } + if (trunks == NULL + ? port->trunks != NULL + : port->trunks == NULL || !bitmap_equal(trunks, port->trunks, 4096)) { + bridge_flush(port->bridge); + } + bitmap_free(port->trunks); + port->trunks = trunks; + + svec_destroy(&old_ifaces); + svec_destroy(&new_ifaces); +} + +static void +port_destroy(struct port *port) +{ + if (port) { + struct bridge *br = port->bridge; + struct port *del; + size_t i; + + proc_net_compat_update_vlan(port->name, NULL, 0); + + for (i = 0; i < MAX_MIRRORS; i++) { + struct mirror *m = br->mirrors[i]; + if (m && m->out_port == port) { + mirror_destroy(m); + } + } + + while (port->n_ifaces > 0) { + iface_destroy(port->ifaces[port->n_ifaces - 1]); + } + + del = br->ports[port->port_idx] = br->ports[--br->n_ports]; + del->port_idx = port->port_idx; + + free(port->ifaces); + bitmap_free(port->trunks); + free(port->name); + free(port); + bridge_flush(br); + } +} + +static struct port * +port_from_dp_ifidx(const struct bridge *br, uint16_t dp_ifidx) +{ + struct iface *iface = iface_from_dp_ifidx(br, dp_ifidx); + return iface ? iface->port : NULL; +} + +static struct port * +port_lookup(const struct bridge *br, const char *name) +{ + size_t i; + + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + if (!strcmp(port->name, name)) { + return port; + } + } + return NULL; +} + +static void +port_update_bonding(struct port *port) +{ + if (port->n_ifaces < 2) { + /* Not a bonded port. */ + if (port->bond_hash) { + free(port->bond_hash); + port->bond_hash = NULL; + proc_net_compat_update_bond(port->name, NULL); + } + } else { + if (!port->bond_hash) { + size_t i; + + port->bond_hash = xcalloc(BOND_MASK + 1, sizeof *port->bond_hash); + for (i = 0; i <= BOND_MASK; i++) { + struct bond_entry *e = &port->bond_hash[i]; + e->iface_idx = -1; + e->tx_bytes = 0; + } + port->no_ifaces_tag = tag_create_random(); + bond_choose_active_iface(port); + } + port_update_bond_compat(port); + } +} + +static void +port_update_bond_compat(struct port *port) +{ + struct compat_bond bond; + size_t i; + + if (port->n_ifaces < 2) { + return; + } + + bond.up = false; + bond.updelay = port->updelay; + bond.downdelay = port->downdelay; + bond.n_slaves = port->n_ifaces; + bond.slaves = xmalloc(port->n_ifaces * sizeof *bond.slaves); + for (i = 0; i < port->n_ifaces; i++) { + struct iface *iface = port->ifaces[i]; + struct compat_bond_slave *slave = &bond.slaves[i]; + slave->name = iface->name; + slave->up = ((iface->enabled && iface->delay_expires == LLONG_MAX) || + (!iface->enabled && iface->delay_expires != LLONG_MAX)); + if (slave->up) { + bond.up = true; + } + memcpy(slave->mac, iface->mac, ETH_ADDR_LEN); + } + proc_net_compat_update_bond(port->name, &bond); + free(bond.slaves); +} + +static void +port_update_vlan_compat(struct port *port) +{ + struct bridge *br = port->bridge; + char *vlandev_name = NULL; + + if (port->vlan > 0) { + /* Figure out the name that the VLAN device should actually have, if it + * existed. This takes some work because the VLAN device would not + * have port->name in its name; rather, it would have the trunk port's + * name, and 'port' would be attached to a bridge that also had the + * VLAN device one of its ports. So we need to find a trunk port that + * includes port->vlan. + * + * There might be more than one candidate. This doesn't happen on + * XenServer, so if it happens we just pick the first choice in + * alphabetical order instead of creating multiple VLAN devices. */ + size_t i; + for (i = 0; i < br->n_ports; i++) { + struct port *p = br->ports[i]; + if (port_trunks_vlan(p, port->vlan) + && p->n_ifaces + && (!vlandev_name || strcmp(p->name, vlandev_name) <= 0)) + { + const uint8_t *ea = p->ifaces[0]->mac; + if (!eth_addr_is_multicast(ea) && + !eth_addr_is_reserved(ea) && + !eth_addr_is_zero(ea)) { + vlandev_name = p->name; + } + } + } + } + proc_net_compat_update_vlan(port->name, vlandev_name, port->vlan); +} + +/* Interface functions. */ + +static void +iface_create(struct port *port, const char *name) +{ + enum netdev_flags flags; + struct iface *iface; + + iface = xcalloc(1, sizeof *iface); + iface->port = port; + iface->port_ifidx = port->n_ifaces; + iface->name = xstrdup(name); + iface->dp_ifidx = -1; + iface->tag = tag_create_random(); + iface->enabled = true; + iface->delay_expires = LLONG_MAX; + + netdev_nodev_get_etheraddr(name, iface->mac); + + if (!netdev_nodev_get_flags(name, &flags)) { + iface->enabled = (flags & NETDEV_UP) != 0; + } + + if (port->n_ifaces >= port->allocated_ifaces) { + port->ifaces = x2nrealloc(port->ifaces, &port->allocated_ifaces, + sizeof *port->ifaces); + } + port->ifaces[port->n_ifaces++] = iface; + if (port->n_ifaces > 1) { + port->bridge->has_bonded_ports = true; + } + + VLOG_DBG("attached network device %s to port %s", iface->name, port->name); + + port_update_bonding(port); + bridge_flush(port->bridge); +} + +static void +iface_destroy(struct iface *iface) +{ + if (iface) { + struct port *port = iface->port; + struct bridge *br = port->bridge; + bool del_active = port->active_iface == iface->port_ifidx; + struct iface *del; + + if (iface->dp_ifidx >= 0) { + port_array_set(&br->ifaces, iface->dp_ifidx, NULL); + } + + del = port->ifaces[iface->port_ifidx] = port->ifaces[--port->n_ifaces]; + del->port_ifidx = iface->port_ifidx; + + free(iface->name); + free(iface); + + if (del_active) { + ofproto_revalidate(port->bridge->ofproto, port->active_iface_tag); + bond_choose_active_iface(port); + } + + port_update_bonding(port); + bridge_flush(port->bridge); + } +} + +static struct iface * +iface_lookup(const struct bridge *br, const char *name) +{ + size_t i, j; + + for (i = 0; i < br->n_ports; i++) { + struct port *port = br->ports[i]; + for (j = 0; j < port->n_ifaces; j++) { + struct iface *iface = port->ifaces[j]; + if (!strcmp(iface->name, name)) { + return iface; + } + } + } + return NULL; +} + +static struct iface * +iface_from_dp_ifidx(const struct bridge *br, uint16_t dp_ifidx) +{ + return port_array_get(&br->ifaces, dp_ifidx); +} + +/* Port mirroring. */ + +static void +mirror_reconfigure(struct bridge *br) +{ + struct svec old_mirrors, new_mirrors; + size_t i; + + /* Collect old and new mirrors. */ + svec_init(&old_mirrors); + svec_init(&new_mirrors); + cfg_get_subsections(&new_mirrors, "mirror.%s", br->name); + for (i = 0; i < MAX_MIRRORS; i++) { + if (br->mirrors[i]) { + svec_add(&old_mirrors, br->mirrors[i]->name); + } + } + + /* Get rid of deleted mirrors and add new mirrors. */ + svec_sort(&old_mirrors); + assert(svec_is_unique(&old_mirrors)); + svec_sort(&new_mirrors); + assert(svec_is_unique(&new_mirrors)); + for (i = 0; i < MAX_MIRRORS; i++) { + struct mirror *m = br->mirrors[i]; + if (m && !svec_contains(&new_mirrors, m->name)) { + mirror_destroy(m); + } + } + for (i = 0; i < new_mirrors.n; i++) { + const char *name = new_mirrors.names[i]; + if (!svec_contains(&old_mirrors, name)) { + mirror_create(br, name); + } + } + svec_destroy(&old_mirrors); + svec_destroy(&new_mirrors); + + /* Reconfigure all mirrors. */ + for (i = 0; i < MAX_MIRRORS; i++) { + if (br->mirrors[i]) { + mirror_reconfigure_one(br->mirrors[i]); + } + } + + /* Update port reserved status. */ + for (i = 0; i < br->n_ports; i++) { + br->ports[i]->is_mirror_output_port = false; + } + for (i = 0; i < MAX_MIRRORS; i++) { + struct mirror *m = br->mirrors[i]; + if (m && m->out_port) { + m->out_port->is_mirror_output_port = true; + } + } +} + +static void +mirror_create(struct bridge *br, const char *name) +{ + struct mirror *m; + size_t i; + + for (i = 0; ; i++) { + if (i >= MAX_MIRRORS) { + VLOG_WARN("bridge %s: maximum of %d port mirrors reached, " + "cannot create %s", br->name, MAX_MIRRORS, name); + return; + } + if (!br->mirrors[i]) { + break; + } + } + + VLOG_INFO("created port mirror %s on bridge %s", name, br->name); + bridge_flush(br); + + br->mirrors[i] = m = xcalloc(1, sizeof *m); + m->bridge = br; + m->idx = i; + m->name = xstrdup(name); + svec_init(&m->src_ports); + svec_init(&m->dst_ports); + m->vlans = NULL; + m->n_vlans = 0; + m->out_vlan = -1; + m->out_port = NULL; +} + +static void +mirror_destroy(struct mirror *m) +{ + if (m) { + struct bridge *br = m->bridge; + size_t i; + + for (i = 0; i < br->n_ports; i++) { + br->ports[i]->src_mirrors &= ~(MIRROR_MASK_C(1) << m->idx); + br->ports[i]->dst_mirrors &= ~(MIRROR_MASK_C(1) << m->idx); + } + + svec_destroy(&m->src_ports); + svec_destroy(&m->dst_ports); + free(m->vlans); + + m->bridge->mirrors[m->idx] = NULL; + free(m); + + bridge_flush(br); + } +} + +static void +prune_ports(struct mirror *m, struct svec *ports) +{ + struct svec tmp; + size_t i; + + svec_sort_unique(ports); + + svec_init(&tmp); + for (i = 0; i < ports->n; i++) { + const char *name = ports->names[i]; + if (port_lookup(m->bridge, name)) { + svec_add(&tmp, name); + } else { + VLOG_WARN("mirror.%s.%s: cannot match on nonexistent port %s", + m->bridge->name, m->name, name); + } + } + svec_swap(ports, &tmp); + svec_destroy(&tmp); +} + +static size_t +prune_vlans(struct mirror *m, struct svec *vlan_strings, int **vlans) +{ + size_t n_vlans, i; + + /* This isn't perfect: it won't combine "0" and "00", and the textual sort + * order won't give us numeric sort order. But that's good enough for what + * we need right now. */ + svec_sort_unique(vlan_strings); + + *vlans = xmalloc(sizeof *vlans * vlan_strings->n); + n_vlans = 0; + for (i = 0; i < vlan_strings->n; i++) { + const char *name = vlan_strings->names[i]; + int vlan; + if (!str_to_int(name, 10, &vlan) || vlan < 0 || vlan > 4095) { + VLOG_WARN("mirror.%s.%s.select.vlan: ignoring invalid VLAN %s", + m->bridge->name, m->name, name); + } else { + (*vlans)[n_vlans++] = vlan; + } + } + return n_vlans; +} + +static bool +vlan_is_mirrored(const struct mirror *m, int vlan) +{ + size_t i; + + for (i = 0; i < m->n_vlans; i++) { + if (m->vlans[i] == vlan) { + return true; + } + } + return false; +} + +static bool +port_trunks_any_mirrored_vlan(const struct mirror *m, const struct port *p) +{ + size_t i; + + for (i = 0; i < m->n_vlans; i++) { + if (port_trunks_vlan(p, m->vlans[i])) { + return true; + } + } + return false; +} + +static void +mirror_reconfigure_one(struct mirror *m) +{ + char *pfx = xasprintf("mirror.%s.%s", m->bridge->name, m->name); + struct svec src_ports, dst_ports, ports; + struct svec vlan_strings; + mirror_mask_t mirror_bit; + const char *out_port_name; + struct port *out_port; + int out_vlan; + size_t n_vlans; + int *vlans; + size_t i; + bool mirror_all_ports; + + /* Get output port. */ + out_port_name = cfg_get_key(0, "mirror.%s.%s.output.port", + m->bridge->name, m->name); + if (out_port_name) { + out_port = port_lookup(m->bridge, out_port_name); + if (!out_port) { + VLOG_ERR("%s.output.port: bridge %s does not have a port " + "named %s", pfx, m->bridge->name, out_port_name); + mirror_destroy(m); + free(pfx); + return; + } + out_vlan = -1; + + if (cfg_has("%s.output.vlan", pfx)) { + VLOG_ERR("%s.output.port and %s.output.vlan both specified; " + "ignoring %s.output.vlan", pfx, pfx, pfx); + } + } else if (cfg_has("%s.output.vlan", pfx)) { + out_port = NULL; + out_vlan = cfg_get_vlan(0, "%s.output.vlan", pfx); + } else { + VLOG_ERR("%s: neither %s.output.port nor %s.output.vlan specified, " + "but exactly one is required; disabling port mirror %s", + pfx, pfx, pfx, pfx); + mirror_destroy(m); + free(pfx); + return; + } + + /* Get all the ports, and drop duplicates and ports that don't exist. */ + svec_init(&src_ports); + svec_init(&dst_ports); + svec_init(&ports); + cfg_get_all_keys(&src_ports, "%s.select.src-port", pfx); + cfg_get_all_keys(&dst_ports, "%s.select.dst-port", pfx); + cfg_get_all_keys(&ports, "%s.select.port", pfx); + svec_append(&src_ports, &ports); + svec_append(&dst_ports, &ports); + svec_destroy(&ports); + prune_ports(m, &src_ports); + prune_ports(m, &dst_ports); + + /* Get all the vlans, and drop duplicate and invalid vlans. */ + svec_init(&vlan_strings); + cfg_get_all_keys(&vlan_strings, "%s.select.vlan", pfx); + n_vlans = prune_vlans(m, &vlan_strings, &vlans); + svec_destroy(&vlan_strings); + + /* Update mirror data. */ + if (!svec_equal(&m->src_ports, &src_ports) + || !svec_equal(&m->dst_ports, &dst_ports) + || m->n_vlans != n_vlans + || memcmp(m->vlans, vlans, sizeof *vlans * n_vlans) + || m->out_port != out_port + || m->out_vlan != out_vlan) { + bridge_flush(m->bridge); + } + svec_swap(&m->src_ports, &src_ports); + svec_swap(&m->dst_ports, &dst_ports); + free(m->vlans); + m->vlans = vlans; + m->n_vlans = n_vlans; + m->out_port = out_port; + m->out_vlan = out_vlan; + + /* If no selection criteria have been given, mirror for all ports. */ + mirror_all_ports = (!m->src_ports.n) && (!m->dst_ports.n) && (!m->n_vlans); + + /* Update ports. */ + mirror_bit = MIRROR_MASK_C(1) << m->idx; + for (i = 0; i < m->bridge->n_ports; i++) { + struct port *port = m->bridge->ports[i]; + + if (mirror_all_ports + || svec_contains(&m->src_ports, port->name) + || (m->n_vlans + && (!port->vlan + ? port_trunks_any_mirrored_vlan(m, port) + : vlan_is_mirrored(m, port->vlan)))) { + port->src_mirrors |= mirror_bit; + } else { + port->src_mirrors &= ~mirror_bit; + } + + if (mirror_all_ports || svec_contains(&m->dst_ports, port->name)) { + port->dst_mirrors |= mirror_bit; + } else { + port->dst_mirrors &= ~mirror_bit; + } + } + + /* Clean up. */ + svec_destroy(&src_ports); + svec_destroy(&dst_ports); + free(pfx); +} + +/* Spanning tree protocol. */ + +static void brstp_update_port_state(struct port *); + +static void +brstp_send_bpdu(struct ofpbuf *pkt, int port_no, void *br_) +{ + struct bridge *br = br_; + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5); + struct iface *iface = iface_from_dp_ifidx(br, port_no); + if (!iface) { + VLOG_WARN_RL(&rl, "%s: cannot send BPDU on unknown port %d", + br->name, port_no); + } else if (eth_addr_is_zero(iface->mac)) { + VLOG_WARN_RL(&rl, "%s: cannot send BPDU on port %d with unknown MAC", + br->name, port_no); + } else { + union ofp_action action; + struct eth_header *eth = pkt->l2; + flow_t flow; + + memcpy(eth->eth_src, iface->mac, ETH_ADDR_LEN); + + memset(&action, 0, sizeof action); + action.type = htons(OFPAT_OUTPUT); + action.output.len = htons(sizeof action); + action.output.port = htons(port_no); + + flow_extract(pkt, ODPP_NONE, &flow); + ofproto_send_packet(br->ofproto, &flow, &action, 1, pkt); + } + ofpbuf_delete(pkt); +} + +static void +brstp_reconfigure(struct bridge *br) +{ + size_t i; + + if (!cfg_get_bool(0, "stp.%s.enabled", br->name)) { + if (br->stp) { + stp_destroy(br->stp); + br->stp = NULL; + + bridge_flush(br); + } + } else { + uint64_t bridge_address, bridge_id; + int bridge_priority; + + bridge_address = cfg_get_mac(0, "stp.%s.address", br->name); + if (!bridge_address) { + if (br->stp) { + bridge_address = (stp_get_bridge_id(br->stp) + & ((UINT64_C(1) << 48) - 1)); + } else { + uint8_t mac[ETH_ADDR_LEN]; + eth_addr_random(mac); + bridge_address = eth_addr_to_uint64(mac); + } + } + + if (cfg_is_valid(CFG_INT | CFG_REQUIRED, "stp.%s.priority", + br->name)) { + bridge_priority = cfg_get_int(0, "stp.%s.priority", br->name); + } else { + bridge_priority = STP_DEFAULT_BRIDGE_PRIORITY; + } + + bridge_id = bridge_address | ((uint64_t) bridge_priority << 48); + if (!br->stp) { + br->stp = stp_create(br->name, bridge_id, brstp_send_bpdu, br); + br->stp_last_tick = time_msec(); + bridge_flush(br); + } else { + if (bridge_id != stp_get_bridge_id(br->stp)) { + stp_set_bridge_id(br->stp, bridge_id); + bridge_flush(br); + } + } + + for (i = 0; i < br->n_ports; i++) { + struct port *p = br->ports[i]; + int dp_ifidx; + struct stp_port *sp; + int path_cost, priority; + bool enable; + + if (!p->n_ifaces) { + continue; + } + dp_ifidx = p->ifaces[0]->dp_ifidx; + if (dp_ifidx < 0 || dp_ifidx >= STP_MAX_PORTS) { + continue; + } + + sp = stp_get_port(br->stp, dp_ifidx); + enable = (!cfg_is_valid(CFG_BOOL | CFG_REQUIRED, + "stp.%s.port.%s.enabled", + br->name, p->name) + || cfg_get_bool(0, "stp.%s.port.%s.enabled", + br->name, p->name)); + if (p->is_mirror_output_port) { + enable = false; + } + if (enable != (stp_port_get_state(sp) != STP_DISABLED)) { + bridge_flush(br); /* Might not be necessary. */ + if (enable) { + stp_port_enable(sp); + } else { + stp_port_disable(sp); + } + } + + path_cost = cfg_get_int(0, "stp.%s.port.%s.path-cost", + br->name, p->name); + stp_port_set_path_cost(sp, path_cost ? path_cost : 19 /* XXX */); + + priority = (cfg_is_valid(CFG_INT | CFG_REQUIRED, + "stp.%s.port.%s.priority", + br->name, p->name) + ? cfg_get_int(0, "stp.%s.port.%s.priority", + br->name, p->name) + : STP_DEFAULT_PORT_PRIORITY); + stp_port_set_priority(sp, priority); + } + + brstp_adjust_timers(br); + } + for (i = 0; i < br->n_ports; i++) { + brstp_update_port_state(br->ports[i]); + } +} + +static void +brstp_update_port_state(struct port *p) +{ + struct bridge *br = p->bridge; + enum stp_state state; + + /* Figure out new state. */ + state = STP_DISABLED; + if (br->stp && p->n_ifaces > 0) { + int dp_ifidx = p->ifaces[0]->dp_ifidx; + if (dp_ifidx >= 0 && dp_ifidx < STP_MAX_PORTS) { + state = stp_port_get_state(stp_get_port(br->stp, dp_ifidx)); + } + } + + /* Update state. */ + if (p->stp_state != state) { + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10); + VLOG_INFO_RL(&rl, "port %s: STP state changed from %s to %s", + p->name, stp_state_name(p->stp_state), + stp_state_name(state)); + if (p->stp_state == STP_DISABLED) { + bridge_flush(br); + } else { + ofproto_revalidate(p->bridge->ofproto, p->stp_state_tag); + } + p->stp_state = state; + p->stp_state_tag = (p->stp_state == STP_DISABLED ? 0 + : tag_create_random()); + } +} + +static void +brstp_adjust_timers(struct bridge *br) +{ + int hello_time = cfg_get_int(0, "stp.%s.hello-time", br->name); + int max_age = cfg_get_int(0, "stp.%s.max-age", br->name); + int forward_delay = cfg_get_int(0, "stp.%s.forward-delay", br->name); + + stp_set_hello_time(br->stp, hello_time ? hello_time : 2000); + stp_set_max_age(br->stp, max_age ? max_age : 20000); + stp_set_forward_delay(br->stp, forward_delay ? forward_delay : 15000); +} + +static void +brstp_run(struct bridge *br) +{ + if (br->stp) { + long long int now = time_msec(); + long long int elapsed = now - br->stp_last_tick; + struct stp_port *sp; + + if (elapsed > 0) { + stp_tick(br->stp, MIN(INT_MAX, elapsed)); + br->stp_last_tick = now; + } + while (stp_get_changed_port(br->stp, &sp)) { + struct port *p = port_from_dp_ifidx(br, stp_port_no(sp)); + if (p) { + brstp_update_port_state(p); + } + } + } +} + +static void +brstp_wait(struct bridge *br) +{ + if (br->stp) { + poll_timer_wait(1000); + } +} diff --git a/vswitchd/bridge.h b/vswitchd/bridge.h new file mode 100644 index 00000000..b9435370 --- /dev/null +++ b/vswitchd/bridge.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2008, 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + */ + +#ifndef VSWITCHD_BRIDGE_H +#define VSWITCHD_BRIDGE_H 1 + +#include +#include "list.h" + +struct svec; + +void bridge_init(void); +void bridge_reconfigure(void); +int bridge_run(void); +void bridge_wait(void); +bool bridge_exists(const char *); +uint64_t bridge_get_datapathid(const char *name); +void bridge_get_ifaces(struct svec *svec); + +#endif /* bridge.h */ diff --git a/vswitchd/mgmt.c b/vswitchd/mgmt.c new file mode 100644 index 00000000..f5dcd184 --- /dev/null +++ b/vswitchd/mgmt.c @@ -0,0 +1,679 @@ +/* Copyright (c) 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + * + */ + +#include + +#include +#include +#include +#include + +#include "bridge.h" +#include "cfg.h" +#include "coverage.h" +#include "list.h" +#include "mgmt.h" +#include "openflow/nicira-ext.h" +#include "openflow/openflow.h" +#include "openflow/openflow-mgmt.h" +#include "ofpbuf.h" +#include "ovs-vswitchd.h" +#include "packets.h" +#include "rconn.h" +#include "svec.h" +#include "vconn.h" +#include "vconn-ssl.h" +#include "xtoxll.h" + +#define THIS_MODULE VLM_mgmt +#include "vlog.h" + +#define MAX_BACKOFF_DEFAULT 15 +#define INACTIVITY_PROBE_DEFAULT 15 + +static struct svec mgmt_cfg; +static uint8_t cfg_cookie[CFG_COOKIE_LEN]; +static struct rconn *mgmt_rconn; +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 60); +static struct svec capabilities; +uint64_t mgmt_id; + + +#define TXQ_LIMIT 128 /* Max number of packets to queue for tx. */ +struct rconn_packet_counter *txqlen; /* # pkts queued for tx on mgmt_rconn. */ + +static uint64_t pick_fallback_mgmt_id(void); +static void send_config_update(uint32_t xid, bool use_xid); +static void send_resources_update(uint32_t xid, bool use_xid); + +void +mgmt_init(void) +{ + txqlen = rconn_packet_counter_create(); + + svec_init(&mgmt_cfg); + svec_init(&capabilities); + svec_add_nocopy(&capabilities, + xasprintf("com.nicira.mgmt.manager=true\n")); + + mgmt_id = cfg_get_dpid(0, "mgmt.id"); + if (!mgmt_id) { + /* Randomly generate a mgmt id */ + mgmt_id = pick_fallback_mgmt_id(); + } +} + +#ifdef HAVE_OPENSSL +static bool +config_string_change(const char *key, char **valuep) +{ + const char *value = cfg_get_string(0, "%s", key); + if (value && (!*valuep || strcmp(value, *valuep))) { + free(*valuep); + *valuep = xstrdup(value); + return true; + } else { + return false; + } +} + +static void +mgmt_configure_ssl(void) +{ + static char *private_key_file; + static char *certificate_file; + static char *cacert_file; + + /* XXX SSL should be configurable separate from the bridges. + * XXX should be possible to de-configure SSL. */ + if (config_string_change("ssl.private-key", &private_key_file)) { + vconn_ssl_set_private_key_file(private_key_file); + } + + if (config_string_change("ssl.certificate", &certificate_file)) { + vconn_ssl_set_certificate_file(certificate_file); + } + + if (config_string_change("ssl.ca-cert", &cacert_file)) { + vconn_ssl_set_ca_cert_file(cacert_file, + cfg_get_bool(0, "ssl.bootstrap-ca-cert")); + } +} +#endif + +void +mgmt_reconfigure(void) +{ + struct svec new_cfg; + uint8_t new_cookie[CFG_COOKIE_LEN]; + bool cfg_updated = false; + const char *controller_name; + int max_backoff; + int inactivity_probe; + int retval; + + if (!cfg_has_section("mgmt")) { + if (mgmt_rconn) { + rconn_destroy(mgmt_rconn); + mgmt_rconn = NULL; + } + return; + } + + /* If this is an established connection, send a resources update. */ + /* xxx This is wasteful if there were no resource changes!!! */ + if (mgmt_rconn) { + send_resources_update(0, false); + } + + cfg_get_cookie(new_cookie); + if (memcmp(cfg_cookie, new_cookie, sizeof(cfg_cookie))) { + memcpy(cfg_cookie, new_cookie, sizeof(cfg_cookie)); + cfg_updated = true; + } + + svec_init(&new_cfg); + cfg_get_section(&new_cfg, "mgmt"); + if (svec_equal(&mgmt_cfg, &new_cfg)) { + /* Reconnecting to the controller causes the config file to be + * resent automatically. If we're not reconnecting and the + * config file has changed, we need to notify the controller of + * changes. */ + if (cfg_updated && mgmt_rconn) { + send_config_update(0, false); + } + svec_destroy(&new_cfg); + return; + } + + controller_name = cfg_get_string(0, "mgmt.controller"); + if (!controller_name) { + VLOG_ERR("no controller specified for managment"); + svec_destroy(&new_cfg); + return; + } + + max_backoff = cfg_get_int(0, "mgmt.max-backoff"); + if (max_backoff < 1) { + max_backoff = MAX_BACKOFF_DEFAULT; + } else if (max_backoff > 3600) { + max_backoff = 3600; + } + + inactivity_probe = cfg_get_int(0, "mgmt.inactivity-probe"); + if (inactivity_probe < 5) { + inactivity_probe = INACTIVITY_PROBE_DEFAULT; + } + + /* xxx If this changes, we need to restart bridges to use new id, + * xxx but they need the id before the connect to controller, but we + * xxx need their dpids. */ + /* Check if a different mgmt id has been assigned. */ + if (cfg_has("mgmt.id")) { + uint64_t cfg_mgmt_id = cfg_get_dpid(0, "mgmt.id"); + if (cfg_mgmt_id != mgmt_id) { + mgmt_id = cfg_mgmt_id; + } + } + + svec_swap(&new_cfg, &mgmt_cfg); + svec_destroy(&new_cfg); + +#ifdef HAVE_OPENSSL + /* Configure SSL. */ + mgmt_configure_ssl(); +#endif + + if (mgmt_rconn) { + rconn_destroy(mgmt_rconn); + mgmt_rconn = NULL; + } + mgmt_rconn = rconn_create(inactivity_probe, max_backoff); + retval = rconn_connect(mgmt_rconn, controller_name); + if (retval == EAFNOSUPPORT) { + VLOG_ERR("no support for %s vconn", controller_name); + } +} + +static int +send_openflow_buffer(struct ofpbuf *buffer) +{ + int retval; + + if (!mgmt_rconn) { + VLOG_ERR("attempt to send openflow packet with no rconn\n"); + return EINVAL; + } + + update_openflow_length(buffer); + retval = rconn_send_with_limit(mgmt_rconn, buffer, txqlen, TXQ_LIMIT); + if (retval) { + VLOG_WARN_RL(&rl, "send to %s failed: %s", + rconn_get_name(mgmt_rconn), strerror(retval)); + } + return retval; +} + +static void +send_features_reply(uint32_t xid) +{ + struct ofpbuf *buffer; + struct ofp_switch_features *ofr; + + ofr = make_openflow_xid(sizeof *ofr, OFPT_FEATURES_REPLY, xid, &buffer); + ofr->datapath_id = 0; + ofr->n_tables = 0; + ofr->n_buffers = 0; + ofr->capabilities = 0; + ofr->actions = 0; + send_openflow_buffer(buffer); +} + +static void * +make_ofmp_xid(size_t ofmp_len, uint16_t type, uint32_t xid, + struct ofpbuf **bufferp) +{ + struct ofmp_header *oh; + + oh = make_openflow_xid(ofmp_len, OFPT_VENDOR, xid, bufferp); + oh->header.vendor = htonl(NX_VENDOR_ID); + oh->header.subtype = htonl(NXT_MGMT); + oh->type = htons(type); + + return oh; +} + +static void * +make_ofmp(size_t ofmp_len, uint16_t type, struct ofpbuf **bufferp) +{ + struct ofmp_header *oh; + + oh = make_openflow(ofmp_len, OFPT_VENDOR, bufferp); + oh->header.vendor = htonl(NX_VENDOR_ID); + oh->header.subtype = htonl(NXT_MGMT); + oh->type = htons(type); + + return oh; +} + +static void +send_capability_reply(uint32_t xid) +{ + int i; + struct ofpbuf *buffer; + struct ofmp_capability_reply *ofmpcr; + + ofmpcr = make_ofmp_xid(sizeof *ofmpcr, OFMPT_CAPABILITY_REPLY, + xid, &buffer); + ofmpcr->format = htonl(OFMPCOF_SIMPLE); + ofmpcr->mgmt_id = htonll(mgmt_id); + for (i=0; itype = htons(OFMPTSR_DP); + dp_tlv->len = htons(sizeof(*dp_tlv)); + + dp_tlv->dp_id = htonll(dp_id); + memcpy(dp_tlv->name, br_list.names[i], strlen(br_list.names[i])+1); + } + + /* Put end marker. */ + tlv = ofpbuf_put_zeros(buffer, sizeof(*tlv)); + tlv->type = htons(OFMPTSR_END); + tlv->len = htons(sizeof(*tlv)); + send_openflow_buffer(buffer); +} + +static void +send_config_update(uint32_t xid, bool use_xid) +{ + struct ofpbuf *buffer; + struct ofmp_config_update *ofmpcu; + + if (use_xid) { + ofmpcu = make_ofmp_xid(sizeof *ofmpcu, OFMPT_CONFIG_UPDATE, + xid, &buffer); + } else { + ofmpcu = make_ofmp(sizeof *ofmpcu, OFMPT_CONFIG_UPDATE, &buffer); + } + + ofmpcu->format = htonl(OFMPCOF_SIMPLE); + memcpy(ofmpcu->cookie, cfg_cookie, sizeof(ofmpcu->cookie)); + cfg_buf_put(buffer); + send_openflow_buffer(buffer); +} + +static void +send_config_update_ack(uint32_t xid, bool success) +{ + struct ofpbuf *buffer; + struct ofmp_config_update_ack *ofmpcua; + + ofmpcua = make_ofmp_xid(sizeof *ofmpcua, OFMPT_CONFIG_UPDATE_ACK, + xid, &buffer); + + ofmpcua->format = htonl(OFMPCOF_SIMPLE); + if (success) { + ofmpcua->flags = htonl(OFMPCUAF_SUCCESS); + } + cfg_get_cookie(ofmpcua->cookie); + send_openflow_buffer(buffer); +} + +static void +send_ofmp_error_msg(uint32_t xid, uint16_t type, uint16_t code, + const void *data, size_t len) +{ + struct ofpbuf *buffer; + struct ofmp_error_msg *oem; + + oem = make_ofmp_xid(sizeof(*oem)+len, OFMPT_ERROR, xid, &buffer); + oem->type = htons(type); + oem->code = htons(code); + memcpy(oem->data, data, len); + send_openflow_buffer(buffer); +} + +static void +send_error_msg(uint32_t xid, uint16_t type, uint16_t code, + const void *data, size_t len) +{ + struct ofpbuf *buffer; + struct ofp_error_msg *oem; + + oem = make_openflow_xid(sizeof(*oem)+len, OFPT_ERROR, xid, &buffer); + oem->type = htons(type); + oem->code = htons(code); + memcpy(oem->data, data, len); + send_openflow_buffer(buffer); +} + +static int +recv_echo_request(uint32_t xid UNUSED, const void *msg) +{ + const struct ofp_header *rq = msg; + send_openflow_buffer(make_echo_reply(rq)); + return 0; +} + +static int +recv_features_request(uint32_t xid, const void *msg UNUSED) +{ + send_features_reply(xid); + return 0; +} + +static int +recv_set_config(uint32_t xid UNUSED, const void *msg UNUSED) +{ + /* Nothing to configure! */ + return 0; +} + +static int +recv_ofmp_capability_request(uint32_t xid, const struct ofmp_header *ofmph) +{ + struct ofmp_capability_request *ofmpcr; + + if (htons(ofmph->header.header.length) != sizeof(*ofmpcr)) { + /* xxx Send error */ + return -EINVAL; + } + + ofmpcr = (struct ofmp_capability_request *)ofmph; + if (ofmpcr->format != htonl(OFMPCAF_SIMPLE)) { + /* xxx Send error */ + return -EINVAL; + } + + send_capability_reply(xid); + + return 0; +} + +static int +recv_ofmp_resources_request(uint32_t xid, const void *msg UNUSED) +{ + send_resources_update(xid, true); + return 0; +} + +static int +recv_ofmp_config_request(uint32_t xid, const struct ofmp_header *ofmph) +{ + struct ofmp_config_request *ofmpcr; + + if (htons(ofmph->header.header.length) != sizeof(*ofmpcr)) { + /* xxx Send error */ + return -EINVAL; + } + + ofmpcr = (struct ofmp_config_request *)ofmph; + if (ofmpcr->format != htonl(OFMPCOF_SIMPLE)) { + /* xxx Send error */ + return -EINVAL; + } + + send_config_update(xid, true); + + return 0; +} + +static int +recv_ofmp_config_update(uint32_t xid, const struct ofmp_header *ofmph) +{ + struct ofmp_config_update *ofmpcu; + int data_len; + + data_len = htons(ofmph->header.header.length) - sizeof(*ofmpcu); + if (data_len <= sizeof(*ofmpcu)) { + /* xxx Send error. */ + return -EINVAL; + } + + ofmpcu = (struct ofmp_config_update *)ofmph; + if (ofmpcu->format != htonl(OFMPCOF_SIMPLE)) { + /* xxx Send error */ + return -EINVAL; + } + + /* Check if the supplied cookie matches our current understanding of + * it. If they don't match, tell the controller and let it sort + * things out. */ + if (cfg_lock(ofmpcu->cookie, 0)) { + /* xxx cfg_lock can fail for other reasons, such as being + * xxx locked... */ + VLOG_WARN_RL(&rl, "config update failed due to bad cookie\n"); + send_config_update_ack(xid, false); + return 0; + } + + /* xxx We should probably do more sanity checking than this. */ + + cfg_write_data(ofmpcu->data, data_len); + cfg_unlock(); + + /* Send the ACK before running reconfigure, since our management + * connection settings may have changed. */ + send_config_update_ack(xid, true); + + reconfigure(); + + + return 0; +} + +static +int recv_ofmp(uint32_t xid, struct ofmp_header *ofmph) +{ + /* xxx Should sanity-check for min/max length */ + switch (ntohs(ofmph->type)) + { + case OFMPT_CAPABILITY_REQUEST: + return recv_ofmp_capability_request(xid, ofmph); + case OFMPT_RESOURCES_REQUEST: + return recv_ofmp_resources_request(xid, ofmph); + case OFMPT_CONFIG_REQUEST: + return recv_ofmp_config_request(xid, ofmph); + case OFMPT_CONFIG_UPDATE: + return recv_ofmp_config_update(xid, ofmph); + default: + VLOG_WARN_RL(&rl, "unknown mgmt message: %d", + ntohs(ofmph->type)); + return -EINVAL; + } +} + +static int +recv_nx_msg(uint32_t xid, const void *oh) +{ + const struct nicira_header *nh = oh; + + switch (ntohl(nh->subtype)) { + + case NXT_MGMT: + return recv_ofmp(xid, (struct ofmp_header *)oh); + + default: + send_error_msg(xid, OFPET_BAD_REQUEST, OFPBRC_BAD_SUBTYPE, + oh, htons(nh->header.length)); + return -EINVAL; + } +} + +static int +recv_vendor(uint32_t xid, const void *oh) +{ + const struct ofp_vendor_header *ovh = oh; + + switch (ntohl(ovh->vendor)) + { + case NX_VENDOR_ID: + return recv_nx_msg(xid, oh); + + default: + VLOG_WARN_RL(&rl, "unknown vendor: 0x%x", ntohl(ovh->vendor)); + send_error_msg(xid, OFPET_BAD_REQUEST, OFPBRC_BAD_VENDOR, + oh, ntohs(ovh->header.length)); + return -EINVAL; + } +} + +static int +handle_msg(uint32_t xid, const void *msg, size_t length) +{ + int (*handler)(uint32_t, const void *); + struct ofp_header *oh; + size_t min_size; + + COVERAGE_INC(mgmt_received); + + /* Check encapsulated length. */ + oh = (struct ofp_header *) msg; + if (ntohs(oh->length) > length) { + return -EINVAL; + } + assert(oh->version == OFP_VERSION); + + /* Figure out how to handle it. */ + switch (oh->type) { + case OFPT_ECHO_REQUEST: + min_size = sizeof(struct ofp_header); + handler = recv_echo_request; + break; + case OFPT_ECHO_REPLY: + return 0; + case OFPT_FEATURES_REQUEST: + min_size = sizeof(struct ofp_header); + handler = recv_features_request; + break; + case OFPT_SET_CONFIG: + min_size = sizeof(struct ofp_switch_config); + handler = recv_set_config; + break; + case OFPT_VENDOR: + min_size = sizeof(struct ofp_vendor_header); + handler = recv_vendor; + break; + default: + VLOG_WARN_RL(&rl, "unknown openflow type: %d", oh->type); + send_error_msg(xid, OFPET_BAD_REQUEST, OFPBRC_BAD_TYPE, + msg, length); + return -EINVAL; + } + + /* Handle it. */ + if (length < min_size) { + return -EFAULT; + } + return handler(xid, msg); +} + +void +mgmt_run(void) +{ + int i; + + if (!mgmt_rconn) { + return; + } + + rconn_run(mgmt_rconn); + + /* Do some processing, but cap it at a reasonable amount so that + * other processing doesn't starve. */ + for (i=0; i<50; i++) { + struct ofpbuf *buffer; + struct ofp_header *oh; + + buffer = rconn_recv(mgmt_rconn); + if (!buffer) { + break; + } + + if (buffer->size >= sizeof *oh) { + oh = buffer->data; + handle_msg(oh->xid, buffer->data, buffer->size); + ofpbuf_delete(buffer); + } else { + VLOG_WARN_RL(&rl, "received too-short OpenFlow message"); + } + } +} + +void +mgmt_wait(void) +{ + if (!mgmt_rconn) { + return; + } + + rconn_run_wait(mgmt_rconn); + rconn_recv_wait(mgmt_rconn); +} + +static uint64_t +pick_fallback_mgmt_id(void) +{ + uint8_t ea[ETH_ADDR_LEN]; + eth_addr_random(ea); + ea[0] = 0x00; /* Set Nicira OUI. */ + ea[1] = 0x23; + ea[2] = 0x20; + return eth_addr_to_uint64(ea); +} diff --git a/vswitchd/mgmt.h b/vswitchd/mgmt.h new file mode 100644 index 00000000..6db66c69 --- /dev/null +++ b/vswitchd/mgmt.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + */ + +#ifndef VSWITCHD_MGMT_H +#define VSWITCHD_MGMT_H 1 + +void mgmt_init(void); +void mgmt_reconfigure(void); +void mgmt_run(void); +void mgmt_wait(void); +uint64_t mgmt_get_mgmt_id(void); + +#endif /* mgmt.h */ diff --git a/vswitchd/ovs-brcompatd.8.in b/vswitchd/ovs-brcompatd.8.in new file mode 100644 index 00000000..ebd67028 --- /dev/null +++ b/vswitchd/ovs-brcompatd.8.in @@ -0,0 +1,49 @@ +.TH ovs\-brcompatd 8 "March 2009" "Open vSwitch" "Open vSwitch Manual" +.ds PN ovs\-brcompatd +. +.SH NAME +ovs\-brcompatd \- Bridge compatibility front-end for ovs\-vswitchd +. +.SH SYNOPSIS +.B ovs\-brcompatd +[\fIoptions\fR] \fIconfig\fR +. +.SH DESCRIPTION +A daemon that provides a legacy bridge front-end for \fBovs\-vswitchd\fR. It +does this by listening for bridge ioctl commands (e.g., those generated by +the \fBbrctl\fR program) to add or remove datapaths and the interfaces +that attach to them. It modifies \fIconfig\fR and forces +\fBovs\-vswitchd\fR to reload its configuration file. +.PP +.SH OPTIONS +.IP "\fB--reload-command=\fIcommand\fR" +Sets the command that \fBovs\-brcompatd\fR runs to force \fBovs\-vswitchd\fR to +reload its configuration file to \fIcommand\fR. The command is run in +a subshell, so it may contain arbitrary shell metacharacters, etc. +The \fB--help\fR option displays the default reload command. +.TP +\fB--prune-timeout=\fIsecs\fR +. +Sets the maximum time between pruning port entries to \fIsecs\fR seconds. +Pruning ports is the process of removing port entries from \fIconfig\fR +that no longer exist. If \fIsecs\fR is zero, then entries are never +pruned. The default prune timeout is 5 seconds. +.TP +\fB--lock-timeout=\fImsecs\fR +. +Sets the maximum time to wait for \fIconfig\fR to become unlocked to +\fImsecs\fR milliseconds. The default lock timeout is 500 milliseconds. +. +.so lib/daemon.man +.so lib/vlog.man +.so lib/common.man +.so lib/leak-checker.man +. +.SH NOTES +\fBovs\-brcompatd\fR requires the \fBbrcompat_mod.ko\fR kernel module to be +loaded. +.SH "SEE ALSO" +.BR ovs\-appctl (8), +.BR ovs\-vswitchd (8), +.BR ovs\-vswitchd.conf (5), +\fBINSTALL\fR in the Open vSwitch distribution. diff --git a/vswitchd/ovs-brcompatd.c b/vswitchd/ovs-brcompatd.c new file mode 100644 index 00000000..93d9469b --- /dev/null +++ b/vswitchd/ovs-brcompatd.c @@ -0,0 +1,766 @@ +/* Copyright (c) 2008, 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cfg.h" +#include "command-line.h" +#include "coverage.h" +#include "daemon.h" +#include "dirs.h" +#include "dpif.h" +#include "fatal-signal.h" +#include "fault.h" +#include "leak-checker.h" +#include "netdev.h" +#include "netlink.h" +#include "ofpbuf.h" +#include "openvswitch/brcompat-netlink.h" +#include "poll-loop.h" +#include "process.h" +#include "signals.h" +#include "svec.h" +#include "timeval.h" +#include "unixctl.h" +#include "util.h" + +#include "vlog.h" +#define THIS_MODULE VLM_brcompatd + + +/* xxx Just hangs if datapath is rmmod/insmod. Learn to reconnect? */ + +/* Actions to modify bridge compatibility configuration. */ +enum bmc_action { + BMC_ADD_DP, + BMC_DEL_DP, + BMC_ADD_PORT, + BMC_DEL_PORT +}; + +static void parse_options(int argc, char *argv[]); +static void usage(void) NO_RETURN; + +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 60); + +/* Maximum number of milliseconds to wait for the config file to be + * unlocked. If set to zero, no waiting will occur. */ +static int lock_timeout = 500; + +/* Maximum number of milliseconds to wait before pruning port entries that + * no longer exist. If set to zero, ports are never pruned. */ +static int prune_timeout = 5000; + +/* Config file shared with ovs-vswitchd (usually ovs-vswitchd.conf). */ +static char *config_file; + +/* Command to run (via system()) to reload the ovs-vswitchd configuration + * file. */ +static char *reload_command; + +/* Netlink socket to listen for interface changes. */ +static struct nl_sock *rtnl_sock; + +/* Netlink socket to bridge compatibility kernel module. */ +static struct nl_sock *brc_sock; + +/* The Generic Netlink family number used for bridge compatibility. */ +static int brc_family; + +static const struct nl_policy brc_multicast_policy[] = { + [BRC_GENL_A_MC_GROUP] = {.type = NL_A_U32 } +}; + +static const struct nl_policy rtnlgrp_link_policy[] = { + [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false }, + [IFLA_MASTER] = { .type = NL_A_U32, .optional = true }, +}; + +static int +lookup_brc_multicast_group(int *multicast_group) +{ + struct nl_sock *sock; + struct ofpbuf request, *reply; + struct nlattr *attrs[ARRAY_SIZE(brc_multicast_policy)]; + int retval; + + retval = nl_sock_create(NETLINK_GENERIC, 0, 0, 0, &sock); + if (retval) { + return retval; + } + ofpbuf_init(&request, 0); + nl_msg_put_genlmsghdr(&request, sock, 0, brc_family, + NLM_F_REQUEST, BRC_GENL_C_QUERY_MC, 1); + retval = nl_sock_transact(sock, &request, &reply); + ofpbuf_uninit(&request); + if (retval) { + nl_sock_destroy(sock); + return retval; + } + if (!nl_policy_parse(reply, NLMSG_HDRLEN + GENL_HDRLEN, + brc_multicast_policy, attrs, + ARRAY_SIZE(brc_multicast_policy))) { + nl_sock_destroy(sock); + ofpbuf_delete(reply); + return EPROTO; + } + *multicast_group = nl_attr_get_u32(attrs[BRC_GENL_A_MC_GROUP]); + nl_sock_destroy(sock); + ofpbuf_delete(reply); + + return 0; +} + +/* Opens a socket for brcompat notifications. Returns 0 if successful, + * otherwise a positive errno value. */ +static int +brc_open(struct nl_sock **sock) +{ + int multicast_group = 0; + int retval; + + retval = nl_lookup_genl_family(BRC_GENL_FAMILY_NAME, &brc_family); + if (retval) { + return retval; + } + + retval = lookup_brc_multicast_group(&multicast_group); + if (retval) { + return retval; + } + + retval = nl_sock_create(NETLINK_GENERIC, multicast_group, 0, 0, sock); + if (retval) { + return retval; + } + + return 0; +} + +static const struct nl_policy brc_dp_policy[] = { + [BRC_GENL_A_DP_NAME] = { .type = NL_A_STRING }, +}; + +static bool +bridge_exists(const char *name) +{ + return cfg_has_section("bridge.%s", name); +} + +static int +rewrite_and_reload_config(void) +{ + if (cfg_is_dirty()) { + int error1 = cfg_write(); + int error2 = cfg_read(); + long long int reload_start = time_msec(); + int error3 = system(reload_command); + long long int elapsed = time_msec() - reload_start; + COVERAGE_INC(brcompatd_reload); + if (elapsed > 0) { + VLOG_INFO("reload command executed in %lld ms", elapsed); + } + if (error3 == -1) { + VLOG_ERR("failed to execute reload command: %s", strerror(errno)); + } else if (error3 != 0) { + char *msg = process_status_msg(error3); + VLOG_ERR("reload command exited with error (%s)", msg); + free(msg); + } + return error1 ? error1 : error2 ? error2 : error3 ? ECHILD : 0; + } + return 0; +} + +/* Go through the configuration file and remove any ports that no longer + * exist associated with a bridge. */ +static void +prune_ports(void) +{ + int i, j; + int error; + struct svec bridges, delete; + + if (cfg_lock(NULL, 0)) { + /* Couldn't lock config file. */ + return; + } + + svec_init(&bridges); + svec_init(&delete); + cfg_get_subsections(&bridges, "bridge"); + for (i=0; idata)->nlmsg_seq; + *br_name = nl_attr_get_string(attrs[BRC_GENL_A_DP_NAME]); + if (port_name) { + *port_name = nl_attr_get_string(attrs[BRC_GENL_A_PORT_NAME]); + } + return 0; +} + +static void +send_reply(uint32_t seq, int error) +{ + struct ofpbuf msg; + int retval; + + /* Compose reply. */ + ofpbuf_init(&msg, 0); + nl_msg_put_genlmsghdr(&msg, brc_sock, 32, brc_family, NLM_F_REQUEST, + BRC_GENL_C_DP_RESULT, 1); + ((struct nlmsghdr *) msg.data)->nlmsg_seq = seq; + nl_msg_put_u32(&msg, BRC_GENL_A_ERR_CODE, error); + + /* Send reply. */ + retval = nl_sock_send(brc_sock, &msg, false); + if (retval) { + VLOG_WARN_RL(&rl, "replying to brcompat request: %s", + strerror(retval)); + } + ofpbuf_uninit(&msg); +} + +static int +handle_bridge_cmd(struct ofpbuf *buffer, bool add) +{ + const char *br_name; + uint32_t seq; + int error; + + error = parse_command(buffer, &seq, &br_name, NULL); + if (!error) { + error = add ? add_bridge(br_name) : del_bridge(br_name); + if (!error) { + error = rewrite_and_reload_config(); + } + send_reply(seq, error); + } + return error; +} + +static const struct nl_policy brc_port_policy[] = { + [BRC_GENL_A_DP_NAME] = { .type = NL_A_STRING }, + [BRC_GENL_A_PORT_NAME] = { .type = NL_A_STRING }, +}; + +static void +del_port(const char *br_name, const char *port_name) +{ + cfg_del_entry("bridge.%s.port=%s", br_name, port_name); + cfg_del_match("bonding.*.slave=%s", port_name); + cfg_del_match("vlan.%s.*", port_name); +} + +static int +handle_port_cmd(struct ofpbuf *buffer, bool add) +{ + const char *cmd_name = add ? "add-if" : "del-if"; + const char *br_name, *port_name; + uint32_t seq; + int error; + + error = parse_command(buffer, &seq, &br_name, &port_name); + if (!error) { + if (!bridge_exists(br_name)) { + VLOG_WARN("%s %s %s: no bridge named %s", + cmd_name, br_name, port_name, br_name); + error = EINVAL; + } else if (!netdev_exists(port_name)) { + VLOG_WARN("%s %s %s: no network device named %s", + cmd_name, br_name, port_name, port_name); + error = EINVAL; + } else { + if (add) { + cfg_add_entry("bridge.%s.port=%s", br_name, port_name); + } else { + del_port(br_name, port_name); + } + VLOG_INFO("%s %s %s: success", cmd_name, br_name, port_name); + error = rewrite_and_reload_config(); + } + send_reply(seq, error); + } + + return error; +} + +static int +brc_recv_update(void) +{ + int retval; + struct ofpbuf *buffer; + struct genlmsghdr *genlmsghdr; + + + buffer = NULL; + do { + ofpbuf_delete(buffer); + retval = nl_sock_recv(brc_sock, &buffer, false); + } while (retval == ENOBUFS + || (!retval + && (nl_msg_nlmsgerr(buffer, NULL) + || nl_msg_nlmsghdr(buffer)->nlmsg_type == NLMSG_DONE))); + if (retval) { + if (retval != EAGAIN) { + VLOG_WARN_RL(&rl, "brc_recv_update: %s", strerror(retval)); + } + return retval; + } + + genlmsghdr = nl_msg_genlmsghdr(buffer); + if (!genlmsghdr) { + VLOG_WARN_RL(&rl, "received packet too short for generic NetLink"); + goto error; + } + + if (nl_msg_nlmsghdr(buffer)->nlmsg_type != brc_family) { + VLOG_DBG_RL(&rl, "received type (%"PRIu16") != brcompat family (%d)", + nl_msg_nlmsghdr(buffer)->nlmsg_type, brc_family); + goto error; + } + + if (cfg_lock(NULL, lock_timeout)) { + /* Couldn't lock config file. */ + retval = EAGAIN; + goto error; + } + + switch (genlmsghdr->cmd) { + case BRC_GENL_C_DP_ADD: + retval = handle_bridge_cmd(buffer, true); + break; + + case BRC_GENL_C_DP_DEL: + retval = handle_bridge_cmd(buffer, false); + break; + + case BRC_GENL_C_PORT_ADD: + retval = handle_port_cmd(buffer, true); + break; + + case BRC_GENL_C_PORT_DEL: + retval = handle_port_cmd(buffer, false); + break; + + default: + retval = EPROTO; + } + + cfg_unlock(); + +error: + ofpbuf_delete(buffer); + return retval; +} + +/* Check for interface configuration changes announced through RTNL. */ +static void +rtnl_recv_update(void) +{ + struct ofpbuf *buf; + + int error = nl_sock_recv(rtnl_sock, &buf, false); + if (error == EAGAIN) { + /* Nothing to do. */ + } else if (error == ENOBUFS) { + VLOG_WARN_RL(&rl, "network monitor socket overflowed"); + } else if (error) { + VLOG_WARN_RL(&rl, "error on network monitor socket: %s", + strerror(error)); + } else { + struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)]; + struct nlmsghdr *nlh; + struct ifinfomsg *iim; + + nlh = ofpbuf_at(buf, 0, NLMSG_HDRLEN); + iim = ofpbuf_at(buf, NLMSG_HDRLEN, sizeof *iim); + if (!iim) { + VLOG_WARN_RL(&rl, "received bad rtnl message (no ifinfomsg)"); + ofpbuf_delete(buf); + return; + } + + if (!nl_policy_parse(buf, NLMSG_HDRLEN + sizeof(struct ifinfomsg), + rtnlgrp_link_policy, + attrs, ARRAY_SIZE(rtnlgrp_link_policy))) { + VLOG_WARN_RL(&rl,"received bad rtnl message (policy)"); + ofpbuf_delete(buf); + return; + } + if (nlh->nlmsg_type == RTM_DELLINK && attrs[IFLA_MASTER]) { + const char *port_name = nl_attr_get_string(attrs[IFLA_IFNAME]); + char br_name[IFNAMSIZ]; + uint32_t br_idx = nl_attr_get_u32(attrs[IFLA_MASTER]); + struct svec ports; + + if (!if_indextoname(br_idx, br_name)) { + ofpbuf_delete(buf); + return; + } + + if (cfg_lock(NULL, lock_timeout)) { + /* Couldn't lock config file. */ + /* xxx this should try again and print error msg. */ + ofpbuf_delete(buf); + return; + } + + svec_init(&ports); + cfg_get_all_keys(&ports, "bridge.%s.port", br_name); + svec_sort(&ports); + if (svec_contains(&ports, port_name)) { + del_port(br_name, port_name); + rewrite_and_reload_config(); + } + cfg_unlock(); + } + ofpbuf_delete(buf); + } +} + +int +main(int argc, char *argv[]) +{ + struct unixctl_server *unixctl; + int retval; + + set_program_name(argv[0]); + register_fault_handlers(); + time_init(); + vlog_init(); + parse_options(argc, argv); + signal(SIGPIPE, SIG_IGN); + process_init(); + + die_if_already_running(); + daemonize(); + + retval = unixctl_server_create(NULL, &unixctl); + if (retval) { + ovs_fatal(retval, "could not listen for vlog connections"); + } + + if (brc_open(&brc_sock)) { + ovs_fatal(0, "could not open brcompat socket. Check " + "\"brcompat\" kernel module."); + } + + if (prune_timeout) { + if (nl_sock_create(NETLINK_ROUTE, RTNLGRP_LINK, 0, 0, &rtnl_sock)) { + ovs_fatal(0, "could not create rtnetlink socket"); + } + } + + cfg_read(); + + for (;;) { + unixctl_server_run(unixctl); + brc_recv_update(); + + /* If 'prune_timeout' is non-zero, we actively prune from the + * config file any 'bridge..port' entries that are no + * longer valid. We use two methods: + * + * 1) The kernel explicitly notifies us of removed ports + * through the RTNL messages. + * + * 2) We periodically check all ports associated with bridges + * to see if they no longer exist. + */ + if (prune_timeout) { + rtnl_recv_update(); + prune_ports(); + + nl_sock_wait(rtnl_sock, POLLIN); + poll_timer_wait(prune_timeout); + } + + nl_sock_wait(brc_sock, POLLIN); + unixctl_server_wait(unixctl); + poll_block(); + } + + return 0; +} + +static void +parse_options(int argc, char *argv[]) +{ + enum { + OPT_LOCK_TIMEOUT = UCHAR_MAX + 1, + OPT_PRUNE_TIMEOUT, + OPT_RELOAD_COMMAND, + VLOG_OPTION_ENUMS, + LEAK_CHECKER_OPTION_ENUMS + }; + static struct option long_options[] = { + {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'V'}, + {"lock-timeout", required_argument, 0, OPT_LOCK_TIMEOUT}, + {"prune-timeout", required_argument, 0, OPT_PRUNE_TIMEOUT}, + {"reload-command", required_argument, 0, OPT_RELOAD_COMMAND}, + DAEMON_LONG_OPTIONS, + VLOG_LONG_OPTIONS, + LEAK_CHECKER_LONG_OPTIONS, + {0, 0, 0, 0}, + }; + char *short_options = long_options_to_short_options(long_options); + int error; + + reload_command = xasprintf("%s/ovs-appctl -t " + "%s/ovs-vswitchd.`cat %s/ovs-vswitchd.pid`.ctl " + "-e vswitchd/reload 2>&1 " + "| /usr/bin/logger -t brcompatd-reload", + ovs_bindir, ovs_rundir, ovs_rundir); + for (;;) { + int c; + + c = getopt_long(argc, argv, short_options, long_options, NULL); + if (c == -1) { + break; + } + + switch (c) { + case 'H': + case 'h': + usage(); + + case 'V': + OVS_PRINT_VERSION(0, 0); + exit(EXIT_SUCCESS); + + case OPT_LOCK_TIMEOUT: + lock_timeout = atoi(optarg); + break; + + case OPT_PRUNE_TIMEOUT: + prune_timeout = atoi(optarg) * 1000; + break; + + case OPT_RELOAD_COMMAND: + reload_command = optarg; + break; + + VLOG_OPTION_HANDLERS + DAEMON_OPTION_HANDLERS + LEAK_CHECKER_OPTION_HANDLERS + + case '?': + exit(EXIT_FAILURE); + + default: + abort(); + } + } + free(short_options); + + argc -= optind; + argv += optind; + + if (argc != 1) { + ovs_fatal(0, "exactly one non-option argument required; " + "use --help for usage"); + } + + config_file = argv[0]; + error = cfg_set_file(config_file); + if (error) { + ovs_fatal(error, "failed to add configuration file \"%s\"", + config_file); + } +} + +static void +usage(void) +{ + printf("%s: bridge compatibility front-end for ovs-vswitchd\n" + "usage: %s [OPTIONS] CONFIG\n" + "CONFIG is the configuration file used by ovs-vswitchd.\n", + program_name, program_name); + printf("\nConfiguration options:\n" + " --reload-command=COMMAND shell command to reload ovs-vswitchd\n" + " --prune-timeout=SECS wait at most SECS before pruning ports\n" + " --lock-timeout=MSECS wait at most MSECS for CONFIG to unlock\n" + ); + daemon_usage(); + vlog_usage(); + printf("\nOther options:\n" + " -h, --help display this help message\n" + " -V, --version display version information\n"); + leak_checker_usage(); + printf("\nThe default reload command is:\n%s\n", reload_command); + exit(EXIT_SUCCESS); +} diff --git a/vswitchd/ovs-vswitchd.8.in b/vswitchd/ovs-vswitchd.8.in new file mode 100644 index 00000000..28e55ba3 --- /dev/null +++ b/vswitchd/ovs-vswitchd.8.in @@ -0,0 +1,87 @@ +.TH ovs\-vswitchd 8 "March 2009" "Open vSwitch" "OpenVSwitch Manual" +.ds PN ovs\-vswitchd +. +.SH NAME +ovs\-vswitchd \- virtual switch daemon +. +.SH SYNOPSIS +.B ovs\-vswitchd +\fIconfig\fR +. +.SH DESCRIPTION +A daemon that manages and controls any number of virtual switches on +the local machine. +.PP +The mandatory \fIconfig\fR argument specifies a configuration file. +For a description of \fBovs\-vswitchd\fR configuration syntax, see +\fBovs\-vswitchd.conf\fR(5). +.PP +At startup or upon receipt of a \fBSIGHUP\fR signal, \fBovs\-vswitchd\fR +reads the configuration file. It sets up Open vSwitch datapaths and then +operates switching across each bridge described in its configuration +files. If a logfile was specified on the command line it will also +be opened or reopened. +.PP +\fBovs\-vswitchd\fR virtual switches may be configured with any of the +following features: +. +.IP \(bu +L2 switching with MAC learning. +. +.IP \(bu +NIC bonding with automatic fail-over and source MAC-based TX load +balancing ("SLB"). +. +.IP \(bu +802.1Q VLAN support. +. +.IP \(bu +Port mirroring, with optional VLAN tagging. +. +.IP \(bu +NetFlow v5 flow logging. +. +.IP \(bu +Connectivity to an external OpenFlow controller, such as NOX. +. +.PP +Only a single instance of \fBovs\-vswitchd\fR is intended to run at a time. +A single \fBovs\-vswitchd\fR can manage any number of virtual switches, up +to the maximum number of supported Open vSwitch datapaths. +.PP +\fBovs\-vswitchd\fR does all the necessary management of OpenVSwitch datapaths +itself. Thus, external tools, such \fBovs\-dpctl\fR(8), are not needed for +managing datapaths in conjunction with \fBovs\-vswitchd\fR, and their use +to modify datapaths when \fBovs\-vswitchd\fR is running can interfere with +its operation. (\fBovs\-dpctl\fR may still be useful for diagnostics.) +.PP +An Open vSwitch datapath kernel module must be loaded for \fBovs\-vswitchd\fR +to be useful. Please refer to the \fBINSTALL\fR file included in the +Open vSwitch distribution for instructions on how to build and load +the Open vSwitch kernel module. +.PP +.SH OPTIONS +.IP "\fB--fake-proc-net\fR" +Causes \fBovs\-vswitchd\fR to simulate some files in \fB/proc/net/vlan\fR +and \fB/proc/net/bonding\fR that some legacy software expects to +exist. This option should only be used if such legacy software is +actually in use. It requires the \fBbrcompat_mod.ko\fR kernel module +to be loaded. +. +.so lib/daemon.man +.so lib/vlog.man +.so lib/common.man +.so lib/leak-checker.man +. +.SH "BUGS" +. +Only Open vSwitch kernel-based datapaths are currently supported. In the +future, this restriction may be lifted. +.PP +Only Linux 2.6.\fIx\fR is currently supported. +. +.SH "SEE ALSO" +.BR ovs\-appctl (8), +.BR ovs\-vswitchd.conf (5), +.BR ovs\-brcompatd (8), +\fBINSTALL\fR in the Open vSwitch distribution. diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c new file mode 100644 index 00000000..9528ec5f --- /dev/null +++ b/vswitchd/ovs-vswitchd.c @@ -0,0 +1,255 @@ +/* Copyright (c) 2008, 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "bridge.h" +#include "cfg.h" +#include "command-line.h" +#include "compiler.h" +#include "daemon.h" +#include "fault.h" +#include "leak-checker.h" +#include "mgmt.h" +#include "ovs-vswitchd.h" +#include "poll-loop.h" +#include "port.h" +#include "proc-net-compat.h" +#include "process.h" +#include "signals.h" +#include "svec.h" +#include "timeval.h" +#include "unixctl.h" +#include "util.h" +#include "vconn-ssl.h" +#include "vconn.h" + +#include "vlog.h" +#define THIS_MODULE VLM_vswitchd + +static void parse_options(int argc, char *argv[]); +static void usage(void) NO_RETURN; +static void reload(struct unixctl_conn *, const char *args); + +static bool need_reconfigure; +static struct unixctl_conn **conns; +static size_t n_conns; + +int +main(int argc, char *argv[]) +{ + struct unixctl_server *unixctl; + struct signal *sighup; + int retval; + + set_program_name(argv[0]); + register_fault_handlers(); + time_init(); + vlog_init(); + parse_options(argc, argv); + signal(SIGPIPE, SIG_IGN); + sighup = signal_register(SIGHUP); + process_init(); + + die_if_already_running(); + daemonize(); + + retval = unixctl_server_create(NULL, &unixctl); + if (retval) { + ovs_fatal(retval, "could not listen for control connections"); + } + unixctl_command_register("vswitchd/reload", reload); + + cfg_read(); + mgmt_init(); + bridge_init(); + port_init(); + mgmt_reconfigure(); + + need_reconfigure = false; + for (;;) { + if (need_reconfigure || signal_poll(sighup)) { + need_reconfigure = false; + vlog_reopen_log_file(); + reconfigure(); + } + mgmt_run(); + if (bridge_run()) { + need_reconfigure = true; + } + unixctl_server_run(unixctl); + + if (need_reconfigure) { + poll_immediate_wake(); + } + signal_wait(sighup); + mgmt_wait(); + bridge_wait(); + unixctl_server_wait(unixctl); + poll_block(); + } + + return 0; +} + +static void +reload(struct unixctl_conn *conn, const char *args UNUSED) +{ + need_reconfigure = true; + conns = xrealloc(conns, sizeof *conns * (n_conns + 1)); + conns[n_conns++] = conn; +} + +void +reconfigure(void) +{ + size_t i; + + cfg_read(); + bridge_reconfigure(); + mgmt_reconfigure(); + port_reconfigure(); + + for (i = 0; i < n_conns; i++) { + unixctl_command_reply(conns[i], 202, NULL); + } + free(conns); + conns = NULL; + n_conns = 0; +} + +static void +parse_options(int argc, char *argv[]) +{ + enum { + OPT_PEER_CA_CERT = UCHAR_MAX + 1, + OPT_FAKE_PROC_NET, + VLOG_OPTION_ENUMS, + LEAK_CHECKER_OPTION_ENUMS + }; + static struct option long_options[] = { + {"help", no_argument, 0, 'h'}, + {"version", no_argument, 0, 'V'}, + {"fake-proc-net", no_argument, 0, OPT_FAKE_PROC_NET}, + DAEMON_LONG_OPTIONS, + VLOG_LONG_OPTIONS, + LEAK_CHECKER_LONG_OPTIONS, +#ifdef HAVE_OPENSSL + VCONN_SSL_LONG_OPTIONS + {"peer-ca-cert", required_argument, 0, OPT_PEER_CA_CERT}, +#endif + {0, 0, 0, 0}, + }; + char *short_options = long_options_to_short_options(long_options); + const char *config_file; + int error; + + for (;;) { + int c; + + c = getopt_long(argc, argv, short_options, long_options, NULL); + if (c == -1) { + break; + } + + switch (c) { + case 'H': + case 'h': + usage(); + + case 'V': + OVS_PRINT_VERSION(OFP_VERSION, OFP_VERSION); + exit(EXIT_SUCCESS); + + case OPT_FAKE_PROC_NET: + error = proc_net_compat_init(); + if (error) { + ovs_fatal(error, "failed to initialize /proc/net " + "compatibility"); + } + break; + + VLOG_OPTION_HANDLERS + DAEMON_OPTION_HANDLERS + VCONN_SSL_OPTION_HANDLERS + LEAK_CHECKER_OPTION_HANDLERS + +#ifdef HAVE_OPENSSL + case OPT_PEER_CA_CERT: + vconn_ssl_set_peer_ca_cert_file(optarg); + break; +#endif + + case '?': + exit(EXIT_FAILURE); + + default: + abort(); + } + } + free(short_options); + + argc -= optind; + argv += optind; + + if (argc != 1) { + ovs_fatal(0, "config file is only non-option argument; " + "use --help for usage"); + } + + config_file = argv[0]; + error = cfg_set_file(config_file); + if (error) { + ovs_fatal(error, "failed to add configuration file \"%s\"", + config_file); + } +} + +static void +usage(void) +{ + printf("%s: virtual switch daemon\n" + "usage: %s [OPTIONS] CONFIG\n" + "CONFIG is a configuration file in ovs-vswitchd.conf(5) format.\n", + program_name, program_name); + daemon_usage(); + vlog_usage(); + printf("\nLegacy compatibility options:\n" + " --fake-proc-net simulate some files in /proc/net\n" + "\nOther options:\n" + " -h, --help display this help message\n" + " -V, --version display version information\n"); + leak_checker_usage(); + exit(EXIT_SUCCESS); +} diff --git a/vswitchd/ovs-vswitchd.conf.5.in b/vswitchd/ovs-vswitchd.conf.5.in new file mode 100644 index 00000000..89872184 --- /dev/null +++ b/vswitchd/ovs-vswitchd.conf.5.in @@ -0,0 +1,642 @@ +.\" -*- nroff -*- +.de TQ +. br +. ns +. TP "\\$1" +.. +.de IQ +. br +. ns +. IP "\\$1" +.. +.de ST +. PP +. RS -0.15in +. I "\\$1" +. RE +. PP +.. +.TH ovs\-vswitchd.conf 5 "April 2009" "Open vSwitch" "OpenVSwitch Manual" +. +.SH NAME +ovs\-vswitchd.conf \- configuration file for \fBovs\-vswitchd\fR +. +.SH DESCRIPTION +This manual page describes the syntax for the configuration file used +by \fBovs\-vswitchd\fR(8), the virtual switch daemon. +.PP +The configuration file is based on key-value pairs, which are given +one per line in the form \fIkey\fB=\fIvalue\fR. Each \fIkey\fR +consists of one or more parts separated by dots, +e.g. \fIpart1\fB.\fIpart2\fB.\fIpart3\fR. Each \fIpart\fR may consist +only of the English letters, digits, and the special characters +\fB_-@$:+\fR. White space within \fIvalue\fR and at the beginning of a +line is significant, but is otherwise ignored. +.PP +If a single key is specified more than once, that key has multiple +values, one value for each time the key is specified. The ordering of +key-value pairs, and the ordering of multiple values for a single key, +within a configuration file is not significant. +.PP +Blank lines, lines that consist only of white space, and lines that +begin with \fB#\fR (optionally preceded by white space) are ignored. +Keep in mind that programs that modify the configuration file, such as +\fBovs\-brcompatd\fR and \fBovs-cfg-mod\fR, may alter the order of +elements and +strip comments and blank lines. +.PP +The following subsections describe how key-value pairs are used to +configure \fBovs\-vswitchd\fR. +.SS "Bridge Configuration" +A bridge (switch) with a given \fIname\fR is configured by specifying +the names of its network devices as values for key +\fBbridge.\fIname\fB.port\fR. (The specified \fIname\fR may not begin +with \fBdp\fR or \fBnl:\fR followed by a digit.) +.PP +The names given on \fBbridge.\fIname\fB.port\fR must be the names of +existing network devices, except for ``internal ports.'' An internal +port is a simulated network device that receives traffic only +through the virtual switch and switches any traffic sent it through +virtual switch. An internal port may configured with an IP address, +etc. using the usual system tools (e.g. \fBifconfig\fR, \fBip\fR). To +designate network device \fInetdev\fR as an internal port, add +\fBiface.\fInetdev\fB.internal=true\fR to the configuration file. +\fBovs\-vswitchd\fR will honor this configuration setting by automatically +creating the named internal port. +.PP +A bridge with a given \fIname\fR always has an internal port with the +same \fIname\fR, called the ``local port.'' This network device may +be included +in the bridge, by specifying it as one of the values for key +\fBbridge.\fIname\fB.port\fR, or it may be omitted. If it is +included, then its MAC address is by default the lowest-numbered MAC +address among the other bridge ports, ignoring other internal ports +and bridge ports that are +used as port mirroring destinations (see \fBPort Mirroring\fR, below). To +use a specific MAC address instead, set \fBbridge.\fIname\fB.mac\fR to +a MAC address in the format +\fIxx\fB:\fIxx\fB:\fIxx\fB:\fIxx\fB:\fIxx\fB:\fIxx\fR, where each +\fIx\fR is a hex digit. If no valid MAC address can be determined +either of these ways, then a MAC address is randomly generated. +.PP +The following syntax defines a bridge named \fBmybr\fR, configured +with network devices \fBeth0\fR, \fBeth1\fR, and \fBeth2\fR: +.RS +.nf + +bridge.mybr.port=eth0 +bridge.mybr.port=eth1 +bridge.mybr.port=eth2 + +.fi +.RE +.SS "802.1Q VLAN support" +A bridge port may be configured either as a trunk port or as belonging +to a single, untagged VLAN. These two options are mutually exclusive, +and a port must be configured in one way or the other. +.ST "Trunk Ports" +By default, bridge ports are trunk ports that carry all VLANs. To +limit the VLANs that a trunk port carries, define +\fBvlan.\fIport\fB.trunks\fR to one or more integers between 0 and +4095 designating VLANs. Only frames that have an 802.1Q header with +one of the listed VLANs are accepted on a trunk port. If 0 is +included in the list, then frames without an 802.1Q header are also +accepted. Other frames are discarded. +.PP +The following syntax makes network device \fBeth0\fR a trunk port that +carries VLANs 1, 2, and 3: +.PP +.RS +.nf + +vlan.eth0.trunks=1 +vlan.eth0.trunks=2 +vlan.eth0.trunks=3 + +.fi +.RE +.ST "Untagged VLAN Ports" +A bridge port may be configured with an implicit, untagged VLAN. +Define key +\fBvlan.\fIport\fB.tag\fR to an integer value \fIvid\fR between 0 and +4095, inclusive, to designate the named \fIport\fR as a member +of 802.1Q VLAN \fIvid\fR. When \fIport\fR is assigned a VLAN tag this +way, frames arriving on trunk ports will be forwarded to \fIport\fR +only if they are tagged with VLAN \fIvid\fR, and frames arriving on +other VLAN ports will be forwarded to \fIport\fR only if their +\fIvid\fR values are equal. Frames forwarded to \fIport\fR will not +have an 802.1Q header. +.PP +When \fIvid\fR is 0, frames arriving on trunk ports without an 802.1Q +VLAN header will also be forwarded to \fIport\fR. +.PP +When a frame with a 802.1Q header that indicates a nonzero VLAN is +received on an implicit VLAN port, it is discarded. +.PP +The following syntax makes network device \fBeth0\fR a member of VLAN +101: +.PP +.RS +.nf + +vlan.eth0.tag=101 + +.fi +.RE +.SS "Network Device Bonding" +Bonding allows multiple ``slave'' network devices to be treated as if +they were a single virtual ``bonded'' network device. It is useful for +load balancing and fail-over. +.PP +\fBovs\-vswitchd\fR supports ``source load balancing'' (SLB) bonding, which +assigns flows to slaves based on source MAC address, with periodic +rebalancing as traffic patterns change. This form of bonding does not +require 802.3ad or other special support from the upstream switch to +which the slave devices are connected. +.PP +To configure bonding, create a virtual bonding device by specifying +the slave network device names as values for +\fBbonding.\fIname\fB.slave\fR, then specify \fIname\fR as a bridge +port. The chosen \fIname\fR should not be the name of any real +network device on the host system. +.PP +By default, bonding interfaces are enabled or disabled immediately +when a carrier is detected or dropped on the underlying network +device. To insert a delay when carrier comes up or goes down before +enabling or disabling an interface, set the value of +\fBbonding.\fIname\fB.updelay\fR or +\fBbonding.\fIname\fB.downdelay\fR, respectively, to a positive +integer, interpreted in milliseconds. +.PP +The following syntax bonds \fBeth0\fR and \fBeth1\fR into a bonding +device named \fBbond0\fR, which is added to bridge \fBmybr\fR along +with physical network devices \fBeth2\fR and \fBeth3\fR: +.PP +.RS +.nf + +bridge.mybr.port=bond0 +bridge.mybr.port=eth2 +bridge.mybr.port=eth3 + +bonding.bond0.slave=eth0 +bonding.bond0.slave=eth1 + +.fi +.RE +.SS "Port Mirroring (SPAN and RSPAN)" +\fBovs\-vswitchd\fR may be configured to send selected frames to special +``mirrored'' ports, in addition to their normal destinations. Mirroring +traffic may also be referred to as SPAN or RSPAN, depending on the +mechanism used for delivery. +.PP +Up to 32 instances of port mirroring may be configured on a given +bridge. Each must be given a name that is unique within the bridge. +The keys associated with port mirroring instance \fIpmname\fR for +bridge \fIbrname\fR begin with \fBmirror.\fIbrname\fB.\fIpmname\fR. +.PP +The selection of the frames to mirror and the form in which they +should be output is configured separately for each port mirroring +instances, through a subsection of +\fBmirror.\fIbrname\fB.\fIpmname\fR, named \fBselect\fR, and +\fBoutput\fR, respectively. +.ST "Selecting Frames to Mirror" +The values for the following keys, if specified, limit the frames that +are chosen for mirroring. If none of these keys is specified, then +all frames received by the bridge are mirrored. If more than one of +these keys is specified, then a frame must meet all specified criteria +to be mirrored. +.TP +\fBmirror.\fIbrname\fB.\fIpmname\fB.select.src-port=\fIport\fR +.TQ +\fBmirror.\fIbrname\fB.\fIpmname\fB.select.dst-port=\fIport\fR +.TQ +\fBmirror.\fIbrname\fB.\fIpmname\fB.select.port=\fIport\fR +Frame received on \fIport\fR, output to \fIport\fR, or either received +on or output to \fIport\fR, respectively. \fIport\fR must be part of +the bridge \fIbrname\fR; that is, it must be listed on +\fBbridge.\fIbrname\fB.port\fR. +.TP +\fBmirror.\fIbrname\fB.\fIpmname\fB.select.vlan=\fIvid\fR +. +\fIvid\fR must be an integer between 0 and 4095, inclusive. A nonzero +\fIvid\fR selects frames that belong to VLAN \fIvid\fR, that is, +frames that arrived on a trunk port tagged with VLAN \fIvid\fR or on a +port that is configured as part of VLAN \fIvid\fR (see \fB802.1Q VLAN +tagging\fR, above). A \fIvid\fR of zero selects frames that do not +belong to a VLAN, that is, frames that arrived on a trunk port without +a VLAN tag or tagged with VLAN 0. +.ST "Mirror Output" +The values of the following keys determine how frames selected for +mirroring are output. Only one of the keys may be specified. +.TP +\fBmirror.\fIbrname\fB.\fIpmname\fB.output.port=\fIport\fR +. +Causes the selected frames to be sent out \fIport\fR, which must be +part of the bridge \fIbrname\fR; that is, it must be listed on +\fBbridge.\fIbrname\fB.port\fR. +.IP +Specifying a \fIport\fR in this way reserves that port exclusively for +mirroring. No frames other than those selected for mirroring will be +forwarded to \fIport\fR, and any frames received on \fIport\fR will be +discarded. This type of mirroring may be referred to as SPAN. +.TP +\fBmirror.\fIbrname\fB.\fIpmname\fB.output.vlan=\fIvid\fR +. +Causes the selected frames to be sent on the VLAN numbered \fIvid\fR, +which must be an integer between 0 and 4095, inclusive. The frames +will be sent out all ports that trunk VLAN \fIvid\fR, as well as any +ports with implicit VLAN \fIvid\fR. When a mirrored frame is sent out +a trunk port, the frame's VLAN tag will be set to \fIvid\fR, replacing +any existing tag; when it is sent out an implicit VLAN port, the frame +will not be tagged. This type of mirroring may be referred to as +RSPAN. +.ST "Example" +The following \fBovs\-vswitchd\fR configuration copies all frames received +on \fBeth1\fR or \fBeth2\fR to \fBeth3\fR. +.PP +.RS +.nf + +bridge.mybr.port=eth1 +bridge.mybr.port=eth2 +bridge.mybr.port=eth3 + +mirror.mybr.a.select.src-port=eth1 +mirror.mybr.a.select.src-port=eth2 +mirror.mybr.a.output.port=eth3 + +.fi +.RE +.SS "Port Rate-Limiting" +Traffic policing and shaping are configured on physical ports. Policing +defines a hard limit at which traffic that exceeds the specified rate is +dropped. Shaping uses queues to delay packets so that egress traffic +leaves at the specified rate. + +.ST "Ingress Policing" +The rate at which traffic is allowed to enter through a particular +physical port can be configured with ingress policing. The rate is +specified in kilobits (1000 bits) per second with a maximum burst size +specified in kilobits (1000 bits). The burst size should be at least +the size of the port's MTU. + +A port may be configured to enforce ingress policing by defining the +key \fBport.\fIname\fB.ingress.policing-rate\fR with an integer +indicating the rate. The port \fIname\fR will only allow traffic to be +received at the rate specified in kilobits per second. If the rate is zero +or the key is not defined, then ingress policing is disabled. + +If ingress policing is enabled, then the burst rate may be set by defining +the key \fBport.\fIname\fB.ingress.policing-burst\fR with an integer +indicating the burst rate in kilobits. If the key is not supplied or is +zero, then the default burst is 10 kilobits. + +.PP +The following syntax limits port \fBeth1\fR to receiving traffic at +\fB512\fR kilobits per second with a burst of \fB20\fR kilobits: +.PP +.RS +.nf + +port.eth1.ingress.policing-rate=512 +port.eth1.ingress.policing-burst=20 + +.fi +.SS "NetFlow v5 Flow Logging" +NetFlow is a protocol that exports a number of details about terminating +IP flows, such as the principals involved and duration. A bridge may be +configured to send NetFlow v5 records to NetFlow collectors when flows +end. To enable, define the key \fBnetflow.\fIbridge\fB.host\fR for each +collector in the form \fIhost\fB:\fIport\fR. Records from \fIbridge\fR +will be sent to each \fIhost\fR on UDP \fIport\fR. + +The NetFlow messages will use the datapath index for the engine type and id. +This can be overridden with the \fBnetflow.\fIbridge\fB.engine-type\fR and +\fBnetflow.\fIbridge\fB.engine-id\fR, respectively. Each takes a value +between 0 and 255, inclusive. + +Many NetFlow collectors do not expect multiple virtual switches to be +sending messages from the same host, and they do not store the engine +information which could be used to disambiguate the traffic. To prevent +flows from multiple switches appearing as if they came on the interface, +add \fBnetflow.\fIbridge\fB.add-id-to-iface=true\fR to the configuration +file. This will place the least significant 7 bits of the engine id +into the most significant bits of the ingress and egress interface fields +of flow records. By default, this behavior is disabled. + +The following syntax sends NetFlow records for \fBmybr\fR to the NetFlow +collector \fBnflow.example.com\fR on UDP port \fB9995\fR: +.PP +.RS +.nf + +netflow.mybr.host=nflow.example.com:9995 + +.fi +.RE +.SS "Remote Management" +A \fBovs\-vswitchd\fR instance may be remotely managed by a controller that +supports the OpenFlow Management Protocol, such as NOX. This +functionality is enabled by setting the key \fBmgmt.controller\fR to one +of the following values: +. +.TP +\fBssl:\fIhost\fR[\fB:\fIport\fR] +The specified SSL \fIport\fR (default: 6633) on the given remote +\fIhost\fR. SSL must be configured when this form is used (see \fBSSL +Configuration\fR, below). +. +.TP +\fBtcp:\fIhost\fR[\fB:\fIport\fR] +The specified TCP \fIport\fR (default: 6633) on the given remote +\fIhost\fR. +.PP +The maximum time between attempts to connect to the controller may be +specified in integral seconds with the \fBmgmt.max-backoff\fR key. The +default maximum backoff is 15 seconds, and the minimum value is 1 +second. + +An inactivity probe may be configured with the \fBmgmt.inactivity-probe\fR +key. If \fBovs\-vswitchd\fR does not communicate with the controller for the +specified number of seconds, it will send a probe. If a response is not +received for an additional amount of that time, \fBovs\-vswitchd\fR assumes +the connection has been broken and attempts to reconnect. The default +is 15 seconds, and the minimum value is 5 seconds. + +A management id may be specified with the \fBmgmt.id\fR key. It takes +an id in the form of exactly 12 hexadecimal digits. If one is not +specified, a random id is generated each time \fBovs\-vswitchd\fR is started. +.fi +.RE +.SS "OpenFlow Controller Connectivity" +\fBovs\-vswitchd\fR can perform all configured bridging and switching +locally, or it can be configured to connect a given bridge to an +external OpenFlow controller, such as NOX. Its behavior depends on +the \fBbridge.\fIname\fB.controller\fR setting: +. +.TP +\fI\[la]unset\[ra]\fR +When the key is not set, the behavior depends on whether remote +management is configured. If management is configured, then the switch +will connect to the controller specified on \fBmgmt.controller\fR. If +management is not configured, the switch will perform all configured +bridging and switching locally. +. +.TP +\fI\[la]empty\[ra]\fR +Setting an empty string value disables controller connectivity. The +switch will perform all configured bridging and switching locally. +. +.TP +\fBdiscover\fR +Use controller discovery to find the local OpenFlow controller. +Refer to \fBsecchan\fR(8) for information on how to configure a DHCP +server to support controller discovery. The following additional +options control the discovery process: +. +.RS +.TP +\fBbridge.\fIname\fB.controller.accept-regex=\fIregex\fR +A POSIX extended regular expression against which the discovered +controller location is validated. Only controllers whose names match +the regular expression will be accepted. +.IP +The default regular expression is \fBssl:.*\fR, meaning that only SSL +controller connections will be accepted, when SSL is configured (see +\fBSSL Configuration\fR), and \fB.*\fR otherwise, meaning that any +controller will be accepted. +.IP +The regular expression is implicitly anchored at the beginning of the +controller location string, as if it begins with \fB^\fR. +.TP +\fBbridge.\fIname\fB.controller.update-resolv.conf=\fBtrue\fR|\fBfalse\fR +By default, or if this is set to \fBtrue\fR, \fBovs\-vswitchd\fR overwrites +the system's \fB/etc/resolv.conf\fR with domain information and DNS +servers obtained via DHCP. If this setting is \fBfalse\fR, +\fBovs\-vswitchd\fR will not modify \fB/etc/resolv.conf\fR. +.IP +\fBovs\-vswitchd\fR will only modify \fBresolv.conf\fR if the DHCP response +that it receives specifies one or more DNS servers. +.RE +. +.TP +\fBssl:\fIhost\fR[\fB:\fIport\fR] +The specified SSL \fIport\fR (default: 6633) on the given remote +\fIhost\fR. SSL must be configured when this form is used (see \fBSSL +Configuration\fR, below). +. +.TP +\fBtcp:\fIhost\fR[\fB:\fIport\fR] +The specified TCP \fIport\fR (default: 6633) on the given remote +\fIhost\fR. +. +.TP +\fBunix:\fIfile\fR +The Unix domain server socket named \fIfile\fR. +.PP +The datapath ID used by the bridge to identify itself to the remote +controller may be specified as \fBbridge.\fIname\fB.datapath-id\fR, +in the form of exactly 12 hexadecimal digits. If the datapath ID +is not specified, then it defaults to the bridge's MAC address (see +\fBBridge Configuration\fR, above, for information on how the bridge's +MAC address is chosen). +.ST "Local Port Network Configuration" +When an external controller is configured, but controller discovery is +not in use, the following additional settings are honored: +.TP +\fBbridge.\fIname\fB.controller.in-band=\fBtrue\fR|\fBfalse\fR +By default, or if this is set to \fBtrue\fR, \fBovs\-vswitchd\fR connects +to the controller in-band. If this is set to \fBfalse\fR, +\fBovs\-vswitchd\fR connects to the controller out-of-band. Refer to +\fBsecchan\fR(8) for a description of in-band and out-of-band control. +.IP "\fBbridge.\fIname\fB.controller.ip=\fIip\fR" +If specified, the IP address to configure on the bridge's local port. +.IP "\fBbridge.\fIname\fB.controller.netmask=\fInetmask\fR" +When an IP is specified, the corresponding netmask. The default is +255.255.255.0 for a Class C IP address, 255.255.0.0 for Class B, and +255.0.0.0 for Class A. +.IP "\fBbridge.\fIname\fB.controller.gateway=\fIip\fR" +When an IP is specified, the corresponding IP gateway. There is no +default gateway. +.ST "Controller Failure Settings" +The following additional settings take effect when any remote +controller is configured: +.IP "\fBbridge.\fIname\fB.controller.inactivity-probe=\fIsecs\fR" +This optional setting may be set to \fIsecs\fR, a number of seconds. +The minimum value of \fIsecs\fR is 5 seconds. The default is taken +from \fBmgmt.inactivity-probe\fR (see above). +.IP +When the virtual switch is connected to the controller, it waits for a +message to be received from the controller for \fIsecs\fR seconds +before it sends a inactivity probe to the controller. After sending +the inactivity probe, if no response is received for an additional +\fIsecs\fR seconds, the secure channel assumes that the connection has +been broken and attempts to reconnect. +.IP +Changing the inactivity probe interval also changes the interval +before entering standalone mode (see below). +.IP "\fBbridge.\fIname\fB.controller.fail-mode=\fBstandalone\fR|\fBsecure\fR" +.IQ "\fBmgmt.fail-mode=standalone\fR|\fBsecure\fR" +When a controller is configured, it is, ordinarily, responsible for +setting up all flows on the virtual switch. Thus, if the connection to +the controller fails, no new network connections can be set up. If +the connection to the controller stays down long enough, no packets +can pass through the switch at all. +.IP +The first of these that is set takes effect. +If the value is \fBstandalone\fR, \fBovs\-vswitchd\fR will take over +responsibility for setting up +flows when no message has been received from the controller for three +times the inactivity probe interval (see above). In this mode, +\fBovs\-vswitchd\fR causes the datapath to act like an ordinary +MAC-learning switch. \fBovs\-vswitchd\fR will continue to retry connecting +to the controller in the background and, when the connection succeeds, +it discontinues its standalone behavior. +.IP +If this option is set to \fBsecure\fR, or if neither of these settings +is set, \fBovs\-vswitchd\fR will not set up flows on its own when the +controller connection fails. +.IP "\fBbridge.\fIname\fB.controller.max-backoff=\fIsecs\fR" +Sets the maximum time between attempts to connect to the controller to +\fIsecs\fR, which must be at least 1. The actual interval between +connection attempts starts at 1 second and doubles on each failing +attempt until it reaches the maximum. The default maximum backoff +time is taken from \fBmgmt.max-backoff\fR. +.ST "Controller Rate-Limiting" +These settings configure how the virtual switch applies a ``token +bucket'' to limit the rate at which packets in unknown flows are +forwarded to the OpenFlow controller for flow-setup processing. This +feature prevents a single bridge from overwhelming a controller. +.IP "\fBbridge.\fIname\fB.controller.rate-limit=\fIrate\fR" +.IQ "\fBmgmt.rate-limit=\fIrate\fR" +Limits the maximum rate at which packets will be forwarded to the +OpenFlow controller to \fIrate\fR packets per second. A rate specified +explicitly for \fIname\fR overrides a value configured using the +\fBmgmt.rate-limit\fR key. +.IP +If neither one of these settings is set, then the bridge does not +limit the rate at which packets are forwarded to the controller. +.IP "\fBbridge.\fIname\fB.controller.burst-limit=\fIburst\fR" +.IQ "\fBmgmt.burst-limit=\fIburst\fR" +Sets the maximum number of unused packet credits that the bridge will +allow to accumulate during the time in which no packets are being +forwarded to the OpenFlow controller to \fIburst\fR (measured in +packets). The default \fIburst\fR is one-quarter of the \fIrate\fR +specified in the rate-limit setting. +.IP +A burst specified explicitly for \fIname\fR overrides a value configured +using the \fBmgmt.burst-limit\fR key. This option takes effect only +when a rate-limit is specified. +.ST "Remote Command Execution Settings" +These settings configure the commands that remote OpenFlow connections +are allowed to invoke using (e.g.) \fBovs\-ofctl execute\fR. To be +permitted, a command name must be whitelisted and must not be +blacklisted. When the whitelist and blacklist permit a command name, +\fBovs\-vswitchd\fR looks for a program with the same name as the command +in the commands directory (see below). Other directories are not +searched. +.IP "\fBbridge.\fIname\fB.controller.commands.acl=\fIglob\fR" +Whitelists commands whose names match shell glob pattern \fIglob\fR, +allowing those commands to be invoked by the remote controller. +.IP +By default, no commands are whitelisted, so this setting is mandatory +if any remote command execution is to be allowed. +.IP "\fBbridge.\fIname\fB.controller.commands.acl=\fB!\fR\fIglob\fR" +Blacklists commands whose names match shell glob pattern \fIglob\fR, +prohibiting those commands from being invoked by the remote +controller. Command names that include characters other than upper- +and lower-case English letters, digits, and the underscore and hyphen +characters are blacklisted unconditionally. +.IP "\fBbridge.\fIname\fB.controller.commands.dir=\fIdirectory\fR" +Sets the directory searched for remote command execution to +\fIdirectory\fR. The default directory is +\fB@pkgdatadir@/commands\fR. +.SS "SSL Configuration" +When \fBovs\-vswitchd\fR is configured to connect over SSL for management or +for controller connectivity, the following settings are required: +.TP +\fBssl.private-key=\fIprivkey.pem\fR +Specifies a PEM file containing the private key used as the virtual +switch's identity for SSL connections to the controller. +.TP +\fBssl.certificate=\fIcert.pem\fR +Specifies a PEM file containing a certificate, signed by the +certificate authority (CA) used by the controller and manager, that +certifies the virtual switch's private key, identifying a trustworthy +switch. +.TP +\fBssl.ca-cert=\fIcacert.pem\fR +Specifies a PEM file containing the CA certificate used to verify that +the virtual switch is connected to a trustworthy controller. +.PP +These files are read only once, at \fBovs\-vswitchd\fR startup time. If +their contents change, \fBovs\-vswitchd\fR must be killed and restarted. +.PP +These SSL settings apply to all SSL connections made by the virtual +switch. +.ST "CA Certificate Bootstrap" +Ordinarily, all of the files named in the SSL configuration must exist +when \fBovs\-vswitchd\fR starts. However, if \fBssl.bootstrap-ca-cert\fR +is set to \fBtrue\fR, then \fBovs\-vswitchd\fR will attempt to obtain the +CA certificate from the controller on its first SSL connection and +save it to the named PEM file. If it is successful, it will +immediately drop the connection and reconnect, and from then on all +SSL connections must be authenticated by a certificate signed by the +CA certificate thus obtained. +.PP +\fBThis option exposes the SSL connection to a man-in-the-middle +attack obtaining the initial CA certificate\fR, but it may be useful +for bootstrapping. +.PP +This option is only useful if the controller sends its CA certificate +as part of the SSL certificate chain. The SSL protocol does not +require the controller to send the CA certificate, but +\fBcontroller\fR(8) can be configured to do so with the +\fB--peer-ca-cert\fR option. +.SS "OpenFlow Management Connections" +By default, each bridge \fIname\fR listens for OpenFlow management +connections on a Unix domain socket named +\fB@RUNDIR@/\fIname\fB.mgmt\fR. This socket can be used to perform +local OpenFlow monitoring and administration, e.g., \fBovs\-ofctl dump-flows +unix:@RUNDIR@/\fIname\fB.mgmt\fR to display the flows currently set up +in bridge \fIname\fR. +.PP +If \fBbridge.\fIname\fB.openflow.listeners\fR is set to one or more +values, \fBovs\-vswitchd\fR instead listens on the specified connection +methods. Acceptable connection methods include: +.RS +.IP "\fBpunix:\fIfile\fR" +Listens for connections on the Unix domain server socket named \fIfile\fR. +.IP "\fBpssl:\fR[\fIport\fR]" +Listens for SSL connections on \fIport\fR (default: 6633). SSL must +be configured when this form is used (see \fBSSL Configuration\fR, +above). +.IP "\fBptcp:\fR[\fIport\fR]" +Listens for TCP connections on \fIport\fR (default: 6633). +.RE +To entirely disable listening for management connections, set +\fBbridge.\fIname\fB.openflow.listeners\fR to the single value +\fBnone\fR. + +.SS "OpenFlow Controller Connection Snooping" +By default, each bridge \fIname\fR listens for OpenFlow controller +connection snooping connections on a Unix domain socket named +\fB@RUNDIR@/\fIname\fB.snoop\fR. A client that connects to this +socket, e.g., \fBovs\-ofctl monitor unix:@RUNDIR@/\fIname\fB.snoop\fR, will +receive a copy of every OpenFlow message sent by the switch to the +controller, or vice versa, on the primary OpenFlow controller +connection. +.PP +If \fBbridge.\fIname\fB.openflow.snoops\fR is set to one or more +values, \fBovs\-vswitchd\fR instead listens on the specified connection +methods. The acceptable connection methods are the same as for +OpenFlow management connections (see above). +.PP +To entirely disable controller connection snooping, set +\fBbridge.\fIname\fB.openflow.snoops\fR to the single value +\fBnone\fR. +.SH "SEE ALSO" +.BR ovs\-brcompatd (8), +.BR ovs\-cfg\-mod (8), +.BR ovs\-vswitchd (8) diff --git a/vswitchd/ovs-vswitchd.h b/vswitchd/ovs-vswitchd.h new file mode 100644 index 00000000..f292d682 --- /dev/null +++ b/vswitchd/ovs-vswitchd.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + */ + +#ifndef VSWITCHD_H +#define VSWITCHD_H 1 + +void reconfigure(void); + +#endif /* ovs-vswitchd.h */ diff --git a/vswitchd/port.c b/vswitchd/port.c new file mode 100644 index 00000000..f6348f35 --- /dev/null +++ b/vswitchd/port.c @@ -0,0 +1,68 @@ +/* Copyright (c) 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + * + */ + +#include + +#include "bridge.h" +#include "cfg.h" +#include "netdev.h" +#include "ovs-vswitchd.h" +#include "port.h" +#include "svec.h" + +#define THIS_MODULE VLM_port +#include "vlog.h" + +static int +set_ingress_policing(const char *port_name) +{ + int kbits_rate = cfg_get_int(0, "port.%s.ingress.policing-rate", + port_name); + int kbits_burst = cfg_get_int(0, "port.%s.ingress.policing-burst", + port_name); + + return netdev_nodev_set_policing(port_name, kbits_rate, kbits_burst); +} + +void +port_init(void) +{ + port_reconfigure(); +} + +void +port_reconfigure(void) +{ + struct svec ports; + int i; + + svec_init(&ports); + bridge_get_ifaces(&ports); + for (i=0; i. + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + */ + +#ifndef VSWITCHD_PORT_H +#define VSWITCHD_PORT_H 1 + +void port_init(void); +void port_reconfigure(void); + +#endif /* port.h */ diff --git a/vswitchd/proc-net-compat.c b/vswitchd/proc-net-compat.c new file mode 100644 index 00000000..3f5cf44a --- /dev/null +++ b/vswitchd/proc-net-compat.c @@ -0,0 +1,344 @@ +/* Copyright (c) 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + */ + +#include +#include "proc-net-compat.h" +#include +#include +#include +#include +#include +#include "dynamic-string.h" +#include "hash.h" +#include "netlink-protocol.h" +#include "netlink.h" +#include "ofpbuf.h" +#include "openvswitch/brcompat-netlink.h" +#include "hmap.h" +#include "shash.h" +#include "svec.h" + +#define THIS_MODULE VLM_proc_net_compat +#include "vlog.h" + +/* Netlink socket to bridge compatibility kernel module. */ +static struct nl_sock *brc_sock; + +/* The Generic Netlink family number used for bridge compatibility. */ +static int brc_family = 0; + +/* Rate limiting for log messages. */ +static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); + +static void flush_dir(const char *dir); +static int set_proc_file(const char *dir, const char *file, const char *data); + +/* Initializes the /proc/net compatibility layer. Returns 0 if successful, + * otherwise a positive errno value. */ +int +proc_net_compat_init(void) +{ + if (!brc_sock) { + int retval = nl_lookup_genl_family(BRC_GENL_FAMILY_NAME, &brc_family); + if (retval) { + return retval; + } + + retval = nl_sock_create(NETLINK_GENERIC, 0, 0, 0, &brc_sock); + if (retval) { + return retval; + } + + flush_dir("/proc/net/vlan"); + flush_dir("/proc/net/bonding"); + } + return 0; +} + +static int +set_proc_file(const char *dir, const char *file, const char *data) +{ + struct ofpbuf request, *reply; + int retval; + + ofpbuf_init(&request, 0); + nl_msg_put_genlmsghdr(&request, brc_sock, 1024, brc_family, NLM_F_REQUEST, + BRC_GENL_C_SET_PROC, 1); + nl_msg_put_string(&request, BRC_GENL_A_PROC_DIR, dir); + nl_msg_put_string(&request, BRC_GENL_A_PROC_NAME, file); + if (data) { + nl_msg_put_string(&request, BRC_GENL_A_PROC_DATA, data); + } + + retval = nl_sock_transact(brc_sock, &request, &reply); + ofpbuf_uninit(&request); + ofpbuf_delete(reply); + if (retval) { + VLOG_WARN_RL(&rl, "failed to %s /proc/%s/%s (%s)", + data ? "update" : "remove", dir, file, strerror(retval)); + } + return retval; +} + +static void +flush_dir(const char *dir) +{ + const char *subdir; + struct dirent *de; + DIR *stream; + + assert(!memcmp(dir, "/proc/", 6)); + subdir = dir + 6; + + stream = opendir(dir); + if (!stream) { + if (errno != ENOENT) { + VLOG_WARN_RL(&rl, "%s: open failed (%s)", dir, strerror(errno)); + } + return; + } + + while ((de = readdir(stream)) != NULL) { + if (strcmp(de->d_name, ".") && strcmp(de->d_name, "..")) { + set_proc_file(subdir, de->d_name, NULL); + } + } + closedir(stream); +} + +/* If 'bond' is nonnull, creates a file in /proc/net/bonding for a bond with + * the given 'name' and the details in 'bond'. If 'bond' is null, deletes + * the /proc/net/bonding file with the given 'name'. + * + * This function has no effect unless proc_net_compat_init() has been + * called. */ +void +proc_net_compat_update_bond(const char *name, const struct compat_bond *bond) +{ + struct ds ds; + int i; + + if (!brc_sock) { + return; + } + + if (!bond) { + set_proc_file("net/bonding", name, NULL); + return; + } + + ds_init(&ds); + ds_put_format( + &ds, + "Ethernet Channel Bonding Driver: ovs-vswitchd " + VERSION BUILDNR" ("__DATE__" "__TIME__")\n" + "Bonding Mode: source load balancing\n" + "Primary Slave: None\n" + "Currently Active Slave: None\n" + "MII Status: %s\n" + "MII Polling Interval (ms): 100\n" + "Up Delay (ms): %d\n" + "Down Delay (ms): %d\n" + "\n" + "Source load balancing info:\n", + bond->up ? "up" : "down", bond->updelay, bond->downdelay); + for (i = 0; i < bond->n_slaves; i++) { + const struct compat_bond_slave *slave = &bond->slaves[i]; + ds_put_format( + &ds, + "\n" + "Slave Interface: %s\n" + "MII Status: %s\n" + "Link Failure Count: 0\n" + "Permanent HW addr: "ETH_ADDR_FMT"\n", + slave->name, slave->up ? "up" : "down", + ETH_ADDR_ARGS(slave->mac)); + } + set_proc_file("net/bonding", name, ds_cstr(&ds)); + ds_destroy(&ds); +} + +/* /proc/net/vlan compatibility. + * + * This is much more complex than I expected it to be. */ + +struct compat_vlan { + /* Hash key. */ + struct hmap_node trunk_node; /* Hash map node. */ + char *trunk_dev; /* Name of trunk network device. */ + int vid; /* VLAN number. */ + + /* Auxiliary data. */ + char *vlan_dev; /* sprintf("%s.%d", trunk_dev, vid); */ + struct svec tagged_devs; /* Name of tagged network device(s). */ +}; + +/* Current set of VLAN devices, indexed two different ways. */ +static struct hmap vlans_by_trunk = HMAP_INITIALIZER(&vlans_by_trunk); +static struct shash vlans_by_tagged = SHASH_INITIALIZER(&vlans_by_tagged); + +static bool remove_tagged_dev(struct shash_node *, const char *tagged_dev); +static void update_vlan_config(void); +static void set_vlan_proc_file(const struct compat_vlan *); +static uint32_t hash_vlan(const char *trunk_dev, uint32_t vid); + +/* Updates the /proc/net/vlan compatibility layer's idea of what trunk device + * and VLAN the given 'tagged_dev' is associated with. If 'tagged_dev' has an + * implicit VLAN tag, then 'trunk_dev' should be the name of a network device + * on the same bridge that trunks that VLAN, and 'vid' should be the VLAN tag + * number. If 'tagged_dev' does not have an implicit VLAN tag, then + * 'trunk_dev' should be NULL and 'vid' should be -1. + * + * This function has no effect unless proc_net_compat_init() has been + * called. */ +void +proc_net_compat_update_vlan(const char *tagged_dev, const char *trunk_dev, + int vid) +{ + struct compat_vlan *vlan; + struct shash_node *node; + + if (!brc_sock) { + return; + } + + /* Find the compat_vlan that we currently have for 'tagged_dev' (if + * any). */ + node = shash_find(&vlans_by_tagged, tagged_dev); + vlan = node ? node->data : NULL; + if (vid <= 0 || !trunk_dev) { + if (vlan) { + if (remove_tagged_dev(node, tagged_dev)) { + update_vlan_config(); + } + } + } else { + if (vlan) { + if (!strcmp(trunk_dev, vlan->trunk_dev) && vid == vlan->vid) { + /* No change. */ + return; + } else { + /* 'tagged_dev' is attached to the wrong compat_vlan. Start + * by removing it from that one. */ + remove_tagged_dev(node, tagged_dev); + node = NULL; + vlan = NULL; + } + } + + /* 'tagged_dev' is not attached to any compat_vlan. Find the + * compat_vlan corresponding to (trunk_dev,vid) to attach it to, or + * create a new compat_vlan if none exists for (trunk_dev,vid). */ + HMAP_FOR_EACH_WITH_HASH (vlan, struct compat_vlan, trunk_node, + hash_vlan(trunk_dev, vid), + &vlans_by_trunk) { + if (!strcmp(trunk_dev, vlan->trunk_dev) && vid == vlan->vid) { + break; + } + } + if (!vlan) { + /* Create a new compat_vlan for (trunk_dev,vid). */ + vlan = xcalloc(1, sizeof *vlan); + vlan->trunk_dev = xstrdup(trunk_dev); + vlan->vid = vid; + vlan->vlan_dev = xasprintf("%s.%d", trunk_dev, vid); + svec_init(&vlan->tagged_devs); + hmap_insert(&vlans_by_trunk, &vlan->trunk_node, + hash_vlan(trunk_dev, vid)); + set_vlan_proc_file(vlan); + } + + /* Attach 'tagged_dev' to 'vlan'. */ + svec_add(&vlan->tagged_devs, tagged_dev); + shash_add(&vlans_by_tagged, tagged_dev, vlan); + svec_sort(&vlan->tagged_devs); + update_vlan_config(); + } +} + +/* Remove 'tagged_dev' from the compat_vlan in 'node'. If that causes the + * compat_vlan to have no tagged_devs left, destroy the compat_vlan too. */ +static bool +remove_tagged_dev(struct shash_node *node, const char *tagged_dev) +{ + struct compat_vlan *vlan = node->data; + + svec_del(&vlan->tagged_devs, tagged_dev); + shash_delete(&vlans_by_tagged, node); + if (!vlan->tagged_devs.n) { + set_proc_file("net/vlan", vlan->vlan_dev, NULL); + + hmap_remove(&vlans_by_trunk, &vlan->trunk_node); + svec_destroy(&vlan->tagged_devs); + free(vlan->trunk_dev); + free(vlan->vlan_dev); + free(vlan); + return true; + } + return false; +} + +/* Returns a hash value for (trunk_dev,vid). */ +static uint32_t +hash_vlan(const char *trunk_dev, uint32_t vid) +{ + return hash_int(vid, hash_string(trunk_dev, 0)); +} + +/* Update /proc/net/vlan/ for 'vlan'. */ +static void +set_vlan_proc_file(const struct compat_vlan *vlan) +{ + struct ds ds; + + ds_init(&ds); + ds_put_format( + &ds, + "%s VID: %d\t REORDER_HDR: 1 dev->priv_flags: 81\n" + " total frames received 0\n" + " total bytes received 0\n" + " Broadcast/Multicast Rcvd 0\n" + "\n" + " total frames transmitted 0\n" + " total bytes transmitted 0\n" + " total headroom inc 0\n" + " total encap on xmit 0\n" + "Device: %s\n" + "INGRESS priority mappings: 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:0\n" + "EGRESSS priority Mappings: \n", + vlan->vlan_dev, vlan->vid, vlan->trunk_dev); + set_proc_file("net/vlan", vlan->vlan_dev, ds_cstr(&ds)); + ds_destroy(&ds); +} + +/* Update /proc/net/vlan/config. */ +static void +update_vlan_config(void) +{ + struct compat_vlan *vlan; + struct ds ds; + + ds_init(&ds); + ds_put_cstr(&ds, "VLAN Dev name | VLAN ID\n" + "Name-Type: VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD\n"); + HMAP_FOR_EACH (vlan, struct compat_vlan, trunk_node, &vlans_by_trunk) { + ds_put_format(&ds, "%-15s| %d | %s\n", + vlan->vlan_dev, vlan->vid, vlan->trunk_dev); + } + set_proc_file("net/vlan", "config", ds_cstr(&ds)); + ds_destroy(&ds); +} diff --git a/vswitchd/proc-net-compat.h b/vswitchd/proc-net-compat.h new file mode 100644 index 00000000..ce97176b --- /dev/null +++ b/vswitchd/proc-net-compat.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + */ + +#ifndef VSWITCHD_PROC_NET_COMPAT_H +#define VSWITCHD_PROC_NET_COMPAT_H 1 + +#include "packets.h" + +struct compat_bond { + bool up; + int updelay; + int downdelay; + int n_slaves; + struct compat_bond_slave *slaves; +}; + +struct compat_bond_slave { + const char *name; + bool up; + uint8_t mac[ETH_ADDR_LEN]; +}; + +int proc_net_compat_init(void); +void proc_net_compat_update_bond(const char *name, const struct compat_bond *); +void proc_net_compat_update_vlan(const char *dev, const char *vlandev, + int vlan); + +#endif /* vswitchd/proc-net-compat.h */ diff --git a/vswitchd/xenserver.c b/vswitchd/xenserver.c new file mode 100644 index 00000000..7a8d255f --- /dev/null +++ b/vswitchd/xenserver.c @@ -0,0 +1,90 @@ +/* Copyright (c) 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + */ + +#include +#include "xenserver.h" +#include +#include +#include +#include +#include +#include "dynamic-string.h" +#include "process.h" + +#include "vlog.h" +#define THIS_MODULE VLM_xenserver + +static char * +read_host_uuid(void) +{ + static const char filename[] = "/etc/xensource-inventory"; + char line[128]; + FILE *file; + + file = fopen(filename, "r"); + if (!file) { + if (errno == ENOENT) { + VLOG_INFO("not running on a XenServer"); + } else { + VLOG_INFO("%s: open: %s", filename, strerror(errno)); + } + return NULL; + } + + while (fgets(line, sizeof line, file)) { + static const char leader[] = "INSTALLATION_UUID='"; + const int leader_len = strlen(leader); + const int uuid_len = 36; + static const char trailer[] = "'\n"; + const int trailer_len = strlen(trailer); + + if (strlen(line) == leader_len + uuid_len + trailer_len + && !memcmp(line, leader, leader_len) + && !memcmp(line + leader_len + uuid_len, trailer, trailer_len)) { + char *host_uuid = xmemdup0(line + leader_len, uuid_len); + VLOG_INFO("running on XenServer, host-uuid %s", host_uuid); + fclose(file); + return host_uuid; + } + } + fclose(file); + VLOG_ERR("%s: INSTALLATION_UUID not found", filename); + return NULL; +} + +const char * +xenserver_get_host_uuid(void) +{ + static char *host_uuid; + static bool inited; + + if (!inited) { + host_uuid = read_host_uuid(); + inited = true; + } + return host_uuid; +} + diff --git a/vswitchd/xenserver.h b/vswitchd/xenserver.h new file mode 100644 index 00000000..c69b133a --- /dev/null +++ b/vswitchd/xenserver.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2009 Nicira Networks + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * In addition, as a special exception, Nicira Networks gives permission + * to link the code of its release of vswitchd with the OpenSSL project's + * "OpenSSL" library (or with modified versions of it that use the same + * license as the "OpenSSL" library), and distribute the linked + * executables. You must obey the GNU General Public License in all + * respects for all of the code used other than "OpenSSL". If you modify + * this file, you may extend this exception to your version of the file, + * but you are not obligated to do so. If you do not wish to do so, + * delete this exception statement from your version. + */ + +#ifndef VSWITCHD_XENSERVER_H +#define VSWITCHD_XENSERVER_H 1 + +const char *xenserver_get_host_uuid(void); + +#endif /* xenserver.h */ diff --git a/xenserver/README b/xenserver/README new file mode 100644 index 00000000..7cd04ab6 --- /dev/null +++ b/xenserver/README @@ -0,0 +1,78 @@ +This directory contains files for seamless integration of vswitch on +Citrix XenServer hosts managed by the Citrix management tools. + +Some of these files are modifications of Citrix's proprietary code. +Citrix has given permission to distribute these modified files. +Citrix has not specified a particular license for them. There is no +guarantee that, should Citrix specify a license, that it would be +DFSG-compliant or GPL-compatible. + +Most of the files in this directory is installed on a XenServer system +under the same name, if underscores are replaced by slashes. The +files are: + + etc_init.d_vswitch + + Initializes the vswitch at boot and shuts it down at shutdown. + + etc_init.d_vswitch-xapi-update + + Init script to ensure vswitch-cfg-update is called for the + current host at boot. + + etc_logrotate.d_vswitch + + Ensures that /var/log/ovs-vswitchd.log is rotated periodically + and that ovs-vswitchd reopens its log file at that point. + + etc_profile.d_vswitch.sh + + vswitch-related shell functions for the administrator's + convenience. + + etc_sysconfig_vswitch.example + + Example configuration options for vswitch. + + etc_xapi.d_plugins_vswitch-cfg-update + + xapi plugin script to update the cache of configuration items + in the ovs-vswitchd configuration file that are managed in the + xapi database when integrated with Citrix management tools. + + etc_xensource_scripts_vif + + vswitch-aware replacement for Citrix script of the same name. + + opt_xensource_libexec_interface-reconfigure + + vswitch-aware replacement for Citrix script of the same name. + + usr_lib_xsconsole_plugins-base_XSFeatureVSwitch.py + + xsconsole plugin to configure the pool-wide configuration keys + used to control vswitch when integrated with Citrix management + tools. + + vswitch-xen.spec + + spec file for building RPMs to install on a XenServer host. + +To install, build the vswitch RPM with a command like this: + + rpmbuild -D "vswitch_version $full_version" \ + -D "xen_version $XENKERNEL" \ + -D "build_number --with-build-number=$buildnr" \ + -bb vswitch-xen.spec + +Then, "rpm -U" the resulting vswitch package on the XenServer hosts in +question and reboot them. (The vswitch-dbg package that is also +produced need not be installed, but it is harmless to do so.) + +---------------------------------------------------------------------- +Copyright (C) 2009 Nicira Networks, Inc. + +Copying and distribution of this file, with or without modification, +are permitted in any medium without royalty provided the copyright +notice and this notice are preserved. This file is offered as-is, +without warranty of any kind. diff --git a/xenserver/automake.mk b/xenserver/automake.mk new file mode 100644 index 00000000..53e109d5 --- /dev/null +++ b/xenserver/automake.mk @@ -0,0 +1,19 @@ +# Copyright (C) 2009 Nicira Networks, Inc. +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty provided the copyright +# notice and this notice are preserved. This file is offered as-is, +# without warranty of any kind. + +EXTRA_DIST += \ + xenserver/README \ + xenserver/etc_init.d_vswitch \ + xenserver/etc_init.d_vswitch-xapi-update \ + xenserver/etc_logrotate.d_vswitch \ + xenserver/etc_profile.d_vswitch.sh \ + xenserver/etc_sysconfig_vswitch.example \ + xenserver/etc_xapi.d_plugins_vswitch-cfg-update \ + xenserver/etc_xensource_scripts_vif \ + xenserver/opt_xensource_libexec_interface-reconfigure \ + xenserver/usr_lib_xsconsole_plugins-base_XSFeatureVSwitch.py \ + xenserver/vswitch-xen.spec diff --git a/xenserver/etc_init.d_vswitch b/xenserver/etc_init.d_vswitch new file mode 100755 index 00000000..25aca61f --- /dev/null +++ b/xenserver/etc_init.d_vswitch @@ -0,0 +1,302 @@ +#!/bin/bash +# +# vswitch +# +# chkconfig: 2345 09 91 +# description: Manage vswitch kernel modules and user-space daemon + +# Copyright (C) 2009 Nicira Networks, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +. /etc/init.d/functions + +test -e /etc/sysconfig/vswitch && . /etc/sysconfig/vswitch + +# General config variables in /etc/sysconfig/vswitch +VSWITCH_BASE="${VSWITCH_BASE:-/root/vswitch}" +ENABLE_BRCOMPAT="${ENABLE_BRCOMPAT:-y}" +ENABLE_FAKE_PROC_NET="${ENABLE_FAKE_PROC_NET:-y}" +FORCE_COREFILES="${FORCE_COREFILES:-n}" +COREFILE_PATTERN="${COREFILE_PATTERN:-/var/log/%e-%t}" + +# Config variables specific to ovs-vswitchd +VSWITCHD_CONF="${VSWITCHD_CONF:-/etc/ovs-vswitchd.conf}" +VSWITCHD_PIDFILE="${VSWITCHD_PIDFILE:-/var/run/ovs-vswitchd.pid}" +VSWITCHD_PRIORITY="${VSWITCHD_PRIORITY:--5}" +VSWITCHD_LOGFILE="${VSWITCHD_LOGFILE:-/var/log/ovs-vswitchd.log}" +VSWITCHD_FILE_LOGLEVEL="${VSWITCHD_FILE_LOGLEVEL:-}" +VSWITCHD_SYSLOG_LOGLEVEL="${VSWITCHD_SYSLOG_LOGLEVEL:-WARN}" +VSWITCHD_MEMLEAK_LOGFILE="${VSWITCHD_MEMLEAK_LOGFILE:-}" +VSWITCHD_STRACE_LOG="${VSWITCHD_STRACE_LOG:-}" +VSWITCHD_STRACE_OPT="${VSWITCHD_STRACE_OPT:-}" +VSWITCHD_VALGRIND_LOG="${VSWITCHD_VALGRIND_LOG:-}" +VSWITCHD_VALGRIND_OPT="${VSWITCHD_VALGRIND_OPT:-}" + +# Config variables specific to ovs-brcompatd +BRCOMPATD_PIDFILE="${BRCOMPATD_PIDFILE:-/var/run/ovs-brcompatd.pid}" +BRCOMPATD_PRIORITY="${BRCOMPATD_PRIORITY:--5}" +BRCOMPATD_LOGFILE="${BRCOMPATD_LOGFILE:-/var/log/ovs-brcompatd.log}" +BRCOMPATD_FILE_LOGLEVEL="${BRCOMPATD_FILE_LOGLEVEL:-}" +BRCOMPATD_SYSLOG_LOGLEVEL="${BRCOMPATD_SYSLOG_LOGLEVEL:-WARN}" +BRCOMPATD_MEMLEAK_LOGFILE="${BRCOMPATD_MEMLEAK_LOGFILE:-}" +BRCOMPATD_STRACE_LOG="${BRCOMPATD_STRACE_LOG:-}" +BRCOMPATD_STRACE_OPT="${BRCOMPATD_STRACE_OPT:-}" +BRCOMPATD_VALGRIND_LOG="${BRCOMPATD_VALGRIND_LOG:-}" +BRCOMPATD_VALGRIND_OPT="${BRCOMPATD_VALGRIND_OPT:-}" + + + + +# Full paths to executables & modules +vswitchd="$VSWITCH_BASE/sbin/ovs-vswitchd" +brcompatd="$VSWITCH_BASE/sbin/ovs-brcompatd" +dpctl="$VSWITCH_BASE/bin/ovs-dpctl" +appctl="$VSWITCH_BASE/bin/ovs-appctl" +ofctl="$VSWITCH_BASE/bin/ovs-ofctl" + + +if [ "$ENABLE_FAKE_PROC_NET" == "y" ]; then + if [ "$ENABLE_BRCOMPAT" != "y" ]; then + warning "FAKE_PROC_NET required BRCOMPAT which was disabled. Force enabling." + ENABLE_BRCOMPAT="y" + fi +fi + +function dp_list { + "$dpctl" show | grep '^dp[0-9]\+:' | cut -d':' -f 1 +} + +function turn_on_corefiles { + # This has global effect so should not normally be used... + ulimit -c unlimited + echo "$COREFILE_PATTERN" > /proc/sys/kernel/core_pattern +} + +function remove_all_dp { + for dp in $(dp_list); do + action "Removing datapath: $dp" "$dpctl" del-dp "$dp" + done +} + +function insert_modules_if_required { + if ! lsmod | grep -q "openvswitch_mod"; then + action "Inserting llc module" modprobe llc + action "Inserting openvswitch module" insmod $VSWITCH_BASE/kernel_modules/openvswitch_mod.ko + fi + if [ -n "$BRCOMPATD_PIDFILE" ] && ! lsmod | grep -q "brcompat_mod"; then + action "Inserting brcompat module" insmod $VSWITCH_BASE/kernel_modules/brcompat_mod.ko + fi +} + +function remove_modules { + if lsmod | grep -q "brcompat_mod"; then + action "Removing brcompat module" rmmod brcompat_mod.ko + fi + if lsmod | grep -q "openvswitch_mod"; then + action "Removing openvswitch module" rmmod openvswitch_mod.ko + fi +} + +function reload_vswitchd { + if [ -f "$VSWITCHD_PIDFILE" ]; then + "$appctl" \ + --target=ovs-vswitchd.$(cat "$VSWITCHD_PIDFILE").ctl \ + --execute=vswitchd/reload + fi +} + +function start_vswitchd { + local syslog_opt="-vANY:SYSLOG:${VSWITCHD_SYSLOG_LOGLEVEL}" + local logfile_file_opt="" + local logfile_level_opt="" + if [ -n "$VSWITCHD_FILE_LOGLEVEL" ]; then + logfile_level_opt="-vANY:FILE:${VSWITCHD_FILE_LOGLEVEL}" + logfile_file_opt="--log-file=$VSWITCHD_LOGFILE" + fi + local leak_opt="" + if [ -n "$VSWITCHD_MEMLEAK_LOGFILE" ]; then + leak_opt="--check-leaks=$VSWITCHD_MEMLEAK_LOGFILE" + if [ -e "$VSWITCHD_MEMLEAK_LOGFILE" ]; then + mv "$VSWITCHD_MEMLEAK_LOGFILE" "$VSWITCHD_MEMLEAK_LOGFILE.prev" + fi + fi + local strace_opt="" + local daemonize="y" + if [ -n "$VSWITCHD_STRACE_LOG" ] && [ -n "$VSWITCHD_VALGRIND_LOG" ]; then + printf "Can not start with both VALGRIND and STRACE\n" + exit 1 + fi + if [ -n "$VSWITCHD_STRACE_LOG" ]; then + strace_opt="strace -o $VSWITCHD_STRACE_LOG $VSWITCHD_STRACE_OPT" + daemonize="n" + fi + if [ -n "$VSWITCHD_VALGRIND_LOG" ]; then + valgrind_opt="valgrind --log-file=$VSWITCHD_VALGRIND_LOG $VSWITCHD_VALGRIND_OPT" + daemonize="n" + fi + local fake_proc_net_opt="" + if [ "$ENABLE_FAKE_PROC_NET" == "y" ]; then + fake_proc_net_opt="--fake-proc-net" + fi + if [ "$daemonize" != "y" ]; then + # Start in background and force a "success" message + action "Starting ovs-vswitchd ($strace_opt$valgrind_opt)" true + (nice -n "$VSWITCHD_PRIORITY" $strace_opt $valgrind_opt "$vswitchd" -P"$VSWITCHD_PIDFILE" -D $fake_proc_net_opt -vANY:CONSOLE:EMER $syslog_opt $logfile_level_opt $logfile_file_opt $leak_opt "$VSWITCHD_CONF") & + else + action "Starting ovs-vswitchd" nice -n "$VSWITCHD_PRIORITY" "$vswitchd" -P"$VSWITCHD_PIDFILE" -D $fake_proc_net_opt -vANY:CONSOLE:EMER $syslog_opt $logfile_level_opt $logfile_file_opt $leak_opt "$VSWITCHD_CONF" + fi +} + +function start_brcompatd { + local syslog_opt="-vANY:SYSLOG:${BRCOMPATD_SYSLOG_LOGLEVEL}" + local logfile_file_opt="" + local logfile_level_opt="" + if [ -n "$BRCOMPATD_FILE_LOGLEVEL" ]; then + logfile_level_opt="-vANY:FILE:${BRCOMPATD_FILE_LOGLEVEL}" + logfile_file_opt="--log-file=$BRCOMPATD_LOGFILE" + fi + local leak_opt="" + if [ -n "$BRCOMPATD_MEMLEAK_LOG" ]; then + leak_opt="--check-leaks=$BRCOMPATD_MEMLEAK_LOGFILE" + if [ -e "$BRCOMPATD_MEMLEAK_LOGFILE" ]; then + mv "$BRCOMPATD_MEMLEAK_LOGFILE" "$BRCOMPATD_MEMLEAK_LOGFILE.prev" + fi + fi + local strace_opt="" + local daemonize="y" + if [ -n "$BRCOMPATD_STRACE_LOG" ] && [ -n "$BRCOMPATD_VALGRIND_LOG" ]; then + printf "Can not start with both VALGRIND and STRACE\n" + exit 1 + fi + if [ -n "$BRCOMPATD_STRACE_LOG" ]; then + strace_opt="strace -o $BRCOMPATD_STRACE_LOG $BRCOMPATD_STRACE_OPT" + daemonize="n" + fi + if [ -n "$VALGRIND_LOG" ]; then + valgrind_opt="valgrind --log-file=$BRCOMPATD_VALGRIND_LOG $BRCOMPATD_VALGRIND_OPT" + daemonize="n" + fi + reload_cmd='/root/vswitch/bin/ovs-appctl -t /var/run/ovs-vswitchd.`cat /var/run/ovs-vswitchd.pid`.ctl -e vswitchd/reload 2>&1 | /usr/bin/logger -t brcompatd-reload' + if [ "$daemonize" != "y" ]; then + # Start in background and force a "success" message + action "Starting ovs-brcompatd ($strace_opt$valgrind_opt)" true + (nice -n "$VSWITCHD_PRIORITY" $strace_opt $valgrind_opt "$brcompatd" --reload-command="$reload_cmd" -P$BRCOMPATD_PIDFILE -vANY:CONSOLE:EMER $syslog_opt $logfile_level_opt $logfile_file_opt $leak_opt "$VSWITCHD_CONF") & + else + action "Starting ovs-brcompatd" nice -n "$BRCOMPATD_PRIORITY" $strace_opt $valgrind_opt "$brcompatd" --reload-command="$reload_cmd" -P$BRCOMPATD_PIDFILE -D -vANY:CONSOLE:EMER $syslog_opt $logfile_level_opt $logfile_file_opt $leak_opt "$VSWITCHD_CONF" + fi +} + +function stop_vswitchd { + if [ -f "$VSWITCHD_PIDFILE" ]; then + local pid=$(cat "$VSWITCHD_PIDFILE") + action "Killing ovs-vswitchd ($pid)" kill -TERM $pid + rm -f "$VSWITCHD_PIDFILE" + fi +} + +function stop_brcompatd { + if [ -f "$BRCOMPATD_PIDFILE" ]; then + local pid=$(cat "$BRCOMPATD_PIDFILE") + action "Killing ovs-brcompatd ($pid)" kill -TERM $pid + rm -f "$BRCOMPATD_PIDFILE" + fi +} + +function restart_approval { + if test ! -t 0; then + # Don't prompt if invoked non-interactively. + return 0 + fi + cat <. + +. /etc/init.d/functions + +test -e /etc/sysconfig/vswitch && . /etc/sysconfig/vswitch +VSWITCH_BASE="${VSWITCH_BASE:-/root/vswitch}" +VSWITCHD_CONF="${VSWITCHD_CONF:-/etc/ovs-vswitchd.conf}" +VSWITCHD_PIDFILE="${VSWITCHD_PIDFILE:-/var/run/ovs-vswitchd.pid}" +VSWITCHD_PRIORITY="${VSWITCHD_PRIORITY:--5}" +VSWITCHD_LOGFILE="${VSWITCHD_LOGFILE:-/var/log/ovs-vswitchd.log}" +VSWITCHD_FILE_LOGLEVEL="${VSWITCHD_FILE_LOGLEVEL:-}" +VSWITCHD_SYSLOG_LOGLEVEL="${VSWITCHD_SYSLOG_LOGLEVEL:-WARN}" +VSWITCHD_MEMLEAK_LOGFILE="${VSWITCHD_MEMLEAK_LOGFILE:-}" +BRCOMPATD_PIDFILE="${BRCOMPATD_PIDFILE:-/var/run/ovs-brcompatd.pid}" +BRCOMPATD_PRIORITY="${BRCOMPATD_PRIORITY:--5}" +BRCOMPATD_LOGFILE="${BRCOMPATD_LOGFILE:-/var/log/ovs-brcompatd.log}" +BRCOMPATD_FILE_LOGLEVEL="${BRCOMPATD_FILE_LOGLEVEL:-}" +BRCOMPATD_SYSLOG_LOGLEVEL="${BRCOMPATD_SYSLOG_LOGLEVEL:-WARN}" +BRCOMPATD_MEMLEAK_LOGFILE="${BRCOMPATD_MEMLEAK_LOGFILE:-}" + +function do_host_call { + xe host-call-plugin host-uuid="$INSTALLATION_UUID" plugin="vswitch-cfg-update" fn="update" >/dev/null +} + +function start { + if [ ! -f /etc/xensource-inventory ]; then + printf "vxwitch-xapi-update ERROR: XenSource inventory not present in /etc/xensource-inventory\n" + exit 1 + fi + source /etc/xensource-inventory + action "Updating configuration" do_host_call +} + +case "$1" in + start) + start + ;; + stop) + # Nothing to do here. + ;; + restart) + start + ;; + help) + printf "vswitch [start|stop|restart]\n" + ;; + *) + printf "Unknown command: $1\n" + exit 1 + ;; +esac diff --git a/xenserver/etc_logrotate.d_vswitch b/xenserver/etc_logrotate.d_vswitch new file mode 100644 index 00000000..dae235bd --- /dev/null +++ b/xenserver/etc_logrotate.d_vswitch @@ -0,0 +1,14 @@ +# Copyright (C) 2009 Nicira Networks, Inc. +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty provided the copyright +# notice and this notice are preserved. This file is offered as-is, +# without warranty of any kind. + +/var/log/ovs-vswitchd.log { + sharedscripts + postrotate + # Send sighup to vswitch which will cause it to reopen its log files. + /sbin/service vswitch reload + endscript +} diff --git a/xenserver/etc_profile.d_vswitch.sh b/xenserver/etc_profile.d_vswitch.sh new file mode 100644 index 00000000..90927547 --- /dev/null +++ b/xenserver/etc_profile.d_vswitch.sh @@ -0,0 +1,56 @@ +# Copyright (C) 2009 Nicira Networks, Inc. +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty provided the copyright +# notice and this notice are preserved. This file is offered as-is, +# without warranty of any kind. + +PATH=/root/vswitch/bin:$PATH +export PATH +MANPATH=/root/vswitch/share/man:$MANPATH +export MANPATH + +alias vswitch='service vswitch' + +function watchconf { + watch cat /etc/ovs-vswitchd.conf +} + +function watchdp { + watch ovs-dpctl show "$@" +} + +function watchdpflows { + local grep="" + local dp=$1 + shift + if [ $# -gt 0 ]; then + grep="| grep $@" + fi + watch "ovs-dpctl dump-flows $dp $grep" +} + +function watchflows { + local grep="" + local dp=$1 + shift + bridge=$(ovs-dpctl show $dp | grep 'port 0:' | cut -d' ' -f 3) + if [ $# -gt 0 ]; then + grep="| grep $@" + fi + watch "ovs-ofctl dump-flows unix:/var/run/$bridge.mgmt $grep" +} + +function monitorlogs { + local grep="" + if [ $# -gt 0 ]; then + grep="| grep --line-buffered '^==> .* <==$" + for i in "$@"; do + grep="$grep\|$i" + done + grep="$grep'" + fi + cmd="tail -F /var/log/messages /var/log/ovs-vswitchd.log /var/log/xensource.log $grep | tee /var/log/monitorlogs.out" + printf "cmd: $cmd\n" + eval "$cmd" +} diff --git a/xenserver/etc_sysconfig_vswitch.example b/xenserver/etc_sysconfig_vswitch.example new file mode 100644 index 00000000..cd13b591 --- /dev/null +++ b/xenserver/etc_sysconfig_vswitch.example @@ -0,0 +1,79 @@ +### Configuration options for vswitch + +# Copyright (C) 2009 Nicira Networks, Inc. +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty provided the copyright +# notice and this notice are preserved. This file is offered as-is, +# without warranty of any kind. + +# VSWITCH_BASE: Root directory where vswitch binaries are installed +# VSWITCH_BASE=/root/vswitch/openvswitch/build + +# ENABLE_BRCOMPAT: If 'y' than emulate linux bridging interfaces +# using the brcompat kernel module and ovs-brcompatd daemon +# ENABLE_BRCOMPAT=y + +# ENABLE_FAKE_PROC_NET: If 'y' then emulate linux bonding and vlan +# files in /proc as if the bonding and vlan demultiplexing done in +# ovs-vswitchd were being implemented using existing Linux mechanisms. +# This is useful in some cases when replacing existing solutions. +# ENABLE_FAKE_PROC_NET=y + +# FORCE_COREFILES: If 'y' then core files will be enabled. +# FORCE_COREFILES=n + +# COREFILE_PATTERN: Pattern used to determine path and filename for +# core files when FORCE_COREFILES is 'y'. This is Linux specific. +# See the manpage for "core". +# COREFILE_PATTERN="/var/log/%e-%t" + +# VSWITCHD_CONF: File in which ovs-vswitchd stores its configuration. +# VSWITCHD_CONF=/etc/ovs-vswitchd.conf + +# VSWITCHD_PIDFILE: File in which to store the pid of the running +# ovs-vswitchd. +# VSWITCHD_PIDFILE=/var/run/ovs-vswitchd.pid + +# VSWITCHD_PRIORITY: "nice" priority at which to run ovs-vswitchd and related +# processes. +# VSWITCHD_PRIORITY=-5 + +# VSWITCHD_LOGFILE: File to send the FILE_LOGLEVEL log messages to. +# VSWITCHD_LOGFILE=/var/log/ovs-vswitchd.log + +# VSWITCHD_FILE_LOGLEVEL: Log level at which to log into the +# VSWITCHD_LOG file. If this is null or not set the logfile will +# not be created and nothing will be sent to it. This is the +# default. The available options are: EMER, WARN, INFO and DBG. +# VSWITCHD_FILE_LOGLEVEL="" + +# VSWITCHD_SYSLOG_LOGLEVEL: Log level at which to log into syslog. If +# this is null or not set the default is to log to syslog +# emergency and warning level messages only. +# VSWITCHD_SYSLOG_LOGLEVEL="WARN" + +# BRCOMPATD_PIDFILE: File in which to store the pid of the running +# ovs-brcompatd (the Linux bridge compatibility daemon for ovs-vswitchd). +# If this is the empty string, ovs-brcompatd will not be started and +# the brcompat_mod kernel module will not be inserted. Note that +# the default is to use brcompat! +# BRCOMPATD_PIDFILE=/var/run/ovs-brcompatd.pid + +# BRCOMPATD_PRIORITY: "nice" priority at which to run ovs-vswitchd and related +# processes. +# BRCOMPATD_PRIORITY=-5 + +# BRCOMPATD_LOGFILE: File to send the FILE_LOGLEVEL log messages to. +# BRCOMPATD_LOGFILE=/var/log/ovs-brcompatd.log + +# BRCOMPATD_FILE_LOGLEVEL: Log level at which to log into the +# BRCOMPATD_LOG file. If this is null or not set the logfile will +# not be created and nothing will be sent to it. This is the +# default. The available options are: EMER, WARN, INFO and DBG. +# BRCOMPATD_FILE_LOGLEVEL="" + +# BRCOMPATD_SYSLOG_LOGLEVEL: Log level at which to log into syslog. If +# this is null or not set the default is to log to syslog +# emergency and warning level messages only. +# BRCOMPATD_SYSLOG_LOGLEVEL="WARN" diff --git a/xenserver/etc_xapi.d_plugins_vswitch-cfg-update b/xenserver/etc_xapi.d_plugins_vswitch-cfg-update new file mode 100755 index 00000000..b21cf46d --- /dev/null +++ b/xenserver/etc_xapi.d_plugins_vswitch-cfg-update @@ -0,0 +1,123 @@ +#!/usr/bin/env python +# +# xapi plugin script to update the cache of configuration items in the +# ovs-vswitchd configuration file that are managed in the xapi database +# when integrated with Citrix management tools. + +# Copyright (C) 2009 Nicira Networks, Inc. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# TBD: - error handling needs to be improved. Currently this can leave +# TBD: the system in a bad state if anything goes wrong. + +import logging +log = logging.getLogger("vswitch-cfg-update") +logging.basicConfig(filename="/var/log/vswitch-cfg-update.log", level=logging.DEBUG) + +import XenAPIPlugin +import XenAPI +import subprocess + +cfg_mod="/root/vswitch/bin/ovs-cfg-mod" +vswitchd_cfg_filename="/etc/ovs-vswitchd.conf" + +def update(session, args): + pools = session.xenapi.pool.get_all() + # We assume there is only ever one pool... + if len(pools) == 0: + log.error("No pool for host.") + raise XenAPIPlugin.Failure("NO_POOL_FOR_HOST", []) + if len(pools) > 1: + log.error("More than one pool for host.") + raise XenAPIPlugin.Failure("MORE_THAN_ONE_POOL_FOR_HOST", []) + pool = session.xenapi.pool.get_record(pools[0]) + try: + controller = pool["other_config"]["vSwitchController"] + except KeyError, e: + controller = "" + currentController = vswitchCurrentController() + if controller == "" and currentController != "": + log.debug("Removing controller configuration.") + removeControllerCfg() + return "Successfully removed controller config" + elif controller != currentController: + if len(controller) == 0: + log.debug("Setting controller to: %s" % (controller)) + else: + log.debug("Changing controller from %s to %s" % (currentController, controller)) + setControllerCfg(controller) + return "Successfully set controller to " + controller + else: + log.debug("No change to controller configuration required.") + return "No change to configuration" + +def vswitchCurrentController(): + controller = vswitchCfgQuery("mgmt.controller") + if controller == "": + return controller + if len(controller) < 4 or controller[0:4] != "ssl:": + log.warning("Controller does not specify ssl connection type, returning entire string.") + return controller + else: + return controller[4:] + +def removeControllerCfg(): + vswitchCfgMod(["--del-match", "mgmt.controller=*", + "--del-match", "ssl.bootstrap-ca-cert=*", + "--del-match", "ssl.ca-cert=*", + "--del-match", "ssl.private-key=*", + "--del-match", "ssl.certificate=*"]) + +def setControllerCfg(controller): + vswitchCfgMod(["--del-match", "mgmt.controller=*", + "--del-match", "ssl.bootstrap-ca-cert=*", + "--del-match", "ssl.ca-cert=*", + "--del-match", "ssl.private-key=*", + "--del-match", "ssl.certificate=*", + "-a", "mgmt.controller=ssl:" + controller, + "-a", "ssl.bootstrap-ca-cert=true", + "-a", "ssl.ca-cert=/etc/ovs-vswitchd.cacert", + "-a", "ssl.private-key=/etc/xensource/xapi-ssl.pem", + "-a", "ssl.certificate=/etc/xensource/xapi-ssl.pem"]) + +def vswitchCfgQuery(key): + cmd = [cfg_mod, "--config-file=" + vswitchd_cfg_filename, "-q", key] + output = subprocess.Popen(cmd, stdout=subprocess.PIPE).communicate() + if len(output) == 0 or output[0] == None: + output = "" + else: + output = output[0].strip() + return output + +def vswitchCfgMod(action_args): + cmd = [cfg_mod, "-vANY:console:emer", + "--config-file=" + vswitchd_cfg_filename] + action_args + exitcode = subprocess.call(cmd) + if exitcode != 0: + log.error("ovs-cfg-mod failed with exit code " + + str(exitcode) + " for " + repr(action_args)) + raise XenAPIPlugin.Failure("VSWITCH_CONFIG_MOD_FAILURE", + [ str(exitcode) , str(action_args) ]) + vswitchReload() + +def vswitchReload(): + exitcode = subprocess.call(["/sbin/service", "vswitch", "reload"]) + if exitcode != 0: + log.error("vswitch reload failed with exit code " + str(exitcode)) + raise XenAPIPlugin.Failure("VSWITCH_CFG_RELOAD_FAILURE", [ str(exitcode) ]) + + +if __name__ == "__main__": + XenAPIPlugin.dispatch({"update": update}) diff --git a/xenserver/etc_xensource_scripts_vif b/xenserver/etc_xensource_scripts_vif new file mode 100755 index 00000000..fcf13a69 --- /dev/null +++ b/xenserver/etc_xensource_scripts_vif @@ -0,0 +1,130 @@ +#!/bin/sh + +# This file is based on /etc/xensource/script/vif from Citrix XenServer 5.0.0. +# The original file did not contain a copyright notice or license statement. +# +# Copyright (C) 2009 Nicira Networks, Inc. + +# CA-23900: Warning: when VIFs are added to windows guests with PV drivers the backend vif device is registered, +# unregistered and then registered again. This causes the udev event to fire twice and this script runs twice. +# Since the first invocation of the script races with the device unregistration, spurious errors are possible +# which will be logged but are safe to ignore since the second script invocation should complete the operation. +# Note that each script invocation is run synchronously from udev and so the scripts don't race with each other. + +# Keep other-config/ keys in sync with device.ml:vif_udev_keys + +cfg_mod="/root/vswitch/bin/ovs-cfg-mod" +service="/sbin/service" + +TYPE=`echo ${XENBUS_PATH} | cut -f 2 -d '/'` +DOMID=`echo ${XENBUS_PATH} | cut -f 3 -d '/'` +DEVID=`echo ${XENBUS_PATH} | cut -f 4 -d '/'` + +XAPI=/xapi/${DOMID}/hotplug/${TYPE}/${DEVID} +HOTPLUG=/xapi/${DOMID}/hotplug/${TYPE}/${DEVID} +PRIVATE=/xapi/${DOMID}/private/${TYPE}/${DEVID} +BRCTL=/usr/sbin/brctl +IP=/sbin/ip + + +handle_promiscuous() +{ + local arg=$(xenstore-read "${PRIVATE}/other-config/promiscuous") + if [ $? -eq 0 -a -n "${arg}" ] ; then + case "${arg}" in + true|on) echo 1 > /sys/class/net/${vif}/brport/promisc ;; + *) echo 0 > /sys/class/net/${vif}/brport/promisc ;; + esac + fi +} + +handle_ethtool() +{ + local opt=$1 + local arg=$(xenstore-read "${PRIVATE}/other-config/ethtool-${opt}") + if [ $? -eq 0 -a -n "${arg}" ] ; then + case "${arg}" in + true|on) /sbin/ethtool -K "${vif}" "${opt}" on ;; + false|off) /sbin/ethtool -K "${vif}" "${opt}" off ;; + *) logger -t scripts-vif "Unknown ethtool argument ${opt}=${arg} on ${vif}/${VIFUUID}" ;; + esac + fi +} + +handle_mtu() +{ + local mtu=$(xenstore-read "${PRIVATE}/MTU") + if [ $? -eq 0 -a -n "${mtu}" ]; then + echo "${mtu}" > /sys/class/net/${vif}/mtu + fi +} + +add_to_bridge() +{ + local address=$(xenstore-read "${PRIVATE}/bridge-MAC") + if [ $? -ne 0 -o -z "${address}" ]; then + logger -t scripts-vif "Failed to read ${PRIVATE}/bridge-MAC from xenstore" + fi + local bridge=$(xenstore-read "${PRIVATE}/bridge") + if [ $? -ne 0 -o -z "${bridge}" ]; then + logger -t scripts-vif "Failed to read ${PRIVATE}/bridge from xenstore" + fi + logger -t scripts-vif "Adding ${vif} to ${bridge} with address ${address}" + + vid= + if [ -e "/etc/openvswitch/br-$bridge" ]; then + . "/etc/openvswitch/br-$bridge" + if [ -n "$VLAN_SLAVE" -a -n "$VLAN_VID" ]; then + bridge=$VLAN_SLAVE + vid="--add=vlan.$vif.tag=$VLAN_VID" + fi + fi + + ${IP} link set "${vif}" down || logger -t scripts-vif "Failed to ip link set ${vif} down" + ${IP} link set "${vif}" arp off || logger -t scripts-vif "Failed to ip link set ${vif} arp off" + ${IP} link set "${vif}" multicast off || logger -t scripts-vif "Failed to ip link set ${vif} multicast off" + ${IP} link set "${vif}" address "${address}" || logger -t scripts-vif "Failed to ip link set ${vif} address ${address}" + ${IP} addr flush "${vif}" || logger -t scripts-vif "Failed to ip addr flush ${vif}" + + $cfg_mod -F /etc/ovs-vswitchd.conf \ + --del-match="bridge.*.port=$vif" \ + --del-match="vlan.$vif.[!0-9]*" \ + --del-match="port.$vif.[!0-9]*" \ + --add="bridge.$bridge.port=$vif" \ + $vid -c + $service vswitch reload + + ${IP} link set "${vif}" up || logger -t scripts-vif "Failed to ip link set ${vif} up" +} + +echo Called as "$@" "$TYPE" "$DOMID" "$DEVID" | logger -t scripts-vif +case "$1" in +online) + handle_ethtool rx + handle_ethtool tx + handle_ethtool sg + handle_ethtool tso + handle_ethtool ufo + handle_ethtool gso + + handle_mtu + add_to_bridge + handle_promiscuous + + xenstore-write "${HOTPLUG}/vif" "${vif}" + xenstore-write "${HOTPLUG}/hotplug" "online" + + # xs-xen.pq.hq:91e986b8e49f netback-wait-for-hotplug + xenstore-write "/local/domain/0/backend/vif/${DOMID}/${DEVID}/hotplug-status" "connected" + + ;; +remove) + xenstore-rm "${HOTPLUG}/hotplug" + vif=vif${DOMID}.${DEVID} + logger -t scripts-vif "${vif} has been removed" + $cfg_mod -vANY:console:emer -F /etc/ovs-vswitchd.conf \ + --del-match="bridge.*.port=${vif}" \ + --del-match="vlan.${vif}.[!0-9]*" \ + --del-match="port.${vif}.[!0-9]*" -c + ;; +esac diff --git a/xenserver/opt_xensource_libexec_interface-reconfigure b/xenserver/opt_xensource_libexec_interface-reconfigure new file mode 100755 index 00000000..6ea369ff --- /dev/null +++ b/xenserver/opt_xensource_libexec_interface-reconfigure @@ -0,0 +1,1572 @@ +#!/usr/bin/python +# +# Copyright (c) Citrix Systems 2008. All rights reserved. +# Copyright (c) 2009 Nicira Networks. +# +"""Usage: + + %(command-name)s --session --pif [up|down|rewrite] + %(command-name)s --force [up|down|rewrite ] + %(command-name)s --force all down + + where, + = --device= --mode=dhcp + = --device= --mode=static --ip= --netmask= [--gateway=] + + Options: + --session A session reference to use to access the xapi DB + --pif A PIF reference. + --force-interface An interface name. Mutually exclusive with --session/--pif. + + Either both --session and --pif or just --pif-uuid. + + is either "up" or "down" or "rewrite" +""" + +# +# Undocumented parameters for test & dev: +# +# --output-directory= Write configuration to . Also disables actually +# raising/lowering the interfaces +# --pif-uuid A PIF UUID, use instead of --session/--pif. +# +# +# +# Notes: +# 1. Every pif belongs to exactly one network +# 2. Every network has zero or one pifs +# 3. A network may have an associated bridge, allowing vifs to be attached +# 4. A network may be bridgeless (there's no point having a bridge over a storage pif) + +# XXX: --force-interface=all down + +# XXX: --force-interface rewrite + +# XXX: Sometimes this leaves "orphaned" datapaths, e.g. a datapath whose +# only port is the local port. Should delete those. + +# XXX: This can leave crud in ovs-vswitchd.conf in this scenario: +# - Create bond in XenCenter. +# - Create VLAN on bond in XenCenter. +# - Attempt to delete bond in XenCenter (this will fail because there +# is a VLAN on the bond, although the error may not be reported +# until the next step) +# - Delete VLAN in XenCenter. +# - Delete bond in XenCenter. +# At this point there will still be some configuration data for the bond +# or the VLAN in ovs-vswitchd.conf. + +import XenAPI +import os, sys, getopt, time, signal +import syslog +import traceback +import time +import re +import pickle + +output_directory = None + +db = None +management_pif = None + +dbcache_file = "/etc/vswitch.dbcache" +vswitch_config_dir = "/etc/openvswitch" + +class Usage(Exception): + def __init__(self, msg): + Exception.__init__(self) + self.msg = msg + +class Error(Exception): + def __init__(self, msg): + Exception.__init__(self) + self.msg = msg + +class ConfigurationFile(object): + """Write a file, tracking old and new versions. + + Supports writing a new version of a file and applying and + reverting those changes. + """ + + __STATE = {"OPEN":"OPEN", + "NOT-APPLIED":"NOT-APPLIED", "APPLIED":"APPLIED", + "REVERTED":"REVERTED", "COMMITTED": "COMMITTED"} + + def __init__(self, fname, path="/etc/sysconfig/network-scripts"): + + self.__state = self.__STATE['OPEN'] + self.__fname = fname + self.__children = [] + + if debug_mode(): + dirname = output_directory + else: + dirname = path + + self.__path = os.path.join(dirname, fname) + self.__oldpath = os.path.join(dirname, "." + fname + ".xapi-old") + self.__newpath = os.path.join(dirname, "." + fname + ".xapi-new") + self.__unlink = False + + self.__f = open(self.__newpath, "w") + + def attach_child(self, child): + self.__children.append(child) + + def path(self): + return self.__path + + def readlines(self): + try: + return open(self.path()).readlines() + except: + return "" + + def write(self, args): + if self.__state != self.__STATE['OPEN']: + raise Error("Attempt to write to file in state %s" % self.__state) + self.__f.write(args) + + def unlink(self): + if self.__state != self.__STATE['OPEN']: + raise Error("Attempt to unlink file in state %s" % self.__state) + self.__unlink = True + self.__f.close() + self.__state = self.__STATE['NOT-APPLIED'] + + def close(self): + if self.__state != self.__STATE['OPEN']: + raise Error("Attempt to close file in state %s" % self.__state) + + self.__f.close() + self.__state = self.__STATE['NOT-APPLIED'] + + def changed(self): + if self.__state != self.__STATE['NOT-APPLIED']: + raise Error("Attempt to compare file in state %s" % self.__state) + + return True + + def apply(self): + if self.__state != self.__STATE['NOT-APPLIED']: + raise Error("Attempt to apply configuration from state %s" % self.__state) + + for child in self.__children: + child.apply() + + log("Applying changes to %s configuration" % self.__fname) + + # Remove previous backup. + if os.access(self.__oldpath, os.F_OK): + os.unlink(self.__oldpath) + + # Save current configuration. + if os.access(self.__path, os.F_OK): + os.link(self.__path, self.__oldpath) + os.unlink(self.__path) + + # Apply new configuration. + assert(os.path.exists(self.__newpath)) + if not self.__unlink: + os.link(self.__newpath, self.__path) + else: + pass # implicit unlink of original file + + # Remove temporary file. + os.unlink(self.__newpath) + + self.__state = self.__STATE['APPLIED'] + + def revert(self): + if self.__state != self.__STATE['APPLIED']: + raise Error("Attempt to revert configuration from state %s" % self.__state) + + for child in self.__children: + child.revert() + + log("Reverting changes to %s configuration" % self.__fname) + + # Remove existing new configuration + if os.access(self.__newpath, os.F_OK): + os.unlink(self.__newpath) + + # Revert new configuration. + if os.access(self.__path, os.F_OK): + os.link(self.__path, self.__newpath) + os.unlink(self.__path) + + # Revert to old configuration. + if os.access(self.__oldpath, os.F_OK): + os.link(self.__oldpath, self.__path) + os.unlink(self.__oldpath) + + # Leave .*.xapi-new as an aid to debugging. + + self.__state = self.__STATE['REVERTED'] + + def commit(self): + if self.__state != self.__STATE['APPLIED']: + raise Error("Attempt to commit configuration from state %s" % self.__state) + + for child in self.__children: + child.commit() + + log("Committing changes to %s configuration" % self.__fname) + + if os.access(self.__oldpath, os.F_OK): + os.unlink(self.__oldpath) + if os.access(self.__newpath, os.F_OK): + os.unlink(self.__newpath) + + self.__state = self.__STATE['COMMITTED'] + +def debug_mode(): + return output_directory is not None + +def log(s): + if debug_mode(): + print >>sys.stderr, s + else: + syslog.syslog(s) + +def check_allowed(pif): + pifrec = db.get_pif_record(pif) + try: + f = open("/proc/ardence") + macline = filter(lambda x: x.startswith("HWaddr:"), f.readlines()) + f.close() + if len(macline) == 1: + p = re.compile(".*\s%(MAC)s\s.*" % pifrec, re.IGNORECASE) + if p.match(macline[0]): + log("Skipping PVS device %(device)s (%(MAC)s)" % pifrec) + return False + except IOError: + pass + return True + +def interface_exists(i): + return os.path.exists("/sys/class/net/" + i) + +class DatabaseCache(object): + def __init__(self, session_ref=None, cache_file=None): + if session_ref and cache_file: + raise Error("can't specify session reference and cache file") + + if cache_file == None: + session = XenAPI.xapi_local() + + if not session_ref: + log("No session ref given on command line, logging in.") + session.xenapi.login_with_password("root", "") + else: + session._session = session_ref + + try: + self.__vlans = session.xenapi.VLAN.get_all_records() + self.__bonds = session.xenapi.Bond.get_all_records() + self.__pifs = session.xenapi.PIF.get_all_records() + self.__networks = session.xenapi.network.get_all_records() + finally: + if not session_ref: + session.xenapi.session.logout() + else: + log("Loading xapi database cache from %s" % cache_file) + f = open(cache_file, 'r') + members = pickle.load(f) + self.extras = pickle.load(f) + f.close() + + self.__vlans = members['vlans'] + self.__bonds = members['bonds'] + self.__pifs = members['pifs'] + self.__networks = members['networks'] + + def save(self, cache_file, extras): + f = open(cache_file, 'w') + pickle.dump({'vlans': self.__vlans, + 'bonds': self.__bonds, + 'pifs': self.__pifs, + 'networks': self.__networks}, f) + pickle.dump(extras, f) + f.close() + + def get_pif_by_uuid(self, uuid): + pifs = map(lambda (ref,rec): ref, + filter(lambda (ref,rec): uuid == rec['uuid'], + self.__pifs.items())) + if len(pifs) == 0: + raise Error("Unknown PIF \"%s\"" % uuid) + elif len(pifs) > 1: + raise Error("Non-unique PIF \"%s\"" % uuid) + + return pifs[0] + + def get_pifs_by_record(self, record): + """record is partial pif record. + Get the pif(s) whose record matches. + """ + def match(pifrec): + for key in record: + if record[key] != pifrec[key]: + return False + return True + + return map(lambda (ref,rec): ref, + filter(lambda (ref,rec): match(rec), + self.__pifs.items())) + + def get_pif_by_record(self, record): + """record is partial pif record. + Get the pif whose record matches. + """ + pifs = self.get_pifs_by_record(record) + if len(pifs) == 0: + raise Error("No matching PIF \"%s\"" % str(record)) + elif len(pifs) > 1: + raise Error("Multiple matching PIFs \"%s\"" % str(record)) + + return pifs[0] + + def get_pif_by_bridge(self, host, bridge): + networks = map(lambda (ref,rec): ref, + filter(lambda (ref,rec): rec['bridge'] == bridge, + self.__networks.items())) + if len(networks) == 0: + raise Error("No matching network \"%s\"") + + answer = None + for network in networks: + nwrec = self.get_network_record(network) + for pif in nwrec['PIFs']: + pifrec = self.get_pif_record(pif) + if pifrec['host'] != host: + continue + if answer: + raise Error("Multiple PIFs on %s for network %s" % (host, bridge)) + answer = pif + if not answer: + raise Error("No PIF on %s for network %s" % (host, bridge)) + return answer + + def get_pif_record(self, pif): + if self.__pifs.has_key(pif): + return self.__pifs[pif] + raise Error("Unknown PIF \"%s\"" % pif) + def get_all_pifs(self): + return self.__pifs + def pif_exists(self, pif): + return self.__pifs.has_key(pif) + + def get_management_pif(self, host): + """ Returns the management pif on host + """ + all = self.get_all_pifs() + for pif in all: + pifrec = self.get_pif_record(pif) + if pifrec['management'] and pifrec['host'] == host : + return pif + return None + + def get_network_record(self, network): + if self.__networks.has_key(network): + return self.__networks[network] + raise Error("Unknown network \"%s\"" % network) + def get_all_networks(self): + return self.__networks + + def get_bond_record(self, bond): + if self.__bonds.has_key(bond): + return self.__bonds[bond] + else: + return None + + def get_vlan_record(self, vlan): + if self.__vlans.has_key(vlan): + return self.__vlans[vlan] + else: + return None + +def bridge_name(pif): + """Return the bridge name associated with pif, or None if network is bridgeless""" + pifrec = db.get_pif_record(pif) + nwrec = db.get_network_record(pifrec['network']) + + if nwrec['bridge']: + # TODO: sanity check that nwrec['bridgeless'] != 'true' + return nwrec['bridge'] + else: + # TODO: sanity check that nwrec['bridgeless'] == 'true' + return None + +def interface_name(pif): + """Construct an interface name from the given PIF record.""" + + pifrec = db.get_pif_record(pif) + + if pifrec['VLAN'] == '-1': + return pifrec['device'] + else: + return "%(device)s.%(VLAN)s" % pifrec + +def datapath_name(pif): + """Return the OpenFlow datapath name associated with pif. +For a non-VLAN PIF, the datapath name is the bridge name. +For a VLAN PIF, the datapath name is the bridge name for the PIF's VLAN slave. +(xapi will create a datapath named with the bridge name even though we won't +use it.) +""" + + pifrec = db.get_pif_record(pif) + + if pifrec['VLAN'] == '-1': + return bridge_name(pif) + else: + return bridge_name(get_vlan_slave_of_pif(pif)) + +def ipdev_name(pif): + """Return the the name of the network device that carries the +IP configuration (if any) associated with pif. +The ipdev name is the same as the bridge name. +""" + + pifrec = db.get_pif_record(pif) + return bridge_name(pif) + +def physdev_names(pif): + """Return the name(s) of the physical network device(s) associated with pif. +For a VLAN PIF, the physical devices are the VLAN slave's physical devices. +For a bond master PIF, the physical devices are the bond slaves. +For a non-VLAN, non-bond master PIF, the physical device is the PIF itself. +""" + + pifrec = db.get_pif_record(pif) + + if pifrec['VLAN'] != '-1': + return physdev_names(get_vlan_slave_of_pif(pif)) + elif len(pifrec['bond_master_of']) != 0: + physdevs = [] + for slave in get_bond_slaves_of_pif(pif): + physdevs += physdev_names(slave) + return physdevs + else: + return [pifrec['device']] + +def log_pif_action(action, pif): + pifrec = db.get_pif_record(pif) + pifrec['action'] = action + pifrec['interface-name'] = interface_name(pif) + if action == "rewrite": + pifrec['message'] = "Rewrite PIF %(uuid)s configuration" % pifrec + else: + pifrec['message'] = "Bring %(action)s PIF %(uuid)s" % pifrec + log("%(message)s: %(interface-name)s configured as %(ip_configuration_mode)s" % pifrec) + +def get_bond_masters_of_pif(pif): + """Returns a list of PIFs which are bond masters of this PIF""" + + pifrec = db.get_pif_record(pif) + + bso = pifrec['bond_slave_of'] + + # bond-slave-of is currently a single reference but in principle a + # PIF could be a member of several bonds which are not + # concurrently attached. Be robust to this possibility. + if not bso or bso == "OpaqueRef:NULL": + bso = [] + elif not type(bso) == list: + bso = [bso] + + bondrecs = [db.get_bond_record(bond) for bond in bso] + bondrecs = [rec for rec in bondrecs if rec] + + return [bond['master'] for bond in bondrecs] + +def get_bond_slaves_of_pif(pif): + """Returns a list of PIFs which make up the given bonded pif.""" + + pifrec = db.get_pif_record(pif) + host = pifrec['host'] + + bmo = pifrec['bond_master_of'] + if len(bmo) > 1: + raise Error("Bond-master-of contains too many elements") + + if len(bmo) == 0: + return [] + + bondrec = db.get_bond_record(bmo[0]) + if not bondrec: + raise Error("No bond record for bond master PIF") + + return bondrec['slaves'] + +def get_vlan_slave_of_pif(pif): + """Find the PIF which is the VLAN slave of pif. + +Returns the 'physical' PIF underneath the a VLAN PIF @pif.""" + + pifrec = db.get_pif_record(pif) + + vlan = pifrec['VLAN_master_of'] + if not vlan or vlan == "OpaqueRef:NULL": + raise Error("PIF is not a VLAN master") + + vlanrec = db.get_vlan_record(vlan) + if not vlanrec: + raise Error("No VLAN record found for PIF") + + return vlanrec['tagged_PIF'] + +def get_vlan_masters_of_pif(pif): + """Returns a list of PIFs which are VLANs on top of the given pif.""" + + pifrec = db.get_pif_record(pif) + vlans = [db.get_vlan_record(v) for v in pifrec['VLAN_slave_of']] + return [v['untagged_PIF'] for v in vlans if v and db.pif_exists(v['untagged_PIF'])] + +def interface_deconfigure_commands(interface): + # The use of [!0-9] keeps an interface of 'eth0' from matching + # VLANs attached to eth0 (such as 'eth0.123'), which are distinct + # interfaces. + return ['--del-match=bridge.*.port=%s' % interface, + '--del-match=bonding.%s.[!0-9]*' % interface, + '--del-match=bonding.*.slave=%s' % interface, + '--del-match=vlan.%s.[!0-9]*' % interface, + '--del-match=port.%s.[!0-9]*' % interface, + '--del-match=iface.%s.[!0-9]*' % interface] + +def run_command(command): + log("Running command: " + ' '.join(command)) + if os.spawnl(os.P_WAIT, command[0], *command) != 0: + log("Command failed: " + ' '.join(command)) + return False + return True + +def down_netdev(interface, deconfigure=True): + if not interface_exists(interface): + log("down_netdev: interface %s does not exist, ignoring" % interface) + return + argv = ["/sbin/ifconfig", interface, 'down'] + if deconfigure: + argv += ['0.0.0.0'] + + # Kill dhclient. + pidfile_name = '/var/run/dhclient-%s.pid' % interface + pidfile = None + try: + pidfile = open(pidfile_name, 'r') + os.kill(int(pidfile.readline()), signal.SIGTERM) + except: + pass + if pidfile != None: + pidfile.close() + + # Remove dhclient pidfile. + try: + os.remove(pidfile_name) + except: + pass + run_command(argv) + +def up_netdev(interface): + run_command(["/sbin/ifconfig", interface, 'up']) + +def find_distinguished_pifs(pif): + """Returns the PIFs on host that own DNS and the default route. +The peerdns pif will be the one with pif::other-config:peerdns=true, or the mgmt pif if none have this set. +The gateway pif will be the one with pif::other-config:defaultroute=true, or the mgmt pif if none have this set. + +Note: we prune out the bond master pif (if it exists). +This is because when we are called to bring up an interface with a bond master, it is implicit that +we should bring down that master.""" + + pifrec = db.get_pif_record(pif) + host = pifrec['host'] + + pifs_on_host = [ __pif for __pif in db.get_all_pifs() if + db.get_pif_record(__pif)['host'] == host and + (not __pif in get_bond_masters_of_pif(pif)) ] + + peerdns_pif = None + defaultroute_pif = None + + # loop through all the pifs on this host looking for one with + # other-config:peerdns = true, and one with + # other-config:default-route=true + for __pif in pifs_on_host: + __pifrec = db.get_pif_record(__pif) + __oc = __pifrec['other_config'] + if __oc.has_key('peerdns') and __oc['peerdns'] == 'true': + if peerdns_pif == None: + peerdns_pif = __pif + else: + log('Warning: multiple pifs with "peerdns=true" - choosing %s and ignoring %s' % \ + (db.get_pif_record(peerdns_pif)['device'], __pifrec['device'])) + if __oc.has_key('defaultroute') and __oc['defaultroute'] == 'true': + if defaultroute_pif == None: + defaultroute_pif = __pif + else: + log('Warning: multiple pifs with "defaultroute=true" - choosing %s and ignoring %s' % \ + (db.get_pif_record(defaultroute_pif)['device'], __pifrec['device'])) + + # If no pif is explicitly specified then use the mgmt pif for peerdns/defaultroute + if peerdns_pif == None: + peerdns_pif = management_pif + if defaultroute_pif == None: + defaultroute_pif = management_pif + + return peerdns_pif, defaultroute_pif + +def ethtool_settings(oc): + # Options for "ethtool -s" + settings = [] + if oc.has_key('ethtool-speed'): + val = oc['ethtool-speed'] + if val in ["10", "100", "1000"]: + settings += ['speed', val] + else: + log("Invalid value for ethtool-speed = %s. Must be 10|100|1000." % val) + if oc.has_key('ethtool-duplex'): + val = oc['ethtool-duplex'] + if val in ["10", "100", "1000"]: + settings += ['duplex', 'val'] + else: + log("Invalid value for ethtool-duplex = %s. Must be half|full." % val) + if oc.has_key('ethtool-autoneg'): + val = oc['ethtool-autoneg'] + if val in ["true", "on"]: + settings += ['autoneg', 'on'] + elif val in ["false", "off"]: + settings += ['autoneg', 'off'] + else: + log("Invalid value for ethtool-autoneg = %s. Must be on|true|off|false." % val) + + # Options for "ethtool -K" + offload = [] + for opt in ("rx", "tx", "sg", "tso", "ufo", "gso"): + if oc.has_key("ethtool-" + opt): + val = oc["ethtool-" + opt] + if val in ["true", "on"]: + offload += [opt, 'on'] + elif val in ["false", "off"]: + offload += [opt, 'off'] + else: + log("Invalid value for ethtool-%s = %s. Must be on|true|off|false." % (opt, val)) + + return settings, offload + +def configure_netdev(pif): + pifrec = db.get_pif_record(pif) + datapath = datapath_name(pif) + ipdev = ipdev_name(pif) + + host = pifrec['host'] + nw = pifrec['network'] + nwrec = db.get_network_record(nw) + + ifconfig_argv = ['/sbin/ifconfig', ipdev, 'up'] + gateway = '' + if pifrec['ip_configuration_mode'] == "DHCP": + pass + elif pifrec['ip_configuration_mode'] == "Static": + ifconfig_argv += [pifrec['IP']] + ifconfig_argv += ['netmask', pifrec['netmask']] + gateway = pifrec['gateway'] + elif pifrec['ip_configuration_mode'] == "None": + # Nothing to do. + pass + else: + raise Error("Unknown IP-configuration-mode %s" % pifrec['ip_configuration_mode']) + + oc = {} + if pifrec.has_key('other_config'): + oc = pifrec['other_config'] + if oc.has_key('mtu'): + int(oc['mtu']) # Check that the value is an integer + ifconfig_argv += ['mtu', oc['mtu']] + + run_command(ifconfig_argv) + + (peerdns_pif, defaultroute_pif) = find_distinguished_pifs(pif) + + if peerdns_pif == pif: + f = ConfigurationFile('resolv.conf', "/etc") + if oc.has_key('domain'): + f.write("search %s\n" % oc['domain']) + for dns in pifrec['DNS'].split(","): + f.write("nameserver %s\n" % dns) + f.close() + f.apply() + f.commit() + + if defaultroute_pif == pif and gateway != '': + run_command(['/sbin/ip', 'route', 'replace', 'default', + 'via', gateway, 'dev', ipdev]) + + if oc.has_key('static-routes'): + for line in oc['static-routes'].split(','): + network, masklen, gateway = line.split('/') + run_command(['/sbin/ip', 'route', 'add', + '%s/%s' % (netmask, masklen), 'via', gateway, + 'dev', ipdev]) + + settings, offload = ethtool_settings(oc) + if settings: + run_command(['/sbin/ethtool', '-s', ipdev] + settings) + if offload: + run_command(['/sbin/ethtool', '-K', ipdev] + offload) + + if pifrec['ip_configuration_mode'] == "DHCP": + print + print "Determining IP information for %s..." % ipdev, + argv = ['/sbin/dhclient', '-q', + '-lf', '/var/lib/dhclient/dhclient-%s.leases' % ipdev, + '-pf', '/var/run/dhclient-%s.pid' % ipdev, + ipdev] + if run_command(argv): + print 'done.' + else: + print 'failed.' + +def modify_config(commands): + run_command(['/root/vswitch/bin/ovs-cfg-mod', '-vANY:console:emer', + '-F', '/etc/ovs-vswitchd.conf'] + + commands + ['-c']) + run_command(['/sbin/service', 'vswitch', 'reload']) + +def is_bond_pif(pif): + pifrec = db.get_pif_record(pif) + return len(pifrec['bond_master_of']) != 0 + +def configure_bond(pif): + pifrec = db.get_pif_record(pif) + interface = interface_name(pif) + ipdev = ipdev_name(pif) + datapath = datapath_name(pif) + physdevs = physdev_names(pif) + + argv = ['--del-match=bonding.%s.[!0-9]*' % interface] + argv += ["--add=bonding.%s.slave=%s" % (interface, slave) + for slave in physdevs] + + # Bonding options. + bond_options = { + "mode": "balance-slb", + "miimon": "100", + "downdelay": "200", + "updelay": "31000", + "use_carrier": "1", + } + # override defaults with values from other-config whose keys + # being with "bond-" + oc = pifrec['other_config'] + overrides = filter(lambda (key,val): + key.startswith("bond-"), oc.items()) + overrides = map(lambda (key,val): (key[5:], val), overrides) + bond_options.update(overrides) + for (name,val) in bond_options.items(): + argv += ["--add=bonding.%s.%s=%s" % (interface, name, val)] + return argv + +def action_up(pif): + pifrec = db.get_pif_record(pif) + + bridge = bridge_name(pif) + interface = interface_name(pif) + ipdev = ipdev_name(pif) + datapath = datapath_name(pif) + physdevs = physdev_names(pif) + vlan_slave = None + if pifrec['VLAN'] != '-1': + vlan_slave = get_vlan_slave_of_pif(pif) + if vlan_slave and is_bond_pif(vlan_slave): + bond_master = vlan_slave + elif is_bond_pif(pif): + bond_master = pif + else: + bond_master = None + bond_masters = get_bond_masters_of_pif(pif) + + # Support "rpm -e vswitch" gracefully by keeping Centos configuration + # files up-to-date, even though we don't use them or need them. + f = configure_pif(pif) + mode = pifrec['ip_configuration_mode'] + if bridge: + log("Configuring %s using %s configuration" % (bridge, mode)) + br = open_network_ifcfg(pif) + configure_network(pif, br) + br.close() + f.attach_child(br) + else: + log("Configuring %s using %s configuration" % (interface, mode)) + configure_network(pif, f) + f.close() + for master in bond_masters: + master_bridge = bridge_name(master) + removed = unconfigure_pif(master) + f.attach_child(removed) + if master_bridge: + removed = open_network_ifcfg(master) + log("Unlinking stale file %s" % removed.path()) + removed.unlink() + f.attach_child(removed) + + # /etc/xensource/scripts/vif needs to know where to add VIFs. + if vlan_slave: + if not os.path.exists(vswitch_config_dir): + os.mkdir(vswitch_config_dir) + br = ConfigurationFile("br-%s" % bridge, vswitch_config_dir) + br.write("VLAN_SLAVE=%s\n" % datapath) + br.write("VLAN_VID=%s\n" % pifrec['VLAN']) + br.close() + f.attach_child(br) + + # Update all configuration files (both ours and Centos's). + f.apply() + f.commit() + + # "ifconfig down" the network device and delete its IP address, etc. + down_netdev(ipdev) + for physdev in physdevs: + down_netdev(physdev) + + # Remove all keys related to pif and any bond masters linked to PIF. + del_ports = [ipdev] + physdevs + bond_masters + if vlan_slave and bond_master: + del_ports += [interface_name(bond_master)] + + # What ports do we need to add to the datapath? + # + # We definitely need the ipdev, and ordinarily we want the + # physical devices too, but for bonds we need the bond as bridge + # port. + add_ports = [ipdev, datapath] + if not bond_master: + add_ports += physdevs + else: + add_ports += [interface_name(bond_master)] + + # What ports do we need to delete? + # + # - All the ports that we add, to avoid duplication and to drop + # them from another datapath in case they're misassigned. + # + # - The physical devices, since they will either be in add_ports + # or added to the bonding device (see below). + # + # - The bond masters for pif. (Ordinarily pif shouldn't have any + # bond masters. If it does then interface-reconfigure is + # implicitly being asked to take them down.) + del_ports = add_ports + physdevs + bond_masters + + # What networks does this datapath carry? + # + # - The network corresponding to the datapath's PIF. + # + # - The networks corresponding to any VLANs attached to the + # datapath's PIF. + network_uuids = [] + for nwpif in db.get_pifs_by_record({'device': pifrec['device'], + 'host': pifrec['host']}): + net = db.get_pif_record(nwpif)['network'] + network_uuids += [db.get_network_record(net)['uuid']] + + # Now modify the ovs-vswitchd config file. + argv = [] + for port in set(del_ports): + argv += interface_deconfigure_commands(port) + for port in set(add_ports): + argv += ['--add=bridge.%s.port=%s' % (datapath, port)] + if vlan_slave: + argv += ['--add=vlan.%s.tag=%s' % (ipdev, pifrec['VLAN'])] + argv += ['--add=iface.%s.internal=true' % (ipdev)] + + # xapi creates a bridge by the name of the ipdev and requires + # that the IP address will be on it. We need to delete this + # bridge because we need that device to be a member of our + # datapath. + argv += ['--del-match=bridge.%s.[!0-9]*' % ipdev] + + # xapi insists that its attempts to create the bridge succeed, + # so force that to happen. + argv += ['--add=iface.%s.fake-bridge=true' % (ipdev)] + else: + try: + os.unlink("%s/br-%s" % (vswitch_config_dir, bridge)) + except OSError: + pass + argv += ['--del-match=bridge.%s.xs-network-uuids=*' % datapath] + argv += ['--add=bridge.%s.xs-network-uuids=%s' % (datapath, uuid) + for uuid in set(network_uuids)] + if bond_master: + argv += configure_bond(bond_master) + modify_config(argv) + + # Configure network devices. + configure_netdev(pif) + + # Bring up VLAN slave and bond slaves. + if vlan_slave: + up_netdev(ipdev_name(vlan_slave)) + for physdev in physdevs: + up_netdev(physdev) + + # Update /etc/issue (which contains the IP address of the management interface) + os.system("/sbin/update-issue") + +def action_down(pif): + rec = db.get_pif_record(pif) + interface = interface_name(pif) + bridge = bridge_name(pif) + ipdev = ipdev_name(pif) + + # Support "rpm -e vswitch" gracefully by keeping Centos configuration + # files up-to-date, even though we don't use them or need them. + f = unconfigure_pif(pif) + if bridge: + br = open_network_ifcfg(pif) + log("Unlinking stale file %s" % br.path()) + br.unlink() + f.attach_child(br) + try: + f.apply() + f.commit() + except Error, e: + log("action_down failed to apply changes: %s" % e.msg) + f.revert() + raise + + argv = [] + if rec['VLAN'] != '-1': + # Get rid of the VLAN device itself. + down_netdev(ipdev) + argv += interface_deconfigure_commands(ipdev) + + # If the VLAN's slave is attached, stop here. + slave = get_vlan_slave_of_pif(pif) + if db.get_pif_record(slave)['currently_attached']: + log("VLAN slave is currently attached") + modify_config(argv) + return + + # If the VLAN's slave has other VLANs that are attached, stop here. + masters = get_vlan_masters_of_pif(slave) + for m in masters: + if m != pif and db.get_pif_record(m)['currently_attached']: + log("VLAN slave has other master %s" % interface_naem(m)) + modify_config(argv) + return + + # Otherwise, take down the VLAN's slave too. + log("No more masters, bring down vlan slave %s" % interface_name(slave)) + pif = slave + else: + # Stop here if this PIF has attached VLAN masters. + vlan_masters = get_vlan_masters_of_pif(pif) + log("VLAN masters of %s - %s" % (rec['device'], [interface_name(m) for m in vlan_masters])) + for m in vlan_masters: + if db.get_pif_record(m)['currently_attached']: + log("Leaving %s up due to currently attached VLAN master %s" % (interface, interface_name(m))) + return + + # pif is now either a bond or a physical device which needs to be + # brought down. pif might have changed so re-check all its attributes. + rec = db.get_pif_record(pif) + interface = interface_name(pif) + bridge = bridge_name(pif) + ipdev = ipdev_name(pif) + + + bond_slaves = get_bond_slaves_of_pif(pif) + log("bond slaves of %s - %s" % (rec['device'], [interface_name(s) for s in bond_slaves])) + for slave in bond_slaves: + slave_interface = interface_name(slave) + log("bring down bond slave %s" % slave_interface) + argv += interface_deconfigure_commands(slave_interface) + down_netdev(slave_interface) + + argv += interface_deconfigure_commands(ipdev) + down_netdev(ipdev) + + argv += ['--del-match', 'bridge.%s.*' % datapath_name(pif)] + argv += ['--del-match', 'bonding.%s.[!0-9]*' % interface] + modify_config(argv) + +def action_rewrite(pif): + # Support "rpm -e vswitch" gracefully by keeping Centos configuration + # files up-to-date, even though we don't use them or need them. + pifrec = db.get_pif_record(pif) + f = configure_pif(pif) + interface = interface_name(pif) + bridge = bridge_name(pif) + mode = pifrec['ip_configuration_mode'] + if bridge: + log("Configuring %s using %s configuration" % (bridge, mode)) + br = open_network_ifcfg(pif) + configure_network(pif, br) + br.close() + f.attach_child(br) + else: + log("Configuring %s using %s configuration" % (interface, mode)) + configure_network(pif, f) + f.close() + try: + f.apply() + f.commit() + except Error, e: + log("failed to apply changes: %s" % e.msg) + f.revert() + raise + + # We have no code of our own to run here. + pass + +def main(argv=None): + global output_directory, management_pif + + session = None + pif_uuid = None + pif = None + + force_interface = None + force_management = False + + if argv is None: + argv = sys.argv + + try: + try: + shortops = "h" + longops = [ "output-directory=", + "pif=", "pif-uuid=", + "session=", + "force=", + "force-interface=", + "management", + "test-mode", + "device=", "mode=", "ip=", "netmask=", "gateway=", + "help" ] + arglist, args = getopt.gnu_getopt(argv[1:], shortops, longops) + except getopt.GetoptError, msg: + raise Usage(msg) + + force_rewrite_config = {} + + for o,a in arglist: + if o == "--output-directory": + output_directory = a + elif o == "--pif": + pif = a + elif o == "--pif-uuid": + pif_uuid = a + elif o == "--session": + session = a + elif o == "--force-interface" or o == "--force": + force_interface = a + elif o == "--management": + force_management = True + elif o in ["--device", "--mode", "--ip", "--netmask", "--gateway"]: + force_rewrite_config[o[2:]] = a + elif o == "-h" or o == "--help": + print __doc__ % {'command-name': os.path.basename(argv[0])} + return 0 + + if not debug_mode(): + syslog.openlog(os.path.basename(argv[0])) + log("Called as " + str.join(" ", argv)) + if len(args) < 1: + raise Usage("Required option not present") + if len(args) > 1: + raise Usage("Too many arguments") + + action = args[0] + # backwards compatibility + if action == "rewrite-configuration": action = "rewrite" + + if output_directory and ( session or pif ): + raise Usage("--session/--pif cannot be used with --output-directory") + if ( session or pif ) and pif_uuid: + raise Usage("--session/--pif and --pif-uuid are mutually exclusive.") + if ( session and not pif ) or ( not session and pif ): + raise Usage("--session and --pif must be used together.") + if force_interface and ( session or pif or pif_uuid ): + raise Usage("--force is mutually exclusive with --session, --pif and --pif-uuid") + if len(force_rewrite_config) and not (force_interface and action == "rewrite"): + raise Usage("\"--force rewrite\" needed for --device, --mode, --ip, --netmask, and --gateway") + + global db + if force_interface: + log("Force interface %s %s" % (force_interface, action)) + + if action == "rewrite": + action_force_rewrite(force_interface, force_rewrite_config) + else: + db = DatabaseCache(cache_file=dbcache_file) + host = db.extras['host'] + pif = db.get_pif_by_bridge(host, force_interface) + management_pif = db.get_management_pif(host) + + if action == "up": + action_up(pif) + elif action == "down": + action_down(pif) + else: + raise Usage("Unknown action %s" % action) + else: + db = DatabaseCache(session_ref=session) + + if pif_uuid: + pif = db.get_pif_by_uuid(pif_uuid) + + if not pif: + raise Usage("No PIF given") + + if force_management: + # pif is going to be the management pif + management_pif = pif + else: + # pif is not going to be the management pif. + # Search DB cache for pif on same host with management=true + pifrec = db.get_pif_record(pif) + host = pifrec['host'] + management_pif = db.get_management_pif(host) + + log_pif_action(action, pif) + + if not check_allowed(pif): + return 0 + + if action == "up": + action_up(pif) + elif action == "down": + action_down(pif) + elif action == "rewrite": + action_rewrite(pif) + else: + raise Usage("Unknown action %s" % action) + + # Save cache. + pifrec = db.get_pif_record(pif) + db.save(dbcache_file, {'host': pifrec['host']}) + + except Usage, err: + print >>sys.stderr, err.msg + print >>sys.stderr, "For help use --help." + return 2 + except Error, err: + log(err.msg) + return 1 + + return 0 + +# The following code allows interface-reconfigure to keep Centos +# network configuration files up-to-date, even though the vswitch +# never uses them. In turn, that means that "rpm -e vswitch" does not +# have to update any configuration files. + +def configure_ethtool(oc, f): + # Options for "ethtool -s" + settings = None + setting_opts = ["autoneg", "speed", "duplex"] + # Options for "ethtool -K" + offload = None + offload_opts = ["rx", "tx", "sg", "tso", "ufo", "gso"] + + for opt in [opt for opt in setting_opts + offload_opts if oc.has_key("ethtool-" + opt)]: + val = oc["ethtool-" + opt] + + if opt in ["speed"]: + if val in ["10", "100", "1000"]: + val = "speed " + val + else: + log("Invalid value for ethtool-speed = %s. Must be 10|100|1000." % val) + val = None + elif opt in ["duplex"]: + if val in ["half", "full"]: + val = "duplex " + val + else: + log("Invalid value for ethtool-duplex = %s. Must be half|full." % val) + val = None + elif opt in ["autoneg"] + offload_opts: + if val in ["true", "on"]: + val = opt + " on" + elif val in ["false", "off"]: + val = opt + " off" + else: + log("Invalid value for ethtool-%s = %s. Must be on|true|off|false." % (opt, val)) + val = None + + if opt in setting_opts: + if val and settings: + settings = settings + " " + val + else: + settings = val + elif opt in offload_opts: + if val and offload: + offload = offload + " " + val + else: + offload = val + + if settings: + f.write("ETHTOOL_OPTS=\"%s\"\n" % settings) + if offload: + f.write("ETHTOOL_OFFLOAD_OPTS=\"%s\"\n" % offload) + +def configure_mtu(oc, f): + if not oc.has_key('mtu'): + return + + try: + mtu = int(oc['mtu']) + f.write("MTU=%d\n" % mtu) + except ValueError, x: + log("Invalid value for mtu = %s" % mtu) + +def configure_static_routes(interface, oc, f): + """Open a route- file for static routes. + + Opens the static routes configuration file for interface and writes one + line for each route specified in the network's other config "static-routes" value. + E.g. if + interface ( RO): xenbr1 + other-config (MRW): static-routes: 172.16.0.0/15/192.168.0.3,172.18.0.0/16/192.168.0.4;... + + Then route-xenbr1 should be + 172.16.0.0/15 via 192.168.0.3 dev xenbr1 + 172.18.0.0/16 via 192.168.0.4 dev xenbr1 + """ + fname = "route-%s" % interface + if oc.has_key('static-routes'): + # The key is present - extract comma seperates entries + lines = oc['static-routes'].split(',') + else: + # The key is not present, i.e. there are no static routes + lines = [] + + child = ConfigurationFile(fname) + child.write("# DO NOT EDIT: This file (%s) was autogenerated by %s\n" % \ + (os.path.basename(child.path()), os.path.basename(sys.argv[0]))) + + try: + for l in lines: + network, masklen, gateway = l.split('/') + child.write("%s/%s via %s dev %s\n" % (network, masklen, gateway, interface)) + + f.attach_child(child) + child.close() + + except ValueError, e: + log("Error in other-config['static-routes'] format for network %s: %s" % (interface, e)) + +def __open_ifcfg(interface): + """Open a network interface configuration file. + + Opens the configuration file for interface, writes a header and + common options and returns the file object. + """ + fname = "ifcfg-%s" % interface + f = ConfigurationFile(fname) + + f.write("# DO NOT EDIT: This file (%s) was autogenerated by %s\n" % \ + (os.path.basename(f.path()), os.path.basename(sys.argv[0]))) + f.write("XEMANAGED=yes\n") + f.write("DEVICE=%s\n" % interface) + f.write("ONBOOT=no\n") + + return f + +def open_network_ifcfg(pif): + bridge = bridge_name(pif) + interface = interface_name(pif) + if bridge: + return __open_ifcfg(bridge) + else: + return __open_ifcfg(interface) + + +def open_pif_ifcfg(pif): + pifrec = db.get_pif_record(pif) + + log("Configuring %s (%s)" % (interface_name(pif), pifrec['MAC'])) + + f = __open_ifcfg(interface_name(pif)) + + if pifrec.has_key('other_config'): + configure_ethtool(pifrec['other_config'], f) + configure_mtu(pifrec['other_config'], f) + + return f + +def configure_network(pif, f): + """Write the configuration file for a network. + + Writes configuration derived from the network object into the relevant + ifcfg file. The configuration file is passed in, but if the network is + bridgeless it will be ifcfg-, otherwise it will be ifcfg-. + + This routine may also write ifcfg files of the networks corresponding to other PIFs + in order to maintain consistency. + + params: + pif: Opaque_ref of pif + f : ConfigurationFile(/path/to/ifcfg) to which we append network configuration + """ + + pifrec = db.get_pif_record(pif) + host = pifrec['host'] + nw = pifrec['network'] + nwrec = db.get_network_record(nw) + oc = None + bridge = bridge_name(pif) + interface = interface_name(pif) + if bridge: + device = bridge + else: + device = interface + + if nwrec.has_key('other_config'): + configure_ethtool(nwrec['other_config'], f) + configure_mtu(nwrec['other_config'], f) + configure_static_routes(device, nwrec['other_config'], f) + + + if pifrec.has_key('other_config'): + oc = pifrec['other_config'] + + if device == bridge: + f.write("TYPE=Bridge\n") + f.write("DELAY=0\n") + f.write("STP=off\n") + f.write("PIFDEV=%s\n" % interface_name(pif)) + + if pifrec['ip_configuration_mode'] == "DHCP": + f.write("BOOTPROTO=dhcp\n") + f.write("PERSISTENT_DHCLIENT=yes\n") + elif pifrec['ip_configuration_mode'] == "Static": + f.write("BOOTPROTO=none\n") + f.write("NETMASK=%(netmask)s\n" % pifrec) + f.write("IPADDR=%(IP)s\n" % pifrec) + f.write("GATEWAY=%(gateway)s\n" % pifrec) + elif pifrec['ip_configuration_mode'] == "None": + f.write("BOOTPROTO=none\n") + else: + raise Error("Unknown ip-configuration-mode %s" % pifrec['ip_configuration_mode']) + + if pifrec.has_key('DNS') and pifrec['DNS'] != "": + ServerList = pifrec['DNS'].split(",") + for i in range(len(ServerList)): f.write("DNS%d=%s\n" % (i+1, ServerList[i])) + if oc and oc.has_key('domain'): + f.write("DOMAIN='%s'\n" % oc['domain'].replace(',', ' ')) + + # We only allow one ifcfg-xenbr* to have PEERDNS=yes and there can be only one GATEWAYDEV in /etc/sysconfig/network. + # The peerdns pif will be the one with pif::other-config:peerdns=true, or the mgmt pif if none have this set. + # The gateway pif will be the one with pif::other-config:defaultroute=true, or the mgmt pif if none have this set. + + # Work out which pif on this host should be the one with PEERDNS=yes and which should be the GATEWAYDEV + # + # Note: we prune out the bond master pif (if it exists). + # This is because when we are called to bring up an interface with a bond master, it is implicit that + # we should bring down that master. + pifs_on_host = [ __pif for __pif in db.get_all_pifs() if + db.get_pif_record(__pif)['host'] == host and + (not __pif in get_bond_masters_of_pif(pif)) ] + other_pifs_on_host = [ __pif for __pif in pifs_on_host if __pif != pif ] + + peerdns_pif = None + defaultroute_pif = None + + # loop through all the pifs on this host looking for one with + # other-config:peerdns = true, and one with + # other-config:default-route=true + for __pif in pifs_on_host: + __pifrec = db.get_pif_record(__pif) + __oc = __pifrec['other_config'] + if __oc.has_key('peerdns') and __oc['peerdns'] == 'true': + if peerdns_pif == None: + peerdns_pif = __pif + else: + log('Warning: multiple pifs with "peerdns=true" - choosing %s and ignoring %s' % \ + (db.get_pif_record(peerdns_pif)['device'], __pifrec['device'])) + if __oc.has_key('defaultroute') and __oc['defaultroute'] == 'true': + if defaultroute_pif == None: + defaultroute_pif = __pif + else: + log('Warning: multiple pifs with "defaultroute=true" - choosing %s and ignoring %s' % \ + (db.get_pif_record(defaultroute_pif)['device'], __pifrec['device'])) + + # If no pif is explicitly specified then use the mgmt pif for peerdns/defaultroute + if peerdns_pif == None: + peerdns_pif = management_pif + if defaultroute_pif == None: + defaultroute_pif = management_pif + + # Update all the other network's ifcfg files and ensure consistency + for __pif in other_pifs_on_host: + __f = open_network_ifcfg(__pif) + peerdns_line_wanted = 'PEERDNS=%s\n' % ((__pif == peerdns_pif) and 'yes' or 'no') + lines = __f.readlines() + + if not peerdns_line_wanted in lines: + # the PIF selected for DNS has changed and as a result this ifcfg file needs rewriting + for line in lines: + if not line.lstrip().startswith('PEERDNS'): + __f.write(line) + log("Setting %s in %s" % (peerdns_line_wanted.strip(), __f.path())) + __f.write(peerdns_line_wanted) + __f.close() + f.attach_child(__f) + + else: + # There is no need to change this ifcfg file. So don't attach_child. + pass + + # ... and for this pif too + f.write('PEERDNS=%s\n' % ((pif == peerdns_pif) and 'yes' or 'no')) + + # Update gatewaydev + fnetwork = ConfigurationFile("network", "/etc/sysconfig") + for line in fnetwork.readlines(): + if line.lstrip().startswith('GATEWAY') : + continue + fnetwork.write(line) + if defaultroute_pif: + gatewaydev = bridge_name(defaultroute_pif) + if not gatewaydev: + gatewaydev = interface_name(defaultroute_pif) + fnetwork.write('GATEWAYDEV=%s\n' % gatewaydev) + fnetwork.close() + f.attach_child(fnetwork) + + return + + +def configure_physical_interface(pif): + """Write the configuration for a physical interface. + + Writes the configuration file for the physical interface described by + the pif object. + + Returns the open file handle for the interface configuration file. + """ + + pifrec = db.get_pif_record(pif) + + f = open_pif_ifcfg(pif) + + f.write("TYPE=Ethernet\n") + f.write("HWADDR=%(MAC)s\n" % pifrec) + + return f + +def configure_bond_interface(pif): + """Write the configuration for a bond interface. + + Writes the configuration file for the bond interface described by + the pif object. Handles writing the configuration for the slave + interfaces. + + Returns the open file handle for the bond interface configuration + file. + """ + + pifrec = db.get_pif_record(pif) + oc = pifrec['other_config'] + f = open_pif_ifcfg(pif) + + if pifrec['MAC'] != "": + f.write("MACADDR=%s\n" % pifrec['MAC']) + + for slave in get_bond_slaves_of_pif(pif): + s = configure_physical_interface(slave) + s.write("MASTER=%(device)s\n" % pifrec) + s.write("SLAVE=yes\n") + s.close() + f.attach_child(s) + + # The bond option defaults + bond_options = { + "mode": "balance-slb", + "miimon": "100", + "downdelay": "200", + "updelay": "31000", + "use_carrier": "1", + } + + # override defaults with values from other-config whose keys being with "bond-" + overrides = filter(lambda (key,val): key.startswith("bond-"), oc.items()) + overrides = map(lambda (key,val): (key[5:], val), overrides) + bond_options.update(overrides) + + # write the bond options to ifcfg-bondX + f.write('BONDING_OPTS="') + for (name,val) in bond_options.items(): + f.write("%s=%s " % (name,val)) + f.write('"\n') + return f + +def configure_vlan_interface(pif): + """Write the configuration for a VLAN interface. + + Writes the configuration file for the VLAN interface described by + the pif object. Handles writing the configuration for the master + interface if necessary. + + Returns the open file handle for the VLAN interface configuration + file. + """ + + slave = configure_pif(get_vlan_slave_of_pif(pif)) + slave.close() + + f = open_pif_ifcfg(pif) + f.write("VLAN=yes\n") + f.attach_child(slave) + + return f + +def configure_pif(pif): + """Write the configuration for a PIF object. + + Writes the configuration file the PIF and all dependent + interfaces (bond slaves and VLAN masters etc). + + Returns the open file handle for the interface configuration file. + """ + + pifrec = db.get_pif_record(pif) + + if pifrec['VLAN'] != '-1': + f = configure_vlan_interface(pif) + elif len(pifrec['bond_master_of']) != 0: + f = configure_bond_interface(pif) + else: + f = configure_physical_interface(pif) + + bridge = bridge_name(pif) + if bridge: + f.write("BRIDGE=%s\n" % bridge) + + return f + +def unconfigure_pif(pif): + """Clear up the files created by configure_pif""" + f = open_pif_ifcfg(pif) + log("Unlinking stale file %s" % f.path()) + f.unlink() + return f + +if __name__ == "__main__": + rc = 1 + try: + rc = main() + except: + ex = sys.exc_info() + err = traceback.format_exception(*ex) + for exline in err: + log(exline) + + if not debug_mode(): + syslog.closelog() + + sys.exit(rc) diff --git a/xenserver/usr_lib_xsconsole_plugins-base_XSFeatureVSwitch.py b/xenserver/usr_lib_xsconsole_plugins-base_XSFeatureVSwitch.py new file mode 100644 index 00000000..8f4be313 --- /dev/null +++ b/xenserver/usr_lib_xsconsole_plugins-base_XSFeatureVSwitch.py @@ -0,0 +1,296 @@ +# Copyright (c) Citrix Systems 2008. All rights reserved. +# xsconsole is proprietary software. +# +# Xen, the Xen logo, XenCenter, XenMotion are trademarks or registered +# trademarks of Citrix Systems, Inc., in the United States and other +# countries. + +# Copyright (c) 2009 Nicira Networks. + +import logging +log = logging.getLogger("vswitch-cfg-update") +logging.basicConfig(filename="/var/log/vswitch-xsplugin.log", level=logging.DEBUG) + +import os +import subprocess + +cfg_mod="/root/vswitch/bin/ovs-cfg-mod" +vswitchd_cfg_filename="/etc/ovs-vswitchd.conf" + +if __name__ == "__main__": + raise Exception("This script is a plugin for xsconsole and cannot run independently") + +from XSConsoleStandard import * + +class VSwitchService: + service = {} + + def __init__(self, name, processname=None): + self.name = name + self.processname = processname + if self.processname == None: + self.processname = name + + def status(self): + try: + output = ShellPipe(["service", self.name, "status"]).Stdout() + except StandardError, e: + log.error("status retrieval error: " + str(e)) + return "" + if len(output) == 0: + return "" + for l in output: + if self.processname not in l: + continue + elif "running" in l: + return "Running" + elif "stop" in l: + return "Stopped" + else: + return "" + return "" + + def restart(self): + try: + ShellPipe(["service", self.name, "restart"]).Call() + except StandardError, e: + log.error("restart error: " + str(e)) + + @classmethod + def Inst(cls, name, processname=None): + key = name + if processname != None: + key = key + "-" + processname + if name not in cls.service: + cls.service[key] = VSwitchService(name, processname) + return cls.service[key] + +class VSwitchConfig: + + @staticmethod + def Get(key): + try: + output = ShellPipe([cfg_mod, "-vANY:console:emer", "-F", + vswitchd_cfg_filename, "-q", key]).Stdout() + except StandardError, e: + log.error("config retrieval error: " + str(e)) + return "" + + if len(output) == 0: + output = "" + else: + output = output[0].strip() + return output + + +class VSwitchControllerDialogue(Dialogue): + def __init__(self): + Dialogue.__init__(self) + data=Data.Inst() + + self.hostsInPool = 0 + self.hostsUpdated = 0 + self.controller = data.GetPoolForThisHost().get("other_config", {}).get("vSwitchController", "") + + choiceDefs = [ + ChoiceDef(Lang("Set pool-wide controller"), + lambda: self.getController()), + ChoiceDef(Lang("Delete pool-wide controller"), + lambda: self.deleteController()), + ChoiceDef(Lang("Resync server controller config"), + lambda: self.syncController()), +# ChoiceDef(Lang("Restart ovs-vswitchd"), +# lambda: self.restartService("vswitch")), +# ChoiceDef(Lang("Restart ovs-brcompatd"), +# lambda: self.restartService("vswitch-brcompatd")) + ] + self.menu = Menu(self, None, Lang("Configure vSwitch"), choiceDefs) + + self.ChangeState("INITIAL") + + def BuildPane(self): + pane = self.NewPane(DialoguePane(self.parent)) + pane.TitleSet(Lang("Configure vSwitch")) + pane.AddBox() + + def ChangeState(self, inState): + self.state = inState + self.BuildPane() + self.UpdateFields() + + def UpdateFields(self): + self.Pane().ResetPosition() + getattr(self, "UpdateFields" + self.state)() # Dispatch method named 'UpdateFields'+self.state + + def UpdateFieldsINITIAL(self): + pane = self.Pane() + pane.AddTitleField(Lang("Select an action")) + pane.AddMenuField(self.menu) + pane.AddKeyHelpField( { Lang("") : Lang("OK"), Lang("") : Lang("Cancel") } ) + + def UpdateFieldsGETCONTROLLER(self): + pane = self.Pane() + pane.ResetFields() + + pane.AddTitleField(Lang("Enter IP address of controller")) + pane.AddInputField(Lang("Address", 16), self.controller, "address") + pane.AddKeyHelpField( { Lang("") : Lang("OK"), Lang("") : Lang("Exit") } ) + if pane.CurrentInput() is None: + pane.InputIndexSet(0) + + def HandleKey(self, inKey): + handled = False + if hasattr(self, "HandleKey" + self.state): + handled = getattr(self, "HandleKey" + self.state)(inKey) + if not handled and inKey == 'KEY_ESCAPE': + Layout.Inst().PopDialogue() + handled = True + return handled + + def HandleKeyINITIAL(self, inKey): + return self.menu.HandleKey(inKey) + + def HandleKeyGETCONTROLLER(self, inKey): + pane = self.Pane() + if pane.CurrentInput() is None: + pane.InputIndexSet(0) + if inKey == 'KEY_ENTER': + inputValues = pane.GetFieldValues() + self.controller = inputValues['address'] + Layout.Inst().PopDialogue() + Layout.Inst().TransientBanner(Lang("Setting controller...")) + try: + self.SetController(self.controller) + Layout.Inst().PushDialogue(InfoDialogue(Lang("Setting controller successful"))) + except Exception, e: + Layout.Inst().PushDialogue(InfoDialogue(Lang("Setting controller failed"))) + + self.ChangeState("INITIAL") + return True + else: + return pane.CurrentInput().HandleKey(inKey) + + def restartService(self, name): + s = VSwitchService.Inst(name) + s.restart() + Layout.Inst().PopDialogue() + + def getController(self): + self.ChangeState("GETCONTROLLER") + self.Pane().InputIndexSet(0) + + def deleteController(self): + self.controller = "" + Layout.Inst().PopDialogue() + Layout.Inst().TransientBanner(Lang("Deleting controller...")) + try: + self.SetController(None) + Layout.Inst().PushDialogue(InfoDialogue(Lang("Controller deletion successful"))) + except Exception, e: + Layout.Inst().PushDialogue(InfoDialogue(Lang("Controller deletion failed"))) + + def syncController(self): + Layout.Inst().PopDialogue() + Layout.Inst().TransientBanner(Lang("Resyncing controller setting...")) + try: + Task.Sync(lambda s: self._updateThisServer(s)) + Layout.Inst().PushDialogue(InfoDialogue(Lang("Resyncing controller config successful"))) + except Exception, e: + Layout.Inst().PushDialogue(InfoDialogue(Lang("Resyncing controller config failed"))) + + def SetController(self, ip): + self.hostsInPool = 0 + self.hostsUpdated = 0 + Task.Sync(lambda s: self._modifyPoolConfig(s, "vSwitchController", ip)) + # Should be done asynchronously, maybe with an external script? + Task.Sync(lambda s: self._updateActiveServers(s)) + + def _modifyPoolConfig(self, session, key, value): + """Modify pool configuration. + + If value == None then delete key, otherwise set key to value.""" + pools = session.xenapi.pool.get_all() + # We assume there is only ever one pool... + if len(pools) == 0: + log.error("No pool for host.") + raise XenAPIPlugin.Failure("NO_POOL_FOR_HOST", []) + if len(pools) > 1: + log.error("More than one pool for host.") + raise XenAPIPlugin.Failure("MORE_THAN_ONE_POOL_FOR_HOST", []) + session.xenapi.pool.remove_from_other_config(pools[0], key) + if value != None: + session.xenapi.pool.add_to_other_config(pools[0], key, value) + Data.Inst().Update() + + def _updateActiveServers(self, session): + hosts = session.xenapi.host.get_all() + self.hostsUpdated = 0 + self.hostsInPool = len(hosts) + self.UpdateFields() + for host in hosts: + Layout.Inst().TransientBanner("Updating host %d out of %d" + % (self.hostsUpdated + 1, self.hostsInPool)) + session.xenapi.host.call_plugin(host, "vswitch-cfg-update", "update", {}) + self.hostsUpdated = self.hostsUpdated + 1 + + def _updateThisServer(self, session): + data = Data.Inst() + host = data.host.opaqueref() + session.xenapi.host.call_plugin(host, "vswitch-cfg-update", "update", {}) + + +class XSFeatureVSwitch: + + @classmethod + def StatusUpdateHandler(cls, inPane): + data = Data.Inst() + + inPane.AddTitleField(Lang("vSwitch")) + + inPane.NewLine() + + versionStr = data.host.other_config({}).get("vSwitchVersion", "") + inPane.AddStatusField(Lang("Version", 20), versionStr) + + inPane.NewLine() + dbController = data.GetPoolForThisHost().get("other_config", {}).get("vSwitchController", "") + if dbController == "": + dbController = Lang("") + inPane.AddStatusField(Lang("Controller (config)", 20), dbController) + controller = VSwitchConfig.Get("mgmt.controller") + if controller == "": + controller = Lang("") + elif controller[0:4] == "ssl:": + controller = controller[4:] + inPane.AddStatusField(Lang("Controller (in-use)", 20), controller) + + inPane.NewLine() + inPane.AddStatusField(Lang("ovs-vswitchd status", 20), + VSwitchService.Inst("vswitch", "ovs-vswitchd").status()) + inPane.AddStatusField(Lang("ovs-brcompatd status", 20), + VSwitchService.Inst("vswitch", "ovs-brcompatd").status()) + + inPane.AddKeyHelpField( { + Lang("") : Lang("Reconfigure"), + Lang("") : Lang("Refresh") + }) + + @classmethod + def ActivateHandler(cls): + DialogueUtils.AuthenticatedOnly(lambda: Layout.Inst().PushDialogue(VSwitchControllerDialogue())) + + def Register(self): + Importer.RegisterNamedPlugIn( + self, + 'VSwitch', # Key of this plugin for replacement, etc. + { + 'menuname' : 'MENU_NETWORK', + 'menupriority' : 800, + 'menutext' : Lang('vSwitch') , + 'statusupdatehandler' : self.StatusUpdateHandler, + 'activatehandler' : self.ActivateHandler + } + ) + +# Register this plugin when module is imported +XSFeatureVSwitch().Register() diff --git a/xenserver/vswitch-xen.spec b/xenserver/vswitch-xen.spec new file mode 100644 index 00000000..58e35dec --- /dev/null +++ b/xenserver/vswitch-xen.spec @@ -0,0 +1,310 @@ +# Spec file for vswitch and related programs. + +# Copyright (C) 2009 Nicira Networks, Inc. +# +# Copying and distribution of this file, with or without modification, +# are permitted in any medium without royalty provided the copyright +# notice and this notice are preserved. This file is offered as-is, +# without warranty of any kind. + +# When building, the rpmbuild command line should define +# vswitch_version, xen_version, and build_number using -D arguments. +# for example: +# +# rpmbuild -D "vswitch_version 0.8.9~1+build123" -D "xen_version 2.6.18-128.1.1.el5.xs5.1.0.483.1000xen" -D "build_number --with-build-number=123" -bb /usr/src/redhat/SPECS/vswitch-xen.spec +# +%define version %{vswitch_version}-%{xen_version} +%define _prefix /root/vswitch + +Name: vswitch +Summary: Virtual switch +Group: System Environment/Daemons +URL: http://www.vswitch.org/ +Version: %{vswitch_version} +License: GPL3 +Release: 1 +Source: openvswitch-%{vswitch_version}.tar.gz +Buildroot: /tmp/vswitch-xen-rpm + +%description +The vswitch provides standard network bridging functions augmented with +support for the OpenFlow protocol for remote per-flow control of +traffic. + +%prep +%setup -q -n openvswitch-%{vswitch_version} + +%build +./configure --prefix=%{_prefix} --localstatedir=%{_localstatedir} --with-l26=/lib/modules/%{xen_version}/build --enable-ssl %{build_number} +make + +%install +rm -rf $RPM_BUILD_ROOT +make install DESTDIR=$RPM_BUILD_ROOT prefix=%{_prefix} +install -d -m 755 $RPM_BUILD_ROOT/etc +install -d -m 755 $RPM_BUILD_ROOT/etc/init.d +install -m 755 xenserver/etc_init.d_vswitch \ + $RPM_BUILD_ROOT/etc/init.d/vswitch +install -m 755 xenserver/etc_init.d_vswitch-xapi-update \ + $RPM_BUILD_ROOT/etc/init.d/vswitch-xapi-update +install -d -m 755 $RPM_BUILD_ROOT/etc/sysconfig +install -m 755 xenserver/etc_sysconfig_vswitch.example \ + $RPM_BUILD_ROOT/etc/sysconfig/vswitch.example +install -d -m 755 $RPM_BUILD_ROOT/etc/logrotate.d +install -m 755 xenserver/etc_logrotate.d_vswitch \ + $RPM_BUILD_ROOT/etc/logrotate.d/vswitch +install -d -m 755 $RPM_BUILD_ROOT/etc/profile.d +install -m 755 xenserver/etc_profile.d_vswitch.sh \ + $RPM_BUILD_ROOT/etc/profile.d/vswitch.sh +install -d -m 755 $RPM_BUILD_ROOT/etc/xapi.d/plugins +install -m 755 xenserver/etc_xapi.d_plugins_vswitch-cfg-update \ + $RPM_BUILD_ROOT/etc/xapi.d/plugins/vswitch-cfg-update +install -d -m 755 $RPM_BUILD_ROOT%{_prefix}/scripts +install -m 755 xenserver/opt_xensource_libexec_interface-reconfigure \ + $RPM_BUILD_ROOT%{_prefix}/scripts/interface-reconfigure +install -m 755 xenserver/etc_xensource_scripts_vif \ + $RPM_BUILD_ROOT%{_prefix}/scripts/vif +install -m 755 \ + xenserver/usr_lib_xsconsole_plugins-base_XSFeatureVSwitch.py \ + $RPM_BUILD_ROOT%{_prefix}/scripts/XSFeatureVSwitch.py + +install -d -m 755 $RPM_BUILD_ROOT%{_prefix}/kernel_modules +find datapath/linux-2.6 -name *.ko -exec install -m 755 \{\} $RPM_BUILD_ROOT%{_prefix}/kernel_modules/ \; + +# Get rid of stuff we don't want to make RPM happy. +rm -rf \ + $RPM_BUILD_ROOT/root/vswitch/bin/ezio-term \ + $RPM_BUILD_ROOT/root/vswitch/bin/ovs-controller \ + $RPM_BUILD_ROOT/root/vswitch/bin/ovs-discover \ + $RPM_BUILD_ROOT/root/vswitch/bin/ovs-kill \ + $RPM_BUILD_ROOT/root/vswitch/bin/ovs-pki \ + $RPM_BUILD_ROOT/root/vswitch/bin/ovs-switchui \ + $RPM_BUILD_ROOT/root/vswitch/bin/ovs-wdt \ + $RPM_BUILD_ROOT/root/vswitch/bin/secchan \ + $RPM_BUILD_ROOT/root/vswitch/sbin/ovs-monitor \ + $RPM_BUILD_ROOT/root/vswitch/share/man/man8/ovs-controller.8 \ + $RPM_BUILD_ROOT/root/vswitch/share/man/man8/ovs-discover.8 \ + $RPM_BUILD_ROOT/root/vswitch/share/man/man8/ovs-kill.8 \ + $RPM_BUILD_ROOT/root/vswitch/share/man/man8/ovs-pki.8 \ + $RPM_BUILD_ROOT/root/vswitch/share/man/man8/secchan.8 \ + $RPM_BUILD_ROOT/root/vswitch/share/openvswitch + +%clean +rm -rf $RPM_BUILD_ROOT + +%pre +if [ ! -f /etc/xensource-inventory ]; then + printf "XenSource inventory not present in /etc/xensource-inventory" + exit 1 +fi + +if [ "$1" = "1" ]; then + if ! md5sum -c --status </dev/null 2>&1; then + chkconfig --del $s || printf "Could not remove $s init script." + fi + chkconfig --add $s || printf "Could not add $s init script." + chkconfig $s on || printf "Could not enable $s init script." +done + +if [ "$1" = "1" ]; then # $1 = 2 for upgrade + printf "\nYou MUST reboot the server NOW to complete the change to the\n" + printf "the vSwitch. Attempts to modify networking on the server\n" + printf "or any hosted VM will fail until after the reboot and could\n" + printf "leave the server in an state requiring manual recovery.\n\n" +else + printf "\nTo use the new vSwitch, you should reboot the server\n" + printf "now. Failure to do so may result in incorrect operation.\n\n" +fi + +%preun +if [ "$1" = "0" ]; then # $1 = 1 for upgrade + for s in vswitch vswitch-xapi-update; do + chkconfig --del $s || printf "Could not remove $s init script." + done +fi + + +%postun +if [ "$1" = "0" ]; then # $1 = 1 for upgrade + + rm -f /usr/lib/xsconsole/plugins-base/XSFeatureVSwitch.py \ + /usr/lib/xsconsole/plugins-base/XSFeatureVSwitch.pyc \ + /usr/lib/xsconsole/plugins-base/XSFeatureVSwitch.pyo \ + || printf "Could not remove vSwitch xsconsole plugin.\n" + + # Restore original XenServer scripts + for f in \ + /opt/xensource/libexec/interface-reconfigure \ + /etc/xensource/scripts/vif + do + s=$(basename "$f") + if [ ! -f "%{_prefix}/xs-original/$s" ]; then + printf "Original XenServer $s script not present in %{_prefix}/xs-original\n" + printf "Could not restore original XenServer script.\n" + else + (rm -f "$f" \ + && mv "%{_prefix}/xs-original/$s" "$f") \ + || printf "Could not restore original XenServer $s script.\n" + fi + done + + find %{_prefix} -type d -depth -exec rmdir \{\} \; \ + || printf "Could not remove vSwitch install directory.\n" + + # Remove all configuration and log files + rm -f /etc/ovs-vswitchd.conf + rm -f /etc/sysconfig/vswitch + rm -f /var/log/vswitch* + rm -f /etc/ovs-vswitchd.cacert + + if [ ! -f /etc/xensource-inventory ]; then + printf "XenSource inventory not present in /etc/xensource-inventory\n" + printf "Could not remove vSwitchVersion from XAPI database.\n" + exit 1 + else + source /etc/xensource-inventory + xe host-param-remove \ + param-name=other-config param-key=vSwitchVersion \ + uuid="$INSTALLATION_UUID" || + echo "Could not clear vSwitchVersion config parameter." + fi + + printf "\nYou MUST reboot the server now to complete the change to\n" + printf "standard Xen networking. Attempts to modify networking on the\n" + printf "server or any hosted VM will fail until after the reboot and\n" + printf "could leave the server in a state requiring manual recovery.\n\n" +fi + + +%files +%defattr(-,root,root) +/etc/init.d/vswitch +/etc/init.d/vswitch-xapi-update +/etc/xapi.d/plugins/vswitch-cfg-update +/etc/sysconfig/vswitch.example +/etc/logrotate.d/vswitch +/etc/profile.d/vswitch.sh +/root/vswitch/kernel_modules/brcompat_mod.ko +/root/vswitch/kernel_modules/openvswitch_mod.ko +/root/vswitch/kernel_modules/veth_mod.ko +/root/vswitch/scripts/interface-reconfigure +/root/vswitch/scripts/vif +/root/vswitch/scripts/XSFeatureVSwitch.py +# Following two files are generated automatically by rpm. We don't +# really need them and they won't be used on the XenServer, but there +# isn't an obvious place to get rid of them since they are generated +# after the install script runs. Since they are small, we just +# include them. +/root/vswitch/scripts/XSFeatureVSwitch.pyc +/root/vswitch/scripts/XSFeatureVSwitch.pyo +/root/vswitch/sbin/ovs-brcompatd +/root/vswitch/sbin/ovs-vswitchd +/root/vswitch/bin/ovs-appctl +/root/vswitch/bin/ovs-cfg-mod +/root/vswitch/bin/ovs-dpctl +/root/vswitch/bin/ovs-ofctl +/root/vswitch/share/man/man5/ovs-vswitchd.conf.5 +/root/vswitch/share/man/man8/ovs-appctl.8 +/root/vswitch/share/man/man8/ovs-brcompatd.8 +/root/vswitch/share/man/man8/ovs-cfg-mod.8 +/root/vswitch/share/man/man8/ovs-dpctl.8 +/root/vswitch/share/man/man8/ovs-ofctl.8 +/root/vswitch/share/man/man8/ovs-vswitchd.8 -- 2.30.2