Granular link health statistics for cfm.
author Mehak Mahajan <mmahajan@nicira.com>
Thu, 5 Apr 2012 21:30:23 +0000 (14:30 -0700)
committer Mehak Mahajan <mmahajan@nicira.com>
Thu, 5 Apr 2012 21:41:37 +0000 (14:41 -0700)
The changes display the cfm_health of an interface.  The cfm_health
is the percentage of expected healthy CCM frames received from the
remote_mpid; it is undefined with more than one remote_mpid.  The value
can vary from 0 to 100, 100 being very healthy and 0 being unhealthy.

Feature #10363
Requested-by: Ethan Jackson <ethan@nicira.com>
Signed-off-by: Mehak Mahajan <mmahajan@nicira.com>
NEWS
lib/cfm.c
lib/cfm.h
ofproto/ofproto-dpif.c
ofproto/ofproto-provider.h
ofproto/ofproto.c
ofproto/ofproto.h
vswitchd/bridge.c
vswitchd/vswitch.ovsschema
vswitchd/vswitch.xml

diff --git a/NEWS b/NEWS
index a466f923f82122bbf09c0c07f22ce54802ece7a4..ed3fc888e4209f63b72890c5bcdde7508434e887 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -6,6 +6,8 @@ post-v1.6.0
     - Added ability to configure dscp setting for manager and controller
       connections.  By default, these connections have a DSCP value of
       Internetwork Control (0xc0).
+    - Added the granular link health statistics, 'cfm_health', to an
+      interface.
 
 
 v1.6.0 - xx xxx xxxx
index 8b9e5bc545d01f88577e977be69fbd75cd5fba6e..ea39e27a1829b0d52476050b365518c0022f9cea 100644 (file)
--- a/lib/cfm.c
+++ b/lib/cfm.c
@@ -60,6 +60,7 @@ static const uint8_t eth_addr_ccm_x[6] = {
 #define CCM_MAID_LEN 48
 #define CCM_OPCODE 1 /* CFM message opcode meaning CCM. */
 #define CCM_RDI_MASK 0x80
+#define CFM_HEALTH_INTERVAL 6
 struct ccm {
     uint8_t  mdlevel_version; /* MD Level and Version */
     uint8_t  opcode;
@@ -111,6 +112,12 @@ struct cfm {
      * avoid flapping. */
     uint64_t *rmps_array;     /* Cache of remote_mps. */
     size_t rmps_array_len;    /* Number of rmps in 'rmps_array'. */
+
+    int health;               /* Percentage of expected healthy CCM frames
+                                 received, or -1 if undefined. */
+    int health_interval;      /* Number of fault_intervals since health was
+                                 recomputed. */
+
 };
 
 /* Remote MPs represent foreign network entities that are configured to have
@@ -124,6 +131,9 @@ struct remote_mp {
                             receiving CCMs that it's expecting to. */
     bool opup;           /* Operational State. */
     uint32_t seq;        /* Most recently received sequence number. */
+    uint8_t num_health_ccm; /* Number of healthy ccm frames received since
+                               the cfm's 'health' was last recomputed. */
+
 };
 
 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(20, 30);
@@ -290,6 +300,7 @@ cfm_create(const char *name)
     hmap_insert(&all_cfms, &cfm->hmap_node, hash_string(cfm->name, 0));
     cfm->remote_opup = true;
     cfm->fault_override = -1;
+    cfm->health = -1;
     return cfm;
 }
 
@@ -332,6 +343,37 @@ cfm_run(struct cfm *cfm)
                                   sizeof *cfm->rmps_array);
 
         cfm->remote_opup = true;
+        if (cfm->health_interval == CFM_HEALTH_INTERVAL) {
+            /* Calculate the cfm health of the interface.  If the number of
+             * remote_mpids of a cfm interface is > 1, the cfm health is
+             * undefined. If the number of remote_mpids is 1, the cfm health is
+             * the percentage of the expected ccm frames received healthy over
+             * CFM_HEALTH_INTERVAL 'fault_interval's, else it is 0. */
+            if (hmap_count(&cfm->remote_mps) > 1) {
+                cfm->health = -1;
+            } else if (hmap_is_empty(&cfm->remote_mps)) {
+                cfm->health = 0;
+            } else {
+                int exp_ccm_recvd;
+
+                rmp = CONTAINER_OF(hmap_first(&cfm->remote_mps),
+                                   struct remote_mp, node);
+                exp_ccm_recvd = (CFM_HEALTH_INTERVAL * 7) / 2;
+                /* Calculate the percentage of healthy ccm frames received.
+                 * Since the 'fault_interval' is (3.5 * cfm_interval), and
+                 * 1 CCM packet must be received every cfm_interval,
+                 * the 'remote_mpid' health reports the percentage of
+                 * healthy CCM frames received every
+                 * 'CFM_HEALTH_INTERVAL'th 'fault_interval'. */
+                cfm->health = (rmp->num_health_ccm * 100) / exp_ccm_recvd;
+                cfm->health = MIN(cfm->health, 100);
+                rmp->num_health_ccm = 0;
+                assert(cfm->health >= 0 && cfm->health <= 100);
+            }
+            cfm->health_interval = 0;
+        }
+        cfm->health_interval++;
+
         HMAP_FOR_EACH_SAFE (rmp, rmp_next, node, &cfm->remote_mps) {
 
             if (!rmp->recv) {
@@ -535,6 +577,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
         uint64_t ccm_mpid;
         uint32_t ccm_seq;
         bool ccm_opdown;
+        bool fault = false;
 
         if (cfm->extended) {
             ccm_mpid = ntohll(ccm->mpid64);
@@ -549,6 +592,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
             VLOG_WARN_RL(&rl, "%s: received a CCM with an invalid interval"
                          " (%"PRIu8") from RMP %"PRIu64, cfm->name,
                          ccm_interval, ccm_mpid);
+            fault = true;
         }
 
         if (cfm->extended && ccm_interval == 0
@@ -556,6 +600,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
             VLOG_WARN_RL(&rl, "%s: received a CCM with an invalid extended"
                          " interval (%"PRIu16"ms) from RMP %"PRIu64, cfm->name,
                          ccm_interval_ms_x, ccm_mpid);
+            fault = true;
         }
 
         rmp = lookup_remote_mp(cfm, ccm_mpid);
@@ -569,6 +614,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
                              "%s: dropped CCM with MPID %"PRIu64" from MAC "
                              ETH_ADDR_FMT, cfm->name, ccm_mpid,
                              ETH_ADDR_ARGS(eth->eth_src));
+                fault = true;
             }
         }
 
@@ -576,16 +622,23 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
                  " (interval %"PRIu8") (RDI %s)", cfm->name, ccm_seq,
                  ccm_mpid, ccm_interval, ccm_rdi ? "true" : "false");
 
+        if (ccm_rdi) {
+            fault = true;
+        }
         if (rmp) {
             if (rmp->seq && ccm_seq != (rmp->seq + 1)) {
                 VLOG_WARN_RL(&rl, "%s: (mpid %"PRIu64") detected sequence"
                              " numbers which indicate possible connectivity"
                              " problems (previous %"PRIu32") (current %"PRIu32
                              ")", cfm->name, ccm_mpid, rmp->seq, ccm_seq);
+                fault = true;
             }
 
             rmp->mpid = ccm_mpid;
             rmp->recv = true;
+            if (!fault) {
+                rmp->num_health_ccm++;
+            }
             rmp->seq = ccm_seq;
             rmp->rdi = ccm_rdi;
             rmp->opup = !ccm_opdown;
@@ -605,6 +658,17 @@ cfm_get_fault(const struct cfm *cfm)
     return cfm->fault;
 }
 
+/* Gets the health of 'cfm'.  Returns an integer between 0 and 100 indicating
+ * the health of the link as a percentage of ccm frames received in
+ * CFM_HEALTH_INTERVAL * 'fault_interval' if there is only 1 remote_mpid,
+ * returns 0 if there are no remote_mpids, and returns -1 if there is more
+ * than 1 remote_mpid or the health has not been computed yet. */
+int
+cfm_get_health(const struct cfm *cfm)
+{
+    return cfm->health;
+}
+
 /* Gets the operational state of 'cfm'.  'cfm' is considered operationally down
  * if it has received a CCM with the operationally down bit set from any of its
  * remote maintenance points. Returns true if 'cfm' is operationally up. False
@@ -656,6 +720,11 @@ cfm_print_details(struct ds *ds, const struct cfm *cfm)
         ds_put_cstr(ds, "\n");
     }
 
+    if (cfm->health == -1) {
+        ds_put_format(ds, "\taverage health: undefined\n");
+    } else {
+        ds_put_format(ds, "\taverage health: %d\n", cfm->health);
+    }
     ds_put_format(ds, "\topstate: %s\n", cfm->opup ? "up" : "down");
     ds_put_format(ds, "\tremote_opstate: %s\n",
                   cfm->remote_opup ? "up" : "down");
index 2556a325b36b9bc396537c79ff18e8a38799edf2..2b4f888f576ac9e33bbe8526856f3dc80cd6506f 100644 (file)
--- a/lib/cfm.h
+++ b/lib/cfm.h
@@ -69,6 +69,7 @@ bool cfm_configure(struct cfm *, const struct cfm_settings *);
 bool cfm_should_process_flow(const struct cfm *cfm, const struct flow *);
 void cfm_process_heartbeat(struct cfm *, const struct ofpbuf *packet);
 int cfm_get_fault(const struct cfm *);
+int cfm_get_health(const struct cfm *);
 bool cfm_get_opup(const struct cfm *);
 void cfm_get_remote_mpids(const struct cfm *, const uint64_t **rmps,
                           size_t *n_rmps);
index 51b847f24d1a1829d3870f435e04f61ffe6326bb..a42d09eb9575f2dc8f6f09667335e5bbc667a567 100644 (file)
@@ -1119,6 +1119,14 @@ get_cfm_remote_mpids(const struct ofport *ofport_, const uint64_t **rmps,
         return -1;
     }
 }
+
+static int
+get_cfm_health(const struct ofport *ofport_)
+{
+    struct ofport_dpif *ofport = ofport_dpif_cast(ofport_);
+
+    return ofport->cfm ? cfm_get_health(ofport->cfm) : -1;
+}
 \f
 /* Spanning Tree. */
 
@@ -6491,6 +6499,7 @@ const struct ofproto_class ofproto_dpif_class = {
     set_cfm,
     get_cfm_fault,
     get_cfm_remote_mpids,
+    get_cfm_health,
     set_stp,
     get_stp_status,
     set_stp_port,
index 26904ef021a1adeb7d9bf551536de1317c87e152..7b0e478bf983194bca3acc1c31da3d7feb2164e7 100644 (file)
@@ -980,6 +980,17 @@ struct ofproto_class {
     int (*get_cfm_remote_mpids)(const struct ofport *ofport,
                                 const uint64_t **rmps, size_t *n_rmps);
 
+    /* Checks the health of CFM configured on 'ofport'.  Returns an integer
+     * between 0 and 100 to indicate the health percentage of the 'ofport'
+     * with respect to its single remote_mp, where 0 means that the 'ofport'
+     * is very unhealthy and 100 means the 'ofport' is perfectly healthy.
+     * Returns -1 if CFM is not enabled on 'ofport' or if the number of
+     * remote_mpids is > 1.
+     *
+     * This function may be a null pointer if the ofproto implementation does
+     * not support CFM. */
+    int (*get_cfm_health)(const struct ofport *ofport);
+
     /* Configures spanning tree protocol (STP) on 'ofproto' using the
      * settings defined in 's'.
      *
index e7e040126eb12c13259b78e2d13e66e7f43673be..f9343069b9126860463c91ddcd0bf69898884b32 100644 (file)
@@ -2481,6 +2481,19 @@ ofproto_port_get_cfm_remote_mpids(const struct ofproto *ofproto,
             : -1);
 }
 
+/* Checks the health of the CFM for 'ofp_port' within 'ofproto'.  Returns an
+ * integer value between 0 and 100 to indicate the health of the port as a
+ * percentage of healthy CCMs received, or returns -1 if 'ofp_port' does not
+ * exist, CFM is not enabled on it, or its health is undefined. */
+int
+ofproto_port_get_cfm_health(const struct ofproto *ofproto, uint16_t ofp_port)
+{
+    struct ofport *ofport = ofproto_get_port(ofproto, ofp_port);
+    return (ofport && ofproto->ofproto_class->get_cfm_health
+            ? ofproto->ofproto_class->get_cfm_health(ofport)
+            : -1);
+}
+
 static enum ofperr
 handle_aggregate_stats_request(struct ofconn *ofconn,
                                const struct ofp_stats_msg *osm)
index 6172f291f22f542e3685eb00340597ed307c98e0..c40f5d3a419cf80c5def70735f72a068c88c1bf9 100644 (file)
@@ -348,7 +348,8 @@ int ofproto_port_get_cfm_fault(const struct ofproto *, uint16_t ofp_port);
 int ofproto_port_get_cfm_remote_mpids(const struct ofproto *,
                                       uint16_t ofp_port, const uint64_t **rmps,
                                       size_t *n_rmps);
-
+int ofproto_port_get_cfm_health(const struct ofproto *ofproto,
+                                uint16_t ofp_port);
 void ofproto_get_ofproto_controller_info(const struct ofproto *, struct shash *);
 void ofproto_free_ofproto_controller_info(struct shash *);
 \f
index adc3b47cc9b524e087d5ddf755adc7476e9ca864..35d8723354ad2d9e3e321b6d9e4dde7653d08398 100644 (file)
@@ -279,6 +279,7 @@ bridge_init(const char *remote)
     ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_cfm_fault);
     ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_cfm_fault_status);
     ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_cfm_remote_mpids);
+    ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_cfm_health);
     ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_lacp_current);
     ovsdb_idl_omit(idl, &ovsrec_interface_col_external_ids);
 
@@ -1547,6 +1548,7 @@ iface_refresh_cfm_stats(struct iface *iface)
     int fault, error;
     const uint64_t *rmps;
     size_t n_rmps;
+    int health;
 
     if (iface_is_synthetic(iface)) {
         return;
@@ -1582,6 +1584,15 @@ iface_refresh_cfm_stats(struct iface *iface)
     } else {
         ovsrec_interface_set_cfm_remote_mpids(cfg, NULL, 0);
     }
+
+    health = ofproto_port_get_cfm_health(iface->port->bridge->ofproto,
+                                        iface->ofp_port);
+    if (health >= 0) {
+        int64_t cfm_health = health;
+        ovsrec_interface_set_cfm_health(cfg, &cfm_health, 1);
+    } else {
+        ovsrec_interface_set_cfm_health(cfg, NULL, 0);
+    }
 }
 
 static void
index a3847e777ba1e2ff34dc66f3be7eba26ec19badb..c7e1ac9ef4e619f65b682db5c16434be079b0e32 100644 (file)
@@ -1,6 +1,6 @@
 {"name": "Open_vSwitch",
- "version": "6.8.0",
- "cksum": "4106006492 16485",
+ "version": "6.9.0",
+ "cksum": "617116616 16682",
  "tables": {
    "Open_vSwitch": {
      "columns": {
          "ephemeral": true},
        "cfm_fault_status": {
          "type": {"key": "string", "min": 0, "max": "unlimited"}},
+       "cfm_health": {
+         "type": {"key": {"type": "integer",
+                          "minInteger": 0,
+                          "maxInteger": 100},
+                  "min": 0, "max": 1}},
        "lacp_current": {
          "type": {"key": {"type": "boolean"},
                   "min": 0, "max": 1},
index f3ea33871ba2bf456c90e0b8fac33951b7677e6e..03c8539d0a448e31a5594f6173c00ae6f514b278 100644 (file)
         an <code>ovs-appctl</code> command.
       </column>
 
+      <column name="cfm_health">
+        <p>
+          Indicates the health of the interface as a percentage of CCM frames
+          received over 21 <ref column="other_config" key="cfm_interval"/>s.
+          The health of an interface is undefined if it is communicating with
+          more than one <ref column="cfm_remote_mpids"/>.  It reduces if
+          healthy heartbeats are not received at the expected rate, and
+          gradually improves as healthy heartbeats are received at the desired
+          rate. Every 21 <ref column="other_config" key="cfm_interval"/>s, the
+          health of the interface is refreshed.
+        </p>
+        <p>
+          As mentioned above, the faults can be triggered for several reasons.
+          The link health will deteriorate even if heartbeats are received but
+          they are reported to be unhealthy.  An unhealthy heartbeat in this
+          context is a heartbeat that either has some fault set or is out
+          of sequence.  The interface health can be 100 only on receiving
+          healthy heartbeats at the desired rate.
+        </p>
+      </column>
+
       <column name="cfm_remote_mpids">
         When CFM is properly configured, Open vSwitch will occasionally
         receive CCM broadcasts.  These broadcasts contain the MPID of the