Granular link health statistics for cfm.

author Mehak Mahajan <mmahajan@nicira.com>

Thu, 5 Apr 2012 21:30:23 +0000 (14:30 -0700)

committer Mehak Mahajan <mmahajan@nicira.com>

Thu, 5 Apr 2012 21:41:37 +0000 (14:41 -0700)
author Mehak Mahajan <mmahajan@nicira.com>
Thu, 5 Apr 2012 21:30:23 +0000 (14:30 -0700)
committer Mehak Mahajan <mmahajan@nicira.com>
Thu, 5 Apr 2012 21:41:37 +0000 (14:41 -0700)
diff --git a/NEWS b/NEWS

index a466f923f82122bbf09c0c07f22ce54802ece7a4..ed3fc888e4209f63b72890c5bcdde7508434e887 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -6,6 +6,8 @@ post-v1.6.0
      - Added ability to configure dscp setting for manager and controller
        connections.  By default, these connections have a DSCP value of
        Internetwork Control (0xc0).
+    - Added the granular link health statistics, 'cfm_health', to an
+      interface.
  
  
  v1.6.0 - xx xxx xxxx
diff --git a/lib/cfm.c b/lib/cfm.c

index 8b9e5bc545d01f88577e977be69fbd75cd5fba6e..ea39e27a1829b0d52476050b365518c0022f9cea 100644 (file)
--- a/lib/cfm.c
+++ b/lib/cfm.c
@@ -60,6 +60,7 @@ static const uint8_t eth_addr_ccm_x[6] = {
  #define CCM_MAID_LEN 48
  #define CCM_OPCODE 1 /* CFM message opcode meaning CCM. */
  #define CCM_RDI_MASK 0x80
+#define CFM_HEALTH_INTERVAL 6
  struct ccm {
      uint8_t  mdlevel_version; /* MD Level and Version */
      uint8_t  opcode;
@@ -111,6 +112,12 @@ struct cfm {
       * avoid flapping. */
      uint64_t *rmps_array;     /* Cache of remote_mps. */
      size_t rmps_array_len;    /* Number of rmps in 'rmps_array'. */
+
+    int health;               /* Percentage of expected healthy CCM frames
+                                 received, or -1 if undefined. */
+    int health_interval;      /* Number of fault_intervals since health was
+                                 recomputed. */
+
  };
  
  /* Remote MPs represent foreign network entities that are configured to have
@@ -124,6 +131,9 @@ struct remote_mp {
                              receiving CCMs that it's expecting to. */
      bool opup;           /* Operational State. */
      uint32_t seq;        /* Most recently received sequence number. */
+    uint8_t num_health_ccm; /* Number of healthy ccm frames received every
+                               CFM_HEALTH_INTERVAL * 'fault_interval'. */
+
  };
  
  static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(20, 30);
@@ -290,6 +300,7 @@ cfm_create(const char *name)
      hmap_insert(&all_cfms, &cfm->hmap_node, hash_string(cfm->name, 0));
      cfm->remote_opup = true;
      cfm->fault_override = -1;
+    cfm->health = -1;
      return cfm;
  }
  
@@ -332,6 +343,37 @@ cfm_run(struct cfm *cfm)
                                    sizeof *cfm->rmps_array);
  
          cfm->remote_opup = true;
+        if (cfm->health_interval == CFM_HEALTH_INTERVAL) {
+            /* Calculate the cfm health of the interface.  If there is more
+             * than one remote_mpid, the cfm health is undefined (-1).  If
+             * there are none, it is 0.  If there is exactly one, it is the
+             * percentage of expected healthy CCM frames received over the
+             * last CFM_HEALTH_INTERVAL 'fault_interval's. */
+            if (hmap_count(&cfm->remote_mps) > 1) {
+                cfm->health = -1;
+            } else if (hmap_is_empty(&cfm->remote_mps)) {
+                cfm->health = 0;
+            } else {
+                int exp_ccm_recvd;
+
+                rmp = CONTAINER_OF(hmap_first(&cfm->remote_mps),
+                                   struct remote_mp, node);
+                exp_ccm_recvd = (CFM_HEALTH_INTERVAL * 7) / 2;
+                /* Calculate the percentage of healthy ccm frames received.
+                 * Since the 'fault_interval' is (3.5 * cfm_interval), and
+                 * 1 CCM packet must be received every cfm_interval, the
+                 * expected count is (CFM_HEALTH_INTERVAL * 3.5), rounded
+                 * down.  The health is recomputed once every
+                 * CFM_HEALTH_INTERVAL 'fault_interval's. */
+                cfm->health = (rmp->num_health_ccm * 100) / exp_ccm_recvd;
+                cfm->health = MIN(cfm->health, 100);
+                rmp->num_health_ccm = 0;
+                assert(cfm->health >= 0 && cfm->health <= 100);
+            }
+            cfm->health_interval = 0;
+        }
+        cfm->health_interval++;
+
          HMAP_FOR_EACH_SAFE (rmp, rmp_next, node, &cfm->remote_mps) {
  
              if (!rmp->recv) {
@@ -535,6 +577,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
          uint64_t ccm_mpid;
          uint32_t ccm_seq;
          bool ccm_opdown;
+        bool fault = false;
  
          if (cfm->extended) {
              ccm_mpid = ntohll(ccm->mpid64);
@@ -549,6 +592,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
              VLOG_WARN_RL(&rl, "%s: received a CCM with an invalid interval"
                           " (%"PRIu8") from RMP %"PRIu64, cfm->name,
                           ccm_interval, ccm_mpid);
+            fault = true;
          }
  
          if (cfm->extended && ccm_interval == 0
@@ -556,6 +600,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
              VLOG_WARN_RL(&rl, "%s: received a CCM with an invalid extended"
                           " interval (%"PRIu16"ms) from RMP %"PRIu64, cfm->name,
                           ccm_interval_ms_x, ccm_mpid);
+            fault = true;
          }
  
          rmp = lookup_remote_mp(cfm, ccm_mpid);
@@ -569,6 +614,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
                               "%s: dropped CCM with MPID %"PRIu64" from MAC "
                               ETH_ADDR_FMT, cfm->name, ccm_mpid,
                               ETH_ADDR_ARGS(eth->eth_src));
+                fault = true;
              }
          }
  
@@ -576,16 +622,23 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
                   " (interval %"PRIu8") (RDI %s)", cfm->name, ccm_seq,
                   ccm_mpid, ccm_interval, ccm_rdi ? "true" : "false");
  
+        if (ccm_rdi) {
+            fault = true;
+        }
          if (rmp) {
              if (rmp->seq && ccm_seq != (rmp->seq + 1)) {
                  VLOG_WARN_RL(&rl, "%s: (mpid %"PRIu64") detected sequence"
                               " numbers which indicate possible connectivity"
                               " problems (previous %"PRIu32") (current %"PRIu32
                               ")", cfm->name, ccm_mpid, rmp->seq, ccm_seq);
+                fault = true;
              }
  
              rmp->mpid = ccm_mpid;
              rmp->recv = true;
+            if (!fault) {
+                rmp->num_health_ccm++;
+            }
              rmp->seq = ccm_seq;
              rmp->rdi = ccm_rdi;
              rmp->opup = !ccm_opdown;
@@ -605,6 +658,17 @@ cfm_get_fault(const struct cfm *cfm)
      return cfm->fault;
  }
  
+/* Gets the health of 'cfm'.  Returns an integer between 0 and 100 indicating
+ * the health of the link as a percentage of healthy ccm frames received in
+ * CFM_HEALTH_INTERVAL * 'fault_interval' if there is exactly 1 remote_mpid,
+ * 0 if there are no remote_mpids, and -1 if there is more than 1 remote_mpid
+ * or the health has not been computed yet. */
+int
+cfm_get_health(const struct cfm *cfm)
+{
+    return cfm->health;
+}
+
  /* Gets the operational state of 'cfm'.  'cfm' is considered operationally down
   * if it has received a CCM with the operationally down bit set from any of its
   * remote maintenance points. Returns true if 'cfm' is operationally up. False
@@ -656,6 +720,11 @@ cfm_print_details(struct ds *ds, const struct cfm *cfm)
          ds_put_cstr(ds, "\n");
      }
  
+    if (cfm->health == -1) {
+        ds_put_format(ds, "\taverage health: undefined\n");
+    } else {
+        ds_put_format(ds, "\taverage health: %d\n", cfm->health);
+    }
      ds_put_format(ds, "\topstate: %s\n", cfm->opup ? "up" : "down");
      ds_put_format(ds, "\tremote_opstate: %s\n",
                    cfm->remote_opup ? "up" : "down");
diff --git a/lib/cfm.h b/lib/cfm.h

index 2556a325b36b9bc396537c79ff18e8a38799edf2..2b4f888f576ac9e33bbe8526856f3dc80cd6506f 100644 (file)
--- a/lib/cfm.h
+++ b/lib/cfm.h
@@ -69,6 +69,7 @@ bool cfm_configure(struct cfm *, const struct cfm_settings *);
  bool cfm_should_process_flow(const struct cfm *cfm, const struct flow *);
  void cfm_process_heartbeat(struct cfm *, const struct ofpbuf *packet);
  int cfm_get_fault(const struct cfm *);
+int cfm_get_health(const struct cfm *);
  bool cfm_get_opup(const struct cfm *);
  void cfm_get_remote_mpids(const struct cfm *, const uint64_t **rmps,
                            size_t *n_rmps);
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c

index 51b847f24d1a1829d3870f435e04f61ffe6326bb..a42d09eb9575f2dc8f6f09667335e5bbc667a567 100644 (file)
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -1119,6 +1119,14 @@ get_cfm_remote_mpids(const struct ofport *ofport_, const uint64_t **rmps,
          return -1;
      }
  }
+
+static int
+get_cfm_health(const struct ofport *ofport_)
+{
+    struct ofport_dpif *ofport = ofport_dpif_cast(ofport_);
+
+    return ofport->cfm ? cfm_get_health(ofport->cfm) : -1;
+}
  \f
  /* Spanning Tree. */
  
@@ -6491,6 +6499,7 @@ const struct ofproto_class ofproto_dpif_class = {
      set_cfm,
      get_cfm_fault,
      get_cfm_remote_mpids,
+    get_cfm_health,
      set_stp,
      get_stp_status,
      set_stp_port,
diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h

index 26904ef021a1adeb7d9bf551536de1317c87e152..7b0e478bf983194bca3acc1c31da3d7feb2164e7 100644 (file)
--- a/ofproto/ofproto-provider.h
+++ b/ofproto/ofproto-provider.h
@@ -980,6 +980,17 @@ struct ofproto_class {
      int (*get_cfm_remote_mpids)(const struct ofport *ofport,
                                  const uint64_t **rmps, size_t *n_rmps);
  
+    /* Checks the health of CFM configured on 'ofport'.  Returns an integer
+     * between 0 and 100 giving the percentage of expected healthy CCM frames
+     * that 'ofport' received from its remote maintenance point, where 0 means
+     * that the 'ofport' is very unhealthy and 100 means the 'ofport' is
+     * perfectly healthy.  Returns -1 if CFM is not enabled on 'ofport' or if
+     * the number of remote_mpids is > 1.
+     *
+     * This function may be a null pointer if the ofproto implementation does
+     * not support CFM. */
+    int (*get_cfm_health)(const struct ofport *ofport);
+
      /* Configures spanning tree protocol (STP) on 'ofproto' using the
       * settings defined in 's'.
       *
diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c

index e7e040126eb12c13259b78e2d13e66e7f43673be..f9343069b9126860463c91ddcd0bf69898884b32 100644 (file)
--- a/ofproto/ofproto.c
+++ b/ofproto/ofproto.c
@@ -2481,6 +2481,19 @@ ofproto_port_get_cfm_remote_mpids(const struct ofproto *ofproto,
              : -1);
  }
  
+/* Checks the health of the CFM for 'ofp_port' within 'ofproto'.  Returns an
+ * integer value between 0 and 100 to indicate the health of the port as a
+ * percentage, or -1 if 'ofp_port' does not exist, CFM is not enabled on it,
+ * or its health is undefined (e.g. it has more than one remote_mpid). */
+int
+ofproto_port_get_cfm_health(const struct ofproto *ofproto, uint16_t ofp_port)
+{
+    struct ofport *ofport = ofproto_get_port(ofproto, ofp_port);
+    return (ofport && ofproto->ofproto_class->get_cfm_health
+            ? ofproto->ofproto_class->get_cfm_health(ofport)
+            : -1);
+}
+
  static enum ofperr
  handle_aggregate_stats_request(struct ofconn *ofconn,
                                 const struct ofp_stats_msg *osm)
diff --git a/ofproto/ofproto.h b/ofproto/ofproto.h

index 6172f291f22f542e3685eb00340597ed307c98e0..c40f5d3a419cf80c5def70735f72a068c88c1bf9 100644 (file)
--- a/ofproto/ofproto.h
+++ b/ofproto/ofproto.h
@@ -348,7 +348,8 @@ int ofproto_port_get_cfm_fault(const struct ofproto *, uint16_t ofp_port);
  int ofproto_port_get_cfm_remote_mpids(const struct ofproto *,
                                        uint16_t ofp_port, const uint64_t **rmps,
                                        size_t *n_rmps);
-
+int ofproto_port_get_cfm_health(const struct ofproto *ofproto,
+                                uint16_t ofp_port);
  void ofproto_get_ofproto_controller_info(const struct ofproto *, struct shash *);
  void ofproto_free_ofproto_controller_info(struct shash *);
  \f
diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c

index adc3b47cc9b524e087d5ddf755adc7476e9ca864..35d8723354ad2d9e3e321b6d9e4dde7653d08398 100644 (file)
--- a/vswitchd/bridge.c
+++ b/vswitchd/bridge.c
@@ -279,6 +279,7 @@ bridge_init(const char *remote)
      ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_cfm_fault);
      ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_cfm_fault_status);
      ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_cfm_remote_mpids);
+    ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_cfm_health);
      ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_lacp_current);
      ovsdb_idl_omit(idl, &ovsrec_interface_col_external_ids);
  
@@ -1547,6 +1548,7 @@ iface_refresh_cfm_stats(struct iface *iface)
      int fault, error;
      const uint64_t *rmps;
      size_t n_rmps;
+    int health;
  
      if (iface_is_synthetic(iface)) {
          return;
@@ -1582,6 +1584,15 @@ iface_refresh_cfm_stats(struct iface *iface)
      } else {
          ovsrec_interface_set_cfm_remote_mpids(cfg, NULL, 0);
      }
+
+    health = ofproto_port_get_cfm_health(iface->port->bridge->ofproto,
+                                        iface->ofp_port);
+    if (health >= 0) {
+        int64_t cfm_health = health;
+        ovsrec_interface_set_cfm_health(cfg, &cfm_health, 1);
+    } else {
+        ovsrec_interface_set_cfm_health(cfg, NULL, 0);
+    }
  }
  
  static void
diff --git a/vswitchd/vswitch.ovsschema b/vswitchd/vswitch.ovsschema

index a3847e777ba1e2ff34dc66f3be7eba26ec19badb..c7e1ac9ef4e619f65b682db5c16434be079b0e32 100644 (file)
--- a/vswitchd/vswitch.ovsschema
+++ b/vswitchd/vswitch.ovsschema
@@ -1,6 +1,6 @@
  {"name": "Open_vSwitch",
- "version": "6.8.0",
- "cksum": "4106006492 16485",
+ "version": "6.9.0",
+ "cksum": "617116616 16682",
   "tables": {
     "Open_vSwitch": {
       "columns": {
@@ -197,6 +197,11 @@
           "ephemeral": true},
         "cfm_fault_status": {
           "type": {"key": "string", "min": 0, "max": "unlimited"}},
+       "cfm_health": {
+         "type": {"key": {"type": "integer",
+                          "minInteger": 0,
+                          "maxInteger": 100},
+                  "min": 0, "max": 1}},
         "lacp_current": {
           "type": {"key": {"type": "boolean"},
                    "min": 0, "max": 1},
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml

index f3ea33871ba2bf456c90e0b8fac33951b7677e6e..03c8539d0a448e31a5594f6173c00ae6f514b278 100644 (file)
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -1726,6 +1726,27 @@
          an <code>ovs-appctl</code> command.
        </column>
  
+      <column name="cfm_health">
+        <p>
+          Indicates the health of the interface as a percentage of CCM frames
+          received over 21 <ref column="other_config" key="cfm_interval"/>s.
+          The health of an interface is undefined if it is communicating with
+          more than one <ref column="cfm_remote_mpids"/>.  It decreases if
+          healthy heartbeats are not received at the expected rate, and
+          gradually improves as healthy heartbeats are received at the desired
+          rate.  Every 21 <ref column="other_config" key="cfm_interval"/>s, the
+          health of the interface is refreshed.
+        </p>
+        <p>
+          As mentioned above, faults can be triggered for several reasons.
+          The link health will deteriorate even if heartbeats are received,
+          when those heartbeats are unhealthy.  An unhealthy heartbeat in this
+          context is one for which some fault is set or which arrives out of
+          sequence.  The interface health can be 100 only on receiving
+          healthy heartbeats at the desired rate.
+        </p>
+      </column>
+
        <column name="cfm_remote_mpids">
          When CFM is properly configured, Open vSwitch will occasionally
          receive CCM broadcasts.  These broadcasts contain the MPID of the
author	Mehak Mahajan <mmahajan@nicira.com>
	Thu, 5 Apr 2012 21:30:23 +0000 (14:30 -0700)
committer	Mehak Mahajan <mmahajan@nicira.com>
	Thu, 5 Apr 2012 21:41:37 +0000 (14:41 -0700)
NEWS		patch \| blob \| history
lib/cfm.c		patch \| blob \| history
lib/cfm.h		patch \| blob \| history
ofproto/ofproto-dpif.c		patch \| blob \| history
ofproto/ofproto-provider.h		patch \| blob \| history
ofproto/ofproto.c		patch \| blob \| history
ofproto/ofproto.h		patch \| blob \| history
vswitchd/bridge.c		patch \| blob \| history
vswitchd/vswitch.ovsschema		patch \| blob \| history
vswitchd/vswitch.xml		patch \| blob \| history