cfm: Log more aggressively amidst packet loss.
authorEthan Jackson <ethan@nicira.com>
Tue, 17 Jan 2012 00:42:08 +0000 (16:42 -0800)
committerEthan Jackson <ethan@nicira.com>
Mon, 23 Jan 2012 23:14:43 +0000 (15:14 -0800)
While debugging some issues today it became clear that it would be
useful to log when the CFM fault status changes and when packets
are lost.  The CFM module logs pretty aggressively when in debug
mode, but this can be chatty and most systems don't operate under
this logging level for extended periods of time.  This patch logs
when CCMs are received which indicate reordering or packet loss and
when the CFM fault status changed.

Requested-by: Jacob Cherkas <jcherkas@nicira.com>
Signed-off-by: Ethan Jackson <ethan@nicira.com>
AUTHORS
lib/cfm.c

diff --git a/AUTHORS b/AUTHORS
index fbf83552b49592ddb04bb8c3351544632cd944e5..4479aebb5762cabc63fde7ed610485268e81f3ce 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -88,6 +88,7 @@ Gregor Schaffrath       grsch@net.t-labs.tu-berlin.de
 Hassan Khan             hassan.khan@seecs.edu.pk
 Hector Oron             hector.oron@gmail.com
 Henrik Amren            henrik@nicira.com
+Jacob Cherkas           jcherkas@nicira.com
 Jad Naous               jnaous@gmail.com
 Jamal Hadi Salim        hadi@cyberus.ca
 Jan Medved              jmedved@juniper.net
index d2995a58cd8840d6b0dcd7e215dbd7d2a2e09c44..1e05853d70ef0174f53a52d08d02ea80c26c9b69 100644 (file)
--- a/lib/cfm.c
+++ b/lib/cfm.c
@@ -117,9 +117,10 @@ struct remote_mp {
     bool rdi;            /* Remote Defect Indicator. Indicates remote_mp isn't
                             receiving CCMs that it's expecting to. */
     bool opup;           /* Operational State. */
+    uint32_t seq;        /* Most recently received sequence number. */
 };
 
-static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(20, 30);
 static struct hmap all_cfms = HMAP_INITIALIZER(&all_cfms);
 
 static unixctl_cb_func cfm_unixctl_show;
@@ -280,6 +281,7 @@ cfm_run(struct cfm *cfm)
     if (timer_expired(&cfm->fault_timer)) {
         long long int interval = cfm_fault_interval(cfm);
         struct remote_mp *rmp, *rmp_next;
+        bool old_cfm_fault = cfm->fault;
 
         cfm->fault = cfm->unexpected_recv;
         cfm->unexpected_recv = false;
@@ -324,6 +326,11 @@ cfm_run(struct cfm *cfm)
             cfm->fault = true;
         }
 
+        if (old_cfm_fault != cfm->fault) {
+            VLOG_INFO_RL(&rl, "%s: CFM fault status changed to %s",
+                         cfm->name, cfm->fault ? "true" : "false");
+        }
+
         timer_set_duration(&cfm->fault_timer, interval);
     }
 }
@@ -474,6 +481,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
 
         struct remote_mp *rmp;
         uint64_t ccm_mpid;
+        uint32_t ccm_seq;
         bool ccm_opdown;
 
         if (cfm->extended) {
@@ -483,6 +491,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
             ccm_mpid = ntohs(ccm->mpid);
             ccm_opdown = false;
         }
+        ccm_seq = ntohl(ccm->seq);
 
         if (ccm_interval != cfm->ccm_interval) {
             VLOG_WARN_RL(&rl, "%s: received a CCM with an invalid interval"
@@ -500,7 +509,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
         rmp = lookup_remote_mp(cfm, ccm_mpid);
         if (!rmp) {
             if (hmap_count(&cfm->remote_mps) < CFM_MAX_RMPS) {
-                rmp = xmalloc(sizeof *rmp);
+                rmp = xzalloc(sizeof *rmp);
                 hmap_insert(&cfm->remote_mps, &rmp->node, hash_mpid(ccm_mpid));
             } else {
                 cfm->unexpected_recv = true;
@@ -511,16 +520,24 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
             }
         }
 
+        VLOG_DBG("%s: received CCM (seq %"PRIu32") (mpid %"PRIu64")"
+                 " (interval %"PRIu8") (RDI %s)", cfm->name, ccm_seq,
+                 ccm_mpid, ccm_interval, ccm_rdi ? "true" : "false");
+
         if (rmp) {
+            if (rmp->seq && ccm_seq != (rmp->seq + 1)) {
+                VLOG_WARN_RL(&rl, "%s: (mpid %"PRIu64") detected sequence"
+                             " numbers which indicate possible connectivity"
+                             " problems (previous %"PRIu32") (current %"PRIu32
+                             ")", cfm->name, ccm_mpid, rmp->seq, ccm_seq);
+            }
+
             rmp->mpid = ccm_mpid;
             rmp->recv = true;
+            rmp->seq = ccm_seq;
             rmp->rdi = ccm_rdi;
             rmp->opup = !ccm_opdown;
         }
-
-        VLOG_DBG("%s: received CCM (seq %"PRIu32") (mpid %"PRIu64")"
-                 " (interval %"PRIu8") (RDI %s)", cfm->name, ntohl(ccm->seq),
-                 ccm_mpid, ccm_interval, ccm_rdi ? "true" : "false");
     }
 }