From 3ec4ce83403f756f2c983ca2b5544cb1dcf9dfe0 Mon Sep 17 00:00:00 2001
From: John Darrington <john@darrington.wattle.id.au>
Date: Fri, 8 Jul 2011 16:32:30 +0200
Subject: [PATCH] QUICK CLUSTER: Implement pairwise missing option

---
 src/language/stats/quick-cluster.c    | 68 ++++++++++++++++++---------
 tests/language/stats/quick-cluster.at | 57 ++++++++++++++++++++++
 2 files changed, 103 insertions(+), 22 deletions(-)

diff --git a/src/language/stats/quick-cluster.c b/src/language/stats/quick-cluster.c
index c61be3b777..10572cee5e 100644
--- a/src/language/stats/quick-cluster.c
+++ b/src/language/stats/quick-cluster.c
@@ -184,18 +184,18 @@ static int
 kmeans_get_nearest_group (struct Kmeans *kmeans, struct ccase *c, const struct qc *qc)
 {
   int result = -1;
-  double x;
   int i, j;
-  double dist;
-  double mindist;
-  mindist = INFINITY;
+  double mindist = INFINITY;
   for (i = 0; i < qc->ngroups; i++)
     {
-      dist = 0;
+      double dist = 0;
       for (j = 0; j < qc->n_vars; j++)
 	{
-	  x = case_data (c, qc->vars[j])->f;
-	  dist += pow2 (gsl_matrix_get (kmeans->centers, i, j) - x);
+	  const union value *val = case_data (c, qc->vars[j]);
+	  if ( var_is_value_missing (qc->vars[j], val, qc->exclude))
+	    continue;
+
+	  dist += pow2 (gsl_matrix_get (kmeans->centers, i, j) - val->f);
 	}
       if (dist < mindist)
 	{
@@ -210,29 +210,28 @@ kmeans_get_nearest_group (struct Kmeans *kmeans, struct ccase *c, const struct q
 static void
 kmeans_recalculate_centers (struct Kmeans *kmeans, const struct casereader *reader, const struct qc *qc)
 {
-  casenumber i;
+  casenumber i = 0;
   int v, j;
-  double x, curval;
   struct ccase *c;
-  struct ccase *c_index;
-  struct casereader *cs;
-  struct casereader *cs_index;
-  int index;
 
-  i = 0;
-  cs = casereader_clone (reader);
-  cs_index = casereader_clone (kmeans->index_rdr);
+  struct casereader *cs = casereader_clone (reader);
+  struct casereader *cs_index = casereader_clone (kmeans->index_rdr);
 
   gsl_matrix_set_all (kmeans->centers, 0.0);
   for (; (c = casereader_read (cs)) != NULL; case_unref (c))
     {
       double weight = qc->wv ? case_data (c, qc->wv)->f : 1.0;
-      c_index = casereader_read (cs_index);
-      index = case_data_idx (c_index, 0)->f;
+      struct ccase *c_index = casereader_read (cs_index);
+      int index = case_data_idx (c_index, 0)->f;
       for (v = 0; v < qc->n_vars; ++v)
 	{
-	  x = case_data (c, qc->vars[v])->f * weight;
-	  curval = gsl_matrix_get (kmeans->centers, index, v);
+	  const union value *val = case_data (c, qc->vars[v]);
+	  double x = val->f * weight;
+
+	  if ( var_is_value_missing (qc->vars[v], val, qc->exclude))
+	    continue;
+
+	  double curval = gsl_matrix_get (kmeans->centers, index, v);
 	  gsl_matrix_set (kmeans->centers, index, v, curval + x);
 	}
       i++;
@@ -499,6 +498,7 @@ cmd_quick_cluster (struct lexer *lexer, struct dataset *ds)
   qc.ngroups = 2;
   qc.maxiter = 2;
   qc.missing_type = MISS_LISTWISE;
+  qc.exclude = MV_ANY;
 
   if (!parse_variables_const (lexer, dict, &qc.vars, &qc.n_vars,
 			      PV_NO_DUPLICATE | PV_NUMERIC))
@@ -506,9 +506,33 @@ cmd_quick_cluster (struct lexer *lexer, struct dataset *ds)
       return (CMD_FAILURE);
     }
 
-  if (lex_match (lexer, T_SLASH))
+  while (lex_token (lexer) != T_ENDCMD)
     {
-      if (lex_match_id (lexer, "CRITERIA"))
+      lex_match (lexer, T_SLASH);
+
+      if (lex_match_id (lexer, "MISSING"))
+	{
+	  lex_match (lexer, T_EQUALS);
+	  while (lex_token (lexer) != T_ENDCMD
+		 && lex_token (lexer) != T_SLASH)
+	    {
+	      if (lex_match_id (lexer, "LISTWISE") || lex_match_id (lexer, "DEFAULT"))
+		{
+		  qc.missing_type = MISS_LISTWISE;
+		}
+	      else if (lex_match_id (lexer, "PAIRWISE"))
+		{
+		  qc.missing_type = MISS_PAIRWISE;
+		}
+	      else if (lex_match_id (lexer, "INCLUDE"))
+		{
+		  qc.exclude = MV_SYSTEM;
+		}
+	      else
+		goto error;
+	    }	  
+	}
+      else if (lex_match_id (lexer, "CRITERIA"))
 	{
 	  lex_match (lexer, T_EQUALS);
 	  while (lex_token (lexer) != T_ENDCMD
diff --git a/tests/language/stats/quick-cluster.at b/tests/language/stats/quick-cluster.at
index 05688e8e2e..9e3e87c899 100644
--- a/tests/language/stats/quick-cluster.at
+++ b/tests/language/stats/quick-cluster.at
@@ -151,5 +151,62 @@ AT_CHECK([pspp -o pspp-nm.csv quick-nmiss.sps])
 
 AT_CHECK([diff pspp-m.csv pspp-nm.csv], [0])
 
+AT_CLEANUP
+
+
+AT_SETUP([QUICK CLUSTER with pairwise missing])
+AT_DATA([quick-s.sps], [dnl
+data list notable list /x * y *.
+begin data.
+1   2
+1   2.2
+1.1 1.9
+1   9
+1   10
+1.3 9.5
+0.9 8.9
+3.5 2
+3.4 3
+3.5 2.5
+3.1 2.0
+3.9 2.5
+3.8 2.0
+end data.
+
+QUICK CLUSTER x y 
+	/CRITERIA = CLUSTER(3) MXITER (100)
+	.
+])
+
+AT_CHECK([pspp -O format=csv quick-s.sps | tail -5 > pspp-s.csv])
+
+AT_DATA([quick-pw.sps], [dnl
+data list notable list /x * y *.
+begin data.
+1   2
+1   2.2
+1.1 1.9
+1   9
+1   10
+1.3 9.5
+0.9 8.9
+3.5 2
+3.4 3
+3.5 2.5
+3.1 2.0
+3.9 .
+3.8 .
+end data.
+
+QUICK CLUSTER x y 
+	/CRITERIA = CLUSTER(3) MXITER (100)
+	/MISSING = PAIRWISE
+	.
+])
+
+AT_CHECK([pspp -O format=csv quick-pw.sps | tail -5 > pspp-pw.csv])
+
+AT_CHECK([diff pspp-s.csv pspp-pw.csv], [0])
+
 
 AT_CLEANUP
-- 
2.30.2