X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Fquick-cluster.c;h=68b50123144e2b0b10c19d494d2e4b02b32ac430;hb=refs%2Fheads%2Fcenter-titles;hp=9adcc64243a70c50539909304f758d09488a4020;hpb=4da69790b2ee4d597b0f04a9a0a13d4dfa9fbe9a;p=pspp

diff --git a/src/language/stats/quick-cluster.c b/src/language/stats/quick-cluster.c
index 9adcc64243..68b5012314 100644
--- a/src/language/stats/quick-cluster.c
+++ b/src/language/stats/quick-cluster.c
@@ -1,5 +1,5 @@
 /* PSPP - a program for statistical analysis.
-   Copyright (C) 2011 Free Software Foundation, Inc.
+   Copyright (C) 2011, 2012 Free Software Foundation, Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -46,6 +46,13 @@
 #define _(msgid) gettext (msgid)
 #define N_(msgid) msgid
 
+enum missing_type  /* Case-handling mode chosen on the MISSING subcommand. */
+  {
+    MISS_LISTWISE,  /* Group reader is wrapped with casereader_create_filter_missing over the analysis variables. */
+    MISS_PAIRWISE,  /* No reader filter is installed; missing values are skipped per variable in the distance and center sums. */
+  };
+
+
 struct qc
 {
   const struct variable **vars;
@@ -55,6 +62,9 @@ struct qc
   int maxiter;			/* Maximum iterations (Given by the user) */
 
   const struct variable *wv;	/* Weighting variable. */
+
+  enum missing_type missing_type;
+  enum mv_class exclude;
 };
 
 /* Holds all of the information for the functions.  int n, holds the number of
@@ -174,18 +184,18 @@ static int
 kmeans_get_nearest_group (struct Kmeans *kmeans, struct ccase *c, const struct qc *qc)
 {
   int result = -1;
-  double x;
   int i, j;
-  double dist;
-  double mindist;
-  mindist = INFINITY;
+  double mindist = INFINITY;
   for (i = 0; i < qc->ngroups; i++)
     {
-      dist = 0;
+      double dist = 0;
       for (j = 0; j < qc->n_vars; j++)
 	{
-	  x = case_data (c, qc->vars[j])->f;
-	  dist += pow2 (gsl_matrix_get (kmeans->centers, i, j) - x);
+	  const union value *val = case_data (c, qc->vars[j]);
+	  if ( var_is_value_missing (qc->vars[j], val, qc->exclude))
+	    continue;
+
+	  dist += pow2 (gsl_matrix_get (kmeans->centers, i, j) - val->f);
 	}
       if (dist < mindist)
 	{
@@ -200,28 +210,28 @@ kmeans_get_nearest_group (struct Kmeans *kmeans, struct ccase *c, const struct q
 static void
 kmeans_recalculate_centers (struct Kmeans *kmeans, const struct casereader *reader, const struct qc *qc)
 {
-  casenumber i;
+  casenumber i = 0;
   int v, j;
-  double x, curval;
   struct ccase *c;
-  struct ccase *c_index;
-  struct casereader *cs;
-  struct casereader *cs_index;
-  int index;
 
-  i = 0;
-  cs = casereader_clone (reader);
-  cs_index = casereader_clone (kmeans->index_rdr);
+  struct casereader *cs = casereader_clone (reader);
+  struct casereader *cs_index = casereader_clone (kmeans->index_rdr);
 
   gsl_matrix_set_all (kmeans->centers, 0.0);
   for (; (c = casereader_read (cs)) != NULL; case_unref (c))
     {
       double weight = qc->wv ? case_data (c, qc->wv)->f : 1.0;
-      c_index = casereader_read (cs_index);
-      index = case_data_idx (c_index, 0)->f;
+      struct ccase *c_index = casereader_read (cs_index);
+      int index = case_data_idx (c_index, 0)->f;
       for (v = 0; v < qc->n_vars; ++v)
 	{
-	  x = case_data (c, qc->vars[v])->f * weight;
+	  const union value *val = case_data (c, qc->vars[v]);
+	  double x = val->f * weight;
+	  double curval;
+
+	  if ( var_is_value_missing (qc->vars[v], val, qc->exclude))
+	    continue;
+
 	  curval = gsl_matrix_get (kmeans->centers, index, v);
 	  gsl_matrix_set (kmeans->centers, index, v, curval + x);
 	}
@@ -279,6 +289,7 @@ kmeans_calculate_indexes_and_check_convergence (struct Kmeans *kmeans, const str
       struct ccase *index_case_new = case_create (kmeans->proto);
       int bestindex = kmeans_get_nearest_group (kmeans, c, qc);
       double weight = qc->wv ? case_data (c, qc->wv)->f : 1.0;
+      assert (bestindex < kmeans->num_elements_groups->size);
       kmeans->num_elements_groups->data[bestindex] += weight;
       if (kmeans->index_rdr)
 	{
@@ -379,11 +390,10 @@ static void
 quick_cluster_show_centers (struct Kmeans *kmeans, bool initial, const struct qc *qc)
 {
   struct tab_table *t;
-  int nc, nr, heading_columns, currow;
+  int nc, nr, currow;
   int i, j;
   nc = qc->ngroups + 1;
   nr = qc->n_vars + 4;
-  heading_columns = 1;
   t = tab_create (nc, nr);
   tab_headers (t, 0, nc - 1, 0, 1);
   currow = 0;
@@ -422,14 +432,14 @@ quick_cluster_show_centers (struct Kmeans *kmeans, bool initial, const struct qc
 	      tab_double (t, i + 1, j + 4, TAB_CENTER,
 			  gsl_matrix_get (kmeans->centers,
 					  kmeans->group_order->data[i], j),
-			  var_get_print_format (qc->vars[j]));
+			  var_get_print_format (qc->vars[j]), RC_OTHER);
 	    }
 	  else
 	    {
 	      tab_double (t, i + 1, j + 4, TAB_CENTER,
 			  gsl_matrix_get (kmeans->initial_centers,
 					  kmeans->group_order->data[i], j),
-			  var_get_print_format (qc->vars[j]));
+			  var_get_print_format (qc->vars[j]), RC_OTHER);
 	    }
 	}
     }
@@ -487,6 +497,8 @@ cmd_quick_cluster (struct lexer *lexer, struct dataset *ds)
   const struct dictionary *dict = dataset_dict (ds);
   qc.ngroups = 2;
   qc.maxiter = 2;
+  qc.missing_type = MISS_LISTWISE;
+  qc.exclude = MV_ANY;
 
   if (!parse_variables_const (lexer, dict, &qc.vars, &qc.n_vars,
 			      PV_NO_DUPLICATE | PV_NUMERIC))
@@ -494,9 +506,37 @@ cmd_quick_cluster (struct lexer *lexer, struct dataset *ds)
       return (CMD_FAILURE);
     }
 
-  if (lex_match (lexer, T_SLASH))
+  while (lex_token (lexer) != T_ENDCMD)
     {
-      if (lex_match_id (lexer, "CRITERIA"))
+      lex_match (lexer, T_SLASH);
+
+      if (lex_match_id (lexer, "MISSING"))
+	{
+	  lex_match (lexer, T_EQUALS);
+	  while (lex_token (lexer) != T_ENDCMD
+		 && lex_token (lexer) != T_SLASH)
+	    {
+	      if (lex_match_id (lexer, "LISTWISE") || lex_match_id (lexer, "DEFAULT"))
+		{
+		  qc.missing_type = MISS_LISTWISE;
+		}
+	      else if (lex_match_id (lexer, "PAIRWISE"))
+		{
+		  qc.missing_type = MISS_PAIRWISE;
+		}
+	      else if (lex_match_id (lexer, "INCLUDE"))
+		{
+		  qc.exclude = MV_SYSTEM;
+		}
+	      else if (lex_match_id (lexer, "EXCLUDE"))
+		{
+		  qc.exclude = MV_ANY;
+		}
+	      else
+		goto error;
+	    }	  
+	}
+      else if (lex_match_id (lexer, "CRITERIA"))
 	{
 	  lex_match (lexer, T_EQUALS);
 	  while (lex_token (lexer) != T_ENDCMD
@@ -508,6 +548,11 @@ cmd_quick_cluster (struct lexer *lexer, struct dataset *ds)
 		    {
 		      lex_force_int (lexer);
 		      qc.ngroups = lex_integer (lexer);
+		      if (qc.ngroups <= 0)
+			{
+			  lex_error (lexer, _("The number of clusters must be positive"));
+			  goto error;
+			}
 		      lex_get (lexer);
 		      lex_force_match (lexer, T_RPAREN);
 		    }
@@ -518,6 +563,11 @@ cmd_quick_cluster (struct lexer *lexer, struct dataset *ds)
 		    {
 		      lex_force_int (lexer);
 		      qc.maxiter = lex_integer (lexer);
+		      if (qc.maxiter <= 0)
+			{
+			  lex_error (lexer, _("The number of iterations must be positive"));
+			  goto error;
+			}
 		      lex_get (lexer);
 		      lex_force_match (lexer, T_RPAREN);
 		    }
@@ -536,6 +586,13 @@ cmd_quick_cluster (struct lexer *lexer, struct dataset *ds)
 
     while (casegrouper_get_next_group (grouper, &group))
       {
+	if ( qc.missing_type == MISS_LISTWISE )
+	  {
+	    group  = casereader_create_filter_missing (group, qc.vars, qc.n_vars,
+						     qc.exclude,
+						     NULL,  NULL);
+	  }
+
 	kmeans = kmeans_create (&qc);
 	kmeans_cluster (kmeans, group, &qc);
 	quick_cluster_show_results (kmeans, &qc);