QUICK CLUSTER: New subcommand: /PRINT

author Alan Mead <amead2@alanmead.org>

Wed, 21 Oct 2015 09:30:36 +0000 (11:30 +0200)

committer John Darrington <john@darrington.wattle.id.au>

Wed, 21 Oct 2015 09:30:36 +0000 (11:30 +0200)
author Alan Mead <amead2@alanmead.org>
Wed, 21 Oct 2015 09:30:36 +0000 (11:30 +0200)
committer John Darrington <john@darrington.wattle.id.au>
Wed, 21 Oct 2015 09:30:36 +0000 (11:30 +0200)
diff --git a/AUTHORS b/AUTHORS

index 20998f08eb8b81e59569298509b83907c3404e58..97c179e894f5628abd7bc14cddfe4ae05cdb6425 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -14,7 +14,8 @@ revisions to other modules.
  including lib/gslextras and the linear regression features. Jason 
  is also an important contributor to GSL, which is used by PSPP. 
  
-* Mehmet Hakan Satman wrote the QUICK CLUSTER command.
+* Mehmet Hakan Satman wrote the QUICK CLUSTER command and Alan Mead
+  contributed improvements including the cluster membership subcommands.
  
  * Friedrich Beckmann wrote the GRAPH command.
  
diff --git a/NEWS b/NEWS

index b1031e7edbdc388ced578f709228dc1e47b927f5..0cd87cb23fd907379e27e3e30650a74b1320fc63 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -6,6 +6,10 @@ Please send PSPP bug reports to bug-gnu-pspp@gnu.org.
   
  Changes since 0.8.5:
  
+ * The QUICK CLUSTER command has a new /PRINT subcommand which shows
+   the initial cluster centres and the final cluster membership of
+   each case.
+
   * A Russian localisation has been contributed.
  
   * The graphical user interface uses Gtk+ version 3 instead of version 2.
diff --git a/doc/statistics.texi b/doc/statistics.texi

index 60d665311c491c8263185bb83b38f741c114b0b1..ac3f0b5c06f64e97df1f27417a0ccb67c4ff6881 100644 (file)
--- a/doc/statistics.texi
+++ b/doc/statistics.texi
@@ -1649,6 +1649,7 @@ The default is 0.05.
  QUICK CLUSTER @var{var_list}
        [/CRITERIA=CLUSTERS(@var{k}) [MXITER(@var{max_iter})]]
        [/MISSING=@{EXCLUDE,INCLUDE@} @{LISTWISE, PAIRWISE@}]
+      [/PRINT=@{INITIAL@} @{CLUSTER@}]
  @end display
  
  The @cmd{QUICK CLUSTER} command performs k-means clustering on the
@@ -1677,6 +1678,12 @@ clustering variables contain missing values.  Otherwise it is clustered
  on the basis of the non-missing values.
  The default is @subcmd{LISTWISE}.
  
+The @subcmd{PRINT} subcommand requests additional output to be printed.
+If @subcmd{INITIAL} is set, then the initial cluster centers will
+be printed.
+If @subcmd{CLUSTER} is set, the cluster memberships of the individual
+cases will be displayed (potentially generating lengthy output).
+
  
  @node RANK
  @section RANK
diff --git a/src/language/stats/quick-cluster.c b/src/language/stats/quick-cluster.c

index 0c871c8bf2d54849e19d4bbbd0c262b371ff666a..56c95c3fbd4ce95e913359f196f626ce87e00f55 100644 (file)
--- a/src/language/stats/quick-cluster.c
+++ b/src/language/stats/quick-cluster.c
@@ -60,6 +60,8 @@ struct qc
  
    int ngroups;                 /* Number of group. (Given by the user) */
    int maxiter;                 /* Maximum iterations (Given by the user) */
+  int print_cluster_membership; /* true => print membership */
+  int print_initial_clusters;   /* true => print initial cluster */
  
    const struct variable *wv;   /* Weighting variable. */
  
@@ -89,7 +91,7 @@ struct Kmeans
  
  static struct Kmeans *kmeans_create (const struct qc *qc);
  
-static void kmeans_randomize_centers (struct Kmeans *kmeans, const struct qc *qc);
+static void kmeans_randomize_centers (struct Kmeans *kmeans, const struct casereader *reader, const struct qc *qc);
  
  static int kmeans_get_nearest_group (struct Kmeans *kmeans, struct ccase *c, const struct qc *);
  
@@ -104,9 +106,11 @@ static void kmeans_cluster (struct Kmeans *kmeans, struct casereader *reader, co
  
  static void quick_cluster_show_centers (struct Kmeans *kmeans, bool initial, const struct qc *);
  
+static void quick_cluster_show_membership (struct Kmeans *kmeans, const struct casereader *reader, const struct qc *);
+
  static void quick_cluster_show_number_cases (struct Kmeans *kmeans, const struct qc *);
  
-static void quick_cluster_show_results (struct Kmeans *kmeans, const struct qc *);
+static void quick_cluster_show_results (struct Kmeans *kmeans, const struct casereader *reader, const struct qc *);
  
  int cmd_quick_cluster (struct lexer *lexer, struct dataset *ds);
  
@@ -152,7 +156,7 @@ kmeans_destroy (struct Kmeans *kmeans)
  
  /* Creates random centers using randomly selected cases from the data. */
  static void
-kmeans_randomize_centers (struct Kmeans *kmeans, const struct qc *qc)
+kmeans_randomize_centers (struct Kmeans *kmeans, const struct casereader *reader, const struct qc *qc)
  {
    int i, j;
    for (i = 0; i < qc->ngroups; i++)
@@ -346,11 +350,12 @@ kmeans_cluster (struct Kmeans *kmeans, struct casereader *reader, const struct q
    bool redo;
    int diffs;
    bool show_warning1;
+  int redo_count = 0;
  
    show_warning1 = true;
  cluster:
    redo = false;
-  kmeans_randomize_centers (kmeans, qc);
+  kmeans_randomize_centers (kmeans, reader, qc);
    for (kmeans->lastiter = 0; kmeans->lastiter < qc->maxiter;
         kmeans->lastiter++)
      {
@@ -377,8 +382,13 @@ cluster:
           break;
         }
      }
+
    if (redo)
-    goto cluster;
+    {
+      redo_count++;
+      assert (redo_count < 10);
+      goto cluster;
+    }
  
  }
  
@@ -446,6 +456,38 @@ quick_cluster_show_centers (struct Kmeans *kmeans, bool initial, const struct qc
    tab_submit (t);
  }
  
+/* Submits a two-column "Cluster Membership" table: for each case read from a clone of READER, its 1-based case number and its 1-based cluster number. */
+static void
+quick_cluster_show_membership (struct Kmeans *kmeans, const struct casereader *reader, const struct qc *qc)
+{
+  struct tab_table *t;
+  int nc, nr;
+  int i, clust; /* I: zero-based index of the case being read; CLUST: its cluster index. */
+  struct ccase *c;
+  struct casereader *cs = casereader_clone (reader); /* Iterate over a clone of READER; the clone is destroyed at the end. */
+  nc = 2; /* Columns: case number and cluster. */
+  nr = kmeans->n + 1; /* One heading row (row 0) plus kmeans->n data rows. */
+  t = tab_create (nc, nr);
+  tab_headers (t, 0, nc - 1, 0, 0);
+  tab_title (t, _("Cluster Membership"));
+  tab_text (t, 0, 0, TAB_CENTER, _("Case Number"));
+  tab_text (t, 1, 0, TAB_CENTER, _("Cluster"));
+  tab_box (t, TAL_2, TAL_2, TAL_0, TAL_1, 0, 0, nc - 1, nr - 1);
+
+  for (i = 0; (c = casereader_read (cs)) != NULL; i++, case_unref (c)) /* Each case is unreferenced as the loop advances. */
+    {
+      assert (i < kmeans->n); /* Guards against writing past the NR rows created above. */
+      clust = kmeans_get_nearest_group (kmeans, c, qc); /* Group index returned by kmeans_get_nearest_group for this case. */
+      clust = kmeans->group_order->data[clust]; /* Map through group_order; presumably the ordering set up by kmeans_order_groups -- TODO confirm. */
+      tab_text_format (t, 0, i+1, TAB_CENTER, "%d", (i + 1)); /* Row i+1 skips the heading row; case numbers are shown 1-based. */
+      tab_text_format (t, 1, i+1, TAB_CENTER, "%d", (clust + 1)); /* Cluster numbers are shown 1-based as well. */
+    }
+  assert (i == kmeans->n); /* The clone must have yielded exactly kmeans->n cases. */
+  tab_submit (t);
+  casereader_destroy (cs);
+}
+
+
  /* Reports number of cases of each single cluster. */
  static void
  quick_cluster_show_number_cases (struct Kmeans *kmeans, const struct qc *qc)
@@ -479,13 +521,15 @@ quick_cluster_show_number_cases (struct Kmeans *kmeans, const struct qc *qc)
  
  /* Reports. */
  static void
-quick_cluster_show_results (struct Kmeans *kmeans, const struct qc *qc)
+quick_cluster_show_results (struct Kmeans *kmeans, const struct casereader *reader, const struct qc *qc)
  {
-  kmeans_order_groups (kmeans, qc);
-  /* Uncomment the line below for reporting initial centers. */
-  /* quick_cluster_show_centers (kmeans, true); */
+  kmeans_order_groups (kmeans, qc); /* what does this do? */
+  if( qc->print_initial_clusters )
+    quick_cluster_show_centers (kmeans, true, qc);
    quick_cluster_show_centers (kmeans, false, qc);
    quick_cluster_show_number_cases (kmeans, qc);
+  if( qc->print_cluster_membership )
+     quick_cluster_show_membership(kmeans, reader, qc);
  }
  
  int
@@ -499,6 +543,8 @@ cmd_quick_cluster (struct lexer *lexer, struct dataset *ds)
    qc.maxiter = 2;
    qc.missing_type = MISS_LISTWISE;
    qc.exclude = MV_ANY;
+  qc.print_cluster_membership = false; /* default = do not output case cluster membership */
+  qc.print_initial_clusters = false;   /* default = do not print initial clusters */
  
    if (!parse_variables_const (lexer, dict, &qc.vars, &qc.n_vars,
                               PV_NO_DUPLICATE | PV_NUMERIC))
@@ -536,6 +582,20 @@ cmd_quick_cluster (struct lexer *lexer, struct dataset *ds)
                 goto error;
             }     
         }
+      else if (lex_match_id (lexer, "PRINT"))
+       {
+         lex_match (lexer, T_EQUALS);
+         while (lex_token (lexer) != T_ENDCMD
+                && lex_token (lexer) != T_SLASH)
+           {
+             if (lex_match_id (lexer, "CLUSTER"))
+                qc.print_cluster_membership = true;
+             else if (lex_match_id (lexer, "INITIAL"))
+               qc.print_initial_clusters = true;
+             else
+                goto error;
+           }
+       }
        else if (lex_match_id (lexer, "CRITERIA"))
         {
           lex_match (lexer, T_EQUALS);
@@ -600,7 +660,7 @@ cmd_quick_cluster (struct lexer *lexer, struct dataset *ds)
  
         kmeans = kmeans_create (&qc);
         kmeans_cluster (kmeans, group, &qc);
-       quick_cluster_show_results (kmeans, &qc);
+       quick_cluster_show_results (kmeans, group, &qc);
         kmeans_destroy (kmeans);
         casereader_destroy (group);
        }
diff --git a/tests/language/stats/quick-cluster.at b/tests/language/stats/quick-cluster.at

index b3025651dcc6000da790595a60a82df79a73e265..75dd52cb19f89a0a584ab3281bc9c24334ae7953 100644 (file)
--- a/tests/language/stats/quick-cluster.at
+++ b/tests/language/stats/quick-cluster.at
@@ -244,3 +244,159 @@ AT_CHECK([pspp -O format=csv quick-cluster.sps], [1], [dnl
  quick-cluster.sps:7.20-7.30: error: QUICK CLUSTER: Syntax error at `UNSUPPORTED'.
  ])
  AT_CLEANUP
+
+
+
+AT_SETUP([QUICK CLUSTER /PRINT subcommand])
+AT_DATA([quick-cluster.sps], [dnl
+data list notable list /cluster (A8) x (F) y (F).
+begin data.
+A 10.45 9.38
+A 10.67 9.17
+A 10.86 9.63
+A 8.77 8.45
+A 8.04 11.77
+A 10.34 9.83
+A 10.37 10.54
+A 11.49 8.18
+A 10.17 11.10
+A 11.37 9.16
+A 10.25 8.83
+A 8.69 9.92
+A 10.36 10.39
+A 10.89 10.51
+A 9.9 11.39
+A 11.1 10.91
+A 11.77 8.47
+A 9.5 10.46
+B -11.01 -9.21
+B -10.8 -11.76
+B -10.03 -10.29
+B -9.54 -9.17
+B -10.16 -9.82
+B -10.01 -8.63
+B -9.6 -10.22
+B -11.36 -10.93
+B -10.63 -10.97
+B -9.53 -10.78
+B -9.40 -10.26
+B -10.76 -9.76
+B -9.9 -10.11
+B -10.16 -9.75
+B -8.65 -11.31
+B -10.10 -10.90
+B -11.67 -9.89
+B -11.11 -9.23
+B -8.7 -8.43
+B -11.35 -8.68
+C -10.20 9.00
+C -10.12 9.92
+C -10.41 10.16
+C -9.86 10.12
+C -10.31 10.12
+C -9.57 10.16
+C -9.69 9.93
+C -9.14 10.84
+C -9.8 10.19
+C -9.97 10.22
+C -11.65 10.81
+C -9.80 11.39
+C -10.31 10.74
+C -10.26 10.38
+C -11.57 10.02
+C -10.50 9.75
+C -9.06 9.63
+C -10.17 10.82
+C -10.22 9.99
+end data.
+
+QUICK CLUSTER x y
+  /CRITERIA=CLUSTERS(3)
+  /PRINT=INITIAL CLUSTER.
+])
+
+AT_CHECK([pspp -O format=csv quick-cluster.sps], [0], [dnl
+Table: Initial Cluster Centers
+,Cluster,,
+,,,
+,1,2,3
+,,,
+x,0,0,1
+y,0,1,0
+
+Table: Final Cluster Centers
+,Cluster,,
+,,,
+,1,2,3
+,,,
+x,-10,-10,10
+y,-10,10,10
+
+Table: Number of Cases in each Cluster
+Cluster,1,20
+,2,19
+,3,18
+Valid,,57
+
+Table: Cluster Membership
+Case Number,Cluster
+1,3
+2,3
+3,3
+4,3
+5,3
+6,3
+7,3
+8,3
+9,3
+10,3
+11,3
+12,3
+13,3
+14,3
+15,3
+16,3
+17,3
+18,3
+19,1
+20,1
+21,1
+22,1
+23,1
+24,1
+25,1
+26,1
+27,1
+28,1
+29,1
+30,1
+31,1
+32,1
+33,1
+34,1
+35,1
+36,1
+37,1
+38,1
+39,2
+40,2
+41,2
+42,2
+43,2
+44,2
+45,2
+46,2
+47,2
+48,2
+49,2
+50,2
+51,2
+52,2
+53,2
+54,2
+55,2
+56,2
+57,2
+])
+
+AT_CLEANUP
author	Alan Mead <amead2@alanmead.org>
	Wed, 21 Oct 2015 09:30:36 +0000 (11:30 +0200)
committer	John Darrington <john@darrington.wattle.id.au>
	Wed, 21 Oct 2015 09:30:36 +0000 (11:30 +0200)
AUTHORS		patch \| blob \| history
NEWS		patch \| blob \| history
doc/statistics.texi		patch \| blob \| history
src/language/stats/quick-cluster.c		patch \| blob \| history
tests/language/stats/quick-cluster.at		patch \| blob \| history