From: Alan Mead Date: Wed, 21 Oct 2015 09:30:36 +0000 (+0200) Subject: QUICK CLUSTER: New subcommand: /PRINT X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?p=pspp;a=commitdiff_plain;h=56d6f17c81105cffb326be040430fefe53f95eea QUICK CLUSTER: New subcommand: /PRINT This subcommand can be used to show the initial cluster centres and the final cluster membership. Closes bug #41019 --- diff --git a/AUTHORS b/AUTHORS index 20998f08eb..97c179e894 100644 --- a/AUTHORS +++ b/AUTHORS @@ -14,7 +14,8 @@ revisions to other modules. including lib/gslextras and the linear regression features. Jason is also an important contributor to GSL, which is used by PSPP. -* Mehmet Hakan Satman wrote the QUICK CLUSTER command. +* Mehmet Hakan Satman wrote the QUICK CLUSTER command and Alan Mead + contributed improvements including the cluster membership subcommands. * Friedrich Beckmann wrote the GRAPH command. diff --git a/NEWS b/NEWS index b1031e7edb..0cd87cb23f 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,10 @@ Please send PSPP bug reports to bug-gnu-pspp@gnu.org. Changes since 0.8.5: + * The QUICK CLUSTER command has a /PRINT subcommand which shows + the initial cluster centres and the final cluster membership of + each case. + * A Russian localisation has been contributed. * The graphical user interface uses Gtk+ version 3 instead of version 2. diff --git a/doc/statistics.texi b/doc/statistics.texi index 60d665311c..ac3f0b5c06 100644 --- a/doc/statistics.texi +++ b/doc/statistics.texi @@ -1649,6 +1649,7 @@ The default is 0.05. QUICK CLUSTER @var{var_list} [/CRITERIA=CLUSTERS(@var{k}) [MXITER(@var{max_iter})]] [/MISSING=@{EXCLUDE,INCLUDE@} @{LISTWISE, PAIRWISE@}] + [/PRINT=@{INITIAL@} @{CLUSTER@}] @end display The @cmd{QUICK CLUSTER} command performs k-means clustering on the @@ -1677,6 +1678,12 @@ clustering variables contain missing values. Otherwise it is clustered on the basis of the non-missing values. The default is @subcmd{LISTWISE}. 
+The @subcmd{PRINT} subcommand requests additional output to be printed. +If @subcmd{INITIAL} is set, then the initial cluster centers will +be printed. +If @subcmd{CLUSTER} is set, the cluster memberships of the individual +cases will be displayed (potentially generating lengthy output). + @node RANK @section RANK diff --git a/src/language/stats/quick-cluster.c b/src/language/stats/quick-cluster.c index 0c871c8bf2..56c95c3fbd 100644 --- a/src/language/stats/quick-cluster.c +++ b/src/language/stats/quick-cluster.c @@ -60,6 +60,8 @@ struct qc int ngroups; /* Number of group. (Given by the user) */ int maxiter; /* Maximum iterations (Given by the user) */ + int print_cluster_membership; /* true => print membership */ + int print_initial_clusters; /* true => print initial cluster centers */ const struct variable *wv; /* Weighting variable. */ @@ -89,7 +91,7 @@ struct Kmeans static struct Kmeans *kmeans_create (const struct qc *qc); -static void kmeans_randomize_centers (struct Kmeans *kmeans, const struct qc *qc); +static void kmeans_randomize_centers (struct Kmeans *kmeans, const struct casereader *reader, const struct qc *qc); static int kmeans_get_nearest_group (struct Kmeans *kmeans, struct ccase *c, const struct qc *); @@ -104,9 +106,11 @@ static void kmeans_cluster (struct Kmeans *kmeans, struct casereader *reader, co static void quick_cluster_show_centers (struct Kmeans *kmeans, bool initial, const struct qc *); +static void quick_cluster_show_membership (struct Kmeans *kmeans, const struct casereader *reader, const struct qc *); + static void quick_cluster_show_number_cases (struct Kmeans *kmeans, const struct qc *); -static void quick_cluster_show_results (struct Kmeans *kmeans, const struct qc *); +static void quick_cluster_show_results (struct Kmeans *kmeans, const struct casereader *reader, const struct qc *); int cmd_quick_cluster (struct lexer *lexer, struct dataset *ds); @@ -152,7 +156,7 @@ kmeans_destroy (struct Kmeans *kmeans) /* Creates random centers 
using randomly selected cases from the data. */ static void -kmeans_randomize_centers (struct Kmeans *kmeans, const struct qc *qc) +kmeans_randomize_centers (struct Kmeans *kmeans, const struct casereader *reader, const struct qc *qc) { int i, j; for (i = 0; i < qc->ngroups; i++) @@ -346,11 +350,12 @@ kmeans_cluster (struct Kmeans *kmeans, struct casereader *reader, const struct q bool redo; int diffs; bool show_warning1; + int redo_count = 0; /* Number of times clustering has been restarted. */ show_warning1 = true; cluster: redo = false; - kmeans_randomize_centers (kmeans, qc); + kmeans_randomize_centers (kmeans, reader, qc); for (kmeans->lastiter = 0; kmeans->lastiter < qc->maxiter; kmeans->lastiter++) { @@ -377,8 +382,13 @@ cluster: break; } } + if (redo) - goto cluster; + { + redo_count++; + assert (redo_count < 10); /* Trips on the tenth restart, bounding the number of retries (only while assertions are enabled). */ + goto cluster; + } } @@ -446,6 +456,38 @@ quick_cluster_show_centers (struct Kmeans *kmeans, bool initial, const struct qc tab_submit (t); } +/* Reports cluster membership for each case. Reads every case through a clone of READER (the clone is destroyed before returning), looks up its group with kmeans_get_nearest_group, and submits a two-column table of 1-based case numbers and 1-based cluster numbers. Asserts that exactly kmeans->n cases were read. */ +static void +quick_cluster_show_membership (struct Kmeans *kmeans, const struct casereader *reader, const struct qc *qc) +{ + struct tab_table *t; + int nc, nr; + int i, clust; + struct ccase *c; + struct casereader *cs = casereader_clone (reader); + nc = 2; + nr = kmeans->n + 1; /* One header row plus one row per case. */ + t = tab_create (nc, nr); + tab_headers (t, 0, nc - 1, 0, 0); + tab_title (t, _("Cluster Membership")); + tab_text (t, 0, 0, TAB_CENTER, _("Case Number")); + tab_text (t, 1, 0, TAB_CENTER, _("Cluster")); + tab_box (t, TAL_2, TAL_2, TAL_0, TAL_1, 0, 0, nc - 1, nr - 1); + + for (i = 0; (c = casereader_read (cs)) != NULL; i++, case_unref (c)) + { + assert (i < kmeans->n); + clust = kmeans_get_nearest_group (kmeans, c, qc); + clust = kmeans->group_order->data[clust]; /* Renumber through group_order (presumably set up by kmeans_order_groups -- confirm). */ + tab_text_format (t, 0, i+1, TAB_CENTER, "%d", (i + 1)); + tab_text_format (t, 1, i+1, TAB_CENTER, "%d", (clust + 1)); + } + assert (i == kmeans->n); + tab_submit (t); + casereader_destroy (cs); +} + + /* Reports number of cases of each single cluster. 
*/ static void quick_cluster_show_number_cases (struct Kmeans *kmeans, const struct qc *qc) @@ -479,13 +521,15 @@ quick_cluster_show_number_cases (struct Kmeans *kmeans, const struct qc *qc) /* Reports. */ static void -quick_cluster_show_results (struct Kmeans *kmeans, const struct qc *qc) +quick_cluster_show_results (struct Kmeans *kmeans, const struct casereader *reader, const struct qc *qc) { - kmeans_order_groups (kmeans, qc); - /* Uncomment the line below for reporting initial centers. */ - /* quick_cluster_show_centers (kmeans, true); */ + kmeans_order_groups (kmeans, qc); /* NOTE(review): presumably fills kmeans->group_order, which quick_cluster_show_membership reads to renumber clusters -- confirm. */ + if( qc->print_initial_clusters ) /* Set by the INITIAL keyword of the PRINT subcommand. */ + quick_cluster_show_centers (kmeans, true, qc); quick_cluster_show_centers (kmeans, false, qc); quick_cluster_show_number_cases (kmeans, qc); + if( qc->print_cluster_membership ) /* Set by the CLUSTER keyword of the PRINT subcommand. */ + quick_cluster_show_membership(kmeans, reader, qc); } int @@ -499,6 +543,8 @@ cmd_quick_cluster (struct lexer *lexer, struct dataset *ds) qc.maxiter = 2; qc.missing_type = MISS_LISTWISE; qc.exclude = MV_ANY; + qc.print_cluster_membership = false; /* default = do not output case cluster membership */ + qc.print_initial_clusters = false; /* default = do not print initial clusters */ if (!parse_variables_const (lexer, dict, &qc.vars, &qc.n_vars, PV_NO_DUPLICATE | PV_NUMERIC)) @@ -536,6 +582,20 @@ cmd_quick_cluster (struct lexer *lexer, struct dataset *ds) goto error; } } + else if (lex_match_id (lexer, "PRINT")) /* PRINT subcommand: accepts the keywords CLUSTER and INITIAL. */ + { + lex_match (lexer, T_EQUALS); + while (lex_token (lexer) != T_ENDCMD + && lex_token (lexer) != T_SLASH) /* Keep consuming keywords until a slash or the end of the command. */ + { + if (lex_match_id (lexer, "CLUSTER")) + qc.print_cluster_membership = true; + else if (lex_match_id (lexer, "INITIAL")) + qc.print_initial_clusters = true; + else + goto error; /* NOTE(review): no lex_error call precedes this jump -- confirm the error label reports the unrecognized keyword. */ + } + } else if (lex_match_id (lexer, "CRITERIA")) { lex_match (lexer, T_EQUALS); @@ -600,7 +660,7 @@ cmd_quick_cluster (struct lexer *lexer, struct dataset *ds) kmeans = kmeans_create (&qc); kmeans_cluster (kmeans, group, &qc); - quick_cluster_show_results 
(kmeans, &qc); + quick_cluster_show_results (kmeans, group, &qc); kmeans_destroy (kmeans); casereader_destroy (group); } diff --git a/tests/language/stats/quick-cluster.at b/tests/language/stats/quick-cluster.at index b3025651dc..75dd52cb19 100644 --- a/tests/language/stats/quick-cluster.at +++ b/tests/language/stats/quick-cluster.at @@ -244,3 +244,159 @@ AT_CHECK([pspp -O format=csv quick-cluster.sps], [1], [dnl quick-cluster.sps:7.20-7.30: error: QUICK CLUSTER: Syntax error at `UNSUPPORTED'. ]) AT_CLEANUP + + + +AT_SETUP([QUICK CLUSTER /PRINT subcommand]) +AT_DATA([quick-cluster.sps], [dnl +data list notable list /cluster (A8) x (F) y (F). +begin data. +A 10.45 9.38 +A 10.67 9.17 +A 10.86 9.63 +A 8.77 8.45 +A 8.04 11.77 +A 10.34 9.83 +A 10.37 10.54 +A 11.49 8.18 +A 10.17 11.10 +A 11.37 9.16 +A 10.25 8.83 +A 8.69 9.92 +A 10.36 10.39 +A 10.89 10.51 +A 9.9 11.39 +A 11.1 10.91 +A 11.77 8.47 +A 9.5 10.46 +B -11.01 -9.21 +B -10.8 -11.76 +B -10.03 -10.29 +B -9.54 -9.17 +B -10.16 -9.82 +B -10.01 -8.63 +B -9.6 -10.22 +B -11.36 -10.93 +B -10.63 -10.97 +B -9.53 -10.78 +B -9.40 -10.26 +B -10.76 -9.76 +B -9.9 -10.11 +B -10.16 -9.75 +B -8.65 -11.31 +B -10.10 -10.90 +B -11.67 -9.89 +B -11.11 -9.23 +B -8.7 -8.43 +B -11.35 -8.68 +C -10.20 9.00 +C -10.12 9.92 +C -10.41 10.16 +C -9.86 10.12 +C -10.31 10.12 +C -9.57 10.16 +C -9.69 9.93 +C -9.14 10.84 +C -9.8 10.19 +C -9.97 10.22 +C -11.65 10.81 +C -9.80 11.39 +C -10.31 10.74 +C -10.26 10.38 +C -11.57 10.02 +C -10.50 9.75 +C -9.06 9.63 +C -10.17 10.82 +C -10.22 9.99 +end data. + +QUICK CLUSTER x y + /CRITERIA=CLUSTERS(3) + /PRINT=INITIAL CLUSTER. 
+]) + +AT_CHECK([pspp -O format=csv quick-cluster.sps], [0], [dnl +Table: Initial Cluster Centers +,Cluster,, +,,, +,1,2,3 +,,, +x,0,0,1 +y,0,1,0 + +Table: Final Cluster Centers +,Cluster,, +,,, +,1,2,3 +,,, +x,-10,-10,10 +y,-10,10,10 + +Table: Number of Cases in each Cluster +Cluster,1,20 +,2,19 +,3,18 +Valid,,57 + +Table: Cluster Membership +Case Number,Cluster +1,3 +2,3 +3,3 +4,3 +5,3 +6,3 +7,3 +8,3 +9,3 +10,3 +11,3 +12,3 +13,3 +14,3 +15,3 +16,3 +17,3 +18,3 +19,1 +20,1 +21,1 +22,1 +23,1 +24,1 +25,1 +26,1 +27,1 +28,1 +29,1 +30,1 +31,1 +32,1 +33,1 +34,1 +35,1 +36,1 +37,1 +38,1 +39,2 +40,2 +41,2 +42,2 +43,2 +44,2 +45,2 +46,2 +47,2 +48,2 +49,2 +50,2 +51,2 +52,2 +53,2 +54,2 +55,2 +56,2 +57,2 +]) + +AT_CLEANUP