From 3ec4ce83403f756f2c983ca2b5544cb1dcf9dfe0 Mon Sep 17 00:00:00 2001 From: John Darrington Date: Fri, 8 Jul 2011 16:32:30 +0200 Subject: [PATCH] QUICK CLUSTER: Implement pairwise missing option --- src/language/stats/quick-cluster.c | 68 ++++++++++++++++++--------- tests/language/stats/quick-cluster.at | 57 ++++++++++++++++++++++ 2 files changed, 103 insertions(+), 22 deletions(-) diff --git a/src/language/stats/quick-cluster.c b/src/language/stats/quick-cluster.c index c61be3b7..10572cee 100644 --- a/src/language/stats/quick-cluster.c +++ b/src/language/stats/quick-cluster.c @@ -184,18 +184,18 @@ static int kmeans_get_nearest_group (struct Kmeans *kmeans, struct ccase *c, const struct qc *qc) { int result = -1; - double x; int i, j; - double dist; - double mindist; - mindist = INFINITY; + double mindist = INFINITY; for (i = 0; i < qc->ngroups; i++) { - dist = 0; + double dist = 0; for (j = 0; j < qc->n_vars; j++) { - x = case_data (c, qc->vars[j])->f; - dist += pow2 (gsl_matrix_get (kmeans->centers, i, j) - x); + const union value *val = case_data (c, qc->vars[j]); + if ( var_is_value_missing (qc->vars[j], val, qc->exclude)) + continue; + + dist += pow2 (gsl_matrix_get (kmeans->centers, i, j) - val->f); } if (dist < mindist) { @@ -210,29 +210,28 @@ kmeans_get_nearest_group (struct Kmeans *kmeans, struct ccase *c, const struct q static void kmeans_recalculate_centers (struct Kmeans *kmeans, const struct casereader *reader, const struct qc *qc) { - casenumber i; + casenumber i = 0; int v, j; - double x, curval; struct ccase *c; - struct ccase *c_index; - struct casereader *cs; - struct casereader *cs_index; - int index; - i = 0; - cs = casereader_clone (reader); - cs_index = casereader_clone (kmeans->index_rdr); + struct casereader *cs = casereader_clone (reader); + struct casereader *cs_index = casereader_clone (kmeans->index_rdr); gsl_matrix_set_all (kmeans->centers, 0.0); for (; (c = casereader_read (cs)) != NULL; case_unref (c)) { double weight = qc->wv ? case_data (c, qc->wv)->f : 1.0; - c_index = casereader_read (cs_index); - index = case_data_idx (c_index, 0)->f; + struct ccase *c_index = casereader_read (cs_index); + int index = case_data_idx (c_index, 0)->f; for (v = 0; v < qc->n_vars; ++v) { - x = case_data (c, qc->vars[v])->f * weight; - curval = gsl_matrix_get (kmeans->centers, index, v); + const union value *val = case_data (c, qc->vars[v]); + double x = val->f * weight; + + if ( var_is_value_missing (qc->vars[v], val, qc->exclude)) + continue; + + double curval = gsl_matrix_get (kmeans->centers, index, v); gsl_matrix_set (kmeans->centers, index, v, curval + x); } i++; @@ -499,6 +498,7 @@ cmd_quick_cluster (struct lexer *lexer, struct dataset *ds) qc.ngroups = 2; qc.maxiter = 2; qc.missing_type = MISS_LISTWISE; + qc.exclude = MV_ANY; if (!parse_variables_const (lexer, dict, &qc.vars, &qc.n_vars, PV_NO_DUPLICATE | PV_NUMERIC)) @@ -506,9 +506,33 @@ cmd_quick_cluster (struct lexer *lexer, struct dataset *ds) return (CMD_FAILURE); } - if (lex_match (lexer, T_SLASH)) + while (lex_token (lexer) != T_ENDCMD) { - if (lex_match_id (lexer, "CRITERIA")) + lex_match (lexer, T_SLASH); + + if (lex_match_id (lexer, "MISSING")) + { + lex_match (lexer, T_EQUALS); + while (lex_token (lexer) != T_ENDCMD + && lex_token (lexer) != T_SLASH) + { + if (lex_match_id (lexer, "LISTWISE") || lex_match_id (lexer, "DEFAULT")) + { + qc.missing_type = MISS_LISTWISE; + } + else if (lex_match_id (lexer, "PAIRWISE")) + { + qc.missing_type = MISS_PAIRWISE; + } + else if (lex_match_id (lexer, "INCLUDE")) + { + qc.exclude = MV_SYSTEM; + } + else + goto error; + } + } + else if (lex_match_id (lexer, "CRITERIA")) { lex_match (lexer, T_EQUALS); while (lex_token (lexer) != T_ENDCMD diff --git a/tests/language/stats/quick-cluster.at b/tests/language/stats/quick-cluster.at index 05688e8e..9e3e87c8 100644 --- a/tests/language/stats/quick-cluster.at +++ b/tests/language/stats/quick-cluster.at @@ -151,5 +151,62 @@ AT_CHECK([pspp -o pspp-nm.csv quick-nmiss.sps]) AT_CHECK([diff pspp-m.csv pspp-nm.csv], [0]) +AT_CLEANUP + + +AT_SETUP([QUICK CLUSTER with pairwise missing]) +AT_DATA([quick-s.sps], [dnl +data list notable list /x * y *. +begin data. +1 2 +1 2.2 +1.1 1.9 +1 9 +1 10 +1.3 9.5 +0.9 8.9 +3.5 2 +3.4 3 +3.5 2.5 +3.1 2.0 +3.9 2.5 +3.8 2.0 +end data. + +QUICK CLUSTER x y + /CRITERIA = CLUSTER(3) MXITER (100) + . +]) + +AT_CHECK([pspp -O format=csv quick-s.sps | tail -5 > pspp-s.csv]) + +AT_DATA([quick-pw.sps], [dnl +data list notable list /x * y *. +begin data. +1 2 +1 2.2 +1.1 1.9 +1 9 +1 10 +1.3 9.5 +0.9 8.9 +3.5 2 +3.4 3 +3.5 2.5 +3.1 2.0 +3.9 . +3.8 . +end data. + +QUICK CLUSTER x y + /CRITERIA = CLUSTER(3) MXITER (100) + /MISSING = PAIRWISE + . +]) + +AT_CHECK([pspp -O format=csv quick-pw.sps | tail -5 > pspp-pw.csv]) + +AT_CHECK([diff pspp-s.csv pspp-pw.csv], [0]) + AT_CLEANUP -- 2.30.2