X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Fquick-cluster.c;h=68b50123144e2b0b10c19d494d2e4b02b32ac430;hb=refs%2Fheads%2Fcenter-titles;hp=9adcc64243a70c50539909304f758d09488a4020;hpb=4da69790b2ee4d597b0f04a9a0a13d4dfa9fbe9a;p=pspp diff --git a/src/language/stats/quick-cluster.c b/src/language/stats/quick-cluster.c index 9adcc64243..68b5012314 100644 --- a/src/language/stats/quick-cluster.c +++ b/src/language/stats/quick-cluster.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2011 Free Software Foundation, Inc. + Copyright (C) 2011, 2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -46,6 +46,13 @@ #define _(msgid) gettext (msgid) #define N_(msgid) msgid +enum missing_type + { + MISS_LISTWISE, + MISS_PAIRWISE, + }; + + struct qc { const struct variable **vars; @@ -55,6 +62,9 @@ struct qc int maxiter; /* Maximum iterations (Given by the user) */ const struct variable *wv; /* Weighting variable. */ + + enum missing_type missing_type; + enum mv_class exclude; }; /* Holds all of the information for the functions. int n, holds the number of @@ -174,18 +184,18 @@ static int kmeans_get_nearest_group (struct Kmeans *kmeans, struct ccase *c, const struct qc *qc) { int result = -1; - double x; int i, j; - double dist; - double mindist; - mindist = INFINITY; + double mindist = INFINITY; for (i = 0; i < qc->ngroups; i++) { - dist = 0; + double dist = 0; for (j = 0; j < qc->n_vars; j++) { - x = case_data (c, qc->vars[j])->f; - dist += pow2 (gsl_matrix_get (kmeans->centers, i, j) - x); + const union value *val = case_data (c, qc->vars[j]); + if ( var_is_value_missing (qc->vars[j], val, qc->exclude)) + continue; + + dist += pow2 (gsl_matrix_get (kmeans->centers, i, j) - val->f); } if (dist < mindist) { @@ -200,28 +210,28 @@ kmeans_get_nearest_group (struct Kmeans *kmeans, struct ccase *c, const struct q static void kmeans_recalculate_centers (struct Kmeans *kmeans, const struct casereader *reader, const struct qc *qc) { - casenumber i; + casenumber i = 0; int v, j; - double x, curval; struct ccase *c; - struct ccase *c_index; - struct casereader *cs; - struct casereader *cs_index; - int index; - i = 0; - cs = casereader_clone (reader); - cs_index = casereader_clone (kmeans->index_rdr); + struct casereader *cs = casereader_clone (reader); + struct casereader *cs_index = casereader_clone (kmeans->index_rdr); gsl_matrix_set_all (kmeans->centers, 0.0); for (; (c = casereader_read (cs)) != NULL; case_unref (c)) { double weight = qc->wv ? case_data (c, qc->wv)->f : 1.0; - c_index = casereader_read (cs_index); - index = case_data_idx (c_index, 0)->f; + struct ccase *c_index = casereader_read (cs_index); + int index = case_data_idx (c_index, 0)->f; for (v = 0; v < qc->n_vars; ++v) { - x = case_data (c, qc->vars[v])->f * weight; + const union value *val = case_data (c, qc->vars[v]); + double x = val->f * weight; + double curval; + + if ( var_is_value_missing (qc->vars[v], val, qc->exclude)) + continue; + curval = gsl_matrix_get (kmeans->centers, index, v); gsl_matrix_set (kmeans->centers, index, v, curval + x); } @@ -279,6 +289,7 @@ kmeans_calculate_indexes_and_check_convergence (struct Kmeans *kmeans, const str struct ccase *index_case_new = case_create (kmeans->proto); int bestindex = kmeans_get_nearest_group (kmeans, c, qc); double weight = qc->wv ? case_data (c, qc->wv)->f : 1.0; + assert (bestindex < kmeans->num_elements_groups->size); kmeans->num_elements_groups->data[bestindex] += weight; if (kmeans->index_rdr) { @@ -379,11 +390,10 @@ static void quick_cluster_show_centers (struct Kmeans *kmeans, bool initial, const struct qc *qc) { struct tab_table *t; - int nc, nr, heading_columns, currow; + int nc, nr, currow; int i, j; nc = qc->ngroups + 1; nr = qc->n_vars + 4; - heading_columns = 1; t = tab_create (nc, nr); tab_headers (t, 0, nc - 1, 0, 1); currow = 0; @@ -422,14 +432,14 @@ quick_cluster_show_centers (struct Kmeans *kmeans, bool initial, const struct qc tab_double (t, i + 1, j + 4, TAB_CENTER, gsl_matrix_get (kmeans->centers, kmeans->group_order->data[i], j), - var_get_print_format (qc->vars[j])); + var_get_print_format (qc->vars[j]), RC_OTHER); } else { tab_double (t, i + 1, j + 4, TAB_CENTER, gsl_matrix_get (kmeans->initial_centers, kmeans->group_order->data[i], j), - var_get_print_format (qc->vars[j])); + var_get_print_format (qc->vars[j]), RC_OTHER); } } } @@ -487,6 +497,8 @@ cmd_quick_cluster (struct lexer *lexer, struct dataset *ds) const struct dictionary *dict = dataset_dict (ds); qc.ngroups = 2; qc.maxiter = 2; + qc.missing_type = MISS_LISTWISE; + qc.exclude = MV_ANY; if (!parse_variables_const (lexer, dict, &qc.vars, &qc.n_vars, PV_NO_DUPLICATE | PV_NUMERIC)) @@ -494,9 +506,37 @@ cmd_quick_cluster (struct lexer *lexer, struct dataset *ds) return (CMD_FAILURE); } - if (lex_match (lexer, T_SLASH)) + while (lex_token (lexer) != T_ENDCMD) { - if (lex_match_id (lexer, "CRITERIA")) + lex_match (lexer, T_SLASH); + + if (lex_match_id (lexer, "MISSING")) + { + lex_match (lexer, T_EQUALS); + while (lex_token (lexer) != T_ENDCMD + && lex_token (lexer) != T_SLASH) + { + if (lex_match_id (lexer, "LISTWISE") || lex_match_id (lexer, "DEFAULT")) + { + qc.missing_type = MISS_LISTWISE; + } + else if (lex_match_id (lexer, "PAIRWISE")) + { + qc.missing_type = MISS_PAIRWISE; + } + else if (lex_match_id (lexer, "INCLUDE")) + { + qc.exclude = MV_SYSTEM; + } + else if (lex_match_id (lexer, "EXCLUDE")) + { + qc.exclude = MV_ANY; + } + else + goto error; + } + } + else if (lex_match_id (lexer, "CRITERIA")) { lex_match (lexer, T_EQUALS); while (lex_token (lexer) != T_ENDCMD @@ -508,6 +548,11 @@ cmd_quick_cluster (struct lexer *lexer, struct dataset *ds) { lex_force_int (lexer); qc.ngroups = lex_integer (lexer); + if (qc.ngroups <= 0) + { + lex_error (lexer, _("The number of clusters must be positive")); + goto error; + } lex_get (lexer); lex_force_match (lexer, T_RPAREN); } @@ -518,6 +563,11 @@ cmd_quick_cluster (struct lexer *lexer, struct dataset *ds) { lex_force_int (lexer); qc.maxiter = lex_integer (lexer); + if (qc.maxiter <= 0) + { + lex_error (lexer, _("The number of iterations must be positive")); + goto error; + } lex_get (lexer); lex_force_match (lexer, T_RPAREN); } @@ -536,6 +586,13 @@ cmd_quick_cluster (struct lexer *lexer, struct dataset *ds) while (casegrouper_get_next_group (grouper, &group)) { + if ( qc.missing_type == MISS_LISTWISE ) + { + group = casereader_create_filter_missing (group, qc.vars, qc.n_vars, + qc.exclude, + NULL, NULL); + } + kmeans = kmeans_create (&qc); kmeans_cluster (kmeans, group, &qc); quick_cluster_show_results (kmeans, &qc);