X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;ds=sidebyside;f=src%2Flanguage%2Fstats%2Fchisquare.c;h=1b8392724db880a3fdae76187a9e4b2a08ccbe57;hb=fab68e4ef0a6f43179bdd5bebe959ef391489a7a;hp=05fe41e9719fbb33c69bd02925e967bc01b47990;hpb=d7d263511bd18042408c40d5536c20c59988ee15;p=pspp-builds.git diff --git a/src/language/stats/chisquare.c b/src/language/stats/chisquare.c index 05fe41e9..1b839272 100644 --- a/src/language/stats/chisquare.c +++ b/src/language/stats/chisquare.c @@ -1,80 +1,70 @@ -/* PSPP - computes sample statistics. - Copyright (C) 2006 Free Software Foundation, Inc. +/* PSPP - a program for statistical analysis. + Copyright (C) 2006, 2007 Free Software Foundation, Inc. - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - 02110-1301, USA. */ + along with this program. If not, see . */ #include -#include -#include + +#include #include +#include #include -#include -#include -#include +#include #include #include - -#include -#include +#include +#include +#include +#include #include - -#include - +#include +#include +#include +#include +#include #include -#include - -#include "npar.h" -#include "chisquare.h" -#include "freq.h" -#include +#include #include "gettext.h" #define _(msgid) gettext (msgid) - - - -/* Return a hash table containing the frequency counts of each +/* Return a hash table containing the frequency counts of each value of VAR in CF . - It is the caller's responsibility to free the hash table when + It is the caller's responsibility to free the hash table when no longer required. */ static struct hsh_table * -create_freq_hash_with_range (const struct dictionary *dict, - const struct casefile *cf, - struct casefilter *filter, - const struct variable *var, - double lo, +create_freq_hash_with_range (const struct dictionary *dict, + struct casereader *input, + const struct variable *var, + double lo, double hi) { bool warn = true; float i_d; struct ccase c; - struct casereader *r = casefile_get_reader (cf, filter); - struct hsh_table *freq_hash = - hsh_create (4, compare_freq, hash_freq, + struct hsh_table *freq_hash = + hsh_create (4, compare_freq, hash_freq, free_freq_mutable_hash, (void *) var); /* Populate the hash with zero entries */ - for (i_d = trunc (lo); i_d <= trunc (hi); i_d += 1.0 ) + for (i_d = trunc (lo); i_d <= trunc (hi); i_d += 1.0 ) { union value the_value; struct freq_mutable *fr = xmalloc (sizeof (*fr)); @@ -87,24 +77,18 @@ create_freq_hash_with_range (const struct dictionary *dict, hsh_insert (freq_hash, fr); } - while (casereader_read(r, &c)) + while (casereader_read (input, &c)) { union value obs_value; struct freq **existing_fr; struct freq *fr = xmalloc(sizeof (*fr)); fr->value = case_data (&c, var); - if ( casefilter_variable_missing (filter, &c, var)) - { - free (fr); - continue; - } - fr->count = dict_get_case_weight (dict, &c, &warn); obs_value.f = trunc (fr->value->f); - if ( obs_value.f < lo || obs_value.f > hi) + if ( obs_value.f < lo || obs_value.f > hi) { free (fr); case_destroy (&c); @@ -115,7 +99,7 @@ create_freq_hash_with_range (const struct dictionary *dict, existing_fr = (struct freq **) hsh_probe (freq_hash, fr); - /* This must exist in the hash, because we previously populated it + /* This must exist in the hash, because we previously populated it with zero counts */ assert (*existing_fr); @@ -124,72 +108,70 @@ create_freq_hash_with_range (const struct dictionary *dict, case_destroy (&c); } - casereader_destroy (r); - - return freq_hash; + if (casereader_destroy (input)) + return freq_hash; + else + { + hsh_destroy (freq_hash); + return NULL; + } } -/* Return a hash table containing the frequency counts of each - value of VAR in CF . - It is the caller's responsibility to free the hash table when +/* Return a hash table containing the frequency counts of each + value of VAR in INPUT . + It is the caller's responsibility to free the hash table when no longer required. */ static struct hsh_table * -create_freq_hash (const struct dictionary *dict, - const struct casefile *cf, - struct casefilter *filter, +create_freq_hash (const struct dictionary *dict, + struct casereader *input, const struct variable *var) { bool warn = true; struct ccase c; - struct casereader *r = casefile_get_reader (cf, filter); - struct hsh_table *freq_hash = - hsh_create (4, compare_freq, hash_freq, - free_freq_hash, + struct hsh_table *freq_hash = + hsh_create (4, compare_freq, hash_freq, + free_freq_mutable_hash, (void *) var); - while (casereader_read(r, &c)) + for (; casereader_read (input, &c); case_destroy (&c)) { struct freq **existing_fr; struct freq *fr = xmalloc(sizeof (*fr)); - fr->value = case_data (&c, var ); - - if ( casefilter_variable_missing (filter, &c, var)) - { - free (fr); - continue; - } + fr->value = case_data (&c, var); fr->count = dict_get_case_weight (dict, &c, &warn); existing_fr = (struct freq **) hsh_probe (freq_hash, fr); - if ( *existing_fr) + if ( *existing_fr) { (*existing_fr)->count += fr->count; free (fr); } else { - *existing_fr = fr; + *existing_fr = fr; + fr->value = value_dup (fr->value, var_get_width (var)); } - - case_destroy (&c); } - casereader_destroy (r); - - return freq_hash; + if (casereader_destroy (input)) + return freq_hash; + else + { + hsh_destroy (freq_hash); + return NULL; + } } static struct tab_table * -create_variable_frequency_table (const struct dictionary *dict, - const struct casefile *cf, - struct casefilter *filter, - const struct chisquare_test *test, - int v, +create_variable_frequency_table (const struct dictionary *dict, + struct casereader *input, + const struct chisquare_test *test, + int v, struct hsh_table **freq_hash) { @@ -199,17 +181,21 @@ create_variable_frequency_table (const struct dictionary *dict, struct tab_table *table ; const struct variable *var = ost->vars[v]; - *freq_hash = create_freq_hash (dict, cf, filter, var); - + *freq_hash = create_freq_hash (dict, input, var); + if (*freq_hash == NULL) + return NULL; + n_cells = hsh_count (*freq_hash); - if ( test->n_expected > 0 && n_cells != test->n_expected ) + if ( test->n_expected > 0 && n_cells != test->n_expected ) { msg(ME, _("CHISQUARE test specified %d expected values, but" - " %d distinct values were encountered in variable %s."), - test->n_expected, n_cells, + " %d distinct values were encountered in variable %s."), + test->n_expected, n_cells, var_get_name (var) ); + hsh_destroy (*freq_hash); + *freq_hash = NULL; return NULL; } @@ -220,16 +206,16 @@ create_variable_frequency_table (const struct dictionary *dict, tab_text (table, 1, 0, TAB_LEFT, _("Observed N")); tab_text (table, 2, 0, TAB_LEFT, _("Expected N")); tab_text (table, 3, 0, TAB_LEFT, _("Residual")); - + tab_headers (table, 1, 0, 1, 0); - tab_box (table, TAL_1, TAL_1, -1, -1, + tab_box (table, TAL_1, TAL_1, -1, -1, 0, 0, table->nc - 1, tab_nr(table) - 1 ); tab_hline (table, TAL_1, 0, tab_nc(table) - 1, 1); tab_vline (table, TAL_2, 1, 0, tab_nr(table) - 1); - for ( i = 2 ; i < 4 ; ++i ) + for ( i = 2 ; i < 4 ; ++i ) tab_vline (table, TAL_1, i, 0, tab_nr(table) - 1); @@ -253,7 +239,7 @@ create_combo_frequency_table (const struct chisquare_test *test) tab_dim (table, tab_natural_dimensions); tab_title (table, _("Frequencies")); - for ( i = 0 ; i < ost->n_vars ; ++i ) + for ( i = 0 ; i < ost->n_vars ; ++i ) { const struct variable *var = ost->vars[i]; tab_text (table, i * 4 + 1, 1, TAB_LEFT, _("Category")); @@ -261,33 +247,33 @@ create_combo_frequency_table (const struct chisquare_test *test) tab_text (table, i * 4 + 3, 1, TAB_LEFT, _("Expected N")); tab_text (table, i * 4 + 4, 1, TAB_LEFT, _("Residual")); - tab_vline (table, TAL_2, i * 4 + 1, + tab_vline (table, TAL_2, i * 4 + 1, 0, tab_nr (table) - 1); - tab_vline (table, TAL_1, i * 4 + 2, + tab_vline (table, TAL_1, i * 4 + 2, 0, tab_nr (table) - 1); - tab_vline (table, TAL_1, i * 4 + 3, + tab_vline (table, TAL_1, i * 4 + 3, 1, tab_nr (table) - 1); - tab_vline (table, TAL_1, i * 4 + 4, + tab_vline (table, TAL_1, i * 4 + 4, 1, tab_nr (table) - 1); - tab_joint_text (table, + tab_joint_text (table, i * 4 + 1, 0, i * 4 + 4, 0, - TAB_CENTER, + TAB_CENTER, var_to_string (var)); } - for ( i = test->lo ; i <= test->hi ; ++i ) - tab_float (table, 0, 2 + i - test->lo, + for ( i = test->lo ; i <= test->hi ; ++i ) + tab_float (table, 0, 2 + i - test->lo, TAB_LEFT, 1 + i - test->lo, 8, 0); - + tab_headers (table, 1, 0, 2, 0); - tab_box (table, TAL_1, TAL_1, -1, -1, + tab_box (table, TAL_1, TAL_1, -1, -1, 0, 0, table->nc - 1, tab_nr(table) - 1 ); tab_hline (table, TAL_1, 1, tab_nc(table) - 1, 1); @@ -303,8 +289,9 @@ static struct tab_table * create_stats_table (const struct chisquare_test *test) { const struct one_sample_test *ost = (const struct one_sample_test*) test; - - struct tab_table *table = tab_create (1 + ost->n_vars, 4, 0); + + struct tab_table *table; + table = tab_create (1 + ost->n_vars, 4, 0); tab_dim (table, tab_natural_dimensions); tab_title (table, _("Test Statistics")); tab_headers (table, 1, 0, 1, 0); @@ -318,7 +305,7 @@ create_stats_table (const struct chisquare_test *test) tab_vline (table, TAL_2, 1, 0, tab_nr (table) - 1); tab_hline (table, TAL_1, 0, tab_nc (table) - 1, 1); - + tab_text (table, 0, 1, TAB_LEFT, _("Chi-Square")); tab_text (table, 0, 2, TAB_LEFT, _("df")); @@ -328,57 +315,57 @@ create_stats_table (const struct chisquare_test *test) } -void +void chisquare_execute (const struct dataset *ds, - const struct casefile *cf, - struct casefilter *filter, + struct casereader *input, + enum mv_class exclude, const struct npar_test *test) { const struct dictionary *dict = dataset_dict (ds); int v, i; struct one_sample_test *ost = (struct one_sample_test *) test; struct chisquare_test *cst = (struct chisquare_test *) test; - struct tab_table *stats_table = create_stats_table (cst); int n_cells = 0; double total_expected = 0.0; double *df = xzalloc (sizeof (*df) * ost->n_vars); double *xsq = xzalloc (sizeof (*df) * ost->n_vars); - - for ( i = 0 ; i < cst->n_expected ; ++i ) + bool ok; + + for ( i = 0 ; i < cst->n_expected ; ++i ) total_expected += cst->expected[i]; - if ( cst->ranged == false ) + if ( cst->ranged == false ) { - for ( v = 0 ; v < ost->n_vars ; ++v ) + for ( v = 0 ; v < ost->n_vars ; ++v ) { double total_obs = 0.0; struct hsh_table *freq_hash = NULL; - struct tab_table *freq_table = - create_variable_frequency_table(dict, cf, filter, cst, - v, &freq_hash); + struct casereader *reader = + casereader_create_filter_missing (casereader_clone (input), + &ost->vars[v], 1, exclude, NULL); + struct tab_table *freq_table = + create_variable_frequency_table(dict, reader, cst, v, &freq_hash); - struct freq **ff = (struct freq **) hsh_sort (freq_hash); + struct freq **ff; - if ( NULL == freq_table ) - { - hsh_destroy (freq_hash); - continue; - } + if ( NULL == freq_table ) + continue; + ff = (struct freq **) hsh_sort (freq_hash); n_cells = hsh_count (freq_hash); - for ( i = 0 ; i < n_cells ; ++i ) + for ( i = 0 ; i < n_cells ; ++i ) total_obs += ff[i]->count; xsq[v] = 0.0; - for ( i = 0 ; i < n_cells ; ++i ) + for ( i = 0 ; i < n_cells ; ++i ) { double exp; const union value *observed_value = ff[i]->value; /* The key */ - tab_text (freq_table, 0, i + 1, TAB_LEFT, + tab_text (freq_table, 0, i + 1, TAB_LEFT, var_get_value_name (ost->vars[v], observed_value)); /* The observed N */ @@ -386,9 +373,9 @@ chisquare_execute (const struct dataset *ds, ff[i]->count, 8, 0); if ( cst->n_expected > 0 ) - exp = cst->expected[i] * total_obs / total_expected ; + exp = cst->expected[i] * total_obs / total_expected ; else - exp = total_obs / (double) n_cells; + exp = total_obs / (double) n_cells; tab_float (freq_table, 2, i + 1, TAB_NONE, exp, 8, 2); @@ -413,32 +400,39 @@ chisquare_execute (const struct dataset *ds, else /* ranged == true */ { struct tab_table *freq_table = create_combo_frequency_table (cst); - + n_cells = cst->hi - cst->lo + 1; - for ( v = 0 ; v < ost->n_vars ; ++v ) + for ( v = 0 ; v < ost->n_vars ; ++v ) { double total_obs = 0.0; - struct hsh_table *freq_hash = - create_freq_hash_with_range (dict, cf, filter, ost->vars[v], - cst->lo, cst->hi); + struct casereader *reader = + casereader_create_filter_missing (casereader_clone (input), + &ost->vars[v], 1, exclude, NULL); + struct hsh_table *freq_hash = + create_freq_hash_with_range (dict, reader, + ost->vars[v], cst->lo, cst->hi); + + struct freq **ff; - struct freq **ff = (struct freq **) hsh_sort (freq_hash); + if (freq_hash == NULL) + continue; + ff = (struct freq **) hsh_sort (freq_hash); assert ( n_cells == hsh_count (freq_hash)); - for ( i = 0 ; i < hsh_count (freq_hash) ; ++i ) + for ( i = 0 ; i < hsh_count (freq_hash) ; ++i ) total_obs += ff[i]->count; xsq[v] = 0.0; - for ( i = 0 ; i < hsh_count (freq_hash) ; ++i ) + for ( i = 0 ; i < hsh_count (freq_hash) ; ++i ) { double exp; const union value *observed_value = ff[i]->value; /* The key */ - tab_text (freq_table, v * 4 + 1, i + 2 , TAB_LEFT, + tab_text (freq_table, v * 4 + 1, i + 2 , TAB_LEFT, var_get_value_name (ost->vars[v], observed_value)); /* The observed N */ @@ -446,9 +440,9 @@ chisquare_execute (const struct dataset *ds, ff[i]->count, 8, 0); if ( cst->n_expected > 0 ) - exp = cst->expected[i] * total_obs / total_expected ; + exp = cst->expected[i] * total_obs / total_expected ; else - exp = total_obs / (double) hsh_count (freq_hash); + exp = total_obs / (double) hsh_count (freq_hash); /* The expected N */ tab_float (freq_table, v * 4 + 3, i + 2 , TAB_NONE, @@ -461,36 +455,41 @@ chisquare_execute (const struct dataset *ds, xsq[v] += (ff[i]->count - exp) * (ff[i]->count - exp) / exp; } - + tab_float (freq_table, v * 4 + 2, tab_nr (freq_table) - 1, TAB_NONE, total_obs, 8, 0); - + df[v] = n_cells - 1.0; - + hsh_destroy (freq_hash); } tab_submit (freq_table); } + ok = !taint_has_tainted_successor (casereader_get_taint (input)); + casereader_destroy (input); - - /* Populate the summary statistics table */ - for ( v = 0 ; v < ost->n_vars ; ++v ) + if (ok) { - const struct variable *var = ost->vars[v]; + struct tab_table *stats_table = create_stats_table (cst); - tab_text (stats_table, 1 + v, 0, TAB_CENTER, var_get_name (var)); + /* Populate the summary statistics table */ + for ( v = 0 ; v < ost->n_vars ; ++v ) + { + const struct variable *var = ost->vars[v]; - tab_float (stats_table, 1 + v, 1, TAB_NONE, xsq[v], 8,3); - tab_float (stats_table, 1 + v, 2, TAB_NONE, df[v], 8,0); + tab_text (stats_table, 1 + v, 0, TAB_CENTER, var_get_name (var)); - tab_float (stats_table, 1 + v, 3, TAB_NONE, - gsl_cdf_chisq_Q (xsq[v], df[v]), 8,3); + tab_float (stats_table, 1 + v, 1, TAB_NONE, xsq[v], 8,3); + tab_float (stats_table, 1 + v, 2, TAB_NONE, df[v], 8,0); + + tab_float (stats_table, 1 + v, 3, TAB_NONE, + gsl_cdf_chisq_Q (xsq[v], df[v]), 8,3); + } + tab_submit (stats_table); } free (xsq); free (df); - - tab_submit (stats_table); }