X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Fchisquare.c;h=d1d81bab39dcc26e422aa32e96354c5c16a808d5;hb=f6c6ee97c412e47eb1c7c30adc224e93b6c1586a;hp=a406edc7433d8ff9ef9f128d68385e32ef590f12;hpb=35c4cb8cfb59bf6e1eb770114850e1184cfafc9b;p=pspp diff --git a/src/language/stats/chisquare.c b/src/language/stats/chisquare.c index a406edc743..d1d81bab39 100644 --- a/src/language/stats/chisquare.c +++ b/src/language/stats/chisquare.c @@ -1,497 +1,349 @@ -/* PSPP - computes sample statistics. - Copyright (C) 2006, 2007 Free Software Foundation, Inc. +/* PSPP - a program for statistical analysis. + Copyright (C) 2006, 2007, 2009, 2010, 2011 Free Software Foundation, Inc. - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - 02110-1301, USA. */ + along with this program. If not, see <http://www.gnu.org/licenses/>. 
*/ #include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include +#include "language/stats/chisquare.h" #include - -#include -#include - -#include "npar.h" -#include "chisquare.h" -#include "freq.h" - #include +#include + +#include "data/case.h" +#include "data/casereader.h" +#include "data/dataset.h" +#include "data/dictionary.h" +#include "data/format.h" +#include "data/value-labels.h" +#include "data/variable.h" +#include "language/stats/freq.h" +#include "language/stats/npar.h" +#include "libpspp/array.h" +#include "libpspp/assertion.h" +#include "libpspp/cast.h" +#include "libpspp/compiler.h" +#include "libpspp/hash-functions.h" +#include "libpspp/message.h" +#include "libpspp/taint.h" +#include "output/pivot-table.h" + +#include "gl/xalloc.h" #include "gettext.h" +#define N_(msgid) msgid #define _(msgid) gettext (msgid) - - - -/* Return a hash table containing the frequency counts of each - value of VAR in CF . - It is the caller's responsibility to free the hash table when - no longer required. -*/ -static struct hsh_table * -create_freq_hash_with_range (const struct dictionary *dict, - const struct casefile *cf, - struct casefilter *filter, - const struct variable *var, - double lo, - double hi) +/* Adds frequency counts of each value of VAR in INPUT between LO and HI to + FREQ_HASH. LO and HI and each input value is truncated to an integer. + Returns true if successful, false on input error. It is the caller's + responsibility to initialize FREQ_HASH and to free it when no longer + required, even on failure. 
*/ +static bool +create_freq_hash_with_range (const struct dictionary *dict, + struct casereader *input, + const struct variable *var, + double lo_, double hi_, + struct hmap *freq_hash) { + struct freq **entries; bool warn = true; - float i_d; - struct ccase c; - struct casereader *r = casefile_get_reader (cf, filter); + struct ccase *c; + double lo, hi; + double i_d; - struct hsh_table *freq_hash = - hsh_create (4, compare_freq, hash_freq, - free_freq_mutable_hash, - (void *) var); + assert (var_is_numeric (var)); + lo = trunc (lo_); + hi = trunc (hi_); /* Populate the hash with zero entries */ - for (i_d = trunc (lo); i_d <= trunc (hi); i_d += 1.0 ) + entries = xnmalloc (hi - lo + 1, sizeof *entries); + for (i_d = lo; i_d <= hi; i_d += 1.0) { - union value the_value; - struct freq_mutable *fr = xmalloc (sizeof (*fr)); - - the_value.f = i_d; - - fr->value = value_dup (&the_value, 0); - fr->count = 0; - - hsh_insert (freq_hash, fr); + size_t ofs = i_d - lo; + union value value = { i_d }; + entries[ofs] = freq_hmap_insert (freq_hash, &value, 0, + value_hash (&value, 0, 0)); } - while (casereader_read(r, &c)) + for (; (c = casereader_read (input)) != NULL; case_unref (c)) { - union value obs_value; - struct freq **existing_fr; - struct freq *fr = xmalloc(sizeof (*fr)); - fr->value = case_data (&c, var); - - if ( casefilter_variable_missing (filter, &c, var)) - { - free (fr); - continue; - } - - fr->count = dict_get_case_weight (dict, &c, &warn); - - obs_value.f = trunc (fr->value->f); - - if ( obs_value.f < lo || obs_value.f > hi) - { - free (fr); - case_destroy (&c); - continue; - } - - fr->value = &obs_value; - - existing_fr = (struct freq **) hsh_probe (freq_hash, fr); - - /* This must exist in the hash, because we previously populated it - with zero counts */ - assert (*existing_fr); - - (*existing_fr)->count += fr->count; - free (fr); - - case_destroy (&c); + double x = trunc (case_num (c, var)); + if (x >= lo && x <= hi) + { + size_t ofs = x - lo; + struct 
freq *fr = entries[ofs]; + fr->count += dict_get_case_weight (dict, c, &warn); + } } - casereader_destroy (r); - - return freq_hash; -} - -/* Return a hash table containing the frequency counts of each - value of VAR in CF . - It is the caller's responsibility to free the hash table when - no longer required. -*/ -static struct hsh_table * -create_freq_hash (const struct dictionary *dict, - const struct casefile *cf, - struct casefilter *filter, - const struct variable *var) -{ - bool warn = true; - struct ccase c; - struct casereader *r = casefile_get_reader (cf, filter); - - struct hsh_table *freq_hash = - hsh_create (4, compare_freq, hash_freq, - free_freq_mutable_hash, - (void *) var); - - while (casereader_read(r, &c)) - { - struct freq **existing_fr; - struct freq *fr = xmalloc(sizeof (*fr)); - fr->value = case_data (&c, var ); - - if ( casefilter_variable_missing (filter, &c, var)) - { - free (fr); - continue; - } - - fr->count = dict_get_case_weight (dict, &c, &warn); - - existing_fr = (struct freq **) hsh_probe (freq_hash, fr); - if ( *existing_fr) - { - (*existing_fr)->count += fr->count; - free (fr); - } - else - { - *existing_fr = fr; - fr->value = value_dup (fr->value, var_get_width (var)); - } - - case_destroy (&c); - } - casereader_destroy (r); + free (entries); - return freq_hash; + return casereader_destroy (input); } - - -static struct tab_table * -create_variable_frequency_table (const struct dictionary *dict, - const struct casefile *cf, - struct casefilter *filter, - const struct chisquare_test *test, - int v, - struct hsh_table **freq_hash) - +/* Adds frequency counts of each value of VAR in INPUT to FREQ_HASH. LO and HI + and each input value is truncated to an integer. Returns true if + successful, false on input error. It is the caller's responsibility to + initialize FREQ_HASH and to free it when no longer required, even on + failure. 
*/ +static bool +create_freq_hash (const struct dictionary *dict, + struct casereader *input, + const struct variable *var, + struct hmap *freq_hash) { - int i; - const struct one_sample_test *ost = (const struct one_sample_test*)test; - int n_cells; - struct tab_table *table ; - const struct variable *var = ost->vars[v]; - - *freq_hash = create_freq_hash (dict, cf, filter, var); - - n_cells = hsh_count (*freq_hash); - - if ( test->n_expected > 0 && n_cells != test->n_expected ) - { - msg(ME, _("CHISQUARE test specified %d expected values, but" - " %d distinct values were encountered in variable %s."), - test->n_expected, n_cells, - var_get_name (var) - ); - return NULL; - } - - table = tab_create(4, n_cells + 2, 0); - tab_dim (table, tab_natural_dimensions); - - tab_title (table, var_to_string(var)); - tab_text (table, 1, 0, TAB_LEFT, _("Observed N")); - tab_text (table, 2, 0, TAB_LEFT, _("Expected N")); - tab_text (table, 3, 0, TAB_LEFT, _("Residual")); - - tab_headers (table, 1, 0, 1, 0); - - tab_box (table, TAL_1, TAL_1, -1, -1, - 0, 0, table->nc - 1, tab_nr(table) - 1 ); - - tab_hline (table, TAL_1, 0, tab_nc(table) - 1, 1); - - tab_vline (table, TAL_2, 1, 0, tab_nr(table) - 1); - for ( i = 2 ; i < 4 ; ++i ) - tab_vline (table, TAL_1, i, 0, tab_nr(table) - 1); - - - tab_text (table, 0, table->nr - 1, TAB_LEFT, _("Total")); - - return table; -} - - -static struct tab_table * -create_combo_frequency_table (const struct chisquare_test *test) -{ - int i; - const struct one_sample_test *ost = (const struct one_sample_test*)test; - - struct tab_table *table ; - - int n_cells = test->hi - test->lo + 1; - - table = tab_create(1 + ost->n_vars * 4, n_cells + 3, 0); - tab_dim (table, tab_natural_dimensions); + int width = var_get_width (var); + bool warn = true; + struct ccase *c; - tab_title (table, _("Frequencies")); - for ( i = 0 ; i < ost->n_vars ; ++i ) + for (; (c = casereader_read (input)) != NULL; case_unref (c)) { - const struct variable *var = ost->vars[i]; - 
tab_text (table, i * 4 + 1, 1, TAB_LEFT, _("Category")); - tab_text (table, i * 4 + 2, 1, TAB_LEFT, _("Observed N")); - tab_text (table, i * 4 + 3, 1, TAB_LEFT, _("Expected N")); - tab_text (table, i * 4 + 4, 1, TAB_LEFT, _("Residual")); - - tab_vline (table, TAL_2, i * 4 + 1, - 0, tab_nr (table) - 1); - - tab_vline (table, TAL_1, i * 4 + 2, - 0, tab_nr (table) - 1); - - tab_vline (table, TAL_1, i * 4 + 3, - 1, tab_nr (table) - 1); + const union value *value = case_data (c, var); + size_t hash = value_hash (value, width, 0); + double weight = dict_get_case_weight (dict, c, &warn); + struct freq *f; - tab_vline (table, TAL_1, i * 4 + 4, - 1, tab_nr (table) - 1); + f = freq_hmap_search (freq_hash, value, width, hash); + if (f == NULL) + f = freq_hmap_insert (freq_hash, value, width, hash); - - tab_joint_text (table, - i * 4 + 1, 0, - i * 4 + 4, 0, - TAB_CENTER, - var_to_string (var)); + f->count += weight; } - for ( i = test->lo ; i <= test->hi ; ++i ) - tab_float (table, 0, 2 + i - test->lo, - TAB_LEFT, 1 + i - test->lo, 8, 0); - - tab_headers (table, 1, 0, 2, 0); - - tab_box (table, TAL_1, TAL_1, -1, -1, - 0, 0, table->nc - 1, tab_nr(table) - 1 ); - - tab_hline (table, TAL_1, 1, tab_nc(table) - 1, 1); - tab_hline (table, TAL_1, 0, tab_nc(table) - 1, 2); - - tab_text (table, 0, table->nr - 1, TAB_LEFT, _("Total")); - - return table; + return casereader_destroy (input); } - -static struct tab_table * -create_stats_table (const struct chisquare_test *test) -{ - const struct one_sample_test *ost = (const struct one_sample_test*) test; - - struct tab_table *table = tab_create (1 + ost->n_vars, 4, 0); - tab_dim (table, tab_natural_dimensions); - tab_title (table, _("Test Statistics")); - tab_headers (table, 1, 0, 1, 0); - - tab_box (table, TAL_1, TAL_1, -1, -1, - 0, 0, tab_nc(table) - 1, tab_nr(table) - 1 ); - - tab_box (table, -1, -1, -1, TAL_1, - 1, 0, tab_nc(table) - 1, tab_nr(table) - 1 ); - - - tab_vline (table, TAL_2, 1, 0, tab_nr (table) - 1); - tab_hline (table, 
TAL_1, 0, tab_nc (table) - 1, 1); - - - tab_text (table, 0, 1, TAB_LEFT, _("Chi-Square")); - tab_text (table, 0, 2, TAB_LEFT, _("df")); - tab_text (table, 0, 3, TAB_LEFT, _("Asymp. Sig.")); - - return table; -} - - -void +void chisquare_execute (const struct dataset *ds, - const struct casefile *cf, - struct casefilter *filter, - const struct npar_test *test) + struct casereader *input, + enum mv_class exclude, + const struct npar_test *test, + bool exact UNUSED, + double timer UNUSED) { const struct dictionary *dict = dataset_dict (ds); int v, i; - struct one_sample_test *ost = (struct one_sample_test *) test; - struct chisquare_test *cst = (struct chisquare_test *) test; - struct tab_table *stats_table = create_stats_table (cst); - int n_cells = 0; + struct chisquare_test *cst = UP_CAST (test, struct chisquare_test, + parent.parent); + struct one_sample_test *ost = &cst->parent; double total_expected = 0.0; - double *df = xzalloc (sizeof (*df) * ost->n_vars); - double *xsq = xzalloc (sizeof (*df) * ost->n_vars); - - for ( i = 0 ; i < cst->n_expected ; ++i ) + double *df = XCALLOC (ost->n_vars, double); + double *xsq = XCALLOC (ost->n_vars, double); + bool ok; + + for (i = 0 ; i < cst->n_expected ; ++i) total_expected += cst->expected[i]; - if ( cst->ranged == false ) + if (cst->ranged == false) { - for ( v = 0 ; v < ost->n_vars ; ++v ) + for (v = 0 ; v < ost->n_vars ; ++v) { - double total_obs = 0.0; - struct hsh_table *freq_hash = NULL; - struct tab_table *freq_table = - create_variable_frequency_table(dict, cf, filter, cst, - v, &freq_hash); + const struct variable *var = ost->vars[v]; + + struct hmap freq_hash = HMAP_INITIALIZER (freq_hash); + struct casereader *reader = + casereader_create_filter_missing (casereader_clone (input), + &var, 1, exclude, + NULL, NULL); + if (!create_freq_hash (dict, reader, var, &freq_hash)) + { + freq_hmap_destroy (&freq_hash, var_get_width (var)); + return; + } + + size_t n_cells = hmap_count (&freq_hash); + if (cst->n_expected 
> 0 && n_cells != cst->n_expected) + { + msg (ME, _("CHISQUARE test specified %d expected values, but " + "variable %s has %zu distinct values."), + cst->n_expected, var_get_name (var), n_cells); + freq_hmap_destroy (&freq_hash, var_get_width (var)); + continue; + } + + struct pivot_table *table = pivot_table_create__ ( + pivot_value_new_variable (var), "Chisquare"); + pivot_table_set_weight_var (table, dict_get_weight (dict)); + + pivot_dimension_create ( + table, PIVOT_AXIS_COLUMN, N_("Statistics"), + N_("Observed N"), PIVOT_RC_COUNT, + N_("Expected N"), PIVOT_RC_OTHER, + N_("Residual"), PIVOT_RC_RESIDUAL); + + struct freq **ff = freq_hmap_sort (&freq_hash, var_get_width (var)); - struct freq **ff = (struct freq **) hsh_sort (freq_hash); - - if ( NULL == freq_table ) - { - hsh_destroy (freq_hash); - continue; - } - - n_cells = hsh_count (freq_hash); - - for ( i = 0 ; i < n_cells ; ++i ) + double total_obs = 0.0; + for (size_t i = 0; i < n_cells; i++) total_obs += ff[i]->count; + struct pivot_dimension *values = pivot_dimension_create ( + table, PIVOT_AXIS_ROW, N_("Value")); + values->root->show_label = true; + xsq[v] = 0.0; - for ( i = 0 ; i < n_cells ; ++i ) + for (size_t i = 0; i < n_cells; i++) { - double exp; - const union value *observed_value = ff[i]->value; - - /* The key */ - tab_text (freq_table, 0, i + 1, TAB_LEFT, - var_get_value_name (ost->vars[v], observed_value)); - - /* The observed N */ - tab_float (freq_table, 1, i + 1, TAB_NONE, - ff[i]->count, 8, 0); - - if ( cst->n_expected > 0 ) - exp = cst->expected[i] * total_obs / total_expected ; - else - exp = total_obs / (double) n_cells; - - tab_float (freq_table, 2, i + 1, TAB_NONE, - exp, 8, 2); - - /* The residual */ - tab_float (freq_table, 3, i + 1, TAB_NONE, - ff[i]->count - exp, 8, 2); + int row = pivot_category_create_leaf ( + values->root, pivot_value_new_var_value ( + var, &ff[i]->values[0])); + + double exp = (cst->n_expected > 0 + ? 
cst->expected[i] * total_obs / total_expected + : total_obs / (double) n_cells); + double entries[] = { + ff[i]->count, + exp, + ff[i]->count - exp, + }; + for (size_t j = 0; j < sizeof entries / sizeof *entries; j++) + pivot_table_put2 ( + table, j, row, pivot_value_new_number (entries[j])); xsq[v] += (ff[i]->count - exp) * (ff[i]->count - exp) / exp; } df[v] = n_cells - 1.0; - tab_float (freq_table, 1, i + 1, TAB_NONE, - total_obs, 8, 0); + int row = pivot_category_create_leaf ( + values->root, pivot_value_new_text (N_("Total"))); + pivot_table_put2 (table, 0, row, + pivot_value_new_number (total_obs)); - tab_submit (freq_table); + pivot_table_submit (table); - hsh_destroy (freq_hash); + freq_hmap_destroy (&freq_hash, var_get_width (var)); + free (ff); } } else /* ranged == true */ { - struct tab_table *freq_table = create_combo_frequency_table (cst); - - n_cells = cst->hi - cst->lo + 1; - - for ( v = 0 ; v < ost->n_vars ; ++v ) + struct pivot_table *table = pivot_table_create (N_("Frequencies")); + pivot_table_set_weight_var (table, dict_get_weight (dict)); + + pivot_dimension_create ( + table, PIVOT_AXIS_COLUMN, N_("Statistics"), + N_("Category"), + N_("Observed N"), PIVOT_RC_COUNT, + N_("Expected N"), PIVOT_RC_OTHER, + N_("Residual"), PIVOT_RC_RESIDUAL); + + struct pivot_dimension *var_dim + = pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Variable")); + for (size_t i = 0 ; i < ost->n_vars ; ++i) + pivot_category_create_leaf (var_dim->root, + pivot_value_new_variable (ost->vars[i])); + + struct pivot_dimension *category_dim + = pivot_dimension_create (table, PIVOT_AXIS_ROW, N_("Category")); + size_t n_cells = cst->hi - cst->lo + 1; + for (size_t i = 0 ; i < n_cells; ++i) + pivot_category_create_leaf (category_dim->root, + pivot_value_new_integer (i + 1)); + pivot_category_create_leaves (category_dim->root, N_("Total")); + + for (size_t v = 0 ; v < ost->n_vars ; ++v) { - double total_obs = 0.0; - struct hsh_table *freq_hash = - 
create_freq_hash_with_range (dict, cf, filter, ost->vars[v], - cst->lo, cst->hi); - - struct freq **ff = (struct freq **) hsh_sort (freq_hash); - - assert ( n_cells == hsh_count (freq_hash)); - - for ( i = 0 ; i < hsh_count (freq_hash) ; ++i ) + const struct variable *var = ost->vars[v]; + struct casereader *reader = + casereader_create_filter_missing (casereader_clone (input), + &var, 1, exclude, + NULL, NULL); + struct hmap freq_hash = HMAP_INITIALIZER (freq_hash); + if (!create_freq_hash_with_range (dict, reader, var, + cst->lo, cst->hi, &freq_hash)) + { + freq_hmap_destroy (&freq_hash, var_get_width (var)); + continue; + } + + struct freq **ff = freq_hmap_sort (&freq_hash, var_get_width (var)); + + double total_obs = 0.0; + for (size_t i = 0 ; i < hmap_count (&freq_hash) ; ++i) total_obs += ff[i]->count; xsq[v] = 0.0; - for ( i = 0 ; i < hsh_count (freq_hash) ; ++i ) + for (size_t i = 0 ; i < hmap_count (&freq_hash) ; ++i) { - double exp; - - const union value *observed_value = ff[i]->value; - - /* The key */ - tab_text (freq_table, v * 4 + 1, i + 2 , TAB_LEFT, - var_get_value_name (ost->vars[v], observed_value)); - - /* The observed N */ - tab_float (freq_table, v * 4 + 2, i + 2 , TAB_NONE, - ff[i]->count, 8, 0); - - if ( cst->n_expected > 0 ) - exp = cst->expected[i] * total_obs / total_expected ; - else - exp = total_obs / (double) hsh_count (freq_hash); - - /* The expected N */ - tab_float (freq_table, v * 4 + 3, i + 2 , TAB_NONE, - exp, 8, 2); + /* Category. */ + pivot_table_put3 (table, 0, v, i, + pivot_value_new_var_value ( + var, &ff[i]->values[0])); + + double exp = (cst->n_expected > 0 + ? 
cst->expected[i] * total_obs / total_expected + : total_obs / (double) hmap_count (&freq_hash)); + double entries[] = { + ff[i]->count, + exp, + ff[i]->count - exp, + }; + for (size_t j = 0; j < sizeof entries / sizeof *entries; j++) + pivot_table_put3 (table, j + 1, v, i, + pivot_value_new_number (entries[j])); - /* The residual */ - tab_float (freq_table, v * 4 + 4, i + 2 , TAB_NONE, - ff[i]->count - exp, 8, 2); xsq[v] += (ff[i]->count - exp) * (ff[i]->count - exp) / exp; } - - tab_float (freq_table, v * 4 + 2, tab_nr (freq_table) - 1, TAB_NONE, - total_obs, 8, 0); - df[v] = n_cells - 1.0; - - hsh_destroy (freq_hash); + + freq_hmap_destroy (&freq_hash, var_get_width (var)); + free (ff); + + pivot_table_put3 (table, 1, v, n_cells, + pivot_value_new_number (total_obs)); } - tab_submit (freq_table); + pivot_table_submit (table); } + ok = !taint_has_tainted_successor (casereader_get_taint (input)); + casereader_destroy (input); - - /* Populate the summary statistics table */ - for ( v = 0 ; v < ost->n_vars ; ++v ) + if (ok) { - const struct variable *var = ost->vars[v]; - - tab_text (stats_table, 1 + v, 0, TAB_CENTER, var_get_name (var)); - - tab_float (stats_table, 1 + v, 1, TAB_NONE, xsq[v], 8,3); - tab_float (stats_table, 1 + v, 2, TAB_NONE, df[v], 8,0); - - tab_float (stats_table, 1 + v, 3, TAB_NONE, - gsl_cdf_chisq_Q (xsq[v], df[v]), 8,3); + struct pivot_table *table = pivot_table_create (N_("Test Statistics")); + + pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"), + N_("Chi-square"), PIVOT_RC_OTHER, + N_("df"), PIVOT_RC_INTEGER, + N_("Asymp. 
Sig."), PIVOT_RC_SIGNIFICANCE); + + struct pivot_dimension *variables = pivot_dimension_create ( + table, PIVOT_AXIS_ROW, N_("Variable")); + + for (size_t v = 0 ; v < ost->n_vars ; ++v) + { + const struct variable *var = ost->vars[v]; + + int row = pivot_category_create_leaf ( + variables->root, pivot_value_new_variable (var)); + + double sig = gsl_cdf_chisq_Q (xsq[v], df[v]); + double entries[] = { xsq[v], df[v], sig }; + for (size_t i = 0; i < sizeof entries / sizeof *entries; i++) + pivot_table_put2 (table, i, row, + pivot_value_new_number (entries[i])); + } + pivot_table_submit (table); } free (xsq); free (df); - - tab_submit (stats_table); }