1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2006, 2007, 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/stats/chisquare.h"
21 #include <gsl/gsl_cdf.h>
25 #include "data/case.h"
26 #include "data/casereader.h"
27 #include "data/dataset.h"
28 #include "data/dictionary.h"
29 #include "data/format.h"
30 #include "data/value-labels.h"
31 #include "data/variable.h"
32 #include "language/stats/freq.h"
33 #include "language/stats/npar.h"
34 #include "libpspp/array.h"
35 #include "libpspp/assertion.h"
36 #include "libpspp/cast.h"
37 #include "libpspp/compiler.h"
38 #include "libpspp/hash-functions.h"
39 #include "libpspp/message.h"
40 #include "libpspp/taint.h"
41 #include "output/tab.h"
43 #include "gl/xalloc.h"
46 #define _(msgid) gettext (msgid)
48 /* Adds frequency counts of each value of VAR in INPUT between LO and HI to
49 FREQ_HASH. LO and HI and each input value is truncated to an integer.
50 Returns true if successful, false on input error. It is the caller's
51 responsibility to initialize FREQ_HASH and to free it when no longer
52 required, even on failure. */
54 create_freq_hash_with_range (const struct dictionary *dict,
55 struct casereader *input,
56 const struct variable *var,
57 double lo_, double hi_,
58 struct hmap *freq_hash)
60 struct freq **entries;
66 assert (var_is_numeric (var));
70 /* Populate the hash with zero entries */
71 entries = xnmalloc (hi - lo + 1, sizeof *entries);
72 for (i_d = lo; i_d <= hi; i_d += 1.0 )
74 size_t ofs = i_d - lo;
75 union value value = { i_d };
76 entries[ofs] = freq_hmap_insert (freq_hash, &value, 0,
77 value_hash (&value, 0, 0));
80 for (; (c = casereader_read (input)) != NULL; case_unref (c))
82 double x = trunc (case_num (c, var));
83 if (x >= lo && x <= hi)
86 struct freq *fr = entries[ofs];
87 fr->count += dict_get_case_weight (dict, c, &warn);
93 return casereader_destroy (input);
96 /* Adds frequency counts of each value of VAR in INPUT to FREQ_HASH. LO and HI
97 and each input value is truncated to an integer. Returns true if
98 successful, false on input error. It is the caller's responsibility to
99 initialize FREQ_HASH and to free it when no longer required, even on
102 create_freq_hash (const struct dictionary *dict,
103 struct casereader *input,
104 const struct variable *var,
105 struct hmap *freq_hash)
107 int width = var_get_width (var);
111 for (; (c = casereader_read (input)) != NULL; case_unref (c))
113 const union value *value = case_data (c, var);
114 size_t hash = value_hash (value, width, 0);
115 double weight = dict_get_case_weight (dict, c, &warn);
118 f = freq_hmap_search (freq_hash, value, width, hash);
120 f = freq_hmap_insert (freq_hash, value, width, hash);
125 return casereader_destroy (input);
128 static struct tab_table *
129 create_variable_frequency_table (const struct dictionary *dict,
130 struct casereader *input,
131 const struct chisquare_test *test,
132 int v, struct hmap *freq_hash)
136 const struct one_sample_test *ost = (const struct one_sample_test*)test;
138 struct tab_table *table ;
139 const struct variable *var = ost->vars[v];
141 const struct variable *wvar = dict_get_weight (dict);
142 const struct fmt_spec *wfmt = wvar ? var_get_print_format (wvar) : & F_8_0;
144 hmap_init (freq_hash);
145 if (!create_freq_hash (dict, input, var, freq_hash))
147 freq_hmap_destroy (freq_hash, var_get_width (var));
151 n_cells = hmap_count (freq_hash);
153 if ( test->n_expected > 0 && n_cells != test->n_expected )
155 msg(ME, _("CHISQUARE test specified %d expected values, but"
156 " %d distinct values were encountered in variable %s."),
157 test->n_expected, n_cells,
160 freq_hmap_destroy (freq_hash, var_get_width (var));
164 table = tab_create(4, n_cells + 2);
165 tab_set_format (table, RC_WEIGHT, wfmt);
167 tab_title (table, "%s", var_to_string(var));
168 tab_text (table, 1, 0, TAB_LEFT, _("Observed N"));
169 tab_text (table, 2, 0, TAB_LEFT, _("Expected N"));
170 tab_text (table, 3, 0, TAB_LEFT, _("Residual"));
172 tab_headers (table, 1, 0, 1, 0);
174 tab_box (table, TAL_1, TAL_1, -1, -1,
175 0, 0, tab_nc (table) - 1, tab_nr(table) - 1 );
177 tab_hline (table, TAL_1, 0, tab_nc(table) - 1, 1);
179 tab_vline (table, TAL_2, 1, 0, tab_nr(table) - 1);
180 for ( i = 2 ; i < 4 ; ++i )
181 tab_vline (table, TAL_1, i, 0, tab_nr(table) - 1);
184 tab_text (table, 0, tab_nr (table) - 1, TAB_LEFT, _("Total"));
190 static struct tab_table *
191 create_combo_frequency_table (const struct dictionary *dict, const struct chisquare_test *test)
194 const struct one_sample_test *ost = (const struct one_sample_test*)test;
196 struct tab_table *table ;
198 const struct variable *wvar = dict_get_weight (dict);
199 const struct fmt_spec *wfmt = wvar ? var_get_print_format (wvar) : & F_8_0;
201 int n_cells = test->hi - test->lo + 1;
203 table = tab_create(1 + ost->n_vars * 4, n_cells + 3);
204 tab_set_format (table, RC_WEIGHT, wfmt);
206 tab_title (table, _("Frequencies"));
207 for ( i = 0 ; i < ost->n_vars ; ++i )
209 const struct variable *var = ost->vars[i];
210 tab_text (table, i * 4 + 1, 1, TAB_LEFT, _("Category"));
211 tab_text (table, i * 4 + 2, 1, TAB_LEFT, _("Observed N"));
212 tab_text (table, i * 4 + 3, 1, TAB_LEFT, _("Expected N"));
213 tab_text (table, i * 4 + 4, 1, TAB_LEFT, _("Residual"));
215 tab_vline (table, TAL_2, i * 4 + 1,
216 0, tab_nr (table) - 1);
218 tab_vline (table, TAL_1, i * 4 + 2,
219 0, tab_nr (table) - 1);
221 tab_vline (table, TAL_1, i * 4 + 3,
222 1, tab_nr (table) - 1);
224 tab_vline (table, TAL_1, i * 4 + 4,
225 1, tab_nr (table) - 1);
228 tab_joint_text (table,
232 var_to_string (var));
235 for ( i = test->lo ; i <= test->hi ; ++i )
236 tab_double (table, 0, 2 + i - test->lo, TAB_LEFT, 1 + i - test->lo, NULL, RC_INTEGER);
238 tab_headers (table, 1, 0, 2, 0);
240 tab_box (table, TAL_1, TAL_1, -1, -1,
241 0, 0, tab_nc (table) - 1, tab_nr(table) - 1 );
243 tab_hline (table, TAL_1, 1, tab_nc(table) - 1, 1);
244 tab_hline (table, TAL_1, 0, tab_nc(table) - 1, 2);
246 tab_text (table, 0, tab_nr (table) - 1, TAB_LEFT, _("Total"));
252 static struct tab_table *
253 create_stats_table (const struct chisquare_test *test)
255 const struct one_sample_test *ost = (const struct one_sample_test*) test;
257 struct tab_table *table;
258 table = tab_create (1 + ost->n_vars, 4);
259 tab_title (table, _("Test Statistics"));
260 tab_headers (table, 1, 0, 1, 0);
262 tab_box (table, TAL_1, TAL_1, -1, -1,
263 0, 0, tab_nc(table) - 1, tab_nr(table) - 1 );
265 tab_box (table, -1, -1, -1, TAL_1,
266 1, 0, tab_nc(table) - 1, tab_nr(table) - 1 );
269 tab_vline (table, TAL_2, 1, 0, tab_nr (table) - 1);
270 tab_hline (table, TAL_1, 0, tab_nc (table) - 1, 1);
273 tab_text (table, 0, 1, TAB_LEFT, _("Chi-Square"));
274 tab_text (table, 0, 2, TAB_LEFT, _("df"));
275 tab_text (table, 0, 3, TAB_LEFT, _("Asymp. Sig."));
282 chisquare_execute (const struct dataset *ds,
283 struct casereader *input,
284 enum mv_class exclude,
285 const struct npar_test *test,
289 const struct dictionary *dict = dataset_dict (ds);
291 struct chisquare_test *cst = UP_CAST (test, struct chisquare_test,
293 struct one_sample_test *ost = &cst->parent;
295 double total_expected = 0.0;
297 double *df = xzalloc (sizeof (*df) * ost->n_vars);
298 double *xsq = xzalloc (sizeof (*df) * ost->n_vars);
301 for ( i = 0 ; i < cst->n_expected ; ++i )
302 total_expected += cst->expected[i];
304 if ( cst->ranged == false )
306 for ( v = 0 ; v < ost->n_vars ; ++v )
308 const struct variable *var = ost->vars[v];
309 double total_obs = 0.0;
310 struct hmap freq_hash;
311 struct casereader *reader =
312 casereader_create_filter_missing (casereader_clone (input),
315 struct tab_table *freq_table =
316 create_variable_frequency_table (dict, reader, cst, v, &freq_hash);
320 if ( NULL == freq_table )
322 ff = freq_hmap_sort (&freq_hash, var_get_width (var));
324 n_cells = hmap_count (&freq_hash);
326 for ( i = 0 ; i < n_cells ; ++i )
327 total_obs += ff[i]->count;
330 for ( i = 0 ; i < n_cells ; ++i )
334 const union value *observed_value = &ff[i]->values[0];
336 ds_init_empty (&str);
337 var_append_value_name (var, observed_value, &str);
340 tab_text (freq_table, 0, i + 1, TAB_LEFT, ds_cstr (&str));
345 tab_double (freq_table, 1, i + 1, TAB_NONE,
346 ff[i]->count, NULL, RC_WEIGHT);
348 if ( cst->n_expected > 0 )
349 exp = cst->expected[i] * total_obs / total_expected ;
351 exp = total_obs / (double) n_cells;
353 tab_double (freq_table, 2, i + 1, TAB_NONE,
354 exp, NULL, RC_OTHER);
357 tab_double (freq_table, 3, i + 1, TAB_NONE,
358 ff[i]->count - exp, NULL, RC_OTHER);
360 xsq[v] += (ff[i]->count - exp) * (ff[i]->count - exp) / exp;
363 df[v] = n_cells - 1.0;
365 tab_double (freq_table, 1, i + 1, TAB_NONE,
366 total_obs, NULL, RC_WEIGHT);
368 tab_submit (freq_table);
370 freq_hmap_destroy (&freq_hash, var_get_width (var));
374 else /* ranged == true */
376 struct tab_table *freq_table = create_combo_frequency_table (dict, cst);
378 n_cells = cst->hi - cst->lo + 1;
380 for ( v = 0 ; v < ost->n_vars ; ++v )
382 const struct variable *var = ost->vars[v];
383 double total_obs = 0.0;
384 struct casereader *reader =
385 casereader_create_filter_missing (casereader_clone (input),
388 struct hmap freq_hash;
391 hmap_init (&freq_hash);
392 if (!create_freq_hash_with_range (dict, reader, var,
393 cst->lo, cst->hi, &freq_hash))
395 freq_hmap_destroy (&freq_hash, var_get_width (var));
399 ff = freq_hmap_sort (&freq_hash, var_get_width (var));
401 for ( i = 0 ; i < hmap_count (&freq_hash) ; ++i )
402 total_obs += ff[i]->count;
405 for ( i = 0 ; i < hmap_count (&freq_hash) ; ++i )
410 const union value *observed_value = &ff[i]->values[0];
412 ds_init_empty (&str);
413 var_append_value_name (ost->vars[v], observed_value, &str);
415 tab_text (freq_table, v * 4 + 1, i + 2 , TAB_LEFT,
420 tab_double (freq_table, v * 4 + 2, i + 2 , TAB_NONE,
421 ff[i]->count, NULL, RC_WEIGHT);
423 if ( cst->n_expected > 0 )
424 exp = cst->expected[i] * total_obs / total_expected ;
426 exp = total_obs / (double) hmap_count (&freq_hash);
429 tab_double (freq_table, v * 4 + 3, i + 2 , TAB_NONE,
430 exp, NULL, RC_OTHER);
433 tab_double (freq_table, v * 4 + 4, i + 2 , TAB_NONE,
434 ff[i]->count - exp, NULL, RC_OTHER);
436 xsq[v] += (ff[i]->count - exp) * (ff[i]->count - exp) / exp;
440 tab_double (freq_table, v * 4 + 2, tab_nr (freq_table) - 1, TAB_NONE,
441 total_obs, NULL, RC_WEIGHT);
443 df[v] = n_cells - 1.0;
445 freq_hmap_destroy (&freq_hash, var_get_width (var));
449 tab_submit (freq_table);
451 ok = !taint_has_tainted_successor (casereader_get_taint (input));
452 casereader_destroy (input);
456 struct tab_table *stats_table = create_stats_table (cst);
458 /* Populate the summary statistics table */
459 for ( v = 0 ; v < ost->n_vars ; ++v )
461 const struct variable *var = ost->vars[v];
463 tab_text (stats_table, 1 + v, 0, TAB_CENTER, var_get_name (var));
465 tab_double (stats_table, 1 + v, 1, TAB_NONE, xsq[v], NULL, RC_OTHER);
466 tab_double (stats_table, 1 + v, 2, TAB_NONE, df[v], NULL, RC_INTEGER);
468 tab_double (stats_table, 1 + v, 3, TAB_NONE,
469 gsl_cdf_chisq_Q (xsq[v], df[v]), NULL, RC_PVALUE);
471 tab_submit (stats_table);