1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2006, 2007 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <language/stats/chisquare.h>
24 #include <data/case.h>
25 #include <data/casereader.h>
26 #include <data/dictionary.h>
27 #include <data/procedure.h>
28 #include <data/value-labels.h>
29 #include <data/variable.h>
30 #include <language/stats/freq.h>
31 #include <language/stats/npar.h>
32 #include <libpspp/assertion.h>
33 #include <libpspp/compiler.h>
34 #include <libpspp/hash.h>
35 #include <libpspp/message.h>
36 #include <libpspp/taint.h>
37 #include <output/table.h>
39 #include <gsl/gsl_cdf.h>
44 #define _(msgid) gettext (msgid)
46 /* Return a hash table containing the frequency counts of each
48 It is the caller's responsibility to free the hash table when
51 static struct hsh_table *
52 create_freq_hash_with_range (const struct dictionary *dict,
53 struct casereader *input,
54 const struct variable *var,
62 struct hsh_table *freq_hash =
63 hsh_create (4, compare_freq, hash_freq,
64 free_freq_mutable_hash,
67 /* Populate the hash with zero entries */
68 for (i_d = trunc (lo); i_d <= trunc (hi); i_d += 1.0 )
70 union value the_value;
71 struct freq_mutable *fr = xmalloc (sizeof (*fr));
75 fr->value = value_dup (&the_value, 0);
78 hsh_insert (freq_hash, fr);
81 while (casereader_read (input, &c))
83 union value obs_value;
84 struct freq **existing_fr;
85 struct freq *fr = xmalloc(sizeof (*fr));
86 fr->value = case_data (&c, var);
88 fr->count = dict_get_case_weight (dict, &c, &warn);
90 obs_value.f = trunc (fr->value->f);
92 if ( obs_value.f < lo || obs_value.f > hi)
99 fr->value = &obs_value;
101 existing_fr = (struct freq **) hsh_probe (freq_hash, fr);
103 /* This must exist in the hash, because we previously populated it
105 assert (*existing_fr);
107 (*existing_fr)->count += fr->count;
112 if (casereader_destroy (input))
116 hsh_destroy (freq_hash);
122 /* Return a hash table containing the frequency counts of each
123 value of VAR in INPUT .
124 It is the caller's responsibility to free the hash table when
127 static struct hsh_table *
128 create_freq_hash (const struct dictionary *dict,
129 struct casereader *input,
130 const struct variable *var)
135 struct hsh_table *freq_hash =
136 hsh_create (4, compare_freq, hash_freq,
137 free_freq_mutable_hash,
140 for (; casereader_read (input, &c); case_destroy (&c))
142 struct freq **existing_fr;
143 struct freq *fr = xmalloc(sizeof (*fr));
144 fr->value = case_data (&c, var);
146 fr->count = dict_get_case_weight (dict, &c, &warn);
148 existing_fr = (struct freq **) hsh_probe (freq_hash, fr);
151 (*existing_fr)->count += fr->count;
157 fr->value = value_dup (fr->value, var_get_width (var));
160 if (casereader_destroy (input))
164 hsh_destroy (freq_hash);
171 static struct tab_table *
172 create_variable_frequency_table (const struct dictionary *dict,
173 struct casereader *input,
174 const struct chisquare_test *test,
176 struct hsh_table **freq_hash)
180 const struct one_sample_test *ost = (const struct one_sample_test*)test;
182 struct tab_table *table ;
183 const struct variable *var = ost->vars[v];
185 *freq_hash = create_freq_hash (dict, input, var);
186 if (*freq_hash == NULL)
189 n_cells = hsh_count (*freq_hash);
191 if ( test->n_expected > 0 && n_cells != test->n_expected )
193 msg(ME, _("CHISQUARE test specified %d expected values, but"
194 " %d distinct values were encountered in variable %s."),
195 test->n_expected, n_cells,
198 hsh_destroy (*freq_hash);
203 table = tab_create(4, n_cells + 2, 0);
204 tab_dim (table, tab_natural_dimensions);
206 tab_title (table, var_to_string(var));
207 tab_text (table, 1, 0, TAB_LEFT, _("Observed N"));
208 tab_text (table, 2, 0, TAB_LEFT, _("Expected N"));
209 tab_text (table, 3, 0, TAB_LEFT, _("Residual"));
211 tab_headers (table, 1, 0, 1, 0);
213 tab_box (table, TAL_1, TAL_1, -1, -1,
214 0, 0, table->nc - 1, tab_nr(table) - 1 );
216 tab_hline (table, TAL_1, 0, tab_nc(table) - 1, 1);
218 tab_vline (table, TAL_2, 1, 0, tab_nr(table) - 1);
219 for ( i = 2 ; i < 4 ; ++i )
220 tab_vline (table, TAL_1, i, 0, tab_nr(table) - 1);
223 tab_text (table, 0, table->nr - 1, TAB_LEFT, _("Total"));
229 static struct tab_table *
230 create_combo_frequency_table (const struct chisquare_test *test)
233 const struct one_sample_test *ost = (const struct one_sample_test*)test;
235 struct tab_table *table ;
237 int n_cells = test->hi - test->lo + 1;
239 table = tab_create(1 + ost->n_vars * 4, n_cells + 3, 0);
240 tab_dim (table, tab_natural_dimensions);
242 tab_title (table, _("Frequencies"));
243 for ( i = 0 ; i < ost->n_vars ; ++i )
245 const struct variable *var = ost->vars[i];
246 tab_text (table, i * 4 + 1, 1, TAB_LEFT, _("Category"));
247 tab_text (table, i * 4 + 2, 1, TAB_LEFT, _("Observed N"));
248 tab_text (table, i * 4 + 3, 1, TAB_LEFT, _("Expected N"));
249 tab_text (table, i * 4 + 4, 1, TAB_LEFT, _("Residual"));
251 tab_vline (table, TAL_2, i * 4 + 1,
252 0, tab_nr (table) - 1);
254 tab_vline (table, TAL_1, i * 4 + 2,
255 0, tab_nr (table) - 1);
257 tab_vline (table, TAL_1, i * 4 + 3,
258 1, tab_nr (table) - 1);
260 tab_vline (table, TAL_1, i * 4 + 4,
261 1, tab_nr (table) - 1);
264 tab_joint_text (table,
268 var_to_string (var));
271 for ( i = test->lo ; i <= test->hi ; ++i )
272 tab_float (table, 0, 2 + i - test->lo,
273 TAB_LEFT, 1 + i - test->lo, 8, 0);
275 tab_headers (table, 1, 0, 2, 0);
277 tab_box (table, TAL_1, TAL_1, -1, -1,
278 0, 0, table->nc - 1, tab_nr(table) - 1 );
280 tab_hline (table, TAL_1, 1, tab_nc(table) - 1, 1);
281 tab_hline (table, TAL_1, 0, tab_nc(table) - 1, 2);
283 tab_text (table, 0, table->nr - 1, TAB_LEFT, _("Total"));
289 static struct tab_table *
290 create_stats_table (const struct chisquare_test *test)
292 const struct one_sample_test *ost = (const struct one_sample_test*) test;
294 struct tab_table *table;
295 table = tab_create (1 + ost->n_vars, 4, 0);
296 tab_dim (table, tab_natural_dimensions);
297 tab_title (table, _("Test Statistics"));
298 tab_headers (table, 1, 0, 1, 0);
300 tab_box (table, TAL_1, TAL_1, -1, -1,
301 0, 0, tab_nc(table) - 1, tab_nr(table) - 1 );
303 tab_box (table, -1, -1, -1, TAL_1,
304 1, 0, tab_nc(table) - 1, tab_nr(table) - 1 );
307 tab_vline (table, TAL_2, 1, 0, tab_nr (table) - 1);
308 tab_hline (table, TAL_1, 0, tab_nc (table) - 1, 1);
311 tab_text (table, 0, 1, TAB_LEFT, _("Chi-Square"));
312 tab_text (table, 0, 2, TAB_LEFT, _("df"));
313 tab_text (table, 0, 3, TAB_LEFT, _("Asymp. Sig."));
320 chisquare_execute (const struct dataset *ds,
321 struct casereader *input,
322 enum mv_class exclude,
323 const struct npar_test *test)
325 const struct dictionary *dict = dataset_dict (ds);
327 struct one_sample_test *ost = (struct one_sample_test *) test;
328 struct chisquare_test *cst = (struct chisquare_test *) test;
330 double total_expected = 0.0;
332 double *df = xzalloc (sizeof (*df) * ost->n_vars);
333 double *xsq = xzalloc (sizeof (*df) * ost->n_vars);
336 for ( i = 0 ; i < cst->n_expected ; ++i )
337 total_expected += cst->expected[i];
339 if ( cst->ranged == false )
341 for ( v = 0 ; v < ost->n_vars ; ++v )
343 double total_obs = 0.0;
344 struct hsh_table *freq_hash = NULL;
345 struct casereader *reader =
346 casereader_create_filter_missing (casereader_clone (input),
347 &ost->vars[v], 1, exclude, NULL);
348 struct tab_table *freq_table =
349 create_variable_frequency_table(dict, reader, cst, v, &freq_hash);
353 if ( NULL == freq_table )
355 ff = (struct freq **) hsh_sort (freq_hash);
357 n_cells = hsh_count (freq_hash);
359 for ( i = 0 ; i < n_cells ; ++i )
360 total_obs += ff[i]->count;
363 for ( i = 0 ; i < n_cells ; ++i )
367 const union value *observed_value = ff[i]->value;
369 ds_init_empty (&str);
370 var_append_value_name (ost->vars[v], observed_value, &str);
373 tab_text (freq_table, 0, i + 1, TAB_LEFT, ds_cstr (&str));
378 tab_float (freq_table, 1, i + 1, TAB_NONE,
381 if ( cst->n_expected > 0 )
382 exp = cst->expected[i] * total_obs / total_expected ;
384 exp = total_obs / (double) n_cells;
386 tab_float (freq_table, 2, i + 1, TAB_NONE,
390 tab_float (freq_table, 3, i + 1, TAB_NONE,
391 ff[i]->count - exp, 8, 2);
393 xsq[v] += (ff[i]->count - exp) * (ff[i]->count - exp) / exp;
396 df[v] = n_cells - 1.0;
398 tab_float (freq_table, 1, i + 1, TAB_NONE,
401 tab_submit (freq_table);
403 hsh_destroy (freq_hash);
406 else /* ranged == true */
408 struct tab_table *freq_table = create_combo_frequency_table (cst);
410 n_cells = cst->hi - cst->lo + 1;
412 for ( v = 0 ; v < ost->n_vars ; ++v )
414 double total_obs = 0.0;
415 struct casereader *reader =
416 casereader_create_filter_missing (casereader_clone (input),
417 &ost->vars[v], 1, exclude, NULL);
418 struct hsh_table *freq_hash =
419 create_freq_hash_with_range (dict, reader,
420 ost->vars[v], cst->lo, cst->hi);
424 if (freq_hash == NULL)
427 ff = (struct freq **) hsh_sort (freq_hash);
428 assert ( n_cells == hsh_count (freq_hash));
430 for ( i = 0 ; i < hsh_count (freq_hash) ; ++i )
431 total_obs += ff[i]->count;
434 for ( i = 0 ; i < hsh_count (freq_hash) ; ++i )
439 const union value *observed_value = ff[i]->value;
441 ds_init_empty (&str);
442 var_append_value_name (ost->vars[v], observed_value, &str);
444 tab_text (freq_table, v * 4 + 1, i + 2 , TAB_LEFT,
449 tab_float (freq_table, v * 4 + 2, i + 2 , TAB_NONE,
452 if ( cst->n_expected > 0 )
453 exp = cst->expected[i] * total_obs / total_expected ;
455 exp = total_obs / (double) hsh_count (freq_hash);
458 tab_float (freq_table, v * 4 + 3, i + 2 , TAB_NONE,
462 tab_float (freq_table, v * 4 + 4, i + 2 , TAB_NONE,
463 ff[i]->count - exp, 8, 2);
465 xsq[v] += (ff[i]->count - exp) * (ff[i]->count - exp) / exp;
469 tab_float (freq_table, v * 4 + 2, tab_nr (freq_table) - 1, TAB_NONE,
472 df[v] = n_cells - 1.0;
474 hsh_destroy (freq_hash);
477 tab_submit (freq_table);
479 ok = !taint_has_tainted_successor (casereader_get_taint (input));
480 casereader_destroy (input);
484 struct tab_table *stats_table = create_stats_table (cst);
486 /* Populate the summary statistics table */
487 for ( v = 0 ; v < ost->n_vars ; ++v )
489 const struct variable *var = ost->vars[v];
491 tab_text (stats_table, 1 + v, 0, TAB_CENTER, var_get_name (var));
493 tab_float (stats_table, 1 + v, 1, TAB_NONE, xsq[v], 8,3);
494 tab_float (stats_table, 1 + v, 2, TAB_NONE, df[v], 8,0);
496 tab_float (stats_table, 1 + v, 3, TAB_NONE,
497 gsl_cdf_chisq_Q (xsq[v], df[v]), 8,3);
499 tab_submit (stats_table);