1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2006, 2007, 2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <language/stats/chisquare.h>
24 #include <data/format.h>
25 #include <data/case.h>
26 #include <data/casereader.h>
27 #include <data/dictionary.h>
28 #include <data/procedure.h>
29 #include <data/value-labels.h>
30 #include <data/variable.h>
31 #include <language/stats/freq.h>
32 #include <language/stats/npar.h>
33 #include <libpspp/assertion.h>
34 #include <libpspp/cast.h>
35 #include <libpspp/compiler.h>
36 #include <libpspp/hash.h>
37 #include <libpspp/message.h>
38 #include <libpspp/taint.h>
39 #include <output/table.h>
41 #include <gsl/gsl_cdf.h>
46 #define _(msgid) gettext (msgid)
48 /* Return a hash table containing the frequency counts of each
50 It is the caller's responsibility to free the hash table when
53 static struct hsh_table *
54 create_freq_hash_with_range (const struct dictionary *dict,
55 struct casereader *input,
56 const struct variable *var,
64 struct hsh_table *freq_hash =
65 hsh_create (4, compare_freq, hash_freq,
66 free_freq_mutable_hash,
69 /* Populate the hash with zero entries */
70 for (i_d = trunc (lo); i_d <= trunc (hi); i_d += 1.0 )
72 struct freq_mutable *fr = xmalloc (sizeof (*fr));
73 value_init (&fr->value, 0);
76 hsh_insert (freq_hash, fr);
79 for (; (c = casereader_read (input)) != NULL; case_unref (c))
81 struct freq_mutable fr;
82 fr.value.f = trunc (case_num (c, var));
83 if (fr.value.f >= lo && fr.value.f <= hi)
85 struct freq_mutable *existing_fr = hsh_force_find (freq_hash, &fr);
86 existing_fr->count += dict_get_case_weight (dict, c, &warn);
89 if (casereader_destroy (input))
93 hsh_destroy (freq_hash);
99 /* Return a hash table containing the frequency counts of each
100 value of VAR in INPUT .
101 It is the caller's responsibility to free the hash table when
104 static struct hsh_table *
105 create_freq_hash (const struct dictionary *dict,
106 struct casereader *input,
107 const struct variable *var)
109 int width = var_get_width (var);
113 struct hsh_table *freq_hash =
114 hsh_create (4, compare_freq, hash_freq,
115 free_freq_mutable_hash,
118 for (; (c = casereader_read (input)) != NULL; case_unref (c))
120 struct freq_mutable fr;
123 fr.value = *case_data (c, var);
124 fr.count = dict_get_case_weight (dict, c, &warn);
126 p = hsh_probe (freq_hash, &fr);
129 struct freq_mutable *new_fr = *p = xmalloc (sizeof *new_fr);
130 value_init (&new_fr->value, width);
131 value_copy (&new_fr->value, &fr.value, width);
132 new_fr->count = fr.count;
136 struct freq *existing_fr = *p;
137 existing_fr->count += fr.count;
140 if (casereader_destroy (input))
144 hsh_destroy (freq_hash);
151 static struct tab_table *
152 create_variable_frequency_table (const struct dictionary *dict,
153 struct casereader *input,
154 const struct chisquare_test *test,
156 struct hsh_table **freq_hash)
160 const struct one_sample_test *ost = (const struct one_sample_test*)test;
162 struct tab_table *table ;
163 const struct variable *var = ost->vars[v];
165 *freq_hash = create_freq_hash (dict, input, var);
166 if (*freq_hash == NULL)
169 n_cells = hsh_count (*freq_hash);
171 if ( test->n_expected > 0 && n_cells != test->n_expected )
173 msg(ME, _("CHISQUARE test specified %d expected values, but"
174 " %d distinct values were encountered in variable %s."),
175 test->n_expected, n_cells,
178 hsh_destroy (*freq_hash);
183 table = tab_create(4, n_cells + 2);
184 tab_dim (table, tab_natural_dimensions, NULL, NULL);
186 tab_title (table, var_to_string(var));
187 tab_text (table, 1, 0, TAB_LEFT, _("Observed N"));
188 tab_text (table, 2, 0, TAB_LEFT, _("Expected N"));
189 tab_text (table, 3, 0, TAB_LEFT, _("Residual"));
191 tab_headers (table, 1, 0, 1, 0);
193 tab_box (table, TAL_1, TAL_1, -1, -1,
194 0, 0, tab_nc (table) - 1, tab_nr(table) - 1 );
196 tab_hline (table, TAL_1, 0, tab_nc(table) - 1, 1);
198 tab_vline (table, TAL_2, 1, 0, tab_nr(table) - 1);
199 for ( i = 2 ; i < 4 ; ++i )
200 tab_vline (table, TAL_1, i, 0, tab_nr(table) - 1);
203 tab_text (table, 0, tab_nr (table) - 1, TAB_LEFT, _("Total"));
209 static struct tab_table *
210 create_combo_frequency_table (const struct chisquare_test *test)
213 const struct one_sample_test *ost = (const struct one_sample_test*)test;
215 struct tab_table *table ;
217 int n_cells = test->hi - test->lo + 1;
219 table = tab_create(1 + ost->n_vars * 4, n_cells + 3);
220 tab_dim (table, tab_natural_dimensions, NULL, NULL);
222 tab_title (table, _("Frequencies"));
223 for ( i = 0 ; i < ost->n_vars ; ++i )
225 const struct variable *var = ost->vars[i];
226 tab_text (table, i * 4 + 1, 1, TAB_LEFT, _("Category"));
227 tab_text (table, i * 4 + 2, 1, TAB_LEFT, _("Observed N"));
228 tab_text (table, i * 4 + 3, 1, TAB_LEFT, _("Expected N"));
229 tab_text (table, i * 4 + 4, 1, TAB_LEFT, _("Residual"));
231 tab_vline (table, TAL_2, i * 4 + 1,
232 0, tab_nr (table) - 1);
234 tab_vline (table, TAL_1, i * 4 + 2,
235 0, tab_nr (table) - 1);
237 tab_vline (table, TAL_1, i * 4 + 3,
238 1, tab_nr (table) - 1);
240 tab_vline (table, TAL_1, i * 4 + 4,
241 1, tab_nr (table) - 1);
244 tab_joint_text (table,
248 var_to_string (var));
251 for ( i = test->lo ; i <= test->hi ; ++i )
252 tab_fixed (table, 0, 2 + i - test->lo,
253 TAB_LEFT, 1 + i - test->lo, 8, 0);
255 tab_headers (table, 1, 0, 2, 0);
257 tab_box (table, TAL_1, TAL_1, -1, -1,
258 0, 0, tab_nc (table) - 1, tab_nr(table) - 1 );
260 tab_hline (table, TAL_1, 1, tab_nc(table) - 1, 1);
261 tab_hline (table, TAL_1, 0, tab_nc(table) - 1, 2);
263 tab_text (table, 0, tab_nr (table) - 1, TAB_LEFT, _("Total"));
269 static struct tab_table *
270 create_stats_table (const struct chisquare_test *test)
272 const struct one_sample_test *ost = (const struct one_sample_test*) test;
274 struct tab_table *table;
275 table = tab_create (1 + ost->n_vars, 4);
276 tab_dim (table, tab_natural_dimensions, NULL, NULL);
277 tab_title (table, _("Test Statistics"));
278 tab_headers (table, 1, 0, 1, 0);
280 tab_box (table, TAL_1, TAL_1, -1, -1,
281 0, 0, tab_nc(table) - 1, tab_nr(table) - 1 );
283 tab_box (table, -1, -1, -1, TAL_1,
284 1, 0, tab_nc(table) - 1, tab_nr(table) - 1 );
287 tab_vline (table, TAL_2, 1, 0, tab_nr (table) - 1);
288 tab_hline (table, TAL_1, 0, tab_nc (table) - 1, 1);
291 tab_text (table, 0, 1, TAB_LEFT, _("Chi-Square"));
292 tab_text (table, 0, 2, TAB_LEFT, _("df"));
293 tab_text (table, 0, 3, TAB_LEFT, _("Asymp. Sig."));
300 chisquare_execute (const struct dataset *ds,
301 struct casereader *input,
302 enum mv_class exclude,
303 const struct npar_test *test,
307 const struct dictionary *dict = dataset_dict (ds);
309 struct chisquare_test *cst = UP_CAST (test, struct chisquare_test,
311 struct one_sample_test *ost = &cst->parent;
313 double total_expected = 0.0;
314 const struct variable *wvar = dict_get_weight (dict);
315 const struct fmt_spec *wfmt = wvar ?
316 var_get_print_format (wvar) : & F_8_0;
318 double *df = xzalloc (sizeof (*df) * ost->n_vars);
319 double *xsq = xzalloc (sizeof (*df) * ost->n_vars);
322 for ( i = 0 ; i < cst->n_expected ; ++i )
323 total_expected += cst->expected[i];
325 if ( cst->ranged == false )
327 for ( v = 0 ; v < ost->n_vars ; ++v )
329 double total_obs = 0.0;
330 struct hsh_table *freq_hash = NULL;
331 struct casereader *reader =
332 casereader_create_filter_missing (casereader_clone (input),
333 &ost->vars[v], 1, exclude,
335 struct tab_table *freq_table =
336 create_variable_frequency_table(dict, reader, cst, v, &freq_hash);
340 if ( NULL == freq_table )
342 ff = (struct freq **) hsh_sort (freq_hash);
344 n_cells = hsh_count (freq_hash);
346 for ( i = 0 ; i < n_cells ; ++i )
347 total_obs += ff[i]->count;
350 for ( i = 0 ; i < n_cells ; ++i )
354 const union value *observed_value = &ff[i]->value;
356 ds_init_empty (&str);
357 var_append_value_name (ost->vars[v], observed_value, &str);
360 tab_text (freq_table, 0, i + 1, TAB_LEFT, ds_cstr (&str));
365 tab_double (freq_table, 1, i + 1, TAB_NONE,
368 if ( cst->n_expected > 0 )
369 exp = cst->expected[i] * total_obs / total_expected ;
371 exp = total_obs / (double) n_cells;
373 tab_double (freq_table, 2, i + 1, TAB_NONE,
377 tab_double (freq_table, 3, i + 1, TAB_NONE,
378 ff[i]->count - exp, NULL);
380 xsq[v] += (ff[i]->count - exp) * (ff[i]->count - exp) / exp;
383 df[v] = n_cells - 1.0;
385 tab_double (freq_table, 1, i + 1, TAB_NONE,
388 tab_submit (freq_table);
390 hsh_destroy (freq_hash);
393 else /* ranged == true */
395 struct tab_table *freq_table = create_combo_frequency_table (cst);
397 n_cells = cst->hi - cst->lo + 1;
399 for ( v = 0 ; v < ost->n_vars ; ++v )
401 double total_obs = 0.0;
402 struct casereader *reader =
403 casereader_create_filter_missing (casereader_clone (input),
404 &ost->vars[v], 1, exclude,
406 struct hsh_table *freq_hash =
407 create_freq_hash_with_range (dict, reader,
408 ost->vars[v], cst->lo, cst->hi);
412 if (freq_hash == NULL)
415 ff = (struct freq **) hsh_sort (freq_hash);
416 assert ( n_cells == hsh_count (freq_hash));
418 for ( i = 0 ; i < hsh_count (freq_hash) ; ++i )
419 total_obs += ff[i]->count;
422 for ( i = 0 ; i < hsh_count (freq_hash) ; ++i )
427 const union value *observed_value = &ff[i]->value;
429 ds_init_empty (&str);
430 var_append_value_name (ost->vars[v], observed_value, &str);
432 tab_text (freq_table, v * 4 + 1, i + 2 , TAB_LEFT,
437 tab_double (freq_table, v * 4 + 2, i + 2 , TAB_NONE,
440 if ( cst->n_expected > 0 )
441 exp = cst->expected[i] * total_obs / total_expected ;
443 exp = total_obs / (double) hsh_count (freq_hash);
446 tab_double (freq_table, v * 4 + 3, i + 2 , TAB_NONE,
450 tab_double (freq_table, v * 4 + 4, i + 2 , TAB_NONE,
451 ff[i]->count - exp, NULL);
453 xsq[v] += (ff[i]->count - exp) * (ff[i]->count - exp) / exp;
457 tab_double (freq_table, v * 4 + 2, tab_nr (freq_table) - 1, TAB_NONE,
460 df[v] = n_cells - 1.0;
462 hsh_destroy (freq_hash);
465 tab_submit (freq_table);
467 ok = !taint_has_tainted_successor (casereader_get_taint (input));
468 casereader_destroy (input);
472 struct tab_table *stats_table = create_stats_table (cst);
474 /* Populate the summary statistics table */
475 for ( v = 0 ; v < ost->n_vars ; ++v )
477 const struct variable *var = ost->vars[v];
479 tab_text (stats_table, 1 + v, 0, TAB_CENTER, var_get_name (var));
481 tab_double (stats_table, 1 + v, 1, TAB_NONE, xsq[v], NULL);
482 tab_fixed (stats_table, 1 + v, 2, TAB_NONE, df[v], 8, 0);
484 tab_double (stats_table, 1 + v, 3, TAB_NONE,
485 gsl_cdf_chisq_Q (xsq[v], df[v]), NULL);
487 tab_submit (stats_table);