1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2006, 2007 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <language/stats/chisquare.h>
24 #include <data/case.h>
25 #include <data/casereader.h>
26 #include <data/dictionary.h>
27 #include <data/procedure.h>
28 #include <data/value-labels.h>
29 #include <data/variable.h>
30 #include <language/stats/freq.h>
31 #include <language/stats/npar.h>
32 #include <libpspp/alloc.h>
33 #include <libpspp/assertion.h>
34 #include <libpspp/compiler.h>
35 #include <libpspp/hash.h>
36 #include <libpspp/message.h>
37 #include <libpspp/taint.h>
38 #include <output/table.h>
40 #include <gsl/gsl_cdf.h>
// Translation shorthand: _("text") expands to gettext ("text").
43 #define _(msgid) gettext (msgid)
45 /* Return a hash table containing the frequency counts of each value of VAR in INPUT that falls between LO and HI.
47 It is the caller's responsibility to free the hash table when it is no longer required. */
// Build the frequency table of VAR for a range of whole-number categories.
//
// The table is first seeded with one entry for every whole number from
// trunc (lo) up to trunc (hi).  Each case read from INPUT then has its value
// of VAR truncated, and the case weight reported by dict_get_case_weight is
// added to the entry with that key.  INPUT is destroyed before returning.
//
// NOTE(review): this view of the function is incomplete -- the rest of the
// parameter list (lo and hi are used below), the local declarations, the
// body of the out-of-range branch and the return statements are not visible.
50 static struct hsh_table *
51 create_freq_hash_with_range (const struct dictionary *dict,
52 struct casereader *input,
53 const struct variable *var,
// Table of frequency records compared and hashed with compare_freq and
// hash_freq.  Presumably free_freq_mutable_hash releases the entries when
// the table is destroyed -- confirm against the hash table API.
61 struct hsh_table *freq_hash =
62 hsh_create (4, compare_freq, hash_freq,
63 free_freq_mutable_hash,
// i_d walks the whole numbers trunc (lo), trunc (lo) + 1, ..., trunc (hi).
66 /* Populate the hash with zero entries */
67 for (i_d = trunc (lo); i_d <= trunc (hi); i_d += 1.0 )
69 union value the_value;
70 struct freq_mutable *fr = xmalloc (sizeof (*fr));
// The entry owns a private copy of its key (width 0 is passed to value_dup;
// presumably that denotes a numeric value -- confirm).
74 fr->value = value_dup (&the_value, 0);
77 hsh_insert (freq_hash, fr);
// Consume every case supplied by INPUT.
80 while (casereader_read (input, &c))
82 union value obs_value;
83 struct freq **existing_fr;
// fr is a temporary record used as the lookup key for hsh_probe.
84 struct freq *fr = xmalloc(sizeof (*fr));
85 fr->value = case_data (&c, var);
// Weight of this case; warn is passed by address to dict_get_case_weight.
87 fr->count = dict_get_case_weight (dict, &c, &warn);
// Truncate the observation so that it matches the whole-number keys above.
89 obs_value.f = trunc (fr->value->f);
// Observations outside the requested range are singled out here.
// NOTE(review): the truncated observation is compared with the untruncated
// lo and hi, while the seeding loop used trunc (lo).  If lo could ever be
// fractional, the trunc (lo) entry could never be matched -- confirm that
// callers only pass whole numbers.
91 if ( obs_value.f < lo || obs_value.f > hi)
// Key the lookup on the truncated local copy instead of the case data.
98 fr->value = &obs_value;
100 existing_fr = (struct freq **) hsh_probe (freq_hash, fr);
// The assert below relies on the seeding loop having inserted an entry for
// every in-range whole number.
102 /* This must exist in the hash, because we previously populated it
104 assert (*existing_fr);
// Accumulate the weight of this case into the seeded entry.
106 (*existing_fr)->count += fr->count;
// Destroy INPUT and test the result; the table is destroyed on what is
// presumably the failure path (the else and the returns are not visible).
111 if (casereader_destroy (input))
115 hsh_destroy (freq_hash);
121 /* Return a hash table containing the frequency counts of each
122 value of VAR in INPUT.
123 It is the caller's responsibility to free the hash table when it is no longer required. */
// Build the frequency table of VAR over every case read from INPUT.
//
// Each case contributes the weight reported by dict_get_case_weight to the
// entry for its value of VAR.  INPUT is destroyed before returning.
//
// NOTE(review): this view of the function is incomplete -- the local
// declarations, the branch structure inside the loop and the return
// statements are not visible.
126 static struct hsh_table *
127 create_freq_hash (const struct dictionary *dict,
128 struct casereader *input,
129 const struct variable *var)
// Table of frequency records compared and hashed with compare_freq and
// hash_freq.  Presumably free_freq_mutable_hash releases the entries when
// the table is destroyed -- confirm against the hash table API.
134 struct hsh_table *freq_hash =
135 hsh_create (4, compare_freq, hash_freq,
136 free_freq_mutable_hash,
// Read every case; case_destroy runs on the case at the end of each pass.
139 for (; casereader_read (input, &c); case_destroy (&c))
141 struct freq **existing_fr;
// Candidate record for this case: its value of VAR and its weight.
142 struct freq *fr = xmalloc(sizeof (*fr));
143 fr->value = case_data (&c, var);
145 fr->count = dict_get_case_weight (dict, &c, &warn);
// Look the candidate up in the table.
147 existing_fr = (struct freq **) hsh_probe (freq_hash, fr);
// NOTE(review): the condition separating the next two statements is not
// visible.  Presumably the count is accumulated when the value is already
// present, and otherwise the value is copied with the variable's width (so
// that it no longer points into the case) before the candidate is stored --
// confirm, and confirm that the candidate is freed in the first situation.
150 (*existing_fr)->count += fr->count;
156 fr->value = value_dup (fr->value, var_get_width (var));
// Destroy INPUT and test the result; the table is destroyed on what is
// presumably the failure path (the else and the returns are not visible).
159 if (casereader_destroy (input))
163 hsh_destroy (freq_hash);
// Create the frequency output table for a single variable.
//
// The frequency counts of ost->vars[v] are gathered with create_freq_hash
// and stored through FREQ_HASH.  The output table has four columns and
// n_cells + 2 rows, where n_cells is the number of entries in that hash:
// row 0 carries the column headings and the last row is labelled "Total".
// The visible code draws only this fixed text and the rules.
//
// NOTE(review): this view of the function is incomplete -- one parameter
// (v is used below), the local declarations, the bodies of the early-exit
// branches and the return statement are not visible.
170 static struct tab_table *
171 create_variable_frequency_table (const struct dictionary *dict,
172 struct casereader *input,
173 const struct chisquare_test *test,
175 struct hsh_table **freq_hash)
// A chisquare_test is also viewed as a one_sample_test to reach vars.
179 const struct one_sample_test *ost = (const struct one_sample_test*)test;
181 struct tab_table *table ;
182 const struct variable *var = ost->vars[v];
// Count the values of the variable; INPUT is handed to create_freq_hash.
184 *freq_hash = create_freq_hash (dict, input, var);
// No table was produced (the handling of this case is not visible here).
185 if (*freq_hash == NULL)
188 n_cells = hsh_count (*freq_hash);
// When expected values were supplied, their number has to equal the number
// of distinct observed values; otherwise an error message is issued and the
// frequency hash is destroyed.
190 if ( test->n_expected > 0 && n_cells != test->n_expected )
192 msg(ME, _("CHISQUARE test specified %d expected values, but"
193 " %d distinct values were encountered in variable %s."),
194 test->n_expected, n_cells,
197 hsh_destroy (*freq_hash);
// Four columns; n_cells rows of data plus two further rows.
202 table = tab_create(4, n_cells + 2, 0);
203 tab_dim (table, tab_natural_dimensions);
// Title and the headings of columns 1 to 3, all in row 0.
205 tab_title (table, var_to_string(var));
206 tab_text (table, 1, 0, TAB_LEFT, _("Observed N"));
207 tab_text (table, 2, 0, TAB_LEFT, _("Expected N"));
208 tab_text (table, 3, 0, TAB_LEFT, _("Residual"));
// Presumably declares one header column on the left and one header row at
// the top -- confirm against the tab_headers prototype.
210 tab_headers (table, 1, 0, 1, 0);
// Box around the whole table.
// NOTE(review): table->nc and table->nr are read directly here and below,
// while neighbouring calls use the tab_nc () and tab_nr () accessors.
212 tab_box (table, TAL_1, TAL_1, -1, -1,
213 0, 0, table->nc - 1, tab_nr(table) - 1 );
215 tab_hline (table, TAL_1, 0, tab_nc(table) - 1, 1);
// Vertical rules: style TAL_2 at column 1, style TAL_1 at columns 2 and 3.
217 tab_vline (table, TAL_2, 1, 0, tab_nr(table) - 1);
218 for ( i = 2 ; i < 4 ; ++i )
219 tab_vline (table, TAL_1, i, 0, tab_nr(table) - 1);
// Label of the final row.
222 tab_text (table, 0, table->nr - 1, TAB_LEFT, _("Total"));
// Create the combined frequency output table shared by all variables.
//
// The number of categories is n_cells = test->hi - test->lo + 1.  The table
// has one leading column plus a group of four columns per variable, and
// n_cells + 3 rows: row 1 carries the column headings, the category rows
// start at row 2 and the last row is labelled "Total".  The visible code
// draws only this fixed text and the rules.
//
// NOTE(review): this view of the function is incomplete -- the local
// declarations, some arguments of tab_joint_text and the return statement
// are not visible.
228 static struct tab_table *
229 create_combo_frequency_table (const struct chisquare_test *test)
// A chisquare_test is also viewed as a one_sample_test to reach vars.
232 const struct one_sample_test *ost = (const struct one_sample_test*)test;
234 struct tab_table *table ;
236 int n_cells = test->hi - test->lo + 1;
238 table = tab_create(1 + ost->n_vars * 4, n_cells + 3, 0);
239 tab_dim (table, tab_natural_dimensions);
241 tab_title (table, _("Frequencies"));
// Variable i occupies columns i * 4 + 1 to i * 4 + 4.
242 for ( i = 0 ; i < ost->n_vars ; ++i )
244 const struct variable *var = ost->vars[i];
// The four column headings of this group, in row 1.
245 tab_text (table, i * 4 + 1, 1, TAB_LEFT, _("Category"));
246 tab_text (table, i * 4 + 2, 1, TAB_LEFT, _("Observed N"));
247 tab_text (table, i * 4 + 3, 1, TAB_LEFT, _("Expected N"));
248 tab_text (table, i * 4 + 4, 1, TAB_LEFT, _("Residual"));
// The rules at the first two columns of the group start at row 0, those at
// the last two columns start at row 1.
250 tab_vline (table, TAL_2, i * 4 + 1,
251 0, tab_nr (table) - 1);
253 tab_vline (table, TAL_1, i * 4 + 2,
254 0, tab_nr (table) - 1);
256 tab_vline (table, TAL_1, i * 4 + 3,
257 1, tab_nr (table) - 1);
259 tab_vline (table, TAL_1, i * 4 + 4,
260 1, tab_nr (table) - 1);
// Text derived from the variable via var_to_string; the cell coordinates
// passed to tab_joint_text are not visible in this view.
263 tab_joint_text (table,
267 var_to_string (var));
// Column 0 of every category row receives the 1-based category index
// (1 + i - lo) rather than the raw value i.
270 for ( i = test->lo ; i <= test->hi ; ++i )
271 tab_float (table, 0, 2 + i - test->lo,
272 TAB_LEFT, 1 + i - test->lo, 8, 0);
// Presumably declares one header column and two header rows -- confirm
// against the tab_headers prototype.
274 tab_headers (table, 1, 0, 2, 0);
// NOTE(review): table->nc and table->nr are read directly here and below,
// while neighbouring calls use the tab_nc () and tab_nr () accessors.
276 tab_box (table, TAL_1, TAL_1, -1, -1,
277 0, 0, table->nc - 1, tab_nr(table) - 1 );
// The rule at row 1 starts at column 1; the rule at row 2 starts at column 0.
279 tab_hline (table, TAL_1, 1, tab_nc(table) - 1, 1);
280 tab_hline (table, TAL_1, 0, tab_nc(table) - 1, 2);
// Label of the final row.
282 tab_text (table, 0, table->nr - 1, TAB_LEFT, _("Total"));
// Create the "Test Statistics" output table.
//
// The table has one label column plus one column per variable, and four
// rows.  Rows 1 to 3 of column 0 are labelled "Chi-Square", "df" and
// "Asymp. Sig."; the remaining cells are left for the caller to fill in.
//
// NOTE(review): the return statement is not visible in this view.
288 static struct tab_table *
289 create_stats_table (const struct chisquare_test *test)
// A chisquare_test is also viewed as a one_sample_test to reach n_vars.
291 const struct one_sample_test *ost = (const struct one_sample_test*) test;
293 struct tab_table *table;
294 table = tab_create (1 + ost->n_vars, 4, 0);
295 tab_dim (table, tab_natural_dimensions);
296 tab_title (table, _("Test Statistics"));
// Presumably declares one header column and one header row -- confirm
// against the tab_headers prototype.
297 tab_headers (table, 1, 0, 1, 0);
// First box call covers the whole table, the second covers columns 1 onward.
299 tab_box (table, TAL_1, TAL_1, -1, -1,
300 0, 0, tab_nc(table) - 1, tab_nr(table) - 1 );
302 tab_box (table, -1, -1, -1, TAL_1,
303 1, 0, tab_nc(table) - 1, tab_nr(table) - 1 );
306 tab_vline (table, TAL_2, 1, 0, tab_nr (table) - 1);
307 tab_hline (table, TAL_1, 0, tab_nc (table) - 1, 1);
// Row labels in column 0.
310 tab_text (table, 0, 1, TAB_LEFT, _("Chi-Square"));
311 tab_text (table, 0, 2, TAB_LEFT, _("df"));
312 tab_text (table, 0, 3, TAB_LEFT, _("Asymp. Sig."));
// Run the chi-square computation for every variable of TEST and submit the
// resulting output tables.
//
// For each variable the observed frequencies are tabulated, an expected
// frequency is derived for every category, and the statistic
// sum ((observed - expected)^2 / expected) is accumulated in xsq[v], with
// df[v] = n_cells - 1.  The summary table finally reports xsq[v], df[v] and
// gsl_cdf_chisq_Q (xsq[v], df[v]) for each variable.
//
// ds      -- dataset whose dictionary is passed on for the case weights.
// input   -- source of cases; it is cloned once per variable and destroyed
//            near the end of this function.
// exclude -- missing-value class passed to casereader_create_filter_missing.
// test    -- the test description; viewed both as a one_sample_test and as
//            a chisquare_test.
//
// NOTE(review): this view of the function is incomplete -- the return type,
// the local declarations (i, v, n_cells, ff, exp, ok), several call
// arguments and several branch bodies are not visible.
319 chisquare_execute (const struct dataset *ds,
320 struct casereader *input,
321 enum mv_class exclude,
322 const struct npar_test *test)
324 const struct dictionary *dict = dataset_dict (ds);
326 struct one_sample_test *ost = (struct one_sample_test *) test;
327 struct chisquare_test *cst = (struct chisquare_test *) test;
329 double total_expected = 0.0;
// One zero-initialised slot per variable for the degrees of freedom and for
// the chi-square statistic.
// NOTE(review): xsq is sized with sizeof (*df); both point to double, so the
// size is the same, but sizeof (*xsq) would state the intent.  No release of
// df or xsq is visible in this view -- confirm that they are freed.
331 double *df = xzalloc (sizeof (*df) * ost->n_vars);
332 double *xsq = xzalloc (sizeof (*df) * ost->n_vars);
// Sum of the user-supplied expected values; used below to rescale them to
// the observed total.
335 for ( i = 0 ; i < cst->n_expected ; ++i )
336 total_expected += cst->expected[i];
// Without a range, each variable gets its own frequency table.
338 if ( cst->ranged == false )
340 for ( v = 0 ; v < ost->n_vars ; ++v )
342 double total_obs = 0.0;
343 struct hsh_table *freq_hash = NULL;
// Private clone of INPUT wrapped by casereader_create_filter_missing for
// this one variable; presumably cases whose value is missing in the class
// given by exclude are dropped -- confirm.
344 struct casereader *reader =
345 casereader_create_filter_missing (casereader_clone (input),
346 &ost->vars[v], 1, exclude, NULL);
// Creates the output table and fills freq_hash with the observed counts.
347 struct tab_table *freq_table =
348 create_variable_frequency_table(dict, reader, cst, v, &freq_hash);
// No table could be created (the handling is not visible in this view).
352 if ( NULL == freq_table )
// ff presumably is the array of hash entries in sorted order -- confirm
// against the hsh_sort documentation.
354 ff = (struct freq **) hsh_sort (freq_hash);
356 n_cells = hsh_count (freq_hash);
// Total observed weight over all categories.
358 for ( i = 0 ; i < n_cells ; ++i )
359 total_obs += ff[i]->count;
// One output row (row i + 1) per category.
362 for ( i = 0 ; i < n_cells ; ++i )
365 const union value *observed_value = ff[i]->value;
368 tab_text (freq_table, 0, i + 1, TAB_LEFT,
369 var_get_value_name (ost->vars[v], observed_value));
372 tab_float (freq_table, 1, i + 1, TAB_NONE,
// Expected count: the i-th supplied expected value rescaled so that the
// expected counts sum to total_obs, or an equal share of total_obs when no
// expected values were supplied.
375 if ( cst->n_expected > 0 )
376 exp = cst->expected[i] * total_obs / total_expected ;
378 exp = total_obs / (double) n_cells;
380 tab_float (freq_table, 2, i + 1, TAB_NONE,
// Residual = observed - expected.
384 tab_float (freq_table, 3, i + 1, TAB_NONE,
385 ff[i]->count - exp, 8, 2);
// NOTE(review): exp is used as a divisor; a zero expected count would make
// this term infinite or NaN -- confirm that it cannot be zero here.
387 xsq[v] += (ff[i]->count - exp) * (ff[i]->count - exp) / exp;
390 df[v] = n_cells - 1.0;
// Presumably placed after the category loop, where i equals n_cells, so that
// row i + 1 is the last ("Total") row of the table -- confirm.
392 tab_float (freq_table, 1, i + 1, TAB_NONE,
395 tab_submit (freq_table);
397 hsh_destroy (freq_hash);
// With a range, all variables share one combined frequency table.
400 else /* ranged == true */
402 struct tab_table *freq_table = create_combo_frequency_table (cst);
404 n_cells = cst->hi - cst->lo + 1;
406 for ( v = 0 ; v < ost->n_vars ; ++v )
408 double total_obs = 0.0;
// Private clone of INPUT wrapped by casereader_create_filter_missing for
// this one variable, as in the branch above.
409 struct casereader *reader =
410 casereader_create_filter_missing (casereader_clone (input),
411 &ost->vars[v], 1, exclude, NULL);
412 struct hsh_table *freq_hash =
413 create_freq_hash_with_range (dict, reader,
414 ost->vars[v], cst->lo, cst->hi);
// No table was produced (the handling is not visible in this view).
418 if (freq_hash == NULL)
421 ff = (struct freq **) hsh_sort (freq_hash);
// The hash is expected to hold exactly one entry per category in the range.
422 assert ( n_cells == hsh_count (freq_hash));
// Total observed weight over all categories.
424 for ( i = 0 ; i < hsh_count (freq_hash) ; ++i )
425 total_obs += ff[i]->count;
// One output row (row i + 2) per category, in this variable's column group.
428 for ( i = 0 ; i < hsh_count (freq_hash) ; ++i )
432 const union value *observed_value = ff[i]->value;
435 tab_text (freq_table, v * 4 + 1, i + 2 , TAB_LEFT,
436 var_get_value_name (ost->vars[v], observed_value));
439 tab_float (freq_table, v * 4 + 2, i + 2 , TAB_NONE,
// Expected count, computed as in the branch above.
// NOTE(review): no check is visible on this path that n_expected equals the
// number of categories before expected[i] is read; presumably this is
// validated where the test is set up -- confirm.
442 if ( cst->n_expected > 0 )
443 exp = cst->expected[i] * total_obs / total_expected ;
445 exp = total_obs / (double) hsh_count (freq_hash);
448 tab_float (freq_table, v * 4 + 3, i + 2 , TAB_NONE,
// Residual = observed - expected.
452 tab_float (freq_table, v * 4 + 4, i + 2 , TAB_NONE,
453 ff[i]->count - exp, 8, 2);
// NOTE(review): same division by exp as in the branch above.
455 xsq[v] += (ff[i]->count - exp) * (ff[i]->count - exp) / exp;
// Cell in the last ("Total") row of this variable's "Observed N" column.
459 tab_float (freq_table, v * 4 + 2, tab_nr (freq_table) - 1, TAB_NONE,
462 df[v] = n_cells - 1.0;
464 hsh_destroy (freq_hash);
467 tab_submit (freq_table);
// Record whether the taint of INPUT has a tainted successor, then destroy
// INPUT.  How ok is used afterwards is not visible in this view.
469 ok = !taint_has_tainted_successor (casereader_get_taint (input));
470 casereader_destroy (input);
474 struct tab_table *stats_table = create_stats_table (cst);
// Column 1 + v: variable name in row 0, then the statistic, the degrees of
// freedom and gsl_cdf_chisq_Q of the two in rows 1 to 3.
476 /* Populate the summary statistics table */
477 for ( v = 0 ; v < ost->n_vars ; ++v )
479 const struct variable *var = ost->vars[v];
481 tab_text (stats_table, 1 + v, 0, TAB_CENTER, var_get_name (var));
483 tab_float (stats_table, 1 + v, 1, TAB_NONE, xsq[v], 8,3);
484 tab_float (stats_table, 1 + v, 2, TAB_NONE, df[v], 8,0);
486 tab_float (stats_table, 1 + v, 3, TAB_NONE,
487 gsl_cdf_chisq_Q (xsq[v], df[v]), 8,3);
489 tab_submit (stats_table);