1 /* PSPP - computes sample statistics.
2 Copyright (C) 2006 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License as
6 published by the Free Software Foundation; either version 2 of the
7 License, or (at your option) any later version.
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
20 #include <libpspp/compiler.h>
21 #include <libpspp/assertion.h>
25 #include <data/case.h>
26 #include <data/casefile.h>
27 #include <data/casefilter.h>
28 #include <data/variable.h>
29 #include <data/dictionary.h>
30 #include <data/procedure.h>
32 #include <libpspp/message.h>
33 #include <libpspp/hash.h>
34 #include <libpspp/alloc.h>
36 #include <gsl/gsl_cdf.h>
38 #include <output/table.h>
39 #include <data/value-labels.h>
42 #include "chisquare.h"
48 #define _(msgid) gettext (msgid)
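53 /* Return a hash table containing the frequency counts of each
   truncated value of VAR read from CF, with one entry for every step
   from trunc (lo) to trunc (hi).
55 It is the caller's responsibility to free the hash table when
   it is no longer needed. */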
/* Frequency hash for VAR restricted to a numeric range.
   Visible steps: one entry is inserted per loop step from trunc (lo)
   to trunc (hi) in increments of 1.0; then, for each case read from
   CF, the case weight is added to the entry found by probing the hash
   with the truncated case value.
   NOTE(review): this listing omits lines (the embedded numbering
   skips), including the declarations of lo, hi, c, warn and i_d, the
   bodies of the two `if' tests and the return statement -- confirm
   those against the complete file. */
58 static struct hsh_table *
59 create_freq_hash_with_range (const struct dictionary *dict,
60 const struct casefile *cf,
61 struct casefilter *filter,
62 const struct variable *var,
/* The reader obtained here is released by casereader_destroy further
   down. */
69 struct casereader *r = casefile_get_reader (cf, filter);
71 struct hsh_table *freq_hash =
72 hsh_create (4, compare_freq, hash_freq,
73 free_freq_mutable_hash,
76 /* Populate the hash with zero entries */
77 for (i_d = trunc (lo); i_d <= trunc (hi); i_d += 1.0 )
79 union value the_value;
80 struct freq_mutable *fr = xmalloc (sizeof (*fr));
/* The inserted entry gets its own copy of the key, made with
   value_dup. */
84 fr->value = value_dup (&the_value, 0);
87 hsh_insert (freq_hash, fr);
/* Accumulate the observed cases. */
90 while (casereader_read(r, &c))
92 union value obs_value;
93 struct freq **existing_fr;
/* FR is a per-case lookup key.  NOTE(review): no matching free is
   visible in this listing -- check that every path releases it. */
94 struct freq *fr = xmalloc(sizeof (*fr));
95 fr->value = case_data (&c, var);
/* Missing-value test; what happens when it is true is not visible
   here. */
97 if ( casefilter_variable_missing (filter, &c, var))
103 fr->count = dict_get_case_weight (dict, &c, &warn);
/* Key the lookup on the truncated value. */
105 obs_value.f = trunc (fr->value->f);
/* Range test; presumably out-of-range cases are skipped -- the body
   is not visible here, TODO confirm. */
107 if ( obs_value.f < lo || obs_value.f > hi)
/* Point the key at the truncated value before probing. */
114 fr->value = &obs_value;
116 existing_fr = (struct freq **) hsh_probe (freq_hash, fr);
118 /* This must exist in the hash, because we previously populated it
120 assert (*existing_fr);
122 (*existing_fr)->count += fr->count;
127 casereader_destroy (r);
133 /* Return a hash table containing the frequency counts of each
   value of VAR read from CF.
135 It is the caller's responsibility to free the hash table when
   it is no longer needed. */
/* Frequency hash for VAR over the cases read from CF.
   Visible steps: each case's value of VAR is used as the key for
   hsh_probe, and the case weight is added to the count of the entry
   the probe yields.
   NOTE(review): this listing omits lines (the embedded numbering
   skips); how a value that is not yet in the hash is inserted, the
   body of the missing-value test, and the return statement are not
   visible -- confirm against the complete file. */
138 static struct hsh_table *
139 create_freq_hash (const struct dictionary *dict,
140 const struct casefile *cf,
141 struct casefilter *filter,
142 const struct variable *var)
/* The reader obtained here is released by casereader_destroy at the
   end of the visible code. */
146 struct casereader *r = casefile_get_reader (cf, filter);
148 struct hsh_table *freq_hash =
149 hsh_create (4, compare_freq, hash_freq,
153 while (casereader_read(r, &c))
155 struct freq **existing_fr;
/* FR is allocated once per case.  NOTE(review): whether it is handed
   to the hash or freed is not visible here -- check every path. */
156 struct freq *fr = xmalloc(sizeof (*fr));
157 fr->value = case_data (&c, var );
/* Missing-value test; what happens when it is true is not visible
   here. */
159 if ( casefilter_variable_missing (filter, &c, var))
/* The case weight becomes this observation's count. */
165 fr->count = dict_get_case_weight (dict, &c, &warn);
167 existing_fr = (struct freq **) hsh_probe (freq_hash, fr);
/* Add this case's weight to the entry the probe yielded. */
170 (*existing_fr)->count += fr->count;
180 casereader_destroy (r);
/* Creates the frequency table for one variable of TEST (ost->vars[v])
   and stores that variable's frequency hash in *FREQ_HASH.
   The table has 4 columns and n_cells + 2 rows, where n_cells is the
   number of entries in the hash; columns 1 to 3 are headed
   "Observed N", "Expected N" and "Residual", and the last row is
   labelled "Total".
   NOTE(review): this listing omits lines (the embedded numbering
   skips); the parameter that declares v, the local declarations, what
   follows the error message, and the return statement are not
   visible -- confirm against the complete file. */
187 static struct tab_table *
188 create_variable_frequency_table (const struct dictionary *dict,
189 const struct casefile *cf,
190 struct casefilter *filter,
191 const struct chisquare_test *test,
193 struct hsh_table **freq_hash)
197 const struct one_sample_test *ost = (const struct one_sample_test*)test;
199 struct tab_table *table ;
200 const struct variable *var = ost->vars[v];
/* The hash is passed back to the caller through FREQ_HASH. */
202 *freq_hash = create_freq_hash (dict, cf, filter, var);
204 n_cells = hsh_count (*freq_hash);
/* When expected values were given, their number has to equal the
   number of distinct observed values; otherwise an error message is
   issued. */
206 if ( test->n_expected > 0 && n_cells != test->n_expected )
208 msg(ME, _("CHISQUARE test specified %d expected values, but"
209 " %d distinct values were encountered in variable %s."),
210 test->n_expected, n_cells,
/* One heading row, one row per cell, one totals row. */
216 table = tab_create(4, n_cells + 2, 0);
217 tab_dim (table, tab_natural_dimensions);
219 tab_title (table, var_to_string(var));
220 tab_text (table, 1, 0, TAB_LEFT, _("Observed N"));
221 tab_text (table, 2, 0, TAB_LEFT, _("Expected N"));
222 tab_text (table, 3, 0, TAB_LEFT, _("Residual"));
224 tab_headers (table, 1, 0, 1, 0);
/* Rules: an outer box, a line under row 0, and vertical lines
   between the columns. */
226 tab_box (table, TAL_1, TAL_1, -1, -1,
227 0, 0, table->nc - 1, tab_nr(table) - 1 );
229 tab_hline (table, TAL_1, 0, tab_nc(table) - 1, 1);
231 tab_vline (table, TAL_2, 1, 0, tab_nr(table) - 1);
232 for ( i = 2 ; i < 4 ; ++i )
233 tab_vline (table, TAL_1, i, 0, tab_nr(table) - 1);
/* Label for the last row of the table. */
236 tab_text (table, 0, table->nr - 1, TAB_LEFT, _("Total"));
/* Creates the combined "Frequencies" table used for a ranged test.
   The table has 1 + 4 * n_vars columns and n_cells + 3 rows, where
   n_cells = hi - lo + 1.  Each variable gets a group of four columns
   headed "Category", "Observed N", "Expected N" and "Residual" in
   row 1; column 0 carries a row label per cell, and the last row is
   labelled "Total".
   NOTE(review): this listing omits lines (the embedded numbering
   skips); the declaration of i, most arguments of tab_joint_text and
   the return statement are not visible -- confirm against the
   complete file. */
242 static struct tab_table *
243 create_combo_frequency_table (const struct chisquare_test *test)
246 const struct one_sample_test *ost = (const struct one_sample_test*)test;
248 struct tab_table *table ;
250 int n_cells = test->hi - test->lo + 1;
252 table = tab_create(1 + ost->n_vars * 4, n_cells + 3, 0);
253 tab_dim (table, tab_natural_dimensions);
255 tab_title (table, _("Frequencies"));
/* Headings and vertical rules for each variable's group of four
   columns; the group for variable i starts at column i * 4 + 1. */
256 for ( i = 0 ; i < ost->n_vars ; ++i )
258 const struct variable *var = ost->vars[i];
259 tab_text (table, i * 4 + 1, 1, TAB_LEFT, _("Category"));
260 tab_text (table, i * 4 + 2, 1, TAB_LEFT, _("Observed N"));
261 tab_text (table, i * 4 + 3, 1, TAB_LEFT, _("Expected N"));
262 tab_text (table, i * 4 + 4, 1, TAB_LEFT, _("Residual"));
264 tab_vline (table, TAL_2, i * 4 + 1,
265 0, tab_nr (table) - 1);
267 tab_vline (table, TAL_1, i * 4 + 2,
268 0, tab_nr (table) - 1);
270 tab_vline (table, TAL_1, i * 4 + 3,
271 1, tab_nr (table) - 1);
273 tab_vline (table, TAL_1, i * 4 + 4,
274 1, tab_nr (table) - 1);
/* Text built from var_to_string (var); its position arguments are
   not visible in this listing. */
277 tab_joint_text (table,
281 var_to_string (var));
/* Column 0 row labels: the value written is the 1-based position of
   the cell within the range (1 + i - lo), not i itself. */
284 for ( i = test->lo ; i <= test->hi ; ++i )
285 tab_float (table, 0, 2 + i - test->lo,
286 TAB_LEFT, 1 + i - test->lo, 8, 0);
288 tab_headers (table, 1, 0, 2, 0);
290 tab_box (table, TAL_1, TAL_1, -1, -1,
291 0, 0, table->nc - 1, tab_nr(table) - 1 );
293 tab_hline (table, TAL_1, 1, tab_nc(table) - 1, 1);
294 tab_hline (table, TAL_1, 0, tab_nc(table) - 1, 2);
/* Label for the last row of the table. */
296 tab_text (table, 0, table->nr - 1, TAB_LEFT, _("Total"));
/* Creates the "Test Statistics" table: 1 + n_vars columns and 4 rows.
   Column 0 holds the row labels "Chi-Square", "df" and "Asymp. Sig."
   in rows 1 to 3; the remaining cells are left for the caller to
   fill in.
   NOTE(review): this listing omits lines (the embedded numbering
   skips); the return statement is not visible -- confirm against the
   complete file. */
302 static struct tab_table *
303 create_stats_table (const struct chisquare_test *test)
305 const struct one_sample_test *ost = (const struct one_sample_test*) test;
307 struct tab_table *table = tab_create (1 + ost->n_vars, 4, 0);
308 tab_dim (table, tab_natural_dimensions);
309 tab_title (table, _("Test Statistics"));
310 tab_headers (table, 1, 0, 1, 0);
/* Rules: an outer box, a second box covering columns 1 onwards, a
   vertical line after column 0 and a horizontal line under row 0. */
312 tab_box (table, TAL_1, TAL_1, -1, -1,
313 0, 0, tab_nc(table) - 1, tab_nr(table) - 1 );
315 tab_box (table, -1, -1, -1, TAL_1,
316 1, 0, tab_nc(table) - 1, tab_nr(table) - 1 );
319 tab_vline (table, TAL_2, 1, 0, tab_nr (table) - 1);
320 tab_hline (table, TAL_1, 0, tab_nc (table) - 1, 1);
/* Row labels. */
323 tab_text (table, 0, 1, TAB_LEFT, _("Chi-Square"));
324 tab_text (table, 0, 2, TAB_LEFT, _("df"));
325 tab_text (table, 0, 3, TAB_LEFT, _("Asymp. Sig."));
/* Runs the chi-square test TEST on the cases in CF.
   For every variable of the test it obtains a frequency hash, writes
   observed count, expected count and residual for each cell into a
   frequency table, accumulates sum ((obs - exp)^2 / exp) in xsq[v]
   and sets df[v] = n_cells - 1.  Finally it fills and submits the
   summary statistics table.
   NOTE(review): this listing omits lines (the embedded numbering
   skips); the return type, the declarations of i, v, n_cells and exp,
   several call arguments, loop boundaries and any release of df and
   xsq are not visible -- confirm against the complete file. */
332 chisquare_execute (const struct dataset *ds,
333 const struct casefile *cf,
334 struct casefilter *filter,
335 const struct npar_test *test)
337 const struct dictionary *dict = dataset_dict (ds);
/* TEST is viewed both as a one_sample_test and as a chisquare_test. */
339 struct one_sample_test *ost = (struct one_sample_test *) test;
340 struct chisquare_test *cst = (struct chisquare_test *) test;
341 struct tab_table *stats_table = create_stats_table (cst);
343 double total_expected = 0.0;
/* One slot per variable.  xsq is accumulated with += below, so the
   code relies on the allocation being zero-filled.
   NOTE(review): the second allocation uses sizeof (*df) rather than
   sizeof (*xsq); both point to double, so the size is the same. */
345 double *df = xzalloc (sizeof (*df) * ost->n_vars);
346 double *xsq = xzalloc (sizeof (*df) * ost->n_vars);
/* Sum of the user-supplied expected values, used to scale them to the
   observed total. */
348 for ( i = 0 ; i < cst->n_expected ; ++i )
349 total_expected += cst->expected[i];
/* Unranged test: a separate frequency table for each variable. */
351 if ( cst->ranged == false )
353 for ( v = 0 ; v < ost->n_vars ; ++v )
355 double total_obs = 0.0;
356 struct hsh_table *freq_hash = NULL;
357 struct tab_table *freq_table =
358 create_variable_frequency_table(dict, cf, filter, cst,
361 struct freq **ff = (struct freq **) hsh_sort (freq_hash);
/* NOTE(review): the hash is sorted before this NULL test, and on this
   path only the hsh_destroy call is visible -- check that df, xsq and
   stats_table are not leaked when the table could not be created. */
363 if ( NULL == freq_table )
365 hsh_destroy (freq_hash);
369 n_cells = hsh_count (freq_hash);
/* Total observed count over all cells. */
371 for ( i = 0 ; i < n_cells ; ++i )
372 total_obs += ff[i]->count;
/* One table row per cell; row 0 is the heading row, hence i + 1. */
375 for ( i = 0 ; i < n_cells ; ++i )
378 const union value *observed_value = ff[i]->value;
381 tab_text (freq_table, 0, i + 1, TAB_LEFT,
382 var_get_value_name (ost->vars[v], observed_value));
385 tab_float (freq_table, 1, i + 1, TAB_NONE,
/* Expected count: the supplied expected value scaled to the observed
   total, or, when none were supplied, the observed total divided by
   n_cells. */
388 if ( cst->n_expected > 0 )
389 exp = cst->expected[i] * total_obs / total_expected ;
391 exp = total_obs / (double) n_cells;
393 tab_float (freq_table, 2, i + 1, TAB_NONE,
/* Residual = observed - expected. */
397 tab_float (freq_table, 3, i + 1, TAB_NONE,
398 ff[i]->count - exp, 8, 2);
/* NOTE(review): no guard against exp == 0 is visible here. */
400 xsq[v] += (ff[i]->count - exp) * (ff[i]->count - exp) / exp;
403 df[v] = n_cells - 1.0;
/* If the cell loop has finished at this point, i == n_cells and row
   i + 1 is the last row of the table -- loop boundaries are not
   visible in this listing, TODO confirm. */
405 tab_float (freq_table, 1, i + 1, TAB_NONE,
408 tab_submit (freq_table);
410 hsh_destroy (freq_hash);
413 else /* ranged == true */
/* Ranged test: a single combined table with four columns per
   variable. */
415 struct tab_table *freq_table = create_combo_frequency_table (cst);
417 n_cells = cst->hi - cst->lo + 1;
419 for ( v = 0 ; v < ost->n_vars ; ++v )
421 double total_obs = 0.0;
422 struct hsh_table *freq_hash =
423 create_freq_hash_with_range (dict, cf, filter, ost->vars[v],
426 struct freq **ff = (struct freq **) hsh_sort (freq_hash);
/* The hash is expected to hold exactly one entry per cell of the
   range. */
428 assert ( n_cells == hsh_count (freq_hash));
/* Total observed count over all cells. */
430 for ( i = 0 ; i < hsh_count (freq_hash) ; ++i )
431 total_obs += ff[i]->count;
/* Cell rows start at row 2; variable v uses columns v * 4 + 1 to
   v * 4 + 4. */
434 for ( i = 0 ; i < hsh_count (freq_hash) ; ++i )
438 const union value *observed_value = ff[i]->value;
441 tab_text (freq_table, v * 4 + 1, i + 2 , TAB_LEFT,
442 var_get_value_name (ost->vars[v], observed_value));
445 tab_float (freq_table, v * 4 + 2, i + 2 , TAB_NONE,
/* Expected count, computed as in the unranged branch. */
448 if ( cst->n_expected > 0 )
449 exp = cst->expected[i] * total_obs / total_expected ;
451 exp = total_obs / (double) hsh_count (freq_hash);
454 tab_float (freq_table, v * 4 + 3, i + 2 , TAB_NONE,
/* Residual = observed - expected. */
458 tab_float (freq_table, v * 4 + 4, i + 2 , TAB_NONE,
459 ff[i]->count - exp, 8, 2);
/* NOTE(review): no guard against exp == 0 is visible here. */
461 xsq[v] += (ff[i]->count - exp) * (ff[i]->count - exp) / exp;
/* Entry in the last row of the table, in this variable's
   "Observed N" column. */
465 tab_float (freq_table, v * 4 + 2, tab_nr (freq_table) - 1, TAB_NONE,
468 df[v] = n_cells - 1.0;
470 hsh_destroy (freq_hash);
473 tab_submit (freq_table);
477 /* Populate the summary statistics table */
478 for ( v = 0 ; v < ost->n_vars ; ++v )
480 const struct variable *var = ost->vars[v];
/* Column 1 + v: variable name, statistic, degrees of freedom, and the
   value of gsl_cdf_chisq_Q for that statistic and df. */
482 tab_text (stats_table, 1 + v, 0, TAB_CENTER, var_get_name (var));
484 tab_float (stats_table, 1 + v, 1, TAB_NONE, xsq[v], 8,3);
485 tab_float (stats_table, 1 + v, 2, TAB_NONE, df[v], 8,0);
487 tab_float (stats_table, 1 + v, 3, TAB_NONE,
488 gsl_cdf_chisq_Q (xsq[v], df[v]), 8,3);
494 tab_submit (stats_table);