1 /* PSPP - computes sample statistics.
2 Copyright (C) 2006, 2007 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License as
6 published by the Free Software Foundation; either version 2 of the
7 License, or (at your option) any later version.
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
20 #include <libpspp/compiler.h>
21 #include <libpspp/assertion.h>
25 #include <data/case.h>
26 #include <data/casefile.h>
27 #include <data/casefilter.h>
28 #include <data/variable.h>
29 #include <data/dictionary.h>
30 #include <data/procedure.h>
32 #include <libpspp/message.h>
33 #include <libpspp/hash.h>
34 #include <libpspp/alloc.h>
36 #include <gsl/gsl_cdf.h>
38 #include <output/table.h>
39 #include <data/value-labels.h>
42 #include "chisquare.h"
/* Shorthand for translatable strings: _(msgid) expands to gettext (msgid). */
48 #define _(msgid) gettext (msgid)
53 /* Return a hash table containing the frequency counts of each
   integer-truncated value of VAR in CF that falls within LO..HI.
55 It is the caller's responsibility to free the hash table when
   it is no longer needed. */
/* Builds the frequency table used by the ranged form of the test.

   One bucket is created for every whole number from trunc (LO) up to
   trunc (HI).  The cases of CF are then read through FILTER, and the
   weight of each case (dict_get_case_weight) is added to the bucket
   whose key equals the truncated value of VAR in that case.  The reader
   obtained from CF is destroyed once all cases have been read.

   NOTE(review): LO and HI are used below but their declarations are not
   visible in this listing -- presumably trailing parameters; confirm.
   NOTE(review): the statements guarded by the two `if' tests in the
   read loop are not visible here.  In that loop FR appears to serve
   only as a lookup key for hsh_probe, so confirm that it is released on
   every path, including the two early exits. */
58 static struct hsh_table *
59 create_freq_hash_with_range (const struct dictionary *dict,
60 const struct casefile *cf,
61 struct casefilter *filter,
62 const struct variable *var,
69 struct casereader *r = casefile_get_reader (cf, filter);
/* compare_freq, hash_freq and free_freq_mutable_hash are handed to
   hsh_create as the table's helper functions. */
71 struct hsh_table *freq_hash =
72 hsh_create (4, compare_freq, hash_freq,
73 free_freq_mutable_hash,
76 /* Populate the hash with zero entries */
77 for (i_d = trunc (lo); i_d <= trunc (hi); i_d += 1.0 )
79 union value the_value;
80 struct freq_mutable *fr = xmalloc (sizeof (*fr));
/* The bucket owns a private copy of its key; width 0 is what is passed
   for this numeric value -- presumably "numeric"; confirm against
   value_dup. */
84 fr->value = value_dup (&the_value, 0);
87 hsh_insert (freq_hash, fr);
/* Second pass: fold the weight of every case into its bucket. */
90 while (casereader_read(r, &c))
92 union value obs_value;
93 struct freq **existing_fr;
94 struct freq *fr = xmalloc(sizeof (*fr));
/* At first FR->value points at the value returned by case_data. */
95 fr->value = case_data (&c, var);
97 if ( casefilter_variable_missing (filter, &c, var))
103 fr->count = dict_get_case_weight (dict, &c, &warn);
/* Bucket keys are whole numbers, so the observation is truncated before
   it is looked up. */
105 obs_value.f = trunc (fr->value->f);
/* The comparison uses LO and HI as given, not the truncated bounds that
   were used to create the buckets. */
107 if ( obs_value.f < lo || obs_value.f > hi)
/* Re-point the lookup key at the local truncated copy. */
114 fr->value = &obs_value;
116 existing_fr = (struct freq **) hsh_probe (freq_hash, fr);
118 /* This must exist in the hash, because we previously populated it
120 assert (*existing_fr);
122 (*existing_fr)->count += fr->count;
127 casereader_destroy (r);
133 /* Return a hash table containing the frequency counts of each
   distinct value of VAR in CF.
135 It is the caller's responsibility to free the hash table when
   it is no longer needed. */
/* Builds a frequency table of the values of VAR over the cases of CF.

   Cases are read through FILTER.  For each case a freq record is
   allocated, pointed at the case's value of VAR, given the case weight
   from dict_get_case_weight, and probed for in the hash.  The reader
   obtained from CF is destroyed once all cases have been read.

   NOTE(review): the branch structure around the hsh_probe result is not
   visible in this listing.  The two statements that follow the probe
   look like the "entry already present" path (add the weight to it) and
   the "new entry" path (give FR its own copy of the value) -- confirm,
   and confirm that FR is released whenever it is not stored, including
   the missing-value path. */
138 static struct hsh_table *
139 create_freq_hash (const struct dictionary *dict,
140 const struct casefile *cf,
141 struct casefilter *filter,
142 const struct variable *var)
146 struct casereader *r = casefile_get_reader (cf, filter);
148 struct hsh_table *freq_hash =
149 hsh_create (4, compare_freq, hash_freq,
150 free_freq_mutable_hash,
153 while (casereader_read(r, &c))
155 struct freq **existing_fr;
156 struct freq *fr = xmalloc(sizeof (*fr));
/* FR->value points at the value returned by case_data for now. */
157 fr->value = case_data (&c, var );
159 if ( casefilter_variable_missing (filter, &c, var))
165 fr->count = dict_get_case_weight (dict, &c, &warn);
167 existing_fr = (struct freq **) hsh_probe (freq_hash, fr);
/* Accumulate this case's weight into the entry found by the probe. */
170 (*existing_fr)->count += fr->count;
/* Replace the pointer obtained from case_data with a copy made at the
   variable's width. */
176 fr->value = value_dup (fr->value, var_get_width (var));
181 casereader_destroy (r);
/* Creates the per-variable frequency table for the non-ranged form of
   the test, and stores the variable's frequency hash in *FREQ_HASH.

   The hash is built with create_freq_hash before anything else, so
   *FREQ_HASH is assigned even when the expected-value check below
   fails.  If TEST lists expected values and their number differs from
   the number of distinct values found, an error message is issued.

   The table has 4 columns (row label, "Observed N", "Expected N",
   "Residual") and n_cells + 2 rows; row 0 carries the column headings
   and the last row is labelled "Total".  Only the frame, headings and
   title are set up here.

   NOTE(review): the variable index V is used below but its declaration
   is not visible in this listing -- presumably a parameter; confirm.
   NOTE(review): what follows the error message, and the return
   statement, are not visible either.  The caller tests the result
   against NULL, so the mismatch path presumably returns NULL; confirm. */
188 static struct tab_table *
189 create_variable_frequency_table (const struct dictionary *dict,
190 const struct casefile *cf,
191 struct casefilter *filter,
192 const struct chisquare_test *test,
194 struct hsh_table **freq_hash)
198 const struct one_sample_test *ost = (const struct one_sample_test*)test;
200 struct tab_table *table ;
201 const struct variable *var = ost->vars[v];
203 *freq_hash = create_freq_hash (dict, cf, filter, var);
/* One table row per distinct value held in the hash. */
205 n_cells = hsh_count (*freq_hash);
/* Expected values, when given, must match the categories one for one. */
207 if ( test->n_expected > 0 && n_cells != test->n_expected )
209 msg(ME, _("CHISQUARE test specified %d expected values, but"
210 " %d distinct values were encountered in variable %s."),
211 test->n_expected, n_cells,
/* Heading row + one row per category + total row. */
217 table = tab_create(4, n_cells + 2, 0);
218 tab_dim (table, tab_natural_dimensions);
220 tab_title (table, var_to_string(var));
221 tab_text (table, 1, 0, TAB_LEFT, _("Observed N"));
222 tab_text (table, 2, 0, TAB_LEFT, _("Expected N"));
223 tab_text (table, 3, 0, TAB_LEFT, _("Residual"));
225 tab_headers (table, 1, 0, 1, 0);
/* Outer frame around the whole table. */
227 tab_box (table, TAL_1, TAL_1, -1, -1,
228 0, 0, table->nc - 1, tab_nr(table) - 1 );
/* Rule under the heading row. */
230 tab_hline (table, TAL_1, 0, tab_nc(table) - 1, 1);
/* TAL_2 rule after the label column, TAL_1 rules between data columns. */
232 tab_vline (table, TAL_2, 1, 0, tab_nr(table) - 1);
233 for ( i = 2 ; i < 4 ; ++i )
234 tab_vline (table, TAL_1, i, 0, tab_nr(table) - 1);
237 tab_text (table, 0, table->nr - 1, TAB_LEFT, _("Total"));
/* Creates the single combined frequency table used by the ranged form
   of the test, titled "Frequencies".

   Layout: column 0 holds the category number; each variable then gets a
   group of four columns ("Category", "Observed N", "Expected N",
   "Residual").  There are n_cells + 3 rows, where n_cells is
   TEST->hi - TEST->lo + 1: row 0 for the text written by
   tab_joint_text from var_to_string, row 1 for the column headings,
   one row per category, and a last row labelled "Total".  Column 0 of
   the category rows is filled with 1, 2, ... n_cells.  The observed,
   expected and residual cells are left for the caller to fill in.

   NOTE(review): the middle arguments of tab_joint_text and the return
   statement are not visible in this listing. */
243 static struct tab_table *
244 create_combo_frequency_table (const struct chisquare_test *test)
247 const struct one_sample_test *ost = (const struct one_sample_test*)test;
249 struct tab_table *table ;
251 int n_cells = test->hi - test->lo + 1;
253 table = tab_create(1 + ost->n_vars * 4, n_cells + 3, 0);
254 tab_dim (table, tab_natural_dimensions);
256 tab_title (table, _("Frequencies"));
/* Headings and vertical rules for each variable's group of 4 columns. */
257 for ( i = 0 ; i < ost->n_vars ; ++i )
259 const struct variable *var = ost->vars[i];
260 tab_text (table, i * 4 + 1, 1, TAB_LEFT, _("Category"));
261 tab_text (table, i * 4 + 2, 1, TAB_LEFT, _("Observed N"));
262 tab_text (table, i * 4 + 3, 1, TAB_LEFT, _("Expected N"));
263 tab_text (table, i * 4 + 4, 1, TAB_LEFT, _("Residual"));
/* The first two rules of the group start at row 0; the last two start
   at row 1, leaving row 0 undivided across those columns. */
265 tab_vline (table, TAL_2, i * 4 + 1,
266 0, tab_nr (table) - 1);
268 tab_vline (table, TAL_1, i * 4 + 2,
269 0, tab_nr (table) - 1);
271 tab_vline (table, TAL_1, i * 4 + 3,
272 1, tab_nr (table) - 1);
274 tab_vline (table, TAL_1, i * 4 + 4,
275 1, tab_nr (table) - 1);
278 tab_joint_text (table,
282 var_to_string (var));
/* Number the category rows 1 .. n_cells in column 0, starting at row 2. */
285 for ( i = test->lo ; i <= test->hi ; ++i )
286 tab_float (table, 0, 2 + i - test->lo,
287 TAB_LEFT, 1 + i - test->lo, 8, 0);
289 tab_headers (table, 1, 0, 2, 0);
291 tab_box (table, TAL_1, TAL_1, -1, -1,
292 0, 0, table->nc - 1, tab_nr(table) - 1 );
/* The rule under row 0 starts at column 1; the rule under row 1 spans
   every column. */
294 tab_hline (table, TAL_1, 1, tab_nc(table) - 1, 1);
295 tab_hline (table, TAL_1, 0, tab_nc(table) - 1, 2);
297 tab_text (table, 0, table->nr - 1, TAB_LEFT, _("Total"));
/* Creates the empty "Test Statistics" summary table for TEST.

   The table has one label column plus one column per variable, and 4
   rows: row 0 is left for headings, and rows 1-3 are labelled
   "Chi-Square", "df" and "Asymp. Sig." in column 0.  The per-variable
   cells are left for the caller to fill in.

   NOTE(review): the return statement is not visible in this listing;
   presumably TABLE is returned. */
303 static struct tab_table *
304 create_stats_table (const struct chisquare_test *test)
306 const struct one_sample_test *ost = (const struct one_sample_test*) test;
308 struct tab_table *table = tab_create (1 + ost->n_vars, 4, 0);
309 tab_dim (table, tab_natural_dimensions);
310 tab_title (table, _("Test Statistics"));
311 tab_headers (table, 1, 0, 1, 0);
/* Outer frame, then a second box call covering the variable columns. */
313 tab_box (table, TAL_1, TAL_1, -1, -1,
314 0, 0, tab_nc(table) - 1, tab_nr(table) - 1 );
316 tab_box (table, -1, -1, -1, TAL_1,
317 1, 0, tab_nc(table) - 1, tab_nr(table) - 1 );
/* TAL_2 rule after the label column and a rule under row 0. */
320 tab_vline (table, TAL_2, 1, 0, tab_nr (table) - 1);
321 tab_hline (table, TAL_1, 0, tab_nc (table) - 1, 1);
324 tab_text (table, 0, 1, TAB_LEFT, _("Chi-Square"));
325 tab_text (table, 0, 2, TAB_LEFT, _("df"));
326 tab_text (table, 0, 3, TAB_LEFT, _("Asymp. Sig."));
/* Runs the chi-square test described by TEST on the cases of CF and
   submits the resulting output tables.

   For every variable of the test, the observed weight of each category
   is compared with its expected weight and the statistic
       sum ((observed - expected)^2 / expected)
   is accumulated in XSQ[v], with DF[v] = n_cells - 1.  The expected
   weight is the category's share of the listed expected values scaled
   to the observed total when CST->n_expected > 0, and an equal share of
   the observed total otherwise.  A final "Test Statistics" table
   reports, per variable, the statistic, the degrees of freedom and
   gsl_cdf_chisq_Q (xsq, df).

   Two layouts exist.  When CST->ranged is false, each variable gets its
   own frequency table built from the distinct values found.  Otherwise
   one combined table covers the categories CST->lo .. CST->hi for all
   variables.

   NOTE(review): XSQ is sized with sizeof (*df).  Both arrays hold
   doubles so the size is the same, but sizeof (*xsq) would be the
   self-consistent spelling.
   NOTE(review): no release of DF or XSQ is visible in this listing;
   confirm they are freed after the final tab_submit.
   NOTE(review): EXP is used as a divisor.  It is zero when an expected
   value of 0 is listed or when no weight was observed; confirm that
   this is excluded before this point.
   NOTE(review): in the ranged branch cst->expected[i] is indexed up to
   hsh_count (freq_hash) with no visible check that CST->n_expected
   equals that count; confirm it is validated elsewhere. */
333 chisquare_execute (const struct dataset *ds,
334 const struct casefile *cf,
335 struct casefilter *filter,
336 const struct npar_test *test)
338 const struct dictionary *dict = dataset_dict (ds);
/* TEST is viewed both as the generic one-sample test (variable list)
   and as the chi-square test (expected values, range). */
340 struct one_sample_test *ost = (struct one_sample_test *) test;
341 struct chisquare_test *cst = (struct chisquare_test *) test;
342 struct tab_table *stats_table = create_stats_table (cst);
344 double total_expected = 0.0;
/* Per-variable results; xzalloc is relied on to start XSQ at zero
   because it is accumulated with += below. */
346 double *df = xzalloc (sizeof (*df) * ost->n_vars);
347 double *xsq = xzalloc (sizeof (*df) * ost->n_vars);
/* The expected values are used as relative weights, so sum them once. */
349 for ( i = 0 ; i < cst->n_expected ; ++i )
350 total_expected += cst->expected[i];
352 if ( cst->ranged == false )
354 for ( v = 0 ; v < ost->n_vars ; ++v )
356 double total_obs = 0.0;
357 struct hsh_table *freq_hash = NULL;
/* Builds the table frame and hands back the frequency hash. */
358 struct tab_table *freq_table =
359 create_variable_frequency_table(dict, cf, filter, cst,
/* The hash is sorted before FREQ_TABLE is tested against NULL. */
362 struct freq **ff = (struct freq **) hsh_sort (freq_hash);
/* No table for this variable: drop its hash. */
364 if ( NULL == freq_table )
366 hsh_destroy (freq_hash);
370 n_cells = hsh_count (freq_hash);
/* The observed total is needed before any expected value can be scaled. */
372 for ( i = 0 ; i < n_cells ; ++i )
373 total_obs += ff[i]->count;
/* One table row per category; row 0 holds the headings, hence i + 1. */
376 for ( i = 0 ; i < n_cells ; ++i )
379 const union value *observed_value = ff[i]->value;
382 tab_text (freq_table, 0, i + 1, TAB_LEFT,
383 var_get_value_name (ost->vars[v], observed_value));
386 tab_float (freq_table, 1, i + 1, TAB_NONE,
389 if ( cst->n_expected > 0 )
390 exp = cst->expected[i] * total_obs / total_expected ;
392 exp = total_obs / (double) n_cells;
394 tab_float (freq_table, 2, i + 1, TAB_NONE,
/* Residual = observed - expected. */
398 tab_float (freq_table, 3, i + 1, TAB_NONE,
399 ff[i]->count - exp, 8, 2);
401 xsq[v] += (ff[i]->count - exp) * (ff[i]->count - exp) / exp;
404 df[v] = n_cells - 1.0;
/* I equals n_cells after the loop, so row i + 1 is the last ("Total")
   row of the table. */
406 tab_float (freq_table, 1, i + 1, TAB_NONE,
409 tab_submit (freq_table);
411 hsh_destroy (freq_hash);
414 else /* ranged == true */
/* One combined table shared by all variables. */
416 struct tab_table *freq_table = create_combo_frequency_table (cst);
418 n_cells = cst->hi - cst->lo + 1;
420 for ( v = 0 ; v < ost->n_vars ; ++v )
422 double total_obs = 0.0;
423 struct hsh_table *freq_hash =
424 create_freq_hash_with_range (dict, cf, filter, ost->vars[v],
427 struct freq **ff = (struct freq **) hsh_sort (freq_hash);
/* The hash was pre-populated with one bucket per category. */
429 assert ( n_cells == hsh_count (freq_hash));
431 for ( i = 0 ; i < hsh_count (freq_hash) ; ++i )
432 total_obs += ff[i]->count;
/* Variable V owns columns v * 4 + 1 .. v * 4 + 4; category rows start
   at row 2, below the two heading rows. */
435 for ( i = 0 ; i < hsh_count (freq_hash) ; ++i )
439 const union value *observed_value = ff[i]->value;
442 tab_text (freq_table, v * 4 + 1, i + 2 , TAB_LEFT,
443 var_get_value_name (ost->vars[v], observed_value));
446 tab_float (freq_table, v * 4 + 2, i + 2 , TAB_NONE,
449 if ( cst->n_expected > 0 )
450 exp = cst->expected[i] * total_obs / total_expected ;
452 exp = total_obs / (double) hsh_count (freq_hash);
455 tab_float (freq_table, v * 4 + 3, i + 2 , TAB_NONE,
/* Residual = observed - expected. */
459 tab_float (freq_table, v * 4 + 4, i + 2 , TAB_NONE,
460 ff[i]->count - exp, 8, 2);
462 xsq[v] += (ff[i]->count - exp) * (ff[i]->count - exp) / exp;
/* Last row of this variable's "Observed N" column. */
466 tab_float (freq_table, v * 4 + 2, tab_nr (freq_table) - 1, TAB_NONE,
469 df[v] = n_cells - 1.0;
471 hsh_destroy (freq_hash);
474 tab_submit (freq_table);
478 /* Populate the summary statistics table */
479 for ( v = 0 ; v < ost->n_vars ; ++v )
481 const struct variable *var = ost->vars[v];
/* Column 1 + v belongs to variable V; row 0 carries its name. */
483 tab_text (stats_table, 1 + v, 0, TAB_CENTER, var_get_name (var));
485 tab_float (stats_table, 1 + v, 1, TAB_NONE, xsq[v], 8,3);
486 tab_float (stats_table, 1 + v, 2, TAB_NONE, df[v], 8,0);
/* Value for the "Asymp. Sig." row. */
488 tab_float (stats_table, 1 + v, 3, TAB_NONE,
489 gsl_cdf_chisq_Q (xsq[v], df[v]), 8,3);
495 tab_submit (stats_table);