1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2006, 2007, 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/stats/chisquare.h"
21 #include <gsl/gsl_cdf.h>
25 #include "data/case.h"
26 #include "data/casereader.h"
27 #include "data/dataset.h"
28 #include "data/dictionary.h"
29 #include "data/format.h"
30 #include "data/value-labels.h"
31 #include "data/variable.h"
32 #include "language/stats/freq.h"
33 #include "language/stats/npar.h"
34 #include "libpspp/array.h"
35 #include "libpspp/assertion.h"
36 #include "libpspp/cast.h"
37 #include "libpspp/compiler.h"
38 #include "libpspp/hash-functions.h"
39 #include "libpspp/message.h"
40 #include "libpspp/taint.h"
41 #include "output/pivot-table.h"
43 #include "gl/xalloc.h"
46 #define N_(msgid) msgid
47 #define _(msgid) gettext (msgid)
49 /* Adds frequency counts of each value of VAR in INPUT between LO and HI to
50 FREQ_HASH. LO and HI and each input value is truncated to an integer.
51 Returns true if successful, false on input error. It is the caller's
52 responsibility to initialize FREQ_HASH and to free it when no longer
53 required, even on failure. */
55 create_freq_hash_with_range (const struct dictionary *dict,
56 struct casereader *input,
57 const struct variable *var,
58 double lo_, double hi_,
59 struct hmap *freq_hash)
61 struct freq **entries;
67 assert (var_is_numeric (var));
71 /* Populate the hash with zero entries */
72 entries = xnmalloc (hi - lo + 1, sizeof *entries);
73 for (i_d = lo; i_d <= hi; i_d += 1.0)
75 size_t ofs = i_d - lo;
76 union value value = { i_d };
77 entries[ofs] = freq_hmap_insert (freq_hash, &value, 0,
78 value_hash (&value, 0, 0));
81 for (; (c = casereader_read (input)) != NULL; case_unref (c))
83 double x = trunc (case_num (c, var));
84 if (x >= lo && x <= hi)
87 struct freq *fr = entries[ofs];
88 fr->count += dict_get_case_weight (dict, c, &warn);
94 return casereader_destroy (input);
97 /* Adds frequency counts of each value of VAR in INPUT to FREQ_HASH. LO and HI
98 and each input value is truncated to an integer. Returns true if
99 successful, false on input error. It is the caller's responsibility to
100 initialize FREQ_HASH and to free it when no longer required, even on
103 create_freq_hash (const struct dictionary *dict,
104 struct casereader *input,
105 const struct variable *var,
106 struct hmap *freq_hash)
108 int width = var_get_width (var);
112 for (; (c = casereader_read (input)) != NULL; case_unref (c))
114 const union value *value = case_data (c, var);
115 size_t hash = value_hash (value, width, 0);
116 double weight = dict_get_case_weight (dict, c, &warn);
119 f = freq_hmap_search (freq_hash, value, width, hash);
121 f = freq_hmap_insert (freq_hash, value, width, hash);
126 return casereader_destroy (input);
130 chisquare_execute (const struct dataset *ds,
131 struct casereader *input,
132 enum mv_class exclude,
133 const struct npar_test *test,
137 const struct dictionary *dict = dataset_dict (ds);
139 struct chisquare_test *cst = UP_CAST (test, struct chisquare_test,
141 struct one_sample_test *ost = &cst->parent;
142 double total_expected = 0.0;
144 double *df = XCALLOC (ost->n_vars, double);
145 double *xsq = XCALLOC (ost->n_vars, double);
148 for (i = 0 ; i < cst->n_expected ; ++i)
149 total_expected += cst->expected[i];
151 if (cst->ranged == false)
153 for (v = 0 ; v < ost->n_vars ; ++v)
155 const struct variable *var = ost->vars[v];
157 struct hmap freq_hash = HMAP_INITIALIZER (freq_hash);
158 struct casereader *reader =
159 casereader_create_filter_missing (casereader_clone (input),
162 if (!create_freq_hash (dict, reader, var, &freq_hash))
164 freq_hmap_destroy (&freq_hash, var_get_width (var));
168 size_t n_cells = hmap_count (&freq_hash);
169 if (cst->n_expected > 0 && n_cells != cst->n_expected)
171 msg (ME, _("CHISQUARE test specified %d expected values, but "
172 "variable %s has %zu distinct values."),
173 cst->n_expected, var_get_name (var), n_cells);
174 freq_hmap_destroy (&freq_hash, var_get_width (var));
178 struct pivot_table *table = pivot_table_create__ (
179 pivot_value_new_variable (var), "Chisquare");
180 pivot_table_set_weight_var (table, dict_get_weight (dict));
182 pivot_dimension_create (
183 table, PIVOT_AXIS_COLUMN, N_("Statistics"),
184 N_("Observed N"), PIVOT_RC_COUNT,
185 N_("Expected N"), PIVOT_RC_OTHER,
186 N_("Residual"), PIVOT_RC_RESIDUAL);
188 struct freq **ff = freq_hmap_sort (&freq_hash, var_get_width (var));
190 double total_obs = 0.0;
191 for (size_t i = 0; i < n_cells; i++)
192 total_obs += ff[i]->count;
194 struct pivot_dimension *values = pivot_dimension_create (
195 table, PIVOT_AXIS_ROW, N_("Value"));
196 values->root->show_label = true;
199 for (size_t i = 0; i < n_cells; i++)
201 int row = pivot_category_create_leaf (
202 values->root, pivot_value_new_var_value (
203 var, &ff[i]->values[0]));
205 double exp = (cst->n_expected > 0
206 ? cst->expected[i] * total_obs / total_expected
207 : total_obs / (double) n_cells);
213 for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
215 table, j, row, pivot_value_new_number (entries[j]));
217 xsq[v] += (ff[i]->count - exp) * (ff[i]->count - exp) / exp;
220 df[v] = n_cells - 1.0;
222 int row = pivot_category_create_leaf (
223 values->root, pivot_value_new_text (N_("Total")));
224 pivot_table_put2 (table, 0, row,
225 pivot_value_new_number (total_obs));
227 pivot_table_submit (table);
229 freq_hmap_destroy (&freq_hash, var_get_width (var));
233 else /* ranged == true */
235 struct pivot_table *table = pivot_table_create (N_("Frequencies"));
236 pivot_table_set_weight_var (table, dict_get_weight (dict));
238 pivot_dimension_create (
239 table, PIVOT_AXIS_COLUMN, N_("Statistics"),
241 N_("Observed N"), PIVOT_RC_COUNT,
242 N_("Expected N"), PIVOT_RC_OTHER,
243 N_("Residual"), PIVOT_RC_RESIDUAL);
245 struct pivot_dimension *var_dim
246 = pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Variable"));
247 for (size_t i = 0 ; i < ost->n_vars ; ++i)
248 pivot_category_create_leaf (var_dim->root,
249 pivot_value_new_variable (ost->vars[i]));
251 struct pivot_dimension *category_dim
252 = pivot_dimension_create (table, PIVOT_AXIS_ROW, N_("Category"));
253 size_t n_cells = cst->hi - cst->lo + 1;
254 for (size_t i = 0 ; i < n_cells; ++i)
255 pivot_category_create_leaf (category_dim->root,
256 pivot_value_new_integer (i + 1));
257 pivot_category_create_leaves (category_dim->root, N_("Total"));
259 for (size_t v = 0 ; v < ost->n_vars ; ++v)
261 const struct variable *var = ost->vars[v];
262 struct casereader *reader =
263 casereader_create_filter_missing (casereader_clone (input),
266 struct hmap freq_hash = HMAP_INITIALIZER (freq_hash);
267 if (!create_freq_hash_with_range (dict, reader, var,
268 cst->lo, cst->hi, &freq_hash))
270 freq_hmap_destroy (&freq_hash, var_get_width (var));
274 struct freq **ff = freq_hmap_sort (&freq_hash, var_get_width (var));
276 double total_obs = 0.0;
277 for (size_t i = 0 ; i < hmap_count (&freq_hash) ; ++i)
278 total_obs += ff[i]->count;
281 for (size_t i = 0 ; i < hmap_count (&freq_hash) ; ++i)
284 pivot_table_put3 (table, 0, v, i,
285 pivot_value_new_var_value (
286 var, &ff[i]->values[0]));
288 double exp = (cst->n_expected > 0
289 ? cst->expected[i] * total_obs / total_expected
290 : total_obs / (double) hmap_count (&freq_hash));
296 for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
297 pivot_table_put3 (table, j + 1, v, i,
298 pivot_value_new_number (entries[j]));
301 xsq[v] += (ff[i]->count - exp) * (ff[i]->count - exp) / exp;
304 df[v] = n_cells - 1.0;
306 freq_hmap_destroy (&freq_hash, var_get_width (var));
309 pivot_table_put3 (table, 1, v, n_cells,
310 pivot_value_new_number (total_obs));
313 pivot_table_submit (table);
315 ok = !taint_has_tainted_successor (casereader_get_taint (input));
316 casereader_destroy (input);
320 struct pivot_table *table = pivot_table_create (N_("Test Statistics"));
322 pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"),
323 N_("Chi-square"), PIVOT_RC_OTHER,
324 N_("df"), PIVOT_RC_INTEGER,
325 N_("Asymp. Sig."), PIVOT_RC_SIGNIFICANCE);
327 struct pivot_dimension *variables = pivot_dimension_create (
328 table, PIVOT_AXIS_ROW, N_("Variable"));
330 for (size_t v = 0 ; v < ost->n_vars ; ++v)
332 const struct variable *var = ost->vars[v];
334 int row = pivot_category_create_leaf (
335 variables->root, pivot_value_new_variable (var));
337 double sig = gsl_cdf_chisq_Q (xsq[v], df[v]);
338 double entries[] = { xsq[v], df[v], sig };
339 for (size_t i = 0; i < sizeof entries / sizeof *entries; i++)
340 pivot_table_put2 (table, i, row,
341 pivot_value_new_number (entries[i]));
343 pivot_table_submit (table);