--- /dev/null
+/* PSPP - a program for statistical analysis.
+ Copyright (C) 2006, 2007, 2009, 2010, 2011 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include "language/commands/chisquare.h"
+
+#include <gsl/gsl_cdf.h>
+#include <math.h>
+#include <stdlib.h>
+
+#include "data/case.h"
+#include "data/casereader.h"
+#include "data/dataset.h"
+#include "data/dictionary.h"
+#include "data/format.h"
+#include "data/value-labels.h"
+#include "data/variable.h"
+#include "language/commands/freq.h"
+#include "language/commands/npar.h"
+#include "libpspp/array.h"
+#include "libpspp/assertion.h"
+#include "libpspp/cast.h"
+#include "libpspp/compiler.h"
+#include "libpspp/hash-functions.h"
+#include "libpspp/message.h"
+#include "libpspp/taint.h"
+#include "output/pivot-table.h"
+
+#include "gl/xalloc.h"
+
+#include "gettext.h"
+#define N_(msgid) msgid
+#define _(msgid) gettext (msgid)
+
+/* Adds frequency counts of each value of VAR in INPUT between LO and HI to
+ FREQ_HASH. LO and HI and each input value is truncated to an integer.
+ Returns true if successful, false on input error. It is the caller's
+ responsibility to initialize FREQ_HASH and to free it when no longer
+ required, even on failure. */
+static bool
+create_freq_hash_with_range (const struct dictionary *dict,
+ struct casereader *input,
+ const struct variable *var,
+ double lo_, double hi_,
+ struct hmap *freq_hash)
+{
+ struct freq **entries;
+ bool warn = true;
+ struct ccase *c;
+ double lo, hi;
+ double i_d;
+
+ assert (var_is_numeric (var));
+ lo = trunc (lo_);
+ hi = trunc (hi_);
+
+ /* Populate the hash with zero entries */
+ entries = xnmalloc (hi - lo + 1, sizeof *entries);
+ for (i_d = lo; i_d <= hi; i_d += 1.0)
+ {
+ size_t ofs = i_d - lo;
+ union value value = { i_d };
+ entries[ofs] = freq_hmap_insert (freq_hash, &value, 0,
+ value_hash (&value, 0, 0));
+ }
+
+ for (; (c = casereader_read (input)) != NULL; case_unref (c))
+ {
+ double x = trunc (case_num (c, var));
+ if (x >= lo && x <= hi)
+ {
+ size_t ofs = x - lo;
+ struct freq *fr = entries[ofs];
+ fr->count += dict_get_case_weight (dict, c, &warn);
+ }
+ }
+
+ free (entries);
+
+ return casereader_destroy (input);
+}
+
+/* Adds frequency counts of each value of VAR in INPUT to FREQ_HASH. LO and HI
+ and each input value is truncated to an integer. Returns true if
+ successful, false on input error. It is the caller's responsibility to
+ initialize FREQ_HASH and to free it when no longer required, even on
+ failure. */
+static bool
+create_freq_hash (const struct dictionary *dict,
+ struct casereader *input,
+ const struct variable *var,
+ struct hmap *freq_hash)
+{
+ int width = var_get_width (var);
+ bool warn = true;
+ struct ccase *c;
+
+ for (; (c = casereader_read (input)) != NULL; case_unref (c))
+ {
+ const union value *value = case_data (c, var);
+ size_t hash = value_hash (value, width, 0);
+ double weight = dict_get_case_weight (dict, c, &warn);
+ struct freq *f;
+
+ f = freq_hmap_search (freq_hash, value, width, hash);
+ if (f == NULL)
+ f = freq_hmap_insert (freq_hash, value, width, hash);
+
+ f->count += weight;
+ }
+
+ return casereader_destroy (input);
+}
+
+void
+chisquare_execute (const struct dataset *ds,
+ struct casereader *input,
+ enum mv_class exclude,
+ const struct npar_test *test,
+ bool exact UNUSED,
+ double timer UNUSED)
+{
+ const struct dictionary *dict = dataset_dict (ds);
+ int v, i;
+ struct chisquare_test *cst = UP_CAST (test, struct chisquare_test,
+ parent.parent);
+ struct one_sample_test *ost = &cst->parent;
+ double total_expected = 0.0;
+
+ double *df = XCALLOC (ost->n_vars, double);
+ double *xsq = XCALLOC (ost->n_vars, double);
+ bool ok;
+
+ for (i = 0 ; i < cst->n_expected ; ++i)
+ total_expected += cst->expected[i];
+
+ if (cst->ranged == false)
+ {
+ for (v = 0 ; v < ost->n_vars ; ++v)
+ {
+ const struct variable *var = ost->vars[v];
+
+ struct hmap freq_hash = HMAP_INITIALIZER (freq_hash);
+ struct casereader *reader =
+ casereader_create_filter_missing (casereader_clone (input),
+ &var, 1, exclude,
+ NULL, NULL);
+ if (!create_freq_hash (dict, reader, var, &freq_hash))
+ {
+ freq_hmap_destroy (&freq_hash, var_get_width (var));
+ return;
+ }
+
+ size_t n_cells = hmap_count (&freq_hash);
+ if (cst->n_expected > 0 && n_cells != cst->n_expected)
+ {
+ msg (ME, _("CHISQUARE test specified %d expected values, but "
+ "variable %s has %zu distinct values."),
+ cst->n_expected, var_get_name (var), n_cells);
+ freq_hmap_destroy (&freq_hash, var_get_width (var));
+ continue;
+ }
+
+ struct pivot_table *table = pivot_table_create__ (
+ pivot_value_new_variable (var), "Chisquare");
+ pivot_table_set_weight_var (table, dict_get_weight (dict));
+
+ pivot_dimension_create (
+ table, PIVOT_AXIS_COLUMN, N_("Statistics"),
+ N_("Observed N"), PIVOT_RC_COUNT,
+ N_("Expected N"), PIVOT_RC_OTHER,
+ N_("Residual"), PIVOT_RC_RESIDUAL);
+
+ struct freq **ff = freq_hmap_sort (&freq_hash, var_get_width (var));
+
+ double total_obs = 0.0;
+ for (size_t i = 0; i < n_cells; i++)
+ total_obs += ff[i]->count;
+
+ struct pivot_dimension *values = pivot_dimension_create (
+ table, PIVOT_AXIS_ROW, N_("Value"));
+ values->root->show_label = true;
+
+ xsq[v] = 0.0;
+ for (size_t i = 0; i < n_cells; i++)
+ {
+ int row = pivot_category_create_leaf (
+ values->root, pivot_value_new_var_value (
+ var, &ff[i]->values[0]));
+
+ double exp = (cst->n_expected > 0
+ ? cst->expected[i] * total_obs / total_expected
+ : total_obs / (double) n_cells);
+ double entries[] = {
+ ff[i]->count,
+ exp,
+ ff[i]->count - exp,
+ };
+ for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
+ pivot_table_put2 (
+ table, j, row, pivot_value_new_number (entries[j]));
+
+ xsq[v] += (ff[i]->count - exp) * (ff[i]->count - exp) / exp;
+ }
+
+ df[v] = n_cells - 1.0;
+
+ int row = pivot_category_create_leaf (
+ values->root, pivot_value_new_text (N_("Total")));
+ pivot_table_put2 (table, 0, row,
+ pivot_value_new_number (total_obs));
+
+ pivot_table_submit (table);
+
+ freq_hmap_destroy (&freq_hash, var_get_width (var));
+ free (ff);
+ }
+ }
+ else /* ranged == true */
+ {
+ struct pivot_table *table = pivot_table_create (N_("Frequencies"));
+ pivot_table_set_weight_var (table, dict_get_weight (dict));
+
+ pivot_dimension_create (
+ table, PIVOT_AXIS_COLUMN, N_("Statistics"),
+ N_("Category"),
+ N_("Observed N"), PIVOT_RC_COUNT,
+ N_("Expected N"), PIVOT_RC_OTHER,
+ N_("Residual"), PIVOT_RC_RESIDUAL);
+
+ struct pivot_dimension *var_dim
+ = pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Variable"));
+ for (size_t i = 0 ; i < ost->n_vars ; ++i)
+ pivot_category_create_leaf (var_dim->root,
+ pivot_value_new_variable (ost->vars[i]));
+
+ struct pivot_dimension *category_dim
+ = pivot_dimension_create (table, PIVOT_AXIS_ROW, N_("Category"));
+ size_t n_cells = cst->hi - cst->lo + 1;
+ for (size_t i = 0 ; i < n_cells; ++i)
+ pivot_category_create_leaf (category_dim->root,
+ pivot_value_new_integer (i + 1));
+ pivot_category_create_leaves (category_dim->root, N_("Total"));
+
+ for (size_t v = 0 ; v < ost->n_vars ; ++v)
+ {
+ const struct variable *var = ost->vars[v];
+ struct casereader *reader =
+ casereader_create_filter_missing (casereader_clone (input),
+ &var, 1, exclude,
+ NULL, NULL);
+ struct hmap freq_hash = HMAP_INITIALIZER (freq_hash);
+ if (!create_freq_hash_with_range (dict, reader, var,
+ cst->lo, cst->hi, &freq_hash))
+ {
+ freq_hmap_destroy (&freq_hash, var_get_width (var));
+ continue;
+ }
+
+ struct freq **ff = freq_hmap_sort (&freq_hash, var_get_width (var));
+
+ double total_obs = 0.0;
+ for (size_t i = 0 ; i < hmap_count (&freq_hash) ; ++i)
+ total_obs += ff[i]->count;
+
+ xsq[v] = 0.0;
+ for (size_t i = 0 ; i < hmap_count (&freq_hash) ; ++i)
+ {
+ /* Category. */
+ pivot_table_put3 (table, 0, v, i,
+ pivot_value_new_var_value (
+ var, &ff[i]->values[0]));
+
+ double exp = (cst->n_expected > 0
+ ? cst->expected[i] * total_obs / total_expected
+ : total_obs / (double) hmap_count (&freq_hash));
+ double entries[] = {
+ ff[i]->count,
+ exp,
+ ff[i]->count - exp,
+ };
+ for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
+ pivot_table_put3 (table, j + 1, v, i,
+ pivot_value_new_number (entries[j]));
+
+
+ xsq[v] += (ff[i]->count - exp) * (ff[i]->count - exp) / exp;
+ }
+
+ df[v] = n_cells - 1.0;
+
+ freq_hmap_destroy (&freq_hash, var_get_width (var));
+ free (ff);
+
+ pivot_table_put3 (table, 1, v, n_cells,
+ pivot_value_new_number (total_obs));
+ }
+
+ pivot_table_submit (table);
+ }
+ ok = !taint_has_tainted_successor (casereader_get_taint (input));
+ casereader_destroy (input);
+
+ if (ok)
+ {
+ struct pivot_table *table = pivot_table_create (N_("Test Statistics"));
+
+ pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"),
+ N_("Chi-square"), PIVOT_RC_OTHER,
+ N_("df"), PIVOT_RC_INTEGER,
+ N_("Asymp. Sig."), PIVOT_RC_SIGNIFICANCE);
+
+ struct pivot_dimension *variables = pivot_dimension_create (
+ table, PIVOT_AXIS_ROW, N_("Variable"));
+
+ for (size_t v = 0 ; v < ost->n_vars ; ++v)
+ {
+ const struct variable *var = ost->vars[v];
+
+ int row = pivot_category_create_leaf (
+ variables->root, pivot_value_new_variable (var));
+
+ double sig = gsl_cdf_chisq_Q (xsq[v], df[v]);
+ double entries[] = { xsq[v], df[v], sig };
+ for (size_t i = 0; i < sizeof entries / sizeof *entries; i++)
+ pivot_table_put2 (table, i, row,
+ pivot_value_new_number (entries[i]));
+ }
+ pivot_table_submit (table);
+ }
+
+ free (xsq);
+ free (df);
+}
+