/* PSPP - a program for statistical analysis.
- Copyright (C) 2006, 2007 Free Software Foundation, Inc.
+ Copyright (C) 2006, 2007, 2009, 2010, 2011 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
#include <config.h>
-#include <language/stats/chisquare.h>
-
-#include <stdlib.h>
-#include <math.h>
-
-#include <data/case.h>
-#include <data/casereader.h>
-#include <data/dictionary.h>
-#include <data/procedure.h>
-#include <data/value-labels.h>
-#include <data/variable.h>
-#include <language/stats/freq.h>
-#include <language/stats/npar.h>
-#include <libpspp/assertion.h>
-#include <libpspp/compiler.h>
-#include <libpspp/hash.h>
-#include <libpspp/message.h>
-#include <libpspp/taint.h>
-#include <output/table.h>
+#include "language/stats/chisquare.h"
#include <gsl/gsl_cdf.h>
+#include <math.h>
+#include <stdlib.h>
-#include "xalloc.h"
+#include "data/case.h"
+#include "data/casereader.h"
+#include "data/dataset.h"
+#include "data/dictionary.h"
+#include "data/format.h"
+#include "data/value-labels.h"
+#include "data/variable.h"
+#include "language/stats/freq.h"
+#include "language/stats/npar.h"
+#include "libpspp/array.h"
+#include "libpspp/assertion.h"
+#include "libpspp/cast.h"
+#include "libpspp/compiler.h"
+#include "libpspp/hash-functions.h"
+#include "libpspp/message.h"
+#include "libpspp/taint.h"
+#include "output/pivot-table.h"
+
+#include "gl/xalloc.h"
#include "gettext.h"
+#define N_(msgid) msgid
#define _(msgid) gettext (msgid)
-/* Return a hash table containing the frequency counts of each
- value of VAR in CF .
- It is the caller's responsibility to free the hash table when
- no longer required.
-*/
-static struct hsh_table *
+/* Adds frequency counts of each value of VAR in INPUT between LO_ and HI_ to
+ FREQ_HASH. LO_ and HI_ and each input value are truncated to integers
+ before use, and input values whose truncated value falls outside the
+ truncated range are ignored. An entry is inserted into FREQ_HASH for every
+ integer in the range before INPUT is read, so every integer in the range
+ has an entry even if INPUT never contains it. VAR must be numeric. Each
+ case contributes its weight, obtained from DICT, rather than a count of 1.
+
+ This function destroys INPUT. Returns true if successful, false on input
+ error. It is the caller's responsibility to initialize FREQ_HASH and to
+ free it when no longer required, even on failure. */
+static bool
create_freq_hash_with_range (const struct dictionary *dict,
struct casereader *input,
const struct variable *var,
- double lo,
- double hi)
+ double lo_, double hi_,
+ struct hmap *freq_hash)
{
+ struct freq **entries;
bool warn = true;
- float i_d;
- struct ccase c;
+ struct ccase *c;
+ double lo, hi;
+ double i_d;
- struct hsh_table *freq_hash =
- hsh_create (4, compare_freq, hash_freq,
- free_freq_mutable_hash,
- (void *) var);
+ assert (var_is_numeric (var));
+ lo = trunc (lo_);
+ hi = trunc (hi_);
/* Populate the hash with zero entries */
- for (i_d = trunc (lo); i_d <= trunc (hi); i_d += 1.0 )
+ entries = xnmalloc (hi - lo + 1, sizeof *entries); /* One slot per integer in [lo, hi]. */
+ for (i_d = lo; i_d <= hi; i_d += 1.0)
{
- union value the_value;
- struct freq_mutable *fr = xmalloc (sizeof (*fr));
-
- the_value.f = i_d;
-
- fr->value = value_dup (&the_value, 0);
- fr->count = 0;
-
- hsh_insert (freq_hash, fr);
+ size_t ofs = i_d - lo; /* Index of I_D's entry within ENTRIES. */
+ union value value = { i_d };
+ entries[ofs] = freq_hmap_insert (freq_hash, &value, 0,
+ value_hash (&value, 0, 0));
}
- while (casereader_read (input, &c))
+ for (; (c = casereader_read (input)) != NULL; case_unref (c))
{
- union value obs_value;
- struct freq **existing_fr;
- struct freq *fr = xmalloc(sizeof (*fr));
- fr->value = case_data (&c, var);
-
- fr->count = dict_get_case_weight (dict, &c, &warn);
-
- obs_value.f = trunc (fr->value->f);
-
- if ( obs_value.f < lo || obs_value.f > hi)
- {
- free (fr);
- case_destroy (&c);
- continue;
- }
-
- fr->value = &obs_value;
-
- existing_fr = (struct freq **) hsh_probe (freq_hash, fr);
-
- /* This must exist in the hash, because we previously populated it
- with zero counts */
- assert (*existing_fr);
+ double x = trunc (case_num (c, var));
+ if (x >= lo && x <= hi)
+ {
+ size_t ofs = x - lo;
+ struct freq *fr = entries[ofs];
+ fr->count += dict_get_case_weight (dict, c, &warn);
+ }
+ }
- (*existing_fr)->count += fr->count;
- free (fr);
+ free (entries); /* Frees only the lookup array; the entries stay in FREQ_HASH. */
- case_destroy (&c);
- }
- if (casereader_destroy (input))
- return freq_hash;
- else
- {
- hsh_destroy (freq_hash);
- return NULL;
- }
+ return casereader_destroy (input);
}
-
-/* Return a hash table containing the frequency counts of each
- value of VAR in INPUT .
- It is the caller's responsibility to free the hash table when
- no longer required.
-*/
-static struct hsh_table *
+/* Adds frequency counts of each distinct value of VAR in INPUT to FREQ_HASH.
+ Unlike create_freq_hash_with_range, there is no range and values are not
+ truncated: each value is looked up as-is using VAR's width, and an entry
+ is inserted the first time a value is seen. Each case contributes its
+ weight, obtained from DICT, rather than a count of 1.
+
+ This function destroys INPUT. Returns true if successful, false on input
+ error. It is the caller's responsibility to initialize FREQ_HASH and to
+ free it when no longer required, even on failure. */
+static bool
create_freq_hash (const struct dictionary *dict,
struct casereader *input,
- const struct variable *var)
+ const struct variable *var,
+ struct hmap *freq_hash)
{
+ int width = var_get_width (var);
bool warn = true;
- struct ccase c;
-
- struct hsh_table *freq_hash =
- hsh_create (4, compare_freq, hash_freq,
- free_freq_mutable_hash,
- (void *) var);
+ struct ccase *c;
- for (; casereader_read (input, &c); case_destroy (&c))
+ for (; (c = casereader_read (input)) != NULL; case_unref (c))
{
- struct freq **existing_fr;
- struct freq *fr = xmalloc(sizeof (*fr));
- fr->value = case_data (&c, var);
+ const union value *value = case_data (c, var);
+ size_t hash = value_hash (value, width, 0);
+ double weight = dict_get_case_weight (dict, c, &warn);
+ struct freq *f;
- fr->count = dict_get_case_weight (dict, &c, &warn);
+ f = freq_hmap_search (freq_hash, value, width, hash);
+ if (f == NULL) /* First time this value is seen. */
+ f = freq_hmap_insert (freq_hash, value, width, hash);
- existing_fr = (struct freq **) hsh_probe (freq_hash, fr);
- if ( *existing_fr)
- {
- (*existing_fr)->count += fr->count;
- free (fr);
- }
- else
- {
- *existing_fr = fr;
- fr->value = value_dup (fr->value, var_get_width (var));
- }
+ f->count += weight;
}
- if (casereader_destroy (input))
- return freq_hash;
- else
- {
- hsh_destroy (freq_hash);
- return NULL;
- }
-}
-
-
-
-static struct tab_table *
-create_variable_frequency_table (const struct dictionary *dict,
- struct casereader *input,
- const struct chisquare_test *test,
- int v,
- struct hsh_table **freq_hash)
-
-{
- int i;
- const struct one_sample_test *ost = (const struct one_sample_test*)test;
- int n_cells;
- struct tab_table *table ;
- const struct variable *var = ost->vars[v];
-
- *freq_hash = create_freq_hash (dict, input, var);
- if (*freq_hash == NULL)
- return NULL;
-
- n_cells = hsh_count (*freq_hash);
-
- if ( test->n_expected > 0 && n_cells != test->n_expected )
- {
- msg(ME, _("CHISQUARE test specified %d expected values, but"
- " %d distinct values were encountered in variable %s."),
- test->n_expected, n_cells,
- var_get_name (var)
- );
- hsh_destroy (*freq_hash);
- *freq_hash = NULL;
- return NULL;
- }
-
- table = tab_create(4, n_cells + 2, 0);
- tab_dim (table, tab_natural_dimensions);
-
- tab_title (table, var_to_string(var));
- tab_text (table, 1, 0, TAB_LEFT, _("Observed N"));
- tab_text (table, 2, 0, TAB_LEFT, _("Expected N"));
- tab_text (table, 3, 0, TAB_LEFT, _("Residual"));
-
- tab_headers (table, 1, 0, 1, 0);
- tab_box (table, TAL_1, TAL_1, -1, -1,
- 0, 0, table->nc - 1, tab_nr(table) - 1 );
-
- tab_hline (table, TAL_1, 0, tab_nc(table) - 1, 1);
-
- tab_vline (table, TAL_2, 1, 0, tab_nr(table) - 1);
- for ( i = 2 ; i < 4 ; ++i )
- tab_vline (table, TAL_1, i, 0, tab_nr(table) - 1);
-
-
- tab_text (table, 0, table->nr - 1, TAB_LEFT, _("Total"));
-
- return table;
+ return casereader_destroy (input);
}
-
-static struct tab_table *
-create_combo_frequency_table (const struct chisquare_test *test)
-{
- int i;
- const struct one_sample_test *ost = (const struct one_sample_test*)test;
-
- struct tab_table *table ;
-
- int n_cells = test->hi - test->lo + 1;
-
- table = tab_create(1 + ost->n_vars * 4, n_cells + 3, 0);
- tab_dim (table, tab_natural_dimensions);
-
- tab_title (table, _("Frequencies"));
- for ( i = 0 ; i < ost->n_vars ; ++i )
- {
- const struct variable *var = ost->vars[i];
- tab_text (table, i * 4 + 1, 1, TAB_LEFT, _("Category"));
- tab_text (table, i * 4 + 2, 1, TAB_LEFT, _("Observed N"));
- tab_text (table, i * 4 + 3, 1, TAB_LEFT, _("Expected N"));
- tab_text (table, i * 4 + 4, 1, TAB_LEFT, _("Residual"));
-
- tab_vline (table, TAL_2, i * 4 + 1,
- 0, tab_nr (table) - 1);
-
- tab_vline (table, TAL_1, i * 4 + 2,
- 0, tab_nr (table) - 1);
-
- tab_vline (table, TAL_1, i * 4 + 3,
- 1, tab_nr (table) - 1);
-
- tab_vline (table, TAL_1, i * 4 + 4,
- 1, tab_nr (table) - 1);
-
-
- tab_joint_text (table,
- i * 4 + 1, 0,
- i * 4 + 4, 0,
- TAB_CENTER,
- var_to_string (var));
- }
-
- for ( i = test->lo ; i <= test->hi ; ++i )
- tab_float (table, 0, 2 + i - test->lo,
- TAB_LEFT, 1 + i - test->lo, 8, 0);
-
- tab_headers (table, 1, 0, 2, 0);
-
- tab_box (table, TAL_1, TAL_1, -1, -1,
- 0, 0, table->nc - 1, tab_nr(table) - 1 );
-
- tab_hline (table, TAL_1, 1, tab_nc(table) - 1, 1);
- tab_hline (table, TAL_1, 0, tab_nc(table) - 1, 2);
-
- tab_text (table, 0, table->nr - 1, TAB_LEFT, _("Total"));
-
- return table;
-}
-
-
-static struct tab_table *
-create_stats_table (const struct chisquare_test *test)
-{
- const struct one_sample_test *ost = (const struct one_sample_test*) test;
-
- struct tab_table *table;
- table = tab_create (1 + ost->n_vars, 4, 0);
- tab_dim (table, tab_natural_dimensions);
- tab_title (table, _("Test Statistics"));
- tab_headers (table, 1, 0, 1, 0);
-
- tab_box (table, TAL_1, TAL_1, -1, -1,
- 0, 0, tab_nc(table) - 1, tab_nr(table) - 1 );
-
- tab_box (table, -1, -1, -1, TAL_1,
- 1, 0, tab_nc(table) - 1, tab_nr(table) - 1 );
-
-
- tab_vline (table, TAL_2, 1, 0, tab_nr (table) - 1);
- tab_hline (table, TAL_1, 0, tab_nc (table) - 1, 1);
-
-
- tab_text (table, 0, 1, TAB_LEFT, _("Chi-Square"));
- tab_text (table, 0, 2, TAB_LEFT, _("df"));
- tab_text (table, 0, 3, TAB_LEFT, _("Asymp. Sig."));
-
- return table;
-}
-
-
void
chisquare_execute (const struct dataset *ds,
struct casereader *input,
enum mv_class exclude,
- const struct npar_test *test)
+ const struct npar_test *test,
+ bool exact UNUSED,
+ double timer UNUSED)
{
const struct dictionary *dict = dataset_dict (ds);
int v, i;
- struct one_sample_test *ost = (struct one_sample_test *) test;
- struct chisquare_test *cst = (struct chisquare_test *) test;
- int n_cells = 0;
+ struct chisquare_test *cst = UP_CAST (test, struct chisquare_test,
+ parent.parent);
+ struct one_sample_test *ost = &cst->parent;
double total_expected = 0.0;
- double *df = xzalloc (sizeof (*df) * ost->n_vars);
- double *xsq = xzalloc (sizeof (*df) * ost->n_vars);
+ double *df = XCALLOC (ost->n_vars, double);
+ double *xsq = XCALLOC (ost->n_vars, double);
bool ok;
- for ( i = 0 ; i < cst->n_expected ; ++i )
+ for (i = 0 ; i < cst->n_expected ; ++i)
total_expected += cst->expected[i];
- if ( cst->ranged == false )
+ if (cst->ranged == false)
{
- for ( v = 0 ; v < ost->n_vars ; ++v )
+ for (v = 0 ; v < ost->n_vars ; ++v)
{
- double total_obs = 0.0;
- struct hsh_table *freq_hash = NULL;
+ const struct variable *var = ost->vars[v];
+
+ struct hmap freq_hash = HMAP_INITIALIZER (freq_hash);
struct casereader *reader =
casereader_create_filter_missing (casereader_clone (input),
- &ost->vars[v], 1, exclude, NULL);
- struct tab_table *freq_table =
- create_variable_frequency_table(dict, reader, cst, v, &freq_hash);
+ &var, 1, exclude,
+ NULL, NULL);
+ if (!create_freq_hash (dict, reader, var, &freq_hash))
+ {
+ freq_hmap_destroy (&freq_hash, var_get_width (var));
+ return;
+ }
+
+ size_t n_cells = hmap_count (&freq_hash);
+ if (cst->n_expected > 0 && n_cells != cst->n_expected)
+ {
+ msg (ME, _("CHISQUARE test specified %d expected values, but "
+ "variable %s has %zu distinct values."),
+ cst->n_expected, var_get_name (var), n_cells);
+ freq_hmap_destroy (&freq_hash, var_get_width (var));
+ continue;
+ }
+
+ struct pivot_table *table = pivot_table_create__ (
+ pivot_value_new_variable (var), "Chisquare");
+ pivot_table_set_weight_var (table, dict_get_weight (dict));
+
+ pivot_dimension_create (
+ table, PIVOT_AXIS_COLUMN, N_("Statistics"),
+ N_("Observed N"), PIVOT_RC_COUNT,
+ N_("Expected N"), PIVOT_RC_OTHER,
+ N_("Residual"), PIVOT_RC_RESIDUAL);
+
+ struct freq **ff = freq_hmap_sort (&freq_hash, var_get_width (var));
- struct freq **ff;
-
- if ( NULL == freq_table )
- continue;
- ff = (struct freq **) hsh_sort (freq_hash);
-
- n_cells = hsh_count (freq_hash);
-
- for ( i = 0 ; i < n_cells ; ++i )
+ double total_obs = 0.0;
+ for (size_t i = 0; i < n_cells; i++)
total_obs += ff[i]->count;
+ struct pivot_dimension *values = pivot_dimension_create (
+ table, PIVOT_AXIS_ROW, N_("Value"));
+ values->root->show_label = true;
+
xsq[v] = 0.0;
- for ( i = 0 ; i < n_cells ; ++i )
+ for (size_t i = 0; i < n_cells; i++)
{
- double exp;
- const union value *observed_value = ff[i]->value;
-
- /* The key */
- tab_text (freq_table, 0, i + 1, TAB_LEFT,
- var_get_value_name (ost->vars[v], observed_value));
-
- /* The observed N */
- tab_float (freq_table, 1, i + 1, TAB_NONE,
- ff[i]->count, 8, 0);
-
- if ( cst->n_expected > 0 )
- exp = cst->expected[i] * total_obs / total_expected ;
- else
- exp = total_obs / (double) n_cells;
-
- tab_float (freq_table, 2, i + 1, TAB_NONE,
- exp, 8, 2);
-
- /* The residual */
- tab_float (freq_table, 3, i + 1, TAB_NONE,
- ff[i]->count - exp, 8, 2);
+ int row = pivot_category_create_leaf (
+ values->root, pivot_value_new_var_value (
+ var, &ff[i]->values[0]));
+
+ double exp = (cst->n_expected > 0
+ ? cst->expected[i] * total_obs / total_expected
+ : total_obs / (double) n_cells);
+ double entries[] = {
+ ff[i]->count,
+ exp,
+ ff[i]->count - exp,
+ };
+ for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
+ pivot_table_put2 (
+ table, j, row, pivot_value_new_number (entries[j]));
xsq[v] += (ff[i]->count - exp) * (ff[i]->count - exp) / exp;
}
df[v] = n_cells - 1.0;
- tab_float (freq_table, 1, i + 1, TAB_NONE,
- total_obs, 8, 0);
+ int row = pivot_category_create_leaf (
+ values->root, pivot_value_new_text (N_("Total")));
+ pivot_table_put2 (table, 0, row,
+ pivot_value_new_number (total_obs));
- tab_submit (freq_table);
+ pivot_table_submit (table);
- hsh_destroy (freq_hash);
+ freq_hmap_destroy (&freq_hash, var_get_width (var));
+ free (ff);
}
}
else /* ranged == true */
{
- struct tab_table *freq_table = create_combo_frequency_table (cst);
-
- n_cells = cst->hi - cst->lo + 1;
-
- for ( v = 0 ; v < ost->n_vars ; ++v )
+ struct pivot_table *table = pivot_table_create (N_("Frequencies"));
+ pivot_table_set_weight_var (table, dict_get_weight (dict));
+
+ pivot_dimension_create (
+ table, PIVOT_AXIS_COLUMN, N_("Statistics"),
+ N_("Category"),
+ N_("Observed N"), PIVOT_RC_COUNT,
+ N_("Expected N"), PIVOT_RC_OTHER,
+ N_("Residual"), PIVOT_RC_RESIDUAL);
+
+ struct pivot_dimension *var_dim
+ = pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Variable"));
+ for (size_t i = 0 ; i < ost->n_vars ; ++i)
+ pivot_category_create_leaf (var_dim->root,
+ pivot_value_new_variable (ost->vars[i]));
+
+ struct pivot_dimension *category_dim
+ = pivot_dimension_create (table, PIVOT_AXIS_ROW, N_("Category"));
+ size_t n_cells = cst->hi - cst->lo + 1;
+ for (size_t i = 0 ; i < n_cells; ++i)
+ pivot_category_create_leaf (category_dim->root,
+ pivot_value_new_integer (i + 1));
+ pivot_category_create_leaves (category_dim->root, N_("Total"));
+
+ for (size_t v = 0 ; v < ost->n_vars ; ++v)
{
- double total_obs = 0.0;
+ const struct variable *var = ost->vars[v];
struct casereader *reader =
casereader_create_filter_missing (casereader_clone (input),
- &ost->vars[v], 1, exclude, NULL);
- struct hsh_table *freq_hash =
- create_freq_hash_with_range (dict, reader,
- ost->vars[v], cst->lo, cst->hi);
-
- struct freq **ff;
-
- if (freq_hash == NULL)
- continue;
-
- ff = (struct freq **) hsh_sort (freq_hash);
- assert ( n_cells == hsh_count (freq_hash));
-
- for ( i = 0 ; i < hsh_count (freq_hash) ; ++i )
+ &var, 1, exclude,
+ NULL, NULL);
+ struct hmap freq_hash = HMAP_INITIALIZER (freq_hash);
+ if (!create_freq_hash_with_range (dict, reader, var,
+ cst->lo, cst->hi, &freq_hash))
+ {
+ freq_hmap_destroy (&freq_hash, var_get_width (var));
+ continue;
+ }
+
+ struct freq **ff = freq_hmap_sort (&freq_hash, var_get_width (var));
+
+ double total_obs = 0.0;
+ for (size_t i = 0 ; i < hmap_count (&freq_hash) ; ++i)
total_obs += ff[i]->count;
xsq[v] = 0.0;
- for ( i = 0 ; i < hsh_count (freq_hash) ; ++i )
+ for (size_t i = 0 ; i < hmap_count (&freq_hash) ; ++i)
{
- double exp;
-
- const union value *observed_value = ff[i]->value;
-
- /* The key */
- tab_text (freq_table, v * 4 + 1, i + 2 , TAB_LEFT,
- var_get_value_name (ost->vars[v], observed_value));
-
- /* The observed N */
- tab_float (freq_table, v * 4 + 2, i + 2 , TAB_NONE,
- ff[i]->count, 8, 0);
-
- if ( cst->n_expected > 0 )
- exp = cst->expected[i] * total_obs / total_expected ;
- else
- exp = total_obs / (double) hsh_count (freq_hash);
+ /* Category. */
+ pivot_table_put3 (table, 0, v, i,
+ pivot_value_new_var_value (
+ var, &ff[i]->values[0]));
+
+ double exp = (cst->n_expected > 0
+ ? cst->expected[i] * total_obs / total_expected
+ : total_obs / (double) hmap_count (&freq_hash));
+ double entries[] = {
+ ff[i]->count,
+ exp,
+ ff[i]->count - exp,
+ };
+ for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
+ pivot_table_put3 (table, j + 1, v, i,
+ pivot_value_new_number (entries[j]));
- /* The expected N */
- tab_float (freq_table, v * 4 + 3, i + 2 , TAB_NONE,
- exp, 8, 2);
-
- /* The residual */
- tab_float (freq_table, v * 4 + 4, i + 2 , TAB_NONE,
- ff[i]->count - exp, 8, 2);
xsq[v] += (ff[i]->count - exp) * (ff[i]->count - exp) / exp;
}
-
- tab_float (freq_table, v * 4 + 2, tab_nr (freq_table) - 1, TAB_NONE,
- total_obs, 8, 0);
-
df[v] = n_cells - 1.0;
- hsh_destroy (freq_hash);
+ freq_hmap_destroy (&freq_hash, var_get_width (var));
+ free (ff);
+
+ pivot_table_put3 (table, 1, v, n_cells,
+ pivot_value_new_number (total_obs));
}
- tab_submit (freq_table);
+ pivot_table_submit (table);
}
ok = !taint_has_tainted_successor (casereader_get_taint (input));
casereader_destroy (input);
if (ok)
{
- struct tab_table *stats_table = create_stats_table (cst);
+ struct pivot_table *table = pivot_table_create (N_("Test Statistics"));
- /* Populate the summary statistics table */
- for ( v = 0 ; v < ost->n_vars ; ++v )
+ pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"),
+ N_("Chi-square"), PIVOT_RC_OTHER,
+ N_("df"), PIVOT_RC_INTEGER,
+ N_("Asymp. Sig."), PIVOT_RC_SIGNIFICANCE);
+
+ struct pivot_dimension *variables = pivot_dimension_create (
+ table, PIVOT_AXIS_ROW, N_("Variable"));
+
+ for (size_t v = 0 ; v < ost->n_vars ; ++v)
{
const struct variable *var = ost->vars[v];
- tab_text (stats_table, 1 + v, 0, TAB_CENTER, var_get_name (var));
-
- tab_float (stats_table, 1 + v, 1, TAB_NONE, xsq[v], 8,3);
- tab_float (stats_table, 1 + v, 2, TAB_NONE, df[v], 8,0);
+ int row = pivot_category_create_leaf (
+ variables->root, pivot_value_new_variable (var));
- tab_float (stats_table, 1 + v, 3, TAB_NONE,
- gsl_cdf_chisq_Q (xsq[v], df[v]), 8,3);
+ double sig = gsl_cdf_chisq_Q (xsq[v], df[v]);
+ double entries[] = { xsq[v], df[v], sig };
+ for (size_t i = 0; i < sizeof entries / sizeof *entries; i++)
+ pivot_table_put2 (table, i, row,
+ pivot_value_new_number (entries[i]));
}
- tab_submit (stats_table);
+ pivot_table_submit (table);
}
free (xsq);