#include <data/case.h>
#include <data/casegrouper.h>
#include <data/casereader.h>
+#include <data/dictionary.h>
+#include <data/procedure.h>
+#include <data/value.h>
+
#include <math/covariance.h>
#include <math/categoricals.h>
+#include <math/levene.h>
#include <math/moments.h>
#include <gsl/gsl_matrix.h>
#include <linreg/sweep.h>
#include <language/lexer/value-parser.h>
#include <language/command.h>
-#include <data/procedure.h>
-#include <data/value.h>
-#include <data/dictionary.h>
#include <language/dictionary/split-file.h>
-#include <libpspp/hash.h>
#include <libpspp/taint.h>
-#include <math/group-proc.h>
-#include <math/levene.h>
#include <libpspp/misc.h>
#include <output/tab.h>
#include "gettext.h"
#define _(msgid) gettext (msgid)
+
enum missing_type
{
MISS_LISTWISE,
/* Workspace variable for each dependent variable */
struct per_var_ws
{
+ struct categoricals *cat;
struct covariance *cov;
double sst;
int n_groups;
double mse;
+ double levene_w;
};
struct oneway_workspace
missing values are disregarded */
int actual_number_of_groups;
- /* A hash table containing all the distinct values of the independent
- variable */
- struct hsh_table *group_hash;
-
struct per_var_ws *vws;
/* An array of descriptive data. One for each dependent variable */
\f
-static int
-compare_double_3way (const void *a_, const void *b_, const void *aux UNUSED)
-{
- const double *a = a_;
- const double *b = b_;
- return *a < *b ? -1 : *a > *b;
-}
-
-static unsigned
-do_hash_double (const void *value_, const void *aux UNUSED)
-{
- const double *value = value_;
- return hash_double (*value, 0);
-}
-
-static void
-free_double (void *value_, const void *aux UNUSED)
-{
- double *value = value_;
- free (value);
-}
-
-
-
-static void postcalc (const struct oneway_spec *cmd);
-static void precalc (const struct oneway_spec *cmd);
static struct descriptive_data *
dd_create (const struct variable *var)
return dd;
}
+static void
+dd_destroy (struct descriptive_data *dd)
+{
+ moments1_destroy (dd->mom);
+ free (dd);
+}
static void *
makeit (void *aux1, void *aux2 UNUSED)
struct oneway_workspace ws;
ws.actual_number_of_groups = 0;
- ws.vws = xmalloc (cmd->n_vars * sizeof (*ws.vws));
+ ws.vws = xzalloc (cmd->n_vars * sizeof (*ws.vws));
ws.dd_total = xmalloc (sizeof (struct descriptive_data) * cmd->n_vars);
for (v = 0 ; v < cmd->n_vars; ++v)
for (v = 0; v < cmd->n_vars; ++v)
{
- struct categoricals *cats = categoricals_create (&cmd->indep_var, 1,
+ ws.vws[v].cat = categoricals_create (&cmd->indep_var, 1,
cmd->wv, cmd->exclude,
makeit,
updateit,
cmd->vars[v], ws.dd_total[v]);
ws.vws[v].cov = covariance_2pass_create (1, &cmd->vars[v],
- cats,
+ ws.vws[v].cat,
cmd->wv, cmd->exclude);
}
if (c == NULL)
{
casereader_destroy (input);
- return;
+ goto finish;
}
output_split_file_values (ds, c);
case_unref (c);
taint = taint_clone (casereader_get_taint (input));
- ws.group_hash = hsh_create (4,
- compare_double_3way,
- do_hash_double,
- free_double,
- cmd->indep_var);
-
- precalc (cmd);
-
input = casereader_create_filter_missing (input, &cmd->indep_var, 1,
cmd->exclude, NULL, NULL);
if (cmd->missing_type == MISS_LISTWISE)
cmd->exclude, NULL, NULL);
input = casereader_create_filter_weight (input, dict, NULL, NULL);
+
+ if (cmd->stats & STATS_HOMOGENEITY)
+ for (v = 0; v < cmd->n_vars; ++v)
+ {
+ struct per_var_ws *pvw = &ws.vws[v];
+
+ pvw->levene_w = levene (input, cmd->indep_var, cmd->vars[v], cmd->wv, cmd->exclude);
+ }
+
reader = casereader_clone (input);
for (; (c = casereader_read (reader)) != NULL; case_unref (c))
{
- size_t i;
-
- const double weight = dict_get_case_weight (dict, c, NULL);
-
- const union value *indep_val = case_data (c, cmd->indep_var);
- void **p = hsh_probe (ws.group_hash, &indep_val->f);
- if (*p == NULL)
- {
- double *value = *p = xmalloc (sizeof *value);
- *value = indep_val->f;
- }
+ int i;
for (i = 0; i < cmd->n_vars; ++i)
{
struct per_var_ws *pvw = &ws.vws[i];
const struct variable *v = cmd->vars[i];
const union value *val = case_data (c, v);
- struct group_proc *gp = group_proc_get (cmd->vars[i]);
- struct hsh_table *group_hash = gp->group_hash;
- struct group_statistics *gs;
if ( MISS_ANALYSIS == cmd->missing_type)
{
}
covariance_accumulate_pass1 (pvw->cov, c);
-
- gs = hsh_find (group_hash, indep_val );
-
- if ( ! gs )
- {
- gs = xmalloc (sizeof *gs);
- gs->id = *indep_val;
- gs->sum = 0;
- gs->n = 0;
- gs->ssq = 0;
- gs->sum_diff = 0;
- gs->minimum = DBL_MAX;
- gs->maximum = -DBL_MAX;
-
- hsh_insert ( group_hash, gs );
- }
-
- if (!var_is_value_missing (v, val, cmd->exclude))
- {
- struct group_statistics *totals = &gp->ugs;
-
- totals->n += weight;
- totals->sum += weight * val->f;
- totals->ssq += weight * pow2 (val->f);
-
- if ( val->f < totals->minimum )
- totals->minimum = val->f;
-
- if ( val->f > totals->maximum )
- totals->maximum = val->f;
-
- gs->n += weight;
- gs->sum += weight * val->f;
- gs->ssq += weight * pow2 (val->f);
-
- if ( val->f < gs->minimum )
- gs->minimum = val->f;
-
- if ( val->f > gs->maximum )
- gs->maximum = val->f;
- }
-
- gp->n_groups = hsh_count (group_hash );
}
-
}
casereader_destroy (reader);
reader = casereader_clone (input);
pvw->mse = (pvw->sst - pvw->ssa) / (n - pvw->n_groups);
}
- postcalc (cmd);
-
-
for (v = 0; v < cmd->n_vars; ++v)
{
struct categoricals *cats = covariance_get_categoricals (ws.vws[v].cov);
ws.actual_number_of_groups = categoricals_total (cats);
}
- if ( cmd->stats & STATS_HOMOGENEITY )
- levene (dict, casereader_clone (input), cmd->indep_var,
- cmd->n_vars, cmd->vars, cmd->exclude);
-
casereader_destroy (input);
if (!taint_has_tainted_successor (taint))
output_oneway (cmd, &ws);
taint_destroy (taint);
-}
-/* Pre calculations */
-static void
-precalc (const struct oneway_spec *cmd)
-{
- size_t i = 0;
-
- for (i = 0; i < cmd->n_vars; ++i)
+ finish:
+ for (v = 0; v < cmd->n_vars; ++v)
{
- struct group_proc *gp = group_proc_get (cmd->vars[i]);
- struct group_statistics *totals = &gp->ugs;
-
- /* Create a hash for each of the dependent variables.
- The hash contains a group_statistics structure,
- and is keyed by value of the independent variable */
-
- gp->group_hash = hsh_create (4, compare_group, hash_group,
- (hsh_free_func *) free_group,
- cmd->indep_var);
-
- totals->sum = 0;
- totals->n = 0;
- totals->ssq = 0;
- totals->sum_diff = 0;
- totals->maximum = -DBL_MAX;
- totals->minimum = DBL_MAX;
+ covariance_destroy (ws.vws[v].cov);
+ dd_destroy (ws.dd_total[v]);
}
-}
+ free (ws.vws);
+ free (ws.dd_total);
-/* Post calculations for the ONEWAY command */
-static void
-postcalc (const struct oneway_spec *cmd)
-{
- size_t i = 0;
-
- for (i = 0; i < cmd->n_vars; ++i)
- {
- struct group_proc *gp = group_proc_get (cmd->vars[i]);
- struct hsh_table *group_hash = gp->group_hash;
- struct group_statistics *totals = &gp->ugs;
-
- struct hsh_iterator g;
- struct group_statistics *gs;
-
- for (gs = hsh_first (group_hash, &g);
- gs != 0;
- gs = hsh_next (group_hash, &g))
- {
- gs->mean = gs->sum / gs->n;
- gs->s_std_dev = sqrt (gs->ssq / gs->n - pow2 (gs->mean));
-
- gs->std_dev = sqrt (
- gs->n / (gs->n - 1) *
- ( gs->ssq / gs->n - pow2 (gs->mean))
- );
-
- gs->se_mean = gs->std_dev / sqrt (gs->n);
- gs->mean_diff = gs->sum_diff / gs->n;
- }
-
- totals->mean = totals->sum / totals->n;
- totals->std_dev = sqrt (
- totals->n / (totals->n - 1) *
- (totals->ssq / totals->n - pow2 (totals->mean))
- );
-
- totals->se_mean = totals->std_dev / sqrt (totals->n);
- }
}
static void show_contrast_coeffs (const struct oneway_spec *cmd, const struct oneway_workspace *ws);
show_contrast_coeffs (cmd, ws);
show_contrast_tests (cmd, ws);
}
-
- /* Clean up */
- for (i = 0; i < cmd->n_vars; ++i )
- {
- struct hsh_table *group_hash = group_proc_get (cmd->vars[i])->group_hash;
-
- hsh_destroy (group_hash);
- }
-
- hsh_destroy (ws->group_hash);
}
for (v = 0; v < cmd->n_vars; ++v)
{
double n;
-
-
const struct per_var_ws *pvw = &ws->vws[v];
+ double F = pvw->levene_w;
const struct variable *var = cmd->vars[v];
- const struct group_proc *gp = group_proc_get (cmd->vars[v]);
const char *s = var_to_string (var);
double df1, df2;
- double F = gp->levene;
moments1_calculate (ws->dd_total[v]->mom, &n, NULL, NULL, NULL, NULL);
/* PSPP - a program for statistical analysis.
- Copyright (C) 2004, 2009, 2010 Free Software Foundation, Inc.
+ Copyright (C) 2010 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
along with this program. If not, see <http://www.gnu.org/licenses/>. */
#include <config.h>
+
#include "levene.h"
-#include <libpspp/message.h>
-#include <data/case.h>
-#include <data/casereader.h>
-#include <data/dictionary.h>
-#include "group-proc.h"
-#include <libpspp/hash.h>
-#include <libpspp/str.h>
-#include <data/variable.h>
-#include <data/procedure.h>
-#include <libpspp/misc.h>
-#include "group.h"
#include <math.h>
-#include <stdlib.h>
-
-#include "xalloc.h"
-
-
-/* This module calculates the Levene statistic for variables.
-
- Just for reference, the Levene Statistic is a defines as follows:
-
- W = \frac{ (n-k)\sum_{i=1}^k n_i(Z_{iL} - Z_{LL})^2}
- { (k-1)\sum_{i=1}^k \sum_{j=1}^{n_i} (Z_{ij} - Z_{iL})^2}
-
- where:
- k is the number of groups
- n is the total number of samples
- n_i is the number of samples in the ith group
- Z_{ij} is | Y_{ij} - Y_{iL} | where Y_{iL} is the mean of the ith group
- Z_{iL} is the mean of Z_{ij} over the ith group
- Z_{LL} is the grand mean of Z_{ij}
-
- Imagine calculating that with pencil and paper!
- */
-
-
-struct levene_info
-{
-
- /* Per group statistics */
- struct t_test_proc **group_stats;
-
- /* The independent variable */
- const struct variable *v_indep;
-
- /* Number of dependent variables */
- size_t n_dep;
-
- /* The dependent variables */
- const struct variable **v_dep;
-
- /* Filter for missing values */
- enum mv_class exclude;
+#include <libpspp/misc.h>
- /* An array of lz_stats for each variable */
- struct lz_stats *lz;
+#include <data/case.h>
+#include <data/casereader.h>
+#include <data/variable.h>
- /* The denominator for the expression for the Levene */
- double *lz_denominator;
-};
+#include <libpspp/hmap.h>
-/* Per variable statistics */
-struct lz_stats
+struct lev
{
- /* Total of all lz */
- double grand_total;
-
- /* Mean of all lz */
- double grand_mean;
+ struct hmap_node node;
+ union value group;
+ int width;
- /* The total number of cases */
- double total_n ;
-
- /* Number of groups */
- int n_groups;
+ double t_bar;
+ double z_mean;
+ double n;
};
-/* First pass */
-static void levene_precalc (const struct levene_info *l);
-static int levene_calc (const struct dictionary *dict, const struct ccase *,
- const struct levene_info *l);
-static void levene_postcalc (struct levene_info *);
-
-
-/* Second pass */
-static void levene2_precalc (struct levene_info *l);
-static int levene2_calc (const struct dictionary *, const struct ccase *,
- struct levene_info *l);
-static void levene2_postcalc (struct levene_info *);
-
-void
-levene(const struct dictionary *dict,
- struct casereader *reader,
- const struct variable *v_indep, size_t n_dep,
- const struct variable **v_dep,
- enum mv_class exclude)
-{
- struct casereader *pass1, *pass2;
- struct ccase *c;
- struct levene_info l;
-
- l.n_dep = n_dep;
- l.v_indep = v_indep;
- l.v_dep = v_dep;
- l.exclude = exclude;
- l.lz = xnmalloc (l.n_dep, sizeof *l.lz);
- l.lz_denominator = xnmalloc (l.n_dep, sizeof *l.lz_denominator);
-
- casereader_split (reader, &pass1, &pass2);
-
- levene_precalc (&l);
- for (; (c = casereader_read (pass1)) != NULL; case_unref (c))
- levene_calc (dict, c, &l);
- casereader_destroy (pass1);
- levene_postcalc (&l);
-
- levene2_precalc(&l);
- for (; (c = casereader_read (pass2)) != NULL; case_unref (c))
- levene2_calc (dict, c, &l);
- casereader_destroy (pass2);
- levene2_postcalc (&l);
-
- free (l.lz_denominator);
- free (l.lz);
-}
-
-static void
-levene_precalc (const struct levene_info *l)
+static struct lev *
+find_group (struct hmap *map, const union value *target, int width)
{
- size_t i;
-
- for(i = 0; i < l->n_dep ; ++i )
+ struct lev *l = NULL;
+ unsigned int hash = value_hash (target, width, 0);
+ HMAP_FOR_EACH_WITH_HASH (l, struct lev, node, hash, map)
{
- const struct variable *var = l->v_dep[i];
- struct group_proc *gp = group_proc_get (var);
- struct group_statistics *gs;
- struct hsh_iterator hi;
-
- l->lz[i].grand_total = 0;
- l->lz[i].total_n = 0;
- l->lz[i].n_groups = gp->n_groups ;
-
-
- for ( gs = hsh_first(gp->group_hash, &hi);
- gs != NULL;
- gs = hsh_next(gp->group_hash, &hi))
- {
- gs->lz_total = 0;
- }
-
+ if (value_equal (&l->group, target, width))
+ break;
+ l = NULL;
}
+ return l;
}
-static int
-levene_calc (const struct dictionary *dict, const struct ccase *c,
- const struct levene_info *l)
-{
- size_t i;
- bool warn = false;
- const union value *gv = case_data (c, l->v_indep);
- struct group_statistics key;
- double weight = dict_get_case_weight (dict, c, &warn);
-
- key.id = *gv;
-
- for (i = 0; i < l->n_dep; ++i)
- {
- const struct variable *var = l->v_dep[i];
- struct group_proc *gp = group_proc_get (var);
- double levene_z;
- const union value *v = case_data (c, var);
- struct group_statistics *gs;
-
- gs = hsh_find(gp->group_hash,(void *) &key );
- if ( gs == NULL )
- continue ;
-
- if ( !var_is_value_missing (var, v, l->exclude))
- {
- levene_z= fabs(v->f - gs->mean);
- l->lz[i].grand_total += levene_z * weight;
- l->lz[i].total_n += weight;
-
- gs->lz_total += levene_z * weight;
- }
- }
- return 0;
-}
-
-
-static void
-levene_postcalc (struct levene_info *l)
+double
+levene (struct casereader *rx, const struct variable *gvar,
+ const struct variable *var, const struct variable *wv,
+ enum mv_class exclude)
{
- size_t v;
-
- for (v = 0; v < l->n_dep; ++v)
- {
- /* This is Z_LL */
- l->lz[v].grand_mean = l->lz[v].grand_total / l->lz[v].total_n ;
- }
+ double numerator = 0.0;
+ double denominator = 0.0;
+ int n_groups = 0;
+ double z_grand_mean = 0.0;
+ double grand_n = 0.0;
+ struct hmap map = HMAP_INITIALIZER (map);
-}
-
-
-
-static void
-levene2_precalc (struct levene_info *l)
-{
- size_t v;
-
+ struct ccase *c;
+ struct casereader *r = casereader_clone (rx);
- /* This stuff could go in the first post calc . . . */
- for (v = 0;
- v < l->n_dep;
- ++v)
+ for (; (c = casereader_read (r)) != NULL; case_unref (c))
{
- struct hsh_iterator hi;
- struct group_statistics *g;
-
- const struct variable *var = l->v_dep[v] ;
- struct hsh_table *hash = group_proc_get (var)->group_hash;
-
+ struct lev *l = NULL;
+ const union value *target = case_data (c, gvar);
+ int width = var_get_width (gvar);
+ unsigned int hash = value_hash (target, width, 0);
+ const double x = case_data (c, var)->f;
+ const double weight = wv ? case_data (c, wv)->f : 1.0;
+
+ if (var_is_value_missing (var, case_data (c, var), exclude))
+ continue;
- for (g = hsh_first(hash,&hi); g != NULL; g = hsh_next(hash, &hi))
+ l = find_group (&map, target, width);
+
+ if (l == NULL)
{
- g->lz_mean = g->lz_total / g->n ;
+ l = xzalloc (sizeof *l);
+ value_clone (&l->group, target, width);
+ hmap_insert (&map, &l->node, hash);
}
- l->lz_denominator[v] = 0;
- }
-}
-static int
-levene2_calc (const struct dictionary *dict, const struct ccase *c,
- struct levene_info *l)
-{
- size_t i;
- bool warn = false;
-
- double weight = dict_get_case_weight (dict, c, &warn);
-
- const union value *gv = case_data (c, l->v_indep);
- struct group_statistics key;
+ l->n += weight;
+ l->t_bar += x * weight;
+ grand_n += weight;
+ }
+ casereader_destroy (r);
+
+ {
+ struct lev *l;
+ HMAP_FOR_EACH (l, struct lev, node, &map)
+ {
+ l->t_bar /= l->n;
+ }
+ }
- key.id = *gv;
+ n_groups = hmap_count (&map);
- for (i = 0; i < l->n_dep; ++i)
+ r = casereader_clone (rx);
+ for (; (c = casereader_read (r)) != NULL; case_unref (c))
{
- double levene_z;
- const struct variable *var = l->v_dep[i] ;
- const union value *v = case_data (c, var);
- struct group_statistics *gs;
-
- gs = hsh_find(group_proc_get (var)->group_hash,(void *) &key );
+ const union value *target = case_data (c, gvar);
+ int width = var_get_width (gvar);
+ const double x = case_data (c, var)->f;
+ const double weight = wv ? case_data (c, wv)->f : 1.0;
- if ( gs == NULL )
+ if (var_is_value_missing (var, case_data (c, var), exclude))
continue;
- if ( !var_is_value_missing (var, v, l->exclude))
- {
- levene_z = fabs(v->f - gs->mean);
- l->lz_denominator[i] += weight * pow2 (levene_z - gs->lz_mean);
- }
- }
-
- return 0;
-}
+ struct lev *l = find_group (&map, target, width);
+ assert (l);
+
+ l->z_mean += fabs (x - l->t_bar) * weight;
+ z_grand_mean += fabs (x - l->t_bar) * weight;
+ }
+ casereader_destroy (r);
+
+ {
+ struct lev *l;
+ HMAP_FOR_EACH (l, struct lev, node, &map)
+ {
+ l->z_mean /= l->n;
+ }
+ }
-static void
-levene2_postcalc (struct levene_info *l)
-{
- size_t v;
+ z_grand_mean /= grand_n;
- for (v = 0; v < l->n_dep; ++v)
+ r = casereader_clone (rx);
+ for (; (c = casereader_read (r)) != NULL; case_unref (c))
{
- double lz_numerator = 0;
- struct hsh_iterator hi;
- struct group_statistics *g;
+ double z;
+ const union value *target = case_data (c, gvar);
+ int width = var_get_width (gvar);
+ const double x = case_data (c, var)->f;
+ const double weight = wv ? case_data (c, wv)->f : 1.0;
- const struct variable *var = l->v_dep[v] ;
- struct group_proc *gp = group_proc_get (var);
- struct hsh_table *hash = gp->group_hash;
-
- for (g = hsh_first(hash, &hi); g != NULL; g = hsh_next(hash, &hi))
- {
- lz_numerator += g->n * pow2(g->lz_mean - l->lz[v].grand_mean );
- }
- lz_numerator *= ( gp->ugs.n - gp->n_groups );
-
- l->lz_denominator[v] *= (gp->n_groups - 1);
+ if (var_is_value_missing (var, case_data (c, var), exclude))
+ continue;
- gp->levene = lz_numerator / l->lz_denominator[v] ;
+ struct lev *l = find_group (&map, target, width);
+ assert (l);
+ z = fabs (x - l->t_bar);
+ denominator += pow2 (z - l->z_mean) * weight;
}
-}
+ casereader_destroy (r);
+
+ denominator *= n_groups - 1;
+
+ {
+ double grand_n = 0.0;
+ struct hmap_node *next;
+ struct lev *l;
+ HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &map)
+ {
+ numerator += l->n * pow2 (l->z_mean - z_grand_mean);
+ grand_n += l->n;
+
+ /* We don't need these anymore */
+ free (l);
+ }
+ numerator *= grand_n - n_groups ;
+ hmap_destroy (&map);
+ }
+ return numerator/ denominator;
+}