/* PSPP - a program for statistical analysis.
- Copyright (C) 2004 Free Software Foundation, Inc.
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
along with this program. If not, see <http://www.gnu.org/licenses/>. */
#include <config.h>
-#include "levene.h"
-#include <libpspp/message.h>
-#include <data/case.h>
-#include <data/casereader.h>
-#include <data/dictionary.h>
-#include "group-proc.h"
-#include <libpspp/hash.h>
-#include <libpspp/str.h>
-#include <data/variable.h>
-#include <data/procedure.h>
-#include <libpspp/alloc.h>
-#include <libpspp/misc.h>
-#include "group.h"
-#include <math.h>
-#include <stdlib.h>
-
-
-/* This module calculates the Levene statistic for variables.
-
- Just for reference, the Levene Statistic is a defines as follows:
-
- W = \frac{ (n-k)\sum_{i=1}^k n_i(Z_{iL} - Z_{LL})^2}
- { (k-1)\sum_{i=1}^k \sum_{j=1}^{n_i} (Z_{ij} - Z_{iL})^2}
-
- where:
- k is the number of groups
- n is the total number of samples
- n_i is the number of samples in the ith group
- Z_{ij} is | Y_{ij} - Y_{iL} | where Y_{iL} is the mean of the ith group
- Z_{iL} is the mean of Z_{ij} over the ith group
- Z_{LL} is the grand mean of Z_{ij}
-
- Imagine calculating that with pencil and paper!
-
- */
-
-
-struct levene_info
-{
+#include "math/levene.h"
- /* Per group statistics */
- struct t_test_proc **group_stats;
-
- /* The independent variable */
- const struct variable *v_indep;
-
- /* Number of dependent variables */
- size_t n_dep;
-
- /* The dependent variables */
- const struct variable **v_dep;
-
- /* Filter for missing values */
- enum mv_class exclude;
-
- /* An array of lz_stats for each variable */
- struct lz_stats *lz;
-
- /* The denominator for the expression for the Levene */
- double *lz_denominator;
+#include <math.h>
-};
+#include "data/case.h"
+#include "data/casereader.h"
+#include "data/variable.h"
+#include "libpspp/hmap.h"
+#include "libpspp/misc.h"
-/* Per variable statistics */
-struct lz_stats
+struct lev
{
- /* Total of all lz */
- double grand_total;
-
- /* Mean of all lz */
- double grand_mean;
-
- /* The total number of cases */
- double total_n ;
+ struct hmap_node node;
+ union value group;
+ int width;
- /* Number of groups */
- int n_groups;
+ double t_bar;
+ double z_mean;
+ double n;
};
-/* First pass */
-static void levene_precalc (const struct levene_info *l);
-static int levene_calc (const struct dictionary *dict, const struct ccase *,
- const struct levene_info *l);
-static void levene_postcalc (struct levene_info *);
-
-/* Second pass */
-static void levene2_precalc (struct levene_info *l);
-static int levene2_calc (const struct dictionary *, const struct ccase *,
- struct levene_info *l);
-static void levene2_postcalc (struct levene_info *);
-
-
-void
-levene(const struct dictionary *dict,
- struct casereader *reader,
- const struct variable *v_indep, size_t n_dep,
- const struct variable **v_dep,
- enum mv_class exclude)
-{
- struct casereader *pass1, *pass2;
- struct ccase c;
- struct levene_info l;
-
- l.n_dep = n_dep;
- l.v_indep = v_indep;
- l.v_dep = v_dep;
- l.exclude = exclude;
- l.lz = xnmalloc (l.n_dep, sizeof *l.lz);
- l.lz_denominator = xnmalloc (l.n_dep, sizeof *l.lz_denominator);
-
- casereader_split (reader, &pass1, &pass2);
-
- levene_precalc (&l);
- for (; casereader_read (pass1, &c); case_destroy (&c))
- levene_calc (dict, &c, &l);
- casereader_destroy (pass1);
- levene_postcalc (&l);
-
- levene2_precalc(&l);
- for (; casereader_read (pass2, &c); case_destroy (&c))
- levene2_calc (dict, &c, &l);
- casereader_destroy (pass2);
- levene2_postcalc (&l);
-
- free (l.lz_denominator);
- free (l.lz);
-}
-
-static void
-levene_precalc (const struct levene_info *l)
+static struct lev *
+find_group (struct hmap *map, const union value *target, int width)
{
- size_t i;
-
- for(i = 0; i < l->n_dep ; ++i )
+ struct lev *l = NULL;
+ unsigned int hash = value_hash (target, width, 0);
+ HMAP_FOR_EACH_WITH_HASH (l, struct lev, node, hash, map)
{
- const struct variable *var = l->v_dep[i];
- struct group_proc *gp = group_proc_get (var);
- struct group_statistics *gs;
- struct hsh_iterator hi;
-
- l->lz[i].grand_total = 0;
- l->lz[i].total_n = 0;
- l->lz[i].n_groups = gp->n_groups ;
-
-
- for ( gs = hsh_first(gp->group_hash, &hi);
- gs != 0;
- gs = hsh_next(gp->group_hash, &hi))
- {
- gs->lz_total = 0;
- }
-
+ if (value_equal (&l->group, target, width))
+ break;
+ l = NULL;
}
+ return l;
}
-static int
-levene_calc (const struct dictionary *dict, const struct ccase *c,
- const struct levene_info *l)
-{
- size_t i;
- bool warn = false;
- const union value *gv = case_data (c, l->v_indep);
- struct group_statistics key;
- double weight = dict_get_case_weight (dict, c, &warn);
-
- key.id = *gv;
-
- for (i = 0; i < l->n_dep; ++i)
- {
- const struct variable *var = l->v_dep[i];
- struct group_proc *gp = group_proc_get (var);
- double levene_z;
- const union value *v = case_data (c, var);
- struct group_statistics *gs;
-
- gs = hsh_find(gp->group_hash,(void *) &key );
-
- if ( 0 == gs )
- continue ;
-
- if ( !var_is_value_missing (var, v, l->exclude))
- {
- levene_z= fabs(v->f - gs->mean);
- l->lz[i].grand_total += levene_z * weight;
- l->lz[i].total_n += weight;
- gs->lz_total += levene_z * weight;
- }
- }
- return 0;
-}
-
-
-static void
-levene_postcalc (struct levene_info *l)
+double
+levene (struct casereader *rx, const struct variable *gvar,
+ const struct variable *var, const struct variable *wv,
+ enum mv_class exclude)
{
- size_t v;
-
- for (v = 0; v < l->n_dep; ++v)
- {
- /* This is Z_LL */
- l->lz[v].grand_mean = l->lz[v].grand_total / l->lz[v].total_n ;
- }
-
-
-}
+ double numerator = 0.0;
+ double denominator = 0.0;
+ int n_groups = 0;
+ double z_grand_mean = 0.0;
+ double grand_n = 0.0;
+ struct hmap map = HMAP_INITIALIZER (map);
+ struct ccase *c;
+ struct casereader *r = casereader_clone (rx);
-static void
-levene2_precalc (struct levene_info *l)
-{
- size_t v;
-
-
- /* This stuff could go in the first post calc . . . */
- for (v = 0;
- v < l->n_dep;
- ++v)
+ for (; (c = casereader_read (r)) != NULL; case_unref (c))
{
- struct hsh_iterator hi;
- struct group_statistics *g;
-
- const struct variable *var = l->v_dep[v] ;
- struct hsh_table *hash = group_proc_get (var)->group_hash;
-
+ struct lev *l = NULL;
+ const union value *target = case_data (c, gvar);
+ int width = var_get_width (gvar);
+ unsigned int hash = value_hash (target, width, 0);
+ const double x = case_data (c, var)->f;
+ const double weight = wv ? case_data (c, wv)->f : 1.0;
+
+ if (var_is_value_missing (var, case_data (c, var), exclude))
+ continue;
- for(g = (struct group_statistics *) hsh_first(hash,&hi);
- g != 0 ;
- g = (struct group_statistics *) hsh_next(hash,&hi) )
+ l = find_group (&map, target, width);
+
+ if (l == NULL)
{
- g->lz_mean = g->lz_total / g->n ;
+ l = xzalloc (sizeof *l);
+ value_clone (&l->group, target, width);
+ hmap_insert (&map, &l->node, hash);
}
- l->lz_denominator[v] = 0;
- }
-}
-
-static int
-levene2_calc (const struct dictionary *dict, const struct ccase *c,
- struct levene_info *l)
-{
- size_t i;
- bool warn = false;
- double weight = dict_get_case_weight (dict, c, &warn);
-
- const union value *gv = case_data (c, l->v_indep);
- struct group_statistics key;
+ l->n += weight;
+ l->t_bar += x * weight;
+ grand_n += weight;
+ }
+ casereader_destroy (r);
+
+ {
+ struct lev *l;
+ HMAP_FOR_EACH (l, struct lev, node, &map)
+ {
+ l->t_bar /= l->n;
+ }
+ }
- key.id = *gv;
+ n_groups = hmap_count (&map);
- for (i = 0; i < l->n_dep; ++i)
+ r = casereader_clone (rx);
+ for (; (c = casereader_read (r)) != NULL; case_unref (c))
{
- double levene_z;
- const struct variable *var = l->v_dep[i] ;
- const union value *v = case_data (c, var);
- struct group_statistics *gs;
+ struct lev *l = NULL;
+ const union value *target = case_data (c, gvar);
+ int width = var_get_width (gvar);
+ const double x = case_data (c, var)->f;
+ const double weight = wv ? case_data (c, wv)->f : 1.0;
- gs = hsh_find(group_proc_get (var)->group_hash,(void *) &key );
-
- if ( 0 == gs )
+ if (var_is_value_missing (var, case_data (c, var), exclude))
continue;
- if ( !var_is_value_missing (var, v, l->exclude))
- {
- levene_z = fabs(v->f - gs->mean);
- l->lz_denominator[i] += weight * pow2 (levene_z - gs->lz_mean);
- }
+ l = find_group (&map, target, width);
+ assert (l);
+
+ l->z_mean += fabs (x - l->t_bar) * weight;
+ z_grand_mean += fabs (x - l->t_bar) * weight;
}
+ casereader_destroy (r);
+
+ {
+ struct lev *l;
+ HMAP_FOR_EACH (l, struct lev, node, &map)
+ {
+ l->z_mean /= l->n;
+ }
+ }
- return 0;
-}
-
-
-static void
-levene2_postcalc (struct levene_info *l)
-{
- size_t v;
+ z_grand_mean /= grand_n;
- for (v = 0; v < l->n_dep; ++v)
+ r = casereader_clone (rx);
+ for (; (c = casereader_read (r)) != NULL; case_unref (c))
{
- double lz_numerator = 0;
- struct hsh_iterator hi;
- struct group_statistics *g;
-
- const struct variable *var = l->v_dep[v] ;
- struct group_proc *gp = group_proc_get (var);
- struct hsh_table *hash = gp->group_hash;
-
- for(g = (struct group_statistics *) hsh_first(hash,&hi);
- g != 0 ;
- g = (struct group_statistics *) hsh_next(hash,&hi) )
- {
- lz_numerator += g->n * pow2(g->lz_mean - l->lz[v].grand_mean );
- }
- lz_numerator *= ( gp->ugs.n - gp->n_groups );
-
- l->lz_denominator[v] *= (gp->n_groups - 1);
+ double z;
+ struct lev *l;
+ const union value *target = case_data (c, gvar);
+ int width = var_get_width (gvar);
+ const double x = case_data (c, var)->f;
+ const double weight = wv ? case_data (c, wv)->f : 1.0;
+
+ if (var_is_value_missing (var, case_data (c, var), exclude))
+ continue;
- gp->levene = lz_numerator / l->lz_denominator[v] ;
+ l = find_group (&map, target, width);
+ assert (l);
+ z = fabs (x - l->t_bar);
+ denominator += pow2 (z - l->z_mean) * weight;
}
-}
+ casereader_destroy (r);
+
+ denominator *= n_groups - 1;
+
+ {
+ double grand_n = 0.0;
+ struct lev *next;
+ struct lev *l;
+ HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &map)
+ {
+ numerator += l->n * pow2 (l->z_mean - z_grand_mean);
+ grand_n += l->n;
+
+ /* We don't need these anymore */
+ free (l);
+ }
+ numerator *= grand_n - n_groups ;
+ hmap_destroy (&map);
+ }
+ return numerator/ denominator;
+}