-/* This file is part of GNU PSPP
- Computes Levene test statistic.
+/* PSPP - a program for statistical analysis.
+ Copyright (C) 2010, 2011 Free Software Foundation, Inc.
- Copyright (C) 2004 Free Software Foundation, Inc.
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
- This program is free software; you can redistribute it and/or
- modify it under the terms of the GNU General Public License as
- published by the Free Software Foundation; either version 2 of the
- License, or (at your option) any later version.
-
- This program is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- 02110-1301, USA. */
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
#include <config.h>
-#include "levene.h"
-#include <libpspp/message.h>
-#include <data/case.h>
-#include <data/casefile.h>
-#include <data/dictionary.h>
-#include "group-proc.h"
-#include <libpspp/hash.h>
-#include <libpspp/str.h>
-#include <data/variable.h>
-#include <data/procedure.h>
-#include <data/casefilter.h>
-#include <libpspp/alloc.h>
-#include <libpspp/misc.h>
-#include "group.h"
+#include "levene.h"
#include <math.h>
-#include <stdlib.h>
-
-/* This module calculates the Levene statistic for variables.
+#include "libpspp/misc.h"
+#include "libpspp/hmap.h"
+#include "data/value.h"
+#include "data/val-type.h"
- Just for reference, the Levene Statistic is a defines as follows:
+#include <gl/xalloc.h>
+#include <assert.h>
- W = \frac{ (n-k)\sum_{i=1}^k n_i(Z_{iL} - Z_{LL})^2}
- { (k-1)\sum_{i=1}^k \sum_{j=1}^{n_i} (Z_{ij} - Z_{iL})^2}
-
- where:
- k is the number of groups
- n is the total number of samples
- n_i is the number of samples in the ith group
- Z_{ij} is | Y_{ij} - Y_{iL} | where Y_{iL} is the mean of the ith group
- Z_{iL} is the mean of Z_{ij} over the ith group
- Z_{LL} is the grand mean of Z_{ij}
-
- Imagine calculating that with pencil and paper!
+struct lev
+{
+ struct hmap_node node;
+ union value group;
- */
+ double t_bar;
+ double z_mean;
+ double n;
+};
+typedef unsigned int hash_func (const struct levene *, const union value *v);
+typedef bool cmp_func (const struct levene *, const union value *v0, const union value *v1);
-struct levene_info
+struct levene
{
+ /* Width of the categorical variable */
+ int gvw ;
- /* Per group statistics */
- struct t_test_proc **group_stats;
+ /* The value dividing the groups. Valid only for dichotomous categorical variable.*/
+ const union value *cutpoint;
- /* The independent variable */
- const struct variable *v_indep;
- /* Number of dependent variables */
- size_t n_dep;
+ /* A hashtable of struct lev objects indexed by union value */
+ struct hmap hmap;
- /* The dependent variables */
- const struct variable **v_dep;
+ hash_func *hash;
+ cmp_func *cmp;
- /* Filter for missing values */
- struct casefilter *filter;
-};
-/* First pass */
-static void levene_precalc (const struct levene_info *l);
-static int levene_calc (const struct dictionary *dict, const struct ccase *,
- const struct levene_info *l);
-static void levene_postcalc (void *);
+ /* A state variable indicating how many passes have been done */
+ int pass;
+ double grand_n;
+ double z_grand_mean;
-/* Second pass */
-static void levene2_precalc (struct levene_info *l);
-static int levene2_calc (const struct dictionary *, const struct ccase *,
- struct levene_info *l);
-static void levene2_postcalc (void *);
+ double denominator;
+};
-void
-levene(const struct dictionary *dict,
- const struct casefile *cf,
- const struct variable *v_indep, size_t n_dep,
- const struct variable **v_dep,
- struct casefilter *filter)
+static unsigned int
+unique_hash (const struct levene *nl, const union value *val)
{
- struct casereader *r;
- struct ccase c;
- struct levene_info l;
-
- l.n_dep = n_dep;
- l.v_indep = v_indep;
- l.v_dep = v_dep;
- l.filter = filter;
+ return value_hash (val, nl->gvw, 0);
+}
+static bool
+unique_cmp (const struct levene *nl, const union value *val0, const union value *val1)
+{
+ return value_equal (val0, val1, nl->gvw);
+}
- levene_precalc (&l);
- for(r = casefile_get_reader (cf, filter);
- casereader_read (r, &c) ;
- case_destroy (&c))
- {
- levene_calc (dict, &c, &l);
- }
- casereader_destroy (r);
- levene_postcalc (&l);
+static unsigned int
+cutpoint_hash (const struct levene *nl, const union value *val)
+{
+ int x = value_compare_3way (val, nl->cutpoint, nl->gvw);
- levene2_precalc(&l);
- for(r = casefile_get_reader (cf, filter);
- casereader_read (r, &c) ;
- case_destroy (&c))
- {
- levene2_calc (dict, &c,&l);
- }
- casereader_destroy (r);
- levene2_postcalc (&l);
+ return (x < 0);
}
-/* Internal variables used in calculating the Levene statistic */
-
-/* Per variable statistics */
-struct lz_stats
+static bool
+cutpoint_cmp (const struct levene *nl, const union value *val0, const union value *val1)
{
- /* Total of all lz */
- double grand_total;
+ int x = value_compare_3way (val0, nl->cutpoint, nl->gvw);
- /* Mean of all lz */
- double grand_mean;
+ int y = value_compare_3way (val1, nl->cutpoint, nl->gvw);
- /* The total number of cases */
- double total_n ;
+ if ( x == 0) x = 1;
+ if ( y == 0) y = 1;
- /* Number of groups */
- int n_groups;
-};
+ return ( x == y);
+}
-/* An array of lz_stats for each variable */
-static struct lz_stats *lz;
-static void
-levene_precalc (const struct levene_info *l)
+static struct lev *
+find_group (const struct levene *nl, const union value *target)
{
- size_t i;
-
- lz = xnmalloc (l->n_dep, sizeof *lz);
+ struct lev *l = NULL;
- for(i = 0; i < l->n_dep ; ++i )
+ HMAP_FOR_EACH_WITH_HASH (l, struct lev, node, nl->hash (nl, target), &nl->hmap)
{
- const struct variable *var = l->v_dep[i];
- struct group_proc *gp = group_proc_get (var);
- struct group_statistics *gs;
- struct hsh_iterator hi;
-
- lz[i].grand_total = 0;
- lz[i].total_n = 0;
- lz[i].n_groups = gp->n_groups ;
-
-
- for ( gs = hsh_first(gp->group_hash, &hi);
- gs != 0;
- gs = hsh_next(gp->group_hash, &hi))
- {
- gs->lz_total = 0;
- }
-
+ if (nl->cmp (nl, &l->group, target))
+ break;
+ l = NULL;
}
-
+ return l;
}
-static int
-levene_calc (const struct dictionary *dict, const struct ccase *c,
- const struct levene_info *l)
-{
- size_t i;
- bool warn = false;
- const union value *gv = case_data (c, l->v_indep);
- struct group_statistics key;
- double weight = dict_get_case_weight (dict, c, &warn);
-
- key.id = *gv;
- for (i = 0; i < l->n_dep; ++i)
- {
- const struct variable *var = l->v_dep[i];
- struct group_proc *gp = group_proc_get (var);
- double levene_z;
- const union value *v = case_data (c, var);
- struct group_statistics *gs;
+struct levene *
+levene_create (int indep_width, const union value *cutpoint)
+{
+ struct levene *nl = xzalloc (sizeof *nl);
- gs = hsh_find(gp->group_hash,(void *) &key );
+ hmap_init (&nl->hmap);
- if ( 0 == gs )
- continue ;
+ nl->gvw = indep_width;
+ nl->cutpoint = cutpoint;
- if ( ! casefilter_variable_missing (l->filter, c, var))
- {
- levene_z= fabs(v->f - gs->mean);
- lz[i].grand_total += levene_z * weight;
- lz[i].total_n += weight;
+ nl->hash = cutpoint ? cutpoint_hash : unique_hash;
+ nl->cmp = cutpoint ? cutpoint_cmp : unique_cmp;
- gs->lz_total += levene_z * weight;
- }
- }
- return 0;
+ return nl;
}
-static void
-levene_postcalc (void *_l)
+/* Data accumulation. First pass */
+void
+levene_pass_one (struct levene *nl, double value, double weight, const union value *gv)
{
- size_t v;
-
- struct levene_info *l = (struct levene_info *) _l;
+ struct lev *lev = find_group (nl, gv);
- for (v = 0; v < l->n_dep; ++v)
+ if ( nl->pass == 0 )
{
- /* This is Z_LL */
- lz[v].grand_mean = lz[v].grand_total / lz[v].total_n ;
+ nl->pass = 1;
}
+ assert (nl->pass == 1);
-
-}
-
+ if ( NULL == lev)
+ {
+ struct lev *l = xzalloc (sizeof *l);
+ value_clone (&l->group, gv, nl->gvw);
+ hmap_insert (&nl->hmap, &l->node, nl->hash (nl, &l->group));
+ lev = l;
+ }
+ lev->n += weight;
+ lev->t_bar += value * weight;
-/* The denominator for the expression for the Levene */
-static double *lz_denominator = 0;
+ nl->grand_n += weight;
+}
-static void
-levene2_precalc (struct levene_info *l)
+/* Data accumulation. Second pass */
+void
+levene_pass_two (struct levene *nl, double value, double weight, const union value *gv)
{
- size_t v;
-
- lz_denominator = xnmalloc (l->n_dep, sizeof *lz_denominator);
+ struct lev *lev = NULL;
- /* This stuff could go in the first post calc . . . */
- for (v = 0;
- v < l->n_dep;
- ++v)
+ if ( nl->pass == 1 )
{
- struct hsh_iterator hi;
- struct group_statistics *g;
+ struct lev *next;
+ struct lev *l;
- const struct variable *var = l->v_dep[v] ;
- struct hsh_table *hash = group_proc_get (var)->group_hash;
+ nl->pass = 2;
+ HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &nl->hmap)
+ {
+ l->t_bar /= l->n;
+ }
+ }
+ assert (nl->pass == 2);
- for(g = (struct group_statistics *) hsh_first(hash,&hi);
- g != 0 ;
- g = (struct group_statistics *) hsh_next(hash,&hi) )
- {
- g->lz_mean = g->lz_total / g->n ;
- }
- lz_denominator[v] = 0;
- }
+ lev = find_group (nl, gv);
+
+ lev->z_mean += fabs (value - lev->t_bar) * weight;
+ nl->z_grand_mean += fabs (value - lev->t_bar) * weight;
}
-static int
-levene2_calc (const struct dictionary *dict, const struct ccase *c,
- struct levene_info *l)
+/* Data accumulation. Third pass */
+void
+levene_pass_three (struct levene *nl, double value, double weight, const union value *gv)
{
- size_t i;
- bool warn = false;
+ double z;
+ struct lev *lev = NULL;
- double weight = dict_get_case_weight (dict, c, &warn);
-
- const union value *gv = case_data (c, l->v_indep);
- struct group_statistics key;
-
- key.id = *gv;
-
- for (i = 0; i < l->n_dep; ++i)
+ if ( nl->pass == 2 )
{
- double levene_z;
- const struct variable *var = l->v_dep[i] ;
- const union value *v = case_data (c, var);
- struct group_statistics *gs;
+ struct lev *next;
+ struct lev *l;
- gs = hsh_find(group_proc_get (var)->group_hash,(void *) &key );
+ nl->pass = 3;
- if ( 0 == gs )
- continue;
+ HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &nl->hmap)
+ {
+ l->z_mean /= l->n;
+ }
- if ( ! casefilter_variable_missing (l->filter, c, var))
+ nl->z_grand_mean /= nl->grand_n;
+ }
- {
- levene_z = fabs(v->f - gs->mean);
- lz_denominator[i] += weight * pow2 (levene_z - gs->lz_mean);
- }
- }
+ assert (nl->pass == 3);
+ lev = find_group (nl, gv);
- return 0;
+ z = fabs (value - lev->t_bar);
+ nl->denominator += pow2 (z - lev->z_mean) * weight;
}
-static void
-levene2_postcalc (void *_l)
+/* Return the value of the levene statistic */
+double
+levene_calculate (struct levene *nl)
{
- size_t v;
+ struct lev *next;
+ struct lev *l;
- struct levene_info *l = (struct levene_info *) _l;
+ double numerator = 0.0;
+ double nn = 0.0;
- for (v = 0; v < l->n_dep; ++v)
- {
- double lz_numerator = 0;
- struct hsh_iterator hi;
- struct group_statistics *g;
+ /* The Levene calculation requires three passes.
+ Normally this should have been done prior to calling this function.
+ However, in abnormal circumstances (eg. the dataset is empty) there
+ will have been no passes.
+ */
+ assert (nl->pass == 0 || nl->pass == 3);
- const struct variable *var = l->v_dep[v] ;
- struct group_proc *gp = group_proc_get (var);
- struct hsh_table *hash = gp->group_hash;
+ if ( nl->pass == 0 )
+ return SYSMIS;
- for(g = (struct group_statistics *) hsh_first(hash,&hi);
- g != 0 ;
- g = (struct group_statistics *) hsh_next(hash,&hi) )
- {
- lz_numerator += g->n * pow2(g->lz_mean - lz[v].grand_mean );
- }
- lz_numerator *= ( gp->ugs.n - gp->n_groups );
+ nl->denominator *= hmap_count (&nl->hmap) - 1;
- lz_denominator[v] *= (gp->n_groups - 1);
+ HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &nl->hmap)
+ {
+ numerator += l->n * pow2 (l->z_mean - nl->z_grand_mean);
+ nn += l->n;
+ }
+
+ numerator *= nn - hmap_count (&nl->hmap);
+
+ return numerator / nl->denominator;
+}
- gp->levene = lz_numerator / lz_denominator[v] ;
+void
+levene_destroy (struct levene *nl)
+{
+ struct lev *next;
+ struct lev *l;
+ HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &nl->hmap)
+ {
+ value_destroy (&l->group, nl->gvw);
+ free (l);
}
- /* Now clear up after ourselves */
- free(lz_denominator);
- free(lz);
+ hmap_destroy (&nl->hmap);
+ free (nl);
}
-