X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fmath%2Flevene.c;h=098fcc5a1ba9feba488060e3406f54f84364c998;hb=490ac70d9c9f754f733552d64c23dd6aedced342;hp=f7e269991346cd0c6cb076eacec04bcbbd62590c;hpb=43c6dd6d4f7750a5f531f5d931628b1ab92ee748;p=pspp diff --git a/src/math/levene.c b/src/math/levene.c index f7e2699913..098fcc5a1b 100644 --- a/src/math/levene.c +++ b/src/math/levene.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2004, 2009 Free Software Foundation, Inc. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -15,307 +15,251 @@ along with this program. If not, see . */ #include -#include "levene.h" -#include -#include -#include -#include -#include "group-proc.h" -#include -#include -#include -#include -#include -#include "group.h" +#include "levene.h" #include -#include - -#include "xalloc.h" - -/* This module calculates the Levene statistic for variables. +#include "libpspp/misc.h" +#include "libpspp/hmap.h" +#include "data/value.h" +#include "data/val-type.h" - Just for reference, the Levene Statistic is a defines as follows: +#include +#include - W = \frac{ (n-k)\sum_{i=1}^k n_i(Z_{iL} - Z_{LL})^2} - { (k-1)\sum_{i=1}^k \sum_{j=1}^{n_i} (Z_{ij} - Z_{iL})^2} - - where: - k is the number of groups - n is the total number of samples - n_i is the number of samples in the ith group - Z_{ij} is | Y_{ij} - Y_{iL} | where Y_{iL} is the mean of the ith group - Z_{iL} is the mean of Z_{ij} over the ith group - Z_{LL} is the grand mean of Z_{ij} - - Imagine calculating that with pencil and paper! +struct lev +{ + struct hmap_node node; + union value group; - */ + double t_bar; + double z_mean; + double n; +}; +typedef unsigned int hash_func (const struct levene *, const union value *v); +typedef bool cmp_func (const struct levene *, const union value *v0, const union value *v1); -struct levene_info +struct levene { + /* Width of the categorical variable */ + int gvw ; - /* Per group statistics */ - struct t_test_proc **group_stats; + /* The value dividing the groups. Valid only for dichotomous categorical variable.*/ + const union value *cutpoint; - /* The independent variable */ - const struct variable *v_indep; - /* Number of dependent variables */ - size_t n_dep; + /* A hashtable of struct lev objects indexed by union value */ + struct hmap hmap; - /* The dependent variables */ - const struct variable **v_dep; + hash_func *hash; + cmp_func *cmp; - /* Filter for missing values */ - enum mv_class exclude; - /* An array of lz_stats for each variable */ - struct lz_stats *lz; + /* A state variable indicating how many passes have been done */ + int pass; - /* The denominator for the expression for the Levene */ - double *lz_denominator; + double grand_n; + double z_grand_mean; + double denominator; }; -/* Per variable statistics */ -struct lz_stats -{ - /* Total of all lz */ - double grand_total; - /* Mean of all lz */ - double grand_mean; +static unsigned int +unique_hash (const struct levene *nl, const union value *val) +{ + return value_hash (val, nl->gvw, 0); +} - /* The total number of cases */ - double total_n ; +static bool +unique_cmp (const struct levene *nl, const union value *val0, const union value *val1) +{ + return value_equal (val0, val1, nl->gvw); +} - /* Number of groups */ - int n_groups; -}; +static unsigned int +cutpoint_hash (const struct levene *nl, const union value *val) +{ + int x = value_compare_3way (val, nl->cutpoint, nl->gvw); -/* First pass */ -static void levene_precalc (const struct levene_info *l); -static int levene_calc (const struct dictionary *dict, const struct ccase *, - const struct levene_info *l); -static void levene_postcalc (struct levene_info *); + return (x < 0); +} +static bool +cutpoint_cmp (const struct levene *nl, const union value *val0, const union value *val1) +{ + int x = value_compare_3way (val0, nl->cutpoint, nl->gvw); -/* Second pass */ -static void levene2_precalc (struct levene_info *l); -static int levene2_calc (const struct dictionary *, const struct ccase *, - struct levene_info *l); -static void levene2_postcalc (struct levene_info *); + int y = value_compare_3way (val1, nl->cutpoint, nl->gvw); + if ( x == 0) x = 1; + if ( y == 0) y = 1; -void -levene(const struct dictionary *dict, - struct casereader *reader, - const struct variable *v_indep, size_t n_dep, - const struct variable **v_dep, - enum mv_class exclude) -{ - struct casereader *pass1, *pass2; - struct ccase *c; - struct levene_info l; - - l.n_dep = n_dep; - l.v_indep = v_indep; - l.v_dep = v_dep; - l.exclude = exclude; - l.lz = xnmalloc (l.n_dep, sizeof *l.lz); - l.lz_denominator = xnmalloc (l.n_dep, sizeof *l.lz_denominator); - - casereader_split (reader, &pass1, &pass2); - - levene_precalc (&l); - for (; (c = casereader_read (pass1)) != NULL; case_unref (c)) - levene_calc (dict, c, &l); - casereader_destroy (pass1); - levene_postcalc (&l); - - levene2_precalc(&l); - for (; (c = casereader_read (pass2)) != NULL; case_unref (c)) - levene2_calc (dict, c, &l); - casereader_destroy (pass2); - levene2_postcalc (&l); - - free (l.lz_denominator); - free (l.lz); + return ( x == y); } -static void -levene_precalc (const struct levene_info *l) + + +static struct lev * +find_group (const struct levene *nl, const union value *target) { - size_t i; + struct lev *l = NULL; - for(i = 0; i < l->n_dep ; ++i ) + HMAP_FOR_EACH_WITH_HASH (l, struct lev, node, nl->hash (nl, target), &nl->hmap) { - const struct variable *var = l->v_dep[i]; - struct group_proc *gp = group_proc_get (var); - struct group_statistics *gs; - struct hsh_iterator hi; + if (nl->cmp (nl, &l->group, target)) + break; + l = NULL; + } + return l; +} - l->lz[i].grand_total = 0; - l->lz[i].total_n = 0; - l->lz[i].n_groups = gp->n_groups ; +struct levene * +levene_create (int indep_width, const union value *cutpoint) +{ + struct levene *nl = xzalloc (sizeof *nl); - for ( gs = hsh_first(gp->group_hash, &hi); - gs != 0; - gs = hsh_next(gp->group_hash, &hi)) - { - gs->lz_total = 0; - } + hmap_init (&nl->hmap); - } + nl->gvw = indep_width; + nl->cutpoint = cutpoint; + nl->hash = cutpoint ? cutpoint_hash : unique_hash; + nl->cmp = cutpoint ? cutpoint_cmp : unique_cmp; + + return nl; } -static int -levene_calc (const struct dictionary *dict, const struct ccase *c, - const struct levene_info *l) -{ - size_t i; - bool warn = false; - const union value *gv = case_data (c, l->v_indep); - struct group_statistics key; - double weight = dict_get_case_weight (dict, c, &warn); - key.id = *gv; +/* Data accumulation. First pass */ +void +levene_pass_one (struct levene *nl, double value, double weight, const union value *gv) +{ + struct lev *lev = find_group (nl, gv); - for (i = 0; i < l->n_dep; ++i) + if ( nl->pass == 0 ) { - const struct variable *var = l->v_dep[i]; - struct group_proc *gp = group_proc_get (var); - double levene_z; - const union value *v = case_data (c, var); - struct group_statistics *gs; - - gs = hsh_find(gp->group_hash,(void *) &key ); + nl->pass = 1; + } + assert (nl->pass == 1); - if ( 0 == gs ) - continue ; + if ( NULL == lev) + { + struct lev *l = xzalloc (sizeof *l); + value_clone (&l->group, gv, nl->gvw); + hmap_insert (&nl->hmap, &l->node, nl->hash (nl, &l->group)); + lev = l; + } - if ( !var_is_value_missing (var, v, l->exclude)) - { - levene_z= fabs(v->f - gs->mean); - l->lz[i].grand_total += levene_z * weight; - l->lz[i].total_n += weight; + lev->n += weight; + lev->t_bar += value * weight; - gs->lz_total += levene_z * weight; - } - } - return 0; + nl->grand_n += weight; } - -static void -levene_postcalc (struct levene_info *l) +/* Data accumulation. Second pass */ +void +levene_pass_two (struct levene *nl, double value, double weight, const union value *gv) { - size_t v; + struct lev *lev = NULL; - for (v = 0; v < l->n_dep; ++v) + if ( nl->pass == 1 ) { - /* This is Z_LL */ - l->lz[v].grand_mean = l->lz[v].grand_total / l->lz[v].total_n ; - } + struct lev *next; + struct lev *l; + nl->pass = 2; -} + HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &nl->hmap) + { + l->t_bar /= l->n; + } + } + assert (nl->pass == 2); + lev = find_group (nl, gv); + lev->z_mean += fabs (value - lev->t_bar) * weight; + nl->z_grand_mean += fabs (value - lev->t_bar) * weight; +} -static void -levene2_precalc (struct levene_info *l) +/* Data accumulation. Third pass */ +void +levene_pass_three (struct levene *nl, double value, double weight, const union value *gv) { - size_t v; - + double z; + struct lev *lev = NULL; - /* This stuff could go in the first post calc . . . */ - for (v = 0; - v < l->n_dep; - ++v) + if ( nl->pass == 2 ) { - struct hsh_iterator hi; - struct group_statistics *g; + struct lev *next; + struct lev *l; - const struct variable *var = l->v_dep[v] ; - struct hsh_table *hash = group_proc_get (var)->group_hash; + nl->pass = 3; + HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &nl->hmap) + { + l->z_mean /= l->n; + } - for (g = hsh_first(hash,&hi); g != 0; g = hsh_next(hash, &hi)) - { - g->lz_mean = g->lz_total / g->n ; - } - l->lz_denominator[v] = 0; + nl->z_grand_mean /= nl->grand_n; } -} -static int -levene2_calc (const struct dictionary *dict, const struct ccase *c, - struct levene_info *l) -{ - size_t i; - bool warn = false; + assert (nl->pass == 3); + lev = find_group (nl, gv); - double weight = dict_get_case_weight (dict, c, &warn); + z = fabs (value - lev->t_bar); + nl->denominator += pow2 (z - lev->z_mean) * weight; +} - const union value *gv = case_data (c, l->v_indep); - struct group_statistics key; - key.id = *gv; +/* Return the value of the levene statistic */ +double +levene_calculate (struct levene *nl) +{ + struct lev *next; + struct lev *l; - for (i = 0; i < l->n_dep; ++i) - { - double levene_z; - const struct variable *var = l->v_dep[i] ; - const union value *v = case_data (c, var); - struct group_statistics *gs; + double numerator = 0.0; + double nn = 0.0; - gs = hsh_find(group_proc_get (var)->group_hash,(void *) &key ); + /* The Levene calculation requires three passes. + Normally this should have been done prior to calling this function. + However, in abnormal circumstances (eg. the dataset is empty) there + will have been no passes. + */ + assert (nl->pass == 0 || nl->pass == 3); - if ( 0 == gs ) - continue; + if ( nl->pass == 0 ) + return SYSMIS; - if ( !var_is_value_missing (var, v, l->exclude)) - { - levene_z = fabs(v->f - gs->mean); - l->lz_denominator[i] += weight * pow2 (levene_z - gs->lz_mean); - } + nl->denominator *= hmap_count (&nl->hmap) - 1; + + HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &nl->hmap) + { + numerator += l->n * pow2 (l->z_mean - nl->z_grand_mean); + nn += l->n; } - return 0; + numerator *= nn - hmap_count (&nl->hmap); + + return numerator / nl->denominator; } - -static void -levene2_postcalc (struct levene_info *l) +void +levene_destroy (struct levene *nl) { - size_t v; + struct lev *next; + struct lev *l; - for (v = 0; v < l->n_dep; ++v) + HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &nl->hmap) { - double lz_numerator = 0; - struct hsh_iterator hi; - struct group_statistics *g; - - const struct variable *var = l->v_dep[v] ; - struct group_proc *gp = group_proc_get (var); - struct hsh_table *hash = gp->group_hash; - - for (g = hsh_first(hash, &hi); g != 0; g = hsh_next(hash, &hi)) - { - lz_numerator += g->n * pow2(g->lz_mean - l->lz[v].grand_mean ); - } - lz_numerator *= ( gp->ugs.n - gp->n_groups ); - - l->lz_denominator[v] *= (gp->n_groups - 1); - - gp->levene = lz_numerator / l->lz_denominator[v] ; - + value_destroy (&l->group, nl->gvw); + free (l); } -} + hmap_destroy (&nl->hmap); + free (nl); +}