X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fmath%2Flevene.c;h=13fad93e52107ebe5a16ca826fbbedb06dbf8928;hb=81579d9e9f994fb2908f50af41c3eb033d216e58;hp=f7e269991346cd0c6cb076eacec04bcbbd62590c;hpb=43c6dd6d4f7750a5f531f5d931628b1ab92ee748;p=pspp-builds.git diff --git a/src/math/levene.c b/src/math/levene.c index f7e26999..13fad93e 100644 --- a/src/math/levene.c +++ b/src/math/levene.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2004, 2009 Free Software Foundation, Inc. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -15,307 +15,166 @@ along with this program. If not, see . */ #include -#include "levene.h" -#include -#include -#include -#include -#include "group-proc.h" -#include -#include -#include -#include -#include -#include "group.h" -#include -#include - -#include "xalloc.h" - - -/* This module calculates the Levene statistic for variables. - - Just for reference, the Levene Statistic is a defines as follows: - - W = \frac{ (n-k)\sum_{i=1}^k n_i(Z_{iL} - Z_{LL})^2} - { (k-1)\sum_{i=1}^k \sum_{j=1}^{n_i} (Z_{ij} - Z_{iL})^2} - - where: - k is the number of groups - n is the total number of samples - n_i is the number of samples in the ith group - Z_{ij} is | Y_{ij} - Y_{iL} | where Y_{iL} is the mean of the ith group - Z_{iL} is the mean of Z_{ij} over the ith group - Z_{LL} is the grand mean of Z_{ij} - - Imagine calculating that with pencil and paper! - - */ - - -struct levene_info -{ - - /* Per group statistics */ - struct t_test_proc **group_stats; - - /* The independent variable */ - const struct variable *v_indep; - - /* Number of dependent variables */ - size_t n_dep; - - /* The dependent variables */ - const struct variable **v_dep; - - /* Filter for missing values */ - enum mv_class exclude; +#include "math/levene.h" - /* An array of lz_stats for each variable */ - struct lz_stats *lz; - - /* The denominator for the expression for the Levene */ - double *lz_denominator; +#include -}; +#include "data/case.h" +#include "data/casereader.h" +#include "data/variable.h" +#include "libpspp/hmap.h" +#include "libpspp/misc.h" -/* Per variable statistics */ -struct lz_stats +struct lev { - /* Total of all lz */ - double grand_total; - - /* Mean of all lz */ - double grand_mean; - - /* The total number of cases */ - double total_n ; + struct hmap_node node; + union value group; + int width; - /* Number of groups */ - int n_groups; + double t_bar; + double z_mean; + double n; }; -/* First pass */ -static void levene_precalc (const struct levene_info *l); -static int levene_calc (const struct dictionary *dict, const struct ccase *, - const struct levene_info *l); -static void levene_postcalc (struct levene_info *); - -/* Second pass */ -static void levene2_precalc (struct levene_info *l); -static int levene2_calc (const struct dictionary *, const struct ccase *, - struct levene_info *l); -static void levene2_postcalc (struct levene_info *); - - -void -levene(const struct dictionary *dict, - struct casereader *reader, - const struct variable *v_indep, size_t n_dep, - const struct variable **v_dep, - enum mv_class exclude) +static struct lev * +find_group (struct hmap *map, const union value *target, int width) { - struct casereader *pass1, *pass2; - struct ccase *c; - struct levene_info l; - - l.n_dep = n_dep; - l.v_indep = v_indep; - l.v_dep = v_dep; - l.exclude = exclude; - l.lz = xnmalloc (l.n_dep, sizeof *l.lz); - l.lz_denominator = xnmalloc (l.n_dep, sizeof *l.lz_denominator); - - casereader_split (reader, &pass1, &pass2); - - levene_precalc (&l); - for (; (c = casereader_read (pass1)) != NULL; case_unref (c)) - levene_calc (dict, c, &l); - casereader_destroy (pass1); - levene_postcalc (&l); - - levene2_precalc(&l); - for (; (c = casereader_read (pass2)) != NULL; case_unref (c)) - levene2_calc (dict, c, &l); - casereader_destroy (pass2); - levene2_postcalc (&l); - - free (l.lz_denominator); - free (l.lz); -} - -static void -levene_precalc (const struct levene_info *l) -{ - size_t i; - - for(i = 0; i < l->n_dep ; ++i ) + struct lev *l = NULL; + unsigned int hash = value_hash (target, width, 0); + HMAP_FOR_EACH_WITH_HASH (l, struct lev, node, hash, map) { - const struct variable *var = l->v_dep[i]; - struct group_proc *gp = group_proc_get (var); - struct group_statistics *gs; - struct hsh_iterator hi; - - l->lz[i].grand_total = 0; - l->lz[i].total_n = 0; - l->lz[i].n_groups = gp->n_groups ; - - - for ( gs = hsh_first(gp->group_hash, &hi); - gs != 0; - gs = hsh_next(gp->group_hash, &hi)) - { - gs->lz_total = 0; - } - + if (value_equal (&l->group, target, width)) + break; + l = NULL; } -} - -static int -levene_calc (const struct dictionary *dict, const struct ccase *c, - const struct levene_info *l) -{ - size_t i; - bool warn = false; - const union value *gv = case_data (c, l->v_indep); - struct group_statistics key; - double weight = dict_get_case_weight (dict, c, &warn); - - key.id = *gv; - - for (i = 0; i < l->n_dep; ++i) - { - const struct variable *var = l->v_dep[i]; - struct group_proc *gp = group_proc_get (var); - double levene_z; - const union value *v = case_data (c, var); - struct group_statistics *gs; - - gs = hsh_find(gp->group_hash,(void *) &key ); - - if ( 0 == gs ) - continue ; - - if ( !var_is_value_missing (var, v, l->exclude)) - { - levene_z= fabs(v->f - gs->mean); - l->lz[i].grand_total += levene_z * weight; - l->lz[i].total_n += weight; - - gs->lz_total += levene_z * weight; - } - } - return 0; + return l; } -static void -levene_postcalc (struct levene_info *l) +double +levene (struct casereader *rx, const struct variable *gvar, + const struct variable *var, const struct variable *wv, + enum mv_class exclude) { - size_t v; - - for (v = 0; v < l->n_dep; ++v) - { - /* This is Z_LL */ - l->lz[v].grand_mean = l->lz[v].grand_total / l->lz[v].total_n ; - } - - -} - - + double numerator = 0.0; + double denominator = 0.0; + int n_groups = 0; + double z_grand_mean = 0.0; + double grand_n = 0.0; -static void -levene2_precalc (struct levene_info *l) -{ - size_t v; + struct hmap map = HMAP_INITIALIZER (map); + struct ccase *c; + struct casereader *r = casereader_clone (rx); - /* This stuff could go in the first post calc . . . */ - for (v = 0; - v < l->n_dep; - ++v) + for (; (c = casereader_read (r)) != NULL; case_unref (c)) { - struct hsh_iterator hi; - struct group_statistics *g; - - const struct variable *var = l->v_dep[v] ; - struct hsh_table *hash = group_proc_get (var)->group_hash; - + struct lev *l = NULL; + const union value *target = case_data (c, gvar); + int width = var_get_width (gvar); + unsigned int hash = value_hash (target, width, 0); + const double x = case_data (c, var)->f; + const double weight = wv ? case_data (c, wv)->f : 1.0; + + if (var_is_value_missing (var, case_data (c, var), exclude)) + continue; - for (g = hsh_first(hash,&hi); g != 0; g = hsh_next(hash, &hi)) + l = find_group (&map, target, width); + + if (l == NULL) { - g->lz_mean = g->lz_total / g->n ; + l = xzalloc (sizeof *l); + value_clone (&l->group, target, width); + hmap_insert (&map, &l->node, hash); } - l->lz_denominator[v] = 0; - } -} -static int -levene2_calc (const struct dictionary *dict, const struct ccase *c, - struct levene_info *l) -{ - size_t i; - bool warn = false; - - double weight = dict_get_case_weight (dict, c, &warn); - - const union value *gv = case_data (c, l->v_indep); - struct group_statistics key; + l->n += weight; + l->t_bar += x * weight; + grand_n += weight; + } + casereader_destroy (r); + + { + struct lev *l; + HMAP_FOR_EACH (l, struct lev, node, &map) + { + l->t_bar /= l->n; + } + } - key.id = *gv; + n_groups = hmap_count (&map); - for (i = 0; i < l->n_dep; ++i) + r = casereader_clone (rx); + for (; (c = casereader_read (r)) != NULL; case_unref (c)) { - double levene_z; - const struct variable *var = l->v_dep[i] ; - const union value *v = case_data (c, var); - struct group_statistics *gs; - - gs = hsh_find(group_proc_get (var)->group_hash,(void *) &key ); + struct lev *l = NULL; + const union value *target = case_data (c, gvar); + int width = var_get_width (gvar); + const double x = case_data (c, var)->f; + const double weight = wv ? case_data (c, wv)->f : 1.0; - if ( 0 == gs ) + if (var_is_value_missing (var, case_data (c, var), exclude)) continue; - if ( !var_is_value_missing (var, v, l->exclude)) - { - levene_z = fabs(v->f - gs->mean); - l->lz_denominator[i] += weight * pow2 (levene_z - gs->lz_mean); - } + l = find_group (&map, target, width); + assert (l); + + l->z_mean += fabs (x - l->t_bar) * weight; + z_grand_mean += fabs (x - l->t_bar) * weight; } + casereader_destroy (r); + + { + struct lev *l; + HMAP_FOR_EACH (l, struct lev, node, &map) + { + l->z_mean /= l->n; + } + } - return 0; -} - + z_grand_mean /= grand_n; -static void -levene2_postcalc (struct levene_info *l) -{ - size_t v; - - for (v = 0; v < l->n_dep; ++v) + r = casereader_clone (rx); + for (; (c = casereader_read (r)) != NULL; case_unref (c)) { - double lz_numerator = 0; - struct hsh_iterator hi; - struct group_statistics *g; - - const struct variable *var = l->v_dep[v] ; - struct group_proc *gp = group_proc_get (var); - struct hsh_table *hash = gp->group_hash; - - for (g = hsh_first(hash, &hi); g != 0; g = hsh_next(hash, &hi)) - { - lz_numerator += g->n * pow2(g->lz_mean - l->lz[v].grand_mean ); - } - lz_numerator *= ( gp->ugs.n - gp->n_groups ); - - l->lz_denominator[v] *= (gp->n_groups - 1); + double z; + struct lev *l; + const union value *target = case_data (c, gvar); + int width = var_get_width (gvar); + const double x = case_data (c, var)->f; + const double weight = wv ? case_data (c, wv)->f : 1.0; + + if (var_is_value_missing (var, case_data (c, var), exclude)) + continue; - gp->levene = lz_numerator / l->lz_denominator[v] ; + l = find_group (&map, target, width); + assert (l); + z = fabs (x - l->t_bar); + denominator += pow2 (z - l->z_mean) * weight; } -} + casereader_destroy (r); + + denominator *= n_groups - 1; + + { + double grand_n = 0.0; + struct lev *next; + struct lev *l; + HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &map) + { + numerator += l->n * pow2 (l->z_mean - z_grand_mean); + grand_n += l->n; + + /* We don't need these anymore */ + free (l); + } + numerator *= grand_n - n_groups ; + hmap_destroy (&map); + } + return numerator/ denominator; +}