X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fmath%2Flevene.c;h=13fad93e52107ebe5a16ca826fbbedb06dbf8928;hb=81579d9e9f994fb2908f50af41c3eb033d216e58;hp=8fb41fed11db2ac52922c5095ae4e32765e9cd62;hpb=244ade48f9c233532cc535d3233fdef53bf9266b;p=pspp-builds.git diff --git a/src/math/levene.c b/src/math/levene.c index 8fb41fed..13fad93e 100644 --- a/src/math/levene.c +++ b/src/math/levene.c @@ -1,378 +1,180 @@ -/* This file is part of GNU PSPP - Computes Levene test statistic. +/* PSPP - a program for statistical analysis. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. - Copyright (C) 2004 Free Software Foundation, Inc. - Written by John Darrington + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - 02110-1301, USA. */ + along with this program. If not, see . */ #include -#include "levene.h" -#include -#include -#include -#include -#include "group-proc.h" -#include -#include -#include -#include -#include -#include -#include "group.h" - -#include -#include - - -/* This module calculates the Levene statistic for variables. - - Just for reference, the Levene Statistic is a defines as follows: - - W = \frac{ (n-k)\sum_{i=1}^k n_i(Z_{iL} - Z_{LL})^2} - { (k-1)\sum_{i=1}^k \sum_{j=1}^{n_i} (Z_{ij} - Z_{iL})^2} - where: - k is the number of groups - n is the total number of samples - n_i is the number of samples in the ith group - Z_{ij} is | Y_{ij} - Y_{iL} | where Y_{iL} is the mean of the ith group - Z_{iL} is the mean of Z_{ij} over the ith group - Z_{LL} is the grand mean of Z_{ij} +#include "math/levene.h" - Imagine calculating that with pencil and paper! - - */ +#include +#include "data/case.h" +#include "data/casereader.h" +#include "data/variable.h" +#include "libpspp/hmap.h" +#include "libpspp/misc.h" -struct levene_info +struct lev { + struct hmap_node node; + union value group; + int width; - /* Per group statistics */ - struct t_test_proc **group_stats; - - /* The independent variable */ - struct variable *v_indep; - - /* Number of dependent variables */ - size_t n_dep; - - /* The dependent variables */ - struct variable **v_dep; - - /* How to treat missing values */ - enum lev_missing missing; - - /* Function to test for missing values */ - is_missing_func *is_missing; + double t_bar; + double z_mean; + double n; }; -/* First pass */ -static void levene_precalc (const struct levene_info *l); -static int levene_calc (const struct dictionary *dict, const struct ccase *, void *); -static void levene_postcalc (void *); - -/* Second pass */ -static void levene2_precalc (void *); -static int levene2_calc (const struct dictionary *,const struct ccase *, void *); -static void levene2_postcalc (void *); - - -void -levene(const struct dictionary *dict, - const struct casefile *cf, - struct variable *v_indep, size_t n_dep, struct variable **v_dep, - enum lev_missing missing, is_missing_func value_is_missing) +static struct lev * +find_group (struct hmap *map, const union value *target, int width) { - struct casereader *r; - struct ccase c; - struct levene_info l; - - l.n_dep = n_dep; - l.v_indep = v_indep; - l.v_dep = v_dep; - l.missing = missing; - l.is_missing = value_is_missing; - - - - levene_precalc(&l); - for(r = casefile_get_reader (cf); - casereader_read (r, &c) ; - case_destroy (&c)) + struct lev *l = NULL; + unsigned int hash = value_hash (target, width, 0); + HMAP_FOR_EACH_WITH_HASH (l, struct lev, node, hash, map) { - levene_calc (dict, &c,&l); + if (value_equal (&l->group, target, width)) + break; + l = NULL; } - casereader_destroy (r); - levene_postcalc(&l); - - levene2_precalc(&l); - for(r = casefile_get_reader (cf); - casereader_read (r, &c) ; - case_destroy (&c)) - { - levene2_calc (dict, &c,&l); - } - casereader_destroy (r); - levene2_postcalc(&l); + return l; } -/* Internal variables used in calculating the Levene statistic */ -/* Per variable statistics */ -struct lz_stats +double +levene (struct casereader *rx, const struct variable *gvar, + const struct variable *var, const struct variable *wv, + enum mv_class exclude) { - /* Total of all lz */ - double grand_total; - - /* Mean of all lz */ - double grand_mean; + double numerator = 0.0; + double denominator = 0.0; + int n_groups = 0; + double z_grand_mean = 0.0; + double grand_n = 0.0; - /* The total number of cases */ - double total_n ; + struct hmap map = HMAP_INITIALIZER (map); - /* Number of groups */ - int n_groups; -}; - -/* An array of lz_stats for each variable */ -static struct lz_stats *lz; + struct ccase *c; + struct casereader *r = casereader_clone (rx); - -static void -levene_precalc (const struct levene_info *l) -{ - size_t i; - - lz = xnmalloc (l->n_dep, sizeof *lz); - - for(i = 0; i < l->n_dep ; ++i ) + for (; (c = casereader_read (r)) != NULL; case_unref (c)) { - struct variable *var = l->v_dep[i]; - struct group_proc *gp = group_proc_get (var); - struct group_statistics *gs; - struct hsh_iterator hi; - - lz[i].grand_total = 0; - lz[i].total_n = 0; - lz[i].n_groups = gp->n_groups ; + struct lev *l = NULL; + const union value *target = case_data (c, gvar); + int width = var_get_width (gvar); + unsigned int hash = value_hash (target, width, 0); + const double x = case_data (c, var)->f; + const double weight = wv ? case_data (c, wv)->f : 1.0; + + if (var_is_value_missing (var, case_data (c, var), exclude)) + continue; + l = find_group (&map, target, width); - for ( gs = hsh_first(gp->group_hash, &hi); - gs != 0; - gs = hsh_next(gp->group_hash, &hi)) + if (l == NULL) { - gs->lz_total = 0; + l = xzalloc (sizeof *l); + value_clone (&l->group, target, width); + hmap_insert (&map, &l->node, hash); } - - } - -} - -static int -levene_calc (const struct dictionary *dict, const struct ccase *c, void *_l) -{ - size_t i; - bool warn = false; - struct levene_info *l = (struct levene_info *) _l; - const union value *gv = case_data (c, l->v_indep->fv); - struct group_statistics key; - double weight = dict_get_case_weight (dict, c, &warn); - - /* Skip the entire case if /MISSING=LISTWISE is set */ - if ( l->missing == LEV_LISTWISE ) - { - for (i = 0; i < l->n_dep; ++i) - { - struct variable *v = l->v_dep[i]; - const union value *val = case_data (c, v->fv); - - if (l->is_missing (&v->miss, val) ) - { - return 0; - } - } - } - - key.id = *gv; - - for (i = 0; i < l->n_dep; ++i) - { - struct variable *var = l->v_dep[i]; - struct group_proc *gp = group_proc_get (var); - double levene_z; - const union value *v = case_data (c, var->fv); - struct group_statistics *gs; - - gs = hsh_find(gp->group_hash,(void *) &key ); - - if ( 0 == gs ) - continue ; - - if ( ! l->is_missing(&var->miss, v)) - { - levene_z= fabs(v->f - gs->mean); - lz[i].grand_total += levene_z * weight; - lz[i].total_n += weight; - - gs->lz_total += levene_z * weight; - } - - } - return 0; -} - -static void -levene_postcalc (void *_l) -{ - size_t v; - - struct levene_info *l = (struct levene_info *) _l; - - for (v = 0; v < l->n_dep; ++v) - { - /* This is Z_LL */ - lz[v].grand_mean = lz[v].grand_total / lz[v].total_n ; + l->n += weight; + l->t_bar += x * weight; + grand_n += weight; } + casereader_destroy (r); - -} - - -/* The denominator for the expression for the Levene */ -static double *lz_denominator; - -static void -levene2_precalc (void *_l) -{ - size_t v; - - struct levene_info *l = (struct levene_info *) _l; - - lz_denominator = xnmalloc (l->n_dep, sizeof *lz_denominator); - - /* This stuff could go in the first post calc . . . */ - for (v = 0; v < l->n_dep; ++v) - { - struct hsh_iterator hi; - struct group_statistics *g; - - struct variable *var = l->v_dep[v] ; - struct hsh_table *hash = group_proc_get (var)->group_hash; - - - for(g = (struct group_statistics *) hsh_first(hash,&hi); - g != 0 ; - g = (struct group_statistics *) hsh_next(hash,&hi) ) - { - g->lz_mean = g->lz_total / g->n ; - } - lz_denominator[v] = 0; + { + struct lev *l; + HMAP_FOR_EACH (l, struct lev, node, &map) + { + l->t_bar /= l->n; + } } -} - -static int -levene2_calc (const struct dictionary *dict, const struct ccase *c, void *_l) -{ - size_t i; - bool warn = false; - - struct levene_info *l = (struct levene_info *) _l; - - double weight = dict_get_case_weight (dict, c, &warn); - const union value *gv = case_data (c, l->v_indep->fv); - struct group_statistics key; + n_groups = hmap_count (&map); - /* Skip the entire case if /MISSING=LISTWISE is set */ - if ( l->missing == LEV_LISTWISE ) + r = casereader_clone (rx); + for (; (c = casereader_read (r)) != NULL; case_unref (c)) { - for (i = 0; i < l->n_dep; ++i) - { - struct variable *v = l->v_dep[i]; - const union value *val = case_data (c, v->fv); - - if (l->is_missing(&v->miss, val) ) - { - return 0; - } - } - } - - key.id = *gv; - - for (i = 0; i < l->n_dep; ++i) - { - double levene_z; - struct variable *var = l->v_dep[i] ; - const union value *v = case_data (c, var->fv); - struct group_statistics *gs; + struct lev *l = NULL; + const union value *target = case_data (c, gvar); + int width = var_get_width (gvar); + const double x = case_data (c, var)->f; + const double weight = wv ? case_data (c, wv)->f : 1.0; - gs = hsh_find(group_proc_get (var)->group_hash,(void *) &key ); - - if ( 0 == gs ) + if (var_is_value_missing (var, case_data (c, var), exclude)) continue; - if ( ! l->is_missing (&var->miss, v) ) - { - levene_z = fabs(v->f - gs->mean); - lz_denominator[i] += weight * pow2(levene_z - gs->lz_mean); - } + l = find_group (&map, target, width); + assert (l); + + l->z_mean += fabs (x - l->t_bar) * weight; + z_grand_mean += fabs (x - l->t_bar) * weight; } + casereader_destroy (r); - return 0; -} - - -static void -levene2_postcalc (void *_l) -{ - size_t v; + { + struct lev *l; + HMAP_FOR_EACH (l, struct lev, node, &map) + { + l->z_mean /= l->n; + } + } - struct levene_info *l = (struct levene_info *) _l; + z_grand_mean /= grand_n; - for (v = 0; v < l->n_dep; ++v) + r = casereader_clone (rx); + for (; (c = casereader_read (r)) != NULL; case_unref (c)) { - double lz_numerator = 0; - struct hsh_iterator hi; - struct group_statistics *g; - - struct variable *var = l->v_dep[v] ; - struct group_proc *gp = group_proc_get (var); - struct hsh_table *hash = gp->group_hash; - - for(g = (struct group_statistics *) hsh_first(hash,&hi); - g != 0 ; - g = (struct group_statistics *) hsh_next(hash,&hi) ) - { - lz_numerator += g->n * pow2(g->lz_mean - lz[v].grand_mean ); - } - lz_numerator *= ( gp->ugs.n - gp->n_groups ); - - lz_denominator[v] *= (gp->n_groups - 1); + double z; + struct lev *l; + const union value *target = case_data (c, gvar); + int width = var_get_width (gvar); + const double x = case_data (c, var)->f; + const double weight = wv ? case_data (c, wv)->f : 1.0; + + if (var_is_value_missing (var, case_data (c, var), exclude)) + continue; - gp->levene = lz_numerator / lz_denominator[v] ; + l = find_group (&map, target, width); + assert (l); + z = fabs (x - l->t_bar); + denominator += pow2 (z - l->z_mean) * weight; } + casereader_destroy (r); - /* Now clear up after ourselves */ - free(lz_denominator); - free(lz); -} + denominator *= n_groups - 1; + + { + double grand_n = 0.0; + struct lev *next; + struct lev *l; + HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &map) + { + numerator += l->n * pow2 (l->z_mean - z_grand_mean); + grand_n += l->n; + + /* We don't need these anymore */ + free (l); + } + numerator *= grand_n - n_groups ; + hmap_destroy (&map); + } + return numerator/ denominator; +}