X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fmath%2Flevene.c;h=a005c9436091ecd58b992b936be5e74312a405bd;hb=de786d64f39e8d8c7a3b1d86fbd88ec2d6d19fa6;hp=acb1cebfc0a1b07c2cfdebb9f0108d15395429ee;hpb=164d1274fcb70c54897f2a03fc7c27152ed4821a;p=pspp diff --git a/src/math/levene.c b/src/math/levene.c index acb1cebfc0..a005c94360 100644 --- a/src/math/levene.c +++ b/src/math/levene.c @@ -1,344 +1,264 @@ -/* This file is part of GNU PSPP - Computes Levene test statistic. +/* PSPP - a program for statistical analysis. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. - Copyright (C) 2004 Free Software Foundation, Inc. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - 02110-1301, USA. */ + along with this program. If not, see . */ #include -#include "levene.h" -#include -#include -#include -#include -#include "group-proc.h" -#include -#include -#include -#include -#include -#include -#include -#include "group.h" +#include "levene.h" #include -#include - -/* This module calculates the Levene statistic for variables. +#include "libpspp/misc.h" +#include "libpspp/hmap.h" +#include "data/value.h" +#include "data/val-type.h" - Just for reference, the Levene Statistic is a defines as follows: +#include +#include - W = \frac{ (n-k)\sum_{i=1}^k n_i(Z_{iL} - Z_{LL})^2} - { (k-1)\sum_{i=1}^k \sum_{j=1}^{n_i} (Z_{ij} - Z_{iL})^2} - - where: - k is the number of groups - n is the total number of samples - n_i is the number of samples in the ith group - Z_{ij} is | Y_{ij} - Y_{iL} | where Y_{iL} is the mean of the ith group - Z_{iL} is the mean of Z_{ij} over the ith group - Z_{LL} is the grand mean of Z_{ij} - - Imagine calculating that with pencil and paper! +struct lev +{ + struct hmap_node node; + union value group; - */ + double t_bar; + double z_mean; + double n; +}; +typedef unsigned int hash_func (const struct levene *, const union value *v); +typedef bool cmp_func (const struct levene *, const union value *v0, const union value *v1); -struct levene_info +struct levene { + /* Width of the categorical variable */ + int gvw ; - /* Per group statistics */ - struct t_test_proc **group_stats; + /* The value dividing the groups. Valid only for dichotomous categorical variable.*/ + const union value *cutpoint; - /* The independent variable */ - struct variable *v_indep; - /* Number of dependent variables */ - size_t n_dep; + /* A hashtable of struct lev objects indexed by union value */ + struct hmap hmap; - /* The dependent variables */ - struct variable **v_dep; + hash_func *hash; + cmp_func *cmp; - /* Filter for missing values */ - struct casefilter *filter; -}; -/* First pass */ -static void levene_precalc (const struct levene_info *l); -static int levene_calc (const struct dictionary *dict, const struct ccase *, - const struct levene_info *l); -static void levene_postcalc (void *); + /* A state variable indicating how many passes have been done */ + int pass; + double grand_n; + double z_grand_mean; -/* Second pass */ -static void levene2_precalc (struct levene_info *l); -static int levene2_calc (const struct dictionary *, const struct ccase *, - struct levene_info *l); -static void levene2_postcalc (void *); + double denominator; +}; -void -levene(const struct dictionary *dict, - const struct casefile *cf, - struct variable *v_indep, size_t n_dep, struct variable **v_dep, - struct casefilter *filter) +static unsigned int +unique_hash (const struct levene *nl, const union value *val) { - struct casereader *r; - struct ccase c; - struct levene_info l; - - l.n_dep = n_dep; - l.v_indep = v_indep; - l.v_dep = v_dep; - l.filter = filter; + return value_hash (val, nl->gvw, 0); +} +static bool +unique_cmp (const struct levene *nl, const union value *val0, const union value *val1) +{ + return value_equal (val0, val1, nl->gvw); +} - levene_precalc (&l); - for(r = casefile_get_reader (cf, filter); - casereader_read (r, &c) ; - case_destroy (&c)) - { - levene_calc (dict, &c, &l); - } - casereader_destroy (r); - levene_postcalc (&l); +static unsigned int +cutpoint_hash (const struct levene *nl, const union value *val) +{ + int x = value_compare_3way (val, nl->cutpoint, nl->gvw); - levene2_precalc(&l); - for(r = casefile_get_reader (cf, filter); - casereader_read (r, &c) ; - case_destroy (&c)) - { - levene2_calc (dict, &c,&l); - } - casereader_destroy (r); - levene2_postcalc (&l); + return (x < 0); } -/* Internal variables used in calculating the Levene statistic */ - -/* Per variable statistics */ -struct lz_stats +static bool +cutpoint_cmp (const struct levene *nl, const union value *val0, const union value *val1) { - /* Total of all lz */ - double grand_total; + int x = value_compare_3way (val0, nl->cutpoint, nl->gvw); - /* Mean of all lz */ - double grand_mean; + int y = value_compare_3way (val1, nl->cutpoint, nl->gvw); - /* The total number of cases */ - double total_n ; + if (x == 0) x = 1; + if (y == 0) y = 1; - /* Number of groups */ - int n_groups; -}; + return (x == y); +} -/* An array of lz_stats for each variable */ -static struct lz_stats *lz; -static void -levene_precalc (const struct levene_info *l) +static struct lev * +find_group (const struct levene *nl, const union value *target) { - size_t i; - - lz = xnmalloc (l->n_dep, sizeof *lz); + struct lev *l = NULL; - for(i = 0; i < l->n_dep ; ++i ) + HMAP_FOR_EACH_WITH_HASH (l, struct lev, node, nl->hash (nl, target), &nl->hmap) { - struct variable *var = l->v_dep[i]; - struct group_proc *gp = group_proc_get (var); - struct group_statistics *gs; - struct hsh_iterator hi; - - lz[i].grand_total = 0; - lz[i].total_n = 0; - lz[i].n_groups = gp->n_groups ; - - - for ( gs = hsh_first(gp->group_hash, &hi); - gs != 0; - gs = hsh_next(gp->group_hash, &hi)) - { - gs->lz_total = 0; - } - + if (nl->cmp (nl, &l->group, target)) + break; } - + return l; } -static int -levene_calc (const struct dictionary *dict, const struct ccase *c, - const struct levene_info *l) -{ - size_t i; - bool warn = false; - const union value *gv = case_data (c, l->v_indep); - struct group_statistics key; - double weight = dict_get_case_weight (dict, c, &warn); - - key.id = *gv; - for (i = 0; i < l->n_dep; ++i) - { - struct variable *var = l->v_dep[i]; - struct group_proc *gp = group_proc_get (var); - double levene_z; - const union value *v = case_data (c, var); - struct group_statistics *gs; +struct levene * +levene_create (int indep_width, const union value *cutpoint) +{ + struct levene *nl = XZALLOC (struct levene); - gs = hsh_find(gp->group_hash,(void *) &key ); + hmap_init (&nl->hmap); - if ( 0 == gs ) - continue ; + nl->gvw = indep_width; + nl->cutpoint = cutpoint; - if ( ! casefilter_variable_missing (l->filter, c, var)) - { - levene_z= fabs(v->f - gs->mean); - lz[i].grand_total += levene_z * weight; - lz[i].total_n += weight; + nl->hash = cutpoint ? cutpoint_hash : unique_hash; + nl->cmp = cutpoint ? cutpoint_cmp : unique_cmp; - gs->lz_total += levene_z * weight; - } - } - return 0; + return nl; } -static void -levene_postcalc (void *_l) +/* Data accumulation. First pass */ +void +levene_pass_one (struct levene *nl, double value, double weight, const union value *gv) { - size_t v; + struct lev *lev = find_group (nl, gv); - struct levene_info *l = (struct levene_info *) _l; - - for (v = 0; v < l->n_dep; ++v) + if (nl->pass == 0) { - /* This is Z_LL */ - lz[v].grand_mean = lz[v].grand_total / lz[v].total_n ; + nl->pass = 1; } + assert (nl->pass == 1); - -} - + if (NULL == lev) + { + struct lev *l = XZALLOC (struct lev); + value_clone (&l->group, gv, nl->gvw); + hmap_insert (&nl->hmap, &l->node, nl->hash (nl, &l->group)); + lev = l; + } + lev->n += weight; + lev->t_bar += value * weight; -/* The denominator for the expression for the Levene */ -static double *lz_denominator = 0; + nl->grand_n += weight; +} -static void -levene2_precalc (struct levene_info *l) +/* Data accumulation. Second pass */ +void +levene_pass_two (struct levene *nl, double value, double weight, const union value *gv) { - size_t v; - - lz_denominator = xnmalloc (l->n_dep, sizeof *lz_denominator); + struct lev *lev = NULL; - /* This stuff could go in the first post calc . . . */ - for (v = 0; - v < l->n_dep; - ++v) + if (nl->pass == 1) { - struct hsh_iterator hi; - struct group_statistics *g; + struct lev *next; + struct lev *l; + + nl->pass = 2; - struct variable *var = l->v_dep[v] ; - struct hsh_table *hash = group_proc_get (var)->group_hash; + HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &nl->hmap) + { + l->t_bar /= l->n; + } + } + assert (nl->pass == 2); + lev = find_group (nl, gv); - for(g = (struct group_statistics *) hsh_first(hash,&hi); - g != 0 ; - g = (struct group_statistics *) hsh_next(hash,&hi) ) - { - g->lz_mean = g->lz_total / g->n ; - } - lz_denominator[v] = 0; - } + lev->z_mean += fabs (value - lev->t_bar) * weight; + nl->z_grand_mean += fabs (value - lev->t_bar) * weight; } -static int -levene2_calc (const struct dictionary *dict, const struct ccase *c, - struct levene_info *l) +/* Data accumulation. Third pass */ +void +levene_pass_three (struct levene *nl, double value, double weight, const union value *gv) { - size_t i; - bool warn = false; - - double weight = dict_get_case_weight (dict, c, &warn); + double z; + struct lev *lev = NULL; - const union value *gv = case_data (c, l->v_indep); - struct group_statistics key; - - key.id = *gv; - - for (i = 0; i < l->n_dep; ++i) + if (nl->pass == 2) { - double levene_z; - struct variable *var = l->v_dep[i] ; - const union value *v = case_data (c, var); - struct group_statistics *gs; + struct lev *next; + struct lev *l; - gs = hsh_find(group_proc_get (var)->group_hash,(void *) &key ); + nl->pass = 3; - if ( 0 == gs ) - continue; + HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &nl->hmap) + { + l->z_mean /= l->n; + } - if ( ! casefilter_variable_missing (l->filter, c, var)) + nl->z_grand_mean /= nl->grand_n; + } - { - levene_z = fabs(v->f - gs->mean); - lz_denominator[i] += weight * pow2 (levene_z - gs->lz_mean); - } - } + assert (nl->pass == 3); + lev = find_group (nl, gv); - return 0; + z = fabs (value - lev->t_bar); + nl->denominator += pow2 (z - lev->z_mean) * weight; } -static void -levene2_postcalc (void *_l) +/* Return the value of the levene statistic */ +double +levene_calculate (struct levene *nl) { - size_t v; + struct lev *next; + struct lev *l; - struct levene_info *l = (struct levene_info *) _l; + double numerator = 0.0; + double nn = 0.0; - for (v = 0; v < l->n_dep; ++v) - { - double lz_numerator = 0; - struct hsh_iterator hi; - struct group_statistics *g; + /* The Levene calculation requires three passes. + Normally this should have been done prior to calling this function. + However, in abnormal circumstances (eg. the dataset is empty) there + will have been no passes. + */ + assert (nl->pass == 0 || nl->pass == 3); - struct variable *var = l->v_dep[v] ; - struct group_proc *gp = group_proc_get (var); - struct hsh_table *hash = gp->group_hash; + if (nl->pass == 0) + return SYSMIS; - for(g = (struct group_statistics *) hsh_first(hash,&hi); - g != 0 ; - g = (struct group_statistics *) hsh_next(hash,&hi) ) - { - lz_numerator += g->n * pow2(g->lz_mean - lz[v].grand_mean ); - } - lz_numerator *= ( gp->ugs.n - gp->n_groups ); + nl->denominator *= hmap_count (&nl->hmap) - 1; - lz_denominator[v] *= (gp->n_groups - 1); + HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &nl->hmap) + { + numerator += l->n * pow2 (l->z_mean - nl->z_grand_mean); + nn += l->n; + } - gp->levene = lz_numerator / lz_denominator[v] ; + numerator *= nn - hmap_count (&nl->hmap); + return numerator / nl->denominator; +} + +void +levene_destroy (struct levene *nl) +{ + struct lev *next; + struct lev *l; + + HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &nl->hmap) + { + value_destroy (&l->group, nl->gvw); + free (l); } - /* Now clear up after ourselves */ - free(lz_denominator); - free(lz); + hmap_destroy (&nl->hmap); + free (nl); } -