X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fmath%2Flevene.c;h=a005c9436091ecd58b992b936be5e74312a405bd;hb=cee6f0eb54144da7034566fa1bcdcee22337ae6a;hp=920440103b4d5b33bcb6721746b1048591f61693;hpb=e16d30a43af6ada7b8846f48a55eb3cb8f4e5f22;p=pspp diff --git a/src/math/levene.c b/src/math/levene.c index 920440103b..a005c94360 100644 --- a/src/math/levene.c +++ b/src/math/levene.c @@ -1,378 +1,264 @@ -/* This file is part of GNU PSPP - Computes Levene test statistic. +/* PSPP - a program for statistical analysis. + Copyright (C) 2010, 2011 Free Software Foundation, Inc. - Copyright (C) 2004 Free Software Foundation, Inc. - Written by John Darrington + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - 02110-1301, USA. */ + along with this program. If not, see . */ #include -#include "levene.h" -#include -#include -#include -#include -#include "group-proc.h" -#include -#include -#include -#include -#include -#include -#include "group.h" +#include "levene.h" #include -#include - -/* This module calculates the Levene statistic for variables. +#include "libpspp/misc.h" +#include "libpspp/hmap.h" +#include "data/value.h" +#include "data/val-type.h" - Just for reference, the Levene Statistic is a defines as follows: +#include +#include - W = \frac{ (n-k)\sum_{i=1}^k n_i(Z_{iL} - Z_{LL})^2} - { (k-1)\sum_{i=1}^k \sum_{j=1}^{n_i} (Z_{ij} - Z_{iL})^2} - - where: - k is the number of groups - n is the total number of samples - n_i is the number of samples in the ith group - Z_{ij} is | Y_{ij} - Y_{iL} | where Y_{iL} is the mean of the ith group - Z_{iL} is the mean of Z_{ij} over the ith group - Z_{LL} is the grand mean of Z_{ij} - - Imagine calculating that with pencil and paper! +struct lev +{ + struct hmap_node node; + union value group; - */ + double t_bar; + double z_mean; + double n; +}; +typedef unsigned int hash_func (const struct levene *, const union value *v); +typedef bool cmp_func (const struct levene *, const union value *v0, const union value *v1); -struct levene_info +struct levene { + /* Width of the categorical variable */ + int gvw ; - /* Per group statistics */ - struct t_test_proc **group_stats; + /* The value dividing the groups. Valid only for dichotomous categorical variable.*/ + const union value *cutpoint; - /* The independent variable */ - struct variable *v_indep; - /* Number of dependent variables */ - size_t n_dep; + /* A hashtable of struct lev objects indexed by union value */ + struct hmap hmap; - /* The dependent variables */ - struct variable **v_dep; + hash_func *hash; + cmp_func *cmp; - /* How to treat missing values */ - enum lev_missing missing; - /* Function to test for missing values */ - is_missing_func *is_missing; -}; - -/* First pass */ -static void levene_precalc (const struct levene_info *l); -static int levene_calc (const struct ccase *, void *); -static void levene_postcalc (void *); + /* A state variable indicating how many passes have been done */ + int pass; + double grand_n; + double z_grand_mean; -/* Second pass */ -static void levene2_precalc (void *); -static int levene2_calc (const struct ccase *, void *); -static void levene2_postcalc (void *); + double denominator; +}; -void -levene(const struct casefile *cf, - struct variable *v_indep, size_t n_dep, struct variable **v_dep, - enum lev_missing missing, is_missing_func value_is_missing) +static unsigned int +unique_hash (const struct levene *nl, const union value *val) { - struct casereader *r; - struct ccase c; - struct levene_info l; - - l.n_dep = n_dep; - l.v_indep = v_indep; - l.v_dep = v_dep; - l.missing = missing; - l.is_missing = value_is_missing; - - + return value_hash (val, nl->gvw, 0); +} - levene_precalc(&l); - for(r = casefile_get_reader (cf); - casereader_read (r, &c) ; - case_destroy (&c)) - { - levene_calc(&c,&l); - } - casereader_destroy (r); - levene_postcalc(&l); +static bool +unique_cmp (const struct levene *nl, const union value *val0, const union value *val1) +{ + return value_equal (val0, val1, nl->gvw); +} - levene2_precalc(&l); - for(r = casefile_get_reader (cf); - casereader_read (r, &c) ; - case_destroy (&c)) - { - levene2_calc(&c,&l); - } - casereader_destroy (r); - levene2_postcalc(&l); +static unsigned int +cutpoint_hash (const struct levene *nl, const union value *val) +{ + int x = value_compare_3way (val, nl->cutpoint, nl->gvw); + return (x < 0); } -/* Internal variables used in calculating the Levene statistic */ - -/* Per variable statistics */ -struct lz_stats +static bool +cutpoint_cmp (const struct levene *nl, const union value *val0, const union value *val1) { - /* Total of all lz */ - double grand_total; + int x = value_compare_3way (val0, nl->cutpoint, nl->gvw); - /* Mean of all lz */ - double grand_mean; + int y = value_compare_3way (val1, nl->cutpoint, nl->gvw); - /* The total number of cases */ - double total_n ; + if (x == 0) x = 1; + if (y == 0) y = 1; - /* Number of groups */ - int n_groups; -}; + return (x == y); +} -/* An array of lz_stats for each variable */ -static struct lz_stats *lz; -static void -levene_precalc (const struct levene_info *l) +static struct lev * +find_group (const struct levene *nl, const union value *target) { - size_t i; - - lz = xnmalloc (l->n_dep, sizeof *lz); + struct lev *l = NULL; - for(i = 0; i < l->n_dep ; ++i ) + HMAP_FOR_EACH_WITH_HASH (l, struct lev, node, nl->hash (nl, target), &nl->hmap) { - struct variable *var = l->v_dep[i]; - struct group_proc *gp = group_proc_get (var); - struct group_statistics *gs; - struct hsh_iterator hi; - - lz[i].grand_total = 0; - lz[i].total_n = 0; - lz[i].n_groups = gp->n_groups ; - - - for ( gs = hsh_first(gp->group_hash, &hi); - gs != 0; - gs = hsh_next(gp->group_hash, &hi)) - { - gs->lz_total = 0; - } - + if (nl->cmp (nl, &l->group, target)) + break; } - + return l; } -static int -levene_calc (const struct ccase *c, void *_l) -{ - size_t i; - bool warn = false; - struct levene_info *l = (struct levene_info *) _l; - const union value *gv = case_data (c, l->v_indep->fv); - struct group_statistics key; - double weight = dict_get_case_weight (default_dict, c, &warn); - - /* Skip the entire case if /MISSING=LISTWISE is set */ - if ( l->missing == LEV_LISTWISE ) - { - for (i = 0; i < l->n_dep; ++i) - { - struct variable *v = l->v_dep[i]; - const union value *val = case_data (c, v->fv); - - if (l->is_missing (&v->miss, val) ) - { - return 0; - } - } - } - - - key.id = *gv; - for (i = 0; i < l->n_dep; ++i) - { - struct variable *var = l->v_dep[i]; - struct group_proc *gp = group_proc_get (var); - double levene_z; - const union value *v = case_data (c, var->fv); - struct group_statistics *gs; - - gs = hsh_find(gp->group_hash,(void *) &key ); +struct levene * +levene_create (int indep_width, const union value *cutpoint) +{ + struct levene *nl = XZALLOC (struct levene); - if ( 0 == gs ) - continue ; + hmap_init (&nl->hmap); - if ( ! l->is_missing(&var->miss, v)) - { - levene_z= fabs(v->f - gs->mean); - lz[i].grand_total += levene_z * weight; - lz[i].total_n += weight; + nl->gvw = indep_width; + nl->cutpoint = cutpoint; - gs->lz_total += levene_z * weight; - } + nl->hash = cutpoint ? cutpoint_hash : unique_hash; + nl->cmp = cutpoint ? cutpoint_cmp : unique_cmp; - } - return 0; + return nl; } -static void -levene_postcalc (void *_l) +/* Data accumulation. First pass */ +void +levene_pass_one (struct levene *nl, double value, double weight, const union value *gv) { - size_t v; + struct lev *lev = find_group (nl, gv); - struct levene_info *l = (struct levene_info *) _l; - - for (v = 0; v < l->n_dep; ++v) + if (nl->pass == 0) { - /* This is Z_LL */ - lz[v].grand_mean = lz[v].grand_total / lz[v].total_n ; + nl->pass = 1; } + assert (nl->pass == 1); - -} + if (NULL == lev) + { + struct lev *l = XZALLOC (struct lev); + value_clone (&l->group, gv, nl->gvw); + hmap_insert (&nl->hmap, &l->node, nl->hash (nl, &l->group)); + lev = l; + } + lev->n += weight; + lev->t_bar += value * weight; -/* The denominator for the expression for the Levene */ -static double *lz_denominator; + nl->grand_n += weight; +} -static void -levene2_precalc (void *_l) +/* Data accumulation. Second pass */ +void +levene_pass_two (struct levene *nl, double value, double weight, const union value *gv) { - size_t v; - - struct levene_info *l = (struct levene_info *) _l; + struct lev *lev = NULL; - lz_denominator = xnmalloc (l->n_dep, sizeof *lz_denominator); - - /* This stuff could go in the first post calc . . . */ - for (v = 0; v < l->n_dep; ++v) + if (nl->pass == 1) { - struct hsh_iterator hi; - struct group_statistics *g; + struct lev *next; + struct lev *l; + + nl->pass = 2; - struct variable *var = l->v_dep[v] ; - struct hsh_table *hash = group_proc_get (var)->group_hash; + HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &nl->hmap) + { + l->t_bar /= l->n; + } + } + assert (nl->pass == 2); + lev = find_group (nl, gv); - for(g = (struct group_statistics *) hsh_first(hash,&hi); - g != 0 ; - g = (struct group_statistics *) hsh_next(hash,&hi) ) - { - g->lz_mean = g->lz_total / g->n ; - } - lz_denominator[v] = 0; - } + lev->z_mean += fabs (value - lev->t_bar) * weight; + nl->z_grand_mean += fabs (value - lev->t_bar) * weight; } -static int -levene2_calc (const struct ccase *c, void *_l) +/* Data accumulation. Third pass */ +void +levene_pass_three (struct levene *nl, double value, double weight, const union value *gv) { - size_t i; - bool warn = false; - - struct levene_info *l = (struct levene_info *) _l; - - double weight = dict_get_case_weight (default_dict, c, &warn); + double z; + struct lev *lev = NULL; - const union value *gv = case_data (c, l->v_indep->fv); - struct group_statistics key; - - /* Skip the entire case if /MISSING=LISTWISE is set */ - if ( l->missing == LEV_LISTWISE ) + if (nl->pass == 2) { - for (i = 0; i < l->n_dep; ++i) - { - struct variable *v = l->v_dep[i]; - const union value *val = case_data (c, v->fv); - - if (l->is_missing(&v->miss, val) ) - { - return 0; - } - } - } + struct lev *next; + struct lev *l; - key.id = *gv; - - for (i = 0; i < l->n_dep; ++i) - { - double levene_z; - struct variable *var = l->v_dep[i] ; - const union value *v = case_data (c, var->fv); - struct group_statistics *gs; + nl->pass = 3; - gs = hsh_find(group_proc_get (var)->group_hash,(void *) &key ); + HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &nl->hmap) + { + l->z_mean /= l->n; + } - if ( 0 == gs ) - continue; + nl->z_grand_mean /= nl->grand_n; + } - if ( ! l->is_missing (&var->miss, v) ) - { - levene_z = fabs(v->f - gs->mean); - lz_denominator[i] += weight * pow2(levene_z - gs->lz_mean); - } - } + assert (nl->pass == 3); + lev = find_group (nl, gv); - return 0; + z = fabs (value - lev->t_bar); + nl->denominator += pow2 (z - lev->z_mean) * weight; } -static void -levene2_postcalc (void *_l) +/* Return the value of the levene statistic */ +double +levene_calculate (struct levene *nl) { - size_t v; + struct lev *next; + struct lev *l; - struct levene_info *l = (struct levene_info *) _l; + double numerator = 0.0; + double nn = 0.0; - for (v = 0; v < l->n_dep; ++v) - { - double lz_numerator = 0; - struct hsh_iterator hi; - struct group_statistics *g; + /* The Levene calculation requires three passes. + Normally this should have been done prior to calling this function. + However, in abnormal circumstances (eg. the dataset is empty) there + will have been no passes. + */ + assert (nl->pass == 0 || nl->pass == 3); - struct variable *var = l->v_dep[v] ; - struct group_proc *gp = group_proc_get (var); - struct hsh_table *hash = gp->group_hash; + if (nl->pass == 0) + return SYSMIS; - for(g = (struct group_statistics *) hsh_first(hash,&hi); - g != 0 ; - g = (struct group_statistics *) hsh_next(hash,&hi) ) - { - lz_numerator += g->n * pow2(g->lz_mean - lz[v].grand_mean ); - } - lz_numerator *= ( gp->ugs.n - gp->n_groups ); + nl->denominator *= hmap_count (&nl->hmap) - 1; - lz_denominator[v] *= (gp->n_groups - 1); + HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &nl->hmap) + { + numerator += l->n * pow2 (l->z_mean - nl->z_grand_mean); + nn += l->n; + } + + numerator *= nn - hmap_count (&nl->hmap); - gp->levene = lz_numerator / lz_denominator[v] ; + return numerator / nl->denominator; +} +void +levene_destroy (struct levene *nl) +{ + struct lev *next; + struct lev *l; + + HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &nl->hmap) + { + value_destroy (&l->group, nl->gvw); + free (l); } - /* Now clear up after ourselves */ - free(lz_denominator); - free(lz); + hmap_destroy (&nl->hmap); + free (nl); } -