From 889a94b673edadc571a0d0f3763e304f573ff2ec Mon Sep 17 00:00:00 2001 From: John Darrington Date: Sun, 17 Oct 2010 14:03:41 +0200 Subject: [PATCH] Re-implemented the levene calculation --- src/language/stats/oneway.c | 227 ++++----------------- src/language/stats/t-test.q | 9 +- src/math/levene.c | 383 ++++++++++++------------------------ src/math/levene.h | 28 +-- 4 files changed, 171 insertions(+), 476 deletions(-) diff --git a/src/language/stats/oneway.c b/src/language/stats/oneway.c index ea587cec..79b40cbd 100644 --- a/src/language/stats/oneway.c +++ b/src/language/stats/oneway.c @@ -19,9 +19,14 @@ #include #include #include +#include +#include +#include + #include #include +#include #include #include #include @@ -33,15 +38,9 @@ #include #include -#include -#include -#include #include -#include #include -#include -#include #include #include @@ -55,6 +54,7 @@ #include "gettext.h" #define _(msgid) gettext (msgid) + enum missing_type { MISS_LISTWISE, @@ -115,6 +115,7 @@ struct descriptive_data /* Workspace variable for each dependent variable */ struct per_var_ws { + struct categoricals *cat; struct covariance *cov; double sst; @@ -124,6 +125,7 @@ struct per_var_ws int n_groups; double mse; + double levene_w; }; struct oneway_workspace @@ -132,10 +134,6 @@ struct oneway_workspace missing values are disregarded */ int actual_number_of_groups; - /* A hash table containing all the distinct values of the independent - variable */ - struct hsh_table *group_hash; - struct per_var_ws *vws; /* An array of descriptive data. One for each dependent variable */ @@ -295,32 +293,6 @@ cmd_oneway (struct lexer *lexer, struct dataset *ds) -static int -compare_double_3way (const void *a_, const void *b_, const void *aux UNUSED) -{ - const double *a = a_; - const double *b = b_; - return *a < *b ? -1 : *a > *b; -} - -static unsigned -do_hash_double (const void *value_, const void *aux UNUSED) -{ - const double *value = value_; - return hash_double (*value, 0); -} - -static void -free_double (void *value_, const void *aux UNUSED) -{ - double *value = value_; - free (value); -} - - - -static void postcalc (const struct oneway_spec *cmd); -static void precalc (const struct oneway_spec *cmd); static struct descriptive_data * dd_create (const struct variable *var) @@ -335,6 +307,12 @@ dd_create (const struct variable *var) return dd; } +static void +dd_destroy (struct descriptive_data *dd) +{ + moments1_destroy (dd->mom); + free (dd); +} static void * makeit (void *aux1, void *aux2 UNUSED) @@ -406,7 +384,7 @@ run_oneway (const struct oneway_spec *cmd, struct oneway_workspace ws; ws.actual_number_of_groups = 0; - ws.vws = xmalloc (cmd->n_vars * sizeof (*ws.vws)); + ws.vws = xzalloc (cmd->n_vars * sizeof (*ws.vws)); ws.dd_total = xmalloc (sizeof (struct descriptive_data) * cmd->n_vars); for (v = 0 ; v < cmd->n_vars; ++v) @@ -414,14 +392,14 @@ run_oneway (const struct oneway_spec *cmd, for (v = 0; v < cmd->n_vars; ++v) { - struct categoricals *cats = categoricals_create (&cmd->indep_var, 1, + ws.vws[v].cat = categoricals_create (&cmd->indep_var, 1, cmd->wv, cmd->exclude, makeit, updateit, cmd->vars[v], ws.dd_total[v]); ws.vws[v].cov = covariance_2pass_create (1, &cmd->vars[v], - cats, + ws.vws[v].cat, cmd->wv, cmd->exclude); } @@ -429,21 +407,13 @@ run_oneway (const struct oneway_spec *cmd, if (c == NULL) { casereader_destroy (input); - return; + goto finish; } output_split_file_values (ds, c); case_unref (c); taint = taint_clone (casereader_get_taint (input)); - ws.group_hash = hsh_create (4, - compare_double_3way, - do_hash_double, - free_double, - cmd->indep_var); - - precalc (cmd); - input = casereader_create_filter_missing (input, &cmd->indep_var, 1, cmd->exclude, NULL, NULL); if (cmd->missing_type == MISS_LISTWISE) @@ -451,30 +421,26 @@ run_oneway (const struct oneway_spec *cmd, cmd->exclude, NULL, NULL); input = casereader_create_filter_weight (input, dict, NULL, NULL); + + if (cmd->stats & STATS_HOMOGENEITY) + for (v = 0; v < cmd->n_vars; ++v) + { + struct per_var_ws *pvw = &ws.vws[v]; + + pvw->levene_w = levene (input, cmd->indep_var, cmd->vars[v], cmd->wv, cmd->exclude); + } + reader = casereader_clone (input); for (; (c = casereader_read (reader)) != NULL; case_unref (c)) { - size_t i; - - const double weight = dict_get_case_weight (dict, c, NULL); - - const union value *indep_val = case_data (c, cmd->indep_var); - void **p = hsh_probe (ws.group_hash, &indep_val->f); - if (*p == NULL) - { - double *value = *p = xmalloc (sizeof *value); - *value = indep_val->f; - } + int i; for (i = 0; i < cmd->n_vars; ++i) { struct per_var_ws *pvw = &ws.vws[i]; const struct variable *v = cmd->vars[i]; const union value *val = case_data (c, v); - struct group_proc *gp = group_proc_get (cmd->vars[i]); - struct hsh_table *group_hash = gp->group_hash; - struct group_statistics *gs; if ( MISS_ANALYSIS == cmd->missing_type) { @@ -483,51 +449,7 @@ run_oneway (const struct oneway_spec *cmd, } covariance_accumulate_pass1 (pvw->cov, c); - - gs = hsh_find (group_hash, indep_val ); - - if ( ! gs ) - { - gs = xmalloc (sizeof *gs); - gs->id = *indep_val; - gs->sum = 0; - gs->n = 0; - gs->ssq = 0; - gs->sum_diff = 0; - gs->minimum = DBL_MAX; - gs->maximum = -DBL_MAX; - - hsh_insert ( group_hash, gs ); - } - - if (!var_is_value_missing (v, val, cmd->exclude)) - { - struct group_statistics *totals = &gp->ugs; - - totals->n += weight; - totals->sum += weight * val->f; - totals->ssq += weight * pow2 (val->f); - - if ( val->f < totals->minimum ) - totals->minimum = val->f; - - if ( val->f > totals->maximum ) - totals->maximum = val->f; - - gs->n += weight; - gs->sum += weight * val->f; - gs->ssq += weight * pow2 (val->f); - - if ( val->f < gs->minimum ) - gs->minimum = val->f; - - if ( val->f > gs->maximum ) - gs->maximum = val->f; - } - - gp->n_groups = hsh_count (group_hash ); } - } casereader_destroy (reader); reader = casereader_clone (input); @@ -575,9 +497,6 @@ run_oneway (const struct oneway_spec *cmd, pvw->mse = (pvw->sst - pvw->ssa) / (n - pvw->n_groups); } - postcalc (cmd); - - for (v = 0; v < cmd->n_vars; ++v) { struct categoricals *cats = covariance_get_categoricals (ws.vws[v].cov); @@ -588,85 +507,22 @@ run_oneway (const struct oneway_spec *cmd, ws.actual_number_of_groups = categoricals_total (cats); } - if ( cmd->stats & STATS_HOMOGENEITY ) - levene (dict, casereader_clone (input), cmd->indep_var, - cmd->n_vars, cmd->vars, cmd->exclude); - casereader_destroy (input); if (!taint_has_tainted_successor (taint)) output_oneway (cmd, &ws); taint_destroy (taint); -} -/* Pre calculations */ -static void -precalc (const struct oneway_spec *cmd) -{ - size_t i = 0; - - for (i = 0; i < cmd->n_vars; ++i) + finish: + for (v = 0; v < cmd->n_vars; ++v) { - struct group_proc *gp = group_proc_get (cmd->vars[i]); - struct group_statistics *totals = &gp->ugs; - - /* Create a hash for each of the dependent variables. - The hash contains a group_statistics structure, - and is keyed by value of the independent variable */ - - gp->group_hash = hsh_create (4, compare_group, hash_group, - (hsh_free_func *) free_group, - cmd->indep_var); - - totals->sum = 0; - totals->n = 0; - totals->ssq = 0; - totals->sum_diff = 0; - totals->maximum = -DBL_MAX; - totals->minimum = DBL_MAX; + covariance_destroy (ws.vws[v].cov); + dd_destroy (ws.dd_total[v]); } -} + free (ws.vws); + free (ws.dd_total); -/* Post calculations for the ONEWAY command */ -static void -postcalc (const struct oneway_spec *cmd) -{ - size_t i = 0; - - for (i = 0; i < cmd->n_vars; ++i) - { - struct group_proc *gp = group_proc_get (cmd->vars[i]); - struct hsh_table *group_hash = gp->group_hash; - struct group_statistics *totals = &gp->ugs; - - struct hsh_iterator g; - struct group_statistics *gs; - - for (gs = hsh_first (group_hash, &g); - gs != 0; - gs = hsh_next (group_hash, &g)) - { - gs->mean = gs->sum / gs->n; - gs->s_std_dev = sqrt (gs->ssq / gs->n - pow2 (gs->mean)); - - gs->std_dev = sqrt ( - gs->n / (gs->n - 1) * - ( gs->ssq / gs->n - pow2 (gs->mean)) - ); - - gs->se_mean = gs->std_dev / sqrt (gs->n); - gs->mean_diff = gs->sum_diff / gs->n; - } - - totals->mean = totals->sum / totals->n; - totals->std_dev = sqrt ( - totals->n / (totals->n - 1) * - (totals->ssq / totals->n - pow2 (totals->mean)) - ); - - totals->se_mean = totals->std_dev / sqrt (totals->n); - } } static void show_contrast_coeffs (const struct oneway_spec *cmd, const struct oneway_workspace *ws); @@ -714,16 +570,6 @@ output_oneway (const struct oneway_spec *cmd, struct oneway_workspace *ws) show_contrast_coeffs (cmd, ws); show_contrast_tests (cmd, ws); } - - /* Clean up */ - for (i = 0; i < cmd->n_vars; ++i ) - { - struct hsh_table *group_hash = group_proc_get (cmd->vars[i])->group_hash; - - hsh_destroy (group_hash); - } - - hsh_destroy (ws->group_hash); } @@ -1005,15 +851,12 @@ show_homogeneity (const struct oneway_spec *cmd, const struct oneway_workspace * for (v = 0; v < cmd->n_vars; ++v) { double n; - - const struct per_var_ws *pvw = &ws->vws[v]; + double F = pvw->levene_w; const struct variable *var = cmd->vars[v]; - const struct group_proc *gp = group_proc_get (cmd->vars[v]); const char *s = var_to_string (var); double df1, df2; - double F = gp->levene; moments1_calculate (ws->dd_total[v]->mom, &n, NULL, NULL, NULL, NULL); diff --git a/src/language/stats/t-test.q b/src/language/stats/t-test.q index 8aee3b15..77fc4297 100644 --- a/src/language/stats/t-test.q +++ b/src/language/stats/t-test.q @@ -1439,8 +1439,13 @@ calculate (struct t_test_proc *proc, break; case T_IND_SAMPLES: group_calc (dict, proc, casereader_clone (input)); - levene (dict, input, proc->indep_var, proc->n_vars, proc->vars, - proc->exclude); + int i; + for (i = 0; i < proc->n_vars; ++i) + { + struct group_proc *grp_data = group_proc_get (proc->vars[i]); + + grp_data->levene = levene ( input, proc->indep_var, proc->vars[i], dict_get_weight (dict), proc->exclude); + } break; default: NOT_REACHED (); diff --git a/src/math/levene.c b/src/math/levene.c index 5ff474dc..b92441b3 100644 --- a/src/math/levene.c +++ b/src/math/levene.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2004, 2009, 2010 Free Software Foundation, Inc. + Copyright (C) 2010 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -15,307 +15,168 @@ along with this program. If not, see . */ #include + #include "levene.h" -#include -#include -#include -#include -#include "group-proc.h" -#include -#include -#include -#include -#include -#include "group.h" #include -#include - -#include "xalloc.h" - - -/* This module calculates the Levene statistic for variables. - - Just for reference, the Levene Statistic is a defines as follows: - - W = \frac{ (n-k)\sum_{i=1}^k n_i(Z_{iL} - Z_{LL})^2} - { (k-1)\sum_{i=1}^k \sum_{j=1}^{n_i} (Z_{ij} - Z_{iL})^2} - - where: - k is the number of groups - n is the total number of samples - n_i is the number of samples in the ith group - Z_{ij} is | Y_{ij} - Y_{iL} | where Y_{iL} is the mean of the ith group - Z_{iL} is the mean of Z_{ij} over the ith group - Z_{LL} is the grand mean of Z_{ij} - - Imagine calculating that with pencil and paper! - */ - - -struct levene_info -{ - - /* Per group statistics */ - struct t_test_proc **group_stats; - - /* The independent variable */ - const struct variable *v_indep; - - /* Number of dependent variables */ - size_t n_dep; - - /* The dependent variables */ - const struct variable **v_dep; - - /* Filter for missing values */ - enum mv_class exclude; +#include - /* An array of lz_stats for each variable */ - struct lz_stats *lz; +#include +#include +#include - /* The denominator for the expression for the Levene */ - double *lz_denominator; -}; +#include -/* Per variable statistics */ -struct lz_stats +struct lev { - /* Total of all lz */ - double grand_total; - - /* Mean of all lz */ - double grand_mean; + struct hmap_node node; + union value group; + int width; - /* The total number of cases */ - double total_n ; - - /* Number of groups */ - int n_groups; + double t_bar; + double z_mean; + double n; }; -/* First pass */ -static void levene_precalc (const struct levene_info *l); -static int levene_calc (const struct dictionary *dict, const struct ccase *, - const struct levene_info *l); -static void levene_postcalc (struct levene_info *); - - -/* Second pass */ -static void levene2_precalc (struct levene_info *l); -static int levene2_calc (const struct dictionary *, const struct ccase *, - struct levene_info *l); -static void levene2_postcalc (struct levene_info *); - -void -levene(const struct dictionary *dict, - struct casereader *reader, - const struct variable *v_indep, size_t n_dep, - const struct variable **v_dep, - enum mv_class exclude) -{ - struct casereader *pass1, *pass2; - struct ccase *c; - struct levene_info l; - - l.n_dep = n_dep; - l.v_indep = v_indep; - l.v_dep = v_dep; - l.exclude = exclude; - l.lz = xnmalloc (l.n_dep, sizeof *l.lz); - l.lz_denominator = xnmalloc (l.n_dep, sizeof *l.lz_denominator); - - casereader_split (reader, &pass1, &pass2); - - levene_precalc (&l); - for (; (c = casereader_read (pass1)) != NULL; case_unref (c)) - levene_calc (dict, c, &l); - casereader_destroy (pass1); - levene_postcalc (&l); - - levene2_precalc(&l); - for (; (c = casereader_read (pass2)) != NULL; case_unref (c)) - levene2_calc (dict, c, &l); - casereader_destroy (pass2); - levene2_postcalc (&l); - - free (l.lz_denominator); - free (l.lz); -} - -static void -levene_precalc (const struct levene_info *l) +static struct lev * +find_group (struct hmap *map, const union value *target, int width) { - size_t i; - - for(i = 0; i < l->n_dep ; ++i ) + struct lev *l = NULL; + unsigned int hash = value_hash (target, width, 0); + HMAP_FOR_EACH_WITH_HASH (l, struct lev, node, hash, map) { - const struct variable *var = l->v_dep[i]; - struct group_proc *gp = group_proc_get (var); - struct group_statistics *gs; - struct hsh_iterator hi; - - l->lz[i].grand_total = 0; - l->lz[i].total_n = 0; - l->lz[i].n_groups = gp->n_groups ; - - - for ( gs = hsh_first(gp->group_hash, &hi); - gs != NULL; - gs = hsh_next(gp->group_hash, &hi)) - { - gs->lz_total = 0; - } - + if (value_equal (&l->group, target, width)) + break; + l = NULL; } + return l; } -static int -levene_calc (const struct dictionary *dict, const struct ccase *c, - const struct levene_info *l) -{ - size_t i; - bool warn = false; - const union value *gv = case_data (c, l->v_indep); - struct group_statistics key; - double weight = dict_get_case_weight (dict, c, &warn); - - key.id = *gv; - - for (i = 0; i < l->n_dep; ++i) - { - const struct variable *var = l->v_dep[i]; - struct group_proc *gp = group_proc_get (var); - double levene_z; - const union value *v = case_data (c, var); - struct group_statistics *gs; - - gs = hsh_find(gp->group_hash,(void *) &key ); - if ( gs == NULL ) - continue ; - - if ( !var_is_value_missing (var, v, l->exclude)) - { - levene_z= fabs(v->f - gs->mean); - l->lz[i].grand_total += levene_z * weight; - l->lz[i].total_n += weight; - - gs->lz_total += levene_z * weight; - } - } - return 0; -} - - -static void -levene_postcalc (struct levene_info *l) +double +levene (struct casereader *rx, const struct variable *gvar, + const struct variable *var, const struct variable *wv, + enum mv_class exclude) { - size_t v; - - for (v = 0; v < l->n_dep; ++v) - { - /* This is Z_LL */ - l->lz[v].grand_mean = l->lz[v].grand_total / l->lz[v].total_n ; - } + double numerator = 0.0; + double denominator = 0.0; + int n_groups = 0; + double z_grand_mean = 0.0; + double grand_n = 0.0; + struct hmap map = HMAP_INITIALIZER (map); -} - - - -static void -levene2_precalc (struct levene_info *l) -{ - size_t v; - + struct ccase *c; + struct casereader *r = casereader_clone (rx); - /* This stuff could go in the first post calc . . . */ - for (v = 0; - v < l->n_dep; - ++v) + for (; (c = casereader_read (r)) != NULL; case_unref (c)) { - struct hsh_iterator hi; - struct group_statistics *g; - - const struct variable *var = l->v_dep[v] ; - struct hsh_table *hash = group_proc_get (var)->group_hash; - + struct lev *l = NULL; + const union value *target = case_data (c, gvar); + int width = var_get_width (gvar); + unsigned int hash = value_hash (target, width, 0); + const double x = case_data (c, var)->f; + const double weight = wv ? case_data (c, wv)->f : 1.0; + + if (var_is_value_missing (var, case_data (c, var), exclude)) + continue; - for (g = hsh_first(hash,&hi); g != NULL; g = hsh_next(hash, &hi)) + l = find_group (&map, target, width); + + if (l == NULL) { - g->lz_mean = g->lz_total / g->n ; + l = xzalloc (sizeof *l); + value_clone (&l->group, target, width); + hmap_insert (&map, &l->node, hash); } - l->lz_denominator[v] = 0; - } -} -static int -levene2_calc (const struct dictionary *dict, const struct ccase *c, - struct levene_info *l) -{ - size_t i; - bool warn = false; - - double weight = dict_get_case_weight (dict, c, &warn); - - const union value *gv = case_data (c, l->v_indep); - struct group_statistics key; + l->n += weight; + l->t_bar += x * weight; + grand_n += weight; + } + casereader_destroy (r); + + { + struct lev *l; + HMAP_FOR_EACH (l, struct lev, node, &map) + { + l->t_bar /= l->n; + } + } - key.id = *gv; + n_groups = hmap_count (&map); - for (i = 0; i < l->n_dep; ++i) + r = casereader_clone (rx); + for (; (c = casereader_read (r)) != NULL; case_unref (c)) { - double levene_z; - const struct variable *var = l->v_dep[i] ; - const union value *v = case_data (c, var); - struct group_statistics *gs; - - gs = hsh_find(group_proc_get (var)->group_hash,(void *) &key ); + const union value *target = case_data (c, gvar); + int width = var_get_width (gvar); + const double x = case_data (c, var)->f; + const double weight = wv ? case_data (c, wv)->f : 1.0; - if ( gs == NULL ) + if (var_is_value_missing (var, case_data (c, var), exclude)) continue; - if ( !var_is_value_missing (var, v, l->exclude)) - { - levene_z = fabs(v->f - gs->mean); - l->lz_denominator[i] += weight * pow2 (levene_z - gs->lz_mean); - } - } - - return 0; -} + struct lev *l = find_group (&map, target, width); + assert (l); + + l->z_mean += fabs (x - l->t_bar) * weight; + z_grand_mean += fabs (x - l->t_bar) * weight; + } + casereader_destroy (r); + + { + struct lev *l; + HMAP_FOR_EACH (l, struct lev, node, &map) + { + l->z_mean /= l->n; + } + } -static void -levene2_postcalc (struct levene_info *l) -{ - size_t v; + z_grand_mean /= grand_n; - for (v = 0; v < l->n_dep; ++v) + r = casereader_clone (rx); + for (; (c = casereader_read (r)) != NULL; case_unref (c)) { - double lz_numerator = 0; - struct hsh_iterator hi; - struct group_statistics *g; + double z; + const union value *target = case_data (c, gvar); + int width = var_get_width (gvar); + const double x = case_data (c, var)->f; + const double weight = wv ? case_data (c, wv)->f : 1.0; - const struct variable *var = l->v_dep[v] ; - struct group_proc *gp = group_proc_get (var); - struct hsh_table *hash = gp->group_hash; - - for (g = hsh_first(hash, &hi); g != NULL; g = hsh_next(hash, &hi)) - { - lz_numerator += g->n * pow2(g->lz_mean - l->lz[v].grand_mean ); - } - lz_numerator *= ( gp->ugs.n - gp->n_groups ); - - l->lz_denominator[v] *= (gp->n_groups - 1); + if (var_is_value_missing (var, case_data (c, var), exclude)) + continue; - gp->levene = lz_numerator / l->lz_denominator[v] ; + struct lev *l = find_group (&map, target, width); + assert (l); + z = fabs (x - l->t_bar); + denominator += pow2 (z - l->z_mean) * weight; } -} + casereader_destroy (r); + + denominator *= n_groups - 1; + + { + double grand_n = 0.0; + struct hmap_node *next; + struct lev *l; + HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &map) + { + numerator += l->n * pow2 (l->z_mean - z_grand_mean); + grand_n += l->n; + + /* We don't need these anymore */ + free (l); + } + numerator *= grand_n - n_groups ; + hmap_destroy (&map); + } + return numerator/ denominator; +} diff --git a/src/math/levene.h b/src/math/levene.h index 67a4a115..c5e8bc3b 100644 --- a/src/math/levene.h +++ b/src/math/levene.h @@ -17,29 +17,15 @@ #if !levene_h #define levene_h 1 -#include -#include -#include +struct casereader; +struct variable; -/* Calculate the Levene statistic -The independent variable : v_indep; +enum mv_class; -Number of dependent variables : n_dep; +double +levene (struct casereader *rx, const struct variable *gvar, + const struct variable *var, const struct variable *wv, enum mv_class exclude ); -The dependent variables : v_dep; -*/ - - -struct dictionary ; -struct casefilter ; - -void levene(const struct dictionary *dict, struct casereader *, - const struct variable *v_indep, size_t n_dep, - const struct variable **v_dep, - enum mv_class exclude); - - - -#endif /* levene_h */ +#endif -- 2.30.2