Re-implemented the levene calculation

author John Darrington <john@darrington.wattle.id.au>

Sun, 17 Oct 2010 12:03:41 +0000 (14:03 +0200)

committer John Darrington <john@darrington.wattle.id.au>

Sun, 17 Oct 2010 14:37:09 +0000 (16:37 +0200)
author John Darrington <john@darrington.wattle.id.au>
Sun, 17 Oct 2010 12:03:41 +0000 (14:03 +0200)
committer John Darrington <john@darrington.wattle.id.au>
Sun, 17 Oct 2010 14:37:09 +0000 (16:37 +0200)
diff --git a/src/language/stats/oneway.c b/src/language/stats/oneway.c

index ea587cecb24066190087697ef594db3c198622b0..79b40cbd854a1be1d201ab50b8b150c5b740b3f7 100644 (file)
--- a/src/language/stats/oneway.c
+++ b/src/language/stats/oneway.c
@@ -19,9 +19,14 @@
  #include <data/case.h>
  #include <data/casegrouper.h>
  #include <data/casereader.h>
+#include <data/dictionary.h>
+#include <data/procedure.h>
+#include <data/value.h>
+
  
  #include <math/covariance.h>
  #include <math/categoricals.h>
+#include <math/levene.h>
  #include <math/moments.h>
  #include <gsl/gsl_matrix.h>
  #include <linreg/sweep.h>
@@ -33,15 +38,9 @@
  #include <language/lexer/value-parser.h>
  #include <language/command.h>
  
-#include <data/procedure.h>
-#include <data/value.h>
-#include <data/dictionary.h>
  
  #include <language/dictionary/split-file.h>
-#include <libpspp/hash.h>
  #include <libpspp/taint.h>
-#include <math/group-proc.h>
-#include <math/levene.h>
  #include <libpspp/misc.h>
  
  #include <output/tab.h>
@@ -55,6 +54,7 @@
  #include "gettext.h"
  #define _(msgid) gettext (msgid)
  
+
  enum missing_type
    {
      MISS_LISTWISE,
@@ -115,6 +115,7 @@ struct descriptive_data
  /* Workspace variable for each dependent variable */
  struct per_var_ws
  {
+  struct categoricals *cat;
    struct covariance *cov;
  
    double sst;
@@ -124,6 +125,7 @@ struct per_var_ws
    int n_groups;
  
    double mse;
+  double levene_w;
  };
  
  struct oneway_workspace
@@ -132,10 +134,6 @@ struct oneway_workspace
       missing values are disregarded */
    int actual_number_of_groups;
  
-  /* A  hash table containing all the distinct values of the independent
-     variable */
-  struct hsh_table *group_hash;
-
    struct per_var_ws *vws;
  
    /* An array of descriptive data.  One for each dependent variable */
@@ -295,32 +293,6 @@ cmd_oneway (struct lexer *lexer, struct dataset *ds)
  
  \f
  
-static int
-compare_double_3way (const void *a_, const void *b_, const void *aux UNUSED)
-{
-  const double *a = a_;
-  const double *b = b_;
-  return *a < *b ? -1 : *a > *b;
-}
-
-static unsigned
-do_hash_double (const void *value_, const void *aux UNUSED)
-{
-  const double *value = value_;
-  return hash_double (*value, 0);
-}
-
-static void
-free_double (void *value_, const void *aux UNUSED)
-{
-  double *value = value_;
-  free (value);
-}
-
-
-
-static void postcalc (const struct oneway_spec *cmd);
-static void  precalc (const struct oneway_spec *cmd);
  
  static struct descriptive_data *
  dd_create (const struct variable *var)
@@ -335,6 +307,12 @@ dd_create (const struct variable *var)
    return dd;
  }
  
+static void
+dd_destroy (struct descriptive_data *dd)
+{
+  moments1_destroy (dd->mom);
+  free (dd);
+}
  
  static void *
  makeit (void *aux1, void *aux2 UNUSED)
@@ -406,7 +384,7 @@ run_oneway (const struct oneway_spec *cmd,
    struct oneway_workspace ws;
  
    ws.actual_number_of_groups = 0;
-  ws.vws = xmalloc (cmd->n_vars * sizeof (*ws.vws));
+  ws.vws = xzalloc (cmd->n_vars * sizeof (*ws.vws));
    ws.dd_total = xmalloc (sizeof (struct descriptive_data) * cmd->n_vars);
  
    for (v = 0 ; v < cmd->n_vars; ++v)
@@ -414,14 +392,14 @@ run_oneway (const struct oneway_spec *cmd,
  
    for (v = 0; v < cmd->n_vars; ++v)
      {
-      struct categoricals *cats = categoricals_create (&cmd->indep_var, 1,
+      ws.vws[v].cat = categoricals_create (&cmd->indep_var, 1,
                                                        cmd->wv, cmd->exclude, 
                                                        makeit,
                                                        updateit,
                                                        cmd->vars[v], ws.dd_total[v]);
  
        ws.vws[v].cov = covariance_2pass_create (1, &cmd->vars[v],
-                                              cats, 
+                                              ws.vws[v].cat, 
                                                cmd->wv, cmd->exclude);
      }
  
@@ -429,21 +407,13 @@ run_oneway (const struct oneway_spec *cmd,
    if (c == NULL)
      {
        casereader_destroy (input);
-      return;
+      goto finish;
      }
    output_split_file_values (ds, c);
    case_unref (c);
  
    taint = taint_clone (casereader_get_taint (input));
  
-  ws.group_hash = hsh_create (4,
-                                 compare_double_3way,
-                                 do_hash_double,
-                                 free_double,
-                                 cmd->indep_var);
-
-  precalc (cmd);
-
    input = casereader_create_filter_missing (input, &cmd->indep_var, 1,
                                              cmd->exclude, NULL, NULL);
    if (cmd->missing_type == MISS_LISTWISE)
@@ -451,30 +421,26 @@ run_oneway (const struct oneway_spec *cmd,
                                                cmd->exclude, NULL, NULL);
    input = casereader_create_filter_weight (input, dict, NULL, NULL);
  
+
+  if (cmd->stats & STATS_HOMOGENEITY)
+    for (v = 0; v < cmd->n_vars; ++v)
+      {
+       struct per_var_ws *pvw = &ws.vws[v];
+
+       pvw->levene_w = levene (input, cmd->indep_var, cmd->vars[v], cmd->wv, cmd->exclude);
+      }
+
    reader = casereader_clone (input);
  
    for (; (c = casereader_read (reader)) != NULL; case_unref (c))
      {
-      size_t i;
-
-      const double weight = dict_get_case_weight (dict, c, NULL);
-
-      const union value *indep_val = case_data (c, cmd->indep_var);
-      void **p = hsh_probe (ws.group_hash, &indep_val->f);
-      if (*p == NULL)
-        {
-          double *value = *p = xmalloc (sizeof *value);
-          *value = indep_val->f;
-        }
+      int i;
  
        for (i = 0; i < cmd->n_vars; ++i)
         {
           struct per_var_ws *pvw = &ws.vws[i];
           const struct variable *v = cmd->vars[i];
           const union value *val = case_data (c, v);
-          struct group_proc *gp = group_proc_get (cmd->vars[i]);
-         struct hsh_table *group_hash = gp->group_hash;
-         struct group_statistics *gs;
  
           if ( MISS_ANALYSIS == cmd->missing_type)
             {
@@ -483,51 +449,7 @@ run_oneway (const struct oneway_spec *cmd,
             }
  
           covariance_accumulate_pass1 (pvw->cov, c);
-
-         gs = hsh_find (group_hash, indep_val );
-
-         if ( ! gs )
-           {
-             gs = xmalloc (sizeof *gs);
-             gs->id = *indep_val;
-             gs->sum = 0;
-             gs->n = 0;
-             gs->ssq = 0;
-             gs->sum_diff = 0;
-             gs->minimum = DBL_MAX;
-             gs->maximum = -DBL_MAX;
-
-             hsh_insert ( group_hash, gs );
-           }
-
-         if (!var_is_value_missing (v, val, cmd->exclude))
-           {
-             struct group_statistics *totals = &gp->ugs;
-
-             totals->n += weight;
-             totals->sum += weight * val->f;
-             totals->ssq += weight * pow2 (val->f);
-
-             if ( val->f < totals->minimum )
-               totals->minimum = val->f;
-
-             if ( val->f > totals->maximum )
-               totals->maximum = val->f;
-
-             gs->n += weight;
-             gs->sum += weight * val->f;
-             gs->ssq += weight * pow2 (val->f);
-
-             if ( val->f < gs->minimum )
-               gs->minimum = val->f;
-
-             if ( val->f > gs->maximum )
-               gs->maximum = val->f;
-           }
-
-         gp->n_groups = hsh_count (group_hash );
         }
-
      }
    casereader_destroy (reader);
    reader = casereader_clone (input);
@@ -575,9 +497,6 @@ run_oneway (const struct oneway_spec *cmd,
        pvw->mse = (pvw->sst - pvw->ssa) / (n - pvw->n_groups);
      }
  
-  postcalc (cmd);
-
-  
    for (v = 0; v < cmd->n_vars; ++v)
      {
        struct categoricals *cats = covariance_get_categoricals (ws.vws[v].cov);
@@ -588,85 +507,22 @@ run_oneway (const struct oneway_spec *cmd,
         ws.actual_number_of_groups = categoricals_total (cats);
      }
  
-  if ( cmd->stats & STATS_HOMOGENEITY )
-    levene (dict, casereader_clone (input), cmd->indep_var,
-           cmd->n_vars, cmd->vars, cmd->exclude);
-
    casereader_destroy (input);
  
    if (!taint_has_tainted_successor (taint))
      output_oneway (cmd, &ws);
  
    taint_destroy (taint);
-}
  
-/* Pre calculations */
-static void
-precalc (const struct oneway_spec *cmd)
-{
-  size_t i = 0;
-
-  for (i = 0; i < cmd->n_vars; ++i)
+ finish:
+  for (v = 0; v < cmd->n_vars; ++v)
      {
-      struct group_proc *gp = group_proc_get (cmd->vars[i]);
-      struct group_statistics *totals = &gp->ugs;
-
-      /* Create a hash for each of the dependent variables.
-        The hash contains a group_statistics structure,
-        and is keyed by value of the independent variable */
-
-      gp->group_hash = hsh_create (4, compare_group, hash_group,
-                                  (hsh_free_func *) free_group,
-                                  cmd->indep_var);
-
-      totals->sum = 0;
-      totals->n = 0;
-      totals->ssq = 0;
-      totals->sum_diff = 0;
-      totals->maximum = -DBL_MAX;
-      totals->minimum = DBL_MAX;
+      covariance_destroy (ws.vws[v].cov);
+      dd_destroy (ws.dd_total[v]);
      }
-}
+  free (ws.vws);
+  free (ws.dd_total);
  
-/* Post calculations for the ONEWAY command */
-static void
-postcalc (const struct oneway_spec *cmd)
-{
-  size_t i = 0;
-
-  for (i = 0; i < cmd->n_vars; ++i)
-    {
-      struct group_proc *gp = group_proc_get (cmd->vars[i]);
-      struct hsh_table *group_hash = gp->group_hash;
-      struct group_statistics *totals = &gp->ugs;
-
-      struct hsh_iterator g;
-      struct group_statistics *gs;
-
-      for (gs =  hsh_first (group_hash, &g);
-          gs != 0;
-          gs = hsh_next (group_hash, &g))
-       {
-         gs->mean = gs->sum / gs->n;
-         gs->s_std_dev = sqrt (gs->ssq / gs->n - pow2 (gs->mean));
-
-         gs->std_dev = sqrt (
-                             gs->n / (gs->n - 1) *
-                             ( gs->ssq / gs->n - pow2 (gs->mean))
-                             );
-
-         gs->se_mean = gs->std_dev / sqrt (gs->n);
-         gs->mean_diff = gs->sum_diff / gs->n;
-       }
-
-      totals->mean = totals->sum / totals->n;
-      totals->std_dev = sqrt (
-                             totals->n / (totals->n - 1) *
-                             (totals->ssq / totals->n - pow2 (totals->mean))
-                             );
-
-      totals->se_mean = totals->std_dev / sqrt (totals->n);
-    }
  }
  
  static void show_contrast_coeffs (const struct oneway_spec *cmd, const struct oneway_workspace *ws);
@@ -714,16 +570,6 @@ output_oneway (const struct oneway_spec *cmd, struct oneway_workspace *ws)
        show_contrast_coeffs (cmd, ws);
        show_contrast_tests (cmd, ws);
      }
-
-  /* Clean up */
-  for (i = 0; i < cmd->n_vars; ++i )
-    {
-      struct hsh_table *group_hash = group_proc_get (cmd->vars[i])->group_hash;
-
-      hsh_destroy (group_hash);
-    }
-
-  hsh_destroy (ws->group_hash);
  }
  
  
@@ -1005,15 +851,12 @@ show_homogeneity (const struct oneway_spec *cmd, const struct oneway_workspace *
    for (v = 0; v < cmd->n_vars; ++v)
      {
        double n;
-
-
        const struct per_var_ws *pvw = &ws->vws[v];
+      double F = pvw->levene_w;
  
        const struct variable *var = cmd->vars[v];
-      const struct group_proc *gp = group_proc_get (cmd->vars[v]);
        const char *s = var_to_string (var);
        double df1, df2;
-      double F = gp->levene;
  
        moments1_calculate (ws->dd_total[v]->mom, &n, NULL, NULL, NULL, NULL);
  
diff --git a/src/language/stats/t-test.q b/src/language/stats/t-test.q

index 8aee3b157992827fa9021c876b39822809ab43b2..77fc429796428a9cab677563cac142995c97f7da 100644 (file)
--- a/src/language/stats/t-test.q
+++ b/src/language/stats/t-test.q
@@ -1439,8 +1439,13 @@ calculate (struct t_test_proc *proc,
        break;
      case T_IND_SAMPLES:
        group_calc (dict, proc, casereader_clone (input));
-      levene (dict, input, proc->indep_var, proc->n_vars, proc->vars,
-              proc->exclude);
+      int i;
+      for (i = 0; i < proc->n_vars; ++i)
+       {
+         struct group_proc *grp_data = group_proc_get (proc->vars[i]);
+
+         grp_data->levene = levene ( input, proc->indep_var, proc->vars[i], dict_get_weight (dict), proc->exclude);
+       }
        break;
      default:
        NOT_REACHED ();
diff --git a/src/math/levene.c b/src/math/levene.c

index 5ff474dc4ed5c0c39088a2d1a0affabf676835ce..b92441b37babbf8505a931176aa00e1fd1ab7f89 100644 (file)
--- a/src/math/levene.c
+++ b/src/math/levene.c
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 2004, 2009, 2010 Free Software Foundation, Inc.
+   Copyright (C) 2010 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -15,307 +15,168 @@
     along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  
  #include <config.h>
+
  #include "levene.h"
-#include <libpspp/message.h>
-#include <data/case.h>
-#include <data/casereader.h>
-#include <data/dictionary.h>
-#include "group-proc.h"
-#include <libpspp/hash.h>
-#include <libpspp/str.h>
-#include <data/variable.h>
-#include <data/procedure.h>
-#include <libpspp/misc.h>
-#include "group.h"
  
  #include <math.h>
-#include <stdlib.h>
-
-#include "xalloc.h"
-
-
-/* This module calculates the Levene statistic for variables.
-
-   Just for reference, the Levene Statistic is a defines as follows:
-
-   W = \frac{ (n-k)\sum_{i=1}^k n_i(Z_{iL} - Z_{LL})^2}
-            { (k-1)\sum_{i=1}^k \sum_{j=1}^{n_i} (Z_{ij} - Z_{iL})^2}
-
-   where:
-        k is the number of groups
-       n is the total number of samples
-        n_i is the number of samples in the ith group
-        Z_{ij} is | Y_{ij} - Y_{iL} | where Y_{iL} is the mean of the ith group
-       Z_{iL} is the  mean of Z_{ij} over the ith group
-       Z_{LL} is the grand mean of Z_{ij}
-
-   Imagine calculating that with pencil and paper!
  
- */
-
-
-struct levene_info
-{
-
-  /* Per group statistics */
-  struct t_test_proc **group_stats;
-
-  /* The independent variable */
-  const struct variable *v_indep;
-
-  /* Number of dependent variables */
-  size_t n_dep;
-
-  /* The dependent variables */
-  const struct variable  **v_dep;
-
-  /* Filter for missing values */
-  enum mv_class exclude;
+#include <libpspp/misc.h>
  
-  /* An array of lz_stats for each variable */
-  struct lz_stats *lz;
+#include <data/case.h>
+#include <data/casereader.h>
+#include <data/variable.h>
  
-  /* The denominator for the expression for the Levene */
-  double *lz_denominator;
  
-};
+#include <libpspp/hmap.h>
  
-/* Per variable statistics */
-struct lz_stats
+struct lev
  {
-  /* Total of all lz */
-  double grand_total;
-
-  /* Mean of all lz */
-  double grand_mean;
+  struct hmap_node node;
+  union value group;
+  int width;
  
-  /* The total number of cases */
-  double total_n ;
-
-  /* Number of groups */
-  int n_groups;
+  double t_bar;
+  double z_mean;
+  double n;
  };
  
-/* First pass */
-static void  levene_precalc (const struct levene_info *l);
-static int levene_calc (const struct dictionary *dict, const struct ccase *,
-                       const struct levene_info *l);
-static void levene_postcalc (struct levene_info *);
-
-
-/* Second pass */
-static void levene2_precalc (struct levene_info *l);
-static int levene2_calc (const struct dictionary *, const struct ccase *,
-                        struct levene_info *l);
-static void levene2_postcalc (struct levene_info *);
  
-
-void
-levene(const struct dictionary *dict,
-       struct casereader *reader,
-       const struct variable *v_indep, size_t n_dep,
-       const struct variable **v_dep,
-       enum mv_class exclude)
-{
-  struct casereader *pass1, *pass2;
-  struct ccase *c;
-  struct levene_info l;
-
-  l.n_dep      = n_dep;
-  l.v_indep    = v_indep;
-  l.v_dep      = v_dep;
-  l.exclude    = exclude;
-  l.lz         = xnmalloc (l.n_dep, sizeof *l.lz);
-  l.lz_denominator = xnmalloc (l.n_dep, sizeof *l.lz_denominator);
-
-  casereader_split (reader, &pass1, &pass2);
-
-  levene_precalc (&l);
-  for (; (c = casereader_read (pass1)) != NULL; case_unref (c))
-    levene_calc (dict, c, &l);
-  casereader_destroy (pass1);
-  levene_postcalc (&l);
-
-  levene2_precalc(&l);
-  for (; (c = casereader_read (pass2)) != NULL; case_unref (c))
-    levene2_calc (dict, c, &l);
-  casereader_destroy (pass2);
-  levene2_postcalc (&l);
-
-  free (l.lz_denominator);
-  free (l.lz);
-}
-
-static void
-levene_precalc (const struct levene_info *l)
+static struct lev *
+find_group (struct hmap *map, const union value *target, int width)
  {
-  size_t i;
-
-  for(i = 0; i < l->n_dep ; ++i )
+  struct lev *l = NULL;
+  unsigned int hash = value_hash (target, width, 0);
+  HMAP_FOR_EACH_WITH_HASH (l, struct lev, node, hash, map)
      {
-      const struct variable *var = l->v_dep[i];
-      struct group_proc *gp = group_proc_get (var);
-      struct group_statistics *gs;
-      struct hsh_iterator hi;
-
-      l->lz[i].grand_total = 0;
-      l->lz[i].total_n = 0;
-      l->lz[i].n_groups = gp->n_groups ;
-
-
-      for ( gs = hsh_first(gp->group_hash, &hi);
-           gs != NULL;
-           gs = hsh_next(gp->group_hash, &hi))
-       {
-         gs->lz_total = 0;
-       }
-
+      if (value_equal (&l->group, target, width))
+       break;
+      l = NULL;
      }
  
+  return l;
  }
  
-static int
-levene_calc (const struct dictionary *dict, const struct ccase *c,
-            const struct levene_info *l)
-{
-  size_t i;
-  bool warn = false;
-  const union value *gv = case_data (c, l->v_indep);
-  struct group_statistics key;
-  double weight = dict_get_case_weight (dict, c, &warn);
-
-  key.id = *gv;
-
-  for (i = 0; i < l->n_dep; ++i)
-    {
-      const struct variable *var = l->v_dep[i];
-      struct group_proc *gp = group_proc_get (var);
-      double levene_z;
-      const union value *v = case_data (c, var);
-      struct group_statistics *gs;
-
-      gs = hsh_find(gp->group_hash,(void *) &key );
  
-      if ( gs == NULL )
-       continue ;
-
-      if ( !var_is_value_missing (var, v, l->exclude))
-       {
-         levene_z= fabs(v->f - gs->mean);
-         l->lz[i].grand_total += levene_z * weight;
-         l->lz[i].total_n += weight;
-
-         gs->lz_total += levene_z * weight;
-       }
-    }
-  return 0;
-}
-
-
-static void
-levene_postcalc (struct levene_info *l)
+double
+levene (struct casereader *rx, const struct variable *gvar,
+       const struct variable *var, const struct variable *wv,
+       enum mv_class exclude)
  {
-  size_t v;
-
-  for (v = 0; v < l->n_dep; ++v)
-    {
-      /* This is Z_LL */
-      l->lz[v].grand_mean = l->lz[v].grand_total / l->lz[v].total_n ;
-    }
+  double numerator = 0.0;
+  double denominator = 0.0;
+  int n_groups = 0;
+  double z_grand_mean = 0.0;
+  double grand_n = 0.0;
  
+  struct hmap map = HMAP_INITIALIZER (map);
  
-}
-
-
-
-static void
-levene2_precalc (struct levene_info *l)
-{
-  size_t v;
-
+  struct ccase *c;
+  struct casereader *r = casereader_clone (rx);
  
-  /* This stuff could go in the first post calc . . . */
-  for (v = 0;
-       v < l->n_dep;
-       ++v)
+  for (; (c = casereader_read (r)) != NULL; case_unref (c))
      {
-      struct hsh_iterator hi;
-      struct group_statistics *g;
-
-      const struct variable *var = l->v_dep[v] ;
-      struct hsh_table *hash = group_proc_get (var)->group_hash;
-
+      struct lev *l = NULL;
+      const union value *target = case_data (c, gvar);
+      int width = var_get_width (gvar);
+      unsigned int hash = value_hash (target, width, 0);
+      const double x = case_data (c, var)->f;
+      const double weight = wv ? case_data (c, wv)->f : 1.0;
+
+      if (var_is_value_missing (var, case_data (c, var), exclude))
+       continue;
  
-      for (g = hsh_first(hash,&hi); g != NULL; g = hsh_next(hash, &hi))
+      l = find_group (&map, target, width);
+      
+      if (l == NULL)
         {
-         g->lz_mean = g->lz_total / g->n ;
+         l = xzalloc (sizeof *l);
+         value_clone (&l->group, target, width);
+         hmap_insert (&map, &l->node, hash);
         }
-      l->lz_denominator[v] = 0;
-  }
-}
  
-static int
-levene2_calc (const struct dictionary *dict, const struct ccase *c,
-             struct levene_info *l)
-{
-  size_t i;
-  bool warn = false;
-
-  double weight = dict_get_case_weight (dict, c, &warn);
-
-  const union value *gv = case_data (c, l->v_indep);
-  struct group_statistics key;
+      l->n += weight;
+      l->t_bar += x * weight;
+      grand_n += weight;
+    }
+  casereader_destroy (r);
+
+  {
+    struct lev *l;
+    HMAP_FOR_EACH (l, struct lev, node, &map)
+      {
+       l->t_bar /= l->n;
+      }
+  }
  
-  key.id = *gv;
+  n_groups = hmap_count (&map);
  
-  for (i = 0; i < l->n_dep; ++i)
+  r = casereader_clone (rx);
+  for (; (c = casereader_read (r)) != NULL; case_unref (c))
      {
-      double levene_z;
-      const struct variable *var = l->v_dep[i] ;
-      const union value *v = case_data (c, var);
-      struct group_statistics *gs;
-
-      gs = hsh_find(group_proc_get (var)->group_hash,(void *) &key );
+      const union value *target = case_data (c, gvar);
+      int width = var_get_width (gvar);
+      const double x = case_data (c, var)->f;
+      const double weight = wv ? case_data (c, wv)->f : 1.0;
  
-      if ( gs == NULL )
+      if (var_is_value_missing (var, case_data (c, var), exclude))
         continue;
  
-      if ( !var_is_value_missing (var, v, l->exclude))
-       {
-         levene_z = fabs(v->f - gs->mean);
-         l->lz_denominator[i] += weight * pow2 (levene_z - gs->lz_mean);
-       }
-    }
-
-  return 0;
-}
+      struct lev *l = find_group (&map, target, width);
  
+      assert (l);
+      
+      l->z_mean += fabs (x - l->t_bar) * weight;
+      z_grand_mean += fabs (x - l->t_bar) * weight;
+    }
+  casereader_destroy (r);
+
+  {
+    struct lev *l;
+    HMAP_FOR_EACH (l, struct lev, node, &map)
+      {
+       l->z_mean /= l->n;
+      }
+  }
  
-static void
-levene2_postcalc (struct levene_info *l)
-{
-  size_t v;
+  z_grand_mean /= grand_n;
  
-  for (v = 0; v < l->n_dep; ++v)
+  r = casereader_clone (rx);
+  for (; (c = casereader_read (r)) != NULL; case_unref (c))
      {
-      double lz_numerator = 0;
-      struct hsh_iterator hi;
-      struct group_statistics *g;
+      double z;
+      const union value *target = case_data (c, gvar);
+      int width = var_get_width (gvar);
+      const double x = case_data (c, var)->f;
+      const double weight = wv ? case_data (c, wv)->f : 1.0;
  
-      const struct variable *var = l->v_dep[v] ;
-      struct group_proc *gp = group_proc_get (var);
-      struct hsh_table *hash = gp->group_hash;
-
-      for (g = hsh_first(hash, &hi); g != NULL; g = hsh_next(hash, &hi))
-       {
-         lz_numerator += g->n * pow2(g->lz_mean - l->lz[v].grand_mean );
-       }
-      lz_numerator *= ( gp->ugs.n - gp->n_groups );
-
-      l->lz_denominator[v] *= (gp->n_groups - 1);
+      if (var_is_value_missing (var, case_data (c, var), exclude))
+       continue;
  
-      gp->levene = lz_numerator / l->lz_denominator[v] ;
+      struct lev *l = find_group (&map, target, width);
+      assert (l);
+      z = fabs (x - l->t_bar);
  
+      denominator += pow2 (z - l->z_mean) * weight;
      }
-}
+  casereader_destroy (r);
+
+  denominator *= n_groups - 1;
+
+  {
+    double grand_n = 0.0;
+    struct hmap_node *next;
+    struct lev *l;
+    HMAP_FOR_EACH_SAFE (l, next, struct lev, node, &map)
+      {
+       numerator += l->n * pow2 (l->z_mean - z_grand_mean);
+       grand_n += l->n;
+
+       /* We don't need these anymore */
+       free (l);
+      }
+    numerator *= grand_n - n_groups ;
+    hmap_destroy (&map);
+  }
  
+  return numerator/ denominator;
+}
diff --git a/src/math/levene.h b/src/math/levene.h

index 67a4a1155b7813d6d84a37fe4987aada81e40f0e..c5e8bc3b4e9a15a49bc97d6837bb680cc06ceb4b 100644 (file)
--- a/src/math/levene.h
+++ b/src/math/levene.h
@@ -17,29 +17,15 @@
  #if !levene_h
  #define levene_h 1
  
-#include <data/casereader.h>
-#include <data/missing-values.h>
-#include <data/variable.h>
+struct casereader;
+struct variable;
  
-/* Calculate the Levene statistic
  
-The independent variable :   v_indep;
+enum mv_class;
  
-Number of dependent variables :   n_dep;
+double
+levene (struct casereader *rx, const struct variable *gvar,
+       const struct variable *var, const struct variable *wv,  enum mv_class exclude );
  
-The dependent variables :   v_dep;
  
-*/
-
-
-struct dictionary ;
-struct casefilter ;
-
-void  levene(const struct dictionary *dict, struct casereader *,
-            const struct variable *v_indep, size_t n_dep,
-            const struct variable **v_dep,
-            enum mv_class exclude);
-
-
-
-#endif /* levene_h */
+#endif
author	John Darrington <john@darrington.wattle.id.au>
	Sun, 17 Oct 2010 12:03:41 +0000 (14:03 +0200)
committer	John Darrington <john@darrington.wattle.id.au>
	Sun, 17 Oct 2010 14:37:09 +0000 (16:37 +0200)
src/language/stats/oneway.c		patch \| blob \| history
src/language/stats/t-test.q		patch \| blob \| history
src/math/levene.c		patch \| blob \| history
src/math/levene.h		patch \| blob \| history