Added basic framework for the MEANS command.

author John Darrington <john@darrington.wattle.id.au>

Sat, 10 Dec 2011 13:25:46 +0000 (14:25 +0100)

committer John Darrington <john@darrington.wattle.id.au>

Sat, 10 Dec 2011 13:25:46 +0000 (14:25 +0100)
author John Darrington <john@darrington.wattle.id.au>
Sat, 10 Dec 2011 13:25:46 +0000 (14:25 +0100)
committer John Darrington <john@darrington.wattle.id.au>
Sat, 10 Dec 2011 13:25:46 +0000 (14:25 +0100)
diff --git a/src/language/stats/automake.mk b/src/language/stats/automake.mk

index b44ebb2d38eb9dcaa53286a80a8e116413729409..843653b3dc4cf9ef9ccf555ce229fe8c78f73d35 100644 (file)
--- a/src/language/stats/automake.mk
+++ b/src/language/stats/automake.mk
@@ -34,6 +34,7 @@ language_stats_sources = \
         src/language/stats/ks-one-sample.h \
         src/language/stats/mann-whitney.c \
         src/language/stats/mann-whitney.h \
+       src/language/stats/means.c \
         src/language/stats/mcnemar.c \
         src/language/stats/mcnemar.h \
         src/language/stats/median.c \
diff --git a/src/language/stats/means.c b/src/language/stats/means.c

new file mode 100644 (file)

index 0000000..82ba603
--- /dev/null
+++ b/src/language/stats/means.c
@@ -0,0 +1,441 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2011 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+
+#include "data/case.h"
+#include "data/casegrouper.h"
+#include "data/casereader.h"
+#include "data/dataset.h"
+#include "data/dictionary.h"
+#include "data/variable.h"
+#include "language/command.h"
+#include "language/lexer/lexer.h"
+#include "language/lexer/variable-parser.h"
+
+#include "math/categoricals.h"
+#include "math/interaction.h"
+
+#include "output/tab.h"
+
+#include "gettext.h"
+#define _(msgid) gettext (msgid)
+#define N_(msgid) (msgid)
+
+struct cell_spec
+{
+  /* Printable title for output */
+  const char *title;
+
+  /* Keyword for syntax */
+  const char *keyword;
+};
+
+/* Table of cell_specs */
+static const struct cell_spec cell_spec[] =
+{
+  {N_("Means"),          "MEANS"},
+  {N_("N"),              "COUNT"},
+  {N_("Std. Deviation"), "STDDEV"},
+  {N_("Median"),         "MEDIAN"},
+  {N_("Group Median"),   "GMEDIAN"},
+  {N_("S.E. Mean"),      "SEMEAN"},
+  {N_("Sum"),            "SUM"},
+  {N_("Min"),            "MIN"},
+  {N_("Max"),            "MAX"},
+  {N_("Range"),          "RANGE"},
+  {N_("Variance"),       "VARIANCE"},
+  {N_("Kurtosis"),       "KURTOSIS"},
+  {N_("S.E. Kurt"),      "SEKURT"},
+  {N_("Skewness"),       "SKEW"},
+  {N_("S.E. Skew"),      "SESKEW"},
+  {N_("First"),          "FIRST"},
+  {N_("Last"),           "LAST"},
+  {N_("Percent N"),      "NPCT"},
+  {N_("Percent Sum"),    "SPCT"},
+  {N_("Harmonic Mean"),  "HARMONIC"},
+  {N_("Geom. Mean"),     "GEOMETRIC"}
+};
+
+#define n_C (sizeof (cell_spec) / sizeof (struct cell_spec))
+
+struct means
+{
+  size_t n_dep_vars;
+  const struct variable **dep_vars;
+
+  size_t n_interactions;
+  struct interaction **interactions;
+
+  size_t *n_factor_vars;
+  const struct variable ***factor_vars;
+
+  int ii;
+
+  int n_layers;
+
+  const struct dictionary *dict;
+
+  enum mv_class exclude;
+
+  /* an array  indicating which statistics are to be calculated */
+  int *cells;
+
+  /* Size of cells */
+  int n_cells;
+
+  struct categoricals *cats;
+};
+
+
+static void
+run_means (struct means *cmd, struct casereader *input,
+          const struct dataset *ds);
+
+/* Append all the variables belonging to layer and all subsequent layers
+   to iact. And then append iact to the means->interaction.
+   This is a recursive function.
+ */
+static void
+iact_append_factor (struct means *means, int layer, const struct interaction *iact)
+{
+  int v;
+  const struct variable **fv ;
+
+  if (layer >= means->n_layers)
+    return;
+
+  fv = means->factor_vars[layer];
+
+  for (v = 0; v < means->n_factor_vars[layer]; ++v)
+    {
+      struct interaction *nexti = interaction_clone (iact);
+
+      interaction_add_variable (nexti, fv[v]);
+
+      iact_append_factor (means, layer + 1, nexti);
+
+      if (layer == means->n_layers - 1)
+       {
+         means->interactions[means->ii++] = nexti;
+       }
+    }
+}
+
+int
+cmd_means (struct lexer *lexer, struct dataset *ds)
+{
+  int i;
+  int l;
+  struct means means;
+
+  means.n_factor_vars = NULL;
+  means.factor_vars = NULL;
+
+  means.n_layers = 0;
+
+  means.n_dep_vars = 0;
+  means.dict = dataset_dict (ds);
+
+  means.n_cells = 3;
+  means.cells = xcalloc (means.n_cells, sizeof (*means.cells));
+  
+  /* The first three items (MEANS, COUNT, STDDEV) are the default */
+  for (i = 0; i < 3 ; ++i)
+    means.cells[i] = i;
+  
+
+  /*   Optional TABLES =   */
+  if (lex_match_id (lexer, "TABLES"))
+    {
+      lex_force_match (lexer, T_EQUALS);
+    }
+
+  /* Dependent variable (s) */
+  if (!parse_variables_const (lexer, means.dict,
+                             &means.dep_vars, &means.n_dep_vars,
+                             PV_NO_DUPLICATE | PV_NUMERIC))
+    goto error;
+
+  /* Factor variable (s) */
+  while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH)
+    {
+      if (lex_match (lexer, T_BY))
+       {
+         means.n_layers++;
+         means.factor_vars =
+           xrealloc (means.factor_vars,
+                     sizeof (*means.factor_vars) * means.n_layers);
+         means.n_factor_vars =
+           xrealloc (means.n_factor_vars,
+                     sizeof (*means.n_factor_vars) * means.n_layers);
+
+         if (!parse_variables_const (lexer, means.dict,
+                                     &means.factor_vars[means.n_layers - 1],
+                                     &means.n_factor_vars[means.n_layers -
+                                                          1],
+                                     PV_NO_DUPLICATE | PV_NUMERIC))
+           goto error;
+
+       }
+    }
+
+  /* /MISSING subcommand */
+  while (lex_token (lexer) != T_ENDCMD)
+    {
+      lex_match (lexer, T_SLASH);
+
+      if (lex_match_id (lexer, "MISSING"))
+       {
+         lex_match (lexer, T_EQUALS);
+         while (lex_token (lexer) != T_ENDCMD
+                && lex_token (lexer) != T_SLASH)
+           {
+             if (lex_match_id (lexer, "INCLUDE"))
+               {
+                 means.exclude = MV_SYSTEM;
+               }
+             else if (lex_match_id (lexer, "EXCLUDE"))
+               {
+                 means.exclude = MV_ANY;
+               }
+             else
+               {
+                 lex_error (lexer, NULL);
+                 goto error;
+               }
+           }
+       }
+      else if (lex_match_id (lexer, "CELLS"))
+       {
+         lex_match (lexer, T_EQUALS);
+
+         /* The default values become overwritten */
+         means.n_cells = 0;
+         while (lex_token (lexer) != T_ENDCMD
+                && lex_token (lexer) != T_SLASH)
+           {
+             int k;
+             for (k = 0; k < n_C; ++k)
+               {
+                 if (lex_match_id (lexer, cell_spec[k].keyword))
+                   {
+                     means.cells =
+                       xrealloc (means.cells,
+                                 ++means.n_cells * sizeof (*means.cells));
+
+                     means.cells[means.n_cells - 1] = k;
+                     break;
+                   }
+               }
+             if (k >= n_C)
+               {
+                 lex_error (lexer, NULL);
+                 goto error;
+               }
+           }
+       }
+      else
+       {
+         lex_error (lexer, NULL);
+         goto error;
+       }
+    }
+
+
+  means.n_interactions = 1;
+  for (l = 0; l < means.n_layers; ++l)
+    {
+      const int n_vars = means.n_factor_vars[l];
+      means.n_interactions *= n_vars;
+    }
+
+  means.interactions =
+    xcalloc (means.n_interactions, sizeof (*means.interactions));
+
+  means.ii = 0;
+
+  iact_append_factor (&means, 0, interaction_create (NULL));
+
+  {
+    struct casegrouper *grouper;
+    struct casereader *group;
+    bool ok;
+
+    grouper = casegrouper_create_splits (proc_open (ds), means.dict);
+    while (casegrouper_get_next_group (grouper, &group))
+      {
+       run_means (&means, group, ds);
+      }
+    ok = casegrouper_destroy (grouper);
+    ok = proc_commit (ds) && ok;
+  }
+
+
+  return CMD_SUCCESS;
+
+error:
+
+  free (means.dep_vars);
+
+  return CMD_FAILURE;
+}
+
+static void output_case_processing_summary (const struct means *cmd);
+static void output_report (const struct means *,
+                         const struct interaction *);
+
+static void
+run_means (struct means *cmd, struct casereader *input,
+          const struct dataset *ds)
+{
+  int i;
+  const struct variable *wv = dict_get_weight (cmd->dict);
+  struct ccase *c;
+  struct casereader *reader;
+
+  bool warn_bad_weight = true;
+
+  cmd->cats
+    = categoricals_create (cmd->interactions,
+                          cmd->n_interactions, wv, cmd->exclude, 0, 0, 0, 0);
+
+
+  for (reader = casereader_clone (input);
+       (c = casereader_read (reader)) != NULL; case_unref (c))
+    {
+      double weight = dict_get_case_weight (cmd->dict, c, &warn_bad_weight);
+
+      printf ("%g\n", case_data_idx (c, 0)->f);
+      categoricals_update (cmd->cats, c);
+    }
+  casereader_destroy (reader);
+
+  categoricals_done (cmd->cats);
+
+  output_case_processing_summary (cmd);
+
+  for (i = 0; i < cmd->n_interactions; ++i)
+    {
+      output_report (cmd, cmd->interactions[i]);
+    }
+}
+
+
+static void
+output_case_processing_summary (const struct means *cmd)
+{
+  int i;
+  const int heading_columns = 1;
+  const int heading_rows = 3;
+  struct tab_table *t;
+
+  const int nr = heading_rows + cmd->n_interactions;
+  const int nc = 7;
+
+  t = tab_create (nc, nr);
+  tab_title (t, _("Case Processing Summary"));
+
+  tab_headers (t, heading_columns, 0, heading_rows, 0);
+
+  tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, nc - 1, nr - 1);
+
+  tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
+  tab_vline (t, TAL_2, heading_columns, 0, nr - 1);
+
+
+  tab_joint_text (t, heading_columns, 0,
+                 nc - 1, 0, TAB_CENTER | TAT_TITLE, _("Cases"));
+
+  tab_joint_text (t, 1, 1, 2, 1, TAB_CENTER | TAT_TITLE, _("Included"));
+  tab_joint_text (t, 3, 1, 4, 1, TAB_CENTER | TAT_TITLE, _("Excluded"));
+  tab_joint_text (t, 5, 1, 6, 1, TAB_CENTER | TAT_TITLE, _("Total"));
+
+  tab_hline (t, TAL_1, heading_columns, nc - 1, 1);
+  tab_hline (t, TAL_1, heading_columns, nc - 1, 2);
+
+
+  for (i = 0; i < 3; ++i)
+    {
+      tab_text (t, heading_columns + i * 2, 2, TAB_CENTER | TAT_TITLE,
+               _("N"));
+      tab_text (t, heading_columns + i * 2 + 1, 2, TAB_CENTER | TAT_TITLE,
+               _("Percent"));
+    }
+
+  for (i = 0; i < cmd->n_interactions; ++i)
+    {
+      const struct interaction *iact = cmd->interactions[i];
+
+      struct string str;
+      ds_init_empty (&str);
+      interaction_to_string (iact, &str);
+
+      size_t n = categoricals_n_count (cmd->cats, i);
+
+      tab_text (t, 0, i + heading_rows, TAB_LEFT | TAT_TITLE, ds_cstr (&str));
+
+      printf ("Count %d is %d\n", i, n);
+
+
+      ds_destroy (&str);
+    }
+
+  tab_submit (t);
+}
+
+
+
+static void
+output_report (const struct means *cmd, const struct interaction *iact)
+{
+  int i;
+  const int heading_columns = 0;
+  const int heading_rows = 1;
+  struct tab_table *t;
+
+  const int nr = 18;
+  const int nc = heading_columns + iact->n_vars + cmd->n_cells;
+
+
+  t = tab_create (nc, nr);
+  tab_title (t, _("Report"));
+
+  tab_headers (t, heading_columns, 0, heading_rows, 0);
+
+  tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, nc - 1, nr - 1);
+
+  tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
+  tab_vline (t, TAL_2, iact->n_vars, 0, nr - 1);
+
+  for (i = 0; i < iact->n_vars; ++i)
+    {
+      tab_text (t, heading_columns + i, 0, TAB_CENTER | TAT_TITLE,
+               var_to_string (iact->vars[i]));
+    }
+
+  for (i = 0; i < cmd->n_cells; ++i)
+    {
+      tab_text (t, heading_columns + iact->n_vars + i, 0,
+               TAB_CENTER | TAT_TITLE,
+               gettext (cell_spec[cmd->cells[i]].title));
+    }
+
+  tab_text (t, heading_columns + 1, 5, TAB_CENTER | TAT_TITLE, "data");
+
+  tab_submit (t);
+}
author	John Darrington <john@darrington.wattle.id.au>
	Sat, 10 Dec 2011 13:25:46 +0000 (14:25 +0100)
committer	John Darrington <john@darrington.wattle.id.au>
	Sat, 10 Dec 2011 13:25:46 +0000 (14:25 +0100)
src/language/stats/automake.mk		patch \| blob \| history
src/language/stats/means.c	[new file with mode: 0644]	patch \| blob