Replace regression.q with regression.c

author John Darrington <john@darrington.wattle.id.au>

Fri, 13 Apr 2012 16:55:36 +0000 (18:55 +0200)

committer John Darrington <john@darrington.wattle.id.au>

Fri, 13 Apr 2012 16:55:36 +0000 (18:55 +0200)
author John Darrington <john@darrington.wattle.id.au>
Fri, 13 Apr 2012 16:55:36 +0000 (18:55 +0200)
committer John Darrington <john@darrington.wattle.id.au>
Fri, 13 Apr 2012 16:55:36 +0000 (18:55 +0200)
diff --git a/src/language/command.def b/src/language/command.def

index f8ee29af615a84a7d6ba24b5f4eb3fe16d6aa9c3..05b18d6c61a94ea5a746261bdcc19e2af45b82dd 100644 (file)
--- a/src/language/command.def
+++ b/src/language/command.def
@@ -128,7 +128,7 @@ DEF_CMD (S_DATA, 0, "ONEWAY", cmd_oneway)
  DEF_CMD (S_DATA, 0, "PEARSON CORRELATIONS", cmd_correlation)
  DEF_CMD (S_DATA, 0, "QUICK CLUSTER", cmd_quick_cluster)
  DEF_CMD (S_DATA, 0, "RANK", cmd_rank)
-DEF_CMD (S_DATA, 0, "REGRESSION", cmd_regression)
+DEF_CMD (S_DATA, 0, "REGRESSION",  cmd_regression)
  DEF_CMD (S_DATA, 0, "RELIABILITY", cmd_reliability)
  DEF_CMD (S_DATA, 0, "RENAME VARIABLES", cmd_rename_variables)
  DEF_CMD (S_DATA, 0, "ROC", cmd_roc)
diff --git a/src/language/stats/.gitignore b/src/language/stats/.gitignore

index d550b0d129284e496c8d5e71803a37e83d6f617e..10f6baf208f940983662548d0931b99fed8329b5 100644 (file)
--- a/src/language/stats/.gitignore
+++ b/src/language/stats/.gitignore
@@ -1,3 +1,2 @@
  crosstabs.c
  frequencies.c
-regression.c
diff --git a/src/language/stats/automake.mk b/src/language/stats/automake.mk

index 93a8e2053df6809a0ffbc1fcc36e23af08629ca7..8e89d014b937c8ab56ef70b9f89dcc6a04190901 100644 (file)
--- a/src/language/stats/automake.mk
+++ b/src/language/stats/automake.mk
@@ -4,8 +4,7 @@ AM_CPPFLAGS += -I$(top_srcdir)/src/language/stats
  
  src_language_stats_built_sources = \
         src/language/stats/crosstabs.c \
-       src/language/stats/frequencies.c \
-       src/language/stats/regression.c
+       src/language/stats/frequencies.c
  
  language_stats_sources = \
         src/language/stats/aggregate.c \
@@ -50,6 +49,7 @@ language_stats_sources = \
         src/language/stats/reliability.c \
         src/language/stats/roc.c \
         src/language/stats/roc.h \
+       src/language/stats/regression.c \
         src/language/stats/runs.h \
         src/language/stats/runs.c \
         src/language/stats/sign.c \
diff --git a/src/language/stats/regression.c b/src/language/stats/regression.c

new file mode 100644 (file)

index 0000000..0a2e2a8
--- /dev/null
+++ b/src/language/stats/regression.c
@@ -0,0 +1,925 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2005, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#include <stdbool.h>
+
+#include <gsl/gsl_cdf.h>
+#include <gsl/gsl_matrix.h>
+
+#include <data/dataset.h>
+
+#include "language/command.h"
+#include "language/lexer/lexer.h"
+#include "language/lexer/value-parser.h"
+#include "language/lexer/variable-parser.h"
+
+
+#include "data/casegrouper.h"
+#include "data/casereader.h"
+#include "data/dictionary.h"
+
+#include "math/covariance.h"
+#include "math/linreg.h"
+#include "math/moments.h"
+
+#include "libpspp/message.h"
+#include "libpspp/taint.h"
+
+#include "output/tab.h"
+
+#include "gettext.h"
+#define _(msgid) gettext (msgid)
+#define N_(msgid) msgid
+
+
+#include <gl/intprops.h>
+
+#define REG_LARGE_DATA 1000
+
+struct regression
+{
+  struct dataset *ds;
+
+  const struct variable **vars;
+  size_t n_vars;
+
+  const struct variable **dep_vars;
+  size_t n_dep_vars;
+
+  bool r;
+  bool coeff;
+  bool anova;
+  bool bcov;
+
+
+  bool resid;
+  bool pred;
+
+  linreg **models;
+};
+
+
+static void run_regression (const struct regression *cmd, struct casereader *input);
+
+
+
+/*
+  Transformations for saving predicted values
+  and residuals, etc.
+*/
+struct reg_trns
+{
+  int n_trns;                  /* Number of transformations. */
+  int trns_id;                 /* Which trns is this one? */
+  linreg *c;           /* Linear model for this trns. */
+};
+
+/*
+  Gets the predicted values.
+*/
+static int
+regression_trns_pred_proc (void *t_, struct ccase **c,
+                          casenumber case_idx UNUSED)
+{
+  size_t i;
+  size_t n_vals;
+  struct reg_trns *trns = t_;
+  linreg *model;
+  union value *output = NULL;
+  const union value *tmp;
+  double *vals;
+  const struct variable **vars = NULL;
+
+  assert (trns != NULL);
+  model = trns->c;
+  assert (model != NULL);
+  assert (model->depvar != NULL);
+  assert (model->pred != NULL);
+
+  vars = linreg_get_vars (model);
+  n_vals = linreg_n_coeffs (model);
+  vals = xnmalloc (n_vals, sizeof (*vals));
+  *c = case_unshare (*c);
+
+  output = case_data_rw (*c, model->pred);
+
+  for (i = 0; i < n_vals; i++)
+    {
+      tmp = case_data (*c, vars[i]);
+      vals[i] = tmp->f;
+    }
+  output->f = linreg_predict (model, vals, n_vals);
+  free (vals);
+  return TRNS_CONTINUE;
+}
+
+/*
+  Gets the residuals.
+*/
+static int
+regression_trns_resid_proc (void *t_, struct ccase **c,
+                           casenumber case_idx UNUSED)
+{
+  size_t i;
+  size_t n_vals;
+  struct reg_trns *trns = t_;
+  linreg *model;
+  union value *output = NULL;
+  const union value *tmp;
+  double *vals = NULL;
+  double obs;
+  const struct variable **vars = NULL;
+
+  assert (trns != NULL);
+  model = trns->c;
+  assert (model != NULL);
+  assert (model->depvar != NULL);
+  assert (model->resid != NULL);
+
+  vars = linreg_get_vars (model);
+  n_vals = linreg_n_coeffs (model);
+
+  vals = xnmalloc (n_vals, sizeof (*vals));
+  *c = case_unshare (*c);
+  output = case_data_rw (*c, model->resid);
+  assert (output != NULL);
+
+  for (i = 0; i < n_vals; i++)
+    {
+      tmp = case_data (*c, vars[i]);
+      vals[i] = tmp->f;
+    }
+  tmp = case_data (*c, model->depvar);
+  obs = tmp->f;
+  output->f = linreg_residual (model, obs, vals, n_vals);
+  free (vals);
+
+  return TRNS_CONTINUE;
+}
+
+
+static char *
+reg_get_name (const struct dictionary *dict, const char *prefix)
+{
+  char *name;
+  int i;
+
+  /* XXX handle too-long prefixes */
+  name = xmalloc (strlen (prefix) + INT_BUFSIZE_BOUND (i) + 1);
+  for (i = 1; ; i++)
+    {
+      sprintf (name, "%s%d", prefix, i);
+      if (dict_lookup_var (dict, name) == NULL)
+        return name;
+    }
+}
+
+/*
+  Free the transformation. Free its linear model if this
+  transformation is the last one.
+*/
+static bool
+regression_trns_free (void *t_)
+{
+  bool result = true;
+  struct reg_trns *t = t_;
+
+  if (t->trns_id == t->n_trns)
+    {
+      result = linreg_free (t->c);
+    }
+  free (t);
+
+  return result;
+}
+
+static void
+reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f,
+             linreg * c, struct variable **v, int n_trns)
+{
+  struct dictionary *dict = dataset_dict (ds);
+  static int trns_index = 1;
+  char *name;
+  struct variable *new_var;
+  struct reg_trns *t = NULL;
+
+  t = xmalloc (sizeof (*t));
+  t->trns_id = trns_index;
+  t->n_trns = n_trns;
+  t->c = c;
+
+  name = reg_get_name (dict, prefix);
+  new_var = dict_create_var_assert (dict, name, 0);
+  free (name);
+
+  *v = new_var;
+  add_transformation (ds, f, regression_trns_free, t);
+  trns_index++;
+}
+
+static void
+subcommand_save (const struct regression *cmd)
+{
+  linreg **lc;
+  int n_trns = 0;
+
+  if ( cmd->resid ) n_trns++;
+  if ( cmd->pred ) n_trns++;
+
+  n_trns *= cmd->n_dep_vars;
+
+  for (lc = cmd->models; lc < cmd->models + cmd->n_dep_vars; lc++)
+    {
+      if (*lc != NULL)
+        {
+          if ((*lc)->depvar != NULL)
+            {
+              if (cmd->resid)
+                {
+                  reg_save_var (cmd->ds, "RES", regression_trns_resid_proc, *lc,
+                                &(*lc)->resid, n_trns);
+                }
+              if (cmd->pred)
+                {
+                  reg_save_var (cmd->ds, "PRED", regression_trns_pred_proc, *lc,
+                                &(*lc)->pred, n_trns);
+                }
+            }
+        }
+    }
+}
+
+int
+cmd_regression (struct lexer *lexer, struct dataset *ds)
+{
+  struct regression regression;
+  const struct dictionary *dict = dataset_dict (ds);
+
+  memset (&regression, 0, sizeof (struct regression));
+
+  regression.anova = true;
+  regression.coeff = true;
+  regression.r = true;
+
+  regression.pred = false;
+  regression.resid = false;
+
+  regression.ds = ds;
+
+  /* Accept an optional, completely pointless "/VARIABLES=" */
+  lex_match (lexer, T_SLASH);
+  if (lex_match_id  (lexer, "VARIABLES"))
+    {
+      if (! lex_force_match (lexer, T_EQUALS) )
+        goto error;
+    }
+
+  if (!parse_variables_const (lexer, dict,
+                             &regression.vars, &regression.n_vars,
+                             PV_NO_DUPLICATE | PV_NUMERIC))
+    goto error;
+
+
+  while (lex_token (lexer) != T_ENDCMD)
+    {
+      lex_match (lexer, T_SLASH);
+
+      if (lex_match_id  (lexer, "DEPENDENT"))
+        {
+          if (! lex_force_match (lexer, T_EQUALS) )
+            goto error;
+
+          if (!parse_variables_const (lexer, dict,
+                                      &regression.dep_vars, &regression.n_dep_vars,
+                                      PV_NO_DUPLICATE | PV_NUMERIC))
+            goto error;
+        }
+      else if (lex_match_id (lexer, "METHOD"))
+       {
+         lex_match (lexer, T_EQUALS);
+
+          if (!lex_force_match_id (lexer, "ENTER"))
+            {
+              goto error;
+            }
+        }
+      else if (lex_match_id (lexer, "STATISTICS"))
+       {
+         lex_match (lexer, T_EQUALS);
+
+         while (lex_token (lexer) != T_ENDCMD
+                && lex_token (lexer) != T_SLASH)
+           {
+              if (lex_match (lexer, T_ALL))
+                {
+                }
+              else if (lex_match_id (lexer, "DEFAULTS"))
+                {
+                }
+              else if (lex_match_id (lexer, "R"))
+                {
+                }
+              else if (lex_match_id (lexer, "COEFF"))
+                {
+                }
+              else if (lex_match_id (lexer, "ANOVA"))
+                {
+                }
+              else if (lex_match_id (lexer, "BCOV"))
+                {
+                }
+              else
+                {
+                  lex_error (lexer, NULL);
+                  goto error;
+                }
+            }
+        }
+      else if (lex_match_id (lexer, "SAVE"))
+       {
+         lex_match (lexer, T_EQUALS);
+
+         while (lex_token (lexer) != T_ENDCMD
+                && lex_token (lexer) != T_SLASH)
+           {
+              if (lex_match_id (lexer, "PRED"))
+                {
+                  regression.pred = true;
+                }
+              else if (lex_match_id (lexer, "RESID"))
+                {
+                  regression.resid = true;
+                }
+              else
+                {
+                  lex_error (lexer, NULL);
+                  goto error;
+                }
+            }
+        }
+      else
+        {
+          lex_error (lexer, NULL);
+          goto error;
+        }
+    }
+
+  if (!regression.vars)
+    {
+      dict_get_vars (dict, &regression.vars, &regression.n_vars, 0);
+    }
+
+
+  regression.models = xcalloc (regression.n_dep_vars, sizeof *regression.models);
+
+  {
+    struct casegrouper *grouper;
+    struct casereader *group;
+    bool ok;
+    
+    grouper = casegrouper_create_splits (proc_open (ds), dict);
+    while (casegrouper_get_next_group (grouper, &group))
+      run_regression (&regression, group);
+    ok = casegrouper_destroy (grouper);
+    ok = proc_commit (ds) && ok;
+  }
+
+  if (regression.pred || regression.resid )
+    subcommand_save (&regression);
+ 
+
+  return CMD_SUCCESS;
+  
+ error:
+  return CMD_FAILURE;
+}
+
+
+static size_t
+get_n_all_vars (const struct regression *cmd)
+{
+  size_t result = cmd->n_vars;
+  size_t i;
+  size_t j;
+
+  result += cmd->n_dep_vars;
+  for (i = 0; i < cmd->n_dep_vars; i++)
+    {
+      for (j = 0; j < cmd->n_vars; j++)
+       {
+         if (cmd->vars[j] == cmd->dep_vars[i])
+           {
+             result--;
+           }
+       }
+    }
+  return result;
+}
+
+static void
+fill_all_vars (const struct variable **vars, const struct regression *cmd)
+{
+  size_t i;
+  size_t j;
+  bool absent;
+  
+  for (i = 0; i < cmd->n_vars; i++)
+    {
+      vars[i] = cmd->vars[i];
+    }
+  for (i = 0; i < cmd->n_dep_vars; i++)
+    {
+      absent = true;
+      for (j = 0; j < cmd->n_vars; j++)
+       {
+         if (cmd->dep_vars[i] == cmd->vars[j])
+           {
+             absent = false;
+             break;
+           }
+       }
+      if (absent)
+       {
+         vars[i + cmd->n_vars] = cmd->dep_vars[i];
+       }
+    }
+}
+
+/*
+  Is variable k the dependent variable?
+*/
+static bool
+is_depvar (const struct regression *cmd, size_t k, const struct variable *v)
+{
+  return v == cmd->vars[k];
+}
+
+
+/* Identify the explanatory variables in v_variables.  Returns
+   the number of independent variables. */
+static int
+identify_indep_vars (const struct regression *cmd, 
+                     const struct variable **indep_vars,
+                    const struct variable *depvar)
+{
+  int n_indep_vars = 0;
+  int i;
+
+  for (i = 0; i < cmd->n_vars; i++)
+    if (!is_depvar (cmd, i, depvar))
+      indep_vars[n_indep_vars++] = cmd->vars[i];
+  if ((n_indep_vars < 1) && is_depvar (cmd, 0, depvar))
+    {
+      /*
+       There is only one independent variable, and it is the same
+       as the dependent variable. Print a warning and continue.
+      */
+      msg (SE,
+          gettext ("The dependent variable is equal to the independent variable." 
+                   "The least squares line is therefore Y=X." 
+                   "Standard errors and related statistics may be meaningless."));
+      n_indep_vars = 1;
+      indep_vars[0] = cmd->vars[0];
+    }
+  return n_indep_vars;
+}
+
+
+static double
+fill_covariance (gsl_matrix *cov, struct covariance *all_cov, 
+                const struct variable **vars,
+                size_t n_vars, const struct variable *dep_var, 
+                const struct variable **all_vars, size_t n_all_vars,
+                double *means)
+{
+  size_t i;
+  size_t j;
+  size_t dep_subscript;
+  size_t *rows;
+  const gsl_matrix *ssizes;
+  const gsl_matrix *mean_matrix;
+  const gsl_matrix *ssize_matrix;
+  double result = 0.0;
+  
+  gsl_matrix *cm = covariance_calculate_unnormalized (all_cov);
+
+  if ( cm == NULL)
+    return 0;
+
+  rows = xnmalloc (cov->size1 - 1, sizeof (*rows));
+  
+  for (i = 0; i < n_all_vars; i++)
+    {
+      for (j = 0; j < n_vars; j++)
+       {
+         if (vars[j] == all_vars[i])
+           {
+             rows[j] = i;
+           }
+       }
+      if (all_vars[i] == dep_var)
+       {
+         dep_subscript = i;
+       }
+    }
+  mean_matrix = covariance_moments (all_cov, MOMENT_MEAN);
+  ssize_matrix = covariance_moments (all_cov, MOMENT_NONE);
+  for (i = 0; i < cov->size1 - 1; i++)
+    {
+      means[i] = gsl_matrix_get (mean_matrix, rows[i], 0)
+       / gsl_matrix_get (ssize_matrix, rows[i], 0);
+      for (j = 0; j < cov->size2 - 1; j++)
+       {
+         gsl_matrix_set (cov, i, j, gsl_matrix_get (cm, rows[i], rows[j]));
+         gsl_matrix_set (cov, j, i, gsl_matrix_get (cm, rows[j], rows[i]));
+       }
+    }
+  means[cov->size1 - 1] = gsl_matrix_get (mean_matrix, dep_subscript, 0)
+    / gsl_matrix_get (ssize_matrix, dep_subscript, 0);
+  ssizes = covariance_moments (all_cov, MOMENT_NONE);
+  result = gsl_matrix_get (ssizes, dep_subscript, rows[0]);
+  for (i = 0; i < cov->size1 - 1; i++)
+    {
+      gsl_matrix_set (cov, i, cov->size1 - 1, 
+                     gsl_matrix_get (cm, rows[i], dep_subscript));
+      gsl_matrix_set (cov, cov->size1 - 1, i, 
+                     gsl_matrix_get (cm, rows[i], dep_subscript));
+      if (result > gsl_matrix_get (ssizes, rows[i], dep_subscript))
+       {
+         result = gsl_matrix_get (ssizes, rows[i], dep_subscript);
+       }
+    }
+  gsl_matrix_set (cov, cov->size1 - 1, cov->size1 - 1, 
+                 gsl_matrix_get (cm, dep_subscript, dep_subscript));
+  free (rows);
+  gsl_matrix_free (cm);
+  return result;
+}
+
+
+/*
+  STATISTICS subcommand output functions.
+*/
+static void reg_stats_r (linreg *, void *);
+static void reg_stats_coeff (linreg *, void *);
+static void reg_stats_anova (linreg *, void *);
+static void reg_stats_bcov (linreg *, void *);
+
+static void statistics_keyword_output (void (*)(linreg *, void *),
+                                      bool, linreg *, void *);
+
+
+
+static void
+subcommand_statistics (const struct regression *cmd , linreg * c, void *aux)
+{
+  statistics_keyword_output (reg_stats_r, cmd->r, c, aux);
+  statistics_keyword_output (reg_stats_anova, cmd->anova, c, aux);
+  statistics_keyword_output (reg_stats_coeff, cmd->coeff, c, aux);
+  statistics_keyword_output (reg_stats_bcov, cmd->bcov, c, aux);
+}
+
+
+static void
+run_regression (const struct regression *cmd, struct casereader *input)
+{
+  size_t i;
+  int n_indep = 0;
+  int k;
+  double *means;
+  struct ccase *c;
+  struct covariance *cov;
+  const struct variable **vars;
+  const struct variable **all_vars;
+  const struct variable *dep_var;
+  struct casereader *reader;
+  size_t n_all_vars;
+
+  linreg **models = cmd->models;
+
+  n_all_vars = get_n_all_vars (cmd);
+  all_vars = xnmalloc (n_all_vars, sizeof (*all_vars));
+  fill_all_vars (all_vars, cmd);
+  vars = xnmalloc (cmd->n_vars, sizeof (*vars));
+  means  = xnmalloc (n_all_vars, sizeof (*means));
+  cov = covariance_1pass_create (n_all_vars, all_vars,
+                                dict_get_weight (dataset_dict (cmd->ds)), MV_ANY);
+
+  reader = casereader_clone (input);
+  reader = casereader_create_filter_missing (reader, all_vars, n_all_vars,
+                                            MV_ANY, NULL, NULL);
+
+
+  for (; (c = casereader_read (reader)) != NULL; case_unref (c))
+    {
+      covariance_accumulate (cov, c);
+    }
+
+  for (k = 0; k < cmd->n_dep_vars; k++)
+    {
+      double n_data;
+
+      gsl_matrix *this_cm;
+      dep_var = cmd->dep_vars[k];
+      n_indep = identify_indep_vars (cmd, vars, dep_var);
+      
+      this_cm = gsl_matrix_alloc (n_indep + 1, n_indep + 1);
+      n_data = fill_covariance (this_cm, cov, vars, n_indep, 
+                               dep_var, all_vars, n_all_vars, means);
+      models[k] = linreg_alloc (dep_var, (const struct variable **) vars,
+                               n_data, n_indep);
+      models[k]->depvar = dep_var;
+      for (i = 0; i < n_indep; i++)
+       {
+         linreg_set_indep_variable_mean (models[k], i, means[i]);
+       }
+      linreg_set_depvar_mean (models[k], means[i]);
+      /*
+       For large data sets, use QR decomposition.
+      */
+      if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
+       {
+         models[k]->method = LINREG_QR;
+       }
+
+      if (n_data > 0)
+       {
+         /*
+           Find the least-squares estimates and other statistics.
+         */
+         linreg_fit (this_cm, models[k]);
+         
+         if (!taint_has_tainted_successor (casereader_get_taint (input)))
+           {
+             subcommand_statistics (cmd, models[k], this_cm);
+           }
+       }
+      else
+       {
+         msg (SE,
+              _("No valid data found. This command was skipped."));
+         linreg_free (models[k]);
+         models[k] = NULL;
+       }
+      gsl_matrix_free (this_cm);
+    }
+  
+  casereader_destroy (reader);
+  free (vars);
+  free (all_vars);
+  free (means);
+  casereader_destroy (input);
+  covariance_destroy (cov);
+}
+
+
+\f
+
+
+static void
+reg_stats_r (linreg *c, void *aux UNUSED)
+{
+  struct tab_table *t;
+  int n_rows = 2;
+  int n_cols = 5;
+  double rsq;
+  double adjrsq;
+  double std_error;
+
+  assert (c != NULL);
+  rsq = linreg_ssreg (c) / linreg_sst (c);
+  adjrsq = 1.0 - (1.0 - rsq) * (linreg_n_obs (c) - 1.0) / (linreg_n_obs (c) - linreg_n_coeffs (c));
+  std_error = sqrt (linreg_mse (c));
+  t = tab_create (n_cols, n_rows);
+  tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
+  tab_hline (t, TAL_2, 0, n_cols - 1, 1);
+  tab_vline (t, TAL_2, 2, 0, n_rows - 1);
+  tab_vline (t, TAL_0, 1, 0, 0);
+
+  tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
+  tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
+  tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
+  tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
+  tab_double (t, 1, 1, TAB_RIGHT, sqrt (rsq), NULL);
+  tab_double (t, 2, 1, TAB_RIGHT, rsq, NULL);
+  tab_double (t, 3, 1, TAB_RIGHT, adjrsq, NULL);
+  tab_double (t, 4, 1, TAB_RIGHT, std_error, NULL);
+  tab_title (t, _("Model Summary"));
+  tab_submit (t);
+}
+
+/*
+  Table showing estimated regression coefficients.
+*/
+static void
+reg_stats_coeff (linreg * c, void *aux_)
+{
+  size_t j;
+  int n_cols = 7;
+  int n_rows;
+  int this_row;
+  double t_stat;
+  double pval;
+  double std_err;
+  double beta;
+  const char *label;
+
+  const struct variable *v;
+  struct tab_table *t;
+  gsl_matrix *cov = aux_;
+
+  assert (c != NULL);
+  n_rows = linreg_n_coeffs (c) + 3;
+
+  t = tab_create (n_cols, n_rows);
+  tab_headers (t, 2, 0, 1, 0);
+  tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
+  tab_hline (t, TAL_2, 0, n_cols - 1, 1);
+  tab_vline (t, TAL_2, 2, 0, n_rows - 1);
+  tab_vline (t, TAL_0, 1, 0, 0);
+
+  tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
+  tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
+  tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
+  tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
+  tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
+  tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
+  tab_double (t, 2, 1, 0, linreg_intercept (c), NULL);
+  std_err = sqrt (gsl_matrix_get (linreg_cov (c), 0, 0));
+  tab_double (t, 3, 1, 0, std_err, NULL);
+  tab_double (t, 4, 1, 0, 0.0, NULL);
+  t_stat = linreg_intercept (c) / std_err;
+  tab_double (t, 5, 1, 0, t_stat, NULL);
+  pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), (double) (linreg_n_obs (c) - linreg_n_coeffs (c)));
+  tab_double (t, 6, 1, 0, pval, NULL);
+  for (j = 0; j < linreg_n_coeffs (c); j++)
+    {
+      struct string tstr;
+      ds_init_empty (&tstr);
+      this_row = j + 2;
+
+      v = linreg_indep_var (c, j);
+      label = var_to_string (v);
+      /* Do not overwrite the variable's name. */
+      ds_put_cstr (&tstr, label);
+      tab_text (t, 1, this_row, TAB_CENTER, ds_cstr (&tstr));
+      /*
+        Regression coefficients.
+      */
+      tab_double (t, 2, this_row, 0, linreg_coeff (c, j), NULL);
+      /*
+        Standard error of the coefficients.
+      */
+      std_err = sqrt (gsl_matrix_get (linreg_cov (c), j + 1, j + 1));
+      tab_double (t, 3, this_row, 0, std_err, NULL);
+      /*
+        Standardized coefficient, i.e., regression coefficient
+        if all variables had unit variance.
+      */
+      beta = sqrt (gsl_matrix_get (cov, j, j));
+      beta *= linreg_coeff (c, j) / 
+       sqrt (gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1));
+      tab_double (t, 4, this_row, 0, beta, NULL);
+
+      /*
+        Test statistic for H0: coefficient is 0.
+      */
+      t_stat = linreg_coeff (c, j) / std_err;
+      tab_double (t, 5, this_row, 0, t_stat, NULL);
+      /*
+        P values for the test statistic above.
+      */
+      pval =
+       2 * gsl_cdf_tdist_Q (fabs (t_stat),
+                            (double) (linreg_n_obs (c) - linreg_n_coeffs (c)));
+      tab_double (t, 6, this_row, 0, pval, NULL);
+      ds_destroy (&tstr);
+    }
+  tab_title (t, _("Coefficients"));
+  tab_submit (t);
+}
+
+/*
+  Display the ANOVA table.
+*/
+static void
+reg_stats_anova (linreg * c, void *aux UNUSED)
+{
+  int n_cols = 7;
+  int n_rows = 4;
+  const double msm = linreg_ssreg (c) / linreg_dfmodel (c);
+  const double mse = linreg_mse (c);
+  const double F = msm / mse;
+  const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
+
+  struct tab_table *t;
+
+  assert (c != NULL);
+  t = tab_create (n_cols, n_rows);
+  tab_headers (t, 2, 0, 1, 0);
+
+  tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
+
+  tab_hline (t, TAL_2, 0, n_cols - 1, 1);
+  tab_vline (t, TAL_2, 2, 0, n_rows - 1);
+  tab_vline (t, TAL_0, 1, 0, 0);
+
+  tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
+  tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
+  tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
+  tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
+  tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
+
+  tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
+  tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
+  tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
+
+  /* Sums of Squares */
+  tab_double (t, 2, 1, 0, linreg_ssreg (c), NULL);
+  tab_double (t, 2, 3, 0, linreg_sst (c), NULL);
+  tab_double (t, 2, 2, 0, linreg_sse (c), NULL);
+
+
+  /* Degrees of freedom */
+  tab_text_format (t, 3, 1, TAB_RIGHT, "%g", c->dfm);
+  tab_text_format (t, 3, 2, TAB_RIGHT, "%g", c->dfe);
+  tab_text_format (t, 3, 3, TAB_RIGHT, "%g", c->dft);
+
+  /* Mean Squares */
+  tab_double (t, 4, 1, TAB_RIGHT, msm, NULL);
+  tab_double (t, 4, 2, TAB_RIGHT, mse, NULL);
+
+  tab_double (t, 5, 1, 0, F, NULL);
+
+  tab_double (t, 6, 1, 0, pval, NULL);
+
+  tab_title (t, _("ANOVA"));
+  tab_submit (t);
+}
+
+
+static void
+reg_stats_bcov (linreg * c, void *aux UNUSED)
+{
+  int n_cols;
+  int n_rows;
+  int i;
+  int k;
+  int row;
+  int col;
+  const char *label;
+  struct tab_table *t;
+
+  assert (c != NULL);
+  n_cols = c->n_indeps + 1 + 2;
+  n_rows = 2 * (c->n_indeps + 1);
+  t = tab_create (n_cols, n_rows);
+  tab_headers (t, 2, 0, 1, 0);
+  tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
+  tab_hline (t, TAL_2, 0, n_cols - 1, 1);
+  tab_vline (t, TAL_2, 2, 0, n_rows - 1);
+  tab_vline (t, TAL_0, 1, 0, 0);
+  tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
+  tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
+  for (i = 0; i < linreg_n_coeffs (c); i++)
+    {
+      const struct variable *v = linreg_indep_var (c, i);
+      label = var_to_string (v);
+      tab_text (t, 2, i, TAB_CENTER, label);
+      tab_text (t, i + 2, 0, TAB_CENTER, label);
+      for (k = 1; k < linreg_n_coeffs (c); k++)
+       {
+         col = (i <= k) ? k : i;
+         row = (i <= k) ? i : k;
+         tab_double (t, k + 2, i, TAB_CENTER,
+                      gsl_matrix_get (c->cov, row, col), NULL);
+       }
+    }
+  tab_title (t, _("Coefficient Correlations"));
+  tab_submit (t);
+}
+
+static void
+statistics_keyword_output (void (*function) (linreg *, void *),
+                          bool keyword, linreg * c, void *aux)
+{
+  if (keyword)
+    {
+      (*function) (c, aux);
+    }
+}
diff --git a/src/language/stats/regression.q b/src/language/stats/regression.q

deleted file mode 100644 (file)

index ca1f67d..0000000
--- a/src/language/stats/regression.q
+++ /dev/null
@@ -1,1036 +0,0 @@
-/* PSPP - a program for statistical analysis.
-   Copyright (C) 2005, 2009, 2010, 2011 Free Software Foundation, Inc.
-
-   This program is free software: you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation, either version 3 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
-
-#include <config.h>
-
-#include <gsl/gsl_cdf.h>
-#include <gsl/gsl_matrix.h>
-#include <gsl/gsl_vector.h>
-#include <math.h>
-#include <stdlib.h>
-
-#include "data/case.h"
-#include "data/casegrouper.h"
-#include "data/casereader.h"
-#include "data/dataset.h"
-#include "data/dictionary.h"
-#include "data/missing-values.h"
-#include "data/transformations.h"
-#include "data/value-labels.h"
-#include "data/variable.h"
-#include "language/command.h"
-#include "language/data-io/file-handle.h"
-#include "language/dictionary/split-file.h"
-#include "language/lexer/lexer.h"
-#include "libpspp/compiler.h"
-#include "libpspp/message.h"
-#include "libpspp/taint.h"
-#include "math/covariance.h"
-#include "math/linreg.h"
-#include "math/moments.h"
-#include "output/tab.h"
-
-#include "gl/intprops.h"
-#include "gl/xalloc.h"
-
-#include "gettext.h"
-#define _(msgid) gettext (msgid)
-
-#define REG_LARGE_DATA 1000
-
-/* (headers) */
-
-/* (specification)
-   "REGRESSION" (regression_):
-   *variables=custom;
-   +statistics[st_]=r,
-                    coeff,
-                    anova,
-                    outs,
-                    zpp,
-                    label,
-                    sha,
-                    ci,
-                    bcov,
-                    ses,
-                    xtx,
-                    collin,
-                    tol,
-                    selection,
-                    f,
-                    defaults,
-                    all;
-   ^dependent=varlist;
-   +save[sv_]=resid,pred;
-   +method=enter.
-*/
-/* (declarations) */
-/* (functions) */
-static struct cmd_regression cmd;
-
-/*
-  Moments for each of the variables used.
- */
-struct moments_var
-{
-  struct moments1 *m;
-  const struct variable *v;
-};
-
-/*
-  Transformations for saving predicted values
-  and residuals, etc.
- */
-struct reg_trns
-{
-  int n_trns;                  /* Number of transformations. */
-  int trns_id;                 /* Which trns is this one? */
-  linreg *c;           /* Linear model for this trns. */
-};
-/*
-  Variables used (both explanatory and response).
- */
-static const struct variable **v_variables;
-
-/*
-  Number of variables.
- */
-static size_t n_variables;
-
-static bool run_regression (struct casereader *, struct cmd_regression *,
-                           struct dataset *, linreg **);
-
-/*
-   STATISTICS subcommand output functions.
- */
-static void reg_stats_r (linreg *, void *);
-static void reg_stats_coeff (linreg *, void *);
-static void reg_stats_anova (linreg *, void *);
-static void reg_stats_outs (linreg *, void *);
-static void reg_stats_zpp (linreg *, void *);
-static void reg_stats_label (linreg *, void *);
-static void reg_stats_sha (linreg *, void *);
-static void reg_stats_ci (linreg *, void *);
-static void reg_stats_f (linreg *, void *);
-static void reg_stats_bcov (linreg *, void *);
-static void reg_stats_ses (linreg *, void *);
-static void reg_stats_xtx (linreg *, void *);
-static void reg_stats_collin (linreg *, void *);
-static void reg_stats_tol (linreg *, void *);
-static void reg_stats_selection (linreg *, void *);
-static void statistics_keyword_output (void (*)(linreg *, void *),
-                                      int, linreg *, void *);
-
-static void
-reg_stats_r (linreg *c, void *aux UNUSED)
-{
-  struct tab_table *t;
-  int n_rows = 2;
-  int n_cols = 5;
-  double rsq;
-  double adjrsq;
-  double std_error;
-
-  assert (c != NULL);
-  rsq = linreg_ssreg (c) / linreg_sst (c);
-  adjrsq = 1.0 - (1.0 - rsq) * (linreg_n_obs (c) - 1.0) / (linreg_n_obs (c) - linreg_n_coeffs (c));
-  std_error = sqrt (linreg_mse (c));
-  t = tab_create (n_cols, n_rows);
-  tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
-  tab_hline (t, TAL_2, 0, n_cols - 1, 1);
-  tab_vline (t, TAL_2, 2, 0, n_rows - 1);
-  tab_vline (t, TAL_0, 1, 0, 0);
-
-  tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("R"));
-  tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("R Square"));
-  tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Adjusted R Square"));
-  tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error of the Estimate"));
-  tab_double (t, 1, 1, TAB_RIGHT, sqrt (rsq), NULL);
-  tab_double (t, 2, 1, TAB_RIGHT, rsq, NULL);
-  tab_double (t, 3, 1, TAB_RIGHT, adjrsq, NULL);
-  tab_double (t, 4, 1, TAB_RIGHT, std_error, NULL);
-  tab_title (t, _("Model Summary"));
-  tab_submit (t);
-}
-
-/*
-  Table showing estimated regression coefficients.
- */
-static void
-reg_stats_coeff (linreg * c, void *aux_)
-{
-  size_t j;
-  int n_cols = 7;
-  int n_rows;
-  int this_row;
-  double t_stat;
-  double pval;
-  double std_err;
-  double beta;
-  const char *label;
-
-  const struct variable *v;
-  struct tab_table *t;
-  gsl_matrix *cov = aux_;
-
-  assert (c != NULL);
-  n_rows = linreg_n_coeffs (c) + 3;
-
-  t = tab_create (n_cols, n_rows);
-  tab_headers (t, 2, 0, 1, 0);
-  tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
-  tab_hline (t, TAL_2, 0, n_cols - 1, 1);
-  tab_vline (t, TAL_2, 2, 0, n_rows - 1);
-  tab_vline (t, TAL_0, 1, 0, 0);
-
-  tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("B"));
-  tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
-  tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Beta"));
-  tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t"));
-  tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
-  tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("(Constant)"));
-  tab_double (t, 2, 1, 0, linreg_intercept (c), NULL);
-  std_err = sqrt (gsl_matrix_get (linreg_cov (c), 0, 0));
-  tab_double (t, 3, 1, 0, std_err, NULL);
-  tab_double (t, 4, 1, 0, 0.0, NULL);
-  t_stat = linreg_intercept (c) / std_err;
-  tab_double (t, 5, 1, 0, t_stat, NULL);
-  pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), (double) (linreg_n_obs (c) - linreg_n_coeffs (c)));
-  tab_double (t, 6, 1, 0, pval, NULL);
-  for (j = 0; j < linreg_n_coeffs (c); j++)
-    {
-      struct string tstr;
-      ds_init_empty (&tstr);
-      this_row = j + 2;
-
-      v = linreg_indep_var (c, j);
-      label = var_to_string (v);
-      /* Do not overwrite the variable's name. */
-      ds_put_cstr (&tstr, label);
-      tab_text (t, 1, this_row, TAB_CENTER, ds_cstr (&tstr));
-      /*
-         Regression coefficients.
-       */
-      tab_double (t, 2, this_row, 0, linreg_coeff (c, j), NULL);
-      /*
-         Standard error of the coefficients.
-       */
-      std_err = sqrt (gsl_matrix_get (linreg_cov (c), j + 1, j + 1));
-      tab_double (t, 3, this_row, 0, std_err, NULL);
-      /*
-         Standardized coefficient, i.e., regression coefficient
-         if all variables had unit variance.
-       */
-      beta = sqrt (gsl_matrix_get (cov, j, j));
-      beta *= linreg_coeff (c, j) / 
-       sqrt (gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1));
-      tab_double (t, 4, this_row, 0, beta, NULL);
-
-      /*
-         Test statistic for H0: coefficient is 0.
-       */
-      t_stat = linreg_coeff (c, j) / std_err;
-      tab_double (t, 5, this_row, 0, t_stat, NULL);
-      /*
-         P values for the test statistic above.
-       */
-      pval =
-       2 * gsl_cdf_tdist_Q (fabs (t_stat),
-                            (double) (linreg_n_obs (c) - linreg_n_coeffs (c)));
-      tab_double (t, 6, this_row, 0, pval, NULL);
-      ds_destroy (&tstr);
-    }
-  tab_title (t, _("Coefficients"));
-  tab_submit (t);
-}
-
-/*
-  Display the ANOVA table.
- */
-static void
-reg_stats_anova (linreg * c, void *aux UNUSED)
-{
-  int n_cols = 7;
-  int n_rows = 4;
-  const double msm = linreg_ssreg (c) / linreg_dfmodel (c);
-  const double mse = linreg_mse (c);
-  const double F = msm / mse;
-  const double pval = gsl_cdf_fdist_Q (F, c->dfm, c->dfe);
-
-  struct tab_table *t;
-
-  assert (c != NULL);
-  t = tab_create (n_cols, n_rows);
-  tab_headers (t, 2, 0, 1, 0);
-
-  tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
-
-  tab_hline (t, TAL_2, 0, n_cols - 1, 1);
-  tab_vline (t, TAL_2, 2, 0, n_rows - 1);
-  tab_vline (t, TAL_0, 1, 0, 0);
-
-  tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Sum of Squares"));
-  tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df"));
-  tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Mean Square"));
-  tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("F"));
-  tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance"));
-
-  tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Regression"));
-  tab_text (t, 1, 2, TAB_LEFT | TAT_TITLE, _("Residual"));
-  tab_text (t, 1, 3, TAB_LEFT | TAT_TITLE, _("Total"));
-
-  /* Sums of Squares */
-  tab_double (t, 2, 1, 0, linreg_ssreg (c), NULL);
-  tab_double (t, 2, 3, 0, linreg_sst (c), NULL);
-  tab_double (t, 2, 2, 0, linreg_sse (c), NULL);
-
-
-  /* Degrees of freedom */
-  tab_text_format (t, 3, 1, TAB_RIGHT, "%g", c->dfm);
-  tab_text_format (t, 3, 2, TAB_RIGHT, "%g", c->dfe);
-  tab_text_format (t, 3, 3, TAB_RIGHT, "%g", c->dft);
-
-  /* Mean Squares */
-  tab_double (t, 4, 1, TAB_RIGHT, msm, NULL);
-  tab_double (t, 4, 2, TAB_RIGHT, mse, NULL);
-
-  tab_double (t, 5, 1, 0, F, NULL);
-
-  tab_double (t, 6, 1, 0, pval, NULL);
-
-  tab_title (t, _("ANOVA"));
-  tab_submit (t);
-}
-
-static void
-reg_stats_outs (linreg * c, void *aux UNUSED)
-{
-  assert (c != NULL);
-}
-
-static void
-reg_stats_zpp (linreg * c, void *aux UNUSED)
-{
-  assert (c != NULL);
-}
-
-static void
-reg_stats_label (linreg * c, void *aux UNUSED)
-{
-  assert (c != NULL);
-}
-
-static void
-reg_stats_sha (linreg * c, void *aux UNUSED)
-{
-  assert (c != NULL);
-}
-static void
-reg_stats_ci (linreg * c, void *aux UNUSED)
-{
-  assert (c != NULL);
-}
-static void
-reg_stats_f (linreg * c, void *aux UNUSED)
-{
-  assert (c != NULL);
-}
-static void
-reg_stats_bcov (linreg * c, void *aux UNUSED)
-{
-  int n_cols;
-  int n_rows;
-  int i;
-  int k;
-  int row;
-  int col;
-  const char *label;
-  struct tab_table *t;
-
-  assert (c != NULL);
-  n_cols = c->n_indeps + 1 + 2;
-  n_rows = 2 * (c->n_indeps + 1);
-  t = tab_create (n_cols, n_rows);
-  tab_headers (t, 2, 0, 1, 0);
-  tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, n_cols - 1, n_rows - 1);
-  tab_hline (t, TAL_2, 0, n_cols - 1, 1);
-  tab_vline (t, TAL_2, 2, 0, n_rows - 1);
-  tab_vline (t, TAL_0, 1, 0, 0);
-  tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Model"));
-  tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Covariances"));
-  for (i = 0; i < linreg_n_coeffs (c); i++)
-    {
-      const struct variable *v = linreg_indep_var (c, i);
-      label = var_to_string (v);
-      tab_text (t, 2, i, TAB_CENTER, label);
-      tab_text (t, i + 2, 0, TAB_CENTER, label);
-      for (k = 1; k < linreg_n_coeffs (c); k++)
-       {
-         col = (i <= k) ? k : i;
-         row = (i <= k) ? i : k;
-         tab_double (t, k + 2, i, TAB_CENTER,
-                    gsl_matrix_get (c->cov, row, col), NULL);
-       }
-    }
-  tab_title (t, _("Coefficient Correlations"));
-  tab_submit (t);
-}
-static void
-reg_stats_ses (linreg * c, void *aux UNUSED)
-{
-  assert (c != NULL);
-}
-static void
-reg_stats_xtx (linreg * c, void *aux UNUSED)
-{
-  assert (c != NULL);
-}
-static void
-reg_stats_collin (linreg * c, void *aux UNUSED)
-{
-  assert (c != NULL);
-}
-static void
-reg_stats_tol (linreg * c, void *aux UNUSED)
-{
-  assert (c != NULL);
-}
-static void
-reg_stats_selection (linreg * c, void *aux UNUSED)
-{
-  assert (c != NULL);
-}
-
-static void
-statistics_keyword_output (void (*function) (linreg *, void *),
-                          int keyword, linreg * c, void *aux)
-{
-  if (keyword)
-    {
-      (*function) (c, aux);
-    }
-}
-
-static void
-subcommand_statistics (int *keywords, linreg * c, void *aux)
-{
-  /*
-     The order here must match the order in which the STATISTICS
-     keywords appear in the specification section above.
-   */
-  enum
-  { r,
-    coeff,
-    anova,
-    outs,
-    zpp,
-    label,
-    sha,
-    ci,
-    bcov,
-    ses,
-    xtx,
-    collin,
-    tol,
-    selection,
-    f,
-    defaults,
-    all
-  };
-  int i;
-  int d = 1;
-
-  if (keywords[all])
-    {
-      /*
-         Set everything but F.
-       */
-      for (i = 0; i < f; i++)
-       {
-         keywords[i] = 1;
-       }
-    }
-  else
-    {
-      for (i = 0; i < all; i++)
-       {
-         if (keywords[i])
-           {
-             d = 0;
-           }
-       }
-      /*
-         Default output: ANOVA table, parameter estimates,
-         and statistics for variables not entered into model,
-         if appropriate.
-       */
-      if (keywords[defaults] | d)
-       {
-         keywords[anova] = 1;
-         keywords[outs] = 1;
-         keywords[coeff] = 1;
-         keywords[r] = 1;
-       }
-    }
-  statistics_keyword_output (reg_stats_r, keywords[r], c, aux);
-  statistics_keyword_output (reg_stats_anova, keywords[anova], c, aux);
-  statistics_keyword_output (reg_stats_coeff, keywords[coeff], c, aux);
-  statistics_keyword_output (reg_stats_outs, keywords[outs], c, aux);
-  statistics_keyword_output (reg_stats_zpp, keywords[zpp], c, aux);
-  statistics_keyword_output (reg_stats_label, keywords[label], c, aux);
-  statistics_keyword_output (reg_stats_sha, keywords[sha], c, aux);
-  statistics_keyword_output (reg_stats_ci, keywords[ci], c, aux);
-  statistics_keyword_output (reg_stats_f, keywords[f], c, aux);
-  statistics_keyword_output (reg_stats_bcov, keywords[bcov], c, aux);
-  statistics_keyword_output (reg_stats_ses, keywords[ses], c, aux);
-  statistics_keyword_output (reg_stats_xtx, keywords[xtx], c, aux);
-  statistics_keyword_output (reg_stats_collin, keywords[collin], c, aux);
-  statistics_keyword_output (reg_stats_tol, keywords[tol], c, aux);
-  statistics_keyword_output (reg_stats_selection, keywords[selection], c, aux);
-}
-
-/*
-  Free the transformation. Free its linear model if this
-  transformation is the last one.
- */
-static bool
-regression_trns_free (void *t_)
-{
-  bool result = true;
-  struct reg_trns *t = t_;
-
-  if (t->trns_id == t->n_trns)
-    {
-      result = linreg_free (t->c);
-    }
-  free (t);
-
-  return result;
-}
-
-/*
-  Gets the predicted values.
- */
-static int
-regression_trns_pred_proc (void *t_, struct ccase **c,
-                          casenumber case_idx UNUSED)
-{
-  size_t i;
-  size_t n_vals;
-  struct reg_trns *trns = t_;
-  linreg *model;
-  union value *output = NULL;
-  const union value *tmp;
-  double *vals;
-  const struct variable **vars = NULL;
-
-  assert (trns != NULL);
-  model = trns->c;
-  assert (model != NULL);
-  assert (model->depvar != NULL);
-  assert (model->pred != NULL);
-
-  vars = linreg_get_vars (model);
-  n_vals = linreg_n_coeffs (model);
-  vals = xnmalloc (n_vals, sizeof (*vals));
-  *c = case_unshare (*c);
-
-  output = case_data_rw (*c, model->pred);
-
-  for (i = 0; i < n_vals; i++)
-    {
-      tmp = case_data (*c, vars[i]);
-      vals[i] = tmp->f;
-    }
-  output->f = linreg_predict (model, vals, n_vals);
-  free (vals);
-  return TRNS_CONTINUE;
-}
-
-/*
-  Gets the residuals.
- */
-static int
-regression_trns_resid_proc (void *t_, struct ccase **c,
-                           casenumber case_idx UNUSED)
-{
-  size_t i;
-  size_t n_vals;
-  struct reg_trns *trns = t_;
-  linreg *model;
-  union value *output = NULL;
-  const union value *tmp;
-  double *vals = NULL;
-  double obs;
-  const struct variable **vars = NULL;
-
-  assert (trns != NULL);
-  model = trns->c;
-  assert (model != NULL);
-  assert (model->depvar != NULL);
-  assert (model->resid != NULL);
-
-  vars = linreg_get_vars (model);
-  n_vals = linreg_n_coeffs (model);
-
-  vals = xnmalloc (n_vals, sizeof (*vals));
-  *c = case_unshare (*c);
-  output = case_data_rw (*c, model->resid);
-  assert (output != NULL);
-
-  for (i = 0; i < n_vals; i++)
-    {
-      tmp = case_data (*c, vars[i]);
-      vals[i] = tmp->f;
-    }
-  tmp = case_data (*c, model->depvar);
-  obs = tmp->f;
-  output->f = linreg_residual (model, obs, vals, n_vals);
-  free (vals);
-
-  return TRNS_CONTINUE;
-}
-
-static char *
-reg_get_name (const struct dictionary *dict, const char *prefix)
-{
-  char *name;
-  int i;
-
-  /* XXX handle too-long prefixes */
-  name = xmalloc (strlen (prefix) + INT_BUFSIZE_BOUND (i) + 1);
-  for (i = 1; ; i++)
-    {
-      sprintf (name, "%s%d", prefix, i);
-      if (dict_lookup_var (dict, name) == NULL)
-        return name;
-    }
-}
-
-static void
-reg_save_var (struct dataset *ds, const char *prefix, trns_proc_func * f,
-             linreg * c, struct variable **v, int n_trns)
-{
-  struct dictionary *dict = dataset_dict (ds);
-  static int trns_index = 1;
-  char *name;
-  struct variable *new_var;
-  struct reg_trns *t = NULL;
-
-  t = xmalloc (sizeof (*t));
-  t->trns_id = trns_index;
-  t->n_trns = n_trns;
-  t->c = c;
-
-  name = reg_get_name (dict, prefix);
-  new_var = dict_create_var_assert (dict, name, 0);
-  free (name);
-
-  *v = new_var;
-  add_transformation (ds, f, regression_trns_free, t);
-  trns_index++;
-}
-static void
-subcommand_save (struct dataset *ds, int save, linreg ** models)
-{
-  linreg **lc;
-  int n_trns = 0;
-  int i;
-
-  if (save)
-    {
-      /* Count the number of transformations we will need. */
-      for (i = 0; i < REGRESSION_SV_count; i++)
-       {
-         if (cmd.a_save[i])
-           {
-             n_trns++;
-           }
-       }
-      n_trns *= cmd.n_dependent;
-
-      for (lc = models; lc < models + cmd.n_dependent; lc++)
-       {
-         if (*lc != NULL)
-           {
-             if ((*lc)->depvar != NULL)
-               {
-                 if (cmd.a_save[REGRESSION_SV_RESID])
-                   {
-                     reg_save_var (ds, "RES", regression_trns_resid_proc, *lc,
-                                   &(*lc)->resid, n_trns);
-                   }
-                 if (cmd.a_save[REGRESSION_SV_PRED])
-                   {
-                     reg_save_var (ds, "PRED", regression_trns_pred_proc, *lc,
-                                   &(*lc)->pred, n_trns);
-                   }
-               }
-           }
-       }
-    }
-  else
-    {
-      for (lc = models; lc < models + cmd.n_dependent; lc++)
-       {
-         if (*lc != NULL)
-           {
-             linreg_free (*lc);
-           }
-       }
-    }
-}
-
-int
-cmd_regression (struct lexer *lexer, struct dataset *ds)
-{
-  struct casegrouper *grouper;
-  struct casereader *group;
-  linreg **models;
-  bool ok;
-  size_t i;
-
-  if (!parse_regression (lexer, ds, &cmd, NULL))
-    {
-      return CMD_FAILURE;
-    }
-
-  models = xnmalloc (cmd.n_dependent, sizeof *models);
-  for (i = 0; i < cmd.n_dependent; i++)
-    {
-      models[i] = NULL;
-    }
-
-  /* Data pass. */
-  grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
-  while (casegrouper_get_next_group (grouper, &group))
-    run_regression (group, &cmd, ds, models);
-  ok = casegrouper_destroy (grouper);
-  ok = proc_commit (ds) && ok;
-
-  subcommand_save (ds, cmd.sbc_save, models);
-  free (v_variables);
-  free (models);
-  free_regression (&cmd);
-
-  return ok ? CMD_SUCCESS : CMD_FAILURE;
-}
-
-/*
-  Is variable k the dependent variable?
- */
-static bool
-is_depvar (size_t k, const struct variable *v)
-{
-  return v == v_variables[k];
-}
-
-/* Parser for the variables sub command */
-static int
-regression_custom_variables (struct lexer *lexer, struct dataset *ds,
-                            struct cmd_regression *cmd UNUSED,
-                            void *aux UNUSED)
-{
-  const struct dictionary *dict = dataset_dict (ds);
-
-  lex_match (lexer, T_EQUALS);
-
-  if ((lex_token (lexer) != T_ID
-       || dict_lookup_var (dict, lex_tokcstr (lexer)) == NULL)
-      && lex_token (lexer) != T_ALL)
-    return 2;
-
-
-  if (!parse_variables_const
-      (lexer, dict, &v_variables, &n_variables, PV_NONE))
-    {
-      free (v_variables);
-      return 0;
-    }
-  assert (n_variables);
-
-  return 1;
-}
-
-/* Identify the explanatory variables in v_variables.  Returns
-   the number of independent variables. */
-static int
-identify_indep_vars (const struct variable **indep_vars,
-                    const struct variable *depvar)
-{
-  int n_indep_vars = 0;
-  int i;
-
-  for (i = 0; i < n_variables; i++)
-    if (!is_depvar (i, depvar))
-      indep_vars[n_indep_vars++] = v_variables[i];
-  if ((n_indep_vars < 1) && is_depvar (0, depvar))
-    {
-      /*
-       There is only one independent variable, and it is the same
-       as the dependent variable. Print a warning and continue.
-       */
-      msg (SE,
-          gettext ("The dependent variable is equal to the independent variable." 
-                   "The least squares line is therefore Y=X." 
-                   "Standard errors and related statistics may be meaningless."));
-      n_indep_vars = 1;
-      indep_vars[0] = v_variables[0];
-    }
-  return n_indep_vars;
-}
-
-static double
-fill_covariance (gsl_matrix *cov, struct covariance *all_cov, 
-                const struct variable **vars,
-                size_t n_vars, const struct variable *dep_var, 
-                const struct variable **all_vars, size_t n_all_vars,
-                double *means)
-{
-  size_t i;
-  size_t j;
-  size_t dep_subscript;
-  size_t *rows;
-  const gsl_matrix *ssizes;
-  const gsl_matrix *mean_matrix;
-  const gsl_matrix *ssize_matrix;
-  double result = 0.0;
-  
-  gsl_matrix *cm = covariance_calculate_unnormalized (all_cov);
-
-  if ( cm == NULL)
-    return 0;
-
-  rows = xnmalloc (cov->size1 - 1, sizeof (*rows));
-  
-  for (i = 0; i < n_all_vars; i++)
-    {
-      for (j = 0; j < n_vars; j++)
-       {
-         if (vars[j] == all_vars[i])
-           {
-             rows[j] = i;
-           }
-       }
-      if (all_vars[i] == dep_var)
-       {
-         dep_subscript = i;
-       }
-    }
-  mean_matrix = covariance_moments (all_cov, MOMENT_MEAN);
-  ssize_matrix = covariance_moments (all_cov, MOMENT_NONE);
-  for (i = 0; i < cov->size1 - 1; i++)
-    {
-      means[i] = gsl_matrix_get (mean_matrix, rows[i], 0)
-       / gsl_matrix_get (ssize_matrix, rows[i], 0);
-      for (j = 0; j < cov->size2 - 1; j++)
-       {
-         gsl_matrix_set (cov, i, j, gsl_matrix_get (cm, rows[i], rows[j]));
-         gsl_matrix_set (cov, j, i, gsl_matrix_get (cm, rows[j], rows[i]));
-       }
-    }
-  means[cov->size1 - 1] = gsl_matrix_get (mean_matrix, dep_subscript, 0)
-    / gsl_matrix_get (ssize_matrix, dep_subscript, 0);
-  ssizes = covariance_moments (all_cov, MOMENT_NONE);
-  result = gsl_matrix_get (ssizes, dep_subscript, rows[0]);
-  for (i = 0; i < cov->size1 - 1; i++)
-    {
-      gsl_matrix_set (cov, i, cov->size1 - 1, 
-                     gsl_matrix_get (cm, rows[i], dep_subscript));
-      gsl_matrix_set (cov, cov->size1 - 1, i, 
-                     gsl_matrix_get (cm, rows[i], dep_subscript));
-      if (result > gsl_matrix_get (ssizes, rows[i], dep_subscript))
-       {
-         result = gsl_matrix_get (ssizes, rows[i], dep_subscript);
-       }
-    }
-  gsl_matrix_set (cov, cov->size1 - 1, cov->size1 - 1, 
-                 gsl_matrix_get (cm, dep_subscript, dep_subscript));
-  free (rows);
-  gsl_matrix_free (cm);
-  return result;
-}
-static size_t
-get_n_all_vars (struct cmd_regression *cmd)
-{
-  size_t result = n_variables;
-  size_t i;
-  size_t j;
-
-  result += cmd->n_dependent;
-  for (i = 0; i < cmd->n_dependent; i++)
-    {
-      for (j = 0; j < n_variables; j++)
-       {
-         if (v_variables[j] == cmd->v_dependent[i])
-           {
-             result--;
-           }
-       }
-    }
-  return result;
-}
-static void
-fill_all_vars (const struct variable **vars, struct cmd_regression *cmd)
-{
-  size_t i;
-  size_t j;
-  bool absent;
-  
-  for (i = 0; i < n_variables; i++)
-    {
-      vars[i] = v_variables[i];
-    }
-  for (i = 0; i < cmd->n_dependent; i++)
-    {
-      absent = true;
-      for (j = 0; j < n_variables; j++)
-       {
-         if (cmd->v_dependent[i] == v_variables[j])
-           {
-             absent = false;
-             break;
-           }
-       }
-      if (absent)
-       {
-         vars[i + n_variables] = cmd->v_dependent[i];
-       }
-    }
-}
-static bool
-run_regression (struct casereader *input, struct cmd_regression *cmd,
-               struct dataset *ds, linreg **models)
-{
-  size_t i;
-  int n_indep = 0;
-  int k;
-  double n_data;
-  double *means;
-  struct ccase *c;
-  struct covariance *cov;
-  const struct variable **vars;
-  const struct variable **all_vars;
-  const struct variable *dep_var;
-  struct casereader *reader;
-  const struct dictionary *dict;
-  size_t n_all_vars;
-
-  assert (models != NULL);
-
-  for (i = 0; i < n_variables; i++)
-    {
-      if (!var_is_numeric (v_variables[i]))
-       {
-         msg (SE, _("REGRESSION requires numeric variables."));
-         return false;
-       }
-    }
-
-  c = casereader_peek (input, 0);
-  if (c == NULL)
-    {
-      casereader_destroy (input);
-      return true;
-    }
-  output_split_file_values (ds, c);
-  case_unref (c);
-
-  dict = dataset_dict (ds);
-  if (!v_variables)
-    {
-      dict_get_vars (dict, &v_variables, &n_variables, 0);
-    }
-  n_all_vars = get_n_all_vars (cmd);
-  all_vars = xnmalloc (n_all_vars, sizeof (*all_vars));
-  fill_all_vars (all_vars, cmd);
-  vars = xnmalloc (n_variables, sizeof (*vars));
-  means  = xnmalloc (n_all_vars, sizeof (*means));
-  cov = covariance_1pass_create (n_all_vars, all_vars,
-                                dict_get_weight (dict), MV_ANY);
-
-  reader = casereader_clone (input);
-  reader = casereader_create_filter_missing (reader, all_vars, n_all_vars,
-                                            MV_ANY, NULL, NULL);
-
-
-  for (; (c = casereader_read (reader)) != NULL; case_unref (c))
-    {
-      covariance_accumulate (cov, c);
-    }
-  
-  for (k = 0; k < cmd->n_dependent; k++)
-    {
-      gsl_matrix *this_cm;
-      dep_var = cmd->v_dependent[k];
-      n_indep = identify_indep_vars (vars, dep_var);
-      
-      this_cm = gsl_matrix_alloc (n_indep + 1, n_indep + 1);
-      n_data = fill_covariance (this_cm, cov, vars, n_indep, 
-                               dep_var, all_vars, n_all_vars, means);
-      models[k] = linreg_alloc (dep_var, (const struct variable **) vars,
-                               n_data, n_indep);
-      models[k]->depvar = dep_var;
-      for (i = 0; i < n_indep; i++)
-       {
-         linreg_set_indep_variable_mean (models[k], i, means[i]);
-       }
-      linreg_set_depvar_mean (models[k], means[i]);
-      /*
-       For large data sets, use QR decomposition.
-      */
-      if (n_data > sqrt (n_indep) && n_data > REG_LARGE_DATA)
-       {
-         models[k]->method = LINREG_QR;
-       }
-      
-      if (n_data > 0)
-       {
-         /*
-           Find the least-squares estimates and other statistics.
-         */
-         linreg_fit (this_cm, models[k]);
-         
-         if (!taint_has_tainted_successor (casereader_get_taint (input)))
-           {
-             subcommand_statistics (cmd->a_statistics, models[k], this_cm);
-           }
-       }
-      else
-       {
-         msg (SE,
-              gettext ("No valid data found. This command was skipped."));
-         linreg_free (models[k]);
-         models[k] = NULL;
-       }
-      gsl_matrix_free (this_cm);
-    }
-  
-  casereader_destroy (reader);
-  free (vars);
-  free (all_vars);
-  free (means);
-  casereader_destroy (input);
-  covariance_destroy (cov);
-  
-  return true;
-}
-
-/*
-  Local Variables:
-  mode: c
-  End:
-*/
diff --git a/tests/output/ascii.at b/tests/output/ascii.at

index df1263eb012b761daf611394b3766d8070677913..07b440efb591d797194f732666b952f2a23185d1 100644 (file)
--- a/tests/output/ascii.at
+++ b/tests/output/ascii.at
@@ -555,7 +555,7 @@ REGRESSION
  /DEPENDENT= x y
  /STATISTICS=COEFF R ANOVA.
  ])
-AT_CHECK([pspp ascii.sps], [1], [dnl
+AT_CHECK([pspp ascii.sps], [0], [dnl
  SET PRINTBACK=ON.
  
  DATA LIST LIST /x * y * a (a23).
@@ -577,9 +577,12 @@ END DATA.
  
  REGRESSION
  /VARIABLES= a
+
+ascii.sps:11: warning: REGRESSION: a is not a numeric variable.  It will not be
+included in the variable list.
+
  /DEPENDENT= x y
  /STATISTICS=COEFF R ANOVA.
-
-ascii.sps:12: error: REGRESSION: REGRESSION requires numeric variables.
  ])
+
  AT_CLEANUP
author	John Darrington <john@darrington.wattle.id.au>
	Fri, 13 Apr 2012 16:55:36 +0000 (18:55 +0200)
committer	John Darrington <john@darrington.wattle.id.au>
	Fri, 13 Apr 2012 16:55:36 +0000 (18:55 +0200)
src/language/command.def		patch \| blob \| history
src/language/stats/.gitignore		patch \| blob \| history
src/language/stats/automake.mk		patch \| blob \| history
src/language/stats/regression.c	[new file with mode: 0644]	patch \| blob
src/language/stats/regression.q	[deleted file]	patch \| blob \| history
tests/output/ascii.at		patch \| blob \| history