Change "union value" to dynamically allocate long strings.

[pspp-builds.git] / src / math / ts / innovations.c
diff --git a/src/math/ts/innovations.c b/src/math/ts/innovations.c

index 792bc6c8659a7d84aef55a284070e7a2b986c4e9..3ab2f3edce165ed0a709688e711867986683eee3 100644 (file)
--- a/src/math/ts/innovations.c
+++ b/src/math/ts/innovations.c
@@ -1,22 +1,19 @@
-/*
-  src/math/ts/innovations.c
-  
-  Copyright (C) 2006 Free Software Foundation, Inc. Written by Jason H. Stover.
-  
-  This program is free software; you can redistribute it and/or modify it under
-  the terms of the GNU General Public License as published by the Free
-  Software Foundation; either version 2 of the License, or (at your option)
-  any later version.
-  
-  This program is distributed in the hope that it will be useful, but WITHOUT
-  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-  more details.
-  
-  You should have received a copy of the GNU General Public License along with
-  this program; if not, write to the Free Software Foundation, Inc., 51
-  Franklin Street, Fifth Floor, Boston, MA 02111-1307, USA.
- */
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2006 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
  /*
    Find preliminary ARMA coefficients via the innovations algorithm.
    Also compute the sample mean and covariance matrix for each series.
@@ -28,158 +25,128 @@
    0-387-97429-6. Sections 5.2, 8.3 and 8.4.
   */
  
-#include <gsl/gsl_matrix.h>
-#include <gsl/gsl_vector.h>
+#include <config.h>
+
  #include <math.h>
  #include <stdlib.h>
-#include <data/case.h>
-#include <data/casefile.h>
-#include <libpspp/alloc.h>
+
+#include <gsl/gsl_matrix.h>
+#include <gsl/gsl_vector.h>
  #include <libpspp/compiler.h>
-#include <libpspp/message.h>
+#include <libpspp/misc.h>
  #include <math/coefficient.h>
  #include <math/ts/innovations.h>
  
+#include "xalloc.h"
+
  static void
-get_mean_variance (size_t n_vars, const struct casefile *cf,
-                  struct innovations_estimate **est)
-                  
+get_mean (const gsl_matrix *data,
+         struct innovations_estimate **est)
+
  {
-  struct casereader *r;
-  struct ccase c;
    size_t n;
+  size_t i;
    double d;
-  const union value *tmp;
+  double tmp;
  
-  for (n = 0; n < n_vars; n++)
+  for (n = 0; n < data->size2; n++)
      {
-      est[n]->n_obs = 2.0;
+      est[n]->n_obs = 0.0;
        est[n]->mean = 0.0;
-      est[n]->variance = 0.0;
      }
-  for (r = casefile_get_reader (cf); casereader_read (r, &c);
-       case_destroy (&c))
+  for (i = 0; i < data->size1; i++)
      {
-      for (n = 0; n < n_vars; n++)
+      for (n = 0; n < data->size2; n++)
         {
-         tmp = case_data (&c, est[n]->variable->fv);
-         if (!mv_is_value_missing (&(est[n]->variable->miss), tmp))
+         tmp = gsl_matrix_get (data, i, n);
+         if (!isnan (tmp))
             {
-             d = (tmp->f - est[n]->mean) / est[n]->n_obs;
-             est[n]->mean += d;
-             est[n]->variance += est[n]->n_obs * est[n]->n_obs * d * d;
               est[n]->n_obs += 1.0;
+             d = (tmp - est[n]->mean) / est[n]->n_obs;
+             est[n]->mean += d;
             }
         }
      }
-  for (n = 0; n < n_vars; n++)
-    {
-      /* Maximum likelihood estimate of the variance. */
-      est[n]->variance /= est[n]->n_obs;
-    }
  }
-
-/*
-  Read the first MAX_LAG cases.
- */
-static bool
-innovations_init_cases (struct casereader *r, struct ccase **c, size_t max_lag)
+static void
+update_cov (struct innovations_estimate **est, gsl_vector_const_view x,
+           gsl_vector_const_view y, size_t lag)
  {
-  bool value = true;
-  size_t lag = 0;
-
-  while (value && lag < max_lag)
-    {
-      lag++;
-      value = casereader_read (r, c[lag]);
-    }
-  return value;
-}
+  size_t j;
+  double xj;
+  double yj;
  
-/*
-  Read one case and update C, which contains the last MAX_LAG cases.
- */
-static bool
-innovations_update_cases (struct casereader *r, struct ccase **c, size_t max_lag)
-{
-  size_t lag;
-  bool value = false;
-  
-  for (lag = 0; lag < max_lag - 1; lag++)
+  for (j = 0; j < x.vector.size; j++)
      {
-      c[lag] = c[lag+1];
+      xj = gsl_vector_get (&x.vector, j);
+      yj = gsl_vector_get (&y.vector, j);
+      if (!isnan (xj))
+       {
+         if (!isnan (yj))
+           {
+             xj -= est[j]->mean;
+             yj -= est[j]->mean;
+             *(est[j]->cov + lag) += xj * yj;
+           }
+       }
      }
-  value = casereader_read (r, c[lag]);
-  return value;
  }
-static void
-get_covariance (size_t n_vars, const struct casefile *cf, 
+static int
+get_covariance (const gsl_matrix *data,
                 struct innovations_estimate **est, size_t max_lag)
  {
-  struct casereader *r;
-  struct ccase **c;
    size_t lag;
-  size_t n;
-  bool read_case = false;
-  double d;
-  double x;
-  const union value *tmp;
-  const union value *tmp2;
-
-  c = xnmalloc (max_lag, sizeof (*c));
-  
-  for (lag = 0; lag < max_lag; lag++)
-    {
-      c[lag] = xmalloc (sizeof *c[lag]);
-    }
+  size_t j;
+  size_t i;
+  int rc = 1;
  
-  r = casefile_get_reader (cf);
-  read_case = innovations_init_cases (r, c, max_lag);
+  assert (data != NULL);
+  assert (est != NULL);
  
-  while (read_case)
+  for (j = 0; j < data->size2; j++)
      {
-      for (n = 0; n < n_vars; n++)
+      for (lag = 0; lag <= max_lag; lag++)
         {
-         tmp2 = case_data (c[0], est[n]->variable->fv);
-         if (!mv_is_value_missing (&est[n]->variable->miss, tmp2))
-           {
-             x = tmp2->f - est[n]->mean;
-             for (lag = 1; lag <= max_lag; lag++)
-               {
-                 tmp = case_data (c[lag], est[n]->variable->fv);
-                 if (!mv_is_value_missing (&est[n]->variable->miss, tmp))
-                   {
-                     d = (tmp->f - est[n]->mean);
-                     *(est[n]->cov + lag) += d * x;
-                   }
-               }
-           }
+         *(est[j]->cov + lag) = 0.0;
         }
-      read_case = innovations_update_cases (r, c, max_lag);
      }
-  for (lag = 0; lag <= max_lag; lag++)
+  /*
+    The rows are in the outer loop because a gsl_matrix is stored in
+    row-major order.
+   */
+  for (i = 0; i < data->size1; i++)
      {
-      for (n = 0; n < n_vars; n++)
+      for (lag = 0; lag <= max_lag && lag < data->size1 - i; lag++)
         {
-         *(est[n]->cov + lag) /= (est[n]->n_obs - lag);
+         update_cov (est, gsl_matrix_const_row (data, i),
+                     gsl_matrix_const_row (data, i + lag), lag);
         }
      }
-  for (lag = 0; lag < max_lag; lag++)
+  for (j = 0; j < data->size2; j++)
      {
-      free (c[lag]);
+      for (lag = 0; lag <= max_lag; lag++)
+       {
+         *(est[j]->cov + lag) /= est[j]->n_obs;
+       }
      }
-  free (c);
+
+  return rc;
  }
+
  static double
-innovations_convolve (double **theta, struct innovations_estimate *est,
-                     int i, int j)
+innovations_convolve (double *x, double *y, struct innovations_estimate *est,
+                     int i)
  {
    int k;
    double result = 0.0;
  
+  assert (x != NULL && y != NULL);
+  assert (est != NULL);
+  assert (est->scale != NULL);
+  assert (i > 0);
    for (k = 0; k < i; k++)
      {
-      result += theta[i-1][i-k-1] * theta[j-1][j-k-1] * est->scale[k];
+      result += x[k] * y[k] * est->scale[i-k-1];
      }
    return result;
  }
@@ -191,55 +158,94 @@ innovations_update_scale (struct innovations_estimate *est, double *theta,
    size_t j;
    size_t k;
  
-
-  result = est->cov[0];
-  for (j = 0; j < i; j++)
+  if (i < (size_t) est->max_lag)
      {
-      k = i - j;
-      result -= theta[k] * theta[k] * est->scale[j];
+      result = est->cov[0];
+      for (j = 0; j < i; j++)
+       {
+         k = i - j - 1;
+         result -= pow2 (theta[k]) * est->scale[j];
+       }
+      est->scale[i] = result;
      }
-  est->scale[i] = result;
  }
-
  static void
-get_coef (size_t n_vars, const struct casefile *cf, 
-               struct innovations_estimate **est, size_t max_lag)
+init_theta (double **theta, size_t max_lag)
  {
+  size_t i;
    size_t j;
+
+  for (i = 0; i < max_lag; i++)
+    {
+      for (j = 0; j <= i; j++)
+       {
+         theta[i][j] = 0.0;
+       }
+    }
+}
+static void
+innovations_update_coeff (double **theta, struct innovations_estimate *est,
+                         size_t max_lag)
+{
    size_t i;
+  size_t j;
    size_t k;
+
+  for (i = 0; i < max_lag; i++)
+    {
+      theta[i][i] = est->cov[i+1] / est->scale[0];
+      for (j = 1; j <= i; j++)
+       {
+         k = i - j;
+         theta[i][k] = (est->cov[k+1] -
+                        innovations_convolve (theta[i] + k + 1, theta[j - 1], est, j))
+           / est->scale[j];
+       }
+      innovations_update_scale (est, theta[i], i + 1);
+    }
+}
+static void
+get_coef (const gsl_matrix *data,
+         struct innovations_estimate **est, size_t max_lag)
+{
+  size_t i;
    size_t n;
-  double v;
    double **theta;
  
    theta = xnmalloc (max_lag, sizeof (*theta));
    for (i = 0; i < max_lag; i++)
      {
-      theta[i] = xnmalloc (i+1, sizeof (theta[i]));
-
+      theta[i] = xnmalloc (max_lag, sizeof (**(theta + i)));
      }
-  for (n = 0; n < n_vars; n++)
+
+  for (n = 0; n < data->size2; n++)
      {
-      for (i = 0; i < max_lag; i++)
-       {
-         for (j = 0; j < i; j++)
-           {
-             theta[i][j] = 0.0;
-           }
-       }
+      init_theta (theta, max_lag);
        innovations_update_scale (est[n], theta[0], 0);
+      innovations_update_coeff (theta, est[n], max_lag);
+      /* Copy the final row of coefficients into EST->COEFF. */
        for (i = 0; i < max_lag; i++)
         {
-         v = est[n]->cov[i];
-         for (j = 0; j < i; j++)
-           {
-             k = i - j;
-             theta[i-1][k-1] = est[n]->cov[k] - 
-               innovations_convolve (theta, est[n], i, j);
-           }
-         innovations_update_scale (est[n], theta[i], i);
+         /*
+           The order of storage here means that the best predicted value
+           for the time series is computed as follows:
+
+           Let X[m], X[m-1],... denote the original series.
+           Let X_hat[0] denote the best predicted value of X[0],
+           X_hat[1] denote the projection of X[1] onto the subspace
+           spanned by {X[0] - X_hat[0]}. Let X_hat[m] denote the
+           projection of X[m] onto the subspace spanned by {X[m-1] - X_hat[m-1],
+           X[m-2] - X_hat[m-2],...,X[0] - X_hat[0]}.
+
+           Then X_hat[m] = est->coeff[m-1] * (X[m-1] - X_hat[m-1])
+                         + est->coeff[m-2] * (X[m-2] - X_hat[m-2])
+                         ...
+                         + est->coeff[m-max_lag] * (X[m - max_lag] - X_hat[m - max_lag])
+          */
+         pspp_coeff_set_estimate (est[n]->coeff[i], theta[max_lag - 1][i]);
         }
      }
+
    for (i = 0; i < max_lag; i++)
      {
        free (theta[i]);
@@ -247,44 +253,100 @@ get_coef (size_t n_vars, const struct casefile *cf,
    free (theta);
  }
  
-struct innovations_estimate ** 
-pspp_innovations (const struct variable **vars, 
-                 size_t *n_vars,
-                 size_t lag, 
-                 const struct casefile *cf)
+static void
+innovations_struct_init (struct innovations_estimate *est,
+                        const struct design_matrix *dm,
+                        size_t lag)
+{
+  size_t j;
+
+  est->mean = 0.0;
+  /* COV[0] stores the lag 0 covariance (i.e., the variance), COV[1]
+     holds the lag-1 covariance, etc.
+   */
+  est->cov = xnmalloc (lag + 1, sizeof (*est->cov));
+  est->scale = xnmalloc (lag + 1, sizeof (*est->scale));
+  est->coeff = xnmalloc (lag, sizeof (*est->coeff)); /* No intercept. */
+
+  /*
+    The loop below is an unusual use of PSPP_COEFF_INIT(). In a
+    typical model, one column of a DESIGN_MATRIX has one
+    coefficient. But in a time-series model, one column has many
+    coefficients.
+   */
+  for (j = 0; j < lag; j++)
+    {
+      pspp_coeff_init (est->coeff + j, dm);
+    }
+  est->max_lag = (double) lag;
+}
+/*
+  The mean is subtracted from the original data before computing the
+  coefficients. The mean is NOT added back, so if you want to predict
+  a new value, you must add the mean to X_hat[m] to get the correct
+  value.
+ */
+static void
+subtract_mean (gsl_matrix *m, struct innovations_estimate **est)
  {
-  struct innovations_estimate **est;
    size_t i;
    size_t j;
+  double tmp;
  
-  est = xnmalloc (*n_vars, sizeof *est);
-  for (i = 0; i < *n_vars; i++)
+  for (i = 0; i < m->size1; i++)
      {
-      if (vars[i]->type == NUMERIC)
-       {
-         est[i] = xmalloc (sizeof **est);
-         est[i]->variable = vars[i];
-         est[i]->mean = 0.0;
-         est[i]->variance = 0.0;
-         est[i]->cov = xnmalloc (lag, sizeof (*est[i]->cov));
-         est[i]->scale = xnmalloc (lag, sizeof (*est[i]->scale));
-         est[i]->coeff = xnmalloc (lag, sizeof (*est[i]->coeff));
-         for (j = 0; j < lag; j++)
-           {
-             est[i]->coeff[j] = xmalloc (sizeof (*(est[i]->coeff + j)));
-           }
-       }
-      else
+      for (j = 0; j < m->size2; j++)
         {
-         *n_vars--;
-/*       msg (MW, _("Cannot compute autocovariance for a non-numeric variable %s"), */
-/*                  var_to_string (vars[i])); */
+         tmp = gsl_matrix_get (m, i, j) - est[j]->mean;
+         gsl_matrix_set (m, i, j, tmp);
         }
      }
+}
+struct innovations_estimate **
+pspp_innovations (const struct design_matrix *dm, size_t lag)
+{
+  struct innovations_estimate **est;
+  size_t i;
+
+  est = xnmalloc (dm->m->size2, sizeof *est);
+  for (i = 0; i < dm->m->size2; i++)
+    {
+      est[i] = xmalloc (sizeof *est[i]);
+/*       est[i]->variable = vars[i]; */
+      innovations_struct_init (est[i], dm, lag);
+    }
+
+  get_mean (dm->m, est);
+  subtract_mean (dm->m, est);
+  get_covariance (dm->m, est, lag);
+  get_coef (dm->m, est, lag);
  
-  get_mean_variance (*n_vars, cf, est);
-  get_covariance (*n_vars, cf, est, lag);
-  get_coef (*n_vars, cf, est, lag);
-  
    return est;
  }
+
+static void
+pspp_innovations_free_one (struct innovations_estimate *est)
+{
+  size_t i;
+
+  assert (est != NULL);
+  for (i = 0; i < (size_t) est->max_lag; i++)
+    {
+      pspp_coeff_free (est->coeff[i]);
+    }
+  free (est->scale);
+  free (est->cov);
+  free (est);
+}
+
+void pspp_innovations_free (struct innovations_estimate **est, size_t n)
+{
+  size_t i;
+
+  assert (est != NULL);
+  for (i = 0; i < n; i++)
+    {
+      pspp_innovations_free_one (est[i]);
+    }
+  free (est);
+}