src/math/ts/innovations.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2006 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 /*
  18   Find preliminary ARMA coefficients via the innovations algorithm.
  19   Also compute the sample mean and covariance matrix for each series.
  20
  21   Reference:
  22
  23   P. J. Brockwell and R. A. Davis. Time Series: Theory and
  24   Methods. Second edition. Springer. New York. 1991. ISBN
  25   0-387-97429-6. Sections 5.2, 8.3 and 8.4.
  26  */
  27
  28 #include <config.h>
  29 #include <gsl/gsl_matrix.h>
  30 #include <gsl/gsl_vector.h>
  31 #include <gsl/gsl_math.h>
  32 #include <stdlib.h>
  33 #include <libpspp/compiler.h>
  34 #include <math/coefficient.h>
  35 #include <math/ts/innovations.h>
  36
  37 #include "xalloc.h"
  38
  39 static void
  40 get_mean (const gsl_matrix *data,
  41           struct innovations_estimate **est)
  42
  43 {
  44   size_t n;
  45   size_t i;
  46   double d;
  47   double tmp;
  48
  49   for (n = 0; n < data->size2; n++)
  50     {
  51       est[n]->n_obs = 0.0;
  52       est[n]->mean = 0.0;
  53     }
  54   for (i = 0; i < data->size1; i++)
  55     {
  56       for (n = 0; n < data->size2; n++)
  57         {
  58           tmp = gsl_matrix_get (data, i, n);
  59           if (!gsl_isnan (tmp))
  60             {
  61               est[n]->n_obs += 1.0;
  62               d = (tmp - est[n]->mean) / est[n]->n_obs;
  63               est[n]->mean += d;
  64             }
  65         }
  66     }
  67 }
  68 static void
  69 update_cov (struct innovations_estimate **est, gsl_vector_const_view x,
  70             gsl_vector_const_view y, size_t lag)
  71 {
  72   size_t j;
  73   double xj;
  74   double yj;
  75
  76   for (j = 0; j < x.vector.size; j++)
  77     {
  78       xj = gsl_vector_get (&x.vector, j);
  79       yj = gsl_vector_get (&y.vector, j);
  80       if (!gsl_isnan (xj))
  81         {
  82           if (!gsl_isnan (yj))
  83             {
  84               xj -= est[j]->mean;
  85               yj -= est[j]->mean;
  86               *(est[j]->cov + lag) += xj * yj;
  87             }
  88         }
  89     }
  90 }
  91 static int
  92 get_covariance (const gsl_matrix *data,
  93                 struct innovations_estimate **est, size_t max_lag)
  94 {
  95   size_t lag;
  96   size_t j;
  97   size_t i;
  98   int rc = 1;
  99
 100   assert (data != NULL);
 101   assert (est != NULL);
 102
 103   for (j = 0; j < data->size2; j++)
 104     {
 105       for (lag = 0; lag <= max_lag; lag++)
 106         {
 107           *(est[j]->cov + lag) = 0.0;
 108         }
 109     }
 110   /*
 111     The rows are in the outer loop because a gsl_matrix is stored in
 112     row-major order.
 113    */
 114   for (i = 0; i < data->size1; i++)
 115     {
 116       for (lag = 0; lag <= max_lag && lag < data->size1 - i; lag++)
 117         {
 118           update_cov (est, gsl_matrix_const_row (data, i),
 119                       gsl_matrix_const_row (data, i + lag), lag);
 120         }
 121     }
 122   for (j = 0; j < data->size2; j++)
 123     {
 124       for (lag = 0; lag <= max_lag; lag++)
 125         {
 126           *(est[j]->cov + lag) /= est[j]->n_obs;
 127         }
 128     }
 129
 130   return rc;
 131 }
 132
 133 static double
 134 innovations_convolve (double *x, double *y, struct innovations_estimate *est,
 135                       int i)
 136 {
 137   int k;
 138   double result = 0.0;
 139
 140   assert (x != NULL && y != NULL);
 141   assert (est != NULL);
 142   assert (est->scale != NULL);
 143   assert (i > 0);
 144   for (k = 0; k < i; k++)
 145     {
 146       result += x[k] * y[k] * est->scale[i-k-1];
 147     }
 148   return result;
 149 }
 150 static void
 151 innovations_update_scale (struct innovations_estimate *est, double *theta,
 152                           size_t i)
 153 {
 154   double result = 0.0;
 155   size_t j;
 156   size_t k;
 157
 158   if (i < (size_t) est->max_lag)
 159     {
 160       result = est->cov[0];
 161       for (j = 0; j < i; j++)
 162         {
 163           k = i - j - 1;
 164           result -= theta[k] * theta[k] * est->scale[j];
 165         }
 166       est->scale[i] = result;
 167     }
 168 }
 /*
   Zero the lower triangle (diagonal included) of the MAX_LAG-row
   coefficient table THETA: row I has its elements 0..I cleared.
   Elements to the right of the diagonal are left untouched.
  */
 static void
 init_theta (double **theta, size_t max_lag)
 {
   size_t row;

   for (row = 0; row < max_lag; row++)
     {
       double *cell = theta[row];
       double *const row_end = cell + row + 1;

       while (cell < row_end)
         {
           *cell++ = 0.0;
         }
     }
 }
 183 static void
 184 innovations_update_coeff (double **theta, struct innovations_estimate *est,
 185                           size_t max_lag)
 186 {
 187   size_t i;
 188   size_t j;
 189   size_t k;
 190
 191   for (i = 0; i < max_lag; i++)
 192     {
 193       theta[i][i] = est->cov[i+1] / est->scale[0];
 194       for (j = 1; j <= i; j++)
 195         {
 196           k = i - j;
 197           theta[i][k] = (est->cov[k+1] -
 198                          innovations_convolve (theta[i] + k + 1, theta[j - 1], est, j))
 199             / est->scale[j];
 200         }
 201       innovations_update_scale (est, theta[i], i + 1);
 202     }
 203 }
 204 static void
 205 get_coef (const gsl_matrix *data,
 206           struct innovations_estimate **est, size_t max_lag)
 207 {
 208   size_t i;
 209   size_t n;
 210   double **theta;
 211
 212   theta = xnmalloc (max_lag, sizeof (*theta));
 213   for (i = 0; i < max_lag; i++)
 214     {
 215       theta[i] = xnmalloc (max_lag, sizeof (**(theta + i)));
 216     }
 217
 218   for (n = 0; n < data->size2; n++)
 219     {
 220       init_theta (theta, max_lag);
 221       innovations_update_scale (est[n], theta[0], 0);
 222       innovations_update_coeff (theta, est[n], max_lag);
 223       /* Copy the final row of coefficients into EST->COEFF.*/
 224       for (i = 0; i < max_lag; i++)
 225         {
 226           /*
 227             The order of storage here means that the best predicted value
 228             for the time series is computed as follows:
 229
 230             Let X[m], X[m-1],... denote the original series.
 231             Let X_hat[0] denote the best predicted value of X[0],
 232             X_hat[1] denote the projection of X[1] onto the subspace
 233             spanned by {X[0] - X_hat[0]}. Let X_hat[m] denote the
 234             projection of X[m] onto the subspace spanned by {X[m-1] - X_hat[m-1],
 235             X[m-2] - X_hat[m-2],...,X[0] - X_hat[0]}.
 236
 237             Then X_hat[m] = est->coeff[m-1] * (X[m-1] - X_hat[m-1])
 238                           + est->coeff[m-1] * (X[m-2] - X_hat[m-2])
 239                           ...
 240                           + est->coeff[m-max_lag] * (X[m - max_lag] - X_hat[m - max_lag])
 241            */
 242           pspp_coeff_set_estimate (est[n]->coeff[i], theta[max_lag - 1][i]);
 243         }
 244     }
 245
 246   for (i = 0; i < max_lag; i++)
 247     {
 248       free (theta[i]);
 249     }
 250   free (theta);
 251 }
 252
 253 static void
 254 innovations_struct_init (struct innovations_estimate *est,
 255                          const struct design_matrix *dm,
 256                          size_t lag)
 257 {
 258   size_t j;
 259
 260   est->mean = 0.0;
 261   /* COV[0] stores the lag 0 covariance (i.e., the variance), COV[1]
 262      holds the lag-1 covariance, etc.
 263    */
 264   est->cov = xnmalloc (lag + 1, sizeof (*est->cov));
 265   est->scale = xnmalloc (lag + 1, sizeof (*est->scale));
 266   est->coeff = xnmalloc (lag, sizeof (*est->coeff)); /* No intercept. */
 267
 268   /*
 269     The loop below is an unusual use of PSPP_COEFF_INIT(). In a
 270     typical model, one column of a DESIGN_MATRIX has one
 271     coefficient. But in a time-series model, one column has many
 272     coefficients.
 273    */
 274   for (j = 0; j < lag; j++)
 275     {
 276       pspp_coeff_init (est->coeff + j, dm);
 277     }
 278   est->max_lag = (double) lag;
 279 }
 280 /*
 281   The mean is subtracted from the original data before computing the
 282   coefficients. The mean is NOT added back, so if you want to predict
 283   a new value, you must add the mean to X_hat[m] to get the correct
 284   value.
 285  */
 286 static void
 287 subtract_mean (gsl_matrix *m, struct innovations_estimate **est)
 288 {
 289   size_t i;
 290   size_t j;
 291   double tmp;
 292
 293   for (i = 0; i < m->size1; i++)
 294     {
 295       for (j = 0; j < m->size2; j++)
 296         {
 297           tmp = gsl_matrix_get (m, i, j) - est[j]->mean;
 298           gsl_matrix_set (m, i, j, tmp);
 299         }
 300     }
 301 }
 302 struct innovations_estimate **
 303 pspp_innovations (const struct design_matrix *dm, size_t lag)
 304 {
 305   struct innovations_estimate **est;
 306   size_t i;
 307
 308   est = xnmalloc (dm->m->size2, sizeof *est);
 309   for (i = 0; i < dm->m->size2; i++)
 310     {
 311       est[i] = xmalloc (sizeof *est[i]);
 312 /*       est[i]->variable = vars[i]; */
 313       innovations_struct_init (est[i], dm, lag);
 314     }
 315
 316   get_mean (dm->m, est);
 317   subtract_mean (dm->m, est);
 318   get_covariance (dm->m, est, lag);
 319   get_coef (dm->m, est, lag);
 320
 321   return est;
 322 }
 323
 324 static void
 325 pspp_innovations_free_one (struct innovations_estimate *est)
 326 {
 327   size_t i;
 328
 329   assert (est != NULL);
 330   for (i = 0; i < (size_t) est->max_lag; i++)
 331     {
 332       pspp_coeff_free (est->coeff[i]);
 333     }
 334   free (est->scale);
 335   free (est->cov);
 336   free (est);
 337 }
 338
 /*
   Free the N estimates stored in the array EST, then the array
   itself.  EST must not be NULL.
  */
 void
 pspp_innovations_free (struct innovations_estimate **est, size_t n)
 {
   struct innovations_estimate **cur;
   struct innovations_estimate **end;

   assert (est != NULL);

   end = est + n;
   for (cur = est; cur != end; cur++)
     {
       pspp_innovations_free_one (*cur);
     }
   free (est);
 }