src/math/ts/innovations.c

   1 /*
   2   src/math/ts/innovations.c
   3
   4   Copyright (C) 2006 Free Software Foundation, Inc. Written by Jason H. Stover.
   5
   6   This program is free software; you can redistribute it and/or modify it under
   7   the terms of the GNU General Public License as published by the Free
   8   Software Foundation; either version 2 of the License, or (at your option)
   9   any later version.
  10
  11   This program is distributed in the hope that it will be useful, but WITHOUT
  12   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  14   more details.
  15
  16   You should have received a copy of the GNU General Public License along with
  17   this program; if not, write to the Free Software Foundation, Inc., 51
  18   Franklin Street, Fifth Floor, Boston, MA 02111-1307, USA.
  19  */
  20 /*
  21   Find preliminary ARMA coefficients via the innovations algorithm.
  22   Also compute the sample mean and covariance matrix for each series.
  23
  24   Reference:
  25
  26   P. J. Brockwell and R. A. Davis. Time Series: Theory and
  27   Methods. Second edition. Springer. New York. 1991. ISBN
  28   0-387-97429-6. Sections 5.2, 8.3 and 8.4.
  29  */
  30
  31 #include <config.h>
  32 #include <gsl/gsl_matrix.h>
  33 #include <gsl/gsl_vector.h>
  34 #include <gsl/gsl_math.h>
  35 #include <stdlib.h>
  36 #include <libpspp/alloc.h>
  37 #include <libpspp/compiler.h>
  38 #include <math/coefficient.h>
  39 #include <math/ts/innovations.h>
  40
  41 static void
  42 get_mean (const gsl_matrix *data,
  43           struct innovations_estimate **est)
  44
  45 {
  46   size_t n;
  47   size_t i;
  48   double d;
  49   double tmp;
  50
  51   for (n = 0; n < data->size2; n++)
  52     {
  53       est[n]->n_obs = 0.0;
  54       est[n]->mean = 0.0;
  55     }
  56   for (i = 0; i < data->size1; i++)
  57     {
  58       for (n = 0; n < data->size2; n++)
  59         {
  60           tmp = gsl_matrix_get (data, i, n);
  61           if (!gsl_isnan (tmp))
  62             {
  63               est[n]->n_obs += 1.0;
  64               d = (tmp - est[n]->mean) / est[n]->n_obs;
  65               est[n]->mean += d;
  66             }
  67         }
  68     }
  69 }
  70 static void
  71 update_cov (struct innovations_estimate **est, gsl_vector_const_view x,
  72             gsl_vector_const_view y, size_t lag)
  73 {
  74   size_t j;
  75   double xj;
  76   double yj;
  77
  78   for (j = 0; j < x.vector.size; j++)
  79     {
  80       xj = gsl_vector_get (&x.vector, j);
  81       yj = gsl_vector_get (&y.vector, j);
  82       if (!gsl_isnan (xj))
  83         {
  84           if (!gsl_isnan (yj))
  85             {
  86               xj -= est[j]->mean;
  87               yj -= est[j]->mean;
  88               *(est[j]->cov + lag) += xj * yj;
  89             }
  90         }
  91     }
  92 }
  93 static int
  94 get_covariance (const gsl_matrix *data,
  95                 struct innovations_estimate **est, size_t max_lag)
  96 {
  97   size_t lag;
  98   size_t j;
  99   size_t i;
 100   int rc = 1;
 101
 102   assert (data != NULL);
 103   assert (est != NULL);
 104
 105   for (j = 0; j < data->size2; j++)
 106     {
 107       for (lag = 0; lag <= max_lag; lag++)
 108         {
 109           *(est[j]->cov + lag) = 0.0;
 110         }
 111     }
 112   /*
 113     The rows are in the outer loop because a gsl_matrix is stored in
 114     row-major order.
 115    */
 116   for (i = 0; i < data->size1; i++)
 117     {
 118       for (lag = 0; lag <= max_lag && lag < data->size1 - i; lag++)
 119         {
 120           update_cov (est, gsl_matrix_const_row (data, i),
 121                       gsl_matrix_const_row (data, i + lag), lag);
 122         }
 123     }
 124   for (j = 0; j < data->size2; j++)
 125     {
 126       for (lag = 0; lag <= max_lag; lag++)
 127         {
 128           *(est[j]->cov + lag) /= est[j]->n_obs;
 129         }
 130     }
 131
 132   return rc;
 133 }
 134
 135 static double
 136 innovations_convolve (double *x, double *y, struct innovations_estimate *est,
 137                       int i)
 138 {
 139   int k;
 140   double result = 0.0;
 141
 142   assert (x != NULL && y != NULL);
 143   assert (est != NULL);
 144   assert (est->scale != NULL);
 145   assert (i > 0);
 146   for (k = 0; k < i; k++)
 147     {
 148       result += x[k] * y[k] * est->scale[i-k-1];
 149     }
 150   return result;
 151 }
 152 static void
 153 innovations_update_scale (struct innovations_estimate *est, double *theta,
 154                           size_t i)
 155 {
 156   double result = 0.0;
 157   size_t j;
 158   size_t k;
 159
 160   if (i < (size_t) est->max_lag)
 161     {
 162       result = est->cov[0];
 163       for (j = 0; j < i; j++)
 164         {
 165           k = i - j - 1;
 166           result -= theta[k] * theta[k] * est->scale[j];
 167         }
 168       est->scale[i] = result;
 169     }
 170 }
 171 static void
 172 init_theta (double **theta, size_t max_lag)
 173 {
 174   size_t i;
 175   size_t j;
 176
 177   for (i = 0; i < max_lag; i++)
 178     {
 179       for (j = 0; j <= i; j++)
 180         {
 181           theta[i][j] = 0.0;
 182         }
 183     }
 184 }
 185 static void
 186 innovations_update_coeff (double **theta, struct innovations_estimate *est,
 187                           size_t max_lag)
 188 {
 189   size_t i;
 190   size_t j;
 191   size_t k;
 192
 193   for (i = 0; i < max_lag; i++)
 194     {
 195       theta[i][i] = est->cov[i+1] / est->scale[0];
 196       for (j = 1; j <= i; j++)
 197         {
 198           k = i - j;
 199           theta[i][k] = (est->cov[k+1] -
 200                          innovations_convolve (theta[i] + k + 1, theta[j - 1], est, j))
 201             / est->scale[j];
 202         }
 203       innovations_update_scale (est, theta[i], i + 1);
 204     }
 205 }
 206 static void
 207 get_coef (const gsl_matrix *data,
 208           struct innovations_estimate **est, size_t max_lag)
 209 {
 210   size_t i;
 211   size_t n;
 212   double **theta;
 213
 214   theta = xnmalloc (max_lag, sizeof (*theta));
 215   for (i = 0; i < max_lag; i++)
 216     {
 217       theta[i] = xnmalloc (max_lag, sizeof (**(theta + i)));
 218     }
 219
 220   for (n = 0; n < data->size2; n++)
 221     {
 222       init_theta (theta, max_lag);
 223       innovations_update_scale (est[n], theta[0], 0);
 224       innovations_update_coeff (theta, est[n], max_lag);
 225       /* Copy the final row of coefficients into EST->COEFF.*/
 226       for (i = 0; i < max_lag; i++)
 227         {
 228           /*
 229             The order of storage here means that the best predicted value
 230             for the time series is computed as follows:
 231
 232             Let X[m], X[m-1],... denote the original series.
 233             Let X_hat[0] denote the best predicted value of X[0],
 234             X_hat[1] denote the projection of X[1] onto the subspace
 235             spanned by {X[0] - X_hat[0]}. Let X_hat[m] denote the
 236             projection of X[m] onto the subspace spanned by {X[m-1] - X_hat[m-1],
 237             X[m-2] - X_hat[m-2],...,X[0] - X_hat[0]}.
 238
 239             Then X_hat[m] = est->coeff[m-1] * (X[m-1] - X_hat[m-1])
 240                           + est->coeff[m-1] * (X[m-2] - X_hat[m-2])
 241                           ...
 242                           + est->coeff[m-max_lag] * (X[m - max_lag] - X_hat[m - max_lag])
 243            */
 244           pspp_coeff_set_estimate (est[n]->coeff[i], theta[max_lag - 1][i]);
 245         }
 246     }
 247
 248   for (i = 0; i < max_lag; i++)
 249     {
 250       free (theta[i]);
 251     }
 252   free (theta);
 253 }
 254
 255 static void
 256 innovations_struct_init (struct innovations_estimate *est,
 257                          const struct design_matrix *dm,
 258                          size_t lag)
 259 {
 260   size_t j;
 261
 262   est->mean = 0.0;
 263   /* COV[0] stores the lag 0 covariance (i.e., the variance), COV[1]
 264      holds the lag-1 covariance, etc.
 265    */
 266   est->cov = xnmalloc (lag + 1, sizeof (*est->cov));
 267   est->scale = xnmalloc (lag + 1, sizeof (*est->scale));
 268   est->coeff = xnmalloc (lag, sizeof (*est->coeff)); /* No intercept. */
 269
 270   /*
 271     The loop below is an unusual use of PSPP_COEFF_INIT(). In a
 272     typical model, one column of a DESIGN_MATRIX has one
 273     coefficient. But in a time-series model, one column has many
 274     coefficients.
 275    */
 276   for (j = 0; j < lag; j++)
 277     {
 278       pspp_coeff_init (est->coeff + j, dm);
 279     }
 280   est->max_lag = (double) lag;
 281 }
 282 /*
 283   The mean is subtracted from the original data before computing the
 284   coefficients. The mean is NOT added back, so if you want to predict
 285   a new value, you must add the mean to X_hat[m] to get the correct
 286   value.
 287  */
 288 static void
 289 subtract_mean (gsl_matrix *m, struct innovations_estimate **est)
 290 {
 291   size_t i;
 292   size_t j;
 293   double tmp;
 294
 295   for (i = 0; i < m->size1; i++)
 296     {
 297       for (j = 0; j < m->size2; j++)
 298         {
 299           tmp = gsl_matrix_get (m, i, j) - est[j]->mean;
 300           gsl_matrix_set (m, i, j, tmp);
 301         }
 302     }
 303 }
 304 struct innovations_estimate **
 305 pspp_innovations (const struct design_matrix *dm, size_t lag)
 306 {
 307   struct innovations_estimate **est;
 308   size_t i;
 309
 310   est = xnmalloc (dm->m->size2, sizeof *est);
 311   for (i = 0; i < dm->m->size2; i++)
 312     {
 313       est[i] = xmalloc (sizeof *est[i]);
 314 /*       est[i]->variable = vars[i]; */
 315       innovations_struct_init (est[i], dm, lag);
 316     }
 317
 318   get_mean (dm->m, est);
 319   subtract_mean (dm->m, est);
 320   get_covariance (dm->m, est, lag);
 321   get_coef (dm->m, est, lag);
 322
 323   return est;
 324 }
 325
 326 static void
 327 pspp_innovations_free_one (struct innovations_estimate *est)
 328 {
 329   size_t i;
 330
 331   assert (est != NULL);
 332   for (i = 0; i < (size_t) est->max_lag; i++)
 333     {
 334       pspp_coeff_free (est->coeff[i]);
 335     }
 336   free (est->scale);
 337   free (est->cov);
 338   free (est);
 339 }
 340
 341 void pspp_innovations_free (struct innovations_estimate **est, size_t n)
 342 {
 343   size_t i;
 344
 345   assert (est != NULL);
 346   for (i = 0; i < n; i++)
 347     {
 348       pspp_innovations_free_one (est[i]);
 349     }
 350   free (est);
 351 }