src/math/ts/innovations.c

   1 /*
   2   src/math/ts/innovations.c
   3
   4   Copyright (C) 2006 Free Software Foundation, Inc. Written by Jason H. Stover.
   5
   6   This program is free software; you can redistribute it and/or modify it under
   7   the terms of the GNU General Public License as published by the Free
   8   Software Foundation; either version 2 of the License, or (at your option)
   9   any later version.
  10
  11   This program is distributed in the hope that it will be useful, but WITHOUT
  12   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  14   more details.
  15
  16   You should have received a copy of the GNU General Public License along with
  17   this program; if not, write to the Free Software Foundation, Inc., 51
  18   Franklin Street, Fifth Floor, Boston, MA 02111-1307, USA.
  19  */
  20 /*
  21   Find preliminary ARMA coefficients via the innovations algorithm.
  22   Also compute the sample mean and covariance matrix for each series.
  23
  24   Reference:
  25
  26   P. J. Brockwell and R. A. Davis. Time Series: Theory and
  27   Methods. Second edition. Springer. New York. 1991. ISBN
  28   0-387-97429-6. Sections 5.2, 8.3 and 8.4.
  29  */
  30
  31 #include <gsl/gsl_matrix.h>
  32 #include <gsl/gsl_vector.h>
  33 #include <gsl/gsl_math.h>
  34 #include <stdlib.h>
  35 #include <libpspp/alloc.h>
  36 #include <libpspp/compiler.h>
  37 #include <math/coefficient.h>
  38 #include <math/ts/innovations.h>
  39
  40 static void
  41 get_mean (const gsl_matrix *data,
  42           struct innovations_estimate **est)
  43
  44 {
  45   size_t n;
  46   size_t i;
  47   double d;
  48   double tmp;
  49
  50   for (n = 0; n < data->size2; n++)
  51     {
  52       est[n]->n_obs = 0.0;
  53       est[n]->mean = 0.0;
  54     }
  55   for (i = 0; i < data->size1; i++)
  56     {
  57       for (n = 0; n < data->size2; n++)
  58         {
  59           tmp = gsl_matrix_get (data, i, n);
  60           if (!gsl_isnan (tmp))
  61             {
  62               est[n]->n_obs += 1.0;
  63               d = (tmp - est[n]->mean) / est[n]->n_obs;
  64               est[n]->mean += d;
  65             }
  66         }
  67     }
  68 }
  69 static void
  70 update_cov (struct innovations_estimate **est, gsl_vector_const_view x,
  71             gsl_vector_const_view y, size_t lag)
  72 {
  73   size_t j;
  74   double xj;
  75   double yj;
  76
  77   for (j = 0; j < x.vector.size; j++)
  78     {
  79       xj = gsl_vector_get (&x.vector, j);
  80       yj = gsl_vector_get (&y.vector, j);
  81       if (!gsl_isnan (xj))
  82         {
  83           if (!gsl_isnan (yj))
  84             {
  85               xj -= est[j]->mean;
  86               yj -= est[j]->mean;
  87               *(est[j]->cov + lag) += xj * yj;
  88             }
  89         }
  90     }
  91 }
  92 static int
  93 get_covariance (const gsl_matrix *data,
  94                 struct innovations_estimate **est, size_t max_lag)
  95 {
  96   size_t lag;
  97   size_t j;
  98   size_t i;
  99   int rc = 1;
 100
 101   assert (data != NULL);
 102   assert (est != NULL);
 103
 104   for (j = 0; j < data->size2; j++)
 105     {
 106       for (lag = 0; lag <= max_lag; lag++)
 107         {
 108           *(est[j]->cov + lag) = 0.0;
 109         }
 110     }
 111   /*
 112     The rows are in the outer loop because a gsl_matrix is stored in
 113     row-major order.
 114    */
 115   for (i = 0; i < data->size1; i++)
 116     {
 117       for (lag = 0; lag <= max_lag && lag < data->size1 - i; lag++)
 118         {
 119           update_cov (est, gsl_matrix_const_row (data, i),
 120                       gsl_matrix_const_row (data, i + lag), lag);
 121         }
 122     }
 123   for (j = 0; j < data->size2; j++)
 124     {
 125       for (lag = 0; lag <= max_lag; lag++)
 126         {
 127           *(est[j]->cov + lag) /= est[j]->n_obs;
 128         }
 129     }
 130
 131   return rc;
 132 }
 133
 134 static double
 135 innovations_convolve (double *x, double *y, struct innovations_estimate *est,
 136                       int i)
 137 {
 138   int k;
 139   double result = 0.0;
 140
 141   assert (x != NULL && y != NULL);
 142   assert (est != NULL);
 143   assert (est->scale != NULL);
 144   assert (i > 0);
 145   for (k = 0; k < i; k++)
 146     {
 147       result += x[k] * y[k] * est->scale[i-k-1];
 148     }
 149   return result;
 150 }
 151 static void
 152 innovations_update_scale (struct innovations_estimate *est, double *theta,
 153                           size_t i)
 154 {
 155   double result = 0.0;
 156   size_t j;
 157   size_t k;
 158
 159   if (i < (size_t) est->max_lag)
 160     {
 161       result = est->cov[0];
 162       for (j = 0; j < i; j++)
 163         {
 164           k = i - j - 1;
 165           result -= theta[k] * theta[k] * est->scale[j];
 166         }
 167       est->scale[i] = result;
 168     }
 169 }
 170 static void
 171 init_theta (double **theta, size_t max_lag)
 172 {
 173   size_t i;
 174   size_t j;
 175
 176   for (i = 0; i < max_lag; i++)
 177     {
 178       for (j = 0; j <= i; j++)
 179         {
 180           theta[i][j] = 0.0;
 181         }
 182     }
 183 }
 184 static void
 185 innovations_update_coeff (double **theta, struct innovations_estimate *est,
 186                           size_t max_lag)
 187 {
 188   size_t i;
 189   size_t j;
 190   size_t k;
 191
 192   for (i = 0; i < max_lag; i++)
 193     {
 194       theta[i][i] = est->cov[i+1] / est->scale[0];
 195       for (j = 1; j <= i; j++)
 196         {
 197           k = i - j;
 198           theta[i][k] = (est->cov[k+1] -
 199                          innovations_convolve (theta[i] + k + 1, theta[j - 1], est, j))
 200             / est->scale[j];
 201         }
 202       innovations_update_scale (est, theta[i], i + 1);
 203     }
 204 }
 205 static void
 206 get_coef (const gsl_matrix *data,
 207           struct innovations_estimate **est, size_t max_lag)
 208 {
 209   size_t i;
 210   size_t n;
 211   double **theta;
 212
 213   theta = xnmalloc (max_lag, sizeof (*theta));
 214   for (i = 0; i < max_lag; i++)
 215     {
 216       theta[i] = xnmalloc (max_lag, sizeof (**(theta + i)));
 217     }
 218
 219   for (n = 0; n < data->size2; n++)
 220     {
 221       init_theta (theta, max_lag);
 222       innovations_update_scale (est[n], theta[0], 0);
 223       innovations_update_coeff (theta, est[n], max_lag);
 224       /* Copy the final row of coefficients into EST->COEFF.*/
 225       for (i = 0; i < max_lag; i++)
 226         {
 227           /*
 228             The order of storage here means that the best predicted value
 229             for the time series is computed as follows:
 230
 231             Let X[m], X[m-1],... denote the original series.
 232             Let X_hat[0] denote the best predicted value of X[0],
 233             X_hat[1] denote the projection of X[1] onto the subspace
 234             spanned by {X[0] - X_hat[0]}. Let X_hat[m] denote the
 235             projection of X[m] onto the subspace spanned by {X[m-1] - X_hat[m-1],
 236             X[m-2] - X_hat[m-2],...,X[0] - X_hat[0]}.
 237
 238             Then X_hat[m] = est->coeff[m-1] * (X[m-1] - X_hat[m-1])
 239                           + est->coeff[m-1] * (X[m-2] - X_hat[m-2])
 240                           ...
 241                           + est->coeff[m-max_lag] * (X[m - max_lag] - X_hat[m - max_lag])
 242
 243             (That is what X_hat[m] SHOULD be, anyway. These routines need
 244             to be tested.)
 245            */
 246           pspp_coeff_set_estimate (est[n]->coeff[i], theta[max_lag - 1][i]);
 247         }
 248     }
 249
 250   for (i = 0; i < max_lag; i++)
 251     {
 252       free (theta[i]);
 253     }
 254   free (theta);
 255 }
 256
 257 static void
 258 innovations_struct_init (struct innovations_estimate *est,
 259                          const struct design_matrix *dm,
 260                          size_t lag)
 261 {
 262   size_t j;
 263
 264   est->mean = 0.0;
 265   /* COV[0] stores the lag 0 covariance (i.e., the variance), COV[1]
 266      holds the lag-1 covariance, etc.
 267    */
 268   est->cov = xnmalloc (lag + 1, sizeof (*est->cov));
 269   est->scale = xnmalloc (lag + 1, sizeof (*est->scale));
 270   est->coeff = xnmalloc (lag, sizeof (*est->coeff)); /* No intercept. */
 271
 272   /*
 273     The loop below is an unusual use of PSPP_COEFF_INIT(). In a
 274     typical model, one column of a DESIGN_MATRIX has one
 275     coefficient. But in a time-series model, one column has many
 276     coefficients.
 277    */
 278   for (j = 0; j < lag; j++)
 279     {
 280       pspp_coeff_init (est->coeff + j, dm);
 281     }
 282   est->max_lag = (double) lag;
 283 }
 284
 285 struct innovations_estimate **
 286 pspp_innovations (const struct design_matrix *dm, size_t lag)
 287 {
 288   struct innovations_estimate **est;
 289   size_t i;
 290
 291   est = xnmalloc (dm->m->size2, sizeof *est);
 292   for (i = 0; i < dm->m->size2; i++)
 293     {
 294       est[i] = xmalloc (sizeof *est[i]);
 295 /*       est[i]->variable = vars[i]; */
 296       innovations_struct_init (est[i], dm, lag);
 297     }
 298
 299   get_mean (dm->m, est);
 300   get_covariance (dm->m, est, lag);
 301   get_coef (dm->m, est, lag);
 302
 303   return est;
 304 }
 305
 306 static void
 307 pspp_innovations_free_one (struct innovations_estimate *est)
 308 {
 309   size_t i;
 310
 311   assert (est != NULL);
 312   for (i = 0; i < (size_t) est->max_lag; i++)
 313     {
 314       pspp_coeff_free (est->coeff[i]);
 315     }
 316   free (est->scale);
 317   free (est->cov);
 318   free (est);
 319 }
 320
 321 void pspp_innovations_free (struct innovations_estimate **est, size_t n)
 322 {
 323   size_t i;
 324
 325   assert (est != NULL);
 326   for (i = 0; i < n; i++)
 327     {
 328       pspp_innovations_free_one (est[i]);
 329     }
 330   free (est);
 331 }