pintos-os.org Git - pspp/blob - src/math/ts/innovations.c

   1 /* PSPP - a program for statistical analysis.
   2    Copyright (C) 2006 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
  16
  17 /*
  18   Find preliminary ARMA coefficients via the innovations algorithm.
  19   Also compute the sample mean and covariance matrix for each series.
  20
  21   Reference:
  22
  23   P. J. Brockwell and R. A. Davis. Time Series: Theory and
  24   Methods. Second edition. Springer. New York. 1991. ISBN
  25   0-387-97429-6. Sections 5.2, 8.3 and 8.4.
  26  */
  27
  28 #include <config.h>
  29
  30 #include <math.h>
  31 #include <stdlib.h>
  32
  33 #include <gsl/gsl_matrix.h>
  34 #include <gsl/gsl_vector.h>
  35 #include <libpspp/compiler.h>
  36 #include <libpspp/misc.h>
  37 #include <math/coefficient.h>
  38 #include <math/ts/innovations.h>
  39
  40 #include "xalloc.h"
  41
  42 static void
  43 get_mean (const gsl_matrix *data,
  44           struct innovations_estimate **est)
  45
  46 {
  47   size_t n;
  48   size_t i;
  49   double d;
  50   double tmp;
  51
  52   for (n = 0; n < data->size2; n++)
  53     {
  54       est[n]->n_obs = 0.0;
  55       est[n]->mean = 0.0;
  56     }
  57   for (i = 0; i < data->size1; i++)
  58     {
  59       for (n = 0; n < data->size2; n++)
  60         {
  61           tmp = gsl_matrix_get (data, i, n);
  62           if (!isnan (tmp))
  63             {
  64               est[n]->n_obs += 1.0;
  65               d = (tmp - est[n]->mean) / est[n]->n_obs;
  66               est[n]->mean += d;
  67             }
  68         }
  69     }
  70 }
  71 static void
  72 update_cov (struct innovations_estimate **est, gsl_vector_const_view x,
  73             gsl_vector_const_view y, size_t lag)
  74 {
  75   size_t j;
  76   double xj;
  77   double yj;
  78
  79   for (j = 0; j < x.vector.size; j++)
  80     {
  81       xj = gsl_vector_get (&x.vector, j);
  82       yj = gsl_vector_get (&y.vector, j);
  83       if (!isnan (xj))
  84         {
  85           if (!isnan (yj))
  86             {
  87               xj -= est[j]->mean;
  88               yj -= est[j]->mean;
  89               *(est[j]->cov + lag) += xj * yj;
  90             }
  91         }
  92     }
  93 }
  94 static int
  95 get_covariance (const gsl_matrix *data,
  96                 struct innovations_estimate **est, size_t max_lag)
  97 {
  98   size_t lag;
  99   size_t j;
 100   size_t i;
 101   int rc = 1;
 102
 103   assert (data != NULL);
 104   assert (est != NULL);
 105
 106   for (j = 0; j < data->size2; j++)
 107     {
 108       for (lag = 0; lag <= max_lag; lag++)
 109         {
 110           *(est[j]->cov + lag) = 0.0;
 111         }
 112     }
 113   /*
 114     The rows are in the outer loop because a gsl_matrix is stored in
 115     row-major order.
 116    */
 117   for (i = 0; i < data->size1; i++)
 118     {
 119       for (lag = 0; lag <= max_lag && lag < data->size1 - i; lag++)
 120         {
 121           update_cov (est, gsl_matrix_const_row (data, i),
 122                       gsl_matrix_const_row (data, i + lag), lag);
 123         }
 124     }
 125   for (j = 0; j < data->size2; j++)
 126     {
 127       for (lag = 0; lag <= max_lag; lag++)
 128         {
 129           *(est[j]->cov + lag) /= est[j]->n_obs;
 130         }
 131     }
 132
 133   return rc;
 134 }
 135
 136 static double
 137 innovations_convolve (double *x, double *y, struct innovations_estimate *est,
 138                       int i)
 139 {
 140   int k;
 141   double result = 0.0;
 142
 143   assert (x != NULL && y != NULL);
 144   assert (est != NULL);
 145   assert (est->scale != NULL);
 146   assert (i > 0);
 147   for (k = 0; k < i; k++)
 148     {
 149       result += x[k] * y[k] * est->scale[i-k-1];
 150     }
 151   return result;
 152 }
 153 static void
 154 innovations_update_scale (struct innovations_estimate *est, double *theta,
 155                           size_t i)
 156 {
 157   double result = 0.0;
 158   size_t j;
 159   size_t k;
 160
 161   if (i < (size_t) est->max_lag)
 162     {
 163       result = est->cov[0];
 164       for (j = 0; j < i; j++)
 165         {
 166           k = i - j - 1;
 167           result -= pow2 (theta[k]) * est->scale[j];
 168         }
 169       est->scale[i] = result;
 170     }
 171 }
 172 static void
 173 init_theta (double **theta, size_t max_lag)
 174 {
 175   size_t i;
 176   size_t j;
 177
 178   for (i = 0; i < max_lag; i++)
 179     {
 180       for (j = 0; j <= i; j++)
 181         {
 182           theta[i][j] = 0.0;
 183         }
 184     }
 185 }
 186 static void
 187 innovations_update_coeff (double **theta, struct innovations_estimate *est,
 188                           size_t max_lag)
 189 {
 190   size_t i;
 191   size_t j;
 192   size_t k;
 193
 194   for (i = 0; i < max_lag; i++)
 195     {
 196       theta[i][i] = est->cov[i+1] / est->scale[0];
 197       for (j = 1; j <= i; j++)
 198         {
 199           k = i - j;
 200           theta[i][k] = (est->cov[k+1] -
 201                          innovations_convolve (theta[i] + k + 1, theta[j - 1], est, j))
 202             / est->scale[j];
 203         }
 204       innovations_update_scale (est, theta[i], i + 1);
 205     }
 206 }
 207 static void
 208 get_coef (const gsl_matrix *data,
 209           struct innovations_estimate **est, size_t max_lag)
 210 {
 211   size_t i;
 212   size_t n;
 213   double **theta;
 214
 215   theta = xnmalloc (max_lag, sizeof (*theta));
 216   for (i = 0; i < max_lag; i++)
 217     {
 218       theta[i] = xnmalloc (max_lag, sizeof (**(theta + i)));
 219     }
 220
 221   for (n = 0; n < data->size2; n++)
 222     {
 223       init_theta (theta, max_lag);
 224       innovations_update_scale (est[n], theta[0], 0);
 225       innovations_update_coeff (theta, est[n], max_lag);
 226       /* Copy the final row of coefficients into EST->COEFF.*/
 227       for (i = 0; i < max_lag; i++)
 228         {
 229           /*
 230             The order of storage here means that the best predicted value
 231             for the time series is computed as follows:
 232
 233             Let X[m], X[m-1],... denote the original series.
 234             Let X_hat[0] denote the best predicted value of X[0],
 235             X_hat[1] denote the projection of X[1] onto the subspace
 236             spanned by {X[0] - X_hat[0]}. Let X_hat[m] denote the
 237             projection of X[m] onto the subspace spanned by {X[m-1] - X_hat[m-1],
 238             X[m-2] - X_hat[m-2],...,X[0] - X_hat[0]}.
 239
 240             Then X_hat[m] = est->coeff[m-1] * (X[m-1] - X_hat[m-1])
 241                           + est->coeff[m-1] * (X[m-2] - X_hat[m-2])
 242                           ...
 243                           + est->coeff[m-max_lag] * (X[m - max_lag] - X_hat[m - max_lag])
 244            */
 245           pspp_coeff_set_estimate (est[n]->coeff[i], theta[max_lag - 1][i]);
 246         }
 247     }
 248
 249   for (i = 0; i < max_lag; i++)
 250     {
 251       free (theta[i]);
 252     }
 253   free (theta);
 254 }
 255
 256 static void
 257 innovations_struct_init (struct innovations_estimate *est,
 258                          const struct design_matrix *dm,
 259                          size_t lag)
 260 {
 261   size_t j;
 262
 263   est->mean = 0.0;
 264   /* COV[0] stores the lag 0 covariance (i.e., the variance), COV[1]
 265      holds the lag-1 covariance, etc.
 266    */
 267   est->cov = xnmalloc (lag + 1, sizeof (*est->cov));
 268   est->scale = xnmalloc (lag + 1, sizeof (*est->scale));
 269   est->coeff = xnmalloc (lag, sizeof (*est->coeff)); /* No intercept. */
 270
 271   /*
 272     The loop below is an unusual use of PSPP_COEFF_INIT(). In a
 273     typical model, one column of a DESIGN_MATRIX has one
 274     coefficient. But in a time-series model, one column has many
 275     coefficients.
 276    */
 277   for (j = 0; j < lag; j++)
 278     {
 279       pspp_coeff_init (est->coeff + j, dm);
 280     }
 281   est->max_lag = (double) lag;
 282 }
 283 /*
 284   The mean is subtracted from the original data before computing the
 285   coefficients. The mean is NOT added back, so if you want to predict
 286   a new value, you must add the mean to X_hat[m] to get the correct
 287   value.
 288  */
 289 static void
 290 subtract_mean (gsl_matrix *m, struct innovations_estimate **est)
 291 {
 292   size_t i;
 293   size_t j;
 294   double tmp;
 295
 296   for (i = 0; i < m->size1; i++)
 297     {
 298       for (j = 0; j < m->size2; j++)
 299         {
 300           tmp = gsl_matrix_get (m, i, j) - est[j]->mean;
 301           gsl_matrix_set (m, i, j, tmp);
 302         }
 303     }
 304 }
 305 struct innovations_estimate **
 306 pspp_innovations (const struct design_matrix *dm, size_t lag)
 307 {
 308   struct innovations_estimate **est;
 309   size_t i;
 310
 311   est = xnmalloc (dm->m->size2, sizeof *est);
 312   for (i = 0; i < dm->m->size2; i++)
 313     {
 314       est[i] = xmalloc (sizeof *est[i]);
 315 /*       est[i]->variable = vars[i]; */
 316       innovations_struct_init (est[i], dm, lag);
 317     }
 318
 319   get_mean (dm->m, est);
 320   subtract_mean (dm->m, est);
 321   get_covariance (dm->m, est, lag);
 322   get_coef (dm->m, est, lag);
 323
 324   return est;
 325 }
 326
 327 static void
 328 pspp_innovations_free_one (struct innovations_estimate *est)
 329 {
 330   size_t i;
 331
 332   assert (est != NULL);
 333   for (i = 0; i < (size_t) est->max_lag; i++)
 334     {
 335       pspp_coeff_free (est->coeff[i]);
 336     }
 337   free (est->scale);
 338   free (est->cov);
 339   free (est);
 340 }
 341
 342 void pspp_innovations_free (struct innovations_estimate **est, size_t n)
 343 {
 344   size_t i;
 345
 346   assert (est != NULL);
 347   for (i = 0; i < n; i++)
 348     {
 349       pspp_innovations_free_one (est[i]);
 350     }
 351   free (est);
 352 }