4 Copyright (C) 2005 Free Software Foundation, Inc. Written by Jason H. Stover.
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2 of the License, or (at your option)
11 This program is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
16 You should have received a copy of the GNU General Public License along with
17 this program; if not, write to the Free Software Foundation, Inc., 51
18 Franklin Street, Fifth Floor, Boston, MA 02111-1307, USA.
21 #include <gsl/gsl_fit.h>
22 #include <gsl/gsl_multifit.h>
24 #include <gsl/gsl_blas.h>
25 #include <gsl/gsl_cblas.h>
30 Find the least-squares estimate of b for the linear model:
34 where Y is an n-by-1 column vector, X is an n-by-p matrix of
35 independent variables, b is a p-by-1 vector of regression coefficients,
36 and Z is an n-by-1 normally-distributed random vector with independent
37 identically distributed components with mean 0.
39 This estimate is found via the sweep operator or singular-value
40 decomposition with gsl.
45 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
46 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
48 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
51 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
52 Springer. 1998. ISBN 0-387-98542-5.
55 #include <math/linreg/linreg.h>
56 #include <math/coefficient.h>
57 #include <gsl/gsl_errno.h>
58 #include <linreg/sweep.h>
60 Get the mean and standard deviation of a vector
61 of doubles via a form of the Kalman filter as
62 described on page 32 of [3].
65 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
74 mean = gsl_vector_get (&v.vector, 0);
76 for (i = 1; i < v.vector.size; i++)
79 tmp = gsl_vector_get (&v.vector, i);
82 variance += j * (j - 1.0) * d * d;
85 *sp = sqrt (variance / (j - 1.0));
92 Set V to contain an array of pointers to the variables
93 used in the model. V must be at least C->N_COEFFS in length.
94 The return value is the number of distinct variables found.
97 pspp_linreg_get_vars (const void *c_, struct variable **v)
99 const pspp_linreg_cache *c = c_;
100 struct pspp_coeff *coef = NULL;
101 const struct variable *tmp;
106 Make sure the caller doesn't try to sneak a variable
107 into V that is not in the model.
109 for (i = 0; i < c->n_coeffs; i++)
114 Start at c->coeff[1] to avoid the intercept.
116 v[result] = (struct variable *) pspp_coeff_get_var (c->coeff[1], 0);
117 result = (v[result] == NULL) ? 0 : 1;
119 for (coef = c->coeff[2]; coef < c->coeff[c->n_coeffs]; coef++)
121 tmp = pspp_coeff_get_var (coef, 0);
122 assert (tmp != NULL);
123 /* Repeated variables are likely to bunch together, at the end
126 while (i >= 0 && (v[i]->index != tmp->index))
130 if (i < 0 && result < c->n_coeffs)
132 v[result] = (struct variable *) tmp;
140 Allocate a pspp_linreg_cache and return a pointer
141 to it. n is the number of cases, p is the number of
142 independent variables.
145 pspp_linreg_cache_alloc (size_t n, size_t p)
147 pspp_linreg_cache *c;
149 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
151 c->indep_means = gsl_vector_alloc (p);
152 c->indep_std = gsl_vector_alloc (p);
153 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
154 independent variables.
156 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
159 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
165 c->method = PSPP_LINREG_SWEEP;
166 c->predict = pspp_linreg_predict;
167 c->residual = pspp_linreg_residual; /* The procedure to compute my
169 c->get_vars = pspp_linreg_get_vars; /* The procedure that returns
172 c->resid = NULL; /* The variable storing my residuals. */
173 c->pred = NULL; /* The variable storing my predicted values. */
179 pspp_linreg_cache_free (void *m)
183 pspp_linreg_cache *c = m;
184 gsl_vector_free (c->indep_means);
185 gsl_vector_free (c->indep_std);
186 gsl_vector_free (c->ss_indeps);
187 gsl_matrix_free (c->cov);
188 for (i = 0; i < c->n_coeffs; i++)
190 pspp_coeff_free (c->coeff[i]);
197 Fit the linear model via least squares. All pointers passed to pspp_linreg
198 are assumed to be allocated to the correct size and initialized to the
199 values as indicated by opts.
202 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
203 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
209 gsl_matrix_view xmxtx;
213 gsl_vector *param_estimates;
226 if (opts->get_depvar_mean_std)
228 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
230 cache->depvar_mean = m;
231 cache->depvar_std = s;
234 for (i = 0; i < cache->n_indeps; i++)
236 if (opts->get_indep_mean_std[i])
238 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
239 gsl_vector_set (cache->indep_means, i, m);
240 gsl_vector_set (cache->indep_std, i, s);
241 gsl_vector_set (cache->ssx, i, ss);
244 cache->dft = cache->n_obs - 1;
245 cache->dfm = cache->n_indeps;
246 cache->dfe = cache->dft - cache->dfm;
247 cache->n_coeffs = X->size2 + 1; /* Adjust this later to allow for
248 regression through the origin.
250 if (cache->method == PSPP_LINREG_SWEEP)
254 Subtract the means to improve the condition of the design
255 matrix. This requires copying X and Y. We do not divide by the
256 standard deviations of the independent variables here since doing
257 so would cause a miscalculation of the residual sums of
258 squares. Dividing by the standard deviation is done GSL's linear
259 regression functions, so if the design matrix has a poor
260 condition, use QR decomposition.
262 The design matrix here does not include a column for the intercept
263 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
264 so design is allocated here when sweeping, or below if using QR.
266 design = gsl_matrix_alloc (X->size1, X->size2);
267 for (i = 0; i < X->size2; i++)
269 m = gsl_vector_get (cache->indep_means, i);
270 for (j = 0; j < X->size1; j++)
272 tmp = (gsl_matrix_get (X, j, i) - m);
273 gsl_matrix_set (design, j, i, tmp);
276 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
277 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
279 for (i = 0; i < xtx.matrix.size1; i++)
281 tmp = gsl_vector_get (cache->ssx, i);
282 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
283 xi = gsl_matrix_column (design, i);
284 for (j = (i + 1); j < xtx.matrix.size2; j++)
286 xj = gsl_matrix_column (design, j);
287 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
288 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
292 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
293 xty = gsl_matrix_column (sw, cache->n_indeps);
295 This loop starts at 1, with i=0 outside the loop, so we can get
296 the model sum of squares due to the first independent variable.
298 xi = gsl_matrix_column (design, 0);
299 gsl_blas_ddot (&(xi.vector), Y, &tmp);
300 gsl_vector_set (&(xty.vector), 0, tmp);
301 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
302 gsl_vector_set (cache->ss_indeps, 0, tmp);
303 for (i = 1; i < cache->n_indeps; i++)
305 xi = gsl_matrix_column (design, i);
306 gsl_blas_ddot (&(xi.vector), Y, &tmp);
307 gsl_vector_set (&(xty.vector), i, tmp);
311 Sweep on the matrix sw, which contains XtX, XtY and YtY.
314 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
315 cache->mse = cache->sse / cache->dfe;
319 m = cache->depvar_mean;
320 for (i = 0; i < cache->n_indeps; i++)
322 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
323 cache->coeff[i + 1]->estimate = tmp;
324 m -= tmp * gsl_vector_get (cache->indep_means, i);
327 Get the covariance matrix of the parameter estimates.
328 Only the upper triangle is necessary.
332 The loops below do not compute the entries related
333 to the estimated intercept.
335 for (i = 0; i < cache->n_indeps; i++)
336 for (j = i; j < cache->n_indeps; j++)
338 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
339 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
342 Get the covariances related to the intercept.
344 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
345 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
346 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
347 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
348 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
349 if (rc == GSL_SUCCESS)
351 tmp = cache->mse / cache->n_obs;
352 for (i = 1; i < 1 + cache->n_indeps; i++)
354 tmp -= gsl_matrix_get (cache->cov, 0, i)
355 * gsl_vector_get (cache->indep_means, i - 1);
357 gsl_matrix_set (cache->cov, 0, 0, tmp);
359 cache->coeff[0]->estimate = m;
363 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
364 __FILE__, __LINE__, gsl_strerror (rc));
367 gsl_matrix_free (sw);
371 gsl_multifit_linear_workspace *wk;
373 Use QR decomposition via GSL.
376 param_estimates = gsl_vector_alloc (1 + X->size2);
377 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
379 for (j = 0; j < X->size1; j++)
381 gsl_matrix_set (design, j, 0, 1.0);
382 for (i = 0; i < X->size2; i++)
384 tmp = gsl_matrix_get (X, j, i);
385 gsl_matrix_set (design, j, i + 1, tmp);
389 wk = gsl_multifit_linear_alloc (design->size1, design->size2);
390 rc = gsl_multifit_linear (design, Y, param_estimates,
391 cache->cov, &(cache->sse), wk);
392 for (i = 0; i < cache->n_coeffs; i++)
394 cache->coeff[i]->estimate = gsl_vector_get (param_estimates, i);
396 if (rc == GSL_SUCCESS)
398 gsl_multifit_linear_free (wk);
399 gsl_vector_free (param_estimates);
403 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
404 __FILE__, __LINE__, rc);
409 cache->ssm = cache->sst - cache->sse;
411 Get the remaining sums of squares for the independent
415 for (i = 1; i < cache->n_indeps; i++)
418 m += gsl_vector_get (cache->ss_indeps, j);
419 tmp = cache->ssm - m;
420 gsl_vector_set (cache->ss_indeps, i, tmp);
423 gsl_matrix_free (design);