4 Copyright (C) 2005 Free Software Foundation, Inc. Written by Jason H. Stover.
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2 of the License, or (at your option)
11 This program is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
16 You should have received a copy of the GNU General Public License along with
17 this program; if not, write to the Free Software Foundation, Inc., 51
18 Franklin Street, Fifth Floor, Boston, MA 02111-1307, USA.
21 #include <gsl/gsl_fit.h>
22 #include <gsl/gsl_multifit.h>
24 #include <gsl/gsl_blas.h>
25 #include <gsl/gsl_cblas.h>
30 Find the least-squares estimate of b for the linear model:
34 where Y is an n-by-1 column vector, X is an n-by-p matrix of
35 independent variables, b is a p-by-1 vector of regression coefficients,
36 and Z is an n-by-1 normally-distributed random vector with independent
37 identically distributed components with mean 0.
39 This estimate is found via the sweep operator or singular-value
40 decomposition with gsl.
45 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
46 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
48 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
51 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
52 Springer. 1998. ISBN 0-387-98542-5.
55 #include <math/linreg/linreg.h>
56 #include <math/linreg/coefficient.h>
57 #include <gsl/gsl_errno.h>
58 #include <linreg/sweep.h>
60 Get the mean and standard deviation of a vector
61 of doubles via a form of the Kalman filter as
62 described on page 32 of [3].
65 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
74 mean = gsl_vector_get (&v.vector, 0);
76 for (i = 1; i < v.vector.size; i++)
79 tmp = gsl_vector_get (&v.vector, i);
82 variance += j * (j - 1.0) * d * d;
85 *sp = sqrt (variance / (j - 1.0));
92 Allocate a pspp_linreg_cache and return a pointer
93 to it. n is the number of cases, p is the number of
94 independent variables.
97 pspp_linreg_cache_alloc (size_t n, size_t p)
101 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
103 c->indep_means = gsl_vector_alloc (p);
104 c->indep_std = gsl_vector_alloc (p);
105 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
106 independent variables.
108 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
111 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
117 c->method = PSPP_LINREG_SWEEP;
118 c->predict = pspp_linreg_predict;
119 c->residual = pspp_linreg_residual; /* The procedure to comput my residuals. */
120 c->resid = NULL; /* The variable storing my residuals. */
126 pspp_linreg_cache_free (void * m)
128 pspp_linreg_cache *c = m;
129 gsl_vector_free (c->indep_means);
130 gsl_vector_free (c->indep_std);
131 gsl_vector_free (c->ss_indeps);
132 gsl_matrix_free (c->cov);
133 pspp_linreg_coeff_free (c->coeff);
139 Fit the linear model via least squares. All pointers passed to pspp_linreg
140 are assumed to be allocated to the correct size and initialized to the
141 values as indicated by opts.
144 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
145 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
151 gsl_matrix_view xmxtx;
155 gsl_vector *param_estimates;
168 if (opts->get_depvar_mean_std)
170 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
172 cache->depvar_mean = m;
173 cache->depvar_std = s;
176 for (i = 0; i < cache->n_indeps; i++)
178 if (opts->get_indep_mean_std[i])
180 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
181 gsl_vector_set (cache->indep_means, i, m);
182 gsl_vector_set (cache->indep_std, i, s);
183 gsl_vector_set (cache->ssx, i, ss);
186 cache->dft = cache->n_obs - 1;
187 cache->dfm = cache->n_indeps;
188 cache->dfe = cache->dft - cache->dfm;
189 cache->n_coeffs = X->size2 + 1; /* Adjust this later to allow for
190 regression through the origin.
192 if (cache->method == PSPP_LINREG_SWEEP)
196 Subtract the means to improve the condition of the design
197 matrix. This requires copying X and Y. We do not divide by the
198 standard deviations of the independent variables here since doing
199 so would cause a miscalculation of the residual sums of
200 squares. Dividing by the standard deviation is done GSL's linear
201 regression functions, so if the design matrix has a poor
202 condition, use QR decomposition.
204 The design matrix here does not include a column for the intercept
205 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
206 so design is allocated here when sweeping, or below if using QR.
208 design = gsl_matrix_alloc (X->size1, X->size2);
209 for (i = 0; i < X->size2; i++)
211 m = gsl_vector_get (cache->indep_means, i);
212 for (j = 0; j < X->size1; j++)
214 tmp = (gsl_matrix_get (X, j, i) - m);
215 gsl_matrix_set (design, j, i, tmp);
218 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
219 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
221 for (i = 0; i < xtx.matrix.size1; i++)
223 tmp = gsl_vector_get (cache->ssx, i);
224 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
225 xi = gsl_matrix_column (design, i);
226 for (j = (i + 1); j < xtx.matrix.size2; j++)
228 xj = gsl_matrix_column (design, j);
229 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
230 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
234 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
235 xty = gsl_matrix_column (sw, cache->n_indeps);
237 This loop starts at 1, with i=0 outside the loop, so we can get
238 the model sum of squares due to the first independent variable.
240 xi = gsl_matrix_column (design, 0);
241 gsl_blas_ddot (&(xi.vector), Y, &tmp);
242 gsl_vector_set (&(xty.vector), 0, tmp);
243 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
244 gsl_vector_set (cache->ss_indeps, 0, tmp);
245 for (i = 1; i < cache->n_indeps; i++)
247 xi = gsl_matrix_column (design, i);
248 gsl_blas_ddot (&(xi.vector), Y, &tmp);
249 gsl_vector_set (&(xty.vector), i, tmp);
253 Sweep on the matrix sw, which contains XtX, XtY and YtY.
256 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
257 cache->mse = cache->sse / cache->dfe;
261 m = cache->depvar_mean;
262 for (i = 0; i < cache->n_indeps; i++)
264 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
265 cache->coeff[i + 1].estimate = tmp;
266 m -= tmp * gsl_vector_get (cache->indep_means, i);
269 Get the covariance matrix of the parameter estimates.
270 Only the upper triangle is necessary.
274 The loops below do not compute the entries related
275 to the estimated intercept.
277 for (i = 0; i < cache->n_indeps; i++)
278 for (j = i; j < cache->n_indeps; j++)
280 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
281 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
284 Get the covariances related to the intercept.
286 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
287 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
288 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
289 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
290 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
291 if (rc == GSL_SUCCESS)
293 tmp = cache->mse / cache->n_obs;
294 for (i = 1; i < 1 + cache->n_indeps; i++)
296 tmp -= gsl_matrix_get (cache->cov, 0, i)
297 * gsl_vector_get (cache->indep_means, i - 1);
299 gsl_matrix_set (cache->cov, 0, 0, tmp);
301 cache->coeff[0].estimate = m;
305 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
306 __FILE__, __LINE__, gsl_strerror (rc));
309 gsl_matrix_free (sw);
313 gsl_multifit_linear_workspace *wk;
315 Use QR decomposition via GSL.
318 param_estimates = gsl_vector_alloc (1 + X->size2);
319 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
321 for (j = 0; j < X->size1; j++)
323 gsl_matrix_set (design, j, 0, 1.0);
324 for (i = 0; i < X->size2; i++)
326 tmp = gsl_matrix_get (X, j, i);
327 gsl_matrix_set (design, j, i + 1, tmp);
331 wk = gsl_multifit_linear_alloc (design->size1, design->size2);
332 rc = gsl_multifit_linear (design, Y, param_estimates,
333 cache->cov, &(cache->sse), wk);
334 for (i = 0; i < cache->n_coeffs; i++)
336 cache->coeff[i].estimate = gsl_vector_get (param_estimates, i);
338 if (rc == GSL_SUCCESS)
340 gsl_multifit_linear_free (wk);
341 gsl_vector_free (param_estimates);
345 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
346 __FILE__, __LINE__, rc);
351 cache->ssm = cache->sst - cache->sse;
353 Get the remaining sums of squares for the independent
357 for (i = 1; i < cache->n_indeps; i++)
360 m += gsl_vector_get (cache->ss_indeps, j);
361 tmp = cache->ssm - m;
362 gsl_vector_set (cache->ss_indeps, i, tmp);
365 gsl_matrix_free (design);