4 Copyright (C) 2005 Free Software Foundation, Inc. Written by Jason H. Stover.
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2 of the License, or (at your option)
11 This program is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
16 You should have received a copy of the GNU General Public License along with
17 this program; if not, write to the Free Software Foundation, Inc., 51
18 Franklin Street, Fifth Floor, Boston, MA 02111-1307, USA.
21 #include <gsl/gsl_fit.h>
22 #include <gsl/gsl_multifit.h>
24 #include <gsl/gsl_blas.h>
25 #include <gsl/gsl_cblas.h>
30 Find the least-squares estimate of b for the linear model:
34 where Y is an n-by-1 column vector, X is an n-by-p matrix of
35 independent variables, b is a p-by-1 vector of regression coefficients,
36 and Z is an n-by-1 normally-distributed random vector with independent
37 identically distributed components with mean 0.
39 This estimate is found via the sweep operator or singular-value
40 decomposition with gsl.
45 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
46 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
48 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
51 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
52 Springer. 1998. ISBN 0-387-98542-5.
55 #include <math/linreg/linreg.h>
56 #include <math/linreg/coefficient.h>
57 #include <gsl/gsl_errno.h>
58 #include <linreg/sweep.h>
60 Get the mean and standard deviation of a vector
61 of doubles via a form of the Kalman filter as
62 described on page 32 of [3].
65 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
74 mean = gsl_vector_get (&v.vector, 0);
76 for (i = 1; i < v.vector.size; i++)
79 tmp = gsl_vector_get (&v.vector, i);
82 variance += j * (j - 1.0) * d * d;
85 *sp = sqrt (variance / (j - 1.0));
92 Allocate a pspp_linreg_cache and return a pointer
93 to it. n is the number of cases, p is the number of
94 independent variables.
97 pspp_linreg_cache_alloc (size_t n, size_t p)
101 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
103 c->indep_means = gsl_vector_alloc (p);
104 c->indep_std = gsl_vector_alloc (p);
105 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
106 independent variables.
108 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
111 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
117 c->method = PSPP_LINREG_SWEEP;
118 c->predict = pspp_linreg_predict;
119 c->residual = pspp_linreg_residual;
125 pspp_linreg_cache_free (pspp_linreg_cache * c)
127 gsl_vector_free (c->indep_means);
128 gsl_vector_free (c->indep_std);
129 gsl_vector_free (c->ss_indeps);
130 gsl_matrix_free (c->cov);
131 pspp_linreg_coeff_free (c->coeff);
137 Fit the linear model via least squares. All pointers passed to pspp_linreg
138 are assumed to be allocated to the correct size and initialized to the
139 values as indicated by opts.
142 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
143 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
149 gsl_matrix_view xmxtx;
153 gsl_vector *param_estimates;
166 if (opts->get_depvar_mean_std)
168 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
170 cache->depvar_mean = m;
171 cache->depvar_std = s;
174 for (i = 0; i < cache->n_indeps; i++)
176 if (opts->get_indep_mean_std[i])
178 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
179 gsl_vector_set (cache->indep_means, i, m);
180 gsl_vector_set (cache->indep_std, i, s);
181 gsl_vector_set (cache->ssx, i, ss);
184 cache->dft = cache->n_obs - 1;
185 cache->dfm = cache->n_indeps;
186 cache->dfe = cache->dft - cache->dfm;
187 cache->n_coeffs = X->size2 + 1; /* Adjust this later to allow for
188 regression through the origin.
190 if (cache->method == PSPP_LINREG_SWEEP)
194 Subtract the means to improve the condition of the design
195 matrix. This requires copying X and Y. We do not divide by the
196 standard deviations of the independent variables here since doing
197 so would cause a miscalculation of the residual sums of
198 squares. Dividing by the standard deviation is done GSL's linear
199 regression functions, so if the design matrix has a poor
200 condition, use QR decomposition.
202 The design matrix here does not include a column for the intercept
203 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
204 so design is allocated here when sweeping, or below if using QR.
206 design = gsl_matrix_alloc (X->size1, X->size2);
207 for (i = 0; i < X->size2; i++)
209 m = gsl_vector_get (cache->indep_means, i);
210 for (j = 0; j < X->size1; j++)
212 tmp = (gsl_matrix_get (X, j, i) - m);
213 gsl_matrix_set (design, j, i, tmp);
216 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
217 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
219 for (i = 0; i < xtx.matrix.size1; i++)
221 tmp = gsl_vector_get (cache->ssx, i);
222 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
223 xi = gsl_matrix_column (design, i);
224 for (j = (i + 1); j < xtx.matrix.size2; j++)
226 xj = gsl_matrix_column (design, j);
227 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
228 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
232 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
233 xty = gsl_matrix_column (sw, cache->n_indeps);
235 This loop starts at 1, with i=0 outside the loop, so we can get
236 the model sum of squares due to the first independent variable.
238 xi = gsl_matrix_column (design, 0);
239 gsl_blas_ddot (&(xi.vector), Y, &tmp);
240 gsl_vector_set (&(xty.vector), 0, tmp);
241 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
242 gsl_vector_set (cache->ss_indeps, 0, tmp);
243 for (i = 1; i < cache->n_indeps; i++)
245 xi = gsl_matrix_column (design, i);
246 gsl_blas_ddot (&(xi.vector), Y, &tmp);
247 gsl_vector_set (&(xty.vector), i, tmp);
251 Sweep on the matrix sw, which contains XtX, XtY and YtY.
254 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
255 cache->mse = cache->sse / cache->dfe;
259 m = cache->depvar_mean;
260 for (i = 0; i < cache->n_indeps; i++)
262 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
263 cache->coeff[i + 1].estimate = tmp;
264 m -= tmp * gsl_vector_get (cache->indep_means, i);
267 Get the covariance matrix of the parameter estimates.
268 Only the upper triangle is necessary.
272 The loops below do not compute the entries related
273 to the estimated intercept.
275 for (i = 0; i < cache->n_indeps; i++)
276 for (j = i; j < cache->n_indeps; j++)
278 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
279 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
282 Get the covariances related to the intercept.
284 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
285 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
286 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
287 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
288 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
289 if (rc == GSL_SUCCESS)
291 tmp = cache->mse / cache->n_obs;
292 for (i = 1; i < 1 + cache->n_indeps; i++)
294 tmp -= gsl_matrix_get (cache->cov, 0, i)
295 * gsl_vector_get (cache->indep_means, i - 1);
297 gsl_matrix_set (cache->cov, 0, 0, tmp);
299 cache->coeff[0].estimate = m;
303 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
304 __FILE__, __LINE__, gsl_strerror (rc));
307 gsl_matrix_free (sw);
311 gsl_multifit_linear_workspace *wk;
313 Use QR decomposition via GSL.
316 param_estimates = gsl_vector_alloc (1 + X->size2);
317 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
319 for (j = 0; j < X->size1; j++)
321 gsl_matrix_set (design, j, 0, 1.0);
322 for (i = 0; i < X->size2; i++)
324 tmp = gsl_matrix_get (X, j, i);
325 gsl_matrix_set (design, j, i + 1, tmp);
329 wk = gsl_multifit_linear_alloc (design->size1, design->size2);
330 rc = gsl_multifit_linear (design, Y, param_estimates,
331 cache->cov, &(cache->sse), wk);
332 for (i = 0; i < cache->n_coeffs; i++)
334 cache->coeff[i].estimate = gsl_vector_get (param_estimates, i);
336 if (rc == GSL_SUCCESS)
338 gsl_multifit_linear_free (wk);
339 gsl_vector_free (param_estimates);
343 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
344 __FILE__, __LINE__, rc);
349 cache->ssm = cache->sst - cache->sse;
351 Get the remaining sums of squares for the independent
355 for (i = 1; i < cache->n_indeps; i++)
358 m += gsl_vector_get (cache->ss_indeps, j);
359 tmp = cache->ssm - m;
360 gsl_vector_set (cache->ss_indeps, i, tmp);
363 gsl_matrix_free (design);