4 Copyright (C) 2005 Free Software Foundation, Inc. Written by Jason H. Stover.
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2 of the License, or (at your option)
11 This program is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
16 You should have received a copy of the GNU General Public License along with
17 this program; if not, write to the Free Software Foundation, Inc., 51
18 Franklin Street, Fifth Floor, Boston, MA 02111-1307, USA.
21 #include <gsl/gsl_fit.h>
22 #include <gsl/gsl_multifit.h>
24 #include <gsl/gsl_blas.h>
25 #include <gsl/gsl_cblas.h>
30 Find the least-squares estimate of b for the linear model:
34 where Y is an n-by-1 column vector, X is an n-by-p matrix of
35 independent variables, b is a p-by-1 vector of regression coefficients,
36 and Z is an n-by-1 normally-distributed random vector with independent
37 identically distributed components with mean 0.
39 This estimate is found via the sweep operator or singular-value
40 decomposition with gsl.
45 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
46 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
48 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
51 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
52 Springer. 1998. ISBN 0-387-98542-5.
55 #include <math/linreg/linreg.h>
56 #include <math/linreg/coefficient.h>
57 #include <gsl/gsl_errno.h>
58 #include <linreg/sweep.h>
60 Get the mean and standard deviation of a vector
61 of doubles via a form of the Kalman filter as
62 described on page 32 of [3].
65 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
74 mean = gsl_vector_get (&v.vector, 0);
76 for (i = 1; i < v.vector.size; i++)
79 tmp = gsl_vector_get (&v.vector, i);
82 variance += j * (j - 1.0) * d * d;
85 *sp = sqrt (variance / (j - 1.0));
92 Allocate a pspp_linreg_cache and return a pointer
93 to it. n is the number of cases, p is the number of
94 independent variables.
97 pspp_linreg_cache_alloc (size_t n, size_t p)
101 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
102 c->indep_means = gsl_vector_alloc (p);
103 c->indep_std = gsl_vector_alloc (p);
104 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
105 independent variables.
107 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
110 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
116 c->method = PSPP_LINREG_SWEEP;
117 c->predict = pspp_linreg_predict;
118 c->residual = pspp_linreg_residual;
124 pspp_linreg_cache_free (pspp_linreg_cache * c)
126 gsl_vector_free (c->indep_means);
127 gsl_vector_free (c->indep_std);
128 gsl_vector_free (c->ss_indeps);
129 gsl_matrix_free (c->cov);
130 pspp_linreg_coeff_free (c->coeff);
135 Fit the linear model via least squares. All pointers passed to pspp_linreg
136 are assumed to be allocated to the correct size and initialized to the
137 values as indicated by opts.
140 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
141 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
147 gsl_matrix_view xmxtx;
151 gsl_vector *param_estimates;
164 if (opts->get_depvar_mean_std)
166 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
168 cache->depvar_mean = m;
169 cache->depvar_std = s;
172 for (i = 0; i < cache->n_indeps; i++)
174 if (opts->get_indep_mean_std[i])
176 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
177 gsl_vector_set (cache->indep_means, i, m);
178 gsl_vector_set (cache->indep_std, i, s);
179 gsl_vector_set (cache->ssx, i, ss);
182 cache->dft = cache->n_obs - 1;
183 cache->dfm = cache->n_indeps;
184 cache->dfe = cache->dft - cache->dfm;
185 cache->n_coeffs = X->size2 + 1; /* Adjust this later to allow for
186 regression through the origin.
188 if (cache->method == PSPP_LINREG_SWEEP)
192 Subtract the means to improve the condition of the design
193 matrix. This requires copying X and Y. We do not divide by the
194 standard deviations of the independent variables here since doing
195 so would cause a miscalculation of the residual sums of
196 squares. Dividing by the standard deviation is done GSL's linear
197 regression functions, so if the design matrix has a poor
198 condition, use QR decomposition.
200 The design matrix here does not include a column for the intercept
201 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
202 so design is allocated here when sweeping, or below if using QR.
204 design = gsl_matrix_alloc (X->size1, X->size2);
205 for (i = 0; i < X->size2; i++)
207 m = gsl_vector_get (cache->indep_means, i);
208 for (j = 0; j < X->size1; j++)
210 tmp = (gsl_matrix_get (X, j, i) - m);
211 gsl_matrix_set (design, j, i, tmp);
214 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
215 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
217 for (i = 0; i < xtx.matrix.size1; i++)
219 tmp = gsl_vector_get (cache->ssx, i);
220 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
221 xi = gsl_matrix_column (design, i);
222 for (j = (i + 1); j < xtx.matrix.size2; j++)
224 xj = gsl_matrix_column (design, j);
225 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
226 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
230 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
231 xty = gsl_matrix_column (sw, cache->n_indeps);
233 This loop starts at 1, with i=0 outside the loop, so we can get
234 the model sum of squares due to the first independent variable.
236 xi = gsl_matrix_column (design, 0);
237 gsl_blas_ddot (&(xi.vector), Y, &tmp);
238 gsl_vector_set (&(xty.vector), 0, tmp);
239 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
240 gsl_vector_set (cache->ss_indeps, 0, tmp);
241 for (i = 1; i < cache->n_indeps; i++)
243 xi = gsl_matrix_column (design, i);
244 gsl_blas_ddot (&(xi.vector), Y, &tmp);
245 gsl_vector_set (&(xty.vector), i, tmp);
249 Sweep on the matrix sw, which contains XtX, XtY and YtY.
252 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
253 cache->mse = cache->sse / cache->dfe;
257 m = cache->depvar_mean;
258 for (i = 0; i < cache->n_indeps; i++)
260 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
261 cache->coeff[i + 1].estimate = tmp;
262 m -= tmp * gsl_vector_get (cache->indep_means, i);
265 Get the covariance matrix of the parameter estimates.
266 Only the upper triangle is necessary.
270 The loops below do not compute the entries related
271 to the estimated intercept.
273 for (i = 0; i < cache->n_indeps; i++)
274 for (j = i; j < cache->n_indeps; j++)
276 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
277 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
280 Get the covariances related to the intercept.
282 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
283 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
284 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
285 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
286 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
287 if (rc == GSL_SUCCESS)
289 tmp = cache->mse / cache->n_obs;
290 for (i = 1; i < 1 + cache->n_indeps; i++)
292 tmp -= gsl_matrix_get (cache->cov, 0, i)
293 * gsl_vector_get (cache->indep_means, i - 1);
295 gsl_matrix_set (cache->cov, 0, 0, tmp);
297 cache->coeff[0].estimate = m;
301 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
302 __FILE__, __LINE__, gsl_strerror (rc));
305 gsl_matrix_free (sw);
309 gsl_multifit_linear_workspace *wk;
311 Use QR decomposition via GSL.
314 param_estimates = gsl_vector_alloc (1 + X->size2);
315 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
317 for (j = 0; j < X->size1; j++)
319 gsl_matrix_set (design, j, 0, 1.0);
320 for (i = 0; i < X->size2; i++)
322 tmp = gsl_matrix_get (X, j, i);
323 gsl_matrix_set (design, j, i + 1, tmp);
327 wk = gsl_multifit_linear_alloc (design->size1, design->size2);
328 rc = gsl_multifit_linear (design, Y, param_estimates,
329 cache->cov, &(cache->sse), wk);
330 for (i = 0; i < cache->n_coeffs; i++)
332 cache->coeff[i].estimate = gsl_vector_get (param_estimates, i);
334 if (rc == GSL_SUCCESS)
336 gsl_multifit_linear_free (wk);
337 gsl_vector_free (param_estimates);
341 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
342 __FILE__, __LINE__, rc);
347 cache->ssm = cache->sst - cache->sse;
349 Get the remaining sums of squares for the independent
353 for (i = 1; i < cache->n_indeps; i++)
356 m += gsl_vector_get (cache->ss_indeps, j);
357 tmp = cache->ssm - m;
358 gsl_vector_set (cache->ss_indeps, i, tmp);
361 gsl_matrix_free (design);