4 Copyright (C) 2005 Free Software Foundation, Inc. Written by Jason H. Stover.
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2 of the License, or (at your option)
11 This program is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
16 You should have received a copy of the GNU General Public License along with
17 this program; if not, write to the Free Software Foundation, Inc., 51
18 Franklin Street, Fifth Floor, Boston, MA 02111-1307, USA.
21 #include <gsl/gsl_fit.h>
22 #include <gsl/gsl_multifit.h>
24 #include <gsl/gsl_blas.h>
25 #include <gsl/gsl_cblas.h>
30 Find the least-squares estimate of b for the linear model:
34 where Y is an n-by-1 column vector, X is an n-by-p matrix of
35 independent variables, b is a p-by-1 vector of regression coefficients,
36 and Z is an n-by-1 normally-distributed random vector with independent
37 identically distributed components with mean 0.
39 This estimate is found via the sweep operator or singular-value
40 decomposition with gsl.
45 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
46 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
48 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
51 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
52 Springer. 1998. ISBN 0-387-98542-5.
55 #include <math/linreg/linreg.h>
56 #include <math/linreg/coefficient.h>
57 #include <gsl/gsl_errno.h>
58 #include <linreg/sweep.h>
60 Get the mean and standard deviation of a vector
61 of doubles via a form of the Kalman filter as
62 described on page 32 of [3].
65 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
74 mean = gsl_vector_get (&v.vector, 0);
76 for (i = 1; i < v.vector.size; i++)
79 tmp = gsl_vector_get (&v.vector, i);
82 variance += j * (j - 1.0) * d * d;
85 *sp = sqrt (variance / (j - 1.0));
92 Allocate a pspp_linreg_cache and return a pointer
93 to it. n is the number of cases, p is the number of
94 independent variables.
97 pspp_linreg_cache_alloc (size_t n, size_t p)
101 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
103 c->indep_means = gsl_vector_alloc (p);
104 c->indep_std = gsl_vector_alloc (p);
105 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
106 independent variables.
108 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
111 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
117 c->method = PSPP_LINREG_SWEEP;
118 c->predict = pspp_linreg_predict;
119 c->residual = pspp_linreg_residual; /* The procedure to comput my residuals. */
120 c->resid = NULL; /* The variable storing my residuals. */
121 c->pred = NULL; /* The variable storing my predicted values. */
127 pspp_linreg_cache_free (void * m)
129 pspp_linreg_cache *c = m;
130 gsl_vector_free (c->indep_means);
131 gsl_vector_free (c->indep_std);
132 gsl_vector_free (c->ss_indeps);
133 gsl_matrix_free (c->cov);
134 pspp_linreg_coeff_free (c->coeff);
140 Fit the linear model via least squares. All pointers passed to pspp_linreg
141 are assumed to be allocated to the correct size and initialized to the
142 values as indicated by opts.
145 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
146 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
152 gsl_matrix_view xmxtx;
156 gsl_vector *param_estimates;
169 if (opts->get_depvar_mean_std)
171 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
173 cache->depvar_mean = m;
174 cache->depvar_std = s;
177 for (i = 0; i < cache->n_indeps; i++)
179 if (opts->get_indep_mean_std[i])
181 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
182 gsl_vector_set (cache->indep_means, i, m);
183 gsl_vector_set (cache->indep_std, i, s);
184 gsl_vector_set (cache->ssx, i, ss);
187 cache->dft = cache->n_obs - 1;
188 cache->dfm = cache->n_indeps;
189 cache->dfe = cache->dft - cache->dfm;
190 cache->n_coeffs = X->size2 + 1; /* Adjust this later to allow for
191 regression through the origin.
193 if (cache->method == PSPP_LINREG_SWEEP)
197 Subtract the means to improve the condition of the design
198 matrix. This requires copying X and Y. We do not divide by the
199 standard deviations of the independent variables here since doing
200 so would cause a miscalculation of the residual sums of
201 squares. Dividing by the standard deviation is done GSL's linear
202 regression functions, so if the design matrix has a poor
203 condition, use QR decomposition.
205 The design matrix here does not include a column for the intercept
206 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
207 so design is allocated here when sweeping, or below if using QR.
209 design = gsl_matrix_alloc (X->size1, X->size2);
210 for (i = 0; i < X->size2; i++)
212 m = gsl_vector_get (cache->indep_means, i);
213 for (j = 0; j < X->size1; j++)
215 tmp = (gsl_matrix_get (X, j, i) - m);
216 gsl_matrix_set (design, j, i, tmp);
219 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
220 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
222 for (i = 0; i < xtx.matrix.size1; i++)
224 tmp = gsl_vector_get (cache->ssx, i);
225 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
226 xi = gsl_matrix_column (design, i);
227 for (j = (i + 1); j < xtx.matrix.size2; j++)
229 xj = gsl_matrix_column (design, j);
230 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
231 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
235 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
236 xty = gsl_matrix_column (sw, cache->n_indeps);
238 This loop starts at 1, with i=0 outside the loop, so we can get
239 the model sum of squares due to the first independent variable.
241 xi = gsl_matrix_column (design, 0);
242 gsl_blas_ddot (&(xi.vector), Y, &tmp);
243 gsl_vector_set (&(xty.vector), 0, tmp);
244 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
245 gsl_vector_set (cache->ss_indeps, 0, tmp);
246 for (i = 1; i < cache->n_indeps; i++)
248 xi = gsl_matrix_column (design, i);
249 gsl_blas_ddot (&(xi.vector), Y, &tmp);
250 gsl_vector_set (&(xty.vector), i, tmp);
254 Sweep on the matrix sw, which contains XtX, XtY and YtY.
257 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
258 cache->mse = cache->sse / cache->dfe;
262 m = cache->depvar_mean;
263 for (i = 0; i < cache->n_indeps; i++)
265 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
266 cache->coeff[i + 1].estimate = tmp;
267 m -= tmp * gsl_vector_get (cache->indep_means, i);
270 Get the covariance matrix of the parameter estimates.
271 Only the upper triangle is necessary.
275 The loops below do not compute the entries related
276 to the estimated intercept.
278 for (i = 0; i < cache->n_indeps; i++)
279 for (j = i; j < cache->n_indeps; j++)
281 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
282 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
285 Get the covariances related to the intercept.
287 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
288 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
289 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
290 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
291 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
292 if (rc == GSL_SUCCESS)
294 tmp = cache->mse / cache->n_obs;
295 for (i = 1; i < 1 + cache->n_indeps; i++)
297 tmp -= gsl_matrix_get (cache->cov, 0, i)
298 * gsl_vector_get (cache->indep_means, i - 1);
300 gsl_matrix_set (cache->cov, 0, 0, tmp);
302 cache->coeff[0].estimate = m;
306 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
307 __FILE__, __LINE__, gsl_strerror (rc));
310 gsl_matrix_free (sw);
314 gsl_multifit_linear_workspace *wk;
316 Use QR decomposition via GSL.
319 param_estimates = gsl_vector_alloc (1 + X->size2);
320 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
322 for (j = 0; j < X->size1; j++)
324 gsl_matrix_set (design, j, 0, 1.0);
325 for (i = 0; i < X->size2; i++)
327 tmp = gsl_matrix_get (X, j, i);
328 gsl_matrix_set (design, j, i + 1, tmp);
332 wk = gsl_multifit_linear_alloc (design->size1, design->size2);
333 rc = gsl_multifit_linear (design, Y, param_estimates,
334 cache->cov, &(cache->sse), wk);
335 for (i = 0; i < cache->n_coeffs; i++)
337 cache->coeff[i].estimate = gsl_vector_get (param_estimates, i);
339 if (rc == GSL_SUCCESS)
341 gsl_multifit_linear_free (wk);
342 gsl_vector_free (param_estimates);
346 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
347 __FILE__, __LINE__, rc);
352 cache->ssm = cache->sst - cache->sse;
354 Get the remaining sums of squares for the independent
358 for (i = 1; i < cache->n_indeps; i++)
361 m += gsl_vector_get (cache->ss_indeps, j);
362 tmp = cache->ssm - m;
363 gsl_vector_set (cache->ss_indeps, i, tmp);
366 gsl_matrix_free (design);