4 Copyright (C) 2005 Free Software Foundation, Inc. Written by Jason H. Stover.
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2 of the License, or (at your option)
11 This program is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
16 You should have received a copy of the GNU General Public License along with
17 this program; if not, write to the Free Software Foundation, Inc., 51
18 Franklin Street, Fifth Floor, Boston, MA 02111-1307, USA.
21 #include <gsl/gsl_fit.h>
22 #include <gsl/gsl_multifit.h>
24 #include <gsl/gsl_blas.h>
25 #include <gsl/gsl_cblas.h>
30 Find the least-squares estimate of b for the linear model:
34 where Y is an n-by-1 column vector, X is an n-by-p matrix of
35 independent variables, b is a p-by-1 vector of regression coefficients,
36 and Z is an n-by-1 normally-distributed random vector with independent
37 identically distributed components with mean 0.
39 This estimate is found via the sweep operator or singular-value
40 decomposition with gsl.
45 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
46 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
48 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
51 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
52 Springer. 1998. ISBN 0-387-98542-5.
55 #include <math/linreg/linreg.h>
56 #include <math/linreg/coefficient.h>
57 #include <gsl/gsl_errno.h>
58 #include <linreg/sweep.h>
60 Get the mean and standard deviation of a vector
61 of doubles via a form of the Kalman filter as
62 described on page 32 of [3].
65 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
74 mean = gsl_vector_get (&v.vector, 0);
76 for (i = 1; i < v.vector.size; i++)
79 tmp = gsl_vector_get (&v.vector, i);
82 variance += j * (j - 1.0) * d * d;
85 *sp = sqrt (variance / (j - 1.0));
92 Allocate a pspp_linreg_cache and return a pointer
93 to it. n is the number of cases, p is the number of
94 independent variables.
97 pspp_linreg_cache_alloc (size_t n, size_t p)
101 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
103 c->indep_means = gsl_vector_alloc (p);
104 c->indep_std = gsl_vector_alloc (p);
105 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
106 independent variables.
108 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
111 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
117 c->method = PSPP_LINREG_SWEEP;
118 c->predict = pspp_linreg_predict;
119 c->residual = pspp_linreg_residual;
125 pspp_linreg_cache_free (pspp_linreg_cache * c)
127 gsl_vector_free (c->indep_means);
128 gsl_vector_free (c->indep_std);
129 gsl_vector_free (c->ss_indeps);
130 gsl_matrix_free (c->cov);
131 pspp_linreg_coeff_free (c->coeff);
136 Fit the linear model via least squares. All pointers passed to pspp_linreg
137 are assumed to be allocated to the correct size and initialized to the
138 values as indicated by opts.
141 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
142 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
148 gsl_matrix_view xmxtx;
152 gsl_vector *param_estimates;
165 if (opts->get_depvar_mean_std)
167 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
169 cache->depvar_mean = m;
170 cache->depvar_std = s;
173 for (i = 0; i < cache->n_indeps; i++)
175 if (opts->get_indep_mean_std[i])
177 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
178 gsl_vector_set (cache->indep_means, i, m);
179 gsl_vector_set (cache->indep_std, i, s);
180 gsl_vector_set (cache->ssx, i, ss);
183 cache->dft = cache->n_obs - 1;
184 cache->dfm = cache->n_indeps;
185 cache->dfe = cache->dft - cache->dfm;
186 cache->n_coeffs = X->size2 + 1; /* Adjust this later to allow for
187 regression through the origin.
189 if (cache->method == PSPP_LINREG_SWEEP)
193 Subtract the means to improve the condition of the design
194 matrix. This requires copying X and Y. We do not divide by the
195 standard deviations of the independent variables here since doing
196 so would cause a miscalculation of the residual sums of
197 squares. Dividing by the standard deviation is done GSL's linear
198 regression functions, so if the design matrix has a poor
199 condition, use QR decomposition.
201 The design matrix here does not include a column for the intercept
202 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
203 so design is allocated here when sweeping, or below if using QR.
205 design = gsl_matrix_alloc (X->size1, X->size2);
206 for (i = 0; i < X->size2; i++)
208 m = gsl_vector_get (cache->indep_means, i);
209 for (j = 0; j < X->size1; j++)
211 tmp = (gsl_matrix_get (X, j, i) - m);
212 gsl_matrix_set (design, j, i, tmp);
215 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
216 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
218 for (i = 0; i < xtx.matrix.size1; i++)
220 tmp = gsl_vector_get (cache->ssx, i);
221 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
222 xi = gsl_matrix_column (design, i);
223 for (j = (i + 1); j < xtx.matrix.size2; j++)
225 xj = gsl_matrix_column (design, j);
226 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
227 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
231 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
232 xty = gsl_matrix_column (sw, cache->n_indeps);
234 This loop starts at 1, with i=0 outside the loop, so we can get
235 the model sum of squares due to the first independent variable.
237 xi = gsl_matrix_column (design, 0);
238 gsl_blas_ddot (&(xi.vector), Y, &tmp);
239 gsl_vector_set (&(xty.vector), 0, tmp);
240 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
241 gsl_vector_set (cache->ss_indeps, 0, tmp);
242 for (i = 1; i < cache->n_indeps; i++)
244 xi = gsl_matrix_column (design, i);
245 gsl_blas_ddot (&(xi.vector), Y, &tmp);
246 gsl_vector_set (&(xty.vector), i, tmp);
250 Sweep on the matrix sw, which contains XtX, XtY and YtY.
253 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
254 cache->mse = cache->sse / cache->dfe;
258 m = cache->depvar_mean;
259 for (i = 0; i < cache->n_indeps; i++)
261 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
262 cache->coeff[i + 1].estimate = tmp;
263 m -= tmp * gsl_vector_get (cache->indep_means, i);
266 Get the covariance matrix of the parameter estimates.
267 Only the upper triangle is necessary.
271 The loops below do not compute the entries related
272 to the estimated intercept.
274 for (i = 0; i < cache->n_indeps; i++)
275 for (j = i; j < cache->n_indeps; j++)
277 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
278 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
281 Get the covariances related to the intercept.
283 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
284 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
285 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
286 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
287 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
288 if (rc == GSL_SUCCESS)
290 tmp = cache->mse / cache->n_obs;
291 for (i = 1; i < 1 + cache->n_indeps; i++)
293 tmp -= gsl_matrix_get (cache->cov, 0, i)
294 * gsl_vector_get (cache->indep_means, i - 1);
296 gsl_matrix_set (cache->cov, 0, 0, tmp);
298 cache->coeff[0].estimate = m;
302 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
303 __FILE__, __LINE__, gsl_strerror (rc));
306 gsl_matrix_free (sw);
310 gsl_multifit_linear_workspace *wk;
312 Use QR decomposition via GSL.
315 param_estimates = gsl_vector_alloc (1 + X->size2);
316 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
318 for (j = 0; j < X->size1; j++)
320 gsl_matrix_set (design, j, 0, 1.0);
321 for (i = 0; i < X->size2; i++)
323 tmp = gsl_matrix_get (X, j, i);
324 gsl_matrix_set (design, j, i + 1, tmp);
328 wk = gsl_multifit_linear_alloc (design->size1, design->size2);
329 rc = gsl_multifit_linear (design, Y, param_estimates,
330 cache->cov, &(cache->sse), wk);
331 for (i = 0; i < cache->n_coeffs; i++)
333 cache->coeff[i].estimate = gsl_vector_get (param_estimates, i);
335 if (rc == GSL_SUCCESS)
337 gsl_multifit_linear_free (wk);
338 gsl_vector_free (param_estimates);
342 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
343 __FILE__, __LINE__, rc);
348 cache->ssm = cache->sst - cache->sse;
350 Get the remaining sums of squares for the independent
354 for (i = 1; i < cache->n_indeps; i++)
357 m += gsl_vector_get (cache->ss_indeps, j);
358 tmp = cache->ssm - m;
359 gsl_vector_set (cache->ss_indeps, i, tmp);
362 gsl_matrix_free (design);