4 Copyright (C) 2005 Free Software Foundation, Inc. Written by Jason H. Stover.
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2 of the License, or (at your option)
11 This program is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
16 You should have received a copy of the GNU General Public License along with
17 this program; if not, write to the Free Software Foundation, Inc., 51
18 Franklin Street, Fifth Floor, Boston, MA 02111-1307, USA.
21 #include <gsl/gsl_fit.h>
22 #include <gsl/gsl_multifit.h>
24 #include <gsl/gsl_blas.h>
25 #include <gsl/gsl_cblas.h>
30 Find the least-squares estimate of b for the linear model:
34 where Y is an n-by-1 column vector, X is an n-by-p matrix of
35 independent variables, b is a p-by-1 vector of regression coefficients,
36 and Z is an n-by-1 normally-distributed random vector with independent
37 identically distributed components with mean 0.
39 This estimate is found via the sweep operator or singular-value
40 decomposition with gsl.
45 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
46 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
48 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
51 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
52 Springer. 1998. ISBN 0-387-98542-5.
55 #include <math/linreg/linreg.h>
56 #include <math/linreg/coefficient.h>
57 #include <gsl/gsl_errno.h>
58 #include <linreg/sweep.h>
60 Get the mean and standard deviation of a vector
61 of doubles via a form of the Kalman filter as
62 described on page 32 of [3].
65 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
74 mean = gsl_vector_get (&v.vector, 0);
76 for (i = 1; i < v.vector.size; i++)
79 tmp = gsl_vector_get (&v.vector, i);
82 variance += j * (j - 1.0) * d * d;
85 *sp = sqrt (variance / (j - 1.0));
92 Allocate a pspp_linreg_cache and return a pointer
93 to it. n is the number of cases, p is the number of
94 independent variables.
97 pspp_linreg_cache_alloc (size_t n, size_t p)
101 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
103 c->indep_means = gsl_vector_alloc (p);
104 c->indep_std = gsl_vector_alloc (p);
105 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
106 independent variables.
108 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
111 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
117 c->method = PSPP_LINREG_SWEEP;
118 c->predict = pspp_linreg_predict;
119 c->residual = pspp_linreg_residual;
125 pspp_linreg_cache_free (void * m)
127 pspp_linreg_cache *c = m;
128 gsl_vector_free (c->indep_means);
129 gsl_vector_free (c->indep_std);
130 gsl_vector_free (c->ss_indeps);
131 gsl_matrix_free (c->cov);
132 pspp_linreg_coeff_free (c->coeff);
138 Fit the linear model via least squares. All pointers passed to pspp_linreg
139 are assumed to be allocated to the correct size and initialized to the
140 values as indicated by opts.
143 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
144 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
150 gsl_matrix_view xmxtx;
154 gsl_vector *param_estimates;
167 if (opts->get_depvar_mean_std)
169 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
171 cache->depvar_mean = m;
172 cache->depvar_std = s;
175 for (i = 0; i < cache->n_indeps; i++)
177 if (opts->get_indep_mean_std[i])
179 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
180 gsl_vector_set (cache->indep_means, i, m);
181 gsl_vector_set (cache->indep_std, i, s);
182 gsl_vector_set (cache->ssx, i, ss);
185 cache->dft = cache->n_obs - 1;
186 cache->dfm = cache->n_indeps;
187 cache->dfe = cache->dft - cache->dfm;
188 cache->n_coeffs = X->size2 + 1; /* Adjust this later to allow for
189 regression through the origin.
191 if (cache->method == PSPP_LINREG_SWEEP)
195 Subtract the means to improve the condition of the design
196 matrix. This requires copying X and Y. We do not divide by the
197 standard deviations of the independent variables here since doing
198 so would cause a miscalculation of the residual sums of
199 squares. Dividing by the standard deviation is done GSL's linear
200 regression functions, so if the design matrix has a poor
201 condition, use QR decomposition.
203 The design matrix here does not include a column for the intercept
204 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
205 so design is allocated here when sweeping, or below if using QR.
207 design = gsl_matrix_alloc (X->size1, X->size2);
208 for (i = 0; i < X->size2; i++)
210 m = gsl_vector_get (cache->indep_means, i);
211 for (j = 0; j < X->size1; j++)
213 tmp = (gsl_matrix_get (X, j, i) - m);
214 gsl_matrix_set (design, j, i, tmp);
217 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
218 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
220 for (i = 0; i < xtx.matrix.size1; i++)
222 tmp = gsl_vector_get (cache->ssx, i);
223 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
224 xi = gsl_matrix_column (design, i);
225 for (j = (i + 1); j < xtx.matrix.size2; j++)
227 xj = gsl_matrix_column (design, j);
228 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
229 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
233 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
234 xty = gsl_matrix_column (sw, cache->n_indeps);
236 This loop starts at 1, with i=0 outside the loop, so we can get
237 the model sum of squares due to the first independent variable.
239 xi = gsl_matrix_column (design, 0);
240 gsl_blas_ddot (&(xi.vector), Y, &tmp);
241 gsl_vector_set (&(xty.vector), 0, tmp);
242 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
243 gsl_vector_set (cache->ss_indeps, 0, tmp);
244 for (i = 1; i < cache->n_indeps; i++)
246 xi = gsl_matrix_column (design, i);
247 gsl_blas_ddot (&(xi.vector), Y, &tmp);
248 gsl_vector_set (&(xty.vector), i, tmp);
252 Sweep on the matrix sw, which contains XtX, XtY and YtY.
255 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
256 cache->mse = cache->sse / cache->dfe;
260 m = cache->depvar_mean;
261 for (i = 0; i < cache->n_indeps; i++)
263 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
264 cache->coeff[i + 1].estimate = tmp;
265 m -= tmp * gsl_vector_get (cache->indep_means, i);
268 Get the covariance matrix of the parameter estimates.
269 Only the upper triangle is necessary.
273 The loops below do not compute the entries related
274 to the estimated intercept.
276 for (i = 0; i < cache->n_indeps; i++)
277 for (j = i; j < cache->n_indeps; j++)
279 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
280 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
283 Get the covariances related to the intercept.
285 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
286 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
287 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
288 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
289 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
290 if (rc == GSL_SUCCESS)
292 tmp = cache->mse / cache->n_obs;
293 for (i = 1; i < 1 + cache->n_indeps; i++)
295 tmp -= gsl_matrix_get (cache->cov, 0, i)
296 * gsl_vector_get (cache->indep_means, i - 1);
298 gsl_matrix_set (cache->cov, 0, 0, tmp);
300 cache->coeff[0].estimate = m;
304 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
305 __FILE__, __LINE__, gsl_strerror (rc));
308 gsl_matrix_free (sw);
312 gsl_multifit_linear_workspace *wk;
314 Use QR decomposition via GSL.
317 param_estimates = gsl_vector_alloc (1 + X->size2);
318 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
320 for (j = 0; j < X->size1; j++)
322 gsl_matrix_set (design, j, 0, 1.0);
323 for (i = 0; i < X->size2; i++)
325 tmp = gsl_matrix_get (X, j, i);
326 gsl_matrix_set (design, j, i + 1, tmp);
330 wk = gsl_multifit_linear_alloc (design->size1, design->size2);
331 rc = gsl_multifit_linear (design, Y, param_estimates,
332 cache->cov, &(cache->sse), wk);
333 for (i = 0; i < cache->n_coeffs; i++)
335 cache->coeff[i].estimate = gsl_vector_get (param_estimates, i);
337 if (rc == GSL_SUCCESS)
339 gsl_multifit_linear_free (wk);
340 gsl_vector_free (param_estimates);
344 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
345 __FILE__, __LINE__, rc);
350 cache->ssm = cache->sst - cache->sse;
352 Get the remaining sums of squares for the independent
356 for (i = 1; i < cache->n_indeps; i++)
359 m += gsl_vector_get (cache->ss_indeps, j);
360 tmp = cache->ssm - m;
361 gsl_vector_set (cache->ss_indeps, i, tmp);
364 gsl_matrix_free (design);