3 Copyright (C) 2005 Free Software Foundation, Inc.
4 Written by Jason H. Stover.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_fit.h>
23 #include <gsl/gsl_multifit.h>
25 #include <gsl/gsl_blas.h>
26 #include <gsl/gsl_cblas.h>
31 Find the least-squares estimate of b for the linear model:
35 where Y is an n-by-1 column vector, X is an n-by-p matrix of
36 independent variables, b is a p-by-1 vector of regression coefficients,
37 and Z is an n-by-1 normally-distributed random vector with independent
38 identically distributed components with mean 0.
40 This estimate is found via the sweep operator or singular-value
41 decomposition with gsl.
46 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
47 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
49 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
52 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
53 Springer. 1998. ISBN 0-387-98542-5.
56 #include <math/linreg/linreg.h>
57 #include <math/linreg/coefficient.h>
58 #include <gsl/gsl_errno.h>
59 #include <linreg/sweep.h>
61 Get the mean and standard deviation of a vector
62 of doubles via a form of the Kalman filter as
63 described on page 32 of [3].
66 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
75 mean = gsl_vector_get (&v.vector, 0);
77 for (i = 1; i < v.vector.size; i++)
80 tmp = gsl_vector_get (&v.vector, i);
83 variance += j * (j - 1.0) * d * d;
86 *sp = sqrt (variance / (j - 1.0));
93 Allocate a pspp_linreg_cache and return a pointer
94 to it. n is the number of cases, p is the number of
95 independent variables.
98 pspp_linreg_cache_alloc (size_t n, size_t p)
100 pspp_linreg_cache *c;
102 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
103 c->indep_means = gsl_vector_alloc (p);
104 c->indep_std = gsl_vector_alloc (p);
105 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the independent
108 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the model
111 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
117 c->method = PSPP_LINREG_SWEEP;
123 pspp_linreg_cache_free (pspp_linreg_cache * c)
125 gsl_vector_free (c->indep_means);
126 gsl_vector_free (c->indep_std);
127 gsl_vector_free (c->ss_indeps);
128 gsl_matrix_free (c->cov);
129 pspp_linreg_coeff_free (c->coeff);
134 Fit the linear model via least squares. All pointers passed to pspp_linreg
135 are assumed to be allocated to the correct size and initialized to the
136 values as indicated by opts.
139 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
140 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
146 gsl_matrix_view xmxtx;
150 gsl_vector *param_estimates;
163 if (opts->get_depvar_mean_std)
165 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
167 cache->depvar_mean = m;
168 cache->depvar_std = s;
171 for (i = 0; i < cache->n_indeps; i++)
173 if (opts->get_indep_mean_std[i])
175 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
176 gsl_vector_set (cache->indep_means, i, m);
177 gsl_vector_set (cache->indep_std, i, s);
178 gsl_vector_set (cache->ssx, i, ss);
181 cache->dft = cache->n_obs - 1;
182 cache->dfm = cache->n_indeps;
183 cache->dfe = cache->dft - cache->dfm;
184 cache->n_coeffs = X->size2 + 1; /* Adjust this later to allow for regression
187 if (cache->method == PSPP_LINREG_SWEEP)
191 Subtract the means to improve the condition of the design
192 matrix. This requires copying X and Y. We do not divide by the
193 standard deviations of the independent variables here since doing
194 so would cause a miscalculation of the residual sums of
195 squares. Dividing by the standard deviation is done GSL's linear
196 regression functions, so if the design matrix has a poor
197 condition, use QR decomposition.
199 The design matrix here does not include a column for the intercept
200 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
201 so design is allocated here when sweeping, or below if using QR.
203 design = gsl_matrix_alloc (X->size1, X->size2);
204 for (i = 0; i < X->size2; i++)
206 m = gsl_vector_get (cache->indep_means, i);
207 for (j = 0; j < X->size1; j++)
209 tmp = (gsl_matrix_get (X, j, i) - m);
210 gsl_matrix_set (design, j, i, tmp);
213 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
214 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
216 for (i = 0; i < xtx.matrix.size1; i++)
218 tmp = gsl_vector_get (cache->ssx, i);
219 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
220 xi = gsl_matrix_column (design, i);
221 for (j = (i + 1); j < xtx.matrix.size2; j++)
223 xj = gsl_matrix_column (design, j);
224 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
225 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
229 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
230 xty = gsl_matrix_column (sw, cache->n_indeps);
232 This loop starts at 1, with i=0 outside the loop, so we can get
233 the model sum of squares due to the first independent variable.
235 xi = gsl_matrix_column (design, 0);
236 gsl_blas_ddot (&(xi.vector), Y, &tmp);
237 gsl_vector_set (&(xty.vector), 0, tmp);
238 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
239 gsl_vector_set (cache->ss_indeps, 0, tmp);
240 for (i = 1; i < cache->n_indeps; i++)
242 xi = gsl_matrix_column (design, i);
243 gsl_blas_ddot (&(xi.vector), Y, &tmp);
244 gsl_vector_set (&(xty.vector), i, tmp);
248 Sweep on the matrix sw, which contains XtX, XtY and YtY.
251 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
252 cache->mse = cache->sse / cache->dfe;
256 m = cache->depvar_mean;
257 for (i = 0; i < cache->n_indeps; i++)
259 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
260 cache->coeff[i + 1].estimate = tmp;
261 m -= tmp * gsl_vector_get (cache->indep_means, i);
264 Get the covariance matrix of the parameter estimates.
265 Only the upper triangle is necessary.
269 The loops below do not compute the entries related
270 to the estimated intercept.
272 for (i = 0; i < cache->n_indeps; i++)
273 for (j = i; j < cache->n_indeps; j++)
275 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
276 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
279 Get the covariances related to the intercept.
281 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
282 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
283 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
284 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
285 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
286 if (rc == GSL_SUCCESS)
288 tmp = cache->mse / cache->n_obs;
289 for (i = 1; i < 1 + cache->n_indeps; i++)
291 tmp -= gsl_matrix_get (cache->cov, 0, i)
292 * gsl_vector_get (cache->indep_means, i - 1);
294 gsl_matrix_set (cache->cov, 0, 0, tmp);
296 cache->coeff[0].estimate = m;
300 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
301 __FILE__, __LINE__, gsl_strerror (rc));
304 gsl_matrix_free (sw);
309 Use QR decomposition via GSL.
312 param_estimates = gsl_vector_alloc (1 + X->size2);
313 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
315 for (j = 0; j < X->size1; j++)
317 gsl_matrix_set (design, j, 0, 1.0);
318 for (i = 0; i < X->size2; i++)
320 tmp = gsl_matrix_get (X, j, i);
321 gsl_matrix_set (design, j, i + 1, tmp);
324 gsl_multifit_linear_workspace *wk =
325 gsl_multifit_linear_alloc (design->size1, design->size2);
326 rc = gsl_multifit_linear (design, Y, param_estimates,
327 cache->cov, &(cache->sse), wk);
328 for (i = 0; i < cache->n_coeffs; i++)
330 cache->coeff[i].estimate = gsl_vector_get (param_estimates, i);
332 if (rc == GSL_SUCCESS)
334 gsl_multifit_linear_free (wk);
335 gsl_vector_free (param_estimates);
339 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
340 __FILE__, __LINE__, rc);
345 cache->ssm = cache->sst - cache->sse;
347 Get the remaining sums of squares for the independent
351 for (i = 1; i < cache->n_indeps; i++)
354 m += gsl_vector_get (cache->ss_indeps, j);
355 tmp = cache->ssm - m;
356 gsl_vector_set (cache->ss_indeps, i, tmp);
359 gsl_matrix_free (design);