4 * Copyright (C) 2005 Free Software Foundation, Inc. Written by Jason H. Stover.
6 * This program is free software; you can redistribute it and/or modify it under
7 * the terms of the GNU General Public License as published by the Free
8 * Software Foundation; either version 2 of the License, or (at your option)
11 * This program is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc., 51
18 * Franklin Street, Fifth Floor, Boston, MA 02111-1307, USA.
21 #include <gsl/gsl_fit.h>
22 #include <gsl/gsl_multifit.h>
24 #include <gsl/gsl_blas.h>
25 #include <gsl/gsl_cblas.h>
30 Find the least-squares estimate of b for the linear model:
34 where Y is an n-by-1 column vector, X is an n-by-p matrix of
35 independent variables, b is a p-by-1 vector of regression coefficients,
36 and Z is an n-by-1 normally-distributed random vector with independent
37 identically distributed components with mean 0.
39 This estimate is found via the sweep operator or singular-value
40 decomposition with gsl.
45 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
46 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
48 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
51 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
52 Springer. 1998. ISBN 0-387-98542-5.
55 #include <math/linreg/linreg.h>
56 #include <math/linreg/coefficient.h>
57 #include <gsl/gsl_errno.h>
58 #include <linreg/sweep.h>
60 Get the mean and standard deviation of a vector
61 of doubles via a form of the Kalman filter as
62 described on page 32 of [3].
65 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
74 mean = gsl_vector_get (&v.vector, 0);
76 for (i = 1; i < v.vector.size; i++)
79 tmp = gsl_vector_get (&v.vector, i);
82 variance += j * (j - 1.0) * d * d;
85 *sp = sqrt (variance / (j - 1.0));
92 Allocate a pspp_linreg_cache and return a pointer
93 to it. n is the number of cases, p is the number of
94 independent variables.
97 pspp_linreg_cache_alloc (size_t n, size_t p)
101 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
102 c->indep_means = gsl_vector_alloc (p);
103 c->indep_std = gsl_vector_alloc (p);
104 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
105 * independent variables. */
106 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
107 * model parameters. */
108 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
114 c->method = PSPP_LINREG_SWEEP;
115 c->predict = pspp_linreg_predict;
121 pspp_linreg_cache_free (pspp_linreg_cache * c)
123 gsl_vector_free (c->indep_means);
124 gsl_vector_free (c->indep_std);
125 gsl_vector_free (c->ss_indeps);
126 gsl_matrix_free (c->cov);
127 pspp_linreg_coeff_free (c->coeff);
132 Fit the linear model via least squares. All pointers passed to pspp_linreg
133 are assumed to be allocated to the correct size and initialized to the
134 values as indicated by opts.
137 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
138 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
144 gsl_matrix_view xmxtx;
148 gsl_vector *param_estimates;
161 if (opts->get_depvar_mean_std)
163 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
165 cache->depvar_mean = m;
166 cache->depvar_std = s;
169 for (i = 0; i < cache->n_indeps; i++)
171 if (opts->get_indep_mean_std[i])
173 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
174 gsl_vector_set (cache->indep_means, i, m);
175 gsl_vector_set (cache->indep_std, i, s);
176 gsl_vector_set (cache->ssx, i, ss);
179 cache->dft = cache->n_obs - 1;
180 cache->dfm = cache->n_indeps;
181 cache->dfe = cache->dft - cache->dfm;
182 cache->n_coeffs = X->size2 + 1; /* Adjust this later to allow for
183 * regression through the origin. */
184 if (cache->method == PSPP_LINREG_SWEEP)
188 Subtract the means to improve the condition of the design
189 matrix. This requires copying X and Y. We do not divide by the
190 standard deviations of the independent variables here since doing
191 so would cause a miscalculation of the residual sums of
192 squares. Dividing by the standard deviation is done GSL's linear
193 regression functions, so if the design matrix has a poor
194 condition, use QR decomposition.
196 The design matrix here does not include a column for the intercept
197 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
198 so design is allocated here when sweeping, or below if using QR.
200 design = gsl_matrix_alloc (X->size1, X->size2);
201 for (i = 0; i < X->size2; i++)
203 m = gsl_vector_get (cache->indep_means, i);
204 for (j = 0; j < X->size1; j++)
206 tmp = (gsl_matrix_get (X, j, i) - m);
207 gsl_matrix_set (design, j, i, tmp);
210 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
211 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
213 for (i = 0; i < xtx.matrix.size1; i++)
215 tmp = gsl_vector_get (cache->ssx, i);
216 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
217 xi = gsl_matrix_column (design, i);
218 for (j = (i + 1); j < xtx.matrix.size2; j++)
220 xj = gsl_matrix_column (design, j);
221 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
222 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
226 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
227 xty = gsl_matrix_column (sw, cache->n_indeps);
229 This loop starts at 1, with i=0 outside the loop, so we can get
230 the model sum of squares due to the first independent variable.
232 xi = gsl_matrix_column (design, 0);
233 gsl_blas_ddot (&(xi.vector), Y, &tmp);
234 gsl_vector_set (&(xty.vector), 0, tmp);
235 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
236 gsl_vector_set (cache->ss_indeps, 0, tmp);
237 for (i = 1; i < cache->n_indeps; i++)
239 xi = gsl_matrix_column (design, i);
240 gsl_blas_ddot (&(xi.vector), Y, &tmp);
241 gsl_vector_set (&(xty.vector), i, tmp);
245 Sweep on the matrix sw, which contains XtX, XtY and YtY.
248 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
249 cache->mse = cache->sse / cache->dfe;
253 m = cache->depvar_mean;
254 for (i = 0; i < cache->n_indeps; i++)
256 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
257 cache->coeff[i + 1].estimate = tmp;
258 m -= tmp * gsl_vector_get (cache->indep_means, i);
261 Get the covariance matrix of the parameter estimates.
262 Only the upper triangle is necessary.
266 The loops below do not compute the entries related
267 to the estimated intercept.
269 for (i = 0; i < cache->n_indeps; i++)
270 for (j = i; j < cache->n_indeps; j++)
272 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
273 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
276 Get the covariances related to the intercept.
278 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
279 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
280 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
281 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
282 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
283 if (rc == GSL_SUCCESS)
285 tmp = cache->mse / cache->n_obs;
286 for (i = 1; i < 1 + cache->n_indeps; i++)
288 tmp -= gsl_matrix_get (cache->cov, 0, i)
289 * gsl_vector_get (cache->indep_means, i - 1);
291 gsl_matrix_set (cache->cov, 0, 0, tmp);
293 cache->coeff[0].estimate = m;
297 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
298 __FILE__, __LINE__, gsl_strerror (rc));
301 gsl_matrix_free (sw);
305 gsl_multifit_linear_workspace *wk;
307 Use QR decomposition via GSL.
310 param_estimates = gsl_vector_alloc (1 + X->size2);
311 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
313 for (j = 0; j < X->size1; j++)
315 gsl_matrix_set (design, j, 0, 1.0);
316 for (i = 0; i < X->size2; i++)
318 tmp = gsl_matrix_get (X, j, i);
319 gsl_matrix_set (design, j, i + 1, tmp);
323 wk = gsl_multifit_linear_alloc (design->size1, design->size2);
324 rc = gsl_multifit_linear (design, Y, param_estimates,
325 cache->cov, &(cache->sse), wk);
326 for (i = 0; i < cache->n_coeffs; i++)
328 cache->coeff[i].estimate = gsl_vector_get (param_estimates, i);
330 if (rc == GSL_SUCCESS)
332 gsl_multifit_linear_free (wk);
333 gsl_vector_free (param_estimates);
337 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
338 __FILE__, __LINE__, rc);
343 cache->ssm = cache->sst - cache->sse;
345 Get the remaining sums of squares for the independent
349 for (i = 1; i < cache->n_indeps; i++)
352 m += gsl_vector_get (cache->ss_indeps, j);
353 tmp = cache->ssm - m;
354 gsl_vector_set (cache->ss_indeps, i, tmp);
357 gsl_matrix_free (design);