4 Copyright (C) 2005 Free Software Foundation, Inc. Written by Jason H. Stover.
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2 of the License, or (at your option)
11 This program is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
16 You should have received a copy of the GNU General Public License along with
17 this program; if not, write to the Free Software Foundation, Inc., 51
18 Franklin Street, Fifth Floor, Boston, MA 02111-1307, USA.
22 #include <gsl/gsl_fit.h>
23 #include <gsl/gsl_multifit.h>
25 #include <gsl/gsl_blas.h>
26 #include <gsl/gsl_cblas.h>
31 Find the least-squares estimate of b for the linear model:
35 where Y is an n-by-1 column vector, X is an n-by-p matrix of
36 independent variables, b is a p-by-1 vector of regression coefficients,
37 and Z is an n-by-1 normally-distributed random vector with independent
38 identically distributed components with mean 0.
40 This estimate is found via the sweep operator or singular-value
41 decomposition with gsl.
46 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
47 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
49 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
52 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
53 Springer. 1998. ISBN 0-387-98542-5.
56 #include <math/linreg/linreg.h>
57 #include <math/coefficient.h>
58 #include <gsl/gsl_errno.h>
59 #include <linreg/sweep.h>
61 Get the mean and standard deviation of a vector
62 of doubles via a form of the Kalman filter as
63 described on page 32 of [3].
66 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
75 mean = gsl_vector_get (&v.vector, 0);
77 for (i = 1; i < v.vector.size; i++)
80 tmp = gsl_vector_get (&v.vector, i);
83 variance += j * (j - 1.0) * d * d;
86 *sp = sqrt (variance / (j - 1.0));
93 Set V to contain an array of pointers to the variables
94 used in the model. V must be at least C->N_COEFFS in length.
95 The return value is the number of distinct variables found.
98 pspp_linreg_get_vars (const void *c_, struct variable **v)
100 const pspp_linreg_cache *c = c_;
101 struct pspp_coeff *coef = NULL;
102 const struct variable *tmp;
107 Make sure the caller doesn't try to sneak a variable
108 into V that is not in the model.
110 for (i = 0; i < c->n_coeffs; i++)
115 Start at c->coeff[1] to avoid the intercept.
117 v[result] = (struct variable *) pspp_coeff_get_var (c->coeff[1], 0);
118 result = (v[result] == NULL) ? 0 : 1;
120 for (coef = c->coeff[2]; coef < c->coeff[c->n_coeffs]; coef++)
122 tmp = pspp_coeff_get_var (coef, 0);
123 assert (tmp != NULL);
124 /* Repeated variables are likely to bunch together, at the end
127 while (i >= 0 && v[i] != tmp)
131 if (i < 0 && result < c->n_coeffs)
133 v[result] = (struct variable *) tmp;
141 Allocate a pspp_linreg_cache and return a pointer
142 to it. n is the number of cases, p is the number of
143 independent variables.
146 pspp_linreg_cache_alloc (size_t n, size_t p)
148 pspp_linreg_cache *c;
150 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
152 c->indep_means = gsl_vector_alloc (p);
153 c->indep_std = gsl_vector_alloc (p);
154 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
155 independent variables.
157 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
160 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
166 c->method = PSPP_LINREG_SWEEP;
167 c->predict = pspp_linreg_predict;
168 c->residual = pspp_linreg_residual; /* The procedure to compute my
170 c->get_vars = pspp_linreg_get_vars; /* The procedure that returns
173 c->resid = NULL; /* The variable storing my residuals. */
174 c->pred = NULL; /* The variable storing my predicted values. */
180 pspp_linreg_cache_free (void *m)
184 pspp_linreg_cache *c = m;
185 gsl_vector_free (c->indep_means);
186 gsl_vector_free (c->indep_std);
187 gsl_vector_free (c->ss_indeps);
188 gsl_matrix_free (c->cov);
189 for (i = 0; i < c->n_coeffs; i++)
191 pspp_coeff_free (c->coeff[i]);
198 Fit the linear model via least squares. All pointers passed to pspp_linreg
199 are assumed to be allocated to the correct size and initialized to the
200 values as indicated by opts.
203 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
204 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
210 gsl_matrix_view xmxtx;
214 gsl_vector *param_estimates;
227 if (opts->get_depvar_mean_std)
229 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
231 cache->depvar_mean = m;
232 cache->depvar_std = s;
235 for (i = 0; i < cache->n_indeps; i++)
237 if (opts->get_indep_mean_std[i])
239 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
240 gsl_vector_set (cache->indep_means, i, m);
241 gsl_vector_set (cache->indep_std, i, s);
242 gsl_vector_set (cache->ssx, i, ss);
245 cache->dft = cache->n_obs - 1;
246 cache->dfm = cache->n_indeps;
247 cache->dfe = cache->dft - cache->dfm;
248 cache->n_coeffs = X->size2 + 1; /* Adjust this later to allow for
249 regression through the origin.
251 if (cache->method == PSPP_LINREG_SWEEP)
255 Subtract the means to improve the condition of the design
256 matrix. This requires copying X and Y. We do not divide by the
257 standard deviations of the independent variables here since doing
258 so would cause a miscalculation of the residual sums of
259 squares. Dividing by the standard deviation is done GSL's linear
260 regression functions, so if the design matrix has a poor
261 condition, use QR decomposition.
263 The design matrix here does not include a column for the intercept
264 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
265 so design is allocated here when sweeping, or below if using QR.
267 design = gsl_matrix_alloc (X->size1, X->size2);
268 for (i = 0; i < X->size2; i++)
270 m = gsl_vector_get (cache->indep_means, i);
271 for (j = 0; j < X->size1; j++)
273 tmp = (gsl_matrix_get (X, j, i) - m);
274 gsl_matrix_set (design, j, i, tmp);
277 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
278 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
280 for (i = 0; i < xtx.matrix.size1; i++)
282 tmp = gsl_vector_get (cache->ssx, i);
283 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
284 xi = gsl_matrix_column (design, i);
285 for (j = (i + 1); j < xtx.matrix.size2; j++)
287 xj = gsl_matrix_column (design, j);
288 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
289 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
293 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
294 xty = gsl_matrix_column (sw, cache->n_indeps);
296 This loop starts at 1, with i=0 outside the loop, so we can get
297 the model sum of squares due to the first independent variable.
299 xi = gsl_matrix_column (design, 0);
300 gsl_blas_ddot (&(xi.vector), Y, &tmp);
301 gsl_vector_set (&(xty.vector), 0, tmp);
302 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
303 gsl_vector_set (cache->ss_indeps, 0, tmp);
304 for (i = 1; i < cache->n_indeps; i++)
306 xi = gsl_matrix_column (design, i);
307 gsl_blas_ddot (&(xi.vector), Y, &tmp);
308 gsl_vector_set (&(xty.vector), i, tmp);
312 Sweep on the matrix sw, which contains XtX, XtY and YtY.
315 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
316 cache->mse = cache->sse / cache->dfe;
320 m = cache->depvar_mean;
321 for (i = 0; i < cache->n_indeps; i++)
323 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
324 cache->coeff[i + 1]->estimate = tmp;
325 m -= tmp * gsl_vector_get (cache->indep_means, i);
328 Get the covariance matrix of the parameter estimates.
329 Only the upper triangle is necessary.
333 The loops below do not compute the entries related
334 to the estimated intercept.
336 for (i = 0; i < cache->n_indeps; i++)
337 for (j = i; j < cache->n_indeps; j++)
339 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
340 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
343 Get the covariances related to the intercept.
345 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
346 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
347 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
348 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
349 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
350 if (rc == GSL_SUCCESS)
352 tmp = cache->mse / cache->n_obs;
353 for (i = 1; i < 1 + cache->n_indeps; i++)
355 tmp -= gsl_matrix_get (cache->cov, 0, i)
356 * gsl_vector_get (cache->indep_means, i - 1);
358 gsl_matrix_set (cache->cov, 0, 0, tmp);
360 cache->coeff[0]->estimate = m;
364 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
365 __FILE__, __LINE__, gsl_strerror (rc));
368 gsl_matrix_free (sw);
372 gsl_multifit_linear_workspace *wk;
374 Use QR decomposition via GSL.
377 param_estimates = gsl_vector_alloc (1 + X->size2);
378 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
380 for (j = 0; j < X->size1; j++)
382 gsl_matrix_set (design, j, 0, 1.0);
383 for (i = 0; i < X->size2; i++)
385 tmp = gsl_matrix_get (X, j, i);
386 gsl_matrix_set (design, j, i + 1, tmp);
390 wk = gsl_multifit_linear_alloc (design->size1, design->size2);
391 rc = gsl_multifit_linear (design, Y, param_estimates,
392 cache->cov, &(cache->sse), wk);
393 for (i = 0; i < cache->n_coeffs; i++)
395 cache->coeff[i]->estimate = gsl_vector_get (param_estimates, i);
397 if (rc == GSL_SUCCESS)
399 gsl_multifit_linear_free (wk);
400 gsl_vector_free (param_estimates);
404 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
405 __FILE__, __LINE__, rc);
410 cache->ssm = cache->sst - cache->sse;
412 Get the remaining sums of squares for the independent
416 for (i = 1; i < cache->n_indeps; i++)
419 m += gsl_vector_get (cache->ss_indeps, j);
420 tmp = cache->ssm - m;
421 gsl_vector_set (cache->ss_indeps, i, tmp);
424 gsl_matrix_free (design);