4 Copyright (C) 2005 Free Software Foundation, Inc. Written by Jason H. Stover.
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2 of the License, or (at your option)
11 This program is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
16 You should have received a copy of the GNU General Public License along with
17 this program; if not, write to the Free Software Foundation, Inc., 51
18 Franklin Street, Fifth Floor, Boston, MA 02111-1307, USA.
22 #include <gsl/gsl_fit.h>
23 #include <gsl/gsl_multifit.h>
25 #include <gsl/gsl_blas.h>
26 #include <gsl/gsl_cblas.h>
31 Find the least-squares estimate of b for the linear model:
35 where Y is an n-by-1 column vector, X is an n-by-p matrix of
36 independent variables, b is a p-by-1 vector of regression coefficients,
37 and Z is an n-by-1 normally-distributed random vector with independent
38 identically distributed components with mean 0.
40 This estimate is found via the sweep operator or singular-value
41 decomposition with gsl.
46 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
47 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
49 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
52 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
53 Springer. 1998. ISBN 0-387-98542-5.
56 #include <math/linreg/linreg.h>
57 #include <math/coefficient.h>
58 #include <gsl/gsl_errno.h>
59 #include <linreg/sweep.h>
61 Get the mean and standard deviation of a vector
62 of doubles via a form of the Kalman filter as
63 described on page 32 of [3].
66 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
75 mean = gsl_vector_get (&v.vector, 0);
77 for (i = 1; i < v.vector.size; i++)
80 tmp = gsl_vector_get (&v.vector, i);
83 variance += j * (j - 1.0) * d * d;
86 *sp = sqrt (variance / (j - 1.0));
93 Set V to contain an array of pointers to the variables
94 used in the model. V must be at least C->N_COEFFS in length.
95 The return value is the number of distinct variables found.
98 pspp_linreg_get_vars (const void *c_, struct variable **v)
100 const pspp_linreg_cache *c = c_;
101 struct pspp_coeff *coef = NULL;
102 const struct variable *tmp;
107 Make sure the caller doesn't try to sneak a variable
108 into V that is not in the model.
110 for (i = 0; i < c->n_coeffs; i++)
115 Start at c->coeff[1] to avoid the intercept.
117 v[result] = (struct variable *) pspp_coeff_get_var (c->coeff[1], 0);
118 result = (v[result] == NULL) ? 0 : 1;
120 for (coef = c->coeff[2]; coef < c->coeff[c->n_coeffs]; coef++)
122 tmp = pspp_coeff_get_var (coef, 0);
123 assert (tmp != NULL);
124 /* Repeated variables are likely to bunch together, at the end
127 while (i >= 0 && v[i] != tmp)
131 if (i < 0 && result < c->n_coeffs)
133 v[result] = (struct variable *) tmp;
141 Allocate a pspp_linreg_cache and return a pointer
142 to it. n is the number of cases, p is the number of
143 independent variables.
146 pspp_linreg_cache_alloc (size_t n, size_t p)
148 pspp_linreg_cache *c;
150 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
152 c->indep_means = gsl_vector_alloc (p);
153 c->indep_std = gsl_vector_alloc (p);
154 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
155 independent variables.
157 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
160 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
166 c->method = PSPP_LINREG_SWEEP;
167 c->predict = pspp_linreg_predict;
168 c->residual = pspp_linreg_residual; /* The procedure to compute my
170 c->get_vars = pspp_linreg_get_vars; /* The procedure that returns
173 c->resid = NULL; /* The variable storing my residuals. */
174 c->pred = NULL; /* The variable storing my predicted values. */
180 pspp_linreg_cache_free (void *m)
184 pspp_linreg_cache *c = m;
187 gsl_vector_free (c->indep_means);
188 gsl_vector_free (c->indep_std);
189 gsl_vector_free (c->ss_indeps);
190 gsl_matrix_free (c->cov);
191 for (i = 0; i < c->n_coeffs; i++)
193 pspp_coeff_free (c->coeff[i]);
201 Fit the linear model via least squares. All pointers passed to pspp_linreg
202 are assumed to be allocated to the correct size and initialized to the
203 values as indicated by opts.
206 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
207 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
213 gsl_matrix_view xmxtx;
217 gsl_vector *param_estimates;
230 if (opts->get_depvar_mean_std)
232 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
234 cache->depvar_mean = m;
235 cache->depvar_std = s;
238 for (i = 0; i < cache->n_indeps; i++)
240 if (opts->get_indep_mean_std[i])
242 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
243 gsl_vector_set (cache->indep_means, i, m);
244 gsl_vector_set (cache->indep_std, i, s);
245 gsl_vector_set (cache->ssx, i, ss);
248 cache->dft = cache->n_obs - 1;
249 cache->dfm = cache->n_indeps;
250 cache->dfe = cache->dft - cache->dfm;
251 cache->n_coeffs = X->size2 + 1; /* Adjust this later to allow for
252 regression through the origin.
254 if (cache->method == PSPP_LINREG_SWEEP)
258 Subtract the means to improve the condition of the design
259 matrix. This requires copying X and Y. We do not divide by the
260 standard deviations of the independent variables here since doing
261 so would cause a miscalculation of the residual sums of
262 squares. Dividing by the standard deviation is done GSL's linear
263 regression functions, so if the design matrix has a poor
264 condition, use QR decomposition.
266 The design matrix here does not include a column for the intercept
267 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
268 so design is allocated here when sweeping, or below if using QR.
270 design = gsl_matrix_alloc (X->size1, X->size2);
271 for (i = 0; i < X->size2; i++)
273 m = gsl_vector_get (cache->indep_means, i);
274 for (j = 0; j < X->size1; j++)
276 tmp = (gsl_matrix_get (X, j, i) - m);
277 gsl_matrix_set (design, j, i, tmp);
280 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
281 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
283 for (i = 0; i < xtx.matrix.size1; i++)
285 tmp = gsl_vector_get (cache->ssx, i);
286 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
287 xi = gsl_matrix_column (design, i);
288 for (j = (i + 1); j < xtx.matrix.size2; j++)
290 xj = gsl_matrix_column (design, j);
291 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
292 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
296 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
297 xty = gsl_matrix_column (sw, cache->n_indeps);
299 This loop starts at 1, with i=0 outside the loop, so we can get
300 the model sum of squares due to the first independent variable.
302 xi = gsl_matrix_column (design, 0);
303 gsl_blas_ddot (&(xi.vector), Y, &tmp);
304 gsl_vector_set (&(xty.vector), 0, tmp);
305 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
306 gsl_vector_set (cache->ss_indeps, 0, tmp);
307 for (i = 1; i < cache->n_indeps; i++)
309 xi = gsl_matrix_column (design, i);
310 gsl_blas_ddot (&(xi.vector), Y, &tmp);
311 gsl_vector_set (&(xty.vector), i, tmp);
315 Sweep on the matrix sw, which contains XtX, XtY and YtY.
318 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
319 cache->mse = cache->sse / cache->dfe;
323 m = cache->depvar_mean;
324 for (i = 0; i < cache->n_indeps; i++)
326 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
327 cache->coeff[i + 1]->estimate = tmp;
328 m -= tmp * gsl_vector_get (cache->indep_means, i);
331 Get the covariance matrix of the parameter estimates.
332 Only the upper triangle is necessary.
336 The loops below do not compute the entries related
337 to the estimated intercept.
339 for (i = 0; i < cache->n_indeps; i++)
340 for (j = i; j < cache->n_indeps; j++)
342 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
343 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
346 Get the covariances related to the intercept.
348 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
349 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
350 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
351 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
352 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
353 if (rc == GSL_SUCCESS)
355 tmp = cache->mse / cache->n_obs;
356 for (i = 1; i < 1 + cache->n_indeps; i++)
358 tmp -= gsl_matrix_get (cache->cov, 0, i)
359 * gsl_vector_get (cache->indep_means, i - 1);
361 gsl_matrix_set (cache->cov, 0, 0, tmp);
363 cache->coeff[0]->estimate = m;
367 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
368 __FILE__, __LINE__, gsl_strerror (rc));
371 gsl_matrix_free (sw);
375 gsl_multifit_linear_workspace *wk;
377 Use QR decomposition via GSL.
380 param_estimates = gsl_vector_alloc (1 + X->size2);
381 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
383 for (j = 0; j < X->size1; j++)
385 gsl_matrix_set (design, j, 0, 1.0);
386 for (i = 0; i < X->size2; i++)
388 tmp = gsl_matrix_get (X, j, i);
389 gsl_matrix_set (design, j, i + 1, tmp);
393 wk = gsl_multifit_linear_alloc (design->size1, design->size2);
394 rc = gsl_multifit_linear (design, Y, param_estimates,
395 cache->cov, &(cache->sse), wk);
396 for (i = 0; i < cache->n_coeffs; i++)
398 cache->coeff[i]->estimate = gsl_vector_get (param_estimates, i);
400 if (rc == GSL_SUCCESS)
402 gsl_multifit_linear_free (wk);
403 gsl_vector_free (param_estimates);
407 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
408 __FILE__, __LINE__, rc);
413 cache->ssm = cache->sst - cache->sse;
415 Get the remaining sums of squares for the independent
419 for (i = 1; i < cache->n_indeps; i++)
422 m += gsl_vector_get (cache->ss_indeps, j);
423 tmp = cache->ssm - m;
424 gsl_vector_set (cache->ss_indeps, i, tmp);
427 gsl_matrix_free (design);