4 Copyright (C) 2005 Free Software Foundation, Inc. Written by Jason H. Stover.
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2 of the License, or (at your option)
11 This program is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
16 You should have received a copy of the GNU General Public License along with
17 this program; if not, write to the Free Software Foundation, Inc., 51
18 Franklin Street, Fifth Floor, Boston, MA 02111-1307, USA.
21 #include <gsl/gsl_fit.h>
22 #include <gsl/gsl_multifit.h>
24 #include <gsl/gsl_blas.h>
25 #include <gsl/gsl_cblas.h>
30 Find the least-squares estimate of b for the linear model:
34 where Y is an n-by-1 column vector, X is an n-by-p matrix of
35 independent variables, b is a p-by-1 vector of regression coefficients,
36 and Z is an n-by-1 normally-distributed random vector with independent
37 identically distributed components with mean 0.
39 This estimate is found via the sweep operator or singular-value
40 decomposition with gsl.
45 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
46 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
48 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
51 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
52 Springer. 1998. ISBN 0-387-98542-5.
55 #include <math/linreg/linreg.h>
56 #include <math/linreg/coefficient.h>
57 #include <gsl/gsl_errno.h>
58 #include <linreg/sweep.h>
60 Get the mean and standard deviation of a vector
61 of doubles via a form of the Kalman filter as
62 described on page 32 of [3].
65 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
74 mean = gsl_vector_get (&v.vector, 0);
76 for (i = 1; i < v.vector.size; i++)
79 tmp = gsl_vector_get (&v.vector, i);
82 variance += j * (j - 1.0) * d * d;
85 *sp = sqrt (variance / (j - 1.0));
91 Set V to contain an array of pointers to the variables
92 used in the model. V must be at least C->N_COEFFS in length.
93 The return value is the number of distinct variables found.
96 pspp_linreg_get_vars (const void *c_, struct variable **v)
98 const pspp_linreg_cache *c = c_;
99 struct pspp_linreg_coeff *coef = NULL;
100 const struct variable *tmp;
105 Make sure the caller doesn't try to sneak a variable
106 into V that is not in the model.
108 for (i = 0; i < c->n_coeffs; i++)
113 Start at c->coeff + 1 to avoid the intercept.
115 v[result] = (struct variable *) pspp_linreg_coeff_get_var (c->coeff + 1, 0);
116 result = (v[result] == NULL) ? 0 : 1;
118 for (coef = c->coeff + 2; coef < c->coeff + c->n_coeffs; coef++)
120 tmp = pspp_linreg_coeff_get_var (coef, 0);
121 assert (tmp != NULL);
122 /* Repeated variables are likely to bunch together, at the end
125 while (i >= 0 && (v[i]->index != tmp->index))
129 if (i < 0 && result < c->n_coeffs)
131 v[result] = (struct variable *) tmp;
139 Allocate a pspp_linreg_cache and return a pointer
140 to it. n is the number of cases, p is the number of
141 independent variables.
144 pspp_linreg_cache_alloc (size_t n, size_t p)
146 pspp_linreg_cache *c;
148 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
150 c->indep_means = gsl_vector_alloc (p);
151 c->indep_std = gsl_vector_alloc (p);
152 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
153 independent variables.
155 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
158 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
164 c->method = PSPP_LINREG_SWEEP;
165 c->predict = pspp_linreg_predict;
166 c->residual = pspp_linreg_residual; /* The procedure to compute my
168 c->get_vars = pspp_linreg_get_vars; /* The procedure that returns
171 c->resid = NULL; /* The variable storing my residuals. */
172 c->pred = NULL; /* The variable storing my predicted values. */
178 pspp_linreg_cache_free (void * m)
180 pspp_linreg_cache *c = m;
181 gsl_vector_free (c->indep_means);
182 gsl_vector_free (c->indep_std);
183 gsl_vector_free (c->ss_indeps);
184 gsl_matrix_free (c->cov);
185 pspp_linreg_coeff_free (c->coeff);
191 Fit the linear model via least squares. All pointers passed to pspp_linreg
192 are assumed to be allocated to the correct size and initialized to the
193 values as indicated by opts.
196 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
197 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
203 gsl_matrix_view xmxtx;
207 gsl_vector *param_estimates;
220 if (opts->get_depvar_mean_std)
222 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
224 cache->depvar_mean = m;
225 cache->depvar_std = s;
228 for (i = 0; i < cache->n_indeps; i++)
230 if (opts->get_indep_mean_std[i])
232 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
233 gsl_vector_set (cache->indep_means, i, m);
234 gsl_vector_set (cache->indep_std, i, s);
235 gsl_vector_set (cache->ssx, i, ss);
238 cache->dft = cache->n_obs - 1;
239 cache->dfm = cache->n_indeps;
240 cache->dfe = cache->dft - cache->dfm;
241 cache->n_coeffs = X->size2 + 1; /* Adjust this later to allow for
242 regression through the origin.
244 if (cache->method == PSPP_LINREG_SWEEP)
248 Subtract the means to improve the condition of the design
249 matrix. This requires copying X and Y. We do not divide by the
250 standard deviations of the independent variables here since doing
251 so would cause a miscalculation of the residual sums of
252 squares. Dividing by the standard deviation is done GSL's linear
253 regression functions, so if the design matrix has a poor
254 condition, use QR decomposition.
256 The design matrix here does not include a column for the intercept
257 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
258 so design is allocated here when sweeping, or below if using QR.
260 design = gsl_matrix_alloc (X->size1, X->size2);
261 for (i = 0; i < X->size2; i++)
263 m = gsl_vector_get (cache->indep_means, i);
264 for (j = 0; j < X->size1; j++)
266 tmp = (gsl_matrix_get (X, j, i) - m);
267 gsl_matrix_set (design, j, i, tmp);
270 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
271 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
273 for (i = 0; i < xtx.matrix.size1; i++)
275 tmp = gsl_vector_get (cache->ssx, i);
276 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
277 xi = gsl_matrix_column (design, i);
278 for (j = (i + 1); j < xtx.matrix.size2; j++)
280 xj = gsl_matrix_column (design, j);
281 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
282 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
286 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
287 xty = gsl_matrix_column (sw, cache->n_indeps);
289 This loop starts at 1, with i=0 outside the loop, so we can get
290 the model sum of squares due to the first independent variable.
292 xi = gsl_matrix_column (design, 0);
293 gsl_blas_ddot (&(xi.vector), Y, &tmp);
294 gsl_vector_set (&(xty.vector), 0, tmp);
295 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
296 gsl_vector_set (cache->ss_indeps, 0, tmp);
297 for (i = 1; i < cache->n_indeps; i++)
299 xi = gsl_matrix_column (design, i);
300 gsl_blas_ddot (&(xi.vector), Y, &tmp);
301 gsl_vector_set (&(xty.vector), i, tmp);
305 Sweep on the matrix sw, which contains XtX, XtY and YtY.
308 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
309 cache->mse = cache->sse / cache->dfe;
313 m = cache->depvar_mean;
314 for (i = 0; i < cache->n_indeps; i++)
316 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
317 cache->coeff[i + 1].estimate = tmp;
318 m -= tmp * gsl_vector_get (cache->indep_means, i);
321 Get the covariance matrix of the parameter estimates.
322 Only the upper triangle is necessary.
326 The loops below do not compute the entries related
327 to the estimated intercept.
329 for (i = 0; i < cache->n_indeps; i++)
330 for (j = i; j < cache->n_indeps; j++)
332 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
333 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
336 Get the covariances related to the intercept.
338 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
339 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
340 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
341 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
342 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
343 if (rc == GSL_SUCCESS)
345 tmp = cache->mse / cache->n_obs;
346 for (i = 1; i < 1 + cache->n_indeps; i++)
348 tmp -= gsl_matrix_get (cache->cov, 0, i)
349 * gsl_vector_get (cache->indep_means, i - 1);
351 gsl_matrix_set (cache->cov, 0, 0, tmp);
353 cache->coeff[0].estimate = m;
357 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
358 __FILE__, __LINE__, gsl_strerror (rc));
361 gsl_matrix_free (sw);
365 gsl_multifit_linear_workspace *wk;
367 Use QR decomposition via GSL.
370 param_estimates = gsl_vector_alloc (1 + X->size2);
371 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
373 for (j = 0; j < X->size1; j++)
375 gsl_matrix_set (design, j, 0, 1.0);
376 for (i = 0; i < X->size2; i++)
378 tmp = gsl_matrix_get (X, j, i);
379 gsl_matrix_set (design, j, i + 1, tmp);
383 wk = gsl_multifit_linear_alloc (design->size1, design->size2);
384 rc = gsl_multifit_linear (design, Y, param_estimates,
385 cache->cov, &(cache->sse), wk);
386 for (i = 0; i < cache->n_coeffs; i++)
388 cache->coeff[i].estimate = gsl_vector_get (param_estimates, i);
390 if (rc == GSL_SUCCESS)
392 gsl_multifit_linear_free (wk);
393 gsl_vector_free (param_estimates);
397 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
398 __FILE__, __LINE__, rc);
403 cache->ssm = cache->sst - cache->sse;
405 Get the remaining sums of squares for the independent
409 for (i = 1; i < cache->n_indeps; i++)
412 m += gsl_vector_get (cache->ss_indeps, j);
413 tmp = cache->ssm - m;
414 gsl_vector_set (cache->ss_indeps, i, tmp);
417 gsl_matrix_free (design);