1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc. Written by Jason H. Stover.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include <gsl/gsl_fit.h>
19 #include <gsl/gsl_multifit.h>
21 #include <gsl/gsl_blas.h>
22 #include <gsl/gsl_cblas.h>
27 Find the least-squares estimate of b for the linear model:
31 where Y is an n-by-1 column vector, X is an n-by-p matrix of
32 independent variables, b is a p-by-1 vector of regression coefficients,
33 and Z is an n-by-1 normally-distributed random vector with independent
34 identically distributed components with mean 0.
36 This estimate is found via the sweep operator or singular-value
37 decomposition with gsl.
42 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
43 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
45 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
48 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
49 Springer. 1998. ISBN 0-387-98542-5.
52 #include <math/linreg/linreg.h>
53 #include <math/coefficient.h>
54 #include <gsl/gsl_errno.h>
55 #include <linreg/sweep.h>
57 Get the mean and standard deviation of a vector
58 of doubles via a form of the Kalman filter as
59 described on page 32 of [3].
62 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
71 mean = gsl_vector_get (&v.vector, 0);
73 for (i = 1; i < v.vector.size; i++)
76 tmp = gsl_vector_get (&v.vector, i);
79 variance += j * (j - 1.0) * d * d;
82 *sp = sqrt (variance / (j - 1.0));
89 Set V to contain an array of pointers to the variables
90 used in the model. V must be at least C->N_COEFFS in length.
91 The return value is the number of distinct variables found.
94 pspp_linreg_get_vars (const void *c_, const struct variable **v)
96 const pspp_linreg_cache *c = c_;
97 struct pspp_coeff *coef = NULL;
98 const struct variable *tmp;
103 Make sure the caller doesn't try to sneak a variable
104 into V that is not in the model.
106 for (i = 0; i < c->n_coeffs; i++)
111 Start at c->coeff[1] to avoid the intercept.
113 v[result] = pspp_coeff_get_var (c->coeff[1], 0);
114 result = (v[result] == NULL) ? 0 : 1;
116 for (coef = c->coeff[2]; coef < c->coeff[c->n_coeffs]; coef++)
118 tmp = pspp_coeff_get_var (coef, 0);
119 assert (tmp != NULL);
120 /* Repeated variables are likely to bunch together, at the end
123 while (i >= 0 && v[i] != tmp)
127 if (i < 0 && result < c->n_coeffs)
137 Allocate a pspp_linreg_cache and return a pointer
138 to it. n is the number of cases, p is the number of
139 independent variables.
142 pspp_linreg_cache_alloc (size_t n, size_t p)
144 pspp_linreg_cache *c;
146 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
148 c->indep_means = gsl_vector_alloc (p);
149 c->indep_std = gsl_vector_alloc (p);
150 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
151 independent variables.
153 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
156 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
162 c->method = PSPP_LINREG_SWEEP;
163 c->predict = pspp_linreg_predict;
164 c->residual = pspp_linreg_residual; /* The procedure to compute my
166 c->get_vars = pspp_linreg_get_vars; /* The procedure that returns
169 c->resid = NULL; /* The variable storing my residuals. */
170 c->pred = NULL; /* The variable storing my predicted values. */
176 pspp_linreg_cache_free (void *m)
180 pspp_linreg_cache *c = m;
183 gsl_vector_free (c->indep_means);
184 gsl_vector_free (c->indep_std);
185 gsl_vector_free (c->ss_indeps);
186 gsl_matrix_free (c->cov);
187 gsl_vector_free (c->ssx);
188 for (i = 0; i < c->n_coeffs; i++)
190 pspp_coeff_free (c->coeff[i]);
198 Fit the linear model via least squares. All pointers passed to pspp_linreg
199 are assumed to be allocated to the correct size and initialized to the
200 values as indicated by opts.
203 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
204 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
210 gsl_matrix_view xmxtx;
214 gsl_vector *param_estimates;
227 if (opts->get_depvar_mean_std)
229 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
231 cache->depvar_mean = m;
232 cache->depvar_std = s;
235 for (i = 0; i < cache->n_indeps; i++)
237 if (opts->get_indep_mean_std[i])
239 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
240 gsl_vector_set (cache->indep_means, i, m);
241 gsl_vector_set (cache->indep_std, i, s);
242 gsl_vector_set (cache->ssx, i, ss);
245 cache->dft = cache->n_obs - 1;
246 cache->dfm = cache->n_indeps;
247 cache->dfe = cache->dft - cache->dfm;
248 cache->n_coeffs = X->size2 + 1; /* Adjust this later to allow for
249 regression through the origin.
251 if (cache->method == PSPP_LINREG_SWEEP)
255 Subtract the means to improve the condition of the design
256 matrix. This requires copying X and Y. We do not divide by the
257 standard deviations of the independent variables here since doing
258 so would cause a miscalculation of the residual sums of
259 squares. Dividing by the standard deviation is done GSL's linear
260 regression functions, so if the design matrix has a poor
261 condition, use QR decomposition.
263 The design matrix here does not include a column for the intercept
264 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
265 so design is allocated here when sweeping, or below if using QR.
267 design = gsl_matrix_alloc (X->size1, X->size2);
268 for (i = 0; i < X->size2; i++)
270 m = gsl_vector_get (cache->indep_means, i);
271 for (j = 0; j < X->size1; j++)
273 tmp = (gsl_matrix_get (X, j, i) - m);
274 gsl_matrix_set (design, j, i, tmp);
277 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
278 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
280 for (i = 0; i < xtx.matrix.size1; i++)
282 tmp = gsl_vector_get (cache->ssx, i);
283 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
284 xi = gsl_matrix_column (design, i);
285 for (j = (i + 1); j < xtx.matrix.size2; j++)
287 xj = gsl_matrix_column (design, j);
288 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
289 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
293 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
294 xty = gsl_matrix_column (sw, cache->n_indeps);
296 This loop starts at 1, with i=0 outside the loop, so we can get
297 the model sum of squares due to the first independent variable.
299 xi = gsl_matrix_column (design, 0);
300 gsl_blas_ddot (&(xi.vector), Y, &tmp);
301 gsl_vector_set (&(xty.vector), 0, tmp);
302 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
303 gsl_vector_set (cache->ss_indeps, 0, tmp);
304 for (i = 1; i < cache->n_indeps; i++)
306 xi = gsl_matrix_column (design, i);
307 gsl_blas_ddot (&(xi.vector), Y, &tmp);
308 gsl_vector_set (&(xty.vector), i, tmp);
312 Sweep on the matrix sw, which contains XtX, XtY and YtY.
315 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
316 cache->mse = cache->sse / cache->dfe;
320 m = cache->depvar_mean;
321 for (i = 0; i < cache->n_indeps; i++)
323 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
324 cache->coeff[i + 1]->estimate = tmp;
325 m -= tmp * gsl_vector_get (cache->indep_means, i);
328 Get the covariance matrix of the parameter estimates.
329 Only the upper triangle is necessary.
333 The loops below do not compute the entries related
334 to the estimated intercept.
336 for (i = 0; i < cache->n_indeps; i++)
337 for (j = i; j < cache->n_indeps; j++)
339 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
340 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
343 Get the covariances related to the intercept.
345 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
346 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
347 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
348 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
349 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
350 if (rc == GSL_SUCCESS)
352 tmp = cache->mse / cache->n_obs;
353 for (i = 1; i < 1 + cache->n_indeps; i++)
355 tmp -= gsl_matrix_get (cache->cov, 0, i)
356 * gsl_vector_get (cache->indep_means, i - 1);
358 gsl_matrix_set (cache->cov, 0, 0, tmp);
360 cache->coeff[0]->estimate = m;
364 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
365 __FILE__, __LINE__, gsl_strerror (rc));
368 gsl_matrix_free (sw);
370 else if (cache->method == PSPP_LINREG_CONDITIONAL_INVERSE)
373 Use the SVD of X^T X to find a conditional inverse of X^TX. If
374 the SVD is X^T X = U D V^T, then set the conditional inverse
375 to (X^T X)^c = V D^- U^T. D^- is defined as follows: If entry
376 (i, i) has value sigma_i, then entry (i, i) of D^- is 1 /
377 sigma_i if sigma_i > 0, and 0 otherwise. Then solve the normal
378 equations by setting the estimated parameter vector to
384 gsl_multifit_linear_workspace *wk;
386 Use QR decomposition via GSL.
389 param_estimates = gsl_vector_alloc (1 + X->size2);
390 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
392 for (j = 0; j < X->size1; j++)
394 gsl_matrix_set (design, j, 0, 1.0);
395 for (i = 0; i < X->size2; i++)
397 tmp = gsl_matrix_get (X, j, i);
398 gsl_matrix_set (design, j, i + 1, tmp);
402 wk = gsl_multifit_linear_alloc (design->size1, design->size2);
403 rc = gsl_multifit_linear (design, Y, param_estimates,
404 cache->cov, &(cache->sse), wk);
405 for (i = 0; i < cache->n_coeffs; i++)
407 cache->coeff[i]->estimate = gsl_vector_get (param_estimates, i);
409 if (rc == GSL_SUCCESS)
411 gsl_multifit_linear_free (wk);
412 gsl_vector_free (param_estimates);
416 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
417 __FILE__, __LINE__, rc);
422 cache->ssm = cache->sst - cache->sse;
424 Get the remaining sums of squares for the independent
428 for (i = 1; i < cache->n_indeps; i++)
431 m += gsl_vector_get (cache->ss_indeps, j);
432 tmp = cache->ssm - m;
433 gsl_vector_set (cache->ss_indeps, i, tmp);
436 gsl_matrix_free (design);