1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc. Written by Jason H. Stover.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include <gsl/gsl_fit.h>
19 #include <gsl/gsl_multifit.h>
21 #include <gsl/gsl_blas.h>
22 #include <gsl/gsl_cblas.h>
27 Find the least-squares estimate of b for the linear model:
31 where Y is an n-by-1 column vector, X is an n-by-p matrix of
32 independent variables, b is a p-by-1 vector of regression coefficients,
33 and Z is an n-by-1 normally-distributed random vector with independent
34 identically distributed components with mean 0.
36 This estimate is found via the sweep operator or singular-value
37 decomposition with gsl.
42 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
43 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
45 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
48 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
49 Springer. 1998. ISBN 0-387-98542-5.
52 #include <math/linreg/linreg.h>
53 #include <math/coefficient.h>
54 #include <gsl/gsl_errno.h>
55 #include <linreg/sweep.h>
57 Get the mean and standard deviation of a vector
58 of doubles via a form of the Kalman filter as
59 described on page 32 of [3].
62 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
71 mean = gsl_vector_get (&v.vector, 0);
73 for (i = 1; i < v.vector.size; i++)
76 tmp = gsl_vector_get (&v.vector, i);
79 variance += j * (j - 1.0) * d * d;
82 *sp = sqrt (variance / (j - 1.0));
89 Set V to contain an array of pointers to the variables
90 used in the model. V must be at least C->N_COEFFS in length.
91 The return value is the number of distinct variables found.
94 pspp_linreg_get_vars (const void *c_, const struct variable **v)
96 const pspp_linreg_cache *c = c_;
97 struct pspp_coeff *coef = NULL;
98 const struct variable *tmp;
103 Make sure the caller doesn't try to sneak a variable
104 into V that is not in the model.
106 for (i = 0; i < c->n_coeffs; i++)
111 Start at c->coeff[1] to avoid the intercept.
113 v[result] = pspp_coeff_get_var (c->coeff[1], 0);
114 result = (v[result] == NULL) ? 0 : 1;
116 for (coef = c->coeff[2]; coef < c->coeff[c->n_coeffs]; coef++)
118 tmp = pspp_coeff_get_var (coef, 0);
119 assert (tmp != NULL);
120 /* Repeated variables are likely to bunch together, at the end
123 while (i >= 0 && v[i] != tmp)
127 if (i < 0 && result < c->n_coeffs)
137 Allocate a pspp_linreg_cache and return a pointer
138 to it. n is the number of cases, p is the number of
139 independent variables.
142 pspp_linreg_cache_alloc (size_t n, size_t p)
144 pspp_linreg_cache *c;
146 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
148 c->indep_means = gsl_vector_alloc (p);
149 c->indep_std = gsl_vector_alloc (p);
150 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
151 independent variables.
153 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
156 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
162 c->method = PSPP_LINREG_SWEEP;
163 c->predict = pspp_linreg_predict;
164 c->residual = pspp_linreg_residual; /* The procedure to compute my
166 c->get_vars = pspp_linreg_get_vars; /* The procedure that returns
169 c->resid = NULL; /* The variable storing my residuals. */
170 c->pred = NULL; /* The variable storing my predicted values. */
176 pspp_linreg_cache_free (void *m)
180 pspp_linreg_cache *c = m;
183 gsl_vector_free (c->indep_means);
184 gsl_vector_free (c->indep_std);
185 gsl_vector_free (c->ss_indeps);
186 gsl_matrix_free (c->cov);
187 gsl_vector_free (c->ssx);
188 for (i = 0; i < c->n_coeffs; i++)
190 pspp_coeff_free (c->coeff[i]);
199 Fit the linear model via least squares. All pointers passed to pspp_linreg
200 are assumed to be allocated to the correct size and initialized to the
201 values as indicated by opts.
204 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
205 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
208 gsl_matrix *design = NULL;
211 gsl_matrix_view xmxtx;
215 gsl_vector *param_estimates;
228 if (opts->get_depvar_mean_std)
230 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
232 cache->depvar_mean = m;
233 cache->depvar_std = s;
236 for (i = 0; i < cache->n_indeps; i++)
238 if (opts->get_indep_mean_std[i])
240 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
241 gsl_vector_set (cache->indep_means, i, m);
242 gsl_vector_set (cache->indep_std, i, s);
243 gsl_vector_set (cache->ssx, i, ss);
246 cache->dft = cache->n_obs - 1;
247 cache->dfm = cache->n_indeps;
248 cache->dfe = cache->dft - cache->dfm;
249 cache->n_coeffs = X->size2 + 1; /* Adjust this later to allow for
250 regression through the origin.
252 if (cache->method == PSPP_LINREG_SWEEP)
256 Subtract the means to improve the condition of the design
257 matrix. This requires copying X and Y. We do not divide by the
258 standard deviations of the independent variables here since doing
259 so would cause a miscalculation of the residual sums of
260 squares. Dividing by the standard deviation is done GSL's linear
261 regression functions, so if the design matrix has a poor
262 condition, use QR decomposition.
264 The design matrix here does not include a column for the intercept
265 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
266 so design is allocated here when sweeping, or below if using QR.
268 design = gsl_matrix_alloc (X->size1, X->size2);
269 for (i = 0; i < X->size2; i++)
271 m = gsl_vector_get (cache->indep_means, i);
272 for (j = 0; j < X->size1; j++)
274 tmp = (gsl_matrix_get (X, j, i) - m);
275 gsl_matrix_set (design, j, i, tmp);
278 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
279 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
281 for (i = 0; i < xtx.matrix.size1; i++)
283 tmp = gsl_vector_get (cache->ssx, i);
284 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
285 xi = gsl_matrix_column (design, i);
286 for (j = (i + 1); j < xtx.matrix.size2; j++)
288 xj = gsl_matrix_column (design, j);
289 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
290 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
294 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
295 xty = gsl_matrix_column (sw, cache->n_indeps);
297 This loop starts at 1, with i=0 outside the loop, so we can get
298 the model sum of squares due to the first independent variable.
300 xi = gsl_matrix_column (design, 0);
301 gsl_blas_ddot (&(xi.vector), Y, &tmp);
302 gsl_vector_set (&(xty.vector), 0, tmp);
303 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
304 gsl_vector_set (cache->ss_indeps, 0, tmp);
305 for (i = 1; i < cache->n_indeps; i++)
307 xi = gsl_matrix_column (design, i);
308 gsl_blas_ddot (&(xi.vector), Y, &tmp);
309 gsl_vector_set (&(xty.vector), i, tmp);
313 Sweep on the matrix sw, which contains XtX, XtY and YtY.
316 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
317 cache->mse = cache->sse / cache->dfe;
321 m = cache->depvar_mean;
322 for (i = 0; i < cache->n_indeps; i++)
324 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
325 cache->coeff[i + 1]->estimate = tmp;
326 m -= tmp * gsl_vector_get (cache->indep_means, i);
329 Get the covariance matrix of the parameter estimates.
330 Only the upper triangle is necessary.
334 The loops below do not compute the entries related
335 to the estimated intercept.
337 for (i = 0; i < cache->n_indeps; i++)
338 for (j = i; j < cache->n_indeps; j++)
340 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
341 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
344 Get the covariances related to the intercept.
346 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
347 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
348 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
349 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
350 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
351 if (rc == GSL_SUCCESS)
353 tmp = cache->mse / cache->n_obs;
354 for (i = 1; i < 1 + cache->n_indeps; i++)
356 tmp -= gsl_matrix_get (cache->cov, 0, i)
357 * gsl_vector_get (cache->indep_means, i - 1);
359 gsl_matrix_set (cache->cov, 0, 0, tmp);
361 cache->coeff[0]->estimate = m;
365 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
366 __FILE__, __LINE__, gsl_strerror (rc));
369 gsl_matrix_free (sw);
371 else if (cache->method == PSPP_LINREG_CONDITIONAL_INVERSE)
374 Use the SVD of X^T X to find a conditional inverse of X^TX. If
375 the SVD is X^T X = U D V^T, then set the conditional inverse
376 to (X^T X)^c = V D^- U^T. D^- is defined as follows: If entry
377 (i, i) has value sigma_i, then entry (i, i) of D^- is 1 /
378 sigma_i if sigma_i > 0, and 0 otherwise. Then solve the normal
379 equations by setting the estimated parameter vector to
385 gsl_multifit_linear_workspace *wk;
387 Use QR decomposition via GSL.
390 param_estimates = gsl_vector_alloc (1 + X->size2);
391 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
393 for (j = 0; j < X->size1; j++)
395 gsl_matrix_set (design, j, 0, 1.0);
396 for (i = 0; i < X->size2; i++)
398 tmp = gsl_matrix_get (X, j, i);
399 gsl_matrix_set (design, j, i + 1, tmp);
403 wk = gsl_multifit_linear_alloc (design->size1, design->size2);
404 rc = gsl_multifit_linear (design, Y, param_estimates,
405 cache->cov, &(cache->sse), wk);
406 for (i = 0; i < cache->n_coeffs; i++)
408 cache->coeff[i]->estimate = gsl_vector_get (param_estimates, i);
410 if (rc == GSL_SUCCESS)
412 gsl_multifit_linear_free (wk);
413 gsl_vector_free (param_estimates);
417 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
418 __FILE__, __LINE__, rc);
423 cache->ssm = cache->sst - cache->sse;
425 Get the remaining sums of squares for the independent
429 for (i = 1; i < cache->n_indeps; i++)
432 m += gsl_vector_get (cache->ss_indeps, j);
433 tmp = cache->ssm - m;
434 gsl_vector_set (cache->ss_indeps, i, tmp);
437 gsl_matrix_free (design);