1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include <gsl/gsl_fit.h>
19 #include <gsl/gsl_multifit.h>
21 #include <gsl/gsl_blas.h>
22 #include <gsl/gsl_cblas.h>
27 Find the least-squares estimate of b for the linear model:
31 where Y is an n-by-1 column vector, X is an n-by-p matrix of
32 independent variables, b is a p-by-1 vector of regression coefficients,
33 and Z is an n-by-1 normally-distributed random vector with independent
34 identically distributed components with mean 0.
36 This estimate is found via the sweep operator or singular-value
37 decomposition with gsl.
42 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
43 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
45 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
48 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
49 Springer. 1998. ISBN 0-387-98542-5.
52 #include <math/linreg/linreg.h>
53 #include <math/coefficient.h>
54 #include <gsl/gsl_errno.h>
55 #include <linreg/sweep.h>
57 Get the mean and standard deviation of a vector
58 of doubles via a form of the Kalman filter as
59 described on page 32 of [3].
62 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
71 mean = gsl_vector_get (&v.vector, 0);
73 for (i = 1; i < v.vector.size; i++)
76 tmp = gsl_vector_get (&v.vector, i);
79 variance += j * (j - 1.0) * d * d;
82 *sp = sqrt (variance / (j - 1.0));
89 Set V to contain an array of pointers to the variables
90 used in the model. V must be at least C->N_COEFFS in length.
91 The return value is the number of distinct variables found.
94 pspp_linreg_get_vars (const void *c_, const struct variable **v)
96 const pspp_linreg_cache *c = c_;
97 const struct variable *tmp;
103 Make sure the caller doesn't try to sneak a variable
104 into V that is not in the model.
106 for (i = 0; i < c->n_coeffs; i++)
110 for (j = 0; j < c->n_coeffs; j++)
112 tmp = pspp_coeff_get_var (c->coeff[j], 0);
113 assert (tmp != NULL);
114 /* Repeated variables are likely to bunch together, at the end
117 while (i >= 0 && v[i] != tmp)
121 if (i < 0 && result < c->n_coeffs)
131 Allocate a pspp_linreg_cache and return a pointer
132 to it. n is the number of cases, p is the number of
133 independent variables.
136 pspp_linreg_cache_alloc (size_t n, size_t p)
138 pspp_linreg_cache *c;
140 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
142 c->indep_means = gsl_vector_alloc (p);
143 c->indep_std = gsl_vector_alloc (p);
144 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
145 independent variables.
147 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
150 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
156 c->method = PSPP_LINREG_SWEEP;
157 c->predict = pspp_linreg_predict;
158 c->residual = pspp_linreg_residual; /* The procedure to compute my
160 c->get_vars = pspp_linreg_get_vars; /* The procedure that returns
163 c->resid = NULL; /* The variable storing my residuals. */
164 c->pred = NULL; /* The variable storing my predicted values. */
170 pspp_linreg_cache_free (void *m)
174 pspp_linreg_cache *c = m;
177 gsl_vector_free (c->indep_means);
178 gsl_vector_free (c->indep_std);
179 gsl_vector_free (c->ss_indeps);
180 gsl_matrix_free (c->cov);
181 gsl_vector_free (c->ssx);
182 for (i = 0; i < c->n_coeffs; i++)
184 pspp_coeff_free (c->coeff[i]);
193 Fit the linear model via least squares. All pointers passed to pspp_linreg
194 are assumed to be allocated to the correct size and initialized to the
195 values as indicated by opts.
198 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
199 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
202 gsl_matrix *design = NULL;
205 gsl_matrix_view xmxtx;
209 gsl_vector *param_estimates;
222 if (opts->get_depvar_mean_std)
224 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
226 cache->depvar_mean = m;
227 cache->depvar_std = s;
230 for (i = 0; i < cache->n_indeps; i++)
232 if (opts->get_indep_mean_std[i])
234 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
235 gsl_vector_set (cache->indep_means, i, m);
236 gsl_vector_set (cache->indep_std, i, s);
237 gsl_vector_set (cache->ssx, i, ss);
240 cache->dft = cache->n_obs - 1;
241 cache->dfm = cache->n_indeps;
242 cache->dfe = cache->dft - cache->dfm;
243 cache->n_coeffs = X->size2;
244 cache->intercept = 0.0;
246 if (cache->method == PSPP_LINREG_SWEEP)
250 Subtract the means to improve the condition of the design
251 matrix. This requires copying X and Y. We do not divide by the
252 standard deviations of the independent variables here since doing
253 so would cause a miscalculation of the residual sums of
254 squares. Dividing by the standard deviation is done GSL's linear
255 regression functions, so if the design matrix has a poor
256 condition, use QR decomposition.
258 The design matrix here does not include a column for the intercept
259 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
260 so design is allocated here when sweeping, or below if using QR.
262 design = gsl_matrix_alloc (X->size1, X->size2);
263 for (i = 0; i < X->size2; i++)
265 m = gsl_vector_get (cache->indep_means, i);
266 for (j = 0; j < X->size1; j++)
268 tmp = (gsl_matrix_get (X, j, i) - m);
269 gsl_matrix_set (design, j, i, tmp);
272 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
273 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
275 for (i = 0; i < xtx.matrix.size1; i++)
277 tmp = gsl_vector_get (cache->ssx, i);
278 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
279 xi = gsl_matrix_column (design, i);
280 for (j = (i + 1); j < xtx.matrix.size2; j++)
282 xj = gsl_matrix_column (design, j);
283 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
284 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
288 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
289 xty = gsl_matrix_column (sw, cache->n_indeps);
291 This loop starts at 1, with i=0 outside the loop, so we can get
292 the model sum of squares due to the first independent variable.
294 xi = gsl_matrix_column (design, 0);
295 gsl_blas_ddot (&(xi.vector), Y, &tmp);
296 gsl_vector_set (&(xty.vector), 0, tmp);
297 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
298 gsl_vector_set (cache->ss_indeps, 0, tmp);
299 for (i = 1; i < cache->n_indeps; i++)
301 xi = gsl_matrix_column (design, i);
302 gsl_blas_ddot (&(xi.vector), Y, &tmp);
303 gsl_vector_set (&(xty.vector), i, tmp);
307 Sweep on the matrix sw, which contains XtX, XtY and YtY.
310 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
311 cache->mse = cache->sse / cache->dfe;
315 m = cache->depvar_mean;
316 for (i = 0; i < cache->n_indeps; i++)
318 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
319 cache->coeff[i]->estimate = tmp;
320 m -= tmp * gsl_vector_get (cache->indep_means, i);
323 Get the covariance matrix of the parameter estimates.
324 Only the upper triangle is necessary.
328 The loops below do not compute the entries related
329 to the estimated intercept.
331 for (i = 0; i < cache->n_indeps; i++)
332 for (j = i; j < cache->n_indeps; j++)
334 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
335 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
338 Get the covariances related to the intercept.
340 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
341 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
342 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
343 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
344 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
345 if (rc == GSL_SUCCESS)
347 tmp = cache->mse / cache->n_obs;
348 for (i = 1; i < 1 + cache->n_indeps; i++)
350 tmp -= gsl_matrix_get (cache->cov, 0, i)
351 * gsl_vector_get (cache->indep_means, i - 1);
353 gsl_matrix_set (cache->cov, 0, 0, tmp);
355 cache->intercept = m;
359 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
360 __FILE__, __LINE__, gsl_strerror (rc));
363 gsl_matrix_free (sw);
365 else if (cache->method == PSPP_LINREG_CONDITIONAL_INVERSE)
368 Use the SVD of X^T X to find a conditional inverse of X^TX. If
369 the SVD is X^T X = U D V^T, then set the conditional inverse
370 to (X^T X)^c = V D^- U^T. D^- is defined as follows: If entry
371 (i, i) has value sigma_i, then entry (i, i) of D^- is 1 /
372 sigma_i if sigma_i > 0, and 0 otherwise. Then solve the normal
373 equations by setting the estimated parameter vector to
379 gsl_multifit_linear_workspace *wk;
381 Use QR decomposition via GSL.
384 param_estimates = gsl_vector_alloc (1 + X->size2);
385 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
387 for (j = 0; j < X->size1; j++)
389 gsl_matrix_set (design, j, 0, 1.0);
390 for (i = 0; i < X->size2; i++)
392 tmp = gsl_matrix_get (X, j, i);
393 gsl_matrix_set (design, j, i + 1, tmp);
397 wk = gsl_multifit_linear_alloc (design->size1, design->size2);
398 rc = gsl_multifit_linear (design, Y, param_estimates,
399 cache->cov, &(cache->sse), wk);
400 for (i = 0; i < cache->n_coeffs; i++)
402 cache->coeff[i]->estimate = gsl_vector_get (param_estimates, i + 1);
404 cache->intercept = gsl_vector_get (param_estimates, 0);
405 if (rc == GSL_SUCCESS)
407 gsl_multifit_linear_free (wk);
408 gsl_vector_free (param_estimates);
412 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
413 __FILE__, __LINE__, rc);
418 cache->ssm = cache->sst - cache->sse;
420 Get the remaining sums of squares for the independent
424 for (i = 1; i < cache->n_indeps; i++)
427 m += gsl_vector_get (cache->ss_indeps, j);
428 tmp = cache->ssm - m;
429 gsl_vector_set (cache->ss_indeps, i, tmp);
432 gsl_matrix_free (design);