1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "math/linreg.h"
21 #include <gsl/gsl_blas.h>
22 #include <gsl/gsl_cblas.h>
23 #include <gsl/gsl_errno.h>
24 #include <gsl/gsl_fit.h>
25 #include <gsl/gsl_linalg.h>
26 #include <gsl/gsl_multifit.h>
28 #include "data/value.h"
29 #include "data/variable.h"
30 #include "linreg/sweep.h"
32 #include "gl/xalloc.h"
35 Find the least-squares estimate of b for the linear model:
39 where Y is an n-by-1 column vector, X is an n-by-p matrix of
40 independent variables, b is a p-by-1 vector of regression coefficients,
41 and Z is an n-by-1 normally-distributed random vector with independent
42 identically distributed components with mean 0.
44 This estimate is found via the sweep operator or singular-value
45 decomposition with gsl.
50 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
51 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
53 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
56 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
57 Springer. 1998. ISBN 0-387-98542-5.
61 const struct variable **
62 linreg_get_vars (const linreg *c)
68 Allocate a linreg and return a pointer to it. n is the number of
69 cases, p is the number of independent variables.
72 linreg_alloc (const struct variable *depvar, const struct variable **indep_vars,
78 c = xmalloc (sizeof (*c));
80 c->indep_vars = xnmalloc (p, sizeof (*indep_vars));
81 c->dependent_column = p;
82 for (i = 0; i < p; i++)
84 c->indep_vars[i] = indep_vars[i];
86 c->indep_means = gsl_vector_alloc (p);
87 c->indep_std = gsl_vector_alloc (p);
89 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
95 c->coeff = xnmalloc (p, sizeof (*c->coeff));
96 c->cov = gsl_matrix_calloc (c->n_coeffs + 1, c->n_coeffs + 1);
99 c->dfe = c->dft - c->dfm;
101 c->depvar_mean = 0.0;
106 c->method = LINREG_SWEEP;
114 linreg_free (void *m)
119 gsl_vector_free (c->indep_means);
120 gsl_vector_free (c->indep_std);
121 gsl_vector_free (c->ss_indeps);
122 gsl_matrix_free (c->cov);
123 free (c->indep_vars);
131 post_sweep_computations (linreg *l, gsl_matrix *sw)
135 gsl_matrix_view xmxtx;
145 l->sse = gsl_matrix_get (sw, l->n_indeps, l->n_indeps);
146 l->mse = l->sse / l->dfe;
151 for (i = 0; i < l->n_indeps; i++)
153 tmp = gsl_matrix_get (sw, i, l->n_indeps);
155 m -= tmp * linreg_get_indep_variable_mean (l, i);
158 Get the covariance matrix of the parameter estimates.
159 Only the upper triangle is necessary.
163 The loops below do not compute the entries related
164 to the estimated intercept.
166 for (i = 0; i < l->n_indeps; i++)
167 for (j = i; j < l->n_indeps; j++)
169 tmp = -1.0 * l->mse * gsl_matrix_get (sw, i, j);
170 gsl_matrix_set (l->cov, i + 1, j + 1, tmp);
173 Get the covariances related to the intercept.
175 xtx = gsl_matrix_submatrix (sw, 0, 0, l->n_indeps, l->n_indeps);
176 xmxtx = gsl_matrix_submatrix (l->cov, 0, 1, 1, l->n_indeps);
177 xm = gsl_matrix_calloc (1, l->n_indeps);
178 for (i = 0; i < xm->size2; i++)
180 gsl_matrix_set (xm, 0, i,
181 linreg_get_indep_variable_mean (l, i));
183 rc = gsl_blas_dsymm (CblasRight, CblasUpper, l->mse,
184 &xtx.matrix, xm, 0.0, &xmxtx.matrix);
185 gsl_matrix_free (xm);
186 if (rc == GSL_SUCCESS)
188 tmp = l->mse / l->n_obs;
189 for (i = 1; i < 1 + l->n_indeps; i++)
191 tmp -= gsl_matrix_get (l->cov, 0, i)
192 * linreg_get_indep_variable_mean (l, i - 1);
194 gsl_matrix_set (l->cov, 0, 0, tmp);
200 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
201 __FILE__, __LINE__, gsl_strerror (rc));
207 Predict the value of the dependent variable with the new set of
208 predictors. VALS are assumed to be in the order corresponding to the
209 order of the coefficients in the linreg struct.
212 linreg_predict (const linreg *c, const double *vals, size_t n_vals)
217 assert (n_vals = c->n_coeffs);
218 if (vals == NULL || c == NULL)
222 if (c->coeff == NULL)
224 /* The stupid model: just guess the mean. */
225 return c->depvar_mean;
227 result = c->intercept;
229 for (j = 0; j < n_vals; j++)
231 result += linreg_coeff (c, j) * vals[j];
238 linreg_residual (const linreg *c, double obs, const double *vals, size_t n_vals)
240 if (vals == NULL || c == NULL)
244 return (obs - linreg_predict (c, vals, n_vals));
247 double linreg_get_indep_variable_sd (linreg *c, size_t j)
250 return gsl_vector_get (c->indep_std, j);
253 void linreg_set_indep_variable_sd (linreg *c, size_t j, double s)
256 gsl_vector_set (c->indep_std, j, s);
260 Mean of the independent variable.
262 double linreg_get_indep_variable_mean (linreg *c, size_t j)
265 return gsl_vector_get (c->indep_means, j);
268 void linreg_set_indep_variable_mean (linreg *c, size_t j, double m)
271 gsl_vector_set (c->indep_means, j, m);
275 linreg_fit_qr (const gsl_matrix *cov, linreg *l)
277 double intcpt_coef = 0.0;
278 double intercept_variance = 0.0;
289 xtx = gsl_matrix_alloc (cov->size1 - 1, cov->size2 - 1);
290 xty = gsl_vector_alloc (cov->size1 - 1);
291 tau = gsl_vector_alloc (cov->size1 - 1);
292 params = gsl_vector_alloc (cov->size1 - 1);
294 for (i = 0; i < xtx->size1; i++)
296 gsl_vector_set (xty, i, gsl_matrix_get (cov, cov->size2 - 1, i));
297 for (j = 0; j < xtx->size2; j++)
299 gsl_matrix_set (xtx, i, j, gsl_matrix_get (cov, i, j));
302 gsl_linalg_QR_decomp (xtx, tau);
303 q = gsl_matrix_alloc (xtx->size1, xtx->size2);
304 r = gsl_matrix_alloc (xtx->size1, xtx->size2);
306 gsl_linalg_QR_unpack (xtx, tau, q, r);
307 gsl_linalg_QR_solve (xtx, tau, xty, params);
308 for (i = 0; i < params->size; i++)
310 l->coeff[i] = gsl_vector_get (params, i);
312 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
314 for (i = 0; i < l->n_indeps; i++)
316 l->ssm += gsl_vector_get (xty, i) * l->coeff[i];
318 l->sse = l->sst - l->ssm;
320 gsl_blas_dtrsm (CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, linreg_mse (l),
322 /* Copy the lower triangle into the upper triangle. */
323 for (i = 0; i < q->size1; i++)
325 gsl_matrix_set (l->cov, i + 1, i + 1, gsl_matrix_get (q, i, i));
326 for (j = i + 1; j < q->size2; j++)
328 intercept_variance -= 2.0 * gsl_matrix_get (q, i, j) *
329 linreg_get_indep_variable_mean (l, i) *
330 linreg_get_indep_variable_mean (l, j);
331 gsl_matrix_set (q, i, j, gsl_matrix_get (q, j, i));
334 l->intercept = linreg_get_depvar_mean (l);
336 for (i = 0; i < l->n_indeps; i++)
338 tmp = linreg_get_indep_variable_mean (l, i);
339 l->intercept -= l->coeff[i] * tmp;
340 intercept_variance += tmp * tmp * gsl_matrix_get (q, i, i);
343 /* Covariances related to the intercept. */
344 intercept_variance += linreg_mse (l) / linreg_n_obs (l);
345 gsl_matrix_set (l->cov, 0, 0, intercept_variance);
346 for (i = 0; i < q->size1; i++)
348 for (j = 0; j < q->size2; j++)
350 intcpt_coef -= gsl_matrix_get (q, i, j)
351 * linreg_get_indep_variable_mean (l, j);
353 gsl_matrix_set (l->cov, 0, i + 1, intcpt_coef);
354 gsl_matrix_set (l->cov, i + 1, 0, intcpt_coef);
360 gsl_vector_free (xty);
361 gsl_vector_free (tau);
362 gsl_matrix_free (xtx);
363 gsl_vector_free (params);
367 Estimate the model parameters from the covariance matrix. This
368 function assumes the covariance entries corresponding to the
369 dependent variable are in the final row and column of the covariance
373 linreg_fit (const gsl_matrix *cov, linreg *l)
376 assert (cov != NULL);
378 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
379 if (l->method == LINREG_SWEEP)
382 params = gsl_matrix_calloc (cov->size1, cov->size2);
383 gsl_matrix_memcpy (params, cov);
384 reg_sweep (params, l->dependent_column);
385 post_sweep_computations (l, params);
386 gsl_matrix_free (params);
388 else if (l->method == LINREG_QR)
390 linreg_fit_qr (cov, l);
394 double linreg_mse (const linreg *c)
397 return (c->sse / c->dfe);
400 double linreg_intercept (const linreg *c)
406 linreg_cov (const linreg *c)
412 linreg_coeff (const linreg *c, size_t i)
414 return (c->coeff[i]);
417 const struct variable *
418 linreg_indep_var (const linreg *c, size_t i)
420 return (c->indep_vars[i]);
424 linreg_n_coeffs (const linreg *c)
430 linreg_n_obs (const linreg *c)
436 linreg_sse (const linreg *c)
442 linreg_ssreg (const linreg *c)
444 return (c->sst - c->sse);
447 double linreg_sst (const linreg *c)
453 linreg_dfmodel ( const linreg *c)
459 linreg_set_depvar_mean (linreg *c, double x)
465 linreg_get_depvar_mean (linreg *c)
467 return c->depvar_mean;