1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "math/linreg.h"
21 #include <gsl/gsl_blas.h>
22 #include <gsl/gsl_cblas.h>
23 #include <gsl/gsl_errno.h>
24 #include <gsl/gsl_fit.h>
25 #include <gsl/gsl_linalg.h>
26 #include <gsl/gsl_multifit.h>
28 #include "data/value.h"
29 #include "data/variable.h"
30 #include "linreg/sweep.h"
32 #include "gl/xalloc.h"
35 Find the least-squares estimate of b for the linear model:
39 where Y is an n-by-1 column vector, X is an n-by-p matrix of
40 independent variables, b is a p-by-1 vector of regression coefficients,
41 and Z is an n-by-1 normally-distributed random vector with independent
42 identically distributed components with mean 0.
44 This estimate is found via the sweep operator or singular-value
45 decomposition with gsl.
50 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
51 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
53 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
56 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
57 Springer. 1998. ISBN 0-387-98542-5.
61 const struct variable **
62 linreg_get_vars (const linreg *c)
68 Allocate a linreg and return a pointer to it. n is the number of
69 cases, p is the number of independent variables.
72 linreg_alloc (const struct variable *depvar, const struct variable **indep_vars,
78 c = xmalloc (sizeof (*c));
80 c->indep_vars = xnmalloc (p, sizeof (*indep_vars));
81 c->dependent_column = p;
82 for (i = 0; i < p; i++)
84 c->indep_vars[i] = indep_vars[i];
86 c->indep_means = gsl_vector_alloc (p);
87 c->indep_std = gsl_vector_alloc (p);
89 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
95 c->coeff = xnmalloc (p, sizeof (*c->coeff));
96 c->cov = gsl_matrix_calloc (c->n_coeffs + 1, c->n_coeffs + 1);
99 c->dfe = c->dft - c->dfm;
101 c->depvar_mean = 0.0;
106 c->method = LINREG_SWEEP;
114 linreg_free (void *m)
119 gsl_vector_free (c->indep_means);
120 gsl_vector_free (c->indep_std);
121 gsl_matrix_free (c->cov);
122 free (c->indep_vars);
130 post_sweep_computations (linreg *l, gsl_matrix *sw)
134 gsl_matrix_view xmxtx;
144 l->sse = gsl_matrix_get (sw, l->n_indeps, l->n_indeps);
145 l->mse = l->sse / l->dfe;
150 for (i = 0; i < l->n_indeps; i++)
152 tmp = gsl_matrix_get (sw, i, l->n_indeps);
154 m -= tmp * linreg_get_indep_variable_mean (l, i);
157 Get the covariance matrix of the parameter estimates.
158 Only the upper triangle is necessary.
162 The loops below do not compute the entries related
163 to the estimated intercept.
165 for (i = 0; i < l->n_indeps; i++)
166 for (j = i; j < l->n_indeps; j++)
168 tmp = -1.0 * l->mse * gsl_matrix_get (sw, i, j);
169 gsl_matrix_set (l->cov, i + 1, j + 1, tmp);
172 Get the covariances related to the intercept.
174 xtx = gsl_matrix_submatrix (sw, 0, 0, l->n_indeps, l->n_indeps);
175 xmxtx = gsl_matrix_submatrix (l->cov, 0, 1, 1, l->n_indeps);
176 xm = gsl_matrix_calloc (1, l->n_indeps);
177 for (i = 0; i < xm->size2; i++)
179 gsl_matrix_set (xm, 0, i,
180 linreg_get_indep_variable_mean (l, i));
182 rc = gsl_blas_dsymm (CblasRight, CblasUpper, l->mse,
183 &xtx.matrix, xm, 0.0, &xmxtx.matrix);
184 gsl_matrix_free (xm);
185 if (rc == GSL_SUCCESS)
187 tmp = l->mse / l->n_obs;
188 for (i = 1; i < 1 + l->n_indeps; i++)
190 tmp -= gsl_matrix_get (l->cov, 0, i)
191 * linreg_get_indep_variable_mean (l, i - 1);
193 gsl_matrix_set (l->cov, 0, 0, tmp);
199 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
200 __FILE__, __LINE__, gsl_strerror (rc));
206 Predict the value of the dependent variable with the new set of
207 predictors. VALS are assumed to be in the order corresponding to the
208 order of the coefficients in the linreg struct.
211 linreg_predict (const linreg *c, const double *vals, size_t n_vals)
216 assert (n_vals = c->n_coeffs);
217 if (vals == NULL || c == NULL)
221 if (c->coeff == NULL)
223 /* The stupid model: just guess the mean. */
224 return c->depvar_mean;
226 result = c->intercept;
228 for (j = 0; j < n_vals; j++)
230 result += linreg_coeff (c, j) * vals[j];
237 linreg_residual (const linreg *c, double obs, const double *vals, size_t n_vals)
239 if (vals == NULL || c == NULL)
243 return (obs - linreg_predict (c, vals, n_vals));
246 double linreg_get_indep_variable_sd (linreg *c, size_t j)
249 return gsl_vector_get (c->indep_std, j);
252 void linreg_set_indep_variable_sd (linreg *c, size_t j, double s)
255 gsl_vector_set (c->indep_std, j, s);
259 Mean of the independent variable.
261 double linreg_get_indep_variable_mean (linreg *c, size_t j)
264 return gsl_vector_get (c->indep_means, j);
267 void linreg_set_indep_variable_mean (linreg *c, size_t j, double m)
270 gsl_vector_set (c->indep_means, j, m);
274 linreg_fit_qr (const gsl_matrix *cov, linreg *l)
276 double intcpt_coef = 0.0;
277 double intercept_variance = 0.0;
288 xtx = gsl_matrix_alloc (cov->size1 - 1, cov->size2 - 1);
289 xty = gsl_vector_alloc (cov->size1 - 1);
290 tau = gsl_vector_alloc (cov->size1 - 1);
291 params = gsl_vector_alloc (cov->size1 - 1);
293 for (i = 0; i < xtx->size1; i++)
295 gsl_vector_set (xty, i, gsl_matrix_get (cov, cov->size2 - 1, i));
296 for (j = 0; j < xtx->size2; j++)
298 gsl_matrix_set (xtx, i, j, gsl_matrix_get (cov, i, j));
301 gsl_linalg_QR_decomp (xtx, tau);
302 q = gsl_matrix_alloc (xtx->size1, xtx->size2);
303 r = gsl_matrix_alloc (xtx->size1, xtx->size2);
305 gsl_linalg_QR_unpack (xtx, tau, q, r);
306 gsl_linalg_QR_solve (xtx, tau, xty, params);
307 for (i = 0; i < params->size; i++)
309 l->coeff[i] = gsl_vector_get (params, i);
311 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
313 for (i = 0; i < l->n_indeps; i++)
315 l->ssm += gsl_vector_get (xty, i) * l->coeff[i];
317 l->sse = l->sst - l->ssm;
319 gsl_blas_dtrsm (CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, linreg_mse (l),
321 /* Copy the lower triangle into the upper triangle. */
322 for (i = 0; i < q->size1; i++)
324 gsl_matrix_set (l->cov, i + 1, i + 1, gsl_matrix_get (q, i, i));
325 for (j = i + 1; j < q->size2; j++)
327 intercept_variance -= 2.0 * gsl_matrix_get (q, i, j) *
328 linreg_get_indep_variable_mean (l, i) *
329 linreg_get_indep_variable_mean (l, j);
330 gsl_matrix_set (q, i, j, gsl_matrix_get (q, j, i));
333 l->intercept = linreg_get_depvar_mean (l);
335 for (i = 0; i < l->n_indeps; i++)
337 tmp = linreg_get_indep_variable_mean (l, i);
338 l->intercept -= l->coeff[i] * tmp;
339 intercept_variance += tmp * tmp * gsl_matrix_get (q, i, i);
342 /* Covariances related to the intercept. */
343 intercept_variance += linreg_mse (l) / linreg_n_obs (l);
344 gsl_matrix_set (l->cov, 0, 0, intercept_variance);
345 for (i = 0; i < q->size1; i++)
347 for (j = 0; j < q->size2; j++)
349 intcpt_coef -= gsl_matrix_get (q, i, j)
350 * linreg_get_indep_variable_mean (l, j);
352 gsl_matrix_set (l->cov, 0, i + 1, intcpt_coef);
353 gsl_matrix_set (l->cov, i + 1, 0, intcpt_coef);
359 gsl_vector_free (xty);
360 gsl_vector_free (tau);
361 gsl_matrix_free (xtx);
362 gsl_vector_free (params);
366 Estimate the model parameters from the covariance matrix. This
367 function assumes the covariance entries corresponding to the
368 dependent variable are in the final row and column of the covariance
372 linreg_fit (const gsl_matrix *cov, linreg *l)
375 assert (cov != NULL);
377 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
378 if (l->method == LINREG_SWEEP)
381 params = gsl_matrix_calloc (cov->size1, cov->size2);
382 gsl_matrix_memcpy (params, cov);
383 reg_sweep (params, l->dependent_column);
384 post_sweep_computations (l, params);
385 gsl_matrix_free (params);
387 else if (l->method == LINREG_QR)
389 linreg_fit_qr (cov, l);
393 double linreg_mse (const linreg *c)
396 return (c->sse / c->dfe);
399 double linreg_intercept (const linreg *c)
405 linreg_cov (const linreg *c)
411 linreg_coeff (const linreg *c, size_t i)
413 return (c->coeff[i]);
416 const struct variable *
417 linreg_indep_var (const linreg *c, size_t i)
419 return (c->indep_vars[i]);
423 linreg_n_coeffs (const linreg *c)
429 linreg_n_obs (const linreg *c)
435 linreg_sse (const linreg *c)
441 linreg_ssreg (const linreg *c)
443 return (c->sst - c->sse);
446 double linreg_sst (const linreg *c)
452 linreg_dfmodel ( const linreg *c)
458 linreg_set_depvar_mean (linreg *c, double x)
464 linreg_get_depvar_mean (linreg *c)
466 return c->depvar_mean;