1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "math/linreg.h"
21 #include <gsl/gsl_blas.h>
22 #include <gsl/gsl_cblas.h>
23 #include <gsl/gsl_errno.h>
24 #include <gsl/gsl_fit.h>
25 #include <gsl/gsl_linalg.h>
26 #include <gsl/gsl_multifit.h>
28 #include "data/value.h"
29 #include "data/variable.h"
30 #include "linreg/sweep.h"
32 #include "gl/xalloc.h"
35 Find the least-squares estimate of b for the linear model:
39 where Y is an n-by-1 column vector, X is an n-by-p matrix of
40 independent variables, b is a p-by-1 vector of regression coefficients,
41 and Z is an n-by-1 normally-distributed random vector with independent
42 identically distributed components with mean 0.
44 This estimate is found via the sweep operator or singular-value
45 decomposition with gsl.
50 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
51 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
53 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
56 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
57 Springer. 1998. ISBN 0-387-98542-5.
61 const struct variable **
62 linreg_get_vars (const linreg *c)
68 Allocate a linreg and return a pointer to it. n is the number of
69 cases, p is the number of independent variables.
72 linreg_alloc (const struct variable *depvar, const struct variable **indep_vars,
78 c = xmalloc (sizeof (*c));
80 c->indep_vars = xnmalloc (p, sizeof (*indep_vars));
81 c->dependent_column = p;
82 for (i = 0; i < p; i++)
84 c->indep_vars[i] = indep_vars[i];
86 c->indep_means = gsl_vector_alloc (p);
87 c->indep_std = gsl_vector_alloc (p);
89 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
95 c->coeff = xnmalloc (p, sizeof (*c->coeff));
96 c->cov = gsl_matrix_calloc (c->n_coeffs + 1, c->n_coeffs + 1);
99 c->dfe = c->dft - c->dfm;
101 c->depvar_mean = 0.0;
106 c->method = LINREG_SWEEP;
115 linreg_ref (linreg *c)
121 linreg_unref (linreg *c)
123 if (c && --c->refcnt == 0)
125 gsl_vector_free (c->indep_means);
126 gsl_vector_free (c->indep_std);
127 gsl_vector_free (c->ss_indeps);
128 gsl_matrix_free (c->cov);
129 free (c->indep_vars);
136 post_sweep_computations (linreg *l, gsl_matrix *sw)
140 gsl_matrix_view xmxtx;
150 l->sse = gsl_matrix_get (sw, l->n_indeps, l->n_indeps);
151 l->mse = l->sse / l->dfe;
156 for (i = 0; i < l->n_indeps; i++)
158 tmp = gsl_matrix_get (sw, i, l->n_indeps);
160 m -= tmp * linreg_get_indep_variable_mean (l, i);
163 Get the covariance matrix of the parameter estimates.
164 Only the upper triangle is necessary.
168 The loops below do not compute the entries related
169 to the estimated intercept.
171 for (i = 0; i < l->n_indeps; i++)
172 for (j = i; j < l->n_indeps; j++)
174 tmp = -1.0 * l->mse * gsl_matrix_get (sw, i, j);
175 gsl_matrix_set (l->cov, i + 1, j + 1, tmp);
178 Get the covariances related to the intercept.
180 xtx = gsl_matrix_submatrix (sw, 0, 0, l->n_indeps, l->n_indeps);
181 xmxtx = gsl_matrix_submatrix (l->cov, 0, 1, 1, l->n_indeps);
182 xm = gsl_matrix_calloc (1, l->n_indeps);
183 for (i = 0; i < xm->size2; i++)
185 gsl_matrix_set (xm, 0, i,
186 linreg_get_indep_variable_mean (l, i));
188 rc = gsl_blas_dsymm (CblasRight, CblasUpper, l->mse,
189 &xtx.matrix, xm, 0.0, &xmxtx.matrix);
190 gsl_matrix_free (xm);
191 if (rc == GSL_SUCCESS)
193 tmp = l->mse / l->n_obs;
194 for (i = 1; i < 1 + l->n_indeps; i++)
196 tmp -= gsl_matrix_get (l->cov, 0, i)
197 * linreg_get_indep_variable_mean (l, i - 1);
199 gsl_matrix_set (l->cov, 0, 0, tmp);
205 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
206 __FILE__, __LINE__, gsl_strerror (rc));
212 Predict the value of the dependent variable with the new set of
213 predictors. VALS are assumed to be in the order corresponding to the
214 order of the coefficients in the linreg struct.
217 linreg_predict (const linreg *c, const double *vals, size_t n_vals)
222 assert (n_vals = c->n_coeffs);
223 if (vals == NULL || c == NULL)
227 if (c->coeff == NULL)
229 /* The stupid model: just guess the mean. */
230 return c->depvar_mean;
232 result = c->intercept;
234 for (j = 0; j < n_vals; j++)
236 result += linreg_coeff (c, j) * vals[j];
243 linreg_residual (const linreg *c, double obs, const double *vals, size_t n_vals)
245 if (vals == NULL || c == NULL)
249 return (obs - linreg_predict (c, vals, n_vals));
252 double linreg_get_indep_variable_sd (linreg *c, size_t j)
255 return gsl_vector_get (c->indep_std, j);
258 void linreg_set_indep_variable_sd (linreg *c, size_t j, double s)
261 gsl_vector_set (c->indep_std, j, s);
265 Mean of the independent variable.
267 double linreg_get_indep_variable_mean (linreg *c, size_t j)
270 return gsl_vector_get (c->indep_means, j);
273 void linreg_set_indep_variable_mean (linreg *c, size_t j, double m)
276 gsl_vector_set (c->indep_means, j, m);
280 linreg_fit_qr (const gsl_matrix *cov, linreg *l)
282 double intcpt_coef = 0.0;
283 double intercept_variance = 0.0;
294 xtx = gsl_matrix_alloc (cov->size1 - 1, cov->size2 - 1);
295 xty = gsl_vector_alloc (cov->size1 - 1);
296 tau = gsl_vector_alloc (cov->size1 - 1);
297 params = gsl_vector_alloc (cov->size1 - 1);
299 for (i = 0; i < xtx->size1; i++)
301 gsl_vector_set (xty, i, gsl_matrix_get (cov, cov->size2 - 1, i));
302 for (j = 0; j < xtx->size2; j++)
304 gsl_matrix_set (xtx, i, j, gsl_matrix_get (cov, i, j));
307 gsl_linalg_QR_decomp (xtx, tau);
308 q = gsl_matrix_alloc (xtx->size1, xtx->size2);
309 r = gsl_matrix_alloc (xtx->size1, xtx->size2);
311 gsl_linalg_QR_unpack (xtx, tau, q, r);
312 gsl_linalg_QR_solve (xtx, tau, xty, params);
313 for (i = 0; i < params->size; i++)
315 l->coeff[i] = gsl_vector_get (params, i);
317 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
319 for (i = 0; i < l->n_indeps; i++)
321 l->ssm += gsl_vector_get (xty, i) * l->coeff[i];
323 l->sse = l->sst - l->ssm;
325 gsl_blas_dtrsm (CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, linreg_mse (l),
327 /* Copy the lower triangle into the upper triangle. */
328 for (i = 0; i < q->size1; i++)
330 gsl_matrix_set (l->cov, i + 1, i + 1, gsl_matrix_get (q, i, i));
331 for (j = i + 1; j < q->size2; j++)
333 intercept_variance -= 2.0 * gsl_matrix_get (q, i, j) *
334 linreg_get_indep_variable_mean (l, i) *
335 linreg_get_indep_variable_mean (l, j);
336 gsl_matrix_set (q, i, j, gsl_matrix_get (q, j, i));
339 l->intercept = linreg_get_depvar_mean (l);
341 for (i = 0; i < l->n_indeps; i++)
343 tmp = linreg_get_indep_variable_mean (l, i);
344 l->intercept -= l->coeff[i] * tmp;
345 intercept_variance += tmp * tmp * gsl_matrix_get (q, i, i);
348 /* Covariances related to the intercept. */
349 intercept_variance += linreg_mse (l) / linreg_n_obs (l);
350 gsl_matrix_set (l->cov, 0, 0, intercept_variance);
351 for (i = 0; i < q->size1; i++)
353 for (j = 0; j < q->size2; j++)
355 intcpt_coef -= gsl_matrix_get (q, i, j)
356 * linreg_get_indep_variable_mean (l, j);
358 gsl_matrix_set (l->cov, 0, i + 1, intcpt_coef);
359 gsl_matrix_set (l->cov, i + 1, 0, intcpt_coef);
365 gsl_vector_free (xty);
366 gsl_vector_free (tau);
367 gsl_matrix_free (xtx);
368 gsl_vector_free (params);
372 Estimate the model parameters from the covariance matrix. This
373 function assumes the covariance entries corresponding to the
374 dependent variable are in the final row and column of the covariance
378 linreg_fit (const gsl_matrix *cov, linreg *l)
381 assert (cov != NULL);
383 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
384 if (l->method == LINREG_SWEEP)
387 params = gsl_matrix_calloc (cov->size1, cov->size2);
388 gsl_matrix_memcpy (params, cov);
389 reg_sweep (params, l->dependent_column);
390 post_sweep_computations (l, params);
391 gsl_matrix_free (params);
393 else if (l->method == LINREG_QR)
395 linreg_fit_qr (cov, l);
399 double linreg_mse (const linreg *c)
402 return (c->sse / c->dfe);
405 double linreg_intercept (const linreg *c)
411 linreg_cov (const linreg *c)
417 linreg_coeff (const linreg *c, size_t i)
419 return (c->coeff[i]);
422 const struct variable *
423 linreg_indep_var (const linreg *c, size_t i)
425 return (c->indep_vars[i]);
429 linreg_n_coeffs (const linreg *c)
435 linreg_n_obs (const linreg *c)
441 linreg_sse (const linreg *c)
447 linreg_ssreg (const linreg *c)
449 return (c->sst - c->sse);
452 double linreg_sst (const linreg *c)
458 linreg_dfmodel ( const linreg *c)
464 linreg_set_depvar_mean (linreg *c, double x)
470 linreg_get_depvar_mean (linreg *c)
472 return c->depvar_mean;