1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "math/linreg.h"
21 #include <gsl/gsl_blas.h>
22 #include <gsl/gsl_cblas.h>
23 #include <gsl/gsl_errno.h>
24 #include <gsl/gsl_fit.h>
25 #include <gsl/gsl_linalg.h>
26 #include <gsl/gsl_multifit.h>
28 #include "data/value.h"
29 #include "data/variable.h"
30 #include "linreg/sweep.h"
32 #include "gl/xalloc.h"
35 Find the least-squares estimate of b for the linear model:
39 where Y is an n-by-1 column vector, X is an n-by-p matrix of
40 independent variables, b is a p-by-1 vector of regression coefficients,
41 and Z is an n-by-1 normally-distributed random vector with independent
42 identically distributed components with mean 0.
44 This estimate is found via the sweep operator or singular-value
45 decomposition with gsl.
50 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
51 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
53 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
56 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
57 Springer. 1998. ISBN 0-387-98542-5.
61 const struct variable **
62 linreg_get_vars (const linreg *c)
68 Allocate a linreg and return a pointer to it. n is the number of
69 cases, p is the number of independent variables.
72 linreg_alloc (const struct variable *depvar, const struct variable **indep_vars,
78 c = xmalloc (sizeof (*c));
80 c->indep_vars = xnmalloc (p, sizeof (*indep_vars));
81 c->dependent_column = p;
82 for (i = 0; i < p; i++)
84 c->indep_vars[i] = indep_vars[i];
86 c->indep_means = gsl_vector_alloc (p);
87 c->indep_std = gsl_vector_alloc (p);
92 c->coeff = xnmalloc (p, sizeof (*c->coeff));
93 c->cov = gsl_matrix_calloc (c->n_coeffs + 1, c->n_coeffs + 1);
96 c->dfe = c->dft - c->dfm;
102 c->method = LINREG_SWEEP;
111 linreg_ref (linreg *c)
117 linreg_unref (linreg *c)
119 if (--c->refcnt == 0)
121 gsl_vector_free (c->indep_means);
122 gsl_vector_free (c->indep_std);
123 gsl_matrix_free (c->cov);
124 free (c->indep_vars);
131 post_sweep_computations (linreg *l, gsl_matrix *sw)
135 gsl_matrix_view xmxtx;
144 l->sse = gsl_matrix_get (sw, l->n_indeps, l->n_indeps);
145 l->mse = l->sse / l->dfe;
150 for (i = 0; i < l->n_indeps; i++)
152 double tmp = gsl_matrix_get (sw, i, l->n_indeps);
154 m -= tmp * linreg_get_indep_variable_mean (l, i);
157 Get the covariance matrix of the parameter estimates.
158 Only the upper triangle is necessary.
162 The loops below do not compute the entries related
163 to the estimated intercept.
165 for (i = 0; i < l->n_indeps; i++)
166 for (j = i; j < l->n_indeps; j++)
168 double tmp = -1.0 * l->mse * gsl_matrix_get (sw, i, j);
169 gsl_matrix_set (l->cov, i + 1, j + 1, tmp);
172 Get the covariances related to the intercept.
174 xtx = gsl_matrix_submatrix (sw, 0, 0, l->n_indeps, l->n_indeps);
175 xmxtx = gsl_matrix_submatrix (l->cov, 0, 1, 1, l->n_indeps);
176 xm = gsl_matrix_calloc (1, l->n_indeps);
177 for (i = 0; i < xm->size2; i++)
179 gsl_matrix_set (xm, 0, i,
180 linreg_get_indep_variable_mean (l, i));
182 rc = gsl_blas_dsymm (CblasRight, CblasUpper, l->mse,
183 &xtx.matrix, xm, 0.0, &xmxtx.matrix);
184 gsl_matrix_free (xm);
185 if (rc == GSL_SUCCESS)
187 double tmp = l->mse / l->n_obs;
188 for (i = 1; i < 1 + l->n_indeps; i++)
190 tmp -= gsl_matrix_get (l->cov, 0, i)
191 * linreg_get_indep_variable_mean (l, i - 1);
193 gsl_matrix_set (l->cov, 0, 0, tmp);
199 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
200 __FILE__, __LINE__, gsl_strerror (rc));
206 Predict the value of the dependent variable with the new set of
207 predictors. VALS are assumed to be in the order corresponding to the
208 order of the coefficients in the linreg struct.
211 linreg_predict (const linreg *c, const double *vals, size_t n_vals)
216 assert (n_vals = c->n_coeffs);
217 if (vals == NULL || c == NULL)
221 if (c->coeff == NULL)
223 /* The stupid model: just guess the mean. */
224 return c->depvar_mean;
226 result = c->intercept;
228 for (j = 0; j < n_vals; j++)
230 result += linreg_coeff (c, j) * vals[j];
237 linreg_residual (const linreg *c, double obs, const double *vals, size_t n_vals)
239 if (vals == NULL || c == NULL)
243 return (obs - linreg_predict (c, vals, n_vals));
247 Mean of the independent variable.
249 double linreg_get_indep_variable_mean (const linreg *c, size_t j)
252 return gsl_vector_get (c->indep_means, j);
255 void linreg_set_indep_variable_mean (linreg *c, size_t j, double m)
258 gsl_vector_set (c->indep_means, j, m);
262 linreg_fit_qr (const gsl_matrix *cov, linreg *l)
264 double intcpt_coef = 0.0;
265 double intercept_variance = 0.0;
275 xtx = gsl_matrix_alloc (cov->size1 - 1, cov->size2 - 1);
276 xty = gsl_vector_alloc (cov->size1 - 1);
277 tau = gsl_vector_alloc (cov->size1 - 1);
278 params = gsl_vector_alloc (cov->size1 - 1);
280 for (i = 0; i < xtx->size1; i++)
282 gsl_vector_set (xty, i, gsl_matrix_get (cov, cov->size2 - 1, i));
283 for (j = 0; j < xtx->size2; j++)
285 gsl_matrix_set (xtx, i, j, gsl_matrix_get (cov, i, j));
288 gsl_linalg_QR_decomp (xtx, tau);
289 q = gsl_matrix_alloc (xtx->size1, xtx->size2);
290 r = gsl_matrix_alloc (xtx->size1, xtx->size2);
292 gsl_linalg_QR_unpack (xtx, tau, q, r);
293 gsl_linalg_QR_solve (xtx, tau, xty, params);
294 for (i = 0; i < params->size; i++)
296 l->coeff[i] = gsl_vector_get (params, i);
298 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
300 for (i = 0; i < l->n_indeps; i++)
302 l->ssm += gsl_vector_get (xty, i) * l->coeff[i];
304 l->sse = l->sst - l->ssm;
306 gsl_blas_dtrsm (CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, linreg_mse (l),
308 /* Copy the lower triangle into the upper triangle. */
309 for (i = 0; i < q->size1; i++)
311 gsl_matrix_set (l->cov, i + 1, i + 1, gsl_matrix_get (q, i, i));
312 for (j = i + 1; j < q->size2; j++)
314 intercept_variance -= 2.0 * gsl_matrix_get (q, i, j) *
315 linreg_get_indep_variable_mean (l, i) *
316 linreg_get_indep_variable_mean (l, j);
317 gsl_matrix_set (q, i, j, gsl_matrix_get (q, j, i));
320 l->intercept = linreg_get_depvar_mean (l);
321 for (i = 0; i < l->n_indeps; i++)
323 double tmp = linreg_get_indep_variable_mean (l, i);
324 l->intercept -= l->coeff[i] * tmp;
325 intercept_variance += tmp * tmp * gsl_matrix_get (q, i, i);
328 /* Covariances related to the intercept. */
329 intercept_variance += linreg_mse (l) / linreg_n_obs (l);
330 gsl_matrix_set (l->cov, 0, 0, intercept_variance);
331 for (i = 0; i < q->size1; i++)
333 for (j = 0; j < q->size2; j++)
335 intcpt_coef -= gsl_matrix_get (q, i, j)
336 * linreg_get_indep_variable_mean (l, j);
338 gsl_matrix_set (l->cov, 0, i + 1, intcpt_coef);
339 gsl_matrix_set (l->cov, i + 1, 0, intcpt_coef);
345 gsl_vector_free (xty);
346 gsl_vector_free (tau);
347 gsl_matrix_free (xtx);
348 gsl_vector_free (params);
352 Estimate the model parameters from the covariance matrix. This
353 function assumes the covariance entries corresponding to the
354 dependent variable are in the final row and column of the covariance
358 linreg_fit (const gsl_matrix *cov, linreg *l)
361 assert (cov != NULL);
363 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
364 if (l->method == LINREG_SWEEP)
367 params = gsl_matrix_calloc (cov->size1, cov->size2);
368 gsl_matrix_memcpy (params, cov);
369 reg_sweep (params, l->dependent_column);
370 post_sweep_computations (l, params);
371 gsl_matrix_free (params);
373 else if (l->method == LINREG_QR)
375 linreg_fit_qr (cov, l);
379 double linreg_mse (const linreg *c)
382 return (c->sse / c->dfe);
385 double linreg_intercept (const linreg *c)
391 linreg_cov (const linreg *c)
397 linreg_coeff (const linreg *c, size_t i)
399 return (c->coeff[i]);
402 const struct variable *
403 linreg_indep_var (const linreg *c, size_t i)
405 return (c->indep_vars[i]);
409 linreg_n_coeffs (const linreg *c)
415 linreg_n_obs (const linreg *c)
421 linreg_sse (const linreg *c)
427 linreg_ssreg (const linreg *c)
429 return (c->sst - c->sse);
432 double linreg_sst (const linreg *c)
438 linreg_dfmodel ( const linreg *c)
444 linreg_set_depvar_mean (linreg *c, double x)
450 linreg_get_depvar_mean (const linreg *c)
452 return c->depvar_mean;