1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "math/linreg.h"
21 #include <gsl/gsl_blas.h>
22 #include <gsl/gsl_cblas.h>
23 #include <gsl/gsl_errno.h>
24 #include <gsl/gsl_fit.h>
25 #include <gsl/gsl_linalg.h>
26 #include <gsl/gsl_multifit.h>
28 #include "data/value.h"
29 #include "data/variable.h"
30 #include "linreg/sweep.h"
32 #include "gl/xalloc.h"
35 Find the least-squares estimate of b for the linear model:
39 where Y is an n-by-1 column vector, X is an n-by-p matrix of
40 independent variables, b is a p-by-1 vector of regression coefficients,
41 and Z is an n-by-1 normally-distributed random vector with independent
42 identically distributed components with mean 0.
44 This estimate is found via the sweep operator or singular-value
45 decomposition with gsl.
50 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
51 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
53 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
56 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
57 Springer. 1998. ISBN 0-387-98542-5.
61 const struct variable **
62 linreg_get_vars (const linreg *c)
68 Allocate a linreg and return a pointer to it. n is the number of
69 cases, p is the number of independent variables.
72 linreg_alloc (const struct variable *depvar, const struct variable **indep_vars,
78 c = xmalloc (sizeof (*c));
80 c->indep_vars = xnmalloc (p, sizeof (*indep_vars));
81 c->dependent_column = p;
82 for (i = 0; i < p; i++)
84 c->indep_vars[i] = indep_vars[i];
86 c->indep_means = gsl_vector_alloc (p);
87 c->indep_std = gsl_vector_alloc (p);
92 c->coeff = xnmalloc (p, sizeof (*c->coeff));
93 c->cov = gsl_matrix_calloc (c->n_coeffs + 1, c->n_coeffs + 1);
96 c->dfe = c->dft - c->dfm;
102 c->method = LINREG_SWEEP;
111 linreg_ref (linreg *c)
117 linreg_unref (linreg *c)
119 if (--c->refcnt == 0)
121 gsl_vector_free (c->indep_means);
122 gsl_vector_free (c->indep_std);
123 gsl_matrix_free (c->cov);
124 free (c->indep_vars);
131 post_sweep_computations (linreg *l, gsl_matrix *sw)
135 gsl_matrix_view xmxtx;
145 l->sse = gsl_matrix_get (sw, l->n_indeps, l->n_indeps);
146 l->mse = l->sse / l->dfe;
151 for (i = 0; i < l->n_indeps; i++)
153 tmp = gsl_matrix_get (sw, i, l->n_indeps);
155 m -= tmp * linreg_get_indep_variable_mean (l, i);
158 Get the covariance matrix of the parameter estimates.
159 Only the upper triangle is necessary.
163 The loops below do not compute the entries related
164 to the estimated intercept.
166 for (i = 0; i < l->n_indeps; i++)
167 for (j = i; j < l->n_indeps; j++)
169 tmp = -1.0 * l->mse * gsl_matrix_get (sw, i, j);
170 gsl_matrix_set (l->cov, i + 1, j + 1, tmp);
173 Get the covariances related to the intercept.
175 xtx = gsl_matrix_submatrix (sw, 0, 0, l->n_indeps, l->n_indeps);
176 xmxtx = gsl_matrix_submatrix (l->cov, 0, 1, 1, l->n_indeps);
177 xm = gsl_matrix_calloc (1, l->n_indeps);
178 for (i = 0; i < xm->size2; i++)
180 gsl_matrix_set (xm, 0, i,
181 linreg_get_indep_variable_mean (l, i));
183 rc = gsl_blas_dsymm (CblasRight, CblasUpper, l->mse,
184 &xtx.matrix, xm, 0.0, &xmxtx.matrix);
185 gsl_matrix_free (xm);
186 if (rc == GSL_SUCCESS)
188 tmp = l->mse / l->n_obs;
189 for (i = 1; i < 1 + l->n_indeps; i++)
191 tmp -= gsl_matrix_get (l->cov, 0, i)
192 * linreg_get_indep_variable_mean (l, i - 1);
194 gsl_matrix_set (l->cov, 0, 0, tmp);
200 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
201 __FILE__, __LINE__, gsl_strerror (rc));
207 Predict the value of the dependent variable with the new set of
208 predictors. VALS are assumed to be in the order corresponding to the
209 order of the coefficients in the linreg struct.
212 linreg_predict (const linreg *c, const double *vals, size_t n_vals)
217 assert (n_vals = c->n_coeffs);
218 if (vals == NULL || c == NULL)
222 if (c->coeff == NULL)
224 /* The stupid model: just guess the mean. */
225 return c->depvar_mean;
227 result = c->intercept;
229 for (j = 0; j < n_vals; j++)
231 result += linreg_coeff (c, j) * vals[j];
238 linreg_residual (const linreg *c, double obs, const double *vals, size_t n_vals)
240 if (vals == NULL || c == NULL)
244 return (obs - linreg_predict (c, vals, n_vals));
248 Mean of the independent variable.
250 double linreg_get_indep_variable_mean (const linreg *c, size_t j)
253 return gsl_vector_get (c->indep_means, j);
256 void linreg_set_indep_variable_mean (linreg *c, size_t j, double m)
259 gsl_vector_set (c->indep_means, j, m);
263 linreg_fit_qr (const gsl_matrix *cov, linreg *l)
265 double intcpt_coef = 0.0;
266 double intercept_variance = 0.0;
277 xtx = gsl_matrix_alloc (cov->size1 - 1, cov->size2 - 1);
278 xty = gsl_vector_alloc (cov->size1 - 1);
279 tau = gsl_vector_alloc (cov->size1 - 1);
280 params = gsl_vector_alloc (cov->size1 - 1);
282 for (i = 0; i < xtx->size1; i++)
284 gsl_vector_set (xty, i, gsl_matrix_get (cov, cov->size2 - 1, i));
285 for (j = 0; j < xtx->size2; j++)
287 gsl_matrix_set (xtx, i, j, gsl_matrix_get (cov, i, j));
290 gsl_linalg_QR_decomp (xtx, tau);
291 q = gsl_matrix_alloc (xtx->size1, xtx->size2);
292 r = gsl_matrix_alloc (xtx->size1, xtx->size2);
294 gsl_linalg_QR_unpack (xtx, tau, q, r);
295 gsl_linalg_QR_solve (xtx, tau, xty, params);
296 for (i = 0; i < params->size; i++)
298 l->coeff[i] = gsl_vector_get (params, i);
300 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
302 for (i = 0; i < l->n_indeps; i++)
304 l->ssm += gsl_vector_get (xty, i) * l->coeff[i];
306 l->sse = l->sst - l->ssm;
308 gsl_blas_dtrsm (CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, linreg_mse (l),
310 /* Copy the lower triangle into the upper triangle. */
311 for (i = 0; i < q->size1; i++)
313 gsl_matrix_set (l->cov, i + 1, i + 1, gsl_matrix_get (q, i, i));
314 for (j = i + 1; j < q->size2; j++)
316 intercept_variance -= 2.0 * gsl_matrix_get (q, i, j) *
317 linreg_get_indep_variable_mean (l, i) *
318 linreg_get_indep_variable_mean (l, j);
319 gsl_matrix_set (q, i, j, gsl_matrix_get (q, j, i));
322 l->intercept = linreg_get_depvar_mean (l);
324 for (i = 0; i < l->n_indeps; i++)
326 tmp = linreg_get_indep_variable_mean (l, i);
327 l->intercept -= l->coeff[i] * tmp;
328 intercept_variance += tmp * tmp * gsl_matrix_get (q, i, i);
331 /* Covariances related to the intercept. */
332 intercept_variance += linreg_mse (l) / linreg_n_obs (l);
333 gsl_matrix_set (l->cov, 0, 0, intercept_variance);
334 for (i = 0; i < q->size1; i++)
336 for (j = 0; j < q->size2; j++)
338 intcpt_coef -= gsl_matrix_get (q, i, j)
339 * linreg_get_indep_variable_mean (l, j);
341 gsl_matrix_set (l->cov, 0, i + 1, intcpt_coef);
342 gsl_matrix_set (l->cov, i + 1, 0, intcpt_coef);
348 gsl_vector_free (xty);
349 gsl_vector_free (tau);
350 gsl_matrix_free (xtx);
351 gsl_vector_free (params);
355 Estimate the model parameters from the covariance matrix. This
356 function assumes the covariance entries corresponding to the
357 dependent variable are in the final row and column of the covariance
361 linreg_fit (const gsl_matrix *cov, linreg *l)
364 assert (cov != NULL);
366 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
367 if (l->method == LINREG_SWEEP)
370 params = gsl_matrix_calloc (cov->size1, cov->size2);
371 gsl_matrix_memcpy (params, cov);
372 reg_sweep (params, l->dependent_column);
373 post_sweep_computations (l, params);
374 gsl_matrix_free (params);
376 else if (l->method == LINREG_QR)
378 linreg_fit_qr (cov, l);
382 double linreg_mse (const linreg *c)
385 return (c->sse / c->dfe);
388 double linreg_intercept (const linreg *c)
394 linreg_cov (const linreg *c)
400 linreg_coeff (const linreg *c, size_t i)
402 return (c->coeff[i]);
405 const struct variable *
406 linreg_indep_var (const linreg *c, size_t i)
408 return (c->indep_vars[i]);
412 linreg_n_coeffs (const linreg *c)
418 linreg_n_obs (const linreg *c)
424 linreg_sse (const linreg *c)
430 linreg_ssreg (const linreg *c)
432 return (c->sst - c->sse);
435 double linreg_sst (const linreg *c)
441 linreg_dfmodel ( const linreg *c)
447 linreg_set_depvar_mean (linreg *c, double x)
453 linreg_get_depvar_mean (const linreg *c)
455 return c->depvar_mean;