1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005, 2010, 2011, 2017 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "math/linreg.h"
21 #include <gsl/gsl_blas.h>
22 #include <gsl/gsl_cblas.h>
23 #include <gsl/gsl_errno.h>
24 #include <gsl/gsl_fit.h>
25 #include <gsl/gsl_linalg.h>
26 #include <gsl/gsl_multifit.h>
28 #include "data/value.h"
29 #include "data/variable.h"
30 #include "linreg/sweep.h"
32 #include "gl/xalloc.h"
35 Find the least-squares estimate of b for the linear model:
39 where Y is an n-by-1 column vector, X is an n-by-p matrix of
40 independent variables, b is a p-by-1 vector of regression coefficients,
41 and Z is an n-by-1 normally-distributed random vector with independent
42 identically distributed components with mean 0.
44 This estimate is found via the sweep operator or singular-value
45 decomposition with gsl.
50 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
51 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
53 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
56 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
57 Springer. 1998. ISBN 0-387-98542-5.
61 const struct variable **
62 linreg_get_vars (const linreg *c)
68 Allocate a linreg and return a pointer to it. n is the number of
69 cases, p is the number of independent variables.
72 linreg_alloc (const struct variable *depvar, const struct variable **indep_vars,
73 double n, size_t p, bool origin)
78 c = xmalloc (sizeof (*c));
80 c->indep_vars = xnmalloc (p, sizeof (*indep_vars));
81 c->dependent_column = p;
82 for (i = 0; i < p; i++)
84 c->indep_vars[i] = indep_vars[i];
86 c->indep_means = gsl_vector_alloc (p);
87 c->indep_std = gsl_vector_alloc (p);
92 c->coeff = xnmalloc (p, sizeof (*c->coeff));
93 c->cov = gsl_matrix_calloc (c->n_coeffs + 1, c->n_coeffs + 1);
99 c->dfe = c->dft - c->dfm;
101 c->depvar_mean = 0.0;
105 c->method = LINREG_SWEEP;
116 linreg_ref (linreg *c)
122 linreg_unref (linreg *c)
124 if (--c->refcnt == 0)
126 gsl_vector_free (c->indep_means);
127 gsl_vector_free (c->indep_std);
128 gsl_matrix_free (c->cov);
129 free (c->indep_vars);
136 post_sweep_computations (linreg *l, gsl_matrix *sw)
146 l->sse = gsl_matrix_get (sw, l->n_indeps, l->n_indeps);
147 l->mse = l->sse / l->dfe;
152 for (i = 0; i < l->n_indeps; i++)
154 double tmp = gsl_matrix_get (sw, i, l->n_indeps);
156 m -= tmp * linreg_get_indep_variable_mean (l, i);
159 Get the covariance matrix of the parameter estimates.
160 Only the upper triangle is necessary.
164 The loops below do not compute the entries related
165 to the estimated intercept.
167 for (i = 0; i < l->n_indeps; i++)
168 for (j = i; j < l->n_indeps; j++)
170 double tmp = -1.0 * l->mse * gsl_matrix_get (sw, i, j);
171 gsl_matrix_set (l->cov, i + 1, j + 1, tmp);
178 gsl_matrix_view xmxtx;
180 Get the covariances related to the intercept.
182 xtx = gsl_matrix_submatrix (sw, 0, 0, l->n_indeps, l->n_indeps);
183 xmxtx = gsl_matrix_submatrix (l->cov, 0, 1, 1, l->n_indeps);
184 xm = gsl_matrix_calloc (1, l->n_indeps);
185 for (i = 0; i < xm->size2; i++)
187 gsl_matrix_set (xm, 0, i,
188 linreg_get_indep_variable_mean (l, i));
190 rc = gsl_blas_dsymm (CblasRight, CblasUpper, l->mse,
191 &xtx.matrix, xm, 0.0, &xmxtx.matrix);
192 gsl_matrix_free (xm);
193 if (rc == GSL_SUCCESS)
195 double tmp = l->mse / l->n_obs;
196 for (i = 1; i < 1 + l->n_indeps; i++)
198 tmp -= gsl_matrix_get (l->cov, 0, i)
199 * linreg_get_indep_variable_mean (l, i - 1);
201 gsl_matrix_set (l->cov, 0, 0, tmp);
207 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
208 __FILE__, __LINE__, gsl_strerror (rc));
215 Predict the value of the dependent variable with the new set of
216 predictors. VALS are assumed to be in the order corresponding to the
217 order of the coefficients in the linreg struct.
220 linreg_predict (const linreg *c, const double *vals, size_t n_vals)
225 assert (n_vals = c->n_coeffs);
226 if (vals == NULL || c == NULL)
230 if (c->coeff == NULL)
232 /* The stupid model: just guess the mean. */
233 return c->depvar_mean;
235 result = c->intercept;
237 for (j = 0; j < n_vals; j++)
239 result += linreg_coeff (c, j) * vals[j];
246 linreg_residual (const linreg *c, double obs, const double *vals, size_t n_vals)
248 if (vals == NULL || c == NULL)
252 return (obs - linreg_predict (c, vals, n_vals));
256 Mean of the independent variable.
258 double linreg_get_indep_variable_mean (const linreg *c, size_t j)
261 return gsl_vector_get (c->indep_means, j);
264 void linreg_set_indep_variable_mean (linreg *c, size_t j, double m)
267 gsl_vector_set (c->indep_means, j, m);
271 linreg_fit_qr (const gsl_matrix *cov, linreg *l)
273 double intcpt_coef = 0.0;
274 double intercept_variance = 0.0;
284 xtx = gsl_matrix_alloc (cov->size1 - 1, cov->size2 - 1);
285 xty = gsl_vector_alloc (cov->size1 - 1);
286 tau = gsl_vector_alloc (cov->size1 - 1);
287 params = gsl_vector_alloc (cov->size1 - 1);
289 for (i = 0; i < xtx->size1; i++)
291 gsl_vector_set (xty, i, gsl_matrix_get (cov, cov->size2 - 1, i));
292 for (j = 0; j < xtx->size2; j++)
294 gsl_matrix_set (xtx, i, j, gsl_matrix_get (cov, i, j));
297 gsl_linalg_QR_decomp (xtx, tau);
298 q = gsl_matrix_alloc (xtx->size1, xtx->size2);
299 r = gsl_matrix_alloc (xtx->size1, xtx->size2);
301 gsl_linalg_QR_unpack (xtx, tau, q, r);
302 gsl_linalg_QR_solve (xtx, tau, xty, params);
303 for (i = 0; i < params->size; i++)
305 l->coeff[i] = gsl_vector_get (params, i);
307 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
309 for (i = 0; i < l->n_indeps; i++)
311 l->ssm += gsl_vector_get (xty, i) * l->coeff[i];
313 l->sse = l->sst - l->ssm;
315 gsl_blas_dtrsm (CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, linreg_mse (l),
317 /* Copy the lower triangle into the upper triangle. */
318 for (i = 0; i < q->size1; i++)
320 gsl_matrix_set (l->cov, i + 1, i + 1, gsl_matrix_get (q, i, i));
321 for (j = i + 1; j < q->size2; j++)
323 intercept_variance -= 2.0 * gsl_matrix_get (q, i, j) *
324 linreg_get_indep_variable_mean (l, i) *
325 linreg_get_indep_variable_mean (l, j);
326 gsl_matrix_set (q, i, j, gsl_matrix_get (q, j, i));
329 l->intercept = linreg_get_depvar_mean (l);
330 for (i = 0; i < l->n_indeps; i++)
332 double tmp = linreg_get_indep_variable_mean (l, i);
333 l->intercept -= l->coeff[i] * tmp;
334 intercept_variance += tmp * tmp * gsl_matrix_get (q, i, i);
337 /* Covariances related to the intercept. */
338 intercept_variance += linreg_mse (l) / linreg_n_obs (l);
339 gsl_matrix_set (l->cov, 0, 0, intercept_variance);
340 for (i = 0; i < q->size1; i++)
342 for (j = 0; j < q->size2; j++)
344 intcpt_coef -= gsl_matrix_get (q, i, j)
345 * linreg_get_indep_variable_mean (l, j);
347 gsl_matrix_set (l->cov, 0, i + 1, intcpt_coef);
348 gsl_matrix_set (l->cov, i + 1, 0, intcpt_coef);
354 gsl_vector_free (xty);
355 gsl_vector_free (tau);
356 gsl_matrix_free (xtx);
357 gsl_vector_free (params);
361 Estimate the model parameters from the covariance matrix. This
362 function assumes the covariance entries corresponding to the
363 dependent variable are in the final row and column of the covariance
367 linreg_fit (const gsl_matrix *cov, linreg *l)
370 assert (cov != NULL);
372 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
373 if (l->method == LINREG_SWEEP)
376 params = gsl_matrix_calloc (cov->size1, cov->size2);
377 gsl_matrix_memcpy (params, cov);
378 reg_sweep (params, l->dependent_column);
379 post_sweep_computations (l, params);
380 gsl_matrix_free (params);
382 else if (l->method == LINREG_QR)
384 linreg_fit_qr (cov, l);
388 double linreg_mse (const linreg *c)
391 return (c->sse / c->dfe);
394 double linreg_intercept (const linreg *c)
400 linreg_cov (const linreg *c)
406 linreg_coeff (const linreg *c, size_t i)
408 return (c->coeff[i]);
411 const struct variable *
412 linreg_indep_var (const linreg *c, size_t i)
414 return (c->indep_vars[i]);
418 linreg_n_coeffs (const linreg *c)
424 linreg_n_obs (const linreg *c)
430 linreg_sse (const linreg *c)
436 linreg_ssreg (const linreg *c)
438 return (c->sst - c->sse);
441 double linreg_sst (const linreg *c)
447 linreg_dfmodel ( const linreg *c)
453 linreg_set_depvar_mean (linreg *c, double x)
459 linreg_get_depvar_mean (const linreg *c)
461 return c->depvar_mean;