1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005, 2010, 2011, 2017 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "math/linreg.h"
21 #include <gsl/gsl_blas.h>
22 #include <gsl/gsl_cblas.h>
23 #include <gsl/gsl_errno.h>
24 #include <gsl/gsl_fit.h>
25 #include <gsl/gsl_linalg.h>
26 #include <gsl/gsl_multifit.h>
28 #include "data/value.h"
29 #include "data/variable.h"
30 #include "linreg/sweep.h"
32 #include "gl/xalloc.h"
35 Find the least-squares estimate of b for the linear model:
39 where Y is an n-by-1 column vector, X is an n-by-p matrix of
40 independent variables, b is a p-by-1 vector of regression coefficients,
41 and Z is an n-by-1 normally-distributed random vector with independent
42 identically distributed components with mean 0.
44 This estimate is found via the sweep operator or singular-value
45 decomposition with gsl.
50 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
51 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
53 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
56 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
57 Springer. 1998. ISBN 0-387-98542-5.
62 double n_obs; /* Number of observations. */
63 int n_indeps; /* Number of independent variables. */
64 int n_coeffs; /* The intercept is not considered a
68 Pointers to the variables.
70 const struct variable *depvar;
71 const struct variable **indep_vars;
76 Means and standard deviations of the variables.
77 If these pointers are null when pspp_linreg() is
78 called, pspp_linreg() will compute their values.
80 Entry i of indep_means is the mean of independent
81 variable i, whose observations are stored in the ith
82 column of the design matrix.
85 gsl_vector *indep_means;
86 gsl_vector *indep_std;
91 double ssm; /* Sums of squares for the overall model. */
92 double sst; /* Sum of squares total. */
93 double sse; /* Sum of squares error. */
94 double mse; /* Mean squared error. This is just sse /
95 dfe, but since it is the best unbiased
96 estimate of the population variance, it
97 has its own entry here. */
99 Covariance matrix of the parameter estimates.
109 int dependent_column; /* Column containing the dependent variable. Defaults to last column. */
115 const struct variable **
116 linreg_get_vars (const struct linreg *c)
118 return c->indep_vars;
122 Allocate a linreg and return a pointer to it. n is the number of
123 cases, p is the number of independent variables.
126 linreg_alloc (const struct variable *depvar, const struct variable **indep_vars,
127 double n, size_t p, bool origin)
132 c = xmalloc (sizeof (*c));
134 c->indep_vars = xnmalloc (p, sizeof (*indep_vars));
135 c->dependent_column = p;
136 for (i = 0; i < p; i++)
138 c->indep_vars[i] = indep_vars[i];
140 c->indep_means = gsl_vector_alloc (p);
141 c->indep_std = gsl_vector_alloc (p);
146 c->coeff = xnmalloc (p, sizeof (*c->coeff));
147 c->cov = gsl_matrix_calloc (c->n_coeffs + 1, c->n_coeffs + 1);
153 c->dfe = c->dft - c->dfm;
155 c->depvar_mean = 0.0;
169 linreg_ref (struct linreg *c)
175 linreg_unref (struct linreg *c)
177 if (--c->refcnt == 0)
179 gsl_vector_free (c->indep_means);
180 gsl_vector_free (c->indep_std);
181 gsl_matrix_free (c->cov);
182 free (c->indep_vars);
189 post_sweep_computations (struct linreg *l, gsl_matrix *sw)
199 l->sse = gsl_matrix_get (sw, l->n_indeps, l->n_indeps);
200 l->mse = l->sse / l->dfe;
205 for (i = 0; i < l->n_indeps; i++)
207 double tmp = gsl_matrix_get (sw, i, l->n_indeps);
209 m -= tmp * linreg_get_indep_variable_mean (l, i);
212 Get the covariance matrix of the parameter estimates.
213 Only the upper triangle is necessary.
217 The loops below do not compute the entries related
218 to the estimated intercept.
220 for (i = 0; i < l->n_indeps; i++)
221 for (j = i; j < l->n_indeps; j++)
223 double tmp = -1.0 * l->mse * gsl_matrix_get (sw, i, j);
224 gsl_matrix_set (l->cov, i + 1, j + 1, tmp);
231 gsl_matrix_view xmxtx;
233 Get the covariances related to the intercept.
235 xtx = gsl_matrix_submatrix (sw, 0, 0, l->n_indeps, l->n_indeps);
236 xmxtx = gsl_matrix_submatrix (l->cov, 0, 1, 1, l->n_indeps);
237 xm = gsl_matrix_calloc (1, l->n_indeps);
238 for (i = 0; i < xm->size2; i++)
240 gsl_matrix_set (xm, 0, i,
241 linreg_get_indep_variable_mean (l, i));
243 rc = gsl_blas_dsymm (CblasRight, CblasUpper, l->mse,
244 &xtx.matrix, xm, 0.0, &xmxtx.matrix);
245 gsl_matrix_free (xm);
246 if (rc == GSL_SUCCESS)
248 double tmp = l->mse / l->n_obs;
249 for (i = 1; i < 1 + l->n_indeps; i++)
251 tmp -= gsl_matrix_get (l->cov, 0, i)
252 * linreg_get_indep_variable_mean (l, i - 1);
254 gsl_matrix_set (l->cov, 0, 0, tmp);
260 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
261 __FILE__, __LINE__, gsl_strerror (rc));
268 Predict the value of the dependent variable with the new set of
269 predictors. VALS are assumed to be in the order corresponding to the
270 order of the coefficients in the linreg struct.
273 linreg_predict (const struct linreg *c, const double *vals, size_t n_vals)
278 if (vals == NULL || c == NULL)
282 assert (n_vals == c->n_coeffs);
283 if (c->coeff == NULL)
285 /* The stupid model: just guess the mean. */
286 return c->depvar_mean;
288 result = c->intercept;
290 for (j = 0; j < n_vals; j++)
292 result += linreg_coeff (c, j) * vals[j];
299 linreg_residual (const struct linreg *c, double obs, const double *vals, size_t n_vals)
301 if (vals == NULL || c == NULL)
305 return (obs - linreg_predict (c, vals, n_vals));
309 Mean of the independent variable.
312 linreg_get_indep_variable_mean (const struct linreg *c, size_t j)
315 return gsl_vector_get (c->indep_means, j);
319 linreg_set_indep_variable_mean (struct linreg *c, size_t j, double m)
322 gsl_vector_set (c->indep_means, j, m);
327 linreg_fit_qr (const gsl_matrix *cov, struct linreg *l)
329 double intcpt_coef = 0.0;
330 double intercept_variance = 0.0;
340 xtx = gsl_matrix_alloc (cov->size1 - 1, cov->size2 - 1);
341 xty = gsl_vector_alloc (cov->size1 - 1);
342 tau = gsl_vector_alloc (cov->size1 - 1);
343 params = gsl_vector_alloc (cov->size1 - 1);
345 for (i = 0; i < xtx->size1; i++)
347 gsl_vector_set (xty, i, gsl_matrix_get (cov, cov->size2 - 1, i));
348 for (j = 0; j < xtx->size2; j++)
350 gsl_matrix_set (xtx, i, j, gsl_matrix_get (cov, i, j));
353 gsl_linalg_QR_decomp (xtx, tau);
354 q = gsl_matrix_alloc (xtx->size1, xtx->size2);
355 r = gsl_matrix_alloc (xtx->size1, xtx->size2);
357 gsl_linalg_QR_unpack (xtx, tau, q, r);
358 gsl_linalg_QR_solve (xtx, tau, xty, params);
359 for (i = 0; i < params->size; i++)
361 l->coeff[i] = gsl_vector_get (params, i);
363 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
365 for (i = 0; i < l->n_indeps; i++)
367 l->ssm += gsl_vector_get (xty, i) * l->coeff[i];
369 l->sse = l->sst - l->ssm;
371 gsl_blas_dtrsm (CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, linreg_mse (l),
373 /* Copy the lower triangle into the upper triangle. */
374 for (i = 0; i < q->size1; i++)
376 gsl_matrix_set (l->cov, i + 1, i + 1, gsl_matrix_get (q, i, i));
377 for (j = i + 1; j < q->size2; j++)
379 intercept_variance -= 2.0 * gsl_matrix_get (q, i, j) *
380 linreg_get_indep_variable_mean (l, i) *
381 linreg_get_indep_variable_mean (l, j);
382 gsl_matrix_set (q, i, j, gsl_matrix_get (q, j, i));
388 l->intercept = linreg_get_depvar_mean (l);
389 for (i = 0; i < l->n_indeps; i++)
391 double tmp = linreg_get_indep_variable_mean (l, i);
392 l->intercept -= l->coeff[i] * tmp;
393 intercept_variance += tmp * tmp * gsl_matrix_get (q, i, i);
396 /* Covariances related to the intercept. */
397 intercept_variance += linreg_mse (l) / linreg_n_obs (l);
398 gsl_matrix_set (l->cov, 0, 0, intercept_variance);
399 for (i = 0; i < q->size1; i++)
401 for (j = 0; j < q->size2; j++)
403 intcpt_coef -= gsl_matrix_get (q, i, j)
404 * linreg_get_indep_variable_mean (l, j);
406 gsl_matrix_set (l->cov, 0, i + 1, intcpt_coef);
407 gsl_matrix_set (l->cov, i + 1, 0, intcpt_coef);
414 gsl_vector_free (xty);
415 gsl_vector_free (tau);
416 gsl_matrix_free (xtx);
417 gsl_vector_free (params);
421 #define REG_LARGE_DATA 1000
424 Estimate the model parameters from the covariance matrix. This
425 function assumes the covariance entries corresponding to the
426 dependent variable are in the final row and column of the covariance
430 linreg_fit (const gsl_matrix *cov, struct linreg *l)
433 assert (cov != NULL);
435 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
438 /* This QR decomposition path seems to produce the incorrect
439 values. See https://savannah.gnu.org/bugs/?51373 */
440 if ((l->n_obs * l->n_obs > l->n_indeps) && (l->n_obs > REG_LARGE_DATA))
443 For large data sets, use QR decomposition.
445 linreg_fit_qr (cov, l);
450 gsl_matrix *params = gsl_matrix_calloc (cov->size1, cov->size2);
451 gsl_matrix_memcpy (params, cov);
452 reg_sweep (params, l->dependent_column);
453 post_sweep_computations (l, params);
454 gsl_matrix_free (params);
459 linreg_mse (const struct linreg *c)
462 return (c->sse / c->dfe);
466 linreg_intercept (const struct linreg *c)
472 linreg_cov (const struct linreg *c)
478 linreg_coeff (const struct linreg *c, size_t i)
480 return (c->coeff[i]);
483 const struct variable *
484 linreg_indep_var (const struct linreg *c, size_t i)
486 return (c->indep_vars[i]);
490 linreg_n_indeps (const struct linreg *c)
496 const struct variable *
497 linreg_dep_var (const struct linreg *c)
504 linreg_n_coeffs (const struct linreg *c)
510 linreg_n_obs (const struct linreg *c)
516 linreg_sse (const struct linreg *c)
522 linreg_ssreg (const struct linreg *c)
524 return (c->sst - c->sse);
527 double linreg_sst (const struct linreg *c)
533 linreg_dfmodel (const struct linreg *c)
539 linreg_dferror (const struct linreg *c)
545 linreg_dftotal (const struct linreg *c)
551 linreg_set_depvar_mean (struct linreg *c, double x)
557 linreg_get_depvar_mean (const struct linreg *c)
559 return c->depvar_mean;