1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005, 2010, 2011, 2017 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "math/linreg.h"
21 #include <gsl/gsl_blas.h>
22 #include <gsl/gsl_cblas.h>
23 #include <gsl/gsl_errno.h>
24 #include <gsl/gsl_fit.h>
25 #include <gsl/gsl_linalg.h>
26 #include <gsl/gsl_multifit.h>
28 #include "data/value.h"
29 #include "data/variable.h"
30 #include "linreg/sweep.h"
32 #include "gl/xalloc.h"
35 Find the least-squares estimate of b for the linear model:
39 where Y is an n-by-1 column vector, X is an n-by-p matrix of
40 independent variables, b is a p-by-1 vector of regression coefficients,
41 and Z is an n-by-1 normally-distributed random vector with independent
42 identically distributed components with mean 0.
44 This estimate is found via the sweep operator or singular-value
45 decomposition with gsl.
50 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
51 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
53 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
56 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
57 Springer. 1998. ISBN 0-387-98542-5.
62 double n_obs; /* Number of observations. */
63 int n_indeps; /* Number of independent variables. */
64 int n_coeffs; /* The intercept is not considered a
68 Pointers to the variables.
70 const struct variable *depvar;
71 const struct variable **indep_vars;
76 Means and standard deviations of the variables.
77 If these pointers are null when pspp_linreg() is
78 called, pspp_linreg() will compute their values.
80 Entry i of indep_means is the mean of independent
81 variable i, whose observations are stored in the ith
82 column of the design matrix.
85 gsl_vector *indep_means;
86 gsl_vector *indep_std;
91 double ssm; /* Sums of squares for the overall model. */
92 double sst; /* Sum of squares total. */
93 double sse; /* Sum of squares error. */
94 double mse; /* Mean squared error. This is just sse /
95 dfe, but since it is the best unbiased
96 estimate of the population variance, it
97 has its own entry here. */
99 Covariance matrix of the parameter estimates.
109 int dependent_column; /* Column containing the dependent variable. Defaults to last column. */
115 const struct variable **
116 linreg_get_vars (const struct linreg *c)
118 return c->indep_vars;
122 Allocate a linreg and return a pointer to it. n is the number of
123 cases, p is the number of independent variables.
126 linreg_alloc (const struct variable *depvar, const struct variable **indep_vars,
127 double n, size_t p, bool origin)
132 c = xmalloc (sizeof (*c));
134 c->indep_vars = xnmalloc (p, sizeof (*indep_vars));
135 c->dependent_column = p;
136 for (i = 0; i < p; i++)
138 c->indep_vars[i] = indep_vars[i];
140 c->indep_means = gsl_vector_alloc (p);
141 c->indep_std = gsl_vector_alloc (p);
146 c->coeff = xnmalloc (p, sizeof (*c->coeff));
147 c->cov = gsl_matrix_calloc (c->n_coeffs + 1, c->n_coeffs + 1);
153 c->dfe = c->dft - c->dfm;
155 c->depvar_mean = 0.0;
169 linreg_ref (struct linreg *c)
175 linreg_unref (struct linreg *c)
177 if (--c->refcnt == 0)
179 gsl_vector_free (c->indep_means);
180 gsl_vector_free (c->indep_std);
181 gsl_matrix_free (c->cov);
182 free (c->indep_vars);
189 post_sweep_computations (struct linreg *l, gsl_matrix *sw)
199 l->sse = gsl_matrix_get (sw, l->n_indeps, l->n_indeps);
200 l->mse = l->sse / l->dfe;
205 for (i = 0; i < l->n_indeps; i++)
207 double tmp = gsl_matrix_get (sw, i, l->n_indeps);
209 m -= tmp * linreg_get_indep_variable_mean (l, i);
212 Get the covariance matrix of the parameter estimates.
213 Only the upper triangle is necessary.
217 The loops below do not compute the entries related
218 to the estimated intercept.
220 for (i = 0; i < l->n_indeps; i++)
221 for (j = i; j < l->n_indeps; j++)
223 double tmp = -1.0 * l->mse * gsl_matrix_get (sw, i, j);
224 gsl_matrix_set (l->cov, i + 1, j + 1, tmp);
231 gsl_matrix_view xmxtx;
233 Get the covariances related to the intercept.
235 xtx = gsl_matrix_submatrix (sw, 0, 0, l->n_indeps, l->n_indeps);
236 xmxtx = gsl_matrix_submatrix (l->cov, 0, 1, 1, l->n_indeps);
237 xm = gsl_matrix_calloc (1, l->n_indeps);
238 for (i = 0; i < xm->size2; i++)
240 gsl_matrix_set (xm, 0, i,
241 linreg_get_indep_variable_mean (l, i));
243 rc = gsl_blas_dsymm (CblasRight, CblasUpper, l->mse,
244 &xtx.matrix, xm, 0.0, &xmxtx.matrix);
245 gsl_matrix_free (xm);
246 if (rc == GSL_SUCCESS)
248 double tmp = l->mse / l->n_obs;
249 for (i = 1; i < 1 + l->n_indeps; i++)
251 tmp -= gsl_matrix_get (l->cov, 0, i)
252 * linreg_get_indep_variable_mean (l, i - 1);
254 gsl_matrix_set (l->cov, 0, 0, tmp);
260 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
261 __FILE__, __LINE__, gsl_strerror (rc));
268 Predict the value of the dependent variable with the new set of
269 predictors. VALS are assumed to be in the order corresponding to the
270 order of the coefficients in the linreg struct.
273 linreg_predict (const struct linreg *c, const double *vals, size_t n_vals)
278 assert (n_vals = c->n_coeffs);
279 if (vals == NULL || c == NULL)
283 if (c->coeff == NULL)
285 /* The stupid model: just guess the mean. */
286 return c->depvar_mean;
288 result = c->intercept;
290 for (j = 0; j < n_vals; j++)
292 result += linreg_coeff (c, j) * vals[j];
299 linreg_residual (const struct linreg *c, double obs, const double *vals, size_t n_vals)
301 if (vals == NULL || c == NULL)
305 return (obs - linreg_predict (c, vals, n_vals));
309 Mean of the independent variable.
311 double linreg_get_indep_variable_mean (const struct linreg *c, size_t j)
314 return gsl_vector_get (c->indep_means, j);
317 void linreg_set_indep_variable_mean (struct linreg *c, size_t j, double m)
320 gsl_vector_set (c->indep_means, j, m);
324 linreg_fit_qr (const gsl_matrix *cov, struct linreg *l)
326 double intcpt_coef = 0.0;
327 double intercept_variance = 0.0;
337 xtx = gsl_matrix_alloc (cov->size1 - 1, cov->size2 - 1);
338 xty = gsl_vector_alloc (cov->size1 - 1);
339 tau = gsl_vector_alloc (cov->size1 - 1);
340 params = gsl_vector_alloc (cov->size1 - 1);
342 for (i = 0; i < xtx->size1; i++)
344 gsl_vector_set (xty, i, gsl_matrix_get (cov, cov->size2 - 1, i));
345 for (j = 0; j < xtx->size2; j++)
347 gsl_matrix_set (xtx, i, j, gsl_matrix_get (cov, i, j));
350 gsl_linalg_QR_decomp (xtx, tau);
351 q = gsl_matrix_alloc (xtx->size1, xtx->size2);
352 r = gsl_matrix_alloc (xtx->size1, xtx->size2);
354 gsl_linalg_QR_unpack (xtx, tau, q, r);
355 gsl_linalg_QR_solve (xtx, tau, xty, params);
356 for (i = 0; i < params->size; i++)
358 l->coeff[i] = gsl_vector_get (params, i);
360 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
362 for (i = 0; i < l->n_indeps; i++)
364 l->ssm += gsl_vector_get (xty, i) * l->coeff[i];
366 l->sse = l->sst - l->ssm;
368 gsl_blas_dtrsm (CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, linreg_mse (l),
370 /* Copy the lower triangle into the upper triangle. */
371 for (i = 0; i < q->size1; i++)
373 gsl_matrix_set (l->cov, i + 1, i + 1, gsl_matrix_get (q, i, i));
374 for (j = i + 1; j < q->size2; j++)
376 intercept_variance -= 2.0 * gsl_matrix_get (q, i, j) *
377 linreg_get_indep_variable_mean (l, i) *
378 linreg_get_indep_variable_mean (l, j);
379 gsl_matrix_set (q, i, j, gsl_matrix_get (q, j, i));
382 l->intercept = linreg_get_depvar_mean (l);
383 for (i = 0; i < l->n_indeps; i++)
385 double tmp = linreg_get_indep_variable_mean (l, i);
386 l->intercept -= l->coeff[i] * tmp;
387 intercept_variance += tmp * tmp * gsl_matrix_get (q, i, i);
390 /* Covariances related to the intercept. */
391 intercept_variance += linreg_mse (l) / linreg_n_obs (l);
392 gsl_matrix_set (l->cov, 0, 0, intercept_variance);
393 for (i = 0; i < q->size1; i++)
395 for (j = 0; j < q->size2; j++)
397 intcpt_coef -= gsl_matrix_get (q, i, j)
398 * linreg_get_indep_variable_mean (l, j);
400 gsl_matrix_set (l->cov, 0, i + 1, intcpt_coef);
401 gsl_matrix_set (l->cov, i + 1, 0, intcpt_coef);
407 gsl_vector_free (xty);
408 gsl_vector_free (tau);
409 gsl_matrix_free (xtx);
410 gsl_vector_free (params);
413 #define REG_LARGE_DATA 1000
416 Estimate the model parameters from the covariance matrix. This
417 function assumes the covariance entries corresponding to the
418 dependent variable are in the final row and column of the covariance
422 linreg_fit (const gsl_matrix *cov, struct linreg *l)
425 assert (cov != NULL);
427 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
429 if ((l->n_obs * l->n_obs > l->n_indeps) && ( l->n_obs > REG_LARGE_DATA))
432 For large data sets, use QR decomposition.
434 linreg_fit_qr (cov, l);
438 gsl_matrix *params = gsl_matrix_calloc (cov->size1, cov->size2);
439 gsl_matrix_memcpy (params, cov);
440 reg_sweep (params, l->dependent_column);
441 post_sweep_computations (l, params);
442 gsl_matrix_free (params);
446 double linreg_mse (const struct linreg *c)
449 return (c->sse / c->dfe);
452 double linreg_intercept (const struct linreg *c)
458 linreg_cov (const struct linreg *c)
464 linreg_coeff (const struct linreg *c, size_t i)
466 return (c->coeff[i]);
469 const struct variable *
470 linreg_indep_var (const struct linreg *c, size_t i)
472 return (c->indep_vars[i]);
476 linreg_n_indeps (const struct linreg *c)
482 const struct variable *
483 linreg_dep_var (const struct linreg *c)
490 linreg_n_coeffs (const struct linreg *c)
496 linreg_n_obs (const struct linreg *c)
502 linreg_sse (const struct linreg *c)
508 linreg_ssreg (const struct linreg *c)
510 return (c->sst - c->sse);
513 double linreg_sst (const struct linreg *c)
519 linreg_dfmodel ( const struct linreg *c)
525 linreg_dferror ( const struct linreg *c)
531 linreg_dftotal ( const struct linreg *c)
537 linreg_set_depvar_mean (struct linreg *c, double x)
543 linreg_get_depvar_mean (const struct linreg *c)
545 return c->depvar_mean;