1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005, 2010 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include <gsl/gsl_blas.h>
19 #include <gsl/gsl_cblas.h>
20 #include <gsl/gsl_errno.h>
21 #include <gsl/gsl_fit.h>
22 #include <gsl/gsl_linalg.h>
23 #include <gsl/gsl_multifit.h>
24 #include <linreg/sweep.h>
25 #include <math/linreg.h>
26 #include <src/data/variable.h>
27 #include <src/data/value.h>
28 #include <gl/xalloc.h>
31 Find the least-squares estimate of b for the linear model:
35 where Y is an n-by-1 column vector, X is an n-by-p matrix of
36 independent variables, b is a p-by-1 vector of regression coefficients,
37 and Z is an n-by-1 normally-distributed random vector with independent
38 identically distributed components with mean 0.
40 This estimate is found via the sweep operator or singular-value
41 decomposition with gsl.
46 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
47 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
49 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
52 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
53 Springer. 1998. ISBN 0-387-98542-5.
57 const struct variable **
58 linreg_get_vars (const linreg *c)
64 Allocate a linreg and return a pointer to it. n is the number of
65 cases, p is the number of independent variables.
68 linreg_alloc (const struct variable *depvar, const struct variable **indep_vars,
74 c = xmalloc (sizeof (*c));
76 c->indep_vars = xnmalloc (p, sizeof (*indep_vars));
77 c->dependent_column = p;
78 for (i = 0; i < p; i++)
80 c->indep_vars[i] = indep_vars[i];
82 c->indep_means = gsl_vector_alloc (p);
83 c->indep_std = gsl_vector_alloc (p);
85 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
91 c->coeff = xnmalloc (p, sizeof (*c->coeff));
92 c->cov = gsl_matrix_calloc (c->n_coeffs + 1, c->n_coeffs + 1);
95 c->dfe = c->dft - c->dfm;
102 c->method = LINREG_SWEEP;
110 linreg_free (void *m)
115 gsl_vector_free (c->indep_means);
116 gsl_vector_free (c->indep_std);
117 gsl_matrix_free (c->cov);
118 free (c->indep_vars);
126 post_sweep_computations (linreg *l, gsl_matrix *sw)
130 gsl_matrix_view xmxtx;
140 l->sse = gsl_matrix_get (sw, l->n_indeps, l->n_indeps);
141 l->mse = l->sse / l->dfe;
146 for (i = 0; i < l->n_indeps; i++)
148 tmp = gsl_matrix_get (sw, i, l->n_indeps);
150 m -= tmp * linreg_get_indep_variable_mean (l, i);
153 Get the covariance matrix of the parameter estimates.
154 Only the upper triangle is necessary.
158 The loops below do not compute the entries related
159 to the estimated intercept.
161 for (i = 0; i < l->n_indeps; i++)
162 for (j = i; j < l->n_indeps; j++)
164 tmp = -1.0 * l->mse * gsl_matrix_get (sw, i, j);
165 gsl_matrix_set (l->cov, i + 1, j + 1, tmp);
168 Get the covariances related to the intercept.
170 xtx = gsl_matrix_submatrix (sw, 0, 0, l->n_indeps, l->n_indeps);
171 xmxtx = gsl_matrix_submatrix (l->cov, 0, 1, 1, l->n_indeps);
172 xm = gsl_matrix_calloc (1, l->n_indeps);
173 for (i = 0; i < xm->size2; i++)
175 gsl_matrix_set (xm, 0, i,
176 linreg_get_indep_variable_mean (l, i));
178 rc = gsl_blas_dsymm (CblasRight, CblasUpper, l->mse,
179 &xtx.matrix, xm, 0.0, &xmxtx.matrix);
180 gsl_matrix_free (xm);
181 if (rc == GSL_SUCCESS)
183 tmp = l->mse / l->n_obs;
184 for (i = 1; i < 1 + l->n_indeps; i++)
186 tmp -= gsl_matrix_get (l->cov, 0, i)
187 * linreg_get_indep_variable_mean (l, i - 1);
189 gsl_matrix_set (l->cov, 0, 0, tmp);
195 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
196 __FILE__, __LINE__, gsl_strerror (rc));
202 Predict the value of the dependent variable with the new set of
203 predictors. VALS are assumed to be in the order corresponding to the
204 order of the coefficients in the linreg struct.
207 linreg_predict (const linreg *c, const double *vals, size_t n_vals)
212 assert (n_vals = c->n_coeffs);
213 if (vals == NULL || c == NULL)
217 if (c->coeff == NULL)
219 /* The stupid model: just guess the mean. */
220 return c->depvar_mean;
222 result = c->intercept;
224 for (j = 0; j < n_vals; j++)
226 result += linreg_coeff (c, j) * vals[j];
233 linreg_residual (const linreg *c, double obs, const double *vals, size_t n_vals)
235 if (vals == NULL || c == NULL)
239 return (obs - linreg_predict (c, vals, n_vals));
242 double linreg_get_indep_variable_sd (linreg *c, size_t j)
245 return gsl_vector_get (c->indep_std, j);
248 void linreg_set_indep_variable_sd (linreg *c, size_t j, double s)
251 gsl_vector_set (c->indep_std, j, s);
255 Mean of the independent variable.
257 double linreg_get_indep_variable_mean (linreg *c, size_t j)
260 return gsl_vector_get (c->indep_means, j);
263 void linreg_set_indep_variable_mean (linreg *c, size_t j, double m)
266 gsl_vector_set (c->indep_means, j, m);
270 linreg_fit_qr (const gsl_matrix *cov, linreg *l)
272 double intcpt_coef = 0.0;
273 double intercept_variance = 0.0;
284 xtx = gsl_matrix_alloc (cov->size1 - 1, cov->size2 - 1);
285 xty = gsl_vector_alloc (cov->size1 - 1);
286 tau = gsl_vector_alloc (cov->size1 - 1);
287 params = gsl_vector_alloc (cov->size1 - 1);
289 for (i = 0; i < xtx->size1; i++)
291 gsl_vector_set (xty, i, gsl_matrix_get (cov, cov->size2 - 1, i));
292 for (j = 0; j < xtx->size2; j++)
294 gsl_matrix_set (xtx, i, j, gsl_matrix_get (cov, i, j));
297 gsl_linalg_QR_decomp (xtx, tau);
298 q = gsl_matrix_alloc (xtx->size1, xtx->size2);
299 r = gsl_matrix_alloc (xtx->size1, xtx->size2);
301 gsl_linalg_QR_unpack (xtx, tau, q, r);
302 gsl_linalg_QR_solve (xtx, tau, xty, params);
303 for (i = 0; i < params->size; i++)
305 l->coeff[i] = gsl_vector_get (params, i);
307 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
309 for (i = 0; i < l->n_indeps; i++)
311 l->ssm += gsl_vector_get (xty, i) * l->coeff[i];
313 l->sse = l->sst - l->ssm;
315 gsl_blas_dtrsm (CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, linreg_mse (l),
317 /* Copy the lower triangle into the upper triangle. */
318 for (i = 0; i < q->size1; i++)
320 gsl_matrix_set (l->cov, i + 1, i + 1, gsl_matrix_get (q, i, i));
321 for (j = i + 1; j < q->size2; j++)
323 intercept_variance -= 2.0 * gsl_matrix_get (q, i, j) *
324 linreg_get_indep_variable_mean (l, i) *
325 linreg_get_indep_variable_mean (l, j);
326 gsl_matrix_set (q, i, j, gsl_matrix_get (q, j, i));
329 l->intercept = linreg_get_depvar_mean (l);
331 for (i = 0; i < l->n_indeps; i++)
333 tmp = linreg_get_indep_variable_mean (l, i);
334 l->intercept -= l->coeff[i] * tmp;
335 intercept_variance += tmp * tmp * gsl_matrix_get (q, i, i);
338 /* Covariances related to the intercept. */
339 intercept_variance += linreg_mse (l) / linreg_n_obs (l);
340 gsl_matrix_set (l->cov, 0, 0, intercept_variance);
341 for (i = 0; i < q->size1; i++)
343 for (j = 0; j < q->size2; j++)
345 intcpt_coef -= gsl_matrix_get (q, i, j)
346 * linreg_get_indep_variable_mean (l, j);
348 gsl_matrix_set (l->cov, 0, i + 1, intcpt_coef);
349 gsl_matrix_set (l->cov, i + 1, 0, intcpt_coef);
355 gsl_vector_free (xty);
356 gsl_vector_free (tau);
357 gsl_matrix_free (xtx);
358 gsl_vector_free (params);
362 Estimate the model parameters from the covariance matrix. This
363 function assumes the covariance entries corresponding to the
364 dependent variable are in the final row and column of the covariance
368 linreg_fit (const gsl_matrix *cov, linreg *l)
371 assert (cov != NULL);
373 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
374 if (l->method == LINREG_SWEEP)
377 params = gsl_matrix_calloc (cov->size1, cov->size2);
378 gsl_matrix_memcpy (params, cov);
379 reg_sweep (params, l->dependent_column);
380 post_sweep_computations (l, params);
381 gsl_matrix_free (params);
383 else if (l->method == LINREG_QR)
385 linreg_fit_qr (cov, l);
389 double linreg_mse (const linreg *c)
392 return (c->sse / c->dfe);
395 double linreg_intercept (const linreg *c)
401 linreg_cov (const linreg *c)
407 linreg_coeff (const linreg *c, size_t i)
409 return (c->coeff[i]);
412 const struct variable *
413 linreg_indep_var (const linreg *c, size_t i)
415 return (c->indep_vars[i]);
419 linreg_n_coeffs (const linreg *c)
425 linreg_n_obs (const linreg *c)
431 linreg_sse (const linreg *c)
437 linreg_ssreg (const linreg *c)
439 return (c->sst - c->sse);
442 double linreg_sst (const linreg *c)
448 linreg_dfmodel ( const linreg *c)
454 linreg_set_depvar_mean (linreg *c, double x)
460 linreg_get_depvar_mean (linreg *c)
462 return c->depvar_mean;