1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include <gsl/gsl_blas.h>
19 #include <gsl/gsl_cblas.h>
20 #include <gsl/gsl_errno.h>
21 #include <gsl/gsl_fit.h>
22 #include <gsl/gsl_linalg.h>
23 #include <gsl/gsl_multifit.h>
24 #include <linreg/sweep.h>
25 #include <math/linreg.h>
26 #include <src/data/category.h>
27 #include <src/data/variable.h>
28 #include <src/data/value.h>
29 #include <gl/xalloc.h>
32 Find the least-squares estimate of b for the linear model:
36 where Y is an n-by-1 column vector, X is an n-by-p matrix of
37 independent variables, b is a p-by-1 vector of regression coefficients,
38 and Z is an n-by-1 normally-distributed random vector with independent
39 identically distributed components with mean 0.
41 This estimate is found via the sweep operator or singular-value
42 decomposition with gsl.
47 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
48 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
50 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
53 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
54 Springer. 1998. ISBN 0-387-98542-5.
58 const struct variable **
59 linreg_get_vars (const linreg *c)
65 Allocate a linreg and return a pointer to it. n is the number of
66 cases, p is the number of independent variables.
69 linreg_alloc (const struct variable *depvar, const struct variable **indep_vars,
74 c = (linreg *) malloc (sizeof (linreg));
76 c->indep_vars = indep_vars;
77 c->indep_means = gsl_vector_alloc (p);
78 c->indep_std = gsl_vector_alloc (p);
79 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
80 independent variables.
82 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
88 c->coeff = xnmalloc (p, sizeof (*c->coeff));
89 c->cov = gsl_matrix_alloc (c->n_coeffs + 1, c->n_coeffs + 1);
92 c->dfe = c->dft - c->dfm;
97 c->method = LINREG_SWEEP;
98 c->resid = NULL; /* The variable storing my residuals. */
99 c->pred = NULL; /* The variable storing my predicted values. */
105 linreg_free (void *m)
110 gsl_vector_free (c->indep_means);
111 gsl_vector_free (c->indep_std);
112 gsl_matrix_free (c->cov);
113 gsl_vector_free (c->ssx);
121 post_sweep_computations (linreg *l, gsl_matrix *sw)
125 gsl_matrix_view xmxtx;
135 l->sse = gsl_matrix_get (sw, l->n_indeps, l->n_indeps);
136 l->mse = l->sse / l->dfe;
141 for (i = 0; i < l->n_indeps; i++)
143 tmp = gsl_matrix_get (sw, i, l->n_indeps);
145 m -= tmp * linreg_get_indep_variable_mean (l, i);
148 Get the covariance matrix of the parameter estimates.
149 Only the upper triangle is necessary.
153 The loops below do not compute the entries related
154 to the estimated intercept.
156 for (i = 0; i < l->n_indeps; i++)
157 for (j = i; j < l->n_indeps; j++)
159 tmp = -1.0 * l->mse * gsl_matrix_get (sw, i, j);
160 gsl_matrix_set (l->cov, i + 1, j + 1, tmp);
163 Get the covariances related to the intercept.
165 xtx = gsl_matrix_submatrix (sw, 0, 0, l->n_indeps, l->n_indeps);
166 xmxtx = gsl_matrix_submatrix (l->cov, 0, 1, 1, l->n_indeps);
167 xm = gsl_matrix_calloc (1, l->n_indeps);
168 for (i = 0; i < xm->size2; i++)
170 gsl_matrix_set (xm, 0, i,
171 linreg_get_indep_variable_mean (l, i));
173 rc = gsl_blas_dsymm (CblasRight, CblasUpper, l->mse,
174 &xtx.matrix, xm, 0.0, &xmxtx.matrix);
175 gsl_matrix_free (xm);
176 if (rc == GSL_SUCCESS)
178 tmp = l->mse / l->n_obs;
179 for (i = 1; i < 1 + l->n_indeps; i++)
181 tmp -= gsl_matrix_get (l->cov, 0, i)
182 * linreg_get_indep_variable_mean (l, i - 1);
184 gsl_matrix_set (l->cov, 0, 0, tmp);
190 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
191 __FILE__, __LINE__, gsl_strerror (rc));
197 Predict the value of the dependent variable with the new set of
198 predictors. VALS are assumed to be in the order corresponding to the
199 order of the coefficients in the linreg struct.
202 linreg_predict (const linreg *c, const double *vals, size_t n_vals)
207 assert (n_vals = c->n_coeffs);
208 if (vals == NULL || c == NULL)
212 if (c->coeff == NULL)
214 /* The stupid model: just guess the mean. */
215 return c->depvar_mean;
217 result = c->intercept;
219 for (j = 0; j < n_vals; j++)
221 result += linreg_coeff (c, j) * vals[j];
228 linreg_residual (const linreg *c, double obs, const double *vals, size_t n_vals)
230 if (vals == NULL || c == NULL)
234 return (obs - linreg_predict (c, vals, n_vals));
237 double linreg_get_indep_variable_sd (linreg *c, size_t j)
240 return gsl_vector_get (c->indep_std, j);
243 void linreg_set_indep_variable_sd (linreg *c, size_t j, double s)
246 gsl_vector_set (c->indep_std, j, s);
250 Mean of the independent variable.
252 double linreg_get_indep_variable_mean (linreg *c, size_t j)
255 return gsl_vector_get (c->indep_means, j);
258 void linreg_set_indep_variable_mean (linreg *c, size_t j, double m)
261 gsl_vector_set (c->indep_means, j, m);
265 linreg_fit_qr (const gsl_matrix *cov, linreg *l)
277 xtx = gsl_matrix_alloc (cov->size1 - 1, cov->size2 - 1);
278 xty = gsl_vector_alloc (cov->size1 - 1);
279 tau = gsl_vector_alloc (cov->size1 - 1);
280 params = gsl_vector_alloc (cov->size1 - 1);
282 for (i = 0; i < xtx->size1; i++)
284 gsl_vector_set (xty, i, gsl_matrix_get (cov, cov->size2 - 1, i));
285 for (j = 0; j < xtx->size2; j++)
287 gsl_matrix_set (xtx, i, j, gsl_matrix_get (cov, i, j));
290 gsl_linalg_QR_decomp (xtx, tau);
291 q = gsl_matrix_alloc (xtx->size1, xtx->size2);
292 r = gsl_matrix_alloc (xtx->size1, xtx->size2);
293 gsl_linalg_QR_unpack (xtx, tau, q, r);
294 gsl_linalg_QR_solve (xtx, tau, xty, params);
295 for (i = 0; i < params->size; i++)
297 l->coeff[i] = gsl_vector_get (params, i);
300 l->intercept = l->depvar_mean;
301 for (i = 0; i < l->n_indeps; i++)
303 l->intercept -= l->coeff[i] * linreg_get_indep_variable_mean (l, i);
306 l->sse = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
307 for (i = 0; i < l->n_indeps; i++)
309 tmp += gsl_vector_get (xty, i) * l->coeff[i];
312 for (i = 0; i < xtx->size1; i++)
315 for (j = i; j < xtx->size2; j++)
317 tmp += gsl_matrix_get (xtx, i, j) * l->coeff[j];
323 p = l->hat->size1 - 1;
324 for (i = 0; i < l->cov->size1 - 1; i++)
326 gsl_matrix_set (l->cov, p - i, p - i, 1.0 / gsl_matrix_get (r, p - i, p - i));
327 for (j = 0; j < i; j++)
329 tmp = -1.0 * gsl_matrix_get (r, p - i, p - j);
330 tmp /= gsl_matrix_get (r, p - i, p - i) * gsl_matrix_get (r, p - j, p - j);
331 gsl_matrix_set (l->cov, p - i, p - j, tmp);
335 gsl_matrix_transpose_memcpy (l->cov, q);
336 gsl_blas_dtrsm (CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 1.0,
341 gsl_vector_free (xty);
342 gsl_vector_free (tau);
343 gsl_matrix_free (xtx);
344 gsl_vector_free (params);
348 Estimate the model parameters from the covariance matrix. This
349 function assumes the covariance entries corresponding to the
350 dependent variable are in the final row and column of the covariance
354 linreg_fit (const gsl_matrix *cov, linreg *l)
358 assert (cov != NULL);
360 params = gsl_matrix_calloc (cov->size1, cov->size2);
361 gsl_matrix_memcpy (params, cov);
363 if (l->method == LINREG_SWEEP)
366 post_sweep_computations (l, params);
368 else if (l->method == LINREG_QR)
370 linreg_fit_qr (params, l);
372 gsl_matrix_free (params);
375 double linreg_mse (const linreg *c)
378 return (c->sse / c->dfe);
381 double linreg_intercept (const linreg *c)
387 linreg_cov (const linreg *c)
393 linreg_coeff (const linreg *c, size_t i)
395 return (c->coeff[i]);
398 const struct variable *
399 linreg_indep_var (const linreg *c, size_t i)
401 return (c->indep_vars[i]);
405 linreg_n_coeffs (const linreg *c)
411 linreg_n_obs (const linreg *c)
417 linreg_ssreg (const linreg *c)
423 linreg_dfmodel ( const linreg *c)