1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include <gsl/gsl_blas.h>
19 #include <gsl/gsl_cblas.h>
20 #include <gsl/gsl_errno.h>
21 #include <gsl/gsl_fit.h>
22 #include <gsl/gsl_linalg.h>
23 #include <gsl/gsl_multifit.h>
24 #include <linreg/sweep.h>
25 #include <math/linreg.h>
26 #include <src/data/category.h>
27 #include <src/data/variable.h>
28 #include <src/data/value.h>
29 #include <gl/xalloc.h>
32 Find the least-squares estimate of b for the linear model:
36 where Y is an n-by-1 column vector, X is an n-by-p matrix of
37 independent variables, b is a p-by-1 vector of regression coefficients,
38 and Z is an n-by-1 normally-distributed random vector with independent
39 identically distributed components with mean 0.
41 This estimate is found via the sweep operator or singular-value
42 decomposition with gsl.
47 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
48 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
50 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
53 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
54 Springer. 1998. ISBN 0-387-98542-5.
58 const struct variable **
59 linreg_get_vars (const linreg *c)
65 Allocate a linreg and return a pointer to it. n is the number of
66 cases, p is the number of independent variables.
69 linreg_alloc (const struct variable *depvar, const struct variable **indep_vars,
75 c = xmalloc (sizeof (*c));
77 c->indep_vars = xnmalloc (p, sizeof (*indep_vars));
78 for (i = 0; i < p; i++)
80 c->indep_vars[i] = indep_vars[i];
82 c->indep_means = gsl_vector_alloc (p);
83 c->indep_std = gsl_vector_alloc (p);
84 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
85 independent variables.
87 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
93 c->coeff = xnmalloc (p, sizeof (*c->coeff));
94 c->cov = gsl_matrix_alloc (c->n_coeffs + 1, c->n_coeffs + 1);
97 c->dfe = c->dft - c->dfm;
102 c->method = LINREG_SWEEP;
110 linreg_free (void *m)
115 gsl_vector_free (c->indep_means);
116 gsl_vector_free (c->indep_std);
117 gsl_matrix_free (c->cov);
118 gsl_vector_free (c->ssx);
119 free (c->indep_vars);
127 post_sweep_computations (linreg *l, gsl_matrix *sw)
131 gsl_matrix_view xmxtx;
141 l->sse = gsl_matrix_get (sw, l->n_indeps, l->n_indeps);
142 l->mse = l->sse / l->dfe;
147 for (i = 0; i < l->n_indeps; i++)
149 tmp = gsl_matrix_get (sw, i, l->n_indeps);
151 m -= tmp * linreg_get_indep_variable_mean (l, i);
154 Get the covariance matrix of the parameter estimates.
155 Only the upper triangle is necessary.
159 The loops below do not compute the entries related
160 to the estimated intercept.
162 for (i = 0; i < l->n_indeps; i++)
163 for (j = i; j < l->n_indeps; j++)
165 tmp = -1.0 * l->mse * gsl_matrix_get (sw, i, j);
166 gsl_matrix_set (l->cov, i + 1, j + 1, tmp);
169 Get the covariances related to the intercept.
171 xtx = gsl_matrix_submatrix (sw, 0, 0, l->n_indeps, l->n_indeps);
172 xmxtx = gsl_matrix_submatrix (l->cov, 0, 1, 1, l->n_indeps);
173 xm = gsl_matrix_calloc (1, l->n_indeps);
174 for (i = 0; i < xm->size2; i++)
176 gsl_matrix_set (xm, 0, i,
177 linreg_get_indep_variable_mean (l, i));
179 rc = gsl_blas_dsymm (CblasRight, CblasUpper, l->mse,
180 &xtx.matrix, xm, 0.0, &xmxtx.matrix);
181 gsl_matrix_free (xm);
182 if (rc == GSL_SUCCESS)
184 tmp = l->mse / l->n_obs;
185 for (i = 1; i < 1 + l->n_indeps; i++)
187 tmp -= gsl_matrix_get (l->cov, 0, i)
188 * linreg_get_indep_variable_mean (l, i - 1);
190 gsl_matrix_set (l->cov, 0, 0, tmp);
196 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
197 __FILE__, __LINE__, gsl_strerror (rc));
203 Predict the value of the dependent variable with the new set of
204 predictors. VALS are assumed to be in the order corresponding to the
205 order of the coefficients in the linreg struct.
208 linreg_predict (const linreg *c, const double *vals, size_t n_vals)
213 assert (n_vals = c->n_coeffs);
214 if (vals == NULL || c == NULL)
218 if (c->coeff == NULL)
220 /* The stupid model: just guess the mean. */
221 return c->depvar_mean;
223 result = c->intercept;
225 for (j = 0; j < n_vals; j++)
227 result += linreg_coeff (c, j) * vals[j];
234 linreg_residual (const linreg *c, double obs, const double *vals, size_t n_vals)
236 if (vals == NULL || c == NULL)
240 return (obs - linreg_predict (c, vals, n_vals));
243 double linreg_get_indep_variable_sd (linreg *c, size_t j)
246 return gsl_vector_get (c->indep_std, j);
249 void linreg_set_indep_variable_sd (linreg *c, size_t j, double s)
252 gsl_vector_set (c->indep_std, j, s);
256 Mean of the independent variable.
258 double linreg_get_indep_variable_mean (linreg *c, size_t j)
261 return gsl_vector_get (c->indep_means, j);
264 void linreg_set_indep_variable_mean (linreg *c, size_t j, double m)
267 gsl_vector_set (c->indep_means, j, m);
271 linreg_fit_qr (const gsl_matrix *cov, linreg *l)
283 xtx = gsl_matrix_alloc (cov->size1 - 1, cov->size2 - 1);
284 xty = gsl_vector_alloc (cov->size1 - 1);
285 tau = gsl_vector_alloc (cov->size1 - 1);
286 params = gsl_vector_alloc (cov->size1 - 1);
288 for (i = 0; i < xtx->size1; i++)
290 gsl_vector_set (xty, i, gsl_matrix_get (cov, cov->size2 - 1, i));
291 for (j = 0; j < xtx->size2; j++)
293 gsl_matrix_set (xtx, i, j, gsl_matrix_get (cov, i, j));
296 gsl_linalg_QR_decomp (xtx, tau);
297 q = gsl_matrix_alloc (xtx->size1, xtx->size2);
298 r = gsl_matrix_alloc (xtx->size1, xtx->size2);
299 gsl_linalg_QR_unpack (xtx, tau, q, r);
300 gsl_linalg_QR_solve (xtx, tau, xty, params);
301 for (i = 0; i < params->size; i++)
303 l->coeff[i] = gsl_vector_get (params, i);
306 l->intercept = l->depvar_mean;
307 for (i = 0; i < l->n_indeps; i++)
309 l->intercept -= l->coeff[i] * linreg_get_indep_variable_mean (l, i);
312 l->sse = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
313 for (i = 0; i < l->n_indeps; i++)
315 tmp += gsl_vector_get (xty, i) * l->coeff[i];
318 for (i = 0; i < xtx->size1; i++)
321 for (j = i; j < xtx->size2; j++)
323 tmp += gsl_matrix_get (xtx, i, j) * l->coeff[j];
329 p = l->hat->size1 - 1;
330 for (i = 0; i < l->cov->size1 - 1; i++)
332 gsl_matrix_set (l->cov, p - i, p - i, 1.0 / gsl_matrix_get (r, p - i, p - i));
333 for (j = 0; j < i; j++)
335 tmp = -1.0 * gsl_matrix_get (r, p - i, p - j);
336 tmp /= gsl_matrix_get (r, p - i, p - i) * gsl_matrix_get (r, p - j, p - j);
337 gsl_matrix_set (l->cov, p - i, p - j, tmp);
341 gsl_matrix_transpose_memcpy (l->cov, q);
342 gsl_blas_dtrsm (CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit, 1.0,
347 gsl_vector_free (xty);
348 gsl_vector_free (tau);
349 gsl_matrix_free (xtx);
350 gsl_vector_free (params);
354 Estimate the model parameters from the covariance matrix. This
355 function assumes the covariance entries corresponding to the
356 dependent variable are in the final row and column of the covariance
360 linreg_fit (const gsl_matrix *cov, linreg *l)
364 assert (cov != NULL);
366 params = gsl_matrix_calloc (cov->size1, cov->size2);
367 gsl_matrix_memcpy (params, cov);
368 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
370 if (l->method == LINREG_SWEEP)
373 post_sweep_computations (l, params);
375 else if (l->method == LINREG_QR)
377 linreg_fit_qr (params, l);
379 gsl_matrix_free (params);
382 double linreg_mse (const linreg *c)
385 return (c->sse / c->dfe);
388 double linreg_intercept (const linreg *c)
394 linreg_cov (const linreg *c)
400 linreg_coeff (const linreg *c, size_t i)
402 return (c->coeff[i]);
405 const struct variable *
406 linreg_indep_var (const linreg *c, size_t i)
408 return (c->indep_vars[i]);
412 linreg_n_coeffs (const linreg *c)
418 linreg_n_obs (const linreg *c)
424 linreg_sse (const linreg *c)
430 linreg_ssreg (const linreg *c)
432 return (c->sst - c->sse);
435 double linreg_sst (const linreg *c)
441 linreg_dfmodel ( const linreg *c)