1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include <gsl/gsl_blas.h>
19 #include <gsl/gsl_cblas.h>
20 #include <gsl/gsl_errno.h>
21 #include <gsl/gsl_fit.h>
22 #include <gsl/gsl_linalg.h>
23 #include <gsl/gsl_multifit.h>
24 #include <linreg/sweep.h>
25 #include <math/linreg.h>
26 #include <src/data/variable.h>
27 #include <src/data/value.h>
28 #include <gl/xalloc.h>
31 Find the least-squares estimate of b for the linear model:
35 where Y is an n-by-1 column vector, X is an n-by-p matrix of
36 independent variables, b is a p-by-1 vector of regression coefficients,
37 and Z is an n-by-1 normally-distributed random vector with independent
38 identically distributed components with mean 0.
40 This estimate is found via the sweep operator or singular-value
41 decomposition with gsl.
46 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
47 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
49 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
52 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
53 Springer. 1998. ISBN 0-387-98542-5.
57 const struct variable **
58 linreg_get_vars (const linreg *c)
64 Allocate a linreg and return a pointer to it. n is the number of
65 cases, p is the number of independent variables.
68 linreg_alloc (const struct variable *depvar, const struct variable **indep_vars,
74 c = xmalloc (sizeof (*c));
76 c->indep_vars = xnmalloc (p, sizeof (*indep_vars));
77 for (i = 0; i < p; i++)
79 c->indep_vars[i] = indep_vars[i];
81 c->indep_means = gsl_vector_alloc (p);
82 c->indep_std = gsl_vector_alloc (p);
84 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
90 c->coeff = xnmalloc (p, sizeof (*c->coeff));
91 c->cov = gsl_matrix_calloc (c->n_coeffs + 1, c->n_coeffs + 1);
94 c->dfe = c->dft - c->dfm;
101 c->method = LINREG_SWEEP;
109 linreg_free (void *m)
114 gsl_vector_free (c->indep_means);
115 gsl_vector_free (c->indep_std);
116 gsl_matrix_free (c->cov);
117 free (c->indep_vars);
125 post_sweep_computations (linreg *l, gsl_matrix *sw)
129 gsl_matrix_view xmxtx;
139 l->sse = gsl_matrix_get (sw, l->n_indeps, l->n_indeps);
140 l->mse = l->sse / l->dfe;
145 for (i = 0; i < l->n_indeps; i++)
147 tmp = gsl_matrix_get (sw, i, l->n_indeps);
149 m -= tmp * linreg_get_indep_variable_mean (l, i);
152 Get the covariance matrix of the parameter estimates.
153 Only the upper triangle is necessary.
157 The loops below do not compute the entries related
158 to the estimated intercept.
160 for (i = 0; i < l->n_indeps; i++)
161 for (j = i; j < l->n_indeps; j++)
163 tmp = -1.0 * l->mse * gsl_matrix_get (sw, i, j);
164 gsl_matrix_set (l->cov, i + 1, j + 1, tmp);
167 Get the covariances related to the intercept.
169 xtx = gsl_matrix_submatrix (sw, 0, 0, l->n_indeps, l->n_indeps);
170 xmxtx = gsl_matrix_submatrix (l->cov, 0, 1, 1, l->n_indeps);
171 xm = gsl_matrix_calloc (1, l->n_indeps);
172 for (i = 0; i < xm->size2; i++)
174 gsl_matrix_set (xm, 0, i,
175 linreg_get_indep_variable_mean (l, i));
177 rc = gsl_blas_dsymm (CblasRight, CblasUpper, l->mse,
178 &xtx.matrix, xm, 0.0, &xmxtx.matrix);
179 gsl_matrix_free (xm);
180 if (rc == GSL_SUCCESS)
182 tmp = l->mse / l->n_obs;
183 for (i = 1; i < 1 + l->n_indeps; i++)
185 tmp -= gsl_matrix_get (l->cov, 0, i)
186 * linreg_get_indep_variable_mean (l, i - 1);
188 gsl_matrix_set (l->cov, 0, 0, tmp);
194 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
195 __FILE__, __LINE__, gsl_strerror (rc));
201 Predict the value of the dependent variable with the new set of
202 predictors. VALS are assumed to be in the order corresponding to the
203 order of the coefficients in the linreg struct.
206 linreg_predict (const linreg *c, const double *vals, size_t n_vals)
211 assert (n_vals = c->n_coeffs);
212 if (vals == NULL || c == NULL)
216 if (c->coeff == NULL)
218 /* The stupid model: just guess the mean. */
219 return c->depvar_mean;
221 result = c->intercept;
223 for (j = 0; j < n_vals; j++)
225 result += linreg_coeff (c, j) * vals[j];
232 linreg_residual (const linreg *c, double obs, const double *vals, size_t n_vals)
234 if (vals == NULL || c == NULL)
238 return (obs - linreg_predict (c, vals, n_vals));
241 double linreg_get_indep_variable_sd (linreg *c, size_t j)
244 return gsl_vector_get (c->indep_std, j);
247 void linreg_set_indep_variable_sd (linreg *c, size_t j, double s)
250 gsl_vector_set (c->indep_std, j, s);
254 Mean of the independent variable.
256 double linreg_get_indep_variable_mean (linreg *c, size_t j)
259 return gsl_vector_get (c->indep_means, j);
262 void linreg_set_indep_variable_mean (linreg *c, size_t j, double m)
265 gsl_vector_set (c->indep_means, j, m);
269 linreg_fit_qr (const gsl_matrix *cov, linreg *l)
281 xtx = gsl_matrix_alloc (cov->size1 - 1, cov->size2 - 1);
282 xty = gsl_vector_alloc (cov->size1 - 1);
283 tau = gsl_vector_alloc (cov->size1 - 1);
284 params = gsl_vector_alloc (cov->size1 - 1);
286 for (i = 0; i < xtx->size1; i++)
288 gsl_vector_set (xty, i, gsl_matrix_get (cov, cov->size2 - 1, i));
289 for (j = 0; j < xtx->size2; j++)
291 gsl_matrix_set (xtx, i, j, gsl_matrix_get (cov, i, j));
294 gsl_linalg_QR_decomp (xtx, tau);
295 q = gsl_matrix_alloc (xtx->size1, xtx->size2);
296 r = gsl_matrix_alloc (xtx->size1, xtx->size2);
298 gsl_linalg_QR_unpack (xtx, tau, q, r);
299 gsl_linalg_QR_solve (xtx, tau, xty, params);
300 for (i = 0; i < params->size; i++)
302 l->coeff[i] = gsl_vector_get (params, i);
304 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
306 for (i = 0; i < l->n_indeps; i++)
308 l->ssm += gsl_vector_get (xty, i) * l->coeff[i];
310 l->sse = l->sst - l->ssm;
312 gsl_blas_dtrsm (CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, linreg_mse (l),
314 /* Copy the lower triangle into the upper triangle. */
315 double intercept_variance = 0.0;
316 for (i = 0; i < q->size1; i++)
318 gsl_matrix_set (l->cov, i + 1, i + 1, gsl_matrix_get (q, i, i));
319 for (j = i + 1; j < q->size2; j++)
321 intercept_variance -= 2.0 * gsl_matrix_get (q, i, j) *
322 linreg_get_indep_variable_mean (l, i) *
323 linreg_get_indep_variable_mean (l, j);
324 gsl_matrix_set (q, i, j, gsl_matrix_get (q, j, i));
327 l->intercept = linreg_get_depvar_mean (l);
329 for (i = 0; i < l->n_indeps; i++)
331 tmp = linreg_get_indep_variable_mean (l, i);
332 l->intercept -= l->coeff[i] * tmp;
333 intercept_variance += tmp * tmp * gsl_matrix_get (q, i, i);
336 /* Covariances related to the intercept. */
337 intercept_variance += linreg_mse (l) / linreg_n_obs (l);
338 gsl_matrix_set (l->cov, 0, 0, intercept_variance);
339 double intcpt_coef = 0.0;
340 for (i = 0; i < q->size1; i++)
342 for (j = 0; j < q->size2; j++)
344 intcpt_coef -= gsl_matrix_get (q, i, j)
345 * linreg_get_indep_variable_mean (l, j);
347 gsl_matrix_set (l->cov, 0, i + 1, intcpt_coef);
348 gsl_matrix_set (l->cov, i + 1, 0, intcpt_coef);
354 gsl_vector_free (xty);
355 gsl_vector_free (tau);
356 gsl_matrix_free (xtx);
357 gsl_vector_free (params);
361 Estimate the model parameters from the covariance matrix. This
362 function assumes the covariance entries corresponding to the
363 dependent variable are in the final row and column of the covariance
367 linreg_fit (const gsl_matrix *cov, linreg *l)
370 assert (cov != NULL);
372 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
373 if (l->method == LINREG_SWEEP)
376 params = gsl_matrix_calloc (cov->size1, cov->size2);
377 gsl_matrix_memcpy (params, cov);
379 post_sweep_computations (l, params);
380 gsl_matrix_free (params);
382 else if (l->method == LINREG_QR)
384 linreg_fit_qr (cov, l);
388 double linreg_mse (const linreg *c)
391 return (c->sse / c->dfe);
394 double linreg_intercept (const linreg *c)
400 linreg_cov (const linreg *c)
406 linreg_coeff (const linreg *c, size_t i)
408 return (c->coeff[i]);
411 const struct variable *
412 linreg_indep_var (const linreg *c, size_t i)
414 return (c->indep_vars[i]);
418 linreg_n_coeffs (const linreg *c)
424 linreg_n_obs (const linreg *c)
430 linreg_sse (const linreg *c)
436 linreg_ssreg (const linreg *c)
438 return (c->sst - c->sse);
441 double linreg_sst (const linreg *c)
447 linreg_dfmodel ( const linreg *c)
453 linreg_set_depvar_mean (linreg *c, double x)
459 linreg_get_depvar_mean (linreg *c)
461 return c->depvar_mean;