1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include <gsl/gsl_blas.h>
19 #include <gsl/gsl_cblas.h>
20 #include <gsl/gsl_errno.h>
21 #include <gsl/gsl_fit.h>
22 #include <gsl/gsl_linalg.h>
23 #include <gsl/gsl_multifit.h>
24 #include <linreg/sweep.h>
25 #include <math/linreg.h>
26 #include <src/data/variable.h>
27 #include <src/data/value.h>
28 #include <gl/xalloc.h>
31 Find the least-squares estimate of b for the linear model:
35 where Y is an n-by-1 column vector, X is an n-by-p matrix of
36 independent variables, b is a p-by-1 vector of regression coefficients,
37 and Z is an n-by-1 normally-distributed random vector with independent
38 identically distributed components with mean 0.
40 This estimate is found via the sweep operator or singular-value
41 decomposition with gsl.
46 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
47 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
49 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
52 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
53 Springer. 1998. ISBN 0-387-98542-5.
57 const struct variable **
58 linreg_get_vars (const linreg *c)
64 Allocate a linreg and return a pointer to it. n is the number of
65 cases, p is the number of independent variables.
68 linreg_alloc (const struct variable *depvar, const struct variable **indep_vars,
74 c = xmalloc (sizeof (*c));
76 c->indep_vars = xnmalloc (p, sizeof (*indep_vars));
77 for (i = 0; i < p; i++)
79 c->indep_vars[i] = indep_vars[i];
81 c->indep_means = gsl_vector_alloc (p);
82 c->indep_std = gsl_vector_alloc (p);
84 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
90 c->coeff = xnmalloc (p, sizeof (*c->coeff));
91 c->cov = gsl_matrix_calloc (c->n_coeffs + 1, c->n_coeffs + 1);
94 c->dfe = c->dft - c->dfm;
101 c->method = LINREG_SWEEP;
109 linreg_free (void *m)
114 gsl_vector_free (c->indep_means);
115 gsl_vector_free (c->indep_std);
116 gsl_matrix_free (c->cov);
117 free (c->indep_vars);
125 post_sweep_computations (linreg *l, gsl_matrix *sw)
129 gsl_matrix_view xmxtx;
139 l->sse = gsl_matrix_get (sw, l->n_indeps, l->n_indeps);
140 l->mse = l->sse / l->dfe;
145 for (i = 0; i < l->n_indeps; i++)
147 tmp = gsl_matrix_get (sw, i, l->n_indeps);
149 m -= tmp * linreg_get_indep_variable_mean (l, i);
152 Get the covariance matrix of the parameter estimates.
153 Only the upper triangle is necessary.
157 The loops below do not compute the entries related
158 to the estimated intercept.
160 for (i = 0; i < l->n_indeps; i++)
161 for (j = i; j < l->n_indeps; j++)
163 tmp = -1.0 * l->mse * gsl_matrix_get (sw, i, j);
164 gsl_matrix_set (l->cov, i + 1, j + 1, tmp);
167 Get the covariances related to the intercept.
169 xtx = gsl_matrix_submatrix (sw, 0, 0, l->n_indeps, l->n_indeps);
170 xmxtx = gsl_matrix_submatrix (l->cov, 0, 1, 1, l->n_indeps);
171 xm = gsl_matrix_calloc (1, l->n_indeps);
172 for (i = 0; i < xm->size2; i++)
174 gsl_matrix_set (xm, 0, i,
175 linreg_get_indep_variable_mean (l, i));
177 rc = gsl_blas_dsymm (CblasRight, CblasUpper, l->mse,
178 &xtx.matrix, xm, 0.0, &xmxtx.matrix);
179 gsl_matrix_free (xm);
180 if (rc == GSL_SUCCESS)
182 tmp = l->mse / l->n_obs;
183 for (i = 1; i < 1 + l->n_indeps; i++)
185 tmp -= gsl_matrix_get (l->cov, 0, i)
186 * linreg_get_indep_variable_mean (l, i - 1);
188 gsl_matrix_set (l->cov, 0, 0, tmp);
194 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
195 __FILE__, __LINE__, gsl_strerror (rc));
201 Predict the value of the dependent variable with the new set of
202 predictors. VALS are assumed to be in the order corresponding to the
203 order of the coefficients in the linreg struct.
206 linreg_predict (const linreg *c, const double *vals, size_t n_vals)
211 assert (n_vals = c->n_coeffs);
212 if (vals == NULL || c == NULL)
216 if (c->coeff == NULL)
218 /* The stupid model: just guess the mean. */
219 return c->depvar_mean;
221 result = c->intercept;
223 for (j = 0; j < n_vals; j++)
225 result += linreg_coeff (c, j) * vals[j];
232 linreg_residual (const linreg *c, double obs, const double *vals, size_t n_vals)
234 if (vals == NULL || c == NULL)
238 return (obs - linreg_predict (c, vals, n_vals));
241 double linreg_get_indep_variable_sd (linreg *c, size_t j)
244 return gsl_vector_get (c->indep_std, j);
247 void linreg_set_indep_variable_sd (linreg *c, size_t j, double s)
250 gsl_vector_set (c->indep_std, j, s);
254 Mean of the independent variable.
256 double linreg_get_indep_variable_mean (linreg *c, size_t j)
259 return gsl_vector_get (c->indep_means, j);
262 void linreg_set_indep_variable_mean (linreg *c, size_t j, double m)
265 gsl_vector_set (c->indep_means, j, m);
269 linreg_fit_qr (const gsl_matrix *cov, linreg *l)
271 double intcpt_coef = 0.0;
272 double intercept_variance = 0.0;
283 xtx = gsl_matrix_alloc (cov->size1 - 1, cov->size2 - 1);
284 xty = gsl_vector_alloc (cov->size1 - 1);
285 tau = gsl_vector_alloc (cov->size1 - 1);
286 params = gsl_vector_alloc (cov->size1 - 1);
288 for (i = 0; i < xtx->size1; i++)
290 gsl_vector_set (xty, i, gsl_matrix_get (cov, cov->size2 - 1, i));
291 for (j = 0; j < xtx->size2; j++)
293 gsl_matrix_set (xtx, i, j, gsl_matrix_get (cov, i, j));
296 gsl_linalg_QR_decomp (xtx, tau);
297 q = gsl_matrix_alloc (xtx->size1, xtx->size2);
298 r = gsl_matrix_alloc (xtx->size1, xtx->size2);
300 gsl_linalg_QR_unpack (xtx, tau, q, r);
301 gsl_linalg_QR_solve (xtx, tau, xty, params);
302 for (i = 0; i < params->size; i++)
304 l->coeff[i] = gsl_vector_get (params, i);
306 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
308 for (i = 0; i < l->n_indeps; i++)
310 l->ssm += gsl_vector_get (xty, i) * l->coeff[i];
312 l->sse = l->sst - l->ssm;
314 gsl_blas_dtrsm (CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, linreg_mse (l),
316 /* Copy the lower triangle into the upper triangle. */
317 for (i = 0; i < q->size1; i++)
319 gsl_matrix_set (l->cov, i + 1, i + 1, gsl_matrix_get (q, i, i));
320 for (j = i + 1; j < q->size2; j++)
322 intercept_variance -= 2.0 * gsl_matrix_get (q, i, j) *
323 linreg_get_indep_variable_mean (l, i) *
324 linreg_get_indep_variable_mean (l, j);
325 gsl_matrix_set (q, i, j, gsl_matrix_get (q, j, i));
328 l->intercept = linreg_get_depvar_mean (l);
330 for (i = 0; i < l->n_indeps; i++)
332 tmp = linreg_get_indep_variable_mean (l, i);
333 l->intercept -= l->coeff[i] * tmp;
334 intercept_variance += tmp * tmp * gsl_matrix_get (q, i, i);
337 /* Covariances related to the intercept. */
338 intercept_variance += linreg_mse (l) / linreg_n_obs (l);
339 gsl_matrix_set (l->cov, 0, 0, intercept_variance);
340 for (i = 0; i < q->size1; i++)
342 for (j = 0; j < q->size2; j++)
344 intcpt_coef -= gsl_matrix_get (q, i, j)
345 * linreg_get_indep_variable_mean (l, j);
347 gsl_matrix_set (l->cov, 0, i + 1, intcpt_coef);
348 gsl_matrix_set (l->cov, i + 1, 0, intcpt_coef);
354 gsl_vector_free (xty);
355 gsl_vector_free (tau);
356 gsl_matrix_free (xtx);
357 gsl_vector_free (params);
361 Estimate the model parameters from the covariance matrix. This
362 function assumes the covariance entries corresponding to the
363 dependent variable are in the final row and column of the covariance
367 linreg_fit (const gsl_matrix *cov, linreg *l)
370 assert (cov != NULL);
372 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
373 if (l->method == LINREG_SWEEP)
376 params = gsl_matrix_calloc (cov->size1, cov->size2);
377 gsl_matrix_memcpy (params, cov);
379 post_sweep_computations (l, params);
380 gsl_matrix_free (params);
382 else if (l->method == LINREG_QR)
384 linreg_fit_qr (cov, l);
388 double linreg_mse (const linreg *c)
391 return (c->sse / c->dfe);
394 double linreg_intercept (const linreg *c)
400 linreg_cov (const linreg *c)
406 linreg_coeff (const linreg *c, size_t i)
408 return (c->coeff[i]);
411 const struct variable *
412 linreg_indep_var (const linreg *c, size_t i)
414 return (c->indep_vars[i]);
418 linreg_n_coeffs (const linreg *c)
424 linreg_n_obs (const linreg *c)
430 linreg_sse (const linreg *c)
436 linreg_ssreg (const linreg *c)
438 return (c->sst - c->sse);
441 double linreg_sst (const linreg *c)
447 linreg_dfmodel ( const linreg *c)
453 linreg_set_depvar_mean (linreg *c, double x)
459 linreg_get_depvar_mean (linreg *c)
461 return c->depvar_mean;