1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include <gsl/gsl_blas.h>
19 #include <gsl/gsl_cblas.h>
20 #include <gsl/gsl_errno.h>
21 #include <gsl/gsl_fit.h>
22 #include <gsl/gsl_linalg.h>
23 #include <gsl/gsl_multifit.h>
24 #include <linreg/sweep.h>
25 #include <math/linreg.h>
26 #include <src/data/variable.h>
27 #include <src/data/value.h>
28 #include <gl/xalloc.h>
31 Find the least-squares estimate of b for the linear model:
35 where Y is an n-by-1 column vector, X is an n-by-p matrix of
36 independent variables, b is a p-by-1 vector of regression coefficients,
37 and Z is an n-by-1 normally-distributed random vector with independent
38 identically distributed components with mean 0.
40 This estimate is found via the sweep operator or singular-value
41 decomposition with gsl.
46 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
47 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
49 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
52 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
53 Springer. 1998. ISBN 0-387-98542-5.
57 const struct variable **
58 linreg_get_vars (const linreg *c)
64 Allocate a linreg and return a pointer to it. n is the number of
65 cases, p is the number of independent variables.
68 linreg_alloc (const struct variable *depvar, const struct variable **indep_vars,
74 c = xmalloc (sizeof (*c));
76 c->indep_vars = xnmalloc (p, sizeof (*indep_vars));
77 for (i = 0; i < p; i++)
79 c->indep_vars[i] = indep_vars[i];
81 c->indep_means = gsl_vector_alloc (p);
82 c->indep_std = gsl_vector_alloc (p);
84 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
90 c->coeff = xnmalloc (p, sizeof (*c->coeff));
91 c->cov = gsl_matrix_calloc (c->n_coeffs + 1, c->n_coeffs + 1);
94 c->dfe = c->dft - c->dfm;
101 c->method = LINREG_SWEEP;
109 linreg_free (void *m)
114 gsl_vector_free (c->indep_means);
115 gsl_vector_free (c->indep_std);
116 gsl_matrix_free (c->cov);
117 free (c->indep_vars);
125 post_sweep_computations (linreg *l, gsl_matrix *sw)
129 gsl_matrix_view xmxtx;
139 l->sse = gsl_matrix_get (sw, l->n_indeps, l->n_indeps);
140 l->mse = l->sse / l->dfe;
145 for (i = 0; i < l->n_indeps; i++)
147 tmp = gsl_matrix_get (sw, i, l->n_indeps);
149 m -= tmp * linreg_get_indep_variable_mean (l, i);
152 Get the covariance matrix of the parameter estimates.
153 Only the upper triangle is necessary.
157 The loops below do not compute the entries related
158 to the estimated intercept.
160 for (i = 0; i < l->n_indeps; i++)
161 for (j = i; j < l->n_indeps; j++)
163 tmp = -1.0 * l->mse * gsl_matrix_get (sw, i, j);
164 gsl_matrix_set (l->cov, i + 1, j + 1, tmp);
167 Get the covariances related to the intercept.
169 xtx = gsl_matrix_submatrix (sw, 0, 0, l->n_indeps, l->n_indeps);
170 xmxtx = gsl_matrix_submatrix (l->cov, 0, 1, 1, l->n_indeps);
171 xm = gsl_matrix_calloc (1, l->n_indeps);
172 for (i = 0; i < xm->size2; i++)
174 gsl_matrix_set (xm, 0, i,
175 linreg_get_indep_variable_mean (l, i));
177 rc = gsl_blas_dsymm (CblasRight, CblasUpper, l->mse,
178 &xtx.matrix, xm, 0.0, &xmxtx.matrix);
179 gsl_matrix_free (xm);
180 if (rc == GSL_SUCCESS)
182 tmp = l->mse / l->n_obs;
183 for (i = 1; i < 1 + l->n_indeps; i++)
185 tmp -= gsl_matrix_get (l->cov, 0, i)
186 * linreg_get_indep_variable_mean (l, i - 1);
188 gsl_matrix_set (l->cov, 0, 0, tmp);
194 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
195 __FILE__, __LINE__, gsl_strerror (rc));
201 Predict the value of the dependent variable with the new set of
202 predictors. VALS are assumed to be in the order corresponding to the
203 order of the coefficients in the linreg struct.
206 linreg_predict (const linreg *c, const double *vals, size_t n_vals)
211 assert (n_vals = c->n_coeffs);
212 if (vals == NULL || c == NULL)
216 if (c->coeff == NULL)
218 /* The stupid model: just guess the mean. */
219 return c->depvar_mean;
221 result = c->intercept;
223 for (j = 0; j < n_vals; j++)
225 result += linreg_coeff (c, j) * vals[j];
232 linreg_residual (const linreg *c, double obs, const double *vals, size_t n_vals)
234 if (vals == NULL || c == NULL)
238 return (obs - linreg_predict (c, vals, n_vals));
241 double linreg_get_indep_variable_sd (linreg *c, size_t j)
244 return gsl_vector_get (c->indep_std, j);
247 void linreg_set_indep_variable_sd (linreg *c, size_t j, double s)
250 gsl_vector_set (c->indep_std, j, s);
254 Mean of the independent variable.
256 double linreg_get_indep_variable_mean (linreg *c, size_t j)
259 return gsl_vector_get (c->indep_means, j);
262 void linreg_set_indep_variable_mean (linreg *c, size_t j, double m)
265 gsl_vector_set (c->indep_means, j, m);
267 static void invert_r (gsl_matrix *r, gsl_matrix *r_inv)
275 for (i = 0; i < r->size1; i++)
277 gsl_matrix_set (r_inv, i, i, 1.0 / gsl_matrix_get (r, i, i));
279 for (i = 0; i < r->size1; i++)
282 for (j = row + 1 + i; j < r->size2; j++)
285 for (k = 1; k <= j - row; k++)
287 tmp += gsl_matrix_get (r, row, row + k)
288 * gsl_matrix_get (r_inv, row + k, j);
290 gsl_matrix_set (r_inv, row, j, -tmp / gsl_matrix_get (r, row, row));
297 linreg_fit_qr (const gsl_matrix *cov, linreg *l)
309 xtx = gsl_matrix_alloc (cov->size1 - 1, cov->size2 - 1);
310 xty = gsl_vector_alloc (cov->size1 - 1);
311 tau = gsl_vector_alloc (cov->size1 - 1);
312 params = gsl_vector_alloc (cov->size1 - 1);
314 for (i = 0; i < xtx->size1; i++)
316 gsl_vector_set (xty, i, gsl_matrix_get (cov, cov->size2 - 1, i));
317 for (j = 0; j < xtx->size2; j++)
319 gsl_matrix_set (xtx, i, j, gsl_matrix_get (cov, i, j));
322 gsl_linalg_QR_decomp (xtx, tau);
323 q = gsl_matrix_alloc (xtx->size1, xtx->size2);
324 r = gsl_matrix_alloc (xtx->size1, xtx->size2);
326 gsl_linalg_QR_unpack (xtx, tau, q, r);
327 gsl_linalg_QR_solve (xtx, tau, xty, params);
328 for (i = 0; i < params->size; i++)
330 l->coeff[i] = gsl_vector_get (params, i);
332 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
334 for (i = 0; i < l->n_indeps; i++)
336 l->ssm += gsl_vector_get (xty, i) * l->coeff[i];
338 l->sse = l->sst - l->ssm;
340 gsl_blas_dtrsm (CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, linreg_mse (l),
342 /* Copy the lower triangle into the upper triangle. */
343 double intercept_variance = 0.0;
344 for (i = 0; i < q->size1; i++)
346 gsl_matrix_set (l->cov, i + 1, i + 1, gsl_matrix_get (q, i, i));
347 for (j = i + 1; j < q->size2; j++)
349 intercept_variance -= 2.0 * gsl_matrix_get (q, i, j) *
350 linreg_get_indep_variable_mean (l, i) *
351 linreg_get_indep_variable_mean (l, j);
352 gsl_matrix_set (q, i, j, gsl_matrix_get (q, j, i));
355 l->intercept = linreg_get_depvar_mean (l);
357 for (i = 0; i < l->n_indeps; i++)
359 tmp = linreg_get_indep_variable_mean (l, i);
360 l->intercept -= l->coeff[i] * tmp;
361 intercept_variance += tmp * tmp * gsl_matrix_get (q, i, i);
364 /* Covariances related to the intercept. */
365 intercept_variance += linreg_mse (l) / linreg_n_obs (l);
366 gsl_matrix_set (l->cov, 0, 0, intercept_variance);
367 double intcpt_coef = 0.0;
368 for (i = 0; i < q->size1; i++)
370 for (j = 0; j < q->size2; j++)
372 intcpt_coef -= gsl_matrix_get (q, i, j)
373 * linreg_get_indep_variable_mean (l, j);
375 gsl_matrix_set (l->cov, 0, i + 1, intcpt_coef);
376 gsl_matrix_set (l->cov, i + 1, 0, intcpt_coef);
382 gsl_vector_free (xty);
383 gsl_vector_free (tau);
384 gsl_matrix_free (xtx);
385 gsl_vector_free (params);
389 Estimate the model parameters from the covariance matrix. This
390 function assumes the covariance entries corresponding to the
391 dependent variable are in the final row and column of the covariance
395 linreg_fit (const gsl_matrix *cov, linreg *l)
398 assert (cov != NULL);
400 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
401 if (l->method == LINREG_SWEEP)
404 params = gsl_matrix_calloc (cov->size1, cov->size2);
405 gsl_matrix_memcpy (params, cov);
407 post_sweep_computations (l, params);
408 gsl_matrix_free (params);
410 else if (l->method == LINREG_QR)
412 linreg_fit_qr (cov, l);
416 double linreg_mse (const linreg *c)
419 return (c->sse / c->dfe);
422 double linreg_intercept (const linreg *c)
428 linreg_cov (const linreg *c)
434 linreg_coeff (const linreg *c, size_t i)
436 return (c->coeff[i]);
439 const struct variable *
440 linreg_indep_var (const linreg *c, size_t i)
442 return (c->indep_vars[i]);
446 linreg_n_coeffs (const linreg *c)
452 linreg_n_obs (const linreg *c)
458 linreg_sse (const linreg *c)
464 linreg_ssreg (const linreg *c)
466 return (c->sst - c->sse);
469 double linreg_sst (const linreg *c)
475 linreg_dfmodel ( const linreg *c)
481 linreg_set_depvar_mean (linreg *c, double x)
487 linreg_get_depvar_mean (linreg *c)
489 return c->depvar_mean;