1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include <gsl/gsl_blas.h>
19 #include <gsl/gsl_cblas.h>
20 #include <gsl/gsl_errno.h>
21 #include <gsl/gsl_fit.h>
22 #include <gsl/gsl_linalg.h>
23 #include <gsl/gsl_multifit.h>
24 #include <linreg/sweep.h>
25 #include <math/linreg.h>
26 #include <src/data/category.h>
27 #include <src/data/variable.h>
28 #include <src/data/value.h>
29 #include <gl/xalloc.h>
32 Find the least-squares estimate of b for the linear model:
36 where Y is an n-by-1 column vector, X is an n-by-p matrix of
37 independent variables, b is a p-by-1 vector of regression coefficients,
38 and Z is an n-by-1 normally-distributed random vector with independent
39 identically distributed components with mean 0.
41 This estimate is found via the sweep operator or singular-value
42 decomposition with gsl.
47 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
48 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
50 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
53 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
54 Springer. 1998. ISBN 0-387-98542-5.
58 const struct variable **
59 linreg_get_vars (const linreg *c)
65 Allocate a linreg and return a pointer to it. n is the number of
66 cases, p is the number of independent variables.
69 linreg_alloc (const struct variable *depvar, const struct variable **indep_vars,
75 c = xmalloc (sizeof (*c));
77 c->indep_vars = xnmalloc (p, sizeof (*indep_vars));
78 for (i = 0; i < p; i++)
80 c->indep_vars[i] = indep_vars[i];
82 c->indep_means = gsl_vector_alloc (p);
83 c->indep_std = gsl_vector_alloc (p);
85 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
91 c->coeff = xnmalloc (p, sizeof (*c->coeff));
92 c->cov = gsl_matrix_calloc (c->n_coeffs + 1, c->n_coeffs + 1);
95 c->dfe = c->dft - c->dfm;
102 c->method = LINREG_SWEEP;
110 linreg_free (void *m)
115 gsl_vector_free (c->indep_means);
116 gsl_vector_free (c->indep_std);
117 gsl_matrix_free (c->cov);
118 free (c->indep_vars);
126 post_sweep_computations (linreg *l, gsl_matrix *sw)
130 gsl_matrix_view xmxtx;
140 l->sse = gsl_matrix_get (sw, l->n_indeps, l->n_indeps);
141 l->mse = l->sse / l->dfe;
146 for (i = 0; i < l->n_indeps; i++)
148 tmp = gsl_matrix_get (sw, i, l->n_indeps);
150 m -= tmp * linreg_get_indep_variable_mean (l, i);
153 Get the covariance matrix of the parameter estimates.
154 Only the upper triangle is necessary.
158 The loops below do not compute the entries related
159 to the estimated intercept.
161 for (i = 0; i < l->n_indeps; i++)
162 for (j = i; j < l->n_indeps; j++)
164 tmp = -1.0 * l->mse * gsl_matrix_get (sw, i, j);
165 gsl_matrix_set (l->cov, i + 1, j + 1, tmp);
168 Get the covariances related to the intercept.
170 xtx = gsl_matrix_submatrix (sw, 0, 0, l->n_indeps, l->n_indeps);
171 xmxtx = gsl_matrix_submatrix (l->cov, 0, 1, 1, l->n_indeps);
172 xm = gsl_matrix_calloc (1, l->n_indeps);
173 for (i = 0; i < xm->size2; i++)
175 gsl_matrix_set (xm, 0, i,
176 linreg_get_indep_variable_mean (l, i));
178 rc = gsl_blas_dsymm (CblasRight, CblasUpper, l->mse,
179 &xtx.matrix, xm, 0.0, &xmxtx.matrix);
180 gsl_matrix_free (xm);
181 if (rc == GSL_SUCCESS)
183 tmp = l->mse / l->n_obs;
184 for (i = 1; i < 1 + l->n_indeps; i++)
186 tmp -= gsl_matrix_get (l->cov, 0, i)
187 * linreg_get_indep_variable_mean (l, i - 1);
189 gsl_matrix_set (l->cov, 0, 0, tmp);
195 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
196 __FILE__, __LINE__, gsl_strerror (rc));
202 Predict the value of the dependent variable with the new set of
203 predictors. VALS are assumed to be in the order corresponding to the
204 order of the coefficients in the linreg struct.
207 linreg_predict (const linreg *c, const double *vals, size_t n_vals)
212 assert (n_vals = c->n_coeffs);
213 if (vals == NULL || c == NULL)
217 if (c->coeff == NULL)
219 /* The stupid model: just guess the mean. */
220 return c->depvar_mean;
222 result = c->intercept;
224 for (j = 0; j < n_vals; j++)
226 result += linreg_coeff (c, j) * vals[j];
233 linreg_residual (const linreg *c, double obs, const double *vals, size_t n_vals)
235 if (vals == NULL || c == NULL)
239 return (obs - linreg_predict (c, vals, n_vals));
242 double linreg_get_indep_variable_sd (linreg *c, size_t j)
245 return gsl_vector_get (c->indep_std, j);
248 void linreg_set_indep_variable_sd (linreg *c, size_t j, double s)
251 gsl_vector_set (c->indep_std, j, s);
255 Mean of the independent variable.
257 double linreg_get_indep_variable_mean (linreg *c, size_t j)
260 return gsl_vector_get (c->indep_means, j);
263 void linreg_set_indep_variable_mean (linreg *c, size_t j, double m)
266 gsl_vector_set (c->indep_means, j, m);
268 static void invert_r (gsl_matrix *r, gsl_matrix *r_inv)
276 for (i = 0; i < r->size1; i++)
278 gsl_matrix_set (r_inv, i, i, 1.0 / gsl_matrix_get (r, i, i));
280 for (i = 0; i < r->size1; i++)
283 for (j = row + 1 + i; j < r->size2; j++)
286 for (k = 1; k <= j - row; k++)
288 tmp += gsl_matrix_get (r, row, row + k)
289 * gsl_matrix_get (r_inv, row + k, j);
291 gsl_matrix_set (r_inv, row, j, -tmp / gsl_matrix_get (r, row, row));
298 linreg_fit_qr (const gsl_matrix *cov, linreg *l)
310 xtx = gsl_matrix_alloc (cov->size1 - 1, cov->size2 - 1);
311 xty = gsl_vector_alloc (cov->size1 - 1);
312 tau = gsl_vector_alloc (cov->size1 - 1);
313 params = gsl_vector_alloc (cov->size1 - 1);
315 for (i = 0; i < xtx->size1; i++)
317 gsl_vector_set (xty, i, gsl_matrix_get (cov, cov->size2 - 1, i));
318 for (j = 0; j < xtx->size2; j++)
320 gsl_matrix_set (xtx, i, j, gsl_matrix_get (cov, i, j));
323 gsl_linalg_QR_decomp (xtx, tau);
324 q = gsl_matrix_alloc (xtx->size1, xtx->size2);
325 r = gsl_matrix_alloc (xtx->size1, xtx->size2);
327 gsl_linalg_QR_unpack (xtx, tau, q, r);
328 gsl_linalg_QR_solve (xtx, tau, xty, params);
329 for (i = 0; i < params->size; i++)
331 l->coeff[i] = gsl_vector_get (params, i);
333 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
335 for (i = 0; i < l->n_indeps; i++)
337 l->ssm += gsl_vector_get (xty, i) * l->coeff[i];
339 l->sse = l->sst - l->ssm;
341 gsl_blas_dtrsm (CblasLeft, CblasLower, CblasNoTrans, CblasNonUnit, linreg_mse (l),
343 /* Copy the lower triangle into the upper triangle. */
344 double intercept_variance = 0.0;
345 for (i = 0; i < q->size1; i++)
347 gsl_matrix_set (l->cov, i + 1, i + 1, gsl_matrix_get (q, i, i));
348 for (j = i + 1; j < q->size2; j++)
350 intercept_variance -= 2.0 * gsl_matrix_get (q, i, j) *
351 linreg_get_indep_variable_mean (l, i) *
352 linreg_get_indep_variable_mean (l, j);
353 gsl_matrix_set (q, i, j, gsl_matrix_get (q, j, i));
356 l->intercept = linreg_get_depvar_mean (l);
358 for (i = 0; i < l->n_indeps; i++)
360 tmp = linreg_get_indep_variable_mean (l, i);
361 l->intercept -= l->coeff[i] * tmp;
362 intercept_variance += tmp * tmp * gsl_matrix_get (q, i, i);
365 /* Covariances related to the intercept. */
366 intercept_variance += linreg_mse (l) / linreg_n_obs (l);
367 gsl_matrix_set (l->cov, 0, 0, intercept_variance);
368 double intcpt_coef = 0.0;
369 for (i = 0; i < q->size1; i++)
371 for (j = 0; j < q->size2; j++)
373 intcpt_coef -= gsl_matrix_get (q, i, j)
374 * linreg_get_indep_variable_mean (l, j);
376 gsl_matrix_set (l->cov, 0, i + 1, intcpt_coef);
377 gsl_matrix_set (l->cov, i + 1, 0, intcpt_coef);
383 gsl_vector_free (xty);
384 gsl_vector_free (tau);
385 gsl_matrix_free (xtx);
386 gsl_vector_free (params);
390 Estimate the model parameters from the covariance matrix. This
391 function assumes the covariance entries corresponding to the
392 dependent variable are in the final row and column of the covariance
396 linreg_fit (const gsl_matrix *cov, linreg *l)
399 assert (cov != NULL);
401 l->sst = gsl_matrix_get (cov, cov->size1 - 1, cov->size2 - 1);
402 if (l->method == LINREG_SWEEP)
405 params = gsl_matrix_calloc (cov->size1, cov->size2);
406 gsl_matrix_memcpy (params, cov);
408 post_sweep_computations (l, params);
409 gsl_matrix_free (params);
411 else if (l->method == LINREG_QR)
413 linreg_fit_qr (cov, l);
417 double linreg_mse (const linreg *c)
420 return (c->sse / c->dfe);
423 double linreg_intercept (const linreg *c)
429 linreg_cov (const linreg *c)
435 linreg_coeff (const linreg *c, size_t i)
437 return (c->coeff[i]);
440 const struct variable *
441 linreg_indep_var (const linreg *c, size_t i)
443 return (c->indep_vars[i]);
447 linreg_n_coeffs (const linreg *c)
453 linreg_n_obs (const linreg *c)
459 linreg_sse (const linreg *c)
465 linreg_ssreg (const linreg *c)
467 return (c->sst - c->sse);
470 double linreg_sst (const linreg *c)
476 linreg_dfmodel ( const linreg *c)
482 linreg_set_depvar_mean (linreg *c, double x)
488 linreg_get_depvar_mean (linreg *c)
490 return c->depvar_mean;