1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include <gsl/gsl_blas.h>
19 #include <gsl/gsl_cblas.h>
20 #include <gsl/gsl_errno.h>
21 #include <gsl/gsl_fit.h>
22 #include <gsl/gsl_multifit.h>
23 #include <linreg/sweep.h>
24 #include <math/coefficient.h>
25 #include <math/linreg.h>
26 #include <math/coefficient.h>
27 #include <math/design-matrix.h>
28 #include <src/data/category.h>
29 #include <src/data/variable.h>
30 #include <src/data/value.h>
31 #include <gl/xalloc.h>
34 Find the least-squares estimate of b for the linear model:
38 where Y is an n-by-1 column vector, X is an n-by-p matrix of
39 independent variables, b is a p-by-1 vector of regression coefficients,
40 and Z is an n-by-1 normally-distributed random vector with independent
41 identically distributed components with mean 0.
43 This estimate is found via the sweep operator or singular-value
44 decomposition with gsl.
49 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
50 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
52 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
55 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
56 Springer. 1998. ISBN 0-387-98542-5.
61 Get the mean and standard deviation of a vector
62 of doubles via a form of the Kalman filter as
63 described on page 32 of [3].
66 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
75 mean = gsl_vector_get (&v.vector, 0);
77 for (i = 1; i < v.vector.size; i++)
80 tmp = gsl_vector_get (&v.vector, i);
83 variance += j * (j - 1.0) * d * d;
86 *sp = sqrt (variance / (j - 1.0));
93 Set V to contain an array of pointers to the variables
94 used in the model. V must be at least C->N_COEFFS in length.
95 The return value is the number of distinct variables found.
98 pspp_linreg_get_vars (const void *c_, const struct variable **v)
100 const pspp_linreg_cache *c = c_;
101 const struct variable *tmp;
107 Make sure the caller doesn't try to sneak a variable
108 into V that is not in the model.
110 for (i = 0; i < c->n_coeffs; i++)
114 for (j = 0; j < c->n_coeffs; j++)
116 tmp = pspp_coeff_get_var (c->coeff[j], 0);
117 assert (tmp != NULL);
118 /* Repeated variables are likely to bunch together, at the end
121 while (i >= 0 && v[i] != tmp)
125 if (i < 0 && result < c->n_coeffs)
135 Allocate a pspp_linreg_cache and return a pointer
136 to it. n is the number of cases, p is the number of
137 independent variables.
140 pspp_linreg_cache_alloc (const struct variable *depvar, const struct variable **indep_vars,
144 pspp_linreg_cache *c;
146 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
148 c->indep_vars = indep_vars;
149 c->indep_means = gsl_vector_alloc (p);
150 c->indep_std = gsl_vector_alloc (p);
151 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
152 independent variables.
154 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
160 for (i = 0; i < p; i++)
162 if (var_is_numeric (indep_vars[i]))
168 c->n_coeffs += cat_get_n_categories (indep_vars[i]) - 1;
172 c->cov = gsl_matrix_alloc (c->n_coeffs + 1, c->n_coeffs + 1);
176 c->method = PSPP_LINREG_SWEEP;
177 c->predict = pspp_linreg_predict;
178 c->residual = pspp_linreg_residual; /* The procedure to compute my
180 c->get_vars = pspp_linreg_get_vars; /* The procedure that returns
183 c->resid = NULL; /* The variable storing my residuals. */
184 c->pred = NULL; /* The variable storing my predicted values. */
190 pspp_linreg_cache_free (void *m)
194 pspp_linreg_cache *c = m;
197 gsl_vector_free (c->indep_means);
198 gsl_vector_free (c->indep_std);
199 gsl_vector_free (c->ss_indeps);
200 gsl_matrix_free (c->cov);
201 gsl_vector_free (c->ssx);
202 for (i = 0; i < c->n_coeffs; i++)
204 pspp_coeff_free (c->coeff[i]);
212 cache_init (pspp_linreg_cache *cache)
214 assert (cache != NULL);
215 cache->dft = cache->n_obs - 1;
216 cache->dfm = cache->n_indeps;
217 cache->dfe = cache->dft - cache->dfm;
218 cache->intercept = 0.0;
222 post_sweep_computations (pspp_linreg_cache *cache, const struct design_matrix *dm,
227 gsl_matrix_view xmxtx;
235 assert (cache != NULL);
237 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
238 cache->mse = cache->sse / cache->dfe;
242 m = cache->depvar_mean;
243 for (i = 0; i < cache->n_indeps; i++)
245 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
246 cache->coeff[i]->estimate = tmp;
247 m -= tmp * pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i));
250 Get the covariance matrix of the parameter estimates.
251 Only the upper triangle is necessary.
255 The loops below do not compute the entries related
256 to the estimated intercept.
258 for (i = 0; i < cache->n_indeps; i++)
259 for (j = i; j < cache->n_indeps; j++)
261 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
262 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
265 Get the covariances related to the intercept.
267 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
268 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
269 xm = gsl_matrix_calloc (1, cache->n_indeps);
270 for (i = 0; i < xm->size2; i++)
272 gsl_matrix_set (xm, 0, i,
273 pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i)));
275 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
276 &xtx.matrix, xm, 0.0, &xmxtx.matrix);
277 gsl_matrix_free (xm);
278 if (rc == GSL_SUCCESS)
280 tmp = cache->mse / cache->n_obs;
281 for (i = 1; i < 1 + cache->n_indeps; i++)
283 tmp -= gsl_matrix_get (cache->cov, 0, i)
284 * pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i - 1));
286 gsl_matrix_set (cache->cov, 0, 0, tmp);
288 cache->intercept = m;
292 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
293 __FILE__, __LINE__, gsl_strerror (rc));
299 Fit the linear model via least squares. All pointers passed to pspp_linreg
300 are assumed to be allocated to the correct size and initialized to the
301 values as indicated by opts.
304 pspp_linreg (const gsl_vector * Y, const struct design_matrix *dm,
305 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
308 gsl_matrix *design = NULL;
313 gsl_vector *param_estimates;
314 struct pspp_coeff *coef;
315 const struct variable *v;
316 const union value *val;
329 if (opts->get_depvar_mean_std)
331 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
333 cache->depvar_mean = m;
334 cache->depvar_std = s;
338 cache->n_coeffs = dm->m->size2;
339 for (i = 0; i < dm->m->size2; i++)
341 if (opts->get_indep_mean_std[i])
343 linreg_mean_std (gsl_matrix_const_column (dm->m, i), &m, &s, &ss);
344 v = design_matrix_col_to_var (dm, i);
346 if (var_is_alpha (v))
348 j = i - design_matrix_var_to_column (dm, v);
349 val = cat_subscript_to_value (j, v);
351 coef = pspp_linreg_get_coeff (cache, v, val);
352 pspp_coeff_set_mean (coef, m);
353 pspp_coeff_set_sd (coef, s);
354 gsl_vector_set (cache->ssx, i, ss);
359 if (cache->method == PSPP_LINREG_SWEEP)
363 Subtract the means to improve the condition of the design
364 matrix. This requires copying dm->m and Y. We do not divide by the
365 standard deviations of the independent variables here since doing
366 so would cause a miscalculation of the residual sums of
367 squares. Dividing by the standard deviation is done GSL's linear
368 regression functions, so if the design matrix has a poor
369 condition, use QR decomposition.
371 The design matrix here does not include a column for the intercept
372 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
373 so design is allocated here when sweeping, or below if using QR.
375 design = gsl_matrix_alloc (dm->m->size1, dm->m->size2);
376 for (i = 0; i < dm->m->size2; i++)
378 v = design_matrix_col_to_var (dm, i);
379 m = pspp_linreg_get_indep_variable_mean (cache, v);
380 for (j = 0; j < dm->m->size1; j++)
382 tmp = (gsl_matrix_get (dm->m, j, i) - m);
383 gsl_matrix_set (design, j, i, tmp);
386 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
387 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
389 for (i = 0; i < xtx.matrix.size1; i++)
391 tmp = gsl_vector_get (cache->ssx, i);
392 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
393 xi = gsl_matrix_column (design, i);
394 for (j = (i + 1); j < xtx.matrix.size2; j++)
396 xj = gsl_matrix_column (design, j);
397 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
398 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
402 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
403 xty = gsl_matrix_column (sw, cache->n_indeps);
405 This loop starts at 1, with i=0 outside the loop, so we can get
406 the model sum of squares due to the first independent variable.
408 xi = gsl_matrix_column (design, 0);
409 gsl_blas_ddot (&(xi.vector), Y, &tmp);
410 gsl_vector_set (&(xty.vector), 0, tmp);
411 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
412 gsl_vector_set (cache->ss_indeps, 0, tmp);
413 for (i = 1; i < cache->n_indeps; i++)
415 xi = gsl_matrix_column (design, i);
416 gsl_blas_ddot (&(xi.vector), Y, &tmp);
417 gsl_vector_set (&(xty.vector), i, tmp);
421 Sweep on the matrix sw, which contains XtX, XtY and YtY.
424 post_sweep_computations (cache, dm, sw);
425 gsl_matrix_free (sw);
427 else if (cache->method == PSPP_LINREG_CONDITIONAL_INVERSE)
430 Use the SVD of X^T X to find a conditional inverse of X^TX. If
431 the SVD is X^T X = U D V^T, then set the conditional inverse
432 to (X^T X)^c = V D^- U^T. D^- is defined as follows: If entry
433 (i, i) has value sigma_i, then entry (i, i) of D^- is 1 /
434 sigma_i if sigma_i > 0, and 0 otherwise. Then solve the normal
435 equations by setting the estimated parameter vector to
441 gsl_multifit_linear_workspace *wk;
443 Use QR decomposition via GSL.
446 param_estimates = gsl_vector_alloc (1 + dm->m->size2);
447 design = gsl_matrix_alloc (dm->m->size1, 1 + dm->m->size2);
449 for (j = 0; j < dm->m->size1; j++)
451 gsl_matrix_set (design, j, 0, 1.0);
452 for (i = 0; i < dm->m->size2; i++)
454 tmp = gsl_matrix_get (dm->m, j, i);
455 gsl_matrix_set (design, j, i + 1, tmp);
459 wk = gsl_multifit_linear_alloc (design->size1, design->size2);
460 rc = gsl_multifit_linear (design, Y, param_estimates,
461 cache->cov, &(cache->sse), wk);
462 for (i = 0; i < cache->n_coeffs; i++)
464 cache->coeff[i]->estimate = gsl_vector_get (param_estimates, i + 1);
466 cache->intercept = gsl_vector_get (param_estimates, 0);
467 if (rc == GSL_SUCCESS)
469 gsl_multifit_linear_free (wk);
470 gsl_vector_free (param_estimates);
474 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
475 __FILE__, __LINE__, rc);
480 cache->ssm = cache->sst - cache->sse;
482 Get the remaining sums of squares for the independent
486 for (i = 1; i < cache->n_indeps; i++)
489 m += gsl_vector_get (cache->ss_indeps, j);
490 tmp = cache->ssm - m;
491 gsl_vector_set (cache->ss_indeps, i, tmp);
494 gsl_matrix_free (design);
499 Is the coefficient COEF contained in the list of coefficients
503 has_coefficient (const struct pspp_coeff **coef_list, const struct pspp_coeff *coef,
510 if (coef_list[i] == coef)
519 Predict the value of the dependent variable with the
520 new set of predictors. PREDICTORS must point to a list
521 of variables, each of whose values are stored in VALS,
525 pspp_linreg_predict (const struct variable **predictors,
526 const union value **vals, const void *c_, int n_vals)
528 const pspp_linreg_cache *c = c_;
530 size_t next_coef = 0;
531 const struct pspp_coeff **coef_list;
532 const struct pspp_coeff *coe;
536 if (predictors == NULL || vals == NULL || c == NULL)
540 if (c->coeff == NULL)
542 /* The stupid model: just guess the mean. */
543 return c->depvar_mean;
545 coef_list = xnmalloc (c->n_coeffs, sizeof (*coef_list));
546 result = c->intercept;
549 The loops guard against the possibility that the caller passed us
550 inadequate information, such as too few or too many values, or
551 a redundant list of variable names.
553 for (j = 0; j < n_vals; j++)
555 coe = pspp_linreg_get_coeff (c, predictors[j], vals[j]);
556 if (!has_coefficient (coef_list, coe, next_coef))
558 tmp = pspp_coeff_get_est (coe);
559 if (var_is_numeric (predictors[j]))
564 coef_list[next_coef++] = coe;
573 pspp_linreg_residual (const struct variable **predictors,
574 const union value **vals,
575 const union value *obs, const void *c, int n_vals)
580 if (predictors == NULL || vals == NULL || c == NULL || obs == NULL)
584 pred = pspp_linreg_predict (predictors, vals, c, n_vals);
586 result = isnan (pred) ? GSL_NAN : (obs->f - pred);
591 Which coefficient is associated with V? The VAL argument is relevant
592 only to categorical variables.
595 pspp_linreg_get_coeff (const pspp_linreg_cache * c,
596 const struct variable *v, const union value *val)
602 if (c->coeff == NULL || c->n_indeps == 0 || v == NULL)
606 return pspp_coeff_var_to_coeff (v, c->coeff, c->n_coeffs, val);
609 Return the standard deviation of the independent variable.
611 double pspp_linreg_get_indep_variable_sd (pspp_linreg_cache *c, const struct variable *v)
613 if (var_is_numeric (v))
615 const struct pspp_coeff *coef;
616 coef = pspp_linreg_get_coeff (c, v, NULL);
617 return pspp_coeff_get_sd (coef);
622 void pspp_linreg_set_indep_variable_sd (pspp_linreg_cache *c, const struct variable *v,
625 if (var_is_numeric (v))
627 struct pspp_coeff *coef;
628 coef = pspp_linreg_get_coeff (c, v, NULL);
629 pspp_coeff_set_sd (coef, s);
634 Mean of the independent variable.
636 double pspp_linreg_get_indep_variable_mean (pspp_linreg_cache *c, const struct variable *v)
638 if (var_is_numeric (v))
640 struct pspp_coeff *coef;
641 coef = pspp_linreg_get_coeff (c, v, NULL);
642 return pspp_coeff_get_mean (coef);
647 void pspp_linreg_set_indep_variable_mean (pspp_linreg_cache *c, const struct variable *v,
650 if (var_is_numeric (v))
652 struct pspp_coeff *coef;
653 coef = pspp_linreg_get_coeff (c, v, NULL);
654 pspp_coeff_set_mean (coef, m);
659 Make sure the dependent variable is at the last column, and that
660 only variables in the model are in the covariance matrix.
662 static struct design_matrix *
663 rearrange_covariance_matrix (const struct covariance_matrix *cm, pspp_linreg_cache *c)
665 const struct variable **model_vars;
666 struct design_matrix *cov;
667 struct design_matrix *result;
675 cov = covariance_to_design (cm);
676 assert (cov != NULL);
678 assert (cov->m->size1 > 0);
679 assert (cov->m->size2 == cov->m->size1);
680 model_vars = xnmalloc (1 + c->n_indeps, sizeof (*model_vars));
683 Put the model variables in the right order in MODEL_VARS.
684 Count the number of coefficients.
686 for (i = 0; i < c->n_indeps; i++)
688 model_vars[i] = c->indep_vars[i];
690 model_vars[i] = c->depvar;
691 result = covariance_matrix_create (1 + c->n_indeps, model_vars);
692 permutation = xnmalloc (design_matrix_get_n_cols (result), sizeof (*permutation));
694 for (j = 0; j < cov->m->size2; j++)
697 while (k < result->m->size2)
699 if (design_matrix_col_to_var (cov, j) == design_matrix_col_to_var (result, k))
706 for (i = 0; i < result->m->size1; i++)
707 for (j = 0; j < result->m->size2; j++)
709 gsl_matrix_set (result->m, i, j, gsl_matrix_get (cov->m, permutation[i], permutation[j]));
716 Estimate the model parameters from the covariance matrix only. This
717 method uses less memory than PSPP_LINREG, which requires the entire
718 data set to be stored in memory.
720 The function assumes FULL_COV may contain columns corresponding to
721 variables that are not in the model. It fixes this in
722 REARRANG_COVARIANCE_MATRIX. This allows the caller to compute a
723 large covariance matrix once before, then pass it to this without
724 having to alter it. The problem is that this means the caller must
728 pspp_linreg_with_cov (const struct covariance_matrix *full_cov,
729 pspp_linreg_cache * cache)
731 struct design_matrix *cov;
733 assert (full_cov != NULL);
734 assert (cache != NULL);
736 cov = rearrange_covariance_matrix (full_cov, cache);
739 post_sweep_computations (cache, cov, cov->m);
740 design_matrix_destroy (cov);
743 double pspp_linreg_mse (const pspp_linreg_cache *c)
746 return (c->sse / c->dfe);