1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include <gsl/gsl_blas.h>
19 #include <gsl/gsl_cblas.h>
20 #include <gsl/gsl_errno.h>
21 #include <gsl/gsl_fit.h>
22 #include <gsl/gsl_multifit.h>
23 #include <linreg/sweep.h>
24 #include <math/coefficient.h>
25 #include <math/linreg.h>
26 #include <math/coefficient.h>
27 #include <math/covariance-matrix.h>
28 #include <math/design-matrix.h>
29 #include <src/data/category.h>
30 #include <src/data/variable.h>
31 #include <src/data/value.h>
32 #include <gl/xalloc.h>
35 Find the least-squares estimate of b for the linear model:
39 where Y is an n-by-1 column vector, X is an n-by-p matrix of
40 independent variables, b is a p-by-1 vector of regression coefficients,
41 and Z is an n-by-1 normally-distributed random vector with independent
42 identically distributed components with mean 0.
44 This estimate is found via the sweep operator or singular-value
45 decomposition with gsl.
50 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
51 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
53 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
56 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
57 Springer. 1998. ISBN 0-387-98542-5.
62 Get the mean and standard deviation of a vector
63 of doubles via a form of the Kalman filter as
64 described on page 32 of [3].
67 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
76 mean = gsl_vector_get (&v.vector, 0);
78 for (i = 1; i < v.vector.size; i++)
81 tmp = gsl_vector_get (&v.vector, i);
84 variance += j * (j - 1.0) * d * d;
87 *sp = sqrt (variance / (j - 1.0));
94 Set V to contain an array of pointers to the variables
95 used in the model. V must be at least C->N_COEFFS in length.
96 The return value is the number of distinct variables found.
99 pspp_linreg_get_vars (const void *c_, const struct variable **v)
101 const pspp_linreg_cache *c = c_;
102 const struct variable *tmp;
108 Make sure the caller doesn't try to sneak a variable
109 into V that is not in the model.
111 for (i = 0; i < c->n_coeffs; i++)
115 for (j = 0; j < c->n_coeffs; j++)
117 tmp = pspp_coeff_get_var (c->coeff[j], 0);
118 assert (tmp != NULL);
119 /* Repeated variables are likely to bunch together, at the end
122 while (i >= 0 && v[i] != tmp)
126 if (i < 0 && result < c->n_coeffs)
136 Allocate a pspp_linreg_cache and return a pointer
137 to it. n is the number of cases, p is the number of
138 independent variables.
141 pspp_linreg_cache_alloc (const struct variable *depvar, const struct variable **indep_vars,
145 pspp_linreg_cache *c;
147 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
149 c->indep_vars = indep_vars;
150 c->indep_means = gsl_vector_alloc (p);
151 c->indep_std = gsl_vector_alloc (p);
152 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
153 independent variables.
155 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
161 for (i = 0; i < p; i++)
163 if (var_is_numeric (indep_vars[i]))
169 c->n_coeffs += cat_get_n_categories (indep_vars[i]) - 1;
173 c->cov = gsl_matrix_alloc (c->n_coeffs + 1, c->n_coeffs + 1);
177 c->method = PSPP_LINREG_SWEEP;
178 c->predict = pspp_linreg_predict;
179 c->residual = pspp_linreg_residual; /* The procedure to compute my
181 c->get_vars = pspp_linreg_get_vars; /* The procedure that returns
184 c->resid = NULL; /* The variable storing my residuals. */
185 c->pred = NULL; /* The variable storing my predicted values. */
191 pspp_linreg_cache_free (void *m)
195 pspp_linreg_cache *c = m;
198 gsl_vector_free (c->indep_means);
199 gsl_vector_free (c->indep_std);
200 gsl_vector_free (c->ss_indeps);
201 gsl_matrix_free (c->cov);
202 gsl_vector_free (c->ssx);
203 for (i = 0; i < c->n_coeffs; i++)
205 pspp_coeff_free (c->coeff[i]);
213 cache_init (pspp_linreg_cache *cache)
215 assert (cache != NULL);
216 cache->dft = cache->n_obs - 1;
217 cache->dfm = cache->n_indeps;
218 cache->dfe = cache->dft - cache->dfm;
219 cache->intercept = 0.0;
223 post_sweep_computations (pspp_linreg_cache *cache, const struct design_matrix *dm,
228 gsl_matrix_view xmxtx;
236 assert (cache != NULL);
238 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
239 cache->mse = cache->sse / cache->dfe;
243 m = cache->depvar_mean;
244 for (i = 0; i < cache->n_indeps; i++)
246 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
247 cache->coeff[i]->estimate = tmp;
248 m -= tmp * pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i));
251 Get the covariance matrix of the parameter estimates.
252 Only the upper triangle is necessary.
256 The loops below do not compute the entries related
257 to the estimated intercept.
259 for (i = 0; i < cache->n_indeps; i++)
260 for (j = i; j < cache->n_indeps; j++)
262 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
263 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
266 Get the covariances related to the intercept.
268 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
269 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
270 xm = gsl_matrix_calloc (1, cache->n_indeps);
271 for (i = 0; i < xm->size2; i++)
273 gsl_matrix_set (xm, 0, i,
274 pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i)));
276 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
277 &xtx.matrix, xm, 0.0, &xmxtx.matrix);
278 gsl_matrix_free (xm);
279 if (rc == GSL_SUCCESS)
281 tmp = cache->mse / cache->n_obs;
282 for (i = 1; i < 1 + cache->n_indeps; i++)
284 tmp -= gsl_matrix_get (cache->cov, 0, i)
285 * pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i - 1));
287 gsl_matrix_set (cache->cov, 0, 0, tmp);
289 cache->intercept = m;
293 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
294 __FILE__, __LINE__, gsl_strerror (rc));
300 Fit the linear model via least squares. All pointers passed to pspp_linreg
301 are assumed to be allocated to the correct size and initialized to the
302 values as indicated by opts.
305 pspp_linreg (const gsl_vector * Y, const struct design_matrix *dm,
306 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
309 gsl_matrix *design = NULL;
314 gsl_vector *param_estimates;
315 struct pspp_coeff *coef;
316 const struct variable *v;
317 const union value *val;
330 if (opts->get_depvar_mean_std)
332 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
334 cache->depvar_mean = m;
335 cache->depvar_std = s;
339 cache->n_coeffs = dm->m->size2;
340 for (i = 0; i < dm->m->size2; i++)
342 if (opts->get_indep_mean_std[i])
344 linreg_mean_std (gsl_matrix_const_column (dm->m, i), &m, &s, &ss);
345 v = design_matrix_col_to_var (dm, i);
347 if (var_is_alpha (v))
349 j = i - design_matrix_var_to_column (dm, v);
350 val = cat_subscript_to_value (j, v);
352 coef = pspp_linreg_get_coeff (cache, v, val);
353 pspp_coeff_set_mean (coef, m);
354 pspp_coeff_set_sd (coef, s);
355 gsl_vector_set (cache->ssx, i, ss);
360 if (cache->method == PSPP_LINREG_SWEEP)
364 Subtract the means to improve the condition of the design
365 matrix. This requires copying dm->m and Y. We do not divide by the
366 standard deviations of the independent variables here since doing
367 so would cause a miscalculation of the residual sums of
368 squares. Dividing by the standard deviation is done GSL's linear
369 regression functions, so if the design matrix has a poor
370 condition, use QR decomposition.
372 The design matrix here does not include a column for the intercept
373 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
374 so design is allocated here when sweeping, or below if using QR.
376 design = gsl_matrix_alloc (dm->m->size1, dm->m->size2);
377 for (i = 0; i < dm->m->size2; i++)
379 v = design_matrix_col_to_var (dm, i);
380 m = pspp_linreg_get_indep_variable_mean (cache, v);
381 for (j = 0; j < dm->m->size1; j++)
383 tmp = (gsl_matrix_get (dm->m, j, i) - m);
384 gsl_matrix_set (design, j, i, tmp);
387 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
388 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
390 for (i = 0; i < xtx.matrix.size1; i++)
392 tmp = gsl_vector_get (cache->ssx, i);
393 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
394 xi = gsl_matrix_column (design, i);
395 for (j = (i + 1); j < xtx.matrix.size2; j++)
397 xj = gsl_matrix_column (design, j);
398 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
399 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
403 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
404 xty = gsl_matrix_column (sw, cache->n_indeps);
406 This loop starts at 1, with i=0 outside the loop, so we can get
407 the model sum of squares due to the first independent variable.
409 xi = gsl_matrix_column (design, 0);
410 gsl_blas_ddot (&(xi.vector), Y, &tmp);
411 gsl_vector_set (&(xty.vector), 0, tmp);
412 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
413 gsl_vector_set (cache->ss_indeps, 0, tmp);
414 for (i = 1; i < cache->n_indeps; i++)
416 xi = gsl_matrix_column (design, i);
417 gsl_blas_ddot (&(xi.vector), Y, &tmp);
418 gsl_vector_set (&(xty.vector), i, tmp);
422 Sweep on the matrix sw, which contains XtX, XtY and YtY.
425 post_sweep_computations (cache, dm, sw);
426 gsl_matrix_free (sw);
428 else if (cache->method == PSPP_LINREG_CONDITIONAL_INVERSE)
431 Use the SVD of X^T X to find a conditional inverse of X^TX. If
432 the SVD is X^T X = U D V^T, then set the conditional inverse
433 to (X^T X)^c = V D^- U^T. D^- is defined as follows: If entry
434 (i, i) has value sigma_i, then entry (i, i) of D^- is 1 /
435 sigma_i if sigma_i > 0, and 0 otherwise. Then solve the normal
436 equations by setting the estimated parameter vector to
442 gsl_multifit_linear_workspace *wk;
444 Use QR decomposition via GSL.
447 param_estimates = gsl_vector_alloc (1 + dm->m->size2);
448 design = gsl_matrix_alloc (dm->m->size1, 1 + dm->m->size2);
450 for (j = 0; j < dm->m->size1; j++)
452 gsl_matrix_set (design, j, 0, 1.0);
453 for (i = 0; i < dm->m->size2; i++)
455 tmp = gsl_matrix_get (dm->m, j, i);
456 gsl_matrix_set (design, j, i + 1, tmp);
460 wk = gsl_multifit_linear_alloc (design->size1, design->size2);
461 rc = gsl_multifit_linear (design, Y, param_estimates,
462 cache->cov, &(cache->sse), wk);
463 for (i = 0; i < cache->n_coeffs; i++)
465 cache->coeff[i]->estimate = gsl_vector_get (param_estimates, i + 1);
467 cache->intercept = gsl_vector_get (param_estimates, 0);
468 if (rc == GSL_SUCCESS)
470 gsl_multifit_linear_free (wk);
471 gsl_vector_free (param_estimates);
475 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
476 __FILE__, __LINE__, rc);
481 cache->ssm = cache->sst - cache->sse;
483 Get the remaining sums of squares for the independent
487 for (i = 1; i < cache->n_indeps; i++)
490 m += gsl_vector_get (cache->ss_indeps, j);
491 tmp = cache->ssm - m;
492 gsl_vector_set (cache->ss_indeps, i, tmp);
495 gsl_matrix_free (design);
500 Is the coefficient COEF contained in the list of coefficients
504 has_coefficient (const struct pspp_coeff **coef_list, const struct pspp_coeff *coef,
511 if (coef_list[i] == coef)
520 Predict the value of the dependent variable with the
521 new set of predictors. PREDICTORS must point to a list
522 of variables, each of whose values are stored in VALS,
526 pspp_linreg_predict (const struct variable **predictors,
527 const union value **vals, const void *c_, int n_vals)
529 const pspp_linreg_cache *c = c_;
531 size_t next_coef = 0;
532 const struct pspp_coeff **coef_list;
533 const struct pspp_coeff *coe;
537 if (predictors == NULL || vals == NULL || c == NULL)
541 if (c->coeff == NULL)
543 /* The stupid model: just guess the mean. */
544 return c->depvar_mean;
546 coef_list = xnmalloc (c->n_coeffs, sizeof (*coef_list));
547 result = c->intercept;
550 The loops guard against the possibility that the caller passed us
551 inadequate information, such as too few or too many values, or
552 a redundant list of variable names.
554 for (j = 0; j < n_vals; j++)
556 coe = pspp_linreg_get_coeff (c, predictors[j], vals[j]);
557 if (!has_coefficient (coef_list, coe, next_coef))
559 tmp = pspp_coeff_get_est (coe);
560 if (var_is_numeric (predictors[j]))
565 coef_list[next_coef++] = coe;
574 pspp_linreg_residual (const struct variable **predictors,
575 const union value **vals,
576 const union value *obs, const void *c, int n_vals)
581 if (predictors == NULL || vals == NULL || c == NULL || obs == NULL)
585 pred = pspp_linreg_predict (predictors, vals, c, n_vals);
587 result = isnan (pred) ? GSL_NAN : (obs->f - pred);
592 Which coefficient is associated with V? The VAL argument is relevant
593 only to categorical variables.
596 pspp_linreg_get_coeff (const pspp_linreg_cache * c,
597 const struct variable *v, const union value *val)
603 if (c->coeff == NULL || c->n_indeps == 0 || v == NULL)
607 return pspp_coeff_var_to_coeff (v, c->coeff, c->n_coeffs, val);
610 Return the standard deviation of the independent variable.
612 double pspp_linreg_get_indep_variable_sd (pspp_linreg_cache *c, const struct variable *v)
614 if (var_is_numeric (v))
616 const struct pspp_coeff *coef;
617 coef = pspp_linreg_get_coeff (c, v, NULL);
618 return pspp_coeff_get_sd (coef);
623 void pspp_linreg_set_indep_variable_sd (pspp_linreg_cache *c, const struct variable *v,
626 if (var_is_numeric (v))
628 struct pspp_coeff *coef;
629 coef = pspp_linreg_get_coeff (c, v, NULL);
630 pspp_coeff_set_sd (coef, s);
635 Mean of the independent variable.
637 double pspp_linreg_get_indep_variable_mean (pspp_linreg_cache *c, const struct variable *v)
639 if (var_is_numeric (v))
641 struct pspp_coeff *coef;
642 coef = pspp_linreg_get_coeff (c, v, NULL);
643 return pspp_coeff_get_mean (coef);
648 void pspp_linreg_set_indep_variable_mean (pspp_linreg_cache *c, const struct variable *v,
651 if (var_is_numeric (v))
653 struct pspp_coeff *coef;
654 coef = pspp_linreg_get_coeff (c, v, NULL);
655 pspp_coeff_set_mean (coef, m);
660 Make sure the dependent variable is at the last column, and that
661 only variables in the model are in the covariance matrix.
663 static struct design_matrix *
664 rearrange_covariance_matrix (const struct design_matrix *cov, pspp_linreg_cache *c)
666 const struct variable **model_vars;
667 struct design_matrix *result;
673 assert (cov != NULL);
675 assert (cov->m->size1 > 0);
676 assert (cov->m->size2 == cov->m->size1);
677 permutation = xnmalloc (1 + c->n_indeps, sizeof (*permutation));
678 model_vars = xnmalloc (1 + c->n_indeps, sizeof (*model_vars));
681 Put the model variables in the right order in MODEL_VARS.
683 for (i = 0; i < c->n_indeps; i++)
685 model_vars[i] = c->indep_vars[i];
687 model_vars[i] = c->depvar;
688 result = covariance_matrix_create (1 + c->n_indeps, model_vars);
689 for (j = 0; j < cov->m->size2; j++)
692 while (k < result->m->size2)
694 if (design_matrix_col_to_var (cov, j) == design_matrix_col_to_var (result, k))
701 for (i = 0; i < result->m->size1; i++)
702 for (j = 0; j < result->m->size2; j++)
704 gsl_matrix_set (result->m, i, j, gsl_matrix_get (cov->m, permutation[i], permutation[j]));
711 Estimate the model parameters from the covariance matrix only. This
712 method uses less memory than PSPP_LINREG, which requires the entire
713 data set to be stored in memory.
715 The function assumes FULL_COV may contain columns corresponding to
716 variables that are not in the model. It fixes this in
717 REARRANG_COVARIANCE_MATRIX. This allows the caller to compute a
718 large covariance matrix once before, then pass it to this without
719 having to alter it. The problem is that this means the caller must
723 pspp_linreg_with_cov (const struct design_matrix *full_cov,
724 pspp_linreg_cache * cache)
726 struct design_matrix *cov;
728 assert (full_cov != NULL);
729 assert (cache != NULL);
731 cov = rearrange_covariance_matrix (full_cov, cache);
734 post_sweep_computations (cache, cov, cov->m);
735 covariance_matrix_destroy (cov);