1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include <gsl/gsl_blas.h>
19 #include <gsl/gsl_cblas.h>
20 #include <gsl/gsl_errno.h>
21 #include <gsl/gsl_fit.h>
22 #include <gsl/gsl_multifit.h>
23 #include <linreg/sweep.h>
24 #include <math/coefficient.h>
25 #include <math/linreg.h>
26 #include <math/coefficient.h>
27 #include <math/covariance-matrix.h>
28 #include <math/design-matrix.h>
29 #include <src/data/category.h>
30 #include <src/data/variable.h>
31 #include <src/data/value.h>
32 #include <gl/xalloc.h>
35 Find the least-squares estimate of b for the linear model:
39 where Y is an n-by-1 column vector, X is an n-by-p matrix of
40 independent variables, b is a p-by-1 vector of regression coefficients,
41 and Z is an n-by-1 normally-distributed random vector with independent
42 identically distributed components with mean 0.
44 This estimate is found via the sweep operator or singular-value
45 decomposition with gsl.
50 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
51 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
53 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
56 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
57 Springer. 1998. ISBN 0-387-98542-5.
62 Get the mean and standard deviation of a vector
63 of doubles via a form of the Kalman filter as
64 described on page 32 of [3].
67 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
76 mean = gsl_vector_get (&v.vector, 0);
78 for (i = 1; i < v.vector.size; i++)
81 tmp = gsl_vector_get (&v.vector, i);
84 variance += j * (j - 1.0) * d * d;
87 *sp = sqrt (variance / (j - 1.0));
94 Set V to contain an array of pointers to the variables
95 used in the model. V must be at least C->N_COEFFS in length.
96 The return value is the number of distinct variables found.
99 pspp_linreg_get_vars (const void *c_, const struct variable **v)
101 const pspp_linreg_cache *c = c_;
102 const struct variable *tmp;
108 Make sure the caller doesn't try to sneak a variable
109 into V that is not in the model.
111 for (i = 0; i < c->n_coeffs; i++)
115 for (j = 0; j < c->n_coeffs; j++)
117 tmp = pspp_coeff_get_var (c->coeff[j], 0);
118 assert (tmp != NULL);
119 /* Repeated variables are likely to bunch together, at the end
122 while (i >= 0 && v[i] != tmp)
126 if (i < 0 && result < c->n_coeffs)
136 Allocate a pspp_linreg_cache and return a pointer
137 to it. n is the number of cases, p is the number of
138 independent variables.
141 pspp_linreg_cache_alloc (size_t n, size_t p)
143 pspp_linreg_cache *c;
145 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
147 c->indep_means = gsl_vector_alloc (p);
148 c->indep_std = gsl_vector_alloc (p);
149 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
150 independent variables.
152 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
155 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
161 c->method = PSPP_LINREG_SWEEP;
162 c->predict = pspp_linreg_predict;
163 c->residual = pspp_linreg_residual; /* The procedure to compute my
165 c->get_vars = pspp_linreg_get_vars; /* The procedure that returns
168 c->resid = NULL; /* The variable storing my residuals. */
169 c->pred = NULL; /* The variable storing my predicted values. */
175 pspp_linreg_cache_free (void *m)
179 pspp_linreg_cache *c = m;
182 gsl_vector_free (c->indep_means);
183 gsl_vector_free (c->indep_std);
184 gsl_vector_free (c->ss_indeps);
185 gsl_matrix_free (c->cov);
186 gsl_vector_free (c->ssx);
187 for (i = 0; i < c->n_coeffs; i++)
189 pspp_coeff_free (c->coeff[i]);
197 cache_init (pspp_linreg_cache *cache, const struct design_matrix *dm)
199 assert (cache != NULL);
200 cache->dft = cache->n_obs - 1;
201 cache->dfm = cache->n_indeps;
202 cache->dfe = cache->dft - cache->dfm;
203 cache->n_coeffs = dm->m->size2;
204 cache->intercept = 0.0;
208 post_sweep_computations (pspp_linreg_cache *cache, const struct design_matrix *dm,
213 gsl_matrix_view xmxtx;
221 assert (cache != NULL);
223 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
224 cache->mse = cache->sse / cache->dfe;
228 m = cache->depvar_mean;
229 for (i = 0; i < cache->n_indeps; i++)
231 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
232 cache->coeff[i]->estimate = tmp;
233 m -= tmp * pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i));
236 Get the covariance matrix of the parameter estimates.
237 Only the upper triangle is necessary.
241 The loops below do not compute the entries related
242 to the estimated intercept.
244 for (i = 0; i < cache->n_indeps; i++)
245 for (j = i; j < cache->n_indeps; j++)
247 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
248 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
251 Get the covariances related to the intercept.
253 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
254 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
255 xm = gsl_matrix_calloc (1, cache->n_indeps);
256 for (i = 0; i < xm->size2; i++)
258 gsl_matrix_set (xm, 0, i,
259 pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i)));
261 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
262 &xtx.matrix, xm, 0.0, &xmxtx.matrix);
263 gsl_matrix_free (xm);
264 if (rc == GSL_SUCCESS)
266 tmp = cache->mse / cache->n_obs;
267 for (i = 1; i < 1 + cache->n_indeps; i++)
269 tmp -= gsl_matrix_get (cache->cov, 0, i)
270 * pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i - 1));
272 gsl_matrix_set (cache->cov, 0, 0, tmp);
274 cache->intercept = m;
278 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
279 __FILE__, __LINE__, gsl_strerror (rc));
285 Fit the linear model via least squares. All pointers passed to pspp_linreg
286 are assumed to be allocated to the correct size and initialized to the
287 values as indicated by opts.
290 pspp_linreg (const gsl_vector * Y, const struct design_matrix *dm,
291 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
294 gsl_matrix *design = NULL;
299 gsl_vector *param_estimates;
300 struct pspp_coeff *coef;
301 const struct variable *v;
302 const union value *val;
315 if (opts->get_depvar_mean_std)
317 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
319 cache->depvar_mean = m;
320 cache->depvar_std = s;
323 cache_init (cache, dm);
324 for (i = 0; i < dm->m->size2; i++)
326 if (opts->get_indep_mean_std[i])
328 linreg_mean_std (gsl_matrix_const_column (dm->m, i), &m, &s, &ss);
329 v = design_matrix_col_to_var (dm, i);
331 if (var_is_alpha (v))
333 j = i - design_matrix_var_to_column (dm, v);
334 val = cat_subscript_to_value (j, v);
336 coef = pspp_linreg_get_coeff (cache, v, val);
337 pspp_coeff_set_mean (coef, m);
338 pspp_coeff_set_sd (coef, s);
339 gsl_vector_set (cache->ssx, i, ss);
344 if (cache->method == PSPP_LINREG_SWEEP)
348 Subtract the means to improve the condition of the design
349 matrix. This requires copying dm->m and Y. We do not divide by the
350 standard deviations of the independent variables here since doing
351 so would cause a miscalculation of the residual sums of
352 squares. Dividing by the standard deviation is done GSL's linear
353 regression functions, so if the design matrix has a poor
354 condition, use QR decomposition.
356 The design matrix here does not include a column for the intercept
357 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
358 so design is allocated here when sweeping, or below if using QR.
360 design = gsl_matrix_alloc (dm->m->size1, dm->m->size2);
361 for (i = 0; i < dm->m->size2; i++)
363 v = design_matrix_col_to_var (dm, i);
364 m = pspp_linreg_get_indep_variable_mean (cache, v);
365 for (j = 0; j < dm->m->size1; j++)
367 tmp = (gsl_matrix_get (dm->m, j, i) - m);
368 gsl_matrix_set (design, j, i, tmp);
371 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
372 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
374 for (i = 0; i < xtx.matrix.size1; i++)
376 tmp = gsl_vector_get (cache->ssx, i);
377 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
378 xi = gsl_matrix_column (design, i);
379 for (j = (i + 1); j < xtx.matrix.size2; j++)
381 xj = gsl_matrix_column (design, j);
382 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
383 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
387 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
388 xty = gsl_matrix_column (sw, cache->n_indeps);
390 This loop starts at 1, with i=0 outside the loop, so we can get
391 the model sum of squares due to the first independent variable.
393 xi = gsl_matrix_column (design, 0);
394 gsl_blas_ddot (&(xi.vector), Y, &tmp);
395 gsl_vector_set (&(xty.vector), 0, tmp);
396 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
397 gsl_vector_set (cache->ss_indeps, 0, tmp);
398 for (i = 1; i < cache->n_indeps; i++)
400 xi = gsl_matrix_column (design, i);
401 gsl_blas_ddot (&(xi.vector), Y, &tmp);
402 gsl_vector_set (&(xty.vector), i, tmp);
406 Sweep on the matrix sw, which contains XtX, XtY and YtY.
409 post_sweep_computations (cache, dm, sw);
410 gsl_matrix_free (sw);
412 else if (cache->method == PSPP_LINREG_CONDITIONAL_INVERSE)
415 Use the SVD of X^T X to find a conditional inverse of X^TX. If
416 the SVD is X^T X = U D V^T, then set the conditional inverse
417 to (X^T X)^c = V D^- U^T. D^- is defined as follows: If entry
418 (i, i) has value sigma_i, then entry (i, i) of D^- is 1 /
419 sigma_i if sigma_i > 0, and 0 otherwise. Then solve the normal
420 equations by setting the estimated parameter vector to
426 gsl_multifit_linear_workspace *wk;
428 Use QR decomposition via GSL.
431 param_estimates = gsl_vector_alloc (1 + dm->m->size2);
432 design = gsl_matrix_alloc (dm->m->size1, 1 + dm->m->size2);
434 for (j = 0; j < dm->m->size1; j++)
436 gsl_matrix_set (design, j, 0, 1.0);
437 for (i = 0; i < dm->m->size2; i++)
439 tmp = gsl_matrix_get (dm->m, j, i);
440 gsl_matrix_set (design, j, i + 1, tmp);
444 wk = gsl_multifit_linear_alloc (design->size1, design->size2);
445 rc = gsl_multifit_linear (design, Y, param_estimates,
446 cache->cov, &(cache->sse), wk);
447 for (i = 0; i < cache->n_coeffs; i++)
449 cache->coeff[i]->estimate = gsl_vector_get (param_estimates, i + 1);
451 cache->intercept = gsl_vector_get (param_estimates, 0);
452 if (rc == GSL_SUCCESS)
454 gsl_multifit_linear_free (wk);
455 gsl_vector_free (param_estimates);
459 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
460 __FILE__, __LINE__, rc);
465 cache->ssm = cache->sst - cache->sse;
467 Get the remaining sums of squares for the independent
471 for (i = 1; i < cache->n_indeps; i++)
474 m += gsl_vector_get (cache->ss_indeps, j);
475 tmp = cache->ssm - m;
476 gsl_vector_set (cache->ss_indeps, i, tmp);
479 gsl_matrix_free (design);
484 Is the coefficient COEF contained in the list of coefficients
488 has_coefficient (const struct pspp_coeff **coef_list, const struct pspp_coeff *coef,
495 if (coef_list[i] == coef)
504 Predict the value of the dependent variable with the
505 new set of predictors. PREDICTORS must point to a list
506 of variables, each of whose values are stored in VALS,
510 pspp_linreg_predict (const struct variable **predictors,
511 const union value **vals, const void *c_, int n_vals)
513 const pspp_linreg_cache *c = c_;
515 size_t next_coef = 0;
516 const struct pspp_coeff **coef_list;
517 const struct pspp_coeff *coe;
521 if (predictors == NULL || vals == NULL || c == NULL)
525 if (c->coeff == NULL)
527 /* The stupid model: just guess the mean. */
528 return c->depvar_mean;
530 coef_list = xnmalloc (c->n_coeffs, sizeof (*coef_list));
531 result = c->intercept;
534 The loops guard against the possibility that the caller passed us
535 inadequate information, such as too few or too many values, or
536 a redundant list of variable names.
538 for (j = 0; j < n_vals; j++)
540 coe = pspp_linreg_get_coeff (c, predictors[j], vals[j]);
541 if (!has_coefficient (coef_list, coe, next_coef))
543 tmp = pspp_coeff_get_est (coe);
544 if (var_is_numeric (predictors[j]))
549 coef_list[next_coef++] = coe;
558 pspp_linreg_residual (const struct variable **predictors,
559 const union value **vals,
560 const union value *obs, const void *c, int n_vals)
565 if (predictors == NULL || vals == NULL || c == NULL || obs == NULL)
569 pred = pspp_linreg_predict (predictors, vals, c, n_vals);
571 result = isnan (pred) ? GSL_NAN : (obs->f - pred);
576 Which coefficient is associated with V? The VAL argument is relevant
577 only to categorical variables.
580 pspp_linreg_get_coeff (const pspp_linreg_cache * c,
581 const struct variable *v, const union value *val)
587 if (c->coeff == NULL || c->n_indeps == 0 || v == NULL)
591 return pspp_coeff_var_to_coeff (v, c->coeff, c->n_coeffs, val);
594 Return the standard deviation of the independent variable.
596 double pspp_linreg_get_indep_variable_sd (pspp_linreg_cache *c, const struct variable *v)
598 if (var_is_numeric (v))
600 const struct pspp_coeff *coef;
601 coef = pspp_linreg_get_coeff (c, v, NULL);
602 return pspp_coeff_get_sd (coef);
607 void pspp_linreg_set_indep_variable_sd (pspp_linreg_cache *c, const struct variable *v,
610 if (var_is_numeric (v))
612 struct pspp_coeff *coef;
613 coef = pspp_linreg_get_coeff (c, v, NULL);
614 pspp_coeff_set_sd (coef, s);
619 Mean of the independent variable.
621 double pspp_linreg_get_indep_variable_mean (pspp_linreg_cache *c, const struct variable *v)
623 if (var_is_numeric (v))
625 struct pspp_coeff *coef;
626 coef = pspp_linreg_get_coeff (c, v, NULL);
627 return pspp_coeff_get_mean (coef);
632 void pspp_linreg_set_indep_variable_mean (pspp_linreg_cache *c, const struct variable *v,
635 if (var_is_numeric (v))
637 struct pspp_coeff *coef;
638 coef = pspp_linreg_get_coeff (c, v, NULL);
639 pspp_coeff_set_mean (coef, m);
644 Make sure the dependent variable is at the last column, and that
645 only variables in the model are in the covariance matrix.
647 static struct design_matrix *
648 rearrange_covariance_matrix (const struct design_matrix *cov, pspp_linreg_cache *c)
651 struct variable **model_vars;
652 struct variable *tmp;
653 struct design_matrix *result;
662 assert (cov != NULL);
664 assert (cov->m->size1 > 0);
665 assert (cov->m->size2 == cov->m->size1);
666 v = xnmalloc (c->n_coeffs, sizeof (*v));
667 model_vars = xnmalloc (c->n_coeffs, sizeof (*model_vars));
668 columns = xnmalloc (cov->m->size2, sizeof (*columns));
669 n_vars = pspp_linreg_get_vars (c, (const struct variable **) v);
672 for (i = 0; i < cov->m->size2; i++)
674 tmp = design_matrix_col_to_var (cov, i);
677 while (!found && j < n_vars)
682 if (tmp == c->depvar)
696 columns[k] = dep_col;
698 K should now be equal to C->N_INDEPS + 1. If it is not, then
699 either the code above is wrong or the caller didn't send us the
702 assert (k == c->n_indeps + 1);
704 Put the model variables in the right order in MODEL_VARS.
706 for (i = 0; i < k; i++)
708 model_vars[i] = v[columns[i]];
711 result = covariance_matrix_create (k, model_vars);
712 for (i = 0; i < result->m->size1; i++)
714 for (j = 0; j < result->m->size2; j++)
716 gsl_matrix_set (result->m, i, j, gsl_matrix_get (cov->m, columns[i], columns[j]));
724 Estimate the model parameters from the covariance matrix only. This
725 method uses less memory than PSPP_LINREG, which requires the entire
726 data set to be stored in memory.
729 pspp_linreg_with_cov (const struct design_matrix *full_cov,
730 pspp_linreg_cache * cache)
732 struct design_matrix *cov;
734 assert (cov != NULL);
735 assert (cache != NULL);
737 cov = rearrange_covariance_matrix (full_cov, cache);
738 cache_init (cache, cov);
740 post_sweep_computations (cache, cov, cov->m);
741 covariance_matrix_destroy (cov);