1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include <gsl/gsl_blas.h>
19 #include <gsl/gsl_cblas.h>
20 #include <gsl/gsl_errno.h>
21 #include <gsl/gsl_fit.h>
22 #include <gsl/gsl_multifit.h>
23 #include <linreg/sweep.h>
24 #include <math/coefficient.h>
25 #include <math/linreg.h>
26 #include <math/coefficient.h>
27 #include <math/design-matrix.h>
28 #include <src/data/category.h>
29 #include <src/data/variable.h>
30 #include <src/data/value.h>
31 #include <gl/xalloc.h>
34 Find the least-squares estimate of b for the linear model:
38 where Y is an n-by-1 column vector, X is an n-by-p matrix of
39 independent variables, b is a p-by-1 vector of regression coefficients,
40 and Z is an n-by-1 normally-distributed random vector with independent
41 identically distributed components with mean 0.
43 This estimate is found via the sweep operator or singular-value
44 decomposition with gsl.
49 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
50 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
52 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
55 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
56 Springer. 1998. ISBN 0-387-98542-5.
61 Get the mean and standard deviation of a vector
62 of doubles via a form of the Kalman filter as
63 described on page 32 of [3].
66 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
75 mean = gsl_vector_get (&v.vector, 0);
77 for (i = 1; i < v.vector.size; i++)
80 tmp = gsl_vector_get (&v.vector, i);
83 variance += j * (j - 1.0) * d * d;
86 *sp = sqrt (variance / (j - 1.0));
93 Set V to contain an array of pointers to the variables
94 used in the model. V must be at least C->N_COEFFS in length.
95 The return value is the number of distinct variables found.
98 pspp_linreg_get_vars (const void *c_, const struct variable **v)
100 const pspp_linreg_cache *c = c_;
101 const struct variable *tmp;
107 Make sure the caller doesn't try to sneak a variable
108 into V that is not in the model.
110 for (i = 0; i < c->n_coeffs; i++)
114 for (j = 0; j < c->n_coeffs; j++)
116 tmp = pspp_coeff_get_var (c->coeff[j], 0);
117 assert (tmp != NULL);
118 /* Repeated variables are likely to bunch together, at the end
121 while (i >= 0 && v[i] != tmp)
125 if (i < 0 && result < c->n_coeffs)
135 Allocate a pspp_linreg_cache and return a pointer
136 to it. n is the number of cases, p is the number of
137 independent variables.
140 pspp_linreg_cache_alloc (size_t n, size_t p)
142 pspp_linreg_cache *c;
144 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
146 c->indep_means = gsl_vector_alloc (p);
147 c->indep_std = gsl_vector_alloc (p);
148 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
149 independent variables.
151 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
154 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
160 c->method = PSPP_LINREG_SWEEP;
161 c->predict = pspp_linreg_predict;
162 c->residual = pspp_linreg_residual; /* The procedure to compute my
164 c->get_vars = pspp_linreg_get_vars; /* The procedure that returns
167 c->resid = NULL; /* The variable storing my residuals. */
168 c->pred = NULL; /* The variable storing my predicted values. */
174 pspp_linreg_cache_free (void *m)
178 pspp_linreg_cache *c = m;
181 gsl_vector_free (c->indep_means);
182 gsl_vector_free (c->indep_std);
183 gsl_vector_free (c->ss_indeps);
184 gsl_matrix_free (c->cov);
185 gsl_vector_free (c->ssx);
186 for (i = 0; i < c->n_coeffs; i++)
188 pspp_coeff_free (c->coeff[i]);
197 Fit the linear model via least squares. All pointers passed to pspp_linreg
198 are assumed to be allocated to the correct size and initialized to the
199 values as indicated by opts.
202 pspp_linreg (const gsl_vector * Y, const struct design_matrix *dm,
203 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
206 gsl_matrix *design = NULL;
209 gsl_matrix_view xmxtx;
213 gsl_vector *param_estimates;
214 struct pspp_coeff *coef;
215 const struct variable *v;
216 const union value *val;
229 if (opts->get_depvar_mean_std)
231 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
233 cache->depvar_mean = m;
234 cache->depvar_std = s;
238 cache->dft = cache->n_obs - 1;
239 cache->dfm = cache->n_indeps;
240 cache->dfe = cache->dft - cache->dfm;
241 cache->n_coeffs = dm->m->size2;
242 cache->intercept = 0.0;
244 for (i = 0; i < dm->m->size2; i++)
246 if (opts->get_indep_mean_std[i])
248 linreg_mean_std (gsl_matrix_const_column (dm->m, i), &m, &s, &ss);
249 v = design_matrix_col_to_var (dm, i);
251 if (var_is_alpha (v))
253 j = i - design_matrix_var_to_column (dm, v);
254 val = cat_subscript_to_value (j, v);
256 coef = pspp_linreg_get_coeff (cache, v, val);
257 pspp_coeff_set_mean (coef, m);
258 pspp_coeff_set_sd (coef, s);
259 gsl_vector_set (cache->ssx, i, ss);
264 if (cache->method == PSPP_LINREG_SWEEP)
268 Subtract the means to improve the condition of the design
269 matrix. This requires copying dm->m and Y. We do not divide by the
270 standard deviations of the independent variables here since doing
271 so would cause a miscalculation of the residual sums of
272 squares. Dividing by the standard deviation is done GSL's linear
273 regression functions, so if the design matrix has a poor
274 condition, use QR decomposition.
276 The design matrix here does not include a column for the intercept
277 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
278 so design is allocated here when sweeping, or below if using QR.
280 design = gsl_matrix_alloc (dm->m->size1, dm->m->size2);
281 for (i = 0; i < dm->m->size2; i++)
283 v = design_matrix_col_to_var (dm, i);
284 m = pspp_linreg_get_indep_variable_mean (cache, v);
285 for (j = 0; j < dm->m->size1; j++)
287 tmp = (gsl_matrix_get (dm->m, j, i) - m);
288 gsl_matrix_set (design, j, i, tmp);
291 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
292 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
294 for (i = 0; i < xtx.matrix.size1; i++)
296 tmp = gsl_vector_get (cache->ssx, i);
297 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
298 xi = gsl_matrix_column (design, i);
299 for (j = (i + 1); j < xtx.matrix.size2; j++)
301 xj = gsl_matrix_column (design, j);
302 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
303 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
307 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
308 xty = gsl_matrix_column (sw, cache->n_indeps);
310 This loop starts at 1, with i=0 outside the loop, so we can get
311 the model sum of squares due to the first independent variable.
313 xi = gsl_matrix_column (design, 0);
314 gsl_blas_ddot (&(xi.vector), Y, &tmp);
315 gsl_vector_set (&(xty.vector), 0, tmp);
316 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
317 gsl_vector_set (cache->ss_indeps, 0, tmp);
318 for (i = 1; i < cache->n_indeps; i++)
320 xi = gsl_matrix_column (design, i);
321 gsl_blas_ddot (&(xi.vector), Y, &tmp);
322 gsl_vector_set (&(xty.vector), i, tmp);
326 Sweep on the matrix sw, which contains XtX, XtY and YtY.
329 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
330 cache->mse = cache->sse / cache->dfe;
334 m = cache->depvar_mean;
335 for (i = 0; i < cache->n_indeps; i++)
337 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
338 cache->coeff[i]->estimate = tmp;
339 m -= tmp * pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i));
342 Get the covariance matrix of the parameter estimates.
343 Only the upper triangle is necessary.
347 The loops below do not compute the entries related
348 to the estimated intercept.
350 for (i = 0; i < cache->n_indeps; i++)
351 for (j = i; j < cache->n_indeps; j++)
353 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
354 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
357 Get the covariances related to the intercept.
359 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
360 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
361 xm = gsl_matrix_calloc (1, cache->n_indeps);
362 for (i = 0; i < xm->size2; i++)
364 gsl_matrix_set (xm, 0, i,
365 pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i)));
367 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
368 &xtx.matrix, xm, 0.0, &xmxtx.matrix);
369 gsl_matrix_free (xm);
370 if (rc == GSL_SUCCESS)
372 tmp = cache->mse / cache->n_obs;
373 for (i = 1; i < 1 + cache->n_indeps; i++)
375 tmp -= gsl_matrix_get (cache->cov, 0, i)
376 * pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i - 1));
378 gsl_matrix_set (cache->cov, 0, 0, tmp);
380 cache->intercept = m;
384 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
385 __FILE__, __LINE__, gsl_strerror (rc));
388 gsl_matrix_free (sw);
390 else if (cache->method == PSPP_LINREG_CONDITIONAL_INVERSE)
393 Use the SVD of X^T X to find a conditional inverse of X^TX. If
394 the SVD is X^T X = U D V^T, then set the conditional inverse
395 to (X^T X)^c = V D^- U^T. D^- is defined as follows: If entry
396 (i, i) has value sigma_i, then entry (i, i) of D^- is 1 /
397 sigma_i if sigma_i > 0, and 0 otherwise. Then solve the normal
398 equations by setting the estimated parameter vector to
404 gsl_multifit_linear_workspace *wk;
406 Use QR decomposition via GSL.
409 param_estimates = gsl_vector_alloc (1 + dm->m->size2);
410 design = gsl_matrix_alloc (dm->m->size1, 1 + dm->m->size2);
412 for (j = 0; j < dm->m->size1; j++)
414 gsl_matrix_set (design, j, 0, 1.0);
415 for (i = 0; i < dm->m->size2; i++)
417 tmp = gsl_matrix_get (dm->m, j, i);
418 gsl_matrix_set (design, j, i + 1, tmp);
422 wk = gsl_multifit_linear_alloc (design->size1, design->size2);
423 rc = gsl_multifit_linear (design, Y, param_estimates,
424 cache->cov, &(cache->sse), wk);
425 for (i = 0; i < cache->n_coeffs; i++)
427 cache->coeff[i]->estimate = gsl_vector_get (param_estimates, i + 1);
429 cache->intercept = gsl_vector_get (param_estimates, 0);
430 if (rc == GSL_SUCCESS)
432 gsl_multifit_linear_free (wk);
433 gsl_vector_free (param_estimates);
437 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
438 __FILE__, __LINE__, rc);
443 cache->ssm = cache->sst - cache->sse;
445 Get the remaining sums of squares for the independent
449 for (i = 1; i < cache->n_indeps; i++)
452 m += gsl_vector_get (cache->ss_indeps, j);
453 tmp = cache->ssm - m;
454 gsl_vector_set (cache->ss_indeps, i, tmp);
457 gsl_matrix_free (design);
462 Is the coefficient COEF contained in the list of coefficients
466 has_coefficient (const struct pspp_coeff **coef_list, const struct pspp_coeff *coef,
473 if (coef_list[i] == coef)
482 Predict the value of the dependent variable with the
483 new set of predictors. PREDICTORS must point to a list
484 of variables, each of whose values are stored in VALS,
488 pspp_linreg_predict (const struct variable **predictors,
489 const union value **vals, const void *c_, int n_vals)
491 const pspp_linreg_cache *c = c_;
493 size_t next_coef = 0;
494 const struct pspp_coeff **coef_list;
495 const struct pspp_coeff *coe;
499 if (predictors == NULL || vals == NULL || c == NULL)
503 if (c->coeff == NULL)
505 /* The stupid model: just guess the mean. */
506 return c->depvar_mean;
508 coef_list = xnmalloc (c->n_coeffs, sizeof (*coef_list));
509 result = c->intercept;
512 The loops guard against the possibility that the caller passed us
513 inadequate information, such as too few or too many values, or
514 a redundant list of variable names.
516 for (j = 0; j < n_vals; j++)
518 coe = pspp_linreg_get_coeff (c, predictors[j], vals[j]);
519 if (!has_coefficient (coef_list, coe, next_coef))
521 tmp = pspp_coeff_get_est (coe);
522 if (var_is_numeric (predictors[j]))
527 coef_list[next_coef++] = coe;
536 pspp_linreg_residual (const struct variable **predictors,
537 const union value **vals,
538 const union value *obs, const void *c, int n_vals)
543 if (predictors == NULL || vals == NULL || c == NULL || obs == NULL)
547 pred = pspp_linreg_predict (predictors, vals, c, n_vals);
549 result = isnan (pred) ? GSL_NAN : (obs->f - pred);
554 Which coefficient is associated with V? The VAL argument is relevant
555 only to categorical variables.
558 pspp_linreg_get_coeff (const pspp_linreg_cache * c,
559 const struct variable *v, const union value *val)
565 if (c->coeff == NULL || c->n_indeps == 0 || v == NULL)
569 return pspp_coeff_var_to_coeff (v, c->coeff, c->n_coeffs, val);
572 Return the standard deviation of the independent variable.
574 double pspp_linreg_get_indep_variable_sd (pspp_linreg_cache *c, const struct variable *v)
576 if (var_is_numeric (v))
578 const struct pspp_coeff *coef;
579 coef = pspp_linreg_get_coeff (c, v, NULL);
580 return pspp_coeff_get_sd (coef);
585 void pspp_linreg_set_indep_variable_sd (pspp_linreg_cache *c, const struct variable *v,
588 if (var_is_numeric (v))
590 struct pspp_coeff *coef;
591 coef = pspp_linreg_get_coeff (c, v, NULL);
592 pspp_coeff_set_sd (coef, s);
597 Mean of the independent variable.
599 double pspp_linreg_get_indep_variable_mean (pspp_linreg_cache *c, const struct variable *v)
601 if (var_is_numeric (v))
603 struct pspp_coeff *coef;
604 coef = pspp_linreg_get_coeff (c, v, NULL);
605 return pspp_coeff_get_mean (coef);
610 void pspp_linreg_set_indep_variable_mean (pspp_linreg_cache *c, const struct variable *v,
613 if (var_is_numeric (v))
615 struct pspp_coeff *coef;
616 coef = pspp_linreg_get_coeff (c, v, NULL);
617 pspp_coeff_set_mean (coef, m);