1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include <gsl/gsl_blas.h>
19 #include <gsl/gsl_cblas.h>
20 #include <gsl/gsl_errno.h>
21 #include <gsl/gsl_fit.h>
22 #include <gsl/gsl_multifit.h>
23 #include <linreg/sweep.h>
24 #include <math/coefficient.h>
25 #include <math/linreg.h>
26 #include <math/coefficient.h>
27 #include <src/data/variable.h>
28 #include <src/data/value.h>
29 #include <gl/xalloc.h>
32 Find the least-squares estimate of b for the linear model:
36 where Y is an n-by-1 column vector, X is an n-by-p matrix of
37 independent variables, b is a p-by-1 vector of regression coefficients,
38 and Z is an n-by-1 normally-distributed random vector with independent
39 identically distributed components with mean 0.
41 This estimate is found via the sweep operator or singular-value
42 decomposition with gsl.
47 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
48 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
50 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
53 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
54 Springer. 1998. ISBN 0-387-98542-5.
59 Get the mean and standard deviation of a vector
60 of doubles via a form of the Kalman filter as
61 described on page 32 of [3].
64 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
73 mean = gsl_vector_get (&v.vector, 0);
75 for (i = 1; i < v.vector.size; i++)
78 tmp = gsl_vector_get (&v.vector, i);
81 variance += j * (j - 1.0) * d * d;
84 *sp = sqrt (variance / (j - 1.0));
91 Set V to contain an array of pointers to the variables
92 used in the model. V must be at least C->N_COEFFS in length.
93 The return value is the number of distinct variables found.
96 pspp_linreg_get_vars (const void *c_, const struct variable **v)
98 const pspp_linreg_cache *c = c_;
99 const struct variable *tmp;
105 Make sure the caller doesn't try to sneak a variable
106 into V that is not in the model.
108 for (i = 0; i < c->n_coeffs; i++)
112 for (j = 0; j < c->n_coeffs; j++)
114 tmp = pspp_coeff_get_var (c->coeff[j], 0);
115 assert (tmp != NULL);
116 /* Repeated variables are likely to bunch together, at the end
119 while (i >= 0 && v[i] != tmp)
123 if (i < 0 && result < c->n_coeffs)
133 Allocate a pspp_linreg_cache and return a pointer
134 to it. n is the number of cases, p is the number of
135 independent variables.
138 pspp_linreg_cache_alloc (size_t n, size_t p)
140 pspp_linreg_cache *c;
142 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
144 c->indep_means = gsl_vector_alloc (p);
145 c->indep_std = gsl_vector_alloc (p);
146 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
147 independent variables.
149 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
152 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
158 c->method = PSPP_LINREG_SWEEP;
159 c->predict = pspp_linreg_predict;
160 c->residual = pspp_linreg_residual; /* The procedure to compute my
162 c->get_vars = pspp_linreg_get_vars; /* The procedure that returns
165 c->resid = NULL; /* The variable storing my residuals. */
166 c->pred = NULL; /* The variable storing my predicted values. */
172 pspp_linreg_cache_free (void *m)
176 pspp_linreg_cache *c = m;
179 gsl_vector_free (c->indep_means);
180 gsl_vector_free (c->indep_std);
181 gsl_vector_free (c->ss_indeps);
182 gsl_matrix_free (c->cov);
183 gsl_vector_free (c->ssx);
184 for (i = 0; i < c->n_coeffs; i++)
186 pspp_coeff_free (c->coeff[i]);
195 Fit the linear model via least squares. All pointers passed to pspp_linreg
196 are assumed to be allocated to the correct size and initialized to the
197 values as indicated by opts.
200 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
201 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
204 gsl_matrix *design = NULL;
207 gsl_matrix_view xmxtx;
211 gsl_vector *param_estimates;
224 if (opts->get_depvar_mean_std)
226 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
228 cache->depvar_mean = m;
229 cache->depvar_std = s;
232 for (i = 0; i < cache->n_indeps; i++)
234 if (opts->get_indep_mean_std[i])
236 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
237 gsl_vector_set (cache->indep_means, i, m);
238 gsl_vector_set (cache->indep_std, i, s);
239 gsl_vector_set (cache->ssx, i, ss);
242 cache->dft = cache->n_obs - 1;
243 cache->dfm = cache->n_indeps;
244 cache->dfe = cache->dft - cache->dfm;
245 cache->n_coeffs = X->size2;
246 cache->intercept = 0.0;
248 if (cache->method == PSPP_LINREG_SWEEP)
252 Subtract the means to improve the condition of the design
253 matrix. This requires copying X and Y. We do not divide by the
254 standard deviations of the independent variables here since doing
255 so would cause a miscalculation of the residual sums of
256 squares. Dividing by the standard deviation is done GSL's linear
257 regression functions, so if the design matrix has a poor
258 condition, use QR decomposition.
260 The design matrix here does not include a column for the intercept
261 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
262 so design is allocated here when sweeping, or below if using QR.
264 design = gsl_matrix_alloc (X->size1, X->size2);
265 for (i = 0; i < X->size2; i++)
267 m = gsl_vector_get (cache->indep_means, i);
268 for (j = 0; j < X->size1; j++)
270 tmp = (gsl_matrix_get (X, j, i) - m);
271 gsl_matrix_set (design, j, i, tmp);
274 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
275 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
277 for (i = 0; i < xtx.matrix.size1; i++)
279 tmp = gsl_vector_get (cache->ssx, i);
280 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
281 xi = gsl_matrix_column (design, i);
282 for (j = (i + 1); j < xtx.matrix.size2; j++)
284 xj = gsl_matrix_column (design, j);
285 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
286 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
290 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
291 xty = gsl_matrix_column (sw, cache->n_indeps);
293 This loop starts at 1, with i=0 outside the loop, so we can get
294 the model sum of squares due to the first independent variable.
296 xi = gsl_matrix_column (design, 0);
297 gsl_blas_ddot (&(xi.vector), Y, &tmp);
298 gsl_vector_set (&(xty.vector), 0, tmp);
299 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
300 gsl_vector_set (cache->ss_indeps, 0, tmp);
301 for (i = 1; i < cache->n_indeps; i++)
303 xi = gsl_matrix_column (design, i);
304 gsl_blas_ddot (&(xi.vector), Y, &tmp);
305 gsl_vector_set (&(xty.vector), i, tmp);
309 Sweep on the matrix sw, which contains XtX, XtY and YtY.
312 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
313 cache->mse = cache->sse / cache->dfe;
317 m = cache->depvar_mean;
318 for (i = 0; i < cache->n_indeps; i++)
320 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
321 cache->coeff[i]->estimate = tmp;
322 m -= tmp * gsl_vector_get (cache->indep_means, i);
325 Get the covariance matrix of the parameter estimates.
326 Only the upper triangle is necessary.
330 The loops below do not compute the entries related
331 to the estimated intercept.
333 for (i = 0; i < cache->n_indeps; i++)
334 for (j = i; j < cache->n_indeps; j++)
336 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
337 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
340 Get the covariances related to the intercept.
342 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
343 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
344 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
345 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
346 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
347 if (rc == GSL_SUCCESS)
349 tmp = cache->mse / cache->n_obs;
350 for (i = 1; i < 1 + cache->n_indeps; i++)
352 tmp -= gsl_matrix_get (cache->cov, 0, i)
353 * gsl_vector_get (cache->indep_means, i - 1);
355 gsl_matrix_set (cache->cov, 0, 0, tmp);
357 cache->intercept = m;
361 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
362 __FILE__, __LINE__, gsl_strerror (rc));
365 gsl_matrix_free (sw);
367 else if (cache->method == PSPP_LINREG_CONDITIONAL_INVERSE)
370 Use the SVD of X^T X to find a conditional inverse of X^TX. If
371 the SVD is X^T X = U D V^T, then set the conditional inverse
372 to (X^T X)^c = V D^- U^T. D^- is defined as follows: If entry
373 (i, i) has value sigma_i, then entry (i, i) of D^- is 1 /
374 sigma_i if sigma_i > 0, and 0 otherwise. Then solve the normal
375 equations by setting the estimated parameter vector to
381 gsl_multifit_linear_workspace *wk;
383 Use QR decomposition via GSL.
386 param_estimates = gsl_vector_alloc (1 + X->size2);
387 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
389 for (j = 0; j < X->size1; j++)
391 gsl_matrix_set (design, j, 0, 1.0);
392 for (i = 0; i < X->size2; i++)
394 tmp = gsl_matrix_get (X, j, i);
395 gsl_matrix_set (design, j, i + 1, tmp);
399 wk = gsl_multifit_linear_alloc (design->size1, design->size2);
400 rc = gsl_multifit_linear (design, Y, param_estimates,
401 cache->cov, &(cache->sse), wk);
402 for (i = 0; i < cache->n_coeffs; i++)
404 cache->coeff[i]->estimate = gsl_vector_get (param_estimates, i + 1);
406 cache->intercept = gsl_vector_get (param_estimates, 0);
407 if (rc == GSL_SUCCESS)
409 gsl_multifit_linear_free (wk);
410 gsl_vector_free (param_estimates);
414 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
415 __FILE__, __LINE__, rc);
420 cache->ssm = cache->sst - cache->sse;
422 Get the remaining sums of squares for the independent
426 for (i = 1; i < cache->n_indeps; i++)
429 m += gsl_vector_get (cache->ss_indeps, j);
430 tmp = cache->ssm - m;
431 gsl_vector_set (cache->ss_indeps, i, tmp);
434 gsl_matrix_free (design);
439 Is the coefficient COEF contained in the list of coefficients
443 has_coefficient (const struct pspp_coeff **coef_list, const struct pspp_coeff *coef,
450 if (coef_list[i] == coef)
459 Predict the value of the dependent variable with the
460 new set of predictors. PREDICTORS must point to a list
461 of variables, each of whose values are stored in VALS,
465 pspp_linreg_predict (const struct variable **predictors,
466 const union value **vals, const void *c_, int n_vals)
468 const pspp_linreg_cache *c = c_;
470 size_t next_coef = 0;
471 const struct pspp_coeff **coef_list;
472 const struct pspp_coeff *coe;
476 if (predictors == NULL || vals == NULL || c == NULL)
480 if (c->coeff == NULL)
482 /* The stupid model: just guess the mean. */
483 return c->depvar_mean;
485 coef_list = xnmalloc (c->n_coeffs, sizeof (*coef_list));
486 result = c->intercept;
489 The loops guard against the possibility that the caller passed us
490 inadequate information, such as too few or too many values, or
491 a redundant list of variable names.
493 for (j = 0; j < n_vals; j++)
495 coe = pspp_linreg_get_coeff (c, predictors[j], vals[j]);
496 if (!has_coefficient (coef_list, coe, next_coef))
498 tmp = pspp_coeff_get_est (coe);
499 if (var_is_numeric (predictors[j]))
504 coef_list[next_coef++] = coe;
513 pspp_linreg_residual (const struct variable **predictors,
514 const union value **vals,
515 const union value *obs, const void *c, int n_vals)
520 if (predictors == NULL || vals == NULL || c == NULL || obs == NULL)
524 pred = pspp_linreg_predict (predictors, vals, c, n_vals);
526 result = gsl_isnan (pred) ? GSL_NAN : (obs->f - pred);
531 Which coefficient is associated with V? The VAL argument is relevant
532 only to categorical variables.
534 const struct pspp_coeff *
535 pspp_linreg_get_coeff (const pspp_linreg_cache * c,
536 const struct variable *v, const union value *val)
539 struct pspp_coeff *result = NULL;
540 const struct variable *tmp = NULL;
546 if (c->coeff == NULL || c->n_indeps == 0 || v == NULL)
551 result = c->coeff[0];
552 tmp = pspp_coeff_get_var (result, 0);
553 while (tmp != v && i < c->n_coeffs)
555 result = c->coeff[i];
556 tmp = pspp_coeff_get_var (result, 0);
566 if (var_is_numeric (v))
570 else if (val != NULL)
573 If v is categorical, we need to ensure the coefficient
576 while (tmp != v && i < c->n_coeffs
577 && compare_values (pspp_coeff_get_value (result, tmp),
578 val, var_get_width (v)))
581 result = c->coeff[i];
582 tmp = pspp_coeff_get_var (result, 0);
584 if (i == c->n_coeffs && tmp != v)