1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc. Written by Jason H. Stover.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include <gsl/gsl_fit.h>
19 #include <gsl/gsl_multifit.h>
21 #include <gsl/gsl_blas.h>
22 #include <gsl/gsl_cblas.h>
27 Find the least-squares estimate of b for the linear model:
31 where Y is an n-by-1 column vector, X is an n-by-p matrix of
32 independent variables, b is a p-by-1 vector of regression coefficients,
33 and Z is an n-by-1 normally-distributed random vector with independent
34 identically distributed components with mean 0.
36 This estimate is found via the sweep operator or singular-value
37 decomposition with gsl.
42 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
43 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
45 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
48 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
49 Springer. 1998. ISBN 0-387-98542-5.
52 #include <math/linreg/linreg.h>
53 #include <math/coefficient.h>
54 #include <gsl/gsl_errno.h>
55 #include <linreg/sweep.h>
57 Get the mean and standard deviation of a vector
58 of doubles via a form of the Kalman filter as
59 described on page 32 of [3].
62 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
71 mean = gsl_vector_get (&v.vector, 0);
73 for (i = 1; i < v.vector.size; i++)
76 tmp = gsl_vector_get (&v.vector, i);
79 variance += j * (j - 1.0) * d * d;
82 *sp = sqrt (variance / (j - 1.0));
89 Set V to contain an array of pointers to the variables
90 used in the model. V must be at least C->N_COEFFS in length.
91 The return value is the number of distinct variables found.
94 pspp_linreg_get_vars (const void *c_, const struct variable **v)
96 const pspp_linreg_cache *c = c_;
97 struct pspp_coeff *coef = NULL;
98 const struct variable *tmp;
103 Make sure the caller doesn't try to sneak a variable
104 into V that is not in the model.
106 for (i = 0; i < c->n_coeffs; i++)
111 Start at c->coeff[1] to avoid the intercept.
113 v[result] = pspp_coeff_get_var (c->coeff[1], 0);
114 result = (v[result] == NULL) ? 0 : 1;
116 for (coef = c->coeff[2]; coef < c->coeff[c->n_coeffs]; coef++)
118 tmp = pspp_coeff_get_var (coef, 0);
119 assert (tmp != NULL);
120 /* Repeated variables are likely to bunch together, at the end
123 while (i >= 0 && v[i] != tmp)
127 if (i < 0 && result < c->n_coeffs)
137 Allocate a pspp_linreg_cache and return a pointer
138 to it. n is the number of cases, p is the number of
139 independent variables.
142 pspp_linreg_cache_alloc (size_t n, size_t p)
144 pspp_linreg_cache *c;
146 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
148 c->indep_means = gsl_vector_alloc (p);
149 c->indep_std = gsl_vector_alloc (p);
150 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
151 independent variables.
153 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
156 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
162 c->method = PSPP_LINREG_SWEEP;
163 c->predict = pspp_linreg_predict;
164 c->residual = pspp_linreg_residual; /* The procedure to compute my
166 c->get_vars = pspp_linreg_get_vars; /* The procedure that returns
169 c->resid = NULL; /* The variable storing my residuals. */
170 c->pred = NULL; /* The variable storing my predicted values. */
176 pspp_linreg_cache_free (void *m)
180 pspp_linreg_cache *c = m;
183 gsl_vector_free (c->indep_means);
184 gsl_vector_free (c->indep_std);
185 gsl_vector_free (c->ss_indeps);
186 gsl_matrix_free (c->cov);
187 for (i = 0; i < c->n_coeffs; i++)
189 pspp_coeff_free (c->coeff[i]);
197 Fit the linear model via least squares. All pointers passed to pspp_linreg
198 are assumed to be allocated to the correct size and initialized to the
199 values as indicated by opts.
202 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
203 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
209 gsl_matrix_view xmxtx;
213 gsl_vector *param_estimates;
226 if (opts->get_depvar_mean_std)
228 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
230 cache->depvar_mean = m;
231 cache->depvar_std = s;
234 for (i = 0; i < cache->n_indeps; i++)
236 if (opts->get_indep_mean_std[i])
238 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
239 gsl_vector_set (cache->indep_means, i, m);
240 gsl_vector_set (cache->indep_std, i, s);
241 gsl_vector_set (cache->ssx, i, ss);
244 cache->dft = cache->n_obs - 1;
245 cache->dfm = cache->n_indeps;
246 cache->dfe = cache->dft - cache->dfm;
247 cache->n_coeffs = X->size2 + 1; /* Adjust this later to allow for
248 regression through the origin.
250 if (cache->method == PSPP_LINREG_SWEEP)
254 Subtract the means to improve the condition of the design
255 matrix. This requires copying X and Y. We do not divide by the
256 standard deviations of the independent variables here since doing
257 so would cause a miscalculation of the residual sums of
258 squares. Dividing by the standard deviation is done GSL's linear
259 regression functions, so if the design matrix has a poor
260 condition, use QR decomposition.
262 The design matrix here does not include a column for the intercept
263 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
264 so design is allocated here when sweeping, or below if using QR.
266 design = gsl_matrix_alloc (X->size1, X->size2);
267 for (i = 0; i < X->size2; i++)
269 m = gsl_vector_get (cache->indep_means, i);
270 for (j = 0; j < X->size1; j++)
272 tmp = (gsl_matrix_get (X, j, i) - m);
273 gsl_matrix_set (design, j, i, tmp);
276 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
277 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
279 for (i = 0; i < xtx.matrix.size1; i++)
281 tmp = gsl_vector_get (cache->ssx, i);
282 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
283 xi = gsl_matrix_column (design, i);
284 for (j = (i + 1); j < xtx.matrix.size2; j++)
286 xj = gsl_matrix_column (design, j);
287 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
288 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
292 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
293 xty = gsl_matrix_column (sw, cache->n_indeps);
295 This loop starts at 1, with i=0 outside the loop, so we can get
296 the model sum of squares due to the first independent variable.
298 xi = gsl_matrix_column (design, 0);
299 gsl_blas_ddot (&(xi.vector), Y, &tmp);
300 gsl_vector_set (&(xty.vector), 0, tmp);
301 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
302 gsl_vector_set (cache->ss_indeps, 0, tmp);
303 for (i = 1; i < cache->n_indeps; i++)
305 xi = gsl_matrix_column (design, i);
306 gsl_blas_ddot (&(xi.vector), Y, &tmp);
307 gsl_vector_set (&(xty.vector), i, tmp);
311 Sweep on the matrix sw, which contains XtX, XtY and YtY.
314 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
315 cache->mse = cache->sse / cache->dfe;
319 m = cache->depvar_mean;
320 for (i = 0; i < cache->n_indeps; i++)
322 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
323 cache->coeff[i + 1]->estimate = tmp;
324 m -= tmp * gsl_vector_get (cache->indep_means, i);
327 Get the covariance matrix of the parameter estimates.
328 Only the upper triangle is necessary.
332 The loops below do not compute the entries related
333 to the estimated intercept.
335 for (i = 0; i < cache->n_indeps; i++)
336 for (j = i; j < cache->n_indeps; j++)
338 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
339 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
342 Get the covariances related to the intercept.
344 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
345 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
346 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
347 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
348 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
349 if (rc == GSL_SUCCESS)
351 tmp = cache->mse / cache->n_obs;
352 for (i = 1; i < 1 + cache->n_indeps; i++)
354 tmp -= gsl_matrix_get (cache->cov, 0, i)
355 * gsl_vector_get (cache->indep_means, i - 1);
357 gsl_matrix_set (cache->cov, 0, 0, tmp);
359 cache->coeff[0]->estimate = m;
363 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
364 __FILE__, __LINE__, gsl_strerror (rc));
367 gsl_matrix_free (sw);
369 else if (cache->method == PSPP_LINREG_CONDITIONAL_INVERSE)
372 Use the SVD of X^T X to find a conditional inverse of X^TX. If
373 the SVD is X^T X = U D V^T, then set the conditional inverse
374 to (X^T X)^c = V D^- U^T. D^- is defined as follows: If entry
375 (i, i) has value sigma_i, then entry (i, i) of D^- is 1 /
376 sigma_i if sigma_i > 0, and 0 otherwise. Then solve the normal
377 equations by setting the estimated parameter vector to
383 gsl_multifit_linear_workspace *wk;
385 Use QR decomposition via GSL.
388 param_estimates = gsl_vector_alloc (1 + X->size2);
389 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
391 for (j = 0; j < X->size1; j++)
393 gsl_matrix_set (design, j, 0, 1.0);
394 for (i = 0; i < X->size2; i++)
396 tmp = gsl_matrix_get (X, j, i);
397 gsl_matrix_set (design, j, i + 1, tmp);
401 wk = gsl_multifit_linear_alloc (design->size1, design->size2);
402 rc = gsl_multifit_linear (design, Y, param_estimates,
403 cache->cov, &(cache->sse), wk);
404 for (i = 0; i < cache->n_coeffs; i++)
406 cache->coeff[i]->estimate = gsl_vector_get (param_estimates, i);
408 if (rc == GSL_SUCCESS)
410 gsl_multifit_linear_free (wk);
411 gsl_vector_free (param_estimates);
415 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
416 __FILE__, __LINE__, rc);
421 cache->ssm = cache->sst - cache->sse;
423 Get the remaining sums of squares for the independent
427 for (i = 1; i < cache->n_indeps; i++)
430 m += gsl_vector_get (cache->ss_indeps, j);
431 tmp = cache->ssm - m;
432 gsl_vector_set (cache->ss_indeps, i, tmp);
435 gsl_matrix_free (design);