1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
18 #include <gsl/gsl_fit.h>
19 #include <gsl/gsl_multifit.h>
21 #include <gsl/gsl_blas.h>
22 #include <gsl/gsl_cblas.h>
27 Find the least-squares estimate of b for the linear model:
31 where Y is an n-by-1 column vector, X is an n-by-p matrix of
32 independent variables, b is a p-by-1 vector of regression coefficients,
33 and Z is an n-by-1 normally-distributed random vector with independent
34 identically distributed components with mean 0.
36 This estimate is found via the sweep operator or singular-value
37 decomposition with gsl.
42 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
43 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
45 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
48 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
49 Springer. 1998. ISBN 0-387-98542-5.
52 #include <math/linreg/linreg.h>
53 #include <math/coefficient.h>
54 #include <gsl/gsl_errno.h>
55 #include <linreg/sweep.h>
57 Get the mean and standard deviation of a vector
58 of doubles via a form of the Kalman filter as
59 described on page 32 of [3].
62 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
71 mean = gsl_vector_get (&v.vector, 0);
73 for (i = 1; i < v.vector.size; i++)
76 tmp = gsl_vector_get (&v.vector, i);
79 variance += j * (j - 1.0) * d * d;
82 *sp = sqrt (variance / (j - 1.0));
89 Set V to contain an array of pointers to the variables
90 used in the model. V must be at least C->N_COEFFS in length.
91 The return value is the number of distinct variables found.
94 pspp_linreg_get_vars (const void *c_, const struct variable **v)
96 const pspp_linreg_cache *c = c_;
97 const struct variable *tmp;
103 Make sure the caller doesn't try to sneak a variable
104 into V that is not in the model.
106 for (i = 0; i < c->n_coeffs; i++)
111 Start at c->coeff[1] to avoid the intercept.
113 for (j = 1; j < c->n_coeffs; j++)
115 tmp = pspp_coeff_get_var (c->coeff[j], 0);
116 assert (tmp != NULL);
117 /* Repeated variables are likely to bunch together, at the end
120 while (i >= 0 && v[i] != tmp)
124 if (i < 0 && result < c->n_coeffs)
134 Allocate a pspp_linreg_cache and return a pointer
135 to it. n is the number of cases, p is the number of
136 independent variables.
139 pspp_linreg_cache_alloc (size_t n, size_t p)
141 pspp_linreg_cache *c;
143 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
145 c->indep_means = gsl_vector_alloc (p);
146 c->indep_std = gsl_vector_alloc (p);
147 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
148 independent variables.
150 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
153 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
159 c->method = PSPP_LINREG_SWEEP;
160 c->predict = pspp_linreg_predict;
161 c->residual = pspp_linreg_residual; /* The procedure to compute my
163 c->get_vars = pspp_linreg_get_vars; /* The procedure that returns
166 c->resid = NULL; /* The variable storing my residuals. */
167 c->pred = NULL; /* The variable storing my predicted values. */
173 pspp_linreg_cache_free (void *m)
177 pspp_linreg_cache *c = m;
180 gsl_vector_free (c->indep_means);
181 gsl_vector_free (c->indep_std);
182 gsl_vector_free (c->ss_indeps);
183 gsl_matrix_free (c->cov);
184 gsl_vector_free (c->ssx);
185 for (i = 0; i < c->n_coeffs; i++)
187 pspp_coeff_free (c->coeff[i]);
196 Fit the linear model via least squares. All pointers passed to pspp_linreg
197 are assumed to be allocated to the correct size and initialized to the
198 values as indicated by opts.
201 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
202 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
205 gsl_matrix *design = NULL;
208 gsl_matrix_view xmxtx;
212 gsl_vector *param_estimates;
225 if (opts->get_depvar_mean_std)
227 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
229 cache->depvar_mean = m;
230 cache->depvar_std = s;
233 for (i = 0; i < cache->n_indeps; i++)
235 if (opts->get_indep_mean_std[i])
237 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
238 gsl_vector_set (cache->indep_means, i, m);
239 gsl_vector_set (cache->indep_std, i, s);
240 gsl_vector_set (cache->ssx, i, ss);
243 cache->dft = cache->n_obs - 1;
244 cache->dfm = cache->n_indeps;
245 cache->dfe = cache->dft - cache->dfm;
246 cache->n_coeffs = X->size2 + 1; /* Adjust this later to allow for
247 regression through the origin.
249 if (cache->method == PSPP_LINREG_SWEEP)
253 Subtract the means to improve the condition of the design
254 matrix. This requires copying X and Y. We do not divide by the
255 standard deviations of the independent variables here since doing
256 so would cause a miscalculation of the residual sums of
257 squares. Dividing by the standard deviation is done GSL's linear
258 regression functions, so if the design matrix has a poor
259 condition, use QR decomposition.
261 The design matrix here does not include a column for the intercept
262 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
263 so design is allocated here when sweeping, or below if using QR.
265 design = gsl_matrix_alloc (X->size1, X->size2);
266 for (i = 0; i < X->size2; i++)
268 m = gsl_vector_get (cache->indep_means, i);
269 for (j = 0; j < X->size1; j++)
271 tmp = (gsl_matrix_get (X, j, i) - m);
272 gsl_matrix_set (design, j, i, tmp);
275 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
276 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
278 for (i = 0; i < xtx.matrix.size1; i++)
280 tmp = gsl_vector_get (cache->ssx, i);
281 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
282 xi = gsl_matrix_column (design, i);
283 for (j = (i + 1); j < xtx.matrix.size2; j++)
285 xj = gsl_matrix_column (design, j);
286 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
287 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
291 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
292 xty = gsl_matrix_column (sw, cache->n_indeps);
294 This loop starts at 1, with i=0 outside the loop, so we can get
295 the model sum of squares due to the first independent variable.
297 xi = gsl_matrix_column (design, 0);
298 gsl_blas_ddot (&(xi.vector), Y, &tmp);
299 gsl_vector_set (&(xty.vector), 0, tmp);
300 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
301 gsl_vector_set (cache->ss_indeps, 0, tmp);
302 for (i = 1; i < cache->n_indeps; i++)
304 xi = gsl_matrix_column (design, i);
305 gsl_blas_ddot (&(xi.vector), Y, &tmp);
306 gsl_vector_set (&(xty.vector), i, tmp);
310 Sweep on the matrix sw, which contains XtX, XtY and YtY.
313 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
314 cache->mse = cache->sse / cache->dfe;
318 m = cache->depvar_mean;
319 for (i = 0; i < cache->n_indeps; i++)
321 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
322 cache->coeff[i + 1]->estimate = tmp;
323 m -= tmp * gsl_vector_get (cache->indep_means, i);
326 Get the covariance matrix of the parameter estimates.
327 Only the upper triangle is necessary.
331 The loops below do not compute the entries related
332 to the estimated intercept.
334 for (i = 0; i < cache->n_indeps; i++)
335 for (j = i; j < cache->n_indeps; j++)
337 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
338 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
341 Get the covariances related to the intercept.
343 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
344 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
345 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
346 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
347 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
348 if (rc == GSL_SUCCESS)
350 tmp = cache->mse / cache->n_obs;
351 for (i = 1; i < 1 + cache->n_indeps; i++)
353 tmp -= gsl_matrix_get (cache->cov, 0, i)
354 * gsl_vector_get (cache->indep_means, i - 1);
356 gsl_matrix_set (cache->cov, 0, 0, tmp);
358 cache->coeff[0]->estimate = m;
362 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
363 __FILE__, __LINE__, gsl_strerror (rc));
366 gsl_matrix_free (sw);
368 else if (cache->method == PSPP_LINREG_CONDITIONAL_INVERSE)
371 Use the SVD of X^T X to find a conditional inverse of X^TX. If
372 the SVD is X^T X = U D V^T, then set the conditional inverse
373 to (X^T X)^c = V D^- U^T. D^- is defined as follows: If entry
374 (i, i) has value sigma_i, then entry (i, i) of D^- is 1 /
375 sigma_i if sigma_i > 0, and 0 otherwise. Then solve the normal
376 equations by setting the estimated parameter vector to
382 gsl_multifit_linear_workspace *wk;
384 Use QR decomposition via GSL.
387 param_estimates = gsl_vector_alloc (1 + X->size2);
388 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
390 for (j = 0; j < X->size1; j++)
392 gsl_matrix_set (design, j, 0, 1.0);
393 for (i = 0; i < X->size2; i++)
395 tmp = gsl_matrix_get (X, j, i);
396 gsl_matrix_set (design, j, i + 1, tmp);
400 wk = gsl_multifit_linear_alloc (design->size1, design->size2);
401 rc = gsl_multifit_linear (design, Y, param_estimates,
402 cache->cov, &(cache->sse), wk);
403 for (i = 0; i < cache->n_coeffs; i++)
405 cache->coeff[i]->estimate = gsl_vector_get (param_estimates, i);
407 if (rc == GSL_SUCCESS)
409 gsl_multifit_linear_free (wk);
410 gsl_vector_free (param_estimates);
414 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
415 __FILE__, __LINE__, rc);
420 cache->ssm = cache->sst - cache->sse;
422 Get the remaining sums of squares for the independent
426 for (i = 1; i < cache->n_indeps; i++)
429 m += gsl_vector_get (cache->ss_indeps, j);
430 tmp = cache->ssm - m;
431 gsl_vector_set (cache->ss_indeps, i, tmp);
434 gsl_matrix_free (design);