3 Copyright (C) 2005 Free Software Foundation, Inc.
4 Written by Jason H. Stover.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 Find the least-squares estimate of b for the linear model:
27 where Y is an n-by-1 column vector, X is an n-by-p matrix of
28 independent variables, b is a p-by-1 vector of regression coefficients,
29 and Z is an n-by-1 normally-distributed random vector with independent
30 identically distributed components with mean 0.
32 This estimate is found via the sweep operator or singular-value
33 decomposition with gsl.
38 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
39 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
41 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
44 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
45 Springer. 1998. ISBN 0-387-98542-5.
48 #include "pspp_linreg.h"
49 #include <gsl/gsl_errno.h>
51 Get the mean and standard deviation of a vector
52 of doubles via a form of the Kalman filter as
53 described on page 32 of [3].
56 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
65 mean = gsl_vector_get (&v.vector, 0);
67 for (i = 1; i < v.vector.size; i++)
70 tmp = gsl_vector_get (&v.vector, i);
73 variance += j * (j - 1.0) * d * d;
76 *sp = sqrt (variance / (j - 1.0));
83 Allocate a pspp_linreg_cache and return a pointer
84 to it. n is the number of cases, p is the number of
85 independent variables.
88 pspp_linreg_cache_alloc (size_t n, size_t p)
92 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
93 c->param_estimates = gsl_vector_alloc (p + 1);
94 c->indep_means = gsl_vector_alloc (p);
95 c->indep_std = gsl_vector_alloc (p);
96 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the independent
99 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the model
102 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
108 c->method = PSPP_LINREG_SWEEP;
114 pspp_linreg_cache_free (pspp_linreg_cache * c)
116 gsl_vector_free (c->param_estimates);
117 gsl_vector_free (c->indep_means);
118 gsl_vector_free (c->indep_std);
119 gsl_vector_free (c->ss_indeps);
120 gsl_matrix_free (c->cov);
126 Fit the linear model via least squares. All pointers passed to pspp_linreg
127 are assumed to be allocated to the correct size and initialized to the
128 values as indicated by opts.
131 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
132 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
138 gsl_matrix_view xmxtx;
154 if (opts->get_depvar_mean_std)
156 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
158 cache->depvar_mean = m;
159 cache->depvar_std = s;
162 for (i = 0; i < cache->n_indeps; i++)
164 if (opts->get_indep_mean_std[i])
166 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
167 gsl_vector_set (cache->indep_means, i, m);
168 gsl_vector_set (cache->indep_std, i, s);
169 gsl_vector_set (cache->ssx, i, ss);
172 cache->dft = cache->n_obs - 1;
173 cache->dfm = cache->n_indeps;
174 cache->dfe = cache->dft - cache->dfm;
175 cache->n_coeffs = X->size2 + 1; /* Adjust this later to allow for regression
178 if (cache->method == PSPP_LINREG_SWEEP)
182 Subtract the means to improve the condition of the design
183 matrix. This requires copying X and Y. We do not divide by the
184 standard deviations of the independent variables here since doing
185 so would cause a miscalculation of the residual sums of
186 squares. Dividing by the standard deviation is done GSL's linear
187 regression functions, so if the design matrix has a poor
188 condition, use QR decomposition.
190 The design matrix here does not include a column for the intercept
191 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
192 so design is allocated here when sweeping, or below if using QR.
194 design = gsl_matrix_alloc (X->size1, X->size2);
195 for (i = 0; i < X->size2; i++)
197 m = gsl_vector_get (cache->indep_means, i);
198 for (j = 0; j < X->size1; j++)
200 tmp = (gsl_matrix_get (X, j, i) - m);
201 gsl_matrix_set (design, j, i, tmp);
204 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
205 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
207 for (i = 0; i < xtx.matrix.size1; i++)
209 tmp = gsl_vector_get (cache->ssx, i);
210 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
211 xi = gsl_matrix_column (design, i);
212 for (j = (i + 1); j < xtx.matrix.size2; j++)
214 xj = gsl_matrix_column (design, j);
215 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
216 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
220 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
221 xty = gsl_matrix_column (sw, cache->n_indeps);
223 This loop starts at 1, with i=0 outside the loop, so we can get
224 the model sum of squares due to the first independent variable.
226 xi = gsl_matrix_column (design, 0);
227 gsl_blas_ddot (&(xi.vector), Y, &tmp);
228 gsl_vector_set (&(xty.vector), 0, tmp);
229 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
230 gsl_vector_set (cache->ss_indeps, 0, tmp);
231 for (i = 1; i < cache->n_indeps; i++)
233 xi = gsl_matrix_column (design, i);
234 gsl_blas_ddot (&(xi.vector), Y, &tmp);
235 gsl_vector_set (&(xty.vector), i, tmp);
239 Sweep on the matrix sw, which contains XtX, XtY and YtY.
242 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
243 cache->mse = cache->sse / cache->dfe;
247 m = cache->depvar_mean;
248 for (i = 0; i < cache->n_indeps; i++)
250 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
251 cache->coeff[i + 1].estimate = tmp;
252 gsl_vector_set (cache->param_estimates, i + 1, tmp);
253 m -= tmp * gsl_vector_get (cache->indep_means, i);
256 Get the covariance matrix of the parameter estimates.
257 Only the upper triangle is necessary.
261 The loops below do not compute the entries related
262 to the estimated intercept.
264 for (i = 0; i < cache->n_indeps; i++)
265 for (j = i; j < cache->n_indeps; j++)
267 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
268 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
271 Get the covariances related to the intercept.
273 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
274 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
275 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
276 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
277 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
278 if (rc == GSL_SUCCESS)
280 tmp = cache->mse / cache->n_obs;
281 for (i = 1; i < 1 + cache->n_indeps; i++)
283 tmp -= gsl_matrix_get (cache->cov, 0, i)
284 * gsl_vector_get (cache->indep_means, i - 1);
286 gsl_matrix_set (cache->cov, 0, 0, tmp);
288 gsl_vector_set (cache->param_estimates, 0, m);
292 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
293 __FILE__, __LINE__, gsl_strerror (rc));
296 gsl_matrix_free (sw);
301 Use QR decomposition via GSL.
303 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
305 for (j = 0; j < X->size1; j++)
307 gsl_matrix_set (design, j, 0, 1.0);
308 for (i = 0; i < X->size2; i++)
310 tmp = gsl_matrix_get (X, j, i);
311 gsl_matrix_set (design, j, i + 1, tmp);
314 gsl_multifit_linear_workspace *wk =
315 gsl_multifit_linear_alloc (design->size1, design->size2);
316 rc = gsl_multifit_linear (design, Y, cache->param_estimates,
317 cache->cov, &(cache->sse), wk);
318 if (rc == GSL_SUCCESS)
320 gsl_multifit_linear_free (wk);
324 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
325 __FILE__, __LINE__, rc);
330 cache->ssm = cache->sst - cache->sse;
332 Get the remaining sums of squares for the independent
336 for (i = 1; i < cache->n_indeps; i++)
339 m += gsl_vector_get (cache->ss_indeps, j);
340 tmp = cache->ssm - m;
341 gsl_vector_set (cache->ss_indeps, i, tmp);
344 gsl_matrix_free (design);