3 Copyright (C) 2005 Free Software Foundation, Inc.
4 Written by Jason H. Stover.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 Find the least-squares estimate of b for the linear model:
27 where Y is an n-by-1 column vector, X is an n-by-p matrix of
28 independent variables, b is a p-by-1 vector of regression coefficients,
29 and Z is an n-by-1 normally-distributed random vector with independent
30 identically distributed components with mean 0.
32 This estimate is found via the sweep operator or singular-value
33 decomposition with gsl.
38 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
39 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
41 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
44 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
45 Springer. 1998. ISBN 0-387-98542-5.
48 #include "pspp_linreg.h"
49 #include <gsl/gsl_errno.h>
51 Get the mean and standard deviation of a vector
52 of doubles via a form of the Kalman filter as
53 described on page 32 of [3].
56 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
65 mean = gsl_vector_get (&v.vector, 0);
67 for (i = 1; i < v.vector.size; i++)
70 tmp = gsl_vector_get (&v.vector, i);
73 variance += j * (j - 1.0) * d * d;
76 *sp = sqrt (variance / (j - 1.0));
83 Allocate a pspp_linreg_cache and return a pointer
84 to it. n is the number of cases, p is the number of
85 independent variables.
88 pspp_linreg_cache_alloc (size_t n, size_t p)
90 pspp_linreg_cache *cache;
92 cache = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
93 cache->param_estimates = gsl_vector_alloc (p + 1);
94 cache->indep_means = gsl_vector_alloc (p);
95 cache->indep_std = gsl_vector_alloc (p);
96 cache->ssx = gsl_vector_alloc (p); /* Sums of squares for the independent
99 cache->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the model
102 cache->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
108 cache->method = PSPP_LINREG_SWEEP;
114 pspp_linreg_cache_free (pspp_linreg_cache * cache)
116 gsl_vector_free (cache->param_estimates);
117 gsl_vector_free (cache->indep_means);
118 gsl_vector_free (cache->indep_std);
119 gsl_vector_free (cache->ss_indeps);
120 gsl_matrix_free (cache->cov);
125 Fit the linear model via least squares. All pointers passed to pspp_linreg
126 are assumed to be allocated to the correct size and initialized to the
127 values as indicated by opts.
130 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
131 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
137 gsl_matrix_view xmxtx;
154 if (opts->get_depvar_mean_std)
156 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
158 cache->depvar_mean = m;
159 cache->depvar_std = s;
162 for (i = 0; i < cache->n_indeps; i++)
164 if (opts->get_indep_mean_std[i])
166 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
167 gsl_vector_set (cache->indep_means, i, m);
168 gsl_vector_set (cache->indep_std, i, s);
169 gsl_vector_set (cache->ssx, i, ss);
172 cache->dft = cache->n_obs - 1;
173 cache->dfm = cache->n_indeps;
174 cache->dfe = cache->dft - cache->dfm;
175 if (cache->method == PSPP_LINREG_SWEEP)
179 Subtract the means to improve the condition of the design
180 matrix. This requires copying X and Y. We do not divide by the
181 standard deviations of the independent variables here since doing
182 so would cause a miscalculation of the residual sums of
183 squares. Dividing by the standard deviation is done GSL's linear
184 regression functions, so if the design matrix has a very poor
185 condition, use QR decomposition.
187 The design matrix here does not include a column for the intercept
188 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
189 so design is allocated here when sweeping, or below if using QR.
191 design = gsl_matrix_alloc (X->size1, X->size2);
192 for (i = 0; i < X->size2; i++)
194 m = gsl_vector_get (cache->indep_means, i);
195 for (j = 0; j < X->size1; j++)
197 tmp = (gsl_matrix_get (X, j, i) - m);
198 gsl_matrix_set (design, j, i, tmp);
201 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
202 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
204 for (i = 0; i < xtx.matrix.size1; i++)
206 tmp = gsl_vector_get (cache->ssx, i);
207 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
208 xi = gsl_matrix_column (design, i);
209 for (j = (i + 1); j < xtx.matrix.size2; j++)
211 xj = gsl_matrix_column (design, j);
212 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
213 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
217 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
218 xty = gsl_matrix_column (sw, cache->n_indeps);
220 This loop starts at 1, with i=0 outside the loop, so we can get
221 the model sum of squares due to the first independent variable.
223 xi = gsl_matrix_column (design, 0);
224 gsl_blas_ddot (&(xi.vector), Y, &tmp);
225 gsl_vector_set (&(xty.vector), 0, tmp);
226 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
227 gsl_vector_set (cache->ss_indeps, 0, tmp);
228 for (i = 1; i < cache->n_indeps; i++)
230 xi = gsl_matrix_column (design, i);
231 gsl_blas_ddot (&(xi.vector), Y, &tmp);
232 gsl_vector_set (&(xty.vector), i, tmp);
236 Sweep on the matrix sw, which contains XtX, XtY and YtY.
239 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
240 cache->mse = cache->sse / cache->dfe;
244 m = cache->depvar_mean;
245 for (i = 0; i < cache->n_indeps; i++)
247 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
248 gsl_vector_set (cache->param_estimates, i + 1, tmp);
249 m -= tmp * gsl_vector_get (cache->indep_means, i);
252 Get the covariance matrix of the parameter estimates.
253 Only the upper triangle is necessary.
257 The loops below do not compute the entries related
258 to the estimated intercept.
260 for (i = 0; i < cache->n_indeps; i++)
261 for (j = i; j < cache->n_indeps; j++)
263 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
264 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
267 Get the covariances related to the intercept.
269 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
270 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
271 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
272 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
273 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
274 if (rc == GSL_SUCCESS)
276 tmp = cache->mse / cache->n_obs;
277 for (i = 1; i < 1 + cache->n_indeps; i++)
279 tmp -= gsl_matrix_get (cache->cov, 0, i)
280 * gsl_vector_get (cache->indep_means, i - 1);
282 gsl_matrix_set (cache->cov, 0, 0, tmp);
284 gsl_vector_set (cache->param_estimates, 0, m);
288 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
289 __FILE__, __LINE__, gsl_strerror (rc));
292 gsl_matrix_free (sw);
297 Use QR decomposition via GSL.
299 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
301 for (j = 0; j < X->size1; j++)
303 gsl_matrix_set (design, j, 0, 1.0);
304 for (i = 0; i < X->size2; i++)
306 tmp = gsl_matrix_get (X, j, i);
307 gsl_matrix_set (design, j, i + 1, tmp);
310 gsl_multifit_linear_workspace *wk =
311 gsl_multifit_linear_alloc (design->size1, design->size2);
312 rc = gsl_multifit_linear (design, Y, cache->param_estimates,
313 cache->cov, &(cache->sse), wk);
314 if (rc == GSL_SUCCESS)
316 gsl_multifit_linear_free (wk);
320 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
321 __FILE__, __LINE__, rc);
326 cache->ssm = cache->sst - cache->sse;
328 Get the remaining sums of squares for the independent
332 for (i = 1; i < cache->n_indeps; i++)
335 m += gsl_vector_get (cache->ss_indeps, j);
336 tmp = cache->ssm - m;
337 gsl_vector_set (cache->ss_indeps, i, tmp);
340 gsl_matrix_free (design);