3 Copyright (C) 2005 Free Software Foundation, Inc.
4 Written by Jason H. Stover.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 Find the least-squares estimate of b for the linear model:
27 where Y is an n-by-1 column vector, X is an n-by-p matrix of
28 independent variables, b is a p-by-1 vector of regression coefficients,
29 and Z is an n-by-1 normally-distributed random vector with independent
30 identically distributed components with mean 0.
32 This estimate is found via the sweep operator or singular-value
33 decomposition with gsl.
38 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
39 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
41 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
44 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
45 Springer. 1998. ISBN 0-387-98542-5.
48 #include "pspp_linreg.h"
49 #include <gsl/gsl_errno.h>
51 Get the mean and standard deviation of a vector
52 of doubles via a form of the Kalman filter as
53 described on page 32 of [3].
56 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
65 mean = gsl_vector_get (&v.vector, 0);
67 for (i = 1; i < v.vector.size; i++)
70 tmp = gsl_vector_get (&v.vector, i);
73 variance += j * (j - 1.0) * d * d;
76 *sp = sqrt (variance / (j - 1.0));
83 Allocate a pspp_linreg_cache and return a pointer
84 to it. n is the number of cases, p is the number of
85 independent variables.
88 pspp_linreg_cache_alloc (size_t n, size_t p)
90 pspp_linreg_cache *cache;
92 cache = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
93 cache->param_estimates = gsl_vector_alloc (p + 1);
94 cache->indep_means = gsl_vector_alloc (p);
95 cache->indep_std = gsl_vector_alloc (p);
96 cache->ssx = gsl_vector_alloc (p); /* Sums of squares for the independent
99 cache->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the model
102 cache->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
108 cache->method = PSPP_LINREG_SWEEP;
114 pspp_linreg_cache_free (pspp_linreg_cache * cache)
116 gsl_vector_free (cache->param_estimates);
117 gsl_vector_free (cache->indep_means);
118 gsl_vector_free (cache->indep_std);
119 gsl_vector_free (cache->ss_indeps);
120 gsl_matrix_free (cache->cov);
125 Fit the linear model via least squares. All pointers passed to pspp_linreg
126 are assumed to be allocated to the correct size and initialized to the
127 values as indicated by opts.
130 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
131 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
137 gsl_matrix_view xmxtx;
153 if (opts->get_depvar_mean_std)
155 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
157 cache->depvar_mean = m;
158 cache->depvar_std = s;
161 for (i = 0; i < cache->n_indeps; i++)
163 if (opts->get_indep_mean_std[i])
165 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
166 gsl_vector_set (cache->indep_means, i, m);
167 gsl_vector_set (cache->indep_std, i, s);
168 gsl_vector_set (cache->ssx, i, ss);
171 cache->dft = cache->n_obs - 1;
172 cache->dfm = cache->n_indeps;
173 cache->dfe = cache->dft - cache->dfm;
174 if (cache->method == PSPP_LINREG_SWEEP)
178 Subtract the means to improve the condition of the design
179 matrix. This requires copying X and Y. We do not divide by the
180 standard deviations of the independent variables here since doing
181 so would cause a miscalculation of the residual sums of
182 squares. Dividing by the standard deviation is done GSL's linear
183 regression functions, so if the design matrix has a very poor
184 condition, use QR decomposition.
186 The design matrix here does not include a column for the intercept
187 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
188 so design is allocated here when sweeping, or below if using QR.
190 design = gsl_matrix_alloc (X->size1, X->size2);
191 for (i = 0; i < X->size2; i++)
193 m = gsl_vector_get (cache->indep_means, i);
194 for (j = 0; j < X->size1; j++)
196 tmp = (gsl_matrix_get (X, j, i) - m);
197 gsl_matrix_set (design, j, i, tmp);
200 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
201 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
203 for (i = 0; i < xtx.matrix.size1; i++)
205 tmp = gsl_vector_get (cache->ssx, i);
206 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
207 xi = gsl_matrix_column (design, i);
208 for (j = (i + 1); j < xtx.matrix.size2; j++)
210 xj = gsl_matrix_column (design, j);
211 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
212 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
216 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
217 xty = gsl_matrix_column (sw, cache->n_indeps);
219 This loop starts at 1, with i=0 outside the loop, so we can get
220 the model sum of squares due to the first independent variable.
222 xi = gsl_matrix_column (design, 0);
223 gsl_blas_ddot (&(xi.vector), Y, &tmp);
224 gsl_vector_set (&(xty.vector), 0, tmp);
225 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
226 gsl_vector_set (cache->ss_indeps, 0, tmp);
227 for (i = 1; i < cache->n_indeps; i++)
229 xi = gsl_matrix_column (design, i);
230 gsl_blas_ddot (&(xi.vector), Y, &tmp);
231 gsl_vector_set (&(xty.vector), i, tmp);
235 Sweep on the matrix sw, which contains XtX, XtY and YtY.
238 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
239 cache->mse = cache->sse / cache->dfe;
243 m = cache->depvar_mean;
244 for (i = 0; i < cache->n_indeps; i++)
246 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
247 gsl_vector_set (cache->param_estimates, i + 1, tmp);
248 m -= tmp * gsl_vector_get (cache->indep_means, i);
251 Get the covariance matrix of the parameter estimates.
252 Only the upper triangle is necessary.
256 The loops below do not compute the entries related
257 to the estimated intercept.
259 for (i = 0; i < cache->n_indeps; i++)
260 for (j = i; j < cache->n_indeps; j++)
262 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
263 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
266 Get the covariances related to the intercept.
268 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
269 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
270 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
271 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
272 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
273 if (rc == GSL_SUCCESS)
275 tmp = cache->mse / cache->n_obs;
276 for (i = 1; i < 1 + cache->n_indeps; i++)
278 tmp -= gsl_matrix_get (cache->cov, 0, i)
279 * gsl_vector_get (cache->indep_means, i - 1);
281 gsl_matrix_set (cache->cov, 0, 0, tmp);
283 gsl_vector_set (cache->param_estimates, 0, m);
287 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
288 __FILE__, __LINE__, gsl_strerror (rc));
291 gsl_matrix_free (sw);
296 Use QR decomposition via GSL.
298 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
300 for (j = 0; j < X->size1; j++)
302 gsl_matrix_set (design, j, 0, 1.0);
303 for (i = 0; i < X->size2; i++)
305 tmp = gsl_matrix_get (X, j, i);
306 gsl_matrix_set (design, j, i + 1, tmp);
309 gsl_multifit_linear_workspace *wk =
310 gsl_multifit_linear_alloc (design->size1, design->size2);
311 rc = gsl_multifit_linear (design, Y, cache->param_estimates,
312 cache->cov, &(cache->sse), wk);
313 if (rc == GSL_SUCCESS)
315 gsl_multifit_linear_free (wk);
319 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
320 __FILE__, __LINE__, rc);
325 cache->ssm = cache->sst - cache->sse;
327 Get the remaining sums of squares for the independent
331 for (i = 1; i < cache->n_indeps; i++)
334 m += gsl_vector_get (cache->ss_indeps, j);
335 tmp = cache->ssm - m;
336 gsl_vector_set (cache->ss_indeps, i, tmp);
339 gsl_matrix_free (design);