3 Copyright (C) 2005 Free Software Foundation, Inc.
4 Written by Jason H. Stover.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 Find the least-squares estimate of b for the linear model:
27 where Y is an n-by-1 column vector, X is an n-by-p matrix of
28 independent variables, b is a p-by-1 vector of regression coefficients,
29 and Z is an n-by-1 normally-distributed random vector with independent
30 identically distributed components with mean 0.
32 This estimate is found via the sweep operator or singular-value
33 decomposition with gsl.
38 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
39 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
41 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
44 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
45 Springer. 1998. ISBN 0-387-98542-5.
48 #include "pspp_linreg.h"
49 #include <gsl/gsl_errno.h>
51 Get the mean and standard deviation of a vector
52 of doubles via a form of the Kalman filter as
53 described on page 32 of [3].
56 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
65 mean = gsl_vector_get (&v.vector, 0);
67 for (i = 1; i < v.vector.size; i++)
70 tmp = gsl_vector_get (&v.vector, i);
73 variance += j * (j - 1.0) * d * d;
76 *sp = sqrt (variance / (j - 1.0));
83 Allocate a pspp_linreg_cache and return a pointer
84 to it. n is the number of cases, p is the number of
85 independent variables.
88 pspp_linreg_cache_alloc (size_t n, size_t p)
92 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
93 c->indep_means = gsl_vector_alloc (p);
94 c->indep_std = gsl_vector_alloc (p);
95 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the independent
98 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the model
101 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
107 c->method = PSPP_LINREG_SWEEP;
113 pspp_linreg_cache_free (pspp_linreg_cache * c)
115 gsl_vector_free (c->indep_means);
116 gsl_vector_free (c->indep_std);
117 gsl_vector_free (c->ss_indeps);
118 gsl_matrix_free (c->cov);
124 Fit the linear model via least squares. All pointers passed to pspp_linreg
125 are assumed to be allocated to the correct size and initialized to the
126 values as indicated by opts.
129 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
130 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
136 gsl_matrix_view xmxtx;
140 gsl_vector *param_estimates;
153 if (opts->get_depvar_mean_std)
155 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
157 cache->depvar_mean = m;
158 cache->depvar_std = s;
161 for (i = 0; i < cache->n_indeps; i++)
163 if (opts->get_indep_mean_std[i])
165 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
166 gsl_vector_set (cache->indep_means, i, m);
167 gsl_vector_set (cache->indep_std, i, s);
168 gsl_vector_set (cache->ssx, i, ss);
171 cache->dft = cache->n_obs - 1;
172 cache->dfm = cache->n_indeps;
173 cache->dfe = cache->dft - cache->dfm;
174 cache->n_coeffs = X->size2 + 1; /* Adjust this later to allow for regression
177 if (cache->method == PSPP_LINREG_SWEEP)
181 Subtract the means to improve the condition of the design
182 matrix. This requires copying X and Y. We do not divide by the
183 standard deviations of the independent variables here since doing
184 so would cause a miscalculation of the residual sums of
185 squares. Dividing by the standard deviation is done GSL's linear
186 regression functions, so if the design matrix has a poor
187 condition, use QR decomposition.
189 The design matrix here does not include a column for the intercept
190 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
191 so design is allocated here when sweeping, or below if using QR.
193 design = gsl_matrix_alloc (X->size1, X->size2);
194 for (i = 0; i < X->size2; i++)
196 m = gsl_vector_get (cache->indep_means, i);
197 for (j = 0; j < X->size1; j++)
199 tmp = (gsl_matrix_get (X, j, i) - m);
200 gsl_matrix_set (design, j, i, tmp);
203 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
204 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
206 for (i = 0; i < xtx.matrix.size1; i++)
208 tmp = gsl_vector_get (cache->ssx, i);
209 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
210 xi = gsl_matrix_column (design, i);
211 for (j = (i + 1); j < xtx.matrix.size2; j++)
213 xj = gsl_matrix_column (design, j);
214 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
215 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
219 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
220 xty = gsl_matrix_column (sw, cache->n_indeps);
222 This loop starts at 1, with i=0 outside the loop, so we can get
223 the model sum of squares due to the first independent variable.
225 xi = gsl_matrix_column (design, 0);
226 gsl_blas_ddot (&(xi.vector), Y, &tmp);
227 gsl_vector_set (&(xty.vector), 0, tmp);
228 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
229 gsl_vector_set (cache->ss_indeps, 0, tmp);
230 for (i = 1; i < cache->n_indeps; i++)
232 xi = gsl_matrix_column (design, i);
233 gsl_blas_ddot (&(xi.vector), Y, &tmp);
234 gsl_vector_set (&(xty.vector), i, tmp);
238 Sweep on the matrix sw, which contains XtX, XtY and YtY.
241 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
242 cache->mse = cache->sse / cache->dfe;
246 m = cache->depvar_mean;
247 for (i = 0; i < cache->n_indeps; i++)
249 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
250 cache->coeff[i + 1].estimate = tmp;
251 m -= tmp * gsl_vector_get (cache->indep_means, i);
254 Get the covariance matrix of the parameter estimates.
255 Only the upper triangle is necessary.
259 The loops below do not compute the entries related
260 to the estimated intercept.
262 for (i = 0; i < cache->n_indeps; i++)
263 for (j = i; j < cache->n_indeps; j++)
265 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
266 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
269 Get the covariances related to the intercept.
271 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
272 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
273 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
274 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
275 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
276 if (rc == GSL_SUCCESS)
278 tmp = cache->mse / cache->n_obs;
279 for (i = 1; i < 1 + cache->n_indeps; i++)
281 tmp -= gsl_matrix_get (cache->cov, 0, i)
282 * gsl_vector_get (cache->indep_means, i - 1);
284 gsl_matrix_set (cache->cov, 0, 0, tmp);
286 cache->coeff[0].estimate = m;
290 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
291 __FILE__, __LINE__, gsl_strerror (rc));
294 gsl_matrix_free (sw);
299 Use QR decomposition via GSL.
302 param_estimates = gsl_vector_alloc (1 + X->size2);
303 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
305 for (j = 0; j < X->size1; j++)
307 gsl_matrix_set (design, j, 0, 1.0);
308 for (i = 0; i < X->size2; i++)
310 tmp = gsl_matrix_get (X, j, i);
311 gsl_matrix_set (design, j, i + 1, tmp);
314 gsl_multifit_linear_workspace *wk =
315 gsl_multifit_linear_alloc (design->size1, design->size2);
316 rc = gsl_multifit_linear (design, Y, param_estimates,
317 cache->cov, &(cache->sse), wk);
318 for (i = 0; i < cache->n_coeffs; i++)
320 cache->coeff[i].estimate = gsl_vector_get (param_estimates, i);
322 if (rc == GSL_SUCCESS)
324 gsl_multifit_linear_free (wk);
325 gsl_vector_free (param_estimates);
329 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
330 __FILE__, __LINE__, rc);
335 cache->ssm = cache->sst - cache->sse;
337 Get the remaining sums of squares for the independent
341 for (i = 1; i < cache->n_indeps; i++)
344 m += gsl_vector_get (cache->ss_indeps, j);
345 tmp = cache->ssm - m;
346 gsl_vector_set (cache->ss_indeps, i, tmp);
349 gsl_matrix_free (design);