3 Copyright (C) 2005 Free Software Foundation, Inc.
4 Written by Jason H. Stover.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 Find the least-squares estimate of b for the linear model:
27 where Y is an n-by-1 column vector, X is an n-by-p matrix of
28 independent variables, b is a p-by-1 vector of regression coefficients,
29 and Z is an n-by-1 normally-distributed random vector with independent
30 identically distributed components with mean 0.
32 This estimate is found via the sweep operator or singular-value
33 decomposition with gsl.
38 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
39 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
41 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
44 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
45 Springer. 1998. ISBN 0-387-98542-5.
48 #include "pspp_linreg.h"
49 #include <gsl/gsl_errno.h>
51 Get the mean and standard deviation of a vector
52 of doubles via a form of the Kalman filter as
53 described on page 32 of [3].
56 linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp)
65 mean = gsl_vector_get (&v.vector, 0);
67 for (i = 1; i < v.vector.size; i++)
70 tmp = gsl_vector_get (&v.vector, i);
73 variance += j * (j - 1.0) * d * d;
76 *sp = sqrt (variance / (j - 1.0));
83 Allocate a pspp_linreg_cache and return a pointer
84 to it. n is the number of cases, p is the number of
85 independent variables.
88 pspp_linreg_cache_alloc (size_t n, size_t p)
92 c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
93 c->param_estimates = gsl_vector_alloc (p + 1);
94 c->indep_means = gsl_vector_alloc (p);
95 c->indep_std = gsl_vector_alloc (p);
96 c->ssx = gsl_vector_alloc (p); /* Sums of squares for the independent
99 c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the model
102 c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
108 c->method = PSPP_LINREG_SWEEP;
114 pspp_linreg_cache_free (pspp_linreg_cache * c)
116 gsl_vector_free (c->param_estimates);
117 gsl_vector_free (c->indep_means);
118 gsl_vector_free (c->indep_std);
119 gsl_vector_free (c->ss_indeps);
120 gsl_matrix_free (c->cov);
126 Fit the linear model via least squares. All pointers passed to pspp_linreg
127 are assumed to be allocated to the correct size and initialized to the
128 values as indicated by opts.
131 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
132 const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
138 gsl_matrix_view xmxtx;
142 gsl_vector *param_estimates;
155 if (opts->get_depvar_mean_std)
157 linreg_mean_std (gsl_vector_const_subvector (Y, 0, Y->size),
159 cache->depvar_mean = m;
160 cache->depvar_std = s;
163 for (i = 0; i < cache->n_indeps; i++)
165 if (opts->get_indep_mean_std[i])
167 linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
168 gsl_vector_set (cache->indep_means, i, m);
169 gsl_vector_set (cache->indep_std, i, s);
170 gsl_vector_set (cache->ssx, i, ss);
173 cache->dft = cache->n_obs - 1;
174 cache->dfm = cache->n_indeps;
175 cache->dfe = cache->dft - cache->dfm;
176 cache->n_coeffs = X->size2 + 1; /* Adjust this later to allow for regression
179 if (cache->method == PSPP_LINREG_SWEEP)
183 Subtract the means to improve the condition of the design
184 matrix. This requires copying X and Y. We do not divide by the
185 standard deviations of the independent variables here since doing
186 so would cause a miscalculation of the residual sums of
187 squares. Dividing by the standard deviation is done GSL's linear
188 regression functions, so if the design matrix has a poor
189 condition, use QR decomposition.
191 The design matrix here does not include a column for the intercept
192 (i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
193 so design is allocated here when sweeping, or below if using QR.
195 design = gsl_matrix_alloc (X->size1, X->size2);
196 for (i = 0; i < X->size2; i++)
198 m = gsl_vector_get (cache->indep_means, i);
199 for (j = 0; j < X->size1; j++)
201 tmp = (gsl_matrix_get (X, j, i) - m);
202 gsl_matrix_set (design, j, i, tmp);
205 sw = gsl_matrix_calloc (cache->n_indeps + 1, cache->n_indeps + 1);
206 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
208 for (i = 0; i < xtx.matrix.size1; i++)
210 tmp = gsl_vector_get (cache->ssx, i);
211 gsl_matrix_set (&(xtx.matrix), i, i, tmp);
212 xi = gsl_matrix_column (design, i);
213 for (j = (i + 1); j < xtx.matrix.size2; j++)
215 xj = gsl_matrix_column (design, j);
216 gsl_blas_ddot (&(xi.vector), &(xj.vector), &tmp);
217 gsl_matrix_set (&(xtx.matrix), i, j, tmp);
221 gsl_matrix_set (sw, cache->n_indeps, cache->n_indeps, cache->sst);
222 xty = gsl_matrix_column (sw, cache->n_indeps);
224 This loop starts at 1, with i=0 outside the loop, so we can get
225 the model sum of squares due to the first independent variable.
227 xi = gsl_matrix_column (design, 0);
228 gsl_blas_ddot (&(xi.vector), Y, &tmp);
229 gsl_vector_set (&(xty.vector), 0, tmp);
230 tmp *= tmp / gsl_vector_get (cache->ssx, 0);
231 gsl_vector_set (cache->ss_indeps, 0, tmp);
232 for (i = 1; i < cache->n_indeps; i++)
234 xi = gsl_matrix_column (design, i);
235 gsl_blas_ddot (&(xi.vector), Y, &tmp);
236 gsl_vector_set (&(xty.vector), i, tmp);
240 Sweep on the matrix sw, which contains XtX, XtY and YtY.
243 cache->sse = gsl_matrix_get (sw, cache->n_indeps, cache->n_indeps);
244 cache->mse = cache->sse / cache->dfe;
248 m = cache->depvar_mean;
249 for (i = 0; i < cache->n_indeps; i++)
251 tmp = gsl_matrix_get (sw, i, cache->n_indeps);
252 cache->coeff[i + 1].estimate = tmp;
253 gsl_vector_set (cache->param_estimates, i + 1, tmp);
254 m -= tmp * gsl_vector_get (cache->indep_means, i);
257 Get the covariance matrix of the parameter estimates.
258 Only the upper triangle is necessary.
262 The loops below do not compute the entries related
263 to the estimated intercept.
265 for (i = 0; i < cache->n_indeps; i++)
266 for (j = i; j < cache->n_indeps; j++)
268 tmp = -1.0 * cache->mse * gsl_matrix_get (sw, i, j);
269 gsl_matrix_set (cache->cov, i + 1, j + 1, tmp);
272 Get the covariances related to the intercept.
274 xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
275 xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
276 xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
277 rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
278 &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
279 if (rc == GSL_SUCCESS)
281 tmp = cache->mse / cache->n_obs;
282 for (i = 1; i < 1 + cache->n_indeps; i++)
284 tmp -= gsl_matrix_get (cache->cov, 0, i)
285 * gsl_vector_get (cache->indep_means, i - 1);
287 gsl_matrix_set (cache->cov, 0, 0, tmp);
289 gsl_vector_set (cache->param_estimates, 0, m);
290 cache->coeff[0].estimate = m;
294 fprintf (stderr, "%s:%d:gsl_blas_dsymm: %s\n",
295 __FILE__, __LINE__, gsl_strerror (rc));
298 gsl_matrix_free (sw);
303 Use QR decomposition via GSL.
306 param_estimates = gsl_vector_alloc (1 + X->size2);
307 design = gsl_matrix_alloc (X->size1, 1 + X->size2);
309 for (j = 0; j < X->size1; j++)
311 gsl_matrix_set (design, j, 0, 1.0);
312 for (i = 0; i < X->size2; i++)
314 tmp = gsl_matrix_get (X, j, i);
315 gsl_matrix_set (design, j, i + 1, tmp);
318 gsl_multifit_linear_workspace *wk =
319 gsl_multifit_linear_alloc (design->size1, design->size2);
320 rc = gsl_multifit_linear (design, Y, param_estimates,
321 cache->cov, &(cache->sse), wk);
322 for (i = 0; i < cache->n_coeffs; i++)
324 cache->coeff[i].estimate = gsl_vector_get (param_estimates, i);
326 if (rc == GSL_SUCCESS)
328 gsl_multifit_linear_free (wk);
329 gsl_vector_free (param_estimates);
333 fprintf (stderr, "%s:%d: gsl_multifit_linear returned %d\n",
334 __FILE__, __LINE__, rc);
339 cache->ssm = cache->sst - cache->sse;
341 Get the remaining sums of squares for the independent
345 for (i = 1; i < cache->n_indeps; i++)
348 m += gsl_vector_get (cache->ss_indeps, j);
349 tmp = cache->ssm - m;
350 gsl_vector_set (cache->ss_indeps, i, tmp);
353 gsl_matrix_free (design);