-/* lib/linreg/linreg.c
+/* PSPP - a program for statistical analysis.
+ Copyright (C) 2005 Free Software Foundation, Inc. Written by Jason H. Stover.
- Copyright (C) 2005 Free Software Foundation, Inc.
- Written by Jason H. Stover.
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or (at
- your option) any later version.
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
- This program is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- 02111-1307, USA.
-*/
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
+#include <config.h>
#include <gsl/gsl_fit.h>
#include <gsl/gsl_multifit.h>
Y = Xb + Z
- where Y is an n-by-1 column vector, X is an n-by-p matrix of
+ where Y is an n-by-1 column vector, X is an n-by-p matrix of
independent variables, b is a p-by-1 vector of regression coefficients,
and Z is an n-by-1 normally-distributed random vector with independent
identically distributed components with mean 0.
Springer. 1998. ISBN 0-387-98542-5.
*/
-#include "linreg.h"
-#include "coefficient.h"
-#include "sweep.h"
+#include <math/linreg/linreg.h>
+#include <math/coefficient.h>
#include <gsl/gsl_errno.h>
+#include <linreg/sweep.h>
/*
Get the mean and standard deviation of a vector
of doubles via a form of the Kalman filter as
return GSL_SUCCESS;
}
+/*
+   Set V to contain an array of pointers to the distinct variables
+   used in the model C_, which is used as a pspp_linreg_cache.  V must
+   be at least C->N_COEFFS in length; all of those entries are reset
+   to NULL before any variable is stored.  The intercept, C->COEFF[0],
+   is skipped.  The return value is the number of distinct variables
+   found; they occupy the first entries of V.
+ */
+int
+pspp_linreg_get_vars (const void *c_, const struct variable **v)
+{
+  const pspp_linreg_cache *c = c_;
+  const struct variable *tmp;
+  int i;
+  int j;
+  int result = 0;
+
+  /*
+     Make sure the caller doesn't try to sneak a variable
+     into V that is not in the model.
+   */
+  for (i = 0; i < c->n_coeffs; i++)
+    {
+      v[i] = NULL;
+    }
+  /*
+     Without at least one coefficient besides the intercept there is
+     no c->coeff[1] to read.
+   */
+  if (c->n_coeffs < 2)
+    {
+      return 0;
+    }
+  /*
+     Start at c->coeff[1] to avoid the intercept.
+   */
+  v[result] = pspp_coeff_get_var (c->coeff[1], 0);
+  result = (v[result] == NULL) ? 0 : 1;
+
+  /*
+     c->coeff is an array of pointers, so it has to be walked by
+     index.  Incrementing a pointer taken from c->coeff[2] would step
+     through memory past that one coefficient instead of moving to
+     c->coeff[3], and c->coeff[c->n_coeffs] is not a valid element.
+   */
+  for (j = 2; j < c->n_coeffs; j++)
+    {
+      tmp = pspp_coeff_get_var (c->coeff[j], 0);
+      assert (tmp != NULL);
+      /* Repeated variables are likely to bunch together, at the end
+         of the array, so search the ones already found backwards. */
+      i = result - 1;
+      while (i >= 0 && v[i] != tmp)
+        {
+          i--;
+        }
+      /* i < 0 means TMP is not in V yet. */
+      if (i < 0 && result < c->n_coeffs)
+        {
+          v[result] = tmp;
+          result++;
+        }
+    }
+  return result;
+}
+
+
/*
Allocate a pspp_linreg_cache and return a pointer
- to it. n is the number of cases, p is the number of
+ to it. n is the number of cases, p is the number of
independent variables.
*/
pspp_linreg_cache *
pspp_linreg_cache *c;
c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache));
+ c->depvar = NULL;
c->indep_means = gsl_vector_alloc (p);
c->indep_std = gsl_vector_alloc (p);
- c->ssx = gsl_vector_alloc (p); /* Sums of squares for the independent
- variables.
+ c->ssx = gsl_vector_alloc (p); /* Sums of squares for the
+ independent variables.
*/
- c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the model
- parameters.
+ c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the
+ model parameters.
*/
c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */
c->n_obs = n;
Default settings.
*/
c->method = PSPP_LINREG_SWEEP;
+ c->predict = pspp_linreg_predict;
+ c->residual = pspp_linreg_residual; /* The procedure to compute my
+ residuals. */
+ c->get_vars = pspp_linreg_get_vars; /* The procedure that returns
+ pointers to model
+ variables. */
+ c->resid = NULL; /* The variable storing my residuals. */
+ c->pred = NULL; /* The variable storing my predicted values. */
return c;
}
-void
-pspp_linreg_cache_free (pspp_linreg_cache * c)
+bool
+pspp_linreg_cache_free (void *m)
{
- gsl_vector_free (c->indep_means);
- gsl_vector_free (c->indep_std);
- gsl_vector_free (c->ss_indeps);
- gsl_matrix_free (c->cov);
- pspp_linreg_coeff_free (c->coeff);
- free (c);
+ int i;
+ /*
+ Release the linear regression cache M: its GSL vectors, its
+ covariance matrix, each of its coefficients, and finally the
+ cache itself. M is taken as a void pointer and used as a
+ pspp_linreg_cache. A NULL M is accepted; nothing is freed then.
+ */
+ pspp_linreg_cache *c = m;
+ if (c != NULL)
+ {
+ gsl_vector_free (c->indep_means);
+ gsl_vector_free (c->indep_std);
+ gsl_vector_free (c->ss_indeps);
+ gsl_matrix_free (c->cov);
+ gsl_vector_free (c->ssx);
+ for (i = 0; i < c->n_coeffs; i++)
+ {
+ pspp_coeff_free (c->coeff[i]); /* One call per coefficient. */
+ } /* NOTE(review): the pointer array c->coeff itself is not
+ released here -- confirm who owns it. */
+ free (c);
+ }
+ return true; /* The only return; the result is always true. */
}
/*
Fit the linear model via least squares. All pointers passed to pspp_linreg
are assumed to be allocated to the correct size and initialized to the
- values as indicated by opts.
+ values as indicated by opts.
*/
int
pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
{
int rc;
- gsl_matrix *design;
+ gsl_matrix *design = NULL;
gsl_matrix_view xtx;
gsl_matrix_view xm;
gsl_matrix_view xmxtx;
cache->dft = cache->n_obs - 1;
cache->dfm = cache->n_indeps;
cache->dfe = cache->dft - cache->dfm;
- cache->n_coeffs = X->size2 + 1; /* Adjust this later to allow for regression
- through the origin.
- */
+ cache->n_coeffs = X->size2 + 1; /* Adjust this later to allow for
+ regression through the origin.
+ */
if (cache->method == PSPP_LINREG_SWEEP)
{
gsl_matrix *sw;
for (i = 0; i < cache->n_indeps; i++)
{
tmp = gsl_matrix_get (sw, i, cache->n_indeps);
- cache->coeff[i + 1].estimate = tmp;
+ cache->coeff[i + 1]->estimate = tmp;
m -= tmp * gsl_vector_get (cache->indep_means, i);
}
/*
Get the covariance matrix of the parameter estimates.
- Only the upper triangle is necessary.
+ Only the upper triangle is necessary.
*/
/*
}
gsl_matrix_set (cache->cov, 0, 0, tmp);
- cache->coeff[0].estimate = m;
+ cache->coeff[0]->estimate = m;
}
else
{
}
gsl_matrix_free (sw);
}
+ else if (cache->method == PSPP_LINREG_CONDITIONAL_INVERSE)
+ {
+ /*
+ Not implemented yet: this branch currently contains no code.
+ The plan is to use the SVD of X^T X to find a conditional inverse
+ of X^T X. If the SVD is X^T X = U D V^T, then set the conditional
+ inverse to (X^T X)^c = V D^- U^T. D^- is defined as follows: if
+ entry (i, i) of D has value sigma_i, then entry (i, i) of D^- is
+ 1 / sigma_i if sigma_i > 0, and 0 otherwise. Then solve the
+ normal equations by setting the estimated parameter vector to
+ (X^T X)^c X^T Y.
+ */
+ }
else
{
+ gsl_multifit_linear_workspace *wk;
/*
Use QR decomposition via GSL.
*/
gsl_matrix_set (design, j, i + 1, tmp);
}
}
- gsl_multifit_linear_workspace *wk =
- gsl_multifit_linear_alloc (design->size1, design->size2);
+
+ wk = gsl_multifit_linear_alloc (design->size1, design->size2);
rc = gsl_multifit_linear (design, Y, param_estimates,
cache->cov, &(cache->sse), wk);
for (i = 0; i < cache->n_coeffs; i++)
{
- cache->coeff[i].estimate = gsl_vector_get (param_estimates, i);
+ cache->coeff[i]->estimate = gsl_vector_get (param_estimates, i);
}
if (rc == GSL_SUCCESS)
{