4 Copyright (C) 2005 Free Software Foundation, Inc. Written by Jason H. Stover.
6 This program is free software; you can redistribute it and/or modify it under
7 the terms of the GNU General Public License as published by the Free
8 Software Foundation; either version 2 of the License, or (at your option)
11 This program is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
16 You should have received a copy of the GNU General Public License along with
17 this program; if not, write to the Free Software Foundation, Inc., 51
18 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
24 #include <gsl/gsl_math.h>
25 #include <gsl/gsl_vector.h>
26 #include <gsl/gsl_matrix.h>
41 Options describing what special values should be computed.
43 struct pspp_linreg_opts_struct
45 int get_depvar_mean_std; /* NOTE(review): presumably a boolean like the entries of get_indep_mean_std, but for the dependent variable -- confirm against pspp_linreg(). */
46 int *get_indep_mean_std; /* Array of booleans
48 independent variables need
49 their means and standard
50 deviations computed within
51 pspp_linreg. This array
53 n_indeps. If element i is
56 variance of independent
57 variable i. If element i
58 is 0, it will not compute
60 deviation, and assume the
62 cache->indep_means[i] is
64 cache->indep_std[i] is the
65 sample standard deviation. */
67 typedef struct pspp_linreg_opts_struct pspp_linreg_opts;
71 Find the least-squares estimate of b for the linear model:
75 where Y is an n-by-1 column vector, X is an n-by-p matrix of
76 independent variables, b is a p-by-1 vector of regression coefficients,
77 and Z is an n-by-1 normally-distributed random vector with independent
78 identically distributed components with mean 0.
80 This estimate is found via the sweep operator or singular-value
81 decomposition with gsl.
86 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
87 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
89 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
92 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
93 Springer. 1998. ISBN 0-387-98542-5.
97 struct pspp_linreg_cache_struct
99 int n_obs; /* Number of observations. */
100 int n_indeps; /* Number of independent variables. */
104 The variable struct is ignored during estimation. It is here so
105 the calling procedure can find the variable used in the model.
107 const struct variable *depvar;
109 gsl_vector *residuals; /* NOTE(review): length is not shown here; presumably one entry per observation (n_obs) -- confirm. */
110 struct pspp_coeff **coeff; /* NOTE(review): presumably one pointer per estimated coefficient; element count and ownership are not shown here -- confirm. */
111 int method; /* Method to use to estimate parameters. */
113 Means and standard deviations of the variables.
114 If these pointers are null when pspp_linreg() is
115 called, pspp_linreg() will compute their values.
117 Entry i of indep_means is the mean of independent
118 variable i, whose observations are stored in the ith
119 column of the design matrix.
123 gsl_vector *indep_means;
124 gsl_vector *indep_std;
129 double ssm; /* Sums of squares for the overall model. */
130 gsl_vector *ss_indeps; /* Sums of squares from each
131 independent variable. */
132 double sst; /* Sum of squares total. */
133 double sse; /* Sum of squares error. */
134 double mse; /* Mean squared error. This is just sse /
135 dfe, but since it is the best unbiased
136 estimate of the population variance, it
137 has its own entry here. */
138 gsl_vector *ssx; /* Centered sums of squares for independent
139 variables, i.e. \sum (x[i] - mean(x))^2. */
140 double ssy; /* Centered sums of squares for dependent
144 Covariance matrix of the parameter estimates.
155 'Hat' or Hessian matrix, i.e. (X'X)^{-1}, where X is our
160 double (*predict) (const struct variable **, const union value **,
162 double (*residual) (const struct variable **,
163 const union value **,
164 const union value *, const void *, int);
166 Returns pointers to the variables used in the model.
168 int (*get_vars) (const void *, const struct variable **);
169 struct variable *resid;
170 struct variable *pred;
174 typedef struct pspp_linreg_cache_struct pspp_linreg_cache;
179 Allocate a pspp_linreg_cache and return a pointer
180 to it. n is the number of cases, p is the number of
181 independent variables.
183 pspp_linreg_cache *pspp_linreg_cache_alloc (size_t n, size_t p); /* n and p are size_t here, while the n_obs and n_indeps members of pspp_linreg_cache_struct are declared int. */
185 bool pspp_linreg_cache_free (void *); /* NOTE(review): presumably releases a cache obtained from pspp_linreg_cache_alloc; the meaning of the bool result is not shown here -- confirm. */
188 Fit the linear model via least squares. All pointers passed to pspp_linreg
189 are assumed to be allocated to the correct size and initialized to the
190 values as indicated by opts.
193 pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
194 const pspp_linreg_opts * opts, pspp_linreg_cache * cache); /* Y, X and opts are const-qualified; cache is the only argument passed as a non-const pointer. NOTE(review): the return type is not visible in this listing. */
197 pspp_linreg_predict (const struct variable **, const union value **, /* NOTE(review): the remaining parameters are not visible in this listing; the leading ones match the predict member of pspp_linreg_cache_struct. */
200 pspp_linreg_residual (const struct variable **, const union value **,
201 const union value *, const void *, int); /* Parameter list matches the residual member of pspp_linreg_cache_struct. */
203 All variables used in the model.
205 int pspp_linreg_get_vars (const void *, const struct variable **); /* Return type and parameter list match the get_vars member of pspp_linreg_cache_struct. */