1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2005 Free Software Foundation, Inc. Written by Jason H. Stover.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20 #include <gsl/gsl_math.h>
21 #include <gsl/gsl_vector.h>
22 #include <gsl/gsl_matrix.h>
23 #include <src/math/coefficient.h>
24 #include <math/covariance-matrix.h>
28 PSPP_LINREG_CONDITIONAL_INVERSE,
36 Options describing what special values should be computed.
38 struct pspp_linreg_opts_struct
40 int get_depvar_mean_std;
41 int *get_indep_mean_std; /* Array of booleans
43 independent variables need
44 their means and standard
45 deviations computed within
46 pspp_linreg. This array
48 n_indeps. If element i is
51 variance of independent
52 variable i. If element i
53 is 0, it will not compute
55 deviation, and assume the
57 cache->indep_mean[i] is
59 cache->indep_std[i] is the
60 sample standard deviation. */
/* Shorthand so that callers can write pspp_linreg_opts instead of
   struct pspp_linreg_opts_struct. */
62 typedef struct pspp_linreg_opts_struct pspp_linreg_opts;
66 Find the least-squares estimate of b for the linear model:
70 where Y is an n-by-1 column vector, X is an n-by-p matrix of
71 independent variables, b is a p-by-1 vector of regression coefficients,
72 and Z is an n-by-1 normally-distributed random vector with independent
73 identically distributed components with mean 0.
75 This estimate is found via the sweep operator or singular-value
76 decomposition with gsl.
81 1. Matrix Computations, third edition. GH Golub and CF Van Loan.
82 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
84 2. Numerical Analysis for Statisticians. K Lange. Springer. 1999.
87 3. Numerical Linear Algebra for Applications in Statistics. JE Gentle.
88 Springer. 1998. ISBN 0-387-98542-5.
92 struct pspp_linreg_cache_struct
94 int n_obs; /* Number of observations. */
95 int n_indeps; /* Number of independent variables. */
96 int n_coeffs; /* The intercept is not considered a
100 Pointers to the variables.
102 const struct variable *depvar;
103 const struct variable **indep_vars;
105 gsl_vector *residuals;
106 struct pspp_coeff **coeff;
108 int method; /* Method to use to estimate parameters. */
110 Means and standard deviations of the variables.
111 If these pointers are null when pspp_linreg() is
112 called, pspp_linreg() will compute their values.
114 Entry i of indep_means is the mean of independent
115 variable i, whose observations are stored in the ith
116 column of the design matrix.
120 gsl_vector *indep_means;
121 gsl_vector *indep_std;
126 double ssm; /* Sums of squares for the overall model. */
127 gsl_vector *ss_indeps; /* Sums of squares from each
128 independent variable. */
129 double sst; /* Sum of squares total. */
130 double sse; /* Sum of squares error. */
131 double mse; /* Mean squared error. This is just sse /
132 dfe, but since it is the best unbiased
133 estimate of the population variance, it
134 has its own entry here. */
135 gsl_vector *ssx; /* Centered sums of squares for independent
136 variables, i.e. \sum (x[i] - mean(x))^2. */
137 double ssy; /* Centered sums of squares for dependent
141 Covariance matrix of the parameter estimates.
152 'Hat' or Hessian matrix, i.e. (X'X)^{-1}, where X is our
157 double (*predict) (const struct variable **, const union value **,
159 double (*residual) (const struct variable **,
160 const union value **,
161 const union value *, const void *, int);
163 Returns pointers to the variables used in the model.
165 int (*get_vars) (const void *, const struct variable **);
166 struct variable *resid;
167 struct variable *pred;
/* Shorthand so that callers can write pspp_linreg_cache instead of
   struct pspp_linreg_cache_struct. */
171 typedef struct pspp_linreg_cache_struct pspp_linreg_cache;
176 Allocate a pspp_linreg_cache and return a pointer
177 to it. n is the number of cases, p is the number of
178 independent variables.
180 pspp_linreg_cache *pspp_linreg_cache_alloc (const struct variable *, const struct variable **,
/* Releases a regression cache.  The argument is an untyped pointer;
   presumably it must be a pspp_linreg_cache obtained from
   pspp_linreg_cache_alloc -- TODO confirm against the implementation.
   NOTE(review): the meaning of the bool result is not visible in this
   header; verify before relying on it. */
183 bool pspp_linreg_cache_free (void *);
186 Fit the linear model via least squares. All pointers passed to pspp_linreg
187 are assumed to be allocated to the correct size and initialized to the
188 values as indicated by opts.
191 pspp_linreg (const gsl_vector *, const struct design_matrix *,
192 const pspp_linreg_opts *, pspp_linreg_cache *);
195 pspp_linreg_predict (const struct variable **, const union value **,
198 pspp_linreg_residual (const struct variable **, const union value **,
199 const union value *, const void *, int);
201 All variables used in the model.
/* Reports the variables used in the model through the second argument.
   The signature matches the get_vars member of
   struct pspp_linreg_cache_struct.
   NOTE(review): the const void * is presumably the pspp_linreg_cache and
   the int result presumably the number of variables reported -- confirm
   against the implementation. */
203 int pspp_linreg_get_vars (const void *, const struct variable **);
205 struct pspp_coeff *pspp_linreg_get_coeff (const pspp_linreg_cache
207 const struct variable
209 const union value *);
211 Return or set the standard deviation of the independent variable.
/* Accessors for the standard deviation of one independent variable.
   Arguments: the cache, the independent variable of interest and, for the
   setter, the new standard deviation.  The getter returns the value as a
   double.
   NOTE(review): behavior when the variable is not one of the model's
   independent variables is not visible in this header. */
213 double pspp_linreg_get_indep_variable_sd (pspp_linreg_cache *, const struct variable *);
214 void pspp_linreg_set_indep_variable_sd (pspp_linreg_cache *, const struct variable *, double);
216 Mean of the independent variable.
/* Accessors for the mean of one independent variable.
   Arguments: the cache, the independent variable of interest and, for the
   setter, the new mean.  The getter returns the value as a double.
   NOTE(review): behavior when the variable is not one of the model's
   independent variables is not visible in this header. */
218 double pspp_linreg_get_indep_variable_mean (pspp_linreg_cache *, const struct variable *);
219 void pspp_linreg_set_indep_variable_mean (pspp_linreg_cache *, const struct variable *, double);
222 Regression using only the covariance matrix.
/* pspp_linreg_with_cov: regression driven by a covariance matrix rather
   than by a design matrix and dependent-variable vector.  It returns
   nothing; presumably the results are written into the cache argument
   (the only non-const parameter) -- TODO confirm.

   pspp_linreg_mse: takes a read-only cache and returns a double;
   presumably the mean squared error described for the cache's mse member
   (sse / dfe) -- verify whether it reads the stored value or recomputes
   it. */
224 void pspp_linreg_with_cov (const struct covariance_matrix *, pspp_linreg_cache *);
225 double pspp_linreg_mse (const pspp_linreg_cache *);