1 /* lib/linreg/pspp_linreg.h
3 Copyright (C) 2005 Free Software Foundation, Inc.
4 Written by Jason H Stover.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 Find the least-squares estimate of b for the linear model:
27 where Y is an n-by-1 column vector, X is an n-by-p matrix of
28 independent variables, b is a p-by-1 vector of regression coefficients,
29 and Z is an n-by-1 normally-distributed random vector with independent
30 identically distributed components with mean 0.
32 This estimate is found via the sweep operator or singular-value
38 Matrix Computations, third edition. GH Golub and CF Van Loan.
39 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
41 Numerical Analysis for Statisticians. K Lange. Springer. 1999.
44 Numerical Linear Algebra for Applications in Statistics. JE Gentle.
45 Springer. 1998. ISBN 0-387-98542-5.
48 #define PSPP_LINREG_H 1
49 #include <gsl/gsl_vector.h>
50 #include <gsl/gsl_matrix.h>
51 #include <gsl/gsl_math.h>
52 #include <gsl/gsl_errno.h>
53 #include <gsl/gsl_fit.h>
54 #include <gsl/gsl_multifit.h>
55 #include <gsl/gsl_blas.h>
56 #include <gsl/gsl_cblas.h>
57 #include <src/design-matrix.h>
59 #define PSPP_LINREG_VAL_NOT_FOUND -1
67 Cache for the relevant data from the model. There are several
68 members which the caller might not use, and which could use a lot of
69 storage. Therefore non-essential members of the struct will be
70 allocated only when requested.
72 struct pspp_linreg_coeff
74 double estimate; /* Estimated coefficient. */
75 double std_err; /* Standard error of the estimate. */
76 struct varinfo *v_info; /* Information pertaining to the
77 variable(s) associated with this
78 coefficient. The calling function
79 should initialize this value with the
80 functions in coefficient.c. The
81 estimation procedure ignores this
82 member. It is here so the caller can
83 match parameters with relevant variables
84 and values. If the coefficient is
85 associated with an interaction, then
86 v_info contains information for multiple
89 int n_vars; /* Number of variables associated with this coefficient.
90 Coefficients corresponding to interaction terms will
91 have more than one variable.
94 struct pspp_linreg_cache_struct
96 int n_obs; /* Number of observations. */
97 int n_indeps; /* Number of independent variables. */
101 The variable struct is ignored during estimation.
102 It is here so the calling procedure can
103 find the variable used in the model.
105 const struct variable *depvar;
107 gsl_vector *residuals;
108 struct pspp_linreg_coeff *coeff;
109 int method; /* Method to use to estimate parameters. */
111 Means and standard deviations of the variables.
112 If these pointers are null when pspp_linreg() is
113 called, pspp_linreg() will compute their values.
115 Entry i of indep_means is the mean of independent
116 variable i, whose observations are stored in the ith
117 column of the design matrix.
121 gsl_vector *indep_means;
122 gsl_vector *indep_std;
127 double ssm; /* Sums of squares for the overall model. */
128 gsl_vector *ss_indeps; /* Sums of squares from each
129 independent variable.
131 double sst; /* Sum of squares total. */
132 double sse; /* Sum of squares error. */
133 double mse; /* Mean squared error. This is just sse / dfe, but
134 since it is the best unbiased estimate of the population
135 variance, it has its own entry here.
137 gsl_vector *ssx; /* Centered sums of squares for independent variables,
138 i.e. \sum (x[i] - mean(x))^2.
140 double ssy; /* Centered sums of squares for dependent variable. */
142 Covariance matrix of the parameter estimates.
153 'Hat' or Hessian matrix, i.e. (X'X)^{-1}, where X is our
158 typedef struct pspp_linreg_cache_struct pspp_linreg_cache;
161 Options describing what special values should be computed.
163 struct pspp_linreg_opts_struct
165 int resid; /* Should the residuals be returned? */
167 int get_depvar_mean_std;
168 int *get_indep_mean_std; /* Array of booleans dictating which
169 independent variables need their means
170 and standard deviations computed within
171 pspp_linreg. This array MUST be of
172 length n_indeps. If element i is 1,
173 pspp_linreg will compute the mean and
174 variance of indpendent variable i. If
175 element i is 0, it will not compute the
176 mean and standard deviation, and assume
177 the values are stored.
178 cache->indep_mean[i] is the mean and
179 cache->indep_std[i] is the sample
183 typedef struct pspp_linreg_opts_struct pspp_linreg_opts;
185 int pspp_reg_sweep (gsl_matrix * A);
187 pspp_linreg_cache *pspp_linreg_cache_alloc (size_t n, size_t p);
189 void pspp_linreg_cache_free (pspp_linreg_cache * cache);
191 int pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
192 const pspp_linreg_opts * opts, pspp_linreg_cache * cache);
193 void pspp_linreg_coeff_init (pspp_linreg_cache *, struct design_matrix *);
195 void pspp_linreg_coeff_free (struct pspp_linreg_coeff *);
197 void pspp_linreg_coeff_set_estimate (struct pspp_linreg_coeff *, double);
199 void pspp_linreg_coeff_set_std_err (struct pspp_linreg_coeff *, double);
201 int pspp_linreg_coeff_get_n_vars (struct pspp_linreg_coeff *);
203 const struct variable *pspp_linreg_coeff_get_var (struct pspp_linreg_coeff *, int);
205 const union value *pspp_linreg_coeff_get_value (struct pspp_linreg_coeff *, const struct variable *);