1 /* lib/linreg/pspp_linreg.h
3 Copyright (C) 2005 Free Software Foundation, Inc.
4 Written by Jason H Stover.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2 of the License, or (at
9 your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 Find the least-squares estimate of b for the linear model: Y = Xb + Z,
27 where Y is an n-by-1 column vector, X is an n-by-p matrix of
28 independent variables, b is a p-by-1 vector of regression coefficients,
29 and Z is an n-by-1 normally-distributed random vector with independent
30 identically distributed components with mean 0.
32 This estimate is found via the sweep operator or singular-value decomposition.
38 Matrix Computations, third edition. GH Golub and CF Van Loan.
39 The Johns Hopkins University Press. 1996. ISBN 0-8018-5414-8.
41 Numerical Analysis for Statisticians. K Lange. Springer. 1999.
44 Numerical Linear Algebra for Applications in Statistics. JE Gentle.
45 Springer. 1998. ISBN 0-387-98542-5.
48 #define PSPP_LINREG_H 1
49 #include <gsl/gsl_vector.h>
50 #include <gsl/gsl_matrix.h>
51 #include <gsl/gsl_math.h>
52 #include <gsl/gsl_errno.h>
53 #include <gsl/gsl_fit.h>
54 #include <gsl/gsl_multifit.h>
55 #include <gsl/gsl_blas.h>
56 #include <gsl/gsl_cblas.h>
65 Cache for the relevant data from the model. There are several
66 members which the caller might not use, and which could use a lot of
67 storage. Therefore non-essential members of the struct will be
68 allocated only when requested.
70 struct pspp_linreg_coeff
72 double estimate; /* Estimated coefficient. */
73 const struct variable *v; /* The variable associated with this coefficient.
74 The calling function should supply the variable
75 when it creates the design matrix. The estimation
76 procedure ignores the struct variable *. It is here so
77 the caller can match parameters with relevant
81 struct pspp_linreg_cache_struct
83 int n_obs; /* Number of observations. */
84 int n_indeps; /* Number of independent variables. */
88 The variable struct is ignored during estimation.
89 It is here so the calling procedure can
90 find the variable used in the model.
92 const struct variable *depvar;
94 gsl_vector *residuals;
95 struct pspp_linreg_coeff *coeff;
96 gsl_vector *param_estimates;
97 int method; /* Method to use to estimate parameters. */
99 Means and standard deviations of the variables.
100 If these pointers are null when pspp_linreg() is
101 called, pspp_linreg() will compute their values.
103 Entry i of indep_means is the mean of independent
104 variable i, whose observations are stored in the ith
105 column of the design matrix.
109 gsl_vector *indep_means;
110 gsl_vector *indep_std;
115 double ssm; /* Sums of squares for the overall model. */
116 gsl_vector *ss_indeps; /* Sums of squares from each
117 independent variable.
119 double sst; /* Sum of squares total. */
120 double sse; /* Sum of squares error. */
121 double mse; /* Mean squared error. This is just sse / dfe, but
122 since it is the best unbiased estimate of the population
123 variance, it has its own entry here.
125 gsl_vector *ssx; /* Centered sums of squares for independent variables,
126 i.e. \sum (x[i] - mean(x))^2.
128 double ssy; /* Centered sums of squares for dependent variable. */
130 Covariance matrix of the parameter estimates.
141 'Hat' or Hessian matrix, i.e. (X'X)^{-1}, where X is our
/* Shorthand so callers can write pspp_linreg_cache instead of
   struct pspp_linreg_cache_struct. */
146 typedef struct pspp_linreg_cache_struct pspp_linreg_cache;
149 Options describing what special values should be computed.
151 struct pspp_linreg_opts_struct
153 int resid; /* Should the residuals be returned? */
155 int get_depvar_mean_std;
156 int *get_indep_mean_std; /* Array of booleans dictating which
157 independent variables need their means
158 and standard deviations computed within
159 pspp_linreg. This array MUST be of
160 length n_indeps. If element i is 1,
161 pspp_linreg will compute the mean and
162 variance of independent variable i. If
163 element i is 0, it will not compute the
164 mean and standard deviation, and assume
165 the values are stored.
166 cache->indep_means[i] is the mean and
167 cache->indep_std[i] is the sample
/* Shorthand so callers can write pspp_linreg_opts instead of
   struct pspp_linreg_opts_struct. */
171 typedef struct pspp_linreg_opts_struct pspp_linreg_opts;
/* Takes a non-const GSL matrix A and returns an int.
   NOTE(review): presumably applies the sweep operator mentioned in the
   file header to A in place and returns a status code; confirm the
   required shape of A and the meaning of the return value against the
   implementation. */
173 int pspp_reg_sweep (gsl_matrix * A);
/* Returns a pointer to a pspp_linreg_cache, given two sizes n and p.
   NOTE(review): n and p presumably are the number of observations and
   the number of independent variables (cf. n_obs and n_indeps in the
   cache struct); confirm. Which members are allocated, and the
   behavior on allocation failure, are not visible from this header. */
175 pspp_linreg_cache *pspp_linreg_cache_alloc (size_t n, size_t p);
/* Takes a pspp_linreg_cache pointer and returns nothing.
   NOTE(review): presumably releases a cache obtained from
   pspp_linreg_cache_alloc (); confirm whether a null pointer is
   accepted. */
177 void pspp_linreg_cache_free (pspp_linreg_cache * cache);
/* Entry point for the regression described in the file header.

   Y     -- dependent-variable vector (n-by-1 per the file header); not
            modified through this pointer.
   X     -- design matrix (n-by-p per the file header); not modified
            through this pointer.
   opts  -- options selecting which optional quantities to compute; not
            modified through this pointer.
   cache -- non-const cache; NOTE(review): presumably receives the
            results of the fit -- confirm against the implementation.

   Returns an int; NOTE(review): presumably a status code -- confirm
   its meaning against the implementation. */
179 int pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
180 const pspp_linreg_opts * opts, pspp_linreg_cache * cache);