+2008-06-21 Jason Stover <jhs@math.gcsu.edu>
+
+ * regression.q (reg_stats_coeff): Use new accessor function
+ pspp_coeff_get_sd. Fixed bug 23567. No longer call compute_moments
+ in run_regression. Pass entire design_matrix to pspp_linreg ().
+
2008-05-29 John Darrington <john@darrington.wattle.id.au>
* examine.q: Fixed bug where incorrect levels of dereferencing
string variables are tabulated, so we must not copy or zero out
more data than that.
-2008-03-10 Jason Stover <jhs@debs.hoobahooba.net>
+2008-03-10 Jason Stover <jhs@math.gcsu.edu>
* regression.q (run_regression): Removed code for EXPORT
subcommand. Remove use of coefficient 0 as the intercept.
* descriptives.c (calc_descriptives): Ditto. Also avoid
gratuitous casereader_clone.
-2007-09-13 Jason Stover <jhs@debs.hoobahooba.net>
+2007-09-13 Jason Stover <jhs@math.gcsu.edu>
* regression.q (cmd_regression): Move declaration of models in to
definition of cmd_regression.
int this_row;
double t_stat;
double pval;
- double coeff;
double std_err;
double beta;
const char *label;
tab_float (t, 2, 1, 0, c->intercept, 10, 2);
std_err = sqrt (gsl_matrix_get (c->cov, 0, 0));
tab_float (t, 3, 1, 0, std_err, 10, 2);
- beta = c->intercept / c->depvar_std;
- tab_float (t, 4, 1, 0, beta, 10, 2);
+ tab_float (t, 4, 1, 0, 0.0, 10, 2);
t_stat = c->intercept / std_err;
tab_float (t, 5, 1, 0, t_stat, 10, 2);
pval = 2 * gsl_cdf_tdist_Q (fabs (t_stat), 1.0);
/*
Regression coefficients.
*/
- coeff = c->coeff[j]->estimate;
- tab_float (t, 2, this_row, 0, coeff, 10, 2);
+ tab_float (t, 2, this_row, 0, c->coeff[j]->estimate, 10, 2);
/*
Standard error of the coefficients.
*/
Standardized coefficient, i.e., regression coefficient
if all variables had unit variance.
*/
- beta = gsl_vector_get (c->indep_std, j + 1);
- beta *= coeff / c->depvar_std;
+ beta = pspp_coeff_get_sd (c->coeff[j]);
+ beta *= c->coeff[j]->estimate / c->depvar_std;
tab_float (t, 4, this_row, 0, beta, 10, 2);
/*
Test statistic for H0: coefficient is 0.
*/
- t_stat = coeff / std_err;
+ t_stat = c->coeff[j]->estimate / std_err;
tab_float (t, 5, this_row, 0, t_stat, 10, 2);
/*
P values for the test statistic above.
{
moments1_calculate ((mom + j)->m, &weight, &mean, &variance,
&skewness, &kurtosis);
- gsl_vector_set (c->indep_means, i, mean);
- gsl_vector_set (c->indep_std, i, sqrt (variance));
+ pspp_linreg_set_indep_variable_mean (c, (mom + j)->v, mean);
+ pspp_linreg_set_indep_variable_sd (c, (mom + j)->v, sqrt (variance));
}
}
}
/*
Find the least-squares estimates and other statistics.
*/
- pspp_linreg ((const gsl_vector *) Y, X->m, &lopts, models[k]);
- compute_moments (models[k], mom, X, n_variables);
+ pspp_linreg ((const gsl_vector *) Y, X, &lopts, models[k]);
if (!taint_has_tainted_successor (casereader_get_taint (input)))
{
+2008-06-21 Jason Stover <jhs@math.gcsu.edu>
+
+ * linreg.c (pspp_linreg): Accept a struct design_matrix. Use new
+ accessor functions pspp_coeff_get_mean, pspp_coeff_get_sd. New
+ functions pspp_linreg_get_indep_variable_mean,
+ pspp_linreg_set_indep_variable_mean,
+ pspp_linreg_get_indep_variable_sd,
+ pspp_linreg_set_indep_variable_sd. Altered pspp_linreg_get_coeff
+ to use accessor function pspp_coeff_var_to_coeff.
+
+ * coefficient.c: New functions pspp_coeff_get_mean,
+ pspp_coeff_set_mean, pspp_coeff_get_sd, pspp_coeff_set_sd,
+ pspp_coeff_var_to_coeff. Added doubles to hold mean and standard
+ deviation in struct varinfo.
+
2008-06-14 Jason Stover <jhs@math.gcsu.edu>
* linreg/: moved linreg.[ch] to src/math.
stopgap measure for portability until appropriate gnulib modules
are available.
-2008-03-10 Jason Stover <jhs@debs.hoobahooba.net>
+2008-03-10 Jason Stover <jhs@math.gcsu.edu>
* coefficient.c (pspp_linreg_get_coeff): Removed use of
coefficient 0 as intercept.
const union value *val; /* Value of the variable v which this varinfo
refers to. This member is relevant only to
categorical variables. */
+ double mean; /* Mean for this variable */
+ double sd; /* Standard deviation for this variable */
};
void
coefficient structures for the model.
*/
void
-pspp_coeff_init (struct pspp_coeff ** c, const struct design_matrix *X)
+pspp_coeff_init (struct pspp_coeff ** coeff, const struct design_matrix *X)
{
size_t i;
int n_vals = 1;
- assert (c != NULL);
+ assert (coeff != NULL);
for (i = 0; i < X->m->size2; i++)
{
- c[i] = xmalloc (sizeof (*c[i]));
- c[i]->n_vars = n_vals; /* Currently, no procedures allow
- interactions. This line will have to
- change when procedures that allow
- interaction terms are written.
- */
- c[i]->v_info = xnmalloc (c[i]->n_vars, sizeof (*c[i]->v_info));
- assert (c[i]->v_info != NULL);
- c[i]->v_info->v = design_matrix_col_to_var (X, i);
-
- if (var_is_alpha (c[i]->v_info->v))
+ coeff[i] = xmalloc (sizeof (*coeff[i]));
+ coeff[i]->n_vars = n_vals; /* Currently, no procedures allow
+ interactions. This line will have to
+ change when procedures that allow
+ interaction terms are written.
+ */
+ coeff[i]->v_info = xnmalloc (coeff[i]->n_vars, sizeof (*coeff[i]->v_info));
+ assert (coeff[i]->v_info != NULL);
+ coeff[i]->v_info->v = design_matrix_col_to_var (X, i);
+
+ if (var_is_alpha (coeff[i]->v_info->v))
{
size_t k;
- k = design_matrix_var_to_column (X, c[i]->v_info->v);
+ k = design_matrix_var_to_column (X, coeff[i]->v_info->v);
assert (k <= i);
k = i - k;
- c[i]->v_info->val =
- cat_subscript_to_value (k, c[i]->v_info->v);
+ coeff[i]->v_info->val =
+ cat_subscript_to_value (k, coeff[i]->v_info->v);
}
+ coeff[i]->v_info->mean = 0.0;
+ coeff[i]->v_info->sd = 0.0;
}
}
void
-pspp_coeff_set_estimate (struct pspp_coeff *c, double estimate)
+pspp_coeff_set_estimate (struct pspp_coeff *coef, double estimate)
{
- c->estimate = estimate;
+ coef->estimate = estimate;
}
void
-pspp_coeff_set_std_err (struct pspp_coeff *c, double std_err)
+pspp_coeff_set_std_err (struct pspp_coeff *coef, double std_err)
{
- c->std_err = std_err;
+ coef->std_err = std_err;
}
/*
Return the estimated value of the coefficient.
*/
double
-pspp_coeff_get_est (const struct pspp_coeff *c)
+pspp_coeff_get_est (const struct pspp_coeff *coef)
{
- if (c == NULL)
+ if (coef == NULL)
{
return 0.0;
}
- return c->estimate;
+ return coef->estimate;
}
/*
Return the standard error of the estimated coefficient.
*/
double
-pspp_coeff_get_std_err (const struct pspp_coeff *c)
+pspp_coeff_get_std_err (const struct pspp_coeff *coef)
{
- if (c == NULL)
+ if (coef == NULL)
{
return 0.0;
}
- return c->std_err;
+ return coef->std_err;
}
/*
How many variables are associated with this coefficient?
*/
int
-pspp_coeff_get_n_vars (struct pspp_coeff *c)
+pspp_coeff_get_n_vars (struct pspp_coeff *coef)
{
- if (c == NULL)
+ if (coef == NULL)
{
return 0;
}
- return c->n_vars;
+ return coef->n_vars;
}
/*
0 unless the coefficient refers to an interaction term.
*/
const struct variable *
-pspp_coeff_get_var (struct pspp_coeff *c, int i)
+pspp_coeff_get_var (struct pspp_coeff *coef, int i)
{
- if (c == NULL)
+ if (coef == NULL)
{
return NULL;
}
- assert (i < c->n_vars);
- return (c->v_info + i)->v;
+ assert (i < coef->n_vars);
+ return (coef->v_info + i)->v;
+}
+
+/*
+ Which coefficient does this variable match? If the variable is
+ categorical, and has more than one coefficient, use the VAL to find
+ its coefficient.
+ */
+const struct pspp_coeff *
+pspp_coeff_var_to_coeff (const struct variable *v, struct pspp_coeff **coefs,
+ size_t n_coef, const union value *val)
+{
+ size_t i = 0;
+ size_t j = 0;
+ size_t v_idx;
+ struct pspp_coeff *result = NULL;
+
+ if (v != NULL)
+ {
+ v_idx = var_get_dict_index (v);
+ while (i < n_coef && var_get_dict_index (coefs[i]->v_info->v) != v_idx)
+ {
+ i++;
+ }
+ result = coefs[i];
+ if (var_is_alpha (v))
+ {
+ /*
+ Use the VAL to find the coefficient.
+ */
+ if (val != NULL)
+ {
+ j = i;
+ while (j < n_coef && compare_values (pspp_coeff_get_value (coefs[j], v),
+ val, var_get_width (v)) != 0)
+ {
+ j++;
+ }
+ result = ((j < n_coef) ? coefs[j] : NULL);
+ }
+ }
+ }
+ return result;
}
/*
Which value is associated with this coefficient/variable combination?
*/
const union value *
-pspp_coeff_get_value (struct pspp_coeff *c,
+pspp_coeff_get_value (struct pspp_coeff *coef,
const struct variable *v)
{
int i = 0;
const struct variable *candidate;
- if (c == NULL || v == NULL)
+ if (coef == NULL || v == NULL)
{
return NULL;
}
{
return NULL;
}
- while (i < c->n_vars)
+ while (i < coef->n_vars)
{
- candidate = pspp_coeff_get_var (c, i);
+ candidate = pspp_coeff_get_var (coef, i);
if (v == candidate)
{
- return (c->v_info + i)->val;
+ return (coef->v_info + i)->val;
}
i++;
}
return NULL;
}
+/*
+ Get or set the standard deviation of the variable associated with this coefficient.
+ */
+double pspp_coeff_get_sd (const struct pspp_coeff *coef)
+{
+ return coef->v_info->sd;
+}
+void pspp_coeff_set_sd (struct pspp_coeff *coef, double s)
+{
+ coef->v_info->sd = s;
+}
+
+/*
+ Get or set the mean for the variable associated with this coefficient.
+*/
+double pspp_coeff_get_mean (const struct pspp_coeff *coef)
+{
+ return coef->v_info->mean;
+}
+
+void pspp_coeff_set_mean (struct pspp_coeff *coef, double m)
+{
+ coef->v_info->mean = m;
+}
+
*/
int pspp_coeff_get_n_vars (struct pspp_coeff *);
+/*
+ Which coefficient does this variable match? If the variable is
+ categorical, and has more than one coefficient, report the first
+ coefficient found. Note that in this case, the result will depend on
+ the order of COEFS.
+ */
+const struct pspp_coeff *
+pspp_coeff_var_to_coeff (const struct variable *, struct pspp_coeff **, size_t, const union value *);
+
/*
Which variable does this coefficient match? The int argument is usually
0, unless the coefficient refers to an interaction.
*/
const union value *pspp_coeff_get_value (struct pspp_coeff *,
const struct variable *);
+
+/*
+ Get or set the standard deviation of the variable associated with this coefficient.
+ */
+double pspp_coeff_get_sd (const struct pspp_coeff *);
+void pspp_coeff_set_sd (struct pspp_coeff *, double);
+
+/*
+ Get or set the mean for the variable associated with this coefficient.
+*/
+double pspp_coeff_get_mean (const struct pspp_coeff *);
+void pspp_coeff_set_mean (struct pspp_coeff *, double);
#endif
#include <math/coefficient.h>
#include <math/linreg.h>
#include <math/coefficient.h>
+#include <math/design-matrix.h>
+#include <src/data/category.h>
#include <src/data/variable.h>
#include <src/data/value.h>
#include <gl/xalloc.h>
values as indicated by opts.
*/
int
-pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
+pspp_linreg (const gsl_vector * Y, const struct design_matrix *dm,
const pspp_linreg_opts * opts, pspp_linreg_cache * cache)
{
int rc;
gsl_matrix *design = NULL;
gsl_matrix_view xtx;
- gsl_matrix_view xm;
+ gsl_matrix *xm;
gsl_matrix_view xmxtx;
gsl_vector_view xty;
gsl_vector_view xi;
gsl_vector_view xj;
gsl_vector *param_estimates;
+ struct pspp_coeff *coef;
+ const struct variable *v;
+ const union value *val;
size_t i;
size_t j;
cache->depvar_std = s;
cache->sst = ss;
}
- for (i = 0; i < cache->n_indeps; i++)
+
+ cache->dft = cache->n_obs - 1;
+ cache->dfm = cache->n_indeps;
+ cache->dfe = cache->dft - cache->dfm;
+ cache->n_coeffs = dm->m->size2;
+ cache->intercept = 0.0;
+
+ for (i = 0; i < dm->m->size2; i++)
{
if (opts->get_indep_mean_std[i])
{
- linreg_mean_std (gsl_matrix_const_column (X, i), &m, &s, &ss);
- gsl_vector_set (cache->indep_means, i, m);
- gsl_vector_set (cache->indep_std, i, s);
+ linreg_mean_std (gsl_matrix_const_column (dm->m, i), &m, &s, &ss);
+ v = design_matrix_col_to_var (dm, i);
+ val = NULL;
+ if (var_is_alpha (v))
+ {
+ j = i - design_matrix_var_to_column (dm, v);
+ val = cat_subscript_to_value (j, v);
+ }
+ coef = pspp_linreg_get_coeff (cache, v, val);
+ pspp_coeff_set_mean (coef, m);
+ pspp_coeff_set_sd (coef, s);
gsl_vector_set (cache->ssx, i, ss);
+
}
}
- cache->dft = cache->n_obs - 1;
- cache->dfm = cache->n_indeps;
- cache->dfe = cache->dft - cache->dfm;
- cache->n_coeffs = X->size2;
- cache->intercept = 0.0;
if (cache->method == PSPP_LINREG_SWEEP)
{
gsl_matrix *sw;
/*
Subtract the means to improve the condition of the design
- matrix. This requires copying X and Y. We do not divide by the
+ matrix. This requires copying dm->m and Y. We do not divide by the
standard deviations of the independent variables here since doing
so would cause a miscalculation of the residual sums of
squares. Dividing by the standard deviation is done GSL's linear
(i.e., a column of 1's). If using PSPP_LINREG_QR, we need that column,
so design is allocated here when sweeping, or below if using QR.
*/
- design = gsl_matrix_alloc (X->size1, X->size2);
- for (i = 0; i < X->size2; i++)
+ design = gsl_matrix_alloc (dm->m->size1, dm->m->size2);
+ for (i = 0; i < dm->m->size2; i++)
{
- m = gsl_vector_get (cache->indep_means, i);
- for (j = 0; j < X->size1; j++)
+ v = design_matrix_col_to_var (dm, i);
+ m = pspp_linreg_get_indep_variable_mean (cache, v);
+ for (j = 0; j < dm->m->size1; j++)
{
- tmp = (gsl_matrix_get (X, j, i) - m);
+ tmp = (gsl_matrix_get (dm->m, j, i) - m);
gsl_matrix_set (design, j, i, tmp);
}
}
{
tmp = gsl_matrix_get (sw, i, cache->n_indeps);
cache->coeff[i]->estimate = tmp;
- m -= tmp * gsl_vector_get (cache->indep_means, i);
+ m -= tmp * pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i));
}
/*
Get the covariance matrix of the parameter estimates.
*/
xtx = gsl_matrix_submatrix (sw, 0, 0, cache->n_indeps, cache->n_indeps);
xmxtx = gsl_matrix_submatrix (cache->cov, 0, 1, 1, cache->n_indeps);
- xm = gsl_matrix_view_vector (cache->indep_means, 1, cache->n_indeps);
+ xm = gsl_matrix_calloc (1, cache->n_indeps);
+ for (i = 0; i < xm->size2; i++)
+ {
+ gsl_matrix_set (xm, 0, i,
+ pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i)));
+ }
rc = gsl_blas_dsymm (CblasRight, CblasUpper, cache->mse,
- &xtx.matrix, &xm.matrix, 0.0, &xmxtx.matrix);
+ &xtx.matrix, xm, 0.0, &xmxtx.matrix);
+ gsl_matrix_free (xm);
if (rc == GSL_SUCCESS)
{
tmp = cache->mse / cache->n_obs;
for (i = 1; i < 1 + cache->n_indeps; i++)
{
tmp -= gsl_matrix_get (cache->cov, 0, i)
- * gsl_vector_get (cache->indep_means, i - 1);
+ * pspp_linreg_get_indep_variable_mean (cache, design_matrix_col_to_var (dm, i - 1));
}
gsl_matrix_set (cache->cov, 0, 0, tmp);
Use QR decomposition via GSL.
*/
- param_estimates = gsl_vector_alloc (1 + X->size2);
- design = gsl_matrix_alloc (X->size1, 1 + X->size2);
+ param_estimates = gsl_vector_alloc (1 + dm->m->size2);
+ design = gsl_matrix_alloc (dm->m->size1, 1 + dm->m->size2);
- for (j = 0; j < X->size1; j++)
+ for (j = 0; j < dm->m->size1; j++)
{
gsl_matrix_set (design, j, 0, 1.0);
- for (i = 0; i < X->size2; i++)
+ for (i = 0; i < dm->m->size2; i++)
{
- tmp = gsl_matrix_get (X, j, i);
+ tmp = gsl_matrix_get (dm->m, j, i);
gsl_matrix_set (design, j, i + 1, tmp);
}
}
pspp_linreg_get_coeff (const pspp_linreg_cache * c,
const struct variable *v, const union value *val)
{
- int i;
- struct pspp_coeff *result = NULL;
- const struct variable *tmp = NULL;
-
if (c == NULL)
{
return NULL;
{
return NULL;
}
- i = 0;
- result = c->coeff[0];
- tmp = pspp_coeff_get_var (result, 0);
- while (tmp != v && i < c->n_coeffs)
+ return pspp_coeff_var_to_coeff (v, c->coeff, c->n_coeffs, val);
+}
+/*
+ Return the standard deviation of the independent variable.
+ */
+double pspp_linreg_get_indep_variable_sd (pspp_linreg_cache *c, const struct variable *v)
+{
+ if (var_is_numeric (v))
{
- result = c->coeff[i];
- tmp = pspp_coeff_get_var (result, 0);
- i++;
+ const struct pspp_coeff *coef;
+ coef = pspp_linreg_get_coeff (c, v, NULL);
+ return pspp_coeff_get_sd (coef);
}
- if (tmp != v)
+ return GSL_NAN;
+}
+
+void pspp_linreg_set_indep_variable_sd (pspp_linreg_cache *c, const struct variable *v,
+ double s)
+{
+ if (var_is_numeric (v))
{
- /*
- Not found.
- */
- return NULL;
+ struct pspp_coeff *coef;
+ coef = pspp_linreg_get_coeff (c, v, NULL);
+ pspp_coeff_set_sd (coef, s);
}
+}
+
+/*
+ Mean of the independent variable.
+ */
+double pspp_linreg_get_indep_variable_mean (pspp_linreg_cache *c, const struct variable *v)
+{
if (var_is_numeric (v))
{
- return result;
+ struct pspp_coeff *coef;
+ coef = pspp_linreg_get_coeff (c, v, NULL);
+ return pspp_coeff_get_mean (coef);
}
- else if (val != NULL)
+ return GSL_NAN;
+}
+
+void pspp_linreg_set_indep_variable_mean (pspp_linreg_cache *c, const struct variable *v,
+ double m)
+{
+ if (var_is_numeric (v))
{
- /*
- If v is categorical, we need to ensure the coefficient
- matches the VAL.
- */
- while (tmp != v && i < c->n_coeffs
- && compare_values (pspp_coeff_get_value (result, tmp),
- val, var_get_width (v)))
- { /* FIX THIS */
- i++;
- result = c->coeff[i];
- tmp = pspp_coeff_get_var (result, 0);
- }
- if (i == c->n_coeffs && tmp != v)
- {
- return NULL;
- }
- return result;
+ struct pspp_coeff *coef;
+ coef = pspp_linreg_get_coeff (c, v, NULL);
+ pspp_coeff_set_mean (coef, m);
}
- return NULL;
}
#include <gsl/gsl_math.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_matrix.h>
-
-struct variable;
-struct pspp_coeff;
-union value;
+#include <src/math/coefficient.h>
enum
{
values as indicated by opts.
*/
int
-pspp_linreg (const gsl_vector * Y, const gsl_matrix * X,
- const pspp_linreg_opts * opts, pspp_linreg_cache * cache);
+pspp_linreg (const gsl_vector *, const struct design_matrix *,
+ const pspp_linreg_opts *, pspp_linreg_cache *);
double
pspp_linreg_predict (const struct variable **, const union value **,
const struct variable
*,
const union value *);
+/*
+ Return or set the standard deviation of the independent variable.
+ */
+double pspp_linreg_get_indep_variable_sd (pspp_linreg_cache *, const struct variable *);
+void pspp_linreg_set_indep_variable_sd (pspp_linreg_cache *, const struct variable *, double);
+/*
+ Mean of the independent variable.
+ */
+double pspp_linreg_get_indep_variable_mean (pspp_linreg_cache *, const struct variable *);
+void pspp_linreg_set_indep_variable_mean (pspp_linreg_cache *, const struct variable *, double);
#endif
+2008-06-21 Jason Stover <jhs@math.gcsu.edu>
+
+ * regression.sh, regression-qr.sh: Fixed column showing
+ standardized coefficients.
+
2008-05-16 John Darrington <john@darrington.wattle.id.au>
* compression.sh print-crash.sh print-strings.sh very-long-strings.sh :
#===================#====#==========#====#====#============#
# # B |Std. Error|Beta| t |Significance#
#========#==========#====#==========#====#====#============#
-# |(Constant)#1.24| .42| .15|2.95| .21#
-# | v1 #1.37| .72| .00|1.89| .06#
+# |(Constant)#1.24| .42| .00|2.95| .21#
+# | v1 #1.37| .72| .05|1.89| .06#
# | # | | | | #
#========#==========#====#==========#====#====#============#
EOF
# |Total # 215.26| 9| | | #
#========#==========#==============#==#===========#======#============#
2.3 REGRESSION. Coefficients
-#===================#=====#==========#====#======#============#
-# # B |Std. Error|Beta| t |Significance#
-#========#==========#=====#==========#====#======#============#
-# |(Constant)# 2.19| 2.36| .45| .93| .52#
-# | v0 # 1.81| 1.05| .54| 1.72| .12#
-# | v1 #-3.43| .33| .00|-10.33| .00#
-# | # | | | | #
-#========#==========#=====#==========#====#======#============#
+#===================#=====#==========#=====#======#============#
+# # B |Std. Error| Beta| t |Significance#
+#========#==========#=====#==========#=====#======#============#
+# |(Constant)# 2.19| 2.36| .00| .93| .52#
+# | v0 # 1.81| 1.05| .17| 1.72| .12#
+# | v1 #-3.43| .33|-1.03|-10.33| .00#
+# | # | | | | #
+#========#==========#=====#==========#=====#======#============#
v0 v1 v2 RES1 PRED1
-------- -------- -------- -------- --------
.65 7.74 -23.98 -.84 -23.13