From d39f4365f9e9ee7923e7dade0d7571a0bf3ec58d Mon Sep 17 00:00:00 2001 From: Jason Stover Date: Mon, 9 Jan 2006 00:49:11 +0000 Subject: [PATCH] Functions to handle coefficient-to-variable/value matching --- lib/linreg/Makefile.am | 2 +- lib/linreg/coefficient.c | 134 +++++++++++++++++++++++++++++++++++++++ lib/linreg/linreg.c | 2 +- lib/linreg/pspp_linreg.h | 40 ++++++++++-- src/regression.q | 108 +++++++++---------------------- 5 files changed, 199 insertions(+), 87 deletions(-) create mode 100644 lib/linreg/coefficient.c diff --git a/lib/linreg/Makefile.am b/lib/linreg/Makefile.am index a81f575c..998b75ef 100644 --- a/lib/linreg/Makefile.am +++ b/lib/linreg/Makefile.am @@ -12,4 +12,4 @@ AM_CFLAGS+=-Wall -W -Wwrite-strings -Wstrict-prototypes \ -ansi endif -liblinreg_a_SOURCES = sweep.c linreg.c pspp_linreg.h +liblinreg_a_SOURCES = coefficient.c sweep.c linreg.c pspp_linreg.h diff --git a/lib/linreg/coefficient.c b/lib/linreg/coefficient.c new file mode 100644 index 00000000..1645d2c2 --- /dev/null +++ b/lib/linreg/coefficient.c @@ -0,0 +1,134 @@ +/* lib/linreg/coefficient.c + + Copyright (C) 2005 Free Software Foundation, Inc. + Written by Jason H Stover. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or (at + your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02111-1307, USA. + */ + +/* + Accessor functions for matching coefficients and variables. + */ +#include +#include "pspp_linreg.h" +#include + + +struct varinfo +{ + const struct variable *v; /* Variable associated with this + coefficient. Note this variable may not + be unique. In other words, a + coefficient structure may have other + v_info's, each with its own variable. + */ + const union value *val; /* Value of the variable v which this + varinfo refers to. This member is relevant + only to categorical variables. + */ +}; + +void pspp_linreg_coeff_free (struct pspp_linreg_coeff *c) +{ + free (c->v_info); + free (c); +} + +/* + Initialize the variable and value pointers inside the + coefficient structures for the linear model. + */ +void +pspp_linreg_coeff_init (pspp_linreg_cache *c, struct design_matrix *X) +{ + size_t i; + size_t j; + int n_vals = 1; + struct pspp_linreg_coeff *coeff; + + c->coeff = xnmalloc (X->m->size2 + 1, sizeof (*c->coeff)); + for (i = 0; i < X->m->size2; i++) + { + j = i + 1; /* The first coefficient is the intercept. */ + coeff = c->coeff + j; + coeff->n_vars = n_vals; /* Currently, no procedures allow interactions. + This will have to change when procedures that + allow interaction terms are written. + */ + coeff->v_info = xnmalloc (coeff->n_vars, sizeof (*coeff->v_info)); + assert (coeff->v_info != NULL); + coeff->v_info->v = (const struct variable *) design_matrix_col_to_var (X, i); + + if (coeff->v_info->v->type == ALPHA) + { + size_t k; + k = design_matrix_var_to_column (X, coeff->v_info->v); + assert (k <= i); + k = i - k; + coeff->v_info->val = cat_subscript_to_value (k, (struct variable *) coeff->v_info->v); + } + } +} +void +pspp_linreg_coeff_set_estimate (struct pspp_linreg_coeff *c, + double estimate) +{ + c->estimate = estimate; +} +void +pspp_linreg_coeff_set_std_err (struct pspp_linreg_coeff *c, + double std_err) +{ + c->std_err = std_err; +} +/* + How many variables are associated with this coefficient? + */ +int +pspp_linreg_coeff_get_n_vars (struct pspp_linreg_coeff *c) +{ + return c->n_vars; +} +/* + Which variable does this coefficient match? + */ +const struct variable * +pspp_linreg_coeff_get_var (struct pspp_linreg_coeff *c, int i) +{ + assert (i < c->n_vars); + return (c->v_info + i)->v; +} +/* + Which value is associated with this coefficient/variable comination? +*/ +const union value * +pspp_linreg_coeff_get_value (struct pspp_linreg_coeff *c, + const struct variable *v) +{ + int i = 0; + const struct variable *candidate; + + while (i < c->n_vars) + { + candidate = pspp_linreg_coeff_get_var (c, i); + if (v->index == candidate->index) + { + return (c->v_info + i)->val; + } + i++; + } + return NULL; +} diff --git a/lib/linreg/linreg.c b/lib/linreg/linreg.c index 6c2ee289..0bc1f0ce 100644 --- a/lib/linreg/linreg.c +++ b/lib/linreg/linreg.c @@ -116,7 +116,7 @@ pspp_linreg_cache_free (pspp_linreg_cache * c) gsl_vector_free (c->indep_std); gsl_vector_free (c->ss_indeps); gsl_matrix_free (c->cov); - free (c->coeff); + pspp_linreg_coeff_free (c->coeff); free (c); } diff --git a/lib/linreg/pspp_linreg.h b/lib/linreg/pspp_linreg.h index a8db8a10..a83f7b3d 100644 --- a/lib/linreg/pspp_linreg.h +++ b/lib/linreg/pspp_linreg.h @@ -54,7 +54,9 @@ #include #include #include +#include #include +#define PSPP_LINREG_VAL_NOT_FOUND -1 enum { PSPP_LINREG_SWEEP, @@ -70,13 +72,24 @@ enum struct pspp_linreg_coeff { double estimate; /* Estimated coefficient. */ - const struct variable *v; /* The variable associated with this coefficient. - The calling function should supply the variable - when it creates the design matrix. The estimation - procedure ignores the struct variable *. It is here so - the caller can match parameters with relevant - variables. - */ + double std_err; /* Standard error of the estimate. */ + struct varinfo *v_info; /* Information pertaining to the + variable(s) associated with this + coefficient. The calling function + should initialize this value with the + functions in coefficient.c. The + estimation procedure ignores this + member. It is here so the caller can + match parameters with relevant variables + and values. If the coefficient is + associated with an interaction, then + v_info contains information for multiple + variables. + */ + int n_vars; /* Number of variables associated with this coefficient. + Coefficients corresponding to interaction terms will + have more than one variable. + */ }; struct pspp_linreg_cache_struct { @@ -177,4 +190,17 @@ void pspp_linreg_cache_free (pspp_linreg_cache * cache); int pspp_linreg (const gsl_vector * Y, const gsl_matrix * X, const pspp_linreg_opts * opts, pspp_linreg_cache * cache); +void pspp_linreg_coeff_init (pspp_linreg_cache *, struct design_matrix *); + +void pspp_linreg_coeff_free (struct pspp_linreg_coeff *); + +void pspp_linreg_coeff_set_estimate (struct pspp_linreg_coeff *, double); + +void pspp_linreg_coeff_set_std_err (struct pspp_linreg_coeff *, double); + +int pspp_linreg_coeff_get_n_vars (struct pspp_linreg_coeff *); + +const struct variable *pspp_linreg_coeff_get_var (struct pspp_linreg_coeff *, int); + +const union value *pspp_linreg_coeff_get_value (struct pspp_linreg_coeff *, const struct variable *); #endif diff --git a/src/regression.q b/src/regression.q index dd22daea..68469e44 100644 --- a/src/regression.q +++ b/src/regression.q @@ -91,56 +91,6 @@ struct file_handle *model_file; int pspp_reg_rc = CMD_SUCCESS; static void run_regression (const struct casefile *, void *); -static union value *pspp_reg_coefficient_to_value (const pspp_linreg_cache *, - const struct - pspp_linreg_coeff *, - size_t); - -/* - Which value of a categorical variable corresponds to the - coefficient? This routine provides the answer IF the - categorical variable is encoded in the usual way: The - first value maps to the zero vector, the second value to - the vector (0, 1, 0, ...), the third value (0, 0, 1, 0, ...), - etc. - */ -static union value * -pspp_reg_coefficient_to_value (const pspp_linreg_cache * cache, - const struct pspp_linreg_coeff *coeff, - size_t j) -{ - size_t which = 0; - int k = 1; /* First coefficient is the intercept. */ - int found = 0; - const union value *result; - struct pspp_linreg_coeff c; - - assert (cache != NULL); - assert (coeff != NULL); - assert (coeff->v->type == ALPHA); - while (!found && k < cache->n_coeffs) - { - c = cache->coeff[k]; - /* - compare_var_names returns 0 if the variable - names match. - */ - if (!compare_var_names (coeff->v, c.v, NULL)) - { - if (j == k) - { - found = 1; - } - else - { - which++; - } - } - k++; - } - result = cat_subscript_to_value ((const size_t) which, coeff->v); - return result; -} /* STATISTICS subcommand output functions. @@ -213,6 +163,7 @@ reg_stats_coeff (pspp_linreg_cache * c) double beta; const char *label; char *tmp; + const struct variable *v; const union value *val; const char *val_s; struct tab_table *t; @@ -248,18 +199,20 @@ reg_stats_coeff (pspp_linreg_cache * c) for (j = 1; j <= c->n_indeps; j++) { i = indep_vars[j]; - label = var_to_string (c->coeff[j].v); + v = pspp_linreg_coeff_get_var (c->coeff + j, 0); + label = var_to_string (v); /* Do not overwrite the variable's name. */ strncpy (tmp, label, MAX_STRING); - if (c->coeff[j].v->type == ALPHA) + if (v->type == ALPHA) { /* Append the value associated with this coefficient. This makes sense only if we us the usual binary encoding for that value. */ - val = pspp_reg_coefficient_to_value (c, &(c->coeff[j]), j); - val_s = value_to_string (val, c->coeff[j].v); + + val = pspp_linreg_coeff_get_value (c->coeff + j, v); + val_s = value_to_string (val, v); strncat (tmp, val_s, MAX_STRING); } @@ -544,7 +497,7 @@ subcommand_statistics (int *keywords, pspp_linreg_cache * c) statistics_keyword_output (reg_stats_selection, keywords[selection], c); } static int -reg_inserted (struct variable *v, struct variable **varlist, int n_vars) +reg_inserted (const struct variable *v, struct variable **varlist, int n_vars) { int i; @@ -564,7 +517,8 @@ reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c) size_t j; int n_vars = 0; struct variable **varlist; - struct pspp_linreg_coeff coeff; + struct pspp_linreg_coeff *coeff; + const struct variable *v; union value *val; fprintf (fp, "%s", reg_export_categorical_encode_1); @@ -572,14 +526,15 @@ reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c) varlist = xnmalloc (c->n_indeps, sizeof (*varlist)); for (i = 1; i < c->n_indeps; i++) /* c->coeff[0] is the intercept. */ { - coeff = c->coeff[i]; - if (coeff.v->type == ALPHA) + coeff = c->coeff + i; + v = pspp_linreg_coeff_get_var (coeff, 0); + if (v->type == ALPHA) { - if (!reg_inserted (coeff.v, varlist, n_vars)) + if (!reg_inserted (v, varlist, n_vars)) { fprintf (fp, "struct pspp_reg_categorical_variable %s;\n\t", - coeff.v->name); - varlist[n_vars] = coeff.v; + v->name); + varlist[n_vars] = (struct variable *) v; n_vars++; } } @@ -595,7 +550,7 @@ reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c) for (i = 0; i < n_vars; i++) { - coeff = c->coeff[i]; + coeff = c->coeff + i; fprintf (fp, "%s.name = \"%s\";\n\t", varlist[i]->name, varlist[i]->name); fprintf (fp, "%s.n_vals = %d;\n\t", varlist[i]->name, @@ -615,16 +570,19 @@ static void reg_print_depvars (FILE * fp, pspp_linreg_cache * c) { int i; - struct pspp_linreg_coeff coeff; + struct pspp_linreg_coeff *coeff; + const struct variable *v; fprintf (fp, "char *model_depvars[%d] = {", c->n_indeps); for (i = 1; i < c->n_indeps; i++) { - coeff = c->coeff[i]; - fprintf (fp, "\"%s\",\n\t\t", coeff.v->name); + coeff = c->coeff + i; + v = pspp_linreg_coeff_get_var (coeff, 0); + fprintf (fp, "\"%s\",\n\t\t", v->name); } - coeff = c->coeff[i]; - fprintf (fp, "\"%s\"};\n\t", coeff.v->name); + coeff = c->coeff + i; + v = pspp_linreg_coeff_get_var (coeff, 0); + fprintf (fp, "\"%s\"};\n\t", v->name); } static void reg_print_getvar (FILE * fp, pspp_linreg_cache * c) @@ -640,7 +598,6 @@ reg_print_getvar (FILE * fp, pspp_linreg_cache * c) static void subcommand_export (int export, pspp_linreg_cache * c) { - FILE *fp; size_t i; size_t j; int n_quantiles = 100; @@ -650,6 +607,7 @@ subcommand_export (int export, pspp_linreg_cache * c) if (export) { + FILE *fp; assert (c != NULL); assert (model_file != NULL); assert (fp != NULL); @@ -776,12 +734,12 @@ is_depvar (size_t k) */ static size_t mark_missing_cases (const struct casefile *cf, struct variable *v, - double *is_missing_case, double n_data) + int *is_missing_case, double n_data) { struct casereader *r; struct ccase c; size_t row; - union value *val; + const union value *val; for (r = casefile_get_reader (cf); casereader_read (r, &c); case_destroy (&c)) @@ -804,6 +762,7 @@ mark_missing_cases (const struct casefile *cf, struct variable *v, return n_data; } + static void run_regression (const struct casefile *cf, void *cmd_ UNUSED) { @@ -955,14 +914,7 @@ run_regression (const struct casefile *cf, void *cmd_ UNUSED) and store pointers to the variables that correspond to the coefficients. */ - lcache->coeff = xnmalloc (X->m->size2 + 1, sizeof (*lcache->coeff)); - for (i = 0; i < X->m->size2; i++) - { - j = i + 1; /* The first coeff is the intercept. */ - lcache->coeff[j].v = - (const struct variable *) design_matrix_col_to_var (X, i); - assert (lcache->coeff[j].v != NULL); - } + pspp_linreg_coeff_init (lcache, X); /* Find the least-squares estimates and other statistics. -- 2.30.2