From 41a3a550334da96a9b4e5e089ad1768acf288092 Mon Sep 17 00:00:00 2001 From: John Darrington Date: Sun, 22 Apr 2007 00:48:50 +0000 Subject: [PATCH] Miscellaneous cleanup to categorical values, linreg and design matrix code. --- src/data/ChangeLog | 5 +++ src/data/automake.mk | 1 - src/data/cat-routines.h | 53 ---------------------------- src/data/category.c | 26 +++++++++++--- src/data/category.h | 35 +++++++++---------- src/data/dictionary.c | 1 - src/data/sys-file-reader.c | 2 -- src/data/variable.c | 3 +- src/language/stats/regression.q | 10 +++--- src/math/coefficient.c | 5 ++- src/math/design-matrix.c | 62 ++++----------------------------- src/math/design-matrix.h | 10 +++--- src/math/linreg/linreg.c | 6 ++-- src/math/linreg/linreg.h | 4 +-- 14 files changed, 68 insertions(+), 155 deletions(-) delete mode 100644 src/data/cat-routines.h diff --git a/src/data/ChangeLog b/src/data/ChangeLog index b5a299ad..73aba2ff 100644 --- a/src/data/ChangeLog +++ b/src/data/ChangeLog @@ -1,3 +1,8 @@ +2007-04-22 John Darrington + + * Deleted existing category.h and moved cat-routines.h into + category.h Encapsulated struct cat_vals better. + 2007-04-19 John Darrington * sys-file-reader.c: When reading a system file which has no diff --git a/src/data/automake.mk b/src/data/automake.mk index 54fddf05..e3c83e1d 100644 --- a/src/data/automake.mk +++ b/src/data/automake.mk @@ -27,7 +27,6 @@ src_data_libdata_a_SOURCES = \ src/data/case.h \ src/data/category.c \ src/data/category.h \ - src/data/cat-routines.h \ src/data/data-in.c \ src/data/data-in.h \ src/data/data-out.c \ diff --git a/src/data/cat-routines.h b/src/data/cat-routines.h deleted file mode 100644 index 4fed886b..00000000 --- a/src/data/cat-routines.h +++ /dev/null @@ -1,53 +0,0 @@ -/* PSPP - Binary encodings for categorical variables. - Copyright (C) 2005 Free Software Foundation, Inc. - Written by Jason H Stover . - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - 02110-1301, USA. */ - -/* - Functions and data structures to recode categorical variables into - vectors and sub-rows of matrices. - - To fit many types of statistical models, it is necessary - to change each value of a categorical variable to a vector with binary - entries. These vectors are then stored as sub-rows within a matrix - during model-fitting. We need functions and data strucutres to, - e.g., map a value, say 'a', of a variable named 'cat_var', to a - vector, say (0 1 0 0 0), and vice versa. We also need to be able - to map the vector back to the value 'a', and if the vector is a - sub-row of a matrix, we need to know which sub-row corresponds to - the variable 'cat_var'. - - */ - -#ifndef CAT_ROUTINES_H -#define CAT_ROUTINES_H -#define CAT_VALUE_NOT_FOUND -2 -#include -#include "category.h" - -size_t cat_value_find (const struct variable *, const union value *); - -union value *cat_subscript_to_value (const size_t, struct variable *); - -void cat_stored_values_create (const struct variable *); - -void cat_value_update (const struct variable *, const union value *); - -void cat_create_value_matrix (const struct variable *); - -void cat_stored_values_destroy (struct cat_vals *); -#endif diff --git a/src/data/category.c b/src/data/category.c index 9e5c6b0c..fe053b86 100644 --- a/src/data/category.c +++ b/src/data/category.c @@ -41,18 +41,36 @@ #include #include -#include "cat-routines.h" +#include "category.h" #include "value.h" #include "variable.h" +#define CAT_VALUE_NOT_FOUND -2 + #define N_INITIAL_CATEGORIES 1 +/* + This structure contains the observed values of a + categorical variable. + */ +struct cat_vals +{ + union value *vals; + size_t n_categories; + size_t n_allocated_categories; /* This is used only during + initialization to keep + track of the number of + values stored. + */ +}; + void cat_stored_values_create (const struct variable *v) { if (!var_has_obs_vals (v)) { struct cat_vals *obs_vals = xmalloc (sizeof *obs_vals); + obs_vals->n_categories = 0; obs_vals->n_allocated_categories = N_INITIAL_CATEGORIES; obs_vals->vals = xnmalloc (N_INITIAL_CATEGORIES, sizeof *obs_vals->vals); @@ -63,7 +81,7 @@ cat_stored_values_create (const struct variable *v) void cat_stored_values_destroy (struct cat_vals *obs_vals) { - if (obs_vals != NULL) + if (obs_vals != NULL) { if (obs_vals->n_allocated_categories > 0) free (obs_vals->vals); @@ -117,8 +135,8 @@ cat_value_update (const struct variable *v, const union value *val) } } -union value * -cat_subscript_to_value (const size_t s, struct variable *v) +const union value * +cat_subscript_to_value (const size_t s, const struct variable *v) { struct cat_vals *obs_vals = var_get_obs_vals (v); return s < obs_vals->n_categories ? obs_vals->vals + s : NULL; diff --git a/src/data/category.h b/src/data/category.h index 35df104b..40abebaa 100644 --- a/src/data/category.h +++ b/src/data/category.h @@ -33,29 +33,26 @@ */ -#ifndef CAT_H -#define CAT_H -#define CAT_VALUE_NOT_FOUND -2 -#include +#ifndef CATEGORY_H +#define CATEGORY_H + #include -union value; +struct cat_vals; struct variable ; +union value; + +void cat_stored_values_create (const struct variable *); +void cat_stored_values_destroy (struct cat_vals *); + +size_t cat_value_find (const struct variable *, const union value *); + +const union value *cat_subscript_to_value (const size_t, + const struct variable *); + + +void cat_value_update (const struct variable *, const union value *); -/* - This structure contains the observed values of a - categorical variable. - */ -struct cat_vals -{ - union value *vals; - size_t n_categories; - size_t n_allocated_categories; /* This is used only during - initialization to keep - track of the number of - values stored. - */ -}; /* Return the number of categories of a categorical variable. diff --git a/src/data/dictionary.c b/src/data/dictionary.c index 845a46e3..761b540b 100644 --- a/src/data/dictionary.c +++ b/src/data/dictionary.c @@ -24,7 +24,6 @@ #include #include "case.h" -#include "cat-routines.h" #include "category.h" #include "settings.h" #include "value-labels.h" diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index d04a757f..87ba172d 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -261,8 +261,6 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dict, struct variable *var = dict_get_var (*dict, i); char short_name [SHORT_NAME_LEN + 1]; char long_name [SHORT_NAME_LEN + 1]; - char *s = short_name; - char *d = long_name; strcpy (short_name, var_get_name (var)); diff --git a/src/data/variable.c b/src/data/variable.c index 28065caf..f890746a 100644 --- a/src/data/variable.c +++ b/src/data/variable.c @@ -21,7 +21,8 @@ #include -#include "cat-routines.h" + +#include "category.h" #include "data-out.h" #include "format.h" #include "dictionary.h" diff --git a/src/language/stats/regression.q b/src/language/stats/regression.q index 7baaef91..e3816790 100644 --- a/src/language/stats/regression.q +++ b/src/language/stats/regression.q @@ -27,7 +27,6 @@ #include "regression-export.h" #include #include -#include #include #include #include @@ -565,7 +564,7 @@ regression_trns_pred_proc (void *t_, struct ccase *c, pspp_linreg_cache *model; union value *output = NULL; const union value **vals = NULL; - struct variable **vars = NULL; + const struct variable **vars = NULL; assert (trns != NULL); model = trns->c; @@ -605,7 +604,7 @@ regression_trns_resid_proc (void *t_, struct ccase *c, union value *output = NULL; const union value **vals = NULL; const union value *obs = NULL; - struct variable **vars = NULL; + const struct variable **vars = NULL; assert (trns != NULL); model = trns->c; @@ -790,7 +789,7 @@ reg_print_categorical_encoding (FILE * fp, pspp_linreg_cache * c) for (j = 0; j < n_categories; j++) { - union value *val = cat_subscript_to_value (j, varlist[i]); + const union value *val = cat_subscript_to_value (j, varlist[i]); fprintf (fp, "%s.values[%d] = \"%s\";\n\t", var_get_name (varlist[i]), j, var_get_value_name (varlist[i], val)); @@ -1094,7 +1093,7 @@ prepare_data (int n_data, int is_missing_case[], j++; if (var_is_alpha (v_variables[i])) { - /* Make a place to hold the binary vectors + /* Make a place to hold the binary vectors corresponding to this variable's values. */ cat_stored_values_create (v_variables[i]); } @@ -1225,7 +1224,6 @@ run_regression (const struct ccase *first, if (n_data > 0) { Y = gsl_vector_alloc (n_data); - X = design_matrix_create (n_indep, (const struct variable **) indep_vars, n_data); diff --git a/src/math/coefficient.c b/src/math/coefficient.c index fe0cd438..51c1a907 100644 --- a/src/math/coefficient.c +++ b/src/math/coefficient.c @@ -69,8 +69,7 @@ pspp_coeff_init (struct pspp_coeff ** c, const struct design_matrix *X) */ c[i]->v_info = xnmalloc (c[i]->n_vars, sizeof (*c[i]->v_info)); assert (c[i]->v_info != NULL); - c[i]->v_info->v = - (const struct variable *) design_matrix_col_to_var (X, i); + c[i]->v_info->v = design_matrix_col_to_var (X, i); if (var_is_alpha (c[i]->v_info->v)) { @@ -79,7 +78,7 @@ pspp_coeff_init (struct pspp_coeff ** c, const struct design_matrix *X) assert (k <= i); k = i - k; c[i]->v_info->val = - cat_subscript_to_value (k, (struct variable *) c[i]->v_info->v); + cat_subscript_to_value (k, c[i]->v_info->v); } } } diff --git a/src/math/design-matrix.c b/src/math/design-matrix.c index 0f5242d5..163c42fa 100644 --- a/src/math/design-matrix.c +++ b/src/math/design-matrix.c @@ -42,57 +42,6 @@ #define DM_COLUMN_NOT_FOUND -1 #define DM_INDEX_NOT_FOUND -3 -/* - Which element of a vector is equal to the value x? - */ -static size_t -cat_which_element_eq (const gsl_vector * vec, double x) -{ - size_t i; - - for (i = 0; i < vec->size; i++) - { - if (fabs (gsl_vector_get (vec, i) - x) < GSL_DBL_EPSILON) - { - return i; - } - } - return CAT_VALUE_NOT_FOUND; -} -static int -cat_is_zero_vector (const gsl_vector * vec) -{ - size_t i; - - for (i = 0; i < vec->size; i++) - { - if (gsl_vector_get (vec, i) != 0.0) - { - return 0; - } - } - return 1; -} - -/* - Return the value of v corresponding to the vector vec. - */ -union value * -cat_vector_to_value (const gsl_vector * vec, struct variable *v) -{ - size_t i; - - i = cat_which_element_eq (vec, 1.0); - if (i != CAT_VALUE_NOT_FOUND) - { - return cat_subscript_to_value (i + 1, v); - } - if (cat_is_zero_vector (vec)) - { - return cat_subscript_to_value (0, v); - } - return NULL; -} struct design_matrix * design_matrix_create (int n_variables, @@ -123,10 +72,10 @@ design_matrix_create (int n_variables, } else if (var_is_alpha (v)) { - struct cat_vals *obs_vals = var_get_obs_vals (v); + size_t n_categories = cat_get_n_categories (v); (dm->vars + i)->last_column = - (dm->vars + i)->first_column + obs_vals->n_categories - 2; - n_cols += obs_vals->n_categories - 1; + (dm->vars + i)->first_column + n_categories - 2; + n_cols += n_categories - 1; } } dm->m = gsl_matrix_calloc (n_data, n_cols); @@ -147,7 +96,7 @@ design_matrix_destroy (struct design_matrix *dm) Return the index of the variable for the given column. */ -struct variable * +const struct variable * design_matrix_col_to_var (const struct design_matrix *dm, size_t col) { size_t i; @@ -157,7 +106,7 @@ design_matrix_col_to_var (const struct design_matrix *dm, size_t col) { v = dm->vars[i]; if (v.first_column <= col && col <= v.last_column) - return (struct variable *) v.v; + return v.v; } return NULL; } @@ -232,6 +181,7 @@ design_matrix_set_categorical (struct design_matrix *dm, size_t row, gsl_matrix_set (dm->m, row, col, entry); } } + void design_matrix_set_numeric (struct design_matrix *dm, size_t row, const struct variable *var, const union value *val) diff --git a/src/math/design-matrix.h b/src/math/design-matrix.h index ce17e596..2b6bae50 100644 --- a/src/math/design-matrix.h +++ b/src/math/design-matrix.h @@ -26,7 +26,7 @@ #include #include #include -#include + struct design_matrix_var { size_t first_column; /* First column for this variable in @@ -40,6 +40,7 @@ struct design_matrix_var size_t last_column; const struct variable *v; }; + struct design_matrix { gsl_matrix *m; @@ -61,7 +62,7 @@ struct design_matrix */ size_t n_vars; }; -union value *cat_vector_to_value (const gsl_vector *, struct variable *); + struct design_matrix *design_matrix_create (int, const struct variable *[], const size_t); @@ -73,12 +74,13 @@ void design_matrix_set_categorical (struct design_matrix *, size_t, const union value *); void design_matrix_set_numeric (struct design_matrix *, size_t, - const struct variable *, const union value *); + const struct variable *, + const union value *); size_t design_matrix_var_to_column (const struct design_matrix *, const struct variable *); -struct variable *design_matrix_col_to_var (const struct design_matrix *, +const struct variable *design_matrix_col_to_var (const struct design_matrix *, size_t); #endif diff --git a/src/math/linreg/linreg.c b/src/math/linreg/linreg.c index f4eea028..558f6646 100644 --- a/src/math/linreg/linreg.c +++ b/src/math/linreg/linreg.c @@ -95,7 +95,7 @@ linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp) The return value is the number of distinct variables found. */ int -pspp_linreg_get_vars (const void *c_, struct variable **v) +pspp_linreg_get_vars (const void *c_, const struct variable **v) { const pspp_linreg_cache *c = c_; struct pspp_coeff *coef = NULL; @@ -114,7 +114,7 @@ pspp_linreg_get_vars (const void *c_, struct variable **v) /* Start at c->coeff[1] to avoid the intercept. */ - v[result] = (struct variable *) pspp_coeff_get_var (c->coeff[1], 0); + v[result] = pspp_coeff_get_var (c->coeff[1], 0); result = (v[result] == NULL) ? 0 : 1; for (coef = c->coeff[2]; coef < c->coeff[c->n_coeffs]; coef++) @@ -130,7 +130,7 @@ pspp_linreg_get_vars (const void *c_, struct variable **v) } if (i < 0 && result < c->n_coeffs) { - v[result] = (struct variable *) tmp; + v[result] = tmp; result++; } } diff --git a/src/math/linreg/linreg.h b/src/math/linreg/linreg.h index c5f5ef10..28ab650e 100644 --- a/src/math/linreg/linreg.h +++ b/src/math/linreg/linreg.h @@ -165,7 +165,7 @@ struct pspp_linreg_cache_struct /* Returns pointers to the variables used in the model. */ - int (*get_vars) (const void *, struct variable **); + int (*get_vars) (const void *, const struct variable **); struct variable *resid; struct variable *pred; @@ -202,5 +202,5 @@ pspp_linreg_residual (const struct variable **, const union value **, /* All variables used in the model. */ -int pspp_linreg_get_vars (const void *, struct variable **); +int pspp_linreg_get_vars (const void *, const struct variable **); #endif -- 2.30.2