From: Jason Stover Date: Sun, 27 Nov 2005 20:25:51 +0000 (+0000) Subject: Split categorical encoding and design matrix routines X-Git-Tag: v0.6.0~1134 X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=95a7059c5c07aab50ff820bf695fe8ee144fa025;p=pspp-builds.git Split categorical encoding and design matrix routines --- diff --git a/src/Makefile.am b/src/Makefile.am index 28092820..fbd3ef20 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -84,6 +84,7 @@ pspp_SOURCES = \ casefile.h \ cat.c \ cat.h \ + cat-routines.h \ chart.c \ chart.h \ ctl-stack.c \ @@ -105,6 +106,8 @@ pspp_SOURCES = \ date.c \ debug-print.h \ descript.c \ + design-matrix.h \ + design-matrix.c \ dfm-read.c \ dfm-read.h \ dfm-write.c \ diff --git a/src/cat-routines.h b/src/cat-routines.h new file mode 100644 index 00000000..6842fab5 --- /dev/null +++ b/src/cat-routines.h @@ -0,0 +1,53 @@ +/* PSPP - Binary encodings for categorical variables. + Copyright (C) 2005 Free Software Foundation, Inc. + Written by Jason H Stover . + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. */ + +/* + Functions and data structures to recode categorical variables into + vectors and sub-rows of matrices. + + To fit many types of statistical models, it is necessary + to change each value of a categorical variable to a vector with binary + entries. 
These vectors are then stored as sub-rows within a matrix + during model-fitting. We need functions and data structures to, + e.g., map a value, say 'a', of a variable named 'cat_var', to a + vector, say (0 1 0 0 0), and vice versa. We also need to be able + to map the vector back to the value 'a', and if the vector is a + sub-row of a matrix, we need to know which sub-row corresponds to + the variable 'cat_var'. + + */ + +#ifndef CAT_ROUTINES_H +#define CAT_ROUTINES_H +#define CAT_VALUE_NOT_FOUND -2 +#include +#include "cat.h" + +size_t cat_value_find (const struct variable *, const union value *); + +union value *cat_subscript_to_value (const size_t, struct variable *); + +void cat_stored_values_create (struct variable *); + +void cat_value_update (struct variable *, const union value *); + +void cat_create_value_matrix (struct variable *); + +void cat_stored_values_destroy (struct variable *); +#endif diff --git a/src/cat.c b/src/cat.c index 3dc56576..c4bd17a8 100644 --- a/src/cat.c +++ b/src/cat.c @@ -1,4 +1,4 @@ -/* PSPP - linear regression. +/* PSPP - binary encodings for categorical variables. Copyright (C) 2005 Free Software Foundation, Inc. Written by Jason H Stover . @@ -36,18 +36,11 @@ #include #include "alloc.h" #include "error.h" -#include "var.h" #include "cat.h" +#include "cat-routines.h" #include -#include -#include -#include -#include #define N_INITIAL_CATEGORIES 1 -#define CAT_COLUMN_NOT_FOUND -1 -#define CAT_VALUE_NOT_FOUND -2 -#define CAT_INDEX_NOT_FOUND -3 void cat_stored_values_create (struct variable *v) @@ -72,7 +65,10 @@ cat_stored_values_destroy (struct variable *v) } } -static size_t +/* + Which subscript corresponds to val? + */ +size_t cat_value_find (const struct variable *v, const union value *val) { size_t i; @@ -121,35 +117,6 @@ cat_value_update (struct variable *v, const union value *val) } } -/* - Return the subscript of the binary vector corresponding - to this value. 
- */ -size_t -cat_value_to_subscript (const union value *val, struct variable *v) -{ - const union value *val2; - size_t subscript; - int different; - - assert (v != NULL); - assert (val != NULL); - assert (v->obs_vals != NULL); - subscript = v->obs_vals->n_categories - 1; - while (subscript > 0) - { - val2 = v->obs_vals->vals + subscript; - assert (val2 != NULL); - different = compare_values (val, val2, v->width); - if (!different) - { - return subscript; - } - subscript--; - } - return subscript; -} - union value * cat_subscript_to_value (const size_t s, struct variable *v) { @@ -163,236 +130,3 @@ cat_subscript_to_value (const size_t s, struct variable *v) return NULL; } } - -/* - Which element of a vector is equal to the value x? - */ -static size_t -cat_which_element_eq (const gsl_vector * vec, double x) -{ - size_t i; - - for (i = 0; i < vec->size; i++) - { - if (fabs (gsl_vector_get (vec, i) - x) < GSL_DBL_EPSILON) - { - return i; - } - } - return CAT_VALUE_NOT_FOUND; -} -static int -cat_is_zero_vector (const gsl_vector * vec) -{ - size_t i; - - for (i = 0; i < vec->size; i++) - { - if (gsl_vector_get (vec, i) != 0.0) - { - return 0; - } - } - return 1; -} - -/* - Return the value of v corresponding to the vector vec. 
- */ -union value * -cat_vector_to_value (const gsl_vector * vec, struct variable *v) -{ - size_t i; - - i = cat_which_element_eq (vec, 1.0); - if (i != CAT_VALUE_NOT_FOUND) - { - return cat_subscript_to_value (i + 1, v); - } - if (cat_is_zero_vector (vec)) - { - return cat_subscript_to_value (0, v); - } - return NULL; -} - -struct design_matrix * -design_matrix_create (int n_variables, - const struct variable *v_variables[], - const size_t n_data) -{ - struct design_matrix *dm; - const struct variable *v; - size_t i; - size_t n_cols = 0; - size_t col; - - dm = xmalloc (sizeof *dm); - dm->vars = xnmalloc (n_variables, sizeof *dm->vars); - dm->n_vars = n_variables; - - for (i = 0; i < n_variables; i++) - { - v = v_variables[i]; - assert ((dm->vars + i) != NULL); - (dm->vars + i)->v = v; /* Allows us to look up the variable from - the design matrix. */ - (dm->vars + i)->first_column = n_cols; - if (v->type == NUMERIC) - { - n_cols++; - (dm->vars + i)->last_column = n_cols; - } - else if (v->type == ALPHA) - { - assert (v->obs_vals != NULL); - (dm->vars + i)->last_column = - (dm->vars + i)->first_column + v->obs_vals->n_categories - 2; - n_cols += v->obs_vals->n_categories - 1; - } - } - dm->m = gsl_matrix_calloc (n_data, n_cols); - col = 0; - - return dm; -} - -void -design_matrix_destroy (struct design_matrix *dm) -{ - free (dm->vars); - gsl_matrix_free (dm->m); - free (dm); -} - -/* - Return the index of the variable for the - given column. - */ -static size_t -design_matrix_col_to_var_index (const struct design_matrix *dm, size_t col) -{ - size_t i; - struct design_matrix_var v; - - for (i = 0; i < dm->n_vars; i++) - { - v = dm->vars[i]; - if (v.first_column <= col && col <= v.last_column) - return (v.v)->index; - } - return CAT_INDEX_NOT_FOUND; -} - -/* - Return a pointer to the variable whose values - are stored in column col. 
- */ -struct variable * -design_matrix_col_to_var (const struct design_matrix *dm, size_t col) -{ - size_t index; - size_t i; - struct design_matrix_var dmv; - - index = design_matrix_col_to_var_index (dm, col); - for (i = 0; i < dm->n_vars; i++) - { - dmv = dm->vars[i]; - if ((dmv.v)->index == index) - { - return (struct variable *) dmv.v; - } - } - return NULL; -} - -static size_t -cmp_dm_var_index (const struct design_matrix_var *dmv, size_t index) -{ - if (dmv->v->index == index) - return 1; - return 0; -} - -/* - Return the number of the first column which holds the - values for variable v. - */ -size_t -design_matrix_var_to_column (const struct design_matrix * dm, - const struct variable * v) -{ - size_t i; - struct design_matrix_var tmp; - - for (i = 0; i < dm->n_vars; i++) - { - tmp = dm->vars[i]; - if (cmp_dm_var_index (&tmp, v->index)) - { - return tmp.first_column; - } - } - return CAT_COLUMN_NOT_FOUND; -} - -/* Last column. */ -static size_t -dm_var_to_last_column (const struct design_matrix *dm, - const struct variable *v) -{ - size_t i; - struct design_matrix_var tmp; - - for (i = 0; i < dm->n_vars; i++) - { - tmp = dm->vars[i]; - if (cmp_dm_var_index (&tmp, v->index)) - { - return tmp.last_column; - } - } - return CAT_COLUMN_NOT_FOUND; -} - -/* - Set the appropriate value in the design matrix, - whether that value is from a categorical or numeric - variable. For a categorical variable, only the usual - binary encoding is allowed. - */ -void -design_matrix_set_categorical (struct design_matrix *dm, size_t row, - const struct variable *var, - const union value *val) -{ - size_t col; - size_t is_one; - size_t fc; - size_t lc; - double entry; - - assert (var->type == ALPHA); - fc = design_matrix_var_to_column (dm, var); - lc = dm_var_to_last_column (dm, var); - assert (lc != CAT_COLUMN_NOT_FOUND); - assert (fc != CAT_COLUMN_NOT_FOUND); - is_one = fc + cat_value_find (var, val); - for (col = fc; col <= lc; col++) - { - entry = (col == is_one) ? 
1.0 : 0.0; - gsl_matrix_set (dm->m, row, col, entry); - } -} -void -design_matrix_set_numeric (struct design_matrix *dm, size_t row, - const struct variable *var, const union value *val) -{ - size_t col; - - assert (var->type == NUMERIC); - col = design_matrix_var_to_column ((const struct design_matrix *) dm, var); - assert (col != CAT_COLUMN_NOT_FOUND); - gsl_matrix_set (dm->m, row, col, val->f); -} diff --git a/src/cat.h b/src/cat.h index 314c3cde..47c4ba54 100644 --- a/src/cat.h +++ b/src/cat.h @@ -1,4 +1,4 @@ -/* PSPP - linear regression. +/* PSPP - Binary encodings for categorical variables. Copyright (C) 2005 Free Software Foundation, Inc. Written by Jason H Stover . @@ -31,16 +31,14 @@ sub-row of a matrix, we need to know which sub-row corresponds to the variable 'cat_var'. - The data structures defined here will be placed in the variable - structure in the future. When that happens, the useful code - in this file will be that which refers to design matrices. */ #ifndef CAT_H -#define CAT_H 1 - -#include +#define CAT_H +#define CAT_VALUE_NOT_FOUND -2 #include +#include "val.h" +#include "var.h" /* This structure contains the observed values of a categorical variable. @@ -55,104 +53,4 @@ struct cat_vals values stored. */ }; - -/* - There are usually multiple categorical variables to recode. Get rid - of this structure immediately when the variable structure has been - modified to contain the binary encoding. - */ -struct recoded_categorical_array -{ - struct recoded_categorical **a; - size_t n_vars; -}; -/* - The design matrix structure holds the design - matrix and an array to tell us which columns - correspond to which variables. This structure - is not restricted to categorical variables, and - perhaps should be moved to its own module. -*/ - -struct design_matrix_var -{ - int first_column; /* First column for this variable in - the design_matix. 
If this variable - is categorical, its values are - stored in multiple, contiguous - columns, as dictated by its vector - encoding in the variable's struct - recoded_categorical. - */ - int last_column; - const struct variable *v; -}; -struct design_matrix -{ - gsl_matrix *m; - struct design_matrix_var *vars; /* Element i corresponds to - the variable whose values - are stored in at least one - column of m. If that - variable is categorical - with more than two - categories, its values are - stored in multiple, - contiguous columns. The - variable's values are then - stored in the columns - first_column through - last_column of the - design_matrix_var - structure. - */ - size_t n_vars; -}; -union value *cat_vector_to_value (const gsl_vector *, struct variable *); - -void cat_stored_values_create (struct variable *); - -void cat_value_update (struct variable *, const union value *); - -int cat_free_recoded_array (struct recoded_categorical_array *); - -struct recoded_categorical_array *cr_recoded_cat_ar_create (int, - struct variable - *[]); - -void cat_recoded_categorical_create (struct variable *); - -void cat_create_value_matrix (struct variable *); - -struct recoded_categorical *cat_var_to_recoded_categorical (const struct - variable *, - struct - recoded_categorical_array - *); - -struct design_matrix *design_matrix_create (int, const struct variable *[], - const size_t); - -void design_matrix_destroy (struct design_matrix *); - -void design_matrix_set_categorical (struct design_matrix *, size_t, - const struct variable *, - const union value *); - -void design_matrix_set_numeric (struct design_matrix *, size_t, - const struct variable *, const union value *); - -size_t design_matrix_var_to_column (const struct design_matrix *, - const struct variable *); - -struct variable *design_matrix_col_to_var (const struct design_matrix *, - size_t); - -void -design_matrix_set (struct design_matrix *, size_t, - const struct variable *, const union value *, - struct 
recoded_categorical *); - -void cat_stored_values_destroy (struct variable *); - #endif diff --git a/src/design-matrix.c b/src/design-matrix.c new file mode 100644 index 00000000..5d513985 --- /dev/null +++ b/src/design-matrix.c @@ -0,0 +1,271 @@ +/* PSPP - Creates design-matrices. + Copyright (C) 2005 Free Software Foundation, Inc. + Written by Jason H Stover . + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. */ + +/* + Create design matrices for procedures that need them. +*/ +#include +#include +#include +#include "alloc.h" +#include "error.h" +#include "var.h" +#include "cat.h" +#include "design-matrix.h" +#include +#include +#include +#include +#include + +#define DM_COLUMN_NOT_FOUND -1 +#define DM_INDEX_NOT_FOUND -3 + +/* + Which element of a vector is equal to the value x? + */ +static size_t +cat_which_element_eq (const gsl_vector * vec, double x) +{ + size_t i; + + for (i = 0; i < vec->size; i++) + { + if (fabs (gsl_vector_get (vec, i) - x) < GSL_DBL_EPSILON) + { + return i; + } + } + return CAT_VALUE_NOT_FOUND; +} +static int +cat_is_zero_vector (const gsl_vector * vec) +{ + size_t i; + + for (i = 0; i < vec->size; i++) + { + if (gsl_vector_get (vec, i) != 0.0) + { + return 0; + } + } + return 1; +} + +/* + Return the value of v corresponding to the vector vec. 
+ */ +union value * +cat_vector_to_value (const gsl_vector * vec, struct variable *v) +{ + size_t i; + + i = cat_which_element_eq (vec, 1.0); + if (i != CAT_VALUE_NOT_FOUND) + { + return cat_subscript_to_value (i + 1, v); + } + if (cat_is_zero_vector (vec)) + { + return cat_subscript_to_value (0, v); + } + return NULL; +} + +struct design_matrix * +design_matrix_create (int n_variables, + const struct variable *v_variables[], + const size_t n_data) +{ + struct design_matrix *dm; + const struct variable *v; + size_t i; + size_t n_cols = 0; + size_t col; + + dm = xmalloc (sizeof *dm); + dm->vars = xnmalloc (n_variables, sizeof *dm->vars); + dm->n_vars = n_variables; + + for (i = 0; i < n_variables; i++) + { + v = v_variables[i]; + assert ((dm->vars + i) != NULL); + (dm->vars + i)->v = v; /* Allows us to look up the variable from + the design matrix. */ + (dm->vars + i)->first_column = n_cols; + if (v->type == NUMERIC) + { + n_cols++; + (dm->vars + i)->last_column = n_cols; + } + else if (v->type == ALPHA) + { + assert (v->obs_vals != NULL); + (dm->vars + i)->last_column = + (dm->vars + i)->first_column + v->obs_vals->n_categories - 2; + n_cols += v->obs_vals->n_categories - 1; + } + } + dm->m = gsl_matrix_calloc (n_data, n_cols); + col = 0; + + return dm; +} + +void +design_matrix_destroy (struct design_matrix *dm) +{ + free (dm->vars); + gsl_matrix_free (dm->m); + free (dm); +} + +/* + Return the index of the variable for the + given column. + */ +static size_t +design_matrix_col_to_var_index (const struct design_matrix *dm, size_t col) +{ + size_t i; + struct design_matrix_var v; + + for (i = 0; i < dm->n_vars; i++) + { + v = dm->vars[i]; + if (v.first_column <= col && col <= v.last_column) + return (v.v)->index; + } + return DM_INDEX_NOT_FOUND; +} + +/* + Return a pointer to the variable whose values + are stored in column col. 
+ */ +struct variable * +design_matrix_col_to_var (const struct design_matrix *dm, size_t col) +{ + size_t index; + size_t i; + struct design_matrix_var dmv; + + index = design_matrix_col_to_var_index (dm, col); + for (i = 0; i < dm->n_vars; i++) + { + dmv = dm->vars[i]; + if ((dmv.v)->index == index) + { + return (struct variable *) dmv.v; + } + } + return NULL; +} + +static size_t +cmp_dm_var_index (const struct design_matrix_var *dmv, size_t index) +{ + if (dmv->v->index == index) + return 1; + return 0; +} + +/* + Return the number of the first column which holds the + values for variable v. + */ +size_t +design_matrix_var_to_column (const struct design_matrix * dm, + const struct variable * v) +{ + size_t i; + struct design_matrix_var tmp; + + for (i = 0; i < dm->n_vars; i++) + { + tmp = dm->vars[i]; + if (cmp_dm_var_index (&tmp, v->index)) + { + return tmp.first_column; + } + } + return DM_COLUMN_NOT_FOUND; +} + +/* Last column. */ +static size_t +dm_var_to_last_column (const struct design_matrix *dm, + const struct variable *v) +{ + size_t i; + struct design_matrix_var tmp; + + for (i = 0; i < dm->n_vars; i++) + { + tmp = dm->vars[i]; + if (cmp_dm_var_index (&tmp, v->index)) + { + return tmp.last_column; + } + } + return DM_COLUMN_NOT_FOUND; +} + +/* + Set the appropriate value in the design matrix, + whether that value is from a categorical or numeric + variable. For a categorical variable, only the usual + binary encoding is allowed. + */ +void +design_matrix_set_categorical (struct design_matrix *dm, size_t row, + const struct variable *var, + const union value *val) +{ + size_t col; + size_t is_one; + size_t fc; + size_t lc; + double entry; + + assert (var->type == ALPHA); + fc = design_matrix_var_to_column (dm, var); + lc = dm_var_to_last_column (dm, var); + assert (lc != DM_COLUMN_NOT_FOUND); + assert (fc != DM_COLUMN_NOT_FOUND); + is_one = fc + cat_value_find (var, val); + for (col = fc; col <= lc; col++) + { + entry = (col == is_one) ? 
1.0 : 0.0; + gsl_matrix_set (dm->m, row, col, entry); + } +} +void +design_matrix_set_numeric (struct design_matrix *dm, size_t row, + const struct variable *var, const union value *val) +{ + size_t col; + + assert (var->type == NUMERIC); + col = design_matrix_var_to_column ((const struct design_matrix *) dm, var); + assert (col != DM_COLUMN_NOT_FOUND); + gsl_matrix_set (dm->m, row, col, val->f); +} diff --git a/src/design-matrix.h b/src/design-matrix.h new file mode 100644 index 00000000..55e4c73c --- /dev/null +++ b/src/design-matrix.h @@ -0,0 +1,85 @@ +/* PSPP - Creates design matrices. + Copyright (C) 2005 Free Software Foundation, Inc. + Written by Jason H Stover . + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. */ + +/* + Create design matrices for procedures that need them. + */ + +#ifndef DESIGN_MATRIX_H +#define DESIGN_MATRIX_H + +#include +#include +#include "cat.h" +#include "cat-routines.h" +struct design_matrix_var +{ + int first_column; /* First column for this variable in + the design_matrix. If this variable + is categorical, its values are + stored in multiple, contiguous + columns, as dictated by its vector + encoding in the variable's struct + cat_vals. 
+ */ + int last_column; + const struct variable *v; +}; +struct design_matrix +{ + gsl_matrix *m; + struct design_matrix_var *vars; /* Element i corresponds to + the variable whose values + are stored in at least one + column of m. If that + variable is categorical + with more than two + categories, its values are + stored in multiple, + contiguous columns. The + variable's values are then + stored in the columns + first_column through + last_column of the + design_matrix_var + structure. + */ + size_t n_vars; +}; +union value *cat_vector_to_value (const gsl_vector *, struct variable *); + +struct design_matrix *design_matrix_create (int, const struct variable *[], + const size_t); + +void design_matrix_destroy (struct design_matrix *); + +void design_matrix_set_categorical (struct design_matrix *, size_t, + const struct variable *, + const union value *); + +void design_matrix_set_numeric (struct design_matrix *, size_t, + const struct variable *, const union value *); + +size_t design_matrix_var_to_column (const struct design_matrix *, + const struct variable *); + +struct variable *design_matrix_col_to_var (const struct design_matrix *, + size_t); + +#endif diff --git a/src/regression.q b/src/regression.q index 196172c4..0f8e8532 100644 --- a/src/regression.q +++ b/src/regression.q @@ -26,7 +26,9 @@ #include "case.h" #include "casefile.h" #include "cat.h" +#include "cat-routines.h" #include "command.h" +#include "design-matrix.h" #include "dictionary.h" #include "error.h" #include "file-handle.h"