X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fcat.c;h=c4bd17a85d16d459f0da83cff9458edc3e892ba3;hb=d594340cb1ae007a92d094ae67116f9f622f2b5d;hp=934a4d31c9d50a8f3cbc74e8155d4af6a3e1d4ec;hpb=494a50b0d023c55b61caf33e3887379332166d07;p=pspp-builds.git diff --git a/src/cat.c b/src/cat.c index 934a4d31..c4bd17a8 100644 --- a/src/cat.c +++ b/src/cat.c @@ -1,4 +1,4 @@ -/* PSPP - linear regression. +/* PSPP - binary encodings for categorical variables. Copyright (C) 2005 Free Software Foundation, Inc. Written by Jason H Stover . @@ -18,477 +18,115 @@ 02110-1301, USA. */ /* - Functions and data structures to recode categorical variables into - vectors and sub-rows of matrices. + Functions and data structures to store values of a categorical + variable, and to recode those values into binary vectors. For some statistical models, it is necessary to change each value of a categorical variable to a vector with binary entries. These vectors are then stored as sub-rows within a matrix during - model-fitting. E.g., we need functions and data strucutres to map a + model-fitting. For example, we need functions and data structures to map a value, say 'a', of a variable named 'cat_var', to a vector, say (0 1 0 0 0), and vice versa. We also need to be able to map the vector back to the value 'a', and if the vector is a sub-row of a matrix, we need to know which sub-row corresponds to the variable 'cat_var'. - - The data structures defined here will be placed in the variable - structure in the future. When that happens, the useful code - in this file will be that which refers to design matrices. 
*/ #include #include #include #include "alloc.h" #include "error.h" -#include "var.h" #include "cat.h" +#include "cat-routines.h" #include -#include -#include -#include -#include #define N_INITIAL_CATEGORIES 1 -#define CR_COLUMN_NOT_FOUND -1 -#define CR_VALUE_NOT_FOUND -2 -#define CR_INDEX_NOT_FOUND -3 - -static gsl_vector_const_view cr_value_to_vector (const union value *, - struct recoded_categorical *); - -struct recoded_categorical * -cr_recoded_categorical_create (const struct variable *v) -{ - struct recoded_categorical *rc; - - rc = xmalloc (sizeof (*rc)); - rc->v = v; - rc->n_categories = 0; - rc->n_allocated_categories = N_INITIAL_CATEGORIES; - rc->vals = (union value **) xmalloc (N_INITIAL_CATEGORIES * - sizeof (*rc->vals)); - - return rc; -} void -cr_recoded_categorical_destroy (struct recoded_categorical *r) +cat_stored_values_create (struct variable *v) { - free (r->vals); - free (r); -} - -struct recoded_categorical_array * -cr_recoded_cat_ar_create (int n_variables, struct variable *v_variables[]) -{ - size_t n_categoricals = 0; - size_t i; - struct recoded_categorical_array *ca; - struct variable *v; - - ca = (struct recoded_categorical_array *) xmalloc (sizeof (*ca)); - for (i = 0; i < n_variables; i++) - { - v = v_variables[i]; - if (v->type == ALPHA) - { - n_categoricals++; - } - } - ca->n_vars = n_categoricals; - ca->a = xmalloc (n_categoricals * sizeof (*(ca->a))); - for (i = 0; i < n_categoricals; i++) + if (v->obs_vals == NULL) { - *(ca->a + i) = cr_recoded_categorical_create (v_variables[i]); + v->obs_vals = xmalloc (sizeof (*v->obs_vals)); + v->obs_vals->n_categories = 0; + v->obs_vals->n_allocated_categories = N_INITIAL_CATEGORIES; + v->obs_vals->vals = + xnmalloc (N_INITIAL_CATEGORIES, sizeof *v->obs_vals->vals); } - - return ca; } -int -cr_free_recoded_array (struct recoded_categorical_array *r) -{ - int rc = 0; - size_t i; - - for (i = 0; i < r->n_vars; i++) - { - cr_recoded_categorical_destroy (*(r->a + i)); - } - return rc; -} - 
-static size_t -cr_value_find (struct recoded_categorical *rc, const union value *v) -{ - size_t i; - const union value *val; - - for (i = 0; i < rc->n_categories; i++) - { - val = *(rc->vals + i); - if (!compare_values (val, v, rc->v->width)) - { - return i; - } - } - return CR_VALUE_NOT_FOUND; -} - -/* - Add the new value unless it is already present. - */ void -cr_value_update (struct recoded_categorical *rc, const union value *v) +cat_stored_values_destroy (struct variable *v) { - if (cr_value_find (rc, v) == CR_VALUE_NOT_FOUND) + assert (v != NULL); + if (v->obs_vals != NULL) { - if (rc->n_categories >= rc->n_allocated_categories) - { - rc->n_allocated_categories *= 2; - rc->vals = (union value **) - xrealloc (rc->vals, rc->n_allocated_categories - * sizeof (*(rc->vals))); - } - *(rc->vals + rc->n_categories) = v; - rc->n_categories++; + free (v->obs_vals); } } /* - Create a set of gsl_matrix's, each of whose rows correspond to - values of a categorical variable. Since n categories have n-1 - degrees of freedom, the gsl_matrix is n-by-(n-1), with the first - category encoded as the zero vector. + Which subscript corresponds to val? 
*/ -void -cr_create_value_matrices (struct recoded_categorical_array *r) -{ - size_t i; - size_t row; - size_t col; - size_t n_rows; - size_t n_cols; - - for (i = 0; i < r->n_vars; i++) - { - n_rows = (*(r->a + i))->n_categories; - n_cols = (*(r->a + i))->n_categories - 1; - (*(r->a + i))->m = gsl_matrix_calloc (n_rows, n_cols); - for (row = 1; row < n_rows; row++) - { - col = row - 1; - gsl_matrix_set ((*(r->a + i))->m, row, col, 1.0); - } - } -} - -static size_t -cr_value_to_subscript (const union value *val, struct recoded_categorical *cr) -{ - const union value *v; - size_t subscript; - int different; - - subscript = cr->n_categories - 1; - while (subscript > 0) - { - v = *(cr->vals + subscript); - different = compare_values (val, v, cr->v->width); - if (!different) - { - return subscript; - } - subscript--; - } - return subscript; -} - -static const union value * -cr_subscript_to_value (const size_t s, struct recoded_categorical *cr) -{ - if (s < cr->n_categories) - { - return cr->vals[s]; - } - else - { - return NULL; - } -} - -/* - Return the row of the matrix corresponding - to the value v. - */ -static gsl_vector_const_view -cr_value_to_vector (const union value * v, struct recoded_categorical * cr) -{ - size_t row; - row = cr_value_to_subscript (v, cr); - return gsl_matrix_const_row (cr->m, row); -} - -/* - Which element of a vector is equal to the value x? 
- */ -static size_t -cr_which_element_eq (const gsl_vector * vec, double x) +size_t +cat_value_find (const struct variable *v, const union value *val) { size_t i; + const union value *candidate; - for (i = 0; i < vec->size; i++) + assert (val != NULL); + assert (v != NULL); + assert (v->obs_vals != NULL); + for (i = 0; i < v->obs_vals->n_categories; i++) { - if (fabs (gsl_vector_get (vec, i) - x) < GSL_DBL_EPSILON) + candidate = v->obs_vals->vals + i; + assert (candidate != NULL); + if (!compare_values (candidate, val, v->width)) { return i; } } - return CR_VALUE_NOT_FOUND; -} -static int -cr_is_zero_vector (const gsl_vector * vec) -{ - size_t i; - - for (i = 0; i < vec->size; i++) - { - if (gsl_vector_get (vec, i) != 0.0) - { - return 0; - } - } - return 1; -} - -/* - Return the value corresponding to the vector. - To avoid searching the matrix, this routine takes - advantage of the fact that element (i,i+1) is 1 - when i is between 1 and cr->n_categories - 1 and - i is 0 otherwise. - */ -const union value * -cr_vector_to_value (const gsl_vector * vec, struct recoded_categorical *cr) -{ - size_t i; - - i = cr_which_element_eq (vec, 1.0); - if (i != CR_VALUE_NOT_FOUND) - { - return cr_subscript_to_value (i + 1, cr); - } - if (cr_is_zero_vector (vec)) - { - return cr_subscript_to_value (0, cr); - } - return NULL; + return CAT_VALUE_NOT_FOUND; } /* - Given a variable, return a pointer to its recoded - structure. BUSTED IN HERE. + Add the new value unless it is already present. 
*/ -struct recoded_categorical * -cr_var_to_recoded_categorical (const struct variable *v, - struct recoded_categorical_array *ca) -{ - struct recoded_categorical *rc; - size_t i; - - for (i = 0; i < ca->n_vars; i++) - { - rc = *(ca->a + i); - if (rc->v->index == v->index) - { - return rc; - } - } - return NULL; -} - -struct design_matrix * -design_matrix_create (int n_variables, - const struct variable *v_variables[], - struct recoded_categorical_array *ca, - const size_t n_data) -{ - struct design_matrix *dm; - struct design_matrix_var *tmp; - struct recoded_categorical *rc; - const struct variable *v; - size_t i; - size_t n_cols = 0; - size_t col; - - dm = xmalloc (sizeof (*dm)); - dm->vars = xmalloc (n_variables * sizeof (struct variable *)); - dm->n_vars = n_variables; - - for (i = 0; i < n_variables; i++) - { - v = v_variables[i]; - if (v->type == NUMERIC) - { - n_cols++; - } - else if (v->type == ALPHA) - { - assert (ca != NULL); - rc = cr_var_to_recoded_categorical (v, ca); - assert (rc != NULL); - rc->first_column = n_cols; - rc->last_column = rc->first_column + rc->n_categories - 2; - n_cols += rc->n_categories - 1; - } - } - dm->m = gsl_matrix_calloc (n_data, n_cols); - dm->vars = xmalloc (dm->n_vars * sizeof (*(dm->vars))); - assert (dm->vars != NULL); - col = 0; - - for (i = 0; i < n_variables; i++) - { - v = v_variables[i]; - (dm->vars[i]).v = v; - if (v->type == NUMERIC) - { - tmp = &(dm->vars[col]); - tmp->v = v; - tmp->first_column = col; - col++; - } - else if (v->type == ALPHA) - { - assert (ca != NULL); - rc = cr_var_to_recoded_categorical (v, ca); - assert (rc != NULL); - tmp = &(dm->vars[col]); - tmp->v = v; - tmp->last_column = rc->last_column; - col = rc->last_column + 1; - } - } - return dm; -} - void -design_matrix_destroy (struct design_matrix *dm) -{ - free (dm->vars); - gsl_matrix_free (dm->m); - free (dm); -} - -/* - Return the index of the variable for the - given column. 
- */ -static const size_t -design_matrix_col_to_var_index (const struct design_matrix *dm, size_t col) -{ - size_t i; - struct design_matrix_var v; - - for (i = 0; i < dm->n_vars; i++) - { - v = dm->vars[i]; - if (v.first_column <= col && col <= v.last_column) - return (v.v)->index; - } - return CR_INDEX_NOT_FOUND; -} - -/* - Return a pointer to the variable whose values - are stored in column col. - */ -const struct variable * -design_matrix_col_to_var (const struct design_matrix *dm, size_t col) +cat_value_update (struct variable *v, const union value *val) { - size_t index; - size_t i; - struct design_matrix_var dmv; - const struct variable *v; + struct cat_vals *cv; - index = design_matrix_col_to_var_index (dm, col); - for (i = 0; i < dm->n_vars; i++) + if (v->type == ALPHA) { - dmv = dm->vars[i]; - v = (dmv.v)->index; - if (v->index == index) + assert (val != NULL); + assert (v != NULL); + cv = v->obs_vals; + if (cat_value_find (v, val) == CAT_VALUE_NOT_FOUND) { - return v; + if (cv->n_categories >= cv->n_allocated_categories) + { + cv->n_allocated_categories *= 2; + cv->vals = xnrealloc (cv->vals, + cv->n_allocated_categories, + sizeof *cv->vals); + } + cv->vals[cv->n_categories] = *val; + cv->n_categories++; } } - return NULL; } -static size_t -cmp_dm_var_index (const struct design_matrix_var *dmv, size_t index) -{ - if (dmv->v->index == index) - return 1; - return 0; -} - -/* - Return the number of the first column storing the - values for variable v. 
- */ -size_t -design_matrix_var_to_column (const struct design_matrix * dm, - const struct variable * v) +union value * +cat_subscript_to_value (const size_t s, struct variable *v) { - size_t i; - struct design_matrix_var tmp; - - for (i = 0; i < dm->n_vars; i++) + assert (v->obs_vals != NULL); + if (s < v->obs_vals->n_categories) { - tmp = dm->vars[i]; - if (cmp_dm_var_index (&tmp, v->index)) - { - return tmp.first_column; - } + return (v->obs_vals->vals + s); } - return CR_COLUMN_NOT_FOUND; -} - -/* - Set the appropriate value in the design matrix, - whether that value is from a categorical or numeric - variable. - */ -void -design_matrix_set_categorical (struct design_matrix *dm, size_t row, - const struct variable *var, - const union value *val, - struct recoded_categorical *rc) -{ - size_t col; - double x; - - assert (var->type == ALPHA); - gsl_vector_const_view vec = cr_value_to_vector (val, rc); - - /* - Copying values here is not the 'most efficient' way, - but it will work even if we change the vector encoding later. - */ - for (col = rc->first_column; col <= rc->last_column; col++) + else { - x = gsl_vector_get (&vec.vector, col); - gsl_matrix_set (dm->m, row, col, x); + return NULL; } } -void -design_matrix_set_numeric (struct design_matrix *dm, size_t row, - const struct variable *var, const union value *val) -{ - size_t col; - - assert (var->type == NUMERIC); - col = design_matrix_var_to_column ((const struct design_matrix *) dm, var); - assert (col != CR_COLUMN_NOT_FOUND); - gsl_matrix_set (dm->m, row, col, val->f); -}