02110-1301, USA. */
/*
- Functions and data structures to recode categorical variables into
- vectors and sub-rows of matrices.
+ Functions and data structures to store values of a categorical
+ variable, and to recode those values into binary vectors.
For some statistical models, it is necessary to change each value
of a categorical variable to a vector with binary entries. These
vectors are then stored as sub-rows within a matrix during
- model-fitting. E.g., we need functions and data strucutres to map a
+ model-fitting. For example, we need functions and data strucutres to map a
value, say 'a', of a variable named 'cat_var', to a vector, say (0
1 0 0 0), and vice versa. We also need to be able to map the
vector back to the value 'a', and if the vector is a sub-row of a
matrix, we need to know which sub-row corresponds to the variable
'cat_var'.
-
- The data structures defined here will be placed in the variable
- structure in the future. When that happens, the useful code
- in this file will be that which refers to design matrices.
*/
#include <config.h>
#include <stdlib.h>
#include <gsl/gsl_matrix.h>
#define N_INITIAL_CATEGORIES 1
-#define CR_COLUMN_NOT_FOUND -1
-#define CR_VALUE_NOT_FOUND -2
-#define CR_INDEX_NOT_FOUND -3
-
-static gsl_vector_view cr_value_to_vector (const union value *,
- struct recoded_categorical *);
-
-struct recoded_categorical *
-cr_recoded_categorical_create (const struct variable *v)
-{
- struct recoded_categorical *rc;
-
- rc = xmalloc (sizeof (*rc));
- rc->v = v;
- rc->n_categories = 0;
- rc->n_allocated_categories = N_INITIAL_CATEGORIES;
- rc->vals = xmalloc (N_INITIAL_CATEGORIES *
- sizeof (*rc->vals));
-
- return rc;
-}
+#define CAT_COLUMN_NOT_FOUND -1
+#define CAT_VALUE_NOT_FOUND -2
+#define CAT_INDEX_NOT_FOUND -3
void
-cr_recoded_categorical_destroy (struct recoded_categorical *r)
-{
- free (r->vals);
- free (r);
-}
-
-struct recoded_categorical_array *
-cr_recoded_cat_ar_create (int n_variables, struct variable *v_variables[])
+cat_stored_values_create (struct variable *v)
{
- size_t n_categoricals = 0;
- size_t i;
- struct recoded_categorical_array *ca;
- struct variable *v;
-
- ca = (struct recoded_categorical_array *) xmalloc (sizeof (*ca));
- for (i = 0; i < n_variables; i++)
- {
- v = v_variables[i];
- if (v->type == ALPHA)
- {
- n_categoricals++;
- }
- }
- ca->n_vars = n_categoricals;
- ca->a = xmalloc (n_categoricals * sizeof (*(ca->a)));
- for (i = 0; i < n_categoricals; i++)
+ if (v->obs_vals == NULL)
{
- *(ca->a + i) = cr_recoded_categorical_create (v_variables[i]);
+ v->obs_vals = xmalloc (sizeof (*v->obs_vals));
+ v->obs_vals->n_categories = 0;
+ v->obs_vals->n_allocated_categories = N_INITIAL_CATEGORIES;
+ v->obs_vals->vals =
+ xnmalloc (N_INITIAL_CATEGORIES, sizeof *v->obs_vals->vals);
}
-
- return ca;
}
-int
-cr_free_recoded_array (struct recoded_categorical_array *r)
+void
+cat_stored_values_destroy (struct variable *v)
{
- int rc = 0;
- size_t i;
-
- for (i = 0; i < r->n_vars; i++)
+ assert (v != NULL);
+ if (v->obs_vals != NULL)
{
- cr_recoded_categorical_destroy (*(r->a + i));
+ free (v->obs_vals);
}
- return rc;
}
static size_t
-cr_value_find (struct recoded_categorical *rc, const union value *v)
+cat_value_find (const struct variable *v, const union value *val)
{
size_t i;
- const union value *val;
+ const union value *candidate;
- for (i = 0; i < rc->n_categories; i++)
+ assert (val != NULL);
+ assert (v != NULL);
+ assert (v->obs_vals != NULL);
+ for (i = 0; i < v->obs_vals->n_categories; i++)
{
- val = rc->vals + i;
- if (!compare_values (val, v, rc->v->width))
+ candidate = v->obs_vals->vals + i;
+ assert (candidate != NULL);
+ if (!compare_values (candidate, val, v->width))
{
return i;
}
}
- return CR_VALUE_NOT_FOUND;
+ return CAT_VALUE_NOT_FOUND;
}
/*
Add the new value unless it is already present.
*/
void
-cr_value_update (struct recoded_categorical *rc, const union value *v)
+cat_value_update (struct variable *v, const union value *val)
{
- if (cr_value_find (rc, v) == CR_VALUE_NOT_FOUND)
+ struct cat_vals *cv;
+
+ if (v->type == ALPHA)
{
- if (rc->n_categories >= rc->n_allocated_categories)
+ assert (val != NULL);
+ assert (v != NULL);
+ cv = v->obs_vals;
+ if (cat_value_find (v, val) == CAT_VALUE_NOT_FOUND)
{
- rc->n_allocated_categories *= 2;
- rc->vals = xrealloc (rc->vals, rc->n_allocated_categories
- * sizeof (*(rc->vals)));
+ if (cv->n_categories >= cv->n_allocated_categories)
+ {
+ cv->n_allocated_categories *= 2;
+ cv->vals = xnrealloc (cv->vals,
+ cv->n_allocated_categories,
+ sizeof *cv->vals);
+ }
+ cv->vals[cv->n_categories] = *val;
+ cv->n_categories++;
}
- rc->vals[rc->n_categories] = *v;
- rc->n_categories++;
}
}
/*
- Create a set of gsl_matrix's, each of whose rows correspond to
- values of a categorical variable. Since n categories have n-1
- degrees of freedom, the gsl_matrix is n-by-(n-1), with the first
- category encoded as the zero vector.
+ Return the subscript of the binary vector corresponding
+ to this value.
*/
-void
-cr_create_value_matrices (struct recoded_categorical_array *r)
-{
- size_t i;
- size_t row;
- size_t col;
- size_t n_rows;
- size_t n_cols;
-
- for (i = 0; i < r->n_vars; i++)
- {
- n_rows = (*(r->a + i))->n_categories;
- n_cols = (*(r->a + i))->n_categories - 1;
- (*(r->a + i))->m = gsl_matrix_calloc (n_rows, n_cols);
- for (row = 1; row < n_rows; row++)
- {
- col = row - 1;
- gsl_matrix_set ((*(r->a + i))->m, row, col, 1.0);
- }
- }
-}
-
-static size_t
-cr_value_to_subscript (const union value *val, struct recoded_categorical *cr)
+size_t
+cat_value_to_subscript (const union value *val, struct variable *v)
{
- const union value *v;
+ const union value *val2;
size_t subscript;
int different;
- subscript = cr->n_categories - 1;
+ assert (v != NULL);
+ assert (val != NULL);
+ assert (v->obs_vals != NULL);
+ subscript = v->obs_vals->n_categories - 1;
while (subscript > 0)
{
- v = cr->vals + subscript;
- different = compare_values (val, v, cr->v->width);
+ val2 = v->obs_vals->vals + subscript;
+ assert (val2 != NULL);
+ different = compare_values (val, val2, v->width);
if (!different)
{
return subscript;
return subscript;
}
-static union value *
-cr_subscript_to_value (const size_t s, struct recoded_categorical *cr)
+union value *
+cat_subscript_to_value (const size_t s, struct variable *v)
{
- if (s < cr->n_categories)
+ assert (v->obs_vals != NULL);
+ if (s < v->obs_vals->n_categories)
{
- return (cr->vals + s);
+ return (v->obs_vals->vals + s);
}
else
{
}
}
-/*
- Return the row of the matrix corresponding
- to the value v.
- */
-static gsl_vector_view
-cr_value_to_vector (const union value * v, struct recoded_categorical * cr)
-{
- size_t row;
- row = cr_value_to_subscript (v, cr);
- return gsl_matrix_row (cr->m, row);
-}
-
/*
Which element of a vector is equal to the value x?
*/
static size_t
-cr_which_element_eq (const gsl_vector * vec, double x)
+cat_which_element_eq (const gsl_vector * vec, double x)
{
size_t i;
return i;
}
}
- return CR_VALUE_NOT_FOUND;
+ return CAT_VALUE_NOT_FOUND;
}
static int
-cr_is_zero_vector (const gsl_vector * vec)
+cat_is_zero_vector (const gsl_vector * vec)
{
size_t i;
}
/*
- Return the value corresponding to the vector.
- To avoid searching the matrix, this routine takes
- advantage of the fact that element (i,i+1) is 1
- when i is between 1 and cr->n_categories - 1 and
- i is 0 otherwise.
+ Return the value of v corresponding to the vector vec.
*/
union value *
-cr_vector_to_value (const gsl_vector * vec, struct recoded_categorical *cr)
+cat_vector_to_value (const gsl_vector * vec, struct variable *v)
{
size_t i;
- i = cr_which_element_eq (vec, 1.0);
- if (i != CR_VALUE_NOT_FOUND)
+ i = cat_which_element_eq (vec, 1.0);
+ if (i != CAT_VALUE_NOT_FOUND)
{
- return cr_subscript_to_value (i + 1, cr);
+ return cat_subscript_to_value (i + 1, v);
}
- if (cr_is_zero_vector (vec))
- {
- return cr_subscript_to_value (0, cr);
- }
- return NULL;
-}
-
-/*
- Given a variable, return a pointer to its recoded
- structure. BUSTED IN HERE.
- */
-struct recoded_categorical *
-cr_var_to_recoded_categorical (const struct variable *v,
- struct recoded_categorical_array *ca)
-{
- struct recoded_categorical *rc;
- size_t i;
-
- for (i = 0; i < ca->n_vars; i++)
+ if (cat_is_zero_vector (vec))
{
- rc = *(ca->a + i);
- if (rc->v->index == v->index)
- {
- return rc;
- }
+ return cat_subscript_to_value (0, v);
}
return NULL;
}
struct design_matrix *
design_matrix_create (int n_variables,
const struct variable *v_variables[],
- struct recoded_categorical_array *ca,
const size_t n_data)
{
struct design_matrix *dm;
- struct design_matrix_var *tmp;
- struct recoded_categorical *rc;
const struct variable *v;
size_t i;
size_t n_cols = 0;
size_t col;
- dm = xmalloc (sizeof (*dm));
- dm->vars = xmalloc (n_variables * sizeof (struct variable *));
+ dm = xmalloc (sizeof *dm);
+ dm->vars = xnmalloc (n_variables, sizeof *dm->vars);
dm->n_vars = n_variables;
for (i = 0; i < n_variables; i++)
{
v = v_variables[i];
+ assert ((dm->vars + i) != NULL);
+ (dm->vars + i)->v = v; /* Allows us to look up the variable from
+ the design matrix. */
+ (dm->vars + i)->first_column = n_cols;
if (v->type == NUMERIC)
{
n_cols++;
+ (dm->vars + i)->last_column = n_cols;
}
else if (v->type == ALPHA)
{
- assert (ca != NULL);
- rc = cr_var_to_recoded_categorical (v, ca);
- assert (rc != NULL);
- rc->first_column = n_cols;
- rc->last_column = rc->first_column + rc->n_categories - 2;
- n_cols += rc->n_categories - 1;
+ assert (v->obs_vals != NULL);
+ (dm->vars + i)->last_column =
+ (dm->vars + i)->first_column + v->obs_vals->n_categories - 2;
+ n_cols += v->obs_vals->n_categories - 1;
}
}
dm->m = gsl_matrix_calloc (n_data, n_cols);
- dm->vars = xmalloc (dm->n_vars * sizeof (*(dm->vars)));
- assert (dm->vars != NULL);
col = 0;
- for (i = 0; i < n_variables; i++)
- {
- v = v_variables[i];
- (dm->vars[i]).v = v;
- if (v->type == NUMERIC)
- {
- tmp = &(dm->vars[col]);
- tmp->v = v;
- tmp->first_column = col;
- col++;
- }
- else if (v->type == ALPHA)
- {
- assert (ca != NULL);
- rc = cr_var_to_recoded_categorical (v, ca);
- assert (rc != NULL);
- tmp = &(dm->vars[col]);
- tmp->v = v;
- tmp->last_column = rc->last_column;
- col = rc->last_column + 1;
- }
- }
return dm;
}
if (v.first_column <= col && col <= v.last_column)
return (v.v)->index;
}
- return CR_INDEX_NOT_FOUND;
+ return CAT_INDEX_NOT_FOUND;
}
/*
dmv = dm->vars[i];
if ((dmv.v)->index == index)
{
- return dmv.v;
+ return (struct variable *) dmv.v;
}
}
return NULL;
}
/*
- Return the number of the first column storing the
+ Return the number of the first column which holds the
values for variable v.
*/
size_t
return tmp.first_column;
}
}
- return CR_COLUMN_NOT_FOUND;
+ return CAT_COLUMN_NOT_FOUND;
+}
+
+/* Last column. */
+static size_t
+dm_var_to_last_column (const struct design_matrix *dm,
+ const struct variable *v)
+{
+ size_t i;
+ struct design_matrix_var tmp;
+
+ for (i = 0; i < dm->n_vars; i++)
+ {
+ tmp = dm->vars[i];
+ if (cmp_dm_var_index (&tmp, v->index))
+ {
+ return tmp.last_column;
+ }
+ }
+ return CAT_COLUMN_NOT_FOUND;
}
/*
Set the appropriate value in the design matrix,
whether that value is from a categorical or numeric
- variable.
+ variable. For a categorical variable, only the usual
+ binary encoding is allowed.
*/
void
design_matrix_set_categorical (struct design_matrix *dm, size_t row,
const struct variable *var,
- const union value *val,
- struct recoded_categorical *rc)
+ const union value *val)
{
size_t col;
- double x;
+ size_t is_one;
+ size_t fc;
+ size_t lc;
+ double entry;
assert (var->type == ALPHA);
- const gsl_vector_view vec = cr_value_to_vector (val, rc);
-
- /*
- Copying values here is not the 'most efficient' way,
- but it will work even if we change the vector encoding later.
- */
- for (col = rc->first_column; col <= rc->last_column; col++)
+ fc = design_matrix_var_to_column (dm, var);
+ lc = dm_var_to_last_column (dm, var);
+ assert (lc != CAT_COLUMN_NOT_FOUND);
+ assert (fc != CAT_COLUMN_NOT_FOUND);
+ is_one = fc + cat_value_find (var, val);
+ for (col = fc; col <= lc; col++)
{
- x = gsl_vector_get (&vec.vector, col);
- gsl_matrix_set (dm->m, row, col, x);
+ entry = (col == is_one) ? 1.0 : 0.0;
+ gsl_matrix_set (dm->m, row, col, entry);
}
}
void
assert (var->type == NUMERIC);
col = design_matrix_var_to_column ((const struct design_matrix *) dm, var);
- assert (col != CR_COLUMN_NOT_FOUND);
+ assert (col != CAT_COLUMN_NOT_FOUND);
gsl_matrix_set (dm->m, row, col, val->f);
}