1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 Functions and data structures to recode categorical variables into
22 vectors and sub-rows of matrices.
24 For some statistical models, it is necessary to change each value
25 of a categorical variable to a vector with binary entries. These
26 vectors are then stored as sub-rows within a matrix during
27 model-fitting. E.g., we need functions and data strucutres to map a
28 value, say 'a', of a variable named 'cat_var', to a vector, say (0
29 1 0 0 0), and vice versa. We also need to be able to map the
30 vector back to the value 'a', and if the vector is a sub-row of a
31 matrix, we need to know which sub-row corresponds to the variable
34 The data structures defined here will be placed in the variable
35 structure in the future. When that happens, the useful code
36 in this file will be that which refers to design matrices.
47 #include <gsl/gsl_machine.h>
48 #include <gsl/gsl_vector.h>
49 #include <gsl/gsl_matrix.h>
51 #define N_INITIAL_CATEGORIES 1
52 #define CR_COLUMN_NOT_FOUND -1
53 #define CR_VALUE_NOT_FOUND -2
54 #define CR_INDEX_NOT_FOUND -3
56 struct recoded_categorical *
57 cr_recoded_categorical_create (const struct variable *v)
59 struct recoded_categorical *rc;
61 rc = xmalloc (sizeof (*rc));
64 rc->n_allocated_categories = N_INITIAL_CATEGORIES;
65 rc->vals = (union value **) xmalloc (N_INITIAL_CATEGORIES *
72 cr_recoded_categorical_destroy (struct recoded_categorical *r)
78 struct recoded_categorical_array *
79 cr_recoded_cat_ar_create (int n_variables, struct variable *v_variables[])
81 size_t n_categoricals = 0;
83 struct recoded_categorical_array *ca;
86 ca = (struct recoded_categorical_array *) xmalloc (sizeof (*ca));
87 for (i = 0; i < n_variables; i++)
95 ca->n_vars = n_categoricals;
96 ca->a = xmalloc (n_categoricals * sizeof (*(ca->a)));
97 for (i = 0; i < n_categoricals; i++)
99 *(ca->a + i) = cr_recoded_categorical_create (v_variables[i]);
106 cr_free_recoded_array (struct recoded_categorical_array *r)
111 for (i = 0; i < r->n_vars; i++)
113 cr_recoded_categorical_destroy (*(r->a + i));
119 cr_value_find (struct recoded_categorical *rc, const union value *v)
122 const union value *val;
124 for (i = 0; i < rc->n_categories; i++)
126 val = *(rc->vals + i);
127 if (!compare_values (val, v, rc->v->width))
132 return CR_VALUE_NOT_FOUND;
136 Add the new value unless it is already present.
139 cr_value_update (struct recoded_categorical *rc, const union value *v)
141 if (cr_value_find (rc, v) == CR_VALUE_NOT_FOUND)
143 if (rc->n_categories >= rc->n_allocated_categories)
145 rc->n_allocated_categories *= 2;
146 rc->vals = (union value **)
147 xrealloc (rc->vals, rc->n_allocated_categories
148 * sizeof (*(rc->vals)));
150 *(rc->vals + rc->n_categories) = v;
156 Create a set of gsl_matrix's, each of whose rows correspond to
157 values of a categorical variable. Since n categories have n-1
158 degrees of freedom, the gsl_matrix is n-by-(n-1), with the first
159 category encoded as the zero vector.
162 cr_create_value_matrices (struct recoded_categorical_array *r)
170 for (i = 0; i < r->n_vars; i++)
172 n_rows = (*(r->a + i))->n_categories;
173 n_cols = (*(r->a + i))->n_categories - 1;
174 (*(r->a + i))->m = gsl_matrix_calloc (n_rows, n_cols);
175 for (row = 1; row < n_rows; row++)
178 gsl_matrix_set ((*(r->a + i))->m, row, col, 1.0);
184 cr_value_to_subscript (const union value *val, struct recoded_categorical *cr)
186 const union value *v;
190 subscript = cr->n_categories - 1;
191 while (subscript > 0)
193 v = *(cr->vals + subscript);
194 different = compare_values (val, v, cr->v->width);
204 static const union value *
205 cr_subscript_to_value (const size_t s, struct recoded_categorical *cr)
207 if (s < cr->n_categories)
218 Return the row of the matrix corresponding
221 gsl_vector_const_view
222 cr_value_to_vector (const union value * v, struct recoded_categorical * cr)
225 row = cr_value_to_subscript (v, cr);
226 return gsl_matrix_const_row (cr->m, row);
230 Which element of a vector is equal to the value x?
233 cr_which_element_eq (const gsl_vector * vec, double x)
237 for (i = 0; i < vec->size; i++)
239 if (fabs (gsl_vector_get (vec, i) - x) < GSL_DBL_EPSILON)
244 return CR_VALUE_NOT_FOUND;
247 cr_is_zero_vector (const gsl_vector * vec)
251 for (i = 0; i < vec->size; i++)
253 if (gsl_vector_get (vec, i) != 0.0)
262 Return the value corresponding to the vector.
263 To avoid searching the matrix, this routine takes
264 advantage of the fact that element (i,i+1) is 1
265 when i is between 1 and cr->n_categories - 1 and
269 cr_vector_to_value (const gsl_vector * vec, struct recoded_categorical *cr)
273 i = cr_which_element_eq (vec, 1.0);
274 if (i != CR_VALUE_NOT_FOUND)
276 return cr_subscript_to_value (i + 1, cr);
278 if (cr_is_zero_vector (vec))
280 return cr_subscript_to_value (0, cr);
286 Given a variable, return a pointer to its recoded
287 structure. BUSTED IN HERE.
289 struct recoded_categorical *
290 cr_var_to_recoded_categorical (const struct variable *v,
291 struct recoded_categorical_array *ca)
293 struct recoded_categorical *rc;
296 for (i = 0; i < ca->n_vars; i++)
299 if (rc->v->index == v->index)
307 struct design_matrix *
308 design_matrix_create (int n_variables,
309 const struct variable *v_variables[],
310 struct recoded_categorical_array *ca,
313 struct design_matrix *dm;
314 struct design_matrix_var *tmp;
315 struct recoded_categorical *rc;
316 const struct variable *v;
321 dm = xmalloc (sizeof (*dm));
322 dm->vars = xmalloc (n_variables * sizeof (struct variable *));
323 dm->n_vars = n_variables;
325 for (i = 0; i < n_variables; i++)
328 if (v->type == NUMERIC)
332 else if (v->type == ALPHA)
335 rc = cr_var_to_recoded_categorical (v, ca);
337 rc->first_column = n_cols;
338 rc->last_column = rc->first_column + rc->n_categories - 2;
339 n_cols += rc->n_categories - 1;
342 dm->m = gsl_matrix_calloc (n_data, n_cols);
343 dm->vars = xmalloc (dm->n_vars * sizeof (*(dm->vars)));
344 assert (dm->vars != NULL);
347 for (i = 0; i < n_variables; i++)
351 if (v->type == NUMERIC)
353 tmp = &(dm->vars[col]);
355 tmp->first_column = col;
358 else if (v->type == ALPHA)
361 rc = cr_var_to_recoded_categorical (v, ca);
363 tmp = &(dm->vars[col]);
365 tmp->last_column = rc->last_column;
366 col = rc->last_column + 1;
373 design_matrix_destroy (struct design_matrix *dm)
376 gsl_matrix_free (dm->m);
381 Return the index of the variable for the
385 design_matrix_col_to_var_index (const struct design_matrix *dm, size_t col)
388 struct design_matrix_var v;
390 for (i = 0; i < dm->n_vars; i++)
393 if (v.first_column <= col && col <= v.last_column)
396 return CR_INDEX_NOT_FOUND;
400 Return a pointer to the variable whose values
401 are stored in column col.
403 const struct variable *
404 design_matrix_col_to_var (const struct design_matrix *dm, size_t col)
408 struct design_matrix_var dmv;
409 const struct variable *v;
411 index = design_matrix_col_to_var_index (dm, col);
412 for (i = 0; i < dm->n_vars; i++)
416 if (v->index == index)
425 cmp_dm_var_index (const struct design_matrix_var *dmv, size_t index)
427 if (dmv->v->index == index)
433 Return the number of the first column storing the
434 values for variable v.
437 design_matrix_var_to_column (const struct design_matrix * dm,
438 const struct variable * v)
441 struct design_matrix_var tmp;
443 for (i = 0; i < dm->n_vars; i++)
446 if (cmp_dm_var_index (&tmp, v->index))
448 return tmp.first_column;
451 return CR_COLUMN_NOT_FOUND;
455 Set the appropriate value in the design matrix,
456 whether that value is from a categorical or numeric
460 design_matrix_set_categorical (struct design_matrix *dm, size_t row,
461 const struct variable *var,
462 const union value *val,
463 struct recoded_categorical *rc)
468 assert (var->type == ALPHA);
469 gsl_vector_const_view vec = cr_value_to_vector (val, rc);
472 Copying values here is not the 'most efficient' way,
473 but it will work even if we change the vector encoding later.
475 for (col = rc->first_column; col <= rc->last_column; col++)
477 x = gsl_vector_get (&vec.vector, col);
478 gsl_matrix_set (dm->m, row, col, x);
482 design_matrix_set_numeric (struct design_matrix *dm, size_t row,
483 const struct variable *var, const union value *val)
487 assert (var->type == NUMERIC);
488 col = design_matrix_var_to_column ((const struct design_matrix *) dm, var);
489 assert (col != CR_COLUMN_NOT_FOUND);
490 gsl_matrix_set (dm->m, row, col, val->f);