1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 Functions and data structures to recode categorical variables into
22 vectors and sub-rows of matrices.
24 For some statistical models, it is necessary to change each value
25 of a categorical variable to a vector with binary entries. These
26 vectors are then stored as sub-rows within a matrix during
27 model-fitting. E.g., we need functions and data strucutres to map a
28 value, say 'a', of a variable named 'cat_var', to a vector, say (0
29 1 0 0 0), and vice versa. We also need to be able to map the
30 vector back to the value 'a', and if the vector is a sub-row of a
31 matrix, we need to know which sub-row corresponds to the variable
34 The data structures defined here will be placed in the variable
35 structure in the future. When that happens, the useful code
36 in this file will be that which refers to design matrices.
47 #include <gsl/gsl_machine.h>
48 #include <gsl/gsl_vector.h>
49 #include <gsl/gsl_matrix.h>
51 #define N_INITIAL_CATEGORIES 1
52 #define CR_COLUMN_NOT_FOUND -1
53 #define CR_VALUE_NOT_FOUND -2
54 #define CR_INDEX_NOT_FOUND -3
56 static gsl_vector_view cr_value_to_vector (const union value *,
57 struct recoded_categorical *);
59 struct recoded_categorical *
60 cr_recoded_categorical_create (const struct variable *v)
62 struct recoded_categorical *rc;
64 rc = xmalloc (sizeof (*rc));
67 rc->n_allocated_categories = N_INITIAL_CATEGORIES;
68 rc->vals = xmalloc (N_INITIAL_CATEGORIES * sizeof (*rc->vals));
74 cr_recoded_categorical_destroy (struct recoded_categorical *r)
80 struct recoded_categorical_array *
81 cr_recoded_cat_ar_create (int n_variables, struct variable *v_variables[])
83 size_t n_categoricals = 0;
85 struct recoded_categorical_array *ca;
88 ca = (struct recoded_categorical_array *) xmalloc (sizeof (*ca));
89 for (i = 0; i < n_variables; i++)
97 ca->n_vars = n_categoricals;
98 ca->a = xmalloc (n_categoricals * sizeof (*(ca->a)));
99 for (i = 0; i < n_categoricals; i++)
101 *(ca->a + i) = cr_recoded_categorical_create (v_variables[i]);
108 cr_free_recoded_array (struct recoded_categorical_array *r)
113 for (i = 0; i < r->n_vars; i++)
115 cr_recoded_categorical_destroy (*(r->a + i));
121 cr_value_find (struct recoded_categorical *rc, const union value *v)
124 const union value *val;
126 for (i = 0; i < rc->n_categories; i++)
129 if (!compare_values (val, v, rc->v->width))
134 return CR_VALUE_NOT_FOUND;
138 Add the new value unless it is already present.
141 cr_value_update (struct recoded_categorical *rc, const union value *v)
143 if (cr_value_find (rc, v) == CR_VALUE_NOT_FOUND)
145 if (rc->n_categories >= rc->n_allocated_categories)
147 rc->n_allocated_categories *= 2;
148 rc->vals = xrealloc (rc->vals, rc->n_allocated_categories
149 * sizeof (*(rc->vals)));
151 rc->vals[rc->n_categories] = *v;
157 Create a set of gsl_matrix's, each of whose rows correspond to
158 values of a categorical variable. Since n categories have n-1
159 degrees of freedom, the gsl_matrix is n-by-(n-1), with the first
160 category encoded as the zero vector.
163 cr_create_value_matrices (struct recoded_categorical_array *r)
171 for (i = 0; i < r->n_vars; i++)
173 n_rows = (*(r->a + i))->n_categories;
174 n_cols = (*(r->a + i))->n_categories - 1;
175 (*(r->a + i))->m = gsl_matrix_calloc (n_rows, n_cols);
176 for (row = 1; row < n_rows; row++)
179 gsl_matrix_set ((*(r->a + i))->m, row, col, 1.0);
185 cr_value_to_subscript (const union value *val, struct recoded_categorical *cr)
187 const union value *v;
191 subscript = cr->n_categories - 1;
192 while (subscript > 0)
194 v = cr->vals + subscript;
195 different = compare_values (val, v, cr->v->width);
206 cr_subscript_to_value (const size_t s, struct recoded_categorical *cr)
208 if (s < cr->n_categories)
210 return (cr->vals + s);
219 Return the row of the matrix corresponding
222 static gsl_vector_view
223 cr_value_to_vector (const union value *v, struct recoded_categorical *cr)
226 row = cr_value_to_subscript (v, cr);
227 return gsl_matrix_row (cr->m, row);
231 Which element of a vector is equal to the value x?
234 cr_which_element_eq (const gsl_vector * vec, double x)
238 for (i = 0; i < vec->size; i++)
240 if (fabs (gsl_vector_get (vec, i) - x) < GSL_DBL_EPSILON)
245 return CR_VALUE_NOT_FOUND;
248 cr_is_zero_vector (const gsl_vector * vec)
252 for (i = 0; i < vec->size; i++)
254 if (gsl_vector_get (vec, i) != 0.0)
263 Return the value corresponding to the vector.
264 To avoid searching the matrix, this routine takes
265 advantage of the fact that element (i,i+1) is 1
266 when i is between 1 and cr->n_categories - 1 and
270 cr_vector_to_value (const gsl_vector * vec, struct recoded_categorical *cr)
274 i = cr_which_element_eq (vec, 1.0);
275 if (i != CR_VALUE_NOT_FOUND)
277 return cr_subscript_to_value (i + 1, cr);
279 if (cr_is_zero_vector (vec))
281 return cr_subscript_to_value (0, cr);
287 Given a variable, return a pointer to its recoded
288 structure. BUSTED IN HERE.
290 struct recoded_categorical *
291 cr_var_to_recoded_categorical (const struct variable *v,
292 struct recoded_categorical_array *ca)
294 struct recoded_categorical *rc;
297 for (i = 0; i < ca->n_vars; i++)
300 if (rc->v->index == v->index)
308 struct design_matrix *
309 design_matrix_create (int n_variables,
310 const struct variable *v_variables[],
311 struct recoded_categorical_array *ca,
314 struct design_matrix *dm;
315 struct design_matrix_var *tmp;
316 struct recoded_categorical *rc;
317 const struct variable *v;
322 dm = xmalloc (sizeof (*dm));
323 dm->vars = xmalloc (n_variables * sizeof (struct variable *));
324 dm->n_vars = n_variables;
326 for (i = 0; i < n_variables; i++)
329 if (v->type == NUMERIC)
333 else if (v->type == ALPHA)
336 rc = cr_var_to_recoded_categorical (v, ca);
338 rc->first_column = n_cols;
339 rc->last_column = rc->first_column + rc->n_categories - 2;
340 n_cols += rc->n_categories - 1;
343 dm->m = gsl_matrix_calloc (n_data, n_cols);
344 dm->vars = xmalloc (dm->n_vars * sizeof (*(dm->vars)));
345 assert (dm->vars != NULL);
348 for (i = 0; i < n_variables; i++)
352 if (v->type == NUMERIC)
354 tmp = &(dm->vars[col]);
356 tmp->first_column = col;
359 else if (v->type == ALPHA)
362 rc = cr_var_to_recoded_categorical (v, ca);
364 tmp = &(dm->vars[col]);
366 tmp->last_column = rc->last_column;
367 col = rc->last_column + 1;
374 design_matrix_destroy (struct design_matrix *dm)
377 gsl_matrix_free (dm->m);
382 Return the index of the variable for the
386 design_matrix_col_to_var_index (const struct design_matrix *dm, size_t col)
389 struct design_matrix_var v;
391 for (i = 0; i < dm->n_vars; i++)
394 if (v.first_column <= col && col <= v.last_column)
397 return CR_INDEX_NOT_FOUND;
401 Return a pointer to the variable whose values
402 are stored in column col.
405 design_matrix_col_to_var (const struct design_matrix *dm, size_t col)
409 struct design_matrix_var dmv;
411 index = design_matrix_col_to_var_index (dm, col);
412 for (i = 0; i < dm->n_vars; i++)
415 if ((dmv.v)->index == index)
417 return (struct variable *) dmv.v;
424 cmp_dm_var_index (const struct design_matrix_var *dmv, size_t index)
426 if (dmv->v->index == index)
432 Return the number of the first column storing the
433 values for variable v.
436 design_matrix_var_to_column (const struct design_matrix * dm,
437 const struct variable * v)
440 struct design_matrix_var tmp;
442 for (i = 0; i < dm->n_vars; i++)
445 if (cmp_dm_var_index (&tmp, v->index))
447 return tmp.first_column;
450 return CR_COLUMN_NOT_FOUND;
454 Set the appropriate value in the design matrix,
455 whether that value is from a categorical or numeric
459 design_matrix_set_categorical (struct design_matrix *dm, size_t row,
460 const struct variable *var,
461 const union value *val,
462 struct recoded_categorical *rc)
467 assert (var->type == ALPHA);
468 const gsl_vector_view vec = cr_value_to_vector (val, rc);
471 Copying values here is not the 'most efficient' way,
472 but it will work even if we change the vector encoding later.
474 for (col = rc->first_column; col <= rc->last_column; col++)
476 x = gsl_vector_get (&vec.vector, col);
477 gsl_matrix_set (dm->m, row, col, x);
481 design_matrix_set_numeric (struct design_matrix *dm, size_t row,
482 const struct variable *var, const union value *val)
486 assert (var->type == NUMERIC);
487 col = design_matrix_var_to_column ((const struct design_matrix *) dm, var);
488 assert (col != CR_COLUMN_NOT_FOUND);
489 gsl_matrix_set (dm->m, row, col, val->f);