1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 Functions and data structures to recode categorical variables into
22 vectors and sub-rows of matrices.
24 For some statistical models, it is necessary to change each value
25 of a categorical variable to a vector with binary entries. These
26 vectors are then stored as sub-rows within a matrix during
27 model-fitting. E.g., we need functions and data strucutres to map a
28 value, say 'a', of a variable named 'cat_var', to a vector, say (0
29 1 0 0 0), and vice versa. We also need to be able to map the
30 vector back to the value 'a', and if the vector is a sub-row of a
31 matrix, we need to know which sub-row corresponds to the variable
34 The data structures defined here will be placed in the variable
35 structure in the future. When that happens, the useful code
36 in this file will be that which refers to design matrices.
47 #include <gsl/gsl_machine.h>
48 #include <gsl/gsl_vector.h>
49 #include <gsl/gsl_matrix.h>
51 #define N_INITIAL_CATEGORIES 1
52 #define CR_COLUMN_NOT_FOUND -1
53 #define CR_VALUE_NOT_FOUND -2
54 #define CR_INDEX_NOT_FOUND -3
56 static gsl_vector_const_view cr_value_to_vector (const union value *,
57 struct recoded_categorical *);
59 struct recoded_categorical *
60 cr_recoded_categorical_create (const struct variable *v)
62 struct recoded_categorical *rc;
64 rc = xmalloc (sizeof (*rc));
67 rc->n_allocated_categories = N_INITIAL_CATEGORIES;
68 rc->vals = (union value **) xmalloc (N_INITIAL_CATEGORIES *
75 cr_recoded_categorical_destroy (struct recoded_categorical *r)
81 struct recoded_categorical_array *
82 cr_recoded_cat_ar_create (int n_variables, struct variable *v_variables[])
84 size_t n_categoricals = 0;
86 struct recoded_categorical_array *ca;
89 ca = (struct recoded_categorical_array *) xmalloc (sizeof (*ca));
90 for (i = 0; i < n_variables; i++)
98 ca->n_vars = n_categoricals;
99 ca->a = xmalloc (n_categoricals * sizeof (*(ca->a)));
100 for (i = 0; i < n_categoricals; i++)
102 *(ca->a + i) = cr_recoded_categorical_create (v_variables[i]);
109 cr_free_recoded_array (struct recoded_categorical_array *r)
114 for (i = 0; i < r->n_vars; i++)
116 cr_recoded_categorical_destroy (*(r->a + i));
122 cr_value_find (struct recoded_categorical *rc, const union value *v)
125 const union value *val;
127 for (i = 0; i < rc->n_categories; i++)
129 val = *(rc->vals + i);
130 if (!compare_values (val, v, rc->v->width))
135 return CR_VALUE_NOT_FOUND;
139 Add the new value unless it is already present.
142 cr_value_update (struct recoded_categorical *rc, const union value *v)
144 if (cr_value_find (rc, v) == CR_VALUE_NOT_FOUND)
146 if (rc->n_categories >= rc->n_allocated_categories)
148 rc->n_allocated_categories *= 2;
149 rc->vals = (union value **)
150 xrealloc (rc->vals, rc->n_allocated_categories
151 * sizeof (*(rc->vals)));
153 *(rc->vals + rc->n_categories) = v;
159 Create a set of gsl_matrix's, each of whose rows correspond to
160 values of a categorical variable. Since n categories have n-1
161 degrees of freedom, the gsl_matrix is n-by-(n-1), with the first
162 category encoded as the zero vector.
165 cr_create_value_matrices (struct recoded_categorical_array *r)
173 for (i = 0; i < r->n_vars; i++)
175 n_rows = (*(r->a + i))->n_categories;
176 n_cols = (*(r->a + i))->n_categories - 1;
177 (*(r->a + i))->m = gsl_matrix_calloc (n_rows, n_cols);
178 for (row = 1; row < n_rows; row++)
181 gsl_matrix_set ((*(r->a + i))->m, row, col, 1.0);
187 cr_value_to_subscript (const union value *val, struct recoded_categorical *cr)
189 const union value *v;
193 subscript = cr->n_categories - 1;
194 while (subscript > 0)
196 v = *(cr->vals + subscript);
197 different = compare_values (val, v, cr->v->width);
207 static const union value *
208 cr_subscript_to_value (const size_t s, struct recoded_categorical *cr)
210 if (s < cr->n_categories)
221 Return the row of the matrix corresponding
224 static gsl_vector_const_view
225 cr_value_to_vector (const union value * v, struct recoded_categorical * cr)
228 row = cr_value_to_subscript (v, cr);
229 return gsl_matrix_const_row (cr->m, row);
233 Which element of a vector is equal to the value x?
236 cr_which_element_eq (const gsl_vector * vec, double x)
240 for (i = 0; i < vec->size; i++)
242 if (fabs (gsl_vector_get (vec, i) - x) < GSL_DBL_EPSILON)
247 return CR_VALUE_NOT_FOUND;
250 cr_is_zero_vector (const gsl_vector * vec)
254 for (i = 0; i < vec->size; i++)
256 if (gsl_vector_get (vec, i) != 0.0)
265 Return the value corresponding to the vector.
266 To avoid searching the matrix, this routine takes
267 advantage of the fact that element (i,i+1) is 1
268 when i is between 1 and cr->n_categories - 1 and
272 cr_vector_to_value (const gsl_vector * vec, struct recoded_categorical *cr)
276 i = cr_which_element_eq (vec, 1.0);
277 if (i != CR_VALUE_NOT_FOUND)
279 return cr_subscript_to_value (i + 1, cr);
281 if (cr_is_zero_vector (vec))
283 return cr_subscript_to_value (0, cr);
289 Given a variable, return a pointer to its recoded
290 structure. BUSTED IN HERE.
292 struct recoded_categorical *
293 cr_var_to_recoded_categorical (const struct variable *v,
294 struct recoded_categorical_array *ca)
296 struct recoded_categorical *rc;
299 for (i = 0; i < ca->n_vars; i++)
302 if (rc->v->index == v->index)
310 struct design_matrix *
311 design_matrix_create (int n_variables,
312 const struct variable *v_variables[],
313 struct recoded_categorical_array *ca,
316 struct design_matrix *dm;
317 struct design_matrix_var *tmp;
318 struct recoded_categorical *rc;
319 const struct variable *v;
324 dm = xmalloc (sizeof (*dm));
325 dm->vars = xmalloc (n_variables * sizeof (struct variable *));
326 dm->n_vars = n_variables;
328 for (i = 0; i < n_variables; i++)
331 if (v->type == NUMERIC)
335 else if (v->type == ALPHA)
338 rc = cr_var_to_recoded_categorical (v, ca);
340 rc->first_column = n_cols;
341 rc->last_column = rc->first_column + rc->n_categories - 2;
342 n_cols += rc->n_categories - 1;
345 dm->m = gsl_matrix_calloc (n_data, n_cols);
346 dm->vars = xmalloc (dm->n_vars * sizeof (*(dm->vars)));
347 assert (dm->vars != NULL);
350 for (i = 0; i < n_variables; i++)
354 if (v->type == NUMERIC)
356 tmp = &(dm->vars[col]);
358 tmp->first_column = col;
361 else if (v->type == ALPHA)
364 rc = cr_var_to_recoded_categorical (v, ca);
366 tmp = &(dm->vars[col]);
368 tmp->last_column = rc->last_column;
369 col = rc->last_column + 1;
376 design_matrix_destroy (struct design_matrix *dm)
379 gsl_matrix_free (dm->m);
384 Return the index of the variable for the
388 design_matrix_col_to_var_index (const struct design_matrix *dm, size_t col)
391 struct design_matrix_var v;
393 for (i = 0; i < dm->n_vars; i++)
396 if (v.first_column <= col && col <= v.last_column)
399 return CR_INDEX_NOT_FOUND;
403 Return a pointer to the variable whose values
404 are stored in column col.
406 const struct variable *
407 design_matrix_col_to_var (const struct design_matrix *dm, size_t col)
411 struct design_matrix_var dmv;
412 const struct variable *v;
414 index = design_matrix_col_to_var_index (dm, col);
415 for (i = 0; i < dm->n_vars; i++)
419 if (v->index == index)
428 cmp_dm_var_index (const struct design_matrix_var *dmv, size_t index)
430 if (dmv->v->index == index)
436 Return the number of the first column storing the
437 values for variable v.
440 design_matrix_var_to_column (const struct design_matrix * dm,
441 const struct variable * v)
444 struct design_matrix_var tmp;
446 for (i = 0; i < dm->n_vars; i++)
449 if (cmp_dm_var_index (&tmp, v->index))
451 return tmp.first_column;
454 return CR_COLUMN_NOT_FOUND;
458 Set the appropriate value in the design matrix,
459 whether that value is from a categorical or numeric
463 design_matrix_set_categorical (struct design_matrix *dm, size_t row,
464 const struct variable *var,
465 const union value *val,
466 struct recoded_categorical *rc)
471 assert (var->type == ALPHA);
472 gsl_vector_const_view vec = cr_value_to_vector (val, rc);
475 Copying values here is not the 'most efficient' way,
476 but it will work even if we change the vector encoding later.
478 for (col = rc->first_column; col <= rc->last_column; col++)
480 x = gsl_vector_get (&vec.vector, col);
481 gsl_matrix_set (dm->m, row, col, x);
485 design_matrix_set_numeric (struct design_matrix *dm, size_t row,
486 const struct variable *var, const union value *val)
490 assert (var->type == NUMERIC);
491 col = design_matrix_var_to_column ((const struct design_matrix *) dm, var);
492 assert (col != CR_COLUMN_NOT_FOUND);
493 gsl_matrix_set (dm->m, row, col, val->f);