1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 Functions and data structures to recode categorical variables into
22 vectors and sub-rows of matrices.
24 For some statistical models, it is necessary to change each value
25 of a categorical variable to a vector with binary entries. These
26 vectors are then stored as sub-rows within a matrix during
27 model-fitting. E.g., we need functions and data strucutres to map a
28 value, say 'a', of a variable named 'cat_var', to a vector, say (0
29 1 0 0 0), and vice versa. We also need to be able to map the
30 vector back to the value 'a', and if the vector is a sub-row of a
31 matrix, we need to know which sub-row corresponds to the variable
34 The data structures defined here will be placed in the variable
35 structure in the future. When that happens, the useful code
36 in this file will be that which refers to design matrices.
47 #include <gsl/gsl_machine.h>
48 #include <gsl/gsl_vector.h>
49 #include <gsl/gsl_matrix.h>
51 #define N_INITIAL_CATEGORIES 1
52 #define CAT_COLUMN_NOT_FOUND -1
53 #define CAT_VALUE_NOT_FOUND -2
54 #define CAT_INDEX_NOT_FOUND -3
57 cat_stored_values_create (struct variable *v)
59 if (v->obs_vals == NULL)
61 v->obs_vals = xmalloc (sizeof (*v->obs_vals));
62 v->obs_vals->n_categories = 0;
63 v->obs_vals->n_allocated_categories = N_INITIAL_CATEGORIES;
65 xnmalloc (N_INITIAL_CATEGORIES, sizeof *v->obs_vals->vals);
70 cat_stored_values_destroy (struct variable *v)
73 if (v->obs_vals != NULL)
80 struct recoded_categorical_array *
81 cr_recoded_cat_ar_create (int n_variables, struct variable *v_variables[])
83 size_t n_categoricals = 0;
85 struct recoded_categorical_array *ca;
88 ca = xmalloc (sizeof *ca);
89 for (i = 0; i < n_variables; i++)
97 ca->n_vars = n_categoricals;
98 ca->a = xnmalloc (n_categoricals, sizeof *ca->a);
99 for (i = 0; i < n_categoricals; i++)
101 *(ca->a + i) = cr_recoded_categorical_create (v_variables[i]);
108 cr_free_recoded_array (struct recoded_categorical_array *r)
113 for (i = 0; i < r->n_vars; i++)
115 cr_recoded_categorical_destroy (*(r->a + i));
121 cat_value_find (const struct variable *v, const union value *val)
124 const union value *candidate;
126 assert (val != NULL);
128 assert (v->obs_vals != NULL);
129 for (i = 0; i < v->obs_vals->n_categories; i++)
131 candidate = v->obs_vals->vals + i;
132 assert (candidate != NULL);
133 if (!compare_values (candidate, val, v->width))
138 return CAT_VALUE_NOT_FOUND;
142 Add the new value unless it is already present.
145 cat_value_update (struct variable *v, const union value *val)
149 if (v->type == ALPHA)
151 assert (val != NULL);
154 if (cat_value_find (v, val) == CAT_VALUE_NOT_FOUND)
156 if (cv->n_categories >= cv->n_allocated_categories)
158 cv->n_allocated_categories *= 2;
159 cv->vals = xnrealloc (cv->vals,
160 cv->n_allocated_categories,
163 cv->vals[cv->n_categories] = *val;
170 Create a gsl_matrix, whose rows correspond to values of a
171 categorical variable. Since n categories have n-1 degrees of
172 freedom, the gsl_matrix is n-by-(n-1), with the first category
173 encoded as the zero vector.
177 cat_create_value_matrix (struct variable *v)
186 if (v->type == ALPHA)
188 assert (v->rc != NULL);
189 n_rows = v->rc->n_categories;
190 n_cols = v->rc->n_categories - 1;
191 v->rc->m = gsl_matrix_calloc (n_rows, n_cols);
192 for (row = 1; row < n_rows; row++)
195 gsl_matrix_set (v->rc->m, row, col, 1.0);
201 Return the subscript of the binary vector corresponding
205 cat_value_to_subscript (const union value *val, struct variable *v)
207 const union value *val2;
212 assert (val != NULL);
213 assert (v->obs_vals != NULL);
214 subscript = v->obs_vals->n_categories - 1;
215 while (subscript > 0)
217 val2 = v->obs_vals->vals + subscript;
218 assert (val2 != NULL);
219 different = compare_values (val, val2, v->width);
230 cat_subscript_to_value (const size_t s, struct variable *v)
232 assert (v->obs_vals != NULL);
233 if (s < v->obs_vals->n_categories)
235 return (v->obs_vals->vals + s);
245 Return the row of the matrix corresponding
248 static gsl_vector_view
249 cr_value_to_vector (const union value *v, struct recoded_categorical *cr)
252 row = cr_value_to_subscript (v, cr);
253 return gsl_matrix_row (cr->m, row);
257 Which element of a vector is equal to the value x?
260 cat_which_element_eq (const gsl_vector * vec, double x)
264 for (i = 0; i < vec->size; i++)
266 if (fabs (gsl_vector_get (vec, i) - x) < GSL_DBL_EPSILON)
271 return CAT_VALUE_NOT_FOUND;
274 cat_is_zero_vector (const gsl_vector * vec)
278 for (i = 0; i < vec->size; i++)
280 if (gsl_vector_get (vec, i) != 0.0)
289 Return the value corresponding to the vector.
290 To avoid searching the matrix, this routine takes
291 advantage of the fact that element (i,i+1) is 1
292 when i is between 1 and cr->n_categories - 1 and
296 cr_vector_to_value (const gsl_vector * vec, struct variable *v)
300 i = cat_which_element_eq (vec, 1.0);
301 if (i != CAT_VALUE_NOT_FOUND)
303 return cat_subscript_to_value (i + 1, v);
305 if (cat_is_zero_vector (vec))
307 return cat_subscript_to_value (0, v);
314 Given a variable, return a pointer to its recoded
315 structure. BUSTED IN HERE.
317 struct recoded_categorical *
318 cr_var_to_recoded_categorical (const struct variable *v,
319 struct recoded_categorical_array *ca)
321 struct recoded_categorical *rc;
324 for (i = 0; i < ca->n_vars; i++)
327 if (rc->v->index == v->index)
335 struct design_matrix *
336 design_matrix_create (int n_variables,
337 const struct variable *v_variables[],
340 struct design_matrix *dm;
341 const struct variable *v;
346 dm = xmalloc (sizeof *dm);
347 dm->vars = xnmalloc (n_variables, sizeof *dm->vars);
348 dm->n_vars = n_variables;
350 for (i = 0; i < n_variables; i++)
353 assert ((dm->vars + i) != NULL);
354 (dm->vars + i)->v = v; /* Allows us to look up the variable from
355 the design matrix. */
356 (dm->vars + i)->first_column = n_cols;
357 if (v->type == NUMERIC)
360 (dm->vars + i)->last_column = n_cols;
362 else if (v->type == ALPHA)
364 assert (v->obs_vals != NULL);
365 (dm->vars + i)->last_column =
366 (dm->vars + i)->first_column + v->obs_vals->n_categories - 2;
367 n_cols += v->obs_vals->n_categories - 1;
370 dm->m = gsl_matrix_calloc (n_data, n_cols);
377 design_matrix_destroy (struct design_matrix *dm)
380 gsl_matrix_free (dm->m);
385 Return the index of the variable for the
389 design_matrix_col_to_var_index (const struct design_matrix *dm, size_t col)
392 struct design_matrix_var v;
394 for (i = 0; i < dm->n_vars; i++)
397 if (v.first_column <= col && col <= v.last_column)
400 return CAT_INDEX_NOT_FOUND;
404 Return a pointer to the variable whose values
405 are stored in column col.
408 design_matrix_col_to_var (const struct design_matrix *dm, size_t col)
412 struct design_matrix_var dmv;
414 index = design_matrix_col_to_var_index (dm, col);
415 for (i = 0; i < dm->n_vars; i++)
418 if ((dmv.v)->index == index)
420 return (struct variable *) dmv.v;
427 cmp_dm_var_index (const struct design_matrix_var *dmv, size_t index)
429 if (dmv->v->index == index)
435 Return the number of the first column which holds the
436 values for variable v.
439 design_matrix_var_to_column (const struct design_matrix * dm,
440 const struct variable * v)
443 struct design_matrix_var tmp;
445 for (i = 0; i < dm->n_vars; i++)
448 if (cmp_dm_var_index (&tmp, v->index))
450 return tmp.first_column;
453 return CAT_COLUMN_NOT_FOUND;
458 dm_var_to_last_column (const struct design_matrix *dm,
459 const struct variable *v)
462 struct design_matrix_var tmp;
464 for (i = 0; i < dm->n_vars; i++)
467 if (cmp_dm_var_index (&tmp, v->index))
469 return tmp.last_column;
472 return CAT_COLUMN_NOT_FOUND;
476 Set the appropriate value in the design matrix,
477 whether that value is from a categorical or numeric
478 variable. For a categorical variable, only the usual
479 binary encoding is allowed.
482 design_matrix_set_categorical (struct design_matrix *dm, size_t row,
483 const struct variable *var,
484 const union value *val)
492 assert (var->type == ALPHA);
493 fc = design_matrix_var_to_column (dm, var);
494 lc = dm_var_to_last_column (dm, var);
495 assert (lc != CAT_COLUMN_NOT_FOUND);
496 assert (fc != CAT_COLUMN_NOT_FOUND);
497 is_one = fc + cat_value_find (var, val);
498 for (col = fc; col <= lc; col++)
500 entry = (col == is_one) ? 1.0 : 0.0;
501 gsl_matrix_set (dm->m, row, col, entry);
505 design_matrix_set_numeric (struct design_matrix *dm, size_t row,
506 const struct variable *var, const union value *val)
510 assert (var->type == NUMERIC);
511 col = design_matrix_var_to_column ((const struct design_matrix *) dm, var);
512 assert (col != CAT_COLUMN_NOT_FOUND);
513 gsl_matrix_set (dm->m, row, col, val->f);