1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 Functions and data structures to store values of a categorical
22 variable, and to recode those values into binary vectors.
24 For some statistical models, it is necessary to change each value
25 of a categorical variable to a vector with binary entries. These
26 vectors are then stored as sub-rows within a matrix during
27 model-fitting. For example, we need functions and data strucutres to map a
28 value, say 'a', of a variable named 'cat_var', to a vector, say (0
29 1 0 0 0), and vice versa. We also need to be able to map the
30 vector back to the value 'a', and if the vector is a sub-row of a
31 matrix, we need to know which sub-row corresponds to the variable
43 #include <gsl/gsl_machine.h>
44 #include <gsl/gsl_vector.h>
45 #include <gsl/gsl_matrix.h>
47 #define N_INITIAL_CATEGORIES 1
48 #define CAT_COLUMN_NOT_FOUND -1
49 #define CAT_VALUE_NOT_FOUND -2
50 #define CAT_INDEX_NOT_FOUND -3
53 cat_stored_values_create (struct variable *v)
55 if (v->obs_vals == NULL)
57 v->obs_vals = xmalloc (sizeof (*v->obs_vals));
58 v->obs_vals->n_categories = 0;
59 v->obs_vals->n_allocated_categories = N_INITIAL_CATEGORIES;
61 xnmalloc (N_INITIAL_CATEGORIES, sizeof *v->obs_vals->vals);
66 cat_stored_values_destroy (struct variable *v)
69 if (v->obs_vals != NULL)
76 cat_value_find (const struct variable *v, const union value *val)
79 const union value *candidate;
83 assert (v->obs_vals != NULL);
84 for (i = 0; i < v->obs_vals->n_categories; i++)
86 candidate = v->obs_vals->vals + i;
87 assert (candidate != NULL);
88 if (!compare_values (candidate, val, v->width))
93 return CAT_VALUE_NOT_FOUND;
97 Add the new value unless it is already present.
100 cat_value_update (struct variable *v, const union value *val)
104 if (v->type == ALPHA)
106 assert (val != NULL);
109 if (cat_value_find (v, val) == CAT_VALUE_NOT_FOUND)
111 if (cv->n_categories >= cv->n_allocated_categories)
113 cv->n_allocated_categories *= 2;
114 cv->vals = xnrealloc (cv->vals,
115 cv->n_allocated_categories,
118 cv->vals[cv->n_categories] = *val;
125 Return the subscript of the binary vector corresponding
129 cat_value_to_subscript (const union value *val, struct variable *v)
131 const union value *val2;
136 assert (val != NULL);
137 assert (v->obs_vals != NULL);
138 subscript = v->obs_vals->n_categories - 1;
139 while (subscript > 0)
141 val2 = v->obs_vals->vals + subscript;
142 assert (val2 != NULL);
143 different = compare_values (val, val2, v->width);
154 cat_subscript_to_value (const size_t s, struct variable *v)
156 assert (v->obs_vals != NULL);
157 if (s < v->obs_vals->n_categories)
159 return (v->obs_vals->vals + s);
168 Which element of a vector is equal to the value x?
171 cat_which_element_eq (const gsl_vector * vec, double x)
175 for (i = 0; i < vec->size; i++)
177 if (fabs (gsl_vector_get (vec, i) - x) < GSL_DBL_EPSILON)
182 return CAT_VALUE_NOT_FOUND;
185 cat_is_zero_vector (const gsl_vector * vec)
189 for (i = 0; i < vec->size; i++)
191 if (gsl_vector_get (vec, i) != 0.0)
200 Return the value of v corresponding to the vector vec.
203 cat_vector_to_value (const gsl_vector * vec, struct variable *v)
207 i = cat_which_element_eq (vec, 1.0);
208 if (i != CAT_VALUE_NOT_FOUND)
210 return cat_subscript_to_value (i + 1, v);
212 if (cat_is_zero_vector (vec))
214 return cat_subscript_to_value (0, v);
219 struct design_matrix *
220 design_matrix_create (int n_variables,
221 const struct variable *v_variables[],
224 struct design_matrix *dm;
225 const struct variable *v;
230 dm = xmalloc (sizeof *dm);
231 dm->vars = xnmalloc (n_variables, sizeof *dm->vars);
232 dm->n_vars = n_variables;
234 for (i = 0; i < n_variables; i++)
237 assert ((dm->vars + i) != NULL);
238 (dm->vars + i)->v = v; /* Allows us to look up the variable from
239 the design matrix. */
240 (dm->vars + i)->first_column = n_cols;
241 if (v->type == NUMERIC)
244 (dm->vars + i)->last_column = n_cols;
246 else if (v->type == ALPHA)
248 assert (v->obs_vals != NULL);
249 (dm->vars + i)->last_column =
250 (dm->vars + i)->first_column + v->obs_vals->n_categories - 2;
251 n_cols += v->obs_vals->n_categories - 1;
254 dm->m = gsl_matrix_calloc (n_data, n_cols);
261 design_matrix_destroy (struct design_matrix *dm)
264 gsl_matrix_free (dm->m);
269 Return the index of the variable for the
273 design_matrix_col_to_var_index (const struct design_matrix *dm, size_t col)
276 struct design_matrix_var v;
278 for (i = 0; i < dm->n_vars; i++)
281 if (v.first_column <= col && col <= v.last_column)
284 return CAT_INDEX_NOT_FOUND;
288 Return a pointer to the variable whose values
289 are stored in column col.
292 design_matrix_col_to_var (const struct design_matrix *dm, size_t col)
296 struct design_matrix_var dmv;
298 index = design_matrix_col_to_var_index (dm, col);
299 for (i = 0; i < dm->n_vars; i++)
302 if ((dmv.v)->index == index)
304 return (struct variable *) dmv.v;
311 cmp_dm_var_index (const struct design_matrix_var *dmv, size_t index)
313 if (dmv->v->index == index)
319 Return the number of the first column which holds the
320 values for variable v.
323 design_matrix_var_to_column (const struct design_matrix * dm,
324 const struct variable * v)
327 struct design_matrix_var tmp;
329 for (i = 0; i < dm->n_vars; i++)
332 if (cmp_dm_var_index (&tmp, v->index))
334 return tmp.first_column;
337 return CAT_COLUMN_NOT_FOUND;
342 dm_var_to_last_column (const struct design_matrix *dm,
343 const struct variable *v)
346 struct design_matrix_var tmp;
348 for (i = 0; i < dm->n_vars; i++)
351 if (cmp_dm_var_index (&tmp, v->index))
353 return tmp.last_column;
356 return CAT_COLUMN_NOT_FOUND;
360 Set the appropriate value in the design matrix,
361 whether that value is from a categorical or numeric
362 variable. For a categorical variable, only the usual
363 binary encoding is allowed.
366 design_matrix_set_categorical (struct design_matrix *dm, size_t row,
367 const struct variable *var,
368 const union value *val)
376 assert (var->type == ALPHA);
377 fc = design_matrix_var_to_column (dm, var);
378 lc = dm_var_to_last_column (dm, var);
379 assert (lc != CAT_COLUMN_NOT_FOUND);
380 assert (fc != CAT_COLUMN_NOT_FOUND);
381 is_one = fc + cat_value_find (var, val);
382 for (col = fc; col <= lc; col++)
384 entry = (col == is_one) ? 1.0 : 0.0;
385 gsl_matrix_set (dm->m, row, col, entry);
389 design_matrix_set_numeric (struct design_matrix *dm, size_t row,
390 const struct variable *var, const union value *val)
394 assert (var->type == NUMERIC);
395 col = design_matrix_var_to_column ((const struct design_matrix *) dm, var);
396 assert (col != CAT_COLUMN_NOT_FOUND);
397 gsl_matrix_set (dm->m, row, col, val->f);