1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 Functions and data structures to recode categorical variables into
22 vectors and sub-rows of matrices.
24 To fit many types of statistical models, it is necessary
25 to change each value of a categorical variable to a vector with binary
26 entries. These vectors are then stored as sub-rows within a matrix
27 during model-fitting. We need functions and data structures to,
28 e.g., map a value, say 'a', of a variable named 'cat_var', to a
29 vector, say (0 1 0 0 0), and vice versa. We also need to be able
30 to map the vector back to the value 'a', and if the vector is a
31 sub-row of a matrix, we need to know which sub-row corresponds to
32 the variable 'cat_var'.
34 The data structures defined here will be placed in the variable
35 structure in the future. When that happens, the useful code
36 in this file will be that which refers to design matrices.
42 #include <gsl/gsl_matrix.h>
44 This structure contains the binary encoding of a
47 struct recoded_categorical
49 const struct variable *v; /* Original variable. */
51 gsl_matrix *m; /* Vector-encoded values of the
52 original variable. The ith row of
53 the matrix corresponds to the ith
54 value of a categorical variable.
57 size_t first_column; /* First column of the gsl_matrix which
58 contains recoded values of the categorical
61 size_t last_column; /* Last column containing the recoded
62 categories. The practice of
63 keeping only the first and last
64 columns of the matrix implies those
65 columns corresponding to v must be
68 size_t n_allocated_categories; /* This is used only during
69 initialization to keep
70 track of the number of
76 There are usually multiple categorical variables to recode. Get rid
77 of this structure immediately when the variable structure has been
78 modified to contain the binary encoding.
80 struct recoded_categorical_array
82 struct recoded_categorical **a;
86 The design matrix structure holds the design
87 matrix and an array to tell us which columns
88 correspond to which variables. This structure
89 is not restricted to categorical variables, and
90 perhaps should be moved to its own module.
93 struct design_matrix_var
95 int first_column; /* First column for this variable in the
96 design_matrix. If this variable is categorical,
97 its values are stored in multiple, contiguous
98 columns, as dictated by its vector encoding
99 in the variable's struct recoded_categorical.
102 const struct variable *v;
107 struct design_matrix_var *vars; /* Element i is the variable whose
108 values are stored in column i of m. If that
109 variable is categorical with more than two
110 categories, its values are stored in multiple,
111 contiguous columns. In this case, element i is
112 the first column for that variable. The
113 variable's values are then stored in the
114 columns first_column through
115 last_column. first_column and last_column for
116 a categorical variable are stored in the
117 variable's recoded_categorical structure.
121 union value *cr_vector_to_value (const gsl_vector *,
122 struct recoded_categorical *);
124 void cr_value_update (struct recoded_categorical *, const union value *);
126 int cr_free_recoded_array (struct recoded_categorical_array *);
128 struct recoded_categorical_array *cr_recoded_cat_ar_create (int,
132 struct recoded_categorical *cr_recoded_categorical_create (const struct
135 void cr_create_value_matrices (struct recoded_categorical_array *);
137 struct recoded_categorical *cr_var_to_recoded_categorical (const struct
140 recoded_categorical_array
143 struct design_matrix *design_matrix_create (int, const struct variable *[],
145 recoded_categorical_array *,
148 void design_matrix_destroy (struct design_matrix *);
150 void design_matrix_set_categorical (struct design_matrix *, size_t,
151 const struct variable *,
153 struct recoded_categorical *);
155 void design_matrix_set_numeric (struct design_matrix *, size_t,
156 const struct variable *, const union value *);
158 size_t design_matrix_var_to_column (const struct design_matrix *,
159 const struct variable *);
161 struct variable *design_matrix_col_to_var (const struct design_matrix *,
165 design_matrix_set (struct design_matrix *, size_t,
166 const struct variable *, const union value *,
167 struct recoded_categorical *);
169 void cr_recoded_categorical_destroy (struct recoded_categorical *);