1 /* PSPP - linear regression.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3 Written by Jason H Stover <jason@sakla.net>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
21 Functions and data structures to recode categorical variables into
22 vectors and sub-rows of matrices.
24 To fit many types of statistical models, it is necessary
25 to change each value of a categorical variable to a vector with binary
26 entries. These vectors are then stored as sub-rows within a matrix
27 during model-fitting. We need functions and data structures to,
28 e.g., map a value, say 'a', of a variable named 'cat_var', to a
29 vector, say (0 1 0 0 0), and vice versa. We also need to be able
30 to map the vector back to the value 'a', and if the vector is a
31 sub-row of a matrix, we need to know which sub-row corresponds to
32 the variable 'cat_var'.
34 The data structures defined here will be placed in the variable
35 structure in the future. When that happens, the useful code
36 in this file will be that which refers to design matrices.
42 #include <gsl/gsl_matrix.h>
44 This structure contains the binary encoding of a
47 struct recoded_categorical
49 const struct variable *v; /* Original variable. */
51 gsl_matrix *m; /* Vector-encoded values of the original
52 variable. The ith row of the matrix corresponds
53 to the ith value of a categorical variable.
56 size_t first_column; /* First column of the gsl_matrix which
57 contains recoded values of the categorical
60 size_t last_column; /* Last column containing the recoded
61 categories. The practice of keeping only the
62 first and last columns of the matrix implies
63 those columns corresponding to v must be
66 size_t n_allocated_categories; /* This is used only during initialization
67 to keep track of the number of values
73 There are usually multiple categorical variables to recode. Get rid
74 of this structure immediately when the variable structure has been
75 modified to contain the binary encoding.
77 struct recoded_categorical_array
79 struct recoded_categorical **a;
83 The design matrix structure holds the design
84 matrix and an array to tell us which columns
85 correspond to which variables. This structure
86 is not restricted to categorical variables, and
87 perhaps should be moved to its own module.
90 struct design_matrix_var
92 int first_column; /* First column for this variable in the
93 design_matrix. If this variable is categorical,
94 its values are stored in multiple, contiguous
95 columns, as dictated by its vector encoding
96 in the variable's struct recoded_categorical.
104 struct design_matrix_var *vars; /* Element i is the variable whose
105 values are stored in column i of m. If that
106 variable is categorical with more than two
107 categories, its values are stored in multiple,
108 contiguous columns. In this case, element i is
109 the first column for that variable. The
110 variable's values are then stored in the
111 columns first_column through
112 last_column. first_column and last_column for
113 a categorical variable are stored in the
114 variable's recoded_categorical structure.
/* Declares cr_vector_to_value: takes a const gsl_vector and a
   struct recoded_categorical, and returns a pointer to a const
   union value.
   NOTE(review): presumably this is the vector-to-value lookup
   described in the file header (mapping a binary-encoded vector
   back to the category value) -- confirm against the definition,
   including what is returned when no category matches.  */
118 const union value *cr_vector_to_value (const gsl_vector *,
119 struct recoded_categorical *);
/* Declares cr_value_update: takes a modifiable
   struct recoded_categorical and a const union value, and returns
   nothing.
   NOTE(review): presumably records the value as a category of the
   recoded variable -- confirm in the definition.  */
121 void cr_value_update (struct recoded_categorical *, const union value *);
/* Declares cr_free_recoded_array: takes a
   struct recoded_categorical_array and returns an int.
   NOTE(review): the meaning of the int result, and whether the
   array structure itself is released along with its elements, are
   not visible from this declaration -- check the definition.  */
123 int cr_free_recoded_array (struct recoded_categorical_array *);
125 struct recoded_categorical_array *cr_recoded_cat_ar_create (int,
129 struct recoded_categorical *cr_recoded_categorical_create (const struct
/* Declares cr_create_value_matrices: operates on a whole
   struct recoded_categorical_array and returns nothing.
   NOTE(review): presumably builds the gsl_matrix of
   vector-encoded values for each recoded variable in the array --
   confirm in the definition.  */
132 void cr_create_value_matrices (struct recoded_categorical_array *);
134 struct recoded_categorical *cr_var_to_recoded_categorical (const struct
137 recoded_categorical_array
140 struct design_matrix *design_matrix_create (int, const struct variable *[],
142 recoded_categorical_array *,
/* Declares design_matrix_destroy: takes a struct design_matrix and
   returns nothing.
   NOTE(review): presumably the counterpart of
   design_matrix_create; what it releases beyond the structure
   itself is not visible here -- confirm in the definition.  */
145 void design_matrix_destroy (struct design_matrix *);
147 void design_matrix_set_categorical (struct design_matrix *, size_t,
148 const struct variable *,
150 struct recoded_categorical *);
/* Declares design_matrix_set_numeric: arguments are, in order, a
   modifiable struct design_matrix, a size_t, a const
   struct variable, and a const union value; returns nothing.
   NOTE(review): the size_t is presumably the row to fill and the
   value the numeric datum stored for that variable -- confirm in
   the definition.  */
152 void design_matrix_set_numeric (struct design_matrix *, size_t,
153 const struct variable *, const union value *);
/* Declares design_matrix_var_to_column: takes a const
   struct design_matrix and a const struct variable, and returns a
   size_t.
   NOTE(review): presumably the index of the design-matrix column
   holding the variable (the first such column when the variable
   occupies several) -- confirm in the definition, including the
   result for a variable that is not in the matrix.  */
155 size_t design_matrix_var_to_column (const struct design_matrix *,
156 const struct variable *);
158 const struct variable *design_matrix_col_to_var (const struct design_matrix *,
162 design_matrix_set (struct design_matrix *, size_t,
163 const struct variable *, const union value *,
164 struct recoded_categorical *);
/* Declares cr_recoded_categorical_destroy: takes a
   struct recoded_categorical and returns nothing.
   NOTE(review): presumably the counterpart of
   cr_recoded_categorical_create; whether it also frees the
   gsl_matrix member is not visible here -- confirm in the
   definition.  */
166 void cr_recoded_categorical_destroy (struct recoded_categorical *);