casefile.h \
cat.c \
cat.h \
+ cat-routines.h \
chart.c \
chart.h \
ctl-stack.c \
date.c \
debug-print.h \
descript.c \
+ design-matrix.h \
+ design-matrix.c \
dfm-read.c \
dfm-read.h \
dfm-write.c \
--- /dev/null
+/* PSPP - Binary encodings for categorical variables.
+ Copyright (C) 2005 Free Software Foundation, Inc.
+ Written by Jason H Stover <jason@sakla.net>.
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA. */
+
+/*
+ Functions and data structures to recode categorical variables into
+ vectors and sub-rows of matrices.
+
+ To fit many types of statistical models, it is necessary
+ to change each value of a categorical variable to a vector with binary
+ entries. These vectors are then stored as sub-rows within a matrix
+ during model-fitting. We need functions and data structures to,
+ e.g., map a value, say 'a', of a variable named 'cat_var', to a
+ vector, say (0 1 0 0 0), and vice versa. We also need to be able
+ to map the vector back to the value 'a', and if the vector is a
+ sub-row of a matrix, we need to know which sub-row corresponds to
+ the variable 'cat_var'.
+
+ */
+
+#ifndef CAT_ROUTINES_H
+#define CAT_ROUTINES_H
+#define CAT_VALUE_NOT_FOUND -2
+#include <stdbool.h>
+#include "cat.h"
+
+size_t cat_value_find (const struct variable *, const union value *);
+
+union value *cat_subscript_to_value (const size_t, struct variable *);
+
+void cat_stored_values_create (struct variable *);
+
+void cat_value_update (struct variable *, const union value *);
+
+void cat_create_value_matrix (struct variable *);
+
+void cat_stored_values_destroy (struct variable *);
+#endif
-/* PSPP - linear regression.
+/* PSPP - binary encodings for categorical variables.
Copyright (C) 2005 Free Software Foundation, Inc.
Written by Jason H Stover <jason@sakla.net>.
#include <error.h>
#include "alloc.h"
#include "error.h"
-#include "var.h"
#include "cat.h"
+#include "cat-routines.h"
#include <string.h>
-#include <math.h>
-#include <gsl/gsl_machine.h>
-#include <gsl/gsl_vector.h>
-#include <gsl/gsl_matrix.h>
#define N_INITIAL_CATEGORIES 1
-#define CAT_COLUMN_NOT_FOUND -1
-#define CAT_VALUE_NOT_FOUND -2
-#define CAT_INDEX_NOT_FOUND -3
void
cat_stored_values_create (struct variable *v)
}
}
-static size_t
+/*
+ Which subscript corresponds to val?
+ */
+size_t
cat_value_find (const struct variable *v, const union value *val)
{
size_t i;
}
}
-/*
- Return the subscript of the binary vector corresponding
- to this value.
- */
-size_t
-cat_value_to_subscript (const union value *val, struct variable *v)
-{
- const union value *val2;
- size_t subscript;
- int different;
-
- assert (v != NULL);
- assert (val != NULL);
- assert (v->obs_vals != NULL);
- subscript = v->obs_vals->n_categories - 1;
- while (subscript > 0)
- {
- val2 = v->obs_vals->vals + subscript;
- assert (val2 != NULL);
- different = compare_values (val, val2, v->width);
- if (!different)
- {
- return subscript;
- }
- subscript--;
- }
- return subscript;
-}
-
union value *
cat_subscript_to_value (const size_t s, struct variable *v)
{
return NULL;
}
}
-
-/*
- Which element of a vector is equal to the value x?
- */
-static size_t
-cat_which_element_eq (const gsl_vector * vec, double x)
-{
- size_t i;
-
- for (i = 0; i < vec->size; i++)
- {
- if (fabs (gsl_vector_get (vec, i) - x) < GSL_DBL_EPSILON)
- {
- return i;
- }
- }
- return CAT_VALUE_NOT_FOUND;
-}
-static int
-cat_is_zero_vector (const gsl_vector * vec)
-{
- size_t i;
-
- for (i = 0; i < vec->size; i++)
- {
- if (gsl_vector_get (vec, i) != 0.0)
- {
- return 0;
- }
- }
- return 1;
-}
-
-/*
- Return the value of v corresponding to the vector vec.
- */
-union value *
-cat_vector_to_value (const gsl_vector * vec, struct variable *v)
-{
- size_t i;
-
- i = cat_which_element_eq (vec, 1.0);
- if (i != CAT_VALUE_NOT_FOUND)
- {
- return cat_subscript_to_value (i + 1, v);
- }
- if (cat_is_zero_vector (vec))
- {
- return cat_subscript_to_value (0, v);
- }
- return NULL;
-}
-
-struct design_matrix *
-design_matrix_create (int n_variables,
- const struct variable *v_variables[],
- const size_t n_data)
-{
- struct design_matrix *dm;
- const struct variable *v;
- size_t i;
- size_t n_cols = 0;
- size_t col;
-
- dm = xmalloc (sizeof *dm);
- dm->vars = xnmalloc (n_variables, sizeof *dm->vars);
- dm->n_vars = n_variables;
-
- for (i = 0; i < n_variables; i++)
- {
- v = v_variables[i];
- assert ((dm->vars + i) != NULL);
- (dm->vars + i)->v = v; /* Allows us to look up the variable from
- the design matrix. */
- (dm->vars + i)->first_column = n_cols;
- if (v->type == NUMERIC)
- {
- n_cols++;
- (dm->vars + i)->last_column = n_cols;
- }
- else if (v->type == ALPHA)
- {
- assert (v->obs_vals != NULL);
- (dm->vars + i)->last_column =
- (dm->vars + i)->first_column + v->obs_vals->n_categories - 2;
- n_cols += v->obs_vals->n_categories - 1;
- }
- }
- dm->m = gsl_matrix_calloc (n_data, n_cols);
- col = 0;
-
- return dm;
-}
-
-void
-design_matrix_destroy (struct design_matrix *dm)
-{
- free (dm->vars);
- gsl_matrix_free (dm->m);
- free (dm);
-}
-
-/*
- Return the index of the variable for the
- given column.
- */
-static size_t
-design_matrix_col_to_var_index (const struct design_matrix *dm, size_t col)
-{
- size_t i;
- struct design_matrix_var v;
-
- for (i = 0; i < dm->n_vars; i++)
- {
- v = dm->vars[i];
- if (v.first_column <= col && col <= v.last_column)
- return (v.v)->index;
- }
- return CAT_INDEX_NOT_FOUND;
-}
-
-/*
- Return a pointer to the variable whose values
- are stored in column col.
- */
-struct variable *
-design_matrix_col_to_var (const struct design_matrix *dm, size_t col)
-{
- size_t index;
- size_t i;
- struct design_matrix_var dmv;
-
- index = design_matrix_col_to_var_index (dm, col);
- for (i = 0; i < dm->n_vars; i++)
- {
- dmv = dm->vars[i];
- if ((dmv.v)->index == index)
- {
- return (struct variable *) dmv.v;
- }
- }
- return NULL;
-}
-
-static size_t
-cmp_dm_var_index (const struct design_matrix_var *dmv, size_t index)
-{
- if (dmv->v->index == index)
- return 1;
- return 0;
-}
-
-/*
- Return the number of the first column which holds the
- values for variable v.
- */
-size_t
-design_matrix_var_to_column (const struct design_matrix * dm,
- const struct variable * v)
-{
- size_t i;
- struct design_matrix_var tmp;
-
- for (i = 0; i < dm->n_vars; i++)
- {
- tmp = dm->vars[i];
- if (cmp_dm_var_index (&tmp, v->index))
- {
- return tmp.first_column;
- }
- }
- return CAT_COLUMN_NOT_FOUND;
-}
-
-/* Last column. */
-static size_t
-dm_var_to_last_column (const struct design_matrix *dm,
- const struct variable *v)
-{
- size_t i;
- struct design_matrix_var tmp;
-
- for (i = 0; i < dm->n_vars; i++)
- {
- tmp = dm->vars[i];
- if (cmp_dm_var_index (&tmp, v->index))
- {
- return tmp.last_column;
- }
- }
- return CAT_COLUMN_NOT_FOUND;
-}
-
-/*
- Set the appropriate value in the design matrix,
- whether that value is from a categorical or numeric
- variable. For a categorical variable, only the usual
- binary encoding is allowed.
- */
-void
-design_matrix_set_categorical (struct design_matrix *dm, size_t row,
- const struct variable *var,
- const union value *val)
-{
- size_t col;
- size_t is_one;
- size_t fc;
- size_t lc;
- double entry;
-
- assert (var->type == ALPHA);
- fc = design_matrix_var_to_column (dm, var);
- lc = dm_var_to_last_column (dm, var);
- assert (lc != CAT_COLUMN_NOT_FOUND);
- assert (fc != CAT_COLUMN_NOT_FOUND);
- is_one = fc + cat_value_find (var, val);
- for (col = fc; col <= lc; col++)
- {
- entry = (col == is_one) ? 1.0 : 0.0;
- gsl_matrix_set (dm->m, row, col, entry);
- }
-}
-void
-design_matrix_set_numeric (struct design_matrix *dm, size_t row,
- const struct variable *var, const union value *val)
-{
- size_t col;
-
- assert (var->type == NUMERIC);
- col = design_matrix_var_to_column ((const struct design_matrix *) dm, var);
- assert (col != CAT_COLUMN_NOT_FOUND);
- gsl_matrix_set (dm->m, row, col, val->f);
-}
-/* PSPP - linear regression.
+/* PSPP - Binary encodings for categorical variables.
Copyright (C) 2005 Free Software Foundation, Inc.
Written by Jason H Stover <jason@sakla.net>.
sub-row of a matrix, we need to know which sub-row corresponds to
the variable 'cat_var'.
- The data structures defined here will be placed in the variable
- structure in the future. When that happens, the useful code
- in this file will be that which refers to design matrices.
*/
#ifndef CAT_H
-#define CAT_H 1
-
-#include <gsl/gsl_matrix.h>
+#define CAT_H
+#define CAT_VALUE_NOT_FOUND -2
#include <stdbool.h>
+#include "val.h"
+#include "var.h"
/*
This structure contains the observed values of a
categorical variable.
values stored.
*/
};
-
-/*
- There are usually multiple categorical variables to recode. Get rid
- of this structure immediately when the variable structure has been
- modified to contain the binary encoding.
- */
-struct recoded_categorical_array
-{
- struct recoded_categorical **a;
- size_t n_vars;
-};
-/*
- The design matrix structure holds the design
- matrix and an array to tell us which columns
- correspond to which variables. This structure
- is not restricted to categorical variables, and
- perhaps should be moved to its own module.
-*/
-
-struct design_matrix_var
-{
- int first_column; /* First column for this variable in
- the design_matix. If this variable
- is categorical, its values are
- stored in multiple, contiguous
- columns, as dictated by its vector
- encoding in the variable's struct
- recoded_categorical.
- */
- int last_column;
- const struct variable *v;
-};
-struct design_matrix
-{
- gsl_matrix *m;
- struct design_matrix_var *vars; /* Element i corresponds to
- the variable whose values
- are stored in at least one
- column of m. If that
- variable is categorical
- with more than two
- categories, its values are
- stored in multiple,
- contiguous columns. The
- variable's values are then
- stored in the columns
- first_column through
- last_column of the
- design_matrix_var
- structure.
- */
- size_t n_vars;
-};
-union value *cat_vector_to_value (const gsl_vector *, struct variable *);
-
-void cat_stored_values_create (struct variable *);
-
-void cat_value_update (struct variable *, const union value *);
-
-int cat_free_recoded_array (struct recoded_categorical_array *);
-
-struct recoded_categorical_array *cr_recoded_cat_ar_create (int,
- struct variable
- *[]);
-
-void cat_recoded_categorical_create (struct variable *);
-
-void cat_create_value_matrix (struct variable *);
-
-struct recoded_categorical *cat_var_to_recoded_categorical (const struct
- variable *,
- struct
- recoded_categorical_array
- *);
-
-struct design_matrix *design_matrix_create (int, const struct variable *[],
- const size_t);
-
-void design_matrix_destroy (struct design_matrix *);
-
-void design_matrix_set_categorical (struct design_matrix *, size_t,
- const struct variable *,
- const union value *);
-
-void design_matrix_set_numeric (struct design_matrix *, size_t,
- const struct variable *, const union value *);
-
-size_t design_matrix_var_to_column (const struct design_matrix *,
- const struct variable *);
-
-struct variable *design_matrix_col_to_var (const struct design_matrix *,
- size_t);
-
-void
-design_matrix_set (struct design_matrix *, size_t,
- const struct variable *, const union value *,
- struct recoded_categorical *);
-
-void cat_stored_values_destroy (struct variable *);
-
#endif
--- /dev/null
+/* PSPP - Creates design-matrices.
+ Copyright (C) 2005 Free Software Foundation, Inc.
+ Written by Jason H Stover <jason@sakla.net>.
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA. */
+
+/*
+ Create design matrices for procedures that need them.
+*/
+#include <config.h>
+#include <stdlib.h>
+#include <error.h>
+#include "alloc.h"
+#include "error.h"
+#include "var.h"
+#include "cat.h"
+#include "design-matrix.h"
+#include <string.h>
+#include <math.h>
+#include <gsl/gsl_machine.h>
+#include <gsl/gsl_vector.h>
+#include <gsl/gsl_matrix.h>
+
+#define DM_COLUMN_NOT_FOUND -1
+#define DM_INDEX_NOT_FOUND -3
+
+/*
+  Search VEC for the first element that lies within GSL_DBL_EPSILON
+  of X.
+
+  Returns that element's position, or CAT_VALUE_NOT_FOUND (converted
+  to size_t) when no element is that close to X.
+ */
+static size_t
+cat_which_element_eq (const gsl_vector * vec, double x)
+{
+  size_t pos = 0;
+
+  while (pos < vec->size)
+    {
+      const double gap = gsl_vector_get (vec, pos) - x;
+
+      if (fabs (gap) < GSL_DBL_EPSILON)
+        return pos;
+      pos++;
+    }
+  return CAT_VALUE_NOT_FOUND;
+}
+/*
+  Returns 1 if every element of VEC compares equal to 0.0 (an empty
+  vector therefore counts as zero), and 0 otherwise.
+ */
+static int
+cat_is_zero_vector (const gsl_vector * vec)
+{
+  size_t pos;
+  int all_zero = 1;
+
+  /* Stop scanning as soon as one non-zero element has been seen. */
+  for (pos = 0; all_zero && pos < vec->size; pos++)
+    all_zero = (gsl_vector_get (vec, pos) == 0.0);
+  return all_zero;
+}
+
+/*
+  Translate the binary vector VEC back into a value of V.
+
+  If an element of VEC matches 1.0 at position i, the result is the
+  value at subscript i + 1 of V.  An all-zero VEC corresponds to
+  subscript 0.  Any other vector yields NULL.
+ */
+union value *
+cat_vector_to_value (const gsl_vector * vec, struct variable *v)
+{
+  const size_t hit = cat_which_element_eq (vec, 1.0);
+
+  if (hit == CAT_VALUE_NOT_FOUND)
+    return cat_is_zero_vector (vec) ? cat_subscript_to_value (0, v) : NULL;
+  return cat_subscript_to_value (hit + 1, v);
+}
+
+/*
+  Allocate a design matrix with N_DATA rows for the N_VARIABLES
+  variables listed in V_VARIABLES.
+
+  Each numeric variable is given one column.  Each alpha
+  (categorical) variable is given n_categories - 1 contiguous
+  columns, so its observed values (obs_vals) must already exist.
+  For every variable, first_column and last_column record its
+  inclusive column range.
+
+  The matrix is allocated with gsl_matrix_calloc.  The caller owns
+  the result and releases it with design_matrix_destroy.
+ */
+struct design_matrix *
+design_matrix_create (int n_variables,
+                      const struct variable *v_variables[],
+                      const size_t n_data)
+{
+  struct design_matrix *dm;
+  size_t n_cols = 0;
+  int i;
+
+  dm = xmalloc (sizeof *dm);
+  dm->vars = xnmalloc (n_variables, sizeof *dm->vars);
+  dm->n_vars = n_variables;
+
+  for (i = 0; i < n_variables; i++)
+    {
+      const struct variable *v = v_variables[i];
+      struct design_matrix_var *dmv = dm->vars + i;
+
+      dmv->v = v;               /* Allows us to look up the variable
+                                   from the design matrix. */
+      dmv->first_column = n_cols;
+      if (v->type == NUMERIC)
+        {
+          /* A numeric variable occupies exactly one column, so its
+             inclusive last column equals its first column.  Record
+             it before advancing n_cols; otherwise the range would
+             overlap the next variable's first column. */
+          dmv->last_column = n_cols;
+          n_cols++;
+        }
+      else if (v->type == ALPHA)
+        {
+          assert (v->obs_vals != NULL);
+          dmv->last_column =
+            dmv->first_column + v->obs_vals->n_categories - 2;
+          n_cols += v->obs_vals->n_categories - 1;
+        }
+    }
+  dm->m = gsl_matrix_calloc (n_data, n_cols);
+
+  return dm;
+}
+
+/*
+  Release DM together with its variable table and its matrix.
+  Passing NULL is a no-op, mirroring free ().
+ */
+void
+design_matrix_destroy (struct design_matrix *dm)
+{
+  if (dm == NULL)
+    return;
+
+  free (dm->vars);
+  gsl_matrix_free (dm->m);
+  free (dm);
+}
+
+/*
+  Find the first variable of DM whose inclusive column range
+  first_column..last_column contains COL and return that variable's
+  index.  Returns DM_INDEX_NOT_FOUND (converted to size_t) when no
+  range contains COL.
+ */
+static size_t
+design_matrix_col_to_var_index (const struct design_matrix *dm, size_t col)
+{
+  size_t k;
+
+  for (k = 0; k < dm->n_vars; k++)
+    {
+      const struct design_matrix_var *entry = &dm->vars[k];
+
+      if (col < entry->first_column || col > entry->last_column)
+        continue;
+      return entry->v->index;
+    }
+  return DM_INDEX_NOT_FOUND;
+}
+
+/*
+  Return the variable whose values occupy column COL of DM.
+
+  The column is first mapped to a variable index; DM's variables are
+  then scanned for the first one carrying that index.  Returns NULL
+  when no variable in DM has a matching index.
+ */
+struct variable *
+design_matrix_col_to_var (const struct design_matrix *dm, size_t col)
+{
+  const size_t wanted = design_matrix_col_to_var_index (dm, col);
+  size_t k = 0;
+
+  while (k < dm->n_vars)
+    {
+      const struct variable *candidate = dm->vars[k].v;
+
+      if (candidate->index == wanted)
+        return (struct variable *) candidate;
+      k++;
+    }
+  return NULL;
+}
+
+/* Returns 1 if DMV's variable has index INDEX, 0 otherwise. */
+static size_t
+cmp_dm_var_index (const struct design_matrix_var *dmv, size_t index)
+{
+  return (dmv->v->index == index) ? 1 : 0;
+}
+
+/*
+  Return the first column of DM that holds values for variable V.
+  Variables are matched by index.  Returns DM_COLUMN_NOT_FOUND
+  (converted to size_t) if V is not part of DM.
+ */
+size_t
+design_matrix_var_to_column (const struct design_matrix * dm,
+                             const struct variable * v)
+{
+  size_t k = 0;
+
+  while (k < dm->n_vars)
+    {
+      const struct design_matrix_var *entry = &dm->vars[k];
+
+      if (cmp_dm_var_index (entry, v->index))
+        return entry->first_column;
+      k++;
+    }
+  return DM_COLUMN_NOT_FOUND;
+}
+
+/*
+  Return the last column of DM that holds values for variable V.
+  Variables are matched by index.  Returns DM_COLUMN_NOT_FOUND
+  (converted to size_t) if V is not part of DM.
+ */
+static size_t
+dm_var_to_last_column (const struct design_matrix *dm,
+                       const struct variable *v)
+{
+  size_t k;
+
+  for (k = 0; k < dm->n_vars; k++)
+    {
+      const struct design_matrix_var *entry = dm->vars + k;
+
+      if (!cmp_dm_var_index (entry, v->index))
+        continue;
+      return entry->last_column;
+    }
+  return DM_COLUMN_NOT_FOUND;
+}
+
+/*
+  Write the binary encoding of VAL into row ROW of DM, in the
+  columns that belong to the categorical variable VAR.
+
+  The column at offset cat_value_find (VAR, VAL) from VAR's first
+  column receives 1.0; every other column of VAR receives 0.0.  VAR
+  must be of type ALPHA and must be part of DM; both are asserted.
+ */
+void
+design_matrix_set_categorical (struct design_matrix *dm, size_t row,
+                               const struct variable *var,
+                               const union value *val)
+{
+  size_t fc;
+  size_t lc;
+  size_t hot;
+  size_t c;
+
+  assert (var->type == ALPHA);
+  fc = design_matrix_var_to_column (dm, var);
+  lc = dm_var_to_last_column (dm, var);
+  assert (lc != DM_COLUMN_NOT_FOUND);
+  assert (fc != DM_COLUMN_NOT_FOUND);
+
+  hot = fc + cat_value_find (var, val);
+  c = fc;
+  while (c <= lc)
+    {
+      if (c == hot)
+        gsl_matrix_set (dm->m, row, c, 1.0);
+      else
+        gsl_matrix_set (dm->m, row, c, 0.0);
+      c++;
+    }
+}
+/*
+  Store the numeric value VAL->f in row ROW of DM, in the column
+  assigned to the variable VAR.  VAR must be of type NUMERIC and
+  must be part of DM; both conditions are asserted.
+ */
+void
+design_matrix_set_numeric (struct design_matrix *dm, size_t row,
+                           const struct variable *var, const union value *val)
+{
+  size_t col;
+
+  assert (var->type == NUMERIC);
+  col = design_matrix_var_to_column (dm, var);
+  assert (col != DM_COLUMN_NOT_FOUND);
+  gsl_matrix_set (dm->m, row, col, val->f);
+}
--- /dev/null
+/* PSPP - Creates design matrices.
+ Copyright (C) 2005 Free Software Foundation, Inc.
+ Written by Jason H Stover <jason@sakla.net>.
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA. */
+
+/*
+ Create design matrices for procedures that need them.
+ */
+
+#ifndef DESIGN_MATRIX_H
+#define DESIGN_MATRIX_H
+
+#include <gsl/gsl_matrix.h>
+#include <stdbool.h>
+#include "cat.h"
+#include "cat-routines.h"
+struct design_matrix_var
+{
+ int first_column; /* First column for this variable in
+ the design_matrix. If this variable
+ is categorical, its values are
+ stored in multiple, contiguous
+ columns, as dictated by its vector
+ encoding in the variable's struct
+ cat_vals.
+ */
+ int last_column; /* Last column for this variable; column lookups treat first_column..last_column as an inclusive range. */
+ const struct variable *v; /* The variable whose values are stored in these columns. */
+};
+struct design_matrix
+{
+ gsl_matrix *m; /* The matrix itself; design_matrix_create allocates it with one row per data row. */
+ struct design_matrix_var *vars; /* Element i corresponds to
+ the variable whose values
+ are stored in at least one
+ column of m. If that
+ variable is categorical
+ with more than two
+ categories, its values are
+ stored in multiple,
+ contiguous columns. The
+ variable's values are then
+ stored in the columns
+ first_column through
+ last_column of the
+ design_matrix_var
+ structure.
+ */
+ size_t n_vars; /* Number of elements in vars. */
+};
+union value *cat_vector_to_value (const gsl_vector *, struct variable *);
+
+struct design_matrix *design_matrix_create (int, const struct variable *[],
+ const size_t);
+
+void design_matrix_destroy (struct design_matrix *);
+
+void design_matrix_set_categorical (struct design_matrix *, size_t,
+ const struct variable *,
+ const union value *);
+
+void design_matrix_set_numeric (struct design_matrix *, size_t,
+ const struct variable *, const union value *);
+
+size_t design_matrix_var_to_column (const struct design_matrix *,
+ const struct variable *);
+
+struct variable *design_matrix_col_to_var (const struct design_matrix *,
+ size_t);
+
+#endif
#include "case.h"
#include "casefile.h"
#include "cat.h"
+#include "cat-routines.h"
#include "command.h"
+#include "design-matrix.h"
#include "dictionary.h"
#include "error.h"
#include "file-handle.h"