#define CAT_H 1
#include <gsl/gsl_matrix.h>
+#include <stdbool.h>
/*
- This structure contains the binary encoding of a
+ This structure contains the observed values of a
categorical variable.
*/
-struct recoded_categorical
+struct cat_vals
{
- const struct variable *v; /* Original variable. */
union value *vals;
- gsl_matrix *m; /* Vector-encoded values of the
- original variable. The ith row of
- the matrix corresponds to the ith
- value of a categorical variable.
- */
size_t n_categories;
- size_t first_column; /* First column of the gsl_matrix which
- contains recoded values of the categorical
- variable.
- */
- size_t last_column; /* Last column containing the recoded
- categories. The practice of
- keeping only the first and last
- columns of the matrix implies those
- columns corresponding to v must be
- contiguous.
- */
- size_t n_allocated_categories; /* This is used only during
- initialization to keep
- track of the number of
- values stored.
- */
+ size_t n_allocated_categories; /* This is used only during
+ initialization to keep
+ track of the number of
+ values stored.
+ NOTE(review): the name
+ suggests allocated capacity
+ of vals rather than a count
+ of stored values (compare
+ n_categories) -- confirm
+ which one is meant.
+ */
};
/*
struct design_matrix_var
{
- int first_column; /* First column for this variable in the
- design_matix. If this variable is categorical,
- its values are stored in multiple, contiguous
- columns, as dictated by its vector encoding
- in the variable's struct recoded_categorical.
+ int first_column; /* First column for this variable in
+ the design_matrix. If this variable
+ is categorical, its values are
+ stored in multiple, contiguous
+ columns, as dictated by its vector
+ encoding in the variable's struct
+ recoded_categorical.
*/
int last_column;
const struct variable *v;
struct design_matrix
{
gsl_matrix *m;
- struct design_matrix_var *vars; /* Element i is the the variable whose
- values are stored in column i of m. If that
- variable is categorical with more than two
- categories, its values are stored in multiple,
- contiguous columns. In this case, element i is
- the first column for that variable. The
- variable's values are then stored in the
- columns first_column through
- last_column. first_column and last_column for
- a categorical variable are stored in the
- variable's recoded_categorical structure.
+ struct design_matrix_var *vars; /* Element i describes a
+ variable whose values are
+ stored in at least one
+ column of m. If that
+ variable is categorical
+ with more than two
+ categories, its values are
+ stored in multiple,
+ contiguous columns. The
+ columns holding its values
+ run from first_column
+ through last_column, as
+ recorded in element i's
+ design_matrix_var
+ structure.
*/
size_t n_vars;
};
-union value *cr_vector_to_value (const gsl_vector *,
- struct recoded_categorical *);
+union value *cr_vector_to_value (const gsl_vector *, struct variable *);
+
+void cat_stored_values_create (struct variable *);
-void cr_value_update (struct recoded_categorical *, const union value *);
+void cat_value_update (struct variable *, const union value *);
-int cr_free_recoded_array (struct recoded_categorical_array *);
+int cat_free_recoded_array (struct recoded_categorical_array *);
struct recoded_categorical_array *cr_recoded_cat_ar_create (int,
struct variable
*[]);
-struct recoded_categorical *cr_recoded_categorical_create (const struct
- variable *);
+void cat_recoded_categorical_create (struct variable *);
-void cr_create_value_matrices (struct recoded_categorical_array *);
+void cat_create_value_matrix (struct variable *);
-struct recoded_categorical *cr_var_to_recoded_categorical (const struct
- variable *,
- struct
- recoded_categorical_array
- *);
+struct recoded_categorical *cat_var_to_recoded_categorical (const struct
+ variable *,
+ struct
+ recoded_categorical_array
+ *);
struct design_matrix *design_matrix_create (int, const struct variable *[],
- struct
- recoded_categorical_array *,
const size_t);
void design_matrix_destroy (struct design_matrix *);
void design_matrix_set_categorical (struct design_matrix *, size_t,
const struct variable *,
- const union value *,
- struct recoded_categorical *);
+ const union value *);
void design_matrix_set_numeric (struct design_matrix *, size_t,
const struct variable *, const union value *);
const struct variable *, const union value *,
struct recoded_categorical *);
-void cr_recoded_categorical_destroy (struct recoded_categorical *);
+void cat_stored_values_destroy (struct variable *);
#endif
int n_indep;
int j = 0;
/*
- Keep track of the missing cases.
- */
+ Keep track of the missing cases.
+ */
int *is_missing_case;
const union value *val;
struct casereader *r;
struct casereader *r2;
struct ccase c;
- const struct variable *v;
- struct recoded_categorical_array *ca;
- struct recoded_categorical *rc;
+ struct variable *v;
struct design_matrix *X;
gsl_vector *Y;
pspp_linreg_cache *lcache;
Read from the active file. The first pass encodes categorical
variables and drops cases with missing values.
*/
- ca = cr_recoded_cat_ar_create (cmd.n_variables, cmd.v_variables);
- for (r = casefile_get_reader (cf);
- casereader_read (r, &c); case_destroy (&c))
+ for (i = 0; i < cmd.n_variables; i++)
{
- row = casereader_cnum (r) - 1;
- for (i = 0; i < ca->n_vars; i++)
+ v = cmd.v_variables[i];
+ if (v->type == ALPHA)
{
- v = (*(ca->a + i))->v;
- val = case_data (&c, v->fv);
- cr_value_update (*(ca->a + i), val);
+ /* Make a place to hold the binary vectors
+ corresponding to this variable's values. */
+ cat_stored_values_create (v);
}
- for (i = 0; i < cmd.n_variables; i++)
+ for (r = casefile_get_reader (cf);
+ casereader_read (r, &c); case_destroy (&c))
{
- v = cmd.v_variables[i];
+ row = casereader_cnum (r) - 1;
+
val = case_data (&c, v->fv);
+ cat_value_update (v, val);
if (mv_is_value_missing (&v->miss, val))
{
- n_data--;
- is_missing_case[row] = 1;
+ if (!is_missing_case[row])
+ {
+ /* Now it is missing. */
+ n_data--;
+ is_missing_case[row] = 1;
+ }
}
}
}
+
Y = gsl_vector_alloc (n_data);
- cr_create_value_matrices (ca);
X =
design_matrix_create (n_indep, (const struct variable **) cmd.v_variables,
- ca, n_data);
- lcache = pspp_linreg_cache_alloc (n_data, n_indep);
- lcache->indep_means = gsl_vector_alloc (n_indep);
- lcache->indep_std = gsl_vector_alloc (n_indep);
+ n_data);
+ lcache = pspp_linreg_cache_alloc (X->m->size1, X->m->size2);
+ lcache->indep_means = gsl_vector_alloc (X->m->size2);
+ lcache->indep_std = gsl_vector_alloc (X->m->size2);
/*
The second pass creates the design matrix.
{
for (i = 0; i < cmd.n_variables; ++i) /* Iterate over the variables
for the current case.
- */
+ */
{
v = cmd.v_variables[i];
val = case_data (&c, v->fv);
/*
- Independent/dependent variable separation. The
- 'variables' subcommand specifies a varlist which contains
- both dependent and independent variables. The dependent
- variables are specified with the 'dependent'
- subcommand. We need to separate the two.
- */
+ Independent/dependent variable separation. The
+ 'variables' subcommand specifies a varlist which contains
+ both dependent and independent variables. The dependent
+ variables are specified with the 'dependent'
+ subcommand. We need to separate the two.
+ */
if (is_depvar (i))
{
if (v->type != NUMERIC)
{
- msg (SE, gettext ("Dependent variable must be numeric."));
+ msg (SE,
+ gettext ("Dependent variable must be numeric."));
pspp_reg_rc = CMD_FAILURE;
return;
}
{
if (v->type == ALPHA)
{
- rc = cr_var_to_recoded_categorical (v, ca);
- design_matrix_set_categorical (X, row, v, val, rc);
+ design_matrix_set_categorical (X, row, v, val);
}
else if (v->type == NUMERIC)
{
design_matrix_set_numeric (X, row, v, val);
}
-
+
indep_vars[k] = i;
k++;
lopts.get_indep_mean_std[i] = 1;
}
}
/*
- Now that we know the number of coefficients, allocate space
- and store pointers to the variables that correspond to the
- coefficients.
+ Now that we know the number of coefficients, allocate space
+ and store pointers to the variables that correspond to the
+ coefficients.
*/
lcache->coeff = xnmalloc (X->m->size2 + 1, sizeof (*lcache->coeff));
for (i = 0; i < X->m->size2; i++)