From c38b1e667a1fdb12df302ce54872dee88c04e65e Mon Sep 17 00:00:00 2001 From: Jason Stover Date: Tue, 22 Nov 2005 22:04:45 +0000 Subject: [PATCH] Rewrote categorical value-handling --- src/cat.h | 101 +++++++++++++++++++++-------------------------- src/regression.q | 75 ++++++++++++++++++----------------- 2 files changed, 83 insertions(+), 93 deletions(-) diff --git a/src/cat.h b/src/cat.h index 593a5764..a47008f5 100644 --- a/src/cat.h +++ b/src/cat.h @@ -40,36 +40,20 @@ #define CAT_H 1 #include +#include /* - This structure contains the binary encoding of a + This structure contains the observed values of a categorical variable. */ -struct recoded_categorical +struct cat_vals { - const struct variable *v; /* Original variable. */ union value *vals; - gsl_matrix *m; /* Vector-encoded values of the - original variable. The ith row of - the matrix corresponds to the ith - value of a categorical variable. - */ size_t n_categories; - size_t first_column; /* First column of the gsl_matrix which - contains recoded values of the categorical - variable. - */ - size_t last_column; /* Last column containing the recoded - categories. The practice of - keeping only the first and last - columns of the matrix implies those - columns corresponding to v must be - contiguous. - */ - size_t n_allocated_categories; /* This is used only during - initialization to keep - track of the number of - values stored. - */ + size_t n_allocated_categories; /* This is used only during + initialization to keep + track of the number of + values stored. + */ }; /* @@ -92,11 +76,13 @@ struct recoded_categorical_array struct design_matrix_var { - int first_column; /* First column for this variable in the - design_matix. If this variable is categorical, - its values are stored in multiple, contiguous - columns, as dictated by its vector encoding - in the variable's struct recoded_categorical. + int first_column; /* First column for this variable in + the design_matix. If this variable + is categorical, its values are + stored in multiple, contiguous + columns, as dictated by its vector + encoding in the variable's struct + recoded_categorical. */ int last_column; const struct variable *v; @@ -104,53 +90,54 @@ struct design_matrix_var struct design_matrix { gsl_matrix *m; - struct design_matrix_var *vars; /* Element i is the the variable whose - values are stored in column i of m. If that - variable is categorical with more than two - categories, its values are stored in multiple, - contiguous columns. In this case, element i is - the first column for that variable. The - variable's values are then stored in the - columns first_column through - last_column. first_column and last_column for - a categorical variable are stored in the - variable's recoded_categorical structure. + struct design_matrix_var *vars; /* Element i corresponds to + the variable whose values + are stored in at least one + column of m. If that + variable is categorical + with more than two + categories, its values are + stored in multiple, + contiguous columns. The + variable's values are then + stored in the columns + first_column through + last_column of the + design_matrix_var + structure. */ size_t n_vars; }; -union value *cr_vector_to_value (const gsl_vector *, - struct recoded_categorical *); +union value *cr_vector_to_value (const gsl_vector *, struct variable *); + +void cat_stored_values_create (struct variable *); -void cr_value_update (struct recoded_categorical *, const union value *); +void cat_value_update (struct variable *, const union value *); -int cr_free_recoded_array (struct recoded_categorical_array *); +int cat_free_recoded_array (struct recoded_categorical_array *); struct recoded_categorical_array *cr_recoded_cat_ar_create (int, struct variable *[]); -struct recoded_categorical *cr_recoded_categorical_create (const struct - variable *); +void cat_recoded_categorical_create (struct variable *); -void cr_create_value_matrices (struct recoded_categorical_array *); +void cat_create_value_matrix (struct variable *); -struct recoded_categorical *cr_var_to_recoded_categorical (const struct - variable *, - struct - recoded_categorical_array - *); +struct recoded_categorical *cat_var_to_recoded_categorical (const struct + variable *, + struct + recoded_categorical_array + *); struct design_matrix *design_matrix_create (int, const struct variable *[], - struct - recoded_categorical_array *, const size_t); void design_matrix_destroy (struct design_matrix *); void design_matrix_set_categorical (struct design_matrix *, size_t, const struct variable *, - const union value *, - struct recoded_categorical *); + const union value *); void design_matrix_set_numeric (struct design_matrix *, size_t, const struct variable *, const union value *); @@ -166,6 +153,6 @@ design_matrix_set (struct design_matrix *, size_t, const struct variable *, const union value *, struct recoded_categorical *); -void cr_recoded_categorical_destroy (struct recoded_categorical *); +void cat_stored_values_destroy (struct variable *); #endif diff --git a/src/regression.q b/src/regression.q index 3c80d700..ee7c0075 100644 --- a/src/regression.q +++ b/src/regression.q @@ -504,16 +504,14 @@ run_regression (const struct casefile *cf, void *cmd_ UNUSED) int n_indep; int j = 0; /* - Keep track of the missing cases. - */ + Keep track of the missing cases. + */ int *is_missing_case; const union value *val; struct casereader *r; struct casereader *r2; struct ccase c; - const struct variable *v; - struct recoded_categorical_array *ca; - struct recoded_categorical *rc; + struct variable *v; struct design_matrix *X; gsl_vector *Y; pspp_linreg_cache *lcache; @@ -536,36 +534,41 @@ run_regression (const struct casefile *cf, void *cmd_ UNUSED) Read from the active file. The first pass encodes categorical variables and drops cases with missing values. */ - ca = cr_recoded_cat_ar_create (cmd.n_variables, cmd.v_variables); - for (r = casefile_get_reader (cf); - casereader_read (r, &c); case_destroy (&c)) + for (i = 0; i < cmd.n_variables; i++) { - row = casereader_cnum (r) - 1; - for (i = 0; i < ca->n_vars; i++) + v = cmd.v_variables[i]; + if (v->type == ALPHA) { - v = (*(ca->a + i))->v; - val = case_data (&c, v->fv); - cr_value_update (*(ca->a + i), val); + /* Make a place to hold the binary vectors + corresponding to this variable's values. */ + cat_stored_values_create (v); } - for (i = 0; i < cmd.n_variables; i++) + for (r = casefile_get_reader (cf); + casereader_read (r, &c); case_destroy (&c)) { - v = cmd.v_variables[i]; + row = casereader_cnum (r) - 1; + val = case_data (&c, v->fv); + cat_value_update (v, val); if (mv_is_value_missing (&v->miss, val)) { - n_data--; - is_missing_case[row] = 1; + if (!is_missing_case[row]) + { + /* Now it is missing. */ + n_data--; + is_missing_case[row] = 1; + } } } } + Y = gsl_vector_alloc (n_data); - cr_create_value_matrices (ca); X = design_matrix_create (n_indep, (const struct variable **) cmd.v_variables, - ca, n_data); - lcache = pspp_linreg_cache_alloc (n_data, n_indep); - lcache->indep_means = gsl_vector_alloc (n_indep); - lcache->indep_std = gsl_vector_alloc (n_indep); + n_data); + lcache = pspp_linreg_cache_alloc (X->m->size1, X->m->size2); + lcache->indep_means = gsl_vector_alloc (X->m->size2); + lcache->indep_std = gsl_vector_alloc (X->m->size2); /* The second pass creates the design matrix. @@ -581,22 +584,23 @@ run_regression (const struct casefile *cf, void *cmd_ UNUSED) { for (i = 0; i < cmd.n_variables; ++i) /* Iterate over the variables for the current case. - */ + */ { v = cmd.v_variables[i]; val = case_data (&c, v->fv); /* - Independent/dependent variable separation. The - 'variables' subcommand specifies a varlist which contains - both dependent and independent variables. The dependent - variables are specified with the 'dependent' - subcommand. We need to separate the two. - */ + Independent/dependent variable separation. The + 'variables' subcommand specifies a varlist which contains + both dependent and independent variables. The dependent + variables are specified with the 'dependent' + subcommand. We need to separate the two. + */ if (is_depvar (i)) { if (v->type != NUMERIC) { - msg (SE, gettext ("Dependent variable must be numeric.")); + msg (SE, + gettext ("Dependent variable must be numeric.")); pspp_reg_rc = CMD_FAILURE; return; } @@ -607,14 +611,13 @@ run_regression (const struct casefile *cf, void *cmd_ UNUSED) { if (v->type == ALPHA) { - rc = cr_var_to_recoded_categorical (v, ca); - design_matrix_set_categorical (X, row, v, val, rc); + design_matrix_set_categorical (X, row, v, val); } else if (v->type == NUMERIC) { design_matrix_set_numeric (X, row, v, val); } - + indep_vars[k] = i; k++; lopts.get_indep_mean_std[i] = 1; @@ -624,9 +627,9 @@ run_regression (const struct casefile *cf, void *cmd_ UNUSED) } } /* - Now that we know the number of coefficients, allocate space - and store pointers to the variables that correspond to the - coefficients. + Now that we know the number of coefficients, allocate space + and store pointers to the variables that correspond to the + coefficients. */ lcache->coeff = xnmalloc (X->m->size2 + 1, sizeof (*lcache->coeff)); for (i = 0; i < X->m->size2; i++) -- 2.30.2