From dda82f83e66a6e5efa72f0e15a1f76a61f990e92 Mon Sep 17 00:00:00 2001 From: John Darrington Date: Wed, 14 Nov 2012 17:05:01 +0100 Subject: [PATCH] Logistic Regression: Handle missing categoricals Ensure that missing values on categorical predictors are handled properly. --- src/language/stats/logistic.c | 91 ++++++++++++++++++--- tests/language/stats/logistic.at | 136 +++++++++++++++++++++++++++++++ 2 files changed, 217 insertions(+), 10 deletions(-) diff --git a/src/language/stats/logistic.c b/src/language/stats/logistic.c index f85b6ec39e..60e1757abe 100644 --- a/src/language/stats/logistic.c +++ b/src/language/stats/logistic.c @@ -65,6 +65,8 @@ #include "libpspp/misc.h" #include "math/categoricals.h" #include "math/interaction.h" +#include "libpspp/hmap.h" +#include "libpspp/hash-functions.h" #include "output/tab.h" @@ -101,6 +103,12 @@ struct lr_spec struct interaction **cat_predictors; size_t n_cat_predictors; + + /* The union of the categorical and non-categorical variables */ + const struct variable **indep_vars; + size_t n_indep_vars; + + /* Which classes of missing vars are to be excluded */ enum mv_class exclude; @@ -441,10 +449,10 @@ beta_hat_initial (const struct lr_spec *cmd, struct lr_result *res, struct caser double weight = dict_get_case_weight (cmd->dict, c, &res->warn_bad_weight); const union value *depval = case_data (c, cmd->dep_var); - for (v = 0; v < cmd->n_predictor_vars; ++v) + for (v = 0; v < cmd->n_indep_vars; ++v) { - const union value *val = case_data (c, cmd->predictor_vars[v]); - if (var_is_value_missing (cmd->predictor_vars[v], val, cmd->exclude)) + const union value *val = case_data (c, cmd->indep_vars[v]); + if (var_is_value_missing (cmd->indep_vars[v], val, cmd->exclude)) { missing = true; break; @@ -583,8 +591,8 @@ run_lr (const struct lr_spec *cmd, struct casereader *input, input = casereader_create_filter_missing (input, - cmd->predictor_vars, - cmd->n_predictor_vars, + cmd->indep_vars, + cmd->n_indep_vars, cmd->exclude, NULL, NULL); @@ -671,6 +679,28 @@ run_lr (const struct lr_spec *cmd, struct casereader *input, return true; } +struct variable_node +{ + struct hmap_node node; /* Node in hash map. */ + const struct variable *var; /* The variable */ +}; + +static struct variable_node * +lookup_variable (const struct hmap *map, const struct variable *var, unsigned int hash) +{ + struct variable_node *vn = NULL; + HMAP_FOR_EACH_WITH_HASH (vn, struct variable_node, node, hash, map) + { + if (vn->var == var) + break; + + fprintf (stderr, "Warning: Hash table collision\n"); + } + + return vn; +} + + /* Parse the LOGISTIC REGRESSION command syntax */ int cmd_logistic (struct lexer *lexer, struct dataset *ds) @@ -697,7 +727,7 @@ cmd_logistic (struct lexer *lexer, struct dataset *ds) lr.print = PRINT_DEFAULT; lr.cat_predictors = NULL; lr.n_cat_predictors = 0; - + lr.indep_vars = NULL; if (lex_match_id (lexer, "VARIABLES")) @@ -920,26 +950,50 @@ cmd_logistic (struct lexer *lexer, struct dataset *ds) final one, dropping any categorical variables which appear there. FIXME: This is O(NxM). */ + + { + struct variable_node *vn, *next; + struct hmap allvars; + hmap_init (&allvars); for (v = x = 0; v < n_pred_vars; ++v) { bool drop = false; const struct variable *var = pred_vars[v]; int cv = 0; + + unsigned int hash = hash_pointer (var, 0); + struct variable_node *vn = lookup_variable (&allvars, var, hash); + if (vn == NULL) + { + vn = xmalloc (sizeof *vn); + vn->var = var; + hmap_insert (&allvars, &vn->node, hash); + } + for (cv = 0; cv < lr.n_cat_predictors ; ++cv) { int iv; const struct interaction *iact = lr.cat_predictors[cv]; for (iv = 0 ; iv < iact->n_vars ; ++iv) { - if (var == iact->vars[iv]) + const struct variable *ivar = iact->vars[iv]; + unsigned int hash = hash_pointer (ivar, 0); + struct variable_node *vn = lookup_variable (&allvars, ivar, hash); + if (vn == NULL) + { + vn = xmalloc (sizeof *vn); + vn->var = ivar; + + hmap_insert (&allvars, &vn->node, hash); + } + + if (var == ivar) { drop = true; - goto dropped; } } } - dropped: if (drop) continue; @@ -949,8 +1003,21 @@ cmd_logistic (struct lexer *lexer, struct dataset *ds) } free (pred_vars); + lr.n_indep_vars = hmap_count (&allvars); + lr.indep_vars = xmalloc (lr.n_indep_vars * sizeof *lr.indep_vars); + + /* Interate over each variable and push it into the array */ + x = 0; + HMAP_FOR_EACH_SAFE (vn, next, struct variable_node, node, &allvars) + { + lr.indep_vars[x++] = vn->var; + free (vn); + } + hmap_destroy (&allvars); + } - /* Run logistical regression for each split group */ + + /* logistical regression for each split group */ { struct casegrouper *grouper; struct casereader *group; @@ -965,12 +1032,16 @@ cmd_logistic (struct lexer *lexer, struct dataset *ds) free (lr.predictor_vars); free (lr.cat_predictors); + free (lr.indep_vars); + return CMD_SUCCESS; error: free (lr.predictor_vars); free (lr.cat_predictors); + free (lr.indep_vars); + return CMD_FAILURE; } diff --git a/tests/language/stats/logistic.at b/tests/language/stats/logistic.at index b4d9eef458..9ab04f8cec 100644 --- a/tests/language/stats/logistic.at +++ b/tests/language/stats/logistic.at @@ -1028,3 +1028,139 @@ logistic regression y with x1 x2 AT_CHECK([pspp -O format=csv crash.sps], [1], [ignore]) AT_CLEANUP + + +dnl Test that missing values on the categorical predictors are treated +dnl properly. +AT_SETUP([LOGISTIC REGRESSION missing categoricals]) + +AT_DATA([data.txt], [dnl + .00 3.69 .00 + .00 1.16 1.00 + 1.00 -12.99 .00 + .00 2.97 1.00 + .00 20.48 .00 + .00 4.90 .00 + 1.00 -4.38 .00 + .00 -1.69 1.00 + 1.00 -5.71 .00 + 1.00 -14.28 .00 + .00 9.00 .00 + .00 2.89 1.00 + .00 13.51 1.00 + .00 23.32 1.00 + .00 2.31 1.00 + .00 -2.07 1.00 + 1.00 -4.52 1.00 + 1.00 -5.83 .00 + 1.00 -1.91 .00 + 1.00 -11.12 1.00 + .00 -1.51 .00 + .00 6.59 1.00 + .00 19.28 1.00 + .00 5.94 .00 + .00 8.21 1.00 + .00 8.11 1.00 + .00 2.49 .00 + .00 9.62 .00 + 1.00 -20.74 1.00 + .00 -1.41 1.00 + .00 15.15 1.00 + .00 9.39 .00 + 1.00 -15.14 1.00 + 1.00 -5.86 .00 + 1.00 -11.64 1.00 + 1.00 -14.36 .00 + 1.00 -8.95 1.00 + 1.00 -16.42 1.00 + 1.00 -1.04 1.00 + .00 12.89 1.00 + .00 -7.08 1.00 + .00 4.87 1.00 + .00 11.53 1.00 + 1.00 -6.24 1.00 + .00 1.25 1.00 + .00 4.39 1.00 + .00 3.17 .00 + .00 19.39 1.00 + .00 13.03 1.00 + .00 2.43 .00 + 1.00 -14.73 1.00 + .00 8.25 1.00 + 1.00 -13.28 1.00 + .00 5.27 1.00 + 1.00 -3.46 1.00 + .00 13.81 1.00 + .00 1.35 1.00 + 1.00 -3.94 1.00 + .00 20.73 1.00 + 1.00 -15.40 .00 + 1.00 -11.01 1.00 + .00 4.56 .00 + 1.00 -15.35 1.00 + .00 15.21 .00 + .00 5.34 1.00 + 1.00 -21.55 1.00 + .00 10.12 1.00 + .00 -.73 1.00 + .00 15.28 1.00 + .00 11.08 1.00 + 1.00 -8.24 .00 + .00 2.46 .00 + .00 9.60 .00 + .00 11.24 .00 + .00 14.13 1.00 + .00 19.72 1.00 + .00 5.58 .00 + .00 26.23 1.00 + .00 7.25 .00 + 1.00 -.79 .00 + .00 6.24 .00 + 1.00 1.16 .00 + 1.00 -7.89 1.00 + 1.00 -1.86 1.00 + 1.00 -10.80 1.00 + 1.00 -5.51 .00 + .00 7.51 .00 + .00 11.18 .00 + .00 8.73 .00 + 1.00 -11.21 1.00 + 1.00 -13.24 .00 + .00 19.34 .00 + .00 9.32 1.00 + .00 17.97 1.00 + 1.00 -1.56 1.00 + 1.00 -3.13 .00 + .00 3.98 .00 + .00 -1.21 1.00 + .00 2.37 .00 + 1.00 -18.03 1.00 +]) + +AT_DATA([miss.sps], [dnl +data list notable file='data.txt' list /y x1 cat0*. + +logistic regression y with x1 cat0 + /categorical = cat0. +]) + +AT_CHECK([pspp -O format=csv miss.sps > file1], [0], [ignore]) + +dnl Append a case with a missing categorical. +AT_CHECK([echo '1 34 .' >> data.txt], [0], [ignore]) + +AT_CHECK([pspp -O format=csv miss.sps > file2], [0], [ignore]) + +AT_CHECK([diff file1 file2], [1], [dnl +8,10c8,10 +< Included in Analysis,100,100.00 +< Missing Cases,0,.00 +< Total,100,100.00 +--- +> Included in Analysis,100,99.01 +> Missing Cases,1,.99 +> Total,101,100.00 +]) + +AT_CLEANUP + -- 2.30.2