From 6982cc4aef29f7026e75702174efa8091d9badb2 Mon Sep 17 00:00:00 2001 From: John Darrington Date: Fri, 23 Nov 2012 09:01:36 +0100 Subject: [PATCH] Logistic Regression: Ignore cases with missing dependent variables. Prior to this change, cases with missing values in the predictor variables were ignored, but cases with missing values only in the dependent variable would provoke an error. Subsequent to this change such cases are also ignored. This seems to be the behaviour of other software, despite what their documentation may suggest. --- src/language/stats/logistic.c | 11 +++++++++++ tests/language/stats/logistic.at | 10 ++++++---- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/language/stats/logistic.c b/src/language/stats/logistic.c index 8e21b175e1..2c214516b7 100644 --- a/src/language/stats/logistic.c +++ b/src/language/stats/logistic.c @@ -478,6 +478,11 @@ initial_pass (const struct lr_spec *cmd, struct lr_result *res, struct casereade double weight = dict_get_case_weight (cmd->dict, c, &res->warn_bad_weight); const union value *depval = case_data (c, cmd->dep_var); + if (var_is_value_missing (cmd->dep_var, depval, cmd->exclude)) + { + missing = true; + } + else for (v = 0; v < cmd->n_indep_vars; ++v) { const union value *val = case_data (c, cmd->indep_vars[v]); @@ -623,6 +628,12 @@ run_lr (const struct lr_spec *cmd, struct casereader *input, NULL, NULL); + input = casereader_create_filter_missing (input, + &cmd->dep_var, + 1, + cmd->exclude, + NULL, + NULL); work.hessian = gsl_matrix_calloc (work.beta_hat->size, work.beta_hat->size); diff --git a/tests/language/stats/logistic.at b/tests/language/stats/logistic.at index b175244bdb..b7285e26d1 100644 --- a/tests/language/stats/logistic.at +++ b/tests/language/stats/logistic.at @@ -140,7 +140,7 @@ set decimal dot. data list notable file='lr-data.txt' list /id outcome survrate prognos amttreat gsi avoid intrus pre_1 lre_1 w *. -missing values survrate (999) avoid (44444). +missing values survrate (999) avoid (44444) outcome (99). logistic regression variables = outcome with survrate avoid @@ -149,10 +149,12 @@ logistic regression AT_CHECK([pspp -O format=csv lr-data.sps > run0], [0], [ignore]) +dnl Append some cases with missing values into the data. cat >> lr-data.txt << HERE 105.00 1.00 999.00 3.00 2.00 .35 17.00 20.00 .50110 -2.00440 1 106.00 1.00 999.00 2.00 3.00 .38 7.00 15.00 .20168 -1.25264 1 107.00 1.00 5.00 3.00 2.00 .28 44444 34 .00897 -1.00905 1 + 108.00 99 5.00 3.00 2.00 .28 4 34 .00897 -1.00905 1 HERE AT_CHECK([pspp -O format=csv lr-data.sps > run1], [0], [ignore]) @@ -164,9 +166,9 @@ AT_CHECK([diff run0 run1], [1], [dnl < Missing Cases,0,.000 < Total,66,100.000 --- -> Included in Analysis,66,95.652 -> Missing Cases,3,4.348 -> Total,69,100.000 +> Included in Analysis,66,94.286 +> Missing Cases,4,5.714 +> Total,70,100.000 ]) AT_CLEANUP -- 2.30.2