From d0ac5f484f29fa3a9775e981a67866b7df2b61a5 Mon Sep 17 00:00:00 2001 From: John Darrington Date: Thu, 23 Jun 2016 10:07:23 +0200 Subject: [PATCH] Fixed GLM vs. missing values in the dependent variable --- NEWS | 3 + doc/statistics.texi | 13 ++-- src/language/stats/glm.c | 10 +++ tests/language/stats/glm.at | 128 ++++++++++++++++++++++++++++++++++++ 4 files changed, 148 insertions(+), 6 deletions(-) diff --git a/NEWS b/NEWS index 8ba15ffcdf..beebc9dd9f 100644 --- a/NEWS +++ b/NEWS @@ -20,6 +20,9 @@ Changes from 0.10.1 to 0.10.2: - The variable info dialog showed the previous selected variable superimposed with the currently selected one. + - The GLM command did not properly deal with missing values. This + has been fixed. + Changes from 0.10.0 to 0.10.1: * Bug fixes, including the following: diff --git a/doc/statistics.texi b/doc/statistics.texi index 34a7a3d987..e18c6109db 100644 --- a/doc/statistics.texi +++ b/doc/statistics.texi @@ -951,12 +951,13 @@ implies the model The @subcmd{MISSING} subcommand determines the handling of missing variables. -If @subcmd{INCLUDE} is set, then user-missing values are included in the -calculations, but system-missing values are not. -If @subcmd{EXCLUDE} is set, which is the default, user-missing -values are excluded as well as system-missing values. -This is the default. - +If @subcmd{INCLUDE} is set then, for the purposes of GLM analysis, +only system-missing values are considered +to be missing; user-missing values are not regarded as missing. +If @subcmd{EXCLUDE} is set, which is the default, then user-missing +values are considered to be missing as well as system-missing values. +A case for which any dependent variable or any factor +variable has a missing value is excluded from the analysis. @node LOGISTIC REGRESSION @section LOGISTIC REGRESSION diff --git a/src/language/stats/glm.c b/src/language/stats/glm.c index 625de2d2a9..9ac3150c0a 100644 --- a/src/language/stats/glm.c +++ b/src/language/stats/glm.c @@ -587,6 +587,16 @@ run_glm (struct glm_spec *cmd, struct casereader *input, struct glm_workspace ws; struct covariance *cov; + input = casereader_create_filter_missing (input, + cmd->dep_vars, cmd->n_dep_vars, + cmd->exclude, + NULL, NULL); + + input = casereader_create_filter_missing (input, + cmd->factor_vars, cmd->n_factor_vars, + cmd->exclude, + NULL, NULL); + ws.cats = categoricals_create (cmd->interactions, cmd->n_interactions, cmd->wv, cmd->exclude, MV_ANY); diff --git a/tests/language/stats/glm.at b/tests/language/stats/glm.at index de1ad2a4bc..2ac5909fea 100644 --- a/tests/language/stats/glm.at +++ b/tests/language/stats/glm.at @@ -326,3 +326,131 @@ Corrected Total,436.784,19,,, AT_CLEANUP + +AT_SETUP([GLM missing values]) + +AT_DATA([glm.data], [dnl +1 1 6 3.5 +1 2 2 8.9 +1 3 3 9.6 +1 4 4 10.5 +1 5 5 3.1 +1 6 1 5.9 +2 1 2 4.2 +2 2 6 1.9 +2 3 5 3.7 +2 4 3 10.2 +2 5 1 7.2 +2 6 4 7.6 +3 1 1 6.7 +3 2 4 5.8 +3 3 6 -2.7 +3 4 2 4.6 +3 5 3 4.0 +3 6 5 -0.7 +4 1 4 6.6 +4 2 1 4.5 +4 3 2 3.7 +4 4 5 3.7 +4 5 6 -3.3 +4 6 3 3.0 +5 1 3 4.1 +5 2 5 2.4 +5 3 4 6.0 +5 4 1 5.1 +5 5 2 3.5 +5 6 6 4.0 +6 1 5 3.8 +6 2 3 5.8 +6 3 1 7.0 +6 4 6 3.8 +6 5 4 5.0 +6 6 2 8.6 +]) + +AT_DATA([glm-miss.sps], [dnl +set format = F20.3. +data list file='glm.data' notable fixed /a 1 b 3 c 5 y 7-10(2). + +do if a=6. +recode y (else=SYSMIS). +end if. + +glm y by b a c + /criteria=alpha(.05) + /design = a b c + . +]) + +AT_CHECK([pspp -O format=csv glm-miss.sps], [0], [dnl +Table: Tests of Between-Subjects Effects +Source,Type III Sum of Squares,df,Mean Square,F,Sig. +Corrected Model,251.621,14,17.973,4.969,.002 +Intercept,628.376,1,628.376,173.737,.000 +a,72.929,4,18.232,5.041,.009 +b,20.703,5,4.141,1.145,.380 +c,135.179,5,27.036,7.475,.001 +Error,54.253,15,3.617,, +Total,934.250,30,,, +Corrected Total,305.874,29,,, +]) + + + +AT_DATA([glm-miss2.sps], [dnl +set format = F20.3. +data list file='glm.data' notable fixed /a 1 b 3 c 5 y 7-10(2). + +select if a <> 6. + +glm y by b a c + /criteria=alpha(.05) + /design = a b c + . +]) + +AT_CHECK([pspp -O format=csv glm-miss2.sps], [0], [dnl +Table: Tests of Between-Subjects Effects +Source,Type III Sum of Squares,df,Mean Square,F,Sig. +Corrected Model,251.621,14,17.973,4.969,.002 +Intercept,628.376,1,628.376,173.737,.000 +a,72.929,4,18.232,5.041,.009 +b,20.703,5,4.141,1.145,.380 +c,135.179,5,27.036,7.475,.001 +Error,54.253,15,3.617,, +Total,934.250,30,,, +Corrected Total,305.874,29,,, +]) + + +dnl Now for some missing values in the factor variables. + +AT_DATA([glm-miss3.sps], [dnl +set format = F20.3. +data list file=glm.data notable fixed /a 1 b 3 c 5 y 7-10(2). + +do if a=6. +recode a (else=SYSMIS). +end if. + +glm y by b a c + /criteria=alpha(.05) + /design = a b c + . +]) + +AT_CHECK([pspp -O format=csv glm-miss3.sps], [0], [dnl +Table: Tests of Between-Subjects Effects +Source,Type III Sum of Squares,df,Mean Square,F,Sig. +Corrected Model,251.621,14,17.973,4.969,.002 +Intercept,628.376,1,628.376,173.737,.000 +a,72.929,4,18.232,5.041,.009 +b,20.703,5,4.141,1.145,.380 +c,135.179,5,27.036,7.475,.001 +Error,54.253,15,3.617,, +Total,934.250,30,,, +Corrected Total,305.874,29,,, +]) + +AT_CLEANUP + -- 2.30.2