1 dnl PSPP - a program for statistical analysis.
2 dnl Copyright (C) 2017 Free Software Foundation, Inc.
4 dnl This program is free software: you can redistribute it and/or modify
5 dnl it under the terms of the GNU General Public License as published by
6 dnl the Free Software Foundation, either version 3 of the License, or
7 dnl (at your option) any later version.
9 dnl This program is distributed in the hope that it will be useful,
10 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
11 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 dnl GNU General Public License for more details.
14 dnl You should have received a copy of the GNU General Public License
15 dnl along with this program. If not, see <http://www.gnu.org/licenses/>.
17 AT_BANNER([LOGISTIC REGRESSION])
19 dnl These examples are adapted from
20 dnl http://www.uvm.edu/~dhowell/gradstat/psych341/lectures/Logistic%20Regression/LogisticReg1.html
24 m4_define([LOGIT_TEST_DATA],
25 [AT_DATA([lr-data.txt], dnl
26 105.00 1.00 33.00 3.00 2.00 .35 17.00 20.00 .50110 -2.00440 1
27 106.00 1.00 50.00 2.00 3.00 .38 7.00 15.00 .20168 -1.25264 1
28 107.00 1.00 91.00 3.00 2.00 .28 15.00 7.00 .00897 -1.00905 1
29 108.00 1.00 90.00 3.00 2.00 .20 2.00 2.00 .00972 -1.00982 1
30 109.00 1.00 70.00 3.00 3.00 .38 23.00 27.00 .04745 -1.04981 1
31 111.00 2.00 31.00 2.00 2.00 .00 19.00 10.00 .54159 1.84640 1
32 112.00 1.00 91.00 2.00 3.00 .18 6.00 16.00 .00897 -1.00905 1
33 113.00 1.00 81.00 3.00 2.00 .00 3.00 9.00 .01998 -1.02039 1
34 114.00 2.00 15.00 1.00 2.00 .13 19.00 13.00 .81241 1.23090 1
35 116.00 2.00 1.00 1.00 2.00 .88 15.00 7.00 .93102 1.07410 1
36 117.00 1.00 93.00 3.00 2.00 .18 9.00 15.00 .00764 -1.00770 1
37 118.00 2.00 14.00 1.00 3.00 .15 23.00 18.00 .82447 1.21289 1
38 120.00 1.00 91.00 2.00 2.00 .43 17.00 14.00 .00897 -1.00905 1
39 121.00 1.00 55.00 3.00 2.00 .69 20.00 14.00 .14409 -1.16834 1
40 122.00 1.00 70.00 2.00 3.00 .03 .00 6.00 .04745 -1.04981 1
41 123.00 1.00 25.00 2.00 2.00 .45 4.00 10.00 .65789 -2.92301 1
42 125.00 1.00 91.00 2.00 2.00 .13 .00 3.00 .00897 -1.00905 1
43 126.00 1.00 91.00 3.00 3.00 .23 4.00 6.00 .00897 -1.00905 1
44 127.00 1.00 91.00 3.00 2.00 .00 8.00 8.00 .00897 -1.00905 1
45 128.00 2.00 13.00 2.00 2.00 .65 16.00 14.00 .83592 1.19629 1
46 129.00 1.00 50.00 2.00 2.00 .25 20.00 23.00 .20168 -1.25264 1
47 135.00 1.00 90.00 3.00 3.00 .03 5.00 12.00 .00972 -1.00982 1
48 138.00 1.00 70.00 3.00 3.00 .10 1.00 6.00 .04745 -1.04981 1
49 139.00 2.00 19.00 3.00 3.00 .10 11.00 12.00 .75787 1.31949 1
50 149.00 2.00 50.00 3.00 2.00 .03 .00 .00 .20168 4.95826 1
51 204.00 1.00 50.00 3.00 1.00 .13 .00 1.00 .20168 -1.25264 1
52 205.00 1.00 91.00 3.00 3.00 .72 16.00 18.00 .00897 -1.00905 1
53 206.00 2.00 24.00 1.00 1.00 .10 5.00 21.00 .67592 1.47947 1
54 207.00 1.00 80.00 3.00 3.00 .13 6.00 7.00 .02164 -1.02212 1
55 208.00 1.00 87.00 2.00 2.00 .18 9.00 20.00 .01237 -1.01253 1
56 209.00 1.00 70.00 2.00 2.00 .53 15.00 12.00 .04745 -1.04981 1
57 211.00 1.00 55.00 2.00 1.00 .33 8.00 5.00 .14409 -1.16834 1
58 212.00 1.00 56.00 3.00 1.00 .30 6.00 20.00 .13436 -1.15522 1
59 214.00 1.00 54.00 2.00 2.00 .15 .00 16.00 .15439 -1.18258 1
60 215.00 1.00 71.00 3.00 3.00 .35 12.00 12.00 .04391 -1.04592 1
61 217.00 2.00 36.00 1.00 1.00 .10 12.00 8.00 .44049 2.27020 1
62 218.00 1.00 91.00 2.00 2.00 .05 11.00 25.00 .00897 -1.00905 1
63 219.00 1.00 91.00 2.00 2.00 1.23 11.00 24.00 .00897 -1.00905 1
64 220.00 1.00 91.00 2.00 3.00 .08 8.00 11.00 .00897 -1.00905 1
65 221.00 1.00 91.00 2.00 2.00 .33 5.00 11.00 .00897 -1.00905 1
66 222.00 2.00 36.00 2.00 1.00 .18 5.00 3.00 .44049 2.27020 1
67 223.00 1.00 70.00 2.00 3.00 .18 14.00 3.00 .04745 -1.04981 1
68 224.00 1.00 91.00 2.00 2.00 .43 2.00 10.00 .00897 -1.00905 1
69 225.00 1.00 55.00 2.00 1.00 .18 6.00 11.00 .14409 -1.16834 1
70 229.00 2.00 75.00 2.00 2.00 .40 30.00 25.00 .03212 31.12941 1
71 232.00 1.00 91.00 3.00 2.00 .15 6.00 3.00 .00897 -1.00905 1
72 233.00 1.00 70.00 2.00 1.00 .00 11.00 8.00 .04745 -1.04981 1
73 234.00 1.00 54.00 3.00 2.00 .10 .00 .00 .15439 -1.18258 1
74 237.00 1.00 70.00 3.00 2.00 .18 5.00 25.00 .04745 -1.04981 1
75 241.00 1.00 19.00 2.00 3.00 .33 13.00 9.00 .75787 -4.12995 1
76 304.00 2.00 18.00 2.00 2.00 .26 25.00 6.00 .77245 1.29458 1
77 305.00 1.00 88.00 3.00 2.00 1.35 17.00 29.00 .01142 -1.01155 1
78 306.00 1.00 70.00 2.00 3.00 .63 14.00 33.00 .04745 -1.04981 1
79 307.00 1.00 85.00 2.00 2.00 2.65 18.00 14.00 .01452 -1.01474 1
80 308.00 1.00 13.00 2.00 2.00 .23 5.00 5.00 .83592 -6.09442 1
81 309.00 2.00 13.00 2.00 2.00 .23 7.00 17.00 .83592 1.19629 1
82 311.00 2.00 1.00 2.00 2.00 .50 20.00 14.00 .93102 1.07410 1
83 315.00 1.00 19.00 2.00 3.00 .18 1.00 11.00 .75787 -4.12995 1
84 316.00 1.00 88.00 2.00 2.00 .38 12.00 11.00 .01142 -1.01155 2
85 318.00 1.00 88.00 3.00 2.00 .03 5.00 5.00 .01142 -1.01155 3
86 319.00 2.00 18.00 2.00 3.00 .30 15.00 16.00 .77245 1.29458 1
87 321.00 2.00 15.00 2.00 2.00 .63 15.00 18.00 .81241 1.23090 1
88 322.00 1.00 88.00 3.00 2.00 .40 18.00 15.00 .01142 -1.01155 1
89 325.00 2.00 18.00 2.00 3.00 1.00 28.00 18.00 .77245 1.29458 1
90 329.00 1.00 88.00 3.00 2.00 .03 7.00 11.00 .01142 -1.01155 4
91 332.00 2.00 2.00 2.00 2.00 .05 8.00 9.00 .92562 1.08036 1
94 dnl Note: In the above data cases 305, 316 318 and 329 have identical values
95 dnl of the 2nd and 3rd variables. We use this for weight testing.
97 AT_SETUP([LOGISTIC REGRESSION basic test])
98 AT_KEYWORDS([categorical categoricals])
102 AT_DATA([lr-data.sps], [dnl
105 data list notable file='lr-data.txt'
106 list /id outcome survrate prognos amttreat gsi avoid intrus pre_1 lre_1 w *.
109 variables = outcome with survrate
113 AT_CHECK([pspp -O format=csv lr-data.sps], [0],
115 Table: Dependent Variable Encoding
116 Original Value,Internal Value
120 Table: Case Processing Summary
121 Unweighted Cases,N,Percent
122 Included in Analysis,66,100.000
126 note: Estimation terminated at iteration number 6 because parameter estimates changed by less than 0.001
129 Step 1,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
132 Table: Classification Table
134 ,,,outcome,,"Percentage
136 ,Observed,,1.000,2.000,
137 Step 1,outcome,1.000,43,5,89.583
139 ,Overall Percentage,,,,86.364
141 Table: Variables in the Equation
142 ,,B,S.E.,Wald,df,Sig.,Exp(B)
143 Step 1,survrate,-.081,.019,17.756,1,.000,.922
144 ,Constant,2.684,.811,10.941,1,.001,14.639
150 AT_SETUP([LOGISTIC REGRESSION missing values])
151 AT_KEYWORDS([categorical categoricals])
155 AT_DATA([lr-data.sps], [dnl
158 data list notable file='lr-data.txt'
159 list /id outcome survrate prognos amttreat gsi avoid intrus pre_1 lre_1 w *.
161 missing values survrate (999) avoid (44444) outcome (99).
164 variables = outcome with survrate avoid
168 AT_CHECK([pspp -O format=csv lr-data.sps > run0], [0], [ignore])
170 dnl Append some cases with missing values into the data.
171 cat >> lr-data.txt << HERE
172 105.00 1.00 999.00 3.00 2.00 .35 17.00 20.00 .50110 -2.00440 1
173 106.00 1.00 999.00 2.00 3.00 .38 7.00 15.00 .20168 -1.25264 1
174 107.00 1.00 5.00 3.00 2.00 .28 44444 34 .00897 -1.00905 1
175 108.00 99 5.00 3.00 2.00 .28 4 34 .00897 -1.00905 1
178 AT_CHECK([pspp -O format=csv lr-data.sps > run1], [0], [ignore])
180 dnl Only the summary information should be different
181 AT_CHECK([diff run0 run1], [1], [dnl
183 < Included in Analysis,66,100.000
184 < Missing Cases,0,.000
187 > Included in Analysis,66,94.286
188 > Missing Cases,4,5.714
196 dnl Check that a weighted dataset is interpreted correctly
197 dnl To do this, the same data set is used, one weighted, one not.
198 dnl The weighted dataset omits certain cases which are identical
199 AT_SETUP([LOGISTIC REGRESSION weights])
200 AT_KEYWORDS([categorical categoricals])
204 AT_DATA([lr-data-unweighted.sps], [dnl
207 data list notable file='lr-data.txt'
208 list /id outcome survrate prognos amttreat gsi avoid intrus pre_1 lre_1 w *.
211 variables = outcome with survrate
215 AT_DATA([lr-data-weighted.sps], [dnl
218 data list notable file='lr-data.txt'
219 list /id outcome survrate prognos amttreat gsi avoid intrus pre_1 lre_1 w *.
223 * Omit duplicate cases.
224 select if id <> 305 and id <> 316 and id <> 318.
227 variables = outcome with survrate
232 AT_CHECK([pspp -O format=csv lr-data-unweighted.sps > unweighted-result], [0], [ignore])
233 AT_CHECK([pspp -O format=csv lr-data-weighted.sps > weighted-result], [0], [ignore])
235 dnl The only difference should be the summary information, since
236 dnl this displays the unweighted totals.
237 AT_CHECK([diff unweighted-result weighted-result], [1], [dnl
239 < Included in Analysis,66,100.000
241 > Included in Analysis,63,100.000
247 < Step 1,outcome,1.000,43,5,89.583
248 < ,,2.000,4,14,77.778
250 > Step 1,outcome,1.000,43.000,5.000,89.583
251 > ,,2.000,4.000,14.000,77.778
258 dnl Check that the /NOCONST option works as intended.
259 dnl The results this produces are very similar to those
260 dnl at the example in http://www.ats.ucla.edu/stat/SPSS/faq/logregconst.htm
261 AT_SETUP([LOGISTIC REGRESSION without constant])
262 AT_KEYWORDS([categorical categoricals])
264 AT_DATA([non-const.sps], [dnl
269 compute female = (#i > 91).
275 compute constant = 1.
277 logistic regression female with constant /noconst.
280 AT_CHECK([pspp -O format=csv non-const.sps], [0],
282 Table: Dependent Variable Encoding
283 Original Value,Internal Value
287 Table: Case Processing Summary
288 Unweighted Cases,N,Percent
289 Included in Analysis,200,100.000
293 note: Estimation terminated at iteration number 2 because parameter estimates changed by less than 0.001
296 Step 1,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
299 Table: Classification Table
301 ,,,female,,"Percentage
304 Step 1,female,.00,0,91,.000
306 ,Overall Percentage,,,,54.500
308 Table: Variables in the Equation
309 ,,B,S.E.,Wald,df,Sig.,Exp(B)
310 Step 1,constant,.180,.142,1.616,1,.204,1.198
317 dnl Check that if somebody passes a dependent variable which is not dichtomous,
318 dnl then an error is raised.
319 AT_SETUP([LOGISTIC REGRESSION non-dichotomous dep var])
320 AT_KEYWORDS([categorical categoricals])
322 AT_DATA([non-dich.sps], [dnl
323 data list notable list /y x1 x2 x3 x4.
330 logistic regression y with x1 x2 x3 x4.
333 AT_CHECK([pspp -O format=csv non-dich.sps], [1],
335 error: Dependent variable's values are not dichotomous.
342 dnl An example to check the behaviour of LOGISTIC REGRESSION with a categorical
343 dnl variable. This examṕle was inspired from that at:
344 dnl http://www.ats.ucla.edu/stat/spss/dae/logit.htm
345 AT_SETUP([LOGISTIC REGRESSION with categorical])
346 AT_KEYWORDS([categorical categoricals])
348 AT_DATA([lr-cat.data], [dnl
751 AT_DATA([lr-cat.sps], [dnl
754 data list notable list file='lr-cat.data' /b1 b2 bcat y.
762 AT_CHECK([pspp -O format=csv lr-cat.sps], [0],
764 Table: Dependent Variable Encoding
765 Original Value,Internal Value
769 Table: Case Processing Summary
770 Unweighted Cases,N,Percent
771 Included in Analysis,400,100.000
775 note: Estimation terminated at iteration number 4 because parameter estimates changed by less than 0.001
778 Step 1,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
781 Table: Categorical Variables' Codings
782 ,,,Parameter coding,,
783 ,,Frequency,(1),(2),(3)
789 Table: Classification Table
793 ,Observed,,4.000,9.000,
794 Step 1,y,4.000,254,19,93.040
796 ,Overall Percentage,,,,71.000
798 Table: Variables in the Equation
799 ,,B,S.E.,Wald,df,Sig.,Exp(B)
800 Step 1,b1,.002,.001,4.284,1,.038,1.002
801 ,b2,.804,.332,5.872,1,.015,2.235
802 ,bcat,,,20.895,3,.000,
803 ,bcat(1),1.551,.418,13.788,1,.000,4.718
804 ,bcat(2),.876,.367,5.706,1,.017,2.401
805 ,bcat(3),.211,.393,.289,1,.591,1.235
806 ,Constant,-5.541,1.138,23.709,1,.000,.004
813 dnl This example is inspired by http://www.ats.ucla.edu/stat/spss/output/logistic.htm
814 AT_SETUP([LOGISTIC REGRESSION with cat var 2])
815 AT_KEYWORDS([categorical categoricals])
817 AT_DATA([lr-cat2.data], [dnl
818 60.00 1.00 8.00 50.00
820 57.00 1.00 7.00 53.00
828 68.00 1.00 9.00 69.00
832 57.00 1.00 7.00 61.00
833 55.00 1.00 8.00 50.00
836 50.00 1.00 9.00 66.00
840 47.00 1.00 7.00 34.00
852 68.00 1.00 9.00 69.00
854 63.00 1.00 9.00 61.00
855 65.00 1.00 9.00 61.00
856 63.00 1.00 9.00 53.00
860 52.00 1.00 7.00 56.00
862 47.00 1.00 7.00 53.00
864 50.00 1.00 8.00 55.00
875 68.00 1.00 9.00 55.00
876 47.00 1.00 8.00 50.00
884 55.00 1.00 9.00 49.00
885 68.00 1.00 8.00 50.00
886 52.00 1.00 9.00 63.00
889 66.00 1.00 9.00 61.00
890 65.00 1.00 7.00 58.00
892 68.00 1.00 7.00 59.00
893 60.00 1.00 9.00 61.00
895 57.00 1.00 7.00 54.00
904 63.00 1.00 7.00 63.00
906 57.00 1.00 8.00 63.00
913 65.00 1.00 9.00 63.00
918 63.00 1.00 9.00 55.00
927 47.00 1.00 9.00 69.00
932 50.00 1.00 7.00 63.00
935 73.00 1.00 9.00 61.00
940 57.00 1.00 8.00 55.00
941 53.00 1.00 8.00 57.00
945 57.00 1.00 8.00 58.00
955 73.00 1.00 8.00 69.00
956 71.00 1.00 9.00 58.00
958 63.00 1.00 7.00 54.00
964 65.00 1.00 8.00 55.00
965 76.00 1.00 9.00 67.00
966 71.00 1.00 8.00 66.00
968 47.00 1.00 9.00 63.00
971 54.00 1.00 9.00 55.00
972 55.00 1.00 8.00 58.00
974 55.00 1.00 9.00 63.00
983 65.00 1.00 9.00 66.00
987 63.00 1.00 8.00 72.00
993 73.00 1.00 9.00 58.00
995 63.00 1.00 9.00 69.00
997 65.00 1.00 9.00 66.00
998 73.00 1.00 8.00 63.00
1000 36.00 .00 8.00 42.00
1001 28.00 .00 7.00 44.00
1002 47.00 .00 8.00 44.00
1003 57.00 .00 7.00 47.00
1004 34.00 .00 7.00 29.00
1005 47.00 .00 9.00 66.00
1006 57.00 .00 8.00 58.00
1007 60.00 1.00 9.00 50.00
1008 50.00 .00 9.00 47.00
1009 73.00 1.00 9.00 55.00
1010 52.00 1.00 8.00 47.00
1011 55.00 .00 8.00 53.00
1012 47.00 .00 8.00 53.00
1013 50.00 .00 8.00 61.00
1014 61.00 .00 7.00 44.00
1015 52.00 .00 9.00 53.00
1016 47.00 .00 7.00 40.00
1017 47.00 .00 7.00 50.00
1020 AT_DATA([stringcat.sps], [dnl
1022 data list notable file='lr-cat2.data' list /read honcomp wiz science *.
1025 recode wiz (7 = "a") (8 = "b") (9 = "c") into ses.
1027 logistic regression honcomp with read science ses
1032 AT_CHECK([pspp -O format=csv stringcat.sps], [0],
1034 Table: Dependent Variable Encoding
1035 Original Value,Internal Value
1039 Table: Case Processing Summary
1040 Unweighted Cases,N,Percent
1041 Included in Analysis,200,100.000
1042 Missing Cases,0,.000
1045 note: Estimation terminated at iteration number 5 because parameter estimates changed by less than 0.001
1047 Table: Model Summary
1048 Step 1,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
1051 Table: Categorical Variables' Codings
1052 ,,,Parameter coding,
1058 Table: Classification Table
1060 ,,,honcomp,,"Percentage
1062 ,Observed,,.000,1.000,
1063 Step 1,honcomp,.000,132,15,89.796
1064 ,,1.000,26,27,50.943
1065 ,Overall Percentage,,,,79.500
1067 Table: Variables in the Equation
1068 ,,B,S.E.,Wald,df,Sig.,Exp(B)
1069 Step 1,read,.098,.025,15.199,1,.000,1.103
1070 ,science,.066,.027,5.867,1,.015,1.068
1071 ,ses,,,6.690,2,.035,
1072 ,ses(1),.058,.532,.012,1,.913,1.060
1073 ,ses(2),-1.013,.444,5.212,1,.022,.363
1074 ,Constant,-9.561,1.662,33.113,1,.000,.000
1080 dnl Check that it doesn't crash if a categorical variable
1081 dnl has only one distinct value
1082 AT_SETUP([LOGISTIC REGRESSION identical categories])
1083 AT_KEYWORDS([categorical categoricals])
1085 AT_DATA([crash.sps], [dnl
1086 data list notable list /y x1 x2*.
1092 logistic regression y with x1 x2
1096 AT_CHECK([pspp -O format=csv crash.sps], [1], [ignore])
1101 dnl Test that missing values on the categorical predictors are treated
1103 AT_SETUP([LOGISTIC REGRESSION missing categoricals])
1104 AT_KEYWORDS([categorical categoricals])
1106 AT_DATA([data.txt], [dnl
1209 AT_DATA([miss.sps], [dnl
1210 data list notable file='data.txt' list /y x1 cat0*.
1212 logistic regression y with x1 cat0
1213 /categorical = cat0.
1216 AT_CHECK([pspp -O format=csv miss.sps > file1], [0], [ignore])
1218 dnl Append a case with a missing categorical.
1219 AT_CHECK([echo '1 34 .' >> data.txt], [0], [ignore])
1221 AT_CHECK([pspp -O format=csv miss.sps > file2], [0], [ignore])
1223 AT_CHECK([diff file1 file2], [1], [dnl
1225 < Included in Analysis,100,100.00
1226 < Missing Cases,0,.00
1229 > Included in Analysis,100,99.01
1230 > Missing Cases,1,.99
1237 dnl Check that the confidence intervals are properly reported.
1238 dnl Use an example with categoricals, because that was buggy at
1239 dnl one point. The data in this example comes from:
1240 dnl http://people.ysu.edu/~gchang/SPSSE/SPSS_lab2Regression.pdf
1241 AT_SETUP([LOGISTIC REGRESSION confidence interval])
1242 AT_KEYWORDS([categorical categoricals])
1244 AT_DATA([ci.sps], [dnl
1246 data list notable list /disease age sciostat sector savings *.
1447 disease WITH age sciostat sector savings
1448 /categorical = sciostat sector
1452 AT_CHECK([pspp -O format=csv ci.sps], [0], [dnl
1453 Table: Dependent Variable Encoding
1454 Original Value,Internal Value
1458 Table: Case Processing Summary
1459 Unweighted Cases,N,Percent
1460 Included in Analysis,196,100.000
1461 Missing Cases,0,.000
1464 note: Estimation terminated at iteration number 4 because parameter estimates changed by less than 0.001
1466 Table: Model Summary
1467 Step 1,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
1470 Table: Categorical Variables' Codings
1471 ,,,Parameter coding,
1473 sciostat,1.000,77,1,0
1479 Table: Classification Table
1481 ,,,disease,,"Percentage
1483 ,Observed,,.000,1.000,
1484 Step 1,disease,.000,131,8,94.245
1485 ,,1.000,41,16,28.070
1486 ,Overall Percentage,,,,75.000
1488 Table: Variables in the Equation
1489 ,,,,,,,,95% CI for Exp(B),
1490 ,,B,S.E.,Wald,df,Sig.,Exp(B),Lower,Upper
1491 Step 1,age,.027,.009,8.647,1,.003,1.027,1.009,1.045
1492 ,savings,.061,.386,.025,1,.874,1.063,.499,2.264
1493 ,sciostat,,,.440,2,.803,,,
1494 ,sciostat(1),-.278,.434,.409,1,.522,.757,.323,1.775
1495 ,sciostat(2),-.219,.459,.227,1,.634,.803,.327,1.976
1496 ,sector,,,11.974,1,.001,,,
1497 ,sector(1),-1.235,.357,11.974,1,.001,.291,.145,.586
1498 ,Constant,-.814,.452,3.246,1,.072,.443,,