1 dnl PSPP - a program for statistical analysis.
2 dnl Copyright (C) 2017 Free Software Foundation, Inc.
4 dnl This program is free software: you can redistribute it and/or modify
5 dnl it under the terms of the GNU General Public License as published by
6 dnl the Free Software Foundation, either version 3 of the License, or
7 dnl (at your option) any later version.
9 dnl This program is distributed in the hope that it will be useful,
10 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
11 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 dnl GNU General Public License for more details.
14 dnl You should have received a copy of the GNU General Public License
15 dnl along with this program. If not, see <http://www.gnu.org/licenses/>.
17 AT_BANNER([LOGISTIC REGRESSION])
19 dnl These examples are adapted from
20 dnl http://www.uvm.edu/~dhowell/gradstat/psych341/lectures/Logistic%20Regression/LogisticReg1.html
24 m4_define([LOGIT_TEST_DATA],
25 [AT_DATA([lr-data.txt], dnl
26 105.00 1.00 33.00 3.00 2.00 .35 17.00 20.00 .50110 -2.00440 1
27 106.00 1.00 50.00 2.00 3.00 .38 7.00 15.00 .20168 -1.25264 1
28 107.00 1.00 91.00 3.00 2.00 .28 15.00 7.00 .00897 -1.00905 1
29 108.00 1.00 90.00 3.00 2.00 .20 2.00 2.00 .00972 -1.00982 1
30 109.00 1.00 70.00 3.00 3.00 .38 23.00 27.00 .04745 -1.04981 1
31 111.00 2.00 31.00 2.00 2.00 .00 19.00 10.00 .54159 1.84640 1
32 112.00 1.00 91.00 2.00 3.00 .18 6.00 16.00 .00897 -1.00905 1
33 113.00 1.00 81.00 3.00 2.00 .00 3.00 9.00 .01998 -1.02039 1
34 114.00 2.00 15.00 1.00 2.00 .13 19.00 13.00 .81241 1.23090 1
35 116.00 2.00 1.00 1.00 2.00 .88 15.00 7.00 .93102 1.07410 1
36 117.00 1.00 93.00 3.00 2.00 .18 9.00 15.00 .00764 -1.00770 1
37 118.00 2.00 14.00 1.00 3.00 .15 23.00 18.00 .82447 1.21289 1
38 120.00 1.00 91.00 2.00 2.00 .43 17.00 14.00 .00897 -1.00905 1
39 121.00 1.00 55.00 3.00 2.00 .69 20.00 14.00 .14409 -1.16834 1
40 122.00 1.00 70.00 2.00 3.00 .03 .00 6.00 .04745 -1.04981 1
41 123.00 1.00 25.00 2.00 2.00 .45 4.00 10.00 .65789 -2.92301 1
42 125.00 1.00 91.00 2.00 2.00 .13 .00 3.00 .00897 -1.00905 1
43 126.00 1.00 91.00 3.00 3.00 .23 4.00 6.00 .00897 -1.00905 1
44 127.00 1.00 91.00 3.00 2.00 .00 8.00 8.00 .00897 -1.00905 1
45 128.00 2.00 13.00 2.00 2.00 .65 16.00 14.00 .83592 1.19629 1
46 129.00 1.00 50.00 2.00 2.00 .25 20.00 23.00 .20168 -1.25264 1
47 135.00 1.00 90.00 3.00 3.00 .03 5.00 12.00 .00972 -1.00982 1
48 138.00 1.00 70.00 3.00 3.00 .10 1.00 6.00 .04745 -1.04981 1
49 139.00 2.00 19.00 3.00 3.00 .10 11.00 12.00 .75787 1.31949 1
50 149.00 2.00 50.00 3.00 2.00 .03 .00 .00 .20168 4.95826 1
51 204.00 1.00 50.00 3.00 1.00 .13 .00 1.00 .20168 -1.25264 1
52 205.00 1.00 91.00 3.00 3.00 .72 16.00 18.00 .00897 -1.00905 1
53 206.00 2.00 24.00 1.00 1.00 .10 5.00 21.00 .67592 1.47947 1
54 207.00 1.00 80.00 3.00 3.00 .13 6.00 7.00 .02164 -1.02212 1
55 208.00 1.00 87.00 2.00 2.00 .18 9.00 20.00 .01237 -1.01253 1
56 209.00 1.00 70.00 2.00 2.00 .53 15.00 12.00 .04745 -1.04981 1
57 211.00 1.00 55.00 2.00 1.00 .33 8.00 5.00 .14409 -1.16834 1
58 212.00 1.00 56.00 3.00 1.00 .30 6.00 20.00 .13436 -1.15522 1
59 214.00 1.00 54.00 2.00 2.00 .15 .00 16.00 .15439 -1.18258 1
60 215.00 1.00 71.00 3.00 3.00 .35 12.00 12.00 .04391 -1.04592 1
61 217.00 2.00 36.00 1.00 1.00 .10 12.00 8.00 .44049 2.27020 1
62 218.00 1.00 91.00 2.00 2.00 .05 11.00 25.00 .00897 -1.00905 1
63 219.00 1.00 91.00 2.00 2.00 1.23 11.00 24.00 .00897 -1.00905 1
64 220.00 1.00 91.00 2.00 3.00 .08 8.00 11.00 .00897 -1.00905 1
65 221.00 1.00 91.00 2.00 2.00 .33 5.00 11.00 .00897 -1.00905 1
66 222.00 2.00 36.00 2.00 1.00 .18 5.00 3.00 .44049 2.27020 1
67 223.00 1.00 70.00 2.00 3.00 .18 14.00 3.00 .04745 -1.04981 1
68 224.00 1.00 91.00 2.00 2.00 .43 2.00 10.00 .00897 -1.00905 1
69 225.00 1.00 55.00 2.00 1.00 .18 6.00 11.00 .14409 -1.16834 1
70 229.00 2.00 75.00 2.00 2.00 .40 30.00 25.00 .03212 31.12941 1
71 232.00 1.00 91.00 3.00 2.00 .15 6.00 3.00 .00897 -1.00905 1
72 233.00 1.00 70.00 2.00 1.00 .00 11.00 8.00 .04745 -1.04981 1
73 234.00 1.00 54.00 3.00 2.00 .10 .00 .00 .15439 -1.18258 1
74 237.00 1.00 70.00 3.00 2.00 .18 5.00 25.00 .04745 -1.04981 1
75 241.00 1.00 19.00 2.00 3.00 .33 13.00 9.00 .75787 -4.12995 1
76 304.00 2.00 18.00 2.00 2.00 .26 25.00 6.00 .77245 1.29458 1
77 305.00 1.00 88.00 3.00 2.00 1.35 17.00 29.00 .01142 -1.01155 1
78 306.00 1.00 70.00 2.00 3.00 .63 14.00 33.00 .04745 -1.04981 1
79 307.00 1.00 85.00 2.00 2.00 2.65 18.00 14.00 .01452 -1.01474 1
80 308.00 1.00 13.00 2.00 2.00 .23 5.00 5.00 .83592 -6.09442 1
81 309.00 2.00 13.00 2.00 2.00 .23 7.00 17.00 .83592 1.19629 1
82 311.00 2.00 1.00 2.00 2.00 .50 20.00 14.00 .93102 1.07410 1
83 315.00 1.00 19.00 2.00 3.00 .18 1.00 11.00 .75787 -4.12995 1
84 316.00 1.00 88.00 2.00 2.00 .38 12.00 11.00 .01142 -1.01155 2
85 318.00 1.00 88.00 3.00 2.00 .03 5.00 5.00 .01142 -1.01155 3
86 319.00 2.00 18.00 2.00 3.00 .30 15.00 16.00 .77245 1.29458 1
87 321.00 2.00 15.00 2.00 2.00 .63 15.00 18.00 .81241 1.23090 1
88 322.00 1.00 88.00 3.00 2.00 .40 18.00 15.00 .01142 -1.01155 1
89 325.00 2.00 18.00 2.00 3.00 1.00 28.00 18.00 .77245 1.29458 1
90 329.00 1.00 88.00 3.00 2.00 .03 7.00 11.00 .01142 -1.01155 4
91 332.00 2.00 2.00 2.00 2.00 .05 8.00 9.00 .92562 1.08036 1
94 dnl Note: In the above data cases 305, 316 318 and 329 have identical values
95 dnl of the 2nd and 3rd variables. We use this for weight testing.
97 AT_SETUP([LOGISTIC REGRESSION basic test])
101 AT_DATA([lr-data.sps], [dnl
104 data list notable file='lr-data.txt'
105 list /id outcome survrate prognos amttreat gsi avoid intrus pre_1 lre_1 w *.
108 variables = outcome with survrate
112 AT_CHECK([pspp -O format=csv lr-data.sps], [0],
114 Table: Dependent Variable Encoding
115 Original Value,Internal Value
119 Table: Case Processing Summary
120 Unweighted Cases,N,Percent
121 Included in Analysis,66,100.000
125 note: Estimation terminated at iteration number 6 because parameter estimates changed by less than 0.001
128 Step 1,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
131 Table: Classification Table
133 ,,,outcome,,"Percentage
135 ,Observed,,1.000,2.000,
136 Step 1,outcome,1.000,43,5,89.583
138 ,Overall Percentage,,,,86.364
140 Table: Variables in the Equation
141 ,,B,S.E.,Wald,df,Sig.,Exp(B)
142 Step 1,survrate,-.081,.019,17.756,1,.000,.922
143 ,Constant,2.684,.811,10.941,1,.001,14.639
149 AT_SETUP([LOGISTIC REGRESSION missing values])
153 AT_DATA([lr-data.sps], [dnl
156 data list notable file='lr-data.txt'
157 list /id outcome survrate prognos amttreat gsi avoid intrus pre_1 lre_1 w *.
159 missing values survrate (999) avoid (44444) outcome (99).
162 variables = outcome with survrate avoid
166 AT_CHECK([pspp -O format=csv lr-data.sps > run0], [0], [ignore])
168 dnl Append some cases with missing values into the data.
169 cat >> lr-data.txt << HERE
170 105.00 1.00 999.00 3.00 2.00 .35 17.00 20.00 .50110 -2.00440 1
171 106.00 1.00 999.00 2.00 3.00 .38 7.00 15.00 .20168 -1.25264 1
172 107.00 1.00 5.00 3.00 2.00 .28 44444 34 .00897 -1.00905 1
173 108.00 99 5.00 3.00 2.00 .28 4 34 .00897 -1.00905 1
176 AT_CHECK([pspp -O format=csv lr-data.sps > run1], [0], [ignore])
178 dnl Only the summary information should be different
179 AT_CHECK([diff run0 run1], [1], [dnl
181 < Included in Analysis,66,100.000
182 < Missing Cases,0,.000
185 > Included in Analysis,66,94.286
186 > Missing Cases,4,5.714
194 dnl Check that a weighted dataset is interpreted correctly
195 dnl To do this, the same data set is used, one weighted, one not.
196 dnl The weighted dataset omits certain cases which are identical
197 AT_SETUP([LOGISTIC REGRESSION weights])
201 AT_DATA([lr-data-unweighted.sps], [dnl
204 data list notable file='lr-data.txt'
205 list /id outcome survrate prognos amttreat gsi avoid intrus pre_1 lre_1 w *.
208 variables = outcome with survrate
212 AT_DATA([lr-data-weighted.sps], [dnl
215 data list notable file='lr-data.txt'
216 list /id outcome survrate prognos amttreat gsi avoid intrus pre_1 lre_1 w *.
220 * Omit duplicate cases.
221 select if id <> 305 and id <> 316 and id <> 318.
224 variables = outcome with survrate
229 AT_CHECK([pspp -O format=csv lr-data-unweighted.sps > unweighted-result], [0], [ignore])
230 AT_CHECK([pspp -O format=csv lr-data-weighted.sps > weighted-result], [0], [ignore])
232 dnl The only difference should be the summary information, since
233 dnl this displays the unweighted totals.
234 AT_CHECK([diff unweighted-result weighted-result], [1], [dnl
236 < Included in Analysis,66,100.000
238 > Included in Analysis,63,100.000
244 < Step 1,outcome,1.000,43,5,89.583
245 < ,,2.000,4,14,77.778
247 > Step 1,outcome,1.000,43.000,5.000,89.583
248 > ,,2.000,4.000,14.000,77.778
255 dnl Check that the /NOCONST option works as intended.
256 dnl The results this produces are very similar to those
257 dnl at the example in http://www.ats.ucla.edu/stat/SPSS/faq/logregconst.htm
258 AT_SETUP([LOGISTIC REGRESSION without constant])
260 AT_DATA([non-const.sps], [dnl
265 compute female = (#i > 91).
271 compute constant = 1.
273 logistic regression female with constant /noconst.
276 AT_CHECK([pspp -O format=csv non-const.sps], [0],
278 Table: Dependent Variable Encoding
279 Original Value,Internal Value
283 Table: Case Processing Summary
284 Unweighted Cases,N,Percent
285 Included in Analysis,200,100.000
289 note: Estimation terminated at iteration number 2 because parameter estimates changed by less than 0.001
292 Step 1,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
295 Table: Classification Table
297 ,,,female,,"Percentage
300 Step 1,female,.00,0,91,.000
302 ,Overall Percentage,,,,54.500
304 Table: Variables in the Equation
305 ,,B,S.E.,Wald,df,Sig.,Exp(B)
306 Step 1,constant,.180,.142,1.616,1,.204,1.198
313 dnl Check that if somebody passes a dependent variable which is not dichtomous,
314 dnl then an error is raised.
315 AT_SETUP([LOGISTIC REGRESSION non-dichotomous dep var])
317 AT_DATA([non-dich.sps], [dnl
318 data list notable list /y x1 x2 x3 x4.
325 logistic regression y with x1 x2 x3 x4.
328 AT_CHECK([pspp -O format=csv non-dich.sps], [1],
330 error: Dependent variable's values are not dichotomous.
337 dnl An example to check the behaviour of LOGISTIC REGRESSION with a categorical
338 dnl variable. This examṕle was inspired from that at:
339 dnl http://www.ats.ucla.edu/stat/spss/dae/logit.htm
340 AT_SETUP([LOGISTIC REGRESSION with categorical])
342 AT_DATA([lr-cat.data], [dnl
745 AT_DATA([lr-cat.sps], [dnl
748 data list notable list file='lr-cat.data' /b1 b2 bcat y.
756 AT_CHECK([pspp -O format=csv lr-cat.sps], [0],
758 Table: Dependent Variable Encoding
759 Original Value,Internal Value
763 Table: Case Processing Summary
764 Unweighted Cases,N,Percent
765 Included in Analysis,400,100.000
769 note: Estimation terminated at iteration number 4 because parameter estimates changed by less than 0.001
772 Step 1,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
775 Table: Categorical Variables' Codings
776 ,,,Parameter coding,,
777 ,,Frequency,(1),(2),(3)
783 Table: Classification Table
787 ,Observed,,4.000,9.000,
788 Step 1,y,4.000,254,19,93.040
790 ,Overall Percentage,,,,71.000
792 Table: Variables in the Equation
793 ,,B,S.E.,Wald,df,Sig.,Exp(B)
794 Step 1,b1,.002,.001,4.284,1,.038,1.002
795 ,b2,.804,.332,5.872,1,.015,2.235
796 ,bcat,,,20.895,3,.000,
797 ,bcat(1),1.551,.418,13.788,1,.000,4.718
798 ,bcat(2),.876,.367,5.706,1,.017,2.401
799 ,bcat(3),.211,.393,.289,1,.591,1.235
800 ,Constant,-5.541,1.138,23.709,1,.000,.004
807 dnl This example is inspired by http://www.ats.ucla.edu/stat/spss/output/logistic.htm
808 AT_SETUP([LOGISTIC REGRESSION with cat var 2])
810 AT_DATA([lr-cat2.data], [dnl
811 60.00 1.00 8.00 50.00
813 57.00 1.00 7.00 53.00
821 68.00 1.00 9.00 69.00
825 57.00 1.00 7.00 61.00
826 55.00 1.00 8.00 50.00
829 50.00 1.00 9.00 66.00
833 47.00 1.00 7.00 34.00
845 68.00 1.00 9.00 69.00
847 63.00 1.00 9.00 61.00
848 65.00 1.00 9.00 61.00
849 63.00 1.00 9.00 53.00
853 52.00 1.00 7.00 56.00
855 47.00 1.00 7.00 53.00
857 50.00 1.00 8.00 55.00
868 68.00 1.00 9.00 55.00
869 47.00 1.00 8.00 50.00
877 55.00 1.00 9.00 49.00
878 68.00 1.00 8.00 50.00
879 52.00 1.00 9.00 63.00
882 66.00 1.00 9.00 61.00
883 65.00 1.00 7.00 58.00
885 68.00 1.00 7.00 59.00
886 60.00 1.00 9.00 61.00
888 57.00 1.00 7.00 54.00
897 63.00 1.00 7.00 63.00
899 57.00 1.00 8.00 63.00
906 65.00 1.00 9.00 63.00
911 63.00 1.00 9.00 55.00
920 47.00 1.00 9.00 69.00
925 50.00 1.00 7.00 63.00
928 73.00 1.00 9.00 61.00
933 57.00 1.00 8.00 55.00
934 53.00 1.00 8.00 57.00
938 57.00 1.00 8.00 58.00
948 73.00 1.00 8.00 69.00
949 71.00 1.00 9.00 58.00
951 63.00 1.00 7.00 54.00
957 65.00 1.00 8.00 55.00
958 76.00 1.00 9.00 67.00
959 71.00 1.00 8.00 66.00
961 47.00 1.00 9.00 63.00
964 54.00 1.00 9.00 55.00
965 55.00 1.00 8.00 58.00
967 55.00 1.00 9.00 63.00
976 65.00 1.00 9.00 66.00
980 63.00 1.00 8.00 72.00
986 73.00 1.00 9.00 58.00
988 63.00 1.00 9.00 69.00
990 65.00 1.00 9.00 66.00
991 73.00 1.00 8.00 63.00
1000 60.00 1.00 9.00 50.00
1001 50.00 .00 9.00 47.00
1002 73.00 1.00 9.00 55.00
1003 52.00 1.00 8.00 47.00
1004 55.00 .00 8.00 53.00
1005 47.00 .00 8.00 53.00
1006 50.00 .00 8.00 61.00
1007 61.00 .00 7.00 44.00
1008 52.00 .00 9.00 53.00
1009 47.00 .00 7.00 40.00
1010 47.00 .00 7.00 50.00
1013 AT_DATA([stringcat.sps], [dnl
1015 data list notable file='lr-cat2.data' list /read honcomp wiz science *.
1018 recode wiz (7 = "a") (8 = "b") (9 = "c") into ses.
1020 logistic regression honcomp with read science ses
1025 AT_CHECK([pspp -O format=csv stringcat.sps], [0],
1027 Table: Dependent Variable Encoding
1028 Original Value,Internal Value
1032 Table: Case Processing Summary
1033 Unweighted Cases,N,Percent
1034 Included in Analysis,200,100.000
1035 Missing Cases,0,.000
1038 note: Estimation terminated at iteration number 5 because parameter estimates changed by less than 0.001
1040 Table: Model Summary
1041 Step 1,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
1044 Table: Categorical Variables' Codings
1045 ,,,Parameter coding,
1051 Table: Classification Table
1053 ,,,honcomp,,"Percentage
1055 ,Observed,,.000,1.000,
1056 Step 1,honcomp,.000,132,15,89.796
1057 ,,1.000,26,27,50.943
1058 ,Overall Percentage,,,,79.500
1060 Table: Variables in the Equation
1061 ,,B,S.E.,Wald,df,Sig.,Exp(B)
1062 Step 1,read,.098,.025,15.199,1,.000,1.103
1063 ,science,.066,.027,5.867,1,.015,1.068
1064 ,ses,,,6.690,2,.035,
1065 ,ses(1),.058,.532,.012,1,.913,1.060
1066 ,ses(2),-1.013,.444,5.212,1,.022,.363
1067 ,Constant,-9.561,1.662,33.113,1,.000,.000
1073 dnl Check that it doesn't crash if a categorical variable
1074 dnl has only one distinct value
1075 AT_SETUP([LOGISTIC REGRESSION identical categories])
1077 AT_DATA([crash.sps], [dnl
1078 data list notable list /y x1 x2*.
1084 logistic regression y with x1 x2
1088 AT_CHECK([pspp -O format=csv crash.sps], [1], [ignore])
1093 dnl Test that missing values on the categorical predictors are treated
1095 AT_SETUP([LOGISTIC REGRESSION missing categoricals])
1097 AT_DATA([data.txt], [dnl
1200 AT_DATA([miss.sps], [dnl
1201 data list notable file='data.txt' list /y x1 cat0*.
1203 logistic regression y with x1 cat0
1204 /categorical = cat0.
1207 AT_CHECK([pspp -O format=csv miss.sps > file1], [0], [ignore])
1209 dnl Append a case with a missing categorical.
1210 AT_CHECK([echo '1 34 .' >> data.txt], [0], [ignore])
1212 AT_CHECK([pspp -O format=csv miss.sps > file2], [0], [ignore])
1214 AT_CHECK([diff file1 file2], [1], [dnl
1216 < Included in Analysis,100,100.00
1217 < Missing Cases,0,.00
1220 > Included in Analysis,100,99.01
1221 > Missing Cases,1,.99
1228 dnl Check that the confidence intervals are properly reported.
1229 dnl Use an example with categoricals, because that was buggy at
1230 dnl one point. The data in this example comes from:
1231 dnl http://people.ysu.edu/~gchang/SPSSE/SPSS_lab2Regression.pdf
1232 AT_SETUP([LOGISTIC REGRESSION confidence interval])
1234 AT_DATA([ci.sps], [dnl
1236 data list notable list /disease age sciostat sector savings *.
1437 disease WITH age sciostat sector savings
1438 /categorical = sciostat sector
1442 AT_CHECK([pspp -O format=csv ci.sps], [0], [dnl
1443 Table: Dependent Variable Encoding
1444 Original Value,Internal Value
1448 Table: Case Processing Summary
1449 Unweighted Cases,N,Percent
1450 Included in Analysis,196,100.000
1451 Missing Cases,0,.000
1454 note: Estimation terminated at iteration number 4 because parameter estimates changed by less than 0.001
1456 Table: Model Summary
1457 Step 1,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
1460 Table: Categorical Variables' Codings
1461 ,,,Parameter coding,
1463 sciostat,1.000,77,1,0
1469 Table: Classification Table
1471 ,,,disease,,"Percentage
1473 ,Observed,,.000,1.000,
1474 Step 1,disease,.000,131,8,94.245
1475 ,,1.000,41,16,28.070
1476 ,Overall Percentage,,,,75.000
1478 Table: Variables in the Equation
1479 ,,,,,,,,95% CI for Exp(B),
1480 ,,B,S.E.,Wald,df,Sig.,Exp(B),Lower,Upper
1481 Step 1,age,.027,.009,8.647,1,.003,1.027,1.009,1.045
1482 ,savings,.061,.386,.025,1,.874,1.063,.499,2.264
1483 ,sciostat,,,.440,2,.803,,,
1484 ,sciostat(1),-.278,.434,.409,1,.522,.757,.323,1.775
1485 ,sciostat(2),-.219,.459,.227,1,.634,.803,.327,1.976
1486 ,sector,,,11.974,1,.001,,,
1487 ,sector(1),-1.235,.357,11.974,1,.001,.291,.145,.586
1488 ,Constant,-.814,.452,3.246,1,.072,.443,,