1 dnl PSPP - a program for statistical analysis.
2 dnl Copyright (C) 2017 Free Software Foundation, Inc.
4 dnl This program is free software: you can redistribute it and/or modify
5 dnl it under the terms of the GNU General Public License as published by
6 dnl the Free Software Foundation, either version 3 of the License, or
7 dnl (at your option) any later version.
9 dnl This program is distributed in the hope that it will be useful,
10 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
11 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 dnl GNU General Public License for more details.
14 dnl You should have received a copy of the GNU General Public License
15 dnl along with this program. If not, see <http://www.gnu.org/licenses/>.
16 dnl AT_BANNER([LOGISTIC REGRESSION])
18 dnl These examples are adapted from
19 dnl http://www.uvm.edu/~dhowell/gradstat/psych341/lectures/Logistic%20Regression/LogisticReg1.html
23 m4_define([LOGIT_TEST_DATA],
24 [AT_DATA([lr-data.txt], dnl
25 105.00 1.00 33.00 3.00 2.00 .35 17.00 20.00 .50110 -2.00440 1
26 106.00 1.00 50.00 2.00 3.00 .38 7.00 15.00 .20168 -1.25264 1
27 107.00 1.00 91.00 3.00 2.00 .28 15.00 7.00 .00897 -1.00905 1
28 108.00 1.00 90.00 3.00 2.00 .20 2.00 2.00 .00972 -1.00982 1
29 109.00 1.00 70.00 3.00 3.00 .38 23.00 27.00 .04745 -1.04981 1
30 111.00 2.00 31.00 2.00 2.00 .00 19.00 10.00 .54159 1.84640 1
31 112.00 1.00 91.00 2.00 3.00 .18 6.00 16.00 .00897 -1.00905 1
32 113.00 1.00 81.00 3.00 2.00 .00 3.00 9.00 .01998 -1.02039 1
33 114.00 2.00 15.00 1.00 2.00 .13 19.00 13.00 .81241 1.23090 1
34 116.00 2.00 1.00 1.00 2.00 .88 15.00 7.00 .93102 1.07410 1
35 117.00 1.00 93.00 3.00 2.00 .18 9.00 15.00 .00764 -1.00770 1
36 118.00 2.00 14.00 1.00 3.00 .15 23.00 18.00 .82447 1.21289 1
37 120.00 1.00 91.00 2.00 2.00 .43 17.00 14.00 .00897 -1.00905 1
38 121.00 1.00 55.00 3.00 2.00 .69 20.00 14.00 .14409 -1.16834 1
39 122.00 1.00 70.00 2.00 3.00 .03 .00 6.00 .04745 -1.04981 1
40 123.00 1.00 25.00 2.00 2.00 .45 4.00 10.00 .65789 -2.92301 1
41 125.00 1.00 91.00 2.00 2.00 .13 .00 3.00 .00897 -1.00905 1
42 126.00 1.00 91.00 3.00 3.00 .23 4.00 6.00 .00897 -1.00905 1
43 127.00 1.00 91.00 3.00 2.00 .00 8.00 8.00 .00897 -1.00905 1
44 128.00 2.00 13.00 2.00 2.00 .65 16.00 14.00 .83592 1.19629 1
45 129.00 1.00 50.00 2.00 2.00 .25 20.00 23.00 .20168 -1.25264 1
46 135.00 1.00 90.00 3.00 3.00 .03 5.00 12.00 .00972 -1.00982 1
47 138.00 1.00 70.00 3.00 3.00 .10 1.00 6.00 .04745 -1.04981 1
48 139.00 2.00 19.00 3.00 3.00 .10 11.00 12.00 .75787 1.31949 1
49 149.00 2.00 50.00 3.00 2.00 .03 .00 .00 .20168 4.95826 1
50 204.00 1.00 50.00 3.00 1.00 .13 .00 1.00 .20168 -1.25264 1
51 205.00 1.00 91.00 3.00 3.00 .72 16.00 18.00 .00897 -1.00905 1
52 206.00 2.00 24.00 1.00 1.00 .10 5.00 21.00 .67592 1.47947 1
53 207.00 1.00 80.00 3.00 3.00 .13 6.00 7.00 .02164 -1.02212 1
54 208.00 1.00 87.00 2.00 2.00 .18 9.00 20.00 .01237 -1.01253 1
55 209.00 1.00 70.00 2.00 2.00 .53 15.00 12.00 .04745 -1.04981 1
56 211.00 1.00 55.00 2.00 1.00 .33 8.00 5.00 .14409 -1.16834 1
57 212.00 1.00 56.00 3.00 1.00 .30 6.00 20.00 .13436 -1.15522 1
58 214.00 1.00 54.00 2.00 2.00 .15 .00 16.00 .15439 -1.18258 1
59 215.00 1.00 71.00 3.00 3.00 .35 12.00 12.00 .04391 -1.04592 1
60 217.00 2.00 36.00 1.00 1.00 .10 12.00 8.00 .44049 2.27020 1
61 218.00 1.00 91.00 2.00 2.00 .05 11.00 25.00 .00897 -1.00905 1
62 219.00 1.00 91.00 2.00 2.00 1.23 11.00 24.00 .00897 -1.00905 1
63 220.00 1.00 91.00 2.00 3.00 .08 8.00 11.00 .00897 -1.00905 1
64 221.00 1.00 91.00 2.00 2.00 .33 5.00 11.00 .00897 -1.00905 1
65 222.00 2.00 36.00 2.00 1.00 .18 5.00 3.00 .44049 2.27020 1
66 223.00 1.00 70.00 2.00 3.00 .18 14.00 3.00 .04745 -1.04981 1
67 224.00 1.00 91.00 2.00 2.00 .43 2.00 10.00 .00897 -1.00905 1
68 225.00 1.00 55.00 2.00 1.00 .18 6.00 11.00 .14409 -1.16834 1
69 229.00 2.00 75.00 2.00 2.00 .40 30.00 25.00 .03212 31.12941 1
70 232.00 1.00 91.00 3.00 2.00 .15 6.00 3.00 .00897 -1.00905 1
71 233.00 1.00 70.00 2.00 1.00 .00 11.00 8.00 .04745 -1.04981 1
72 234.00 1.00 54.00 3.00 2.00 .10 .00 .00 .15439 -1.18258 1
73 237.00 1.00 70.00 3.00 2.00 .18 5.00 25.00 .04745 -1.04981 1
74 241.00 1.00 19.00 2.00 3.00 .33 13.00 9.00 .75787 -4.12995 1
75 304.00 2.00 18.00 2.00 2.00 .26 25.00 6.00 .77245 1.29458 1
76 305.00 1.00 88.00 3.00 2.00 1.35 17.00 29.00 .01142 -1.01155 1
77 306.00 1.00 70.00 2.00 3.00 .63 14.00 33.00 .04745 -1.04981 1
78 307.00 1.00 85.00 2.00 2.00 2.65 18.00 14.00 .01452 -1.01474 1
79 308.00 1.00 13.00 2.00 2.00 .23 5.00 5.00 .83592 -6.09442 1
80 309.00 2.00 13.00 2.00 2.00 .23 7.00 17.00 .83592 1.19629 1
81 311.00 2.00 1.00 2.00 2.00 .50 20.00 14.00 .93102 1.07410 1
82 315.00 1.00 19.00 2.00 3.00 .18 1.00 11.00 .75787 -4.12995 1
83 316.00 1.00 88.00 2.00 2.00 .38 12.00 11.00 .01142 -1.01155 2
84 318.00 1.00 88.00 3.00 2.00 .03 5.00 5.00 .01142 -1.01155 3
85 319.00 2.00 18.00 2.00 3.00 .30 15.00 16.00 .77245 1.29458 1
86 321.00 2.00 15.00 2.00 2.00 .63 15.00 18.00 .81241 1.23090 1
87 322.00 1.00 88.00 3.00 2.00 .40 18.00 15.00 .01142 -1.01155 1
88 325.00 2.00 18.00 2.00 3.00 1.00 28.00 18.00 .77245 1.29458 1
89 329.00 1.00 88.00 3.00 2.00 .03 7.00 11.00 .01142 -1.01155 4
90 332.00 2.00 2.00 2.00 2.00 .05 8.00 9.00 .92562 1.08036 1
93 dnl Note: In the above data, cases 305, 316, 318 and 329 have identical values
94 dnl of the 2nd and 3rd variables. We use this for weight testing.
96 AT_SETUP([LOGISTIC REGRESSION basic test])
100 AT_DATA([lr-data.sps], [dnl
103 data list notable file='lr-data.txt'
104 list /id outcome survrate prognos amttreat gsi avoid intrus pre_1 lre_1 w *.
107 variables = outcome with survrate
111 AT_CHECK([pspp -O format=csv lr-data.sps], [0],
113 Table: Dependent Variable Encoding
114 Original Value,Internal Value
118 Table: Case Processing Summary
119 Unweighted Cases,N,Percent
120 Included in Analysis,66,100.000
124 note: Estimation terminated at iteration number 6 because parameter estimates changed by less than 0.001
127 Step 1,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
130 Table: Classification Table
132 ,,,outcome,,"Percentage
134 ,Observed,,1.000,2.000,
135 Step 1,outcome,1.000,43,5,89.583
137 ,Overall Percentage,,,,86.364
139 Table: Variables in the Equation
140 ,,B,S.E.,Wald,df,Sig.,Exp(B)
141 Step 1,survrate,-.081,.019,17.756,1,.000,.922
142 ,Constant,2.684,.811,10.941,1,.001,14.639
148 AT_SETUP([LOGISTIC REGRESSION missing values])
152 AT_DATA([lr-data.sps], [dnl
155 data list notable file='lr-data.txt'
156 list /id outcome survrate prognos amttreat gsi avoid intrus pre_1 lre_1 w *.
158 missing values survrate (999) avoid (44444) outcome (99).
161 variables = outcome with survrate avoid
165 AT_CHECK([pspp -O format=csv lr-data.sps > run0], [0], [ignore])
167 dnl Append some cases with missing values into the data.
168 cat >> lr-data.txt << HERE
169 105.00 1.00 999.00 3.00 2.00 .35 17.00 20.00 .50110 -2.00440 1
170 106.00 1.00 999.00 2.00 3.00 .38 7.00 15.00 .20168 -1.25264 1
171 107.00 1.00 5.00 3.00 2.00 .28 44444 34 .00897 -1.00905 1
172 108.00 99 5.00 3.00 2.00 .28 4 34 .00897 -1.00905 1
175 AT_CHECK([pspp -O format=csv lr-data.sps > run1], [0], [ignore])
177 dnl Only the summary information should be different
178 AT_CHECK([diff run0 run1], [1], [dnl
180 < Included in Analysis,66,100.000
181 < Missing Cases,0,.000
184 > Included in Analysis,66,94.286
185 > Missing Cases,4,5.714
193 dnl Check that a weighted dataset is interpreted correctly.
194 dnl To do this, the same data set is used twice: once weighted, once not.
195 dnl The weighted dataset omits certain cases which are identical.
196 AT_SETUP([LOGISTIC REGRESSION weights])
200 AT_DATA([lr-data-unweighted.sps], [dnl
203 data list notable file='lr-data.txt'
204 list /id outcome survrate prognos amttreat gsi avoid intrus pre_1 lre_1 w *.
207 variables = outcome with survrate
211 AT_DATA([lr-data-weighted.sps], [dnl
214 data list notable file='lr-data.txt'
215 list /id outcome survrate prognos amttreat gsi avoid intrus pre_1 lre_1 w *.
219 * Omit duplicate cases.
220 select if id <> 305 and id <> 316 and id <> 318.
223 variables = outcome with survrate
228 AT_CHECK([pspp -O format=csv lr-data-unweighted.sps > unweighted-result], [0], [ignore])
229 AT_CHECK([pspp -O format=csv lr-data-weighted.sps > weighted-result], [0], [ignore])
231 dnl The only difference should be the summary information, since
232 dnl this displays the unweighted totals.
233 AT_CHECK([diff unweighted-result weighted-result], [1], [dnl
235 < Included in Analysis,66,100.000
237 > Included in Analysis,63,100.000
243 < Step 1,outcome,1.000,43,5,89.583
244 < ,,2.000,4,14,77.778
246 > Step 1,outcome,1.000,43.000,5.000,89.583
247 > ,,2.000,4.000,14.000,77.778
254 dnl Check that the /NOCONST option works as intended.
255 dnl The results this produces are very similar to those
256 dnl in the example at http://www.ats.ucla.edu/stat/SPSS/faq/logregconst.htm
257 AT_SETUP([LOGISTIC REGRESSION without constant])
259 AT_DATA([non-const.sps], [dnl
264 compute female = (#i > 91).
270 compute constant = 1.
272 logistic regression female with constant /noconst.
275 AT_CHECK([pspp -O format=csv non-const.sps], [0],
277 Table: Dependent Variable Encoding
278 Original Value,Internal Value
282 Table: Case Processing Summary
283 Unweighted Cases,N,Percent
284 Included in Analysis,200,100.000
288 note: Estimation terminated at iteration number 2 because parameter estimates changed by less than 0.001
291 Step 1,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
294 Table: Classification Table
296 ,,,female,,"Percentage
299 Step 1,female,.00,0,91,.000
301 ,Overall Percentage,,,,54.500
303 Table: Variables in the Equation
304 ,,B,S.E.,Wald,df,Sig.,Exp(B)
305 Step 1,constant,.180,.142,1.616,1,.204,1.198
312 dnl Check that if somebody passes a dependent variable which is not dichotomous,
313 dnl then an error is raised.
314 AT_SETUP([LOGISTIC REGRESSION non-dichotomous dep var])
316 AT_DATA([non-dich.sps], [dnl
317 data list notable list /y x1 x2 x3 x4.
324 logistic regression y with x1 x2 x3 x4.
327 AT_CHECK([pspp -O format=csv non-dich.sps], [1],
329 error: Dependent variable's values are not dichotomous.
336 dnl An example to check the behaviour of LOGISTIC REGRESSION with a categorical
337 dnl variable. This example was inspired by the one at:
338 dnl http://www.ats.ucla.edu/stat/spss/dae/logit.htm
339 AT_SETUP([LOGISTIC REGRESSION with categorical])
341 AT_DATA([lr-cat.data], [dnl
744 AT_DATA([lr-cat.sps], [dnl
747 data list notable list file='lr-cat.data' /b1 b2 bcat y.
755 AT_CHECK([pspp -O format=csv lr-cat.sps], [0],
757 Table: Dependent Variable Encoding
758 Original Value,Internal Value
762 Table: Case Processing Summary
763 Unweighted Cases,N,Percent
764 Included in Analysis,400,100.000
768 note: Estimation terminated at iteration number 4 because parameter estimates changed by less than 0.001
771 Step 1,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
774 Table: Categorical Variables' Codings
775 ,,,Parameter coding,,
776 ,,Frequency,(1),(2),(3)
782 Table: Classification Table
786 ,Observed,,4.000,9.000,
787 Step 1,y,4.000,254,19,93.040
789 ,Overall Percentage,,,,71.000
791 Table: Variables in the Equation
792 ,,B,S.E.,Wald,df,Sig.,Exp(B)
793 Step 1,b1,.002,.001,4.284,1,.038,1.002
794 ,b2,.804,.332,5.872,1,.015,2.235
795 ,bcat,,,20.895,3,.000,
796 ,bcat(1),1.551,.418,13.788,1,.000,4.718
797 ,bcat(2),.876,.367,5.706,1,.017,2.401
798 ,bcat(3),.211,.393,.289,1,.591,1.235
799 ,Constant,-5.541,1.138,23.709,1,.000,.004
806 dnl This example is inspired by http://www.ats.ucla.edu/stat/spss/output/logistic.htm
807 AT_SETUP([LOGISTIC REGRESSION with cat var 2])
809 AT_DATA([lr-cat2.data], [dnl
810 60.00 1.00 8.00 50.00
812 57.00 1.00 7.00 53.00
820 68.00 1.00 9.00 69.00
824 57.00 1.00 7.00 61.00
825 55.00 1.00 8.00 50.00
828 50.00 1.00 9.00 66.00
832 47.00 1.00 7.00 34.00
844 68.00 1.00 9.00 69.00
846 63.00 1.00 9.00 61.00
847 65.00 1.00 9.00 61.00
848 63.00 1.00 9.00 53.00
852 52.00 1.00 7.00 56.00
854 47.00 1.00 7.00 53.00
856 50.00 1.00 8.00 55.00
867 68.00 1.00 9.00 55.00
868 47.00 1.00 8.00 50.00
876 55.00 1.00 9.00 49.00
877 68.00 1.00 8.00 50.00
878 52.00 1.00 9.00 63.00
881 66.00 1.00 9.00 61.00
882 65.00 1.00 7.00 58.00
884 68.00 1.00 7.00 59.00
885 60.00 1.00 9.00 61.00
887 57.00 1.00 7.00 54.00
896 63.00 1.00 7.00 63.00
898 57.00 1.00 8.00 63.00
905 65.00 1.00 9.00 63.00
910 63.00 1.00 9.00 55.00
919 47.00 1.00 9.00 69.00
924 50.00 1.00 7.00 63.00
927 73.00 1.00 9.00 61.00
932 57.00 1.00 8.00 55.00
933 53.00 1.00 8.00 57.00
937 57.00 1.00 8.00 58.00
947 73.00 1.00 8.00 69.00
948 71.00 1.00 9.00 58.00
950 63.00 1.00 7.00 54.00
956 65.00 1.00 8.00 55.00
957 76.00 1.00 9.00 67.00
958 71.00 1.00 8.00 66.00
960 47.00 1.00 9.00 63.00
963 54.00 1.00 9.00 55.00
964 55.00 1.00 8.00 58.00
966 55.00 1.00 9.00 63.00
975 65.00 1.00 9.00 66.00
979 63.00 1.00 8.00 72.00
985 73.00 1.00 9.00 58.00
987 63.00 1.00 9.00 69.00
989 65.00 1.00 9.00 66.00
990 73.00 1.00 8.00 63.00
999 60.00 1.00 9.00 50.00
1000 50.00 .00 9.00 47.00
1001 73.00 1.00 9.00 55.00
1002 52.00 1.00 8.00 47.00
1003 55.00 .00 8.00 53.00
1004 47.00 .00 8.00 53.00
1005 50.00 .00 8.00 61.00
1006 61.00 .00 7.00 44.00
1007 52.00 .00 9.00 53.00
1008 47.00 .00 7.00 40.00
1009 47.00 .00 7.00 50.00
1012 AT_DATA([stringcat.sps], [dnl
1014 data list notable file='lr-cat2.data' list /read honcomp wiz science *.
1017 recode wiz (7 = "a") (8 = "b") (9 = "c") into ses.
1019 logistic regression honcomp with read science ses
1024 AT_CHECK([pspp -O format=csv stringcat.sps], [0],
1026 Table: Dependent Variable Encoding
1027 Original Value,Internal Value
1031 Table: Case Processing Summary
1032 Unweighted Cases,N,Percent
1033 Included in Analysis,200,100.000
1034 Missing Cases,0,.000
1037 note: Estimation terminated at iteration number 5 because parameter estimates changed by less than 0.001
1039 Table: Model Summary
1040 Step 1,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
1043 Table: Categorical Variables' Codings
1044 ,,,Parameter coding,
1050 Table: Classification Table
1052 ,,,honcomp,,"Percentage
1054 ,Observed,,.000,1.000,
1055 Step 1,honcomp,.000,132,15,89.796
1056 ,,1.000,26,27,50.943
1057 ,Overall Percentage,,,,79.500
1059 Table: Variables in the Equation
1060 ,,B,S.E.,Wald,df,Sig.,Exp(B)
1061 Step 1,read,.098,.025,15.199,1,.000,1.103
1062 ,science,.066,.027,5.867,1,.015,1.068
1063 ,ses,,,6.690,2,.035,
1064 ,ses(1),.058,.532,.012,1,.913,1.060
1065 ,ses(2),-1.013,.444,5.212,1,.022,.363
1066 ,Constant,-9.561,1.662,33.113,1,.000,.000
1072 dnl Check that it doesn't crash if a categorical variable
1073 dnl has only one distinct value.
1074 AT_SETUP([LOGISTIC REGRESSION identical categories])
1076 AT_DATA([crash.sps], [dnl
1077 data list notable list /y x1 x2*.
1083 logistic regression y with x1 x2
1087 AT_CHECK([pspp -O format=csv crash.sps], [1], [ignore])
1092 dnl Test that missing values on the categorical predictors are treated
1094 AT_SETUP([LOGISTIC REGRESSION missing categoricals])
1096 AT_DATA([data.txt], [dnl
1199 AT_DATA([miss.sps], [dnl
1200 data list notable file='data.txt' list /y x1 cat0*.
1202 logistic regression y with x1 cat0
1203 /categorical = cat0.
1206 AT_CHECK([pspp -O format=csv miss.sps > file1], [0], [ignore])
1208 dnl Append a case with a missing categorical.
1209 AT_CHECK([echo '1 34 .' >> data.txt], [0], [ignore])
1211 AT_CHECK([pspp -O format=csv miss.sps > file2], [0], [ignore])
1213 AT_CHECK([diff file1 file2], [1], [dnl
1215 < Included in Analysis,100,100.00
1216 < Missing Cases,0,.00
1219 > Included in Analysis,100,99.01
1220 > Missing Cases,1,.99
1227 dnl Check that the confidence intervals are properly reported.
1228 dnl Use an example with categoricals, because that was buggy at
1229 dnl one point. The data in this example comes from:
1230 dnl http://people.ysu.edu/~gchang/SPSSE/SPSS_lab2Regression.pdf
1231 AT_SETUP([LOGISTIC REGRESSION confidence interval])
1233 AT_DATA([ci.sps], [dnl
1235 data list notable list /disease age sciostat sector savings *.
1436 disease WITH age sciostat sector savings
1437 /categorical = sciostat sector
1441 AT_CHECK([pspp -O format=csv ci.sps], [0], [dnl
1442 Table: Dependent Variable Encoding
1443 Original Value,Internal Value
1447 Table: Case Processing Summary
1448 Unweighted Cases,N,Percent
1449 Included in Analysis,196,100.000
1450 Missing Cases,0,.000
1453 note: Estimation terminated at iteration number 4 because parameter estimates changed by less than 0.001
1455 Table: Model Summary
1456 Step 1,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
1459 Table: Categorical Variables' Codings
1460 ,,,Parameter coding,
1462 sciostat,1.000,77,1,0
1468 Table: Classification Table
1470 ,,,disease,,"Percentage
1472 ,Observed,,.000,1.000,
1473 Step 1,disease,.000,131,8,94.245
1474 ,,1.000,41,16,28.070
1475 ,Overall Percentage,,,,75.000
1477 Table: Variables in the Equation
1478 ,,,,,,,,95% CI for Exp(B),
1479 ,,B,S.E.,Wald,df,Sig.,Exp(B),Lower,Upper
1480 Step 1,age,.027,.009,8.647,1,.003,1.027,1.009,1.045
1481 ,savings,.061,.386,.025,1,.874,1.063,.499,2.264
1482 ,sciostat,,,.440,2,.803,,,
1483 ,sciostat(1),-.278,.434,.409,1,.522,.757,.323,1.775
1484 ,sciostat(2),-.219,.459,.227,1,.634,.803,.327,1.976
1485 ,sector,,,11.974,1,.001,,,
1486 ,sector(1),-1.235,.357,11.974,1,.001,.291,.145,.586
1487 ,Constant,-.814,.452,3.246,1,.072,.443,,