1 dnl PSPP - a program for statistical analysis.
2 dnl Copyright (C) 2017 Free Software Foundation, Inc.
4 dnl This program is free software: you can redistribute it and/or modify
5 dnl it under the terms of the GNU General Public License as published by
6 dnl the Free Software Foundation, either version 3 of the License, or
7 dnl (at your option) any later version.
9 dnl This program is distributed in the hope that it will be useful,
10 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
11 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 dnl GNU General Public License for more details.
14 dnl You should have received a copy of the GNU General Public License
15 dnl along with this program. If not, see <http://www.gnu.org/licenses/>.
17 AT_BANNER([LOGISTIC REGRESSION])
19 dnl These examples are adapted from
20 dnl http://www.uvm.edu/~dhowell/gradstat/psych341/lectures/Logistic%20Regression/LogisticReg1.html
24 m4_define([LOGIT_TEST_DATA],
25 [AT_DATA([lr-data.txt], dnl
26 105.00 1.00 33.00 3.00 2.00 .35 17.00 20.00 .50110 -2.00440 1
27 106.00 1.00 50.00 2.00 3.00 .38 7.00 15.00 .20168 -1.25264 1
28 107.00 1.00 91.00 3.00 2.00 .28 15.00 7.00 .00897 -1.00905 1
29 108.00 1.00 90.00 3.00 2.00 .20 2.00 2.00 .00972 -1.00982 1
30 109.00 1.00 70.00 3.00 3.00 .38 23.00 27.00 .04745 -1.04981 1
31 111.00 2.00 31.00 2.00 2.00 .00 19.00 10.00 .54159 1.84640 1
32 112.00 1.00 91.00 2.00 3.00 .18 6.00 16.00 .00897 -1.00905 1
33 113.00 1.00 81.00 3.00 2.00 .00 3.00 9.00 .01998 -1.02039 1
34 114.00 2.00 15.00 1.00 2.00 .13 19.00 13.00 .81241 1.23090 1
35 116.00 2.00 1.00 1.00 2.00 .88 15.00 7.00 .93102 1.07410 1
36 117.00 1.00 93.00 3.00 2.00 .18 9.00 15.00 .00764 -1.00770 1
37 118.00 2.00 14.00 1.00 3.00 .15 23.00 18.00 .82447 1.21289 1
38 120.00 1.00 91.00 2.00 2.00 .43 17.00 14.00 .00897 -1.00905 1
39 121.00 1.00 55.00 3.00 2.00 .69 20.00 14.00 .14409 -1.16834 1
40 122.00 1.00 70.00 2.00 3.00 .03 .00 6.00 .04745 -1.04981 1
41 123.00 1.00 25.00 2.00 2.00 .45 4.00 10.00 .65789 -2.92301 1
42 125.00 1.00 91.00 2.00 2.00 .13 .00 3.00 .00897 -1.00905 1
43 126.00 1.00 91.00 3.00 3.00 .23 4.00 6.00 .00897 -1.00905 1
44 127.00 1.00 91.00 3.00 2.00 .00 8.00 8.00 .00897 -1.00905 1
45 128.00 2.00 13.00 2.00 2.00 .65 16.00 14.00 .83592 1.19629 1
46 129.00 1.00 50.00 2.00 2.00 .25 20.00 23.00 .20168 -1.25264 1
47 135.00 1.00 90.00 3.00 3.00 .03 5.00 12.00 .00972 -1.00982 1
48 138.00 1.00 70.00 3.00 3.00 .10 1.00 6.00 .04745 -1.04981 1
49 139.00 2.00 19.00 3.00 3.00 .10 11.00 12.00 .75787 1.31949 1
50 149.00 2.00 50.00 3.00 2.00 .03 .00 .00 .20168 4.95826 1
51 204.00 1.00 50.00 3.00 1.00 .13 .00 1.00 .20168 -1.25264 1
52 205.00 1.00 91.00 3.00 3.00 .72 16.00 18.00 .00897 -1.00905 1
53 206.00 2.00 24.00 1.00 1.00 .10 5.00 21.00 .67592 1.47947 1
54 207.00 1.00 80.00 3.00 3.00 .13 6.00 7.00 .02164 -1.02212 1
55 208.00 1.00 87.00 2.00 2.00 .18 9.00 20.00 .01237 -1.01253 1
56 209.00 1.00 70.00 2.00 2.00 .53 15.00 12.00 .04745 -1.04981 1
57 211.00 1.00 55.00 2.00 1.00 .33 8.00 5.00 .14409 -1.16834 1
58 212.00 1.00 56.00 3.00 1.00 .30 6.00 20.00 .13436 -1.15522 1
59 214.00 1.00 54.00 2.00 2.00 .15 .00 16.00 .15439 -1.18258 1
60 215.00 1.00 71.00 3.00 3.00 .35 12.00 12.00 .04391 -1.04592 1
61 217.00 2.00 36.00 1.00 1.00 .10 12.00 8.00 .44049 2.27020 1
62 218.00 1.00 91.00 2.00 2.00 .05 11.00 25.00 .00897 -1.00905 1
63 219.00 1.00 91.00 2.00 2.00 1.23 11.00 24.00 .00897 -1.00905 1
64 220.00 1.00 91.00 2.00 3.00 .08 8.00 11.00 .00897 -1.00905 1
65 221.00 1.00 91.00 2.00 2.00 .33 5.00 11.00 .00897 -1.00905 1
66 222.00 2.00 36.00 2.00 1.00 .18 5.00 3.00 .44049 2.27020 1
67 223.00 1.00 70.00 2.00 3.00 .18 14.00 3.00 .04745 -1.04981 1
68 224.00 1.00 91.00 2.00 2.00 .43 2.00 10.00 .00897 -1.00905 1
69 225.00 1.00 55.00 2.00 1.00 .18 6.00 11.00 .14409 -1.16834 1
70 229.00 2.00 75.00 2.00 2.00 .40 30.00 25.00 .03212 31.12941 1
71 232.00 1.00 91.00 3.00 2.00 .15 6.00 3.00 .00897 -1.00905 1
72 233.00 1.00 70.00 2.00 1.00 .00 11.00 8.00 .04745 -1.04981 1
73 234.00 1.00 54.00 3.00 2.00 .10 .00 .00 .15439 -1.18258 1
74 237.00 1.00 70.00 3.00 2.00 .18 5.00 25.00 .04745 -1.04981 1
75 241.00 1.00 19.00 2.00 3.00 .33 13.00 9.00 .75787 -4.12995 1
76 304.00 2.00 18.00 2.00 2.00 .26 25.00 6.00 .77245 1.29458 1
77 305.00 1.00 88.00 3.00 2.00 1.35 17.00 29.00 .01142 -1.01155 1
78 306.00 1.00 70.00 2.00 3.00 .63 14.00 33.00 .04745 -1.04981 1
79 307.00 1.00 85.00 2.00 2.00 2.65 18.00 14.00 .01452 -1.01474 1
80 308.00 1.00 13.00 2.00 2.00 .23 5.00 5.00 .83592 -6.09442 1
81 309.00 2.00 13.00 2.00 2.00 .23 7.00 17.00 .83592 1.19629 1
82 311.00 2.00 1.00 2.00 2.00 .50 20.00 14.00 .93102 1.07410 1
83 315.00 1.00 19.00 2.00 3.00 .18 1.00 11.00 .75787 -4.12995 1
84 316.00 1.00 88.00 2.00 2.00 .38 12.00 11.00 .01142 -1.01155 2
85 318.00 1.00 88.00 3.00 2.00 .03 5.00 5.00 .01142 -1.01155 3
86 319.00 2.00 18.00 2.00 3.00 .30 15.00 16.00 .77245 1.29458 1
87 321.00 2.00 15.00 2.00 2.00 .63 15.00 18.00 .81241 1.23090 1
88 322.00 1.00 88.00 3.00 2.00 .40 18.00 15.00 .01142 -1.01155 1
89 325.00 2.00 18.00 2.00 3.00 1.00 28.00 18.00 .77245 1.29458 1
90 329.00 1.00 88.00 3.00 2.00 .03 7.00 11.00 .01142 -1.01155 4
91 332.00 2.00 2.00 2.00 2.00 .05 8.00 9.00 .92562 1.08036 1
94 dnl Note: In the above data cases 305, 316 318 and 329 have identical values
95 dnl of the 2nd and 3rd variables. We use this for weight testing.
97 AT_SETUP([LOGISTIC REGRESSION basic test])
98 AT_KEYWORDS([categorical categoricals])
102 AT_DATA([lr-data.sps], [dnl
105 data list notable file='lr-data.txt'
106 list /id outcome survrate prognos amttreat gsi avoid intrus pre_1 lre_1 w *.
109 variables = outcome with survrate
113 AT_CHECK([pspp -o pspp.csv -o pspp.txt lr-data.sps], [0], [dnl
114 note: Estimation terminated at iteration number 6 because parameter estimates changed by less than 0.001
116 AT_CHECK([cat pspp.csv], [0], [Table: Dependent Variable Encoding
117 Original Value,Internal Value
121 Table: Case Processing Summary
122 Unweighted Cases,N,Percent
123 Included in Analysis,66,100.0%
127 note: Estimation terminated at iteration number 6 because parameter estimates changed by less than 0.001
130 Step,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
133 Table: Classification Table
135 ,,,outcome,,Percentage Correct
136 ,Observed,,1.000,2.000,
137 Step 1,outcome,1.000,43,5,89.6%
139 ,Overall Percentage,,,,86.4%
141 Table: Variables in the Equation
142 ,,B,S.E.,Wald,df,Sig.,Exp(B)
143 Step 1,survrate,-.081,.019,17.756,1,.000,.922
144 ,Constant,2.684,.811,10.941,1,.001,14.639
148 AT_SETUP([LOGISTIC REGRESSION missing values])
149 AT_KEYWORDS([categorical categoricals])
153 AT_DATA([lr-data.sps], [dnl
156 data list notable file='lr-data.txt'
157 list /id outcome survrate prognos amttreat gsi avoid intrus pre_1 lre_1 w *.
159 missing values survrate (999) avoid (44444) outcome (99).
162 variables = outcome with survrate avoid
166 AT_CHECK([pspp -O format=csv lr-data.sps > run0], [0], [ignore])
168 dnl Append some cases with missing values into the data.
169 cat >> lr-data.txt << HERE
170 105.00 1.00 999.00 3.00 2.00 .35 17.00 20.00 .50110 -2.00440 1
171 106.00 1.00 999.00 2.00 3.00 .38 7.00 15.00 .20168 -1.25264 1
172 107.00 1.00 5.00 3.00 2.00 .28 44444 34 .00897 -1.00905 1
173 108.00 99 5.00 3.00 2.00 .28 4 34 .00897 -1.00905 1
176 AT_CHECK([pspp -O format=csv lr-data.sps > run1], [0], [ignore])
178 dnl Only the summary information should be different
179 AT_CHECK([diff run0 run1], [1], [dnl
181 < Included in Analysis,66,100.0%
182 < Missing Cases,0,.0%
185 > Included in Analysis,66,94.3%
186 > Missing Cases,4,5.7%
194 dnl Check that a weighted dataset is interpreted correctly
195 dnl To do this, the same data set is used, one weighted, one not.
196 dnl The weighted dataset omits certain cases which are identical
197 AT_SETUP([LOGISTIC REGRESSION weights])
198 AT_KEYWORDS([categorical categoricals])
202 AT_DATA([lr-data-unweighted.sps], [dnl
205 data list notable file='lr-data.txt'
206 list /id outcome survrate prognos amttreat gsi avoid intrus pre_1 lre_1 w *.
209 variables = outcome with survrate
213 AT_DATA([lr-data-weighted.sps], [dnl
216 data list notable file='lr-data.txt'
217 list /id outcome survrate prognos amttreat gsi avoid intrus pre_1 lre_1 w *.
221 * Omit duplicate cases.
222 select if id <> 305 and id <> 316 and id <> 318.
225 variables = outcome with survrate
230 AT_CHECK([pspp -O format=csv lr-data-unweighted.sps > unweighted-result], [0], [ignore])
231 AT_CHECK([pspp -O format=csv lr-data-weighted.sps > weighted-result], [0], [ignore])
233 dnl The only difference should be the summary information, since
234 dnl this displays the unweighted totals.
235 AT_CHECK([diff unweighted-result weighted-result], [1], [dnl
237 < Included in Analysis,66,100.0%
239 > Included in Analysis,63,100.0%
245 < Step 1,outcome,1.000,43,5,89.6%
248 > Step 1,outcome,1.000,43.000,5.000,89.6%
249 > ,,2.000,4.000,14.000,77.8%
256 dnl Check that the /NOCONST option works as intended.
257 dnl The results this produces are very similar to those
258 dnl at the example in http://www.ats.ucla.edu/stat/SPSS/faq/logregconst.htm
259 AT_SETUP([LOGISTIC REGRESSION without constant])
260 AT_KEYWORDS([categorical categoricals])
262 AT_DATA([non-const.sps], [dnl
267 compute female = (#i > 91).
273 compute constant = 1.
275 logistic regression female with constant /noconst.
278 AT_CHECK([pspp -O format=csv non-const.sps], [0], [dnl
279 Table: Dependent Variable Encoding
280 Original Value,Internal Value
284 Table: Case Processing Summary
285 Unweighted Cases,N,Percent
286 Included in Analysis,200,100.0%
290 note: Estimation terminated at iteration number 2 because parameter estimates changed by less than 0.001
293 Step,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
296 Table: Classification Table
298 ,,,female,,Percentage Correct
300 Step 1,female,.00,0,91,.0%
302 ,Overall Percentage,,,,54.5%
304 Table: Variables in the Equation
305 ,,B,S.E.,Wald,df,Sig.,Exp(B)
306 Step 1,constant,.180,.142,1.616,1,.204,1.198
313 dnl Check that if somebody passes a dependent variable which is not dichtomous,
314 dnl then an error is raised.
315 AT_SETUP([LOGISTIC REGRESSION non-dichotomous dep var])
316 AT_KEYWORDS([categorical categoricals])
318 AT_DATA([non-dich.sps], [dnl
319 data list notable list /y x1 x2 x3 x4.
326 logistic regression y with x1 x2 x3 x4.
329 AT_CHECK([pspp -O format=csv non-dich.sps], [1],
331 error: Dependent variable's values are not dichotomous.
338 dnl An example to check the behaviour of LOGISTIC REGRESSION with a categorical
339 dnl variable. This examṕle was inspired from that at:
340 dnl http://www.ats.ucla.edu/stat/spss/dae/logit.htm
341 AT_SETUP([LOGISTIC REGRESSION with categorical])
342 AT_KEYWORDS([categorical categoricals])
344 AT_DATA([lr-cat.data], [dnl
747 AT_DATA([lr-cat.sps], [dnl
750 data list notable list file='lr-cat.data' /b1 b2 bcat y.
758 AT_CHECK([pspp -O format=csv lr-cat.sps], [0], [dnl
759 Table: Dependent Variable Encoding
760 Original Value,Internal Value
764 Table: Case Processing Summary
765 Unweighted Cases,N,Percent
766 Included in Analysis,400,100.0%
770 note: Estimation terminated at iteration number 4 because parameter estimates changed by less than 0.001
773 Step,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
776 Table: Categorical Variables' Codings
777 ,,Frequency,Parameter coding,,
784 Table: Classification Table
786 ,,,y,,Percentage Correct
787 ,Observed,,4.000,9.000,
788 Step 1,y,4.000,254,19,93.0%
790 ,Overall Percentage,,,,71.0%
792 Table: Variables in the Equation
793 ,,B,S.E.,Wald,df,Sig.,Exp(B)
794 Step 1,b1,.002,.001,4.284,1,.038,1.002
795 ,b2,.804,.332,5.872,1,.015,2.235
796 ,bcat,,,20.895,3,.000,
797 ,bcat(1),1.551,.418,13.788,1,.000,4.718
798 ,bcat(2),.876,.367,5.706,1,.017,2.401
799 ,bcat(3),.211,.393,.289,1,.591,1.235
800 ,Constant,-5.541,1.138,23.709,1,.000,.004
806 dnl This example is inspired by http://www.ats.ucla.edu/stat/spss/output/logistic.htm
807 AT_SETUP([LOGISTIC REGRESSION with cat var 2])
808 AT_KEYWORDS([categorical categoricals])
810 AT_DATA([lr-cat2.data], [dnl
811 60.00 1.00 8.00 50.00
813 57.00 1.00 7.00 53.00
821 68.00 1.00 9.00 69.00
825 57.00 1.00 7.00 61.00
826 55.00 1.00 8.00 50.00
829 50.00 1.00 9.00 66.00
833 47.00 1.00 7.00 34.00
845 68.00 1.00 9.00 69.00
847 63.00 1.00 9.00 61.00
848 65.00 1.00 9.00 61.00
849 63.00 1.00 9.00 53.00
853 52.00 1.00 7.00 56.00
855 47.00 1.00 7.00 53.00
857 50.00 1.00 8.00 55.00
868 68.00 1.00 9.00 55.00
869 47.00 1.00 8.00 50.00
877 55.00 1.00 9.00 49.00
878 68.00 1.00 8.00 50.00
879 52.00 1.00 9.00 63.00
882 66.00 1.00 9.00 61.00
883 65.00 1.00 7.00 58.00
885 68.00 1.00 7.00 59.00
886 60.00 1.00 9.00 61.00
888 57.00 1.00 7.00 54.00
897 63.00 1.00 7.00 63.00
899 57.00 1.00 8.00 63.00
906 65.00 1.00 9.00 63.00
911 63.00 1.00 9.00 55.00
920 47.00 1.00 9.00 69.00
925 50.00 1.00 7.00 63.00
928 73.00 1.00 9.00 61.00
933 57.00 1.00 8.00 55.00
934 53.00 1.00 8.00 57.00
938 57.00 1.00 8.00 58.00
948 73.00 1.00 8.00 69.00
949 71.00 1.00 9.00 58.00
951 63.00 1.00 7.00 54.00
957 65.00 1.00 8.00 55.00
958 76.00 1.00 9.00 67.00
959 71.00 1.00 8.00 66.00
961 47.00 1.00 9.00 63.00
964 54.00 1.00 9.00 55.00
965 55.00 1.00 8.00 58.00
967 55.00 1.00 9.00 63.00
976 65.00 1.00 9.00 66.00
980 63.00 1.00 8.00 72.00
986 73.00 1.00 9.00 58.00
988 63.00 1.00 9.00 69.00
990 65.00 1.00 9.00 66.00
991 73.00 1.00 8.00 63.00
1000 60.00 1.00 9.00 50.00
1001 50.00 .00 9.00 47.00
1002 73.00 1.00 9.00 55.00
1003 52.00 1.00 8.00 47.00
1004 55.00 .00 8.00 53.00
1005 47.00 .00 8.00 53.00
1006 50.00 .00 8.00 61.00
1007 61.00 .00 7.00 44.00
1008 52.00 .00 9.00 53.00
1009 47.00 .00 7.00 40.00
1010 47.00 .00 7.00 50.00
1013 AT_DATA([stringcat.sps], [dnl
1015 data list notable file='lr-cat2.data' list /read honcomp wiz science *.
1018 recode wiz (7 = "a") (8 = "b") (9 = "c") into ses.
1020 logistic regression honcomp with read science ses
1025 AT_CHECK([pspp -O format=csv stringcat.sps], [0], [dnl
1026 Table: Dependent Variable Encoding
1027 Original Value,Internal Value
1031 Table: Case Processing Summary
1032 Unweighted Cases,N,Percent
1033 Included in Analysis,200,100.0%
1037 note: Estimation terminated at iteration number 5 because parameter estimates changed by less than 0.001
1039 Table: Model Summary
1040 Step,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
1043 Table: Categorical Variables' Codings
1044 ,,Frequency,Parameter coding,
1050 Table: Classification Table
1052 ,,,honcomp,,Percentage Correct
1053 ,Observed,,.000,1.000,
1054 Step 1,honcomp,.000,132,15,89.8%
1056 ,Overall Percentage,,,,79.5%
1058 Table: Variables in the Equation
1059 ,,B,S.E.,Wald,df,Sig.,Exp(B)
1060 Step 1,read,.098,.025,15.199,1,.000,1.103
1061 ,science,.066,.027,5.867,1,.015,1.068
1062 ,ses,,,6.690,2,.035,
1063 ,ses(1),.058,.532,.012,1,.913,1.060
1064 ,ses(2),-1.013,.444,5.212,1,.022,.363
1065 ,Constant,-9.561,1.662,33.113,1,.000,.000
1071 dnl Check that it doesn't crash if a categorical variable
1072 dnl has only one distinct value
1073 AT_SETUP([LOGISTIC REGRESSION identical categories])
1074 AT_KEYWORDS([categorical categoricals])
1076 AT_DATA([crash.sps], [dnl
1077 data list notable list /y x1 x2*.
1083 logistic regression y with x1 x2
1087 AT_CHECK([pspp -O format=csv crash.sps], [1], [ignore])
1092 dnl Test that missing values on the categorical predictors are treated
1094 AT_SETUP([LOGISTIC REGRESSION missing categoricals])
1095 AT_KEYWORDS([categorical categoricals])
1097 AT_DATA([data.txt], [dnl
1200 AT_DATA([miss.sps], [dnl
1201 data list notable file='data.txt' list /y x1 cat0*.
1203 logistic regression y with x1 cat0
1204 /categorical = cat0.
1207 AT_CHECK([pspp -O format=csv miss.sps > file1], [0], [ignore])
1209 dnl Append a case with a missing categorical.
1210 AT_CHECK([echo '1 34 .' >> data.txt], [0], [ignore])
1212 AT_CHECK([pspp -O format=csv miss.sps > file2], [0], [ignore])
1214 AT_CHECK([diff file1 file2], [1], [dnl
1216 < Included in Analysis,100,100.0%
1217 < Missing Cases,0,.0%
1220 > Included in Analysis,100,99.0%
1221 > Missing Cases,1,1.0%
1228 dnl Check that the confidence intervals are properly reported.
1229 dnl Use an example with categoricals, because that was buggy at
1230 dnl one point. The data in this example comes from:
1231 dnl http://people.ysu.edu/~gchang/SPSSE/SPSS_lab2Regression.pdf
1232 AT_SETUP([LOGISTIC REGRESSION confidence interval])
1233 AT_KEYWORDS([categorical categoricals])
1235 AT_DATA([ci.sps], [dnl
1237 data list notable list /disease age sciostat sector savings *.
1438 disease WITH age sciostat sector savings
1439 /categorical = sciostat sector
1443 AT_CHECK([pspp -O format=csv ci.sps], [0], [dnl
1444 Table: Dependent Variable Encoding
1445 Original Value,Internal Value
1449 Table: Case Processing Summary
1450 Unweighted Cases,N,Percent
1451 Included in Analysis,196,100.0%
1455 note: Estimation terminated at iteration number 4 because parameter estimates changed by less than 0.001
1457 Table: Model Summary
1458 Step,-2 Log likelihood,Cox & Snell R Square,Nagelkerke R Square
1461 Table: Categorical Variables' Codings
1462 ,,Frequency,Parameter coding,
1464 sciostat,1.000,77,1,0
1470 Table: Classification Table
1472 ,,,disease,,Percentage Correct
1473 ,Observed,,.000,1.000,
1474 Step 1,disease,.000,131,8,94.2%
1476 ,Overall Percentage,,,,75.0%
1478 Table: Variables in the Equation
1479 ,,B,S.E.,Wald,df,Sig.,Exp(B),95% CI for Exp(B),
1481 Step 1,age,.027,.009,8.647,1,.003,1.027,1.009,1.045
1482 ,savings,.061,.386,.025,1,.874,1.063,.499,2.264
1483 ,sciostat,,,.440,2,.803,,,
1484 ,sciostat(1),-.278,.434,.409,1,.522,.757,.323,1.775
1485 ,sciostat(2),-.219,.459,.227,1,.634,.803,.327,1.976
1486 ,sector,,,11.974,1,.001,,,
1487 ,sector(1),-1.235,.357,11.974,1,.001,.291,.145,.586
1488 ,Constant,-.814,.452,3.246,1,.072,.443,,