From 4239c455e7b1061b7c960b793f9080e113123845 Mon Sep 17 00:00:00 2001 From: John Darrington Date: Wed, 29 Dec 2004 01:19:57 +0000 Subject: [PATCH] Implemented calculation of percentiles and Tukey hinges --- doc/statistics.texi | 6 + po/en_GB.po | 221 ++++++++++-------- po/pspp.pot | 221 ++++++++++-------- src/ChangeLog | 6 + src/Makefile.am | 3 +- src/examine.q | 427 +++++++++++++++++++++++++++++++++- src/factor_stats.c | 14 +- src/factor_stats.h | 18 +- src/hash.c | 4 +- src/percentiles.c | 399 +++++++++++++++++++++++++++++++ src/percentiles.h | 83 +++++++ src/subclist.c | 4 +- src/subclist.h | 8 +- tests/command/examine.sh | 16 +- tests/command/trimmed-mean.sh | 4 +- 15 files changed, 1209 insertions(+), 225 deletions(-) create mode 100644 src/percentiles.c create mode 100644 src/percentiles.h diff --git a/doc/statistics.texi b/doc/statistics.texi index 19fe6d5a..56c37949 100644 --- a/doc/statistics.texi +++ b/doc/statistics.texi @@ -230,6 +230,7 @@ EXAMINE /COMPARE=@{GROUPS,VARIABLES@} /ID=@{case_number, var_name@} /@{TOTAL,NOTOTAL@} + /PERCENTILES=[value_list]=@{HAVERAGE, WAVERAGE, ROUND, AEMPIRICAL, EMPIRICAL @} /MISSING=@{LISTWISE, PAIRWISE@} [@{EXCLUDE, INCLUDE@}] [@{NOREPORT,REPORT@}] @@ -261,6 +262,11 @@ The PLOT subcommand specifies which plots are to be produced if any. The CINTERVAL subcommand specifies the confidence interval to use in calculation of the descriptives command. The default it 95%. +The PERCENTILES subcommand specifies which percentiles are to be calculated, +and which algorithm to use for calculating them. The default is to +calculate the 5, 10, 25, 50, 75, 90, 95 percentiles using the +HAVERAGE algorithm. + The TOTAL and NOTOTAL subcommands are mutually exclusive. 
If NOTOTAL is given and factors have been specified in the VARIABLES subcommand, then then statistics for the unfactored dependent variables are diff --git a/po/en_GB.po b/po/en_GB.po index 2dbb8508..6b382054 100644 --- a/po/en_GB.po +++ b/po/en_GB.po @@ -7,7 +7,7 @@ msgid "" msgstr "" "Project-Id-Version: PSPP 0.3.1\n" "Report-Msgid-Bugs-To: pspp-dev@gnu.org\n" -"POT-Creation-Date: 2004-12-02 13:38+0800\n" +"POT-Creation-Date: 2004-12-29 08:18+0800\n" "PO-Revision-Date: 2004-01-23 13:04+0800\n" "Last-Translator: John Darrington \n" "Language-Team: John Darrington \n" @@ -957,8 +957,8 @@ msgstr "" msgid "Only USE ALL is currently implemented." msgstr "" -#: src/descript.c:99 src/examine.q:1268 src/frequencies.q:112 src/oneway.q:396 -#: src/t-test.q:683 src/t-test.q:706 src/t-test.q:829 src/t-test.q:1166 +#: src/descript.c:99 src/examine.q:1400 src/frequencies.q:112 src/oneway.q:396 +#: src/t-test.q:690 src/t-test.q:713 src/t-test.q:836 src/t-test.q:1173 msgid "Mean" msgstr "" @@ -970,11 +970,11 @@ msgstr "" msgid "Std Dev" msgstr "" -#: src/descript.c:102 src/examine.q:1331 src/frequencies.q:117 +#: src/descript.c:102 src/examine.q:1478 src/frequencies.q:117 msgid "Variance" msgstr "" -#: src/descript.c:103 src/examine.q:1416 src/frequencies.q:118 +#: src/descript.c:103 src/examine.q:1585 src/frequencies.q:118 msgid "Kurtosis" msgstr "" @@ -982,7 +982,7 @@ msgstr "" msgid "S E Kurt" msgstr "" -#: src/descript.c:105 src/examine.q:1396 src/frequencies.q:120 +#: src/descript.c:105 src/examine.q:1565 src/frequencies.q:120 msgid "Skewness" msgstr "" @@ -990,16 +990,16 @@ msgstr "" msgid "S E Skew" msgstr "" -#: src/descript.c:107 src/examine.q:1379 src/frequencies.q:122 +#: src/descript.c:107 src/examine.q:1526 src/frequencies.q:122 msgid "Range" msgstr "" -#: src/descript.c:108 src/examine.q:1356 src/frequencies.q:123 +#: src/descript.c:108 src/examine.q:1503 src/frequencies.q:123 #: src/oneway.q:408 msgid "Minimum" msgstr "" -#: src/descript.c:109 src/examine.q:1367 
src/frequencies.q:124 +#: src/descript.c:109 src/examine.q:1514 src/frequencies.q:124 #: src/oneway.q:409 msgid "Maximum" msgstr "" @@ -1985,16 +1985,16 @@ msgstr "" msgid "<>" msgstr "" -#: src/hash.c:517 +#: src/hash.c:519 #, c-format msgid "hash table:" msgstr "" -#: src/histogram.c:138 +#: src/histogram.c:115 msgid "HISTOGRAM" msgstr "" -#: src/histogram.c:140 src/frequencies.q:1135 +#: src/histogram.c:117 src/frequencies.q:1137 msgid "Frequency" msgstr "" @@ -2714,6 +2714,26 @@ msgstr "" msgid "Error opening page on %s device of %s class." msgstr "" +#: src/percentiles.c:38 +msgid "HAverage" +msgstr "" + +#: src/percentiles.c:39 +msgid "Weighted Average" +msgstr "" + +#: src/percentiles.c:40 +msgid "Rounded" +msgstr "" + +#: src/percentiles.c:41 +msgid "Empirical" +msgstr "" + +#: src/percentiles.c:42 +msgid "Empirical with averaging" +msgstr "" + #: src/permissions.c:75 #, c-format msgid "Expecting %s or %s." @@ -3703,7 +3723,7 @@ msgstr "" #: src/sysfile-info.c:531 src/vfm.c:875 src/crosstabs.q:1099 #: src/crosstabs.q:1126 src/crosstabs.q:1146 src/crosstabs.q:1168 -#: src/examine.q:927 src/frequencies.q:1134 src/frequencies.q:1255 +#: src/examine.q:1054 src/frequencies.q:1136 src/frequencies.q:1257 msgid "Value" msgstr "" @@ -3924,33 +3944,33 @@ msgstr "" msgid "Summary." 
msgstr "" -#: src/crosstabs.q:802 src/examine.q:711 +#: src/crosstabs.q:802 src/examine.q:838 msgid "Cases" msgstr "" -#: src/crosstabs.q:803 src/examine.q:645 src/frequencies.q:1132 -#: src/frequencies.q:1505 +#: src/crosstabs.q:803 src/examine.q:772 src/frequencies.q:1134 +#: src/frequencies.q:1507 msgid "Valid" msgstr "" -#: src/crosstabs.q:804 src/examine.q:646 src/frequencies.q:1200 -#: src/frequencies.q:1506 +#: src/crosstabs.q:804 src/examine.q:773 src/frequencies.q:1202 +#: src/frequencies.q:1508 msgid "Missing" msgstr "" #: src/crosstabs.q:805 src/crosstabs.q:1008 src/crosstabs.q:1722 -#: src/examine.q:647 src/frequencies.q:1209 src/oneway.q:307 src/oneway.q:486 +#: src/examine.q:774 src/frequencies.q:1211 src/oneway.q:307 src/oneway.q:486 msgid "Total" msgstr "" -#: src/crosstabs.q:815 src/examine.q:723 src/frequencies.q:1504 -#: src/oneway.q:395 src/t-test.q:682 src/t-test.q:705 src/t-test.q:830 -#: src/t-test.q:1365 +#: src/crosstabs.q:815 src/examine.q:850 src/frequencies.q:1506 +#: src/oneway.q:395 src/t-test.q:689 src/t-test.q:712 src/t-test.q:837 +#: src/t-test.q:1372 msgid "N" msgstr "" -#: src/crosstabs.q:816 src/examine.q:726 src/frequencies.q:1136 -#: src/frequencies.q:1137 src/frequencies.q:1138 +#: src/crosstabs.q:816 src/examine.q:853 src/frequencies.q:1138 +#: src/frequencies.q:1139 src/frequencies.q:1140 msgid "Percent" msgstr "" @@ -3987,12 +4007,12 @@ msgid "adj. resid." 
msgstr "" #: src/crosstabs.q:1098 src/crosstabs.q:1125 src/crosstabs.q:1145 -#: src/crosstabs.q:1166 src/examine.q:1161 +#: src/crosstabs.q:1166 src/examine.q:1288 msgid "Statistic" msgstr "" -#: src/crosstabs.q:1100 src/oneway.q:278 src/oneway.q:707 src/t-test.q:980 -#: src/t-test.q:1172 src/t-test.q:1264 +#: src/crosstabs.q:1100 src/oneway.q:278 src/oneway.q:707 src/t-test.q:987 +#: src/t-test.q:1179 src/t-test.q:1271 msgid "df" msgstr "" @@ -4029,11 +4049,11 @@ msgstr "" msgid " 95%% Confidence Interval" msgstr "" -#: src/crosstabs.q:1147 src/t-test.q:984 src/t-test.q:1169 src/t-test.q:1267 +#: src/crosstabs.q:1147 src/t-test.q:991 src/t-test.q:1176 src/t-test.q:1274 msgid "Lower" msgstr "" -#: src/crosstabs.q:1148 src/t-test.q:985 src/t-test.q:1170 src/t-test.q:1268 +#: src/crosstabs.q:1148 src/t-test.q:992 src/t-test.q:1177 src/t-test.q:1275 msgid "Upper" msgstr "" @@ -4170,91 +4190,99 @@ msgstr "" msgid "%s Dependent" msgstr "" -#: src/examine.q:300 src/examine.q:312 +#: src/examine.q:418 src/examine.q:430 #, c-format msgid "%s and %s are mutually exclusive" msgstr "" -#: src/examine.q:705 +#: src/examine.q:832 msgid "Case Processing Summary" msgstr "" -#: src/examine.q:911 +#: src/examine.q:1038 msgid "Extreme Values" msgstr "" -#: src/examine.q:928 +#: src/examine.q:1055 msgid "Case Number" msgstr "" -#: src/examine.q:1016 +#: src/examine.q:1143 msgid "Highest" msgstr "" -#: src/examine.q:1021 +#: src/examine.q:1148 msgid "Lowest" msgstr "" -#: src/examine.q:1162 src/oneway.q:398 src/oneway.q:705 +#: src/examine.q:1289 src/oneway.q:398 src/oneway.q:705 msgid "Std. 
Error" msgstr "" -#: src/examine.q:1164 src/oneway.q:412 +#: src/examine.q:1291 src/oneway.q:412 msgid "Descriptives" msgstr "" -#: src/examine.q:1286 src/oneway.q:403 +#: src/examine.q:1418 src/oneway.q:403 #, c-format msgid "%g%% Confidence Interval for Mean" msgstr "" -#: src/examine.q:1292 src/oneway.q:405 +#: src/examine.q:1424 src/oneway.q:405 msgid "Lower Bound" msgstr "" -#: src/examine.q:1303 src/oneway.q:406 +#: src/examine.q:1435 src/oneway.q:406 msgid "Upper Bound" msgstr "" -#: src/examine.q:1315 +#: src/examine.q:1447 msgid "5% Trimmed Mean" msgstr "" -#: src/examine.q:1326 src/frequencies.q:114 +#: src/examine.q:1458 src/frequencies.q:114 msgid "Median" msgstr "" -#: src/examine.q:1343 src/oneway.q:397 src/t-test.q:684 src/t-test.q:707 -#: src/t-test.q:831 src/t-test.q:1167 +#: src/examine.q:1490 src/oneway.q:397 src/t-test.q:691 src/t-test.q:714 +#: src/t-test.q:838 src/t-test.q:1174 msgid "Std. Deviation" msgstr "" -#: src/examine.q:1391 +#: src/examine.q:1538 msgid "Interquartile Range" msgstr "" -#: src/examine.q:1459 +#: src/examine.q:1628 #, c-format msgid "Normal Q-Q Plot of %s" msgstr "" -#: src/examine.q:1460 src/examine.q:1466 +#: src/examine.q:1629 src/examine.q:1635 msgid "Observed Value" msgstr "" -#: src/examine.q:1461 +#: src/examine.q:1630 msgid "Expected Normal" msgstr "" -#: src/examine.q:1464 +#: src/examine.q:1633 #, c-format msgid "Detrended Normal Q-Q Plot of %s" msgstr "" -#: src/examine.q:1467 +#: src/examine.q:1636 msgid "Dev from Normal" msgstr "" +#: src/examine.q:1757 src/examine.q:1779 src/frequencies.q:1518 +msgid "Percentiles" +msgstr "" + +#: src/examine.q:1904 +msgid "Tukey's Hinges" +msgstr "" + #: src/file-handle.q:122 #, c-format msgid "" @@ -4314,75 +4342,71 @@ msgstr "" msgid "S.E. Skew" msgstr "" -#: src/frequencies.q:394 +#: src/frequencies.q:396 msgid "" "At most one of BARCHART, HISTOGRAM, or HBAR should be given. HBAR will be " "assumed. 
Argument values will be given precedence increasing along the " "order given." msgstr "" -#: src/frequencies.q:477 +#: src/frequencies.q:479 #, c-format msgid "" "MAX must be greater than or equal to MIN, if both are specified. However, " "MIN was specified as %g and MAX as %g. MIN and MAX will be ignored." msgstr "" -#: src/frequencies.q:798 +#: src/frequencies.q:800 msgid "" "Upper limit of integer mode value range must be greater than lower limit." msgstr "" -#: src/frequencies.q:811 +#: src/frequencies.q:813 #, c-format msgid "Variable %s specified multiple times on VARIABLES subcommand." msgstr "" -#: src/frequencies.q:817 +#: src/frequencies.q:819 #, c-format msgid "Integer mode specified, but %s is not a numeric variable." msgstr "" -#: src/frequencies.q:883 +#: src/frequencies.q:885 msgid "`)' expected after GROUPED interval list." msgstr "" -#: src/frequencies.q:895 +#: src/frequencies.q:897 #, c-format msgid "Variables %s specified on GROUPED but not on VARIABLES." msgstr "" -#: src/frequencies.q:902 +#: src/frequencies.q:904 #, c-format msgid "Variables %s specified multiple times on GROUPED subcommand." msgstr "" -#: src/frequencies.q:1133 src/frequencies.q:1225 src/frequencies.q:1226 -#: src/frequencies.q:1258 +#: src/frequencies.q:1135 src/frequencies.q:1227 src/frequencies.q:1228 +#: src/frequencies.q:1260 msgid "Cum" msgstr "" -#: src/frequencies.q:1155 +#: src/frequencies.q:1157 msgid "Value Label" msgstr "" -#: src/frequencies.q:1256 +#: src/frequencies.q:1258 msgid "Freq" msgstr "" -#: src/frequencies.q:1257 src/frequencies.q:1259 +#: src/frequencies.q:1259 src/frequencies.q:1261 msgid "Pct" msgstr "" -#: src/frequencies.q:1478 +#: src/frequencies.q:1480 #, c-format msgid "No valid data for variable %s; statistics not displayed." 
msgstr "" -#: src/frequencies.q:1516 -msgid "Percentiles" -msgstr "" - #: src/list.q:150 #, c-format msgid "" @@ -4432,7 +4456,7 @@ msgstr "" msgid "Coefficients for contrast %d do not total zero" msgstr "" -#: src/oneway.q:242 src/t-test.q:366 src/t-test.q:451 +#: src/oneway.q:242 src/t-test.q:366 src/t-test.q:458 #, c-format msgid "`%s' is not a variable name" msgstr "" @@ -4445,7 +4469,7 @@ msgstr "" msgid "Mean Square" msgstr "" -#: src/oneway.q:280 src/t-test.q:977 +#: src/oneway.q:280 src/t-test.q:984 msgid "F" msgstr "" @@ -4497,11 +4521,11 @@ msgstr "" msgid "Value of Contrast" msgstr "" -#: src/oneway.q:706 src/t-test.q:979 src/t-test.q:1171 src/t-test.q:1263 +#: src/oneway.q:706 src/t-test.q:986 src/t-test.q:1178 src/t-test.q:1270 msgid "t" msgstr "" -#: src/oneway.q:708 src/t-test.q:981 src/t-test.q:1173 src/t-test.q:1265 +#: src/oneway.q:708 src/t-test.q:988 src/t-test.q:1180 src/t-test.q:1272 msgid "Sig. (2-tailed)" msgstr "" @@ -4662,111 +4686,110 @@ msgstr "" msgid "Long string variable %s is not valid here." msgstr "" -#: src/t-test.q:399 +#: src/t-test.q:399 src/t-test.q:414 msgid "" -"When applying GROUPS to a string variable, at least one value must be " -"specified." +"When applying GROUPS to a string variable, two values must be specified." msgstr "" -#: src/t-test.q:486 +#: src/t-test.q:493 #, c-format msgid "" "PAIRED was specified but the number of variables preceding WITH (%d) did not " "match the number following (%d)." msgstr "" -#: src/t-test.q:503 +#: src/t-test.q:510 msgid "At least two variables must be specified on PAIRS." msgstr "" -#: src/t-test.q:680 +#: src/t-test.q:687 msgid "One-Sample Statistics" msgstr "" -#: src/t-test.q:685 src/t-test.q:708 src/t-test.q:832 +#: src/t-test.q:692 src/t-test.q:715 src/t-test.q:839 msgid "SE. 
Mean" msgstr "" -#: src/t-test.q:702 +#: src/t-test.q:709 msgid "Group Statistics" msgstr "" -#: src/t-test.q:826 +#: src/t-test.q:833 msgid "Paired Sample Statistics" msgstr "" -#: src/t-test.q:848 src/t-test.q:1192 src/t-test.q:1382 +#: src/t-test.q:855 src/t-test.q:1199 src/t-test.q:1389 #, c-format msgid "Pair %d" msgstr "" -#: src/t-test.q:965 +#: src/t-test.q:972 msgid "Independent Samples Test" msgstr "" -#: src/t-test.q:973 +#: src/t-test.q:980 msgid "Levene's Test for Equality of Variances" msgstr "" -#: src/t-test.q:975 +#: src/t-test.q:982 msgid "t-test for Equality of Means" msgstr "" -#: src/t-test.q:978 src/t-test.q:1367 +#: src/t-test.q:985 src/t-test.q:1374 msgid "Sig." msgstr "" -#: src/t-test.q:982 src/t-test.q:1266 +#: src/t-test.q:989 src/t-test.q:1273 msgid "Mean Difference" msgstr "" -#: src/t-test.q:983 +#: src/t-test.q:990 msgid "Std. Error Difference" msgstr "" -#: src/t-test.q:988 src/t-test.q:1163 src/t-test.q:1258 +#: src/t-test.q:995 src/t-test.q:1170 src/t-test.q:1265 #, c-format msgid "%g%% Confidence Interval of the Difference" msgstr "" -#: src/t-test.q:1043 +#: src/t-test.q:1050 msgid "Equal variances assumed" msgstr "" -#: src/t-test.q:1095 +#: src/t-test.q:1102 msgid "Equal variances not assumed" msgstr "" -#: src/t-test.q:1153 +#: src/t-test.q:1160 msgid "Paired Samples Test" msgstr "" -#: src/t-test.q:1156 +#: src/t-test.q:1163 msgid "Paired Differences" msgstr "" -#: src/t-test.q:1168 +#: src/t-test.q:1175 msgid "Std. 
Error Mean" msgstr "" -#: src/t-test.q:1247 +#: src/t-test.q:1254 msgid "One-Sample Test" msgstr "" -#: src/t-test.q:1252 +#: src/t-test.q:1259 #, c-format msgid "Test Value = %f" msgstr "" -#: src/t-test.q:1362 +#: src/t-test.q:1369 msgid "Paired Samples Correlations" msgstr "" -#: src/t-test.q:1366 +#: src/t-test.q:1373 msgid "Correlation" msgstr "" -#: src/t-test.q:1385 +#: src/t-test.q:1392 #, c-format msgid "%s & %s" msgstr "" diff --git a/po/pspp.pot b/po/pspp.pot index a74ef27a..c3fbf686 100644 --- a/po/pspp.pot +++ b/po/pspp.pot @@ -8,7 +8,7 @@ msgid "" msgstr "" "Project-Id-Version: PACKAGE VERSION\n" "Report-Msgid-Bugs-To: pspp-dev@gnu.org\n" -"POT-Creation-Date: 2004-12-02 19:27+0800\n" +"POT-Creation-Date: 2004-12-29 08:18+0800\n" "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" "Last-Translator: FULL NAME \n" "Language-Team: LANGUAGE \n" @@ -958,8 +958,8 @@ msgstr "" msgid "Only USE ALL is currently implemented." msgstr "" -#: src/descript.c:99 src/examine.q:1268 src/frequencies.q:112 src/oneway.q:396 -#: src/t-test.q:683 src/t-test.q:706 src/t-test.q:829 src/t-test.q:1166 +#: src/descript.c:99 src/examine.q:1400 src/frequencies.q:112 src/oneway.q:396 +#: src/t-test.q:690 src/t-test.q:713 src/t-test.q:836 src/t-test.q:1173 msgid "Mean" msgstr "" @@ -971,11 +971,11 @@ msgstr "" msgid "Std Dev" msgstr "" -#: src/descript.c:102 src/examine.q:1331 src/frequencies.q:117 +#: src/descript.c:102 src/examine.q:1478 src/frequencies.q:117 msgid "Variance" msgstr "" -#: src/descript.c:103 src/examine.q:1416 src/frequencies.q:118 +#: src/descript.c:103 src/examine.q:1585 src/frequencies.q:118 msgid "Kurtosis" msgstr "" @@ -983,7 +983,7 @@ msgstr "" msgid "S E Kurt" msgstr "" -#: src/descript.c:105 src/examine.q:1396 src/frequencies.q:120 +#: src/descript.c:105 src/examine.q:1565 src/frequencies.q:120 msgid "Skewness" msgstr "" @@ -991,16 +991,16 @@ msgstr "" msgid "S E Skew" msgstr "" -#: src/descript.c:107 src/examine.q:1379 src/frequencies.q:122 +#: src/descript.c:107 
src/examine.q:1526 src/frequencies.q:122 msgid "Range" msgstr "" -#: src/descript.c:108 src/examine.q:1356 src/frequencies.q:123 +#: src/descript.c:108 src/examine.q:1503 src/frequencies.q:123 #: src/oneway.q:408 msgid "Minimum" msgstr "" -#: src/descript.c:109 src/examine.q:1367 src/frequencies.q:124 +#: src/descript.c:109 src/examine.q:1514 src/frequencies.q:124 #: src/oneway.q:409 msgid "Maximum" msgstr "" @@ -1985,16 +1985,16 @@ msgstr "" msgid "<>" msgstr "" -#: src/hash.c:517 +#: src/hash.c:519 #, c-format msgid "hash table:" msgstr "" -#: src/histogram.c:138 +#: src/histogram.c:115 msgid "HISTOGRAM" msgstr "" -#: src/histogram.c:140 src/frequencies.q:1135 +#: src/histogram.c:117 src/frequencies.q:1137 msgid "Frequency" msgstr "" @@ -2714,6 +2714,26 @@ msgstr "" msgid "Error opening page on %s device of %s class." msgstr "" +#: src/percentiles.c:38 +msgid "HAverage" +msgstr "" + +#: src/percentiles.c:39 +msgid "Weighted Average" +msgstr "" + +#: src/percentiles.c:40 +msgid "Rounded" +msgstr "" + +#: src/percentiles.c:41 +msgid "Empirical" +msgstr "" + +#: src/percentiles.c:42 +msgid "Empirical with averaging" +msgstr "" + #: src/permissions.c:75 #, c-format msgid "Expecting %s or %s." @@ -3703,7 +3723,7 @@ msgstr "" #: src/sysfile-info.c:531 src/vfm.c:875 src/crosstabs.q:1099 #: src/crosstabs.q:1126 src/crosstabs.q:1146 src/crosstabs.q:1168 -#: src/examine.q:927 src/frequencies.q:1134 src/frequencies.q:1255 +#: src/examine.q:1054 src/frequencies.q:1136 src/frequencies.q:1257 msgid "Value" msgstr "" @@ -3924,33 +3944,33 @@ msgstr "" msgid "Summary." 
msgstr "" -#: src/crosstabs.q:802 src/examine.q:711 +#: src/crosstabs.q:802 src/examine.q:838 msgid "Cases" msgstr "" -#: src/crosstabs.q:803 src/examine.q:645 src/frequencies.q:1132 -#: src/frequencies.q:1505 +#: src/crosstabs.q:803 src/examine.q:772 src/frequencies.q:1134 +#: src/frequencies.q:1507 msgid "Valid" msgstr "" -#: src/crosstabs.q:804 src/examine.q:646 src/frequencies.q:1200 -#: src/frequencies.q:1506 +#: src/crosstabs.q:804 src/examine.q:773 src/frequencies.q:1202 +#: src/frequencies.q:1508 msgid "Missing" msgstr "" #: src/crosstabs.q:805 src/crosstabs.q:1008 src/crosstabs.q:1722 -#: src/examine.q:647 src/frequencies.q:1209 src/oneway.q:307 src/oneway.q:486 +#: src/examine.q:774 src/frequencies.q:1211 src/oneway.q:307 src/oneway.q:486 msgid "Total" msgstr "" -#: src/crosstabs.q:815 src/examine.q:723 src/frequencies.q:1504 -#: src/oneway.q:395 src/t-test.q:682 src/t-test.q:705 src/t-test.q:830 -#: src/t-test.q:1365 +#: src/crosstabs.q:815 src/examine.q:850 src/frequencies.q:1506 +#: src/oneway.q:395 src/t-test.q:689 src/t-test.q:712 src/t-test.q:837 +#: src/t-test.q:1372 msgid "N" msgstr "" -#: src/crosstabs.q:816 src/examine.q:726 src/frequencies.q:1136 -#: src/frequencies.q:1137 src/frequencies.q:1138 +#: src/crosstabs.q:816 src/examine.q:853 src/frequencies.q:1138 +#: src/frequencies.q:1139 src/frequencies.q:1140 msgid "Percent" msgstr "" @@ -3987,12 +4007,12 @@ msgid "adj. resid." 
msgstr "" #: src/crosstabs.q:1098 src/crosstabs.q:1125 src/crosstabs.q:1145 -#: src/crosstabs.q:1166 src/examine.q:1161 +#: src/crosstabs.q:1166 src/examine.q:1288 msgid "Statistic" msgstr "" -#: src/crosstabs.q:1100 src/oneway.q:278 src/oneway.q:707 src/t-test.q:980 -#: src/t-test.q:1172 src/t-test.q:1264 +#: src/crosstabs.q:1100 src/oneway.q:278 src/oneway.q:707 src/t-test.q:987 +#: src/t-test.q:1179 src/t-test.q:1271 msgid "df" msgstr "" @@ -4029,11 +4049,11 @@ msgstr "" msgid " 95%% Confidence Interval" msgstr "" -#: src/crosstabs.q:1147 src/t-test.q:984 src/t-test.q:1169 src/t-test.q:1267 +#: src/crosstabs.q:1147 src/t-test.q:991 src/t-test.q:1176 src/t-test.q:1274 msgid "Lower" msgstr "" -#: src/crosstabs.q:1148 src/t-test.q:985 src/t-test.q:1170 src/t-test.q:1268 +#: src/crosstabs.q:1148 src/t-test.q:992 src/t-test.q:1177 src/t-test.q:1275 msgid "Upper" msgstr "" @@ -4170,91 +4190,99 @@ msgstr "" msgid "%s Dependent" msgstr "" -#: src/examine.q:300 src/examine.q:312 +#: src/examine.q:418 src/examine.q:430 #, c-format msgid "%s and %s are mutually exclusive" msgstr "" -#: src/examine.q:705 +#: src/examine.q:832 msgid "Case Processing Summary" msgstr "" -#: src/examine.q:911 +#: src/examine.q:1038 msgid "Extreme Values" msgstr "" -#: src/examine.q:928 +#: src/examine.q:1055 msgid "Case Number" msgstr "" -#: src/examine.q:1016 +#: src/examine.q:1143 msgid "Highest" msgstr "" -#: src/examine.q:1021 +#: src/examine.q:1148 msgid "Lowest" msgstr "" -#: src/examine.q:1162 src/oneway.q:398 src/oneway.q:705 +#: src/examine.q:1289 src/oneway.q:398 src/oneway.q:705 msgid "Std. 
Error" msgstr "" -#: src/examine.q:1164 src/oneway.q:412 +#: src/examine.q:1291 src/oneway.q:412 msgid "Descriptives" msgstr "" -#: src/examine.q:1286 src/oneway.q:403 +#: src/examine.q:1418 src/oneway.q:403 #, c-format msgid "%g%% Confidence Interval for Mean" msgstr "" -#: src/examine.q:1292 src/oneway.q:405 +#: src/examine.q:1424 src/oneway.q:405 msgid "Lower Bound" msgstr "" -#: src/examine.q:1303 src/oneway.q:406 +#: src/examine.q:1435 src/oneway.q:406 msgid "Upper Bound" msgstr "" -#: src/examine.q:1315 +#: src/examine.q:1447 msgid "5% Trimmed Mean" msgstr "" -#: src/examine.q:1326 src/frequencies.q:114 +#: src/examine.q:1458 src/frequencies.q:114 msgid "Median" msgstr "" -#: src/examine.q:1343 src/oneway.q:397 src/t-test.q:684 src/t-test.q:707 -#: src/t-test.q:831 src/t-test.q:1167 +#: src/examine.q:1490 src/oneway.q:397 src/t-test.q:691 src/t-test.q:714 +#: src/t-test.q:838 src/t-test.q:1174 msgid "Std. Deviation" msgstr "" -#: src/examine.q:1391 +#: src/examine.q:1538 msgid "Interquartile Range" msgstr "" -#: src/examine.q:1459 +#: src/examine.q:1628 #, c-format msgid "Normal Q-Q Plot of %s" msgstr "" -#: src/examine.q:1460 src/examine.q:1466 +#: src/examine.q:1629 src/examine.q:1635 msgid "Observed Value" msgstr "" -#: src/examine.q:1461 +#: src/examine.q:1630 msgid "Expected Normal" msgstr "" -#: src/examine.q:1464 +#: src/examine.q:1633 #, c-format msgid "Detrended Normal Q-Q Plot of %s" msgstr "" -#: src/examine.q:1467 +#: src/examine.q:1636 msgid "Dev from Normal" msgstr "" +#: src/examine.q:1757 src/examine.q:1779 src/frequencies.q:1518 +msgid "Percentiles" +msgstr "" + +#: src/examine.q:1904 +msgid "Tukey's Hinges" +msgstr "" + #: src/file-handle.q:122 #, c-format msgid "" @@ -4314,75 +4342,71 @@ msgstr "" msgid "S.E. Skew" msgstr "" -#: src/frequencies.q:394 +#: src/frequencies.q:396 msgid "" "At most one of BARCHART, HISTOGRAM, or HBAR should be given. HBAR will be " "assumed. 
Argument values will be given precedence increasing along the " "order given." msgstr "" -#: src/frequencies.q:477 +#: src/frequencies.q:479 #, c-format msgid "" "MAX must be greater than or equal to MIN, if both are specified. However, " "MIN was specified as %g and MAX as %g. MIN and MAX will be ignored." msgstr "" -#: src/frequencies.q:798 +#: src/frequencies.q:800 msgid "" "Upper limit of integer mode value range must be greater than lower limit." msgstr "" -#: src/frequencies.q:811 +#: src/frequencies.q:813 #, c-format msgid "Variable %s specified multiple times on VARIABLES subcommand." msgstr "" -#: src/frequencies.q:817 +#: src/frequencies.q:819 #, c-format msgid "Integer mode specified, but %s is not a numeric variable." msgstr "" -#: src/frequencies.q:883 +#: src/frequencies.q:885 msgid "`)' expected after GROUPED interval list." msgstr "" -#: src/frequencies.q:895 +#: src/frequencies.q:897 #, c-format msgid "Variables %s specified on GROUPED but not on VARIABLES." msgstr "" -#: src/frequencies.q:902 +#: src/frequencies.q:904 #, c-format msgid "Variables %s specified multiple times on GROUPED subcommand." msgstr "" -#: src/frequencies.q:1133 src/frequencies.q:1225 src/frequencies.q:1226 -#: src/frequencies.q:1258 +#: src/frequencies.q:1135 src/frequencies.q:1227 src/frequencies.q:1228 +#: src/frequencies.q:1260 msgid "Cum" msgstr "" -#: src/frequencies.q:1155 +#: src/frequencies.q:1157 msgid "Value Label" msgstr "" -#: src/frequencies.q:1256 +#: src/frequencies.q:1258 msgid "Freq" msgstr "" -#: src/frequencies.q:1257 src/frequencies.q:1259 +#: src/frequencies.q:1259 src/frequencies.q:1261 msgid "Pct" msgstr "" -#: src/frequencies.q:1478 +#: src/frequencies.q:1480 #, c-format msgid "No valid data for variable %s; statistics not displayed." 
msgstr "" -#: src/frequencies.q:1516 -msgid "Percentiles" -msgstr "" - #: src/list.q:150 #, c-format msgid "" @@ -4432,7 +4456,7 @@ msgstr "" msgid "Coefficients for contrast %d do not total zero" msgstr "" -#: src/oneway.q:242 src/t-test.q:366 src/t-test.q:451 +#: src/oneway.q:242 src/t-test.q:366 src/t-test.q:458 #, c-format msgid "`%s' is not a variable name" msgstr "" @@ -4445,7 +4469,7 @@ msgstr "" msgid "Mean Square" msgstr "" -#: src/oneway.q:280 src/t-test.q:977 +#: src/oneway.q:280 src/t-test.q:984 msgid "F" msgstr "" @@ -4497,11 +4521,11 @@ msgstr "" msgid "Value of Contrast" msgstr "" -#: src/oneway.q:706 src/t-test.q:979 src/t-test.q:1171 src/t-test.q:1263 +#: src/oneway.q:706 src/t-test.q:986 src/t-test.q:1178 src/t-test.q:1270 msgid "t" msgstr "" -#: src/oneway.q:708 src/t-test.q:981 src/t-test.q:1173 src/t-test.q:1265 +#: src/oneway.q:708 src/t-test.q:988 src/t-test.q:1180 src/t-test.q:1272 msgid "Sig. (2-tailed)" msgstr "" @@ -4662,111 +4686,110 @@ msgstr "" msgid "Long string variable %s is not valid here." msgstr "" -#: src/t-test.q:399 +#: src/t-test.q:399 src/t-test.q:414 msgid "" -"When applying GROUPS to a string variable, at least one value must be " -"specified." +"When applying GROUPS to a string variable, two values must be specified." msgstr "" -#: src/t-test.q:486 +#: src/t-test.q:493 #, c-format msgid "" "PAIRED was specified but the number of variables preceding WITH (%d) did not " "match the number following (%d)." msgstr "" -#: src/t-test.q:503 +#: src/t-test.q:510 msgid "At least two variables must be specified on PAIRS." msgstr "" -#: src/t-test.q:680 +#: src/t-test.q:687 msgid "One-Sample Statistics" msgstr "" -#: src/t-test.q:685 src/t-test.q:708 src/t-test.q:832 +#: src/t-test.q:692 src/t-test.q:715 src/t-test.q:839 msgid "SE. 
Mean" msgstr "" -#: src/t-test.q:702 +#: src/t-test.q:709 msgid "Group Statistics" msgstr "" -#: src/t-test.q:826 +#: src/t-test.q:833 msgid "Paired Sample Statistics" msgstr "" -#: src/t-test.q:848 src/t-test.q:1192 src/t-test.q:1382 +#: src/t-test.q:855 src/t-test.q:1199 src/t-test.q:1389 #, c-format msgid "Pair %d" msgstr "" -#: src/t-test.q:965 +#: src/t-test.q:972 msgid "Independent Samples Test" msgstr "" -#: src/t-test.q:973 +#: src/t-test.q:980 msgid "Levene's Test for Equality of Variances" msgstr "" -#: src/t-test.q:975 +#: src/t-test.q:982 msgid "t-test for Equality of Means" msgstr "" -#: src/t-test.q:978 src/t-test.q:1367 +#: src/t-test.q:985 src/t-test.q:1374 msgid "Sig." msgstr "" -#: src/t-test.q:982 src/t-test.q:1266 +#: src/t-test.q:989 src/t-test.q:1273 msgid "Mean Difference" msgstr "" -#: src/t-test.q:983 +#: src/t-test.q:990 msgid "Std. Error Difference" msgstr "" -#: src/t-test.q:988 src/t-test.q:1163 src/t-test.q:1258 +#: src/t-test.q:995 src/t-test.q:1170 src/t-test.q:1265 #, c-format msgid "%g%% Confidence Interval of the Difference" msgstr "" -#: src/t-test.q:1043 +#: src/t-test.q:1050 msgid "Equal variances assumed" msgstr "" -#: src/t-test.q:1095 +#: src/t-test.q:1102 msgid "Equal variances not assumed" msgstr "" -#: src/t-test.q:1153 +#: src/t-test.q:1160 msgid "Paired Samples Test" msgstr "" -#: src/t-test.q:1156 +#: src/t-test.q:1163 msgid "Paired Differences" msgstr "" -#: src/t-test.q:1168 +#: src/t-test.q:1175 msgid "Std. 
Error Mean" msgstr "" -#: src/t-test.q:1247 +#: src/t-test.q:1254 msgid "One-Sample Test" msgstr "" -#: src/t-test.q:1252 +#: src/t-test.q:1259 #, c-format msgid "Test Value = %f" msgstr "" -#: src/t-test.q:1362 +#: src/t-test.q:1369 msgid "Paired Samples Correlations" msgstr "" -#: src/t-test.q:1366 +#: src/t-test.q:1373 msgid "Correlation" msgstr "" -#: src/t-test.q:1385 +#: src/t-test.q:1392 #, c-format msgid "%s & %s" msgstr "" diff --git a/src/ChangeLog b/src/ChangeLog index ad2e7b24..b2ec7335 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,9 @@ +Wed Dec 29 08:18:08 WST 2004 John Darrington + + * percentiles.[ch] Added. Calculates percentiles and Tukey hinges + + * examine.q factor_stats.[ch] Added calculation of percentiles + Fri Dec 24 15:09:11 WST 2004 John Darrington * t-test.q Fixed bug #11227 Made t-test work when the independent diff --git a/src/Makefile.am b/src/Makefile.am index af860861..1afb336e 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -64,7 +64,8 @@ groff-font.c group.c group.h group_proc.h \ hash.c hash.h html.c htmlP.h include.c inpt-pgm.c lexer.c \ lexer.h levene.c levene.h log.h loop.c magic.c magic.h main.c main.h \ matrix-data.c mis-val.c misc.c misc.h modify-vars.c \ -moments.c moments.h numeric.c output.c output.h permissions.c \ +moments.c moments.h numeric.c output.c output.h \ +percentiles.c percentiles.h permissions.c \ pfm-read.c pfm-read.h \ pfm-write.c pfm-write.h \ pool.c pool.h postscript.c print.c recode.c \ diff --git a/src/examine.q b/src/examine.q index d426ecca..97a63a90 100644 --- a/src/examine.q +++ b/src/examine.q @@ -42,6 +42,7 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA #include "casefile.h" #include "factor_stats.h" #include "moments.h" +#include "percentiles.h" /* (headers) */ #include "chart.h" @@ -55,6 +56,7 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA rep:report/!noreport, incl:include/!exclude; +compare=cmp:variables/!groups; + +percentiles=custom; 
+plot[plt_]=stemleaf,boxplot,npplot,:spreadlevel(*d:n),histogram,all,none; +cinterval=double; +statistics[st_]=descriptives,:extreme(*d:n),all,none. @@ -112,6 +114,11 @@ static void show_descriptives(struct variable **dependent_var, int n_dep_var, struct factor *factor); +static void show_percentiles(struct variable **dependent_var, + int n_dep_var, + struct factor *factor); + + void np_plot(const struct metrics *m, const char *factorname); @@ -131,10 +138,22 @@ void factor_calc(struct ccase *c, int case_no, static is_missing_func value_is_missing; +/* PERCENTILES */ + +static subc_list_double percentile_list; + +static enum pc_alg percentile_algorithm; + +static short sbc_percentile; + + int cmd_examine(void) { + subc_list_double_create(&percentile_list); + percentile_algorithm = PC_HAVERAGE; + if ( !parse_examine(&cmd) ) return CMD_FAILURE; @@ -150,11 +169,23 @@ cmd_examine(void) if ( ! cmd.sbc_cinterval) cmd.n_cinterval[0] = 95.0; + + /* If descriptives have been requested, make sure the + quartiles are calculated */ + if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] ) + { + subc_list_double_push(&percentile_list, 25); + subc_list_double_push(&percentile_list, 50); + subc_list_double_push(&percentile_list, 75); + } + multipass_procedure_with_splits (run_examine, &cmd); if ( totals ) free(totals); + subc_list_double_destroy(&percentile_list); + return CMD_SUCCESS; }; @@ -180,6 +211,8 @@ output_examine(void) show_descriptives(dependent_vars, n_dependent_vars, 0); } + if ( sbc_percentile ) + show_percentiles(dependent_vars, n_dependent_vars, 0); if ( cmd.sbc_plot) { @@ -208,7 +241,6 @@ output_examine(void) } - } @@ -227,6 +259,10 @@ output_examine(void) show_descriptives(dependent_vars, n_dependent_vars, fctr); } + if ( sbc_percentile ) + show_percentiles(dependent_vars, n_dependent_vars, fctr); + + if ( cmd.sbc_plot) { int v; @@ -290,6 +326,88 @@ output_examine(void) } +static struct hsh_table * +list_to_ptile_hash(const subc_list_double *l) +{ + int i; + + struct 
hsh_table *h ; + + h = hsh_create(subc_list_double_count(l), + (hsh_compare_func *) ptile_compare, + (hsh_hash_func *) ptile_hash, + (hsh_free_func *) free, + 0); + + + for ( i = 0 ; i < subc_list_double_count(l) ; ++i ) + { + struct percentile *p = xmalloc (sizeof (struct percentile)); + + p->p = subc_list_double_at(l,i); + + hsh_insert(h, p); + + } + + return h; + +} + +/* Parse the PERCENTILES subcommand */ +static int +xmn_custom_percentiles(struct cmd_examine *p UNUSED) +{ + sbc_percentile = 1; + + lex_match('='); + + lex_match('('); + + while ( lex_double_p() ) + { + subc_list_double_push(&percentile_list,lex_double()); + + lex_get(); + + lex_match(',') ; + } + lex_match(')'); + + lex_match('='); + + if ( lex_match_id("HAVERAGE")) + percentile_algorithm = PC_HAVERAGE; + + else if ( lex_match_id("WAVERAGE")) + percentile_algorithm = PC_WAVERAGE; + + else if ( lex_match_id("ROUND")) + percentile_algorithm = PC_ROUND; + + else if ( lex_match_id("EMPIRICAL")) + percentile_algorithm = PC_EMPIRICAL; + + else if ( lex_match_id("AEMPIRICAL")) + percentile_algorithm = PC_AEMPIRICAL; + + else if ( lex_match_id("NONE")) + percentile_algorithm = PC_NONE; + + + if ( 0 == subc_list_double_count(&percentile_list)) + { + subc_list_double_push(&percentile_list, 5); + subc_list_double_push(&percentile_list, 10); + subc_list_double_push(&percentile_list, 25); + subc_list_double_push(&percentile_list, 50); + subc_list_double_push(&percentile_list, 75); + subc_list_double_push(&percentile_list, 90); + subc_list_double_push(&percentile_list, 95); + } + + return 1; +} /* TOTAL and NOTOTAL are simple, mutually exclusive flags */ static int @@ -399,6 +517,9 @@ examine_parse_independent_vars(struct cmd_examine *cmd) +void populate_percentiles(struct tab_table *tbl, int col, int row, + const struct metrics *m); + void populate_descriptives(struct tab_table *t, int col, int row, const struct metrics *fs); @@ -552,11 +673,17 @@ run_examine(const struct casefile *cf, void *cmd_ ) fs != 0 
; fs = hsh_next(fctr->fstats, &hi)) { + + fs->m[v].ptile_hash = list_to_ptile_hash(&percentile_list); + fs->m[v].ptile_alg = percentile_algorithm; metrics_postcalc(&fs->m[v]); } fctr = fctr->next; } + + totals[v].ptile_hash = list_to_ptile_hash(&percentile_list); + totals[v].ptile_alg = percentile_algorithm; metrics_postcalc(&totals[v]); } @@ -1252,6 +1379,11 @@ show_descriptives(struct variable **dependent_var, + + + + + /* Fill in the descriptives data */ void populate_descriptives(struct tab_table *tbl, int col, int row, @@ -1325,6 +1457,21 @@ populate_descriptives(struct tab_table *tbl, int col, int row, TAB_LEFT | TAT_TITLE, _("Median")); + { + struct percentile *p; + double d = 50; + + p = hsh_find(m->ptile_hash, &d); + + assert(p); + + tab_float (tbl, col + 2, + row + 4, + TAB_CENTER, + p->v, + 8, 2); + } + tab_text (tbl, col, row + 5, TAB_LEFT | TAT_TITLE, @@ -1390,6 +1537,28 @@ populate_descriptives(struct tab_table *tbl, int col, int row, TAB_LEFT | TAT_TITLE, _("Interquartile Range")); + { + struct percentile *p1; + struct percentile *p2; + + double d = 75; + p1 = hsh_find(m->ptile_hash, &d); + + d = 25; + p2 = hsh_find(m->ptile_hash, &d); + + assert(p1); + assert(p2); + + tab_float (tbl, col + 2, + row + 10, + TAB_CENTER, + p1->v - p2->v, + 8, 2); + } + + + tab_text (tbl, col, row + 11, TAB_LEFT | TAT_TITLE, @@ -1518,3 +1687,259 @@ np_plot(const struct metrics *m, const char *factorname) chart_finalise(&dnp_chart); } + + + + +/* Show the percentiles */ +void +show_percentiles(struct variable **dependent_var, + int n_dep_var, + struct factor *fctr) +{ + struct tab_table *tbl; + int i; + + int n_cols, n_rows; + int n_factors; + + struct hsh_table *ptiles ; + + int n_heading_columns; + const int n_heading_rows = 2; + const int n_stat_rows = 2; + + int n_ptiles ; + + if ( fctr ) + { + struct factor_statistics **fs = fctr->fs ; + n_heading_columns = 3; + n_factors = hsh_count(fctr->fstats); + + ptiles = (*fs)->m[0].ptile_hash; + + if ( fctr->indep_var[1] ) + 
n_heading_columns = 4; + } + else + { + n_factors = 1; + n_heading_columns = 2; + + ptiles = totals[0].ptile_hash; + } + + n_ptiles = hsh_count(ptiles); + + n_rows = n_heading_rows + n_dep_var * n_stat_rows * n_factors; + + n_cols = n_heading_columns + n_ptiles ; + + tbl = tab_create (n_cols, n_rows, 0); + + tab_headers (tbl, n_heading_columns + 1, 0, n_heading_rows, 0); + + tab_dim (tbl, tab_natural_dimensions); + + /* Outline the box and have no internal lines*/ + tab_box (tbl, + TAL_2, TAL_2, + -1, -1, + 0, 0, + n_cols - 1, n_rows - 1); + + tab_hline (tbl, TAL_2, 0, n_cols - 1, n_heading_rows ); + + tab_vline (tbl, TAL_2, n_heading_columns, 0, n_rows - 1); + + + tab_title (tbl, 0, _("Percentiles")); + + + tab_hline (tbl, TAL_1, n_heading_columns, n_cols - 1, 1 ); + + + tab_box (tbl, + -1, -1, + -1, TAL_1, + 0, n_heading_rows, + n_heading_columns - 1, n_rows - 1); + + + tab_box (tbl, + -1, -1, + -1, TAL_1, + n_heading_columns, n_heading_rows - 1, + n_cols - 1, n_rows - 1); + + tab_joint_text(tbl, n_heading_columns + 1, 0, + n_cols - 1 , 0, + TAB_CENTER | TAT_TITLE , + _("Percentiles")); + + + { + /* Put in the percentile break points as headings */ + + struct percentile **p = (struct percentile **) hsh_sort(ptiles); + + i = 0; + while ( (*p) ) + { + tab_float(tbl, n_heading_columns + i++ , 1, + TAB_CENTER, + (*p)->p, 8, 0); + + p++; + } + + } + + for ( i = 0 ; i < n_dep_var ; ++i ) + { + const int n_stat_rows = 2; + const int row = n_heading_rows + i * n_stat_rows * n_factors ; + + if ( i > 0 ) + tab_hline(tbl, TAL_1, 0, n_cols - 1, row ); + + tab_text (tbl, 0, + i * n_stat_rows * n_factors + n_heading_rows, + TAB_LEFT | TAT_TITLE, + var_to_string(dependent_var[i]) + ); + + if ( fctr ) + { + struct factor_statistics **fs = fctr->fs; + int count = 0; + + tab_text (tbl, 1, n_heading_rows - 1, + TAB_CENTER | TAT_TITLE, + var_to_string(fctr->indep_var[0])); + + + if ( fctr->indep_var[1]) + tab_text (tbl, 2, n_heading_rows - 1, TAB_CENTER | TAT_TITLE, + 
var_to_string(fctr->indep_var[1])); + + while( *fs ) + { + + static union value prev ; + + const int row = n_heading_rows + n_stat_rows * + ( ( i * n_factors ) + count ); + + + if ( 0 != compare_values(&prev, &(*fs)->id[0], + fctr->indep_var[0]->width)) + { + + if ( count > 0 ) + tab_hline (tbl, TAL_1, 1, n_cols - 1, row); + + tab_text (tbl, + 1, row, + TAB_LEFT | TAT_TITLE, + value_to_string(&(*fs)->id[0], fctr->indep_var[0]) + ); + + + } + + prev = (*fs)->id[0]; + + if (fctr->indep_var[1] && count > 0 ) + tab_hline(tbl, TAL_1, 2, n_cols - 1, row); + + if ( fctr->indep_var[1]) + tab_text (tbl, 2, row, + TAB_LEFT | TAT_TITLE, + value_to_string(&(*fs)->id[1], fctr->indep_var[1]) + ); + + + populate_percentiles(tbl, n_heading_columns - 1, + row, &(*fs)->m[i]); + + + count++ ; + fs++; + } + + + } + else + { + populate_percentiles(tbl, n_heading_columns - 1, + i * n_stat_rows * n_factors + n_heading_rows, + &totals[i]); + } + + + } + + + tab_submit(tbl); + + +} + + + + +void +populate_percentiles(struct tab_table *tbl, int col, int row, + const struct metrics *m) +{ + int i; + + struct percentile **p = (struct percentile **) hsh_sort(m->ptile_hash); + + tab_text (tbl, + col, row + 1, + TAB_LEFT | TAT_TITLE, + _("Tukey\'s Hinges") + ); + + tab_text (tbl, + col, row, + TAB_LEFT | TAT_TITLE, + ptile_alg_desc[m->ptile_alg] + ); + + + i = 0; + while ( (*p) ) + { + tab_float(tbl, col + i + 1 , row, + TAB_CENTER, + (*p)->v, 8, 2); + if ( (*p)->p == 25 ) + tab_float(tbl, col + i + 1 , row + 1, + TAB_CENTER, + m->hinges[0], 8, 2); + + if ( (*p)->p == 50 ) + tab_float(tbl, col + i + 1 , row + 1, + TAB_CENTER, + m->hinges[1], 8, 2); + + if ( (*p)->p == 75 ) + tab_float(tbl, col + i + 1 , row + 1, + TAB_CENTER, + m->hinges[2], 8, 2); + + + i++; + + p++; + } + + + + +} + diff --git a/src/factor_stats.c b/src/factor_stats.c index 3a88ab44..7e5ac8b4 100644 --- a/src/factor_stats.c +++ b/src/factor_stats.c @@ -25,6 +25,7 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 
#include "algorithm.h" #include "alloc.h" #include "moments.h" +#include "percentiles.h" #include #include @@ -51,9 +52,6 @@ metrics_precalc(struct metrics *m) (hsh_hash_func *) hash_value, (hsh_free_func *) weighted_value_free, (void *) 0); - - - } @@ -152,7 +150,6 @@ metrics_postcalc(struct metrics *m) gsl_histogram_accumulate(m->histogram, wv[i]->v.f, wv[i]->w); } - /* Trimmed mean calculation */ if ( m->n_data <= 1 ) { @@ -175,9 +172,10 @@ metrics_postcalc(struct metrics *m) if ( cc < tc ) k1 = i; - } + + k2 = m->n_data; for ( i = m->n_data -1 ; i >= 0; --i ) { @@ -185,6 +183,12 @@ metrics_postcalc(struct metrics *m) k2 = i; } + + /* Calculate the percentiles */ + ptiles(m->ptile_hash, m->wvp, m->n_data, m->n, m->ptile_alg); + + tukey_hinges(m->wvp, m->n_data, m->n, m->hinges); + /* Special case here */ if ( k1 + 1 == k2 ) { diff --git a/src/factor_stats.h b/src/factor_stats.h index d29ca4d8..f6394a8a 100644 --- a/src/factor_stats.h +++ b/src/factor_stats.h @@ -28,6 +28,8 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA #include "hash.h" #include "val.h" #include +#include "subclist.h" +#include "percentiles.h" struct moments1; @@ -58,18 +60,28 @@ struct metrics double trimmed_mean; - /* A hash of data for this factor. - */ + /* A hash of data for this factor. 
*/ struct hsh_table *ordered_data; /* A Pointer to this hash table AFTER it has been SORTED and crunched */ struct weighted_value **wvp; - /* The number of values in the above array (if all the weights are 1, then this will be the same as n) */ int n_data; + + /* Percentile stuff */ + + /* A hash of struct percentiles */ + struct hsh_table *ptile_hash; + + /* Algorithm to be used for calculating percentiles */ + enum pc_alg ptile_alg; + + /* Tukey's Hinges */ + double hinges[3]; + }; diff --git a/src/hash.c b/src/hash.c index a7325658..2544a75d 100644 --- a/src/hash.c +++ b/src/hash.c @@ -142,7 +142,9 @@ hsh_create (int size, hsh_compare_func *compare, hsh_hash_func *hash, struct hsh_table *h; int i; - assert (size > 0); + if ( size == 0 ) + return NULL; + assert (compare != NULL); assert (hash != NULL); diff --git a/src/percentiles.c b/src/percentiles.c new file mode 100644 index 00000000..9719676d --- /dev/null +++ b/src/percentiles.c @@ -0,0 +1,399 @@ +/* PSPP - A program for statistical analysis . -*-c-*- + +Copyright (C) 2004 Free Software Foundation, Inc. +Author: John Darrington 2004 + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. 
*/ + +#include "factor_stats.h" +#include "percentiles.h" +#include "misc.h" + +#include + + +struct ptile_params +{ + double g1, g1_star; + double g2, g2_star; + int k1, k2; +}; + + +const char *ptile_alg_desc[] = { + "", + N_("HAverage"), + N_("Weighted Average"), + N_("Rounded"), + N_("Empirical"), + N_("Empirical with averaging") +}; + + + + +/* Individual Percentile algorithms */ + +/* Closest observation to tc1 */ +double ptile_round(const struct weighted_value **wv, + const struct ptile_params *par); + + +/* Weighted average at y_tc2 */ +double ptile_haverage(const struct weighted_value **wv, + const struct ptile_params *par); + + +/* Weighted average at y_tc1 */ +double ptile_waverage(const struct weighted_value **wv, + const struct ptile_params *par); + + +/* Empirical distribution function */ +double ptile_empirical(const struct weighted_value **wv, + const struct ptile_params *par); + + +/* Empirical distribution function with averaging*/ +double ptile_aempirical(const struct weighted_value **wv, + const struct ptile_params *par); + + + + +/* Closest observation to tc1 */ +double +ptile_round(const struct weighted_value **wv, + const struct ptile_params *par) +{ + double x; + + if ( wv[par->k1 + 1]->w >= 1 ) + { + if ( par->g1_star < 0.5 ) + x = wv[par->k1]->v.f; + else + x = wv[par->k1 + 1]->v.f; + } + else + { + if ( par->g1 < 0.5 ) + x = wv[par->k1]->v.f; + else + x = wv[par->k1 + 1]->v.f; + + } + + return x; +} + +/* Weighted average at y_tc2 */ +double +ptile_haverage(const struct weighted_value **wv, + const struct ptile_params *par) +{ + if ( par->g2_star >= 1.0 ) + return wv[par->k2 + 1]->v.f ; + + /* Special case for k2 + 1 >= n_data + (actually it's not a special case, but just avoids indexing errors ) + */ + if ( par->g2_star == 0 ) + { + assert(par->g2 == 0 ); + return wv[par->k2]->v.f; + } + + assert(par->k2 >= 0); + + if ( wv[par->k2 + 1]->w >= 1.0 ) + return ( (1 - par->g2_star) * wv[par->k2]->v.f + + + par->g2_star * wv[par->k2 + 
1]->v.f); + else + return ( (1 - par->g2) * wv[par->k2]->v.f + + + par->g2 * wv[par->k2 + 1]->v.f); + +} + + + +/* Weighted average at y_tc1 */ +double +ptile_waverage(const struct weighted_value **wv, + const struct ptile_params *par) +{ + if ( par->g1_star >= 1.0 ) + return wv[par->k1 + 1]->v.f ; + + if ( wv[par->k1 + 1]->w >= 1.0 ) + return ( (1 - par->g1_star) * wv[par->k1]->v.f + + + par->g1_star * wv[par->k1 + 1]->v.f); + else + return ( (1 - par->g1) * wv[par->k1]->v.f + + + par->g1 * wv[par->k1 + 1]->v.f); +} + + +/* Empirical distribution function */ +double +ptile_empirical(const struct weighted_value **wv, + const struct ptile_params *par) +{ + if ( par->g1_star > 0 ) + return wv[par->k1 + 1]->v.f; + else + return wv[par->k1]->v.f; +} + + + +/* Empirical distribution function with averageing */ +double +ptile_aempirical(const struct weighted_value **wv, + const struct ptile_params *par) +{ + if ( par->g1_star > 0 ) + return wv[par->k1 + 1]->v.f; + else + return (wv[par->k1]->v.f + wv[par->k1 + 1]->v.f ) / 2.0 ; +} + + + +/* Compute the percentile p */ +double ptile(double p, + const struct weighted_value **wv, + int n_data, + double w, + enum pc_alg algorithm); + + + +double +ptile(double p, + const struct weighted_value **wv, + int n_data, + double w, + enum pc_alg algorithm) +{ + int i; + double tc1, tc2; + double result; + + struct ptile_params pp; + + assert( p <= 1.0); + + tc1 = w * p ; + tc2 = (w + 1) * p ; + + pp.k1 = -1; + pp.k2 = -1; + + for ( i = 0 ; i < n_data ; ++i ) + { + if ( wv[i]->cc <= tc1 ) + pp.k1 = i; + + if ( wv[i]->cc <= tc2 ) + pp.k2 = i; + + } + + + if ( pp.k1 >= 0 ) + { + pp.g1 = ( tc1 - wv[pp.k1]->cc ) / wv[pp.k1 + 1]->w; + pp.g1_star = tc1 - wv[pp.k1]->cc ; + } + else + { + pp.g1 = tc1 / wv[pp.k1 + 1]->w; + pp.g1_star = tc1 ; + } + + + if ( pp.k2 + 1 >= n_data ) + { + pp.g2 = 0 ; + pp.g2_star = 0; + } + else + { + if ( pp.k2 >= 0 ) + { + pp.g2 = ( tc2 - wv[pp.k2]->cc ) / wv[pp.k2 + 1]->w; + pp.g2_star = tc2 - wv[pp.k2]->cc ; + 
} + else + { + pp.g2 = tc2 / wv[pp.k2 + 1]->w; + pp.g2_star = tc2 ; + } + } + + switch ( algorithm ) + { + case PC_HAVERAGE: + result = ptile_haverage(wv, &pp); + break; + case PC_WAVERAGE: + result = ptile_waverage(wv, &pp); + break; + case PC_ROUND: + result = ptile_round(wv, &pp); + break; + case PC_EMPIRICAL: + result = ptile_empirical(wv, &pp); + break; + case PC_AEMPIRICAL: + result = ptile_aempirical(wv, &pp); + break; + default: + result = SYSMIS; + } + + return result; +} + + +/* + Calculate the values of the percentiles in pc_hash. + wv is a sorted array of weighted values of the data set. +*/ +void +ptiles(struct hsh_table *pc_hash, + const struct weighted_value **wv, + int n_data, + double w, + enum pc_alg algorithm) +{ + struct hsh_iterator hi; + struct percentile *p; + + if ( !pc_hash ) + return ; + for ( p = hsh_first(pc_hash, &hi); + p != 0 ; + p = hsh_next(pc_hash, &hi)) + { + p->v = ptile(p->p/100.0 , wv, n_data, w, algorithm); + } + +} + + +/* Calculate Tukey's Hinges */ +void +tukey_hinges(const struct weighted_value **wv, + int n_data, + double w, + double hinges[3]) +{ + int i; + double c_star = DBL_MAX; + double d; + double l[3]; + int h[3]; + double a, a_star; + + for ( i = 0 ; i < n_data ; ++i ) + { + c_star = min(c_star, wv[i]->w); + } + + if ( c_star > 1 ) c_star = 1; + + d = floor((w/c_star + 3 ) / 2.0)/ 2.0; + + l[0] = d*c_star; + l[1] = w/2.0 + c_star/2.0; + l[2] = w + c_star - d*c_star; + + h[0]=-1; + h[1]=-1; + h[2]=-1; + + for ( i = 0 ; i < n_data ; ++i ) + { + if ( l[0] >= wv[i]->cc ) h[0] = i ; + if ( l[1] >= wv[i]->cc ) h[1] = i ; + if ( l[2] >= wv[i]->cc ) h[2] = i ; + } + + for ( i = 0 ; i < 3 ; i++ ) + { + assert(h[i] + 1< n_data); + + if ( h[i] >= 0 ) + a_star = l[i] - wv[h[i]]->cc ; + else + a_star = l[i]; + + a = a_star / ( wv[h[i]+1]->cc ) ; + + if ( a_star >= 1.0 ) + { + hinges[i] = wv[h[i] + 1]->v.f ; + continue; + } + + if ( wv[h[i]+1]->w >= 1) + { + hinges[i] = ( 1 - a_star)* wv[h[i]]->v.f + + a_star * wv[h[i]+1]->v.f; 
+ + continue; + } + + hinges[i] = ( 1 - a)* wv[h[i]]->v.f + a * wv[h[i]+1]->v.f; + + } + + assert(hinges[0] <= hinges[1]); + assert(hinges[1] <= hinges[2]); + +} + +int +ptile_compare(const struct percentile *p1, + const struct percentile *p2, + void *aux UNUSED) +{ + + int cmp; + + if ( p1->p == p2->p) + cmp = 0 ; + else if (p1->p < p2->p) + cmp = -1 ; + else + cmp = +1; + + return cmp; +} + +unsigned +ptile_hash(const struct percentile *p, void *aux UNUSED) +{ + return hsh_hash_double(p->p); +} + + diff --git a/src/percentiles.h b/src/percentiles.h new file mode 100644 index 00000000..8baba9fa --- /dev/null +++ b/src/percentiles.h @@ -0,0 +1,83 @@ +/* PSPP - A program for statistical analysis . -*-c-*- + +Copyright (C) 2004 Free Software Foundation, Inc. +Author: John Darrington 2004 + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA +02111-1307, USA. 
*/ + +#ifndef PERCENTILES_H +#define PERCENTILES_H + + +#include "hash.h" + +struct weighted_value ; + +/* The algorithm used to calculate percentiles */ +enum pc_alg { + PC_NONE=0, + PC_HAVERAGE, + PC_WAVERAGE, + PC_ROUND, + PC_EMPIRICAL, + PC_AEMPIRICAL +} ; + + + +extern const char *ptile_alg_desc[]; + + + + +struct percentile { + + /* The break point of the percentile */ + double p; + + /* The value of the percentile */ + double v; +}; + + +/* Calculate the percentiles of the break points in pc_bp, + placing the values in pc_val. + wv is a sorted array of weighted values of the data set. +*/ +void ptiles(struct hsh_table *pc_hash, + const struct weighted_value **wv, + int n_data, + double w, + enum pc_alg algorithm); + + +/* Calculate Tukey's Hinges */ +void tukey_hinges(const struct weighted_value **wv, + int n_data, + double w, + double hinges[3]); + + + +/* Hash utility functions */ +int ptile_compare(const struct percentile *p1, + const struct percentile *p2, + void *aux); + +unsigned ptile_hash(const struct percentile *p, void *aux); + + +#endif diff --git a/src/subclist.c b/src/subclist.c index d2add44c..2a7f3679 100644 --- a/src/subclist.c +++ b/src/subclist.c @@ -53,7 +53,7 @@ subc_list_double_push(subc_list_double *l, double d) /* Return the number of items in the list */ int -subc_list_double_count(subc_list_double *l) +subc_list_double_count(const subc_list_double *l) { return l->n_data; } @@ -61,7 +61,7 @@ subc_list_double_count(subc_list_double *l) /* Index into the list (array) */ double -subc_list_double_at(subc_list_double *l, int idx) +subc_list_double_at(const subc_list_double *l, int idx) { return l->data[idx]; } diff --git a/src/subclist.h b/src/subclist.h index b311bc67..1a110e13 100644 --- a/src/subclist.h +++ b/src/subclist.h @@ -57,12 +57,12 @@ void subc_list_double_push(subc_list_double *l, double d) ; void subc_list_int_push(subc_list_int *l, int i) ; /* Index into the list */ -double subc_list_double_at(subc_list_double *l, int idx); 
-int subc_list_int_at(subc_list_int *l, int idx); +double subc_list_double_at(const subc_list_double *l, int idx); +int subc_list_int_at(const subc_list_int *l, int idx); /* Return the number of values in the list */ -int subc_list_double_count(subc_list_double *l); -int subc_list_int_count(subc_list_int *l); +int subc_list_double_count(const subc_list_double *l); +int subc_list_int_count(const subc_list_int *l); /* Destroy the list */ void subc_list_double_destroy(subc_list_double *l) ; diff --git a/tests/command/examine.sh b/tests/command/examine.sh index 11830edb..55c80a8a 100755 --- a/tests/command/examine.sh +++ b/tests/command/examine.sh @@ -152,13 +152,13 @@ Case# QUALITY W BRAND # 95% Confidence Interval for MeanLower Bound# 3.562 | # # Upper Bound# 3.521 | # # 5% Trimmed Mean # 3.50 | # -# Median # | # +# Median # 4.00 | # # Variance # 2.520 | # # Std. Deviation # 1.587 | # # Minimum # 1.000 | # # Maximum # 7.000 | # # Range # 6.000 | # -# Interquartile Range # | # +# Interquartile Range # 2.75 | # # Skewness # .059 | .472 # # Kurtosis # -.358 | .918 # #==========================================================#=========#==========# @@ -213,13 +213,13 @@ Case# QUALITY W BRAND # 95% Confidence Interval for MeanLower Bound# 2.279 | # # Upper Bound# 2.221 | # # 5% Trimmed Mean # 2.22 | # -# Median # | # +# Median # 2.00 | # # Variance # 1.643 | # # Std. Deviation # 1.282 | # # Minimum # 1.000 | # # Maximum # 4.000 | # # Range # 3.000 | # -# Interquartile Range # | # +# Interquartile Range # 2.75 | # # Skewness # .475 | .752 # # Kurtosis # -1.546 | 1.481 # # -------------------------------------------------------#---------+----------# @@ -227,13 +227,13 @@ Case# QUALITY W BRAND # 95% Confidence Interval for MeanLower Bound# 3.525 | # # Upper Bound# 3.475 | # # 5% Trimmed Mean # 3.50 | # -# Median # | # +# Median # 4.00 | # # Variance # 1.143 | # # Std. 
Deviation # 1.069 | # # Minimum # 2.000 | # # Maximum # 5.000 | # # Range # 3.000 | # -# Interquartile Range # | # +# Interquartile Range # 1.75 | # # Skewness # -.468 | .752 # # Kurtosis # -.831 | 1.481 # # -------------------------------------------------------#---------+----------# @@ -241,13 +241,13 @@ Case# QUALITY W BRAND # 95% Confidence Interval for MeanLower Bound# 4.904 | # # Upper Bound# 4.846 | # # 5% Trimmed Mean # 4.86 | # -# Median # | # +# Median # 5.00 | # # Variance # 1.554 | # # Std. Deviation # 1.246 | # # Minimum # 3.000 | # # Maximum # 7.000 | # # Range # 4.000 | # -# Interquartile Range # | # +# Interquartile Range # 1.75 | # # Skewness # .304 | .752 # # Kurtosis # .146 | 1.481 # #======================================================================#=========#==========# diff --git a/tests/command/trimmed-mean.sh b/tests/command/trimmed-mean.sh index 0690c0ea..8b84a31c 100755 --- a/tests/command/trimmed-mean.sh +++ b/tests/command/trimmed-mean.sh @@ -100,13 +100,13 @@ diff $TEMPDIR/pspp.list - << EOF # 95% Confidence Interval for MeanLower Bound# 2.021 | # # Upper Bound# 2.017 | # # 5% Trimmed Mean # 2.00 | # -# Median # | # +# Median # 2.00 | # # Variance # .058 | # # Std. Deviation # .242 | # # Minimum # 1.000 | # # Maximum # 3.000 | # # Range # 2.000 | # -# Interquartile Range # | # +# Interquartile Range # .00 | # # Skewness # 1.194 | .330 # # Kurtosis # 15.732 | .650 # #============================================#=========#==========# -- 2.30.2