From: Ben Pfaff Date: Fri, 10 Oct 2008 05:49:13 +0000 (-0700) Subject: Merge commit 'origin/stable' X-Git-Tag: sav-api~870^2~39 X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=015e221b0f8578afee769528572c76387f26c629;hp=d9d37c3a04dc15263c561174b87e48a4650b412c;p=pspp Merge commit 'origin/stable' Conflicts: tests/automake.mk --- diff --git a/.gitignore b/.gitignore index c29aff49c2..1d69e16963 100644 --- a/.gitignore +++ b/.gitignore @@ -32,3 +32,10 @@ reloc-ldflags stamp-h1 texinfo.tex gitlog-to-changelog +*~ +*.o +*.lo +*.a +*.dirstamp +*.deps +*.la diff --git a/AUTHORS b/AUTHORS index 12b25ab717..43103dcd8f 100644 --- a/AUTHORS +++ b/AUTHORS @@ -14,6 +14,11 @@ to other modules. including lib/gslextras and the linear regression features. Jason is also an important contributor to GSL, which is used by PSPP. +* Rob van Son wrote the routine for calculation of the significance +of the Wilcoxon matched pairs signed rank statistic used by the + NPAR TEST command. + + We also thank past contributors: * John Williams wrote an initial draft of the T-TEST procedure. diff --git a/Makefile.am b/Makefile.am index ab32f99678..23305294af 100644 --- a/Makefile.am +++ b/Makefile.am @@ -32,6 +32,7 @@ EXTRA_DIST = OChangeLog ONEWS config.rpath pspp-mode.el CLEANFILES = ACLOCAL_AMFLAGS = -I m4 -I gl/m4 noinst_LIBRARIES= +noinst_LTLIBRARIES= noinst_PROGRAMS= check_PROGRAMS= bin_PROGRAMS= diff --git a/configure.ac b/configure.ac index f32434daec..b00a75b147 100644 --- a/configure.ac +++ b/configure.ac @@ -2,7 +2,7 @@ dnl Process this file with autoconf to produce a configure script. dnl Initialize. AC_PREREQ(2.60) -AC_INIT([pspp],[0.6.1],[bug-gnu-pspp@gnu.org]) +AC_INIT([pspp],[0.7.0],[bug-gnu-pspp@gnu.org]) AC_CONFIG_HEADERS([config.h]) AM_INIT_AUTOMAKE @@ -121,7 +121,7 @@ AM_CONDITIONAL(GNM_SUPPORT, test x"$gnm_support" = x"yes") AC_ARG_WITH( gui_tools, - [AS_HELP_STRING([--with-gui-tools], [build the gui developer tools])]) + [AS_HELP_STRING([--with-gui-tools], [build the gui developer tools. For DEVELOPERS only! There is no reason why users will need this flag.])]) if test x"$with_gui_tools" = x"yes" ; then PKG_CHECK_MODULES(GLADE_UI, gladeui-1.0) fi diff --git a/doc/statistics.texi b/doc/statistics.texi index 28398fb75e..89e5e83c56 100644 --- a/doc/statistics.texi +++ b/doc/statistics.texi @@ -14,6 +14,7 @@ far. * ONEWAY:: One way analysis of variance. * RANK:: Compute rank scores. * REGRESSION:: Linear regression. +* RELIABILITY:: Reliability analysis. @end menu @node DESCRIPTIVES @@ -232,7 +233,7 @@ EXAMINE /PLOT=@{BOXPLOT, NPPLOT, HISTOGRAM, ALL, NONE@} /CINTERVAL n /COMPARE=@{GROUPS,VARIABLES@} - /ID=@{case_number, var_name@} + /ID=var_name /@{TOTAL,NOTOTAL@} /PERCENTILE=[value_list]=@{HAVERAGE, WAVERAGE, ROUND, AEMPIRICAL, EMPIRICAL @} /MISSING=@{LISTWISE, PAIRWISE@} [@{EXCLUDE, INCLUDE@}] @@ -271,6 +272,12 @@ If /COMPARE=VARIABLES is specified, then one plot per factor is produced, each each containing one boxplot per dependent variable. If the /COMPARE subcommand is ommitted, then PSPP uses the default value of /COMPARE=GROUPS. + +The ID subcommand also pertains to boxplots. If given, it must +specify a variable name. Outliers and extreme cases plotted in +boxplots will be labelled with the case from that variable. Numeric or +string variables are permissible. If the ID subcommand is not given, +then the casenumber will be used for labelling. The CINTERVAL subcommand specifies the confidence interval to use in calculation of the descriptives command. The default it 95%. @@ -499,6 +506,8 @@ NPAR TESTS [ /STATISTICS=@{DESCRIPTIVES@} ] [ /MISSING=@{ANALYSIS, LISTWISE@} @{INCLUDE, EXCLUDE@} ] + + [ /METHOD=EXACT [ TIMER [(n)] ] ] @end display NPAR TESTS performs nonparametric tests. @@ -508,10 +517,21 @@ One or more tests may be specified by using the corresponding subcommand. If the /STATISTICS subcommand is also specified, then summary statistics are produces for each variable that is the subject of any test. +Certain tests may take a long time to execute, if an exact figure is required. +Therefore, by default asymptotic approximations are used unless the +subcommand /METHOD=EXACT is specified. +Exact tests give more accurate results, but may take an unacceptably long +time to perform. If the TIMER keyword is used, it sets a maximum time, +after which the test will be abandoned, and a warning message printed. +The time, in minutes, should be specified in parentheses after the TIMER keyword. +If the TIMER keyword is given without this figure, then a default value of 5 minutes +is used. + @menu * BINOMIAL:: Binomial Test * CHISQUARE:: Chisquare Test +* WILCOXON:: Wilcoxon Signed Ranks Test @end menu @@ -591,6 +611,34 @@ sum of the frequencies need not be 1. If no /EXPECTED subcommand is given, then then equal frequencies are expected. +@node WILCOXON +@subsection Wilcoxon +@comment node-name, next, previous, up +@vindex WILCOXON +@cindex wilcoxon matched pairs signed ranks test + +@display + [ /WILCOXON varlist [ WITH varlist [ (PAIRED) ]]] +@end display + +The wilcoxon subcommand tests for differences between means of the +variables listed. The test does not make any assumptions about the +variances of the samples. + +If the @code{WITH} keyword is omitted, then tests for all +combinations of the listed variables are performed. +If the @code{WITH} keyword is given, and the @code{(PAIRED)} keyword +is also given, then the number of variables preceding @code{WITH} +must be the same as the number following it. +In this case, tests for each respective pair of variables are +performed. +If the @code{WITH} keyword is given, but the +@code{(PAIRED)} keyword is omitted, then tests for each combination +of variable preceding @code{WITH} against variable following +@code{WITH} are performed. + +If the number of observations is large, and exact tests have been +requested. then the test may take a very long time to complete. @node T-TEST @comment node-name, next, previous, up @@ -831,3 +879,50 @@ user-missing are to be excluded from the rank scores. A setting of INCLUDE means they are to be included. The default is EXCLUDE. @include regression.texi + + +@node RELIABILITY +@section RELIABILITY + +@vindex RELIABILITY +@display +RELIABILITY + /VARIABLES=var_list + /SCALE (@var{name}) = @{var_list, ALL@} + /MODEL=@{ALPHA, SPLIT[(N)]@} + /SUMMARY=@{TOTAL,ALL@} + /MISSING=@{EXCLUDE,INCLUDE@} +@end display + +@cindex Cronbach's Alpha +The @cmd{RELIABILTY} command performs reliablity analysis on the data. + +The VARIABLES subcommand is required. It determines the set of variables +upon which analysis is to be performed. + +The SCALE subcommand determines which variables reliability is to be +calculated for. If it is omitted, then analysis for all variables named +in the VARIABLES subcommand will be used. +Optionally, the @var{name} parameter may be specified to set a string name +for the scale. + +The MODEL subcommand determines the type of analysis. If ALPHA is specified, +then Cronbach's Alpha is calculated for the scale. If the model is SPLIT, +then the variables are divided into 2 subsets. An optional parameter +@var{N} may be given, to specify how many variables to be in the first subset. +If @var{N} is omitted, then it defaults to one half of the variables in the +scale, or one half minus one if there are an odd number of variables. +The default model is ALPHA. + +By default, any cases with user missing, or system missing values for +any variables given +in the VARIABLES subcommand will be omitted from analysis. +The MISSING subcommand determines whether user missing values are to +be included or excluded in the analysis. + +The SUMMARY subcommand determines the type of summary analysis to be performed. +Currently there is only one type: SUMMARY=TOTAL, which displays per-item +analysis tested against the totals. + + + diff --git a/doc/transformation.texi b/doc/transformation.texi index 27bbb2da8d..2a52ad17f7 100644 --- a/doc/transformation.texi +++ b/doc/transformation.texi @@ -83,7 +83,7 @@ list. Each set must have exactly as many source variables as aggregation variables. Each aggregation variable receives the results of applying the specified aggregation function to the corresponding source -variable. The MEAN, SD, and SUM aggregation functions may only be +variable. The MEAN, MEDIAN, SD, and SUM aggregation functions may only be applied to numeric variables. All the rest may be applied to numeric and short and long string variables. @@ -128,6 +128,9 @@ dictionary information from the source variable. Arithmetic mean. Limited to numeric values. The default format is F8.2. +@item MEDIAN(var_name) +The median value. Limited to numeric values. The default format is F8.2. + @item MIN(var_name) Minimum value. The aggregation variable receives the complete dictionary information from the source variable. diff --git a/lib/automake.mk b/lib/automake.mk index 6410988b8b..9197da9742 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -1,6 +1,7 @@ ## Process this file with automake to produce Makefile.in -*- makefile -*- include $(top_srcdir)/lib/linreg/automake.mk +include $(top_srcdir)/lib/misc/automake.mk if WITHGUI include $(top_srcdir)/lib/gtksheet/automake.mk diff --git a/lib/gtksheet/automake.mk b/lib/gtksheet/automake.mk index babcb2aaaa..b28382602d 100644 --- a/lib/gtksheet/automake.mk +++ b/lib/gtksheet/automake.mk @@ -28,4 +28,5 @@ lib_gtksheet_libgtksheet_a_SOURCES = \ lib/gtksheet/gtksheet.c \ lib/gtksheet/gtksheet.h -EXTRA_DIST += lib/gtksheet/OChangeLog +EXTRA_DIST += lib/gtksheet/OChangeLog \ + lib/gtksheet/README diff --git a/lib/linreg/automake.mk b/lib/linreg/automake.mk index 650646c6ee..30fd2e592d 100644 --- a/lib/linreg/automake.mk +++ b/lib/linreg/automake.mk @@ -1,8 +1,8 @@ ## Process this file with automake to produce Makefile.in -*- makefile -*- -noinst_LIBRARIES += lib/linreg/liblinreg.a +noinst_LTLIBRARIES += lib/linreg/liblinreg.la -lib_linreg_liblinreg_a_SOURCES = \ +lib_linreg_liblinreg_la_SOURCES = \ lib/linreg/sweep.c lib/linreg/sweep.h EXTRA_DIST += lib/linreg/OChangeLog diff --git a/lib/misc/README b/lib/misc/README new file mode 100644 index 0000000000..68ba91ad38 --- /dev/null +++ b/lib/misc/README @@ -0,0 +1,2 @@ +This is not part of the GNU PSPP program, but is used with GNU PSPP. + diff --git a/lib/misc/automake.mk b/lib/misc/automake.mk new file mode 100644 index 0000000000..3b4f1a939f --- /dev/null +++ b/lib/misc/automake.mk @@ -0,0 +1,8 @@ +## Process this file with automake to produce Makefile.in -*- makefile -*- + +noinst_LTLIBRARIES += lib/misc/libmisc.la + +lib_misc_libmisc_la_SOURCES = \ + lib/misc/wx-mp-sr.c lib/misc/wx-mp-sr.h + +EXTRA_DIST += lib/misc/README diff --git a/lib/misc/wx-mp-sr.c b/lib/misc/wx-mp-sr.c new file mode 100644 index 0000000000..39e83b9787 --- /dev/null +++ b/lib/misc/wx-mp-sr.c @@ -0,0 +1,101 @@ +#include +#include "wx-mp-sr.h" + +/********************************************************************* +* +* Calculate the exact level of significance for a +* Wilcoxon Matched-Pair Signed-Ranks Test using the sample's +* Sum of Ranks W and the sample size (i.e., number of pairs) N. +* This whole routine can be run as a stand-alone program. +* +* Use: +* WX-MP-SR W N +* +* Copyright 1996, Rob van Son +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License along +* with this program; if not, write to the Free Software Foundation, Inc., +* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +* +* ------------------------------------------------------- +* Rob van Son +* Institute of Phonetic Sciences & IFOTT +* University of Amsterdam, Spuistraat 210 +* NL-1012VT Amsterdam, The Netherlands +* Tel.: (+31) 205252196 Fax.: (+31) 205252197 +* Email: r.j.j.h.vanson@uva.nl +* WWW page: http://www.fon.hum.uva.nl/rob +* ------------------------------------------------------- +* +* This is the actual routine that calculates the exact (two-tailed) +* level of significance for the Wilcoxon Matched-Pairs Signed-Ranks +* test. The inputs are the Sum of Ranks of either the positive of +* negative samples (W) and the sample size (N). +* The Level of significance is calculated by checking for each +* possible outcome (2**N possibilities) whether the sum of ranks +* is larger than or equal to the observed Sum of Ranks (W). +* +* NOTE: The execution-time scales like ~ N*2**N, i.e., N*pow(2, N), +* which is more than exponential. Adding a single pair to the sample +* (i.e., increase N by 1) will more than double the time needed to +* complete the calculations (apart from an additive constant). +* The execution-time of this program can easily outrun your +* patience. +* +***********************************************************************/ + +double LevelOfSignificanceWXMPSR(double Winput, long int N) +{ + unsigned long int W, MaximalW, NumberOfPossibilities, CountLarger; + unsigned long int i, RankSum, j; + double p; + + /* Determine Wmax, i.e., work with the largest Rank Sum */ + MaximalW = N*(N+1)/2; + if(Winput < MaximalW/2)Winput = MaximalW - Winput; + W = Winput; /* Convert to long int */ + if(W != Winput)++W; /* Increase to next full integer */ + + /* The total number of possible outcomes is 2**N */ + NumberOfPossibilities = 1 << N; + + /* Initialize and loop. The loop-interior will be run 2**N times. */ + CountLarger = 0; + /* Generate all distributions of sign over ranks as bit-patterns (i). */ + for(i=0; i < NumberOfPossibilities; ++i) + { + RankSum = 0; + /* + Shift "sign" bits out of i to determine the Sum of Ranks (j). + */ + for(j=0; j < N; ++j) + { + if((i >> j) & 1)RankSum += j + 1; + }; + /* + * Count the number of "samples" that have a Sum of Ranks larger than + * or equal to the one found (i.e., >= W). + */ + if(RankSum >= W)++CountLarger; + }; + /***************************************************************** + * The level of significance is the number of outcomes with a + * sum of ranks equal to or larger than the one found (W) + * divided by the total number of possible outcomes. + * The level is doubled to get the two-tailed result. + ******************************************************************/ + p = 2*((double)CountLarger) / ((double)NumberOfPossibilities); + + return p; +} + diff --git a/lib/misc/wx-mp-sr.h b/lib/misc/wx-mp-sr.h new file mode 100644 index 0000000000..ec3ef5ed3f --- /dev/null +++ b/lib/misc/wx-mp-sr.h @@ -0,0 +1,6 @@ +#ifndef WX_MP_SR +#define WX_MP_SR 1 + +double LevelOfSignificanceWXMPSR(double Winput, long int N); + +#endif diff --git a/po/en_GB.po b/po/en_GB.po index 2c3bc865a7..0c13bd309b 100644 --- a/po/en_GB.po +++ b/po/en_GB.po @@ -7,7 +7,7 @@ msgid "" msgstr "" "Project-Id-Version: PSPP 0.4.3\n" "Report-Msgid-Bugs-To: pspp-dev@gnu.org\n" -"POT-Creation-Date: 2008-08-23 08:35+0800\n" +"POT-Creation-Date: 2008-09-08 18:04+0800\n" "PO-Revision-Date: 2007-09-15 08:29+0800\n" "Last-Translator: John Darrington \n" "Language-Team: John Darrington \n" @@ -1872,8 +1872,8 @@ msgstr "" #: src/language/dictionary/sys-file-info.c:563 #: src/language/stats/crosstabs.q:1155 src/language/stats/crosstabs.q:1182 #: src/language/stats/crosstabs.q:1202 src/language/stats/crosstabs.q:1224 -#: src/language/stats/examine.q:1198 src/language/stats/frequencies.q:1060 -#: src/language/stats/frequencies.q:1184 +#: src/language/stats/examine.q:1948 src/language/stats/frequencies.q:1055 +#: src/language/stats/frequencies.q:1179 msgid "Value" msgstr "" @@ -2635,7 +2635,7 @@ msgstr "" #: src/language/stats/binomial.c:203 src/language/stats/chisquare.c:223 #: src/language/stats/chisquare.c:283 src/language/stats/crosstabs.q:862 #: src/language/stats/crosstabs.q:1062 src/language/stats/crosstabs.q:1785 -#: src/language/stats/examine.q:918 src/language/stats/frequencies.q:1137 +#: src/language/stats/examine.q:1207 src/language/stats/frequencies.q:1132 #: src/language/stats/oneway.q:306 src/language/stats/oneway.q:476 #: src/language/stats/regression.q:309 src/ui/gui/crosstabs-dialog.c:59 msgid "Total" @@ -2647,7 +2647,7 @@ msgid "Category" msgstr "" #: src/language/stats/binomial.c:236 src/language/stats/crosstabs.q:872 -#: src/language/stats/examine.q:993 src/language/stats/frequencies.q:1405 +#: src/language/stats/examine.q:1280 src/language/stats/frequencies.q:1400 #: src/language/stats/npar-summary.c:122 src/language/stats/oneway.q:391 #: src/language/stats/t-test.q:693 src/language/stats/t-test.q:716 #: src/language/stats/t-test.q:850 src/language/stats/t-test.q:1387 @@ -2740,24 +2740,24 @@ msgstr "" msgid "Summary." msgstr "" -#: src/language/stats/crosstabs.q:859 src/language/stats/examine.q:981 +#: src/language/stats/crosstabs.q:859 src/language/stats/examine.q:1268 msgid "Cases" msgstr "" -#: src/language/stats/crosstabs.q:860 src/language/stats/examine.q:916 -#: src/language/stats/frequencies.q:1058 src/language/stats/frequencies.q:1406 +#: src/language/stats/crosstabs.q:860 src/language/stats/examine.q:1205 +#: src/language/stats/frequencies.q:1053 src/language/stats/frequencies.q:1401 msgid "Valid" msgstr "" -#: src/language/stats/crosstabs.q:861 src/language/stats/examine.q:917 -#: src/language/stats/frequencies.q:1128 src/language/stats/frequencies.q:1407 +#: src/language/stats/crosstabs.q:861 src/language/stats/examine.q:1206 +#: src/language/stats/frequencies.q:1123 src/language/stats/frequencies.q:1402 #: src/ui/gui/psppire-var-sheet.c:106 msgid "Missing" msgstr "" -#: src/language/stats/crosstabs.q:873 src/language/stats/examine.q:996 -#: src/language/stats/frequencies.q:1062 src/language/stats/frequencies.q:1063 -#: src/language/stats/frequencies.q:1064 +#: src/language/stats/crosstabs.q:873 src/language/stats/examine.q:1283 +#: src/language/stats/frequencies.q:1057 src/language/stats/frequencies.q:1058 +#: src/language/stats/frequencies.q:1059 msgid "Percent" msgstr "" @@ -2799,7 +2799,7 @@ msgstr "" #: src/language/stats/crosstabs.q:1154 src/language/stats/crosstabs.q:1181 #: src/language/stats/crosstabs.q:1201 src/language/stats/crosstabs.q:1222 -#: src/language/stats/examine.q:1442 src/ui/gui/checkbox-treeview.c:94 +#: src/language/stats/examine.q:1745 src/ui/gui/checkbox-treeview.c:94 msgid "Statistic" msgstr "" @@ -2989,7 +2989,7 @@ msgstr "" msgid "%s Dependent" msgstr "" -#: src/language/stats/descriptives.c:102 src/language/stats/examine.q:1556 +#: src/language/stats/descriptives.c:102 src/language/stats/examine.q:1550 #: src/language/stats/frequencies.q:123 src/language/stats/npar-summary.c:125 #: src/language/stats/oneway.q:392 src/language/stats/t-test.q:694 #: src/language/stats/t-test.q:717 src/language/stats/t-test.q:849 @@ -3006,13 +3006,13 @@ msgstr "" msgid "Std Dev" msgstr "" -#: src/language/stats/descriptives.c:105 src/language/stats/examine.q:1636 +#: src/language/stats/descriptives.c:105 src/language/stats/examine.q:1581 #: src/language/stats/frequencies.q:128 src/ui/gui/descriptives-dialog.c:46 #: src/ui/gui/frequencies-dialog.c:45 msgid "Variance" msgstr "" -#: src/language/stats/descriptives.c:106 src/language/stats/examine.q:1743 +#: src/language/stats/descriptives.c:106 src/language/stats/examine.q:1617 #: src/language/stats/frequencies.q:129 src/ui/gui/descriptives-dialog.c:47 #: src/ui/gui/frequencies-dialog.c:50 msgid "Kurtosis" @@ -3022,7 +3022,7 @@ msgstr "" msgid "S E Kurt" msgstr "" -#: src/language/stats/descriptives.c:108 src/language/stats/examine.q:1723 +#: src/language/stats/descriptives.c:108 src/language/stats/examine.q:1612 #: src/language/stats/frequencies.q:131 src/ui/gui/descriptives-dialog.c:48 #: src/ui/gui/frequencies-dialog.c:46 msgid "Skewness" @@ -3032,20 +3032,20 @@ msgstr "" msgid "S E Skew" msgstr "" -#: src/language/stats/descriptives.c:110 src/language/stats/examine.q:1684 +#: src/language/stats/descriptives.c:110 src/language/stats/examine.q:1601 #: src/language/stats/frequencies.q:133 src/ui/gui/descriptives-dialog.c:43 #: src/ui/gui/frequencies-dialog.c:48 msgid "Range" msgstr "" -#: src/language/stats/descriptives.c:111 src/language/stats/examine.q:1661 +#: src/language/stats/descriptives.c:111 src/language/stats/examine.q:1591 #: src/language/stats/frequencies.q:134 src/language/stats/npar-summary.c:131 #: src/language/stats/oneway.q:404 src/ui/gui/descriptives-dialog.c:41 #: src/ui/gui/frequencies-dialog.c:42 msgid "Minimum" msgstr "" -#: src/language/stats/descriptives.c:112 src/language/stats/examine.q:1672 +#: src/language/stats/descriptives.c:112 src/language/stats/examine.q:1596 #: src/language/stats/frequencies.q:135 src/language/stats/npar-summary.c:134 #: src/language/stats/oneway.q:405 src/ui/gui/descriptives-dialog.c:42 #: src/ui/gui/frequencies-dialog.c:43 @@ -3102,119 +3102,132 @@ msgstr "" msgid "Valid cases = %g; cases with missing value(s) = %g." msgstr "" -#: src/language/stats/examine.q:288 src/language/stats/examine.q:291 -#, c-format -msgid "%s is not currently supported." +#: src/language/stats/examine.q:337 src/language/stats/examine.q:490 +#: src/language/stats/examine.q:1047 +msgid "Not creating plot because data set is empty." msgstr "" -#: src/language/stats/examine.q:501 src/language/stats/examine.q:514 +#: src/language/stats/examine.q:347 #, c-format -msgid "%s and %s are mutually exclusive" +msgid "Normal Q-Q Plot of %s" msgstr "" -#: src/language/stats/examine.q:976 -msgid "Case Processing Summary" +#: src/language/stats/examine.q:348 src/language/stats/examine.q:353 +msgid "Observed Value" msgstr "" -#: src/language/stats/examine.q:1183 -msgid "Extreme Values" +#: src/language/stats/examine.q:349 +msgid "Expected Normal" msgstr "" -#: src/language/stats/examine.q:1199 -msgid "Case Number" +#: src/language/stats/examine.q:351 +#, c-format +msgid "Detrended Normal Q-Q Plot of %s" msgstr "" -#: src/language/stats/examine.q:1297 -msgid "Highest" +#: src/language/stats/examine.q:354 +msgid "Dev from Normal" msgstr "" -#: src/language/stats/examine.q:1302 -msgid "Lowest" +#: src/language/stats/examine.q:507 +#, c-format +msgid "Boxplot of %s vs. %s" msgstr "" -#: src/language/stats/examine.q:1443 src/language/stats/oneway.q:394 -#: src/language/stats/oneway.q:692 src/language/stats/regression.q:203 -msgid "Std. Error" +#: src/language/stats/examine.q:511 +#, c-format +msgid "Boxplot of %s" msgstr "" -#: src/language/stats/examine.q:1445 src/language/stats/oneway.q:408 -#: src/ui/gui/examine.glade:307 -msgid "Descriptives" +#: src/language/stats/examine.q:747 src/language/stats/examine.q:760 +#, c-format +msgid "%s and %s are mutually exclusive" +msgstr "" + +#: src/language/stats/examine.q:1263 +msgid "Case Processing Summary" msgstr "" -#: src/language/stats/examine.q:1574 src/language/stats/oneway.q:399 +#: src/language/stats/examine.q:1555 src/language/stats/oneway.q:399 #, c-format msgid "%g%% Confidence Interval for Mean" msgstr "" -#: src/language/stats/examine.q:1580 src/language/stats/oneway.q:401 +#: src/language/stats/examine.q:1561 src/language/stats/oneway.q:401 msgid "Lower Bound" msgstr "" -#: src/language/stats/examine.q:1591 src/language/stats/oneway.q:402 +#: src/language/stats/examine.q:1566 src/language/stats/oneway.q:402 msgid "Upper Bound" msgstr "" -#: src/language/stats/examine.q:1603 +#: src/language/stats/examine.q:1571 #, c-format msgid "5%% Trimmed Mean" msgstr "" -#: src/language/stats/examine.q:1614 src/language/stats/frequencies.q:125 +#: src/language/stats/examine.q:1576 src/language/stats/frequencies.q:125 #: src/ui/gui/frequencies-dialog.c:52 msgid "Median" msgstr "" -#: src/language/stats/examine.q:1648 src/language/stats/npar-summary.c:128 +#: src/language/stats/examine.q:1586 src/language/stats/npar-summary.c:128 #: src/language/stats/oneway.q:393 src/language/stats/t-test.q:695 #: src/language/stats/t-test.q:718 src/language/stats/t-test.q:851 #: src/language/stats/t-test.q:1188 msgid "Std. Deviation" msgstr "" -#: src/language/stats/examine.q:1696 +#: src/language/stats/examine.q:1606 msgid "Interquartile Range" msgstr "" -#: src/language/stats/examine.q:1850 -#, c-format -msgid "Boxplot of %s vs. %s" +#: src/language/stats/examine.q:1742 src/language/stats/oneway.q:408 +#: src/ui/gui/examine.glade:307 +msgid "Descriptives" msgstr "" -#: src/language/stats/examine.q:1877 -msgid "Boxplot" +#: src/language/stats/examine.q:1748 src/language/stats/oneway.q:394 +#: src/language/stats/oneway.q:692 src/language/stats/regression.q:203 +msgid "Std. Error" msgstr "" -#: src/language/stats/examine.q:1919 +#: src/language/stats/examine.q:1845 src/language/stats/examine.q:1850 +#: src/ui/gui/psppire-var-store.c:675 src/ui/gui/psppire-var-store.c:685 +#: src/ui/gui/psppire-var-store.c:695 #, c-format -msgid "Normal Q-Q Plot of %s" +msgid "%d" msgstr "" -#: src/language/stats/examine.q:1920 src/language/stats/examine.q:1926 -msgid "Observed Value" +#: src/language/stats/examine.q:1928 +msgid "Highest" msgstr "" -#: src/language/stats/examine.q:1921 -msgid "Expected Normal" +#: src/language/stats/examine.q:1933 +msgid "Lowest" msgstr "" -#: src/language/stats/examine.q:1924 -#, c-format -msgid "Detrended Normal Q-Q Plot of %s" +#: src/language/stats/examine.q:1940 +msgid "Extreme Values" msgstr "" -#: src/language/stats/examine.q:1927 -msgid "Dev from Normal" +#: src/language/stats/examine.q:1944 +msgid "Case Number" msgstr "" -#: src/language/stats/examine.q:2046 src/language/stats/examine.q:2068 -#: src/language/stats/frequencies.q:1417 src/language/stats/npar-summary.c:141 +#: src/language/stats/examine.q:2066 +msgid "Tukey's Hinges" +msgstr "" + +#: src/language/stats/examine.q:2106 src/language/stats/examine.q:2124 +#: src/language/stats/frequencies.q:1412 src/language/stats/npar-summary.c:141 #: src/ui/gui/examine.glade:328 msgid "Percentiles" msgstr "" -#: src/language/stats/examine.q:2204 -msgid "Tukey's Hinges" +#: src/language/stats/examine.q:2113 +#, c-format +msgid "%g" msgstr "" #: src/language/stats/flip.c:96 @@ -3317,52 +3330,52 @@ msgid "" "MIN was specified as %g and MAX as %g. MIN and MAX will be ignored." msgstr "" -#: src/language/stats/frequencies.q:759 +#: src/language/stats/frequencies.q:754 #, c-format msgid "Variable %s specified multiple times on VARIABLES subcommand." msgstr "" -#: src/language/stats/frequencies.q:822 +#: src/language/stats/frequencies.q:817 msgid "`)' expected after GROUPED interval list." msgstr "" -#: src/language/stats/frequencies.q:834 +#: src/language/stats/frequencies.q:829 #, c-format msgid "Variables %s specified on GROUPED but not on VARIABLES." msgstr "" -#: src/language/stats/frequencies.q:841 +#: src/language/stats/frequencies.q:836 #, c-format msgid "Variables %s specified multiple times on GROUPED subcommand." msgstr "" -#: src/language/stats/frequencies.q:1059 src/language/stats/frequencies.q:1152 -#: src/language/stats/frequencies.q:1153 src/language/stats/frequencies.q:1187 +#: src/language/stats/frequencies.q:1054 src/language/stats/frequencies.q:1147 +#: src/language/stats/frequencies.q:1148 src/language/stats/frequencies.q:1182 msgid "Cum" msgstr "" -#: src/language/stats/frequencies.q:1061 src/output/charts/plot-hist.c:126 +#: src/language/stats/frequencies.q:1056 src/output/charts/plot-hist.c:138 msgid "Frequency" msgstr "" -#: src/language/stats/frequencies.q:1082 +#: src/language/stats/frequencies.q:1077 msgid "Value Label" msgstr "" -#: src/language/stats/frequencies.q:1185 +#: src/language/stats/frequencies.q:1180 msgid "Freq" msgstr "" -#: src/language/stats/frequencies.q:1186 src/language/stats/frequencies.q:1188 +#: src/language/stats/frequencies.q:1181 src/language/stats/frequencies.q:1183 msgid "Pct" msgstr "" -#: src/language/stats/frequencies.q:1379 +#: src/language/stats/frequencies.q:1374 #, c-format msgid "No valid data for variable %s; statistics not displayed." msgstr "" -#: src/language/stats/frequencies.q:1421 +#: src/language/stats/frequencies.q:1416 msgid "50 (Median)" msgstr "" @@ -3386,20 +3399,20 @@ msgstr "" msgid "NPAR subcommand not currently implemented." msgstr "" -#: src/language/stats/npar.q:236 +#: src/language/stats/npar.q:237 #, c-format msgid "" "The specified value of HI (%d) is lower than the specified value of LO (%d)" msgstr "" -#: src/language/stats/npar.q:291 +#: src/language/stats/npar.q:292 #, c-format msgid "" "%d expected values were given, but the specified range (%d-%d) requires " "exactly %d values." msgstr "" -#: src/language/stats/npar.q:425 src/language/stats/t-test.q:496 +#: src/language/stats/npar.q:426 src/language/stats/t-test.q:496 #, c-format msgid "" "PAIRED was specified but the number of variables preceding WITH (%zu) did " @@ -4117,23 +4130,23 @@ msgstr "" msgid "hash table:" msgstr "" -#: src/math/percentiles.c:41 +#: src/math/percentiles.c:35 msgid "HAverage" msgstr "" -#: src/math/percentiles.c:42 +#: src/math/percentiles.c:36 msgid "Weighted Average" msgstr "" -#: src/math/percentiles.c:43 +#: src/math/percentiles.c:37 msgid "Rounded" msgstr "" -#: src/math/percentiles.c:44 +#: src/math/percentiles.c:38 msgid "Empirical" msgstr "" -#: src/math/percentiles.c:45 +#: src/math/percentiles.c:39 msgid "Empirical with averaging" msgstr "" @@ -4283,7 +4296,7 @@ msgstr "" msgid "creating \"%s\"" msgstr "" -#: src/output/charts/plot-hist.c:124 +#: src/output/charts/plot-hist.c:136 msgid "HISTOGRAM" msgstr "" @@ -5856,12 +5869,6 @@ msgstr "" msgid "Custom" msgstr "" -#: src/ui/gui/psppire-var-store.c:675 src/ui/gui/psppire-var-store.c:685 -#: src/ui/gui/psppire-var-store.c:695 -#, c-format -msgid "%d" -msgstr "" - #: src/ui/gui/rank.glade:111 msgid "By:" msgstr "" diff --git a/src/automake.mk b/src/automake.mk index 1ffbdf3e24..56f7590410 100644 --- a/src/automake.mk +++ b/src/automake.mk @@ -2,14 +2,42 @@ # PSPP -include $(top_srcdir)/src/math/automake.mk include $(top_srcdir)/src/libpspp/automake.mk include $(top_srcdir)/src/data/automake.mk + + + +AM_CPPFLAGS += -I$(top_srcdir)/src -I$(top_srcdir)/lib -DPKGDATADIR=\"$(pkgdatadir)\" + + +lib_LTLIBRARIES = src/libpspp-core.la src/libpspp.la +src_libpspp_core_la_SOURCES = + + +src_libpspp_core_la_LDFLAGS = -release @VERSION@ + +src_libpspp_core_la_LIBADD = \ + src/data/libdata.la \ + src/libpspp/libpspp.la \ + $(LIBXML2_LIBS) $(PG_LIBS) \ + gl/libgl.la + +src_libpspp_la_SOURCES = + +src_libpspp_la_LDFLAGS = -release @VERSION@ + +src_libpspp_la_LIBADD = \ + src/language/liblanguage.la \ + src/math/libpspp-math.la \ + src/output/liboutput.la \ + gl/libgl.la + + +include $(top_srcdir)/src/math/automake.mk include $(top_srcdir)/src/output/automake.mk include $(top_srcdir)/src/language/automake.mk - include $(top_srcdir)/src/ui/automake.mk -AM_CPPFLAGS += -I$(top_srcdir)/src -I$(top_srcdir)/lib -DPKGDATADIR=\"$(pkgdatadir)\" + EXTRA_DIST += src/OChangeLog diff --git a/src/data/automake.mk b/src/data/automake.mk index a6d1858148..5f25c1d25c 100644 --- a/src/data/automake.mk +++ b/src/data/automake.mk @@ -1,10 +1,11 @@ -noinst_LIBRARIES += src/data/libdata.a +noinst_LTLIBRARIES += src/data/libdata.la -src_data_libdata_a_CPPFLAGS = $(LIBXML2_CFLAGS) $(PG_CFLAGS) $(AM_CPPFLAGS) +src_data_libdata_la_CPPFLAGS = $(LIBXML2_CFLAGS) $(PG_CFLAGS) $(AM_CPPFLAGS) +src_data_libdata_la_LIBADD = $(LIBXML2_LIBS) $(PG_LIBS) -src_data_libdata_a_SOURCES = \ +src_data_libdata_la_SOURCES = \ src/data/any-reader.c \ src/data/any-reader.h \ src/data/any-writer.c \ diff --git a/src/data/casereader-filter.c b/src/data/casereader-filter.c index 37e1dc85c8..fcd264e9dc 100644 --- a/src/data/casereader-filter.c +++ b/src/data/casereader-filter.c @@ -245,6 +245,7 @@ struct casereader_filter_missing struct variable **vars; /* Variables whose values to filter. */ size_t var_cnt; /* Number of variables. */ enum mv_class class; /* Types of missing values to filter. */ + casenumber *n_missing; }; static bool casereader_filter_missing_include (const struct ccase *, void *); @@ -264,6 +265,9 @@ static bool casereader_filter_missing_destroy (void *); read or, if that never occurs, until the filtering casereader is destroyed. + If N_MISSING is non-null, then after reading, it will be filled + with the totla number of dropped cases. + After this function is called, READER must not ever again be referenced directly. It will be destroyed automatically when the filtering casereader is destroyed. */ @@ -271,6 +275,7 @@ struct casereader * casereader_create_filter_missing (struct casereader *reader, const struct variable **vars, size_t var_cnt, enum mv_class class, + casenumber *n_missing, struct casewriter *exclude) { if (var_cnt > 0 && class != MV_NEVER) @@ -279,6 +284,8 @@ casereader_create_filter_missing (struct casereader *reader, cfm->vars = xmemdup (vars, sizeof *vars * var_cnt); cfm->var_cnt = var_cnt; cfm->class = class; + cfm->n_missing = n_missing; + if (n_missing) *n_missing = 0; return casereader_create_filter_func (reader, casereader_filter_missing_include, casereader_filter_missing_destroy, @@ -302,7 +309,11 @@ casereader_filter_missing_include (const struct ccase *c, void *cfm_) struct variable *var = cfm->vars[i]; const union value *value = case_data (c, var); if (var_is_value_missing (var, value, cfm->class)) - return false; + { + if ( cfm->n_missing ) + (*cfm->n_missing)++; + return false; + } } return true; } diff --git a/src/data/casereader-translator.c b/src/data/casereader-translator.c index 229dac2e54..ae22f1297e 100644 --- a/src/data/casereader-translator.c +++ b/src/data/casereader-translator.c @@ -15,9 +15,8 @@ along with this program. If not, see . */ #include - +#include #include - #include #include @@ -110,18 +109,86 @@ static const struct casereader_class casereader_translator_class = NULL, NULL, }; + -struct casereader_arithmetic_sequence - { - int value_ofs; - double first; - double increment; - casenumber n; - }; -static void cas_translate (struct ccase *input, struct ccase *output, - void *aux); -static bool cas_destroy (void *aux); +struct casereader_append_numeric +{ + int value_ofs; + casenumber n; + new_value_func *func; + void *aux; + void (*destroy) (void *aux); +}; + +static bool can_destroy (void *can_); + +static void can_translate (struct ccase *input, struct ccase *output, + void *can_); + +/* Creates and returns a new casereader whose cases are produced + by reading from SUBREADER and appending an additional value, + generated by FUNC. AUX is an optional parameter which + gets passed to FUNC. FUNC will also receive N as it, which is + the ordinal number of the case in the reader. DESTROY is an + optional parameter used to destroy AUX. + + After this function is called, SUBREADER must not ever again + be referenced directly. It will be destroyed automatically + when the translating casereader is destroyed. */ +struct casereader * +casereader_create_append_numeric (struct casereader *subreader, + new_value_func func, void *aux, + void (*destroy) (void *aux)) +{ + struct casereader_append_numeric *can = xmalloc (sizeof *can); + can->value_ofs = casereader_get_value_cnt (subreader); + can->n = 0; + can->aux = aux; + can->func = func; + can->destroy = destroy; + return casereader_create_translator (subreader, can->value_ofs + 1, + can_translate, can_destroy, can); +} + + +static void +can_translate (struct ccase *input, struct ccase *output, void *can_) +{ + struct casereader_append_numeric *can = can_; + double new_value = can->func (input, can->n++, can->aux); + case_nullify (output); + case_move (output, input); + case_resize (output, can->value_ofs + 1); + case_data_rw_idx (output, can->value_ofs)->f = new_value; +} + +static bool +can_destroy (void *can_) +{ + struct casereader_append_numeric *can = can_; + if (can->destroy) + can->destroy (can->aux); + free (can); + return true; +} + + + +struct arithmetic_sequence +{ + double first; + double increment; +}; + +static double +next_arithmetic (const struct ccase *c UNUSED, + casenumber n, + void *aux) +{ + struct arithmetic_sequence *as = aux; + return n * as->increment + as->first; +} /* Creates and returns a new casereader whose cases are produced by reading from SUBREADER and appending an additional value, @@ -136,32 +203,163 @@ struct casereader * casereader_create_arithmetic_sequence (struct casereader *subreader, double first, double increment) { - /* This could be implemented with a great deal more efficiency - and generality. However, this implementation is easy. */ - struct casereader_arithmetic_sequence *cas = xmalloc (sizeof *cas); - cas->value_ofs = casereader_get_value_cnt (subreader); - cas->first = first; - cas->increment = increment; - cas->n = 0; - return casereader_create_translator (subreader, cas->value_ofs + 1, - cas_translate, cas_destroy, cas); + struct arithmetic_sequence *as = xzalloc (sizeof *as); + as->first = first; + as->increment = increment; + return casereader_create_append_numeric (subreader, next_arithmetic, + as, free); } -static void -cas_translate (struct ccase *input, struct ccase *output, void *cas_) + + + +struct casereader_append_rank { - struct casereader_arithmetic_sequence *cas = cas_; - case_nullify (output); - case_move (output, input); - case_resize (output, cas->value_ofs + 1); - case_data_rw_idx (output, cas->value_ofs)->f - = cas->first + cas->increment * cas->n++; + struct casereader *clone; + casenumber n; + const struct variable *var; + const struct variable *weight; + int value_ofs; + casenumber n_common; + double mean_rank; + double cc; + distinct_func *distinct; + void *aux; + enum rank_error *err; + double prev_value; +}; + +static bool car_destroy (void *car_); + +static void car_translate (struct ccase *input, struct ccase *output, + void *car_); + +/* Creates and returns a new casereader whose cases are produced + by reading from SUBREADER and appending an additional value, + which is the rank of the observation. W is the weight variable + of the dictionary containing V, or NULL if there is no weight + variable. + + The following preconditions must be met: + + 1. SUBREADER must be sorted on V. + + 2. The weight variables, must be non-negative. + + If either of these preconditions are not satisfied, then the rank + variables may not be correct. In this case, if ERR is non-null, + it will be set according to the erroneous conditions encountered. + + If DISTINCT_CALLBACK is non-null, then it will be called exactly + once for every case containing a distinct value of V. AUX is + an auxilliary pointer passed to DISTINCT_CALLBACK. + + After this function is called, SUBREADER must not ever again + be referenced directly. It will be destroyed automatically + when the translating casereader is destroyed. */ +struct casereader * +casereader_create_append_rank (struct casereader *subreader, + const struct variable *v, + const struct variable *w, + enum rank_error *err, + distinct_func *distinct_callback, + void *aux + ) +{ + struct casereader_append_rank *car = xmalloc (sizeof *car); + car->value_ofs = casereader_get_value_cnt (subreader); + car->weight = w; + car->var = v; + car->n = 0; + car->n_common = 1; + car->cc = 0.0; + car->clone = casereader_clone (subreader); + car->distinct = distinct_callback; + car->aux = aux; + car->err = err; + car->prev_value = SYSMIS; + + return casereader_create_translator (subreader, car->value_ofs + 1, + car_translate, car_destroy, car); } + static bool -cas_destroy (void *cas_) +car_destroy (void *car_) { - struct casereader_arithmetic_sequence *cas = cas_; - free (cas); + struct casereader_append_rank *car = car_; + casereader_destroy (car->clone); + free (car); return true; } + + +static void +car_translate (struct ccase *input, struct ccase *output, void *car_) +{ + struct casereader_append_rank *car = car_; + + const double value = case_data (input, car->var)->f; + + if ( car->prev_value != SYSMIS) + { + if (car->err && value < car->prev_value) + *car->err |= RANK_ERR_UNSORTED; + } + + if ( car->n_common == 1) + { + double vxx = SYSMIS; + casenumber k = 0; + double weight = 1.0; + if (car->weight) + { + weight = case_data (input, car->weight)->f; + if ( car->err && weight < 0 ) + *car->err |= RANK_ERR_NEGATIVE_WEIGHT; + } + + do + { + struct ccase c; + if ( ! casereader_peek (car->clone, car->n + ++k, &c)) + break; + vxx = case_data (&c, car->var)->f; + + if ( vxx == value) + { + if (car->weight) + { + double w = case_data (&c, car->weight)->f; + + if ( car->err && w < 0 ) + *car->err |= RANK_ERR_NEGATIVE_WEIGHT; + + weight += w; + } + else + weight += 1.0; + car->n_common++; + } + case_destroy (&c); + } + while (vxx == value); + car->mean_rank = car->cc + (weight + 1) / 2.0; + car->cc += weight; + + if (car->distinct) + car->distinct (value, car->n_common, weight, car->aux); + } + else + car->n_common--; + + car->n++; + + case_nullify (output); + case_move (output, input); + case_resize (output, car->value_ofs + 1); + case_data_rw_idx (output, car->value_ofs)->f = car->mean_rank ; + car->prev_value = value; +} + + diff --git a/src/data/casereader.h b/src/data/casereader.h index ba65cb18e7..3df80cb08f 100644 --- a/src/data/casereader.h +++ b/src/data/casereader.h @@ -98,6 +98,7 @@ struct casereader * casereader_create_filter_missing (struct casereader *, const struct variable **vars, size_t var_cnt, enum mv_class, + casenumber *n_missing, struct casewriter *exclude); struct casereader * @@ -112,8 +113,33 @@ casereader_create_translator (struct casereader *, size_t output_value_cnt, bool (*destroy) (void *aux), void *aux); +/* A function which creates a numberic value from an existing case */ +typedef double new_value_func (const struct ccase *, casenumber, void *); + +struct casereader * +casereader_create_append_numeric (struct casereader *subreader, + new_value_func func, void *aux, + void (*destroy) (void *aux)); + struct casereader * casereader_create_arithmetic_sequence (struct casereader *, double first, double increment); +enum rank_error + { + RANK_ERR_NONE = 0, + RANK_ERR_NEGATIVE_WEIGHT = 0x01, + RANK_ERR_UNSORTED = 0x02 + }; + + +typedef void distinct_func (double v, casenumber n, double w, void *aux); + +struct casereader * +casereader_create_append_rank (struct casereader *, + const struct variable *v, const struct variable *w, + enum rank_error *err, + distinct_func *distinct_callback, void *aux); + + #endif /* data/casereader.h */ diff --git a/src/data/casewriter.c b/src/data/casewriter.c index a30e50e209..4461d85e84 100644 --- a/src/data/casewriter.c +++ b/src/data/casewriter.c @@ -91,8 +91,7 @@ casewriter_get_value_cnt (const struct casewriter *writer) struct casereader * casewriter_make_reader (struct casewriter *writer) { - struct casereader *reader; - reader = writer->class->convert_to_reader (writer, writer->aux); + struct casereader *reader = writer->class->convert_to_reader (writer, writer->aux); taint_propagate (writer->taint, casereader_get_taint (reader)); taint_destroy (writer->taint); free (writer); @@ -241,10 +240,11 @@ casewriter_window_convert_to_reader (struct casewriter *writer UNUSED, void *window_) { struct casewindow *window = window_; - struct casereader *reader; - reader = casereader_create_random (casewindow_get_value_cnt (window), - casewindow_get_case_cnt (window), - &casereader_window_class, window); + struct casereader *reader = + casereader_create_random (casewindow_get_value_cnt (window), + casewindow_get_case_cnt (window), + &casereader_window_class, window); + taint_propagate (casewindow_get_taint (window), casereader_get_taint (reader)); return reader; diff --git a/src/data/category.c b/src/data/category.c index 1620bc7f7f..7190418c26 100644 --- a/src/data/category.c +++ b/src/data/category.c @@ -108,7 +108,7 @@ cat_value_find (const struct variable *v, const union value *val) { candidate = obs_vals->vals + i; assert (candidate != NULL); - if (!compare_values (candidate, val, var_get_width (v))) + if (!compare_values (candidate, val, v)) { return i; } diff --git a/src/data/datasheet.c b/src/data/datasheet.c index da85d963c1..a11a1e596c 100644 --- a/src/data/datasheet.c +++ b/src/data/datasheet.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -1232,36 +1231,10 @@ source_has_backing (const struct source *source) #define MAX_ROWS 5 #define MAX_COLS 5 -/* Hashes the structure of datasheet DS and returns the hash. - We use MD4 because it is much faster than MD5 or SHA-1 but its - collision resistance is just as good. */ -static unsigned int -hash_datasheet (const struct datasheet *ds) -{ - unsigned int hash[DIV_RND_UP (20, sizeof (unsigned int))]; - struct md4_ctx ctx; - struct range_map_node *r; - - md4_init_ctx (&ctx); - axis_hash (ds->columns, &ctx); - axis_hash (ds->rows, &ctx); - for (r = range_map_first (&ds->sources); r != NULL; - r = range_map_next (&ds->sources, r)) - { - unsigned long int start = range_map_node_get_start (r); - unsigned long int end = range_map_node_get_end (r); - md4_process_bytes (&start, sizeof start, &ctx); - md4_process_bytes (&end, sizeof end, &ctx); - } - md4_process_bytes (&ds->column_min_alloc, sizeof ds->column_min_alloc, - &ctx); - md4_finish_ctx (&ctx, hash); - return hash[0]; -} /* Clones the structure and contents of ODS into a new datasheet, and returns the new datasheet. */ -static struct datasheet * +struct datasheet * clone_datasheet (const struct datasheet *ods) { struct datasheet *ds; @@ -1286,426 +1259,33 @@ clone_datasheet (const struct datasheet *ods) return ds; } -/* lazy_casereader callback function to instantiate a casereader - from the datasheet. */ -static struct casereader * -lazy_callback (void *ds_) -{ - struct datasheet *ds = ds_; - return datasheet_make_reader (ds); -} - -/* Checks that READER contains the ROW_CNT rows and COLUMN_CNT - columns of data in ARRAY, reporting any errors via MC. */ -static void -check_datasheet_casereader (struct mc *mc, struct casereader *reader, - double array[MAX_ROWS][MAX_COLS], - size_t row_cnt, size_t column_cnt) -{ - if (casereader_get_case_cnt (reader) != row_cnt) - { - if (casereader_get_case_cnt (reader) == CASENUMBER_MAX - && casereader_count_cases (reader) == row_cnt) - mc_error (mc, "datasheet casereader has unknown case count"); - else - mc_error (mc, "casereader row count (%lu) does not match " - "expected (%zu)", - (unsigned long int) casereader_get_case_cnt (reader), - row_cnt); - } - else if (casereader_get_value_cnt (reader) != column_cnt) - mc_error (mc, "casereader column count (%zu) does not match " - "expected (%zu)", - casereader_get_value_cnt (reader), column_cnt); - else - { - struct ccase c; - size_t row; - - for (row = 0; row < row_cnt; row++) - { - size_t col; - - if (!casereader_read (reader, &c)) - { - mc_error (mc, "casereader_read failed reading row %zu of %zu " - "(%zu columns)", row, row_cnt, column_cnt); - return; - } - - for (col = 0; col < column_cnt; col++) - if (case_num_idx (&c, col) != array[row][col]) - mc_error (mc, "element %zu,%zu (of %zu,%zu) differs: " - "%g != %g", - row, col, row_cnt, column_cnt, - case_num_idx (&c, col), array[row][col]); - - case_destroy (&c); - } - - if (casereader_read (reader, &c)) - mc_error (mc, "casereader has extra cases (expected %zu)", row_cnt); - } -} - -/* Checks that datasheet DS contains has ROW_CNT rows, COLUMN_CNT - columns, and the same contents as ARRAY, reporting any - mismatches via mc_error. Then, adds DS to MC as a new state. */ -static void -check_datasheet (struct mc *mc, struct datasheet *ds, - double array[MAX_ROWS][MAX_COLS], - size_t row_cnt, size_t column_cnt) -{ - struct datasheet *ds2; - struct casereader *reader; - unsigned long int serial = 0; - - assert (row_cnt < MAX_ROWS); - assert (column_cnt < MAX_COLS); - - /* If it is a duplicate hash, discard the state before checking - its consistency, to save time. */ - if (mc_discard_dup_state (mc, hash_datasheet (ds))) - { - datasheet_destroy (ds); - return; - } - - /* Check contents of datasheet via datasheet functions. */ - if (row_cnt != datasheet_get_row_cnt (ds)) - mc_error (mc, "row count (%lu) does not match expected (%zu)", - (unsigned long int) datasheet_get_row_cnt (ds), row_cnt); - else if (column_cnt != datasheet_get_column_cnt (ds)) - mc_error (mc, "column count (%zu) does not match expected (%zu)", - datasheet_get_column_cnt (ds), column_cnt); - else - { - size_t row, col; - - for (row = 0; row < row_cnt; row++) - for (col = 0; col < column_cnt; col++) - { - union value v; - if (!datasheet_get_value (ds, row, col, &v, 1)) - NOT_REACHED (); - if (v.f != array[row][col]) - mc_error (mc, "element %zu,%zu (of %zu,%zu) differs: %g != %g", - row, col, row_cnt, column_cnt, v.f, array[row][col]); - } - } - - /* Check that datasheet contents are correct when read through - casereader. */ - ds2 = clone_datasheet (ds); - reader = datasheet_make_reader (ds2); - check_datasheet_casereader (mc, reader, array, row_cnt, column_cnt); - casereader_destroy (reader); - - /* Check that datasheet contents are correct when read through - casereader with lazy_casereader wrapped around it. This is - valuable because otherwise there is no non-GUI code that - uses the lazy_casereader. */ - ds2 = clone_datasheet (ds); - reader = lazy_casereader_create (column_cnt, row_cnt, - lazy_callback, ds2, &serial); - check_datasheet_casereader (mc, reader, array, row_cnt, column_cnt); - if (lazy_casereader_destroy (reader, serial)) - { - /* Lazy casereader was never instantiated. This will - only happen if there are no rows (because in that case - casereader_read never gets called). */ - datasheet_destroy (ds2); - if (row_cnt != 0) - mc_error (mc, "lazy casereader not instantiated, but should " - "have been (size %zu,%zu)", row_cnt, column_cnt); - } - else - { - /* Lazy casereader was instantiated. This is the common - case, in which some casereader operation - (casereader_read in this case) was performed on the - lazy casereader. */ - casereader_destroy (reader); - if (row_cnt == 0) - mc_error (mc, "lazy casereader instantiated, but should not " - "have been (size %zu,%zu)", row_cnt, column_cnt); - } - - mc_add_state (mc, ds); -} - -/* Extracts the contents of DS into DATA. */ -static void -extract_data (const struct datasheet *ds, double data[MAX_ROWS][MAX_COLS]) -{ - size_t column_cnt = datasheet_get_column_cnt (ds); - size_t row_cnt = datasheet_get_row_cnt (ds); - size_t row, col; - - assert (row_cnt < MAX_ROWS); - assert (column_cnt < MAX_COLS); - for (row = 0; row < row_cnt; row++) - for (col = 0; col < column_cnt; col++) - { - union value v; - if (!datasheet_get_value (ds, row, col, &v, 1)) - NOT_REACHED (); - data[row][col] = v.f; - } -} - -/* Clones the structure and contents of ODS into *DS, - and the contents of ODATA into DATA. */ -static void -clone_model (const struct datasheet *ods, double odata[MAX_ROWS][MAX_COLS], - struct datasheet **ds, double data[MAX_ROWS][MAX_COLS]) -{ - *ds = clone_datasheet (ods); - memcpy (data, odata, MAX_ROWS * MAX_COLS * sizeof **data); -} -/* "init" function for struct mc_class. */ -static void -datasheet_mc_init (struct mc *mc) +/* Hashes the structure of datasheet DS and returns the hash. + We use MD4 because it is much faster than MD5 or SHA-1 but its + collision resistance is just as good. */ +unsigned int +hash_datasheet (const struct datasheet *ds) { - struct datasheet_test_params *params = mc_get_aux (mc); - struct datasheet *ds; + unsigned int hash[DIV_RND_UP (20, sizeof (unsigned int))]; + struct md4_ctx ctx; + struct range_map_node *r; - if (params->backing_rows == 0 && params->backing_cols == 0) - { - /* Create unbacked datasheet. */ - ds = datasheet_create (NULL); - mc_name_operation (mc, "empty datasheet"); - check_datasheet (mc, ds, NULL, 0, 0); - } - else + md4_init_ctx (&ctx); + axis_hash (ds->columns, &ctx); + axis_hash (ds->rows, &ctx); + for (r = range_map_first (&ds->sources); r != NULL; + r = range_map_next (&ds->sources, r)) { - /* Create datasheet with backing. */ - struct casewriter *writer; - struct casereader *reader; - double data[MAX_ROWS][MAX_COLS]; - int row; - - assert (params->backing_rows > 0 && params->backing_rows <= MAX_ROWS); - assert (params->backing_cols > 0 && params->backing_cols <= MAX_COLS); - - writer = mem_writer_create (params->backing_cols); - for (row = 0; row < params->backing_rows; row++) - { - struct ccase c; - int col; - - case_create (&c, params->backing_cols); - for (col = 0; col < params->backing_cols; col++) - { - double value = params->next_value++; - data[row][col] = value; - case_data_rw_idx (&c, col)->f = value; - } - casewriter_write (writer, &c); - } - reader = casewriter_make_reader (writer); - assert (reader != NULL); - - ds = datasheet_create (reader); - mc_name_operation (mc, "datasheet with (%d,%d) backing", - params->backing_rows, params->backing_cols); - check_datasheet (mc, ds, data, - params->backing_rows, params->backing_cols); + unsigned long int start = range_map_node_get_start (r); + unsigned long int end = range_map_node_get_end (r); + md4_process_bytes (&start, sizeof start, &ctx); + md4_process_bytes (&end, sizeof end, &ctx); } + md4_process_bytes (&ds->column_min_alloc, sizeof ds->column_min_alloc, + &ctx); + md4_finish_ctx (&ctx, hash); + return hash[0]; } -/* "mutate" function for struct mc_class. */ -static void -datasheet_mc_mutate (struct mc *mc, const void *ods_) -{ - struct datasheet_test_params *params = mc_get_aux (mc); - - const struct datasheet *ods = ods_; - double odata[MAX_ROWS][MAX_COLS]; - double data[MAX_ROWS][MAX_COLS]; - size_t column_cnt = datasheet_get_column_cnt (ods); - size_t row_cnt = datasheet_get_row_cnt (ods); - size_t pos, new_pos, cnt; - - extract_data (ods, odata); - - /* Insert all possible numbers of columns in all possible - positions. */ - for (pos = 0; pos <= column_cnt; pos++) - for (cnt = 0; cnt <= params->max_cols - column_cnt; cnt++) - if (mc_include_state (mc)) - { - struct datasheet *ds; - union value new[MAX_COLS]; - size_t i, j; - - mc_name_operation (mc, "insert %zu columns at %zu", cnt, pos); - clone_model (ods, odata, &ds, data); - - for (i = 0; i < cnt; i++) - new[i].f = params->next_value++; - - if (!datasheet_insert_columns (ds, new, cnt, pos)) - mc_error (mc, "datasheet_insert_columns failed"); - - for (i = 0; i < row_cnt; i++) - { - insert_range (&data[i][0], column_cnt, sizeof data[i][0], - pos, cnt); - for (j = 0; j < cnt; j++) - data[i][pos + j] = new[j].f; - } - - check_datasheet (mc, ds, data, row_cnt, column_cnt + cnt); - } - - /* Delete all possible numbers of columns from all possible - positions. */ - for (pos = 0; pos < column_cnt; pos++) - for (cnt = 0; cnt < column_cnt - pos; cnt++) - if (mc_include_state (mc)) - { - struct datasheet *ds; - size_t i; - - mc_name_operation (mc, "delete %zu columns at %zu", cnt, pos); - clone_model (ods, odata, &ds, data); - - datasheet_delete_columns (ds, pos, cnt); - - for (i = 0; i < row_cnt; i++) - remove_range (&data[i], column_cnt, sizeof *data[i], pos, cnt); - - check_datasheet (mc, ds, data, row_cnt, column_cnt - cnt); - } - - /* Move all possible numbers of columns from all possible - existing positions to all possible new positions. */ - for (pos = 0; pos < column_cnt; pos++) - for (cnt = 0; cnt < column_cnt - pos; cnt++) - for (new_pos = 0; new_pos < column_cnt - cnt; new_pos++) - if (mc_include_state (mc)) - { - struct datasheet *ds; - size_t i; - - clone_model (ods, odata, &ds, data); - mc_name_operation (mc, "move %zu columns from %zu to %zu", - cnt, pos, new_pos); - - datasheet_move_columns (ds, pos, new_pos, cnt); - - for (i = 0; i < row_cnt; i++) - move_range (&data[i], column_cnt, sizeof data[i][0], - pos, new_pos, cnt); - - check_datasheet (mc, ds, data, row_cnt, column_cnt); - } - - /* Insert all possible numbers of rows in all possible - positions. */ - for (pos = 0; pos <= row_cnt; pos++) - for (cnt = 0; cnt <= params->max_rows - row_cnt; cnt++) - if (mc_include_state (mc)) - { - struct datasheet *ds; - struct ccase c[MAX_ROWS]; - size_t i, j; - - clone_model (ods, odata, &ds, data); - mc_name_operation (mc, "insert %zu rows at %zu", cnt, pos); - - for (i = 0; i < cnt; i++) - { - case_create (&c[i], column_cnt); - for (j = 0; j < column_cnt; j++) - case_data_rw_idx (&c[i], j)->f = params->next_value++; - } - - insert_range (data, row_cnt, sizeof data[pos], pos, cnt); - for (i = 0; i < cnt; i++) - for (j = 0; j < column_cnt; j++) - data[i + pos][j] = case_num_idx (&c[i], j); - - if (!datasheet_insert_rows (ds, pos, c, cnt)) - mc_error (mc, "datasheet_insert_rows failed"); - - check_datasheet (mc, ds, data, row_cnt + cnt, column_cnt); - } - - /* Delete all possible numbers of rows from all possible - positions. */ - for (pos = 0; pos < row_cnt; pos++) - for (cnt = 0; cnt < row_cnt - pos; cnt++) - if (mc_include_state (mc)) - { - struct datasheet *ds; - - clone_model (ods, odata, &ds, data); - mc_name_operation (mc, "delete %zu rows at %zu", cnt, pos); - - datasheet_delete_rows (ds, pos, cnt); - - remove_range (&data[0], row_cnt, sizeof data[0], pos, cnt); - - check_datasheet (mc, ds, data, row_cnt - cnt, column_cnt); - } - - /* Move all possible numbers of rows from all possible existing - positions to all possible new positions. */ - for (pos = 0; pos < row_cnt; pos++) - for (cnt = 0; cnt < row_cnt - pos; cnt++) - for (new_pos = 0; new_pos < row_cnt - cnt; new_pos++) - if (mc_include_state (mc)) - { - struct datasheet *ds; - - clone_model (ods, odata, &ds, data); - mc_name_operation (mc, "move %zu rows from %zu to %zu", - cnt, pos, new_pos); - - datasheet_move_rows (ds, pos, new_pos, cnt); - - move_range (&data[0], row_cnt, sizeof data[0], - pos, new_pos, cnt); - - check_datasheet (mc, ds, data, row_cnt, column_cnt); - } -} - -/* "destroy" function for struct mc_class. */ -static void -datasheet_mc_destroy (const struct mc *mc UNUSED, void *ds_) -{ - struct datasheet *ds = ds_; - datasheet_destroy (ds); -} -/* Executes the model checker on the datasheet test driver with - the given OPTIONS and passing in the given PARAMS, which must - point to a modifiable "struct datasheet_test_params". If any - value in PARAMS is out of range, it will be adjusted into the - valid range before running the test. - Returns the results of the model checking run. */ -struct mc_results * -datasheet_test (struct mc_options *options, void *params_) -{ - struct datasheet_test_params *params = params_; - static const struct mc_class datasheet_mc_class = - { - datasheet_mc_init, - datasheet_mc_mutate, - datasheet_mc_destroy, - }; - - params->next_value = 1; - params->max_rows = MIN (params->max_rows, MAX_ROWS); - params->max_cols = MIN (params->max_cols, MAX_COLS); - params->backing_rows = MIN (params->backing_rows, params->max_rows); - params->backing_cols = MIN (params->backing_cols, params->max_cols); - - mc_options_set_aux (options, params); - return mc_run (&datasheet_mc_class, options); -} diff --git a/src/data/datasheet.h b/src/data/datasheet.h index 3d106193d1..2a05fe0da5 100644 --- a/src/data/datasheet.h +++ b/src/data/datasheet.h @@ -66,21 +66,7 @@ bool datasheet_get_value (const struct datasheet *, casenumber, size_t column, bool datasheet_put_value (struct datasheet *, casenumber, size_t column, const union value *, int width); -/* Testing. */ -struct mc_options; - -struct datasheet_test_params - { - /* Parameters. */ - int max_rows; - int max_cols; - int backing_rows; - int backing_cols; - - /* State. */ - int next_value; - }; - -struct mc_results *datasheet_test (struct mc_options *options, void *params); +unsigned int hash_datasheet (const struct datasheet *ds); +struct datasheet *clone_datasheet (const struct datasheet *ds); #endif /* data/datasheet.h */ diff --git a/src/data/value.c b/src/data/value.c index 49555d9a4e..9fcf1cbb9b 100644 --- a/src/data/value.c +++ b/src/data/value.c @@ -20,6 +20,7 @@ #include #include #include +#include "variable.h" #include "xalloc.h" @@ -46,8 +47,12 @@ value_create (int width) Only the short string portion of longer strings are compared. */ int -compare_values (const union value *a, const union value *b, int width) +compare_values (const void *a_, const void *b_, const void *var_) { + const union value *a = a_; + const union value *b = b_; + const struct variable *var = var_; + int width = var_get_width (var); return (width == 0 ? (a->f < b->f ? -1 : a->f > b->f) : memcmp (a->s, b->s, MIN (MAX_SHORT_STRING, width))); @@ -56,24 +61,14 @@ compare_values (const union value *a, const union value *b, int width) /* Create a hash of V, which has the given WIDTH. Only the short string portion of a longer string is hashed. */ unsigned -hash_value (const union value *v, int width) +hash_value (const void *v_, const void *var_) { + const union value *v = v_; + const struct variable *var = var_; + int width = var_get_width (var); return (width == 0 ? hsh_hash_double (v->f) - : hsh_hash_bytes (v->s, MIN (MAX_SHORT_STRING, width))); -} - - -int -compare_ptr_values (const union value **v1, const union value **v2, int width) -{ - return compare_values (*v1, *v2, width); -} - -unsigned -hash_ptr_value (const union value **v, int width) -{ - return hash_value (*v, width); + : hsh_hash_bytes (v->s, width)); } diff --git a/src/data/value.h b/src/data/value.h index 4554a36617..9103c6b317 100644 --- a/src/data/value.h +++ b/src/data/value.h @@ -39,12 +39,8 @@ union value union value *value_dup (const union value *, int width); union value *value_create (int width); -int compare_values (const union value *, const union value *, int width); -unsigned hash_value (const union value *, int width); - -int compare_ptr_values (const union value **, const union value **, int width); -unsigned hash_ptr_value (const union value **, int width); - +int compare_values (const void *, const void *, const void *var); +unsigned hash_value (const void *, const void *var); static inline size_t value_cnt_from_width (int width); void value_copy (union value *, const union value *, int width); diff --git a/src/data/variable.c b/src/data/variable.c index a455e40a1f..e39692a062 100644 --- a/src/data/variable.c +++ b/src/data/variable.c @@ -142,6 +142,23 @@ var_clone (const struct variable *old_var) return new_var; } +/* Create a variable to be used for internal calculations only */ +struct variable * +var_create_internal (int case_idx) +{ + struct variable *v = var_create ("$internal", 0); + + struct vardict_info vdi; + + vdi.dict = NULL; + vdi.dict_index = 0; + vdi.case_index = case_idx; + + var_set_vardict (v, &vdi); + + return v; +} + /* Destroys variable V. V must not belong to a dictionary. If it does, use dict_delete_var instead. */ @@ -150,7 +167,11 @@ var_destroy (struct variable *v) { if (v != NULL) { - assert (!var_has_vardict (v)); + if (var_has_vardict (v)) + { + const struct vardict_info *vdi = var_get_vardict (v); + assert (vdi->dict == NULL); + } cat_stored_values_destroy (v->obs_vals); var_clear_short_names (v); var_clear_aux (v); diff --git a/src/data/variable.h b/src/data/variable.h index c7f86aaf71..d2820d34bb 100644 --- a/src/data/variable.h +++ b/src/data/variable.h @@ -32,6 +32,8 @@ union value; struct variable *var_create (const char *name, int width); struct variable *var_clone (const struct variable *); void var_destroy (struct variable *); +struct variable *var_create_internal (int case_idx); + /* Variable names. */ #define VAR_NAME_LEN 64 /* Maximum length of variable name, in bytes. */ diff --git a/src/language/automake.mk b/src/language/automake.mk index d5ebb10e15..a623516300 100644 --- a/src/language/automake.mk +++ b/src/language/automake.mk @@ -10,9 +10,14 @@ include $(top_srcdir)/src/language/stats/automake.mk include $(top_srcdir)/src/language/data-io/automake.mk include $(top_srcdir)/src/language/expressions/automake.mk -noinst_LIBRARIES += src/language/liblanguage.a +noinst_LTLIBRARIES += src/language/liblanguage.la -src_language_liblanguage_a_SOURCES = \ + +src_language_liblanguage_la_LIBADD = \ + lib/misc/libmisc.la \ + src/output/charts/libcharts.la + +src_language_liblanguage_la_SOURCES = \ src/language/syntax-file.c \ src/language/syntax-file.h \ src/language/syntax-string-source.c \ @@ -33,8 +38,7 @@ src_language_liblanguage_a_SOURCES = \ $(language_expressions_sources) - -nodist_src_language_liblanguage_a_SOURCES = \ +nodist_src_language_liblanguage_la_SOURCES = \ $(src_language_data_io_built_sources) \ $(src_language_utilities_built_sources) \ $(src_language_stats_built_sources) \ diff --git a/src/language/command.def b/src/language/command.def index 77eb3a3a7a..c2cc7f624f 100644 --- a/src/language/command.def +++ b/src/language/command.def @@ -113,6 +113,7 @@ DEF_CMD (S_DATA, 0, "ONEWAY", cmd_oneway) DEF_CMD (S_DATA, 0, "PEARSON CORRELATIONS", cmd_correlations) DEF_CMD (S_DATA, 0, "RANK", cmd_rank) DEF_CMD (S_DATA, 0, "REGRESSION", cmd_regression) +DEF_CMD (S_DATA, 0, "RELIABILITY", cmd_reliability) DEF_CMD (S_DATA, 0, "RENAME VARIABLES", cmd_rename_variables) DEF_CMD (S_DATA, 0, "SAMPLE", cmd_sample) DEF_CMD (S_DATA, 0, "SAVE", cmd_save) @@ -231,7 +232,6 @@ UNIMPL_CMD ("RATIO STATISTICS", "Descriptives of ratios") UNIMPL_CMD ("READ MODEL", "Read new model") UNIMPL_CMD ("RECORD TYPE", "Defines a type of record within FILE TYPE") UNIMPL_CMD ("REFORMAT", "Read obsolete files") -UNIMPL_CMD ("RELIABILITY", "Reliability estimates") UNIMPL_CMD ("REPEATING DATA", "Specify multiple cases per input record") UNIMPL_CMD ("REPORT", "Pretty print working file") UNIMPL_CMD ("RESTORE", "Restore settings") diff --git a/src/language/expressions/helpers.c b/src/language/expressions/helpers.c index 695c233537..85705098a2 100644 --- a/src/language/expressions/helpers.c +++ b/src/language/expressions/helpers.c @@ -589,7 +589,7 @@ ncdf_beta (double x, double a, double b, double lambda) double cdf_bvnor (double x0, double x1, double r) { - double z = x0 * x0 - 2. * r * x0 * x1 + x1 * x1; + double z = pow2 (x0) - 2. * r * x0 * x1 + pow2 (x1); return exp (-z / (2. * (1 - r * r))) * (2. * M_PI * sqrt (1 - r * r)); } diff --git a/src/language/lexer/lexer.c b/src/language/lexer/lexer.c index 1c9542d78e..5b24522f5c 100644 --- a/src/language/lexer/lexer.c +++ b/src/language/lexer/lexer.c @@ -1304,3 +1304,28 @@ lex_tokstr (const struct lexer *lexer) { return &lexer->tokstr; } + +/* If the lexer is positioned at the (pseudo)identifier S, which + may contain a hyphen ('-'), skips it and returns true. Each + half of the identifier may be abbreviated to its first three + letters. + Otherwise, returns false. */ +bool +lex_match_hyphenated_word (struct lexer *lexer, const char *s) +{ + const char *hyphen = strchr (s, '-'); + if (hyphen == NULL) + return lex_match_id (lexer, s); + else if (lexer->token != T_ID + || !lex_id_match (ss_buffer (s, hyphen - s), ss_cstr (lexer->tokid)) + || lex_look_ahead (lexer) != '-') + return false; + else + { + lex_get (lexer); + lex_force_match (lexer, '-'); + lex_force_match_id (lexer, hyphen + 1); + return true; + } +} + diff --git a/src/language/lexer/lexer.h b/src/language/lexer/lexer.h index 53732b4999..9e5d09aec0 100644 --- a/src/language/lexer/lexer.h +++ b/src/language/lexer/lexer.h @@ -55,6 +55,8 @@ bool lex_match (struct lexer *, int); bool lex_match_id (struct lexer *, const char *); bool lex_match_id_n (struct lexer *, const char *, size_t n); bool lex_match_int (struct lexer *, int); +bool lex_match_hyphenated_word (struct lexer *lexer, const char *s); + /* Forcible matching functions. */ bool lex_force_match (struct lexer *, int); diff --git a/src/language/lexer/q2c.c b/src/language/lexer/q2c.c index d14c69d0fe..3e23390f05 100644 --- a/src/language/lexer/q2c.c +++ b/src/language/lexer/q2c.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2008 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -357,6 +357,44 @@ dump_token (void) } #endif /* DUMP_TOKENS */ + +const char hyphen_proxy = '_'; + +static void +id_cpy (char **cp) +{ + char *dest = tokstr; + char *src = *cp; + + while (*src == '_' || *src == '-' || isalnum ((unsigned char) *src)) + { + *dest++ = *src == '-' ? hyphen_proxy :toupper ((unsigned char) (*src)); + src++; + } + + *cp = src; + *dest++ = '\0'; +} + +static const char * +unmunge (const char *s) +{ + char *dest = xmalloc (strlen (s)); + char *d = dest; + + while (*s) + { + if (*s == hyphen_proxy) + *d = '-'; + else + *d = *s; + s++; + d++; + } + + return dest; +} + /* Reads a token from the input file. */ static int lex_get (void) @@ -398,9 +436,8 @@ lex_get (void) { char *dest = tokstr; token = T_ID; - while (*cp == '_' || isalnum ((unsigned char) *cp)) - *dest++ = toupper ((unsigned char) (*cp++)); - *dest++ = '\0'; + + id_cpy (&cp); } else token = *cp++; @@ -1374,7 +1411,11 @@ make_match (const char *t) else if (isdigit ((unsigned char) t[0])) sprintf (s, "lex_match_int (lexer, %s)", t); else - sprintf (s, "lex_match_id (lexer, \"%s\")", t); + { + char *c = unmunge (t); + sprintf (s, "lex_match_hyphenated_word (lexer, \"%s\")", c); + free (c); + } return s; } diff --git a/src/language/stats/.gitignore b/src/language/stats/.gitignore index d3d5a40f96..6df3f96b69 100644 --- a/src/language/stats/.gitignore +++ b/src/language/stats/.gitignore @@ -2,9 +2,11 @@ correlations.c crosstabs.c examine.c frequencies.c +glm.c means.c npar.c oneway.c rank.c regression.c +reliability.c t-test.c diff --git a/src/language/stats/aggregate.c b/src/language/stats/aggregate.c index f58b97cf89..e1dcd12342 100644 --- a/src/language/stats/aggregate.c +++ b/src/language/stats/aggregate.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2006, 2008 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -43,6 +43,8 @@ #include #include #include +#include +#include #include "minmax.h" #include "xalloc.h" @@ -75,12 +77,17 @@ struct agr_var char *string; bool saw_missing; struct moments1 *moments; + double cc; + + struct variable *subject; + struct variable *weight; + struct casewriter *writer; }; /* Aggregation functions. */ enum { - NONE, SUM, MEAN, SD, MAX, MIN, PGT, PLT, PIN, POUT, FGT, FLT, FIN, + NONE, SUM, MEAN, MEDIAN, SD, MAX, MIN, PGT, PLT, PIN, POUT, FGT, FLT, FIN, FOUT, N, NU, NMISS, NUMISS, FIRST, LAST, N_AGR_FUNCS, N_NO_VARS, NU_NO_VARS, FUNC = 0x1f, /* Function mask. */ @@ -102,6 +109,7 @@ static const struct agr_func agr_func_tab[] = {"", 0, -1, {0, 0, 0}}, {"SUM", 0, -1, {FMT_F, 8, 2}}, {"MEAN", 0, -1, {FMT_F, 8, 2}}, + {"MEDIAN", 0, -1, {FMT_F, 8, 2}}, {"SD", 0, -1, {FMT_F, 8, 2}}, {"MAX", 0, VAL_STRING, {-1, -1, -1}}, {"MIN", 0, VAL_STRING, {-1, -1, -1}}, @@ -135,7 +143,7 @@ enum missing_treatment struct agr_proc { /* Break variables. */ - struct case_ordering *sort; /* Sort criteria. */ + struct case_ordering *sort; /* Sort criteria (break variable). */ const struct variable **break_vars; /* Break variables. */ size_t break_var_cnt; /* Number of break variables. */ struct ccase break_case; /* Last values of break variables. */ @@ -149,6 +157,7 @@ struct agr_proc static void initialize_aggregate_info (struct agr_proc *, const struct ccase *); + static void accumulate_aggregate_info (struct agr_proc *, const struct ccase *); /* Prototypes. */ @@ -344,7 +353,8 @@ error: /* Parse all the aggregate functions. */ static bool -parse_aggregate_functions (struct lexer *lexer, const struct dictionary *dict, struct agr_proc *agr) +parse_aggregate_functions (struct lexer *lexer, const struct dictionary *dict, + struct agr_proc *agr) { struct agr_var *tail; /* Tail of linked list starting at agr->vars. */ @@ -545,7 +555,7 @@ parse_aggregate_functions (struct lexer *lexer, const struct dictionary *dict, s variables. */ for (i = 0; i < n_dest; i++) { - struct agr_var *v = xmalloc (sizeof *v); + struct agr_var *v = xzalloc (sizeof *v); /* Add variable to chain. */ if (agr->agr_vars != NULL) @@ -703,6 +713,10 @@ agr_destroy (struct agr_proc *agr) } else if (iter->function == SD) moments1_destroy (iter->moments); + + var_destroy (iter->subject); + var_destroy (iter->weight); + free (iter); } if (agr->dict != NULL) @@ -755,6 +769,25 @@ accumulate_aggregate_info (struct agr_proc *agr, const struct ccase *input) iter->dbl[0] += v->f * weight; iter->dbl[1] += weight; break; + case MEDIAN: + { + double wv ; + struct ccase cout; + case_create (&cout, 2); + + case_data_rw (&cout, iter->subject)->f = + case_data (input, iter->src)->f; + + wv = dict_get_case_weight (agr->src_dict, input, NULL); + + case_data_rw (&cout, iter->weight)->f = wv; + + iter->cc += wv; + + casewriter_write (iter->writer, &cout); + case_destroy (&cout); + } + break; case SD: moments1_add (iter->moments, v->f, weight); break; @@ -911,6 +944,7 @@ dump_aggregate_info (struct agr_proc *agr, struct casewriter *output) { union value *v = case_data_rw (&c, i->dest); + if (agr->missing == COLUMNWISE && i->saw_missing && (i->function & FUNC) != N && (i->function & FUNC) != NU && (i->function & FUNC) != NMISS && (i->function & FUNC) != NUMISS) @@ -919,6 +953,9 @@ dump_aggregate_info (struct agr_proc *agr, struct casewriter *output) memset (v->s, ' ', var_get_width (i->dest)); else v->f = SYSMIS; + + casewriter_destroy (i->writer); + continue; } @@ -930,6 +967,25 @@ dump_aggregate_info (struct agr_proc *agr, struct casewriter *output) case MEAN: v->f = i->dbl[1] != 0.0 ? i->dbl[0] / i->dbl[1] : SYSMIS; break; + case MEDIAN: + { + struct casereader *sorted_reader; + struct order_stats *median = percentile_create (0.5, i->cc); + + sorted_reader = casewriter_make_reader (i->writer); + + order_stats_accumulate (&median, 1, + sorted_reader, + i->weight, + i->subject, + i->exclude); + + v->f = percentile_calculate ((struct percentile *) median, + PC_HAVERAGE); + + statistic_destroy ((struct statistic *) median); + } + break; case SD: { double variance; @@ -1044,6 +1100,22 @@ initialize_aggregate_info (struct agr_proc *agr, const struct ccase *input) case MAX | FSTRING: memset (iter->string, 0, var_get_width (iter->src)); break; + case MEDIAN: + { + struct case_ordering *ordering = case_ordering_create (); + + if ( ! iter->subject) + iter->subject = var_create_internal (0); + + if ( ! iter->weight) + iter->weight = var_create_internal (1); + + case_ordering_add_var (ordering, iter->subject, SRT_ASCEND); + + iter->writer = sort_create_writer (ordering, 2); + iter->cc = 0; + } + break; case SD: if (iter->moments == NULL) iter->moments = moments1_create (MOMENT_VARIANCE); diff --git a/src/language/stats/automake.mk b/src/language/stats/automake.mk index d60cb0dbdb..3e9ab232a2 100644 --- a/src/language/stats/automake.mk +++ b/src/language/stats/automake.mk @@ -13,6 +13,7 @@ src_language_stats_built_sources = \ src/language/stats/oneway.c \ src/language/stats/rank.c \ src/language/stats/regression.c \ + src/language/stats/reliability.c \ src/language/stats/t-test.c language_stats_sources = \ @@ -31,7 +32,9 @@ language_stats_sources = \ src/language/stats/freq.c \ src/language/stats/freq.h \ src/language/stats/npar-summary.c \ - src/language/stats/npar-summary.h + src/language/stats/npar-summary.h \ + src/language/stats/wilcoxon.c \ + src/language/stats/wilcoxon.h all_q_sources += $(src_language_stats_built_sources:.c=.q) EXTRA_DIST += $(src_language_stats_built_sources:.c=.q) diff --git a/src/language/stats/binomial.c b/src/language/stats/binomial.c index d9209f1bf0..5022cebddd 100644 --- a/src/language/stats/binomial.c +++ b/src/language/stats/binomial.c @@ -119,14 +119,14 @@ do_binomial (const struct dictionary *dict, cat1[v].value = value_dup (value, width); cat1[v].count = w; } - else if ( 0 == compare_values (cat1[v].value, value, width)) + else if ( 0 == compare_values (cat1[v].value, value, var)) cat1[v].count += w; else if ( NULL == cat2[v].value ) { cat2[v].value = value_dup (value, width); cat2[v].count = w; } - else if ( 0 == compare_values (cat2[v].value, value, width)) + else if ( 0 == compare_values (cat2[v].value, value, var)) cat2[v].count += w; else if ( bst->category1 == SYSMIS) msg (ME, _("Variable %s is not dichotomous"), var_get_name (var)); @@ -143,7 +143,9 @@ void binomial_execute (const struct dataset *ds, struct casereader *input, enum mv_class exclude, - const struct npar_test *test) + const struct npar_test *test, + bool exact UNUSED, + double timer UNUSED) { int v; const struct binomial_test *bst = (const struct binomial_test *) test; diff --git a/src/language/stats/binomial.h b/src/language/stats/binomial.h index a8360330aa..df01a13bc5 100644 --- a/src/language/stats/binomial.h +++ b/src/language/stats/binomial.h @@ -40,6 +40,7 @@ struct dataset; void binomial_execute (const struct dataset *, struct casereader *, enum mv_class, - const struct npar_test *); + const struct npar_test *, + bool, double); #endif diff --git a/src/language/stats/chisquare.c b/src/language/stats/chisquare.c index 158064dcf9..ed14382418 100644 --- a/src/language/stats/chisquare.c +++ b/src/language/stats/chisquare.c @@ -320,7 +320,9 @@ void chisquare_execute (const struct dataset *ds, struct casereader *input, enum mv_class exclude, - const struct npar_test *test) + const struct npar_test *test, + bool exact UNUSED, + double timer UNUSED) { const struct dictionary *dict = dataset_dict (ds); int v, i; @@ -344,7 +346,8 @@ chisquare_execute (const struct dataset *ds, struct hsh_table *freq_hash = NULL; struct casereader *reader = casereader_create_filter_missing (casereader_clone (input), - &ost->vars[v], 1, exclude, NULL); + &ost->vars[v], 1, exclude, + NULL, NULL); struct tab_table *freq_table = create_variable_frequency_table(dict, reader, cst, v, &freq_hash); @@ -414,7 +417,8 @@ chisquare_execute (const struct dataset *ds, double total_obs = 0.0; struct casereader *reader = casereader_create_filter_missing (casereader_clone (input), - &ost->vars[v], 1, exclude, NULL); + &ost->vars[v], 1, exclude, + NULL, NULL); struct hsh_table *freq_hash = create_freq_hash_with_range (dict, reader, ost->vars[v], cst->lo, cst->hi); diff --git a/src/language/stats/chisquare.h b/src/language/stats/chisquare.h index 916a26392e..91a17d1a14 100644 --- a/src/language/stats/chisquare.h +++ b/src/language/stats/chisquare.h @@ -46,7 +46,9 @@ void chisquare_insert_variables (const struct npar_test *test, void chisquare_execute (const struct dataset *ds, struct casereader *input, enum mv_class exclude, - const struct npar_test *test); + const struct npar_test *test, + bool, + double); diff --git a/src/language/stats/crosstabs.q b/src/language/stats/crosstabs.q index 4c5309bfda..801b128b8a 100644 --- a/src/language/stats/crosstabs.q +++ b/src/language/stats/crosstabs.q @@ -2455,7 +2455,7 @@ calc_r (double *X, double *Y, double *r, double *ase_0, double *ase_1) for (sum_Xr = sum_X2r = 0., i = 0; i < n_rows; i++) { sum_Xr += X[i] * row_tot[i]; - sum_X2r += X[i] * X[i] * row_tot[i]; + sum_X2r += pow2 (X[i]) * row_tot[i]; } Xbar = sum_Xr / W; @@ -2467,11 +2467,11 @@ calc_r (double *X, double *Y, double *r, double *ase_0, double *ase_1) Ybar = sum_Yc / W; S = sum_XYf - sum_Xr * sum_Yc / W; - SX = sum_X2r - sum_Xr * sum_Xr / W; - SY = sum_Y2c - sum_Yc * sum_Yc / W; + SX = sum_X2r - pow2 (sum_Xr) / W; + SY = sum_Y2c - pow2 (sum_Yc) / W; T = sqrt (SX * SY); *r = S / T; - *ase_0 = sqrt ((sum_X2Y2f - (sum_XYf * sum_XYf) / W) / (sum_X2r * sum_Y2c)); + *ase_0 = sqrt ((sum_X2Y2f - pow2 (sum_XYf) / W) / (sum_X2r * sum_Y2c)); { double s, c, y, t; @@ -2561,9 +2561,9 @@ calc_symmetric (double v[N_SYMMETRIC], double ase[N_SYMMETRIC], Dr = Dc = W * W; for (r = 0; r < n_rows; r++) - Dr -= row_tot[r] * row_tot[r]; + Dr -= pow2 (row_tot[r]); for (c = 0; c < n_cols; c++) - Dc -= col_tot[c] * col_tot[c]; + Dc -= pow2 (col_tot[c]); } { @@ -3072,10 +3072,10 @@ calc_directional (double v[N_DIRECTIONAL], double ase[N_DIRECTIONAL], } for (sum_ri2 = 0., i = 0; i < n_rows; i++) - sum_ri2 += row_tot[i] * row_tot[i]; + sum_ri2 += pow2 (row_tot[i]); for (sum_cj2 = 0., j = 0; j < n_cols; j++) - sum_cj2 += col_tot[j] * col_tot[j]; + sum_cj2 += pow2 (col_tot[j]); v[3] = (W * sum_fij2_ci - sum_ri2) / (W * W - sum_ri2); v[4] = (W * sum_fij2_ri - sum_cj2) / (W * W - sum_cj2); @@ -3165,9 +3165,9 @@ calc_directional (double v[N_DIRECTIONAL], double ase[N_DIRECTIONAL], for (sum_Xr = sum_X2r = 0., i = 0; i < n_rows; i++) { sum_Xr += rows[i].f * row_tot[i]; - sum_X2r += rows[i].f * rows[i].f * row_tot[i]; + sum_X2r += pow2 (rows[i].f) * row_tot[i]; } - SX = sum_X2r - sum_Xr * sum_Xr / W; + SX = sum_X2r - pow2 (sum_Xr) / W; for (SXW = 0., j = 0; j < n_cols; j++) { @@ -3175,7 +3175,7 @@ calc_directional (double v[N_DIRECTIONAL], double ase[N_DIRECTIONAL], for (cum = 0., i = 0; i < n_rows; i++) { - SXW += rows[i].f * rows[i].f * mat[j + i * n_cols]; + SXW += pow2 (rows[i].f) * mat[j + i * n_cols]; cum += rows[i].f * mat[j + i * n_cols]; } @@ -3192,7 +3192,7 @@ calc_directional (double v[N_DIRECTIONAL], double ase[N_DIRECTIONAL], for (sum_Yc = sum_Y2c = 0., i = 0; i < n_cols; i++) { sum_Yc += cols[i].f * col_tot[i]; - sum_Y2c += cols[i].f * cols[i].f * col_tot[i]; + sum_Y2c += pow2 (cols[i].f) * col_tot[i]; } SY = sum_Y2c - sum_Yc * sum_Yc / W; @@ -3202,7 +3202,7 @@ calc_directional (double v[N_DIRECTIONAL], double ase[N_DIRECTIONAL], for (cum = 0., j = 0; j < n_cols; j++) { - SYW += cols[j].f * cols[j].f * mat[j + i * n_cols]; + SYW += pow2 (cols[j].f) * mat[j + i * n_cols]; cum += cols[j].f * mat[j + i * n_cols]; } diff --git a/src/language/stats/examine.q b/src/language/stats/examine.q index 9315e7e89b..e341b18a35 100644 --- a/src/language/stats/examine.q +++ b/src/language/stats/examine.q @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2004 Free Software Foundation, Inc. + Copyright (C) 2004, 2008 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -22,9 +22,19 @@ #include #include +#include +#include +#include +#include +#include +#include +#include +#include #include #include #include +#include +#include #include #include #include @@ -37,9 +47,7 @@ #include #include #include -#include #include -#include #include #include #include @@ -56,6 +64,7 @@ #include #include #include +#include /* (specification) "EXAMINE" (xmn_): @@ -63,8 +72,8 @@ +total=custom; +nototal=custom; missing=miss:pairwise/!listwise, - rep:report/!noreport, - incl:include/!exclude; + rep:report/!noreport, + incl:include/!exclude; +compare=cmp:variables/!groups; +percentiles=custom; +id=var; @@ -78,73 +87,140 @@ /* (functions) */ - static struct cmd_examine cmd; static const struct variable **dependent_vars; - static size_t n_dependent_vars; +/* PERCENTILES */ + +static subc_list_double percentile_list; +static enum pc_alg percentile_algorithm; -struct factor +struct factor_metrics { - /* The independent variable */ - struct variable *indep_var[2]; + struct moments1 *moments; + + struct percentile **ptl; + size_t n_ptiles; + + struct statistic *tukey_hinges; + struct statistic *box_whisker; + struct statistic *trimmed_mean; + struct statistic *histogram; + struct order_stats *np; + + /* Three quartiles indexing into PTL */ + struct percentile **quartiles; + + /* A reader sorted in ASCENDING order */ + struct casereader *up_reader; + + /* The minimum value of all the weights */ + double cmin; + + /* Sum of all weights, including those for missing values */ + double n; + + double mean; + double variance; - /* Hash table of factor stats indexed by 2 values */ - struct hsh_table *fstats; + double skewness; - /* The hash table after it has been crunched */ - struct factor_statistics **fs; + double kurtosis; - struct factor *next; + double se_mean; + struct extrema *minima; + struct extrema *maxima; }; -/* Linked list of factors */ -static struct factor *factors = 0; +struct factor_result +{ + struct ll ll; -static struct metrics *totals = 0; + union value *value[2]; -/* Parse the clause specifying the factors */ -static int examine_parse_independent_vars (struct lexer *lexer, const struct dictionary *dict, struct cmd_examine *cmd); + /* An array of factor metrics, one for each variable */ + struct factor_metrics *metrics; +}; +struct xfactor +{ + /* We need to make a list of this structure */ + struct ll ll; + /* The independent variable */ + const struct variable const* indep_var[2]; -/* Output functions */ -static void show_summary (const struct variable **dependent_var, int n_dep_var, - const struct factor *f); + /* A list of results for this factor */ + struct ll_list result_list ; +}; -static void show_extremes (const struct variable **dependent_var, - int n_dep_var, - const struct factor *factor, - int n_extremities); -static void show_descriptives (const struct variable **dependent_var, - int n_dep_var, - struct factor *factor); +static void +factor_destroy (struct xfactor *fctr) +{ + struct ll *ll = ll_head (&fctr->result_list); + while (ll != ll_null (&fctr->result_list)) + { + int v; + struct factor_result *result = + ll_data (ll, struct factor_result, ll); -static void show_percentiles (const struct variable **dependent_var, - int n_dep_var, - struct factor *factor); + for (v = 0; v < n_dependent_vars; ++v) + { + int i; + moments1_destroy (result->metrics[v].moments); + extrema_destroy (result->metrics[v].minima); + extrema_destroy (result->metrics[v].maxima); + statistic_destroy (result->metrics[v].trimmed_mean); + statistic_destroy (result->metrics[v].tukey_hinges); + statistic_destroy (result->metrics[v].box_whisker); + statistic_destroy (result->metrics[v].histogram); + for (i = 0 ; i < result->metrics[v].n_ptiles; ++i) + statistic_destroy ((struct statistic *) result->metrics[v].ptl[i]); + free (result->metrics[v].ptl); + free (result->metrics[v].quartiles); + casereader_destroy (result->metrics[v].up_reader); + } + free (result->value[0]); + free (result->value[1]); + free (result->metrics); + ll = ll_next (ll); + free (result); + } +} +static struct xfactor level0_factor; +static struct ll_list factor_list = LL_INITIALIZER (factor_list); +/* Parse the clause specifying the factors */ +static int examine_parse_independent_vars (struct lexer *lexer, + const struct dictionary *dict, + struct cmd_examine *cmd); -void np_plot (const struct metrics *m, const char *factorname); +/* Output functions */ +static void show_summary (const struct variable **dependent_var, int n_dep_var, + const struct xfactor *f); -void box_plot_group (const struct factor *fctr, - const struct variable **vars, int n_vars, - const struct variable *id - ) ; +static void show_descriptives (const struct variable **dependent_var, + int n_dep_var, + const struct xfactor *f); -void box_plot_variables (const struct factor *fctr, - const struct variable **vars, int n_vars, - const struct variable *id - ); +static void show_percentiles (const struct variable **dependent_var, + int n_dep_var, + const struct xfactor *f); + + +static void show_extremes (const struct variable **dependent_var, + int n_dep_var, + const struct xfactor *f); + @@ -161,34 +237,24 @@ void factor_calc (const struct ccase *c, int case_no, /* Represent a factor as a string, so it can be printed in a human readable fashion */ -static void factor_to_string (const struct factor *fctr, - const struct factor_statistics *fs, - const struct variable *var, - struct string *str - ); +static void factor_to_string (const struct xfactor *fctr, + const struct factor_result *result, + struct string *str); /* Represent a factor as a string, so it can be printed in a human readable fashion, but sacrificing some readablility for the sake of brevity */ -static void factor_to_string_concise (const struct factor *fctr, - const struct factor_statistics *fs, - struct string *); - +static void +factor_to_string_concise (const struct xfactor *fctr, + const struct factor_result *result, + struct string *str + ); /* Categories of missing values to exclude. */ static enum mv_class exclude_values; -/* PERCENTILES */ - -static subc_list_double percentile_list; - -static enum pc_alg percentile_algorithm; - -static short sbc_percentile; - - int cmd_examine (struct lexer *lexer, struct dataset *ds) { @@ -224,225 +290,404 @@ cmd_examine (struct lexer *lexer, struct dataset *ds) } grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds)); + while (casegrouper_get_next_group (grouper, &group)) - run_examine (&cmd, group, ds); + { + struct casereader *reader = + casereader_create_arithmetic_sequence (group, 1, 1); + + run_examine (&cmd, reader, ds); + } + ok = casegrouper_destroy (grouper); ok = proc_commit (ds) && ok; - if ( totals ) + if ( dependent_vars ) + free (dependent_vars); + + subc_list_double_destroy (&percentile_list); + + return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE; +}; + + +/* Plot the normal and detrended normal plots for RESULT. + Label the plots with LABEL */ +static void +np_plot (struct np *np, const char *label) +{ + double yfirst = 0, ylast = 0; + + double x_lower; + double x_upper; + double slack; + + /* Normal Plot */ + struct chart *np_chart; + + /* Detrended Normal Plot */ + struct chart *dnp_chart; + + /* The slope and intercept of the ideal normal probability line */ + const double slope = 1.0 / np->stddev; + const double intercept = -np->mean / np->stddev; + + if ( np->n < 1.0 ) { - free ( totals ); + msg (MW, _("Not creating plot because data set is empty.")); + return ; } - if ( dependent_vars ) - free (dependent_vars); + np_chart = chart_create (); + dnp_chart = chart_create (); + + if ( !np_chart || ! dnp_chart ) + return ; + + chart_write_title (np_chart, _("Normal Q-Q Plot of %s"), label); + chart_write_xlabel (np_chart, _("Observed Value")); + chart_write_ylabel (np_chart, _("Expected Normal")); + + chart_write_title (dnp_chart, _("Detrended Normal Q-Q Plot of %s"), + label); + chart_write_xlabel (dnp_chart, _("Observed Value")); + chart_write_ylabel (dnp_chart, _("Dev from Normal")); + + yfirst = gsl_cdf_ugaussian_Pinv (1 / (np->n + 1)); + ylast = gsl_cdf_ugaussian_Pinv (np->n / (np->n + 1)); + + /* Need to make sure that both the scatter plot and the ideal fit into the + plot */ + x_lower = MIN (np->y_min, (yfirst - intercept) / slope) ; + x_upper = MAX (np->y_max, (ylast - intercept) / slope) ; + slack = (x_upper - x_lower) * 0.05 ; + + chart_write_xscale (np_chart, x_lower - slack, x_upper + slack, 5); + chart_write_xscale (dnp_chart, np->y_min, np->y_max, 5); + + chart_write_yscale (np_chart, yfirst, ylast, 5); + chart_write_yscale (dnp_chart, np->dns_min, np->dns_max, 5); { - struct factor *f = factors ; - while ( f ) + struct ccase c; + struct casereader *reader = casewriter_make_reader (np->writer); + while (casereader_read (reader, &c)) { - struct factor *ff = f; + chart_datum (np_chart, 0, case_data_idx (&c, NP_IDX_Y)->f, case_data_idx (&c, NP_IDX_NS)->f); + chart_datum (dnp_chart, 0, case_data_idx (&c, NP_IDX_Y)->f, case_data_idx (&c, NP_IDX_DNS)->f); - f = f->next; - free ( ff->fs ); - hsh_destroy ( ff->fstats ) ; - free ( ff ) ; + case_destroy (&c); } - factors = 0; + casereader_destroy (reader); } - subc_list_double_destroy (&percentile_list); - - return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE; -}; + chart_line (dnp_chart, 0, 0, np->y_min, np->y_max , CHART_DIM_X); + chart_line (np_chart, slope, intercept, yfirst, ylast , CHART_DIM_Y); + chart_submit (np_chart); + chart_submit (dnp_chart); +} -/* Show all the appropriate tables */ static void -output_examine (void) +show_npplot (const struct variable **dependent_var, + int n_dep_var, + const struct xfactor *fctr) { - struct factor *fctr; + int v; - /* Show totals if appropriate */ - if ( ! cmd.sbc_nototal || factors == 0 ) + for (v = 0; v < n_dep_var; ++v) { - show_summary (dependent_vars, n_dependent_vars, 0); + struct ll *ll; + for (ll = ll_head (&fctr->result_list); + ll != ll_null (&fctr->result_list); + ll = ll_next (ll)) + { + struct string str; + const struct factor_result *result = + ll_data (ll, struct factor_result, ll); + + ds_init_empty (&str); + ds_put_format (&str, "%s ", var_get_name (dependent_var[v])); + + factor_to_string (fctr, result, &str); + + np_plot ((struct np*) result->metrics[v].np, ds_cstr(&str)); + + statistic_destroy ((struct statistic *)result->metrics[v].np); - if ( cmd.sbc_statistics ) + ds_destroy (&str); + } + } +} + + +static void +show_histogram (const struct variable **dependent_var, + int n_dep_var, + const struct xfactor *fctr) +{ + int v; + + for (v = 0; v < n_dep_var; ++v) + { + struct ll *ll; + for (ll = ll_head (&fctr->result_list); + ll != ll_null (&fctr->result_list); + ll = ll_next (ll)) { - if ( cmd.a_statistics[XMN_ST_EXTREME]) - show_extremes (dependent_vars, n_dependent_vars, 0, cmd.st_n); + struct string str; + const struct factor_result *result = + ll_data (ll, struct factor_result, ll); + + ds_init_empty (&str); + ds_put_format (&str, "%s ", var_get_name (dependent_var[v])); + + factor_to_string (fctr, result, &str); - if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES]) - show_descriptives (dependent_vars, n_dependent_vars, 0); + histogram_plot ((struct histogram *) result->metrics[v].histogram, + ds_cstr (&str), + (struct moments1 *) result->metrics[v].moments); + ds_destroy (&str); } - if ( sbc_percentile ) - show_percentiles (dependent_vars, n_dependent_vars, 0); + } +} + + - if ( cmd.sbc_plot) +static void +show_boxplot_groups (const struct variable **dependent_var, + int n_dep_var, + const struct xfactor *fctr) +{ + int v; + + for (v = 0; v < n_dep_var; ++v) + { + struct ll *ll; + int f = 0; + struct chart *ch = chart_create (); + double y_min = DBL_MAX; + double y_max = -DBL_MAX; + + for (ll = ll_head (&fctr->result_list); + ll != ll_null (&fctr->result_list); + ll = ll_next (ll)) { - int v; - if ( cmd.a_plot[XMN_PLT_STEMLEAF] ) - msg (SW, _ ("%s is not currently supported."), "STEMLEAF"); + const struct extremum *max, *min; + const struct factor_result *result = + ll_data (ll, struct factor_result, ll); - if ( cmd.a_plot[XMN_PLT_SPREADLEVEL] ) - msg (SW, _ ("%s is not currently supported."), "SPREADLEVEL"); + const struct ll_list *max_list = + extrema_list (result->metrics[v].maxima); - if ( cmd.a_plot[XMN_PLT_NPPLOT] ) - { - for ( v = 0 ; v < n_dependent_vars; ++v ) - np_plot (&totals[v], var_to_string (dependent_vars[v])); - } + const struct ll_list *min_list = + extrema_list (result->metrics[v].minima); - if ( cmd.a_plot[XMN_PLT_BOXPLOT] ) + if ( ll_is_empty (max_list)) { - if ( cmd.cmp == XMN_GROUPS ) - { - box_plot_group (0, (const struct variable **) dependent_vars, - n_dependent_vars, cmd.v_id); - } - else - box_plot_variables (0, - (const struct variable **) dependent_vars, - n_dependent_vars, cmd.v_id); + msg (MW, _("Not creating plot because data set is empty.")); + continue; } - if ( cmd.a_plot[XMN_PLT_HISTOGRAM] ) - { - for ( v = 0 ; v < n_dependent_vars; ++v ) - { - struct normal_curve normal; + max = (const struct extremum *) + ll_data (ll_head(max_list), struct extremum, ll); - normal.N = totals[v].n; - normal.mean = totals[v].mean; - normal.stddev = totals[v].stddev; + min = (const struct extremum *) + ll_data (ll_head (min_list), struct extremum, ll); - histogram_plot (totals[v].histogram, - var_to_string (dependent_vars[v]), - &normal, 0); - } - } + y_max = MAX (y_max, max->value); + y_min = MIN (y_min, min->value); + } + + boxplot_draw_yscale (ch, y_max, y_min); + + if ( fctr->indep_var[0]) + chart_write_title (ch, _("Boxplot of %s vs. %s"), + var_to_string (dependent_var[v]), + var_to_string (fctr->indep_var[0]) ); + else + chart_write_title (ch, _("Boxplot of %s"), + var_to_string (dependent_var[v])); + + for (ll = ll_head (&fctr->result_list); + ll != ll_null (&fctr->result_list); + ll = ll_next (ll)) + { + const struct factor_result *result = + ll_data (ll, struct factor_result, ll); + struct string str; + const double box_width = (ch->data_right - ch->data_left) + / (ll_count (&fctr->result_list) * 2.0 ) ; + + const double box_centre = (f++ * 2 + 1) * box_width + ch->data_left; + + ds_init_empty (&str); + factor_to_string_concise (fctr, result, &str); + + boxplot_draw_boxplot (ch, + box_centre, box_width, + (const struct box_whisker *) + result->metrics[v].box_whisker, + ds_cstr (&str)); + + ds_destroy (&str); } + chart_submit (ch); } +} - /* Show grouped statistics as appropriate */ - fctr = factors; - while ( fctr ) - { - show_summary (dependent_vars, n_dependent_vars, fctr); - if ( cmd.sbc_statistics ) - { - if ( cmd.a_statistics[XMN_ST_EXTREME]) - show_extremes (dependent_vars, n_dependent_vars, fctr, cmd.st_n); +static void +show_boxplot_variables (const struct variable **dependent_var, + int n_dep_var, + const struct xfactor *fctr + ) - if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES]) - show_descriptives (dependent_vars, n_dependent_vars, fctr); - } +{ + int v; + struct ll *ll; + const struct ll_list *result_list = &fctr->result_list; + + for (ll = ll_head (result_list); + ll != ll_null (result_list); + ll = ll_next (ll)) + + { + struct string title; + struct chart *ch = chart_create (); + double y_min = DBL_MAX; + double y_max = -DBL_MAX; - if ( sbc_percentile ) - show_percentiles (dependent_vars, n_dependent_vars, fctr); + const struct factor_result *result = + ll_data (ll, struct factor_result, ll); + const double box_width = (ch->data_right - ch->data_left) + / (n_dep_var * 2.0 ) ; - if ( cmd.sbc_plot) + for (v = 0; v < n_dep_var; ++v) { - size_t v; + const struct ll *max_ll = + ll_head (extrema_list (result->metrics[v].maxima)); + const struct ll *min_ll = + ll_head (extrema_list (result->metrics[v].minima)); - struct factor_statistics **fs = fctr->fs ; + const struct extremum *max = + (const struct extremum *) ll_data (max_ll, struct extremum, ll); - if ( cmd.a_plot[XMN_PLT_BOXPLOT] ) - { - if ( cmd.cmp == XMN_VARIABLES ) - box_plot_variables (fctr, - (const struct variable **) dependent_vars, - n_dependent_vars, cmd.v_id); - else - box_plot_group (fctr, - (const struct variable **) dependent_vars, - n_dependent_vars, cmd.v_id); - } + const struct extremum *min = + (const struct extremum *) ll_data (min_ll, struct extremum, ll); - for ( v = 0 ; v < n_dependent_vars; ++v ) - { + y_max = MAX (y_max, max->value); + y_min = MIN (y_min, min->value); + } - for ( fs = fctr->fs ; *fs ; ++fs ) - { - struct string str; - ds_init_empty (&str); - factor_to_string (fctr, *fs, dependent_vars[v], &str); - if ( cmd.a_plot[XMN_PLT_NPPLOT] ) - np_plot (& (*fs)->m[v], ds_cstr (&str)); + boxplot_draw_yscale (ch, y_max, y_min); - if ( cmd.a_plot[XMN_PLT_HISTOGRAM] ) - { - struct normal_curve normal; + ds_init_empty (&title); + factor_to_string (fctr, result, &title); - normal.N = (*fs)->m[v].n; - normal.mean = (*fs)->m[v].mean; - normal.stddev = (*fs)->m[v].stddev; +#if 0 + ds_put_format (&title, "%s = ", var_get_name (fctr->indep_var[0])); + var_append_value_name (fctr->indep_var[0], result->value[0], &title); +#endif - histogram_plot ((*fs)->m[v].histogram, - ds_cstr (&str) , &normal, 0); - } + chart_write_title (ch, ds_cstr (&title)); + ds_destroy (&title); - ds_destroy (&str); + for (v = 0; v < n_dep_var; ++v) + { + struct string str; + const double box_centre = (v * 2 + 1) * box_width + ch->data_left; - } /* for ( fs .... */ + ds_init_empty (&str); + ds_init_cstr (&str, var_get_name (dependent_var[v])); - } /* for ( v = 0 ..... */ + boxplot_draw_boxplot (ch, + box_centre, box_width, + (const struct box_whisker *) result->metrics[v].box_whisker, + ds_cstr (&str)); + ds_destroy (&str); } - fctr = fctr->next; + chart_submit (ch); } - } -/* Create a hash table of percentiles and their values from the list of - percentiles */ -static struct hsh_table * -list_to_ptile_hash (const subc_list_double *l) +/* Show all the appropriate tables */ +static void +output_examine (void) { - int i; + struct ll *ll; + + show_summary (dependent_vars, n_dependent_vars, &level0_factor); - struct hsh_table *h ; + if ( cmd.a_statistics[XMN_ST_EXTREME] ) + show_extremes (dependent_vars, n_dependent_vars, &level0_factor); - h = hsh_create (subc_list_double_count (l), - (hsh_compare_func *) ptile_compare, - (hsh_hash_func *) ptile_hash, - (hsh_free_func *) free, - 0); + if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] ) + show_descriptives (dependent_vars, n_dependent_vars, &level0_factor); + if ( cmd.sbc_percentiles) + show_percentiles (dependent_vars, n_dependent_vars, &level0_factor); - for ( i = 0 ; i < subc_list_double_count (l) ; ++i ) + if ( cmd.sbc_plot) { - struct percentile *p = xmalloc (sizeof *p); - - p->p = subc_list_double_at (l,i); - p->v = SYSMIS; + if (cmd.a_plot[XMN_PLT_BOXPLOT]) + show_boxplot_groups (dependent_vars, n_dependent_vars, &level0_factor); - hsh_insert (h, p); + if (cmd.a_plot[XMN_PLT_HISTOGRAM]) + show_histogram (dependent_vars, n_dependent_vars, &level0_factor); + if (cmd.a_plot[XMN_PLT_NPPLOT]) + show_npplot (dependent_vars, n_dependent_vars, &level0_factor); } - return h; + for (ll = ll_head (&factor_list); + ll != ll_null (&factor_list); ll = ll_next (ll)) + { + struct xfactor *factor = ll_data (ll, struct xfactor, ll); + show_summary (dependent_vars, n_dependent_vars, factor); + + if ( cmd.a_statistics[XMN_ST_EXTREME] ) + show_extremes (dependent_vars, n_dependent_vars, factor); + + if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] ) + show_descriptives (dependent_vars, n_dependent_vars, factor); + + if ( cmd.sbc_percentiles) + show_percentiles (dependent_vars, n_dependent_vars, factor); + + if (cmd.a_plot[XMN_PLT_BOXPLOT] && + cmd.cmp == XMN_GROUPS) + show_boxplot_groups (dependent_vars, n_dependent_vars, factor); + + + if (cmd.a_plot[XMN_PLT_BOXPLOT] && + cmd.cmp == XMN_VARIABLES) + show_boxplot_variables (dependent_vars, n_dependent_vars, + factor); + + if (cmd.a_plot[XMN_PLT_HISTOGRAM]) + show_histogram (dependent_vars, n_dependent_vars, factor); + if (cmd.a_plot[XMN_PLT_NPPLOT]) + show_npplot (dependent_vars, n_dependent_vars, factor); + } } /* Parse the PERCENTILES subcommand */ static int xmn_custom_percentiles (struct lexer *lexer, struct dataset *ds UNUSED, - struct cmd_examine *p UNUSED, void *aux UNUSED) + struct cmd_examine *p UNUSED, void *aux UNUSED) { - sbc_percentile = 1; - lex_match (lexer, '='); lex_match (lexer, '('); @@ -494,11 +739,12 @@ xmn_custom_percentiles (struct lexer *lexer, struct dataset *ds UNUSED, /* TOTAL and NOTOTAL are simple, mutually exclusive flags */ static int -xmn_custom_total (struct lexer *lexer UNUSED, struct dataset *ds UNUSED, struct cmd_examine *p, void *aux UNUSED) +xmn_custom_total (struct lexer *lexer UNUSED, struct dataset *ds UNUSED, + struct cmd_examine *p, void *aux UNUSED) { if ( p->sbc_nototal ) { - msg (SE, _ ("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL"); + msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL"); return 0; } @@ -511,7 +757,7 @@ xmn_custom_nototal (struct lexer *lexer UNUSED, struct dataset *ds UNUSED, { if ( p->sbc_total ) { - msg (SE, _ ("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL"); + msg (SE, _("%s and %s are mutually exclusive"), "TOTAL", "NOTOTAL"); return 0; } @@ -523,19 +769,21 @@ xmn_custom_nototal (struct lexer *lexer UNUSED, struct dataset *ds UNUSED, /* Parser for the variables sub command Returns 1 on success */ static int -xmn_custom_variables (struct lexer *lexer, struct dataset *ds, struct cmd_examine *cmd, void *aux UNUSED) +xmn_custom_variables (struct lexer *lexer, struct dataset *ds, + struct cmd_examine *cmd, + void *aux UNUSED) { const struct dictionary *dict = dataset_dict (ds); lex_match (lexer, '='); if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL) - && lex_token (lexer) != T_ALL) + && lex_token (lexer) != T_ALL) { return 2; } if (!parse_variables_const (lexer, dict, &dependent_vars, &n_dependent_vars, - PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) ) + PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) ) { free (dependent_vars); return 0; @@ -543,16 +791,15 @@ xmn_custom_variables (struct lexer *lexer, struct dataset *ds, struct cmd_examin assert (n_dependent_vars); - totals = xnmalloc (n_dependent_vars, sizeof *totals); if ( lex_match (lexer, T_BY)) { int success ; success = examine_parse_independent_vars (lexer, dict, cmd); - if ( success != 1 ) { - free (dependent_vars); - free (totals) ; - } + if ( success != 1 ) + { + free (dependent_vars); + } return success; } @@ -563,47 +810,44 @@ xmn_custom_variables (struct lexer *lexer, struct dataset *ds, struct cmd_examin /* Parse the clause specifying the factors */ static int -examine_parse_independent_vars (struct lexer *lexer, const struct dictionary *dict, struct cmd_examine *cmd) +examine_parse_independent_vars (struct lexer *lexer, + const struct dictionary *dict, + struct cmd_examine *cmd) { int success; - struct factor *sf = xmalloc (sizeof *sf); + struct xfactor *sf = xmalloc (sizeof *sf); - if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL) - && lex_token (lexer) != T_ALL) + ll_init (&sf->result_list); + + if ( (lex_token (lexer) != T_ID || + dict_lookup_var (dict, lex_tokid (lexer)) == NULL) + && lex_token (lexer) != T_ALL) { free ( sf ) ; return 2; } - sf->indep_var[0] = parse_variable (lexer, dict); - sf->indep_var[1] = 0; + sf->indep_var[1] = NULL; if ( lex_token (lexer) == T_BY ) { - lex_match (lexer, T_BY); - if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL) - && lex_token (lexer) != T_ALL) + if ( (lex_token (lexer) != T_ID || + dict_lookup_var (dict, lex_tokid (lexer)) == NULL) + && lex_token (lexer) != T_ALL) { - free ( sf ) ; + free (sf); return 2; } sf->indep_var[1] = parse_variable (lexer, dict); + ll_push_tail (&factor_list, &sf->ll); } - - - sf->fstats = hsh_create (4, - (hsh_compare_func *) factor_statistics_compare, - (hsh_hash_func *) factor_statistics_hash, - (hsh_free_func *) factor_statistics_free, - 0); - - sf->next = factors; - factors = sf; + else + ll_push_tail (&factor_list, &sf->ll); lex_match (lexer, ','); @@ -618,335 +862,378 @@ examine_parse_independent_vars (struct lexer *lexer, const struct dictionary *di return success; } +static void +examine_group (struct cmd_examine *cmd, struct casereader *reader, int level, + const struct dictionary *dict, struct xfactor *factor) +{ + struct ccase c; + const struct variable *wv = dict_get_weight (dict); + int v; + int n_extrema = 1; + struct factor_result *result = xzalloc (sizeof (*result)); + + result->metrics = xcalloc (n_dependent_vars, sizeof (*result->metrics)); + if ( cmd->a_statistics[XMN_ST_EXTREME] ) + n_extrema = cmd->st_n; -void populate_percentiles (struct tab_table *tbl, int col, int row, - const struct metrics *m); + if (casereader_peek (reader, 0, &c)) + { + if ( level > 0) + { + result->value[0] = + value_dup (case_data (&c, factor->indep_var[0]), + var_get_width (factor->indep_var[0])); + + if ( level > 1) + result->value[1] = + value_dup (case_data (&c, factor->indep_var[1]), + var_get_width (factor->indep_var[1])); + } + case_destroy (&c); + } + + for (v = 0; v < n_dependent_vars; ++v) + { + struct casewriter *writer; + struct casereader *input = casereader_clone (reader); + + result->metrics[v].moments = moments1_create (MOMENT_KURTOSIS); + result->metrics[v].minima = extrema_create (n_extrema, EXTREME_MINIMA); + result->metrics[v].maxima = extrema_create (n_extrema, EXTREME_MAXIMA); + result->metrics[v].cmin = DBL_MAX; + + if (cmd->a_statistics[XMN_ST_DESCRIPTIVES] || + cmd->a_plot[XMN_PLT_BOXPLOT] || + cmd->a_plot[XMN_PLT_NPPLOT] || + cmd->sbc_percentiles) + { + /* In this case, we need to sort the data, so we create a sorting + casewriter */ + struct case_ordering *up_ordering = case_ordering_create (); -void populate_descriptives (struct tab_table *t, int col, int row, - const struct metrics *fs); + case_ordering_add_var (up_ordering, dependent_vars[v], SRT_ASCEND); + writer = sort_create_writer (up_ordering, + casereader_get_value_cnt (reader)); + } + else + { + /* but in this case, sorting is unnecessary, so an ordinary + casewriter is sufficient */ + writer = + autopaging_writer_create (casereader_get_value_cnt (reader)); + } -void populate_extremes (struct tab_table *t, int col, int row, int n, - const struct metrics *m); -void populate_summary (struct tab_table *t, int col, int row, - const struct metrics *m); + /* Sort or just iterate, whilst calculating moments etc */ + while (casereader_read (input, &c)) + { + const casenumber loc = + case_data_idx (&c, casereader_get_value_cnt (reader) - 1)->f; + const double weight = wv ? case_data (&c, wv)->f : 1.0; + if (weight != SYSMIS) + minimize (&result->metrics[v].cmin, weight); + moments1_add (result->metrics[v].moments, + case_data (&c, dependent_vars[v])->f, + weight); -/* Perform calculations for the sub factors */ -void -factor_calc (const struct ccase *c, int case_no, double weight, - bool case_missing) -{ - size_t v; - struct factor *fctr = factors; + result->metrics[v].n += weight; - while ( fctr) - { - struct factor_statistics **foo ; - union value *indep_vals[2] ; + extrema_add (result->metrics[v].maxima, + case_data (&c, dependent_vars[v])->f, + weight, + loc); - indep_vals[0] = value_dup ( - case_data (c, fctr->indep_var[0]), - var_get_width (fctr->indep_var[0]) - ); + extrema_add (result->metrics[v].minima, + case_data (&c, dependent_vars[v])->f, + weight, + loc); - if ( fctr->indep_var[1] ) - indep_vals[1] = value_dup ( - case_data (c, fctr->indep_var[1]), - var_get_width (fctr->indep_var[1]) - ); - else - { - const union value sm = {SYSMIS}; - indep_vals[1] = value_dup (&sm, 0); + casewriter_write (writer, &c); } + casereader_destroy (input); + result->metrics[v].up_reader = casewriter_make_reader (writer); + } - assert (fctr->fstats); + /* If percentiles or descriptives have been requested, then a + second pass through the data (which has now been sorted) + is necessary */ + if ( cmd->a_statistics[XMN_ST_DESCRIPTIVES] || + cmd->a_plot[XMN_PLT_BOXPLOT] || + cmd->a_plot[XMN_PLT_NPPLOT] || + cmd->sbc_percentiles) + { + for (v = 0; v < n_dependent_vars; ++v) + { + int i; + int n_os; + struct order_stats **os ; + struct factor_metrics *metric = &result->metrics[v]; - foo = ( struct factor_statistics ** ) - hsh_probe (fctr->fstats, (void *) &indep_vals); + metric->n_ptiles = percentile_list.n_data; - if ( !*foo ) - { + metric->ptl = xcalloc (metric->n_ptiles, + sizeof (struct percentile *)); - *foo = create_factor_statistics (n_dependent_vars, - indep_vals[0], - indep_vals[1]); + metric->quartiles = xcalloc (3, sizeof (*metric->quartiles)); - for ( v = 0 ; v < n_dependent_vars ; ++v ) + for (i = 0 ; i < metric->n_ptiles; ++i) { - metrics_precalc ( & (*foo)->m[v] ); + metric->ptl[i] = (struct percentile *) + percentile_create (percentile_list.data[i] / 100.0, metric->n); + + if ( percentile_list.data[i] == 25) + metric->quartiles[0] = metric->ptl[i]; + else if ( percentile_list.data[i] == 50) + metric->quartiles[1] = metric->ptl[i]; + else if ( percentile_list.data[i] == 75) + metric->quartiles[2] = metric->ptl[i]; } - } - else - { - free (indep_vals[0]); - free (indep_vals[1]); - } + metric->tukey_hinges = tukey_hinges_create (metric->n, metric->cmin); + metric->trimmed_mean = trimmed_mean_create (metric->n, 0.05); - for ( v = 0 ; v < n_dependent_vars ; ++v ) - { - const struct variable *var = dependent_vars[v]; - union value *val = value_dup ( - case_data (c, var), - var_get_width (var) - ); + n_os = metric->n_ptiles + 2; - if (case_missing || var_is_value_missing (var, val, exclude_values)) + if ( cmd->a_plot[XMN_PLT_NPPLOT] ) { - free (val); - val = NULL; + metric->np = np_create (metric->moments); + n_os ++; } - metrics_calc ( & (*foo)->m[v], val, weight, case_no); + os = xcalloc (sizeof (struct order_stats *), n_os); - free (val); - } + for (i = 0 ; i < metric->n_ptiles ; ++i ) + { + os[i] = (struct order_stats *) metric->ptl[i]; + } - fctr = fctr->next; + os[i] = (struct order_stats *) metric->tukey_hinges; + os[i+1] = (struct order_stats *) metric->trimmed_mean; + + if (cmd->a_plot[XMN_PLT_NPPLOT]) + os[i+2] = metric->np; + + order_stats_accumulate (os, n_os, + casereader_clone (metric->up_reader), + wv, dependent_vars[v], MV_ANY); + free (os); + } } -} -static void -run_examine (struct cmd_examine *cmd, struct casereader *input, - struct dataset *ds) -{ - struct dictionary *dict = dataset_dict (ds); - casenumber case_no; - struct ccase c; - int v; - bool ok; - - struct factor *fctr; - - if (!casereader_peek (input, 0, &c)) - { - casereader_destroy (input); - return; - } - output_split_file_values (ds, &c); - case_destroy (&c); - - input = casereader_create_filter_weight (input, dict, NULL, NULL); - input = casereader_create_counter (input, &case_no, 0); - - /* Make sure we haven't got rubbish left over from a - previous split. */ - fctr = factors; - while (fctr) + /* FIXME: Do this in the above loop */ + if ( cmd->a_plot[XMN_PLT_HISTOGRAM] ) { - struct factor *next = fctr->next; + struct ccase c; + struct casereader *input = casereader_clone (reader); - hsh_clear (fctr->fstats); + for (v = 0; v < n_dependent_vars; ++v) + { + const struct extremum *max, *min; + struct factor_metrics *metric = &result->metrics[v]; - fctr->fs = 0; + const struct ll_list *max_list = + extrema_list (result->metrics[v].maxima); - fctr = next; - } + const struct ll_list *min_list = + extrema_list (result->metrics[v].minima); - for ( v = 0 ; v < n_dependent_vars ; ++v ) - metrics_precalc (&totals[v]); + if ( ll_is_empty (max_list)) + { + msg (MW, _("Not creating plot because data set is empty.")); + continue; + } - for (; casereader_read (input, &c); case_destroy (&c)) - { - bool case_missing = false; - const double weight = dict_get_case_weight (dict, &c, NULL); + assert (! ll_is_empty (min_list)); - if ( cmd->miss == XMN_LISTWISE ) - { - for ( v = 0 ; v < n_dependent_vars ; ++v ) - { - const struct variable *var = dependent_vars[v]; - union value *val = value_dup ( - case_data (&c, var), - var_get_width (var) - ); + max = (const struct extremum *) + ll_data (ll_head(max_list), struct extremum, ll); - if ( var_is_value_missing (var, val, exclude_values)) - case_missing = true; + min = (const struct extremum *) + ll_data (ll_head (min_list), struct extremum, ll); - free (val); - } + metric->histogram = histogram_create (10, min->value, max->value); } - for ( v = 0 ; v < n_dependent_vars ; ++v ) + while (casereader_read (input, &c)) { - const struct variable *var = dependent_vars[v]; - union value *val = value_dup ( - case_data (&c, var), - var_get_width (var) - ); - - if ( var_is_value_missing (var, val, exclude_values) - || case_missing ) + const double weight = wv ? case_data (&c, wv)->f : 1.0; + + for (v = 0; v < n_dependent_vars; ++v) { - free (val) ; - val = NULL; + struct factor_metrics *metric = &result->metrics[v]; + if ( metric->histogram) + histogram_add ((struct histogram *) metric->histogram, + case_data (&c, dependent_vars[v])->f, weight); } - - metrics_calc (&totals[v], val, weight, case_no); - - free (val); + case_destroy (&c); } - - factor_calc (&c, case_no, weight, case_missing); + casereader_destroy (input); } - ok = casereader_destroy (input); - for ( v = 0 ; v < n_dependent_vars ; ++v) + /* In this case, a third iteration is required */ + if (cmd->a_plot[XMN_PLT_BOXPLOT]) { - fctr = factors; - while ( fctr ) + for (v = 0; v < n_dependent_vars; ++v) { - struct hsh_iterator hi; - struct factor_statistics *fs; + struct factor_metrics *metric = &result->metrics[v]; + + metric->box_whisker = + box_whisker_create ((struct tukey_hinges *) metric->tukey_hinges, + cmd->v_id, + casereader_get_value_cnt (metric->up_reader) + - 1); + + order_stats_accumulate ((struct order_stats **) &metric->box_whisker, + 1, + casereader_clone (metric->up_reader), + wv, dependent_vars[v], MV_ANY); + } + } - for ( fs = hsh_first (fctr->fstats, &hi); - fs != 0 ; - fs = hsh_next (fctr->fstats, &hi)) - { + ll_push_tail (&factor->result_list, &result->ll); + casereader_destroy (reader); +} - fs->m[v].ptile_hash = list_to_ptile_hash (&percentile_list); - fs->m[v].ptile_alg = percentile_algorithm; - metrics_postcalc (&fs->m[v]); - } - fctr = fctr->next; - } +static void +run_examine (struct cmd_examine *cmd, struct casereader *input, + struct dataset *ds) +{ + struct ll *ll; + const struct dictionary *dict = dataset_dict (ds); + struct ccase c; + struct casereader *level0 = casereader_clone (input); - totals[v].ptile_hash = list_to_ptile_hash (&percentile_list); - totals[v].ptile_alg = percentile_algorithm; - metrics_postcalc (&totals[v]); + if (!casereader_peek (input, 0, &c)) + { + casereader_destroy (input); + return; } + output_split_file_values (ds, &c); + case_destroy (&c); + + ll_init (&level0_factor.result_list); - /* Make sure that the combination of factors are complete */ + examine_group (cmd, level0, 0, dict, &level0_factor); - fctr = factors; - while ( fctr ) + for (ll = ll_head (&factor_list); + ll != ll_null (&factor_list); + ll = ll_next (ll)) { - struct hsh_iterator hi; - struct hsh_iterator hi0; - struct hsh_iterator hi1; - struct factor_statistics *fs; + struct xfactor *factor = ll_data (ll, struct xfactor, ll); - struct hsh_table *idh0 = NULL; - struct hsh_table *idh1 = NULL; - union value **val0; - union value **val1; + struct casereader *group = NULL; + struct casereader *level1; + struct casegrouper *grouper1 = NULL; + struct case_ordering *ordering1 = case_ordering_create (); + case_ordering_add_var (ordering1, factor->indep_var[0], SRT_ASCEND); - idh0 = hsh_create (4, (hsh_compare_func *) compare_ptr_values, - (hsh_hash_func *) hash_ptr_value, - 0,0); + level1 = casereader_clone (input); - idh1 = hsh_create (4, (hsh_compare_func *) compare_ptr_values, - (hsh_hash_func *) hash_ptr_value, - 0,0); + level1 = sort_execute (level1, + case_ordering_clone (ordering1)); + grouper1 = casegrouper_create_case_ordering (level1, ordering1); + case_ordering_destroy (ordering1); - - for ( fs = hsh_first (fctr->fstats, &hi); - fs != 0 ; - fs = hsh_next (fctr->fstats, &hi)) + while (casegrouper_get_next_group (grouper1, &group)) { - hsh_insert (idh0, &fs->id[0]); - hsh_insert (idh1, &fs->id[1]); - } + struct casereader *group_copy = casereader_clone (group); - /* Ensure that the factors combination is complete */ - for ( val0 = hsh_first (idh0, &hi0); - val0 != 0 ; - val0 = hsh_next (idh0, &hi0)) - { - for ( val1 = hsh_first (idh1, &hi1); - val1 != 0 ; - val1 = hsh_next (idh1, &hi1)) + if ( !factor->indep_var[1]) + examine_group (cmd, group_copy, 1, dict, factor); + else { - struct factor_statistics **ffs; - union value *key[2]; - key[0] = *val0; - key[1] = *val1; - - ffs = (struct factor_statistics **) - hsh_probe (fctr->fstats, &key ); - - if ( !*ffs ) { - size_t i; - (*ffs) = create_factor_statistics (n_dependent_vars, - key[0], key[1]); - for ( i = 0 ; i < n_dependent_vars ; ++i ) - metrics_precalc ( & (*ffs)->m[i]); - } - } - } + int n_groups = 0; + struct casereader *group2 = NULL; + struct casegrouper *grouper2 = NULL; + struct case_ordering *ordering2 = case_ordering_create (); + + case_ordering_add_var (ordering2, + factor->indep_var[1], SRT_ASCEND); + group_copy = sort_execute (group_copy, + case_ordering_clone (ordering2)); + grouper2 = + casegrouper_create_case_ordering (group_copy, ordering2); - hsh_destroy (idh0); - hsh_destroy (idh1); + case_ordering_destroy (ordering2); - fctr->fs = (struct factor_statistics **) hsh_sort_copy (fctr->fstats); + while (casegrouper_get_next_group (grouper2, &group2)) + { + examine_group (cmd, group2, 2, dict, factor); + n_groups++; + } + casegrouper_destroy (grouper2); + } - fctr = fctr->next; + casereader_destroy (group); + } + casegrouper_destroy (grouper1); } - if (ok) - output_examine (); + casereader_destroy (input); + output_examine (); + + factor_destroy (&level0_factor); + + { + struct ll *ll; + for (ll = ll_head (&factor_list); + ll != ll_null (&factor_list); + ll = ll_next (ll)) + { + struct xfactor *f = ll_data (ll, struct xfactor, ll); + factor_destroy (f); + } + } - if ( totals ) - { - size_t i; - for ( i = 0 ; i < n_dependent_vars ; ++i ) - { - metrics_destroy (&totals[i]); - } - } } static void show_summary (const struct variable **dependent_var, int n_dep_var, - const struct factor *fctr) + const struct xfactor *fctr) { static const char *subtitle[]= { - N_ ("Valid"), - N_ ("Missing"), - N_ ("Total") + N_("Valid"), + N_("Missing"), + N_("Total") }; - int i; - int heading_columns ; + int v, j; + int heading_columns = 1; int n_cols; const int heading_rows = 3; struct tab_table *tbl; int n_rows ; - int n_factors = 1; + n_rows = n_dep_var; + + assert (fctr); - if ( fctr ) + if ( fctr->indep_var[0] ) { heading_columns = 2; - n_factors = hsh_count (fctr->fstats); - n_rows = n_dep_var * n_factors ; if ( fctr->indep_var[1] ) - heading_columns = 3; - } - else - { - heading_columns = 1; - n_rows = n_dep_var; + { + heading_columns = 3; + } } + n_rows *= ll_count (&fctr->result_list); n_rows += heading_rows; n_cols = heading_columns + 6; - tbl = tab_create (n_cols,n_rows,0); + tbl = tab_create (n_cols, n_rows, 0); tab_headers (tbl, heading_columns, 0, heading_rows, 0); tab_dim (tbl, tab_natural_dimensions); @@ -973,12 +1260,12 @@ show_summary (const struct variable **dependent_var, int n_dep_var, tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1); - tab_title (tbl, _ ("Case Processing Summary")); + tab_title (tbl, _("Case Processing Summary")); tab_joint_text (tbl, heading_columns, 0, - n_cols -1, 0, - TAB_CENTER | TAT_TITLE, - _ ("Cases")); + n_cols -1, 0, + TAB_CENTER | TAT_TITLE, + _("Cases")); /* Remove lines ... */ tab_box (tbl, @@ -987,28 +1274,28 @@ show_summary (const struct variable **dependent_var, int n_dep_var, heading_columns, 0, n_cols - 1, 0); - for ( i = 0 ; i < 3 ; ++i ) + for (j = 0 ; j < 3 ; ++j) { - tab_text (tbl, heading_columns + i * 2 , 2, TAB_CENTER | TAT_TITLE, - _ ("N")); + tab_text (tbl, heading_columns + j * 2 , 2, TAB_CENTER | TAT_TITLE, + _("N")); - tab_text (tbl, heading_columns + i * 2 + 1, 2, TAB_CENTER | TAT_TITLE, - _ ("Percent")); + tab_text (tbl, heading_columns + j * 2 + 1, 2, TAB_CENTER | TAT_TITLE, + _("Percent")); - tab_joint_text (tbl, heading_columns + i*2 , 1, - heading_columns + i * 2 + 1, 1, - TAB_CENTER | TAT_TITLE, - subtitle[i]); + tab_joint_text (tbl, heading_columns + j * 2 , 1, + heading_columns + j * 2 + 1, 1, + TAB_CENTER | TAT_TITLE, + subtitle[j]); tab_box (tbl, -1, -1, TAL_0, TAL_0, - heading_columns + i * 2, 1, - heading_columns + i * 2 + 1, 1); + heading_columns + j * 2, 1, + heading_columns + j * 2 + 1, 1); } /* Titles for the independent variables */ - if ( fctr ) + if ( fctr->indep_var[0] ) { tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE, var_to_string (fctr->indep_var[0])); @@ -1020,1275 +1307,883 @@ show_summary (const struct variable **dependent_var, int n_dep_var, } } - - for ( i = 0 ; i < n_dep_var ; ++i ) + for (v = 0 ; v < n_dep_var ; ++v) { - int n_factors = 1; - if ( fctr ) - n_factors = hsh_count (fctr->fstats); + int j = 0; + struct ll *ll; + union value *last_value = NULL; - if ( i > 0 ) - tab_hline (tbl, TAL_1, 0, n_cols -1 , i * n_factors + heading_rows); + if ( v > 0 ) + tab_hline (tbl, TAL_1, 0, n_cols -1 , + v * ll_count (&fctr->result_list) + + heading_rows); tab_text (tbl, - 0, i * n_factors + heading_rows, + 0, + v * ll_count (&fctr->result_list) + heading_rows, TAB_LEFT | TAT_TITLE, - var_to_string (dependent_var[i]) + var_to_string (dependent_var[v]) ); - if ( !fctr ) - populate_summary (tbl, heading_columns, - (i * n_factors) + heading_rows, - &totals[i]); - else + + for (ll = ll_head (&fctr->result_list); + ll != ll_null (&fctr->result_list); ll = ll_next (ll)) { - struct factor_statistics **fs = fctr->fs; - int count = 0 ; - const union value *prev = NULL; + double n; + const struct factor_result *result = + ll_data (ll, struct factor_result, ll); - while (*fs) + if ( fctr->indep_var[0] ) { - if ( !prev || - 0 != compare_values (prev, (*fs)->id[0], - var_get_width (fctr->indep_var[0]))) + + if ( last_value == NULL || + compare_values (last_value, result->value[0], + fctr->indep_var[0])) { - struct string vstr; - ds_init_empty (&vstr); - var_append_value_name (fctr->indep_var[0], - (*fs)->id[0], &vstr); - - tab_text (tbl, - 1, - (i * n_factors ) + count + - heading_rows, + struct string str; + + last_value = result->value[0]; + ds_init_empty (&str); + + var_append_value_name (fctr->indep_var[0], result->value[0], + &str); + + tab_text (tbl, 1, + heading_rows + j + + v * ll_count (&fctr->result_list), TAB_LEFT | TAT_TITLE, - ds_cstr (&vstr) - ); + ds_cstr (&str)); - ds_destroy (&vstr); + ds_destroy (&str); - if (fctr->indep_var[1] && count > 0 ) + if ( fctr->indep_var[1] && j > 0) tab_hline (tbl, TAL_1, 1, n_cols - 1, - (i * n_factors ) + count + heading_rows); + heading_rows + j + + v * ll_count (&fctr->result_list)); } - prev = (*fs)->id[0]; - if ( fctr->indep_var[1]) { - struct string vstr; - ds_init_empty (&vstr); + struct string str; + + ds_init_empty (&str); + var_append_value_name (fctr->indep_var[1], - (*fs)->id[1], &vstr); - tab_text (tbl, - 2, - (i * n_factors ) + count + - heading_rows, + result->value[1], &str); + + tab_text (tbl, 2, + heading_rows + j + + v * ll_count (&fctr->result_list), TAB_LEFT | TAT_TITLE, - ds_cstr (&vstr) - ); - ds_destroy (&vstr); + ds_cstr (&str)); + + ds_destroy (&str); } + } - populate_summary (tbl, heading_columns, - (i * n_factors) + count - + heading_rows, - & (*fs)->m[i]); - count++ ; - fs++; - } + moments1_calculate (result->metrics[v].moments, + &n, &result->metrics[v].mean, + &result->metrics[v].variance, + &result->metrics[v].skewness, + &result->metrics[v].kurtosis); + + result->metrics[v].se_mean = sqrt (result->metrics[v].variance / n) ; + + /* Total Valid */ + tab_float (tbl, heading_columns, + heading_rows + j + v * ll_count (&fctr->result_list), + TAB_LEFT, + n, 8, 0); + + tab_text (tbl, heading_columns + 1, + heading_rows + j + v * ll_count (&fctr->result_list), + TAB_RIGHT | TAT_PRINTF, + "%g%%", n * 100.0 / result->metrics[v].n); + + /* Total Missing */ + tab_float (tbl, heading_columns + 2, + heading_rows + j + v * ll_count (&fctr->result_list), + TAB_LEFT, + result->metrics[v].n - n, + 8, 0); + + tab_text (tbl, heading_columns + 3, + heading_rows + j + v * ll_count (&fctr->result_list), + TAB_RIGHT | TAT_PRINTF, + "%g%%", + (result->metrics[v].n - n) * 100.0 / result->metrics[v].n + ); + + /* Total Valid + Missing */ + tab_float (tbl, heading_columns + 4, + heading_rows + j + v * ll_count (&fctr->result_list), + TAB_LEFT, + result->metrics[v].n, + 8, 0); + + tab_text (tbl, heading_columns + 5, + heading_rows + j + v * ll_count (&fctr->result_list), + TAB_RIGHT | TAT_PRINTF, + "%g%%", + (result->metrics[v].n) * 100.0 / result->metrics[v].n + ); + + ++j; } } - tab_submit (tbl); -} - - -void -populate_summary (struct tab_table *t, int col, int row, - const struct metrics *m) - -{ - const double total = m->n + m->n_missing ; - - tab_float (t, col + 0, row + 0, TAB_RIGHT, m->n, 8, 0); - tab_float (t, col + 2, row + 0, TAB_RIGHT, m->n_missing, 8, 0); - tab_float (t, col + 4, row + 0, TAB_RIGHT, total, 8, 0); - - - if ( total > 0 ) { - tab_text (t, col + 1, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%", - 100.0 * m->n / total ); - tab_text (t, col + 3, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%", - 100.0 * m->n_missing / total ); - - /* This seems a bit pointless !!! */ - tab_text (t, col + 5, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%", - 100.0 * total / total ); - } + tab_submit (tbl); } - +#define DESCRIPTIVE_ROWS 13 static void -show_extremes (const struct variable **dependent_var, int n_dep_var, - const struct factor *fctr, int n_extremities) +show_descriptives (const struct variable **dependent_var, + int n_dep_var, + const struct xfactor *fctr) { - int i; - int heading_columns ; + int v; + int heading_columns = 3; int n_cols; const int heading_rows = 1; struct tab_table *tbl; - int n_factors = 1; int n_rows ; + n_rows = n_dep_var; - if ( fctr ) - { - heading_columns = 2; - n_factors = hsh_count (fctr->fstats); + assert (fctr); - n_rows = n_dep_var * 2 * n_extremities * n_factors; + if ( fctr->indep_var[0] ) + { + heading_columns = 4; if ( fctr->indep_var[1] ) - heading_columns = 3; - } - else - { - heading_columns = 1; - n_rows = n_dep_var * 2 * n_extremities; + { + heading_columns = 5; + } } + n_rows *= ll_count (&fctr->result_list) * DESCRIPTIVE_ROWS; n_rows += heading_rows; - heading_columns += 2; n_cols = heading_columns + 2; - tbl = tab_create (n_cols,n_rows,0); + tbl = tab_create (n_cols, n_rows, 0); tab_headers (tbl, heading_columns, 0, heading_rows, 0); tab_dim (tbl, tab_natural_dimensions); - /* Outline the box, No internal lines*/ + /* Outline the box */ tab_box (tbl, TAL_2, TAL_2, -1, -1, 0, 0, n_cols - 1, n_rows - 1); - tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows ); - tab_title (tbl, _ ("Extreme Values")); + tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows ); + tab_hline (tbl, TAL_2, 1, n_cols - 1, heading_rows ); - tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows -1); - tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows -1); + tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows - 1); - if ( fctr ) - { - tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE, - var_to_string (fctr->indep_var[0])); - if ( fctr->indep_var[1] ) - tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE, - var_to_string (fctr->indep_var[1])); - } + if ( fctr->indep_var[0]) + tab_text (tbl, 1, 0, TAT_TITLE, var_to_string (fctr->indep_var[0])); - tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _ ("Value")); - tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _ ("Case Number")); + if ( fctr->indep_var[1]) + tab_text (tbl, 2, 0, TAT_TITLE, var_to_string (fctr->indep_var[1])); - for ( i = 0 ; i < n_dep_var ; ++i ) + for (v = 0 ; v < n_dep_var ; ++v ) { + struct ll *ll; + int i = 0; - if ( i > 0 ) - tab_hline (tbl, TAL_1, 0, n_cols -1 , - i * 2 * n_extremities * n_factors + heading_rows); + const int row_var_start = + v * DESCRIPTIVE_ROWS * ll_count(&fctr->result_list); - tab_text (tbl, 0, - i * 2 * n_extremities * n_factors + heading_rows, + tab_text (tbl, + 0, + heading_rows + row_var_start, TAB_LEFT | TAT_TITLE, - var_to_string (dependent_var[i]) + var_to_string (dependent_var[v]) ); + for (ll = ll_head (&fctr->result_list); + ll != ll_null (&fctr->result_list); i++, ll = ll_next (ll)) + { + const struct factor_result *result = + ll_data (ll, struct factor_result, ll); - if ( !fctr ) - populate_extremes (tbl, heading_columns - 2, - i * 2 * n_extremities * n_factors + heading_rows, - n_extremities, &totals[i]); + const double t = + gsl_cdf_tdist_Qinv ((1 - cmd.n_cinterval[0] / 100.0) / 2.0, + result->metrics[v].n - 1); - else - { - struct factor_statistics **fs = fctr->fs; - int count = 0 ; - const union value *prev = NULL; + if ( i > 0 || v > 0 ) + { + const int left_col = (i == 0) ? 0 : 1; + tab_hline (tbl, TAL_1, left_col, n_cols - 1, + heading_rows + row_var_start + i * DESCRIPTIVE_ROWS); + } - while (*fs) + if ( fctr->indep_var[0]) { - const int row = heading_rows + ( 2 * n_extremities ) * - ( ( i * n_factors ) + count ); + struct string vstr; + ds_init_empty (&vstr); + var_append_value_name (fctr->indep_var[0], + result->value[0], &vstr); + + tab_text (tbl, 1, + heading_rows + row_var_start + i * DESCRIPTIVE_ROWS, + TAB_LEFT, + ds_cstr (&vstr) + ); + ds_destroy (&vstr); + } - if ( !prev || 0 != compare_values (prev, (*fs)->id[0], - var_get_width (fctr->indep_var[0]))) - { - struct string vstr; - ds_init_empty (&vstr); - var_append_value_name (fctr->indep_var[0], - (*fs)->id[0], &vstr); - if ( count > 0 ) - tab_hline (tbl, TAL_1, 1, n_cols - 1, row); + tab_text (tbl, n_cols - 4, + heading_rows + row_var_start + i * DESCRIPTIVE_ROWS, + TAB_LEFT, + _("Mean")); + + tab_text (tbl, n_cols - 4, + heading_rows + row_var_start + 1 + i * DESCRIPTIVE_ROWS, + TAB_LEFT | TAT_PRINTF, + _("%g%% Confidence Interval for Mean"), + cmd.n_cinterval[0]); + + tab_text (tbl, n_cols - 3, + heading_rows + row_var_start + 1 + i * DESCRIPTIVE_ROWS, + TAB_LEFT, + _("Lower Bound")); + + tab_text (tbl, n_cols - 3, + heading_rows + row_var_start + 2 + i * DESCRIPTIVE_ROWS, + TAB_LEFT, + _("Upper Bound")); + + tab_text (tbl, n_cols - 4, + heading_rows + row_var_start + 3 + i * DESCRIPTIVE_ROWS, + TAB_LEFT | TAT_PRINTF, + _("5%% Trimmed Mean")); + + tab_text (tbl, n_cols - 4, + heading_rows + row_var_start + 4 + i * DESCRIPTIVE_ROWS, + TAB_LEFT, + _("Median")); + + tab_text (tbl, n_cols - 4, + heading_rows + row_var_start + 5 + i * DESCRIPTIVE_ROWS, + TAB_LEFT, + _("Variance")); + + tab_text (tbl, n_cols - 4, + heading_rows + row_var_start + 6 + i * DESCRIPTIVE_ROWS, + TAB_LEFT, + _("Std. Deviation")); + + tab_text (tbl, n_cols - 4, + heading_rows + row_var_start + 7 + i * DESCRIPTIVE_ROWS, + TAB_LEFT, + _("Minimum")); + + tab_text (tbl, n_cols - 4, + heading_rows + row_var_start + 8 + i * DESCRIPTIVE_ROWS, + TAB_LEFT, + _("Maximum")); + + tab_text (tbl, n_cols - 4, + heading_rows + row_var_start + 9 + i * DESCRIPTIVE_ROWS, + TAB_LEFT, + _("Range")); + + tab_text (tbl, n_cols - 4, + heading_rows + row_var_start + 10 + i * DESCRIPTIVE_ROWS, + TAB_LEFT, + _("Interquartile Range")); + + + tab_text (tbl, n_cols - 4, + heading_rows + row_var_start + 11 + i * DESCRIPTIVE_ROWS, + TAB_LEFT, + _("Skewness")); + + tab_text (tbl, n_cols - 4, + heading_rows + row_var_start + 12 + i * DESCRIPTIVE_ROWS, + TAB_LEFT, + _("Kurtosis")); + + + /* Now the statistics ... */ + + tab_float (tbl, n_cols - 2, + heading_rows + row_var_start + i * DESCRIPTIVE_ROWS, + TAB_CENTER, + result->metrics[v].mean, + 8, 2); + + tab_float (tbl, n_cols - 1, + heading_rows + row_var_start + i * DESCRIPTIVE_ROWS, + TAB_CENTER, + result->metrics[v].se_mean, + 8, 3); + + + tab_float (tbl, n_cols - 2, + heading_rows + row_var_start + 1 + i * DESCRIPTIVE_ROWS, + TAB_CENTER, + result->metrics[v].mean - t * + result->metrics[v].se_mean, + 8, 3); + + tab_float (tbl, n_cols - 2, + heading_rows + row_var_start + 2 + i * DESCRIPTIVE_ROWS, + TAB_CENTER, + result->metrics[v].mean + t * + result->metrics[v].se_mean, + 8, 3); + + + tab_float (tbl, n_cols - 2, + heading_rows + row_var_start + 3 + i * DESCRIPTIVE_ROWS, + TAB_CENTER, + trimmed_mean_calculate ((struct trimmed_mean *) result->metrics[v].trimmed_mean), + 8, 2); + + + tab_float (tbl, n_cols - 2, + heading_rows + row_var_start + 4 + i * DESCRIPTIVE_ROWS, + TAB_CENTER, + percentile_calculate (result->metrics[v].quartiles[1], percentile_algorithm), + 8, 2); + + + tab_float (tbl, n_cols - 2, + heading_rows + row_var_start + 5 + i * DESCRIPTIVE_ROWS, + TAB_CENTER, + result->metrics[v].variance, + 8, 3); + + tab_float (tbl, n_cols - 2, + heading_rows + row_var_start + 6 + i * DESCRIPTIVE_ROWS, + TAB_CENTER, + sqrt (result->metrics[v].variance), + 8, 3); + + tab_float (tbl, n_cols - 2, + heading_rows + row_var_start + 10 + i * DESCRIPTIVE_ROWS, + TAB_CENTER, + percentile_calculate (result->metrics[v].quartiles[2], + percentile_algorithm) - + percentile_calculate (result->metrics[v].quartiles[0], + percentile_algorithm), + 8, 2); + + + tab_float (tbl, n_cols - 2, + heading_rows + row_var_start + 11 + i * DESCRIPTIVE_ROWS, + TAB_CENTER, + result->metrics[v].skewness, + 8, 3); + + tab_float (tbl, n_cols - 2, + heading_rows + row_var_start + 12 + i * DESCRIPTIVE_ROWS, + TAB_CENTER, + result->metrics[v].kurtosis, + 8, 3); + + tab_float (tbl, n_cols - 1, + heading_rows + row_var_start + 11 + i * DESCRIPTIVE_ROWS, + TAB_CENTER, + calc_seskew (result->metrics[v].n), + 8, 3); + + tab_float (tbl, n_cols - 1, + heading_rows + row_var_start + 12 + i * DESCRIPTIVE_ROWS, + TAB_CENTER, + calc_sekurt (result->metrics[v].n), + 8, 3); + + { + struct extremum *minimum, *maximum ; + + struct ll *max_ll = ll_head (extrema_list (result->metrics[v].maxima)); + struct ll *min_ll = ll_head (extrema_list (result->metrics[v].minima)); + + maximum = ll_data (max_ll, struct extremum, ll); + minimum = ll_data (min_ll, struct extremum, ll); + + tab_float (tbl, n_cols - 2, + heading_rows + row_var_start + 7 + i * DESCRIPTIVE_ROWS, + TAB_CENTER, + minimum->value, + 8, 3); + + tab_float (tbl, n_cols - 2, + heading_rows + row_var_start + 8 + i * DESCRIPTIVE_ROWS, + TAB_CENTER, + maximum->value, + 8, 3); + + tab_float (tbl, n_cols - 2, + heading_rows + row_var_start + 9 + i * DESCRIPTIVE_ROWS, + TAB_CENTER, + maximum->value - minimum->value, + 8, 3); + } + } + } - tab_text (tbl, - 1, row, - TAB_LEFT | TAT_TITLE, - ds_cstr (&vstr) - ); + tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1); - ds_destroy (&vstr); - } + tab_title (tbl, _("Descriptives")); - prev = (*fs)->id[0]; + tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, + _("Statistic")); - if (fctr->indep_var[1] && count > 0 ) - tab_hline (tbl, TAL_1, 2, n_cols - 1, row); + tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, + _("Std. Error")); - if ( fctr->indep_var[1]) - { - struct string vstr; - ds_init_empty (&vstr); - var_append_value_name (fctr->indep_var[1], (*fs)->id[1], &vstr); + tab_submit (tbl); +} - tab_text (tbl, 2, row, - TAB_LEFT | TAT_TITLE, - ds_cstr (&vstr) - ); - ds_destroy (&vstr); - } - populate_extremes (tbl, heading_columns - 2, - row, n_extremities, - & (*fs)->m[i]); +static void +show_extremes (const struct variable **dependent_var, + int n_dep_var, + const struct xfactor *fctr) +{ + int v; + int heading_columns = 3; + int n_cols; + const int heading_rows = 1; + struct tab_table *tbl; + + int n_rows ; + n_rows = n_dep_var; - count++ ; - fs++; - } + assert (fctr); + + if ( fctr->indep_var[0] ) + { + heading_columns = 4; + + if ( fctr->indep_var[1] ) + { + heading_columns = 5; } } - tab_submit (tbl); -} + n_rows *= ll_count (&fctr->result_list) * cmd.st_n * 2; + n_rows += heading_rows; + n_cols = heading_columns + 2; + tbl = tab_create (n_cols, n_rows, 0); + tab_headers (tbl, heading_columns, 0, heading_rows, 0); -/* Fill in the extremities table */ -void -populate_extremes (struct tab_table *t, - int col, int row, int n, const struct metrics *m) -{ - int extremity; - int idx=0; + tab_dim (tbl, tab_natural_dimensions); + /* Outline the box */ + tab_box (tbl, + TAL_2, TAL_2, + -1, -1, + 0, 0, + n_cols - 1, n_rows - 1); - tab_text (t, col, row, - TAB_RIGHT | TAT_TITLE , - _ ("Highest") - ); - tab_text (t, col, row + n , - TAB_RIGHT | TAT_TITLE , - _ ("Lowest") - ); + tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows ); + tab_hline (tbl, TAL_2, 1, n_cols - 1, heading_rows ); + tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows - 1); + if ( fctr->indep_var[0]) + tab_text (tbl, 1, 0, TAT_TITLE, var_to_string (fctr->indep_var[0])); - tab_hline (t, TAL_1, col, col + 3, row + n ); + if ( fctr->indep_var[1]) + tab_text (tbl, 2, 0, TAT_TITLE, var_to_string (fctr->indep_var[1])); - for (extremity = 0; extremity < n ; ++extremity ) + for (v = 0 ; v < n_dep_var ; ++v ) { - /* Highest */ - tab_float (t, col + 1, row + extremity, - TAB_RIGHT, - extremity + 1, 8, 0); + struct ll *ll; + int i = 0; + const int row_var_start = v * cmd.st_n * 2 * ll_count(&fctr->result_list); + tab_text (tbl, + 0, + heading_rows + row_var_start, + TAB_LEFT | TAT_TITLE, + var_to_string (dependent_var[v]) + ); - /* Lowest */ - tab_float (t, col + 1, row + extremity + n, - TAB_RIGHT, - extremity + 1, 8, 0); + for (ll = ll_head (&fctr->result_list); + ll != ll_null (&fctr->result_list); i++, ll = ll_next (ll)) + { + int e ; + struct ll *min_ll; + struct ll *max_ll; + const int row_result_start = i * cmd.st_n * 2; - } + const struct factor_result *result = + ll_data (ll, struct factor_result, ll); + if (i > 0 || v > 0) + tab_hline (tbl, TAL_1, 1, n_cols - 1, + heading_rows + row_var_start + row_result_start); - /* Lowest */ - for (idx = 0, extremity = 0; extremity < n && idx < m->n_data ; ++idx ) - { - int j; - const struct weighted_value *wv = m->wvp[idx]; - struct case_node *cn = wv->case_nos; + tab_hline (tbl, TAL_1, heading_columns - 2, n_cols - 1, + heading_rows + row_var_start + row_result_start + cmd.st_n); + for ( e = 1; e <= cmd.st_n; ++e ) + { + tab_text (tbl, n_cols - 3, + heading_rows + row_var_start + row_result_start + e - 1, + TAB_RIGHT | TAT_PRINTF, + _("%d"), e); + + tab_text (tbl, n_cols - 3, + heading_rows + row_var_start + row_result_start + cmd.st_n + e - 1, + TAB_RIGHT | TAT_PRINTF, + _("%d"), e); + } - for (j = 0 ; j < wv->w ; ++j ) - { - if ( extremity + j >= n ) - break ; - tab_float (t, col + 3, row + extremity + j + n, - TAB_RIGHT, - wv->v.f, 8, 2); + min_ll = ll_head (extrema_list (result->metrics[v].minima)); + for (e = 0; e < cmd.st_n;) + { + struct extremum *minimum = ll_data (min_ll, struct extremum, ll); + double weight = minimum->weight; - tab_float (t, col + 2, row + extremity + j + n, - TAB_RIGHT, - cn->num, 8, 0); + while (weight-- > 0 && e < cmd.st_n) + { + tab_float (tbl, n_cols - 1, + heading_rows + row_var_start + row_result_start + cmd.st_n + e, + TAB_RIGHT, + minimum->value, + 8, 2); + + + tab_float (tbl, n_cols - 2, + heading_rows + row_var_start + row_result_start + cmd.st_n + e, + TAB_RIGHT, + minimum->location, + 8, 0); + ++e; + } - if ( cn->next ) - cn = cn->next; + min_ll = ll_next (min_ll); + } - } - extremity += wv->w ; - } + max_ll = ll_head (extrema_list (result->metrics[v].maxima)); + for (e = 0; e < cmd.st_n;) + { + struct extremum *maximum = ll_data (max_ll, struct extremum, ll); + double weight = maximum->weight; + while (weight-- > 0 && e < cmd.st_n) + { + tab_float (tbl, n_cols - 1, + heading_rows + row_var_start + row_result_start + e, + TAB_RIGHT, + maximum->value, + 8, 2); + + + tab_float (tbl, n_cols - 2, + heading_rows + row_var_start + row_result_start + e, + TAB_RIGHT, + maximum->location, + 8, 0); + ++e; + } - /* Highest */ - for (idx = m->n_data - 1, extremity = 0; extremity < n && idx >= 0; --idx ) - { - int j; - const struct weighted_value *wv = m->wvp[idx]; - struct case_node *cn = wv->case_nos; + max_ll = ll_next (max_ll); + } - for (j = 0 ; j < wv->w ; ++j ) - { - if ( extremity + j >= n ) - break ; - tab_float (t, col + 3, row + extremity + j, + if ( fctr->indep_var[0]) + { + struct string vstr; + ds_init_empty (&vstr); + var_append_value_name (fctr->indep_var[0], + result->value[0], &vstr); + + tab_text (tbl, 1, + heading_rows + row_var_start + row_result_start, + TAB_LEFT, + ds_cstr (&vstr) + ); + + ds_destroy (&vstr); + } + + + tab_text (tbl, n_cols - 4, + heading_rows + row_var_start + row_result_start, TAB_RIGHT, - wv->v.f, 8, 2); + _("Highest")); - tab_float (t, col + 2, row + extremity + j, + tab_text (tbl, n_cols - 4, + heading_rows + row_var_start + row_result_start + cmd.st_n, TAB_RIGHT, - cn->num, 8, 0); + _("Lowest")); + } + } - if ( cn->next ) - cn = cn->next; + tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1); - } - extremity += wv->w ; - } + tab_title (tbl, _("Extreme Values")); + + + tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, + _("Case Number")); + + + tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, + _("Value")); + + tab_submit (tbl); } +#define PERCENTILE_ROWS 2 -/* Show the descriptives table */ -void -show_descriptives (const struct variable **dependent_var, +static void +show_percentiles (const struct variable **dependent_var, int n_dep_var, - struct factor *fctr) + const struct xfactor *fctr) { int i; - int heading_columns ; + int v; + int heading_columns = 2; int n_cols; - const int n_stat_rows = 13; - - const int heading_rows = 1; - + const int n_percentiles = subc_list_double_count (&percentile_list); + const int heading_rows = 2; struct tab_table *tbl; - int n_factors = 1; int n_rows ; + n_rows = n_dep_var; - if ( fctr ) - { - heading_columns = 4; - n_factors = hsh_count (fctr->fstats); - - n_rows = n_dep_var * n_stat_rows * n_factors; + assert (fctr); - if ( fctr->indep_var[1] ) - heading_columns = 5; - } - else + if ( fctr->indep_var[0] ) { heading_columns = 3; - n_rows = n_dep_var * n_stat_rows; + + if ( fctr->indep_var[1] ) + { + heading_columns = 4; + } } + n_rows *= ll_count (&fctr->result_list) * PERCENTILE_ROWS; n_rows += heading_rows; - n_cols = heading_columns + 2; - + n_cols = heading_columns + n_percentiles; tbl = tab_create (n_cols, n_rows, 0); - - tab_headers (tbl, heading_columns + 1, 0, heading_rows, 0); + tab_headers (tbl, heading_columns, 0, heading_rows, 0); tab_dim (tbl, tab_natural_dimensions); - /* Outline the box and have no internal lines*/ + /* Outline the box */ tab_box (tbl, TAL_2, TAL_2, -1, -1, 0, 0, n_cols - 1, n_rows - 1); - tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows ); - - tab_vline (tbl, TAL_1, heading_columns, 0, n_rows - 1); - tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows - 1); - tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows - 1); - tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _ ("Statistic")); - tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _ ("Std. Error")); + tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows ); + tab_hline (tbl, TAL_2, 1, n_cols - 1, heading_rows ); - tab_title (tbl, _ ("Descriptives")); + if ( fctr->indep_var[0]) + tab_text (tbl, 1, 1, TAT_TITLE, var_to_string (fctr->indep_var[0])); + if ( fctr->indep_var[1]) + tab_text (tbl, 2, 1, TAT_TITLE, var_to_string (fctr->indep_var[1])); - for ( i = 0 ; i < n_dep_var ; ++i ) + for (v = 0 ; v < n_dep_var ; ++v ) { - const int row = heading_rows + i * n_stat_rows * n_factors ; + double hinges[3]; + struct ll *ll; + int i = 0; - if ( i > 0 ) - tab_hline (tbl, TAL_1, 0, n_cols - 1, row ); + const int row_var_start = + v * PERCENTILE_ROWS * ll_count(&fctr->result_list); - tab_text (tbl, 0, - i * n_stat_rows * n_factors + heading_rows, + tab_text (tbl, + 0, + heading_rows + row_var_start, TAB_LEFT | TAT_TITLE, - var_to_string (dependent_var[i]) + var_to_string (dependent_var[v]) ); - - if ( fctr ) + for (ll = ll_head (&fctr->result_list); + ll != ll_null (&fctr->result_list); i++, ll = ll_next (ll)) { - const union value *prev = NULL; + int j; + const struct factor_result *result = + ll_data (ll, struct factor_result, ll); - struct factor_statistics **fs = fctr->fs; - int count = 0; + if ( i > 0 || v > 0 ) + { + const int left_col = (i == 0) ? 0 : 1; + tab_hline (tbl, TAL_1, left_col, n_cols - 1, + heading_rows + row_var_start + i * PERCENTILE_ROWS); + } - tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE, - var_to_string (fctr->indep_var[0])); + if ( fctr->indep_var[0]) + { + struct string vstr; + ds_init_empty (&vstr); + var_append_value_name (fctr->indep_var[0], + result->value[0], &vstr); + + tab_text (tbl, 1, + heading_rows + row_var_start + i * PERCENTILE_ROWS, + TAB_LEFT, + ds_cstr (&vstr) + ); + ds_destroy (&vstr); + } - if ( fctr->indep_var[1]) - tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE, - var_to_string (fctr->indep_var[1])); - while ( *fs ) - { - const int row = heading_rows + n_stat_rows * - ( ( i * n_factors ) + count ); + tab_text (tbl, n_cols - n_percentiles - 1, + heading_rows + row_var_start + i * PERCENTILE_ROWS, + TAB_LEFT, + ptile_alg_desc [percentile_algorithm]); - if ( !prev || 0 != compare_values (prev, (*fs)->id[0], - var_get_width (fctr->indep_var[0]))) - { - struct string vstr; - ds_init_empty (&vstr); - var_append_value_name (fctr->indep_var[0], - (*fs)->id[0], &vstr); - - if ( count > 0 ) - tab_hline (tbl, TAL_1, 1, n_cols - 1, row); - - tab_text (tbl, - 1, row, - TAB_LEFT | TAT_TITLE, - ds_cstr (&vstr) - ); - - ds_destroy (&vstr); - } - - prev = (*fs)->id[0]; - - if (fctr->indep_var[1] && count > 0 ) - tab_hline (tbl, TAL_1, 2, n_cols - 1, row); - - if ( fctr->indep_var[1]) - { - struct string vstr; - ds_init_empty (&vstr); - var_append_value_name (fctr->indep_var[1], (*fs)->id[1], &vstr); - - tab_text (tbl, 2, row, - TAB_LEFT | TAT_TITLE, - ds_cstr (&vstr) - ); - - ds_destroy (&vstr); - } - - populate_descriptives (tbl, heading_columns - 2, - row, & (*fs)->m[i]); - - count++ ; - fs++; - } - - } - - else - { - - populate_descriptives (tbl, heading_columns - 2, - i * n_stat_rows * n_factors + heading_rows, - &totals[i]); - } - } - - tab_submit (tbl); - -} - - -/* Fill in the descriptives data */ -void -populate_descriptives (struct tab_table *tbl, int col, int row, - const struct metrics *m) -{ - const double t = gsl_cdf_tdist_Qinv ((1 - cmd.n_cinterval[0] / 100.0)/2.0, - m->n -1); - - tab_text (tbl, col, - row, - TAB_LEFT | TAT_TITLE, - _ ("Mean")); - - tab_float (tbl, col + 2, - row, - TAB_CENTER, - m->mean, - 8,2); - - tab_float (tbl, col + 3, - row, - TAB_CENTER, - m->se_mean, - 8,3); - - - tab_text (tbl, col, - row + 1, - TAB_LEFT | TAT_TITLE | TAT_PRINTF, - _ ("%g%% Confidence Interval for Mean"), cmd.n_cinterval[0]); - - - tab_text (tbl, col + 1, - row + 1, - TAB_LEFT | TAT_TITLE, - _ ("Lower Bound")); - - tab_float (tbl, col + 2, - row + 1, - TAB_CENTER, - m->mean - t * m->se_mean, - 8,3); - - tab_text (tbl, col + 1, - row + 2, - TAB_LEFT | TAT_TITLE, - _ ("Upper Bound")); - - - tab_float (tbl, col + 2, - row + 2, - TAB_CENTER, - m->mean + t * m->se_mean, - 8,3); - - tab_text (tbl, col, - row + 3, - TAB_LEFT | TAT_TITLE | TAT_PRINTF, - _ ("5%% Trimmed Mean")); - - tab_float (tbl, col + 2, - row + 3, - TAB_CENTER, - m->trimmed_mean, - 8,2); - - tab_text (tbl, col, - row + 4, - TAB_LEFT | TAT_TITLE, - _ ("Median")); - - { - struct percentile *p; - double d = 50; - - p = hsh_find (m->ptile_hash, &d); - - assert (p); - - - tab_float (tbl, col + 2, - row + 4, - TAB_CENTER, - p->v, - 8, 2); - } - - - tab_text (tbl, col, - row + 5, - TAB_LEFT | TAT_TITLE, - _ ("Variance")); - - tab_float (tbl, col + 2, - row + 5, - TAB_CENTER, - m->var, - 8,3); - - - tab_text (tbl, col, - row + 6, - TAB_LEFT | TAT_TITLE, - _ ("Std. Deviation")); - - - tab_float (tbl, col + 2, - row + 6, - TAB_CENTER, - m->stddev, - 8,3); - - - tab_text (tbl, col, - row + 7, - TAB_LEFT | TAT_TITLE, - _ ("Minimum")); - - tab_float (tbl, col + 2, - row + 7, - TAB_CENTER, - m->min, - 8,3); - - tab_text (tbl, col, - row + 8, - TAB_LEFT | TAT_TITLE, - _ ("Maximum")); - - tab_float (tbl, col + 2, - row + 8, - TAB_CENTER, - m->max, - 8,3); - - - tab_text (tbl, col, - row + 9, - TAB_LEFT | TAT_TITLE, - _ ("Range")); - - - tab_float (tbl, col + 2, - row + 9, - TAB_CENTER, - m->max - m->min, - 8,3); - - tab_text (tbl, col, - row + 10, - TAB_LEFT | TAT_TITLE, - _ ("Interquartile Range")); - - { - struct percentile *p1; - struct percentile *p2; - - double d = 75; - p1 = hsh_find (m->ptile_hash, &d); - - d = 25; - p2 = hsh_find (m->ptile_hash, &d); - - assert (p1); - assert (p2); - - tab_float (tbl, col + 2, - row + 10, - TAB_CENTER, - p1->v - p2->v, - 8, 2); - } - - - - tab_text (tbl, col, - row + 11, - TAB_LEFT | TAT_TITLE, - _ ("Skewness")); - - - tab_float (tbl, col + 2, - row + 11, - TAB_CENTER, - m->skewness, - 8,3); - - /* stderr of skewness */ - tab_float (tbl, col + 3, - row + 11, - TAB_CENTER, - calc_seskew (m->n), - 8,3); - - - tab_text (tbl, col, - row + 12, - TAB_LEFT | TAT_TITLE, - _ ("Kurtosis")); - - - tab_float (tbl, col + 2, - row + 12, - TAB_CENTER, - m->kurtosis, - 8,3); - - /* stderr of kurtosis */ - tab_float (tbl, col + 3, - row + 12, - TAB_CENTER, - calc_sekurt (m->n), - 8,3); - - -} - - - -void -box_plot_variables (const struct factor *fctr, - const struct variable **vars, int n_vars, - const struct variable *id) -{ + tab_text (tbl, n_cols - n_percentiles - 1, + heading_rows + row_var_start + 1 + i * PERCENTILE_ROWS, + TAB_LEFT, + _("Tukey's Hinges")); - int i; - struct factor_statistics **fs ; - if ( ! fctr ) - { - box_plot_group (fctr, vars, n_vars, id); - return; - } + tab_vline (tbl, TAL_1, n_cols - n_percentiles -1, heading_rows, n_rows - 1); - for ( fs = fctr->fs ; *fs ; ++fs ) - { - struct string str; - double y_min = DBL_MAX; - double y_max = -DBL_MAX; - struct chart *ch = chart_create (); - ds_init_empty (&str); - factor_to_string (fctr, *fs, 0, &str ); + tukey_hinges_calculate ((struct tukey_hinges *) result->metrics[v].tukey_hinges, + hinges); - chart_write_title (ch, ds_cstr (&str)); - - for ( i = 0 ; i < n_vars ; ++i ) - { - y_max = MAX (y_max, (*fs)->m[i].max); - y_min = MIN (y_min, (*fs)->m[i].min); - } - - boxplot_draw_yscale (ch, y_max, y_min); - - for ( i = 0 ; i < n_vars ; ++i ) - { - - const double box_width = (ch->data_right - ch->data_left) - / (n_vars * 2.0 ) ; - - const double box_centre = ( i * 2 + 1) * box_width - + ch->data_left; - - boxplot_draw_boxplot (ch, - box_centre, box_width, - & (*fs)->m[i], - var_to_string (vars[i])); - - - } - - chart_submit (ch); - ds_destroy (&str); - } -} - - - -/* Do a box plot, grouping all factors into one plot ; - each dependent variable has its own plot. -*/ -void -box_plot_group (const struct factor *fctr, - const struct variable **vars, - int n_vars, - const struct variable *id UNUSED) -{ - - int i; - - for ( i = 0 ; i < n_vars ; ++i ) - { - struct factor_statistics **fs ; - struct chart *ch; - - ch = chart_create (); - - boxplot_draw_yscale (ch, totals[i].max, totals[i].min); - - if ( fctr ) - { - int n_factors = 0; - int f=0; - for ( fs = fctr->fs ; *fs ; ++fs ) - ++n_factors; - - chart_write_title (ch, _ ("Boxplot of %s vs. %s"), - var_to_string (vars[i]), var_to_string (fctr->indep_var[0]) ); - - for ( fs = fctr->fs ; *fs ; ++fs ) + for (j = 0; j < n_percentiles; ++j) { - struct string str; - const double box_width = (ch->data_right - ch->data_left) - / (n_factors * 2.0 ) ; - - const double box_centre = ( f++ * 2 + 1) * box_width - + ch->data_left; + double hinge = SYSMIS; + tab_float (tbl, n_cols - n_percentiles + j, + heading_rows + row_var_start + i * PERCENTILE_ROWS, + TAB_CENTER, + percentile_calculate (result->metrics[v].ptl[j], + percentile_algorithm), + 8, 2 + ); + + if ( result->metrics[v].ptl[j]->ptile == 0.5) + hinge = hinges[1]; + else if ( result->metrics[v].ptl[j]->ptile == 0.25) + hinge = hinges[0]; + else if ( result->metrics[v].ptl[j]->ptile == 0.75) + hinge = hinges[2]; + + if ( hinge != SYSMIS) + tab_float (tbl, n_cols - n_percentiles + j, + heading_rows + row_var_start + 1 + i * PERCENTILE_ROWS, + TAB_CENTER, + hinge, + 8, 2 + ); - ds_init_empty (&str); - factor_to_string_concise (fctr, *fs, &str); - - boxplot_draw_boxplot (ch, - box_centre, box_width, - & (*fs)->m[i], - ds_cstr (&str)); - ds_destroy (&str); } } - else if ( ch ) - { - const double box_width = (ch->data_right - ch->data_left) / 3.0; - const double box_centre = (ch->data_right + ch->data_left) / 2.0; - - chart_write_title (ch, _ ("Boxplot")); - - boxplot_draw_boxplot (ch, - box_centre, box_width, - &totals[i], - var_to_string (vars[i]) ); - - } - - chart_submit (ch); } -} - - -/* Plot the normal and detrended normal plots for m - Label the plots with factorname */ -void -np_plot (const struct metrics *m, const char *factorname) -{ - int i; - double yfirst=0, ylast=0; - - /* Normal Plot */ - struct chart *np_chart; - - /* Detrended Normal Plot */ - struct chart *dnp_chart; - - /* The slope and intercept of the ideal normal probability line */ - const double slope = 1.0 / m->stddev; - const double intercept = - m->mean / m->stddev; - - /* Cowardly refuse to plot an empty data set */ - if ( m->n_data == 0 ) - return ; - - np_chart = chart_create (); - dnp_chart = chart_create (); - - if ( !np_chart || ! dnp_chart ) - return ; - - chart_write_title (np_chart, _ ("Normal Q-Q Plot of %s"), factorname); - chart_write_xlabel (np_chart, _ ("Observed Value")); - chart_write_ylabel (np_chart, _ ("Expected Normal")); - - - chart_write_title (dnp_chart, _ ("Detrended Normal Q-Q Plot of %s"), - factorname); - chart_write_xlabel (dnp_chart, _ ("Observed Value")); - chart_write_ylabel (dnp_chart, _ ("Dev from Normal")); - - yfirst = gsl_cdf_ugaussian_Pinv (m->wvp[0]->rank / ( m->n + 1)); - ylast = gsl_cdf_ugaussian_Pinv (m->wvp[m->n_data-1]->rank / ( m->n + 1)); - - - { - /* Need to make sure that both the scatter plot and the ideal fit into the - plot */ - double x_lower = MIN (m->min, (yfirst - intercept) / slope) ; - double x_upper = MAX (m->max, (ylast - intercept) / slope) ; - double slack = (x_upper - x_lower) * 0.05 ; - chart_write_xscale (np_chart, x_lower - slack, x_upper + slack, 5); - - chart_write_xscale (dnp_chart, m->min, m->max, 5); - - } - - chart_write_yscale (np_chart, yfirst, ylast, 5); - - { - /* We have to cache the detrended data, beacause we need to - find its limits before we can plot it */ - double *d_data = xnmalloc (m->n_data, sizeof *d_data); - double d_max = -DBL_MAX; - double d_min = DBL_MAX; - for ( i = 0 ; i < m->n_data; ++i ) - { - const double ns = gsl_cdf_ugaussian_Pinv (m->wvp[i]->rank / ( m->n + 1)); - - chart_datum (np_chart, 0, m->wvp[i]->v.f, ns); - - d_data[i] = (m->wvp[i]->v.f - m->mean) / m->stddev - ns; - - if ( d_data[i] < d_min ) d_min = d_data[i]; - if ( d_data[i] > d_max ) d_max = d_data[i]; - } - chart_write_yscale (dnp_chart, d_min, d_max, 5); - - for ( i = 0 ; i < m->n_data; ++i ) - chart_datum (dnp_chart, 0, m->wvp[i]->v.f, d_data[i]); - - free (d_data); - } - - chart_line (np_chart, slope, intercept, yfirst, ylast , CHART_DIM_Y); - chart_line (dnp_chart, 0, 0, m->min, m->max , CHART_DIM_X); - - chart_submit (np_chart); - chart_submit (dnp_chart); -} - - - - -/* Show the percentiles */ -void -show_percentiles (const struct variable **dependent_var, - int n_dep_var, - struct factor *fctr) -{ - struct tab_table *tbl; - int i; - - int n_cols, n_rows; - int n_factors; - - struct hsh_table *ptiles ; + tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1); - int n_heading_columns; - const int n_heading_rows = 2; - const int n_stat_rows = 2; + tab_title (tbl, _("Percentiles")); - int n_ptiles ; - if ( fctr ) + for (i = 0 ; i < n_percentiles; ++i ) { - struct factor_statistics **fs = fctr->fs ; - n_heading_columns = 3; - n_factors = hsh_count (fctr->fstats); - - ptiles = (*fs)->m[0].ptile_hash; + tab_text (tbl, n_cols - n_percentiles + i, 1, + TAB_CENTER | TAT_TITLE | TAT_PRINTF, + _("%g"), + subc_list_double_at (&percentile_list, i) + ); - if ( fctr->indep_var[1] ) - n_heading_columns = 4; - } - else - { - n_factors = 1; - n_heading_columns = 2; - ptiles = totals[0].ptile_hash; } - n_ptiles = hsh_count (ptiles); - - n_rows = n_heading_rows + n_dep_var * n_stat_rows * n_factors; - - n_cols = n_heading_columns + n_ptiles ; - - tbl = tab_create (n_cols, n_rows, 0); - - tab_headers (tbl, n_heading_columns + 1, 0, n_heading_rows, 0); - - tab_dim (tbl, tab_natural_dimensions); - - /* Outline the box and have no internal lines*/ - tab_box (tbl, - TAL_2, TAL_2, - -1, -1, - 0, 0, - n_cols - 1, n_rows - 1); - - tab_hline (tbl, TAL_2, 0, n_cols - 1, n_heading_rows ); - - tab_vline (tbl, TAL_2, n_heading_columns, 0, n_rows - 1); - - - tab_title (tbl, _ ("Percentiles")); - - - tab_hline (tbl, TAL_1, n_heading_columns, n_cols - 1, 1 ); - - - tab_box (tbl, - -1, -1, - -1, TAL_1, - 0, n_heading_rows, - n_heading_columns - 1, n_rows - 1); - + tab_joint_text (tbl, + n_cols - n_percentiles, 0, + n_cols - 1, 0, + TAB_CENTER | TAT_TITLE, + _("Percentiles")); + /* Vertical lines for the data only */ tab_box (tbl, -1, -1, -1, TAL_1, - n_heading_columns, n_heading_rows - 1, + n_cols - n_percentiles, 1, n_cols - 1, n_rows - 1); - tab_joint_text (tbl, n_heading_columns + 1, 0, - n_cols - 1 , 0, - TAB_CENTER | TAT_TITLE , - _ ("Percentiles")); - - - { - /* Put in the percentile break points as headings */ - - struct percentile **p = (struct percentile **) hsh_sort (ptiles); - - i = 0; - while ( (*p) ) - { - tab_float (tbl, n_heading_columns + i++ , 1, - TAB_CENTER, - (*p)->p, 8, 0); - - p++; - } - - } - - for ( i = 0 ; i < n_dep_var ; ++i ) - { - const int n_stat_rows = 2; - const int row = n_heading_rows + i * n_stat_rows * n_factors ; - - if ( i > 0 ) - tab_hline (tbl, TAL_1, 0, n_cols - 1, row ); - - tab_text (tbl, 0, - i * n_stat_rows * n_factors + n_heading_rows, - TAB_LEFT | TAT_TITLE, - var_to_string (dependent_var[i]) - ); - - if ( fctr ) - { - const union value *prev = NULL ; - struct factor_statistics **fs = fctr->fs; - int count = 0; - - tab_text (tbl, 1, n_heading_rows - 1, - TAB_CENTER | TAT_TITLE, - var_to_string (fctr->indep_var[0])); - - - if ( fctr->indep_var[1]) - tab_text (tbl, 2, n_heading_rows - 1, TAB_CENTER | TAT_TITLE, - var_to_string (fctr->indep_var[1])); - - while ( *fs ) - { - const int row = n_heading_rows + n_stat_rows * - ( ( i * n_factors ) + count ); - - - if ( !prev || 0 != compare_values (prev, (*fs)->id[0], - var_get_width (fctr->indep_var[0]))) - { - struct string vstr; - ds_init_empty (&vstr); - var_append_value_name (fctr->indep_var[0], - (*fs)->id[0], &vstr); - - - if ( count > 0 ) - tab_hline (tbl, TAL_1, 1, n_cols - 1, row); - - tab_text (tbl, - 1, row, - TAB_LEFT | TAT_TITLE, - ds_cstr (&vstr) - ); - - ds_destroy (&vstr); - } - - prev = (*fs)->id[0]; - - if (fctr->indep_var[1] && count > 0 ) - tab_hline (tbl, TAL_1, 2, n_cols - 1, row); - - if ( fctr->indep_var[1]) - { - struct string vstr; - ds_init_empty (&vstr); - var_append_value_name (fctr->indep_var[1], (*fs)->id[1], &vstr); - - tab_text (tbl, 2, row, - TAB_LEFT | TAT_TITLE, - ds_cstr (&vstr) - ); - - ds_destroy (&vstr); - } - - - populate_percentiles (tbl, n_heading_columns - 1, - row, & (*fs)->m[i]); - - - count++ ; - fs++; - } - - - } - else - { - populate_percentiles (tbl, n_heading_columns - 1, - i * n_stat_rows * n_factors + n_heading_rows, - &totals[i]); - } - - - } + tab_hline (tbl, TAL_1, n_cols - n_percentiles, n_cols - 1, 1); tab_submit (tbl); - - } - - -void -populate_percentiles (struct tab_table *tbl, int col, int row, - const struct metrics *m) +static void +factor_to_string_concise (const struct xfactor *fctr, + const struct factor_result *result, + struct string *str + ) { - int i; - - struct percentile **p = (struct percentile **) hsh_sort (m->ptile_hash); - - tab_text (tbl, - col, row + 1, - TAB_LEFT | TAT_TITLE, - _ ("Tukey\'s Hinges") - ); - - tab_text (tbl, - col, row, - TAB_LEFT | TAT_TITLE, - ptile_alg_desc[m->ptile_alg] - ); - - - i = 0; - while ( (*p) ) + if (fctr->indep_var[0]) { - tab_float (tbl, col + i + 1 , row, - TAB_CENTER, - (*p)->v, 8, 2); - if ( (*p)->p == 25 ) - tab_float (tbl, col + i + 1 , row + 1, - TAB_CENTER, - m->hinge[0], 8, 2); - - if ( (*p)->p == 50 ) - tab_float (tbl, col + i + 1 , row + 1, - TAB_CENTER, - m->hinge[1], 8, 2); - - if ( (*p)->p == 75 ) - tab_float (tbl, col + i + 1 , row + 1, - TAB_CENTER, - m->hinge[2], 8, 2); + var_append_value_name (fctr->indep_var[0], result->value[0], str); + if ( fctr->indep_var[1] ) + { + ds_put_cstr (str, ","); - i++; + var_append_value_name (fctr->indep_var[1], result->value[1], str); - p++; + ds_put_cstr (str, ")"); + } } - } + static void -factor_to_string (const struct factor *fctr, - const struct factor_statistics *fs, - const struct variable *var, +factor_to_string (const struct xfactor *fctr, + const struct factor_result *result, struct string *str ) { - if (var) - ds_put_format (str, "%s (",var_to_string (var) ); - - - ds_put_format (str, "%s = ", - var_to_string (fctr->indep_var[0])); + if (fctr->indep_var[0]) + { + ds_put_format (str, "(%s = ", var_get_name (fctr->indep_var[0])); - var_append_value_name (fctr->indep_var[0], fs->id[0], str); + var_append_value_name (fctr->indep_var[0], result->value[0], str); - if ( fctr->indep_var[1] ) - { - ds_put_format (str, "; %s = )", - var_to_string (fctr->indep_var[1])); + if ( fctr->indep_var[1] ) + { + ds_put_cstr (str, ","); + ds_put_format (str, "%s = ", var_get_name (fctr->indep_var[1])); - var_append_value_name (fctr->indep_var[1], fs->id[1], str); - } - else - { - if ( var ) - ds_put_cstr (str, ")"); + var_append_value_name (fctr->indep_var[1], result->value[1], str); + } + ds_put_cstr (str, ")"); } } -static void -factor_to_string_concise (const struct factor *fctr, - const struct factor_statistics *fs, - struct string *str - ) - -{ - var_append_value_name (fctr->indep_var[0], fs->id[0], str); - - if ( fctr->indep_var[1] ) - { - ds_put_cstr (str, ","); - - var_append_value_name (fctr->indep_var[1],fs->id[1], str); - ds_put_cstr (str, ")"); - } -} /* Local Variables: diff --git a/src/language/stats/freq.c b/src/language/stats/freq.c index f2054380b8..afad84ee4b 100644 --- a/src/language/stats/freq.c +++ b/src/language/stats/freq.c @@ -30,16 +30,15 @@ compare_freq ( const void *_f1, const void *_f2, const void *_var) const struct freq *f2 = _f2; const struct variable *var = _var; - return compare_values (f1->value, f2->value, var_get_width (var) ); + return compare_values (f1->value, f2->value, var ); } unsigned int -hash_freq (const void *_f, const void *_var) +hash_freq (const void *_f, const void *var) { const struct freq *f = _f; - const struct variable *var = _var; - return hash_value (f->value, var_get_width (var)); + return hash_value (f->value, var); } /* Free function to be used on FR whose value parameter has been copied */ diff --git a/src/language/stats/frequencies.q b/src/language/stats/frequencies.q index 4f26f93d18..94b2bcf9e6 100644 --- a/src/language/stats/frequencies.q +++ b/src/language/stats/frequencies.q @@ -268,7 +268,7 @@ static hsh_compare_func compare_freq_numeric_d, compare_freq_alpha_d; static void do_piechart(const struct variable *var, const struct freq_tab *frq_tab); -gsl_histogram * +struct histogram * freq_tab_to_hist(const struct freq_tab *ft, const struct variable *var); @@ -606,31 +606,26 @@ postcalc (void) if ( chart == GFT_HIST) { double d[frq_n_stats]; - struct normal_curve norm; - gsl_histogram *hist ; - - - norm.N = vf->tab.valid_cases; + struct histogram *hist ; calc_stats (v, d); - norm.mean = d[frq_mean]; - norm.stddev = d[frq_stddev]; - hist = freq_tab_to_hist(ft,v); + hist = freq_tab_to_hist (ft,v); - histogram_plot(hist, var_to_string(v), &norm, normal); + histogram_plot_n (hist, var_to_string(v), + vf->tab.valid_cases, + d[frq_mean], + d[frq_stddev], + normal); - gsl_histogram_free(hist); + statistic_destroy ((struct statistic *)hist); } - if ( chart == GFT_PIE) { do_piechart(v_variables[i], ft); } - - cleanup_freq_tab (v); } @@ -1437,14 +1432,14 @@ dump_statistics (const struct variable *v, int show_varname) /* Create a gsl_histogram from a freq_tab */ -gsl_histogram * -freq_tab_to_hist(const struct freq_tab *ft, const struct variable *var) +struct histogram * +freq_tab_to_hist (const struct freq_tab *ft, const struct variable *var) { int i; double x_min = DBL_MAX; double x_max = -DBL_MAX; - gsl_histogram *hist; + struct statistic *hist; const double bins = 11; struct hsh_iterator hi; @@ -1461,15 +1456,15 @@ freq_tab_to_hist(const struct freq_tab *ft, const struct variable *var) if ( frq->value[0].f > x_max ) x_max = frq->value[0].f ; } - hist = histogram_create(bins, x_min, x_max); + hist = histogram_create (bins, x_min, x_max); for( i = 0 ; i < ft->n_valid ; ++i ) { frq = &ft->valid[i]; - gsl_histogram_accumulate(hist, frq->value[0].f, frq->count); + histogram_add ((struct histogram *)hist, frq->value[0].f, frq->count); } - return hist; + return (struct histogram *)hist; } diff --git a/src/language/stats/glm.q b/src/language/stats/glm.q index fd48b735e5..48c3b221fd 100644 --- a/src/language/stats/glm.q +++ b/src/language/stats/glm.q @@ -47,7 +47,7 @@ #include "xalloc.h" #include "gettext.h" -#define GLM_LARGE_DATA 1000 +#define GLM_LARGE_DATA 10000 /* (headers) */ @@ -95,19 +95,16 @@ int cmd_glm (struct lexer *lexer, struct dataset *ds); static bool run_glm (struct casereader *, struct cmd_glm *, - const struct dataset *, pspp_linreg_cache *); + const struct dataset *); int cmd_glm (struct lexer *lexer, struct dataset *ds) { struct casegrouper *grouper; struct casereader *group; - pspp_linreg_cache *model = NULL; bool ok; - model = xmalloc (sizeof *model); - if (!parse_glm (lexer, ds, &cmd, NULL)) return CMD_FAILURE; @@ -115,12 +112,11 @@ cmd_glm (struct lexer *lexer, struct dataset *ds) grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds)); while (casegrouper_get_next_group (grouper, &group)) { - run_glm (group, &cmd, ds, model); + run_glm (group, &cmd, ds); } ok = casegrouper_destroy (grouper); ok = proc_commit (ds) && ok; - free (model); free (v_dependent); return ok ? CMD_SUCCESS : CMD_FAILURE; } @@ -151,13 +147,16 @@ glm_custom_dependent (struct lexer *lexer, struct dataset *ds, return 1; } +/* + COV is the covariance matrix for variables included in the + model. That means the dependent variable is in there, too. + */ static void -coeff_init (pspp_linreg_cache * c, struct design_matrix *dm) +coeff_init (pspp_linreg_cache * c, struct design_matrix *cov) { - c->coeff = xnmalloc (dm->m->size2 + 1, sizeof (*c->coeff)); - c->coeff[0] = xmalloc (sizeof (*(c->coeff[0]))); /* The first coefficient is the intercept. */ - c->coeff[0]->v_info = NULL; /* Intercept has no associated variable. */ - pspp_coeff_init (c->coeff + 1, dm); + c->coeff = xnmalloc (cov->m->size2, sizeof (*c->coeff)); + c->n_coeffs = cov->m->size2 - 1; + pspp_coeff_init (c->coeff, cov); } /* @@ -250,8 +249,9 @@ data_pass_one (struct casereader *input, static bool run_glm (struct casereader *input, struct cmd_glm *cmd, - const struct dataset *ds, pspp_linreg_cache * model) + const struct dataset *ds) { + pspp_linreg_cache *model = NULL; size_t i; size_t j; int n_indep = 0; @@ -267,8 +267,6 @@ run_glm (struct casereader *input, pspp_linreg_opts lopts; - assert (model != NULL); - if (!casereader_peek (input, 0, &c)) { casereader_destroy (input); @@ -283,8 +281,6 @@ run_glm (struct casereader *input, 1u << DC_SYSTEM); } - - lopts.get_depvar_mean_std = 1; lopts.get_indep_mean_std = xnmalloc (n_dependent, sizeof (int)); @@ -307,15 +303,19 @@ run_glm (struct casereader *input, reader = casereader_clone (input); reader = casereader_create_filter_missing (reader, indep_vars, n_indep, - MV_ANY, NULL); + MV_ANY, NULL, NULL); reader = casereader_create_filter_missing (reader, v_dependent, 1, - MV_ANY, NULL); + MV_ANY, NULL, NULL); n_data = data_pass_one (casereader_clone (reader), (const struct variable **) all_vars, n_all_vars, mom); if ((n_data > 0) && (n_indep > 0)) { + for (i = 0; i < n_all_vars; i++) + if (var_is_alpha (all_vars[i])) + cat_stored_values_create (all_vars[i]); + X = covariance_matrix_create (n_all_vars, (const struct variable **) all_vars); @@ -329,6 +329,8 @@ run_glm (struct casereader *input, { const struct variable *v = all_vars[i]; const union value *val_v = case_data (&c, v); + if (var_is_alpha (all_vars[i])) + cat_value_update (all_vars[i], val_v); for (j = i; j < n_all_vars; j++) { const struct variable *w = all_vars[j]; @@ -339,6 +341,16 @@ run_glm (struct casereader *input, } } } + model = pspp_linreg_cache_alloc (v_dependent[0], indep_vars, n_data, n_indep); + /* + For large data sets, use QR decomposition. + */ + if (n_data > sqrt (n_indep) && n_data > GLM_LARGE_DATA) + { + model->method = PSPP_LINREG_QR; + } + coeff_init (model, X); + pspp_linreg_with_cov (X, model); casereader_destroy (reader); for (i = 0; i < n_all_vars; i++) { @@ -357,6 +369,7 @@ run_glm (struct casereader *input, } free (indep_vars); free (lopts.get_indep_mean_std); + pspp_linreg_cache_free (model); casereader_destroy (input); return true; diff --git a/src/language/stats/npar-summary.c b/src/language/stats/npar-summary.c index c9c2c9da1a..c752d503bb 100644 --- a/src/language/stats/npar-summary.c +++ b/src/language/stats/npar-summary.c @@ -52,7 +52,7 @@ npar_summary_calc_descriptives (struct descriptives *desc, pass = casereader_clone (input); pass = casereader_create_filter_missing (pass, &v, 1, - filter, NULL); + filter, NULL, NULL); pass = casereader_create_filter_weight (pass, dict, NULL, NULL); while (casereader_read(pass, &c)) { diff --git a/src/language/stats/npar.h b/src/language/stats/npar.h index 37939fe917..6aed01cf2c 100644 --- a/src/language/stats/npar.h +++ b/src/language/stats/npar.h @@ -18,6 +18,7 @@ #define npar_h 1 #include +#include #include #include @@ -36,8 +37,9 @@ struct npar_test void (*execute) (const struct dataset *, struct casereader *, enum mv_class exclude, - const struct npar_test * - ); + const struct npar_test *, + bool, + double); void (*insert_variables) (const struct npar_test *, struct const_hsh_table *); diff --git a/src/language/stats/npar.q b/src/language/stats/npar.q index 688ce2379b..34e03677dc 100644 --- a/src/language/stats/npar.q +++ b/src/language/stats/npar.q @@ -1,5 +1,5 @@ -/* PSPP - a program for statistical analysis. - Copyright (C) 2006 Free Software Foundation, Inc. +/* PSPP - a program for statistical analysis. -*-c-*- + Copyright (C) 2006, 2008 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -53,7 +54,8 @@ +friedman=varlist; +kendall=varlist; missing=miss:!analysis/listwise, - incl:include/!exclude; + incl:include/!exclude; + method=custom; +statistics[st_]=descriptives,quartiles,all. */ /* (declarations) */ @@ -70,17 +72,25 @@ struct npar_specs size_t n_tests; const struct variable ** vv; /* Compendium of all variables - (those mentioned on ANY subcommand */ + (those mentioned on ANY subcommand */ int n_vars; /* Number of variables in vv */ enum mv_class filter; /* Missing values to filter. */ bool descriptives; /* Descriptive statistics should be calculated */ bool quartiles; /* Quartiles should be calculated */ + + bool exact; /* Whether exact calculations have been requested */ + double timer; /* Maximum time (in minutes) to wait for exact calculations */ }; -void one_sample_insert_variables (const struct npar_test *test, - struct const_hsh_table *variables); +static void one_sample_insert_variables (const struct npar_test *test, + struct const_hsh_table *variables); + +static void two_sample_insert_variables (const struct npar_test *test, + struct const_hsh_table *variables); + + static void npar_execute(struct casereader *input, @@ -98,7 +108,7 @@ npar_execute(struct casereader *input, msg (SW, _("NPAR subcommand not currently implemented.")); continue; } - test->execute (ds, casereader_clone (input), specs->filter, test); + test->execute (ds, casereader_clone (input), specs->filter, test, specs->exact, specs->timer); } if ( specs->descriptives ) @@ -126,7 +136,7 @@ cmd_npar_tests (struct lexer *lexer, struct dataset *ds) { bool ok; int i; - struct npar_specs npar_specs = {0, 0, 0, 0, 0, 0, 0, 0}; + struct npar_specs npar_specs = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; struct const_hsh_table *var_hash; struct casegrouper *grouper; struct casereader *input, *group; @@ -134,8 +144,8 @@ cmd_npar_tests (struct lexer *lexer, struct dataset *ds) npar_specs.pool = pool_create (); var_hash = const_hsh_create_pool (npar_specs.pool, 0, - compare_vars_by_name, hash_var_by_name, - NULL, NULL); + compare_vars_by_name, hash_var_by_name, + NULL, NULL); if ( ! parse_npar_tests (lexer, ds, &cmd, &npar_specs) ) { @@ -183,10 +193,14 @@ cmd_npar_tests (struct lexer *lexer, struct dataset *ds) input = proc_open (ds); if ( cmd.miss == NPAR_LISTWISE ) - input = casereader_create_filter_missing (input, - npar_specs.vv, - npar_specs.n_vars, - npar_specs.filter, NULL); + { + input = casereader_create_filter_missing (input, + npar_specs.vv, + npar_specs.n_vars, + npar_specs.filter, + NULL, NULL); + } + grouper = casegrouper_create_splits (input, dataset_dict (ds)); while (casegrouper_get_next_group (grouper, &group)) @@ -202,7 +216,8 @@ cmd_npar_tests (struct lexer *lexer, struct dataset *ds) } int -npar_custom_chisquare(struct lexer *lexer, struct dataset *ds, struct cmd_npar_tests *cmd UNUSED, void *aux ) +npar_custom_chisquare (struct lexer *lexer, struct dataset *ds, + struct cmd_npar_tests *cmd UNUSED, void *aux ) { struct npar_specs *specs = aux; @@ -213,8 +228,8 @@ npar_custom_chisquare(struct lexer *lexer, struct dataset *ds, struct cmd_npar_t ((struct npar_test *)tp)->insert_variables = one_sample_insert_variables; if (!parse_variables_const_pool (lexer, specs->pool, dataset_dict (ds), - &tp->vars, &tp->n_vars, - PV_NO_SCRATCH | PV_NO_DUPLICATE)) + &tp->vars, &tp->n_vars, + PV_NO_SCRATCH | PV_NO_DUPLICATE)) { return 2; } @@ -307,7 +322,8 @@ npar_custom_chisquare(struct lexer *lexer, struct dataset *ds, struct cmd_npar_t int -npar_custom_binomial(struct lexer *lexer, struct dataset *ds, struct cmd_npar_tests *cmd UNUSED, void *aux) +npar_custom_binomial (struct lexer *lexer, struct dataset *ds, + struct cmd_npar_tests *cmd UNUSED, void *aux) { struct npar_specs *specs = aux; struct binomial_test *btp = pool_alloc(specs->pool, sizeof(*btp)); @@ -333,8 +349,8 @@ npar_custom_binomial(struct lexer *lexer, struct dataset *ds, struct cmd_npar_te if ( lex_match (lexer, '=') ) { if (parse_variables_const_pool (lexer, specs->pool, dataset_dict (ds), - &tp->vars, &tp->n_vars, - PV_NUMERIC | PV_NO_SCRATCH | PV_NO_DUPLICATE) ) + &tp->vars, &tp->n_vars, + PV_NUMERIC | PV_NO_SCRATCH | PV_NO_DUPLICATE) ) { if ( lex_match (lexer, '(')) { @@ -398,18 +414,20 @@ parse_two_sample_related_test (struct lexer *lexer, const struct variable **vlist2; size_t n_vlist2; + ((struct npar_test *)test_parameters)->insert_variables = two_sample_insert_variables; + if (!parse_variables_const_pool (lexer, pool, - dict, - &vlist1, &n_vlist1, - PV_NUMERIC | PV_NO_SCRATCH | PV_NO_DUPLICATE) ) + dict, + &vlist1, &n_vlist1, + PV_NUMERIC | PV_NO_SCRATCH | PV_NO_DUPLICATE) ) return false; if ( lex_match(lexer, T_WITH)) { with = true; if ( !parse_variables_const_pool (lexer, pool, dict, - &vlist2, &n_vlist2, - PV_NUMERIC | PV_NO_SCRATCH | PV_NO_DUPLICATE) ) + &vlist2, &n_vlist2, + PV_NUMERIC | PV_NO_SCRATCH | PV_NO_DUPLICATE) ) return false; paired = (lex_match (lexer, '(') && @@ -449,8 +467,8 @@ parse_two_sample_related_test (struct lexer *lexer, assert (n_vlist1 == n_vlist2); for ( i = 0 ; i < n_vlist1; ++i ) { - test_parameters->pairs[n][0] = vlist1[i]; - test_parameters->pairs[n][1] = vlist2[i]; + test_parameters->pairs[n][1] = vlist1[i]; + test_parameters->pairs[n][0] = vlist2[i]; n++; } } @@ -461,8 +479,8 @@ parse_two_sample_related_test (struct lexer *lexer, { for ( j = 0 ; j < n_vlist2; ++j ) { - test_parameters->pairs[n][0] = vlist1[i]; - test_parameters->pairs[n][1] = vlist2[j]; + test_parameters->pairs[n][1] = vlist1[i]; + test_parameters->pairs[n][0] = vlist2[j]; n++; } } @@ -476,8 +494,8 @@ parse_two_sample_related_test (struct lexer *lexer, for ( j = i + 1 ; j < n_vlist1; ++j ) { assert ( n < test_parameters->n_pairs); - test_parameters->pairs[n][0] = vlist1[i]; - test_parameters->pairs[n][1] = vlist1[j]; + test_parameters->pairs[n][1] = vlist1[i]; + test_parameters->pairs[n][0] = vlist1[j]; n++; } } @@ -495,8 +513,8 @@ npar_custom_wilcoxon (struct lexer *lexer, { struct npar_specs *specs = aux; - struct two_sample_test *tp = pool_alloc(specs->pool, sizeof(*tp)); - ((struct npar_test *)tp)->execute = NULL; + struct two_sample_test *tp = pool_alloc (specs->pool, sizeof(*tp)); + ((struct npar_test *)tp)->execute = wilcoxon_execute; if (!parse_two_sample_related_test (lexer, dataset_dict (ds), cmd, tp, specs->pool) ) @@ -559,9 +577,9 @@ npar_custom_sign (struct lexer *lexer, struct dataset *ds, } /* Insert the variables for TEST into VAR_HASH */ -void +static void one_sample_insert_variables (const struct npar_test *test, - struct const_hsh_table *var_hash) + struct const_hsh_table *var_hash) { int i; struct one_sample_test *ost = (struct one_sample_test *) test; @@ -570,3 +588,50 @@ one_sample_insert_variables (const struct npar_test *test, const_hsh_insert (var_hash, ost->vars[i]); } +static void +two_sample_insert_variables (const struct npar_test *test, + struct const_hsh_table *var_hash) +{ + int i; + + const struct two_sample_test *tst = (const struct two_sample_test *) test; + + for ( i = 0 ; i < tst->n_pairs ; ++i ) + { + variable_pair *pair = &tst->pairs[i]; + + const_hsh_insert (var_hash, (*pair)[0]); + const_hsh_insert (var_hash, (*pair)[1]); + } + +} + + +static int +npar_custom_method (struct lexer *lexer, struct dataset *ds UNUSED, + struct cmd_npar_tests *test UNUSED, void *aux) +{ + struct npar_specs *specs = aux; + + if ( lex_match_id (lexer, "EXACT") ) + { + specs->exact = true; + specs->timer = 0.0; + if (lex_match_id (lexer, "TIMER")) + { + specs->timer = 5.0; + + if ( lex_match (lexer, '(')) + { + if ( lex_force_num (lexer) ) + { + specs->timer = lex_number (lexer); + lex_get (lexer); + } + lex_force_match (lexer, ')'); + } + } + } + + return 1; +} diff --git a/src/language/stats/oneway.q b/src/language/stats/oneway.q index 7da992274e..87a7500465 100644 --- a/src/language/stats/oneway.q +++ b/src/language/stats/oneway.q @@ -55,7 +55,7 @@ "ONEWAY" (oneway_): *^variables=custom; missing=miss:!analysis/listwise, - incl:include/!exclude; + incl:include/!exclude; +contrast= double list; +statistics[st_]=descriptives,homogeneity. */ @@ -76,7 +76,7 @@ static const struct variable **vars; /* A hash table containing all the distinct values of the independent variables */ -static struct hsh_table *global_group_hash ; +static struct hsh_table *global_group_hash; /* The number of distinct values of the independent variable, when all missing values are disregarded */ @@ -88,19 +88,19 @@ static void run_oneway (struct cmd_oneway *, struct casereader *, /* Routines to show the output tables */ -static void show_anova_table(void); -static void show_descriptives(void); -static void show_homogeneity(void); +static void show_anova_table (void); +static void show_descriptives (void); +static void show_homogeneity (void); -static void show_contrast_coeffs(short *); -static void show_contrast_tests(short *); +static void show_contrast_coeffs (short *); +static void show_contrast_tests (short *); enum stat_table_t {STAT_DESC = 1, STAT_HOMO = 2}; -static enum stat_table_t stat_tables ; +static enum stat_table_t stat_tables; -void output_oneway(void); +void output_oneway (void); int @@ -111,25 +111,26 @@ cmd_oneway (struct lexer *lexer, struct dataset *ds) int i; bool ok; - if ( !parse_oneway (lexer, ds, &cmd, NULL) ) + if ( !parse_oneway (lexer, ds, &cmd, NULL)) return CMD_FAILURE; /* What statistics were requested */ - if ( cmd.sbc_statistics ) + if ( cmd.sbc_statistics) { - for (i = 0 ; i < ONEWAY_ST_count ; ++i ) + for (i = 0; i < ONEWAY_ST_count; ++i) { - if ( ! cmd.a_statistics[i] ) continue; - - switch (i) { - case ONEWAY_ST_DESCRIPTIVES: - stat_tables |= STAT_DESC; - break; - case ONEWAY_ST_HOMOGENEITY: - stat_tables |= STAT_HOMO; - break; - } + if (! cmd.a_statistics[i]) continue; + + switch (i) + { + case ONEWAY_ST_DESCRIPTIVES: + stat_tables |= STAT_DESC; + break; + case ONEWAY_ST_HOMOGENEITY: + stat_tables |= STAT_HOMO; + break; + } } } @@ -148,91 +149,88 @@ cmd_oneway (struct lexer *lexer, struct dataset *ds) void -output_oneway(void) +output_oneway (void) { size_t i; - short *bad_contrast ; + short *bad_contrast; bad_contrast = xnmalloc (cmd.sbc_contrast, sizeof *bad_contrast); /* Check the sanity of the given contrast values */ - for (i = 0 ; i < cmd.sbc_contrast ; ++i ) + for (i = 0; i < cmd.sbc_contrast; ++i) { int j; double sum = 0; bad_contrast[i] = 0; - if ( subc_list_double_count(&cmd.dl_contrast[i]) != - ostensible_number_of_groups ) + if (subc_list_double_count (&cmd.dl_contrast[i]) != + ostensible_number_of_groups) { - msg(SW, - _("Number of contrast coefficients must equal the number of groups")); + msg (SW, + _("Number of contrast coefficients must equal the number of groups")); bad_contrast[i] = 1; continue; } - for (j=0; j < ostensible_number_of_groups ; ++j ) - sum += subc_list_double_at(&cmd.dl_contrast[i],j); + for (j = 0; j < ostensible_number_of_groups; ++j) + sum += subc_list_double_at (&cmd.dl_contrast[i], j); if ( sum != 0.0 ) - msg(SW,_("Coefficients for contrast %zu do not total zero"), i + 1); + msg (SW, _("Coefficients for contrast %zu do not total zero"), i + 1); } if ( stat_tables & STAT_DESC ) - show_descriptives(); + show_descriptives (); if ( stat_tables & STAT_HOMO ) - show_homogeneity(); + show_homogeneity (); - show_anova_table(); + show_anova_table (); if (cmd.sbc_contrast ) { - show_contrast_coeffs(bad_contrast); - show_contrast_tests(bad_contrast); + show_contrast_coeffs (bad_contrast); + show_contrast_tests (bad_contrast); } - - free(bad_contrast); + free (bad_contrast); /* Clean up */ - for (i = 0 ; i < n_vars ; ++i ) + for (i = 0; i < n_vars; ++i ) { struct hsh_table *group_hash = group_proc_get (vars[i])->group_hash; - hsh_destroy(group_hash); + hsh_destroy (group_hash); } - hsh_destroy(global_group_hash); - + hsh_destroy (global_group_hash); } - - /* Parser for the variables sub command */ static int oneway_custom_variables (struct lexer *lexer, - struct dataset *ds, struct cmd_oneway *cmd UNUSED, - void *aux UNUSED) + struct dataset *ds, struct cmd_oneway *cmd UNUSED, + void *aux UNUSED) { struct dictionary *dict = dataset_dict (ds); lex_match (lexer, '='); - if ((lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL) + if ((lex_token (lexer) != T_ID || + dict_lookup_var (dict, lex_tokid (lexer)) == NULL) && lex_token (lexer) != T_ALL) return 2; if (!parse_variables_const (lexer, dict, &vars, &n_vars, - PV_DUPLICATE - | PV_NUMERIC | PV_NO_SCRATCH) ) + PV_DUPLICATE + | PV_NUMERIC | PV_NO_SCRATCH) ) { free (vars); return 0; } - assert(n_vars); + assert (n_vars); if ( ! lex_match (lexer, T_BY)) return 2; @@ -241,7 +239,7 @@ oneway_custom_variables (struct lexer *lexer, if ( !indep_var ) { - msg(SE,_("`%s' is not a variable name"),lex_tokid (lexer)); + msg (SE, _("`%s' is not a variable name"), lex_tokid (lexer)); return 0; } @@ -251,7 +249,7 @@ oneway_custom_variables (struct lexer *lexer, /* Show the ANOVA table */ static void -show_anova_table(void) +show_anova_table (void) { size_t i; int n_cols =7; @@ -260,7 +258,7 @@ show_anova_table(void) struct tab_table *t; - t = tab_create (n_cols,n_rows,0); + t = tab_create (n_cols, n_rows, 0); tab_headers (t, 2, 0, 1, 0); tab_dim (t, tab_natural_dimensions); @@ -282,23 +280,23 @@ show_anova_table(void) tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("Significance")); - for ( i=0 ; i < n_vars ; ++i ) + for (i = 0; i < n_vars; ++i) { struct group_statistics *totals = &group_proc_get (vars[i])->ugs; struct hsh_table *group_hash = group_proc_get (vars[i])->group_hash; struct hsh_iterator g; struct group_statistics *gs; - double ssa=0; - const char *s = var_to_string(vars[i]); + double ssa = 0; + const char *s = var_to_string (vars[i]); - for (gs = hsh_first (group_hash,&g); + for (gs = hsh_first (group_hash, &g); gs != 0; - gs = hsh_next(group_hash,&g)) + gs = hsh_next (group_hash, &g)) { - ssa += (gs->sum * gs->sum)/gs->n; + ssa += pow2 (gs->sum) / gs->n; } - ssa -= ( totals->sum * totals->sum ) / totals->n ; + ssa -= pow2 (totals->sum) / totals->n; tab_text (t, 0, i * 3 + 1, TAB_LEFT | TAT_TITLE, s); tab_text (t, 1, i * 3 + 1, TAB_LEFT | TAT_TITLE, _("Between Groups")); @@ -306,13 +304,13 @@ show_anova_table(void) tab_text (t, 1, i * 3 + 3, TAB_LEFT | TAT_TITLE, _("Total")); if (i > 0) - tab_hline(t, TAL_1, 0, n_cols - 1 , i * 3 + 1); + tab_hline (t, TAL_1, 0, n_cols - 1, i * 3 + 1); { struct group_proc *gp = group_proc_get (vars[i]); - const double sst = totals->ssq - ( totals->sum * totals->sum) / totals->n ; + const double sst = totals->ssq - pow2 (totals->sum) / totals->n; const double df1 = gp->n_groups - 1; - const double df2 = totals->n - gp->n_groups ; + const double df2 = totals->n - gp->n_groups; const double msa = ssa / df1; gp->mse = (sst - ssa) / df2; @@ -333,19 +331,16 @@ show_anova_table(void) tab_float (t, 4, i * 3 + 1, TAB_RIGHT, msa, 8, 3); tab_float (t, 4, i * 3 + 2, TAB_RIGHT, gp->mse, 8, 3); - { - const double F = msa/gp->mse ; + const double F = msa/gp->mse; /* The F value */ tab_float (t, 5, i * 3 + 1, 0, F, 8, 3); /* The significance */ - tab_float (t, 6, i * 3 + 1, 0, gsl_cdf_fdist_Q(F,df1,df2), 8, 3); + tab_float (t, 6, i * 3 + 1, 0, gsl_cdf_fdist_Q (F, df1, df2), 8, 3); } - } - } @@ -356,23 +351,23 @@ show_anova_table(void) /* Show the descriptives table */ static void -show_descriptives(void) +show_descriptives (void) { size_t v; int n_cols =10; struct tab_table *t; int row; - const double confidence=0.95; + const double confidence = 0.95; const double q = (1.0 - confidence) / 2.0; - int n_rows = 2 ; + int n_rows = 2; - for ( v = 0 ; v < n_vars ; ++v ) + for ( v = 0; v < n_vars; ++v ) n_rows += group_proc_get (vars[v])->n_groups + 1; - t = tab_create (n_cols,n_rows,0); + t = tab_create (n_cols, n_rows, 0); tab_headers (t, 2, 0, 2, 0); tab_dim (t, tab_natural_dimensions); @@ -385,7 +380,7 @@ show_descriptives(void) n_cols - 1, n_rows - 1); /* Underline headers */ - tab_hline (t, TAL_2, 0, n_cols - 1, 2 ); + tab_hline (t, TAL_2, 0, n_cols - 1, 2); tab_vline (t, TAL_2, 2, 0, n_rows - 1); tab_text (t, 2, 1, TAB_CENTER | TAT_TITLE, _("N")); @@ -394,9 +389,10 @@ show_descriptives(void) tab_text (t, 5, 1, TAB_CENTER | TAT_TITLE, _("Std. Error")); - tab_vline(t, TAL_0, 7, 0, 0); - tab_hline(t, TAL_1, 6, 7, 1); - tab_joint_text (t, 6, 0, 7, 0, TAB_CENTER | TAT_TITLE | TAT_PRINTF, _("%g%% Confidence Interval for Mean"),confidence*100.0); + tab_vline (t, TAL_0, 7, 0, 0); + tab_hline (t, TAL_1, 6, 7, 1); + tab_joint_text (t, 6, 0, 7, 0, TAB_CENTER | TAT_TITLE | TAT_PRINTF, + _("%g%% Confidence Interval for Mean"), confidence*100.0); tab_text (t, 6, 1, TAB_CENTER | TAT_TITLE, _("Lower Bound")); tab_text (t, 7, 1, TAB_CENTER | TAT_TITLE, _("Upper Bound")); @@ -409,7 +405,7 @@ show_descriptives(void) row = 2; - for ( v=0 ; v < n_vars ; ++v ) + for (v = 0; v < n_vars; ++v) { double T; double std_error; @@ -419,17 +415,17 @@ show_descriptives(void) struct group_statistics *gs; struct group_statistics *totals = &gp->ugs; - const char *s = var_to_string(vars[v]); + const char *s = var_to_string (vars[v]); struct group_statistics *const *gs_array = - (struct group_statistics *const *) hsh_sort(gp->group_hash); + (struct group_statistics *const *) hsh_sort (gp->group_hash); int count = 0; tab_text (t, 0, row, TAB_LEFT | TAT_TITLE, s); if ( v > 0) - tab_hline(t, TAL_1, 0, n_cols - 1 , row); + tab_hline (t, TAL_1, 0, n_cols - 1, row); - for (count = 0 ; count < hsh_count(gp->group_hash) ; ++count) + for (count = 0; count < hsh_count (gp->group_hash); ++count) { struct string vstr; ds_init_empty (&vstr); @@ -445,73 +441,67 @@ show_descriptives(void) /* Now fill in the numbers ... */ - tab_float (t, 2, row + count, 0, gs->n, 8,0); + tab_float (t, 2, row + count, 0, gs->n, 8, 0); - tab_float (t, 3, row + count, 0, gs->mean,8,2); + tab_float (t, 3, row + count, 0, gs->mean, 8, 2); - tab_float (t, 4, row + count, 0, gs->std_dev,8,2); + tab_float (t, 4, row + count, 0, gs->std_dev, 8, 2); - std_error = gs->std_dev/sqrt(gs->n) ; + std_error = gs->std_dev/sqrt (gs->n); tab_float (t, 5, row + count, 0, - std_error, 8,2); + std_error, 8, 2); /* Now the confidence interval */ - T = gsl_cdf_tdist_Qinv(q,gs->n - 1); + T = gsl_cdf_tdist_Qinv (q, gs->n - 1); - tab_float(t, 6, row + count, 0, - gs->mean - T * std_error, 8, 2); + tab_float (t, 6, row + count, 0, + gs->mean - T * std_error, 8, 2); - tab_float(t, 7, row + count, 0, - gs->mean + T * std_error, 8, 2); + tab_float (t, 7, row + count, 0, + gs->mean + T * std_error, 8, 2); /* Min and Max */ - - tab_float(t, 8, row + count, 0, gs->minimum, 8, 2); - tab_float(t, 9, row + count, 0, gs->maximum, 8, 2); - + tab_float (t, 8, row + count, 0, gs->minimum, 8, 2); + tab_float (t, 9, row + count, 0, gs->maximum, 8, 2); } tab_text (t, 1, row + count, - TAB_LEFT | TAT_TITLE ,_("Total")); + TAB_LEFT | TAT_TITLE, _("Total")); - tab_float (t, 2, row + count, 0, totals->n, 8,0); + tab_float (t, 2, row + count, 0, totals->n, 8, 0); - tab_float (t, 3, row + count, 0, totals->mean, 8,2); + tab_float (t, 3, row + count, 0, totals->mean, 8, 2); - tab_float (t, 4, row + count, 0, totals->std_dev,8,2); + tab_float (t, 4, row + count, 0, totals->std_dev, 8, 2); - std_error = totals->std_dev/sqrt(totals->n) ; + std_error = totals->std_dev/sqrt (totals->n); - tab_float (t, 5, row + count, 0, std_error, 8,2); + tab_float (t, 5, row + count, 0, std_error, 8, 2); /* Now the confidence interval */ - T = gsl_cdf_tdist_Qinv(q,totals->n - 1); + T = gsl_cdf_tdist_Qinv (q, totals->n - 1); - tab_float(t, 6, row + count, 0, - totals->mean - T * std_error, 8, 2); + tab_float (t, 6, row + count, 0, + totals->mean - T * std_error, 8, 2); - tab_float(t, 7, row + count, 0, - totals->mean + T * std_error, 8, 2); + tab_float (t, 7, row + count, 0, + totals->mean + T * std_error, 8, 2); /* Min and Max */ - - tab_float(t, 8, row + count, 0, totals->minimum, 8, 2); - tab_float(t, 9, row + count, 0, totals->maximum, 8, 2); + tab_float (t, 8, row + count, 0, totals->minimum, 8, 2); + tab_float (t, 9, row + count, 0, totals->maximum, 8, 2); row += gp->n_groups + 1; } - tab_submit (t); - - } /* Show the homogeneity table */ static void -show_homogeneity(void) +show_homogeneity (void) { size_t v; int n_cols = 5; @@ -520,7 +510,7 @@ show_homogeneity(void) struct tab_table *t; - t = tab_create (n_cols,n_rows,0); + t = tab_create (n_cols, n_rows, 0); tab_headers (t, 1, 0, 1, 0); tab_dim (t, tab_natural_dimensions); @@ -532,38 +522,37 @@ show_homogeneity(void) n_cols - 1, n_rows - 1); - tab_hline(t, TAL_2, 0, n_cols - 1, 1); - tab_vline(t, TAL_2, 1, 0, n_rows - 1); + tab_hline (t, TAL_2, 0, n_cols - 1, 1); + tab_vline (t, TAL_2, 1, 0, n_rows - 1); - tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Levene Statistic")); - tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("df1")); - tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df2")); - tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Significance")); - + tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Levene Statistic")); + tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("df1")); + tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("df2")); + tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Significance")); tab_title (t, _("Test of Homogeneity of Variances")); - for ( v=0 ; v < n_vars ; ++v ) + for (v = 0; v < n_vars; ++v) { double F; const struct variable *var = vars[v]; const struct group_proc *gp = group_proc_get (vars[v]); - const char *s = var_to_string(var); + const char *s = var_to_string (var); const struct group_statistics *totals = &gp->ugs; const double df1 = gp->n_groups - 1; - const double df2 = totals->n - gp->n_groups ; + const double df2 = totals->n - gp->n_groups; tab_text (t, 0, v + 1, TAB_LEFT | TAT_TITLE, s); F = gp->levene; - tab_float (t, 1, v + 1, TAB_RIGHT, F, 8,3); - tab_float (t, 2, v + 1, TAB_RIGHT, df1 ,8,0); - tab_float (t, 3, v + 1, TAB_RIGHT, df2 ,8,0); + tab_float (t, 1, v + 1, TAB_RIGHT, F, 8, 3); + tab_float (t, 2, v + 1, TAB_RIGHT, df1, 8, 0); + tab_float (t, 3, v + 1, TAB_RIGHT, df2, 8, 0); /* Now the significance */ - tab_float (t, 4, v + 1, TAB_RIGHT,gsl_cdf_fdist_Q(F,df1,df2), 8, 3); + tab_float (t, 4, v + 1, TAB_RIGHT, gsl_cdf_fdist_Q (F, df1, df2), 8, 3); } tab_submit (t); @@ -577,12 +566,12 @@ show_contrast_coeffs (short *bad_contrast) int n_cols = 2 + ostensible_number_of_groups; int n_rows = 2 + cmd.sbc_contrast; union value *group_value; - int count = 0 ; - void *const *group_values ; + int count = 0; + void *const *group_values; struct tab_table *t; - t = tab_create (n_cols,n_rows,0); + t = tab_create (n_cols, n_rows, 0); tab_headers (t, 2, 0, 2, 0); tab_dim (t, tab_natural_dimensions); @@ -594,21 +583,21 @@ show_contrast_coeffs (short *bad_contrast) n_cols - 1, n_rows - 1); tab_box (t, - -1,-1, + -1, -1, TAL_0, TAL_0, 2, 0, n_cols - 1, 0); tab_box (t, - -1,-1, + -1, -1, TAL_0, TAL_0, - 0,0, - 1,1); + 0, 0, + 1, 1); - tab_hline(t, TAL_1, 2, n_cols - 1, 1); - tab_hline(t, TAL_2, 0, n_cols - 1, 2); + tab_hline (t, TAL_1, 2, n_cols - 1, 1); + tab_hline (t, TAL_2, 0, n_cols - 1, 2); - tab_vline(t, TAL_2, 2, 0, n_rows - 1); + tab_vline (t, TAL_2, 2, 0, n_rows - 1); tab_title (t, _("Contrast Coefficients")); @@ -616,11 +605,11 @@ show_contrast_coeffs (short *bad_contrast) tab_joint_text (t, 2, 0, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, - var_to_string(indep_var)); + var_to_string (indep_var)); - group_values = hsh_sort(global_group_hash); - for (count = 0 ; - count < hsh_count(global_group_hash) ; + group_values = hsh_sort (global_group_hash); + for (count = 0; + count < hsh_count (global_group_hash); ++count) { int i; @@ -637,16 +626,16 @@ show_contrast_coeffs (short *bad_contrast) ds_destroy (&vstr); - for (i = 0 ; i < cmd.sbc_contrast ; ++i ) + for (i = 0; i < cmd.sbc_contrast; ++i ) { - tab_text(t, 1, i + 2, TAB_CENTER | TAT_PRINTF, "%d", i + 1); + tab_text (t, 1, i + 2, TAB_CENTER | TAT_PRINTF, "%d", i + 1); if ( bad_contrast[i] ) - tab_text(t, count + 2, i + 2, TAB_RIGHT, "?" ); + tab_text (t, count + 2, i + 2, TAB_RIGHT, "?" ); else - tab_text(t, count + 2, i + 2, TAB_RIGHT | TAT_PRINTF, "%g", - subc_list_double_at(&cmd.dl_contrast[i], count) - ); + tab_text (t, count + 2, i + 2, TAB_RIGHT | TAT_PRINTF, "%g", + subc_list_double_at (&cmd.dl_contrast[i], count) + ); } } @@ -656,7 +645,7 @@ show_contrast_coeffs (short *bad_contrast) /* Show the results of the contrast tests */ static void -show_contrast_tests(short *bad_contrast) +show_contrast_tests (short *bad_contrast) { size_t v; int n_cols = 8; @@ -664,7 +653,7 @@ show_contrast_tests(short *bad_contrast) struct tab_table *t; - t = tab_create (n_cols,n_rows,0); + t = tab_create (n_cols, n_rows, 0); tab_headers (t, 3, 0, 1, 0); tab_dim (t, tab_natural_dimensions); @@ -676,34 +665,34 @@ show_contrast_tests(short *bad_contrast) n_cols - 1, n_rows - 1); tab_box (t, - -1,-1, + -1, -1, TAL_0, TAL_0, 0, 0, 2, 0); - tab_hline(t, TAL_2, 0, n_cols - 1, 1); - tab_vline(t, TAL_2, 3, 0, n_rows - 1); + tab_hline (t, TAL_2, 0, n_cols - 1, 1); + tab_vline (t, TAL_2, 3, 0, n_rows - 1); tab_title (t, _("Contrast Tests")); - tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Contrast")); - tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Value of Contrast")); + tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Contrast")); + tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Value of Contrast")); tab_text (t, 4, 0, TAB_CENTER | TAT_TITLE, _("Std. Error")); tab_text (t, 5, 0, TAB_CENTER | TAT_TITLE, _("t")); tab_text (t, 6, 0, TAB_CENTER | TAT_TITLE, _("df")); tab_text (t, 7, 0, TAB_CENTER | TAT_TITLE, _("Sig. (2-tailed)")); - for ( v = 0 ; v < n_vars ; ++v ) + for (v = 0; v < n_vars; ++v) { int i; int lines_per_variable = 2 * cmd.sbc_contrast; tab_text (t, 0, (v * lines_per_variable) + 1, TAB_LEFT | TAT_TITLE, - var_to_string(vars[v])); + var_to_string (vars[v])); - for ( i = 0 ; i < cmd.sbc_contrast ; ++i ) + for (i = 0; i < cmd.sbc_contrast; ++i) { int ci; double contrast_value = 0.0; @@ -714,18 +703,18 @@ show_contrast_tests(short *bad_contrast) void *const *group_stat_array; double T; - double std_error_contrast ; + double std_error_contrast; double df; - double sec_vneq=0.0; + double sec_vneq = 0.0; /* Note: The calculation of the degrees of freedom in the "variances not equal" case is painfull!! The following formula may help to understand it: - \frac{\left(\sum_{i=1}^k{c_i^2\frac{s_i^2}{n_i}}\right)^2} + \frac{\left (\sum_{i=1}^k{c_i^2\frac{s_i^2}{n_i}}\right)^2} { - \sum_{i=1}^k\left( - \frac{\left(c_i^2\frac{s_i^2}{n_i}\right)^2} {n_i-1} + \sum_{i=1}^k\left ( + \frac{\left (c_i^2\frac{s_i^2}{n_i}\right)^2} {n_i-1} \right) } */ @@ -744,72 +733,72 @@ show_contrast_tests(short *bad_contrast) } tab_text (t, 2, (v * lines_per_variable) + i + 1, - TAB_CENTER | TAT_TITLE | TAT_PRINTF, "%d",i+1); + TAB_CENTER | TAT_TITLE | TAT_PRINTF, "%d", i + 1); tab_text (t, 2, (v * lines_per_variable) + i + 1 + cmd.sbc_contrast, - TAB_CENTER | TAT_TITLE | TAT_PRINTF, "%d",i+1); + TAB_CENTER | TAT_TITLE | TAT_PRINTF, "%d", i + 1); if ( bad_contrast[i]) continue; - group_stat_array = hsh_sort(group_hash); + group_stat_array = hsh_sort (group_hash); - for (ci = 0 ; ci < hsh_count(group_hash) ; ++ci) + for (ci = 0; ci < hsh_count (group_hash); ++ci) { - const double coef = subc_list_double_at(&cmd.dl_contrast[i], ci); + const double coef = subc_list_double_at (&cmd.dl_contrast[i], ci); struct group_statistics *gs = group_stat_array[ci]; - const double winv = (gs->std_dev * gs->std_dev) / gs->n; + const double winv = pow2 (gs->std_dev) / gs->n; contrast_value += coef * gs->mean; - coef_msq += (coef * coef) / gs->n ; + coef_msq += (coef * coef) / gs->n; - sec_vneq += (coef * coef) * (gs->std_dev * gs->std_dev ) /gs->n ; + sec_vneq += (coef * coef) * pow2 (gs->std_dev) /gs->n; df_numerator += (coef * coef) * winv; df_denominator += pow2((coef * coef) * winv) / (gs->n - 1); } - sec_vneq = sqrt(sec_vneq); + sec_vneq = sqrt (sec_vneq); df_numerator = pow2(df_numerator); tab_float (t, 3, (v * lines_per_variable) + i + 1, - TAB_RIGHT, contrast_value, 8,2); + TAB_RIGHT, contrast_value, 8, 2); tab_float (t, 3, (v * lines_per_variable) + i + 1 + cmd.sbc_contrast, - TAB_RIGHT, contrast_value, 8,2); + TAB_RIGHT, contrast_value, 8, 2); - std_error_contrast = sqrt(grp_data->mse * coef_msq); + std_error_contrast = sqrt (grp_data->mse * coef_msq); /* Std. Error */ tab_float (t, 4, (v * lines_per_variable) + i + 1, TAB_RIGHT, std_error_contrast, - 8,3); + 8, 3); - T = fabs(contrast_value / std_error_contrast) ; + T = fabs (contrast_value / std_error_contrast); /* T Statistic */ tab_float (t, 5, (v * lines_per_variable) + i + 1, TAB_RIGHT, T, - 8,3); + 8, 3); df = grp_data->ugs.n - grp_data->n_groups; /* Degrees of Freedom */ tab_float (t, 6, (v * lines_per_variable) + i + 1, TAB_RIGHT, df, - 8,0); + 8, 0); /* Significance TWO TAILED !!*/ tab_float (t, 7, (v * lines_per_variable) + i + 1, - TAB_RIGHT, 2 * gsl_cdf_tdist_Q(T,df), - 8,3); + TAB_RIGHT, 2 * gsl_cdf_tdist_Q (T, df), + 8, 3); /* Now for the Variances NOT Equal case */ @@ -818,14 +807,14 @@ show_contrast_tests(short *bad_contrast) tab_float (t, 4, (v * lines_per_variable) + i + 1 + cmd.sbc_contrast, TAB_RIGHT, sec_vneq, - 8,3); + 8, 3); T = contrast_value / sec_vneq; tab_float (t, 5, (v * lines_per_variable) + i + 1 + cmd.sbc_contrast, TAB_RIGHT, T, - 8,3); + 8, 3); df = df_numerator / df_denominator; @@ -833,19 +822,19 @@ show_contrast_tests(short *bad_contrast) tab_float (t, 6, (v * lines_per_variable) + i + 1 + cmd.sbc_contrast, TAB_RIGHT, df, - 8,3); + 8, 3); /* The Significance */ tab_float (t, 7, (v * lines_per_variable) + i + 1 + cmd.sbc_contrast, - TAB_RIGHT, 2 * gsl_cdf_tdist_Q(T,df), - 8,3); + TAB_RIGHT, 2 * gsl_cdf_tdist_Q (T, df), + 8, 3); } if ( v > 0 ) - tab_hline(t, TAL_1, 0, n_cols - 1, (v * lines_per_variable) + 1); + tab_hline (t, TAL_1, 0, n_cols - 1, (v * lines_per_variable) + 1); } tab_submit (t); @@ -855,19 +844,19 @@ show_contrast_tests(short *bad_contrast) /* ONEWAY ANOVA Calculations */ -static void postcalc ( struct cmd_oneway *cmd UNUSED ); +static void postcalc (struct cmd_oneway *cmd UNUSED); -static void precalc ( struct cmd_oneway *cmd UNUSED ); +static void precalc (struct cmd_oneway *cmd UNUSED); /* Pre calculations */ static void -precalc ( struct cmd_oneway *cmd UNUSED ) +precalc (struct cmd_oneway *cmd UNUSED) { - size_t i=0; + size_t i = 0; - for(i=0; i< n_vars ; ++i) + for (i = 0; i < n_vars; ++i) { struct group_proc *gp = group_proc_get (vars[i]); struct group_statistics *totals = &gp->ugs; @@ -876,19 +865,15 @@ precalc ( struct cmd_oneway *cmd UNUSED ) The hash contains a group_statistics structure, and is keyed by value of the independent variable */ - gp->group_hash = - hsh_create(4, - (hsh_compare_func *) compare_group, - (hsh_hash_func *) hash_group, - (hsh_free_func *) free_group, - (void *) var_get_width (indep_var) ); + gp->group_hash = hsh_create (4, compare_group, hash_group, + (hsh_free_func *) free_group, + indep_var); - - totals->sum=0; - totals->n=0; - totals->ssq=0; - totals->sum_diff=0; - totals->maximum = - DBL_MAX; + totals->sum = 0; + totals->n = 0; + totals->ssq = 0; + totals->sum_diff = 0; + totals->maximum = -DBL_MAX; totals->minimum = DBL_MAX; } } @@ -921,20 +906,20 @@ run_oneway (struct cmd_oneway *cmd, taint = taint_clone (casereader_get_taint (input)); - global_group_hash = hsh_create(4, - (hsh_compare_func *) compare_values, - (hsh_hash_func *) hash_value, - free_value, - (void *) var_get_width (indep_var) ); + global_group_hash = hsh_create (4, + compare_values, + hash_value, + free_value, + indep_var); - precalc(cmd); + precalc (cmd); exclude = cmd->incl != ONEWAY_INCLUDE ? MV_ANY : MV_SYSTEM; input = casereader_create_filter_missing (input, &indep_var, 1, - exclude, NULL); + exclude, NULL, NULL); if (cmd->miss == ONEWAY_LISTWISE) input = casereader_create_filter_missing (input, vars, n_vars, - exclude, NULL); + exclude, NULL, NULL); input = casereader_create_filter_weight (input, dict, NULL, NULL); reader = casereader_clone (input); @@ -949,7 +934,7 @@ run_oneway (struct cmd_oneway *cmd, if (*p == NULL) *p = value_dup (indep_val, var_get_width (indep_var)); - for ( i = 0 ; i < n_vars ; ++i ) + for (i = 0; i < n_vars; ++i) { const struct variable *v = vars[i]; @@ -960,29 +945,29 @@ run_oneway (struct cmd_oneway *cmd, struct group_statistics *gs; - gs = hsh_find(group_hash, (void *) indep_val ); + gs = hsh_find (group_hash, indep_val ); if ( ! gs ) { gs = xmalloc (sizeof *gs); gs->id = *indep_val; - gs->sum=0; - gs->n=0; - gs->ssq=0; - gs->sum_diff=0; + gs->sum = 0; + gs->n = 0; + gs->ssq = 0; + gs->sum_diff = 0; gs->minimum = DBL_MAX; gs->maximum = -DBL_MAX; - hsh_insert ( group_hash, (void *) gs ); + hsh_insert ( group_hash, gs ); } if (!var_is_value_missing (v, val, exclude)) { struct group_statistics *totals = &gp->ugs; - totals->n+=weight; - totals->sum+=weight * val->f; - totals->ssq+=weight * val->f * val->f; + totals->n += weight; + totals->sum += weight * val->f; + totals->ssq += weight * pow2 (val->f); if ( val->f * weight < totals->minimum ) totals->minimum = val->f * weight; @@ -990,9 +975,9 @@ run_oneway (struct cmd_oneway *cmd, if ( val->f * weight > totals->maximum ) totals->maximum = val->f * weight; - gs->n+=weight; - gs->sum+=weight * val->f; - gs->ssq+=weight * val->f * val->f; + gs->n += weight; + gs->sum += weight * val->f; + gs->ssq += weight * pow2 (val->f); if ( val->f * weight < gs->minimum ) gs->minimum = val->f * weight; @@ -1007,7 +992,7 @@ run_oneway (struct cmd_oneway *cmd, } casereader_destroy (reader); - postcalc(cmd); + postcalc (cmd); if ( stat_tables & STAT_HOMO ) @@ -1018,7 +1003,7 @@ run_oneway (struct cmd_oneway *cmd, ostensible_number_of_groups = hsh_count (global_group_hash); if (!taint_has_tainted_successor (taint)) - output_oneway(); + output_oneway (); taint_destroy (taint); } @@ -1027,10 +1012,9 @@ run_oneway (struct cmd_oneway *cmd, void postcalc ( struct cmd_oneway *cmd UNUSED ) { - size_t i=0; - + size_t i = 0; - for(i = 0; i < n_vars ; ++i) + for (i = 0; i < n_vars; ++i) { struct group_proc *gp = group_proc_get (vars[i]); struct hsh_table *group_hash = gp->group_hash; @@ -1039,35 +1023,29 @@ postcalc ( struct cmd_oneway *cmd UNUSED ) struct hsh_iterator g; struct group_statistics *gs; - for (gs = hsh_first (group_hash,&g); + for (gs = hsh_first (group_hash, &g); gs != 0; - gs = hsh_next(group_hash,&g)) + gs = hsh_next (group_hash, &g)) { - gs->mean=gs->sum / gs->n; - gs->s_std_dev= sqrt( - ( (gs->ssq / gs->n ) - gs->mean * gs->mean ) - ) ; - - gs->std_dev= sqrt( - gs->n/(gs->n-1) * - ( (gs->ssq / gs->n ) - gs->mean * gs->mean ) - ) ; + gs->mean = gs->sum / gs->n; + gs->s_std_dev = sqrt (gs->ssq / gs->n - pow2 (gs->mean)); - gs->se_mean = gs->std_dev / sqrt(gs->n); - gs->mean_diff= gs->sum_diff / gs->n; + gs->std_dev = sqrt ( + gs->n / (gs->n - 1) * + ( gs->ssq / gs->n - pow2 (gs->mean)) + ); + gs->se_mean = gs->std_dev / sqrt (gs->n); + gs->mean_diff = gs->sum_diff / gs->n; } - - totals->mean = totals->sum / totals->n; - totals->std_dev= sqrt( - totals->n/(totals->n-1) * - ( (totals->ssq / totals->n ) - totals->mean * totals->mean ) - ) ; - - totals->se_mean = totals->std_dev / sqrt(totals->n); + totals->std_dev = sqrt ( + totals->n / (totals->n - 1) * + (totals->ssq / totals->n - pow2 (totals->mean)) + ); + totals->se_mean = totals->std_dev / sqrt (totals->n); } } diff --git a/src/language/stats/rank.q b/src/language/stats/rank.q index cb63949076..13facbdbbf 100644 --- a/src/language/stats/rank.q +++ b/src/language/stats/rank.q @@ -486,7 +486,7 @@ rank_sorted_file (struct casereader *input, input = casereader_create_filter_missing (input, &rank_var, 1, - exclude_values, output); + exclude_values, NULL, output); input = casereader_create_filter_weight (input, dict, NULL, output); casereader_split (input, &pass1, &pass2); diff --git a/src/language/stats/regression.q b/src/language/stats/regression.q index 1d31d1845e..08954d87e5 100644 --- a/src/language/stats/regression.q +++ b/src/language/stats/regression.q @@ -956,9 +956,9 @@ run_regression (struct casereader *input, struct cmd_regression *cmd, n_indep = identify_indep_vars (indep_vars, dep_var); reader = casereader_clone (input); reader = casereader_create_filter_missing (reader, indep_vars, n_indep, - MV_ANY, NULL); + MV_ANY, NULL, NULL); reader = casereader_create_filter_missing (reader, &dep_var, 1, - MV_ANY, NULL); + MV_ANY, NULL, NULL); n_data = prepare_categories (casereader_clone (reader), indep_vars, n_indep, mom); @@ -973,7 +973,8 @@ run_regression (struct casereader *input, struct cmd_regression *cmd, { lopts.get_indep_mean_std[i] = 1; } - models[k] = pspp_linreg_cache_alloc (X->m->size1, X->m->size2); + models[k] = pspp_linreg_cache_alloc (dep_var, (const struct variable **) indep_vars, + X->m->size1, X->m->size2); models[k]->depvar = dep_var; /* For large data sets, use QR decomposition. diff --git a/src/language/stats/reliability.q b/src/language/stats/reliability.q new file mode 100644 index 0000000000..8384bb3503 --- /dev/null +++ b/src/language/stats/reliability.q @@ -0,0 +1,812 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include "xalloc.h" +#include "xmalloca.h" + +#include "gettext.h" +#define _(msgid) gettext (msgid) +#define N_(msgid) msgid + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +/* (headers) */ + +/* (specification) + reliability (rel_): + *^variables=varlist("PV_NO_SCRATCH | PV_NUMERIC"); + scale=custom; + missing=miss:!exclude/include; + model=custom; + method=covariance; + +summary[sum_]=total. +*/ +/* (declarations) */ +/* (functions) */ + + +static int rel_custom_scale (struct lexer *lexer, struct dataset *ds, + struct cmd_reliability *p, void *aux); + +static int rel_custom_model (struct lexer *, struct dataset *, + struct cmd_reliability *, void *); + +int cmd_reliability (struct lexer *lexer, struct dataset *ds); + +struct cronbach +{ + const struct variable **items; + size_t n_items; + double alpha; + double sum_of_variances; + double variance_of_sums; + int totals_idx; /* Casereader index into the totals */ + + struct moments1 **m ; /* Moments of the items */ + struct moments1 *total ; /* Moments of the totals */ +}; + +#if 0 +static void +dump_cronbach (const struct cronbach *s) +{ + int i; + printf ("N items %d\n", s->n_items); + for (i = 0 ; i < s->n_items; ++i) + { + printf ("%s\n", var_get_name (s->items[i])); + } + + printf ("Totals idx %d\n", s->totals_idx); + + printf ("scale variance %g\n", s->variance_of_sums); + printf ("alpha %g\n", s->alpha); + putchar ('\n'); +} +#endif + +enum model + { + MODEL_ALPHA, + MODEL_SPLIT + }; + + +struct reliability +{ + const struct variable **variables; + int n_variables; + enum mv_class exclude; + + struct cronbach *sc; + int n_sc; + + int total_start; + + struct string scale_name; + + enum model model; + int split_point; +}; + + +static double +alpha (int k, double sum_of_variances, double variance_of_sums) +{ + return k / ( k - 1.0) * ( 1 - sum_of_variances / variance_of_sums); +} + +static void reliability_summary_total (const struct reliability *rel); + +static void reliability_statistics (const struct reliability *rel); + + + +static void +run_reliability (struct casereader *group, struct dataset *ds, + struct reliability *rel); + + +int +cmd_reliability (struct lexer *lexer, struct dataset *ds) +{ + int i; + bool ok = false; + struct casegrouper *grouper; + struct casereader *group; + struct cmd_reliability cmd; + + struct reliability rel = { + NULL, 0, MV_ANY, NULL, 0, -1, + DS_EMPTY_INITIALIZER, + MODEL_ALPHA, 0}; + + cmd.v_variables = NULL; + + if ( ! parse_reliability (lexer, ds, &cmd, &rel) ) + { + goto done; + } + + rel.variables = cmd.v_variables; + rel.n_variables = cmd.n_variables; + rel.exclude = MV_ANY; + + + if (NULL == rel.sc) + { + struct cronbach *c; + /* Create a default Scale */ + + rel.n_sc = 1; + rel.sc = xzalloc (sizeof (struct cronbach) * rel.n_sc); + + ds_init_cstr (&rel.scale_name, "ANY"); + + c = &rel.sc[0]; + c->n_items = cmd.n_variables; + c->items = xzalloc (sizeof (struct variable*) * c->n_items); + + for (i = 0 ; i < c->n_items ; ++i) + c->items[i] = cmd.v_variables[i]; + } + + if ( cmd.miss == REL_INCLUDE) + rel.exclude = MV_SYSTEM; + + if ( rel.model == MODEL_SPLIT) + { + int i; + const struct cronbach *s; + + rel.n_sc += 2 ; + rel.sc = xrealloc (rel.sc, sizeof (struct cronbach) * rel.n_sc); + + s = &rel.sc[0]; + + rel.sc[1].n_items = + (rel.split_point == -1) ? s->n_items / 2 : rel.split_point; + + rel.sc[2].n_items = s->n_items - rel.sc[1].n_items; + rel.sc[1].items = xzalloc (sizeof (struct variable *) + * rel.sc[1].n_items); + + rel.sc[2].items = xzalloc (sizeof (struct variable *) * + rel.sc[2].n_items); + + for (i = 0; i < rel.sc[1].n_items ; ++i) + rel.sc[1].items[i] = s->items[i]; + + while (i < s->n_items) + { + rel.sc[2].items[i - rel.sc[1].n_items] = s->items[i]; + i++; + } + } + + if (cmd.a_summary[REL_SUM_TOTAL]) + { + int i; + const int base_sc = rel.n_sc; + + rel.total_start = base_sc; + + rel.n_sc += rel.sc[0].n_items ; + rel.sc = xrealloc (rel.sc, sizeof (struct cronbach) * rel.n_sc); + + for (i = 0 ; i < rel.sc[0].n_items; ++i ) + { + int v_src; + int v_dest = 0; + struct cronbach *s = &rel.sc[i + base_sc]; + + s->n_items = rel.sc[0].n_items - 1; + s->items = xzalloc (sizeof (struct variable *) * s->n_items); + for (v_src = 0 ; v_src < rel.sc[0].n_items ; ++v_src) + { + if ( v_src != i) + s->items[v_dest++] = rel.sc[0].items[v_src]; + } + } + } + + /* Data pass. */ + grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds)); + while (casegrouper_get_next_group (grouper, &group)) + { + run_reliability (group, ds, &rel); + + reliability_statistics (&rel); + + if (cmd.a_summary[REL_SUM_TOTAL]) + reliability_summary_total (&rel); + } + ok = casegrouper_destroy (grouper); + ok = proc_commit (ds) && ok; + + free_reliability (&cmd); + + done: + + /* Free all the stuff */ + for (i = 0 ; i < rel.n_sc; ++i) + { + int x; + struct cronbach *c = &rel.sc[i]; + free (c->items); + + moments1_destroy (c->total); + + if ( c->m) + for (x = 0 ; x < c->n_items; ++x) + moments1_destroy (c->m[x]); + + free (c->m); + } + + ds_destroy (&rel.scale_name); + free (rel.sc); + + if (ok) + return CMD_SUCCESS; + + return CMD_FAILURE; +} + +/* Return the sum of all the item variables in S */ +static double +append_sum (const struct ccase *c, casenumber n UNUSED, void *aux) +{ + double sum = 0; + const struct cronbach *s = aux; + + int v; + for (v = 0 ; v < s->n_items; ++v) + { + sum += case_data (c, s->items[v])->f; + } + + return sum; +}; + + +static void case_processing_summary (casenumber n_valid, casenumber n_missing); + +static void +run_reliability (struct casereader *input, struct dataset *ds UNUSED, + struct reliability *rel) +{ + int i; + int si; + struct ccase c; + casenumber n_missing ; + casenumber n_valid = 0; + + + for (si = 0 ; si < rel->n_sc; ++si) + { + struct cronbach *s = &rel->sc[si]; + + s->m = xzalloc (sizeof (s->m) * s->n_items); + s->total = moments1_create (MOMENT_VARIANCE); + + for (i = 0 ; i < s->n_items ; ++i ) + s->m[i] = moments1_create (MOMENT_VARIANCE); + } + + input = casereader_create_filter_missing (input, + rel->variables, + rel->n_variables, + rel->exclude, + &n_missing, + NULL); + + for (si = 0 ; si < rel->n_sc; ++si) + { + struct cronbach *s = &rel->sc[si]; + + + s->totals_idx = casereader_get_value_cnt (input); + input = + casereader_create_append_numeric (input, append_sum, + s, NULL); + } + + for (; casereader_read (input, &c); case_destroy (&c)) + { + double weight = 1.0; + n_valid ++; + + for (si = 0; si < rel->n_sc; ++si) + { + struct cronbach *s = &rel->sc[si]; + + for (i = 0 ; i < s->n_items ; ++i ) + moments1_add (s->m[i], case_data (&c, s->items[i])->f, weight); + + moments1_add (s->total, case_data_idx (&c, s->totals_idx)->f, weight); + } + } + casereader_destroy (input); + + for (si = 0; si < rel->n_sc; ++si) + { + struct cronbach *s = &rel->sc[si]; + + s->sum_of_variances = 0; + for (i = 0 ; i < s->n_items ; ++i ) + { + double weight, mean, variance; + moments1_calculate (s->m[i], &weight, &mean, &variance, NULL, NULL); + + s->sum_of_variances += variance; + } + + moments1_calculate (s->total, NULL, NULL, &s->variance_of_sums, + NULL, NULL); + + s->alpha = + alpha (s->n_items, s->sum_of_variances, s->variance_of_sums); + } + + + { + struct tab_table *tab = tab_create(1, 1, 0); + + tab_dim (tab, tab_natural_dimensions); + tab_flags (tab, SOMF_NO_TITLE ); + + tab_text(tab, 0, 0, TAT_PRINTF, "Scale: %s", ds_cstr (&rel->scale_name)); + + tab_submit(tab); + } + + + case_processing_summary (n_valid, n_missing); +} + + +static void reliability_statistics_model_alpha (struct tab_table *tbl, + const struct reliability *rel); + +static void reliability_statistics_model_split (struct tab_table *tbl, + const struct reliability *rel); + +struct reliability_output_table +{ + int n_cols; + int n_rows; + int heading_cols; + int heading_rows; + void (*populate)(struct tab_table *, const struct reliability *); +}; + +static struct reliability_output_table rol[2] = + { + { 2, 2, 1, 1, reliability_statistics_model_alpha}, + { 4, 9, 3, 0, reliability_statistics_model_split} + }; + +static void +reliability_statistics (const struct reliability *rel) +{ + int n_cols = rol[rel->model].n_cols; + int n_rows = rol[rel->model].n_rows; + int heading_columns = rol[rel->model].heading_cols; + int heading_rows = rol[rel->model].heading_rows; + + struct tab_table *tbl = tab_create (n_cols, n_rows, 0); + tab_headers (tbl, heading_columns, 0, heading_rows, 0); + + tab_dim (tbl, tab_natural_dimensions); + + tab_title (tbl, _("Reliability Statistics")); + + /* Vertical lines for the data only */ + tab_box (tbl, + -1, -1, + -1, TAL_1, + heading_columns, 0, + n_cols - 1, n_rows - 1); + + /* Box around table */ + tab_box (tbl, + TAL_2, TAL_2, + -1, -1, + 0, 0, + n_cols - 1, n_rows - 1); + + + tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows); + + tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1); + + if ( rel->model == MODEL_ALPHA ) + reliability_statistics_model_alpha (tbl, rel); + else if (rel->model == MODEL_SPLIT ) + reliability_statistics_model_split (tbl, rel); + + tab_submit (tbl); +} + +static void +reliability_summary_total (const struct reliability *rel) +{ + int i; + const int n_cols = 5; + const int heading_columns = 1; + const int heading_rows = 1; + const int n_rows = rel->sc[0].n_items + heading_rows ; + + struct tab_table *tbl = tab_create (n_cols, n_rows, 0); + tab_headers (tbl, heading_columns, 0, heading_rows, 0); + + tab_dim (tbl, tab_natural_dimensions); + + tab_title (tbl, _("Item-Total Statistics")); + + /* Vertical lines for the data only */ + tab_box (tbl, + -1, -1, + -1, TAL_1, + heading_columns, 0, + n_cols - 1, n_rows - 1); + + /* Box around table */ + tab_box (tbl, + TAL_2, TAL_2, + -1, -1, + 0, 0, + n_cols - 1, n_rows - 1); + + + tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows); + + tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1); + + tab_text (tbl, 1, 0, TAB_CENTER | TAT_TITLE, + _("Scale Mean if Item Deleted")); + + tab_text (tbl, 2, 0, TAB_CENTER | TAT_TITLE, + _("Scale Variance if Item Deleted")); + + tab_text (tbl, 3, 0, TAB_CENTER | TAT_TITLE, + _("Corrected Item-Total Correlation")); + + tab_text (tbl, 4, 0, TAB_CENTER | TAT_TITLE, + _("Cronbach's Alpha if Item Deleted")); + + + for (i = 0 ; i < rel->sc[0].n_items; ++i) + { + double cov, item_to_total_r; + double mean, weight, var; + + const struct cronbach *s = &rel->sc[rel->total_start + i]; + tab_text (tbl, 0, heading_rows + i, TAB_LEFT| TAT_TITLE, + var_to_string (rel->sc[0].items[i])); + + moments1_calculate (s->total, &weight, &mean, &var, 0, 0); + + tab_float (tbl, 1, heading_rows + i, TAB_RIGHT, + mean, 8, 3); + + tab_float (tbl, 2, heading_rows + i, TAB_RIGHT, + s->variance_of_sums, 8, 3); + + tab_float (tbl, 4, heading_rows + i, TAB_RIGHT, + s->alpha, 8, 3); + + + moments1_calculate (rel->sc[0].m[i], &weight, &mean, &var, 0,0); + cov = rel->sc[0].variance_of_sums + var - s->variance_of_sums; + cov /= 2.0; + + item_to_total_r = (cov - var) / (sqrt(var) * sqrt (s->variance_of_sums)); + + + tab_float (tbl, 3, heading_rows + i, TAB_RIGHT, + item_to_total_r, 8, 3); + } + + + tab_submit (tbl); +} + + +static void +reliability_statistics_model_alpha (struct tab_table *tbl, + const struct reliability *rel) +{ + const struct cronbach *s = &rel->sc[0]; + + tab_text (tbl, 0, 0, TAB_CENTER | TAT_TITLE, + _("Cronbach's Alpha")); + + tab_text (tbl, 1, 0, TAB_CENTER | TAT_TITLE, + _("N of items")); + + tab_float (tbl, 0, 1, TAB_RIGHT, s->alpha, 8, 3); + + tab_float (tbl, 1, 1, TAB_RIGHT, s->n_items, 8, 0); +} + + +static void +reliability_statistics_model_split (struct tab_table *tbl, + const struct reliability *rel) +{ + tab_text (tbl, 0, 0, TAB_LEFT, + _("Cronbach's Alpha")); + + tab_text (tbl, 1, 0, TAB_LEFT, + _("Part 1")); + + tab_text (tbl, 2, 0, TAB_LEFT, + _("Value")); + + tab_text (tbl, 2, 1, TAB_LEFT, + _("N of Items")); + + + + tab_text (tbl, 1, 2, TAB_LEFT, + _("Part 2")); + + tab_text (tbl, 2, 2, TAB_LEFT, + _("Value")); + + tab_text (tbl, 2, 3, TAB_LEFT, + _("N of Items")); + + + + tab_text (tbl, 1, 4, TAB_LEFT, + _("Total N of Items")); + + tab_text (tbl, 0, 5, TAB_LEFT, + _("Correlation Between Forms")); + + + tab_text (tbl, 0, 6, TAB_LEFT, + _("Spearman-Brown Coefficient")); + + tab_text (tbl, 1, 6, TAB_LEFT, + _("Equal Length")); + + tab_text (tbl, 1, 7, TAB_LEFT, + _("Unequal Length")); + + + tab_text (tbl, 0, 8, TAB_LEFT, + _("Guttman Split-Half Coefficient")); + + + + tab_float (tbl, 3, 0, TAB_RIGHT, rel->sc[1].alpha, 8, 3); + tab_float (tbl, 3, 2, TAB_RIGHT, rel->sc[2].alpha, 8, 3); + + tab_float (tbl, 3, 1, TAB_RIGHT, rel->sc[1].n_items, 8, 0); + tab_float (tbl, 3, 3, TAB_RIGHT, rel->sc[2].n_items, 8, 0); + + tab_float (tbl, 3, 4, TAB_RIGHT, + rel->sc[1].n_items + rel->sc[2].n_items, 8, 0); + + { + /* R is the correlation between the two parts */ + double r = rel->sc[0].variance_of_sums - + rel->sc[1].variance_of_sums - + rel->sc[2].variance_of_sums ; + + /* Guttman Split Half Coefficient */ + double g = 2 * r / rel->sc[0].variance_of_sums; + + /* Unequal Length Spearman Brown Coefficient, and + intermediate value used in the computation thereof */ + double uly, tmp; + + r /= sqrt (rel->sc[1].variance_of_sums); + r /= sqrt (rel->sc[2].variance_of_sums); + r /= 2.0; + + tab_float (tbl, 3, 5, TAB_RIGHT, r, 8, 3); + + /* Equal length Spearman-Brown Coefficient */ + tab_float (tbl, 3, 6, TAB_RIGHT, 2 * r / (1.0 + r), 8, 3); + + tab_float (tbl, 3, 8, TAB_RIGHT, g, 8, 3); + + tmp = (1.0 - r*r) * rel->sc[1].n_items * rel->sc[2].n_items / + pow2 (rel->sc[0].n_items); + + uly = sqrt( pow4 (r) + 4 * pow2 (r) * tmp); + uly -= pow2 (r); + uly /= 2 * tmp; + + tab_float (tbl, 3, 7, TAB_RIGHT, uly, 8, 3); + + } +} + + + +static void +case_processing_summary (casenumber n_valid, casenumber n_missing) +{ + casenumber total; + int n_cols = 4; + int n_rows = 4; + int heading_columns = 2; + int heading_rows = 1; + struct tab_table *tbl; + tbl = tab_create (n_cols, n_rows, 0); + tab_headers (tbl, heading_columns, 0, heading_rows, 0); + + tab_dim (tbl, tab_natural_dimensions); + + tab_title (tbl, _("Case Processing Summary")); + + /* Vertical lines for the data only */ + tab_box (tbl, + -1, -1, + -1, TAL_1, + heading_columns, 0, + n_cols - 1, n_rows - 1); + + /* Box around table */ + tab_box (tbl, + TAL_2, TAL_2, + -1, -1, + 0, 0, + n_cols - 1, n_rows - 1); + + + tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows); + + tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1); + + + tab_text (tbl, 0, heading_rows, TAB_LEFT | TAT_TITLE, + _("Cases")); + + tab_text (tbl, 1, heading_rows, TAB_LEFT | TAT_TITLE, + _("Valid")); + + tab_text (tbl, 1, heading_rows + 1, TAB_LEFT | TAT_TITLE, + _("Excluded")); + + tab_text (tbl, 1, heading_rows + 2, TAB_LEFT | TAT_TITLE, + _("Total")); + + tab_text (tbl, heading_columns, 0, TAB_CENTER | TAT_TITLE, + _("N")); + + tab_text (tbl, heading_columns + 1, 0, TAB_CENTER | TAT_TITLE | TAT_PRINTF, + _("%%")); + + total = n_missing + n_valid; + + tab_float (tbl, 2, heading_rows, TAB_RIGHT, + n_valid, 8, 0); + + + tab_float (tbl, 2, heading_rows + 1, TAB_RIGHT, + n_missing, 8, 0); + + + tab_float (tbl, 2, heading_rows + 2, TAB_RIGHT, + total, 8, 0); + + + tab_float (tbl, 3, heading_rows, TAB_RIGHT, + 100 * n_valid / (double) total, 8, 1); + + + tab_float (tbl, 3, heading_rows + 1, TAB_RIGHT, + 100 * n_missing / (double) total, 8, 1); + + + tab_float (tbl, 3, heading_rows + 2, TAB_RIGHT, + 100 * total / (double) total, 8, 1); + + + tab_submit (tbl); +} + +static int +rel_custom_model (struct lexer *lexer, struct dataset *ds UNUSED, + struct cmd_reliability *cmd UNUSED, void *aux) +{ + struct reliability *rel = aux; + + if (lex_match_id (lexer, "ALPHA")) + { + rel->model = MODEL_ALPHA; + } + else if (lex_match_id (lexer, "SPLIT")) + { + rel->model = MODEL_SPLIT; + rel->split_point = -1; + if ( lex_match (lexer, '(')) + { + lex_force_num (lexer); + rel->split_point = lex_number (lexer); + lex_get (lexer); + lex_force_match (lexer, ')'); + } + } + else + return 0; + + return 1; +} + + + +static int +rel_custom_scale (struct lexer *lexer, struct dataset *ds UNUSED, + struct cmd_reliability *p, void *aux) +{ + struct const_var_set *vs; + struct reliability *rel = aux; + struct cronbach *scale; + + rel->n_sc = 1; + rel->sc = xzalloc (sizeof (struct cronbach) * rel->n_sc); + scale = &rel->sc[0]; + + if ( ! lex_force_match (lexer, '(')) return 0; + + if ( ! lex_force_string (lexer) ) return 0; + + ds_init_string (&rel->scale_name, lex_tokstr (lexer)); + + lex_get (lexer); + + if ( ! lex_force_match (lexer, ')')) return 0; + + lex_match (lexer, '='); + + vs = const_var_set_create_from_array (p->v_variables, p->n_variables); + + if (!parse_const_var_set_vars (lexer, vs, &scale->items, &scale->n_items, 0)) + { + const_var_set_destroy (vs); + return 2; + } + + const_var_set_destroy (vs); + return 1; +} + +/* + Local Variables: + mode: c + End: +*/ diff --git a/src/language/stats/t-test.q b/src/language/stats/t-test.q index 5bb02e1767..3d27874362 100644 --- a/src/language/stats/t-test.q +++ b/src/language/stats/t-test.q @@ -80,8 +80,8 @@ struct group_properties /* The comparison criterion */ enum comparison criterion; - /* The width of the independent variable */ - int indep_width ; + /* The independent variable */ + struct variable *indep_var; union { /* The value of the independent variable at which groups are determined to @@ -1459,7 +1459,7 @@ common_calc (const struct dictionary *dict, gs->n += weight; gs->sum += weight * val->f; - gs->ssq += weight * val->f * val->f; + gs->ssq += weight * pow2 (val->f); } } return 0; @@ -1496,12 +1496,12 @@ common_postcalc (struct cmd_t_test *cmd) gs->mean=gs->sum / gs->n; gs->s_std_dev= sqrt ( - ( (gs->ssq / gs->n ) - gs->mean * gs->mean ) + ( (gs->ssq / gs->n ) - pow2 (gs->mean)) ) ; gs->std_dev= sqrt ( gs->n/ (gs->n-1) * - ( (gs->ssq / gs->n ) - gs->mean * gs->mean ) + ( (gs->ssq / gs->n ) - pow2 (gs->mean)) ) ; gs->se_mean = gs->std_dev / sqrt (gs->n); @@ -1676,7 +1676,7 @@ group_precalc (struct cmd_t_test *cmd ) /* There's always 2 groups for a T - TEST */ ttpr->n_groups = 2; - gp.indep_width = var_get_width (indep_var); + gp.indep_var = indep_var; ttpr->group_hash = hsh_create (2, (hsh_compare_func *) compare_group_binary, @@ -1772,12 +1772,12 @@ group_postcalc ( struct cmd_t_test *cmd ) gs->mean = gs->sum / gs->n; gs->s_std_dev= sqrt ( - ( (gs->ssq / gs->n ) - gs->mean * gs->mean ) + ( (gs->ssq / gs->n ) - pow2 (gs->mean)) ) ; gs->std_dev= sqrt ( gs->n/ (gs->n-1) * - ( (gs->ssq / gs->n ) - gs->mean * gs->mean ) + ( (gs->ssq / gs->n ) - pow2 (gs->mean)) ) ; gs->se_mean = gs->std_dev / sqrt (gs->n); @@ -1815,7 +1815,7 @@ calculate (struct cmd_t_test *cmd, input = casereader_create_filter_missing (input, cmd->v_variables, cmd->n_variables, - exclude, NULL); + exclude, NULL, NULL); input = casereader_create_filter_weight (input, dict, NULL, NULL); @@ -1888,10 +1888,6 @@ compare_group_binary (const struct group_statistics *a, if ( p->criterion == CMP_LE ) { - /* less-than comparision is not meaningfull for - alpha variables, so we shouldn't ever arrive here */ - assert (p->indep_width == 0 ) ; - flag_a = ( a->id.f < p->v.critical_value ) ; flag_b = ( b->id.f < p->v.critical_value ) ; } @@ -1918,8 +1914,6 @@ hash_group_binary (const struct group_statistics *g, if ( p->criterion == CMP_LE ) { - /* Not meaningfull to do a less than compare for alpha values ? */ - assert (p->indep_width == 0 ) ; flag = ( g->id.f < p->v.critical_value ) ; } else if ( p->criterion == CMP_EQ) @@ -1939,10 +1933,10 @@ short which_group (const struct group_statistics *g, const struct group_properties *p) { - if ( 0 == compare_values (&g->id, &p->v.g_value[0], p->indep_width)) + if ( 0 == compare_values (&g->id, &p->v.g_value[0], p->indep_var)) return 0; - if ( 0 == compare_values (&g->id, &p->v.g_value[1], p->indep_width)) + if ( 0 == compare_values (&g->id, &p->v.g_value[1], p->indep_var)) return 1; return 2; diff --git a/src/language/stats/wilcoxon.c b/src/language/stats/wilcoxon.c new file mode 100644 index 0000000000..bca0b7d96a --- /dev/null +++ b/src/language/stats/wilcoxon.c @@ -0,0 +1,419 @@ +/* Pspp - a program for statistical analysis. + Copyright (C) 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + + + +#include +#include "wilcoxon.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static double timed_wilcoxon_significance (double w, long int n, double timer); + + +static double +append_difference (const struct ccase *c, casenumber n UNUSED, void *aux) +{ + const variable_pair *vp = aux; + + return case_data (c, (*vp)[0])->f - case_data (c, (*vp)[1])->f; +} + +static void show_ranks_box (const struct wilcoxon_state *, + const struct two_sample_test *); + +static void show_tests_box (const struct wilcoxon_state *, + const struct two_sample_test *, + bool exact, double timer); + + + +static void +distinct_callback (double v UNUSED, casenumber n, double w UNUSED, void *aux) +{ + struct wilcoxon_state *ws = aux; + + ws->tiebreaker += pow3 (n) - n; +} + +#define WEIGHT_IDX 2 + +void +wilcoxon_execute (const struct dataset *ds, + struct casereader *input, + enum mv_class exclude, + const struct npar_test *test, + bool exact, + double timer) +{ + int i; + bool warn = true; + const struct dictionary *dict = dataset_dict (ds); + const struct two_sample_test *t2s = (struct two_sample_test *) test; + + struct wilcoxon_state *ws = xcalloc (sizeof (*ws), t2s->n_pairs); + const struct variable *weight = dict_get_weight (dict); + struct variable *weightx = var_create_internal (WEIGHT_IDX); + + input = + casereader_create_filter_weight (input, dict, &warn, NULL); + + for (i = 0 ; i < t2s->n_pairs; ++i ) + { + struct casereader *r = casereader_clone (input); + struct casewriter *writer; + struct ccase c; + struct case_ordering *ordering = case_ordering_create (); + variable_pair *vp = &t2s->pairs[i]; + + const int reader_width = weight ? 3 : 2; + + ws[i].sign = var_create_internal (0); + ws[i].absdiff = var_create_internal (1); + + case_ordering_add_var (ordering, ws[i].absdiff, SRT_ASCEND); + + + r = casereader_create_filter_missing (r, *vp, 2, + exclude, + NULL, NULL); + + writer = sort_create_writer (ordering, reader_width); + while (casereader_read (r, &c)) + { + struct ccase output; + double d = append_difference (&c, 0, vp); + + case_create (&output, reader_width); + + if (d > 0) + { + case_data_rw (&output, ws[i].sign)->f = 1.0; + + } + else if (d < 0) + { + case_data_rw (&output, ws[i].sign)->f = -1.0; + } + else + { + double w = 1.0; + if (weight) + w = case_data (&c, weight)->f; + + /* Central point values should be dropped */ + ws[i].n_zeros += w; + case_destroy (&c); + continue; + } + + case_data_rw (&output, ws[i].absdiff)->f = fabs (d); + + if (weight) + case_data_rw (&output, weightx)->f = case_data (&c, weight)->f; + + casewriter_write (writer, &output); + case_destroy (&c); + } + casereader_destroy (r); + ws[i].reader = casewriter_make_reader (writer); + } + + for (i = 0 ; i < t2s->n_pairs; ++i ) + { + struct casereader *rr ; + struct ccase c; + enum rank_error err = 0; + + rr = casereader_create_append_rank (ws[i].reader, ws[i].absdiff, + weight ? weightx : NULL, &err, + distinct_callback, &ws[i] + ); + + while (casereader_read (rr, &c)) + { + double sign = case_data (&c, ws[i].sign)->f; + double rank = case_data_idx (&c, weight ? 3 : 2)->f; + double w = 1.0; + if (weight) + w = case_data (&c, weightx)->f; + + if ( sign > 0 ) + { + ws[i].positives.sum += rank * w; + ws[i].positives.n += w; + } + else if (sign < 0) + { + ws[i].negatives.sum += rank * w; + ws[i].negatives.n += w; + } + else + NOT_REACHED (); + + case_destroy (&c); + } + + casereader_destroy (rr); + } + + casereader_destroy (input); + + var_destroy (weightx); + + show_ranks_box (ws, t2s); + show_tests_box (ws, t2s, exact, timer); + + for (i = 0 ; i < t2s->n_pairs; ++i ) + { + var_destroy (ws[i].sign); + var_destroy (ws[i].absdiff); + } + + free (ws); +} + + + + +#include "gettext.h" +#define _(msgid) gettext (msgid) + +static void +show_ranks_box (const struct wilcoxon_state *ws, const struct two_sample_test *t2s) +{ + size_t i; + struct tab_table *table = tab_create (5, 1 + 4 * t2s->n_pairs, 0); + + tab_dim (table, tab_natural_dimensions); + + tab_title (table, _("Ranks")); + + tab_headers (table, 2, 0, 1, 0); + + /* Vertical lines inside the box */ + tab_box (table, 0, 0, -1, TAL_1, + 1, 0, table->nc - 1, tab_nr (table) - 1 ); + + /* Box around entire table */ + tab_box (table, TAL_2, TAL_2, -1, -1, + 0, 0, table->nc - 1, tab_nr (table) - 1 ); + + + tab_text (table, 2, 0, TAB_CENTER, _("N")); + tab_text (table, 3, 0, TAB_CENTER, _("Mean Rank")); + tab_text (table, 4, 0, TAB_CENTER, _("Sum of Ranks")); + + + for (i = 0 ; i < t2s->n_pairs; ++i) + { + variable_pair *vp = &t2s->pairs[i]; + + struct string pair_name; + ds_init_cstr (&pair_name, var_to_string ((*vp)[0])); + ds_put_cstr (&pair_name, " - "); + ds_put_cstr (&pair_name, var_to_string ((*vp)[1])); + + tab_text (table, 1, 1 + i * 4, TAB_LEFT, _("Negative Ranks")); + tab_text (table, 1, 2 + i * 4, TAB_LEFT, _("Positive Ranks")); + tab_text (table, 1, 3 + i * 4, TAB_LEFT, _("Ties")); + tab_text (table, 1, 4 + i * 4, TAB_LEFT, _("Total")); + + tab_hline (table, TAL_1, 0, table->nc - 1, 1 + i * 4); + + + tab_text (table, 0, 1 + i * 4, TAB_LEFT, ds_cstr (&pair_name)); + ds_destroy (&pair_name); + + + /* N */ + tab_float (table, 2, 1 + i * 4, TAB_RIGHT, ws[i].negatives.n, 8, 0); + tab_float (table, 2, 2 + i * 4, TAB_RIGHT, ws[i].positives.n, 8, 0); + tab_float (table, 2, 3 + i * 4, TAB_RIGHT, ws[i].n_zeros, 8, 0); + + tab_float (table, 2, 4 + i * 4, TAB_RIGHT, + ws[i].n_zeros + ws[i].positives.n + ws[i].negatives.n, 8, 0); + + /* Sums */ + tab_float (table, 4, 1 + i * 4, TAB_RIGHT, ws[i].negatives.sum, 8, 2); + tab_float (table, 4, 2 + i * 4, TAB_RIGHT, ws[i].positives.sum, 8, 2); + + + /* Means */ + tab_float (table, 3, 1 + i * 4, TAB_RIGHT, + ws[i].negatives.sum / (double) ws[i].negatives.n, 8, 2); + + tab_float (table, 3, 2 + i * 4, TAB_RIGHT, + ws[i].positives.sum / (double) ws[i].positives.n, 8, 2); + + } + + tab_hline (table, TAL_2, 0, table->nc - 1, 1); + tab_vline (table, TAL_2, 2, 0, table->nr - 1); + + + tab_submit (table); +} + + +static void +show_tests_box (const struct wilcoxon_state *ws, + const struct two_sample_test *t2s, + bool exact, + double timer + ) +{ + size_t i; + struct tab_table *table = tab_create (1 + t2s->n_pairs, exact ? 5 : 3, 0); + + tab_dim (table, tab_natural_dimensions); + + tab_title (table, _("Test Statistics")); + + tab_headers (table, 1, 0, 1, 0); + + /* Vertical lines inside the box */ + tab_box (table, 0, 0, -1, TAL_1, + 0, 0, table->nc - 1, tab_nr (table) - 1 ); + + /* Box around entire table */ + tab_box (table, TAL_2, TAL_2, -1, -1, + 0, 0, table->nc - 1, tab_nr (table) - 1 ); + + + tab_text (table, 0, 1, TAB_LEFT, _("Z")); + tab_text (table, 0, 2, TAB_LEFT, _("Asymp. Sig (2-tailed)")); + + if ( exact ) + { + tab_text (table, 0, 3, TAB_LEFT, _("Exact Sig (2-tailed)")); + tab_text (table, 0, 4, TAB_LEFT, _("Exact Sig (1-tailed)")); + +#if 0 + tab_text (table, 0, 5, TAB_LEFT, _("Point Probability")); +#endif + } + + for (i = 0 ; i < t2s->n_pairs; ++i) + { + double z; + double n = ws[i].positives.n + ws[i].negatives.n; + variable_pair *vp = &t2s->pairs[i]; + + struct string pair_name; + ds_init_cstr (&pair_name, var_to_string ((*vp)[0])); + ds_put_cstr (&pair_name, " - "); + ds_put_cstr (&pair_name, var_to_string ((*vp)[1])); + + + tab_text (table, 1 + i, 0, TAB_CENTER, ds_cstr (&pair_name)); + ds_destroy (&pair_name); + + z = MIN (ws[i].positives.sum, ws[i].negatives.sum); + z -= n * (n + 1)/ 4.0; + + z /= sqrt (n * (n + 1) * (2*n + 1)/24.0 - ws[i].tiebreaker / 48.0); + + tab_float (table, 1 + i, 1, TAB_RIGHT, z, 8, 3); + + tab_float (table, 1 + i, 2, TAB_RIGHT, + 2.0 * gsl_cdf_ugaussian_P (z), + 8, 3); + + if (exact) + { + double p = + timed_wilcoxon_significance (ws[i].positives.sum, + n, + timer ); + + if ( p == SYSMIS) + { + msg (MW, _("Exact significance was not calculated after %.2f minutes. Skipping test."), timer); + } + else + { + tab_float (table, 1 + i, 3, TAB_RIGHT, p, 8, 3); + tab_float (table, 1 + i, 4, TAB_RIGHT, p / 2.0, 8, 3); + } + } + } + + tab_hline (table, TAL_2, 0, table->nc - 1, 1); + tab_vline (table, TAL_2, 1, 0, table->nr - 1); + + + tab_submit (table); +} + + + +#include + +static sigjmp_buf env; + +static void +give_up_callback (int signal UNUSED) +{ + siglongjmp (env, 1); +} + +static double +timed_wilcoxon_significance (double w, long int n, double timer) +{ + double p = SYSMIS; + + sigset_t set; + + struct sigaction timeout_action; + struct sigaction old_action; + + if (timer <= 0 ) + return LevelOfSignificanceWXMPSR (w, n); + + sigemptyset (&set); + + timeout_action.sa_mask = set; + timeout_action.sa_flags = 0; + + timeout_action.sa_restorer = 0; + timeout_action.sa_handler = give_up_callback; + + if ( 0 == sigsetjmp (env, 1)) + { + sigaction (SIGALRM, &timeout_action, &old_action); + alarm (timer * 60.0); + + p = LevelOfSignificanceWXMPSR (w, n); + } + + sigaction (SIGALRM, &old_action, NULL); + + return p; +} diff --git a/src/language/stats/wilcoxon.h b/src/language/stats/wilcoxon.h new file mode 100644 index 0000000000..b0f86a2c90 --- /dev/null +++ b/src/language/stats/wilcoxon.h @@ -0,0 +1,65 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#if !wilcoxon_h +#define wilcoxon_h 1 + +#include +#include +#include +#include + + +struct rank_sum +{ + double n; + double sum; +}; + +struct wilcoxon_state +{ + struct casereader *reader; + struct variable *sign; + struct variable *absdiff; + + struct rank_sum positives; + struct rank_sum negatives; + double n_zeros; + + double tiebreaker; +}; + + +struct wilcoxon_test +{ + struct two_sample_test parent; +}; + +struct casereader; +struct dataset; + + +void wilcoxon_execute (const struct dataset *ds, + struct casereader *input, + enum mv_class exclude, + const struct npar_test *test, + bool exact, + double timer + ); + + + +#endif diff --git a/src/language/tests/automake.mk b/src/language/tests/automake.mk index 00fcab3734..54d95107d9 100644 --- a/src/language/tests/automake.mk +++ b/src/language/tests/automake.mk @@ -6,9 +6,13 @@ language_tests_built_sources = \ language_tests_sources = \ src/language/tests/check-model.h \ src/language/tests/datasheet-test.c \ + src/language/tests/datasheet-check.c \ + src/language/tests/datasheet-check.h \ src/language/tests/format-guesser-test.c \ src/language/tests/float-format.c \ src/language/tests/moments-test.c \ + src/language/tests/model-checker.c \ + src/language/tests/model-checker.h \ src/language/tests/paper-size.c \ src/language/tests/pool-test.c diff --git a/src/language/tests/check-model.q b/src/language/tests/check-model.q index a265471788..2ef5e7bfd2 100644 --- a/src/language/tests/check-model.q +++ b/src/language/tests/check-model.q @@ -21,7 +21,7 @@ #include -#include +#include "model-checker.h" #include #include "error.h" diff --git a/src/language/tests/datasheet-check.c b/src/language/tests/datasheet-check.c new file mode 100644 index 0000000000..b3b2856d42 --- /dev/null +++ b/src/language/tests/datasheet-check.c @@ -0,0 +1,472 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2007 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include +#include "datasheet-check.h" +#include "model-checker.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "minmax.h" +#include "md4.h" +#include "xalloc.h" + + +/* lazy_casereader callback function to instantiate a casereader + from the datasheet. */ +static struct casereader * +lazy_callback (void *ds_) +{ + struct datasheet *ds = ds_; + return datasheet_make_reader (ds); +} + + +/* Maximum size of datasheet supported for model checking + purposes. */ +#define MAX_ROWS 5 +#define MAX_COLS 5 + + +/* Checks that READER contains the ROW_CNT rows and COLUMN_CNT + columns of data in ARRAY, reporting any errors via MC. */ +static void +check_datasheet_casereader (struct mc *mc, struct casereader *reader, + double array[MAX_ROWS][MAX_COLS], + size_t row_cnt, size_t column_cnt) +{ + if (casereader_get_case_cnt (reader) != row_cnt) + { + if (casereader_get_case_cnt (reader) == CASENUMBER_MAX + && casereader_count_cases (reader) == row_cnt) + mc_error (mc, "datasheet casereader has unknown case count"); + else + mc_error (mc, "casereader row count (%lu) does not match " + "expected (%zu)", + (unsigned long int) casereader_get_case_cnt (reader), + row_cnt); + } + else if (casereader_get_value_cnt (reader) != column_cnt) + mc_error (mc, "casereader column count (%zu) does not match " + "expected (%zu)", + casereader_get_value_cnt (reader), column_cnt); + else + { + struct ccase c; + size_t row; + + for (row = 0; row < row_cnt; row++) + { + size_t col; + + if (!casereader_read (reader, &c)) + { + mc_error (mc, "casereader_read failed reading row %zu of %zu " + "(%zu columns)", row, row_cnt, column_cnt); + return; + } + + for (col = 0; col < column_cnt; col++) + if (case_num_idx (&c, col) != array[row][col]) + mc_error (mc, "element %zu,%zu (of %zu,%zu) differs: " + "%g != %g", + row, col, row_cnt, column_cnt, + case_num_idx (&c, col), array[row][col]); + + case_destroy (&c); + } + + if (casereader_read (reader, &c)) + mc_error (mc, "casereader has extra cases (expected %zu)", row_cnt); + } +} + +/* Checks that datasheet DS contains has ROW_CNT rows, COLUMN_CNT + columns, and the same contents as ARRAY, reporting any + mismatches via mc_error. Then, adds DS to MC as a new state. */ +static void +check_datasheet (struct mc *mc, struct datasheet *ds, + double array[MAX_ROWS][MAX_COLS], + size_t row_cnt, size_t column_cnt) +{ + struct datasheet *ds2; + struct casereader *reader; + unsigned long int serial = 0; + + assert (row_cnt < MAX_ROWS); + assert (column_cnt < MAX_COLS); + + /* If it is a duplicate hash, discard the state before checking + its consistency, to save time. */ + if (mc_discard_dup_state (mc, hash_datasheet (ds))) + { + datasheet_destroy (ds); + return; + } + + /* Check contents of datasheet via datasheet functions. */ + if (row_cnt != datasheet_get_row_cnt (ds)) + mc_error (mc, "row count (%lu) does not match expected (%zu)", + (unsigned long int) datasheet_get_row_cnt (ds), row_cnt); + else if (column_cnt != datasheet_get_column_cnt (ds)) + mc_error (mc, "column count (%zu) does not match expected (%zu)", + datasheet_get_column_cnt (ds), column_cnt); + else + { + size_t row, col; + + for (row = 0; row < row_cnt; row++) + for (col = 0; col < column_cnt; col++) + { + union value v; + if (!datasheet_get_value (ds, row, col, &v, 1)) + NOT_REACHED (); + if (v.f != array[row][col]) + mc_error (mc, "element %zu,%zu (of %zu,%zu) differs: %g != %g", + row, col, row_cnt, column_cnt, v.f, array[row][col]); + } + } + + /* Check that datasheet contents are correct when read through + casereader. */ + ds2 = clone_datasheet (ds); + reader = datasheet_make_reader (ds2); + check_datasheet_casereader (mc, reader, array, row_cnt, column_cnt); + casereader_destroy (reader); + + /* Check that datasheet contents are correct when read through + casereader with lazy_casereader wrapped around it. This is + valuable because otherwise there is no non-GUI code that + uses the lazy_casereader. */ + ds2 = clone_datasheet (ds); + reader = lazy_casereader_create (column_cnt, row_cnt, + lazy_callback, ds2, &serial); + check_datasheet_casereader (mc, reader, array, row_cnt, column_cnt); + if (lazy_casereader_destroy (reader, serial)) + { + /* Lazy casereader was never instantiated. This will + only happen if there are no rows (because in that case + casereader_read never gets called). */ + datasheet_destroy (ds2); + if (row_cnt != 0) + mc_error (mc, "lazy casereader not instantiated, but should " + "have been (size %zu,%zu)", row_cnt, column_cnt); + } + else + { + /* Lazy casereader was instantiated. This is the common + case, in which some casereader operation + (casereader_read in this case) was performed on the + lazy casereader. */ + casereader_destroy (reader); + if (row_cnt == 0) + mc_error (mc, "lazy casereader instantiated, but should not " + "have been (size %zu,%zu)", row_cnt, column_cnt); + } + + mc_add_state (mc, ds); +} + +/* Extracts the contents of DS into DATA. */ +static void +extract_data (const struct datasheet *ds, double data[MAX_ROWS][MAX_COLS]) +{ + size_t column_cnt = datasheet_get_column_cnt (ds); + size_t row_cnt = datasheet_get_row_cnt (ds); + size_t row, col; + + assert (row_cnt < MAX_ROWS); + assert (column_cnt < MAX_COLS); + for (row = 0; row < row_cnt; row++) + for (col = 0; col < column_cnt; col++) + { + union value v; + if (!datasheet_get_value (ds, row, col, &v, 1)) + NOT_REACHED (); + data[row][col] = v.f; + } +} + +/* Clones the structure and contents of ODS into *DS, + and the contents of ODATA into DATA. */ +static void +clone_model (const struct datasheet *ods, double odata[MAX_ROWS][MAX_COLS], + struct datasheet **ds, double data[MAX_ROWS][MAX_COLS]) +{ + *ds = clone_datasheet (ods); + memcpy (data, odata, MAX_ROWS * MAX_COLS * sizeof **data); +} + +/* "init" function for struct mc_class. */ +static void +datasheet_mc_init (struct mc *mc) +{ + struct datasheet_test_params *params = mc_get_aux (mc); + struct datasheet *ds; + + if (params->backing_rows == 0 && params->backing_cols == 0) + { + /* Create unbacked datasheet. */ + ds = datasheet_create (NULL); + mc_name_operation (mc, "empty datasheet"); + check_datasheet (mc, ds, NULL, 0, 0); + } + else + { + /* Create datasheet with backing. */ + struct casewriter *writer; + struct casereader *reader; + double data[MAX_ROWS][MAX_COLS]; + int row; + + assert (params->backing_rows > 0 && params->backing_rows <= MAX_ROWS); + assert (params->backing_cols > 0 && params->backing_cols <= MAX_COLS); + + writer = mem_writer_create (params->backing_cols); + for (row = 0; row < params->backing_rows; row++) + { + struct ccase c; + int col; + + case_create (&c, params->backing_cols); + for (col = 0; col < params->backing_cols; col++) + { + double value = params->next_value++; + data[row][col] = value; + case_data_rw_idx (&c, col)->f = value; + } + casewriter_write (writer, &c); + } + reader = casewriter_make_reader (writer); + assert (reader != NULL); + + ds = datasheet_create (reader); + mc_name_operation (mc, "datasheet with (%d,%d) backing", + params->backing_rows, params->backing_cols); + check_datasheet (mc, ds, data, + params->backing_rows, params->backing_cols); + } +} + +/* "mutate" function for struct mc_class. */ +static void +datasheet_mc_mutate (struct mc *mc, const void *ods_) +{ + struct datasheet_test_params *params = mc_get_aux (mc); + + const struct datasheet *ods = ods_; + double odata[MAX_ROWS][MAX_COLS]; + double data[MAX_ROWS][MAX_COLS]; + size_t column_cnt = datasheet_get_column_cnt (ods); + size_t row_cnt = datasheet_get_row_cnt (ods); + size_t pos, new_pos, cnt; + + extract_data (ods, odata); + + /* Insert all possible numbers of columns in all possible + positions. */ + for (pos = 0; pos <= column_cnt; pos++) + for (cnt = 0; cnt <= params->max_cols - column_cnt; cnt++) + if (mc_include_state (mc)) + { + struct datasheet *ds; + union value new[MAX_COLS]; + size_t i, j; + + mc_name_operation (mc, "insert %zu columns at %zu", cnt, pos); + clone_model (ods, odata, &ds, data); + + for (i = 0; i < cnt; i++) + new[i].f = params->next_value++; + + if (!datasheet_insert_columns (ds, new, cnt, pos)) + mc_error (mc, "datasheet_insert_columns failed"); + + for (i = 0; i < row_cnt; i++) + { + insert_range (&data[i][0], column_cnt, sizeof data[i][0], + pos, cnt); + for (j = 0; j < cnt; j++) + data[i][pos + j] = new[j].f; + } + + check_datasheet (mc, ds, data, row_cnt, column_cnt + cnt); + } + + /* Delete all possible numbers of columns from all possible + positions. */ + for (pos = 0; pos < column_cnt; pos++) + for (cnt = 0; cnt < column_cnt - pos; cnt++) + if (mc_include_state (mc)) + { + struct datasheet *ds; + size_t i; + + mc_name_operation (mc, "delete %zu columns at %zu", cnt, pos); + clone_model (ods, odata, &ds, data); + + datasheet_delete_columns (ds, pos, cnt); + + for (i = 0; i < row_cnt; i++) + remove_range (&data[i], column_cnt, sizeof *data[i], pos, cnt); + + check_datasheet (mc, ds, data, row_cnt, column_cnt - cnt); + } + + /* Move all possible numbers of columns from all possible + existing positions to all possible new positions. */ + for (pos = 0; pos < column_cnt; pos++) + for (cnt = 0; cnt < column_cnt - pos; cnt++) + for (new_pos = 0; new_pos < column_cnt - cnt; new_pos++) + if (mc_include_state (mc)) + { + struct datasheet *ds; + size_t i; + + clone_model (ods, odata, &ds, data); + mc_name_operation (mc, "move %zu columns from %zu to %zu", + cnt, pos, new_pos); + + datasheet_move_columns (ds, pos, new_pos, cnt); + + for (i = 0; i < row_cnt; i++) + move_range (&data[i], column_cnt, sizeof data[i][0], + pos, new_pos, cnt); + + check_datasheet (mc, ds, data, row_cnt, column_cnt); + } + + /* Insert all possible numbers of rows in all possible + positions. */ + for (pos = 0; pos <= row_cnt; pos++) + for (cnt = 0; cnt <= params->max_rows - row_cnt; cnt++) + if (mc_include_state (mc)) + { + struct datasheet *ds; + struct ccase c[MAX_ROWS]; + size_t i, j; + + clone_model (ods, odata, &ds, data); + mc_name_operation (mc, "insert %zu rows at %zu", cnt, pos); + + for (i = 0; i < cnt; i++) + { + case_create (&c[i], column_cnt); + for (j = 0; j < column_cnt; j++) + case_data_rw_idx (&c[i], j)->f = params->next_value++; + } + + insert_range (data, row_cnt, sizeof data[pos], pos, cnt); + for (i = 0; i < cnt; i++) + for (j = 0; j < column_cnt; j++) + data[i + pos][j] = case_num_idx (&c[i], j); + + if (!datasheet_insert_rows (ds, pos, c, cnt)) + mc_error (mc, "datasheet_insert_rows failed"); + + check_datasheet (mc, ds, data, row_cnt + cnt, column_cnt); + } + + /* Delete all possible numbers of rows from all possible + positions. */ + for (pos = 0; pos < row_cnt; pos++) + for (cnt = 0; cnt < row_cnt - pos; cnt++) + if (mc_include_state (mc)) + { + struct datasheet *ds; + + clone_model (ods, odata, &ds, data); + mc_name_operation (mc, "delete %zu rows at %zu", cnt, pos); + + datasheet_delete_rows (ds, pos, cnt); + + remove_range (&data[0], row_cnt, sizeof data[0], pos, cnt); + + check_datasheet (mc, ds, data, row_cnt - cnt, column_cnt); + } + + /* Move all possible numbers of rows from all possible existing + positions to all possible new positions. */ + for (pos = 0; pos < row_cnt; pos++) + for (cnt = 0; cnt < row_cnt - pos; cnt++) + for (new_pos = 0; new_pos < row_cnt - cnt; new_pos++) + if (mc_include_state (mc)) + { + struct datasheet *ds; + + clone_model (ods, odata, &ds, data); + mc_name_operation (mc, "move %zu rows from %zu to %zu", + cnt, pos, new_pos); + + datasheet_move_rows (ds, pos, new_pos, cnt); + + move_range (&data[0], row_cnt, sizeof data[0], + pos, new_pos, cnt); + + check_datasheet (mc, ds, data, row_cnt, column_cnt); + } +} + +/* "destroy" function for struct mc_class. */ +static void +datasheet_mc_destroy (const struct mc *mc UNUSED, void *ds_) +{ + struct datasheet *ds = ds_; + datasheet_destroy (ds); +} + +/* Executes the model checker on the datasheet test driver with + the given OPTIONS and passing in the given PARAMS, which must + point to a modifiable "struct datasheet_test_params". If any + value in PARAMS is out of range, it will be adjusted into the + valid range before running the test. + + Returns the results of the model checking run. */ +struct mc_results * +datasheet_test (struct mc_options *options, void *params_) +{ + struct datasheet_test_params *params = params_; + static const struct mc_class datasheet_mc_class = + { + datasheet_mc_init, + datasheet_mc_mutate, + datasheet_mc_destroy, + }; + + params->next_value = 1; + params->max_rows = MIN (params->max_rows, MAX_ROWS); + params->max_cols = MIN (params->max_cols, MAX_COLS); + params->backing_rows = MIN (params->backing_rows, params->max_rows); + params->backing_cols = MIN (params->backing_cols, params->max_cols); + + mc_options_set_aux (options, params); + return mc_run (&datasheet_mc_class, options); +} diff --git a/src/language/tests/datasheet-check.h b/src/language/tests/datasheet-check.h new file mode 100644 index 0000000000..bd0e492dc0 --- /dev/null +++ b/src/language/tests/datasheet-check.h @@ -0,0 +1,89 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2007 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#ifndef DATA_DATASHEET_TEST_H +#define DATA_DATASHEET_TEST_H 1 + +#if 0 +#include +#include + +struct casereader; + +/* A datasheet is a 2-d array of data that may be stored in + memory or on disk. It efficiently supports data storage and + retrieval, as well as adding, removing, and rearranging both + rows and columns. */ + +struct datasheet *datasheet_create (struct casereader *); +void datasheet_destroy (struct datasheet *); +struct datasheet *datasheet_rename (struct datasheet *); + +bool datasheet_error (const struct datasheet *); +void datasheet_force_error (struct datasheet *); +const struct taint *datasheet_get_taint (const struct datasheet *); + +struct casereader *datasheet_make_reader (struct datasheet *); + +/* Columns. */ +size_t datasheet_get_column_cnt (const struct datasheet *); +bool datasheet_insert_columns (struct datasheet *, + const union value[], size_t cnt, + size_t before); +void datasheet_delete_columns (struct datasheet *, size_t start, size_t cnt); +void datasheet_move_columns (struct datasheet *, + size_t old_start, size_t new_start, + size_t cnt); + +/* Rows. */ +casenumber datasheet_get_row_cnt (const struct datasheet *); +bool datasheet_insert_rows (struct datasheet *, + casenumber before, struct ccase[], + casenumber cnt); +void datasheet_delete_rows (struct datasheet *, + casenumber first, casenumber cnt); +void datasheet_move_rows (struct datasheet *, + size_t old_start, size_t new_start, + size_t cnt); + +/* Data. */ +bool datasheet_get_row (const struct datasheet *, casenumber, struct ccase *); +bool datasheet_put_row (struct datasheet *, casenumber, struct ccase *); +bool datasheet_get_value (const struct datasheet *, casenumber, size_t column, + union value *, int width); +bool datasheet_put_value (struct datasheet *, casenumber, size_t column, + const union value *, int width); + +#endif + +/* Testing. */ +struct mc_options; + +struct datasheet_test_params + { + /* Parameters. */ + int max_rows; + int max_cols; + int backing_rows; + int backing_cols; + + /* State. */ + int next_value; + }; + +struct mc_results *datasheet_test (struct mc_options *options, void *params); + +#endif /* data/datasheet.h */ diff --git a/src/language/tests/datasheet-test.c b/src/language/tests/datasheet-test.c index 67f68377e1..dfe8b6b2f0 100644 --- a/src/language/tests/datasheet-test.c +++ b/src/language/tests/datasheet-test.c @@ -16,7 +16,7 @@ #include -#include +#include "datasheet-check.h" #include #include @@ -49,6 +49,7 @@ cmd_debug_datasheet (struct lexer *lexer, struct dataset *dataset UNUSED) params.backing_rows = 0; params.backing_cols = 0; + for (;;) { if (lex_match_id (lexer, "MAX")) diff --git a/src/language/tests/model-checker.c b/src/language/tests/model-checker.c new file mode 100644 index 0000000000..a040750068 --- /dev/null +++ b/src/language/tests/model-checker.c @@ -0,0 +1,1466 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2007 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include "model-checker.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "error.h" +#include "minmax.h" +#include "xalloc.h" + +/* Initializes PATH as an empty path. */ +void +mc_path_init (struct mc_path *path) +{ + path->ops = NULL; + path->length = 0; + path->capacity = 0; +} + +/* Copies the contents of OLD into NEW. */ +void +mc_path_copy (struct mc_path *new, const struct mc_path *old) +{ + if (old->length > new->capacity) + { + new->capacity = old->length; + free (new->ops); + new->ops = xnmalloc (new->capacity, sizeof *new->ops); + } + new->length = old->length; + memcpy (new->ops, old->ops, old->length * sizeof *new->ops); +} + +/* Adds OP to the end of PATH. */ +void +mc_path_push (struct mc_path *path, int op) +{ + if (path->length >= path->capacity) + path->ops = xnrealloc (path->ops, ++path->capacity, sizeof *path->ops); + path->ops[path->length++] = op; +} + +/* Removes and returns the operation at the end of PATH. */ +int +mc_path_pop (struct mc_path *path) +{ + int back = mc_path_back (path); + path->length--; + return back; +} + +/* Returns the operation at the end of PATH. */ +int +mc_path_back (const struct mc_path *path) +{ + assert (path->length > 0); + return path->ops[path->length - 1]; +} + +/* Destroys PATH. */ +void +mc_path_destroy (struct mc_path *path) +{ + free (path->ops); + path->ops = NULL; +} + +/* Returns the operation in position INDEX in PATH. + INDEX must be less than the length of PATH. */ +int +mc_path_get_operation (const struct mc_path *path, size_t index) +{ + assert (index < path->length); + return path->ops[index]; +} + +/* Returns the number of operations in PATH. */ +size_t +mc_path_get_length (const struct mc_path *path) +{ + return path->length; +} + +/* Appends the operations in PATH to STRING, separating each one + with a single space. */ +void +mc_path_to_string (const struct mc_path *path, struct string *string) +{ + size_t i; + + for (i = 0; i < mc_path_get_length (path); i++) + { + if (i > 0) + ds_put_char (string, ' '); + ds_put_format (string, "%d", mc_path_get_operation (path, i)); + } +} + +/* Search options. */ +struct mc_options + { + /* Search strategy. */ + enum mc_strategy strategy; /* Type of strategy. */ + int max_depth; /* Limit on depth (or INT_MAX). */ + int hash_bits; /* Number of bits to hash (or 0). */ + unsigned int seed; /* Random seed for MC_RANDOM + or MC_DROP_RANDOM. */ + struct mc_path follow_path; /* Path for MC_PATH. */ + + /* Queue configuration. */ + int queue_limit; /* Maximum length of queue. */ + enum mc_queue_limit_strategy queue_limit_strategy; + /* How to choose state to drop + from queue. */ + + /* Stop conditions. */ + int max_unique_states; /* Maximum unique states to process. */ + int max_errors; /* Maximum errors to detect. */ + double time_limit; /* Maximum time in seconds. */ + + /* Output configuration. */ + int verbosity; /* 0=low, 1=normal, 2+=high. */ + int failure_verbosity; /* If greater than verbosity, + verbosity of error replays. */ + FILE *output_file; /* File to receive output. */ + + /* How to report intermediate progress. */ + int progress_usec; /* Microseconds between reports. */ + mc_progress_func *progress_func; /* Function to call on each report. */ + + /* Client data. */ + void *aux; + }; + +/* Default progress function. */ +static bool +default_progress (struct mc *mc) +{ + if (mc_results_get_stop_reason (mc_get_results (mc)) == MC_CONTINUING) + putc ('.', stderr); + else + putc ('\n', stderr); + return true; +} + +/* Do-nothing progress function. */ +static bool +null_progress (struct mc *mc UNUSED) +{ + return true; +} + +/* Creates and returns a set of options initialized to the + defaults. */ +struct mc_options * +mc_options_create (void) +{ + struct mc_options *options = xmalloc (sizeof *options); + + options->strategy = MC_BROAD; + options->max_depth = INT_MAX; + options->hash_bits = 20; + options->seed = 0; + mc_path_init (&options->follow_path); + + options->queue_limit = 10000; + options->queue_limit_strategy = MC_DROP_RANDOM; + + options->max_unique_states = INT_MAX; + options->max_errors = 1; + options->time_limit = 0.0; + + options->verbosity = 1; + options->failure_verbosity = 2; + options->output_file = stdout; + options->progress_usec = 250000; + options->progress_func = default_progress; + + options->aux = NULL; + + return options; +} + +/* Returns a copy of the given OPTIONS. */ +struct mc_options * +mc_options_clone (const struct mc_options *options) +{ + return xmemdup (options, sizeof *options); +} + +/* Destroys OPTIONS. */ +void +mc_options_destroy (struct mc_options *options) +{ + mc_path_destroy (&options->follow_path); + free (options); +} + +/* Returns the search strategy used for OPTIONS. The choices + are: + + - MC_BROAD (the default): Breadth-first search. First tries + all the operations with depth 1, then those with depth 2, + then those with depth 3, and so on. + + This search algorithm finds the least number of operations + needed to trigger a given bug. + + - MC_DEEP: Depth-first search. Searches downward in the tree + of states as fast as possible. Good for finding bugs that + require long sequences of operations to trigger. + + - MC_RANDOM: Random-first search. Searches through the tree + of states in random order. The standard C library's rand + function selects the search path; you can control the seed + passed to srand using mc_options_set_seed. + + - MC_PATH: Explicit path. Applies an explicitly specified + sequence of operations. */ +enum mc_strategy +mc_options_get_strategy (const struct mc_options *options) +{ + return options->strategy; +} + +/* Sets the search strategy used for OPTIONS to STRATEGY. + + This function cannot be used to set MC_PATH as the search + strategy. Use mc_options_set_follow_path instead. */ +void +mc_options_set_strategy (struct mc_options *options, enum mc_strategy strategy) +{ + assert (strategy == MC_BROAD + || strategy == MC_DEEP + || strategy == MC_RANDOM); + options->strategy = strategy; +} + +/* Returns OPTION's random seed used by MC_RANDOM and + MC_DROP_RANDOM. */ +unsigned int +mc_options_get_seed (const struct mc_options *options) +{ + return options->seed; +} + +/* Set OPTION's random seed used by MC_RANDOM and MC_DROP_RANDOM + to SEED. */ +void +mc_options_set_seed (struct mc_options *options, unsigned int seed) +{ + options->seed = seed; +} + +/* Returns the maximum depth to which OPTIONS's search will + descend. The initial states are at depth 1, states produced + as their mutations are at depth 2, and so on. */ +int +mc_options_get_max_depth (const struct mc_options *options) +{ + return options->max_depth; +} + +/* Sets the maximum depth to which OPTIONS's search will descend + to MAX_DEPTH. The initial states are at depth 1, states + produced as their mutations are at depth 2, and so on. */ +void +mc_options_set_max_depth (struct mc_options *options, int max_depth) +{ + options->max_depth = max_depth; +} + +/* Returns the base-2 log of the number of bits in OPTIONS's hash + table. The hash table is used for dropping states that are + probably duplicates: any state with a given hash value, as + will only be processed once. A return value of 0 indicates + that the model checker will not discard duplicate states based + on their hashes. + + The hash table is a power of 2 bits long, by default 2**20 + bits (128 kB). Depending on how many states you expect the + model checker to check, how much memory you're willing to let + the hash table take up, and how worried you are about missing + states due to hash collisions, you could make it larger or + smaller. + + The "birthday paradox" points to a reasonable way to size your + hash table. If you expect the model checker to check about + 2**N states, then, assuming a perfect hash, you need a hash + table of 2**(N+1) bits to have a 50% chance of seeing a hash + collision, 2**(N+2) bits to have a 25% chance, and so on. */ +int +mc_options_get_hash_bits (const struct mc_options *options) +{ + return options->hash_bits; +} + +/* Sets the base-2 log of the number of bits in OPTIONS's hash + table to HASH_BITS. A HASH_BITS value of 0 requests that the + model checker not discard duplicate states based on their + hashes. (This causes the model checker to never terminate in + many cases.) */ +void +mc_options_set_hash_bits (struct mc_options *options, int hash_bits) +{ + assert (hash_bits >= 0); + options->hash_bits = MIN (hash_bits, CHAR_BIT * sizeof (unsigned int) - 1); +} + +/* Returns the path set in OPTIONS by mc_options_set_follow_path. + May be used only if the search strategy is MC_PATH. */ +const struct mc_path * +mc_options_get_follow_path (const struct mc_options *options) +{ + assert (options->strategy == MC_PATH); + return &options->follow_path; +} + +/* Sets, in OPTIONS, the search algorithm to MC_PATH and the path + to be the explicit path specified in FOLLOW_PATH. */ +void +mc_options_set_follow_path (struct mc_options *options, + const struct mc_path *follow_path) +{ + assert (mc_path_get_length (follow_path) > 0); + options->strategy = MC_PATH; + mc_path_copy (&options->follow_path, follow_path); +} + +/* Returns the maximum number of queued states in OPTIONS. The + default value is 10,000. The primary reason to limit the + number of queued states is to conserve memory, so if you can + afford the memory and your model needs more room in the queue, + you can raise the limit. Conversely, if your models are large + or memory is constrained, you can reduce the limit. + + Following the execution of the model checker, you can find out + the maximum queue length during the run by calling + mc_results_get_max_queue_length. */ +int +mc_options_get_queue_limit (const struct mc_options *options) +{ + return options->queue_limit; +} + +/* Sets the maximum number of queued states in OPTIONS to + QUEUE_LIMIT. */ +void +mc_options_set_queue_limit (struct mc_options *options, int queue_limit) +{ + assert (queue_limit > 0); + options->queue_limit = queue_limit; +} + +/* Returns the queue limit strategy used by OPTIONS, that is, + when a new state must be inserted into a full state queue is + full, how the state to be dropped is chosen. The choices are: + + - MC_DROP_NEWEST: Drop the newest state; that is, do not + insert the new state into the queue at all. + + - MC_DROP_OLDEST: Drop the state that has been enqueued for + the longest. + + - MC_DROP_RANDOM (the default): Drop a randomly selected state + from the queue. The standard C library's rand function + selects the state to drop; you can control the seed passed + to srand using mc_options_set_seed. */ +enum mc_queue_limit_strategy +mc_options_get_queue_limit_strategy (const struct mc_options *options) +{ + return options->queue_limit_strategy; +} + +/* Sets the queue limit strategy used by OPTIONS to STRATEGY. + + This setting has no effect unless the model being checked + causes the state queue to overflow (see + mc_options_get_queue_limit). */ +void +mc_options_set_queue_limit_strategy (struct mc_options *options, + enum mc_queue_limit_strategy strategy) +{ + assert (strategy == MC_DROP_NEWEST + || strategy == MC_DROP_OLDEST + || strategy == MC_DROP_RANDOM); + options->queue_limit_strategy = strategy; +} + +/* Returns OPTIONS's maximum number of unique states that the + model checker will examine before terminating. The default is + INT_MAX. */ +int +mc_options_get_max_unique_states (const struct mc_options *options) +{ + return options->max_unique_states; +} + +/* Sets OPTIONS's maximum number of unique states that the model + checker will examine before terminating to + MAX_UNIQUE_STATE. */ +void +mc_options_set_max_unique_states (struct mc_options *options, + int max_unique_states) +{ + options->max_unique_states = max_unique_states; +} + +/* Returns the maximum number of errors that OPTIONS will allow + the model checker to encounter before terminating. The + default is 1. */ +int +mc_options_get_max_errors (const struct mc_options *options) +{ + return options->max_errors; +} + +/* Sets the maximum number of errors that OPTIONS will allow the + model checker to encounter before terminating to + MAX_ERRORS. */ +void +mc_options_set_max_errors (struct mc_options *options, int max_errors) +{ + options->max_errors = max_errors; +} + +/* Returns the maximum amount of time, in seconds, that OPTIONS will allow the + model checker to consume before terminating. The + default of 0.0 means that time consumption is unlimited. */ +double +mc_options_get_time_limit (const struct mc_options *options) +{ + return options->time_limit; +} + +/* Sets the maximum amount of time, in seconds, that OPTIONS will + allow the model checker to consume before terminating to + TIME_LIMIT. A value of 0.0 means that time consumption is + unlimited; otherwise, the return value will be positive. */ +void +mc_options_set_time_limit (struct mc_options *options, double time_limit) +{ + assert (time_limit >= 0.0); + options->time_limit = time_limit; +} + +/* Returns the level of verbosity for output messages specified + by OPTIONS. The default verbosity level is 1. + + A verbosity level of 0 inhibits all messages except for + errors; a verbosity level of 1 also allows warnings; a + verbosity level of 2 also causes a description of each state + added to be output; a verbosity level of 3 also causes a + description of each duplicate state to be output. Verbosity + levels less than 0 or greater than 3 are allowed but currently + have no additional effect. */ +int +mc_options_get_verbosity (const struct mc_options *options) +{ + return options->verbosity; +} + +/* Sets the level of verbosity for output messages specified + by OPTIONS to VERBOSITY. */ +void +mc_options_set_verbosity (struct mc_options *options, int verbosity) +{ + options->verbosity = verbosity; +} + +/* Returns the level of verbosity for failures specified by + OPTIONS. The default failure verbosity level is 2. + + The failure verbosity level has an effect only when an error + is reported, and only when the failure verbosity level is + higher than the regular verbosity level. When this is the + case, the model checker replays the error path at the higher + verbosity level specified. This has the effect of outputting + an explicit, human-readable description of the sequence of + operations that caused the error. */ +int +mc_options_get_failure_verbosity (const struct mc_options *options) +{ + return options->failure_verbosity; +} + +/* Sets the level of verbosity for failures specified by OPTIONS + to FAILURE_VERBOSITY. */ +void +mc_options_set_failure_verbosity (struct mc_options *options, + int failure_verbosity) +{ + options->failure_verbosity = failure_verbosity; +} + +/* Returns the output file used for messages printed by the model + checker specified by OPTIONS. The default is stdout. */ +FILE * +mc_options_get_output_file (const struct mc_options *options) +{ + return options->output_file; +} + +/* Sets the output file used for messages printed by the model + checker specified by OPTIONS to OUTPUT_FILE. + + The model checker does not automatically close the specified + output file. If this is desired, the model checker's client + must do so. */ +void +mc_options_set_output_file (struct mc_options *options, + FILE *output_file) +{ + options->output_file = output_file; +} + +/* Returns the number of microseconds between calls to the + progress function specified by OPTIONS. The default is + 250,000 (1/4 second). A value of 0 disables progress + reporting. */ +int +mc_options_get_progress_usec (const struct mc_options *options) +{ + return options->progress_usec; +} + +/* Sets the number of microseconds between calls to the progress + function specified by OPTIONS to PROGRESS_USEC. A value of 0 + disables progress reporting. */ +void +mc_options_set_progress_usec (struct mc_options *options, int progress_usec) +{ + assert (progress_usec >= 0); + options->progress_usec = progress_usec; +} + +/* Returns the function called to report progress specified by + OPTIONS. The function used by default prints '.' to + stderr. */ +mc_progress_func * +mc_options_get_progress_func (const struct mc_options *options) +{ + return options->progress_func; +} + +/* Sets the function called to report progress specified by + OPTIONS to PROGRESS_FUNC. A non-null function must be + specified; to disable progress reporting, set the progress + reporting interval to 0. + + PROGRESS_FUNC will be called zero or more times while the + model checker's run is ongoing. For these calls to the + progress function, mc_results_get_stop_reason will return + MC_CONTINUING. It will also be called exactly once soon + before mc_run returns, in which case + mc_results_get_stop_reason will return a different value. */ +void +mc_options_set_progress_func (struct mc_options *options, + mc_progress_func *progress_func) +{ + assert (options->progress_func != NULL); + options->progress_func = progress_func; +} + +/* Returns the auxiliary data set in OPTIONS by the client. The + default is a null pointer. + + This auxiliary data value can be retrieved by the + client-specified functions in struct mc_class during a model + checking run using mc_get_aux. */ +void * +mc_options_get_aux (const struct mc_options *options) +{ + return options->aux; +} + +/* Sets the auxiliary data in OPTIONS to AUX. */ +void +mc_options_set_aux (struct mc_options *options, void *aux) +{ + options->aux = aux; +} + +/* Results of a model checking run. */ +struct mc_results + { + /* Overall results. */ + enum mc_stop_reason stop_reason; /* Why the run ended. */ + int unique_state_count; /* Number of unique states checked. */ + int error_count; /* Number of errors found. */ + + /* Depth statistics. */ + int max_depth_reached; /* Max depth state examined. */ + struct moments1 *depth_moments; /* Enables reporting mean depth. */ + + /* If error_count > 0, path to the last error reported. */ + struct mc_path error_path; + + /* States dropped... */ + int duplicate_dropped_states; /* ...as duplicates. */ + int off_path_dropped_states; /* ...as off-path (MC_PATH only). */ + int depth_dropped_states; /* ...due to excessive depth. */ + int queue_dropped_states; /* ...due to queue overflow. */ + + /* Queue statistics. */ + int queued_unprocessed_states; /* Enqueued but never dequeued. */ + int max_queue_length; /* Maximum queue length observed. */ + + /* Timing. */ + struct timeval start; /* Start of model checking run. */ + struct timeval end; /* End of model checking run. */ + }; + +/* Creates, initializes, and returns a new set of results. */ +static struct mc_results * +mc_results_create (void) +{ + struct mc_results *results = xcalloc (1, sizeof (struct mc_results)); + results->stop_reason = MC_CONTINUING; + results->depth_moments = moments1_create (MOMENT_MEAN); + gettimeofday (&results->start, NULL); + return results; +} + +/* Destroys RESULTS. */ +void +mc_results_destroy (struct mc_results *results) +{ + if (results != NULL) + { + moments1_destroy (results->depth_moments); + mc_path_destroy (&results->error_path); + free (results); + } +} + +/* Returns RESULTS's reason that the model checking run + terminated. The possible reasons are: + + - MC_CONTINUING: The run is not actually yet complete. This + can only be returned before mc_run has returned, e.g. when + the progress function set by mc_options_set_progress_func + examines the run's results. + + - MC_SUCCESS: The run completed because the queue emptied. + The entire state space might not have been explored due to a + requested limit on maximum depth, hash collisions, etc. + + - MC_MAX_UNIQUE_STATES: The run completed because as many + unique states have been checked as were requested (using + mc_options_set_max_unique_states). + + - MC_MAX_ERROR_COUNT: The run completed because the maximum + requested number of errors (by default, 1 error) was + reached. + + - MC_END_OF_PATH: The run completed because the path specified + with mc_options_set_follow_path was fully traversed. + + - MC_TIMEOUT: The run completed because the time limit set + with mc_options_set_time_limit was exceeded. + + - MC_INTERRUPTED: The run completed because SIGINT was caught + (typically, due to the user typing Ctrl+C). */ +enum mc_stop_reason +mc_results_get_stop_reason (const struct mc_results *results) +{ + return results->stop_reason; +} + +/* Returns the number of unique states checked specified by + RESULTS. */ +int +mc_results_get_unique_state_count (const struct mc_results *results) +{ + return results->unique_state_count; +} + +/* Returns the number of errors found specified by RESULTS. */ +int +mc_results_get_error_count (const struct mc_results *results) +{ + return results->error_count; +} + +/* Returns the maximum depth reached during the model checker run + represented by RESULTS. The initial states are at depth 1, + their child states at depth 2, and so on. */ +int +mc_results_get_max_depth_reached (const struct mc_results *results) +{ + return results->max_depth_reached; +} + +/* Returns the mean depth reached during the model checker run + represented by RESULTS. */ +double +mc_results_get_mean_depth_reached (const struct mc_results *results) +{ + double mean; + moments1_calculate (results->depth_moments, NULL, &mean, NULL, NULL, NULL); + return mean != SYSMIS ? mean : 0.0; +} + +/* Returns the path traversed to obtain the last error + encountered during the model checker run represented by + RESULTS. Returns a null pointer if the run did not report any + errors. */ +const struct mc_path * +mc_results_get_error_path (const struct mc_results *results) +{ + return results->error_count > 0 ? &results->error_path : NULL; +} + +/* Returns the number of states dropped as duplicates (based on + hash value) during the model checker run represented by + RESULTS. */ +int +mc_results_get_duplicate_dropped_states (const struct mc_results *results) +{ + return results->duplicate_dropped_states; +} + +/* Returns the number of states dropped because they were off the + path specified by mc_options_set_follow_path during the model + checker run represented by RESULTS. A nonzero value here + indicates a missing call to mc_include_state in the + client-supplied mutation function. */ +int +mc_results_get_off_path_dropped_states (const struct mc_results *results) +{ + return results->off_path_dropped_states; +} + +/* Returns the number of states dropped because their depth + exceeded the maximum specified with mc_options_set_max_depth + during the model checker run represented by RESULTS. */ +int +mc_results_get_depth_dropped_states (const struct mc_results *results) +{ + return results->depth_dropped_states; +} + +/* Returns the number of states dropped from the queue due to + queue overflow during the model checker run represented by + RESULTS. */ +int +mc_results_get_queue_dropped_states (const struct mc_results *results) +{ + return results->queue_dropped_states; +} + +/* Returns the number of states that were checked and enqueued + but never dequeued and processed during the model checker run + represented by RESULTS. This is zero if the stop reason is + MC_CONTINUING or MC_SUCCESS; otherwise, it is the number of + states in the queue at the time that the checking run + stopped. */ +int +mc_results_get_queued_unprocessed_states (const struct mc_results *results) +{ + return results->queued_unprocessed_states; +} + +/* Returns the maximum length of the queue during the model + checker run represented by RESULTS. If this is equal to the + maximum queue length, then the queue (probably) overflowed + during the run; otherwise, it did not overflow. */ +int +mc_results_get_max_queue_length (const struct mc_results *results) +{ + return results->max_queue_length; +} + +/* Returns the time at which the model checker run represented by + RESULTS started. */ +struct timeval +mc_results_get_start (const struct mc_results *results) +{ + return results->start; +} + +/* Returns the time at which the model checker run represented by + RESULTS ended. (This function may not be called while the run + is still ongoing.) */ +struct timeval +mc_results_get_end (const struct mc_results *results) +{ + assert (results->stop_reason != MC_CONTINUING); + return results->end; +} + +/* Returns the number of seconds obtained by subtracting time Y + from time X. */ +static double +timeval_subtract (struct timeval x, struct timeval y) +{ + /* From libc.info. */ + double difference; + + /* Perform the carry for the later subtraction by updating Y. */ + if (x.tv_usec < y.tv_usec) { + int nsec = (y.tv_usec - x.tv_usec) / 1000000 + 1; + y.tv_usec -= 1000000 * nsec; + y.tv_sec += nsec; + } + if (x.tv_usec - y.tv_usec > 1000000) { + int nsec = (x.tv_usec - y.tv_usec) / 1000000; + y.tv_usec += 1000000 * nsec; + y.tv_sec -= nsec; + } + + /* Compute the time remaining to wait. + `tv_usec' is certainly positive. */ + difference = (x.tv_sec - y.tv_sec) + (x.tv_usec - y.tv_usec) / 1000000.0; + if (x.tv_sec < y.tv_sec) + difference = -difference; + return difference; +} + + +/* Returns the duration, in seconds, of the model checker run + represented by RESULTS. (This function may not be called + while the run is still ongoing.) */ +double +mc_results_get_duration (const struct mc_results *results) +{ + assert (results->stop_reason != MC_CONTINUING); + return timeval_subtract (results->end, results->start); +} + +/* An active model checking run. */ +struct mc + { + /* Related data structures. */ + const struct mc_class *class; + struct mc_options *options; + struct mc_results *results; + + /* Array of 2**(options->hash_bits) bits representing states + already visited. */ + unsigned char *hash; + + /* State queue. */ + struct mc_state **queue; /* Array of pointers to states. */ + struct deque queue_deque; /* Deque. */ + + /* State currently being built by "init" or "mutate". */ + struct mc_path path; /* Path to current state. */ + struct string path_string; /* Buffer for path_string function. */ + bool state_named; /* mc_name_operation called? */ + bool state_error; /* mc_error called? */ + + /* Statistics for calling the progress function. */ + unsigned int progress; /* Current progress value. */ + unsigned int next_progress; /* Next value to call progress func. */ + unsigned int prev_progress; /* Last value progress func called. */ + struct timeval prev_progress_time; /* Last time progress func called. */ + + /* Information for handling and restoring SIGINT. */ + bool interrupted; /* SIGINT received? */ + bool *saved_interrupted_ptr; /* Saved value of interrupted_ptr. */ + void (*saved_sigint) (int); /* Saved SIGINT handler. */ + }; + +/* A state in the queue. */ +struct mc_state + { + struct mc_path path; /* Path to this state. */ + void *data; /* Client-supplied data. */ + }; + +/* Points to the current struct mc's "interrupted" member. */ +static bool *interrupted_ptr = NULL; + +static const char *path_string (struct mc *); +static void free_state (const struct mc *, struct mc_state *); +static void stop (struct mc *, enum mc_stop_reason); +static struct mc_state *make_state (const struct mc *, void *); +static size_t random_queue_index (struct mc *); +static void enqueue_state (struct mc *, struct mc_state *); +static void do_error_state (struct mc *); +static void next_operation (struct mc *); +static bool is_off_path (const struct mc *); +static void sigint_handler (int signum); +static void init_mc (struct mc *, + const struct mc_class *, struct mc_options *); +static void finish_mc (struct mc *); + +/* Runs the model checker on the client-specified CLASS with the + client-specified OPTIONS. OPTIONS may be a null pointer if + the defaults are acceptable. Destroys OPTIONS; use + mc_options_clone if a copy is needed. + + Returns the results of the model checking run, which must be + destroyed by the client with mc_results_destroy. + + To pass auxiliary data to the functions in CLASS, use + mc_options_set_aux on OPTIONS, which may be retrieved from the + CLASS functions using mc_get_aux. */ +struct mc_results * +mc_run (const struct mc_class *class, struct mc_options *options) +{ + struct mc mc; + + init_mc (&mc, class, options); + while (!deque_is_empty (&mc.queue_deque) + && mc.results->stop_reason == MC_CONTINUING) + { + struct mc_state *state = mc.queue[deque_pop_front (&mc.queue_deque)]; + mc_path_copy (&mc.path, &state->path); + mc_path_push (&mc.path, 0); + class->mutate (&mc, state->data); + free_state (&mc, state); + if (mc.interrupted) + stop (&mc, MC_INTERRUPTED); + } + finish_mc (&mc); + + return mc.results; +} + +/* Tests whether the current operation is one that should be + performed, checked, and enqueued. If so, returns true. + Otherwise, returns false and, unless checking is stopped, + advances to the next state. The caller should then advance + to the next operation. + + This function should be called from the client-provided + "mutate" function, according to the pattern explained in the + big comment at the top of model-checker.h. */ +bool +mc_include_state (struct mc *mc) +{ + if (mc->results->stop_reason != MC_CONTINUING) + return false; + else if (is_off_path (mc)) + { + next_operation (mc); + return false; + } + else + return true; +} + +/* Tests whether HASH represents a state that has (probably) + already been enqueued. If not, returns false and marks HASH + so that it will be treated as a duplicate in the future. If + so, returns true and advances to the next state. The + caller should then advance to the next operation. + + This function should be called from the client-provided + "mutate" function, according to the pattern explained in the + big comment at the top of model-checker.h. */ +bool +mc_discard_dup_state (struct mc *mc, unsigned int hash) +{ + if (mc->options->hash_bits > 0) + { + hash &= (1u << mc->options->hash_bits) - 1; + if (TEST_BIT (mc->hash, hash)) + { + if (mc->options->verbosity > 2) + fprintf (mc->options->output_file, + " [%s] discard duplicate state\n", path_string (mc)); + mc->results->duplicate_dropped_states++; + next_operation (mc); + return true; + } + SET_BIT (mc->hash, hash); + } + return false; +} + +/* Names the current state NAME, which may contain + printf-style format specifications. NAME should be a + human-readable name for the current operation. + + This function should be called from the client-provided + "mutate" function, according to the pattern explained in the + big comment at the top of model-checker.h. */ +void +mc_name_operation (struct mc *mc, const char *name, ...) +{ + va_list args; + + va_start (args, name); + mc_vname_operation (mc, name, args); + va_end (args); +} + +/* Names the current state NAME, which may contain + printf-style format specifications, for which the + corresponding arguments must be given in ARGS. NAME should be + a human-readable name for the current operation. + + This function should be called from the client-provided + "mutate" function, according to the pattern explained in the + big comment at the top of model-checker.h. */ +void +mc_vname_operation (struct mc *mc, const char *name, va_list args) +{ + if (mc->state_named && mc->options->verbosity > 0) + fprintf (mc->options->output_file, " [%s] warning: duplicate call " + "to mc_name_operation (missing call to mc_add_state?)\n", + path_string (mc)); + mc->state_named = true; + + if (mc->options->verbosity > 1) + { + fprintf (mc->options->output_file, " [%s] ", path_string (mc)); + vfprintf (mc->options->output_file, name, args); + putc ('\n', mc->options->output_file); + } +} + +/* Reports the given error MESSAGE for the current operation. + The resulting state should still be passed to mc_add_state + when all relevant error messages have been issued. The state + will not, however, be enqueued for later mutation of its own. + + By default, model checking stops after the first error + encountered. + + This function should be called from the client-provided + "mutate" function, according to the pattern explained in the + big comment at the top of model-checker.h. */ +void +mc_error (struct mc *mc, const char *message, ...) +{ + va_list args; + + if (mc->results->stop_reason != MC_CONTINUING) + return; + + if (mc->options->verbosity > 1) + fputs (" ", mc->options->output_file); + fprintf (mc->options->output_file, "[%s] error: ", + path_string (mc)); + va_start (args, message); + vfprintf (mc->options->output_file, message, args); + va_end (args); + putc ('\n', mc->options->output_file); + + mc->state_error = true; +} + +/* Enqueues DATA as the state corresponding to the current + operation. The operation should have been named with a call + to mc_name_operation, and it should have been checked by the + caller (who should have reported any errors with mc_error). + + This function should be called from the client-provided + "mutate" function, according to the pattern explained in the + big comment at the top of model-checker.h. */ +void +mc_add_state (struct mc *mc, void *data) +{ + if (!mc->state_named && mc->options->verbosity > 0) + fprintf (mc->options->output_file, " [%s] warning: unnamed state\n", + path_string (mc)); + + if (mc->results->stop_reason != MC_CONTINUING) + { + /* Nothing to do. */ + } + else if (mc->state_error) + do_error_state (mc); + else if (is_off_path (mc)) + mc->results->off_path_dropped_states++; + else if (mc->path.length + 1 > mc->options->max_depth) + mc->results->depth_dropped_states++; + else + { + /* This is the common case. */ + mc->results->unique_state_count++; + if (mc->results->unique_state_count >= mc->options->max_unique_states) + stop (mc, MC_MAX_UNIQUE_STATES); + enqueue_state (mc, make_state (mc, data)); + next_operation (mc); + return; + } + + mc->class->destroy (mc, data); + next_operation (mc); +} + +/* Returns the options that were passed to mc_run for model + checker MC. */ +const struct mc_options * +mc_get_options (const struct mc *mc) +{ + return mc->options; +} + +/* Returns the current state of the results for model checker + MC. This function is appropriate for use from the progress + function set by mc_options_set_progress_func. + + Not all of the results are meaningful before model checking + completes. */ +const struct mc_results * +mc_get_results (const struct mc *mc) +{ + return mc->results; +} + +/* Returns the auxiliary data set on the options passed to mc_run + with mc_options_set_aux. */ +void * +mc_get_aux (const struct mc *mc) +{ + return mc_options_get_aux (mc_get_options (mc)); +} + +/* Expresses MC->path as a string and returns the string. */ +static const char * +path_string (struct mc *mc) +{ + ds_clear (&mc->path_string); + mc_path_to_string (&mc->path, &mc->path_string); + return ds_cstr (&mc->path_string); +} + +/* Frees STATE, including client data. */ +static void +free_state (const struct mc *mc, struct mc_state *state) +{ + mc->class->destroy (mc, state->data); + mc_path_destroy (&state->path); + free (state); +} + +/* Sets STOP_REASON as the reason that MC's processing stopped, + unless MC is already stopped. */ +static void +stop (struct mc *mc, enum mc_stop_reason stop_reason) +{ + if (mc->results->stop_reason == MC_CONTINUING) + mc->results->stop_reason = stop_reason; +} + +/* Creates and returns a new state whose path is copied from + MC->path and whose data is specified by DATA. */ +static struct mc_state * +make_state (const struct mc *mc, void *data) +{ + struct mc_state *new = xmalloc (sizeof *new); + mc_path_init (&new->path); + mc_path_copy (&new->path, &mc->path); + new->data = data; + return new; +} + +/* Returns the index in MC->queue of a random element in the + queue. */ +static size_t +random_queue_index (struct mc *mc) +{ + assert (!deque_is_empty (&mc->queue_deque)); + return deque_front (&mc->queue_deque, + rand () % deque_count (&mc->queue_deque)); +} + +/* Adds NEW to MC's state queue, dropping a state if necessary + due to overflow. */ +static void +enqueue_state (struct mc *mc, struct mc_state *new) +{ + size_t idx; + + if (new->path.length > mc->results->max_depth_reached) + mc->results->max_depth_reached = new->path.length; + moments1_add (mc->results->depth_moments, new->path.length, 1.0); + + if (deque_count (&mc->queue_deque) < mc->options->queue_limit) + { + /* Add new state to queue. */ + if (deque_is_full (&mc->queue_deque)) + mc->queue = deque_expand (&mc->queue_deque, + mc->queue, sizeof *mc->queue); + switch (mc->options->strategy) + { + case MC_BROAD: + idx = deque_push_back (&mc->queue_deque); + break; + case MC_DEEP: + idx = deque_push_front (&mc->queue_deque); + break; + case MC_RANDOM: + if (!deque_is_empty (&mc->queue_deque)) + { + idx = random_queue_index (mc); + mc->queue[deque_push_front (&mc->queue_deque)] + = mc->queue[idx]; + } + else + idx = deque_push_front (&mc->queue_deque); + break; + case MC_PATH: + assert (deque_is_empty (&mc->queue_deque)); + assert (!is_off_path (mc)); + idx = deque_push_back (&mc->queue_deque); + if (mc->path.length + >= mc_path_get_length (&mc->options->follow_path)) + stop (mc, MC_END_OF_PATH); + break; + default: + NOT_REACHED (); + } + if (deque_count (&mc->queue_deque) > mc->results->max_queue_length) + mc->results->max_queue_length = deque_count (&mc->queue_deque); + } + else + { + /* Queue has reached limit, so replace an existing + state. */ + assert (mc->options->strategy != MC_PATH); + assert (!deque_is_empty (&mc->queue_deque)); + mc->results->queue_dropped_states++; + switch (mc->options->queue_limit_strategy) + { + case MC_DROP_NEWEST: + free_state (mc, new); + return; + case MC_DROP_OLDEST: + switch (mc->options->strategy) + { + case MC_BROAD: + idx = deque_front (&mc->queue_deque, 0); + break; + case MC_DEEP: + idx = deque_back (&mc->queue_deque, 0); + break; + case MC_RANDOM: + case MC_PATH: + default: + NOT_REACHED (); + } + break; + case MC_DROP_RANDOM: + idx = random_queue_index (mc); + break; + default: + NOT_REACHED (); + } + free_state (mc, mc->queue[idx]); + } + mc->queue[idx] = new; +} + +/* Process an error state being added to MC. */ +static void +do_error_state (struct mc *mc) +{ + mc->results->error_count++; + if (mc->results->error_count >= mc->options->max_errors) + stop (mc, MC_MAX_ERROR_COUNT); + + mc_path_copy (&mc->results->error_path, &mc->path); + + if (mc->options->failure_verbosity > mc->options->verbosity) + { + struct mc_options *path_options; + + fprintf (mc->options->output_file, "[%s] retracing error path:\n", + path_string (mc)); + path_options = mc_options_clone (mc->options); + mc_options_set_verbosity (path_options, mc->options->failure_verbosity); + mc_options_set_failure_verbosity (path_options, 0); + mc_options_set_follow_path (path_options, &mc->path); + + mc_results_destroy (mc_run (mc->class, path_options)); + + putc ('\n', mc->options->output_file); + } +} + +/* Advances MC to start processing the operation following the + current one. */ +static void +next_operation (struct mc *mc) +{ + mc_path_push (&mc->path, mc_path_pop (&mc->path) + 1); + mc->state_error = false; + mc->state_named = false; + + if (++mc->progress >= mc->next_progress) + { + struct timeval now; + double elapsed, delta; + + if (mc->results->stop_reason == MC_CONTINUING + && !mc->options->progress_func (mc)) + stop (mc, MC_INTERRUPTED); + + gettimeofday (&now, NULL); + + if (mc->options->time_limit > 0.0 + && (timeval_subtract (now, mc->results->start) + > mc->options->time_limit)) + stop (mc, MC_TIMEOUT); + + elapsed = timeval_subtract (now, mc->prev_progress_time); + if (elapsed > 0.0) + { + /* Re-estimate the amount of progress to take + progress_usec microseconds. */ + unsigned int progress = mc->progress - mc->prev_progress; + double progress_sec = mc->options->progress_usec / 1000000.0; + delta = progress / elapsed * progress_sec; + } + else + { + /* No measurable time at all elapsed during that amount + of progress. Try doubling the amount of progress + required. */ + delta = (mc->progress - mc->prev_progress) * 2; + } + + if (delta > 0.0 && delta + mc->progress + 1.0 < UINT_MAX) + mc->next_progress = mc->progress + delta + 1.0; + else + mc->next_progress = mc->progress + (mc->progress - mc->prev_progress); + + mc->prev_progress = mc->progress; + mc->prev_progress_time = now; + } +} + +/* Returns true if we're tracing an explicit path but the current + operation produces a state off that path, false otherwise. */ +static bool +is_off_path (const struct mc *mc) +{ + return (mc->options->strategy == MC_PATH + && (mc_path_back (&mc->path) + != mc_path_get_operation (&mc->options->follow_path, + mc->path.length - 1))); +} + +/* Handler for SIGINT. */ +static void +sigint_handler (int signum UNUSED) +{ + /* Just mark the model checker as interrupted. */ + *interrupted_ptr = true; +} + +/* Initializes MC as a model checker with the given CLASS and + OPTIONS. OPTIONS may be null to use the default options. */ +static void +init_mc (struct mc *mc, const struct mc_class *class, + struct mc_options *options) +{ + /* Validate and adjust OPTIONS. */ + if (options == NULL) + options = mc_options_create (); + assert (options->queue_limit_strategy != MC_DROP_OLDEST + || options->strategy != MC_RANDOM); + if (options->strategy == MC_PATH) + { + options->max_depth = INT_MAX; + options->hash_bits = 0; + } + if (options->progress_usec == 0) + { + options->progress_func = null_progress; + if (options->time_limit > 0.0) + options->progress_usec = 250000; + } + + /* Initialize MC. */ + mc->class = class; + mc->options = options; + mc->results = mc_results_create (); + + mc->hash = (mc->options->hash_bits > 0 + ? xcalloc (1, DIV_RND_UP (1 << mc->options->hash_bits, CHAR_BIT)) + : NULL); + + mc->queue = NULL; + deque_init_null (&mc->queue_deque); + + mc_path_init (&mc->path); + mc_path_push (&mc->path, 0); + ds_init_empty (&mc->path_string); + mc->state_named = false; + mc->state_error = false; + + mc->progress = 0; + mc->next_progress = mc->options->progress_usec != 0 ? 100 : UINT_MAX; + mc->prev_progress = 0; + mc->prev_progress_time = mc->results->start; + + if (mc->options->strategy == MC_RANDOM + || options->queue_limit_strategy == MC_DROP_RANDOM) + srand (mc->options->seed); + + mc->interrupted = false; + mc->saved_interrupted_ptr = interrupted_ptr; + interrupted_ptr = &mc->interrupted; + mc->saved_sigint = signal (SIGINT, sigint_handler); + + class->init (mc); +} + +/* Complete the model checker run for MC. */ +static void +finish_mc (struct mc *mc) +{ + /* Restore signal handlers. */ + signal (SIGINT, mc->saved_sigint); + interrupted_ptr = mc->saved_interrupted_ptr; + + /* Mark the run complete. */ + stop (mc, MC_SUCCESS); + gettimeofday (&mc->results->end, NULL); + + /* Empty the queue. */ + mc->results->queued_unprocessed_states = deque_count (&mc->queue_deque); + while (!deque_is_empty (&mc->queue_deque)) + { + struct mc_state *state = mc->queue[deque_pop_front (&mc->queue_deque)]; + free_state (mc, state); + } + + /* Notify the progress function of completion. */ + mc->options->progress_func (mc); + + /* Free memory. */ + mc_path_destroy (&mc->path); + ds_destroy (&mc->path_string); + free (mc->options); + free (mc->queue); + free (mc->hash); +} diff --git a/src/language/tests/model-checker.h b/src/language/tests/model-checker.h new file mode 100644 index 0000000000..8c86fae61d --- /dev/null +++ b/src/language/tests/model-checker.h @@ -0,0 +1,463 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2007 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +/* Implementation-level model checker. + + A model checker is a tool for software testing and + verification that works by exploring all the possible states + in a system and verifying their internal consistency. A + conventional model checker requires that the code in a system + be translated into a specification language. The model + checker then verifies the specification, rather than the code. + + This is instead an implementation-level model checker, which + does not require a separate specification. Instead, the model + checker requires writing a second implementation of the system + being checked. The second implementation can usually be made + almost trivial in comparison to the one being checked, because + it's usually acceptable for its performance to be + comparatively poor, e.g. O(N^2) instead of O(lg N), and thus + to use much simpler algorithms. + + For introduction to the implementation-level model checking + approach used here, please refer to the following papers: + + Musuvathi, Park, Chou, Engler, Dill, "CMC: A Pragmatic + Approach to Model Checking Real Code", Proceedings of the + Fifth Symposium on Operating Systems Design and + Implementation (OSDI), Dec 2002. + http://sprout.stanford.edu/PAPERS/CMC-OSDI-2002/CMC-OSDI-2002.pdf + + Yang, Twohey, Engler, Musuvathi, "Using Model Checking to + Find Serious File System Errors", Proceedings of the Sixth + Symposium on Operating System Design and Implementation + (OSDI), Dec 2004. + http://www.stanford.edu/~engler/osdi04-fisc.pdf + + Yang, Twohey, Pfaff, Sar, Engler, "EXPLODE: A Lightweight, + General Approach to Finding Serious Errors in Storage + Systems", First Workshop on the Evaluation of Software + Defect Detection Tools (BUGS), June 2005. + http://benpfaff.org/papers/explode.pdf + + Use of a model checker is appropriate when the system being + checked is difficult to test using handwritten tests. This + can be the case, for example, when the system has a + complicated internal state that is difficult to reason about + over a long series of operations. + + The implementation model checker works by putting a set of one + of more initial states in a queue (and checking them for + consistency). Then the model checker removes a state from the + queue and applies all possible operations of interest to it + ("mutates" it), obtaining a set of zero or more child states + (and checking each of them for consistency). Each of these + states is itself added to the queue. The model checker + continues dequeuing states and mutating and checking them + until the queue is empty. + + In pseudo-code, the process looks like this: + + Q = { initial states } + while Q is not empty: + S = dequeue(Q) + for each operation applicable to S do: + T = operation(S) + check(T) + enqueue(Q, T) + + In many cases this process will never terminate, because every + state has one or more child states. For some systems this is + unavoidable, but in others we can make the process finite by + pursuing a few stratagems: + + 1. Limit the maximum size of the system; for example, limit + the number of rows and columns in the implementation of a + table being checked. The client of the model checker is + responsible for implementing such limits. + + 2. Avoid checking a single state more than one time. This + model checker provides assistance for this function by + allowing the client to provide a hash of the system state. + States with identical hashes will only be checked once + during a single run. + + When a system cannot be made finite, or when a finite system + is too large to check in a practical amount of time, the model + checker provides multiple ways to limit the checking run: + based on maximum depth, maximum unique states checked, maximum + errors found (by default, 1), or maximum time used for + checking. + + The client of the model checker must provide three functions + via function pointers embedded into a "struct mc_class": + + 1. void init (struct mc *mc); + + This function is called once at the beginning of a + checking run. It checks one or more initial states and + adds them to the model checker's queue. (If it does not + add any states to the queue, then there is nothing to + check.) + + Here's an outline for writing the init function: + + void + init_foo (struct mc *mc) + { + struct foo *foo; + + mc_name_operation (mc, "initial state"); + foo = generate_initial_foo (); + if (!state_is_consistent (foo)) + mc_error (mc, "inconsistent state"); + mc_add_state (mc, foo); + } + + 2. void mutate (struct mc *mc, const void *data); + + This function is called when a dequeued state is ready to + be mutated. For each operation that can be applied to + the client-specified DATA, it applies that operation to a + clone of the DATA, checks that the clone is consistent, + and adds the clone to the model checker's queue. + + Here's an outline for writing the mutate function: + + void + mutate_foo (struct mc *mc, void *state_) + { + struct foo *state = state_; + + for (...each operation...) + if (mc_include_state (mc)) + { + struct foo *clone; + + mc_name_operation (mc, "do operation %s", ...); + clone = clone_foo (state); + do_operation (clone); + if (!mc_discard_dup_state (mc, hash_foo (clone))) + { + if (!state_is_consistent (clone)) + mc_error (mc, "inconsistent state"); + mc_add_state (mc, clone); + } + else + destroy_foo (clone); + } + } + + Notes on the above outline: + + - The call to mc_include_state allows currently + uninteresting operations to be skipped. It is not + essential. + + - The call to mc_name_operation should give the current + operation a human-readable name. The name may + include printf-style format specifications. + + When an error occurs, the model checker (by default) + replays the sequence of operations performed to reach + the error, printing the name of the operation at each + step, which is often sufficient information in itself + to debug the error. + + At higher levels of verbosity, the name is printed + for each operation. + + - Operations should be performed on a copy of the data + provided. The data provided should not be destroyed + or modified. + + - The call to mc_discard_dup_state is needed to discard + (probably) duplicate states. It is otherwise + optional. + + To reduce the probability of collisions, use a + high-quality hash function. MD4 is a reasonable + choice: it is fast but high-quality. In one test, + switching to MD4 from MD5 increased overall speed of + model checking by 8% and actually reduced (!) the + number of collisions. + + The hash value needs to include enough of the state + to ensure that interesting states are not excluded, + but it need not include the entire state. For + example, in many cases, the structure of complex data + (metadata) is often much more important than the + contents (data), so it may be reasonable to hash only + the metadata. + + mc_discard_dup_state may be called before or after + checking for consistency, but calling it first avoids + wasting time checking duplicate states for + consistency, which again can be a significant + performance boost. + + - The mc_error function reports errors. It may be + called as many times as desired to report each kind + of inconsistency in a state. + + - The mc_add_state function adds the new state to the + queue. It should be called regardless of whether an + error was reported, to indicate to the model checker + that state processing has finished. + + - The mutation function should be deterministic, to + make it possible to reliably reproduce errors. + + 3. void destroy (struct mc *mc, void *data); + + This function is called to discard the client-specified + DATA associated with a state. + + Numerous options are available for configuring the model + checker. The most important of these are: + + - Search algorithm: + + * Breadth-first search (the default): First try all the + operations with depth 1, then those with depth 2, then + those with depth 3, and so on. + + This search algorithm finds the least number of + operations needed to trigger a given bug. + + * Depth-first search: Searches downward in the tree of + states as fast as possible. Good for finding bugs that + require long sequences of operations to trigger. + + * Random-first search: Searches through the tree of + states in random order. + + * Explicit path: Applies an explicitly specified sequence + of operations. + + - Verbosity: By default, messages are printed only when an + error is encountered, but you can cause the checker to + print a message on every state transition. This is most + useful when the errors in your code cause segfaults or + some other kind of sudden termination. + + - Treatment of errors: By default, when an error is + encountered, the model checker recursively invokes itself + with an increased verbosity level and configured to follow + only the error path. As long as the mutation function is + deterministic, this quickly and concisely replays the + error and describes the path followed to reach it in an + easily human-readable manner. + + - Limits: + + * Maximum depth: You can limit the depth of the operations + performed. Most often useful with depth-first search. + By default, depth is unlimited. + + * Maximum queue length: You can limit the number of states + kept in the queue at any given time. The main reason to + do so is to limit memory consumption. The default + limit is 10,000 states. Several strategies are + available for choosing which state to drop when the + queue overflows. + + - Stop conditions: based on maximum unique states checked, + maximum errors found (by default, 1), or maximum time used + for checking. + + - Progress: by default, the checker prints a '.' on stderr + every .25 seconds, but you can substitute another progress + function or disable progress printing. + + This model checker does not (yet) include two features + described in the papers cited above: utility scoring + heuristics to guide the search strategy and "choice points" to + explore alternative cases. The former feature is less + interesting for this model checker, because the data + structures we are thus far using it to model are much smaller + than those discussed in the paper. The latter feature we + should implement at some point. */ + +#ifndef LIBPSPP_MODEL_CHECKER_H +#define LIBPSPP_MODEL_CHECKER_H 1 + +#include +#include +#include +#include + +#include + +/* An active model checking run. */ +struct mc; + +/* Provided by each client of the model checker. */ +struct mc_class + { + void (*init) (struct mc *); + void (*mutate) (struct mc *, const void *); + void (*destroy) (const struct mc *, void *); + }; + +/* Results of a model checking run. */ +struct mc_results; + +/* Configuration for running the model checker. */ +struct mc_options; + +/* Primary external interface to model checker. */ +struct mc_results *mc_run (const struct mc_class *, struct mc_options *); + +/* Functions for use from client-supplied "init" and "mutate" + functions. */ +bool mc_include_state (struct mc *); +bool mc_discard_dup_state (struct mc *, unsigned int hash); +void mc_name_operation (struct mc *, const char *, ...) PRINTF_FORMAT (2, 3); +void mc_vname_operation (struct mc *, const char *, va_list) + PRINTF_FORMAT (2, 0); +void mc_error (struct mc *, const char *, ...) PRINTF_FORMAT (2, 3); +void mc_add_state (struct mc *, void *data); + +/* Functions for use from client-supplied "init", "mutate", and + "destroy" functions. */ +const struct mc_options *mc_get_options (const struct mc *); +const struct mc_results *mc_get_results (const struct mc *); +void *mc_get_aux (const struct mc *); + +/* A path of operations through a model to arrive at some + particular state. */ +struct mc_path + { + int *ops; /* Sequence of operations. */ + size_t length; /* Number of operations. */ + size_t capacity; /* Number of operations for which room is allocated. */ + }; + +void mc_path_init (struct mc_path *); +void mc_path_copy (struct mc_path *, const struct mc_path *); +void mc_path_push (struct mc_path *, int new_state); +int mc_path_pop (struct mc_path *); +int mc_path_back (const struct mc_path *); +void mc_path_destroy (struct mc_path *); + +int mc_path_get_operation (const struct mc_path *, size_t index); +size_t mc_path_get_length (const struct mc_path *); + +struct string; +void mc_path_to_string (const struct mc_path *, struct string *); + +struct mc_options *mc_options_create (void); +struct mc_options *mc_options_clone (const struct mc_options *); +void mc_options_destroy (struct mc_options *); + +/* Search strategy. */ +enum mc_strategy + { + MC_BROAD, /* Breadth-first search. */ + MC_DEEP, /* Depth-first search. */ + MC_RANDOM, /* Randomly ordered search. */ + MC_PATH /* Follow one explicit path. */ + }; + +enum mc_strategy mc_options_get_strategy (const struct mc_options *); +void mc_options_set_strategy (struct mc_options *, enum mc_strategy); +unsigned int mc_options_get_seed (const struct mc_options *); +void mc_options_set_seed (struct mc_options *, unsigned int seed); +int mc_options_get_max_depth (const struct mc_options *); +void mc_options_set_max_depth (struct mc_options *, int max_depth); +int mc_options_get_hash_bits (const struct mc_options *); +void mc_options_set_hash_bits (struct mc_options *, int hash_bits); + +const struct mc_path *mc_options_get_follow_path (const struct mc_options *); +void mc_options_set_follow_path (struct mc_options *, const struct mc_path *); + +/* Strategy for dropped states from the queue when it + overflows. */ +enum mc_queue_limit_strategy + { + MC_DROP_NEWEST, /* Don't enqueue the new state at all. */ + MC_DROP_OLDEST, /* Drop the oldest state in the queue. */ + MC_DROP_RANDOM /* Drop a random state from the queue. */ + }; + +int mc_options_get_queue_limit (const struct mc_options *); +void mc_options_set_queue_limit (struct mc_options *, int queue_limit); +enum mc_queue_limit_strategy mc_options_get_queue_limit_strategy ( + const struct mc_options *); +void mc_options_set_queue_limit_strategy (struct mc_options *, + enum mc_queue_limit_strategy); + +int mc_options_get_max_unique_states (const struct mc_options *); +void mc_options_set_max_unique_states (struct mc_options *, + int max_unique_states); +int mc_options_get_max_errors (const struct mc_options *); +void mc_options_set_max_errors (struct mc_options *, int max_errors); +double mc_options_get_time_limit (const struct mc_options *); +void mc_options_set_time_limit (struct mc_options *, double time_limit); + +int mc_options_get_verbosity (const struct mc_options *); +void mc_options_set_verbosity (struct mc_options *, int verbosity); +int mc_options_get_failure_verbosity (const struct mc_options *); +void mc_options_set_failure_verbosity (struct mc_options *, + int failure_verbosity); +FILE *mc_options_get_output_file (const struct mc_options *); +void mc_options_set_output_file (struct mc_options *, FILE *); + +typedef bool mc_progress_func (struct mc *); +int mc_options_get_progress_usec (const struct mc_options *); +void mc_options_set_progress_usec (struct mc_options *, int progress_usec); +mc_progress_func *mc_options_get_progress_func (const struct mc_options *); +void mc_options_set_progress_func (struct mc_options *, mc_progress_func *); + +void *mc_options_get_aux (const struct mc_options *); +void mc_options_set_aux (struct mc_options *, void *aux); + +/* Reason that a model checking run terminated. */ +enum mc_stop_reason + { + MC_CONTINUING, /* Run has not yet terminated. */ + MC_SUCCESS, /* Queue emptied (ran out of states). */ + MC_MAX_UNIQUE_STATES, /* Did requested number of unique states. */ + MC_MAX_ERROR_COUNT, /* Too many errors. */ + MC_END_OF_PATH, /* Processed requested path (MC_PATH only). */ + MC_TIMEOUT, /* Timeout. */ + MC_INTERRUPTED /* Received SIGINT (Ctrl+C). */ + }; + +void mc_results_destroy (struct mc_results *); + +enum mc_stop_reason mc_results_get_stop_reason (const struct mc_results *); +int mc_results_get_unique_state_count (const struct mc_results *); +int mc_results_get_error_count (const struct mc_results *); + +int mc_results_get_max_depth_reached (const struct mc_results *); +double mc_results_get_mean_depth_reached (const struct mc_results *); + +const struct mc_path *mc_results_get_error_path (const struct mc_results *); + +int mc_results_get_duplicate_dropped_states (const struct mc_results *); +int mc_results_get_off_path_dropped_states (const struct mc_results *); +int mc_results_get_depth_dropped_states (const struct mc_results *); +int mc_results_get_queue_dropped_states (const struct mc_results *); +int mc_results_get_queued_unprocessed_states (const struct mc_results *); +int mc_results_get_max_queue_length (const struct mc_results *); + +struct timeval mc_results_get_start (const struct mc_results *); +struct timeval mc_results_get_end (const struct mc_results *); +double mc_results_get_duration (const struct mc_results *); + +#endif /* libpspp/model-checker.h */ diff --git a/src/language/xforms/recode.c b/src/language/xforms/recode.c index fb02c910a4..eef48f3d09 100644 --- a/src/language/xforms/recode.c +++ b/src/language/xforms/recode.c @@ -161,7 +161,7 @@ cmd_recode (struct lexer *lexer, struct dataset *ds) This must be the final step; otherwise we'd have to delete destination variables on failure. */ if (trns->src_vars != trns->dst_vars) - create_dst_vars (trns, dataset_dict (ds)); + create_dst_vars (trns, dataset_dict (ds)); /* Done. */ add_transformation (ds, @@ -230,6 +230,7 @@ parse_mappings (struct lexer *lexer, struct recode_trns *trns) do { struct map_in in; + if (!parse_map_in (lexer, &in, trns->pool, trns->src_type, max_src_width)) return false; @@ -240,7 +241,11 @@ parse_mappings (struct lexer *lexer, struct recode_trns *trns) if (!parse_map_out (lexer, trns->pool, &out)) return false; - dst_type = val_type_from_width (out.width); + + if (out.copy_input) + dst_type = trns->src_type; + else + dst_type = val_type_from_width (out.width); if (have_dst_type && dst_type != trns->dst_type) { msg (SE, _("Inconsistent target variable types. " @@ -289,8 +294,11 @@ static bool parse_map_in (struct lexer *lexer, struct map_in *in, struct pool *pool, enum val_type src_type, size_t max_src_width) { + if (lex_match_id (lexer, "ELSE")) + { set_map_in_generic (in, MAP_ELSE); + } else if (src_type == VAL_NUMERIC) { if (lex_match_id (lexer, "MISSING")) @@ -307,16 +315,21 @@ parse_map_in (struct lexer *lexer, struct map_in *in, struct pool *pool, } else { - if (!lex_force_string (lexer)) + if (lex_match_id (lexer, "MISSING")) + set_map_in_generic (in, MAP_MISSING); + else if (!lex_force_string (lexer)) return false; - set_map_in_str (in, pool, lex_tokstr (lexer), max_src_width); - lex_get (lexer); - if (lex_token (lexer) == T_ID - && lex_id_match (ss_cstr ("THRU"), ss_cstr (lex_tokid (lexer)))) - { - msg (SE, _("THRU is not allowed with string variables.")); - return false; - } + else + { + set_map_in_str (in, pool, lex_tokstr (lexer), max_src_width); + lex_get (lexer); + if (lex_token (lexer) == T_ID + && lex_id_match (ss_cstr ("THRU"), ss_cstr (lex_tokid (lexer)))) + { + msg (SE, _("THRU is not allowed with string variables.")); + return false; + } + } } return true; @@ -461,6 +474,7 @@ parse_dst_vars (struct lexer *lexer, struct recode_trns *trns, return false; } } + } else { @@ -584,9 +598,10 @@ find_src_numeric (struct recode_trns *trns, double value, const struct variable /* Returns the output mapping in TRNS for an input of VALUE with the given WIDTH, or a null pointer if there is no mapping. */ static const struct map_out * -find_src_string (struct recode_trns *trns, const char *value, int width) +find_src_string (struct recode_trns *trns, const char *value, const struct variable *src_var) { struct mapping *m; + int width = var_get_width (src_var); for (m = trns->mappings; m < trns->mappings + trns->map_cnt; m++) { @@ -613,6 +628,9 @@ find_src_string (struct recode_trns *trns, const char *value, int width) out->value.f = uv.f; break; } + case MAP_MISSING: + match = var_is_str_missing (src_var, value, MV_ANY); + break; default: NOT_REACHED (); } @@ -644,7 +662,7 @@ recode_trns_proc (void *trns_, struct ccase *c, casenumber case_idx UNUSED) if (trns->src_type == VAL_NUMERIC) out = find_src_numeric (trns, src_data->f, src_var); else - out = find_src_string (trns, src_data->s, var_get_width (src_var)); + out = find_src_string (trns, src_data->s, src_var); if (trns->dst_type == VAL_NUMERIC) { diff --git a/src/libpspp/automake.mk b/src/libpspp/automake.mk index a0d7879f52..0c20643bbc 100644 --- a/src/libpspp/automake.mk +++ b/src/libpspp/automake.mk @@ -1,9 +1,9 @@ ## Process this file with automake to produce Makefile.in -*- makefile -*- -noinst_LIBRARIES += src/libpspp/libpspp.a +noinst_LTLIBRARIES += src/libpspp/libpspp.la -src_libpspp_libpspp_a_SOURCES = \ +src_libpspp_libpspp_la_SOURCES = \ src/libpspp/abt.c \ src/libpspp/abt.h \ src/libpspp/array.c \ @@ -23,10 +23,16 @@ src_libpspp_libpspp_a_SOURCES = \ src/libpspp/freaderror.h \ src/libpspp/getl.c \ src/libpspp/getl.h \ + src/libpspp/hash-functions.c \ + src/libpspp/hash-functions.h \ src/libpspp/hash.c \ src/libpspp/hash.h \ src/libpspp/heap.c \ src/libpspp/heap.h \ + src/libpspp/hmap.c \ + src/libpspp/hmap.h \ + src/libpspp/hmapx.c \ + src/libpspp/hmapx.h \ src/libpspp/i18n.c \ src/libpspp/i18n.h \ src/libpspp/integer-format.c \ @@ -41,8 +47,6 @@ src_libpspp_libpspp_a_SOURCES = \ src/libpspp/message.h \ src/libpspp/misc.c \ src/libpspp/misc.h \ - src/libpspp/model-checker.c \ - src/libpspp/model-checker.h \ src/libpspp/msg-locator.c \ src/libpspp/msg-locator.h \ src/libpspp/pool.c \ @@ -67,9 +71,9 @@ src_libpspp_libpspp_a_SOURCES = \ DISTCLEANFILES+=src/libpspp/version.c -src_libpspp_libpspp_a_CPPFLAGS = -I $(top_srcdir)/src/libpspp $(AM_CPPFLAGS) +src_libpspp_libpspp_la_CPPFLAGS = -I $(top_srcdir)/src/libpspp $(AM_CPPFLAGS) -nodist_src_libpspp_libpspp_a_SOURCES = src/libpspp/version.c +nodist_src_libpspp_libpspp_la_SOURCES = src/libpspp/version.c src/libpspp/version.c: $(top_srcdir)/AUTHORS @$(MKDIR_P) src/libpspp diff --git a/src/libpspp/hash-functions.c b/src/libpspp/hash-functions.c new file mode 100644 index 0000000000..2318777392 --- /dev/null +++ b/src/libpspp/hash-functions.c @@ -0,0 +1,90 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 1997-9, 2000, 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include +#include +#include +#include +#include + +/* Fowler-Noll-Vo hash constants, for 32-bit word sizes. */ +#define FNV_32_PRIME 16777619u +#define FNV_32_BASIS 2166136261u + +/* Fowler-Noll-Vo 32-bit hash, for bytes. */ +unsigned +hsh_hash_bytes (const void *buf_, size_t size) +{ + const unsigned char *buf = (const unsigned char *) buf_; + unsigned hash; + + assert (buf != NULL); + + hash = FNV_32_BASIS; + while (size-- > 0) + hash = (hash * FNV_32_PRIME) ^ *buf++; + + return hash; +} + +/* Fowler-Noll-Vo 32-bit hash, for strings. */ +unsigned +hsh_hash_string (const char *s_) +{ + const unsigned char *s = (const unsigned char *) s_; + unsigned hash; + + assert (s != NULL); + + hash = FNV_32_BASIS; + while (*s != '\0') + hash = (hash * FNV_32_PRIME) ^ *s++; + + return hash; +} + +/* Fowler-Noll-Vo 32-bit hash, for case-insensitive strings. */ +unsigned +hsh_hash_case_string (const char *s_) +{ + const unsigned char *s = (const unsigned char *) s_; + unsigned hash; + + assert (s != NULL); + + hash = FNV_32_BASIS; + while (*s != '\0') + hash = (hash * FNV_32_PRIME) ^ toupper (*s++); + + return hash; +} + +/* Hash for ints. */ +unsigned +hsh_hash_int (int i) +{ + return hsh_hash_bytes (&i, sizeof i); +} + +/* Hash for double. */ +unsigned +hsh_hash_double (double d) +{ + if (!isnan (d)) + return hsh_hash_bytes (&d, sizeof d); + else + return 0; +} diff --git a/src/libpspp/hash-functions.h b/src/libpspp/hash-functions.h new file mode 100644 index 0000000000..328f4deda0 --- /dev/null +++ b/src/libpspp/hash-functions.h @@ -0,0 +1,28 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 1997-9, 2000 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#ifndef LIBPSPP_HASH_FUNCTIONS_H +#define LIBPSPP_HASH_FUNCTIONS_H 1 + +#include + +unsigned hsh_hash_bytes (const void *, size_t); +unsigned hsh_hash_string (const char *); +unsigned hsh_hash_case_string (const char *); +unsigned hsh_hash_int (int); +unsigned hsh_hash_double (double); + +#endif /* libpspp/hash-functions.h */ diff --git a/src/libpspp/hash.c b/src/libpspp/hash.c index 9da3deb120..eb43b54e7f 100644 --- a/src/libpspp/hash.c +++ b/src/libpspp/hash.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2008 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,6 @@ #include "hash.h" #include "message.h" #include -#include #include #include #include @@ -70,74 +69,6 @@ next_power_of_2 (size_t x) } } -/* Fowler-Noll-Vo hash constants, for 32-bit word sizes. */ -#define FNV_32_PRIME 16777619u -#define FNV_32_BASIS 2166136261u - -/* Fowler-Noll-Vo 32-bit hash, for bytes. */ -unsigned -hsh_hash_bytes (const void *buf_, size_t size) -{ - const unsigned char *buf = (const unsigned char *) buf_; - unsigned hash; - - assert (buf != NULL); - - hash = FNV_32_BASIS; - while (size-- > 0) - hash = (hash * FNV_32_PRIME) ^ *buf++; - - return hash; -} - -/* Fowler-Noll-Vo 32-bit hash, for strings. */ -unsigned -hsh_hash_string (const char *s_) -{ - const unsigned char *s = (const unsigned char *) s_; - unsigned hash; - - assert (s != NULL); - - hash = FNV_32_BASIS; - while (*s != '\0') - hash = (hash * FNV_32_PRIME) ^ *s++; - - return hash; -} - -/* Fowler-Noll-Vo 32-bit hash, for case-insensitive strings. */ -unsigned -hsh_hash_case_string (const char *s_) -{ - const unsigned char *s = (const unsigned char *) s_; - unsigned hash; - - assert (s != NULL); - - hash = FNV_32_BASIS; - while (*s != '\0') - hash = (hash * FNV_32_PRIME) ^ toupper (*s++); - - return hash; -} - -/* Hash for ints. */ -unsigned -hsh_hash_int (int i) -{ - return hsh_hash_bytes (&i, sizeof i); -} - -/* Hash for double. */ -unsigned -hsh_hash_double (double d) -{ - if (!isnan (d)) - return hsh_hash_bytes (&d, sizeof d); - else - return 0; -} /* Hash tables. */ diff --git a/src/libpspp/hash.h b/src/libpspp/hash.h index 59efbe5622..57fc2678d0 100644 --- a/src/libpspp/hash.h +++ b/src/libpspp/hash.h @@ -19,6 +19,7 @@ #include #include +#include typedef int hsh_compare_func (const void *, const void *, const void *aux); typedef unsigned hsh_hash_func (const void *, const void *aux); @@ -30,13 +31,6 @@ struct hsh_iterator size_t next; /* Index of next entry. */ }; -/* Hash functions. */ -unsigned hsh_hash_bytes (const void *, size_t); -unsigned hsh_hash_string (const char *); -unsigned hsh_hash_case_string (const char *); -unsigned hsh_hash_int (int); -unsigned hsh_hash_double (double); - /* Hash tables. */ struct hsh_table *hsh_create (int m, hsh_compare_func *, hsh_hash_func *, hsh_free_func *, diff --git a/src/libpspp/hmap.c b/src/libpspp/hmap.c new file mode 100644 index 0000000000..081d7cbba5 --- /dev/null +++ b/src/libpspp/hmap.c @@ -0,0 +1,186 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include + +static size_t capacity_to_mask (size_t capacity); + +/* Initializes MAP as a new hash map that is initially empty. */ +void +hmap_init (struct hmap *map) +{ + map->count = 0; + map->mask = 0; + map->buckets = &map->one; + map->one = NULL; +} + +/* Exchanges the contents of hash maps A and B. */ +void +hmap_swap (struct hmap *a, struct hmap *b) +{ + struct hmap tmp = *a; + *a = *b; + *b = tmp; + if (!a->mask) + a->buckets = &a->one; + if (!b->mask) + b->buckets = &b->one; +} + +/* Frees the memory, if any, allocated by hash map MAP. This has + no effect on the actual data items in MAP, if any, because the + client is responsible for allocating and freeing them. It + could, however, render them inaccessible if the only pointers + to them were from MAP itself, so in such a situation one + should iterate through the map and free the data items before + destroying it. */ +void +hmap_destroy (struct hmap *map) +{ + if (map != NULL && map->buckets != &map->one) + free (map->buckets); +} + +/* Reallocates MAP's hash buckets so that NEW_MASK becomes the + hash value bit-mask used to choose a hash bucket, then + rehashes any data elements in MAP into the new hash buckets. + + NEW_MASK must be a power of 2 minus 1 (including 0), that is, + its value in binary must be all 1-bits. */ +static void +hmap_rehash (struct hmap *map, size_t new_mask) +{ + struct hmap_node **new_buckets; + struct hmap_node *node, *next; + + assert ((new_mask & (new_mask + 1)) == 0); + if (new_mask) + new_buckets = calloc (new_mask + 1, sizeof *new_buckets); + else + { + new_buckets = &map->one; + new_buckets[0] = NULL; + } + + if (map->count > 0) + { + for (node = hmap_first (map); node != NULL; node = next) + { + size_t new_idx = node->hash & new_mask; + struct hmap_node **new_bucket = &new_buckets[new_idx]; + next = hmap_next (map, node); + node->next = *new_bucket; + *new_bucket = node; + } + } + if (map->buckets != &map->one) + free (map->buckets); + map->buckets = new_buckets; + map->mask = new_mask; +} + +/* Ensures that MAP has sufficient space to store at least + CAPACITY data elements, allocating a new set of buckets and + rehashing if necessary. */ +void +hmap_reserve (struct hmap *map, size_t capacity) +{ + if (capacity > hmap_capacity (map)) + hmap_rehash (map, capacity_to_mask (capacity)); +} + +/* Shrinks MAP's set of buckets to the minimum number needed to + store its current number of elements, allocating a new set of + buckets and rehashing if that would save space. */ +void +hmap_shrink (struct hmap *map) +{ + size_t new_mask = capacity_to_mask (map->count); + if (new_mask < map->mask) + hmap_rehash (map, new_mask); +} + +/* Moves NODE around in MAP to compensate for its hash value + having changed to NEW_HASH. + + This function does not verify that MAP does not already + contain a data item that duplicates NODE's new value. If + duplicates should be disallowed (which is the usual case), + then the client must check for duplicates before changing + NODE's value. */ +void +hmap_changed (struct hmap *map, struct hmap_node *node, size_t new_hash) +{ + if ((new_hash ^ node->hash) & map->mask) + { + hmap_delete (map, node); + hmap_insert_fast (map, node, new_hash); + } + else + node->hash = new_hash; +} + +/* Hash map nodes may be moved around in memory as necessary, + e.g. as the result of an realloc operation on a block that + contains a node. Once this is done, call this function + passing NODE that was moved, its former location in memory + OLD, and its hash map MAP before attempting any other + operation on MAP, NODE, or any other node in MAP. + + It is not safe to move more than one node, then to call this + function for each node. Instead, move a single node, call + this function, move another node, and so on. Alternatively, + remove all affected nodes from the hash map, move them, then + re-insert all of them. + + Assuming uniform hashing and no duplicate data items in MAP, + this function runs in constant time. */ +void +hmap_moved (struct hmap *map, + struct hmap_node *node, const struct hmap_node *old) +{ + struct hmap_node **p = &map->buckets[node->hash & map->mask]; + while (*p != old) + p = &(*p)->next; + *p = node; +} + +/* Returns the minimum-value mask required to allow for a hash + table capacity of at least CAPACITY. The return value will be + a bit-mask suitable for use as the "mask" member of struct + hmap, that is, a power of 2 minus 1 (including 0). */ +static size_t +capacity_to_mask (size_t capacity) +{ + /* Calculate the minimum mask necesary to support the given + capacity. */ + size_t mask = 0; + while (hmap_mask_to_capacity__ (mask) < capacity) + mask = (mask << 1) | 1; + + /* If the mask is nonzero, make it at least 3, because there is + little point in allocating an array of just 2 pointers. */ + mask |= (mask & 1) << 1; + + return mask; +} diff --git a/src/libpspp/hmap.h b/src/libpspp/hmap.h new file mode 100644 index 0000000000..e73d84fd15 --- /dev/null +++ b/src/libpspp/hmap.h @@ -0,0 +1,509 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +/* Hash table with separate chaining. + + This header (hmap.h) supplies an "embedded" implementation of + a hash table that uses linked lists to resolve collisions + ("separate chaining"). Its companion header (hmapx.h) + supplies a "external" implementation that is otherwise + similar. The two variants are described briefly here. The + embedded variant, for which this is the header, is described + in slightly more detail below. Each function also has a + detailed usage comment at its point of definition. (Many of + those definitions are inline in this file, because they are so + simple. Others are in hmap.c.) + + The "hmap" embedded hash table implementation puts the hash + table node (which includes the linked list used for resolving + collisions) within the data structure that the hash table + contains. This makes allocation efficient, in space and time, + because no additional call into an allocator is needed to + obtain memory for the hash table node. It also makes it easy + to find the hash table node associated with a given object. + However, it's difficult to include a given object in an + arbitrary number of hash tables. + + The "hmapx" external hash table implementation allocates hash + table nodes separately from the objects in the hash table. + Inserting and removing hash table elements requires dynamic + allocation, so it is normally slower and takes more memory + than the embedded implementation. It also requires searching + the table to find the node associated with a given object. + However, it's easy to include a given object in an arbitrary + number of hash tables. It's also possible to create an + external hash table without adding a member to the data + structure that the hash table contains. */ + +#ifndef LIBPSPP_HMAP_H +#define LIBPSPP_HMAP_H 1 + +/* Embedded hash table with separate chaining. + + To create an embedded hash table, declare an instance of + struct hmap, then initialize it with hmap_init(): + struct hmap map; + hmap_init (&map); + or, alternatively: + struct hmap map = HMAP_INITIALIZER (map); + + Each node in the hash table, presumably a structure type, must + include a struct hmap_node member. Here's an example: + struct foo + { + struct hmap_node node; // hmap_node member. + const char *string; // Another member. + }; + The hash table functions work with pointers to struct + hmap_node. To obtain a pointer to your structure type given a + pointer to struct hmap_node, use the HMAP_DATA macro. + + Inserting and deleting elements is straightforward. Use + hmap_insert() to insert an element and hmap_delete() to delete + an element, e.g.: + struct foo my_foo; + my_foo.string = "My string"; + hmap_insert (&map, &my_foo.node, hsh_hash_string (my_foo.string)); + ... + hmap_delete (&map, &my_foo.node); + You must pass the element's hash value as one of + hmap_insert()'s arguments. The hash table saves this hash + value for use later to speed searches and to rehash as the + hash table grows. + + hmap_insert() does not check whether the newly inserted + element duplicates an element already in the hash table. The + client is responsible for doing so, if this is desirable. + + The hash table does not provide a direct way to search for an + existing element. Instead, it provides the means to iterate + over all the elements in the hash table with a given hash + value. It is easy to compose a search function from such a + building block. For example: + const struct foo * + find_foo (const struct hmap *map, const char *name) + { + const struct foo *foo; + size_t hash; + + hash = hsh_hash_string (name); + HMAP_FOR_EACH_WITH_HASH (foo, struct foo, node, hash, map) + if (!strcmp (foo->name, name)) + break; + return foo; + } + + Here is how to iterate through the elements currently in the + hash table: + struct foo *foo; + HMAP_FOR_EACH (foo, struct foo, node, &map) + { + ...do something with foo... + } + */ + +#include + +/* Returns the data structure corresponding to the given NODE, + assuming that NODE is embedded as the given MEMBER name in + data type STRUCT. NODE must not be a null pointer. */ +#define HMAP_DATA(NODE, STRUCT, MEMBER) \ + ((STRUCT *) ((char *) (NODE) - offsetof (STRUCT, MEMBER))) + +/* Like HMAP_DATA, except that a null NODE yields a null pointer + result. */ +#define HMAP_NULLABLE_DATA(NODE, STRUCT, MEMBER) \ + hmap_nullable_data__ (NODE, offsetof (STRUCT, MEMBER)) + +/* Hash table node. */ +struct hmap_node + { + struct hmap_node *next; /* Next in chain. */ + size_t hash; /* Hash value. */ + }; + +static inline size_t hmap_node_hash (const struct hmap_node *); + +/* Hash table. */ +struct hmap + { + size_t count; /* Number of inserted nodes. */ + size_t mask; /* Number of buckets (power of 2), minus 1. */ + struct hmap_node **buckets; /* Array of buckets. */ + struct hmap_node *one; /* One bucket, to eliminate corner cases. */ + }; + +/* Suitable for use as the initializer for a struct hmap named + MAP. Typical usage: + struct hmap map = HMAP_INITIALIZER (map); + HMAP_INITIALIZER() is an alternative to hmap_init(). */ +#define HMAP_INITIALIZER(MAP) { 0, 0, &(MAP).one, NULL } + +/* Creation and destruction. */ +void hmap_init (struct hmap *); +void hmap_swap (struct hmap *, struct hmap *); +void hmap_destroy (struct hmap *); + +/* Storage management. */ +void hmap_reserve (struct hmap *, size_t capacity); +void hmap_shrink (struct hmap *); + +/* Search. Refer to the large comment near the top of this file + for an example.*/ +static inline struct hmap_node *hmap_first_with_hash (const struct hmap *, + size_t hash); +static inline struct hmap_node *hmap_next_with_hash (const struct hmap_node *); + +/* Insertion and deletion. */ +static inline void hmap_insert (struct hmap *, struct hmap_node *, + size_t hash); +static inline void hmap_insert_fast (struct hmap *, struct hmap_node *, + size_t hash); +static inline void hmap_delete (struct hmap *, struct hmap_node *); + +/* Iteration. */ +static inline struct hmap_node *hmap_first (const struct hmap *); +static inline struct hmap_node *hmap_next (const struct hmap *, + const struct hmap_node *); + +/* Counting. */ +static inline size_t hmap_count (const struct hmap *); +static inline size_t hmap_capacity (const struct hmap *); + +/* Updating data elements. */ +void hmap_changed (struct hmap *, struct hmap_node *, size_t new_hash); +void hmap_moved (struct hmap *, + struct hmap_node *, const struct hmap_node *old); + +/* Convenience macros for search. + + These macros automatically use HMAP_DATA to obtain the data + elements that encapsulate hmap nodes, which often saves typing + and can make code easier to read. Refer to the large comment + near the top of this file for an example. + + These macros evaluate HASH only once. They evaluate their + other arguments many times. */ +#define HMAP_FIRST_WITH_HASH(STRUCT, MEMBER, HMAP, HASH) \ + HMAP_NULLABLE_DATA (hmap_first_with_hash (HMAP, HASH), STRUCT, MEMBER) +#define HMAP_NEXT_WITH_HASH(DATA, STRUCT, MEMBER) \ + HMAP_NULLABLE_DATA (hmap_next_with_hash (&(DATA)->MEMBER), STRUCT, MEMBER) +#define HMAP_FOR_EACH_WITH_HASH(DATA, STRUCT, MEMBER, HASH, HMAP) \ + for ((DATA) = HMAP_FIRST_WITH_HASH (STRUCT, MEMBER, HMAP, HASH); \ + (DATA) != NULL; \ + (DATA) = HMAP_NEXT_WITH_HASH (DATA, STRUCT, MEMBER)) +#define HMAP_FOR_EACH_WITH_HASH_SAFE(DATA, NEXT, STRUCT, MEMBER, HASH, HMAP) \ + for ((DATA) = HMAP_FIRST_WITH_HASH (STRUCT, MEMBER, HMAP, HASH); \ + ((DATA) != NULL \ + ? ((NEXT) = HMAP_NEXT_WITH_HASH (DATA, STRUCT, MEMBER), 1) \ + : 0); \ + (DATA) = (NEXT)) + +/* Convenience macros for iteration. + + These macros automatically use HMAP_DATA to obtain the data + elements that encapsulate hmap nodes, which often saves typing + and can make code easier to read. Refer to the large comment + near the top of this file for an example. + + These macros evaluate their arguments many times. */ +#define HMAP_FIRST(STRUCT, MEMBER, HMAP) \ + HMAP_NULLABLE_DATA (hmap_first (HMAP), STRUCT, MEMBER) +#define HMAP_NEXT(DATA, STRUCT, MEMBER, HMAP) \ + HMAP_NULLABLE_DATA (hmap_next (HMAP, &(DATA)->MEMBER), STRUCT, MEMBER) +#define HMAP_FOR_EACH(DATA, STRUCT, MEMBER, HMAP) \ + for ((DATA) = HMAP_FIRST (STRUCT, MEMBER, HMAP); \ + (DATA) != NULL; \ + (DATA) = HMAP_NEXT (DATA, STRUCT, MEMBER, HMAP)) +#define HMAP_FOR_EACH_SAFE(DATA, NEXT, STRUCT, MEMBER, HMAP) \ + for ((DATA) = HMAP_FIRST (STRUCT, MEMBER, HMAP); \ + ((DATA) != NULL \ + ? ((NEXT) = HMAP_NEXT (DATA, STRUCT, MEMBER, HMAP), 1) \ + : 0); \ + (DATA) = (NEXT)) + +/* Inline definitions. */ + +static inline struct hmap_node *hmap_find_hash__ (struct hmap_node *, size_t); +static inline struct hmap_node *hmap_first_nonempty_bucket__ ( + const struct hmap *, size_t start); +static inline size_t hmap_mask_to_capacity__ (size_t mask); + +/* Returns the hash value associated with NODE. */ +size_t +hmap_node_hash (const struct hmap_node *node) +{ + return node->hash; +} + +/* Returns the first node in MAP that has hash value HASH, or a + null pointer if MAP does not contain any node with that hash + value. + + Assuming uniform hashing and no duplicate data items in MAP, + this function runs in constant time. (Amortized over an + iteration over all data items with a given HASH, its runtime + is proportional to the length of the hash chain for HASH, so + given a pathological hash function, e.g. one that returns a + constant value, its runtime degenerates to linear in the + length of NODE's hash chain.) + + Nodes are returned in arbitrary order that may change whenever + the hash table's current capacity changes, as reported by + hmap_capacity(). Calls to hmap_insert(), hmap_reserve(), and + hmap_shrink() can change the capacity of a hash map. + Inserting a node with hmap_insert_fast() or deleting one with + hmap_delete() will not change the relative ordering of nodes. + + The HMAP_FOR_EACH_WITH_HASH and HMAP_FOR_EACH_WITH_HASH_SAFE + macros provide convenient ways to iterate over all the nodes + with a given hash. The HMAP_FIRST_WITH_HASH macro is an + interface to this particular function that is often more + convenient. */ +static inline struct hmap_node * +hmap_first_with_hash (const struct hmap *map, size_t hash) +{ + return hmap_find_hash__ (map->buckets[hash & map->mask], hash); +} + +/* Returns the next node in MAP after NODE that has the same hash + value as NODE, or a null pointer if MAP does not contain any + more nodes with that hash value. + + Assuming uniform hashing and no duplicate data items in MAP, + this function runs in constant time. (Amortized over an + iteration over all data items with a given HASH, its runtime + is proportional to the length of the hash chain for HASH, so + given a pathological hash function, e.g. one that returns a + constant value, its runtime degenerates to linear in the + length of NODE's hash chain.) + + Nodes are returned in arbitrary order that may change whenever + the hash table's current capacity changes, as reported by + hmap_capacity(). Calls to hmap_insert(), hmap_reserve(), and + hmap_shrink() can change the capacity of a hash map. + Inserting a node with hmap_insert_fast() or deleting one with + hmap_delete() will not change the relative ordering of nodes. + + The HMAP_FOR_EACH_WITH_HASH and HMAP_FOR_EACH_WITH_HASH_SAFE + macros provide convenient ways to iterate over all the nodes + with a given hash. The HMAP_NEXT_WITH_HASH macro is an + interface to this particular function that is often more + convenient. */ +static inline struct hmap_node * +hmap_next_with_hash (const struct hmap_node *node) +{ + return hmap_find_hash__ (node->next, node->hash); +} + +/* Inserts NODE into MAP with hash value HASH. If the insertion + causes MAP's current capacity, as reported by hmap_capacity(), + to be exceeded, rehashes MAP with an increased number of hash + buckets. + + This function runs in constant time amortized over all the + insertions into MAP. + + This function does not verify that MAP does not already + contain a data item with the same value as NODE. If + duplicates should be disallowed (which is the usual case), + then the client must check for duplicates itself before + inserting the new node. */ +static inline void +hmap_insert (struct hmap *map, struct hmap_node *node, size_t hash) +{ + hmap_insert_fast (map, node, hash); + if (map->count > hmap_capacity (map)) + hmap_reserve (map, map->count); +} + +/* Inserts NODE into MAP with hash value HASH. Does not check + whether this causes MAP's current capacity to be exceeded. + The caller must take responsibility for that (or use + hmap_insert() instead). + + This function runs in constant time. + + This function does not verify that MAP does not already + contain a data item with the same value as NODE. If + duplicates should be disallowed (which is the usual case), + then the client must check for duplicates itself before + inserting the new node. */ +static inline void +hmap_insert_fast (struct hmap *map, struct hmap_node *node, size_t hash) +{ + struct hmap_node **bucket = &map->buckets[hash & map->mask]; + node->hash = hash; + node->next = *bucket; + *bucket = node; + map->count++; +} + +/* Removes NODE from MAP. The client is responsible for freeing + any data associated with NODE, if necessary. + + Assuming uniform hashing, this function runs in constant time. + (Its runtime is proportional to the position of NODE in its + hash chain, so given a pathological hash function, e.g. one + that returns a constant value, its runtime degenerates to + linear in the length of NODE's hash chain.) + + This function never reduces the number of buckets in MAP. + When one deletes a large number of nodes from a hash table, + calling hmap_shrink() afterward may therefore save a small + amount of memory. It is also more expensive to iterate + through a very sparse hash table than a denser one, so + shrinking the hash table could also save some time. However, + rehashing has an immediate cost that must be weighed against + these benefits. + + hmap_delete() does not change NODE's hash value reported by + hmap_node_hash(). */ +static inline void +hmap_delete (struct hmap *map, struct hmap_node *node) +{ + struct hmap_node **bucket = &map->buckets[node->hash & map->mask]; + while (*bucket != node) + bucket = &(*bucket)->next; + *bucket = (*bucket)->next; + map->count--; +} + +/* Returns the first node in MAP, or a null pointer if MAP is + empty. + + Amortized over iterating through every data element in MAP, + this function runs in constant time. However, this assumes + that MAP is not excessively sparse, that is, that + hmap_capacity(MAP) is at most a constant factor greater than + hmap_count(MAP). This will always be true unless many nodes + have been inserted into MAP and then most or all of them + deleted; in such a case, calling hmap_shrink() is advised. + + Nodes are returned in arbitrary order that may change whenever + the hash table's current capacity changes, as reported by + hmap_capacity(). Calls to hmap_insert(), hmap_reserve(), and + hmap_shrink() can change the capacity of a hash map. + Inserting a node with hmap_insert_fast() or deleting one with + hmap_delete() will not change the relative ordering of nodes. + + The HMAP_FOR_EACH and HMAP_FOR_EACH_SAFE macros provide + convenient ways to iterate over all the nodes in a hash map. + The HMAP_FIRST macro is an interface to this particular + function that is often more convenient. */ +static inline struct hmap_node * +hmap_first (const struct hmap *map) +{ + return hmap_first_nonempty_bucket__ (map, 0); +} + +/* Returns the next node in MAP following NODE, or a null pointer + if NODE is the last node in MAP. + + Amortized over iterating through every data element in MAP, + this function runs in constant time. However, this assumes + that MAP is not excessively sparse, that is, that + hmap_capacity(MAP) is at most a constant factor greater than + hmap_count(MAP). This will always be true unless many nodes + have been inserted into MAP and then most or all of them + deleted; in such a case, calling hmap_shrink() is advised. + + Nodes are returned in arbitrary order that may change whenever + the hash table's current capacity changes, as reported by + hmap_capacity(). Calls to hmap_insert(), hmap_reserve(), and + hmap_shrink() can change the capacity of a hash map. + Inserting a node with hmap_insert_fast() or deleting one with + hmap_delete() will not change the relative ordering of nodes. + + The HMAP_FOR_EACH and HMAP_FOR_EACH_SAFE macros provide + convenient ways to iterate over all the nodes in a hash map. + The HMAP_NEXT macro is an interface to this particular + function that is often more convenient. */ +static inline struct hmap_node * +hmap_next (const struct hmap *map, const struct hmap_node *node) +{ + return (node->next != NULL + ? node->next + : hmap_first_nonempty_bucket__ (map, (node->hash & map->mask) + 1)); +} + +/* Returns the number of data items currently in MAP. */ +static inline size_t +hmap_count (const struct hmap *map) +{ + return map->count; +} + +/* Returns the current capacity of MAP, that is, the maximum + number of data elements that MAP may hold before it becomes + advisable to rehash. + + The capacity is advisory only: it is possible to insert any + number of data elements into a hash map regardless of its + capacity. However, inserting many more elements than the + map's capacity will degrade search performance. */ +static inline size_t +hmap_capacity (const struct hmap *map) +{ + return hmap_mask_to_capacity__ (map->mask); +} + +/* Implementation details. */ + +/* Returns the first node at or after NODE in NODE's chain that + has hash value HASH. */ +static inline struct hmap_node * +hmap_find_hash__ (struct hmap_node *node, size_t hash) +{ + for (; node != NULL; node = node->next) + if (node->hash == hash) + break; + return node; +} + +/* Returns the first node in the lowest-numbered nonempty bucket + in MAP whose index is START or higher, or a null pointer if + all such buckets are empty. */ +static inline struct hmap_node * +hmap_first_nonempty_bucket__ (const struct hmap *map, size_t start) +{ + size_t i; + + for (i = start; i <= map->mask; i++) + if (map->buckets[i] != NULL) + return map->buckets[i]; + return NULL; +} + +/* Returns the hash table capacity associated with a given MASK, + which should be a value for the "mask" member of struct hmap. + MASK must be a power of 2 minus 1 (including 0), that is, its + value in binary must be all 1-bits. */ +static inline size_t +hmap_mask_to_capacity__ (size_t mask) +{ + return (mask + 1) * 2; +} + +/* Helper for HMAP_NULLABLE_DATA (to avoid evaluating its NODE + argument more than once). */ +static inline void * +hmap_nullable_data__ (struct hmap_node *node, size_t member_offset) +{ + return node != NULL ? (char *) node - member_offset : NULL; +} + +#endif /* libpspp/hmap.h */ diff --git a/src/libpspp/hmapx.c b/src/libpspp/hmapx.c new file mode 100644 index 0000000000..d732450841 --- /dev/null +++ b/src/libpspp/hmapx.c @@ -0,0 +1,99 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include +#include "xalloc.h" + +/* Frees the memory, if any, allocated by hash map MAP, including + all hmapx_nodes that it contains. The user-defined data items + that the hmapx_nodes point to are not affected. If those + items should be freed, then it should be done by iterating + through MAP's contents before destroying MAP. */ +void +hmapx_destroy (struct hmapx *map) +{ + if (map != NULL) + { + if (hmapx_count (map) > 0) + { + struct hmapx_node *node, *next; + for (node = hmapx_first (map); node != NULL; node = next) + { + next = hmapx_next (map, node); + free (node); + } + } + hmap_destroy (&map->hmap); + } +} + +/* Allocates and returns a new hmapx_node with DATA as its data + item. */ +static struct hmapx_node * +make_hmapx_node (void *data) +{ + struct hmapx_node *node = xmalloc (sizeof *node); + node->data = data; + return node; +} + +/* Inserts DATA into MAP with hash value HASH and returns the new + hmapx_node created to contain DATA. If the insertion causes + MAP's current capacity, as reported by hmapx_capacity(), to be + exceeded, rehashes MAP with an increased number of hash + buckets. + + This function runs in constant time amortized over all the + insertions into MAP. + + This function does not verify that MAP does not already + contain a data item with the same value as DATA. If + duplicates should be disallowed (which is the usual case), + then the client must check for duplicates itself before + inserting the new item. */ +struct hmapx_node * +hmapx_insert (struct hmapx *map, void *data, size_t hash) +{ + struct hmapx_node *node = make_hmapx_node (data); + hmap_insert (&map->hmap, &node->hmap_node, hash); + return node; +} + +/* Inserts DATA into MAP with hash value HASH and returns the new + hmapx_node created to contain DATA. Does not check whether + this causes MAP's current capacity to be exceeded. The caller + must take responsibility for that (or use hmapx_insert() + instead). + + This function runs in constant time. + + This function does not verify that MAP does not already + contain a data item with the same value as DATA. If + duplicates should be disallowed (which is the usual case), + then the client must check for duplicates itself before + inserting the new node. */ +struct hmapx_node * +hmapx_insert_fast (struct hmapx *map, void *data, size_t hash) +{ + struct hmapx_node *node = make_hmapx_node (data); + hmap_insert_fast (&map->hmap, &node->hmap_node, hash); + return node; +} diff --git a/src/libpspp/hmapx.h b/src/libpspp/hmapx.h new file mode 100644 index 0000000000..32a4452fe2 --- /dev/null +++ b/src/libpspp/hmapx.h @@ -0,0 +1,468 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +/* Hash table with separate chaining. + + This header (hmapx.h) supplies an "external" implementation of + a hash table that uses linked lists to resolve collisions + ("separate chaining"). Its companion header (hmap.h) supplies + a "embedded" implementation that is otherwise similar. The + two variants are described briefly here. The external + variant, for which this is the header, is described in + slightly more detail below. Each function also has a detailed + usage comment at its point of definition. (Many of those + definitions are inline in this file, because they are so + simple. Others are in hmapx.c.) + + The "hmap" embedded hash table implementation puts the hash + table node (which includes the linked list used for resolving + collisions) within the data structure that the hash table + contains. This makes allocation efficient, in space and time, + because no additional call into an allocator is needed to + obtain memory for the hash table node. It also makes it easy + to find the hash table node associated with a given object. + However, it's difficult to include a given object in an + arbitrary number of hash tables. + + The "hmapx" external hash table implementation allocates hash + table nodes separately from the objects in the hash table. + Inserting and removing hash table elements requires dynamic + allocation, so it is normally slower and takes more memory + than the embedded implementation. It also requires searching + the table to find the node associated with a given object. + However, it's easy to include a given object in an arbitrary + number of hash tables. It's also possible to create an + external hash table without adding a member to the data + structure that the hash table contains. */ + +#ifndef LIBPSPP_HMAPX_H +#define LIBPSPP_HMAPX_H 1 + +/* External hash table with separate chaining. + + To create an external hash table, declare an instance of + struct hmapx, then initialize it with hmapx_init(): + struct hmapx map; + hmapx_init (&map); + or, alternatively: + struct hmapx map = HMAPX_INITIALIZER (map); + + An hmapx data structure contains data represented as void *. + The hmapx_insert() function inserts such a datum and returns + the address of a newly created struct hmapx_node that + represents the new element: + struct foo { + const char *key; + const char *value; + }; + struct foo foo = {"key", "value"}; + struct hmapx_node *node; + node = hmapx_insert (&map, &foo, hsh_hash_string (foo.key)); + The element's hash value must be passed as one of + hmapx_insert()'s arguments. The hash table saves this hash + value for use later to speed searches and to rehash as the + hash table grows. + + hmapx_insert() does not check whether the newly inserted + element duplicates an element already in the hash table. The + client is responsible for doing so, if this is desirable. + + Use hmapx_delete() to delete an element from the hash table, + passing in its hmapx_node: + hmapx_delete (&map, node); + Deleting an element frees its node. + + The hash table does not provide a direct way to search for an + existing element. Instead, it provides the means to iterate + over all the elements in the hash table with a given hash + value. It is easy to compose a search function from such a + building block. For example: + struct hmapx_node * + find_node (const struct hmapx *map, const char *target) + { + struct hmapx_node *node; + struct foo *foo; + HMAPX_FOR_EACH_WITH_HASH (foo, node, hsh_hash_string (target), map) + if (!strcmp (foo->key, target)) + break; + return node; + } + This function's client can extract the data item from the + returned hmapx_node using the hmapx_node_data() function. The + hmapx_node can also be useful directly as an argument to other + hmapx functions, such as hmapx_delete(). + + Here is how to iterate through the elements currently in the + hash table: + struct hmapx_node *node; + const char *string; + HMAPX_FOR_EACH (data, node, &map) + { + ...do something with string... + } + */ + +#include +#include + +/* Hash table node. */ +struct hmapx_node + { + struct hmap_node hmap_node; /* Underlying hash node. */ + void *data; /* User data. */ + }; + +static inline void *hmapx_node_data (const struct hmapx_node *); +static inline size_t hmapx_node_hash (const struct hmapx_node *); + +/* Hash table. */ +struct hmapx + { + struct hmap hmap; + }; + +/* Suitable for use as the initializer for a struct hmapx named + MAP. Typical usage: + struct hmap map = HMAPX_INITIALIZER (map); + HMAPX_INITIALIZER() is an alternative to hmapx_init(). */ +#define HMAPX_INITIALIZER(MAP) { HMAP_INITIALIZER (MAP.hmap) } + +/* Creation and destruction. */ +static inline void hmapx_init (struct hmapx *); +static inline void hmapx_swap (struct hmapx *, struct hmapx *); +void hmapx_destroy (struct hmapx *); + +/* Storage management. */ +static inline void hmapx_reserve (struct hmapx *, size_t capacity); +static inline void hmapx_shrink (struct hmapx *); + +/* Search. */ +static inline struct hmapx_node *hmapx_first_with_hash (struct hmapx *, + size_t hash); +static inline struct hmapx_node *hmapx_next_with_hash (struct hmapx_node *); + +/* Insertion and deletion. */ +struct hmapx_node *hmapx_insert (struct hmapx *, void *, size_t hash); +struct hmapx_node *hmapx_insert_fast (struct hmapx *, void *, size_t hash); +static inline void hmapx_delete (struct hmapx *, struct hmapx_node *); + +/* Iteration. */ +static inline struct hmapx_node *hmapx_first (const struct hmapx *); +static inline struct hmapx_node *hmapx_next (const struct hmapx *, + const struct hmapx_node *); + +/* Counting. */ +static inline size_t hmapx_count (const struct hmapx *); +static inline size_t hmapx_capacity (const struct hmapx *); + +/* Updating data elements. */ +static inline void hmapx_change (struct hmapx *, + struct hmapx_node *, void *, size_t new_hash); +static inline void hmapx_changed (struct hmapx *, struct hmapx_node *, + size_t new_hash); +static inline void hmapx_move (struct hmapx_node *, void *); + +/* Convenience macros for search. + + These macros automatically use hmapx_node_data() to obtain the + data elements that encapsulate hmap nodes, which often saves + typing and can make code easier to read. Refer to the large + comment near the top of this file for an example. + + These macros evaluate HASH only once. They evaluate their + other arguments many times. */ +#define HMAPX_FOR_EACH_WITH_HASH(DATA, NODE, HASH, HMAPX) \ + for ((NODE) = hmapx_first_with_hash (HMAPX, HASH); \ + (NODE) != NULL ? ((DATA) = hmapx_node_data (NODE), 1) : 0; \ + (NODE) = hmapx_next_with_hash (NODE)) +#define HMAPX_FOR_EACH_WITH_HASH_SAFE(DATA, NODE, NEXT, HASH, HMAPX) \ + for ((NODE) = hmapx_first_with_hash (HMAPX, HASH); \ + ((NODE) != NULL \ + ? ((DATA) = hmapx_node_data (NODE), \ + (NEXT) = hmapx_next_with_hash (NODE), \ + 1) \ + : 0); \ + (NODE) = (NEXT)) + +/* Convenience macros for iteration. + + These macros automatically use hmapx_node_data() to obtain the + data elements that encapsulate hmap nodes, which often saves + typing and can make code easier to read. Refer to the large + comment near the top of this file for an example. + + These macros evaluate their arguments many times. */ +#define HMAPX_FOR_EACH(DATA, NODE, HMAPX) \ + for ((NODE) = hmapx_first (HMAPX); \ + (NODE) != NULL ? ((DATA) = hmapx_node_data (NODE), 1) : 0; \ + (NODE) = hmapx_next (HMAPX, NODE)) +#define HMAPX_FOR_EACH_SAFE(DATA, NODE, NEXT, HMAPX) \ + for ((NODE) = hmapx_first (HMAPX); \ + ((NODE) != NULL \ + ? ((DATA) = hmapx_node_data (NODE), \ + (NEXT) = hmapx_next (HMAPX, NODE), \ + 1) \ + : 0); \ + (NODE) = (NEXT)) + +/* Inline definitions. */ + +/* Returns the data stored in NODE. */ +static inline void * +hmapx_node_data (const struct hmapx_node *node) +{ + return node->data; +} + +/* Returns the hash value stored in NODE */ +static inline size_t +hmapx_node_hash (const struct hmapx_node *node) +{ + return hmap_node_hash (&node->hmap_node); +} + +/* Initializes MAP as a new hash map that is initially empty. */ +static inline void +hmapx_init (struct hmapx *map) +{ + hmap_init (&map->hmap); +} + +/* Exchanges the contents of hash maps A and B. */ +static inline void +hmapx_swap (struct hmapx *a, struct hmapx *b) +{ + hmap_swap (&a->hmap, &b->hmap); +} + +/* Ensures that MAP has sufficient space to store at least + CAPACITY data elements, allocating a new set of buckets and + rehashing if necessary. */ +static inline void +hmapx_reserve (struct hmapx *map, size_t capacity) +{ + hmap_reserve (&map->hmap, capacity); +} + +/* Shrinks MAP's set of buckets to the minimum number needed to + store its current number of elements, allocating a new set of + buckets and rehashing if that would save space. */ +static inline void +hmapx_shrink (struct hmapx *map) +{ + hmap_shrink (&map->hmap); +} + +/* Returns the first node in MAP that has hash value HASH, or a + null pointer if MAP does not contain any node with that hash + value. + + Assuming uniform hashing and no duplicate data items in MAP, + this function runs in constant time. (Amortized over an + iteration over all data items with a given HASH, its runtime + is proportional to the length of the hash chain for HASH, so + given a pathological hash function, e.g. one that returns a + constant value, its runtime degenerates to linear in the + length of NODE's hash chain.) + + Nodes are returned in arbitrary order that may change whenever + the hash table's current capacity changes, as reported by + hmapx_capacity(). Calls to hmapx_insert(), hmapx_reserve(), + and hmapx_shrink() can change the capacity of a hash map. + Inserting a node with hmapx_insert_fast() or deleting one with + hmapx_delete() will not change the relative ordering of nodes. + + The HMAPX_FOR_EACH_WITH_HASH and HMAPX_FOR_EACH_WITH_HASH_SAFE + macros provide convenient ways to iterate over all the nodes + with a given hash. */ +static inline struct hmapx_node * +hmapx_first_with_hash (struct hmapx *map, size_t hash) +{ + return HMAP_FIRST_WITH_HASH (struct hmapx_node, hmap_node, &map->hmap, hash); +} + +/* Returns the next node in MAP after NODE that has the same hash + value as NODE, or a null pointer if MAP does not contain any + more nodes with that hash value. + + Assuming uniform hashing and no duplicate data items in MAP, + this function runs in constant time. (Amortized over an + iteration over all data items with a given HASH, its runtime + is proportional to the length of the hash chain for HASH, so + given a pathological hash function, e.g. one that returns a + constant value, its runtime degenerates to linear in the + length of NODE's hash chain.) + + Nodes are returned in arbitrary order that may change whenever + the hash table's current capacity changes, as reported by + hmapx_capacity(). Calls to hmapx_insert(), hmapx_reserve(), + and hmapx_shrink() can change the capacity of a hash map. + Inserting a node with hmapx_insert_fast() or deleting one with + hmapx_delete() will not change the relative ordering of nodes. + + The HMAPX_FOR_EACH_WITH_HASH and HMAPX_FOR_EACH_WITH_HASH_SAFE + macros provide convenient ways to iterate over all the nodes + with a given hash. */ +static inline struct hmapx_node * +hmapx_next_with_hash (struct hmapx_node *node) +{ + return HMAP_NEXT_WITH_HASH (node, struct hmapx_node, hmap_node); +} + +/* Removes NODE from MAP and frees NODE. The client is + responsible for freeing the user data associated with NODE, if + appropriate. + + Assuming uniform hashing, this function runs in constant time. + (Its runtime is proportional to the position of NODE in its + hash chain, so given a pathological hash function, e.g. one + that returns a constant value, its runtime degenerates to + linear in the length of NODE's hash chain.) + + This function never reduces the number of buckets in MAP. + When one deletes a large number of nodes from a hash table, + calling hmapx_shrink() afterward may therefore save a small + amount of memory. It is also more expensive to iterate + through a very sparse hash table than a denser one, so + shrinking the hash table could also save some time. However, + rehashing has an immediate cost that must be weighed against + these benefits. + + hmapx_delete() does not change NODE's hash value reported by + hmapx_node_hash(). */ +static inline void +hmapx_delete (struct hmapx *map, struct hmapx_node *node) +{ + hmap_delete (&map->hmap, &node->hmap_node); + free (node); +} + +/* Returns the first node in MAP, or a null pointer if MAP is + empty. + + Amortized over iterating through every data element in MAP, + this function runs in constant time. However, this assumes + that MAP is not excessively sparse, that is, that + hmapx_capacity(MAP) is at most a constant factor greater than + hmapx_count(MAP). This will always be true unless many nodes + have been inserted into MAP and then most or all of them + deleted; in such a case, calling hmapx_shrink() is advised. + + Nodes are returned in arbitrary order that may change whenever + the hash table's current capacity changes, as reported by + hmapx_capacity(). Calls to hmapx_insert(), hmapx_reserve(), + and hmapx_shrink() can change the capacity of a hash map. + Inserting a node with hmapx_insert_fast() or deleting one with + hmapx_delete() will not change the relative ordering of nodes. + + The HMAPX_FOR_EACH and HMAPX_FOR_EACH_SAFE macros provide + convenient ways to iterate over all the nodes in a hash + map. */ +static inline struct hmapx_node * +hmapx_first (const struct hmapx *map) +{ + return HMAP_FIRST (struct hmapx_node, hmap_node, &map->hmap); +} + +/* Returns the next node in MAP following NODE, or a null pointer + if NODE is the last node in MAP. + + Amortized over iterating through every data element in MAP, + this function runs in constant time. However, this assumes + that MAP is not excessively sparse, that is, that + hmapx_capacity(MAP) is at most a constant factor greater than + hmapx_count(MAP). This will always be true unless many nodes + have been inserted into MAP and then most or all of them + deleted; in such a case, calling hmapx_shrink() is advised. + + Nodes are returned in arbitrary order that may change whenever + the hash table's current capacity changes, as reported by + hmapx_capacity(). Calls to hmapx_insert(), hmapx_reserve(), + and hmapx_shrink() can change the capacity of a hash map. + Inserting a node with hmapx_insert_fast() or deleting one with + hmapx_delete() will not change the relative ordering of nodes. + + The HMAPX_FOR_EACH and HMAPX_FOR_EACH_SAFE macros provide + convenient ways to iterate over all the nodes in a hash + map. */ +static inline struct hmapx_node * +hmapx_next (const struct hmapx *map, const struct hmapx_node *node) +{ + return HMAP_NEXT (node, struct hmapx_node, hmap_node, &map->hmap); +} + +/* Returns the number of data items currently in MAP. */ +static inline size_t +hmapx_count (const struct hmapx *map) +{ + return hmap_count (&map->hmap); +} + +/* Returns the current capacity of MAP, that is, the maximum + number of data elements that MAP may hold before it becomes + advisable to rehash. + + The capacity is advisory only: it is possible to insert any + number of data elements into a hash map regardless of its + capacity. However, inserting many more elements than the + map's capacity will degrade search performance. */ +static inline size_t +hmapx_capacity (const struct hmapx *map) +{ + return hmap_capacity (&map->hmap); +} + +/* Changes NODE's data to DATA and its hash value to NEW_HASH. + NODE must reside in MAP. + + This function does not verify that MAP does not already + contain a data item that duplicates DATA. If duplicates + should be disallowed (which is the usual case), then the + client must check for duplicates before changing NODE's + value. */ +static inline void +hmapx_change (struct hmapx *map, + struct hmapx_node *node, void *data, size_t new_hash) +{ + hmapx_move (node, data); + hmapx_changed (map, node, new_hash); +} + +/* Moves NODE around in MAP to compensate for its hash value + having changed to NEW_HASH. + + This function does not verify that MAP does not already + contain a data item that duplicates the new value of NODE's + data. If duplicates should be disallowed (which is the usual + case), then the client must check for duplicates before + changing NODE's value. */ +static inline void +hmapx_changed (struct hmapx *map, struct hmapx_node *node, size_t new_hash) +{ + hmap_changed (&map->hmap, &node->hmap_node, new_hash); +} + +/* Updates NODE to compensate for its data item having moved + around in memory to new location DATA. The data item's value + and hash value should not have changed. (If they have + changed, call hmapx_change() instead.) */ +static inline void +hmapx_move (struct hmapx_node *node, void *data) +{ + node->data = data; +} + +#endif /* libpspp/hmapx.h */ diff --git a/src/libpspp/misc.h b/src/libpspp/misc.h index e33d588fd8..3b02515709 100644 --- a/src/libpspp/misc.h +++ b/src/libpspp/misc.h @@ -14,8 +14,8 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . */ -#if !math_misc_h -#define math_misc_h 1 +#if !libpspp_misc_h +#define libpspp_misc_h 1 #include #include @@ -60,4 +60,21 @@ pow4 (double x) return y; } -#endif /* math/misc.h */ +/* Set *DEST to the lower of *DEST and SRC */ +static inline void +minimize (double *dest, double src) +{ + if (src < *dest) + *dest = src; +} + + +/* Set *DEST to the greater of *DEST and SRC */ +static inline void +maximize (double *dest, double src) +{ + if (src > *dest) + *dest = src; +} + +#endif /* libpspp/misc.h */ diff --git a/src/libpspp/model-checker.c b/src/libpspp/model-checker.c deleted file mode 100644 index cb51c485df..0000000000 --- a/src/libpspp/model-checker.c +++ /dev/null @@ -1,1466 +0,0 @@ -/* PSPP - a program for statistical analysis. - Copyright (C) 2007 Free Software Foundation, Inc. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . */ - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "error.h" -#include "minmax.h" -#include "xalloc.h" - -/* Initializes PATH as an empty path. */ -void -mc_path_init (struct mc_path *path) -{ - path->ops = NULL; - path->length = 0; - path->capacity = 0; -} - -/* Copies the contents of OLD into NEW. */ -void -mc_path_copy (struct mc_path *new, const struct mc_path *old) -{ - if (old->length > new->capacity) - { - new->capacity = old->length; - free (new->ops); - new->ops = xnmalloc (new->capacity, sizeof *new->ops); - } - new->length = old->length; - memcpy (new->ops, old->ops, old->length * sizeof *new->ops); -} - -/* Adds OP to the end of PATH. */ -void -mc_path_push (struct mc_path *path, int op) -{ - if (path->length >= path->capacity) - path->ops = xnrealloc (path->ops, ++path->capacity, sizeof *path->ops); - path->ops[path->length++] = op; -} - -/* Removes and returns the operation at the end of PATH. */ -int -mc_path_pop (struct mc_path *path) -{ - int back = mc_path_back (path); - path->length--; - return back; -} - -/* Returns the operation at the end of PATH. */ -int -mc_path_back (const struct mc_path *path) -{ - assert (path->length > 0); - return path->ops[path->length - 1]; -} - -/* Destroys PATH. */ -void -mc_path_destroy (struct mc_path *path) -{ - free (path->ops); - path->ops = NULL; -} - -/* Returns the operation in position INDEX in PATH. - INDEX must be less than the length of PATH. */ -int -mc_path_get_operation (const struct mc_path *path, size_t index) -{ - assert (index < path->length); - return path->ops[index]; -} - -/* Returns the number of operations in PATH. */ -size_t -mc_path_get_length (const struct mc_path *path) -{ - return path->length; -} - -/* Appends the operations in PATH to STRING, separating each one - with a single space. */ -void -mc_path_to_string (const struct mc_path *path, struct string *string) -{ - size_t i; - - for (i = 0; i < mc_path_get_length (path); i++) - { - if (i > 0) - ds_put_char (string, ' '); - ds_put_format (string, "%d", mc_path_get_operation (path, i)); - } -} - -/* Search options. */ -struct mc_options - { - /* Search strategy. */ - enum mc_strategy strategy; /* Type of strategy. */ - int max_depth; /* Limit on depth (or INT_MAX). */ - int hash_bits; /* Number of bits to hash (or 0). */ - unsigned int seed; /* Random seed for MC_RANDOM - or MC_DROP_RANDOM. */ - struct mc_path follow_path; /* Path for MC_PATH. */ - - /* Queue configuration. */ - int queue_limit; /* Maximum length of queue. */ - enum mc_queue_limit_strategy queue_limit_strategy; - /* How to choose state to drop - from queue. */ - - /* Stop conditions. */ - int max_unique_states; /* Maximum unique states to process. */ - int max_errors; /* Maximum errors to detect. */ - double time_limit; /* Maximum time in seconds. */ - - /* Output configuration. */ - int verbosity; /* 0=low, 1=normal, 2+=high. */ - int failure_verbosity; /* If greater than verbosity, - verbosity of error replays. */ - FILE *output_file; /* File to receive output. */ - - /* How to report intermediate progress. */ - int progress_usec; /* Microseconds between reports. */ - mc_progress_func *progress_func; /* Function to call on each report. */ - - /* Client data. */ - void *aux; - }; - -/* Default progress function. */ -static bool -default_progress (struct mc *mc) -{ - if (mc_results_get_stop_reason (mc_get_results (mc)) == MC_CONTINUING) - putc ('.', stderr); - else - putc ('\n', stderr); - return true; -} - -/* Do-nothing progress function. */ -static bool -null_progress (struct mc *mc UNUSED) -{ - return true; -} - -/* Creates and returns a set of options initialized to the - defaults. */ -struct mc_options * -mc_options_create (void) -{ - struct mc_options *options = xmalloc (sizeof *options); - - options->strategy = MC_BROAD; - options->max_depth = INT_MAX; - options->hash_bits = 20; - options->seed = 0; - mc_path_init (&options->follow_path); - - options->queue_limit = 10000; - options->queue_limit_strategy = MC_DROP_RANDOM; - - options->max_unique_states = INT_MAX; - options->max_errors = 1; - options->time_limit = 0.0; - - options->verbosity = 1; - options->failure_verbosity = 2; - options->output_file = stdout; - options->progress_usec = 250000; - options->progress_func = default_progress; - - options->aux = NULL; - - return options; -} - -/* Returns a copy of the given OPTIONS. */ -struct mc_options * -mc_options_clone (const struct mc_options *options) -{ - return xmemdup (options, sizeof *options); -} - -/* Destroys OPTIONS. */ -void -mc_options_destroy (struct mc_options *options) -{ - mc_path_destroy (&options->follow_path); - free (options); -} - -/* Returns the search strategy used for OPTIONS. The choices - are: - - - MC_BROAD (the default): Breadth-first search. First tries - all the operations with depth 1, then those with depth 2, - then those with depth 3, and so on. - - This search algorithm finds the least number of operations - needed to trigger a given bug. - - - MC_DEEP: Depth-first search. Searches downward in the tree - of states as fast as possible. Good for finding bugs that - require long sequences of operations to trigger. - - - MC_RANDOM: Random-first search. Searches through the tree - of states in random order. The standard C library's rand - function selects the search path; you can control the seed - passed to srand using mc_options_set_seed. - - - MC_PATH: Explicit path. Applies an explicitly specified - sequence of operations. */ -enum mc_strategy -mc_options_get_strategy (const struct mc_options *options) -{ - return options->strategy; -} - -/* Sets the search strategy used for OPTIONS to STRATEGY. - - This function cannot be used to set MC_PATH as the search - strategy. Use mc_options_set_follow_path instead. */ -void -mc_options_set_strategy (struct mc_options *options, enum mc_strategy strategy) -{ - assert (strategy == MC_BROAD - || strategy == MC_DEEP - || strategy == MC_RANDOM); - options->strategy = strategy; -} - -/* Returns OPTION's random seed used by MC_RANDOM and - MC_DROP_RANDOM. */ -unsigned int -mc_options_get_seed (const struct mc_options *options) -{ - return options->seed; -} - -/* Set OPTION's random seed used by MC_RANDOM and MC_DROP_RANDOM - to SEED. */ -void -mc_options_set_seed (struct mc_options *options, unsigned int seed) -{ - options->seed = seed; -} - -/* Returns the maximum depth to which OPTIONS's search will - descend. The initial states are at depth 1, states produced - as their mutations are at depth 2, and so on. */ -int -mc_options_get_max_depth (const struct mc_options *options) -{ - return options->max_depth; -} - -/* Sets the maximum depth to which OPTIONS's search will descend - to MAX_DEPTH. The initial states are at depth 1, states - produced as their mutations are at depth 2, and so on. */ -void -mc_options_set_max_depth (struct mc_options *options, int max_depth) -{ - options->max_depth = max_depth; -} - -/* Returns the base-2 log of the number of bits in OPTIONS's hash - table. The hash table is used for dropping states that are - probably duplicates: any state with a given hash value, as - will only be processed once. A return value of 0 indicates - that the model checker will not discard duplicate states based - on their hashes. - - The hash table is a power of 2 bits long, by default 2**20 - bits (128 kB). Depending on how many states you expect the - model checker to check, how much memory you're willing to let - the hash table take up, and how worried you are about missing - states due to hash collisions, you could make it larger or - smaller. - - The "birthday paradox" points to a reasonable way to size your - hash table. If you expect the model checker to check about - 2**N states, then, assuming a perfect hash, you need a hash - table of 2**(N+1) bits to have a 50% chance of seeing a hash - collision, 2**(N+2) bits to have a 25% chance, and so on. */ -int -mc_options_get_hash_bits (const struct mc_options *options) -{ - return options->hash_bits; -} - -/* Sets the base-2 log of the number of bits in OPTIONS's hash - table to HASH_BITS. A HASH_BITS value of 0 requests that the - model checker not discard duplicate states based on their - hashes. (This causes the model checker to never terminate in - many cases.) */ -void -mc_options_set_hash_bits (struct mc_options *options, int hash_bits) -{ - assert (hash_bits >= 0); - options->hash_bits = MIN (hash_bits, CHAR_BIT * sizeof (unsigned int) - 1); -} - -/* Returns the path set in OPTIONS by mc_options_set_follow_path. - May be used only if the search strategy is MC_PATH. */ -const struct mc_path * -mc_options_get_follow_path (const struct mc_options *options) -{ - assert (options->strategy == MC_PATH); - return &options->follow_path; -} - -/* Sets, in OPTIONS, the search algorithm to MC_PATH and the path - to be the explicit path specified in FOLLOW_PATH. */ -void -mc_options_set_follow_path (struct mc_options *options, - const struct mc_path *follow_path) -{ - assert (mc_path_get_length (follow_path) > 0); - options->strategy = MC_PATH; - mc_path_copy (&options->follow_path, follow_path); -} - -/* Returns the maximum number of queued states in OPTIONS. The - default value is 10,000. The primary reason to limit the - number of queued states is to conserve memory, so if you can - afford the memory and your model needs more room in the queue, - you can raise the limit. Conversely, if your models are large - or memory is constrained, you can reduce the limit. - - Following the execution of the model checker, you can find out - the maximum queue length during the run by calling - mc_results_get_max_queue_length. */ -int -mc_options_get_queue_limit (const struct mc_options *options) -{ - return options->queue_limit; -} - -/* Sets the maximum number of queued states in OPTIONS to - QUEUE_LIMIT. */ -void -mc_options_set_queue_limit (struct mc_options *options, int queue_limit) -{ - assert (queue_limit > 0); - options->queue_limit = queue_limit; -} - -/* Returns the queue limit strategy used by OPTIONS, that is, - when a new state must be inserted into a full state queue is - full, how the state to be dropped is chosen. The choices are: - - - MC_DROP_NEWEST: Drop the newest state; that is, do not - insert the new state into the queue at all. - - - MC_DROP_OLDEST: Drop the state that has been enqueued for - the longest. - - - MC_DROP_RANDOM (the default): Drop a randomly selected state - from the queue. The standard C library's rand function - selects the state to drop; you can control the seed passed - to srand using mc_options_set_seed. */ -enum mc_queue_limit_strategy -mc_options_get_queue_limit_strategy (const struct mc_options *options) -{ - return options->queue_limit_strategy; -} - -/* Sets the queue limit strategy used by OPTIONS to STRATEGY. - - This setting has no effect unless the model being checked - causes the state queue to overflow (see - mc_options_get_queue_limit). */ -void -mc_options_set_queue_limit_strategy (struct mc_options *options, - enum mc_queue_limit_strategy strategy) -{ - assert (strategy == MC_DROP_NEWEST - || strategy == MC_DROP_OLDEST - || strategy == MC_DROP_RANDOM); - options->queue_limit_strategy = strategy; -} - -/* Returns OPTIONS's maximum number of unique states that the - model checker will examine before terminating. The default is - INT_MAX. */ -int -mc_options_get_max_unique_states (const struct mc_options *options) -{ - return options->max_unique_states; -} - -/* Sets OPTIONS's maximum number of unique states that the model - checker will examine before terminating to - MAX_UNIQUE_STATE. */ -void -mc_options_set_max_unique_states (struct mc_options *options, - int max_unique_states) -{ - options->max_unique_states = max_unique_states; -} - -/* Returns the maximum number of errors that OPTIONS will allow - the model checker to encounter before terminating. The - default is 1. */ -int -mc_options_get_max_errors (const struct mc_options *options) -{ - return options->max_errors; -} - -/* Sets the maximum number of errors that OPTIONS will allow the - model checker to encounter before terminating to - MAX_ERRORS. */ -void -mc_options_set_max_errors (struct mc_options *options, int max_errors) -{ - options->max_errors = max_errors; -} - -/* Returns the maximum amount of time, in seconds, that OPTIONS will allow the - model checker to consume before terminating. The - default of 0.0 means that time consumption is unlimited. */ -double -mc_options_get_time_limit (const struct mc_options *options) -{ - return options->time_limit; -} - -/* Sets the maximum amount of time, in seconds, that OPTIONS will - allow the model checker to consume before terminating to - TIME_LIMIT. A value of 0.0 means that time consumption is - unlimited; otherwise, the return value will be positive. */ -void -mc_options_set_time_limit (struct mc_options *options, double time_limit) -{ - assert (time_limit >= 0.0); - options->time_limit = time_limit; -} - -/* Returns the level of verbosity for output messages specified - by OPTIONS. The default verbosity level is 1. - - A verbosity level of 0 inhibits all messages except for - errors; a verbosity level of 1 also allows warnings; a - verbosity level of 2 also causes a description of each state - added to be output; a verbosity level of 3 also causes a - description of each duplicate state to be output. Verbosity - levels less than 0 or greater than 3 are allowed but currently - have no additional effect. */ -int -mc_options_get_verbosity (const struct mc_options *options) -{ - return options->verbosity; -} - -/* Sets the level of verbosity for output messages specified - by OPTIONS to VERBOSITY. */ -void -mc_options_set_verbosity (struct mc_options *options, int verbosity) -{ - options->verbosity = verbosity; -} - -/* Returns the level of verbosity for failures specified by - OPTIONS. The default failure verbosity level is 2. - - The failure verbosity level has an effect only when an error - is reported, and only when the failure verbosity level is - higher than the regular verbosity level. When this is the - case, the model checker replays the error path at the higher - verbosity level specified. This has the effect of outputting - an explicit, human-readable description of the sequence of - operations that caused the error. */ -int -mc_options_get_failure_verbosity (const struct mc_options *options) -{ - return options->failure_verbosity; -} - -/* Sets the level of verbosity for failures specified by OPTIONS - to FAILURE_VERBOSITY. */ -void -mc_options_set_failure_verbosity (struct mc_options *options, - int failure_verbosity) -{ - options->failure_verbosity = failure_verbosity; -} - -/* Returns the output file used for messages printed by the model - checker specified by OPTIONS. The default is stdout. */ -FILE * -mc_options_get_output_file (const struct mc_options *options) -{ - return options->output_file; -} - -/* Sets the output file used for messages printed by the model - checker specified by OPTIONS to OUTPUT_FILE. - - The model checker does not automatically close the specified - output file. If this is desired, the model checker's client - must do so. */ -void -mc_options_set_output_file (struct mc_options *options, - FILE *output_file) -{ - options->output_file = output_file; -} - -/* Returns the number of microseconds between calls to the - progress function specified by OPTIONS. The default is - 250,000 (1/4 second). A value of 0 disables progress - reporting. */ -int -mc_options_get_progress_usec (const struct mc_options *options) -{ - return options->progress_usec; -} - -/* Sets the number of microseconds between calls to the progress - function specified by OPTIONS to PROGRESS_USEC. A value of 0 - disables progress reporting. */ -void -mc_options_set_progress_usec (struct mc_options *options, int progress_usec) -{ - assert (progress_usec >= 0); - options->progress_usec = progress_usec; -} - -/* Returns the function called to report progress specified by - OPTIONS. The function used by default prints '.' to - stderr. */ -mc_progress_func * -mc_options_get_progress_func (const struct mc_options *options) -{ - return options->progress_func; -} - -/* Sets the function called to report progress specified by - OPTIONS to PROGRESS_FUNC. A non-null function must be - specified; to disable progress reporting, set the progress - reporting interval to 0. - - PROGRESS_FUNC will be called zero or more times while the - model checker's run is ongoing. For these calls to the - progress function, mc_results_get_stop_reason will return - MC_CONTINUING. It will also be called exactly once soon - before mc_run returns, in which case - mc_results_get_stop_reason will return a different value. */ -void -mc_options_set_progress_func (struct mc_options *options, - mc_progress_func *progress_func) -{ - assert (options->progress_func != NULL); - options->progress_func = progress_func; -} - -/* Returns the auxiliary data set in OPTIONS by the client. The - default is a null pointer. - - This auxiliary data value can be retrieved by the - client-specified functions in struct mc_class during a model - checking run using mc_get_aux. */ -void * -mc_options_get_aux (const struct mc_options *options) -{ - return options->aux; -} - -/* Sets the auxiliary data in OPTIONS to AUX. */ -void -mc_options_set_aux (struct mc_options *options, void *aux) -{ - options->aux = aux; -} - -/* Results of a model checking run. */ -struct mc_results - { - /* Overall results. */ - enum mc_stop_reason stop_reason; /* Why the run ended. */ - int unique_state_count; /* Number of unique states checked. */ - int error_count; /* Number of errors found. */ - - /* Depth statistics. */ - int max_depth_reached; /* Max depth state examined. */ - struct moments1 *depth_moments; /* Enables reporting mean depth. */ - - /* If error_count > 0, path to the last error reported. */ - struct mc_path error_path; - - /* States dropped... */ - int duplicate_dropped_states; /* ...as duplicates. */ - int off_path_dropped_states; /* ...as off-path (MC_PATH only). */ - int depth_dropped_states; /* ...due to excessive depth. */ - int queue_dropped_states; /* ...due to queue overflow. */ - - /* Queue statistics. */ - int queued_unprocessed_states; /* Enqueued but never dequeued. */ - int max_queue_length; /* Maximum queue length observed. */ - - /* Timing. */ - struct timeval start; /* Start of model checking run. */ - struct timeval end; /* End of model checking run. */ - }; - -/* Creates, initializes, and returns a new set of results. */ -static struct mc_results * -mc_results_create (void) -{ - struct mc_results *results = xcalloc (1, sizeof (struct mc_results)); - results->stop_reason = MC_CONTINUING; - results->depth_moments = moments1_create (MOMENT_MEAN); - gettimeofday (&results->start, NULL); - return results; -} - -/* Destroys RESULTS. */ -void -mc_results_destroy (struct mc_results *results) -{ - if (results != NULL) - { - moments1_destroy (results->depth_moments); - mc_path_destroy (&results->error_path); - free (results); - } -} - -/* Returns RESULTS's reason that the model checking run - terminated. The possible reasons are: - - - MC_CONTINUING: The run is not actually yet complete. This - can only be returned before mc_run has returned, e.g. when - the progress function set by mc_options_set_progress_func - examines the run's results. - - - MC_SUCCESS: The run completed because the queue emptied. - The entire state space might not have been explored due to a - requested limit on maximum depth, hash collisions, etc. - - - MC_MAX_UNIQUE_STATES: The run completed because as many - unique states have been checked as were requested (using - mc_options_set_max_unique_states). - - - MC_MAX_ERROR_COUNT: The run completed because the maximum - requested number of errors (by default, 1 error) was - reached. - - - MC_END_OF_PATH: The run completed because the path specified - with mc_options_set_follow_path was fully traversed. - - - MC_TIMEOUT: The run completed because the time limit set - with mc_options_set_time_limit was exceeded. - - - MC_INTERRUPTED: The run completed because SIGINT was caught - (typically, due to the user typing Ctrl+C). */ -enum mc_stop_reason -mc_results_get_stop_reason (const struct mc_results *results) -{ - return results->stop_reason; -} - -/* Returns the number of unique states checked specified by - RESULTS. */ -int -mc_results_get_unique_state_count (const struct mc_results *results) -{ - return results->unique_state_count; -} - -/* Returns the number of errors found specified by RESULTS. */ -int -mc_results_get_error_count (const struct mc_results *results) -{ - return results->error_count; -} - -/* Returns the maximum depth reached during the model checker run - represented by RESULTS. The initial states are at depth 1, - their child states at depth 2, and so on. */ -int -mc_results_get_max_depth_reached (const struct mc_results *results) -{ - return results->max_depth_reached; -} - -/* Returns the mean depth reached during the model checker run - represented by RESULTS. */ -double -mc_results_get_mean_depth_reached (const struct mc_results *results) -{ - double mean; - moments1_calculate (results->depth_moments, NULL, &mean, NULL, NULL, NULL); - return mean != SYSMIS ? mean : 0.0; -} - -/* Returns the path traversed to obtain the last error - encountered during the model checker run represented by - RESULTS. Returns a null pointer if the run did not report any - errors. */ -const struct mc_path * -mc_results_get_error_path (const struct mc_results *results) -{ - return results->error_count > 0 ? &results->error_path : NULL; -} - -/* Returns the number of states dropped as duplicates (based on - hash value) during the model checker run represented by - RESULTS. */ -int -mc_results_get_duplicate_dropped_states (const struct mc_results *results) -{ - return results->duplicate_dropped_states; -} - -/* Returns the number of states dropped because they were off the - path specified by mc_options_set_follow_path during the model - checker run represented by RESULTS. A nonzero value here - indicates a missing call to mc_include_state in the - client-supplied mutation function. */ -int -mc_results_get_off_path_dropped_states (const struct mc_results *results) -{ - return results->off_path_dropped_states; -} - -/* Returns the number of states dropped because their depth - exceeded the maximum specified with mc_options_set_max_depth - during the model checker run represented by RESULTS. */ -int -mc_results_get_depth_dropped_states (const struct mc_results *results) -{ - return results->depth_dropped_states; -} - -/* Returns the number of states dropped from the queue due to - queue overflow during the model checker run represented by - RESULTS. */ -int -mc_results_get_queue_dropped_states (const struct mc_results *results) -{ - return results->queue_dropped_states; -} - -/* Returns the number of states that were checked and enqueued - but never dequeued and processed during the model checker run - represented by RESULTS. This is zero if the stop reason is - MC_CONTINUING or MC_SUCCESS; otherwise, it is the number of - states in the queue at the time that the checking run - stopped. */ -int -mc_results_get_queued_unprocessed_states (const struct mc_results *results) -{ - return results->queued_unprocessed_states; -} - -/* Returns the maximum length of the queue during the model - checker run represented by RESULTS. If this is equal to the - maximum queue length, then the queue (probably) overflowed - during the run; otherwise, it did not overflow. */ -int -mc_results_get_max_queue_length (const struct mc_results *results) -{ - return results->max_queue_length; -} - -/* Returns the time at which the model checker run represented by - RESULTS started. */ -struct timeval -mc_results_get_start (const struct mc_results *results) -{ - return results->start; -} - -/* Returns the time at which the model checker run represented by - RESULTS ended. (This function may not be called while the run - is still ongoing.) */ -struct timeval -mc_results_get_end (const struct mc_results *results) -{ - assert (results->stop_reason != MC_CONTINUING); - return results->end; -} - -/* Returns the number of seconds obtained by subtracting time Y - from time X. */ -static double -timeval_subtract (struct timeval x, struct timeval y) -{ - /* From libc.info. */ - double difference; - - /* Perform the carry for the later subtraction by updating Y. */ - if (x.tv_usec < y.tv_usec) { - int nsec = (y.tv_usec - x.tv_usec) / 1000000 + 1; - y.tv_usec -= 1000000 * nsec; - y.tv_sec += nsec; - } - if (x.tv_usec - y.tv_usec > 1000000) { - int nsec = (x.tv_usec - y.tv_usec) / 1000000; - y.tv_usec += 1000000 * nsec; - y.tv_sec -= nsec; - } - - /* Compute the time remaining to wait. - `tv_usec' is certainly positive. */ - difference = (x.tv_sec - y.tv_sec) + (x.tv_usec - y.tv_usec) / 1000000.0; - if (x.tv_sec < y.tv_sec) - difference = -difference; - return difference; -} - - -/* Returns the duration, in seconds, of the model checker run - represented by RESULTS. (This function may not be called - while the run is still ongoing.) */ -double -mc_results_get_duration (const struct mc_results *results) -{ - assert (results->stop_reason != MC_CONTINUING); - return timeval_subtract (results->end, results->start); -} - -/* An active model checking run. */ -struct mc - { - /* Related data structures. */ - const struct mc_class *class; - struct mc_options *options; - struct mc_results *results; - - /* Array of 2**(options->hash_bits) bits representing states - already visited. */ - unsigned char *hash; - - /* State queue. */ - struct mc_state **queue; /* Array of pointers to states. */ - struct deque queue_deque; /* Deque. */ - - /* State currently being built by "init" or "mutate". */ - struct mc_path path; /* Path to current state. */ - struct string path_string; /* Buffer for path_string function. */ - bool state_named; /* mc_name_operation called? */ - bool state_error; /* mc_error called? */ - - /* Statistics for calling the progress function. */ - unsigned int progress; /* Current progress value. */ - unsigned int next_progress; /* Next value to call progress func. */ - unsigned int prev_progress; /* Last value progress func called. */ - struct timeval prev_progress_time; /* Last time progress func called. */ - - /* Information for handling and restoring SIGINT. */ - bool interrupted; /* SIGINT received? */ - bool *saved_interrupted_ptr; /* Saved value of interrupted_ptr. */ - void (*saved_sigint) (int); /* Saved SIGINT handler. */ - }; - -/* A state in the queue. */ -struct mc_state - { - struct mc_path path; /* Path to this state. */ - void *data; /* Client-supplied data. */ - }; - -/* Points to the current struct mc's "interrupted" member. */ -static bool *interrupted_ptr = NULL; - -static const char *path_string (struct mc *); -static void free_state (const struct mc *, struct mc_state *); -static void stop (struct mc *, enum mc_stop_reason); -static struct mc_state *make_state (const struct mc *, void *); -static size_t random_queue_index (struct mc *); -static void enqueue_state (struct mc *, struct mc_state *); -static void do_error_state (struct mc *); -static void next_operation (struct mc *); -static bool is_off_path (const struct mc *); -static void sigint_handler (int signum); -static void init_mc (struct mc *, - const struct mc_class *, struct mc_options *); -static void finish_mc (struct mc *); - -/* Runs the model checker on the client-specified CLASS with the - client-specified OPTIONS. OPTIONS may be a null pointer if - the defaults are acceptable. Destroys OPTIONS; use - mc_options_clone if a copy is needed. - - Returns the results of the model checking run, which must be - destroyed by the client with mc_results_destroy. - - To pass auxiliary data to the functions in CLASS, use - mc_options_set_aux on OPTIONS, which may be retrieved from the - CLASS functions using mc_get_aux. */ -struct mc_results * -mc_run (const struct mc_class *class, struct mc_options *options) -{ - struct mc mc; - - init_mc (&mc, class, options); - while (!deque_is_empty (&mc.queue_deque) - && mc.results->stop_reason == MC_CONTINUING) - { - struct mc_state *state = mc.queue[deque_pop_front (&mc.queue_deque)]; - mc_path_copy (&mc.path, &state->path); - mc_path_push (&mc.path, 0); - class->mutate (&mc, state->data); - free_state (&mc, state); - if (mc.interrupted) - stop (&mc, MC_INTERRUPTED); - } - finish_mc (&mc); - - return mc.results; -} - -/* Tests whether the current operation is one that should be - performed, checked, and enqueued. If so, returns true. - Otherwise, returns false and, unless checking is stopped, - advances to the next state. The caller should then advance - to the next operation. - - This function should be called from the client-provided - "mutate" function, according to the pattern explained in the - big comment at the top of model-checker.h. */ -bool -mc_include_state (struct mc *mc) -{ - if (mc->results->stop_reason != MC_CONTINUING) - return false; - else if (is_off_path (mc)) - { - next_operation (mc); - return false; - } - else - return true; -} - -/* Tests whether HASH represents a state that has (probably) - already been enqueued. If not, returns false and marks HASH - so that it will be treated as a duplicate in the future. If - so, returns true and advances to the next state. The - caller should then advance to the next operation. - - This function should be called from the client-provided - "mutate" function, according to the pattern explained in the - big comment at the top of model-checker.h. */ -bool -mc_discard_dup_state (struct mc *mc, unsigned int hash) -{ - if (mc->options->hash_bits > 0) - { - hash &= (1u << mc->options->hash_bits) - 1; - if (TEST_BIT (mc->hash, hash)) - { - if (mc->options->verbosity > 2) - fprintf (mc->options->output_file, - " [%s] discard duplicate state\n", path_string (mc)); - mc->results->duplicate_dropped_states++; - next_operation (mc); - return true; - } - SET_BIT (mc->hash, hash); - } - return false; -} - -/* Names the current state NAME, which may contain - printf-style format specifications. NAME should be a - human-readable name for the current operation. - - This function should be called from the client-provided - "mutate" function, according to the pattern explained in the - big comment at the top of model-checker.h. */ -void -mc_name_operation (struct mc *mc, const char *name, ...) -{ - va_list args; - - va_start (args, name); - mc_vname_operation (mc, name, args); - va_end (args); -} - -/* Names the current state NAME, which may contain - printf-style format specifications, for which the - corresponding arguments must be given in ARGS. NAME should be - a human-readable name for the current operation. - - This function should be called from the client-provided - "mutate" function, according to the pattern explained in the - big comment at the top of model-checker.h. */ -void -mc_vname_operation (struct mc *mc, const char *name, va_list args) -{ - if (mc->state_named && mc->options->verbosity > 0) - fprintf (mc->options->output_file, " [%s] warning: duplicate call " - "to mc_name_operation (missing call to mc_add_state?)\n", - path_string (mc)); - mc->state_named = true; - - if (mc->options->verbosity > 1) - { - fprintf (mc->options->output_file, " [%s] ", path_string (mc)); - vfprintf (mc->options->output_file, name, args); - putc ('\n', mc->options->output_file); - } -} - -/* Reports the given error MESSAGE for the current operation. - The resulting state should still be passed to mc_add_state - when all relevant error messages have been issued. The state - will not, however, be enqueued for later mutation of its own. - - By default, model checking stops after the first error - encountered. - - This function should be called from the client-provided - "mutate" function, according to the pattern explained in the - big comment at the top of model-checker.h. */ -void -mc_error (struct mc *mc, const char *message, ...) -{ - va_list args; - - if (mc->results->stop_reason != MC_CONTINUING) - return; - - if (mc->options->verbosity > 1) - fputs (" ", mc->options->output_file); - fprintf (mc->options->output_file, "[%s] error: ", - path_string (mc)); - va_start (args, message); - vfprintf (mc->options->output_file, message, args); - va_end (args); - putc ('\n', mc->options->output_file); - - mc->state_error = true; -} - -/* Enqueues DATA as the state corresponding to the current - operation. The operation should have been named with a call - to mc_name_operation, and it should have been checked by the - caller (who should have reported any errors with mc_error). - - This function should be called from the client-provided - "mutate" function, according to the pattern explained in the - big comment at the top of model-checker.h. */ -void -mc_add_state (struct mc *mc, void *data) -{ - if (!mc->state_named && mc->options->verbosity > 0) - fprintf (mc->options->output_file, " [%s] warning: unnamed state\n", - path_string (mc)); - - if (mc->results->stop_reason != MC_CONTINUING) - { - /* Nothing to do. */ - } - else if (mc->state_error) - do_error_state (mc); - else if (is_off_path (mc)) - mc->results->off_path_dropped_states++; - else if (mc->path.length + 1 > mc->options->max_depth) - mc->results->depth_dropped_states++; - else - { - /* This is the common case. */ - mc->results->unique_state_count++; - if (mc->results->unique_state_count >= mc->options->max_unique_states) - stop (mc, MC_MAX_UNIQUE_STATES); - enqueue_state (mc, make_state (mc, data)); - next_operation (mc); - return; - } - - mc->class->destroy (mc, data); - next_operation (mc); -} - -/* Returns the options that were passed to mc_run for model - checker MC. */ -const struct mc_options * -mc_get_options (const struct mc *mc) -{ - return mc->options; -} - -/* Returns the current state of the results for model checker - MC. This function is appropriate for use from the progress - function set by mc_options_set_progress_func. - - Not all of the results are meaningful before model checking - completes. */ -const struct mc_results * -mc_get_results (const struct mc *mc) -{ - return mc->results; -} - -/* Returns the auxiliary data set on the options passed to mc_run - with mc_options_set_aux. */ -void * -mc_get_aux (const struct mc *mc) -{ - return mc_options_get_aux (mc_get_options (mc)); -} - -/* Expresses MC->path as a string and returns the string. */ -static const char * -path_string (struct mc *mc) -{ - ds_clear (&mc->path_string); - mc_path_to_string (&mc->path, &mc->path_string); - return ds_cstr (&mc->path_string); -} - -/* Frees STATE, including client data. */ -static void -free_state (const struct mc *mc, struct mc_state *state) -{ - mc->class->destroy (mc, state->data); - mc_path_destroy (&state->path); - free (state); -} - -/* Sets STOP_REASON as the reason that MC's processing stopped, - unless MC is already stopped. */ -static void -stop (struct mc *mc, enum mc_stop_reason stop_reason) -{ - if (mc->results->stop_reason == MC_CONTINUING) - mc->results->stop_reason = stop_reason; -} - -/* Creates and returns a new state whose path is copied from - MC->path and whose data is specified by DATA. */ -static struct mc_state * -make_state (const struct mc *mc, void *data) -{ - struct mc_state *new = xmalloc (sizeof *new); - mc_path_init (&new->path); - mc_path_copy (&new->path, &mc->path); - new->data = data; - return new; -} - -/* Returns the index in MC->queue of a random element in the - queue. */ -static size_t -random_queue_index (struct mc *mc) -{ - assert (!deque_is_empty (&mc->queue_deque)); - return deque_front (&mc->queue_deque, - rand () % deque_count (&mc->queue_deque)); -} - -/* Adds NEW to MC's state queue, dropping a state if necessary - due to overflow. */ -static void -enqueue_state (struct mc *mc, struct mc_state *new) -{ - size_t idx; - - if (new->path.length > mc->results->max_depth_reached) - mc->results->max_depth_reached = new->path.length; - moments1_add (mc->results->depth_moments, new->path.length, 1.0); - - if (deque_count (&mc->queue_deque) < mc->options->queue_limit) - { - /* Add new state to queue. */ - if (deque_is_full (&mc->queue_deque)) - mc->queue = deque_expand (&mc->queue_deque, - mc->queue, sizeof *mc->queue); - switch (mc->options->strategy) - { - case MC_BROAD: - idx = deque_push_back (&mc->queue_deque); - break; - case MC_DEEP: - idx = deque_push_front (&mc->queue_deque); - break; - case MC_RANDOM: - if (!deque_is_empty (&mc->queue_deque)) - { - idx = random_queue_index (mc); - mc->queue[deque_push_front (&mc->queue_deque)] - = mc->queue[idx]; - } - else - idx = deque_push_front (&mc->queue_deque); - break; - case MC_PATH: - assert (deque_is_empty (&mc->queue_deque)); - assert (!is_off_path (mc)); - idx = deque_push_back (&mc->queue_deque); - if (mc->path.length - >= mc_path_get_length (&mc->options->follow_path)) - stop (mc, MC_END_OF_PATH); - break; - default: - NOT_REACHED (); - } - if (deque_count (&mc->queue_deque) > mc->results->max_queue_length) - mc->results->max_queue_length = deque_count (&mc->queue_deque); - } - else - { - /* Queue has reached limit, so replace an existing - state. */ - assert (mc->options->strategy != MC_PATH); - assert (!deque_is_empty (&mc->queue_deque)); - mc->results->queue_dropped_states++; - switch (mc->options->queue_limit_strategy) - { - case MC_DROP_NEWEST: - free_state (mc, new); - return; - case MC_DROP_OLDEST: - switch (mc->options->strategy) - { - case MC_BROAD: - idx = deque_front (&mc->queue_deque, 0); - break; - case MC_DEEP: - idx = deque_back (&mc->queue_deque, 0); - break; - case MC_RANDOM: - case MC_PATH: - default: - NOT_REACHED (); - } - break; - case MC_DROP_RANDOM: - idx = random_queue_index (mc); - break; - default: - NOT_REACHED (); - } - free_state (mc, mc->queue[idx]); - } - mc->queue[idx] = new; -} - -/* Process an error state being added to MC. */ -static void -do_error_state (struct mc *mc) -{ - mc->results->error_count++; - if (mc->results->error_count >= mc->options->max_errors) - stop (mc, MC_MAX_ERROR_COUNT); - - mc_path_copy (&mc->results->error_path, &mc->path); - - if (mc->options->failure_verbosity > mc->options->verbosity) - { - struct mc_options *path_options; - - fprintf (mc->options->output_file, "[%s] retracing error path:\n", - path_string (mc)); - path_options = mc_options_clone (mc->options); - mc_options_set_verbosity (path_options, mc->options->failure_verbosity); - mc_options_set_failure_verbosity (path_options, 0); - mc_options_set_follow_path (path_options, &mc->path); - - mc_results_destroy (mc_run (mc->class, path_options)); - - putc ('\n', mc->options->output_file); - } -} - -/* Advances MC to start processing the operation following the - current one. */ -static void -next_operation (struct mc *mc) -{ - mc_path_push (&mc->path, mc_path_pop (&mc->path) + 1); - mc->state_error = false; - mc->state_named = false; - - if (++mc->progress >= mc->next_progress) - { - struct timeval now; - double elapsed, delta; - - if (mc->results->stop_reason == MC_CONTINUING - && !mc->options->progress_func (mc)) - stop (mc, MC_INTERRUPTED); - - gettimeofday (&now, NULL); - - if (mc->options->time_limit > 0.0 - && (timeval_subtract (now, mc->results->start) - > mc->options->time_limit)) - stop (mc, MC_TIMEOUT); - - elapsed = timeval_subtract (now, mc->prev_progress_time); - if (elapsed > 0.0) - { - /* Re-estimate the amount of progress to take - progress_usec microseconds. */ - unsigned int progress = mc->progress - mc->prev_progress; - double progress_sec = mc->options->progress_usec / 1000000.0; - delta = progress / elapsed * progress_sec; - } - else - { - /* No measurable time at all elapsed during that amount - of progress. Try doubling the amount of progress - required. */ - delta = (mc->progress - mc->prev_progress) * 2; - } - - if (delta > 0.0 && delta + mc->progress + 1.0 < UINT_MAX) - mc->next_progress = mc->progress + delta + 1.0; - else - mc->next_progress = mc->progress + (mc->progress - mc->prev_progress); - - mc->prev_progress = mc->progress; - mc->prev_progress_time = now; - } -} - -/* Returns true if we're tracing an explicit path but the current - operation produces a state off that path, false otherwise. */ -static bool -is_off_path (const struct mc *mc) -{ - return (mc->options->strategy == MC_PATH - && (mc_path_back (&mc->path) - != mc_path_get_operation (&mc->options->follow_path, - mc->path.length - 1))); -} - -/* Handler for SIGINT. */ -static void -sigint_handler (int signum UNUSED) -{ - /* Just mark the model checker as interrupted. */ - *interrupted_ptr = true; -} - -/* Initializes MC as a model checker with the given CLASS and - OPTIONS. OPTIONS may be null to use the default options. */ -static void -init_mc (struct mc *mc, const struct mc_class *class, - struct mc_options *options) -{ - /* Validate and adjust OPTIONS. */ - if (options == NULL) - options = mc_options_create (); - assert (options->queue_limit_strategy != MC_DROP_OLDEST - || options->strategy != MC_RANDOM); - if (options->strategy == MC_PATH) - { - options->max_depth = INT_MAX; - options->hash_bits = 0; - } - if (options->progress_usec == 0) - { - options->progress_func = null_progress; - if (options->time_limit > 0.0) - options->progress_usec = 250000; - } - - /* Initialize MC. */ - mc->class = class; - mc->options = options; - mc->results = mc_results_create (); - - mc->hash = (mc->options->hash_bits > 0 - ? xcalloc (1, DIV_RND_UP (1 << mc->options->hash_bits, CHAR_BIT)) - : NULL); - - mc->queue = NULL; - deque_init_null (&mc->queue_deque); - - mc_path_init (&mc->path); - mc_path_push (&mc->path, 0); - ds_init_empty (&mc->path_string); - mc->state_named = false; - mc->state_error = false; - - mc->progress = 0; - mc->next_progress = mc->options->progress_usec != 0 ? 100 : UINT_MAX; - mc->prev_progress = 0; - mc->prev_progress_time = mc->results->start; - - if (mc->options->strategy == MC_RANDOM - || options->queue_limit_strategy == MC_DROP_RANDOM) - srand (mc->options->seed); - - mc->interrupted = false; - mc->saved_interrupted_ptr = interrupted_ptr; - interrupted_ptr = &mc->interrupted; - mc->saved_sigint = signal (SIGINT, sigint_handler); - - class->init (mc); -} - -/* Complete the model checker run for MC. */ -static void -finish_mc (struct mc *mc) -{ - /* Restore signal handlers. */ - signal (SIGINT, mc->saved_sigint); - interrupted_ptr = mc->saved_interrupted_ptr; - - /* Mark the run complete. */ - stop (mc, MC_SUCCESS); - gettimeofday (&mc->results->end, NULL); - - /* Empty the queue. */ - mc->results->queued_unprocessed_states = deque_count (&mc->queue_deque); - while (!deque_is_empty (&mc->queue_deque)) - { - struct mc_state *state = mc->queue[deque_pop_front (&mc->queue_deque)]; - free_state (mc, state); - } - - /* Notify the progress function of completion. */ - mc->options->progress_func (mc); - - /* Free memory. */ - mc_path_destroy (&mc->path); - ds_destroy (&mc->path_string); - free (mc->options); - free (mc->queue); - free (mc->hash); -} diff --git a/src/libpspp/model-checker.h b/src/libpspp/model-checker.h deleted file mode 100644 index 8c86fae61d..0000000000 --- a/src/libpspp/model-checker.h +++ /dev/null @@ -1,463 +0,0 @@ -/* PSPP - a program for statistical analysis. - Copyright (C) 2007 Free Software Foundation, Inc. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . */ - -/* Implementation-level model checker. - - A model checker is a tool for software testing and - verification that works by exploring all the possible states - in a system and verifying their internal consistency. A - conventional model checker requires that the code in a system - be translated into a specification language. The model - checker then verifies the specification, rather than the code. - - This is instead an implementation-level model checker, which - does not require a separate specification. Instead, the model - checker requires writing a second implementation of the system - being checked. The second implementation can usually be made - almost trivial in comparison to the one being checked, because - it's usually acceptable for its performance to be - comparatively poor, e.g. O(N^2) instead of O(lg N), and thus - to use much simpler algorithms. - - For introduction to the implementation-level model checking - approach used here, please refer to the following papers: - - Musuvathi, Park, Chou, Engler, Dill, "CMC: A Pragmatic - Approach to Model Checking Real Code", Proceedings of the - Fifth Symposium on Operating Systems Design and - Implementation (OSDI), Dec 2002. - http://sprout.stanford.edu/PAPERS/CMC-OSDI-2002/CMC-OSDI-2002.pdf - - Yang, Twohey, Engler, Musuvathi, "Using Model Checking to - Find Serious File System Errors", Proceedings of the Sixth - Symposium on Operating System Design and Implementation - (OSDI), Dec 2004. - http://www.stanford.edu/~engler/osdi04-fisc.pdf - - Yang, Twohey, Pfaff, Sar, Engler, "EXPLODE: A Lightweight, - General Approach to Finding Serious Errors in Storage - Systems", First Workshop on the Evaluation of Software - Defect Detection Tools (BUGS), June 2005. - http://benpfaff.org/papers/explode.pdf - - Use of a model checker is appropriate when the system being - checked is difficult to test using handwritten tests. This - can be the case, for example, when the system has a - complicated internal state that is difficult to reason about - over a long series of operations. - - The implementation model checker works by putting a set of one - of more initial states in a queue (and checking them for - consistency). Then the model checker removes a state from the - queue and applies all possible operations of interest to it - ("mutates" it), obtaining a set of zero or more child states - (and checking each of them for consistency). Each of these - states is itself added to the queue. The model checker - continues dequeuing states and mutating and checking them - until the queue is empty. - - In pseudo-code, the process looks like this: - - Q = { initial states } - while Q is not empty: - S = dequeue(Q) - for each operation applicable to S do: - T = operation(S) - check(T) - enqueue(Q, T) - - In many cases this process will never terminate, because every - state has one or more child states. For some systems this is - unavoidable, but in others we can make the process finite by - pursuing a few stratagems: - - 1. Limit the maximum size of the system; for example, limit - the number of rows and columns in the implementation of a - table being checked. The client of the model checker is - responsible for implementing such limits. - - 2. Avoid checking a single state more than one time. This - model checker provides assistance for this function by - allowing the client to provide a hash of the system state. - States with identical hashes will only be checked once - during a single run. - - When a system cannot be made finite, or when a finite system - is too large to check in a practical amount of time, the model - checker provides multiple ways to limit the checking run: - based on maximum depth, maximum unique states checked, maximum - errors found (by default, 1), or maximum time used for - checking. - - The client of the model checker must provide three functions - via function pointers embedded into a "struct mc_class": - - 1. void init (struct mc *mc); - - This function is called once at the beginning of a - checking run. It checks one or more initial states and - adds them to the model checker's queue. (If it does not - add any states to the queue, then there is nothing to - check.) - - Here's an outline for writing the init function: - - void - init_foo (struct mc *mc) - { - struct foo *foo; - - mc_name_operation (mc, "initial state"); - foo = generate_initial_foo (); - if (!state_is_consistent (foo)) - mc_error (mc, "inconsistent state"); - mc_add_state (mc, foo); - } - - 2. void mutate (struct mc *mc, const void *data); - - This function is called when a dequeued state is ready to - be mutated. For each operation that can be applied to - the client-specified DATA, it applies that operation to a - clone of the DATA, checks that the clone is consistent, - and adds the clone to the model checker's queue. - - Here's an outline for writing the mutate function: - - void - mutate_foo (struct mc *mc, void *state_) - { - struct foo *state = state_; - - for (...each operation...) - if (mc_include_state (mc)) - { - struct foo *clone; - - mc_name_operation (mc, "do operation %s", ...); - clone = clone_foo (state); - do_operation (clone); - if (!mc_discard_dup_state (mc, hash_foo (clone))) - { - if (!state_is_consistent (clone)) - mc_error (mc, "inconsistent state"); - mc_add_state (mc, clone); - } - else - destroy_foo (clone); - } - } - - Notes on the above outline: - - - The call to mc_include_state allows currently - uninteresting operations to be skipped. It is not - essential. - - - The call to mc_name_operation should give the current - operation a human-readable name. The name may - include printf-style format specifications. - - When an error occurs, the model checker (by default) - replays the sequence of operations performed to reach - the error, printing the name of the operation at each - step, which is often sufficient information in itself - to debug the error. - - At higher levels of verbosity, the name is printed - for each operation. - - - Operations should be performed on a copy of the data - provided. The data provided should not be destroyed - or modified. - - - The call to mc_discard_dup_state is needed to discard - (probably) duplicate states. It is otherwise - optional. - - To reduce the probability of collisions, use a - high-quality hash function. MD4 is a reasonable - choice: it is fast but high-quality. In one test, - switching to MD4 from MD5 increased overall speed of - model checking by 8% and actually reduced (!) the - number of collisions. - - The hash value needs to include enough of the state - to ensure that interesting states are not excluded, - but it need not include the entire state. For - example, in many cases, the structure of complex data - (metadata) is often much more important than the - contents (data), so it may be reasonable to hash only - the metadata. - - mc_discard_dup_state may be called before or after - checking for consistency, but calling it first avoids - wasting time checking duplicate states for - consistency, which again can be a significant - performance boost. - - - The mc_error function reports errors. It may be - called as many times as desired to report each kind - of inconsistency in a state. - - - The mc_add_state function adds the new state to the - queue. It should be called regardless of whether an - error was reported, to indicate to the model checker - that state processing has finished. - - - The mutation function should be deterministic, to - make it possible to reliably reproduce errors. - - 3. void destroy (struct mc *mc, void *data); - - This function is called to discard the client-specified - DATA associated with a state. - - Numerous options are available for configuring the model - checker. The most important of these are: - - - Search algorithm: - - * Breadth-first search (the default): First try all the - operations with depth 1, then those with depth 2, then - those with depth 3, and so on. - - This search algorithm finds the least number of - operations needed to trigger a given bug. - - * Depth-first search: Searches downward in the tree of - states as fast as possible. Good for finding bugs that - require long sequences of operations to trigger. - - * Random-first search: Searches through the tree of - states in random order. - - * Explicit path: Applies an explicitly specified sequence - of operations. - - - Verbosity: By default, messages are printed only when an - error is encountered, but you can cause the checker to - print a message on every state transition. This is most - useful when the errors in your code cause segfaults or - some other kind of sudden termination. - - - Treatment of errors: By default, when an error is - encountered, the model checker recursively invokes itself - with an increased verbosity level and configured to follow - only the error path. As long as the mutation function is - deterministic, this quickly and concisely replays the - error and describes the path followed to reach it in an - easily human-readable manner. - - - Limits: - - * Maximum depth: You can limit the depth of the operations - performed. Most often useful with depth-first search. - By default, depth is unlimited. - - * Maximum queue length: You can limit the number of states - kept in the queue at any given time. The main reason to - do so is to limit memory consumption. The default - limit is 10,000 states. Several strategies are - available for choosing which state to drop when the - queue overflows. - - - Stop conditions: based on maximum unique states checked, - maximum errors found (by default, 1), or maximum time used - for checking. - - - Progress: by default, the checker prints a '.' on stderr - every .25 seconds, but you can substitute another progress - function or disable progress printing. - - This model checker does not (yet) include two features - described in the papers cited above: utility scoring - heuristics to guide the search strategy and "choice points" to - explore alternative cases. The former feature is less - interesting for this model checker, because the data - structures we are thus far using it to model are much smaller - than those discussed in the paper. The latter feature we - should implement at some point. */ - -#ifndef LIBPSPP_MODEL_CHECKER_H -#define LIBPSPP_MODEL_CHECKER_H 1 - -#include -#include -#include -#include - -#include - -/* An active model checking run. */ -struct mc; - -/* Provided by each client of the model checker. */ -struct mc_class - { - void (*init) (struct mc *); - void (*mutate) (struct mc *, const void *); - void (*destroy) (const struct mc *, void *); - }; - -/* Results of a model checking run. */ -struct mc_results; - -/* Configuration for running the model checker. */ -struct mc_options; - -/* Primary external interface to model checker. */ -struct mc_results *mc_run (const struct mc_class *, struct mc_options *); - -/* Functions for use from client-supplied "init" and "mutate" - functions. */ -bool mc_include_state (struct mc *); -bool mc_discard_dup_state (struct mc *, unsigned int hash); -void mc_name_operation (struct mc *, const char *, ...) PRINTF_FORMAT (2, 3); -void mc_vname_operation (struct mc *, const char *, va_list) - PRINTF_FORMAT (2, 0); -void mc_error (struct mc *, const char *, ...) PRINTF_FORMAT (2, 3); -void mc_add_state (struct mc *, void *data); - -/* Functions for use from client-supplied "init", "mutate", and - "destroy" functions. */ -const struct mc_options *mc_get_options (const struct mc *); -const struct mc_results *mc_get_results (const struct mc *); -void *mc_get_aux (const struct mc *); - -/* A path of operations through a model to arrive at some - particular state. */ -struct mc_path - { - int *ops; /* Sequence of operations. */ - size_t length; /* Number of operations. */ - size_t capacity; /* Number of operations for which room is allocated. */ - }; - -void mc_path_init (struct mc_path *); -void mc_path_copy (struct mc_path *, const struct mc_path *); -void mc_path_push (struct mc_path *, int new_state); -int mc_path_pop (struct mc_path *); -int mc_path_back (const struct mc_path *); -void mc_path_destroy (struct mc_path *); - -int mc_path_get_operation (const struct mc_path *, size_t index); -size_t mc_path_get_length (const struct mc_path *); - -struct string; -void mc_path_to_string (const struct mc_path *, struct string *); - -struct mc_options *mc_options_create (void); -struct mc_options *mc_options_clone (const struct mc_options *); -void mc_options_destroy (struct mc_options *); - -/* Search strategy. */ -enum mc_strategy - { - MC_BROAD, /* Breadth-first search. */ - MC_DEEP, /* Depth-first search. */ - MC_RANDOM, /* Randomly ordered search. */ - MC_PATH /* Follow one explicit path. */ - }; - -enum mc_strategy mc_options_get_strategy (const struct mc_options *); -void mc_options_set_strategy (struct mc_options *, enum mc_strategy); -unsigned int mc_options_get_seed (const struct mc_options *); -void mc_options_set_seed (struct mc_options *, unsigned int seed); -int mc_options_get_max_depth (const struct mc_options *); -void mc_options_set_max_depth (struct mc_options *, int max_depth); -int mc_options_get_hash_bits (const struct mc_options *); -void mc_options_set_hash_bits (struct mc_options *, int hash_bits); - -const struct mc_path *mc_options_get_follow_path (const struct mc_options *); -void mc_options_set_follow_path (struct mc_options *, const struct mc_path *); - -/* Strategy for dropped states from the queue when it - overflows. */ -enum mc_queue_limit_strategy - { - MC_DROP_NEWEST, /* Don't enqueue the new state at all. */ - MC_DROP_OLDEST, /* Drop the oldest state in the queue. */ - MC_DROP_RANDOM /* Drop a random state from the queue. */ - }; - -int mc_options_get_queue_limit (const struct mc_options *); -void mc_options_set_queue_limit (struct mc_options *, int queue_limit); -enum mc_queue_limit_strategy mc_options_get_queue_limit_strategy ( - const struct mc_options *); -void mc_options_set_queue_limit_strategy (struct mc_options *, - enum mc_queue_limit_strategy); - -int mc_options_get_max_unique_states (const struct mc_options *); -void mc_options_set_max_unique_states (struct mc_options *, - int max_unique_states); -int mc_options_get_max_errors (const struct mc_options *); -void mc_options_set_max_errors (struct mc_options *, int max_errors); -double mc_options_get_time_limit (const struct mc_options *); -void mc_options_set_time_limit (struct mc_options *, double time_limit); - -int mc_options_get_verbosity (const struct mc_options *); -void mc_options_set_verbosity (struct mc_options *, int verbosity); -int mc_options_get_failure_verbosity (const struct mc_options *); -void mc_options_set_failure_verbosity (struct mc_options *, - int failure_verbosity); -FILE *mc_options_get_output_file (const struct mc_options *); -void mc_options_set_output_file (struct mc_options *, FILE *); - -typedef bool mc_progress_func (struct mc *); -int mc_options_get_progress_usec (const struct mc_options *); -void mc_options_set_progress_usec (struct mc_options *, int progress_usec); -mc_progress_func *mc_options_get_progress_func (const struct mc_options *); -void mc_options_set_progress_func (struct mc_options *, mc_progress_func *); - -void *mc_options_get_aux (const struct mc_options *); -void mc_options_set_aux (struct mc_options *, void *aux); - -/* Reason that a model checking run terminated. */ -enum mc_stop_reason - { - MC_CONTINUING, /* Run has not yet terminated. */ - MC_SUCCESS, /* Queue emptied (ran out of states). */ - MC_MAX_UNIQUE_STATES, /* Did requested number of unique states. */ - MC_MAX_ERROR_COUNT, /* Too many errors. */ - MC_END_OF_PATH, /* Processed requested path (MC_PATH only). */ - MC_TIMEOUT, /* Timeout. */ - MC_INTERRUPTED /* Received SIGINT (Ctrl+C). */ - }; - -void mc_results_destroy (struct mc_results *); - -enum mc_stop_reason mc_results_get_stop_reason (const struct mc_results *); -int mc_results_get_unique_state_count (const struct mc_results *); -int mc_results_get_error_count (const struct mc_results *); - -int mc_results_get_max_depth_reached (const struct mc_results *); -double mc_results_get_mean_depth_reached (const struct mc_results *); - -const struct mc_path *mc_results_get_error_path (const struct mc_results *); - -int mc_results_get_duplicate_dropped_states (const struct mc_results *); -int mc_results_get_off_path_dropped_states (const struct mc_results *); -int mc_results_get_depth_dropped_states (const struct mc_results *); -int mc_results_get_queue_dropped_states (const struct mc_results *); -int mc_results_get_queued_unprocessed_states (const struct mc_results *); -int mc_results_get_max_queue_length (const struct mc_results *); - -struct timeval mc_results_get_start (const struct mc_results *); -struct timeval mc_results_get_end (const struct mc_results *); -double mc_results_get_duration (const struct mc_results *); - -#endif /* libpspp/model-checker.h */ diff --git a/src/math/automake.mk b/src/math/automake.mk index 107a985d44..e463088779 100644 --- a/src/math/automake.mk +++ b/src/math/automake.mk @@ -2,32 +2,34 @@ include $(top_srcdir)/src/math/ts/automake.mk -noinst_LIBRARIES += src/math/libpspp_math.a +noinst_LTLIBRARIES += src/math/libpspp-math.la -src_math_libpspp_math_a_SOURCES = \ - src/math/factor-stats.c \ - src/math/factor-stats.h \ +src_math_libpspp_math_la_LIBADD = \ + lib/linreg/liblinreg.la + +src_math_libpspp_math_la_SOURCES = \ src/math/chart-geometry.c \ src/math/chart-geometry.h \ + src/math/box-whisker.c src/math/box-whisker.h \ src/math/coefficient.c \ src/math/coefficient.h \ src/math/covariance-matrix.c \ src/math/covariance-matrix.h \ + src/math/design-matrix.c src/math/design-matrix.h \ + src/math/extrema.c src/math/extrema.h \ src/math/group.c src/math/group.h \ src/math/group-proc.h \ src/math/histogram.c src/math/histogram.h \ - src/math/interaction.c \ - src/math/interaction.h \ - src/math/levene.c \ - src/math/levene.h \ - src/math/linreg.c \ - src/math/linreg.h \ - src/math/merge.c \ - src/math/merge.h \ + src/math/interaction.c src/math/interaction.h \ + src/math/levene.c src/math/levene.h \ + src/math/linreg.c src/math/linreg.h \ + src/math/merge.c src/math/merge.h \ src/math/moments.c src/math/moments.h \ + src/math/np.c src/math/np.h \ + src/math/order-stats.c src/math/order-stats.h \ src/math/percentiles.c src/math/percentiles.h \ - src/math/design-matrix.c src/math/design-matrix.h \ src/math/random.c src/math/random.h \ - src/math/sort.c src/math/sort.h - -EXTRA_DIST += src/math/OChangeLog + src/math/statistic.h \ + src/math/sort.c src/math/sort.h \ + src/math/trimmed-mean.c src/math/trimmed-mean.h \ + src/math/tukey-hinges.c src/math/tukey-hinges.h diff --git a/src/math/box-whisker.c b/src/math/box-whisker.c new file mode 100644 index 0000000000..288fc072ef --- /dev/null +++ b/src/math/box-whisker.c @@ -0,0 +1,139 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include +#include "box-whisker.h" +#include "order-stats.h" +#include "tukey-hinges.h" +#include +#include +#include +#include +#include +#include +#include +#include + +static void +destroy (struct statistic *s) +{ + struct order_stats *os = (struct order_stats *) s; + struct box_whisker *bw = (struct box_whisker *) s; + struct ll *ll; + + for (ll = ll_head (&bw->outliers); ll != ll_null (&bw->outliers); ) + { + struct outlier *e = ll_data (ll, struct outlier, ll); + + ll = ll_next (ll); + + ds_destroy (&e->label); + free (e); + } + + free (os->k); + free (s); +}; + + +static void +acc (struct statistic *s, const struct ccase *cx, + double c UNUSED, double cc UNUSED, double y) +{ + struct box_whisker *bw = (struct box_whisker *) s; + bool extreme; + struct outlier *o; + + if ( y < bw->hinges[2] + bw->step) + bw->whiskers[1] = y; + + if (bw->whiskers[0] == SYSMIS || bw->hinges[0] - bw->step > y) + bw->whiskers[0] = y; + + if ( y > bw->hinges[2] + bw->step) + extreme = (y > bw->hinges[2] + 2 * bw->step) ; + + else if (y < bw->hinges[0] - bw->step) + extreme = (y < bw->hinges[0] - 2 * bw->step) ; + + else + return; + + o = xzalloc (sizeof *o) ; + o->value = y; + o->extreme = extreme; + ds_init_empty (&o->label); + + if (bw->id_var) + var_append_value_name (bw->id_var, + case_data (cx, bw->id_var), + &o->label); + else + ds_put_format (&o->label, + "%ld", + (casenumber) case_data_idx (cx, bw->casenumber_idx)->f); + + ll_push_head (&bw->outliers, &o->ll); +} + +void +box_whisker_whiskers (const struct box_whisker *bw, double whiskers[2]) +{ + whiskers[0] = bw->whiskers[0]; + whiskers[1] = bw->whiskers[1]; +} + +void +box_whisker_hinges (const struct box_whisker *bw, double hinges[3]) +{ + hinges[0] = bw->hinges[0]; + hinges[1] = bw->hinges[1]; + hinges[2] = bw->hinges[2]; +} + +const struct ll_list * +box_whisker_outliers (const struct box_whisker *bw) +{ + return &bw->outliers; +} + +struct statistic * +box_whisker_create (const struct tukey_hinges *th, + const struct variable *id_var, size_t casenumber_idx) +{ + struct box_whisker *w = xzalloc (sizeof (*w)); + struct order_stats *os = (struct order_stats *) w; + struct statistic *stat = (struct statistic *) w; + + os->n_k = 0; + + stat->destroy = destroy; + stat->accumulate = acc; + + tukey_hinges_calculate (th, w->hinges); + + w->casenumber_idx = casenumber_idx; + w->id_var = id_var; + + w->step = (w->hinges[2] - w->hinges[0]) * 1.5; + + w->whiskers[1] = w->hinges[2]; + w->whiskers[0] = SYSMIS; + + ll_init (&w->outliers); + + return stat; +} diff --git a/src/math/box-whisker.h b/src/math/box-whisker.h new file mode 100644 index 0000000000..5202b64672 --- /dev/null +++ b/src/math/box-whisker.h @@ -0,0 +1,65 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#ifndef __MATH_BOX_WHISKER_H__ +#define __MATH_BOX_WHISKER_H__ + +#include +#include +#include +#include "order-stats.h" + +/* This module calculates the statistics typically displayed by box-plots. + However, there's no reason not to use it for other purposes too. + */ +struct tukey_hinges; + + +struct outlier +{ + double value; + struct string label; + bool extreme; + struct ll ll; +}; + + +struct box_whisker +{ + struct order_stats parent; + + double hinges[3]; + double whiskers[2]; + + struct ll_list outliers; + + double step; + + size_t casenumber_idx; + const struct variable *id_var; +}; + +struct statistic * box_whisker_create (const struct tukey_hinges *, + const struct variable *, size_t); + +void box_whisker_whiskers (const struct box_whisker *bw, double whiskers[2]); + +void box_whisker_hinges (const struct box_whisker *bw, double hinges[2]); + +const struct ll_list * box_whisker_outliers (const struct box_whisker *bw); + + +#endif diff --git a/src/math/coefficient.c b/src/math/coefficient.c index 5872b576fa..f738c783f2 100644 --- a/src/math/coefficient.c +++ b/src/math/coefficient.c @@ -160,13 +160,21 @@ pspp_coeff_var_to_coeff (const struct variable *v, struct pspp_coeff **coefs, size_t i = 0; size_t j = 0; size_t v_idx; + struct pspp_coeff *result = NULL; if (v != NULL) { v_idx = var_get_dict_index (v); - while (i < n_coef && var_get_dict_index (coefs[i]->v_info->v) != v_idx) + while (i < n_coef) { + if (coefs[i]->v_info != NULL) + { + if (var_get_dict_index (coefs[i]->v_info->v) == v_idx) + { + break; + } + } i++; } result = coefs[i]; @@ -179,7 +187,7 @@ pspp_coeff_var_to_coeff (const struct variable *v, struct pspp_coeff **coefs, { j = i; while (j < n_coef && compare_values (pspp_coeff_get_value (coefs[j], v), - val, var_get_width (v)) != 0) + val, v) != 0) { j++; } diff --git a/src/math/covariance-matrix.c b/src/math/covariance-matrix.c index 5414379119..f929a370cf 100644 --- a/src/math/covariance-matrix.c +++ b/src/math/covariance-matrix.c @@ -74,7 +74,7 @@ column_iterate (struct design_matrix *cov, const struct variable *v, col += i; y = -1.0 * cat_get_category_count (i, v) / ssize; tmp_val = cat_subscript_to_value (i, v); - if (compare_values (tmp_val, val1, var_get_width (v))) + if (compare_values (tmp_val, val1, v)) { y += -1.0; } @@ -106,7 +106,7 @@ void covariance_pass_two (struct design_matrix *cov, double mean1, double mean2, row += i; x = -1.0 * cat_get_category_count (i, v1) / ssize; tmp_val = cat_subscript_to_value (i, v1); - if (compare_values (tmp_val, val1, var_get_width (v1))) + if (compare_values (tmp_val, val1, v1)) { x += 1.0; } diff --git a/src/math/design-matrix.c b/src/math/design-matrix.c index 298d03357b..e991d2a398 100644 --- a/src/math/design-matrix.c +++ b/src/math/design-matrix.c @@ -191,3 +191,25 @@ design_matrix_set_numeric (struct design_matrix *dm, size_t row, assert (col != DM_COLUMN_NOT_FOUND); gsl_matrix_set (dm->m, row, col, val->f); } + +struct design_matrix * +design_matrix_clone (const struct design_matrix *dm) +{ + struct design_matrix *result; + size_t i; + size_t j; + + assert (dm != NULL); + result = xmalloc (sizeof *result); + result->vars = xnmalloc (dm->n_vars, sizeof *dm->vars); + result->n_vars = dm->n_vars; + result->m = gsl_matrix_alloc (dm->m->size1, dm->m->size2); + + gsl_matrix_memcpy (result->m, dm->m); + for (i = 0; i < result->n_vars; i++) + { + result->vars[i] = dm->vars[i]; + } + return result; +} + diff --git a/src/math/extrema.c b/src/math/extrema.c new file mode 100644 index 0000000000..617c7ac72b --- /dev/null +++ b/src/math/extrema.c @@ -0,0 +1,144 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include +#include "extrema.h" +#include +#include +#include +#include +#include +#include + +struct extrema +{ + size_t capacity; + size_t n; + struct ll_list list; + + ll_compare_func *cmp_func; +}; + + +static int +cmp_descending (const struct ll *a_, const struct ll *b_, void *aux UNUSED) +{ + const struct extremum *a = ll_data (a_, struct extremum, ll); + const struct extremum *b = ll_data (b_, struct extremum, ll); + + if ( a->value > b->value) return -1; + + return (a->value < b->value); +} + +static int +cmp_ascending (const struct ll *a_, const struct ll *b_, void *aux UNUSED) +{ + const struct extremum *a = ll_data (a_, struct extremum, ll); + const struct extremum *b = ll_data (b_, struct extremum, ll); + + if ( a->value < b->value) return -1; + + return (a->value > b->value); +} + + +struct extrema * +extrema_create (size_t n, enum extreme_end end) +{ + struct extrema *extrema = xzalloc (sizeof *extrema); + extrema->capacity = n; + + if ( end == EXTREME_MAXIMA ) + extrema->cmp_func = cmp_descending; + else + extrema->cmp_func = cmp_ascending; + + ll_init (&extrema->list); + + return extrema; +} + +void +extrema_destroy (struct extrema *extrema) +{ + struct ll *ll = ll_head (&extrema->list); + + while (ll != ll_null (&extrema->list)) + { + struct extremum *e = ll_data (ll, struct extremum, ll); + + ll = ll_next (ll); + free (e); + } + + free (extrema); +} + + +void +extrema_add (struct extrema *extrema, double val, + double weight, + casenumber location) +{ + struct extremum *e = xzalloc (sizeof *e) ; + e->value = val; + e->location = location; + e->weight = weight; + + if ( val == SYSMIS) + { + free (e); + return; + } + + ll_insert_ordered (ll_head (&extrema->list), ll_null (&extrema->list), + &e->ll, extrema->cmp_func, NULL); + + if ( extrema->n++ > extrema->capacity) + { + struct ll *tail = ll_tail (&extrema->list); + struct extremum *et = ll_data (tail, struct extremum, ll); + + ll_remove (tail); + + free (et); + } +} + + +const struct ll_list * +extrema_list (const struct extrema *ex) +{ + return &ex->list; +} + + +bool +extrema_top (const struct extrema *ex, double *v) +{ + const struct extremum *top; + + if ( ll_is_empty (&ex->list)) + return false; + + top = (const struct extremum *) + ll_data (ll_head(&ex->list), struct extremum, ll); + + *v = top->value; + + return true; +} diff --git a/src/math/extrema.h b/src/math/extrema.h new file mode 100644 index 0000000000..d891c533c9 --- /dev/null +++ b/src/math/extrema.h @@ -0,0 +1,58 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#ifndef __EXTREMA_H__ +#define __EXTREMA_H__ 1 + +#include +#include +#include + +struct extremum +{ + double value; + casenumber location; + double weight; + + /* Internal use only */ + struct ll ll; +}; + + +enum extreme_end + { + EXTREME_MAXIMA, + EXTREME_MINIMA + }; + +struct extrema; + +struct extrema *extrema_create (size_t n, enum extreme_end); + +void extrema_destroy (struct extrema *extrema); + +void extrema_add (struct extrema *extrema, double val, + double weight, + casenumber location); + +void extrema_show (const struct extrema *extrema); + +const struct ll_list * extrema_list (const struct extrema *); + +bool extrema_top (const struct extrema *, double *); + + +#endif diff --git a/src/math/factor-stats.c b/src/math/factor-stats.c deleted file mode 100644 index a97d7f0934..0000000000 --- a/src/math/factor-stats.c +++ /dev/null @@ -1,328 +0,0 @@ -/* PSPP - a program for statistical analysis. - Copyright (C) 2004 Free Software Foundation, Inc. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . */ - -#include -#include "factor-stats.h" -#include -#include -#include -#include -#include "moments.h" -#include "percentiles.h" - -#include -#include -#include -#include -#include "histogram.h" - -#include "xalloc.h" - -void -metrics_precalc(struct metrics *m) -{ - assert (m) ; - - m->n_missing = 0; - - m->min = DBL_MAX; - m->max = -DBL_MAX; - - m->histogram = 0; - - m->moments = moments1_create(MOMENT_KURTOSIS); - - m->ordered_data = hsh_create(20, - (hsh_compare_func *) compare_values, - (hsh_hash_func *) hash_value, - (hsh_free_func *) weighted_value_free, - (void *) 0); -} - - -/* Include val in the calculation for the metrics. - If val is null, then treat it as MISSING -*/ -void -metrics_calc (struct metrics *fs, const union value *val, - double weight, int case_no) -{ - struct weighted_value **wv; - double x; - - if ( ! val ) - { - fs->n_missing += weight; - return ; - } - - x = val->f; - - moments1_add(fs->moments, x, weight); - - if ( x < fs->min) fs->min = x; - if ( x > fs->max) fs->max = x; - - wv = (struct weighted_value **) hsh_probe (fs->ordered_data,(void *) val ); - - if ( *wv ) - { - /* If this value has already been seen, then simply - increase its weight and push a new case number */ - - struct case_node *cn; - - assert( (*wv)->v.f == val->f ); - (*wv)->w += weight; - - cn = xmalloc ( sizeof *cn); - cn->next = (*wv)->case_nos ; - cn->num = case_no; - - (*wv)->case_nos = cn; - } - else - { - struct case_node *cn; - - *wv = weighted_value_create(); - (*wv)->v = *val; - (*wv)->w = weight; - - cn = xmalloc (sizeof *cn); - cn->next=0; - cn->num = case_no; - (*wv)->case_nos = cn; - - } - -} - -void -metrics_postcalc(struct metrics *m) -{ - double cc = 0.0; - double tc ; - int k1, k2 ; - int i; - int j = 1; - - moments1_calculate (m->moments, &m->n, &m->mean, &m->var, - &m->skewness, &m->kurtosis); - - moments1_destroy (m->moments); - - - m->stddev = sqrt(m->var); - - /* FIXME: Check this is correct ??? - Shouldn't we use the sample variance ??? */ - m->se_mean = sqrt (m->var / m->n) ; - - - - m->wvp = (struct weighted_value **) hsh_sort(m->ordered_data); - m->n_data = hsh_count(m->ordered_data); - - /* Trimmed mean calculation */ - if ( m->n_data <= 1 ) - { - m->trimmed_mean = m->mean; - return; - } - - m->histogram = histogram_create(10, m->min, m->max); - - for ( i = 0 ; i < m->n_data ; ++i ) - { - struct weighted_value **wv = (m->wvp) ; - gsl_histogram_accumulate(m->histogram, wv[i]->v.f, wv[i]->w); - } - - tc = m->n * 0.05 ; - k1 = -1; - k2 = -1; - - for ( i = 0 ; i < m->n_data ; ++i ) - { - cc += m->wvp[i]->w; - m->wvp[i]->cc = cc; - - m->wvp[i]->rank = j + (m->wvp[i]->w - 1) / 2.0 ; - - j += m->wvp[i]->w; - - if ( cc < tc ) - k1 = i; - } - - - - k2 = m->n_data; - for ( i = m->n_data -1 ; i >= 0; --i ) - { - if ( tc > m->n - m->wvp[i]->cc) - k2 = i; - } - - - /* Calculate the percentiles */ - ptiles (m->ptile_hash, (const struct weighted_value **) m->wvp, - m->n_data, m->n, m->ptile_alg); - - tukey_hinges ((const struct weighted_value **) m->wvp, - m->n_data, m->n, m->hinge); - - /* Special case here */ - if ( k1 + 1 == k2 ) - { - m->trimmed_mean = m->wvp[k2]->v.f; - return; - } - - m->trimmed_mean = 0; - for ( i = k1 + 2 ; i <= k2 - 1 ; ++i ) - { - m->trimmed_mean += m->wvp[i]->v.f * m->wvp[i]->w; - } - - - m->trimmed_mean += (m->n - m->wvp[k2 - 1]->cc - tc) * m->wvp[k2]->v.f ; - m->trimmed_mean += (m->wvp[k1 + 1]->cc - tc) * m->wvp[k1 + 1]->v.f ; - m->trimmed_mean /= 0.9 * m->n ; - - -} - - -struct weighted_value * -weighted_value_create(void) -{ - struct weighted_value *wv; - wv = xmalloc (sizeof *wv); - - wv->cc = 0; - wv->case_nos = 0; - - return wv; -} - -void -weighted_value_free(struct weighted_value *wv) -{ - struct case_node *cn ; - - if ( !wv ) - return ; - - cn = wv->case_nos; - - while(cn) - { - struct case_node *next = cn->next; - - free(cn); - cn = next; - } - - free(wv); - -} - - - - - -/* Create a factor statistics object with for N dependent vars - and ID0 and ID1 as the values of the independent variable */ -struct factor_statistics * -create_factor_statistics (int n, - union value *id0, - union value *id1) -{ - struct factor_statistics *f; - - f = xmalloc (sizeof *f); - - f->id[0] = id0; - f->id[1] = id1; - f->m = xnmalloc (n, sizeof *f->m); - memset (f->m, 0, sizeof(struct metrics) * n); - f->n_var = n; - - return f; -} - -void -metrics_destroy(struct metrics *m) -{ - hsh_destroy(m->ordered_data); - hsh_destroy(m->ptile_hash); - if ( m-> histogram ) - gsl_histogram_free(m->histogram); -} - -void -factor_statistics_free(struct factor_statistics *f) -{ - - int i; - free (f->id[0]); - free (f->id[1]); - for ( i = 0 ; i < f->n_var; ++i ) - metrics_destroy(&f->m[i]); - free(f->m) ; - free(f); -} - - -int -factor_statistics_compare(const struct factor_statistics *f0, - const struct factor_statistics *f1, int width) -{ - - int cmp0; - - assert(f0); - assert(f1); - - cmp0 = compare_values(f0->id[0], f1->id[0], width); - - if ( cmp0 != 0 ) - return cmp0; - - - if ( ( f0->id[1]->f == SYSMIS ) && (f1->id[1]->f != SYSMIS) ) - return 1; - - if ( ( f0->id[1]->f != SYSMIS ) && (f1->id[1]->f == SYSMIS) ) - return -1; - - return compare_values (f0->id[1], f1->id[1], width); -} - -unsigned int -factor_statistics_hash (const struct factor_statistics *f, int width) -{ - unsigned int h; - - h = hash_value (f->id[0], width); - - if ( f->id[1]->f != SYSMIS ) - h += hash_value(f->id[1], width); - - return h; -} diff --git a/src/math/factor-stats.h b/src/math/factor-stats.h deleted file mode 100644 index 3c1c7f9104..0000000000 --- a/src/math/factor-stats.h +++ /dev/null @@ -1,162 +0,0 @@ -/* PSPP - a program for statistical analysis. - Copyright (C) 2004 Free Software Foundation, Inc. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . */ - -#ifndef FACTOR_STATS -#define FACTOR_STATS - - -/* FIXME: These things should probably be amalgamated with the - group_statistics struct */ - -#include -#include -#include -#include -#include "percentiles.h" - - -struct moments1; - -struct metrics -{ - double n; - - double n_missing; - - double min; - - double max; - - double mean; - - double se_mean; - - double var; - - double stddev; - - struct moments1 *moments; - - gsl_histogram *histogram; - - double skewness; - double kurtosis; - - double trimmed_mean; - - /* A hash of data for this factor. */ - struct hsh_table *ordered_data; - - /* A Pointer to this hash table AFTER it has been SORTED and crunched */ - struct weighted_value **wvp; - - /* The number of values in the above array - (if all the weights are 1, then this will - be the same as n) */ - int n_data; - - /* Percentile stuff */ - - /* A hash of struct percentiles */ - struct hsh_table *ptile_hash; - - /* Algorithm to be used for calculating percentiles */ - enum pc_alg ptile_alg; - - /* Tukey's Hinges */ - double hinge[3]; - -}; - - -struct metrics * metrics_create(void); - -void metrics_precalc(struct metrics *m); - -void metrics_calc(struct metrics *m, const union value *f, double weight, - int case_no); - -void metrics_postcalc(struct metrics *m); - -void metrics_destroy(struct metrics *m); - - - -/* Linked list of case nos */ -struct case_node -{ - int num; - struct case_node *next; -}; - -struct weighted_value -{ - union value v; - - /* The weight */ - double w; - - /* The cumulative weight */ - double cc; - - /* The rank */ - double rank; - - /* Linked list of cases nos which have this value */ - struct case_node *case_nos; - -}; - - -struct weighted_value *weighted_value_create(void); - -void weighted_value_free(struct weighted_value *wv); - - - -struct factor_statistics { - - /* The values of the independent variables */ - union value *id[2]; - - /* The an array stats for this factor, one for each dependent var */ - struct metrics *m; - - /* The number of dependent variables */ - int n_var; -}; - - -/* Create a factor statistics object with for N dependent vars - and ID as the value of the independent variable */ -struct factor_statistics * create_factor_statistics (int n, - union value *id0, - union value *id1); - - -void factor_statistics_free(struct factor_statistics *f); - - -/* Compare f0 and f1. - width is the width of the independent variable */ -int -factor_statistics_compare(const struct factor_statistics *f0, - const struct factor_statistics *f1, int width); - -unsigned int -factor_statistics_hash(const struct factor_statistics *f, int width); - -#endif diff --git a/src/math/group.c b/src/math/group.c index 29c5ab23db..6101c350e4 100644 --- a/src/math/group.c +++ b/src/math/group.c @@ -29,21 +29,24 @@ /* Return -1 if the id of a is less than b; +1 if greater than and 0 if equal */ int -compare_group (const struct group_statistics *a, - const struct group_statistics *b, - int width) +compare_group (const void *a_, + const void *b_, + const void *var) { - return compare_values(&a->id, &b->id, width); + const struct group_statistics *a = a_; + const struct group_statistics *b = b_; + return compare_values(&a->id, &b->id, var); } -unsigned -hash_group (const struct group_statistics *g, int width) +unsigned int +hash_group (const void *g_, const void *var) { unsigned id_hash; + const struct group_statistics *g = g_;; - id_hash = hash_value(&g->id, width); + id_hash = hash_value(&g->id, var); return id_hash; } diff --git a/src/math/group.h b/src/math/group.h index bc82c8ab82..c5470b2578 100644 --- a/src/math/group.h +++ b/src/math/group.h @@ -18,10 +18,8 @@ #ifndef GROUP_H #define GROUP_H - #include - /* Statistics for grouped data */ struct group_statistics { @@ -67,17 +65,17 @@ struct group_statistics }; - +struct variable ; /* These funcs are useful for hash tables */ /* Return -1 if the id of a is less than b; +1 if greater than and 0 if equal */ -int compare_group (const struct group_statistics *a, - const struct group_statistics *b, - int width); +int compare_group (const void *a, + const void *b, + const void *var); -unsigned hash_group (const struct group_statistics *g, int width); +unsigned int hash_group (const void *g, const void *var); void free_group (struct group_statistics *v, void *aux); diff --git a/src/math/histogram.c b/src/math/histogram.c index 7b875d4089..67079398d1 100644 --- a/src/math/histogram.c +++ b/src/math/histogram.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2004 Free Software Foundation, Inc. + Copyright (C) 2004, 2008 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -15,36 +15,69 @@ along with this program. If not, see . */ #include -#include -#include -#include #include "histogram.h" + +#include +#include + +#include #include "chart-geometry.h" +#include + + +void +histogram_add (struct histogram *h, double y, double c) +{ + ((struct statistic *)h)->accumulate ((struct statistic *) h, NULL, c, 0, y); +} + -gsl_histogram * -histogram_create(double bins, double x_min, double x_max) +static void +acc (struct statistic *s, const struct ccase *cx UNUSED, double c, double cc UNUSED, double y) { - int n; - double bin_width ; - double bin_width_2 ; + struct histogram *hist = (struct histogram *) s; + + gsl_histogram_accumulate (hist->gsl_hist, y, c); +} + + +static void +destroy (struct statistic *s) +{ + struct histogram *h = (struct histogram *) s; + gsl_histogram_free (h->gsl_hist); + free (s); +} + + +struct statistic * +histogram_create (int bins, double min, double max) +{ + struct histogram *h = xmalloc (sizeof *h); + struct statistic *stat = (struct statistic *) h; double upper_limit, lower_limit; - gsl_histogram *hist = gsl_histogram_alloc(bins); + double bin_width = chart_rounded_tick ((max - min) / (double) bins); + double bin_width_2 = bin_width / 2.0; - bin_width = chart_rounded_tick((x_max - x_min)/ bins); - bin_width_2 = bin_width / 2.0; + int n = ceil (max / (bin_width_2) ) ; + + assert (max > min); - n = ceil( x_max / (bin_width_2) ) ; if ( ! (n % 2 ) ) n++; upper_limit = n * bin_width_2; - n = floor( x_min / (bin_width_2) ) ; + n = floor (min / (bin_width_2) ) ; if ( ! (n % 2 ) ) n--; lower_limit = n * bin_width_2; - gsl_histogram_set_ranges_uniform(hist, lower_limit, upper_limit); + h->gsl_hist = gsl_histogram_alloc (bins); + gsl_histogram_set_ranges_uniform (h->gsl_hist, lower_limit, upper_limit); + + stat->accumulate = acc; + stat->destroy = destroy; - return hist; + return stat; } diff --git a/src/math/histogram.h b/src/math/histogram.h index e4c7819f33..b2b204ee80 100644 --- a/src/math/histogram.h +++ b/src/math/histogram.h @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2004 Free Software Foundation, Inc. + Copyright (C) 2008 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,11 +14,25 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . */ -#ifndef HISTOGRAM_H -#define HISTOGRAM_H +#ifndef __NEW_HISTOGRAM_H__ +#define __NEW_HISTOGRAM_H__ + +#include + +#include "statistic.h" #include -gsl_histogram * histogram_create(double bins, double x_min, double x_max); + +struct histogram +{ + struct statistic parent; + gsl_histogram *gsl_hist; +}; + +struct statistic * histogram_create (int bins, double max, double min); + +void histogram_add (struct histogram *h, double y, double c); + #endif diff --git a/src/math/linreg.c b/src/math/linreg.c index 2fb98cf768..306ef31306 100644 --- a/src/math/linreg.c +++ b/src/math/linreg.c @@ -96,10 +96,10 @@ linreg_mean_std (gsl_vector_const_view v, double *mp, double *sp, double *ssp) The return value is the number of distinct variables found. */ int -pspp_linreg_get_vars (const void *c_, const struct variable **v) +pspp_linreg_get_vars (const void *c_, struct variable **v) { const pspp_linreg_cache *c = c_; - const struct variable *tmp; + struct variable *tmp; int i; int j; int result = 0; @@ -138,12 +138,15 @@ pspp_linreg_get_vars (const void *c_, const struct variable **v) independent variables. */ pspp_linreg_cache * -pspp_linreg_cache_alloc (size_t n, size_t p) +pspp_linreg_cache_alloc (const struct variable *depvar, const struct variable **indep_vars, + size_t n, size_t p) { + size_t i; pspp_linreg_cache *c; c = (pspp_linreg_cache *) malloc (sizeof (pspp_linreg_cache)); - c->depvar = NULL; + c->depvar = depvar; + c->indep_vars = indep_vars; c->indep_means = gsl_vector_alloc (p); c->indep_std = gsl_vector_alloc (p); c->ssx = gsl_vector_alloc (p); /* Sums of squares for the @@ -152,9 +155,22 @@ pspp_linreg_cache_alloc (size_t n, size_t p) c->ss_indeps = gsl_vector_alloc (p); /* Sums of squares for the model parameters. */ - c->cov = gsl_matrix_alloc (p + 1, p + 1); /* Covariance matrix. */ c->n_obs = n; c->n_indeps = p; + c->n_coeffs = 0; + for (i = 0; i < p; i++) + { + if (var_is_numeric (indep_vars[i])) + { + c->n_coeffs++; + } + else + { + c->n_coeffs += cat_get_n_categories (indep_vars[i]) - 1; + } + } + + c->cov = gsl_matrix_alloc (c->n_coeffs + 1, c->n_coeffs + 1); /* Default settings. */ @@ -194,13 +210,12 @@ pspp_linreg_cache_free (void *m) return true; } static void -cache_init (pspp_linreg_cache *cache, const struct design_matrix *dm) +cache_init (pspp_linreg_cache *cache) { assert (cache != NULL); cache->dft = cache->n_obs - 1; cache->dfm = cache->n_indeps; cache->dfe = cache->dft - cache->dfm; - cache->n_coeffs = dm->m->size2; cache->intercept = 0.0; } @@ -320,7 +335,8 @@ pspp_linreg (const gsl_vector * Y, const struct design_matrix *dm, cache->depvar_std = s; cache->sst = ss; } - cache_init (cache, dm); + cache_init (cache); + cache->n_coeffs = dm->m->size2; for (i = 0; i < dm->m->size2; i++) { if (opts->get_indep_mean_std[i]) @@ -647,83 +663,61 @@ void pspp_linreg_set_indep_variable_mean (pspp_linreg_cache *c, const struct var static struct design_matrix * rearrange_covariance_matrix (const struct design_matrix *cov, pspp_linreg_cache *c) { - struct variable **v; struct variable **model_vars; - struct variable *tmp; struct design_matrix *result; - int n_vars; - int found; - size_t *columns; + size_t *permutation; size_t i; size_t j; size_t k; - size_t dep_col; assert (cov != NULL); assert (c != NULL); assert (cov->m->size1 > 0); assert (cov->m->size2 == cov->m->size1); - v = xnmalloc (c->n_coeffs, sizeof (*v)); - model_vars = xnmalloc (c->n_coeffs, sizeof (*model_vars)); - columns = xnmalloc (cov->m->size2, sizeof (*columns)); - n_vars = pspp_linreg_get_vars (c, (const struct variable **) v); - dep_col = 0; - k = 0; - for (i = 0; i < cov->m->size2; i++) - { - tmp = design_matrix_col_to_var (cov, i); - found = 0; - j = 0; - while (!found && j < n_vars) - { - if (tmp == v[j]) - { - found = 1; - if (tmp == c->depvar) - { - dep_col = j; - } - else - { - columns[k] = j; - k++; - } - } - j++; - } - } - k++; - columns[k] = dep_col; - /* - K should now be equal to C->N_INDEPS + 1. If it is not, then - either the code above is wrong or the caller didn't send us the - correct values in C. - */ - assert (k == c->n_indeps + 1); + permutation = xnmalloc (1 + c->n_indeps, sizeof (*permutation)); + model_vars = xnmalloc (1 + c->n_indeps, sizeof (*model_vars)); + /* Put the model variables in the right order in MODEL_VARS. */ - for (i = 0; i < k; i++) + for (i = 0; i < c->n_indeps; i++) { - model_vars[i] = v[columns[i]]; + model_vars[i] = c->indep_vars[i]; } - - result = covariance_matrix_create (k, model_vars); - for (i = 0; i < result->m->size1; i++) + model_vars[i] = c->depvar; + result = covariance_matrix_create (1 + c->n_indeps, model_vars); + for (j = 0; j < cov->m->size2; j++) { - for (j = 0; j < result->m->size2; j++) + k = 0; + while (k < result->m->size2) { - gsl_matrix_set (result->m, i, j, gsl_matrix_get (cov->m, columns[i], columns[j])); + if (design_matrix_col_to_var (cov, j) == design_matrix_col_to_var (result, k)) + { + permutation[k] = j; + } + k++; } } - free (columns); - free (v); + for (i = 0; i < result->m->size1; i++) + for (j = 0; j < result->m->size2; j++) + { + gsl_matrix_set (result->m, i, j, gsl_matrix_get (cov->m, permutation[i], permutation[j])); + } + free (permutation); + free (model_vars); return result; } /* Estimate the model parameters from the covariance matrix only. This method uses less memory than PSPP_LINREG, which requires the entire data set to be stored in memory. + + The function assumes FULL_COV may contain columns corresponding to + variables that are not in the model. It fixes this in + REARRANG_COVARIANCE_MATRIX. This allows the caller to compute a + large covariance matrix once before, then pass it to this without + having to alter it. The problem is that this means the caller must + set CACHE->N_COEFFS. */ int pspp_linreg_with_cov (const struct design_matrix *full_cov, @@ -731,11 +725,11 @@ pspp_linreg_with_cov (const struct design_matrix *full_cov, { struct design_matrix *cov; - assert (cov != NULL); + assert (full_cov != NULL); assert (cache != NULL); cov = rearrange_covariance_matrix (full_cov, cache); - cache_init (cache, cov); + cache_init (cache); reg_sweep (cov->m); post_sweep_computations (cache, cov, cov->m); covariance_matrix_destroy (cov); diff --git a/src/math/linreg.h b/src/math/linreg.h index 05c8159630..a9577d6483 100644 --- a/src/math/linreg.h +++ b/src/math/linreg.h @@ -96,9 +96,10 @@ struct pspp_linreg_cache_struct coefficient here. */ /* - Pointer to the dependent variable. + Pointers to the variables. */ const struct variable *depvar; + const struct variable **indep_vars; gsl_vector *residuals; struct pspp_coeff **coeff; @@ -175,7 +176,8 @@ typedef struct pspp_linreg_cache_struct pspp_linreg_cache; to it. n is the number of cases, p is the number of independent variables. */ -pspp_linreg_cache *pspp_linreg_cache_alloc (size_t n, size_t p); +pspp_linreg_cache *pspp_linreg_cache_alloc (const struct variable *, const struct variable **, + size_t, size_t); bool pspp_linreg_cache_free (void *); @@ -197,7 +199,7 @@ pspp_linreg_residual (const struct variable **, const union value **, /* All variables used in the model. */ -int pspp_linreg_get_vars (const void *, const struct variable **); +int pspp_linreg_get_vars (const void *, struct variable **); struct pspp_coeff *pspp_linreg_get_coeff (const pspp_linreg_cache *, @@ -214,4 +216,9 @@ void pspp_linreg_set_indep_variable_sd (pspp_linreg_cache *, const struct variab */ double pspp_linreg_get_indep_variable_mean (pspp_linreg_cache *, const struct variable *); void pspp_linreg_set_indep_variable_mean (pspp_linreg_cache *, const struct variable *, double); + +/* + Regression using only the covariance matrix. + */ +int pspp_linreg_with_cov (const struct design_matrix *, pspp_linreg_cache *); #endif diff --git a/src/math/np.c b/src/math/np.c new file mode 100644 index 0000000000..e189b47091 --- /dev/null +++ b/src/math/np.c @@ -0,0 +1,94 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include +#include "np.h" +#include +#include +#include +#include +#include +#include +#include +#include + +static void +destroy (struct statistic *stat) +{ + struct order_stats *os = (struct order_stats *) stat; + free (os); +} + + +static void +acc (struct statistic *s, const struct ccase *cx UNUSED, + double c, double cc, double y) +{ + struct ccase cp; + struct np *np = (struct np *) s; + double rank = np->prev_cc + (c + 1) / 2.0; + + double ns = gsl_cdf_ugaussian_Pinv (rank / ( np->n + 1 )); + + double z = (y - np->mean) / np->stddev; + + double dns = z - ns; + + maximize (&np->ns_max, ns); + minimize (&np->ns_min, ns); + + maximize (&np->dns_max, dns); + minimize (&np->dns_min, dns); + + maximize (&np->y_max, y); + minimize (&np->y_min, y); + + case_create (&cp, n_NP_IDX); + + case_data_rw_idx (&cp, NP_IDX_Y)->f = y; + case_data_rw_idx (&cp, NP_IDX_NS)->f = ns; + case_data_rw_idx (&cp, NP_IDX_DNS)->f = dns; + + casewriter_write (np->writer, &cp); + + np->prev_cc = cc; +} + +struct order_stats * +np_create (const struct moments1 *m) +{ + double variance; + struct np *np = xzalloc (sizeof (*np)); + struct statistic *stat = (struct statistic *) np; + struct order_stats *os = (struct order_stats *) np; + + np->prev_cc = 0; + + moments1_calculate (m, &np->n, &np->mean, &variance, NULL, NULL); + + np->stddev = sqrt (variance); + + np->y_min = np->ns_min = np->dns_min = DBL_MAX; + np->y_max = np->ns_max = np->dns_max = -DBL_MAX; + + np->writer = autopaging_writer_create (n_NP_IDX); + + os->k = 0; + stat->destroy = destroy; + stat->accumulate = acc; + + return os; +} diff --git a/src/math/np.h b/src/math/np.h new file mode 100644 index 0000000000..7db51f73b2 --- /dev/null +++ b/src/math/np.h @@ -0,0 +1,59 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#ifndef __NP_H__ +#define __NP_H__ + +#include "order-stats.h" + +struct moments1; +struct casewriter; + +enum + { + NP_IDX_Y = 0, + NP_IDX_NS, + NP_IDX_DNS, + n_NP_IDX + }; + +struct np +{ + struct order_stats parent; + + double n; + double mean; + double stddev; + + + double prev_cc; + + double ns_min; + double ns_max; + + double dns_min; + double dns_max; + + double y_min; + double y_max; + + struct casewriter *writer; +}; + + +struct order_stats * np_create (const struct moments1 *); + +#endif diff --git a/src/math/order-stats.c b/src/math/order-stats.c new file mode 100644 index 0000000000..ca4160f4fd --- /dev/null +++ b/src/math/order-stats.c @@ -0,0 +1,159 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include +#include "order-stats.h" +#include +#include +#include +#include +#include +#include + +#if 0 + +#include + +static void +order_stats_dump_k1 (const struct order_stats *os) +{ + struct k *k = &os->k[0]; + printf ("K1: tc %g; c %g cc %g ccp %g\n", + k->tc, k->c, k->cc, k->cc_p1); + +} + +static void +order_stats_dump_k2 (const struct order_stats *os) +{ + struct k *k = &os->k[1]; + printf ("K2: tc %g; c %g cc %g ccp %g\n", + k->tc, k->c, k->cc, k->cc_p1); +} + + +void +order_stats_dump (const struct order_stats *os) +{ + order_stats_dump_k1 (os); + order_stats_dump_k2 (os); +} + +#endif + +static void +update_k_lower (struct k *kk, + double y_i, double c_i, double cc_i) +{ + if ( cc_i <= kk->tc ) + { + kk->cc = cc_i; + kk->c = c_i; + kk->y = y_i; + } +} + + +static void +update_k_upper (struct k *kk, + double y_i, double c_i, double cc_i) +{ + if ( cc_i > kk->tc && kk->c_p1 == 0) + { + kk->cc_p1 = cc_i; + kk->c_p1 = c_i; + kk->y_p1 = y_i; + } +} + + +static void +update_k_values (const struct ccase *cx, double y_i, double c_i, double cc_i, + struct order_stats **os, size_t n_os) +{ + int j; + + for (j = 0 ; j < n_os ; ++j) + { + int k; + struct order_stats *tos = os[j]; + struct statistic *stat = (struct statistic *) tos; + for (k = 0 ; k < tos->n_k; ++k) + { + struct k *myk = &tos->k[k]; + update_k_lower (myk, y_i, c_i, cc_i); + update_k_upper (myk, y_i, c_i, cc_i); + } + + if ( stat->accumulate ) + stat->accumulate (stat, cx, c_i, cc_i, y_i); + + tos->cc = cc_i; + } +} + + +void +order_stats_accumulate (struct order_stats **os, size_t nos, + struct casereader *reader, + const struct variable *wv, + const struct variable *var, + enum mv_class exclude) +{ + struct ccase cx; + struct ccase prev_cx; + double prev_value = -DBL_MAX; + + double cc_i = 0; + double c_i = 0; + + case_nullify (&prev_cx); + + for (; casereader_read (reader, &cx); case_destroy (&cx)) + { + const double weight = wv ? case_data (&cx, wv)->f : 1.0; + const double this_value = case_data (&cx, var)->f; + + /* The casereader MUST be sorted */ + assert (this_value >= prev_value); + + if ( var_is_value_missing (var, case_data (&cx, var), exclude)) + continue; + + case_destroy (&prev_cx); + + if ( prev_value == -DBL_MAX || prev_value == this_value) + c_i += weight; + + if ( prev_value > -DBL_MAX && this_value > prev_value) + { + update_k_values (&prev_cx, prev_value, c_i, cc_i, os, nos); + c_i = weight; + } + + cc_i += weight; + prev_value = this_value; + case_clone (&prev_cx, &cx); + } + + update_k_values (&prev_cx, prev_value, c_i, cc_i, os, nos); + case_destroy (&prev_cx); + + casereader_destroy (reader); +} + + + diff --git a/src/math/order-stats.h b/src/math/order-stats.h new file mode 100644 index 0000000000..cea50ed80d --- /dev/null +++ b/src/math/order-stats.h @@ -0,0 +1,60 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2004, 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#ifndef __ORDER_STATS_H__ +#define __ORDER_STATS_H__ + +#include +#include + +struct casereader; +struct variable; + +/* + cc <= tc < cc_p1 +*/ +struct k +{ + double tc; + double cc; + double cc_p1; + double c; + double c_p1; + double y; + double y_p1; +}; + + +struct order_stats +{ + struct statistic parent; + int n_k; + struct k *k; + + double cc; +}; + +enum mv_class; + +void order_stats_dump (const struct order_stats *os); + +void order_stats_accumulate (struct order_stats **ptl, size_t nos, + struct casereader *reader, + const struct variable *wv, + const struct variable *var, + enum mv_class exclude); + +#endif diff --git a/src/math/percentiles.c b/src/math/percentiles.c index aa7eead6c0..bf99de163f 100644 --- a/src/math/percentiles.c +++ b/src/math/percentiles.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2004 Free Software Foundation, Inc. + Copyright (C) 2008 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -15,25 +15,19 @@ along with this program. If not, see . */ #include -#include -#include -#include -#include "factor-stats.h" #include "percentiles.h" -#include +#include -#include "minmax.h" #include "gettext.h" #define _(msgid) gettext (msgid) #define N_(msgid) msgid -struct ptile_params -{ - double g1, g1_star; - double g2, g2_star; - int k1, k2; -}; +#include +#include +#include +#include +#include const char *const ptile_alg_desc[] = { @@ -47,380 +41,151 @@ const char *const ptile_alg_desc[] = { - -/* Individual Percentile algorithms */ - -/* Closest observation to tc1 */ -double ptile_round(const struct weighted_value **wv, - const struct ptile_params *par); - - -/* Weighted average at y_tc2 */ -double ptile_haverage(const struct weighted_value **wv, - const struct ptile_params *par); - - -/* Weighted average at y_tc1 */ -double ptile_waverage(const struct weighted_value **wv, - const struct ptile_params *par); - - -/* Empirical distribution function */ -double ptile_empirical(const struct weighted_value **wv, - const struct ptile_params *par); - - -/* Empirical distribution function with averaging*/ -double ptile_aempirical(const struct weighted_value **wv, - const struct ptile_params *par); - - - - -/* Closest observation to tc1 */ double -ptile_round(const struct weighted_value **wv, - const struct ptile_params *par) +percentile_calculate (const struct percentile *ptl, enum pc_alg alg) { - double x; - double a=0; + struct percentile *mutable = (struct percentile *) ptl; + const struct order_stats *os = &ptl->parent; - if ( par->k1 >= 0 ) - a = wv[par->k1]->v.f; + assert (os->cc == ptl->w); - if ( wv[par->k1 + 1]->w >= 1 ) - { - if ( par->g1_star < 0.5 ) - x = a; - else - x = wv[par->k1 + 1]->v.f; - } - else - { - if ( par->g1 < 0.5 ) - x = a; - else - x = wv[par->k1 + 1]->v.f; - - } - - return x; -} + if ( ptl->g1 == SYSMIS) + mutable->g1 = (os->k[0].tc - os->k[0].cc) / os->k[0].c_p1; -/* Weighted average at y_tc2 */ -double -ptile_haverage(const struct weighted_value **wv, - const struct ptile_params *par) -{ - - double a=0; + if ( ptl->g1_star == SYSMIS) + mutable->g1_star = os->k[0].tc - os->k[0].cc; - if ( par->g2_star >= 1.0 ) - return wv[par->k2 + 1]->v.f ; - - /* Special case for k2 + 1 >= n_data - (actually it's not a special case, but just avoids indexing errors ) - */ - if ( par->g2_star == 0 ) + if ( ptl->g2 == SYSMIS) { - assert(par->g2 == 0 ); - return wv[par->k2]->v.f; - } - - /* Ditto for k2 < 0 */ - if ( par->k2 >= 0 ) - { - a = wv[par->k2]->v.f; + if ( os->k[1].c == 0 ) + mutable->g2 = os->k[1].tc / os->k[1].c_p1; + else if ( os->k[1].c_p1 == 0 ) + mutable->g2 = 0; + else + mutable->g2 = (os->k[1].tc - os->k[1].cc) / os->k[1].c_p1; } - if ( wv[par->k2 + 1]->w >= 1.0 ) - return ( (1 - par->g2_star) * a + - par->g2_star * wv[par->k2 + 1]->v.f); - else - return ( (1 - par->g2) * a + - par->g2 * wv[par->k2 + 1]->v.f); - -} - - - -/* Weighted average at y_tc1 */ -double -ptile_waverage(const struct weighted_value **wv, - const struct ptile_params *par) -{ - double a=0; - - if ( par->g1_star >= 1.0 ) - return wv[par->k1 + 1]->v.f ; - - if ( par->k1 >= 0 ) + if ( ptl->g2_star == SYSMIS) { - a = wv[par->k1]->v.f; + if ( os->k[1].c == 0 ) + mutable->g2_star = os->k[1].tc; + else if ( os->k[1].c_p1 == 0 ) + mutable->g2_star = 0; + else + mutable->g2_star = os->k[1].tc - os->k[1].cc; } - if ( wv[par->k1 + 1]->w >= 1.0 ) - return ( (1 - par->g1_star) * a + - par->g1_star * wv[par->k1 + 1]->v.f); - else - return ( (1 - par->g1) * a + - par->g1 * wv[par->k1 + 1]->v.f); -} - - -/* Empirical distribution function */ -double -ptile_empirical(const struct weighted_value **wv, - const struct ptile_params *par) -{ - if ( par->g1_star > 0 ) - return wv[par->k1 + 1]->v.f; - else - return wv[par->k1]->v.f; -} - - - -/* Empirical distribution function with averageing */ -double -ptile_aempirical(const struct weighted_value **wv, - const struct ptile_params *par) -{ - if ( par->g1_star > 0 ) - return wv[par->k1 + 1]->v.f; - else - return (wv[par->k1]->v.f + wv[par->k1 + 1]->v.f ) / 2.0 ; -} - - - -/* Compute the percentile p */ -double ptile(double p, - const struct weighted_value **wv, - int n_data, - double w, - enum pc_alg algorithm); - - - -double -ptile(double p, - const struct weighted_value **wv, - int n_data, - double w, - enum pc_alg algorithm) -{ - int i; - double tc1, tc2; - double result; - - struct ptile_params pp; - - assert( p <= 1.0); - - tc1 = w * p ; - tc2 = (w + 1) * p ; - - pp.k1 = -1; - pp.k2 = -1; - - for ( i = 0 ; i < n_data ; ++i ) + switch (alg) { - if ( wv[i]->cc <= tc1 ) - pp.k1 = i; - - if ( wv[i]->cc <= tc2 ) - pp.k2 = i; + case PC_WAVERAGE: + if ( ptl->g1_star >= 1.0) + return os->k[0].y_p1; + else + { + double a = ( os->k[0].y == SYSMIS ) ? 0 : os->k[0].y; - } + if (os->k[0].c_p1 >= 1.0) + return (1 - ptl->g1_star) * a + ptl->g1_star * os->k[0].y_p1; + else + return (1 - ptl->g1) * a + ptl->g1 * os->k[0].y_p1; + } + break; + case PC_ROUND: + { + double a = ( os->k[0].y == SYSMIS ) ? 0 : os->k[0].y; - if ( pp.k1 >= 0 ) - { - pp.g1 = ( tc1 - wv[pp.k1]->cc ) / wv[pp.k1 + 1]->w; - pp.g1_star = tc1 - wv[pp.k1]->cc ; - } - else - { - pp.g1 = tc1 / wv[pp.k1 + 1]->w; - pp.g1_star = tc1 ; - } + if (os->k[0].c_p1 >= 1.0) + return (ptl->g1_star < 0.5) ? a : os->k[0].y_p1; + else + return (ptl->g1 < 0.5) ? a : os->k[0].y_p1; + } + break; + case PC_EMPIRICAL: + if ( ptl->g1_star == 0 ) + return os->k[0].y; + else + return os->k[0].y_p1; + break; - if ( pp.k2 + 1 >= n_data ) - { - pp.g2 = 0 ; - pp.g2_star = 0; - } - else - { - if ( pp.k2 >= 0 ) + case PC_HAVERAGE: + if ( ptl->g2_star >= 1.0) { - pp.g2 = ( tc2 - wv[pp.k2]->cc ) / wv[pp.k2 + 1]->w; - pp.g2_star = tc2 - wv[pp.k2]->cc ; + return os->k[1].y_p1; } else { - pp.g2 = tc2 / wv[pp.k2 + 1]->w; - pp.g2_star = tc2 ; + double a = ( os->k[1].y == SYSMIS ) ? 0 : os->k[1].y; + + if ( os->k[1].c_p1 >= 1.0) + { + if ( ptl->g2_star == 0) + return os->k[1].y; + + return (1 - ptl->g2_star) * a + ptl->g2_star * os->k[1].y_p1; + } + else + { + return (1 - ptl->g2) * a + ptl->g2 * os->k[1].y_p1; + } } - } - switch ( algorithm ) - { - case PC_HAVERAGE: - result = ptile_haverage(wv, &pp); - break; - case PC_WAVERAGE: - result = ptile_waverage(wv, &pp); - break; - case PC_ROUND: - result = ptile_round(wv, &pp); - break; - case PC_EMPIRICAL: - result = ptile_empirical(wv, &pp); break; + case PC_AEMPIRICAL: - result = ptile_aempirical(wv, &pp); + if ( ptl->g1_star == 0 ) + return (os->k[0].y + os->k[0].y_p1)/ 2.0; + else + return os->k[0].y_p1; break; + default: - result = SYSMIS; + NOT_REACHED (); + break; } - return result; + NOT_REACHED (); + + return SYSMIS; } -/* - Calculate the values of the percentiles in pc_hash. - wv is a sorted array of weighted values of the data set. -*/ -void -ptiles(struct hsh_table *pc_hash, - const struct weighted_value **wv, - int n_data, - double w, - enum pc_alg algorithm) +static void +destroy (struct statistic *stat) { - struct hsh_iterator hi; - struct percentile *p; - - if ( !pc_hash ) - return ; - for ( p = hsh_first(pc_hash, &hi); - p != 0 ; - p = hsh_next(pc_hash, &hi)) - { - p->v = ptile(p->p/100.0 , wv, n_data, w, algorithm); - } - + struct order_stats *os = (struct order_stats *) stat; + free (os->k); + free (os); } -/* Calculate Tukey's Hinges */ -void -tukey_hinges(const struct weighted_value **wv, - int n_data, - double w, - double hinge[3] - ) +struct order_stats * +percentile_create (double p, double W) { - int i; - double c_star = DBL_MAX; - double d; - double l[3]; - int h[3]; - double a, a_star; - - for ( i = 0 ; i < n_data ; ++i ) - { - c_star = MIN(c_star, wv[i]->w); - } - - if ( c_star > 1 ) c_star = 1; - - d = floor((w/c_star + 3 ) / 2.0)/ 2.0; - - l[0] = d*c_star; - l[1] = w/2.0 + c_star/2.0; - l[2] = w + c_star - d*c_star; - - h[0]=-1; - h[1]=-1; - h[2]=-1; - - for ( i = 0 ; i < n_data ; ++i ) - { - if ( l[0] >= wv[i]->cc ) h[0] = i ; - if ( l[1] >= wv[i]->cc ) h[1] = i ; - if ( l[2] >= wv[i]->cc ) h[2] = i ; - } - - for ( i = 0 ; i < 3 ; i++ ) - { - - if ( h[i] >= 0 ) - a_star = l[i] - wv[h[i]]->cc ; - else - a_star = l[i]; - - if ( h[i] + 1 >= n_data ) - { - assert( a_star < 1 ) ; - hinge[i] = (1 - a_star) * wv[h[i]]->v.f; - continue; - } - else - { - a = a_star / ( wv[h[i] + 1]->cc ) ; - } - - if ( a_star >= 1.0 ) - { - hinge[i] = wv[h[i] + 1]->v.f ; - continue; - } - - if ( wv[h[i] + 1]->w >= 1) - { - hinge[i] = ( 1 - a_star) * wv[h[i]]->v.f - + a_star * wv[h[i] + 1]->v.f; - - continue; - } + struct percentile *ptl = xzalloc (sizeof (*ptl)); + struct order_stats *os = (struct order_stats *) ptl; + struct statistic *stat = (struct statistic *) ptl; - hinge[i] = (1 - a) * wv[h[i]]->v.f + a * wv[h[i] + 1]->v.f; + assert (p >= 0); + assert (p <= 1.0); - } - - assert(hinge[0] <= hinge[1]); - assert(hinge[1] <= hinge[2]); + ptl->ptile = p; + ptl->w = W; -} + os->n_k = 2; + os->k = xcalloc (sizeof (*os->k), 2); + os->k[0].tc = W * p; + os->k[1].tc = (W + 1.0) * p; + ptl->g1 = ptl->g1_star = SYSMIS; + ptl->g2 = ptl->g2_star = SYSMIS; -int -ptile_compare(const struct percentile *p1, - const struct percentile *p2, - void *aux UNUSED) -{ - - int cmp; + os->k[1].y_p1 = os->k[1].y = SYSMIS; + os->k[0].y_p1 = os->k[0].y = SYSMIS; - if ( p1->p == p2->p) - cmp = 0 ; - else if (p1->p < p2->p) - cmp = -1 ; - else - cmp = +1; + stat->destroy = destroy; - return cmp; + return os; } -unsigned -ptile_hash(const struct percentile *p, void *aux UNUSED) -{ - return hsh_hash_double(p->p); -} - - diff --git a/src/math/percentiles.h b/src/math/percentiles.h index 9e0eb47a46..0dd0982094 100644 --- a/src/math/percentiles.h +++ b/src/math/percentiles.h @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2004 Free Software Foundation, Inc. + Copyright (C) 2004, 2008 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,13 +14,12 @@ You should have received a copy of the GNU General Public License along with this program. If not, see . */ -#ifndef PERCENTILES_H -#define PERCENTILES_H +#ifndef __PERCENTILES_H__ +#define __PERCENTILES_H__ +#include -#include - -struct weighted_value ; +#include "order-stats.h" /* The algorithm used to calculate percentiles */ enum pc_alg { @@ -32,48 +31,33 @@ enum pc_alg { PC_AEMPIRICAL } ; - - extern const char *const ptile_alg_desc[]; +struct percentile +{ + struct order_stats parent; + double ptile; + double w; -struct percentile { + /* Mutable */ + double g1; + double g1_star; - /* The break point of the percentile */ - double p; - - /* The value of the percentile */ - double v; + double g2; + double g2_star; }; - -/* Calculate the percentiles of the break points in pc_bp, - placing the values in pc_val. - wv is a sorted array of weighted values of the data set. +/* Create the Pth percentile. + W is the total sum of weights in the data set */ -void ptiles(struct hsh_table *pc_hash, - const struct weighted_value **wv, - int n_data, - double w, - enum pc_alg algorithm); - - -/* Calculate Tukey's Hinges and the Whiskers for the box plot*/ -void tukey_hinges(const struct weighted_value **wv, - int n_data, - double w, - double hinges[3]); - - +struct order_stats *percentile_create (double p, double W); -/* Hash utility functions */ -int ptile_compare(const struct percentile *p1, - const struct percentile *p2, - void *aux); +/* Return the value of the percentile */ +double percentile_calculate (const struct percentile *ptl, enum pc_alg alg); -unsigned ptile_hash(const struct percentile *p, void *aux); +void percentile_dump (const struct percentile *ptl); #endif diff --git a/src/math/statistic.h b/src/math/statistic.h new file mode 100644 index 0000000000..987264b1f9 --- /dev/null +++ b/src/math/statistic.h @@ -0,0 +1,40 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#ifndef __STATISTIC_H__ +#define __STATISTIC_H__ + +#include + +struct ccase ; + +struct statistic +{ + void (*accumulate) (struct statistic *, const struct ccase *cx, double c, double cc, double y); + void (*destroy) (struct statistic *); +}; + +static inline void statistic_destroy (struct statistic *s); + + +static inline void +statistic_destroy (struct statistic *s) +{ + if (s) s->destroy (s); +} + + +#endif diff --git a/src/math/trimmed-mean.c b/src/math/trimmed-mean.c new file mode 100644 index 0000000000..da3d4240e5 --- /dev/null +++ b/src/math/trimmed-mean.c @@ -0,0 +1,94 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include +#include "trimmed-mean.h" +#include + +#include +#include +#include +#include + + +static void +acc (struct statistic *s, const struct ccase *cx UNUSED, double c, double cc, double y) +{ + struct trimmed_mean *tm = (struct trimmed_mean *) s; + struct order_stats *os = (struct order_stats *) s; + + if ( cc > os->k[0].tc && cc < os->k[1].tc) + tm->sum += c * y; + + if ( tm->cyk1p1 == SYSMIS && cc >os->k[0].tc) + tm->cyk1p1 = c * y; +} + +static void +destroy (struct statistic *s) +{ + struct order_stats *os = (struct order_stats *) s; + free (os->k); + free (s); +} + +struct statistic * +trimmed_mean_create (double W, double tail) +{ + struct trimmed_mean *tm = xzalloc (sizeof (*tm)); + struct order_stats *os = (struct order_stats *) tm; + struct statistic *stat = (struct statistic *) tm; + + os->n_k = 2; + os->k = xcalloc (sizeof (*os->k), 2); + + assert (tail >= 0); + assert (tail <= 1); + + os->k[0].tc = tail * W; + os->k[1].tc = W * (1 - tail); + + stat->accumulate = acc; + stat->destroy = destroy; + + tm->cyk1p1 = SYSMIS; + tm->w = W; + tm->tail = tail; + + return stat; +} + + +double +trimmed_mean_calculate (const struct trimmed_mean *tm) +{ + const struct order_stats *os = (const struct order_stats *) tm; + + assert (os->cc == tm->w); + + return + ( + (os->k[0].cc_p1 - os->k[0].tc) * os->k[0].y_p1 + - + (os->k[1].cc - os->k[1].tc) * os->k[1].y_p1 + + + tm->sum + - + tm->cyk1p1 + ) + / + ( (1.0 - 2 * tm->tail) * tm->w); +} diff --git a/src/math/trimmed-mean.h b/src/math/trimmed-mean.h new file mode 100644 index 0000000000..9339cab983 --- /dev/null +++ b/src/math/trimmed-mean.h @@ -0,0 +1,42 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#ifndef __TRIMMED_MEAN_H__ +#define __TRIMMED_MEAN_H__ + +#include + +#include "order-stats.h" + + +struct trimmed_mean +{ + struct order_stats parent; + + /* The partial sum */ + double sum; + + /* The product of c_{k_1+1} and y_{k_1 + 1} */ + double cyk1p1; + + double w; + double tail; +}; + +struct statistic * trimmed_mean_create (double W, double c_min); +double trimmed_mean_calculate (const struct trimmed_mean *); + +#endif diff --git a/src/math/ts/automake.mk b/src/math/ts/automake.mk index 2ca1f959fa..eecc53a81f 100644 --- a/src/math/ts/automake.mk +++ b/src/math/ts/automake.mk @@ -1,8 +1,8 @@ ## Process this file with automake to produce Makefile.in -*- makefile -*- -noinst_LIBRARIES += src/math/ts/libpspp_ts.a +noinst_LTLIBRARIES += src/math/ts/libpspp_ts.la -src_math_ts_libpspp_ts_a_SOURCES = \ +src_math_ts_libpspp_ts_la_SOURCES = \ src/math/ts/innovations.c \ src/math/ts/innovations.h diff --git a/src/math/ts/innovations.c b/src/math/ts/innovations.c index 553e20e8f7..b9a7cf2234 100644 --- a/src/math/ts/innovations.c +++ b/src/math/ts/innovations.c @@ -160,7 +160,7 @@ innovations_update_scale (struct innovations_estimate *est, double *theta, for (j = 0; j < i; j++) { k = i - j - 1; - result -= theta[k] * theta[k] * est->scale[j]; + result -= pow2 (theta[k]) * est->scale[j]; } est->scale[i] = result; } diff --git a/src/math/tukey-hinges.c b/src/math/tukey-hinges.c new file mode 100644 index 0000000000..95a79c1d30 --- /dev/null +++ b/src/math/tukey-hinges.c @@ -0,0 +1,101 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include +#include "tukey-hinges.h" +#include + +#include +#include +#include + +void +tukey_hinges_calculate (const struct tukey_hinges *th, double hinge[3]) +{ + double a[3]; + double a_star[3]; + int i; + const struct order_stats *os = &th->parent; + + for (i = 0 ; i < 3 ; ++i) + { + a_star[i] = os->k[i].tc - os->k[i].cc; + a[i] = a_star[i] / os->k[i].c_p1; + + if (a_star[i] < 1) + { + if (os->k[i].c_p1 >= 1 ) + { + hinge[i] = (1 - a_star[i]) * os->k[i].y + + a_star[i] * os->k[i].y_p1; + } + else + { + hinge[i] = (1 - a[i]) * os->k[i].y + + a[i] * os->k[i].y_p1; + } + } + else + { + hinge[i] = os->k[i].y_p1; + } + + } +} + +static void +destroy (struct statistic *s) +{ + struct order_stats *os = (struct order_stats *) s; + + free (os->k); + free (s); +}; + +struct statistic * +tukey_hinges_create (double W, double c_min) +{ + double d; + struct tukey_hinges *th = xzalloc (sizeof (*th)); + struct order_stats *os = (struct order_stats *) th; + struct statistic *stat = (struct statistic *) th; + + assert (c_min >= 0); + + os->n_k = 3; + os->k = xcalloc (sizeof (*os->k), 3); + + if ( c_min >= 1.0) + { + d = floor ((W + 3) / 2.0) / 2.0; + + os->k[0].tc = d; + os->k[1].tc = W/2.0 + 0.5; + os->k[2].tc = W + 1 - d; + } + else + { + d = floor ((W/c_min + 3.0)/ 2.0) / 2.0 ; + os->k[0].tc = d * c_min; + os->k[1].tc = (W + c_min) / 2.0; + os->k[2].tc = W + c_min * (1 - d); + } + + + stat->destroy = destroy; + + return stat; +} diff --git a/src/math/tukey-hinges.h b/src/math/tukey-hinges.h new file mode 100644 index 0000000000..d87691f8b0 --- /dev/null +++ b/src/math/tukey-hinges.h @@ -0,0 +1,37 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#ifndef __TUKEY_HINGES_H__ +#define __TUKEY_HINGES_H__ + +#include + +#include "order-stats.h" + + +struct tukey_hinges +{ + struct order_stats parent; +}; + +struct statistic * tukey_hinges_create (double W, double c_min); + + +void tukey_hinges_calculate (const struct tukey_hinges *h, double hinge[3]); + + + +#endif diff --git a/src/output/automake.mk b/src/output/automake.mk index d30acf63e5..ec92c559e0 100644 --- a/src/output/automake.mk +++ b/src/output/automake.mk @@ -3,7 +3,7 @@ include $(top_srcdir)/src/output/charts/automake.mk -noinst_LIBRARIES += src/output/liboutput.a +noinst_LTLIBRARIES += src/output/liboutput.la output_sources = \ src/output/afm.c \ @@ -23,11 +23,11 @@ output_sources = \ if WITHCHARTS -src_output_liboutput_a_SOURCES = $(output_sources) src/output/chart.c +src_output_liboutput_la_SOURCES = $(output_sources) src/output/chart.c EXTRA_DIST += src/output/dummy-chart.c else -src_output_liboutput_a_SOURCES = $(output_sources) src/output/dummy-chart.c +src_output_liboutput_la_SOURCES = $(output_sources) src/output/dummy-chart.c EXTRA_DIST += src/output/chart.c endif diff --git a/src/output/charts/automake.mk b/src/output/charts/automake.mk index a480de6da9..ab0ff51047 100644 --- a/src/output/charts/automake.mk +++ b/src/output/charts/automake.mk @@ -1,6 +1,6 @@ ## Process this file with automake to produce Makefile.in -*- makefile -*- -noinst_LIBRARIES += src/output/charts/libcharts.a +noinst_LTLIBRARIES += src/output/charts/libcharts.la chart_sources = \ src/output/charts/barchart.c \ @@ -17,12 +17,12 @@ chart_sources = \ src/output/charts/plot-hist.h if WITHCHARTS -src_output_charts_libcharts_a_SOURCES = \ +src_output_charts_libcharts_la_SOURCES = \ $(chart_sources) EXTRA_DIST += src/output/charts/dummy-chart.c else -src_output_charts_libcharts_a_SOURCES = \ +src_output_charts_libcharts_la_SOURCES = \ src/output/charts/dummy-chart.c EXTRA_DIST += $(chart_sources) diff --git a/src/output/charts/box-whisker.c b/src/output/charts/box-whisker.c index d4a5ccab6c..c3641580e0 100644 --- a/src/output/charts/box-whisker.c +++ b/src/output/charts/box-whisker.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2004 Free Software Foundation, Inc. + Copyright (C) 2004, 2008 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -26,193 +26,134 @@ #include #include -#include - - - +#include /* Draw a box-and-whiskers plot */ -/* Draw an outlier on the plot CH +/* Draw an OUTLIER on the plot CH * at CENTRELINE - * The outlier is in (*wvp)[idx] - * If EXTREME is non zero, then consider it to be an extreme - * value */ -void -draw_outlier(struct chart *ch, double centreline, - struct weighted_value **wvp, - int idx, - short extreme); - - -void -draw_outlier(struct chart *ch, double centreline, - struct weighted_value **wvp, - int idx, - short extreme - ) +static void +draw_case (struct chart *ch, double centreline, + const struct outlier *outlier) { - char label[10]; #define MARKER_CIRCLE 4 #define MARKER_STAR 3 pl_fmarker_r(ch->lp, centreline, - ch->data_bottom + - (wvp[idx]->v.f - ch->y_min ) * ch->ordinate_scale, - extreme?MARKER_STAR:MARKER_CIRCLE, + ch->data_bottom + (outlier->value - ch->y_min) * ch->ordinate_scale, + outlier->extreme ? MARKER_STAR : MARKER_CIRCLE, 20); pl_moverel_r(ch->lp, 10,0); - snprintf(label, 10, "%d", wvp[idx]->case_nos->num); - - pl_alabel_r(ch->lp, 'l', 'c', label); - + pl_alabel_r(ch->lp, 'l', 'c', ds_cstr (&outlier->label)); } void -boxplot_draw_boxplot(struct chart *ch, - double box_centre, - double box_width, - struct metrics *m, - const char *name) +boxplot_draw_boxplot (struct chart *ch, + double box_centre, + double box_width, + const struct box_whisker *bw, + const char *name) { double whisker[2]; - int i; - - const double *hinge = m->hinge; - struct weighted_value **wvp = m->wvp; - const int n_data = m->n_data; - - const double step = (hinge[2] - hinge[0]) * 1.5; + double hinge[3]; + struct ll *ll; + const struct ll_list *outliers; const double box_left = box_centre - box_width / 2.0; const double box_right = box_centre + box_width / 2.0; + double box_bottom ; + double box_top ; + double bottom_whisker ; + double top_whisker ; - const double box_bottom = - ch->data_bottom + ( hinge[0] - ch->y_min ) * ch->ordinate_scale; - - - const double box_top = - ch->data_bottom + ( hinge[2] - ch->y_min ) * ch->ordinate_scale; - - assert(m); + box_whisker_whiskers (bw, whisker); + box_whisker_hinges (bw, hinge); - /* Can't really draw a boxplot if there's no data */ - if ( n_data == 0 ) - return ; + box_bottom = ch->data_bottom + (hinge[0] - ch->y_min ) * ch->ordinate_scale; - whisker[1] = hinge[2]; - whisker[0] = wvp[0]->v.f; + box_top = ch->data_bottom + (hinge[2] - ch->y_min ) * ch->ordinate_scale; - for ( i = 0 ; i < n_data ; ++i ) - { - if ( hinge[2] + step > wvp[i]->v.f) - whisker[1] = wvp[i]->v.f; - - if ( hinge[0] - step > wvp[i]->v.f) - whisker[0] = wvp[i]->v.f; - - } - - { - const double bottom_whisker = - ch->data_bottom + ( whisker[0] - ch->y_min ) * ch->ordinate_scale; - - const double top_whisker = - ch->data_bottom + ( whisker[1] - ch->y_min ) * ch->ordinate_scale; + bottom_whisker = ch->data_bottom + (whisker[0] - ch->y_min) * + ch->ordinate_scale; + top_whisker = ch->data_bottom + (whisker[1] - ch->y_min) * ch->ordinate_scale; pl_savestate_r(ch->lp); - /* Draw the box */ - pl_savestate_r(ch->lp); - pl_fillcolorname_r(ch->lp,ch->fill_colour); - pl_filltype_r(ch->lp,1); - pl_fbox_r(ch->lp, + pl_savestate_r (ch->lp); + pl_fillcolorname_r (ch->lp, ch->fill_colour); + pl_filltype_r (ch->lp,1); + pl_fbox_r (ch->lp, box_left, box_bottom, box_right, box_top); - pl_restorestate_r(ch->lp); - - + pl_restorestate_r (ch->lp); /* Draw the median */ - pl_savestate_r(ch->lp); - pl_linewidth_r(ch->lp,5); - pl_fline_r(ch->lp, + pl_savestate_r (ch->lp); + pl_linewidth_r (ch->lp, 5); + pl_fline_r (ch->lp, box_left, - ch->data_bottom + ( hinge[1] - ch->y_min ) * ch->ordinate_scale, + ch->data_bottom + (hinge[1] - ch->y_min) * ch->ordinate_scale, box_right, - ch->data_bottom + ( hinge[1] - ch->y_min ) * ch->ordinate_scale); - pl_restorestate_r(ch->lp); - + ch->data_bottom + (hinge[1] - ch->y_min) * ch->ordinate_scale); + pl_restorestate_r (ch->lp); /* Draw the bottom whisker */ - pl_fline_r(ch->lp, + pl_fline_r (ch->lp, box_left, bottom_whisker, box_right, bottom_whisker); /* Draw top whisker */ - pl_fline_r(ch->lp, + pl_fline_r (ch->lp, box_left, top_whisker, box_right, top_whisker); - /* Draw centre line. (bottom half) */ - pl_fline_r(ch->lp, + pl_fline_r (ch->lp, box_centre, bottom_whisker, box_centre, box_bottom); /* (top half) */ - pl_fline_r(ch->lp, + pl_fline_r (ch->lp, box_centre, top_whisker, box_centre, box_top); - } - /* Draw outliers */ - for ( i = 0 ; i < n_data ; ++i ) + outliers = box_whisker_outliers (bw); + for (ll = ll_head (outliers); + ll != ll_null (outliers); ll = ll_next (ll)) { - if ( wvp[i]->v.f >= hinge[2] + step ) - draw_outlier(ch, box_centre, wvp, i, - ( wvp[i]->v.f > hinge[2] + 2 * step ) - ); - - if ( wvp[i]->v.f <= hinge[0] - step ) - draw_outlier(ch, box_centre, wvp, i, - ( wvp[i]->v.f < hinge[0] - 2 * step ) - ); + const struct outlier *outlier = ll_data (ll, struct outlier, ll); + draw_case (ch, box_centre, outlier); } - /* Draw tick mark on x axis */ draw_tick(ch, TICK_ABSCISSA, box_centre - ch->data_left, name); pl_restorestate_r(ch->lp); - } - - void -boxplot_draw_yscale(struct chart *ch , double y_max, double y_min) +boxplot_draw_yscale (struct chart *ch, double y_max, double y_min) { double y_tick; double d; @@ -223,7 +164,7 @@ boxplot_draw_yscale(struct chart *ch , double y_max, double y_min) ch->y_max = y_max; ch->y_min = y_min; - y_tick = chart_rounded_tick(fabs(ch->y_max - ch->y_min) / 5.0); + y_tick = chart_rounded_tick (fabs(ch->y_max - ch->y_min) / 5.0); ch->y_min = (ceil( ch->y_min / y_tick ) - 1.0 ) * y_tick; @@ -232,7 +173,6 @@ boxplot_draw_yscale(struct chart *ch , double y_max, double y_min) ch->ordinate_scale = fabs(ch->data_top - ch->data_bottom) / fabs(ch->y_max - ch->y_min) ; - /* Move to data bottom-left */ pl_move_r(ch->lp, ch->data_left, ch->data_bottom); @@ -241,5 +181,4 @@ boxplot_draw_yscale(struct chart *ch , double y_max, double y_min) { draw_tick (ch, TICK_ORDINATE, (d - ch->y_min ) * ch->ordinate_scale, "%g", d); } - } diff --git a/src/output/charts/box-whisker.h b/src/output/charts/box-whisker.h index 656d8d49b3..7b2c4b8fdc 100644 --- a/src/output/charts/box-whisker.h +++ b/src/output/charts/box-whisker.h @@ -18,28 +18,15 @@ #define BOX_WHISKER_H struct chart ; -struct weighted_value; -struct metrics; +struct box_whisker; -/* Draw an outlier on the plot CH - * at CENTRELINE - * The outlier is in (*wvp)[idx] - * If EXTREME is non zero, then consider it to be an extreme - * value - */ -void draw_outlier(struct chart *ch, double centreline, - struct weighted_value **wvp, - int idx, - short extreme); +void boxplot_draw_boxplot (struct chart *ch, + double box_centre, + double box_width, + const struct box_whisker *w, + const char *name); -void boxplot_draw_boxplot(struct chart *ch, - double box_centre, - double box_width, - struct metrics *m, - const char *name); - - -void boxplot_draw_yscale(struct chart *ch , double y_max, double y_min); +void boxplot_draw_yscale (struct chart *ch , double y_max, double y_min); #endif diff --git a/src/output/charts/plot-hist.c b/src/output/charts/plot-hist.c index 0f183208a2..e7dcd438d8 100644 --- a/src/output/charts/plot-hist.c +++ b/src/output/charts/plot-hist.c @@ -1,10 +1,10 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2004 Free Software Foundation, Inc. + Copyright (C) 2004 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. + (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -30,78 +30,79 @@ #include #include #include +#include +#include #include "gettext.h" #define _(msgid) gettext (msgid) /* Write the legend of the chart */ -void -histogram_write_legend(struct chart *ch, const struct normal_curve *norm) +static void +histogram_write_legend (struct chart *ch, double n, double mean, double stddev) { char buf[100]; - if ( !ch ) + + if (!ch) return ; - pl_savestate_r(ch->lp); + pl_savestate_r (ch->lp); - sprintf(buf,"N = %.2f",norm->N); - pl_move_r(ch->lp, ch->legend_left, ch->data_bottom); - pl_alabel_r(ch->lp,0,'b',buf); + sprintf (buf, "N = %.2f", n); + pl_move_r (ch->lp, ch->legend_left, ch->data_bottom); + pl_alabel_r (ch->lp, 0, 'b', buf); - sprintf(buf,"Mean = %.1f",norm->mean); - pl_fmove_r(ch->lp,ch->legend_left,ch->data_bottom + ch->font_size * 1.5); - pl_alabel_r(ch->lp,0,'b',buf); + sprintf (buf, "Mean = %.1f", mean); + pl_fmove_r (ch->lp,ch->legend_left,ch->data_bottom + ch->font_size * 1.5); + pl_alabel_r (ch->lp, 0, 'b', buf); - sprintf(buf,"Std. Dev = %.2f",norm->stddev); - pl_fmove_r(ch->lp,ch->legend_left,ch->data_bottom + ch->font_size * 1.5 * 2); - pl_alabel_r(ch->lp,0,'b',buf); + sprintf (buf, "Std. Dev = %.2f", stddev); + pl_fmove_r (ch->lp, ch->legend_left, ch->data_bottom + ch->font_size * 1.5 * 2); + pl_alabel_r (ch->lp, 0, 'b', buf); - pl_restorestate_r(ch->lp); + pl_restorestate_r (ch->lp); } -static void hist_draw_bar(struct chart *ch, const gsl_histogram *hist, int bar); +static void hist_draw_bar (struct chart *ch, const struct histogram *hist, int bar); static void -hist_draw_bar(struct chart *ch, const gsl_histogram *hist, int bar) +hist_draw_bar (struct chart *ch, const struct histogram *hist, int bar) { - if ( !ch ) + if (!ch) return ; - { double upper; double lower; double height; - const size_t bins = gsl_histogram_bins(hist); + const size_t bins = gsl_histogram_bins (hist->gsl_hist); const double x_pos = (ch->data_right - ch->data_left) * bar / (double) bins ; const double width = (ch->data_right - ch->data_left) / (double) bins ; + assert ( 0 == gsl_histogram_get_range (hist->gsl_hist, bar, &lower, &upper)); - assert ( 0 == gsl_histogram_get_range(hist, bar, &lower, &upper)); - - assert( upper >= lower); + assert ( upper >= lower); - height = gsl_histogram_get(hist, bar) * - (ch->data_top - ch->data_bottom) / gsl_histogram_max_val(hist); + height = gsl_histogram_get (hist->gsl_hist, bar) * + (ch->data_top - ch->data_bottom) / gsl_histogram_max_val (hist->gsl_hist); - pl_savestate_r(ch->lp); - pl_move_r(ch->lp,ch->data_left, ch->data_bottom); - pl_fillcolorname_r(ch->lp, ch->fill_colour); - pl_filltype_r(ch->lp,1); + pl_savestate_r (ch->lp); + pl_move_r (ch->lp,ch->data_left, ch->data_bottom); + pl_fillcolorname_r (ch->lp, ch->fill_colour); + pl_filltype_r (ch->lp,1); - pl_fboxrel_r(ch->lp, + pl_fboxrel_r (ch->lp, x_pos, 0, x_pos + width, height); - pl_restorestate_r(ch->lp); + pl_restorestate_r (ch->lp); { char buf[5]; - snprintf(buf,5,"%g",(upper + lower) / 2.0); - draw_tick(ch, TICK_ABSCISSA, + snprintf (buf,5,"%g", (upper + lower) / 2.0); + draw_tick (ch, TICK_ABSCISSA, x_pos + width / 2.0, buf); } } @@ -109,73 +110,85 @@ hist_draw_bar(struct chart *ch, const gsl_histogram *hist, int bar) +void +histogram_plot (const struct histogram *hist, + const char *label, + const struct moments1 *m) +{ + double mean, var, n; + + moments1_calculate (m, &n, &mean, &var, NULL, NULL); + + histogram_plot_n (hist, label, n, mean, sqrt(var), m); +} void -histogram_plot(const gsl_histogram *hist, - const char *factorname, - const struct normal_curve *norm, short show_normal) +histogram_plot_n (const struct histogram *hist, + const char *label, + double n, double mean, double stddev, + bool show_normal) { int i; int bins; - struct chart *ch; + struct chart *ch = chart_create (); - ch = chart_create(); - chart_write_title(ch, _("HISTOGRAM")); + chart_write_title (ch, _("HISTOGRAM")); - chart_write_ylabel(ch, _("Frequency")); - chart_write_xlabel(ch, factorname); + chart_write_ylabel (ch, _("Frequency")); + chart_write_xlabel (ch, label); if ( ! hist ) /* If this happens, probably all values are SYSMIS */ { - chart_submit(ch); - return ; + chart_submit (ch); + return; } else { - bins = gsl_histogram_bins(hist); + bins = gsl_histogram_bins (hist->gsl_hist); } - chart_write_yscale(ch, 0, gsl_histogram_max_val(hist), 5); + chart_write_yscale (ch, 0, gsl_histogram_max_val (hist->gsl_hist), 5); for ( i = 0 ; i < bins ; ++i ) - hist_draw_bar(ch, hist, i); + hist_draw_bar (ch, hist, i); - histogram_write_legend(ch, norm); + histogram_write_legend (ch, n, mean, stddev); - if ( show_normal ) - { - /* Draw the normal curve */ - - double d ; - double x_min, x_max, not_used ; - double abscissa_scale ; - double ordinate_scale ; - double range ; - - gsl_histogram_get_range(hist, 0, &x_min, ¬_used); - range = not_used - x_min; - gsl_histogram_get_range(hist, bins - 1, ¬_used, &x_max); - - abscissa_scale = (ch->data_right - ch->data_left) / (x_max - x_min); - ordinate_scale = (ch->data_top - ch->data_bottom) / - gsl_histogram_max_val(hist) ; - - pl_move_r(ch->lp, ch->data_left, ch->data_bottom); - for( d = ch->data_left; - d <= ch->data_right ; - d += (ch->data_right - ch->data_left) / 100.0) - { - const double x = (d - ch->data_left) / abscissa_scale + x_min ; - const double y = norm->N * range * - gsl_ran_gaussian_pdf(x - norm->mean, norm->stddev); - - pl_fcont_r(ch->lp, d, ch->data_bottom + y * ordinate_scale); - - } - pl_endpath_r(ch->lp); + if (show_normal) + { + /* Draw the normal curve */ + + double d ; + double x_min, x_max, not_used ; + double abscissa_scale ; + double ordinate_scale ; + double range ; + + gsl_histogram_get_range (hist->gsl_hist, 0, &x_min, ¬_used); + range = not_used - x_min; + gsl_histogram_get_range (hist->gsl_hist, bins - 1, ¬_used, &x_max); + + abscissa_scale = (ch->data_right - ch->data_left) / (x_max - x_min); + ordinate_scale = (ch->data_top - ch->data_bottom) / + gsl_histogram_max_val (hist->gsl_hist) ; + + pl_move_r (ch->lp, ch->data_left, ch->data_bottom); + for ( d = ch->data_left; + d <= ch->data_right ; + d += (ch->data_right - ch->data_left) / 100.0) + { + const double x = (d - ch->data_left) / abscissa_scale + x_min ; + const double y = n * range * + gsl_ran_gaussian_pdf (x - mean, stddev); + + pl_fcont_r (ch->lp, d, ch->data_bottom + y * ordinate_scale); + + } + pl_endpath_r (ch->lp); + } - } - chart_submit(ch); + chart_submit (ch); } + diff --git a/src/output/charts/plot-hist.h b/src/output/charts/plot-hist.h index c808cb4e57..606206d501 100644 --- a/src/output/charts/plot-hist.h +++ b/src/output/charts/plot-hist.h @@ -17,23 +17,23 @@ #ifndef PLOT_HIST_H #define PLOT_HIST_H -#include +#include - -struct normal_curve -{ - double N ; - double mean ; - double stddev ; -}; struct chart; +struct moments1; +struct histogram; + +/* Plot M onto histogram HIST and label it with LABEL */ +void histogram_plot (const struct histogram *hist, + const char *label, const struct moments1 *m); -/* Write the legend of the chart */ -void histogram_write_legend(struct chart *ch, const struct normal_curve *norm); -void histogram_plot(const gsl_histogram *hist, - const char *factorname, - const struct normal_curve *norm, short show_normal); +/* A wrapper aroud histogram_plot. + Don't use this function. It's legacy only */ +void histogram_plot_n (const struct histogram *hist, + const char *label, + double n, double mean, double var, + bool show_normal); #endif diff --git a/src/ui/automake.mk b/src/ui/automake.mk index 044c293c54..f1dde68a56 100644 --- a/src/ui/automake.mk +++ b/src/ui/automake.mk @@ -6,9 +6,9 @@ include $(top_srcdir)/src/ui/gui/automake.mk endif -noinst_LIBRARIES += src/ui/libuicommon.a +noinst_LTLIBRARIES += src/ui/libuicommon.la -src_ui_libuicommon_a_SOURCES = \ +src_ui_libuicommon_la_SOURCES = \ src/ui/debugger.c \ src/ui/debugger.h \ src/ui/syntax-gen.c \ diff --git a/src/ui/gui/automake.mk b/src/ui/gui/automake.mk index 64fe75e7cb..f77f88c793 100644 --- a/src/ui/gui/automake.mk +++ b/src/ui/gui/automake.mk @@ -53,19 +53,12 @@ src_ui_gui_psppire_LDADD = \ -dlopen src/ui/gui/libpsppire.la \ src/ui/gui/libpsppwidgets.la \ lib/gtksheet/libgtksheet.a \ - src/language/liblanguage.a \ - src/ui/libuicommon.a \ - src/output/charts/libcharts.a \ - src/output/liboutput.a \ - src/math/libpspp_math.a \ - lib/linreg/liblinreg.a \ - src/data/libdata.a \ - src/libpspp/libpspp.a \ + src/ui/libuicommon.la \ + src/libpspp.la \ + src/libpspp-core.la \ $(GTK_LIBS) \ $(GLADE_LIBS) \ - $(PG_LIBS) \ - gl/libgl.la \ - @LIBINTL@ @LIBREADLINE@ + @LIBINTL@ src_ui_gui_psppiredir = $(pkgdatadir) diff --git a/src/ui/gui/find-dialog.c b/src/ui/gui/find-dialog.c index 5d98b4ac91..7764d04a96 100644 --- a/src/ui/gui/find-dialog.c +++ b/src/ui/gui/find-dialog.c @@ -464,7 +464,7 @@ value_compare (const struct comparator *cmptr, const union value *v) { const struct value_comparator *vc = (const struct value_comparator *) cmptr; - return 0 == compare_values (v, vc->pattern, var_get_width (cmptr->var)); + return 0 == compare_values (v, vc->pattern, cmptr->var); } diff --git a/src/ui/gui/main.c b/src/ui/gui/main.c index eca3f98f68..f68f2ad66a 100644 --- a/src/ui/gui/main.c +++ b/src/ui/gui/main.c @@ -110,7 +110,7 @@ main (int argc, char *argv[]) GTK_MINOR_VERSION, GTK_MICRO_VERSION)) ) { - g_critical (vers); + g_warning (vers); } /* Deal with options like --version, --help etc */ diff --git a/src/ui/gui/psppire.c b/src/ui/gui/psppire.c index 239d153e4b..0467746333 100644 --- a/src/ui/gui/psppire.c +++ b/src/ui/gui/psppire.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "relocatable.h" @@ -122,6 +123,9 @@ initialize (void) journal_enable (); textdomain (PACKAGE); + /* Ignore alarm clock signals */ + signal (SIGALRM, SIG_IGN); + new_data_window (NULL, NULL); } diff --git a/src/ui/terminal/automake.mk b/src/ui/terminal/automake.mk index 928716c851..988f24d5d3 100644 --- a/src/ui/terminal/automake.mk +++ b/src/ui/terminal/automake.mk @@ -1,8 +1,8 @@ ## Process this file with automake to produce Makefile.in -*- makefile -*- -noinst_LIBRARIES += src/ui/terminal/libui.a +noinst_LTLIBRARIES += src/ui/terminal/libui.la -src_ui_terminal_libui_a_SOURCES = \ +src_ui_terminal_libui_la_SOURCES = \ src/ui/terminal/command-line.c \ src/ui/terminal/command-line.h \ src/ui/terminal/read-line.c \ @@ -13,33 +13,22 @@ src_ui_terminal_libui_a_SOURCES = \ src/ui/terminal/terminal.c \ src/ui/terminal/terminal.h -src_ui_terminal_libui_a_CFLAGS = -DINSTALLDIR=\"$(bindir)\" $(NCURSES_CFLAGS) +src_ui_terminal_libui_la_CFLAGS = -DINSTALLDIR=\"$(bindir)\" $(NCURSES_CFLAGS) bin_PROGRAMS += src/ui/terminal/pspp - src_ui_terminal_pspp_SOURCES = src_ui_terminal_pspp_LDADD = \ - src/ui/terminal/libui.a \ - src/language/liblanguage.a \ - src/output/charts/libcharts.a \ - src/output/liboutput.a \ - src/math/libpspp_math.a \ - src/ui/libuicommon.a \ - lib/linreg/liblinreg.a \ - src/data/libdata.a \ - src/libpspp/libpspp.a \ - $(LIBXML2_LIBS) \ - $(PG_LIBS) \ + src/ui/terminal/libui.la \ + src/ui/libuicommon.la \ + src/libpspp.la \ + src/libpspp-core.la \ $(NCURSES_LIBS) \ $(LIBICONV) \ - gl/libgl.la \ @LIBINTL@ @LIBREADLINE@ - - src_ui_terminal_pspp_LDFLAGS = $(PG_LDFLAGS) if RELOCATABLE_VIA_LD diff --git a/src/ui/terminal/main.c b/src/ui/terminal/main.c index b8cabb3acb..e91677b361 100644 --- a/src/ui/terminal/main.c +++ b/src/ui/terminal/main.c @@ -84,6 +84,7 @@ main (int argc, char **argv) signal (SIGABRT, bug_handler); signal (SIGSEGV, bug_handler); signal (SIGFPE, bug_handler); + signal (SIGALRM, SIG_IGN); at_fatal_signal (clean_up); i18n_init (); diff --git a/src/ui/terminal/msg-ui.c b/src/ui/terminal/msg-ui.c index b03efb95e5..682d753d8f 100644 --- a/src/ui/terminal/msg-ui.c +++ b/src/ui/terminal/msg-ui.c @@ -267,7 +267,7 @@ write_stream (int line_indent, struct substring line, void *stream_) /* Writes LINE to the journal. */ static void -write_journal (int line_indent, struct substring line, void *unused UNUSED) +write_journal (int line_indent UNUSED, struct substring line, void *unused UNUSED) { char *s = xstrndup (ss_data (line), ss_length (line)); journal_write (true, s); diff --git a/tests/automake.mk b/tests/automake.mk index bfca93d4b9..dfd49170a6 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -40,6 +40,7 @@ dist_TESTS = \ tests/command/n_of_cases.sh \ tests/command/npar-binomial.sh \ tests/command/npar-chisquare.sh \ + tests/command/npar-wilcoxon.sh \ tests/command/oneway.sh \ tests/command/oneway-missing.sh \ tests/command/oneway-with-splits.sh \ @@ -50,6 +51,7 @@ dist_TESTS = \ tests/command/rename.sh \ tests/command/regression.sh \ tests/command/regression-qr.sh \ + tests/command/reliability.sh \ tests/command/sample.sh \ tests/command/sort.sh \ tests/command/sysfiles.sh \ @@ -107,6 +109,7 @@ dist_TESTS = \ tests/bugs/double-frequency.sh \ tests/bugs/empty-do-repeat.sh \ tests/bugs/get.sh \ + tests/bugs/examine-crash.sh \ tests/bugs/examine-1sample.sh \ tests/bugs/examine-missing.sh \ tests/bugs/examine-missing2.sh \ @@ -167,6 +170,8 @@ nodist_TESTS = \ tests/libpspp/abt-test \ tests/libpspp/bt-test \ tests/libpspp/heap-test \ + tests/libpspp/hmap-test \ + tests/libpspp/hmapx-test \ tests/libpspp/ll-test \ tests/libpspp/llx-test \ tests/libpspp/range-map-test \ @@ -186,6 +191,7 @@ tests_libpspp_ll_test_SOURCES = \ src/libpspp/ll.h \ tests/libpspp/ll-test.c tests_libpspp_ll_test_LDADD = gl/libgl.la @LIBINTL@ +tests_libpspp_ll_test_CFLAGS = $(AM_CFLAGS) tests_libpspp_llx_test_SOURCES = \ src/libpspp/ll.c \ @@ -194,6 +200,7 @@ tests_libpspp_llx_test_SOURCES = \ src/libpspp/llx.h \ tests/libpspp/llx-test.c tests_libpspp_llx_test_LDADD = gl/libgl.la @LIBINTL@ +tests_libpspp_llx_test_CFLAGS = $(AM_CFLAGS) tests_libpspp_heap_test_SOURCES = \ src/libpspp/heap.c \ @@ -204,6 +211,22 @@ tests_libpspp_heap_test_SOURCES = \ tests_libpspp_heap_test_LDADD = gl/libgl.la @LIBINTL@ tests_libpspp_heap_test_CPPFLAGS = $(AM_CPPFLAGS) -DASSERT_LEVEL=10 +tests_libpspp_hmap_test_SOURCES = \ + src/libpspp/hmap.c \ + src/libpspp/hmap.h \ + tests/libpspp/hmap-test.c +tests_libpspp_hmap_test_LDADD = gl/libgl.la @LIBINTL@ +tests_libpspp_hmap_test_CPPFLAGS = $(AM_CPPFLAGS) -DASSERT_LEVEL=10 + +tests_libpspp_hmapx_test_SOURCES = \ + src/libpspp/hmap.c \ + src/libpspp/hmap.h \ + src/libpspp/hmapx.c \ + src/libpspp/hmapx.h \ + tests/libpspp/hmapx-test.c +tests_libpspp_hmapx_test_LDADD = gl/libgl.la @LIBINTL@ +tests_libpspp_hmapx_test_CPPFLAGS = $(AM_CPPFLAGS) -DASSERT_LEVEL=10 + tests_libpspp_abt_test_SOURCES = \ src/libpspp/abt.c \ src/libpspp/abt.h \ @@ -240,7 +263,7 @@ tests_libpspp_range_set_test_CPPFLAGS = $(AM_CPPFLAGS) -DASSERT_LEVEL=10 tests_libpspp_str_test_SOURCES = \ tests/libpspp/str-test.c -tests_libpspp_str_test_LDADD = src/libpspp/libpspp.a gl/libgl.la @LIBINTL@ +tests_libpspp_str_test_LDADD = src/libpspp/libpspp.la gl/libgl.la @LIBINTL@ tests_libpspp_tower_test_SOURCES = \ src/libpspp/abt.c \ diff --git a/tests/bugs/examine-crash.sh b/tests/bugs/examine-crash.sh new file mode 100755 index 0000000000..6cd172fde5 --- /dev/null +++ b/tests/bugs/examine-crash.sh @@ -0,0 +1,80 @@ +#!/bin/sh + +# This program tests for a bug which crashed EXAMINE + +TEMPDIR=/tmp/pspp-tst-$$ +TESTFILE=$TEMPDIR/`basename $0`.sps + +# ensure that top_srcdir and top_builddir are absolute +if [ -z "$top_srcdir" ] ; then top_srcdir=. ; fi +if [ -z "$top_builddir" ] ; then top_builddir=. ; fi +top_srcdir=`cd $top_srcdir; pwd` +top_builddir=`cd $top_builddir; pwd` + +PSPP=$top_builddir/src/ui/terminal/pspp + +STAT_CONFIG_PATH=$top_srcdir/config +export STAT_CONFIG_PATH + +LANG=C +export LANG + + +cleanup() +{ + if [ x"$PSPP_TEST_NO_CLEANUP" != x ] ; then + echo "NOT cleaning $TEMPDIR" + return ; + fi + rm -rf $TEMPDIR +} + + +fail() +{ + echo $activity + echo FAILED + cleanup; + exit 1; +} + + +no_result() +{ + echo $activity + echo NO RESULT; + cleanup; + exit 2; +} + +pass() +{ + cleanup; + exit 0; +} + +mkdir -p $TEMPDIR + +cd $TEMPDIR + +cat < $TESTFILE +data list list /a * x * y *. +begin data. +3 1 3 +5 1 4 +7 2 3 +end data. + +examine a by x by y + /statistics=DESCRIPTIVES + . +EOF +if [ $? -ne 0 ] ; then no_result ; fi + + +activity="run program" +$SUPERVISOR $PSPP --testing-mode -o raw-ascii $TESTFILE +if [ $? -ne 0 ] ; then fail ; fi + + +pass; diff --git a/tests/bugs/examine-missing2.sh b/tests/bugs/examine-missing2.sh index 965b600ee2..97b192625f 100755 --- a/tests/bugs/examine-missing2.sh +++ b/tests/bugs/examine-missing2.sh @@ -92,26 +92,26 @@ diff -b $TEMPDIR/pspp.list - < agg-skel.pspp < $TESTFILE << EOF +data list notable list /foo * bar * w *. +begin data. +1.00 1.00 1 +1.00 2.00 1 +2.00 1.00 1 +1.00 4.00 1 +2.00 5.00 1 +1.00 19.00 1 +2.00 7.00 1 +4.00 5.00 1 +1.00 12.00 1 +2.00 13.00 1 +2.00 2.00 1 +12.00 .00 2 +12.00 1.00 1 +13.00 1.00 1 +end data + +variable labels foo "first" bar "second". + +weight by w. + +npar test + /wilcoxon=foo with bar (paired) + /missing analysis + /method=exact. + +EOF +if [ $? -ne 0 ] ; then no_result ; fi + + +activity="run program 1" +$SUPERVISOR $PSPP --testing-mode -o raw-ascii $TESTFILE +if [ $? -ne 0 ] ; then no_result ; fi + + +activity="generate results" +cat > $TEMPDIR/results.txt < $TESTFILE << EOF +data list notable list /foo * bar * dummy *. +begin data. +1.00 1.00 1 +1.00 2.00 1 +2.00 1.00 1 +1.00 4.00 . +2.00 5.00 . +1.00 19.00 . +2.00 7.00 1 +4.00 5.00 1 +1.00 12.00 1 +2.00 13.00 1 +2.00 2.00 1 +12.00 .00 1 +12.00 .00 1 +34.2 . 1 +12.00 1.00 1 +13.00 1.00 1 +end data + +variable labels foo "first" bar "second". + +npar test + /wilcoxon=foo with bar (paired) + /missing analysis + /method=exact. + +EOF +if [ $? -ne 0 ] ; then no_result ; fi + + +activity="run program 2" +$SUPERVISOR $PSPP --testing-mode -o raw-ascii $TESTFILE +if [ $? -ne 0 ] ; then no_result ; fi + + +activity="compare output 2" +diff pspp.list $TEMPDIR/results.txt +if [ $? -ne 0 ] ; then fail ; fi + + + +pass; diff --git a/tests/command/reliability.sh b/tests/command/reliability.sh new file mode 100755 index 0000000000..0bfa7336b2 --- /dev/null +++ b/tests/command/reliability.sh @@ -0,0 +1,345 @@ +#!/bin/sh + +# This program tests the reliability command. + +TEMPDIR=/tmp/pspp-tst-$$ +TESTFILE=$TEMPDIR/`basename $0`.sps + +# ensure that top_srcdir and top_builddir are absolute +if [ -z "$top_srcdir" ] ; then top_srcdir=. ; fi +if [ -z "$top_builddir" ] ; then top_builddir=. ; fi +top_srcdir=`cd $top_srcdir; pwd` +top_builddir=`cd $top_builddir; pwd` + +PSPP=$top_builddir/src/ui/terminal/pspp + +STAT_CONFIG_PATH=$top_srcdir/config +export STAT_CONFIG_PATH + +LANG=C +export LANG + + +cleanup() +{ + if [ x"$PSPP_TEST_NO_CLEANUP" != x ] ; then + echo "NOT cleaning $TEMPDIR" + return ; + fi + rm -rf $TEMPDIR +} + + +fail() +{ + echo $activity + echo FAILED + cleanup; + exit 1; +} + + +no_result() +{ + echo $activity + echo NO RESULT; + cleanup; + exit 2; +} + +pass() +{ + cleanup; + exit 0; +} + +mkdir -p $TEMPDIR + +cd $TEMPDIR + + +activity="create program" +cat > $TESTFILE <. */ + +/* This is a test program for the hmap_* routines defined in + hmap.c. This test program aims to be as comprehensive as + possible. "gcov -a -b" should report 100% coverage of lines, + blocks and branches in hmap.c (when compiled with -DNDEBUG). + "valgrind --leak-check=yes --show-reachable=yes" should give a + clean report. */ + +/* Warning: + + GCC 4.3 will miscompile this test program, specifically + test_moved(), given small changes. This is a bug in GCC + triggered by the test program, not by the library under test, + so you may safely ignore it. To avoid miscompilation, compile + this file with GCC 4.2 or earlier or GCC 4.4 or later. + + Here is a minimal test program that demonstrates the same or a + similar bug in GCC 4.3: + + #include + #include + + struct node + { + struct node *next; + unsigned int data1; + int data2; + }; + struct list + { + struct node *head; + int dummy; + }; + + static void * + xmalloc (int n) + { + return malloc (n); + } + + static void + check_list (struct list *list) + { + int i __attribute__((unused)); + struct node *e; + for (e = list->head; e != NULL; e = e->next) + if (e->data1 != e->data2) + abort (); + } + + int + main (void) + { + #define MAX_ELEMS 2 + struct node *elements = xmalloc (MAX_ELEMS * sizeof *elements); + int *values = xmalloc (MAX_ELEMS * sizeof *values); + struct list list; + int i; + + list.head = NULL; + for (i = 0; i < MAX_ELEMS; i++) + { + values[i] = elements[i].data2 = i; + elements[i].data1 = elements[i].data2; + elements[i].next = list.head; + list.head = &elements[i]; + } + check_list (&list); + return 0; + } +*/ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* Currently running test. */ +static const char *test_name; + +/* Exit with a failure code. + (Place a breakpoint on this function while debugging.) */ +static void +check_die (void) +{ + exit (EXIT_FAILURE); +} + +/* If OK is not true, prints a message about failure on the + current source file and the given LINE and terminates. */ +static void +check_func (bool ok, int line) +{ + if (!ok) + { + printf ("Check failed in %s test at %s, line %d\n", + test_name, __FILE__, line); + check_die (); + } +} + +/* Verifies that EXPR evaluates to true. + If not, prints a message citing the calling line number and + terminates. */ +#define check(EXPR) check_func ((EXPR), __LINE__) + +/* Prints a message about memory exhaustion and exits with a + failure code. */ +static void +xalloc_die (void) +{ + printf ("virtual memory exhausted\n"); + exit (EXIT_FAILURE); +} + +static void *xmalloc (size_t n) MALLOC_LIKE; +static void *xnmalloc (size_t n, size_t m) MALLOC_LIKE; +static void *xmemdup (const void *p, size_t n) MALLOC_LIKE; + +/* Allocates and returns N bytes of memory. */ +static void * +xmalloc (size_t n) +{ + if (n != 0) + { + void *p = malloc (n); + if (p == NULL) + xalloc_die (); + + return p; + } + else + return NULL; +} + +static void * +xmemdup (const void *p, size_t n) +{ + void *q = xmalloc (n); + memcpy (q, p, n); + return q; +} + +/* Allocates and returns N * M bytes of memory. */ +static void * +xnmalloc (size_t n, size_t m) +{ + if ((size_t) -1 / m <= n) + xalloc_die (); + return xmalloc (n * m); +} + +/* Node type and support routines. */ + +/* Test data element. */ +struct element + { + struct hmap_node node; /* Embedded hash table element. */ + int data; /* Primary value. */ + }; + +/* Returns the `struct element' that NODE is embedded within. */ +static struct element * +hmap_node_to_element (const struct hmap_node *node) +{ + return HMAP_DATA (node, struct element, node); +} + +/* Compares A and B and returns a strcmp-type return value. */ +static int +compare_ints (const void *a_, const void *b_) +{ + const int *a = a_; + const int *b = b_; + + return *a < *b ? -1 : *a > *b; +} + +/* Swaps *A and *B. */ +static void +swap (int *a, int *b) +{ + int t = *a; + *a = *b; + *b = t; +} + +/* Reverses the order of the CNT integers starting at VALUES. */ +static void +reverse (int *values, size_t cnt) +{ + size_t i = 0; + size_t j = cnt; + + while (j > i) + swap (&values[i++], &values[--j]); +} + +/* Arranges the CNT elements in VALUES into the lexicographically + next greater permutation. Returns true if successful. + If VALUES is already the lexicographically greatest + permutation of its elements (i.e. ordered from greatest to + smallest), arranges them into the lexicographically least + permutation (i.e. ordered from smallest to largest) and + returns false. */ +static bool +next_permutation (int *values, size_t cnt) +{ + if (cnt > 0) + { + size_t i = cnt - 1; + while (i != 0) + { + i--; + if (values[i] < values[i + 1]) + { + size_t j; + for (j = cnt - 1; values[i] >= values[j]; j--) + continue; + swap (values + i, values + j); + reverse (values + (i + 1), cnt - (i + 1)); + return true; + } + } + + reverse (values, cnt); + } + + return false; +} + +/* Returns N!. */ +static unsigned int +factorial (unsigned int n) +{ + unsigned int value = 1; + while (n > 1) + value *= n--; + return value; +} + +/* Randomly shuffles the CNT elements in ARRAY, each of which is + SIZE bytes in size. */ +static void +random_shuffle (void *array_, size_t cnt, size_t size) +{ + char *array = array_; + char *tmp = xmalloc (size); + size_t i; + + for (i = 0; i < cnt; i++) + { + size_t j = rand () % (cnt - i) + i; + if (i != j) + { + memcpy (tmp, array + j * size, size); + memcpy (array + j * size, array + i * size, size); + memcpy (array + i * size, tmp, size); + } + } + + free (tmp); +} + +typedef size_t hash_function (int data); + +static size_t +identity_hash (int data) +{ + return data; +} + +static size_t +constant_hash (int data UNUSED) +{ + return 0x12345678u; +} + +static inline uint32_t +md4_round (uint32_t a, uint32_t b, uint32_t c, uint32_t d, + uint32_t data, uint32_t n) +{ + uint32_t x = a + (d ^ (b & (c ^ d))) + data; + return (x << n) | (x >> (32 - n)); +} + +static size_t +random_hash (int data) +{ + uint32_t a = data; + uint32_t b = data; + uint32_t c = data; + uint32_t d = data; + a = md4_round (a, b, c, d, 0, 3); + d = md4_round (d, a, b, c, 1, 7); + c = md4_round (c, d, a, b, 2, 11); + b = md4_round (b, c, d, a, 3, 19); + return a ^ b ^ c ^ d; +} + +static struct hmap_node * +find_element (struct hmap *hmap, int data, hash_function *hash) +{ + struct element *e; + HMAP_FOR_EACH_WITH_HASH (e, struct element, node, hash (data), hmap) + if (e->data == data) + break; + return &e->node; +} + +/* Checks that HMAP contains the CNT ints in DATA, that its + structure is correct, and that certain operations on HMAP + produce the expected results. */ +static void +check_hmap (struct hmap *hmap, const int data[], size_t cnt, + hash_function *hash) +{ + size_t i, j; + int *order; + + check (hmap_count (hmap) == cnt); + check (cnt <= hmap_capacity (hmap)); + + order = xmemdup (data, cnt * sizeof *data); + qsort (order, cnt, sizeof *order, compare_ints); + + for (i = 0; i < cnt; i = j) + { + struct element *e; + int count; + + for (j = i + 1; j < cnt; j++) + if (order[i] != order[j]) + break; + + count = 0; + HMAP_FOR_EACH_WITH_HASH (e, struct element, node, hash (order[i]), hmap) + if (e->data == order[i]) + count++; + + check (count == j - i); + } + + check (find_element (hmap, -1, hash) == NULL); + + if (cnt == 0) + check (hmap_first (hmap) == NULL); + else + { + struct hmap_node *p; + int left; + + left = cnt; + for (p = hmap_first (hmap), i = 0; i < cnt; p = hmap_next (hmap, p), i++) + { + struct element *e = hmap_node_to_element (p); + size_t j; + + check (hmap_node_hash (&e->node) == hash (e->data)); + for (j = 0; j < left; j++) + if (order[j] == e->data) + { + order[j] = order[--left]; + goto next; + } + check_die (); + + next: ; + } + check (p == NULL); + } + + free (order); +} + +/* Inserts the CNT values from 0 to CNT - 1 (inclusive) into an + HMAP in the order specified by INSERTIONS, then deletes them in + the order specified by DELETIONS, checking the HMAP's contents + for correctness after each operation. Uses HASH as the hash + function. */ +static void +test_insert_delete (const int insertions[], + const int deletions[], + size_t cnt, + hash_function *hash) +{ + struct element *elements; + struct hmap hmap; + size_t i; + + elements = xnmalloc (cnt, sizeof *elements); + for (i = 0; i < cnt; i++) + elements[i].data = i; + + hmap_init (&hmap); + hmap_reserve (&hmap, 1); + check_hmap (&hmap, NULL, 0, hash); + for (i = 0; i < cnt; i++) + { + size_t capacity; + hmap_insert (&hmap, &elements[insertions[i]].node, hash (insertions[i])); + check_hmap (&hmap, insertions, i + 1, hash); + + /* A series of insertions should not produce a shrinkable hmap. */ + capacity = hmap_capacity (&hmap); + hmap_shrink (&hmap); + check (capacity == hmap_capacity (&hmap)); + } + for (i = 0; i < cnt; i++) + { + hmap_delete (&hmap, &elements[deletions[i]].node); + check_hmap (&hmap, deletions + i + 1, cnt - i - 1, hash); + } + hmap_destroy (&hmap); + + free (elements); +} + +/* Inserts values into an HMAP in each possible order, then + removes them in each possible order, up to a specified maximum + size, using hash function HASH. */ +static void +test_insert_any_remove_any (hash_function *hash) +{ + const int max_elems = 5; + int cnt; + + for (cnt = 0; cnt <= max_elems; cnt++) + { + int *insertions, *deletions; + unsigned int ins_perm_cnt; + int i; + + insertions = xnmalloc (cnt, sizeof *insertions); + deletions = xnmalloc (cnt, sizeof *deletions); + for (i = 0; i < cnt; i++) + insertions[i] = i; + + for (ins_perm_cnt = 0; + ins_perm_cnt == 0 || next_permutation (insertions, cnt); + ins_perm_cnt++) + { + unsigned int del_perm_cnt; + int i; + + for (i = 0; i < cnt; i++) + deletions[i] = i; + + for (del_perm_cnt = 0; + del_perm_cnt == 0 || next_permutation (deletions, cnt); + del_perm_cnt++) + test_insert_delete (insertions, deletions, cnt, hash); + + check (del_perm_cnt == factorial (cnt)); + } + check (ins_perm_cnt == factorial (cnt)); + + free (insertions); + free (deletions); + } +} + +static void +test_insert_any_remove_any_random_hash (void) +{ + test_insert_any_remove_any (random_hash); +} + +static void +test_insert_any_remove_any_identity_hash (void) +{ + test_insert_any_remove_any (identity_hash); +} + +static void +test_insert_any_remove_any_constant_hash (void) +{ + test_insert_any_remove_any (constant_hash); +} + +/* Inserts values into an HMAP in each possible order, then + removes them in the same order, up to a specified maximum + size, using hash function HASH. */ +static void +test_insert_any_remove_same (hash_function *hash) +{ + const int max_elems = 7; + int cnt; + + for (cnt = 0; cnt <= max_elems; cnt++) + { + int *values; + unsigned int permutation_cnt; + int i; + + values = xnmalloc (cnt, sizeof *values); + for (i = 0; i < cnt; i++) + values[i] = i; + + for (permutation_cnt = 0; + permutation_cnt == 0 || next_permutation (values, cnt); + permutation_cnt++) + test_insert_delete (values, values, cnt, hash); + check (permutation_cnt == factorial (cnt)); + + free (values); + } +} + +static void +test_insert_any_remove_same_random_hash (void) +{ + test_insert_any_remove_same (random_hash); +} + +static void +test_insert_any_remove_same_identity_hash (void) +{ + test_insert_any_remove_same (identity_hash); +} + +static void +test_insert_any_remove_same_constant_hash (void) +{ + test_insert_any_remove_same (constant_hash); +} + +/* Inserts values into an HMAP in each possible order, then + removes them in reverse order, up to a specified maximum + size, using hash function HASH. */ +static void +test_insert_any_remove_reverse (hash_function *hash) +{ + const int max_elems = 7; + int cnt; + + for (cnt = 0; cnt <= max_elems; cnt++) + { + int *insertions, *deletions; + unsigned int permutation_cnt; + int i; + + insertions = xnmalloc (cnt, sizeof *insertions); + deletions = xnmalloc (cnt, sizeof *deletions); + for (i = 0; i < cnt; i++) + insertions[i] = i; + + for (permutation_cnt = 0; + permutation_cnt == 0 || next_permutation (insertions, cnt); + permutation_cnt++) + { + memcpy (deletions, insertions, sizeof *insertions * cnt); + reverse (deletions, cnt); + + test_insert_delete (insertions, deletions, cnt, hash); + } + check (permutation_cnt == factorial (cnt)); + + free (insertions); + free (deletions); + } +} + +static void +test_insert_any_remove_reverse_random_hash (void) +{ + test_insert_any_remove_reverse (random_hash); +} + +static void +test_insert_any_remove_reverse_identity_hash (void) +{ + test_insert_any_remove_reverse (identity_hash); +} + +static void +test_insert_any_remove_reverse_constant_hash (void) +{ + test_insert_any_remove_reverse (constant_hash); +} + +/* Inserts and removes up to MAX_ELEMS values in an hmap, in + random order, using hash function HASH. */ +static void +test_random_sequence (int max_elems, hash_function *hash) +{ + const int max_trials = 8; + int cnt; + + for (cnt = 0; cnt <= max_elems; cnt += 2) + { + int *insertions, *deletions; + int trial; + int i; + + insertions = xnmalloc (cnt, sizeof *insertions); + deletions = xnmalloc (cnt, sizeof *deletions); + for (i = 0; i < cnt; i++) + insertions[i] = i; + for (i = 0; i < cnt; i++) + deletions[i] = i; + + for (trial = 0; trial < max_trials; trial++) + { + random_shuffle (insertions, cnt, sizeof *insertions); + random_shuffle (deletions, cnt, sizeof *deletions); + + test_insert_delete (insertions, deletions, cnt, hash); + } + + free (insertions); + free (deletions); + } +} + +static void +test_random_sequence_random_hash (void) +{ + test_random_sequence (64, random_hash); +} + +static void +test_random_sequence_identity_hash (void) +{ + test_random_sequence (64, identity_hash); +} + +static void +test_random_sequence_constant_hash (void) +{ + test_random_sequence (32, constant_hash); +} + +/* Inserts MAX_ELEMS elements into an HMAP in ascending order, + then delete in ascending order and shrink the hmap at each + step, using hash function HASH. */ +static void +test_insert_ordered (int max_elems, hash_function *hash) +{ + struct element *elements; + int *values; + struct hmap hmap; + int i; + + hmap_init (&hmap); + elements = xnmalloc (max_elems, sizeof *elements); + values = xnmalloc (max_elems, sizeof *values); + for (i = 0; i < max_elems; i++) + { + values[i] = elements[i].data = i; + hmap_insert (&hmap, &elements[i].node, hash (elements[i].data)); + check_hmap (&hmap, values, i + 1, hash); + + if (hash == identity_hash) + { + /* Check that every every hash bucket has (almost) the + same number of nodes in it. */ + int min = INT_MAX; + int max = INT_MIN; + int j; + + for (j = 0; j <= hmap.mask; j++) + { + int count = 0; + struct hmap_node *node; + + for (node = hmap.buckets[j]; node != NULL; node = node->next) + count++; + if (count < min) + min = count; + if (count > max) + max = count; + } + check (max - min <= 1); + } + } + for (i = 0; i < max_elems; i++) + { + hmap_delete (&hmap, &elements[i].node); + hmap_shrink (&hmap); + check_hmap (&hmap, values + i + 1, max_elems - i - 1, hash); + } + hmap_destroy (&hmap); + free (elements); + free (values); +} + +static void +test_insert_ordered_random_hash (void) +{ + test_insert_ordered (1024, random_hash); +} + +static void +test_insert_ordered_identity_hash (void) +{ + test_insert_ordered (1024, identity_hash); +} + +static void +test_insert_ordered_constant_hash (void) +{ + test_insert_ordered (128, constant_hash); +} + +/* Inserts up to MAX_ELEMS elements into an HMAP, then moves the + nodes around in memory, using hash function HASH. */ +static void +test_moved (int max_elems, hash_function *hash) +{ + struct element *e[2]; + int cur; + int *values; + struct hmap hmap; + int i, j; + + hmap_init (&hmap); + e[0] = xnmalloc (max_elems, sizeof *e[0]); + e[1] = xnmalloc (max_elems, sizeof *e[1]); + values = xnmalloc (max_elems, sizeof *values); + cur = 0; + for (i = 0; i < max_elems; i++) + { + values[i] = e[cur][i].data = i; + hmap_insert (&hmap, &e[cur][i].node, hash (e[cur][i].data)); + check_hmap (&hmap, values, i + 1, hash); + + for (j = 0; j <= i; j++) + { + e[!cur][j] = e[cur][j]; + hmap_moved (&hmap, &e[!cur][j].node, &e[cur][j].node); + check_hmap (&hmap, values, i + 1, hash); + } + cur = !cur; + } + hmap_destroy (&hmap); + free (e[0]); + free (e[1]); + free (values); +} + +static void +test_moved_random_hash (void) +{ + test_moved (128, random_hash); +} + +static void +test_moved_identity_hash (void) +{ + test_moved (128, identity_hash); +} + +static void +test_moved_constant_hash (void) +{ + test_moved (32, constant_hash); +} + +/* Inserts values into an HMAP, then changes their values, using + hash function HASH. */ +static void +test_changed (hash_function *hash) +{ + const int max_elems = 6; + int cnt; + + for (cnt = 0; cnt <= max_elems; cnt++) + { + int *values, *changed_values; + struct element *elements; + unsigned int permutation_cnt; + int i; + + values = xnmalloc (cnt, sizeof *values); + changed_values = xnmalloc (cnt, sizeof *changed_values); + elements = xnmalloc (cnt, sizeof *elements); + for (i = 0; i < cnt; i++) + values[i] = i; + + for (permutation_cnt = 0; + permutation_cnt == 0 || next_permutation (values, cnt); + permutation_cnt++) + { + for (i = 0; i < cnt; i++) + { + int j, k; + for (j = 0; j <= cnt; j++) + { + struct hmap hmap; + + hmap_init (&hmap); + + /* Add to HMAP in order. */ + for (k = 0; k < cnt; k++) + { + int n = values[k]; + elements[n].data = n; + hmap_insert (&hmap, &elements[n].node, + hash (elements[n].data)); + } + check_hmap (&hmap, values, cnt, hash); + + /* Change value i to j. */ + elements[i].data = j; + hmap_changed (&hmap, &elements[i].node, + hash (elements[i].data)); + for (k = 0; k < cnt; k++) + changed_values[k] = k; + changed_values[i] = j; + check_hmap (&hmap, changed_values, cnt, hash); + + hmap_destroy (&hmap); + } + } + } + check (permutation_cnt == factorial (cnt)); + + free (values); + free (changed_values); + free (elements); + } +} + +static void +test_changed_random_hash (void) +{ + test_changed (random_hash); +} + +static void +test_changed_identity_hash (void) +{ + test_changed (identity_hash); +} + +static void +test_changed_constant_hash (void) +{ + test_changed (constant_hash); +} + +static void +test_swap (int max_elems, hash_function *hash) +{ + struct element *elements; + int *values; + struct hmap a, b; + struct hmap *working, *empty; + int i; + + hmap_init (&a); + hmap_init (&b); + working = &a; + empty = &b; + elements = xnmalloc (max_elems, sizeof *elements); + values = xnmalloc (max_elems, sizeof *values); + for (i = 0; i < max_elems; i++) + { + struct hmap *tmp; + values[i] = elements[i].data = i; + hmap_insert (working, &elements[i].node, hash (elements[i].data)); + check_hmap (working, values, i + 1, hash); + check_hmap (empty, NULL, 0, hash); + hmap_swap (&a, &b); + tmp = working; + working = empty; + empty = tmp; + } + hmap_destroy (&a); + hmap_destroy (&b); + free (elements); + free (values); +} + +static void +test_swap_random_hash (void) +{ + test_swap (128, random_hash); +} + +static void +test_destroy_null (void) +{ + hmap_destroy (NULL); +} + +/* Test shrinking an empty hash table. */ +static void +test_shrink_empty (void) +{ + struct hmap hmap; + + hmap_init (&hmap); + hmap_reserve (&hmap, 123); + hmap_shrink (&hmap); + hmap_destroy (&hmap); +} + +/* Main program. */ + +/* Runs TEST_FUNCTION and prints a message about NAME. */ +static void +run_test (void (*test_function) (void), const char *name) +{ + test_name = name; + putchar ('.'); + fflush (stdout); + test_function (); +} + +int +main (void) +{ + run_test (test_insert_any_remove_any_random_hash, + "insert any order, delete any order (random hash)"); + run_test (test_insert_any_remove_any_identity_hash, + "insert any order, delete any order (identity hash)"); + run_test (test_insert_any_remove_any_constant_hash, + "insert any order, delete any order (constant hash)"); + + run_test (test_insert_any_remove_same_random_hash, + "insert any order, delete same order (random hash)"); + run_test (test_insert_any_remove_same_identity_hash, + "insert any order, delete same order (identity hash)"); + run_test (test_insert_any_remove_same_constant_hash, + "insert any order, delete same order (constant hash)"); + + run_test (test_insert_any_remove_reverse_random_hash, + "insert any order, delete reverse order (random hash)"); + run_test (test_insert_any_remove_reverse_identity_hash, + "insert any order, delete reverse order (identity hash)"); + run_test (test_insert_any_remove_reverse_constant_hash, + "insert any order, delete reverse order (constant hash)"); + + run_test (test_random_sequence_random_hash, + "insert and delete in random sequence (random hash)"); + run_test (test_random_sequence_identity_hash, + "insert and delete in random sequence (identity hash)"); + run_test (test_random_sequence_constant_hash, + "insert and delete in random sequence (constant hash)"); + + run_test (test_insert_ordered_random_hash, + "insert in ascending order (random hash)"); + run_test (test_insert_ordered_identity_hash, + "insert in ascending order (identity hash)"); + run_test (test_insert_ordered_constant_hash, + "insert in ascending order (constant hash)"); + + run_test (test_moved_random_hash, + "move elements around in memory (random hash)"); + run_test (test_moved_identity_hash, + "move elements around in memory (identity hash)"); + run_test (test_moved_constant_hash, + "move elements around in memory (constant hash)"); + + run_test (test_changed_random_hash, + "change key data in nodes (random hash)"); + run_test (test_changed_identity_hash, + "change key data in nodes (identity hash)"); + run_test (test_changed_constant_hash, + "change key data in nodes (constant hash)"); + + run_test (test_swap_random_hash, "test swapping tables"); + + run_test (test_destroy_null, "test destroying null table"); + run_test (test_shrink_empty, "test shrinking an empty table"); + + putchar ('\n'); + + return 0; +} diff --git a/tests/libpspp/hmapx-test.c b/tests/libpspp/hmapx-test.c new file mode 100644 index 0000000000..fc08ca6161 --- /dev/null +++ b/tests/libpspp/hmapx-test.c @@ -0,0 +1,1034 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2007, 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +/* This is a test program for the hmapx_* routines defined in + hmapx.c. This test program aims to be as comprehensive as + possible. "gcov -a -b" should report 100% coverage of lines, + blocks and branches in hmapx.c (when compiled with -DNDEBUG). + "valgrind --leak-check=yes --show-reachable=yes" should give a + clean report. */ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* Currently running test. */ +static const char *test_name; + +/* If OK is not true, prints a message about failure on the + current source file and the given LINE and terminates. */ +static void +check_func (bool ok, int line) +{ + if (!ok) + { + printf ("Check failed in %s test at %s, line %d\n", + test_name, __FILE__, line); + abort (); + } +} + +/* Verifies that EXPR evaluates to true. + If not, prints a message citing the calling line number and + terminates. */ +#define check(EXPR) check_func ((EXPR), __LINE__) + +/* Prints a message about memory exhaustion and exits with a + failure code. */ +static void +xalloc_die (void) +{ + printf ("virtual memory exhausted\n"); + exit (EXIT_FAILURE); +} + +/* Allocates and returns N bytes of memory. */ +static void * +xmalloc (size_t n) +{ + if (n != 0) + { + void *p = malloc (n); + if (p == NULL) + xalloc_die (); + + return p; + } + else + return NULL; +} + +static void * +xmemdup (const void *p, size_t n) +{ + void *q = xmalloc (n); + memcpy (q, p, n); + return q; +} + +/* Allocates and returns N * M bytes of memory. */ +static void * +xnmalloc (size_t n, size_t m) +{ + if ((size_t) -1 / m <= n) + xalloc_die (); + return xmalloc (n * m); +} + +/* Node type and support routines. */ + +/* Test data element. */ +struct element + { + int data; /* Primary value. */ + }; + +/* Compares A and B and returns a strcmp-type return value. */ +static int +compare_ints (const void *a_, const void *b_) +{ + const int *a = a_; + const int *b = b_; + + return *a < *b ? -1 : *a > *b; +} + +/* Swaps *A and *B. */ +static void +swap (int *a, int *b) +{ + int t = *a; + *a = *b; + *b = t; +} + +/* Reverses the order of the CNT integers starting at VALUES. */ +static void +reverse (int *values, size_t cnt) +{ + size_t i = 0; + size_t j = cnt; + + while (j > i) + swap (&values[i++], &values[--j]); +} + +/* Arranges the CNT elements in VALUES into the lexicographically + next greater permutation. Returns true if successful. + If VALUES is already the lexicographically greatest + permutation of its elements (i.e. ordered from greatest to + smallest), arranges them into the lexicographically least + permutation (i.e. ordered from smallest to largest) and + returns false. */ +static bool +next_permutation (int *values, size_t cnt) +{ + if (cnt > 0) + { + size_t i = cnt - 1; + while (i != 0) + { + i--; + if (values[i] < values[i + 1]) + { + size_t j; + for (j = cnt - 1; values[i] >= values[j]; j--) + continue; + swap (values + i, values + j); + reverse (values + (i + 1), cnt - (i + 1)); + return true; + } + } + + reverse (values, cnt); + } + + return false; +} + +/* Returns N!. */ +static unsigned int +factorial (unsigned int n) +{ + unsigned int value = 1; + while (n > 1) + value *= n--; + return value; +} + +/* Randomly shuffles the CNT elements in ARRAY, each of which is + SIZE bytes in size. */ +static void +random_shuffle (void *array_, size_t cnt, size_t size) +{ + char *array = array_; + char *tmp = xmalloc (size); + size_t i; + + for (i = 0; i < cnt; i++) + { + size_t j = rand () % (cnt - i) + i; + if (i != j) + { + memcpy (tmp, array + j * size, size); + memcpy (array + j * size, array + i * size, size); + memcpy (array + i * size, tmp, size); + } + } + + free (tmp); +} + +typedef size_t hash_function (int data); + +static size_t +identity_hash (int data) +{ + return data; +} + +static size_t +constant_hash (int data UNUSED) +{ + return 0x12345678u; +} + +static inline uint32_t +md4_round (uint32_t a, uint32_t b, uint32_t c, uint32_t d, + uint32_t data, uint32_t n) +{ + uint32_t x = a + (d ^ (b & (c ^ d))) + data; + return (x << n) | (x >> (32 - n)); +} + +static size_t +random_hash (int data) +{ + uint32_t a = data; + uint32_t b = data; + uint32_t c = data; + uint32_t d = data; + a = md4_round (a, b, c, d, 0, 3); + d = md4_round (d, a, b, c, 1, 7); + c = md4_round (c, d, a, b, 2, 11); + b = md4_round (b, c, d, a, 3, 19); + return a ^ b ^ c ^ d; +} + +static struct hmapx_node * +find_element (struct hmapx *hmapx, int data, hash_function *hash) +{ + struct hmapx_node *node; + struct element *e; + HMAPX_FOR_EACH_WITH_HASH (e, node, hash (data), hmapx) + if (e->data == data) + break; + return node; +} + +/* Checks that HMAPX contains the CNT ints in DATA, that its + structure is correct, and that certain operations on HMAPX + produce the expected results. */ +static void +check_hmapx (struct hmapx *hmapx, const int data[], size_t cnt, + hash_function *hash) +{ + size_t i, j; + int *order; + + check (hmapx_count (hmapx) == cnt); + check (cnt <= hmapx_capacity (hmapx)); + + order = xmemdup (data, cnt * sizeof *data); + qsort (order, cnt, sizeof *order, compare_ints); + + for (i = 0; i < cnt; i = j) + { + struct hmapx_node *node; + struct element *e; + int count; + + for (j = i + 1; j < cnt; j++) + if (order[i] != order[j]) + break; + + count = 0; + HMAPX_FOR_EACH_WITH_HASH (e, node, hash (order[i]), hmapx) + if (e->data == order[i]) + count++; + + check (count == j - i); + } + + check (find_element (hmapx, -1, hash) == NULL); + + if (cnt == 0) + check (hmapx_first (hmapx) == NULL); + else + { + struct hmapx_node *p; + int left; + + left = cnt; + for (p = hmapx_first (hmapx), i = 0; i < cnt; + p = hmapx_next (hmapx, p), i++) + { + struct element *e = hmapx_node_data (p); + size_t j; + + check (hmapx_node_hash (p) == hash (e->data)); + for (j = 0; j < left; j++) + if (order[j] == e->data) + { + order[j] = order[--left]; + goto next; + } + abort (); + + next: ; + } + check (p == NULL); + } + + free (order); +} + +/* Inserts the CNT values from 0 to CNT - 1 (inclusive) into an + HMAPX in the order specified by INSERTIONS, then deletes them in + the order specified by DELETIONS, checking the HMAPX's contents + for correctness after each operation. Uses HASH as the hash + function. */ +static void +test_insert_delete (const int insertions[], + const int deletions[], + size_t cnt, + hash_function *hash, + size_t reserve) +{ + struct element *elements; + struct hmapx_node **nodes; + struct hmapx hmapx; + size_t i; + + elements = xnmalloc (cnt, sizeof *elements); + nodes = xnmalloc (cnt, sizeof *nodes); + for (i = 0; i < cnt; i++) + elements[i].data = i; + + hmapx_init (&hmapx); + hmapx_reserve (&hmapx, reserve); + check_hmapx (&hmapx, NULL, 0, hash); + for (i = 0; i < cnt; i++) + { + struct hmapx_node *(*insert) (struct hmapx *, void *, size_t hash); + size_t capacity; + + /* Insert the node. Use hmapx_insert_fast if we have not + yet exceeded the reserve. */ + insert = i < reserve ? hmapx_insert_fast : hmapx_insert; + nodes[insertions[i]] = insert (&hmapx, &elements[insertions[i]], + hash (insertions[i])); + check_hmapx (&hmapx, insertions, i + 1, hash); + + /* A series of insertions should not produce a shrinkable hmapx. */ + if (i >= reserve) + { + capacity = hmapx_capacity (&hmapx); + hmapx_shrink (&hmapx); + check (capacity == hmapx_capacity (&hmapx)); + } + } + for (i = 0; i < cnt; i++) + { + hmapx_delete (&hmapx, nodes[deletions[i]]); + check_hmapx (&hmapx, deletions + i + 1, cnt - i - 1, hash); + } + hmapx_destroy (&hmapx); + + free (elements); + free (nodes); +} + +/* Inserts values into an HMAPX in each possible order, then + removes them in each possible order, up to a specified maximum + size, using hash function HASH. */ +static void +test_insert_any_remove_any (hash_function *hash) +{ + const int max_elems = 5; + int cnt; + + for (cnt = 0; cnt <= max_elems; cnt++) + { + int *insertions, *deletions; + unsigned int ins_perm_cnt; + int i; + + insertions = xnmalloc (cnt, sizeof *insertions); + deletions = xnmalloc (cnt, sizeof *deletions); + for (i = 0; i < cnt; i++) + insertions[i] = i; + + for (ins_perm_cnt = 0; + ins_perm_cnt == 0 || next_permutation (insertions, cnt); + ins_perm_cnt++) + { + unsigned int del_perm_cnt; + int i; + + for (i = 0; i < cnt; i++) + deletions[i] = i; + + for (del_perm_cnt = 0; + del_perm_cnt == 0 || next_permutation (deletions, cnt); + del_perm_cnt++) + test_insert_delete (insertions, deletions, cnt, hash, 1); + + check (del_perm_cnt == factorial (cnt)); + } + check (ins_perm_cnt == factorial (cnt)); + + free (insertions); + free (deletions); + } +} + +static void +test_insert_any_remove_any_random_hash (void) +{ + test_insert_any_remove_any (random_hash); +} + +static void +test_insert_any_remove_any_identity_hash (void) +{ + test_insert_any_remove_any (identity_hash); +} + +static void +test_insert_any_remove_any_constant_hash (void) +{ + test_insert_any_remove_any (constant_hash); +} + +/* Inserts values into an HMAPX in each possible order, then + removes them in the same order, up to a specified maximum + size, using hash function HASH. */ +static void +test_insert_any_remove_same (hash_function *hash) +{ + const int max_elems = 7; + int cnt; + + for (cnt = 0; cnt <= max_elems; cnt++) + { + int *values; + unsigned int permutation_cnt; + int i; + + values = xnmalloc (cnt, sizeof *values); + for (i = 0; i < cnt; i++) + values[i] = i; + + for (permutation_cnt = 0; + permutation_cnt == 0 || next_permutation (values, cnt); + permutation_cnt++) + test_insert_delete (values, values, cnt, hash, cnt / 2); + check (permutation_cnt == factorial (cnt)); + + free (values); + } +} + +static void +test_insert_any_remove_same_random_hash (void) +{ + test_insert_any_remove_same (random_hash); +} + +static void +test_insert_any_remove_same_identity_hash (void) +{ + test_insert_any_remove_same (identity_hash); +} + +static void +test_insert_any_remove_same_constant_hash (void) +{ + test_insert_any_remove_same (constant_hash); +} + +/* Inserts values into an HMAPX in each possible order, then + removes them in reverse order, up to a specified maximum + size, using hash function HASH. */ +static void +test_insert_any_remove_reverse (hash_function *hash) +{ + const int max_elems = 7; + int cnt; + + for (cnt = 0; cnt <= max_elems; cnt++) + { + int *insertions, *deletions; + unsigned int permutation_cnt; + int i; + + insertions = xnmalloc (cnt, sizeof *insertions); + deletions = xnmalloc (cnt, sizeof *deletions); + for (i = 0; i < cnt; i++) + insertions[i] = i; + + for (permutation_cnt = 0; + permutation_cnt == 0 || next_permutation (insertions, cnt); + permutation_cnt++) + { + memcpy (deletions, insertions, sizeof *insertions * cnt); + reverse (deletions, cnt); + + test_insert_delete (insertions, deletions, cnt, hash, cnt); + } + check (permutation_cnt == factorial (cnt)); + + free (insertions); + free (deletions); + } +} + +static void +test_insert_any_remove_reverse_random_hash (void) +{ + test_insert_any_remove_reverse (random_hash); +} + +static void +test_insert_any_remove_reverse_identity_hash (void) +{ + test_insert_any_remove_reverse (identity_hash); +} + +static void +test_insert_any_remove_reverse_constant_hash (void) +{ + test_insert_any_remove_reverse (constant_hash); +} + +/* Inserts and removes up to MAX_ELEMS values in an hmapx, in + random order, using hash function HASH. */ +static void +test_random_sequence (int max_elems, hash_function *hash) +{ + const int max_trials = 8; + int cnt; + + for (cnt = 0; cnt <= max_elems; cnt += 2) + { + int *insertions, *deletions; + int trial; + int i; + + insertions = xnmalloc (cnt, sizeof *insertions); + deletions = xnmalloc (cnt, sizeof *deletions); + for (i = 0; i < cnt; i++) + insertions[i] = i; + for (i = 0; i < cnt; i++) + deletions[i] = i; + + for (trial = 0; trial < max_trials; trial++) + { + random_shuffle (insertions, cnt, sizeof *insertions); + random_shuffle (deletions, cnt, sizeof *deletions); + + test_insert_delete (insertions, deletions, cnt, hash, 0); + } + + free (insertions); + free (deletions); + } +} + +static void +test_random_sequence_random_hash (void) +{ + test_random_sequence (64, random_hash); +} + +static void +test_random_sequence_identity_hash (void) +{ + test_random_sequence (64, identity_hash); +} + +static void +test_random_sequence_constant_hash (void) +{ + test_random_sequence (32, constant_hash); +} + +/* Inserts MAX_ELEMS elements into an HMAPX in ascending order, + then delete in ascending order and shrink the hmapx at each + step, using hash function HASH. */ +static void +test_insert_ordered (int max_elems, hash_function *hash) +{ + struct element *elements; + struct hmapx_node **nodes; + int *values; + struct hmapx hmapx; + int i; + + hmapx_init (&hmapx); + elements = xnmalloc (max_elems, sizeof *elements); + nodes = xnmalloc (max_elems, sizeof *nodes); + values = xnmalloc (max_elems, sizeof *values); + for (i = 0; i < max_elems; i++) + { + values[i] = elements[i].data = i; + nodes[i] = hmapx_insert (&hmapx, &elements[i], hash (elements[i].data)); + check_hmapx (&hmapx, values, i + 1, hash); + + if (hash == identity_hash) + { + /* Check that every every hash bucket has (almost) the + same number of nodes in it. */ + int min = INT_MAX; + int max = INT_MIN; + int j; + + for (j = 0; j <= hmapx.hmap.mask; j++) + { + int count = 0; + struct hmap_node *node; + + for (node = hmapx.hmap.buckets[j]; node != NULL; + node = node->next) + count++; + if (count < min) + min = count; + if (count > max) + max = count; + } + check (max - min <= 1); + } + } + for (i = 0; i < max_elems; i++) + { + hmapx_delete (&hmapx, nodes[i]); + hmapx_shrink (&hmapx); + check_hmapx (&hmapx, values + i + 1, max_elems - i - 1, hash); + } + hmapx_destroy (&hmapx); + free (elements); + free (nodes); + free (values); +} + +static void +test_insert_ordered_random_hash (void) +{ + test_insert_ordered (1024, random_hash); +} + +static void +test_insert_ordered_identity_hash (void) +{ + test_insert_ordered (1024, identity_hash); +} + +static void +test_insert_ordered_constant_hash (void) +{ + test_insert_ordered (128, constant_hash); +} + +/* Inserts up to MAX_ELEMS elements into an HMAPX, then moves the + nodes around in memory, using hash function HASH. */ +static void +test_moved (int max_elems, hash_function *hash) +{ + struct element *e[2]; + int cur; + int *values; + struct hmapx_node **nodes; + struct hmapx hmapx; + int i, j; + + hmapx_init (&hmapx); + e[0] = xnmalloc (max_elems, sizeof *e[0]); + e[1] = xnmalloc (max_elems, sizeof *e[1]); + values = xnmalloc (max_elems, sizeof *values); + nodes = xnmalloc (max_elems, sizeof *nodes); + cur = 0; + for (i = 0; i < max_elems; i++) + { + values[i] = e[cur][i].data = i; + nodes[i] = hmapx_insert (&hmapx, &e[cur][i], hash (e[cur][i].data)); + check_hmapx (&hmapx, values, i + 1, hash); + + for (j = 0; j <= i; j++) + { + e[!cur][j] = e[cur][j]; + hmapx_move (nodes[j], &e[cur][j]); + check_hmapx (&hmapx, values, i + 1, hash); + } + cur = !cur; + } + hmapx_destroy (&hmapx); + free (e[0]); + free (e[1]); + free (values); + free (nodes); +} + +static void +test_moved_random_hash (void) +{ + test_moved (128, random_hash); +} + +static void +test_moved_identity_hash (void) +{ + test_moved (128, identity_hash); +} + +static void +test_moved_constant_hash (void) +{ + test_moved (32, constant_hash); +} + +/* Inserts values into an HMAPX, then changes their values, using + hash function HASH. */ +static void +test_changed (hash_function *hash) +{ + const int max_elems = 6; + int cnt; + + for (cnt = 0; cnt <= max_elems; cnt++) + { + int *values, *changed_values; + struct hmapx_node **nodes; + struct element *elements; + unsigned int permutation_cnt; + int i; + + values = xnmalloc (cnt, sizeof *values); + changed_values = xnmalloc (cnt, sizeof *changed_values); + elements = xnmalloc (cnt, sizeof *elements); + nodes = xnmalloc (cnt, sizeof *nodes); + for (i = 0; i < cnt; i++) + values[i] = i; + + for (permutation_cnt = 0; + permutation_cnt == 0 || next_permutation (values, cnt); + permutation_cnt++) + { + for (i = 0; i < cnt; i++) + { + int j, k; + for (j = 0; j <= cnt; j++) + { + struct hmapx hmapx; + + hmapx_init (&hmapx); + + /* Add to HMAPX in order. */ + for (k = 0; k < cnt; k++) + { + int n = values[k]; + elements[n].data = n; + nodes[n] = hmapx_insert (&hmapx, &elements[n], + hash (elements[n].data)); + } + check_hmapx (&hmapx, values, cnt, hash); + + /* Change value i to j. */ + elements[i].data = j; + hmapx_changed (&hmapx, nodes[i], + hash (elements[i].data)); + for (k = 0; k < cnt; k++) + changed_values[k] = k; + changed_values[i] = j; + check_hmapx (&hmapx, changed_values, cnt, hash); + + hmapx_destroy (&hmapx); + } + } + } + check (permutation_cnt == factorial (cnt)); + + free (values); + free (changed_values); + free (elements); + free (nodes); + } +} + +static void +test_changed_random_hash (void) +{ + test_changed (random_hash); +} + +static void +test_changed_identity_hash (void) +{ + test_changed (identity_hash); +} + +static void +test_changed_constant_hash (void) +{ + test_changed (constant_hash); +} + +/* Inserts values into an HMAPX, then changes and moves their + values, using hash function HASH. */ +static void +test_change (hash_function *hash) +{ + const int max_elems = 6; + int cnt; + + for (cnt = 0; cnt <= max_elems; cnt++) + { + int *values, *changed_values; + struct hmapx_node **nodes; + struct element *elements; + struct element replacement; + unsigned int permutation_cnt; + int i; + + values = xnmalloc (cnt, sizeof *values); + changed_values = xnmalloc (cnt, sizeof *changed_values); + elements = xnmalloc (cnt, sizeof *elements); + nodes = xnmalloc (cnt, sizeof *nodes); + for (i = 0; i < cnt; i++) + values[i] = i; + + for (permutation_cnt = 0; + permutation_cnt == 0 || next_permutation (values, cnt); + permutation_cnt++) + { + for (i = 0; i < cnt; i++) + { + int j, k; + for (j = 0; j <= cnt; j++) + { + struct hmapx hmapx; + + hmapx_init (&hmapx); + + /* Add to HMAPX in order. */ + for (k = 0; k < cnt; k++) + { + int n = values[k]; + elements[n].data = n; + nodes[n] = hmapx_insert (&hmapx, &elements[n], + hash (elements[n].data)); + } + check_hmapx (&hmapx, values, cnt, hash); + + /* Change value i to j. */ + replacement.data = j; + hmapx_change (&hmapx, nodes[i], &replacement, hash (j)); + for (k = 0; k < cnt; k++) + changed_values[k] = k; + changed_values[i] = j; + check_hmapx (&hmapx, changed_values, cnt, hash); + + hmapx_destroy (&hmapx); + } + } + } + check (permutation_cnt == factorial (cnt)); + + free (values); + free (changed_values); + free (elements); + free (nodes); + } +} + +static void +test_change_random_hash (void) +{ + test_change (random_hash); +} + +static void +test_change_identity_hash (void) +{ + test_change (identity_hash); +} + +static void +test_change_constant_hash (void) +{ + test_change (constant_hash); +} + +static void +test_swap (int max_elems, hash_function *hash) +{ + struct element *elements; + int *values; + struct hmapx a, b; + struct hmapx *working, *empty; + int i; + + hmapx_init (&a); + hmapx_init (&b); + working = &a; + empty = &b; + elements = xnmalloc (max_elems, sizeof *elements); + values = xnmalloc (max_elems, sizeof *values); + for (i = 0; i < max_elems; i++) + { + struct hmapx *tmp; + values[i] = elements[i].data = i; + hmapx_insert (working, &elements[i], hash (elements[i].data)); + check_hmapx (working, values, i + 1, hash); + check_hmapx (empty, NULL, 0, hash); + hmapx_swap (&a, &b); + tmp = working; + working = empty; + empty = tmp; + } + hmapx_destroy (&a); + hmapx_destroy (&b); + free (elements); + free (values); +} + +static void +test_swap_random_hash (void) +{ + test_swap (128, random_hash); +} + +static void +test_destroy_null (void) +{ + hmapx_destroy (NULL); +} + +/* Test shrinking an empty hash table. */ +static void +test_shrink_empty (void) +{ + struct hmapx hmapx; + + hmapx_init (&hmapx); + hmapx_reserve (&hmapx, 123); + hmapx_shrink (&hmapx); + hmapx_destroy (&hmapx); +} + +/* Main program. */ + +/* Runs TEST_FUNCTION and prints a message about NAME. */ +static void +run_test (void (*test_function) (void), const char *name) +{ + test_name = name; + putchar ('.'); + fflush (stdout); + test_function (); +} + +int +main (void) +{ + run_test (test_insert_any_remove_any_random_hash, + "insert any order, delete any order (random hash)"); + run_test (test_insert_any_remove_any_identity_hash, + "insert any order, delete any order (identity hash)"); + run_test (test_insert_any_remove_any_constant_hash, + "insert any order, delete any order (constant hash)"); + + run_test (test_insert_any_remove_same_random_hash, + "insert any order, delete same order (random hash)"); + run_test (test_insert_any_remove_same_identity_hash, + "insert any order, delete same order (identity hash)"); + run_test (test_insert_any_remove_same_constant_hash, + "insert any order, delete same order (constant hash)"); + + run_test (test_insert_any_remove_reverse_random_hash, + "insert any order, delete reverse order (random hash)"); + run_test (test_insert_any_remove_reverse_identity_hash, + "insert any order, delete reverse order (identity hash)"); + run_test (test_insert_any_remove_reverse_constant_hash, + "insert any order, delete reverse order (constant hash)"); + + run_test (test_random_sequence_random_hash, + "insert and delete in random sequence (random hash)"); + run_test (test_random_sequence_identity_hash, + "insert and delete in random sequence (identity hash)"); + run_test (test_random_sequence_constant_hash, + "insert and delete in random sequence (constant hash)"); + + run_test (test_insert_ordered_random_hash, + "insert in ascending order (random hash)"); + run_test (test_insert_ordered_identity_hash, + "insert in ascending order (identity hash)"); + run_test (test_insert_ordered_constant_hash, + "insert in ascending order (constant hash)"); + + run_test (test_moved_random_hash, + "move elements around in memory (random hash)"); + run_test (test_moved_identity_hash, + "move elements around in memory (identity hash)"); + run_test (test_moved_constant_hash, + "move elements around in memory (constant hash)"); + + run_test (test_changed_random_hash, + "change key data in nodes (random hash)"); + run_test (test_changed_identity_hash, + "change key data in nodes (identity hash)"); + run_test (test_changed_constant_hash, + "change key data in nodes (constant hash)"); + + run_test (test_change_random_hash, + "change and move key data in nodes (random hash)"); + run_test (test_change_identity_hash, + "change and move key data in nodes (identity hash)"); + run_test (test_change_constant_hash, + "change and move key data in nodes (constant hash)"); + + run_test (test_swap_random_hash, "test swapping tables"); + + run_test (test_destroy_null, "test destroying null table"); + run_test (test_shrink_empty, "test shrinking an empty table"); + + putchar ('\n'); + + return 0; +} diff --git a/tests/xforms/recode.sh b/tests/xforms/recode.sh index b9cabe0a82..ae29cc4d1d 100755 --- a/tests/xforms/recode.sh +++ b/tests/xforms/recode.sh @@ -124,15 +124,17 @@ RECODE x (LOWEST THRU 5=1)(ELSE=COPY) INTO cx5. RECODE x (4 THRU HIGHEST=2)(ELSE=COPY) INTO cx6. RECODE x (LO THRU HI=3)(ELSE=COPY) INTO cx7. RECODE x (SYSMIS=4)(ELSE=COPY) INTO cx8. -LIST x cx0 TO cx8. +RECODE x (5=COPY)(ELSE=22) INTO cx9. +LIST x cx0 TO cx9. * String to string, with INTO, without COPY. -STRING s0 TO s2 (A4)/t0 TO t3 (A10). +STRING s0 TO s3 (A4)/t0 TO t3 (A10). RECODE s t ('a'='b')('ab'='bc') INTO s0 t0. RECODE s t ('abcd'='xyzw') INTO s1 t1. RECODE s t ('abc'='def')(ELSE='xyz') INTO s2 t2. RECODE t ('a'='b')('abcdefghi'='xyz')('abcdefghij'='jklmnopqr') INTO t3. -LIST s t s0 TO s2 t0 TO t3. +RECODE s (MISSING='gone') INTO s3. +LIST s t s0 TO s3 t0 TO t3. * String to string, with INTO, with COPY. STRING cs0 TO cs2 (A4)/ct0 TO ct3 (A10). @@ -168,97 +170,96 @@ perl -pi -e 's/^\s*$//g' $TEMPDIR/pspp.list diff -bu $TEMPDIR/pspp.list - <