From: Ben Pfaff Date: Fri, 6 Feb 2009 05:25:55 +0000 (-0800) Subject: Speed up Wilcoxon matched-pairs signed-ranks significance routine. X-Git-Tag: v0.7.3~308 X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e00c8b2ca5092c6871c0cae67be95a884b5671a3;p=pspp-builds.git Speed up Wilcoxon matched-pairs signed-ranks significance routine. The performance of this routine was O(2**N). This improves it to O(N*W). For N=30, W=120 this reduces calculation time from 6m9s to .002s, which is a 123000x speedup. Refer to bug #25466 for more information. --- diff --git a/lib/automake.mk b/lib/automake.mk index 7cd9c9f4..6b20c674 100644 --- a/lib/automake.mk +++ b/lib/automake.mk @@ -1,7 +1,6 @@ ## Process this file with automake to produce Makefile.in -*- makefile -*- include $(top_srcdir)/lib/linreg/automake.mk -include $(top_srcdir)/lib/misc/automake.mk if WITHGUI include $(top_srcdir)/lib/gtk-contrib/automake.mk diff --git a/lib/misc/README b/lib/misc/README deleted file mode 100644 index 68ba91ad..00000000 --- a/lib/misc/README +++ /dev/null @@ -1,2 +0,0 @@ -This is not part of the GNU PSPP program, but is used with GNU PSPP. - diff --git a/lib/misc/automake.mk b/lib/misc/automake.mk deleted file mode 100644 index 3b4f1a93..00000000 --- a/lib/misc/automake.mk +++ /dev/null @@ -1,8 +0,0 @@ -## Process this file with automake to produce Makefile.in -*- makefile -*- - -noinst_LTLIBRARIES += lib/misc/libmisc.la - -lib_misc_libmisc_la_SOURCES = \ - lib/misc/wx-mp-sr.c lib/misc/wx-mp-sr.h - -EXTRA_DIST += lib/misc/README diff --git a/lib/misc/wx-mp-sr.c b/lib/misc/wx-mp-sr.c deleted file mode 100644 index 39e83b97..00000000 --- a/lib/misc/wx-mp-sr.c +++ /dev/null @@ -1,101 +0,0 @@ -#include -#include "wx-mp-sr.h" - -/********************************************************************* -* -* Calculate the exact level of significance for a -* Wilcoxon Matched-Pair Signed-Ranks Test using the sample's -* Sum of Ranks W and the sample size (i.e., number of pairs) N. -* This whole routine can be run as a stand-alone program. -* -* Use: -* WX-MP-SR W N -* -* Copyright 1996, Rob van Son -* -* This program is free software; you can redistribute it and/or modify -* it under the terms of the GNU General Public License as published by -* the Free Software Foundation; either version 2 of the License, or -* (at your option) any later version. -* -* This program is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -* GNU General Public License for more details. -* -* You should have received a copy of the GNU General Public License along -* with this program; if not, write to the Free Software Foundation, Inc., -* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -* -* ------------------------------------------------------- -* Rob van Son -* Institute of Phonetic Sciences & IFOTT -* University of Amsterdam, Spuistraat 210 -* NL-1012VT Amsterdam, The Netherlands -* Tel.: (+31) 205252196 Fax.: (+31) 205252197 -* Email: r.j.j.h.vanson@uva.nl -* WWW page: http://www.fon.hum.uva.nl/rob -* ------------------------------------------------------- -* -* This is the actual routine that calculates the exact (two-tailed) -* level of significance for the Wilcoxon Matched-Pairs Signed-Ranks -* test. The inputs are the Sum of Ranks of either the positive of -* negative samples (W) and the sample size (N). -* The Level of significance is calculated by checking for each -* possible outcome (2**N possibilities) whether the sum of ranks -* is larger than or equal to the observed Sum of Ranks (W). -* -* NOTE: The execution-time scales like ~ N*2**N, i.e., N*pow(2, N), -* which is more than exponential. Adding a single pair to the sample -* (i.e., increase N by 1) will more than double the time needed to -* complete the calculations (apart from an additive constant). -* The execution-time of this program can easily outrun your -* patience. -* -***********************************************************************/ - -double LevelOfSignificanceWXMPSR(double Winput, long int N) -{ - unsigned long int W, MaximalW, NumberOfPossibilities, CountLarger; - unsigned long int i, RankSum, j; - double p; - - /* Determine Wmax, i.e., work with the largest Rank Sum */ - MaximalW = N*(N+1)/2; - if(Winput < MaximalW/2)Winput = MaximalW - Winput; - W = Winput; /* Convert to long int */ - if(W != Winput)++W; /* Increase to next full integer */ - - /* The total number of possible outcomes is 2**N */ - NumberOfPossibilities = 1 << N; - - /* Initialize and loop. The loop-interior will be run 2**N times. */ - CountLarger = 0; - /* Generate all distributions of sign over ranks as bit-patterns (i). */ - for(i=0; i < NumberOfPossibilities; ++i) - { - RankSum = 0; - /* - Shift "sign" bits out of i to determine the Sum of Ranks (j). - */ - for(j=0; j < N; ++j) - { - if((i >> j) & 1)RankSum += j + 1; - }; - /* - * Count the number of "samples" that have a Sum of Ranks larger than - * or equal to the one found (i.e., >= W). - */ - if(RankSum >= W)++CountLarger; - }; - /***************************************************************** - * The level of significance is the number of outcomes with a - * sum of ranks equal to or larger than the one found (W) - * divided by the total number of possible outcomes. - * The level is doubled to get the two-tailed result. - ******************************************************************/ - p = 2*((double)CountLarger) / ((double)NumberOfPossibilities); - - return p; -} - diff --git a/lib/misc/wx-mp-sr.h b/lib/misc/wx-mp-sr.h deleted file mode 100644 index ec3ef5ed..00000000 --- a/lib/misc/wx-mp-sr.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef WX_MP_SR -#define WX_MP_SR 1 - -double LevelOfSignificanceWXMPSR(double Winput, long int N); - -#endif diff --git a/src/language/stats/wilcoxon.c b/src/language/stats/wilcoxon.c index 2a05f5d9..a485f350 100644 --- a/src/language/stats/wilcoxon.c +++ b/src/language/stats/wilcoxon.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/math/wilcoxon-sig.c b/src/math/wilcoxon-sig.c new file mode 100644 index 00000000..d9a4bcae --- /dev/null +++ b/src/math/wilcoxon-sig.c @@ -0,0 +1,156 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2009 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +/* Thanks to Rob van Son for writing the original version of this + file. This version has been completely rewritten; only the + name of the function has been retained. In the process of + rewriting, it was sped up from O(2**N) to O(N**3). */ + +#include +#include "wilcoxon-sig.h" +#include +#include +#include +#include +#include "xalloc.h" + +/* For integers N and W, with 0 <= N < CHAR_BIT*sizeof(long), + calculates and returns the value of the function S(N,W), + defined as the number of subsets of 1, 2, 3, ..., N that sum + to at least W. There are 2**N subsets of N items, so S(N,W) + is in the range 0...2**N. + + There are a few trivial cases: + + * For W <= 0, S(N,W) = 2**N. + + * For W > N*(N+1)/2, S(N,W) = 0. + + * S(1,1) = 1. + + Notably, these trivial cases include all values of W for N = 1. + + Now consider the remaining, nontrivial cases, that is, N > 1 and + 1 <= W <= N*(N+1)/2. In this case, apply the following identity: + + S(N,W) = S(N-1, W) + S(N-1, W-N). + + The first term on the right hand is the number of subsets that do + not include N that sum to at least W; the second term is the + number of subsets that do include N that sum to at least W. + + Then we repeatedly apply the identity to the result, reducing the + value of N by 1 each time until we reach N=1. Some expansions + yield trivial cases, e.g. if W - N <= 0 (in which case we add a + 2**N term to the final result) or if W is greater than the new N. + + Here is an example: + + S(7,7) = S(6,7) + S(6,0) + = S(6,7) + 64 + + = (S(5,7) + S(5,1)) + 64 + + = (S(4,7) + S(4,2)) + (S(4,1) + S(4,0)) + 64 + = S(4,7) + S(4,2) + S(4,1) + 80 + + = (S(3,7) + S(3,3)) + (S(3,2) + S(3,2)) + (S(3,1) + S(3,0)) + 80 + = S(3,3) + 2*S(3,2) + S(3,1) + 88 + + = (S(2,3) + S(2,0)) + 2*(S(2,2) + S(2,0)) + (S(2,1) + S(2,0)) + 88 + = S(2,3) + 2*S(2,2) + S(2,1) + 104 + + = (S(1,3) + S(1,1)) + 2*(S(1,2) + S(1,0)) + (S(1,1) + S(2,0)) + 104 + = 2*S(1,1) + 112 + + = 114 + + This function runs in O(N*W) = O(N**3) time. It seems very + likely that it can be made to run in O(N**2) time or perhaps + even better, but N is, practically speaking, quite small. + Plus, the return value may be as large as N**2, so N must not + be larger than 31 on 32-bit systems anyhow, and even 63**3 is + only 250,047. +*/ +static unsigned long int +count_sums_to_W (unsigned long int n, unsigned long int w) +{ + /* The array contain ints even though everything else is long, + but no element in the array can have a value bigger than N, + and using int will save some memory on 64-bit systems. */ + unsigned long int total; + unsigned long int max; + int *array; + + assert (n < CHAR_BIT * sizeof (unsigned long int)); + if (n == 0) + return 0; + else if (w <= 0) + return 1 << n; + else if (w > n * (n + 1) / 2) + return 0; + else if (n == 1) + return 1; + + array = xcalloc (sizeof *array, w + 1); + array[w] = 1; + + max = w; + total = 0; + for (; n > 1; n--) + { + unsigned long int max_sum = n * (n + 1) / 2; + int i; + + if (max_sum < max) + max = max_sum; + + for (i = 1; i <= max; i++) + if (array[i] != 0) + { + int new_w = i - n; + if (new_w <= 0) + total += array[i] * (1 << (n - 1)); + else + array[new_w] += array[i]; + } + } + total += array[1]; + free (array); + return total; +} + +/* Returns the exact, two-tailed level of significance for the + Wilcoxon Matched-Pairs Signed-Ranks test, given sum of ranks + of positive (or negative samples) W and sample size N. + + Returns -1 if the exact significance level cannot be + calculated because W is out of the supported range. */ +double +LevelOfSignificanceWXMPSR (double w, long int n) +{ + unsigned long int max_w; + + /* Limit N to valid range that won't cause integer overflow. */ + if (n < 0 || n >= CHAR_BIT * sizeof (unsigned long int)) + return -1; + + max_w = n * (n + 1) / 2; + if (w < max_w / 2) + w = max_w - w; + + return count_sums_to_W (n, ceil (w)) / (double) (1 << n) * 2; +} diff --git a/src/math/wilcoxon-sig.h b/src/math/wilcoxon-sig.h new file mode 100644 index 00000000..07eba552 --- /dev/null +++ b/src/math/wilcoxon-sig.h @@ -0,0 +1,6 @@ +#ifndef WX_MP_SR +#define WX_MP_SR 1 + +double LevelOfSignificanceWXMPSR(double w, long int n); + +#endif