From: Ben Pfaff Date: Sun, 11 Apr 2004 21:32:00 +0000 (+0000) Subject: Rework moments routines for improved numerical stability based on X-Git-Tag: sav-api~2517 X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;ds=sidebyside;h=bf6d99567a762b20bee4bb71147b6387c986bb90;p=pspp Rework moments routines for improved numerical stability based on Michael Kiefte's advice. Any bugs or remaining numerical problems are still mine though. There is now a struct moments1 for use with one-pass moments. It uses a provisional means algorithm as an attempt to improve accuracy of higher moments. The older struct moments now only handles two-pass moments. --- diff --git a/src/ChangeLog b/src/ChangeLog index 9a4c4971f2..1fdf0d54e4 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,40 @@ +Sun Apr 11 14:22:12 2004 Ben Pfaff + + Rework moments routines for improved numerical stability based on + Michael Kiefte's advice. Any bugs or remaining numerical problems + are still mine though. + + There is now a struct moments1 for use with one-pass moments. It + uses a provisional means algorithm as an attempt to improve + accuracy of higher moments. The older struct moments now only + handles two-pass moments. + + * aggregate.c: Use moments1 instead moments. + + * descript.c: Revert previous change, which is no longer needed + due to the moments revision. + + * moments.c: (calc_moments) New function for calculating variance, + skewness, kurtosis. + (moments_pass_one) Only accumulate weights bigger than zero. + (moments_calculate) Allow calculating the mean on pass one, others + require pass two. Implement in terms of calc_moments(). + (struct moments1) New structure. + (init_moments1) New function. + (moments1_clear) Ditto. + (moments1_create) Ditto. + (moments1_add) Ditto. + (moments1_calculate) Ditto. + (moments1_destroy) Ditto. + (cmd_debug_moments) Deal with `struct moments' or `struct + moments1' as requested by user. + +Sun Apr 11 14:21:55 2004 Ben Pfaff + + * Makefile.am (pspp_SOURCES): Remove debug.c. + + * debug.c: Removed. It was empty anyway. + Fri Apr 9 20:04:49 2004 Ben Pfaff * descript.c (calc_descriptives): Fix assert failure when only diff --git a/src/Makefile.am b/src/Makefile.am index 082c6c232c..80195782e9 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -52,7 +52,7 @@ pspp_SOURCES = $(q_sources_c) aggregate.c algorithm.c algorithm.h \ alloc.c alloc.h apply-dict.c ascii.c autorecode.c bitvector.h \ casefile.c casefile.h cmdline.c cmdline.h command.c command.def \ command.h compute.c copyleft.c copyleft.h count.c data-in.c data-in.h \ -data-list.c data-list.h data-out.c date.c debug.c debug-print.h descript.c \ +data-list.c data-list.h data-out.c date.c debug-print.h descript.c \ devind.c devind.h dfm.c dfm.h dictionary.c do-if.c do-ifP.h error.c \ error.h expr-evl.c expr-opt.c expr-prs.c expr.h exprP.h expr.def \ file-handle.h \ diff --git a/src/aggregate.c b/src/aggregate.c index 9ffa35f5b5..094756e94f 100644 --- a/src/aggregate.c +++ b/src/aggregate.c @@ -53,7 +53,7 @@ struct agr_var int int1, int2; char *string; int missing; - struct moments *moments; + struct moments1 *moments; }; /* Aggregation functions. */ @@ -666,7 +666,7 @@ agr_destroy (struct agr_proc *agr) free (iter->string); } else if (iter->function == SD) - moments_destroy (iter->moments); + moments1_destroy (iter->moments); free (iter); } free (agr->prev_break); @@ -831,7 +831,7 @@ accumulate_aggregate_info (struct agr_proc *agr, iter->dbl[1] += weight; break; case SD: - moments_pass_two (iter->moments, v->f, weight); + moments1_add (iter->moments, v->f, weight); break; case MAX: iter->dbl[0] = max (iter->dbl[0], v->f); @@ -996,7 +996,7 @@ dump_aggregate_info (struct agr_proc *agr, struct ccase *output) double variance; /* FIXME: we should use two passes. */ - moments_calculate (i->moments, NULL, NULL, &variance, + moments1_calculate (i->moments, NULL, NULL, &variance, NULL, NULL); if (variance != SYSMIS) v->f = sqrt (variance); @@ -1098,9 +1098,9 @@ initialize_aggregate_info (struct agr_proc *agr) break; case SD: if (iter->moments == NULL) - iter->moments = moments_create (MOMENT_VARIANCE); + iter->moments = moments1_create (MOMENT_VARIANCE); else - moments_clear (iter->moments); + moments1_clear (iter->moments); break; default: iter->dbl[0] = iter->dbl[1] = iter->dbl[2] = 0.0; diff --git a/src/debug.c b/src/debug.c deleted file mode 100644 index 5392c68c64..0000000000 --- a/src/debug.c +++ /dev/null @@ -1,32 +0,0 @@ -/* PSPP - computes sample statistics. - Copyright (C) 2004 Free Software Foundation, Inc. - Written by Ben Pfaff . - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA - 02111-1307, USA. */ - -#include -#include -#include -#include -#include -#include "alloc.h" -#include "command.h" -#include "error.h" -#include "expr.h" -#include "lexer.h" -#include "moments.h" -#include "var.h" - diff --git a/src/descript.c b/src/descript.c index 0492b33ed2..93c31717d3 100644 --- a/src/descript.c +++ b/src/descript.c @@ -743,12 +743,7 @@ calc_descriptives (const struct casefile *cf, void *dsc_) } if (dv->moments != NULL) - { - if (dsc->max_moment > MOMENT_MEAN) - moments_pass_one (dv->moments, x, weight); - else - moments_pass_two (dv->moments, x, weight); - } + moments_pass_one (dv->moments, x, weight); if (x < dv->min) dv->min = x; diff --git a/src/moments.c b/src/moments.c index 9e27ae0f2c..c993f7ae0a 100644 --- a/src/moments.c +++ b/src/moments.c @@ -25,12 +25,56 @@ #include "alloc.h" #include "misc.h" #include "val.h" + +/* Calculates variance, skewness, and kurtosis into *VARIANCE, + *SKEWNESS, and *KURTOSIS if they are non-null and not greater + moments than MAX_MOMENT. Accepts W as the total weight, D1 as + the total deviation from the estimated mean, and D2, D3, and + D4 as the sum of the squares, cubes, and 4th powers, + respectively, of the deviation from the estimated mean. */ +static void +calc_moments (enum moment max_moment, + double w, double d1, double d2, double d3, double d4, + double *variance, double *skewness, double *kurtosis) +{ + assert (w > 0.); + + if (max_moment >= MOMENT_VARIANCE && w > 1.) + { + double s2; -/* FIXME? _SPSS Statistical Algorithms_ in the DESCRIPTIVES - second describes a "provisional means algorithm" that might be - useful for improving accuracy when we only do one pass. */ + /* From _Numerical Recipes in C_, 2nd ed., 0-521-43108-5, + section 14.1. */ + s2 = (d2 - pow2 (d1) / w) / (w - 1.); + if (variance != NULL) + *variance = s2; + + /* From _SPSS Statistical Algorithms, 2nd ed., + 0-918469-89-9, section "DESCRIPTIVES". */ + if (fabs (*variance) >= 1e-20) + { + if (max_moment >= MOMENT_SKEWNESS && skewness != NULL && w > 2.) + { + double s3 = s2 * sqrt (s2); + double g1 = (w * d3) / ((w - 1.0) * (w - 2.0) * s3); + if (finite (g1)) + *skewness = g1; + } + if (max_moment >= MOMENT_KURTOSIS && kurtosis != NULL && w > 3.) + { + double g2 = ((w * (w + 1.) * d4 + - 3. * pow2 (d2) * (w - 1.)) + / ((w - 1.) * (w - 2.) * (w - 3.) * pow2 (s2))); + if (finite (g2)) + *kurtosis = g2; + } + } + } +} + +/* Two-pass moments. */ -/* A set of moments in process of calculation. */ +/* A set of two-pass moments. */ struct moments { enum moment max_moment; /* Highest-order moment we're computing. */ @@ -72,13 +116,15 @@ moments_clear (struct moments *m) } /* Creates and returns a data structure for calculating moment - MAX_MOMENT and lower moments on a data series. For greatest - accuracy, the user should call moments_pass_one() for each - value in the series, then call moments_pass_two() for the same - set of values in the same order, then call moments_calculate() - to obtain the moments. At a cost of reduced accuracy, the - first pass can be skipped. In either case, moments_destroy() - should be called when the moments are no longer needed. */ + MAX_MOMENT and lower moments on a data series. The user + should call moments_pass_one() for each value in the series, + then call moments_pass_two() for the same set of values in the + same order, then call moments_calculate() to obtain the + moments. The user may ask for the mean at any time during the + first pass (using moments_calculate()), but otherwise no + statistics may be requested until the end of the second pass. + Call moments_destroy() when the moments are no longer + needed. */ struct moments * moments_create (enum moment max_moment) { @@ -95,7 +141,7 @@ moments_pass_one (struct moments *m, double value, double weight) assert (m != NULL); assert (m->pass == 1); - if (value != SYSMIS && weight >= 0.) + if (value != SYSMIS && weight > 0.) { m->sum += value * weight; m->w1 += weight; @@ -160,16 +206,7 @@ moments_calculate (const struct moments *m, double *mean, double *variance, double *skewness, double *kurtosis) { - double W; - int one_pass; - assert (m != NULL); - assert (m->pass == 2); - - one_pass = m->w1 == 0.; - - /* If passes 1 and 2 are used, then w1 and w2 must agree. */ - assert (one_pass || m->w1 == m->w2); if (mean != NULL) *mean = SYSMIS; @@ -180,52 +217,33 @@ moments_calculate (const struct moments *m, if (kurtosis != NULL) *kurtosis = SYSMIS; - W = m->w2; if (weight != NULL) - *weight = W; - if (W == 0.) - return; - - if (mean != NULL) - *mean = m->mean + m->d1 / W; + *weight = m->w1; - if (m->max_moment >= MOMENT_VARIANCE && W > 1.) + /* How many passes so far? */ + if (m->pass == 1) { - double variance_tmp; - - /* From _Numerical Recipes in C_, 2nd ed., 0-521-43108-5, - section 14.1. */ - if (variance == NULL) - variance = &variance_tmp; - *variance = (m->d2 - pow2 (m->d1) / W) / (W - 1.); - - /* From _SPSS Statistical Algorithms, 2nd ed., - 0-918469-89-9, section "DESCRIPTIVES". */ - if (fabs (*variance) >= 1e-20) + /* In the first pass we can only calculate the mean. */ + if (mean != NULL && m->w1 > 0.) + *mean = m->sum / m->w1; + } + else + { + /* After the second pass we can calculate any stat. We + don't support "online" computation during the second + pass, so As a simple self-check, the total weight for + the passes must agree. */ + assert (m->pass == 2); + assert (m->w1 == m->w2); + + if (m->w2 > 0.) { - if (m->max_moment >= MOMENT_SKEWNESS && skewness != NULL && W > 2.) - { - *skewness = ((W * m->d3 - 3.0 * m->d1 * m->d2 + 2.0 - * pow3 (m->d1) / W) - / ((W - 1.0) * (W - 2.0) - * *variance * sqrt (*variance))); - if (!finite (*skewness)) - *skewness = SYSMIS; - } - if (m->max_moment >= MOMENT_KURTOSIS && kurtosis != NULL && W > 3.) - { - *kurtosis = (((W + 1) * (W * m->d4 - - 4.0 * m->d1 * m->d3 - + 6.0 * pow2 (m->d1) * m->d2 / W - - 3.0 * pow4 (m->d1) / pow2 (W))) - / ((W - 1.0) * (W - 2.0) * (W - 3.0) - * pow2 (*variance)) - - (3.0 * pow2 (W - 1.0)) - / ((W - 2.0) * (W - 3.))); - if (!finite (*kurtosis)) - *kurtosis = SYSMIS; - } - } + if (mean != NULL) + *mean = m->mean; + calc_moments (m->max_moment, + m->w2, m->d1, m->d2, m->d3, m->d4, + variance, skewness, kurtosis); + } } } @@ -301,7 +319,152 @@ moments_of_values (const union value *array, size_t cnt, moments_pass_two (&m, array[idx].f, 1.); moments_calculate (&m, weight, mean, variance, skewness, kurtosis); } + +/* One-pass moments. */ + +/* A set of one-pass moments. */ +struct moments1 + { + enum moment max_moment; /* Highest-order moment we're computing. */ + double w; /* Total weight so far. */ + double d1; /* Sum of deviations from the mean. */ + double d2; /* Sum of squared deviations from the mean. */ + double d3; /* Sum of cubed deviations from the mean. */ + double d4; /* Sum of (deviations from the mean)**4. */ + }; + +/* Initializes one-pass moments M for calculating moment + MAX_MOMENT and lower moments. */ +static void +init_moments1 (struct moments1 *m, enum moment max_moment) +{ + assert (m != NULL); + assert (max_moment == MOMENT_MEAN || max_moment == MOMENT_VARIANCE + || max_moment == MOMENT_SKEWNESS || max_moment == MOMENT_KURTOSIS); + m->max_moment = max_moment; + moments1_clear (m); +} + +/* Clears out a set of one-pass moments so that it can be reused + for a new set of values. The moments to be calculated are not + changed. */ +void +moments1_clear (struct moments1 *m) +{ + m->w = 0.; + m->d1 = m->d2 = m->d3 = m->d4 = 0.; +} + +/* Creates and returns a data structure for calculating moment + MAX_MOMENT and lower moments on a data series in a single + pass. The user should call moments1_add() for each value in + the series. The user may call moments1_calculate() to obtain + the current moments at any time. Call moments1_destroy() when + the moments are no longer needed. + + One-pass moments should only be used when two passes over the + data are impractical. */ +struct moments1 * +moments1_create (enum moment max_moment) +{ + struct moments1 *m = xmalloc (sizeof *m); + init_moments1 (m, max_moment); + return m; +} + +/* Adds VALUE with the given WEIGHT to the calculation of + one-pass moments. */ +void +moments1_add (struct moments1 *m, double value, double weight) +{ + assert (m != NULL); + + if (value != SYSMIS && weight > 0.) + { + double prev_w, v1; + + prev_w = m->w; + m->w += weight; + v1 = (weight / m->w) * (value - m->d1); + m->d1 += v1; + + if (m->max_moment >= MOMENT_VARIANCE) + { + double v2 = v1 * v1; + double w_prev_w = m->w * prev_w; + double prev_m2 = m->d2; + + m->d2 += w_prev_w / weight * v2; + if (m->max_moment >= MOMENT_SKEWNESS) + { + double w2 = weight * weight; + double v3 = v2 * v1; + double prev_m3 = m->d3; + + m->d3 += (-3. * v1 * prev_m2 + + w_prev_w / w2 * (m->w - 2. * weight) * v3); + if (m->max_moment >= MOMENT_KURTOSIS) + { + double w3 = w2 * weight; + double v4 = v2 * v2; + + m->d4 += (-4. * v1 * prev_m3 + + 6. * v2 * prev_m2 + + ((pow2 (m->w) - 3. * weight * prev_w) + * v4 * w_prev_w / w3)); + } + } + } + } +} + +/* Calculates one-pass moments based on the input data. Stores + the total weight in *WEIGHT, the mean in *MEAN, the variance + in *VARIANCE, the skewness in *SKEWNESS, and the kurtosis in + *KURTOSIS. Any of these result parameters may be null + pointers, in which case the values are not calculated. If any + result cannot be calculated, either because they are undefined + based on the input data or because their moments are higher + than the maximum requested on moments_create(), then SYSMIS is + stored into that result. */ +void +moments1_calculate (const struct moments1 *m, + double *weight, + double *mean, double *variance, + double *skewness, double *kurtosis) +{ + assert (m != NULL); + if (mean != NULL) + *mean = SYSMIS; + if (variance != NULL) + *variance = SYSMIS; + if (skewness != NULL) + *skewness = SYSMIS; + if (kurtosis != NULL) + *kurtosis = SYSMIS; + + if (weight != NULL) + *weight = m->w; + + if (m->w > 0.) + { + if (mean != NULL) + *mean = m->d1; + + calc_moments (m->max_moment, + m->w, 0., m->d2, m->d3, m->d4, + variance, skewness, kurtosis); + } +} + +/* Destroy one-pass moments M. */ +void +moments1_destroy (struct moments1 *m) +{ + free (m); +} + /* Returns the standard error of the skewness for the given total weight W. @@ -373,7 +536,6 @@ int cmd_debug_moments (void) { int retval = CMD_FAILURE; - struct moments *m = NULL; double *values = NULL; double *weights = NULL; double weight, M[4]; @@ -391,18 +553,39 @@ cmd_debug_moments (void) fprintf (stderr, "%s => ", lex_rest_of_line (NULL)); lex_get (); - m = moments_create (MOMENT_KURTOSIS); - if (!read_values (&values, &weights, &cnt)) - goto done; if (two_pass) { + struct moments *m = NULL; + + m = moments_create (MOMENT_KURTOSIS); + if (!read_values (&values, &weights, &cnt)) + { + moments_destroy (m); + goto done; + } for (i = 0; i < cnt; i++) moments_pass_one (m, values[i], weights[i]); + for (i = 0; i < cnt; i++) + moments_pass_two (m, values[i], weights[i]); + moments_calculate (m, &weight, &M[0], &M[1], &M[2], &M[3]); + moments_destroy (m); } - for (i = 0; i < cnt; i++) - moments_pass_two (m, values[i], weights[i]); - moments_calculate (m, &weight, &M[0], &M[1], &M[2], &M[3]); - + else + { + struct moments1 *m = NULL; + + m = moments1_create (MOMENT_KURTOSIS); + if (!read_values (&values, &weights, &cnt)) + { + moments1_destroy (m); + goto done; + } + for (i = 0; i < cnt; i++) + moments1_add (m, values[i], weights[i]); + moments1_calculate (m, &weight, &M[0], &M[1], &M[2], &M[3]); + moments1_destroy (m); + } + fprintf (stderr, "W=%.3f", weight); for (i = 0; i < 4; i++) { @@ -419,7 +602,6 @@ cmd_debug_moments (void) retval = lex_end_of_command (); done: - moments_destroy (m); free (values); free (weights); return retval; diff --git a/src/moments.h b/src/moments.h index e48bfd5ec6..791e9c5469 100644 --- a/src/moments.h +++ b/src/moments.h @@ -36,6 +36,7 @@ enum moment struct moments; +/* Two-pass moments. */ struct moments *moments_create (enum moment max_moment); void moments_clear (struct moments *); void moments_pass_one (struct moments *, double value, double weight); @@ -46,6 +47,7 @@ void moments_calculate (const struct moments *, double *skewness, double *kurtosis); void moments_destroy (struct moments *); +/* Convenience functions for two-pass moments. */ void moments_of_doubles (const double *array, size_t cnt, double *weight, double *mean, double *variance, @@ -55,6 +57,18 @@ void moments_of_values (const union value *array, size_t cnt, double *mean, double *variance, double *skewness, double *kurtosis); +/* One-pass moments. Use only if two passes are impractical. */ +struct moments1 *moments1_create (enum moment max_moment); +void moments1_clear (struct moments1 *); +void moments1_add (struct moments1 *, double value, double weight); +void moments1_calculate (const struct moments1 *, + double *weight, + double *mean, double *variance, + double *skewness, double *kurtosis); +void moments1_destroy (struct moments1 *); + +/* Standard errors. */ +double calc_semean (double stddev, double weight); double calc_seskew (double weight); double calc_sekurt (double weight); diff --git a/tests/ChangeLog b/tests/ChangeLog index 83881f0b76..22053d6a9d 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,3 +1,8 @@ +Sun Apr 11 14:21:16 2004 Ben Pfaff + + * stats/moments.sh: Now that our one-pass moments algorithm is + better we don't have to omit any of the test cases for it. + Fri Apr 9 20:03:33 2004 Ben Pfaff * Makefile.am: (TESTS) Add stats/descript-mean-bug.sh. diff --git a/tests/stats/moments.sh b/tests/stats/moments.sh index e262e82fb0..0bca3fbcfa 100755 --- a/tests/stats/moments.sh +++ b/tests/stats/moments.sh @@ -58,14 +58,15 @@ sed -ne 's/#.*//;/^[ ]*$/!p' > $TEMPDIR/moments-list-1p <<'EOF' 1*3 => W=3.000 M1=1.000 M2=0.000 M3=sysmis M4=sysmis 1*2 3 => W=3.000 M1=1.667 M2=1.333 M3=1.732 M4=sysmis 1 1.00000001 => W=2.000 M1=1.000 M2=0.000 M3=sysmis M4=sysmis +1000001 1000002 1000003 1000004 => W=4.000 M1=1000002.500 M2=1.667 M3=0.000 M4=-1.200 EOF if [ $? -ne 0 ] ; then no_result ; fi cp $TEMPDIR/moments-list-1p $TEMPDIR/moments-list-2p sed -ne 's/#.*//;/^[ ]*$/!p' >> $TEMPDIR/moments-list-2p <<'EOF' -# Only the two-pass algorithm can be expected to produce -# proper third and fourth moments here. -1000001 1000002 1000003 1000004 => W=4.000 M1=1000002.500 M2=1.667 M3=0.000 M4=-1.200 +# We used to have an example for which only the two-pass algorithm +# produced reasonable results, but the provisional means algorithm +# does better, so there aren't any extra tests here. EOF activity="create two-pass input file" @@ -73,11 +74,11 @@ sed < $TEMPDIR/moments-list-2p >> $TEMPDIR/moments-2p.stat \ -e 's#^\(.*\) => \(.*\)$#DEBUG MOMENTS/\1.#' if [ $? -ne 0 ] ; then no_result ; fi -activity="run program" +activity="run two-pass program" $SUPERVISOR $here/../src/pspp --testing-mode -o raw-ascii \ $TEMPDIR/moments-2p.stat >$TEMPDIR/moments-2p.err 2> $TEMPDIR/moments-2p.out -activity="compare output" +activity="compare two-pass output" diff -B -b $TEMPDIR/moments-list-2p $TEMPDIR/moments-2p.out if [ $? -ne 0 ] ; then fail ; fi @@ -86,11 +87,11 @@ sed < $TEMPDIR/moments-list-1p >> $TEMPDIR/moments-1p.stat \ -e 's#^\(.*\) => \(.*\)$#DEBUG MOMENTS ONEPASS/\1.#' if [ $? -ne 0 ] ; then no_result ; fi -activity="run program" +activity="run one-pass program" $SUPERVISOR $here/../src/pspp --testing-mode -o raw-ascii \ $TEMPDIR/moments-1p.stat >$TEMPDIR/moments-1p.err 2> $TEMPDIR/moments-1p.out -activity="compare output" +activity="compare one-pass output" diff -B -b $TEMPDIR/moments-list-1p $TEMPDIR/moments-1p.out if [ $? -ne 0 ] ; then fail ; fi