From ab90a86a87d3f006fbb362907b1c41110a117a79 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 29 Jan 2022 16:56:18 -0800 Subject: [PATCH] math: Make 'accumulate' a feature of order statistics, not all stats. --- src/math/box-whisker.c | 2 +- src/math/histogram.c | 12 +--------- src/math/np.c | 2 +- src/math/order-stats.c | 49 ++++++++++++++++++++++++---------------- src/math/order-stats.h | 50 +++++++++++++++++++++++++++++------------ src/math/percentiles.c | 40 +++++++++++++++------------------ src/math/shapiro-wilk.c | 2 +- src/math/statistic.h | 1 - src/math/trimmed-mean.c | 2 +- 9 files changed, 89 insertions(+), 71 deletions(-) diff --git a/src/math/box-whisker.c b/src/math/box-whisker.c index 86277333ff..e0fbbe94b2 100644 --- a/src/math/box-whisker.c +++ b/src/math/box-whisker.c @@ -151,9 +151,9 @@ box_whisker_create (const struct tukey_hinges *th, *w = (struct box_whisker) { .parent = { .parent = { - .accumulate = acc, .destroy = destroy, }, + .accumulate = acc, }, .hinges = { hinges[0], hinges[1], hinges[2] }, .whiskers = { SYSMIS, hinges[2] }, diff --git a/src/math/histogram.c b/src/math/histogram.c index 8aaa16f9e0..264b078535 100644 --- a/src/math/histogram.c +++ b/src/math/histogram.c @@ -37,16 +37,7 @@ void histogram_add (struct histogram *h, double y, double c) { - struct statistic *stat = &h->parent; - stat->accumulate (stat, NULL, c, 0, y); -} - -static void -acc (struct statistic *s, const struct ccase *cx UNUSED, double c, double cc UNUSED, double y) -{ - struct histogram *hist = UP_CAST (s, struct histogram, parent); - - gsl_histogram_accumulate (hist->gsl_hist, y, c); + gsl_histogram_accumulate (h->gsl_hist, y, c); } static void @@ -177,7 +168,6 @@ histogram_create (double bin_width_in, double min, double max) } stat = &h->parent; - stat->accumulate = acc; stat->destroy = destroy; return h; diff --git a/src/math/np.c b/src/math/np.c index dd21b1f362..d297168c59 100644 --- a/src/math/np.c +++ b/src/math/np.c @@ -93,9 +93,9 @@ np_create (double n, double mean, double var) *np = (struct np) { .parent = { .parent = { - .accumulate = acc, .destroy = destroy, }, + .accumulate = acc, }, .n = n, .mean = mean, diff --git a/src/math/order-stats.c b/src/math/order-stats.c index 2ec7c1a0d4..3f6b8fcfbb 100644 --- a/src/math/order-stats.c +++ b/src/math/order-stats.c @@ -18,6 +18,7 @@ #include "math/order-stats.h" +#include #include #include "data/casereader.h" @@ -86,10 +87,8 @@ update_k_values (const struct ccase *cx, double y_i, double c_i, double cc_i, } } - if (stat->accumulate) - stat->accumulate (stat, cx, c_i, cc_i, y_i); - - tos->cc = cc_i; + if (tos->accumulate) + tos->accumulate (stat, cx, c_i, cc_i, y_i); } } @@ -97,6 +96,8 @@ update_k_values (const struct ccase *cx, double y_i, double c_i, double cc_i, order statistics in OS, taking data from case index DATA_IDX and weights from case index WEIGHT_IDX. WEIGHT_IDX may be -1 to assume weight 1. + This function must be used only once per order_stats. + Takes ownership of READER. Data values must be numeric and sorted in ascending order. Use @@ -117,29 +118,37 @@ order_stats_accumulate_idx (struct order_stats **os, size_t n_os, for (; (cx = casereader_read (reader)) != NULL; case_unref (cx)) { const double weight = weight_idx == -1 ? 1.0 : case_num_idx (cx, weight_idx); - if (weight == SYSMIS) + if (weight == SYSMIS || weight <= 0) continue; const double this_value = case_num_idx (cx, data_idx); - assert (this_value >= prev_value); + if (!isfinite (this_value) || this_value == SYSMIS) + continue; - if (prev_value == -DBL_MAX || prev_value == this_value) - c_i += weight; + if (!prev_cx || this_value > prev_value) + { + if (prev_cx) + update_k_values (prev_cx, prev_value, c_i, cc_i, os, n_os); + prev_value = this_value; + c_i = weight; + } + else + { + /* Data values must be sorted. */ + assert (this_value == prev_value); + + c_i += weight; + } - if (prev_value > -DBL_MAX && this_value > prev_value) - { - update_k_values (prev_cx, prev_value, c_i, cc_i, os, n_os); - c_i = weight; - } - - case_unref (prev_cx); cc_i += weight; - prev_value = this_value; + case_unref (prev_cx); prev_cx = case_ref (cx); } - - update_k_values (prev_cx, prev_value, c_i, cc_i, os, n_os); - case_unref (prev_cx); + if (prev_cx) + { + update_k_values (prev_cx, prev_value, c_i, cc_i, os, n_os); + case_unref (prev_cx); + } casereader_destroy (reader); } @@ -149,6 +158,8 @@ order_stats_accumulate_idx (struct order_stats **os, size_t n_os, WEIGHT_VAR. Drops cases for which the value of DATA_VAR is missing according to EXCLUDE. WEIGHT_VAR may be NULL to assume weight 1. + This function must be used only once per order_stats. + Takes ownership of READER. DATA_VAR must be numeric and sorted in ascending order. Use diff --git a/src/math/order-stats.h b/src/math/order-stats.h index 581bb11455..4bb69383fb 100644 --- a/src/math/order-stats.h +++ b/src/math/order-stats.h @@ -19,9 +19,23 @@ /* Support for order statistics. - This data structure supplies infrastructure for higher-level statistics that - rely on order statistics. It is a kind of "abstract base class" that is not - useful on its own. The common pattern for using the statistics based on it + The kth order statistic of a statistical sample is equal to its kth-smallest + value. The minimum is the first order statistic and the maximum is the + largest. This code and data structure supplies infrastructure for + higher-level statistics that rely on order statistics. It is a kind of + "abstract base class" that is not useful on its own. + + This is implemented here as a kind of "partial frequency table". The + order_stats_accumulate() and order_stats_accumulate_idx() functions + effectively generate all of the frequency table entries for the variable, + one by one, and pass them to the "accumulate" function, if any. They can + also record pairs of frequency tables entries surrounding desired target + cumulative weights in 'k' data structures. + + Client use + ========== + + The common pattern for clients to use statistics based on order statistics is this: - Create the higher-level statistic with, for example, percentile_create(). @@ -43,18 +57,25 @@ struct casereader; struct variable; -/* - cc <= tc < cc_p1 +/* A pair of adjacent frequency table entries. + + cc <= tc < cc_p1 */ struct k { + /* Target cumulative weight. + Set by the client before invoking order_stats_accumulate{,_idx}. */ double tc; - double cc; - double cc_p1; - double c; - double c_p1; - double y; - double y_p1; + + /* Lower order statistics. */ + double cc; /* Largest cumulative weight <= tc. */ + double c; /* Weight for data values equal to 'y'. */ + double y; /* Data value. */ + + /* Upper order statistics. */ + double cc_p1; /* Smallest cumulative weight > tc. */ + double c_p1; /* Weight for data values equal to 'y_p1'. */ + double y_p1; /* Data value. */ }; /* Order statistics calculation data structure. See the comment at the top of @@ -62,10 +83,11 @@ struct k struct order_stats { struct statistic parent; - int n_k; - struct k *k; - double cc; + void (*accumulate) (struct statistic *, const struct ccase *, double c, double cc, double y); + + struct k *k; + size_t n_k; }; void order_stats_accumulate_idx (struct order_stats **os, size_t n_os, diff --git a/src/math/percentiles.c b/src/math/percentiles.c index 79514eb250..a9b2913b5c 100644 --- a/src/math/percentiles.c +++ b/src/math/percentiles.c @@ -143,36 +143,32 @@ destroy (struct statistic *stat) free (ptl); } - /* Create the Pth percentile. W is the total sum of weights in the data set. */ struct percentile * percentile_create (double p, double W) { - struct percentile *ptl = XZALLOC (struct percentile); - struct order_stats *os = &ptl->parent; - struct statistic *stat = &os->parent; - assert (p >= 0); assert (p <= 1.0); - ptl->ptile = p; - ptl->w = W; - - os->n_k = 2; - os->k = ptl->k; - os->k[0].tc = W * p; - os->k[1].tc = (W + 1.0) * p; - - ptl->g1 = ptl->g1_star = SYSMIS; - ptl->g2 = ptl->g2_star = SYSMIS; - - os->k[1].y_p1 = os->k[1].y = SYSMIS; - os->k[0].y_p1 = os->k[0].y = SYSMIS; - - stat->destroy = destroy; - + struct percentile *ptl = xmalloc (sizeof *ptl); + *ptl = (struct percentile) { + .parent = { + .parent = { + .destroy = destroy + }, + .k = ptl->k, + .n_k = 2, + }, + .ptile = p, + .w = W, + .g1 = SYSMIS, + .g1_star = SYSMIS, + .g2 = SYSMIS, + .g2_star = SYSMIS, + .k[0] = { .tc = W * p, .y = SYSMIS, .y_p1 = SYSMIS }, + .k[1] = { .tc = (W + 1.0) * p, .y = SYSMIS, .y_p1 = SYSMIS }, + }; return ptl; } - diff --git a/src/math/shapiro-wilk.c b/src/math/shapiro-wilk.c index 27ca1f0fb5..c2c8da6a9b 100644 --- a/src/math/shapiro-wilk.c +++ b/src/math/shapiro-wilk.c @@ -154,7 +154,7 @@ shapiro_wilk_create (int n, double mean) sw->warned = false; - stat->accumulate = acc; + os->accumulate = acc; stat->destroy = destroy; return sw; diff --git a/src/math/statistic.h b/src/math/statistic.h index 987264b1f9..2109d856a2 100644 --- a/src/math/statistic.h +++ b/src/math/statistic.h @@ -23,7 +23,6 @@ struct ccase ; struct statistic { - void (*accumulate) (struct statistic *, const struct ccase *cx, double c, double cc, double y); void (*destroy) (struct statistic *); }; diff --git a/src/math/trimmed-mean.c b/src/math/trimmed-mean.c index d44131a53b..dff256c0d3 100644 --- a/src/math/trimmed-mean.c +++ b/src/math/trimmed-mean.c @@ -57,9 +57,9 @@ trimmed_mean_create (double W, double tail) *tm = (struct trimmed_mean) { .parent = { .parent = { - .accumulate = acc, .destroy = destroy, }, + .accumulate = acc, .k = tm->k, .n_k = 2, }, -- 2.30.2