From f447ebdf19acf26d2d46cee1595e99c3620ee30d Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 29 Jan 2022 12:00:16 -0800 Subject: [PATCH] math: Improve comments. This includes some changes to parameter names and coding style in order-stats that I think clarify the code and modernize it. --- src/math/np.c | 11 +++++ src/math/np.h | 2 + src/math/order-stats.c | 100 ++++++++++++++++++++--------------------- src/math/order-stats.h | 48 +++++++++++++------- src/math/percentiles.c | 4 ++ src/math/percentiles.h | 22 ++++----- src/math/sort.c | 3 ++ src/math/sort.h | 15 +++++++ 8 files changed, 127 insertions(+), 78 deletions(-) diff --git a/src/math/np.c b/src/math/np.c index 0d691c2aa7..220865ba3c 100644 --- a/src/math/np.c +++ b/src/math/np.c @@ -71,6 +71,17 @@ acc (struct statistic *s, const struct ccase *cx UNUSED, np->prev_cc = cc; } +/* Creates and returns a data structure whose accumulated results can be used + to produce a normal probability plot. The caller must supply the weighted + sample size N and the mean MEAN and variance VAR of the distribution, then + feed in the data with order_stats_accumulate() or + order_stats_accumulate_idx(). + + There is no function to produce the results, which appear in "struct np" for + passing directly to np_plot_create() or dnp_plot_create(). + + The caller must eventually destroy the returned structure, with + statistic_destroy(). */ struct np * np_create (double n, double mean, double var) { diff --git a/src/math/np.h b/src/math/np.h index 2fa009f90a..998bee089e 100644 --- a/src/math/np.h +++ b/src/math/np.h @@ -29,6 +29,8 @@ enum n_NP_IDX }; +/* Statistics needed to produce a normal probability plot with + np_plot_create(). */ struct np { struct order_stats parent; diff --git a/src/math/order-stats.c b/src/math/order-stats.c index e8e078f4b6..2ec7c1a0d4 100644 --- a/src/math/order-stats.c +++ b/src/math/order-stats.c @@ -58,48 +58,32 @@ order_stats_dump (const struct order_stats *os) #endif -static void -update_k_lower (struct k *kk, - double y_i, double c_i, double cc_i) -{ - if (cc_i <= kk->tc) - { - kk->cc = cc_i; - kk->c = c_i; - kk->y = y_i; - } -} - - -static void -update_k_upper (struct k *kk, - double y_i, double c_i, double cc_i) -{ - if (cc_i > kk->tc && kk->c_p1 == 0) - { - kk->cc_p1 = cc_i; - kk->c_p1 = c_i; - kk->y_p1 = y_i; - } -} - - static void update_k_values (const struct ccase *cx, double y_i, double c_i, double cc_i, struct order_stats **os, size_t n_os) { - int j; - - for (j = 0 ; j < n_os ; ++j) + for (size_t j = 0; j < n_os; ++j) { - int k; struct order_stats *tos = os[j]; struct statistic *stat = &tos->parent; - for (k = 0 ; k < tos->n_k; ++k) + + for (struct k *k = tos->k; k < &tos->k[tos->n_k]; ++k) { - struct k *myk = &tos->k[k]; - update_k_lower (myk, y_i, c_i, cc_i); - update_k_upper (myk, y_i, c_i, cc_i); + /* Update 'k' lower values. */ + if (cc_i <= k->tc) + { + k->cc = cc_i; + k->c = c_i; + k->y = y_i; + } + + /* Update 'k' upper values. */ + if (cc_i > k->tc && k->c_p1 == 0) + { + k->cc_p1 = cc_i; + k->c_p1 = c_i; + k->y_p1 = y_i; + } } if (stat->accumulate) @@ -109,12 +93,19 @@ update_k_values (const struct ccase *cx, double y_i, double c_i, double cc_i, } } +/* Reads all the cases from READER and accumulates their data into the N_OS + order statistics in OS, taking data from case index DATA_IDX and weights + from case index WEIGHT_IDX. WEIGHT_IDX may be -1 to assume weight 1. + + Takes ownership of READER. + Data values must be numeric and sorted in ascending order. Use + sort_execute_1var() or related functions to sort unsorted data before + passing it to this function. */ void -order_stats_accumulate_idx (struct order_stats **os, size_t nos, +order_stats_accumulate_idx (struct order_stats **os, size_t n_os, struct casereader *reader, - int wt_idx, - int val_idx) + int weight_idx, int data_idx) { struct ccase *cx; struct ccase *prev_cx = NULL; @@ -125,13 +116,11 @@ order_stats_accumulate_idx (struct order_stats **os, size_t nos, for (; (cx = casereader_read (reader)) != NULL; case_unref (cx)) { - const double weight = wt_idx == -1 ? 1.0 : case_num_idx (cx, wt_idx); - const double this_value = case_num_idx (cx, val_idx); - + const double weight = weight_idx == -1 ? 1.0 : case_num_idx (cx, weight_idx); if (weight == SYSMIS) continue; - /* The casereader MUST be sorted */ + const double this_value = case_num_idx (cx, data_idx); assert (this_value >= prev_value); if (prev_value == -DBL_MAX || prev_value == this_value) @@ -139,7 +128,7 @@ order_stats_accumulate_idx (struct order_stats **os, size_t nos, if (prev_value > -DBL_MAX && this_value > prev_value) { - update_k_values (prev_cx, prev_value, c_i, cc_i, os, nos); + update_k_values (prev_cx, prev_value, c_i, cc_i, os, n_os); c_i = weight; } @@ -149,26 +138,33 @@ order_stats_accumulate_idx (struct order_stats **os, size_t nos, prev_cx = case_ref (cx); } - update_k_values (prev_cx, prev_value, c_i, cc_i, os, nos); + update_k_values (prev_cx, prev_value, c_i, cc_i, os, n_os); case_unref (prev_cx); casereader_destroy (reader); } +/* Reads all the cases from READER and accumulates their data into the N_OS + order statistics in OS, taking data from DATA_VAR and weights from + WEIGHT_VAR. Drops cases for which the value of DATA_VAR is missing + according to EXCLUDE. WEIGHT_VAR may be NULL to assume weight 1. + + Takes ownership of READER. + DATA_VAR must be numeric and sorted in ascending order. Use + sort_execute_1var() or related functions to sort unsorted data before + passing it to this function. */ void -order_stats_accumulate (struct order_stats **os, size_t nos, +order_stats_accumulate (struct order_stats **os, size_t n_os, struct casereader *reader, - const struct variable *wv, - const struct variable *var, + const struct variable *weight_var, + const struct variable *data_var, enum mv_class exclude) { - /* Filter out missing cases */ - reader = casereader_create_filter_missing (reader, &var, 1, + reader = casereader_create_filter_missing (reader, &data_var, 1, exclude, NULL, NULL); - order_stats_accumulate_idx (os, nos, - reader, - wv ? var_get_case_index (wv) : -1, - var_get_case_index (var)); + order_stats_accumulate_idx (os, n_os, reader, + weight_var ? var_get_case_index (weight_var) : -1, + var_get_case_index (data_var)); } diff --git a/src/math/order-stats.h b/src/math/order-stats.h index 03011ea32c..581bb11455 100644 --- a/src/math/order-stats.h +++ b/src/math/order-stats.h @@ -17,6 +17,25 @@ #ifndef __ORDER_STATS_H__ #define __ORDER_STATS_H__ +/* Support for order statistics. + + This data structure supplies infrastructure for higher-level statistics that + rely on order statistics. It is a kind of "abstract base class" that is not + useful on its own. The common pattern for using the statistics based on it + is this: + + - Create the higher-level statistic with, for example, percentile_create(). + + - Feed in all the data with order_stats_accumulate() or + order_stats_accumulate_idx(). The data must be in sorted order: if + necessary, use one of the sorting functions from sort.h to sort them. + + - Obtain the desired results by examining the higher-level data structure or + by calling an appropriate function, e.g. percentile_calculate(). + + - Destroy the data structure with statistic_destroy(). +*/ + #include #include "data/missing-values.h" #include "math/statistic.h" @@ -38,7 +57,8 @@ struct k double y_p1; }; - +/* Order statistics calculation data structure. See the comment at the top of + this file for usage details. */ struct order_stats { struct statistic parent; @@ -48,21 +68,17 @@ struct order_stats double cc; }; -enum mv_class; - -void order_stats_dump (const struct order_stats *os); - -void -order_stats_accumulate_idx (struct order_stats **os, size_t nos, - struct casereader *reader, - int wt_idx, - int val_idx); - - -void order_stats_accumulate (struct order_stats **ptl, size_t nos, - struct casereader *reader, - const struct variable *wv, - const struct variable *var, +void order_stats_accumulate_idx (struct order_stats **os, size_t n_os, + struct casereader *reader, + int weight_idx, + int data_idx); +void order_stats_accumulate (struct order_stats **os, size_t n_os, + struct casereader *, + const struct variable *weight_var, + const struct variable *data_var, enum mv_class exclude); +/* Debugging support. */ +void order_stats_dump (const struct order_stats *); + #endif diff --git a/src/math/percentiles.c b/src/math/percentiles.c index 1b4ff13632..79514eb250 100644 --- a/src/math/percentiles.c +++ b/src/math/percentiles.c @@ -27,6 +27,7 @@ #include "gl/xalloc.h" +/* Return the value of the percentile. */ double percentile_calculate (const struct percentile *ptl, enum pc_alg alg) { @@ -143,6 +144,9 @@ destroy (struct statistic *stat) } +/* Create the Pth percentile. + W is the total sum of weights in the data set. +*/ struct percentile * percentile_create (double p, double W) { diff --git a/src/math/percentiles.h b/src/math/percentiles.h index 86fd641add..c79a629fe3 100644 --- a/src/math/percentiles.h +++ b/src/math/percentiles.h @@ -21,6 +21,16 @@ #include "order-stats.h" +/* To calculate a percentile: + + - Create a "struct percentile" with percentile_create(). + - Feed in the data with order_stats_accumulate() or + order_stats_accumulate_idx(). The data must be in sorted order: if + necessary, use one of the sorting functions from sort.h to sort them. + - Obtain the percentile with percentile_calculate(). + - Destroy the data structure with statistic_destroy(). +*/ + /* The algorithm used to calculate percentiles */ enum pc_alg { PC_NONE=0, @@ -48,15 +58,7 @@ struct percentile struct k k[2]; }; -/* Create the Pth percentile. - W is the total sum of weights in the data set -*/ struct percentile *percentile_create (double p, double W); +double percentile_calculate (const struct percentile *, enum pc_alg); -/* Return the value of the percentile */ -double percentile_calculate (const struct percentile *ptl, enum pc_alg alg); - -void percentile_dump (const struct percentile *ptl); - - -#endif +#endif /* percentiles.h */ diff --git a/src/math/sort.c b/src/math/sort.c index a784960420..a93466f27c 100644 --- a/src/math/sort.c +++ b/src/math/sort.c @@ -60,6 +60,9 @@ static struct ccase *pqueue_pop (struct pqueue *, casenumber *); static void output_record (struct sort_writer *); +/* Creates a casewriter that sorts the cases written to it. Once all the cases + have been written, use casewriter_make_reader() to obtain the sorted + results. */ struct casewriter * sort_create_writer (const struct subcase *ordering, const struct caseproto *proto) diff --git a/src/math/sort.h b/src/math/sort.h index 96ac32cc0b..2f1d3899a8 100644 --- a/src/math/sort.h +++ b/src/math/sort.h @@ -17,6 +17,21 @@ #ifndef MATH_SORT_H #define MATH_SORT_H 1 +/* Support for sorting cases. + + Use sort_create_writer() to sort cases in the most general way: + + - Create a casewriter with sort_create_writer(), specifying the sort + criteria. + - Write all of the cases to be sorted to the casewriter, e.g. with + casewriter_write(). + - Obtain the sorted results with casewriter_make_reader(). + + sort_execute() and sort_execute_1var() are shortcuts for situations where the + cases are already available from a casereader. + + All of these functions can efficiently sort data bigger than memory. */ + struct subcase; struct caseproto; struct variable; -- 2.30.2