math: Improve comments.

author Ben Pfaff <blp@cs.stanford.edu>

Sat, 29 Jan 2022 20:00:16 +0000 (12:00 -0800)

committer Ben Pfaff <blp@cs.stanford.edu>

Sat, 2 Apr 2022 01:48:55 +0000 (18:48 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Sat, 29 Jan 2022 20:00:16 +0000 (12:00 -0800)
committer Ben Pfaff <blp@cs.stanford.edu>
Sat, 2 Apr 2022 01:48:55 +0000 (18:48 -0700)
diff --git a/src/math/np.c b/src/math/np.c

index 0d691c2aa7b6699a82cd174bedb77984e43a2ae0..220865ba3cbec59602ec9afccce2a6c193de813b 100644 (file)
--- a/src/math/np.c
+++ b/src/math/np.c
@@ -71,6 +71,17 @@ acc (struct statistic *s, const struct ccase *cx UNUSED,
    np->prev_cc = cc;
  }
  
+/* Creates and returns a data structure whose accumulated results can be used
+   to produce a normal probability plot.  The caller must supply the weighted
+   sample size N and the mean MEAN and variance VAR of the distribution, then
+   feed in the data with order_stats_accumulate() or
+   order_stats_accumulate_idx().
+
+   There is no function to produce the results, which appear in "struct np" for
+   passing directly to np_plot_create() or dnp_plot_create().
+
+   The caller must eventually destroy the returned structure, with
+   statistic_destroy(). */
  struct np *
  np_create (double n, double mean, double var)
  {
diff --git a/src/math/np.h b/src/math/np.h

index 2fa009f90aa8b2a6465a0435adc356ac2e6d018b..998bee089e20fd66f83288fe4d42fb4de1ea8b9c 100644 (file)
--- a/src/math/np.h
+++ b/src/math/np.h
@@ -29,6 +29,8 @@ enum
      n_NP_IDX
    };
  
+/* Statistics needed to produce a normal probability plot with
+   np_plot_create(). */
  struct np
  {
    struct order_stats parent;
diff --git a/src/math/order-stats.c b/src/math/order-stats.c

index e8e078f4b6f066913d1081aadab2c7692886d19a..2ec7c1a0d476f43bf8d4889ab1b3986f515af934 100644 (file)
--- a/src/math/order-stats.c
+++ b/src/math/order-stats.c
@@ -58,48 +58,32 @@ order_stats_dump (const struct order_stats *os)
  
  #endif
  
-static void
-update_k_lower (struct k *kk,
-               double y_i, double c_i, double cc_i)
-{
-  if (cc_i <= kk->tc)
-    {
-      kk->cc = cc_i;
-      kk->c = c_i;
-      kk->y = y_i;
-    }
-}
-
-
-static void
-update_k_upper (struct k *kk,
-               double y_i, double c_i, double cc_i)
-{
-  if (cc_i > kk->tc && kk->c_p1 == 0)
-    {
-      kk->cc_p1 = cc_i;
-      kk->c_p1 = c_i;
-      kk->y_p1 = y_i;
-    }
-}
-
-
  static void
  update_k_values (const struct ccase *cx, double y_i, double c_i, double cc_i,
                  struct order_stats **os, size_t n_os)
  {
-  int j;
-
-  for (j = 0 ; j < n_os ; ++j)
+  for (size_t j = 0; j < n_os; ++j)
      {
-      int k;
        struct order_stats *tos = os[j];
        struct statistic  *stat = &tos->parent;
-      for (k = 0 ; k < tos->n_k; ++k)
+
+      for (struct k *k = tos->k; k < &tos->k[tos->n_k]; ++k)
         {
-         struct k *myk = &tos->k[k];
-         update_k_lower (myk, y_i, c_i, cc_i);
-         update_k_upper (myk, y_i, c_i, cc_i);
+          /* Update 'k' lower values. */
+          if (cc_i <= k->tc)
+            {
+              k->cc = cc_i;
+              k->c = c_i;
+              k->y = y_i;
+            }
+
+          /* Update 'k' upper values. */
+          if (cc_i > k->tc && k->c_p1 == 0)
+            {
+              k->cc_p1 = cc_i;
+              k->c_p1 = c_i;
+              k->y_p1 = y_i;
+            }
         }
  
        if (stat->accumulate)
@@ -109,12 +93,19 @@ update_k_values (const struct ccase *cx, double y_i, double c_i, double cc_i,
      }
  }
  
+/* Reads all the cases from READER and accumulates their data into the N_OS
+   order statistics in OS, taking data from case index DATA_IDX and weights
+   from case index WEIGHT_IDX.  WEIGHT_IDX may be -1 to assume weight 1.
+
+   Takes ownership of READER.
  
+   Data values must be numeric and sorted in ascending order.  Use
+   sort_execute_1var() or related functions to sort unsorted data before
+   passing it to this function. */
  void
-order_stats_accumulate_idx (struct order_stats **os, size_t nos,
+order_stats_accumulate_idx (struct order_stats **os, size_t n_os,
                              struct casereader *reader,
-                            int wt_idx,
-                            int val_idx)
+                            int weight_idx, int data_idx)
  {
    struct ccase *cx;
    struct ccase *prev_cx = NULL;
@@ -125,13 +116,11 @@ order_stats_accumulate_idx (struct order_stats **os, size_t nos,
  
    for (; (cx = casereader_read (reader)) != NULL; case_unref (cx))
      {
-      const double weight = wt_idx == -1 ? 1.0 : case_num_idx (cx, wt_idx);
-      const double this_value = case_num_idx (cx, val_idx);
-
+      const double weight = weight_idx == -1 ? 1.0 : case_num_idx (cx, weight_idx);
        if (weight == SYSMIS)
          continue;
  
-      /* The casereader MUST be sorted */
+      const double this_value = case_num_idx (cx, data_idx);
        assert (this_value >= prev_value);
  
        if (prev_value == -DBL_MAX || prev_value == this_value)
@@ -139,7 +128,7 @@ order_stats_accumulate_idx (struct order_stats **os, size_t nos,
  
        if (prev_value > -DBL_MAX && this_value > prev_value)
         {
-         update_k_values (prev_cx, prev_value, c_i, cc_i, os, nos);
+         update_k_values (prev_cx, prev_value, c_i, cc_i, os, n_os);
           c_i = weight;
         }
  
@@ -149,26 +138,33 @@ order_stats_accumulate_idx (struct order_stats **os, size_t nos,
        prev_cx = case_ref (cx);
      }
  
-  update_k_values (prev_cx, prev_value, c_i, cc_i, os, nos);
+  update_k_values (prev_cx, prev_value, c_i, cc_i, os, n_os);
    case_unref (prev_cx);
  
    casereader_destroy (reader);
  }
  
+/* Reads all the cases from READER and accumulates their data into the N_OS
+   order statistics in OS, taking data from DATA_VAR and weights from
+   WEIGHT_VAR.  Drops cases for which the value of DATA_VAR is missing
+   according to EXCLUDE.  WEIGHT_VAR may be NULL to assume weight 1.
+
+   Takes ownership of READER.
  
+   DATA_VAR must be numeric and sorted in ascending order.  Use
+   sort_execute_1var() or related functions to sort unsorted data before
+   passing it to this function. */
  void
-order_stats_accumulate (struct order_stats **os, size_t nos,
+order_stats_accumulate (struct order_stats **os, size_t n_os,
                         struct casereader *reader,
-                       const struct variable *wv,
-                       const struct variable *var,
+                       const struct variable *weight_var,
+                       const struct variable *data_var,
                         enum mv_class exclude)
  {
-  /* Filter out missing cases */
-  reader = casereader_create_filter_missing (reader, &var, 1,
+  reader = casereader_create_filter_missing (reader, &data_var, 1,
                                               exclude, NULL, NULL);
  
-  order_stats_accumulate_idx (os, nos,
-                              reader,
-                              wv ? var_get_case_index (wv) : -1,
-                              var_get_case_index (var));
+  order_stats_accumulate_idx (os, n_os, reader,
+                              weight_var ? var_get_case_index (weight_var) : -1,
+                              var_get_case_index (data_var));
  }
diff --git a/src/math/order-stats.h b/src/math/order-stats.h

index 03011ea32c4116a3045e45a61e532c7dfa57bd87..581bb11455eb9ba5291f87c549e6057ef96ae3db 100644 (file)
--- a/src/math/order-stats.h
+++ b/src/math/order-stats.h
@@ -17,6 +17,25 @@
  #ifndef __ORDER_STATS_H__
  #define __ORDER_STATS_H__
  
+/* Support for order statistics.
+
+   This data structure supplies infrastructure for higher-level statistics that
+   rely on order statistics.  It is a kind of "abstract base class" that is not
+   useful on its own.  The common pattern for using the statistics based on it
+   is this:
+
+   - Create the higher-level statistic with, for example, percentile_create().
+
+   - Feed in all the data with order_stats_accumulate() or
+     order_stats_accumulate_idx(). The data must be in sorted order: if
+     necessary, use one of the sorting functions from sort.h to sort them.
+
+   - Obtain the desired results by examining the higher-level data structure or
+     by calling an appropriate function, e.g. percentile_calculate().
+
+   - Destroy the data structure with statistic_destroy().
+*/
+
  #include <stddef.h>
  #include "data/missing-values.h"
  #include "math/statistic.h"
@@ -38,7 +57,8 @@ struct k
    double y_p1;
  };
  
-
+/* Order statistics calculation data structure.  See the comment at the top of
+   this file for usage details. */
  struct order_stats
  {
    struct statistic parent;
@@ -48,21 +68,17 @@ struct order_stats
    double cc;
  };
  
-enum mv_class;
-
-void order_stats_dump (const struct order_stats *os);
-
-void
-order_stats_accumulate_idx (struct order_stats **os, size_t nos,
-                            struct casereader *reader,
-                            int wt_idx,
-                            int val_idx);
-
-
-void order_stats_accumulate (struct order_stats **ptl, size_t nos,
-                            struct casereader *reader,
-                            const struct variable *wv,
-                            const struct variable *var,
+void order_stats_accumulate_idx (struct order_stats **os, size_t n_os,
+                                 struct casereader *reader,
+                                 int weight_idx,
+                                 int data_idx);
+void order_stats_accumulate (struct order_stats **os, size_t n_os,
+                            struct casereader *,
+                            const struct variable *weight_var,
+                            const struct variable *data_var,
                              enum mv_class exclude);
  
+/* Debugging support. */
+void order_stats_dump (const struct order_stats *);
+
  #endif
diff --git a/src/math/percentiles.c b/src/math/percentiles.c

index 1b4ff136328ecdbe0ba9fb2cd1bff521519d5803..79514eb2504f95d9fd3bfad37c3854aa8fe8c510 100644 (file)
--- a/src/math/percentiles.c
+++ b/src/math/percentiles.c
@@ -27,6 +27,7 @@
  
  #include "gl/xalloc.h"
  
+/* Return the value of the percentile. */
  double
  percentile_calculate (const struct percentile *ptl, enum pc_alg alg)
  {
@@ -143,6 +144,9 @@ destroy (struct statistic *stat)
  }
  
  
+/* Create the Pth percentile.
+   W is the total sum of weights in the data set.
+*/
  struct percentile *
  percentile_create (double p, double W)
  {
diff --git a/src/math/percentiles.h b/src/math/percentiles.h

index 86fd641addd46d5e6c3c955e547421a04649cf65..c79a629fe3623de7b531186ec6ef09dec6e9d329 100644 (file)
--- a/src/math/percentiles.h
+++ b/src/math/percentiles.h
@@ -21,6 +21,16 @@
  
  #include "order-stats.h"
  
+/* To calculate a percentile:
+
+   - Create a "struct percentile" with percentile_create().
+   - Feed in the data with order_stats_accumulate() or
+     order_stats_accumulate_idx().  The data must be in sorted order: if
+     necessary, use one of the sorting functions from sort.h to sort them.
+   - Obtain the percentile with percentile_calculate().
+   - Destroy the data structure with statistic_destroy().
+*/
+
  /* The algorithm used to calculate percentiles */
  enum pc_alg {
    PC_NONE=0,
@@ -48,15 +58,7 @@ struct percentile
    struct k k[2];
  };
  
-/* Create the Pth percentile.
-   W is the total sum of weights in the data set
-*/
  struct percentile *percentile_create (double p, double W);
+double percentile_calculate (const struct percentile *, enum pc_alg);
  
-/* Return the value of the percentile */
-double percentile_calculate (const struct percentile *ptl, enum pc_alg alg);
-
-void percentile_dump (const struct percentile *ptl);
-
-
-#endif
+#endif  /* percentiles.h */
diff --git a/src/math/sort.c b/src/math/sort.c

index a784960420874dd17d0bff8ba8715d824506614b..a93466f27c8386cfa1692d2ff349d316197e3c65 100644 (file)
--- a/src/math/sort.c
+++ b/src/math/sort.c
@@ -60,6 +60,9 @@ static struct ccase *pqueue_pop (struct pqueue *, casenumber *);
  
  static void output_record (struct sort_writer *);
  
+/* Creates a casewriter that sorts the cases written to it.  Once all the cases
+   have been written, use casewriter_make_reader() to obtain the sorted
+   results. */
  struct casewriter *
  sort_create_writer (const struct subcase *ordering,
                      const struct caseproto *proto)
diff --git a/src/math/sort.h b/src/math/sort.h

index 96ac32cc0b7a54a85e8297822b80ccfddbd40451..2f1d3899a864aef42a9273cf3588c984df38c558 100644 (file)
--- a/src/math/sort.h
+++ b/src/math/sort.h
@@ -17,6 +17,21 @@
  #ifndef MATH_SORT_H
  #define MATH_SORT_H 1
  
+/* Support for sorting cases.
+
+   Use sort_create_writer() to sort cases in the most general way:
+
+   - Create a casewriter with sort_create_writer(), specifying the sort
+     criteria.
+   - Write all of the cases to be sorted to the casewriter, e.g. with
+     casewriter_write().
+   - Obtain the sorted results with casewriter_make_reader().
+
+  sort_execute() and sort_execute_1var() are shortcuts for situations where the
+  cases are already available from a casereader.
+
+  All of these functions can efficiently sort data bigger than memory. */
+
  struct subcase;
  struct caseproto;
  struct variable;
author	Ben Pfaff <blp@cs.stanford.edu>
	Sat, 29 Jan 2022 20:00:16 +0000 (12:00 -0800)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Sat, 2 Apr 2022 01:48:55 +0000 (18:48 -0700)
src/math/np.c		patch \| blob \| history
src/math/np.h		patch \| blob \| history
src/math/order-stats.c		patch \| blob \| history
src/math/order-stats.h		patch \| blob \| history
src/math/percentiles.c		patch \| blob \| history
src/math/percentiles.h		patch \| blob \| history
src/math/sort.c		patch \| blob \| history
src/math/sort.h		patch \| blob \| history