implement MEDIAN (untested)
[pspp] / src / language / stats / ctables.c
index af53a6b6c53643a9d577a0b06a01d43e5db06880..5f9d2584c809649d527cf0bd59a0f879cd38eefb 100644 (file)
 #include <math.h>
 
 #include "data/casereader.h"
+#include "data/casewriter.h"
 #include "data/dataset.h"
 #include "data/dictionary.h"
 #include "data/mrset.h"
+#include "data/subcase.h"
 #include "data/value-labels.h"
 #include "language/command.h"
 #include "language/lexer/format-parser.h"
@@ -34,6 +36,8 @@
 #include "libpspp/message.h"
 #include "libpspp/string-array.h"
 #include "math/moments.h"
+#include "math/percentiles.h"
+#include "math/sort.h"
 #include "output/pivot-table.h"
 
 #include "gl/minmax.h"
@@ -186,6 +190,7 @@ struct ctables_cell
     struct hmap_node node;
 
     /* The domains that contain this cell. */
+    bool contributes_to_domains;
     struct ctables_domain *domains[N_CTDTS];
 
     bool hide;
@@ -1680,7 +1685,14 @@ union ctables_summary
     /* MEAN, SEMEAN, STDDEV, SUM, VARIANCE, *.SUM. */
     struct moments1 *moments;
 
-    /* XXX percentiles, median, mode, multiple response */
+    struct
+      {
+        struct casewriter *writer;
+        double mvalid;
+        double median;
+      };
+
+    /* XXX percentiles, mode, multiple response */
   };
 
 static void
@@ -1712,6 +1724,7 @@ ctables_summary_init (union ctables_summary *s,
     case CTSF_LAYERPCT_TOTALN:
     case CTSF_LAYERROWPCT_TOTALN:
     case CTSF_LAYERCOLPCT_TOTALN:
+    case CTSF_MISSING:
     case CSTF_TOTALN:
     case CTSF_ETOTALN:
     case CTSF_VALIDN:
@@ -1741,7 +1754,22 @@ ctables_summary_init (union ctables_summary *s,
       break;
 
     case CTSF_MEDIAN:
-    case CTSF_MISSING:
+      {
+        struct caseproto *proto = caseproto_create ();
+        proto = caseproto_add_width (proto, 0);
+        proto = caseproto_add_width (proto, 0);
+
+        struct subcase ordering;
+        subcase_init (&ordering, 0, 0, SC_ASCEND);
+        s->writer = sort_create_writer (&ordering, proto);
+        subcase_uninit (&ordering);
+        caseproto_unref (proto);
+
+        s->mvalid = 0;
+        s->median = SYSMIS;
+      }
+      break;
+
     case CTSF_MODE:
     case CTSF_PTILE:
       NOT_REACHED ();
@@ -1801,6 +1829,7 @@ ctables_summary_uninit (union ctables_summary *s,
     case CTSF_LAYERPCT_TOTALN:
     case CTSF_LAYERROWPCT_TOTALN:
     case CTSF_LAYERCOLPCT_TOTALN:
+    case CTSF_MISSING:
     case CSTF_TOTALN:
     case CTSF_ETOTALN:
     case CTSF_VALIDN:
@@ -1828,7 +1857,9 @@ ctables_summary_uninit (union ctables_summary *s,
       break;
 
     case CTSF_MEDIAN:
-    case CTSF_MISSING:
+      casewriter_destroy (s->writer);
+      break;
+
     case CTSF_MODE:
     case CTSF_PTILE:
       NOT_REACHED ();
@@ -1890,6 +1921,7 @@ ctables_summary_add (union ctables_summary *s,
     case CTSF_LAYERPCT_TOTALN:
     case CTSF_LAYERROWPCT_TOTALN:
     case CTSF_LAYERCOLPCT_TOTALN:
+    case CTSF_MISSING:
     case CSTF_TOTALN:
     case CTSF_ETOTALN:
     case CTSF_VALIDN:
@@ -1925,11 +1957,22 @@ ctables_summary_add (union ctables_summary *s,
     case CTSF_LAYERPCT_SUM:
     case CTSF_LAYERROWPCT_SUM:
     case CTSF_LAYERCOLPCT_SUM:
-      moments1_add (s->moments, value->f, weight);
+      if (!var_is_value_missing (var, value))
+        moments1_add (s->moments, value->f, weight);
       break;
 
     case CTSF_MEDIAN:
-    case CTSF_MISSING:
+      /* Only valid (non-missing) values may contribute to the median, the
+         same rule the moments-based summaries in this switch apply.  Each
+         contributing observation is written as a two-column case: column 0
+         holds the value and column 1 holds its weight.  The weights are
+         also totaled in mvalid. */
+      if (!var_is_value_missing (var, value))
+        {
+          s->mvalid += weight;
+
+          struct ccase *c = case_create (casewriter_get_proto (s->writer));
+          *case_num_rw_idx (c, 0) = value->f;
+          *case_num_rw_idx (c, 1) = weight;
+          casewriter_write (s->writer, c);
+        }
+      break;
+
     case CTSF_MODE:
     case CTSF_PTILE:
       NOT_REACHED ();
@@ -2008,6 +2051,9 @@ ctables_summary_value (const struct ctables_cell *cell,
     case CTSF_LAYERCOLPCT_TOTALN:
       NOT_REACHED ();
 
+    case CTSF_MISSING:
+      return s->missing;
+
     case CSTF_TOTALN:
     case CTSF_ETOTALN:
       return s->valid + s->missing;
@@ -2070,7 +2116,19 @@ ctables_summary_value (const struct ctables_cell *cell,
       NOT_REACHED ();
 
     case CTSF_MEDIAN:
-    case CTSF_MISSING:
+      if (s->writer)
+        {
+          struct casereader *reader = casewriter_make_reader (s->writer);
+          s->writer = NULL;
+
+          struct percentile *median = percentile_create (0.5, s->mvalid);
+          struct order_stats *os = &median->parent;
+          order_stats_accumulate_idx (&os, 1, reader, 1, 0);
+          s->median = percentile_calculate (median, PC_HAVERAGE);
+          statistic_destroy (&median->parent.parent);
+        }
+      return s->median;
+
     case CTSF_MODE:
     case CTSF_PTILE:
       NOT_REACHED ();
@@ -2365,6 +2423,7 @@ ctables_cell_insert__ (struct ctables_table *t, const struct ccase *c,
   cell = xmalloc (sizeof *cell);
   cell->hide = false;
   cell->sv = sv;
+  cell->contributes_to_domains = true;
   for (enum pivot_axis_type a = 0; a < PIVOT_N_AXES; a++)
     {
       const struct ctables_nest *nest = &t->stacks[a].nests[ix[a]];
@@ -2374,14 +2433,19 @@ ctables_cell_insert__ (struct ctables_table *t, const struct ccase *c,
                         : NULL);
       for (size_t i = 0; i < nest->n; i++)
         {
+          const struct ctables_category *cat = cats[a][i];
+
           if (i != nest->scale_idx)
             {
-              const struct ctables_category *subtotal = cats[a][i]->subtotal;
+              const struct ctables_category *subtotal = cat->subtotal;
               if (subtotal && subtotal->type == CCT_HSUBTOTAL)
                 cell->hide = true;
+
+              if (cat->type == CCT_TOTAL || cat->type == CCT_SUBTOTAL || cat->type == CCT_HSUBTOTAL)
+                cell->contributes_to_domains = false;
             }
 
-          cell->axes[a].cvs[i].category = cats[a][i];
+          cell->axes[a].cvs[i].category = cat;
           value_clone (&cell->axes[a].cvs[i].value, case_data (c, nest->vars[i]),
                        var_get_width (nest->vars[i]));
         }
@@ -2410,8 +2474,9 @@ ctables_cell_add__ (struct ctables_table *t, const struct ccase *c,
   for (size_t i = 0; i < specs->n; i++)
     ctables_summary_add (&cell->summaries[i], &specs->specs[i], specs->var,
                          case_data (c, specs->var), weight);
-  for (enum ctables_domain_type dt = 0; dt < N_CTDTS; dt++)
-    cell->domains[dt]->valid += weight;
+  if (cell->contributes_to_domains)
+    for (enum ctables_domain_type dt = 0; dt < N_CTDTS; dt++)
+      cell->domains[dt]->valid += weight;
 }
 
 static void
@@ -2446,6 +2511,34 @@ recurse_totals (struct ctables_table *t, const struct ccase *c,
     }
 }
 
+/* Adds case C, with the given WEIGHT, to the subtotal cells of table T that
+   it belongs to.
+
+   Walks the nest positions selected by IX, beginning at position START_NEST
+   of axis START_AXIS and continuing from position 0 of each later axis.  The
+   scale variable's position is skipped.  For every other position whose
+   current category in CATS has a subtotal, the subtotal category is
+   substituted, the case is added to the resulting cell, and the function
+   recurses over the positions that follow so that combinations of subtotals
+   are covered too.  The original category is put back before moving on, so
+   CATS is unchanged when the function returns. */
+static void
+recurse_subtotals (struct ctables_table *t, const struct ccase *c,
+                   size_t ix[PIVOT_N_AXES],
+                   const struct ctables_category *cats[PIVOT_N_AXES][10],
+                   double weight,
+                   enum pivot_axis_type start_axis, size_t start_nest)
+{
+  /* Only the first axis visited starts partway through its nest. */
+  size_t first = start_nest;
+
+  for (enum pivot_axis_type axis = start_axis; axis < PIVOT_N_AXES;
+       axis++, first = 0)
+    {
+      const struct ctables_nest *nest = &t->stacks[axis].nests[ix[axis]];
+
+      for (size_t pos = first; pos < nest->n; pos++)
+        {
+          if (pos == nest->scale_idx)
+            continue;
+
+          const struct ctables_category *original = cats[axis][pos];
+          if (!original->subtotal)
+            continue;
+
+          /* Temporarily view this position as its subtotal category. */
+          cats[axis][pos] = original->subtotal;
+          ctables_cell_add__ (t, c, ix, cats, weight);
+          recurse_subtotals (t, c, ix, cats, weight, axis, pos + 1);
+          cats[axis][pos] = original;
+        }
+    }
+}
+
 static void
 ctables_cell_insert (struct ctables_table *t,
                      const struct ccase *c,
@@ -2483,24 +2576,7 @@ ctables_cell_insert (struct ctables_table *t,
   ctables_cell_add__ (t, c, ix, cats, weight);
 
   recurse_totals (t, c, ix, cats, weight, 0, 0);
-
-  for (enum pivot_axis_type a = 0; a < PIVOT_N_AXES; a++)
-    {
-      const struct ctables_nest *nest = &t->stacks[a].nests[ix[a]];
-      for (size_t i = 0; i < nest->n; i++)
-        {
-          if (i == nest->scale_idx)
-            continue;
-
-          const struct ctables_category *save = cats[a][i];
-          if (save->subtotal)
-            {
-              cats[a][i] = save->subtotal;
-              ctables_cell_add__ (t, c, ix, cats, weight);
-              cats[a][i] = save;
-            }
-        }
-    }
+  recurse_subtotals (t, c, ix, cats, weight, 0, 0);
 }
 
 struct merge_item
@@ -2566,12 +2642,16 @@ ctables_table_output (struct ctables *ct, struct ctables_table *t)
     pivot_table_set_caption (
       pt, pivot_value_new_user_text (t->corner, SIZE_MAX));
 
-  bool summary_dimension = t->summary_axis != t->slabels_axis;
+  bool summary_dimension = (t->summary_axis != t->slabels_axis
+                            || (!t->slabels_visible
+                                && t->summary_specs.n > 1));
   if (summary_dimension)
     {
       struct pivot_dimension *d = pivot_dimension_create (
-        pt, t->slabels_axis, N_("Summaries"));
+        pt, t->slabels_axis, N_("Statistics"));
       const struct ctables_summary_spec_set *specs = &t->summary_specs;
+      if (!t->slabels_visible)
+        d->hide_all_labels = true;
       for (size_t i = 0; i < specs->n; i++)
         pivot_category_create_leaf (
           d->root, pivot_value_new_text (specs->specs[i].label));
@@ -2678,13 +2758,11 @@ ctables_table_output (struct ctables *ct, struct ctables_table *t)
           if (new_subtable)
             {
               n_levels = 0;
-              printf ("%s levels:", pivot_axis_type_to_string (a));
               for (size_t k = 0; k < nest->n; k++)
                 {
                   enum ctables_vlabel vlabel = ct->vlabels[var_get_dict_index (nest->vars[k])];
                   if (vlabel != CTVL_NONE)
                     {
-                      printf (" var(%s)", var_get_name (nest->vars[k]));
                       levels[n_levels++] = (struct ctables_level) {
                         .type = CTL_VAR,
                         .var_idx = k,
@@ -2694,7 +2772,6 @@ ctables_table_output (struct ctables *ct, struct ctables_table *t)
                   if (nest->scale_idx != k
                       && (k != nest->n - 1 || t->label_axis[a] == a))
                     {
-                      printf (" category(%s)", var_get_name (nest->vars[k]));
                       levels[n_levels++] = (struct ctables_level) {
                         .type = CTL_CATEGORY,
                         .var_idx = k,
@@ -2702,15 +2779,13 @@ ctables_table_output (struct ctables *ct, struct ctables_table *t)
                     }
                 }
 
-              if (a == t->slabels_axis && a == t->summary_axis)
+              if (!summary_dimension && a == t->slabels_axis)
                 {
-                  printf (" summary");
                   levels[n_levels++] = (struct ctables_level) {
                     .type = CTL_SUMMARY,
                     .var_idx = SIZE_MAX,
                   };
                 }
-              printf ("\n");
             }
 
           size_t n_common = 0;
@@ -2722,17 +2797,16 @@ ctables_table_output (struct ctables *ct, struct ctables_table *t)
                   if (level->type == CTL_CATEGORY)
                     {
                       size_t var_idx = level->var_idx;
-                      if (prev->axes[a].cvs[var_idx].category
-                          != cell->axes[a].cvs[var_idx].category)
-                        {
-                          break;
-                        }
-                      else if (!value_equal (&prev->axes[a].cvs[var_idx].value,
-                                           &cell->axes[a].cvs[var_idx].value,
-                                             var_get_type (nest->vars[var_idx])))
-                        {
-                          break;
-                        }
+                      const struct ctables_category *c = cell->axes[a].cvs[var_idx].category;
+                      if (prev->axes[a].cvs[var_idx].category != c)
+                        break;
+                      else if (c->type != CCT_SUBTOTAL
+                               && c->type != CCT_HSUBTOTAL
+                               && c->type != CCT_TOTAL
+                               && !value_equal (&prev->axes[a].cvs[var_idx].value,
+                                                &cell->axes[a].cvs[var_idx].value,
+                                                var_get_type (nest->vars[var_idx])))
+                        break;
                     }
                 }
             }