Work on weighting.
author Ben Pfaff <blp@cs.stanford.edu>
Sun, 30 Jan 2022 20:39:30 +0000 (12:39 -0800)
committer Ben Pfaff <blp@cs.stanford.edu>
Sat, 2 Apr 2022 01:48:55 +0000 (18:48 -0700)
src/language/stats/ctables.c
tests/language/stats/ctables.at

index 2edd40d5d39d78cfc46a86e1dd6bdfad9df0b0ff..4a0787dd21e9f30d48f89dc12d1acc5af41face5 100644 (file)
@@ -173,8 +173,10 @@ struct ctables_domain
 
     const struct ctables_cell *example;
 
-    double valid;
-    double missing;
+    double d_valid;             /* Dictionary weight. */
+    double d_missing;
+    double e_valid;             /* Effective weight */
+    double e_missing;
   };
 
 enum ctables_summary_variant
@@ -230,7 +232,7 @@ struct ctables
 
     bool mrsets_count_duplicates; /* MRSETS. */
     bool smissing_listwise;       /* SMISSING. */
-    struct variable *base_weight; /* WEIGHT. */
+    struct variable *e_weight;    /* WEIGHT. */
     int hide_threshold;           /* HIDESMALLCOUNTS. */
 
     struct ctables_table **tables;
@@ -1892,11 +1894,19 @@ static void
 ctables_summary_add (union ctables_summary *s,
                      const struct ctables_summary_spec *ss,
                      const struct variable *var, const union value *value,
-                     double weight)
+                     double d_weight, double e_weight)
 {
   switch (ss->function)
     {
     case CTSF_COUNT:
+    case CSTF_TOTALN:
+    case CTSF_VALIDN:
+      if (var_is_value_missing (var, value))
+        s->missing += d_weight;
+      else
+        s->valid += d_weight;
+      break;
+
     case CTSF_ECOUNT:
     case CTSF_ROWPCT_COUNT:
     case CTSF_COLPCT_COUNT:
@@ -1920,14 +1930,12 @@ ctables_summary_add (union ctables_summary *s,
     case CTSF_LAYERROWPCT_TOTALN:
     case CTSF_LAYERCOLPCT_TOTALN:
     case CTSF_MISSING:
-    case CSTF_TOTALN:
     case CTSF_ETOTALN:
-    case CTSF_VALIDN:
     case CTSF_EVALIDN:
       if (var_is_value_missing (var, value))
-        s->missing += weight;
+        s->missing += e_weight;
       else
-        s->valid += weight;
+        s->valid += e_weight;
       break;
 
     case CTSF_MAXIMUM:
@@ -1956,7 +1964,7 @@ ctables_summary_add (union ctables_summary *s,
     case CTSF_LAYERROWPCT_SUM:
     case CTSF_LAYERCOLPCT_SUM:
       if (!var_is_value_missing (var, value))
-        moments1_add (s->moments, value->f, weight);
+        moments1_add (s->moments, value->f, e_weight);
       break;
 
     case CTSF_MEDIAN:
@@ -1964,11 +1972,11 @@ ctables_summary_add (union ctables_summary *s,
     case CTSF_PTILE:
       if (var_is_value_missing (var, value))
         {
-          s->ovalid += weight;
+          s->ovalid += e_weight;
 
           struct ccase *c = case_create (casewriter_get_proto (s->writer));
           *case_num_rw_idx (c, 0) = value->f;
-          *case_num_rw_idx (c, 1) = weight;
+          *case_num_rw_idx (c, 1) = e_weight;
           casewriter_write (s->writer, c);
         }
       break;
@@ -1999,6 +2007,99 @@ ctables_summary_add (union ctables_summary *s,
     }
 }
 
+static enum ctables_domain_type /* Domain type for a percentage FUNCTION. */
+ctables_function_domain (enum ctables_summary_function function)
+{
+  switch (function)
+    {
+    case CTSF_COUNT:
+    case CTSF_ECOUNT:
+    case CTSF_MISSING:
+    case CSTF_TOTALN:
+    case CTSF_ETOTALN:
+    case CTSF_VALIDN:
+    case CTSF_EVALIDN:
+    case CTSF_MAXIMUM:
+    case CTSF_MINIMUM:
+    case CTSF_RANGE:
+    case CTSF_MEAN:
+    case CTSF_SEMEAN:
+    case CTSF_STDDEV:
+    case CTSF_SUM:
+    case CTSF_VARIANCE:
+    case CTSF_MEDIAN:
+    case CTSF_PTILE:
+    case CTSF_MODE:
+    case CTSF_RESPONSES:
+      NOT_REACHED ();           /* Non-percentage functions not expected. */
+
+    case CTSF_COLPCT_COUNT:
+    case CTSF_COLPCT_COUNT_RESPONSES:
+    case CTSF_COLPCT_RESPONSES:
+    case CTSF_COLPCT_RESPONSES_COUNT:
+    case CTSF_COLPCT_SUM:
+    case CTSF_COLPCT_TOTALN:
+    case CTSF_COLPCT_VALIDN:
+      return CTDT_COL;          /* COLPCT family. */
+
+    case CTSF_LAYERCOLPCT_COUNT:
+    case CTSF_LAYERCOLPCT_COUNT_RESPONSES:
+    case CTSF_LAYERCOLPCT_RESPONSES:
+    case CTSF_LAYERCOLPCT_RESPONSES_COUNT:
+    case CTSF_LAYERCOLPCT_SUM:
+    case CTSF_LAYERCOLPCT_TOTALN:
+    case CTSF_LAYERCOLPCT_VALIDN:
+      return CTDT_LAYERCOL;     /* LAYERCOLPCT family. */
+
+    case CTSF_LAYERPCT_COUNT:
+    case CTSF_LAYERPCT_COUNT_RESPONSES:
+    case CTSF_LAYERPCT_RESPONSES:
+    case CTSF_LAYERPCT_RESPONSES_COUNT:
+    case CTSF_LAYERPCT_SUM:
+    case CTSF_LAYERPCT_TOTALN:
+    case CTSF_LAYERPCT_VALIDN:
+      return CTDT_LAYER;        /* LAYERPCT family. */
+
+    case CTSF_LAYERROWPCT_COUNT:
+    case CTSF_LAYERROWPCT_COUNT_RESPONSES:
+    case CTSF_LAYERROWPCT_RESPONSES:
+    case CTSF_LAYERROWPCT_RESPONSES_COUNT:
+    case CTSF_LAYERROWPCT_SUM:
+    case CTSF_LAYERROWPCT_TOTALN:
+    case CTSF_LAYERROWPCT_VALIDN:
+      return CTDT_LAYERROW;     /* LAYERROWPCT family. */
+
+    case CTSF_ROWPCT_COUNT:
+    case CTSF_ROWPCT_COUNT_RESPONSES:
+    case CTSF_ROWPCT_RESPONSES:
+    case CTSF_ROWPCT_RESPONSES_COUNT:
+    case CTSF_ROWPCT_SUM:
+    case CTSF_ROWPCT_TOTALN:
+    case CTSF_ROWPCT_VALIDN:
+      return CTDT_ROW;          /* ROWPCT family. */
+
+    case CTSF_SUBTABLEPCT_COUNT:
+    case CTSF_SUBTABLEPCT_COUNT_RESPONSES:
+    case CTSF_SUBTABLEPCT_RESPONSES:
+    case CTSF_SUBTABLEPCT_RESPONSES_COUNT:
+    case CTSF_SUBTABLEPCT_SUM:
+    case CTSF_SUBTABLEPCT_TOTALN:
+    case CTSF_SUBTABLEPCT_VALIDN:
+      return CTDT_SUBTABLE;     /* SUBTABLEPCT family. */
+
+    case CTSF_TABLEPCT_COUNT:
+    case CTSF_TABLEPCT_COUNT_RESPONSES:
+    case CTSF_TABLEPCT_RESPONSES:
+    case CTSF_TABLEPCT_RESPONSES_COUNT:
+    case CTSF_TABLEPCT_SUM:
+    case CTSF_TABLEPCT_TOTALN:
+    case CTSF_TABLEPCT_VALIDN:
+      return CTDT_TABLE;        /* TABLEPCT family. */
+    }
+
+  NOT_REACHED ();               /* FUNCTION matched none of the cases above. */
+}
+
 static double
 ctables_summary_value (const struct ctables_cell *cell,
                        union ctables_summary *s,
@@ -2010,26 +2111,19 @@ ctables_summary_value (const struct ctables_cell *cell,
     case CTSF_ECOUNT:
       return s->valid;
 
-    case CTSF_SUBTABLEPCT_COUNT:
-      return cell->domains[CTDT_SUBTABLE]->valid ? s->valid / cell->domains[CTDT_SUBTABLE]->valid * 100 : SYSMIS;
-
     case CTSF_ROWPCT_COUNT:
-      return cell->domains[CTDT_ROW]->valid ? s->valid / cell->domains[CTDT_ROW]->valid * 100 : SYSMIS;
-
     case CTSF_COLPCT_COUNT:
-      return cell->domains[CTDT_COL]->valid ? s->valid / cell->domains[CTDT_COL]->valid * 100 : SYSMIS;
-
     case CTSF_TABLEPCT_COUNT:
-      return cell->domains[CTDT_TABLE]->valid ? s->valid / cell->domains[CTDT_TABLE]->valid * 100 : SYSMIS;
-
+    case CTSF_SUBTABLEPCT_COUNT:
     case CTSF_LAYERPCT_COUNT:
-      return cell->domains[CTDT_LAYER]->valid ? s->valid / cell->domains[CTDT_LAYER]->valid * 100 : SYSMIS;
-
     case CTSF_LAYERROWPCT_COUNT:
-      return cell->domains[CTDT_LAYERROW]->valid ? s->valid / cell->domains[CTDT_LAYERROW]->valid * 100 : SYSMIS;
-
     case CTSF_LAYERCOLPCT_COUNT:
-      return cell->domains[CTDT_LAYERCOL]->valid ? s->valid / cell->domains[CTDT_LAYERCOL]->valid * 100 : SYSMIS;
+      {
+        enum ctables_domain_type d = ctables_function_domain (ss->function);
+        return (cell->domains[d]->e_valid
+                ? s->valid / cell->domains[d]->e_valid * 100
+                : SYSMIS);
+      }
 
     case CTSF_ROWPCT_VALIDN:
     case CTSF_COLPCT_VALIDN:
@@ -2473,7 +2567,7 @@ static void
 ctables_cell_add__ (struct ctables_table *t, const struct ccase *c,
                     size_t ix[PIVOT_N_AXES],
                     const struct ctables_category *cats[PIVOT_N_AXES][10],
-                    double weight)
+                    double d_weight, double e_weight)
 {
   struct ctables_cell *cell = ctables_cell_insert__ (t, c, ix, cats);
   const struct ctables_nest *ss = &t->stacks[t->summary_axis].nests[ix[t->summary_axis]];
@@ -2481,17 +2575,22 @@ ctables_cell_add__ (struct ctables_table *t, const struct ccase *c,
   const struct ctables_summary_spec_set *specs = &ss->specs[cell->sv];
   for (size_t i = 0; i < specs->n; i++)
     ctables_summary_add (&cell->summaries[i], &specs->specs[i], specs->var,
-                         case_data (c, specs->var), weight);
+                         case_data (c, specs->var), d_weight, e_weight);
   if (cell->contributes_to_domains)
-    for (enum ctables_domain_type dt = 0; dt < N_CTDTS; dt++)
-      cell->domains[dt]->valid += weight;
+    {
+      for (enum ctables_domain_type dt = 0; dt < N_CTDTS; dt++)
+        {
+          cell->domains[dt]->d_valid += d_weight;
+          cell->domains[dt]->e_valid += e_weight;
+        }
+    }
 }
 
 static void
 recurse_totals (struct ctables_table *t, const struct ccase *c,
                 size_t ix[PIVOT_N_AXES],
                 const struct ctables_category *cats[PIVOT_N_AXES][10],
-                double weight,
+                double d_weight, double e_weight,
                 enum pivot_axis_type start_axis, size_t start_nest)
 {
   for (enum pivot_axis_type a = start_axis; a < PIVOT_N_AXES; a++)
@@ -2510,8 +2609,8 @@ recurse_totals (struct ctables_table *t, const struct ccase *c,
             {
               const struct ctables_category *save = cats[a][i];
               cats[a][i] = total;
-              ctables_cell_add__ (t, c, ix, cats, weight);
-              recurse_totals (t, c, ix, cats, weight, a, i + 1);
+              ctables_cell_add__ (t, c, ix, cats, d_weight, e_weight);
+              recurse_totals (t, c, ix, cats, d_weight, e_weight, a, i + 1);
               cats[a][i] = save;
             }
         }
@@ -2523,7 +2622,7 @@ static void
 recurse_subtotals (struct ctables_table *t, const struct ccase *c,
                    size_t ix[PIVOT_N_AXES],
                    const struct ctables_category *cats[PIVOT_N_AXES][10],
-                   double weight,
+                   double d_weight, double e_weight,
                    enum pivot_axis_type start_axis, size_t start_nest)
 {
   for (enum pivot_axis_type a = start_axis; a < PIVOT_N_AXES; a++)
@@ -2538,8 +2637,8 @@ recurse_subtotals (struct ctables_table *t, const struct ccase *c,
           if (save->subtotal)
             {
               cats[a][i] = save->subtotal;
-              ctables_cell_add__ (t, c, ix, cats, weight);
-              recurse_subtotals (t, c, ix, cats, weight, a, i + 1);
+              ctables_cell_add__ (t, c, ix, cats, d_weight, e_weight);
+              recurse_subtotals (t, c, ix, cats, d_weight, e_weight, a, i + 1);
               cats[a][i] = save;
             }
         }
@@ -2551,7 +2650,7 @@ static void
 ctables_cell_insert (struct ctables_table *t,
                      const struct ccase *c,
                      size_t ir, size_t ic, size_t il,
-                     double weight)
+                     double d_weight, double e_weight)
 {
   size_t ix[PIVOT_N_AXES] = {
     [PIVOT_AXIS_ROW] = ir,
@@ -2581,10 +2680,10 @@ ctables_cell_insert (struct ctables_table *t,
         }
     }
 
-  ctables_cell_add__ (t, c, ix, cats, weight);
+  ctables_cell_add__ (t, c, ix, cats, d_weight, e_weight);
 
-  recurse_totals (t, c, ix, cats, weight, 0, 0);
-  recurse_subtotals (t, c, ix, cats, weight, 0, 0);
+  recurse_totals (t, c, ix, cats, d_weight, e_weight, 0, 0);
+  recurse_subtotals (t, c, ix, cats, d_weight, e_weight, 0, 0);
 }
 
 struct merge_item
@@ -3220,17 +3319,18 @@ ctables_sort_clabels_values (struct ctables_table *t)
 static bool
 ctables_execute (struct dataset *ds, struct ctables *ct)
 {
-  struct casereader *input = casereader_create_filter_weight (proc_open (ds),
-                                                              dataset_dict (ds),
-                                                              NULL, NULL);
+  struct casereader *input = proc_open (ds);
   bool warn_on_invalid = true;
-  double total_weight = 0;
   for (struct ccase *c = casereader_read (input); c;
        case_unref (c), c = casereader_read (input))
     {
-      double weight = dict_get_case_weight (dataset_dict (ds), c,
-                                            &warn_on_invalid);
-      total_weight += weight;
+      double d_weight = dict_get_case_weight (dataset_dict (ds), c,
+                                              &warn_on_invalid);
+      double e_weight = (ct->e_weight
+                         ? var_force_valid_weight (ct->e_weight,
+                                                   case_num (c, ct->e_weight),
+                                                   &warn_on_invalid)
+                         : d_weight);
 
       for (size_t i = 0; i < ct->n_tables; i++)
         {
@@ -3239,7 +3339,7 @@ ctables_execute (struct dataset *ds, struct ctables *ct)
           for (size_t ir = 0; ir < t->stacks[PIVOT_AXIS_ROW].n; ir++)
             for (size_t ic = 0; ic < t->stacks[PIVOT_AXIS_COLUMN].n; ic++)
               for (size_t il = 0; il < t->stacks[PIVOT_AXIS_LAYER].n; il++)
-                ctables_cell_insert (t, c, ir, ic, il, weight);
+                ctables_cell_insert (t, c, ir, ic, il, d_weight, e_weight);
 
           for (enum pivot_axis_type a = 0; a < PIVOT_N_AXES; a++)
             if (t->label_axis[a] != a)
@@ -3439,8 +3539,8 @@ cmd_ctables (struct lexer *lexer, struct dataset *ds)
           if (!lex_force_match_id (lexer, "VARIABLE"))
             goto error;
           lex_match (lexer, T_EQUALS);
-          ct->base_weight = parse_variable (lexer, dataset_dict (ds));
-          if (!ct->base_weight)
+          ct->e_weight = parse_variable (lexer, dataset_dict (ds));
+          if (!ct->e_weight)
             goto error;
         }
       else if (lex_match_id (lexer, "HIDESMALLCOUNTS"))
index f39ebdc02e912d759c8c066871c9f6ce77499dc8..f71a3c572eb5dfb77cab2d24694878b7f3074fb7 100644 (file)
@@ -37,7 +37,7 @@ dnl   * EMPTY.
 dnl   * MISSING.
 dnl - VLABELS.
 dnl - SMISSING.
-dnl - WEIGHT and adjustment weights.
+dnl - Test WEIGHT and adjustment weights.
 dnl - PCOMPUTE and PPROPERTIES.
 dnl - HIDESMALLCOUNTS.