FREQUENCIES and layered split file has tests that work

author Ben Pfaff <blp@cs.stanford.edu>

Mon, 8 Aug 2022 05:26:01 +0000 (22:26 -0700)

committer Ben Pfaff <blp@cs.stanford.edu>

Mon, 8 Aug 2022 05:26:15 +0000 (22:26 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Mon, 8 Aug 2022 05:26:01 +0000 (22:26 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Mon, 8 Aug 2022 05:26:15 +0000 (22:26 -0700)
diff --git a/src/output/pivot-table.c b/src/output/pivot-table.c

index 1e7d2fbb5ac6b72308f54ace365ade5c4a6053f9..6d1efb882b4748b3a009d6d3e0e64c05b0b1e624 100644 (file)
--- a/src/output/pivot-table.c
+++ b/src/output/pivot-table.c
@@ -3043,6 +3043,7 @@ struct pivot_splits_value
  struct pivot_splits_var
    {
      struct pivot_dimension *dimension;
+    char *name;
      int width;
      size_t idx;
      struct fmt_spec format;
@@ -3056,8 +3057,17 @@ struct pivot_splits
      char *encoding;
  
      size_t dindexes[MAX_SPLITS];
+
+    int warnings_left;
    };
  
+/* Adds a dimension for each layered split file variable in DICT to PT on AXIS.
+   These dimensions should be the last dimensions added to PT (the
+   pivot_splits_put*() functions rely on this).  Returns a new pivot_splits
+   structure if any dimensions were added, otherwise a null pointer.
+
+   See the large comment on split file handling in pivot-table.h for more
+   information. */
  struct pivot_splits *
  pivot_splits_create (struct pivot_table *pt,
                       enum pivot_axis_type axis,
@@ -3071,7 +3081,7 @@ pivot_splits_create (struct pivot_table *pt,
  
    const struct variable *const *vars = dict_get_split_vars (dict);
    struct pivot_splits_var *psvars = xnmalloc (n, sizeof *psvars);
-  for (size_t i = 0; i < n; i++)
+  for (size_t i = n - 1; i < n; i--)
      {
        const struct variable *var = vars[i];
        struct pivot_splits_var *psvar = &psvars[i];
@@ -3082,6 +3092,7 @@ pivot_splits_create (struct pivot_table *pt,
  
        *psvar = (struct pivot_splits_var) {
          .width = var_get_width (var),
+        .name = xstrdup (var_get_name (var)),
          .idx = var_get_case_index (var),
          .format = *var_get_print_format (var),
          .values = HMAP_INITIALIZER (psvar->values),
@@ -3095,16 +3106,25 @@ pivot_splits_create (struct pivot_table *pt,
      .n = n,
      .encoding = xstrdup (dict_get_encoding (dict)),
      .dindexes = { [0] = SIZE_MAX },
+    .warnings_left = 5,
    };
    return ps;
  }
  
+/* Destroys PS. */
  void
  pivot_splits_destroy (struct pivot_splits *ps)
  {
    if (!ps)
      return;
  
+  if (ps->warnings_left < 0)
+    msg (SW, ngettext ("Suppressed %d additional warning about duplicate "
+                       "split values.",
+                       "Suppressed %d additional warnings about duplicate "
+                       "split values.", -ps->warnings_left),
+         -ps->warnings_left);
+
    for (size_t i = 0; i < ps->n; i++)
      {
        struct pivot_splits_var *psvar = &ps->vars[i];
@@ -3116,6 +3136,7 @@ pivot_splits_destroy (struct pivot_splits *ps)
            hmap_delete (&psvar->values, &psval->hmap_node);
            free (psval);
          }
+      free (psvar->name);
        hmap_destroy (&psvar->values);
      }
    free (ps->vars);
@@ -3135,16 +3156,24 @@ pivot_splits_value_find (struct pivot_splits_var *psvar,
    return NULL;
  }
  
+/* Begins adding data for a new split file group to the pivot table associated
+   with PS.  EXAMPLE should be a case from the new split file group.
+
+   This is a no-op if PS is NULL.
+
+   See the large comment on split file handling in pivot-table.h for more
+   information. */
  void
-pivot_splits_new_split (struct pivot_splits *ps, const struct ccase *c)
+pivot_splits_new_split (struct pivot_splits *ps, const struct ccase *example)
  {
    if (!ps)
      return;
  
-  for (size_t i = ps->n - 1; i < ps->n; i--)
+  size_t n_new = 0;
+  for (size_t i = 0; i < ps->n; i++)
      {
        struct pivot_splits_var *psvar = &ps->vars[i];
-      const union value *value = case_data_idx (c, psvar->idx);
+      const union value *value = case_data_idx (example, psvar->idx);
        struct pivot_splits_value *psval = pivot_splits_value_find (psvar, value);
        if (!psval)
          {
@@ -3156,13 +3185,50 @@ pivot_splits_new_split (struct pivot_splits *ps, const struct ccase *c)
              psvar->dimension->root,
              pivot_value_new_value (value, psvar->width, &psvar->format,
                                     ps->encoding));
+          n_new++;
          }
  
        ps->dindexes[i] = psval->leaf;
      }
+
+  if (!n_new)
+    {
+      if (ps->warnings_left-- > 0)
+        {
+          struct string s = DS_EMPTY_INITIALIZER;
+          for (size_t i = 0; i < ps->n; i++)
+            {
+              if (i > 0)
+                ds_put_cstr (&s, ", ");
+
+              struct pivot_splits_var *psvar = &ps->vars[i];
+              const union value *value = case_data_idx (example, psvar->idx);
+              ds_put_format (&s, "%s = ", psvar->name);
+
+              char *s2 = data_out (value, ps->encoding, &psvar->format,
+                                   settings_get_fmt_settings ());
+              ds_put_cstr (&s, s2 + strspn (s2, " "));
+              free (s2);
+            }
+          msg (SW, _("When SPLIT FILE is in effect, the input data must be "
+                     "sorted by the split variables (for example, using SORT "
+                     "CASES), but multiple runs of cases with the same split "
+                     "values were found separated by cases with different "
+                     "values.  Each run will be analyzed separately.  The "
+                     "duplicate split values are: %s"), ds_cstr (&s));
+          ds_destroy (&s);
+        }
+
+      struct pivot_splits_var *psvar = &ps->vars[0];
+      const union value *value = case_data_idx (example, psvar->idx);
+      ps->dindexes[0] = pivot_category_create_leaf (
+        psvar->dimension->root,
+        pivot_value_new_value (value, psvar->width, &psvar->format,
+                               ps->encoding));
+    }
  }
  
-size_t
+static size_t
  pivot_splits_get_dindexes (const struct pivot_splits *ps, size_t *dindexes)
  {
    if (!ps)
@@ -3174,6 +3240,12 @@ pivot_splits_get_dindexes (const struct pivot_splits *ps, size_t *dindexes)
    return ps->n;
  }
  
+/* Puts VALUE in the cell in TABLE with index IDX1.  TABLE must have 1
+   dimension plus the split file dimensions from PS (if nonnull).  Takes
+   ownership of VALUE.
+
+   See the large comment on split file handling in pivot-table.h for more
+   information. */
  void
  pivot_splits_put1 (struct pivot_splits *ps, struct pivot_table *table,
                     size_t idx1, struct pivot_value *value)
@@ -3185,6 +3257,12 @@ pivot_splits_put1 (struct pivot_splits *ps, struct pivot_table *table,
    pivot_table_put (table, dindexes, p - dindexes, value);
  }
  
+/* Puts VALUE in the cell in TABLE with index (IDX1, IDX2).  TABLE must have 2
+   dimensions plus the split file dimensions from PS (if nonnull).  Takes
+   ownership of VALUE.
+
+   See the large comment on split file handling in pivot-table.h for more
+   information. */
  void
  pivot_splits_put2 (struct pivot_splits *ps, struct pivot_table *table,
                     size_t idx1, size_t idx2, struct pivot_value *value)
@@ -3197,6 +3275,12 @@ pivot_splits_put2 (struct pivot_splits *ps, struct pivot_table *table,
    pivot_table_put (table, dindexes, p - dindexes, value);
  }
  
+/* Puts VALUE in the cell in TABLE with index (IDX1, IDX2, IDX3).  TABLE must
+   have 3 dimensions plus the split file dimensions from PS (if nonnull).
+   Takes ownership of VALUE.
+
+   See the large comment on split file handling in pivot-table.h for more
+   information. */
  void
  pivot_splits_put3 (struct pivot_splits *ps, struct pivot_table *table,
                     size_t idx1, size_t idx2, size_t idx3,
@@ -3211,6 +3295,12 @@ pivot_splits_put3 (struct pivot_splits *ps, struct pivot_table *table,
    pivot_table_put (table, dindexes, p - dindexes, value);
  }
  
+/* Puts VALUE in the cell in TABLE with index (IDX1, IDX2, IDX3, IDX4).  TABLE
+   must have 4 dimensions plus the split file dimensions from PS (if nonnull).
+   Takes ownership of VALUE.
+
+   See the large comment on split file handling in pivot-table.h for more
+   information. */
  void
  pivot_splits_put4 (struct pivot_splits *ps, struct pivot_table *table,
                     size_t idx1, size_t idx2, size_t idx3, size_t idx4,
diff --git a/src/output/pivot-table.h b/src/output/pivot-table.h

index 2904fd4f357c1211da1d3b95d5aeb193e3776f6e..f2125c7058b3c103e09a3909a39be019c0cca337 100644 (file)
--- a/src/output/pivot-table.h
+++ b/src/output/pivot-table.h
@@ -282,14 +282,43 @@ void pivot_dimension_destroy (struct pivot_dimension *);
  void pivot_dimension_dump (const struct pivot_dimension *,
                             const struct pivot_table *, int indentation);
  \f
+/* Split file handling with pivot tables.
+
+   When SPLIT FILE is in effect with the LAYERED option, values for the split
+   file variables need to be incorporated into pivot table output.  These
+   functions make that easier.
+
+   To use them:
+
+   1. After adding the rest of the dimensions to an output pivot table, call
+      pivot_splits_create().  If any split file variables exist and LAYERED
+      mode is in use, then pivot_splits_create() will add a dimension for each
+      split file variable and return a structure.  Otherwise, it returns NULL.
+
+   2. Before adding data to the pivot table for each SPLIT FILE group, call
+      pivot_splits_new_split(), passing in an example case from the group (the
+      first or last case is fine).  This makes the split file handler add
+      categories for the group to the split dimensions.
+
+      pivot_splits_new_split() does nothing if given a null pivot_splits, so
+      it's fine to call it unconditionally.
+
+   3. Use pivot_splits_put*(), instead of pivot_table_put*(), to add data to
+      the pivot table.  These functions automatically add the current group
+      leaf indexes after the indexes passed in, as a convenience.
+
+      These functions still work fine if given a null pivot_splits, so it's
+      fine to use them in all cases.
+
+   4. Destroy the pivot_splits with pivot_splits_destroy() when the pivot table
+      has been fully constructed. */
+
  struct pivot_splits *pivot_splits_create (struct pivot_table *,
                                            enum pivot_axis_type,
                                            const struct dictionary *);
  void pivot_splits_destroy (struct pivot_splits *);
  
  void pivot_splits_new_split (struct pivot_splits *, const struct ccase *);
-size_t pivot_splits_get_dindexes (const struct pivot_splits *,
-                                  size_t *dindexes);
  
  void pivot_splits_put1 (struct pivot_splits *, struct pivot_table *,
                          size_t idx1, struct pivot_value *);
@@ -301,8 +330,6 @@ void pivot_splits_put3 (struct pivot_splits *, struct pivot_table *,
  void pivot_splits_put4 (struct pivot_splits *, struct pivot_table *,
                          size_t idx1, size_t idx2, size_t idx3, size_t idx4,
                          struct pivot_value *);
-
-size_t pivot_splits_count (const struct pivot_splits *);
  \f
  /* A pivot_category is a leaf (a category) or a group:
  
diff --git a/tests/language/stats/frequencies.at b/tests/language/stats/frequencies.at

index ad922bc11473d3f153f56c6e4b75a7332285ffd2..6c85236b49b803f9ae303a26a1c0c96c247f9878 100644 (file)
--- a/tests/language/stats/frequencies.at
+++ b/tests/language/stats/frequencies.at
@@ -47,6 +47,231 @@ Total,,10,100.0%,,
  ])
  AT_CLEANUP
  
+AT_SETUP([FREQUENCIES with SPLIT FILE - LAYERED])
+AT_DATA([frequencies.sps], [dnl
+DATA LIST LIST NOTABLE/name (A8) value quantity.
+BEGIN DATA.
+foo 1 5
+bar 2 6
+baz 1 9
+quux 3 1
+bar 1 2
+baz 4 3
+baz 1 4
+baz 1 1
+foo 6 0
+quux 5 8
+END DATA.
+EXECUTE.
+
+SORT CASES BY name.
+SPLIT FILE BY name.
+FREQUENCIES /VARIABLES=value quantity /FORMAT NOTABLE.
+])
+AT_CHECK([pspp -O format=csv frequencies.sps], [0], [dnl
+Table: Statistics
+name,,,value,quantity
+bar,N,Valid,2,2
+,,Missing,0,0
+,Mean,,1.50,4.00
+,Std Dev,,.71,2.83
+,Minimum,,1.00,2.00
+,Maximum,,2.00,6.00
+baz,N,Valid,4,4
+,,Missing,0,0
+,Mean,,1.75,4.25
+,Std Dev,,1.50,3.40
+,Minimum,,1.00,1.00
+,Maximum,,4.00,9.00
+foo,N,Valid,2,2
+,,Missing,0,0
+,Mean,,3.50,2.50
+,Std Dev,,3.54,3.54
+,Minimum,,1.00,.00
+,Maximum,,6.00,5.00
+quux,N,Valid,2,2
+,,Missing,0,0
+,Mean,,4.00,4.50
+,Std Dev,,1.41,4.95
+,Minimum,,3.00,1.00
+,Maximum,,5.00,8.00
+])
+AT_CLEANUP
+
+AT_SETUP([FREQUENCIES with SPLIT FILE - SEPARATE])
+AT_DATA([frequencies.sps], [dnl
+DATA LIST LIST NOTABLE/name (A8) value quantity.
+BEGIN DATA.
+foo 1 5
+bar 2 6
+baz 1 9
+quux 3 1
+bar 1 2
+baz 4 3
+baz 1 4
+baz 1 1
+foo 6 0
+quux 5 8
+END DATA.
+EXECUTE.
+
+SORT CASES BY name.
+SPLIT FILE SEPARATE BY name.
+FREQUENCIES /VARIABLES=value quantity /FORMAT NOTABLE.
+])
+AT_CHECK([pspp -O format=csv frequencies.sps], [0], [dnl
+Table: Split Values
+Variable,Value
+name,bar
+
+Table: Statistics
+,,value,quantity
+N,Valid,2,2
+,Missing,0,0
+Mean,,1.50,4.00
+Std Dev,,.71,2.83
+Minimum,,1.00,2.00
+Maximum,,2.00,6.00
+
+Table: Split Values
+Variable,Value
+name,baz
+
+Table: Statistics
+,,value,quantity
+N,Valid,4,4
+,Missing,0,0
+Mean,,1.75,4.25
+Std Dev,,1.50,3.40
+Minimum,,1.00,1.00
+Maximum,,4.00,9.00
+
+Table: Split Values
+Variable,Value
+name,foo
+
+Table: Statistics
+,,value,quantity
+N,Valid,2,2
+,Missing,0,0
+Mean,,3.50,2.50
+Std Dev,,3.54,3.54
+Minimum,,1.00,.00
+Maximum,,6.00,5.00
+
+Table: Split Values
+Variable,Value
+name,quux
+
+Table: Statistics
+,,value,quantity
+N,Valid,2,2
+,Missing,0,0
+Mean,,4.00,4.50
+Std Dev,,1.41,4.95
+Minimum,,3.00,1.00
+Maximum,,5.00,8.00
+])
+AT_CLEANUP
+
+AT_SETUP([FREQUENCIES with SPLIT FILE - LAYERED - unsorted data])
+AT_DATA([frequencies.sps], [dnl
+DATA LIST LIST NOTABLE/name (A8) value quantity.
+BEGIN DATA.
+foo 1 5
+bar 2 6
+baz 1 9
+quux 3 1
+baz 4 3
+bar 1 2
+baz 1 1
+foo 6 0
+baz 1 4
+quux 5 8
+END DATA.
+EXECUTE.
+
+SPLIT FILE BY name.
+FREQUENCIES /VARIABLES=value quantity /FORMAT NOTABLE.
+])
+AT_CHECK([pspp -O format=csv frequencies.sps], [0], [dnl
+"frequencies.sps:17: warning: FREQUENCIES: When SPLIT FILE is in effect, the input data must be sorted by the split variables (for example, using SORT CASES), but multiple runs of cases with the same split values were found separated by cases with different values.  Each run will be analyzed separately.  The duplicate split values are: name = baz     "
+
+"frequencies.sps:17: warning: FREQUENCIES: When SPLIT FILE is in effect, the input data must be sorted by the split variables (for example, using SORT CASES), but multiple runs of cases with the same split values were found separated by cases with different values.  Each run will be analyzed separately.  The duplicate split values are: name = bar     "
+
+"frequencies.sps:17: warning: FREQUENCIES: When SPLIT FILE is in effect, the input data must be sorted by the split variables (for example, using SORT CASES), but multiple runs of cases with the same split values were found separated by cases with different values.  Each run will be analyzed separately.  The duplicate split values are: name = baz     "
+
+"frequencies.sps:17: warning: FREQUENCIES: When SPLIT FILE is in effect, the input data must be sorted by the split variables (for example, using SORT CASES), but multiple runs of cases with the same split values were found separated by cases with different values.  Each run will be analyzed separately.  The duplicate split values are: name = foo     "
+
+"frequencies.sps:17: warning: FREQUENCIES: When SPLIT FILE is in effect, the input data must be sorted by the split variables (for example, using SORT CASES), but multiple runs of cases with the same split values were found separated by cases with different values.  Each run will be analyzed separately.  The duplicate split values are: name = baz     "
+
+Table: Statistics
+name,,,value,quantity
+foo,N,Valid,1,1
+,,Missing,0,0
+,Mean,,1.00,5.00
+,Std Dev,,NaN,NaN
+,Minimum,,1.00,5.00
+,Maximum,,1.00,5.00
+bar,N,Valid,1,1
+,,Missing,0,0
+,Mean,,2.00,6.00
+,Std Dev,,NaN,NaN
+,Minimum,,2.00,6.00
+,Maximum,,2.00,6.00
+baz,N,Valid,1,1
+,,Missing,0,0
+,Mean,,1.00,9.00
+,Std Dev,,NaN,NaN
+,Minimum,,1.00,9.00
+,Maximum,,1.00,9.00
+quux,N,Valid,1,1
+,,Missing,0,0
+,Mean,,3.00,1.00
+,Std Dev,,NaN,NaN
+,Minimum,,3.00,1.00
+,Maximum,,3.00,1.00
+baz,N,Valid,1,1
+,,Missing,0,0
+,Mean,,4.00,3.00
+,Std Dev,,NaN,NaN
+,Minimum,,4.00,3.00
+,Maximum,,4.00,3.00
+bar,N,Valid,1,1
+,,Missing,0,0
+,Mean,,1.00,2.00
+,Std Dev,,NaN,NaN
+,Minimum,,1.00,2.00
+,Maximum,,1.00,2.00
+baz,N,Valid,1,1
+,,Missing,0,0
+,Mean,,1.00,1.00
+,Std Dev,,NaN,NaN
+,Minimum,,1.00,1.00
+,Maximum,,1.00,1.00
+foo,N,Valid,1,1
+,,Missing,0,0
+,Mean,,6.00,.00
+,Std Dev,,NaN,NaN
+,Minimum,,6.00,.00
+,Maximum,,6.00,.00
+baz,N,Valid,1,1
+,,Missing,0,0
+,Mean,,1.00,4.00
+,Std Dev,,NaN,NaN
+,Minimum,,1.00,4.00
+,Maximum,,1.00,4.00
+quux,N,Valid,1,1
+,,Missing,0,0
+,Mean,,5.00,8.00
+,Std Dev,,NaN,NaN
+,Minimum,,5.00,8.00
+,Maximum,,5.00,8.00
+
+frequencies.sps:17: warning: FREQUENCIES: Suppressed 1 additional warning about duplicate split values.
+])
+AT_CLEANUP
+
  # Tests for a bug where pspp would crash if two FREQUENCIES commands
  # existed in a input file.
  AT_SETUP([FREQUENCIES two runs crash])
author	Ben Pfaff <blp@cs.stanford.edu>
	Mon, 8 Aug 2022 05:26:01 +0000 (22:26 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Mon, 8 Aug 2022 05:26:15 +0000 (22:26 -0700)
src/output/pivot-table.c		patch \| blob \| history
src/output/pivot-table.h		patch \| blob \| history
tests/language/stats/frequencies.at		patch \| blob \| history