From 573bdf24eadcb5a16c3fab8c87090e2120b04c0a Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 2 Jul 2022 20:26:47 -0700 Subject: [PATCH] SMISSING --- doc/automake.mk | 4 ++ doc/pspp-figures/ctables18.sps | 9 +++ doc/pspp-figures/ctables19.sps | 9 +++ doc/pspp-figures/ctables20.sps | 10 +++ doc/pspp-figures/ctables21.sps | 11 +++ doc/statistics.texi | 49 +++++++++++-- src/language/stats/ctables.c | 124 +++++++++++++++++++++++++------- tests/language/stats/ctables.at | 54 +++++++++++++- 8 files changed, 234 insertions(+), 36 deletions(-) create mode 100644 doc/pspp-figures/ctables18.sps create mode 100644 doc/pspp-figures/ctables19.sps create mode 100644 doc/pspp-figures/ctables20.sps create mode 100644 doc/pspp-figures/ctables21.sps diff --git a/doc/automake.mk b/doc/automake.mk index 286b6844a7..4ecd6ff9c0 100644 --- a/doc/automake.mk +++ b/doc/automake.mk @@ -134,6 +134,10 @@ FIGURE_SYNTAX = \ doc/pspp-figures/ctables15.sps \ doc/pspp-figures/ctables16.sps \ doc/pspp-figures/ctables17.sps \ + doc/pspp-figures/ctables18.sps \ + doc/pspp-figures/ctables19.sps \ + doc/pspp-figures/ctables20.sps \ + doc/pspp-figures/ctables21.sps \ doc/pspp-figures/crosstabs.sps \ doc/pspp-figures/descriptives.sps \ doc/pspp-figures/flip.sps \ diff --git a/doc/pspp-figures/ctables18.sps b/doc/pspp-figures/ctables18.sps new file mode 100644 index 0000000000..71248785cc --- /dev/null +++ b/doc/pspp-figures/ctables18.sps @@ -0,0 +1,9 @@ +DATA LIST LIST NOTABLE/x y z. +BEGIN DATA. +1 . 40 +1 10 50 +1 20 60 +1 30 . +END DATA. +VARIABLE LEVEL x (NOMINAL). +LIST. \ No newline at end of file diff --git a/doc/pspp-figures/ctables19.sps b/doc/pspp-figures/ctables19.sps new file mode 100644 index 0000000000..ada8823bcc --- /dev/null +++ b/doc/pspp-figures/ctables19.sps @@ -0,0 +1,9 @@ +DATA LIST LIST NOTABLE/x y z. +BEGIN DATA. +1 . 40 +1 10 50 +1 20 60 +1 30 . +END DATA. +VARIABLE LEVEL x (NOMINAL). +CTABLES /TABLE (y + z) > x. 
diff --git a/doc/pspp-figures/ctables20.sps b/doc/pspp-figures/ctables20.sps new file mode 100644 index 0000000000..6814a274e5 --- /dev/null +++ b/doc/pspp-figures/ctables20.sps @@ -0,0 +1,10 @@ +DATA LIST LIST NOTABLE/x y z. +BEGIN DATA. +1 . 40 +1 10 50 +1 20 60 +1 30 . +END DATA. +VARIABLE LEVEL x (NOMINAL). +CTABLES /SMISSING LISTWISE /TABLE (y + z) > x. + diff --git a/doc/pspp-figures/ctables21.sps b/doc/pspp-figures/ctables21.sps new file mode 100644 index 0000000000..8fb18d8b2b --- /dev/null +++ b/doc/pspp-figures/ctables21.sps @@ -0,0 +1,11 @@ +DATA LIST LIST NOTABLE/x y z. +BEGIN DATA. +1 . 40 +1 10 50 +1 20 60 +1 30 . +END DATA. +VARIABLE LEVEL x (NOMINAL). +CTABLES /SMISSING LISTWISE /TABLE (y > x) + (z > x). + + diff --git a/doc/statistics.texi b/doc/statistics.texi index 78b76bfbc4..5b8b841933 100644 --- a/doc/statistics.texi +++ b/doc/statistics.texi @@ -1506,7 +1506,7 @@ order: @table @asis @item Explicit categories. -@anchor{CTABLE Explicit Category List} +@anchor{CTABLES Explicit Category List} To explicitly specify categories to include, list the categories within square brackets in the desired sort order. Use spaces or commas to separate values. Categories not covered by the list are @@ -1707,9 +1707,46 @@ is optional. With @code{SMISSING=VARIABLE}, which is the default, missing values are excluded on a variable-by-variable basis. With -@code{SMISSING=LISTWISE}, when scalar variables are stacked, a missing -value for any of the scalar variables causes the case to be excluded -for all of them. +@code{SMISSING=LISTWISE}, when stacked scalar variables are nested +together with a categorical variable, a missing value for any of the +scalar variables causes the case to be excluded for all of them. 
+ +As an example, consider the following dataset, in which @samp{x} is a +categorical variable and @samp{y} and @samp{z} are scale: + +@psppoutput{ctables18} + +@noindent +With the default missing-value treatment, @samp{y}'s mean is 20, based +on the values 10, 20, and 30, and @samp{z}'s mean is 50, based on 40, +50, and 60: + +@example +CTABLES /TABLE (y + z) > x. +@end example +@psppoutput{ctables19} + +@noindent +By adding @code{SMISSING=LISTWISE}, only cases where @samp{y} and +@samp{z} are both non-missing are considered, so @samp{y}'s mean +becomes 15, as the average of 10 and 20, and @samp{z}'s mean becomes +55, the average of 50 and 60: + +@example +CTABLES /SMISSING LISTWISE /TABLE (y + z) > x. +@end example +@psppoutput{ctables20} + +@noindent +Even with @code{SMISSING=LISTWISE}, if @samp{y} and @samp{z} are +separately nested with @samp{x}, instead of using a single @samp{>} +operator, missing values revert to being considered on a +variable-by-variable basis: + +@example +CTABLES /SMISSING LISTWISE /TABLE (y > x) + (z > x). +@end example +@psppoutput{ctables21} @node CTABLES Computed Categories @subsection Computed Categories @@ -1722,7 +1759,7 @@ for all of them. categories created using arithmetic on categories obtained from the data. The @code{PCOMPUTE} subcommand defines computed categories, which can then be used in two places: on @code{CATEGORIES} within an -explicit category list (@pxref{CTABLE Explicit Category List}), and on +explicit category list (@pxref{CTABLES Explicit Category List}), and on the @code{PPROPERTIES} subcommand to define further properties for a given postcompute. @@ -1749,7 +1786,7 @@ postcompute must have the same form. @itemx MISSING @itemx OTHERNM These forms evaluate to the summary statistics for categories matching -the given syntax, as described in previous sections (@pxref{CTABLE +the given syntax, as described in previous sections (@pxref{CTABLES Explicit Category List}). 
If more than one category matches, their values are summed. diff --git a/src/language/stats/ctables.c b/src/language/stats/ctables.c index d3a00312a9..961ac6a8ca 100644 --- a/src/language/stats/ctables.c +++ b/src/language/stats/ctables.c @@ -351,6 +351,12 @@ struct ctables_summary_spec_set (VALIDN and TOTALN act differently for summarizing scale and categorical variables.) */ bool is_scale; + + /* If any of these optional additional scale variables are missing, then + treat 'var' as if it's missing too. This is for implementing + SMISSING=LISTWISE. */ + struct variable **listwise_vars; + size_t n_listwise_vars; }; static void ctables_summary_spec_set_clone (struct ctables_summary_spec_set *, @@ -365,6 +371,7 @@ struct ctables_nest size_t scale_idx; size_t *domains[N_CTDTS]; size_t n_domains[N_CTDTS]; + size_t group_head; struct ctables_summary_spec_set specs[N_CSVS]; }; @@ -2031,13 +2038,41 @@ stack_fts (struct ctables_stack s0, struct ctables_stack s1) for (size_t i = 0; i < s0.n; i++) stack.nests[stack.n++] = s0.nests[i]; for (size_t i = 0; i < s1.n; i++) - stack.nests[stack.n++] = s1.nests[i]; + { + stack.nests[stack.n] = s1.nests[i]; + stack.nests[stack.n].group_head += s0.n; + stack.n++; + } assert (stack.n == s0.n + s1.n); free (s0.nests); free (s1.nests); return stack; } +static struct ctables_stack +var_fts (const struct ctables_axis *a) +{ + assert (!a->var.is_mrset); + + struct variable **vars = xmalloc (sizeof *vars); + *vars = a->var.var; + + struct ctables_nest *nest = xmalloc (sizeof *nest); + *nest = (struct ctables_nest) { + .vars = vars, + .n = 1, + .scale_idx = a->scale ? 
0 : SIZE_MAX, + }; + if (a->specs[CSV_CELL].n || a->scale) + for (enum ctables_summary_variant sv = 0; sv < N_CSVS; sv++) + { + ctables_summary_spec_set_clone (&nest->specs[sv], &a->specs[sv]); + nest->specs[sv].var = a->var.var; + nest->specs[sv].is_scale = a->scale; + } + return (struct ctables_stack) { .nests = nest, .n = 1 }; +} + static struct ctables_stack enumerate_fts (enum pivot_axis_type axis_type, const struct ctables_axis *a) { @@ -2047,31 +2082,15 @@ enumerate_fts (enum pivot_axis_type axis_type, const struct ctables_axis *a) switch (a->op) { case CTAO_VAR: - assert (!a->var.is_mrset); - - struct variable **vars = xmalloc (sizeof *vars); - *vars = a->var.var; - - struct ctables_nest *nest = xmalloc (sizeof *nest); - *nest = (struct ctables_nest) { - .vars = vars, - .n = 1, - .scale_idx = a->scale ? 0 : SIZE_MAX, - }; - if (a->specs[CSV_CELL].n || a->scale) - for (enum ctables_summary_variant sv = 0; sv < N_CSVS; sv++) - { - ctables_summary_spec_set_clone (&nest->specs[sv], &a->specs[sv]); - nest->specs[sv].var = a->var.var; - nest->specs[sv].is_scale = a->scale; - } - return (struct ctables_stack) { .nests = nest, .n = 1 }; + return var_fts (a); case CTAO_STACK: return stack_fts (enumerate_fts (axis_type, a->subs[0]), enumerate_fts (axis_type, a->subs[1])); case CTAO_NEST: + /* This should consider any of the scale variables found in the result to + be linked to each other listwise for SMISSING=LISTWISE. 
*/ return nest_fts (enumerate_fts (axis_type, a->subs[0]), enumerate_fts (axis_type, a->subs[1])); } @@ -2252,7 +2271,8 @@ static void ctables_summary_add (union ctables_summary *s, const struct ctables_summary_spec *ss, const struct variable *var, const union value *value, - bool is_scale, bool is_missing, bool excluded_missing, + bool is_scale, bool is_scale_missing, + bool is_missing, bool excluded_missing, double d_weight, double e_weight) { /* To determine whether a case is included in a given table for a particular @@ -2307,7 +2327,7 @@ ctables_summary_add (union ctables_summary *s, case CTSF_LAYERROWPCT_VALIDN: case CTSF_LAYERCOLPCT_VALIDN: if (is_scale - ? !var_is_value_missing (var, value) + ? !is_scale_missing : !is_missing) s->count += d_weight; break; @@ -2324,7 +2344,7 @@ ctables_summary_add (union ctables_summary *s, case CTSF_EVALIDN: if (is_scale - ? !var_is_value_missing (var, value) + ? !is_scale_missing : !is_missing) s->count += e_weight; break; @@ -2336,7 +2356,7 @@ ctables_summary_add (union ctables_summary *s, case CTSF_MAXIMUM: case CTSF_MINIMUM: case CTSF_RANGE: - if (!var_is_value_missing (var, value)) + if (!is_scale_missing) { assert (!var_is_alpha (var)); /* XXX? 
*/ if (s->min == SYSMIS || value->f < s->min) @@ -2358,14 +2378,14 @@ ctables_summary_add (union ctables_summary *s, case CTSF_LAYERPCT_SUM: case CTSF_LAYERROWPCT_SUM: case CTSF_LAYERCOLPCT_SUM: - if (!var_is_value_missing (var, value)) + if (!is_scale_missing) moments1_add (s->moments, value->f, e_weight); break; case CTSF_MEDIAN: case CTSF_MODE: case CTSF_PTILE: - if (var_is_value_missing (var, value)) + if (!is_scale_missing) { s->ovalid += e_weight; @@ -2954,6 +2974,26 @@ ctables_cell_insert__ (struct ctables_section *s, const struct ccase *c, return cell; } +static bool +is_scale_missing (const struct ctables_summary_spec_set *specs, + const struct ccase *c) +{ + if (!specs->is_scale) + return false; + + if (var_is_num_missing (specs->var, case_num (c, specs->var))) + return true; + + for (size_t i = 0; i < specs->n_listwise_vars; i++) + { + const struct variable *var = specs->listwise_vars[i]; + if (var_is_num_missing (var, case_num (c, var))) + return true; + } + + return false; +} + static void ctables_cell_add__ (struct ctables_section *s, const struct ccase *c, const struct ctables_category *cats[PIVOT_N_AXES][10], @@ -2964,10 +3004,13 @@ ctables_cell_add__ (struct ctables_section *s, const struct ccase *c, const struct ctables_nest *ss = s->nests[s->table->summary_axis]; const struct ctables_summary_spec_set *specs = &ss->specs[cell->sv]; + + bool scale_missing = is_scale_missing (specs, c); for (size_t i = 0; i < specs->n; i++) ctables_summary_add (&cell->summaries[i], &specs->specs[i], specs->var, case_data (c, specs->var), specs->is_scale, - is_missing, excluded_missing, d_weight, e_weight); + scale_missing, is_missing, excluded_missing, + d_weight, e_weight); for (enum ctables_domain_type dt = 0; dt < N_CTDTS; dt++) if (!(cell->omit_domains && (1u << dt))) { @@ -3993,6 +4036,33 @@ ctables_prepare_table (struct ctables_table *t) else if (!nest->specs[CSV_TOTAL].n) ctables_summary_spec_set_clone (&nest->specs[CSV_TOTAL], &nest->specs[CSV_CELL]); + + if 
(t->ctables->smissing_listwise) + { + struct variable **listwise_vars = NULL; + size_t n = 0; + size_t allocated = 0; + + for (size_t j = nest->group_head; j < stack->n; j++) + { + const struct ctables_nest *other_nest = &stack->nests[j]; + if (other_nest->group_head != nest->group_head) + break; + + if (nest != other_nest && other_nest->scale_idx < other_nest->n) + { + if (n >= allocated) + listwise_vars = x2nrealloc (listwise_vars, &allocated, + sizeof *listwise_vars); + listwise_vars[n++] = other_nest->vars[other_nest->scale_idx]; + } + } + for (size_t j = 0; j < N_CSVS; j++) + { + nest->specs[j].listwise_vars = listwise_vars; + nest->specs[j].n_listwise_vars = n; + } + } } struct ctables_summary_spec_set *merged = &t->summary_specs; diff --git a/tests/language/stats/ctables.at b/tests/language/stats/ctables.at index e91301a0a5..1384710f8e 100644 --- a/tests/language/stats/ctables.at +++ b/tests/language/stats/ctables.at @@ -12,13 +12,11 @@ dnl * Unimplemented ones. dnl * U-prefix for unweighted summaries. dnl * .LCL and .UCL suffixes. dnl * .SE suffixes. -dnl * Why are summary functions for scale variables also available for totals and subtotals? dnl - CATEGORIES: dnl * String values dnl * Date values dnl * Data-dependent sorting. dnl - TITLES: )DATE, )TIME, )TABLE. -dnl - SMISSING (see documentation). dnl - PCOMPUTE: dnl * multi-dimensional dnl * MISSING, OTHERNM @@ -1198,4 +1196,54 @@ dnl is expected behavior. │ Total N │ 5│ 5│ 5│ 5│ 30│ ╰────────────────────────┴──────┴──────┴──────┴──────┴──────╯ ]) -AT_CLEANUP \ No newline at end of file +AT_CLEANUP + +AT_SETUP([CTABLES SMISSING=LISTWISE]) +AT_KEYWORDS([SMISSING LISTWISE]) +AT_DATA([ctables.sps], +[[DATA LIST LIST NOTABLE/x y z. +BEGIN DATA. +1 . 40 +1 10 50 +1 20 60 +1 . . +1 30 . +END DATA. +VARIABLE LEVEL x (NOMINAL). + +CTABLES /TABLE (y + z) > x. +CTABLES /SMISSING LISTWISE /TABLE (y + z) > x. 
+ +* The following doesn't come out as listwise because the tables are +separate, not linked by an > operator. +CTABLES /SMISSING LISTWISE /TABLE (y > x) + (z > x). +]]) +AT_CHECK([pspp ctables.sps -O box=unicode -O width=120], [0], [dnl + Custom Tables +╭────────┬─────╮ +│ │ Mean│ +├────────┼─────┤ +│y x 1.00│20.00│ +├────────┼─────┤ +│z x 1.00│50.00│ +╰────────┴─────╯ + + Custom Tables +╭────────┬─────╮ +│ │ Mean│ +├────────┼─────┤ +│y x 1.00│15.00│ +├────────┼─────┤ +│z x 1.00│55.00│ +╰────────┴─────╯ + + Custom Tables +╭────────┬─────╮ +│ │ Mean│ +├────────┼─────┤ +│y x 1.00│20.00│ +├────────┼─────┤ +│z x 1.00│50.00│ +╰────────┴─────╯ +]) +AT_CLEANUP -- 2.30.2