From b7f672dd5f58c5891d7845871a13f20240eb9edf Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Thu, 25 Aug 2022 07:00:49 -0700 Subject: [PATCH] work on missing values --- src/language/stats/ctables.c | 165 ++++++++++++++++---------------- tests/language/stats/ctables.at | 10 ++ 2 files changed, 95 insertions(+), 80 deletions(-) diff --git a/src/language/stats/ctables.c b/src/language/stats/ctables.c index bb7f5c37aa..621b9bab50 100644 --- a/src/language/stats/ctables.c +++ b/src/language/stats/ctables.c @@ -373,6 +373,7 @@ struct ctables_nest struct variable **vars; size_t n; size_t scale_idx; + size_t summary_idx; size_t *areas[N_CTATS]; size_t n_areas[N_CTATS]; size_t group_head; @@ -2421,6 +2422,9 @@ nest_fts (struct ctables_stack s0, struct ctables_stack s1) .scale_idx = (a->scale_idx != SIZE_MAX ? a->scale_idx : b->scale_idx != SIZE_MAX ? a->n + b->scale_idx : SIZE_MAX), + .summary_idx = (a->summary_idx != SIZE_MAX ? a->summary_idx + : b->summary_idx != SIZE_MAX ? a->n + b->summary_idx + : SIZE_MAX), .n = n, }; for (enum ctables_summary_variant sv = 0; sv < N_CSVS; sv++) @@ -2455,13 +2459,15 @@ var_fts (const struct ctables_axis *a) struct variable **vars = xmalloc (sizeof *vars); *vars = a->var; + bool is_summary = a->specs[CSV_CELL].n || a->scale; struct ctables_nest *nest = xmalloc (sizeof *nest); *nest = (struct ctables_nest) { .vars = vars, .n = 1, .scale_idx = a->scale ? 0 : SIZE_MAX, + .summary_idx = is_summary ? 0 : SIZE_MAX, }; - if (a->specs[CSV_CELL].n || a->scale) + if (is_summary) for (enum ctables_summary_variant sv = 0; sv < N_CSVS; sv++) { ctables_summary_spec_set_clone (&nest->specs[sv], &a->specs[sv]); @@ -2631,7 +2637,7 @@ ctables_summary_add (union ctables_summary *s, const struct ctables_summary_spec *ss, const struct variable *var, const union value *value, bool is_scale, bool is_scale_missing, - bool is_missing, bool excluded_missing, + bool is_missing, bool is_included, double d_weight, double e_weight) { /* To determine whether a case is included in a given table for a particular @@ -2663,12 +2669,12 @@ ctables_summary_add (union ctables_summary *s, break; case CTSF_COUNT: - if (is_scale || !excluded_missing) + if (is_scale || is_included) s->count += ss->weighted ? d_weight : 1.0; break; case CTSF_areaPCT_COUNT: - if (is_scale || !excluded_missing) + if (is_scale || is_included) s->count += ss->weighted ? e_weight : 1.0; break; @@ -2697,7 +2703,7 @@ ctables_summary_add (union ctables_summary *s, break; case CTSF_ECOUNT: - if (is_scale || !excluded_missing) + if (is_scale || is_included) s->count += e_weight; break; @@ -3300,15 +3306,9 @@ ctables_cell_insert__ (struct ctables_section *s, const struct ccase *c, } static bool -is_scale_missing (const struct ctables_summary_spec_set *specs, - const struct ccase *c) +is_listwise_missing (const struct ctables_summary_spec_set *specs, + const struct ccase *c) { - if (!specs->is_scale) - return false; - - if (var_is_num_missing (specs->var, case_num (c, specs->var))) - return true; - for (size_t i = 0; i < specs->n_listwise_vars; i++) { const struct variable *var = specs->listwise_vars[i]; @@ -3322,19 +3322,32 @@ is_scale_missing (const struct ctables_summary_spec_set *specs, static void ctables_cell_add__ (struct ctables_section *s, const struct ccase *c, const struct ctables_category *cats[PIVOT_N_AXES][10], - bool is_missing, bool excluded_missing, double d_weight, double e_weight) { struct ctables_cell *cell = ctables_cell_insert__ (s, c, cats); const struct ctables_nest *ss = s->nests[s->table->summary_axis]; const struct ctables_summary_spec_set *specs = &ss->specs[cell->sv]; + const union value *value = case_data (c, specs->var); + bool is_missing = var_is_value_missing (specs->var, value); + bool is_included; + bool scale_missing; + if (specs->is_scale) + { + is_included = !is_missing; + scale_missing = is_missing || is_listwise_missing (specs, c); + } + else + { + is_included = (cats[s->table->summary_axis][ss->summary_idx]->type + != CCT_EXCLUDED_MISSING); + scale_missing = false; + } - bool scale_missing = is_scale_missing (specs, c); for (size_t i = 0; i < specs->n; i++) - ctables_summary_add (&cell->summaries[i], &specs->specs[i], - specs->var, case_data (c, specs->var), specs->is_scale, - scale_missing, is_missing, excluded_missing, + ctables_summary_add (&cell->summaries[i], &specs->specs[i], + specs->var, value, specs->is_scale, + scale_missing, is_missing, is_included, d_weight, e_weight); for (enum ctables_area_type at = 0; at < N_CTATS; at++) if (!(cell->omit_areas && (1u << at))) @@ -3343,7 +3356,7 @@ ctables_cell_add__ (struct ctables_section *s, const struct ccase *c, a->d_total += d_weight; a->e_total += e_weight; a->u_total += 1.0; - if (!excluded_missing) + if (is_included) { a->d_count += d_weight; a->e_count += e_weight; @@ -3374,7 +3387,6 @@ ctables_cell_add__ (struct ctables_section *s, const struct ccase *c, static void recurse_totals (struct ctables_section *s, const struct ccase *c, const struct ctables_category *cats[PIVOT_N_AXES][10], - bool is_missing, bool excluded_missing, double d_weight, double e_weight, enum pivot_axis_type start_axis, size_t start_nest) { @@ -3393,12 +3405,17 @@ recurse_totals (struct ctables_section *s, const struct ccase *c, if (total) { const struct ctables_category *save = cats[a][i]; - cats[a][i] = total; - ctables_cell_add__ (s, c, cats, is_missing, excluded_missing, - d_weight, e_weight); - recurse_totals (s, c, cats, is_missing, excluded_missing, - d_weight, e_weight, a, i + 1); - cats[a][i] = save; + if (save->type != CCT_EXCLUDED_MISSING) + /* XXX ^^ this shows why we need to keep track of + 'excluded_missing' (or 'is_included') at a high level, + because it gets replaced by a total category. So we need to + restore that and plumb it through again. */ + { + cats[a][i] = total; + ctables_cell_add__ (s, c, cats, d_weight, e_weight); + recurse_totals (s, c, cats, d_weight, e_weight, a, i + 1); + cats[a][i] = save; + } } } start_nest = 0; @@ -3408,7 +3425,6 @@ recurse_totals (struct ctables_section *s, const struct ccase *c, static void recurse_subtotals (struct ctables_section *s, const struct ccase *c, const struct ctables_category *cats[PIVOT_N_AXES][10], - bool is_missing, bool excluded_missing, double d_weight, double e_weight, enum pivot_axis_type start_axis, size_t start_nest) { @@ -3424,10 +3440,8 @@ recurse_subtotals (struct ctables_section *s, const struct ccase *c, if (save->subtotal) { cats[a][i] = save->subtotal; - ctables_cell_add__ (s, c, cats, is_missing, excluded_missing, - d_weight, e_weight); - recurse_subtotals (s, c, cats, is_missing, excluded_missing, - d_weight, e_weight, a, i + 1); + ctables_cell_add__ (s, c, cats, d_weight, e_weight); + recurse_subtotals (s, c, cats, d_weight, e_weight, a, i + 1); cats[a][i] = save; } } @@ -3461,43 +3475,34 @@ ctables_cell_insert (struct ctables_section *s, { const struct ctables_category *cats[PIVOT_N_AXES][10]; /* XXX */ - /* Does at least one categorical variable have a missing value in an included - or excluded category? */ - bool is_missing = false; - - /* Does at least one categorical variable have a missing value in an excluded - category? */ bool excluded_missing = false; for (enum pivot_axis_type a = 0; a < PIVOT_N_AXES; a++) { const struct ctables_nest *nest = s->nests[a]; for (size_t i = 0; i < nest->n; i++) - { - if (i == nest->scale_idx) - continue; - - const struct variable *var = nest->vars[i]; - const union value *value = case_data (c, var); - - bool var_missing = var_is_value_missing (var, value) != 0; - if (var_missing) - is_missing = true; - - cats[a][i] = ctables_categories_match ( - s->table->categories[var_get_dict_index (var)], value, var); - if (!cats[a][i]) - { - if (!var_missing) - return; + if (i != nest->scale_idx) + { + const struct variable *var = nest->vars[i]; + const union value *value = case_data (c, var); - static const struct ctables_category cct_excluded_missing = { - .type = CCT_EXCLUDED_MISSING, - .hide = true, - }; - cats[a][i] = &cct_excluded_missing; - excluded_missing = true; - } + cats[a][i] = ctables_categories_match ( + s->table->categories[var_get_dict_index (var)], value, var); + if (!cats[a][i]) + { + if (i != nest->summary_idx) + return; + + if (!var_is_value_missing (var, value)) + return; + + static const struct ctables_category cct_excluded_missing = { + .type = CCT_EXCLUDED_MISSING, + .hide = true, + }; + cats[a][i] = &cct_excluded_missing; + excluded_missing = true; + } } } @@ -3514,16 +3519,9 @@ ctables_cell_insert (struct ctables_section *s, } } - ctables_cell_add__ (s, c, cats, is_missing, excluded_missing, - d_weight, e_weight); - - //if (!excluded_missing) - { - recurse_totals (s, c, cats, is_missing, excluded_missing, - d_weight, e_weight, 0, 0); - recurse_subtotals (s, c, cats, is_missing, excluded_missing, - d_weight, e_weight, 0, 0); - } + ctables_cell_add__ (s, c, cats, d_weight, e_weight); + recurse_totals (s, c, cats, d_weight, e_weight, 0, 0); + recurse_subtotals (s, c, cats, d_weight, e_weight, 0, 0); } struct merge_item @@ -4775,7 +4773,11 @@ ctables_prepare_table (struct ctables_table *t) else { struct ctables_nest *nest = xmalloc (sizeof *nest); - *nest = (struct ctables_nest) { .n = 0 }; + *nest = (struct ctables_nest) { + .n = 0, + .scale_idx = SIZE_MAX, + .summary_idx = SIZE_MAX + }; t->stacks[a] = (struct ctables_stack) { .nests = nest, .n = 1 }; /* There's no point in moving labels away from an axis that has no @@ -4789,20 +4791,23 @@ ctables_prepare_table (struct ctables_table *t) struct ctables_nest *nest = &stack->nests[i]; if (!nest->specs[CSV_CELL].n) { - struct ctables_summary_spec_set *specs = &nest->specs[CSV_CELL]; - specs->specs = xmalloc (sizeof *specs->specs); - specs->n = 1; + struct ctables_summary_spec_set *ss = &nest->specs[CSV_CELL]; + ss->specs = xmalloc (sizeof *ss->specs); + ss->n = 1; enum ctables_summary_function function - = specs->is_scale ? CTSF_MEAN : CTSF_COUNT; + = ss->is_scale ? CTSF_MEAN : CTSF_COUNT; - *specs->specs = (struct ctables_summary_spec) { + if (!ss->var) + { + nest->summary_idx = nest->n - 1; + ss->var = nest->vars[nest->summary_idx]; + } + *ss->specs = (struct ctables_summary_spec) { .function = function, .weighted = true, - .format = ctables_summary_default_format (function, specs->var), + .format = ctables_summary_default_format (function, ss->var), }; - if (!specs->var) - specs->var = nest->vars[0]; ctables_summary_spec_set_clone (&nest->specs[CSV_TOTAL], &nest->specs[CSV_CELL]); diff --git a/tests/language/stats/ctables.at b/tests/language/stats/ctables.at index f4eec3cecc..91e8ae0ffc 100644 --- a/tests/language/stats/ctables.at +++ b/tests/language/stats/ctables.at @@ -789,11 +789,21 @@ AT_SETUP([CTABLES one scale variable]) AT_CHECK([ln $top_srcdir/examples/nhtsa.sav . || cp $top_srcdir/examples/nhtsa.sav .]) AT_DATA([ctables.sps], [[GET 'nhtsa.sav'. +DESCRIPTIVES qnd1. CTABLES /TABLE qnd1[COUNT, VALIDN, TOTALN, MEAN, STDDEV, MINIMUM, MAXIMUM]. CTABLES /TABLE BY qnd1. CTABLES /TABLE BY BY qnd1. ]]) AT_CHECK([pspp ctables.sps -O box=unicode -O width=80], [0], [dnl + Descriptive Statistics +╭──────────────────────────┬────┬─────┬───────┬───────┬───────────╮ +│ │ N │ Mean│Std Dev│Minimum│ Maximum │ +├──────────────────────────┼────┼─────┼───────┼───────┼───────────┤ +│D1. AGE: What is your age?│6930│48.26│ 19.01│ 16│86 or older│ +│Valid N (listwise) │6999│ │ │ │ │ +│Missing N (listwise) │ 69│ │ │ │ │ +╰──────────────────────────┴────┴─────┴───────┴───────┴───────────╯ + Custom Tables ╭──────────────────────┬─────┬───────┬───────┬────┬────────────┬───────┬───────╮ │ │ │ │ │ │ Std │ │ │ -- 2.30.2