From bbcb12729a32b7fa0298b188ac13d1743ee8999c Mon Sep 17 00:00:00 2001
From: Ben Pfaff
Date: Sun, 3 Jul 2022 09:06:47 -0700
Subject: [PATCH] string ranges

Split CCT_RANGE into CCT_NRANGE (numeric range) and CCT_SRANGE (string
range) so that an explicit category list can contain string ranges:
'a' THRU 'm', LO THRU 'm', and 'a' THRU HI.  The open end of a string
range (LO or HI) is represented by a substring whose 'string' member
is NULL; nullable_substring_equal() and in_string_range() honor that
convention.

When parse_strings is set, each non-NULL endpoint of a string range is
parsed with data_in() in the common format and the category is
converted to CCT_NRANGE; otherwise a string range is rejected if any
of the variables is numeric, the same as a single string category.

The string-to-number parsing and the "string variables only" check are
factored out into parse_category_string() and all_strings() so that
CCT_STRING and CCT_SRANGE share them.

This also fixes "LO THRU <number>", which returned failure whenever a
number was present because the lex_force_num() test was not negated,
and removes a debugging printf() from ctables_categories_match().
---
Revised in review: dropped a debugging printf() from the new
parse_category_string(), freed the low endpoint when the high endpoint
of a string range fails to parse, and fixed capitalization in the test
to-do note.  Hunk headers and the diffstat were updated to match; the
blob IDs on the "index" lines are those of the original patch.

 src/language/stats/ctables.c    | 269 +++++++++++++++++++++++---------
 tests/language/stats/ctables.at |   5 +-
 2 files changed, 203 insertions(+), 71 deletions(-)

diff --git a/src/language/stats/ctables.c b/src/language/stats/ctables.c
index 4b6cb04900..497969a057 100644
--- a/src/language/stats/ctables.c
+++ b/src/language/stats/ctables.c
@@ -465,7 +465,8 @@ struct ctables_category
         /* Explicit category lists. */
         CCT_NUMBER,
         CCT_STRING,
-        CCT_RANGE,
+        CCT_NRANGE,             /* Numerical range. */
+        CCT_SRANGE,             /* String range. */
         CCT_MISSING,
         CCT_OTHERNM,
         CCT_POSTCOMPUTE,
@@ -490,9 +491,10 @@ struct ctables_category
 
     union
       {
-        double number;          /* CCT_NUMBER. */
+        double number;           /* CCT_NUMBER. */
         struct substring string; /* CCT_STRING, in dictionary encoding. */
-        double range[2];        /* CCT_RANGE. */
+        double nrange[2];        /* CCT_NRANGE. */
+        struct substring srange[2]; /* CCT_SRANGE. */
 
         struct
           {
@@ -529,7 +531,7 @@ ctables_category_uninit (struct ctables_category *cat)
   switch (cat->type)
     {
     case CCT_NUMBER:
-    case CCT_RANGE:
+    case CCT_NRANGE:
     case CCT_MISSING:
     case CCT_OTHERNM:
     case CCT_POSTCOMPUTE:
@@ -539,6 +541,11 @@ ctables_category_uninit (struct ctables_category *cat)
       ss_dealloc (&cat->string);
       break;
 
+    case CCT_SRANGE:
+      ss_dealloc (&cat->srange[0]);
+      ss_dealloc (&cat->srange[1]);
+      break;
+
     case CCT_SUBTOTAL:
     case CCT_TOTAL:
       free (cat->total_label);
@@ -554,6 +561,13 @@ ctables_category_uninit (struct ctables_category *cat)
     }
 }
 
+static bool
+nullable_substring_equal (const struct substring *a,
+                          const struct substring *b)
+{
+  return !a->string ? !b->string : b->string && ss_equals (*a, *b);
+}
+
 static bool
 ctables_category_equal (const struct ctables_category *a,
                         const struct ctables_category *b)
@@ -569,8 +583,12 @@ ctables_category_equal (const struct ctables_category *a,
     case CCT_STRING:
       return ss_equals (a->string, b->string);
 
-    case CCT_RANGE:
-      return a->range[0] == b->range[0] && a->range[1] == b->range[1];
+    case CCT_NRANGE:
+      return a->nrange[0] == b->nrange[0] && a->nrange[1] == b->nrange[1];
+
+    case CCT_SRANGE:
+      return (nullable_substring_equal (&a->srange[0], &b->srange[0])
+              && nullable_substring_equal (&a->srange[1], &b->srange[1]));
 
     case CCT_MISSING:
     case CCT_OTHERNM:
@@ -1406,11 +1424,20 @@ ctables_destroy (struct ctables *ct)
 }
 
 static struct ctables_category
-cct_range (double low, double high)
+cct_nrange (double low, double high)
 {
   return (struct ctables_category) {
-    .type = CCT_RANGE,
-    .range = { low, high }
+    .type = CCT_NRANGE,
+    .nrange = { low, high }
+  };
+}
+
+static struct ctables_category
+cct_srange (struct substring low, struct substring high)
+{
+  return (struct ctables_category) {
+    .type = CCT_SRANGE,
+    .srange = { low, high }
   };
 }
 
@@ -1438,6 +1465,16 @@ ctables_table_parse_subtotal (struct lexer *lexer, bool hide_subcategories,
   return true;
 }
 
+static struct substring
+parse_substring (struct lexer *lexer, struct dictionary *dict)
+{
+  struct substring s = recode_substring_pool (
+    dict_get_encoding (dict), "UTF-8", lex_tokss (lexer), NULL);
+  ss_rtrim (&s, ss_cstr (" "));
+  lex_get (lexer);
+  return s;
+}
+
 static bool
 ctables_table_parse_explicit_category (struct lexer *lexer,
                                        struct dictionary *dict,
@@ -1454,10 +1491,21 @@ ctables_table_parse_explicit_category (struct lexer *lexer,
     return ctables_table_parse_subtotal (lexer, true, cat);
   else if (lex_match_id (lexer, "LO"))
     {
-      if (!lex_force_match_id (lexer, "THRU") || lex_force_num (lexer))
+      if (!lex_force_match_id (lexer, "THRU"))
+        return false;
+      if (lex_is_string (lexer))
+        {
+          struct substring sr0 = { .string = NULL };
+          struct substring sr1 = parse_substring (lexer, dict);
+          *cat = cct_srange (sr0, sr1);
+        }
+      else if (lex_force_num (lexer))
+        {
+          *cat = cct_nrange (-DBL_MAX, lex_number (lexer));
+          lex_get (lexer);
+        }
+      else
         return false;
-      *cat = cct_range (-DBL_MAX, lex_number (lexer));
-      lex_get (lexer);
     }
   else if (lex_is_number (lexer))
     {
@@ -1466,12 +1514,12 @@ ctables_table_parse_explicit_category (struct lexer *lexer,
       if (lex_match_id (lexer, "THRU"))
         {
           if (lex_match_id (lexer, "HI"))
-            *cat = cct_range (number, DBL_MAX);
+            *cat = cct_nrange (number, DBL_MAX);
           else
             {
               if (!lex_force_num (lexer))
                 return false;
-              *cat = cct_range (number, lex_number (lexer));
+              *cat = cct_nrange (number, lex_number (lexer));
               lex_get (lexer);
             }
         }
@@ -1483,12 +1531,27 @@ ctables_table_parse_explicit_category (struct lexer *lexer,
     }
   else if (lex_is_string (lexer))
     {
-      struct substring s = recode_substring_pool (
-        dict_get_encoding (dict), "UTF-8", lex_tokss (lexer), NULL);
-      ss_rtrim (&s, ss_cstr (" "));
-
-      *cat = (struct ctables_category) { .type = CCT_STRING, .string = s };
-      lex_get (lexer);
+      struct substring s = parse_substring (lexer, dict);
+      if (lex_match_id (lexer, "THRU"))
+        {
+          if (lex_match_id (lexer, "HI"))
+            {
+              struct substring sr1 = { .string = NULL };
+              *cat = cct_srange (s, sr1);
+            }
+          else
+            {
+              if (!lex_force_string (lexer))
+                {
+                  ss_dealloc (&s);
+                  return false;
+                }
+              struct substring sr1 = parse_substring (lexer, dict);
+              *cat = cct_srange (s, sr1);
+            }
+        }
+      else
+        *cat = (struct ctables_category) { .type = CCT_STRING, .string = s };
     }
   else if (lex_match (lexer, T_AND))
     {
@@ -1539,9 +1602,9 @@ ctables_find_category_for_postcompute (const struct ctables_categories *cats,
           break;
 
         case CTPO_CAT_RANGE:
-          if (cat->type == CCT_RANGE
-              && cat->range[0] == e->range[0]
-              && cat->range[1] == e->range[1])
+          if (cat->type == CCT_NRANGE
+              && cat->nrange[0] == e->range[0]
+              && cat->nrange[1] == e->range[1])
             best = cat;
           break;
 
@@ -1665,6 +1728,44 @@ ctables_recursive_check_postcompute (const struct ctables_pcexpr *e,
     }
 }
 
+static bool
+parse_category_string (const struct ctables_category *cat,
+                       struct substring s, struct dictionary *dict,
+                       enum fmt_type format, double *n)
+{
+  union value v;
+  char *error = data_in (s, dict_get_encoding (dict), format,
+                         settings_get_fmt_settings (), &v, 0, NULL);
+  if (error)
+    {
+      msg_at (SE, cat->location,
+              _("Failed to parse category specification as format %s: %s."),
+              fmt_name (format), error);
+      free (error);
+      return false;
+    }
+
+  *n = v.f;
+  return true;
+}
+
+static bool
+all_strings (struct variable **vars, size_t n_vars,
+             const struct ctables_category *cat)
+{
+  for (size_t j = 0; j < n_vars; j++)
+    if (var_is_numeric (vars[j]))
+      {
+        msg_at (SE, cat->location,
+                _("This category specification may be applied only to string "
+                  "variables, but this subcommand tries to apply it to "
+                  "numeric variable %s."),
+                var_get_name (vars[j]));
+        return false;
+      }
+  return true;
+}
+
 static bool
 ctables_table_parse_categories (struct lexer *lexer, struct dictionary *dict,
                                 struct ctables *ct, struct ctables_table *t)
@@ -1737,7 +1838,7 @@ ctables_table_parse_categories (struct lexer *lexer, struct dictionary *dict,
               break;
 
             case CCT_NUMBER:
-            case CCT_RANGE:
+            case CCT_NRANGE:
               for (size_t j = 0; j < n_vars; j++)
                 if (var_is_alpha (vars[j]))
                   {
@@ -1754,40 +1855,46 @@ ctables_table_parse_categories (struct lexer *lexer, struct dictionary *dict,
             case CCT_STRING:
               if (parse_strings)
                 {
-                  union value v;
-                  char *error = data_in (cat->string, dict_get_encoding (dict),
-                                         common_format->type,
-                                         settings_get_fmt_settings (),
-                                         &v, 0, NULL);
-                  if (error)
-                    {
-                      msg_at (SE, cat->location,
-                              _("Failed to parse category specification as "
-                                "format %s: %s."),
-                              fmt_name (common_format->type), error);
-                      free (error);
-                      return false;
-                    }
+                  double n;
+                  if (!parse_category_string (cat, cat->string, dict,
+                                              common_format->type, &n))
+                    return false;
 
                   ss_dealloc (&cat->string);
 
                   cat->type = CCT_NUMBER;
-                  cat->number = v.f;
+                  cat->number = n;
                 }
-              else
+              else if (!all_strings (vars, n_vars, cat))
+                return false;
+              break;
+
+            case CCT_SRANGE:
+              if (parse_strings)
                 {
-                  for (size_t j = 0; j < n_vars; j++)
-                    if (var_is_numeric (vars[j]))
-                      {
-                        msg_at (SE, cat->location,
-                                _("This category specification may be applied "
-                                  "only to string variables, but this "
-                                  "subcommand tries to apply it to numeric "
-                                  "variable %s."),
-                                var_get_name (vars[j]));
-                        return false;
-                      }
+                  double n[2];
+
+                  if (!cat->srange[0].string)
+                    n[0] = -DBL_MAX;
+                  else if (!parse_category_string (cat, cat->srange[0], dict,
+                                                   common_format->type, &n[0]))
+                    return false;
+
+                  if (!cat->srange[1].string)
+                    n[1] = DBL_MAX;
+                  else if (!parse_category_string (cat, cat->srange[1], dict,
+                                                   common_format->type, &n[1]))
+                    return false;
+
+                  ss_dealloc (&cat->srange[0]);
+                  ss_dealloc (&cat->srange[1]);
+
+                  cat->type = CCT_NRANGE;
+                  cat->nrange[0] = n[0];
+                  cat->nrange[1] = n[1];
                 }
+              else if (!all_strings (vars, n_vars, cat))
+                return false;
               break;
 
             case CCT_MISSING:
@@ -1968,7 +2075,8 @@ ctables_table_parse_categories (struct lexer *lexer, struct dictionary *dict,
         {
         case CCT_NUMBER:
         case CCT_STRING:
-        case CCT_RANGE:
+        case CCT_NRANGE:
+        case CCT_SRANGE:
         case CCT_MISSING:
         case CCT_OTHERNM:
           cat->subtotal = subtotal;
@@ -2687,7 +2795,8 @@ ctables_cell_compare_3way (const void *a_, const void *b_, const void *aux_)
             /* Must be equal. */
             continue;
 
-          case CCT_RANGE:
+          case CCT_NRANGE:
+          case CCT_SRANGE:
           case CCT_MISSING:
           case CCT_OTHERNM:
             {
@@ -2796,6 +2905,24 @@ ctables_domain_insert (struct ctables_section *s, struct ctables_cell *cell,
   return d;
 }
 
+static struct substring
+rtrim_value (const union value *v, const struct variable *var)
+{
+  struct substring s = ss_buffer (CHAR_CAST (char *, v->s),
+                                  var_get_width (var));
+  ss_rtrim (&s, ss_cstr (" "));
+  return s;
+}
+
+static bool
+in_string_range (const union value *v, const struct variable *var,
+                 const struct substring *srange)
+{
+  struct substring s = rtrim_value (v, var);
+  return ((!srange[0].string || ss_compare (s, srange[0]) >= 0)
+          && (!srange[1].string || ss_compare (s, srange[1]) <= 0));
+}
+
 static const struct ctables_category *
 ctables_categories_match (const struct ctables_categories *c,
                           const union value *v, const struct variable *var)
@@ -2815,22 +2942,18 @@ ctables_categories_match (const struct ctables_categories *c,
           break;
 
         case CCT_STRING:
-          {
-            struct substring s = ss_buffer (CHAR_CAST (char *, v->s),
-                                            var_get_width (var));
-            ss_rtrim (&s, ss_cstr (" "));
-            printf ("%d '%.*s' ?=? '%.*s'\n",
-                    var_get_width (var),
-                    (int) cat->string.length, cat->string.string,
-                    (int) s.length, s.string);
-            if (ss_equals (cat->string, s))
-              return cat;
-          }
+          if (ss_equals (cat->string, rtrim_value (v, var)))
+            return cat;
+          break;
+
+        case CCT_NRANGE:
+          if ((cat->nrange[0] == -DBL_MAX || v->f >= cat->nrange[0])
+              && (cat->nrange[1] == DBL_MAX || v->f <= cat->nrange[1]))
+            return cat;
           break;
 
-        case CCT_RANGE:
-          if ((cat->range[0] == -DBL_MAX || v->f >= cat->range[0])
-              && (cat->range[1] == DBL_MAX || v->f <= cat->range[1]))
+        case CCT_SRANGE:
+          if (in_string_range (v, var, cat->srange))
             return cat;
           break;
 
@@ -4258,11 +4381,19 @@ ctables_add_category_occurrences (const struct variable *var,
             }
           break;
 
-        case CCT_RANGE:
+        case CCT_NRANGE:
           assert (var_is_numeric (var));
           for (const struct val_lab *vl = val_labs_first (val_labs); vl;
                vl = val_labs_next (val_labs, vl))
-            if (vl->value.f >= c->range[0] && vl->value.f <= c->range[1])
+            if (vl->value.f >= c->nrange[0] && vl->value.f <= c->nrange[1])
+              ctables_add_occurrence (var, &vl->value, occurrences);
+          break;
+
+        case CCT_SRANGE:
+          assert (var_is_alpha (var));
+          for (const struct val_lab *vl = val_labs_first (val_labs); vl;
+               vl = val_labs_next (val_labs, vl))
+            if (in_string_range (&vl->value, var, c->srange))
               ctables_add_occurrence (var, &vl->value, occurrences);
           break;
 
diff --git a/tests/language/stats/ctables.at b/tests/language/stats/ctables.at
index d9dfa86c29..0923769b0b 100644
--- a/tests/language/stats/ctables.at
+++ b/tests/language/stats/ctables.at
@@ -23,7 +23,6 @@ dnl   * strings
 dnl - PPROPERTIES:
 dnl   * )LABEL[N].
 dnl   * summary statistics and formats?
-dnl - Are string ranges a thing?
 dnl
 dnl Features not yet tested:
 dnl - Parsing (positive and negative)
@@ -33,10 +32,12 @@ dnl - test CLABELS ROWLABELS=LAYER.
 dnl - Test VLABELS.
 dnl - Test WEIGHT and adjustment weights.
 dnl - Test PCOMPUTE and PPROPERTIES.
+dnl - EMPTY=INCLUDE for string ranges.
 dnl - Summary functions:
 dnl   * Separate summary functions for totals and subtotals.
 dnl - CATEGORIES:
-dnl   * THRU
+dnl   * THRU (numeric ranges)
+dnl   * THRU (string ranges)
 dnl   * OTHERNM
 dnl - FORMAT:
 dnl   * MINCOLWIDTH, MAXCOLWIDTH, UNITS.
-- 
2.30.2