From 57c1048c20829711ddcfe363e2d21812a450a522 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 20 Jun 2020 21:52:40 +0000 Subject: [PATCH] sys-file-reader: Accept bad variable indexes, with warnings. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The SPSS file writer software at https://packagist.org/packages/tiamo/spss sometimes generates malformed variable indexes for weighting variables or value label records. PSPP until now has rejected these files entirely. This commit should make it issue a warning instead. Thanks to István Ignácz for reporting this issue at https://lists.gnu.org/archive/html/pspp-users/2020-06/msg00012.html --- src/data/sys-file-reader.c | 206 ++++++++++++++++++++-------------- tests/data/sys-file-reader.at | 92 +++++++++++++-- 2 files changed, 206 insertions(+), 92 deletions(-) diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index 155b91373c..c0bdf0ba61 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -234,10 +234,6 @@ sfm_reader_cast (const struct any_reader *r_) static bool sfm_close (struct any_reader *); -static struct variable *lookup_var_by_index (struct sfm_reader *, off_t, - const struct sfm_var_record *, - size_t n, int idx); - static void sys_msg (struct sfm_reader *r, off_t, int class, const char *format, va_list args) PRINTF_FORMAT (4, 0); @@ -358,10 +354,10 @@ static void parse_long_var_name_map (struct sfm_reader *, static bool parse_long_string_map (struct sfm_reader *, const struct sfm_extension_record *, struct dictionary *); -static bool parse_value_labels (struct sfm_reader *, struct dictionary *, - const struct sfm_var_record *, - size_t n_var_recs, - const struct sfm_value_label_record *); +static void parse_value_labels (struct sfm_reader *, struct dictionary *); +static struct variable *parse_weight_var (struct sfm_reader *, + const struct sfm_var_record *, size_t n_var_recs, + int idx); static void parse_data_file_attributes (struct sfm_reader *, const struct sfm_extension_record *, struct dictionary *); @@ -761,7 +757,6 @@ sfm_decode (struct any_reader *r_, const char *encoding, { struct sfm_reader *r = sfm_reader_cast (r_); struct dictionary *dict; - size_t i; if (encoding == NULL) { @@ -808,25 +803,10 @@ sfm_decode (struct any_reader *r_, const char *encoding, /* Parse value labels and the weight variable immediately after the variable records. These records use indexes into var_recs[], so we must parse them before those indexes become invalidated by very long string variables. */ - for (i = 0; i < r->n_labels; i++) - if (!parse_value_labels (r, dict, r->vars, r->n_vars, &r->labels[i])) - goto error; + parse_value_labels (r, dict); if (r->header.weight_idx != 0) - { - struct variable *weight_var; - - weight_var = lookup_var_by_index (r, 76, r->vars, r->n_vars, - r->header.weight_idx); - if (weight_var != NULL) - { - if (var_is_numeric (weight_var)) - dict_set_weight (dict, weight_var); - else - sys_warn (r, -1, _("Ignoring string variable `%s' set " - "as weighting variable."), - var_get_name (weight_var)); - } - } + dict_set_weight (dict, parse_weight_var (r, r->vars, r->n_vars, + r->header.weight_idx)); if (r->extensions[EXT_DISPLAY] != NULL) parse_display_parameters (r, r->extensions[EXT_DISPLAY], dict); @@ -2150,61 +2130,99 @@ parse_long_string_map (struct sfm_reader *r, return true; } -static bool -parse_value_labels (struct sfm_reader *r, struct dictionary *dict, - const struct sfm_var_record *var_recs, size_t n_var_recs, - const struct sfm_value_label_record *record) +#define MAX_LABEL_WARNINGS 5 + +/* Displays a warning for offset OFFSET in the file. */ +static void +value_label_warning (struct sfm_reader *r, off_t offset, int *n_label_warnings, + const char *format, ...) { - struct variable **vars; - char **utf8_labels; - size_t i; + if (++*n_label_warnings > MAX_LABEL_WARNINGS) + return; - utf8_labels = pool_nmalloc (r->pool, record->n_labels, sizeof *utf8_labels); - for (i = 0; i < record->n_labels; i++) + va_list args; + + va_start (args, format); + sys_msg (r, offset, MW, format, args); + va_end (args); +} + +#define MAX_LABEL_WARNINGS 5 + +static void +parse_one_value_label_set (struct sfm_reader *r, struct dictionary *dict, + const struct sfm_var_record *var_recs, + size_t n_var_recs, + const struct sfm_value_label_record *record, + int *n_label_warnings) +{ + char **utf8_labels + = pool_nmalloc (r->pool, record->n_labels, sizeof *utf8_labels); + for (size_t i = 0; i < record->n_labels; i++) utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict), record->labels[i].label, -1, r->pool); - vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars); - for (i = 0; i < record->n_vars; i++) + struct variable **vars = pool_nmalloc (r->pool, + record->n_vars, sizeof *vars); + unsigned int n_vars = 0; + for (size_t i = 0; i < record->n_vars; i++) { - vars[i] = lookup_var_by_index (r, record->pos, - var_recs, n_var_recs, record->vars[i]); - if (vars[i] == NULL) - return false; + int idx = record->vars[i]; + if (idx < 1 || idx > n_var_recs) + { + value_label_warning ( + r, record->pos, n_label_warnings, + _("Value label variable index %d not in valid range 1...%zu."), + idx, n_var_recs); + continue; + } + + const struct sfm_var_record *rec = &var_recs[idx - 1]; + if (rec->var == NULL) + { + value_label_warning ( + r, record->pos, n_label_warnings, + _("Value label variable index %d " + "refers to long string continuation."), idx); + continue; + } + + vars[n_vars++] = rec->var; } + if (!n_vars) + return; - for (i = 1; i < record->n_vars; i++) + for (size_t i = 1; i < n_vars; i++) if (var_get_type (vars[i]) != var_get_type (vars[0])) { - sys_error (r, record->pos, - _("Variables associated with value label are not all of " - "identical type. Variable %s is %s, but variable " - "%s is %s."), - var_get_name (vars[0]), - var_is_numeric (vars[0]) ? _("numeric") : _("string"), - var_get_name (vars[i]), - var_is_numeric (vars[i]) ? _("numeric") : _("string")); - return false; + value_label_warning ( + r, record->pos, n_label_warnings, + _("Variables associated with value label are not all of " + "identical type. Variable %s is %s, but variable " + "%s is %s."), + var_get_name (vars[0]), + var_is_numeric (vars[0]) ? _("numeric") : _("string"), + var_get_name (vars[i]), + var_is_numeric (vars[i]) ? _("numeric") : _("string")); + return; } - for (i = 0; i < record->n_vars; i++) + for (size_t i = 0; i < n_vars; i++) { struct variable *var = vars[i]; - int width; - size_t j; - - width = var_get_width (var); + int width = var_get_width (var); if (width > 8) { - sys_error (r, record->pos, - _("Value labels may not be added to long string " - "variables (e.g. %s) using records types 3 and 4."), - var_get_name (var)); - return false; + value_label_warning ( + r, record->pos, n_label_warnings, + _("Value labels may not be added to long string " + "variables (e.g. %s) using records types 3 and 4."), + var_get_name (var)); + continue; } - for (j = 0; j < record->n_labels; j++) + for (size_t j = 0; j < record->n_labels; j++) { struct sfm_value_label *label = &record->labels[j]; union value value; @@ -2226,13 +2244,14 @@ parse_value_labels (struct sfm_reader *r, struct dictionary *dict, string variable with width 3. */ } else if (var_is_numeric (var)) - sys_warn (r, record->pos, - _("Duplicate value label for %g on %s."), - value.f, var_get_name (var)); + value_label_warning (r, record->pos, n_label_warnings, + _("Duplicate value label for %g on %s."), + value.f, var_get_name (var)); else - sys_warn (r, record->pos, - _("Duplicate value label for `%.*s' on %s."), - width, value.s, var_get_name (var)); + value_label_warning ( + r, record->pos, n_label_warnings, + _("Duplicate value label for `%.*s' on %s."), + width, value.s, var_get_name (var)); } value_destroy (&value, width); @@ -2240,38 +2259,59 @@ parse_value_labels (struct sfm_reader *r, struct dictionary *dict, } pool_free (r->pool, vars); - for (i = 0; i < record->n_labels; i++) + for (size_t i = 0; i < record->n_labels; i++) pool_free (r->pool, utf8_labels[i]); pool_free (r->pool, utf8_labels); +} - return true; +static void +parse_value_labels (struct sfm_reader *r, struct dictionary *dict) +{ + int n_label_warnings = 0; + for (size_t i = 0; i < r->n_labels; i++) + parse_one_value_label_set (r, dict, r->vars, r->n_vars, &r->labels[i], + &n_label_warnings); + if (n_label_warnings > MAX_LABEL_WARNINGS) + sys_warn (r, -1, + _("Suppressed %d additional warnings for value labels."), + n_label_warnings - MAX_LABEL_WARNINGS); } static struct variable * -lookup_var_by_index (struct sfm_reader *r, off_t offset, - const struct sfm_var_record *var_recs, size_t n_var_recs, - int idx) +parse_weight_var (struct sfm_reader *r, + const struct sfm_var_record *var_recs, size_t n_var_recs, + int idx) { - const struct sfm_var_record *rec; + off_t offset = 76; /* Offset to variable index in header. */ if (idx < 1 || idx > n_var_recs) { - sys_error (r, offset, - _("Variable index %d not in valid range 1...%zu."), - idx, n_var_recs); + sys_warn (r, offset, + _("Weight variable index %d not in valid range 1...%zu. " + "Treating file as unweighted."), + idx, n_var_recs); return NULL; } - rec = &var_recs[idx - 1]; + const struct sfm_var_record *rec = &var_recs[idx - 1]; if (rec->var == NULL) { - sys_error (r, offset, - _("Variable index %d refers to long string continuation."), - idx); + sys_warn (r, offset, + _("Weight variable index %d refers to long string " + "continuation. Treating file as unweighted."), idx); + return NULL; + } + + struct variable *weight_var = rec->var; + if (!var_is_numeric (weight_var)) + { + sys_warn (r, offset, _("Ignoring string variable `%s' set " + "as weighting variable."), + var_get_name (weight_var)); return NULL; } - return rec->var; + return weight_var; } /* Parses a set of custom attributes from TEXT into ATTRS. diff --git a/tests/data/sys-file-reader.at b/tests/data/sys-file-reader.at index 0e86db68a8..d257c8ad26 100644 --- a/tests/data/sys-file-reader.at +++ b/tests/data/sys-file-reader.at @@ -1999,7 +1999,7 @@ for variant in be le; do DISPLAY DICTIONARY. ]) AT_CHECK([pspp -O format=csv sys-file.sps], [0], - [warning: `sys-file.sav': Ignoring string variable `STR1' set as weighting variable. + [warning: `sys-file.sav' near offset 0x4c: Ignoring string variable `STR1' set as weighting variable. Table: Variables Name,Position,Label,Measurement Level,Role,Width,Alignment,Print Format,Write Format,Missing Values @@ -2032,8 +2032,8 @@ for variant in be le; do AT_CHECK([sack --$variant sys-file.sack > sys-file.sav]) AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'. ]) - AT_CHECK([pspp -O format=csv sys-file.sps], [1], - [error: `sys-file.sav' near offset 0x4c: Variable index 3 not in valid range 1...2. + AT_CHECK([pspp -O format=csv sys-file.sps], 0, + [warning: `sys-file.sav' near offset 0x4c: Weight variable index 3 not in valid range 1...2. Treating file as unweighted. ]) done AT_CLEANUP @@ -2062,8 +2062,8 @@ for variant in be le; do AT_CHECK([sack --$variant sys-file.sack > sys-file.sav]) AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'. ]) - AT_CHECK([pspp -O format=csv sys-file.sps], [1], - [error: `sys-file.sav' near offset 0x4c: Variable index 3 refers to long string continuation. + AT_CHECK([pspp -O format=csv sys-file.sps], 0, + [warning: `sys-file.sav' near offset 0x4c: Weight variable index 3 refers to long string continuation. Treating file as unweighted. ]) done AT_CLEANUP @@ -3171,8 +3171,82 @@ for variant in be le; do AT_DATA([sys-file.sps], [dnl GET FILE='sys-file.sav'. ]) - AT_CHECK([pspp -O format=csv sys-file.sps], [1], [dnl -error: `sys-file.sav' near offset 0xf4: Value labels may not be added to long string variables (e.g. STR1) using records types 3 and 4. + AT_CHECK([pspp -O format=csv sys-file.sps], 0, [dnl +warning: `sys-file.sav' near offset 0xf4: Value labels may not be added to long string variables (e.g. STR1) using records types 3 and 4. +]) +done +AT_CLEANUP + +AT_SETUP([value label variable indexes must be in correct range]) +AT_KEYWORDS([sack synthetic system file negative]) +AT_DATA([sys-file.sack], [dnl +dnl File header. +"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; +2; 2; 1; 0; -1; 100.0; "01 Jan 11"; "20:53:52"; s64 ""; i8 0 *3; + +dnl Variables. +2; 6; 0; 0; 0x010600 *2; s8 "STR1"; +2; 0; 0; 0; 0x050800 *2; s8 "NUM1"; + +dnl Value labels with bad variable indexes. +3; 1; s8 "xyzzy"; i8 3; s7 "one"; 4; 2; >>3; 4;<< +3; 1; s8 "xyzzy"; i8 3; s7 "one"; 4; 2; >>5; 6;<< +3; 1; s8 "xyzzy"; i8 3; s7 "one"; 4; 2; >>7; 8;<< + +dnl Character encoding record. +7; 20; 1; 12; "windows-1252"; + +dnl End of dictionary. +999; 0; +]) +for variant in be le; do + AT_CHECK([sack --$variant sys-file.sack > sys-file.sav]) + AT_DATA([sys-file.sps], [dnl +GET FILE='sys-file.sav'. +]) + AT_CHECK([pspp -O format=csv sys-file.sps], 0, [dnl +warning: `sys-file.sav' near offset 0xf4: Value label variable index 3 not in valid range 1...2. + +warning: `sys-file.sav' near offset 0xf4: Value label variable index 4 not in valid range 1...2. + +warning: `sys-file.sav' near offset 0x11c: Value label variable index 5 not in valid range 1...2. + +warning: `sys-file.sav' near offset 0x11c: Value label variable index 6 not in valid range 1...2. + +warning: `sys-file.sav' near offset 0x144: Value label variable index 7 not in valid range 1...2. + +warning: `sys-file.sav': Suppressed 1 additional warnings for value labels. +]) +done +AT_CLEANUP + +AT_SETUP([value label variable indexes must not be long string continuation]) +AT_KEYWORDS([sack synthetic system file negative]) +AT_DATA([sys-file.sack], [dnl +dnl File header. +"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; +2; 2; 1; 0; -1; 100.0; "01 Jan 11"; "20:53:52"; s64 ""; i8 0 *3; + +dnl Long string variable. +2; 9; 0; 0; 0x010900 *2; s8 "STR1"; +(2; -1; 0; 0; 0; 0; s8 ""); + +dnl Value label with long string indexes. +3; 1; s8 "xyzzy"; i8 3; s7 "one"; 4; 1; >>2;<< + +dnl Character encoding record. +7; 20; 1; 12; "windows-1252"; + +dnl End of dictionary. +999; 0; +]) +for variant in be le; do + AT_CHECK([sack --$variant sys-file.sack > sys-file.sav]) + AT_DATA([sys-file.sps], [dnl +GET FILE='sys-file.sav'. +]) + AT_CHECK([pspp -O format=csv sys-file.sps], 0, [dnl +warning: `sys-file.sav' near offset 0xf4: Value label variable index 2 refers to long string continuation. ]) done AT_CLEANUP @@ -3202,8 +3276,8 @@ for variant in be le; do AT_DATA([sys-file.sps], [dnl GET FILE='sys-file.sav'. ]) - AT_CHECK([pspp -O format=csv sys-file.sps], [1], [dnl -"error: `sys-file.sav' near offset 0xf4: Variables associated with value label are not all of identical type. Variable STR1 is string, but variable NUM1 is numeric." + AT_CHECK([pspp -O format=csv sys-file.sps], 0, [dnl +"warning: `sys-file.sav' near offset 0xf4: Variables associated with value label are not all of identical type. Variable STR1 is string, but variable NUM1 is numeric." ]) done AT_CLEANUP -- 2.30.2