From: Ben Pfaff Date: Mon, 20 Feb 2023 18:44:38 +0000 (-0800) Subject: Fix understanding of how long string variable missing values work. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=76357f21c7391646acd8017d1c3e1386234f2e79;p=pspp Fix understanding of how long string variable missing values work. Thanks to Andreas Hammer for reporting this bug at https://lists.gnu.org/archive/html/pspp-users/2023-02/msg00000.html. The data file that triggered this bug is ESS10SC.sav, available from https://ess-search.nsd.no/en/study/172ac431-2a06-41df-9dab-c1fd8f3877e7 --- diff --git a/doc/dev/system-file-format.texi b/doc/dev/system-file-format.texi index b8670681d9..c8b37a584e 100644 --- a/doc/dev/system-file-format.texi +++ b/doc/dev/system-file-format.texi @@ -1315,7 +1315,8 @@ int32 count; int32 var_name_len; char var_name[]; char n_missing_values; -long_string_missing_value values[]; +int32 value_len; +char values[values_len * n_missing_values]; @end example @table @code @@ -1342,30 +1343,25 @@ any particular boundary, nor is it null-terminated. The number of missing values, either 1, 2, or 3. (This is, unusually, a single byte instead of a 32-bit number.) -@item long_string_missing_value values[]; -The missing values themselves. This array contains exactly -@code{n_missing_values} elements, each of which has the following -substructure: - -@example -int32 value_len; -char value[]; -@end example - -@table @code @item int32 value_len; -The length of the missing value string, in bytes. This value should +The length of each missing value string, in bytes. This value should be 8, because long string variables are at least 8 bytes wide (by definition), only the first 8 bytes of a long string variable's missing values are allowed to be non-spaces, and any spaces within the first 8 bytes are included in the missing value here. -@item char value[]; -The missing value string, exactly @code{value_len} bytes, without -any padding or null terminator. -@end table +@item char values[values_len * n_missing_values] +The missing values themselves, without any padding or null +terminators. @end table +An earlier version of this document stated that @code{value_len} was +repeated before each of the missing values, so that there was an extra +@code{int32} value of 8 before each missing value after the first. +Old versions of PSPP wrote data files in this format. Readers can +tolerate this mistake, if they wish, by noticing and skipping the +extra @code{int32} values, which wouldn't ordinarily occur in strings. + @node Data File and Variable Attributes Records @section Data File and Variable Attributes Records diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index 88c0994dc3..17e2754cc1 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -2477,21 +2477,28 @@ assign_variable_roles (struct sfm_reader *r, struct dictionary *dict) } static bool -check_overflow (struct sfm_reader *r, - const struct sfm_extension_record *record, - size_t ofs, size_t length) +check_overflow__ (const struct sfm_extension_record *record, + size_t ofs, size_t length) { size_t end = record->size * record->count; if (length >= end || ofs + length > end) - { - sys_warn (r, record->pos + end, - _("Extension record subtype %d ends unexpectedly."), - record->subtype); - return false; - } + return false; return true; } +static bool +check_overflow (struct sfm_reader *r, + const struct sfm_extension_record *record, + size_t ofs, size_t length) +{ + bool ok = check_overflow__ (record, ofs, length); + if (!ok) + sys_warn (r, record->pos + record->size * record->count, + _("Extension record subtype %d ends unexpectedly."), + record->subtype); + return ok; +} + static void parse_long_string_value_labels (struct sfm_reader *r, const struct sfm_extension_record *record, @@ -2618,6 +2625,7 @@ parse_long_string_missing_values (struct sfm_reader *r, size_t end = record->size * record->count; size_t ofs = 0; + bool warned = false; while (ofs < end) { struct missing_values mv; @@ -2666,17 +2674,32 @@ parse_long_string_missing_values (struct sfm_reader *r, var = NULL; } + /* Parse value length. */ + if (!check_overflow (r, record, ofs, 4)) + return; + size_t value_length = parse_int (r, record->data, ofs); + ofs += 4; + /* Parse values. */ mv_init_pool (r->pool, &mv, var ? var_get_width (var) : 8); for (i = 0; i < n_missing_values; i++) { - size_t value_length; - - /* Parse value length. */ - if (!check_overflow (r, record, ofs, 4)) - return; - value_length = parse_int (r, record->data, ofs); - ofs += 4; + /* Tolerate files written by old, buggy versions of PSPP where we + believed that the value_length was repeated before each missing + value. */ + if (check_overflow__ (record, ofs, value_length) + && parse_int (r, record->data, ofs) == 8) + { + if (!warned) + { + sys_warn (r, record->pos + ofs, + _("This file has corrupted metadata written by a " + "buggy version of PSPP. To fix it, save a new " + "copy of the file.")); + warned = true; + } + ofs += 4; + } /* Parse value. */ if (!check_overflow (r, record, ofs, value_length)) diff --git a/src/data/sys-file-writer.c b/src/data/sys-file-writer.c index c94d13d859..2a6f5680ec 100644 --- a/src/data/sys-file-writer.c +++ b/src/data/sys-file-writer.c @@ -1041,7 +1041,8 @@ write_long_string_missing_values (struct sfm_writer *w, size += 4; size += recode_string_len (encoding, "UTF-8", var_get_name (var), -1); size += 1; - size += mv_n_values (mv) * (4 + 8); + size += 4; + size += mv_n_values (mv) * 8; } if (size == 0) return; @@ -1071,11 +1072,11 @@ write_long_string_missing_values (struct sfm_writer *w, n_missing_values = mv_n_values (mv); write_bytes (w, &n_missing_values, 1); + write_int (w, 8); + for (j = 0; j < n_missing_values; j++) { const union value *value = mv_get_value (mv, j); - - write_int (w, 8); write_bytes (w, value->s, 8); } } diff --git a/tests/data/sys-file-reader.at b/tests/data/sys-file-reader.at index f446631763..542f6978bf 100644 --- a/tests/data/sys-file-reader.at +++ b/tests/data/sys-file-reader.at @@ -119,10 +119,10 @@ dnl One missing value for STR8. COUNT("STR8"); i8 1; 8; "abcdefgh"; dnl Two missing values for STR9. -COUNT("STR9"); i8 2; 8; "abcdefgh"; 8; "01234567"; +COUNT("STR9"); i8 2; 8; "abcdefgh"; "01234567"; dnl Three missing values for STR9. -COUNT("STR10"); i8 3; 8; "abcdefgh"; 8; "01234567"; 8; "0 "; +COUNT("STR10"); i8 3; 8; "abcdefgh"; "01234567"; "0 "; ); dnl Character encoding record. @@ -1921,11 +1921,11 @@ dnl Machine floating-point info record. dnl Long string variable missing values record. 7; 22; 1; COUNT ( dnl Zero missing values (not allowed) for STR1 . -COUNT("STR1"); i8 >>0<<; +COUNT("STR1"); i8 >>0<<; 8; dnl Four missing values (not allowed) for STR2. -COUNT("STR2"); i8 4; -8; "abcdefgh"; 8; "ijklmnop"; 8; "qrstuvwx"; 8; "yz012345"; +COUNT("STR2"); i8 4; 8; +"abcdefgh"; "ijklmnop"; "qrstuvwx"; "yz012345"; dnl Missing values for unknown variable COUNT(>>"Nonexistent"<<); i8 1; 8; "abcdefgh"; @@ -1935,6 +1935,10 @@ COUNT(>>"NUM1"<<); i8 1; 8; "abcdefgh"; dnl Too long missing value COUNT("STR3"); i8 1; >>COUNT("abcdefghijkl")<<; + +dnl Buggy way that this was written in old PSPP, with a length +dnl before each missing value instead of just once. +COUNT("STR3"); i8 2; 8; "ABCDEFGH"; >>8<<; "IJKLMNOP"; ); dnl Character encoding record. @@ -1956,20 +1960,22 @@ DISPLAY DICTIONARY. AT_CHECK([pspp -O format=csv sys-file.sps], [0], ["warning: `sys-file.sav' near offset 0x1f8: Long string missing values record says variable STR1 has 0 missing values, but only 1 to 3 missing values are allowed." -"warning: `sys-file.sav' near offset 0x201: Long string missing values record says variable STR2 has 4 missing values, but only 1 to 3 missing values are allowed." +"warning: `sys-file.sav' near offset 0x205: Long string missing values record says variable STR2 has 4 missing values, but only 1 to 3 missing values are allowed." + +warning: `sys-file.sav' near offset 0x23a: Ignoring long string missing value record for unknown variable Nonexistent. -warning: `sys-file.sav' near offset 0x242: Ignoring long string missing value record for unknown variable Nonexistent. +warning: `sys-file.sav' near offset 0x24f: Ignoring long string missing value record for numeric variable NUM1. -warning: `sys-file.sav' near offset 0x257: Ignoring long string missing value record for numeric variable NUM1. +"warning: `sys-file.sav' near offset 0x268: Ignoring long string missing value 0 for variable str3, with width 11, that has bad value width 12." -"warning: `sys-file.sav' near offset 0x270: Ignoring long string missing value 0 for variable str3, with width 11, that has bad value width 12." +"warning: `sys-file.sav' near offset 0x289: This file has corrupted metadata written by a buggy version of PSPP. To fix it, save a new copy of the file." Table: Variables Name,Position,Measurement Level,Role,Width,Alignment,Print Format,Write Format,Missing Values num1,1,Unknown,Input,8,Right,F8.0,F8.0, str1,2,Nominal,Input,9,Left,A9,A9, str2,3,Nominal,Input,10,Left,A10,A10,"""abcdefgh""; ""ijklmnop""; ""qrstuvwx""" -str3,4,Nominal,Input,11,Left,A11,A11, +str3,4,Nominal,Input,11,Left,A11,A11,"""ABCDEFGH""; ""IJKLMNOP""" ]) done AT_CLEANUP diff --git a/utilities/pspp-dump-sav.c b/utilities/pspp-dump-sav.c index 36cb285380..5f693c515b 100644 --- a/utilities/pspp-dump-sav.c +++ b/utilities/pspp-dump-sav.c @@ -1094,36 +1094,33 @@ read_long_string_missing_values (struct sfm_reader *r, while (ftello (r->file) - start < size * count) { long long posn = ftello (r->file); - char var_name[ID_MAX_LEN + 1]; - uint8_t n_missing_values; - int var_name_len; - int i; /* Read variable name. */ - var_name_len = read_int (r); + int var_name_len = read_int (r); if (var_name_len > ID_MAX_LEN) sys_error (r, "Variable name length in long string value label " "record (%d) exceeds %d-byte limit.", var_name_len, ID_MAX_LEN); + char var_name[ID_MAX_LEN + 1]; read_string (r, var_name, var_name_len + 1); /* Read number of values. */ + uint8_t n_missing_values; read_bytes (r, &n_missing_values, 1); - printf ("\t%08llx: %s, %d missing values:", - posn, var_name, n_missing_values); + /* Read value length. */ + int value_length = read_int (r); + + printf ("\t%08llx: %s, %d missing values, each %d bytes:", + posn, var_name, n_missing_values, value_length); /* Read values. */ - for (i = 0; i < n_missing_values; i++) + for (int i = 0; i < n_missing_values; i++) { - char *value; - int value_length; - posn = ftello (r->file); /* Read value. */ - value_length = read_int (r); - value = xmalloc (value_length + 1); + char *value = xmalloc (value_length + 1); read_string (r, value, value_length + 1); printf (" \"%s\"", value);