From ff85c7d77222c0ea90a9fc35b36eebd34eca52d2 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sun, 29 Sep 2013 19:11:41 -0700 Subject: [PATCH] sys-file-reader: Add support for long string missing values. I assumed earlier that long string missing values used the same fields in the system file as other missing values, but I was wrong. This commit updates the support to match what I've seen in actual system files. Reported by Przemek Powalko . --- NEWS | 12 ++- doc/dev/system-file-format.texi | 73 ++++++++++++++ src/data/sys-file-reader.c | 107 +++++++++++++++++++-- src/data/sys-file-writer.c | 113 ++++++++++++++++++---- tests/data/sys-file-reader.at | 162 +++++++++++++++++++++++++++++--- tests/data/sys-file.at | 51 ++++++++++ utilities/pspp-dump-sav.c | 56 +++++++++++ 7 files changed, 533 insertions(+), 41 deletions(-) diff --git a/NEWS b/NEWS index 7da909cd20..6ac997c833 100644 --- a/NEWS +++ b/NEWS @@ -6,8 +6,16 @@ Please send PSPP bug reports to bug-gnu-pspp@gnu.org. Changes since 0.8.1: - * Charts are now rendered with colours from the Tango palette instead of fully - saturated primaries. + * Charts are now rendered with colours from the Tango palette instead + of fully saturated primaries. + + * Missing values for long string variables are now read from and + written to system files in an SPSS-compatible fashion. + + (Earlier versions of PSPP that supported missing values for long + string variables wrote them to system files in an SPSS-incompatible + way. To fix the problem, read the system file with this version of + PSPP and then save a new copy of it.) Changes from 0.8.0 to 0.8.1: diff --git a/doc/dev/system-file-format.texi b/doc/dev/system-file-format.texi index 52737fb943..6ce8b09c71 100644 --- a/doc/dev/system-file-format.texi +++ b/doc/dev/system-file-format.texi @@ -102,6 +102,7 @@ Each type of record is described separately below. 
* Very Long String Record:: * Character Encoding Record:: * Long String Value Labels Record:: +* Long String Missing Values Record:: * Data File and Variable Attributes Records:: * Extended Number of Cases Record:: * Miscellaneous Informational Records:: @@ -277,6 +278,10 @@ respectively. If the variable has a range for missing variables, set to -2; if the variable has a range for missing variables plus a single discrete value, set to -3. +A long string variable always has the value 0 here. A separate record +indicates missing values for long string variables (@pxref{Long String +Missing Values Record}). + @item int32 print; Print format for this variable. See below. @@ -1152,6 +1157,74 @@ between 0 and 120, is the number of bytes in @code{label}. The @end table @end table +@node Long String Missing Values Record +@section Long String Missing Values Record + +This record, if present, specifies missing values for long string +variables. + +@example +/* @r{Header.} */ +int32 rec_type; +int32 subtype; +int32 size; +int32 count; + +/* @r{Repeated up to exactly @code{count} bytes.} */ +int32 var_name_len; +char var_name[]; +char n_missing_values; +long_string_missing_value values[]; +@end example + +@table @code +@item int32 rec_type; +Record type. Always set to 7. + +@item int32 subtype; +Record subtype. Always set to 22. + +@item int32 size; +Always set to 1. + +@item int32 count; +The number of bytes following the header until the next header. + +@item int32 var_name_len; +@itemx char var_name[]; +The number of bytes in the name of the long string variable that has +missing values, plus the variable name itself, which consists of +exactly @code{var_name_len} bytes. The variable name is not padded to +any particular boundary, nor is it null-terminated. + +@item char n_missing_values; +The number of missing values, either 1, 2, or 3. (This is, unusually, +a single byte instead of a 32-bit number.) 
+ +@item long_string_missing_value values[]; +The missing values themselves. This array contains exactly +@code{n_missing_values} elements, each of which has the following +substructure: + +@example +int32 value_len; +char value[]; +@end example + +@table @code +@item int32 value_len; +The length of the missing value string, in bytes. This value should +be 8, because long string variables are at least 8 bytes wide (by +definition), only the first 8 bytes of a long string variable's +missing values are allowed to be non-spaces, and any spaces within the +first 8 bytes are included in the missing value here. + +@item char value[]; +The missing value string, exactly @code{value_len} bytes, without +any padding or null terminator. +@end table +@end table + @node Data File and Variable Attributes Records @section Data File and Variable Attributes Records diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index c416249872..5949ee209c 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -85,6 +85,7 @@ enum EXT_MRSETS2 = 19, /* Multiple response sets (extended). */ EXT_ENCODING = 20, /* Character encoding. */ EXT_LONG_LABELS = 21, /* Value labels for long strings. */ + EXT_LONG_MISSING = 22, /* Missing values for long strings. */ EXT_DATAVIEW = 24 /* "Format properties in dataview table". */ }; @@ -141,6 +142,7 @@ struct sfm_document_record struct sfm_extension_record { + int subtype; /* Record subtype. */ off_t pos; /* Starting offset in file. */ size_t size; /* Size of data elements. */ size_t count; /* Number of data elements. */ @@ -306,6 +308,9 @@ static void assign_variable_roles (struct sfm_reader *, struct dictionary *); static void parse_long_string_value_labels (struct sfm_reader *, const struct sfm_extension_record *, struct dictionary *); +static void parse_long_string_missing_values (struct sfm_reader *, + const struct sfm_extension_record *, + struct dictionary *); /* Frees the strings inside INFO. 
*/ void @@ -545,6 +550,8 @@ sfm_open_reader (struct file_handle *fh, const char *volatile encoding, if (extensions[EXT_LONG_LABELS] != NULL) parse_long_string_value_labels (r, extensions[EXT_LONG_LABELS], dict); + if (extensions[EXT_LONG_MISSING] != NULL) + parse_long_string_missing_values (r, extensions[EXT_LONG_MISSING], dict); /* Warn if the actual amount of data per case differs from the amount that the header claims. SPSS version 13 gets this @@ -862,6 +869,7 @@ static void read_extension_record_header (struct sfm_reader *r, int subtype, struct sfm_extension_record *record) { + record->subtype = subtype; record->pos = r->pos; record->size = read_int (r); record->count = read_int (r); @@ -901,6 +909,7 @@ read_extension_record (struct sfm_reader *r, int subtype) { EXT_MRSETS2, 1, 0 }, { EXT_ENCODING, 1, 0 }, { EXT_LONG_LABELS, 1, 0 }, + { EXT_LONG_MISSING, 1, 0 }, /* Ignored record types. */ { EXT_VAR_SETS, 0, 0 }, @@ -1992,7 +2001,8 @@ check_overflow (struct sfm_reader *r, size_t end = record->size * record->count; if (length >= end || ofs + length > end) sys_error (r, record->pos + end, - _("Long string value label record ends unexpectedly.")); + _("Extension record subtype %d ends unexpectedly."), + record->subtype); } static void @@ -2031,20 +2041,20 @@ parse_long_string_value_labels (struct sfm_reader *r, var = dict_lookup_var (dict, var_name); if (var == NULL) sys_warn (r, record->pos + ofs, - _("Ignoring long string value record for " + _("Ignoring long string value label record for " "unknown variable %s."), var_name); else if (var_is_numeric (var)) { sys_warn (r, record->pos + ofs, - _("Ignoring long string value record for " + _("Ignoring long string value label record for " "numeric variable %s."), var_name); var = NULL; } else if (width != var_get_width (var)) { sys_warn (r, record->pos + ofs, - _("Ignoring long string value record for variable %s " - "because the record's width (%d) does not match the " + _("Ignoring long string value label record 
for variable " + "%s because the record's width (%d) does not match the " "variable's width (%d)."), var_name, width, var_get_width (var)); var = NULL; @@ -2072,8 +2082,8 @@ parse_long_string_value_labels (struct sfm_reader *r, else { sys_warn (r, record->pos + ofs, - _("Ignoring long string value %zu for variable " - "%s, with width %d, that has bad value " + _("Ignoring long string value label %zu for " + "variable %s, with width %d, that has bad value " "width %zu."), i, var_get_name (var), width, value_length); skip = true; @@ -2106,6 +2116,89 @@ parse_long_string_value_labels (struct sfm_reader *r, } } } + +static void +parse_long_string_missing_values (struct sfm_reader *r, + const struct sfm_extension_record *record, + struct dictionary *dict) +{ + const char *dict_encoding = dict_get_encoding (dict); + size_t end = record->size * record->count; + size_t ofs = 0; + + while (ofs < end) + { + struct missing_values mv; + char *var_name; + struct variable *var; + int n_missing_values; + int var_name_len; + size_t i; + + /* Parse variable name length. */ + check_overflow (r, record, ofs, 4); + var_name_len = parse_int (r, record->data, ofs); + ofs += 4; + + /* Parse variable name. */ + check_overflow (r, record, ofs, var_name_len + 1); + var_name = recode_string_pool ("UTF-8", dict_encoding, + (const char *) record->data + ofs, + var_name_len, r->pool); + ofs += var_name_len; + + /* Parse number of missing values. */ + n_missing_values = ((const uint8_t *) record->data)[ofs]; + if (n_missing_values < 1 || n_missing_values > 3) + sys_warn (r, record->pos + ofs, + _("Long string missing values record says variable %s " + "has %d missing values, but only 1 to 3 missing values " + "are allowed."), + var_name, n_missing_values); + ofs++; + + /* Look up 'var' and validate. 
*/ + var = dict_lookup_var (dict, var_name); + if (var == NULL) + sys_warn (r, record->pos + ofs, + _("Ignoring long string missing value record for " + "unknown variable %s."), var_name); + else if (var_is_numeric (var)) + { + sys_warn (r, record->pos + ofs, + _("Ignoring long string missing value record for " + "numeric variable %s."), var_name); + var = NULL; + } + + /* Parse values. */ + mv_init_pool (r->pool, &mv, var ? var_get_width (var) : 8); + for (i = 0; i < n_missing_values; i++) + { + size_t value_length; + + /* Parse value length. */ + check_overflow (r, record, ofs, 4); + value_length = parse_int (r, record->data, ofs); + ofs += 4; + + /* Parse value. */ + check_overflow (r, record, ofs, value_length); + if (var != NULL + && i < 3 + && !mv_add_str (&mv, (const uint8_t *) record->data + ofs, + value_length)) + sys_warn (r, record->pos + ofs, + _("Ignoring long string missing value %zu for variable " + "%s, with width %d, that has bad value width %zu."), + i, var_get_name (var), var_get_width (var), + value_length); + ofs += value_length; + } + if (var != NULL) + var_set_missing_values (var, &mv); + } +} /* Case reader. 
*/ diff --git a/src/data/sys-file-writer.c b/src/data/sys-file-writer.c index 2369370ca1..32326e13f2 100644 --- a/src/data/sys-file-writer.c +++ b/src/data/sys-file-writer.c @@ -116,6 +116,8 @@ static void write_vls_length_table (struct sfm_writer *w, static void write_long_string_value_labels (struct sfm_writer *, const struct dictionary *); +static void write_long_string_missing_values (struct sfm_writer *, + const struct dictionary *); static void write_mrsets (struct sfm_writer *, const struct dictionary *, bool pre_v14); @@ -258,6 +260,7 @@ sfm_open_writer (struct file_handle *fh, struct dictionary *d, write_vls_length_table (w, d); write_long_string_value_labels (w, d); + write_long_string_missing_values (w, d); if (opts.version >= 3) { @@ -442,7 +445,6 @@ write_variable (struct sfm_writer *w, const struct variable *v) int segment_cnt = sfm_width_to_segments (width); int seg0_width = sfm_segment_alloc_width (width, 0); const char *encoding = var_get_encoding (v); - struct missing_values mv; int i; /* Record type. */ @@ -456,14 +458,20 @@ write_variable (struct sfm_writer *w, const struct variable *v) /* Number of missing values. If there is a range, then the range counts as 2 missing values and causes the number to be - negated. */ - mv_copy (&mv, var_get_missing_values (v)); - if (mv_get_width (&mv) > 8) - mv_resize (&mv, 8); - if (mv_has_range (&mv)) - write_int (w, -2 - mv_n_values (&mv)); + negated. + + Missing values for long string variables are written in a separate + record. */ + if (width <= MAX_SHORT_STRING) + { + const struct missing_values *mv = var_get_missing_values (v); + if (mv_has_range (mv)) + write_int (w, -2 - mv_n_values (mv)); + else + write_int (w, mv_n_values (mv)); + } else - write_int (w, mv_n_values (&mv)); + write_int (w, 0); /* Print and write formats. 
*/ write_format (w, *var_get_print_format (v), seg0_width); @@ -486,15 +494,19 @@ write_variable (struct sfm_writer *w, const struct variable *v) } /* Write the missing values, if any, range first. */ - if (mv_has_range (&mv)) + if (width <= MAX_SHORT_STRING) { - double x, y; - mv_get_range (&mv, &x, &y); - write_float (w, x); - write_float (w, y); + const struct missing_values *mv = var_get_missing_values (v); + if (mv_has_range (mv)) + { + double x, y; + mv_get_range (mv, &x, &y); + write_float (w, x); + write_float (w, y); + } + for (i = 0; i < mv_n_values (mv); i++) + write_value (w, mv_get_value (mv, i), width); } - for (i = 0; i < mv_n_values (&mv); i++) - write_value (w, mv_get_value (&mv, i), mv_get_width (&mv)); write_variable_continuation_records (w, seg0_width); @@ -514,8 +526,6 @@ write_variable (struct sfm_writer *w, const struct variable *v) write_variable_continuation_records (w, seg_width); } - - mv_destroy (&mv); } /* Writes the value labels to system file W. @@ -891,11 +901,11 @@ write_vls_length_table (struct sfm_writer *w, ds_destroy (&map); } - static void write_long_string_value_labels (struct sfm_writer *w, const struct dictionary *dict) { + const char *encoding = dict_get_encoding (dict); size_t n_vars = dict_get_var_cnt (dict); size_t size, i; off_t start UNUSED; @@ -906,7 +916,6 @@ write_long_string_value_labels (struct sfm_writer *w, { struct variable *var = dict_get_var (dict, i); const struct val_labs *val_labs = var_get_value_labels (var); - const char *encoding = var_get_encoding (var); int width = var_get_width (var); const struct val_lab *val_lab; @@ -936,7 +945,6 @@ write_long_string_value_labels (struct sfm_writer *w, { struct variable *var = dict_get_var (dict, i); const struct val_labs *val_labs = var_get_value_labels (var); - const char *encoding = var_get_encoding (var); int width = var_get_width (var); const struct val_lab *val_lab; char *var_name; @@ -972,6 +980,71 @@ write_long_string_value_labels (struct sfm_writer *w, 
assert (ftello (w->file) == start + size); } +static void +write_long_string_missing_values (struct sfm_writer *w, + const struct dictionary *dict) +{ + const char *encoding = dict_get_encoding (dict); + size_t n_vars = dict_get_var_cnt (dict); + size_t size, i; + off_t start UNUSED; + + /* Figure out the size in advance. */ + size = 0; + for (i = 0; i < n_vars; i++) + { + struct variable *var = dict_get_var (dict, i); + const struct missing_values *mv = var_get_missing_values (var); + int width = var_get_width (var); + + if (mv_is_empty (mv) || width < 9) + continue; + + size += 4; + size += recode_string_len (encoding, "UTF-8", var_get_name (var), -1); + size += 1; + size += mv_n_values (mv) * (4 + 8); + } + if (size == 0) + return; + + write_int (w, 7); /* Record type. */ + write_int (w, 22); /* Record subtype */ + write_int (w, 1); /* Data item (byte) size. */ + write_int (w, size); /* Number of data items. */ + + start = ftello (w->file); + for (i = 0; i < n_vars; i++) + { + struct variable *var = dict_get_var (dict, i); + const struct missing_values *mv = var_get_missing_values (var); + int width = var_get_width (var); + uint8_t n_missing_values; + char *var_name; + int j; + + if (mv_is_empty (mv) || width < 9) + continue; + + var_name = recode_string (encoding, "UTF-8", var_get_name (var), -1); + write_int (w, strlen (var_name)); + write_bytes (w, var_name, strlen (var_name)); + free (var_name); + + n_missing_values = mv_n_values (mv); + write_bytes (w, &n_missing_values, 1); + + for (j = 0; j < n_missing_values; j++) + { + const union value *value = mv_get_value (mv, j); + + write_int (w, 8); + write_bytes (w, value_str (value, width), 8); + } + } + assert (ftello (w->file) == start + size); +} + static void write_encoding_record (struct sfm_writer *w, const struct dictionary *d) diff --git a/tests/data/sys-file-reader.at b/tests/data/sys-file-reader.at index 602ae76c1a..2706228eb6 100644 --- a/tests/data/sys-file-reader.at +++ 
b/tests/data/sys-file-reader.at @@ -6,7 +6,7 @@ AT_DATA([sys-file.sack], [dnl dnl File header. "$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; 2; dnl Layout code -22; dnl Nominal case size +28; dnl Nominal case size 0; dnl Not compressed 0; dnl Not weighted 1; dnl 1 case. @@ -68,11 +68,24 @@ dnl String variable, three missing values. 2; 4; 0; 3; 0x010400 *2; s8 "STR6"; s8 "MISS"; s8 "OTHR"; s8 "MORE"; dnl Long string variable, one missing value. +dnl (This is not how SPSS represents missing values for long strings--it +dnl uses a separate record as shown later below--but old versions of PSPP +dnl did use this representation so we continue supporting it for backward +dnl compatibility.) 2; 11; 0; 1; 0x010b00 *2; s8 "STR7"; "first8by"; 2; -1; 0; 0; 0; 0; s8 ""; +dnl Long string variables that will have missing values added with a +dnl later record. +2; 9; 0; 0; 0x010900 *2; s8 "STR8"; +2; -1; 0; 0; 0; 0; s8 ""; +2; 10; 0; 0; 0x010a00 *2; s8 "STR9"; +2; -1; 0; 0; 0; 0; s8 ""; +2; 11; 0; 0; 0x010b00 *2; s8 "STR10"; +2; -1; 0; 0; 0; 0; s8 ""; + dnl Long string variable, value label. -2; 25; 1; 0; 0x011900 *2; s8 "STR8"; 14; "25-byte string"; i8 0 * 2; +2; 25; 1; 0; 0x011900 *2; s8 "STR11"; 14; "25-byte string"; i8 0 * 2; ( 2; -1; 0; 0; 0; 0; s8 ""; ) * 2; dnl Variable label fields on continuation records have been spotted in system dnl files created by "SPSS Power Macintosh Release 6.1". @@ -84,6 +97,18 @@ dnl Machine integer info record. dnl Machine floating-point info record. 7; 4; 8; 3; SYSMIS; HIGHEST; LOWEST; +dnl Long string variable missing values record. +7; 22; 1; COUNT ( +dnl One missing value for STR8. +COUNT("STR8"); i8 1; 8; "abcdefgh"; + +dnl Two missing values for STR9. +COUNT("STR9"); i8 2; 8; "abcdefgh"; 8; "01234567"; + +dnl Three missing values for STR10. +COUNT("STR10"); i8 3; 8; "abcdefgh"; 8; "01234567"; 8; "0 "; +); + dnl Character encoding record. 7; 20; 1; 12; "windows-1252"; @@ -93,11 +118,12 @@ dnl Dictionary termination record. 
dnl Data. 1.0; 2.0; 3.0; 4.0; 5.0; 6.0; 7.0; 8.0; 9.0; 10.0; s8 "abcd"; s8 "efgh"; s8 "ijkl"; s8 "mnop"; s8 "qrst"; s8 "uvwx"; -s16 "yzABCDEFGHI"; s32 "JKLMNOPQRSTUVWXYZ01234567"; +s16 "yzABCDEFGHI"; s16 "JKLMNOPQR"; s16 "STUVWXYZ01"; +s16 "23456789abc"; s32 "defghijklmnopqstuvwxyzABC"; ]) for variant in \ - "be 94338da4d8d44244d43f31e2ea4d0a6a" \ - "le e3e7eefb984b81be5531b579293cb127" + "be ae072375af73d628a544cc2230dd72c9" \ + "le 039a21ab64f68c65b240e782a6b0f563" do set $variant AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2] @@ -147,12 +173,18 @@ str6,Format: A4,,16 ,"Missing Values: ""MISS""; ""OTHR""; ""MORE""",, str7,Format: A11,,17 ,"Missing Values: ""first8by""",, -str8,25-byte string,,18 +str8,Format: A9,,18 +,"Missing Values: ""abcdefgh""",, +str9,Format: A10,,19 +,"Missing Values: ""abcdefgh""; ""01234567""",, +str10,Format: A11,,20 +,"Missing Values: ""abcdefgh""; ""01234567""; ""0 """,, +str11,25-byte string,,21 ,Format: A25,, Table: Data List -num1,num2,num3,num4,num5,num6,num7,num8,num9,numàèìñò,str1,str2,str3,str4,str5,str6,str7,str8 -1,2,3,4,5,6,7,8,9,10,abcd,efgh,ijkl,mnop,qrst,uvwx,yzABCDEFGHI,JKLMNOPQRSTUVWXYZ01234567 +num1,num2,num3,num4,num5,num6,num7,num8,num9,numàèìñò,str1,str2,str3,str4,str5,str6,str7,str8,str9,str10,str11 +1,2,3,4,5,6,7,8,9,10,abcd,efgh,ijkl,mnop,qrst,uvwx,yzABCDEFGHI,JKLMNOPQR,STUVWXYZ01,23456789abc,defghijklmnopqstuvwxyzABC ]) done AT_CLEANUP @@ -1817,6 +1849,112 @@ warning: `sys-file.sav' near offset 0x124: Variable STR2 with width 4 has invali done AT_CLEANUP +AT_SETUP([invalid long string missing values]) +AT_KEYWORDS([sack synthetic system file negative]) +AT_DATA([sys-file.sack], [dnl +dnl File header. +"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; +2; dnl Layout code +7; dnl Nominal case size +0; dnl Not compressed +0; dnl Not weighted +1; dnl 1 case. +100.0; dnl Bias. 
+"01 Jan 11"; "20:53:52"; +"PSPP synthetic test file: "; i8 244; i8 245; i8 246; i8 248; s34 ""; +i8 0 *3; + +dnl One numeric variable. +2; 0; 0; 0; 0x050800 *2; s8 "NUM1"; + +dnl Long string variables that will have missing values added with a +dnl later record. +2; 9; 0; 0; 0x010900 *2; s8 "STR1"; +2; -1; 0; 0; 0; 0; s8 ""; +2; 10; 0; 0; 0x010a00 *2; s8 "STR2"; +2; -1; 0; 0; 0; 0; s8 ""; +2; 11; 0; 0; 0x010b00 *2; s8 "STR3"; +2; -1; 0; 0; 0; 0; s8 ""; + +dnl Machine integer info record. +7; 3; 4; 8; 1; 2; 3; -1; 1; 1; ENDIAN; 1252; + +dnl Machine floating-point info record. +7; 4; 8; 3; SYSMIS; HIGHEST; LOWEST; + +dnl Long string variable missing values record. +7; 22; 1; COUNT ( +dnl Zero missing values (not allowed) for STR1 . +COUNT("STR1"); i8 >>0<<; + +dnl Four missing values (not allowed) for STR2. +COUNT("STR2"); i8 4; +8; "abcdefgh"; 8; "ijklmnop"; 8; "qrstuvwx"; 8; "yz012345"; + +dnl Missing values for unknown variable +COUNT(>>"Nonexistent"<<); i8 1; 8; "abcdefgh"; + +dnl Missing values for numeric variable +COUNT(>>"NUM1"<<); i8 1; 8; "abcdefgh"; + +dnl Too long missing value +COUNT("STR3"); i8 1; >>COUNT("abcdefghijkl")<<; +); + +dnl Character encoding record. +7; 20; 1; 12; "windows-1252"; + +dnl Dictionary termination record. +999; 0; +s8 "abcd"; s8 "efgh"; s8 "ijkl"; s8 "mnop"; s8 "qrst"; s8 "uvwx"; +s16 "yzABCDEFGHI"; s16 "JKLMNOPQR"; s16 "STUVWXYZ01"; +s16 "23456789abc"; s32 "defghijklmnopqstuvwxyzABC"; +]) + +for variant in \ + "be 26e815cfb41eaedb435ea3c81b96215c" \ + "le 72d70456bd4dc88bb0a0fdb039ccdfa3" +do + set $variant + AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2] +]) + AT_DATA([sys-file.sps], [dnl +GET FILE='sys-file.sav'. +DISPLAY DICTIONARY. +]) + AT_CHECK([pspp -O format=csv sys-file.sps], [0], + ["warning: `sys-file.sav' near offset 0x1f8: Long string missing values record says variable STR1 has 0 missing values, but only 1 to 3 missing values are allowed." 
+ +"warning: `sys-file.sav' near offset 0x201: Long string missing values record says variable STR2 has 4 missing values, but only 1 to 3 missing values are allowed." + +warning: `sys-file.sav' near offset 0x242: Ignoring long string missing value record for unknown variable Nonexistent. + +warning: `sys-file.sav' near offset 0x257: Ignoring long string missing value record for numeric variable NUM1. + +"warning: `sys-file.sav' near offset 0x270: Ignoring long string missing value 0 for variable str3, with width 11, that has bad value width 12." + +Variable,Description,,Position +num1,Format: F8.0,,1 +,Measure: Scale,, +,Display Alignment: Right,, +,Display Width: 8,, +str1,Format: A9,,2 +,Measure: Nominal,, +,Display Alignment: Left,, +,Display Width: 9,, +str2,Format: A10,,3 +,Measure: Nominal,, +,Display Alignment: Left,, +,Display Width: 10,, +,"Missing Values: ""abcdefgh""; ""ijklmnop""; ""qrstuvwx""",, +str3,Format: A11,,4 +,Measure: Nominal,, +,Display Alignment: Left,, +,Display Width: 11,, +]) +done +AT_CLEANUP + AT_SETUP([weighting variable must be numeric]) AT_KEYWORDS([sack synthetic system file negative]) AT_DATA([sys-file.sack], [dnl @@ -3238,13 +3376,13 @@ do AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'. ]) AT_CHECK([pspp -O format=csv sys-file.sps], [0], [dnl -warning: `sys-file.sav' near offset 0x128: Ignoring long string value record for unknown variable STR9. +warning: `sys-file.sav' near offset 0x128: Ignoring long string value label record for unknown variable STR9. -warning: `sys-file.sav' near offset 0x164: Ignoring long string value record for numeric variable NUM1. +warning: `sys-file.sav' near offset 0x164: Ignoring long string value label record for numeric variable NUM1. -warning: `sys-file.sav' near offset 0x193: Ignoring long string value record for variable STR14 because the record's width (9) does not match the variable's width (14). 
+warning: `sys-file.sav' near offset 0x193: Ignoring long string value label record for variable STR14 because the record's width (9) does not match the variable's width (14). -"warning: `sys-file.sav' near offset 0x1d4: Ignoring long string value 0 for variable str14, with width 14, that has bad value width 9." +"warning: `sys-file.sav' near offset 0x1d4: Ignoring long string value label 0 for variable str14, with width 14, that has bad value width 9." warning: `sys-file.sav' near offset 0x259: Duplicate value label for `abcdefghijklmn' on str14. ]) diff --git a/tests/data/sys-file.at b/tests/data/sys-file.at index 85ccfd8b49..cd7fbb4f50 100644 --- a/tests/data/sys-file.at +++ b/tests/data/sys-file.at @@ -37,6 +37,57 @@ variable001,variable002,variable003,variable004 ]) AT_CLEANUP +AT_SETUP([write and read long string value labels and missing values]) +AT_DATA([sysfile.sps], [dnl +DATA LIST LIST NOTABLE/s1 s2 s3 (a9). +BEGIN DATA +a b c +END DATA. + +VALUE LABELS + /s1 'abc' 'First value label' + 'abcdefgh' 'Second value label' + 'abcdefghi' 'Third value label' + /s2 '0' 'Fourth value label' + '01234567' 'Fifth value label' + '012345678' 'Sixth value label'. + +MISSING VALUES + s1 ('0') + /s2 ('12' '123') + /s3 ('1234' '12345' '12345678'). + +SAVE /OUTFILE='foo.sav'. +GET /FILE='foo.sav'. +DISPLAY DICTIONARY. 
+]) +AT_CHECK([pspp -o pspp.csv sysfile.sps]) +AT_CHECK([cat pspp.csv], [0], [dnl +Variable,Description,,Position +s1,Format: A9,,1 +,Measure: Nominal,, +,Display Alignment: Left,, +,Display Width: 9,, +,"Missing Values: ""0 """,, +,abc ,First value label, +,abcdefgh ,Second value label, +,abcdefghi,Third value label, +s2,Format: A9,,2 +,Measure: Nominal,, +,Display Alignment: Left,, +,Display Width: 9,, +,"Missing Values: ""12 ""; ""123 """,, +,0 ,Fourth value label, +,01234567 ,Fifth value label, +,012345678,Sixth value label, +s3,Format: A9,,3 +,Measure: Nominal,, +,Display Alignment: Left,, +,Display Width: 9,, +,"Missing Values: ""1234 ""; ""12345 ""; ""12345678""",, +]) +AT_CLEANUP + AT_SETUP([write and read compressed files]) AT_KEYWORDS([SAVE GET system file]) AT_DATA([sysfile.sps], [dnl diff --git a/utilities/pspp-dump-sav.c b/utilities/pspp-dump-sav.c index 307add8db3..c6b5823660 100644 --- a/utilities/pspp-dump-sav.c +++ b/utilities/pspp-dump-sav.c @@ -83,6 +83,8 @@ static void read_character_encoding (struct sfm_reader *r, size_t size, size_t count); static void read_long_string_value_labels (struct sfm_reader *r, size_t size, size_t count); +static void read_long_string_missing_values (struct sfm_reader *r, + size_t size, size_t count); static void read_unknown_extension (struct sfm_reader *, size_t size, size_t count); static void read_compressed_data (struct sfm_reader *, int max_cases); @@ -615,6 +617,10 @@ read_extension_record (struct sfm_reader *r) read_long_string_value_labels (r, size, count); return; + case 22: + read_long_string_missing_values (r, size, count); + return; + default: sys_warn (r, "Unrecognized record type 7, subtype %d.", subtype); read_unknown_extension (r, size, count); @@ -1036,6 +1042,56 @@ read_long_string_value_labels (struct sfm_reader *r, size_t size, size_t count) } } +static void +read_long_string_missing_values (struct sfm_reader *r, + size_t size, size_t count) +{ + long long int start = ftello (r->file); + + printf 
("%08llx: long string missing values\n", start); + while (ftello (r->file) - start < size * count) + { + long long posn = ftello (r->file); + char var_name[ID_MAX_LEN + 1]; + uint8_t n_missing_values; + int var_name_len; + int i; + + /* Read variable name. */ + var_name_len = read_int (r); + if (var_name_len > ID_MAX_LEN) + sys_error (r, "Variable name length in long string value label " + "record (%d) exceeds %d-byte limit.", + var_name_len, ID_MAX_LEN); + read_string (r, var_name, var_name_len + 1); + + /* Read number of values. */ + read_bytes (r, &n_missing_values, 1); + + printf ("\t%08llx: %s, %d missing values:", + posn, var_name, n_missing_values); + + /* Read values. */ + for (i = 0; i < n_missing_values; i++) + { + char *value; + int value_length; + + posn = ftello (r->file); + + /* Read value. */ + value_length = read_int (r); + value = xmalloc (value_length + 1); + read_string (r, value, value_length + 1); + + printf (" \"%s\"", value); + + free (value); + } + printf ("\n"); + } +} + static void hex_dump (size_t offset, const void *buffer_, size_t buffer_size) { -- 2.30.2