From 197c17c92ac8124ae389434afa105bee90b96ad8 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 29 Nov 2014 11:56:15 -0800 Subject: [PATCH] Add support for weighting variables in SPSS/PC+ system files. Based on a file provided by Alan Mead. --- doc/dev/pc+-file-format.texi | 16 ++- src/data/pc+-file-reader.c | 39 +++++-- tests/data/pc+-file-reader.at | 190 +++++++++++++++++++++++++++++++++- 3 files changed, 230 insertions(+), 15 deletions(-) diff --git a/doc/dev/pc+-file-format.texi b/doc/dev/pc+-file-format.texi index f3f1a9d620..826e51f880 100644 --- a/doc/dev/pc+-file-format.texi +++ b/doc/dev/pc+-file-format.texi @@ -93,9 +93,11 @@ uint32 zero1; uint16 one1; uint16 compressed; uint16 nominal_case_size; -uint32 n_cases0; +uint16 n_cases0; +uint16 weight_index; uint16 zero2; -uint32 n_cases1; +uint16 n_cases1; +uint16 zero3; char creation_date[8]; char creation_time[8]; char label[64]; @@ -109,6 +111,7 @@ Always set to 1. @item uint32 zero0; @itemx uint32 zero1; @itemx uint16 zero2; +@itemx uint16 zero3; Always set to 0. It seems likely that one of these variables is set to 1 if weighting @@ -143,12 +146,17 @@ except that long string variables add extra data elements (one for every 8 bytes after the first 8). String variables in SPSS/PC+ system files are limited to 255 bytes. -@item uint32 n_cases0; -@itemx uint32 n_cases1; +@item uint16 n_cases0; +@itemx uint16 n_cases1; The number of cases in the data record. Both values are the same. Some files in the corpus contain data for the number of cases noted here, followed by garbage that somewhat resembles data. +@item uint16 weight_index; +0, if the file is unweighted, otherwise a 1-based index into the data +record of the weighting variable, e.g.@: 4 for the first variable +after the 3 system-defined variables. + @item char creation_date[8]; The date that the file was created, in @samp{mm/dd/yy} format. Single-digit days and months are not prefixed by zeros. The string is diff --git a/src/data/pc+-file-reader.c b/src/data/pc+-file-reader.c index a127323c68..0aa4587d38 100644 --- a/src/data/pc+-file-reader.c +++ b/src/data/pc+-file-reader.c @@ -73,6 +73,7 @@ struct pcp_main_header char creation_date[9]; /* "[m]m/dd/yy". */ char creation_time[9]; /* "[H]H:MM:SS". */ char file_label[65]; /* File label. */ + unsigned int weight_index; /* Index of weighting variable, 0 if none. */ }; struct pcp_var_record @@ -85,6 +86,8 @@ struct pcp_var_record uint8_t missing[8]; char *label; + bool weight; + struct pcp_value_label *val_labs; size_t n_val_labs; @@ -524,8 +527,8 @@ static bool read_main_header (struct pcp_reader *r, struct pcp_main_header *header) { unsigned int base_ofs = r->directory.main.ofs; + unsigned int zero0, zero1, zero2, zero3; size_t min_values, min_data_size; - unsigned int zero0, zero1, zero2; unsigned int one0, one1; unsigned int compressed; unsigned int n_cases1; @@ -551,9 +554,11 @@ read_main_header (struct pcp_reader *r, struct pcp_main_header *header) || !read_uint16 (r, &one1) || !read_uint16 (r, &compressed) || !read_uint16 (r, &header->nominal_case_size) - || !read_uint32 (r, &r->n_cases) + || !read_uint16 (r, &r->n_cases) + || !read_uint16 (r, &header->weight_index) || !read_uint16 (r, &zero2) - || !read_uint32 (r, &n_cases1) + || !read_uint16 (r, &n_cases1) + || !read_uint16 (r, &zero3) || !read_string (r, header->creation_date, sizeof header->creation_date) || !read_string (r, header->creation_time, sizeof header->creation_time) || !read_string (r, header->file_label, sizeof header->file_label)) @@ -565,10 +570,11 @@ read_main_header (struct pcp_reader *r, struct pcp_main_header *header) pcp_warn (r, base_ofs, _("Record 0 specifies unexpected system missing " "value %g (%a)."), d, d); } - if (one0 != 1 || one1 != 1 || zero0 != 0 || zero1 != 0 || zero2 != 0) + if (one0 != 1 || one1 != 1 + || zero0 != 0 || zero1 != 0 || zero2 != 0 || zero3 != 0) pcp_warn (r, base_ofs, _("Record 0 reserved fields have unexpected values " - "(%u,%u,%u,%u,%u)."), - one0, one1, zero0, zero1, zero2); + "(%u,%u,%u,%u,%u,%u)."), + one0, one1, zero0, zero1, zero2, zero3); if (n_cases1 != r->n_cases) pcp_warn (r, base_ofs, _("Record 0 case counts differ (%u versus %u)."), r->n_cases, n_cases1); @@ -701,6 +707,7 @@ static bool read_variables_record (struct pcp_reader *r) { unsigned int i; + bool weighted; if (!pcp_seek (r, r->directory.variables.ofs)) return false; @@ -713,6 +720,7 @@ read_variables_record (struct pcp_reader *r) r->vars = pool_calloc (r->pool, r->header.nominal_case_size, sizeof *r->vars); + weighted = false; for (i = 0; i < r->header.nominal_case_size; i++) { struct pcp_var_record *var = &r->vars[r->n_vars++]; @@ -730,6 +738,10 @@ read_variables_record (struct pcp_reader *r) || !read_bytes (r, var->missing, sizeof var->missing)) return false; + var->weight = r->header.weight_index && i == r->header.weight_index - 1; + if (var->weight) + weighted = true; + raw_type = format >> 16; if (!fmt_from_io (raw_type, &var->format.type)) { @@ -768,6 +780,9 @@ read_variables_record (struct pcp_reader *r) } } + if (r->header.weight_index && !weighted) + pcp_warn (r, -1, _("Invalid weight index %u."), r->header.weight_index); + return true; } @@ -823,14 +838,12 @@ parse_variable_records (struct pcp_reader *r, struct dictionary *dict, for (rec = var_recs; rec < &var_recs[n_var_recs]; rec++) { struct variable *var; - bool weight; char *name; size_t i; name = recode_string_pool ("UTF-8", dict_encoding, rec->name, -1, r->pool); name[strcspn (name, " ")] = '\0'; - weight = !strcmp (name, "$WEIGHT") && rec->width == 0; /* Transform $DATE => DATE_, $WEIGHT => WEIGHT_, $CASENUM => CASENUM_. */ if (name[0] == '$') @@ -852,8 +865,14 @@ parse_variable_records (struct pcp_reader *r, struct dictionary *dict, var = rec->var = dict_create_var_assert (dict, new_name, rec->width); free (new_name); } - if (weight) - dict_set_weight (dict, var); + if (rec->weight) + { + if (!rec->width) + dict_set_weight (dict, var); + else + pcp_warn (r, rec->pos, + _("Cannot weight by string variable `%s'."), name); + } /* Set the short name the same as the long name. */ var_set_short_name (var, 0, name); diff --git a/tests/data/pc+-file-reader.at b/tests/data/pc+-file-reader.at index 1d89d0dbf6..7b88ef4adb 100644 --- a/tests/data/pc+-file-reader.at +++ b/tests/data/pc+-file-reader.at @@ -359,6 +359,90 @@ NUM1,NUM2,STR4,STR8,STR15 1000,.,PQRS,TUVWXYZa,bcdefghijklmnop ]) AT_CLEANUP + +AT_SETUP([weighted]) +AT_KEYWORDS([sack synthetic PC+ file negative]) +AT_DATA([pc+-file.sack], [dnl +dnl File header. +2; 0; +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +0; 0; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; dnl Fixed. + s62 "PCSPSS PSPP synthetic test product"; + PCSYSMIS; + 0; 0; i16 1; dnl Fixed. + i16 0; + i16 7; + i16 1; i16 6; + i16 0; dnl Fixed. + i16 1; i16 0; + s8 "11/28/14"; + s8 "15:11:00"; + s64 "PSPP synthetic test file"; +MAIN_END: + +VARS: + 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; + 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; + + dnl Numeric variables. + 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM2"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM3"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM4"; PCSYSMIS; +VARS_END: + +DATA: + 0.0; "11/28/14"; 1.0; 2.0; 3.0; 4.0; 5.0; +DATA_END: +]) +AT_CHECK([sack --le pc+-file.sack > pc+-file.sav]) +AT_DATA([pc+-file.sps], [dnl +SYSFILE INFO FILE='pc+-file.sav' ENCODING='us-ascii'. +]) +AT_CHECK([pspp -O format=csv pc+-file.sps], [0], [dnl +File:,pc+-file.sav +Label:,PSPP synthetic test file +Created:,11/28/14 15:11:00 by PCSPSS PSPP synthetic test product +Integer Format:,Little Endian +Real Format:,IEEE 754 LE. +Variables:,4 +Cases:,1 +Type:,SPSS/PC+ System File +Weight:,NUM3 +Compression:,None +Encoding:,us-ascii + +Variable,Description,Position +NUM1,"Format: F8.0 +Measure: Scale +Role: Input +Display Alignment: Right +Display Width: 8",1 +NUM2,"Format: F8.0 +Measure: Scale +Role: Input +Display Alignment: Right +Display Width: 8",2 +NUM3,"Format: F8.0 +Measure: Scale +Role: Input +Display Alignment: Right +Display Width: 8",3 +NUM4,"Format: F8.0 +Measure: Scale +Role: Input +Display Alignment: Right +Display Width: 8",4 +]) +AT_CLEANUP AT_BANNER([SPSS/PC+ file reader - negative]) @@ -518,7 +602,7 @@ AT_CHECK([pspp -O format=csv pc+-file.sps], [0], [dnl warning: `pc+-file.sav' near offset 0x100: Record 0 specifies unexpected system missing value 1 (0x1p+0). -"warning: `pc+-file.sav' near offset 0x100: Record 0 reserved fields have unexpected values (1,1,0,2,0)." +"warning: `pc+-file.sav' near offset 0x100: Record 0 reserved fields have unexpected values (1,1,0,2,0,0)." warning: `pc+-file.sav' near offset 0x100: Record 0 case counts differ (1 versus 3). ]) @@ -1213,3 +1297,107 @@ NUM1,NUM2,STR4,STR8,STR15 1000,.,PQRS,TUVWXYZa,bcdefghijklmnop ]) AT_CLEANUP + +AT_SETUP([invalid weight index]) +AT_KEYWORDS([sack synthetic PC+ file negative]) +AT_DATA([pc+-file.sack], [dnl +dnl File header. +2; 0; +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +0; 0; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; dnl Fixed. + s62 "PCSPSS PSPP synthetic test product"; + PCSYSMIS; + 0; 0; i16 1; dnl Fixed. + i16 0; + i16 7; + i16 1; i16 10; + i16 0; dnl Fixed. + i16 1; i16 0; + s8 "11/28/14"; + s8 "15:11:00"; + s64 "PSPP synthetic test file"; +MAIN_END: + +VARS: + 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; + 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; + + dnl Numeric variables. + 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM2"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM3"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM4"; PCSYSMIS; +VARS_END: + +DATA: + 0.0; "11/28/14"; 1.0; 2.0; 3.0; 4.0; 5.0; +DATA_END: +]) +AT_CHECK([sack --le pc+-file.sack > pc+-file.sav]) +AT_DATA([pc+-file.sps], [dnl +GET FILE='pc+-file.sav' ENCODING='us-ascii'. +]) +AT_CHECK([pspp -O format=csv pc+-file.sps], [0], + [warning: `pc+-file.sav': Invalid weight index 10. +]) +AT_CLEANUP + +AT_SETUP([string weight]) +AT_KEYWORDS([sack synthetic PC+ file negative]) +AT_DATA([pc+-file.sack], [dnl +dnl File header. +2; 0; +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +0; 0; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; dnl Fixed. + s62 "PCSPSS PSPP synthetic test product"; + PCSYSMIS; + 0; 0; i16 1; dnl Fixed. + i16 0; + i16 7; + i16 1; i16 6; + i16 0; dnl Fixed. + i16 1; i16 0; + s8 "11/28/14"; + s8 "15:11:00"; + s64 "PSPP synthetic test file"; +MAIN_END: + +VARS: + 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; + 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; + + dnl Variables. + 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM2"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "STR1"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM4"; PCSYSMIS; +VARS_END: + +DATA: + 0.0; "11/28/14"; 1.0; s8 "acbdefgh"; 3.0; 4.0; 5.0; +DATA_END: +]) +AT_CHECK([sack --le pc+-file.sack > pc+-file.sav]) +AT_DATA([pc+-file.sps], [dnl +GET FILE='pc+-file.sav' ENCODING='us-ascii'. +]) +AT_CHECK([pspp -O format=csv pc+-file.sps], [0], + [warning: `pc+-file.sav' near offset 0x250: Cannot weight by string variable `STR1'. +]) +AT_CLEANUP -- 2.30.2