From 63c7521729b947ace9e192dff9330813ecfb5812 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Fri, 28 Nov 2014 21:16:23 -0800 Subject: [PATCH] Add support for reading SPSS/PC+ system files. Thanks to Alan Mead for supplying most of the files in the corpus used to determine the SPSS/PC+ system file format. --- NEWS | 8 + doc/automake.mk | 1 + doc/dev/pc+-file-format.texi | 362 ++++++ doc/dev/system-file-format.texi | 3 +- doc/files.texi | 45 +- doc/pspp-convert.texi | 13 +- doc/pspp-dev.texi | 2 + perl-module/PSPP.xs | 25 +- perl-module/t/Pspp.t | 2 +- src/data/any-reader.c | 248 +++- src/data/any-reader.h | 95 +- src/data/automake.mk | 6 +- src/data/dataset-reader.c | 62 - src/data/dataset-reader.h | 27 - src/data/pc+-file-reader.c | 1343 ++++++++++++++++++++ src/data/por-file-reader.c | 106 +- src/data/por-file-reader.h | 42 - src/data/sys-file-reader.c | 128 +- src/data/sys-file-reader.h | 88 -- src/data/sys-file-writer.c | 26 +- src/data/sys-file-writer.h | 6 +- src/data/sys-file.h | 28 - src/language/data-io/combine-files.c | 3 +- src/language/data-io/get.c | 4 +- src/language/data-io/save.c | 8 +- src/language/dictionary/apply-dictionary.c | 4 +- src/language/dictionary/sys-file-info.c | 62 +- src/ui/gui/psppire-window.c | 8 +- src/ui/gui/psppire.c | 8 +- src/ui/source-init-opts.c | 4 +- tests/automake.mk | 1 + tests/data/pc+-file-reader.at | 1215 ++++++++++++++++++ tests/data/por-file.at | 58 + tests/data/sack.c | 199 ++- tests/language/dictionary/sys-file-info.at | 2 +- tests/perl-module.at | 2 +- utilities/automake.mk | 5 + utilities/pspp-convert.1 | 16 +- utilities/pspp-convert.c | 4 +- 39 files changed, 3718 insertions(+), 551 deletions(-) create mode 100644 doc/dev/pc+-file-format.texi delete mode 100644 src/data/dataset-reader.c delete mode 100644 src/data/dataset-reader.h create mode 100644 src/data/pc+-file-reader.c delete mode 100644 src/data/por-file-reader.h delete mode 100644 src/data/sys-file-reader.h delete mode 100644 src/data/sys-file.h create 
mode 100644 tests/data/pc+-file-reader.at diff --git a/NEWS b/NEWS index f87a5f5705..fc676cc4b0 100644 --- a/NEWS +++ b/NEWS @@ -6,6 +6,14 @@ Please send PSPP bug reports to bug-gnu-pspp@gnu.org. Changes since 0.8.4: + * SPSS/PC+ system files are now supported on GET and other commands + that read SPSS system files. The pspp-convert program can now read + SPSS/PC+ system files. Writing the obsolete SPSS/PC+ system file + format is not supported. + + * SYSFILE INFO can now read SPSS/PC+ system files and SPSS portable + files. + * FREQUENCIES: A bug was fixed where an assertion failure occured when an empty dataset was presented. diff --git a/doc/automake.mk b/doc/automake.mk index 7c349b4159..a5255e03fe 100644 --- a/doc/automake.mk +++ b/doc/automake.mk @@ -38,6 +38,7 @@ doc_pspp_dev_TEXINFOS = doc/version-dev.texi \ doc/dev/i18n.texi \ doc/dev/output.texi \ doc/dev/system-file-format.texi \ + doc/dev/pc+-file-format.texi \ doc/dev/portable-file-format.texi \ doc/dev/q2c.texi diff --git a/doc/dev/pc+-file-format.texi b/doc/dev/pc+-file-format.texi new file mode 100644 index 0000000000..f3f1a9d620 --- /dev/null +++ b/doc/dev/pc+-file-format.texi @@ -0,0 +1,362 @@ +@node SPSS/PC+ System File Format +@appendix SPSS/PC+ System File Format + +SPSS/PC+, first released in 1984, was a simplified version of SPSS for +IBM PC and compatible computers. It used a data file format related +to the one described in the previous chapter, but simplified and +incompatible. The SPSS/PC+ software became obsolete in the 1990s, so +files in this format are rarely encountered today. Nevertheless, for +completeness, and because it is not very difficult, it seems +worthwhile to support at least reading these files. This chapter +documents this format, based on examination of a corpus of about 60 +files from a variety of sources. 
+ +System files use four data types: 8-bit characters, 16-bit unsigned +integers, 32-bit unsigned integers, and 64-bit floating points, called +here @code{char}, @code{uint16}, @code{uint32}, and @code{flt64}, +respectively. Data is not necessarily aligned on a word or +double-word boundary. + +SPSS/PC+ ran only on IBM PC and compatible computers. Therefore, +values in these files are always in little-endian byte order. +Floating-point numbers are always in IEEE 754 format. + +SPSS/PC+ system files represent the system-missing value as -1.66e308, +or @code{f5 1e 26 02 8a 8c ed ff} expressed as hexadecimal. (This is +an unusual choice: it is close to, but not equal to, the largest +negative 64-bit IEEE 754 value, which is about -1.8e308.) + +Text in SPSS/PC+ system files is encoded in ASCII-based 8-bit MS DOS +codepages. The files in the corpus used for investigating the format were all +ASCII-only. + +An SPSS/PC+ system file begins with the following 256-byte directory: + +@example +uint32 two; +uint32 zero; +struct @{ + uint32 ofs; + uint32 len; +@} records[15]; +char filename[128]; +@end example + +@table @code +@item uint32 two; +@itemx uint32 zero; +Always set to 2 and 0, respectively. + +These fields could be used as a signature for the file format, but the +@code{product} field in record 0 seems more likely to be unique +(@pxref{Record 0 Main Header Record}). + +@item struct @{ @dots{} @} records[15]; +Each of the elements in this array identifies a record in the system +file. The @code{ofs} is a byte offset, from the beginning of the +file, that identifies the start of the record. @code{len} specifies +the length of the record, in bytes. Many records are optional or not +used. If a record is not present, @code{ofs} and @code{len} for that +record are both zero. + +@item char filename[128]; +In most files in the corpus, this field is entirely filled with +spaces.
In one file, it contains a file name, followed by a null +byte, followed by spaces to fill the remainder of the field. The +meaning is unknown. +@end table + +The following sections describe the contents of each record, +identified by the index into the @code{records} array. + +@menu +* Record 0 Main Header Record:: +* Record 1 Variables Record:: +* Record 2 Labels Record:: +* Record 3 Data Record:: +* Records 4 and 5 Data Entry:: +@end menu + +@node Record 0 Main Header Record +@section Record 0: Main Header Record + +All files in the corpus have this record at offset 0x100 with length +0xb0 (but readers should find this record, like the others, via the +@code{records} table in the directory). Its format is: + +@example +uint16 one0; +char product[62]; +flt64 sysmis; +uint32 zero0; +uint32 zero1; +uint16 one1; +uint16 compressed; +uint16 nominal_case_size; +uint32 n_cases0; +uint16 zero2; +uint32 n_cases1; +char creation_date[8]; +char creation_time[8]; +char label[64]; +@end example + +@table @code +@item uint16 one0; +@itemx uint16 one1; +Always set to 1. + +@item uint32 zero0; +@itemx uint32 zero1; +@itemx uint16 zero2; +Always set to 0. + +It seems likely that one of these variables is set to 1 if weighting +is enabled, but none of the files in the corpus is weighted. + +@item char product[62]; +Name of the program that created the file. Only the following unique +values have been observed, in each case padded on the right with +spaces: + +@example +DESPSS/PC+ System File Written by Data Entry II +PCSPSS SYSTEM FILE. IBM PC DOS, SPSS/PC+ +PCSPSS SYSTEM FILE. IBM PC DOS, SPSS/PC+ V3.0 +PCSPSS SYSTEM FILE. IBM PC DOS, SPSS for Windows +@end example + +Thus, it is reasonable to use the presence of the string @samp{SPSS} +at offset 0x104 as a simple test for an SPSS/PC+ data file. + +@item flt64 sysmis; +The system-missing value, as described previously (@pxref{SPSS/PC+ +System File Format}).
+ +@item uint16 compressed; +Set to 0 if the data in the file is not compressed, 1 if the data is +compressed with simple bytecode compression. + +@item uint16 nominal_case_size; +Number of data elements per case. This is the number of variables, +except that long string variables add extra data elements (one for +every 8 bytes after the first 8). String variables in SPSS/PC+ system +files are limited to 255 bytes. + +@item uint32 n_cases0; +@itemx uint32 n_cases1; +The number of cases in the data record. Both values are the same. +Some files in the corpus contain data for the number of cases noted +here, followed by garbage that somewhat resembles data. + +@item char creation_date[8]; +The date that the file was created, in @samp{mm/dd/yy} format. +Single-digit days and months are not prefixed by zeros. The string is +padded with spaces on the right or left or both, e.g. @samp{_2/4/93_}, +@samp{10/5/87_}, and @samp{_1/11/88} (with @samp{_} standing in for a +space) are all actual examples from the corpus. + +@item char creation_time[8]; +The time that the file was created, in @samp{HH:MM:SS} format. +Single-digit hours are padded on the left with a space. Minutes and +seconds are always written as two digits. + +@item char label[64]; +File label declared by the user, if any (@pxref{FILE LABEL,,,pspp, +PSPP Users Guide}). Padded on the right with spaces. +@end table + +@node Record 1 Variables Record +@section Record 1: Variables Record + +The variables record most commonly starts at offset 0x1b0, but it can +be placed elsewhere. The record contains instances of the following +32-byte structure: + +@example +uint32 value_label_start; +uint32 value_label_end; +uint32 var_label_ofs; +uint32 format; +char name[8]; +union @{ + flt64 f; + char s[8]; +@} missing; +@end example + +The number of instances is the @code{nominal_case_size} specified in +the main header record. There is one instance for each numeric +variable and each string variable with width 8 bytes or less.
String +variables wider than 8 bytes have one instance for each 8 bytes, +rounding up. The first instance for a long string specifies the +variable's correct dictionary information. Subsequent instances for a +long string are generally filled with all-zero bytes, although the +@code{missing} field contains the numeric system-missing value, and +some writers also fill in @code{var_label_ofs}, @code{format}, and +@code{name}, sometimes filling the latter with the numeric +system-missing value rather than a text string. Regardless of the +values used, readers should ignore the contents of these additional +instances for long strings. + +@table @code +@item uint32 value_label_start; +@itemx uint32 value_label_end; +For a variable with value labels, these specify offsets into the label +record of the start and end of this variable's value labels, +respectively. @xref{Record 2 Labels Record}, for more information. + +For a variable without any value labels, these are both zero. + +A long string variable may not have value labels. + +@item uint32 var_label_ofs; +For a variable with a variable label, this specifies an offset into +the label record. @xref{Record 2 Labels Record}, for more +information. + +For a variable without a variable label, this is zero. + +@item uint32 format; +The variable's output format, in the same format used in system files. +@xref{System File Output Formats}, for details. SPSS/PC+ system files +only use format types 5 (F, for numeric variables) and 1 (A, for +string variables). + +@item char name[8]; +The variable's name, padded on the right with spaces. + +@item union @{ @dots{} @} missing; +A user-missing value. For numeric variables, @code{missing.f} is the +variable's user-missing value. For string variables, @code{missing.s} +is a string missing value. A variable without a user-missing value is +indicated with @code{missing.f} set to the system-missing value, even +for string variables (!). 
A long string variable may not have a +missing value. +@end table + +In addition to the user-defined variables, every SPSS/PC+ system file +contains, as its first three variables, the following system-defined +variables, in the following order. The system-defined variables have +no variable label, value labels, or missing values. + +@table @code +@item $CASENUM +A numeric variable with format F8.0. Most of the time this is a +sequence number, starting with 1 for the first case and counting up +for each subsequent case. Some files skip over values, which probably +reflects cases that were deleted. + +@item $DATE +A string variable with format A8. Same format (including varying +padding) as the @code{creation_date} field in the main header record +(@pxref{Record 0 Main Header Record}). The actual date can differ +from @code{creation_date} and from case to case. This may reflect +when individual cases were added or updated. + +@item $WEIGHT +A numeric variable with format F8.2. This represents the case's +weight; SPSS/PC+ files do not have a user-defined weighting variable. +If weighting has not been enabled, every case has value 1.0. +@end table + +@node Record 2 Labels Record +@section Record 2: Labels Record + +The labels record holds value labels and variable labels. Unlike the +other records, it is not meant to be read directly and sequentially. +Instead, this record must be interpreted one piece at a time, by +following pointers from the variables record. + +The @code{value_label_start}, @code{value_label_end}, and +@code{var_label_ofs} fields in a variable record are all offsets +relative to the beginning of the labels record, with an additional +7-byte offset. That is, if the labels record starts at byte offset +@code{labels_ofs} and a variable has a given @code{var_label_ofs}, +then the variable label begins at byte offset @math{@code{labels_ofs} ++ @code{var_label_ofs} + 7} in the file.
+ +A variable label, starting at the offset indicated by +@code{var_label_ofs}, consists of a one-byte length followed by the +specified number of bytes of the variable label string, like this: + +@example +uint8 length; +char s[length]; +@end example + +A set of value labels, extending from @code{value_label_start} to +@code{value_label_end} (exclusive), consists of a numeric or string +value followed by a string in the format just described. String +values are padded on the right with spaces to fill the 8-byte field, +like this: + +@example +union @{ + flt64 f; + char s[8]; +@} value; +uint8 length; +char s[length]; +@end example + +The labels record begins with a pair of uint32 values. The first of +these is always 3. The second is between 8 and 16 less than the +number of bytes in the record. Neither value is important for +interpreting the file. + +@node Record 3 Data Record +@section Record 3: Data Record + +The format of the data record varies depending on the value of +@code{compressed} in the file header record: + +@table @asis +@item 0: no compression +Data is arranged as a series of 8-byte elements, one per variable +instance in the variables record (@pxref{Record 1 Variables +Record}). Numeric values are given in @code{flt64} format; string +values are literal character strings, padded on the right with spaces +when necessary to fill out 8-byte units. + +@item 1: bytecode compression +The first 8 bytes of the data record are divided into a series of +1-byte command codes. These codes have meanings as described below: + +@table @asis +@item 0 +The system-missing value. + +@item 1 +A numeric or string value that is not +compressible. The value is stored in the 8 bytes following the +current block of command bytes. If this value appears twice in a block +of command bytes, then it indicates the second group of 8 bytes following the +command bytes, and so on.
+ +@item 2 through 255 +A number with value @var{code} - 100, where @var{code} is the value of +the compression code. For example, code 105 indicates a numeric +variable of value 5. +@end table + +The end of the 8-byte group of bytecodes is followed by any 8-byte +blocks of non-compressible values indicated by code 1. After that +follows another 8-byte group of bytecodes, then those bytecodes' +non-compressible values. The pattern repeats until the number of +cases specified by the main header record has been seen. + +The corpus does not contain any files with command codes 2 through 95, +so it is possible that some of these codes are used for special +purposes. +@end table + +Cases of data often, but not always, fill the entire data record. +Readers should stop reading after the number of cases specified in the +main header record. Otherwise, readers may try to interpret garbage +following the data as additional cases. + +@node Records 4 and 5 Data Entry +@section Records 4 and 5: Data Entry + +Records 4 and 5 appear to be related to SPSS/PC+ Data Entry. diff --git a/doc/dev/system-file-format.texi b/doc/dev/system-file-format.texi index 0f0940b58f..d100aa9d24 100644 --- a/doc/dev/system-file-format.texi +++ b/doc/dev/system-file-format.texi @@ -178,7 +178,7 @@ contribute to this value beyond the first 255 bytes. Further, system files written by some systems set this value to -1. In general, it is unsafe for systems reading system files to rely upon this value. -@item int32 compressed; +@item int32 compression; Set to 0 if the data in the file is not compressed, 1 if the data is compressed with simple bytecode compression, 2 if the data is ZLIB compressed. This field has value 2 if and only if @code{rec_type} is @@ -352,6 +352,7 @@ in the range. When a range plus a value are present, the third element denotes the additional discrete missing value.
@end table +@anchor{System File Output Formats} The @code{print} and @code{write} members of sysfile_variable are output formats coded into @code{int32} types. The least-significant byte of the @code{int32} represents the number of decimal places, and the diff --git a/doc/files.texi b/doc/files.texi index 2e03fc9dd3..eb5a369e78 100644 --- a/doc/files.texi +++ b/doc/files.texi @@ -145,10 +145,9 @@ GET @cmd{GET} clears the current dictionary and active dataset and replaces them with the dictionary and data from a specified file. -The @subcmd{FILE} subcommand is the only required subcommand. -Specify the system -file or portable file to be read as a string file name or -a file handle (@pxref{File Handles}). +The @subcmd{FILE} subcommand is the only required subcommand. Specify +the SPSS system file, SPSS/PC+ system file, or SPSS portable file to +be read as a string file name or a file handle (@pxref{File Handles}). By default, all the variables in a file are read. The DROP subcommand can be used to specify a list of variables that are not to be @@ -175,10 +174,11 @@ Each may be present any number of times. @cmd{GET} never modifies a file on disk. Only the active dataset read from the file is affected by these subcommands. -@pspp{} tries to automatically detect the encoding of string data in the -file. Sometimes, however, this does not work well, -especially for files written by old versions of SPSS or @pspp{}. Specify -the @subcmd{ENCODING} subcommand with an @acronym{IANA} character set name as its string +@pspp{} automatically detects the encoding of string data in the file, +when possible. The character encoding of old SPSS system files cannot +always be guessed correctly, and SPSS/PC+ system files do not include +any indication of their encoding. Specify the @subcmd{ENCODING} +subcommand with an @acronym{IANA} character set name as its string argument to override the default. Use @cmd{SYSFILE INFO} to analyze the encodings that might be valid for a system file. 
The @subcmd{ENCODING} subcommand is a @pspp{} extension. @@ -914,20 +914,21 @@ qualifier character that appears within a value is doubled. SYSFILE INFO FILE='@var{file_name}' [ENCODING='@var{encoding}']. @end display -@cmd{SYSFILE INFO} reads the dictionary in a system file and -displays the information in its dictionary. - -Specify a file name or file handle. @cmd{SYSFILE INFO} reads that file as -a system file and displays information on its dictionary. - -@pspp{} tries to automatically detect the encoding of string data in -the file. Sometimes, however, this does not work well, especially for -files written by old versions of SPSS or @pspp{}. Specify the -@subcmd{ENCODING} subcommand with an @acronym{IANA} character set name -as its string argument to override the default, or specify -@code{ENCODING='DETECT'} to analyze and report possibly valid -encodings for the system file. The @subcmd{ENCODING} subcommand is a -@pspp{} extension. +@cmd{SYSFILE INFO} reads the dictionary in an SPSS system file, +SPSS/PC+ system file, or SPSS portable file, and displays the +information in its dictionary. + +Specify a file name or file handle. @cmd{SYSFILE INFO} reads that +file and displays information on its dictionary. + +@pspp{} automatically detects the encoding of string data in the file, +when possible. The character encoding of old SPSS system files cannot +always be guessed correctly, and SPSS/PC+ system files do not include +any indication of their encoding. Specify the @subcmd{ENCODING} +subcommand with an @acronym{IANA} character set name as its string +argument to override the default, or specify @code{ENCODING='DETECT'} +to analyze and report possibly valid encodings for the system file. +The @subcmd{ENCODING} subcommand is a @pspp{} extension. @cmd{SYSFILE INFO} does not affect the current active dataset. 
diff --git a/doc/pspp-convert.texi b/doc/pspp-convert.texi index 328e9caa02..83aa8ae4e9 100644 --- a/doc/pspp-convert.texi +++ b/doc/pspp-convert.texi @@ -16,10 +16,11 @@ Synopsis: @t{pspp-convert -@w{-}version} @end display -The format of @var{Iinput} is automatically detected, except that the -character encoding of old system files cannot always be guessed -correctly. Use @code{-e @var{encoding}} to specify the encoding in this -case. +The format of @var{Iinput} is automatically detected, when possible. +The character encoding of old SPSS system files cannot always be +guessed correctly, and SPSS/PC+ system files do not include any +indication of their encoding. Use @code{-e @var{encoding}} to specify +the encoding in this case. By default, the intended format for @var{output} is inferred based on its extension: @@ -60,8 +61,8 @@ Specifying this option to limit the number of cases written to @item -e @var{charset} @itemx --encoding=@var{charset} Overrides the encoding in which character strings in @var{input} are -interpreted. This option is necessary because old SPSS system files -do not self-identify their encoding. +interpreted. This option is necessary because old SPSS system files, +and SPSS/PC+ system files, do not self-identify their encoding. @item -h @itemx --help diff --git a/doc/pspp-dev.texi b/doc/pspp-dev.texi index 15fe727fa4..80983a951f 100644 --- a/doc/pspp-dev.texi +++ b/doc/pspp-dev.texi @@ -83,6 +83,7 @@ Free Documentation License". * Portable File Format:: Format of PSPP portable files. * System File Format:: Format of PSPP system files. +* SPSS/PC+ System File Format:: Format of SPSS/PC+ system files. * q2c Input Format:: Format of syntax accepted by q2c. * GNU Free Documentation License:: License for copying this manual. @@ -100,6 +101,7 @@ Free Documentation License". 
@include dev/portable-file-format.texi @include dev/system-file-format.texi +@include dev/pc+-file-format.texi @include dev/q2c.texi @include fdl.texi diff --git a/perl-module/PSPP.xs b/perl-module/PSPP.xs index e600f7b45d..7577b7ad2c 100644 --- a/perl-module/PSPP.xs +++ b/perl-module/PSPP.xs @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include @@ -78,7 +77,7 @@ struct syswriter_info /* A thin wrapper around sfm_reader */ struct sysreader_info { - struct sfm_read_info opts; + struct any_read_info opts; /* A pointer to the reader. The reader is owned by the struct */ struct casereader *reader; @@ -633,8 +632,8 @@ INIT: opts.create_writeable = readonly ? ! SvIV (*readonly) : true; opts.compression = (compress && SvIV (*compress) - ? SFM_COMP_SIMPLE - : SFM_COMP_NONE); + ? ANY_COMP_SIMPLE + : ANY_COMP_NONE); opts.version = version ? SvIV (*version) : 3 ; } CODE: @@ -755,26 +754,16 @@ CODE: struct file_handle *fh = fh_create_file (NULL, name, fh_default_properties () ); struct dictionary *dict; - struct sfm_reader *r; sri = xmalloc (sizeof (*sri)); - r = sfm_open (fh); - if (r) - { - sri->reader = sfm_decode (r, NULL, &dict, &sri->opts); - if (sri->reader) - sri->dict = create_pspp_dict (dict); - else - { - free (sri); - sri = NULL; - } - } + sri->reader = any_reader_open_and_decode (fh, NULL, &dict, &sri->opts); + if (sri->reader) + sri->dict = create_pspp_dict (dict); else { free (sri); sri = NULL; - } + } RETVAL = sri; OUTPUT: diff --git a/perl-module/t/Pspp.t b/perl-module/t/Pspp.t index 3f8a711a34..c2c9dbde0a 100644 --- a/perl-module/t/Pspp.t +++ b/perl-module/t/Pspp.t @@ -522,7 +522,7 @@ SYNTAX ok ( !ref $sf, "Returns undef on opening failure"); - ok ("$PSPP::errstr" eq "Error opening `$tempdir/no-such-file.sav' for reading as a system file: No such file or directory.", + ok ("$PSPP::errstr" eq "An error occurred while opening `$tempdir/no-such-file.sav': No such file or directory.", "Error string on open failure"); } diff --git 
a/src/data/any-reader.c b/src/data/any-reader.c index ad3fcb5b7a..aee034ce31 100644 --- a/src/data/any-reader.c +++ b/src/data/any-reader.c @@ -24,12 +24,13 @@ #include #include -#include "data/dataset-reader.h" +#include "data/casereader.h" +#include "data/dataset.h" +#include "data/dictionary.h" #include "data/file-handle-def.h" #include "data/file-name.h" -#include "data/por-file-reader.h" -#include "data/sys-file-reader.h" #include "libpspp/assertion.h" +#include "libpspp/cast.h" #include "libpspp/message.h" #include "libpspp/str.h" @@ -37,84 +38,85 @@ #include "gettext.h" #define _(msgid) gettext (msgid) +#define N_(msgid) (msgid) -/* Tries to detect whether FILE is a given type of file, by opening the file - and passing it to DETECT, and returns a detect_result. */ -static enum detect_result -try_detect (const char *file_name, bool (*detect) (FILE *)) +static const struct any_reader_class dataset_reader_class; + +static const struct any_reader_class *classes[] = + { + &sys_file_reader_class, + &por_file_reader_class, + &pcp_file_reader_class, + }; +enum { N_CLASSES = sizeof classes / sizeof *classes }; + +int +any_reader_detect (const char *file_name, + const struct any_reader_class **classp) { + struct detector + { + enum any_type type; + int (*detect) (FILE *); + }; + FILE *file; - bool is_type; + int retval; + + if (classp) + *classp = NULL; file = fn_open (file_name, "rb"); if (file == NULL) { msg (ME, _("An error occurred while opening `%s': %s."), file_name, strerror (errno)); - return ANY_ERROR; + return -errno; } - is_type = detect (file); - - fn_close (file_name, file); + retval = 0; + for (int i = 0; i < N_CLASSES; i++) + { + int rc = classes[i]->detect (file); + if (rc == 1) + { + retval = 1; + if (classp) + *classp = classes[i]; + break; + } + else if (rc < 0) + retval = rc; + } - return is_type ? 
ANY_YES : ANY_NO; -} + if (retval < 0) + msg (ME, _("Error reading `%s': %s."), file_name, strerror (-retval)); -/* Returns true if any_reader_open() would be able to open FILE as a data - file, false otherwise. */ -enum detect_result -any_reader_may_open (const char *file) -{ - enum detect_result res = try_detect (file, sfm_detect); - - if (res == ANY_NO) - res = try_detect (file, pfm_detect); + fn_close (file_name, file); - return res; + return retval; } -/* Returns a casereader for HANDLE. On success, returns the new - casereader and stores the file's dictionary into *DICT. On - failure, returns a null pointer. - - Ordinarily the reader attempts to automatically detect the character - encoding based on the file's contents. This isn't always possible, - especially for files written by old versions of SPSS or PSPP, so specifying - a nonnull ENCODING overrides the choice of character encoding. */ -struct casereader * -any_reader_open (struct file_handle *handle, const char *encoding, - struct dictionary **dict) +struct any_reader * +any_reader_open (struct file_handle *handle) { switch (fh_get_referent (handle)) { case FH_REF_FILE: { - enum detect_result result; + const struct any_reader_class *class; + int retval; - result = try_detect (fh_get_file_name (handle), sfm_detect); - if (result == ANY_ERROR) - return NULL; - else if (result == ANY_YES) + retval = any_reader_detect (fh_get_file_name (handle), &class); + if (retval <= 0) { - struct sfm_reader *r; - - r = sfm_open (handle); - if (r == NULL) - return NULL; - - return sfm_decode (r, encoding, dict, NULL); + if (retval == 0) + msg (SE, _("`%s' is not a system or portable file."), + fh_get_file_name (handle)); + return NULL; } - result = try_detect (fh_get_file_name (handle), pfm_detect); - if (result == ANY_ERROR) - return NULL; - else if (result == ANY_YES) - return pfm_open_reader (handle, dict, NULL); - - msg (SE, _("`%s' is not a system or portable file."), - fh_get_file_name (handle)); - return NULL; + 
return class->open (handle); } case FH_REF_INLINE: @@ -122,7 +124,139 @@ any_reader_open (struct file_handle *handle, const char *encoding, return NULL; case FH_REF_DATASET: - return dataset_reader_open (handle, dict); + return dataset_reader_class.open (handle); } NOT_REACHED (); } + +bool +any_reader_close (struct any_reader *any_reader) +{ + return any_reader ? any_reader->klass->close (any_reader) : true; +} + +struct casereader * +any_reader_decode (struct any_reader *any_reader, + const char *encoding, + struct dictionary **dictp, + struct any_read_info *info) +{ + const struct any_reader_class *class = any_reader->klass; + struct casereader *reader; + + reader = any_reader->klass->decode (any_reader, encoding, dictp, info); + if (reader && info) + info->klass = class; + return reader; +} + +size_t +any_reader_get_strings (const struct any_reader *any_reader, struct pool *pool, + char ***labels, bool **ids, char ***values) +{ + return (any_reader->klass->get_strings + ? any_reader->klass->get_strings (any_reader, pool, labels, ids, + values) + : 0); +} + +struct casereader * +any_reader_open_and_decode (struct file_handle *handle, + const char *encoding, + struct dictionary **dictp, + struct any_read_info *info) +{ + struct any_reader *any_reader = any_reader_open (handle); + return (any_reader + ? any_reader_decode (any_reader, encoding, dictp, info) + : NULL); +} + +struct dataset_reader + { + struct any_reader any_reader; + struct dictionary *dict; + struct casereader *reader; + }; + +/* Opens FH, which must have referent type FH_REF_DATASET, and returns an + any_reader for it, or a null pointer on failure. The reader holds its own + clones of the dataset's dictionary and casereader: dataset_reader_decode() + hands both to the caller, and dataset_reader_close() destroys them.
*/ +static struct any_reader * +dataset_reader_open (struct file_handle *fh) +{ + struct dataset_reader *reader; + struct dataset *ds; + + /* We don't bother doing fh_lock or fh_ref on the file handle, + as there's no advantage in this case, and doing these would + require us to keep track of the "struct file_handle" and + "struct fh_lock" and undo our work later. */ + assert (fh_get_referent (fh) == FH_REF_DATASET); + + ds = fh_get_dataset (fh); + if (ds == NULL || !dataset_has_source (ds)) + { + msg (SE, _("Cannot read from dataset %s because no dictionary or data " + "has been written to it yet."), + fh_get_name (fh)); + return NULL; + } + + reader = xmalloc (sizeof *reader); + reader->any_reader.klass = &dataset_reader_class; + reader->dict = dict_clone (dataset_dict (ds)); + reader->reader = casereader_clone (dataset_source (ds)); + return &reader->any_reader; +} + +static struct dataset_reader * +dataset_reader_cast (const struct any_reader *r_) +{ + assert (r_->klass == &dataset_reader_class); + return UP_CAST (r_, struct dataset_reader, any_reader); +} + +static bool +dataset_reader_close (struct any_reader *r_) +{ + struct dataset_reader *r = dataset_reader_cast (r_); + dict_destroy (r->dict); + casereader_destroy (r->reader); + free (r); + + return true; +} + +static struct casereader * +dataset_reader_decode (struct any_reader *r_, const char *encoding UNUSED, + struct dictionary **dictp, struct any_read_info *info) +{ + struct dataset_reader *r = dataset_reader_cast (r_); + struct casereader *reader; + + *dictp = r->dict; + reader = r->reader; + if (info) + { + memset (info, 0, sizeof *info); + info->integer_format = INTEGER_NATIVE; + info->float_format = FLOAT_NATIVE_DOUBLE; + info->compression = ANY_COMP_NONE; + info->case_cnt = casereader_get_case_cnt (reader); + } + free (r); + + return reader; +} + +static const struct any_reader_class dataset_reader_class = + { + N_("Dataset"), + NULL, + dataset_reader_open, + dataset_reader_close, + 
dataset_reader_decode, + NULL, + }; diff --git a/src/data/any-reader.h b/src/data/any-reader.h index 063a7e650e..5614a60075 100644 --- a/src/data/any-reader.h +++ b/src/data/any-reader.h @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2006, 2010, 2012 Free Software Foundation, Inc. + Copyright (C) 2006, 2010, 2012, 2014 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,20 +18,97 @@ #define ANY_READER_H 1 #include +#include +#include "data/case.h" +#include "libpspp/float-format.h" +#include "libpspp/integer-format.h" -/* Result of type detection. */ -enum detect_result +struct any_read_info; +struct dictionary; +struct file_handle; + +struct any_reader + { + const struct any_reader_class *klass; + }; + +struct any_reader_class { - ANY_YES, /* It is this type. */ - ANY_NO, /* It is not this type. */ - ANY_ERROR /* File couldn't be opened. */ + const char *name; + + int (*detect) (FILE *); + + struct any_reader *(*open) (struct file_handle *); + bool (*close) (struct any_reader *); + struct casereader *(*decode) (struct any_reader *, const char *encoding, + struct dictionary **, + struct any_read_info *); + size_t (*get_strings) (const struct any_reader *, struct pool *pool, + char ***labels, bool **ids, char ***values); }; +extern const struct any_reader_class sys_file_reader_class; +extern const struct any_reader_class por_file_reader_class; +extern const struct any_reader_class pcp_file_reader_class; + +enum any_type + { + ANY_SYS, /* SPSS System File. */ + ANY_PCP, /* SPSS/PC+ System File. */ + ANY_POR, /* SPSS Portable File. */ + }; + +enum any_compression + { + ANY_COMP_NONE, /* No compression. */ + ANY_COMP_SIMPLE, /* Bytecode compression of integer values. */ + ANY_COMP_ZLIB /* ZLIB "deflate" compression. */ + }; + +/* Data file info that doesn't fit in struct dictionary. 
+ + The strings in this structure are encoded in UTF-8. (They are normally in + the ASCII subset of UTF-8.) */ +struct any_read_info + { + const struct any_reader_class *klass; + char *creation_date; + char *creation_time; + enum integer_format integer_format; + enum float_format float_format; + enum any_compression compression; + casenumber case_cnt; /* -1 if unknown. */ + char *product; /* Product name. */ + char *product_ext; /* Extra product info. */ + + /* Writer's version number in X.Y.Z format. + The version number is not always present; if not, then + all of these are set to 0. */ + int version_major; /* X. */ + int version_minor; /* Y. */ + int version_revision; /* Z. */ + }; + +void any_read_info_destroy (struct any_read_info *); struct file_handle; struct dictionary; -enum detect_result any_reader_may_open (const char *file_name); -struct casereader *any_reader_open (struct file_handle *, const char *encoding, - struct dictionary **); + +int any_reader_detect (const char *file_name, + const struct any_reader_class **); + +struct any_reader *any_reader_open (struct file_handle *); +bool any_reader_close (struct any_reader *); +struct casereader *any_reader_decode (struct any_reader *, + const char *encoding, + struct dictionary **, + struct any_read_info *); +size_t any_reader_get_strings (const struct any_reader *, struct pool *pool, + char ***labels, bool **ids, char ***values); + +struct casereader *any_reader_open_and_decode (struct file_handle *, + const char *encoding, + struct dictionary **, + struct any_read_info *); #endif /* any-reader.h */ diff --git a/src/data/automake.mk b/src/data/automake.mk index 4ad8a23738..8b26f525e4 100644 --- a/src/data/automake.mk +++ b/src/data/automake.mk @@ -50,8 +50,6 @@ src_data_libdata_la_SOURCES = \ src/data/data-out.h \ src/data/dataset.c \ src/data/dataset.h \ - src/data/dataset-reader.c \ - src/data/dataset-reader.h \ src/data/dataset-writer.c \ src/data/dataset-writer.h \ src/data/datasheet.c \ @@ -84,8 
+82,8 @@ src_data_libdata_la_SOURCES = \ src/data/mrset.h \ src/data/ods-reader.c \ src/data/ods-reader.h \ + src/data/pc+-file-reader.c \ src/data/por-file-reader.c \ - src/data/por-file-reader.h \ src/data/por-file-writer.c \ src/data/por-file-writer.h \ src/data/psql-reader.c \ @@ -106,10 +104,8 @@ src_data_libdata_la_SOURCES = \ src/data/sys-file-private.c \ src/data/sys-file-private.h \ src/data/sys-file-reader.c \ - src/data/sys-file-reader.h \ src/data/sys-file-writer.c \ src/data/sys-file-writer.h \ - src/data/sys-file.h \ src/data/transformations.c \ src/data/transformations.h \ src/data/val-type.h \ diff --git a/src/data/dataset-reader.c b/src/data/dataset-reader.c deleted file mode 100644 index b679342a2f..0000000000 --- a/src/data/dataset-reader.c +++ /dev/null @@ -1,62 +0,0 @@ -/* PSPP - a program for statistical analysis. - Copyright (C) 2006, 2010, 2011 Free Software Foundation, Inc. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
*/ - -#include - -#include "data/dataset-reader.h" - -#include - -#include "data/case.h" -#include "data/casereader.h" -#include "data/dataset.h" -#include "data/dictionary.h" -#include "data/file-handle-def.h" -#include "libpspp/assertion.h" -#include "libpspp/message.h" - -#include "gl/xalloc.h" - -#include "gettext.h" -#define _(msgid) gettext (msgid) - -/* Opens FH, which must have referent type FH_REF_DATASET, and returns a - dataset_reader for it, or a null pointer on failure. Stores a copy of the - dictionary for the dataset file into *DICT. The caller takes ownership of - the casereader and the dictionary. */ -struct casereader * -dataset_reader_open (struct file_handle *fh, struct dictionary **dict) -{ - struct dataset *ds; - - /* We don't bother doing fh_lock or fh_ref on the file handle, - as there's no advantage in this case, and doing these would - require us to keep track of the "struct file_handle" and - "struct fh_lock" and undo our work later. */ - assert (fh_get_referent (fh) == FH_REF_DATASET); - - ds = fh_get_dataset (fh); - if (ds == NULL || !dataset_has_source (ds)) - { - msg (SE, _("Cannot read from dataset %s because no dictionary or data " - "has been written to it yet."), - fh_get_name (fh)); - return NULL; - } - - *dict = dict_clone (dataset_dict (ds)); - return casereader_clone (dataset_source (ds)); -} diff --git a/src/data/dataset-reader.h b/src/data/dataset-reader.h deleted file mode 100644 index 420b6b1b99..0000000000 --- a/src/data/dataset-reader.h +++ /dev/null @@ -1,27 +0,0 @@ -/* PSPP - a program for statistical analysis. - Copyright (C) 2006, 2009, 2010 Free Software Foundation, Inc. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . */ - -#ifndef DATASET_READER_H -#define DATASET_READER_H 1 - -#include - -struct dictionary; -struct file_handle; -struct casereader *dataset_reader_open (struct file_handle *, - struct dictionary **); - -#endif /* dataset-reader.h */ diff --git a/src/data/pc+-file-reader.c b/src/data/pc+-file-reader.c new file mode 100644 index 0000000000..a127323c68 --- /dev/null +++ b/src/data/pc+-file-reader.c @@ -0,0 +1,1343 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 1997-2000, 2006-2007, 2009-2014 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
*/ + +#include + +#include +#include +#include +#include +#include + +#include "data/any-reader.h" +#include "data/case.h" +#include "data/casereader-provider.h" +#include "data/casereader.h" +#include "data/dictionary.h" +#include "data/file-handle-def.h" +#include "data/file-name.h" +#include "data/format.h" +#include "data/identifier.h" +#include "data/missing-values.h" +#include "data/value-labels.h" +#include "data/value.h" +#include "data/variable.h" +#include "libpspp/float-format.h" +#include "libpspp/i18n.h" +#include "libpspp/integer-format.h" +#include "libpspp/message.h" +#include "libpspp/misc.h" +#include "libpspp/pool.h" +#include "libpspp/str.h" + +#include "gl/localcharset.h" +#include "gl/minmax.h" +#include "gl/xalloc.h" +#include "gl/xsize.h" + +#include "gettext.h" +#define _(msgid) gettext (msgid) +#define N_(msgid) (msgid) + +struct pcp_dir_entry + { + unsigned int ofs; + unsigned int len; + }; + +struct pcp_directory + { + struct pcp_dir_entry main; + struct pcp_dir_entry variables; + struct pcp_dir_entry labels; + struct pcp_dir_entry data; + }; + +struct pcp_main_header + { + char product[63]; /* "PCSPSS SYSTEM FILE..." */ + unsigned int nominal_case_size; /* Number of var positions. */ + char creation_date[9]; /* "[m]m/dd/yy". */ + char creation_time[9]; /* "[H]H:MM:SS". */ + char file_label[65]; /* File label. */ + }; + +struct pcp_var_record + { + unsigned int pos; + + char name[9]; + int width; + struct fmt_spec format; + uint8_t missing[8]; + char *label; + + struct pcp_value_label *val_labs; + size_t n_val_labs; + + struct variable *var; + }; + +struct pcp_value_label + { + uint8_t value[8]; + char *label; + }; + +/* System file reader. */ +struct pcp_reader + { + struct any_reader any_reader; + + /* Resource tracking. */ + struct pool *pool; /* All system file state. */ + + /* File data. 
*/ + unsigned int file_size; + struct any_read_info info; + struct pcp_directory directory; + struct pcp_main_header header; + struct pcp_var_record *vars; + size_t n_vars; + + /* File state. */ + struct file_handle *fh; /* File handle. */ + struct fh_lock *lock; /* Mutual exclusion for file handle. */ + FILE *file; /* File stream. */ + unsigned int pos; /* Position in file. */ + bool error; /* I/O or corruption error? */ + struct caseproto *proto; /* Format of output cases. */ + + /* File format. */ + unsigned int n_cases; /* Number of cases */ + const char *encoding; /* String encoding. */ + + /* Decompression. */ + bool compressed; + uint8_t opcodes[8]; /* Current block of opcodes. */ + size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */ + bool corruption_warning; /* Warned about possible corruption? */ + }; + +static struct pcp_reader * +pcp_reader_cast (const struct any_reader *r_) +{ + assert (r_->klass == &pcp_file_reader_class); + return UP_CAST (r_, struct pcp_reader, any_reader); +} + +static const struct casereader_class pcp_file_casereader_class; + +static bool pcp_close (struct any_reader *); + +static bool read_variables_record (struct pcp_reader *); + +static void pcp_msg (struct pcp_reader *r, off_t, int class, + const char *format, va_list args) + PRINTF_FORMAT (4, 0); +static void pcp_warn (struct pcp_reader *, off_t, const char *, ...) + PRINTF_FORMAT (3, 4); +static void pcp_error (struct pcp_reader *, off_t, const char *, ...) 
+ PRINTF_FORMAT (3, 4); + +static bool read_bytes (struct pcp_reader *, void *, size_t) + WARN_UNUSED_RESULT; +static int try_read_bytes (struct pcp_reader *, void *, size_t) + WARN_UNUSED_RESULT; +static bool read_uint16 (struct pcp_reader *, unsigned int *) + WARN_UNUSED_RESULT; +static bool read_uint32 (struct pcp_reader *, unsigned int *) + WARN_UNUSED_RESULT; +static bool read_float (struct pcp_reader *, double *) + WARN_UNUSED_RESULT; +static double parse_float (const uint8_t number[8]); +static bool read_string (struct pcp_reader *, char *, size_t) + WARN_UNUSED_RESULT; +static bool skip_bytes (struct pcp_reader *, size_t) WARN_UNUSED_RESULT; + +static bool pcp_seek (struct pcp_reader *, off_t); + +static bool pcp_is_sysmis(const uint8_t *); + +/* Dictionary reader. */ + +static bool read_dictionary (struct pcp_reader *); +static bool read_main_header (struct pcp_reader *, struct pcp_main_header *); +static void parse_header (struct pcp_reader *, + const struct pcp_main_header *, + struct any_read_info *, struct dictionary *); +static bool parse_variable_records (struct pcp_reader *, struct dictionary *, + struct pcp_var_record *, size_t n); + +/* Tries to open FH for reading as an SPSS/PC+ system file. Returns a + pcp_reader if successful, otherwise NULL. */ +static struct any_reader * +pcp_open (struct file_handle *fh) +{ + struct pcp_reader *r; + struct stat s; + + /* Create and initialize reader. */ + r = xzalloc (sizeof *r); + r->any_reader.klass = &pcp_file_reader_class; + r->pool = pool_create (); + pool_register (r->pool, free, r); + r->fh = fh_ref (fh); + r->opcode_idx = sizeof r->opcodes; + + /* TRANSLATORS: this fragment will be interpolated into + messages in fh_lock() that identify types of files. */ + r->lock = fh_lock (fh, FH_REF_FILE, N_("SPSS/PC+ system file"), + FH_ACC_READ, false); + if (r->lock == NULL) + goto error; + + /* Open file. 
*/
+  r->file = fn_open (fh_get_file_name (fh), "rb");
+  if (r->file == NULL)
+    {
+      msg (ME, _("Error opening `%s' for reading as an SPSS/PC+ "
+                 "system file: %s."),
+           fh_get_file_name (r->fh), strerror (errno));
+      goto error;
+    }
+
+  /* Fetch file size.  Directory offsets and lengths are read as 32-bit
+     values and r->file_size is an unsigned int, so larger files are
+     rejected.
+
+     pcp_error() takes the reader as its first argument, not a message
+     class such as ME. */
+  if (fstat (fileno (r->file), &s))
+    {
+      pcp_error (r, 0, _("%s: stat failed (%s)."),
+                 fh_get_file_name (r->fh), strerror (errno));
+      goto error;
+    }
+  if (s.st_size > UINT_MAX)
+    {
+      pcp_error (r, 0, _("%s: file too large."), fh_get_file_name (r->fh));
+      goto error;
+    }
+  r->file_size = s.st_size;
+
+  /* Read dictionary. */
+  if (!read_dictionary (r))
+    goto error;
+
+  /* Leave the file positioned at the start of the data record. */
+  if (!pcp_seek (r, r->directory.data.ofs))
+    goto error;
+
+  return &r->any_reader;
+
+error:
+  pcp_close (&r->any_reader);
+  return NULL;
+}
+
+/* Reads one directory entry, a 32-bit offset followed by a 32-bit length,
+   from R into *DE, and checks that the record that it describes lies
+   entirely within the file.
+
+   Returns true if successful, false if reading fails or if the entry
+   extends past the end of the file (in which case an error is reported). */
+static bool
+pcp_read_dir_entry (struct pcp_reader *r, struct pcp_dir_entry *de)
+{
+  if (!read_uint32 (r, &de->ofs) || !read_uint32 (r, &de->len))
+    return false;
+
+  /* Written so that neither comparison can wrap around: the subtraction is
+     only evaluated once de->len <= r->file_size is known. */
+  if (de->len > r->file_size || de->ofs > r->file_size - de->len)
+    {
+      pcp_error (r, r->pos - 8, _("Directory entry is for a %u-byte record "
+                                  "starting at offset %u but file is only "
+                                  "%u bytes long."),
+                 de->len, de->ofs, r->file_size);
+      return false;
+    }
+
+  return true;
+}
+
+/* Reads the directory from R's current position (the start of the file when
+   called from pcp_open()), then the main header (record 0) and the variables
+   record (record 1).
+
+   Returns true if successful, false if any part could not be read or was
+   rejected as invalid. */
+static bool
+read_dictionary (struct pcp_reader *r)
+{
+  unsigned int two, zero;
+
+  if (!read_uint32 (r, &two) || !read_uint32 (r, &zero))
+    return false;
+  if (two != 2 || zero != 0)
+    pcp_warn (r, 0, _("Directory fields have unexpected values "
+                      "(%u,%u)."), two, zero);
+
+  if (!pcp_read_dir_entry (r, &r->directory.main)
+      || !pcp_read_dir_entry (r, &r->directory.variables)
+      || !pcp_read_dir_entry (r, &r->directory.labels)
+      || !pcp_read_dir_entry (r, &r->directory.data))
+    return false;
+
+  if (!read_main_header (r, &r->header))
+    return false;
+
+  /* A failure here leaves r->vars incomplete, so it must not be ignored. */
+  if (!read_variables_record (r))
+    return false;
+
+  return true;
+}
+
+/* Accumulator for the parallel arrays built up by add_string__(). */
+struct get_strings_aux
+  {
+    struct pool *pool;          /* Pool that owns the arrays and strings. */
+    char **titles;              /* Description of each string. */
+    char **strings;             /* The strings themselves. */
+    bool *ids;                  /* True for each string that is an identifier. */
+    size_t allocated;           /* Number of elements allocated in each array. */
+    size_t n;                   /* Number of elements in use. */
+  };
+
+static void
+add_string__ (struct get_strings_aux *aux,
+              const char *string, bool id, char *title)
+{
+  if (aux->n >= aux->allocated)
+    {
+      aux->allocated = 2 * (aux->allocated + 1);
+      aux->titles = pool_realloc (aux->pool, aux->titles,
+                                  aux->allocated * sizeof *aux->titles);
+      aux->strings = pool_realloc (aux->pool, aux->strings,
+                                   aux->allocated * sizeof *aux->strings);
+      aux->ids = pool_realloc (aux->pool, aux->ids,
+                               aux->allocated * sizeof *aux->ids);
+    }
+
+  aux->titles[aux->n] = title;
+  aux->strings[aux->n] = pool_strdup (aux->pool, string);
+  aux->ids[aux->n] = id;
+  aux->n++;
+}
+
+static void PRINTF_FORMAT (3, 4)
+add_string (struct get_strings_aux *aux,
+            const char *string, const char *title, ...)
+{
+  va_list args;
+
+  va_start (args, title);
+  add_string__ (aux, string, false, pool_vasprintf (aux->pool, title, args));
+  va_end (args);
+}
+
+static void PRINTF_FORMAT (3, 4)
+add_id (struct get_strings_aux *aux, const char *id, const char *title, ...)
+{
+  va_list args;
+
+  va_start (args, title);
+  add_string__ (aux, id, true, pool_vasprintf (aux->pool, title, args));
+  va_end (args);
+}
+
+/* Retrieves significant string data from R in its raw format, to allow the
+   caller to try to detect the encoding in use.
+
+   Returns the number of strings retrieved N.  Sets each of *TITLESP, *IDSP,
+   and *STRINGSP to an array of N elements allocated from POOL.  For each I in
+   0...N-1, UTF-8 string (*TITLESP)[I] describes (*STRINGSP)[I], which is in
+   whatever encoding system file R uses.  (*IDSP)[I] is true if
+   (*STRINGSP)[I] must be a valid PSPP language identifier, false if
+   (*STRINGSP)[I] is free-form text.
*/
+static size_t
+pcp_get_strings (const struct any_reader *r_, struct pool *pool,
+                 char ***titlesp, bool **idsp, char ***stringsp)
+{
+  struct pcp_reader *r = pcp_reader_cast (r_);
+  struct get_strings_aux aux;
+  size_t var_idx;
+  size_t i, j;
+
+  aux.pool = pool;
+  aux.titles = NULL;
+  aux.strings = NULL;
+  aux.ids = NULL;
+  aux.allocated = 0;
+  aux.n = 0;
+
+  /* First pass: variable names, which must be valid identifiers. */
+  var_idx = 0;
+  for (i = 0; i < r->n_vars; i++)
+    if (r->vars[i].width != -1)
+      add_id (&aux, r->vars[i].name, _("Variable %zu"), ++var_idx);
+
+  /* Second pass: variable labels and value labels, which are free-form
+     text.  VAR_IDX is counted the same way as in the first pass so that the
+     titles refer to the same variables. */
+  var_idx = 0;
+  for (i = 0; i < r->n_vars; i++)
+    if (r->vars[i].width != -1)
+      {
+        var_idx++;
+        if (r->vars[i].label)
+          add_string (&aux, r->vars[i].label, _("Variable %zu Label"),
+                      var_idx);
+
+        /* Each value label has its own text.  The variable's label must not
+           be used here: it is a different string and it may be null. */
+        for (j = 0; j < r->vars[i].n_val_labs; j++)
+          add_string (&aux, r->vars[i].val_labs[j].label,
+                      _("Variable %zu Value Label %zu"), var_idx, j);
+      }
+
+  add_string (&aux, r->header.creation_date, _("Creation Date"));
+  add_string (&aux, r->header.creation_time, _("Creation Time"));
+  add_string (&aux, r->header.product, _("Product"));
+  add_string (&aux, r->header.file_label, _("File Label"));
+
+  *titlesp = aux.titles;
+  *idsp = aux.ids;
+  *stringsp = aux.strings;
+  return aux.n;
+}
+
+/* Deletes the variable named NAME from DICT, if there is one. */
+static void
+find_and_delete_var (struct dictionary *dict, const char *name)
+{
+  struct variable *var = dict_lookup_var (dict, name);
+  if (var)
+    dict_delete_var (dict, var);
+}
+
+/* Decodes the dictionary read from R, saving it into *DICTP.  Character
+   strings in R are decoded using ENCODING, or the locale encoding (with a
+   warning) if ENCODING is null.
+
+   If INFOP is non-null, then it receives additional info about the system
+   file, which the caller must eventually free with any_read_info_destroy()
+   when it is no longer needed.
+
+   This function consumes R.  The caller must not use it again later, even to
+   destroy it with pcp_close().
*/
+static struct casereader *
+pcp_decode (struct any_reader *r_, const char *encoding,
+            struct dictionary **dictp, struct any_read_info *infop)
+{
+  struct pcp_reader *r = pcp_reader_cast (r_);
+  struct dictionary *dict;
+
+  if (encoding == NULL)
+    {
+      encoding = locale_charset ();
+      pcp_warn (r, -1, _("Using default encoding %s to read this SPSS/PC+ "
+                         "system file. For best results, specify an "
+                         "encoding explicitly. Use SYSFILE INFO with "
+                         "ENCODING=\"DETECT\" to analyze the possible "
+                         "encodings."),
+                encoding);
+    }
+
+  dict = dict_create (encoding);
+  r->encoding = dict_get_encoding (dict);
+
+  parse_header (r, &r->header, &r->info, dict);
+  if (!parse_variable_records (r, dict, r->vars, r->n_vars))
+    goto error;
+
+  /* Create an index of dictionary variable widths for
+     pcp_file_casereader_read() to use.  We cannot use the `struct variable's
+     from the dictionary we created, because the caller owns the
+     dictionary and may destroy or modify its variables.
+
+     The prototype is captured here, before the find_and_delete_var() calls
+     below remove variables from the dictionary. */
+  r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
+
+  find_and_delete_var (dict, "CASENUM_");
+  find_and_delete_var (dict, "DATE_");
+  find_and_delete_var (dict, "WEIGHT_");
+
+  *dictp = dict;
+  if (infop)
+    {
+      *infop = r->info;
+      memset (&r->info, 0, sizeof r->info);
+    }
+
+  return casereader_create_sequential
+    (NULL, r->proto, r->n_cases, &pcp_file_casereader_class, r);
+
+error:
+  pcp_close (&r->any_reader);
+  dict_destroy (dict);
+  *dictp = NULL;
+  return NULL;
+}
+
+/* Closes R, which should have been returned by pcp_open() but not already
+   closed with pcp_decode() or this function.
+   Returns true if successful, false if an I/O error has occurred on R.
*/
+static bool
+pcp_close (struct any_reader *r_)
+{
+  struct pcp_reader *r = pcp_reader_cast (r_);
+  bool error;
+
+  if (r->file)
+    {
+      if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
+        {
+          msg (ME, _("Error closing system file `%s': %s."),
+               fh_get_file_name (r->fh), strerror (errno));
+          r->error = true;
+        }
+      r->file = NULL;
+    }
+
+  any_read_info_destroy (&r->info);
+  fh_unlock (r->lock);
+  fh_unref (r->fh);
+
+  /* R itself is registered with its own pool by pcp_open(), so the error
+     flag has to be saved before the pool is destroyed. */
+  error = r->error;
+  pool_destroy (r->pool);
+
+  /* True means that no I/O error was recorded. */
+  return !error;
+}
+
+/* Destroys READER by closing the pcp_reader R_ that backs it. */
+static void
+pcp_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
+{
+  struct pcp_reader *r = r_;
+  pcp_close (&r->any_reader);
+}
+
+/* Returns 1 if FILE is an SPSS/PC+ system file, 0 if it is not (which
+   includes a file too short to contain the signature), or a negative errno
+   value if seeking or reading fails. */
+static int
+pcp_detect (FILE *file)
+{
+  static const char signature[4] = "SPSS";
+  char buf[sizeof signature];
+
+  if (fseek (file, 0x104, SEEK_SET))
+    return -errno;
+
+  /* On a short read BUF is not fully initialized, so it must not be
+     compared: end of file means "not this type", anything else is an
+     error. */
+  if (fread (buf, sizeof buf, 1, file) != 1)
+    return feof (file) ? 0 : -errno;
+
+  return !memcmp (buf, signature, sizeof buf);
+}
+
+/* Reads the main header (record 0) of the SPSS/PC+ system file into *HEADER
+   and sets R's case count and compression flag from it.  The string fields
+   in *HEADER are stored as read from the file; parse_header() recodes them
+   later once the file's encoding is known.
*/ +static bool +read_main_header (struct pcp_reader *r, struct pcp_main_header *header) +{ + unsigned int base_ofs = r->directory.main.ofs; + size_t min_values, min_data_size; + unsigned int zero0, zero1, zero2; + unsigned int one0, one1; + unsigned int compressed; + unsigned int n_cases1; + uint8_t sysmis[8]; + + if (!pcp_seek (r, base_ofs)) + return false; + + if (r->directory.main.len < 0xb0) + { + pcp_error (r, r->pos, _("This is not an SPSS/PC+ system file.")); + return false; + } + else if (r->directory.main.len > 0xb0) + pcp_warn (r, r->pos, _("Record 0 has unexpected length %u."), + r->directory.main.len); + + if (!read_uint16 (r, &one0) + || !read_string (r, header->product, sizeof header->product) + || !read_bytes (r, sysmis, sizeof sysmis) + || !read_uint32 (r, &zero0) + || !read_uint32 (r, &zero1) + || !read_uint16 (r, &one1) + || !read_uint16 (r, &compressed) + || !read_uint16 (r, &header->nominal_case_size) + || !read_uint32 (r, &r->n_cases) + || !read_uint16 (r, &zero2) + || !read_uint32 (r, &n_cases1) + || !read_string (r, header->creation_date, sizeof header->creation_date) + || !read_string (r, header->creation_time, sizeof header->creation_time) + || !read_string (r, header->file_label, sizeof header->file_label)) + return false; + + if (!pcp_is_sysmis (sysmis)) + { + double d = parse_float (sysmis); + pcp_warn (r, base_ofs, _("Record 0 specifies unexpected system missing " + "value %g (%a)."), d, d); + } + if (one0 != 1 || one1 != 1 || zero0 != 0 || zero1 != 0 || zero2 != 0) + pcp_warn (r, base_ofs, _("Record 0 reserved fields have unexpected values " + "(%u,%u,%u,%u,%u)."), + one0, one1, zero0, zero1, zero2); + if (n_cases1 != r->n_cases) + pcp_warn (r, base_ofs, _("Record 0 case counts differ (%u versus %u)."), + r->n_cases, n_cases1); + if (compressed != 0 && compressed != 1) + { + pcp_error (r, base_ofs, _("Invalid compression type %u."), compressed); + return false; + } + + r->compressed = compressed != 0; + + min_values = xtimes 
(header->nominal_case_size, r->n_cases);
+  min_data_size = xtimes (compressed ? 1 : 8, min_values);
+  if (r->directory.data.len < min_data_size
+      || size_overflow_p (min_data_size))
+    {
+      /* Only a warning: the header is still accepted. */
+      pcp_warn (r, base_ofs, _("Record 0 claims %u cases with %u values per "
+                               "case (requiring at least %zu bytes) but data "
+                               "record is only %u bytes long."),
+                r->n_cases, header->nominal_case_size, min_data_size,
+                r->directory.data.len);
+      return true;
+    }
+
+  return true;
+}
+
+/* Reads the value labels for VAR.  START and END are the offsets taken from
+   VAR's variable record; 7 is added to each to obtain offsets within the
+   labels record.  Each label is read as an 8-byte value, a 1-byte length,
+   and that many bytes of label text.
+
+   The labels, and the array that holds them, are allocated from R's pool, so
+   they are released when R is closed.
+
+   Returns false only if seeking or reading fails.  A range that does not
+   fit in the labels record or the file, or a truncated label, only provokes
+   a warning, and the function returns true with whatever labels were read
+   successfully. */
+static bool
+read_value_labels (struct pcp_reader *r, struct pcp_var_record *var,
+                   unsigned int start, unsigned int end)
+{
+  size_t allocated_val_labs = 0;
+
+  start += 7;
+  end += 7;
+  if (end > r->directory.labels.len)
+    {
+      pcp_warn (r, r->pos - 32,
+                _("Value labels claimed to end at offset %u in labels record "
+                  "but labels record is only %u bytes."),
+                end, r->directory.labels.len);
+      return true;
+    }
+
+  /* Convert to absolute file offsets. */
+  start += r->directory.labels.ofs;
+  end += r->directory.labels.ofs;
+  if (start > end || end > r->file_size)
+    {
+      pcp_warn (r, r->pos - 32,
+                _("Value labels claimed to be at offset %u with length %u "
+                  "but file size is only %u bytes."),
+                start, end - start, r->file_size);
+      return true;
+    }
+
+  if (!pcp_seek (r, start))
+    return false;
+
+  while (r->pos < end && end - r->pos > 8)
+    {
+      struct pcp_value_label *vl;
+      uint8_t len;
+
+      /* Grow the array within R's pool.  pcp_close() frees nothing but the
+         pool, so a plain heap allocation here would be leaked. */
+      if (var->n_val_labs >= allocated_val_labs)
+        var->val_labs = pool_2nrealloc (r->pool, var->val_labs,
+                                        &allocated_val_labs,
+                                        sizeof *var->val_labs);
+      vl = &var->val_labs[var->n_val_labs];
+
+      if (!read_bytes (r, vl->value, sizeof vl->value)
+          || !read_bytes (r, &len, 1))
+        return false;
+
+      if (end - r->pos < len)
+        {
+          pcp_warn (r, r->pos,
+                    _("Value labels end with partial label (%u bytes left in "
+                      "record, label length %"PRIu8")."),
+                    end - r->pos, len);
+          return true;
+        }
+      vl->label = pool_malloc (r->pool, len + 1);
+      if (!read_bytes (r, vl->label, len))
+        return false;
+
+      vl->label[len] = '\0';
+      var->n_val_labs++;
+    }
+  if (r->pos < end)
+    pcp_warn (r, r->pos,
+              _("%u leftover bytes following value labels."),
+              end - r->pos);
+
+  return true;
+}
+
+/* Reads the label for VAR.  OFS is the offset taken from VAR's variable
+   record; 7 is added to obtain the offset of the label's 1-byte length
+   within the labels record, and the label text follows the length.  The
+   label is allocated from R's pool.
+
+   Returns true if the label was read, or if OFS lies outside the labels
+   record (which only provokes a warning).  Returns false if seeking or
+   reading fails or if the label would overrun the labels record. */
+static bool
+read_var_label (struct pcp_reader *r, struct pcp_var_record *var,
+                unsigned int ofs)
+{
+  uint8_t len;
+
+  ofs += 7;
+  if (ofs >= r->directory.labels.len)
+    {
+      pcp_warn (r, r->pos - 32,
+                _("Variable label claimed to start at offset %u in labels "
+                  "record but labels record is only %u bytes."),
+                ofs, r->directory.labels.len);
+      return true;
+    }
+
+  if (!pcp_seek (r, ofs + r->directory.labels.ofs) || !read_bytes (r, &len, 1))
+    return false;
+
+  if (len >= r->directory.labels.len - ofs)
+    {
+      /* NOTE(review): this reports a warning yet fails the read, unlike the
+         out-of-range case above, which returns true.  Confirm which is
+         intended. */
+      pcp_warn (r, r->pos - 1,
+                _("Variable label with length %u starting at offset %u in "
+                  "labels record overruns end of %u-byte labels record."),
+                len, ofs + 1, r->directory.labels.len);
+      return false;
+    }
+
+  var->label = pool_malloc (r->pool, len + 1);
+  var->label[len] = '\0';
+  return read_bytes (r, var->label, len);
+}
+
+/* Reads the variables record (record 1) into R. */
+static bool
+read_variables_record (struct pcp_reader *r)
+{
+  unsigned int i;
+
+  if (!pcp_seek (r, r->directory.variables.ofs))
+    return false;
+  if (r->directory.variables.len != r->header.nominal_case_size * 32)
+    {
+      pcp_error (r, r->pos, _("Record 1 has length %u (expected %u)."),
+                 r->directory.variables.len, r->header.nominal_case_size * 32);
+      return false;
+    }
+
+  r->vars = pool_calloc (r->pool,
+                         r->header.nominal_case_size, sizeof *r->vars);
+  for (i = 0; i < r->header.nominal_case_size; i++)
+    {
+      struct pcp_var_record *var = &r->vars[r->n_vars++];
+      unsigned int value_label_start, value_label_end;
+      unsigned int var_label_ofs;
+      unsigned int format;
+      uint8_t raw_type;
+
+      var->pos = r->pos;
+      if (!read_uint32 (r, &value_label_start)
+          || !read_uint32 (r, &value_label_end)
+          || !read_uint32 (r, &var_label_ofs)
+          || !read_uint32 (r, &format)
+          || !read_string (r, var->name, sizeof var->name)
+          || !read_bytes (r, var->missing, sizeof var->missing))
+        return false;
+
+      raw_type = format
>> 16; + if (!fmt_from_io (raw_type, &var->format.type)) + { + pcp_error (r, var->pos, _("Variable %u has invalid type %"PRIu8"."), + i, raw_type); + return false; + } + + var->format.w = (format >> 8) & 0xff; + var->format.d = format & 0xff; + fmt_fix_output (&var->format); + var->width = fmt_var_width (&var->format); + + if (var_label_ofs) + { + unsigned int save_pos = r->pos; + if (!read_var_label (r, var, var_label_ofs) + || !pcp_seek (r, save_pos)) + return false; + } + + if (value_label_end > value_label_start && var->width <= 8) + { + unsigned int save_pos = r->pos; + if (!read_value_labels (r, var, value_label_start, value_label_end) + || !pcp_seek (r, save_pos)) + return false; + } + + if (var->width > 8) + { + int extra = DIV_RND_UP (var->width - 8, 8); + i += extra; + if (!skip_bytes (r, 32 * extra)) + return false; + } + } + + return true; +} + +/* Recodes IN from encoding FROM to UTF-8, using POOL for the intermediate + result, then strips spaces from it with ss_trim() and returns a copy made + with ss_xstrdup(). The copy belongs to the caller (parse_header() + releases the one it obtains with free()). */ +static char * +recode_and_trim_string (struct pool *pool, const char *from, const char *in) +{ + struct substring out; + + out = recode_substring_pool ("UTF-8", from, ss_cstr (in), pool); + ss_trim (&out, ss_cstr (" ")); + return ss_xstrdup (out); +} + +/* Initializes INFO and DICT's file label from R and HEADER. INFO is first + zeroed, then given the fixed integer and float formats, R's compression + setting and case count, and UTF-8 copies of HEADER's creation date, + creation time, and product, recoded from DICT's encoding. */ +static void +parse_header (struct pcp_reader *r, const struct pcp_main_header *header, + struct any_read_info *info, struct dictionary *dict) +{ + const char *dict_encoding = dict_get_encoding (dict); + char *label; + + memset (info, 0, sizeof *info); + + info->integer_format = INTEGER_LSB_FIRST; + info->float_format = FLOAT_IEEE_DOUBLE_LE; + info->compression = r->compressed ? ANY_COMP_SIMPLE : ANY_COMP_NONE; + info->case_cnt = r->n_cases; + + /* Convert file label to UTF-8 and put it into DICT. */ + label = recode_and_trim_string (r->pool, dict_encoding, header->file_label); + dict_set_label (dict, label); + free (label); + + /* Put creation date, time, and product in UTF-8 into INFO. 
*/ + info->creation_date = recode_and_trim_string (r->pool, dict_encoding, + header->creation_date); + info->creation_time = recode_and_trim_string (r->pool, dict_encoding, + header->creation_time); + info->product = recode_and_trim_string (r->pool, dict_encoding, + header->product); +} + +/* Creates a variable in DICT for each of the N_VAR_RECS records in VAR_RECS, + taking its name, label, value labels, missing value, and formats from + the record. Returns true if successful, false if a record's variable + name is invalid. */ +static bool +parse_variable_records (struct pcp_reader *r, struct dictionary *dict, + struct pcp_var_record *var_recs, size_t n_var_recs) +{ + const char *dict_encoding = dict_get_encoding (dict); + struct pcp_var_record *rec; + + for (rec = var_recs; rec < &var_recs[n_var_recs]; rec++) + { + struct variable *var; + bool weight; + char *name; + size_t i; + + name = recode_string_pool ("UTF-8", dict_encoding, + rec->name, -1, r->pool); + name[strcspn (name, " ")] = '\0'; + /* A numeric variable named $WEIGHT becomes DICT's weighting variable + below. */ + weight = !strcmp (name, "$WEIGHT") && rec->width == 0; + + /* Transform $DATE => DATE_, $WEIGHT => WEIGHT_, $CASENUM => CASENUM_. */ + if (name[0] == '$') + name = pool_asprintf (r->pool, "%s_", name + 1); + + if (!dict_id_is_valid (dict, name, false) || name[0] == '#') + { + pcp_error (r, rec->pos, _("Invalid variable name `%s'."), name); + return false; + } + + var = rec->var = dict_create_var (dict, name, rec->width); + if (var == NULL) + { + char *new_name = dict_make_unique_var_name (dict, NULL, NULL); + pcp_warn (r, rec->pos, _("Renaming variable with duplicate name " + "`%s' to `%s'."), + name, new_name); + var = rec->var = dict_create_var_assert (dict, new_name, rec->width); + free (new_name); + } + if (weight) + dict_set_weight (dict, var); + + /* Set the short name the same as the long name. */ + var_set_short_name (var, 0, name); + + /* Get variable label, if any. 
*/ + if (rec->label) + { + char *utf8_label; + + utf8_label = recode_string ("UTF-8", dict_encoding, rec->label, -1); + var_set_label (var, utf8_label); + free (utf8_label); + } + + /* Add value labels. */ + for (i = 0; i < rec->n_val_labs; i++) + { + union value value; + char *utf8_label; + + value_init (&value, rec->width); + if (var_is_numeric (var)) + value.f = parse_float (rec->val_labs[i].value); + else + memcpy (value_str_rw (&value, rec->width), + rec->val_labs[i].value, rec->width); + + utf8_label = recode_string ("UTF-8", dict_encoding, + rec->val_labs[i].label, -1); + var_add_value_label (var, &value, utf8_label); + free (utf8_label); + + value_destroy (&value, rec->width); + } + + /* Set the missing value, unless the record holds the system-missing + pattern. Only variables of width 8 or less can have one. */ + if (rec->width <= 8 && !pcp_is_sysmis (rec->missing)) + { + int width = var_get_width (var); + struct missing_values mv; + + mv_init_pool (r->pool, &mv, width); + if (var_is_numeric (var)) + mv_add_num (&mv, parse_float (rec->missing)); + else + mv_add_str (&mv, rec->missing, MIN (width, 8)); + var_set_missing_values (var, &mv); + } + + /* Set formats. */ + var_set_both_formats (var, &rec->format); + } + + return true; +} + +/* Case reader. */ + +static void read_error (struct casereader *, const struct pcp_reader *); + +static bool read_case_number (struct pcp_reader *, double *); +static int read_case_string (struct pcp_reader *, uint8_t *, size_t); +static int read_opcode (struct pcp_reader *); +static bool read_compressed_number (struct pcp_reader *, double *); +static int read_compressed_string (struct pcp_reader *, uint8_t *); +static int read_whole_strings (struct pcp_reader *, uint8_t *, size_t); + +/* Reads and returns one case from READER's file. Returns a null + pointer if not successful. 
*/ +static struct ccase * +pcp_file_casereader_read (struct casereader *reader, void *r_) +{ + struct pcp_reader *r = r_; + unsigned int start_pos = r->pos; + struct ccase *c; + int retval; + int i; + + /* Stop once an error has occurred or every case has been read. */ + if (r->error || !r->n_cases) + return NULL; + r->n_cases--; + + c = case_create (r->proto); + for (i = 0; i < r->n_vars; i++) + { + struct pcp_var_record *var = &r->vars[i]; + union value *v = case_data_rw_idx (c, i); + + if (var->width == 0) + retval = read_case_number (r, &v->f); + else + retval = read_case_string (r, value_str_rw (v, var->width), + var->width); + + if (retval != 1) + { + pcp_error (r, r->pos, _("File ends in partial case.")); + goto error; + } + } + /* A case must not extend beyond the end of the data record. */ + if (r->pos > r->directory.data.ofs + r->directory.data.len) + { + pcp_error (r, r->pos, _("Case beginning at offset 0x%08x extends past " + "end of data record at offset 0x%08x."), + start_pos, r->directory.data.ofs + r->directory.data.len); + goto error; + } + + return c; + +error: + read_error (reader, r); + case_unref (c); + return NULL; +} + +/* Reports that an error occurred while reading a case from PCP's file, and + marks casereader R as having failed. */ +static void +read_error (struct casereader *r, const struct pcp_reader *pcp) +{ + msg (ME, _("Error reading case from file %s."), fh_get_name (pcp->fh)); + casereader_force_error (r); +} + +/* Reads a number from R and stores its value in *D. + If R is compressed, reads a compressed number; + otherwise, reads a number in the regular way. + Returns true if successful, false upon end of file, an I/O error, or a + partial read (try_read_bytes() yields -1, not 0, for the latter two). */ +static bool +read_case_number (struct pcp_reader *r, double *d) +{ + if (!r->compressed) + { + uint8_t number[8]; + if (try_read_bytes (r, number, sizeof number) != 1) + return false; + *d = parse_float (number); + return true; + } + else + return read_compressed_number (r, d); +} + +/* Reads LENGTH string bytes from R into S. Always reads a multiple of 8 + bytes; if LENGTH is not a multiple of 8, then extra bytes are read and + discarded without being written to S. 
Reads compressed strings if R is + compressed. Returns 1 if successful, 0 if end of file is reached + immediately, or -1 for some kind of error. */ +static int +read_case_string (struct pcp_reader *r, uint8_t *s, size_t length) +{ + size_t whole = ROUND_DOWN (length, 8); + size_t partial = length % 8; + + if (whole) + { + int retval = read_whole_strings (r, s, whole); + if (retval != 1) + return retval; + } + + if (partial) + { + /* Read a full 8-byte segment and keep only its first PARTIAL bytes. */ + uint8_t bounce[8]; + int retval = read_whole_strings (r, bounce, sizeof bounce); + if (retval <= 0) + return -1; + memcpy (s + whole, bounce, partial); + } + + return 1; +} + +/* Reads and returns the next compression opcode from R, first refilling R's + opcode buffer from the file if all of its opcodes have been consumed. + Returns -1 if the buffer cannot be refilled. */ +static int +read_opcode (struct pcp_reader *r) +{ + assert (r->compressed); + if (r->opcode_idx >= sizeof r->opcodes) + { + int retval = try_read_bytes (r, r->opcodes, sizeof r->opcodes); + if (retval != 1) + return -1; + r->opcode_idx = 0; + } + return r->opcodes[r->opcode_idx++]; +} + +/* Reads a compressed number from R and stores its value in *D. + Opcode 0 yields SYSMIS, opcode 1 means that the number follows in + uncompressed form, and any other opcode yields the opcode minus 105. + Returns true if successful, false if the opcode or the uncompressed + number cannot be read. */ +static bool +read_compressed_number (struct pcp_reader *r, double *d) +{ + int opcode = read_opcode (r); + switch (opcode) + { + case -1: + return false; + + case 0: + *d = SYSMIS; + return true; + + case 1: + return read_float (r, d); + + default: + *d = opcode - 105.0; + return true; + } +} + +/* Reads a compressed 8-byte string segment from R and stores it in DST. + Returns 1 if successful, 0 if no opcode can be read, or -1 if the + segment's data cannot be read. An opcode other than 1 draws a one-time + corruption warning and fills DST with spaces. */ +static int +read_compressed_string (struct pcp_reader *r, uint8_t *dst) +{ + int opcode; + int retval; + + opcode = read_opcode (r); + switch (opcode) + { + case -1: + return 0; + + case 1: + retval = read_bytes (r, dst, 8); + return retval == 1 ? 
1 : -1; + + default: + if (!r->corruption_warning) + { + r->corruption_warning = true; + pcp_warn (r, r->pos, + _("Possible compressed data corruption: " + "string contains compressed integer (opcode %d)."), + opcode); + } + memset (dst, ' ', 8); + return 1; + } +} + +/* Reads LENGTH string bytes from R into S. LENGTH must be a multiple of 8. + Reads compressed strings if R is compressed. Returns 1 if successful. + Otherwise, for uncompressed data, returns try_read_bytes()'s result (0 for + an immediate end of file, -1 for an error); for compressed data, returns + -1 for any failure, end of file included. */ +static int +read_whole_strings (struct pcp_reader *r, uint8_t *s, size_t length) +{ + assert (length % 8 == 0); + if (!r->compressed) + return try_read_bytes (r, s, length); + else + { + size_t ofs; + + for (ofs = 0; ofs < length; ofs += 8) + { + int retval = read_compressed_string (r, s + ofs); + if (retval != 1) + return -1; + } + return 1; + } +} + +/* Messages. */ + +/* Emits a message of class CLASS: the file's name, then OFFSET if it is + nonnegative, then FORMAT formatted with ARGS. */ +static void +pcp_msg (struct pcp_reader *r, off_t offset, + int class, const char *format, va_list args) +{ + struct msg m; + struct string text; + + ds_init_empty (&text); + if (offset >= 0) + ds_put_format (&text, _("`%s' near offset 0x%llx: "), + fh_get_file_name (r->fh), (long long int) offset); + else + ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh)); + ds_put_vformat (&text, format, args); + + m.category = msg_class_to_category (class); + m.severity = msg_class_to_severity (class); + m.file_name = NULL; + m.first_line = 0; + m.last_line = 0; + m.first_column = 0; + m.last_column = 0; + m.text = ds_cstr (&text); + + msg_emit (&m); +} + +/* Displays a warning for offset OFFSET in the file. */ +static void +pcp_warn (struct pcp_reader *r, off_t offset, const char *format, ...) +{ + va_list args; + + va_start (args, format); + pcp_msg (r, offset, MW, format, args); + va_end (args); +} + +/* Displays an error for offset OFFSET in the file + and marks R as in an error state. + Returns normally, so the caller must still report the failure itself. 
*/ +static void +pcp_error (struct pcp_reader *r, off_t offset, const char *format, ...) +{ + va_list args; + + va_start (args, format); + pcp_msg (r, offset, ME, format, args); + va_end (args); + + r->error = true; +} + +/* Reads BYTE_CNT bytes into BUF and advances R's position by the number of + bytes actually read. + Returns 1 if exactly BYTE_CNT bytes are successfully read. + Returns -1, after reporting an error, if an I/O error or a partial read + occurs, or upon an immediate end-of-file when EOF_IS_OK is false. + Returns 0, without reporting an error, for an immediate end-of-file when + EOF_IS_OK is true. */ +static inline int +read_bytes_internal (struct pcp_reader *r, bool eof_is_ok, + void *buf, size_t byte_cnt) +{ + size_t bytes_read = fread (buf, 1, byte_cnt, r->file); + r->pos += bytes_read; + if (bytes_read == byte_cnt) + return 1; + else if (ferror (r->file)) + { + pcp_error (r, r->pos, _("System error: %s."), strerror (errno)); + return -1; + } + else if (!eof_is_ok || bytes_read != 0) + { + pcp_error (r, r->pos, _("Unexpected end of file.")); + return -1; + } + else + return 0; +} + +/* Reads BYTE_CNT bytes into BUF. + Returns true if successful. + Returns false, after an error has been reported, upon I/O error or if + end-of-file is encountered. */ +static bool +read_bytes (struct pcp_reader *r, void *buf, size_t byte_cnt) +{ + return read_bytes_internal (r, false, buf, byte_cnt) == 1; +} + +/* Reads BYTE_CNT bytes into BUF. + Returns 1 if exactly BYTE_CNT bytes are successfully read. + Returns 0 if an immediate end-of-file is encountered. + Returns -1 if an I/O error or a partial read occurs. */ +static int +try_read_bytes (struct pcp_reader *r, void *buf, size_t byte_cnt) +{ + return read_bytes_internal (r, true, buf, byte_cnt); +} + +/* Reads a 16-bit unsigned integer from R and stores its value in host format in + *X. Returns true if successful, otherwise false. 
*/ +static bool +read_uint16 (struct pcp_reader *r, unsigned int *x) +{ + uint8_t integer[2]; + if (read_bytes (r, integer, sizeof integer) != 1) + return false; + *x = integer_get (INTEGER_LSB_FIRST, integer, sizeof integer); + return true; +} + +/* Reads a 32-bit unsigned integer from R and stores its value in host format in + *X. Returns true if successful, otherwise false. */ +static bool +read_uint32 (struct pcp_reader *r, unsigned int *x) +{ + uint8_t integer[4]; + if (read_bytes (r, integer, sizeof integer) != 1) + return false; + *x = integer_get (INTEGER_LSB_FIRST, integer, sizeof integer); + return true; +} + +/* Reads exactly SIZE - 1 bytes into BUFFER + and stores a null byte into BUFFER[SIZE - 1]. + Returns true if successful, otherwise false. */ +static bool +read_string (struct pcp_reader *r, char *buffer, size_t size) +{ + bool ok; + + assert (size > 0); + ok = read_bytes (r, buffer, size - 1); + if (ok) + buffer[size - 1] = '\0'; + return ok; +} + +/* Skips BYTES bytes forward in R by reading and discarding them. + Returns true if successful, otherwise false. */ +static bool +skip_bytes (struct pcp_reader *r, size_t bytes) +{ + while (bytes > 0) + { + char buffer[1024]; + size_t chunk = MIN (sizeof buffer, bytes); + if (!read_bytes (r, buffer, chunk)) + return false; + bytes -= chunk; + } + + return true; +} + +/* Moves R's file to absolute position OFFSET and records OFFSET as R's + current position. Returns true if successful; otherwise reports an + error and returns false. */ +static bool +pcp_seek (struct pcp_reader *r, off_t offset) +{ + if (fseeko (r->file, offset, SEEK_SET)) + { + pcp_error (r, 0, _("%s: seek failed (%s)."), + fh_get_file_name (r->fh), strerror (errno)); + return false; + } + r->pos = offset; + return true; +} + +/* Reads a 64-bit floating-point number from R and stores its + value in host format in *D. Returns true if successful, otherwise + false. */ +static bool +read_float (struct pcp_reader *r, double *d) +{ + uint8_t number[8]; + + if (!read_bytes (r, number, sizeof number)) + return false; + else + { + *d = parse_float (number); + return true; + } +} + +/* Converts the 8 bytes in NUMBER to a double: the pattern recognized by + pcp_is_sysmis() becomes SYSMIS, and anything else is decoded as a + little-endian IEEE double. */ +static double +parse_float (const uint8_t number[8]) +{ + return (pcp_is_sysmis (number) + ? 
SYSMIS + : float_get_double (FLOAT_IEEE_DOUBLE_LE, number)); +} + +static bool +pcp_is_sysmis(const uint8_t *p) +{ + static const uint8_t sysmis[8] + = { 0xf5, 0x1e, 0x26, 0x02, 0x8a, 0x8c, 0xed, 0xff }; + return !memcmp (p, sysmis, 8); +} + +static const struct casereader_class pcp_file_casereader_class = + { + pcp_file_casereader_read, + pcp_file_casereader_destroy, + NULL, + NULL, + }; + +const struct any_reader_class pcp_file_reader_class = + { + N_("SPSS/PC+ System File"), + pcp_detect, + pcp_open, + pcp_close, + pcp_decode, + pcp_get_strings, + }; diff --git a/src/data/por-file-reader.c b/src/data/por-file-reader.c index 0897d77aa2..4fb6c5fb45 100644 --- a/src/data/por-file-reader.c +++ b/src/data/por-file-reader.c @@ -16,8 +16,6 @@ #include -#include "data/por-file-reader.h" - #include #include #include @@ -27,6 +25,7 @@ #include #include +#include "data/any-reader.h" #include "data/casereader-provider.h" #include "data/casereader.h" #include "data/dictionary.h" @@ -47,6 +46,7 @@ #include "gl/intprops.h" #include "gl/minmax.h" #include "gl/xalloc.h" +#include "gl/xmemdup0.h" #include "gettext.h" #define _(msgid) gettext (msgid) @@ -65,10 +65,13 @@ static const char portable_to_local[256] = /* Portable file reader. */ struct pfm_reader { + struct any_reader any_reader; struct pool *pool; /* All the portable file state. */ jmp_buf bail_out; /* longjmp() target for error handling. */ + struct dictionary *dict; + struct any_read_info info; struct file_handle *fh; /* File handle. */ struct fh_lock *lock; /* Read lock for file. */ FILE *file; /* File stream. */ @@ -83,6 +86,13 @@ struct pfm_reader static const struct casereader_class por_file_casereader_class; +static struct pfm_reader * +pfm_reader_cast (const struct any_reader *r_) +{ + assert (r_->klass == &por_file_reader_class); + return UP_CAST (r_, struct pfm_reader, any_reader); +} + static void error (struct pfm_reader *r, const char *msg,...) 
PRINTF_FORMAT (2, 3) @@ -151,12 +161,13 @@ warning (struct pfm_reader *r, const char *msg, ...) /* Close and destroy R. Returns false if an error was detected on R, true otherwise. */ static bool -close_reader (struct pfm_reader *r) +pfm_close (struct any_reader *r_) { + struct pfm_reader *r = pfm_reader_cast (r_); bool ok; - if (r == NULL) - return true; + dict_destroy (r->dict); + any_read_info_destroy (&r->info); if (r->file) { if (fn_close (fh_get_file_name (r->fh), r->file) == EOF) @@ -182,7 +193,7 @@ static void por_file_casereader_destroy (struct casereader *reader, void *r_) { struct pfm_reader *r = r_; - if (!close_reader (r)) + if (!pfm_close (&r->any_reader)) casereader_force_error (reader); } @@ -236,7 +247,7 @@ match (struct pfm_reader *r, int c) } static void read_header (struct pfm_reader *); -static void read_version_data (struct pfm_reader *, struct pfm_read_info *); +static void read_version_data (struct pfm_reader *, struct any_read_info *); static void read_variables (struct pfm_reader *, struct dictionary *); static void read_value_label (struct pfm_reader *, struct dictionary *); static void read_documents (struct pfm_reader *, struct dictionary *); @@ -244,18 +255,18 @@ static void read_documents (struct pfm_reader *, struct dictionary *); /* Reads the dictionary from file with handle H, and returns it in a dictionary structure. This dictionary may be modified in order to rename, reorder, and delete variables, etc. */ -struct casereader * -pfm_open_reader (struct file_handle *fh, struct dictionary **dict, - struct pfm_read_info *info) +struct any_reader * +pfm_open (struct file_handle *fh) { struct pool *volatile pool = NULL; struct pfm_reader *volatile r = NULL; - *dict = dict_create (get_default_encoding ()); - /* Create and initialize reader. 
*/ pool = pool_create (); r = pool_alloc (pool, sizeof *r); + r->any_reader.klass = &por_file_reader_class; + r->dict = dict_create (get_default_encoding ()); + memset (&r->info, 0, sizeof r->info); r->pool = pool; r->fh = fh_ref (fh); r->lock = NULL; @@ -288,31 +299,47 @@ pfm_open_reader (struct file_handle *fh, struct dictionary **dict, /* Read header, version, date info, product id, variables. */ read_header (r); - read_version_data (r, info); - read_variables (r, *dict); + read_version_data (r, &r->info); + read_variables (r, r->dict); /* Read value labels. */ while (match (r, 'D')) - read_value_label (r, *dict); + read_value_label (r, r->dict); /* Read documents. */ if (match (r, 'E')) - read_documents (r, *dict); + read_documents (r, r->dict); /* Check that we've made it to the data. */ if (!match (r, 'F')) error (r, _("Data record expected.")); - r->proto = caseproto_ref_pool (dict_get_proto (*dict), r->pool); - return casereader_create_sequential (NULL, r->proto, CASENUMBER_MAX, - &por_file_casereader_class, r); + r->proto = caseproto_ref_pool (dict_get_proto (r->dict), r->pool); + return &r->any_reader; error: - close_reader (r); - dict_destroy (*dict); - *dict = NULL; + pfm_close (&r->any_reader); return NULL; } + +struct casereader * +pfm_decode (struct any_reader *r_, const char *encoding UNUSED, + struct dictionary **dictp, struct any_read_info *info) +{ + struct pfm_reader *r = pfm_reader_cast (r_); + + *dictp = r->dict; + r->dict = NULL; + + if (info) + { + *info = r->info; + memset (&r->info, 0, sizeof r->info); + } + + return casereader_create_sequential (NULL, r->proto, CASENUMBER_MAX, + &por_file_casereader_class, r); +} /* Returns the value of base-30 digit C, or -1 if C is not a base-30 digit. */ @@ -536,7 +563,7 @@ read_header (struct pfm_reader *r) /* Reads the version and date info record, as well as product and subproduct identification records if present. 
*/ static void -read_version_data (struct pfm_reader *r, struct pfm_read_info *info) +read_version_data (struct pfm_reader *r, struct any_read_info *info) { static const char empty_string[] = ""; char *date, *time; @@ -565,16 +592,25 @@ read_version_data (struct pfm_reader *r, struct pfm_read_info *info) /* Save file info. */ if (info != NULL) { + memset (info, 0, sizeof *info); + + info->float_format = FLOAT_NATIVE_DOUBLE; + info->integer_format = INTEGER_NATIVE; + info->compression = ANY_COMP_NONE; + info->case_cnt = -1; + /* Date. */ + info->creation_date = xmalloc (11); for (i = 0; i < 8; i++) { static const int map[] = {6, 7, 8, 9, 3, 4, 0, 1}; info->creation_date[map[i]] = date[i]; } info->creation_date[2] = info->creation_date[5] = ' '; - info->creation_date[10] = 0; + info->creation_date[10] = '\0'; /* Time. */ + info->creation_time = xmalloc (9); for (i = 0; i < 6; i++) { static const int map[] = {0, 1, 3, 4, 6, 7}; @@ -584,8 +620,8 @@ read_version_data (struct pfm_reader *r, struct pfm_read_info *info) info->creation_time[8] = 0; /* Product. */ - str_copy_trunc (info->product, sizeof info->product, product); - str_copy_trunc (info->subproduct, sizeof info->subproduct, subproduct); + info->product = xstrdup (product); + info->product_ext = xstrdup (subproduct); } } @@ -888,7 +924,7 @@ por_file_casereader_read (struct casereader *reader, void *r_) /* Returns true if FILE is an SPSS portable file, false otherwise. 
*/ -bool +int pfm_detect (FILE *file) { unsigned char header[464]; @@ -902,7 +938,7 @@ pfm_detect (FILE *file) { int c = getc (file); if (c == EOF || raw_cnt++ > 512) - return false; + return 0; else if (c == '\n') { while (line_len < 80 && cooked_cnt < sizeof header) @@ -929,9 +965,9 @@ pfm_detect (FILE *file) for (i = 0; i < 8; i++) if (trans[header[i + 456]] != "SPSSPORT"[i]) - return false; + return 0; - return true; + return 1; } static const struct casereader_class por_file_casereader_class = @@ -941,3 +977,13 @@ static const struct casereader_class por_file_casereader_class = NULL, NULL, }; + +const struct any_reader_class por_file_reader_class = + { + N_("SPSS Portable File"), + pfm_detect, + pfm_open, + pfm_close, + pfm_decode, + NULL, /* get_strings */ + }; diff --git a/src/data/por-file-reader.h b/src/data/por-file-reader.h deleted file mode 100644 index 326514f9a7..0000000000 --- a/src/data/por-file-reader.h +++ /dev/null @@ -1,42 +0,0 @@ -/* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2009 Free Software Foundation, Inc. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . */ - -#ifndef PFM_READ_H -#define PFM_READ_H - -/* Portable file reading. */ - -#include -#include - -/* Information produced by pfm_read_dictionary() that doesn't fit into - a dictionary struct. */ -struct pfm_read_info - { - char creation_date[11]; /* `dd mm yyyy' plus a null. 
*/ - char creation_time[9]; /* `hh:mm:ss' plus a null. */ - char product[61]; /* Product name plus a null. */ - char subproduct[61]; /* Subproduct name plus a null. */ - }; - -struct dictionary; -struct file_handle; -struct casereader *pfm_open_reader (struct file_handle *, - struct dictionary **, - struct pfm_read_info *); -bool pfm_detect (FILE *); - -#endif /* por-file-reader.h */ diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index 9bb1c77578..caab3d9b15 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -16,7 +16,6 @@ #include -#include "data/sys-file-reader.h" #include "data/sys-file-private.h" #include @@ -26,6 +25,7 @@ #include #include +#include "data/any-reader.h" #include "data/attributes.h" #include "data/case.h" #include "data/casereader-provider.h" @@ -98,7 +98,7 @@ struct sfm_header_record int weight_idx; /* 0 if unweighted, otherwise a var index. */ int nominal_case_size; /* Number of var positions. */ - /* These correspond to the members of struct sfm_file_info or a dictionary + /* These correspond to the members of struct any_file_info or a dictionary but in the system file's encoding rather than ASCII. */ char creation_date[10]; /* "dd mmm yy". */ char creation_time[9]; /* "hh:mm:ss". */ @@ -168,11 +168,13 @@ struct sfm_extension_record /* System file reader. */ struct sfm_reader { + struct any_reader any_reader; + /* Resource tracking. */ struct pool *pool; /* All system file state. */ /* File data. */ - struct sfm_read_info info; + struct any_read_info info; struct sfm_header_record header; struct sfm_var_record *vars; size_t n_vars; @@ -200,7 +202,7 @@ struct sfm_reader const char *encoding; /* String encoding. */ /* Decompression. */ - enum sfm_compression compression; + enum any_compression compression; double bias; /* Compression bias, usually 100.0. */ uint8_t opcodes[8]; /* Current block of opcodes. */ size_t opcode_idx; /* Next opcode to interpret, 8 if none left. 
*/ @@ -219,6 +221,15 @@ struct sfm_reader static const struct casereader_class sys_file_casereader_class; +static struct sfm_reader * +sfm_reader_cast (const struct any_reader *r_) +{ + assert (r_->klass == &sys_file_reader_class); + return UP_CAST (r_, struct sfm_reader, any_reader); +} + +static bool sfm_close (struct any_reader *); + static struct variable *lookup_var_by_index (struct sfm_reader *, off_t, const struct sfm_var_record *, size_t n, int idx); @@ -312,11 +323,11 @@ enum which_format static bool read_dictionary (struct sfm_reader *); static bool read_record (struct sfm_reader *, int type, size_t *allocated_vars, size_t *allocated_labels); -static bool read_header (struct sfm_reader *, struct sfm_read_info *, +static bool read_header (struct sfm_reader *, struct any_read_info *, struct sfm_header_record *); static void parse_header (struct sfm_reader *, const struct sfm_header_record *, - struct sfm_read_info *, struct dictionary *); + struct any_read_info *, struct dictionary *); static bool parse_variable_records (struct sfm_reader *, struct dictionary *, struct sfm_var_record *, size_t n); static void parse_format_spec (struct sfm_reader *, off_t pos, @@ -328,12 +339,12 @@ static void parse_display_parameters (struct sfm_reader *, struct dictionary *); static bool parse_machine_integer_info (struct sfm_reader *, const struct sfm_extension_record *, - struct sfm_read_info *); + struct any_read_info *); static void parse_machine_float_info (struct sfm_reader *, const struct sfm_extension_record *); static void parse_extra_product_info (struct sfm_reader *, const struct sfm_extension_record *, - struct sfm_read_info *); + struct any_read_info *); static void parse_mrsets (struct sfm_reader *, const struct sfm_extension_record *, size_t *allocated_mrsets); @@ -364,7 +375,7 @@ static bool parse_long_string_missing_values ( /* Frees the strings inside INFO. 
*/ void -sfm_read_info_destroy (struct sfm_read_info *info) +any_read_info_destroy (struct any_read_info *info) { if (info) { @@ -377,7 +388,7 @@ sfm_read_info_destroy (struct sfm_read_info *info) /* Tries to open FH for reading as a system file. Returns an sfm_reader if successful, otherwise NULL. */ -struct sfm_reader * +static struct any_reader * sfm_open (struct file_handle *fh) { size_t allocated_mrsets = 0; @@ -385,6 +396,7 @@ sfm_open (struct file_handle *fh) /* Create and initialize reader. */ r = xzalloc (sizeof *r); + r->any_reader.klass = &sys_file_reader_class; r->pool = pool_create (); pool_register (r->pool, free, r); r->fh = fh_ref (fh); @@ -413,9 +425,11 @@ sfm_open (struct file_handle *fh) if (r->extensions[EXT_MRSETS2] != NULL) parse_mrsets (r, r->extensions[EXT_MRSETS2], &allocated_mrsets); - return r; + return &r->any_reader; + error: - sfm_close (r); + if (r) + sfm_close (&r->any_reader); return NULL; } @@ -445,7 +459,7 @@ read_dictionary (struct sfm_reader *r) if (!skip_bytes (r, 4)) return false; - if (r->compression == SFM_COMP_ZLIB && !read_zheader (r)) + if (r->compression == ANY_COMP_ZLIB && !read_zheader (r)) return false; return true; @@ -628,10 +642,11 @@ add_id (struct get_strings_aux *aux, const char *id, const char *title, ...) whatever encoding system file R uses. *IDS[I] is true if *STRINGSP[I] must be a valid PSPP language identifier, false if *STRINGSP[I] is free-form text. */ -size_t -sfm_get_strings (const struct sfm_reader *r, struct pool *pool, +static size_t +sfm_get_strings (const struct any_reader *r_, struct pool *pool, char ***titlesp, bool **idsp, char ***stringsp) { + struct sfm_reader *r = sfm_reader_cast (r_); const struct sfm_mrset *mrset; struct get_strings_aux aux; size_t var_idx; @@ -722,15 +737,16 @@ sfm_get_strings (const struct sfm_reader *r, struct pool *pool, ENCODING is null, or the locale encoding if R specifies no encoding. 
If INFOP is non-null, then it receives additional info about the system - file, which the caller must eventually free with sfm_read_info_destroy() + file, which the caller must eventually free with any_read_info_destroy() when it is no longer needed. This function consumes R. The caller must use it again later, even to destroy it with sfm_close(). */ -struct casereader * -sfm_decode (struct sfm_reader *r, const char *encoding, - struct dictionary **dictp, struct sfm_read_info *infop) +static struct casereader * +sfm_decode (struct any_reader *r_, const char *encoding, + struct dictionary **dictp, struct any_read_info *infop) { + struct sfm_reader *r = sfm_reader_cast (r_); struct dictionary *dict; size_t i; @@ -863,7 +879,7 @@ sfm_decode (struct sfm_reader *r, const char *encoding, &sys_file_casereader_class, r); error: - sfm_close (r); + sfm_close (r_); dict_destroy (dict); *dictp = NULL; return NULL; @@ -873,14 +889,12 @@ error: closed with sfm_decode() or this function. Returns true if an I/O error has occurred on READER, false otherwise. */ -bool -sfm_close (struct sfm_reader *r) +static bool +sfm_close (struct any_reader *r_) { + struct sfm_reader *r = sfm_reader_cast (r_); bool error; - if (r == NULL) - return true; - if (r->file) { if (fn_close (fh_get_file_name (r->fh), r->file) == EOF) @@ -892,7 +906,7 @@ sfm_close (struct sfm_reader *r) r->file = NULL; } - sfm_read_info_destroy (&r->info); + any_read_info_destroy (&r->info); fh_unlock (r->lock); fh_unref (r->fh); @@ -907,18 +921,21 @@ static void sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_) { struct sfm_reader *r = r_; - sfm_close (r); + sfm_close (&r->any_reader); } -/* Returns true if FILE is an SPSS system file, - false otherwise. */ -bool +/* Returns 1 if FILE is an SPSS system file, + 0 if it is not, + otherwise a negative errno value. 
*/ +static int sfm_detect (FILE *file) { char magic[5]; + if (fseek (file, 0, SEEK_SET) != 0) + return -errno; if (fread (magic, 4, 1, file) != 1) - return false; + return feof (file) ? 0 : -errno; magic[4] = '\0'; return (!strcmp (ASCII_MAGIC, magic) @@ -930,7 +947,7 @@ sfm_detect (FILE *file) except for the string fields in *INFO, which parse_header() will initialize later once the file's encoding is known. */ static bool -read_header (struct sfm_reader *r, struct sfm_read_info *info, +read_header (struct sfm_reader *r, struct any_read_info *info, struct sfm_header_record *header) { uint8_t raw_layout_code[4]; @@ -979,9 +996,9 @@ read_header (struct sfm_reader *r, struct sfm_read_info *info, if (!zmagic) { if (compressed == 0) - r->compression = SFM_COMP_NONE; + r->compression = ANY_COMP_NONE; else if (compressed == 1) - r->compression = SFM_COMP_SIMPLE; + r->compression = ANY_COMP_SIMPLE; else if (compressed != 0) { sys_error (r, 0, "System file header has invalid compression " @@ -992,7 +1009,7 @@ read_header (struct sfm_reader *r, struct sfm_read_info *info, else { if (compressed == 2) - r->compression = SFM_COMP_ZLIB; + r->compression = ANY_COMP_ZLIB; else { sys_error (r, 0, "ZLIB-compressed system file header has invalid " @@ -1351,7 +1368,7 @@ skip_extension_record (struct sfm_reader *r, int subtype) static void parse_header (struct sfm_reader *r, const struct sfm_header_record *header, - struct sfm_read_info *info, struct dictionary *dict) + struct any_read_info *info, struct dictionary *dict) { const char *dict_encoding = dict_get_encoding (dict); struct substring product; @@ -1477,14 +1494,8 @@ parse_variable_records (struct sfm_reader *r, struct dictionary *dict, } } else - { - union value value; - - value_init_pool (r->pool, &value, width); - value_set_missing (&value, width); - for (i = 0; i < rec->missing_value_code; i++) - mv_add_str (&mv, rec->missing + 8 * i, MIN (width, 8)); - } + for (i = 0; i < rec->missing_value_code; i++) + mv_add_str (&mv, 
rec->missing + 8 * i, MIN (width, 8)); var_set_missing_values (var, &mv); } @@ -1585,7 +1596,7 @@ parse_document (struct dictionary *dict, struct sfm_document_record *record) static bool parse_machine_integer_info (struct sfm_reader *r, const struct sfm_extension_record *record, - struct sfm_read_info *info) + struct any_read_info *info) { int float_representation, expected_float_format; int integer_representation, expected_integer_format; @@ -1667,7 +1678,7 @@ parse_machine_float_info (struct sfm_reader *r, static void parse_extra_product_info (struct sfm_reader *r, const struct sfm_extension_record *record, - struct sfm_read_info *info) + struct any_read_info *info) { struct text_record *text; @@ -2711,7 +2722,7 @@ read_error (struct casereader *r, const struct sfm_reader *sfm) static bool read_case_number (struct sfm_reader *r, double *d) { - if (r->compression == SFM_COMP_NONE) + if (r->compression == ANY_COMP_NONE) { uint8_t number[8]; if (!try_read_bytes (r, number, sizeof number)) @@ -2766,7 +2777,7 @@ read_case_string (struct sfm_reader *r, uint8_t *s, size_t length) static int read_opcode (struct sfm_reader *r) { - assert (r->compression != SFM_COMP_NONE); + assert (r->compression != ANY_COMP_NONE); for (;;) { int opcode; @@ -2878,7 +2889,7 @@ static int read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length) { assert (length % 8 == 0); - if (r->compression == SFM_COMP_NONE) + if (r->compression == ANY_COMP_NONE) return try_read_bytes (r, s, length); else { @@ -3186,9 +3197,8 @@ sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...) va_end (args); } -/* Displays an error for the current file position, - marks it as in an error state, - and aborts reading it using longjmp. */ +/* Displays an error for the current file position and marks it as in an error + state. */ static void sys_error (struct sfm_reader *r, off_t offset, const char *format, ...) 
{ @@ -3704,7 +3714,7 @@ read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t byte_cnt) static int read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) { - if (r->compression == SFM_COMP_SIMPLE) + if (r->compression == ANY_COMP_SIMPLE) return read_bytes (r, buf, byte_cnt); else { @@ -3718,7 +3728,7 @@ read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) static int try_read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) { - if (r->compression == SFM_COMP_SIMPLE) + if (r->compression == ANY_COMP_SIMPLE) return try_read_bytes (r, buf, byte_cnt); else return read_bytes_zlib (r, buf, byte_cnt); @@ -3745,3 +3755,13 @@ static const struct casereader_class sys_file_casereader_class = NULL, NULL, }; + +const struct any_reader_class sys_file_reader_class = + { + N_("SPSS System File"), + sfm_detect, + sfm_open, + sfm_close, + sfm_decode, + sfm_get_strings, + }; diff --git a/src/data/sys-file-reader.h b/src/data/sys-file-reader.h deleted file mode 100644 index 849da670fc..0000000000 --- a/src/data/sys-file-reader.h +++ /dev/null @@ -1,88 +0,0 @@ -/* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2009, 2011, 2012, 2013, 2014 Free Software Foundation, Inc. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . 
*/ - -#ifndef SFM_READ_H -#define SFM_READ_H 1 - -#include -#include - -#include "data/case.h" -#include "data/sys-file.h" -#include "libpspp/float-format.h" -#include "libpspp/integer-format.h" - -/* Reading system files. - - To read a system file: - - 1. Open it with sfm_open(). - - 2. Figure out what encoding to read it with. sfm_get_encoding() can - help. - - 3. Obtain a casereader with sfm_decode(). - - If, after step 1 or 2, you decide that you don't want the system file - anymore, you can close it with sfm_close(). Otherwise, don't call - sfm_close(), because sfm_decode() consumes it. */ - -struct dictionary; -struct file_handle; -struct sfm_read_info; - -/* Opening and closing an sfm_reader. */ -struct sfm_reader *sfm_open (struct file_handle *); -bool sfm_close (struct sfm_reader *); - -/* Obtaining information about an sfm_reader before . */ -const char *sfm_get_encoding (const struct sfm_reader *); -size_t sfm_get_strings (const struct sfm_reader *, struct pool *pool, - char ***labels, bool **ids, char ***values); - -/* Decoding a system file's dictionary and obtaining a casereader. */ -struct casereader *sfm_decode (struct sfm_reader *, const char *encoding, - struct dictionary **, struct sfm_read_info *); - -/* Detecting whether a file is a system file. */ -bool sfm_detect (FILE *); - -/* System file info that doesn't fit in struct dictionary. - - The strings in this structure are encoded in UTF-8. (They are normally in - the ASCII subset of UTF-8.) */ -struct sfm_read_info - { - char *creation_date; /* "dd mmm yy". */ - char *creation_time; /* "hh:mm:ss". */ - enum integer_format integer_format; - enum float_format float_format; - enum sfm_compression compression; - casenumber case_cnt; /* -1 if unknown. */ - char *product; /* Product name. */ - char *product_ext; /* Extra product info. */ - - /* Writer's version number in X.Y.Z format. - The version number is not always present; if not, then - all of these are set to 0. */ - int version_major; /* X. 
*/ - int version_minor; /* Y. */ - int version_revision; /* Z. */ - }; - -void sfm_read_info_destroy (struct sfm_read_info *); - -#endif /* sys-file-reader.h */ diff --git a/src/data/sys-file-writer.c b/src/data/sys-file-writer.c index 8cfd577f1a..e0c6eade4b 100644 --- a/src/data/sys-file-writer.c +++ b/src/data/sys-file-writer.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-2000, 2006-2013 Free Software Foundation, Inc. + Copyright (C) 1997-2000, 2006-2014 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -73,7 +73,7 @@ struct sfm_writer FILE *file; /* File stream. */ struct replace_file *rf; /* Ticket for replacing output file. */ - enum sfm_compression compression; + enum any_compression compression; casenumber case_cnt; /* Number of cases written so far. */ uint8_t space; /* ' ' in the file's character encoding. */ @@ -183,8 +183,8 @@ sfm_writer_default_options (void) { struct sfm_write_options opts; opts.compression = (settings_get_scompression () - ? SFM_COMP_SIMPLE - : SFM_COMP_NONE); + ? ANY_COMP_SIMPLE + : ANY_COMP_NONE); opts.create_writeable = true; opts.version = 3; return opts; @@ -224,9 +224,9 @@ sfm_open_writer (struct file_handle *fh, struct dictionary *d, files have been observed, so drop back to simple compression for those files. 
*/ w->compression = opts.compression; - if (w->compression == SFM_COMP_ZLIB + if (w->compression == ANY_COMP_ZLIB && is_encoding_ebcdic_compatible (dict_get_encoding (d))) - w->compression = SFM_COMP_SIMPLE; + w->compression = ANY_COMP_SIMPLE; w->case_cnt = 0; @@ -306,7 +306,7 @@ sfm_open_writer (struct file_handle *fh, struct dictionary *d, write_int (w, 999); write_int (w, 0); - if (w->compression == SFM_COMP_ZLIB) + if (w->compression == ANY_COMP_ZLIB) { w->zstream.zalloc = Z_NULL; w->zstream.zfree = Z_NULL; @@ -377,7 +377,7 @@ write_header (struct sfm_writer *w, const struct dictionary *d) /* Record-type code. */ if (is_encoding_ebcdic_compatible (dict_encoding)) write_string (w, EBCDIC_MAGIC, 4); - else if (w->compression == SFM_COMP_ZLIB) + else if (w->compression == ANY_COMP_ZLIB) write_string (w, ASCII_ZMAGIC, 4); else write_string (w, ASCII_MAGIC, 4); @@ -394,8 +394,8 @@ write_header (struct sfm_writer *w, const struct dictionary *d) write_int (w, calc_oct_idx (d, NULL)); /* Compressed? */ - write_int (w, (w->compression == SFM_COMP_NONE ? 0 - : w->compression == SFM_COMP_SIMPLE ? 1 + write_int (w, (w->compression == ANY_COMP_NONE ? 0 + : w->compression == ANY_COMP_SIMPLE ? 1 : 2)); /* Weight variable. */ @@ -1216,7 +1216,7 @@ sys_file_casewriter_write (struct casewriter *writer, void *w_, w->case_cnt++; - if (w->compression == SFM_COMP_NONE) + if (w->compression == ANY_COMP_NONE) write_case_uncompressed (w, c); else write_case_compressed (w, c); @@ -1255,7 +1255,7 @@ close_writer (struct sfm_writer *w) { /* Flush buffer. 
*/ flush_compressed (w); - if (w->compression == SFM_COMP_ZLIB) + if (w->compression == ANY_COMP_ZLIB) { finish_zstream (w); write_ztrailer (w); @@ -1507,7 +1507,7 @@ flush_compressed (struct sfm_writer *w) if (w->n_opcodes) { unsigned int n = 8 * (1 + w->n_elements); - if (w->compression == SFM_COMP_SIMPLE) + if (w->compression == ANY_COMP_SIMPLE) write_bytes (w, w->cbuf, n); else write_zlib (w, w->cbuf, n); diff --git a/src/data/sys-file-writer.h b/src/data/sys-file-writer.h index 4f233f3197..127715b4dc 100644 --- a/src/data/sys-file-writer.h +++ b/src/data/sys-file-writer.h @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2009, 2013 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2009, 2013, 2014 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,14 +18,14 @@ #define SFM_WRITE_H 1 #include -#include "sys-file.h" +#include "any-reader.h" /* Writing system files. */ /* Options for creating a system file. */ struct sfm_write_options { - enum sfm_compression compression; + enum any_compression compression; bool create_writeable; /* File perms: writeable or read/only? */ int version; /* System file version (currently 2 or 3). */ }; diff --git a/src/data/sys-file.h b/src/data/sys-file.h deleted file mode 100644 index 7a582c05f4..0000000000 --- a/src/data/sys-file.h +++ /dev/null @@ -1,28 +0,0 @@ -/* PSPP - a program for statistical analysis. - Copyright (C) 2013 Free Software Foundation, Inc. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . */ - -#ifndef SYS_FILE_H -#define SYS_FILE_H 1 - -/* System file compression format. */ -enum sfm_compression - { - SFM_COMP_NONE, /* No compression. */ - SFM_COMP_SIMPLE, /* Bytecode compression of integer values. */ - SFM_COMP_ZLIB /* ZLIB "deflate" compression. */ - }; - -#endif /* sys-file.h */ diff --git a/src/language/data-io/combine-files.c b/src/language/data-io/combine-files.c index 6eb9a3181a..6d9ed5d43b 100644 --- a/src/language/data-io/combine-files.c +++ b/src/language/data-io/combine-files.c @@ -229,7 +229,8 @@ combine_files (enum comb_command_type command, if (file->handle == NULL) goto error; - file->reader = any_reader_open (file->handle, NULL, &file->dict); + file->reader = any_reader_open_and_decode (file->handle, NULL, + &file->dict, NULL); if (file->reader == NULL) goto error; } diff --git a/src/language/data-io/get.c b/src/language/data-io/get.c index 1218a27b18..9a788a01e3 100644 --- a/src/language/data-io/get.c +++ b/src/language/data-io/get.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2006, 2007, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2006, 2007, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -122,7 +122,7 @@ parse_read_command (struct lexer *lexer, struct dataset *ds, goto error; } - reader = any_reader_open (fh, encoding, &dict); + reader = any_reader_open_and_decode (fh, encoding, &dict, NULL); if (reader == NULL) goto error; diff --git a/src/language/data-io/save.c b/src/language/data-io/save.c index 7f1347db98..b97da69b00 100644 --- a/src/language/data-io/save.c +++ b/src/language/data-io/save.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -234,13 +234,13 @@ parse_write_command (struct lexer *lexer, struct dataset *ds, } else if (writer_type == SYSFILE_WRITER && lex_match_id (lexer, "COMPRESSED")) - sysfile_opts.compression = SFM_COMP_SIMPLE; + sysfile_opts.compression = ANY_COMP_SIMPLE; else if (writer_type == SYSFILE_WRITER && lex_match_id (lexer, "UNCOMPRESSED")) - sysfile_opts.compression = SFM_COMP_NONE; + sysfile_opts.compression = ANY_COMP_NONE; else if (writer_type == SYSFILE_WRITER && lex_match_id (lexer, "ZCOMPRESSED")) - sysfile_opts.compression = SFM_COMP_ZLIB; + sysfile_opts.compression = ANY_COMP_ZLIB; else if (writer_type == SYSFILE_WRITER && lex_match_id (lexer, "VERSION")) { diff --git a/src/language/dictionary/apply-dictionary.c b/src/language/dictionary/apply-dictionary.c index 05143fcd58..c949510f65 100644 --- a/src/language/dictionary/apply-dictionary.c +++ b/src/language/dictionary/apply-dictionary.c @@ -53,9 +53,9 @@ cmd_apply_dictionary (struct lexer *lexer, struct dataset *ds) handle = fh_parse (lexer, 
FH_REF_FILE, dataset_session (ds)); if (!handle) return CMD_FAILURE; - reader = any_reader_open (handle, NULL, &dict); + reader = any_reader_open_and_decode (handle, NULL, &dict, NULL); fh_unref (handle); - if (dict == NULL) + if (!reader) return CMD_FAILURE; casereader_destroy (reader); diff --git a/src/language/dictionary/sys-file-info.c b/src/language/dictionary/sys-file-info.c index 01881a995a..862d989a6b 100644 --- a/src/language/dictionary/sys-file-info.c +++ b/src/language/dictionary/sys-file-info.c @@ -21,6 +21,7 @@ #include #include +#include "data/any-reader.h" #include "data/attributes.h" #include "data/casereader.h" #include "data/dataset.h" @@ -28,7 +29,6 @@ #include "data/file-handle-def.h" #include "data/format.h" #include "data/missing-values.h" -#include "data/sys-file-reader.h" #include "data/value-labels.h" #include "data/variable.h" #include "data/vector.h" @@ -76,19 +76,20 @@ static unsigned int dict_display_mask (const struct dictionary *); static struct table *describe_variable (const struct variable *v, int flags); -static void report_encodings (const struct file_handle *, - const struct sfm_reader *); +static void report_encodings (const struct file_handle *, struct pool *, + char **titles, bool *ids, + char **strings, size_t n_strings); /* SYSFILE INFO utility. 
*/ int cmd_sysfile_info (struct lexer *lexer, struct dataset *ds UNUSED) { - struct sfm_reader *sfm_reader; + struct any_reader *any_reader; struct file_handle *h; struct dictionary *d; struct tab_table *t; struct casereader *reader; - struct sfm_read_info info; + struct any_read_info info; char *encoding; struct table *table; int r, i; @@ -130,21 +131,32 @@ cmd_sysfile_info (struct lexer *lexer, struct dataset *ds UNUSED) goto error; } - sfm_reader = sfm_open (h); - if (sfm_reader == NULL) - goto error; + any_reader = any_reader_open (h); + if (!any_reader) + goto error; if (encoding && !strcasecmp (encoding, "detect")) { - report_encodings (h, sfm_reader); + char **titles, **strings; + struct pool *pool; + size_t n_strings; + bool *ids; + + pool = pool_create (); + n_strings = any_reader_get_strings (any_reader, pool, + &titles, &ids, &strings); + any_reader_close (any_reader); + + report_encodings (h, pool, titles, ids, strings, n_strings); fh_unref (h); + pool_destroy (pool); + return CMD_SUCCESS; } - reader = sfm_decode (sfm_reader, encoding, &d, &info); + reader = any_reader_decode (any_reader, encoding, &d, &info); if (!reader) goto error; - casereader_destroy (reader); t = tab_create (2, 11 + (info.product_ext != NULL)); @@ -198,7 +210,7 @@ cmd_sysfile_info (struct lexer *lexer, struct dataset *ds UNUSED) r++; tab_text (t, 0, r, TAB_LEFT, _("Type:")); - tab_text (t, 1, r++, TAB_LEFT, _("System File")); + tab_text (t, 1, r++, TAB_LEFT, gettext (info.klass->name)); tab_text (t, 0, r, TAB_LEFT, _("Weight:")); { @@ -210,8 +222,8 @@ cmd_sysfile_info (struct lexer *lexer, struct dataset *ds UNUSED) tab_text (t, 0, r, TAB_LEFT, _("Compression:")); tab_text_format (t, 1, r++, TAB_LEFT, - info.compression == SFM_COMP_NONE ? _("None") - : info.compression == SFM_COMP_SIMPLE ? "SAV" + info.compression == ANY_COMP_NONE ? _("None") + : info.compression == ANY_COMP_SIMPLE ?
"SAV" : "ZSAV"); tab_text (t, 0, r, TAB_LEFT, _("Encoding:")); @@ -237,7 +249,7 @@ cmd_sysfile_info (struct lexer *lexer, struct dataset *ds UNUSED) dict_destroy (d); fh_unref (h); - sfm_read_info_destroy (&info); + any_read_info_destroy (&info); return CMD_SUCCESS; error: @@ -941,21 +953,15 @@ equal_suffix (const struct encoding *encodings, size_t n_encodings, } static void -report_encodings (const struct file_handle *h, const struct sfm_reader *r) +report_encodings (const struct file_handle *h, struct pool *pool, + char **titles, bool *ids, char **strings, size_t n_strings) { - char **titles; - char **strings; - bool *ids; struct encoding encodings[N_ENCODING_NAMES]; - size_t n_encodings, n_strings, n_unique_strings; + size_t n_encodings, n_unique_strings; size_t i, j; struct tab_table *t; - struct pool *pool; size_t row; - pool = pool_create (); - n_strings = sfm_get_strings (r, pool, &titles, &ids, &strings); - n_encodings = 0; for (i = 0; i < N_ENCODING_NAMES; i++) { @@ -990,7 +996,6 @@ report_encodings (const struct file_handle *h, const struct sfm_reader *r) if (!n_encodings) { msg (SW, _("No valid encodings found.")); - pool_destroy (pool); return; } @@ -1026,10 +1031,7 @@ report_encodings (const struct file_handle *h, const struct sfm_reader *r) if (!all_equal (encodings, n_encodings, i)) n_unique_strings++; if (!n_unique_strings) - { - pool_destroy (pool); - return; - } + return; t = tab_create (3, (n_encodings * n_unique_strings) + 1); tab_title (t, _("%s encoded text strings."), fh_get_name (h)); @@ -1078,8 +1080,6 @@ report_encodings (const struct file_handle *h, const struct sfm_reader *r) } } tab_submit (t); - - pool_destroy (pool); } static unsigned int diff --git a/src/ui/gui/psppire-window.c b/src/ui/gui/psppire-window.c index 9826b702d9..4d2f085cad 100644 --- a/src/ui/gui/psppire-window.c +++ b/src/ui/gui/psppire-window.c @@ -777,10 +777,12 @@ psppire_window_open (PsppireWindow *de) gchar *encoding = psppire_encoding_selector_get_encoding ( 
gtk_file_chooser_get_extra_widget (GTK_FILE_CHOOSER (dialog))); - enum detect_result res = any_reader_may_open (sysname); - if (ANY_YES == res) + int retval; + + retval = any_reader_detect (sysname, NULL); + if (retval == 1) open_data_window (de, name, encoding, NULL); - else if (ANY_NO == res) + else if (retval == 0) open_syntax_window (name, encoding); g_free (encoding); diff --git a/src/ui/gui/psppire.c b/src/ui/gui/psppire.c index eb8fddbe98..5a35b52efb 100644 --- a/src/ui/gui/psppire.c +++ b/src/ui/gui/psppire.c @@ -29,10 +29,8 @@ #include "data/datasheet.h" #include "data/file-handle-def.h" #include "data/file-name.h" -#include "data/por-file-reader.h" #include "data/session.h" #include "data/settings.h" -#include "data/sys-file-reader.h" #include "language/lexer/lexer.h" #include "libpspp/i18n.h" @@ -107,13 +105,13 @@ initialize (const char *data_file) { gchar *filename = local_to_filename_encoding (data_file); - enum detect_result res = any_reader_may_open (filename); + int retval = any_reader_detect (filename, NULL); /* Check to see if the file is a .sav or a .por file. If not assume that it is a syntax file */ - if (res == ANY_YES) + if (retval == 1) open_data_window (NULL, filename, NULL, NULL); - else if (res == ANY_NO) + else if (retval == 0) { create_data_window (); open_syntax_window (filename, NULL); diff --git a/src/ui/source-init-opts.c b/src/ui/source-init-opts.c index f9ced35c95..d55a2d5d9f 100644 --- a/src/ui/source-init-opts.c +++ b/src/ui/source-init-opts.c @@ -1,5 +1,5 @@ /* PSPPIRE - a graphical user interface for PSPP. 
- Copyright (C) 2008, 2010 Free Software Foundation + Copyright (C) 2008, 2010, 2014 Free Software Foundation This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -23,9 +23,7 @@ #include #include "data/file-name.h" -#include "data/por-file-reader.h" #include "data/settings.h" -#include "data/sys-file-reader.h" #include "language/lexer/include-path.h" #include "language/lexer/lexer.h" #include "libpspp/assertion.h" diff --git a/tests/automake.mk b/tests/automake.mk index b9df0539e1..661bfbb9a5 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -260,6 +260,7 @@ TESTSUITE_AT = \ tests/data/datasheet-test.at \ tests/data/dictionary.at \ tests/data/format-guesser.at \ + tests/data/pc+-file-reader.at \ tests/data/por-file.at \ tests/data/sys-file-reader.at \ tests/data/sys-file.at \ diff --git a/tests/data/pc+-file-reader.at b/tests/data/pc+-file-reader.at new file mode 100644 index 0000000000..1d89d0dbf6 --- /dev/null +++ b/tests/data/pc+-file-reader.at @@ -0,0 +1,1215 @@ +AT_BANNER([SPSS/PC+ file reader - positive]) + +AT_SETUP([variable labels and missing values]) +AT_KEYWORDS([sack synthetic PC+ file positive]) +AT_DATA([pc+-file.sack], [dnl +dnl File header. +2; 0; +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +@LABELS; @LABELS_END - @LABELS; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; dnl Fixed. + s62 "PCSPSS PSPP synthetic test product"; + PCSYSMIS; + 0; 0; i16 1; dnl Fixed. + i16 0; + i16 15; + 1; + i16 0; dnl Fixed. + 1; + s8 "11/28/14"; + s8 "15:11:00"; + s64 "PSPP synthetic test file"; +MAIN_END: + +VARS: + 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; + 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; + + dnl Numeric variable, no label or missing values. + 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS; + + dnl Numeric variable, variable label. 
+ 0; 0; @NUM2_LABEL - @LABELS_OFS; 0x050800; s8 "NUM2"; PCSYSMIS; + + dnl Numeric variable with missing value. + 0; 0; 0; 0x050800; s8 "NUM3"; 1.0; + + dnl Numeric variable, variable label and missing value. + 0; 0; @NUM4_LABEL - @LABELS_OFS; 0x050800; s8 "NUM4"; 2.0; + + dnl String variable, no label or missing values. + 0; 0; 0; 0x010800; s8 "STR1"; PCSYSMIS; + + dnl String variable, variable label. + 0; 0; @STR2_LABEL - @LABELS_OFS; 0x010400; s8 "STR2"; PCSYSMIS; + + dnl String variable with missing value. + 0; 0; 0; 0x010500; s8 "STR3"; s8 "MISS"; + + dnl String variable, variable label and missing value. + 0; 0; @STR4_LABEL - @LABELS_OFS; 0x010100; s8 "STR4"; s8 "OTHR"; + + dnl Long string variable + 0; 0; 0; 0x010b00; s8 "STR5"; PCSYSMIS; + 0 * 8; + + dnl Long string variable with variable label + 0; 0; @STR6_LABEL - @LABELS_OFS; 0x010b00; s8 "STR6"; PCSYSMIS; + 0 * 8; +VARS_END: + +LABELS: + 3; i8 0 0 0; LABELS_OFS: i8 0; + NUM2_LABEL: COUNT8("Numeric variable 2's label"); + NUM4_LABEL: COUNT8("Another numeric variable label"); + STR2_LABEL: COUNT8("STR2's variable label"); + STR4_LABEL: COUNT8("STR4's variable label"); + STR6_LABEL: COUNT8("Another string variable's label"); +LABELS_END: + +DATA: + 0.0; "11/28/14"; 1.0; + 0.0; 1.0; 2.0; PCSYSMIS; s8 "abcdefgh"; s8 "ijkl"; s8 "mnopq"; s8 "r"; + s16 "stuvwxyzAB"; s16 "CDEFGHIJKLM"; +DATA_END: +]) +AT_CHECK([sack --le pc+-file.sack > pc+-file.sav]) +AT_DATA([pc+-file.sps], [dnl +GET FILE='pc+-file.sav' ENCODING='us-ascii'. +DISPLAY FILE LABEL. +DISPLAY DICTIONARY. +LIST. 
+]) +AT_CHECK([pspp -o pspp.csv pc+-file.sps]) +AT_CHECK([cat pspp.csv], [0], [dnl +File label: PSPP synthetic test file + +Variable,Description,Position +NUM1,Format: F8.0,1 +NUM2,"Label: Numeric variable 2's label +Format: F8.0",2 +NUM3,"Format: F8.0 +Missing Values: 1",3 +NUM4,"Label: Another numeric variable label +Format: F8.0 +Missing Values: 2",4 +STR1,Format: A8,5 +STR2,"Label: STR2's variable label +Format: A4",6 +STR3,"Format: A5 +Missing Values: ""MISS """,7 +STR4,"Label: STR4's variable label +Format: A1 +Missing Values: ""O""",8 +STR5,Format: A11,9 +STR6,"Label: Another string variable's label +Format: A11",10 + +Table: Data List +NUM1,NUM2,NUM3,NUM4,STR1,STR2,STR3,STR4,STR5,STR6 +0,1,2,.,abcdefgh,ijkl,mnopq,r,stuvwxyzAB ,CDEFGHIJKLM +]) +AT_CLEANUP + +AT_SETUP([value labels]) +AT_KEYWORDS([sack synthetic PC+ file positive]) +AT_DATA([pc+-file.sack], [dnl +dnl File header. +2; 0; +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +@LABELS; @LABELS_END - @LABELS; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; dnl Fixed. + s62 "PCSPSS PSPP synthetic test product"; + PCSYSMIS; + 0; 0; i16 1; dnl Fixed. + i16 0; + i16 16; + 1; + i16 0; dnl Fixed. + 1; + s8 "11/28/14"; + s8 "15:11:00"; + s64 "PSPP synthetic test file"; +MAIN_END: + +VARS: + 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; + 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; + + dnl Numeric variables. + @N1 - @LOFF; @N1E - @LOFF; 0; 0x050800; s8 "NUM1"; PCSYSMIS; + @N2 - @LOFF; @N2E - @LOFF; 0; 0x050800; s8 "NUM2"; PCSYSMIS; + @N3 - @LOFF; @N3E - @LOFF; 0; 0x050800; s8 "NUM3"; PCSYSMIS; + @N4 - @LOFF; @N4E - @LOFF; 0; 0x050800; s8 "NUM4"; PCSYSMIS; + @N5 - @LOFF; @N5E - @LOFF; 0; 0x050800; s8 "NUM5"; PCSYSMIS; + + dnl String variables. 
+ @S1 - @LOFF; @S1E - @LOFF; 0; 0x010100; s8 "STR1"; PCSYSMIS; + @S2 - @LOFF; @S2E - @LOFF; 0; 0x010200; s8 "STR2"; PCSYSMIS; + @S3 - @LOFF; @S3E - @LOFF; 0; 0x010300; s8 "STR3"; PCSYSMIS; + @S4 - @LOFF; @S4E - @LOFF; 0; 0x010400; s8 "STR4"; PCSYSMIS; + @S5 - @LOFF; @S5E - @LOFF; 0; 0x010500; s8 "STR5"; PCSYSMIS; + @S6 - @LOFF; @S6E - @LOFF; 0; 0x010600; s8 "STR6"; PCSYSMIS; + @S7 - @LOFF; @S7E - @LOFF; 0; 0x010700; s8 "STR7"; PCSYSMIS; + @S8 - @LOFF; @S8E - @LOFF; 0; 0x010800; s8 "STR8"; PCSYSMIS; +VARS_END: + +LABELS: + 3; i8 0 0 0; LOFF: i8 0; + + N1: 1.0; COUNT8("one"); N1E: + N2: 2.0; COUNT8("two"); 3.0; COUNT8("three"); N2E: + N3: + 3.0; COUNT8("three"); + N4: N5: + 4.0; COUNT8("four"); + N3E: N4E: + 5.0; COUNT8("five"); + N5E: + + S1: s8 "a"; COUNT8("value label for `a'"); S1E: + S2: s8 "ab"; COUNT8("value label for `ab'"); S2E: + S3: s8 "abc"; COUNT8("value label for `abc'"); S3E: + S4: S5: S6: S7: + s8 "abcdefgh"; COUNT8("value label for abcdefgh"); S4E: + S8: + s8 "ijklmnop"; COUNT8("value label for ijklmnop"); S5E: + s8 "qrstuvwx"; COUNT8("value label for qrstuvwx"); S6E: + s8 "yzABCDEF"; COUNT8("value label for yzABCDEF"); S7E: + s8 "GHIJKLMN"; COUNT8("value label for GHIJKLMN"); S8E: +LABELS_END: + +DATA: + 1.0; "11/28/14"; 1.0; + 1.0; 2.0; 3.0; 4.0; 5.0; + s8 "a"; s8 "bc"; s8 "cde"; s8 "fghj"; s8 "klmno"; s8 "pqrstu"; + s8 "vwxyzAB"; s8 "CDEFGHIJ"; +DATA_END: +]) +AT_CHECK([sack --le pc+-file.sack > pc+-file.sav]) +AT_DATA([pc+-file.sps], [dnl +GET FILE='pc+-file.sav' ENCODING='us-ascii'. +DISPLAY FILE LABEL. +DISPLAY DICTIONARY. +LIST. 
+]) +AT_CHECK([pspp -o pspp.csv pc+-file.sps]) +AT_CHECK([cat pspp.csv], [0], [dnl +File label: PSPP synthetic test file + +Variable,Description,Position +NUM1,"Format: F8.0 + +Value,Label +1,one",1 +NUM2,"Format: F8.0 + +Value,Label +2,two +3,three",2 +NUM3,"Format: F8.0 + +Value,Label +3,three +4,four",3 +NUM4,"Format: F8.0 + +Value,Label +4,four",4 +NUM5,"Format: F8.0 + +Value,Label +4,four +5,five",5 +STR1,"Format: A1 + +Value,Label +a,value label for `a'",6 +STR2,"Format: A2 + +Value,Label +ab,value label for `ab'",7 +STR3,"Format: A3 + +Value,Label +abc,value label for `abc'",8 +STR4,"Format: A4 + +Value,Label +abcd,value label for abcdefgh",9 +STR5,"Format: A5 + +Value,Label +abcde,value label for abcdefgh +ijklm,value label for ijklmnop",10 +STR6,"Format: A6 + +Value,Label +abcdef,value label for abcdefgh +ijklmn,value label for ijklmnop +qrstuv,value label for qrstuvwx",11 +STR7,"Format: A7 + +Value,Label +abcdefg,value label for abcdefgh +ijklmno,value label for ijklmnop +qrstuvw,value label for qrstuvwx +yzABCDE,value label for yzABCDEF",12 +STR8,"Format: A8 + +Value,Label +GHIJKLMN,value label for GHIJKLMN +ijklmnop,value label for ijklmnop +qrstuvwx,value label for qrstuvwx +yzABCDEF,value label for yzABCDEF",13 + +Table: Data List +NUM1,NUM2,NUM3,NUM4,NUM5,STR1,STR2,STR3,STR4,STR5,STR6,STR7,STR8 +1,2,3,4,5,a,bc,cde,fghj,klmno,pqrstu,vwxyzAB,CDEFGHIJ +]) +AT_CLEANUP + +AT_SETUP([compressed data]) +AT_KEYWORDS([sack synthetic PC+ file positive]) +AT_DATA([pc+-file.sack], [dnl +dnl File header. +2; 0; +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +0; 0; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; dnl Fixed. + s62 "PCSPSS PSPP synthetic test product"; + PCSYSMIS; + 0; 0; i16 1; dnl Fixed. + i16 1; + i16 9; + 2; + i16 0; dnl Fixed. 
+ 2; + s8 "11/28/14"; + s8 "15:11:00"; + s64 "PSPP synthetic test file"; +MAIN_END: + +VARS: + 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; + 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; + + dnl Numeric variables. + 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM2"; PCSYSMIS; + + dnl String variables. + 0; 0; 0; 0x010400; s8 "STR4"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "STR8"; PCSYSMIS; + 0; 0; 0; 0x010f00; s8 "STR15"; PCSYSMIS; + 0 * 8; +VARS_END: + +DATA: + i8 101 1 101 100 255 1 1 1; + s8 "11/28/14"; s8 "abcd"; s8 "efghj"; s8 "efghijkl"; + i8 1; i8 102 1 101 1 0 1 1; + s8 "ABCDEFG"; s8 "11/28/14"; 1000.0; s8 "PQRS"; s8 "TUVWXYZa"; + i8 1 1 0 0 0 0 0 0; + s16 "bcdefghijklmnop"; +DATA_END: +]) +AT_CHECK([sack --le pc+-file.sack > pc+-file.sav]) +AT_DATA([pc+-file.sps], [dnl +GET FILE='pc+-file.sav' ENCODING='us-ascii'. +DISPLAY FILE LABEL. +DISPLAY DICTIONARY. +LIST. +]) +AT_CHECK([pspp -o pspp.csv pc+-file.sps]) +AT_CHECK([cat pspp.csv], [0], [dnl +File label: PSPP synthetic test file + +Variable,Description,Position +NUM1,Format: F8.0,1 +NUM2,Format: F8.0,2 +STR4,Format: A4,3 +STR8,Format: A8,4 +STR15,Format: A15,5 + +Table: Data List +NUM1,NUM2,STR4,STR8,STR15 +-5,150,abcd,efghj ,efghijklABCDEFG +1000,.,PQRS,TUVWXYZa,bcdefghijklmnop +]) +AT_CLEANUP + +AT_BANNER([SPSS/PC+ file reader - negative]) + +AT_SETUP([unspecified character encoding]) +AT_KEYWORDS([sack synthetic PC+ file negative]) +AT_DATA([pc+-file.sack], [dnl +dnl File header. +2; 0; +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +0; 0; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; dnl Fixed. + s62 "PCSPSS PSPP synthetic test product"; + PCSYSMIS; + 0; 0; i16 1; dnl Fixed. + i16 0; + i16 7; + 1; + i16 0; dnl Fixed. 
+ 1; + s8 "11/28/14"; + s8 "15:11:00"; + s64 "PSPP synthetic test file"; +MAIN_END: + +VARS: + 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; + 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; + + dnl Numeric variables. + 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM2"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM3"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM4"; PCSYSMIS; +VARS_END: + +DATA: + 0.0; "11/28/14"; 1.0; 2.0; 3.0; 4.0; 5.0; +DATA_END: +]) +AT_CHECK([sack --le pc+-file.sack > pc+-file.sav]) +AT_DATA([pc+-file.sps], [dnl +GET FILE='pc+-file.sav'. +DISPLAY FILE LABEL. +DISPLAY DICTIONARY. +LIST. + +SYSFILE INFO FILE='pc+-file.sav' ENCODING='us-ascii'. +]) +AT_CHECK([pspp -O format=csv pc+-file.sps], [0], [stdout], []) +AT_CHECK([sed 's/default encoding.*For/default encoding. For/' stdout], [0], [dnl +"warning: `pc+-file.sav': Using default encoding. For best results, specify an encoding explicitly. Use SYSFILE INFO with ENCODING=""DETECT"" to analyze the possible encodings." + +File label: PSPP synthetic test file + +Variable,Description,Position +NUM1,Format: F8.0,1 +NUM2,Format: F8.0,2 +NUM3,Format: F8.0,3 +NUM4,Format: F8.0,4 + +Table: Data List +NUM1,NUM2,NUM3,NUM4 +2,3,4,5 + +File:,pc+-file.sav +Label:,PSPP synthetic test file +Created:,11/28/14 15:11:00 by PCSPSS PSPP synthetic test product +Integer Format:,Little Endian +Real Format:,IEEE 754 LE. +Variables:,4 +Cases:,1 +Type:,SPSS/PC+ System File +Weight:,Not weighted. 
+Compression:,None +Encoding:,us-ascii + +Variable,Description,Position +NUM1,"Format: F8.0 +Measure: Scale +Role: Input +Display Alignment: Right +Display Width: 8",1 +NUM2,"Format: F8.0 +Measure: Scale +Role: Input +Display Alignment: Right +Display Width: 8",2 +NUM3,"Format: F8.0 +Measure: Scale +Role: Input +Display Alignment: Right +Display Width: 8",3 +NUM4,"Format: F8.0 +Measure: Scale +Role: Input +Display Alignment: Right +Display Width: 8",4 +]) +AT_CLEANUP + +AT_SETUP([unexpected fixed values]) +AT_KEYWORDS([sack synthetic PC+ file negative]) +AT_DATA([pc+-file.sack], [dnl +dnl File header. +>>1; 2;<< +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +0; 0; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; dnl Fixed. + s62 "PCSPSS PSPP synthetic test product"; + >>1.0<<; + 0; >>2<<; i16 1; dnl Fixed. + i16 0; + i16 7; + 1; + i16 0; dnl Fixed. + 3; + s8 "11/28/14"; + s8 "15:11:00"; + s64 "PSPP synthetic test file"; +MAIN_END: + +VARS: + 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; + 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; + + dnl Numeric variables. + 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM2"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM3"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM4"; PCSYSMIS; +VARS_END: + +DATA: + 0.0; "11/28/14"; 1.0; 2.0; 3.0; 4.0; 5.0; +DATA_END: +]) +AT_CHECK([sack --le pc+-file.sack > pc+-file.sav]) +AT_DATA([pc+-file.sps], [dnl +GET FILE='pc+-file.sav' ENCODING='us-ascii'. +]) +AT_CHECK([pspp -O format=csv pc+-file.sps], [0], [dnl +"warning: `pc+-file.sav' near offset 0x0: Directory fields have unexpected values (1,2)." + +warning: `pc+-file.sav' near offset 0x100: Record 0 specifies unexpected system missing value 1 (0x1p+0). + +"warning: `pc+-file.sav' near offset 0x100: Record 0 reserved fields have unexpected values (1,1,0,2,0)." + +warning: `pc+-file.sav' near offset 0x100: Record 0 case counts differ (1 versus 3). 
+]) +AT_CLEANUP + +AT_SETUP([short main header]) +AT_KEYWORDS([sack synthetic PC+ file negative]) +AT_DATA([pc+-file.sack], [dnl +dnl File header. +2; 0; +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +0; 0; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; dnl Fixed. + s62 "PCSPSS PSPP synthetic test product"; + PCSYSMIS; + 0; 0; i16 1; dnl Fixed. + i16 0; + i16 7; + 1; + i16 0; dnl Fixed. + 1; + s8 "11/28/14"; + s8 "15:11:00"; +MAIN_END: + +VARS: + 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; + 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; + + dnl Numeric variables. + 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM2"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM3"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM4"; PCSYSMIS; +VARS_END: + +DATA: + 0.0; "11/28/14"; 1.0; 2.0; 3.0; 4.0; 5.0; +DATA_END: +]) +AT_CHECK([sack --le pc+-file.sack > pc+-file.sav]) +AT_DATA([pc+-file.sps], [dnl +GET FILE='pc+-file.sav' ENCODING='us-ascii'. +]) +AT_CHECK([pspp -O format=csv pc+-file.sps], [1], [dnl +error: `pc+-file.sav' near offset 0x100: This is not an SPSS/PC+ system file. +]) +AT_CLEANUP + +AT_SETUP([long main header]) +AT_KEYWORDS([sack synthetic PC+ file negative]) +AT_DATA([pc+-file.sack], [dnl +dnl File header. +2; 0; +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +0; 0; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; dnl Fixed. + s62 "PCSPSS PSPP synthetic test product"; + PCSYSMIS; + 0; 0; i16 1; dnl Fixed. + i16 0; + i16 7; + 1; + i16 0; dnl Fixed. + 1; + s8 "11/28/14"; + s8 "15:11:00"; + >>s80 "PSPP synthetic test file"<<; +MAIN_END: + +VARS: + 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; + 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; + + dnl Numeric variables. 
+ 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM2"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM3"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM4"; PCSYSMIS; +VARS_END: + +DATA: + 0.0; "11/28/14"; 1.0; 2.0; 3.0; 4.0; 5.0; +DATA_END: +]) +AT_CHECK([sack --le pc+-file.sack > pc+-file.sav]) +AT_DATA([pc+-file.sps], [dnl +GET FILE='pc+-file.sav' ENCODING='us-ascii'. +]) +AT_CHECK([pspp -O format=csv pc+-file.sps], [0], [dnl +warning: `pc+-file.sav' near offset 0x100: Record 0 has unexpected length 192. +]) +AT_CLEANUP + +AT_SETUP([invalid compression type]) +AT_KEYWORDS([sack synthetic PC+ file negative]) +AT_DATA([pc+-file.sack], [dnl +dnl File header. +2; 0; +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +0; 0; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; dnl Fixed. + s62 "PCSPSS PSPP synthetic test product"; + PCSYSMIS; + 0; 0; i16 1; dnl Fixed. + i16 >>2<<; + i16 7; + 1; + i16 0; dnl Fixed. + 1; + s8 "11/28/14"; + s8 "15:11:00"; + s64 "PSPP synthetic test file"; +MAIN_END: + +VARS: + 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; + 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; + + dnl Numeric variables. + 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM2"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM3"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM4"; PCSYSMIS; +VARS_END: + +DATA: + 0.0; "11/28/14"; 1.0; 2.0; 3.0; 4.0; 5.0; +DATA_END: +]) +AT_CHECK([sack --le pc+-file.sack > pc+-file.sav]) +AT_DATA([pc+-file.sps], [dnl +GET FILE='pc+-file.sav' ENCODING='us-ascii'. +]) +AT_CHECK([pspp -O format=csv pc+-file.sps], [1], [dnl +error: `pc+-file.sav' near offset 0x100: Invalid compression type 2. +]) +AT_CLEANUP + +AT_SETUP([unrealistic number of cases]) +AT_KEYWORDS([sack synthetic PC+ file negative]) +AT_DATA([pc+-file.sack], [dnl +dnl File header. 
+2; 0; +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +0; 0; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; dnl Fixed. + s62 "PCSPSS PSPP synthetic test product"; + PCSYSMIS; + 0; 0; i16 1; dnl Fixed. + i16 0; + i16 7; + 1000; + i16 0; dnl Fixed. + 1000; + s8 "11/28/14"; + s8 "15:11:00"; + s64 "PSPP synthetic test file"; +MAIN_END: + +VARS: + 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; + 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; + + dnl Numeric variables. + 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM2"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM3"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM4"; PCSYSMIS; +VARS_END: + +DATA: + 0.0; "11/28/14"; 1.0; 2.0; 3.0; 4.0; 5.0; +DATA_END: +]) +AT_CHECK([sack --le pc+-file.sack > pc+-file.sav]) +AT_DATA([pc+-file.sps], [dnl +GET FILE='pc+-file.sav' ENCODING='us-ascii'. +]) +AT_CHECK([pspp -O format=csv pc+-file.sps], [0], [dnl +warning: `pc+-file.sav' near offset 0x100: Record 0 claims 1000 cases with 7 values per case (requiring at least 56000 bytes) but data record is only 56 bytes long. +]) +AT_CLEANUP + +AT_SETUP([labels bad offsets]) +AT_KEYWORDS([sack synthetic PC+ file negative]) +AT_DATA([pc+-file.sack], [dnl +dnl File header. +2; 0; +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +@LABELS; @LABELS_END - @LABELS; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; dnl Fixed. + s62 "PCSPSS PSPP synthetic test product"; + PCSYSMIS; + 0; 0; i16 1; dnl Fixed. + i16 0; + i16 7; + 1; + i16 0; dnl Fixed. + 1; + s8 "11/28/14"; + s8 "15:11:00"; + s64 "PSPP synthetic test file"; +MAIN_END: + +VARS: + 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; + 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; + + dnl Numeric variables. 
+ @N1L - @LOFF; @N1E - @LOFF; 1000; 0x050800; s8 "NUM1"; PCSYSMIS; + @N1L - @LOFF - 1; @LABELS_END - @LOFF; 0; 0x050800; s8 "NUM2"; PCSYSMIS; + @N1L - @LOFF + 1; @LABELS_END - @LOFF; 0; 0x050800; s8 "NUM3"; PCSYSMIS; + 0; 0; @LABELS_END - @LOFF - 1; 0x050800; s8 "NUM4"; PCSYSMIS; +VARS_END: + +LABELS: + 3; i8 0 0 0; LOFF: i8 0; + N1L: PCSYSMIS; +LABELS_END: + +DATA: + 0.0; "11/28/14"; 1.0; 2.0; 3.0; 4.0; 5.0; N1E: +DATA_END: +]) +AT_CHECK([sack --le pc+-file.sack > pc+-file.sav]) +AT_DATA([pc+-file.sps], [dnl +GET FILE='pc+-file.sav' ENCODING='us-ascii'. +]) +AT_CHECK([pspp -O format=csv pc+-file.sps], [0], [dnl +warning: `pc+-file.sav' near offset 0x210: Variable label claimed to start at offset 1007 in labels record but labels record is only 16 bytes. + +warning: `pc+-file.sav' near offset 0x210: Value labels claimed to end at offset 72 in labels record but labels record is only 16 bytes. + +"warning: `pc+-file.sav' near offset 0x2a0: Value labels end with partial label (0 bytes left in record, label length 255)." + +warning: `pc+-file.sav' near offset 0x299: 7 leftover bytes following value labels. + +warning: `pc+-file.sav' near offset 0x29f: Variable label with length 255 starting at offset 16 in labels record overruns end of 16-byte labels record. +]) +AT_CLEANUP + +AT_SETUP([record 1 bad length]) +AT_KEYWORDS([sack synthetic PC+ file negative]) +AT_DATA([pc+-file.sack], [dnl +dnl File header. +2; 0; +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +0; 0; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; dnl Fixed. + s62 "PCSPSS PSPP synthetic test product"; + PCSYSMIS; + 0; 0; i16 1; dnl Fixed. + i16 0; + i16 7; + 1; + i16 0; dnl Fixed. + 1; + s8 "11/28/14"; + s8 "15:11:00"; + s64 "PSPP synthetic test file"; +MAIN_END: + +VARS: + 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; + 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; + + dnl Numeric variables. 
+ 0; 0; 0; 0x050800; s8 "NUM2"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM3"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM4"; PCSYSMIS; +VARS_END: + +DATA: + 0.0; "11/28/14"; 1.0; 2.0; 3.0; 4.0; 5.0; +DATA_END: +]) +AT_CHECK([sack --le pc+-file.sack > pc+-file.sav]) +AT_DATA([pc+-file.sps], [dnl +GET FILE='pc+-file.sav' ENCODING='us-ascii'. +]) +AT_CHECK([pspp -O format=csv pc+-file.sps], [1], [dnl +error: `pc+-file.sav' near offset 0x1b0: Record 1 has length 192 (expected 224). +]) +AT_CLEANUP + +AT_SETUP([bad variable format]) +AT_KEYWORDS([sack synthetic PC+ file negative]) +AT_DATA([pc+-file.sack], [dnl +dnl File header. +2; 0; +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +0; 0; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; dnl Fixed. + s62 "PCSPSS PSPP synthetic test product"; + PCSYSMIS; + 0; 0; i16 1; dnl Fixed. + i16 0; + i16 7; + 1; + i16 0; dnl Fixed. + 1; + s8 "11/28/14"; + s8 "15:11:00"; + s64 "PSPP synthetic test file"; +MAIN_END: + +VARS: + 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; + 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; + + dnl Numeric variables. + 0; 0; 0; 0xff0000; s8 "NUM1"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM2"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM3"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM4"; PCSYSMIS; +VARS_END: + +DATA: + 0.0; "11/28/14"; 1.0; 2.0; 3.0; 4.0; 5.0; +DATA_END: +]) +AT_CHECK([sack --le pc+-file.sack > pc+-file.sav]) +AT_DATA([pc+-file.sps], [dnl +GET FILE='pc+-file.sav' ENCODING='us-ascii'. +]) +AT_CHECK([pspp -O format=csv pc+-file.sps], [1], [dnl +error: `pc+-file.sav' near offset 0x210: Variable 3 has invalid type 255. +]) +AT_CLEANUP + +AT_SETUP([bad variable name]) +AT_KEYWORDS([sack synthetic PC+ file negative]) +AT_DATA([pc+-file.sack], [dnl +dnl File header. +2; 0; +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +0; 0; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; dnl Fixed. 
+ s62 "PCSPSS PSPP synthetic test product"; + PCSYSMIS; + 0; 0; i16 1; dnl Fixed. + i16 0; + i16 7; + 1; + i16 0; dnl Fixed. + 1; + s8 "11/28/14"; + s8 "15:11:00"; + s64 "PSPP synthetic test file"; +MAIN_END: + +VARS: + 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; + 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; + + dnl Numeric variables. + 0; 0; 0; 0x050000; s8 "#NUM"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM2"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM3"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM4"; PCSYSMIS; +VARS_END: + +DATA: + 0.0; "11/28/14"; 1.0; 2.0; 3.0; 4.0; 5.0; +DATA_END: +]) +AT_CHECK([sack --le pc+-file.sack > pc+-file.sav]) +AT_DATA([pc+-file.sps], [dnl +GET FILE='pc+-file.sav' ENCODING='us-ascii'. +]) +AT_CHECK([pspp -O format=csv pc+-file.sps], [1], [dnl +error: `pc+-file.sav' near offset 0x210: Invalid variable name `#NUM'. +]) +AT_CLEANUP + +AT_SETUP([duplicate variable name]) +AT_KEYWORDS([sack synthetic PC+ file negative]) +AT_DATA([pc+-file.sack], [dnl +dnl File header. +2; 0; +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +0; 0; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; dnl Fixed. + s62 "PCSPSS PSPP synthetic test product"; + PCSYSMIS; + 0; 0; i16 1; dnl Fixed. + i16 0; + i16 7; + 1; + i16 0; dnl Fixed. + 1; + s8 "11/28/14"; + s8 "15:11:00"; + s64 "PSPP synthetic test file"; +MAIN_END: + +VARS: + 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; + 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; + + dnl Numeric variables. + 0; 0; 0; 0x050000; s8 "NUM1"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS; +VARS_END: + +DATA: + 0.0; "11/28/14"; 1.0; 2.0; 3.0; 4.0; 5.0; +DATA_END: +]) +AT_CHECK([sack --le pc+-file.sack > pc+-file.sav]) +AT_DATA([pc+-file.sps], [dnl +GET FILE='pc+-file.sav' ENCODING='us-ascii'. 
+]) +AT_CHECK([pspp -O format=csv pc+-file.sps], [0], [dnl +warning: `pc+-file.sav' near offset 0x230: Renaming variable with duplicate name `NUM1' to `VAR001'. + +warning: `pc+-file.sav' near offset 0x250: Renaming variable with duplicate name `NUM1' to `VAR002'. + +warning: `pc+-file.sav' near offset 0x270: Renaming variable with duplicate name `NUM1' to `VAR003'. +]) +AT_CLEANUP + +AT_SETUP([partial case]) +AT_KEYWORDS([sack synthetic PC+ file negative]) +AT_DATA([pc+-file.sack], [dnl +dnl File header. +2; 0; +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +0; 0; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; dnl Fixed. + s62 "PCSPSS PSPP synthetic test product"; + PCSYSMIS; + 0; 0; i16 1; dnl Fixed. + i16 0; + i16 7; + 3; + i16 0; dnl Fixed. + 3; + s8 "11/28/14"; + s8 "15:11:00"; + s64 "PSPP synthetic test file"; +MAIN_END: + +VARS: + 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; + 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; + + dnl Numeric variables. + 0; 0; 0; 0x050000; s8 "NUM1"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM2"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM3"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM4"; PCSYSMIS; +VARS_END: + +DATA: + 1.0; "11/28/14"; 1.0; 2.0; 3.0; 4.0; 5.0; + 2.0; "11/28/14"; 1.0; 6.0; 7.0; 8.0; 9.0; + 3.0; "11/28/14"; 1.0; 10.0; +DATA_END: +]) +AT_CHECK([sack --le pc+-file.sack > pc+-file.sav]) +AT_DATA([pc+-file.sps], [dnl +GET FILE='pc+-file.sav' ENCODING='us-ascii'. +LIST. +]) +AT_CHECK([pspp -O format=csv pc+-file.sps], [1], [dnl +warning: `pc+-file.sav' near offset 0x100: Record 0 claims 3 cases with 7 values per case (requiring at least 168 bytes) but data record is only 144 bytes long. + +error: `pc+-file.sav' near offset 0x320: File ends in partial case. + +error: Error reading case from file `pc+-file.sav'. 
+ +Table: Data List +NUM1,NUM2,NUM3,NUM4 +2,3,4,5 +6,7,8,9 +]) +AT_CLEANUP + +AT_SETUP([case extends past end of data record]) +AT_KEYWORDS([sack synthetic PC+ file negative]) +AT_DATA([pc+-file.sack], [dnl +dnl File header. +2; 0; +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +0; 0; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; dnl Fixed. + s62 "PCSPSS PSPP synthetic test product"; + PCSYSMIS; + 0; 0; i16 1; dnl Fixed. + i16 0; + i16 7; + 3; + i16 0; dnl Fixed. + 3; + s8 "11/28/14"; + s8 "15:11:00"; + s64 "PSPP synthetic test file"; +MAIN_END: + +VARS: + 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; + 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; + + dnl Numeric variables. + 0; 0; 0; 0x050000; s8 "NUM1"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM2"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM3"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM4"; PCSYSMIS; +VARS_END: + +DATA: + 1.0; "11/28/14"; 1.0; 2.0; 3.0; 4.0; 5.0; + 2.0; "11/28/14"; 1.0; 6.0; 7.0; 8.0; 9.0; + 3.0; "11/28/14"; 1.0; 10.0; +DATA_END: + 11.0; 12.0; 13.0; +]) +AT_CHECK([sack --le pc+-file.sack > pc+-file.sav]) +AT_DATA([pc+-file.sps], [dnl +GET FILE='pc+-file.sav' ENCODING='us-ascii'. +LIST. +]) +AT_CHECK([pspp -O format=csv pc+-file.sps], [1], [dnl +warning: `pc+-file.sav' near offset 0x100: Record 0 claims 3 cases with 7 values per case (requiring at least 168 bytes) but data record is only 144 bytes long. + +error: `pc+-file.sav' near offset 0x338: Case beginning at offset 0x00000300 extends past end of data record at offset 0x00000320. + +error: Error reading case from file `pc+-file.sav'. + +Table: Data List +NUM1,NUM2,NUM3,NUM4 +2,3,4,5 +6,7,8,9 +]) +AT_CLEANUP + +AT_SETUP([corrupt compressed data]) +AT_KEYWORDS([sack synthetic PC+ file positive]) +AT_DATA([pc+-file.sack], [dnl +dnl File header. 
+2; 0; +@MAIN; @MAIN_END - @MAIN; +@VARS; @VARS_END - @VARS; +0; 0; +@DATA; @DATA_END - @DATA; +(0; 0) * 11; +i8 0 * 128; + +MAIN: + i16 1; dnl Fixed. + s62 "PCSPSS PSPP synthetic test product"; + PCSYSMIS; + 0; 0; i16 1; dnl Fixed. + i16 1; + i16 9; + 2; + i16 0; dnl Fixed. + 2; + s8 "11/28/14"; + s8 "15:11:00"; + s64 "PSPP synthetic test file"; +MAIN_END: + +VARS: + 0; 0; 0; 0x050800; s8 "$CASENUM"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "$DATE"; PCSYSMIS; + 0; 0; 0; 0x050802; s8 "$WEIGHT"; PCSYSMIS; + + dnl Numeric variables. + 0; 0; 0; 0x050800; s8 "NUM1"; PCSYSMIS; + 0; 0; 0; 0x050800; s8 "NUM2"; PCSYSMIS; + + dnl String variables. + 0; 0; 0; 0x010400; s8 "STR4"; PCSYSMIS; + 0; 0; 0; 0x010800; s8 "STR8"; PCSYSMIS; + 0; 0; 0; 0x010f00; s8 "STR15"; PCSYSMIS; + 0 * 8; +VARS_END: + +DATA: + i8 101 1 101 100 255 1 1 1; + s8 "11/28/14"; s8 "abcd"; s8 "efghj"; s8 "efghijkl"; + i8 1; i8 102 101 101 1 0 1 1; + s8 "ABCDEFG"; 1000.0; s8 "PQRS"; s8 "TUVWXYZa"; + i8 1 1 0 0 0 0 0 0; + s16 "bcdefghijklmnop"; +DATA_END: +]) +AT_CHECK([sack --le pc+-file.sack > pc+-file.sav]) +AT_DATA([pc+-file.sps], [dnl +GET FILE='pc+-file.sav' ENCODING='us-ascii'. +DISPLAY FILE LABEL. +DISPLAY DICTIONARY. +LIST. +]) +AT_CHECK([pspp -O format=csv pc+-file.sps], [0], [dnl +File label: PSPP synthetic test file + +Variable,Description,Position +NUM1,Format: F8.0,1 +NUM2,Format: F8.0,2 +STR4,Format: A4,3 +STR8,Format: A8,4 +STR15,Format: A15,5 + +warning: `pc+-file.sav' near offset 0x308: Possible compressed data corruption: string contains compressed integer (opcode 101). 
+ +Table: Data List +NUM1,NUM2,STR4,STR8,STR15 +-5,150,abcd,efghj ,efghijklABCDEFG +1000,.,PQRS,TUVWXYZa,bcdefghijklmnop +]) +AT_CLEANUP diff --git a/tests/data/por-file.at b/tests/data/por-file.at index a492726ee2..1cc51fc1d6 100644 --- a/tests/data/por-file.at +++ b/tests/data/por-file.at @@ -114,4 +114,62 @@ Table: Data List VAR1,VAR2,VAR3,VAR4,VAR5 1,2,3,4,5 ]) +AT_DATA([sys-file-info.sps], [SYSFILE INFO FILE='data.por' +]) +AT_CHECK([pspp -O format=csv sys-file-info.sps | sed '/Encoding/d +/Integer Format/d +/Real Format/d +/Created/d +'], [0], [dnl +File:,data.por +Label:,No label. +Product:,x86_64-unknown-linux-gnu +Variables:,5 +Cases:,Unknown +Type:,SPSS Portable File +Weight:,Not weighted. +Compression:,None + +Variable,Description,Position +VAR1,"Format: F1.0 +Measure: Scale +Role: Input +Display Alignment: Right +Display Width: 8 + +Value,Label +1,one",1 +VAR2,"Format: F1.0 +Measure: Scale +Role: Input +Display Alignment: Right +Display Width: 8 + +Value,Label +2,two",2 +VAR3,"Format: F1.0 +Measure: Scale +Role: Input +Display Alignment: Right +Display Width: 8 + +Value,Label +3,three",3 +VAR4,"Format: F1.0 +Measure: Scale +Role: Input +Display Alignment: Right +Display Width: 8 + +Value,Label +4,four",4 +VAR5,"Format: F1.0 +Measure: Scale +Role: Input +Display Alignment: Right +Display Width: 8 + +Value,Label +5,five",5 +]) AT_CLEANUP diff --git a/tests/data/sack.c b/tests/data/sack.c index 9367f00450..f27edacaa0 100644 --- a/tests/data/sack.c +++ b/tests/data/sack.c @@ -29,6 +29,8 @@ #include "libpspp/assertion.h" #include "libpspp/compiler.h" #include "libpspp/float-format.h" +#include "libpspp/hash-functions.h" +#include "libpspp/hmap.h" #include "libpspp/integer-format.h" #include "gl/c-ctype.h" @@ -52,16 +54,23 @@ enum token_type T_EOF, T_INTEGER, T_FLOAT, + T_PCSYSMIS, T_STRING, T_SEMICOLON, T_ASTERISK, T_LPAREN, T_RPAREN, T_I8, + T_I16, T_I64, T_S, T_COUNT, - T_HEX + T_COUNT8, + T_HEX, + T_LABEL, + T_AT, + T_MINUS, + T_PLUS, }; static enum 
token_type token; @@ -70,6 +79,16 @@ static double tok_float; static char *tok_string; static size_t tok_strlen, tok_allocated; +/* Symbol table. */ +struct symbol + { + struct hmap_node hmap_node; + const char *name; + unsigned int offset; + }; + +static struct hmap symbol_table = HMAP_INITIALIZER (symbol_table); + /* --be, --le: Integer and floating-point formats. */ static enum float_format float_format = FLOAT_IEEE_DOUBLE_BE; static enum integer_format integer_format = INTEGER_MSB_FIRST; @@ -136,8 +155,6 @@ get_token (void) } else if (isdigit (c) || c == '-') { - char *tail; - do { add_char (c); @@ -147,19 +164,26 @@ get_token (void) add_char__ ('\0'); ungetc (c, input); - errno = 0; - if (strchr (tok_string, '.') == NULL) - { - token = T_INTEGER; - tok_integer = strtoull (tok_string, &tail, 0); - } + if (!strcmp (tok_string, "-")) + token = T_MINUS; else { - token = T_FLOAT; - tok_float = strtod (tok_string, &tail); + char *tail; + + errno = 0; + if (strchr (tok_string, '.') == NULL) + { + token = T_INTEGER; + tok_integer = strtoull (tok_string, &tail, 0); + } + else + { + token = T_FLOAT; + tok_float = strtod (tok_string, &tail); + } + if (errno || *tail) + fatal ("invalid numeric syntax \"%s\"", tok_string); } - if (errno || *tail) - fatal ("invalid numeric syntax"); } else if (c == '"') { @@ -176,23 +200,38 @@ get_token (void) token = T_SEMICOLON; else if (c == '*') token = T_ASTERISK; + else if (c == '+') + token = T_PLUS; else if (c == '(') token = T_LPAREN; else if (c == ')') token = T_RPAREN; - else if (isalpha (c)) + else if (isalpha (c) || c == '@' || c == '_') { do { add_char (c); c = getc (input); } - while (isdigit (c) || isalpha (c) || c == '.'); + while (isdigit (c) || isalpha (c) || c == '.' 
|| c == '_'); add_char ('\0'); + + if (c == ':') + { + token = T_LABEL; + return; + } ungetc (c, input); + if (tok_string[0] == '@') + { + token = T_AT; + return; + } if (!strcmp (tok_string, "i8")) token = T_I8; + else if (!strcmp (tok_string, "i16")) + token = T_I16; else if (!strcmp (tok_string, "i64")) token = T_I64; else if (tok_string[0] == 's') @@ -205,6 +244,8 @@ get_token (void) token = T_FLOAT; tok_float = -DBL_MAX; } + else if (!strcmp (tok_string, "PCSYSMIS")) + token = T_PCSYSMIS; else if (!strcmp (tok_string, "LOWEST")) { token = T_FLOAT; @@ -222,6 +263,8 @@ get_token (void) } else if (!strcmp (tok_string, "COUNT")) token = T_COUNT; + else if (!strcmp (tok_string, "COUNT8")) + token = T_COUNT8; else if (!strcmp (tok_string, "hex")) token = T_HEX; else @@ -288,14 +331,13 @@ stdout. A data item is one of the following\n\ to fill up bytes. For example, s8 \"foo\" is output as\n\ the \"foo\" followed by 5 spaces.\n\ \n\ - - The literal \"i8\" followed by an integer. Output as a single\n\ - byte with the specified value.\n\ -\n\ - - The literal \"i64\" followed by an integer. Output as a 64-bit\n\ - binary integer.\n\ + - The literal \"i8\", \"i16\", or \"i64\" followed by an integer. Output\n\ + as a binary integer with the specified number of bits.\n\ \n\ - One of the literals SYSMIS, LOWEST, or HIGHEST. Output as a\n\ 64-bit IEEE 754 float of the appropriate PSPP value.\n\ +\n\ + - PCSYSMIS. Output as SPSS/PC+ system-missing value.\n\ \n\ - The literal ENDIAN. Output as a 32-bit binary integer, either\n\ with value 1 if --be is in effect or 2 if --le is in effect.\n\ @@ -304,9 +346,9 @@ stdout. A data item is one of the following\n\ followed by a semicolon (the last semicolon is optional).\n\ Output as the enclosed data items in sequence.\n\ \n\ - - The literal COUNT followed by a sequence of parenthesized data\n\ - items, as above. 
Output as a 32-bit binary integer whose value\n\ - is the number of bytes enclosed within the parentheses, followed\n\ + - The literal COUNT or COUNT8 followed by a sequence of parenthesized\n\ + data items, as above. Output as a 32-bit or 8-bit binary integer whose\n\ + value is the number of bytes enclosed within the parentheses, followed\n\ by the enclosed data items themselves.\n\ \n\ optionally followed by an asterisk and a positive integer, which\n\ @@ -371,6 +413,27 @@ parse_options (int argc, char **argv) return argv[optind]; } +static struct symbol * +symbol_find (const char *name) +{ + struct symbol *symbol; + unsigned int hash; + + if (name[0] == '@') + name++; + hash = hash_string (name, 0); + HMAP_FOR_EACH_WITH_HASH (symbol, struct symbol, hmap_node, + hash, &symbol_table) + if (!strcmp (name, symbol->name)) + return symbol; + + symbol = xmalloc (sizeof *symbol); + hmap_insert (&symbol_table, &symbol->hmap_node, hash); + symbol->name = xstrdup (name); + symbol->offset = UINT_MAX; + return symbol; +} + static void parse_data_item (struct buffer *output) { @@ -388,6 +451,13 @@ parse_data_item (struct buffer *output) float_format, buffer_put_uninit (output, 8)); get_token (); } + else if (token == T_PCSYSMIS) + { + static const uint8_t pcsysmis[] = + { 0xf5, 0x1e, 0x26, 0x02, 0x8a, 0x8c, 0xed, 0xff, }; + buffer_put (output, pcsysmis, sizeof pcsysmis); + get_token (); + } else if (token == T_I8) { uint8_t byte; @@ -403,6 +473,19 @@ parse_data_item (struct buffer *output) } while (token == T_INTEGER); } + else if (token == T_I16) + { + get_token (); + do + { + if (token != T_INTEGER) + fatal ("integer expected after `i16'"); + integer_put (tok_integer, integer_format, + buffer_put_uninit (output, 2), 2); + get_token (); + } + while (token == T_INTEGER); + } else if (token == T_I64) { get_token (); @@ -464,6 +547,22 @@ parse_data_item (struct buffer *output) integer_put (output->size - old_size - 4, integer_format, output->data + old_size, 4); } + else if 
(token == T_COUNT8) + { + buffer_put_uninit (output, 1); + + get_token (); + if (token != T_LPAREN) + fatal ("`(' expected after COUNT8"); + get_token (); + + while (token != T_RPAREN) + parse_data_item (output); + get_token (); + + integer_put (output->size - old_size - 1, integer_format, + output->data + old_size, 1); + } else if (token == T_HEX) { const char *p; @@ -491,6 +590,42 @@ parse_data_item (struct buffer *output) } get_token (); } + else if (token == T_LABEL) + { + struct symbol *sym = symbol_find (tok_string); + if (sym->offset == UINT_MAX) + sym->offset = output->size; + else if (sym->offset != output->size) + fatal ("%s: can't redefine label for offset %u with offset %u", + tok_string, sym->offset, output->size); + get_token (); + return; + } + else if (token == T_AT) + { + unsigned int value = symbol_find (tok_string)->offset; + get_token (); + + while (token == T_MINUS || token == T_PLUS) + { + enum token_type op = token; + unsigned int operand; + get_token (); + if (token == T_AT) + operand = symbol_find (tok_string)->offset; + else if (token == T_INTEGER) + operand = tok_integer; + else + fatal ("expecting @label"); + get_token (); + + if (op == T_PLUS) + value += operand; + else + value -= operand; + } + integer_put (value, integer_format, buffer_put_uninit (output, 4), 4); + } else fatal ("syntax error"); @@ -548,6 +683,24 @@ main (int argc, char **argv) while (token != T_EOF) parse_data_item (&output); + if (!hmap_is_empty (&symbol_table)) + { + struct symbol *symbol; + + HMAP_FOR_EACH (symbol, struct symbol, hmap_node, &symbol_table) + if (symbol->offset == UINT_MAX) + error (1, 0, "label %s used but never defined", symbol->name); + + output.size = 0; + if (fseek (input, 0, SEEK_SET) != 0) + error (1, 0, "failed to rewind stdin for second pass"); + + line_number = 1; + get_token (); + while (token != T_EOF) + parse_data_item (&output); + } + if (input != stdin) fclose (input); diff --git a/tests/language/dictionary/sys-file-info.at 
b/tests/language/dictionary/sys-file-info.at index 4402cd598e..e822506b33 100644 --- a/tests/language/dictionary/sys-file-info.at +++ b/tests/language/dictionary/sys-file-info.at @@ -29,7 +29,7 @@ File:,pro.sav Label:,No label. Variables:,2 Cases:,3 -Type:,System File +Type:,SPSS System File Weight:,Not weighted. Compression:,SAV diff --git a/tests/perl-module.at b/tests/perl-module.at index e8b9524b1c..5424d10324 100644 --- a/tests/perl-module.at +++ b/tests/perl-module.at @@ -571,7 +571,7 @@ AT_DATA([test.pl], print $PSPP::errstr, "\n"; ]]) AT_CHECK([RUN_PERL_MODULE test.pl], [0], - [[Error opening `no-such-file.sav' for reading as a system file: No such file or directory. + [[An error occurred while opening `no-such-file.sav': No such file or directory. ]], [[Name "PSPP::errstr" used only once: possible typo at test.pl line 8. ]]) diff --git a/utilities/automake.mk b/utilities/automake.mk index 101c7cfd6d..d6a720bb34 100644 --- a/utilities/automake.mk +++ b/utilities/automake.mk @@ -11,3 +11,8 @@ dist_man_MANS += utilities/pspp-convert.1 utilities_pspp_convert_SOURCES = utilities/pspp-convert.c utilities_pspp_convert_CPPFLAGS = $(AM_CPPFLAGS) -DINSTALLDIR=\"$(bindir)\" utilities_pspp_convert_LDADD = src/libpspp-core.la + +utilities_pspp_convert_LDFLAGS = $(PSPP_LDFLAGS) $(PG_LDFLAGS) +if RELOCATABLE_VIA_LD +utilities_pspp_convert_LDFLAGS += `$(RELOCATABLE_LDFLAGS) $(bindir)` +endif diff --git a/utilities/pspp-convert.1 b/utilities/pspp-convert.1 index d4c3a7b11d..2dc608980e 100644 --- a/utilities/pspp-convert.1 +++ b/utilities/pspp-convert.1 @@ -17,14 +17,16 @@ pspp\-convert \- convert SPSS system and portable files to other formats \fBpspp\-convert \-\-version\fR | \fB\-v\fR . .SH DESCRIPTION -The \fBpspp\-convert\fR program reads SPSS system or portable file -\fIinput\fR and writes it to \fIoutput\fR, performing format -conversion as necessary. 
+The \fBpspp\-convert\fR program reads \fIinput\fR, which may be an +SPSS system file, an SPSS/PC+ system file, or an SPSS portable file, +and writes it to \fIoutput\fR, performing format conversion as +necessary. .PP -The format of \fIinput\fR is automatically detected, except that the -character encoding of old system files cannot always be guessed -correctly. Use \fB\-e \fIencoding\fR to specify the encoding in this -case. +The format of \fIinput\fR is automatically detected, when possible. +The character encoding of old SPSS system files cannot always be +guessed correctly, and SPSS/PC+ system files do not include any +indication of their encoding. Use \fB\-e \fIencoding\fR to specify +the encoding in this case. .PP By default, the intended format for \fIoutput\fR is inferred from its extension: diff --git a/utilities/pspp-convert.c b/utilities/pspp-convert.c index 2dea20d2f3..ebd340ec35 100644 --- a/utilities/pspp-convert.c +++ b/utilities/pspp-convert.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2013 Free Software Foundation, Inc. + Copyright (C) 2013, 2014 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -156,7 +156,7 @@ main (int argc, char *argv[]) } input_fh = fh_create_file (NULL, input_filename, fh_default_properties ()); - reader = any_reader_open (input_fh, encoding, &dict); + reader = any_reader_open_and_decode (input_fh, encoding, &dict, NULL); if (reader == NULL) exit (1); -- 2.30.2