From a1c1a4ca100da5c40fe8637b1d460e61a4a8668e Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Tue, 22 Oct 2013 22:56:18 -0700 Subject: [PATCH] Implement ZLIB compressed system file reader and writer. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Thanks to Hugo Alejandro and Bastián Díaz for providing sample files in ZLIB compressed format to use as samples. --- NEWS | 4 + doc/dev/system-file-format.texi | 180 ++++++++-- doc/files.texi | 33 +- perl-module/PSPP.xs | 4 +- src/data/automake.mk | 1 + src/data/sys-file-private.h | 17 +- src/data/sys-file-reader.c | 394 ++++++++++++++++++-- src/data/sys-file-reader.h | 3 +- src/data/sys-file-writer.c | 204 ++++++++++- src/data/sys-file-writer.h | 5 +- src/data/sys-file.h | 28 ++ src/language/data-io/save.c | 7 +- src/language/dictionary/sys-file-info.c | 7 +- tests/data/sack.c | 83 ++++- tests/data/sys-file-reader.at | 398 ++++++++++++++++++++- tests/data/sys-file.at | 33 +- tests/language/dictionary/sys-file-info.at | 2 +- utilities/pspp-dump-sav.c | 139 ++++++- 18 files changed, 1444 insertions(+), 98 deletions(-) create mode 100644 src/data/sys-file.h diff --git a/NEWS b/NEWS index 5b166c3ef1..e3da3e489a 100644 --- a/NEWS +++ b/NEWS @@ -9,6 +9,10 @@ Changes since 0.8.1: * Charts are now rendered with colours from the Tango palette instead of fully saturated primaries. + * PSPP can now read and write ZCOMPRESSED system files, a new format + variant that compresses data much more effectively than the + previous form of compression (which is still supported). + * Missing values for long string variables are now read from and written to system files in an SPSS-compatible fashion. diff --git a/doc/dev/system-file-format.texi b/doc/dev/system-file-format.texi index f408ff2866..89c35aab3f 100644 --- a/doc/dev/system-file-format.texi +++ b/doc/dev/system-file-format.texi @@ -56,6 +56,18 @@ appears in system files only in missing value ranges, which never contain SYSMIS. 
@end table +System files may use most character encodings based on an 8-bit unit. +UTF-16 and UTF-32, based on wider units, appear to be unacceptable. +@code{rec_type} in the file header record is sufficient to distinguish +between ASCII and EBCDIC based encodings. The best way to determine +the specific encoding in use is to consult the character encoding +record (@pxref{Character Encoding Record}), if present, and failing +that the @code{character_code} in the machine integer info record +(@pxref{Machine Integer Info Record}). The same encoding should be +used for the dictionary and the data in the file, although it is +possible to artificially synthesize files that use different encodings +(@pxref{Character Encoding Record}). + System files are divided into records, each of which begins with a 4-byte record type, usually regarded as an @code{int32}. @@ -121,7 +133,7 @@ char rec_type[4]; char prod_name[60]; int32 layout_code; int32 nominal_case_size; -int32 compressed; +int32 compression; int32 weight_index; int32 ncases; flt64 bias; @@ -133,9 +145,15 @@ char padding[3]; @table @code @item char rec_type[4]; -Record type code, set to @samp{$FL2}, that is, either @code{24 46 4c -32} if the file uses an ASCII-based character encoding, or @code{5b c6 -d3 f2} if the file uses an EBCDIC-based character encoding. +Record type code, either @samp{$FL2} for system files with +uncompressed data or data compressed with simple bytecode compression, +or @samp{$FL3} for system files with ZLIB compressed data. + +This is truly a character field that uses the same character encoding as +other strings. Thus, in a file with an ASCII-based character encoding +this field contains @code{24 46 4c 32} or @code{24 46 4c 33}, and in a +file with an EBCDIC-based encoding this field contains @code{5b c6 d3 +f2}. (No EBCDIC-based ZLIB-compressed files have been observed.) @item char prod_name[60]; Product identification string. 
This always begins with the characters @@ -160,7 +178,10 @@ files written by some systems set this value to -1. In general, it is unsafe for systems reading system files to rely upon this value. @item int32 compressed; -Set to 1 if the data in the file is compressed, 0 otherwise. +Set to 0 if the data in the file is not compressed, 1 if the data is +compressed with simple bytecode compression, 2 if the data is ZLIB +compressed. This field has value 2 if and only if @code{rec_type} is +@samp{$FL3}. @item int32 weight_index; If one of the variables in the data set is used as a weighting @@ -577,7 +598,8 @@ Floating point representation code. For IEEE 754 systems this is 1. IBM 370 sets this to 2, and DEC VAX E to 3. @item int32 compression_code; -Compression code. Always set to 1. +Compression code. Always set to 1, regardless of whether or how the +file is compressed. @item int32 endianness; Machine endianness. 1 indicates big-endian, 2 indicates little-endian. @@ -1434,22 +1456,23 @@ Ignored padding. Should be set to 0. @node Data Record @section Data Record -Data records must follow all other records in the system file. There must -be at least one data record in every system file. - -The format of data records varies depending on whether the data is -compressed. Regardless, the data is arranged in a series of 8-byte -elements. +The data record must follow all other records in the system file. +Every system file must have a data record that specifies data for at +least one case. The format of the data record varies depending on the +value of @code{compression} in the file header record: -When data is not compressed, -each element corresponds to +@table @asis +@item 0: no compression +Data is arranged as a series of 8-byte elements. +Each element corresponds to the variable declared in the respective variable record (@pxref{Variable Record}). 
Numeric values are given in @code{flt64} format; string values are literal characters string, padded on the right when necessary to fill out 8-byte units. -Compressed data is arranged in the following manner: the first 8 bytes -in the data section is divided into a series of 1-byte command +@item 1: bytecode compression +The first 8 bytes +of the data record is divided into a series of 1-byte command codes. These codes have meanings as described below: @table @asis @@ -1487,8 +1510,125 @@ An 8-byte string value that is all spaces. The system-missing value. @end table -When the end of the an 8-byte group of command bytes is reached, any -blocks of non-compressible values indicated by code 253 are skipped, -and the next element of command bytes is read and interpreted, until -the end of the file or a code with value 252 is reached. +The end of the 8-byte group of bytecodes is followed by any 8-byte +blocks of non-compressible values indicated by code 253. After that +follows another 8-byte group of bytecodes, then those bytecodes' +non-compressible values. The pattern repeats to the end of the file +or a code with value 252. + +@item 2: ZLIB compression +The data record consists of the following, in order: + +@itemize @bullet +@item +ZLIB data header, 24 bytes long. + +@item +One or more variable-length blocks of ZLIB compressed data. + +@item +ZLIB data trailer, with a 24-byte fixed header plus an additional 24 +bytes for each preceding ZLIB compressed data block. +@end itemize + +The ZLIB data header has the following format: + +@example +int64 zheader_ofs; +int64 ztrailer_ofs; +int64 ztrailer_len; +@end example + +@table @code +@item int64 zheader_ofs; +The offset, in bytes, of the beginning of this structure within the +system file. + +@item int64 ztrailer_ofs; +The offset, in bytes, of the first byte of the ZLIB data trailer. + +@item int64 ztrailer_len; +The number of bytes in the ZLIB data trailer. 
This and the previous +field sum to the size of the system file in bytes. +@end table + +The data header is followed by @code{(ztrailer_len - 24) / 24} ZLIB +compressed data blocks. Each ZLIB compressed data block begins with a +ZLIB header as specified in RFC@tie{}1950, e.g.@: hex bytes @code{78 +01} (the only header yet observed in practice). Each block +decompresses to a fixed number of bytes (in practice only +@code{0x3ff000}-byte blocks have been observed), except that the last +block of data may be shorter. The last ZLIB compressed data block +ends just before offset @code{ztrailer_ofs}. + +The result of ZLIB decompression is bytecode compressed data as +described above for compression format 1. + +The ZLIB data trailer begins with the following 24-byte fixed header: + +@example +int64 int_bias; +int64 zero; +int32 block_size; +int32 n_blocks; +@end example + +@table @code +@item int64 int_bias; +The compression bias as a negative integer, e.g.@: if @code{bias} in +the file header record is 100.0, then @code{int_bias} is @minus{}100 +(this is the only value yet observed in practice). + +@item int64 zero; +Always observed to be zero. + +@item int32 block_size; +The number of bytes in each ZLIB compressed data block, except +possibly the last, following decompression. Only @code{0x3ff000} has +been observed so far. + +@item int32 n_blocks; +The number of ZLIB compressed data blocks, always exactly +@code{(ztrailer_len - 24) / 24}. +@end table + +The fixed header is followed by @code{n_blocks} 24-byte ZLIB data +block descriptors, each of which describes the compressed data block +corresponding to its offset. Each block descriptor has the following +format: + +@example +int64 uncompressed_ofs; +int64 compressed_ofs; +int32 uncompressed_size; +int32 compressed_size; +@end example + +@table @code +@item int64 uncompressed_ofs; +The offset, in bytes, that this block of data would have in a similar +system file that uses compression format 1. 
This is +@code{zheader_ofs} in the first block descriptor, and in each +succeeding block descriptor it is the sum of the previous descriptor's +@code{uncompressed_ofs} and @code{uncompressed_size}. + +@item int64 compressed_ofs; +The offset, in bytes, of the actual beginning of this compressed data +block. This is @code{zheader_ofs + 24} in the first block descriptor, +and in each succeeding block descriptor it is the sum of the previous +descriptor's @code{compressed_ofs} and @code{compressed_size}. The +final block descriptor's @code{compressed_ofs} and +@code{compressed_size} sum to @code{ztrailer_ofs}. + +@item int32 uncompressed_size; +The number of bytes in this data block, after decompression. This is +@code{block_size} in every data block except the last, which may be +smaller. + +@item int32 compressed_size; +The number of bytes in this data block, as stored compressed in this +system file. +@end table +@end table + @setfilename ignored diff --git a/doc/files.texi b/doc/files.texi index 7a97823819..369a2e46f4 100644 --- a/doc/files.texi +++ b/doc/files.texi @@ -684,7 +684,7 @@ Use of @cmd{IMPORT} to read a system file is a @pspp{} extension. SAVE /OUTFILE=@{'@var{file_name}',@var{file_handle}@} /UNSELECTED=@{RETAIN,DELETE@} - /@{COMPRESSED,UNCOMPRESSED@} + /@{UNCOMPRESSED,COMPRESSED,ZCOMPRESSED@} /PERMISSIONS=@{WRITEABLE,READONLY@} /DROP=@var{var_list} /KEEP=@var{var_list} @@ -706,9 +706,32 @@ By default, cases excluded with FILTER are written to the system file. These can be excluded by specifying @subcmd{DELETE} on the @subcmd{UNSELECTED} subcommand. Specifying @subcmd{RETAIN} makes the default explicit. -The @subcmd{COMPRESS} and @subcmd{UNCOMPRESS} subcommand determine whether -the saved system file is compressed. By default, system files are compressed. -This default can be changed with the SET command (@pxref{SET}). 
+The @subcmd{UNCOMPRESSED}, @subcmd{COMPRESSED}, and +@subcmd{ZCOMPRESSED} subcommands determine the system file's +compression level: + +@table @code +@item UNCOMPRESSED +Data is not compressed. Each numeric value uses 8 bytes of disk +space. Each string value uses one byte per column width, rounded up +to a multiple of 8 bytes. + +@item COMPRESSED +Data is compressed with a simple algorithm. Each integer numeric +value between @minus{}99 and 151, inclusive, or system missing value +uses one byte of disk space. Each 8-byte segment of a string that +consists only of spaces uses 1 byte. Any other numeric value or +8-byte string segment uses 9 bytes of disk space. + +@item ZCOMPRESSED +Data is compressed with the ``deflate'' compression algorithm +specified in RFC@tie{}1951 (the same algorithm used by +@command{gzip}). Files written with this compression level cannot be +read by PSPP 0.8.1 or earlier or by SPSS 20 or earlier. +@end table + +@subcmd{COMPRESSED} is the default compression level. The SET command +(@pxref{SET}) can change this default. The @subcmd{PERMISSIONS} subcommand specifies permissions for the new system file. WRITEABLE, the default, creates the file with read and write @@ -938,7 +961,7 @@ the data is read by a procedure or procedure-like command. @display XSAVE /OUTFILE='@var{file_name}' - /@{COMPRESSED,UNCOMPRESSED@} + /@{UNCOMPRESSED,COMPRESSED,ZCOMPRESSED@} /PERMISSIONS=@{WRITEABLE,READONLY@} /DROP=@var{var_list} /KEEP=@var{var_list} diff --git a/perl-module/PSPP.xs b/perl-module/PSPP.xs index 0895f641a8..802aabf5c0 100644 --- a/perl-module/PSPP.xs +++ b/perl-module/PSPP.xs @@ -632,7 +632,9 @@ INIT: SV** version = hv_fetch(opt_h, "version", 7, 0); opts.create_writeable = readonly ? ! SvIV (*readonly) : true; - opts.compress = compress ? SvIV (*compress) : false; + opts.compression = (compress && SvIV (*compress) + ? SFM_COMP_SIMPLE + : SFM_COMP_NONE); opts.version = version ? 
SvIV (*version) : 3 ; } CODE: diff --git a/src/data/automake.mk b/src/data/automake.mk index 4385fd6d63..9ce405bb65 100644 --- a/src/data/automake.mk +++ b/src/data/automake.mk @@ -107,6 +107,7 @@ src_data_libdata_la_SOURCES = \ src/data/sys-file-reader.h \ src/data/sys-file-writer.c \ src/data/sys-file-writer.h \ + src/data/sys-file.h \ src/data/transformations.c \ src/data/transformations.h \ src/data/val-type.h \ diff --git a/src/data/sys-file-private.h b/src/data/sys-file-private.h index 21ff8ade3a..a39b0c1aa3 100644 --- a/src/data/sys-file-private.h +++ b/src/data/sys-file-private.h @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2006-2007, 2009-2012 Free Software Foundation, Inc. + Copyright (C) 2006-2007, 2009-2013 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -35,14 +35,19 @@ struct dictionary; -/* Magic numbers. +/* ASCII magic numbers. */ +#define ASCII_MAGIC "$FL2" /* For regular files. */ +#define ASCII_ZMAGIC "$FL3" /* For ZLIB compressed files. */ - Both of these are actually $FL2 in the respective character set. The "FL2" - part is invariant among national variants of each character set, but "$" has - different encodings, so it is safer to write them as hexadecimal. */ -#define ASCII_MAGIC "\x24\x46\x4c\x32" +/* EBCDIC magic number, the same as ASCII_MAGIC but encoded in EBCDIC. + + No EBCDIC ZLIB compressed files have been observed, so we do not define + EBCDIC_ZMAGIC even though the value is obvious. */ #define EBCDIC_MAGIC "\x5b\xc6\xd3\xf2" +/* Amount of data that ZLIB compressed data blocks typically decompress to. */ +#define ZBLOCK_SIZE 0x3ff000 + /* A variable in a system file. 
*/ struct sfm_var { diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index d553b3a0e5..9a4ef86042 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include "data/attributes.h" #include "data/case.h" @@ -57,6 +59,7 @@ #include "gl/minmax.h" #include "gl/unlocked-io.h" #include "gl/xalloc.h" +#include "gl/xalloc-oversized.h" #include "gl/xsize.h" #include "gettext.h" @@ -173,11 +176,21 @@ struct sfm_reader const char *encoding; /* String encoding. */ /* Decompression. */ - bool compressed; /* File is compressed? */ + enum sfm_compression compression; double bias; /* Compression bias, usually 100.0. */ uint8_t opcodes[8]; /* Current block of opcodes. */ size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */ bool corruption_warning; /* Warned about possible corruption? */ + + /* ZLIB decompression. */ + long long int ztrailer_ofs; /* Offset of ZLIB trailer at end of file. */ +#define ZIN_BUF_SIZE 4096 + uint8_t *zin_buf; /* Inflation input buffer. */ +#define ZOUT_BUF_SIZE 16384 + uint8_t *zout_buf; /* Inflation output buffer. */ + unsigned int zout_end; /* Number of bytes of data in zout_buf. */ + unsigned int zout_pos; /* First unconsumed byte in zout_buf. */ + z_stream zstream; /* ZLIB inflater. */ }; static const struct casereader_class sys_file_casereader_class; @@ -200,10 +213,19 @@ static void sys_error (struct sfm_reader *, off_t, const char *, ...) static void read_bytes (struct sfm_reader *, void *, size_t); static bool try_read_bytes (struct sfm_reader *, void *, size_t); static int read_int (struct sfm_reader *); -static double read_float (struct sfm_reader *); +static long long int read_int64 (struct sfm_reader *); static void read_string (struct sfm_reader *, char *, size_t); static void skip_bytes (struct sfm_reader *, size_t); +/* ZLIB compressed data handling. 
*/ +static void read_zheader (struct sfm_reader *); +static void open_zstream (struct sfm_reader *); +static void close_zstream (struct sfm_reader *); +static bool read_bytes_zlib (struct sfm_reader *, void *, size_t); +static void read_compressed_bytes (struct sfm_reader *, void *, size_t); +static bool try_read_compressed_bytes (struct sfm_reader *, void *, size_t); +static double read_compressed_float (struct sfm_reader *); + static char *fix_line_ends (const char *); static int parse_int (struct sfm_reader *, const void *data, size_t ofs); @@ -367,6 +389,7 @@ sfm_open_reader (struct file_handle *fh, const char *volatile encoding, r->error = false; r->opcode_idx = sizeof r->opcodes; r->corruption_warning = false; + r->zin_buf = r->zout_buf = NULL; info = infop ? infop : xmalloc (sizeof *info); memset (info, 0, sizeof *info); @@ -472,6 +495,9 @@ sfm_open_reader (struct file_handle *fh, const char *volatile encoding, } } + if (r->compression == SFM_COMP_ZLIB) + read_zheader (r); + /* Now actually parse what we read. First, figure out the correct character encoding, because this determines @@ -646,7 +672,9 @@ sfm_detect (FILE *file) return false; magic[4] = '\0'; - return !strcmp (ASCII_MAGIC, magic) || !strcmp (EBCDIC_MAGIC, magic); + return (!strcmp (ASCII_MAGIC, magic) + || !strcmp (ASCII_ZMAGIC, magic) + || !strcmp (EBCDIC_MAGIC, magic)); } /* Reads the global header of the system file. 
Initializes *HEADER and *INFO, @@ -658,12 +686,18 @@ read_header (struct sfm_reader *r, struct sfm_read_info *info, { uint8_t raw_layout_code[4]; uint8_t raw_bias[8]; + int compressed; + bool zmagic; read_string (r, header->magic, sizeof header->magic); read_string (r, header->eye_catcher, sizeof header->eye_catcher); - if (strcmp (ASCII_MAGIC, header->magic) - && strcmp (EBCDIC_MAGIC, header->magic)) + if (!strcmp (ASCII_MAGIC, header->magic) + || !strcmp (EBCDIC_MAGIC, header->magic)) + zmagic = false; + else if (!strcmp (ASCII_ZMAGIC, header->magic)) + zmagic = true; + else sys_error (r, 0, _("This is not an SPSS system file.")); /* Identify integer format. */ @@ -681,7 +715,25 @@ read_header (struct sfm_reader *r, struct sfm_read_info *info, || header->nominal_case_size > INT_MAX / 16) header->nominal_case_size = -1; - r->compressed = read_int (r) != 0; + compressed = read_int (r); + if (!zmagic) + { + if (compressed == 0) + r->compression = SFM_COMP_NONE; + else if (compressed == 1) + r->compression = SFM_COMP_SIMPLE; + else if (compressed != 0) + sys_error (r, 0, "System file header has invalid compression " + "value %d.", compressed); + } + else + { + if (compressed == 2) + r->compression = SFM_COMP_ZLIB; + else + sys_error (r, 0, "ZLIB-compressed system file header has invalid " + "compression value %d.", compressed); + } header->weight_idx = read_int (r); @@ -723,7 +775,7 @@ read_header (struct sfm_reader *r, struct sfm_read_info *info, info->integer_format = r->integer_format; info->float_format = r->float_format; - info->compressed = r->compressed; + info->compression = r->compression; info->case_cnt = r->case_cnt; } @@ -2289,7 +2341,7 @@ read_error (struct casereader *r, const struct sfm_reader *sfm) static bool read_case_number (struct sfm_reader *r, double *d) { - if (!r->compressed) + if (r->compression == SFM_COMP_NONE) { uint8_t number[8]; if (!try_read_bytes (r, number, sizeof number)) @@ -2339,13 +2391,13 @@ read_case_string (struct sfm_reader 
*r, uint8_t *s, size_t length) static int read_opcode (struct sfm_reader *r) { - assert (r->compressed); + assert (r->compression != SFM_COMP_NONE); for (;;) { int opcode; if (r->opcode_idx >= sizeof r->opcodes) { - if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes)) + if (!try_read_compressed_bytes (r, r->opcodes, sizeof r->opcodes)) return -1; r->opcode_idx = 0; } @@ -2370,7 +2422,7 @@ read_compressed_number (struct sfm_reader *r, double *d) return false; case 253: - *d = read_float (r); + *d = read_compressed_float (r); break; case 254: @@ -2411,7 +2463,7 @@ read_compressed_string (struct sfm_reader *r, uint8_t *dst) return false; case 253: - read_bytes (r, dst, 8); + read_compressed_bytes (r, dst, 8); break; case 254: @@ -2453,7 +2505,7 @@ static bool read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length) { assert (length % 8 == 0); - if (!r->compressed) + if (r->compression == SFM_COMP_NONE) return try_read_bytes (r, s, length); else { @@ -2820,14 +2872,14 @@ read_int (struct sfm_reader *r) return integer_get (r->integer_format, integer, sizeof integer); } -/* Reads a 64-bit floating-point number from R and returns its - value in host format. */ -static double -read_float (struct sfm_reader *r) +/* Reads a 64-bit signed integer from R and returns its value in + host format. */ +static long long int +read_int64 (struct sfm_reader *r) { - uint8_t number[8]; - read_bytes (r, number, sizeof number); - return float_get_double (r->float_format, number); + uint8_t integer[8]; + read_bytes (r, integer, sizeof integer); + return integer_get (r->integer_format, integer, sizeof integer); } static int @@ -2894,6 +2946,308 @@ fix_line_ends (const char *s) return dst; } +static void +read_ztrailer (struct sfm_reader *r, + long long int zheader_ofs, + long long int ztrailer_len); + +static void * +zalloc (voidpf pool_, uInt items, uInt size) +{ + struct pool *pool = pool_; + + return (!size || xalloc_oversized (items, size) + ? 
Z_NULL + : pool_malloc (pool, items * size)); +} + +static void +zfree (voidpf pool_, voidpf address) +{ + struct pool *pool = pool_; + + pool_free (pool, address); +} + +static void +read_zheader (struct sfm_reader *r) +{ + off_t pos = r->pos; + long long int zheader_ofs = read_int64 (r); + long long int ztrailer_ofs = read_int64 (r); + long long int ztrailer_len = read_int64 (r); + + if (zheader_ofs != pos) + sys_error (r, pos, _("Wrong ZLIB data header offset %#llx " + "(expected %#llx)."), + zheader_ofs, (long long int) pos); + + if (ztrailer_ofs < r->pos) + sys_error (r, pos, _("Impossible ZLIB trailer offset 0x%llx."), + ztrailer_ofs); + + if (ztrailer_len < 24 || ztrailer_len % 24) + sys_error (r, pos, _("Invalid ZLIB trailer length %lld."), ztrailer_len); + + r->ztrailer_ofs = ztrailer_ofs; + read_ztrailer (r, zheader_ofs, ztrailer_len); + + if (r->zin_buf == NULL) + { + r->zin_buf = pool_malloc (r->pool, ZIN_BUF_SIZE); + r->zout_buf = pool_malloc (r->pool, ZOUT_BUF_SIZE); + r->zstream.next_in = NULL; + r->zstream.avail_in = 0; + } + + r->zstream.zalloc = zalloc; + r->zstream.zfree = zfree; + r->zstream.opaque = r->pool; + + open_zstream (r); +} + +static void +seek (struct sfm_reader *r, off_t offset) +{ + if (fseeko (r->file, offset, SEEK_SET)) + sys_error (r, 0, _("%s: seek failed (%s)."), + fh_get_file_name (r->fh), strerror (errno)); + r->pos = offset; +} + +/* Performs some additional consistency checks on the ZLIB compressed data + trailer. 
*/ +static void +read_ztrailer (struct sfm_reader *r, + long long int zheader_ofs, + long long int ztrailer_len) +{ + long long int expected_uncmp_ofs; + long long int expected_cmp_ofs; + long long int bias; + long long int zero; + unsigned int block_size; + unsigned int n_blocks; + unsigned int i; + struct stat s; + + if (fstat (fileno (r->file), &s)) + sys_error (ME, 0, _("%s: stat failed (%s)."), + fh_get_file_name (r->fh), strerror (errno)); + + if (!S_ISREG (s.st_mode)) + { + /* We can't seek to the trailer and then back to the data in this file, + so skip doing extra checks. */ + return; + } + + if (r->ztrailer_ofs + ztrailer_len != s.st_size) + sys_warn (r, r->pos, + _("End of ZLIB trailer (0x%llx) is not file size (0x%llx)."), + r->ztrailer_ofs + ztrailer_len, (long long int) s.st_size); + + seek (r, r->ztrailer_ofs); + + /* Read fixed header from ZLIB data trailer. */ + bias = read_int64 (r); + if (-bias != r->bias) + sys_error (r, r->pos, _("ZLIB trailer bias (%lld) differs from " + "file header bias (%.2f)."), + -bias, r->bias); + + zero = read_int64 (r); + if (zero != 0) + sys_warn (r, r->pos, + _("ZLIB trailer \"zero\" field has nonzero value %lld."), zero); + + block_size = read_int (r); + if (block_size != ZBLOCK_SIZE) + sys_warn (r, r->pos, + _("ZLIB trailer specifies unexpected %u-byte block size."), + block_size); + + n_blocks = read_int (r); + if (n_blocks != (ztrailer_len - 24) / 24) + sys_error (r, r->pos, + _("%lld-byte ZLIB trailer specifies %u data blocks (expected " + "%lld)."), + ztrailer_len, n_blocks, (ztrailer_len - 24) / 24); + + expected_uncmp_ofs = zheader_ofs; + expected_cmp_ofs = zheader_ofs + 24; + for (i = 0; i < n_blocks; i++) + { + off_t desc_ofs = r->pos; + unsigned long long int uncompressed_ofs = read_int64 (r); + unsigned long long int compressed_ofs = read_int64 (r); + unsigned int uncompressed_size = read_int (r); + unsigned int compressed_size = read_int (r); + + if (uncompressed_ofs != expected_uncmp_ofs) + sys_error 
(r, desc_ofs, + _("ZLIB block descriptor %u reported uncompressed data " + "offset %#llx, when %#llx was expected."), + i, uncompressed_ofs, expected_uncmp_ofs); + + if (compressed_ofs != expected_cmp_ofs) + sys_error (r, desc_ofs, + _("ZLIB block descriptor %u reported compressed data " + "offset %#llx, when %#llx was expected."), + i, compressed_ofs, expected_cmp_ofs); + + if (i < n_blocks - 1) + { + if (uncompressed_size != block_size) + sys_warn (r, desc_ofs, + _("ZLIB block descriptor %u reported block size %#x, " + "when %#x was expected."), + i, uncompressed_size, block_size); + } + else + { + if (uncompressed_size > block_size) + sys_warn (r, desc_ofs, + _("ZLIB block descriptor %u reported block size %#x, " + "when at most %#x was expected."), + i, uncompressed_size, block_size); + } + + /* http://www.zlib.net/zlib_tech.html says that the maximum expansion + from compression, with worst-case parameters, is 13.5% plus 11 bytes. + This code checks for an expansion of more than 14.3% plus 11 + bytes. 
*/ + if (compressed_size > uncompressed_size + uncompressed_size / 7 + 11) + sys_error (r, desc_ofs, + _("ZLIB block descriptor %u reports compressed size %u " + "and uncompressed size %u."), + i, compressed_size, uncompressed_size); + + expected_uncmp_ofs += uncompressed_size; + expected_cmp_ofs += compressed_size; + } + + if (expected_cmp_ofs != r->ztrailer_ofs) + sys_error (r, r->pos, _("ZLIB trailer is at offset %#llx but %#llx " + "would be expected from block descriptors."), + r->ztrailer_ofs, expected_cmp_ofs); + + seek (r, zheader_ofs + 24); +} + +static void +open_zstream (struct sfm_reader *r) +{ + int error; + + r->zout_pos = r->zout_end = 0; + error = inflateInit (&r->zstream); + if (error != Z_OK) + sys_error (r, r->pos, _("ZLIB initialization failed (%s)."), + r->zstream.msg); +} + +static void +close_zstream (struct sfm_reader *r) +{ + int error; + + error = inflateEnd (&r->zstream); + if (error != Z_OK) + sys_error (r, r->pos, _("Inconsistency at end of ZLIB stream (%s)."), + r->zstream.msg); +} + +static bool +read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t byte_cnt) +{ + uint8_t *buf = buf_; + + if (byte_cnt == 0) + return true; + + for (;;) + { + int error; + + /* Use already inflated data if there is any. */ + if (r->zout_pos < r->zout_end) + { + unsigned int n = MIN (byte_cnt, r->zout_end - r->zout_pos); + memcpy (buf, &r->zout_buf[r->zout_pos], n); + r->zout_pos += n; + byte_cnt -= n; + buf += n; + + if (byte_cnt == 0) + return true; + } + + /* We need to inflate some more data. + Get some more input data if we don't have any. */ + if (r->zstream.avail_in == 0) + { + unsigned int n = MIN (ZIN_BUF_SIZE, r->ztrailer_ofs - r->pos); + if (n == 0 || !try_read_bytes (r, r->zin_buf, n)) + return false; + r->zstream.avail_in = n; + r->zstream.next_in = r->zin_buf; + } + + /* Inflate the (remaining) input data. 
*/ + r->zstream.avail_out = ZOUT_BUF_SIZE; + r->zstream.next_out = r->zout_buf; + error = inflate (&r->zstream, Z_SYNC_FLUSH); + r->zout_pos = 0; + r->zout_end = r->zstream.next_out - r->zout_buf; + if (r->zout_end == 0) + { + if (error == Z_STREAM_END) + { + close_zstream (r); + open_zstream (r); + } + else + sys_error (r, r->pos, _("ZLIB stream inconsistency (%s)."), + r->zstream.msg); + } + else + { + /* Process the output data and ignore 'error' for now. ZLIB will + present it to us again on the next inflate() call. */ + } + } +} + +static void +read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) +{ + if (r->compression == SFM_COMP_SIMPLE) + return read_bytes (r, buf, byte_cnt); + else if (!read_bytes_zlib (r, buf, byte_cnt)) + sys_error (r, r->pos, _("Unexpected end of ZLIB compressed data.")); +} + +static bool +try_read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) +{ + if (r->compression == SFM_COMP_SIMPLE) + return try_read_bytes (r, buf, byte_cnt); + else + return read_bytes_zlib (r, buf, byte_cnt); +} + +/* Reads a 64-bit floating-point number from R and returns its + value in host format. */ +static double +read_compressed_float (struct sfm_reader *r) +{ + uint8_t number[8]; + read_compressed_bytes (r, number, sizeof number); + return float_get_double (r->float_format, number); +} + static const struct casereader_class sys_file_casereader_class = { sys_file_casereader_read, diff --git a/src/data/sys-file-reader.h b/src/data/sys-file-reader.h index 037d33a394..011d541954 100644 --- a/src/data/sys-file-reader.h +++ b/src/data/sys-file-reader.h @@ -21,6 +21,7 @@ #include #include "data/case.h" +#include "data/sys-file.h" #include "libpspp/float-format.h" #include "libpspp/integer-format.h" @@ -36,7 +37,7 @@ struct sfm_read_info char *creation_time; /* "hh:mm:ss". */ enum integer_format integer_format; enum float_format float_format; - bool compressed; /* 0=no, 1=yes. 
*/ + enum sfm_compression compression; casenumber case_cnt; /* -1 if unknown. */ char *product; /* Product name. */ char *product_ext; /* Extra product info. */ diff --git a/src/data/sys-file-writer.c b/src/data/sys-file-writer.c index c78e04d55f..8cfd577f1a 100644 --- a/src/data/sys-file-writer.c +++ b/src/data/sys-file-writer.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "data/attributes.h" #include "data/case.h" @@ -72,11 +73,11 @@ struct sfm_writer FILE *file; /* File stream. */ struct replace_file *rf; /* Ticket for replacing output file. */ - bool compress; /* 1=compressed, 0=not compressed. */ + enum sfm_compression compression; casenumber case_cnt; /* Number of cases written so far. */ uint8_t space; /* ' ' in the file's character encoding. */ - /* Compression buffering. + /* Simple compression buffering. Compressed data is output as a series of 8-byte elements, with 1 to 9 such elements clustered together. The first element in a cluster is 8 @@ -89,6 +90,12 @@ struct sfm_writer int n_opcodes; /* Number of opcodes in cbuf[0] so far. */ int n_elements; /* Number of elements in cbuf[] so far. */ + /* ZLIB compression. */ + z_stream zstream; /* ZLIB deflater. */ + off_t zstart; + struct zblock *blocks; + size_t n_blocks, allocated_blocks; + /* Variables. */ struct sfm_var *sfm_vars; /* Variables. */ size_t sfm_var_cnt; /* Number of variables. */ @@ -96,6 +103,12 @@ struct sfm_writer for long string variables. 
*/ }; +struct zblock + { + unsigned int uncompressed_size; + unsigned int compressed_size; + }; + static const struct casewriter_class sys_file_casewriter_class; static void write_header (struct sfm_writer *, const struct dictionary *); @@ -134,6 +147,7 @@ static void write_variable_attributes (struct sfm_writer *, const struct dictionary *); static void write_int (struct sfm_writer *, int32_t); +static void write_int64 (struct sfm_writer *, int64_t); static inline void convert_double_to_output_format (double, uint8_t[8]); static void write_float (struct sfm_writer *, double); static void write_string (struct sfm_writer *, const char *, size_t); @@ -156,6 +170,10 @@ static void put_cmp_opcode (struct sfm_writer *, uint8_t); static void put_cmp_number (struct sfm_writer *, double); static void put_cmp_string (struct sfm_writer *, const void *, size_t); +static bool start_zstream (struct sfm_writer *); +static void finish_zstream (struct sfm_writer *); +static void write_ztrailer (struct sfm_writer *); + static bool write_error (const struct sfm_writer *); static bool close_writer (struct sfm_writer *); @@ -164,8 +182,10 @@ struct sfm_write_options sfm_writer_default_options (void) { struct sfm_write_options opts; + opts.compression = (settings_get_scompression () + ? SFM_COMP_SIMPLE + : SFM_COMP_NONE); opts.create_writeable = true; - opts.compress = settings_get_scompression (); opts.version = 3; return opts; } @@ -194,13 +214,20 @@ sfm_open_writer (struct file_handle *fh, struct dictionary *d, } /* Create and initialize writer. */ - w = xmalloc (sizeof *w); + w = xzalloc (sizeof *w); w->fh = fh_ref (fh); w->lock = NULL; w->file = NULL; w->rf = NULL; - w->compress = opts.compress; + /* Use the requested compression, except that no EBCDIC-based ZLIB compressed + files have been observed, so drop back to simple compression for those + files. 
*/ + w->compression = opts.compression; + if (w->compression == SFM_COMP_ZLIB + && is_encoding_ebcdic_compatible (dict_get_encoding (d))) + w->compression = SFM_COMP_SIMPLE; + w->case_cnt = 0; w->n_opcodes = w->n_elements = 0; @@ -279,6 +306,20 @@ sfm_open_writer (struct file_handle *fh, struct dictionary *d, write_int (w, 999); write_int (w, 0); + if (w->compression == SFM_COMP_ZLIB) + { + w->zstream.zalloc = Z_NULL; + w->zstream.zfree = Z_NULL; + w->zstream.opaque = Z_NULL; + w->zstart = ftello (w->file); + + write_int64 (w, w->zstart); + write_int64 (w, 0); + write_int64 (w, 0); + + start_zstream (w); + } + if (write_error (w)) goto error; @@ -336,6 +377,8 @@ write_header (struct sfm_writer *w, const struct dictionary *d) /* Record-type code. */ if (is_encoding_ebcdic_compatible (dict_encoding)) write_string (w, EBCDIC_MAGIC, 4); + else if (w->compression == SFM_COMP_ZLIB) + write_string (w, ASCII_ZMAGIC, 4); else write_string (w, ASCII_MAGIC, 4); @@ -351,7 +394,9 @@ write_header (struct sfm_writer *w, const struct dictionary *d) write_int (w, calc_oct_idx (d, NULL)); /* Compressed? */ - write_int (w, w->compress); + write_int (w, (w->compression == SFM_COMP_NONE ? 0 + : w->compression == SFM_COMP_SIMPLE ? 1 + : 2)); /* Weight variable. */ weight = dict_get_weight (d); @@ -1171,7 +1216,7 @@ sys_file_casewriter_write (struct casewriter *writer, void *w_, w->case_cnt++; - if (!w->compress) + if (w->compression == SFM_COMP_NONE) write_case_uncompressed (w, c); else write_case_compressed (w, c); @@ -1210,6 +1255,11 @@ close_writer (struct sfm_writer *w) { /* Flush buffer. 
*/ flush_compressed (w); + if (w->compression == SFM_COMP_ZLIB) + { + finish_zstream (w); + write_ztrailer (w); + } fflush (w->file); ok = !write_error (w); @@ -1234,6 +1284,8 @@ close_writer (struct sfm_writer *w) ok = false; } + free (w->blocks); + fh_unlock (w->lock); fh_unref (w->fh); @@ -1324,13 +1376,155 @@ write_case_compressed (struct sfm_writer *w, const struct ccase *c) } } +/* Initializes W's deflater for a new ZLIB block. Returns true if + successful, false (after reporting an error) otherwise. */ +static bool +start_zstream (struct sfm_writer *w) +{ + int error; + + error = deflateInit (&w->zstream, 1); + if (error != Z_OK) + { + msg (ME, _("Failed to initialize ZLIB for compression (%s)."), + w->zstream.msg); + return false; + } + return true; +} + +/* Completes the current ZLIB block: flushes the deflater's remaining output + to W's file, records the block's sizes for the trailer, and frees the + deflater. */ +static void +finish_zstream (struct sfm_writer *w) +{ + struct zblock *block; + int error; + + assert (w->zstream.total_in <= ZBLOCK_SIZE); + + w->zstream.next_in = NULL; + w->zstream.avail_in = 0; + do + { + uint8_t buf[4096]; + + w->zstream.next_out = buf; + w->zstream.avail_out = sizeof buf; + error = deflate (&w->zstream, Z_FINISH); + write_bytes (w, buf, w->zstream.next_out - buf); + } + while (error == Z_OK); + + if (error != Z_STREAM_END) + msg (ME, _("Failed to complete ZLIB stream compression (%s)."), + w->zstream.msg); + + if (w->n_blocks >= w->allocated_blocks) + w->blocks = x2nrealloc (w->blocks, &w->allocated_blocks, + sizeof *w->blocks); + block = &w->blocks[w->n_blocks++]; + block->uncompressed_size = w->zstream.total_in; + block->compressed_size = w->zstream.total_out; + /* Free the deflater's state; start_zstream() creates a fresh one. */ + deflateEnd (&w->zstream); +} + +/* Compresses the N bytes in DATA_ and writes the result to W's file, + starting a new ZLIB block whenever the current one has consumed + ZBLOCK_SIZE bytes of input. */ +static void +write_zlib (struct sfm_writer *w, const void *data_, unsigned int n) +{ + const uint8_t *data = data_; + + while (n > 0) + { + unsigned int chunk; + + if (w->zstream.total_in >= ZBLOCK_SIZE) + { + finish_zstream (w); + start_zstream (w); + } + + chunk = MIN (n, ZBLOCK_SIZE - w->zstream.total_in); + + w->zstream.next_in = CONST_CAST (uint8_t *, data); + w->zstream.avail_in = chunk; + do + { + uint8_t buf[4096]; + int error; + + w->zstream.next_out = buf; + w->zstream.avail_out = sizeof buf; + error = deflate (&w->zstream, 
Z_NO_FLUSH); + write_bytes (w, buf, w->zstream.next_out - buf); + if (error != Z_OK) + { + msg (ME, _("ZLIB stream compression failed (%s)."), + w->zstream.msg); + return; + } + } + while (w->zstream.avail_in > 0 || w->zstream.avail_out == 0); + data += chunk; + n -= chunk; + } +} + +/* Writes the ZLIB data trailer (a fixed header followed by one descriptor + per block), then seeks back to fill in the trailer's offset and length in + the ZLIB data header. */ +static void +write_ztrailer (struct sfm_writer *w) +{ + long long int uncompressed_ofs; + long long int compressed_ofs; + const struct zblock *block; + + write_int64 (w, -COMPRESSION_BIAS); + write_int64 (w, 0); + write_int (w, ZBLOCK_SIZE); + write_int (w, w->n_blocks); + + uncompressed_ofs = w->zstart; + compressed_ofs = w->zstart + 24; + for (block = w->blocks; block < &w->blocks[w->n_blocks]; block++) + { + write_int64 (w, uncompressed_ofs); + write_int64 (w, compressed_ofs); + write_int (w, block->uncompressed_size); + write_int (w, block->compressed_size); + + uncompressed_ofs += block->uncompressed_size; + compressed_ofs += block->compressed_size; + } + + if (!fseeko (w->file, w->zstart + 8, SEEK_SET)) + { + write_int64 (w, compressed_ofs); + write_int64 (w, 24 + (w->n_blocks * 24)); + } + else + msg (ME, _("%s: Seek failed (%s)."), + fh_get_file_name (w->fh), strerror (errno)); +} + /* Flushes buffered compressed opcodes and data to W. */ static void flush_compressed (struct sfm_writer *w) { if (w->n_opcodes) { - write_bytes (w, w->cbuf, 8 * (1 + w->n_elements)); + unsigned int n = 8 * (1 + w->n_elements); + if (w->compression == SFM_COMP_SIMPLE) + write_bytes (w, w->cbuf, n); + else + write_zlib (w, w->cbuf, n); + w->n_opcodes = w->n_elements = 0; memset (w->cbuf[0], 0, 8); } @@ -1376,6 +1570,13 @@ write_int (struct sfm_writer *w, int32_t x) write_bytes (w, &x, sizeof x); } +/* Writes 64-bit integer X to the output file for writer W. */ +static void +write_int64 (struct sfm_writer *w, int64_t x) +{ + write_bytes (w, &x, sizeof x); +} + /* Converts NATIVE to the 64-bit format used in output files in OUTPUT. 
*/ static inline void diff --git a/src/data/sys-file-writer.h b/src/data/sys-file-writer.h index fdff49fe52..4f233f3197 100644 --- a/src/data/sys-file-writer.h +++ b/src/data/sys-file-writer.h @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2009 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2009, 2013 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,14 +18,15 @@ #define SFM_WRITE_H 1 #include +#include "sys-file.h" /* Writing system files. */ /* Options for creating a system file. */ struct sfm_write_options { + enum sfm_compression compression; bool create_writeable; /* File perms: writeable or read/only? */ - bool compress; /* Compress file? */ int version; /* System file version (currently 2 or 3). */ }; diff --git a/src/data/sys-file.h b/src/data/sys-file.h new file mode 100644 index 0000000000..7a582c05f4 --- /dev/null +++ b/src/data/sys-file.h @@ -0,0 +1,28 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2013 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#ifndef SYS_FILE_H +#define SYS_FILE_H 1 + +/* System file compression format. */ +enum sfm_compression + { + SFM_COMP_NONE, /* No compression. */ + SFM_COMP_SIMPLE, /* Bytecode compression of integer values. 
*/ + SFM_COMP_ZLIB /* ZLIB "deflate" compression. */ + }; + +#endif /* sys-file.h */ diff --git a/src/language/data-io/save.c b/src/language/data-io/save.c index e01a8c941e..7f1347db98 100644 --- a/src/language/data-io/save.c +++ b/src/language/data-io/save.c @@ -234,10 +234,13 @@ parse_write_command (struct lexer *lexer, struct dataset *ds, } else if (writer_type == SYSFILE_WRITER && lex_match_id (lexer, "COMPRESSED")) - sysfile_opts.compress = true; + sysfile_opts.compression = SFM_COMP_SIMPLE; else if (writer_type == SYSFILE_WRITER && lex_match_id (lexer, "UNCOMPRESSED")) - sysfile_opts.compress = false; + sysfile_opts.compression = SFM_COMP_NONE; + else if (writer_type == SYSFILE_WRITER + && lex_match_id (lexer, "ZCOMPRESSED")) + sysfile_opts.compression = SFM_COMP_ZLIB; else if (writer_type == SYSFILE_WRITER && lex_match_id (lexer, "VERSION")) { diff --git a/src/language/dictionary/sys-file-info.c b/src/language/dictionary/sys-file-info.c index 3327a2c4ca..c7f326f3be 100644 --- a/src/language/dictionary/sys-file-info.c +++ b/src/language/dictionary/sys-file-info.c @@ -150,10 +150,11 @@ cmd_sysfile_info (struct lexer *lexer, struct dataset *ds UNUSED) ? var_get_name (weight_var) : _("Not weighted."))); } - tab_text (t, 0, r, TAB_LEFT, _("Mode:")); + tab_text (t, 0, r, TAB_LEFT, _("Compression:")); tab_text_format (t, 1, r++, TAB_LEFT, - _("Compression %s."), info.compressed ? _("on") : _("off")); - + info.compression == SFM_COMP_NONE ? _("None") + : info.compression == SFM_COMP_SIMPLE ? "SAV" + : "ZSAV"); tab_text (t, 0, r, TAB_LEFT, _("Charset:")); tab_text (t, 1, r++, TAB_LEFT, dict_get_encoding (d)); diff --git a/tests/data/sack.c b/tests/data/sack.c index ace153cffd..0326b1370d 100644 --- a/tests/data/sack.c +++ b/tests/data/sack.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2011 Free Software Foundation, Inc. + Copyright (C) 2011, 2013 Free Software Foundation, Inc. 
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -31,6 +31,7 @@ #include "libpspp/float-format.h" #include "libpspp/integer-format.h" +#include "gl/c-ctype.h" #include "gl/error.h" #include "gl/md5.h" #include "gl/intprops.h" @@ -58,12 +59,14 @@ enum token_type T_LPAREN, T_RPAREN, T_I8, + T_I64, T_S, - T_COUNT + T_COUNT, + T_HEX }; static enum token_type token; -static unsigned long int tok_integer; +static unsigned long long int tok_integer; static double tok_float; static char *tok_string; static size_t tok_strlen, tok_allocated; @@ -92,12 +95,19 @@ fatal (const char *message, ...) } static void -add_char (int c) +add_char__ (int c) { if (tok_strlen >= tok_allocated) tok_string = x2realloc (tok_string, &tok_allocated); - tok_string[tok_strlen++] = c; + tok_string[tok_strlen] = c; +} + +static void +add_char (int c) +{ + add_char__ (c); + tok_strlen++; } static void @@ -135,14 +145,14 @@ get_token (void) c = getc (input); } while (isdigit (c) || isalpha (c) || c == '.'); - add_char ('\0'); + add_char__ ('\0'); ungetc (c, input); errno = 0; if (strchr (tok_string, '.') == NULL) { token = T_INTEGER; - tok_integer = strtoul (tok_string, &tail, 0); + tok_integer = strtoull (tok_string, &tail, 0); } else { @@ -161,6 +171,7 @@ get_token (void) fatal ("new-line inside string"); add_char (c); } + add_char__ ('\0'); } else if (c == ';') token = T_SEMICOLON; @@ -183,6 +194,8 @@ get_token (void) if (!strcmp (tok_string, "i8")) token = T_I8; + else if (!strcmp (tok_string, "i64")) + token = T_I64; else if (tok_string[0] == 's') { token = T_S; @@ -210,6 +223,8 @@ get_token (void) } else if (!strcmp (tok_string, "COUNT")) token = T_COUNT; + else if (!strcmp (tok_string, "hex")) + token = T_HEX; else fatal ("invalid token `%s'", tok_string); } @@ -235,6 +250,17 @@ buffer_put_uninit (struct buffer *buffer, size_t n) return &buffer->data[buffer->size - n]; } +/* Returns the integer value of 
hex digit C. */ +static int +hexit_value (int c) +{ + const char s[] = "0123456789abcdef"; + const char *cp = strchr (s, c_tolower ((unsigned char) c)); + + assert (cp != NULL); + return cp - s; +} + static void usage (void) { @@ -265,6 +291,9 @@ stdout. A data item is one of the following\n\ \n\ - The literal \"i8\" followed by an integer. Output as a single\n\ byte with the specified value.\n\ +\n\ + - The literal \"i64\" followed by an integer. Output as a 64-bit\n\ + binary integer.\n\ \n\ - One of the literals SYSMIS, LOWEST, or HIGHEST. Output as a\n\ 64-bit IEEE 754 float of the appropriate PSPP value.\n\ @@ -378,6 +407,19 @@ parse_data_item (struct buffer *output) } while (token == T_INTEGER); } + else if (token == T_I64) + { + get_token (); + do + { + if (token != T_INTEGER) + fatal ("integer expected after `i64'"); + integer_put (tok_integer, integer_format, + buffer_put_uninit (output, 8), 8); + get_token (); + } + while (token == T_INTEGER); + } else if (token == T_STRING) { buffer_put (output, tok_string, tok_strlen); @@ -426,6 +468,33 @@ parse_data_item (struct buffer *output) integer_put (output->size - old_size - 4, integer_format, output->data + old_size, 4); } + else if (token == T_HEX) + { + const char *p; + + get_token (); + + if (token != T_STRING) + fatal ("string expected"); + + for (p = tok_string; *p; p++) + { + if (isspace ((unsigned char) *p)) + continue; + else if (isxdigit ((unsigned char) p[0]) + && isxdigit ((unsigned char) p[1])) + { + int high = hexit_value (p[0]); + int low = hexit_value (p[1]); + uint8_t byte = high * 16 + low; + buffer_put (output, &byte, 1); + p++; + } + else + fatal ("invalid format in hex string"); + } + get_token (); + } else fatal ("syntax error"); diff --git a/tests/data/sys-file-reader.at b/tests/data/sys-file-reader.at index 2706228eb6..4b7a2b541f 100644 --- a/tests/data/sys-file-reader.at +++ b/tests/data/sys-file-reader.at @@ -1297,7 +1297,7 @@ dnl File header. 
"$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; 2; dnl Layout code 6; dnl Nominal case size -1; dnl Not compressed +1; dnl Simple compression 0; dnl Not weighted -1; dnl Unspecified number of cases. 100.0; dnl Bias. @@ -1361,7 +1361,7 @@ dnl File header. "$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; 2; dnl Layout code 6; dnl Nominal case size -1; dnl Not compressed +1; dnl Simple compression. 0; dnl Not weighted -1; dnl Unspecified number of cases. 0.0; dnl Bias. @@ -1425,7 +1425,7 @@ dnl File header. "$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; 2; dnl Layout code 6; dnl Nominal case size -1; dnl Not compressed +1; dnl Simple compression. 0; dnl Not weighted -1; dnl Unspecified number of cases. 50.0; dnl Bias. @@ -1485,6 +1485,105 @@ num1,num2,str4,str8,str15 ]) done AT_CLEANUP + +m4_divert_push([PREPARE_TESTS]) +zcompressed_sack () { + cat <<'EOF' +dnl File header. +"$FL3"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; +2; dnl Layout code +6; dnl Nominal case size +2; dnl zlib compressed +0; dnl Not weighted +-1; dnl Unspecified number of cases. +100.0; dnl Bias. +"01 Jan 11"; "20:53:52"; s64 "PSPP synthetic test file"; +i8 0 *3; + +dnl Numeric variables. +2; 0; 0; 0; 0x050800 *2; s8 "NUM1"; +2; 0; 0; 0; 0x050800 *2; s8 "NUM2"; + +dnl String variable. +2; 4; 0; 0; 0x010400 *2; s8 "STR4"; +2; 8; 0; 0; 0x010800 *2; s8 "STR8"; +2; 15; 0; 0; 0x010f00 *2; s8 "STR15"; +2; -1; 0; 0; 0; 0; s8 ""; + +dnl Dictionary termination record. +999; 0; + +dnl ZLIB data header. +i64 0x178; # zheader_ofs +i64 0x1e9; # ztrailer_ofs +i64 48; # ztrailer_len + +dnl ZLIB data block. 
+dnl +dnl This is the compressed form of: +dnl +dnl 01 64 fe fd fe fd ff fb 61 62 63 64 65 66 67 68 |.d......abcdefgh| +dnl 30 31 32 33 20 20 20 20 fd fd fd fe 65 66 fd fd |0123 ....ef..| +dnl 6a 6b 6c 6d 20 20 20 20 6e 6f 70 71 72 73 74 75 |jklm nopqrstu| +dnl 76 77 78 79 7a 41 42 43 44 45 46 47 20 20 20 20 |vwxyzABCDEFG | +dnl 48 49 4a 4b 4c 4d 4e 4f fe fd fc 00 00 00 00 00 |HIJKLMNO........| +dnl 50 51 52 53 54 55 56 57 |PQRSTUVW| +dnl +dnl which is the data from the "compressed data" test. +hex "78 01 63 4c f9 f7 f7 df df ff bf 13 93 92 53 52"; +hex "d3 d2 33 0c 0c 8d 8c 15 80 e0 ef df bf ff 52 d3"; +hex "fe fe cd ca ce c9 05 f1 f3 f2 0b 0a 8b 8a 4b 4a"; +hex "cb ca 2b 2a ab 1c 9d 9c 5d 5c dd dc 41 e2 1e 9e"; +hex "5e de 3e be 7e fe ff fe fe 61 00 81 80 c0 a0 e0"; +hex "90 d0 b0 70 00 0f 3f 23 d7"; + +dnl ZLIB data trailer fixed header: +i64 -100; # ztrailer_bias +i64 0; # ztrailer_zero +0x3ff000; # block_size +1; # n_blocks + +dnl ZLIB block descriptor: +i64 0x178; # uncompressed_ofs +i64 0x190; # compressed_ofs +88; # uncompressed_size +89; # compressed_size +EOF +} +m4_divert_pop([PREPARE_TESTS]) + +AT_SETUP([zcompressed data]) +AT_KEYWORDS([sack synthetic system file positive zlib]) +zcompressed_sack > sys-file.sack +for variant in \ + "be 2d706c3ca0cc9be7f1721f09d0d42179" \ + "le 3f362f338d65b0a836b3c752cc3fc5bc" +do + set $variant + AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2] +]) + AT_DATA([sys-file.sps], [dnl +GET FILE='sys-file.sav'. +DISPLAY DICTIONARY. +LIST. 
+]) + AT_CHECK([pspp -o pspp.csv sys-file.sps]) + AT_CHECK([grep -v Measure pspp.csv | grep -v Display], [0], [dnl +Variable,Description,,Position +num1,Format: F8.0,,1 +num2,Format: F8.0,,2 +str4,Format: A4,,3 +str8,Format: A8,,4 +str15,Format: A15,,5 + +Table: Data List +num1,num2,str4,str8,str15 +-99,0,,abcdefgh,0123 @&t@ +.,151,jklm,nopqrstu,vwxyzABC @&t@ +1,2,DEFG,HIJKLMNO,PQRSTUV +]) +done +AT_CLEANUP AT_BANNER([system file reader - negative]) @@ -3553,3 +3652,296 @@ num1,num2,str4,str8,str15 done AT_CLEANUP +AT_SETUP([zcompressed data - bad zheader_ofs]) +AT_KEYWORDS([sack synthetic system file negative zlib]) +zcompressed_sack | sed 's/.*zheader_ofs.*/>>i64 0<<;/' > sys-file.sack +for variant in \ + "be 6d5c32f34fa1bed6f9b8f7045d104fdc" \ + "le 1f67fbda4f0021143e141fe8403c5a97" +do + set $variant + AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2] +]) + AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'. +]) + AT_CHECK([pspp -o pspp.csv sys-file.sps], [1], [error: `sys-file.sav' near offset 0x178: Wrong ZLIB data header offset 0 (expected 0x178). +]) +done +AT_CLEANUP + +AT_SETUP([zcompressed data - bad ztrailer_ofs]) +AT_KEYWORDS([sack synthetic system file negative zlib]) +zcompressed_sack | sed 's/.*ztrailer_ofs.*/>>i64 0<<;/' > sys-file.sack +for variant in \ + "be e2c8dec0c62d3d798825ad5906370634" \ + "le c1cff4cdddeee80bf1580cbc26fa9fd5" +do + set $variant + AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2] +]) + AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'. +]) + AT_CHECK([pspp -o pspp.csv sys-file.sps], [1], [error: `sys-file.sav' near offset 0x178: Impossible ZLIB trailer offset 0x0. +]) +done +AT_CLEANUP + +# ztrailer_len must be a multiple of 24 and at least 48, +# so a value of 12 is impossible. 
+AT_SETUP([zcompressed data - invalid ztrailer_len]) +AT_KEYWORDS([sack synthetic system file negative zlib]) +zcompressed_sack | sed 's/.*ztrailer_len.*/>>i64 12<<;/' > sys-file.sack +for variant in \ + "be 27f5203463bc4c7644382f24ae87f84c" \ + "le 0035fa6ee7690720429715150ede85f4" +do + set $variant + AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2] +]) + AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'. +]) + AT_CHECK([pspp -o pspp.csv sys-file.sps], [1], [error: `sys-file.sav' near offset 0x178: Invalid ZLIB trailer length 12. +]) +done +AT_CLEANUP + +# ztrailer_ofs + ztrailer_len must be the file size. +AT_SETUP([zcompressed data - wrong ztrailer_len]) +AT_KEYWORDS([sack synthetic system file negative zlib]) +zcompressed_sack | sed 's/.*ztrailer_len.*/>>i64 72<<;/' > sys-file.sack +for variant in \ + "be 2ba9ae97bc0a7f5dcfe36e2463b9d7cb" \ + "le d737ea0a53ca5c6f20be359027171d73" +do + set $variant + AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2] +]) + AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'. +]) + AT_CHECK([pspp -o pspp.csv sys-file.sps], [1], [warning: `sys-file.sav' near offset 0x190: End of ZLIB trailer (0x231) is not file size (0x219). +error: `sys-file.sav' near offset 0x201: 72-byte ZLIB trailer specifies 1 data blocks (expected 2). +]) +done +AT_CLEANUP + +AT_SETUP([zcompressed data - wrong ztrailer_bias]) +AT_KEYWORDS([sack synthetic system file negative zlib]) +zcompressed_sack | sed 's/.*ztrailer_bias.*/>>i64 0<<;/' > sys-file.sack +for variant in \ + "be a5b56ab5e799a3626de2cdd7bd8d7a03" \ + "le d7cd584c6d5a95df10ba640eb3f1f24f" +do + set $variant + AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2] +]) + AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'. +]) + AT_CHECK([pspp -o pspp.csv sys-file.sps], [1], [error: `sys-file.sav' near offset 0x1f1: ZLIB trailer bias (0) differs from file header bias (100.00). 
+]) +done +AT_CLEANUP + +AT_SETUP([zcompressed data - wrong ztrailer_zero]) +AT_KEYWORDS([sack synthetic system file negative zlib]) +zcompressed_sack | sed 's/.*ztrailer_zero.*/>>i64 100<<;/' > sys-file.sack +for variant in \ + "be 8d746abedb3e74cfdc22207f3455db92" \ + "le 79cea017365cab35d59c7a300cfa66c1" +do + set $variant + AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2] +]) + AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'. +]) + AT_CHECK([pspp -o pspp.csv sys-file.sps], [0], [warning: `sys-file.sav' near offset 0x1f9: ZLIB trailer "zero" field has nonzero value 100. +]) +done +AT_CLEANUP + +AT_SETUP([zcompressed data - wrong block_size]) +AT_KEYWORDS([sack synthetic system file negative zlib]) +zcompressed_sack | sed 's/.*block_size.*/>>0x1000<<;/' > sys-file.sack +for variant in \ + "be 8d5a1caa56be8892d453faf1047005ca" \ + "le 7daa1bd57b192893b313a351202e179b" +do + set $variant + AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2] +]) + AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'. +]) + AT_CHECK([pspp -o pspp.csv sys-file.sps], [0], [warning: `sys-file.sav' near offset 0x1fd: ZLIB trailer specifies unexpected 4096-byte block size. +]) +done +AT_CLEANUP + +AT_SETUP([zcompressed data - wrong n_blocks]) +AT_KEYWORDS([sack synthetic system file negative zlib]) +zcompressed_sack | sed 's/.*n_blocks.*/>>2<<;/' > sys-file.sack +for variant in \ + "be cd29596fd6bf4a2f651febe820a7955f" \ + "le 8fc1f718dfd2abac7c3442c1055d4cab" +do + set $variant + AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2] +]) + AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'. +]) + AT_CHECK([pspp -o pspp.csv sys-file.sps], [1], [error: `sys-file.sav' near offset 0x201: 48-byte ZLIB trailer specifies 2 data blocks (expected 1). 
+]) +done +AT_CLEANUP + +AT_SETUP([zcompressed data - wrong uncompressed_ofs]) +AT_KEYWORDS([sack synthetic system file negative zlib]) +zcompressed_sack | sed 's/.*uncompressed_ofs.*/i64 >>0x177<<;/' > sys-file.sack +for variant in \ + "be 5546120fe6161dc6ed20aec48d8e74a4" \ + "le 86fafd625ed5ceaa1bff4fc7f500b6ab" +do + set $variant + AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2] +]) + AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'. +]) + AT_CHECK([pspp -o pspp.csv sys-file.sps], [1], [error: `sys-file.sav' near offset 0x201: ZLIB block descriptor 0 reported uncompressed data offset 0x177, when 0x178 was expected. +]) +done +AT_CLEANUP + +AT_SETUP([zcompressed data - wrong compressed_ofs]) +AT_KEYWORDS([sack synthetic system file negative zlib]) +zcompressed_sack | sed 's/.*\bcompressed_ofs.*/i64 >>0x191<<;/' > sys-file.sack +for variant in \ + "be 652e28f8d3f8e4ce47ad18d0f30e7bb9" \ + "le ebf2c647f2d7c47858d4f5ed683526e6" +do + set $variant + AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2] +]) + AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'. +]) + AT_CHECK([pspp -o pspp.csv sys-file.sps], [1], [error: `sys-file.sav' near offset 0x201: ZLIB block descriptor 0 reported compressed data offset 0x191, when 0x190 was expected. +]) +done +AT_CLEANUP + +AT_SETUP([zcompressed data - compressed sizes don't add up]) +AT_KEYWORDS([sack synthetic system file negative zlib]) +AT_DATA([sys-file.sack], [dnl +dnl File header. +"$FL3"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file"; +2; dnl Layout code +6; dnl Nominal case size +2; dnl zlib compressed +0; dnl Not weighted +-1; dnl Unspecified number of cases. +100.0; dnl Bias. +"01 Jan 11"; "20:53:52"; s64 "PSPP synthetic test file"; +i8 0 *3; + +dnl Numeric variables. +2; 0; 0; 0; 0x050800 *2; s8 "NUM1"; +2; 0; 0; 0; 0x050800 *2; s8 "NUM2"; + +dnl String variable. 
+2; 4; 0; 0; 0x010400 *2; s8 "STR4"; +2; 8; 0; 0; 0x010800 *2; s8 "STR8"; +2; 15; 0; 0; 0x010f00 *2; s8 "STR15"; +2; -1; 0; 0; 0; 0; s8 ""; + +dnl Dictionary termination record. +999; 0; + +dnl ZLIB data header. +i64 0x178; # zheader_ofs +i64 0x190; # ztrailer_ofs +i64 72; # ztrailer_len + +dnl This is where the ZLIB data blocks would go, but we don't need any to +dnl provoke this message so we omit them. + +dnl ZLIB data trailer fixed header: +i64 -100; # ztrailer_bias +i64 0; # ztrailer_zero +0x3ff000; # block_size +2; # n_blocks + +dnl ZLIB block descriptor 1: +i64 0x178; # uncompressed_ofs +i64 0x190; # compressed_ofs +0x100000; # uncompressed_size +0x12345; # compressed_size + +dnl ZLIB block descriptor 2: +i64 0x100178; # uncompressed_ofs +i64 0x12405; # compressed_ofs +0x100000; # uncompressed_size +0x12345; # compressed_size +]) +for variant in \ + "be 72ebf57bffa340afe16ed79959faac09" \ + "le 80b34e98f6b181dcc2e8ca4ba13f768d" +do + set $variant + AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2] +]) + AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'. +]) + AT_CHECK([pspp -o pspp.csv sys-file.sps], [1], [warning: `sys-file.sav' near offset 0x1a8: ZLIB block descriptor 0 reported block size 0x100000, when 0x3ff000 was expected. +error: `sys-file.sav' near offset 0x1c0: ZLIB block descriptor 1 reported compressed data offset 0x12405, when 0x124d5 was expected. +]) +done +AT_CLEANUP + +AT_SETUP([zcompressed data - uncompressed_size > block_size]) +AT_KEYWORDS([sack synthetic system file negative zlib]) +zcompressed_sack | sed 's/.*uncompressed_size.*/>>0x400000<<;/' > sys-file.sack +for variant in \ + "be 9bb74ef407fe0b79e43c388eedc28212" \ + "le 6f145fb5f820c513f50b6f81310cdad5" +do + set $variant + AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2] +]) + AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'. 
+]) + AT_CHECK([pspp -o pspp.csv sys-file.sps], [0], [warning: `sys-file.sav' near offset 0x201: ZLIB block descriptor 0 reported block size 0x400000, when at most 0x3ff000 was expected. +]) +done +AT_CLEANUP + +AT_SETUP([zcompressed data - compression expands data too much]) +AT_KEYWORDS([sack synthetic system file negative zlib]) +zcompressed_sack | sed 's/.*uncompressed_size.*/>>50<<;/ +s/.*\bcompressed_size.*/>>100<<;/' > sys-file.sack +for variant in \ + "be e11cadde5f0855c965a1cb388dedc36e" \ + "le 37953e71462b6554c5644fec8b539164" +do + set $variant + AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2] +]) + AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'. +]) + AT_CHECK([pspp -o pspp.csv sys-file.sps], [1], [error: `sys-file.sav' near offset 0x201: ZLIB block descriptor 0 reports compressed size 100 and uncompressed size 50. +]) +done +AT_CLEANUP + +AT_SETUP([zcompressed data - compressed sizes don't add up]) +AT_KEYWORDS([sack synthetic system file negative zlib]) +zcompressed_sack | sed 's/.*\bcompressed_size.*/>>88<<;/' > sys-file.sack +for variant in \ + "be 366eaf85be1f26fb6549e2f8ee393628" \ + "le a756e5125e6a908cb4990f66cc419bef" +do + set $variant + AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2] +]) + AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'. +]) + AT_CHECK([pspp -o pspp.csv sys-file.sps], [1], [error: `sys-file.sav' near offset 0x219: ZLIB trailer is at offset 0x1e9 but 0x1e8 would be expected from block descriptors. +]) +done +AT_CLEANUP diff --git a/tests/data/sys-file.at b/tests/data/sys-file.at index cd7fbb4f50..9b8986d589 100644 --- a/tests/data/sys-file.at +++ b/tests/data/sys-file.at @@ -3,7 +3,10 @@ AT_BANNER([system files]) # Also tests that long variable names are preserved by SAVE and GET. 
AT_SETUP([write and read numeric data]) AT_KEYWORDS([SAVE GET system file]) -AT_DATA([sysfile.sps], [dnl +for variant in 'UNCOMPRESSED $FL2' 'ZCOMPRESSED $FL3'; do + set $variant + compression=$1 magic=$2 + cat >sysfile.sps <sysfile.sps < 1) printf ("Reading \"%s\":\n", r.file_name); @@ -218,8 +226,13 @@ main (int argc, char *argv[]) (long long int) ftello (r.file), (long long int) ftello (r.file) + 4); - if (r.compressed && max_cases > 0) - read_compressed_data (&r, max_cases); + if (r.compression == COMP_SIMPLE) + { + if (max_cases > 0) + read_simple_compressed_data (&r, max_cases); + } + else if (r.compression == COMP_ZLIB) + read_zlib_compressed_data (&r); fclose (r.file); } @@ -241,11 +254,16 @@ read_header (struct sfm_reader *r) char creation_date[10]; char creation_time[9]; char file_label[65]; + bool zmagic; read_string (r, rec_type, sizeof rec_type); read_string (r, eye_catcher, sizeof eye_catcher); - if (strcmp ("$FL2", rec_type) != 0) + if (!strcmp ("$FL2", rec_type)) + zmagic = false; + else if (!strcmp ("$FL3", rec_type)) + zmagic = true; + else sys_error (r, "This is not an SPSS system file."); /* Identify integer format. */ @@ -265,7 +283,24 @@ read_header (struct sfm_reader *r) weight_index = read_int (r); ncases = read_int (r); - r->compressed = compressed != 0; + if (!zmagic) + { + if (compressed == 0) + r->compression = COMP_NONE; + else if (compressed == 1) + r->compression = COMP_SIMPLE; + else if (compressed != 0) + sys_error (r, "SAV file header has invalid compression value " + "%"PRId32".", compressed); + } + else + { + if (compressed == 2) + r->compression = COMP_ZLIB; + else + sys_error (r, "ZSAV file header has invalid compression value " + "%"PRId32".", compressed); + } /* Identify floating-point format and obtain compression bias. 
*/ read_bytes (r, raw_bias, sizeof raw_bias); @@ -289,7 +324,12 @@ read_header (struct sfm_reader *r) printf ("File header record:\n"); printf ("\t%17s: %s\n", "Product name", eye_catcher); printf ("\t%17s: %"PRId32"\n", "Layout code", layout_code); - printf ("\t%17s: %"PRId32"\n", "Compressed", compressed); + printf ("\t%17s: %"PRId32" (%s)\n", "Compressed", + compressed, + r->compression == COMP_NONE ? "no compression" + : r->compression == COMP_SIMPLE ? "simple compression" + : r->compression == COMP_ZLIB ? "ZLIB compression" + : ""); printf ("\t%17s: %"PRId32"\n", "Weight index", weight_index); printf ("\t%17s: %"PRId32"\n", "Number of cases", ncases); printf ("\t%17s: %g\n", "Compression bias", r->bias); @@ -1170,7 +1210,7 @@ read_variable_attributes (struct sfm_reader *r, size_t size, size_t count) } static void -read_compressed_data (struct sfm_reader *r, int max_cases) +read_simple_compressed_data (struct sfm_reader *r, int max_cases) { enum { N_OPCODES = 8 }; uint8_t opcodes[N_OPCODES]; @@ -1258,6 +1298,87 @@ read_compressed_data (struct sfm_reader *r, int max_cases) } } } + +static void +read_zlib_compressed_data (struct sfm_reader *r) +{ + long long int ofs; + long long int this_ofs, next_ofs, next_len; + long long int bias, zero; + long long int expected_uncmp_ofs, expected_cmp_ofs; + unsigned int block_size, n_blocks; + unsigned int i; + + read_int (r); + ofs = ftello (r->file); + printf ("\n%08llx: ZLIB compressed data header:\n", ofs); + + this_ofs = read_int64 (r); + next_ofs = read_int64 (r); + next_len = read_int64 (r); + + printf ("\tzheader_ofs: 0x%llx\n", this_ofs); + if (this_ofs != ofs) + printf ("\t\t(Expected 0x%llx.)\n", ofs); + printf ("\tztrailer_ofs: 0x%llx\n", next_ofs); + printf ("\tztrailer_len: %lld\n", next_len); + if (next_len < 24 || next_len % 24) + printf ("\t\t(Trailer length is not a positive multiple of 24.)\n"); + + printf ("\n%08llx: 0x%llx bytes of ZLIB compressed data\n", + ofs + 8 * 3, next_ofs - (ofs + 8 * 3)); + + 
skip_bytes (r, next_ofs - (ofs + 8 * 3)); + + printf ("\n%08llx: ZLIB trailer fixed header:\n", next_ofs); + bias = read_int64 (r); + zero = read_int64 (r); + block_size = read_int (r); + n_blocks = read_int (r); + printf ("\tbias: %lld\n", bias); + printf ("\tzero: 0x%llx\n", zero); + if (zero != 0) + printf ("\t\t(Expected 0.)\n"); + printf ("\tblock_size: 0x%x\n", block_size); + if (block_size != 0x3ff000) + printf ("\t\t(Expected 0x3ff000.)\n"); + printf ("\tn_blocks: %u\n", n_blocks); + if (n_blocks != next_len / 24 - 1) + printf ("\t\t(Expected %lld.)\n", next_len / 24 - 1); + + expected_uncmp_ofs = ofs; + expected_cmp_ofs = ofs + 24; + for (i = 0; i < n_blocks; i++) + { + long long int blockinfo_ofs = ftello (r->file); + unsigned long long int uncompressed_ofs = read_int64 (r); + unsigned long long int compressed_ofs = read_int64 (r); + unsigned int uncompressed_size = read_int (r); + unsigned int compressed_size = read_int (r); + + printf ("\n%08llx: ZLIB block descriptor %u\n", blockinfo_ofs, i + 1); + + printf ("\tuncompressed_ofs: 0x%llx\n", uncompressed_ofs); + if (uncompressed_ofs != expected_uncmp_ofs) + printf ("\t\t(Expected 0x%llx.)\n", expected_uncmp_ofs); + + printf ("\tcompressed_ofs: 0x%llx\n", compressed_ofs); + if (compressed_ofs != expected_cmp_ofs) + printf ("\t\t(Expected 0x%llx.)\n", expected_cmp_ofs); + + printf ("\tuncompressed_size: 0x%x\n", uncompressed_size); + if (i < n_blocks - 1 && uncompressed_size != block_size) + printf ("\t\t(Expected 0x%x.)\n", block_size); + + printf ("\tcompressed_size: 0x%x\n", compressed_size); + if (i == n_blocks - 1 && compressed_ofs + compressed_size != next_ofs) + printf ("\t\t(This was expected to be 0x%llx.)\n", + next_ofs - compressed_ofs); + + expected_uncmp_ofs += uncompressed_size; + expected_cmp_ofs += compressed_size; + } +} /* Helpers for reading records that consist of structured text strings. */ -- 2.30.2