Implement ZLIB compressed system file reader and writer.

author Ben Pfaff <blp@cs.stanford.edu>

Wed, 23 Oct 2013 05:56:18 +0000 (22:56 -0700)

committer Ben Pfaff <blp@cs.stanford.edu>

Wed, 23 Oct 2013 05:56:56 +0000 (22:56 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Wed, 23 Oct 2013 05:56:18 +0000 (22:56 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Wed, 23 Oct 2013 05:56:56 +0000 (22:56 -0700)
diff --git a/NEWS b/NEWS

index 5b166c3ef13b26a05a80d32c3a35f5d9dcd04cc1..e3da3e489aa883bdad126b1e2dd301a81837a03a 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -9,6 +9,10 @@ Changes since 0.8.1:
   * Charts are now rendered with colours from the Tango palette instead
     of fully saturated primaries.
  
+ * PSPP can now read and write ZCOMPRESSED system files, a new format
+   variant that compresses data much more effectively than the
+   previous form of compression (which is still supported).
+
   * Missing values for long string variables are now read from and
     written to system files in an SPSS-compatible fashion.
  
diff --git a/doc/dev/system-file-format.texi b/doc/dev/system-file-format.texi

index f408ff2866bcf18cf6168b14b2fd6a9cee211e84..89c35aab3f16dbe064f14dc94c3ced83ce285f60 100644 (file)
--- a/doc/dev/system-file-format.texi
+++ b/doc/dev/system-file-format.texi
@@ -56,6 +56,18 @@ appears in system files only in missing value ranges, which never
  contain SYSMIS.
  @end table
  
+System files may use most character encodings based on an 8-bit unit.
+UTF-16 and UTF-32, based on wider units, appear to be unacceptable.
+@code{rec_type} in the file header record is sufficient to distinguish
+between ASCII and EBCDIC based encodings.  The best way to determine
+the specific encoding in use is to consult the character encoding
+record (@pxref{Character Encoding Record}), if present, and failing
+that the @code{character_code} in the machine integer info record
+(@pxref{Machine Integer Info Record}).  The same encoding should be
+used for the dictionary and the data in the file, although it is
+possible to artificially synthesize files that use different encodings
+(@pxref{Character Encoding Record}).
+
  System files are divided into records, each of which begins with a
  4-byte record type, usually regarded as an @code{int32}.
  
@@ -121,7 +133,7 @@ char                rec_type[4];
  char                prod_name[60];
  int32               layout_code;
  int32               nominal_case_size;
-int32               compressed;
+int32               compression;
  int32               weight_index;
  int32               ncases;
  flt64               bias;
@@ -133,9 +145,15 @@ char                padding[3];
  
  @table @code
  @item char rec_type[4];
-Record type code, set to @samp{$FL2}, that is, either @code{24 46 4c
-32} if the file uses an ASCII-based character encoding, or @code{5b c6
-d3 f2} if the file uses an EBCDIC-based character encoding.
+Record type code, either @samp{$FL2} for system files with
+uncompressed data or data compressed with simple bytecode compression,
+or @samp{$FL3} for system files with ZLIB compressed data.
+
+This is truly a character field that uses the same character encoding
+as other strings.  Thus, in a file with an ASCII-based character encoding
+this field contains @code{24 46 4c 32} or @code{24 46 4c 33}, and in a
+file with an EBCDIC-based encoding this field contains @code{5b c6 d3
+f2}.  (No EBCDIC-based ZLIB-compressed files have been observed.)
  
  @item char prod_name[60];
  Product identification string.  This always begins with the characters
@@ -160,7 +178,10 @@ files written by some systems set this value to -1.  In general, it is
  unsafe for systems reading system files to rely upon this value.
  
  @item int32 compressed;
-Set to 1 if the data in the file is compressed, 0 otherwise.
+Set to 0 if the data in the file is not compressed, 1 if the data is
+compressed with simple bytecode compression, 2 if the data is ZLIB
+compressed.  This field has value 2 if and only if @code{rec_type} is
+@samp{$FL3}.
  
  @item int32 weight_index;
  If one of the variables in the data set is used as a weighting
@@ -577,7 +598,8 @@ Floating point representation code.  For IEEE 754 systems this is 1.
  IBM 370 sets this to 2, and DEC VAX E to 3.
  
  @item int32 compression_code;
-Compression code.  Always set to 1.
+Compression code.  Always set to 1, regardless of whether or how the
+file is compressed.
  
  @item int32 endianness;
  Machine endianness.  1 indicates big-endian, 2 indicates little-endian.
@@ -1434,22 +1456,23 @@ Ignored padding.  Should be set to 0.
  @node Data Record
  @section Data Record
  
-Data records must follow all other records in the system file.  There must
-be at least one data record in every system file.
-
-The format of data records varies depending on whether the data is
-compressed.  Regardless, the data is arranged in a series of 8-byte
-elements.
+The data record must follow all other records in the system file.
+Every system file must have a data record that specifies data for at
+least one case.  The format of the data record varies depending on the
+value of @code{compression} in the file header record:
  
-When data is not compressed,
-each element corresponds to
+@table @asis
+@item 0: no compression
+Data is arranged as a series of 8-byte elements.
+Each element corresponds to
  the variable declared in the respective variable record (@pxref{Variable
  Record}).  Numeric values are given in @code{flt64} format; string
  values are literal characters string, padded on the right when
  necessary to fill out 8-byte units.
  
-Compressed data is arranged in the following manner: the first 8 bytes
-in the data section is divided into a series of 1-byte command
+@item 1: bytecode compression
+The first 8 bytes
+of the data record are divided into a series of 1-byte command
  codes.  These codes have meanings as described below:
  
  @table @asis
@@ -1487,8 +1510,125 @@ An 8-byte string value that is all spaces.
  The system-missing value.
  @end table
  
-When the end of the an 8-byte group of command bytes is reached, any
-blocks of non-compressible values indicated by code 253 are skipped,
-and the next element of command bytes is read and interpreted, until
-the end of the file or a code with value 252 is reached.
+The end of the 8-byte group of bytecodes is followed by any 8-byte
+blocks of non-compressible values indicated by code 253.  After that
+follows another 8-byte group of bytecodes, then those bytecodes'
+non-compressible values.  The pattern repeats to the end of the file
+or a code with value 252.
+
+@item 2: ZLIB compression
+The data record consists of the following, in order:
+
+@itemize @bullet
+@item
+ZLIB data header, 24 bytes long.
+
+@item
+One or more variable-length blocks of ZLIB compressed data.
+
+@item
+ZLIB data trailer, with a 24-byte fixed header plus an additional 24
+bytes for each preceding ZLIB compressed data block.
+@end itemize
+
+The ZLIB data header has the following format:
+
+@example
+int64               zheader_ofs;
+int64               ztrailer_ofs;
+int64               ztrailer_len;
+@end example
+
+@table @code
+@item int64 zheader_ofs;
+The offset, in bytes, of the beginning of this structure within the
+system file.
+
+@item int64 ztrailer_ofs;
+The offset, in bytes, of the first byte of the ZLIB data trailer.
+
+@item int64 ztrailer_len;
+The number of bytes in the ZLIB data trailer.  This and the previous
+field sum to the size of the system file in bytes.
+@end table
+
+The data header is followed by @code{(ztrailer_len - 24) / 24} ZLIB
+compressed data blocks.  Each ZLIB compressed data block begins with a
+ZLIB header as specified in RFC@tie{}1950, e.g.@: hex bytes @code{78
+01} (the only header yet observed in practice).  Each block
+decompresses to a fixed number of bytes (in practice only
+@code{0x3ff000}-byte blocks have been observed), except that the last
+block of data may be shorter.  The last ZLIB compressed data block
+ends just before offset @code{ztrailer_ofs}.
+
+The result of ZLIB decompression is bytecode compressed data as
+described above for compression format 1.
+
+The ZLIB data trailer begins with the following 24-byte fixed header:
+
+@example
+int64               int_bias;
+int64               zero;
+int32               block_size;
+int32               n_blocks;
+@end example
+
+@table @code
+@item int64 int_bias;
+The compression bias as a negative integer, e.g.@: if @code{bias} in
+the file header record is 100.0, then @code{int_bias} is @minus{}100
+(this is the only value yet observed in practice).
+
+@item int64 zero;
+Always observed to be zero.
+
+@item int32 block_size;
+The number of bytes in each ZLIB compressed data block, except
+possibly the last, following decompression.  Only @code{0x3ff000} has
+been observed so far.
+
+@item int32 n_blocks;
+The number of ZLIB compressed data blocks, always exactly
+@code{(ztrailer_len - 24) / 24}.
+@end table
+
+The fixed header is followed by @code{n_blocks} 24-byte ZLIB data
+block descriptors, each of which describes the compressed data block
+corresponding to its offset.  Each block descriptor has the following
+format:
+
+@example
+int64               uncompressed_ofs;
+int64               compressed_ofs;
+int32               uncompressed_size;
+int32               compressed_size;
+@end example
+
+@table @code
+@item int64 uncompressed_ofs;
+The offset, in bytes, that this block of data would have in a similar
+system file that uses compression format 1.  This is
+@code{zheader_ofs} in the first block descriptor, and in each
+succeeding block descriptor it is the sum of the previous descriptor's
+@code{uncompressed_ofs} and @code{uncompressed_size}.
+
+@item int64 compressed_ofs;
+The offset, in bytes, of the actual beginning of this compressed data
+block.  This is @code{zheader_ofs + 24} in the first block descriptor,
+and in each succeeding block descriptor it is the sum of the previous
+descriptor's @code{compressed_ofs} and @code{compressed_size}.  The
+final block descriptor's @code{compressed_ofs} and
+@code{compressed_size} sum to @code{ztrailer_ofs}.
+
+@item int32 uncompressed_size;
+The number of bytes in this data block, after decompression.  This is
+@code{block_size} in every data block except the last, which may be
+smaller.
+
+@item int32 compressed_size;
+The number of bytes in this data block, as stored compressed in this
+system file.
+@end table
+@end table
+
  @setfilename ignored
diff --git a/doc/files.texi b/doc/files.texi

index 7a9782381980ea25e9783acd145d455fcc8ff7c7..369a2e46f48737b483243748a4e521026a61801c 100644 (file)
--- a/doc/files.texi
+++ b/doc/files.texi
@@ -684,7 +684,7 @@ Use of @cmd{IMPORT} to read a system file is a @pspp{} extension.
  SAVE
          /OUTFILE=@{'@var{file_name}',@var{file_handle}@}
          /UNSELECTED=@{RETAIN,DELETE@}
-        /@{COMPRESSED,UNCOMPRESSED@}
+        /@{UNCOMPRESSED,COMPRESSED,ZCOMPRESSED@}
          /PERMISSIONS=@{WRITEABLE,READONLY@}
          /DROP=@var{var_list}
          /KEEP=@var{var_list}
@@ -706,9 +706,32 @@ By default, cases excluded with FILTER are written to the system file.
  These can be excluded by specifying @subcmd{DELETE} on the @subcmd{UNSELECTED}
  subcommand.  Specifying @subcmd{RETAIN} makes the default explicit.
  
-The @subcmd{COMPRESS} and @subcmd{UNCOMPRESS} subcommand determine whether
-the saved system file is compressed.  By default, system files are compressed.
-This default can be changed with the SET command (@pxref{SET}).
+The @subcmd{UNCOMPRESSED}, @subcmd{COMPRESSED}, and
+@subcmd{ZCOMPRESSED} subcommands determine the system file's
+compression level:
+
+@table @code
+@item UNCOMPRESSED
+Data is not compressed.  Each numeric value uses 8 bytes of disk
+space.  Each string value uses one byte per column width, rounded up
+to a multiple of 8 bytes.
+
+@item COMPRESSED
+Data is compressed with a simple algorithm.  Each integer numeric
+value between @minus{}99 and 151, inclusive, or system missing value
+uses one byte of disk space.  Each 8-byte segment of a string that
+consists only of spaces uses 1 byte.  Any other numeric value or
+8-byte string segment uses 9 bytes of disk space.
+
+@item ZCOMPRESSED
+Data is compressed with the ``deflate'' compression algorithm
+specified in RFC@tie{}1951 (the same algorithm used by
+@command{gzip}).  Files written with this compression level cannot be
+read by PSPP 0.8.1 or earlier or by SPSS 20 or earlier.
+@end table
+
+@subcmd{COMPRESSED} is the default compression level.  The SET command
+(@pxref{SET}) can change this default.
  
  The @subcmd{PERMISSIONS} subcommand specifies permissions for the new system
  file.  WRITEABLE, the default, creates the file with read and write
@@ -938,7 +961,7 @@ the data is read by a procedure or procedure-like command.
  @display
  XSAVE
          /OUTFILE='@var{file_name}'
-        /@{COMPRESSED,UNCOMPRESSED@}
+        /@{UNCOMPRESSED,COMPRESSED,ZCOMPRESSED@}
          /PERMISSIONS=@{WRITEABLE,READONLY@}
          /DROP=@var{var_list}
          /KEEP=@var{var_list}
diff --git a/perl-module/PSPP.xs b/perl-module/PSPP.xs

index 0895f641a8577570fe5aff2a011d356fc1cf72f8..802aabf5c0f0216240d52d51e6ae51c81ceb8c06 100644 (file)
--- a/perl-module/PSPP.xs
+++ b/perl-module/PSPP.xs
@@ -632,7 +632,9 @@ INIT:
      SV** version = hv_fetch(opt_h, "version", 7, 0);
  
      opts.create_writeable = readonly ? ! SvIV (*readonly) : true;
-    opts.compress = compress ? SvIV (*compress) : false;
+    opts.compression = (compress && SvIV (*compress)
+                        ? SFM_COMP_SIMPLE
+                       : SFM_COMP_NONE);
      opts.version = version ? SvIV (*version) : 3 ;
    }
  CODE:
diff --git a/src/data/automake.mk b/src/data/automake.mk

index 4385fd6d63055ead3ddba1836ba57f8aedbef006..9ce405bb656b095b48c8812b950e4dd8ccb88951 100644 (file)
--- a/src/data/automake.mk
+++ b/src/data/automake.mk
@@ -107,6 +107,7 @@ src_data_libdata_la_SOURCES = \
         src/data/sys-file-reader.h \
         src/data/sys-file-writer.c \
         src/data/sys-file-writer.h \
+       src/data/sys-file.h \
         src/data/transformations.c \
         src/data/transformations.h \
         src/data/val-type.h \
diff --git a/src/data/sys-file-private.h b/src/data/sys-file-private.h

index 21ff8ade3a6026fe4e882997292598e83bf4c08e..a39b0c1aa304c5774f729151eabf5973ef1e7c6e 100644 (file)
--- a/src/data/sys-file-private.h
+++ b/src/data/sys-file-private.h
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 2006-2007, 2009-2012 Free Software Foundation, Inc.
+   Copyright (C) 2006-2007, 2009-2013 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -35,14 +35,19 @@
  
  struct dictionary;
  
-/* Magic numbers.
+/* ASCII magic numbers. */
+#define ASCII_MAGIC  "$FL2"     /* For regular files. */
+#define ASCII_ZMAGIC "$FL3"     /* For ZLIB compressed files. */
  
-   Both of these are actually $FL2 in the respective character set.  The "FL2"
-   part is invariant among national variants of each character set, but "$" has
-   different encodings, so it is safer to write them as hexadecimal. */
-#define ASCII_MAGIC  "\x24\x46\x4c\x32"
+/* EBCDIC magic number, the same as ASCII_MAGIC but encoded in EBCDIC.
+
+   No EBCDIC ZLIB compressed files have been observed, so we do not define
+   EBCDIC_ZMAGIC even though the value is obvious. */
  #define EBCDIC_MAGIC "\x5b\xc6\xd3\xf2"
  
+/* Amount of data that ZLIB compressed data blocks typically decompress to. */
+#define ZBLOCK_SIZE 0x3ff000
+
  /* A variable in a system file. */
  struct sfm_var
    {
diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c

index d553b3a0e5796b4393a4f37d03c38057a14501c8..9a4ef860423cd18193fc4fedb0514208d8dddecf 100644 (file)
--- a/src/data/sys-file-reader.c
+++ b/src/data/sys-file-reader.c
@@ -24,6 +24,8 @@
  #include <inttypes.h>
  #include <setjmp.h>
  #include <stdlib.h>
+#include <sys/stat.h>
+#include <zlib.h>
  
  #include "data/attributes.h"
  #include "data/case.h"
@@ -57,6 +59,7 @@
  #include "gl/minmax.h"
  #include "gl/unlocked-io.h"
  #include "gl/xalloc.h"
+#include "gl/xalloc-oversized.h"
  #include "gl/xsize.h"
  
  #include "gettext.h"
@@ -173,11 +176,21 @@ struct sfm_reader
      const char *encoding;       /* String encoding. */
  
      /* Decompression. */
-    bool compressed;           /* File is compressed? */
+    enum sfm_compression compression;
      double bias;               /* Compression bias, usually 100.0. */
      uint8_t opcodes[8];         /* Current block of opcodes. */
      size_t opcode_idx;          /* Next opcode to interpret, 8 if none left. */
      bool corruption_warning;    /* Warned about possible corruption? */
+
+    /* ZLIB decompression. */
+    long long int ztrailer_ofs; /* Offset of ZLIB trailer at end of file. */
+#define ZIN_BUF_SIZE  4096
+    uint8_t *zin_buf;           /* Inflation input buffer. */
+#define ZOUT_BUF_SIZE 16384
+    uint8_t *zout_buf;          /* Inflation output buffer. */
+    unsigned int zout_end;      /* Number of bytes of data in zout_buf. */
+    unsigned int zout_pos;      /* First unconsumed byte in zout_buf. */
+    z_stream zstream;           /* ZLIB inflater. */
    };
  
  static const struct casereader_class sys_file_casereader_class;
@@ -200,10 +213,19 @@ static void sys_error (struct sfm_reader *, off_t, const char *, ...)
  static void read_bytes (struct sfm_reader *, void *, size_t);
  static bool try_read_bytes (struct sfm_reader *, void *, size_t);
  static int read_int (struct sfm_reader *);
-static double read_float (struct sfm_reader *);
+static long long int read_int64 (struct sfm_reader *);
  static void read_string (struct sfm_reader *, char *, size_t);
  static void skip_bytes (struct sfm_reader *, size_t);
  
+/* ZLIB compressed data handling. */
+static void read_zheader (struct sfm_reader *);
+static void open_zstream (struct sfm_reader *);
+static void close_zstream (struct sfm_reader *);
+static bool read_bytes_zlib (struct sfm_reader *, void *, size_t);
+static void read_compressed_bytes (struct sfm_reader *, void *, size_t);
+static bool try_read_compressed_bytes (struct sfm_reader *, void *, size_t);
+static double read_compressed_float (struct sfm_reader *);
+
  static char *fix_line_ends (const char *);
  
  static int parse_int (struct sfm_reader *, const void *data, size_t ofs);
@@ -367,6 +389,7 @@ sfm_open_reader (struct file_handle *fh, const char *volatile encoding,
    r->error = false;
    r->opcode_idx = sizeof r->opcodes;
    r->corruption_warning = false;
+  r->zin_buf = r->zout_buf = NULL;
  
    info = infop ? infop : xmalloc (sizeof *info);
    memset (info, 0, sizeof *info);
@@ -472,6 +495,9 @@ sfm_open_reader (struct file_handle *fh, const char *volatile encoding,
          }
      }
  
+  if (r->compression == SFM_COMP_ZLIB)
+    read_zheader (r);
+
    /* Now actually parse what we read.
  
       First, figure out the correct character encoding, because this determines
@@ -646,7 +672,9 @@ sfm_detect (FILE *file)
      return false;
    magic[4] = '\0';
  
-  return !strcmp (ASCII_MAGIC, magic) || !strcmp (EBCDIC_MAGIC, magic);
+  return (!strcmp (ASCII_MAGIC, magic)
+          || !strcmp (ASCII_ZMAGIC, magic)
+          || !strcmp (EBCDIC_MAGIC, magic));
  }
  \f
  /* Reads the global header of the system file.  Initializes *HEADER and *INFO,
@@ -658,12 +686,18 @@ read_header (struct sfm_reader *r, struct sfm_read_info *info,
  {
    uint8_t raw_layout_code[4];
    uint8_t raw_bias[8];
+  int compressed;
+  bool zmagic;
  
    read_string (r, header->magic, sizeof header->magic);
    read_string (r, header->eye_catcher, sizeof header->eye_catcher);
  
-  if (strcmp (ASCII_MAGIC, header->magic)
-      && strcmp (EBCDIC_MAGIC, header->magic))
+  if (!strcmp (ASCII_MAGIC, header->magic)
+      || !strcmp (EBCDIC_MAGIC, header->magic))
+    zmagic = false;
+  else if (!strcmp (ASCII_ZMAGIC, header->magic))
+    zmagic = true;
+  else
      sys_error (r, 0, _("This is not an SPSS system file."));
  
    /* Identify integer format. */
@@ -681,7 +715,25 @@ read_header (struct sfm_reader *r, struct sfm_read_info *info,
        || header->nominal_case_size > INT_MAX / 16)
      header->nominal_case_size = -1;
  
-  r->compressed = read_int (r) != 0;
+  compressed = read_int (r);
+  if (!zmagic)
+    {
+      if (compressed == 0)
+        r->compression = SFM_COMP_NONE;
+      else if (compressed == 1)
+        r->compression = SFM_COMP_SIMPLE;
+      else if (compressed != 0)
+        sys_error (r, 0, "System file header has invalid compression "
+                   "value %d.", compressed);
+    }
+  else
+    {
+      if (compressed == 2)
+        r->compression = SFM_COMP_ZLIB;
+      else
+        sys_error (r, 0, "ZLIB-compressed system file header has invalid "
+                   "compression value %d.", compressed);
+    }
  
    header->weight_idx = read_int (r);
  
@@ -723,7 +775,7 @@ read_header (struct sfm_reader *r, struct sfm_read_info *info,
  
    info->integer_format = r->integer_format;
    info->float_format = r->float_format;
-  info->compressed = r->compressed;
+  info->compression = r->compression;
    info->case_cnt = r->case_cnt;
  }
  
@@ -2289,7 +2341,7 @@ read_error (struct casereader *r, const struct sfm_reader *sfm)
  static bool
  read_case_number (struct sfm_reader *r, double *d)
  {
-  if (!r->compressed)
+  if (r->compression == SFM_COMP_NONE)
      {
        uint8_t number[8];
        if (!try_read_bytes (r, number, sizeof number))
@@ -2339,13 +2391,13 @@ read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
  static int
  read_opcode (struct sfm_reader *r)
  {
-  assert (r->compressed);
+  assert (r->compression != SFM_COMP_NONE);
    for (;;)
      {
        int opcode;
        if (r->opcode_idx >= sizeof r->opcodes)
          {
-          if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
+          if (!try_read_compressed_bytes (r, r->opcodes, sizeof r->opcodes))
              return -1;
            r->opcode_idx = 0;
          }
@@ -2370,7 +2422,7 @@ read_compressed_number (struct sfm_reader *r, double *d)
        return false;
  
      case 253:
-      *d = read_float (r);
+      *d = read_compressed_float (r);
        break;
  
      case 254:
@@ -2411,7 +2463,7 @@ read_compressed_string (struct sfm_reader *r, uint8_t *dst)
        return false;
  
      case 253:
-      read_bytes (r, dst, 8);
+      read_compressed_bytes (r, dst, 8);
        break;
  
      case 254:
@@ -2453,7 +2505,7 @@ static bool
  read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
  {
    assert (length % 8 == 0);
-  if (!r->compressed)
+  if (r->compression == SFM_COMP_NONE)
      return try_read_bytes (r, s, length);
    else
      {
@@ -2820,14 +2872,14 @@ read_int (struct sfm_reader *r)
    return integer_get (r->integer_format, integer, sizeof integer);
  }
  
-/* Reads a 64-bit floating-point number from R and returns its
-   value in host format. */
-static double
-read_float (struct sfm_reader *r)
+/* Reads a 64-bit signed integer from R and returns its value in
+   host format. */
+static long long int
+read_int64 (struct sfm_reader *r)
  {
-  uint8_t number[8];
-  read_bytes (r, number, sizeof number);
-  return float_get_double (r->float_format, number);
+  uint8_t integer[8];
+  read_bytes (r, integer, sizeof integer);
+  return integer_get (r->integer_format, integer, sizeof integer);
  }
  
  static int
@@ -2894,6 +2946,308 @@ fix_line_ends (const char *s)
    return dst;
  }
  \f
+static void
+read_ztrailer (struct sfm_reader *r,
+               long long int zheader_ofs,
+               long long int ztrailer_len);
+
+static void *
+zalloc (voidpf pool_, uInt items, uInt size)
+{
+  struct pool *pool = pool_;
+
+  return (!size || xalloc_oversized (items, size)
+          ? Z_NULL
+          : pool_malloc (pool, items * size));
+}
+
+static void
+zfree (voidpf pool_, voidpf address)
+{
+  struct pool *pool = pool_;
+
+  pool_free (pool, address);
+}
+
+static void
+read_zheader (struct sfm_reader *r)
+{
+  off_t pos = r->pos;
+  long long int zheader_ofs = read_int64 (r);
+  long long int ztrailer_ofs = read_int64 (r);
+  long long int ztrailer_len = read_int64 (r);
+
+  if (zheader_ofs != pos)
+    sys_error (r, pos, _("Wrong ZLIB data header offset %#llx "
+                         "(expected %#llx)."),
+               zheader_ofs, (long long int) pos);
+
+  if (ztrailer_ofs < r->pos)
+    sys_error (r, pos, _("Impossible ZLIB trailer offset 0x%llx."),
+               ztrailer_ofs);
+
+  if (ztrailer_len < 24 || ztrailer_len % 24)
+    sys_error (r, pos, _("Invalid ZLIB trailer length %lld."), ztrailer_len);
+
+  r->ztrailer_ofs = ztrailer_ofs;
+  read_ztrailer (r, zheader_ofs, ztrailer_len);
+
+  if (r->zin_buf == NULL)
+    {
+      r->zin_buf = pool_malloc (r->pool, ZIN_BUF_SIZE);
+      r->zout_buf = pool_malloc (r->pool, ZOUT_BUF_SIZE);
+      r->zstream.next_in = NULL;
+      r->zstream.avail_in = 0;
+    }
+
+  r->zstream.zalloc = zalloc;
+  r->zstream.zfree = zfree;
+  r->zstream.opaque = r->pool;
+
+  open_zstream (r);
+}
+
+static void
+seek (struct sfm_reader *r, off_t offset)
+{
+  if (fseeko (r->file, offset, SEEK_SET))
+    sys_error (r, 0, _("%s: seek failed (%s)."),
+               fh_get_file_name (r->fh), strerror (errno));
+  r->pos = offset;
+}
+
+/* Performs some additional consistency checks on the ZLIB compressed data
+   trailer. */
+static void
+read_ztrailer (struct sfm_reader *r,
+               long long int zheader_ofs,
+               long long int ztrailer_len)
+{
+  long long int expected_uncmp_ofs;
+  long long int expected_cmp_ofs;
+  long long int bias;
+  long long int zero;
+  unsigned int block_size;
+  unsigned int n_blocks;
+  unsigned int i;
+  struct stat s;
+
+  if (fstat (fileno (r->file), &s))
+    sys_error (ME, 0, _("%s: stat failed (%s)."),
+               fh_get_file_name (r->fh), strerror (errno));
+
+  if (!S_ISREG (s.st_mode))
+    {
+      /* We can't seek to the trailer and then back to the data in this file,
+         so skip doing extra checks. */
+      return;
+    }
+
+  if (r->ztrailer_ofs + ztrailer_len != s.st_size)
+    sys_warn (r, r->pos,
+              _("End of ZLIB trailer (0x%llx) is not file size (0x%llx)."),
+              r->ztrailer_ofs + ztrailer_len, (long long int) s.st_size);
+
+  seek (r, r->ztrailer_ofs);
+
+  /* Read fixed header from ZLIB data trailer. */
+  bias = read_int64 (r);
+  if (-bias != r->bias)
+    sys_error (r, r->pos, _("ZLIB trailer bias (%lld) differs from "
+                            "file header bias (%.2f)."),
+               -bias, r->bias);
+
+  zero = read_int64 (r);
+  if (zero != 0)
+    sys_warn (r, r->pos,
+              _("ZLIB trailer \"zero\" field has nonzero value %lld."), zero);
+
+  block_size = read_int (r);
+  if (block_size != ZBLOCK_SIZE)
+    sys_warn (r, r->pos,
+              _("ZLIB trailer specifies unexpected %u-byte block size."),
+              block_size);
+
+  n_blocks = read_int (r);
+  if (n_blocks != (ztrailer_len - 24) / 24)
+    sys_error (r, r->pos,
+               _("%lld-byte ZLIB trailer specifies %u data blocks (expected "
+                 "%lld)."),
+               ztrailer_len, n_blocks, (ztrailer_len - 24) / 24);
+
+  expected_uncmp_ofs = zheader_ofs;
+  expected_cmp_ofs = zheader_ofs + 24;
+  for (i = 0; i < n_blocks; i++)
+    {
+      off_t desc_ofs = r->pos;
+      unsigned long long int uncompressed_ofs = read_int64 (r);
+      unsigned long long int compressed_ofs = read_int64 (r);
+      unsigned int uncompressed_size = read_int (r);
+      unsigned int compressed_size = read_int (r);
+
+      if (uncompressed_ofs != expected_uncmp_ofs)
+        sys_error (r, desc_ofs,
+                   _("ZLIB block descriptor %u reported uncompressed data "
+                     "offset %#llx, when %#llx was expected."),
+                   i, uncompressed_ofs, expected_uncmp_ofs);
+
+      if (compressed_ofs != expected_cmp_ofs)
+        sys_error (r, desc_ofs,
+                   _("ZLIB block descriptor %u reported compressed data "
+                     "offset %#llx, when %#llx was expected."),
+                   i, compressed_ofs, expected_cmp_ofs);
+
+      if (i < n_blocks - 1)
+        {
+          if (uncompressed_size != block_size)
+            sys_warn (r, desc_ofs,
+                      _("ZLIB block descriptor %u reported block size %#x, "
+                        "when %#x was expected."),
+                      i, uncompressed_size, block_size);
+        }
+      else
+        {
+          if (uncompressed_size > block_size)
+            sys_warn (r, desc_ofs,
+                      _("ZLIB block descriptor %u reported block size %#x, "
+                        "when at most %#x was expected."),
+                      i, uncompressed_size, block_size);
+        }
+
+      /* http://www.zlib.net/zlib_tech.html says that the maximum expansion
+         from compression, with worst-case parameters, is 13.5% plus 11 bytes.
+         This code checks for an expansion of more than 14.3% plus 11
+         bytes.  */
+      if (compressed_size > uncompressed_size + uncompressed_size / 7 + 11)
+        sys_error (r, desc_ofs,
+                   _("ZLIB block descriptor %u reports compressed size %u "
+                     "and uncompressed size %u."),
+                   i, compressed_size, uncompressed_size);
+
+      expected_uncmp_ofs += uncompressed_size;
+      expected_cmp_ofs += compressed_size;
+    }
+
+  if (expected_cmp_ofs != r->ztrailer_ofs)
+    sys_error (r, r->pos, _("ZLIB trailer is at offset %#llx but %#llx "
+                            "would be expected from block descriptors."),
+               r->ztrailer_ofs, expected_cmp_ofs);
+
+  seek (r, zheader_ofs + 24);
+}
+
+static void
+open_zstream (struct sfm_reader *r)
+{
+  int error;
+
+  r->zout_pos = r->zout_end = 0;
+  error = inflateInit (&r->zstream);
+  if (error != Z_OK)
+    sys_error (r, r->pos, _("ZLIB initialization failed (%s)."),
+               r->zstream.msg);
+}
+
+static void
+close_zstream (struct sfm_reader *r)
+{
+  int error;
+
+  error = inflateEnd (&r->zstream);
+  if (error != Z_OK)
+    sys_error (r, r->pos, _("Inconsistency at end of ZLIB stream (%s)."),
+               r->zstream.msg);
+}
+
+static bool
+read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t byte_cnt)
+{
+  uint8_t *buf = buf_;
+
+  if (byte_cnt == 0)
+    return true;
+
+  for (;;)
+    {
+      int error;
+
+      /* Use already inflated data if there is any. */
+      if (r->zout_pos < r->zout_end)
+        {
+          unsigned int n = MIN (byte_cnt, r->zout_end - r->zout_pos);
+          memcpy (buf, &r->zout_buf[r->zout_pos], n);
+          r->zout_pos += n;
+          byte_cnt -= n;
+          buf += n;
+
+          if (byte_cnt == 0)
+            return true;
+        }
+
+      /* We need to inflate some more data.
+         Get some more input data if we don't have any. */
+      if (r->zstream.avail_in == 0)
+        {
+          unsigned int n = MIN (ZIN_BUF_SIZE, r->ztrailer_ofs - r->pos);
+          if (n == 0 || !try_read_bytes (r, r->zin_buf, n))
+            return false;
+          r->zstream.avail_in = n;
+          r->zstream.next_in = r->zin_buf;
+        }
+
+      /* Inflate the (remaining) input data. */
+      r->zstream.avail_out = ZOUT_BUF_SIZE;
+      r->zstream.next_out = r->zout_buf;
+      error = inflate (&r->zstream, Z_SYNC_FLUSH);
+      r->zout_pos = 0;
+      r->zout_end = r->zstream.next_out - r->zout_buf;
+      if (r->zout_end == 0)
+        {
+          if (error == Z_STREAM_END)
+            {
+              close_zstream (r);
+              open_zstream (r);
+            }
+          else
+            sys_error (r, r->pos, _("ZLIB stream inconsistency (%s)."),
+                       r->zstream.msg);
+        }
+      else
+        {
+          /* Process the output data and ignore 'error' for now.  ZLIB will
+             present it to us again on the next inflate() call. */
+        }
+    }
+}
+
+static void
+read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
+{
+  if (r->compression == SFM_COMP_SIMPLE)
+    return read_bytes (r, buf, byte_cnt);
+  else if (!read_bytes_zlib (r, buf, byte_cnt))
+    sys_error (r, r->pos, _("Unexpected end of ZLIB compressed data."));
+}
+
+static bool
+try_read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
+{
+  if (r->compression == SFM_COMP_SIMPLE)
+    return try_read_bytes (r, buf, byte_cnt);
+  else
+    return read_bytes_zlib (r, buf, byte_cnt);
+}
+
+/* Reads a 64-bit floating-point number from R and returns its
+   value in host format. */
+static double
+read_compressed_float (struct sfm_reader *r)
+{
+  uint8_t number[8];
+  read_compressed_bytes (r, number, sizeof number);
+  return float_get_double (r->float_format, number);
+}
+\f
  static const struct casereader_class sys_file_casereader_class =
    {
      sys_file_casereader_read,
diff --git a/src/data/sys-file-reader.h b/src/data/sys-file-reader.h

index 037d33a394ce2a8397337aa87373672ba2d31223..011d541954841afebac874bc9820bfc4d3766529 100644 (file)
--- a/src/data/sys-file-reader.h
+++ b/src/data/sys-file-reader.h
@@ -21,6 +21,7 @@
  #include <stdio.h>
  
  #include "data/case.h"
+#include "data/sys-file.h"
  #include "libpspp/float-format.h"
  #include "libpspp/integer-format.h"
  
@@ -36,7 +37,7 @@ struct sfm_read_info
      char *creation_time;       /* "hh:mm:ss". */
      enum integer_format integer_format;
      enum float_format float_format;
-    bool compressed;           /* 0=no, 1=yes. */
+    enum sfm_compression compression;
      casenumber case_cnt;        /* -1 if unknown. */
      char *product;             /* Product name. */
      char *product_ext;          /* Extra product info. */
diff --git a/src/data/sys-file-writer.c b/src/data/sys-file-writer.c

index c78e04d55f24e41ef5ecce805664c896fc15c44e..8cfd577f1a6c656ea0ae70d5a87dcfdd0ecf9a38 100644 (file)
--- a/src/data/sys-file-writer.c
+++ b/src/data/sys-file-writer.c
@@ -25,6 +25,7 @@
  #include <stdlib.h>
  #include <sys/stat.h>
  #include <time.h>
+#include <zlib.h>
  
  #include "data/attributes.h"
  #include "data/case.h"
@@ -72,11 +73,11 @@ struct sfm_writer
      FILE *file;                        /* File stream. */
      struct replace_file *rf;    /* Ticket for replacing output file. */
  
-    bool compress;             /* 1=compressed, 0=not compressed. */
+    enum sfm_compression compression;
      casenumber case_cnt;       /* Number of cases written so far. */
      uint8_t space;              /* ' ' in the file's character encoding. */
  
-    /* Compression buffering.
+    /* Simple compression buffering.
  
         Compressed data is output as a series of 8-byte elements, with 1 to 9
         such elements clustered together.  The first element in a cluster is 8
@@ -89,6 +90,12 @@ struct sfm_writer
      int n_opcodes;              /* Number of opcodes in cbuf[0] so far. */
      int n_elements;             /* Number of elements in cbuf[] so far. */
  
+    /* ZLIB compression. */
+    z_stream zstream;           /* ZLIB deflater. */
+    off_t zstart;
+    struct zblock *blocks;
+    size_t n_blocks, allocated_blocks;
+
      /* Variables. */
      struct sfm_var *sfm_vars;   /* Variables. */
      size_t sfm_var_cnt;         /* Number of variables. */
@@ -96,6 +103,12 @@ struct sfm_writer
                                     for long string variables. */
    };
  
+/* Sizes of one ZLIB data block.  An entry is appended to the writer's
+   'blocks' array each time a deflate stream is finished, and the entries are
+   later written out as the block descriptors in the ZLIB trailer. */
+struct zblock
+  {
+    unsigned int uncompressed_size; /* Bytes given to the deflater (total_in). */
+    unsigned int compressed_size;   /* Bytes the deflater produced (total_out). */
+  };
  static const struct casewriter_class sys_file_casewriter_class;
  
  static void write_header (struct sfm_writer *, const struct dictionary *);
@@ -134,6 +147,7 @@ static void write_variable_attributes (struct sfm_writer *,
                                         const struct dictionary *);
  
  static void write_int (struct sfm_writer *, int32_t);
+static void write_int64 (struct sfm_writer *, int64_t);
  static inline void convert_double_to_output_format (double, uint8_t[8]);
  static void write_float (struct sfm_writer *, double);
  static void write_string (struct sfm_writer *, const char *, size_t);
@@ -156,6 +170,10 @@ static void put_cmp_opcode (struct sfm_writer *, uint8_t);
  static void put_cmp_number (struct sfm_writer *, double);
  static void put_cmp_string (struct sfm_writer *, const void *, size_t);
  
+static bool start_zstream (struct sfm_writer *);
+static void finish_zstream (struct sfm_writer *);
+static void write_ztrailer (struct sfm_writer *);
+
  static bool write_error (const struct sfm_writer *);
  static bool close_writer (struct sfm_writer *);
  
@@ -164,8 +182,10 @@ struct sfm_write_options
  sfm_writer_default_options (void)
  {
    struct sfm_write_options opts;
+  opts.compression = (settings_get_scompression ()
+                      ? SFM_COMP_SIMPLE
+                      : SFM_COMP_NONE);
    opts.create_writeable = true;
-  opts.compress = settings_get_scompression ();
    opts.version = 3;
    return opts;
  }
@@ -194,13 +214,20 @@ sfm_open_writer (struct file_handle *fh, struct dictionary *d,
      }
  
    /* Create and initialize writer. */
-  w = xmalloc (sizeof *w);
+  w = xzalloc (sizeof *w);
    w->fh = fh_ref (fh);
    w->lock = NULL;
    w->file = NULL;
    w->rf = NULL;
  
-  w->compress = opts.compress;
+  /* Use the requested compression, except that no EBCDIC-based ZLIB compressed
+     files have been observed, so drop back to simple compression for those
+     files. */
+  w->compression = opts.compression;
+  if (w->compression == SFM_COMP_ZLIB
+      && is_encoding_ebcdic_compatible (dict_get_encoding (d)))
+    w->compression = SFM_COMP_SIMPLE;
+
    w->case_cnt = 0;
  
    w->n_opcodes = w->n_elements = 0;
@@ -279,6 +306,20 @@ sfm_open_writer (struct file_handle *fh, struct dictionary *d,
    write_int (w, 999);
    write_int (w, 0);
  
+  if (w->compression == SFM_COMP_ZLIB)
+    {
+      w->zstream.zalloc = Z_NULL;
+      w->zstream.zfree = Z_NULL;
+      w->zstream.opaque = Z_NULL;
+      w->zstart = ftello (w->file);
+
+      write_int64 (w, w->zstart);
+      write_int64 (w, 0);
+      write_int64 (w, 0);
+
+      start_zstream (w);
+    }
+
    if (write_error (w))
      goto error;
  
@@ -336,6 +377,8 @@ write_header (struct sfm_writer *w, const struct dictionary *d)
    /* Record-type code. */
    if (is_encoding_ebcdic_compatible (dict_encoding))
      write_string (w, EBCDIC_MAGIC, 4);
+  else if (w->compression == SFM_COMP_ZLIB)
+    write_string (w, ASCII_ZMAGIC, 4);
    else
      write_string (w, ASCII_MAGIC, 4);
  
@@ -351,7 +394,9 @@ write_header (struct sfm_writer *w, const struct dictionary *d)
    write_int (w, calc_oct_idx (d, NULL));
  
    /* Compressed? */
-  write_int (w, w->compress);
+  write_int (w, (w->compression == SFM_COMP_NONE ? 0
+                 : w->compression == SFM_COMP_SIMPLE ? 1
+                 : 2));
  
    /* Weight variable. */
    weight = dict_get_weight (d);
@@ -1171,7 +1216,7 @@ sys_file_casewriter_write (struct casewriter *writer, void *w_,
  
    w->case_cnt++;
  
-  if (!w->compress)
+  if (w->compression == SFM_COMP_NONE)
      write_case_uncompressed (w, c);
    else
      write_case_compressed (w, c);
@@ -1210,6 +1255,11 @@ close_writer (struct sfm_writer *w)
      {
        /* Flush buffer. */
        flush_compressed (w);
+      if (w->compression == SFM_COMP_ZLIB)
+        {
+          finish_zstream (w);
+          write_ztrailer (w);
+        }
        fflush (w->file);
  
        ok = !write_error (w);
@@ -1234,6 +1284,8 @@ close_writer (struct sfm_writer *w)
          ok = false;
      }
  
+  free (w->blocks);
+
    fh_unlock (w->lock);
    fh_unref (w->fh);
  
@@ -1324,13 +1376,142 @@ write_case_compressed (struct sfm_writer *w, const struct ccase *c)
      }
  }
  
+/* Initializes W->zstream as a new deflate stream at compression level 1.
+   Returns true if successful.  On failure, reports an error and returns
+   false. */
+static bool
+start_zstream (struct sfm_writer *w)
+{
+  int error;
+
+  error = deflateInit (&w->zstream, 1);
+  if (error != Z_OK)
+    {
+      /* deflateInit() may fail without providing a message in zstream.msg,
+         in which case fall back to ZLIB's generic description of the error
+         code rather than handing a null pointer to %s. */
+      const char *reason = (w->zstream.msg != NULL
+                            ? w->zstream.msg
+                            : zError (error));
+
+      msg (ME, _("Failed to initialize ZLIB for compression (%s)."), reason);
+      return false;
+    }
+  return true;
+}
+
+/* Finishes the deflate stream in W->zstream: flushes all pending compressed
+   output to W's file, appends the stream's uncompressed and compressed sizes
+   to W->blocks, and releases the deflater's internal state.
+
+   A new stream must be set up with start_zstream() before any more data can
+   be compressed. */
+static void
+finish_zstream (struct sfm_writer *w)
+{
+  struct zblock *block;
+  int error;
+
+  assert (w->zstream.total_in <= ZBLOCK_SIZE);
+
+  /* No more input: just drain the deflater, one buffer at a time, until it
+     stops returning Z_OK. */
+  w->zstream.next_in = NULL;
+  w->zstream.avail_in = 0;
+  do
+    {
+      uint8_t buf[4096];
+
+      w->zstream.next_out = buf;
+      w->zstream.avail_out = sizeof buf;
+      error = deflate (&w->zstream, Z_FINISH);
+      write_bytes (w, buf, w->zstream.next_out - buf);
+    }
+  while (error == Z_OK);
+
+  if (error != Z_STREAM_END)
+    msg (ME, _("Failed to complete ZLIB stream compression (%s)."),
+         w->zstream.msg);
+
+  /* Record this block's sizes for the ZLIB trailer. */
+  if (w->n_blocks >= w->allocated_blocks)
+    w->blocks = x2nrealloc (w->blocks, &w->allocated_blocks,
+                            sizeof *w->blocks);
+  block = &w->blocks[w->n_blocks++];
+  block->uncompressed_size = w->zstream.total_in;
+  block->compressed_size = w->zstream.total_out;
+
+  /* Free the deflater's state.  Otherwise it would be leaked when
+     start_zstream() reinitializes the same z_stream for the next block, or
+     when the writer is closed. */
+  deflateEnd (&w->zstream);
+}
+
+/* Compresses the N bytes at DATA_ with W's deflater and writes whatever
+   compressed output becomes available to W's file.
+
+   At most ZBLOCK_SIZE bytes of input go into a single deflate stream: when
+   the current stream has reached that size, it is finished (which records a
+   block descriptor) and a new stream is started before continuing.
+
+   On a ZLIB failure an error is reported and the rest of the data is
+   dropped. */
+static void
+write_zlib (struct sfm_writer *w, const void *data_, unsigned int n)
+{
+  const uint8_t *data = data_;
+
+  while (n > 0)
+    {
+      unsigned int chunk;
+
+      if (w->zstream.total_in >= ZBLOCK_SIZE)
+        {
+          finish_zstream (w);
+
+          /* start_zstream() has already reported the problem if it fails.
+             Without a usable stream the deflate() call below could only
+             fail as well, so give up here. */
+          if (!start_zstream (w))
+            return;
+        }
+
+      /* Feed in only as much as still fits in the current block. */
+      chunk = MIN (n, ZBLOCK_SIZE - w->zstream.total_in);
+
+      w->zstream.next_in = CONST_CAST (uint8_t *, data);
+      w->zstream.avail_in = chunk;
+      do
+        {
+          uint8_t buf[4096];
+          int error;
+
+          w->zstream.next_out = buf;
+          w->zstream.avail_out = sizeof buf;
+          error = deflate (&w->zstream, Z_NO_FLUSH);
+          write_bytes (w, buf, w->zstream.next_out - buf);
+          if (error != Z_OK)
+            {
+              msg (ME, _("ZLIB stream compression failed (%s)."),
+                   w->zstream.msg);
+              return;
+            }
+        }
+      /* Keep going while input remains or the output buffer was filled
+         completely, since then more output may be pending. */
+      while (w->zstream.avail_in > 0 || w->zstream.avail_out == 0);
+      data += chunk;
+      n -= chunk;
+    }
+}
+
+/* Writes the ZLIB data trailer at the current position in W's file, then
+   seeks back into the ZLIB data header at W->zstart to fill in the two fields
+   that were written as zeros when the file was opened.
+
+   The trailer is a 24-byte fixed part (negated compression bias, a zero, the
+   block size, and the number of blocks) followed by one 24-byte descriptor
+   for each entry in W->blocks.
+
+   Reports an error if the seek fails. */
+static void
+write_ztrailer (struct sfm_writer *w)
+{
+  long long int uncompressed_ofs;
+  long long int compressed_ofs;
+  const struct zblock *block;
+
+  /* Fixed part of the trailer. */
+  write_int64 (w, -COMPRESSION_BIAS);
+  write_int64 (w, 0);
+  write_int (w, ZBLOCK_SIZE);
+  write_int (w, w->n_blocks);
+
+  /* Block descriptors.  Both running offsets start from W->zstart, the
+     position of the ZLIB data header; the compressed offset additionally
+     skips the header's three 64-bit fields (24 bytes). */
+  uncompressed_ofs = w->zstart;
+  compressed_ofs = w->zstart + 24;
+  for (block = w->blocks; block < &w->blocks[w->n_blocks]; block++)
+    {
+      write_int64 (w, uncompressed_ofs);
+      write_int64 (w, compressed_ofs);
+      write_int (w, block->uncompressed_size);
+      write_int (w, block->compressed_size);
+
+      uncompressed_ofs += block->uncompressed_size;
+      compressed_ofs += block->compressed_size;
+    }
+
+  /* Skip the header's first field and overwrite the second and third.
+     COMPRESSED_OFS now equals the header offset plus the header size plus
+     the sum of all compressed block sizes; it is written as the trailer
+     offset, followed by the trailer's length in bytes. */
+  if (!fseeko (w->file, w->zstart + 8, SEEK_SET))
+    {
+      write_int64 (w, compressed_ofs);
+      write_int64 (w, 24 + (w->n_blocks * 24));
+    }
+  else
+    msg (ME, _("%s: Seek failed (%s)."),
+         fh_get_file_name (w->fh), strerror (errno));
+}
+
  /* Flushes buffered compressed opcodes and data to W. */
  static void
  flush_compressed (struct sfm_writer *w)
  {
    if (w->n_opcodes)
      {
-      write_bytes (w, w->cbuf, 8 * (1 + w->n_elements));
+      unsigned int n = 8 * (1 + w->n_elements);
+      if (w->compression == SFM_COMP_SIMPLE)
+        write_bytes (w, w->cbuf, n);
+      else
+        write_zlib (w, w->cbuf, n);
+
        w->n_opcodes = w->n_elements = 0;
        memset (w->cbuf[0], 0, 8);
      }
@@ -1376,6 +1557,13 @@ write_int (struct sfm_writer *w, int32_t x)
    write_bytes (w, &x, sizeof x);
  }
  
+/* Passes the 8-byte in-memory representation of X to write_bytes() for
+   output to writer W's file. */
+static void
+write_int64 (struct sfm_writer *w, int64_t x)
+{
+  write_bytes (w, &x, sizeof (int64_t));
+}
+
  /* Converts NATIVE to the 64-bit format used in output files in
     OUTPUT. */
  static inline void
diff --git a/src/data/sys-file-writer.h b/src/data/sys-file-writer.h

index fdff49fe52a165d02488e51d67c0efd758e3047b..4f233f3197acefb6a58e7c3938d5c26e54b8f0df 100644 (file)
--- a/src/data/sys-file-writer.h
+++ b/src/data/sys-file-writer.h
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000, 2009 Free Software Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2009, 2013 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -18,14 +18,15 @@
  #define SFM_WRITE_H 1
  
  #include <stdbool.h>
+#include "sys-file.h"
  
  /* Writing system files. */
  
  /* Options for creating a system file. */
  struct sfm_write_options
    {
+    enum sfm_compression compression;
      bool create_writeable;      /* File perms: writeable or read/only? */
-    bool compress;              /* Compress file? */
      int version;                /* System file version (currently 2 or 3). */
    };
  
diff --git a/src/data/sys-file.h b/src/data/sys-file.h

new file mode 100644 (file)

index 0000000..7a582c0
--- /dev/null
+++ b/src/data/sys-file.h
@@ -0,0 +1,28 @@
+/* PSPP - a program for statistical analysis.
+   Copyright (C) 2013 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+
+#ifndef SYS_FILE_H
+#define SYS_FILE_H 1
+
+/* System file compression format.
+
+   The system file writer records these in the file header's compression
+   field as 0, 1, and 2, respectively. */
+enum sfm_compression
+  {
+    SFM_COMP_NONE,              /* No compression. */
+    SFM_COMP_SIMPLE,            /* Bytecode compression of integer values. */
+    SFM_COMP_ZLIB               /* ZLIB "deflate" compression. */
+  };
+
+#endif /* sys-file.h */
diff --git a/src/language/data-io/save.c b/src/language/data-io/save.c

index e01a8c941edd24f3989c11db9c9bb7588850e99e..7f1347db982a0292895e367cc0571aa3107c4c38 100644 (file)
--- a/src/language/data-io/save.c
+++ b/src/language/data-io/save.c
@@ -234,10 +234,13 @@ parse_write_command (struct lexer *lexer, struct dataset *ds,
          }
        else if (writer_type == SYSFILE_WRITER
                 && lex_match_id (lexer, "COMPRESSED"))
-       sysfile_opts.compress = true;
+       sysfile_opts.compression = SFM_COMP_SIMPLE;
        else if (writer_type == SYSFILE_WRITER
                 && lex_match_id (lexer, "UNCOMPRESSED"))
-       sysfile_opts.compress = false;
+       sysfile_opts.compression = SFM_COMP_NONE;
+      else if (writer_type == SYSFILE_WRITER
+               && lex_match_id (lexer, "ZCOMPRESSED"))
+       sysfile_opts.compression = SFM_COMP_ZLIB;
        else if (writer_type == SYSFILE_WRITER
                 && lex_match_id (lexer, "VERSION"))
         {
diff --git a/src/language/dictionary/sys-file-info.c b/src/language/dictionary/sys-file-info.c

index 3327a2c4ca0019066b421d6ec5e0d4aacf8f0f9a..c7f326f3be7f673fb7b5918785ca08093a24923f 100644 (file)
--- a/src/language/dictionary/sys-file-info.c
+++ b/src/language/dictionary/sys-file-info.c
@@ -150,10 +150,11 @@ cmd_sysfile_info (struct lexer *lexer, struct dataset *ds UNUSED)
                 ? var_get_name (weight_var) : _("Not weighted.")));
    }
  
-  tab_text (t, 0, r, TAB_LEFT, _("Mode:"));
+  tab_text (t, 0, r, TAB_LEFT, _("Compression:"));
    tab_text_format (t, 1, r++, TAB_LEFT,
-                   _("Compression %s."), info.compressed ? _("on") : _("off"));
-
+                   info.compression == SFM_COMP_NONE ? _("None")
+                   : info.compression == SFM_COMP_SIMPLE ? "SAV"
+                   : "ZSAV");
  
    tab_text (t, 0, r, TAB_LEFT, _("Charset:"));
    tab_text (t, 1, r++, TAB_LEFT, dict_get_encoding (d));
diff --git a/tests/data/sack.c b/tests/data/sack.c

index ace153cffdb891eddf57df159d4e57b0d38f1376..0326b1370d255b693d0a4321feeafbb4eec65776 100644 (file)
--- a/tests/data/sack.c
+++ b/tests/data/sack.c
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 2011 Free Software Foundation, Inc.
+   Copyright (C) 2011, 2013 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -31,6 +31,7 @@
  #include "libpspp/float-format.h"
  #include "libpspp/integer-format.h"
  
+#include "gl/c-ctype.h"
  #include "gl/error.h"
  #include "gl/md5.h"
  #include "gl/intprops.h"
@@ -58,12 +59,14 @@ enum token_type
      T_LPAREN,
      T_RPAREN,
      T_I8,
+    T_I64,
      T_S,
-    T_COUNT
+    T_COUNT,
+    T_HEX
    };
  
  static enum token_type token;
-static unsigned long int tok_integer;
+static unsigned long long int tok_integer;
  static double tok_float;
  static char *tok_string;
  static size_t tok_strlen, tok_allocated;
@@ -92,12 +95,19 @@ fatal (const char *message, ...)
  }
  
  static void
-add_char (int c)
+add_char__ (int c)
  {
    if (tok_strlen >= tok_allocated)
      tok_string = x2realloc (tok_string, &tok_allocated);
  
-  tok_string[tok_strlen++] = c;
+  tok_string[tok_strlen] = c;
+}
+
+/* Appends C to the token string and counts it in tok_strlen.  (add_char__()
+   by itself stores C without advancing tok_strlen.) */
+static void
+add_char (int c)
+{
+  add_char__ (c);
+  tok_strlen++;
+}
  
  static void
@@ -135,14 +145,14 @@ get_token (void)
            c = getc (input);
          }
        while (isdigit (c) || isalpha (c) || c == '.');
-      add_char ('\0');
+      add_char__ ('\0');
        ungetc (c, input);
  
        errno = 0;
        if (strchr (tok_string, '.') == NULL)
          {
            token = T_INTEGER;
-          tok_integer = strtoul (tok_string, &tail, 0);
+          tok_integer = strtoull (tok_string, &tail, 0);
          }
        else
          {
@@ -161,6 +171,7 @@ get_token (void)
              fatal ("new-line inside string");
            add_char (c);
          }
+      add_char__ ('\0');
      }
    else if (c == ';')
      token = T_SEMICOLON;
@@ -183,6 +194,8 @@ get_token (void)
  
        if (!strcmp (tok_string, "i8"))
          token = T_I8;
+      else if (!strcmp (tok_string, "i64"))
+        token = T_I64;
        else if (tok_string[0] == 's')
          {
            token = T_S;
@@ -210,6 +223,8 @@ get_token (void)
          }
        else if (!strcmp (tok_string, "COUNT"))
          token = T_COUNT;
+      else if (!strcmp (tok_string, "hex"))
+        token = T_HEX;
        else
          fatal ("invalid token `%s'", tok_string);
      }
@@ -235,6 +250,17 @@ buffer_put_uninit (struct buffer *buffer, size_t n)
    return &buffer->data[buffer->size - n];
  }
  
+/* Maps hex digit C, in either case, to its numeric value by locating its
+   lowercase form in a table of digits.  Asserts that C is found there. */
+static int
+hexit_value (int c)
+{
+  static const char hexits[] = "0123456789abcdef";
+  int lower = c_tolower ((unsigned char) c);
+  const char *match = strchr (hexits, lower);
+
+  assert (match != NULL);
+  return match - hexits;
+}
+
  static void
  usage (void)
  {
@@ -265,6 +291,9 @@ stdout.  A data item is one of the following\n\
  \n\
    - The literal \"i8\" followed by an integer.  Output as a single\n\
      byte with the specified value.\n\
+\n\
+  - The literal \"i64\" followed by an integer.  Output as a 64-bit\n\
+    binary integer.\n\
  \n\
    - One of the literals SYSMIS, LOWEST, or HIGHEST.  Output as a\n\
      64-bit IEEE 754 float of the appropriate PSPP value.\n\
@@ -378,6 +407,19 @@ parse_data_item (struct buffer *output)
          }
        while (token == T_INTEGER);
      }
+  else if (token == T_I64)
+    {
+      get_token ();
+      do
+        {
+          if (token != T_INTEGER)
+            fatal ("integer expected after `i64'");
+          integer_put (tok_integer, integer_format,
+                       buffer_put_uninit (output, 8), 8);
+          get_token ();
+        }
+      while (token == T_INTEGER);
+    }
    else if (token == T_STRING)
      {
        buffer_put (output, tok_string, tok_strlen);
@@ -426,6 +468,33 @@ parse_data_item (struct buffer *output)
        integer_put (output->size - old_size - 4, integer_format,
                     output->data + old_size, 4);
      }
+  else if (token == T_HEX)
+    {
+      const char *p;
+
+      get_token ();
+
+      if (token != T_STRING)
+        fatal ("string expected");
+
+      for (p = tok_string; *p; p++)
+        {
+          if (isspace ((unsigned char) *p))
+            continue;
+          else if (isxdigit ((unsigned char) p[0])
+                   && isxdigit ((unsigned char) p[1]))
+            {
+              int high = hexit_value (p[0]);
+              int low = hexit_value (p[1]);
+              uint8_t byte = high * 16 + low;
+              buffer_put (output, &byte, 1);
+              p++;
+            }
+          else
+            fatal ("invalid format in hex string");
+        }
+      get_token ();
+    }
    else
      fatal ("syntax error");
  
diff --git a/tests/data/sys-file-reader.at b/tests/data/sys-file-reader.at

index 2706228eb62987edd3b1e517741e673003181aaa..4b7a2b541fc306e0b903670085932916b859bbd4 100644 (file)
--- a/tests/data/sys-file-reader.at
+++ b/tests/data/sys-file-reader.at
@@ -1297,7 +1297,7 @@ dnl File header.
  "$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file";
  2; dnl Layout code
  6; dnl Nominal case size
-1; dnl Not compressed
+1; dnl Simple compression
  0; dnl Not weighted 
  -1; dnl Unspecified number of cases.
  100.0; dnl Bias.
@@ -1361,7 +1361,7 @@ dnl File header.
  "$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file";
  2; dnl Layout code
  6; dnl Nominal case size
-1; dnl Not compressed
+1; dnl Simple compression.
  0; dnl Not weighted 
  -1; dnl Unspecified number of cases.
  0.0; dnl Bias.
@@ -1425,7 +1425,7 @@ dnl File header.
  "$FL2"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file";
  2; dnl Layout code
  6; dnl Nominal case size
-1; dnl Not compressed
+1; dnl Simple compression.
  0; dnl Not weighted 
  -1; dnl Unspecified number of cases.
  50.0; dnl Bias.
@@ -1485,6 +1485,105 @@ num1,num2,str4,str8,str15
  ])
  done
  AT_CLEANUP
+
+m4_divert_push([PREPARE_TESTS])
+zcompressed_sack () {
+    cat <<'EOF'
+dnl File header.
+"$FL3"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file";
+2; dnl Layout code
+6; dnl Nominal case size
+2; dnl zlib compressed
+0; dnl Not weighted
+-1; dnl Unspecified number of cases.
+100.0; dnl Bias.
+"01 Jan 11"; "20:53:52"; s64 "PSPP synthetic test file";
+i8 0 *3;
+
+dnl Numeric variables.
+2; 0; 0; 0; 0x050800 *2; s8 "NUM1";
+2; 0; 0; 0; 0x050800 *2; s8 "NUM2";
+
+dnl String variable.
+2; 4; 0; 0; 0x010400 *2; s8 "STR4";
+2; 8; 0; 0; 0x010800 *2; s8 "STR8";
+2; 15; 0; 0; 0x010f00 *2; s8 "STR15";
+2; -1; 0; 0; 0; 0; s8 "";
+
+dnl Dictionary termination record.
+999; 0;
+
+dnl ZLIB data header.
+i64 0x178;    # zheader_ofs
+i64 0x1e9;    # ztrailer_ofs
+i64 48;       # ztrailer_len
+
+dnl ZLIB data block.
+dnl
+dnl This is the compressed form of:
+dnl
+dnl 01 64 fe fd fe fd ff fb  61 62 63 64 65 66 67 68  |.d......abcdefgh|
+dnl 30 31 32 33 20 20 20 20  fd fd fd fe 65 66 fd fd  |0123    ....ef..|
+dnl 6a 6b 6c 6d 20 20 20 20  6e 6f 70 71 72 73 74 75  |jklm    nopqrstu|
+dnl 76 77 78 79 7a 41 42 43  44 45 46 47 20 20 20 20  |vwxyzABCDEFG    |
+dnl 48 49 4a 4b 4c 4d 4e 4f  fe fd fc 00 00 00 00 00  |HIJKLMNO........|
+dnl 50 51 52 53 54 55 56 57                           |PQRSTUVW|
+dnl
+dnl which is the data from the "compressed data" test.
+hex "78 01 63 4c f9 f7 f7 df  df ff bf 13 93 92 53 52";
+hex "d3 d2 33 0c 0c 8d 8c 15  80 e0 ef df bf ff 52 d3";
+hex "fe fe cd ca ce c9 05 f1  f3 f2 0b 0a 8b 8a 4b 4a";
+hex "cb ca 2b 2a ab 1c 9d 9c  5d 5c dd dc 41 e2 1e 9e";
+hex "5e de 3e be 7e fe ff fe  fe 61 00 81 80 c0 a0 e0";
+hex "90 d0 b0 70 00 0f 3f 23  d7";
+
+dnl ZLIB data trailer fixed header:
+i64 -100;     # ztrailer_bias
+i64 0;        # ztrailer_zero
+0x3ff000;     # block_size
+1;            # n_blocks
+
+dnl ZLIB block descriptor:
+i64 0x178;    # uncompressed_ofs
+i64 0x190;    # compressed_ofs
+88;           # uncompressed_size
+89;           # compressed_size
+EOF
+}
+m4_divert_pop([PREPARE_TESTS])
+
+AT_SETUP([zcompressed data])
+AT_KEYWORDS([sack synthetic system file positive zlib])
+zcompressed_sack > sys-file.sack
+for variant in \
+       "be 2d706c3ca0cc9be7f1721f09d0d42179" \
+       "le 3f362f338d65b0a836b3c752cc3fc5bc"
+do
+  set $variant
+  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2]
+])
+  AT_DATA([sys-file.sps], [dnl
+GET FILE='sys-file.sav'.
+DISPLAY DICTIONARY.
+LIST.
+])
+  AT_CHECK([pspp -o pspp.csv sys-file.sps])
+  AT_CHECK([grep -v Measure pspp.csv | grep -v Display], [0], [dnl
+Variable,Description,,Position
+num1,Format: F8.0,,1
+num2,Format: F8.0,,2
+str4,Format: A4,,3
+str8,Format: A8,,4
+str15,Format: A15,,5
+
+Table: Data List
+num1,num2,str4,str8,str15
+-99,0,,abcdefgh,0123   @&t@
+.,151,jklm,nopqrstu,vwxyzABC       @&t@
+1,2,DEFG,HIJKLMNO,PQRSTUV
+])
+done
+AT_CLEANUP
  \f
  AT_BANNER([system file reader - negative])
  
@@ -3553,3 +3652,296 @@ num1,num2,str4,str8,str15
  done
  AT_CLEANUP
  
+AT_SETUP([zcompressed data - bad zheader_ofs])
+AT_KEYWORDS([sack synthetic system file negative zlib])
+zcompressed_sack | sed 's/.*zheader_ofs.*/>>i64 0<<;/' > sys-file.sack
+for variant in \
+       "be 6d5c32f34fa1bed6f9b8f7045d104fdc" \
+       "le 1f67fbda4f0021143e141fe8403c5a97"
+do
+  set $variant
+  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2]
+])
+  AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
+])
+  AT_CHECK([pspp -o pspp.csv sys-file.sps], [1], [error: `sys-file.sav' near offset 0x178: Wrong ZLIB data header offset 0 (expected 0x178).
+])
+done
+AT_CLEANUP
+
+AT_SETUP([zcompressed data - bad ztrailer_ofs])
+AT_KEYWORDS([sack synthetic system file negative zlib])
+zcompressed_sack | sed 's/.*ztrailer_ofs.*/>>i64 0<<;/' > sys-file.sack
+for variant in \
+       "be e2c8dec0c62d3d798825ad5906370634" \
+       "le c1cff4cdddeee80bf1580cbc26fa9fd5"
+do
+  set $variant
+  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2]
+])
+  AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
+])
+  AT_CHECK([pspp -o pspp.csv sys-file.sps], [1], [error: `sys-file.sav' near offset 0x178: Impossible ZLIB trailer offset 0x0.
+])
+done
+AT_CLEANUP
+
+# ztrailer_len must be a multiple of 24 and at least 48,
+# so a value of 12 is impossible.
+AT_SETUP([zcompressed data - invalid ztrailer_len])
+AT_KEYWORDS([sack synthetic system file negative zlib])
+zcompressed_sack | sed 's/.*ztrailer_len.*/>>i64 12<<;/' > sys-file.sack
+for variant in \
+       "be 27f5203463bc4c7644382f24ae87f84c" \
+       "le 0035fa6ee7690720429715150ede85f4"
+do
+  set $variant
+  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2]
+])
+  AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
+])
+  AT_CHECK([pspp -o pspp.csv sys-file.sps], [1], [error: `sys-file.sav' near offset 0x178: Invalid ZLIB trailer length 12.
+])
+done
+AT_CLEANUP
+
+# ztrailer_ofs + ztrailer_len must be the file size.
+AT_SETUP([zcompressed data - wrong ztrailer_len])
+AT_KEYWORDS([sack synthetic system file negative zlib])
+zcompressed_sack | sed 's/.*ztrailer_len.*/>>i64 72<<;/' > sys-file.sack
+for variant in \
+       "be 2ba9ae97bc0a7f5dcfe36e2463b9d7cb" \
+       "le d737ea0a53ca5c6f20be359027171d73"
+do
+  set $variant
+  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2]
+])
+  AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
+])
+  AT_CHECK([pspp -o pspp.csv sys-file.sps], [1], [warning: `sys-file.sav' near offset 0x190: End of ZLIB trailer (0x231) is not file size (0x219).
+error: `sys-file.sav' near offset 0x201: 72-byte ZLIB trailer specifies 1 data blocks (expected 2).
+])
+done
+AT_CLEANUP
+
+AT_SETUP([zcompressed data - wrong ztrailer_bias])
+AT_KEYWORDS([sack synthetic system file negative zlib])
+zcompressed_sack | sed 's/.*ztrailer_bias.*/>>i64 0<<;/' > sys-file.sack
+for variant in \
+       "be a5b56ab5e799a3626de2cdd7bd8d7a03" \
+       "le d7cd584c6d5a95df10ba640eb3f1f24f"
+do
+  set $variant
+  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2]
+])
+  AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
+])
+  AT_CHECK([pspp -o pspp.csv sys-file.sps], [1], [error: `sys-file.sav' near offset 0x1f1: ZLIB trailer bias (0) differs from file header bias (100.00).
+])
+done
+AT_CLEANUP
+
+AT_SETUP([zcompressed data - wrong ztrailer_zero])
+AT_KEYWORDS([sack synthetic system file negative zlib])
+zcompressed_sack | sed 's/.*ztrailer_zero.*/>>i64 100<<;/' > sys-file.sack
+for variant in \
+       "be 8d746abedb3e74cfdc22207f3455db92" \
+       "le 79cea017365cab35d59c7a300cfa66c1"
+do
+  set $variant
+  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2]
+])
+  AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
+])
+  AT_CHECK([pspp -o pspp.csv sys-file.sps], [0], [warning: `sys-file.sav' near offset 0x1f9: ZLIB trailer "zero" field has nonzero value 100.
+])
+done
+AT_CLEANUP
+
+AT_SETUP([zcompressed data - wrong block_size])
+AT_KEYWORDS([sack synthetic system file negative zlib])
+zcompressed_sack | sed 's/.*block_size.*/>>0x1000<<;/' > sys-file.sack
+for variant in \
+       "be 8d5a1caa56be8892d453faf1047005ca" \
+       "le 7daa1bd57b192893b313a351202e179b"
+do
+  set $variant
+  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2]
+])
+  AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
+])
+  AT_CHECK([pspp -o pspp.csv sys-file.sps], [0], [warning: `sys-file.sav' near offset 0x1fd: ZLIB trailer specifies unexpected 4096-byte block size.
+])
+done
+AT_CLEANUP
+
+AT_SETUP([zcompressed data - wrong n_blocks])
+AT_KEYWORDS([sack synthetic system file negative zlib])
+zcompressed_sack | sed 's/.*n_blocks.*/>>2<<;/' > sys-file.sack
+for variant in \
+       "be cd29596fd6bf4a2f651febe820a7955f" \
+       "le 8fc1f718dfd2abac7c3442c1055d4cab"
+do
+  set $variant
+  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2]
+])
+  AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
+])
+  AT_CHECK([pspp -o pspp.csv sys-file.sps], [1], [error: `sys-file.sav' near offset 0x201: 48-byte ZLIB trailer specifies 2 data blocks (expected 1).
+])
+done
+AT_CLEANUP
+
+AT_SETUP([zcompressed data - wrong uncompressed_ofs])
+AT_KEYWORDS([sack synthetic system file negative zlib])
+zcompressed_sack | sed 's/.*uncompressed_ofs.*/i64 >>0x177<<;/' > sys-file.sack
+for variant in \
+       "be 5546120fe6161dc6ed20aec48d8e74a4" \
+       "le 86fafd625ed5ceaa1bff4fc7f500b6ab"
+do
+  set $variant
+  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2]
+])
+  AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
+])
+  AT_CHECK([pspp -o pspp.csv sys-file.sps], [1], [error: `sys-file.sav' near offset 0x201: ZLIB block descriptor 0 reported uncompressed data offset 0x177, when 0x178 was expected.
+])
+done
+AT_CLEANUP
+
+AT_SETUP([zcompressed data - wrong compressed_ofs])
+AT_KEYWORDS([sack synthetic system file negative zlib])
+zcompressed_sack | sed 's/.*\bcompressed_ofs.*/i64 >>0x191<<;/' > sys-file.sack
+for variant in \
+       "be 652e28f8d3f8e4ce47ad18d0f30e7bb9" \
+       "le ebf2c647f2d7c47858d4f5ed683526e6"
+do
+  set $variant
+  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2]
+])
+  AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
+])
+  AT_CHECK([pspp -o pspp.csv sys-file.sps], [1], [error: `sys-file.sav' near offset 0x201: ZLIB block descriptor 0 reported compressed data offset 0x191, when 0x190 was expected.
+])
+done
+AT_CLEANUP
+
+AT_SETUP([zcompressed data - compressed sizes don't add up])
+AT_KEYWORDS([sack synthetic system file negative zlib])
+AT_DATA([sys-file.sack], [dnl
+dnl File header.
+"$FL3"; s60 "$(#) SPSS DATA FILE PSPP synthetic test file";
+2; dnl Layout code
+6; dnl Nominal case size
+2; dnl zlib compressed
+0; dnl Not weighted
+-1; dnl Unspecified number of cases.
+100.0; dnl Bias.
+"01 Jan 11"; "20:53:52"; s64 "PSPP synthetic test file";
+i8 0 *3;
+
+dnl Numeric variables.
+2; 0; 0; 0; 0x050800 *2; s8 "NUM1";
+2; 0; 0; 0; 0x050800 *2; s8 "NUM2";
+
+dnl String variable.
+2; 4; 0; 0; 0x010400 *2; s8 "STR4";
+2; 8; 0; 0; 0x010800 *2; s8 "STR8";
+2; 15; 0; 0; 0x010f00 *2; s8 "STR15";
+2; -1; 0; 0; 0; 0; s8 "";
+
+dnl Dictionary termination record.
+999; 0;
+
+dnl ZLIB data header.
+i64 0x178;    # zheader_ofs
+i64 0x190;    # ztrailer_ofs
+i64 72;       # ztrailer_len
+
+dnl This is where the ZLIB data blocks would go, but we don't need any to
+dnl provoke this message so we omit them.
+
+dnl ZLIB data trailer fixed header:
+i64 -100;     # ztrailer_bias
+i64 0;        # ztrailer_zero
+0x3ff000;     # block_size
+2;            # n_blocks
+
+dnl ZLIB block descriptor 1:
+i64 0x178;    # uncompressed_ofs
+i64 0x190;    # compressed_ofs
+0x100000;     # uncompressed_size
+0x12345;      # compressed_size
+
+dnl ZLIB block descriptor 2:
+i64 0x100178; # uncompressed_ofs
+i64 0x12405;  # compressed_ofs
+0x100000;     # uncompressed_size
+0x12345;      # compressed_size
+])
+for variant in \
+       "be 72ebf57bffa340afe16ed79959faac09" \
+       "le 80b34e98f6b181dcc2e8ca4ba13f768d"
+do
+  set $variant
+  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2]
+])
+  AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
+])
+  AT_CHECK([pspp -o pspp.csv sys-file.sps], [1], [warning: `sys-file.sav' near offset 0x1a8: ZLIB block descriptor 0 reported block size 0x100000, when 0x3ff000 was expected.
+error: `sys-file.sav' near offset 0x1c0: ZLIB block descriptor 1 reported compressed data offset 0x12405, when 0x124d5 was expected.
+])
+done
+AT_CLEANUP
+
+AT_SETUP([zcompressed data - uncompressed_size > block_size])
+AT_KEYWORDS([sack synthetic system file negative zlib])
+zcompressed_sack | sed 's/.*uncompressed_size.*/>>0x400000<<;/' > sys-file.sack
+for variant in \
+       "be 9bb74ef407fe0b79e43c388eedc28212" \
+       "le 6f145fb5f820c513f50b6f81310cdad5"
+do
+  set $variant
+  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2]
+])
+  AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
+])
+  AT_CHECK([pspp -o pspp.csv sys-file.sps], [0], [warning: `sys-file.sav' near offset 0x201: ZLIB block descriptor 0 reported block size 0x400000, when at most 0x3ff000 was expected.
+])
+done
+AT_CLEANUP
+
+AT_SETUP([zcompressed data - compression expands data too much])
+AT_KEYWORDS([sack synthetic system file negative zlib])
+zcompressed_sack | sed 's/.*uncompressed_size.*/>>50<<;/
+s/.*\bcompressed_size.*/>>100<<;/' > sys-file.sack
+for variant in \
+       "be e11cadde5f0855c965a1cb388dedc36e" \
+       "le 37953e71462b6554c5644fec8b539164"
+do
+  set $variant
+  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2]
+])
+  AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
+])
+  AT_CHECK([pspp -o pspp.csv sys-file.sps], [1], [error: `sys-file.sav' near offset 0x201: ZLIB block descriptor 0 reports compressed size 100 and uncompressed size 50.
+])
+done
+AT_CLEANUP
+
+AT_SETUP([zcompressed data - compressed sizes don't add up])
+AT_KEYWORDS([sack synthetic system file negative zlib])
+zcompressed_sack | sed 's/.*\bcompressed_size.*/>>88<<;/' > sys-file.sack
+for variant in \
+       "be 366eaf85be1f26fb6549e2f8ee393628" \
+       "le a756e5125e6a908cb4990f66cc419bef"
+do
+  set $variant
+  AT_CHECK_UNQUOTED([sack --$[1] sys-file.sack > sys-file.sav], [0], [], [$[2]
+])
+  AT_DATA([sys-file.sps], [GET FILE='sys-file.sav'.
+])
+  AT_CHECK([pspp -o pspp.csv sys-file.sps], [1], [error: `sys-file.sav' near offset 0x219: ZLIB trailer is at offset 0x1e9 but 0x1e8 would be expected from block descriptors.
+])
+done
+AT_CLEANUP
diff --git a/tests/data/sys-file.at b/tests/data/sys-file.at

index cd7fbb4f50a168897a0b84dc8328547e1a575738..9b8986d589c5eb17bac3837aee9aa2519f293903 100644 (file)
--- a/tests/data/sys-file.at
+++ b/tests/data/sys-file.at
@@ -3,7 +3,10 @@ AT_BANNER([system files])
  # Also tests that long variable names are preserved by SAVE and GET.
  AT_SETUP([write and read numeric data])
  AT_KEYWORDS([SAVE GET system file])
-AT_DATA([sysfile.sps], [dnl
+for variant in 'UNCOMPRESSED $FL2' 'ZCOMPRESSED $FL3'; do
+    set $variant
+    compression=$1 magic=$2
+    cat >sysfile.sps <<EOF
  DATA LIST LIST NOTABLE / variable001 * variable002 * variable003 * variable004 * .
  BEGIN DATA.
      1.00     1.00    1.0     2.00
@@ -16,14 +19,14 @@ BEGIN DATA.
      2.00     2.00    2.0     3.00
  END DATA.
  
-SAVE /OUTFILE='foo.sav'.
+SAVE/$compression /OUTFILE='foo.sav'.
  
  GET /FILE='foo.sav'.
  
  LIST.
-])
-AT_CHECK([pspp -o pspp.csv sysfile.sps])
-AT_CHECK([cat pspp.csv], [0], [dnl
+EOF
+    AT_CHECK([pspp -o pspp.csv sysfile.sps])
+    AT_CHECK([cat pspp.csv], [0], [dnl
  Table: Data List
  variable001,variable002,variable003,variable004
  1.00,1.00,1.00,2.00
@@ -35,10 +38,17 @@ variable001,variable002,variable003,variable004
  2.00,2.00,1.00,1.00
  2.00,2.00,2.00,3.00
  ])
+    AT_CHECK_UNQUOTED([dd if=foo.sav bs=1 count=4; echo], [0], [$magic
+], [ignore])
+done
  AT_CLEANUP
  
  AT_SETUP([write and read long string value labels and missing values])
-AT_DATA([sysfile.sps], [dnl
+AT_KEYWORDS([SAVE GET system file])
+for variant in 'UNCOMPRESSED $FL2' 'ZCOMPRESSED $FL3'; do
+    set $variant
+    compression=$1 magic=$2
+    cat >sysfile.sps <<EOF
  DATA LIST LIST NOTABLE/s1 s2 s3 (a9).
  BEGIN DATA
  a b c
@@ -57,12 +67,12 @@ MISSING VALUES
      /s2 ('12' '123')
      /s3 ('1234' '12345' '12345678').
  
-SAVE /OUTFILE='foo.sav'.
+SAVE/$compression /OUTFILE='foo.sav'.
  GET /FILE='foo.sav'.
  DISPLAY DICTIONARY.
-])
-AT_CHECK([pspp -o pspp.csv sysfile.sps])
-AT_CHECK([cat pspp.csv], [0], [dnl
+EOF
+    AT_CHECK([pspp -o pspp.csv sysfile.sps])
+    AT_CHECK([cat pspp.csv], [0], [dnl
  Variable,Description,,Position
  s1,Format: A9,,1
  ,Measure: Nominal,,
@@ -86,6 +96,9 @@ s3,Format: A9,,3
  ,Display Width: 9,,
  ,"Missing Values: ""1234    ""; ""12345   ""; ""12345678""",,
  ])
+    AT_CHECK_UNQUOTED([dd if=foo.sav bs=1 count=4; echo], [0], [$magic
+], [ignore])
+done
  AT_CLEANUP
  
  AT_SETUP([write and read compressed files])
diff --git a/tests/language/dictionary/sys-file-info.at b/tests/language/dictionary/sys-file-info.at

index 923c3a29c7c34c685002dbb08e9c6fdba7a174c5..6a5e4adfef1413b53674fda4abc7a64b1ae0ce6c 100644 (file)
--- a/tests/language/dictionary/sys-file-info.at
+++ b/tests/language/dictionary/sys-file-info.at
@@ -31,7 +31,7 @@ Variables:,2
  Cases:,3
  Type:,System File
  Weight:,Not weighted.
-Mode:,Compression on.
+Compression:,SAV
  
  Variable,Description,,Position
  x,Format: F8.2,,1
diff --git a/utilities/pspp-dump-sav.c b/utilities/pspp-dump-sav.c

index c6b5823660a140d3575d7d7559b47c87bf899f30..ceabd1d57d4b3df973bfdf6129addc1777e1e675 100644 (file)
--- a/utilities/pspp-dump-sav.c
+++ b/utilities/pspp-dump-sav.c
@@ -39,6 +39,13 @@
  
  #define ID_MAX_LEN 64
  
+enum compression
+  {
+    COMP_NONE,                  /* No compression (header value 0). */
+    COMP_SIMPLE,                /* Simple compression (header value 1). */
+    COMP_ZLIB                   /* ZLIB compression (header value 2). */
+  };
+
  struct sfm_reader
    {
      const char *file_name;
@@ -52,7 +59,7 @@ struct sfm_reader
      enum integer_format integer_format;
      enum float_format float_format;
  
-    bool compressed;
+    enum compression compression;
      double bias;
    };
  
@@ -87,7 +94,8 @@ static void read_long_string_missing_values (struct sfm_reader *r,
                                               size_t size, size_t count);
  static void read_unknown_extension (struct sfm_reader *,
                                      size_t size, size_t count);
-static void read_compressed_data (struct sfm_reader *, int max_cases);
+static void read_simple_compressed_data (struct sfm_reader *, int max_cases);
+static void read_zlib_compressed_data (struct sfm_reader *);
  
  static struct text_record *open_text_record (
    struct sfm_reader *, size_t size);
@@ -180,7 +188,7 @@ main (int argc, char *argv[])
        r.n_var_widths = 0;
        r.allocated_var_widths = 0;
        r.var_widths = 0;
-      r.compressed = false;
+      r.compression = COMP_NONE;
  
        if (argc - optind > 1)
          printf ("Reading \"%s\":\n", r.file_name);
@@ -218,8 +226,13 @@ main (int argc, char *argv[])
                (long long int) ftello (r.file),
                (long long int) ftello (r.file) + 4);
  
-      if (r.compressed && max_cases > 0)
-        read_compressed_data (&r, max_cases);
+      if (r.compression == COMP_SIMPLE)
+        {
+          if (max_cases > 0)
+            read_simple_compressed_data (&r, max_cases);
+        }
+      else if (r.compression == COMP_ZLIB)
+        read_zlib_compressed_data (&r);
  
        fclose (r.file);
      }
@@ -241,11 +254,16 @@ read_header (struct sfm_reader *r)
    char creation_date[10];
    char creation_time[9];
    char file_label[65];
+  bool zmagic;
  
    read_string (r, rec_type, sizeof rec_type);
    read_string (r, eye_catcher, sizeof eye_catcher);
  
-  if (strcmp ("$FL2", rec_type) != 0)
+  if (!strcmp ("$FL2", rec_type))
+    zmagic = false;
+  else if (!strcmp ("$FL3", rec_type))
+    zmagic = true;
+  else
      sys_error (r, "This is not an SPSS system file.");
  
    /* Identify integer format. */
@@ -265,7 +283,24 @@ read_header (struct sfm_reader *r)
    weight_index = read_int (r);
    ncases = read_int (r);
  
-  r->compressed = compressed != 0;
+  if (!zmagic)
+    {
+      if (compressed == 0)
+        r->compression = COMP_NONE;
+      else if (compressed == 1)
+        r->compression = COMP_SIMPLE;
+      else if (compressed != 0)
+        sys_error (r, "SAV file header has invalid compression value "
+                   "%"PRId32".", compressed);
+    }
+  else
+    {
+      if (compressed == 2)
+        r->compression = COMP_ZLIB;
+      else
+        sys_error (r, "ZSAV file header has invalid compression value "
+                   "%"PRId32".", compressed);
+    }
  
    /* Identify floating-point format and obtain compression bias. */
    read_bytes (r, raw_bias, sizeof raw_bias);
@@ -289,7 +324,12 @@ read_header (struct sfm_reader *r)
    printf ("File header record:\n");
    printf ("\t%17s: %s\n", "Product name", eye_catcher);
    printf ("\t%17s: %"PRId32"\n", "Layout code", layout_code);
-  printf ("\t%17s: %"PRId32"\n", "Compressed", compressed);
+  printf ("\t%17s: %"PRId32" (%s)\n", "Compressed",
+          compressed,
+          r->compression == COMP_NONE ? "no compression"
+          : r->compression == COMP_SIMPLE ? "simple compression"
+          : r->compression == COMP_ZLIB ? "ZLIB compression"
+          : "<error>");
    printf ("\t%17s: %"PRId32"\n", "Weight index", weight_index);
    printf ("\t%17s: %"PRId32"\n", "Number of cases", ncases);
    printf ("\t%17s: %g\n", "Compression bias", r->bias);
@@ -1170,7 +1210,7 @@ read_variable_attributes (struct sfm_reader *r, size_t size, size_t count)
  }
  
  static void
-read_compressed_data (struct sfm_reader *r, int max_cases)
+read_simple_compressed_data (struct sfm_reader *r, int max_cases)
  {
    enum { N_OPCODES = 8 };
    uint8_t opcodes[N_OPCODES];
@@ -1258,6 +1298,87 @@ read_compressed_data (struct sfm_reader *r, int max_cases)
          }
      }
  }
+
+/* Dumps the ZLIB data header, the ZLIB trailer, and each block descriptor
+   in the trailer, flagging fields that differ from their expected values.
+   The compressed data itself is skipped over, not decompressed. */
+static void
+read_zlib_compressed_data (struct sfm_reader *r)
+{
+  long long int ofs, this_ofs, next_ofs, next_len, bias, zero;
+  long long int expected_uncmp_ofs, expected_cmp_ofs;
+  unsigned int block_size, n_blocks, i;
+
+  read_int (r);                 /* Value is read only to be skipped. */
+  ofs = ftello (r->file);
+  printf ("\n%08llx: ZLIB compressed data header:\n", ofs);
+
+  this_ofs = read_int64 (r);
+  next_ofs = read_int64 (r);
+  next_len = read_int64 (r);
+
+  printf ("\tzheader_ofs: 0x%llx\n", this_ofs);
+  if (this_ofs != ofs)
+    printf ("\t\t(Expected 0x%llx.)\n", ofs);
+  printf ("\tztrailer_ofs: 0x%llx\n", next_ofs);
+  printf ("\tztrailer_len: %lld\n", next_len);
+  if (next_len < 24 || next_len % 24)
+    printf ("\t\t(Trailer length is not a positive multiple of 24.)\n");
+
+  printf ("\n%08llx: 0x%llx bytes of ZLIB compressed data\n",
+          ofs + 8 * 3, next_ofs - (ofs + 8 * 3));
+  /* Skip the compressed data itself to reach the trailer. */
+  skip_bytes (r, next_ofs - (ofs + 8 * 3));
+
+  printf ("\n%08llx: ZLIB trailer fixed header:\n", next_ofs);
+  bias = read_int64 (r);
+  zero = read_int64 (r);
+  block_size = read_int (r);
+  n_blocks = read_int (r);
+  printf ("\tbias: %lld\n", bias);
+  printf ("\tzero: 0x%llx\n", zero);
+  if (zero != 0)
+    printf ("\t\t(Expected 0.)\n");
+  printf ("\tblock_size: 0x%x\n", block_size);
+  if (block_size != 0x3ff000)
+    printf ("\t\t(Expected 0x3ff000.)\n");
+  printf ("\tn_blocks: %u\n", n_blocks);
+  if (n_blocks != next_len / 24 - 1)
+    printf ("\t\t(Expected %lld.)\n", next_len / 24 - 1);
+
+  expected_uncmp_ofs = ofs;
+  expected_cmp_ofs = ofs + 24;
+  for (i = 0; i < n_blocks; i++)
+    {
+      long long int blockinfo_ofs = ftello (r->file);
+      unsigned long long int uncompressed_ofs = read_int64 (r);
+      unsigned long long int compressed_ofs = read_int64 (r);
+      unsigned int uncompressed_size = read_int (r);
+      unsigned int compressed_size = read_int (r);
+
+      printf ("\n%08llx: ZLIB block descriptor %u\n", blockinfo_ofs, i + 1);
+      printf ("\tuncompressed_ofs: 0x%llx\n", uncompressed_ofs);
+      if (uncompressed_ofs != expected_uncmp_ofs)
+        printf ("\t\t(Expected 0x%llx.)\n", expected_uncmp_ofs);
+
+      printf ("\tcompressed_ofs: 0x%llx\n", compressed_ofs);
+      if (compressed_ofs != expected_cmp_ofs)
+        printf ("\t\t(Expected 0x%llx.)\n", expected_cmp_ofs);
+
+      printf ("\tuncompressed_size: 0x%x\n", uncompressed_size);
+      if (i < n_blocks - 1 && uncompressed_size != block_size)
+        printf ("\t\t(Expected 0x%x.)\n", block_size);
+
+      printf ("\tcompressed_size: 0x%x\n", compressed_size);
+      if (i == n_blocks - 1 && compressed_ofs + compressed_size != next_ofs)
+        printf ("\t\t(This was expected to be 0x%llx.)\n",
+                next_ofs - compressed_ofs);
+
+      /* Each block is expected to start where the previous one ended. */
+      expected_uncmp_ofs += uncompressed_size;
+      expected_cmp_ofs += compressed_size;
+    }
+}
  \f
  /* Helpers for reading records that consist of structured text
     strings. */
author	Ben Pfaff <blp@cs.stanford.edu>
	Wed, 23 Oct 2013 05:56:18 +0000 (22:56 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Wed, 23 Oct 2013 05:56:56 +0000 (22:56 -0700)
NEWS		patch \| blob \| history
doc/dev/system-file-format.texi		patch \| blob \| history
doc/files.texi		patch \| blob \| history
perl-module/PSPP.xs		patch \| blob \| history
src/data/automake.mk		patch \| blob \| history
src/data/sys-file-private.h		patch \| blob \| history
src/data/sys-file-reader.c		patch \| blob \| history
src/data/sys-file-reader.h		patch \| blob \| history
src/data/sys-file-writer.c		patch \| blob \| history
src/data/sys-file-writer.h		patch \| blob \| history
src/data/sys-file.h	[new file with mode: 0644]	patch \| blob
src/language/data-io/save.c		patch \| blob \| history
src/language/dictionary/sys-file-info.c		patch \| blob \| history
tests/data/sack.c		patch \| blob \| history
tests/data/sys-file-reader.at		patch \| blob \| history
tests/data/sys-file.at		patch \| blob \| history
tests/language/dictionary/sys-file-info.at		patch \| blob \| history
utilities/pspp-dump-sav.c		patch \| blob \| history