-/* PSPP - computes sample statistics.
- Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
- Written by Ben Pfaff <blp@gnu.org>.
+/* PSPP - a program for statistical analysis.
+ Copyright (C) 1997-2000, 2006-2007, 2009-2016 Free Software Foundation, Inc.
- This program is free software; you can redistribute it and/or
- modify it under the terms of the GNU General Public License as
- published by the Free Software Foundation; either version 2 of the
- License, or (at your option) any later version.
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
- This program is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- 02110-1301, USA. */
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
#include <config.h>
-#include <stdlib.h>
+#include "data/sys-file-private.h"
+
#include <errno.h>
#include <float.h>
-#include <c-ctype.h>
-#include <minmax.h>
-
-#include <libpspp/alloc.h>
-#include <libpspp/message.h>
-#include <libpspp/compiler.h>
-#include <libpspp/magic.h>
-#include <libpspp/misc.h>
-#include <libpspp/str.h>
-#include <libpspp/hash.h>
-#include <libpspp/array.h>
-
-#include "sys-file-reader.h"
-#include "sfm-private.h"
-#include "case.h"
-#include "dictionary.h"
-#include "file-handle-def.h"
-#include "file-name.h"
-#include "format.h"
-#include "value-labels.h"
-#include "variable.h"
-#include "value.h"
+#include <inttypes.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <zlib.h>
+
+#include "data/any-reader.h"
+#include "data/attributes.h"
+#include "data/case.h"
+#include "data/casereader-provider.h"
+#include "data/casereader.h"
+#include "data/dictionary.h"
+#include "data/file-handle-def.h"
+#include "data/file-name.h"
+#include "data/format.h"
+#include "data/identifier.h"
+#include "data/missing-values.h"
+#include "data/mrset.h"
+#include "data/short-names.h"
+#include "data/value-labels.h"
+#include "data/value.h"
+#include "data/variable.h"
+#include "libpspp/array.h"
+#include "libpspp/assertion.h"
+#include "libpspp/compiler.h"
+#include "libpspp/i18n.h"
+#include "libpspp/message.h"
+#include "libpspp/misc.h"
+#include "libpspp/pool.h"
+#include "libpspp/str.h"
+#include "libpspp/stringi-set.h"
+
+#include "gl/c-strtod.h"
+#include "gl/c-ctype.h"
+#include "gl/inttostr.h"
+#include "gl/localcharset.h"
+#include "gl/minmax.h"
+#include "gl/unlocked-io.h"
+#include "gl/xalloc.h"
+#include "gl/xalloc-oversized.h"
+#include "gl/xsize.h"
#include "gettext.h"
#define _(msgid) gettext (msgid)
+#define N_(msgid) (msgid)
+
+enum /* Subtypes of type 7 (extension) records. Each value is also the index
+ at which read_record() stores that record in the extensions[] array of
+ struct sfm_reader. Subtype 23 has no name here either. */
+ {
+ /* subtypes 0-2 unknown */
+ EXT_INTEGER = 3, /* Machine integer info. */
+ EXT_FLOAT = 4, /* Machine floating-point info. */
+ EXT_VAR_SETS = 5, /* Variable sets. */
+ EXT_DATE = 6, /* DATE. */
+ EXT_MRSETS = 7, /* Multiple response sets. */
+ EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
+ /* subtype 9 unknown */
+ EXT_PRODUCT_INFO = 10, /* Extra product info text. */
+ EXT_DISPLAY = 11, /* Variable display parameters. */
+ /* subtype 12 unknown */
+ EXT_LONG_NAMES = 13, /* Long variable names. */
+ EXT_LONG_STRINGS = 14, /* Long strings. */
+ /* subtype 15 unknown */
+ EXT_NCASES = 16, /* Extended number of cases. */
+ EXT_FILE_ATTRS = 17, /* Data file attributes. */
+ EXT_VAR_ATTRS = 18, /* Variable attributes. */
+ EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
+ EXT_ENCODING = 20, /* Character encoding. */
+ EXT_LONG_LABELS = 21, /* Value labels for long strings. */
+ EXT_LONG_MISSING = 22, /* Missing values for long strings. */
+ EXT_DATAVIEW = 24 /* "Format properties in dataview table". */
+ };
+
+/* Fields from the top-level header record, in the form that read_header()
+ hands to parse_header(). */
+struct sfm_header_record
+ {
+ char magic[5]; /* First 4 bytes of file, then null. */
+ int weight_idx; /* 0 if unweighted, otherwise a var index. */
+ int nominal_case_size; /* Number of var positions. */
+
+ /* These correspond to the members of struct any_read_info or a dictionary
+ but in the system file's encoding rather than ASCII. sfm_get_strings()
+ treats each of them as a null-terminated string. */
+ char creation_date[10]; /* "dd mmm yy". */
+ char creation_time[9]; /* "hh:mm:ss". */
+ char eye_catcher[61]; /* Eye-catcher string, then product name. */
+ char file_label[65]; /* File label. */
+ };
+
+struct sfm_var_record /* One type 2 record, filled in by
+ read_variable_record() when read_record() sees record type 2. */
+ {
+ off_t pos; /* Presumably the record's offset in the file -- TODO confirm. */
+ int width; /* sfm_get_strings() skips records whose width is -1. */
+ char name[9]; /* Name as a null-terminated string, in the file's own
+ encoding. */
+ int print_format; /* Presumably the raw print format -- TODO confirm. */
+ int write_format; /* Presumably the raw write format -- TODO confirm. */
+ int missing_value_code;
+ uint8_t missing[24];
+ char *label; /* Variable label in the file's encoding, or null if none. */
+ struct variable *var;
+ };
+
+struct sfm_value_label /* One element of the labels[] array in struct
+ sfm_value_label_record. */
+ {
+ uint8_t value[8];
+ char *label; /* Label text; sfm_get_strings() reports it as a raw string in
+ the file's own encoding. */
+ };
+
+struct sfm_value_label_record /* One type 3 record, filled in by
+ read_value_label_record() when read_record() sees record type 3. */
+ {
+ off_t pos;
+ struct sfm_value_label *labels; /* Array of N_LABELS elements. */
+ unsigned int n_labels; /* Number of elements in LABELS. */
+
+ int *vars; /* Presumably N_VARS variable indexes -- TODO confirm. */
+ unsigned int n_vars;
+ };
+
+struct sfm_document_record /* The type 6 (document) record. read_record()
+ rejects a second one as a duplicate. */
+ {
+ off_t pos;
+ char *documents; /* N_LINES lines of 80 bytes each; sfm_get_strings() adds
+ its own terminator when it copies a line out. */
+ size_t n_lines; /* Number of 80-byte lines in DOCUMENTS. */
+ };
+
+struct sfm_mrset /* One element of the mrsets[] array in struct sfm_reader. */
+ {
+ const char *name; /* Name. */
+ const char *label; /* Human-readable label for group. */
+ enum mrset_type type; /* Group type. */
+ const char **vars; /* Constituent variables' names. */
+ size_t n_vars; /* Number of constituent variables. */
+
+ /* MRSET_MD only. */
+ enum mrset_md_cat_source cat_source; /* Source of category labels. */
+ bool label_from_var_label; /* 'label' taken from variable label? */
+ const char *counted; /* Counted value, as string (may be null). */
+ };
+
+struct sfm_extension_record /* One type 7 (extension) record. read_record()
+ stores it in the extensions[] array of struct sfm_reader at index SUBTYPE. */
+ {
+ int subtype; /* Record subtype. */
+ off_t pos; /* Starting offset in file. */
+ unsigned int size; /* Size of data elements. */
+ unsigned int count; /* Number of data elements. */
+ void *data; /* Contents. */
+ };
/* System file reader. */
struct sfm_reader
-{
- struct file_handle *fh; /* File handle. */
- FILE *file; /* File stream. */
-
- int reverse_endian; /* 1=file has endianness opposite us. */
- int fix_specials; /* 1=SYSMIS/HIGHEST/LOWEST differs from us. */
- int value_cnt; /* Number of `union values's per case. */
- long case_cnt; /* Number of cases, -1 if unknown. */
- int compressed; /* 1=compressed, 0=not compressed. */
- double bias; /* Compression bias, usually 100.0. */
- int weight_idx; /* 0-based index of weighting variable, or -1. */
- bool ok; /* False after an I/O error or corrupt data. */
- bool has_vls; /* True if the file has one or more Very Long Strings*/
-
- /* Variables. */
- struct hsh_table *var_hash;
- struct variable **svars;
-
- /* File's special constants. */
- flt64 sysmis;
- flt64 highest;
- flt64 lowest;
-
- /* Decompression buffer. */
- flt64 *buf; /* Buffer data. */
- flt64 *ptr; /* Current location in buffer. */
- flt64 *end; /* End of buffer data. */
-
- /* Compression instruction octet. */
- unsigned char x[8]; /* Current instruction octet. */
- unsigned char *y; /* Location in current instruction octet. */
-};
-
-/* A variable in a system file. */
-struct sfm_var
-{
- char name[SHORT_NAME_LEN + 1]; /* name */
- int width; /* 0=numeric, otherwise string width. */
- int fv; /* Index into case. */
-};
-\f
-/* Utilities. */
+ {
+ struct any_reader any_reader; /* Generic reader; see sfm_reader_cast(). */
+
+ /* Resource tracking. */
+ struct pool *pool; /* All system file state. */
+
+ /* File data. */
+ struct any_read_info info;
+ struct sfm_header_record header; /* Filled in by read_header(). */
+ struct sfm_var_record *vars; /* Type 2 records, in the order read. */
+ size_t n_vars; /* Number of elements in VARS. */
+ struct sfm_value_label_record *labels; /* Type 3 records, in the order read. */
+ size_t n_labels; /* Number of elements in LABELS. */
+ struct sfm_document_record *document; /* Type 6 record, or null if none. */
+ struct sfm_mrset *mrsets; /* Multiple response sets. */
+ size_t n_mrsets; /* Number of elements in MRSETS. */
+ struct sfm_extension_record *extensions[32]; /* Type 7 records indexed by
+ subtype; null where no such record was read. */
+
+ /* File state. */
+ struct file_handle *fh; /* File handle. */
+ struct fh_lock *lock; /* Mutual exclusion for file handle. */
+ FILE *file; /* File stream. */
+ off_t pos; /* Position in file. */
+ bool error; /* I/O or corruption error? */
+ struct caseproto *proto; /* Format of output cases. */
+
+ /* File format. */
+ enum integer_format integer_format; /* On-disk integer format. */
+ enum float_format float_format; /* On-disk floating point format. */
+ struct sfm_var *sfm_vars; /* Variables. */
+ size_t sfm_var_cnt; /* Number of variables. */
+ int case_cnt; /* Number of cases */
+ const char *encoding; /* String encoding. */
+
+ /* Decompression. */
+ enum any_compression compression;
+ double bias; /* Compression bias, usually 100.0. */
+ uint8_t opcodes[8]; /* Current block of opcodes. */
+ size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
+ bool corruption_warning; /* Warned about possible corruption? */
+
+ /* ZLIB decompression. */
+ long long int ztrailer_ofs; /* Offset of ZLIB trailer at end of file. */
+#define ZIN_BUF_SIZE 4096
+ uint8_t *zin_buf; /* Inflation input buffer. */
+#define ZOUT_BUF_SIZE 16384
+ uint8_t *zout_buf; /* Inflation output buffer. */
+ unsigned int zout_end; /* Number of bytes of data in zout_buf. */
+ unsigned int zout_pos; /* First unconsumed byte in zout_buf. */
+ z_stream zstream; /* ZLIB inflater. */
+ };
-/* Swap bytes *A and *B. */
-static inline void
-bswap (char *a, char *b)
-{
- char t = *a;
- *a = *b;
- *b = t;
-}
+static const struct casereader_class sys_file_casereader_class;
-/* Reverse the byte order of 32-bit integer *X. */
-static inline void
-bswap_int32 (int32_t *x_)
+static struct sfm_reader * /* Converts R_ to the struct sfm_reader that embeds
+ it as member any_reader. */
+sfm_reader_cast (const struct any_reader *r_)
{
- char *x = (char *) x_;
- bswap (x + 0, x + 3);
- bswap (x + 1, x + 2);
+ assert (r_->klass == &sys_file_reader_class); /* R_ must be a system file reader. */
+ return UP_CAST (r_, struct sfm_reader, any_reader);
}
-/* Reverse the byte order of 64-bit floating point *X. */
-static inline void
-bswap_flt64 (flt64 *x_)
-{
- char *x = (char *) x_;
- bswap (x + 0, x + 7);
- bswap (x + 1, x + 6);
- bswap (x + 2, x + 5);
- bswap (x + 3, x + 4);
-}
+static bool sfm_close (struct any_reader *);
+
+static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
+ const struct sfm_var_record *,
+ size_t n, int idx);
+
+static void sys_msg (struct sfm_reader *r, off_t, int class,
+ const char *format, va_list args)
+ PRINTF_FORMAT (4, 0);
+static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
+ PRINTF_FORMAT (3, 4);
+static void sys_error (struct sfm_reader *, off_t, const char *, ...)
+ PRINTF_FORMAT (3, 4);
+
+static bool read_bytes (struct sfm_reader *, void *, size_t)
+ WARN_UNUSED_RESULT;
+static int try_read_bytes (struct sfm_reader *, void *, size_t)
+ WARN_UNUSED_RESULT;
+static bool read_int (struct sfm_reader *, int *) WARN_UNUSED_RESULT;
+static bool read_uint (struct sfm_reader *, unsigned int *) WARN_UNUSED_RESULT;
+static bool read_int64 (struct sfm_reader *, long long int *)
+ WARN_UNUSED_RESULT;
+static bool read_uint64 (struct sfm_reader *, unsigned long long int *)
+ WARN_UNUSED_RESULT;
+static bool read_string (struct sfm_reader *, char *, size_t)
+ WARN_UNUSED_RESULT;
+static bool skip_bytes (struct sfm_reader *, size_t) WARN_UNUSED_RESULT;
+
+/* ZLIB compressed data handling. */
+static bool read_zheader (struct sfm_reader *) WARN_UNUSED_RESULT;
+static bool open_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
+static bool close_zstream (struct sfm_reader *) WARN_UNUSED_RESULT;
+static int read_bytes_zlib (struct sfm_reader *, void *, size_t)
+ WARN_UNUSED_RESULT;
+static int read_compressed_bytes (struct sfm_reader *, void *, size_t)
+ WARN_UNUSED_RESULT;
+static int try_read_compressed_bytes (struct sfm_reader *, void *, size_t)
+ WARN_UNUSED_RESULT;
+static bool read_compressed_float (struct sfm_reader *, double *)
+ WARN_UNUSED_RESULT;
+
+static char *fix_line_ends (const char *);
+
+static int parse_int (const struct sfm_reader *, const void *data, size_t ofs);
+static double parse_float (const struct sfm_reader *,
+ const void *data, size_t ofs);
+
+static bool read_variable_record (struct sfm_reader *,
+ struct sfm_var_record *);
+static bool read_value_label_record (struct sfm_reader *,
+ struct sfm_value_label_record *);
+static struct sfm_document_record *read_document_record (struct sfm_reader *);
+static bool read_extension_record (struct sfm_reader *, int subtype,
+ struct sfm_extension_record **);
+static bool skip_extension_record (struct sfm_reader *, int subtype);
+
+static struct text_record *open_text_record (
+ struct sfm_reader *, const struct sfm_extension_record *,
+ bool recode_to_utf8);
+static void close_text_record (struct sfm_reader *,
+ struct text_record *);
+static bool read_variable_to_value_pair (struct sfm_reader *,
+ struct dictionary *,
+ struct text_record *,
+ struct variable **var, char **value);
+static void text_warn (struct sfm_reader *r, struct text_record *text,
+ const char *format, ...)
+ PRINTF_FORMAT (3, 4);
+static char *text_get_token (struct text_record *,
+ struct substring delimiters, char *delimiter);
+static bool text_match (struct text_record *, char c);
+static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
+ struct text_record *,
+ struct substring delimiters,
+ struct variable **);
+static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
+ struct text_record *,
+ struct substring delimiters,
+ struct variable **);
+static const char *text_parse_counted_string (struct sfm_reader *,
+ struct text_record *);
+static size_t text_pos (const struct text_record *);
+static const char *text_get_all (const struct text_record *);
+\f
+/* Dictionary reader. */
-static void
-corrupt_msg (int class, const char *format,...)
- PRINTF_FORMAT (2, 3);
+enum which_format /* Argument to parse_format_spec(); presumably selects
+ whether the variable's print or write format is being parsed -- TODO
+ confirm. */
+ {
+ PRINT_FORMAT,
+ WRITE_FORMAT
+ };
- /* Displays a corrupt sysfile error. */
- static void
- corrupt_msg (int class, const char *format,...)
+static bool read_dictionary (struct sfm_reader *);
+static bool read_record (struct sfm_reader *, int type,
+ size_t *allocated_vars, size_t *allocated_labels);
+static bool read_header (struct sfm_reader *, struct any_read_info *,
+ struct sfm_header_record *);
+static void parse_header (struct sfm_reader *,
+ const struct sfm_header_record *,
+ struct any_read_info *, struct dictionary *);
+static bool parse_variable_records (struct sfm_reader *, struct dictionary *,
+ struct sfm_var_record *, size_t n);
+static void parse_format_spec (struct sfm_reader *, off_t pos,
+ unsigned int format, enum which_format,
+ struct variable *, int *format_warning_cnt);
+static void parse_document (struct dictionary *, struct sfm_document_record *);
+static void parse_display_parameters (struct sfm_reader *,
+ const struct sfm_extension_record *,
+ struct dictionary *);
+static bool parse_machine_integer_info (struct sfm_reader *,
+ const struct sfm_extension_record *,
+ struct any_read_info *);
+static void parse_machine_float_info (struct sfm_reader *,
+ const struct sfm_extension_record *);
+static void parse_extra_product_info (struct sfm_reader *,
+ const struct sfm_extension_record *,
+ struct any_read_info *);
+static void parse_mrsets (struct sfm_reader *,
+ const struct sfm_extension_record *,
+ size_t *allocated_mrsets);
+static void decode_mrsets (struct sfm_reader *, struct dictionary *);
+static void parse_long_var_name_map (struct sfm_reader *,
+ const struct sfm_extension_record *,
+ struct dictionary *);
+static bool parse_long_string_map (struct sfm_reader *,
+ const struct sfm_extension_record *,
+ struct dictionary *);
+static bool parse_value_labels (struct sfm_reader *, struct dictionary *,
+ const struct sfm_var_record *,
+ size_t n_var_recs,
+ const struct sfm_value_label_record *);
+static void parse_data_file_attributes (struct sfm_reader *,
+ const struct sfm_extension_record *,
+ struct dictionary *);
+static void parse_variable_attributes (struct sfm_reader *,
+ const struct sfm_extension_record *,
+ struct dictionary *);
+static void assign_variable_roles (struct sfm_reader *, struct dictionary *);
+static void parse_long_string_value_labels (struct sfm_reader *,
+ const struct sfm_extension_record *,
+ struct dictionary *);
+static void parse_long_string_missing_values (
+ struct sfm_reader *, const struct sfm_extension_record *,
+ struct dictionary *);
+
+/* Frees the strings inside INFO (creation_date, creation_time, product, and
+ product_ext) but not INFO itself. Does nothing if INFO is null. */
+void
+any_read_info_destroy (struct any_read_info *info)
{
- struct msg m;
- va_list args;
- struct string text;
-
- ds_init_cstr (&text, _("corrupt system file: "));
- va_start (args, format);
- ds_put_vformat (&text, format, args);
- va_end (args);
-
- m.category = msg_class_to_category (class);
- m.severity = msg_class_to_severity (class);
- m.where.file_name = NULL;
- m.where.line_number = 0;
- m.text = ds_cstr (&text);
-
- msg_emit (&m);
+ if (info)
+ {
+ free (info->creation_date);
+ free (info->creation_time);
+ free (info->product);
+ free (info->product_ext);
+ }
}
-/* Closes a system file after we're done with it. */
-void
-sfm_close_reader (struct sfm_reader *r)
+/* Tries to open FH for reading as a system file. Returns an sfm_reader if
+ successful, otherwise NULL.
+
+ On success the value returned is the any_reader member of a new struct
+ sfm_reader whose dictionary records have been read by read_dictionary() and
+ whose multiple response set records, if any, have been passed to
+ parse_mrsets(). On failure the reader is released with sfm_close(). */
+static struct any_reader *
+sfm_open (struct file_handle *fh)
{
- if (r == NULL)
- return;
+ size_t allocated_mrsets = 0;
+ struct sfm_reader *r;
- if (r->file)
+ /* Create and initialize reader. */
+ r = xzalloc (sizeof *r);
+ r->any_reader.klass = &sys_file_reader_class;
+ r->pool = pool_create ();
+ pool_register (r->pool, free, r); /* R itself is registered with its own pool. */
+ r->fh = fh_ref (fh);
+ r->opcode_idx = sizeof r->opcodes; /* No opcodes left to interpret yet. */
+
+ /* TRANSLATORS: this fragment will be interpolated into
+ messages in fh_lock() that identify types of files. */
+ r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
+ if (r->lock == NULL)
+ goto error;
+
+ r->file = fn_open (fh, "rb");
+ if (r->file == NULL)
{
- if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
- msg (ME, _("%s: Closing system file: %s."),
- fh_get_file_name (r->fh), strerror (errno));
- r->file = NULL;
+ msg (ME, _("Error opening `%s' for reading as a system file: %s."),
+ fh_get_file_name (r->fh), strerror (errno));
+ goto error;
}
- if (r->fh != NULL)
- fh_close (r->fh, "system file", "rs");
+ if (!read_dictionary (r))
+ goto error;
- hsh_destroy(r->var_hash);
- free (r->buf);
- free (r);
-}
-\f
-/* Dictionary reader. */
+ if (r->extensions[EXT_MRSETS] != NULL)
+ parse_mrsets (r, r->extensions[EXT_MRSETS], &allocated_mrsets);
-static void buf_unread(struct sfm_reader *r, size_t byte_cnt);
+ if (r->extensions[EXT_MRSETS2] != NULL)
+ parse_mrsets (r, r->extensions[EXT_MRSETS2], &allocated_mrsets);
-static void *buf_read (struct sfm_reader *, void *buf, size_t byte_cnt,
- size_t min_alloc);
+ return &r->any_reader;
-static int read_header (struct sfm_reader *,
- struct dictionary *, struct sfm_read_info *);
-static int parse_format_spec (struct sfm_reader *, int32_t,
- struct fmt_spec *, const struct variable *);
-static int read_value_labels (struct sfm_reader *, struct dictionary *,
- struct variable **var_by_idx);
-static int read_variables (struct sfm_reader *,
- struct dictionary *, struct variable ***var_by_idx);
-static int read_machine_int32_info (struct sfm_reader *, int size, int count);
-static int read_machine_flt64_info (struct sfm_reader *, int size, int count);
-static int read_documents (struct sfm_reader *, struct dictionary *);
+error:
+ if (r)
+ sfm_close (&r->any_reader);
+ return NULL;
+}
-static int fread_ok (struct sfm_reader *, void *, size_t);
+static bool /* Reads the header record and then every following record, up to
+ the record of type 999, saving them in R. Returns true if successful, false
+ if any read fails. */
+read_dictionary (struct sfm_reader *r)
+{
+ size_t allocated_vars;
+ size_t allocated_labels;
-/* Displays the message X with corrupt_msg, then jumps to the error
- label. */
-#define lose(X) \
- do { \
- corrupt_msg X; \
- goto error; \
- } while (0)
+ if (!read_header (r, &r->info, &r->header))
+ return false;
-/* Calls buf_read with the specified arguments, and jumps to
- error if the read fails. */
-#define assertive_buf_read(a,b,c,d) \
- do { \
- if (!buf_read (a,b,c,d)) \
- goto error; \
- } while (0)
+ allocated_vars = 0;
+ allocated_labels = 0;
+ for (;;)
+ {
+ int type;
+
+ if (!read_int (r, &type))
+ return false;
+ if (type == 999) /* Record type 999 ends the loop. */
+ break;
+ if (!read_record (r, type, &allocated_vars, &allocated_labels))
+ return false;
+ }
+ if (!skip_bytes (r, 4)) /* Skip the 4 bytes after the type 999 field. */
+ return false;
-struct name_pair
-{
- char *shortname;
- char *longname;
-};
+ if (r->compression == ANY_COMP_ZLIB && !read_zheader (r)) /* ZLIB files only. */
+ return false;
-static int
-pair_sn_compare(const void *_p1, const void *_p2, void *aux UNUSED)
+ return true;
+}
+
+static bool /* Reads one record of the given TYPE, whose type field the caller
+ has already read, and saves it in R. *ALLOCATED_VARS and *ALLOCATED_LABELS
+ track the allocated sizes of R->vars and R->labels. Returns true if
+ successful, false if the record is rejected or a helper fails. */
+read_record (struct sfm_reader *r, int type,
+ size_t *allocated_vars, size_t *allocated_labels)
{
- int i;
+ int subtype;
- const struct name_pair *p1 = _p1;
- const struct name_pair *p2 = _p2;
+ switch (type)
+ {
+ case 2: /* Variable record: append to R->vars. */
+ if (r->n_vars >= *allocated_vars)
+ r->vars = pool_2nrealloc (r->pool, r->vars, allocated_vars,
+ sizeof *r->vars);
+ return read_variable_record (r, &r->vars[r->n_vars++]);
+
+ case 3: /* Value label record: append to R->labels. */
+ if (r->n_labels >= *allocated_labels)
+ r->labels = pool_2nrealloc (r->pool, r->labels, allocated_labels,
+ sizeof *r->labels);
+ return read_value_label_record (r, &r->labels[r->n_labels++]);
+
+ case 4:
+ /* A Type 4 record is always immediately after a type 3 record,
+ so the code for type 3 records reads the type 4 record too. */
+ sys_error (r, r->pos, _("Misplaced type 4 record."));
+ return false;
+
+ case 6: /* Document record: at most one is accepted. */
+ if (r->document != NULL)
+ {
+ sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
+ return false;
+ }
+ r->document = read_document_record (r);
+ return r->document != NULL;
+
+ case 7: /* Extension record: kept in R->extensions[], indexed by subtype. */
+ if (!read_int (r, &subtype))
+ return false;
+ else if (subtype < 0
+ || subtype >= sizeof r->extensions / sizeof *r->extensions)
+ {
+ sys_warn (r, r->pos,
+ _("Unrecognized record type 7, subtype %d. For help, "
+ "please send this file to %s and mention that you were "
+ "using %s."),
+ subtype, PACKAGE_BUGREPORT, PACKAGE_STRING);
+ return skip_extension_record (r, subtype);
+ }
+ else if (r->extensions[subtype] != NULL) /* Only the first record of a subtype is kept. */
+ {
+ sys_warn (r, r->pos,
+ _("Record type 7, subtype %d found here has the same "
+ "type as the record found near offset 0x%llx. For "
+ "help, please send this file to %s and mention that "
+ "you were using %s."),
+ subtype, (long long int) r->extensions[subtype]->pos,
+ PACKAGE_BUGREPORT, PACKAGE_STRING);
+ return skip_extension_record (r, subtype);
+ }
+ else
+ return read_extension_record (r, subtype, &r->extensions[subtype]);
- char buf1[SHORT_NAME_LEN + 1];
- char buf2[SHORT_NAME_LEN + 1];
+ default:
+ sys_error (r, r->pos, _("Unrecognized record type %d."), type);
+ return false;
+ }
- memset(buf1, 0, SHORT_NAME_LEN + 1);
- memset(buf2, 0, SHORT_NAME_LEN + 1);
+ NOT_REACHED ();
+}
- for (i = 0 ; i <= SHORT_NAME_LEN ; ++i )
- {
- buf1[i] = p1->shortname[i];
- if ( '\0' == buf1[i])
- break;
- }
+/* Returns the character encoding obtained from R, or a null pointer if R
+ doesn't have an indication of its character encoding.
+
+ Sources are tried in order of preference: the EXT_ENCODING record, then the
+ character code in the EXT_INTEGER record, then the file's magic number. */
+static const char *
+sfm_get_encoding (const struct sfm_reader *r)
+{
+ /* The EXT_ENCODING record is the best way to determine dictionary
+ encoding. */
+ if (r->extensions[EXT_ENCODING])
+ return r->extensions[EXT_ENCODING]->data;
- for (i = 0 ; i <= SHORT_NAME_LEN ; ++i )
+ /* But EXT_INTEGER is better than nothing as a fallback. */
+ if (r->extensions[EXT_INTEGER])
{
- buf2[i] = p2->shortname[i];
- if ( '\0' == buf2[i])
- break;
+ int codepage = parse_int (r, r->extensions[EXT_INTEGER]->data, 7 * 4); /* Character code, read at offset 7 * 4. */
+ const char *encoding;
+
+ switch (codepage)
+ {
+ case 1:
+ return "EBCDIC-US";
+
+ case 2:
+ case 3:
+ /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
+ respectively. However, many files have character code 2 but data
+ which are clearly not ASCII. Therefore, ignore these values. */
+ break;
+
+ case 4:
+ return "MS_KANJI";
+
+ default:
+ encoding = sys_get_encoding_from_codepage (codepage);
+ if (encoding != NULL)
+ return encoding;
+ break;
+ }
}
- return strncmp(buf1, buf2, SHORT_NAME_LEN);
+ /* If the file magic number is EBCDIC then its character data is too. */
+ if (!strcmp (r->header.magic, EBCDIC_MAGIC))
+ return "EBCDIC-US";
+
+ return NULL;
}
-static unsigned int
-pair_sn_hash(const void *_p, void *aux UNUSED)
-{
- int i;
- const struct name_pair *p = _p;
- char buf[SHORT_NAME_LEN + 1];
+struct get_strings_aux /* Accumulator used by sfm_get_strings(): three
+ parallel arrays that add_string__() grows together. */
+ {
+ struct pool *pool; /* Pool from which the arrays and strings are allocated. */
+ char **titles; /* Description of each string. */
+ char **strings; /* The strings themselves. */
+ bool *ids; /* True where the string is an identifier. */
+ size_t allocated; /* Number of elements allocated in each array. */
+ size_t n; /* Number of elements in use in each array. */
+ };
- memset(buf, 0, SHORT_NAME_LEN + 1);
- for (i = 0 ; i <= SHORT_NAME_LEN ; ++i )
+static void /* Appends one entry to AUX: a copy of STRING allocated from
+ AUX->pool, the ID flag, and TITLE, whose pointer is stored as given. */
+add_string__ (struct get_strings_aux *aux,
+ const char *string, bool id, char *title)
+{
+ if (aux->n >= aux->allocated)
{
- buf[i] = p->shortname[i];
- if ( '\0' == buf[i])
- break;
+ aux->allocated = 2 * (aux->allocated + 1); /* Grow all three arrays in step. */
+ aux->titles = pool_realloc (aux->pool, aux->titles,
+ aux->allocated * sizeof *aux->titles);
+ aux->strings = pool_realloc (aux->pool, aux->strings,
+ aux->allocated * sizeof *aux->strings);
+ aux->ids = pool_realloc (aux->pool, aux->ids,
+ aux->allocated * sizeof *aux->ids);
}
- return hsh_hash_bytes(buf, strlen(buf));
+ aux->titles[aux->n] = title;
+ aux->strings[aux->n] = pool_strdup (aux->pool, string);
+ aux->ids[aux->n] = id;
+ aux->n++;
}
-static void
-pair_sn_free(void *p, void *aux UNUSED)
+static void PRINTF_FORMAT (3, 4) /* Appends STRING to AUX as free-form text
+ (its ID flag is false). The entry's title is TITLE formatted printf-style
+ with the remaining arguments, allocated from AUX->pool. */
+add_string (struct get_strings_aux *aux,
+ const char *string, const char *title, ...)
{
- free(p);
+ va_list args;
+
+ va_start (args, title);
+ add_string__ (aux, string, false, pool_vasprintf (aux->pool, title, args));
+ va_end (args);
}
+static void PRINTF_FORMAT (3, 4) /* Appends ID to AUX as an identifier (its ID
+ flag is true). The entry's title is TITLE formatted printf-style with the
+ remaining arguments, allocated from AUX->pool. */
+add_id (struct get_strings_aux *aux, const char *id, const char *title, ...)
+{
+ va_list args;
+ va_start (args, title);
+ add_string__ (aux, id, true, pool_vasprintf (aux->pool, title, args));
+ va_end (args);
+}
-/* A hsh_compare_func that orders variables A and B by their
- names. */
-static int
-compare_var_shortnames (const void *a_, const void *b_, void *foo UNUSED)
+/* Retrieves significant string data from R in its raw format, to allow the
+ caller to try to detect the encoding in use.
+
+ Returns the number of strings retrieved N. Sets each of *TITLESP, *IDSP,
+ and *STRINGSP to an array of N elements allocated from POOL. For each I in
+ 0...N-1, UTF-8 string *TITLESP[I] describes *STRINGSP[I], which is in
+ whatever encoding system file R uses. *IDSP[I] is true if *STRINGSP[I] must
+ be a valid PSPP language identifier, false if *STRINGSP[I] is free-form
+ text. */
+static size_t
+sfm_get_strings (const struct any_reader *r_, struct pool *pool,
+ char ***titlesp, bool **idsp, char ***stringsp)
{
- int i;
- const struct variable *a = a_;
- const struct variable *b = b_;
+ struct sfm_reader *r = sfm_reader_cast (r_);
+ const struct sfm_mrset *mrset;
+ struct get_strings_aux aux;
+ size_t var_idx;
+ size_t i, j, k;
+
+ aux.pool = pool;
+ aux.titles = NULL;
+ aux.strings = NULL;
+ aux.ids = NULL;
+ aux.allocated = 0;
+ aux.n = 0;
+
+ var_idx = 0;
+ for (i = 0; i < r->n_vars; i++)
+ if (r->vars[i].width != -1) /* Records with width -1 get no variable number. */
+ add_id (&aux, r->vars[i].name, _("Variable %zu"), ++var_idx);
+
+ var_idx = 0;
+ for (i = 0; i < r->n_vars; i++)
+ if (r->vars[i].width != -1)
+ {
+ var_idx++; /* Same numbering as the variable names above. */
+ if (r->vars[i].label)
+ add_string (&aux, r->vars[i].label, _("Variable %zu Label"),
+ var_idx);
+ }
+
+ k = 0;
+ for (i = 0; i < r->n_labels; i++)
+ for (j = 0; j < r->labels[i].n_labels; j++)
+ add_string (&aux, r->labels[i].labels[j].label,
+ _("Value Label %zu"), k++); /* NOTE(review): numbered from 0, whereas the
+ other titles here count from 1 -- confirm whether intended. */
- char buf1[SHORT_NAME_LEN + 1];
- char buf2[SHORT_NAME_LEN + 1];
+ add_string (&aux, r->header.creation_date, _("Creation Date"));
+ add_string (&aux, r->header.creation_time, _("Creation Time"));
+ add_string (&aux, r->header.eye_catcher, _("Product"));
+ add_string (&aux, r->header.file_label, _("File Label"));
- memset(buf1, 0, SHORT_NAME_LEN + 1);
- memset(buf2, 0, SHORT_NAME_LEN + 1);
+ if (r->extensions[EXT_PRODUCT_INFO])
+ add_string (&aux, r->extensions[EXT_PRODUCT_INFO]->data,
+ _("Extra Product Info"));
- for (i = 0 ; i <= SHORT_NAME_LEN ; ++i )
+ if (r->document)
{
- buf1[i] = a->short_name[i];
- if ( '\0' == buf1[i])
- break;
+ size_t i;
+
+ for (i = 0; i < r->document->n_lines; i++)
+ {
+ char line[81]; /* One 80-byte document line plus a terminator. */
+
+ memcpy (line, r->document->documents + i * 80, 80);
+ line[80] = '\0';
+
+ add_string (&aux, line, _("Document Line %zu"), i + 1);
+ }
}
- for (i = 0 ; i <= SHORT_NAME_LEN ; ++i )
+ for (mrset = r->mrsets; mrset < &r->mrsets[r->n_mrsets]; mrset++)
{
- buf2[i] = b->short_name[i];
- if ( '\0' == buf2[i])
- break;
- }
+ size_t mrset_idx = mrset - r->mrsets + 1; /* 1-based position in R->mrsets. */
- return strncmp(buf1, buf2, SHORT_NAME_LEN);
-}
+ add_id (&aux, mrset->name, _("MRSET %zu"), mrset_idx);
+ if (mrset->label[0]) /* Empty labels are not reported. */
+ add_string (&aux, mrset->label, _("MRSET %zu Label"), mrset_idx);
-/* A hsh_hash_func that hashes variable V based on its name. */
-static unsigned
-hash_var_shortname (const void *v_, void *foo UNUSED)
-{
- int i;
- const struct variable *v = v_;
- char buf[SHORT_NAME_LEN + 1];
+ /* Skip the variables because they ought to be duplicates. */
- memset(buf, 0, SHORT_NAME_LEN + 1);
- for (i = 0 ; i <= SHORT_NAME_LEN ; ++i )
- {
- buf[i] = v->short_name[i];
- if ( '\0' == buf[i])
- break;
+ if (mrset->counted)
+ add_string (&aux, mrset->counted, _("MRSET %zu Counted Value"),
+ mrset_idx);
}
- return hsh_hash_bytes(buf, strlen(buf));
+ /* Strings from the following sources are not collected by this function: */
+ /* data file attributes */
+ /* variable attributes */
+ /* long var map */
+ /* long string value labels */
+ /* long string missing values */
+
+ *titlesp = aux.titles;
+ *idsp = aux.ids;
+ *stringsp = aux.strings;
+ return aux.n;
}
+/* Decodes the dictionary read from R, saving it into *DICTP. Character
+ strings in R are decoded using ENCODING, or an encoding obtained from R if
+ ENCODING is null, or the locale encoding if R specifies no encoding.
+ If INFOP is non-null, then it receives additional info about the system
+ file, which the caller must eventually free with any_read_info_destroy()
+ when it is no longer needed.
-/* Opens the system file designated by file handle FH for
- reading. Reads the system file's dictionary into *DICT.
- If INFO is non-null, then it receives additional info about the
- system file. */
-struct sfm_reader *
-sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
- struct sfm_read_info *info)
+ This function consumes R. The caller must not use it again later, even to
+ destroy it with sfm_close(). */
+static struct casereader *
+sfm_decode (struct any_reader *r_, const char *encoding,
+ struct dictionary **dictp, struct any_read_info *infop)
{
- struct sfm_reader *r = NULL;
- struct variable **var_by_idx = NULL;
+ struct sfm_reader *r = sfm_reader_cast (r_);
+ struct dictionary *dict;
+ size_t i;
- /* The data in record 7(14) */
- char *subrec14data = 0;
+ if (encoding == NULL)
+ {
+ encoding = sfm_get_encoding (r);
+ if (encoding == NULL)
+ {
+ sys_warn (r, -1, _("This system file does not indicate its own "
+ "character encoding. Using default encoding "
+ "%s. For best results, specify an encoding "
+ "explicitly. Use SYSFILE INFO with "
+ "ENCODING=\"DETECT\" to analyze the possible "
+ "encodings."),
+ locale_charset ());
+ encoding = locale_charset ();
+ }
+ }
- /* A hash table of long variable names indexed by short name */
- struct hsh_table *short_to_long = NULL;
+ dict = dict_create (encoding);
+ r->encoding = dict_get_encoding (dict);
+ /* These records don't use variables at all. */
+ if (r->document != NULL)
+ parse_document (dict, r->document);
- *dict = dict_create ();
- if (!fh_open (fh, FH_REF_FILE, "system file", "rs"))
+ if (r->extensions[EXT_INTEGER] != NULL
+ && !parse_machine_integer_info (r, r->extensions[EXT_INTEGER], &r->info))
goto error;
- /* Create and initialize reader. */
- r = xmalloc (sizeof *r);
- r->fh = fh;
- r->file = fn_open (fh_get_file_name (fh), "rb");
-
- r->reverse_endian = 0;
- r->fix_specials = 0;
- r->value_cnt = 0;
- r->case_cnt = 0;
- r->compressed = 0;
- r->bias = 100.0;
- r->weight_idx = -1;
- r->ok = true;
- r->has_vls = false;
- r->svars = 0;
-
- r->var_hash = hsh_create(4, compare_var_shortnames, hash_var_shortname, 0, 0);
-
- r->sysmis = -FLT64_MAX;
- r->highest = FLT64_MAX;
- r->lowest = second_lowest_flt64;
-
- r->buf = r->ptr = r->end = NULL;
- r->y = r->x + sizeof r->x;
-
- /* Check that file open succeeded. */
- if (r->file == NULL)
- {
- msg (ME, _("An error occurred while opening \"%s\" for reading "
- "as a system file: %s."),
- fh_get_file_name (r->fh), strerror (errno));
- goto error;
- }
+ if (r->extensions[EXT_FLOAT] != NULL)
+ parse_machine_float_info (r, r->extensions[EXT_FLOAT]);
- /* Read header and variables. */
- if (!read_header (r, *dict, info) || !read_variables (r, *dict, &var_by_idx))
- goto error;
+ if (r->extensions[EXT_PRODUCT_INFO] != NULL)
+ parse_extra_product_info (r, r->extensions[EXT_PRODUCT_INFO], &r->info);
+
+ if (r->extensions[EXT_FILE_ATTRS] != NULL)
+ parse_data_file_attributes (r, r->extensions[EXT_FILE_ATTRS], dict);
+ parse_header (r, &r->header, &r->info, dict);
- /* Handle weighting. */
- if (r->weight_idx != -1)
+ /* Parse the variable records, the basis of almost everything else. */
+ if (!parse_variable_records (r, dict, r->vars, r->n_vars))
+ goto error;
+
+ /* Parse value labels and the weight variable immediately after the variable
+ records. These records use indexes into var_recs[], so we must parse them
+ before those indexes become invalidated by very long string variables. */
+ for (i = 0; i < r->n_labels; i++)
+ if (!parse_value_labels (r, dict, r->vars, r->n_vars, &r->labels[i]))
+ goto error;
+ if (r->header.weight_idx != 0)
{
struct variable *weight_var;
- if (r->weight_idx < 0 || r->weight_idx >= r->value_cnt)
- lose ((ME, _("%s: Index of weighting variable (%d) is not between 0 "
- "and number of elements per case (%d)."),
- fh_get_file_name (r->fh), r->weight_idx, r->value_cnt));
+ weight_var = lookup_var_by_index (r, 76, r->vars, r->n_vars,
+ r->header.weight_idx);
+ if (weight_var != NULL)
+ {
+ if (var_is_numeric (weight_var))
+ dict_set_weight (dict, weight_var);
+ else
+ sys_warn (r, -1, _("Ignoring string variable `%s' set "
+ "as weighting variable."),
+ var_get_name (weight_var));
+ }
+ }
+ if (r->extensions[EXT_DISPLAY] != NULL)
+ parse_display_parameters (r, r->extensions[EXT_DISPLAY], dict);
- weight_var = var_by_idx[r->weight_idx];
+ /* The following records use short names, so they need to be parsed before
+ parse_long_var_name_map() changes short names to long names. */
+ decode_mrsets (r, dict);
- if (weight_var == NULL)
- lose ((ME,
- _("%s: Weighting variable may not be a continuation of "
- "a long string variable."), fh_get_file_name (fh)));
- else if (weight_var->type == ALPHA)
- lose ((ME, _("%s: Weighting variable may not be a string variable."),
- fh_get_file_name (fh)));
+ if (r->extensions[EXT_LONG_STRINGS] != NULL
+ && !parse_long_string_map (r, r->extensions[EXT_LONG_STRINGS], dict))
+ goto error;
- dict_set_weight (*dict, weight_var);
- }
- else
- dict_set_weight (*dict, NULL);
+ /* Now rename variables to their long names. */
+ parse_long_var_name_map (r, r->extensions[EXT_LONG_NAMES], dict);
- /* Read records of types 3, 4, 6, and 7. */
- for (;;)
+ /* The following records use long names, so they need to follow renaming. */
+ if (r->extensions[EXT_VAR_ATTRS] != NULL)
{
- int32_t rec_type;
-
- assertive_buf_read (r, &rec_type, sizeof rec_type, 0);
- if (r->reverse_endian)
- bswap_int32 (&rec_type);
-
+ parse_variable_attributes (r, r->extensions[EXT_VAR_ATTRS], dict);
- switch (rec_type)
- {
- case 3:
- if (!read_value_labels (r, *dict, var_by_idx))
- goto error;
- break;
-
- case 4:
- lose ((ME, _("%s: Orphaned variable index record (type 4). Type 4 "
- "records must always immediately follow type 3 "
- "records."),
- fh_get_file_name (r->fh)));
-
- case 6:
- if (!read_documents (r, *dict))
- goto error;
- break;
-
- case 7:
- {
- struct
- {
- int32_t subtype P;
- int32_t size P;
- int32_t count P;
- }
- data;
- unsigned long bytes;
-
- int skip = 0;
-
- assertive_buf_read (r, &data, sizeof data, 0);
- if (r->reverse_endian)
- {
- bswap_int32 (&data.subtype);
- bswap_int32 (&data.size);
- bswap_int32 (&data.count);
- }
- bytes = data.size * data.count;
-
- if (bytes < data.size || bytes < data.count)
- lose ((ME, "%s: Record type %d subtype %d too large.",
- fh_get_file_name (r->fh), rec_type, data.subtype));
-
- switch (data.subtype)
- {
- case 3:
- if (!read_machine_int32_info (r, data.size, data.count))
- goto error;
- break;
-
- case 4:
- if (!read_machine_flt64_info (r, data.size, data.count))
- goto error;
- break;
-
- case 5:
- case 6: /* ?? Used by SPSS 8.0. */
- skip = 1;
- break;
-
- case 11: /* Variable display parameters */
- {
- const int n_vars = data.count / 3 ;
- int i;
- if ( data.count % 3 || n_vars != dict_get_var_cnt(*dict) )
- {
- msg (MW, _("%s: Invalid subrecord length. "
- "Record: 7; Subrecord: 11"),
- fh_get_file_name (r->fh));
- skip = 1;
- break;
- }
-
- for ( i = 0 ; i < min(n_vars, dict_get_var_cnt(*dict)) ; ++i )
- {
- struct
- {
- int32_t measure P;
- int32_t width P;
- int32_t align P;
- }
- params;
-
- struct variable *v;
-
- assertive_buf_read (r, ¶ms, sizeof(params), 0);
-
- if ( ! measure_is_valid(params.measure)
- ||
- ! alignment_is_valid(params.align))
- {
- msg(MW,
- _("%s: Invalid variable display parameters. Default parameters substituted."),
- fh_get_file_name(r->fh));
- continue;
- }
-
- v = dict_get_var(*dict, i);
-
- v->measure = params.measure;
- v->display_width = params.width;
- v->alignment = params.align;
- }
- }
- break;
-
- case 13: /* SPSS 12.0 Long variable name map */
- {
- char *short_name, *save_ptr;
- int idx;
-
- r->has_vls = true;
-
- /* Read data. */
- subrec14data = xmalloc (bytes + 1);
- if (!buf_read (r, subrec14data, bytes, 0))
- {
- goto error;
- }
- subrec14data[bytes] = '\0';
-
- short_to_long = hsh_create(4,
- pair_sn_compare,
- pair_sn_hash,
- pair_sn_free,
- 0);
-
- /* Parse data. */
- for (short_name = strtok_r (subrec14data, "=", &save_ptr), idx = 0;
- short_name != NULL;
- short_name = strtok_r (NULL, "=", &save_ptr), idx++)
- {
- struct name_pair *pair ;
- char *long_name = strtok_r (NULL, "\t", &save_ptr);
- struct variable *v;
-
- /* Validate long name. */
- if (long_name == NULL)
- {
- msg (MW, _("%s: Trailing garbage in long variable "
- "name map."),
- fh_get_file_name (r->fh));
- break;
- }
- if (!var_is_valid_name (long_name, false))
- {
- msg (MW, _("%s: Long variable mapping to invalid "
- "variable name `%s'."),
- fh_get_file_name (r->fh), long_name);
- break;
- }
-
- /* Find variable using short name. */
- v = dict_lookup_var (*dict, short_name);
- if (v == NULL)
- {
- msg (MW, _("%s: Long variable mapping for "
- "nonexistent variable %s."),
- fh_get_file_name (r->fh), short_name);
- break;
- }
-
- /* Identify any duplicates. */
- if ( compare_var_names(short_name, long_name, 0) &&
- NULL != dict_lookup_var (*dict, long_name))
- lose ((ME, _("%s: Duplicate long variable name `%s' "
- "within system file."),
- fh_get_file_name (r->fh), long_name));
-
-
- /* Set long name.
- Renaming a variable may clear the short
- name, but we want to retain it, so
- re-set it explicitly. */
- dict_rename_var (*dict, v, long_name);
- var_set_short_name (v, short_name);
-
- pair = xmalloc(sizeof *pair);
- pair->shortname = short_name;
- pair->longname = long_name;
- hsh_insert(short_to_long, pair);
-#if 0
- /* This messes up the processing of subtype 14 (below).
- I'm not sure if it is needed anyway, so I'm removing it for
- now. If it's needed, then it will need to be done after all the
- records have been processed. --- JMD 27 April 2006
- */
-
- /* For compatability, make sure dictionary
- is in long variable name map order. In
- the common case, this has no effect,
- because the dictionary and the long
- variable name map are already in the
- same order. */
- dict_reorder_var (*dict, v, idx);
-#endif
- }
-
- }
- break;
-
- case 14:
- {
- int j = 0;
- bool eq_seen = false;
- int i;
-
- /* Read data. */
- char *buffer = xmalloc (bytes + 1);
- if (!buf_read (r, buffer, bytes, 0))
- {
- free (buffer);
- goto error;
- }
- buffer[bytes] = '\0';
-
-
- /* Note: SPSS v13 terminates this record with 00,
- whereas SPSS v14 terminates it with 00 09. We must
- accept either */
- for(i = 0; i < bytes ; ++i)
- {
- long int length;
- static char name[SHORT_NAME_LEN + 1] = {0};
- static char len_str[6] ={0};
-
- switch( buffer[i] )
- {
- case '=':
- eq_seen = true;
- j = 0;
- break;
- case '\0':
- length = strtol(len_str, 0, 10);
- if ( length != LONG_MAX && length != LONG_MIN)
- {
- char *lookup_name = name;
- int l;
- int idx;
- struct variable *v;
-
- if ( short_to_long )
- {
- struct name_pair pair;
- struct name_pair *p;
-
- pair.shortname = name;
- p = hsh_find(short_to_long, &pair);
- if ( p )
- lookup_name = p->longname;
- }
-
- v = dict_lookup_var(*dict, lookup_name);
- if ( !v )
- {
- corrupt_msg(MW,
- _("%s: No variable called %s but it is listed in length table."),
- fh_get_file_name (r->fh), lookup_name);
-
- goto error;
-
- }
-
- l = length;
- if ( v->width > EFFECTIVE_LONG_STRING_LENGTH )
- l -= EFFECTIVE_LONG_STRING_LENGTH;
- else
- l -= v->width;
-
- idx = v->index;
- while ( l > 0 )
- {
- struct variable *v_next;
- v_next = dict_get_var(*dict, idx + 1);
-
- if ( v_next->width > EFFECTIVE_LONG_STRING_LENGTH )
- l -= EFFECTIVE_LONG_STRING_LENGTH;
- else
- l -= v_next->width;
-
- hsh_delete(r->var_hash, v_next);
-
- dict_delete_var(*dict, v_next);
- }
-
- assert ( length > MAX_LONG_STRING );
-
- v->width = length;
- v->print.w = v->width;
- v->write.w = v->width;
- v->nv = DIV_RND_UP (length, MAX_SHORT_STRING);
- }
- eq_seen = false;
- memset(name, 0, SHORT_NAME_LEN+1);
- memset(len_str, 0, 6);
- j = 0;
- break;
- case '\t':
- break;
- default:
- if ( eq_seen )
- len_str[j] = buffer[i];
- else
- name[j] = buffer[i];
- j++;
- break;
- }
- }
- free(buffer);
- dict_compact_values(*dict);
- }
- break;
-
- default:
- msg (MW, _("%s: Unrecognized record type 7, subtype %d "
- "encountered in system file."),
- fh_get_file_name (r->fh), data.subtype);
- skip = 1;
- }
-
- if (skip)
- {
- void *x = buf_read (r, NULL, data.size * data.count, 0);
- if (x == NULL)
- goto error;
- free (x);
- }
- }
- break;
-
- case 999:
- {
- int32_t filler;
-
- assertive_buf_read (r, &filler, sizeof filler, 0);
-
- goto success;
- }
-
- default:
- corrupt_msg(MW, _("%s: Unrecognized record type %d."),
- fh_get_file_name (r->fh), rec_type);
- }
+ /* Roles use the $@Role attribute. */
+ assign_variable_roles (r, dict);
}
- success:
- /* Come here on successful completion. */
-
- free (var_by_idx);
- hsh_destroy(short_to_long);
- free (subrec14data);
- return r;
-
- error:
- /* Come here on unsuccessful completion. */
- sfm_close_reader (r);
- free (var_by_idx);
- hsh_destroy(short_to_long);
- free (subrec14data);
- if (*dict != NULL)
+ if (r->extensions[EXT_LONG_LABELS] != NULL)
+ parse_long_string_value_labels (r, r->extensions[EXT_LONG_LABELS], dict);
+ if (r->extensions[EXT_LONG_MISSING] != NULL)
+ parse_long_string_missing_values (r, r->extensions[EXT_LONG_MISSING],
+ dict);
+
+ /* Warn if the actual amount of data per case differs from the
+ amount that the header claims. SPSS version 13 gets this
+ wrong when very long strings are involved, so don't warn in
+ that case. */
+ if (r->header.nominal_case_size != -1
+ && r->header.nominal_case_size != r->n_vars
+ && r->info.version_major != 13)
+ sys_warn (r, -1, _("File header claims %d variable positions but "
+ "%zu were read from file."),
+ r->header.nominal_case_size, r->n_vars);
+
+ /* Create an index of dictionary variable widths for
+ sfm_read_case to use. We cannot use the `struct variable's
+ from the dictionary we created, because the caller owns the
+ dictionary and may destroy or modify its variables. */
+ sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
+ pool_register (r->pool, free, r->sfm_vars);
+ r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
+
+ *dictp = dict;
+ if (infop)
{
- dict_destroy (*dict);
- *dict = NULL;
+ *infop = r->info;
+ memset (&r->info, 0, sizeof r->info);
}
+
+ return casereader_create_sequential
+ (NULL, r->proto,
+ r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
+ &sys_file_casereader_class, r);
+
+error:
+ sfm_close (r_);
+ dict_destroy (dict);
+ *dictp = NULL;
return NULL;
}
-/* Read record type 7, subtype 3. */
-static int
-read_machine_int32_info (struct sfm_reader *r, int size, int count)
+/* Closes R, which should have been returned by sfm_open() but not already
+ closed with sfm_decode() or this function.
+ Returns true if successful, false if an I/O error has occurred on R. */
+static bool
+sfm_close (struct any_reader *r_)
{
- int32_t data[8];
- int file_bigendian;
+ struct sfm_reader *r = sfm_reader_cast (r_);
+ bool error;
- int i;
+ if (r->file)
+ {
+ if (fn_close (r->fh, r->file) == EOF)
+ {
+ msg (ME, _("Error closing system file `%s': %s."),
+ fh_get_file_name (r->fh), strerror (errno));
+ r->error = true;
+ }
+ r->file = NULL;
+ }
- if (size != sizeof (int32_t) || count != 8)
- lose ((ME, _("%s: Bad size (%d) or count (%d) field on record type 7, "
- "subtype 3. Expected size %d, count 8."),
- fh_get_file_name (r->fh), size, count, sizeof (int32_t)));
-
- assertive_buf_read (r, data, sizeof data, 0);
- if (r->reverse_endian)
- for (i = 0; i < 8; i++)
- bswap_int32 (&data[i]);
-
-#ifdef FPREP_IEEE754
- if (data[4] != 1)
- lose ((ME, _("%s: Floating-point representation in system file is not "
- "IEEE-754. PSPP cannot convert between floating-point "
- "formats."),
- fh_get_file_name (r->fh)));
-#else
-#error Add support for your floating-point format.
-#endif
-
-#ifdef WORDS_BIGENDIAN
- file_bigendian = 1;
-#else
- file_bigendian = 0;
-#endif
- if (r->reverse_endian)
- file_bigendian ^= 1;
- if (file_bigendian ^ (data[6] == 1))
- lose ((ME, _("%s: File-indicated endianness (%s) does not match "
- "endianness intuited from file header (%s)."),
- fh_get_file_name (r->fh),
- file_bigendian ? _("big-endian") : _("little-endian"),
- data[6] == 1 ? _("big-endian") : (data[6] == 2 ? _("little-endian")
- : _("unknown"))));
-
- /* PORTME: Character representation code. */
- if (data[7] != 2 && data[7] != 3)
- lose ((ME, _("%s: File-indicated character representation code (%s) is "
- "not ASCII."),
- fh_get_file_name (r->fh),
- (data[7] == 1 ? "EBCDIC"
- : (data[7] == 4 ? _("DEC Kanji") : _("Unknown")))));
+ any_read_info_destroy (&r->info);
+ fh_unlock (r->lock);
+ fh_unref (r->fh);
- return 1;
+ error = r->error;
+ pool_destroy (r->pool);
- error:
- return 0;
+ return !error;
}
-/* Read record type 7, subtype 4. */
-static int
-read_machine_flt64_info (struct sfm_reader *r, int size, int count)
+/* Destroys READER. */
+static void
+sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
{
- flt64 data[3];
- int i;
-
- if (size != sizeof (flt64) || count != 3)
- lose ((ME, _("%s: Bad size (%d) or count (%d) field on record type 7, "
- "subtype 4. Expected size %d, count 8."),
- fh_get_file_name (r->fh), size, count, sizeof (flt64)));
-
- assertive_buf_read (r, data, sizeof data, 0);
- if (r->reverse_endian)
- for (i = 0; i < 3; i++)
- bswap_flt64 (&data[i]);
-
- if (data[0] != SYSMIS || data[1] != FLT64_MAX
- || data[2] != second_lowest_flt64)
- {
- r->sysmis = data[0];
- r->highest = data[1];
- r->lowest = data[2];
- msg (MW, _("%s: File-indicated value is different from internal value "
- "for at least one of the three system values. SYSMIS: "
- "indicated %g, expected %g; HIGHEST: %g, %g; LOWEST: "
- "%g, %g."),
- fh_get_file_name (r->fh), (double) data[0], (double) SYSMIS,
- (double) data[1], (double) FLT64_MAX,
- (double) data[2], (double) second_lowest_flt64);
- }
-
- return 1;
-
- error:
- return 0;
+ struct sfm_reader *r = r_;
+ sfm_close (&r->any_reader);
}
+/* Detects whether FILE is an SPSS system file. Returns 1 if so, 0 if not, and
+ a negative errno value if there is an error reading FILE. */
static int
-read_header (struct sfm_reader *r,
- struct dictionary *dict, struct sfm_read_info *info)
+sfm_detect (FILE *file)
{
- struct sysfile_header hdr; /* Disk buffer. */
- char prod_name[sizeof hdr.prod_name + 1]; /* Buffer for product name. */
- int skip_amt = 0; /* Amount of product name to omit. */
- int i;
+ char magic[5];
- /* Read header, check magic. */
- assertive_buf_read (r, &hdr, sizeof hdr, 0);
- if (strncmp ("$FL2", hdr.rec_type, 4) != 0)
- lose ((ME, _("%s: Bad magic. Proper system files begin with "
- "the four characters `$FL2'. This file will not be read."),
- fh_get_file_name (r->fh)));
-
- /* Check eye-catcher string. */
- memcpy (prod_name, hdr.prod_name, sizeof hdr.prod_name);
- for (i = 0; i < 60; i++)
- if (!c_isprint ((unsigned char) prod_name[i]))
- prod_name[i] = ' ';
- for (i = 59; i >= 0; i--)
- if (!c_isgraph ((unsigned char) prod_name[i]))
- {
- prod_name[i] = '\0';
- break;
- }
- prod_name[60] = '\0';
-
- {
-#define N_PREFIXES 2
- static const char *prefix[N_PREFIXES] =
- {
- "@(#) SPSS DATA FILE",
- "SPSS SYSTEM FILE.",
- };
+ if (fseek (file, 0, SEEK_SET) != 0)
+ return -errno;
+ if (fread (magic, 4, 1, file) != 1)
+ return ferror (file) ? -errno : 0;
+ magic[4] = '\0';
- int i;
+ return (!strcmp (ASCII_MAGIC, magic)
+ || !strcmp (ASCII_ZMAGIC, magic)
+ || !strcmp (EBCDIC_MAGIC, magic));
+}
+\f
+/* Reads the global header of the system file. Initializes *HEADER and *INFO,
+ except for the string fields in *INFO, which parse_header() will initialize
+ later once the file's encoding is known. */
+static bool
+read_header (struct sfm_reader *r, struct any_read_info *info,
+ struct sfm_header_record *header)
+{
+ uint8_t raw_layout_code[4];
+ uint8_t raw_bias[8];
+ int compressed;
+ bool zmagic;
- for (i = 0; i < N_PREFIXES; i++)
- if (!strncmp (prefix[i], hdr.prod_name, strlen (prefix[i])))
- {
- skip_amt = strlen (prefix[i]);
- break;
- }
- }
-
- /* Check endianness. */
- if (hdr.layout_code == 2)
- r->reverse_endian = 0;
+ if (!read_string (r, header->magic, sizeof header->magic)
+ || !read_string (r, header->eye_catcher, sizeof header->eye_catcher))
+ return false;
+
+ if (!strcmp (ASCII_MAGIC, header->magic)
+ || !strcmp (EBCDIC_MAGIC, header->magic))
+ zmagic = false;
+ else if (!strcmp (ASCII_ZMAGIC, header->magic))
+ zmagic = true;
else
{
- bswap_int32 (&hdr.layout_code);
- if (hdr.layout_code != 2)
- lose ((ME, _("%s: File layout code has unexpected value %d. Value "
- "should be 2, in big-endian or little-endian format."),
- fh_get_file_name (r->fh), hdr.layout_code));
+ sys_error (r, 0, _("This is not an SPSS system file."));
+ return false;
+ }
- r->reverse_endian = 1;
- bswap_int32 (&hdr.nominal_case_size);
- bswap_int32 (&hdr.compress);
- bswap_int32 (&hdr.weight_idx);
- bswap_int32 (&hdr.case_cnt);
- bswap_flt64 (&hdr.bias);
+ /* Identify integer format. */
+ if (!read_bytes (r, raw_layout_code, sizeof raw_layout_code))
+ return false;
+ if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code,
+ &r->integer_format)
+ && !integer_identify (3, raw_layout_code, sizeof raw_layout_code,
+ &r->integer_format))
+ || (r->integer_format != INTEGER_MSB_FIRST
+ && r->integer_format != INTEGER_LSB_FIRST))
+ {
+ sys_error (r, 64, _("This is not an SPSS system file."));
+ return false;
}
+ if (!read_int (r, &header->nominal_case_size))
+ return false;
- /* Copy basic info and verify correctness. */
- r->value_cnt = hdr.nominal_case_size;
+ if (header->nominal_case_size < 0
+ || header->nominal_case_size > INT_MAX / 16)
+ header->nominal_case_size = -1;
- /* If value count is ridiculous, then force it to -1 (a
- sentinel value). */
- if ( r->value_cnt < 0 ||
- r->value_cnt > (INT_MAX / (int) sizeof (union value) / 2))
- r->value_cnt = -1;
+ if (!read_int (r, &compressed))
+ return false;
+ if (!zmagic)
+ {
+ if (compressed == 0)
+ r->compression = ANY_COMP_NONE;
+ else if (compressed == 1)
+ r->compression = ANY_COMP_SIMPLE;
+ else if (compressed != 0)
+ {
+ sys_error (r, 0, "System file header has invalid compression "
+ "value %d.", compressed);
+ return false;
+ }
+ }
+ else
+ {
+ if (compressed == 2)
+ r->compression = ANY_COMP_ZLIB;
+ else
+ {
+ sys_error (r, 0, "ZLIB-compressed system file header has invalid "
+ "compression value %d.", compressed);
+ return false;
+ }
+ }
- r->compressed = hdr.compress;
+ if (!read_int (r, &header->weight_idx))
+ return false;
- r->weight_idx = hdr.weight_idx - 1;
+ if (!read_int (r, &r->case_cnt))
+ return false;
+ if ( r->case_cnt > INT_MAX / 2)
+ r->case_cnt = -1;
- r->case_cnt = hdr.case_cnt;
- if (r->case_cnt < -1 || r->case_cnt > INT_MAX / 2)
- lose ((ME,
- _("%s: Number of cases in file (%ld) is not between -1 and %d."),
- fh_get_file_name (r->fh), (long) r->case_cnt, INT_MAX / 2));
+ /* Identify floating-point format and obtain compression bias. */
+ if (!read_bytes (r, raw_bias, sizeof raw_bias))
+ return false;
+ if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
+ {
+ uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
- r->bias = hdr.bias;
- if (r->bias != 100.0)
- corrupt_msg (MW, _("%s: Compression bias (%g) is not the usual "
- "value of 100."),
- fh_get_file_name (r->fh), r->bias);
+ if (memcmp (raw_bias, zero_bias, 8))
+ sys_warn (r, r->pos - 8,
+ _("Compression bias is not the usual "
+ "value of 100, or system file uses unrecognized "
+ "floating-point format."));
+ else
+ {
+ /* Some software is known to write all-zeros to this
+ field. Such software also writes floating-point
+ numbers in the format that we expect by default
+ (it seems that all software most likely does, in
+ reality), so don't warn in this case. */
+ }
- /* Make a file label only on the condition that the given label is
- not all spaces or nulls. */
- {
- int i;
+ if (r->integer_format == INTEGER_MSB_FIRST)
+ r->float_format = FLOAT_IEEE_DOUBLE_BE;
+ else
+ r->float_format = FLOAT_IEEE_DOUBLE_LE;
+ }
+ float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
- for (i = sizeof hdr.file_label - 1; i >= 0; i--)
- {
- if (!c_isspace ((unsigned char) hdr.file_label[i])
- && hdr.file_label[i] != 0)
- {
- char *label = xmalloc (i + 2);
- memcpy (label, hdr.file_label, i + 1);
- label[i + 1] = 0;
- dict_set_label (dict, label);
- free (label);
- break;
- }
- }
- }
+ if (!read_string (r, header->creation_date, sizeof header->creation_date)
+ || !read_string (r, header->creation_time, sizeof header->creation_time)
+ || !read_string (r, header->file_label, sizeof header->file_label)
+ || !skip_bytes (r, 3))
+ return false;
- if (info)
- {
- char *cp;
+ info->integer_format = r->integer_format;
+ info->float_format = r->float_format;
+ info->compression = r->compression;
+ info->case_cnt = r->case_cnt;
- memcpy (info->creation_date, hdr.creation_date, 9);
- info->creation_date[9] = 0;
+ return true;
+}
- memcpy (info->creation_time, hdr.creation_time, 8);
- info->creation_time[8] = 0;
+/* Reads a variable (type 2) record from R into RECORD. */
+static bool
+read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
+{
+ int has_variable_label;
-#ifdef WORDS_BIGENDIAN
- info->big_endian = !r->reverse_endian;
-#else
- info->big_endian = r->reverse_endian;
-#endif
+ memset (record, 0, sizeof *record);
- info->compressed = hdr.compress;
+ record->pos = r->pos;
+ if (!read_int (r, &record->width)
+ || !read_int (r, &has_variable_label)
+ || !read_int (r, &record->missing_value_code)
+ || !read_int (r, &record->print_format)
+ || !read_int (r, &record->write_format)
+ || !read_string (r, record->name, sizeof record->name))
+ return false;
- info->case_cnt = hdr.case_cnt;
+ if (has_variable_label == 1)
+ {
+ enum { MAX_LABEL_LEN = 65536 };
+ unsigned int len, read_len;
- for (cp = &prod_name[skip_amt]; cp < &prod_name[60]; cp++)
- if (c_isgraph ((unsigned char) *cp))
- break;
- strcpy (info->product, cp);
- }
+ if (!read_uint (r, &len))
+ return false;
- return 1;
+ /* Read up to MAX_LABEL_LEN bytes of label. */
+ read_len = MIN (MAX_LABEL_LEN, len);
+ record->label = pool_malloc (r->pool, read_len + 1);
+ if (!read_string (r, record->label, read_len + 1))
+ return false;
- error:
- return 0;
-}
+ /* Skip unread label bytes. */
+ if (!skip_bytes (r, len - read_len))
+ return false;
-/* Reads most of the dictionary from file H; also fills in the
- associated VAR_BY_IDX array. */
-static int
-read_variables (struct sfm_reader *r,
- struct dictionary *dict, struct variable ***var_by_idx)
-{
- int i;
+ /* Skip label padding up to multiple of 4 bytes. */
+ if (!skip_bytes (r, ROUND_UP (len, 4) - len))
+ return false;
+ }
+ else if (has_variable_label != 0)
+ {
+ sys_error (r, record->pos,
+ _("Variable label indicator field is not 0 or 1."));
+ return false;
+ }
- struct sysfile_variable sv; /* Disk buffer. */
- int long_string_count = 0; /* # of long string continuation
- records still expected. */
- int next_value = 0; /* Index to next `value' structure. */
+ /* Set missing values. */
+ if (record->missing_value_code != 0)
+ {
+ int code = record->missing_value_code;
+ if (record->width == 0)
+ {
+ if (code < -3 || code > 3 || code == -1)
+ {
+ sys_error (r, record->pos,
+ _("Numeric missing value indicator field is not "
+ "-3, -2, 0, 1, 2, or 3."));
+ return false;
+ }
+ }
+ else
+ {
+ if (code < 1 || code > 3)
+ {
+ sys_error (r, record->pos,
+ _("String missing value indicator field is not "
+ "0, 1, 2, or 3."));
+ return false;
+ }
+ }
- assert(r);
+ if (!read_bytes (r, record->missing, 8 * abs (code)))
+ return false;
+ }
- *var_by_idx = 0;
+ return true;
+}
+/* Reads value labels from R into RECORD. */
+static bool
+read_value_label_record (struct sfm_reader *r,
+ struct sfm_value_label_record *record)
+{
+ size_t i;
+ int type;
- /* Read in the entry for each variable and use the info to
- initialize the dictionary. */
- for (i = 0; ; ++i)
+ /* Read type 3 record. */
+ record->pos = r->pos;
+ if (!read_uint (r, &record->n_labels))
+ return false;
+ if (record->n_labels > UINT_MAX / sizeof *record->labels)
{
- struct variable *vv;
- char name[SHORT_NAME_LEN + 1];
- int nv;
- int j;
+ sys_error (r, r->pos - 4, _("Invalid number of labels %u."),
+ record->n_labels);
+ return false;
+ }
+ record->labels = pool_nmalloc (r->pool, record->n_labels,
+ sizeof *record->labels);
+ for (i = 0; i < record->n_labels; i++)
+ {
+ struct sfm_value_label *label = &record->labels[i];
+ unsigned char label_len;
+ size_t padded_len;
- assertive_buf_read (r, &sv, sizeof sv, 0);
+ if (!read_bytes (r, label->value, sizeof label->value))
+ return false;
- if (r->reverse_endian)
- {
- bswap_int32 (&sv.rec_type);
- bswap_int32 (&sv.type);
- bswap_int32 (&sv.has_var_label);
- bswap_int32 (&sv.n_missing_values);
- bswap_int32 (&sv.print);
- bswap_int32 (&sv.write);
- }
+ /* Read label length. */
+ if (!read_bytes (r, &label_len, sizeof label_len))
+ return false;
+ padded_len = ROUND_UP (label_len + 1, 8);
- /* We've come to the end of the variable entries */
- if (sv.rec_type != 2)
- {
- buf_unread(r, sizeof sv);
- r->value_cnt = i;
- break;
+ /* Read label, padding. */
+ label->label = pool_malloc (r->pool, padded_len + 1);
+ if (!read_bytes (r, label->label, padded_len - 1))
+ return false;
+ label->label[label_len] = '\0';
+ }
+
+ /* Read record type of type 4 record. */
+ if (!read_int (r, &type))
+ return false;
+ if (type != 4)
+ {
+ sys_error (r, r->pos - 4,
+ _("Variable index record (type 4) does not immediately "
+ "follow value label record (type 3) as it should."));
+ return false;
+ }
+
+ /* Read number of variables associated with value label from type 4
+ record. */
+ if (!read_uint (r, &record->n_vars))
+ return false;
+ if (record->n_vars < 1 || record->n_vars > r->n_vars)
+ {
+ sys_error (r, r->pos - 4,
+ _("Number of variables associated with a value label (%u) "
+ "is not between 1 and the number of variables (%zu)."),
+ record->n_vars, r->n_vars);
+ return false;
+ }
+
+ record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
+ for (i = 0; i < record->n_vars; i++)
+ if (!read_int (r, &record->vars[i]))
+ return false;
+
+ return true;
+}
+
+/* Reads a document record from R and returns it.  The record and its line
+   data are allocated from R's pool.  Returns NULL if reading fails or if the
+   line count stored in the file is not greater than 0 and less than
+   INT_MAX / DOC_LINE_LENGTH. */
+static struct sfm_document_record *
+read_document_record (struct sfm_reader *r)
+{
+  struct sfm_document_record *doc = pool_malloc (r->pool, sizeof *doc);
+  int line_count;
+
+  /* Remember where the record starts so that errors can point at it. */
+  doc->pos = r->pos;
+
+  if (!read_int (r, &line_count))
+    return NULL;
+
+  /* Bound the count so that DOC_LINE_LENGTH * line_count, computed below,
+     stays under INT_MAX. */
+  if (!(line_count > 0 && line_count < INT_MAX / DOC_LINE_LENGTH))
+    {
+      sys_error (r, doc->pos,
+                 _("Number of document lines (%d) "
+                   "must be greater than 0 and less than %d."),
+                 line_count, INT_MAX / DOC_LINE_LENGTH);
+      return NULL;
+    }
+
+  doc->n_lines = line_count;
+  doc->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * line_count);
+  return (read_bytes (r, doc->documents, DOC_LINE_LENGTH * line_count)
+          ? doc : NULL);
+}
+
+/* Reads the size and count fields of an extension record from R into
+   RECORD, after storing SUBTYPE and the current file position there.
+   Returns true if successful, false if a read fails or if the record's
+   total size (SIZE * COUNT, plus one byte) is too large. */
+static bool
+read_extension_record_header (struct sfm_reader *r, int subtype,
+                              struct sfm_extension_record *record)
+{
+  record->subtype = subtype;
+  record->pos = r->pos;
+
+  if (!read_uint (r, &record->size))
+    return false;
+  if (!read_uint (r, &record->count))
+    return false;
+
+  /* A zero element size is accepted without looking at the count. */
+  if (record->size == 0)
+    return true;
+
+  /* Make sure that SIZE * COUNT + 1 doesn't overflow.  The extra byte
+     leaves room for a null terminator, used by some extension processing
+     routines. */
+  if (xsum (1, xtimes (record->count, record->size)) < UINT_MAX)
+    return true;
+
+  sys_error (r, record->pos, "Record type 7 subtype %d too large.",
+             subtype);
+  return false;
+}
+
+/* Reads an extension record of the given SUBTYPE from R.
+
+   If SUBTYPE is a known, implemented type whose size and count fields match
+   what is expected, the record's data is read into pool storage (with a
+   null terminator appended) and the record is stored in *RECORDP.
+   Otherwise the record's data is skipped and *RECORDP is set to NULL; a
+   warning is issued unless SUBTYPE is one of the deliberately ignored
+   types.
+
+   Returns true if successful, false if reading or skipping fails. */
+static bool
+read_extension_record (struct sfm_reader *r, int subtype,
+                       struct sfm_extension_record **recordp)
+{
+  struct extension_record_type
+    {
+      int subtype;
+      int size;
+      int count;
+    };
+
+  /* Expected element size and count for each subtype.  A zero means "not
+     checked"; entries with both fields zero are skipped without reading. */
+  static const struct extension_record_type types[] =
+    {
+      /* Implemented record types. */
+      { EXT_INTEGER,      4, 8 },
+      { EXT_FLOAT,        8, 3 },
+      { EXT_MRSETS,       1, 0 },
+      { EXT_PRODUCT_INFO, 1, 0 },
+      { EXT_DISPLAY,      4, 0 },
+      { EXT_LONG_NAMES,   1, 0 },
+      { EXT_LONG_STRINGS, 1, 0 },
+      { EXT_NCASES,       8, 2 },
+      { EXT_FILE_ATTRS,   1, 0 },
+      { EXT_VAR_ATTRS,    1, 0 },
+      { EXT_MRSETS2,      1, 0 },
+      { EXT_ENCODING,     1, 0 },
+      { EXT_LONG_LABELS,  1, 0 },
+      { EXT_LONG_MISSING, 1, 0 },
+
+      /* Ignored record types. */
+      { EXT_VAR_SETS,     0, 0 },
+      { EXT_DATE,         0, 0 },
+      { EXT_DATA_ENTRY,   0, 0 },
+      { EXT_DATAVIEW,     0, 0 },
+    };
+  const size_t n_types = sizeof types / sizeof *types;
+
+  const struct extension_record_type *match = NULL;
+  struct sfm_extension_record *record;
+  size_t n_bytes;
+  size_t idx;
+
+  *recordp = NULL;
+  record = pool_malloc (r->pool, sizeof *record);
+  if (!read_extension_record_header (r, subtype, record))
+    return false;
+  n_bytes = record->count * record->size;
+
+  /* Find the first table entry for SUBTYPE, if there is one. */
+  for (idx = 0; idx < n_types; idx++)
+    if (types[idx].subtype == subtype)
+      {
+        match = &types[idx];
+        break;
+      }
+
+  if (match == NULL)
+    sys_warn (r, record->pos,
+              _("Unrecognized record type 7, subtype %d.  For help, please "
+                "send this file to %s and mention that you were using %s."),
+              subtype, PACKAGE_BUGREPORT, PACKAGE_STRING);
+  else if (match->size > 0 && record->size != match->size)
+    sys_warn (r, record->pos,
+              _("Record type 7, subtype %d has bad size %u "
+                "(expected %d)."), subtype, record->size, match->size);
+  else if (match->count > 0 && record->count != match->count)
+    sys_warn (r, record->pos,
+              _("Record type 7, subtype %d has bad count %u "
+                "(expected %d)."), subtype, record->count, match->count);
+  else if (match->count != 0 || match->size != 0)
+    {
+      /* Implemented type with acceptable geometry: read the payload,
+         keeping one extra byte for a null terminator. */
+      char *data = pool_malloc (r->pool, n_bytes + 1);
+      data[n_bytes] = '\0';
+
+      record->data = data;
+      if (!read_bytes (r, record->data, n_bytes))
+        return false;
+      *recordp = record;
+      return true;
+    }
+
+  /* Unrecognized, malformed, or deliberately ignored: step over the
+     payload. */
+  return skip_bytes (r, n_bytes);
+}
+
+/* Reads the header of an extension record of the given SUBTYPE from R and
+   then skips over the record's data without storing it.  Returns true if
+   successful, false if reading the header or skipping the data fails. */
+static bool
+skip_extension_record (struct sfm_reader *r, int subtype)
+{
+  struct sfm_extension_record header;
+
+  if (!read_extension_record_header (r, subtype, &header))
+    return false;
+
+  return skip_bytes (r, header.count * header.size);
+}
+
+/* Converts the string fields of HEADER from DICT's encoding to UTF-8 and
+   stores them: the file label becomes DICT's label, and the creation date,
+   creation time, and product name are stored into INFO. */
+static void
+parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
+              struct any_read_info *info, struct dictionary *dict)
+{
+  const char *encoding = dict_get_encoding (dict);
+  struct substring file_label
+    = recode_substring_pool ("UTF-8", encoding,
+                             ss_cstr (header->file_label), r->pool);
+  struct substring product_name;
+  char *label_text;
+
+  /* File label: trim spaces, null-terminate the trimmed text in place,
+     fix its line ends, and hand the result to DICT. */
+  ss_trim (&file_label, ss_cstr (" "));
+  file_label.string[file_label.length] = '\0';
+  label_text = fix_line_ends (file_label.string);
+  dict_set_label (dict, label_text);
+  free (label_text);
+
+  /* Creation date and time, in UTF-8. */
+  info->creation_date = recode_string ("UTF-8", encoding,
+                                       header->creation_date, -1);
+  info->creation_time = recode_string ("UTF-8", encoding,
+                                       header->creation_time, -1);
+
+  /* Product name: drop the eye-catcher prefix if present, then trim
+     spaces. */
+  product_name = recode_substring_pool ("UTF-8", encoding,
+                                        ss_cstr (header->eye_catcher),
+                                        r->pool);
+  ss_match_string (&product_name, ss_cstr ("@(#) SPSS DATA FILE"));
+  ss_trim (&product_name, ss_cstr (" "));
+  info->product = ss_xstrdup (product_name);
+}
+
+/* Parses the N_VAR_RECS variable (type 2) records in VAR_RECS and adds the
+ corresponding variables to DICT, recoding names and labels from DICT's
+ encoding to UTF-8.
+ Also skips past additional variable records for long string
+ variables.
+ Returns true if successful, false (after reporting an error) if a record
+ has an invalid name or width or a string lacks a continuation record. */
+static bool
+parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
+ struct sfm_var_record *var_recs, size_t n_var_recs)
+{
+ const char *dict_encoding = dict_get_encoding (dict);
+ struct sfm_var_record *rec;
+ int n_warnings = 0;
+
+ /* REC is advanced at the bottom of the loop body by the number of records
+ that the variable occupies, not by a fixed step. */
+ for (rec = var_recs; rec < &var_recs[n_var_recs]; )
+ {
+ struct variable *var;
+ size_t n_values;
+ char *name;
+ size_t i;
+
+ /* Recode the name to UTF-8 and cut it off at the first space. */
+ name = recode_string_pool ("UTF-8", dict_encoding,
+ rec->name, -1, r->pool);
+ name[strcspn (name, " ")] = '\0';
+
+ /* Names beginning with `$' or `#' are rejected along with names that
+ DICT considers invalid. */
+ if (!dict_id_is_valid (dict, name, false)
+ || name[0] == '$' || name[0] == '#')
+ {
+ sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
+ return false;
+ }
+
+ if (rec->width < 0 || rec->width > 255)
+ {
+ sys_error (r, rec->pos,
+ _("Bad width %d for variable %s."), rec->width, name);
+ return false;
+ }
+
+ /* A null return from dict_create_var() is treated as a duplicate name:
+ warn and create the variable under a generated unique name. */
+ var = rec->var = dict_create_var (dict, name, rec->width);
+ if (var == NULL)
+ {
+ char *new_name = dict_make_unique_var_name (dict, NULL, NULL);
+ sys_warn (r, rec->pos, _("Renaming variable with duplicate name "
+ "`%s' to `%s'."),
+ name, new_name);
+ var = rec->var = dict_create_var_assert (dict, new_name, rec->width);
+ free (new_name);
+ }
+
+ /* Set the short name the same as the long name. */
+ var_set_short_name (var, 0, name);
+
+ /* Get variable label, if any. */
+ if (rec->label)
+ {
+ char *utf8_label;
+
+ utf8_label = recode_string_pool ("UTF-8", dict_encoding,
+ rec->label, -1, r->pool);
+ var_set_label (var, utf8_label);
+ }
+
+ /* Set missing values. */
+ if (rec->missing_value_code != 0)
+ {
+ int width = var_get_width (var);
+ struct missing_values mv;
+
+ mv_init_pool (r->pool, &mv, width);
+ if (var_is_numeric (var))
+ {
+ /* A negative code means a range of missing values; -3 means a
+ range followed by one discrete value. A positive code is the
+ number of discrete values. */
+ bool has_range = rec->missing_value_code < 0;
+ int n_discrete = (has_range
+ ? rec->missing_value_code == -3
+ : rec->missing_value_code);
+ int ofs = 0;
+
+ if (has_range)
+ {
+ double low = parse_float (r, rec->missing, 0);
+ double high = parse_float (r, rec->missing, 8);
+
+ /* Deal with SPSS 21 change in representation. */
+ if (low == SYSMIS)
+ low = LOWEST;
+
+ mv_add_range (&mv, low, high);
+ ofs += 16;
+ }
+
+ for (i = 0; i < n_discrete; i++)
+ {
+ mv_add_num (&mv, parse_float (r, rec->missing, ofs));
+ ofs += 8;
+ }
+ }
+ else
+ /* String missing values are stored in 8-byte slots.
+ NOTE(review): this loop relies on missing_value_code being
+ nonnegative for string variables; presumably that is validated
+ where the record is read -- confirm. */
+ for (i = 0; i < rec->missing_value_code; i++)
+ mv_add_str (&mv, rec->missing + 8 * i, MIN (width, 8));
+ var_set_missing_values (var, &mv);
+ }
+
+ /* Set formats. The positions passed are only used to locate
+ warnings. */
+ parse_format_spec (r, rec->pos + 12, rec->print_format,
+ PRINT_FORMAT, var, &n_warnings);
+ parse_format_spec (r, rec->pos + 16, rec->write_format,
+ WRITE_FORMAT, var, &n_warnings);
+
+ /* Account for values.
+ Skip long string continuation records, if any. A numeric variable
+ occupies one record; a string occupies one per 8 bytes of width, and
+ each record after the first must have width -1. */
+ n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
+ for (i = 1; i < n_values; i++)
+ if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
+ {
+ sys_error (r, rec->pos, _("Missing string continuation record."));
+ return false;
+ }
+ rec += n_values;
+ }
+
+ return true;
+}
+
+/* Decodes FORMAT, a format specification as stored in a system file (type in
+   bits 16-23, width in bits 8-15, decimals in bits 0-7), and applies it to V
+   as its print or write format according to WHICH.  An all-zero FORMAT is
+   silently ignored.  Any other invalid FORMAT is counted in *N_WARNINGS and,
+   for the first few, reported with a warning located at POS. */
+static void
+parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
+                   enum which_format which, struct variable *v,
+                   int *n_warnings)
+{
+  const int warning_limit = 8;
+  struct fmt_spec spec;
+  bool valid;
+
+  spec.w = (uint8_t) (format >> 8);
+  spec.d = (uint8_t) format;
+
+  /* Validate quietly; this function issues its own warning below. */
+  msg_disable ();
+  valid = (fmt_from_io ((uint8_t) (format >> 16), &spec.type)
+           && fmt_check_output (&spec)
+           && fmt_check_width_compat (&spec, var_get_width (v)));
+  msg_enable ();
+
+  if (valid)
+    {
+      if (which == PRINT_FORMAT)
+        var_set_print_format (v, &spec);
+      else
+        var_set_write_format (v, &spec);
+      return;
+    }
+
+  /* Actually observed in the wild.  No point in warning about it. */
+  if (format == 0)
+    return;
+
+  if (++*n_warnings > warning_limit)
+    return;
+
+  if (which == PRINT_FORMAT)
+    sys_warn (r, pos, _("Variable %s with width %d has invalid print "
+                        "format 0x%x."),
+              var_get_name (v), var_get_width (v), format);
+  else
+    sys_warn (r, pos, _("Variable %s with width %d has invalid write "
+                        "format 0x%x."),
+              var_get_name (v), var_get_width (v), format);
+
+  if (*n_warnings == warning_limit)
+    sys_warn (r, -1, _("Suppressing further invalid format warnings."));
+}
+
+/* Adds each of the N_LINES fixed-length lines in RECORD to DICT as a document
+   line, after recoding it from DICT's encoding to UTF-8 and dropping its
+   trailing spaces. */
+static void
+parse_document (struct dictionary *dict, struct sfm_document_record *record)
+{
+  const char *end = record->documents + DOC_LINE_LENGTH * record->n_lines;
+  const char *cur = record->documents;
+
+  while (cur < end)
+    {
+      struct substring utf8_line;
+
+      /* A null pool: the recoded line is freed below with ss_dealloc(). */
+      utf8_line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
+                                         ss_buffer (cur, DOC_LINE_LENGTH),
+                                         NULL);
+      ss_rtrim (&utf8_line, ss_cstr (" "));
+      utf8_line.string[utf8_line.length] = '\0';
+
+      dict_add_document_line (dict, utf8_line.string, false);
+
+      ss_dealloc (&utf8_line);
+      cur += DOC_LINE_LENGTH;
+    }
+}
+
+/* Parses record type 7, subtype 3: stores the version numbers from RECORD in
+   INFO and compares the floating-point and integer representation codes in
+   RECORD with the ones R already expects.  A floating-point mismatch is an
+   error and yields false; an integer mismatch only draws a warning. */
+static bool
+parse_machine_integer_info (struct sfm_reader *r,
+                            const struct sfm_extension_record *record,
+                            struct any_read_info *info)
+{
+  int file_float, want_float;
+  int file_integer, want_integer;
+
+  /* Version information. */
+  info->version_major = parse_int (r, record->data, 0);
+  info->version_minor = parse_int (r, record->data, 4);
+  info->version_revision = parse_int (r, record->data, 8);
+
+  /* Floating-point representation. */
+  file_float = parse_int (r, record->data, 16);
+  switch (r->float_format)
+    {
+    case FLOAT_IEEE_DOUBLE_BE:
+    case FLOAT_IEEE_DOUBLE_LE:
+      want_float = 1;
+      break;
+
+    case FLOAT_Z_LONG:
+      want_float = 2;
+      break;
+
+    case FLOAT_VAX_G:
+    case FLOAT_VAX_D:
+      want_float = 3;
+      break;
+
+    default:
+      NOT_REACHED ();
+    }
+  if (file_float != want_float)
+    {
+      sys_error (r, record->pos,
+                 _("Floating-point representation indicated by "
+                   "system file (%d) differs from expected (%d)."),
+                 file_float, want_float);
+      return false;
+    }
+
+  /* Integer representation. */
+  file_integer = parse_int (r, record->data, 24);
+  switch (r->integer_format)
+    {
+    case INTEGER_MSB_FIRST:
+      want_integer = 1;
+      break;
+
+    case INTEGER_LSB_FIRST:
+      want_integer = 2;
+      break;
+
+    default:
+      NOT_REACHED ();
+    }
+  if (file_integer != want_integer)
+    sys_warn (r, record->pos,
+              _("Integer format indicated by system file (%d) "
+                "differs from expected (%d)."),
+              file_integer, want_integer);
+
+  return true;
+}
+
+/* Parses record type 7, subtype 4, which holds the file's values for SYSMIS,
+   HIGHEST, and LOWEST, and warns about each one that differs from what this
+   program uses.  Nothing is stored. */
+static void
+parse_machine_float_info (struct sfm_reader *r,
+                          const struct sfm_extension_record *record)
+{
+  double file_sysmis = parse_float (r, record->data, 0);
+  double file_highest = parse_float (r, record->data, 8);
+  double file_lowest = parse_float (r, record->data, 16);
+
+  if (file_sysmis != SYSMIS)
+    {
+      sys_warn (r, record->pos,
+                _("File specifies unexpected value %g (%a) as %s, "
+                  "instead of %g (%a)."),
+                file_sysmis, file_sysmis, "SYSMIS", SYSMIS, SYSMIS);
+    }
+
+  if (file_highest != HIGHEST)
+    {
+      sys_warn (r, record->pos,
+                _("File specifies unexpected value %g (%a) as %s, "
+                  "instead of %g (%a)."),
+                file_highest, file_highest, "HIGHEST", HIGHEST, HIGHEST);
+    }
+
+  /* SPSS before version 21 used a unique value just bigger than SYSMIS as
+     LOWEST.  SPSS 21 uses SYSMIS for LOWEST, which is OK because LOWEST only
+     appears in a context (missing values) where SYSMIS cannot.  So either
+     value is accepted here. */
+  if (!(file_lowest == LOWEST || file_lowest == SYSMIS))
+    {
+      sys_warn (r, record->pos,
+                _("File specifies unexpected value %g (%a) as %s, "
+                  "instead of %g (%a) or %g (%a)."),
+                file_lowest, file_lowest, "LOWEST", LOWEST, LOWEST,
+                SYSMIS, SYSMIS);
+    }
+}
+
+/* Parses record type 7, subtype 10: stores the whole text of RECORD, passed
+   through fix_line_ends(), as INFO's extended product information. */
+static void
+parse_extra_product_info (struct sfm_reader *r,
+                          const struct sfm_extension_record *record,
+                          struct any_read_info *info)
+{
+  struct text_record *text = open_text_record (r, record, true);
+  const char *contents = text_get_all (text);
+
+  info->product_ext = fix_line_ends (contents);
+  close_text_record (r, text);
+}
+
+/* Parses record type 7, subtype 7 or 19, a text record that describes
+   multiple response sets, and appends each set that parses completely to
+   R->mrsets, growing that array (whose capacity is tracked in
+   *ALLOCATED_MRSETS) as needed.  The strings stored in each set are not
+   recoded here; decode_mrsets() recodes them to UTF-8 later.
+
+   Each set has the form NAME=TYPE..., where TYPE is `C', `D', or `E', and
+   ends with a list of variable names terminated by a new-line.  Parsing
+   stops at the first malformed set, normally after a warning, keeping the
+   sets parsed so far. */
+static void
+parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
+              size_t *allocated_mrsets)
+{
+  struct text_record *text;
+
+  text = open_text_record (r, record, false);
+  for (;;)
+    {
+      struct sfm_mrset *mrset;
+      size_t allocated_vars;
+      /* Initialized defensively: text_get_token() is defined elsewhere and
+         may not store a delimiter when it returns null, in which case the
+         test below would otherwise read an indeterminate value. */
+      char delimiter = '\0';
+
+      /* Skip extra line feeds if present. */
+      while (text_match (text, '\n'))
+        continue;
+
+      /* The slot is prepared before we know whether another set follows; it
+         only becomes part of the array when R->n_mrsets is incremented at
+         the bottom of the loop. */
+      if (r->n_mrsets >= *allocated_mrsets)
+        r->mrsets = pool_2nrealloc (r->pool, r->mrsets, allocated_mrsets,
+                                    sizeof *r->mrsets);
+      mrset = &r->mrsets[r->n_mrsets];
+      memset (mrset, 0, sizeof *mrset);
+
+      mrset->name = text_get_token (text, ss_cstr ("="), NULL);
+      if (mrset->name == NULL)
+        break;
+
+      if (text_match (text, 'C'))
+        {
+          mrset->type = MRSET_MC;
+          if (!text_match (text, ' '))
+            {
+              sys_warn (r, record->pos,
+                        _("Missing space following `%c' at offset %zu "
+                          "in MRSETS record."), 'C', text_pos (text));
+              break;
+            }
+        }
+      else if (text_match (text, 'D'))
+        {
+          mrset->type = MRSET_MD;
+          mrset->cat_source = MRSET_VARLABELS;
+        }
+      else if (text_match (text, 'E'))
+        {
+          char *number;
+
+          mrset->type = MRSET_MD;
+          mrset->cat_source = MRSET_COUNTEDVALUES;
+          if (!text_match (text, ' '))
+            {
+              sys_warn (r, record->pos,
+                        _("Missing space following `%c' at offset %zu "
+                          "in MRSETS record."), 'E', text_pos (text));
+              break;
+            }
+
+          /* A truncated record can end right here, in which case there is
+             no token at all.  Passing a null pointer to strcmp() is
+             undefined behavior, so diagnose that case explicitly. */
+          number = text_get_token (text, ss_cstr (" "), NULL);
+          if (number == NULL)
+            {
+              sys_warn (r, record->pos,
+                        _("Missing label source value following `E' "
+                          "at offset %zu in MRSETS record."),
+                        text_pos (text));
+              break;
+            }
+          if (!strcmp (number, "11"))
+            mrset->label_from_var_label = true;
+          else if (strcmp (number, "1"))
+            sys_warn (r, record->pos,
+                      _("Unexpected label source value following `E' "
+                        "at offset %zu in MRSETS record."),
+                      text_pos (text));
+        }
+      else
+        {
+          sys_warn (r, record->pos,
+                    _("Missing `C', `D', or `E' at offset %zu "
+                      "in MRSETS record."),
+                    text_pos (text));
+          break;
+        }
+
+      /* Only multiple dichotomy sets carry a counted value. */
+      if (mrset->type == MRSET_MD)
+        {
+          mrset->counted = text_parse_counted_string (r, text);
+          if (mrset->counted == NULL)
+            break;
+        }
+
+      mrset->label = text_parse_counted_string (r, text);
+      if (mrset->label == NULL)
+        break;
+
+      /* Variable names, separated by spaces and ended by a new-line. */
+      allocated_vars = 0;
+      do
+        {
+          const char *var;
+
+          var = text_get_token (text, ss_cstr (" \n"), &delimiter);
+          if (var == NULL)
+            {
+              if (delimiter != '\n')
+                sys_warn (r, record->pos,
+                          _("Missing new-line parsing variable names "
+                            "at offset %zu in MRSETS record."),
+                          text_pos (text));
+              break;
+            }
+
+          if (mrset->n_vars >= allocated_vars)
+            mrset->vars = pool_2nrealloc (r->pool, mrset->vars,
+                                          &allocated_vars,
+                                          sizeof *mrset->vars);
+          mrset->vars[mrset->n_vars++] = var;
+        }
+      while (delimiter != '\n');
+
+      r->n_mrsets++;
+    }
+  close_text_record (r, text);
+}
+
+/* Converts each multiple response set previously parsed into R->mrsets into
+ a struct mrset and adds it to DICT. Names and labels are recoded from
+ R's encoding to UTF-8. A set is dropped, with a warning, if its name does
+ not begin with `$' or if fewer than two usable variables remain. */
+static void
+decode_mrsets (struct sfm_reader *r, struct dictionary *dict)
+{
+ const struct sfm_mrset *s;
+
+ for (s = r->mrsets; s < &r->mrsets[r->n_mrsets]; s++)
+ {
+ struct stringi_set var_names;
+ struct mrset *mrset;
+ char *name;
+ int width;
+ size_t i;
+
+ name = recode_string ("UTF-8", r->encoding, s->name, -1);
+ if (name[0] != '$')
+ {
+ sys_warn (r, -1, _("Multiple response set name `%s' does not begin "
+ "with `$'."),
+ name);
+ free (name);
+ continue;
+ }
+
+ /* MRSET takes over NAME; from here on it is released only through
+ mrset_destroy() or by handing MRSET to DICT. */
+ mrset = xzalloc (sizeof *mrset);
+ mrset->name = name;
+ mrset->type = s->type;
+ mrset->cat_source = s->cat_source;
+ mrset->label_from_var_label = s->label_from_var_label;
+ if (s->label[0] != '\0')
+ mrset->label = recode_string ("UTF-8", r->encoding, s->label, -1);
+
+ /* VAR_NAMES records the names seen so far, to detect duplicates.
+ WIDTH ends up as the smallest width among the accepted variables. */
+ stringi_set_init (&var_names);
+ mrset->vars = xmalloc (s->n_vars * sizeof *mrset->vars);
+ width = INT_MAX;
+ for (i = 0; i < s->n_vars; i++)
+ {
+ struct variable *var;
+ char *var_name;
+
+ var_name = recode_string ("UTF-8", r->encoding, s->vars[i], -1);
+
+ /* Names that DICT does not contain are skipped without a warning. */
+ var = dict_lookup_var (dict, var_name);
+ if (var == NULL)
+ {
+ free (var_name);
+ continue;
+ }
+ if (!stringi_set_insert (&var_names, var_name))
+ {
+ sys_warn (r, -1,
+ _("MRSET %s contains duplicate variable name %s."),
+ mrset->name, var_name);
+ free (var_name);
+ continue;
+ }
+ free (var_name);
+
+ /* With no explicit label, use the label of the first labeled
+ variable if the set asks for that. */
+ if (mrset->label == NULL && mrset->label_from_var_label
+ && var_has_label (var))
+ mrset->label = xstrdup (var_get_label (var));
+
+ /* Every variable must have the same type as the first accepted
+ one. */
+ if (mrset->n_vars
+ && var_get_type (var) != var_get_type (mrset->vars[0]))
+ {
+ sys_warn (r, -1,
+ _("MRSET %s contains both string and "
+ "numeric variables."), mrset->name);
+ continue;
+ }
+ width = MIN (width, var_get_width (var));
+
+ mrset->vars[mrset->n_vars++] = var;
+ }
+
+ if (mrset->n_vars < 2)
+ {
+ if (mrset->n_vars == 0)
+ sys_warn (r, -1, _("MRSET %s has no variables."), mrset->name);
+ else
+ sys_warn (r, -1, _("MRSET %s has only one variable."),
+ mrset->name);
+ mrset_destroy (mrset);
+ stringi_set_destroy (&var_names);
+ continue;
+ }
+
+ /* For multiple dichotomy sets, convert the counted value: a number when
+ the variables are numeric (width 0), otherwise a string padded on the
+ right with spaces to WIDTH. */
+ if (mrset->type == MRSET_MD)
+ {
+ mrset->width = width;
+ value_init (&mrset->counted, width);
+ if (width == 0)
+ mrset->counted.f = c_strtod (s->counted, NULL);
+ else
+ value_copy_str_rpad (&mrset->counted, width,
+ (const uint8_t *) s->counted, ' ');
+ }
+
+ dict_add_mrset (dict, mrset);
+ stringi_set_destroy (&var_names);
+ }
+}
+
+/* Reads record type 7, subtype 11, which gives GUI display settings for each
+   variable in DICT, in dictionary order: a measurement level, optionally a
+   display width, and an alignment, each a 4-byte integer.  Whether the width
+   is present is deduced from RECORD's element count (three versus two per
+   variable); any other count draws a warning and the record is ignored. */
+static void
+parse_display_parameters (struct sfm_reader *r,
+                          const struct sfm_extension_record *record,
+                          struct dictionary *dict)
+{
+  size_t n_vars = dict_get_var_cnt (dict);
+  bool already_warned = false;
+  bool has_width;
+  size_t pos = 0;
+  size_t idx;
+
+  if (record->count == 3 * n_vars)
+    has_width = true;
+  else if (record->count == 2 * n_vars)
+    has_width = false;
+  else
+    {
+      sys_warn (r, record->pos,
+                _("Extension 11 has bad count %u (for %zu variables)."),
+                record->count, n_vars);
+      return;
+    }
+
+  for (idx = 0; idx < n_vars; ++idx)
+    {
+      struct variable *var = dict_get_var (dict, idx);
+      int measure, align;
+      int width = 0;
+
+      measure = parse_int (r, record->data, pos);
+      pos += 4;
+
+      if (has_width)
+        {
+          width = parse_int (r, record->data, pos);
+          pos += 4;
+        }
+
+      align = parse_int (r, record->data, pos);
+      pos += 4;
+
+      /* SPSS sometimes seems to set variables' measure to zero. */
+      if (measure == 0)
+        measure = 1;
+
+      if (measure < 1 || measure > 3 || align < 0 || align > 2)
+        {
+          /* Only the first offending variable is reported. */
+          if (!already_warned)
+            sys_warn (r, record->pos,
+                      _("Invalid variable display parameters for variable "
+                        "%zu (%s).  Default parameters substituted."),
+                      idx, var_get_name (var));
+          already_warned = true;
+          continue;
+        }
+
+      switch (measure)
+        {
+        case 1:
+          var_set_measure (var, MEASURE_NOMINAL);
+          break;
+        case 2:
+          var_set_measure (var, MEASURE_ORDINAL);
+          break;
+        default:
+          var_set_measure (var, MEASURE_SCALE);
+          break;
+        }
+
+      switch (align)
+        {
+        case 0:
+          var_set_alignment (var, ALIGN_LEFT);
+          break;
+        case 1:
+          var_set_alignment (var, ALIGN_RIGHT);
+          break;
+        default:
+          var_set_alignment (var, ALIGN_CENTRE);
+          break;
+        }
+
+      /* Older versions (SPSS 9.0) sometimes set the display
+         width to zero.  This causes confusion in the GUI, so
+         only set the width if it is nonzero. */
+      if (width > 0)
+        var_set_display_width (var, width);
+    }
+}
+
+/* Renames VAR in DICT to NEW_NAME while keeping VAR's short names.  Renaming
+   may clear the short names, so they are copied beforehand and put back
+   afterward. */
+static void
+rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
+                                 const char *new_name)
+{
+  size_t n = var_get_short_name_cnt (var);
+  char **saved = xnmalloc (n, sizeof *saved);
+  size_t k;
+
+  /* Take private copies; a short name may be absent (null). */
+  for (k = 0; k < n; k++)
+    {
+      const char *short_name = var_get_short_name (var, k);
+
+      if (short_name != NULL)
+        saved[k] = xstrdup (short_name);
+      else
+        saved[k] = NULL;
+    }
+
+  dict_rename_var (dict, var, new_name);
+
+  /* Reinstate the copies and release them. */
+  for (k = 0; k < n; k++)
+    {
+      var_set_short_name (var, k, saved[k]);
+      free (saved[k]);
+    }
+  free (saved);
+}
+
+/* Parses record type 7, subtype 13, which gives the long name that corresponds
+ to each short name. Modifies variable names in DICT accordingly.
+ RECORD may be null, meaning that the file has no such record; in that case
+ every variable is renamed to the lowercase form of its current name.
+ Short names are preserved across the renames in both cases. */
+static void
+parse_long_var_name_map (struct sfm_reader *r,
+ const struct sfm_extension_record *record,
+ struct dictionary *dict)
+{
+ struct text_record *text;
+ struct variable *var;
+ char *long_name;
+
+ if (record == NULL)
+ {
+ /* There are no long variable names. Use the short variable names,
+ converted to lowercase, as the long variable names. */
+ size_t i;
+
+ for (i = 0; i < dict_get_var_cnt (dict); i++)
+ {
+ struct variable *var = dict_get_var (dict, i);
+ char *new_name;
+
+ new_name = utf8_to_lower (var_get_name (var));
+ rename_var_and_save_short_names (dict, var, new_name);
+ free (new_name);
}
- *var_by_idx = xnrealloc (*var_by_idx, i + 1, sizeof **var_by_idx);
+ return;
+ }
+
+ /* Rename each of the variables, one by one. (In a correctly constructed
+ system file, this cannot create any intermediate duplicate variable names,
+ because all of the new variable names are longer than any of the old
+ variable names and thus there cannot be any overlaps.) */
+ text = open_text_record (r, record, true);
+ while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
+ {
+ /* Validate long name. Names beginning with `$' or `#' are rejected
+ along with names that DICT considers invalid. */
+ if (!dict_id_is_valid (dict, long_name, false)
+ || long_name[0] == '$' || long_name[0] == '#')
+ {
+ sys_warn (r, record->pos,
+ _("Long variable mapping from %s to invalid "
+ "variable name `%s'."),
+ var_get_name (var), long_name);
+ continue;
+ }
+
+ /* Identify any duplicates. A long name that matches the variable's own
+ short name, ignoring case, is not looked up, because DICT may still
+ hold VAR itself under that name. */
+ if (utf8_strcasecmp (var_get_short_name (var, 0), long_name)
+ && dict_lookup_var (dict, long_name) != NULL)
+ {
+ sys_warn (r, record->pos,
+ _("Duplicate long variable name `%s'."), long_name);
+ continue;
+ }
+
+ rename_var_and_save_short_names (dict, var, long_name);
+ }
+ close_text_record (r, text);
+}
+
+/* Reads record type 7, subtype 14, which gives the real length
+ of each very long string. Rearranges DICT accordingly: the variables
+ that hold the later segments of each very long string are deleted and the
+ first segment's variable is widened to the full length.
+ Returns true if successful, false (after reporting an error) if a string's
+ segments run past the end of DICT or have unexpected widths. Entries
+ with an invalid length, or that need only one segment, are skipped with a
+ warning instead. */
+static bool
+parse_long_string_map (struct sfm_reader *r,
+ const struct sfm_extension_record *record,
+ struct dictionary *dict)
+{
+ struct text_record *text;
+ struct variable *var;
+ char *length_s;
+
+ text = open_text_record (r, record, true);
+ while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
+ {
+ size_t idx = var_get_dict_index (var);
+ long int length;
+ int segment_cnt;
+ int i;
+
+ /* Get length. */
+ length = strtol (length_s, NULL, 10);
+ if (length < 1 || length > MAX_STRING)
+ {
+ sys_warn (r, record->pos,
+ _("%s listed as string of invalid length %s "
+ "in very long string record."),
+ var_get_name (var), length_s);
+ continue;
+ }
+
+ /* Check segments. */
+ segment_cnt = sfm_width_to_segments (length);
+ if (segment_cnt == 1)
+ {
+ sys_warn (r, record->pos,
+ _("%s listed in very long string record with width %s, "
+ "which requires only one segment."),
+ var_get_name (var), length_s);
+ continue;
+ }
+ /* The segments are the SEGMENT_CNT consecutive variables in DICT that
+ start at VAR. */
+ if (idx + segment_cnt > dict_get_var_cnt (dict))
+ {
+ /* NOTE(review): this path and the one below return without calling
+ close_text_record(); confirm that is intended. */
+ sys_error (r, record->pos,
+ _("Very long string %s overflows dictionary."),
+ var_get_name (var));
+ return false;
+ }
+
+ /* Get the short names from the segments and check their
+ lengths. Widths are compared after rounding up to a multiple of
+ 8. */
+ for (i = 0; i < segment_cnt; i++)
+ {
+ struct variable *seg = dict_get_var (dict, idx + i);
+ int alloc_width = sfm_segment_alloc_width (length, i);
+ int width = var_get_width (seg);
+
+ if (i > 0)
+ var_set_short_name (var, i, var_get_short_name (seg, 0));
+ if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
+ {
+ sys_error (r, record->pos,
+ _("Very long string with width %ld has segment %d "
+ "of width %d (expected %d)."),
+ length, i, width, alloc_width);
+ return false;
+ }
+ }
+ /* Drop the variables for segments 1 and up, then widen VAR. */
+ dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
+ var_set_width (var, length);
+ }
+ close_text_record (r, text);
+ dict_compact_values (dict);
+
+ return true;
+}
+
+/* Applies the value labels in RECORD to every variable that RECORD lists.
+ The variables are given as 1-based indexes into the N_VAR_RECS records in
+ VAR_RECS. Label text is recoded from DICT's encoding to UTF-8.
+ Returns true if successful, false (after reporting an error) if a variable
+ index is invalid, the variables are not all of one type, or a variable is
+ a string wider than 8 bytes. A label for a value that already has one
+ only draws a warning. */
+static bool
+parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
+ const struct sfm_var_record *var_recs, size_t n_var_recs,
+ const struct sfm_value_label_record *record)
+{
+ struct variable **vars;
+ char **utf8_labels;
+ size_t i;
+
+ /* Recode all of the label strings once, up front; they are shared by all
+ of the variables. */
+ utf8_labels = pool_nmalloc (r->pool, record->n_labels, sizeof *utf8_labels);
+ for (i = 0; i < record->n_labels; i++)
+ utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
+ record->labels[i].label, -1,
+ r->pool);
+
+ /* Resolve the variable indexes. */
+ vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
+ for (i = 0; i < record->n_vars; i++)
+ {
+ vars[i] = lookup_var_by_index (r, record->pos,
+ var_recs, n_var_recs, record->vars[i]);
+ if (vars[i] == NULL)
+ return false;
+ }
+
+ /* All of the variables must have the same type as the first one. */
+ for (i = 1; i < record->n_vars; i++)
+ if (var_get_type (vars[i]) != var_get_type (vars[0]))
+ {
+ sys_error (r, record->pos,
+ _("Variables associated with value label are not all of "
+ "identical type. Variable %s is %s, but variable "
+ "%s is %s."),
+ var_get_name (vars[0]),
+ var_is_numeric (vars[0]) ? _("numeric") : _("string"),
+ var_get_name (vars[i]),
+ var_is_numeric (vars[i]) ? _("numeric") : _("string"));
+ return false;
+ }
+
+ for (i = 0; i < record->n_vars; i++)
+ {
+ struct variable *var = vars[i];
+ int width;
+ size_t j;
+
+ width = var_get_width (var);
+ if (width > 8)
+ {
+ sys_error (r, record->pos,
+ _("Value labels may not be added to long string "
+ "variables (e.g. %s) using records types 3 and 4."),
+ var_get_name (var));
+ return false;
+ }
+
+ for (j = 0; j < record->n_labels; j++)
+ {
+ struct sfm_value_label *label = &record->labels[j];
+ union value value;
+
+ /* Width 0 means numeric: decode the raw bytes as a number.
+ Otherwise copy the first WIDTH bytes (at most 8, per the check
+ above) as a string value. */
+ value_init (&value, width);
+ if (width == 0)
+ value.f = parse_float (r, label->value, 0);
+ else
+ memcpy (value_str_rw (&value, width), label->value, width);
+
+ if (!var_add_value_label (var, &value, utf8_labels[j]))
+ {
+ if (var_is_numeric (var))
+ sys_warn (r, record->pos,
+ _("Duplicate value label for %g on %s."),
+ value.f, var_get_name (var));
+ else
+ sys_warn (r, record->pos,
+ _("Duplicate value label for `%.*s' on %s."),
+ width, value_str (&value, width),
+ var_get_name (var));
+ }
+
+ value_destroy (&value, width);
+ }
+ }
+
+ /* Release the temporaries early. The error paths above return without
+ doing so, leaving them in R's pool. */
+ pool_free (r->pool, vars);
+ for (i = 0; i < record->n_labels; i++)
+ pool_free (r->pool, utf8_labels[i]);
+ pool_free (r->pool, utf8_labels);
+
+ return true;
+}
+
+/* Returns the variable for the variable record with 1-based index IDX among
+   the N_VAR_RECS records in VAR_RECS.  If IDX is out of range, or names a
+   record that has no variable attached (a long string continuation),
+   reports an error located at OFFSET and returns a null pointer. */
+static struct variable *
+lookup_var_by_index (struct sfm_reader *r, off_t offset,
+                     const struct sfm_var_record *var_recs, size_t n_var_recs,
+                     int idx)
+{
+  struct variable *found;
+
+  if (idx < 1 || idx > n_var_recs)
+    {
+      sys_error (r, offset,
+                 _("Variable index %d not in valid range 1...%zu."),
+                 idx, n_var_recs);
+      return NULL;
+    }
+
+  found = var_recs[idx - 1].var;
+  if (found == NULL)
+    sys_error (r, offset,
+               _("Variable index %d refers to long string continuation."),
+               idx);
+
+  return found;
+}
+
+/* Parses a set of custom attributes from TEXT into ATTRS.
+   ATTRS may be a null pointer, in which case the attributes are
+   read but discarded.
+
+   Each attribute has the form KEY(VALUE\nVALUE\n...), where each value is
+   normally enclosed in single quotes.  Parsing continues with further
+   attributes until a `/' follows one or no further key can be read.
+
+   An attribute is discarded, rather than added to ATTRS, if no value at all
+   could be parsed for it or if ATTRS already contains an attribute with the
+   same name.  Both can only happen with a malformed file. */
+static void
+parse_attributes (struct sfm_reader *r, struct text_record *text,
+                  struct attrset *attrs)
+{
+  do
+    {
+      struct attribute *attr;
+      size_t n_values = 0;
+      char *key;
+      int index;
+
+      /* Parse the key. */
+      key = text_get_token (text, ss_cstr ("("), NULL);
+      if (key == NULL)
+        return;
+
+      attr = attribute_create (key);
+      for (index = 1; ; index++)
+        {
+          /* Parse the value. */
+          char *value;
+          size_t length;
+
+          value = text_get_token (text, ss_cstr ("\n"), NULL);
+          if (value == NULL)
+            {
+              text_warn (r, text, _("Error parsing attribute value %s[%d]."),
+                         key, index);
+              break;
+            }
+
+          /* Strip the enclosing quotes, if present. */
+          length = strlen (value);
+          if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
+            {
+              value[length - 1] = '\0';
+              attribute_add_value (attr, value + 1);
+            }
+          else
+            {
+              text_warn (r, text,
+                         _("Attribute value %s[%d] is not quoted: %s."),
+                         key, index, value);
+              attribute_add_value (attr, value);
+            }
+          n_values++;
+
+          /* Was this the last value for this attribute? */
+          if (text_match (text, ')'))
+            break;
+        }
+
+      if (attrs == NULL || n_values == 0)
+        {
+          /* Nothing to store: either the caller wants the attribute
+             discarded, or it has no values, which would leave later
+             readers of its first value with nothing to read. */
+          attribute_destroy (attr);
+        }
+      else if (attrset_lookup (attrs, key) != NULL)
+        {
+          /* NOTE(review): attrset_add() is defined elsewhere; upstream PSPP
+             asserts that the name is not already in the set, so a repeated
+             key must not reach it. */
+          text_warn (r, text, _("Duplicate attribute %s."), key);
+          attribute_destroy (attr);
+        }
+      else
+        attrset_add (attrs, attr);
+    }
+  while (!text_match (text, '/'));
+}
+
+/* Reads record type 7, subtype 17, and adds the custom attributes that it
+   lists to the attribute set of DICT itself (as opposed to any single
+   variable). */
+static void
+parse_data_file_attributes (struct sfm_reader *r,
+                            const struct sfm_extension_record *record,
+                            struct dictionary *dict)
+{
+  struct text_record *text;
+  struct attrset *file_attrs;
+
+  text = open_text_record (r, record, true);
+  file_attrs = dict_get_attributes (dict);
+  parse_attributes (r, text, file_attrs);
+  close_text_record (r, text);
+}
+
+/* Parses record type 7, subtype 18, which consists of a series of variable
+   names, each followed by `:' and that variable's custom attributes.  When
+   the name lookup yields no variable, its attributes are still parsed, to
+   stay in step with the text, but are discarded. */
+static void
+parse_variable_attributes (struct sfm_reader *r,
+                           const struct sfm_extension_record *record,
+                           struct dictionary *dict)
+{
+  struct text_record *text = open_text_record (r, record, true);
+  struct variable *var;
+
+  for (;;)
+    {
+      struct attrset *target;
+
+      if (!text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
+        break;
+
+      if (var != NULL)
+        target = var_get_attributes (var);
+      else
+        target = NULL;
+      parse_attributes (r, text, target);
+    }
+  close_text_record (r, text);
+}
+
+/* Sets the role of each variable in DICT that has a "$@Role" custom
+   attribute, based on the integer in that attribute's first value: 0 for
+   input, 1 for target, 2 for both, 3 for none, 4 for partition, 5 for split.
+   Any other number, or an attribute without a first value, is treated as an
+   invalid role: the variable gets the input role, the first such variable is
+   named in a warning, and a second warning counts the rest. */
+static void
+assign_variable_roles (struct sfm_reader *r, struct dictionary *dict)
+{
+  size_t n_warnings = 0;
+  size_t i;
+
+  for (i = 0; i < dict_get_var_cnt (dict); i++)
+    {
+      struct variable *var = dict_get_var (dict, i);
+      struct attrset *attrs = var_get_attributes (var);
+      const struct attribute *attr = attrset_lookup (attrs, "$@Role");
+      if (attr != NULL)
+        {
+          /* Guard against an attribute that has no values: passing a null
+             pointer to atoi() is undefined behavior.  -1 falls through to
+             the invalid-role case below. */
+          const char *text = attribute_get_value (attr, 0);
+          int value = text != NULL ? atoi (text) : -1;
+          enum var_role role;
+
+          switch (value)
+            {
+            case 0:
+              role = ROLE_INPUT;
+              break;
+
+            case 1:
+              role = ROLE_TARGET;
+              break;
+
+            case 2:
+              role = ROLE_BOTH;
+              break;
+
+            case 3:
+              role = ROLE_NONE;
+              break;
+
+            case 4:
+              role = ROLE_PARTITION;
+              break;
+
+            case 5:
+              role = ROLE_SPLIT;
+              break;
+
+            default:
+              role = ROLE_INPUT;
+              if (n_warnings++ == 0)
+                sys_warn (r, -1, _("Invalid role for variable %s."),
+                          var_get_name (var));
+              break;
+            }
+
+          var_set_role (var, role);
+        }
+    }
+
+  if (n_warnings > 1)
+    sys_warn (r, -1, _("%zu other variables had invalid roles."),
+              n_warnings - 1);
+}
+
+/* Checks whether LENGTH bytes starting at offset OFS lie within the data of
+   RECORD, whose total size is its element size times its element count.
+   Returns true if so.  Otherwise, warns that the record ends unexpectedly
+   and returns false.  A LENGTH as large as the whole record is always
+   rejected. */
+static bool
+check_overflow (struct sfm_reader *r,
+                const struct sfm_extension_record *record,
+                size_t ofs, size_t length)
+{
+  size_t total = record->size * record->count;
+  bool fits = length < total && ofs + length <= total;
+
+  if (fits)
+    return true;
+
+  sys_warn (r, record->pos + total,
+            _("Extension record subtype %d ends unexpectedly."),
+            record->subtype);
+  return false;
+}
+
+/* Parses an extension record that holds value labels for long string
+   variables and adds the labels to the variables in DICT.
+
+   RECORD's data is a sequence of entries.  Each entry is a 4-byte variable
+   name length, the variable name, a 4-byte width, and a 4-byte label count,
+   followed by that many labels, each consisting of a 4-byte value length,
+   the value, a 4-byte label length, and the label text.  Names and labels
+   are recoded from DICT's encoding to UTF-8.
+
+   Every length is checked against the end of RECORD before it is used;
+   parsing stops with a warning at the first one that does not fit.  Entries
+   for unknown or numeric variables, or whose width differs from the
+   variable's, are skipped over with a warning. */
+static void
+parse_long_string_value_labels (struct sfm_reader *r,
+                                const struct sfm_extension_record *record,
+                                struct dictionary *dict)
+{
+  const char *dict_encoding = dict_get_encoding (dict);
+  size_t end = record->size * record->count;
+  size_t ofs = 0;
+
+  while (ofs < end)
+    {
+      char *var_name;
+      size_t n_labels, i;
+      struct variable *var;
+      union value value;
+      int var_name_len;
+      int width;
+
+      /* Parse variable name length. */
+      if (!check_overflow (r, record, ofs, 4))
+        return;
+      var_name_len = parse_int (r, record->data, ofs);
+      ofs += 4;
+
+      /* Parse variable name, width, and number of labels.
+
+         The name length is checked on its own first.  It is a signed value
+         from the file, and a small negative one such as -8 would make
+         VAR_NAME_LEN + 8 come out as 0 and slip through the second check,
+         after which the reads below would land before the current
+         offset.  Converted to size_t, a negative length is huge and fails
+         the first check. */
+      if (!check_overflow (r, record, ofs, var_name_len)
+          || !check_overflow (r, record, ofs, var_name_len + 8))
+        return;
+      var_name = recode_string_pool ("UTF-8", dict_encoding,
+                                     (const char *) record->data + ofs,
+                                     var_name_len, r->pool);
+      width = parse_int (r, record->data, ofs + var_name_len);
+      n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
+      ofs += var_name_len + 8;
+
+      /* Look up 'var' and validate. */
+      var = dict_lookup_var (dict, var_name);
+      if (var == NULL)
+        sys_warn (r, record->pos + ofs,
+                  _("Ignoring long string value label record for "
+                    "unknown variable %s."), var_name);
+      else if (var_is_numeric (var))
+        {
+          sys_warn (r, record->pos + ofs,
+                    _("Ignoring long string value label record for "
+                      "numeric variable %s."), var_name);
+          var = NULL;
+        }
+      else if (width != var_get_width (var))
+        {
+          sys_warn (r, record->pos + ofs,
+                    _("Ignoring long string value label record for variable "
+                      "%s because the record's width (%d) does not match the "
+                      "variable's width (%d)."),
+                    var_name, width, var_get_width (var));
+          var = NULL;
+        }
+
+      /* Parse values.
+
+         WIDTH comes straight from the file and has only been validated, by
+         comparison with the variable's width, when VAR is nonnull.  VALUE is
+         only used in that case too, so do not size an allocation from an
+         unvalidated WIDTH. */
+      if (var != NULL)
+        value_init_pool (r->pool, &value, width);
+      for (i = 0; i < n_labels; i++)
+        {
+          size_t value_length, label_length;
+          bool skip = var == NULL;
+
+          /* Parse value length. */
+          if (!check_overflow (r, record, ofs, 4))
+            return;
+          value_length = parse_int (r, record->data, ofs);
+          ofs += 4;
+
+          /* Parse value. */
+          if (!check_overflow (r, record, ofs, value_length))
+            return;
+          if (!skip)
+            {
+              if (value_length == width)
+                memcpy (value_str_rw (&value, width),
+                        (const uint8_t *) record->data + ofs, width);
+              else
+                {
+                  sys_warn (r, record->pos + ofs,
+                            _("Ignoring long string value label %zu for "
+                              "variable %s, with width %d, that has bad value "
+                              "width %zu."),
+                            i, var_get_name (var), width, value_length);
+                  skip = true;
+                }
+            }
+          ofs += value_length;
+
+          /* Parse label length. */
+          if (!check_overflow (r, record, ofs, 4))
+            return;
+          label_length = parse_int (r, record->data, ofs);
+          ofs += 4;
+
+          /* Parse label. */
+          if (!check_overflow (r, record, ofs, label_length))
+            return;
+          if (!skip)
+            {
+              char *label;
+
+              label = recode_string_pool ("UTF-8", dict_encoding,
+                                          (const char *) record->data + ofs,
+                                          label_length, r->pool);
+              if (!var_add_value_label (var, &value, label))
+                sys_warn (r, record->pos + ofs,
+                          _("Duplicate value label for `%.*s' on %s."),
+                          width, value_str (&value, width),
+                          var_get_name (var));
+              pool_free (r->pool, label);
+            }
+          ofs += label_length;
+        }
+    }
+}
+
+/* Parses an extension record that holds missing values for long string
+   variables and sets them on the variables in DICT.
+
+   RECORD's data is a sequence of entries.  Each entry is a 4-byte variable
+   name length, the variable name, and a 1-byte count of missing values,
+   followed by that many values, each consisting of a 4-byte value length and
+   the value itself.  Names are recoded from DICT's encoding to UTF-8.
+
+   Every length is checked against the end of RECORD before it is used;
+   parsing stops with a warning at the first one that does not fit.  Entries
+   for unknown or numeric variables are skipped over with a warning, and at
+   most three values per variable are applied. */
+static void
+parse_long_string_missing_values (struct sfm_reader *r,
+                                  const struct sfm_extension_record *record,
+                                  struct dictionary *dict)
+{
+  const char *dict_encoding = dict_get_encoding (dict);
+  size_t end = record->size * record->count;
+  size_t ofs = 0;
+
+  while (ofs < end)
+    {
+      struct missing_values mv;
+      char *var_name;
+      struct variable *var;
+      int n_missing_values;
+      int var_name_len;
+      size_t i;
+
+      /* Parse variable name length. */
+      if (!check_overflow (r, record, ofs, 4))
+        return;
+      var_name_len = parse_int (r, record->data, ofs);
+      ofs += 4;
+
+      /* Parse variable name.
+
+         The name length is checked on its own first.  It is a signed value
+         from the file, and -1 would make VAR_NAME_LEN + 1 come out as 0 and
+         slip through the second check; the name would then be recoded with
+         a length of -1 and the offset would move backward.  Converted to
+         size_t, a negative length is huge and fails the first check. */
+      if (!check_overflow (r, record, ofs, var_name_len)
+          || !check_overflow (r, record, ofs, var_name_len + 1))
+        return;
+      var_name = recode_string_pool ("UTF-8", dict_encoding,
+                                     (const char *) record->data + ofs,
+                                     var_name_len, r->pool);
+      ofs += var_name_len;
+
+      /* Parse number of missing values.  An out-of-range count only draws a
+         warning; the values are still stepped over below. */
+      n_missing_values = ((const uint8_t *) record->data)[ofs];
+      if (n_missing_values < 1 || n_missing_values > 3)
+        sys_warn (r, record->pos + ofs,
+                  _("Long string missing values record says variable %s "
+                    "has %d missing values, but only 1 to 3 missing values "
+                    "are allowed."),
+                  var_name, n_missing_values);
+      ofs++;
+
+      /* Look up 'var' and validate. */
+      var = dict_lookup_var (dict, var_name);
+      if (var == NULL)
+        sys_warn (r, record->pos + ofs,
+                  _("Ignoring long string missing value record for "
+                    "unknown variable %s."), var_name);
+      else if (var_is_numeric (var))
+        {
+          sys_warn (r, record->pos + ofs,
+                    _("Ignoring long string missing value record for "
+                      "numeric variable %s."), var_name);
+          var = NULL;
+        }
+
+      /* Parse values.  When the variable is being ignored, MV is a
+         placeholder of width 8 that is never applied. */
+      mv_init_pool (r->pool, &mv, var ? var_get_width (var) : 8);
+      for (i = 0; i < n_missing_values; i++)
+        {
+          size_t value_length;
+
+          /* Parse value length. */
+          if (!check_overflow (r, record, ofs, 4))
+            return;
+          value_length = parse_int (r, record->data, ofs);
+          ofs += 4;
+
+          /* Parse value. */
+          if (!check_overflow (r, record, ofs, value_length))
+            return;
+          if (var != NULL
+              && i < 3
+              && !mv_add_str (&mv, (const uint8_t *) record->data + ofs,
+                              value_length))
+            sys_warn (r, record->pos + ofs,
+                      _("Ignoring long string missing value %zu for variable "
+                        "%s, with width %d, that has bad value width %zu."),
+                      i, var_get_name (var), var_get_width (var),
+                      value_length);
+          ofs += value_length;
+        }
+      if (var != NULL)
+        var_set_missing_values (var, &mv);
+    }
+}
+\f
+/* Case reader. */
+
+/* Forward declarations for the helpers that read case data. */
+
+/* Error reporting. */
+static void partial_record (struct sfm_reader *);
+
+static void read_error (struct casereader *, const struct sfm_reader *);
+
+/* Reading numbers and strings. */
+static bool read_case_number (struct sfm_reader *, double *);
+static int read_case_string (struct sfm_reader *, uint8_t *, size_t);
+static int read_opcode (struct sfm_reader *);
+static bool read_compressed_number (struct sfm_reader *, double *);
+static int read_compressed_string (struct sfm_reader *, uint8_t *);
+static int read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
+static bool skip_whole_strings (struct sfm_reader *, size_t);
+
+/* Reads and returns one case from READER's file. Returns a null
+ pointer if not successful.
+ R_ is the struct sfm_reader for the file. A null pointer is returned
+ without reading anything if R_ already has its error flag set or has no
+ variables to read. */
+static struct ccase *
+sys_file_casereader_read (struct casereader *reader, void *r_)
+{
+ struct sfm_reader *r = r_;
+ struct ccase *c;
+ int retval;
+ int i;
+
+ if (r->error || !r->sfm_var_cnt)
+ return NULL;
+
+ c = case_create (r->proto);
+
+ /* Fill in the case one file variable at a time. RETVAL is 1 after a
+ successful read; any other value abandons the case. */
+ for (i = 0; i < r->sfm_var_cnt; i++)
+ {
+ struct sfm_var *sv = &r->sfm_vars[i];
+ union value *v = case_data_rw_idx (c, sv->case_index);
+
+ if (sv->var_width == 0)
+ retval = read_case_number (r, &v->f);
+ else
+ {
+ /* Read this segment's bytes into its place within the string value,
+ then skip the padding that follows it in the file, rounded down to
+ a multiple of 8 bytes. */
+ uint8_t *s = value_str_rw (v, sv->var_width);
+ retval = read_case_string (r, s + sv->offset, sv->segment_width);
+ if (retval == 1)
+ {
+ retval = skip_whole_strings (r, ROUND_DOWN (sv->padding, 8));
+ if (retval == 0)
+ sys_error (r, r->pos, _("File ends in partial string value."));
+ }
+ }
+
+ if (retval != 1)
+ goto eof;
+ }
+ return c;
+
+eof:
+ /* Stopping after the first variable means that the case is incomplete. */
+ if (i != 0)
+ partial_record (r);
+ /* NOTE(review): presumably a case_cnt of -1 means that the number of cases
+ is unknown, so that running out of data is not an error -- confirm. */
+ if (r->case_cnt != -1)
+ read_error (reader, r);
+ case_unref (c);
+ return NULL;
+}
+
+/* Issues an error, at R's current position, that R's file ends in the middle
+ of a case. Because the error goes through sys_error(), this also puts R
+ into its error state. */
+static void
+partial_record (struct sfm_reader *r)
+{
+ sys_error (r, r->pos, _("File ends in partial case."));
+}
+
+/* Issues an error message that reading a case from SFM's file failed, and
+ forces casereader R into an error state with casereader_force_error(). */
+static void
+read_error (struct casereader *r, const struct sfm_reader *sfm)
+{
+ msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
+ casereader_force_error (r);
+}
+
+/* Reads a number from R and stores its value in *D.
+   If R is compressed, reads a compressed number;
+   otherwise, reads a number in the regular way.
+   Returns true if successful.  Returns false if end of file is reached
+   immediately or if the number cannot be read in full, in which case *D is
+   not written by the uncompressed path. */
+static bool
+read_case_number (struct sfm_reader *r, double *d)
+{
+  if (r->compression == ANY_COMP_NONE)
+    {
+      uint8_t number[8];
+
+      /* try_read_bytes() returns 1 on success, 0 at end of file, and -1 on
+         an I/O error or partial read.  Only 1 means that NUMBER was filled
+         in, so every other result is a failure; testing with `!' would let
+         -1 through and convert a buffer that was never fully read. */
+      if (try_read_bytes (r, number, sizeof number) != 1)
+        return false;
+      float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
+      return true;
+    }
+  else
+    return read_compressed_number (r, d);
+}
+
+/* Reads LENGTH string bytes from R into S.  Data is always consumed in
+   8-byte units: when LENGTH is not a multiple of 8, a final 8-byte unit is
+   read into a scratch buffer and only its leading LENGTH % 8 bytes are
+   copied to S.  Reads compressed strings if R is compressed.  Returns 1 if
+   successful, 0 if end of file is reached immediately, or -1 for some kind
+   of error. */
+static int
+read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
+{
+  const size_t n_whole = ROUND_DOWN (length, 8);
+  const size_t n_tail = length % 8;
+  int status;
+
+  if (n_whole != 0)
+    {
+      status = read_whole_strings (r, s, n_whole);
+      if (status != 1)
+        return status;
+    }
+
+  if (n_tail != 0)
+    {
+      uint8_t bounce[8];
+
+      status = read_whole_strings (r, bounce, sizeof bounce);
+      if (status == -1)
+        return -1;
+      if (!status)
+        {
+          /* End of file here is "immediate" only if nothing was read for
+             this string yet. */
+          if (n_whole == 0)
+            return 0;
+          partial_record (r);
+          return -1;
+        }
+      memcpy (s + n_whole, bounce, n_tail);
+    }
+
+  return 1;
+}
+
+/* Reads and returns the next nonzero compression opcode from R, refilling
+   R's opcode buffer from the compressed data whenever it has been used up.
+   Zero opcodes are skipped.  Returns -1 if the buffer cannot be refilled. */
+static int
+read_opcode (struct sfm_reader *r)
+{
+  int opcode;
+
+  assert (r->compression != ANY_COMP_NONE);
+  do
+    {
+      if (r->opcode_idx >= sizeof r->opcodes)
+        {
+          if (try_read_compressed_bytes (r, r->opcodes,
+                                         sizeof r->opcodes) != 1)
+            return -1;
+          r->opcode_idx = 0;
+        }
+      opcode = r->opcodes[r->opcode_idx++];
+    }
+  while (opcode == 0);
+
+  return opcode;
+}
+
+/* Reads a compressed number from R and stores its value in D.
+ Returns true if successful, false if end of file is
+ reached immediately. The next opcode from read_opcode() selects how the
+ value is obtained; see the cases below. */
+static bool
+read_compressed_number (struct sfm_reader *r, double *d)
+{
+ int opcode = read_opcode (r);
+ switch (opcode)
+ {
+ /* -1 means that no opcode could be read; 252 is handled the same way. */
+ case -1:
+ case 252:
+ return false;
+
+ /* The value is obtained by read_compressed_float(). */
+ case 253:
+ return read_compressed_float (r, d);
+
+ /* Spaces where a number is expected: converted anyway, with a warning
+ that is issued only once per reader. */
+ case 254:
+ float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
+ if (!r->corruption_warning)
+ {
+ r->corruption_warning = true;
+ sys_warn (r, r->pos,
+ _("Possible compressed data corruption: "
+ "compressed spaces appear in numeric field."));
+ }
+ break;
+
+ /* System-missing value. */
+ case 255:
+ *d = SYSMIS;
+ break;
+
+ /* Any other opcode encodes the value directly, offset by R's bias. */
+ default:
+ *d = opcode - r->bias;
+ break;
+ }
+
+ return true;
+}
+
+/* Reads a compressed 8-byte string segment from R and stores it in DST.
+ Returns 1 if successful, 0 if no opcode can be read or the opcode is 252,
+ or -1 if the 8 literal bytes called for by opcode 253 cannot be read. */
+static int
+read_compressed_string (struct sfm_reader *r, uint8_t *dst)
+{
+ int opcode;
+ int retval;
+
+ opcode = read_opcode (r);
+ switch (opcode)
+ {
+ /* -1 means that no opcode could be read; 252 is handled the same way. */
+ case -1:
+ case 252:
+ return 0;
+
+ /* The 8 bytes are stored literally in the compressed data. */
+ case 253:
+ retval = read_compressed_bytes (r, dst, 8);
+ return retval == 1 ? 1 : -1;
+
+ /* Eight spaces. */
+ case 254:
+ memset (dst, ' ', 8);
+ return 1;
+
+ /* Any other opcode is a compressed number, which is not expected in a
+ string; it is stored in DST in R's floating-point format. */
+ default:
+ {
+ double value = opcode - r->bias;
+ float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
+ if (value == 0.0)
+ {
+ /* This has actually been seen "in the wild". The submitter of the
+ file that showed that the contents decoded as spaces, but they
+ were at the end of the field so it's possible that the null
+ bytes just acted as null terminators. */
+ }
+ else if (!r->corruption_warning)
+ {
+ r->corruption_warning = true;
+ sys_warn (r, r->pos,
+ _("Possible compressed data corruption: "
+ "string contains compressed integer (opcode %d)."),
+ opcode);
+ }
+ }
+ return 1;
+ }
+}
+
+/* Reads LENGTH string bytes from R into S.  LENGTH must be a multiple of 8.
+   Reads compressed strings, one 8-byte segment at a time, if R is
+   compressed.  Returns 1 if successful, 0 if end of file is reached
+   immediately, or -1 for some kind of error. */
+static int
+read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
+{
+  size_t done;
+
+  assert (length % 8 == 0);
+  if (r->compression == ANY_COMP_NONE)
+    return try_read_bytes (r, s, length);
+
+  for (done = 0; done < length; done += 8)
+    {
+      int status = read_compressed_string (r, s + done);
+      if (status == 1)
+        continue;
+
+      /* A failure on the very first segment is passed through unchanged;
+         a later one means that the data stopped partway through. */
+      if (done == 0)
+        return status;
+      partial_record (r);
+      return -1;
+    }
+  return 1;
+}
+
+/* Skips LENGTH string bytes from R.
+   LENGTH must be a multiple of 8.
+   (LENGTH is also limited to 1024, but that's only because the
+   current caller never needs more than that many bytes.)
+   Returns true if successful, false if end of file is reached immediately
+   or if the underlying read fails. */
+static bool
+skip_whole_strings (struct sfm_reader *r, size_t length)
+{
+  uint8_t buffer[1024];
+
+  /* A LENGTH of exactly 1024 still fits in BUFFER. */
+  assert (length <= sizeof buffer);
+
+  /* read_whole_strings() returns 1, 0, or -1.  Converting that result
+     directly to bool would turn the -1 error result into true, so compare
+     against the success value explicitly. */
+  return read_whole_strings (r, buffer, length) == 1;
+}
+\f
+/* Helpers for reading records that contain structured text
+ strings. */
+
+/* Maximum number of warnings to issue for a single text
+ record. */
+#define MAX_TEXT_WARNINGS 5
+
+/* State for parsing one text record: the record's contents, where the record
+ starts in the file, how far parsing has progressed, and how many warnings
+ have been counted against it. */
+struct text_record
+ {
+ struct substring buffer; /* Record contents. */
+ off_t start; /* Starting offset in file. */
+ size_t pos; /* Current position in buffer. */
+ int n_warnings; /* Number of warnings issued or suppressed. */
+ bool recoded; /* Recoded into UTF-8? */
+ };
+
+/* Creates and returns a text_record, allocated from R's pool, for parsing
+   the contents of extension record RECORD.  If RECODE_TO_UTF8 is true, the
+   contents are recoded from R's encoding into UTF-8 (also in R's pool);
+   otherwise the text_record refers directly to RECORD's data. */
+static struct text_record *
+open_text_record (struct sfm_reader *r,
+                  const struct sfm_extension_record *record,
+                  bool recode_to_utf8)
+{
+  struct substring raw = ss_buffer (record->data,
+                                    record->size * record->count);
+  struct text_record *text = pool_alloc (r->pool, sizeof *text);
+
+  text->start = record->pos;
+  if (recode_to_utf8)
+    text->buffer = recode_substring_pool ("UTF-8", r->encoding, raw, r->pool);
+  else
+    text->buffer = raw;
+  text->pos = 0;
+  text->n_warnings = 0;
+  text->recoded = recode_to_utf8;
+
+  return text;
+}
+
+/* Closes TEXT. Issues a final warning about suppressed warnings if
+ necessary, and, if TEXT's buffer was recoded, returns that buffer to R's
+ pool. TEXT itself is not freed here. */
+static void
+close_text_record (struct sfm_reader *r, struct text_record *text)
+{
+ if (text->n_warnings > MAX_TEXT_WARNINGS)
+ sys_warn (r, -1, _("Suppressed %d additional related warnings."),
+ text->n_warnings - MAX_TEXT_WARNINGS);
+ if (text->recoded)
+ pool_free (r->pool, ss_data (text->buffer));
+}
+
+/* Reads a variable=value pair from TEXT.
+ Looks up the variable in DICT and stores it into *VAR.
+ Stores a null-terminated value into *VALUE.
+ Pairs whose variable is not in DICT are skipped. Returns true once a pair
+ with a known variable has been read, false if no further name or value can
+ be read from TEXT. */
+static bool
+read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
+ struct text_record *text,
+ struct variable **var, char **value)
+{
+ for (;;)
+ {
+ if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
+ return false;
+
+ *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
+ if (*value == NULL)
+ return false;
+
+ /* Skip any run of tab or null bytes that follows the value. */
+ text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
+ ss_buffer ("\t\0", 2));
+
+ /* An unknown variable leaves *VAR null; try the next pair. */
+ if (*var != NULL)
+ return true;
+ }
+}
- /* If there was a long string previously, make sure that the
- continuations are present; otherwise make sure there aren't
- any. */
- if (long_string_count)
- {
- if (sv.type != -1)
- lose ((ME, _("%s: position %d: String variable does not have "
- "proper number of continuation records."),
- fh_get_file_name (r->fh), i));
+/* Reads a token from TEXT, delimited by any of DELIMITERS, and looks it up as
+ a variable name in DICT. If the variable exists, stores it in *VAR and
+ returns true. Returns false if no token is available or, after issuing a
+ warning, if DICT has no such variable (*VAR is then a null pointer). */
+static bool
+text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
+ struct text_record *text, struct substring delimiters,
+ struct variable **var)
+{
+ char *name;
+ name = text_get_token (text, delimiters, NULL);
+ if (name == NULL)
+ return false;
- (*var_by_idx)[i] = NULL;
- long_string_count--;
- continue;
- }
- else if (sv.type == -1)
- lose ((ME, _("%s: position %d: Superfluous long string continuation "
- "record."),
- fh_get_file_name (r->fh), i));
-
- /* Check fields for validity. */
- if (sv.type < 0 || sv.type > 255)
- lose ((ME, _("%s: position %d: Bad variable type code %d."),
- fh_get_file_name (r->fh), i, sv.type));
- if (sv.has_var_label != 0 && sv.has_var_label != 1)
- lose ((ME, _("%s: position %d: Variable label indicator field is not "
- "0 or 1."), fh_get_file_name (r->fh), i));
- if (sv.n_missing_values < -3 || sv.n_missing_values > 3
- || sv.n_missing_values == -1)
- lose ((ME, _("%s: position %d: Missing value indicator field is not "
- "-3, -2, 0, 1, 2, or 3."), fh_get_file_name (r->fh), i));
-
- /* Copy first character of variable name. */
- if (sv.name[0] == '@' || sv.name[0] == '#')
- lose ((ME, _("%s: position %d: Variable name begins with invalid "
- "character."),
- fh_get_file_name (r->fh), i));
-
- name[0] = sv.name[0];
-
- /* Copy remaining characters of variable name. */
- for (j = 1; j < SHORT_NAME_LEN; j++)
- {
- int c = (unsigned char) sv.name[j];
+ *var = dict_lookup_var (dict, name);
+ if (*var != NULL)
+ return true;
- if (c == ' ')
- break;
- else
- name[j] = c;
- }
- name[j] = 0;
+ text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
+ name);
+ return false;
+}
- if ( ! var_is_plausible_name(name, false) )
- lose ((ME, _("%s: Invalid variable name `%s' within system file."),
- fh_get_file_name (r->fh), name));
- /* Create variable. */
- vv = (*var_by_idx)[i] = dict_create_var (dict, name, sv.type);
- if (vv == NULL)
- lose ((ME, _("%s: Duplicate variable name `%s' within system file."),
- fh_get_file_name (r->fh), name));
+/* Reads a token from TEXT, delimited by any of DELIMITERS, and looks it up in
+ DICT, storing the result in *VAR. If DICT has no such variable, issues a
+ warning and stores a null pointer in *VAR. Returns false only if no token
+ is available; an unknown variable still yields true, so the caller must
+ check *VAR. */
+static bool
+text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
+ struct text_record *text, struct substring delimiters,
+ struct variable **var)
+{
+ char *short_name = text_get_token (text, delimiters, NULL);
+ if (short_name == NULL)
+ return false;
- /* Set the short name the same as the long name */
- var_set_short_name (vv, vv->name);
+ *var = dict_lookup_var (dict, short_name);
+ if (*var == NULL)
+ text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
+ short_name);
+ return true;
+}
- /* Case reading data. */
- nv = sv.type == 0 ? 1 : DIV_RND_UP (sv.type, sizeof (flt64));
- long_string_count = nv - 1;
- next_value += nv;
+/* Displays a warning, formatted from FORMAT and the arguments that follow it,
+ for TEXT's current position in R's file (TEXT's starting offset plus its
+ parse position). Only the first MAX_TEXT_WARNINGS warnings for TEXT are
+ displayed; later ones are still counted in TEXT's n_warnings but are
+ otherwise dropped. */
+static void
+text_warn (struct sfm_reader *r, struct text_record *text,
+ const char *format, ...)
+{
+ if (text->n_warnings++ < MAX_TEXT_WARNINGS)
+ {
+ va_list args;
- /* Get variable label, if any. */
- if (sv.has_var_label == 1)
- {
- /* Disk buffer. */
- int32_t len;
-
- /* Read length of label. */
- assertive_buf_read (r, &len, sizeof len, 0);
- if (r->reverse_endian)
- bswap_int32 (&len);
-
- /* Check len. */
- if (len < 0 || len > 255)
- lose ((ME, _("%s: Variable %s indicates variable label of invalid "
- "length %d."),
- fh_get_file_name (r->fh), vv->name, len));
-
- if ( len != 0 )
- {
- /* Read label into variable structure. */
- vv->label = buf_read (r, NULL, ROUND_UP (len, sizeof (int32_t)), len + 1);
- if (vv->label == NULL)
- goto error;
- vv->label[len] = '\0';
- }
- }
+ va_start (args, format);
+ sys_msg (r, text->start + text->pos, MW, format, args);
+ va_end (args);
+ }
+}
- /* Set missing values. */
- if (sv.n_missing_values != 0)
- {
- flt64 mv[3];
- int mv_cnt = abs (sv.n_missing_values);
-
- if (vv->width > MAX_SHORT_STRING)
- lose ((ME, _("%s: Long string variable %s may not have missing "
- "values."),
- fh_get_file_name (r->fh), vv->name));
-
- assertive_buf_read (r, mv, sizeof *mv * mv_cnt, 0);
-
- if (r->reverse_endian && vv->type == NUMERIC)
- for (j = 0; j < mv_cnt; j++)
- bswap_flt64 (&mv[j]);
-
- if (sv.n_missing_values > 0)
- {
- for (j = 0; j < sv.n_missing_values; j++)
- if (vv->type == NUMERIC)
- mv_add_num (&vv->miss, mv[j]);
- else
- mv_add_str (&vv->miss, (char *) &mv[j]);
- }
- else
- {
- if (vv->type == ALPHA)
- lose ((ME, _("%s: String variable %s may not have missing "
- "values specified as a range."),
- fh_get_file_name (r->fh), vv->name));
-
- if (mv[0] == r->lowest)
- mv_add_num_range (&vv->miss, LOWEST, mv[1]);
- else if (mv[1] == r->highest)
- mv_add_num_range (&vv->miss, mv[0], HIGHEST);
- else
- mv_add_num_range (&vv->miss, mv[0], mv[1]);
-
- if (sv.n_missing_values == -3)
- mv_add_num (&vv->miss, mv[2]);
- }
- }
+/* Extracts the next token from TEXT, delimited by any of DELIMITERS, using
+ ss_tokenize() to advance TEXT's position. The token is null-terminated in
+ place by overwriting the byte that follows it; if DELIMITER is nonnull, the
+ byte about to be overwritten is stored in *DELIMITER first. Returns the
+ token, which points into TEXT's buffer, or a null pointer if ss_tokenize()
+ finds no token.
+
+ NOTE(review): when the token runs to the end of the buffer, the byte
+ overwritten is the one just past the buffer's length; this presumably
+ relies on the buffer having room for a terminator -- confirm. */
+static char *
+text_get_token (struct text_record *text, struct substring delimiters,
+ char *delimiter)
+{
+ struct substring token;
+ char *end;
- if (!parse_format_spec (r, sv.print, &vv->print, vv)
- || !parse_format_spec (r, sv.write, &vv->write, vv))
- goto error;
+ if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token))
+ return NULL;
- if ( vv->width != -1)
- hsh_insert(r->var_hash, vv);
- }
+ end = &ss_data (token)[ss_length (token)];
+ if (delimiter != NULL)
+ *delimiter = *end;
+ *end = '\0';
+ return ss_data (token);
+}
- /* Some consistency checks. */
- if (long_string_count != 0)
- lose ((ME, _("%s: Long string continuation records omitted at end of "
- "dictionary."),
- fh_get_file_name (r->fh)));
+/* Reads an integer value expressed in decimal, then a space, then a string
+   that consists of exactly as many bytes as specified by the integer, then a
+   space, from TEXT.  Returns the string, null-terminated, as a subset of
+   TEXT's buffer (so the caller should not free the string).  Returns a null
+   pointer, after issuing a warning, if TEXT does not have that form. */
+static const char *
+text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
+{
+  size_t start;
+  size_t n;
+  char *s;
- if (next_value != r->value_cnt)
- corrupt_msg(MW, _("%s: System file header indicates %d variable positions but "
- "%d were read from file."),
- fh_get_file_name (r->fh), r->value_cnt, next_value);
+  start = text->pos;
+  n = 0;
+  while (text->pos < text->buffer.length)
+    {
+      int c = text->buffer.string[text->pos];
+      if (c < '0' || c > '9')
+        break;
+
+      /* The count comes from the file.  Saturate rather than letting it
+         wrap around to a small value that would pass the length check
+         below. */
+      if (n > (SIZE_MAX - (c - '0')) / 10)
+        n = SIZE_MAX;
+      else
+        n = (n * 10) + (c - '0');
+      text->pos++;
+    }
+  if (text->pos >= text->buffer.length || start == text->pos)
+    {
+      sys_warn (r, text->start,
+                _("Expecting digit at offset %zu in MRSETS record."),
+                text->pos);
+      return NULL;
+    }
+  if (!text_match (text, ' '))
+    {
+      sys_warn (r, text->start,
+                _("Expecting space at offset %zu in MRSETS record."),
+                text->pos);
+      return NULL;
+    }
- return 1;
+  /* TEXT's position cannot exceed the buffer length here, so the
+     subtraction cannot underflow, and unlike `pos + n' the comparison
+     cannot wrap around for a huge N. */
+  if (n > text->buffer.length - text->pos)
+    {
+      sys_warn (r, text->start,
+                _("%zu-byte string starting at offset %zu "
+                  "exceeds record length %zu."),
+                n, text->pos, text->buffer.length);
+      return NULL;
+    }
- error:
- return 0;
+  /* NOTE(review): when the string ends exactly at the end of the buffer,
+     s[n] is the byte just past the buffer's length; this presumably relies
+     on the buffer carrying a trailing null byte -- confirm. */
+  s = &text->buffer.string[text->pos];
+  if (s[n] != ' ')
+    {
+      sys_warn (r, text->start,
+                _("Expecting space at offset %zu following %zu-byte string."),
+                text->pos + n, n);
+      return NULL;
+    }
+  s[n] = '\0';
+  text->pos += n + 1;
+  return s;
}
-/* Translates the format spec from sysfile format to internal
- format. */
-static int
-parse_format_spec (struct sfm_reader *r, int32_t s,
- struct fmt_spec *f, const struct variable *v)
-{
- f->type = translate_fmt ((s >> 16) & 0xff);
- if (f->type == -1)
- lose ((ME, _("%s: Bad format specifier byte (%d)."),
- fh_get_file_name (r->fh), (s >> 16) & 0xff));
- f->w = (s >> 8) & 0xff;
- f->d = s & 0xff;
-
- if ((v->type == ALPHA) ^ ((formats[f->type].cat & FCAT_STRING) != 0))
- lose ((ME, _("%s: %s variable %s has %s format specifier %s."),
- fh_get_file_name (r->fh),
- v->type == ALPHA ? _("String") : _("Numeric"),
- v->name,
- formats[f->type].cat & FCAT_STRING ? _("string") : _("numeric"),
- formats[f->type].name));
-
- if (!check_output_specifier (f, false)
- || !check_specifier_width (f, v->width, false))
- {
- msg (ME, _("%s variable %s has invalid format specifier %s."),
- v->type == NUMERIC ? _("Numeric") : _("String"),
- v->name, fmt_to_string (f));
- *f = v->type == NUMERIC ? f8_2 : make_output_format (FMT_A, v->width, 0);
+/* If the next byte in TEXT is C, advances past it and returns true.
+   Otherwise, including when TEXT is already at its end, returns false
+   without advancing. */
+static bool
+text_match (struct text_record *text, char c)
+{
+  /* Never look at a byte beyond the end of the record. */
+  if (text->pos >= text->buffer.length)
+    return false;
+
+  if (text->buffer.string[text->pos] == c)
+    {
+      text->pos++;
+      return true;
}
- return 1;
+  else
+    return false;
+}
- error:
- return 0;
+/* Returns the current byte offset inside TEXT's buffer, that is, the parse
+ position. If the buffer was converted to UTF-8, the offset counts bytes
+ of the converted text. */
+static size_t
+text_pos (const struct text_record *text)
+{
+ return text->pos;
}
-/* Reads value labels from sysfile H and inserts them into the
- associated dictionary. */
-int
-read_value_labels (struct sfm_reader *r,
- struct dictionary *dict, struct variable **var_by_idx)
+/* Returns a pointer to the start of TEXT's buffer, regardless of how far
+ parsing has advanced. The pointer refers to TEXT's own storage. */
+static const char *
+text_get_all (const struct text_record *text)
{
- struct label
- {
- char raw_value[8]; /* Value as uninterpreted bytes. */
- union value value; /* Value. */
- char *label; /* Null-terminated label string. */
- };
+ return text->buffer.string;
+}
+\f
+/* Messages. */
- struct label *labels = NULL;
- int32_t n_labels; /* Number of labels. */
+/* Displays a corruption message of class CLASS about R's file. The message
+ text is the file name, then OFFSET in hexadecimal if OFFSET is nonnegative,
+ then FORMAT expanded with ARGS. The message carries no source file name,
+ line, or column information.
+
+ NOTE(review): TEXT is not destroyed here; presumably msg_emit() takes over
+ the string returned by ds_cstr() -- confirm. */
+static void
+sys_msg (struct sfm_reader *r, off_t offset,
+ int class, const char *format, va_list args)
+{
+ struct msg m;
+ struct string text;
- struct variable **var = NULL; /* Associated variables. */
- int32_t n_vars; /* Number of associated variables. */
+ ds_init_empty (&text);
+ if (offset >= 0)
+ ds_put_format (&text, _("`%s' near offset 0x%llx: "),
+ fh_get_file_name (r->fh), (long long int) offset);
+ else
+ ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
+ ds_put_vformat (&text, format, args);
- int i;
+ m.category = msg_class_to_category (class);
+ m.severity = msg_class_to_severity (class);
+ m.file_name = NULL;
+ m.first_line = 0;
+ m.last_line = 0;
+ m.first_column = 0;
+ m.last_column = 0;
+ m.text = ds_cstr (&text);
- /* First step: read the contents of the type 3 record and record its
- contents. Note that we can't do much with the data since we
- don't know yet whether it is of numeric or string type. */
+ msg_emit (&m);
+}
- /* Read number of labels. */
- assertive_buf_read (r, &n_labels, sizeof n_labels, 0);
- if (r->reverse_endian)
- bswap_int32 (&n_labels);
+/* Displays a warning, formatted from FORMAT and the arguments that follow it,
+ for offset OFFSET in R's file. A negative OFFSET omits the offset from the
+ message. */
+static void
+sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
+{
+ va_list args;
- if ( n_labels >= ((int32_t) ~0) / sizeof *labels)
- {
- corrupt_msg(MW, _("%s: Invalid number of labels: %d. Ignoring labels."),
- fh_get_file_name (r->fh), n_labels);
- n_labels = 0;
- }
+ va_start (args, format);
+ sys_msg (r, offset, MW, format, args);
+ va_end (args);
+}
+
+/* Displays an error, formatted from FORMAT and the arguments that follow it,
+ for offset OFFSET in R's file, and marks R as being in an error state. A
+ negative OFFSET omits the offset from the message. */
+static void
+sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
+{
+ va_list args;
- /* Allocate memory. */
- labels = xcalloc (n_labels, sizeof *labels);
- for (i = 0; i < n_labels; i++)
- labels[i].label = NULL;
+ va_start (args, format);
+ sys_msg (r, offset, ME, format, args);
+ va_end (args);
- /* Read each value/label tuple into labels[]. */
- for (i = 0; i < n_labels; i++)
+ r->error = true;
+}
+\f
+/* Reads BYTE_CNT bytes into BUF and advances R's position by the number of
+ bytes actually read.
+ Returns 1 if exactly BYTE_CNT bytes are successfully read.
+ Returns -1, after reporting an error, if an I/O error or a partial read
+ occurs, or if end-of-file is reached immediately and EOF_IS_OK is false.
+ Returns 0, without reporting an error, for an immediate end-of-file when
+ EOF_IS_OK is true. */
+static inline int
+read_bytes_internal (struct sfm_reader *r, bool eof_is_ok,
+ void *buf, size_t byte_cnt)
+{
+ size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
+ r->pos += bytes_read;
+ if (bytes_read == byte_cnt)
+ return 1;
+ else if (ferror (r->file))
{
- struct label *label = labels + i;
- unsigned char label_len;
- size_t padded_len;
+ sys_error (r, r->pos, _("System error: %s."), strerror (errno));
+ return -1;
+ }
+ else if (!eof_is_ok || bytes_read != 0)
+ {
+ sys_error (r, r->pos, _("Unexpected end of file."));
+ return -1;
+ }
+ else
+ return 0;
+}
- /* Read value. */
- assertive_buf_read (r, label->raw_value, sizeof label->raw_value, 0);
+/* Reads BYTE_CNT bytes into BUF.
+ Returns true if successful.
+ Returns false upon I/O error or if end-of-file is encountered, even
+ immediately; read_bytes_internal() reports an error in those cases. */
+static bool
+read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
+{
+ return read_bytes_internal (r, false, buf, byte_cnt) == 1;
+}
- /* Read label length. */
- assertive_buf_read (r, &label_len, sizeof label_len, 0);
- padded_len = ROUND_UP (label_len + 1, sizeof (flt64));
+/* Reads BYTE_CNT bytes into BUF, treating an immediate end-of-file as a
+ normal outcome rather than an error.
+ Returns 1 if exactly BYTE_CNT bytes are successfully read.
+ Returns 0 if an immediate end-of-file is encountered.
+ Returns -1 if an I/O error or a partial read occurs. */
+static int
+try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
+{
+ return read_bytes_internal (r, true, buf, byte_cnt);
+}
- /* Read label, padding. */
- label->label = xmalloc (padded_len + 1);
- assertive_buf_read (r, label->label, padded_len - 1, 0);
- label->label[label_len] = 0;
- }
+/* Reads a 32-bit signed integer from R, converts it from R's integer format
+   to host format, and stores it in *X.  Returns true if successful; returns
+   false, leaving *X untouched, if the four bytes cannot be read. */
+static bool
+read_int (struct sfm_reader *r, int *x)
+{
+  uint8_t raw[4];
+
+  if (!read_bytes (r, raw, sizeof raw))
+    return false;
+
+  *x = integer_get (r->integer_format, raw, sizeof raw);
+  return true;
+}
- /* Second step: Read the type 4 record that has the list of
- variables to which the value labels are to be applied. */
+/* Reads a 32-bit unsigned integer from R and stores its value in host format
+   in *X.  Returns true if successful, otherwise false, in which case *X is
+   set to 0. */
+static bool
+read_uint (struct sfm_reader *r, unsigned int *x)
+{
+  bool ok;
+
+  /* read_int() leaves its output untouched on failure, so Y needs a defined
+     value before it is copied to *X below. */
+  int y = 0;
- /* Read record type of type 4 record. */
- {
- int32_t rec_type;
-
- assertive_buf_read (r, &rec_type, sizeof rec_type, 0);
- if (r->reverse_endian)
- bswap_int32 (&rec_type);
-
- if (rec_type != 4)
- lose ((ME, _("%s: Variable index record (type 4) does not immediately "
- "follow value label record (type 3) as it should."),
- fh_get_file_name (r->fh)));
- }
+  ok = read_int (r, &y);
+  *x = y;
+  return ok;
+}
- /* Read number of variables associated with value label from type 4
- record. */
- assertive_buf_read (r, &n_vars, sizeof n_vars, 0);
- if (r->reverse_endian)
- bswap_int32 (&n_vars);
- if (n_vars < 1 || n_vars > dict_get_var_cnt (dict))
- lose ((ME, _("%s: Number of variables associated with a value label (%d) "
- "is not between 1 and the number of variables (%d)."),
- fh_get_file_name (r->fh), n_vars, dict_get_var_cnt (dict)));
-
- /* Read the list of variables. */
- var = xnmalloc (n_vars, sizeof *var);
- for (i = 0; i < n_vars; i++)
- {
- int32_t var_idx;
- struct variable *v;
-
- /* Read variable index, check range. */
- assertive_buf_read (r, &var_idx, sizeof var_idx, 0);
- if (r->reverse_endian)
- bswap_int32 (&var_idx);
- if (var_idx < 1 || var_idx > r->value_cnt)
- lose ((ME, _("%s: Variable index associated with value label (%d) is "
- "not between 1 and the number of values (%d)."),
- fh_get_file_name (r->fh), var_idx, r->value_cnt));
-
- /* Make sure it's a real variable. */
- v = var_by_idx[var_idx - 1];
- if (v == NULL)
- lose ((ME, _("%s: Variable index associated with value label (%d) "
- "refers to a continuation of a string variable, not to "
- "an actual variable."),
- fh_get_file_name (r->fh), var_idx));
- if (v->type == ALPHA && v->width > MAX_SHORT_STRING)
- lose ((ME, _("%s: Value labels are not allowed on long string "
- "variables (%s)."),
- fh_get_file_name (r->fh), v->name));
-
- /* Add it to the list of variables. */
- var[i] = v;
- }
-
- /* Type check the variables. */
- for (i = 1; i < n_vars; i++)
- if (var[i]->type != var[0]->type)
- lose ((ME, _("%s: Variables associated with value label are not all of "
- "identical type. Variable %s has %s type, but variable "
- "%s has %s type."),
- fh_get_file_name (r->fh),
- var[0]->name, var[0]->type == ALPHA ? _("string") : _("numeric"),
- var[i]->name, var[i]->type == ALPHA ? _("string") : _("numeric")));
-
- /* Fill in labels[].value, now that we know the desired type. */
- for (i = 0; i < n_labels; i++)
- {
- struct label *label = labels + i;
-
- if (var[0]->type == ALPHA)
- {
- const int copy_len = min (sizeof label->raw_value,
- sizeof label->label);
- memcpy (label->value.s, label->raw_value, copy_len);
- } else {
- flt64 f;
- assert (sizeof f == sizeof label->raw_value);
- memcpy (&f, label->raw_value, sizeof f);
- if (r->reverse_endian)
- bswap_flt64 (&f);
- label->value.f = f;
- }
- }
-
- /* Assign the value_label's to each variable. */
- for (i = 0; i < n_vars; i++)
- {
- struct variable *v = var[i];
- int j;
+/* Reads a 64-bit signed integer from R, converts it from R's integer format
+   to host format, and stores it in *X.  Returns true if successful; returns
+   false, leaving *X untouched, if the eight bytes cannot be read. */
+static bool
+read_int64 (struct sfm_reader *r, long long int *x)
+{
+  uint8_t raw[8];
+
+  if (!read_bytes (r, raw, sizeof raw))
+    return false;
+
+  *x = integer_get (r->integer_format, raw, sizeof raw);
+  return true;
+}
- /* Add each label to the variable. */
- for (j = 0; j < n_labels; j++)
- {
- struct label *label = labels + j;
- if (!val_labs_replace (v->val_labs, label->value, label->label))
- continue;
-
- if (var[0]->type == NUMERIC)
- msg (MW, _("%s: File contains duplicate label for value %g for "
- "variable %s."),
- fh_get_file_name (r->fh), label->value.f, v->name);
- else
- msg (MW, _("%s: File contains duplicate label for value `%.*s' "
- "for variable %s."),
- fh_get_file_name (r->fh), v->width, label->value.s, v->name);
- }
- }
+/* Reads a 64-bit unsigned integer from R and stores its value in host format
+   in *X.  Returns true if successful, otherwise false, in which case *X is
+   set to 0. */
+static bool
+read_uint64 (struct sfm_reader *r, unsigned long long int *x)
+{
+  /* read_int64() leaves its output untouched on failure, so Y needs a
+     defined value before it is copied to *X below. */
+  long long int y = 0;
+  bool ok;
- for (i = 0; i < n_labels; i++)
- free (labels[i].label);
- free (labels);
- free (var);
- return 1;
+  ok = read_int64 (r, &y);
+  *x = y;
+  return ok;
+}
- error:
- if (labels)
- {
- for (i = 0; i < n_labels; i++)
- free (labels[i].label);
- free (labels);
- }
- free (var);
- return 0;
+/* Returns the 4-byte integer found OFS bytes into DATA, converted from R's
+ integer format. The caller must ensure that those 4 bytes exist; no bounds
+ check is made here. */
+static int
+parse_int (const struct sfm_reader *r, const void *data, size_t ofs)
+{
+ return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
}
-/* Reads BYTE_CNT bytes from the file represented by H. If BUF is
- non-NULL, uses that as the buffer; otherwise allocates at least
- MIN_ALLOC bytes. Returns a pointer to the buffer on success, NULL
- on failure. */
-static void *
-buf_read (struct sfm_reader *r, void *buf, size_t byte_cnt, size_t min_alloc)
+/* Returns, as a double, the floating-point number found OFS bytes into DATA,
+ interpreted in R's floating-point format. No bounds check is made here. */
+static double
+parse_float (const struct sfm_reader *r, const void *data, size_t ofs)
{
- assert (r);
+ return float_get_double (r->float_format, (const uint8_t *) data + ofs);
+}
- if (buf == NULL && byte_cnt > 0 )
- buf = xmalloc (max (byte_cnt, min_alloc));
+/* Reads exactly SIZE - 1 bytes from R into BUFFER and null-terminates them
+   by storing a null byte into BUFFER[SIZE - 1].  SIZE must be positive.
+   Returns true if successful; on failure returns false without storing the
+   terminator. */
+static bool
+read_string (struct sfm_reader *r, char *buffer, size_t size)
+{
- if ( byte_cnt == 0 )
- return buf;
+  assert (size > 0);
+  if (!read_bytes (r, buffer, size - 1))
+    return false;
+
+  buffer[size - 1] = '\0';
+  return true;
+}
-
- if (1 != fread (buf, byte_cnt, 1, r->file))
+/* Skips BYTES bytes forward in R by reading them, in chunks of at most 1024
+ bytes, into a scratch buffer that is then discarded. Returns true if all
+ BYTES bytes were read, false if any read fails. */
+static bool
+skip_bytes (struct sfm_reader *r, size_t bytes)
+{
+ while (bytes > 0)
{
- if (ferror (r->file))
- msg (ME, _("%s: Reading system file: %s."),
- fh_get_file_name (r->fh), strerror (errno));
- else
- corrupt_msg (ME, _("%s: Unexpected end of file."),
- fh_get_file_name (r->fh));
- r->ok = false;
- return NULL;
+ char buffer[1024];
+ size_t chunk = MIN (sizeof buffer, bytes);
+ if (!read_bytes (r, buffer, chunk))
+ return false;
+ bytes -= chunk;
}
- return buf;
+ return true;
}
-/* Winds the reader BYTE_CNT bytes back in the reader stream. */
-void
-buf_unread(struct sfm_reader *r, size_t byte_cnt)
+/* Returns a malloc()'d copy of S in which all lone CRs and CR LF pairs have
+ been replaced by LFs. The caller owns the returned string.
+
+ (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system
+ files that use CR-only line ends in the file label and extra product
+ info.) */
+static char *
+fix_line_ends (const char *s)
{
- assert(byte_cnt > 0);
+ char *dst, *d;
- if ( 0 != fseek(r->file, -byte_cnt, SEEK_CUR))
+ /* The result is never longer than S, so S's length plus a terminator is
+ always enough room. */
+ d = dst = xmalloc (strlen (s) + 1);
+ while (*s != '\0')
{
- msg (ME, _("%s: Seeking system file: %s."),
- fh_get_file_name (r->fh), strerror (errno));
+ if (*s == '\r')
+ {
+ /* Consume the CR and, if one follows, the LF of a CR LF pair, and
+ emit a single LF in their place. */
+ s++;
+ if (*s == '\n')
+ s++;
+ *d++ = '\n';
+ }
+ else
+ *d++ = *s++;
}
+ *d = '\0';
+
+ return dst;
}
+\f
+static bool
+read_ztrailer (struct sfm_reader *r,
+ long long int zheader_ofs,
+ long long int ztrailer_len);
-/* Reads a document record, type 6, from system file R, and sets up
- the documents and n_documents fields in the associated
- dictionary. */
-static int
-read_documents (struct sfm_reader *r, struct dictionary *dict)
+/* Allocates ITEMS * SIZE bytes from the pool POOL_ and returns the new block.
+ Returns Z_NULL instead if SIZE is zero or if xalloc_oversized() reports
+ that the product would be too large.
+
+ NOTE(review): the signature and the Z_NULL result suggest that this is
+ installed as zlib's allocation callback, with the pool as its opaque
+ argument -- confirm at the point of use. */
+static void *
+zalloc (voidpf pool_, uInt items, uInt size)
{
- int32_t line_cnt;
- char *documents;
-
- if (dict_get_documents (dict) != NULL)
- lose ((ME, _("%s: System file contains multiple "
- "type 6 (document) records."),
- fh_get_file_name (r->fh)));
+ struct pool *pool = pool_;
- assertive_buf_read (r, &line_cnt, sizeof line_cnt, 0);
- if (line_cnt <= 0)
- lose ((ME, _("%s: Number of document lines (%ld) "
- "must be greater than 0."),
- fh_get_file_name (r->fh), (long) line_cnt));
+ return (!size || xalloc_oversized (items, size)
+ ? Z_NULL
+ : pool_malloc (pool, items * size));
+}
- documents = buf_read (r, NULL, 80 * line_cnt, line_cnt * 80 + 1);
- /* FIXME? Run through asciify. */
- if (documents == NULL)
- return 0;
- documents[80 * line_cnt] = '\0';
- dict_set_documents (dict, documents);
- free (documents);
- return 1;
+/* Returns the block at ADDRESS to the pool POOL_.
+
+ NOTE(review): presumably the zlib deallocation callback that pairs with
+ zalloc() -- confirm at the point of use. */
+static void
+zfree (voidpf pool_, voidpf address)
+{
+ struct pool *pool = pool_;
- error:
- return 0;
+ pool_free (pool, address);
}
-\f
-/* Data reader. */
-/* Reads compressed data into H->BUF and sets other pointers
- appropriately. Returns nonzero only if both no errors occur and
- data was read. */
-static int
-buffer_input (struct sfm_reader *r)
+/* Reads the ZLIB data header at R's current position: three 64-bit
+ integers giving the header's own offset, the trailer's offset, and the
+ trailer's length.  Validates them, records the trailer offset in R,
+ checks the trailer with read_ztrailer(), allocates the inflate buffers
+ from R's pool on first use, and opens the ZLIB stream.
+ Returns true if successful, false otherwise. */
+static bool
+read_zheader (struct sfm_reader *r)
{
- size_t amt;
-
- if (!r->ok)
+ off_t pos = r->pos;
+ long long int zheader_ofs;
+ long long int ztrailer_ofs;
+ long long int ztrailer_len;
+
+ if (!read_int64 (r, &zheader_ofs)
+ || !read_int64 (r, &ztrailer_ofs)
+ || !read_int64 (r, &ztrailer_len))
return false;
- if (r->buf == NULL)
- r->buf = xnmalloc (128, sizeof *r->buf);
- amt = fread (r->buf, sizeof *r->buf, 128, r->file);
- if (ferror (r->file))
+
+ /* The header stores its own file offset, which must equal the position
+ at which we started reading it. */
+ if (zheader_ofs != pos)
{
- msg (ME, _("%s: Error reading file: %s."),
- fh_get_file_name (r->fh), strerror (errno));
- r->ok = false;
- return 0;
+ sys_error (r, pos, _("Wrong ZLIB data header offset %#llx "
+ "(expected %#llx)."),
+ zheader_ofs, (long long int) pos);
+ return false;
}
- r->ptr = r->buf;
- r->end = &r->buf[amt];
- return amt;
-}
-/* Reads a single case consisting of compressed data from system
- file H into the array BUF[] according to reader R, and
- returns nonzero only if successful. */
-/* Data in system files is compressed in this manner. Data
- values are grouped into sets of eight ("octets"). Each value
- in an octet has one instruction byte that are output together.
- Each instruction byte gives a value for that byte or indicates
- that the value can be found following the instructions. */
-static int
-read_compressed_data (struct sfm_reader *r, flt64 *buf)
-{
- const unsigned char *p_end = r->x + sizeof (flt64);
- unsigned char *p = r->y;
-
- const flt64 *buf_beg = buf;
- const flt64 *buf_end = &buf[r->value_cnt];
+ /* The trailer cannot start before the end of the header just read. */
+ if (ztrailer_ofs < r->pos)
+ {
+ sys_error (r, pos, _("Impossible ZLIB trailer offset 0x%llx."),
+ ztrailer_ofs);
+ return false;
+ }
- for (;;)
+ /* The trailer length must be a positive multiple of 24 bytes. */
+ if (ztrailer_len < 24 || ztrailer_len % 24)
{
- for (; p < p_end; p++){
- switch (*p)
- {
- case 0:
- /* Code 0 is ignored. */
- continue;
- case 252:
- /* Code 252 is end of file. */
- if (buf_beg == buf)
- return 0;
- lose ((ME, _("%s: Compressed data is corrupted. Data ends "
- "in partial case."),
- fh_get_file_name (r->fh)));
- case 253:
- /* Code 253 indicates that the value is stored explicitly
- following the instruction bytes. */
- if (r->ptr == NULL || r->ptr >= r->end)
- if (!buffer_input (r))
- lose ((ME, _("%s: Unexpected end of file."),
- fh_get_file_name (r->fh)));
- memcpy (buf++, r->ptr++, sizeof *buf);
- if (buf >= buf_end)
- goto success;
- break;
- case 254:
- /* Code 254 indicates a string that is all blanks. */
- memset (buf++, ' ', sizeof *buf);
- if (buf >= buf_end)
- goto success;
- break;
- case 255:
- /* Code 255 indicates the system-missing value. */
- *buf = r->sysmis;
- if (r->reverse_endian)
- bswap_flt64 (buf);
- buf++;
- if (buf >= buf_end)
- goto success;
- break;
- default:
- /* Codes 1 through 251 inclusive are taken to indicate a
- value of (BYTE - BIAS), where BYTE is the byte's value
- and BIAS is the compression bias (generally 100.0). */
- *buf = *p - r->bias;
- if (r->reverse_endian)
- bswap_flt64 (buf);
- buf++;
- if (buf >= buf_end)
- goto success;
- break;
- }
- }
- /* We have reached the end of this instruction octet. Read
- another. */
- if (r->ptr == NULL || r->ptr >= r->end)
- {
- if (!buffer_input (r))
- {
- if (buf_beg != buf)
- lose ((ME, _("%s: Unexpected end of file."),
- fh_get_file_name (r->fh)));
- else
- return 0;
- }
- }
- memcpy (r->x, r->ptr++, sizeof *buf);
- p = r->x;
+ sys_error (r, pos, _("Invalid ZLIB trailer length %lld."), ztrailer_len);
+ return false;
}
- abort ();
+ r->ztrailer_ofs = ztrailer_ofs;
+ if (!read_ztrailer (r, zheader_ofs, ztrailer_len))
+ return false;
- success:
- /* We have filled up an entire record. Update state and return
- successfully. */
- r->y = ++p;
- return 1;
+ /* Allocate the input and output buffers from R's pool only once, and
+ start with no pending compressed input. */
+ if (r->zin_buf == NULL)
+ {
+ r->zin_buf = pool_malloc (r->pool, ZIN_BUF_SIZE);
+ r->zout_buf = pool_malloc (r->pool, ZOUT_BUF_SIZE);
+ r->zstream.next_in = NULL;
+ r->zstream.avail_in = 0;
+ }
- error:
- /* I/O error. */
- r->ok = false;
- return 0;
-}
+ /* Have ZLIB allocate through zalloc()/zfree() with R's pool as the
+ opaque argument. */
+ r->zstream.zalloc = zalloc;
+ r->zstream.zfree = zfree;
+ r->zstream.opaque = r->pool;
+ return open_zstream (r);
+}
-static int
-compare_var_index(const void *_v1, const void *_v2, void *aux UNUSED)
+/* Repositions R's file to absolute byte OFFSET and records OFFSET as R's
+ current position.  A failed fseeko() is reported through sys_error(), but
+ r->pos is still set to OFFSET and no status is returned.
+ NOTE(review): confirm that callers notice a failed seek some other way. */
+static void
+seek (struct sfm_reader *r, off_t offset)
{
- const struct variable *const *v1 = _v1;
- const struct variable *const *v2 = _v2;
+ if (fseeko (r->file, offset, SEEK_SET))
+ sys_error (r, 0, _("%s: seek failed (%s)."),
+ fh_get_file_name (r->fh), strerror (errno));
+ r->pos = offset;
+}
- if ( (*v1)->index < (*v2)->index)
- return -1;
+/* Performs some additional consistency checks on the ZLIB compressed data
+ trailer, which starts at r->ztrailer_ofs and is ZTRAILER_LEN bytes long.
+ ZHEADER_OFS is the file offset of the ZLIB data header.
+
+ If R's file is not a regular file the checks are skipped and true is
+ returned without moving the file position.  Otherwise this seeks to the
+ trailer, verifies its fixed fields and every block descriptor, and then
+ seeks to ZHEADER_OFS + 24 before returning.
+
+ Returns true if successful, false if an error was detected. */
+static bool
+read_ztrailer (struct sfm_reader *r,
+ long long int zheader_ofs,
+ long long int ztrailer_len)
+{
+ long long int expected_uncmp_ofs;
+ long long int expected_cmp_ofs;
+ long long int bias;
+ long long int zero;
+ unsigned int block_size;
+ unsigned int n_blocks;
+ unsigned int i;
+ struct stat s;
+
+ if (fstat (fileno (r->file), &s))
+ {
+ /* Report through the reader R, like every other sys_error() call. */
+ sys_error (r, 0, _("%s: stat failed (%s)."),
+ fh_get_file_name (r->fh), strerror (errno));
+ return false;
+ }
- return ( (*v1)->index > (*v2)->index) ;
-}
+ if (!S_ISREG (s.st_mode))
+ {
+ /* We can't seek to the trailer and then back to the data in this file,
+ so skip doing extra checks. */
+ return true;
+ }
+ if (r->ztrailer_ofs + ztrailer_len != s.st_size)
+ sys_warn (r, r->pos,
+ _("End of ZLIB trailer (0x%llx) is not file size (0x%llx)."),
+ r->ztrailer_ofs + ztrailer_len, (long long int) s.st_size);
-/* Reads one case from READER's file into C. Returns nonzero
- only if successful. */
-int
-sfm_read_case (struct sfm_reader *r, struct ccase *c)
-{
- if (!r->ok)
- return 0;
+ seek (r, r->ztrailer_ofs);
- if ( ! r->svars )
+ /* Read fixed header from ZLIB data trailer. */
+ if (!read_int64 (r, &bias))
+ return false;
+ if (-bias != r->bias)
{
- r->svars = (struct variable **) hsh_data(r->var_hash);
- sort(r->svars, hsh_count(r->var_hash),
- sizeof(*r->svars), compare_var_index, 0);
+ sys_error (r, r->pos, _("ZLIB trailer bias (%lld) differs from "
+ "file header bias (%.2f)."),
+ -bias, r->bias);
+ return false;
}
- if (!r->compressed && sizeof (flt64) == sizeof (double) && ! r->has_vls)
+ if (!read_int64 (r, &zero))
+ return false;
+ if (zero != 0)
+ sys_warn (r, r->pos,
+ _("ZLIB trailer \"zero\" field has nonzero value %lld."), zero);
+
+ if (!read_uint (r, &block_size))
+ return false;
+ if (block_size != ZBLOCK_SIZE)
+ sys_warn (r, r->pos,
+ _("ZLIB trailer specifies unexpected %u-byte block size."),
+ block_size);
+
+ if (!read_uint (r, &n_blocks))
+ return false;
+ if (n_blocks != (ztrailer_len - 24) / 24)
{
- /* Fast path: external and internal representations are the
- same, except possibly for endianness or SYSMIS. Read
- directly into the case's buffer, then fix up any minor
- details as needed. */
- if (!fread_ok (r, case_data_all_rw (c),
- sizeof (union value) * r->value_cnt))
- return 0;
+ sys_error (r, r->pos,
+ _("%lld-byte ZLIB trailer specifies %u data blocks (expected "
+ "%lld)."),
+ ztrailer_len, n_blocks, (ztrailer_len - 24) / 24);
+ return false;
+ }
- /* Fix up endianness if needed. */
- if (r->reverse_endian)
+ /* Walk the block descriptors, checking that each block starts exactly
+ where the previous one ended, in both uncompressed and compressed
+ terms. */
+ expected_uncmp_ofs = zheader_ofs;
+ expected_cmp_ofs = zheader_ofs + 24;
+ for (i = 0; i < n_blocks; i++)
+ {
+ off_t desc_ofs = r->pos;
+ unsigned long long int uncompressed_ofs;
+ unsigned long long int compressed_ofs;
+ unsigned int uncompressed_size;
+ unsigned int compressed_size;
+
+ if (!read_uint64 (r, &uncompressed_ofs)
+ || !read_uint64 (r, &compressed_ofs)
+ || !read_uint (r, &uncompressed_size)
+ || !read_uint (r, &compressed_size))
+ return false;
+
+ if (uncompressed_ofs != expected_uncmp_ofs)
{
- int i;
-
- for (i = 0; i < hsh_count(r->var_hash); i++)
- {
- struct variable *v = r->svars[i];
- if (v->width == 0)
- bswap_flt64 (&case_data_rw (c, v->fv)->f);
- }
+ sys_error (r, desc_ofs,
+ _("ZLIB block descriptor %u reported uncompressed data "
+ "offset %#llx, when %#llx was expected."),
+ i, uncompressed_ofs, expected_uncmp_ofs);
+ return false;
}
- /* Fix up SYSMIS values if needed.
- I don't think this will ever actually kick in, but it
- can't hurt. */
- if (r->sysmis != SYSMIS)
+ if (compressed_ofs != expected_cmp_ofs)
{
- int i;
- for (i = 0; i < hsh_count(r->var_hash); i++)
- {
- struct variable *v = r->svars[i];
- if (v->width == 0 && case_num (c, i) == r->sysmis)
- case_data_rw (c, v->fv)->f = SYSMIS;
- }
+ sys_error (r, desc_ofs,
+ _("ZLIB block descriptor %u reported compressed data "
+ "offset %#llx, when %#llx was expected."),
+ i, compressed_ofs, expected_cmp_ofs);
+ return false;
}
- }
- else
- {
- /* Slow path: internal and external representations differ.
- Read into a bounce buffer, then copy to C. */
- flt64 *bounce;
- flt64 *bounce_cur;
- size_t bounce_size;
- int read_ok;
- int i;
-
- bounce_size = sizeof *bounce * r->value_cnt;
- bounce = bounce_cur = local_alloc (bounce_size);
-
- memset(bounce, 0, bounce_size);
- if (!r->compressed)
- read_ok = fread_ok (r, bounce, bounce_size);
+ /* Every block but the last should be exactly one block long; the last
+ may be shorter. */
+ if (i < n_blocks - 1)
+ {
+ if (uncompressed_size != block_size)
+ sys_warn (r, desc_ofs,
+ _("ZLIB block descriptor %u reported block size %#x, "
+ "when %#x was expected."),
+ i, uncompressed_size, block_size);
+ }
else
- read_ok = read_compressed_data (r, bounce);
- if (!read_ok)
{
- local_free (bounce);
- return 0;
+ if (uncompressed_size > block_size)
+ sys_warn (r, desc_ofs,
+ _("ZLIB block descriptor %u reported block size %#x, "
+ "when at most %#x was expected."),
+ i, uncompressed_size, block_size);
}
- for (i = 0; i < hsh_count(r->var_hash); i++)
+ /* http://www.zlib.net/zlib_tech.html says that the maximum expansion
+ from compression, with worst-case parameters, is 13.5% plus 11 bytes.
+ This code checks for an expansion of more than 14.3% plus 11
+ bytes. */
+ if (compressed_size > uncompressed_size + uncompressed_size / 7 + 11)
{
- struct variable *tv = r->svars[i];
+ sys_error (r, desc_ofs,
+ _("ZLIB block descriptor %u reports compressed size %u "
+ "and uncompressed size %u."),
+ i, compressed_size, uncompressed_size);
+ return false;
+ }
- if (tv->width == 0)
- {
- flt64 f = *bounce_cur++;
- if (r->reverse_endian)
- bswap_flt64 (&f);
- case_data_rw (c, tv->fv)->f = f == r->sysmis ? SYSMIS : f;
- }
- else if (tv->width != -1)
- {
- flt64 *bc_start = bounce_cur;
- int ofs = 0;
- while (ofs < tv->width )
- {
- const int chunk = MIN (MAX_LONG_STRING, tv->width - ofs);
- memcpy (case_data_rw (c, tv->fv)->s + ofs, bounce_cur, chunk);
+ expected_uncmp_ofs += uncompressed_size;
+ expected_cmp_ofs += compressed_size;
+ }
- bounce_cur += DIV_RND_UP (chunk, sizeof (flt64));
+ /* The compressed blocks must end exactly where the trailer begins. */
+ if (expected_cmp_ofs != r->ztrailer_ofs)
+ {
+ sys_error (r, r->pos, _("ZLIB trailer is at offset %#llx but %#llx "
+ "would be expected from block descriptors."),
+ r->ztrailer_ofs, expected_cmp_ofs);
+ return false;
+ }
- ofs += chunk;
- }
- bounce_cur = bc_start + width_to_bytes(tv->width) / sizeof(flt64);
- }
- }
+ seek (r, zheader_ofs + 24);
+ return true;
+}
+
+/* Marks R's inflated-data buffer as empty and initializes R's ZLIB stream
+   for decompression.  Returns true if successful; otherwise reports the
+   failure through sys_error() and returns false. */
+static bool
+open_zstream (struct sfm_reader *r)
+{
+  r->zout_end = 0;
+  r->zout_pos = 0;
+
+  if (inflateInit (&r->zstream) == Z_OK)
+    return true;
+
+  sys_error (r, r->pos, _("ZLIB initialization failed (%s)."),
+             r->zstream.msg);
+  return false;
+}
+
+/* Ends R's ZLIB stream with inflateEnd().  Returns true if that succeeds;
+ otherwise reports the failure through sys_error() and returns false. */
+static bool
+close_zstream (struct sfm_reader *r)
+{
+ int error;
- local_free (bounce);
+ error = inflateEnd (&r->zstream);
+ if (error != Z_OK)
+ {
+ sys_error (r, r->pos, _("Inconsistency at end of ZLIB stream (%s)."),
+ r->zstream.msg);
+ return false;
}
- return 1;
+ return true;
}
+/* Copies BYTE_CNT bytes of inflated data from R into BUF_, refilling the
+ compressed input buffer from the file and inflating as needed.
+
+ Returns 1 once all BYTE_CNT bytes have been copied (immediately if
+ BYTE_CNT is 0).  Returns 0 if more data is needed but the file position
+ has reached r->ztrailer_ofs, even if part of BUF_ was already filled.
+ Returns -1 after reporting a ZLIB error.  If try_read_bytes() returns
+ anything other than 1, that value is returned unchanged. */
static int
-fread_ok (struct sfm_reader *r, void *buffer, size_t byte_cnt)
+read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t byte_cnt)
{
- size_t read_bytes = fread (buffer, 1, byte_cnt, r->file);
+ uint8_t *buf = buf_;
- if (read_bytes == byte_cnt)
+ if (byte_cnt == 0)
return 1;
- else
+
+ for (;;)
{
- if (ferror (r->file))
+ int error;
+
+ /* Use already inflated data if there is any. */
+ if (r->zout_pos < r->zout_end)
{
- msg (ME, _("%s: Reading system file: %s."),
- fh_get_file_name (r->fh), strerror (errno));
- r->ok = false;
+ unsigned int n = MIN (byte_cnt, r->zout_end - r->zout_pos);
+ memcpy (buf, &r->zout_buf[r->zout_pos], n);
+ r->zout_pos += n;
+ byte_cnt -= n;
+ buf += n;
+
+ if (byte_cnt == 0)
+ return 1;
}
- else if (read_bytes != 0)
+
+ /* We need to inflate some more data.
+ Get some more input data if we don't have any. */
+ if (r->zstream.avail_in == 0)
{
- msg (ME, _("%s: Partial record at end of system file."),
- fh_get_file_name (r->fh));
- r->ok = false;
+ /* Never read past the start of the ZLIB trailer. */
+ unsigned int n = MIN (ZIN_BUF_SIZE, r->ztrailer_ofs - r->pos);
+ if (n == 0)
+ return 0;
+ else
+ {
+ int retval = try_read_bytes (r, r->zin_buf, n);
+ if (retval != 1)
+ return retval;
+ r->zstream.avail_in = n;
+ r->zstream.next_in = r->zin_buf;
+ }
+ }
+
+ /* Inflate the (remaining) input data. */
+ r->zstream.avail_out = ZOUT_BUF_SIZE;
+ r->zstream.next_out = r->zout_buf;
+ error = inflate (&r->zstream, Z_SYNC_FLUSH);
+ r->zout_pos = 0;
+ r->zout_end = r->zstream.next_out - r->zout_buf;
+ if (r->zout_end == 0)
+ {
+ if (error != Z_STREAM_END)
+ {
+ sys_error (r, r->pos, _("ZLIB stream inconsistency (%s)."),
+ r->zstream.msg);
+ return -1;
+ }
+ /* The current ZLIB stream has ended: close it and open a fresh one
+ before looping around for more data. */
+ else if (!close_zstream (r) || !open_zstream (r))
+ return -1;
+ }
+ else
+ {
+ /* Process the output data and ignore 'error' for now. ZLIB will
+ present it to us again on the next inflate() call. */
}
- return 0;
}
}
-\f
-/* Returns true if an I/O error has occurred on READER, false
- otherwise. */
-bool
-sfm_read_error (const struct sfm_reader *reader)
+
+/* Reads BYTE_CNT bytes of case data from R into BUF.  With
+ ANY_COMP_SIMPLE compression the bytes come straight from read_bytes();
+ otherwise they are inflated by read_bytes_zlib(), and running out of
+ ZLIB data (a 0 return) is reported as an error.  Returns the value of
+ whichever reader was called. */
+static int
+read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
- return !reader->ok;
+ if (r->compression == ANY_COMP_SIMPLE)
+ return read_bytes (r, buf, byte_cnt);
+ else
+ {
+ int retval = read_bytes_zlib (r, buf, byte_cnt);
+ if (retval == 0)
+ sys_error (r, r->pos, _("Unexpected end of ZLIB compressed data."));
+ return retval;
+ }
}
-/* Returns true if FILE is an SPSS system file,
- false otherwise. */
-bool
-sfm_detect (FILE *file)
+/* Like read_compressed_bytes(), except that it calls try_read_bytes() for
+ ANY_COMP_SIMPLE data and does not itself report an error when
+ read_bytes_zlib() returns 0.  Returns the value of whichever reader was
+ called. */
+static int
+try_read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt)
{
- struct sysfile_header hdr;
+ if (r->compression == ANY_COMP_SIMPLE)
+ return try_read_bytes (r, buf, byte_cnt);
+ else
+ return read_bytes_zlib (r, buf, byte_cnt);
+}
- if (fread (&hdr, sizeof hdr, 1, file) != 1)
- return false;
- if (strncmp ("$FL2", hdr.rec_type, 4))
+/* Reads a 64-bit floating-point number from R's case data with
+ read_compressed_bytes() and stores its value, in host format, in *D.
+ Returns true if successful, false if the read hit end of data or
+ failed, in which case *D is left untouched. */
+static bool
+read_compressed_float (struct sfm_reader *r, double *d)
+{
+ uint8_t number[8];
+
+ /* read_compressed_bytes() can return a negative value on error as well
+ as 0, so testing it with "!" would treat an error as success and
+ decode an uninitialized buffer. */
+ if (read_compressed_bytes (r, number, sizeof number) <= 0)
return false;
- return true;
+
+ *d = float_get_double (r->float_format, number);
+ return true;
}
+\f
+/* Casereader class for system files.  The first two members are the
+ sys_file_casereader_read and sys_file_casereader_destroy functions; the
+ last two are left null.
+ NOTE(review): member order is positional; confirm it against the
+ definition of struct casereader_class. */
+static const struct casereader_class sys_file_casereader_class =
+ {
+ sys_file_casereader_read,
+ sys_file_casereader_destroy,
+ NULL,
+ NULL,
+ };
+/* The any_reader class for SPSS system files: a translatable name followed
+ by the sfm_detect, sfm_open, sfm_close, sfm_decode, and sfm_get_strings
+ functions.
+ NOTE(review): member order is positional; confirm it against the
+ definition of struct any_reader_class. */
+const struct any_reader_class sys_file_reader_class =
+ {
+ N_("SPSS System File"),
+ sfm_detect,
+ sfm_open,
+ sfm_close,
+ sfm_decode,
+ sfm_get_strings,
+ };