X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fdata%2Fsys-file-reader.c;h=b0a41a83573b0e986fb35bfb3bce46379282662f;hb=62b5101a28fc2c4a9b8b26a998fb6c4ec12d84c7;hp=401e3e27451e1d9c42b15e4eca19433d541439ef;hpb=8021cf8974a46fe82af7b8952e448c0ea6858a48;p=pspp diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index 401e3e2745..8abfe10b68 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -1,410 +1,1064 @@ -/* PSPP - computes sample statistics. - Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc. +/* PSPP - a program for statistical analysis. + Copyright (C) 1997-2000, 2006-2007, 2009-2016 Free Software Foundation, Inc. - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - 02110-1301, USA. */ + along with this program. If not, see . 
*/ #include -#include "sys-file-reader.h" -#include "sys-file-private.h" +#include "data/sys-file-private.h" #include #include #include -#include #include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "case.h" -#include "dictionary.h" -#include "file-handle-def.h" -#include "file-name.h" -#include "format.h" -#include "missing-values.h" -#include "value-labels.h" -#include "variable.h" -#include "value.h" - -#include "c-ctype.h" -#include "inttostr.h" -#include "minmax.h" -#include "unlocked-io.h" -#include "xsize.h" +#include +#include + +#include "data/any-reader.h" +#include "data/attributes.h" +#include "data/case.h" +#include "data/casereader-provider.h" +#include "data/casereader.h" +#include "data/dictionary.h" +#include "data/file-handle-def.h" +#include "data/file-name.h" +#include "data/format.h" +#include "data/identifier.h" +#include "data/missing-values.h" +#include "data/mrset.h" +#include "data/short-names.h" +#include "data/value-labels.h" +#include "data/value.h" +#include "data/variable.h" +#include "libpspp/array.h" +#include "libpspp/assertion.h" +#include "libpspp/compiler.h" +#include "libpspp/i18n.h" +#include "libpspp/ll.h" +#include "libpspp/message.h" +#include "libpspp/misc.h" +#include "libpspp/pool.h" +#include "libpspp/str.h" +#include "libpspp/stringi-set.h" + +#include "gl/c-strtod.h" +#include "gl/c-ctype.h" +#include "gl/inttostr.h" +#include "gl/localcharset.h" +#include "gl/minmax.h" +#include "gl/unlocked-io.h" +#include "gl/xalloc.h" +#include "gl/xalloc-oversized.h" +#include "gl/xsize.h" #include "gettext.h" #define _(msgid) gettext (msgid) #define N_(msgid) (msgid) +enum + { + /* subtypes 0-2 unknown */ + EXT_INTEGER = 3, /* Machine integer info. */ + EXT_FLOAT = 4, /* Machine floating-point info. */ + EXT_VAR_SETS = 5, /* Variable sets. */ + EXT_DATE = 6, /* DATE. */ + EXT_MRSETS = 7, /* Multiple response sets. */ + EXT_DATA_ENTRY = 8, /* SPSS Data Entry. 
*/ + /* subtype 9 unknown */ + EXT_PRODUCT_INFO = 10, /* Extra product info text. */ + EXT_DISPLAY = 11, /* Variable display parameters. */ + /* subtype 12 unknown */ + EXT_LONG_NAMES = 13, /* Long variable names. */ + EXT_LONG_STRINGS = 14, /* Long strings. */ + /* subtype 15 unknown */ + EXT_NCASES = 16, /* Extended number of cases. */ + EXT_FILE_ATTRS = 17, /* Data file attributes. */ + EXT_VAR_ATTRS = 18, /* Variable attributes. */ + EXT_MRSETS2 = 19, /* Multiple response sets (extended). */ + EXT_ENCODING = 20, /* Character encoding. */ + EXT_LONG_LABELS = 21, /* Value labels for long strings. */ + EXT_LONG_MISSING = 22, /* Missing values for long strings. */ + EXT_DATAVIEW = 24 /* "Format properties in dataview table". */ + }; + +/* Fields from the top-level header record. */ +struct sfm_header_record + { + char magic[5]; /* First 4 bytes of file, then null. */ + int weight_idx; /* 0 if unweighted, otherwise a var index. */ + int nominal_case_size; /* Number of var positions. */ + + /* These correspond to the members of struct any_file_info or a dictionary + but in the system file's encoding rather than ASCII. */ + char creation_date[10]; /* "dd mmm yy". */ + char creation_time[9]; /* "hh:mm:ss". */ + char eye_catcher[61]; /* Eye-catcher string, then product name. */ + char file_label[65]; /* File label. */ + }; + +struct sfm_var_record + { + off_t pos; + int width; + char name[9]; + int print_format; + int write_format; + int missing_value_code; + uint8_t missing[24]; + char *label; + struct variable *var; + }; + +struct sfm_value_label + { + uint8_t value[8]; + char *label; + }; + +struct sfm_value_label_record + { + off_t pos; + struct sfm_value_label *labels; + unsigned int n_labels; + + int *vars; + unsigned int n_vars; + }; + +struct sfm_document_record + { + off_t pos; + char *documents; + size_t n_lines; + }; + +struct sfm_mrset + { + const char *name; /* Name. */ + const char *label; /* Human-readable label for group. 
*/ + enum mrset_type type; /* Group type. */ + const char **vars; /* Constituent variables' names. */ + size_t n_vars; /* Number of constituent variables. */ + + /* MRSET_MD only. */ + enum mrset_md_cat_source cat_source; /* Source of category labels. */ + bool label_from_var_label; /* 'label' taken from variable label? */ + const char *counted; /* Counted value, as string. */ + }; + +struct sfm_extension_record + { + struct ll ll; /* In struct sfm_reader 'var_attrs' list. */ + int subtype; /* Record subtype. */ + off_t pos; /* Starting offset in file. */ + unsigned int size; /* Size of data elements. */ + unsigned int count; /* Number of data elements. */ + void *data; /* Contents. */ + }; + /* System file reader. */ struct sfm_reader { + struct any_reader any_reader; + /* Resource tracking. */ struct pool *pool; /* All system file state. */ - jmp_buf bail_out; /* longjmp() target for error handling. */ + + /* File data. */ + struct any_read_info info; + struct sfm_header_record header; + struct sfm_var_record *vars; + size_t n_vars; + struct sfm_value_label_record *labels; + size_t n_labels; + struct sfm_document_record *document; + struct sfm_mrset *mrsets; + size_t n_mrsets; + struct sfm_extension_record *extensions[32]; + struct ll_list var_attrs; /* Contains "struct sfm_extension_record"s. */ /* File state. */ struct file_handle *fh; /* File handle. */ + struct fh_lock *lock; /* Mutual exclusion for file handle. */ FILE *file; /* File stream. */ + off_t pos; /* Position in file. */ bool error; /* I/O or corruption error? */ + struct caseproto *proto; /* Format of output cases. */ /* File format. */ enum integer_format integer_format; /* On-disk integer format. */ enum float_format float_format; /* On-disk floating point format. */ - int value_cnt; /* Number of 8-byte units per case. */ - struct sfm_var *vars; /* Variables. */ - size_t var_cnt; /* Number of variables. 
*/ - bool has_long_var_names; /* File has a long variable name map */ - bool has_vls; /* File has one or more very long strings? */ + struct sfm_var *sfm_vars; /* Variables. */ + size_t sfm_var_cnt; /* Number of variables. */ + int case_cnt; /* Number of cases */ + const char *encoding; /* String encoding. */ + bool written_by_readstat; /* From https://github.com/WizardMac/ReadStat? */ /* Decompression. */ - bool compressed; /* File is compressed? */ + enum any_compression compression; double bias; /* Compression bias, usually 100.0. */ uint8_t opcodes[8]; /* Current block of opcodes. */ size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */ + bool corruption_warning; /* Warned about possible corruption? */ + + /* ZLIB decompression. */ + long long int ztrailer_ofs; /* Offset of ZLIB trailer at end of file. */ +#define ZIN_BUF_SIZE 4096 + uint8_t *zin_buf; /* Inflation input buffer. */ +#define ZOUT_BUF_SIZE 16384 + uint8_t *zout_buf; /* Inflation output buffer. */ + unsigned int zout_end; /* Number of bytes of data in zout_buf. */ + unsigned int zout_pos; /* First unconsumed byte in zout_buf. */ + z_stream zstream; /* ZLIB inflater. */ }; -/* A variable in a system file. */ -struct sfm_var - { - int width; /* 0=numeric, otherwise string width. */ - int case_index; /* Index into case. */ - }; +static const struct casereader_class sys_file_casereader_class; + +static struct sfm_reader * +sfm_reader_cast (const struct any_reader *r_) +{ + assert (r_->klass == &sys_file_reader_class); + return UP_CAST (r_, struct sfm_reader, any_reader); +} -static struct variable **make_var_by_value_idx (struct sfm_reader *, - struct dictionary *); -static struct variable *lookup_var_by_value_idx (struct sfm_reader *, - struct variable **, - int value_idx); - -static void sys_warn (struct sfm_reader *, const char *, ...) - PRINTF_FORMAT (2, 3); - -static void sys_error (struct sfm_reader *, const char *, ...) 
- PRINTF_FORMAT (2, 3) - NO_RETURN; - -static void read_bytes (struct sfm_reader *, void *, size_t); -static bool try_read_bytes (struct sfm_reader *, void *, size_t); -static int32_t read_int32 (struct sfm_reader *); -static double read_flt64 (struct sfm_reader *); -static void read_string (struct sfm_reader *, char *, size_t); -static void skip_bytes (struct sfm_reader *, size_t); - -static int32_t int32_to_native (const struct sfm_reader *, const uint8_t[4]); -static double flt64_to_double (const struct sfm_reader *, const uint8_t[8]); - -static struct variable_to_value_map *open_variable_to_value_map ( - struct sfm_reader *, size_t size); -static void close_variable_to_value_map (struct sfm_reader *r, - struct variable_to_value_map *); -static bool read_variable_to_value_map (struct sfm_reader *, - struct dictionary *, - struct variable_to_value_map *, - struct variable **var, char **value, - int *warning_cnt); +static bool sfm_close (struct any_reader *); + +static struct variable *lookup_var_by_index (struct sfm_reader *, off_t, + const struct sfm_var_record *, + size_t n, int idx); + +static void sys_msg (struct sfm_reader *r, off_t, int class, + const char *format, va_list args) + PRINTF_FORMAT (4, 0); +static void sys_warn (struct sfm_reader *, off_t, const char *, ...) + PRINTF_FORMAT (3, 4); +static void sys_error (struct sfm_reader *, off_t, const char *, ...) 
+ PRINTF_FORMAT (3, 4); + +static bool read_bytes (struct sfm_reader *, void *, size_t) + WARN_UNUSED_RESULT; +static int try_read_bytes (struct sfm_reader *, void *, size_t) + WARN_UNUSED_RESULT; +static bool read_int (struct sfm_reader *, int *) WARN_UNUSED_RESULT; +static bool read_uint (struct sfm_reader *, unsigned int *) WARN_UNUSED_RESULT; +static bool read_int64 (struct sfm_reader *, long long int *) + WARN_UNUSED_RESULT; +static bool read_uint64 (struct sfm_reader *, unsigned long long int *) + WARN_UNUSED_RESULT; +static bool read_string (struct sfm_reader *, char *, size_t) + WARN_UNUSED_RESULT; +static bool skip_bytes (struct sfm_reader *, size_t) WARN_UNUSED_RESULT; + +/* ZLIB compressed data handling. */ +static bool read_zheader (struct sfm_reader *) WARN_UNUSED_RESULT; +static bool open_zstream (struct sfm_reader *) WARN_UNUSED_RESULT; +static bool close_zstream (struct sfm_reader *) WARN_UNUSED_RESULT; +static int read_bytes_zlib (struct sfm_reader *, void *, size_t) + WARN_UNUSED_RESULT; +static int read_compressed_bytes (struct sfm_reader *, void *, size_t) + WARN_UNUSED_RESULT; +static int try_read_compressed_bytes (struct sfm_reader *, void *, size_t) + WARN_UNUSED_RESULT; +static bool read_compressed_float (struct sfm_reader *, double *) + WARN_UNUSED_RESULT; + +static char *fix_line_ends (const char *); + +static int parse_int (const struct sfm_reader *, const void *data, size_t ofs); +static double parse_float (const struct sfm_reader *, + const void *data, size_t ofs); + +static bool read_variable_record (struct sfm_reader *, + struct sfm_var_record *); +static bool read_value_label_record (struct sfm_reader *, + struct sfm_value_label_record *); +static bool read_document_record (struct sfm_reader *); +static bool read_extension_record (struct sfm_reader *, int subtype, + struct sfm_extension_record **); +static bool skip_extension_record (struct sfm_reader *, int subtype); + +static struct text_record *open_text_record ( + struct 
sfm_reader *, const struct sfm_extension_record *, + bool recode_to_utf8); +static void close_text_record (struct sfm_reader *, + struct text_record *); +static bool read_variable_to_value_pair (struct sfm_reader *, + struct dictionary *, + struct text_record *, + struct variable **var, char **value); +static void text_warn (struct sfm_reader *r, struct text_record *text, + const char *format, ...) PRINTF_FORMAT (3, 4); +static char *text_get_token (struct text_record *, + struct substring delimiters, char *delimiter); +static bool text_match (struct text_record *, char c); +static bool text_read_variable_name (struct sfm_reader *, struct dictionary *, + struct text_record *, + struct substring delimiters, + struct variable **); +static bool text_read_short_name (struct sfm_reader *, struct dictionary *, + struct text_record *, + struct substring delimiters, + struct variable **); +static const char *text_parse_counted_string (struct sfm_reader *, + struct text_record *); +static size_t text_pos (const struct text_record *); +static const char *text_get_all (const struct text_record *); /* Dictionary reader. 
*/ -enum which_format +enum which_format { PRINT_FORMAT, WRITE_FORMAT }; -static void read_header (struct sfm_reader *, struct dictionary *, - int *weight_idx, int *claimed_value_cnt, - struct sfm_read_info *); -static void read_variable_record (struct sfm_reader *, struct dictionary *, - int *format_warning_cnt); -static void parse_format_spec (struct sfm_reader *, uint32_t, - enum which_format, struct variable *, - int *format_warning_cnt); -static void setup_weight (struct sfm_reader *, int weight_idx, - struct variable **var_by_value_idx, - struct dictionary *); -static void read_documents (struct sfm_reader *, struct dictionary *); -static void read_value_labels (struct sfm_reader *, struct dictionary *, - struct variable **var_by_value_idx); - -static void read_extension_record (struct sfm_reader *, struct dictionary *); -static void read_machine_int32_info (struct sfm_reader *, - size_t size, size_t count); -static void read_machine_flt64_info (struct sfm_reader *, - size_t size, size_t count); -static void read_display_parameters (struct sfm_reader *, - size_t size, size_t count, +static bool read_dictionary (struct sfm_reader *); +static bool read_record (struct sfm_reader *, int type, + size_t *allocated_vars, size_t *allocated_labels); +static bool read_header (struct sfm_reader *, struct any_read_info *, + struct sfm_header_record *); +static void parse_header (struct sfm_reader *, + const struct sfm_header_record *, + struct any_read_info *, struct dictionary *); +static bool parse_variable_records (struct sfm_reader *, struct dictionary *, + struct sfm_var_record *, size_t n); +static void parse_format_spec (struct sfm_reader *, off_t pos, + unsigned int format, enum which_format, + struct variable *, int *format_warning_cnt); +static void parse_document (struct dictionary *, struct sfm_document_record *); +static void parse_display_parameters (struct sfm_reader *, + const struct sfm_extension_record *, + struct dictionary *); +static bool 
parse_machine_integer_info (struct sfm_reader *, + const struct sfm_extension_record *, + struct any_read_info *); +static void parse_machine_float_info (struct sfm_reader *, + const struct sfm_extension_record *); +static void parse_extra_product_info (struct sfm_reader *, + const struct sfm_extension_record *, + struct any_read_info *); +static void parse_mrsets (struct sfm_reader *, + const struct sfm_extension_record *, + size_t *allocated_mrsets); +static void decode_mrsets (struct sfm_reader *, struct dictionary *); +static void parse_long_var_name_map (struct sfm_reader *, + const struct sfm_extension_record *, struct dictionary *); -static void read_long_var_name_map (struct sfm_reader *, - size_t size, size_t count, - struct dictionary *); -static void read_long_string_map (struct sfm_reader *, - size_t size, size_t count, - struct dictionary *); - - -/* Opens the system file designated by file handle FH for - reading. Reads the system file's dictionary into *DICT. - If INFO is non-null, then it receives additional info about the - system file. 
*/ -struct sfm_reader * -sfm_open_reader (struct file_handle *fh, struct dictionary **dict, - struct sfm_read_info *info) -{ - struct sfm_reader *volatile r = NULL; - struct variable **var_by_value_idx; - int format_warning_cnt = 0; - int weight_idx; - int claimed_value_cnt; - int rec_type; - size_t i; - - if (!fh_open (fh, FH_REF_FILE, "system file", "rs")) - return NULL; +static bool parse_long_string_map (struct sfm_reader *, + const struct sfm_extension_record *, + struct dictionary *); +static bool parse_value_labels (struct sfm_reader *, struct dictionary *, + const struct sfm_var_record *, + size_t n_var_recs, + const struct sfm_value_label_record *); +static void parse_data_file_attributes (struct sfm_reader *, + const struct sfm_extension_record *, + struct dictionary *); +static void parse_variable_attributes (struct sfm_reader *, + const struct sfm_extension_record *, + struct dictionary *); +static void assign_variable_roles (struct sfm_reader *, struct dictionary *); +static void parse_long_string_value_labels (struct sfm_reader *, + const struct sfm_extension_record *, + struct dictionary *); +static void parse_long_string_missing_values ( + struct sfm_reader *, const struct sfm_extension_record *, + struct dictionary *); + +/* Frees the strings inside INFO. */ +void +any_read_info_destroy (struct any_read_info *info) +{ + if (info) + { + free (info->creation_date); + free (info->creation_time); + free (info->product); + free (info->product_ext); + } +} - *dict = dict_create (); +/* Tries to open FH for reading as a system file. Returns an sfm_reader if + successful, otherwise NULL. */ +static struct any_reader * +sfm_open (struct file_handle *fh) +{ + size_t allocated_mrsets = 0; + struct sfm_reader *r; /* Create and initialize reader. 
*/ - r = pool_create_container (struct sfm_reader, pool); - r->fh = fh; - r->file = fn_open (fh_get_file_name (fh), "rb"); - r->error = false; - r->value_cnt = 0; - r->has_vls = false; - r->has_long_var_names = false; + r = xzalloc (sizeof *r); + r->any_reader.klass = &sys_file_reader_class; + r->pool = pool_create (); + pool_register (r->pool, free, r); + r->fh = fh_ref (fh); r->opcode_idx = sizeof r->opcodes; + ll_init (&r->var_attrs); - if (setjmp (r->bail_out)) - { - sfm_close_reader (r); - dict_destroy (*dict); - *dict = NULL; - return NULL; - } + /* TRANSLATORS: this fragment will be interpolated into + messages in fh_lock() that identify types of files. */ + r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false); + if (r->lock == NULL) + goto error; + r->file = fn_open (fh, "rb"); if (r->file == NULL) { - msg (ME, _("Error opening \"%s\" for reading as a system file: %s."), + msg (ME, _("Error opening `%s' for reading as a system file: %s."), fh_get_file_name (r->fh), strerror (errno)); - longjmp (r->bail_out, 1); + goto error; + } + + if (!read_dictionary (r)) + goto error; + + if (r->extensions[EXT_MRSETS] != NULL) + parse_mrsets (r, r->extensions[EXT_MRSETS], &allocated_mrsets); + + if (r->extensions[EXT_MRSETS2] != NULL) + parse_mrsets (r, r->extensions[EXT_MRSETS2], &allocated_mrsets); + + return &r->any_reader; + +error: + if (r) + sfm_close (&r->any_reader); + return NULL; +} + +static bool +read_dictionary (struct sfm_reader *r) +{ + size_t allocated_vars; + size_t allocated_labels; + + if (!read_header (r, &r->info, &r->header)) + return false; + + allocated_vars = 0; + allocated_labels = 0; + for (;;) + { + int type; + + if (!read_int (r, &type)) + return false; + if (type == 999) + break; + if (!read_record (r, type, &allocated_vars, &allocated_labels)) + return false; } - /* Read header. 
*/ - read_header (r, *dict, &weight_idx, &claimed_value_cnt, info); + if (!skip_bytes (r, 4)) + return false; + + if (r->compression == ANY_COMP_ZLIB && !read_zheader (r)) + return false; + + return true; +} + +static bool +read_record (struct sfm_reader *r, int type, + size_t *allocated_vars, size_t *allocated_labels) +{ + int subtype; - /* Read all the variable definition records. */ - rec_type = read_int32 (r); - while (rec_type == 2) + switch (type) { - read_variable_record (r, *dict, &format_warning_cnt); - rec_type = read_int32 (r); + case 2: + if (r->n_vars >= *allocated_vars) + r->vars = pool_2nrealloc (r->pool, r->vars, allocated_vars, + sizeof *r->vars); + return read_variable_record (r, &r->vars[r->n_vars++]); + + case 3: + if (r->n_labels >= *allocated_labels) + r->labels = pool_2nrealloc (r->pool, r->labels, allocated_labels, + sizeof *r->labels); + return read_value_label_record (r, &r->labels[r->n_labels++]); + + case 4: + /* A Type 4 record is always immediately after a type 3 record, + so the code for type 3 records reads the type 4 record too. */ + sys_error (r, r->pos, _("Misplaced type 4 record.")); + return false; + + case 6: + if (r->document != NULL) + { + sys_error (r, r->pos, _("Duplicate type 6 (document) record.")); + return false; + } + return read_document_record (r); + + case 7: + if (!read_int (r, &subtype)) + return false; + else if (subtype < 0 + || subtype >= sizeof r->extensions / sizeof *r->extensions) + { + sys_warn (r, r->pos, + _("Unrecognized record type 7, subtype %d. For help, " + "please send this file to %s and mention that you were " + "using %s."), + subtype, PACKAGE_BUGREPORT, PACKAGE_STRING); + return skip_extension_record (r, subtype); + } + else if (subtype == 18) + { + /* System files written by "Stata 14.1/-savespss- 1.77 by S.Radyakin" + put each variable attribute into a separate record with subtype + 18. I'm surprised that SPSS puts up with this. 
*/ + struct sfm_extension_record *ext; + bool ok = read_extension_record (r, subtype, &ext); + if (ok && ext) + ll_push_tail (&r->var_attrs, &ext->ll); + return ok; + } + else if (r->extensions[subtype] != NULL) + { + sys_warn (r, r->pos, + _("Record type 7, subtype %d found here has the same " + "type as the record found near offset 0x%llx. For " + "help, please send this file to %s and mention that " + "you were using %s."), + subtype, (long long int) r->extensions[subtype]->pos, + PACKAGE_BUGREPORT, PACKAGE_STRING); + return skip_extension_record (r, subtype); + } + else + return read_extension_record (r, subtype, &r->extensions[subtype]); + + default: + sys_error (r, r->pos, _("Unrecognized record type %d."), type); + return false; } - /* Figure out the case format. */ - var_by_value_idx = make_var_by_value_idx (r, *dict); - setup_weight (r, weight_idx, var_by_value_idx, *dict); + NOT_REACHED (); +} + +/* Returns the character encoding obtained from R, or a null pointer if R + doesn't have an indication of its character encoding. */ +static const char * +sfm_get_encoding (const struct sfm_reader *r) +{ + /* The EXT_ENCODING record is the best way to determine dictionary + encoding. */ + if (r->extensions[EXT_ENCODING]) + return r->extensions[EXT_ENCODING]->data; - /* Read all the rest of the dictionary records. */ - while (rec_type != 999) + /* But EXT_INTEGER is better than nothing as a fallback. */ + if (r->extensions[EXT_INTEGER]) { - switch (rec_type) + int codepage = parse_int (r, r->extensions[EXT_INTEGER]->data, 7 * 4); + const char *encoding; + + switch (codepage) { + case 1: + return "EBCDIC-US"; + + case 2: case 3: - read_value_labels (r, *dict, var_by_value_idx); + /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic] + respectively. However, many files have character code 2 but data + which are clearly not ASCII. Therefore, ignore these values. 
*/ break; case 4: - sys_error (r, _("Misplaced type 4 record.")); + return "MS_KANJI"; - case 6: - read_documents (r, *dict); + default: + encoding = sys_get_encoding_from_codepage (codepage); + if (encoding != NULL) + return encoding; break; + } + } - case 7: - read_extension_record (r, *dict); - break; + /* If the file magic number is EBCDIC then its character data is too. */ + if (!strcmp (r->header.magic, EBCDIC_MAGIC)) + return "EBCDIC-US"; - default: - sys_error (r, _("Unrecognized record type %d."), rec_type); + return NULL; +} + +struct get_strings_aux + { + struct pool *pool; + char **titles; + char **strings; + bool *ids; + size_t allocated; + size_t n; + }; + +static void +add_string__ (struct get_strings_aux *aux, + const char *string, bool id, char *title) +{ + if (aux->n >= aux->allocated) + { + aux->allocated = 2 * (aux->allocated + 1); + aux->titles = pool_realloc (aux->pool, aux->titles, + aux->allocated * sizeof *aux->titles); + aux->strings = pool_realloc (aux->pool, aux->strings, + aux->allocated * sizeof *aux->strings); + aux->ids = pool_realloc (aux->pool, aux->ids, + aux->allocated * sizeof *aux->ids); + } + + aux->titles[aux->n] = title; + aux->strings[aux->n] = pool_strdup (aux->pool, string); + aux->ids[aux->n] = id; + aux->n++; +} + +static void PRINTF_FORMAT (3, 4) +add_string (struct get_strings_aux *aux, + const char *string, const char *title, ...) +{ + va_list args; + + va_start (args, title); + add_string__ (aux, string, false, pool_vasprintf (aux->pool, title, args)); + va_end (args); +} + +static void PRINTF_FORMAT (3, 4) +add_id (struct get_strings_aux *aux, const char *id, const char *title, ...) +{ + va_list args; + + va_start (args, title); + add_string__ (aux, id, true, pool_vasprintf (aux->pool, title, args)); + va_end (args); +} + +/* Retrieves significant string data from R in its raw format, to allow the + caller to try to detect the encoding in use. + + Returns the number of strings retrieved N. 
Sets each of *TITLESP, *IDSP, + and *STRINGSP to an array of N elements allocated from POOL. For each I in + 0...N-1, UTF-8 string *TITLESP[I] describes *STRINGSP[I], which is in + whatever encoding system file R uses. *IDSP[I] is true if *STRINGSP[I] must + be a valid PSPP language identifier, false if *STRINGSP[I] is free-form + text. */ +static size_t +sfm_get_strings (const struct any_reader *r_, struct pool *pool, + char ***titlesp, bool **idsp, char ***stringsp) +{ + struct sfm_reader *r = sfm_reader_cast (r_); + const struct sfm_mrset *mrset; + struct get_strings_aux aux; + size_t var_idx; + size_t i, j, k; + + aux.pool = pool; + aux.titles = NULL; + aux.strings = NULL; + aux.ids = NULL; + aux.allocated = 0; + aux.n = 0; + + var_idx = 0; + for (i = 0; i < r->n_vars; i++) + if (r->vars[i].width != -1) + add_id (&aux, r->vars[i].name, _("Variable %zu"), ++var_idx); + + var_idx = 0; + for (i = 0; i < r->n_vars; i++) + if (r->vars[i].width != -1) + { + var_idx++; + if (r->vars[i].label) + add_string (&aux, r->vars[i].label, _("Variable %zu Label"), + var_idx); + } + + k = 0; + for (i = 0; i < r->n_labels; i++) + for (j = 0; j < r->labels[i].n_labels; j++) + add_string (&aux, r->labels[i].labels[j].label, + _("Value Label %zu"), k++); + + add_string (&aux, r->header.creation_date, _("Creation Date")); + add_string (&aux, r->header.creation_time, _("Creation Time")); + add_string (&aux, r->header.eye_catcher, _("Product")); + add_string (&aux, r->header.file_label, _("File Label")); + + if (r->extensions[EXT_PRODUCT_INFO]) + add_string (&aux, r->extensions[EXT_PRODUCT_INFO]->data, + _("Extra Product Info")); + + if (r->document) + { + size_t i; + + for (i = 0; i < r->document->n_lines; i++) + { + char line[81]; + + memcpy (line, r->document->documents + i * 80, 80); + line[80] = '\0'; + + add_string (&aux, line, _("Document Line %zu"), i + 1); } - rec_type = read_int32 (r); } + for (mrset = r->mrsets; mrset < &r->mrsets[r->n_mrsets]; mrset++) + { + size_t mrset_idx
= mrset - r->mrsets + 1; + + add_id (&aux, mrset->name, _("MRSET %zu"), mrset_idx); + if (mrset->label[0]) + add_string (&aux, mrset->label, _("MRSET %zu Label"), mrset_idx); + + /* Skip the variables because they ought to be duplicates. */ + + if (mrset->counted) + add_string (&aux, mrset->counted, _("MRSET %zu Counted Value"), + mrset_idx); + } + + /* data file attributes */ + /* variable attributes */ + /* long var map */ + /* long string value labels */ + /* long string missing values */ + + *titlesp = aux.titles; + *idsp = aux.ids; + *stringsp = aux.strings; + return aux.n; +} + +/* Decodes the dictionary read from R, saving it into *DICT. Character + strings in R are decoded using ENCODING, or an encoding obtained from R if + ENCODING is null, or the locale encoding if R specifies no encoding. + + If INFOP is non-null, then it receives additional info about the system + file, which the caller must eventually free with any_read_info_destroy() + when it is no longer needed. + + This function consumes R. The caller must not use it again later, even to + destroy it with sfm_close(). */ +static struct casereader * +sfm_decode (struct any_reader *r_, const char *encoding, + struct dictionary **dictp, struct any_read_info *infop) +{ + struct sfm_reader *r = sfm_reader_cast (r_); + struct dictionary *dict; + size_t i; - if ( ! r->has_long_var_names ) + if (encoding == NULL) { - int i; - for (i = 0; i < dict_get_var_cnt (*dict); i++) - { - struct variable *var = dict_get_var (*dict, i); - char short_name [SHORT_NAME_LEN + 1]; - char long_name [SHORT_NAME_LEN + 1]; + encoding = sfm_get_encoding (r); + if (encoding == NULL) + { + sys_warn (r, -1, _("This system file does not indicate its own " + "character encoding. Using default encoding " + "%s. For best results, specify an encoding " + "explicitly.
Use SYSFILE INFO with " + "ENCODING=\"DETECT\" to analyze the possible " + "encodings."), + locale_charset ()); + encoding = locale_charset (); + } + } - strcpy (short_name, var_get_name (var)); + dict = dict_create (encoding); + r->encoding = dict_get_encoding (dict); - strcpy (long_name, short_name); - str_lowercase (long_name); + /* These records don't use variables at all. */ + if (r->document != NULL) + parse_document (dict, r->document); - /* Set long name. Renaming a variable may clear the short - name, but we want to retain it, so re-set it - explicitly. */ - dict_rename_var (*dict, var, long_name); - var_set_short_name (var, short_name); - } + if (r->extensions[EXT_INTEGER] != NULL + && !parse_machine_integer_info (r, r->extensions[EXT_INTEGER], &r->info)) + goto error; + + if (r->extensions[EXT_FLOAT] != NULL) + parse_machine_float_info (r, r->extensions[EXT_FLOAT]); + + if (r->extensions[EXT_PRODUCT_INFO] != NULL) + parse_extra_product_info (r, r->extensions[EXT_PRODUCT_INFO], &r->info); + + if (r->extensions[EXT_FILE_ATTRS] != NULL) + parse_data_file_attributes (r, r->extensions[EXT_FILE_ATTRS], dict); + + parse_header (r, &r->header, &r->info, dict); - r->has_long_var_names = true; + /* Parse the variable records, the basis of almost everything else. */ + if (!parse_variable_records (r, dict, r->vars, r->n_vars)) + goto error; + + /* Parse value labels and the weight variable immediately after the variable + records. These records use indexes into var_recs[], so we must parse them + before those indexes become invalidated by very long string variables. 
*/ + for (i = 0; i < r->n_labels; i++) + if (!parse_value_labels (r, dict, r->vars, r->n_vars, &r->labels[i])) + goto error; + if (r->header.weight_idx != 0) + { + struct variable *weight_var; + + weight_var = lookup_var_by_index (r, 76, r->vars, r->n_vars, + r->header.weight_idx); + if (weight_var != NULL) + { + if (var_is_numeric (weight_var)) + dict_set_weight (dict, weight_var); + else + sys_warn (r, -1, _("Ignoring string variable `%s' set " + "as weighting variable."), + var_get_name (weight_var)); + } } - /* Read record 999 data, which is just filler. */ - read_int32 (r); + if (r->extensions[EXT_DISPLAY] != NULL) + parse_display_parameters (r, r->extensions[EXT_DISPLAY], dict); + + /* The following records use short names, so they need to be parsed before + parse_long_var_name_map() changes short names to long names. */ + decode_mrsets (r, dict); + + if (r->extensions[EXT_LONG_STRINGS] != NULL + && !parse_long_string_map (r, r->extensions[EXT_LONG_STRINGS], dict)) + goto error; - if (claimed_value_cnt != -1 && claimed_value_cnt != r->value_cnt) - sys_warn (r, _("File header claims %d variable positions but " - "%d were read from file."), - claimed_value_cnt, r->value_cnt); + /* Now rename variables to their long names. */ + parse_long_var_name_map (r, r->extensions[EXT_LONG_NAMES], dict); + + /* The following records use long names, so they need to follow renaming. */ + if (!ll_is_empty (&r->var_attrs)) + { + struct sfm_extension_record *ext; + ll_for_each (ext, struct sfm_extension_record, ll, &r->var_attrs) + parse_variable_attributes (r, ext, dict); + + /* Roles use the $@Role attribute. 
*/ + assign_variable_roles (r, dict); + } + if (r->extensions[EXT_LONG_LABELS] != NULL) + parse_long_string_value_labels (r, r->extensions[EXT_LONG_LABELS], dict); + if (r->extensions[EXT_LONG_MISSING] != NULL) + parse_long_string_missing_values (r, r->extensions[EXT_LONG_MISSING], + dict); + + /* Warn if the actual amount of data per case differs from the + amount that the header claims. SPSS version 13 gets this + wrong when very long strings are involved, so don't warn in + that case. */ + if (r->header.nominal_case_size > 0 + && r->header.nominal_case_size != r->n_vars + && r->info.version_major != 13) + sys_warn (r, -1, _("File header claims %d variable positions but " + "%zu were read from file."), + r->header.nominal_case_size, r->n_vars); /* Create an index of dictionary variable widths for sfm_read_case to use. We cannot use the `struct variable's from the dictionary we created, because the caller owns the dictionary and may destroy or modify its variables. */ - r->var_cnt = dict_get_var_cnt (*dict); - r->vars = pool_nalloc (r->pool, r->var_cnt, sizeof *r->vars); - for (i = 0; i < r->var_cnt; i++) + sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt); + pool_register (r->pool, free, r->sfm_vars); + r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool); + + *dictp = dict; + if (infop) { - struct variable *v = dict_get_var (*dict, i); - struct sfm_var *sv = &r->vars[i]; - sv->width = var_get_width (v); - sv->case_index = var_get_case_index (v); + *infop = r->info; + memset (&r->info, 0, sizeof r->info); } - pool_free (r->pool, var_by_value_idx); - return r; + return casereader_create_sequential + (NULL, r->proto, + r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt, + &sys_file_casereader_class, r); + +error: + sfm_close (r_); + dict_destroy (dict); + *dictp = NULL; + return NULL; } -/* Closes a system file after we're done with it. 
*/ -void -sfm_close_reader (struct sfm_reader *r) +/* Closes R, which should have been returned by sfm_open() but not already + closed with sfm_decode() or this function. + Returns true if successful, false if an I/O error has occurred + on R. */ +static bool +sfm_close (struct any_reader *r_) { - if (r == NULL) - return; + struct sfm_reader *r = sfm_reader_cast (r_); + bool error; if (r->file) { - if (fn_close (fh_get_file_name (r->fh), r->file) == EOF) - msg (ME, _("Error closing system file \"%s\": %s."), - fh_get_file_name (r->fh), strerror (errno)); + if (fn_close (r->fh, r->file) == EOF) + { + msg (ME, _("Error closing system file `%s': %s."), + fh_get_file_name (r->fh), strerror (errno)); + r->error = true; + } r->file = NULL; } - if (r->fh != NULL) - fh_close (r->fh, "system file", "rs"); + any_read_info_destroy (&r->info); + fh_unlock (r->lock); + fh_unref (r->fh); + error = r->error; pool_destroy (r->pool); + + return !error; } -/* Returns true if an I/O error has occurred on READER, false - otherwise. */ -bool -sfm_read_error (const struct sfm_reader *reader) +/* Destroys READER. */ +static void +sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_) { - return reader->error; + struct sfm_reader *r = r_; + sfm_close (&r->any_reader); } -/* Returns true if FILE is an SPSS system file, - false otherwise. */ -bool -sfm_detect (FILE *file) +/* Detects whether FILE is an SPSS system file. Returns 1 if so, 0 if not, and + a negative errno value if there is an error reading FILE. */ +static int +sfm_detect (FILE *file) { - char rec_type[5]; + char magic[5]; - if (fread (rec_type, 4, 1, file) != 1) - return false; - rec_type[4] = '\0'; - - return !strcmp ("$FL2", rec_type); + if (fseek (file, 0, SEEK_SET) != 0) + return -errno; + if (fread (magic, 4, 1, file) != 1) + return ferror (file) ? 
-errno : 0; + magic[4] = '\0'; + + return (!strcmp (ASCII_MAGIC, magic) + || !strcmp (ASCII_ZMAGIC, magic) + || !strcmp (EBCDIC_MAGIC, magic)); } -/* Reads the global header of the system file. - Sets DICT's file label to the system file's label. - Sets *WEIGHT_IDX to 0 if the system file is unweighted, - or to the value index of the weight variable otherwise. - Sets *CLAIMED_VALUE_CNT to the number of values that the file - claims to have (although it is not always correct). - If INFO is non-null, initializes *INFO with header - information. */ -static void -read_header (struct sfm_reader *r, struct dictionary *dict, - int *weight_idx, int *claimed_value_cnt, - struct sfm_read_info *info) +/* Reads the global header of the system file. Initializes *HEADER and *INFO, + except for the string fields in *INFO, which parse_header() will initialize + later once the file's encoding is known. */ +static bool +read_header (struct sfm_reader *r, struct any_read_info *info, + struct sfm_header_record *header) { - char rec_type[5]; - char eye_catcher[61]; uint8_t raw_layout_code[4]; - int case_cnt; uint8_t raw_bias[8]; - char creation_date[10]; - char creation_time[9]; - char file_label[65]; - struct substring file_label_ss; + int compressed; + bool zmagic; - read_string (r, rec_type, sizeof rec_type); - read_string (r, eye_catcher, sizeof eye_catcher); - - if (strcmp ("$FL2", rec_type) != 0) - sys_error (r, _("This is not an SPSS system file.")); + if (!read_string (r, header->magic, sizeof header->magic) + || !read_string (r, header->eye_catcher, sizeof header->eye_catcher)) + return false; + r->written_by_readstat = strstr (header->eye_catcher, + "https://github.com/WizardMac/ReadStat"); + + if (!strcmp (ASCII_MAGIC, header->magic) + || !strcmp (EBCDIC_MAGIC, header->magic)) + zmagic = false; + else if (!strcmp (ASCII_ZMAGIC, header->magic)) + zmagic = true; + else + { + sys_error (r, 0, _("This is not an SPSS system file.")); + return false; + } /* Identify integer 
format. */ - read_bytes (r, raw_layout_code, sizeof raw_layout_code); + if (!read_bytes (r, raw_layout_code, sizeof raw_layout_code)) + return false; if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code, &r->integer_format) && !integer_identify (3, raw_layout_code, sizeof raw_layout_code, &r->integer_format)) || (r->integer_format != INTEGER_MSB_FIRST && r->integer_format != INTEGER_LSB_FIRST)) - sys_error (r, _("This is not an SPSS system file.")); + { + sys_error (r, 64, _("This is not an SPSS system file.")); + return false; + } + + if (!read_int (r, &header->nominal_case_size)) + return false; - *claimed_value_cnt = read_int32 (r); - if (*claimed_value_cnt < 0 || *claimed_value_cnt > INT_MAX / 16) - *claimed_value_cnt = -1; + if (header->nominal_case_size < 0 + || header->nominal_case_size > INT_MAX / 16) + header->nominal_case_size = -1; - r->compressed = read_int32 (r) != 0; + if (!read_int (r, &compressed)) + return false; + if (!zmagic) + { + if (compressed == 0) + r->compression = ANY_COMP_NONE; + else if (compressed == 1) + r->compression = ANY_COMP_SIMPLE; + else if (compressed != 0) + { + sys_error (r, 0, "System file header has invalid compression " + "value %d.", compressed); + return false; + } + } + else + { + if (compressed == 2) + r->compression = ANY_COMP_ZLIB; + else + { + sys_error (r, 0, "ZLIB-compressed system file header has invalid " + "compression value %d.", compressed); + return false; + } + } - *weight_idx = read_int32 (r); + if (!read_int (r, &header->weight_idx)) + return false; - case_cnt = read_int32 (r); - if (case_cnt < -1 || case_cnt > INT_MAX / 2) - case_cnt = -1; + if (!read_int (r, &r->case_cnt)) + return false; + if ( r->case_cnt > INT_MAX / 2) + r->case_cnt = -1; /* Identify floating-point format and obtain compression bias. 
*/ - read_bytes (r, raw_bias, sizeof raw_bias); + if (!read_bytes (r, raw_bias, sizeof raw_bias)) + return false; if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0) { - sys_warn (r, _("Compression bias (%g) is not the usual " - "value of 100, or system file uses unrecognized " - "floating-point format."), - r->bias); + uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + + if (memcmp (raw_bias, zero_bias, 8)) + sys_warn (r, r->pos - 8, + _("Compression bias is not the usual " + "value of 100, or system file uses unrecognized " + "floating-point format.")); + else + { + /* Some software is known to write all-zeros to this + field. Such software also writes floating-point + numbers in the format that we expect by default + (it seems that all software most likely does, in + reality), so don't warn in this case. */ + } + if (r->integer_format == INTEGER_MSB_FIRST) r->float_format = FLOAT_IEEE_DOUBLE_BE; else @@ -412,365 +1066,564 @@ read_header (struct sfm_reader *r, struct dictionary *dict, } float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias); - read_string (r, creation_date, sizeof creation_date); - read_string (r, creation_time, sizeof creation_time); - read_string (r, file_label, sizeof file_label); - skip_bytes (r, 3); - - file_label_ss = ss_cstr (file_label); - ss_trim (&file_label_ss, ss_cstr (" ")); - if (!ss_is_empty (file_label_ss)) - { - ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0'; - dict_set_label (dict, ss_data (file_label_ss)); - } - - if (info) - { - struct substring product; + if (!read_string (r, header->creation_date, sizeof header->creation_date) + || !read_string (r, header->creation_time, sizeof header->creation_time) + || !read_string (r, header->file_label, sizeof header->file_label) + || !skip_bytes (r, 3)) + return false; - strcpy (info->creation_date, creation_date); - strcpy (info->creation_time, creation_time); - info->integer_format = r->integer_format; - info->float_format = 
r->float_format; - info->compressed = r->compressed; - info->case_cnt = case_cnt; + info->integer_format = r->integer_format; + info->float_format = r->float_format; + info->compression = r->compression; + info->case_cnt = r->case_cnt; - product = ss_cstr (eye_catcher); - ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE")); - ss_trim (&product, ss_cstr (" ")); - str_copy_buf_trunc (info->product, sizeof info->product, - ss_data (product), ss_length (product)); - } + return true; } -/* Reads a variable (type 2) record from R and adds the - corresponding variable to DICT. - Also skips past additional variable records for long string - variables. */ -static void -read_variable_record (struct sfm_reader *r, struct dictionary *dict, - int *format_warning_cnt) +/* Reads a variable (type 2) record from R into RECORD. */ +static bool +read_variable_record (struct sfm_reader *r, struct sfm_var_record *record) { - int width; int has_variable_label; - int missing_value_code; - int print_format; - int write_format; - char name[9]; - struct variable *var; - int nv; - - width = read_int32 (r); - has_variable_label = read_int32 (r); - missing_value_code = read_int32 (r); - print_format = read_int32 (r); - write_format = read_int32 (r); - read_string (r, name, sizeof name); - name[strcspn (name, " ")] = '\0'; - - /* Check variable name. */ - if (name[0] == '$' || name[0] == '#') - sys_error (r, "Variable name begins with invalid character `%c'.", - name[0]); - if (!var_is_plausible_name (name, false)) - sys_error (r, _("Invalid variable name `%s'."), name); - - /* Create variable. */ - if (width < 0 || width > 255) - sys_error (r, _("Bad variable width %d."), width); - var = dict_create_var (dict, name, width); - if (var == NULL) - sys_error (r, - _("Duplicate variable name `%s' within system file."), - name); - - /* Set the short name the same as the long name */ - var_set_short_name (var, var_get_name (var)); - - /* Get variable label, if any. 
*/ - if (has_variable_label != 0 && has_variable_label != 1) - sys_error (r, _("Variable label indicator field is not 0 or 1.")); + memset (record, 0, sizeof *record); + + record->pos = r->pos; + if (!read_int (r, &record->width) + || !read_int (r, &has_variable_label) + || !read_int (r, &record->missing_value_code) + || !read_int (r, &record->print_format) + || !read_int (r, &record->write_format) + || !read_string (r, record->name, sizeof record->name)) + return false; + if (has_variable_label == 1) { - size_t len; - char label[255 + 1]; + enum { MAX_LABEL_LEN = 65536 }; + unsigned int len, read_len; + + if (!read_uint (r, &len)) + return false; + + /* Read up to MAX_LABEL_LEN bytes of label. */ + read_len = MIN (MAX_LABEL_LEN, len); + record->label = pool_malloc (r->pool, read_len + 1); + if (!read_string (r, record->label, read_len + 1)) + return false; - len = read_int32 (r); - if (len >= sizeof label) - sys_error (r, _("Variable %s has label of invalid length %u."), - name, (unsigned int) len); - read_string (r, label, len + 1); - var_set_label (var, label); - - skip_bytes (r, ROUND_UP (len, 4) - len); + /* Skip unread label bytes. */ + if (!skip_bytes (r, len - read_len)) + return false; + + /* Skip label padding up to multiple of 4 bytes. */ + if (!skip_bytes (r, ROUND_UP (len, 4) - len)) + return false; + } + else if (has_variable_label != 0) + { + sys_error (r, record->pos, + _("Variable label indicator field is not 0 or 1.")); + return false; } /* Set missing values. 
*/ - if (missing_value_code < -3 || missing_value_code > 3 - || missing_value_code == -1) - sys_error (r, _("Missing value indicator field is not " - "-3, -2, 0, 1, 2, or 3.")); - if (missing_value_code != 0) + if (record->missing_value_code != 0) { - struct missing_values mv; - mv_init (&mv, var_get_width (var)); - if (var_is_numeric (var)) + int code = record->missing_value_code; + if (record->width == 0) { - if (missing_value_code > 0) - { - int i; - for (i = 0; i < missing_value_code; i++) - mv_add_num (&mv, read_flt64 (r)); - } - else + if (code < -3 || code > 3 || code == -1) { - double low = read_flt64 (r); - double high = read_flt64 (r); - mv_add_num_range (&mv, low, high); - if (missing_value_code == -3) - mv_add_num (&mv, read_flt64 (r)); + sys_error (r, record->pos, + _("Numeric missing value indicator field is not " + "-3, -2, 0, 1, 2, or 3.")); + return false; } } - else if (var_get_width (var) <= MAX_SHORT_STRING) + else { - if (missing_value_code > 0) + if (code < 1 || code > 3) { - int i; - for (i = 0; i < missing_value_code; i++) - { - char string[9]; - read_string (r, string, sizeof string); - mv_add_str (&mv, string); - } + sys_error (r, record->pos, + _("String missing value indicator field is not " + "0, 1, 2, or 3.")); + return false; } - else - sys_error (r, _("String variable %s may not have missing " - "values specified as a range."), - name); } - else /* var->width > MAX_SHORT_STRING */ - sys_error (r, _("Long string variable %s may not have missing " - "values."), - name); - var_set_missing_values (var, &mv); - } - /* Set formats. */ - parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt); - parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt); + if (!read_bytes (r, record->missing, 8 * abs (code))) + return false; + } - /* Account for values. - Skip long string continuation records, if any. */ - nv = width == 0 ? 
1 : DIV_RND_UP (width, 8); - r->value_cnt += nv; - if (width > 8) + return true; +} + +/* Reads value labels from R into RECORD. */ +static bool +read_value_label_record (struct sfm_reader *r, + struct sfm_value_label_record *record) +{ + size_t i; + int type; + + /* Read type 3 record. */ + record->pos = r->pos; + if (!read_uint (r, &record->n_labels)) + return false; + if (record->n_labels > UINT_MAX / sizeof *record->labels) { - int i; + sys_error (r, r->pos - 4, _("Invalid number of labels %u."), + record->n_labels); + return false; + } + record->labels = pool_nmalloc (r->pool, record->n_labels, + sizeof *record->labels); + for (i = 0; i < record->n_labels; i++) + { + struct sfm_value_label *label = &record->labels[i]; + unsigned char label_len; + size_t padded_len; + + if (!read_bytes (r, label->value, sizeof label->value)) + return false; + + /* Read label length. */ + if (!read_bytes (r, &label_len, sizeof label_len)) + return false; + padded_len = ROUND_UP (label_len + 1, 8); + + /* Read label, padding. */ + label->label = pool_malloc (r->pool, padded_len + 1); + if (!read_bytes (r, label->label, padded_len - 1)) + return false; + label->label[label_len] = '\0'; + } + + /* Read record type of type 4 record. */ + if (!read_int (r, &type)) + return false; + if (type != 4) + { + sys_error (r, r->pos - 4, + _("Variable index record (type 4) does not immediately " + "follow value label record (type 3) as it should.")); + return false; + } + + /* Read number of variables associated with value label from type 4 + record. 
*/ + if (!read_uint (r, &record->n_vars)) + return false; + if (record->n_vars < 1 || record->n_vars > r->n_vars) + { + sys_error (r, r->pos - 4, + _("Number of variables associated with a value label (%u) " + "is not between 1 and the number of variables (%zu)."), + record->n_vars, r->n_vars); + return false; + } + + record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars); + for (i = 0; i < record->n_vars; i++) + if (!read_int (r, &record->vars[i])) + return false; + + return true; +} + +/* Reads a document record from R. Returns true if successful, false on + error. */ +static bool +read_document_record (struct sfm_reader *r) +{ + int n_lines; + if (!read_int (r, &n_lines)) + return false; + else if (n_lines == 0) + return true; + else if (n_lines < 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH) + { + sys_error (r, r->pos, + _("Number of document lines (%d) " + "must be greater than 0 and less than %d."), + n_lines, INT_MAX / DOC_LINE_LENGTH); + return false; + } + + struct sfm_document_record *record; + record = pool_malloc (r->pool, sizeof *record); + record->pos = r->pos; + record->n_lines = n_lines; + record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines); + if (!read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines)) + return false; + + r->document = record; + return true; +} + +static bool +read_extension_record_header (struct sfm_reader *r, int subtype, + struct sfm_extension_record *record) +{ + record->subtype = subtype; + record->pos = r->pos; + if (!read_uint (r, &record->size) || !read_uint (r, &record->count)) + return false; + + /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1 + allows an extra byte for a null terminator, used by some + extension processing routines. 
*/ + if (record->size != 0 + && xsum (1, xtimes (record->count, record->size)) >= UINT_MAX) + { + sys_error (r, record->pos, "Record type 7 subtype %d too large.", + subtype); + return false; + } + + return true; +} + +/* Reads an extension record from R into RECORD. */ +static bool +read_extension_record (struct sfm_reader *r, int subtype, + struct sfm_extension_record **recordp) +{ + struct extension_record_type + { + int subtype; + int size; + int count; + }; + + static const struct extension_record_type types[] = + { + /* Implemented record types. */ + { EXT_INTEGER, 4, 8 }, + { EXT_FLOAT, 8, 3 }, + { EXT_MRSETS, 1, 0 }, + { EXT_PRODUCT_INFO, 1, 0 }, + { EXT_DISPLAY, 4, 0 }, + { EXT_LONG_NAMES, 1, 0 }, + { EXT_LONG_STRINGS, 1, 0 }, + { EXT_NCASES, 8, 2 }, + { EXT_FILE_ATTRS, 1, 0 }, + { EXT_VAR_ATTRS, 1, 0 }, + { EXT_MRSETS2, 1, 0 }, + { EXT_ENCODING, 1, 0 }, + { EXT_LONG_LABELS, 1, 0 }, + { EXT_LONG_MISSING, 1, 0 }, + + /* Ignored record types. */ + { EXT_VAR_SETS, 0, 0 }, + { EXT_DATE, 0, 0 }, + { EXT_DATA_ENTRY, 0, 0 }, + { EXT_DATAVIEW, 0, 0 }, + }; + + const struct extension_record_type *type; + struct sfm_extension_record *record; + size_t n_bytes; + + *recordp = NULL; + record = pool_malloc (r->pool, sizeof *record); + if (!read_extension_record_header (r, subtype, record)) + return false; + n_bytes = record->count * record->size; + + for (type = types; type < &types[sizeof types / sizeof *types]; type++) + if (subtype == type->subtype) + { + if (type->size > 0 && record->size != type->size) + sys_warn (r, record->pos, + _("Record type 7, subtype %d has bad size %u " + "(expected %d)."), subtype, record->size, type->size); + else if (type->count > 0 && record->count != type->count) + sys_warn (r, record->pos, + _("Record type 7, subtype %d has bad count %u " + "(expected %d)."), subtype, record->count, type->count); + else if (type->count == 0 && type->size == 0) + { + /* Ignore this record. 
*/ + } + else + { + char *data = pool_malloc (r->pool, n_bytes + 1); + data[n_bytes] = '\0'; + + record->data = data; + if (!read_bytes (r, record->data, n_bytes)) + return false; + *recordp = record; + return true; + } + + goto skip; + } + + sys_warn (r, record->pos, + _("Unrecognized record type 7, subtype %d. For help, please " + "send this file to %s and mention that you were using %s."), + subtype, PACKAGE_BUGREPORT, PACKAGE_STRING); + +skip: + return skip_bytes (r, n_bytes); +} + +static bool +skip_extension_record (struct sfm_reader *r, int subtype) +{ + struct sfm_extension_record record; + + return (read_extension_record_header (r, subtype, &record) + && skip_bytes (r, record.count * record.size)); +} + +static void +parse_header (struct sfm_reader *r, const struct sfm_header_record *header, + struct any_read_info *info, struct dictionary *dict) +{ + const char *dict_encoding = dict_get_encoding (dict); + struct substring product; + struct substring label; + char *fixed_label; + + /* Convert file label to UTF-8 and put it into DICT. */ + label = recode_substring_pool ("UTF-8", dict_encoding, + ss_cstr (header->file_label), r->pool); + ss_trim (&label, ss_cstr (" ")); + label.string[label.length] = '\0'; + fixed_label = fix_line_ends (label.string); + dict_set_label (dict, fixed_label); + free (fixed_label); + + /* Put creation date and time in UTF-8 into INFO. */ + info->creation_date = recode_string ("UTF-8", dict_encoding, + header->creation_date, -1); + info->creation_time = recode_string ("UTF-8", dict_encoding, + header->creation_time, -1); + + /* Put product name into INFO, dropping eye-catcher string if present. */ + product = recode_substring_pool ("UTF-8", dict_encoding, + ss_cstr (header->eye_catcher), r->pool); + ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE")); + ss_trim (&product, ss_cstr (" ")); + info->product = ss_xstrdup (product); +} + +/* Reads a variable (type 2) record from R and adds the + corresponding variable to DICT. 
+ Also skips past additional variable records for long string + variables. */ +static bool +parse_variable_records (struct sfm_reader *r, struct dictionary *dict, + struct sfm_var_record *var_recs, size_t n_var_recs) +{ + const char *dict_encoding = dict_get_encoding (dict); + struct sfm_var_record *rec; + int n_warnings = 0; + + for (rec = var_recs; rec < &var_recs[n_var_recs]; ) + { + struct variable *var; + size_t n_values; + char *name; + size_t i; + + name = recode_string_pool ("UTF-8", dict_encoding, + rec->name, -1, r->pool); + name[strcspn (name, " ")] = '\0'; + + if (!dict_id_is_valid (dict, name, false) + || name[0] == '$' || name[0] == '#') + { + sys_error (r, rec->pos, _("Invalid variable name `%s'."), name); + return false; + } + + if (rec->width < 0 || rec->width > 255) + { + sys_error (r, rec->pos, + _("Bad width %d for variable %s."), rec->width, name); + return false; + } + + var = rec->var = dict_create_var (dict, name, rec->width); + if (var == NULL) + { + char *new_name = dict_make_unique_var_name (dict, NULL, NULL); + sys_warn (r, rec->pos, _("Renaming variable with duplicate name " + "`%s' to `%s'."), + name, new_name); + var = rec->var = dict_create_var_assert (dict, new_name, rec->width); + var_set_short_name (var, 0, new_name); + free (new_name); + } + + /* Set the short name the same as the long name (even if we renamed + it). */ + var_set_short_name (var, 0, var_get_name (var)); + + /* Get variable label, if any. */ + if (rec->label) + { + char *utf8_label; - for (i = 1; i < nv; i++) + utf8_label = recode_string_pool ("UTF-8", dict_encoding, + rec->label, -1, r->pool); + var_set_label (var, utf8_label); + } + + /* Set missing values. */ + if (rec->missing_value_code != 0) { - /* Check for record type 2 and width -1. */ - if (read_int32 (r) != 2 || read_int32 (r) != -1) - sys_error (r, _("Missing string continuation record.")); - - /* Skip and ignore remaining continuation data. 
*/ - has_variable_label = read_int32 (r); - missing_value_code = read_int32 (r); - print_format = read_int32 (r); - write_format = read_int32 (r); - read_string (r, name, sizeof name); - - /* Variable label fields on continuation records have - been spotted in system files created by "SPSS Power - Macintosh Release 6.1". */ - if (has_variable_label) - skip_bytes (r, ROUND_UP (read_int32 (r), 4)); + int width = var_get_width (var); + struct missing_values mv; + + mv_init_pool (r->pool, &mv, width); + if (var_is_numeric (var)) + { + bool has_range = rec->missing_value_code < 0; + int n_discrete = (has_range + ? rec->missing_value_code == -3 + : rec->missing_value_code); + int ofs = 0; + + if (has_range) + { + double low = parse_float (r, rec->missing, 0); + double high = parse_float (r, rec->missing, 8); + + /* Deal with SPSS 21 change in representation. */ + if (low == SYSMIS) + low = LOWEST; + + mv_add_range (&mv, low, high); + ofs += 16; + } + + for (i = 0; i < n_discrete; i++) + { + mv_add_num (&mv, parse_float (r, rec->missing, ofs)); + ofs += 8; + } + } + else + for (i = 0; i < rec->missing_value_code; i++) + mv_add_str (&mv, rec->missing + 8 * i, MIN (width, 8)); + var_set_missing_values (var, &mv); } + + /* Set formats. */ + parse_format_spec (r, rec->pos + 12, rec->print_format, + PRINT_FORMAT, var, &n_warnings); + parse_format_spec (r, rec->pos + 16, rec->write_format, + WRITE_FORMAT, var, &n_warnings); + + /* Account for values. + Skip long string continuation records, if any. */ + n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8); + for (i = 1; i < n_values; i++) + if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1) + { + sys_error (r, rec->pos, _("Missing string continuation record.")); + return false; + } + rec += n_values; } + + return true; } /* Translates the format spec from sysfile format to internal format. 
*/ static void -parse_format_spec (struct sfm_reader *r, uint32_t s, +parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format, enum which_format which, struct variable *v, - int *format_warning_cnt) + int *n_warnings) { - const int max_format_warnings = 8; + const int max_warnings = 8; + uint8_t raw_type = format >> 16; + uint8_t w = format >> 8; + uint8_t d = format; struct fmt_spec f; - uint8_t raw_type = s >> 16; - uint8_t w = s >> 8; - uint8_t d = s; - bool ok; - - if (!fmt_from_io (raw_type, &f.type)) - sys_error (r, _("Unknown variable format %d."), (int) raw_type); + f.w = w; f.d = d; msg_disable (); - ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v)); + ok = (fmt_from_io (raw_type, &f.type) + && fmt_check_output (&f) + && fmt_check_width_compat (&f, var_get_width (v))); msg_enable (); - - if (ok) + + if (ok) { if (which == PRINT_FORMAT) var_set_print_format (v, &f); else var_set_write_format (v, &f); } - else if (*++format_warning_cnt <= max_format_warnings) + else if (format == 0) { - char fmt_string[FMT_STRING_LEN_MAX + 1]; - sys_warn (r, _("%s variable %s has invalid %s format %s."), - var_is_numeric (v) ? _("Numeric") : _("String"), - var_get_name (v), - which == PRINT_FORMAT ? _("print") : _("write"), - fmt_to_string (&f, fmt_string)); - - if (*format_warning_cnt == max_format_warnings) - sys_warn (r, _("Suppressing further invalid format warnings.")); + /* Actually observed in the wild. No point in warning about it. */ } -} - -/* Sets the weighting variable in DICT to the variable - corresponding to the given 1-based VALUE_IDX, if VALUE_IDX is - nonzero. 
*/ -static void -setup_weight (struct sfm_reader *r, int weight_idx, - struct variable **var_by_value_idx, struct dictionary *dict) -{ - if (weight_idx != 0) + else if (++*n_warnings <= max_warnings) { - struct variable *weight_var - = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx); - if (var_is_numeric (weight_var)) - dict_set_weight (dict, weight_var); + if (which == PRINT_FORMAT) + sys_warn (r, pos, _("Variable %s with width %d has invalid print " + "format 0x%x."), + var_get_name (v), var_get_width (v), format); else - sys_error (r, _("Weighting variable must be numeric.")); - } -} - -/* Reads a document record, type 6, from system file R, and sets up - the documents and n_documents fields in the associated - dictionary. */ -static void -read_documents (struct sfm_reader *r, struct dictionary *dict) -{ - int line_cnt; - char *documents; - - if (dict_get_documents (dict) != NULL) - sys_error (r, _("Multiple type 6 (document) records.")); + sys_warn (r, pos, _("Variable %s with width %d has invalid write " + "format 0x%x."), + var_get_name (v), var_get_width (v), format); - line_cnt = read_int32 (r); - if (line_cnt <= 0) - sys_error (r, _("Number of document lines (%d) " - "must be greater than 0."), line_cnt); - - documents = pool_nmalloc (r->pool, line_cnt + 1, DOC_LINE_LENGTH); - read_string (r, documents, DOC_LINE_LENGTH * line_cnt + 1); - if (strlen (documents) == DOC_LINE_LENGTH * line_cnt) - dict_set_documents (dict, documents); - else - sys_error (r, _("Document line contains null byte.")); - pool_free (r->pool, documents); + if (*n_warnings == max_warnings) + sys_warn (r, -1, _("Suppressing further invalid format warnings.")); + } } -/* Read a type 7 extension record. 
*/ static void -read_extension_record (struct sfm_reader *r, struct dictionary *dict) +parse_document (struct dictionary *dict, struct sfm_document_record *record) { - int subtype = read_int32 (r); - size_t size = read_int32 (r); - size_t count = read_int32 (r); - size_t bytes = size * count; - - /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1 - allows an extra byte for a null terminator, used by some - extension processing routines. */ - if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size)))) - sys_error (r, "Record type 7 subtype %d too large.", subtype); + const char *p; - switch (subtype) + for (p = record->documents; + p < record->documents + DOC_LINE_LENGTH * record->n_lines; + p += DOC_LINE_LENGTH) { - case 3: - read_machine_int32_info (r, size, count); - return; - - case 4: - read_machine_flt64_info (r, size, count); - return; - - case 5: - /* Variable sets information. We don't use these yet. - They only apply to GUIs; see VARSETS on the APPLY - DICTIONARY command in SPSS documentation. */ - break; - - case 6: - /* DATE variable information. We don't use it yet, but we - should. */ - break; - - case 7: - /* Unknown purpose. */ - break; - - case 11: - read_display_parameters (r, size, count, dict); - return; - - case 13: - read_long_var_name_map (r, size, count, dict); - return; + struct substring line; - case 14: - read_long_string_map (r, size, count, dict); - return; + line = recode_substring_pool ("UTF-8", dict_get_encoding (dict), + ss_buffer (p, DOC_LINE_LENGTH), NULL); + ss_rtrim (&line, ss_cstr (" ")); + line.string[line.length] = '\0'; - case 16: - /* New in SPSS v14? Unknown purpose. */ - break; + dict_add_document_line (dict, line.string, false); - case 17: - /* Text field that defines variable attributes. New in - SPSS 14. */ - break; - - default: - sys_warn (r, _("Unrecognized record type 7, subtype %d."), subtype); - break; + ss_dealloc (&line); } - - skip_bytes (r, bytes); } -/* Read record type 7, subtype 3. 
*/ -static void -read_machine_int32_info (struct sfm_reader *r, size_t size, size_t count) +/* Parses record type 7, subtype 3. */ +static bool +parse_machine_integer_info (struct sfm_reader *r, + const struct sfm_extension_record *record, + struct any_read_info *info) { - int version_major UNUSED = read_int32 (r); - int version_minor UNUSED = read_int32 (r); - int version_revision UNUSED = read_int32 (r); - int machine_code UNUSED = read_int32 (r); - int float_representation = read_int32 (r); - int compression_code UNUSED = read_int32 (r); - int integer_representation = read_int32 (r); - int character_code UNUSED = read_int32 (r); + int float_representation, expected_float_format; + int integer_representation, expected_integer_format; - int expected_float_format; - int expected_integer_format; - - if (size != 4 || count != 8) - sys_error (r, _("Bad size (%u) or count (%u) field on record type 7, " - "subtype 3."), - (unsigned int) size, (unsigned int) count); + /* Save version info. */ + info->version_major = parse_int (r, record->data, 0); + info->version_minor = parse_int (r, record->data, 4); + info->version_revision = parse_int (r, record->data, 8); /* Check floating point format. */ + float_representation = parse_int (r, record->data, 16); if (r->float_format == FLOAT_IEEE_DOUBLE_BE || r->float_format == FLOAT_IEEE_DOUBLE_LE) expected_float_format = 1; @@ -781,11 +1634,16 @@ read_machine_int32_info (struct sfm_reader *r, size_t size, size_t count) else NOT_REACHED (); if (float_representation != expected_float_format) - sys_error (r, _("Floating-point representation indicated by " - "system file (%d) differs from expected (%d)."), - r->float_format, expected_float_format); + { + sys_error (r, record->pos, + _("Floating-point representation indicated by " + "system file (%d) differs from expected (%d)."), + float_representation, expected_float_format); + return false; + } /* Check integer format. 
*/ + integer_representation = parse_int (r, record->data, 24); if (r->integer_format == INTEGER_MSB_FIRST) expected_integer_format = 1; else if (r->integer_format == INTEGER_LSB_FIRST) @@ -793,421 +1651,1089 @@ read_machine_int32_info (struct sfm_reader *r, size_t size, size_t count) else NOT_REACHED (); if (integer_representation != expected_integer_format) - { - static const char *endian[] = {N_("little-endian"), N_("big-endian")}; - sys_warn (r, _("Integer format indicated by system file (%s) " - "differs from expected (%s)."), - gettext (endian[integer_representation == 1]), - gettext (endian[expected_integer_format == 1])); - } + sys_warn (r, record->pos, + _("Integer format indicated by system file (%d) " + "differs from expected (%d)."), + integer_representation, expected_integer_format); + + return true; } -/* Read record type 7, subtype 4. */ +/* Parses record type 7, subtype 4. */ static void -read_machine_flt64_info (struct sfm_reader *r, size_t size, size_t count) +parse_machine_float_info (struct sfm_reader *r, + const struct sfm_extension_record *record) { - double sysmis = read_flt64 (r); - double highest = read_flt64 (r); - double lowest = read_flt64 (r); - - if (size != 8 || count != 3) - sys_error (r, _("Bad size (%u) or count (%u) on extension 4."), - (unsigned int) size, (unsigned int) count); + double sysmis = parse_float (r, record->data, 0); + double highest = parse_float (r, record->data, 8); + double lowest = parse_float (r, record->data, 16); if (sysmis != SYSMIS) - sys_warn (r, _("File specifies unexpected value %g as SYSMIS."), sysmis); + sys_warn (r, record->pos, + _("File specifies unexpected value %g (%a) as %s, " + "instead of %g (%a)."), + sysmis, sysmis, "SYSMIS", SYSMIS, SYSMIS); + if (highest != HIGHEST) - sys_warn (r, _("File specifies unexpected value %g as HIGHEST."), highest); - if (lowest != LOWEST) - sys_warn (r, _("File specifies unexpected value %g as LOWEST."), lowest); + sys_warn (r, record->pos, + _("File specifies 
unexpected value %g (%a) as %s, " + "instead of %g (%a)."), + highest, highest, "HIGHEST", HIGHEST, HIGHEST); + + /* SPSS before version 21 used a unique value just bigger than SYSMIS as + LOWEST. SPSS 21 uses SYSMIS for LOWEST, which is OK because LOWEST only + appears in a context (missing values) where SYSMIS cannot. */ + if (lowest != LOWEST && lowest != SYSMIS) + sys_warn (r, record->pos, + _("File specifies unexpected value %g (%a) as %s, " + "instead of %g (%a) or %g (%a)."), + lowest, lowest, "LOWEST", LOWEST, LOWEST, SYSMIS, SYSMIS); } -/* Read record type 7, subtype 11, which specifies how variables - should be displayed in GUI environments. */ +/* Parses record type 7, subtype 10. */ static void -read_display_parameters (struct sfm_reader *r, size_t size, size_t count, - struct dictionary *dict) +parse_extra_product_info (struct sfm_reader *r, + const struct sfm_extension_record *record, + struct any_read_info *info) { - const size_t n_vars = count / 3 ; - bool warned = false; - int i; + struct text_record *text; + + text = open_text_record (r, record, true); + info->product_ext = fix_line_ends (text_get_all (text)); + close_text_record (r, text); +} - if (count % 3 || n_vars != dict_get_var_cnt (dict)) - sys_error (r, _("Bad size (%u) or count (%u) on extension 11."), - (unsigned int) size, (unsigned int) count); +/* Parses record type 7, subtype 7 or 19. 
*/ +static void +parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record, + size_t *allocated_mrsets) +{ + struct text_record *text; - for (i = 0; i < n_vars; ++i) + text = open_text_record (r, record, false); + for (;;) { - int measure = read_int32 (r); - int width = read_int32 (r); - int align = read_int32 (r); - struct variable *v = dict_get_var (dict, i); + struct sfm_mrset *mrset; + size_t allocated_vars; + char delimiter; - /* spss v14 sometimes seems to set string variables' measure to zero */ - if ( 0 == measure && var_is_alpha (v) ) measure = 1; + /* Skip extra line feeds if present. */ + while (text_match (text, '\n')) + continue; + if (r->n_mrsets >= *allocated_mrsets) + r->mrsets = pool_2nrealloc (r->pool, r->mrsets, allocated_mrsets, + sizeof *r->mrsets); + mrset = &r->mrsets[r->n_mrsets]; + memset(mrset, 0, sizeof *mrset); - if (measure < 1 || measure > 3 || align < 0 || align > 2) + mrset->name = text_get_token (text, ss_cstr ("="), NULL); + if (mrset->name == NULL) + break; + + if (text_match (text, 'C')) { - if (!warned) - sys_warn (r, _("Invalid variable display parameters. " - "Default parameters substituted.")); - warned = true; - continue; + mrset->type = MRSET_MC; + if (!text_match (text, ' ')) + { + sys_warn (r, record->pos, + _("Missing space following `%c' at offset %zu " + "in MRSETS record."), 'C', text_pos (text)); + break; + } } + else if (text_match (text, 'D')) + { + mrset->type = MRSET_MD; + mrset->cat_source = MRSET_VARLABELS; + } + else if (text_match (text, 'E')) + { + char *number; - var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL - : measure == 2 ? MEASURE_ORDINAL - : MEASURE_SCALE)); - var_set_display_width (v, width); - var_set_alignment (v, (align == 0 ? ALIGN_LEFT - : align == 1 ? 
ALIGN_RIGHT - : ALIGN_CENTRE)); - } -} + mrset->type = MRSET_MD; + mrset->cat_source = MRSET_COUNTEDVALUES; + if (!text_match (text, ' ')) + { + sys_warn (r, record->pos, + _("Missing space following `%c' at offset %zu " + "in MRSETS record."), 'E', text_pos (text)); + break; + } -/* Reads record type 7, subtype 13, which gives the long name - that corresponds to each short name. Modifies variable names - in DICT accordingly. */ -static void -read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count, - struct dictionary *dict) -{ - struct variable_to_value_map *map; - struct variable *var; - char *long_name; - int warning_cnt = 0; - - map = open_variable_to_value_map (r, size * count); - while (read_variable_to_value_map (r, dict, map, &var, &long_name, - &warning_cnt)) - { - char short_name[SHORT_NAME_LEN + 1]; - strcpy (short_name, var_get_short_name (var)); + number = text_get_token (text, ss_cstr (" "), NULL); + if (!strcmp (number, "11")) + mrset->label_from_var_label = true; + else if (strcmp (number, "1")) + sys_warn (r, record->pos, + _("Unexpected label source value following `E' " + "at offset %zu in MRSETS record."), + text_pos (text)); + } + else + { + sys_warn (r, record->pos, + _("Missing `C', `D', or `E' at offset %zu " + "in MRSETS record."), + text_pos (text)); + break; + } - /* Validate long name. */ - if (!var_is_valid_name (long_name, false)) + if (mrset->type == MRSET_MD) { - sys_warn (r, _("Long variable mapping from %s to invalid " - "variable name `%s'."), - var_get_name (var), long_name); - continue; + mrset->counted = text_parse_counted_string (r, text); + if (mrset->counted == NULL) + break; } - - /* Identify any duplicates. 
*/ - if (strcasecmp (short_name, long_name) - && dict_lookup_var (dict, long_name) != NULL) + + mrset->label = text_parse_counted_string (r, text); + if (mrset->label == NULL) + break; + + allocated_vars = 0; + do { - sys_warn (r, _("Duplicate long variable name `%s' " - "within system file."), long_name); - continue; + const char *var; + + var = text_get_token (text, ss_cstr (" \n"), &delimiter); + if (var == NULL) + { + if (delimiter != '\n') + sys_warn (r, record->pos, + _("Missing new-line parsing variable names " + "at offset %zu in MRSETS record."), + text_pos (text)); + break; + } + + if (mrset->n_vars >= allocated_vars) + mrset->vars = pool_2nrealloc (r->pool, mrset->vars, + &allocated_vars, + sizeof *mrset->vars); + mrset->vars[mrset->n_vars++] = var; } + while (delimiter != '\n'); - /* Set long name. Renaming a variable may clear the short - name, but we want to retain it, so re-set it - explicitly. */ - dict_rename_var (dict, var, long_name); - var_set_short_name (var, short_name); + r->n_mrsets++; } - close_variable_to_value_map (r, map); - r->has_long_var_names = true; + close_text_record (r, text); } -/* Reads record type 7, subtype 14, which gives the real length - of each very long string. Rearranges DICT accordingly. */ static void -read_long_string_map (struct sfm_reader *r, size_t size, size_t count, - struct dictionary *dict) +decode_mrsets (struct sfm_reader *r, struct dictionary *dict) { - struct variable_to_value_map *map; - struct variable *var; - char *length_s; - int warning_cnt = 0; - - r->has_vls = true; + const struct sfm_mrset *s; - map = open_variable_to_value_map (r, size * count); - while (read_variable_to_value_map (r, dict, map, &var, &length_s, - &warning_cnt)) + for (s = r->mrsets; s < &r->mrsets[r->n_mrsets]; s++) { - long length, remaining_length; - size_t idx; - - /* Get length. 
*/ - length = strtol (length_s, NULL, 10); - if (length < MIN_VERY_LONG_STRING || length == LONG_MAX) + struct stringi_set var_names; + struct mrset *mrset; + char *name; + int width; + size_t i; + + name = recode_string ("UTF-8", r->encoding, s->name, -1); + if (!mrset_is_valid_name (name, dict_get_encoding (dict), false)) { - sys_warn (r, _("%s listed as string of length %s " - "in length table."), - var_get_name (var), length_s); + sys_warn (r, -1, _("Invalid multiple response set name `%s'."), + name); + free (name); continue; } - /* Group multiple variables into single variable - and delete all but the first. */ - remaining_length = length; - for (idx = var_get_dict_index (var); remaining_length > 0; idx++) - if (idx < dict_get_var_cnt (dict)) - remaining_length -= MIN (var_get_width (dict_get_var (dict, idx)), - EFFECTIVE_LONG_STRING_LENGTH); - else - sys_error (r, _("Very long string %s overflows dictionary."), - var_get_name (var)); - dict_delete_consecutive_vars (dict, - var_get_dict_index (var) + 1, - idx - var_get_dict_index (var) - 1); + mrset = xzalloc (sizeof *mrset); + mrset->name = name; + mrset->type = s->type; + mrset->cat_source = s->cat_source; + mrset->label_from_var_label = s->label_from_var_label; + if (s->label[0] != '\0') + mrset->label = recode_string ("UTF-8", r->encoding, s->label, -1); + + stringi_set_init (&var_names); + mrset->vars = xmalloc (s->n_vars * sizeof *mrset->vars); + width = INT_MAX; + for (i = 0; i < s->n_vars; i++) + { + struct variable *var; + char *var_name; - /* Assign all the length to the first variable. 
*/ - var_set_width (var, length); + var_name = recode_string ("UTF-8", r->encoding, s->vars[i], -1); + + var = dict_lookup_var (dict, var_name); + if (var == NULL) + { + free (var_name); + continue; + } + if (!stringi_set_insert (&var_names, var_name)) + { + sys_warn (r, -1, + _("MRSET %s contains duplicate variable name %s."), + mrset->name, var_name); + free (var_name); + continue; + } + free (var_name); + + if (mrset->label == NULL && mrset->label_from_var_label + && var_has_label (var)) + mrset->label = xstrdup (var_get_label (var)); + + if (mrset->n_vars + && var_get_type (var) != var_get_type (mrset->vars[0])) + { + sys_warn (r, -1, + _("MRSET %s contains both string and " + "numeric variables."), mrset->name); + continue; + } + width = MIN (width, var_get_width (var)); + + mrset->vars[mrset->n_vars++] = var; + } + + if (mrset->n_vars < 2) + { + if (mrset->n_vars == 0) + sys_warn (r, -1, _("MRSET %s has no variables."), mrset->name); + else + sys_warn (r, -1, _("MRSET %s has only one variable."), + mrset->name); + mrset_destroy (mrset); + stringi_set_destroy (&var_names); + continue; + } + + if (mrset->type == MRSET_MD) + { + mrset->width = width; + value_init (&mrset->counted, width); + if (width == 0) + mrset->counted.f = c_strtod (s->counted, NULL); + else + value_copy_str_rpad (&mrset->counted, width, + (const uint8_t *) s->counted, ' '); + } + + dict_add_mrset (dict, mrset); + stringi_set_destroy (&var_names); } - close_variable_to_value_map (r, map); - dict_compact_values (dict); } -/* Reads value labels from sysfile H and inserts them into the - associated dictionary. */ +/* Read record type 7, subtype 11, which specifies how variables + should be displayed in GUI environments. 
*/ static void -read_value_labels (struct sfm_reader *r, - struct dictionary *dict, struct variable **var_by_value_idx) +parse_display_parameters (struct sfm_reader *r, + const struct sfm_extension_record *record, + struct dictionary *dict) { - struct pool *subpool; - - struct label + bool includes_width; + bool warned = false; + size_t n_vars; + size_t ofs; + size_t i; + + n_vars = dict_get_var_cnt (dict); + if (record->count == 3 * n_vars) + includes_width = true; + else if (record->count == 2 * n_vars) + includes_width = false; + else { - char raw_value[8]; /* Value as uninterpreted bytes. */ - union value value; /* Value. */ - char *label; /* Null-terminated label string. */ - }; + sys_warn (r, record->pos, + _("Extension 11 has bad count %u (for %zu variables)."), + record->count, n_vars); + return; + } - struct label *labels = NULL; - int label_cnt; /* Number of labels. */ + ofs = 0; + for (i = 0; i < n_vars; ++i) + { + struct variable *v = dict_get_var (dict, i); + int measure, width, align; - struct variable **var = NULL; /* Associated variables. */ - int var_cnt; /* Number of associated variables. */ + measure = parse_int (r, record->data, ofs); + ofs += 4; - int i; + if (includes_width) + { + width = parse_int (r, record->data, ofs); + ofs += 4; + } + else + width = 0; - subpool = pool_create_subpool (r->pool); + align = parse_int (r, record->data, ofs); + ofs += 4; - /* Read the type 3 record and record its contents. We can't do - much with the data yet because we don't know whether it is - of numeric or string type. */ + /* SPSS sometimes seems to set variables' measure to zero. */ + if (0 == measure) + measure = 1; - /* Read number of labels. */ - label_cnt = read_int32 (r); + if (measure < 1 || measure > 3 || align < 0 || align > 2) + { + if (!warned) + sys_warn (r, record->pos, + _("Invalid variable display parameters for variable " + "%zu (%s). 
Default parameters substituted."), + i, var_get_name (v)); + warned = true; + continue; + } - if (label_cnt >= INT32_MAX / sizeof *labels) - { - sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."), - label_cnt); - label_cnt = 0; + var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL + : measure == 2 ? MEASURE_ORDINAL + : MEASURE_SCALE)); + var_set_alignment (v, (align == 0 ? ALIGN_LEFT + : align == 1 ? ALIGN_RIGHT + : ALIGN_CENTRE)); + + /* Older versions (SPSS 9.0) sometimes set the display + width to zero. This causes confusion in the GUI, so + only set the width if it is nonzero. */ + if (width > 0) + var_set_display_width (v, width); } +} - /* Read each value/label tuple into labels[]. */ - labels = pool_nalloc (subpool, label_cnt, sizeof *labels); - for (i = 0; i < label_cnt; i++) - { - struct label *label = labels + i; - unsigned char label_len; - size_t padded_len; +static void +rename_var_and_save_short_names (struct sfm_reader *r, off_t pos, + struct dictionary *dict, + struct variable *var, const char *new_name) +{ + size_t n_short_names; + char **short_names; + size_t i; - /* Read value. */ - read_bytes (r, label->raw_value, sizeof label->raw_value); + /* Renaming a variable may clear its short names, but we + want to retain them, so we save them and re-set them + afterward. */ + n_short_names = var_get_short_name_cnt (var); + short_names = xnmalloc (n_short_names, sizeof *short_names); + for (i = 0; i < n_short_names; i++) + { + const char *s = var_get_short_name (var, i); + short_names[i] = s != NULL ? xstrdup (s) : NULL; + } - /* Read label length. */ - read_bytes (r, &label_len, sizeof label_len); - padded_len = ROUND_UP (label_len + 1, 8); + /* Set long name. */ + if (!dict_try_rename_var (dict, var, new_name)) + sys_warn (r, pos, _("Duplicate long variable name `%s'."), new_name); - /* Read label, padding. 
*/ - label->label = pool_alloc (subpool, padded_len + 1); - read_bytes (r, label->label, padded_len - 1); - label->label[label_len] = 0; + /* Restore short names. */ + for (i = 0; i < n_short_names; i++) + { + var_set_short_name (var, i, short_names[i]); + free (short_names[i]); } + free (short_names); +} - /* Now, read the type 4 record that has the list of variables - to which the value labels are to be applied. */ +/* Parses record type 7, subtype 13, which gives the long name that corresponds + to each short name. Modifies variable names in DICT accordingly. */ +static void +parse_long_var_name_map (struct sfm_reader *r, + const struct sfm_extension_record *record, + struct dictionary *dict) +{ + struct text_record *text; + struct variable *var; + char *long_name; - /* Read record type of type 4 record. */ - if (read_int32 (r) != 4) - sys_error (r, _("Variable index record (type 4) does not immediately " - "follow value label record (type 3) as it should.")); + if (record == NULL) + { + /* There are no long variable names. Use the short variable names, + converted to lowercase, as the long variable names. */ + size_t i; - /* Read number of variables associated with value label from type 4 - record. */ - var_cnt = read_int32 (r); - if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict)) - sys_error (r, _("Number of variables associated with a value label (%d) " - "is not between 1 and the number of variables (%u)."), - var_cnt, (unsigned int) dict_get_var_cnt (dict)); - - /* Read the list of variables. */ - var = pool_nalloc (subpool, var_cnt, sizeof *var); - for (i = 0; i < var_cnt; i++) - { - var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int32 (r)); - if (var_is_long_string (var[i])) - sys_error (r, _("Value labels are not allowed on long string " - "variables (%s)."), var_get_name (var[i])); - } - - /* Type check the variables. 
*/ - for (i = 1; i < var_cnt; i++) - if (var_get_type (var[i]) != var_get_type (var[0])) - sys_error (r, _("Variables associated with value label are not all of " - "identical type. Variable %s is %s, but variable " - "%s is %s."), - var_get_name (var[0]), - var_is_numeric (var[0]) ? _("numeric") : _("string"), - var_get_name (var[i]), - var_is_numeric (var[i]) ? _("numeric") : _("string")); - - /* Fill in labels[].value, now that we know the desired type. */ - for (i = 0; i < label_cnt; i++) - { - struct label *label = labels + i; - - if (var_is_alpha (var[0])) - buf_copy_rpad (label->value.s, sizeof label->value.s, - label->raw_value, sizeof label->raw_value); - else - label->value.f = flt64_to_double (r, (uint8_t *) label->raw_value); + for (i = 0; i < dict_get_var_cnt (dict); i++) + { + struct variable *var = dict_get_var (dict, i); + char *new_name; + + new_name = utf8_to_lower (var_get_name (var)); + rename_var_and_save_short_names (r, -1, dict, var, new_name); + free (new_name); + } + + return; } - - /* Assign the `value_label's to each variable. */ - for (i = 0; i < var_cnt; i++) + + /* Rename each of the variables, one by one. (In a correctly constructed + system file, this cannot create any intermediate duplicate variable names, + because all of the new variable names are longer than any of the old + variable names and thus there cannot be any overlaps.) */ + text = open_text_record (r, record, true); + while (read_variable_to_value_pair (r, dict, text, &var, &long_name)) { - struct variable *v = var[i]; - int j; + /* Validate long name. */ + if (!dict_id_is_valid (dict, long_name, false) + || long_name[0] == '$' || long_name[0] == '#') + { + sys_warn (r, record->pos, + _("Long variable mapping from %s to invalid " + "variable name `%s'."), + var_get_name (var), long_name); + continue; + } - /* Add each label to the variable. 
*/ - for (j = 0; j < label_cnt; j++) - { - struct label *label = &labels[j]; - if (!var_add_value_label (v, &label->value, label->label)) + rename_var_and_save_short_names (r, record->pos, dict, var, long_name); + } + close_text_record (r, text); +} + +/* Reads record type 7, subtype 14, which gives the real length + of each very long string. Rearranges DICT accordingly. */ +static bool +parse_long_string_map (struct sfm_reader *r, + const struct sfm_extension_record *record, + struct dictionary *dict) +{ + struct text_record *text; + struct variable *var; + char *length_s; + + text = open_text_record (r, record, true); + while (read_variable_to_value_pair (r, dict, text, &var, &length_s)) + { + size_t idx = var_get_dict_index (var); + long int length; + int segment_cnt; + int i; + + /* Get length. */ + length = strtol (length_s, NULL, 10); + if (length < 1 || length > MAX_STRING) + { + sys_warn (r, record->pos, + _("%s listed as string of invalid length %s " + "in very long string record."), + var_get_name (var), length_s); + continue; + } + + /* Check segments. */ + segment_cnt = sfm_width_to_segments (length); + if (segment_cnt == 1) + { + sys_warn (r, record->pos, + _("%s listed in very long string record with width %s, " + "which requires only one segment."), + var_get_name (var), length_s); + continue; + } + if (idx + segment_cnt > dict_get_var_cnt (dict)) + { + sys_error (r, record->pos, + _("Very long string %s overflows dictionary."), + var_get_name (var)); + return false; + } + + /* Get the short names from the segments and check their + lengths. 
*/ + for (i = 0; i < segment_cnt; i++) + { + struct variable *seg = dict_get_var (dict, idx + i); + int alloc_width = sfm_segment_alloc_width (length, i); + int width = var_get_width (seg); + + if (i > 0) + var_set_short_name (var, i, var_get_short_name (seg, 0)); + if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8)) { - if (var_is_numeric (var[0])) - sys_warn (r, _("Duplicate value label for %g on %s."), - label->value.f, var_get_name (v)); - else - sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."), - var_get_width (v), label->value.s, - var_get_name (v)); + sys_error (r, record->pos, + _("Very long string with width %ld has segment %d " + "of width %d (expected %d)."), + length, i, width, alloc_width); + return false; } - } + } + dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1); + var_set_width (var, length); } + close_text_record (r, text); + dict_compact_values (dict); - pool_destroy (subpool); + return true; } - -/* Case reader. */ -static void partial_record (struct sfm_reader *r) - NO_RETURN; -static bool read_case_number (struct sfm_reader *, double *); -static bool read_case_string (struct sfm_reader *, char *, size_t); -static int read_opcode (struct sfm_reader *); -static bool read_compressed_number (struct sfm_reader *, double *); -static bool read_compressed_string (struct sfm_reader *, char *); -static bool read_whole_strings (struct sfm_reader *, char *, size_t); - -/* Reads one case from READER's file into C. Returns nonzero - only if successful. 
*/ -int -sfm_read_case (struct sfm_reader *r, struct ccase *c) +static bool +parse_value_labels (struct sfm_reader *r, struct dictionary *dict, + const struct sfm_var_record *var_recs, size_t n_var_recs, + const struct sfm_value_label_record *record) { - if (r->error) - return 0; + struct variable **vars; + char **utf8_labels; + size_t i; - if (setjmp (r->bail_out)) - return 0; + utf8_labels = pool_nmalloc (r->pool, record->n_labels, sizeof *utf8_labels); + for (i = 0; i < record->n_labels; i++) + utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict), + record->labels[i].label, -1, + r->pool); - if (!r->compressed && sizeof (double) == 8 && !r->has_vls) + vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars); + for (i = 0; i < record->n_vars; i++) { - /* Fast path. Read the whole case directly. */ - if (!try_read_bytes (r, case_data_all_rw (c), - sizeof (union value) * r->value_cnt)) - return 0; + vars[i] = lookup_var_by_index (r, record->pos, + var_recs, n_var_recs, record->vars[i]); + if (vars[i] == NULL) + return false; + } - /* Convert floating point numbers to native format if needed. */ - if (r->float_format != FLOAT_NATIVE_DOUBLE) + for (i = 1; i < record->n_vars; i++) + if (var_get_type (vars[i]) != var_get_type (vars[0])) + { + sys_error (r, record->pos, + _("Variables associated with value label are not all of " + "identical type. Variable %s is %s, but variable " + "%s is %s."), + var_get_name (vars[0]), + var_is_numeric (vars[0]) ? _("numeric") : _("string"), + var_get_name (vars[i]), + var_is_numeric (vars[i]) ? 
_("numeric") : _("string")); + return false; + } + + for (i = 0; i < record->n_vars; i++) + { + struct variable *var = vars[i]; + int width; + size_t j; + + width = var_get_width (var); + if (width > 8) { - int i; - - for (i = 0; i < r->var_cnt; i++) - if (r->vars[i].width == 0) - { - double *d = &case_data_rw_idx (c, r->vars[i].case_index)->f; - float_convert (r->float_format, d, FLOAT_NATIVE_DOUBLE, d); - } + sys_error (r, record->pos, + _("Value labels may not be added to long string " + "variables (e.g. %s) using records types 3 and 4."), + var_get_name (var)); + return false; + } + + for (j = 0; j < record->n_labels; j++) + { + struct sfm_value_label *label = &record->labels[j]; + union value value; + + value_init (&value, width); + if (width == 0) + value.f = parse_float (r, label->value, 0); + else + memcpy (value_str_rw (&value, width), label->value, width); + + if (!var_add_value_label (var, &value, utf8_labels[j])) + { + if (r->written_by_readstat) + { + /* Ignore the problem. ReadStat is buggy and emits value + labels whose values are longer than string variables' + widths, that are identical in the actual width of the + variable, e.g. both values "ABC123" and "ABC456" for a + string variable with width 3. 
*/ + } + else if (var_is_numeric (var)) + sys_warn (r, record->pos, + _("Duplicate value label for %g on %s."), + value.f, var_get_name (var)); + else + sys_warn (r, record->pos, + _("Duplicate value label for `%.*s' on %s."), + width, value_str (&value, width), + var_get_name (var)); + } + + value_destroy (&value, width); } - return 1; } - else + + pool_free (r->pool, vars); + for (i = 0; i < record->n_labels; i++) + pool_free (r->pool, utf8_labels[i]); + pool_free (r->pool, utf8_labels); + + return true; +} + +static struct variable * +lookup_var_by_index (struct sfm_reader *r, off_t offset, + const struct sfm_var_record *var_recs, size_t n_var_recs, + int idx) +{ + const struct sfm_var_record *rec; + + if (idx < 1 || idx > n_var_recs) { - /* Slow path. Convert from external to internal format. */ - int i; + sys_error (r, offset, + _("Variable index %d not in valid range 1...%zu."), + idx, n_var_recs); + return NULL; + } + + rec = &var_recs[idx - 1]; + if (rec->var == NULL) + { + sys_error (r, offset, + _("Variable index %d refers to long string continuation."), + idx); + return NULL; + } + + return rec->var; +} + +/* Parses a set of custom attributes from TEXT into ATTRS. + ATTRS may be a null pointer, in which case the attributes are + read but discarded. */ +static void +parse_attributes (struct sfm_reader *r, struct text_record *text, + struct attrset *attrs) +{ + do + { + struct attribute *attr; + char *key; + int index; + + /* Parse the key. */ + key = text_get_token (text, ss_cstr ("("), NULL); + if (key == NULL) + return; - for (i = 0; i < r->var_cnt; i++) + attr = attribute_create (key); + for (index = 1; ; index++) { - struct sfm_var *sv = &r->vars[i]; - union value *v = case_data_rw_idx (c, sv->case_index); + /* Parse the value. 
*/ + char *value; + size_t length; - if (sv->width == 0) + value = text_get_token (text, ss_cstr ("\n"), NULL); + if (value == NULL) { - if (!read_case_number (r, &v->f)) - goto eof; + text_warn (r, text, _("Error parsing attribute value %s[%d]."), + key, index); + break; + } + + length = strlen (value); + if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'') + { + value[length - 1] = '\0'; + attribute_add_value (attr, value + 1); } else { - /* Read the string data in segments up to 255 bytes - at a time, packed into 8-byte units. */ - const int max_chunk = MIN_VERY_LONG_STRING - 1; - int ofs, chunk_size; - for (ofs = 0; ofs < sv->width; ofs += chunk_size) - { - chunk_size = MIN (max_chunk, sv->width - ofs); - if (!read_case_string (r, v->s + ofs, chunk_size)) - { - if (ofs) - partial_record (r); - goto eof; - } - } + text_warn (r, text, + _("Attribute value %s[%d] is not quoted: %s."), + key, index, value); + attribute_add_value (attr, value); + } + + /* Was this the last value for this attribute? */ + if (text_match (text, ')')) + break; + } + if (attrs != NULL) + { + if (!attrset_try_add (attrs, attr)) + { + text_warn (r, text, _("Duplicate attribute %s."), + attribute_get_name (attr)); + attribute_destroy (attr); + } + } + else + attribute_destroy (attr); + } + while (!text_match (text, '/')); +} + +/* Reads record type 7, subtype 17, which lists custom + attributes on the data file. */ +static void +parse_data_file_attributes (struct sfm_reader *r, + const struct sfm_extension_record *record, + struct dictionary *dict) +{ + struct text_record *text = open_text_record (r, record, true); + parse_attributes (r, text, dict_get_attributes (dict)); + close_text_record (r, text); +} + +/* Parses record type 7, subtype 18, which lists custom + attributes on individual variables. 
*/ +static void +parse_variable_attributes (struct sfm_reader *r, + const struct sfm_extension_record *record, + struct dictionary *dict) +{ + struct text_record *text; + struct variable *var; + + text = open_text_record (r, record, true); + while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var)) + parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL); + close_text_record (r, text); +} - /* Very long strings have trailing wasted space - that we must skip. */ - if (sv->width >= MIN_VERY_LONG_STRING) +static void +assign_variable_roles (struct sfm_reader *r, struct dictionary *dict) +{ + size_t n_warnings = 0; + size_t i; + + for (i = 0; i < dict_get_var_cnt (dict); i++) + { + struct variable *var = dict_get_var (dict, i); + struct attrset *attrs = var_get_attributes (var); + const struct attribute *attr = attrset_lookup (attrs, "$@Role"); + if (attr != NULL) + { + int value = atoi (attribute_get_value (attr, 0)); + enum var_role role; + + switch (value) + { + case 0: + role = ROLE_INPUT; + break; + + case 1: + role = ROLE_TARGET; + break; + + case 2: + role = ROLE_BOTH; + break; + + case 3: + role = ROLE_NONE; + break; + + case 4: + role = ROLE_PARTITION; + break; + + case 5: + role = ROLE_SPLIT; + break; + + default: + role = ROLE_INPUT; + if (n_warnings++ == 0) + sys_warn (r, -1, _("Invalid role for variable %s."), + var_get_name (var)); + } + + var_set_role (var, role); + } + } + + if (n_warnings > 1) + sys_warn (r, -1, _("%zu other variables had invalid roles."), + n_warnings - 1); +} + +static bool +check_overflow (struct sfm_reader *r, + const struct sfm_extension_record *record, + size_t ofs, size_t length) +{ + size_t end = record->size * record->count; + if (length >= end || ofs + length > end) + { + sys_warn (r, record->pos + end, + _("Extension record subtype %d ends unexpectedly."), + record->subtype); + return false; + } + return true; +} + +static void +parse_long_string_value_labels (struct sfm_reader *r, + const 
struct sfm_extension_record *record, + struct dictionary *dict) +{ + const char *dict_encoding = dict_get_encoding (dict); + size_t end = record->size * record->count; + size_t ofs = 0; + + while (ofs < end) + { + char *var_name; + size_t n_labels, i; + struct variable *var; + union value value; + int var_name_len; + int width; + + /* Parse variable name length. */ + if (!check_overflow (r, record, ofs, 4)) + return; + var_name_len = parse_int (r, record->data, ofs); + ofs += 4; + + /* Parse variable name, width, and number of labels. */ + if (!check_overflow (r, record, ofs, var_name_len) + || !check_overflow (r, record, ofs, var_name_len + 8)) + return; + var_name = recode_string_pool ("UTF-8", dict_encoding, + (const char *) record->data + ofs, + var_name_len, r->pool); + width = parse_int (r, record->data, ofs + var_name_len); + n_labels = parse_int (r, record->data, ofs + var_name_len + 4); + ofs += var_name_len + 8; + + /* Look up 'var' and validate. */ + var = dict_lookup_var (dict, var_name); + if (var == NULL) + sys_warn (r, record->pos + ofs, + _("Ignoring long string value label record for " + "unknown variable %s."), var_name); + else if (var_is_numeric (var)) + { + sys_warn (r, record->pos + ofs, + _("Ignoring long string value label record for " + "numeric variable %s."), var_name); + var = NULL; + } + else if (width != var_get_width (var)) + { + sys_warn (r, record->pos + ofs, + _("Ignoring long string value label record for variable " + "%s because the record's width (%d) does not match the " + "variable's width (%d)."), + var_name, width, var_get_width (var)); + var = NULL; + } + + /* Parse values. */ + value_init_pool (r->pool, &value, width); + for (i = 0; i < n_labels; i++) + { + size_t value_length, label_length; + bool skip = var == NULL; + + /* Parse value length. */ + if (!check_overflow (r, record, ofs, 4)) + return; + value_length = parse_int (r, record->data, ofs); + ofs += 4; + + /* Parse value. 
*/ + if (!check_overflow (r, record, ofs, value_length)) + return; + if (!skip) + { + if (value_length == width) + memcpy (value_str_rw (&value, width), + (const uint8_t *) record->data + ofs, width); + else { - int bytes_read = (sv->width / max_chunk * 256 - + ROUND_UP (sv->width % max_chunk, 8)); - int total_bytes = sfm_width_to_bytes (sv->width); - int excess_bytes = total_bytes - bytes_read; - - while (excess_bytes > 0) - { - char buffer[1024]; - size_t chunk = MIN (sizeof buffer, excess_bytes); - if (!read_whole_strings (r, buffer, chunk)) - partial_record (r); - excess_bytes -= chunk; - } + sys_warn (r, record->pos + ofs, + _("Ignoring long string value label %zu for " + "variable %s, with width %d, that has bad value " + "width %zu."), + i, var_get_name (var), width, value_length); + skip = true; } } + ofs += value_length; + + /* Parse label length. */ + if (!check_overflow (r, record, ofs, 4)) + return; + label_length = parse_int (r, record->data, ofs); + ofs += 4; + + /* Parse label. */ + if (!check_overflow (r, record, ofs, label_length)) + return; + if (!skip) + { + char *label; + + label = recode_string_pool ("UTF-8", dict_encoding, + (const char *) record->data + ofs, + label_length, r->pool); + if (!var_add_value_label (var, &value, label)) + sys_warn (r, record->pos + ofs, + _("Duplicate value label for `%.*s' on %s."), + width, value_str (&value, width), + var_get_name (var)); + pool_free (r->pool, label); + } + ofs += label_length; } - return 1; + } +} - eof: - if (i != 0) - partial_record (r); - return 0; +static void +parse_long_string_missing_values (struct sfm_reader *r, + const struct sfm_extension_record *record, + struct dictionary *dict) +{ + const char *dict_encoding = dict_get_encoding (dict); + size_t end = record->size * record->count; + size_t ofs = 0; + + while (ofs < end) + { + struct missing_values mv; + char *var_name; + struct variable *var; + int n_missing_values; + int var_name_len; + size_t i; + + /* Parse variable name length. 
*/ + if (!check_overflow (r, record, ofs, 4)) + return; + var_name_len = parse_int (r, record->data, ofs); + ofs += 4; + + /* Parse variable name. */ + if (!check_overflow (r, record, ofs, var_name_len) + || !check_overflow (r, record, ofs, var_name_len + 1)) + return; + var_name = recode_string_pool ("UTF-8", dict_encoding, + (const char *) record->data + ofs, + var_name_len, r->pool); + ofs += var_name_len; + + /* Parse number of missing values. */ + n_missing_values = ((const uint8_t *) record->data)[ofs]; + if (n_missing_values < 1 || n_missing_values > 3) + sys_warn (r, record->pos + ofs, + _("Long string missing values record says variable %s " + "has %d missing values, but only 1 to 3 missing values " + "are allowed."), + var_name, n_missing_values); + ofs++; + + /* Look up 'var' and validate. */ + var = dict_lookup_var (dict, var_name); + if (var == NULL) + sys_warn (r, record->pos + ofs, + _("Ignoring long string missing value record for " + "unknown variable %s."), var_name); + else if (var_is_numeric (var)) + { + sys_warn (r, record->pos + ofs, + _("Ignoring long string missing value record for " + "numeric variable %s."), var_name); + var = NULL; + } + + /* Parse values. */ + mv_init_pool (r->pool, &mv, var ? var_get_width (var) : 8); + for (i = 0; i < n_missing_values; i++) + { + size_t value_length; + + /* Parse value length. */ + if (!check_overflow (r, record, ofs, 4)) + return; + value_length = parse_int (r, record->data, ofs); + ofs += 4; + + /* Parse value. */ + if (!check_overflow (r, record, ofs, value_length)) + return; + if (var != NULL + && i < 3 + && !mv_add_str (&mv, (const uint8_t *) record->data + ofs, + value_length)) + sys_warn (r, record->pos + ofs, + _("Ignoring long string missing value %zu for variable " + "%s, with width %d, that has bad value width %zu."), + i, var_get_name (var), var_get_width (var), + value_length); + ofs += value_length; + } + if (var != NULL) + var_set_missing_values (var, &mv); } } + +/* Case reader. 
*/ + +static void partial_record (struct sfm_reader *); + +static void read_error (struct casereader *, const struct sfm_reader *); + +static bool read_case_number (struct sfm_reader *, double *); +static int read_case_string (struct sfm_reader *, uint8_t *, size_t); +static int read_opcode (struct sfm_reader *); +static bool read_compressed_number (struct sfm_reader *, double *); +static int read_compressed_string (struct sfm_reader *, uint8_t *); +static int read_whole_strings (struct sfm_reader *, uint8_t *, size_t); +static bool skip_whole_strings (struct sfm_reader *, size_t); + +/* Reads and returns one case from READER's file. Returns a null + pointer if not successful. */ +static struct ccase * +sys_file_casereader_read (struct casereader *reader, void *r_) +{ + struct sfm_reader *r = r_; + struct ccase *c; + int retval; + int i; + + if (r->error || !r->sfm_var_cnt) + return NULL; + + c = case_create (r->proto); + + for (i = 0; i < r->sfm_var_cnt; i++) + { + struct sfm_var *sv = &r->sfm_vars[i]; + union value *v = case_data_rw_idx (c, sv->case_index); + + if (sv->var_width == 0) + retval = read_case_number (r, &v->f); + else + { + uint8_t *s = value_str_rw (v, sv->var_width); + retval = read_case_string (r, s + sv->offset, sv->segment_width); + if (retval == 1) + { + retval = skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)); + if (retval == 0) + sys_error (r, r->pos, _("File ends in partial string value.")); + } + } + + if (retval != 1) + goto eof; + } + return c; + +eof: + if (i != 0) + partial_record (r); + if (r->case_cnt != -1) + read_error (reader, r); + case_unref (c); + return NULL; +} /* Issues an error that R ends in a partial record. */ static void partial_record (struct sfm_reader *r) { - sys_error (r, _("File ends in partial case.")); + sys_error (r, r->pos, _("File ends in partial case.")); +} + +/* Issues an error that an unspecified error occurred SFM, and + marks R tainted. 
*/ +static void +read_error (struct casereader *r, const struct sfm_reader *sfm) +{ + msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh)); + casereader_force_error (r); } /* Reads a number from R and stores its value in *D. @@ -1216,65 +2742,73 @@ partial_record (struct sfm_reader *r) Returns true if successful, false if end of file is reached immediately. */ static bool -read_case_number (struct sfm_reader *r, double *d) +read_case_number (struct sfm_reader *r, double *d) { - if (!r->compressed) + if (r->compression == ANY_COMP_NONE) { - uint8_t flt64[8]; - if (!try_read_bytes (r, flt64, sizeof flt64)) + uint8_t number[8]; + if (!try_read_bytes (r, number, sizeof number)) return false; - *d = flt64_to_double (r, flt64); + float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d); return true; } else return read_compressed_number (r, d); } -/* Reads LENGTH string bytes from R into S. - Always reads a multiple of 8 bytes; if LENGTH is not a - multiple of 8, then extra bytes are read and discarded without - being written to S. - Reads compressed strings if S is compressed. - Returns true if successful, false if end of file is - reached immediately. */ -static bool -read_case_string (struct sfm_reader *r, char *s, size_t length) +/* Reads LENGTH string bytes from R into S. Always reads a multiple of 8 + bytes; if LENGTH is not a multiple of 8, then extra bytes are read and + discarded without being written to S. Reads compressed strings if S is + compressed. Returns 1 if successful, 0 if end of file is reached + immediately, or -1 for some kind of error. 
*/ +static int +read_case_string (struct sfm_reader *r, uint8_t *s, size_t length) { size_t whole = ROUND_DOWN (length, 8); size_t partial = length % 8; - - if (whole) + + if (whole) { - if (!read_whole_strings (r, s, whole)) - return false; + int retval = read_whole_strings (r, s, whole); + if (retval != 1) + return retval; } if (partial) { - char bounce[8]; - if (!read_whole_strings (r, bounce, sizeof bounce)) + uint8_t bounce[8]; + int retval = read_whole_strings (r, bounce, sizeof bounce); + if (retval == -1) + return -1; + else if (!retval) { if (whole) - partial_record (r); - return false; + { + partial_record (r); + return -1; + } + return 0; } memcpy (s + whole, bounce, partial); } - return true; + return 1; } /* Reads and returns the next compression opcode from R. */ static int -read_opcode (struct sfm_reader *r) +read_opcode (struct sfm_reader *r) { - assert (r->compressed); + assert (r->compression != ANY_COMP_NONE); for (;;) { int opcode; - if (r->opcode_idx >= sizeof r->opcodes) + if (r->opcode_idx >= sizeof r->opcodes) { - if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes)) + + int retval = try_read_compressed_bytes (r, r->opcodes, + sizeof r->opcodes); + if (retval != 1) return -1; r->opcode_idx = 0; } @@ -1291,7 +2825,7 @@ read_opcode (struct sfm_reader *r) static bool read_compressed_number (struct sfm_reader *r, double *d) { - int opcode = read_opcode (r); + int opcode = read_opcode (r); switch (opcode) { case -1: @@ -1299,11 +2833,18 @@ read_compressed_number (struct sfm_reader *r, double *d) return false; case 253: - *d = read_flt64 (r); - break; - + return read_compressed_float (r, d); + case 254: - sys_error (r, _("Compressed data is corrupt.")); + float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d); + if (!r->corruption_warning) + { + r->corruption_warning = true; + sys_warn (r, r->pos, + _("Possible compressed data corruption: " + "compressed spaces appear in numeric field.")); + } + break; case 255: *d = SYSMIS; @@ -1317,373 
+2858,935 @@ read_compressed_number (struct sfm_reader *r, double *d) return true; } -/* Reads a compressed 8-byte string segment from R and stores it - in DST. - Returns true if successful, false if end of file is - reached immediately. */ -static bool -read_compressed_string (struct sfm_reader *r, char *dst) +/* Reads a compressed 8-byte string segment from R and stores it in DST. */ +static int +read_compressed_string (struct sfm_reader *r, uint8_t *dst) { - switch (read_opcode (r)) + int opcode; + int retval; + + opcode = read_opcode (r); + switch (opcode) { case -1: case 252: - return false; + return 0; case 253: - read_bytes (r, dst, 8); - break; + retval = read_compressed_bytes (r, dst, 8); + return retval == 1 ? 1 : -1; case 254: memset (dst, ' ', 8); - break; + return 1; default: - sys_error (r, _("Compressed data is corrupt.")); + { + double value = opcode - r->bias; + float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst); + if (value == 0.0) + { + /* This has actually been seen "in the wild". The submitter of the + file that showed that the contents decoded as spaces, but they + were at the end of the field so it's possible that the null + bytes just acted as null terminators. */ + } + else if (!r->corruption_warning) + { + r->corruption_warning = true; + sys_warn (r, r->pos, + _("Possible compressed data corruption: " + "string contains compressed integer (opcode %d)."), + opcode); + } + } + return 1; } - - return true; } -/* Reads LENGTH string bytes from R into S. - LENGTH must be a multiple of 8. - Reads compressed strings if S is compressed. - Returns true if successful, false if end of file is - reached immediately. */ -static bool -read_whole_strings (struct sfm_reader *r, char *s, size_t length) +/* Reads LENGTH string bytes from R into S. LENGTH must be a multiple of 8. + Reads compressed strings if S is compressed. Returns 1 if successful, 0 if + end of file is reached immediately, or -1 for some kind of error. 
*/ +static int +read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length) { assert (length % 8 == 0); - if (!r->compressed) + if (r->compression == ANY_COMP_NONE) return try_read_bytes (r, s, length); else { size_t ofs; + for (ofs = 0; ofs < length; ofs += 8) - if (!read_compressed_string (r, s + ofs)) - { - if (ofs != 0) - partial_record (r); - return false; + { + int retval = read_compressed_string (r, s + ofs); + if (retval != 1) + { + if (ofs != 0) + { + partial_record (r); + return -1; + } + return retval; + } } - return true; + return 1; } } + +/* Skips LENGTH string bytes from R. + LENGTH must be a multiple of 8. + (LENGTH is also limited to 1024, but that's only because the + current caller never needs more than that many bytes.) + Returns true if successful, false if end of file is + reached immediately. */ +static bool +skip_whole_strings (struct sfm_reader *r, size_t length) +{ + uint8_t buffer[1024]; + assert (length < sizeof buffer); + return read_whole_strings (r, buffer, length); +} -/* Creates and returns a table that can be used for translating a value - index into a case to a "struct variable *" for DICT. Multiple - system file fields reference variables this way. - - This table must be created before processing the very long - string extension record, because that record causes some - values to be deleted from the case and the dictionary to be - compacted. */ -static struct variable ** -make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict) -{ - struct variable **var_by_value_idx; - int value_idx = 0; - int i; +/* Helpers for reading records that contain structured text + strings. */ - var_by_value_idx = pool_nmalloc (r->pool, - r->value_cnt, sizeof *var_by_value_idx); - for (i = 0; i < dict_get_var_cnt (dict); i++) +/* Maximum number of warnings to issue for a single text + record. */ +#define MAX_TEXT_WARNINGS 5 + +/* State. */ +struct text_record + { + struct substring buffer; /* Record contents. 
*/ + off_t start; /* Starting offset in file. */ + size_t pos; /* Current position in buffer. */ + int n_warnings; /* Number of warnings issued or suppressed. */ + bool recoded; /* Recoded into UTF-8? */ + }; + +static struct text_record * +open_text_record (struct sfm_reader *r, + const struct sfm_extension_record *record, + bool recode_to_utf8) +{ + struct text_record *text; + struct substring raw; + + text = pool_alloc (r->pool, sizeof *text); + raw = ss_buffer (record->data, record->size * record->count); + text->start = record->pos; + text->buffer = (recode_to_utf8 + ? recode_substring_pool ("UTF-8", r->encoding, raw, r->pool) + : raw); + text->pos = 0; + text->n_warnings = 0; + text->recoded = recode_to_utf8; + + return text; +} + +/* Closes TEXT, frees its storage, and issues a final warning + about suppressed warnings if necessary. */ +static void +close_text_record (struct sfm_reader *r, struct text_record *text) +{ + if (text->n_warnings > MAX_TEXT_WARNINGS) + sys_warn (r, -1, _("Suppressed %d additional related warnings."), + text->n_warnings - MAX_TEXT_WARNINGS); + if (text->recoded) + pool_free (r->pool, ss_data (text->buffer)); +} + +/* Reads a variable=value pair from TEXT. + Looks up the variable in DICT and stores it into *VAR. + Stores a null-terminated value into *VALUE. */ +static bool +read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict, + struct text_record *text, + struct variable **var, char **value) +{ + for (;;) { - struct variable *v = dict_get_var (dict, i); - int nv = var_is_numeric (v) ? 
1 : DIV_RND_UP (var_get_width (v), 8); - int j; + if (!text_read_short_name (r, dict, text, ss_cstr ("="), var)) + return false; - var_by_value_idx[value_idx++] = v; - for (j = 1; j < nv; j++) - var_by_value_idx[value_idx++] = NULL; + *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL); + if (*value == NULL) + return false; + + text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX), + ss_buffer ("\t\0", 2)); + + if (*var != NULL) + return true; } - assert (value_idx == r->value_cnt); +} + +static bool +text_read_variable_name (struct sfm_reader *r, struct dictionary *dict, + struct text_record *text, struct substring delimiters, + struct variable **var) +{ + char *name; - return var_by_value_idx; + name = text_get_token (text, delimiters, NULL); + if (name == NULL) + return false; + + *var = dict_lookup_var (dict, name); + if (*var != NULL) + return true; + + text_warn (r, text, _("Dictionary record refers to unknown variable %s."), + name); + return false; } -/* Returns the "struct variable" corresponding to the given - 1-basd VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index - is valid. 
*/ -static struct variable * -lookup_var_by_value_idx (struct sfm_reader *r, - struct variable **var_by_value_idx, int value_idx) + +static bool +text_read_short_name (struct sfm_reader *r, struct dictionary *dict, + struct text_record *text, struct substring delimiters, + struct variable **var) { - struct variable *var; - - if (value_idx < 1 || value_idx > r->value_cnt) - sys_error (r, _("Variable index %d not in valid range 1...%d."), - value_idx, r->value_cnt); + char *short_name = text_get_token (text, delimiters, NULL); + if (short_name == NULL) + return false; + + *var = dict_lookup_var (dict, short_name); + if (*var == NULL) + text_warn (r, text, _("Dictionary record refers to unknown variable %s."), + short_name); + return true; +} - var = var_by_value_idx[value_idx - 1]; - if (var == NULL) - sys_error (r, _("Variable index %d refers to long string " - "continuation."), - value_idx); +/* Displays a warning for the current file position, limiting the + number to MAX_TEXT_WARNINGS for TEXT. */ +static void +text_warn (struct sfm_reader *r, struct text_record *text, + const char *format, ...) +{ + if (text->n_warnings++ < MAX_TEXT_WARNINGS) + { + va_list args; - return var; + va_start (args, format); + sys_msg (r, text->start + text->pos, MW, format, args); + va_end (args); + } } -/* Returns the variable in D with the given SHORT_NAME, - or a null pointer if there is none. 
*/ -static struct variable * -lookup_var_by_short_name (struct dictionary *d, const char *short_name) +static char * +text_get_token (struct text_record *text, struct substring delimiters, + char *delimiter) { - struct variable *var; - size_t var_cnt; - size_t i; + struct substring token; + char *end; + + if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token)) + return NULL; + + end = &ss_data (token)[ss_length (token)]; + if (delimiter != NULL) + *delimiter = *end; + *end = '\0'; + return ss_data (token); +} + +/* Reads a integer value expressed in decimal, then a space, then a string that + consists of exactly as many bytes as specified by the integer, then a space, + from TEXT. Returns the string, null-terminated, as a subset of TEXT's + buffer (so the caller should not free the string). */ +static const char * +text_parse_counted_string (struct sfm_reader *r, struct text_record *text) +{ + size_t start; + size_t n; + char *s; - /* First try looking up by full name. This often succeeds. */ - var = dict_lookup_var (d, short_name); - if (var != NULL && !strcasecmp (var_get_short_name (var), short_name)) - return var; + start = text->pos; + n = 0; + while (text->pos < text->buffer.length) + { + int c = text->buffer.string[text->pos]; + if (c < '0' || c > '9') + break; + n = (n * 10) + (c - '0'); + text->pos++; + } + if (text->pos >= text->buffer.length || start == text->pos) + { + sys_warn (r, text->start, + _("Expecting digit at offset %zu in MRSETS record."), + text->pos); + return NULL; + } + + if (!text_match (text, ' ')) + { + sys_warn (r, text->start, + _("Expecting space at offset %zu in MRSETS record."), + text->pos); + return NULL; + } + + if (text->pos + n > text->buffer.length) + { + sys_warn (r, text->start, + _("%zu-byte string starting at offset %zu " + "exceeds record length %zu."), + n, text->pos, text->buffer.length); + return NULL; + } - /* Iterate through the whole dictionary as a fallback. 
*/ - var_cnt = dict_get_var_cnt (d); - for (i = 0; i < var_cnt; i++) + s = &text->buffer.string[text->pos]; + if (s[n] != ' ') { - var = dict_get_var (d, i); - if (!strcasecmp (var_get_short_name (var), short_name)) - return var; + sys_warn (r, text->start, + _("Expecting space at offset %zu following %zu-byte string."), + text->pos + n, n); + return NULL; } - - return NULL; + s[n] = '\0'; + text->pos += n + 1; + return s; } - -/* Helpers for reading records that contain "variable=value" - pairs. */ - -/* State. */ -struct variable_to_value_map - { - struct substring buffer; /* Record contents. */ - size_t pos; /* Current position in buffer. */ - }; -/* Reads SIZE bytes into a "variable=value" map for R, - and returns the map. */ -static struct variable_to_value_map * -open_variable_to_value_map (struct sfm_reader *r, size_t size) +static bool +text_match (struct text_record *text, char c) { - struct variable_to_value_map *map = pool_alloc (r->pool, sizeof *map); - char *buffer = pool_malloc (r->pool, size + 1); - read_bytes (r, buffer, size); - map->buffer = ss_buffer (buffer, size); - map->pos = 0; - return map; + if (text->pos >= text->buffer.length) + return false; + + if (text->buffer.string[text->pos] == c) + { + text->pos++; + return true; + } + else + return false; } -/* Closes MAP and frees its storage. - Not really needed, because the pool will free the map anyway, - but can be used to free it earlier. */ -static void -close_variable_to_value_map (struct sfm_reader *r, - struct variable_to_value_map *map) +/* Returns the current byte offset (as converted to UTF-8, if it was converted) + inside the TEXT's string. */ +static size_t +text_pos (const struct text_record *text) { - pool_free (r->pool, ss_data (map->buffer)); + return text->pos; } -/* Reads the next variable=value pair from MAP. - Looks up the variable in DICT and stores it into *VAR. - Stores a null-terminated value into *VALUE. 
*/ -static bool -read_variable_to_value_map (struct sfm_reader *r, struct dictionary *dict, - struct variable_to_value_map *map, - struct variable **var, char **value, - int *warning_cnt) +static const char * +text_get_all (const struct text_record *text) { - int max_warnings = 5; - - for (;;) - { - struct substring short_name_ss, value_ss; - - if (!ss_tokenize (map->buffer, ss_cstr ("="), &map->pos, &short_name_ss) - || !ss_tokenize (map->buffer, ss_buffer ("\t\0", 2), &map->pos, - &value_ss)) - { - if (*warning_cnt > max_warnings) - sys_warn (r, _("Suppressed %d additional variable map warnings."), - *warning_cnt - max_warnings); - return false; - } - - map->pos += ss_span (ss_substr (map->buffer, map->pos, SIZE_MAX), - ss_buffer ("\t\0", 2)); - - ss_data (short_name_ss)[ss_length (short_name_ss)] = '\0'; - *var = lookup_var_by_short_name (dict, ss_data (short_name_ss)); - if (*var == NULL) - { - if (++*warning_cnt <= 5) - sys_warn (r, _("Variable map refers to unknown variable %s."), - ss_data (short_name_ss)); - continue; - } - - ss_data (value_ss)[ss_length (value_ss)] = '\0'; - *value = ss_data (value_ss); - - return true; - } + return text->buffer.string; } /* Messages. */ /* Displays a corruption message. 
*/ static void -sys_msg (struct sfm_reader *r, int class, const char *format, va_list args) +sys_msg (struct sfm_reader *r, off_t offset, + int class, const char *format, va_list args) { struct msg m; struct string text; ds_init_empty (&text); - ds_put_format (&text, "\"%s\" near offset 0x%lx: ", - fh_get_file_name (r->fh), (unsigned long) ftell (r->file)); + if (offset >= 0) + ds_put_format (&text, _("`%s' near offset 0x%llx: "), + fh_get_file_name (r->fh), (long long int) offset); + else + ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh)); ds_put_vformat (&text, format, args); m.category = msg_class_to_category (class); m.severity = msg_class_to_severity (class); - m.where.file_name = NULL; - m.where.line_number = 0; + m.file_name = NULL; + m.first_line = 0; + m.last_line = 0; + m.first_column = 0; + m.last_column = 0; m.text = ds_cstr (&text); msg_emit (&m); } -/* Displays a warning for the current file position. */ +/* Displays a warning for offset OFFSET in the file. */ static void -sys_warn (struct sfm_reader *r, const char *format, ...) +sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...) { va_list args; - + va_start (args, format); - sys_msg (r, MW, format, args); + sys_msg (r, offset, MW, format, args); va_end (args); } -/* Displays an error for the current file position, - marks it as in an error state, - and aborts reading it using longjmp. */ +/* Displays an error for the current file position and marks it as in an error + state. */ static void -sys_error (struct sfm_reader *r, const char *format, ...) +sys_error (struct sfm_reader *r, off_t offset, const char *format, ...) { va_list args; - + va_start (args, format); - sys_msg (r, ME, format, args); + sys_msg (r, offset, ME, format, args); va_end (args); r->error = true; - longjmp (r->bail_out, 1); } /* Reads BYTE_CNT bytes into BUF. - Returns true if exactly BYTE_CNT bytes are successfully read. - Aborts if an I/O error or a partial read occurs. 
- If EOF_IS_OK, then an immediate end-of-file causes false to be - returned; otherwise, immediate end-of-file causes an abort - too. */ -static inline bool + Returns 1 if exactly BYTE_CNT bytes are successfully read. + Returns -1 if an I/O error or a partial read occurs. + Returns 0 for an immediate end-of-file and, if EOF_IS_OK is false, reports + an error. */ +static inline int read_bytes_internal (struct sfm_reader *r, bool eof_is_ok, - void *buf, size_t byte_cnt) + void *buf, size_t byte_cnt) { size_t bytes_read = fread (buf, 1, byte_cnt, r->file); + r->pos += bytes_read; if (bytes_read == byte_cnt) - return true; + return 1; else if (ferror (r->file)) - sys_error (r, _("System error: %s."), strerror (errno)); + { + sys_error (r, r->pos, _("System error: %s."), strerror (errno)); + return -1; + } else if (!eof_is_ok || bytes_read != 0) - sys_error (r, _("Unexpected end of file.")); + { + sys_error (r, r->pos, _("Unexpected end of file.")); + return -1; + } else - return false; + return 0; } /* Reads BYTE_CNT into BUF. - Aborts upon I/O error or if end-of-file is encountered. */ -static void + Returns true if successful. + Returns false upon I/O error or if end-of-file is encountered. */ +static bool read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) { - read_bytes_internal (r, false, buf, byte_cnt); + return read_bytes_internal (r, false, buf, byte_cnt) == 1; } /* Reads BYTE_CNT bytes into BUF. - Returns true if exactly BYTE_CNT bytes are successfully read. - Returns false if an immediate end-of-file is encountered. - Aborts if an I/O error or a partial read occurs. */ -static bool + Returns 1 if exactly BYTE_CNT bytes are successfully read. + Returns 0 if an immediate end-of-file is encountered. + Returns -1 if an I/O error or a partial read occurs. 
*/ +static int try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) { return read_bytes_internal (r, true, buf, byte_cnt); } -/* Reads a 32-bit signed integer from R and returns its value in +/* Reads a 32-bit signed integer from R and stores its value in host format in + *X. Returns true if successful, otherwise false. */ +static bool +read_int (struct sfm_reader *r, int *x) +{ + uint8_t integer[4]; + if (read_bytes (r, integer, sizeof integer) != 1) + return false; + *x = integer_get (r->integer_format, integer, sizeof integer); + return true; +} + +static bool +read_uint (struct sfm_reader *r, unsigned int *x) +{ + bool ok; + int y; + + ok = read_int (r, &y); + *x = y; + return ok; +} + +/* Reads a 64-bit signed integer from R and returns its value in + host format. */ +static bool +read_int64 (struct sfm_reader *r, long long int *x) +{ + uint8_t integer[8]; + if (read_bytes (r, integer, sizeof integer) != 1) + return false; + *x = integer_get (r->integer_format, integer, sizeof integer); + return true; +} + +/* Reads a 64-bit signed integer from R and returns its value in host format. */ -static int32_t -read_int32 (struct sfm_reader *r) +static bool +read_uint64 (struct sfm_reader *r, unsigned long long int *x) { - uint8_t int32[4]; - read_bytes (r, int32, sizeof int32); - return int32_to_native (r, int32); + long long int y; + bool ok; + + ok = read_int64 (r, &y); + *x = y; + return ok; +} + +static int +parse_int (const struct sfm_reader *r, const void *data, size_t ofs) +{ + return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4); } -/* Reads a 64-bit floating-point number from R and returns its - value in host format. 
*/ static double -read_flt64 (struct sfm_reader *r) +parse_float (const struct sfm_reader *r, const void *data, size_t ofs) { - uint8_t flt64[8]; - read_bytes (r, flt64, sizeof flt64); - return flt64_to_double (r, flt64); + return float_get_double (r->float_format, (const uint8_t *) data + ofs); } /* Reads exactly SIZE - 1 bytes into BUFFER and stores a null byte into BUFFER[SIZE - 1]. */ -static void -read_string (struct sfm_reader *r, char *buffer, size_t size) +static bool +read_string (struct sfm_reader *r, char *buffer, size_t size) { + bool ok; + assert (size > 0); - read_bytes (r, buffer, size - 1); - buffer[size - 1] = '\0'; + ok = read_bytes (r, buffer, size - 1); + if (ok) + buffer[size - 1] = '\0'; + return ok; } /* Skips BYTES bytes forward in R. */ -static void +static bool skip_bytes (struct sfm_reader *r, size_t bytes) { - while (bytes > 0) + while (bytes > 0) { char buffer[1024]; size_t chunk = MIN (sizeof buffer, bytes); - read_bytes (r, buffer, chunk); + if (!read_bytes (r, buffer, chunk)) + return false; bytes -= chunk; } + + return true; +} + +/* Returns a malloc()'d copy of S in which all lone CRs and CR LF pairs have + been replaced by LFs. + + (A product that identifies itself as VOXCO INTERVIEWER 4.3 produces system + files that use CR-only line ends in the file label and extra product + info.) */ +static char * +fix_line_ends (const char *s) +{ + char *dst, *d; + + d = dst = xmalloc (strlen (s) + 1); + while (*s != '\0') + { + if (*s == '\r') + { + s++; + if (*s == '\n') + s++; + *d++ = '\n'; + } + else + *d++ = *s++; + } + *d = '\0'; + + return dst; } -/* Returns the value of the 32-bit signed integer at INT32, - converted from the format used by R to the host format. 
*/ -static int32_t -int32_to_native (const struct sfm_reader *r, const uint8_t int32[4]) -{ - int32_t x; - if (r->integer_format == INTEGER_NATIVE) - memcpy (&x, int32, sizeof x); +static bool +read_ztrailer (struct sfm_reader *r, + long long int zheader_ofs, + long long int ztrailer_len); + +static void * +zalloc (voidpf pool_, uInt items, uInt size) +{ + struct pool *pool = pool_; + + return (!size || xalloc_oversized (items, size) + ? Z_NULL + : pool_malloc (pool, items * size)); +} + +static void +zfree (voidpf pool_, voidpf address) +{ + struct pool *pool = pool_; + + pool_free (pool, address); +} + +static bool +read_zheader (struct sfm_reader *r) +{ + off_t pos = r->pos; + long long int zheader_ofs; + long long int ztrailer_ofs; + long long int ztrailer_len; + + if (!read_int64 (r, &zheader_ofs) + || !read_int64 (r, &ztrailer_ofs) + || !read_int64 (r, &ztrailer_len)) + return false; + + if (zheader_ofs != pos) + { + sys_error (r, pos, _("Wrong ZLIB data header offset %#llx " + "(expected %#llx)."), + zheader_ofs, (long long int) pos); + return false; + } + + if (ztrailer_ofs < r->pos) + { + sys_error (r, pos, _("Impossible ZLIB trailer offset 0x%llx."), + ztrailer_ofs); + return false; + } + + if (ztrailer_len < 24 || ztrailer_len % 24) + { + sys_error (r, pos, _("Invalid ZLIB trailer length %lld."), ztrailer_len); + return false; + } + + r->ztrailer_ofs = ztrailer_ofs; + if (!read_ztrailer (r, zheader_ofs, ztrailer_len)) + return false; + + if (r->zin_buf == NULL) + { + r->zin_buf = pool_malloc (r->pool, ZIN_BUF_SIZE); + r->zout_buf = pool_malloc (r->pool, ZOUT_BUF_SIZE); + r->zstream.next_in = NULL; + r->zstream.avail_in = 0; + } + + r->zstream.zalloc = zalloc; + r->zstream.zfree = zfree; + r->zstream.opaque = r->pool; + + return open_zstream (r); +} + +static void +seek (struct sfm_reader *r, off_t offset) +{ + if (fseeko (r->file, offset, SEEK_SET)) + sys_error (r, 0, _("%s: seek failed (%s)."), + fh_get_file_name (r->fh), strerror (errno)); + r->pos = 
offset; +} + +/* Performs some additional consistency checks on the ZLIB compressed data + trailer. */ +static bool +read_ztrailer (struct sfm_reader *r, + long long int zheader_ofs, + long long int ztrailer_len) +{ + long long int expected_uncmp_ofs; + long long int expected_cmp_ofs; + long long int bias; + long long int zero; + unsigned int block_size; + unsigned int n_blocks; + unsigned int i; + struct stat s; + + if (fstat (fileno (r->file), &s)) + { + sys_error (ME, 0, _("%s: stat failed (%s)."), + fh_get_file_name (r->fh), strerror (errno)); + return false; + } + + if (!S_ISREG (s.st_mode)) + { + /* We can't seek to the trailer and then back to the data in this file, + so skip doing extra checks. */ + return true; + } + + if (r->ztrailer_ofs + ztrailer_len != s.st_size) + sys_warn (r, r->pos, + _("End of ZLIB trailer (0x%llx) is not file size (0x%llx)."), + r->ztrailer_ofs + ztrailer_len, (long long int) s.st_size); + + seek (r, r->ztrailer_ofs); + + /* Read fixed header from ZLIB data trailer. 
*/ + if (!read_int64 (r, &bias)) + return false; + if (-bias != r->bias) + { + sys_error (r, r->pos, _("ZLIB trailer bias (%lld) differs from " + "file header bias (%.2f)."), + -bias, r->bias); + return false; + } + + if (!read_int64 (r, &zero)) + return false; + if (zero != 0) + sys_warn (r, r->pos, + _("ZLIB trailer \"zero\" field has nonzero value %lld."), zero); + + if (!read_uint (r, &block_size)) + return false; + if (block_size != ZBLOCK_SIZE) + sys_warn (r, r->pos, + _("ZLIB trailer specifies unexpected %u-byte block size."), + block_size); + + if (!read_uint (r, &n_blocks)) + return false; + if (n_blocks != (ztrailer_len - 24) / 24) + { + sys_error (r, r->pos, + _("%lld-byte ZLIB trailer specifies %u data blocks (expected " + "%lld)."), + ztrailer_len, n_blocks, (ztrailer_len - 24) / 24); + return false; + } + + expected_uncmp_ofs = zheader_ofs; + expected_cmp_ofs = zheader_ofs + 24; + for (i = 0; i < n_blocks; i++) + { + off_t desc_ofs = r->pos; + unsigned long long int uncompressed_ofs; + unsigned long long int compressed_ofs; + unsigned int uncompressed_size; + unsigned int compressed_size; + + if (!read_uint64 (r, &uncompressed_ofs) + || !read_uint64 (r, &compressed_ofs) + || !read_uint (r, &uncompressed_size) + || !read_uint (r, &compressed_size)) + return false; + + if (uncompressed_ofs != expected_uncmp_ofs) + { + sys_error (r, desc_ofs, + _("ZLIB block descriptor %u reported uncompressed data " + "offset %#llx, when %#llx was expected."), + i, uncompressed_ofs, expected_uncmp_ofs); + return false; + } + + if (compressed_ofs != expected_cmp_ofs) + { + sys_error (r, desc_ofs, + _("ZLIB block descriptor %u reported compressed data " + "offset %#llx, when %#llx was expected."), + i, compressed_ofs, expected_cmp_ofs); + return false; + } + + if (i < n_blocks - 1) + { + if (uncompressed_size != block_size) + sys_warn (r, desc_ofs, + _("ZLIB block descriptor %u reported block size %#x, " + "when %#x was expected."), + i, uncompressed_size, block_size); + 
} + else + { + if (uncompressed_size > block_size) + sys_warn (r, desc_ofs, + _("ZLIB block descriptor %u reported block size %#x, " + "when at most %#x was expected."), + i, uncompressed_size, block_size); + } + + /* http://www.zlib.net/zlib_tech.html says that the maximum expansion + from compression, with worst-case parameters, is 13.5% plus 11 bytes. + This code checks for an expansion of more than 14.3% plus 11 + bytes. */ + if (compressed_size > uncompressed_size + uncompressed_size / 7 + 11) + { + sys_error (r, desc_ofs, + _("ZLIB block descriptor %u reports compressed size %u " + "and uncompressed size %u."), + i, compressed_size, uncompressed_size); + return false; + } + + expected_uncmp_ofs += uncompressed_size; + expected_cmp_ofs += compressed_size; + } + + if (expected_cmp_ofs != r->ztrailer_ofs) + { + sys_error (r, r->pos, _("ZLIB trailer is at offset %#llx but %#llx " + "would be expected from block descriptors."), + r->ztrailer_ofs, expected_cmp_ofs); + return false; + } + + seek (r, zheader_ofs + 24); + return true; +} + +static bool +open_zstream (struct sfm_reader *r) +{ + int error; + + r->zout_pos = r->zout_end = 0; + error = inflateInit (&r->zstream); + if (error != Z_OK) + { + sys_error (r, r->pos, _("ZLIB initialization failed (%s)."), + r->zstream.msg); + return false; + } + return true; +} + +static bool +close_zstream (struct sfm_reader *r) +{ + int error; + + error = inflateEnd (&r->zstream); + if (error != Z_OK) + { + sys_error (r, r->pos, _("Inconsistency at end of ZLIB stream (%s)."), + r->zstream.msg); + return false; + } + return true; +} + +static int +read_bytes_zlib (struct sfm_reader *r, void *buf_, size_t byte_cnt) +{ + uint8_t *buf = buf_; + + if (byte_cnt == 0) + return 1; + + for (;;) + { + int error; + + /* Use already inflated data if there is any. 
*/ + if (r->zout_pos < r->zout_end) + { + unsigned int n = MIN (byte_cnt, r->zout_end - r->zout_pos); + memcpy (buf, &r->zout_buf[r->zout_pos], n); + r->zout_pos += n; + byte_cnt -= n; + buf += n; + + if (byte_cnt == 0) + return 1; + } + + /* We need to inflate some more data. + Get some more input data if we don't have any. */ + if (r->zstream.avail_in == 0) + { + unsigned int n = MIN (ZIN_BUF_SIZE, r->ztrailer_ofs - r->pos); + if (n == 0) + return 0; + else + { + int retval = try_read_bytes (r, r->zin_buf, n); + if (retval != 1) + return retval; + r->zstream.avail_in = n; + r->zstream.next_in = r->zin_buf; + } + } + + /* Inflate the (remaining) input data. */ + r->zstream.avail_out = ZOUT_BUF_SIZE; + r->zstream.next_out = r->zout_buf; + error = inflate (&r->zstream, Z_SYNC_FLUSH); + r->zout_pos = 0; + r->zout_end = r->zstream.next_out - r->zout_buf; + if (r->zout_end == 0) + { + if (error != Z_STREAM_END) + { + sys_error (r, r->pos, _("ZLIB stream inconsistency (%s)."), + r->zstream.msg); + return -1; + } + else if (!close_zstream (r) || !open_zstream (r)) + return -1; + } + else + { + /* Process the output data and ignore 'error' for now. ZLIB will + present it to us again on the next inflate() call. */ + } + } +} + +static int +read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) +{ + if (r->compression == ANY_COMP_SIMPLE) + return read_bytes (r, buf, byte_cnt); else - x = integer_get (r->integer_format, int32, sizeof x); - return x; + { + int retval = read_bytes_zlib (r, buf, byte_cnt); + if (retval == 0) + sys_error (r, r->pos, _("Unexpected end of ZLIB compressed data.")); + return retval; + } } -/* Returns the value of the 64-bit floating point number at - FLT64, converted from the format used by R to the host - format. 
*/ -static double -flt64_to_double (const struct sfm_reader *r, const uint8_t flt64[8]) +static int +try_read_compressed_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) { - double x; - if (r->float_format == FLOAT_NATIVE_DOUBLE) - memcpy (&x, flt64, sizeof x); + if (r->compression == ANY_COMP_SIMPLE) + return try_read_bytes (r, buf, byte_cnt); else - float_convert (r->float_format, flt64, FLOAT_NATIVE_DOUBLE, &x); - return x; + return read_bytes_zlib (r, buf, byte_cnt); +} + +/* Reads a 64-bit floating-point number from R and returns its + value in host format. */ +static bool +read_compressed_float (struct sfm_reader *r, double *d) +{ + uint8_t number[8]; + + if (!read_compressed_bytes (r, number, sizeof number)) + return false; + + *d = float_get_double (r->float_format, number); + return true; } + +static const struct casereader_class sys_file_casereader_class = + { + sys_file_casereader_read, + sys_file_casereader_destroy, + NULL, + NULL, + }; +const struct any_reader_class sys_file_reader_class = + { + N_("SPSS System File"), + sfm_detect, + sfm_open, + sfm_close, + sfm_decode, + sfm_get_strings, + };