X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fdata%2Fsys-file-reader.c;h=ceb4e04fb0484950bba4da5d0bb6df1ba7f0eabf;hb=d8fdf0b4fa919e48397b438e9453d6b82215ff51;hp=072fca3b0964be65209abcd44cf6acc89cc8d809;hpb=97d4f38945476834fd7fce612b663f19f2b291f8;p=pspp-builds.git diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index 072fca3b..ceb4e04f 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -1,1781 +1,2607 @@ -/* PSPP - computes sample statistics. - Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc. - Written by Ben Pfaff . +/* PSPP - a program for statistical analysis. + Copyright (C) 1997-2000, 2006-2007, 2009-2011 Free Software Foundation, Inc. - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - 02110-1301, USA. */ + along with this program. If not, see . 
*/ #include -#include +#include "data/sys-file-reader.h" +#include "data/sys-file-private.h" + #include #include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sys-file-reader.h" -#include "sfm-private.h" -#include "case.h" -#include "dictionary.h" -#include "file-handle-def.h" -#include "file-name.h" -#include "format.h" -#include "value-labels.h" -#include "variable.h" -#include "value.h" +#include +#include +#include + +#include "data/attributes.h" +#include "data/case.h" +#include "data/casereader-provider.h" +#include "data/casereader.h" +#include "data/dictionary.h" +#include "data/file-handle-def.h" +#include "data/file-name.h" +#include "data/format.h" +#include "data/missing-values.h" +#include "data/mrset.h" +#include "data/short-names.h" +#include "data/value-labels.h" +#include "data/value.h" +#include "data/variable.h" +#include "libpspp/array.h" +#include "libpspp/assertion.h" +#include "libpspp/compiler.h" +#include "libpspp/i18n.h" +#include "libpspp/message.h" +#include "libpspp/misc.h" +#include "libpspp/pool.h" +#include "libpspp/str.h" +#include "libpspp/stringi-set.h" + +#include "gl/c-ctype.h" +#include "gl/inttostr.h" +#include "gl/localcharset.h" +#include "gl/minmax.h" +#include "gl/unlocked-io.h" +#include "gl/xalloc.h" +#include "gl/xsize.h" #include "gettext.h" #define _(msgid) gettext (msgid) +#define N_(msgid) (msgid) -/* System file reader. */ -struct sfm_reader -{ - struct file_handle *fh; /* File handle. */ - FILE *file; /* File stream. */ - - int reverse_endian; /* 1=file has endianness opposite us. */ - int value_cnt; /* Number of `union values's per case. */ - long case_cnt; /* Number of cases, -1 if unknown. */ - int compressed; /* 1=compressed, 0=not compressed. */ - double bias; /* Compression bias, usually 100.0. */ - int weight_idx; /* 0-based index of weighting variable, or -1. */ - bool ok; /* False after an I/O error or corrupt data. 
*/ - bool has_vls; /* True if the file has one or more Very Long Strings*/ - - /* Variables. */ - struct sfm_var *vars; - size_t var_cnt; - - /* File's special constants. */ - flt64 sysmis; - flt64 highest; - flt64 lowest; - - /* Decompression buffer. */ - flt64 *buf; /* Buffer data. */ - flt64 *ptr; /* Current location in buffer. */ - flt64 *end; /* End of buffer data. */ - - /* Compression instruction octet. */ - unsigned char x[8]; /* Current instruction octet. */ - unsigned char *y; /* Location in current instruction octet. */ -}; - -/* A variable in a system file. */ -struct sfm_var -{ - int width; /* 0=numeric, otherwise string width. */ - int fv; /* Index into case. */ -}; - -/* Utilities. */ +enum + { + /* subtypes 0-2 unknown */ + EXT_INTEGER = 3, /* Machine integer info. */ + EXT_FLOAT = 4, /* Machine floating-point info. */ + EXT_VAR_SETS = 5, /* Variable sets. */ + EXT_DATE = 6, /* DATE. */ + EXT_MRSETS = 7, /* Multiple response sets. */ + EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */ + /* subtypes 9-10 unknown */ + EXT_DISPLAY = 11, /* Variable display parameters. */ + /* subtype 12 unknown */ + EXT_LONG_NAMES = 13, /* Long variable names. */ + EXT_LONG_STRINGS = 14, /* Long strings. */ + /* subtype 15 unknown */ + EXT_NCASES = 16, /* Extended number of cases. */ + EXT_FILE_ATTRS = 17, /* Data file attributes. */ + EXT_VAR_ATTRS = 18, /* Variable attributes. */ + EXT_MRSETS2 = 19, /* Multiple response sets (extended). */ + EXT_ENCODING = 20, /* Character encoding. */ + EXT_LONG_LABELS = 21 /* Value labels for long strings. */ + }; -/* Swap bytes *A and *B. */ -static inline void -bswap (char *a, char *b) -{ - char t = *a; - *a = *b; - *b = t; -} +struct sfm_var_record + { + off_t pos; + int width; + char name[8]; + int print_format; + int write_format; + int missing_value_code; + uint8_t missing[24]; + char *label; + struct variable *var; + }; -/* Reverse the byte order of 32-bit integer *X. 
*/ -static inline void -bswap_int32 (int32_t *x_) -{ - char *x = (char *) x_; - bswap (x + 0, x + 3); - bswap (x + 1, x + 2); -} +struct sfm_value_label + { + uint8_t value[8]; + char *label; + }; -/* Reverse the byte order of 64-bit floating point *X. */ -static inline void -bswap_flt64 (flt64 *x_) -{ - char *x = (char *) x_; - bswap (x + 0, x + 7); - bswap (x + 1, x + 6); - bswap (x + 2, x + 5); - bswap (x + 3, x + 4); -} +struct sfm_value_label_record + { + off_t pos; + struct sfm_value_label *labels; + size_t n_labels; -static void -corrupt_msg (int class, const char *format,...) - PRINTF_FORMAT (2, 3); + int *vars; + size_t n_vars; + }; - /* Displays a corrupt sysfile error. */ - static void - corrupt_msg (int class, const char *format,...) -{ - struct msg m; - va_list args; - struct string text; +struct sfm_document_record + { + off_t pos; + char *documents; + size_t n_lines; + }; - ds_init_cstr (&text, _("corrupt system file: ")); - va_start (args, format); - ds_put_vformat (&text, format, args); - va_end (args); +struct sfm_extension_record + { + off_t pos; /* Starting offset in file. */ + size_t size; /* Size of data elements. */ + size_t count; /* Number of data elements. */ + void *data; /* Contents. */ + }; - m.category = msg_class_to_category (class); - m.severity = msg_class_to_severity (class); - m.where.file_name = NULL; - m.where.line_number = 0; - m.text = ds_cstr (&text); +/* System file reader. */ +struct sfm_reader + { + /* Resource tracking. */ + struct pool *pool; /* All system file state. */ + jmp_buf bail_out; /* longjmp() target for error handling. */ + + /* File state. */ + struct file_handle *fh; /* File handle. */ + struct fh_lock *lock; /* Mutual exclusion for file handle. */ + FILE *file; /* File stream. */ + off_t pos; /* Position in file. */ + bool error; /* I/O or corruption error? */ + struct caseproto *proto; /* Format of output cases. */ + + /* File format. */ + enum integer_format integer_format; /* On-disk integer format. 
*/ + enum float_format float_format; /* On-disk floating point format. */ + struct sfm_var *sfm_vars; /* Variables. */ + size_t sfm_var_cnt; /* Number of variables. */ + casenumber case_cnt; /* Number of cases */ + const char *encoding; /* String encoding. */ + + /* Decompression. */ + bool compressed; /* File is compressed? */ + double bias; /* Compression bias, usually 100.0. */ + uint8_t opcodes[8]; /* Current block of opcodes. */ + size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */ + bool corruption_warning; /* Warned about possible corruption? */ + }; - msg_emit (&m); -} +static const struct casereader_class sys_file_casereader_class; + +static bool close_reader (struct sfm_reader *); + +static struct variable *lookup_var_by_index (struct sfm_reader *, off_t, + const struct sfm_var_record *, + size_t n, int idx); + +static void sys_msg (struct sfm_reader *r, off_t, int class, + const char *format, va_list args) + PRINTF_FORMAT (4, 0); +static void sys_warn (struct sfm_reader *, off_t, const char *, ...) + PRINTF_FORMAT (3, 4); +static void sys_error (struct sfm_reader *, off_t, const char *, ...) 
+ PRINTF_FORMAT (3, 4) + NO_RETURN; + +static void read_bytes (struct sfm_reader *, void *, size_t); +static bool try_read_bytes (struct sfm_reader *, void *, size_t); +static int read_int (struct sfm_reader *); +static double read_float (struct sfm_reader *); +static void read_string (struct sfm_reader *, char *, size_t); +static void skip_bytes (struct sfm_reader *, size_t); + +static int parse_int (struct sfm_reader *, const void *data, size_t ofs); +static double parse_float (struct sfm_reader *, const void *data, size_t ofs); + +static void read_variable_record (struct sfm_reader *, + struct sfm_var_record *); +static void read_value_label_record (struct sfm_reader *, + struct sfm_value_label_record *, + size_t n_vars); +static struct sfm_document_record *read_document_record (struct sfm_reader *); +static struct sfm_extension_record *read_extension_record ( + struct sfm_reader *, int subtype); +static void skip_extension_record (struct sfm_reader *, int subtype); + +static const char *choose_encoding ( + struct sfm_reader *, + const struct sfm_extension_record *ext_integer, + const struct sfm_extension_record *ext_encoding); + +static struct text_record *open_text_record ( + struct sfm_reader *, const struct sfm_extension_record *); +static void close_text_record (struct sfm_reader *, + struct text_record *); +static bool read_variable_to_value_pair (struct sfm_reader *, + struct dictionary *, + struct text_record *, + struct variable **var, char **value); +static void text_warn (struct sfm_reader *r, struct text_record *text, + const char *format, ...) 
+ PRINTF_FORMAT (3, 4); +static char *text_get_token (struct text_record *, + struct substring delimiters, char *delimiter); +static bool text_match (struct text_record *, char c); +static bool text_read_variable_name (struct sfm_reader *, struct dictionary *, + struct text_record *, + struct substring delimiters, + struct variable **); +static bool text_read_short_name (struct sfm_reader *, struct dictionary *, + struct text_record *, + struct substring delimiters, + struct variable **); +static const char *text_parse_counted_string (struct sfm_reader *, + struct text_record *); +static size_t text_pos (const struct text_record *); + +static bool close_reader (struct sfm_reader *r); + +/* Dictionary reader. */ -/* Closes a system file after we're done with it. */ -void -sfm_close_reader (struct sfm_reader *r) -{ - if (r == NULL) - return; +enum which_format + { + PRINT_FORMAT, + WRITE_FORMAT + }; - if (r->file) - { - if (fn_close (fh_get_file_name (r->fh), r->file) == EOF) - msg (ME, _("%s: Closing system file: %s."), - fh_get_file_name (r->fh), strerror (errno)); - r->file = NULL; - } +static void read_header (struct sfm_reader *, int *weight_idx, + int *claimed_oct_cnt, struct sfm_read_info *, + char **file_labelp); +static void parse_file_label (struct sfm_reader *, const char *file_label, + struct dictionary *); +static void parse_variable_records (struct sfm_reader *, struct dictionary *, + struct sfm_var_record *, size_t n); +static void parse_format_spec (struct sfm_reader *, off_t pos, + unsigned int format, enum which_format, + struct variable *, int *format_warning_cnt); +static void parse_document (struct dictionary *, struct sfm_document_record *); +static void parse_display_parameters (struct sfm_reader *, + const struct sfm_extension_record *, + struct dictionary *); +static void parse_machine_integer_info (struct sfm_reader *, + const struct sfm_extension_record *, + struct sfm_read_info *); +static void parse_machine_float_info (struct sfm_reader 
*, + const struct sfm_extension_record *); +static void parse_mrsets (struct sfm_reader *, + const struct sfm_extension_record *, + struct dictionary *); +static void parse_long_var_name_map (struct sfm_reader *, + const struct sfm_extension_record *, + struct dictionary *); +static void parse_long_string_map (struct sfm_reader *, + const struct sfm_extension_record *, + struct dictionary *); +static void parse_value_labels (struct sfm_reader *, struct dictionary *, + const struct sfm_var_record *, + size_t n_var_recs, + const struct sfm_value_label_record *); +static void parse_data_file_attributes (struct sfm_reader *, + const struct sfm_extension_record *, + struct dictionary *); +static void parse_variable_attributes (struct sfm_reader *, + const struct sfm_extension_record *, + struct dictionary *); +static void parse_long_string_value_labels (struct sfm_reader *, + const struct sfm_extension_record *, + struct dictionary *); - if (r->fh != NULL) - fh_close (r->fh, "system file", "rs"); +/* Opens the system file designated by file handle FH for + reading. Reads the system file's dictionary into *DICT. + If INFO is non-null, then it receives additional info about the + system file. */ +struct casereader * +sfm_open_reader (struct file_handle *fh, struct dictionary **dictp, + struct sfm_read_info *volatile info) +{ + struct sfm_reader *volatile r = NULL; + struct sfm_read_info local_info; - free (r->vars); - free (r->buf); - free (r); -} - -/* Dictionary reader. 
*/ + struct sfm_var_record *vars; + size_t n_vars, allocated_vars; -static void buf_unread(struct sfm_reader *r, size_t byte_cnt); - -static void *buf_read (struct sfm_reader *, void *buf, size_t byte_cnt, - size_t min_alloc); - -static int read_header (struct sfm_reader *, - struct dictionary *, struct sfm_read_info *); -static int parse_format_spec (struct sfm_reader *, int32_t, - struct fmt_spec *, const struct variable *); -static int read_value_labels (struct sfm_reader *, struct dictionary *, - struct variable **var_by_idx); -static int read_variables (struct sfm_reader *, - struct dictionary *, struct variable ***var_by_idx); -static int read_machine_int32_info (struct sfm_reader *, int size, int count); -static int read_machine_flt64_info (struct sfm_reader *, int size, int count); -static int read_documents (struct sfm_reader *, struct dictionary *); - -static int fread_ok (struct sfm_reader *, void *, size_t); - -/* Displays the message X with corrupt_msg, then jumps to the error - label. */ -#define lose(X) \ - do { \ - corrupt_msg X; \ - goto error; \ - } while (0) - -/* Calls buf_read with the specified arguments, and jumps to - error if the read fails. 
*/ -#define assertive_buf_read(a,b,c,d) \ - do { \ - if (!buf_read (a,b,c,d)) \ - goto error; \ - } while (0) - - -struct name_pair -{ - char *shortname; - char *longname; -}; + struct sfm_value_label_record *labels; + size_t n_labels, allocated_labels; -static int -pair_sn_compare(const void *_p1, const void *_p2, void *aux UNUSED) -{ - int i; + struct sfm_document_record *document; - const struct name_pair *p1 = _p1; - const struct name_pair *p2 = _p2; + struct sfm_extension_record *extensions[32]; - char buf1[SHORT_NAME_LEN + 1]; - char buf2[SHORT_NAME_LEN + 1]; + int weight_idx; + int claimed_oct_cnt; + char *file_label; - memset(buf1, 0, SHORT_NAME_LEN + 1); - memset(buf2, 0, SHORT_NAME_LEN + 1); + struct dictionary *dict = NULL; + size_t i; - for (i = 0 ; i <= SHORT_NAME_LEN ; ++i ) - { - buf1[i] = p1->shortname[i]; - if ( '\0' == buf1[i]) - break; - } + /* Create and initialize reader. */ + r = pool_create_container (struct sfm_reader, pool); + r->fh = fh_ref (fh); + r->lock = NULL; + r->file = NULL; + r->pos = 0; + r->error = false; + r->opcode_idx = sizeof r->opcodes; + r->corruption_warning = false; + + /* TRANSLATORS: this fragment will be interpolated into + messages in fh_lock() that identify types of files. */ + r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false); + if (r->lock == NULL) + goto error; - for (i = 0 ; i <= SHORT_NAME_LEN ; ++i ) + r->file = fn_open (fh_get_file_name (fh), "rb"); + if (r->file == NULL) { - buf2[i] = p2->shortname[i]; - if ( '\0' == buf2[i]) - break; + msg (ME, _("Error opening `%s' for reading as a system file: %s."), + fh_get_file_name (r->fh), strerror (errno)); + goto error; } - return strncmp(buf1, buf2, SHORT_NAME_LEN); -} + /* Initialize info. 
*/ + if (info == NULL) + info = &local_info; + memset (info, 0, sizeof *info); -static unsigned int -pair_sn_hash(const void *_p, void *aux UNUSED) -{ - int i; - const struct name_pair *p = _p; - char buf[SHORT_NAME_LEN + 1]; + if (setjmp (r->bail_out)) + goto error; - memset(buf, 0, SHORT_NAME_LEN + 1); - for (i = 0 ; i <= SHORT_NAME_LEN ; ++i ) - { - buf[i] = p->shortname[i]; - if ( '\0' == buf[i]) - break; - } + /* Read header. */ + read_header (r, &weight_idx, &claimed_oct_cnt, info, &file_label); - return hsh_hash_bytes(buf, strlen(buf)); -} + vars = NULL; + n_vars = allocated_vars = 0; -static void -pair_sn_free(void *p, void *aux UNUSED) -{ - free(p); -} + labels = NULL; + n_labels = allocated_labels = 0; + document = NULL; + memset (extensions, 0, sizeof extensions); -/* Opens the system file designated by file handle FH for - reading. Reads the system file's dictionary into *DICT. - If INFO is non-null, then it receives additional info about the - system file. */ -struct sfm_reader * -sfm_open_reader (struct file_handle *fh, struct dictionary **dict, - struct sfm_read_info *info) -{ - struct sfm_reader *r = NULL; - struct variable **var_by_idx = NULL; + for (;;) + { + int subtype; + int type; - /* The data in record 7(14) */ - char *subrec14data = 0; + type = read_int (r); + if (type == 999) + { + read_int (r); /* Skip filler. 
*/ + break; + } - /* A hash table of long variable names indexed by short name */ - struct hsh_table *short_to_long = NULL; + switch (type) + { + case 2: + if (n_vars >= allocated_vars) + vars = pool_2nrealloc (r->pool, vars, &allocated_vars, + sizeof *vars); + read_variable_record (r, &vars[n_vars++]); + break; + + case 3: + if (n_labels >= allocated_labels) + labels = pool_2nrealloc (r->pool, labels, &allocated_labels, + sizeof *labels); + read_value_label_record (r, &labels[n_labels++], n_vars); + break; + + case 4: + /* A Type 4 record is always immediately after a type 3 record, + so the code for type 3 records reads the type 4 record too. */ + sys_error (r, r->pos, _("Misplaced type 4 record.")); + + case 6: + if (document != NULL) + sys_error (r, r->pos, _("Duplicate type 6 (document) record.")); + document = read_document_record (r); + break; + + case 7: + subtype = read_int (r); + if (subtype < 0 || subtype >= sizeof extensions / sizeof *extensions) + { + sys_warn (r, r->pos, + _("Unrecognized record type 7, subtype %d. Please " + "send a copy of this file, and the syntax which " + "created it to %s."), + subtype, PACKAGE_BUGREPORT); + skip_extension_record (r, subtype); + } + else if (extensions[subtype] != NULL) + { + sys_warn (r, r->pos, + _("Record type 7, subtype %d found here has the same " + "type as the record found near offset 0x%llx. " + "Please send a copy of this file, and the syntax " + "which created it to %s."), + subtype, (long long int) extensions[subtype]->pos, + PACKAGE_BUGREPORT); + skip_extension_record (r, subtype); + } + else + extensions[subtype] = read_extension_record (r, subtype); + break; - *dict = dict_create (); - if (!fh_open (fh, FH_REF_FILE, "system file", "rs")) - goto error; + default: + sys_error (r, r->pos, _("Unrecognized record type %d."), type); + goto error; + } + } - /* Create and initialize reader. 
*/ - r = xmalloc (sizeof *r); - r->fh = fh; - r->file = fn_open (fh_get_file_name (fh), "rb"); + /* Now actually parse what we read. - r->reverse_endian = 0; - r->value_cnt = 0; - r->case_cnt = 0; - r->compressed = 0; - r->bias = 100.0; - r->weight_idx = -1; - r->ok = true; - r->has_vls = false; + First, figure out the correct character encoding, because this determines + how the rest of the header data is to be interpreted. */ + dict = dict_create (); + r->encoding = choose_encoding (r, extensions[EXT_INTEGER], + extensions[EXT_ENCODING]); + dict_set_encoding (dict, r->encoding); - r->vars = NULL; + /* These records don't use variables at all. */ + if (document != NULL) + parse_document (dict, document); - r->sysmis = -FLT64_MAX; - r->highest = FLT64_MAX; - r->lowest = second_lowest_flt64; + if (extensions[EXT_INTEGER] != NULL) + parse_machine_integer_info (r, extensions[EXT_INTEGER], info); - r->buf = r->ptr = r->end = NULL; - r->y = r->x + sizeof r->x; + if (extensions[EXT_FLOAT] != NULL) + parse_machine_float_info (r, extensions[EXT_FLOAT]); - /* Check that file open succeeded. */ - if (r->file == NULL) - { - msg (ME, _("An error occurred while opening \"%s\" for reading " - "as a system file: %s."), - fh_get_file_name (r->fh), strerror (errno)); - goto error; - } + if (extensions[EXT_FILE_ATTRS] != NULL) + parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict); - /* Read header and variables. */ - if (!read_header (r, *dict, info) || !read_variables (r, *dict, &var_by_idx)) - goto error; + parse_file_label (r, file_label, dict); + /* Parse the variable records, the basis of almost everything else. */ + parse_variable_records (r, dict, vars, n_vars); - /* Handle weighting. */ - if (r->weight_idx != -1) + /* Parse value labels and the weight variable immediately after the variable + records. These records use indexes into var_recs[], so we must parse them + before those indexes become invalidated by very long string variables. 
*/ + for (i = 0; i < n_labels; i++) + parse_value_labels (r, dict, vars, n_vars, &labels[i]); + if (weight_idx != 0) { struct variable *weight_var; - if (r->weight_idx < 0 || r->weight_idx >= r->value_cnt) - lose ((ME, _("%s: Index of weighting variable (%d) is not between 0 " - "and number of elements per case (%d)."), - fh_get_file_name (r->fh), r->weight_idx, r->value_cnt)); - + weight_var = lookup_var_by_index (r, 76, vars, n_vars, weight_idx); + if (var_is_numeric (weight_var)) + dict_set_weight (dict, weight_var); + else + sys_error (r, -1, _("Weighting variable must be numeric " + "(not string variable `%s')."), + var_get_name (weight_var)); + } - weight_var = var_by_idx[r->weight_idx]; + if (extensions[EXT_DISPLAY] != NULL) + parse_display_parameters (r, extensions[EXT_DISPLAY], dict); - if (weight_var == NULL) - lose ((ME, - _("%s: Weighting variable may not be a continuation of " - "a long string variable."), fh_get_file_name (fh))); - else if (weight_var->type == ALPHA) - lose ((ME, _("%s: Weighting variable may not be a string variable."), - fh_get_file_name (fh))); + /* The following records use short names, so they need to be parsed before + parse_long_var_name_map() changes short names to long names. */ + if (extensions[EXT_MRSETS] != NULL) + parse_mrsets (r, extensions[EXT_MRSETS], dict); - dict_set_weight (*dict, weight_var); - } - else - dict_set_weight (*dict, NULL); + if (extensions[EXT_MRSETS2] != NULL) + parse_mrsets (r, extensions[EXT_MRSETS2], dict); - /* Read records of types 3, 4, 6, and 7. */ - for (;;) - { - int32_t rec_type; + if (extensions[EXT_LONG_STRINGS] != NULL) + parse_long_string_map (r, extensions[EXT_LONG_STRINGS], dict); - assertive_buf_read (r, &rec_type, sizeof rec_type, 0); - if (r->reverse_endian) - bswap_int32 (&rec_type); + /* Now rename variables to their long names. */ + parse_long_var_name_map (r, extensions[EXT_LONG_NAMES], dict); + /* The following records use long names, so they need to follow renaming. 
*/ + if (extensions[EXT_VAR_ATTRS] != NULL) + parse_variable_attributes (r, extensions[EXT_VAR_ATTRS], dict); - switch (rec_type) - { - case 3: - if (!read_value_labels (r, *dict, var_by_idx)) - goto error; - break; - - case 4: - lose ((ME, _("%s: Orphaned variable index record (type 4). Type 4 " - "records must always immediately follow type 3 " - "records."), - fh_get_file_name (r->fh))); - - case 6: - if (!read_documents (r, *dict)) - goto error; - break; - - case 7: - { - struct - { - int32_t subtype P; - int32_t size P; - int32_t count P; - } - data; - unsigned long bytes; - - int skip = 0; - - assertive_buf_read (r, &data, sizeof data, 0); - if (r->reverse_endian) - { - bswap_int32 (&data.subtype); - bswap_int32 (&data.size); - bswap_int32 (&data.count); - } - bytes = data.size * data.count; - - if (bytes < data.size || bytes < data.count) - lose ((ME, "%s: Record type %d subtype %d too large.", - fh_get_file_name (r->fh), rec_type, data.subtype)); - - switch (data.subtype) - { - case 3: - if (!read_machine_int32_info (r, data.size, data.count)) - goto error; - break; - - case 4: - if (!read_machine_flt64_info (r, data.size, data.count)) - goto error; - break; - - case 5: - case 6: /* ?? Used by SPSS 8.0. */ - skip = 1; - break; - - case 11: /* Variable display parameters */ - { - const int n_vars = data.count / 3 ; - int i; - if ( data.count % 3 || n_vars != dict_get_var_cnt(*dict) ) - { - msg (MW, _("%s: Invalid subrecord length. " - "Record: 7; Subrecord: 11"), - fh_get_file_name (r->fh)); - skip = 1; - break; - } - - for ( i = 0 ; i < min(n_vars, dict_get_var_cnt(*dict)) ; ++i ) - { - struct - { - int32_t measure P; - int32_t width P; - int32_t align P; - } - params; - - struct variable *v; - - assertive_buf_read (r, ¶ms, sizeof(params), 0); - - if ( ! measure_is_valid(params.measure) - || - ! alignment_is_valid(params.align)) - { - msg(MW, - _("%s: Invalid variable display parameters. 
Default parameters substituted."), - fh_get_file_name(r->fh)); - continue; - } - - v = dict_get_var(*dict, i); - - v->measure = params.measure; - v->display_width = params.width; - v->alignment = params.align; - } - } - break; - - case 13: /* SPSS 12.0 Long variable name map */ - { - char *short_name, *save_ptr; - int idx; - - /* Read data. */ - subrec14data = xmalloc (bytes + 1); - if (!buf_read (r, subrec14data, bytes, 0)) - { - goto error; - } - subrec14data[bytes] = '\0'; - - short_to_long = hsh_create(4, - pair_sn_compare, - pair_sn_hash, - pair_sn_free, - 0); - - /* Parse data. */ - for (short_name = strtok_r (subrec14data, "=", &save_ptr), idx = 0; - short_name != NULL; - short_name = strtok_r (NULL, "=", &save_ptr), idx++) - { - struct name_pair *pair ; - char *long_name = strtok_r (NULL, "\t", &save_ptr); - struct variable *v; - - /* Validate long name. */ - if (long_name == NULL) - { - msg (MW, _("%s: Trailing garbage in long variable " - "name map."), - fh_get_file_name (r->fh)); - break; - } - if (!var_is_valid_name (long_name, false)) - { - msg (MW, _("%s: Long variable mapping to invalid " - "variable name `%s'."), - fh_get_file_name (r->fh), long_name); - break; - } - - /* Find variable using short name. */ - v = dict_lookup_var (*dict, short_name); - if (v == NULL) - { - msg (MW, _("%s: Long variable mapping for " - "nonexistent variable %s."), - fh_get_file_name (r->fh), short_name); - break; - } - - /* Identify any duplicates. */ - if ( compare_var_names(short_name, long_name, 0) && - NULL != dict_lookup_var (*dict, long_name)) - lose ((ME, _("%s: Duplicate long variable name `%s' " - "within system file."), - fh_get_file_name (r->fh), long_name)); - - - /* Set long name. - Renaming a variable may clear the short - name, but we want to retain it, so - re-set it explicitly. 
*/ - dict_rename_var (*dict, v, long_name); - var_set_short_name (v, short_name); - - pair = xmalloc(sizeof *pair); - pair->shortname = short_name; - pair->longname = long_name; - hsh_insert(short_to_long, pair); -#if 0 - /* This messes up the processing of subtype 14 (below). - I'm not sure if it is needed anyway, so I'm removing it for - now. If it's needed, then it will need to be done after all the - records have been processed. --- JMD 27 April 2006 - */ - - /* For compatibility, make sure dictionary - is in long variable name map order. In - the common case, this has no effect, - because the dictionary and the long - variable name map are already in the - same order. */ - dict_reorder_var (*dict, v, idx); -#endif - } - - } - break; - - case 14: - { - int j = 0; - bool eq_seen = false; - int i; - - /* Read data. */ - char *buffer = xmalloc (bytes + 1); - if (!buf_read (r, buffer, bytes, 0)) - { - free (buffer); - goto error; - } - buffer[bytes] = '\0'; - - r->has_vls = true; - - /* Note: SPSS v13 terminates this record with 00, - whereas SPSS v14 terminates it with 00 09. 
We must - accept either */ - for(i = 0; i < bytes ; ++i) - { - long int length; - static char name[SHORT_NAME_LEN + 1] = {0}; - static char len_str[6] ={0}; - - switch( buffer[i] ) - { - case '=': - eq_seen = true; - j = 0; - break; - case '\0': - length = strtol(len_str, 0, 10); - if ( length != LONG_MAX && length != LONG_MIN) - { - char *lookup_name = name; - int l; - int idx; - struct variable *v; - - if ( short_to_long ) - { - struct name_pair pair; - struct name_pair *p; - - pair.shortname = name; - p = hsh_find(short_to_long, &pair); - if ( p ) - lookup_name = p->longname; - } - - v = dict_lookup_var(*dict, lookup_name); - if ( !v ) - { - corrupt_msg(MW, - _("%s: No variable called %s but it is listed in length table."), - fh_get_file_name (r->fh), lookup_name); - - goto error; - - } - - l = length; - if ( v->width > EFFECTIVE_LONG_STRING_LENGTH ) - l -= EFFECTIVE_LONG_STRING_LENGTH; - else - l -= v->width; - - idx = v->index; - while ( l > 0 ) - { - struct variable *v_next; - v_next = dict_get_var(*dict, idx + 1); - - if ( v_next->width > EFFECTIVE_LONG_STRING_LENGTH ) - l -= EFFECTIVE_LONG_STRING_LENGTH; - else - l -= v_next->width; - - dict_delete_var(*dict, v_next); - } - - assert ( length > MAX_LONG_STRING ); - - v->width = length; - v->print.w = v->width; - v->write.w = v->width; - v->nv = DIV_RND_UP (length, MAX_SHORT_STRING); - } - eq_seen = false; - memset(name, 0, SHORT_NAME_LEN+1); - memset(len_str, 0, 6); - j = 0; - break; - case '\t': - break; - default: - if ( eq_seen ) - len_str[j] = buffer[i]; - else - name[j] = buffer[i]; - j++; - break; - } - } - free(buffer); - dict_compact_values(*dict); - } - break; - - default: - msg (MW, _("%s: Unrecognized record type 7, subtype %d " - "encountered in system file."), - fh_get_file_name (r->fh), data.subtype); - skip = 1; - } - - if (skip) - { - void *x = buf_read (r, NULL, data.size * data.count, 0); - if (x == NULL) - goto error; - free (x); - } - } - break; - - case 999: - { - int32_t filler; - - 
assertive_buf_read (r, &filler, sizeof filler, 0); - - goto success; - } - - default: - corrupt_msg(MW, _("%s: Unrecognized record type %d."), - fh_get_file_name (r->fh), rec_type); - } - } + if (extensions[EXT_LONG_LABELS] != NULL) + parse_long_string_value_labels (r, extensions[EXT_LONG_LABELS], dict); - success: - /* Come here on successful completion. */ + /* Warn if the actual amount of data per case differs from the + amount that the header claims. SPSS version 13 gets this + wrong when very long strings are involved, so don't warn in + that case. */ + if (claimed_oct_cnt != -1 && claimed_oct_cnt != n_vars + && info->version_major != 13) + sys_warn (r, -1, _("File header claims %d variable positions but " + "%d were read from file."), + claimed_oct_cnt, n_vars); /* Create an index of dictionary variable widths for - sfm_read_case to use. We cannot use the `struct variables' + sfm_read_case to use. We cannot use the `struct variable's from the dictionary we created, because the caller owns the dictionary and may destroy or modify its variables. */ - { - size_t i; - - r->var_cnt = dict_get_var_cnt (*dict); - r->vars = xnmalloc (r->var_cnt, sizeof *r->vars); - for (i = 0; i < r->var_cnt; i++) - { - struct variable *v = dict_get_var (*dict, i); - struct sfm_var *sv = &r->vars[i]; - sv->width = v->width; - sv->fv = v->fv; - } - } - - free (var_by_idx); - hsh_destroy(short_to_long); - free (subrec14data); - return r; - - error: - /* Come here on unsuccessful completion. */ - sfm_close_reader (r); - free (var_by_idx); - hsh_destroy(short_to_long); - free (subrec14data); - if (*dict != NULL) - { - dict_destroy (*dict); - *dict = NULL; - } + sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt); + pool_register (r->pool, free, r->sfm_vars); + r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool); + + *dictp = dict; + return casereader_create_sequential + (NULL, r->proto, + r->case_cnt == -1 ? 
CASENUMBER_MAX: r->case_cnt, + &sys_file_casereader_class, r); + +error: + close_reader (r); + dict_destroy (dict); + *dictp = NULL; return NULL; } -/* Read record type 7, subtype 3. */ -static int -read_machine_int32_info (struct sfm_reader *r, int size, int count) -{ - int32_t data[8]; - int file_bigendian; - - int i; - - if (size != sizeof (int32_t) || count != 8) - lose ((ME, _("%s: Bad size (%d) or count (%d) field on record type 7, " - "subtype 3. Expected size %d, count 8."), - fh_get_file_name (r->fh), size, count, sizeof (int32_t))); - - assertive_buf_read (r, data, sizeof data, 0); - if (r->reverse_endian) - for (i = 0; i < 8; i++) - bswap_int32 (&data[i]); - -#ifdef FPREP_IEEE754 - if (data[4] != 1) - lose ((ME, _("%s: Floating-point representation in system file is not " - "IEEE-754. PSPP cannot convert between floating-point " - "formats."), - fh_get_file_name (r->fh))); -#else -#error Add support for your floating-point format. -#endif - -#ifdef WORDS_BIGENDIAN - file_bigendian = 1; -#else - file_bigendian = 0; -#endif - if (r->reverse_endian) - file_bigendian ^= 1; - if (file_bigendian ^ (data[6] == 1)) - lose ((ME, _("%s: File-indicated endianness (%s) does not match " - "endianness intuited from file header (%s)."), - fh_get_file_name (r->fh), - file_bigendian ? _("big-endian") : _("little-endian"), - data[6] == 1 ? _("big-endian") : (data[6] == 2 ? _("little-endian") - : _("unknown")))); - - /* PORTME: Character representation code. */ - if (data[7] != 2 && data[7] != 3) - lose ((ME, _("%s: File-indicated character representation code (%s) is " - "not ASCII."), - fh_get_file_name (r->fh), - (data[7] == 1 ? "EBCDIC" - : (data[7] == 4 ? _("DEC Kanji") : _("Unknown"))))); - - return 1; - - error: - return 0; -} - -/* Read record type 7, subtype 4. */ -static int -read_machine_flt64_info (struct sfm_reader *r, int size, int count) +/* Closes a system file after we're done with it. 
+ Returns true if an I/O error has occurred on READER, false + otherwise. */ +static bool +close_reader (struct sfm_reader *r) { - flt64 data[3]; - int i; + bool error; - if (size != sizeof (flt64) || count != 3) - lose ((ME, _("%s: Bad size (%d) or count (%d) field on record type 7, " - "subtype 4. Expected size %d, count 8."), - fh_get_file_name (r->fh), size, count, sizeof (flt64))); - - assertive_buf_read (r, data, sizeof data, 0); - if (r->reverse_endian) - for (i = 0; i < 3; i++) - bswap_flt64 (&data[i]); + if (r == NULL) + return true; - if (data[0] != SYSMIS || data[1] != FLT64_MAX - || data[2] != second_lowest_flt64) + if (r->file) { - r->sysmis = data[0]; - r->highest = data[1]; - r->lowest = data[2]; - msg (MW, _("%s: File-indicated value is different from internal value " - "for at least one of the three system values. SYSMIS: " - "indicated %g, expected %g; HIGHEST: %g, %g; LOWEST: " - "%g, %g."), - fh_get_file_name (r->fh), (double) data[0], (double) SYSMIS, - (double) data[1], (double) FLT64_MAX, - (double) data[2], (double) second_lowest_flt64); + if (fn_close (fh_get_file_name (r->fh), r->file) == EOF) + { + msg (ME, _("Error closing system file `%s': %s."), + fh_get_file_name (r->fh), strerror (errno)); + r->error = true; + } + r->file = NULL; } - - return 1; - error: - return 0; + fh_unlock (r->lock); + fh_unref (r->fh); + + error = r->error; + pool_destroy (r->pool); + + return !error; } -static int -read_header (struct sfm_reader *r, - struct dictionary *dict, struct sfm_read_info *info) +/* Destroys READER. */ +static void +sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_) { - struct sysfile_header hdr; /* Disk buffer. */ - char prod_name[sizeof hdr.prod_name + 1]; /* Buffer for product name. */ - int skip_amt = 0; /* Amount of product name to omit. */ - int i; + struct sfm_reader *r = r_; + close_reader (r); +} - /* Read header, check magic. 
*/ - assertive_buf_read (r, &hdr, sizeof hdr, 0); - if (strncmp ("$FL2", hdr.rec_type, 4) != 0) - lose ((ME, _("%s: Bad magic. Proper system files begin with " - "the four characters `$FL2'. This file will not be read."), - fh_get_file_name (r->fh))); - - /* Check eye-category.her string. */ - memcpy (prod_name, hdr.prod_name, sizeof hdr.prod_name); - for (i = 0; i < 60; i++) - if (!c_isprint ((unsigned char) prod_name[i])) - prod_name[i] = ' '; - for (i = 59; i >= 0; i--) - if (!c_isgraph ((unsigned char) prod_name[i])) - { - prod_name[i] = '\0'; - break; - } - prod_name[60] = '\0'; - - { -#define N_PREFIXES 2 - static const char *prefix[N_PREFIXES] = - { - "@(#) SPSS DATA FILE", - "SPSS SYSTEM FILE.", - }; +/* Returns true if FILE is an SPSS system file, + false otherwise. */ +bool +sfm_detect (FILE *file) +{ + char rec_type[5]; - int i; + if (fread (rec_type, 4, 1, file) != 1) + return false; + rec_type[4] = '\0'; - for (i = 0; i < N_PREFIXES; i++) - if (!strncmp (prefix[i], hdr.prod_name, strlen (prefix[i]))) - { - skip_amt = strlen (prefix[i]); - break; - } - } - - /* Check endianness. */ - if (hdr.layout_code == 2) - r->reverse_endian = 0; - else + return !strcmp ("$FL2", rec_type); +} + +/* Reads the global header of the system file. Sets *WEIGHT_IDX to 0 if the + system file is unweighted, or to the value index of the weight variable + otherwise. Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units) per + case that the file claims to have (although it is not always correct). + Initializes INFO with header information. Stores the file label as a string + in dictionary encoding into *FILE_LABELP. 
*/ +static void +read_header (struct sfm_reader *r, int *weight_idx, + int *claimed_oct_cnt, struct sfm_read_info *info, + char **file_labelp) +{ + char rec_type[5]; + char eye_catcher[61]; + uint8_t raw_layout_code[4]; + uint8_t raw_bias[8]; + char creation_date[10]; + char creation_time[9]; + char file_label[65]; + struct substring product; + + read_string (r, rec_type, sizeof rec_type); + read_string (r, eye_catcher, sizeof eye_catcher); + + if (strcmp ("$FL2", rec_type) != 0) + sys_error (r, 0, _("This is not an SPSS system file.")); + + /* Identify integer format. */ + read_bytes (r, raw_layout_code, sizeof raw_layout_code); + if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code, + &r->integer_format) + && !integer_identify (3, raw_layout_code, sizeof raw_layout_code, + &r->integer_format)) + || (r->integer_format != INTEGER_MSB_FIRST + && r->integer_format != INTEGER_LSB_FIRST)) + sys_error (r, 64, _("This is not an SPSS system file.")); + + *claimed_oct_cnt = read_int (r); + if (*claimed_oct_cnt < 0 || *claimed_oct_cnt > INT_MAX / 16) + *claimed_oct_cnt = -1; + + r->compressed = read_int (r) != 0; + + *weight_idx = read_int (r); + + r->case_cnt = read_int (r); + if ( r->case_cnt > INT_MAX / 2) + r->case_cnt = -1; + + /* Identify floating-point format and obtain compression bias. */ + read_bytes (r, raw_bias, sizeof raw_bias); + if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0) { - bswap_int32 (&hdr.layout_code); - if (hdr.layout_code != 2) - lose ((ME, _("%s: File layout code has unexpected value %d. 
Value " - "should be 2, in big-endian or little-endian format."), - fh_get_file_name (r->fh), hdr.layout_code)); - - r->reverse_endian = 1; - bswap_int32 (&hdr.nominal_case_size); - bswap_int32 (&hdr.compress); - bswap_int32 (&hdr.weight_idx); - bswap_int32 (&hdr.case_cnt); - bswap_flt64 (&hdr.bias); - } + uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + + if (memcmp (raw_bias, zero_bias, 8)) + sys_warn (r, r->pos - 8, + _("Compression bias is not the usual " + "value of 100, or system file uses unrecognized " + "floating-point format.")); + else + { + /* Some software is known to write all-zeros to this + field. Such software also writes floating-point + numbers in the format that we expect by default + (it seems that all software most likely does, in + reality), so don't warn in this case. */ + } + if (r->integer_format == INTEGER_MSB_FIRST) + r->float_format = FLOAT_IEEE_DOUBLE_BE; + else + r->float_format = FLOAT_IEEE_DOUBLE_LE; + } + float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias); + + read_string (r, creation_date, sizeof creation_date); + read_string (r, creation_time, sizeof creation_time); + read_string (r, file_label, sizeof file_label); + skip_bytes (r, 3); + + strcpy (info->creation_date, creation_date); + strcpy (info->creation_time, creation_time); + info->integer_format = r->integer_format; + info->float_format = r->float_format; + info->compressed = r->compressed; + info->case_cnt = r->case_cnt; + + product = ss_cstr (eye_catcher); + ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE")); + ss_trim (&product, ss_cstr (" ")); + str_copy_buf_trunc (info->product, sizeof info->product, + ss_data (product), ss_length (product)); + + *file_labelp = pool_strdup0 (r->pool, file_label, sizeof file_label - 1); +} - /* Copy basic info and verify correctness. */ - r->value_cnt = hdr.nominal_case_size; +/* Reads a variable (type 2) record from R into RECORD. 
*/ +static void +read_variable_record (struct sfm_reader *r, struct sfm_var_record *record) +{ + int has_variable_label; - /* If value count is ridiculous, then force it to -1 (a - sentinel value). */ - if ( r->value_cnt < 0 || - r->value_cnt > (INT_MAX / (int) sizeof (union value) / 2)) - r->value_cnt = -1; + memset (record, 0, sizeof *record); - r->compressed = hdr.compress; + record->pos = r->pos; + record->width = read_int (r); + has_variable_label = read_int (r); + record->missing_value_code = read_int (r); + record->print_format = read_int (r); + record->write_format = read_int (r); + read_bytes (r, record->name, sizeof record->name); - r->weight_idx = hdr.weight_idx - 1; + if (has_variable_label == 1) + { + enum { MAX_LABEL_LEN = 255 }; + size_t len, read_len; - r->case_cnt = hdr.case_cnt; - if (r->case_cnt < -1 || r->case_cnt > INT_MAX / 2) - lose ((ME, - _("%s: Number of cases in file (%ld) is not between -1 and %d."), - fh_get_file_name (r->fh), (long) r->case_cnt, INT_MAX / 2)); + len = read_int (r); - r->bias = hdr.bias; - if (r->bias != 100.0) - corrupt_msg (MW, _("%s: Compression bias (%g) is not the usual " - "value of 100."), - fh_get_file_name (r->fh), r->bias); + /* Read up to MAX_LABEL_LEN bytes of label. */ + read_len = MIN (MAX_LABEL_LEN, len); + record->label = xmalloc (read_len + 1); + read_string (r, record->label, read_len + 1); - /* Make a file label only on the condition that the given label is - not all spaces or nulls. */ - { - int i; + /* Skip unread label bytes. */ + skip_bytes (r, len - read_len); - for (i = sizeof hdr.file_label - 1; i >= 0; i--) - { - if (!c_isspace ((unsigned char) hdr.file_label[i]) - && hdr.file_label[i] != 0) - { - char *label = xmalloc (i + 2); - memcpy (label, hdr.file_label, i + 1); - label[i + 1] = 0; - dict_set_label (dict, label); - free (label); - break; - } - } - } + /* Skip label padding up to multiple of 4 bytes. 
*/ + skip_bytes (r, ROUND_UP (len, 4) - len); + } + else if (has_variable_label != 0) + sys_error (r, record->pos, + _("Variable label indicator field is not 0 or 1.")); - if (info) + /* Set missing values. */ + if (record->missing_value_code != 0) { - char *cp; - - memcpy (info->creation_date, hdr.creation_date, 9); - info->creation_date[9] = 0; + int code = record->missing_value_code; + if (record->width == 0) + { + if (code < -3 || code > 3 || code == -1) + sys_error (r, record->pos, + _("Numeric missing value indicator field is not " + "-3, -2, 0, 1, 2, or 3.")); + } + else + { + if (code < 1 || code > 3) + sys_error (r, record->pos, + _("String missing value indicator field is not " + "0, 1, 2, or 3.")); + } - memcpy (info->creation_time, hdr.creation_time, 8); - info->creation_time[8] = 0; + read_bytes (r, record->missing, 8 * abs (code)); + } +} -#ifdef WORDS_BIGENDIAN - info->big_endian = !r->reverse_endian; -#else - info->big_endian = r->reverse_endian; -#endif +/* Reads value labels from R into RECORD. */ +static void +read_value_label_record (struct sfm_reader *r, + struct sfm_value_label_record *record, + size_t n_vars) +{ + size_t i; + + /* Read type 3 record. */ + record->pos = r->pos; + record->n_labels = read_int (r); + if (record->n_labels > SIZE_MAX / sizeof *record->labels) + sys_error (r, r->pos - 4, _("Invalid number of labels %zu."), + record->n_labels); + record->labels = pool_nmalloc (r->pool, record->n_labels, + sizeof *record->labels); + for (i = 0; i < record->n_labels; i++) + { + struct sfm_value_label *label = &record->labels[i]; + unsigned char label_len; + size_t padded_len; - info->compressed = hdr.compress; + read_bytes (r, label->value, sizeof label->value); - info->case_cnt = hdr.case_cnt; + /* Read label length. 
*/ + read_bytes (r, &label_len, sizeof label_len); + padded_len = ROUND_UP (label_len + 1, 8); - for (cp = &prod_name[skip_amt]; cp < &prod_name[60]; cp++) - if (c_isgraph ((unsigned char) *cp)) - break; - strcpy (info->product, cp); + /* Read label, padding. */ + label->label = pool_malloc (r->pool, padded_len + 1); + read_bytes (r, label->label, padded_len - 1); + label->label[label_len] = '\0'; } - return 1; + /* Read record type of type 4 record. */ + if (read_int (r) != 4) + sys_error (r, r->pos - 4, + _("Variable index record (type 4) does not immediately " + "follow value label record (type 3) as it should.")); - error: - return 0; + /* Read number of variables associated with value label from type 4 + record. */ + record->n_vars = read_int (r); + if (record->n_vars < 1 || record->n_vars > n_vars) + sys_error (r, r->pos - 4, + _("Number of variables associated with a value label (%d) " + "is not between 1 and the number of variables (%zu)."), + record->n_vars, n_vars); + record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars); + for (i = 0; i < record->n_vars; i++) + record->vars[i] = read_int (r); } -/* Reads most of the dictionary from file H; also fills in the - associated VAR_BY_IDX array. */ -static int -read_variables (struct sfm_reader *r, - struct dictionary *dict, struct variable ***var_by_idx) +/* Reads a document record from R and returns it. */ +static struct sfm_document_record * +read_document_record (struct sfm_reader *r) { - int i; + struct sfm_document_record *record; + int n_lines; + + record = pool_malloc (r->pool, sizeof *record); + record->pos = r->pos; - struct sysfile_variable sv; /* Disk buffer. */ - int long_string_count = 0; /* # of long string continuation - records still expected. */ - int next_value = 0; /* Index to next `value' structure. 
*/ + n_lines = read_int (r); + if (n_lines <= 0 || n_lines >= INT_MAX / DOC_LINE_LENGTH) + sys_error (r, record->pos, + _("Number of document lines (%d) " + "must be greater than 0 and less than %d."), + n_lines, INT_MAX / DOC_LINE_LENGTH); - assert(r); + record->n_lines = n_lines; + record->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines); + read_bytes (r, record->documents, DOC_LINE_LENGTH * n_lines); - *var_by_idx = 0; + return record; +} +static void +read_extension_record_header (struct sfm_reader *r, int subtype, + struct sfm_extension_record *record) +{ + record->pos = r->pos; + record->size = read_int (r); + record->count = read_int (r); + + /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1 + allows an extra byte for a null terminator, used by some + extension processing routines. */ + if (record->size != 0 + && size_overflow_p (xsum (1, xtimes (record->count, record->size)))) + sys_error (r, record->pos, "Record type 7 subtype %d too large.", subtype); +} - /* Read in the entry for each variable and use the info to - initialize the dictionary. */ - for (i = 0; ; ++i) +/* Reads an extension record from R into RECORD. */ +static struct sfm_extension_record * +read_extension_record (struct sfm_reader *r, int subtype) +{ + struct extension_record_type { - struct variable *vv; - char name[SHORT_NAME_LEN + 1]; - int nv; - int j; + int subtype; + int size; + int count; + }; - assertive_buf_read (r, &sv, sizeof sv, 0); + static const struct extension_record_type types[] = + { + /* Implemented record types. */ + { EXT_INTEGER, 4, 8 }, + { EXT_FLOAT, 8, 3 }, + { EXT_MRSETS, 1, 0 }, + { EXT_DISPLAY, 4, 0 }, + { EXT_LONG_NAMES, 1, 0 }, + { EXT_LONG_STRINGS, 1, 0 }, + { EXT_NCASES, 8, 2 }, + { EXT_FILE_ATTRS, 1, 0 }, + { EXT_VAR_ATTRS, 1, 0 }, + { EXT_MRSETS2, 1, 0 }, + { EXT_ENCODING, 1, 0 }, + { EXT_LONG_LABELS, 1, 0 }, + + /* Ignored record types. 
*/ + { EXT_VAR_SETS, 0, 0 }, + { EXT_DATE, 0, 0 }, + { EXT_DATA_ENTRY, 0, 0 }, + }; + + const struct extension_record_type *type; + struct sfm_extension_record *record; + size_t n_bytes; + + record = pool_malloc (r->pool, sizeof *record); + read_extension_record_header (r, subtype, record); + n_bytes = record->count * record->size; + + for (type = types; type < &types[sizeof types / sizeof *types]; type++) + if (subtype == type->subtype) + { + if (type->size > 0 && record->size != type->size) + sys_warn (r, record->pos, + _("Record type 7, subtype %d has bad size %zu " + "(expected %d)."), subtype, record->size, type->size); + else if (type->count > 0 && record->count != type->count) + sys_warn (r, record->pos, + _("Record type 7, subtype %d has bad count %zu " + "(expected %d)."), subtype, record->count, type->count); + else if (type->count == 0 && type->size == 0) + { + /* Ignore this record. */ + } + else + { + char *data = pool_malloc (r->pool, n_bytes + 1); + data[n_bytes] = '\0'; + + record->data = data; + read_bytes (r, record->data, n_bytes); + return record; + } + + goto skip; + } - if (r->reverse_endian) - { - bswap_int32 (&sv.rec_type); - bswap_int32 (&sv.type); - bswap_int32 (&sv.has_var_label); - bswap_int32 (&sv.n_missing_values); - bswap_int32 (&sv.print); - bswap_int32 (&sv.write); - } + sys_warn (r, record->pos, + _("Unrecognized record type 7, subtype %d. 
Please send a " + "copy of this file, and the syntax which created it to %s."), + subtype, PACKAGE_BUGREPORT); - /* We've come to the end of the variable entries */ - if (sv.rec_type != 2) - { - buf_unread(r, sizeof sv); - r->value_cnt = i; - break; - } +skip: + skip_bytes (r, n_bytes); + return NULL; +} - *var_by_idx = xnrealloc (*var_by_idx, i + 1, sizeof **var_by_idx); +static void +skip_extension_record (struct sfm_reader *r, int subtype) +{ + struct sfm_extension_record record; - /* If there was a long string previously, make sure that the - continuations are present; otherwise make sure there aren't - any. */ - if (long_string_count) - { - if (sv.type != -1) - lose ((ME, _("%s: position %d: String variable does not have " - "proper number of continuation records."), - fh_get_file_name (r->fh), i)); + read_extension_record_header (r, subtype, &record); + skip_bytes (r, record.count * record.size); +} +static void +parse_file_label (struct sfm_reader *r, const char *file_label, + struct dictionary *dict) +{ + char *utf8_file_label; + size_t file_label_len; + + utf8_file_label = recode_string_pool ("UTF-8", dict_get_encoding (dict), + file_label, -1, r->pool); + file_label_len = strlen (utf8_file_label); + while (file_label_len > 0 && utf8_file_label[file_label_len - 1] == ' ') + file_label_len--; + utf8_file_label[file_label_len] = '\0'; + dict_set_label (dict, utf8_file_label); +} - (*var_by_idx)[i] = NULL; - long_string_count--; - continue; - } - else if (sv.type == -1) - lose ((ME, _("%s: position %d: Superfluous long string continuation " - "record."), - fh_get_file_name (r->fh), i)); - - /* Check fields for validity. 
*/ - if (sv.type < 0 || sv.type > 255) - lose ((ME, _("%s: position %d: Bad variable type code %d."), - fh_get_file_name (r->fh), i, sv.type)); - if (sv.has_var_label != 0 && sv.has_var_label != 1) - lose ((ME, _("%s: position %d: Variable label indicator field is not " - "0 or 1."), fh_get_file_name (r->fh), i)); - if (sv.n_missing_values < -3 || sv.n_missing_values > 3 - || sv.n_missing_values == -1) - lose ((ME, _("%s: position %d: Missing value indicator field is not " - "-3, -2, 0, 1, 2, or 3."), fh_get_file_name (r->fh), i)); - - /* Copy first character of variable name. */ - if (sv.name[0] == '@' || sv.name[0] == '#') - lose ((ME, _("%s: position %d: Variable name begins with invalid " - "character."), - fh_get_file_name (r->fh), i)); - - name[0] = sv.name[0]; - - /* Copy remaining characters of variable name. */ - for (j = 1; j < SHORT_NAME_LEN; j++) - { - int c = (unsigned char) sv.name[j]; +/* Reads a variable (type 2) record from R and adds the + corresponding variable to DICT. + Also skips past additional variable records for long string + variables. */ +static void +parse_variable_records (struct sfm_reader *r, struct dictionary *dict, + struct sfm_var_record *var_recs, size_t n_var_recs) +{ + const char *dict_encoding = dict_get_encoding (dict); + struct sfm_var_record *rec; + int n_warnings = 0; - if (c == ' ') - break; - else - name[j] = c; - } - name[j] = 0; + for (rec = var_recs; rec < &var_recs[n_var_recs]; ) + { + struct variable *var; + size_t n_values; + char *name; + size_t i; + + name = recode_string_pool ("UTF-8", dict_encoding, + rec->name, 8, r->pool); + name[strcspn (name, " ")] = '\0'; - if ( ! var_is_plausible_name(name, false) ) - lose ((ME, _("%s: Invalid variable name `%s' within system file."), - fh_get_file_name (r->fh), name)); + if (!var_is_valid_name (name, false) || name[0] == '$' || name[0] == '#') + sys_error (r, rec->pos, _("Invalid variable name `%s'."), name); - /* Create variable. 
*/ - vv = (*var_by_idx)[i] = dict_create_var (dict, name, sv.type); - if (vv == NULL) - lose ((ME, _("%s: Duplicate variable name `%s' within system file."), - fh_get_file_name (r->fh), name)); + if (rec->width < 0 || rec->width > 255) + sys_error (r, rec->pos, + _("Bad width %d for variable %s."), rec->width, name); - /* Set the short name the same as the long name */ - var_set_short_name (vv, vv->name); + var = rec->var = dict_create_var (dict, name, rec->width); + if (var == NULL) + sys_error (r, rec->pos, _("Duplicate variable name `%s'."), name); - /* Case reading data. */ - nv = sv.type == 0 ? 1 : DIV_RND_UP (sv.type, sizeof (flt64)); - long_string_count = nv - 1; - next_value += nv; + /* Set the short name the same as the long name. */ + var_set_short_name (var, 0, name); /* Get variable label, if any. */ - if (sv.has_var_label == 1) - { - /* Disk buffer. */ - int32_t len; - - /* Read length of label. */ - assertive_buf_read (r, &len, sizeof len, 0); - if (r->reverse_endian) - bswap_int32 (&len); - - /* Check len. */ - if (len < 0 || len > 255) - lose ((ME, _("%s: Variable %s indicates variable label of invalid " - "length %d."), - fh_get_file_name (r->fh), vv->name, len)); - - if ( len != 0 ) - { - /* Read label into variable structure. */ - vv->label = buf_read (r, NULL, ROUND_UP (len, sizeof (int32_t)), len + 1); - if (vv->label == NULL) - goto error; - vv->label[len] = '\0'; - } - } + if (rec->label) + { + char *utf8_label; - /* Set missing values. 
*/ - if (sv.n_missing_values != 0) - { - flt64 mv[3]; - int mv_cnt = abs (sv.n_missing_values); - - if (vv->width > MAX_SHORT_STRING) - lose ((ME, _("%s: Long string variable %s may not have missing " - "values."), - fh_get_file_name (r->fh), vv->name)); - - assertive_buf_read (r, mv, sizeof *mv * mv_cnt, 0); - - if (r->reverse_endian && vv->type == NUMERIC) - for (j = 0; j < mv_cnt; j++) - bswap_flt64 (&mv[j]); - - if (sv.n_missing_values > 0) - { - for (j = 0; j < sv.n_missing_values; j++) - if (vv->type == NUMERIC) - mv_add_num (&vv->miss, mv[j]); - else - mv_add_str (&vv->miss, (char *) &mv[j]); - } - else - { - if (vv->type == ALPHA) - lose ((ME, _("%s: String variable %s may not have missing " - "values specified as a range."), - fh_get_file_name (r->fh), vv->name)); - - if (mv[0] == r->lowest) - mv_add_num_range (&vv->miss, LOWEST, mv[1]); - else if (mv[1] == r->highest) - mv_add_num_range (&vv->miss, mv[0], HIGHEST); - else - mv_add_num_range (&vv->miss, mv[0], mv[1]); - - if (sv.n_missing_values == -3) - mv_add_num (&vv->miss, mv[2]); - } - } + utf8_label = recode_string_pool ("UTF-8", dict_encoding, + rec->label, -1, r->pool); + var_set_label (var, utf8_label); + } - if (!parse_format_spec (r, sv.print, &vv->print, vv) - || !parse_format_spec (r, sv.write, &vv->write, vv)) - goto error; - } + /* Set missing values. */ + if (rec->missing_value_code != 0) + { + int width = var_get_width (var); + struct missing_values mv; - /* Some consistency checks. */ - if (long_string_count != 0) - lose ((ME, _("%s: Long string continuation records omitted at end of " - "dictionary."), - fh_get_file_name (r->fh))); + mv_init_pool (r->pool, &mv, width); + if (var_is_numeric (var)) + { + bool has_range = rec->missing_value_code < 0; + int n_discrete = (has_range + ? 
rec->missing_value_code == -3 + : rec->missing_value_code); + int ofs = 0; - if (next_value != r->value_cnt) - corrupt_msg(MW, _("%s: System file header indicates %d variable positions but " - "%d were read from file."), - fh_get_file_name (r->fh), r->value_cnt, next_value); + if (has_range) + { + double low = parse_float (r, rec->missing, 0); + double high = parse_float (r, rec->missing, 8); + mv_add_range (&mv, low, high); + ofs += 16; + } + for (i = 0; i < n_discrete; i++) + { + mv_add_num (&mv, parse_float (r, rec->missing, ofs)); + ofs += 8; + } + } + else + { + union value value; - return 1; + value_init_pool (r->pool, &value, width); + value_set_missing (&value, width); + for (i = 0; i < rec->missing_value_code; i++) + { + uint8_t *s = value_str_rw (&value, width); + memcpy (s, rec->missing + 8 * i, MIN (width, 8)); + mv_add_str (&mv, s); + } + } + var_set_missing_values (var, &mv); + } - error: - return 0; + /* Set formats. */ + parse_format_spec (r, rec->pos + 12, rec->print_format, + PRINT_FORMAT, var, &n_warnings); + parse_format_spec (r, rec->pos + 16, rec->write_format, + WRITE_FORMAT, var, &n_warnings); + + /* Account for values. + Skip long string continuation records, if any. */ + n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8); + for (i = 1; i < n_values; i++) + if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1) + sys_error (r, rec->pos, _("Missing string continuation record.")); + rec += n_values; + } } /* Translates the format spec from sysfile format to internal format. 
*/ -static int -parse_format_spec (struct sfm_reader *r, int32_t s, - struct fmt_spec *f, const struct variable *v) +static void +parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format, + enum which_format which, struct variable *v, + int *n_warnings) { - f->type = translate_fmt ((s >> 16) & 0xff); - if (f->type == -1) - lose ((ME, _("%s: Bad format specifier byte (%d)."), - fh_get_file_name (r->fh), (s >> 16) & 0xff)); - f->w = (s >> 8) & 0xff; - f->d = s & 0xff; - - if ((v->type == ALPHA) ^ ((formats[f->type].cat & FCAT_STRING) != 0)) - lose ((ME, _("%s: %s variable %s has %s format specifier %s."), - fh_get_file_name (r->fh), - v->type == ALPHA ? _("String") : _("Numeric"), - v->name, - formats[f->type].cat & FCAT_STRING ? _("string") : _("numeric"), - formats[f->type].name)); - - if (!check_output_specifier (f, false) - || !check_specifier_width (f, v->width, false)) + const int max_warnings = 8; + uint8_t raw_type = format >> 16; + uint8_t w = format >> 8; + uint8_t d = format; + struct fmt_spec f; + + bool ok; + + if (!fmt_from_io (raw_type, &f.type)) + sys_error (r, pos, _("Unknown variable format %"PRIu8"."), raw_type); + f.w = w; + f.d = d; + + msg_disable (); + ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v)); + msg_enable (); + + if (ok) { - msg (ME, _("%s variable %s has invalid format specifier %s."), - v->type == NUMERIC ? _("Numeric") : _("String"), - v->name, fmt_to_string (f)); - *f = v->type == NUMERIC ? f8_2 : make_output_format (FMT_A, v->width, 0); + if (which == PRINT_FORMAT) + var_set_print_format (v, &f); + else + var_set_write_format (v, &f); + } + else if (++*n_warnings <= max_warnings) + { + char fmt_string[FMT_STRING_LEN_MAX + 1]; + sys_warn (r, pos, _("%s variable %s has invalid %s format %s."), + var_is_numeric (v) ? _("Numeric") : _("String"), + var_get_name (v), + which == PRINT_FORMAT ? 
_("print") : _("write"), + fmt_to_string (&f, fmt_string)); + + if (*n_warnings == max_warnings) + sys_warn (r, -1, _("Suppressing further invalid format warnings.")); } - return 1; +} + +static void +parse_document (struct dictionary *dict, struct sfm_document_record *record) +{ + const char *p; - error: - return 0; + for (p = record->documents; + p < record->documents + DOC_LINE_LENGTH * record->n_lines; + p += DOC_LINE_LENGTH) + { + struct substring line; + + line = recode_substring_pool ("UTF-8", dict_get_encoding (dict), + ss_buffer (p, DOC_LINE_LENGTH), NULL); + ss_rtrim (&line, ss_cstr (" ")); + line.string[line.length] = '\0'; + + dict_add_document_line (dict, line.string); + + ss_dealloc (&line); + } } -/* Reads value labels from sysfile H and inserts them into the - associated dictionary. */ -int -read_value_labels (struct sfm_reader *r, - struct dictionary *dict, struct variable **var_by_idx) +/* Parses record type 7, subtype 3. */ +static void +parse_machine_integer_info (struct sfm_reader *r, + const struct sfm_extension_record *record, + struct sfm_read_info *info) { - struct label - { - char raw_value[8]; /* Value as uninterpreted bytes. */ - union value value; /* Value. */ - char *label; /* Null-terminated label string. */ - }; + int float_representation, expected_float_format; + int integer_representation, expected_integer_format; + + /* Save version info. */ + info->version_major = parse_int (r, record->data, 0); + info->version_minor = parse_int (r, record->data, 4); + info->version_revision = parse_int (r, record->data, 8); + + /* Check floating point format. 
*/ + float_representation = parse_int (r, record->data, 16); + if (r->float_format == FLOAT_IEEE_DOUBLE_BE + || r->float_format == FLOAT_IEEE_DOUBLE_LE) + expected_float_format = 1; + else if (r->float_format == FLOAT_Z_LONG) + expected_float_format = 2; + else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D) + expected_float_format = 3; + else + NOT_REACHED (); + if (float_representation != expected_float_format) + sys_error (r, record->pos, _("Floating-point representation indicated by " + "system file (%d) differs from expected (%d)."), + float_representation, expected_float_format); + + /* Check integer format. */ + integer_representation = parse_int (r, record->data, 24); + if (r->integer_format == INTEGER_MSB_FIRST) + expected_integer_format = 1; + else if (r->integer_format == INTEGER_LSB_FIRST) + expected_integer_format = 2; + else + NOT_REACHED (); + if (integer_representation != expected_integer_format) + sys_warn (r, record->pos, + _("Integer format indicated by system file (%d) " + "differs from expected (%d)."), + integer_representation, expected_integer_format); - struct label *labels = NULL; - int32_t n_labels; /* Number of labels. */ +} - struct variable **var = NULL; /* Associated variables. */ - int32_t n_vars; /* Number of associated variables. */ +static const char * +choose_encoding (struct sfm_reader *r, + const struct sfm_extension_record *ext_integer, + const struct sfm_extension_record *ext_encoding) +{ + /* The EXT_ENCODING record is a more reliable way to determine dictionary + encoding. */ + if (ext_encoding) + return ext_encoding->data; - int i; + /* But EXT_INTEGER is better than nothing as a fallback. */ + if (ext_integer) + { + int codepage = parse_int (r, ext_integer->data, 7 * 4); - /* First step: read the contents of the type 3 record and record its - contents. Note that we can't do much with the data since we - don't know yet whether it is of numeric or string type. 
*/ + switch (codepage) + { + case 1: + return "EBCDIC-US"; + + case 2: + case 3: + /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic] + respectively. However, there are known to be many files in the wild + with character code 2, yet have data which are clearly not ASCII. + Therefore we ignore these values. */ + break; + + case 4: + return "MS_KANJI"; + + case 65000: + return "UTF-7"; - /* Read number of labels. */ - assertive_buf_read (r, &n_labels, sizeof n_labels, 0); - if (r->reverse_endian) - bswap_int32 (&n_labels); + case 65001: + return "UTF-8"; - if ( n_labels >= ((int32_t) ~0) / sizeof *labels) - { - corrupt_msg(MW, _("%s: Invalid number of labels: %d. Ignoring labels."), - fh_get_file_name (r->fh), n_labels); - n_labels = 0; + default: + return pool_asprintf (r->pool, "CP%d", codepage); + } } - /* Allocate memory. */ - labels = xcalloc (n_labels, sizeof *labels); - for (i = 0; i < n_labels; i++) - labels[i].label = NULL; + return locale_charset (); +} - /* Read each value/label tuple into labels[]. */ - for (i = 0; i < n_labels; i++) +/* Parses record type 7, subtype 4. */ +static void +parse_machine_float_info (struct sfm_reader *r, + const struct sfm_extension_record *record) +{ + double sysmis = parse_float (r, record->data, 0); + double highest = parse_float (r, record->data, 8); + double lowest = parse_float (r, record->data, 16); + + if (sysmis != SYSMIS) + sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."), + sysmis, "SYSMIS"); + + if (highest != HIGHEST) + sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."), + highest, "HIGHEST"); + + if (lowest != LOWEST) + sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."), + lowest, "LOWEST"); +} + +/* Parses record type 7, subtype 7 or 19. 
*/ +static void +parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record, + struct dictionary *dict) +{ + struct text_record *text; + struct mrset *mrset; + + text = open_text_record (r, record); + for (;;) { - struct label *label = labels + i; - unsigned char label_len; - size_t padded_len; + const char *counted = NULL; + const char *name; + const char *label; + struct stringi_set var_names; + size_t allocated_vars; + char delimiter; + int width; + + mrset = xzalloc (sizeof *mrset); + + name = text_get_token (text, ss_cstr ("="), NULL); + if (name == NULL) + break; + mrset->name = xstrdup (name); + + if (mrset->name[0] != '$') + { + sys_warn (r, record->pos, + _("`%s' does not begin with `$' at UTF-8 offset %zu " + "in MRSETS record."), mrset->name, text_pos (text)); + break; + } - /* Read value. */ - assertive_buf_read (r, label->raw_value, sizeof label->raw_value, 0); + if (text_match (text, 'C')) + { + mrset->type = MRSET_MC; + if (!text_match (text, ' ')) + { + sys_warn (r, record->pos, + _("Missing space following `%c' at UTF-8 offset %zu " + "in MRSETS record."), 'C', text_pos (text)); + break; + } + } + else if (text_match (text, 'D')) + { + mrset->type = MRSET_MD; + mrset->cat_source = MRSET_VARLABELS; + } + else if (text_match (text, 'E')) + { + char *number; - /* Read label length. */ - assertive_buf_read (r, &label_len, sizeof label_len, 0); - padded_len = ROUND_UP (label_len + 1, sizeof (flt64)); + mrset->type = MRSET_MD; + mrset->cat_source = MRSET_COUNTEDVALUES; + if (!text_match (text, ' ')) + { + sys_warn (r, record->pos, + _("Missing space following `%c' at UTF-8 offset %zu " + "in MRSETS record."), 'E', text_pos (text)); + break; + } - /* Read label, padding. 
*/ - label->label = xmalloc (padded_len + 1); - assertive_buf_read (r, label->label, padded_len - 1, 0); - label->label[label_len] = 0; - } + number = text_get_token (text, ss_cstr (" "), NULL); + if (!strcmp (number, "11")) + mrset->label_from_var_label = true; + else if (strcmp (number, "1")) + sys_warn (r, record->pos, + _("Unexpected label source value `%s' following `E' " + "at UTF-8 offset %zu in MRSETS record."), + number, text_pos (text)); + } + else + { + sys_warn (r, record->pos, + _("Missing `C', `D', or `E' at UTF-8 offset %zu " + "in MRSETS record."), + text_pos (text)); + break; + } - /* Second step: Read the type 4 record that has the list of - variables to which the value labels are to be applied. */ + if (mrset->type == MRSET_MD) + { + counted = text_parse_counted_string (r, text); + if (counted == NULL) + break; + } - /* Read record type of type 4 record. */ - { - int32_t rec_type; - - assertive_buf_read (r, &rec_type, sizeof rec_type, 0); - if (r->reverse_endian) - bswap_int32 (&rec_type); - - if (rec_type != 4) - lose ((ME, _("%s: Variable index record (type 4) does not immediately " - "follow value label record (type 3) as it should."), - fh_get_file_name (r->fh))); - } + label = text_parse_counted_string (r, text); + if (label == NULL) + break; + mrset->label = label[0] != '\0' ? xstrdup (label) : NULL; - /* Read number of variables associated with value label from type 4 - record. */ - assertive_buf_read (r, &n_vars, sizeof n_vars, 0); - if (r->reverse_endian) - bswap_int32 (&n_vars); - if (n_vars < 1 || n_vars > dict_get_var_cnt (dict)) - lose ((ME, _("%s: Number of variables associated with a value label (%d) " - "is not between 1 and the number of variables (%d)."), - fh_get_file_name (r->fh), n_vars, dict_get_var_cnt (dict))); - - /* Read the list of variables. 
*/ - var = xnmalloc (n_vars, sizeof *var); - for (i = 0; i < n_vars; i++) + stringi_set_init (&var_names); + allocated_vars = 0; + width = INT_MAX; + do + { + struct variable *var; + const char *var_name; + + var_name = text_get_token (text, ss_cstr (" \n"), &delimiter); + if (var_name == NULL) + { + sys_warn (r, record->pos, + _("Missing new-line parsing variable names " + "at UTF-8 offset %zu in MRSETS record."), + text_pos (text)); + break; + } + + var = dict_lookup_var (dict, var_name); + if (var == NULL) + continue; + if (!stringi_set_insert (&var_names, var_name)) + { + sys_warn (r, record->pos, + _("Duplicate variable name %s " + "at UTF-8 offset %zu in MRSETS record."), + var_name, text_pos (text)); + continue; + } + + if (mrset->label == NULL && mrset->label_from_var_label + && var_has_label (var)) + mrset->label = xstrdup (var_get_label (var)); + + if (mrset->n_vars + && var_get_type (var) != var_get_type (mrset->vars[0])) + { + sys_warn (r, record->pos, + _("MRSET %s contains both string and " + "numeric variables."), name); + continue; + } + width = MIN (width, var_get_width (var)); + + if (mrset->n_vars >= allocated_vars) + mrset->vars = x2nrealloc (mrset->vars, &allocated_vars, + sizeof *mrset->vars); + mrset->vars[mrset->n_vars++] = var; + } + while (delimiter != '\n'); + + if (mrset->n_vars < 2) + { + sys_warn (r, record->pos, + _("MRSET %s has only %zu variables."), mrset->name, + mrset->n_vars); + mrset_destroy (mrset); + continue; + } + + if (mrset->type == MRSET_MD) + { + mrset->width = width; + value_init (&mrset->counted, width); + if (width == 0) + mrset->counted.f = strtod (counted, NULL); + else + value_copy_str_rpad (&mrset->counted, width, + (const uint8_t *) counted, ' '); + } + + dict_add_mrset (dict, mrset); + mrset = NULL; + stringi_set_destroy (&var_names); + } + mrset_destroy (mrset); + close_text_record (r, text); +} + +/* Read record type 7, subtype 11, which specifies how variables + should be displayed in GUI environments. 
*/ +static void +parse_display_parameters (struct sfm_reader *r, + const struct sfm_extension_record *record, + struct dictionary *dict) +{ + bool includes_width; + bool warned = false; + size_t n_vars; + size_t ofs; + size_t i; + + n_vars = dict_get_var_cnt (dict); + if (record->count == 3 * n_vars) + includes_width = true; + else if (record->count == 2 * n_vars) + includes_width = false; + else { - int32_t var_idx; - struct variable *v; - - /* Read variable index, check range. */ - assertive_buf_read (r, &var_idx, sizeof var_idx, 0); - if (r->reverse_endian) - bswap_int32 (&var_idx); - if (var_idx < 1 || var_idx > r->value_cnt) - lose ((ME, _("%s: Variable index associated with value label (%d) is " - "not between 1 and the number of values (%d)."), - fh_get_file_name (r->fh), var_idx, r->value_cnt)); - - /* Make sure it's a real variable. */ - v = var_by_idx[var_idx - 1]; - if (v == NULL) - lose ((ME, _("%s: Variable index associated with value label (%d) " - "refers to a continuation of a string variable, not to " - "an actual variable."), - fh_get_file_name (r->fh), var_idx)); - if (v->type == ALPHA && v->width > MAX_SHORT_STRING) - lose ((ME, _("%s: Value labels are not allowed on long string " - "variables (%s)."), - fh_get_file_name (r->fh), v->name)); - - /* Add it to the list of variables. */ - var[i] = v; + sys_warn (r, record->pos, + _("Extension 11 has bad count %zu (for %zu variables)."), + record->count, n_vars); + return; } - /* Type check the variables. */ - for (i = 1; i < n_vars; i++) - if (var[i]->type != var[0]->type) - lose ((ME, _("%s: Variables associated with value label are not all of " - "identical type. Variable %s has %s type, but variable " - "%s has %s type."), - fh_get_file_name (r->fh), - var[0]->name, var[0]->type == ALPHA ? _("string") : _("numeric"), - var[i]->name, var[i]->type == ALPHA ? _("string") : _("numeric"))); - - /* Fill in labels[].value, now that we know the desired type. 
*/ - for (i = 0; i < n_labels; i++) + ofs = 0; + for (i = 0; i < n_vars; ++i) { - struct label *label = labels + i; - - if (var[0]->type == ALPHA) + struct variable *v = dict_get_var (dict, i); + int measure, width, align; + + measure = parse_int (r, record->data, ofs); + ofs += 4; + + if (includes_width) { - const int copy_len = min (sizeof label->raw_value, - sizeof label->label); - memcpy (label->value.s, label->raw_value, copy_len); - } else { - flt64 f; - assert (sizeof f == sizeof label->raw_value); - memcpy (&f, label->raw_value, sizeof f); - if (r->reverse_endian) - bswap_flt64 (&f); - label->value.f = f; + width = parse_int (r, record->data, ofs); + ofs += 4; } + else + width = 0; + + align = parse_int (r, record->data, ofs); + ofs += 4; + + /* SPSS 14 sometimes seems to set string variables' measure + to zero. */ + if (0 == measure && var_is_alpha (v)) + measure = 1; + + if (measure < 1 || measure > 3 || align < 0 || align > 2) + { + if (!warned) + sys_warn (r, record->pos, + _("Invalid variable display parameters for variable " + "%zu (%s). Default parameters substituted."), + i, var_get_name (v)); + warned = true; + continue; + } + + var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL + : measure == 2 ? MEASURE_ORDINAL + : MEASURE_SCALE)); + var_set_alignment (v, (align == 0 ? ALIGN_LEFT + : align == 1 ? ALIGN_RIGHT + : ALIGN_CENTRE)); + + /* Older versions (SPSS 9.0) sometimes set the display + width to zero. This causes confusion in the GUI, so + only set the width if it is nonzero. */ + if (width > 0) + var_set_display_width (v, width); } - - /* Assign the value_label's to each variable. */ - for (i = 0; i < n_vars; i++) +} + +static void +rename_var_and_save_short_names (struct dictionary *dict, struct variable *var, + const char *new_name) +{ + size_t n_short_names; + char **short_names; + size_t i; + + /* Renaming a variable may clear its short names, but we + want to retain them, so we save them and re-set them + afterward. 
*/ + n_short_names = var_get_short_name_cnt (var); + short_names = xnmalloc (n_short_names, sizeof *short_names); + for (i = 0; i < n_short_names; i++) + { + const char *s = var_get_short_name (var, i); + short_names[i] = s != NULL ? xstrdup (s) : NULL; + } + + /* Set long name. */ + dict_rename_var (dict, var, new_name); + + /* Restore short names. */ + for (i = 0; i < n_short_names; i++) + { + var_set_short_name (var, i, short_names[i]); + free (short_names[i]); + } + free (short_names); +} + +/* Parses record type 7, subtype 13, which gives the long name that corresponds + to each short name. Modifies variable names in DICT accordingly. */ +static void +parse_long_var_name_map (struct sfm_reader *r, + const struct sfm_extension_record *record, + struct dictionary *dict) +{ + struct text_record *text; + struct variable *var; + char *long_name; + + if (record == NULL) { - struct variable *v = var[i]; - int j; + /* Convert variable names to lowercase. */ + size_t i; - /* Add each label to the variable. */ - for (j = 0; j < n_labels; j++) + for (i = 0; i < dict_get_var_cnt (dict); i++) { - struct label *label = labels + j; - if (!val_labs_replace (v->val_labs, label->value, label->label)) - continue; - - if (var[0]->type == NUMERIC) - msg (MW, _("%s: File contains duplicate label for value %g for " - "variable %s."), - fh_get_file_name (r->fh), label->value.f, v->name); - else - msg (MW, _("%s: File contains duplicate label for value `%.*s' " - "for variable %s."), - fh_get_file_name (r->fh), v->width, label->value.s, v->name); + struct variable *var = dict_get_var (dict, i); + char *new_name; + + new_name = xstrdup (var_get_name (var)); + str_lowercase (new_name); + + rename_var_and_save_short_names (dict, var, new_name); + + free (new_name); } + + return; } - for (i = 0; i < n_labels; i++) - free (labels[i].label); - free (labels); - free (var); - return 1; + /* Rename each of the variables, one by one. 
(In a correctly constructed + system file, this cannot create any intermediate duplicate variable names, + because all of the new variable names are longer than any of the old + variable names and thus there cannot be any overlaps.) */ + text = open_text_record (r, record); + while (read_variable_to_value_pair (r, dict, text, &var, &long_name)) + { + /* Validate long name. */ + if (!var_is_valid_name (long_name, false)) + { + sys_warn (r, record->pos, + _("Long variable mapping from %s to invalid " + "variable name `%s'."), + var_get_name (var), long_name); + continue; + } + + /* Identify any duplicates. */ + if (strcasecmp (var_get_short_name (var, 0), long_name) + && dict_lookup_var (dict, long_name) != NULL) + { + sys_warn (r, record->pos, + _("Duplicate long variable name `%s'."), long_name); + continue; + } + + rename_var_and_save_short_names (dict, var, long_name); + } + close_text_record (r, text); +} - error: - if (labels) +/* Reads record type 7, subtype 14, which gives the real length + of each very long string. Rearranges DICT accordingly. */ +static void +parse_long_string_map (struct sfm_reader *r, + const struct sfm_extension_record *record, + struct dictionary *dict) +{ + struct text_record *text; + struct variable *var; + char *length_s; + + text = open_text_record (r, record); + while (read_variable_to_value_pair (r, dict, text, &var, &length_s)) { - for (i = 0; i < n_labels; i++) - free (labels[i].label); - free (labels); + size_t idx = var_get_dict_index (var); + long int length; + int segment_cnt; + int i; + + /* Get length. */ + length = strtol (length_s, NULL, 10); + if (length < 1 || length > MAX_STRING) + { + sys_warn (r, record->pos, + _("%s listed as string of invalid length %s " + "in very long string record."), + var_get_name (var), length_s); + continue; + } + + /* Check segments. 
*/ + segment_cnt = sfm_width_to_segments (length); + if (segment_cnt == 1) + { + sys_warn (r, record->pos, + _("%s listed in very long string record with width %s, " + "which requires only one segment."), + var_get_name (var), length_s); + continue; + } + if (idx + segment_cnt > dict_get_var_cnt (dict)) + sys_error (r, record->pos, + _("Very long string %s overflows dictionary."), + var_get_name (var)); + + /* Get the short names from the segments and check their + lengths. */ + for (i = 0; i < segment_cnt; i++) + { + struct variable *seg = dict_get_var (dict, idx + i); + int alloc_width = sfm_segment_alloc_width (length, i); + int width = var_get_width (seg); + + if (i > 0) + var_set_short_name (var, i, var_get_short_name (seg, 0)); + if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8)) + sys_error (r, record->pos, + _("Very long string with width %ld has segment %d " + "of width %d (expected %d)."), + length, i, width, alloc_width); + } + dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1); + var_set_width (var, length); } - free (var); - return 0; + close_text_record (r, text); + dict_compact_values (dict); } -/* Reads BYTE_CNT bytes from the file represented by H. If BUF is - non-NULL, uses that as the buffer; otherwise allocates at least - MIN_ALLOC bytes. Returns a pointer to the buffer on success, NULL - on failure. 
*/ -static void * -buf_read (struct sfm_reader *r, void *buf, size_t byte_cnt, size_t min_alloc) +static void +parse_value_labels (struct sfm_reader *r, struct dictionary *dict, + const struct sfm_var_record *var_recs, size_t n_var_recs, + const struct sfm_value_label_record *record) { - assert (r); + struct variable **vars; + char **utf8_labels; + size_t i; + + utf8_labels = pool_nmalloc (r->pool, sizeof *utf8_labels, record->n_labels); + for (i = 0; i < record->n_labels; i++) + utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict), + record->labels[i].label, -1, + r->pool); + + vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars); + for (i = 0; i < record->n_vars; i++) + vars[i] = lookup_var_by_index (r, record->pos, + var_recs, n_var_recs, record->vars[i]); + + for (i = 1; i < record->n_vars; i++) + if (var_get_type (vars[i]) != var_get_type (vars[0])) + sys_error (r, record->pos, + _("Variables associated with value label are not all of " + "identical type. Variable %s is %s, but variable " + "%s is %s."), + var_get_name (vars[0]), + var_is_numeric (vars[0]) ? _("numeric") : _("string"), + var_get_name (vars[i]), + var_is_numeric (vars[i]) ? _("numeric") : _("string")); + + for (i = 0; i < record->n_vars; i++) + { + struct variable *var = vars[i]; + int width; + size_t j; + + width = var_get_width (var); + if (width > 8) + sys_error (r, record->pos, + _("Value labels may not be added to long string " + "variables (e.g. 
%s) using records types 3 and 4."), + var_get_name (var)); + + for (j = 0; j < record->n_labels; j++) + { + struct sfm_value_label *label = &record->labels[j]; + union value value; - if (buf == NULL && byte_cnt > 0 ) - buf = xmalloc (max (byte_cnt, min_alloc)); + value_init (&value, width); + if (width == 0) + value.f = parse_float (r, label->value, 0); + else + memcpy (value_str_rw (&value, width), label->value, width); - if ( byte_cnt == 0 ) - return buf; + if (!var_add_value_label (var, &value, utf8_labels[j])) + { + if (var_is_numeric (var)) + sys_warn (r, record->pos, + _("Duplicate value label for %g on %s."), + value.f, var_get_name (var)); + else + sys_warn (r, record->pos, + _("Duplicate value label for `%.*s' on %s."), + width, value_str (&value, width), + var_get_name (var)); + } - - if (1 != fread (buf, byte_cnt, 1, r->file)) + value_destroy (&value, width); + } + } + + pool_free (r->pool, vars); + for (i = 0; i < record->n_labels; i++) + pool_free (r->pool, utf8_labels[i]); + pool_free (r->pool, utf8_labels); +} + +static struct variable * +lookup_var_by_index (struct sfm_reader *r, off_t offset, + const struct sfm_var_record *var_recs, size_t n_var_recs, + int idx) +{ + const struct sfm_var_record *rec; + + if (idx < 1 || idx > n_var_recs) { - if (ferror (r->file)) - msg (ME, _("%s: Reading system file: %s."), - fh_get_file_name (r->fh), strerror (errno)); - else - corrupt_msg (ME, _("%s: Unexpected end of file."), - fh_get_file_name (r->fh)); - r->ok = false; + sys_error (r, offset, + _("Variable index %d not in valid range 1...%d."), + idx, n_var_recs); return NULL; } - return buf; + rec = &var_recs[idx - 1]; + if (rec->var == NULL) + { + sys_error (r, offset, + _("Variable index %d refers to long string continuation."), + idx); + return NULL; + } + + return rec->var; } -/* Winds the reader BYTE_CNT bytes back in the reader stream. */ -void -buf_unread(struct sfm_reader *r, size_t byte_cnt) +/* Parses a set of custom attributes from TEXT into ATTRS. 
+ ATTRS may be a null pointer, in which case the attributes are + read but discarded. */ +static void +parse_attributes (struct sfm_reader *r, struct text_record *text, + struct attrset *attrs) { - assert(byte_cnt > 0); - - if ( 0 != fseek(r->file, -byte_cnt, SEEK_CUR)) + do { - msg (ME, _("%s: Seeking system file: %s."), - fh_get_file_name (r->fh), strerror (errno)); + struct attribute *attr; + char *key; + int index; + + /* Parse the key. */ + key = text_get_token (text, ss_cstr ("("), NULL); + if (key == NULL) + return; + + attr = attribute_create (key); + for (index = 1; ; index++) + { + /* Parse the value. */ + char *value; + size_t length; + + value = text_get_token (text, ss_cstr ("\n"), NULL); + if (value == NULL) + { + text_warn (r, text, _("Error parsing attribute value %s[%d]."), + key, index); + break; + } + + length = strlen (value); + if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'') + { + value[length - 1] = '\0'; + attribute_add_value (attr, value + 1); + } + else + { + text_warn (r, text, + _("Attribute value %s[%d] is not quoted: %s."), + key, index, value); + attribute_add_value (attr, value); + } + + /* Was this the last value for this attribute? */ + if (text_match (text, ')')) + break; + } + if (attrs != NULL) + attrset_add (attrs, attr); + else + attribute_destroy (attr); } + while (!text_match (text, '/')); } -/* Reads a document record, type 6, from system file R, and sets up - the documents and n_documents fields in the associated - dictionary. */ -static int -read_documents (struct sfm_reader *r, struct dictionary *dict) +/* Reads record type 7, subtype 17, which lists custom + attributes on the data file. 
*/ +static void +parse_data_file_attributes (struct sfm_reader *r, + const struct sfm_extension_record *record, + struct dictionary *dict) { - int32_t line_cnt; - char *documents; - - if (dict_get_documents (dict) != NULL) - lose ((ME, _("%s: System file contains multiple " - "type 6 (document) records."), - fh_get_file_name (r->fh))); - - assertive_buf_read (r, &line_cnt, sizeof line_cnt, 0); - if (line_cnt <= 0) - lose ((ME, _("%s: Number of document lines (%ld) " - "must be greater than 0."), - fh_get_file_name (r->fh), (long) line_cnt)); - - documents = buf_read (r, NULL, 80 * line_cnt, line_cnt * 80 + 1); - /* FIXME? Run through asciify. */ - if (documents == NULL) - return 0; - documents[80 * line_cnt] = '\0'; - dict_set_documents (dict, documents); - free (documents); - return 1; - - error: - return 0; + struct text_record *text = open_text_record (r, record); + parse_attributes (r, text, dict_get_attributes (dict)); + close_text_record (r, text); } - -/* Data reader. */ -/* Reads compressed data into H->BUF and sets other pointers - appropriately. Returns nonzero only if both no errors occur and - data was read. */ -static int -buffer_input (struct sfm_reader *r) +/* Parses record type 7, subtype 18, which lists custom + attributes on individual variables. 
*/ +static void +parse_variable_attributes (struct sfm_reader *r, + const struct sfm_extension_record *record, + struct dictionary *dict) { - size_t amt; + struct text_record *text; + struct variable *var; - if (!r->ok) - return false; - if (r->buf == NULL) - r->buf = xnmalloc (128, sizeof *r->buf); - amt = fread (r->buf, sizeof *r->buf, 128, r->file); - if (ferror (r->file)) - { - msg (ME, _("%s: Error reading file: %s."), - fh_get_file_name (r->fh), strerror (errno)); - r->ok = false; - return 0; - } - r->ptr = r->buf; - r->end = &r->buf[amt]; - return amt; + text = open_text_record (r, record); + while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var)) + parse_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL); + close_text_record (r, text); } -/* Reads a single case consisting of compressed data from system - file H into the array BUF[] according to reader R, and - returns nonzero only if successful. */ -/* Data in system files is compressed in this manner. Data - values are grouped into sets of eight ("octets"). Each value - in an octet has one instruction byte that are output together. - Each instruction byte gives a value for that byte or indicates - that the value can be found following the instructions. 
*/ -static int -read_compressed_data (struct sfm_reader *r, flt64 *buf) +static void +check_overflow (struct sfm_reader *r, + const struct sfm_extension_record *record, + size_t ofs, size_t length) { - const unsigned char *p_end = r->x + sizeof (flt64); - unsigned char *p = r->y; + size_t end = record->size * record->count; + if (length >= end || ofs + length > end) + sys_error (r, record->pos + end, + _("Long string value label record ends unexpectedly.")); +} - const flt64 *buf_beg = buf; - const flt64 *buf_end = &buf[r->value_cnt]; +static void +parse_long_string_value_labels (struct sfm_reader *r, + const struct sfm_extension_record *record, + struct dictionary *dict) +{ + const char *dict_encoding = dict_get_encoding (dict); + size_t end = record->size * record->count; + size_t ofs = 0; - for (;;) + while (ofs < end) { - for (; p < p_end; p++){ - switch (*p) - { - case 0: - /* Code 0 is ignored. */ - continue; - case 252: - /* Code 252 is end of file. */ - if (buf_beg == buf) - return 0; - lose ((ME, _("%s: Compressed data is corrupted. Data ends " - "in partial case."), - fh_get_file_name (r->fh))); - case 253: - /* Code 253 indicates that the value is stored explicitly - following the instruction bytes. */ - if (r->ptr == NULL || r->ptr >= r->end) - if (!buffer_input (r)) - lose ((ME, _("%s: Unexpected end of file."), - fh_get_file_name (r->fh))); - memcpy (buf++, r->ptr++, sizeof *buf); - if (buf >= buf_end) - goto success; - break; - case 254: - /* Code 254 indicates a string that is all blanks. */ - memset (buf++, ' ', sizeof *buf); - if (buf >= buf_end) - goto success; - break; - case 255: - /* Code 255 indicates the system-missing value. */ - *buf = r->sysmis; - if (r->reverse_endian) - bswap_flt64 (buf); - buf++; - if (buf >= buf_end) - goto success; - break; - default: - /* Codes 1 through 251 inclusive are taken to indicate a - value of (BYTE - BIAS), where BYTE is the byte's value - and BIAS is the compression bias (generally 100.0). 
*/ - *buf = *p - r->bias; - if (r->reverse_endian) - bswap_flt64 (buf); - buf++; - if (buf >= buf_end) - goto success; - break; - } - } - /* We have reached the end of this instruction octet. Read - another. */ - if (r->ptr == NULL || r->ptr >= r->end) + char *var_name; + size_t n_labels, i; + struct variable *var; + union value value; + int var_name_len; + int width; + + /* Parse variable name length. */ + check_overflow (r, record, ofs, 4); + var_name_len = parse_int (r, record->data, ofs); + ofs += 4; + + /* Parse variable name, width, and number of labels. */ + check_overflow (r, record, ofs, var_name_len + 8); + var_name = recode_string_pool ("UTF-8", dict_encoding, + (const char *) record->data + ofs, + var_name_len, r->pool); + width = parse_int (r, record->data, ofs + var_name_len); + n_labels = parse_int (r, record->data, ofs + var_name_len + 4); + ofs += var_name_len + 8; + + /* Look up 'var' and validate. */ + var = dict_lookup_var (dict, var_name); + if (var == NULL) + sys_warn (r, record->pos + ofs, + _("Ignoring long string value record for " + "unknown variable %s."), var_name); + else if (var_is_numeric (var)) { - if (!buffer_input (r)) - { - if (buf_beg != buf) - lose ((ME, _("%s: Unexpected end of file."), - fh_get_file_name (r->fh))); - else - return 0; - } + sys_warn (r, record->pos + ofs, + _("Ignoring long string value record for " + "numeric variable %s."), var_name); + var = NULL; + } + else if (width != var_get_width (var)) + { + sys_warn (r, record->pos + ofs, + _("Ignoring long string value record for variable %s " + "because the record's width (%d) does not match the " + "variable's width (%d)."), + var_name, width, var_get_width (var)); + var = NULL; } - memcpy (r->x, r->ptr++, sizeof *buf); - p = r->x; - } - NOT_REACHED (); + /* Parse values. 
*/ + value_init_pool (r->pool, &value, width); + for (i = 0; i < n_labels; i++) + { + size_t value_length, label_length; + bool skip = var == NULL; - success: - /* We have filled up an entire record. Update state and return - successfully. */ - r->y = ++p; - return 1; + /* Parse value length. */ + check_overflow (r, record, ofs, 4); + value_length = parse_int (r, record->data, ofs); + ofs += 4; - error: - /* I/O error. */ - r->ok = false; - return 0; + /* Parse value. */ + check_overflow (r, record, ofs, value_length); + if (!skip) + { + if (value_length == width) + memcpy (value_str_rw (&value, width), + (const uint8_t *) record->data + ofs, width); + else + { + sys_warn (r, record->pos + ofs, + _("Ignoring long string value %zu for variable " + "%s, with width %d, that has bad value " + "width %zu."), + i, var_get_name (var), width, value_length); + skip = true; + } + } + ofs += value_length; + + /* Parse label length. */ + check_overflow (r, record, ofs, 4); + label_length = parse_int (r, record->data, ofs); + ofs += 4; + + /* Parse label. */ + check_overflow (r, record, ofs, label_length); + if (!skip) + { + char *label; + + label = recode_string_pool ("UTF-8", dict_encoding, + (const char *) record->data + ofs, + label_length, r->pool); + if (!var_add_value_label (var, &value, label)) + sys_warn (r, record->pos + ofs, + _("Duplicate value label for `%.*s' on %s."), + width, value_str (&value, width), + var_get_name (var)); + pool_free (r->pool, label); + } + ofs += label_length; + } + } } + +/* Case reader. */ + +static void partial_record (struct sfm_reader *r) + NO_RETURN; -/* Reads one case from READER's file into C. Returns nonzero - only if successful. 
*/ -int -sfm_read_case (struct sfm_reader *r, struct ccase *c) +static void read_error (struct casereader *, const struct sfm_reader *); + +static bool read_case_number (struct sfm_reader *, double *); +static bool read_case_string (struct sfm_reader *, uint8_t *, size_t); +static int read_opcode (struct sfm_reader *); +static bool read_compressed_number (struct sfm_reader *, double *); +static bool read_compressed_string (struct sfm_reader *, uint8_t *); +static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t); +static bool skip_whole_strings (struct sfm_reader *, size_t); + +/* Reads and returns one case from READER's file. Returns a null + pointer if not successful. */ +static struct ccase * +sys_file_casereader_read (struct casereader *reader, void *r_) { - if (!r->ok) - return 0; + struct sfm_reader *r = r_; + struct ccase *volatile c; + int i; - if (!r->compressed && sizeof (flt64) == sizeof (double) && ! r->has_vls) + if (r->error) + return NULL; + + c = case_create (r->proto); + if (setjmp (r->bail_out)) { - /* Fast path: external and internal representations are the - same, except possibly for endianness or SYSMIS. Read - directly into the case's buffer, then fix up any minor - details as needed. */ - if (!fread_ok (r, case_data_all_rw (c), - sizeof (union value) * r->value_cnt)) - return 0; - - /* Fix up endianness if needed. */ - if (r->reverse_endian) + casereader_force_error (reader); + case_unref (c); + return NULL; + } + + for (i = 0; i < r->sfm_var_cnt; i++) + { + struct sfm_var *sv = &r->sfm_vars[i]; + union value *v = case_data_rw_idx (c, sv->case_index); + + if (sv->var_width == 0) { - int i; - - for (i = 0; i < r->var_cnt; i++) - if (r->vars[i].width == 0) - bswap_flt64 (&case_data_rw (c, r->vars[i].fv)->f); + if (!read_case_number (r, &v->f)) + goto eof; } - - /* Fix up SYSMIS values if needed. - I don't think this will ever actually kick in, but it - can't hurt. 
*/ - if (r->sysmis != SYSMIS) + else { - int i; - - for (i = 0; i < r->var_cnt; i++) - if (r->vars[i].width == 0 && case_num (c, i) == r->sysmis) - case_data_rw (c, r->vars[i].fv)->f = SYSMIS; + uint8_t *s = value_str_rw (v, sv->var_width); + if (!read_case_string (r, s + sv->offset, sv->segment_width)) + goto eof; + if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8))) + partial_record (r); } } - else + return c; + +eof: + if (i != 0) + partial_record (r); + if (r->case_cnt != -1) + read_error (reader, r); + case_unref (c); + return NULL; +} + +/* Issues an error that R ends in a partial record. */ +static void +partial_record (struct sfm_reader *r) +{ + sys_error (r, r->pos, _("File ends in partial case.")); +} + +/* Issues an error that an unspecified error occurred SFM, and + marks R tainted. */ +static void +read_error (struct casereader *r, const struct sfm_reader *sfm) +{ + msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh)); + casereader_force_error (r); +} + +/* Reads a number from R and stores its value in *D. + If R is compressed, reads a compressed number; + otherwise, reads a number in the regular way. + Returns true if successful, false if end of file is + reached immediately. */ +static bool +read_case_number (struct sfm_reader *r, double *d) +{ + if (!r->compressed) { - /* Slow path: internal and external representations differ. - Read into a bounce buffer, then copy to C. */ - flt64 *bounce; - flt64 *bounce_cur; - size_t bounce_size; - int read_ok; - int i; + uint8_t number[8]; + if (!try_read_bytes (r, number, sizeof number)) + return false; + float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d); + return true; + } + else + return read_compressed_number (r, d); +} - bounce_size = sizeof *bounce * r->value_cnt; - bounce = bounce_cur = local_alloc (bounce_size); +/* Reads LENGTH string bytes from R into S. 
+ Always reads a multiple of 8 bytes; if LENGTH is not a + multiple of 8, then extra bytes are read and discarded without + being written to S. + Reads compressed strings if S is compressed. + Returns true if successful, false if end of file is + reached immediately. */ +static bool +read_case_string (struct sfm_reader *r, uint8_t *s, size_t length) +{ + size_t whole = ROUND_DOWN (length, 8); + size_t partial = length % 8; - memset(bounce, 0, bounce_size); + if (whole) + { + if (!read_whole_strings (r, s, whole)) + return false; + } - if (!r->compressed) - read_ok = fread_ok (r, bounce, bounce_size); - else - read_ok = read_compressed_data (r, bounce); - if (!read_ok) + if (partial) + { + uint8_t bounce[8]; + if (!read_whole_strings (r, bounce, sizeof bounce)) { - local_free (bounce); - return 0; + if (whole) + partial_record (r); + return false; } + memcpy (s + whole, bounce, partial); + } + + return true; +} - for (i = 0; i < r->var_cnt; i++) +/* Reads and returns the next compression opcode from R. */ +static int +read_opcode (struct sfm_reader *r) +{ + assert (r->compressed); + for (;;) + { + int opcode; + if (r->opcode_idx >= sizeof r->opcodes) { - struct sfm_var *sv = &r->vars[i]; + if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes)) + return -1; + r->opcode_idx = 0; + } + opcode = r->opcodes[r->opcode_idx++]; - if (sv->width == 0) - { - flt64 f = *bounce_cur++; - if (r->reverse_endian) - bswap_flt64 (&f); - case_data_rw (c, sv->fv)->f = f == r->sysmis ? SYSMIS : f; - } - else - { - flt64 *bc_start = bounce_cur; - int ofs = 0; - while (ofs < sv->width ) - { - const int chunk = MIN (MAX_LONG_STRING, sv->width - ofs); - memcpy (case_data_rw (c, sv->fv)->s + ofs, bounce_cur, chunk); + if (opcode != 0) + return opcode; + } +} - bounce_cur += DIV_RND_UP (chunk, sizeof (flt64)); +/* Reads a compressed number from R and stores its value in D. + Returns true if successful, false if end of file is + reached immediately. 
*/ +static bool +read_compressed_number (struct sfm_reader *r, double *d) +{ + int opcode = read_opcode (r); + switch (opcode) + { + case -1: + case 252: + return false; - ofs += chunk; - } - bounce_cur = bc_start + width_to_bytes(sv->width) / sizeof(flt64); - } + case 253: + *d = read_float (r); + break; + + case 254: + float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d); + if (!r->corruption_warning) + { + r->corruption_warning = true; + sys_warn (r, r->pos, + _("Possible compressed data corruption: " + "compressed spaces appear in numeric field.")); } + break; + + case 255: + *d = SYSMIS; + break; - local_free (bounce); + default: + *d = opcode - r->bias; + break; } - return 1; + + return true; } -static int -fread_ok (struct sfm_reader *r, void *buffer, size_t byte_cnt) +/* Reads a compressed 8-byte string segment from R and stores it + in DST. + Opcode 253 is followed in the file by the 8 bytes themselves and + opcode 254 stands for 8 spaces; opcode 252 is treated the same as + end of file. Any other opcode is converted, as the number + OPCODE - bias, into R's float format and stored in DST, with a + corruption warning (issued only while R's corruption_warning flag + is still unset) unless that number is zero. + Returns true if successful, false if end of file is + reached immediately. */ +static bool +read_compressed_string (struct sfm_reader *r, uint8_t *dst) { - size_t read_bytes = fread (buffer, 1, byte_cnt, r->file); + int opcode = read_opcode (r); + switch (opcode) + { + case -1: + case 252: + return false; + + case 253: + read_bytes (r, dst, 8); + break; - if (read_bytes == byte_cnt) - return 1; + case 254: + memset (dst, ' ', 8); + break; + + default: + { + double value = opcode - r->bias; + float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst); + if (value == 0.0) + { + /* This has actually been seen "in the wild". The submitter of the + file showed that the contents decoded as spaces, but they + were at the end of the field so it's possible that the null + bytes just acted as null terminators. Hence no warning is + issued for a zero value. */ + } + else if (!r->corruption_warning) + { + r->corruption_warning = true; + sys_warn (r, r->pos, + _("Possible compressed data corruption: " + "string contains compressed integer (opcode %d)."), + opcode); + } + } + break; + } + + return true; +} + +/* Reads LENGTH string bytes from R into S. 
+ LENGTH must be a multiple of 8. + Reads compressed strings if S is compressed. + Returns true if successful, false if end of file is + reached immediately. */ +static bool +read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length) +{ + assert (length % 8 == 0); + if (!r->compressed) + return try_read_bytes (r, s, length); else { - if (ferror (r->file)) - { - msg (ME, _("%s: Reading system file: %s."), - fh_get_file_name (r->fh), strerror (errno)); - r->ok = false; - } - else if (read_bytes != 0) - { - msg (ME, _("%s: Partial record at end of system file."), - fh_get_file_name (r->fh)); - r->ok = false; - } - return 0; + size_t ofs; + for (ofs = 0; ofs < length; ofs += 8) + if (!read_compressed_string (r, s + ofs)) + { + if (ofs != 0) + partial_record (r); + return false; + } + return true; } } + +/* Skips LENGTH string bytes from R by reading them into a local + scratch buffer that is then discarded. + LENGTH must be a multiple of 8. + (LENGTH must also be less than 1024, the size of the scratch + buffer, as the assertion below enforces; that limit exists only + because the current caller never needs more than that many bytes. + Note that a LENGTH of exactly 1024 fails the assertion.) + Returns true if successful, false if end of file is + reached immediately. */ +static bool +skip_whole_strings (struct sfm_reader *r, size_t length) +{ + uint8_t buffer[1024]; + assert (length < sizeof buffer); + return read_whole_strings (r, buffer, length); +} -/* Returns true if an I/O error has occurred on READER, false - otherwise. */ -bool -sfm_read_error (const struct sfm_reader *reader) +/* Helpers for reading records that contain structured text + strings. */ + +/* Maximum number of warnings to issue for a single text + record. */ +#define MAX_TEXT_WARNINGS 5 + +/* State for parsing one text record. */ +struct text_record + { + struct substring buffer; /* Record contents, in UTF-8. */ + off_t start; /* Starting offset in file. */ + size_t pos; /* Current position in buffer. */ + int n_warnings; /* Number of warnings issued or suppressed. 
*/ + }; +/* Creates and returns a text_record, allocated from R's pool, whose + buffer holds RECORD's data (SIZE * COUNT bytes) recoded from R's + encoding into UTF-8. The record starts out at position 0 with no + warnings counted. */ +static struct text_record * +open_text_record (struct sfm_reader *r, + const struct sfm_extension_record *record) { - return !reader->ok; + struct text_record *text; + struct substring raw; + + text = pool_alloc (r->pool, sizeof *text); + raw = ss_buffer (record->data, record->size * record->count); + text->start = record->pos; + text->buffer = recode_substring_pool ("UTF-8", r->encoding, raw, r->pool); + text->pos = 0; + text->n_warnings = 0; + + return text; } -/* Returns true if FILE is an SPSS system file, - false otherwise. */ -bool -sfm_detect (FILE *file) +/* Closes TEXT, releasing its buffer back to R's pool, and issues a + final warning about suppressed warnings if necessary, that is, if + more than MAX_TEXT_WARNINGS warnings were counted for TEXT. */ +static void +close_text_record (struct sfm_reader *r, struct text_record *text) +{ + if (text->n_warnings > MAX_TEXT_WARNINGS) + sys_warn (r, -1, _("Suppressed %d additional related warnings."), + text->n_warnings - MAX_TEXT_WARNINGS); + pool_free (r->pool, ss_data (text->buffer)); +} + +/* Reads a variable=value pair from TEXT. + Looks up the variable in DICT and stores it into *VAR. + Stores a null-terminated value into *VALUE. 
*/ +static bool +read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict, + struct text_record *text, + struct variable **var, char **value) +{ + for (;;) + { + if (!text_read_short_name (r, dict, text, ss_cstr ("="), var)) + return false; + + *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL); + if (*value == NULL) + return false; + + text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX), + ss_buffer ("\t\0", 2)); + + if (*var != NULL) + return true; + } +} +/* Reads a token, delimited by DELIMITERS, from TEXT and looks it up + as a variable name in DICT. + If the variable exists, stores it in *VAR and returns true. + Returns false if no token remains or, after issuing a warning, if + DICT has no variable by that name. */ +static bool +text_read_variable_name (struct sfm_reader *r, struct dictionary *dict, + struct text_record *text, struct substring delimiters, + struct variable **var) +{ + char *name; + + name = text_get_token (text, delimiters, NULL); + if (name == NULL) + return false; + + *var = dict_lookup_var (dict, name); + if (*var != NULL) + return true; + + text_warn (r, text, _("Dictionary record refers to unknown variable %s."), + name); + return false; +} + +/* Reads a token, delimited by DELIMITERS, from TEXT and looks it up + in DICT with dict_lookup_var(), storing the result in *VAR. + An unknown name only draws a warning: *VAR is then NULL and true + is still returned. Returns false only if no token remains. */ +static bool +text_read_short_name (struct sfm_reader *r, struct dictionary *dict, + struct text_record *text, struct substring delimiters, + struct variable **var) +{ + char *short_name = text_get_token (text, delimiters, NULL); + if (short_name == NULL) + return false; + + *var = dict_lookup_var (dict, short_name); + if (*var == NULL) + text_warn (r, text, _("Dictionary record refers to unknown variable %s."), + short_name); + return true; +} + +/* Displays a warning for the current file position, limiting the + number to MAX_TEXT_WARNINGS for TEXT. Every call is counted in + TEXT's n_warnings, whether or not the warning is displayed. */ +static void +text_warn (struct sfm_reader *r, struct text_record *text, + const char *format, ...) 
{ - struct sysfile_header hdr; + if (text->n_warnings++ < MAX_TEXT_WARNINGS) + { + va_list args; + + va_start (args, format); + sys_msg (r, text->start + text->pos, MW, format, args); + va_end (args); + } +} +/* Obtains the next token from TEXT, as separated by DELIMITERS, using + ss_tokenize(), which is given TEXT's position to update. + The token is null-terminated in place by overwriting the byte that + follows it; if DELIMITER is nonnull, that byte is first saved into + *DELIMITER. + Returns the token, which points into TEXT's buffer, or NULL if + ss_tokenize() finds no token. */ +static char * +text_get_token (struct text_record *text, struct substring delimiters, + char *delimiter) +{ + struct substring token; + char *end; + + if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token)) + return NULL; + + end = &ss_data (token)[ss_length (token)]; + if (delimiter != NULL) + *delimiter = *end; + *end = '\0'; + return ss_data (token); +} + +/* Reads an integer value expressed in decimal, then a space, then a string that + consists of exactly as many bytes as specified by the integer, then a space, + from TEXT. Returns the string, null-terminated, as a subset of TEXT's + buffer (so the caller should not free the string). + Returns NULL, after issuing a warning, if TEXT does not have that form or + if the string would extend past the end of the record. */ +static const char * +text_parse_counted_string (struct sfm_reader *r, struct text_record *text) +{ + size_t start; + size_t n; + char *s; + + start = text->pos; + n = 0; + for (;;) + { + int c = text->buffer.string[text->pos]; + if (c < '0' || c > '9') + break; + n = (n * 10) + (c - '0'); + text->pos++; + } + if (start == text->pos) + { + sys_warn (r, text->start, + _("Expecting digit at UTF-8 offset %zu in MRSETS record."), + text->pos); + return NULL; + } + + if (!text_match (text, ' ')) + { + sys_warn (r, text->start, + _("Expecting space at UTF-8 offset %zu in MRSETS record."), + text->pos); + return NULL; + } - if (fread (&hdr, sizeof hdr, 1, file) != 1) + if (text->pos + n > text->buffer.length) + { + sys_warn (r, text->start, + _("%zu-byte string starting at UTF-8 offset %zu " + "exceeds record length %zu."), + n, text->pos, text->buffer.length); + return NULL; + } + + s = &text->buffer.string[text->pos]; + if (s[n] != ' ') + { + sys_warn (r, text->start, + _("Expecting space at UTF-8 offset %zu following %zu-byte " + "string."), + text->pos + n, n); + return NULL; + } + s[n] = '\0'; + text->pos += n + 1; 
+ return s; +} +/* If the byte at TEXT's current position is C, advances past it and + returns true. Otherwise returns false without advancing. */ +static bool +text_match (struct text_record *text, char c) +{ + if (text->buffer.string[text->pos] == c) + { + text->pos++; + return true; + } + else return false; - if (strncmp ("$FL2", hdr.rec_type, 4)) +} + +/* Returns the current byte offset (as converted to UTF-8) inside the TEXT's + string. */ +static size_t +text_pos (const struct text_record *text) +{ + return text->pos; +} + +/* Messages. */ + +/* Formats and emits a message of class CLASS about R's file. + The message text is FORMAT, expanded with ARGS, prefixed by the + file name and, if OFFSET is nonnegative, by OFFSET in hexadecimal. */ +static void +sys_msg (struct sfm_reader *r, off_t offset, + int class, const char *format, va_list args) +{ + struct msg m; + struct string text; + + ds_init_empty (&text); + if (offset >= 0) + ds_put_format (&text, _("`%s' near offset 0x%llx: "), + fh_get_file_name (r->fh), (long long int) offset); + else + ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh)); + ds_put_vformat (&text, format, args); + + m.category = msg_class_to_category (class); + m.severity = msg_class_to_severity (class); + m.where.file_name = NULL; + m.where.line_number = 0; + m.where.first_column = 0; + m.where.last_column = 0; + m.text = ds_cstr (&text); + + msg_emit (&m); +} + +/* Displays a warning for offset OFFSET in the file. */ +static void +sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...) +{ + va_list args; + + va_start (args, format); + sys_msg (r, offset, MW, format, args); + va_end (args); +} + +/* Displays an error for offset OFFSET in the file, + marks R as in an error state, + and aborts reading it using longjmp to R's bail_out. + Does not return to the caller. */ +static void +sys_error (struct sfm_reader *r, off_t offset, const char *format, ...) +{ + va_list args; + + va_start (args, format); + sys_msg (r, offset, ME, format, args); + va_end (args); + + r->error = true; + longjmp (r->bail_out, 1); +} + +/* Reads BYTE_CNT bytes into BUF. + Returns true if exactly BYTE_CNT bytes are successfully read. + Aborts if an I/O error or a partial read occurs. 
+ If EOF_IS_OK, then an immediate end-of-file causes false to be + returned; otherwise, immediate end-of-file causes an abort + too. */ +static inline bool +read_bytes_internal (struct sfm_reader *r, bool eof_is_ok, + void *buf, size_t byte_cnt) +{ + size_t bytes_read = fread (buf, 1, byte_cnt, r->file); + r->pos += bytes_read; + if (bytes_read == byte_cnt) + return true; + else if (ferror (r->file)) + sys_error (r, r->pos, _("System error: %s."), strerror (errno)); + else if (!eof_is_ok || bytes_read != 0) + sys_error (r, r->pos, _("Unexpected end of file.")); + else return false; - return true; } +/* Reads BYTE_CNT bytes into BUF. + Aborts upon I/O error or if end-of-file is encountered, + even when end-of-file is immediate. */ +static void +read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) +{ + read_bytes_internal (r, false, buf, byte_cnt); +} + +/* Reads BYTE_CNT bytes into BUF. + Returns true if exactly BYTE_CNT bytes are successfully read. + Returns false if an immediate end-of-file is encountered. + Aborts if an I/O error or a partial read occurs. */ +static bool +try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) +{ + return read_bytes_internal (r, true, buf, byte_cnt); +} + +/* Reads a 32-bit signed integer from R and returns its value in + host format, converting from R's integer format. + Aborts upon I/O error or end-of-file, as read_bytes() does. */ +static int +read_int (struct sfm_reader *r) +{ + uint8_t integer[4]; + read_bytes (r, integer, sizeof integer); + return integer_get (r->integer_format, integer, sizeof integer); +} + +/* Reads a 64-bit floating-point number from R and returns its + value in host format. 
*/ +static double +read_float (struct sfm_reader *r) +{ + uint8_t number[8]; + read_bytes (r, number, sizeof number); + return float_get_double (r->float_format, number); +} +/* Returns the 4-byte integer stored at byte offset OFS within DATA, + interpreted according to R's integer format. Reads nothing from + the file. */ +static int +parse_int (struct sfm_reader *r, const void *data, size_t ofs) +{ + return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4); +} +/* Returns, as a double, the floating-point number stored at byte + offset OFS within DATA, interpreted according to R's float format. + Reads nothing from the file. */ +static double +parse_float (struct sfm_reader *r, const void *data, size_t ofs) +{ + return float_get_double (r->float_format, (const uint8_t *) data + ofs); +} + +/* Reads exactly SIZE - 1 bytes into BUFFER + and stores a null byte into BUFFER[SIZE - 1]. + SIZE must be positive. */ +static void +read_string (struct sfm_reader *r, char *buffer, size_t size) +{ + assert (size > 0); + read_bytes (r, buffer, size - 1); + buffer[size - 1] = '\0'; +} + +/* Skips BYTES bytes forward in R by reading them, in chunks of at + most 1024 bytes, into a scratch buffer that is discarded. + Aborts upon I/O error or end-of-file, as read_bytes() does. */ +static void +skip_bytes (struct sfm_reader *r, size_t bytes) +{ + while (bytes > 0) + { + char buffer[1024]; + size_t chunk = MIN (sizeof buffer, bytes); + read_bytes (r, buffer, chunk); + bytes -= chunk; + } +} +/* Casereader class for system files. Supplies + sys_file_casereader_read and sys_file_casereader_destroy as its + first two members and leaves the remaining two null. + NOTE(review): member meanings follow struct casereader_class, + which is declared elsewhere; confirm against its declaration. */ +static const struct casereader_class sys_file_casereader_class = + { + sys_file_casereader_read, + sys_file_casereader_destroy, + NULL, + NULL, + };