From 8a7f5c28b68af3cd03a524329c1d6e7eba355158 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Mon, 11 Dec 2006 15:39:23 +0000 Subject: [PATCH] Rewrite system file reader code, to clean up and improve. --- Smake | 2 + src/data/ChangeLog | 24 + src/data/automake.mk | 1 - src/data/dictionary.c | 14 + src/data/dictionary.h | 2 + src/data/sfm-private.h | 88 - src/data/sys-file-reader.c | 2859 +++++++++++------------ src/data/sys-file-reader.h | 8 +- src/data/sys-file-writer.c | 55 +- src/language/dictionary/ChangeLog | 5 + src/language/dictionary/sys-file-info.c | 37 +- src/libpspp/ChangeLog | 8 + src/libpspp/str.c | 21 +- src/libpspp/str.h | 1 + tests/command/sysfile-info.sh | 16 +- 15 files changed, 1522 insertions(+), 1619 deletions(-) delete mode 100644 src/data/sfm-private.h diff --git a/Smake b/Smake index 7aac5a86..6fdba19d 100644 --- a/Smake +++ b/Smake @@ -20,6 +20,7 @@ GNULIB_MODULES = \ getopt \ gettext \ intprops \ + inttostr \ linebreak \ localcharset \ memcasecmp \ @@ -48,6 +49,7 @@ GNULIB_MODULES = \ strtol \ strtoul \ unistd \ + unlocked-io \ vsnprintf \ xalloc \ xalloc-die \ diff --git a/src/data/ChangeLog b/src/data/ChangeLog index 5bb14cd1..bbbda1b7 100644 --- a/src/data/ChangeLog +++ b/src/data/ChangeLog @@ -1,3 +1,27 @@ +Sun Dec 10 14:21:29 2006 Ben Pfaff + + * sfm-private.h: Move contents into sys-file-writer.c, which is + the only remaining user. Removed Borland C++-specific directives. + + * sys-file-reader.c: Clean up and rewrite entire file. The + rewritten version is simpler and better abstracted, and should be + easier to maintain and extend. It avoids using structures to read + file data, which is prone to padding variations among compilers. + It should also handle non-IEEE 754 system files, although I + haven't been able to find any. It has been tested against many + .sav files obtained from the Web and found to produce the same + results as the earlier version of the code, or in some cases + improved results. It is more tolerant of format variations found + in the wild. + + * sys-file-reader.h (struct sfm_read_info): Removed `big_endian' + member, putting an enum integer_format in its place. New member + `float_format'. Changed `compressed' member to type bool. + +Sun Dec 10 13:48:53 2006 Ben Pfaff + + * dictionary.c (dict_delete_consecutive_vars): New function. + Sat Dec 9 20:08:25 2006 Ben Pfaff * file-name.c (fn_search_path): Remove prefix arg that was unused diff --git a/src/data/automake.mk b/src/data/automake.mk index 48db0f6e..1a5028f3 100644 --- a/src/data/automake.mk +++ b/src/data/automake.mk @@ -57,7 +57,6 @@ src_data_libdata_a_SOURCES = \ src/data/scratch-writer.h \ src/data/settings.c \ src/data/settings.h \ - src/data/sfm-private.h \ src/data/storage-stream.c \ src/data/storage-stream.h \ src/data/sys-file-private.c \ diff --git a/src/data/dictionary.c b/src/data/dictionary.c index e312d2e9..3e16831b 100644 --- a/src/data/dictionary.c +++ b/src/data/dictionary.c @@ -463,6 +463,20 @@ dict_delete_vars (struct dictionary *d, dict_delete_var (d, *vars++); } +/* Deletes the COUNT variables in D starting at index IDX. This + is unsafe; see the comment on dict_delete_var() for + details. */ +void +dict_delete_consecutive_vars (struct dictionary *d, size_t idx, size_t count) +{ + /* FIXME: this can be done in O(count) time, but this algorithm + is O(count**2). */ + assert (idx + count <= d->var_cnt); + + while (count-- > 0) + dict_delete_var (d, d->var[idx]); +} + /* Deletes scratch variables from dictionary D. */ void dict_delete_scratch_vars (struct dictionary *d) diff --git a/src/data/dictionary.h b/src/data/dictionary.h index e0425b85..37eb1192 100644 --- a/src/data/dictionary.h +++ b/src/data/dictionary.h @@ -55,6 +55,8 @@ bool dict_contains_var (const struct dictionary *, const struct variable *); void dict_delete_var (struct dictionary *, struct variable *); void dict_delete_vars (struct dictionary *, struct variable *const *, size_t count); +void dict_delete_consecutive_vars (struct dictionary *d, + size_t idx, size_t count); void dict_delete_scratch_vars (struct dictionary *); void dict_reorder_var (struct dictionary *d, struct variable *v, size_t new_index); diff --git a/src/data/sfm-private.h b/src/data/sfm-private.h deleted file mode 100644 index 96d08948..00000000 --- a/src/data/sfm-private.h +++ /dev/null @@ -1,88 +0,0 @@ -/* PSPP - computes sample statistics. - Copyright (C) 1997-9, 2000 Free Software Foundation, Inc. - Written by Ben Pfaff . - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - 02110-1301, USA. */ - -/* PORTME: There might easily be alignment problems with some of these - structures. */ - -#include -#include -#include "variable.h" - - -#if __BORLANDC__ -#pragma option -a- /* Turn off alignment. */ -#endif - -/* Find 64-bit floating-point type. */ -#if SIZEOF_FLOAT == 8 - #define flt64 float - #define FLT64_MAX FLT_MAX -#elif SIZEOF_DOUBLE == 8 - #define flt64 double - #define FLT64_MAX DBL_MAX -#elif SIZEOF_LONG_DOUBLE == 8 - #define flt64 long double - #define FLT64_MAX LDBL_MAX -#else - #error Which one of your basic types is 64-bit floating point? -#endif - -/* Figure out SYSMIS value for flt64. */ -#include -#if SIZEOF_DOUBLE == 8 -#define second_lowest_flt64 second_lowest_value -#else -#error Must define second_lowest_flt64 for your architecture. -#endif - -/* Record Type 1: General Information. */ -struct sysfile_header - { - char rec_type[4] ; /* 00: Record-type code, "$FL2". */ - char prod_name[60] ; /* 04: Product identification. */ - int32_t layout_code ; /* 40: 2. */ - int32_t nominal_case_size ; /* 44: Number of `value's per case. - Note: some systems set this to -1 */ - int32_t compress ; /* 48: 1=compressed, 0=not compressed. */ - int32_t weight_idx ; /* 4c: 1-based index of weighting var, or 0. */ - int32_t case_cnt ; /* 50: Number of cases, -1 if unknown. */ - flt64 bias ; /* 54: Compression bias (100.0). */ - char creation_date[9] ; /* 5c: `dd mmm yy' creation date of file. */ - char creation_time[8] ; /* 65: `hh:mm:ss' 24-hour creation time. */ - char file_label[64] ; /* 6d: File label. */ - char padding[3] ; /* ad: Ignored padding. */ - } ATTRIBUTE((packed)) ; - -/* Record Type 2: Variable. */ -struct sysfile_variable - { - int32_t rec_type ; /* 2. */ - int32_t type ; /* 0=numeric, 1-255=string width, - -1=continued string. */ - int32_t has_var_label ; /* 1=has a variable label, 0=doesn't. */ - int32_t n_missing_values ; /* Missing value code of -3,-2,0,1,2, or 3. */ - int32_t print ; /* Print format. */ - int32_t write ; /* Write format. */ - char name[SHORT_NAME_LEN] ; /* Variable name. */ - /* The rest of the structure varies. */ - } ATTRIBUTE((packed)) ; - -#if __BORLANDC__ -#pragma -a4 -#endif diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index c1a8292e..d62dfc7b 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -20,14 +20,13 @@ #include #include "sys-file-reader.h" -#include "sfm-private.h" #include "sys-file-private.h" -#include #include #include -#include -#include +#include +#include +#include #include #include @@ -35,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -46,232 +46,123 @@ #include "format.h" #include "missing-values.h" #include "value-labels.h" -#include "value.h" #include "variable.h" +#include "value.h" + +#include "c-ctype.h" +#include "inttostr.h" +#include "minmax.h" +#include "unlocked-io.h" +#include "xsize.h" #include "gettext.h" #define _(msgid) gettext (msgid) +#define N_(msgid) (msgid) /* System file reader. */ struct sfm_reader -{ - struct file_handle *fh; /* File handle. */ - FILE *file; /* File stream. */ - - int reverse_endian; /* 1=file has endianness opposite us. */ - int value_cnt; /* Number of `union values's per case. */ - long case_cnt; /* Number of cases, -1 if unknown. */ - int compressed; /* 1=compressed, 0=not compressed. */ - double bias; /* Compression bias, usually 100.0. */ - int weight_idx; /* 0-based index of weighting variable, or -1. */ - bool ok; /* False after an I/O error or corrupt data. */ - bool has_vls; /* True if the file has one or more Very Long Strings*/ - - /* Variables. */ - struct sfm_var *vars; - size_t var_cnt; - - /* File's special constants. */ - flt64 sysmis; - flt64 highest; - flt64 lowest; - - /* Decompression buffer. */ - flt64 *buf; /* Buffer data. */ - flt64 *ptr; /* Current location in buffer. */ - flt64 *end; /* End of buffer data. */ - - /* Compression instruction octet. */ - unsigned char x[8]; /* Current instruction octet. */ - unsigned char *y; /* Location in current instruction octet. */ -}; + { + /* Resource tracking. */ + struct pool *pool; /* All system file state. */ + jmp_buf bail_out; /* longjmp() target for error handling. */ + + /* File state. */ + struct file_handle *fh; /* File handle. */ + FILE *file; /* File stream. */ + bool error; /* I/O or corruption error? */ + + /* File format. */ + enum integer_format integer_format; /* On-disk integer format. */ + enum float_format float_format; /* On-disk floating point format. */ + int value_cnt; /* Number of 8-byte units per case. */ + struct sfm_var *vars; /* Variables. */ + size_t var_cnt; /* Number of variables. */ + bool has_vls; /* File has one or more very long strings? */ + + /* Decompression. */ + bool compressed; /* File is compressed? */ + double bias; /* Compression bias, usually 100.0. */ + uint8_t opcodes[8]; /* Current block of opcodes. */ + size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */ + }; /* A variable in a system file. */ struct sfm_var -{ - int width; /* 0=numeric, otherwise string width. */ - int fv; /* Index into case. */ -}; - -/* Utilities. */ - -/* Swap bytes *A and *B. */ -static inline void -bswap (char *a, char *b) -{ - char t = *a; - *a = *b; - *b = t; -} - -/* Reverse the byte order of 32-bit integer *X. */ -static inline void -bswap_int32 (int32_t *x_) -{ - char *x = (char *) x_; - bswap (x + 0, x + 3); - bswap (x + 1, x + 2); -} + { + int width; /* 0=numeric, otherwise string width. */ + int case_index; /* Index into case. */ + }; -/* Reverse the byte order of 64-bit floating point *X. */ -static inline void -bswap_flt64 (flt64 *x_) -{ - char *x = (char *) x_; - bswap (x + 0, x + 7); - bswap (x + 1, x + 6); - bswap (x + 2, x + 5); - bswap (x + 3, x + 4); -} +static struct variable **make_var_by_value_idx (struct sfm_reader *, + struct dictionary *); +static struct variable *lookup_var_by_value_idx (struct sfm_reader *, + struct variable **, + int value_idx); -static void -corrupt_msg (int class, const char *format,...) +static void sys_warn (struct sfm_reader *, const char *, ...) PRINTF_FORMAT (2, 3); - /* Displays a corrupt sysfile error. */ - static void - corrupt_msg (int class, const char *format,...) -{ - struct msg m; - va_list args; - struct string text; - - ds_init_cstr (&text, _("corrupt system file: ")); - va_start (args, format); - ds_put_vformat (&text, format, args); - va_end (args); - - m.category = msg_class_to_category (class); - m.severity = msg_class_to_severity (class); - m.where.file_name = NULL; - m.where.line_number = 0; - m.text = ds_cstr (&text); - - msg_emit (&m); -} - -/* Closes a system file after we're done with it. */ -void -sfm_close_reader (struct sfm_reader *r) -{ - if (r == NULL) - return; - - if (r->file) - { - if (fn_close (fh_get_file_name (r->fh), r->file) == EOF) - msg (ME, _("%s: Closing system file: %s."), - fh_get_file_name (r->fh), strerror (errno)); - r->file = NULL; - } - - if (r->fh != NULL) - fh_close (r->fh, "system file", "rs"); - - free (r->vars); - free (r->buf); - free (r); -} +static void sys_error (struct sfm_reader *, const char *, ...) + PRINTF_FORMAT (2, 3) + NO_RETURN; + +static void read_bytes (struct sfm_reader *, void *, size_t); +static bool try_read_bytes (struct sfm_reader *, void *, size_t); +static int32_t read_int32 (struct sfm_reader *); +static double read_flt64 (struct sfm_reader *); +static void read_string (struct sfm_reader *, char *, size_t); +static void skip_bytes (struct sfm_reader *, size_t); + +static int32_t int32_to_native (const struct sfm_reader *, const uint8_t[4]); +static double flt64_to_double (const struct sfm_reader *, const uint8_t[8]); + +static struct variable_to_value_map *open_variable_to_value_map ( + struct sfm_reader *, size_t size); +static void close_variable_to_value_map (struct sfm_reader *r, + struct variable_to_value_map *); +static bool read_variable_to_value_map (struct sfm_reader *, + struct dictionary *, + struct variable_to_value_map *, + struct variable **var, char **value, + int *warning_cnt); /* Dictionary reader. */ -static void buf_unread(struct sfm_reader *r, size_t byte_cnt); - -static void *buf_read (struct sfm_reader *, void *buf, size_t byte_cnt, - size_t min_alloc); - -static int read_header (struct sfm_reader *, - struct dictionary *, struct sfm_read_info *); -static int parse_format_spec (struct sfm_reader *, int32_t, - struct fmt_spec *, const struct variable *); -static int read_value_labels (struct sfm_reader *, struct dictionary *, - struct variable **var_by_idx); -static int read_variables (struct sfm_reader *, - struct dictionary *, struct variable ***var_by_idx); -static int read_machine_int32_info (struct sfm_reader *, int size, int count); -static int read_machine_flt64_info (struct sfm_reader *, int size, int count); -static int read_documents (struct sfm_reader *, struct dictionary *); - -static int fread_ok (struct sfm_reader *, void *, size_t); - -/* Displays the message X with corrupt_msg, then jumps to the error - label. */ -#define lose(X) \ - do { \ - corrupt_msg X; \ - goto error; \ - } while (0) - -/* Calls buf_read with the specified arguments, and jumps to - error if the read fails. */ -#define assertive_buf_read(a,b,c,d) \ - do { \ - if (!buf_read (a,b,c,d)) \ - goto error; \ - } while (0) - - -struct name_pair -{ - char *shortname; - char *longname; -}; - -static int -pair_sn_compare(const void *_p1, const void *_p2, const void *aux UNUSED) -{ - int i; - - const struct name_pair *p1 = _p1; - const struct name_pair *p2 = _p2; - - char buf1[SHORT_NAME_LEN + 1]; - char buf2[SHORT_NAME_LEN + 1]; - - memset(buf1, 0, SHORT_NAME_LEN + 1); - memset(buf2, 0, SHORT_NAME_LEN + 1); - - for (i = 0 ; i <= SHORT_NAME_LEN ; ++i ) - { - buf1[i] = p1->shortname[i]; - if ( '\0' == buf1[i]) - break; - } - - for (i = 0 ; i <= SHORT_NAME_LEN ; ++i ) - { - buf2[i] = p2->shortname[i]; - if ( '\0' == buf2[i]) - break; - } - - return strncmp(buf1, buf2, SHORT_NAME_LEN); -} - -static unsigned int -pair_sn_hash(const void *_p, const void *aux UNUSED) -{ - int i; - const struct name_pair *p = _p; - char buf[SHORT_NAME_LEN + 1]; - - memset(buf, 0, SHORT_NAME_LEN + 1); - for (i = 0 ; i <= SHORT_NAME_LEN ; ++i ) - { - buf[i] = p->shortname[i]; - if ( '\0' == buf[i]) - break; - } - - return hsh_hash_bytes(buf, strlen(buf)); -} - -static void -pair_sn_free(void *p, const void *aux UNUSED) -{ - free(p); -} +enum which_format + { + PRINT_FORMAT, + WRITE_FORMAT + }; +static void read_header (struct sfm_reader *, struct dictionary *, + int *weight_idx, int *claimed_value_cnt, + struct sfm_read_info *); +static void read_variable_record (struct sfm_reader *, struct dictionary *, + int *format_warning_cnt); +static void parse_format_spec (struct sfm_reader *, uint32_t, + enum which_format, struct variable *, + int *format_warning_cnt); +static void setup_weight (struct sfm_reader *, int weight_idx, + struct variable **var_by_value_idx, + struct dictionary *); +static void read_documents (struct sfm_reader *, struct dictionary *); +static void read_value_labels (struct sfm_reader *, struct dictionary *, + struct variable **var_by_value_idx); + +static void read_extension_record (struct sfm_reader *, struct dictionary *); +static void read_machine_int32_info (struct sfm_reader *, + size_t size, size_t count); +static void read_machine_flt64_info (struct sfm_reader *, + size_t size, size_t count); +static void read_display_parameters (struct sfm_reader *, + size_t size, size_t count, + struct dictionary *); +static void read_long_var_name_map (struct sfm_reader *, + size_t size, size_t count, + struct dictionary *); +static void read_long_string_map (struct sfm_reader *, + size_t size, size_t count, + struct dictionary *); /* Opens the system file designated by file handle FH for @@ -282,1522 +173,1480 @@ struct sfm_reader * sfm_open_reader (struct file_handle *fh, struct dictionary **dict, struct sfm_read_info *info) { - struct sfm_reader *r = NULL; - struct variable **var_by_idx = NULL; - - /* The data in record 7(14) */ - char *subrec14data = 0; + struct sfm_reader *volatile r = NULL; + struct variable **var_by_value_idx; + int format_warning_cnt = 0; + int weight_idx; + int claimed_value_cnt; + int rec_type; + size_t i; - /* A hash table of long variable names indexed by short name */ - struct hsh_table *short_to_long = NULL; + if (!fh_open (fh, FH_REF_FILE, "system file", "rs")) + return NULL; *dict = dict_create (); - if (!fh_open (fh, FH_REF_FILE, "system file", "rs")) - goto error; /* Create and initialize reader. */ - r = xmalloc (sizeof *r); + r = pool_create_container (struct sfm_reader, pool); r->fh = fh; r->file = fn_open (fh_get_file_name (fh), "rb"); - - r->reverse_endian = 0; + r->error = false; r->value_cnt = 0; - r->case_cnt = 0; - r->compressed = 0; - r->bias = 100.0; - r->weight_idx = -1; - r->ok = true; r->has_vls = false; + r->opcode_idx = sizeof r->opcodes; - r->vars = NULL; - - r->sysmis = -FLT64_MAX; - r->highest = FLT64_MAX; - r->lowest = second_lowest_flt64; - - r->buf = r->ptr = r->end = NULL; - r->y = r->x + sizeof r->x; + if (setjmp (r->bail_out)) + { + sfm_close_reader (r); + dict_destroy (*dict); + *dict = NULL; + return NULL; + } - /* Check that file open succeeded. */ if (r->file == NULL) { - msg (ME, _("An error occurred while opening \"%s\" for reading " - "as a system file: %s."), + msg (ME, _("Error opening \"%s\" for reading as a system file: %s."), fh_get_file_name (r->fh), strerror (errno)); - goto error; + longjmp (r->bail_out, 1); } - /* Read header and variables. */ - if (!read_header (r, *dict, info) || !read_variables (r, *dict, &var_by_idx)) - goto error; - + /* Read header. */ + read_header (r, *dict, &weight_idx, &claimed_value_cnt, info); - /* Handle weighting. */ - if (r->weight_idx != -1) + /* Read all the variable definition records. */ + rec_type = read_int32 (r); + while (rec_type == 2) { - struct variable *weight_var; - - if (r->weight_idx < 0 || r->weight_idx >= r->value_cnt) - lose ((ME, _("%s: Index of weighting variable (%d) is not between 0 " - "and number of elements per case (%d)."), - fh_get_file_name (r->fh), r->weight_idx, r->value_cnt)); - - - weight_var = var_by_idx[r->weight_idx]; - - if (weight_var == NULL) - lose ((ME, - _("%s: Weighting variable may not be a continuation of " - "a long string variable."), fh_get_file_name (fh))); - else if (var_is_alpha (weight_var)) - lose ((ME, _("%s: Weighting variable may not be a string variable."), - fh_get_file_name (fh))); - - dict_set_weight (*dict, weight_var); + read_variable_record (r, *dict, &format_warning_cnt); + rec_type = read_int32 (r); } - else - dict_set_weight (*dict, NULL); - /* Read records of types 3, 4, 6, and 7. */ - for (;;) + /* Figure out the case format. */ + var_by_value_idx = make_var_by_value_idx (r, *dict); + setup_weight (r, weight_idx, var_by_value_idx, *dict); + + /* Read all the rest of the dictionary records. */ + while (rec_type != 999) { - int32_t rec_type; + switch (rec_type) + { + case 3: + read_value_labels (r, *dict, var_by_value_idx); + break; - assertive_buf_read (r, &rec_type, sizeof rec_type, 0); - if (r->reverse_endian) - bswap_int32 (&rec_type); + case 4: + sys_error (r, _("Misplaced type 4 record.")); + case 6: + read_documents (r, *dict); + break; - switch (rec_type) - { - case 3: - if (!read_value_labels (r, *dict, var_by_idx)) - goto error; - break; - - case 4: - lose ((ME, _("%s: Orphaned variable index record (type 4). Type 4 " - "records must always immediately follow type 3 " - "records."), - fh_get_file_name (r->fh))); - - case 6: - if (!read_documents (r, *dict)) - goto error; - break; - - case 7: - { - struct - { - int32_t subtype ; - int32_t size ; - int32_t count ; - } ATTRIBUTE((packed)) - data; - unsigned long bytes; - - int skip = 0; - - assertive_buf_read (r, &data, sizeof data, 0); - if (r->reverse_endian) - { - bswap_int32 (&data.subtype); - bswap_int32 (&data.size); - bswap_int32 (&data.count); - } - bytes = data.size * data.count; - - if (bytes < data.size || bytes < data.count) - lose ((ME, "%s: Record type %d subtype %d too large.", - fh_get_file_name (r->fh), rec_type, data.subtype)); - - switch (data.subtype) - { - case 3: - if (!read_machine_int32_info (r, data.size, data.count)) - goto error; - break; - - case 4: - if (!read_machine_flt64_info (r, data.size, data.count)) - goto error; - break; - - case 5: - case 6: /* ?? Used by SPSS 8.0. */ - skip = 1; - break; - - case 11: /* Variable display parameters */ - { - const int n_vars = data.count / 3 ; - int i; - if ( data.count % 3 || n_vars != dict_get_var_cnt(*dict) ) - { - msg (MW, _("%s: Invalid subrecord length. " - "Record: 7; Subrecord: 11"), - fh_get_file_name (r->fh)); - skip = 1; - break; - } - - for ( i = 0 ; i < MIN(n_vars, dict_get_var_cnt(*dict)) ; ++i ) - { - struct - { - int32_t measure ; - int32_t width ; - int32_t align ; - } ATTRIBUTE((packed)) - params; - - struct variable *v; - - assertive_buf_read (r, ¶ms, sizeof(params), 0); - - if ( ! measure_is_valid(params.measure) - || - ! alignment_is_valid(params.align)) - { - msg(MW, - _("%s: Invalid variable display parameters. Default parameters substituted."), - fh_get_file_name(r->fh)); - continue; - } - - v = dict_get_var(*dict, i); - - var_set_measure (v, params.measure); - var_set_display_width (v, params.width); - var_set_alignment (v, params.align); - } - } - break; - - case 13: /* SPSS 12.0 Long variable name map */ - { - char *short_name; - char *save_ptr = NULL; - int idx; - - /* Read data. */ - subrec14data = xmalloc (bytes + 1); - if (!buf_read (r, subrec14data, bytes, 0)) - { - goto error; - } - subrec14data[bytes] = '\0'; - - short_to_long = hsh_create(4, - pair_sn_compare, - pair_sn_hash, - pair_sn_free, - 0); - - /* Parse data. */ - for (short_name = strtok_r (subrec14data, "=", &save_ptr), idx = 0; - short_name != NULL; - short_name = strtok_r (NULL, "=", &save_ptr), idx++) - { - struct name_pair *pair ; - char *long_name = strtok_r (NULL, "\t", &save_ptr); - struct variable *v; - - /* Validate long name. */ - if (long_name == NULL) - { - msg (MW, _("%s: Trailing garbage in long variable " - "name map."), - fh_get_file_name (r->fh)); - break; - } - if (!var_is_valid_name (long_name, false)) - { - msg (MW, _("%s: Long variable mapping to invalid " - "variable name `%s'."), - fh_get_file_name (r->fh), long_name); - break; - } - - /* Find variable using short name. */ - v = dict_lookup_var (*dict, short_name); - if (v == NULL) - { - msg (MW, _("%s: Long variable mapping for " - "nonexistent variable %s."), - fh_get_file_name (r->fh), short_name); - break; - } - - /* Identify any duplicates. */ - if ( strcasecmp (short_name, long_name) && - NULL != dict_lookup_var (*dict, long_name)) - lose ((ME, _("%s: Duplicate long variable name `%s' " - "within system file."), - fh_get_file_name (r->fh), long_name)); - - - /* Set long name. - Renaming a variable may clear the short - name, but we want to retain it, so - re-set it explicitly. */ - dict_rename_var (*dict, v, long_name); - var_set_short_name (v, short_name); - - pair = xmalloc(sizeof *pair); - pair->shortname = short_name; - pair->longname = long_name; - hsh_insert(short_to_long, pair); -#if 0 - /* This messes up the processing of subtype 14 (below). - I'm not sure if it is needed anyway, so I'm removing it for - now. If it's needed, then it will need to be done after all the - records have been processed. --- JMD 27 April 2006 - */ - - /* For compatibility, make sure dictionary - is in long variable name map order. In - the common case, this has no effect, - because the dictionary and the long - variable name map are already in the - same order. */ - dict_reorder_var (*dict, v, idx); -#endif - } - - } - break; - - case 14: - { - int j = 0; - bool eq_seen = false; - int i; - - /* Read data. */ - char *buffer = xmalloc (bytes + 1); - if (!buf_read (r, buffer, bytes, 0)) - { - free (buffer); - goto error; - } - buffer[bytes] = '\0'; - - r->has_vls = true; - - /* Note: SPSS v13 terminates this record with 00, - whereas SPSS v14 terminates it with 00 09. We must - accept either */ - for(i = 0; i < bytes ; ++i) - { - long int length; - static char name[SHORT_NAME_LEN + 1] = {0}; - static char len_str[6] ={0}; - - switch( buffer[i] ) - { - case '=': - eq_seen = true; - j = 0; - break; - case '\0': - length = strtol(len_str, 0, 10); - if ( length != LONG_MAX && length != LONG_MIN) - { - char *lookup_name = name; - int l; - int idx; - struct variable *v; - - if ( short_to_long ) - { - struct name_pair pair; - struct name_pair *p; - - pair.shortname = name; - p = hsh_find(short_to_long, &pair); - if ( p ) - lookup_name = p->longname; - } - - v = dict_lookup_var(*dict, lookup_name); - if ( !v ) - { - corrupt_msg(MW, - _("%s: No variable called %s but it is listed in length table."), - fh_get_file_name (r->fh), lookup_name); - - goto error; - - } - - l = length; - if ( var_get_width (v) > EFFECTIVE_LONG_STRING_LENGTH ) - l -= EFFECTIVE_LONG_STRING_LENGTH; - else - l -= var_get_width (v); - - idx = var_get_dict_index (v); - while ( l > 0 ) - { - struct variable *v_next; - v_next = dict_get_var(*dict, idx + 1); - - if ( var_get_width (v_next) > EFFECTIVE_LONG_STRING_LENGTH ) - l -= EFFECTIVE_LONG_STRING_LENGTH; - else - l -= var_get_width (v_next); - - dict_delete_var(*dict, v_next); - } - - assert ( length >= MIN_VERY_LONG_STRING ); - - var_set_width (v, length); - } - eq_seen = false; - memset(name, 0, SHORT_NAME_LEN+1); - memset(len_str, 0, 6); - j = 0; - break; - case '\t': - break; - default: - if ( eq_seen ) - len_str[j] = buffer[i]; - else - name[j] = buffer[i]; - j++; - break; - } - } - free(buffer); - dict_compact_values(*dict); - } - break; - - default: - msg (MW, _("%s: Unrecognized record type 7, subtype %d " - "encountered in system file."), - fh_get_file_name (r->fh), data.subtype); - skip = 1; - } - - if (skip) - { - void *x = buf_read (r, NULL, data.size * data.count, 0); - if (x == NULL) - goto error; - free (x); - } - } - break; - - case 999: - { - int32_t filler; - - assertive_buf_read (r, &filler, sizeof filler, 0); - - goto success; - } - - default: - corrupt_msg(MW, _("%s: Unrecognized record type %d."), - fh_get_file_name (r->fh), rec_type); - } + case 7: + read_extension_record (r, *dict); + break; + + default: + sys_error (r, _("Unrecognized record type %d."), rec_type); + } + rec_type = read_int32 (r); } - success: - /* Come here on successful completion. */ + /* Read record 999 data, which is just filler. */ + read_int32 (r); + + if (claimed_value_cnt != -1 && claimed_value_cnt != r->value_cnt) + sys_warn (r, _("File header claims %d variable positions but " + "%d were read from file."), + claimed_value_cnt, r->value_cnt); /* Create an index of dictionary variable widths for - sfm_read_case to use. We cannot use the `struct variables' + sfm_read_case to use. We cannot use the `struct variable's from the dictionary we created, because the caller owns the dictionary and may destroy or modify its variables. */ - { - size_t i; - - r->var_cnt = dict_get_var_cnt (*dict); - r->vars = xnmalloc (r->var_cnt, sizeof *r->vars); - for (i = 0; i < r->var_cnt; i++) - { - struct variable *v = dict_get_var (*dict, i); - struct sfm_var *sv = &r->vars[i]; - sv->width = var_get_width (v); - sv->fv = var_get_case_index (v); - } - } - - free (var_by_idx); - hsh_destroy(short_to_long); - free (subrec14data); - return r; - - error: - /* Come here on unsuccessful completion. */ - sfm_close_reader (r); - free (var_by_idx); - hsh_destroy(short_to_long); - free (subrec14data); - if (*dict != NULL) + r->var_cnt = dict_get_var_cnt (*dict); + r->vars = pool_nalloc (r->pool, r->var_cnt, sizeof *r->vars); + for (i = 0; i < r->var_cnt; i++) { - dict_destroy (*dict); - *dict = NULL; + struct variable *v = dict_get_var (*dict, i); + struct sfm_var *sv = &r->vars[i]; + sv->width = var_get_width (v); + sv->case_index = var_get_case_index (v); } - return NULL; + + pool_free (r->pool, var_by_value_idx); + return r; } -/* Read record type 7, subtype 3. */ -static int -read_machine_int32_info (struct sfm_reader *r, int size, int count) +/* Closes a system file after we're done with it. */ +void +sfm_close_reader (struct sfm_reader *r) { - int32_t data[8]; - int file_bigendian; + if (r == NULL) + return; - int i; + if (r->file) + { + if (fn_close (fh_get_file_name (r->fh), r->file) == EOF) + msg (ME, _("Error closing system file \"%s\": %s."), + fh_get_file_name (r->fh), strerror (errno)); + r->file = NULL; + } - if (size != sizeof (int32_t) || count != 8) - lose ((ME, _("%s: Bad size (%d) or count (%d) field on record type 7, " - "subtype 3. Expected size %d, count 8."), - fh_get_file_name (r->fh), size, count, sizeof (int32_t))); - - assertive_buf_read (r, data, sizeof data, 0); - if (r->reverse_endian) - for (i = 0; i < 8; i++) - bswap_int32 (&data[i]); - -#ifdef FPREP_IEEE754 - if (data[4] != 1) - lose ((ME, _("%s: Floating-point representation in system file is not " - "IEEE-754. PSPP cannot convert between floating-point " - "formats."), - fh_get_file_name (r->fh))); -#else -#error Add support for your floating-point format. -#endif - -#ifdef WORDS_BIGENDIAN - file_bigendian = 1; -#else - file_bigendian = 0; -#endif - if (r->reverse_endian) - file_bigendian ^= 1; - if (file_bigendian ^ (data[6] == 1)) - lose ((ME, _("%s: File-indicated endianness (%s) does not match " - "endianness intuited from file header (%s)."), - fh_get_file_name (r->fh), - file_bigendian ? _("big-endian") : _("little-endian"), - data[6] == 1 ? _("big-endian") : (data[6] == 2 ? _("little-endian") - : _("unknown")))); - - /* PORTME: Character representation code. */ - if (data[7] != 2 && data[7] != 3) - lose ((ME, _("%s: File-indicated character representation code (%s) is " - "not ASCII."), - fh_get_file_name (r->fh), - (data[7] == 1 ? "EBCDIC" - : (data[7] == 4 ? _("DEC Kanji") : _("Unknown"))))); - - return 1; - - error: - return 0; + if (r->fh != NULL) + fh_close (r->fh, "system file", "rs"); + + pool_destroy (r->pool); } -/* Read record type 7, subtype 4. */ -static int -read_machine_flt64_info (struct sfm_reader *r, int size, int count) +/* Returns true if an I/O error has occurred on READER, false + otherwise. */ +bool +sfm_read_error (const struct sfm_reader *reader) { - flt64 data[3]; - int i; - - if (size != sizeof (flt64) || count != 3) - lose ((ME, _("%s: Bad size (%d) or count (%d) field on record type 7, " - "subtype 4. Expected size %d, count 8."), - fh_get_file_name (r->fh), size, count, sizeof (flt64))); + return reader->error; +} - assertive_buf_read (r, data, sizeof data, 0); - if (r->reverse_endian) - for (i = 0; i < 3; i++) - bswap_flt64 (&data[i]); +/* Returns true if FILE is an SPSS system file, + false otherwise. */ +bool +sfm_detect (FILE *file) +{ + char rec_type[5]; - if (data[0] != SYSMIS || data[1] != FLT64_MAX - || data[2] != second_lowest_flt64) - { - r->sysmis = data[0]; - r->highest = data[1]; - r->lowest = data[2]; - msg (MW, _("%s: File-indicated value is different from internal value " - "for at least one of the three system values. SYSMIS: " - "indicated %g, expected %g; HIGHEST: %g, %g; LOWEST: " - "%g, %g."), - fh_get_file_name (r->fh), (double) data[0], (double) SYSMIS, - (double) data[1], (double) FLT64_MAX, - (double) data[2], (double) second_lowest_flt64); - } + if (fread (rec_type, 4, 1, file) != 1) + return false; + rec_type[4] = '\0'; - return 1; - - error: - return 0; + return !strcmp ("$FL2", rec_type); } - -static int -read_header (struct sfm_reader *r, - struct dictionary *dict, struct sfm_read_info *info) + +/* Reads the global header of the system file. + Sets DICT's file label to the system file's label. + Sets *WEIGHT_IDX to 0 if the system file is unweighted, + or to the value index of the weight variable otherwise. + Sets *CLAIMED_VALUE_CNT to the number of values that the file + claims to have (although it is not always correct). + If INFO is non-null, initializes *INFO with header + information. */ +static void +read_header (struct sfm_reader *r, struct dictionary *dict, + int *weight_idx, int *claimed_value_cnt, + struct sfm_read_info *info) { - struct sysfile_header hdr; /* Disk buffer. */ - char prod_name[sizeof hdr.prod_name + 1]; /* Buffer for product name. */ - int skip_amt = 0; /* Amount of product name to omit. */ - int i; - - /* Read header, check magic. */ - assertive_buf_read (r, &hdr, sizeof hdr, 0); - if (strncmp ("$FL2", hdr.rec_type, 4) != 0) - lose ((ME, _("%s: Bad magic. Proper system files begin with " - "the four characters `$FL2'. This file will not be read."), - fh_get_file_name (r->fh))); - - /* Check eye-category.her string. */ - memcpy (prod_name, hdr.prod_name, sizeof hdr.prod_name); - for (i = 0; i < 60; i++) - if (!c_isprint ((unsigned char) prod_name[i])) - prod_name[i] = ' '; - for (i = 59; i >= 0; i--) - if (!c_isgraph ((unsigned char) prod_name[i])) - { - prod_name[i] = '\0'; - break; - } - prod_name[60] = '\0'; + char rec_type[5]; + char eye_catcher[61]; + uint8_t raw_layout_code[4]; + int case_cnt; + uint8_t raw_bias[8]; + char creation_date[10]; + char creation_time[9]; + char file_label[65]; + struct substring file_label_ss; + + read_string (r, rec_type, sizeof rec_type); + read_string (r, eye_catcher, sizeof eye_catcher); - { -#define N_PREFIXES 2 - static const char *prefix[N_PREFIXES] = - { - "@(#) SPSS DATA FILE", - "SPSS SYSTEM FILE.", - }; - - int i; + if (strcmp ("$FL2", rec_type) != 0) + sys_error (r, _("This is not an SPSS system file.")); + + /* Identify integer format. */ + read_bytes (r, raw_layout_code, sizeof raw_layout_code); + if ((!integer_identify (2, raw_layout_code, sizeof raw_layout_code, + &r->integer_format) + && !integer_identify (3, raw_layout_code, sizeof raw_layout_code, + &r->integer_format)) + || (r->integer_format != INTEGER_MSB_FIRST + && r->integer_format != INTEGER_LSB_FIRST)) + sys_error (r, _("This is not an SPSS system file.")); + + *claimed_value_cnt = read_int32 (r); + if (*claimed_value_cnt < 0 || *claimed_value_cnt > INT_MAX / 16) + *claimed_value_cnt = -1; + + r->compressed = read_int32 (r) != 0; + + *weight_idx = read_int32 (r); + + case_cnt = read_int32 (r); + if (case_cnt < -1 || case_cnt > INT_MAX / 2) + case_cnt = -1; + + /* Identify floating-point format and obtain compression bias. */ + read_bytes (r, raw_bias, sizeof raw_bias); + if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0) + { + sys_warn (r, _("Compression bias (%g) is not the usual " + "value of 100, or system file uses unrecognized " + "floating-point format."), + r->bias); + if (r->integer_format == INTEGER_MSB_FIRST) + r->float_format = FLOAT_IEEE_DOUBLE_BE; + else + r->float_format = FLOAT_IEEE_DOUBLE_LE; + } + float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias); - for (i = 0; i < N_PREFIXES; i++) - if (!strncmp (prefix[i], hdr.prod_name, strlen (prefix[i]))) - { - skip_amt = strlen (prefix[i]); - break; - } - } + read_string (r, creation_date, sizeof creation_date); + read_string (r, creation_time, sizeof creation_time); + read_string (r, file_label, sizeof file_label); + skip_bytes (r, 3); - /* Check endianness. */ - if (hdr.layout_code == 2) - r->reverse_endian = 0; - else + file_label_ss = ss_cstr (file_label); + ss_trim (&file_label_ss, ss_cstr (" ")); + if (!ss_is_empty (file_label_ss)) { - bswap_int32 (&hdr.layout_code); - if (hdr.layout_code != 2) - lose ((ME, _("%s: File layout code has unexpected value %d. Value " - "should be 2, in big-endian or little-endian format."), - fh_get_file_name (r->fh), hdr.layout_code)); - - r->reverse_endian = 1; - bswap_int32 (&hdr.nominal_case_size); - bswap_int32 (&hdr.compress); - bswap_int32 (&hdr.weight_idx); - bswap_int32 (&hdr.case_cnt); - bswap_flt64 (&hdr.bias); + ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0'; + dict_set_label (dict, ss_data (file_label_ss)); } - - /* Copy basic info and verify correctness. */ - r->value_cnt = hdr.nominal_case_size; - - /* If value count is ridiculous, then force it to -1 (a - sentinel value). */ - if ( r->value_cnt < 0 || - r->value_cnt > (INT_MAX / (int) sizeof (union value) / 2)) - r->value_cnt = -1; - - r->compressed = hdr.compress; - - r->weight_idx = hdr.weight_idx - 1; - - r->case_cnt = hdr.case_cnt; - if (r->case_cnt < -1 || r->case_cnt > INT_MAX / 2) - lose ((ME, - _("%s: Number of cases in file (%ld) is not between -1 and %d."), - fh_get_file_name (r->fh), (long) r->case_cnt, INT_MAX / 2)); - - r->bias = hdr.bias; - if (r->bias != 100.0) - corrupt_msg (MW, _("%s: Compression bias (%g) is not the usual " - "value of 100."), - fh_get_file_name (r->fh), r->bias); - - /* Make a file label only on the condition that the given label is - not all spaces or nulls. */ - { - int i; - - for (i = sizeof hdr.file_label - 1; i >= 0; i--) - { - if (!c_isspace ((unsigned char) hdr.file_label[i]) - && hdr.file_label[i] != 0) - { - char *label = xmalloc (i + 2); - memcpy (label, hdr.file_label, i + 1); - label[i + 1] = 0; - dict_set_label (dict, label); - free (label); - break; - } - } - } - if (info) { - char *cp; - - memcpy (info->creation_date, hdr.creation_date, 9); - info->creation_date[9] = 0; - - memcpy (info->creation_time, hdr.creation_time, 8); - info->creation_time[8] = 0; - -#ifdef WORDS_BIGENDIAN - info->big_endian = !r->reverse_endian; -#else - info->big_endian = r->reverse_endian; -#endif - - info->compressed = hdr.compress; - - info->case_cnt = hdr.case_cnt; - - for (cp = &prod_name[skip_amt]; cp < &prod_name[60]; cp++) - if (c_isgraph ((unsigned char) *cp)) - break; - strcpy (info->product, cp); + struct substring product; + + strcpy (info->creation_date, creation_date); + strcpy (info->creation_time, creation_time); + info->integer_format = r->integer_format; + info->float_format = r->float_format; + info->compressed = r->compressed; + info->case_cnt = case_cnt; + + product = ss_cstr (eye_catcher); + ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE")); + ss_trim (&product, ss_cstr (" ")); + str_copy_buf_trunc (info->product, sizeof info->product, + ss_data (product), ss_length (product)); } - - return 1; - - error: - return 0; } -/* Reads most of the dictionary from file H; also fills in the - associated VAR_BY_IDX array. */ -static int -read_variables (struct sfm_reader *r, - struct dictionary *dict, struct variable ***var_by_idx) +/* Reads a variable (type 2) record from R and adds the + corresponding variable to DICT. + Also skips past additional variable records for long string + variables. */ +static void +read_variable_record (struct sfm_reader *r, struct dictionary *dict, + int *format_warning_cnt) { - int i; - - struct sysfile_variable sv; /* Disk buffer. */ - int long_string_count = 0; /* # of long string continuation - records still expected. */ - int next_value = 0; /* Index to next `value' structure. */ - - assert(r); + int width; + int has_variable_label; + int missing_value_code; + int print_format; + int write_format; + char name[9]; + + struct variable *var; + int nv; + + width = read_int32 (r); + has_variable_label = read_int32 (r); + missing_value_code = read_int32 (r); + print_format = read_int32 (r); + write_format = read_int32 (r); + read_string (r, name, sizeof name); + name[strcspn (name, " ")] = '\0'; + + /* Check variable name. */ + if (name[0] == '$' || name[0] == '#') + sys_error (r, "Variable name begins with invalid character `%c'.", + name[0]); + if (!var_is_plausible_name (name, false)) + sys_error (r, _("Invalid variable name `%s'."), name); + + /* Create variable. */ + if (width < 0 || width > 255) + sys_error (r, _("Bad variable width %d."), width); + var = dict_create_var (dict, name, width); + if (var == NULL) + sys_error (r, + _("Duplicate variable name `%s' within system file."), + name); + + /* Set the short name the same as the long name */ + var_set_short_name (var, var_get_name (var)); + + /* Get variable label, if any. */ + if (has_variable_label != 0 && has_variable_label != 1) + sys_error (r, _("Variable label indicator field is not 0 or 1.")); + if (has_variable_label == 1) + { + size_t len; + char label[255 + 1]; + + len = read_int32 (r); + if (len >= sizeof label) + sys_error (r, _("Variable %s has label of invalid length %d."), + name, len); + read_string (r, label, len + 1); + var_set_label (var, label); + + skip_bytes (r, ROUND_UP (len, 4) - len); + } - *var_by_idx = 0; + /* Set missing values. */ + if (missing_value_code < -3 || missing_value_code > 3 + || missing_value_code == -1) + sys_error (r, _("Missing value indicator field is not " + "-3, -2, 0, 1, 2, or 3.")); + if (missing_value_code != 0) + { + struct missing_values mv; + mv_init (&mv, var_get_width (var)); + if (var_is_numeric (var)) + { + if (missing_value_code > 0) + { + int i; + for (i = 0; i < missing_value_code; i++) + mv_add_num (&mv, read_flt64 (r)); + } + else + { + double low = read_flt64 (r); + double high = read_flt64 (r); + mv_add_num_range (&mv, low, high); + if (missing_value_code == -3) + mv_add_num (&mv, read_flt64 (r)); + } + } + else if (var_get_width (var) <= MAX_SHORT_STRING) + { + if (missing_value_code > 0) + { + int i; + for (i = 0; i < missing_value_code; i++) + { + char string[9]; + read_string (r, string, sizeof string); + mv_add_str (&mv, string); + } + } + else + sys_error (r, _("String variable %s may not have missing " + "values specified as a range."), + name); + } + else /* var->width > MAX_SHORT_STRING */ + sys_error (r, _("Long string variable %s may not have missing " + "values."), + name); + var_set_missing_values (var, &mv); + } + /* Set formats. */ + parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt); + parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt); - /* Read in the entry for each variable and use the info to - initialize the dictionary. */ - for (i = 0; ; ++i) + /* Account for values. + Skip long string continuation records, if any. */ + nv = width == 0 ? 1 : DIV_RND_UP (width, 8); + r->value_cnt += nv; + if (width > 8) { - struct variable *vv; - char name[SHORT_NAME_LEN + 1]; - int nv; - int j; - struct fmt_spec print, write; + int i; + for (i = 1; i < nv; i++) + { + /* Check for record type 2 and width -1. */ + if (read_int32 (r) != 2 || read_int32 (r) != -1) + sys_error (r, _("Missing string continuation record.")); + + /* Skip and ignore remaining continuation data. */ + has_variable_label = read_int32 (r); + missing_value_code = read_int32 (r); + print_format = read_int32 (r); + write_format = read_int32 (r); + read_string (r, name, sizeof name); + + /* Variable label fields on continuation records have + been spotted in system files created by "SPSS Power + Macintosh Release 6.1". */ + if (has_variable_label) + skip_bytes (r, ROUND_UP (read_int32 (r), 4)); + } + } +} - assertive_buf_read (r, &sv, sizeof sv, 0); +/* Translates the format spec from sysfile format to internal + format. */ +static void +parse_format_spec (struct sfm_reader *r, uint32_t s, + enum which_format which, struct variable *v, + int *format_warning_cnt) +{ + const int max_format_warnings = 8; + struct fmt_spec f; + uint8_t raw_type = s >> 16; + uint8_t w = s >> 8; + uint8_t d = s; + + bool ok; + + if (!fmt_from_io (raw_type, &f.type)) + sys_error (r, _("Unknown variable format %d."), (int) raw_type); + f.w = w; + f.d = d; - if (r->reverse_endian) - { - bswap_int32 (&sv.rec_type); - bswap_int32 (&sv.type); - bswap_int32 (&sv.has_var_label); - bswap_int32 (&sv.n_missing_values); - bswap_int32 (&sv.print); - bswap_int32 (&sv.write); - } + msg_disable (); + ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v)); + msg_enable (); + + if (ok) + { + if (which == PRINT_FORMAT) + var_set_print_format (v, &f); + else + var_set_write_format (v, &f); + } + else if (*++format_warning_cnt <= max_format_warnings) + { + char fmt_string[FMT_STRING_LEN_MAX + 1]; + sys_warn (r, _("%s variable %s has invalid %s format %s."), + var_is_numeric (v) ? _("Numeric") : _("String"), + var_get_name (v), + which == PRINT_FORMAT ? _("print") : _("write"), + fmt_to_string (&f, fmt_string)); + + if (*format_warning_cnt == max_format_warnings) + sys_warn (r, _("Suppressing further invalid format warnings.")); + } +} - /* We've come to the end of the variable entries */ - if (sv.rec_type != 2) - { - buf_unread(r, sizeof sv); - r->value_cnt = i; - break; - } +/* Sets the weighting variable in DICT to the variable + corresponding to the given 1-based VALUE_IDX, if VALUE_IDX is + nonzero. */ +static void +setup_weight (struct sfm_reader *r, int weight_idx, + struct variable **var_by_value_idx, struct dictionary *dict) +{ + if (weight_idx != 0) + { + struct variable *weight_var + = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx); + if (var_is_numeric (weight_var)) + dict_set_weight (dict, weight_var); + else + sys_error (r, _("Weighting variable must be numeric.")); + } +} - *var_by_idx = xnrealloc (*var_by_idx, i + 1, sizeof **var_by_idx); +/* Reads a document record, type 6, from system file R, and sets up + the documents and n_documents fields in the associated + dictionary. */ +static void +read_documents (struct sfm_reader *r, struct dictionary *dict) +{ + int line_cnt; + char *documents; - /* If there was a long string previously, make sure that the - continuations are present; otherwise make sure there aren't - any. */ - if (long_string_count) - { - if (sv.type != -1) - lose ((ME, _("%s: position %d: String variable does not have " - "proper number of continuation records."), - fh_get_file_name (r->fh), i)); + if (dict_get_documents (dict) != NULL) + sys_error (r, _("Multiple type 6 (document) records.")); + line_cnt = read_int32 (r); + if (line_cnt <= 0) + sys_error (r, _("Number of document lines (%d) " + "must be greater than 0."), line_cnt); - (*var_by_idx)[i] = NULL; - long_string_count--; - continue; - } - else if (sv.type == -1) - lose ((ME, _("%s: position %d: Superfluous long string continuation " - "record."), - fh_get_file_name (r->fh), i)); - - /* Check fields for validity. */ - if (sv.type < 0 || sv.type > 255) - lose ((ME, _("%s: position %d: Bad variable type code %d."), - fh_get_file_name (r->fh), i, sv.type)); - if (sv.has_var_label != 0 && sv.has_var_label != 1) - lose ((ME, _("%s: position %d: Variable label indicator field is not " - "0 or 1."), fh_get_file_name (r->fh), i)); - if (sv.n_missing_values < -3 || sv.n_missing_values > 3 - || sv.n_missing_values == -1) - lose ((ME, _("%s: position %d: Missing value indicator field is not " - "-3, -2, 0, 1, 2, or 3."), fh_get_file_name (r->fh), i)); - - /* Copy first character of variable name. */ - if (sv.name[0] == '@' || sv.name[0] == '#') - lose ((ME, _("%s: position %d: Variable name begins with invalid " - "character."), - fh_get_file_name (r->fh), i)); - - name[0] = sv.name[0]; - - /* Copy remaining characters of variable name. */ - for (j = 1; j < SHORT_NAME_LEN; j++) - { - int c = (unsigned char) sv.name[j]; + documents = pool_nmalloc (r->pool, line_cnt + 1, 80); + read_string (r, documents, 80 * line_cnt + 1); + dict_set_documents (dict, documents); + pool_free (r->pool, documents); +} - if (c == ' ') - break; - else - name[j] = c; - } - name[j] = 0; +/* Read a type 7 extension record. */ +static void +read_extension_record (struct sfm_reader *r, struct dictionary *dict) +{ + int subtype = read_int32 (r); + size_t size = read_int32 (r); + size_t count = read_int32 (r); + size_t bytes = size * count; + + /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1 + allows an extra byte for a null terminator, used by some + extension processing routines. */ + if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size)))) + sys_error (r, "Record type 7 subtype %d too large.", subtype); + + switch (subtype) + { + case 3: + read_machine_int32_info (r, size, count); + return; + + case 4: + read_machine_flt64_info (r, size, count); + return; + + case 5: + /* Variable sets information. We don't use these yet. + They only apply to GUIs; see VARSETS on the APPLY + DICTIONARY command in SPSS documentation. */ + break; + + case 6: + /* DATE variable information. We don't use it yet, but we + should. */ + break; + + case 7: + /* Unknown purpose. */ + break; + + case 11: + read_display_parameters (r, size, count, dict); + return; + + case 13: + read_long_var_name_map (r, size, count, dict); + return; + + case 14: + read_long_string_map (r, size, count, dict); + return; + + case 16: + /* New in SPSS v14? Unknown purpose. */ + break; + + case 17: + /* Text field that defines variable attributes. New in + SPSS 14. */ + break; + + default: + sys_warn (r, _("Unrecognized record type 7, subtype %d."), subtype); + break; + } - if ( ! var_is_plausible_name(name, false) ) - lose ((ME, _("%s: Invalid variable name `%s' within system file."), - fh_get_file_name (r->fh), name)); + skip_bytes (r, bytes); +} - /* Create variable. */ - vv = (*var_by_idx)[i] = dict_create_var (dict, name, sv.type); - if (vv == NULL) - lose ((ME, _("%s: Duplicate variable name `%s' within system file."), - fh_get_file_name (r->fh), name)); +/* Read record type 7, subtype 3. */ +static void +read_machine_int32_info (struct sfm_reader *r, size_t size, size_t count) +{ + int version_major UNUSED = read_int32 (r); + int version_minor UNUSED = read_int32 (r); + int version_revision UNUSED = read_int32 (r); + int machine_code UNUSED = read_int32 (r); + int float_representation = read_int32 (r); + int compression_code UNUSED = read_int32 (r); + int integer_representation = read_int32 (r); + int character_code UNUSED = read_int32 (r); + + int expected_float_format; + int expected_integer_format; + + if (size != 4 || count != 8) + sys_error (r, _("Bad size (%d) or count (%d) field on record type 7, " + "subtype 3."), + size, count); + + /* Check floating point format. */ + if (r->float_format == FLOAT_IEEE_DOUBLE_BE + || r->float_format == FLOAT_IEEE_DOUBLE_LE) + expected_float_format = 1; + else if (r->float_format == FLOAT_Z_LONG) + expected_float_format = 2; + else if (r->float_format == FLOAT_VAX_G || r->float_format == FLOAT_VAX_D) + expected_float_format = 3; + else + NOT_REACHED (); + if (float_representation != expected_float_format) + sys_error (r, _("Floating-point representation indicated by " + "system file (%d) differs from expected (%d)."), + r->float_format, expected_float_format); + + /* Check integer format. */ + if (r->integer_format == INTEGER_MSB_FIRST) + expected_integer_format = 1; + else if (r->integer_format == INTEGER_LSB_FIRST) + expected_integer_format = 2; + else + NOT_REACHED (); + if (integer_representation != expected_integer_format) + { + static const char *endian[] = {N_("little-endian"), N_("big-endian")}; + sys_warn (r, _("Integer format indicated by system file (%s) " + "differs from expected (%s)."), + gettext (endian[integer_representation == 1]), + gettext (endian[expected_integer_format == 1])); + } +} - /* Set the short name the same as the long name */ - var_set_short_name (vv, var_get_name (vv)); +/* Read record type 7, subtype 4. */ +static void +read_machine_flt64_info (struct sfm_reader *r, size_t size, size_t count) +{ + double sysmis = read_flt64 (r); + double highest = read_flt64 (r); + double lowest = read_flt64 (r); + + if (size != 8 || count != 3) + sys_error (r, _("Bad size (%d) or count (%d) on extension 4."), + size, count); + + if (sysmis != SYSMIS) + sys_warn (r, _("File specifies unexpected value %g as SYSMIS."), sysmis); + if (highest != HIGHEST) + sys_warn (r, _("File specifies unexpected value %g as HIGHEST."), highest); + if (lowest != LOWEST) + sys_warn (r, _("File specifies unexpected value %g as LOWEST."), lowest); +} - /* Case reading data. */ - nv = sv.type == 0 ? 1 : DIV_RND_UP (sv.type, sizeof (flt64)); - long_string_count = nv - 1; - next_value += nv; +/* Read record type 7, subtype 11, which specifies how variables + should be displayed in GUI environments. */ +static void +read_display_parameters (struct sfm_reader *r, size_t size, size_t count, + struct dictionary *dict) +{ + const size_t n_vars = count / 3 ; + bool warned = false; + int i; - /* Get variable label, if any. */ - if (sv.has_var_label == 1) - { - /* Disk buffer. */ - int32_t len; - - /* Read length of label. */ - assertive_buf_read (r, &len, sizeof len, 0); - if (r->reverse_endian) - bswap_int32 (&len); - - /* Check len. */ - if (len < 0 || len > 255) - lose ((ME, _("%s: Variable %s indicates variable label of invalid " - "length %d."), - fh_get_file_name (r->fh), var_get_name (vv), len)); - - if ( len != 0 ) - { - /* Read label into variable structure. */ - char label[256]; - assertive_buf_read (r, label, ROUND_UP (len, sizeof (int32_t)), - 0); - label[len] = '\0'; - var_set_label (vv, label); - } - } + if (count % 3 || n_vars != dict_get_var_cnt (dict)) + sys_error (r, _("Bad size (%d) or count (%d) on extension 11."), + size, count); - /* Set missing values. */ - if (sv.n_missing_values != 0) - { - flt64 mv[3]; - int mv_cnt = abs (sv.n_missing_values); - struct missing_values miss; - - if (var_get_width (vv) > MAX_SHORT_STRING) - lose ((ME, _("%s: Long string variable %s may not have missing " - "values."), - fh_get_file_name (r->fh), var_get_name (vv))); - mv_init (&miss, var_get_width (vv)); - - assertive_buf_read (r, mv, sizeof *mv * mv_cnt, 0); - - if (r->reverse_endian && var_is_numeric (vv)) - for (j = 0; j < mv_cnt; j++) - bswap_flt64 (&mv[j]); - - if (sv.n_missing_values > 0) - { - for (j = 0; j < sv.n_missing_values; j++) - if (var_is_numeric (vv)) - mv_add_num (&miss, mv[j]); - else - mv_add_str (&miss, (char *) &mv[j]); - } - else - { - if (var_is_alpha (vv)) - lose ((ME, _("%s: String variable %s may not have missing " - "values specified as a range."), - fh_get_file_name (r->fh), var_get_name (vv))); - - if (mv[0] == r->lowest) - mv_add_num_range (&miss, LOWEST, mv[1]); - else if (mv[1] == r->highest) - mv_add_num_range (&miss, mv[0], HIGHEST); - else - mv_add_num_range (&miss, mv[0], mv[1]); - - if (sv.n_missing_values == -3) - mv_add_num (&miss, mv[2]); - } - var_set_missing_values (vv, &miss); - } + for (i = 0; i < n_vars; ++i) + { + int measure = read_int32 (r); + int width = read_int32 (r); + int align = read_int32 (r); + struct variable *v; - if (!parse_format_spec (r, sv.print, &print, vv) - || !parse_format_spec (r, sv.write, &write, vv)) - goto error; + if (!measure_is_valid (measure) || !alignment_is_valid (align)) + { + if (!warned) + sys_warn (r, _("Invalid variable display parameters. " + "Default parameters substituted.")); + warned = true; + continue; + } - var_set_print_format (vv, &print); - var_set_write_format (vv, &write); + v = dict_get_var (dict, i); + var_set_measure (v, measure); + var_set_display_width (v, width); + var_set_alignment (v, align); } +} - /* Some consistency checks. */ - if (long_string_count != 0) - lose ((ME, _("%s: Long string continuation records omitted at end of " - "dictionary."), - fh_get_file_name (r->fh))); - - if (next_value != r->value_cnt) - corrupt_msg(MW, _("%s: System file header indicates %d variable positions but " - "%d were read from file."), - fh_get_file_name (r->fh), r->value_cnt, next_value); - +/* Reads record type 7, subtype 13, which gives the long name + that corresponds to each short name. Modifies variable names + in DICT accordingly. */ +static void +read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count, + struct dictionary *dict) +{ + struct variable_to_value_map *map; + struct variable *var; + char *long_name; + int warning_cnt = 0; + + map = open_variable_to_value_map (r, size * count); + while (read_variable_to_value_map (r, dict, map, &var, &long_name, + &warning_cnt)) + { + char short_name[SHORT_NAME_LEN + 1]; + strcpy (short_name, var_get_short_name (var)); - return 1; + /* Validate long name. */ + if (!var_is_valid_name (long_name, false)) + { + sys_warn (r, _("Long variable mapping from %s to invalid " + "variable name `%s'."), + var_get_name (var), long_name); + continue; + } + + /* Identify any duplicates. */ + if (strcasecmp (short_name, long_name) + && dict_lookup_var (dict, long_name) != NULL) + { + sys_warn (r, _("Duplicate long variable name `%s' " + "within system file."), long_name); + continue; + } - error: - return 0; + /* Set long name. Renaming a variable may clear the short + name, but we want to retain it, so re-set it + explicitly. */ + dict_rename_var (dict, var, long_name); + var_set_short_name (var, short_name); + } + close_variable_to_value_map (r, map); } -/* Translates the format spec from sysfile format to internal - format. */ -static int -parse_format_spec (struct sfm_reader *r, int32_t s, - struct fmt_spec *f, const struct variable *v) +/* Reads record type 7, subtype 14, which gives the real length + of each very long string. Rearranges DICT accordingly. */ +static void +read_long_string_map (struct sfm_reader *r, size_t size, size_t count, + struct dictionary *dict) { - bool ok; - - if (!fmt_from_io ((s >> 16) & 0xff, &f->type)) - lose ((ME, _("%s: Bad format specifier byte (%d)."), - fh_get_file_name (r->fh), (s >> 16) & 0xff)); - f->w = (s >> 8) & 0xff; - f->d = s & 0xff; - - if (var_is_alpha (v) != fmt_is_string (f->type)) - lose ((ME, _("%s: %s variable %s has %s format specifier %s."), - fh_get_file_name (r->fh), - var_is_alpha (v) ? _("String") : _("Numeric"), - var_get_name (v), - fmt_is_string (f->type) ? _("string") : _("numeric"), - fmt_name (f->type))); + struct variable_to_value_map *map; + struct variable *var; + char *length_s; + int warning_cnt = 0; - msg_disable (); - ok = fmt_check_output (f) && fmt_check_width_compat (f, var_get_width (v)); - msg_enable (); - - if (!ok) + r->has_vls = true; + + map = open_variable_to_value_map (r, size * count); + while (read_variable_to_value_map (r, dict, map, &var, &length_s, + &warning_cnt)) { - char fmt_string[FMT_STRING_LEN_MAX + 1]; - msg (ME, _("%s variable %s has invalid format specifier %s."), - var_is_numeric (v) ? _("Numeric") : _("String"), - var_get_name (v), fmt_to_string (f, fmt_string)); - *f = (var_is_numeric (v) - ? fmt_for_output (FMT_F, 8, 2) - : fmt_for_output (FMT_A, var_get_width (v), 0)); - } - return 1; + long length, remaining_length; + size_t idx; - error: - return 0; + /* Get length. */ + length = strtol (length_s, NULL, 10); + if (length < MIN_VERY_LONG_STRING || length == LONG_MAX) + { + sys_warn (r, _("%s listed as string of length %s " + "in length table."), + var_get_name (var), length_s); + continue; + } + + /* Group multiple variables into single variable + and delete all but the first. */ + remaining_length = length; + for (idx = var_get_dict_index (var); remaining_length > 0; idx++) + if (idx < dict_get_var_cnt (dict)) + remaining_length -= MIN (var_get_width (dict_get_var (dict, idx)), + EFFECTIVE_LONG_STRING_LENGTH); + else + sys_error (r, _("Very long string %s overflows dictionary."), + var_get_name (var)); + dict_delete_consecutive_vars (dict, + var_get_dict_index (var) + 1, + idx - var_get_dict_index (var) - 1); + + /* Assign all the length to the first variable. */ + var_set_width (var, length); + } + close_variable_to_value_map (r, map); + dict_compact_values (dict); } /* Reads value labels from sysfile H and inserts them into the associated dictionary. */ -int +static void read_value_labels (struct sfm_reader *r, - struct dictionary *dict, struct variable **var_by_idx) + struct dictionary *dict, struct variable **var_by_value_idx) { + struct pool *subpool; + struct label - { - char raw_value[8]; /* Value as uninterpreted bytes. */ - union value value; /* Value. */ - char *label; /* Null-terminated label string. */ - }; + { + char raw_value[8]; /* Value as uninterpreted bytes. */ + union value value; /* Value. */ + char *label; /* Null-terminated label string. */ + }; struct label *labels = NULL; - int32_t n_labels; /* Number of labels. */ + int label_cnt; /* Number of labels. */ struct variable **var = NULL; /* Associated variables. */ - int32_t n_vars; /* Number of associated variables. */ + int var_cnt; /* Number of associated variables. */ int i; - /* First step: read the contents of the type 3 record and record its - contents. Note that we can't do much with the data since we - don't know yet whether it is of numeric or string type. */ + subpool = pool_create_subpool (r->pool); + + /* Read the type 3 record and record its contents. We can't do + much with the data yet because we don't know whether it is + of numeric or string type. */ /* Read number of labels. */ - assertive_buf_read (r, &n_labels, sizeof n_labels, 0); - if (r->reverse_endian) - bswap_int32 (&n_labels); + label_cnt = read_int32 (r); - if ( n_labels >= ((int32_t) ~0) / sizeof *labels) + if (label_cnt >= INT32_MAX / sizeof *labels) { - corrupt_msg(MW, _("%s: Invalid number of labels: %d. Ignoring labels."), - fh_get_file_name (r->fh), n_labels); - n_labels = 0; + sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."), + label_cnt); + label_cnt = 0; } - /* Allocate memory. */ - labels = xcalloc (n_labels, sizeof *labels); - for (i = 0; i < n_labels; i++) - labels[i].label = NULL; - /* Read each value/label tuple into labels[]. */ - for (i = 0; i < n_labels; i++) + labels = pool_nalloc (subpool, label_cnt, sizeof *labels); + for (i = 0; i < label_cnt; i++) { struct label *label = labels + i; unsigned char label_len; size_t padded_len; /* Read value. */ - assertive_buf_read (r, label->raw_value, sizeof label->raw_value, 0); + read_bytes (r, label->raw_value, sizeof label->raw_value); /* Read label length. */ - assertive_buf_read (r, &label_len, sizeof label_len, 0); - padded_len = ROUND_UP (label_len + 1, sizeof (flt64)); + read_bytes (r, &label_len, sizeof label_len); + padded_len = ROUND_UP (label_len + 1, 8); /* Read label, padding. */ - label->label = xmalloc (padded_len + 1); - assertive_buf_read (r, label->label, padded_len - 1, 0); + label->label = pool_alloc (subpool, padded_len + 1); + read_bytes (r, label->label, padded_len - 1); label->label[label_len] = 0; } - /* Second step: Read the type 4 record that has the list of - variables to which the value labels are to be applied. */ + /* Now, read the type 4 record that has the list of variables + to which the value labels are to be applied. */ /* Read record type of type 4 record. */ - { - int32_t rec_type; - - assertive_buf_read (r, &rec_type, sizeof rec_type, 0); - if (r->reverse_endian) - bswap_int32 (&rec_type); - - if (rec_type != 4) - lose ((ME, _("%s: Variable index record (type 4) does not immediately " - "follow value label record (type 3) as it should."), - fh_get_file_name (r->fh))); - } + if (read_int32 (r) != 4) + sys_error (r, _("Variable index record (type 4) does not immediately " + "follow value label record (type 3) as it should.")); /* Read number of variables associated with value label from type 4 record. */ - assertive_buf_read (r, &n_vars, sizeof n_vars, 0); - if (r->reverse_endian) - bswap_int32 (&n_vars); - if (n_vars < 1 || n_vars > dict_get_var_cnt (dict)) - lose ((ME, _("%s: Number of variables associated with a value label (%d) " - "is not between 1 and the number of variables (%d)."), - fh_get_file_name (r->fh), n_vars, dict_get_var_cnt (dict))); + var_cnt = read_int32 (r); + if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict)) + sys_error (r, _("Number of variables associated with a value label (%d) " + "is not between 1 and the number of variables (%d)."), + var_cnt, dict_get_var_cnt (dict)); /* Read the list of variables. */ - var = xnmalloc (n_vars, sizeof *var); - for (i = 0; i < n_vars; i++) + var = pool_nalloc (subpool, var_cnt, sizeof *var); + for (i = 0; i < var_cnt; i++) { - int32_t var_idx; - struct variable *v; - - /* Read variable index, check range. */ - assertive_buf_read (r, &var_idx, sizeof var_idx, 0); - if (r->reverse_endian) - bswap_int32 (&var_idx); - if (var_idx < 1 || var_idx > r->value_cnt) - lose ((ME, _("%s: Variable index associated with value label (%d) is " - "not between 1 and the number of values (%d)."), - fh_get_file_name (r->fh), var_idx, r->value_cnt)); - - /* Make sure it's a real variable. */ - v = var_by_idx[var_idx - 1]; - if (v == NULL) - lose ((ME, _("%s: Variable index associated with value label (%d) " - "refers to a continuation of a string variable, not to " - "an actual variable."), - fh_get_file_name (r->fh), var_idx)); - if (var_is_long_string (v)) - lose ((ME, _("%s: Value labels are not allowed on long string " - "variables (%s)."), - fh_get_file_name (r->fh), var_get_name (v))); - - /* Add it to the list of variables. */ - var[i] = v; + var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int32 (r)); + if (var_is_long_string (var[i])) + sys_error (r, _("Value labels are not allowed on long string " + "variables (%s)."), var_get_name (var[i])); } /* Type check the variables. */ - for (i = 1; i < n_vars; i++) + for (i = 1; i < var_cnt; i++) if (var_get_type (var[i]) != var_get_type (var[0])) - lose ((ME, _("%s: Variables associated with value label are not all of " - "identical type. Variable %s has %s type, but variable " - "%s has %s type."), - fh_get_file_name (r->fh), - var_get_name (var[0]), - var_is_alpha (var[0]) ? _("string") : _("numeric"), - var_get_name (var[i]), - var_is_alpha (var[i]) ? _("string") : _("numeric"))); + sys_error (r, _("Variables associated with value label are not all of " + "identical type. Variable %s is %s, but variable " + "%s is %s."), + var_get_name (var[0]), + var_is_numeric (var[0]) ? _("numeric") : _("string"), + var_get_name (var[i]), + var_is_numeric (var[i]) ? _("numeric") : _("string")); /* Fill in labels[].value, now that we know the desired type. */ - for (i = 0; i < n_labels; i++) + for (i = 0; i < label_cnt; i++) { struct label *label = labels + i; if (var_is_alpha (var[0])) - { - const int copy_len = MIN (sizeof label->raw_value, - sizeof label->label); - memcpy (label->value.s, label->raw_value, copy_len); - } else { - flt64 f; - assert (sizeof f == sizeof label->raw_value); - memcpy (&f, label->raw_value, sizeof f); - if (r->reverse_endian) - bswap_flt64 (&f); - label->value.f = f; - } + buf_copy_rpad (label->value.s, sizeof label->value.s, + label->raw_value, sizeof label->raw_value); + else + label->value.f = flt64_to_double (r, (uint8_t *) label->raw_value); } - /* Assign the value_label's to each variable. */ - for (i = 0; i < n_vars; i++) + /* Assign the `value_label's to each variable. */ + for (i = 0; i < var_cnt; i++) { struct variable *v = var[i]; int j; /* Add each label to the variable. */ - for (j = 0; j < n_labels; j++) + for (j = 0; j < label_cnt; j++) { - struct label *label = labels + j; - if (var_add_value_label (v, &label->value, label->label)) - continue; - - if (var_is_numeric (var[0])) - msg (MW, _("%s: File contains duplicate label for value %g for " - "variable %s."), - fh_get_file_name (r->fh), label->value.f, var_get_name (v)); - else - msg (MW, _("%s: File contains duplicate label for value `%.*s' " - "for variable %s."), - fh_get_file_name (r->fh), var_get_width (v), - label->value.s, var_get_name (v)); + struct label *label = &labels[j]; + if (!var_add_value_label (v, &label->value, label->label)) + { + if (var_is_numeric (var[0])) + sys_warn (r, _("Duplicate value label for %g on %s."), + label->value.f, var_get_name (v)); + else + sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."), + var_get_width (v), label->value.s, + var_get_name (v)); + } } } - for (i = 0; i < n_labels; i++) - free (labels[i].label); - free (labels); - free (var); - return 1; + pool_destroy (subpool); +} + +/* Case reader. */ + +static void partial_record (struct sfm_reader *r) + NO_RETURN; +static bool read_case_number (struct sfm_reader *, double *); +static bool read_case_string (struct sfm_reader *, char *, size_t); +static int read_opcode (struct sfm_reader *); +static bool read_compressed_number (struct sfm_reader *, double *); +static bool read_compressed_string (struct sfm_reader *, char *); +static bool read_whole_strings (struct sfm_reader *, char *, size_t); + +/* Reads one case from READER's file into C. Returns nonzero + only if successful. */ +int +sfm_read_case (struct sfm_reader *r, struct ccase *c) +{ + if (r->error) + return 0; + + if (setjmp (r->bail_out)) + return 0; - error: - if (labels) + if (!r->compressed && sizeof (double) == 8 && !r->has_vls) + { + /* Fast path. Read the whole case directly. */ + if (!try_read_bytes (r, case_data_all_rw (c), + sizeof (union value) * r->value_cnt)) + return 0; + + /* Convert floating point numbers to native format if needed. */ + if (r->float_format != FLOAT_NATIVE_DOUBLE) + { + int i; + + for (i = 0; i < r->var_cnt; i++) + if (r->vars[i].width == 0) + { + double *d = &case_data_rw_idx (c, r->vars[i].case_index)->f; + float_convert (r->float_format, d, FLOAT_NATIVE_DOUBLE, d); + } + } + return 1; + } + else { - for (i = 0; i < n_labels; i++) - free (labels[i].label); - free (labels); + /* Slow path. Convert from external to internal format. */ + int i; + + for (i = 0; i < r->var_cnt; i++) + { + struct sfm_var *sv = &r->vars[i]; + union value *v = case_data_rw_idx (c, sv->case_index); + + if (sv->width == 0) + { + if (!read_case_number (r, &v->f)) + goto eof; + } + else + { + /* Read the string data in segments up to 255 bytes + at a time, packed into 8-byte units. */ + const int max_chunk = MIN_VERY_LONG_STRING - 1; + int ofs, chunk_size; + for (ofs = 0; ofs < sv->width; ofs += chunk_size) + { + chunk_size = MIN (max_chunk, sv->width - ofs); + if (!read_case_string (r, v->s + ofs, chunk_size)) + { + if (ofs) + partial_record (r); + goto eof; + } + } + + /* Very long strings have trailing wasted space + that we must skip. */ + if (sv->width >= MIN_VERY_LONG_STRING) + { + int bytes_read = (sv->width / max_chunk * 256 + + ROUND_UP (sv->width % max_chunk, 8)); + int total_bytes = sfm_width_to_bytes (sv->width); + int excess_bytes = total_bytes - bytes_read; + + while (excess_bytes > 0) + { + char buffer[1024]; + size_t chunk = MIN (sizeof buffer, excess_bytes); + if (!read_whole_strings (r, buffer, chunk)) + partial_record (r); + excess_bytes -= chunk; + } + } + } + } + return 1; + + eof: + if (i != 0) + partial_record (r); + return 0; } - free (var); - return 0; } -/* Reads BYTE_CNT bytes from the file represented by H. If BUF is - non-NULL, uses that as the buffer; otherwise allocates at least - MIN_ALLOC bytes. Returns a pointer to the buffer on success, NULL - on failure. */ -static void * -buf_read (struct sfm_reader *r, void *buf, size_t byte_cnt, size_t min_alloc) +/* Issues an error that R ends in a partial record. */ +static void +partial_record (struct sfm_reader *r) { - assert (r); - - if (buf == NULL && byte_cnt > 0 ) - buf = xmalloc (MAX (byte_cnt, min_alloc)); + sys_error (r, _("File ends in partial case.")); +} - if ( byte_cnt == 0 ) - return buf; +/* Reads a number from R and stores its value in *D. + If R is compressed, reads a compressed number; + otherwise, reads a number in the regular way. + Returns true if successful, false if end of file is + reached immediately. */ +static bool +read_case_number (struct sfm_reader *r, double *d) +{ + if (!r->compressed) + { + uint8_t flt64[8]; + if (!try_read_bytes (r, flt64, sizeof flt64)) + return false; + *d = flt64_to_double (r, flt64); + return true; + } + else + return read_compressed_number (r, d); +} +/* Reads LENGTH string bytes from R into S. + Always reads a multiple of 8 bytes; if LENGTH is not a + multiple of 8, then extra bytes are read and discarded without + being written to S. + Reads compressed strings if S is compressed. + Returns true if successful, false if end of file is + reached immediately. */ +static bool +read_case_string (struct sfm_reader *r, char *s, size_t length) +{ + size_t whole = ROUND_DOWN (length, 8); + size_t partial = length % 8; - if (1 != fread (buf, byte_cnt, 1, r->file)) + if (whole) { - if (ferror (r->file)) - msg (ME, _("%s: Reading system file: %s."), - fh_get_file_name (r->fh), strerror (errno)); - else - corrupt_msg (ME, _("%s: Unexpected end of file."), - fh_get_file_name (r->fh)); - r->ok = false; - return NULL; + if (!read_whole_strings (r, s, whole)) + return false; + } + + if (partial) + { + char bounce[8]; + if (!read_whole_strings (r, bounce, sizeof bounce)) + { + if (whole) + partial_record (r); + return false; + } + memcpy (s + whole, bounce, partial); } - return buf; + return true; } -/* Winds the reader BYTE_CNT bytes back in the reader stream. */ -void -buf_unread(struct sfm_reader *r, size_t byte_cnt) +/* Reads and returns the next compression opcode from R. */ +static int +read_opcode (struct sfm_reader *r) { - assert(byte_cnt > 0); + assert (r->compressed); + for (;;) + { + int opcode; + if (r->opcode_idx >= sizeof r->opcodes) + { + if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes)) + return -1; + r->opcode_idx = 0; + } + opcode = r->opcodes[r->opcode_idx++]; + + if (opcode != 0) + return opcode; + } +} - if ( 0 != fseek(r->file, -byte_cnt, SEEK_CUR)) +/* Reads a compressed number from R and stores its value in D. + Returns true if successful, false if end of file is + reached immediately. */ +static bool +read_compressed_number (struct sfm_reader *r, double *d) +{ + int opcode = read_opcode (r); + switch (opcode) { - msg (ME, _("%s: Seeking system file: %s."), - fh_get_file_name (r->fh), strerror (errno)); + case -1: + case 252: + return false; + + case 253: + *d = read_flt64 (r); + break; + + case 254: + sys_error (r, _("Compressed data is corrupt.")); + + case 255: + *d = SYSMIS; + break; + + default: + *d = opcode - r->bias; + break; } + + return true; } -/* Reads a document record, type 6, from system file R, and sets up - the documents and n_documents fields in the associated - dictionary. */ -static int -read_documents (struct sfm_reader *r, struct dictionary *dict) +/* Reads a compressed 8-byte string segment from R and stores it + in DST. + Returns true if successful, false if end of file is + reached immediately. */ +static bool +read_compressed_string (struct sfm_reader *r, char *dst) { - int32_t line_cnt; - char *documents; + switch (read_opcode (r)) + { + case -1: + case 252: + return false; - if (dict_get_documents (dict) != NULL) - lose ((ME, _("%s: System file contains multiple " - "type 6 (document) records."), - fh_get_file_name (r->fh))); + case 253: + read_bytes (r, dst, 8); + break; - assertive_buf_read (r, &line_cnt, sizeof line_cnt, 0); - if (line_cnt <= 0) - lose ((ME, _("%s: Number of document lines (%ld) " - "must be greater than 0."), - fh_get_file_name (r->fh), (long) line_cnt)); + case 254: + memset (dst, ' ', 8); + break; - documents = buf_read (r, NULL, 80 * line_cnt, line_cnt * 80 + 1); - /* FIXME? Run through asciify. */ - if (documents == NULL) - return 0; - documents[80 * line_cnt] = '\0'; - dict_set_documents (dict, documents); - free (documents); - return 1; + default: + sys_error (r, _("Compressed data is corrupt.")); + } - error: - return 0; + return true; } - -/* Data reader. */ -/* Reads compressed data into H->BUF and sets other pointers - appropriately. Returns nonzero only if both no errors occur and - data was read. */ -static int -buffer_input (struct sfm_reader *r) +/* Reads LENGTH string bytes from R into S. + LENGTH must be a multiple of 8. + Reads compressed strings if S is compressed. + Returns true if successful, false if end of file is + reached immediately. */ +static bool +read_whole_strings (struct sfm_reader *r, char *s, size_t length) { - size_t amt; + assert (length % 8 == 0); + if (!r->compressed) + return try_read_bytes (r, s, length); + else + { + size_t ofs; + for (ofs = 0; ofs < length; ofs += 8) + if (!read_compressed_string (r, s + ofs)) + { + if (ofs != 0) + partial_record (r); + return false; + } + return true; + } +} + +/* Creates and returns a table that can be used for translating a value + index into a case to a "struct variable *" for DICT. Multiple + system file fields reference variables this way. + + This table must be created before processing the very long + string extension record, because that record causes some + values to be deleted from the case and the dictionary to be + compacted. */ +static struct variable ** +make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict) +{ + struct variable **var_by_value_idx; + int value_idx = 0; + int i; - if (!r->ok) - return false; - if (r->buf == NULL) - r->buf = xnmalloc (128, sizeof *r->buf); - amt = fread (r->buf, sizeof *r->buf, 128, r->file); - if (ferror (r->file)) + var_by_value_idx = pool_nmalloc (r->pool, + r->value_cnt, sizeof *var_by_value_idx); + for (i = 0; i < dict_get_var_cnt (dict); i++) { - msg (ME, _("%s: Error reading file: %s."), - fh_get_file_name (r->fh), strerror (errno)); - r->ok = false; - return 0; + struct variable *v = dict_get_var (dict, i); + int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8); + int j; + + var_by_value_idx[value_idx++] = v; + for (j = 1; j < nv; j++) + var_by_value_idx[value_idx++] = NULL; } - r->ptr = r->buf; - r->end = &r->buf[amt]; - return amt; + assert (value_idx == r->value_cnt); + + return var_by_value_idx; } -/* Reads a single case consisting of compressed data from system - file H into the array BUF[] according to reader R, and - returns nonzero only if successful. */ -/* Data in system files is compressed in this manner. Data - values are grouped into sets of eight ("octets"). Each value - in an octet has one instruction byte that are output together. - Each instruction byte gives a value for that byte or indicates - that the value can be found following the instructions. */ -static int -read_compressed_data (struct sfm_reader *r, flt64 *buf) +/* Returns the "struct variable" corresponding to the given + 1-basd VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index + is valid. */ +static struct variable * +lookup_var_by_value_idx (struct sfm_reader *r, + struct variable **var_by_value_idx, int value_idx) { - const unsigned char *p_end = r->x + sizeof (flt64); - unsigned char *p = r->y; + struct variable *var; + + if (value_idx < 1 || value_idx > r->value_cnt) + sys_error (r, _("Variable index %d not in valid range 1...%d."), + value_idx, r->value_cnt); - const flt64 *buf_beg = buf; - const flt64 *buf_end = &buf[r->value_cnt]; + var = var_by_value_idx[value_idx - 1]; + if (var == NULL) + sys_error (r, _("Variable index %d refers to long string " + "continuation."), + value_idx); - for (;;) + return var; +} + +/* Returns the variable in D with the given SHORT_NAME, + or a null pointer if there is none. */ +static struct variable * +lookup_var_by_short_name (struct dictionary *d, const char *short_name) +{ + struct variable *var; + size_t var_cnt; + size_t i; + + /* First try looking up by full name. This often succeeds. */ + var = dict_lookup_var (d, short_name); + if (var != NULL && !strcasecmp (var_get_short_name (var), short_name)) + return var; + + /* Iterate through the whole dictionary as a fallback. */ + var_cnt = dict_get_var_cnt (d); + for (i = 0; i < var_cnt; i++) { - for (; p < p_end; p++){ - switch (*p) - { - case 0: - /* Code 0 is ignored. */ - continue; - case 252: - /* Code 252 is end of file. */ - if (buf_beg == buf) - return 0; - lose ((ME, _("%s: Compressed data is corrupted. Data ends " - "in partial case."), - fh_get_file_name (r->fh))); - case 253: - /* Code 253 indicates that the value is stored explicitly - following the instruction bytes. */ - if (r->ptr == NULL || r->ptr >= r->end) - if (!buffer_input (r)) - lose ((ME, _("%s: Unexpected end of file."), - fh_get_file_name (r->fh))); - memcpy (buf++, r->ptr++, sizeof *buf); - if (buf >= buf_end) - goto success; - break; - case 254: - /* Code 254 indicates a string that is all blanks. */ - memset (buf++, ' ', sizeof *buf); - if (buf >= buf_end) - goto success; - break; - case 255: - /* Code 255 indicates the system-missing value. */ - *buf = r->sysmis; - if (r->reverse_endian) - bswap_flt64 (buf); - buf++; - if (buf >= buf_end) - goto success; - break; - default: - /* Codes 1 through 251 inclusive are taken to indicate a - value of (BYTE - BIAS), where BYTE is the byte's value - and BIAS is the compression bias (generally 100.0). */ - *buf = *p - r->bias; - if (r->reverse_endian) - bswap_flt64 (buf); - buf++; - if (buf >= buf_end) - goto success; - break; - } - } - /* We have reached the end of this instruction octet. Read - another. */ - if (r->ptr == NULL || r->ptr >= r->end) - { - if (!buffer_input (r)) - { - if (buf_beg != buf) - lose ((ME, _("%s: Unexpected end of file."), - fh_get_file_name (r->fh))); - else - return 0; - } - } - memcpy (r->x, r->ptr++, sizeof *buf); - p = r->x; + var = dict_get_var (d, i); + if (!strcasecmp (var_get_short_name (var), short_name)) + return var; } - NOT_REACHED (); + return NULL; +} + +/* Helpers for reading records that contain "variable=value" + pairs. */ - success: - /* We have filled up an entire record. Update state and return - successfully. */ - r->y = ++p; - return 1; +/* State. */ +struct variable_to_value_map + { + struct substring buffer; /* Record contents. */ + size_t pos; /* Current position in buffer. */ + }; - error: - /* I/O error. */ - r->ok = false; - return 0; +/* Reads SIZE bytes into a "variable=value" map for R, + and returns the map. */ +static struct variable_to_value_map * +open_variable_to_value_map (struct sfm_reader *r, size_t size) +{ + struct variable_to_value_map *map = pool_alloc (r->pool, sizeof *map); + char *buffer = pool_malloc (r->pool, size + 1); + read_bytes (r, buffer, size); + map->buffer = ss_buffer (buffer, size); + map->pos = 0; + return map; } -/* Reads one case from READER's file into C. Returns nonzero - only if successful. */ -int -sfm_read_case (struct sfm_reader *r, struct ccase *c) +/* Closes MAP and frees its storage. + Not really needed, because the pool will free the map anyway, + but can be used to free it earlier. */ +static void +close_variable_to_value_map (struct sfm_reader *r, + struct variable_to_value_map *map) { - if (!r->ok) - return 0; + pool_free (r->pool, ss_data (map->buffer)); +} - if (!r->compressed && sizeof (flt64) == sizeof (double) && ! r->has_vls) +/* Reads the next variable=value pair from MAP. + Looks up the variable in DICT and stores it into *VAR. + Stores a null-terminated value into *VALUE. */ +static bool +read_variable_to_value_map (struct sfm_reader *r, struct dictionary *dict, + struct variable_to_value_map *map, + struct variable **var, char **value, + int *warning_cnt) +{ + int max_warnings = 5; + + for (;;) { - /* Fast path: external and internal representations are the - same, except possibly for endianness or SYSMIS. Read - directly into the case's buffer, then fix up any minor - details as needed. */ - if (!fread_ok (r, case_data_all_rw (c), - sizeof (union value) * r->value_cnt)) - return 0; + struct substring short_name_ss, value_ss; - /* Fix up endianness if needed. */ - if (r->reverse_endian) + if (!ss_tokenize (map->buffer, ss_cstr ("="), &map->pos, &short_name_ss) + || !ss_tokenize (map->buffer, ss_buffer ("\t\0", 2), &map->pos, + &value_ss)) { - int i; - - for (i = 0; i < r->var_cnt; i++) - if (r->vars[i].width == 0) - bswap_flt64 (&case_data_rw_idx (c, r->vars[i].fv)->f); + if (*warning_cnt > max_warnings) + sys_warn (r, _("Suppressed %d additional variable map warnings."), + *warning_cnt - max_warnings); + return false; } + + map->pos += ss_span (ss_substr (map->buffer, map->pos, SIZE_MAX), + ss_buffer ("\t\0", 2)); - /* Fix up SYSMIS values if needed. - I don't think this will ever actually kick in, but it - can't hurt. */ - if (r->sysmis != SYSMIS) + ss_data (short_name_ss)[ss_length (short_name_ss)] = '\0'; + *var = lookup_var_by_short_name (dict, ss_data (short_name_ss)); + if (*var == NULL) { - int i; - - for (i = 0; i < r->var_cnt; i++) - if (r->vars[i].width == 0 && case_num_idx (c, i) == r->sysmis) - case_data_rw_idx (c, r->vars[i].fv)->f = SYSMIS; + if (++*warning_cnt <= 5) + sys_warn (r, _("Variable map refers to unknown variable %s."), + ss_data (short_name_ss)); + continue; } + + ss_data (value_ss)[ss_length (value_ss)] = '\0'; + *value = ss_data (value_ss); + + return true; } - else - { - /* Slow path: internal and external representations differ. - Read into a bounce buffer, then copy to C. */ - flt64 *bounce; - flt64 *bounce_cur; - size_t bounce_size; - int read_ok; - int i; +} + +/* Messages. */ - bounce_size = sizeof *bounce * r->value_cnt; - bounce = bounce_cur = local_alloc (bounce_size); +/* Displays a corruption message. */ +static void +sys_msg (struct sfm_reader *r, int class, const char *format, va_list args) +{ + struct msg m; + struct string text; - memset(bounce, 0, bounce_size); + ds_init_empty (&text); + ds_put_format (&text, "\"%s\" near offset 0x%lx: ", + fh_get_file_name (r->fh), (unsigned long) ftell (r->file)); + ds_put_vformat (&text, format, args); - if (!r->compressed) - read_ok = fread_ok (r, bounce, bounce_size); - else - read_ok = read_compressed_data (r, bounce); - if (!read_ok) - { - local_free (bounce); - return 0; - } + m.category = msg_class_to_category (class); + m.severity = msg_class_to_severity (class); + m.where.file_name = NULL; + m.where.line_number = 0; + m.text = ds_cstr (&text); - for (i = 0; i < r->var_cnt; i++) - { - struct sfm_var *sv = &r->vars[i]; + msg_emit (&m); +} - if (sv->width == 0) - { - flt64 f = *bounce_cur++; - if (r->reverse_endian) - bswap_flt64 (&f); - case_data_rw_idx (c, sv->fv)->f = f == r->sysmis ? SYSMIS : f; - } - else - { - flt64 *bc_start = bounce_cur; - int ofs = 0; - while (ofs < sv->width ) - { - const int chunk = MIN (MIN_VERY_LONG_STRING - 1, - sv->width - ofs); - memcpy (case_data_rw_idx (c, sv->fv)->s + ofs, - bounce_cur, chunk); +/* Displays a warning for the current file position. */ +static void +sys_warn (struct sfm_reader *r, const char *format, ...) +{ + va_list args; + + va_start (args, format); + sys_msg (r, MW, format, args); + va_end (args); +} - bounce_cur += DIV_RND_UP (chunk, sizeof (flt64)); +/* Displays an error for the current file position, + marks it as in an error state, + and aborts reading it using longjmp. */ +static void +sys_error (struct sfm_reader *r, const char *format, ...) +{ + va_list args; + + va_start (args, format); + sys_msg (r, ME, format, args); + va_end (args); - ofs += chunk; - } - bounce_cur = bc_start + sfm_width_to_bytes (sv->width) / sizeof(flt64); - } - } + r->error = true; + longjmp (r->bail_out, 1); +} + +/* Reads BYTE_CNT bytes into BUF. + Returns true if exactly BYTE_CNT bytes are successfully read. + Aborts if an I/O error or a partial read occurs. + If EOF_IS_OK, then an immediate end-of-file causes false to be + returned; otherwise, immediate end-of-file causes an abort + too. */ +static inline bool +read_bytes_internal (struct sfm_reader *r, bool eof_is_ok, + void *buf, size_t byte_cnt) +{ + size_t bytes_read = fread (buf, 1, byte_cnt, r->file); + if (bytes_read == byte_cnt) + return true; + else if (ferror (r->file)) + sys_error (r, _("System error: %s."), strerror (errno)); + else if (!eof_is_ok || bytes_read != 0) + sys_error (r, _("Unexpected end of file.")); + else + return false; +} - local_free (bounce); - } - return 1; +/* Reads BYTE_CNT into BUF. + Aborts upon I/O error or if end-of-file is encountered. */ +static void +read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) +{ + read_bytes_internal (r, false, buf, byte_cnt); } -static int -fread_ok (struct sfm_reader *r, void *buffer, size_t byte_cnt) +/* Reads BYTE_CNT bytes into BUF. + Returns true if exactly BYTE_CNT bytes are successfully read. + Returns false if an immediate end-of-file is encountered. + Aborts if an I/O error or a partial read occurs. */ +static bool +try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) +{ + return read_bytes_internal (r, true, buf, byte_cnt); +} + +/* Reads a 32-bit signed integer from R and returns its value in + host format. */ +static int32_t +read_int32 (struct sfm_reader *r) { - size_t read_bytes = fread (buffer, 1, byte_cnt, r->file); + uint8_t int32[4]; + read_bytes (r, int32, sizeof int32); + return int32_to_native (r, int32); +} - if (read_bytes == byte_cnt) - return 1; - else +/* Reads a 64-bit floating-point number from R and returns its + value in host format. */ +static double +read_flt64 (struct sfm_reader *r) +{ + uint8_t flt64[8]; + read_bytes (r, flt64, sizeof flt64); + return flt64_to_double (r, flt64); +} + +/* Reads exactly SIZE - 1 bytes into BUFFER + and stores a null byte into BUFFER[SIZE - 1]. */ +static void +read_string (struct sfm_reader *r, char *buffer, size_t size) +{ + assert (size > 0); + read_bytes (r, buffer, size - 1); + buffer[size - 1] = '\0'; +} + +/* Skips BYTES bytes forward in R. */ +static void +skip_bytes (struct sfm_reader *r, size_t bytes) +{ + while (bytes > 0) { - if (ferror (r->file)) - { - msg (ME, _("%s: Reading system file: %s."), - fh_get_file_name (r->fh), strerror (errno)); - r->ok = false; - } - else if (read_bytes != 0) - { - msg (ME, _("%s: Partial record at end of system file."), - fh_get_file_name (r->fh)); - r->ok = false; - } - return 0; + char buffer[1024]; + size_t chunk = MIN (sizeof buffer, bytes); + read_bytes (r, buffer, chunk); + bytes -= chunk; } } -/* Returns true if an I/O error has occurred on READER, false - otherwise. */ -bool -sfm_read_error (const struct sfm_reader *reader) +/* Returns the value of the 32-bit signed integer at INT32, + converted from the format used by R to the host format. */ +static int32_t +int32_to_native (const struct sfm_reader *r, const uint8_t int32[4]) { - return !reader->ok; + int32_t x; + if (r->integer_format == INTEGER_NATIVE) + memcpy (&x, int32, sizeof x); + else + x = integer_get (r->integer_format, int32, sizeof x); + return x; } -/* Returns true if FILE is an SPSS system file, - false otherwise. */ -bool -sfm_detect (FILE *file) +/* Returns the value of the 64-bit floating point number at + FLT64, converted from the format used by R to the host + format. */ +static double +flt64_to_double (const struct sfm_reader *r, const uint8_t flt64[8]) { - struct sysfile_header hdr; - - if (fread (&hdr, sizeof hdr, 1, file) != 1) - return false; - if (strncmp ("$FL2", hdr.rec_type, 4)) - return false; - return true; + double x; + if (r->float_format == FLOAT_NATIVE_DOUBLE) + memcpy (&x, flt64, sizeof x); + else + float_convert (r->float_format, flt64, FLOAT_NATIVE_DOUBLE, &x); + return x; } diff --git a/src/data/sys-file-reader.h b/src/data/sys-file-reader.h index 68e6e84e..92a30482 100644 --- a/src/data/sys-file-reader.h +++ b/src/data/sys-file-reader.h @@ -23,6 +23,9 @@ #include #include +#include +#include + /* Reading system files. */ /* System file info that doesn't fit in struct dictionary. */ @@ -30,8 +33,9 @@ struct sfm_read_info { char creation_date[10]; /* `dd mmm yy' plus a null. */ char creation_time[9]; /* `hh:mm:ss' plus a null. */ - int big_endian; /* 1=big-endian, 0=little-endian. */ - int compressed; /* 0=no, 1=yes. */ + enum integer_format integer_format; + enum float_format float_format; + bool compressed; /* 0=no, 1=yes. */ int case_cnt; /* -1 if unknown. */ char product[61]; /* Product name plus a null. */ }; diff --git a/src/data/sys-file-writer.c b/src/data/sys-file-writer.c index ddfb4ae7..e923b9d5 100644 --- a/src/data/sys-file-writer.c +++ b/src/data/sys-file-writer.c @@ -20,7 +20,6 @@ #include #include "sys-file-writer.h" -#include "sfm-private.h" #include "sys-file-private.h" #include @@ -54,6 +53,60 @@ #include "gettext.h" #define _(msgid) gettext (msgid) +/* Find 64-bit floating-point type. */ +#if SIZEOF_FLOAT == 8 + #define flt64 float + #define FLT64_MAX FLT_MAX +#elif SIZEOF_DOUBLE == 8 + #define flt64 double + #define FLT64_MAX DBL_MAX +#elif SIZEOF_LONG_DOUBLE == 8 + #define flt64 long double + #define FLT64_MAX LDBL_MAX +#else + #error Which one of your basic types is 64-bit floating point? +#endif + +/* Figure out SYSMIS value for flt64. */ +#include +#if SIZEOF_DOUBLE == 8 +#define second_lowest_flt64 second_lowest_value +#else +#error Must define second_lowest_flt64 for your architecture. +#endif + +/* Record Type 1: General Information. */ +struct sysfile_header + { + char rec_type[4] ; /* 00: Record-type code, "$FL2". */ + char prod_name[60] ; /* 04: Product identification. */ + int32_t layout_code ; /* 40: 2. */ + int32_t nominal_case_size ; /* 44: Number of `value's per case. + Note: some systems set this to -1 */ + int32_t compress ; /* 48: 1=compressed, 0=not compressed. */ + int32_t weight_idx ; /* 4c: 1-based index of weighting var, or 0. */ + int32_t case_cnt ; /* 50: Number of cases, -1 if unknown. */ + flt64 bias ; /* 54: Compression bias (100.0). */ + char creation_date[9] ; /* 5c: `dd mmm yy' creation date of file. */ + char creation_time[8] ; /* 65: `hh:mm:ss' 24-hour creation time. */ + char file_label[64] ; /* 6d: File label. */ + char padding[3] ; /* ad: Ignored padding. */ + } ATTRIBUTE((packed)) ; + +/* Record Type 2: Variable. */ +struct sysfile_variable + { + int32_t rec_type ; /* 2. */ + int32_t type ; /* 0=numeric, 1-255=string width, + -1=continued string. */ + int32_t has_var_label ; /* 1=has a variable label, 0=doesn't. */ + int32_t n_missing_values ; /* Missing value code of -3,-2,0,1,2, or 3. */ + int32_t print ; /* Print format. */ + int32_t write ; /* Write format. */ + char name[SHORT_NAME_LEN] ; /* Variable name. */ + /* The rest of the structure varies. */ + } ATTRIBUTE((packed)) ; + /* Compression bias used by PSPP. Values between (1 - COMPRESSION_BIAS) and (251 - COMPRESSION_BIAS) inclusive can be compressed. */ diff --git a/src/language/dictionary/ChangeLog b/src/language/dictionary/ChangeLog index e2d7d80f..0d6bf8d3 100644 --- a/src/language/dictionary/ChangeLog +++ b/src/language/dictionary/ChangeLog @@ -1,3 +1,8 @@ +Sun Dec 10 13:55:58 2006 Ben Pfaff + + * sys-file-info.c (cmd_sysfile_info): Report floating-point format + used in system file. + Sat Dec 9 18:44:26 2006 Ben Pfaff * variable-label.c: Move to src/data/variable.c. diff --git a/src/language/dictionary/sys-file-info.c b/src/language/dictionary/sys-file-info.c index 47d151ee..ef3c0da2 100644 --- a/src/language/dictionary/sys-file-info.c +++ b/src/language/dictionary/sys-file-info.c @@ -105,7 +105,7 @@ cmd_sysfile_info (struct lexer *lexer, struct dataset *ds UNUSED) return CMD_FAILURE; sfm_close_reader (reader); - t = tab_create (2, 9, 0); + t = tab_create (2, 10, 0); tab_vline (t, TAL_GAP, 1, 0, 8); tab_text (t, 0, 0, TAB_LEFT, _("File:")); tab_text (t, 1, 0, TAB_LEFT, fh_get_file_name (h)); @@ -119,25 +119,36 @@ cmd_sysfile_info (struct lexer *lexer, struct dataset *ds UNUSED) tab_text (t, 0, 2, TAB_LEFT, _("Created:")); tab_text (t, 1, 2, TAB_LEFT | TAT_PRINTF, "%s %s by %s", info.creation_date, info.creation_time, info.product); - tab_text (t, 0, 3, TAB_LEFT, _("Endian:")); - tab_text (t, 1, 3, TAB_LEFT, info.big_endian ? _("Big.") : _("Little.")); - tab_text (t, 0, 4, TAB_LEFT, _("Variables:")); - tab_text (t, 1, 4, TAB_LEFT | TAT_PRINTF, "%d", + tab_text (t, 0, 3, TAB_LEFT, _("Integer Format:")); + tab_text (t, 1, 3, TAB_LEFT, + info.integer_format == INTEGER_MSB_FIRST ? _("Big Endian.") + : info.integer_format == INTEGER_LSB_FIRST ? _("Little Endian.") + : _("Unknown.")); + tab_text (t, 0, 4, TAB_LEFT, _("Real Format:")); + tab_text (t, 1, 4, TAB_LEFT, + info.float_format == FLOAT_IEEE_DOUBLE_LE ? _("IEEE 754 LE.") + : info.float_format == FLOAT_IEEE_DOUBLE_BE ? _("IEEE 754 BE.") + : info.float_format == FLOAT_VAX_D ? _("VAX D.") + : info.float_format == FLOAT_VAX_G ? _("VAX G.") + : info.float_format == FLOAT_Z_LONG ? _("IBM 390 Hex Long.") + : _("Unknown.")); + tab_text (t, 0, 5, TAB_LEFT, _("Variables:")); + tab_text (t, 1, 5, TAB_LEFT | TAT_PRINTF, "%d", dict_get_var_cnt (d)); - tab_text (t, 0, 5, TAB_LEFT, _("Cases:")); - tab_text (t, 1, 5, TAB_LEFT | TAT_PRINTF, + tab_text (t, 0, 6, TAB_LEFT, _("Cases:")); + tab_text (t, 1, 6, TAB_LEFT | TAT_PRINTF, info.case_cnt == -1 ? _("Unknown") : "%d", info.case_cnt); - tab_text (t, 0, 6, TAB_LEFT, _("Type:")); - tab_text (t, 1, 6, TAB_LEFT, _("System File.")); - tab_text (t, 0, 7, TAB_LEFT, _("Weight:")); + tab_text (t, 0, 7, TAB_LEFT, _("Type:")); + tab_text (t, 1, 7, TAB_LEFT, _("System File.")); + tab_text (t, 0, 8, TAB_LEFT, _("Weight:")); { struct variable *weight_var = dict_get_weight (d); - tab_text (t, 1, 7, TAB_LEFT, + tab_text (t, 1, 8, TAB_LEFT, (weight_var != NULL ? var_get_name (weight_var) : _("Not weighted."))); } - tab_text (t, 0, 8, TAB_LEFT, _("Mode:")); - tab_text (t, 1, 8, TAB_LEFT | TAT_PRINTF, + tab_text (t, 0, 9, TAB_LEFT, _("Mode:")); + tab_text (t, 1, 9, TAB_LEFT | TAT_PRINTF, _("Compression %s."), info.compressed ? _("on") : _("off")); tab_dim (t, tab_natural_dimensions); tab_submit (t); diff --git a/src/libpspp/ChangeLog b/src/libpspp/ChangeLog index 1a6c7c33..3412d669 100644 --- a/src/libpspp/ChangeLog +++ b/src/libpspp/ChangeLog @@ -1,3 +1,11 @@ +Sun Dec 10 13:54:03 2006 Ben Pfaff + + * str.c (ss_tokenize): Skip the first delimiter character + following the token. Otherwise, changing delimiters from token to + token can't have a sensible effect, because we'll get the previous + delimiter as part of the next token. + (ss_match_string): New function. + Sat Dec 9 18:48:55 2006 Ben Pfaff * misc.h (macro range): Removed, as it was unused. diff --git a/src/libpspp/str.c b/src/libpspp/str.c index 6840393a..183a4022 100644 --- a/src/libpspp/str.c +++ b/src/libpspp/str.c @@ -472,7 +472,8 @@ ss_separate (struct substring ss, struct substring delimiters, /* Divides SS into tokens separated by any of the DELIMITERS, merging adjacent delimiters so that the empty string is never produced as a token. Each call replaces TOKEN by the next - token in SS, or by an empty string if no tokens remain. + token in SS, or by an empty string if no tokens remain, and + then skips past the first delimiter following the token. Returns true if a token was obtained, false otherwise. Before the first call, initialize *SAVE_IDX to 0. Do not @@ -483,7 +484,8 @@ ss_tokenize (struct substring ss, struct substring delimiters, { ss_advance (&ss, *save_idx); *save_idx += ss_ltrim (&ss, delimiters); - *save_idx += ss_get_chars (&ss, ss_cspan (ss, delimiters), token); + ss_get_chars (&ss, ss_cspan (ss, delimiters), token); + *save_idx += ss_length (*token) + 1; return ss_length (*token) > 0; } @@ -512,6 +514,21 @@ ss_match_char (struct substring *ss, char c) return false; } +/* If SS begins with TARGET, removes it and returns true. + Otherwise, returns false without changing SS. */ +bool +ss_match_string (struct substring *ss, const struct substring target) +{ + size_t length = ss_length (target); + if (ss_equals (ss_head (*ss, length), target)) + { + ss_advance (ss, length); + return true; + } + else + return false; +} + /* Removes the first character from SS and returns it. If SS is empty, returns EOF without modifying SS. */ int diff --git a/src/libpspp/str.h b/src/libpspp/str.h index 00d8e31f..43891c17 100644 --- a/src/libpspp/str.h +++ b/src/libpspp/str.h @@ -113,6 +113,7 @@ bool ss_tokenize (struct substring src, struct substring delimiters, size_t *save_idx, struct substring *token); void ss_advance (struct substring *, size_t); bool ss_match_char (struct substring *, char); +bool ss_match_string (struct substring *, const struct substring); int ss_get_char (struct substring *); size_t ss_get_chars (struct substring *, size_t cnt, struct substring *); bool ss_get_until (struct substring *, char delimiter, struct substring *); diff --git a/tests/command/sysfile-info.sh b/tests/command/sysfile-info.sh index 99054665..38c61e15 100755 --- a/tests/command/sysfile-info.sh +++ b/tests/command/sysfile-info.sh @@ -92,13 +92,15 @@ diff -b -w $TEMPDIR/out-filtered - << EOF |name |A10 | +--------+------+ 2.1 SYSFILE INFO. -File: pro.sav -Label: No label. -Variables: 2 -Cases: 3 -Type: System File. -Weight: Not weighted. -Mode: Compression on. +File: pro.sav +Label: No label. +Integer Format: Little Endian. +Real Format: IEEE 754 LE. +Variables: 2 +Cases: 3 +Type: System File. +Weight: Not weighted. +Mode: Compression on. +--------+-------------+---+ |Variable|Description |Pos| | | |iti| -- 2.30.2