X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Fdata%2Fsys-file-reader.c;h=b0a41a83573b0e986fb35bfb3bce46379282662f;hb=b5c82cc9aabe7e641011130240ae1b2e84348e23;hp=87ba172d5801e7043dbd9391ff529b5336938f19;hpb=41a3a550334da96a9b4e5e089ad1768acf288092;p=pspp-builds.git diff --git a/src/data/sys-file-reader.c b/src/data/sys-file-reader.c index 87ba172d..b0a41a83 100644 --- a/src/data/sys-file-reader.c +++ b/src/data/sys-file-reader.c @@ -1,25 +1,23 @@ -/* PSPP - computes sample statistics. - Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc. +/* PSPP - a program for statistical analysis. + Copyright (C) 1997-9, 2000, 2006, 2007, 2009 Free Software Foundation, Inc. - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - 02110-1301, USA. */ + along with this program. If not, see . */ #include -#include "sys-file-reader.h" -#include "sys-file-private.h" +#include +#include #include #include @@ -27,31 +25,35 @@ #include #include -#include +#include #include #include #include -#include #include #include #include #include #include -#include "case.h" -#include "dictionary.h" -#include "file-handle-def.h" -#include "file-name.h" -#include "format.h" -#include "missing-values.h" -#include "value-labels.h" -#include "variable.h" -#include "value.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "c-ctype.h" #include "inttostr.h" #include "minmax.h" #include "unlocked-io.h" +#include "xalloc.h" #include "xsize.h" #include "gettext.h" @@ -67,17 +69,19 @@ struct sfm_reader /* File state. */ struct file_handle *fh; /* File handle. */ + struct fh_lock *lock; /* Mutual exclusion for file handle. */ FILE *file; /* File stream. */ bool error; /* I/O or corruption error? */ + struct caseproto *proto; /* Format of output cases. */ /* File format. */ enum integer_format integer_format; /* On-disk integer format. */ enum float_format float_format; /* On-disk floating point format. */ - int value_cnt; /* Number of 8-byte units per case. */ - struct sfm_var *vars; /* Variables. */ - size_t var_cnt; /* Number of variables. */ + int oct_cnt; /* Number of 8-byte units per case. */ + struct sfm_var *sfm_vars; /* Variables. */ + size_t sfm_var_cnt; /* Number of variables. */ + casenumber case_cnt; /* Number of cases */ bool has_long_var_names; /* File has a long variable name map */ - bool has_vls; /* File has one or more very long strings? */ /* Decompression. */ bool compressed; /* File is compressed? */ @@ -86,12 +90,9 @@ struct sfm_reader size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */ }; -/* A variable in a system file. */ -struct sfm_var - { - int width; /* 0=numeric, otherwise string width. */ - int case_index; /* Index into case. */ - }; +static const struct casereader_class sys_file_casereader_class; + +static bool close_reader (struct sfm_reader *); static struct variable **make_var_by_value_idx (struct sfm_reader *, struct dictionary *); @@ -99,47 +100,56 @@ static struct variable *lookup_var_by_value_idx (struct sfm_reader *, struct variable **, int value_idx); +static void sys_msg (struct sfm_reader *r, int class, + const char *format, va_list args) + PRINTF_FORMAT (3, 0); static void sys_warn (struct sfm_reader *, const char *, ...) PRINTF_FORMAT (2, 3); - static void sys_error (struct sfm_reader *, const char *, ...) PRINTF_FORMAT (2, 3) NO_RETURN; static void read_bytes (struct sfm_reader *, void *, size_t); static bool try_read_bytes (struct sfm_reader *, void *, size_t); -static int32_t read_int32 (struct sfm_reader *); -static double read_flt64 (struct sfm_reader *); +static int read_int (struct sfm_reader *); +static double read_float (struct sfm_reader *); static void read_string (struct sfm_reader *, char *, size_t); static void skip_bytes (struct sfm_reader *, size_t); -static int32_t int32_to_native (const struct sfm_reader *, const uint8_t[4]); -static double flt64_to_double (const struct sfm_reader *, const uint8_t[8]); - -static struct variable_to_value_map *open_variable_to_value_map ( - struct sfm_reader *, size_t size); -static void close_variable_to_value_map (struct sfm_reader *r, - struct variable_to_value_map *); -static bool read_variable_to_value_map (struct sfm_reader *, - struct dictionary *, - struct variable_to_value_map *, - struct variable **var, char **value, - int *warning_cnt); +static struct text_record *open_text_record (struct sfm_reader *, size_t size); +static void close_text_record (struct sfm_reader *r, + struct text_record *); +static bool read_variable_to_value_pair (struct sfm_reader *, + struct dictionary *, + struct text_record *, + struct variable **var, char **value); +static void text_warn (struct sfm_reader *r, struct text_record *text, + const char *format, ...) + PRINTF_FORMAT (3, 4); +static char *text_get_token (struct text_record *, + struct substring delimiters); +static bool text_match (struct text_record *, char c); +static bool text_read_short_name (struct sfm_reader *, struct dictionary *, + struct text_record *, + struct substring delimiters, + struct variable **); + +static bool close_reader (struct sfm_reader *r); /* Dictionary reader. */ -enum which_format +enum which_format { PRINT_FORMAT, WRITE_FORMAT }; static void read_header (struct sfm_reader *, struct dictionary *, - int *weight_idx, int *claimed_value_cnt, + int *weight_idx, int *claimed_oct_cnt, struct sfm_read_info *); static void read_variable_record (struct sfm_reader *, struct dictionary *, int *format_warning_cnt); -static void parse_format_spec (struct sfm_reader *, uint32_t, +static void parse_format_spec (struct sfm_reader *, unsigned int, enum which_format, struct variable *, int *format_warning_cnt); static void setup_weight (struct sfm_reader *, int weight_idx, @@ -149,10 +159,14 @@ static void read_documents (struct sfm_reader *, struct dictionary *); static void read_value_labels (struct sfm_reader *, struct dictionary *, struct variable **var_by_value_idx); -static void read_extension_record (struct sfm_reader *, struct dictionary *); -static void read_machine_int32_info (struct sfm_reader *, - size_t size, size_t count); -static void read_machine_flt64_info (struct sfm_reader *, +static void read_extension_record (struct sfm_reader *, struct dictionary *, + struct sfm_read_info *); +static void read_machine_integer_info (struct sfm_reader *, + size_t size, size_t count, + struct sfm_read_info *, + struct dictionary * + ); +static void read_machine_float_info (struct sfm_reader *, size_t size, size_t count); static void read_display_parameters (struct sfm_reader *, size_t size, size_t count, @@ -163,63 +177,132 @@ static void read_long_var_name_map (struct sfm_reader *, static void read_long_string_map (struct sfm_reader *, size_t size, size_t count, struct dictionary *); +static void read_data_file_attributes (struct sfm_reader *, + size_t size, size_t count, + struct dictionary *); +static void read_variable_attributes (struct sfm_reader *, + size_t size, size_t count, + struct dictionary *); +static void read_long_string_value_labels (struct sfm_reader *, + size_t size, size_t count, + struct dictionary *); + +/* Convert all the strings in DICT from the dict encoding to UTF8 */ +static void +recode_strings (struct dictionary *dict) +{ + int i; + + const char *enc = dict_get_encoding (dict); + + if ( NULL == enc) + enc = get_default_encoding (); + for (i = 0 ; i < dict_get_var_cnt (dict); ++i) + { + /* Convert the long variable name */ + struct variable *var = dict_get_var (dict, i); + const char *native_name = var_get_name (var); + char *utf8_name = recode_string (UTF8, enc, native_name, -1); + if ( 0 != strcmp (utf8_name, native_name)) + { + if ( NULL == dict_lookup_var (dict, utf8_name)) + dict_rename_var (dict, var, utf8_name); + else + msg (MW, + _("Recoded variable name duplicates an existing `%s' within system file."), utf8_name); + } + + free (utf8_name); + + /* Convert the variable label */ + if (var_has_label (var)) + { + char *utf8_label = recode_string (UTF8, enc, var_get_label (var), -1); + var_set_label (var, utf8_label); + free (utf8_label); + } + + if (var_has_value_labels (var)) + { + const struct val_lab *vl = NULL; + const struct val_labs *vlabs = var_get_value_labels (var); + + for (vl = val_labs_first (vlabs); vl != NULL; vl = val_labs_next (vlabs, vl)) + { + const union value *val = val_lab_get_value (vl); + const char *label = val_lab_get_label (vl); + char *new_label = NULL; + + new_label = recode_string (UTF8, enc, label, -1); + + var_replace_value_label (var, val, new_label); + free (new_label); + } + } + } +} /* Opens the system file designated by file handle FH for reading. Reads the system file's dictionary into *DICT. If INFO is non-null, then it receives additional info about the system file. */ -struct sfm_reader * +struct casereader * sfm_open_reader (struct file_handle *fh, struct dictionary **dict, - struct sfm_read_info *info) + struct sfm_read_info *volatile info) { struct sfm_reader *volatile r = NULL; struct variable **var_by_value_idx; + struct sfm_read_info local_info; int format_warning_cnt = 0; int weight_idx; - int claimed_value_cnt; + int claimed_oct_cnt; int rec_type; - size_t i; - - if (!fh_open (fh, FH_REF_FILE, "system file", "rs")) - return NULL; *dict = dict_create (); /* Create and initialize reader. */ r = pool_create_container (struct sfm_reader, pool); - r->fh = fh; - r->file = fn_open (fh_get_file_name (fh), "rb"); + r->fh = fh_ref (fh); + r->lock = NULL; + r->file = NULL; r->error = false; - r->value_cnt = 0; - r->has_vls = false; + r->oct_cnt = 0; r->has_long_var_names = false; r->opcode_idx = sizeof r->opcodes; - if (setjmp (r->bail_out)) - { - sfm_close_reader (r); - dict_destroy (*dict); - *dict = NULL; - return NULL; - } + /* TRANSLATORS: this fragment will be interpolated into + messages in fh_lock() that identify types of files. */ + r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false); + if (r->lock == NULL) + goto error; + r->file = fn_open (fh_get_file_name (fh), "rb"); if (r->file == NULL) { msg (ME, _("Error opening \"%s\" for reading as a system file: %s."), fh_get_file_name (r->fh), strerror (errno)); - longjmp (r->bail_out, 1); + goto error; } + /* Initialize info. */ + if (info == NULL) + info = &local_info; + memset (info, 0, sizeof *info); + + if (setjmp (r->bail_out)) + goto error; + + /* Read header. */ - read_header (r, *dict, &weight_idx, &claimed_value_cnt, info); + read_header (r, *dict, &weight_idx, &claimed_oct_cnt, info); /* Read all the variable definition records. */ - rec_type = read_int32 (r); + rec_type = read_int (r); while (rec_type == 2) { - read_variable_record (r, *dict, &format_warning_cnt); - rec_type = read_int32 (r); + read_variable_record (r, *dict, &format_warning_cnt); + rec_type = read_int (r); } /* Figure out the case format. */ @@ -227,7 +310,7 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dict, setup_weight (r, weight_idx, var_by_value_idx, *dict); /* Read all the rest of the dictionary records. */ - while (rec_type != 999) + while (rec_type != 999) { switch (rec_type) { @@ -243,13 +326,13 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dict, break; case 7: - read_extension_record (r, *dict); + read_extension_record (r, *dict, info); break; default: sys_error (r, _("Unrecognized record type %d."), rec_type); } - rec_type = read_int32 (r); + rec_type = read_int (r); } @@ -259,8 +342,8 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dict, for (i = 0; i < dict_get_var_cnt (*dict); i++) { struct variable *var = dict_get_var (*dict, i); - char short_name [SHORT_NAME_LEN + 1]; - char long_name [SHORT_NAME_LEN + 1]; + char short_name[SHORT_NAME_LEN + 1]; + char long_name[SHORT_NAME_LEN + 1]; strcpy (short_name, var_get_name (var)); @@ -271,78 +354,98 @@ sfm_open_reader (struct file_handle *fh, struct dictionary **dict, name, but we want to retain it, so re-set it explicitly. */ dict_rename_var (*dict, var, long_name); - var_set_short_name (var, short_name); + var_set_short_name (var, 0, short_name); } r->has_long_var_names = true; } - /* Read record 999 data, which is just filler. */ - read_int32 (r); + recode_strings (*dict); - if (claimed_value_cnt != -1 && claimed_value_cnt != r->value_cnt) + /* Read record 999 data, which is just filler. */ + read_int (r); + + /* Warn if the actual amount of data per case differs from the + amount that the header claims. SPSS version 13 gets this + wrong when very long strings are involved, so don't warn in + that case. */ + if (claimed_oct_cnt != -1 && claimed_oct_cnt != r->oct_cnt + && info->version_major != 13) sys_warn (r, _("File header claims %d variable positions but " "%d were read from file."), - claimed_value_cnt, r->value_cnt); + claimed_oct_cnt, r->oct_cnt); /* Create an index of dictionary variable widths for sfm_read_case to use. We cannot use the `struct variable's from the dictionary we created, because the caller owns the dictionary and may destroy or modify its variables. */ - r->var_cnt = dict_get_var_cnt (*dict); - r->vars = pool_nalloc (r->pool, r->var_cnt, sizeof *r->vars); - for (i = 0; i < r->var_cnt; i++) - { - struct variable *v = dict_get_var (*dict, i); - struct sfm_var *sv = &r->vars[i]; - sv->width = var_get_width (v); - sv->case_index = var_get_case_index (v); - } + sfm_dictionary_to_sfm_vars (*dict, &r->sfm_vars, &r->sfm_var_cnt); + pool_register (r->pool, free, r->sfm_vars); + r->proto = caseproto_ref_pool (dict_get_proto (*dict), r->pool); pool_free (r->pool, var_by_value_idx); - return r; + return casereader_create_sequential + (NULL, r->proto, + r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt, + &sys_file_casereader_class, r); + +error: + close_reader (r); + dict_destroy (*dict); + *dict = NULL; + return NULL; } -/* Closes a system file after we're done with it. */ -void -sfm_close_reader (struct sfm_reader *r) +/* Closes a system file after we're done with it. + Returns true if an I/O error has occurred on READER, false + otherwise. */ +static bool +close_reader (struct sfm_reader *r) { + bool error; + if (r == NULL) - return; + return true; if (r->file) { if (fn_close (fh_get_file_name (r->fh), r->file) == EOF) - msg (ME, _("Error closing system file \"%s\": %s."), - fh_get_file_name (r->fh), strerror (errno)); + { + msg (ME, _("Error closing system file \"%s\": %s."), + fh_get_file_name (r->fh), strerror (errno)); + r->error = true; + } r->file = NULL; } - if (r->fh != NULL) - fh_close (r->fh, "system file", "rs"); + fh_unlock (r->lock); + fh_unref (r->fh); + error = r->error; pool_destroy (r->pool); + + return !error; } -/* Returns true if an I/O error has occurred on READER, false - otherwise. */ -bool -sfm_read_error (const struct sfm_reader *reader) +/* Destroys READER. */ +static void +sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_) { - return reader->error; + struct sfm_reader *r = r_; + close_reader (r); } /* Returns true if FILE is an SPSS system file, false otherwise. */ bool -sfm_detect (FILE *file) +sfm_detect (FILE *file) { char rec_type[5]; if (fread (rec_type, 4, 1, file) != 1) return false; rec_type[4] = '\0'; - + return !strcmp ("$FL2", rec_type); } @@ -350,28 +453,28 @@ sfm_detect (FILE *file) Sets DICT's file label to the system file's label. Sets *WEIGHT_IDX to 0 if the system file is unweighted, or to the value index of the weight variable otherwise. - Sets *CLAIMED_VALUE_CNT to the number of values that the file - claims to have (although it is not always correct). - If INFO is non-null, initializes *INFO with header - information. */ + Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units) + per case that the file claims to have (although it is not + always correct). + Initializes INFO with header information. */ static void read_header (struct sfm_reader *r, struct dictionary *dict, - int *weight_idx, int *claimed_value_cnt, + int *weight_idx, int *claimed_oct_cnt, struct sfm_read_info *info) { char rec_type[5]; char eye_catcher[61]; uint8_t raw_layout_code[4]; - int case_cnt; uint8_t raw_bias[8]; char creation_date[10]; char creation_time[9]; char file_label[65]; struct substring file_label_ss; + struct substring product; read_string (r, rec_type, sizeof rec_type); read_string (r, eye_catcher, sizeof eye_catcher); - + if (strcmp ("$FL2", rec_type) != 0) sys_error (r, _("This is not an SPSS system file.")); @@ -385,26 +488,38 @@ read_header (struct sfm_reader *r, struct dictionary *dict, && r->integer_format != INTEGER_LSB_FIRST)) sys_error (r, _("This is not an SPSS system file.")); - *claimed_value_cnt = read_int32 (r); - if (*claimed_value_cnt < 0 || *claimed_value_cnt > INT_MAX / 16) - *claimed_value_cnt = -1; + *claimed_oct_cnt = read_int (r); + if (*claimed_oct_cnt < 0 || *claimed_oct_cnt > INT_MAX / 16) + *claimed_oct_cnt = -1; - r->compressed = read_int32 (r) != 0; + r->compressed = read_int (r) != 0; - *weight_idx = read_int32 (r); + *weight_idx = read_int (r); + + r->case_cnt = read_int (r); + if ( r->case_cnt > INT_MAX / 2) + r->case_cnt = -1; - case_cnt = read_int32 (r); - if (case_cnt < -1 || case_cnt > INT_MAX / 2) - case_cnt = -1; /* Identify floating-point format and obtain compression bias. */ read_bytes (r, raw_bias, sizeof raw_bias); if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0) { - sys_warn (r, _("Compression bias (%g) is not the usual " - "value of 100, or system file uses unrecognized " - "floating-point format."), - r->bias); + uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + + if (memcmp (raw_bias, zero_bias, 8)) + sys_warn (r, _("Compression bias is not the usual " + "value of 100, or system file uses unrecognized " + "floating-point format.")); + else + { + /* Some software is known to write all-zeros to this + field. Such software also writes floating-point + numbers in the format that we expect by default + (it seems that all software most likely does, in + reality), so don't warn in this case. */ + } + if (r->integer_format == INTEGER_MSB_FIRST) r->float_format = FLOAT_IEEE_DOUBLE_BE; else @@ -416,32 +531,27 @@ read_header (struct sfm_reader *r, struct dictionary *dict, read_string (r, creation_time, sizeof creation_time); read_string (r, file_label, sizeof file_label); skip_bytes (r, 3); - + file_label_ss = ss_cstr (file_label); ss_trim (&file_label_ss, ss_cstr (" ")); - if (!ss_is_empty (file_label_ss)) + if (!ss_is_empty (file_label_ss)) { ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0'; dict_set_label (dict, ss_data (file_label_ss)); } - if (info) - { - struct substring product; - - strcpy (info->creation_date, creation_date); - strcpy (info->creation_time, creation_time); - info->integer_format = r->integer_format; - info->float_format = r->float_format; - info->compressed = r->compressed; - info->case_cnt = case_cnt; - - product = ss_cstr (eye_catcher); - ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE")); - ss_trim (&product, ss_cstr (" ")); - str_copy_buf_trunc (info->product, sizeof info->product, - ss_data (product), ss_length (product)); - } + strcpy (info->creation_date, creation_date); + strcpy (info->creation_time, creation_time); + info->integer_format = r->integer_format; + info->float_format = r->float_format; + info->compressed = r->compressed; + info->case_cnt = r->case_cnt; + + product = ss_cstr (eye_catcher); + ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE")); + ss_trim (&product, ss_cstr (" ")); + str_copy_buf_trunc (info->product, sizeof info->product, + ss_data (product), ss_length (product)); } /* Reads a variable (type 2) record from R and adds the @@ -462,11 +572,11 @@ read_variable_record (struct sfm_reader *r, struct dictionary *dict, struct variable *var; int nv; - width = read_int32 (r); - has_variable_label = read_int32 (r); - missing_value_code = read_int32 (r); - print_format = read_int32 (r); - write_format = read_int32 (r); + width = read_int (r); + has_variable_label = read_int (r); + missing_value_code = read_int (r); + print_format = read_int (r); + write_format = read_int (r); read_string (r, name, sizeof name); name[strcspn (name, " ")] = '\0'; @@ -479,15 +589,15 @@ read_variable_record (struct sfm_reader *r, struct dictionary *dict, /* Create variable. */ if (width < 0 || width > 255) - sys_error (r, _("Bad variable width %d."), width); + sys_error (r, _("Bad width %d for variable %s."), width, name); var = dict_create_var (dict, name, width); - if (var == NULL) + if (var == NULL) sys_error (r, _("Duplicate variable name `%s' within system file."), name); - /* Set the short name the same as the long name */ - var_set_short_name (var, var_get_name (var)); + /* Set the short name the same as the long name. */ + var_set_short_name (var, 0, var_get_name (var)); /* Get variable label, if any. */ if (has_variable_label != 0 && has_variable_label != 1) @@ -497,63 +607,58 @@ read_variable_record (struct sfm_reader *r, struct dictionary *dict, size_t len; char label[255 + 1]; - len = read_int32 (r); + len = read_int (r); if (len >= sizeof label) - sys_error (r, _("Variable %s has label of invalid length %u."), - name, (unsigned int) len); + sys_error (r, _("Variable %s has label of invalid length %zu."), + name, len); read_string (r, label, len + 1); var_set_label (var, label); - + skip_bytes (r, ROUND_UP (len, 4) - len); } /* Set missing values. */ - if (missing_value_code < -3 || missing_value_code > 3 - || missing_value_code == -1) - sys_error (r, _("Missing value indicator field is not " - "-3, -2, 0, 1, 2, or 3.")); if (missing_value_code != 0) { struct missing_values mv; - mv_init (&mv, var_get_width (var)); - if (var_is_numeric (var)) + int i; + + mv_init_pool (r->pool, &mv, var_get_width (var)); + if (var_is_numeric (var)) { - if (missing_value_code > 0) + if (missing_value_code < -3 || missing_value_code > 3 + || missing_value_code == -1) + sys_error (r, _("Numeric missing value indicator field is not " + "-3, -2, 0, 1, 2, or 3.")); + if (missing_value_code < 0) { - int i; - for (i = 0; i < missing_value_code; i++) - mv_add_num (&mv, read_flt64 (r)); - } - else - { - double low = read_flt64 (r); - double high = read_flt64 (r); - mv_add_num_range (&mv, low, high); - if (missing_value_code == -3) - mv_add_num (&mv, read_flt64 (r)); + double low = read_float (r); + double high = read_float (r); + mv_add_range (&mv, low, high); + missing_value_code = -missing_value_code - 2; } + for (i = 0; i < missing_value_code; i++) + mv_add_num (&mv, read_float (r)); } - else if (var_get_width (var) <= MAX_SHORT_STRING) + else { - if (missing_value_code > 0) + int mv_width = MAX (width, 8); + union value value; + + if (missing_value_code < 1 || missing_value_code > 3) + sys_error (r, _("String missing value indicator field is not " + "0, 1, 2, or 3.")); + + value_init (&value, mv_width); + value_set_missing (&value, mv_width); + for (i = 0; i < missing_value_code; i++) { - int i; - for (i = 0; i < missing_value_code; i++) - { - char string[9]; - read_string (r, string, sizeof string); - mv_add_str (&mv, string); - } + uint8_t *s = value_str_rw (&value, mv_width); + read_bytes (r, s, 8); + mv_add_str (&mv, s); } - else - sys_error (r, _("String variable %s may not have missing " - "values specified as a range."), - name); + value_destroy (&value, mv_width); } - else /* var->width > MAX_SHORT_STRING */ - sys_error (r, _("Long string variable %s may not have missing " - "values."), - name); var_set_missing_values (var, &mv); } @@ -564,7 +669,7 @@ read_variable_record (struct sfm_reader *r, struct dictionary *dict, /* Account for values. Skip long string continuation records, if any. */ nv = width == 0 ? 1 : DIV_RND_UP (width, 8); - r->value_cnt += nv; + r->oct_cnt += nv; if (width > 8) { int i; @@ -572,21 +677,21 @@ read_variable_record (struct sfm_reader *r, struct dictionary *dict, for (i = 1; i < nv; i++) { /* Check for record type 2 and width -1. */ - if (read_int32 (r) != 2 || read_int32 (r) != -1) + if (read_int (r) != 2 || read_int (r) != -1) sys_error (r, _("Missing string continuation record.")); /* Skip and ignore remaining continuation data. */ - has_variable_label = read_int32 (r); - missing_value_code = read_int32 (r); - print_format = read_int32 (r); - write_format = read_int32 (r); + has_variable_label = read_int (r); + missing_value_code = read_int (r); + print_format = read_int (r); + write_format = read_int (r); read_string (r, name, sizeof name); /* Variable label fields on continuation records have been spotted in system files created by "SPSS Power Macintosh Release 6.1". */ - if (has_variable_label) - skip_bytes (r, ROUND_UP (read_int32 (r), 4)); + if (has_variable_label) + skip_bytes (r, ROUND_UP (read_int (r), 4)); } } } @@ -594,7 +699,7 @@ read_variable_record (struct sfm_reader *r, struct dictionary *dict, /* Translates the format spec from sysfile format to internal format. */ static void -parse_format_spec (struct sfm_reader *r, uint32_t s, +parse_format_spec (struct sfm_reader *r, unsigned int s, enum which_format which, struct variable *v, int *format_warning_cnt) { @@ -603,19 +708,19 @@ parse_format_spec (struct sfm_reader *r, uint32_t s, uint8_t raw_type = s >> 16; uint8_t w = s >> 8; uint8_t d = s; - + bool ok; - + if (!fmt_from_io (raw_type, &f.type)) - sys_error (r, _("Unknown variable format %d."), (int) raw_type); + sys_error (r, _("Unknown variable format %"PRIu8"."), raw_type); f.w = w; f.d = d; msg_disable (); ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v)); msg_enable (); - - if (ok) + + if (ok) { if (which == PRINT_FORMAT) var_set_print_format (v, &f); @@ -641,7 +746,7 @@ parse_format_spec (struct sfm_reader *r, uint32_t s, nonzero. */ static void setup_weight (struct sfm_reader *r, int weight_idx, - struct variable **var_by_value_idx, struct dictionary *dict) + struct variable **var_by_value_idx, struct dictionary *dict) { if (weight_idx != 0) { @@ -666,24 +771,28 @@ read_documents (struct sfm_reader *r, struct dictionary *dict) if (dict_get_documents (dict) != NULL) sys_error (r, _("Multiple type 6 (document) records.")); - line_cnt = read_int32 (r); + line_cnt = read_int (r); if (line_cnt <= 0) sys_error (r, _("Number of document lines (%d) " "must be greater than 0."), line_cnt); - documents = pool_nmalloc (r->pool, line_cnt + 1, 80); - read_string (r, documents, 80 * line_cnt + 1); - dict_set_documents (dict, documents); + documents = pool_nmalloc (r->pool, line_cnt + 1, DOC_LINE_LENGTH); + read_string (r, documents, DOC_LINE_LENGTH * line_cnt + 1); + if (strlen (documents) == DOC_LINE_LENGTH * line_cnt) + dict_set_documents (dict, documents); + else + sys_error (r, _("Document line contains null byte.")); pool_free (r->pool, documents); } /* Read a type 7 extension record. */ static void -read_extension_record (struct sfm_reader *r, struct dictionary *dict) +read_extension_record (struct sfm_reader *r, struct dictionary *dict, + struct sfm_read_info *info) { - int subtype = read_int32 (r); - size_t size = read_int32 (r); - size_t count = read_int32 (r); + int subtype = read_int (r); + size_t size = read_int (r); + size_t count = read_int (r); size_t bytes = size * count; /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1 @@ -695,11 +804,11 @@ read_extension_record (struct sfm_reader *r, struct dictionary *dict) switch (subtype) { case 3: - read_machine_int32_info (r, size, count); + read_machine_integer_info (r, size, count, info, dict); return; case 4: - read_machine_flt64_info (r, size, count); + read_machine_float_info (r, size, count); return; case 5: @@ -712,11 +821,15 @@ read_extension_record (struct sfm_reader *r, struct dictionary *dict) /* DATE variable information. We don't use it yet, but we should. */ break; - + case 7: - /* Unknown purpose. */ + /* Used by the MRSETS command. */ break; - + + case 8: + /* Used by the SPSS Data Entry software. */ + break; + case 11: read_display_parameters (r, size, count, dict); return; @@ -734,12 +847,32 @@ read_extension_record (struct sfm_reader *r, struct dictionary *dict) break; case 17: - /* Text field that defines variable attributes. New in - SPSS 14. */ - break; - + read_data_file_attributes (r, size, count, dict); + return; + + case 18: + read_variable_attributes (r, size, count, dict); + return; + + case 20: + /* New in SPSS 16. Contains a single string that describes + the character encoding, e.g. "windows-1252". */ + { + char *encoding = pool_calloc (r->pool, size, count + 1); + read_string (r, encoding, count + 1); + dict_set_encoding (dict, encoding); + return; + } + + case 21: + /* New in SPSS 16. Encodes value labels for long string + variables. */ + read_long_string_value_labels (r, size, count, dict); + return; + default: - sys_warn (r, _("Unrecognized record type 7, subtype %d."), subtype); + sys_warn (r, _("Unrecognized record type 7, subtype %d. Please send a copy of this file, and the syntax which created it to %s"), + subtype, PACKAGE_BUGREPORT); break; } @@ -748,24 +881,31 @@ read_extension_record (struct sfm_reader *r, struct dictionary *dict) /* Read record type 7, subtype 3. */ static void -read_machine_int32_info (struct sfm_reader *r, size_t size, size_t count) +read_machine_integer_info (struct sfm_reader *r, size_t size, size_t count, + struct sfm_read_info *info, + struct dictionary *dict) { - int version_major UNUSED = read_int32 (r); - int version_minor UNUSED = read_int32 (r); - int version_revision UNUSED = read_int32 (r); - int machine_code UNUSED = read_int32 (r); - int float_representation = read_int32 (r); - int compression_code UNUSED = read_int32 (r); - int integer_representation = read_int32 (r); - int character_code UNUSED = read_int32 (r); + int version_major = read_int (r); + int version_minor = read_int (r); + int version_revision = read_int (r); + int machine_code UNUSED = read_int (r); + int float_representation = read_int (r); + int compression_code UNUSED = read_int (r); + int integer_representation = read_int (r); + int character_code = read_int (r); int expected_float_format; int expected_integer_format; if (size != 4 || count != 8) - sys_error (r, _("Bad size (%u) or count (%u) field on record type 7, " + sys_error (r, _("Bad size (%zu) or count (%zu) field on record type 7, " "subtype 3."), - (unsigned int) size, (unsigned int) count); + size, count); + + /* Save version info. */ + info->version_major = version_major; + info->version_minor = version_minor; + info->version_revision = version_revision; /* Check floating point format. */ if (r->float_format == FLOAT_IEEE_DOUBLE_BE @@ -791,32 +931,78 @@ read_machine_int32_info (struct sfm_reader *r, size_t size, size_t count) NOT_REACHED (); if (integer_representation != expected_integer_format) { - static const char *endian[] = {N_("little-endian"), N_("big-endian")}; + static const char *const endian[] = {N_("little-endian"), N_("big-endian")}; sys_warn (r, _("Integer format indicated by system file (%s) " "differs from expected (%s)."), gettext (endian[integer_representation == 1]), gettext (endian[expected_integer_format == 1])); } + + + /* + Record 7 (20) provides a much more reliable way of + setting the encoding. + The character_code is used as a fallback only. + */ + if ( NULL == dict_get_encoding (dict)) + { + switch (character_code) + { + case 1: + dict_set_encoding (dict, "EBCDIC-US"); + break; + case 2: + case 3: + /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic] + respectively. However, there are known to be many files + in the wild with character code 2, yet have data which are + clearly not ascii. + Therefore we ignore these values. + */ + return; + case 4: + dict_set_encoding (dict, "MS_KANJI"); + break; + case 65000: + dict_set_encoding (dict, "UTF-7"); + break; + case 65001: + dict_set_encoding (dict, "UTF-8"); + break; + default: + { + char enc[100]; + snprintf (enc, 100, "CP%d", character_code); + dict_set_encoding (dict, enc); + } + break; + }; + } } /* Read record type 7, subtype 4. */ static void -read_machine_flt64_info (struct sfm_reader *r, size_t size, size_t count) +read_machine_float_info (struct sfm_reader *r, size_t size, size_t count) { - double sysmis = read_flt64 (r); - double highest = read_flt64 (r); - double lowest = read_flt64 (r); + double sysmis = read_float (r); + double highest = read_float (r); + double lowest = read_float (r); if (size != 8 || count != 3) - sys_error (r, _("Bad size (%u) or count (%u) on extension 4."), - (unsigned int) size, (unsigned int) count); + sys_error (r, _("Bad size (%zu) or count (%zu) on extension 4."), + size, count); if (sysmis != SYSMIS) - sys_warn (r, _("File specifies unexpected value %g as SYSMIS."), sysmis); + sys_warn (r, _("File specifies unexpected value %g as %s."), + sysmis, "SYSMIS"); + if (highest != HIGHEST) - sys_warn (r, _("File specifies unexpected value %g as HIGHEST."), highest); + sys_warn (r, _("File specifies unexpected value %g as %s."), + highest, "HIGHEST"); + if (lowest != LOWEST) - sys_warn (r, _("File specifies unexpected value %g as LOWEST."), lowest); + sys_warn (r, _("File specifies unexpected value %g as %s."), + lowest, "LOWEST"); } /* Read record type 7, subtype 11, which specifies how variables @@ -825,30 +1011,50 @@ static void read_display_parameters (struct sfm_reader *r, size_t size, size_t count, struct dictionary *dict) { - const size_t n_vars = count / 3 ; + size_t n_vars; + bool includes_width; bool warned = false; - int i; + size_t i; - if (count % 3 || n_vars != dict_get_var_cnt (dict)) - sys_error (r, _("Bad size (%u) or count (%u) on extension 11."), - (unsigned int) size, (unsigned int) count); + if (size != 4) + { + sys_warn (r, _("Bad size %zu on extension 11."), size); + skip_bytes (r, size * count); + return; + } - for (i = 0; i < n_vars; ++i) + n_vars = dict_get_var_cnt (dict); + if (count == 3 * n_vars) + includes_width = true; + else if (count == 2 * n_vars) + includes_width = false; + else { - int measure = read_int32 (r); - int width = read_int32 (r); - int align = read_int32 (r); - struct variable *v = dict_get_var (dict, i); + sys_warn (r, _("Extension 11 has bad count %zu (for %zu variables)."), + count, n_vars); + skip_bytes (r, size * count); + return; + } - /* spss v14 sometimes seems to set string variables' measure to zero */ - if ( 0 == measure && var_is_alpha (v) ) measure = 1; + for (i = 0; i < n_vars; ++i) + { + struct variable *v = dict_get_var (dict, i); + int measure = read_int (r); + int width = includes_width ? read_int (r) : 0; + int align = read_int (r); + /* SPSS 14 sometimes seems to set string variables' measure + to zero. */ + if (0 == measure && var_is_alpha (v)) + measure = 1; if (measure < 1 || measure > 3 || align < 0 || align > 2) { if (!warned) - sys_warn (r, _("Invalid variable display parameters. " - "Default parameters substituted.")); + sys_warn (r, _("Invalid variable display parameters " + "for variable %zu (%s). " + "Default parameters substituted."), + i, var_get_name (v)); warned = true; continue; } @@ -856,10 +1062,15 @@ read_display_parameters (struct sfm_reader *r, size_t size, size_t count, var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL : measure == 2 ? MEASURE_ORDINAL : MEASURE_SCALE)); - var_set_display_width (v, width); var_set_alignment (v, (align == 0 ? ALIGN_LEFT : align == 1 ? ALIGN_RIGHT : ALIGN_CENTRE)); + + /* Older versions (SPSS 9.0) sometimes set the display + width to zero. This causes confusion in the GUI, so + only set the width if it is nonzero. */ + if (width > 0) + var_set_display_width (v, width); } } @@ -870,17 +1081,16 @@ static void read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count, struct dictionary *dict) { - struct variable_to_value_map *map; + struct text_record *text; struct variable *var; char *long_name; - int warning_cnt = 0; - - map = open_variable_to_value_map (r, size * count); - while (read_variable_to_value_map (r, dict, map, &var, &long_name, - &warning_cnt)) + + text = open_text_record (r, size * count); + while (read_variable_to_value_pair (r, dict, text, &var, &long_name)) { - char short_name[SHORT_NAME_LEN + 1]; - strcpy (short_name, var_get_short_name (var)); + char **short_names; + size_t short_name_cnt; + size_t i; /* Validate long name. */ if (!var_is_valid_name (long_name, false)) @@ -890,9 +1100,9 @@ read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count, var_get_name (var), long_name); continue; } - + /* Identify any duplicates. */ - if (strcasecmp (short_name, long_name) + if (strcasecmp (var_get_short_name (var, 0), long_name) && dict_lookup_var (dict, long_name) != NULL) { sys_warn (r, _("Duplicate long variable name `%s' " @@ -900,13 +1110,29 @@ read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count, continue; } - /* Set long name. Renaming a variable may clear the short - name, but we want to retain it, so re-set it - explicitly. */ + /* Renaming a variable may clear its short names, but we + want to retain them, so we save them and re-set them + afterward. */ + short_name_cnt = var_get_short_name_cnt (var); + short_names = xnmalloc (short_name_cnt, sizeof *short_names); + for (i = 0; i < short_name_cnt; i++) + { + const char *s = var_get_short_name (var, i); + short_names[i] = s != NULL ? xstrdup (s) : NULL; + } + + /* Set long name. */ dict_rename_var (dict, var, long_name); - var_set_short_name (var, short_name); + + /* Restore short names. */ + for (i = 0; i < short_name_cnt; i++) + { + var_set_short_name (var, i, short_names[i]); + free (short_names[i]); + } + free (short_names); } - close_variable_to_value_map (r, map); + close_text_record (r, text); r->has_long_var_names = true; } @@ -916,48 +1142,60 @@ static void read_long_string_map (struct sfm_reader *r, size_t size, size_t count, struct dictionary *dict) { - struct variable_to_value_map *map; + struct text_record *text; struct variable *var; char *length_s; - int warning_cnt = 0; - r->has_vls = true; - - map = open_variable_to_value_map (r, size * count); - while (read_variable_to_value_map (r, dict, map, &var, &length_s, - &warning_cnt)) + text = open_text_record (r, size * count); + while (read_variable_to_value_pair (r, dict, text, &var, &length_s)) { - long length, remaining_length; - size_t idx; + size_t idx = var_get_dict_index (var); + long int length; + int segment_cnt; + int i; /* Get length. */ length = strtol (length_s, NULL, 10); - if (length < MIN_VERY_LONG_STRING || length == LONG_MAX) + if (length < 1 || length > MAX_STRING) { - sys_warn (r, _("%s listed as string of length %s " - "in length table."), + sys_warn (r, _("%s listed as string of invalid length %s " + "in very length string record."), var_get_name (var), length_s); continue; } - /* Group multiple variables into single variable - and delete all but the first. */ - remaining_length = length; - for (idx = var_get_dict_index (var); remaining_length > 0; idx++) - if (idx < dict_get_var_cnt (dict)) - remaining_length -= MIN (var_get_width (dict_get_var (dict, idx)), - EFFECTIVE_LONG_STRING_LENGTH); - else - sys_error (r, _("Very long string %s overflows dictionary."), - var_get_name (var)); - dict_delete_consecutive_vars (dict, - var_get_dict_index (var) + 1, - idx - var_get_dict_index (var) - 1); - - /* Assign all the length to the first variable. */ + /* Check segments. */ + segment_cnt = sfm_width_to_segments (length); + if (segment_cnt == 1) + { + sys_warn (r, _("%s listed in very long string record with width %s, " + "which requires only one segment."), + var_get_name (var), length_s); + continue; + } + if (idx + segment_cnt > dict_get_var_cnt (dict)) + sys_error (r, _("Very long string %s overflows dictionary."), + var_get_name (var)); + + /* Get the short names from the segments and check their + lengths. */ + for (i = 0; i < segment_cnt; i++) + { + struct variable *seg = dict_get_var (dict, idx + i); + int alloc_width = sfm_segment_alloc_width (length, i); + int width = var_get_width (seg); + + if (i > 0) + var_set_short_name (var, i, var_get_short_name (seg, 0)); + if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8)) + sys_error (r, _("Very long string with width %ld has segment %d " + "of width %d (expected %d)"), + length, i, width, alloc_width); + } + dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1); var_set_width (var, length); } - close_variable_to_value_map (r, map); + close_text_record (r, text); dict_compact_values (dict); } @@ -968,10 +1206,10 @@ read_value_labels (struct sfm_reader *r, struct dictionary *dict, struct variable **var_by_value_idx) { struct pool *subpool; - - struct label + + struct label { - char raw_value[8]; /* Value as uninterpreted bytes. */ + uint8_t raw_value[8]; /* Value as uninterpreted bytes. */ union value value; /* Value. */ char *label; /* Null-terminated label string. */ }; @@ -981,6 +1219,7 @@ read_value_labels (struct sfm_reader *r, struct variable **var = NULL; /* Associated variables. */ int var_cnt; /* Number of associated variables. */ + int max_width; /* Maximum width of string variables. */ int i; @@ -991,10 +1230,10 @@ read_value_labels (struct sfm_reader *r, of numeric or string type. */ /* Read number of labels. */ - label_cnt = read_int32 (r); + label_cnt = read_int (r); - if (label_cnt >= INT32_MAX / sizeof *labels) - { + if (size_overflow_p (xtimes (label_cnt, sizeof *labels))) + { sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."), label_cnt); label_cnt = 0; @@ -1025,26 +1264,29 @@ read_value_labels (struct sfm_reader *r, to which the value labels are to be applied. */ /* Read record type of type 4 record. */ - if (read_int32 (r) != 4) + if (read_int (r) != 4) sys_error (r, _("Variable index record (type 4) does not immediately " "follow value label record (type 3) as it should.")); /* Read number of variables associated with value label from type 4 record. */ - var_cnt = read_int32 (r); + var_cnt = read_int (r); if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict)) sys_error (r, _("Number of variables associated with a value label (%d) " - "is not between 1 and the number of variables (%u)."), - var_cnt, (unsigned int) dict_get_var_cnt (dict)); + "is not between 1 and the number of variables (%zu)."), + var_cnt, dict_get_var_cnt (dict)); /* Read the list of variables. */ var = pool_nalloc (subpool, var_cnt, sizeof *var); + max_width = 0; for (i = 0; i < var_cnt; i++) { - var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int32 (r)); - if (var_is_long_string (var[i])) - sys_error (r, _("Value labels are not allowed on long string " - "variables (%s)."), var_get_name (var[i])); + var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int (r)); + if (var_get_width (var[i]) > 8) + sys_error (r, _("Value labels may not be added to long string " + "variables (e.g. %s) using records types 3 and 4."), + var_get_name (var[i])); + max_width = MAX (max_width, var_get_width (var[i])); } /* Type check the variables. */ @@ -1059,17 +1301,18 @@ read_value_labels (struct sfm_reader *r, var_is_numeric (var[i]) ? _("numeric") : _("string")); /* Fill in labels[].value, now that we know the desired type. */ - for (i = 0; i < label_cnt; i++) + for (i = 0; i < label_cnt; i++) { struct label *label = labels + i; - + + value_init_pool (subpool, &label->value, max_width); if (var_is_alpha (var[0])) - buf_copy_rpad (label->value.s, sizeof label->value.s, - label->raw_value, sizeof label->raw_value); + u8_buf_copy_rpad (value_str_rw (&label->value, max_width), max_width, + label->raw_value, sizeof label->raw_value, ' '); else - label->value.f = flt64_to_double (r, (uint8_t *) label->raw_value); + label->value.f = float_get_double (r->float_format, label->raw_value); } - + /* Assign the `value_label's to each variable. */ for (i = 0; i < var_cnt; i++) { @@ -1080,124 +1323,282 @@ read_value_labels (struct sfm_reader *r, for (j = 0; j < label_cnt; j++) { struct label *label = &labels[j]; - if (!var_add_value_label (v, &label->value, label->label)) + if (!var_add_value_label (v, &label->value, label->label)) { if (var_is_numeric (var[0])) sys_warn (r, _("Duplicate value label for %g on %s."), label->value.f, var_get_name (v)); else sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."), - var_get_width (v), label->value.s, - var_get_name (v)); + max_width, value_str (&label->value, max_width), + var_get_name (v)); } } } pool_destroy (subpool); } + +/* Reads a set of custom attributes from TEXT into ATTRS. + ATTRS may be a null pointer, in which case the attributes are + read but discarded. */ +static void +read_attributes (struct sfm_reader *r, struct text_record *text, + struct attrset *attrs) +{ + do + { + struct attribute *attr; + char *key; + int index; + + /* Parse the key. */ + key = text_get_token (text, ss_cstr ("(")); + if (key == NULL) + return; + + attr = attribute_create (key); + for (index = 1; ; index++) + { + /* Parse the value. */ + char *value; + size_t length; + + value = text_get_token (text, ss_cstr ("\n")); + if (value == NULL) + { + text_warn (r, text, _("Error parsing attribute value %s[%d]"), + key, index); + break; + } + + length = strlen (value); + if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'') + { + value[length - 1] = '\0'; + attribute_add_value (attr, value + 1); + } + else + { + text_warn (r, text, + _("Attribute value %s[%d] is not quoted: %s"), + key, index, value); + attribute_add_value (attr, value); + } + + /* Was this the last value for this attribute? */ + if (text_match (text, ')')) + break; + } + if (attrs != NULL) + attrset_add (attrs, attr); + else + attribute_destroy (attr); + } + while (!text_match (text, '/')); +} + +/* Reads record type 7, subtype 17, which lists custom + attributes on the data file. */ +static void +read_data_file_attributes (struct sfm_reader *r, + size_t size, size_t count, + struct dictionary *dict) +{ + struct text_record *text = open_text_record (r, size * count); + read_attributes (r, text, dict_get_attributes (dict)); + close_text_record (r, text); +} + +static void +skip_long_string_value_labels (struct sfm_reader *r, size_t n_labels) +{ + size_t i; + + for (i = 0; i < n_labels; i++) + { + size_t value_length, label_length; + + value_length = read_int (r); + skip_bytes (r, value_length); + label_length = read_int (r); + skip_bytes (r, label_length); + } +} + +static void +read_long_string_value_labels (struct sfm_reader *r, + size_t size, size_t count, + struct dictionary *d) +{ + const off_t start = ftello (r->file); + while (ftello (r->file) - start < size * count) + { + char var_name[VAR_NAME_LEN + 1]; + size_t n_labels, i; + struct variable *v; + union value value; + int var_name_len; + int width; + + /* Read header. */ + var_name_len = read_int (r); + if (var_name_len > VAR_NAME_LEN) + sys_error (r, _("Variable name length in long string value label " + "record (%d) exceeds %d-byte limit."), + var_name_len, VAR_NAME_LEN); + read_string (r, var_name, var_name_len + 1); + width = read_int (r); + n_labels = read_int (r); + + v = dict_lookup_var (d, var_name); + if (v == NULL) + { + sys_warn (r, _("Ignoring long string value record for " + "unknown variable %s."), var_name); + skip_long_string_value_labels (r, n_labels); + continue; + } + if (var_is_numeric (v)) + { + sys_warn (r, _("Ignoring long string value record for " + "numeric variable %s."), var_name); + skip_long_string_value_labels (r, n_labels); + continue; + } + if (width != var_get_width (v)) + { + sys_warn (r, _("Ignoring long string value record for variable %s " + "because the record's width (%d) does not match the " + "variable's width (%d)"), + var_name, width, var_get_width (v)); + skip_long_string_value_labels (r, n_labels); + continue; + } + + /* Read values. */ + value_init_pool (r->pool, &value, width); + for (i = 0; i < n_labels; i++) + { + size_t value_length, label_length; + char label[256]; + bool skip = false; + + /* Read value. */ + value_length = read_int (r); + if (value_length == width) + read_bytes (r, value_str_rw (&value, width), width); + else + { + sys_warn (r, _("Ignoring long string value %zu for variable %s, " + "with width %d, that has bad value width %zu."), + i, var_get_name (v), width, value_length); + skip_bytes (r, value_length); + skip = true; + } + + /* Read label. */ + label_length = read_int (r); + read_string (r, label, MIN (sizeof label, label_length + 1)); + if (label_length >= sizeof label) + { + /* Skip and silently ignore label text after the + first 255 bytes. The maximum documented length + of a label is 120 bytes so this is more than + generous. */ + skip_bytes (r, sizeof label - (label_length + 1)); + } + + if (!skip && !var_add_value_label (v, &value, label)) + sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."), + width, value_str (&value, width), var_get_name (v)); + } + } +} + + +/* Reads record type 7, subtype 18, which lists custom + attributes on individual variables. */ +static void +read_variable_attributes (struct sfm_reader *r, + size_t size, size_t count, + struct dictionary *dict) +{ + struct text_record *text = open_text_record (r, size * count); + for (;;) + { + struct variable *var; + if (!text_read_short_name (r, dict, text, ss_cstr (":"), &var)) + break; + read_attributes (r, text, var != NULL ? var_get_attributes (var) : NULL); + } + close_text_record (r, text); +} + /* Case reader. */ static void partial_record (struct sfm_reader *r) NO_RETURN; + +static void read_error (struct casereader *, const struct sfm_reader *); + static bool read_case_number (struct sfm_reader *, double *); -static bool read_case_string (struct sfm_reader *, char *, size_t); +static bool read_case_string (struct sfm_reader *, uint8_t *, size_t); static int read_opcode (struct sfm_reader *); static bool read_compressed_number (struct sfm_reader *, double *); -static bool read_compressed_string (struct sfm_reader *, char *); -static bool read_whole_strings (struct sfm_reader *, char *, size_t); - -/* Reads one case from READER's file into C. Returns nonzero - only if successful. */ -int -sfm_read_case (struct sfm_reader *r, struct ccase *c) +static bool read_compressed_string (struct sfm_reader *, uint8_t *); +static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t); +static bool skip_whole_strings (struct sfm_reader *, size_t); + +/* Reads and returns one case from READER's file. Returns a null + pointer if not successful. */ +static struct ccase * +sys_file_casereader_read (struct casereader *reader, void *r_) { + struct sfm_reader *r = r_; + struct ccase *volatile c; + int i; + if (r->error) - return 0; + return NULL; + c = case_create (r->proto); if (setjmp (r->bail_out)) - return 0; + { + casereader_force_error (reader); + case_unref (c); + return NULL; + } - if (!r->compressed && sizeof (double) == 8 && !r->has_vls) + for (i = 0; i < r->sfm_var_cnt; i++) { - /* Fast path. Read the whole case directly. */ - if (!try_read_bytes (r, case_data_all_rw (c), - sizeof (union value) * r->value_cnt)) - return 0; + struct sfm_var *sv = &r->sfm_vars[i]; + union value *v = case_data_rw_idx (c, sv->case_index); - /* Convert floating point numbers to native format if needed. */ - if (r->float_format != FLOAT_NATIVE_DOUBLE) + if (sv->var_width == 0) { - int i; - - for (i = 0; i < r->var_cnt; i++) - if (r->vars[i].width == 0) - { - double *d = &case_data_rw_idx (c, r->vars[i].case_index)->f; - float_convert (r->float_format, d, FLOAT_NATIVE_DOUBLE, d); - } + if (!read_case_number (r, &v->f)) + goto eof; } - return 1; - } - else - { - /* Slow path. Convert from external to internal format. */ - int i; - - for (i = 0; i < r->var_cnt; i++) + else { - struct sfm_var *sv = &r->vars[i]; - union value *v = case_data_rw_idx (c, sv->case_index); - - if (sv->width == 0) - { - if (!read_case_number (r, &v->f)) - goto eof; - } - else - { - /* Read the string data in segments up to 255 bytes - at a time, packed into 8-byte units. */ - const int max_chunk = MIN_VERY_LONG_STRING - 1; - int ofs, chunk_size; - for (ofs = 0; ofs < sv->width; ofs += chunk_size) - { - chunk_size = MIN (max_chunk, sv->width - ofs); - if (!read_case_string (r, v->s + ofs, chunk_size)) - { - if (ofs) - partial_record (r); - goto eof; - } - } - - /* Very long strings have trailing wasted space - that we must skip. */ - if (sv->width >= MIN_VERY_LONG_STRING) - { - int bytes_read = (sv->width / max_chunk * 256 - + ROUND_UP (sv->width % max_chunk, 8)); - int total_bytes = sfm_width_to_bytes (sv->width); - int excess_bytes = total_bytes - bytes_read; - - while (excess_bytes > 0) - { - char buffer[1024]; - size_t chunk = MIN (sizeof buffer, excess_bytes); - if (!read_whole_strings (r, buffer, chunk)) - partial_record (r); - excess_bytes -= chunk; - } - } - } + uint8_t *s = value_str_rw (v, sv->var_width); + if (!read_case_string (r, s + sv->offset, sv->segment_width)) + goto eof; + if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8))) + partial_record (r); } - return 1; - - eof: - if (i != 0) - partial_record (r); - return 0; } + return c; + +eof: + case_unref (c); + if (i != 0) + partial_record (r); + if (r->case_cnt != -1) + read_error (reader, r); + return NULL; } /* Issues an error that R ends in a partial record. */ @@ -1207,20 +1608,29 @@ partial_record (struct sfm_reader *r) sys_error (r, _("File ends in partial case.")); } +/* Issues an error that an unspecified error occurred SFM, and + marks R tainted. */ +static void +read_error (struct casereader *r, const struct sfm_reader *sfm) +{ + msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh)); + casereader_force_error (r); +} + /* Reads a number from R and stores its value in *D. If R is compressed, reads a compressed number; otherwise, reads a number in the regular way. Returns true if successful, false if end of file is reached immediately. */ static bool -read_case_number (struct sfm_reader *r, double *d) +read_case_number (struct sfm_reader *r, double *d) { if (!r->compressed) { - uint8_t flt64[8]; - if (!try_read_bytes (r, flt64, sizeof flt64)) + uint8_t number[8]; + if (!try_read_bytes (r, number, sizeof number)) return false; - *d = flt64_to_double (r, flt64); + float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d); return true; } else @@ -1235,12 +1645,12 @@ read_case_number (struct sfm_reader *r, double *d) Returns true if successful, false if end of file is reached immediately. */ static bool -read_case_string (struct sfm_reader *r, char *s, size_t length) +read_case_string (struct sfm_reader *r, uint8_t *s, size_t length) { size_t whole = ROUND_DOWN (length, 8); size_t partial = length % 8; - - if (whole) + + if (whole) { if (!read_whole_strings (r, s, whole)) return false; @@ -1248,12 +1658,12 @@ read_case_string (struct sfm_reader *r, char *s, size_t length) if (partial) { - char bounce[8]; + uint8_t bounce[8]; if (!read_whole_strings (r, bounce, sizeof bounce)) { if (whole) partial_record (r); - return false; + return false; } memcpy (s + whole, bounce, partial); } @@ -1263,13 +1673,13 @@ read_case_string (struct sfm_reader *r, char *s, size_t length) /* Reads and returns the next compression opcode from R. */ static int -read_opcode (struct sfm_reader *r) +read_opcode (struct sfm_reader *r) { assert (r->compressed); for (;;) { int opcode; - if (r->opcode_idx >= sizeof r->opcodes) + if (r->opcode_idx >= sizeof r->opcodes) { if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes)) return -1; @@ -1288,7 +1698,7 @@ read_opcode (struct sfm_reader *r) static bool read_compressed_number (struct sfm_reader *r, double *d) { - int opcode = read_opcode (r); + int opcode = read_opcode (r); switch (opcode) { case -1: @@ -1296,9 +1706,9 @@ read_compressed_number (struct sfm_reader *r, double *d) return false; case 253: - *d = read_flt64 (r); + *d = read_float (r); break; - + case 254: sys_error (r, _("Compressed data is corrupt.")); @@ -1319,7 +1729,7 @@ read_compressed_number (struct sfm_reader *r, double *d) Returns true if successful, false if end of file is reached immediately. */ static bool -read_compressed_string (struct sfm_reader *r, char *dst) +read_compressed_string (struct sfm_reader *r, uint8_t *dst) { switch (read_opcode (r)) { @@ -1348,7 +1758,7 @@ read_compressed_string (struct sfm_reader *r, char *dst) Returns true if successful, false if end of file is reached immediately. */ static bool -read_whole_strings (struct sfm_reader *r, char *s, size_t length) +read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length) { assert (length % 8 == 0); if (!r->compressed) @@ -1357,7 +1767,7 @@ read_whole_strings (struct sfm_reader *r, char *s, size_t length) { size_t ofs; for (ofs = 0; ofs < length; ofs += 8) - if (!read_compressed_string (r, s + ofs)) + if (!read_compressed_string (r, s + ofs)) { if (ofs != 0) partial_record (r); @@ -1366,6 +1776,20 @@ read_whole_strings (struct sfm_reader *r, char *s, size_t length) return true; } } + +/* Skips LENGTH string bytes from R. + LENGTH must be a multiple of 8. + (LENGTH is also limited to 1024, but that's only because the + current caller never needs more than that many bytes.) + Returns true if successful, false if end of file is + reached immediately. */ +static bool +skip_whole_strings (struct sfm_reader *r, size_t length) +{ + uint8_t buffer[1024]; + assert (length < sizeof buffer); + return read_whole_strings (r, buffer, length); +} /* Creates and returns a table that can be used for translating a value index into a case to a "struct variable *" for DICT. Multiple @@ -1376,15 +1800,15 @@ read_whole_strings (struct sfm_reader *r, char *s, size_t length) values to be deleted from the case and the dictionary to be compacted. */ static struct variable ** -make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict) +make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict) { struct variable **var_by_value_idx; int value_idx = 0; int i; var_by_value_idx = pool_nmalloc (r->pool, - r->value_cnt, sizeof *var_by_value_idx); - for (i = 0; i < dict_get_var_cnt (dict); i++) + r->oct_cnt, sizeof *var_by_value_idx); + for (i = 0; i < dict_get_var_cnt (dict); i++) { struct variable *v = dict_get_var (dict, i); int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8); @@ -1394,7 +1818,7 @@ make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict) for (j = 1; j < nv; j++) var_by_value_idx[value_idx++] = NULL; } - assert (value_idx == r->value_cnt); + assert (value_idx == r->oct_cnt); return var_by_value_idx; } @@ -1404,13 +1828,13 @@ make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict) is valid. */ static struct variable * lookup_var_by_value_idx (struct sfm_reader *r, - struct variable **var_by_value_idx, int value_idx) + struct variable **var_by_value_idx, int value_idx) { struct variable *var; - - if (value_idx < 1 || value_idx > r->value_cnt) + + if (value_idx < 1 || value_idx > r->oct_cnt) sys_error (r, _("Variable index %d not in valid range 1...%d."), - value_idx, r->value_cnt); + value_idx, r->oct_cnt); var = var_by_value_idx[value_idx - 1]; if (var == NULL) @@ -1432,97 +1856,139 @@ lookup_var_by_short_name (struct dictionary *d, const char *short_name) /* First try looking up by full name. This often succeeds. */ var = dict_lookup_var (d, short_name); - if (var != NULL && !strcasecmp (var_get_short_name (var), short_name)) + if (var != NULL && !strcasecmp (var_get_short_name (var, 0), short_name)) return var; /* Iterate through the whole dictionary as a fallback. */ var_cnt = dict_get_var_cnt (d); - for (i = 0; i < var_cnt; i++) + for (i = 0; i < var_cnt; i++) { var = dict_get_var (d, i); - if (!strcasecmp (var_get_short_name (var), short_name)) + if (!strcasecmp (var_get_short_name (var, 0), short_name)) return var; } return NULL; } -/* Helpers for reading records that contain "variable=value" - pairs. */ +/* Helpers for reading records that contain structured text + strings. */ + +/* Maximum number of warnings to issue for a single text + record. */ +#define MAX_TEXT_WARNINGS 5 /* State. */ -struct variable_to_value_map +struct text_record { struct substring buffer; /* Record contents. */ size_t pos; /* Current position in buffer. */ + int n_warnings; /* Number of warnings issued or suppressed. */ }; -/* Reads SIZE bytes into a "variable=value" map for R, - and returns the map. */ -static struct variable_to_value_map * -open_variable_to_value_map (struct sfm_reader *r, size_t size) +/* Reads SIZE bytes into a text record for R, + and returns the new text record. */ +static struct text_record * +open_text_record (struct sfm_reader *r, size_t size) { - struct variable_to_value_map *map = pool_alloc (r->pool, sizeof *map); + struct text_record *text = pool_alloc (r->pool, sizeof *text); char *buffer = pool_malloc (r->pool, size + 1); read_bytes (r, buffer, size); - map->buffer = ss_buffer (buffer, size); - map->pos = 0; - return map; + text->buffer = ss_buffer (buffer, size); + text->pos = 0; + text->n_warnings = 0; + return text; } -/* Closes MAP and frees its storage. - Not really needed, because the pool will free the map anyway, - but can be used to free it earlier. */ +/* Closes TEXT, frees its storage, and issues a final warning + about suppressed warnings if necesary. */ static void -close_variable_to_value_map (struct sfm_reader *r, - struct variable_to_value_map *map) +close_text_record (struct sfm_reader *r, struct text_record *text) { - pool_free (r->pool, ss_data (map->buffer)); + if (text->n_warnings > MAX_TEXT_WARNINGS) + sys_warn (r, _("Suppressed %d additional related warnings."), + text->n_warnings - MAX_TEXT_WARNINGS); + pool_free (r->pool, ss_data (text->buffer)); } -/* Reads the next variable=value pair from MAP. +/* Reads a variable=value pair from TEXT. Looks up the variable in DICT and stores it into *VAR. Stores a null-terminated value into *VALUE. */ static bool -read_variable_to_value_map (struct sfm_reader *r, struct dictionary *dict, - struct variable_to_value_map *map, - struct variable **var, char **value, - int *warning_cnt) +read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict, + struct text_record *text, + struct variable **var, char **value) { - int max_warnings = 5; - - for (;;) + for (;;) { - struct substring short_name_ss, value_ss; - - if (!ss_tokenize (map->buffer, ss_cstr ("="), &map->pos, &short_name_ss) - || !ss_tokenize (map->buffer, ss_buffer ("\t\0", 2), &map->pos, - &value_ss)) - { - if (*warning_cnt > max_warnings) - sys_warn (r, _("Suppressed %d additional variable map warnings."), - *warning_cnt - max_warnings); - return false; - } + if (!text_read_short_name (r, dict, text, ss_cstr ("="), var)) + return false; - map->pos += ss_span (ss_substr (map->buffer, map->pos, SIZE_MAX), - ss_buffer ("\t\0", 2)); + *value = text_get_token (text, ss_buffer ("\t\0", 2)); + if (*value == NULL) + return false; - ss_data (short_name_ss)[ss_length (short_name_ss)] = '\0'; - *var = lookup_var_by_short_name (dict, ss_data (short_name_ss)); - if (*var == NULL) - { - if (++*warning_cnt <= 5) - sys_warn (r, _("Variable map refers to unknown variable %s."), - ss_data (short_name_ss)); - continue; - } + text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX), + ss_buffer ("\t\0", 2)); - ss_data (value_ss)[ss_length (value_ss)] = '\0'; - *value = ss_data (value_ss); + if (*var != NULL) + return true; + } +} +static bool +text_read_short_name (struct sfm_reader *r, struct dictionary *dict, + struct text_record *text, struct substring delimiters, + struct variable **var) +{ + char *short_name = text_get_token (text, delimiters); + if (short_name == NULL) + return false; + + *var = lookup_var_by_short_name (dict, short_name); + if (*var == NULL) + text_warn (r, text, _("Variable map refers to unknown variable %s."), + short_name); + return true; +} + +/* Displays a warning for the current file position, limiting the + number to MAX_TEXT_WARNINGS for TEXT. */ +static void +text_warn (struct sfm_reader *r, struct text_record *text, + const char *format, ...) +{ + if (text->n_warnings++ < MAX_TEXT_WARNINGS) + { + va_list args; + + va_start (args, format); + sys_msg (r, MW, format, args); + va_end (args); + } +} + +static char * +text_get_token (struct text_record *text, struct substring delimiters) +{ + struct substring token; + + if (!ss_tokenize (text->buffer, delimiters, &text->pos, &token)) + return NULL; + ss_data (token)[ss_length (token)] = '\0'; + return ss_data (token); +} + +static bool +text_match (struct text_record *text, char c) +{ + if (text->buffer.string[text->pos] == c) + { + text->pos++; return true; } + else + return false; } /* Messages. */ @@ -1550,10 +2016,10 @@ sys_msg (struct sfm_reader *r, int class, const char *format, va_list args) /* Displays a warning for the current file position. */ static void -sys_warn (struct sfm_reader *r, const char *format, ...) +sys_warn (struct sfm_reader *r, const char *format, ...) { va_list args; - + va_start (args, format); sys_msg (r, MW, format, args); va_end (args); @@ -1563,10 +2029,10 @@ sys_warn (struct sfm_reader *r, const char *format, ...) marks it as in an error state, and aborts reading it using longjmp. */ static void -sys_error (struct sfm_reader *r, const char *format, ...) +sys_error (struct sfm_reader *r, const char *format, ...) { va_list args; - + va_start (args, format); sys_msg (r, ME, format, args); va_end (args); @@ -1616,28 +2082,28 @@ try_read_bytes (struct sfm_reader *r, void *buf, size_t byte_cnt) /* Reads a 32-bit signed integer from R and returns its value in host format. */ -static int32_t -read_int32 (struct sfm_reader *r) +static int +read_int (struct sfm_reader *r) { - uint8_t int32[4]; - read_bytes (r, int32, sizeof int32); - return int32_to_native (r, int32); + uint8_t integer[4]; + read_bytes (r, integer, sizeof integer); + return integer_get (r->integer_format, integer, sizeof integer); } /* Reads a 64-bit floating-point number from R and returns its value in host format. */ static double -read_flt64 (struct sfm_reader *r) +read_float (struct sfm_reader *r) { - uint8_t flt64[8]; - read_bytes (r, flt64, sizeof flt64); - return flt64_to_double (r, flt64); + uint8_t number[8]; + read_bytes (r, number, sizeof number); + return float_get_double (r->float_format, number); } /* Reads exactly SIZE - 1 bytes into BUFFER and stores a null byte into BUFFER[SIZE - 1]. */ static void -read_string (struct sfm_reader *r, char *buffer, size_t size) +read_string (struct sfm_reader *r, char *buffer, size_t size) { assert (size > 0); read_bytes (r, buffer, size - 1); @@ -1648,7 +2114,7 @@ read_string (struct sfm_reader *r, char *buffer, size_t size) static void skip_bytes (struct sfm_reader *r, size_t bytes) { - while (bytes > 0) + while (bytes > 0) { char buffer[1024]; size_t chunk = MIN (sizeof buffer, bytes); @@ -1657,30 +2123,10 @@ skip_bytes (struct sfm_reader *r, size_t bytes) } } -/* Returns the value of the 32-bit signed integer at INT32, - converted from the format used by R to the host format. */ -static int32_t -int32_to_native (const struct sfm_reader *r, const uint8_t int32[4]) -{ - int32_t x; - if (r->integer_format == INTEGER_NATIVE) - memcpy (&x, int32, sizeof x); - else - x = integer_get (r->integer_format, int32, sizeof x); - return x; -} - -/* Returns the value of the 64-bit floating point number at - FLT64, converted from the format used by R to the host - format. */ -static double -flt64_to_double (const struct sfm_reader *r, const uint8_t flt64[8]) -{ - double x; - if (r->float_format == FLOAT_NATIVE_DOUBLE) - memcpy (&x, flt64, sizeof x); - else - float_convert (r->float_format, flt64, FLOAT_NATIVE_DOUBLE, &x); - return x; -} - +static const struct casereader_class sys_file_casereader_class = + { + sys_file_casereader_read, + sys_file_casereader_destroy, + NULL, + NULL, + };