-/* PSPP - computes sample statistics.
- Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
+/* PSPP - a program for statistical analysis.
+ Copyright (C) 1997-2000, 2006-2007, 2009-2011 Free Software Foundation, Inc.
- This program is free software; you can redistribute it and/or
- modify it under the terms of the GNU General Public License as
- published by the Free Software Foundation; either version 2 of the
- License, or (at your option) any later version.
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
- This program is distributed in the hope that it will be useful, but
- WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- General Public License for more details.
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- 02110-1301, USA. */
+ along with this program. If not, see <http://www.gnu.org/licenses/>. */
#include <config.h>
-#include <data/sys-file-reader.h>
-#include <data/sys-file-private.h>
+#include "data/sys-file-reader.h"
+#include "data/sys-file-private.h"
#include <errno.h>
#include <float.h>
#include <setjmp.h>
#include <stdlib.h>
-#include <libpspp/alloc.h>
-#include <libpspp/assertion.h>
-#include <libpspp/message.h>
-#include <libpspp/compiler.h>
-#include <libpspp/magic.h>
-#include <libpspp/misc.h>
-#include <libpspp/pool.h>
-#include <libpspp/str.h>
-#include <libpspp/hash.h>
-#include <libpspp/array.h>
-
-#include <data/case.h>
-#include <data/casereader-provider.h>
-#include <data/casereader.h>
-#include <data/dictionary.h>
-#include <data/file-handle-def.h>
-#include <data/file-name.h>
-#include <data/format.h>
-#include <data/missing-values.h>
-#include <data/value-labels.h>
-#include <data/variable.h>
-#include <data/value.h>
-
-#include "c-ctype.h"
-#include "inttostr.h"
-#include "minmax.h"
-#include "unlocked-io.h"
-#include "xsize.h"
+#include "data/attributes.h"
+#include "data/case.h"
+#include "data/casereader-provider.h"
+#include "data/casereader.h"
+#include "data/dictionary.h"
+#include "data/file-handle-def.h"
+#include "data/file-name.h"
+#include "data/format.h"
+#include "data/identifier.h"
+#include "data/missing-values.h"
+#include "data/mrset.h"
+#include "data/short-names.h"
+#include "data/value-labels.h"
+#include "data/value.h"
+#include "data/variable.h"
+#include "libpspp/array.h"
+#include "libpspp/assertion.h"
+#include "libpspp/compiler.h"
+#include "libpspp/i18n.h"
+#include "libpspp/message.h"
+#include "libpspp/misc.h"
+#include "libpspp/pool.h"
+#include "libpspp/str.h"
+#include "libpspp/stringi-set.h"
+
+#include "gl/c-ctype.h"
+#include "gl/inttostr.h"
+#include "gl/localcharset.h"
+#include "gl/minmax.h"
+#include "gl/unlocked-io.h"
+#include "gl/xalloc.h"
+#include "gl/xsize.h"
#include "gettext.h"
#define _(msgid) gettext (msgid)
#define N_(msgid) (msgid)
+/* Subtype codes of extension (type 7) records. sfm_open_reader() uses the
+ subtype read from the file directly as an index into its extensions[]
+ array, so these values double as array indexes. */
+enum
+ {
+ /* subtypes 0-2 unknown */
+ EXT_INTEGER = 3, /* Machine integer info. */
+ EXT_FLOAT = 4, /* Machine floating-point info. */
+ EXT_VAR_SETS = 5, /* Variable sets. */
+ EXT_DATE = 6, /* DATE. */
+ EXT_MRSETS = 7, /* Multiple response sets. */
+ EXT_DATA_ENTRY = 8, /* SPSS Data Entry. */
+ /* subtypes 9-10 unknown */
+ EXT_DISPLAY = 11, /* Variable display parameters. */
+ /* subtype 12 unknown */
+ EXT_LONG_NAMES = 13, /* Long variable names. */
+ EXT_LONG_STRINGS = 14, /* Long strings. */
+ /* subtype 15 unknown */
+ EXT_NCASES = 16, /* Extended number of cases. */
+ EXT_FILE_ATTRS = 17, /* Data file attributes. */
+ EXT_VAR_ATTRS = 18, /* Variable attributes. */
+ EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
+ EXT_ENCODING = 20, /* Character encoding. */
+ EXT_LONG_LABELS = 21 /* Value labels for long strings. */
+ };
+
+/* Raw contents of one variable (type 2) record, as stored by
+ read_variable_record(), which zeroes the whole structure before filling
+ it in. */
+struct sfm_var_record
+ {
+ off_t pos; /* Reader position when the record's fields were read. */
+ int width; /* Width field; 0 is validated as a numeric variable. */
+ char name[8]; /* Name bytes exactly as read; no null terminator. */
+ int print_format; /* Print format field, not yet decoded. */
+ int write_format; /* Write format field, not yet decoded. */
+ int missing_value_code; /* Missing value indicator, in the range -3...3. */
+ uint8_t missing[24]; /* First 8 * abs (missing_value_code) bytes are used. */
+ char *label; /* Null-terminated label, or NULL if the record has none. */
+ struct variable *var; /* NOTE(review): never set by the reading code shown
+ here; presumably filled in when the record is
+ parsed into a dictionary variable -- confirm. */
+ };
+
+/* One (value, label) pair from a value label (type 3) record, as stored by
+ read_value_label_record(). */
+struct sfm_value_label
+ {
+ uint8_t value[8]; /* The 8 value bytes exactly as read from the file. */
+ char *label; /* Null-terminated label text, allocated from the reader's pool. */
+ };
+
+/* A value label (type 3) record together with the variable index (type 4)
+ record that follows it, as stored by read_value_label_record(). */
+struct sfm_value_label_record
+ {
+ off_t pos; /* Reader position when the type 3 record's fields were read. */
+ struct sfm_value_label *labels; /* The N_LABELS value/label pairs. */
+ size_t n_labels; /* Number of elements in LABELS. */
+
+ int *vars; /* The N_VARS variable indexes read from the type 4 record. */
+ size_t n_vars; /* Number of elements in VARS; at least 1. */
+ };
+
+/* A document (type 6) record, as stored by read_document_record(). */
+struct sfm_document_record
+ {
+ off_t pos; /* Reader position when the record's fields were read. */
+ char *documents; /* N_LINES * DOC_LINE_LENGTH bytes of text exactly as
+ read; no null terminator is added. */
+ size_t n_lines; /* Number of document lines; always greater than 0. */
+ };
+
+/* An extension (type 7) record. read_extension_record_header() fills in
+ POS, SIZE, and COUNT and rejects records for which SIZE * COUNT + 1 would
+ overflow. */
+struct sfm_extension_record
+ {
+ off_t pos; /* Starting offset in file. */
+ size_t size; /* Size of data elements. */
+ size_t count; /* Number of data elements. */
+ void *data; /* Contents. */
+ };
+
/* System file reader. */
struct sfm_reader
{
/* File state. */
struct file_handle *fh; /* File handle. */
+ struct fh_lock *lock; /* Mutual exclusion for file handle. */
FILE *file; /* File stream. */
+ off_t pos; /* Position in file. */
bool error; /* I/O or corruption error? */
- size_t value_cnt; /* Number of "union value"s in struct case. */
+ struct caseproto *proto; /* Format of output cases. */
/* File format. */
enum integer_format integer_format; /* On-disk integer format. */
enum float_format float_format; /* On-disk floating point format. */
- int flt64_cnt; /* Number of 8-byte units per case. */
- struct sfm_var *vars; /* Variables. */
- size_t var_cnt; /* Number of variables. */
- bool has_long_var_names; /* File has a long variable name map */
- bool has_vls; /* File has one or more very long strings? */
+ struct sfm_var *sfm_vars; /* Variables. */
+ size_t sfm_var_cnt; /* Number of variables. */
+ casenumber case_cnt; /* Number of cases */
+ const char *encoding; /* String encoding. */
/* Decompression. */
bool compressed; /* File is compressed? */
double bias; /* Compression bias, usually 100.0. */
uint8_t opcodes[8]; /* Current block of opcodes. */
size_t opcode_idx; /* Next opcode to interpret, 8 if none left. */
+ bool corruption_warning; /* Warned about possible corruption? */
};
-/* A variable in a system file. */
-struct sfm_var
- {
- int width; /* 0=numeric, otherwise string width. */
- int case_index; /* Index into case. */
- };
-
-static struct casereader_class sys_file_casereader_class;
+static const struct casereader_class sys_file_casereader_class;
static bool close_reader (struct sfm_reader *);
-static struct variable **make_var_by_value_idx (struct sfm_reader *,
- struct dictionary *);
-static struct variable *lookup_var_by_value_idx (struct sfm_reader *,
- struct variable **,
- int value_idx);
-
-static void sys_warn (struct sfm_reader *, const char *, ...)
- PRINTF_FORMAT (2, 3);
-
-static void sys_error (struct sfm_reader *, const char *, ...)
- PRINTF_FORMAT (2, 3)
+static struct variable *lookup_var_by_index (struct sfm_reader *, off_t,
+ const struct sfm_var_record *,
+ size_t n, int idx);
+
+static void sys_msg (struct sfm_reader *r, off_t, int class,
+ const char *format, va_list args)
+ PRINTF_FORMAT (4, 0);
+static void sys_warn (struct sfm_reader *, off_t, const char *, ...)
+ PRINTF_FORMAT (3, 4);
+static void sys_error (struct sfm_reader *, off_t, const char *, ...)
+ PRINTF_FORMAT (3, 4)
NO_RETURN;
static void read_bytes (struct sfm_reader *, void *, size_t);
static bool try_read_bytes (struct sfm_reader *, void *, size_t);
-static int32_t read_int32 (struct sfm_reader *);
-static double read_flt64 (struct sfm_reader *);
+static int read_int (struct sfm_reader *);
+static double read_float (struct sfm_reader *);
static void read_string (struct sfm_reader *, char *, size_t);
static void skip_bytes (struct sfm_reader *, size_t);
-static int32_t int32_to_native (const struct sfm_reader *, const uint8_t[4]);
-static double flt64_to_double (const struct sfm_reader *, const uint8_t[8]);
-
-static struct variable_to_value_map *open_variable_to_value_map (
- struct sfm_reader *, size_t size);
-static void close_variable_to_value_map (struct sfm_reader *r,
- struct variable_to_value_map *);
-static bool read_variable_to_value_map (struct sfm_reader *,
- struct dictionary *,
- struct variable_to_value_map *,
- struct variable **var, char **value,
- int *warning_cnt);
+static int parse_int (struct sfm_reader *, const void *data, size_t ofs);
+static double parse_float (struct sfm_reader *, const void *data, size_t ofs);
+
+static void read_variable_record (struct sfm_reader *,
+ struct sfm_var_record *);
+static void read_value_label_record (struct sfm_reader *,
+ struct sfm_value_label_record *,
+ size_t n_vars);
+static struct sfm_document_record *read_document_record (struct sfm_reader *);
+static struct sfm_extension_record *read_extension_record (
+ struct sfm_reader *, int subtype);
+static void skip_extension_record (struct sfm_reader *, int subtype);
+
+static const char *choose_encoding (
+ struct sfm_reader *,
+ const struct sfm_extension_record *ext_integer,
+ const struct sfm_extension_record *ext_encoding);
+
+static struct text_record *open_text_record (
+ struct sfm_reader *, const struct sfm_extension_record *,
+ bool recode_to_utf8);
+static void close_text_record (struct sfm_reader *,
+ struct text_record *);
+static bool read_variable_to_value_pair (struct sfm_reader *,
+ struct dictionary *,
+ struct text_record *,
+ struct variable **var, char **value);
+static void text_warn (struct sfm_reader *r, struct text_record *text,
+ const char *format, ...)
+ PRINTF_FORMAT (3, 4);
+static char *text_get_token (struct text_record *,
+ struct substring delimiters, char *delimiter);
+static bool text_match (struct text_record *, char c);
+static bool text_read_variable_name (struct sfm_reader *, struct dictionary *,
+ struct text_record *,
+ struct substring delimiters,
+ struct variable **);
+static bool text_read_short_name (struct sfm_reader *, struct dictionary *,
+ struct text_record *,
+ struct substring delimiters,
+ struct variable **);
+static const char *text_parse_counted_string (struct sfm_reader *,
+ struct text_record *);
+static size_t text_pos (const struct text_record *);
static bool close_reader (struct sfm_reader *r);
\f
/* Dictionary reader. */
-enum which_format
+enum which_format
{
PRINT_FORMAT,
WRITE_FORMAT
};
-static void read_header (struct sfm_reader *, struct dictionary *,
- int *weight_idx, int *claimed_flt64_cnt,
- struct sfm_read_info *);
-static void read_variable_record (struct sfm_reader *, struct dictionary *,
- int *format_warning_cnt);
-static void parse_format_spec (struct sfm_reader *, uint32_t,
- enum which_format, struct variable *,
- int *format_warning_cnt);
-static void setup_weight (struct sfm_reader *, int weight_idx,
- struct variable **var_by_value_idx,
+static void read_header (struct sfm_reader *, int *weight_idx,
+ int *claimed_oct_cnt, struct sfm_read_info *,
+ char **file_labelp);
+static void parse_file_label (struct sfm_reader *, const char *file_label,
+ struct dictionary *);
+static void parse_variable_records (struct sfm_reader *, struct dictionary *,
+ struct sfm_var_record *, size_t n);
+static void parse_format_spec (struct sfm_reader *, off_t pos,
+ unsigned int format, enum which_format,
+ struct variable *, int *format_warning_cnt);
+static void parse_document (struct dictionary *, struct sfm_document_record *);
+static void parse_display_parameters (struct sfm_reader *,
+ const struct sfm_extension_record *,
+ struct dictionary *);
+static void parse_machine_integer_info (struct sfm_reader *,
+ const struct sfm_extension_record *,
+ struct sfm_read_info *);
+static void parse_machine_float_info (struct sfm_reader *,
+ const struct sfm_extension_record *);
+static void parse_mrsets (struct sfm_reader *,
+ const struct sfm_extension_record *,
struct dictionary *);
-static void read_documents (struct sfm_reader *, struct dictionary *);
-static void read_value_labels (struct sfm_reader *, struct dictionary *,
- struct variable **var_by_value_idx);
-
-static void read_extension_record (struct sfm_reader *, struct dictionary *);
-static void read_machine_int32_info (struct sfm_reader *,
- size_t size, size_t count);
-static void read_machine_flt64_info (struct sfm_reader *,
- size_t size, size_t count);
-static void read_display_parameters (struct sfm_reader *,
- size_t size, size_t count,
+static void parse_long_var_name_map (struct sfm_reader *,
+ const struct sfm_extension_record *,
struct dictionary *);
-static void read_long_var_name_map (struct sfm_reader *,
- size_t size, size_t count,
- struct dictionary *);
-static void read_long_string_map (struct sfm_reader *,
- size_t size, size_t count,
- struct dictionary *);
-
+static void parse_long_string_map (struct sfm_reader *,
+ const struct sfm_extension_record *,
+ struct dictionary *);
+static void parse_value_labels (struct sfm_reader *, struct dictionary *,
+ const struct sfm_var_record *,
+ size_t n_var_recs,
+ const struct sfm_value_label_record *);
+static void parse_data_file_attributes (struct sfm_reader *,
+ const struct sfm_extension_record *,
+ struct dictionary *);
+static void parse_variable_attributes (struct sfm_reader *,
+ const struct sfm_extension_record *,
+ struct dictionary *);
+static void parse_long_string_value_labels (struct sfm_reader *,
+ const struct sfm_extension_record *,
+ struct dictionary *);
/* Opens the system file designated by file handle FH for
reading. Reads the system file's dictionary into *DICT.
If INFO is non-null, then it receives additional info about the
system file. */
struct casereader *
-sfm_open_reader (struct file_handle *fh, struct dictionary **dict,
- struct sfm_read_info *info)
+sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
+ struct sfm_read_info *volatile info)
{
struct sfm_reader *volatile r = NULL;
- struct variable **var_by_value_idx;
- int format_warning_cnt = 0;
- int weight_idx;
- int claimed_flt64_cnt;
- int rec_type;
- size_t i;
+ struct sfm_read_info local_info;
- if (!fh_open (fh, FH_REF_FILE, "system file", "rs"))
- return NULL;
+ struct sfm_var_record *vars;
+ size_t n_vars, allocated_vars;
+
+ struct sfm_value_label_record *labels;
+ size_t n_labels, allocated_labels;
- *dict = dict_create ();
+ struct sfm_document_record *document;
+
+ struct sfm_extension_record *extensions[32];
+
+ int weight_idx;
+ int claimed_oct_cnt;
+ char *file_label;
+
+ /* DICT is assigned after the setjmp() call below and is read again at the
+ `error' label, which the setjmp() handler jumps to. A non-volatile
+ automatic variable modified between setjmp() and longjmp() has an
+ indeterminate value afterward, so it must be volatile (as R and INFO
+ already are) for dict_destroy() to see the dictionary actually created. */
+ struct dictionary *volatile dict = NULL;
+ size_t i;
/* Create and initialize reader. */
r = pool_create_container (struct sfm_reader, pool);
- r->fh = fh;
- r->file = fn_open (fh_get_file_name (fh), "rb");
+ r->fh = fh_ref (fh);
+ r->lock = NULL;
+ r->file = NULL;
+ r->pos = 0;
r->error = false;
- r->flt64_cnt = 0;
- r->has_vls = false;
- r->has_long_var_names = false;
r->opcode_idx = sizeof r->opcodes;
+ r->corruption_warning = false;
- if (setjmp (r->bail_out))
- {
- close_reader (r);
- dict_destroy (*dict);
- *dict = NULL;
- return NULL;
- }
+ /* TRANSLATORS: this fragment will be interpolated into
+ messages in fh_lock() that identify types of files. */
+ r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
+ if (r->lock == NULL)
+ goto error;
+ r->file = fn_open (fh_get_file_name (fh), "rb");
if (r->file == NULL)
{
- msg (ME, _("Error opening \"%s\" for reading as a system file: %s."),
+ msg (ME, _("Error opening `%s' for reading as a system file: %s."),
fh_get_file_name (r->fh), strerror (errno));
- longjmp (r->bail_out, 1);
+ goto error;
}
+ /* Initialize info. */
+ if (info == NULL)
+ info = &local_info;
+ memset (info, 0, sizeof *info);
+
+ if (setjmp (r->bail_out))
+ goto error;
+
/* Read header. */
- read_header (r, *dict, &weight_idx, &claimed_flt64_cnt, info);
+ read_header (r, &weight_idx, &claimed_oct_cnt, info, &file_label);
- /* Read all the variable definition records. */
- rec_type = read_int32 (r);
- while (rec_type == 2)
- {
- read_variable_record (r, *dict, &format_warning_cnt);
- rec_type = read_int32 (r);
- }
+ vars = NULL;
+ n_vars = allocated_vars = 0;
+
+ labels = NULL;
+ n_labels = allocated_labels = 0;
- /* Figure out the case format. */
- var_by_value_idx = make_var_by_value_idx (r, *dict);
- setup_weight (r, weight_idx, var_by_value_idx, *dict);
+ document = NULL;
- /* Read all the rest of the dictionary records. */
- while (rec_type != 999)
+ memset (extensions, 0, sizeof extensions);
+
+ for (;;)
{
- switch (rec_type)
+ int subtype;
+ int type;
+
+ type = read_int (r);
+ if (type == 999)
+ {
+ read_int (r); /* Skip filler. */
+ break;
+ }
+
+ switch (type)
{
+ case 2:
+ if (n_vars >= allocated_vars)
+ vars = pool_2nrealloc (r->pool, vars, &allocated_vars,
+ sizeof *vars);
+ read_variable_record (r, &vars[n_vars++]);
+ break;
+
case 3:
- read_value_labels (r, *dict, var_by_value_idx);
+ if (n_labels >= allocated_labels)
+ labels = pool_2nrealloc (r->pool, labels, &allocated_labels,
+ sizeof *labels);
+ read_value_label_record (r, &labels[n_labels++], n_vars);
break;
case 4:
- sys_error (r, _("Misplaced type 4 record."));
+ /* A Type 4 record is always immediately after a type 3 record,
+ so the code for type 3 records reads the type 4 record too. */
+ sys_error (r, r->pos, _("Misplaced type 4 record."));
case 6:
- read_documents (r, *dict);
+ if (document != NULL)
+ sys_error (r, r->pos, _("Duplicate type 6 (document) record."));
+ document = read_document_record (r);
break;
case 7:
- read_extension_record (r, *dict);
+ subtype = read_int (r);
+ if (subtype < 0 || subtype >= sizeof extensions / sizeof *extensions)
+ {
+ sys_warn (r, r->pos,
+ _("Unrecognized record type 7, subtype %d. Please "
+ "send a copy of this file, and the syntax which "
+ "created it to %s."),
+ subtype, PACKAGE_BUGREPORT);
+ skip_extension_record (r, subtype);
+ }
+ else if (extensions[subtype] != NULL)
+ {
+ sys_warn (r, r->pos,
+ _("Record type 7, subtype %d found here has the same "
+ "type as the record found near offset 0x%llx. "
+ "Please send a copy of this file, and the syntax "
+ "which created it to %s."),
+ subtype, (long long int) extensions[subtype]->pos,
+ PACKAGE_BUGREPORT);
+ skip_extension_record (r, subtype);
+ }
+ else
+ extensions[subtype] = read_extension_record (r, subtype);
break;
default:
- sys_error (r, _("Unrecognized record type %d."), rec_type);
+ sys_error (r, r->pos, _("Unrecognized record type %d."), type);
+ goto error;
}
- rec_type = read_int32 (r);
}
+ /* Now actually parse what we read.
- if ( ! r->has_long_var_names )
- {
- int i;
- for (i = 0; i < dict_get_var_cnt (*dict); i++)
- {
- struct variable *var = dict_get_var (*dict, i);
- char short_name [SHORT_NAME_LEN + 1];
- char long_name [SHORT_NAME_LEN + 1];
+ First, figure out the correct character encoding, because this determines
+ how the rest of the header data is to be interpreted. */
+ dict = dict_create (choose_encoding (r, extensions[EXT_INTEGER],
+ extensions[EXT_ENCODING]));
+ r->encoding = dict_get_encoding (dict);
- strcpy (short_name, var_get_name (var));
+ /* These records don't use variables at all. */
+ if (document != NULL)
+ parse_document (dict, document);
- strcpy (long_name, short_name);
- str_lowercase (long_name);
+ if (extensions[EXT_INTEGER] != NULL)
+ parse_machine_integer_info (r, extensions[EXT_INTEGER], info);
- /* Set long name. Renaming a variable may clear the short
- name, but we want to retain it, so re-set it
- explicitly. */
- dict_rename_var (*dict, var, long_name);
- var_set_short_name (var, short_name);
- }
+ if (extensions[EXT_FLOAT] != NULL)
+ parse_machine_float_info (r, extensions[EXT_FLOAT]);
+
+ if (extensions[EXT_FILE_ATTRS] != NULL)
+ parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict);
+
+ parse_file_label (r, file_label, dict);
- r->has_long_var_names = true;
+ /* Parse the variable records, the basis of almost everything else. */
+ parse_variable_records (r, dict, vars, n_vars);
+
+ /* Parse value labels and the weight variable immediately after the variable
+ records. These records use indexes into vars[], the array of variable
+ records read above, so we must parse them before those indexes become
+ invalidated by very long string variables. */
+ for (i = 0; i < n_labels; i++)
+ parse_value_labels (r, dict, vars, n_vars, &labels[i]);
+ if (weight_idx != 0)
+ {
+ struct variable *weight_var;
+
+ weight_var = lookup_var_by_index (r, 76, vars, n_vars, weight_idx);
+ if (var_is_numeric (weight_var))
+ dict_set_weight (dict, weight_var);
+ else
+ sys_error (r, -1, _("Weighting variable must be numeric "
+ "(not string variable `%s')."),
+ var_get_name (weight_var));
}
- /* Read record 999 data, which is just filler. */
- read_int32 (r);
+ if (extensions[EXT_DISPLAY] != NULL)
+ parse_display_parameters (r, extensions[EXT_DISPLAY], dict);
+
+ /* The following records use short names, so they need to be parsed before
+ parse_long_var_name_map() changes short names to long names. */
+ if (extensions[EXT_MRSETS] != NULL)
+ parse_mrsets (r, extensions[EXT_MRSETS], dict);
+
+ if (extensions[EXT_MRSETS2] != NULL)
+ parse_mrsets (r, extensions[EXT_MRSETS2], dict);
+
+ if (extensions[EXT_LONG_STRINGS] != NULL)
+ parse_long_string_map (r, extensions[EXT_LONG_STRINGS], dict);
+
+ /* Now rename variables to their long names. */
+ parse_long_var_name_map (r, extensions[EXT_LONG_NAMES], dict);
+
+ /* The following records use long names, so they need to follow renaming. */
+ if (extensions[EXT_VAR_ATTRS] != NULL)
+ parse_variable_attributes (r, extensions[EXT_VAR_ATTRS], dict);
- if (claimed_flt64_cnt != -1 && claimed_flt64_cnt != r->flt64_cnt)
- sys_warn (r, _("File header claims %d variable positions but "
- "%d were read from file."),
- claimed_flt64_cnt, r->flt64_cnt);
+ if (extensions[EXT_LONG_LABELS] != NULL)
+ parse_long_string_value_labels (r, extensions[EXT_LONG_LABELS], dict);
+
+ /* Warn if the actual amount of data per case differs from the
+ amount that the header claims. SPSS version 13 gets this
+ wrong when very long strings are involved, so don't warn in
+ that case. */
+ if (claimed_oct_cnt != -1 && claimed_oct_cnt != n_vars
+ && info->version_major != 13)
+ sys_warn (r, -1, _("File header claims %d variable positions but "
+ "%zu were read from file."),
+ claimed_oct_cnt, n_vars);
/* Create an index of dictionary variable widths for
sfm_read_case to use. We cannot use the `struct variable's
from the dictionary we created, because the caller owns the
dictionary and may destroy or modify its variables. */
- r->var_cnt = dict_get_var_cnt (*dict);
- r->vars = pool_nalloc (r->pool, r->var_cnt, sizeof *r->vars);
- for (i = 0; i < r->var_cnt; i++)
- {
- struct variable *v = dict_get_var (*dict, i);
- struct sfm_var *sv = &r->vars[i];
- sv->width = var_get_width (v);
- sv->case_index = var_get_case_index (v);
- }
-
- pool_free (r->pool, var_by_value_idx);
- r->value_cnt = dict_get_next_value_idx (*dict);
- return casereader_create_sequential (NULL, r->value_cnt, CASENUMBER_MAX,
+ sfm_dictionary_to_sfm_vars (dict, &r->sfm_vars, &r->sfm_var_cnt);
+ pool_register (r->pool, free, r->sfm_vars);
+ r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
+
+ *dictp = dict;
+ return casereader_create_sequential
+ (NULL, r->proto,
+ r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
&sys_file_casereader_class, r);
+
+error:
+ close_reader (r);
+ dict_destroy (dict);
+ *dictp = NULL;
+ return NULL;
}
/* Closes a system file after we're done with it.
if (r->file)
{
- if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
+ if (fn_close (fh_get_file_name (r->fh), r->file) == EOF)
{
- msg (ME, _("Error closing system file \"%s\": %s."),
+ msg (ME, _("Error closing system file `%s': %s."),
fh_get_file_name (r->fh), strerror (errno));
r->error = true;
}
r->file = NULL;
}
- if (r->fh != NULL)
- fh_close (r->fh, "system file", "rs");
+ fh_unlock (r->lock);
+ fh_unref (r->fh);
error = r->error;
pool_destroy (r->pool);
/* Destroys READER. */
static void
-sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
+sys_file_casereader_destroy (struct casereader *reader UNUSED, void *r_)
{
struct sfm_reader *r = r_;
close_reader (r);
/* Returns true if FILE is an SPSS system file,
false otherwise. */
bool
-sfm_detect (FILE *file)
+sfm_detect (FILE *file)
{
char rec_type[5];
if (fread (rec_type, 4, 1, file) != 1)
return false;
rec_type[4] = '\0';
-
+
return !strcmp ("$FL2", rec_type);
}
\f
-/* Reads the global header of the system file.
- Sets DICT's file label to the system file's label.
- Sets *WEIGHT_IDX to 0 if the system file is unweighted,
- or to the value index of the weight variable otherwise.
- Sets *CLAIMED_FLT64_CNT to the number of values that the file
- claims to have (although it is not always correct).
- If INFO is non-null, initializes *INFO with header
- information. */
+/* Reads the global header of the system file. Sets *WEIGHT_IDX to 0 if the
+ system file is unweighted, or to the value index of the weight variable
+ otherwise. Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units) per
+ case that the file claims to have (although it is not always correct).
+ Initializes INFO with header information. Stores the file label as a string
+ in dictionary encoding into *FILE_LABELP. */
static void
-read_header (struct sfm_reader *r, struct dictionary *dict,
- int *weight_idx, int *claimed_flt64_cnt,
- struct sfm_read_info *info)
+read_header (struct sfm_reader *r, int *weight_idx,
+ int *claimed_oct_cnt, struct sfm_read_info *info,
+ char **file_labelp)
{
char rec_type[5];
char eye_catcher[61];
uint8_t raw_layout_code[4];
- int case_cnt;
uint8_t raw_bias[8];
char creation_date[10];
char creation_time[9];
char file_label[65];
- struct substring file_label_ss;
+ struct substring product;
read_string (r, rec_type, sizeof rec_type);
read_string (r, eye_catcher, sizeof eye_catcher);
-
+
if (strcmp ("$FL2", rec_type) != 0)
- sys_error (r, _("This is not an SPSS system file."));
+ sys_error (r, 0, _("This is not an SPSS system file."));
/* Identify integer format. */
read_bytes (r, raw_layout_code, sizeof raw_layout_code);
&r->integer_format))
|| (r->integer_format != INTEGER_MSB_FIRST
&& r->integer_format != INTEGER_LSB_FIRST))
- sys_error (r, _("This is not an SPSS system file."));
+ sys_error (r, 64, _("This is not an SPSS system file."));
- *claimed_flt64_cnt = read_int32 (r);
- if (*claimed_flt64_cnt < 0 || *claimed_flt64_cnt > INT_MAX / 16)
- *claimed_flt64_cnt = -1;
+ *claimed_oct_cnt = read_int (r);
+ if (*claimed_oct_cnt < 0 || *claimed_oct_cnt > INT_MAX / 16)
+ *claimed_oct_cnt = -1;
- r->compressed = read_int32 (r) != 0;
+ r->compressed = read_int (r) != 0;
- *weight_idx = read_int32 (r);
+ *weight_idx = read_int (r);
- case_cnt = read_int32 (r);
- if (case_cnt < -1 || case_cnt > INT_MAX / 2)
- case_cnt = -1;
+ r->case_cnt = read_int (r);
+ /* -1 is the only negative count with a meaning: sfm_open_reader() maps it
+ to CASENUMBER_MAX ("unknown"). Treat any other negative value, like an
+ implausibly large one, as unknown instead of passing it on as a case
+ count. */
+ if (r->case_cnt < -1 || r->case_cnt > INT_MAX / 2)
+ r->case_cnt = -1;
/* Identify floating-point format and obtain compression bias. */
read_bytes (r, raw_bias, sizeof raw_bias);
if (float_identify (100.0, raw_bias, sizeof raw_bias, &r->float_format) == 0)
{
- sys_warn (r, _("Compression bias (%g) is not the usual "
- "value of 100, or system file uses unrecognized "
- "floating-point format."),
- r->bias);
+ uint8_t zero_bias[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+ if (memcmp (raw_bias, zero_bias, 8))
+ sys_warn (r, r->pos - 8,
+ _("Compression bias is not the usual "
+ "value of 100, or system file uses unrecognized "
+ "floating-point format."));
+ else
+ {
+ /* Some software is known to write all-zeros to this
+ field. Such software also writes floating-point
+ numbers in the format that we expect by default
+ (it seems that all software most likely does, in
+ reality), so don't warn in this case. */
+ }
+
if (r->integer_format == INTEGER_MSB_FIRST)
r->float_format = FLOAT_IEEE_DOUBLE_BE;
else
read_string (r, creation_time, sizeof creation_time);
read_string (r, file_label, sizeof file_label);
skip_bytes (r, 3);
-
- file_label_ss = ss_cstr (file_label);
- ss_trim (&file_label_ss, ss_cstr (" "));
- if (!ss_is_empty (file_label_ss))
+
+ strcpy (info->creation_date, creation_date);
+ strcpy (info->creation_time, creation_time);
+ info->integer_format = r->integer_format;
+ info->float_format = r->float_format;
+ info->compressed = r->compressed;
+ info->case_cnt = r->case_cnt;
+
+ product = ss_cstr (eye_catcher);
+ ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
+ ss_trim (&product, ss_cstr (" "));
+ str_copy_buf_trunc (info->product, sizeof info->product,
+ ss_data (product), ss_length (product));
+
+ *file_labelp = pool_strdup0 (r->pool, file_label, sizeof file_label - 1);
+}
+
+/* Reads a variable (type 2) record from R into RECORD.
+
+ RECORD is zeroed first, so anything the file does not supply (the label
+ when the label indicator is 0, unused missing value bytes) stays zero or
+ NULL. The label, if present, is truncated to MAX_LABEL_LEN bytes and
+ allocated from R's pool, so it is released when the reader's pool is
+ destroyed, including when a later read or validation error abandons the
+ record. Invalid indicator fields are reported with sys_error(), which
+ does not return. */
+static void
+read_variable_record (struct sfm_reader *r, struct sfm_var_record *record)
+{
+ int has_variable_label;
+
+ memset (record, 0, sizeof *record);
+
+ record->pos = r->pos;
+ record->width = read_int (r);
+ has_variable_label = read_int (r);
+ record->missing_value_code = read_int (r);
+ record->print_format = read_int (r);
+ record->write_format = read_int (r);
+ read_bytes (r, record->name, sizeof record->name);
+
+ if (has_variable_label == 1)
{
- ss_data (file_label_ss)[ss_length (file_label_ss)] = '\0';
- dict_set_label (dict, ss_data (file_label_ss));
+ enum { MAX_LABEL_LEN = 255 };
+ size_t len, read_len;
+
+ len = read_int (r);
+
+ /* Read up to MAX_LABEL_LEN bytes of label. The buffer comes from the
+ reader's pool rather than xmalloc(): nothing in the reading code frees
+ the label, and the error paths below leave this function without
+ returning, so an xmalloc()'d buffer would be leaked. */
+ read_len = MIN (MAX_LABEL_LEN, len);
+ record->label = pool_malloc (r->pool, read_len + 1);
+ read_string (r, record->label, read_len + 1);
+
+ /* Skip unread label bytes. */
+ skip_bytes (r, len - read_len);
+
+ /* Skip label padding up to multiple of 4 bytes. */
+ skip_bytes (r, ROUND_UP (len, 4) - len);
}
+ else if (has_variable_label != 0)
+ sys_error (r, record->pos,
+ _("Variable label indicator field is not 0 or 1."));
- if (info)
+ /* Set missing values. The indicator's magnitude is the number of 8-byte
+ values that follow; it is validated before being used as a read size so
+ that at most sizeof record->missing bytes are stored. */
+ if (record->missing_value_code != 0)
{
- struct substring product;
-
- strcpy (info->creation_date, creation_date);
- strcpy (info->creation_time, creation_time);
- info->integer_format = r->integer_format;
- info->float_format = r->float_format;
- info->compressed = r->compressed;
- info->case_cnt = case_cnt;
-
- product = ss_cstr (eye_catcher);
- ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
- ss_trim (&product, ss_cstr (" "));
- str_copy_buf_trunc (info->product, sizeof info->product,
- ss_data (product), ss_length (product));
+ int code = record->missing_value_code;
+ if (record->width == 0)
+ {
+ if (code < -3 || code > 3 || code == -1)
+ sys_error (r, record->pos,
+ _("Numeric missing value indicator field is not "
+ "-3, -2, 0, 1, 2, or 3."));
+ }
+ else
+ {
+ if (code < 1 || code > 3)
+ sys_error (r, record->pos,
+ _("String missing value indicator field is not "
+ "0, 1, 2, or 3."));
+ }
+
+ read_bytes (r, record->missing, 8 * abs (code));
}
}
+/* Reads a value label (type 3) record from R, plus the variable index
+   (type 4) record that must come directly after it, and stores both in
+   RECORD.  N_VARS is the number of variable records read so far; the type 4
+   record may name at most that many variables.  All storage comes from R's
+   pool.  Problems are reported with sys_error(), which does not return. */
+static void
+read_value_label_record (struct sfm_reader *r,
+                         struct sfm_value_label_record *record,
+                         size_t n_vars)
+{
+  size_t idx;
+  int next_type;
+
+  /* Type 3 record: a count, then that many (value, label) pairs. */
+  record->pos = r->pos;
+  record->n_labels = read_int (r);
+  if (record->n_labels > SIZE_MAX / sizeof *record->labels)
+    sys_error (r, r->pos - 4, _("Invalid number of labels %zu."),
+               record->n_labels);
+  record->labels = pool_nmalloc (r->pool, record->n_labels,
+                                 sizeof *record->labels);
+
+  idx = 0;
+  while (idx < record->n_labels)
+    {
+      struct sfm_value_label *pair = &record->labels[idx++];
+      unsigned char len_byte;
+      size_t n_padded;
+
+      /* The raw 8-byte value. */
+      read_bytes (r, pair->value, sizeof pair->value);
+
+      /* A one-byte label length.  The length byte and the label together
+         are padded to a multiple of 8 bytes. */
+      read_bytes (r, &len_byte, sizeof len_byte);
+      n_padded = ROUND_UP (len_byte + 1, 8);
+
+      /* The label and its padding; terminate at the label's real length. */
+      pair->label = pool_malloc (r->pool, n_padded + 1);
+      read_bytes (r, pair->label, n_padded - 1);
+      pair->label[len_byte] = '\0';
+    }
+
+  /* The next record must be the type 4 record. */
+  next_type = read_int (r);
+  if (next_type != 4)
+    sys_error (r, r->pos - 4,
+               _("Variable index record (type 4) does not immediately "
+                 "follow value label record (type 3) as it should."));
+
+  /* Type 4 record: a count, then that many variable indexes. */
+  record->n_vars = read_int (r);
+  if (record->n_vars < 1 || record->n_vars > n_vars)
+    sys_error (r, r->pos - 4,
+               _("Number of variables associated with a value label (%zu) "
+                 "is not between 1 and the number of variables (%zu)."),
+               record->n_vars, n_vars);
+  record->vars = pool_nmalloc (r->pool, record->n_vars, sizeof *record->vars);
+  for (idx = 0; idx != record->n_vars; idx++)
+    record->vars[idx] = read_int (r);
+}
+
+/* Reads a document record from R and returns it.  The record and its text
+   are allocated from R's pool.  A line count that is not positive, or that
+   is too large for the total byte count to be computed as an int, is
+   reported through sys_error. */
+static struct sfm_document_record *
+read_document_record (struct sfm_reader *r)
+{
+  struct sfm_document_record *doc = pool_malloc (r->pool, sizeof *doc);
+  int n_lines;
+
+  doc->pos = r->pos;
+
+  n_lines = read_int (r);
+  if (!(n_lines > 0 && n_lines < INT_MAX / DOC_LINE_LENGTH))
+    sys_error (r, doc->pos,
+               _("Number of document lines (%d) "
+                 "must be greater than 0 and less than %d."),
+               n_lines, INT_MAX / DOC_LINE_LENGTH);
+
+  /* The text is N_LINES fixed-size lines of DOC_LINE_LENGTH bytes each. */
+  doc->n_lines = n_lines;
+  doc->documents = pool_malloc (r->pool, DOC_LINE_LENGTH * n_lines);
+  read_bytes (r, doc->documents, DOC_LINE_LENGTH * n_lines);
+
+  return doc;
+}
+
+/* Reads the size and count fields of an extension (type 7) record of the
+   given SUBTYPE from R into RECORD, also recording the position in R at
+   which they start.  Reports an error if the record's data would be too
+   large. */
+static void
+read_extension_record_header (struct sfm_reader *r, int subtype,
+                              struct sfm_extension_record *record)
+{
+  record->pos = r->pos;
+  record->size = read_int (r);
+  record->count = read_int (r);
+
+  /* A zero element size can never overflow. */
+  if (record->size == 0)
+    return;
+
+  /* Make sure that SIZE * COUNT + 1 fits.  The extra byte leaves room for
+     the null terminator that some extension processing routines append. */
+  if (size_overflow_p (xsum (1, xtimes (record->count, record->size))))
+    sys_error (r, record->pos, "Record type 7 subtype %d too large.", subtype);
+}
+
+/* Reads the extension (type 7) record with the given SUBTYPE from R.
+
+   If SUBTYPE is one of the implemented record types and its size and count
+   fields are acceptable, returns a pool-allocated record whose data is
+   followed by a null byte.  Otherwise the record's data is skipped and NULL
+   is returned; a warning is issued unless SUBTYPE is one of the record
+   types that are deliberately ignored. */
+static struct sfm_extension_record *
+read_extension_record (struct sfm_reader *r, int subtype)
+{
+  /* Expected element size and element count for a subtype.  A zero means
+     "anything goes" for that field; both zero means "ignore the record". */
+  struct extension_record_type
+    {
+      int subtype;
+      int size;
+      int count;
+    };
+
+  static const struct extension_record_type types[] =
+    {
+      /* Implemented record types. */
+      { EXT_INTEGER,      4, 8 },
+      { EXT_FLOAT,        8, 3 },
+      { EXT_MRSETS,       1, 0 },
+      { EXT_DISPLAY,      4, 0 },
+      { EXT_LONG_NAMES,   1, 0 },
+      { EXT_LONG_STRINGS, 1, 0 },
+      { EXT_NCASES,       8, 2 },
+      { EXT_FILE_ATTRS,   1, 0 },
+      { EXT_VAR_ATTRS,    1, 0 },
+      { EXT_MRSETS2,      1, 0 },
+      { EXT_ENCODING,     1, 0 },
+      { EXT_LONG_LABELS,  1, 0 },
+
+      /* Ignored record types. */
+      { EXT_VAR_SETS,     0, 0 },
+      { EXT_DATE,         0, 0 },
+      { EXT_DATA_ENTRY,   0, 0 },
+    };
+
+  const size_t n_types = sizeof types / sizeof *types;
+  const struct extension_record_type *match = NULL;
+  struct sfm_extension_record *record;
+  size_t n_bytes;
+  size_t k;
+
+  record = pool_malloc (r->pool, sizeof *record);
+  read_extension_record_header (r, subtype, record);
+  n_bytes = record->count * record->size;
+
+  /* Look up the first table entry for SUBTYPE. */
+  for (k = 0; k < n_types; k++)
+    if (types[k].subtype == subtype)
+      {
+        match = &types[k];
+        break;
+      }
+
+  if (match == NULL)
+    sys_warn (r, record->pos,
+              _("Unrecognized record type 7, subtype %d.  Please send a "
+                "copy of this file, and the syntax which created it to %s."),
+              subtype, PACKAGE_BUGREPORT);
+  else if (match->size > 0 && record->size != match->size)
+    sys_warn (r, record->pos,
+              _("Record type 7, subtype %d has bad size %zu "
+                "(expected %d)."), subtype, record->size, match->size);
+  else if (match->count > 0 && record->count != match->count)
+    sys_warn (r, record->pos,
+              _("Record type 7, subtype %d has bad count %zu "
+                "(expected %d)."), subtype, record->count, match->count);
+  else if (match->count != 0 || match->size != 0)
+    {
+      /* Implemented and well-formed: keep the data, null-terminated. */
+      char *data = pool_malloc (r->pool, n_bytes + 1);
+      data[n_bytes] = '\0';
+
+      record->data = data;
+      read_bytes (r, record->data, n_bytes);
+      return record;
+    }
+
+  /* Unrecognized, malformed, or ignored: step over the data. */
+  skip_bytes (r, n_bytes);
+  return NULL;
+}
+
+/* Reads the header of the extension (type 7) record with the given SUBTYPE
+   from R and then skips over the record's data without keeping it. */
+static void
+skip_extension_record (struct sfm_reader *r, int subtype)
+{
+  struct sfm_extension_record header;
+
+  read_extension_record_header (r, subtype, &header);
+  skip_bytes (r, header.count * header.size);
+}
+
+/* Recodes FILE_LABEL with recode_string_pool (allocating from R's pool),
+   strips its trailing spaces, and makes the result DICT's label. */
+static void
+parse_file_label (struct sfm_reader *r, const char *file_label,
+                  struct dictionary *dict)
+{
+  char *label = recode_string_pool ("UTF-8", dict_get_encoding (dict),
+                                    file_label, -1, r->pool);
+  size_t n;
+
+  /* Back up over any run of spaces at the end. */
+  for (n = strlen (label); n > 0 && label[n - 1] == ' '; n--)
+    continue;
+  label[n] = '\0';
+
+  dict_set_label (dict, label);
+}
+
/* Reads a variable (type 2) record from R and adds the
corresponding variable to DICT.
Also skips past additional variable records for long string
variables. */
static void
-read_variable_record (struct sfm_reader *r, struct dictionary *dict,
- int *format_warning_cnt)
+parse_variable_records (struct sfm_reader *r, struct dictionary *dict,
+ struct sfm_var_record *var_recs, size_t n_var_recs)
{
- int width;
- int has_variable_label;
- int missing_value_code;
- int print_format;
- int write_format;
- char name[9];
+ const char *dict_encoding = dict_get_encoding (dict);
+ struct sfm_var_record *rec;
+ int n_warnings = 0;
- struct variable *var;
- int nv;
-
- width = read_int32 (r);
- has_variable_label = read_int32 (r);
- missing_value_code = read_int32 (r);
- print_format = read_int32 (r);
- write_format = read_int32 (r);
- read_string (r, name, sizeof name);
- name[strcspn (name, " ")] = '\0';
-
- /* Check variable name. */
- if (name[0] == '$' || name[0] == '#')
- sys_error (r, "Variable name begins with invalid character `%c'.",
- name[0]);
- if (!var_is_plausible_name (name, false))
- sys_error (r, _("Invalid variable name `%s'."), name);
-
- /* Create variable. */
- if (width < 0 || width > 255)
- sys_error (r, _("Bad variable width %d."), width);
- var = dict_create_var (dict, name, width);
- if (var == NULL)
- sys_error (r,
- _("Duplicate variable name `%s' within system file."),
- name);
-
- /* Set the short name the same as the long name */
- var_set_short_name (var, var_get_name (var));
-
- /* Get variable label, if any. */
- if (has_variable_label != 0 && has_variable_label != 1)
- sys_error (r, _("Variable label indicator field is not 0 or 1."));
- if (has_variable_label == 1)
+ for (rec = var_recs; rec < &var_recs[n_var_recs]; )
{
- size_t len;
- char label[255 + 1];
-
- len = read_int32 (r);
- if (len >= sizeof label)
- sys_error (r, _("Variable %s has label of invalid length %u."),
- name, (unsigned int) len);
- read_string (r, label, len + 1);
- var_set_label (var, label);
-
- skip_bytes (r, ROUND_UP (len, 4) - len);
- }
+ struct variable *var;
+ size_t n_values;
+ char *name;
+ size_t i;
- /* Set missing values. */
- if (missing_value_code < -3 || missing_value_code > 3
- || missing_value_code == -1)
- sys_error (r, _("Missing value indicator field is not "
- "-3, -2, 0, 1, 2, or 3."));
- if (missing_value_code != 0)
- {
- struct missing_values mv;
- mv_init (&mv, var_get_width (var));
- if (var_is_numeric (var))
+ /* Recode the record's name field (length 8 is passed) and cut the
+ result at its first space. */
+ name = recode_string_pool ("UTF-8", dict_encoding,
+ rec->name, 8, r->pool);
+ name[strcspn (name, " ")] = '\0';
+
+ if (!dict_id_is_valid (dict, name, false)
+ || name[0] == '$' || name[0] == '#')
+ sys_error (r, rec->pos, _("Invalid variable name `%s'."), name);
+
+ if (rec->width < 0 || rec->width > 255)
+ sys_error (r, rec->pos,
+ _("Bad width %d for variable %s."), rec->width, name);
+
+ var = rec->var = dict_create_var (dict, name, rec->width);
+ if (var == NULL)
+ sys_error (r, rec->pos, _("Duplicate variable name `%s'."), name);
+
+ /* Set the short name the same as the long name. */
+ var_set_short_name (var, 0, name);
+
+ /* Get variable label, if any. */
+ if (rec->label)
+ {
+ char *utf8_label;
+
+ utf8_label = recode_string_pool ("UTF-8", dict_encoding,
+ rec->label, -1, r->pool);
+ var_set_label (var, utf8_label, false);
+ }
+
+ /* Set missing values. */
+ if (rec->missing_value_code != 0)
{
- if (missing_value_code > 0)
+ int width = var_get_width (var);
+ struct missing_values mv;
+
+ mv_init_pool (r->pool, &mv, width);
+ if (var_is_numeric (var))
{
- int i;
- for (i = 0; i < missing_value_code; i++)
- mv_add_num (&mv, read_flt64 (r));
+ /* A negative code means that the data starts with a (low, high)
+ range occupying 16 bytes; code -3 adds one discrete value after
+ the range.  A positive code is the number of discrete
+ values. */
+ bool has_range = rec->missing_value_code < 0;
+ int n_discrete = (has_range
+ ? rec->missing_value_code == -3
+ : rec->missing_value_code);
+ int ofs = 0;
+
+ if (has_range)
+ {
+ double low = parse_float (r, rec->missing, 0);
+ double high = parse_float (r, rec->missing, 8);
+ mv_add_range (&mv, low, high);
+ ofs += 16;
+ }
+
+ for (i = 0; i < n_discrete; i++)
+ {
+ mv_add_num (&mv, parse_float (r, rec->missing, ofs));
+ ofs += 8;
+ }
}
else
{
- double low = read_flt64 (r);
- double high = read_flt64 (r);
- mv_add_num_range (&mv, low, high);
- if (missing_value_code == -3)
- mv_add_num (&mv, read_flt64 (r));
- }
- }
- else if (var_get_width (var) <= MAX_SHORT_STRING)
- {
- if (missing_value_code > 0)
- {
- int i;
- for (i = 0; i < missing_value_code; i++)
+ union value value;
+
+ value_init_pool (r->pool, &value, width);
+ value_set_missing (&value, width);
+ for (i = 0; i < rec->missing_value_code; i++)
{
- char string[9];
- read_string (r, string, sizeof string);
- mv_add_str (&mv, string);
+ uint8_t *s = value_str_rw (&value, width);
+ memcpy (s, rec->missing + 8 * i, MIN (width, 8));
+ mv_add_str (&mv, s);
}
}
- else
- sys_error (r, _("String variable %s may not have missing "
- "values specified as a range."),
- name);
+ var_set_missing_values (var, &mv);
}
- else /* var->width > MAX_SHORT_STRING */
- sys_error (r, _("Long string variable %s may not have missing "
- "values."),
- name);
- var_set_missing_values (var, &mv);
- }
-
- /* Set formats. */
- parse_format_spec (r, print_format, PRINT_FORMAT, var, format_warning_cnt);
- parse_format_spec (r, write_format, WRITE_FORMAT, var, format_warning_cnt);
-
- /* Account for values.
- Skip long string continuation records, if any. */
- nv = width == 0 ? 1 : DIV_RND_UP (width, 8);
- r->flt64_cnt += nv;
- if (width > 8)
- {
- int i;
- for (i = 1; i < nv; i++)
- {
- /* Check for record type 2 and width -1. */
- if (read_int32 (r) != 2 || read_int32 (r) != -1)
- sys_error (r, _("Missing string continuation record."));
-
- /* Skip and ignore remaining continuation data. */
- has_variable_label = read_int32 (r);
- missing_value_code = read_int32 (r);
- print_format = read_int32 (r);
- write_format = read_int32 (r);
- read_string (r, name, sizeof name);
-
- /* Variable label fields on continuation records have
- been spotted in system files created by "SPSS Power
- Macintosh Release 6.1". */
- if (has_variable_label)
- skip_bytes (r, ROUND_UP (read_int32 (r), 4));
- }
+ /* Set formats. */
+ parse_format_spec (r, rec->pos + 12, rec->print_format,
+ PRINT_FORMAT, var, &n_warnings);
+ parse_format_spec (r, rec->pos + 16, rec->write_format,
+ WRITE_FORMAT, var, &n_warnings);
+
+ /* Account for values.
+ Skip long string continuation records, if any. */
+ n_values = rec->width == 0 ? 1 : DIV_RND_UP (rec->width, 8);
+ /* Every 8 bytes of width beyond the first must be backed by a following
+ record, within the array, whose width is -1. */
+ for (i = 1; i < n_values; i++)
+ if (i + (rec - var_recs) >= n_var_recs || rec[i].width != -1)
+ sys_error (r, rec->pos, _("Missing string continuation record."));
+ rec += n_values;
}
}
/* Translates the format spec from sysfile format to internal
format. */
static void
-parse_format_spec (struct sfm_reader *r, uint32_t s,
+parse_format_spec (struct sfm_reader *r, off_t pos, unsigned int format,
enum which_format which, struct variable *v,
- int *format_warning_cnt)
+ int *n_warnings)
{
- const int max_format_warnings = 8;
+ const int max_warnings = 8;
+ /* FORMAT packs three bytes: the raw type code in bits 16-23, the value
+ stored in f.w in bits 8-15, and the value stored in f.d in bits 0-7. */
+ uint8_t raw_type = format >> 16;
+ uint8_t w = format >> 8;
+ uint8_t d = format;
struct fmt_spec f;
- uint8_t raw_type = s >> 16;
- uint8_t w = s >> 8;
- uint8_t d = s;
-
bool ok;
-
- if (!fmt_from_io (raw_type, &f.type))
- sys_error (r, _("Unknown variable format %d."), (int) raw_type);
+
f.w = w;
f.d = d;
msg_disable ();
- ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
+ ok = (fmt_from_io (raw_type, &f.type)
+ && fmt_check_output (&f)
+ && fmt_check_width_compat (&f, var_get_width (v)));
msg_enable ();
-
- if (ok)
+
+ if (ok)
{
if (which == PRINT_FORMAT)
var_set_print_format (v, &f);
else
var_set_write_format (v, &f);
}
- else if (*++format_warning_cnt <= max_format_warnings)
+ else if (format == 0)
{
- char fmt_string[FMT_STRING_LEN_MAX + 1];
- sys_warn (r, _("%s variable %s has invalid %s format %s."),
- var_is_numeric (v) ? _("Numeric") : _("String"),
- var_get_name (v),
- which == PRINT_FORMAT ? _("print") : _("write"),
- fmt_to_string (&f, fmt_string));
-
- if (*format_warning_cnt == max_format_warnings)
- sys_warn (r, _("Suppressing further invalid format warnings."));
+ /* Actually observed in the wild. No point in warning about it. */
}
-}
-
-/* Sets the weighting variable in DICT to the variable
- corresponding to the given 1-based VALUE_IDX, if VALUE_IDX is
- nonzero. */
-static void
-setup_weight (struct sfm_reader *r, int weight_idx,
- struct variable **var_by_value_idx, struct dictionary *dict)
-{
- if (weight_idx != 0)
+ /* *N_WARNINGS counts invalid formats across calls that share it; only the
+ first max_warnings of them are reported, and the last of those is
+ followed by a notice that further ones are suppressed. */
+ else if (++*n_warnings <= max_warnings)
{
- struct variable *weight_var
- = lookup_var_by_value_idx (r, var_by_value_idx, weight_idx);
- if (var_is_numeric (weight_var))
- dict_set_weight (dict, weight_var);
+ if (which == PRINT_FORMAT)
+ sys_warn (r, pos, _("Variable %s with width %d has invalid print "
+ "format 0x%x."),
+ var_get_name (v), var_get_width (v), format);
else
- sys_error (r, _("Weighting variable must be numeric."));
- }
-}
-
-/* Reads a document record, type 6, from system file R, and sets up
- the documents and n_documents fields in the associated
- dictionary. */
-static void
-read_documents (struct sfm_reader *r, struct dictionary *dict)
-{
- int line_cnt;
- char *documents;
-
- if (dict_get_documents (dict) != NULL)
- sys_error (r, _("Multiple type 6 (document) records."));
+ sys_warn (r, pos, _("Variable %s with width %d has invalid write "
+ "format 0x%x."),
+ var_get_name (v), var_get_width (v), format);
- line_cnt = read_int32 (r);
- if (line_cnt <= 0)
- sys_error (r, _("Number of document lines (%d) "
- "must be greater than 0."), line_cnt);
-
- documents = pool_nmalloc (r->pool, line_cnt + 1, DOC_LINE_LENGTH);
- read_string (r, documents, DOC_LINE_LENGTH * line_cnt + 1);
- if (strlen (documents) == DOC_LINE_LENGTH * line_cnt)
- dict_set_documents (dict, documents);
- else
- sys_error (r, _("Document line contains null byte."));
- pool_free (r->pool, documents);
+ if (*n_warnings == max_warnings)
+ sys_warn (r, -1, _("Suppressing further invalid format warnings."));
+ }
}
-/* Read a type 7 extension record. */
+/* Adds the text of RECORD to DICT's documents, one DOC_LINE_LENGTH-byte line
+ at a time.  Each line is passed through recode_substring_pool and has its
+ trailing spaces removed before it is added. */
static void
-read_extension_record (struct sfm_reader *r, struct dictionary *dict)
+parse_document (struct dictionary *dict, struct sfm_document_record *record)
{
- int subtype = read_int32 (r);
- size_t size = read_int32 (r);
- size_t count = read_int32 (r);
- size_t bytes = size * count;
-
- /* Check that SIZE * COUNT + 1 doesn't overflow. Adding 1
- allows an extra byte for a null terminator, used by some
- extension processing routines. */
- if (size != 0 && size_overflow_p (xsum (1, xtimes (count, size))))
- sys_error (r, "Record type 7 subtype %d too large.", subtype);
+ const char *p;
- switch (subtype)
+ for (p = record->documents;
+ p < record->documents + DOC_LINE_LENGTH * record->n_lines;
+ p += DOC_LINE_LENGTH)
{
- case 3:
- read_machine_int32_info (r, size, count);
- return;
+ struct substring line;
- case 4:
- read_machine_flt64_info (r, size, count);
- return;
+ line = recode_substring_pool ("UTF-8", dict_get_encoding (dict),
+ ss_buffer (p, DOC_LINE_LENGTH), NULL);
+ ss_rtrim (&line, ss_cstr (" "));
+ line.string[line.length] = '\0';
- case 5:
- /* Variable sets information. We don't use these yet.
- They only apply to GUIs; see VARSETS on the APPLY
- DICTIONARY command in SPSS documentation. */
- break;
+ dict_add_document_line (dict, line.string, false);
- case 6:
- /* DATE variable information. We don't use it yet, but we
- should. */
- break;
-
- case 7:
- /* Unknown purpose. */
- break;
-
- case 11:
- read_display_parameters (r, size, count, dict);
- return;
-
- case 13:
- read_long_var_name_map (r, size, count, dict);
- return;
-
- case 14:
- read_long_string_map (r, size, count, dict);
- return;
-
- case 16:
- /* New in SPSS v14? Unknown purpose. */
- break;
-
- case 17:
- /* Text field that defines variable attributes. New in
- SPSS 14. */
- break;
-
- default:
- sys_warn (r, _("Unrecognized record type 7, subtype %d."), subtype);
- break;
+ ss_dealloc (&line);
}
-
- skip_bytes (r, bytes);
}
-/* Read record type 7, subtype 3. */
+/* Parses record type 7, subtype 3 (RECORD).
+
+ Saves the three version numbers found at offsets 0, 4 and 8 into INFO, then
+ compares the floating-point representation code (offset 16) and the
+ integer representation code (offset 24) against the formats already
+ recorded in R.  A floating-point mismatch is an error; an integer mismatch
+ only produces a warning. */
static void
-read_machine_int32_info (struct sfm_reader *r, size_t size, size_t count)
+parse_machine_integer_info (struct sfm_reader *r,
+ const struct sfm_extension_record *record,
+ struct sfm_read_info *info)
{
- int version_major UNUSED = read_int32 (r);
- int version_minor UNUSED = read_int32 (r);
- int version_revision UNUSED = read_int32 (r);
- int machine_code UNUSED = read_int32 (r);
- int float_representation = read_int32 (r);
- int compression_code UNUSED = read_int32 (r);
- int integer_representation = read_int32 (r);
- int character_code UNUSED = read_int32 (r);
-
- int expected_float_format;
- int expected_integer_format;
-
- if (size != 4 || count != 8)
- sys_error (r, _("Bad size (%u) or count (%u) field on record type 7, "
- "subtype 3."),
- (unsigned int) size, (unsigned int) count);
+ int float_representation, expected_float_format;
+ int integer_representation, expected_integer_format;
+
+ /* Save version info. */
+ info->version_major = parse_int (r, record->data, 0);
+ info->version_minor = parse_int (r, record->data, 4);
+ info->version_revision = parse_int (r, record->data, 8);
/* Check floating point format. */
+ float_representation = parse_int (r, record->data, 16);
if (r->float_format == FLOAT_IEEE_DOUBLE_BE
|| r->float_format == FLOAT_IEEE_DOUBLE_LE)
expected_float_format = 1;
else
NOT_REACHED ();
if (float_representation != expected_float_format)
- sys_error (r, _("Floating-point representation indicated by "
- "system file (%d) differs from expected (%d)."),
- r->float_format, expected_float_format);
+ sys_error (r, record->pos, _("Floating-point representation indicated by "
+ "system file (%d) differs from expected (%d)."),
+ float_representation, expected_float_format);
/* Check integer format. */
+ integer_representation = parse_int (r, record->data, 24);
if (r->integer_format == INTEGER_MSB_FIRST)
expected_integer_format = 1;
else if (r->integer_format == INTEGER_LSB_FIRST)
expected_integer_format = 2;
else
NOT_REACHED ();
if (integer_representation != expected_integer_format)
+ sys_warn (r, record->pos,
+ _("Integer format indicated by system file (%d) "
+ "differs from expected (%d)."),
+ integer_representation, expected_integer_format);
+
+}
+
+/* Returns the name of the character encoding to assume for the dictionary.
+
+ If EXT_ENCODING is nonnull, its data is returned as the encoding name.
+ Otherwise, if EXT_INTEGER is nonnull, the integer at byte offset 7 * 4 of
+ its data is treated as a character code and mapped to an encoding name
+ where possible.  When neither yields a name, the result of
+ locale_charset () is returned. */
+static const char *
+choose_encoding (struct sfm_reader *r,
+ const struct sfm_extension_record *ext_integer,
+ const struct sfm_extension_record *ext_encoding)
+{
+ /* The EXT_ENCODING record is a more reliable way to determine dictionary
+ encoding. */
+ if (ext_encoding)
+ return ext_encoding->data;
+
+ /* But EXT_INTEGER is better than nothing as a fallback. */
+ if (ext_integer)
{
- static const char *endian[] = {N_("little-endian"), N_("big-endian")};
- sys_warn (r, _("Integer format indicated by system file (%s) "
- "differs from expected (%s)."),
- gettext (endian[integer_representation == 1]),
- gettext (endian[expected_integer_format == 1]));
+ int codepage = parse_int (r, ext_integer->data, 7 * 4);
+ const char *encoding;
+
+ switch (codepage)
+ {
+ case 1:
+ return "EBCDIC-US";
+
+ case 2:
+ case 3:
+ /* These ostensibly mean "7-bit ASCII" and "8-bit ASCII"[sic]
+ respectively. However, there are known to be many files in the wild
+ with character code 2, yet have data which are clearly not ASCII.
+ Therefore we ignore these values. */
+ break;
+
+ case 4:
+ return "MS_KANJI";
+
+ default:
+ encoding = sys_get_encoding_from_codepage (codepage);
+ if (encoding != NULL)
+ return encoding;
+ break;
+ }
}
+
+ return locale_charset ();
}
-/* Read record type 7, subtype 4. */
+/* Parses record type 7, subtype 4. */
static void
-read_machine_flt64_info (struct sfm_reader *r, size_t size, size_t count)
+parse_machine_float_info (struct sfm_reader *r,
+ const struct sfm_extension_record *record)
{
- double sysmis = read_flt64 (r);
- double highest = read_flt64 (r);
- double lowest = read_flt64 (r);
-
- if (size != 8 || count != 3)
- sys_error (r, _("Bad size (%u) or count (%u) on extension 4."),
- (unsigned int) size, (unsigned int) count);
+ /* The record's data holds three floating-point values at byte offsets 0, 8
+ and 16.  Each is compared with the corresponding constant (SYSMIS,
+ HIGHEST, LOWEST); a difference is only warned about, nothing is
+ stored. */
+ double sysmis = parse_float (r, record->data, 0);
+ double highest = parse_float (r, record->data, 8);
+ double lowest = parse_float (r, record->data, 16);
if (sysmis != SYSMIS)
- sys_warn (r, _("File specifies unexpected value %g as SYSMIS."), sysmis);
+ sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
+ sysmis, "SYSMIS");
+
if (highest != HIGHEST)
- sys_warn (r, _("File specifies unexpected value %g as HIGHEST."), highest);
+ sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
+ highest, "HIGHEST");
+
if (lowest != LOWEST)
- sys_warn (r, _("File specifies unexpected value %g as LOWEST."), lowest);
+ sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
+ lowest, "LOWEST");
+}
+
+/* Parses record type 7, subtype 7 or 19 (RECORD), which describes multiple
+   response sets, and adds each acceptable set to DICT.
+
+   Parsing stops at the end of the record's text or at the first set whose
+   header cannot be parsed.  A set that ends up with fewer than two usable
+   variables is dropped with a warning and parsing continues with the next
+   set.  Variable names that are not in DICT are silently ignored. */
+static void
+parse_mrsets (struct sfm_reader *r, const struct sfm_extension_record *record,
+              struct dictionary *dict)
+{
+  struct text_record *text;
+  struct mrset *mrset;
+
+  text = open_text_record (r, record, false);
+  for (;;)
+    {
+      const char *counted = NULL;
+      const char *name;
+      const char *label;
+      struct stringi_set var_names;
+      size_t allocated_vars;
+      char delimiter;
+      int width;
+
+      /* Allocated before anything can fail so that every `break' below can
+         rely on the mrset_destroy call that follows the loop. */
+      mrset = xzalloc (sizeof *mrset);
+
+      name = text_get_token (text, ss_cstr ("="), NULL);
+      if (name == NULL)
+        break;
+      mrset->name = recode_string ("UTF-8", r->encoding, name, -1);
+
+      if (mrset->name[0] != '$')
+        {
+          sys_warn (r, record->pos,
+                    _("`%s' does not begin with `$' at offset %zu "
+                      "in MRSETS record."), mrset->name, text_pos (text));
+          break;
+        }
+
+      if (text_match (text, 'C'))
+        {
+          mrset->type = MRSET_MC;
+          if (!text_match (text, ' '))
+            {
+              sys_warn (r, record->pos,
+                        _("Missing space following `%c' at offset %zu "
+                          "in MRSETS record."), 'C', text_pos (text));
+              break;
+            }
+        }
+      else if (text_match (text, 'D'))
+        {
+          mrset->type = MRSET_MD;
+          mrset->cat_source = MRSET_VARLABELS;
+        }
+      else if (text_match (text, 'E'))
+        {
+          const char *number;
+
+          mrset->type = MRSET_MD;
+          mrset->cat_source = MRSET_COUNTEDVALUES;
+          if (!text_match (text, ' '))
+            {
+              sys_warn (r, record->pos,
+                        _("Missing space following `%c' at offset %zu "
+                          "in MRSETS record."), 'E', text_pos (text));
+              break;
+            }
+
+          /* text_get_token returns NULL when no token is available, so the
+             result must be checked before it is handed to strcmp. */
+          number = text_get_token (text, ss_cstr (" "), NULL);
+          if (number == NULL)
+            sys_warn (r, record->pos,
+                      _("Missing label source value following `E' "
+                        "at offset %zu in MRSETS record."),
+                      text_pos (text));
+          else if (!strcmp (number, "11"))
+            mrset->label_from_var_label = true;
+          else if (strcmp (number, "1"))
+            sys_warn (r, record->pos,
+                      _("Unexpected label source value `%s' following `E' "
+                        "at offset %zu in MRSETS record."),
+                      number, text_pos (text));
+        }
+      else
+        {
+          sys_warn (r, record->pos,
+                    _("Missing `C', `D', or `E' at offset %zu "
+                      "in MRSETS record."),
+                    text_pos (text));
+          break;
+        }
+
+      if (mrset->type == MRSET_MD)
+        {
+          counted = text_parse_counted_string (r, text);
+          if (counted == NULL)
+            break;
+        }
+
+      label = text_parse_counted_string (r, text);
+      if (label == NULL)
+        break;
+      if (label[0] != '\0')
+        mrset->label = recode_string ("UTF-8", r->encoding, label, -1);
+
+      /* From here on VAR_NAMES is live and must be destroyed on every path
+         that leaves this iteration. */
+      stringi_set_init (&var_names);
+      allocated_vars = 0;
+      width = INT_MAX;
+      do
+        {
+          const char *raw_var_name;
+          struct variable *var;
+          char *var_name;
+
+          raw_var_name = text_get_token (text, ss_cstr (" \n"), &delimiter);
+          if (raw_var_name == NULL)
+            {
+              sys_warn (r, record->pos,
+                        _("Missing new-line parsing variable names "
+                          "at offset %zu in MRSETS record."),
+                        text_pos (text));
+              break;
+            }
+          var_name = recode_string ("UTF-8", r->encoding, raw_var_name, -1);
+
+          var = dict_lookup_var (dict, var_name);
+          if (var == NULL)
+            {
+              free (var_name);
+              continue;
+            }
+          if (!stringi_set_insert (&var_names, var_name))
+            {
+              sys_warn (r, record->pos,
+                        _("Duplicate variable name %s "
+                          "at offset %zu in MRSETS record."),
+                        var_name, text_pos (text));
+              free (var_name);
+              continue;
+            }
+          free (var_name);
+
+          if (mrset->label == NULL && mrset->label_from_var_label
+              && var_has_label (var))
+            mrset->label = xstrdup (var_get_label (var));
+
+          if (mrset->n_vars
+              && var_get_type (var) != var_get_type (mrset->vars[0]))
+            {
+              /* Report the recoded name, as the other messages do. */
+              sys_warn (r, record->pos,
+                        _("MRSET %s contains both string and "
+                          "numeric variables."), mrset->name);
+              continue;
+            }
+          width = MIN (width, var_get_width (var));
+
+          if (mrset->n_vars >= allocated_vars)
+            mrset->vars = x2nrealloc (mrset->vars, &allocated_vars,
+                                      sizeof *mrset->vars);
+          mrset->vars[mrset->n_vars++] = var;
+        }
+      while (delimiter != '\n');
+
+      if (mrset->n_vars < 2)
+        {
+          sys_warn (r, record->pos,
+                    _("MRSET %s has only %zu variables."), mrset->name,
+                    mrset->n_vars);
+          mrset_destroy (mrset);
+          stringi_set_destroy (&var_names);
+          continue;
+        }
+
+      if (mrset->type == MRSET_MD)
+        {
+          /* The counted value takes the narrowest width among the set's
+             variables; width 0 means the variables are numeric. */
+          mrset->width = width;
+          value_init (&mrset->counted, width);
+          if (width == 0)
+            mrset->counted.f = strtod (counted, NULL);
+          else
+            value_copy_str_rpad (&mrset->counted, width,
+                                 (const uint8_t *) counted, ' ');
+        }
+
+      dict_add_mrset (dict, mrset);
+      mrset = NULL;
+      stringi_set_destroy (&var_names);
+    }
+  mrset_destroy (mrset);
+  close_text_record (r, text);
}
/* Read record type 7, subtype 11, which specifies how variables
should be displayed in GUI environments. */
static void
-read_display_parameters (struct sfm_reader *r, size_t size, size_t count,
+parse_display_parameters (struct sfm_reader *r,
+ const struct sfm_extension_record *record,
struct dictionary *dict)
{
- const size_t n_vars = count / 3 ;
+ bool includes_width;
bool warned = false;
- int i;
+ size_t n_vars;
+ size_t ofs;
+ size_t i;
- if (count % 3 || n_vars != dict_get_var_cnt (dict))
- sys_error (r, _("Bad size (%u) or count (%u) on extension 11."),
- (unsigned int) size, (unsigned int) count);
+ n_vars = dict_get_var_cnt (dict);
+ /* The record carries either three integers per variable (measure, width,
+ alignment) or two (measure, alignment).  Any other count is rejected
+ with a warning and the record is ignored. */
+ if (record->count == 3 * n_vars)
+ includes_width = true;
+ else if (record->count == 2 * n_vars)
+ includes_width = false;
+ else
+ {
+ sys_warn (r, record->pos,
+ _("Extension 11 has bad count %zu (for %zu variables)."),
+ record->count, n_vars);
+ return;
+ }
- for (i = 0; i < n_vars; ++i)
+ ofs = 0;
+ for (i = 0; i < n_vars; ++i)
{
- int measure = read_int32 (r);
- int width = read_int32 (r);
- int align = read_int32 (r);
struct variable *v = dict_get_var (dict, i);
+ int measure, width, align;
- /* spss v14 sometimes seems to set string variables' measure to zero */
- if ( 0 == measure && var_is_alpha (v) ) measure = 1;
+ measure = parse_int (r, record->data, ofs);
+ ofs += 4;
+ if (includes_width)
+ {
+ width = parse_int (r, record->data, ofs);
+ ofs += 4;
+ }
+ else
+ width = 0;
+
+ align = parse_int (r, record->data, ofs);
+ ofs += 4;
+
+ /* SPSS 14 sometimes seems to set string variables' measure
+ to zero. */
+ if (0 == measure && var_is_alpha (v))
+ measure = 1;
if (measure < 1 || measure > 3 || align < 0 || align > 2)
{
if (!warned)
- sys_warn (r, _("Invalid variable display parameters. "
- "Default parameters substituted."));
+ sys_warn (r, record->pos,
+ _("Invalid variable display parameters for variable "
+ "%zu (%s). Default parameters substituted."),
+ i, var_get_name (v));
warned = true;
continue;
}
var_set_measure (v, (measure == 1 ? MEASURE_NOMINAL
: measure == 2 ? MEASURE_ORDINAL
: MEASURE_SCALE));
- var_set_display_width (v, width);
var_set_alignment (v, (align == 0 ? ALIGN_LEFT
: align == 1 ? ALIGN_RIGHT
: ALIGN_CENTRE));
+
+ /* Older versions (SPSS 9.0) sometimes set the display
+ width to zero. This causes confusion in the GUI, so
+ only set the width if it is nonzero. */
+ if (width > 0)
+ var_set_display_width (v, width);
}
}
-/* Reads record type 7, subtype 13, which gives the long name
- that corresponds to each short name. Modifies variable names
- in DICT accordingly. */
+/* Renames VAR, which is in DICT, to NEW_NAME, keeping all of the short names
+ that VAR had before the rename. */
static void
-read_long_var_name_map (struct sfm_reader *r, size_t size, size_t count,
- struct dictionary *dict)
+rename_var_and_save_short_names (struct dictionary *dict, struct variable *var,
+ const char *new_name)
{
- struct variable_to_value_map *map;
+ size_t n_short_names;
+ char **short_names;
+ size_t i;
+
+ /* Renaming a variable may clear its short names, but we
+ want to retain them, so we save them and re-set them
+ afterward. */
+ n_short_names = var_get_short_name_cnt (var);
+ short_names = xnmalloc (n_short_names, sizeof *short_names);
+ for (i = 0; i < n_short_names; i++)
+ {
+ const char *s = var_get_short_name (var, i);
+ short_names[i] = s != NULL ? xstrdup (s) : NULL;
+ }
+
+ /* Set long name. */
+ dict_rename_var (dict, var, new_name);
+
+ /* Restore short names. */
+ for (i = 0; i < n_short_names; i++)
+ {
+ var_set_short_name (var, i, short_names[i]);
+ free (short_names[i]);
+ }
+ free (short_names);
+}
+
+/* Parses record type 7, subtype 13, which gives the long name that corresponds
+ to each short name. Modifies variable names in DICT accordingly. */
+static void
+parse_long_var_name_map (struct sfm_reader *r,
+ const struct sfm_extension_record *record,
+ struct dictionary *dict)
+{
+ struct text_record *text;
struct variable *var;
char *long_name;
- int warning_cnt = 0;
-
- map = open_variable_to_value_map (r, size * count);
- while (read_variable_to_value_map (r, dict, map, &var, &long_name,
- &warning_cnt))
+
+ /* RECORD may be null, meaning that there is no long-name record.  In that
+ case the only change made is to lowercase every variable's name, with
+ the short names kept as they were. */
+ if (record == NULL)
{
- char short_name[SHORT_NAME_LEN + 1];
- strcpy (short_name, var_get_short_name (var));
+ /* Convert variable names to lowercase. */
+ size_t i;
+
+ for (i = 0; i < dict_get_var_cnt (dict); i++)
+ {
+ struct variable *var = dict_get_var (dict, i);
+ char *new_name;
+
+ new_name = xstrdup (var_get_name (var));
+ str_lowercase (new_name);
+
+ rename_var_and_save_short_names (dict, var, new_name);
+
+ free (new_name);
+ }
+
+ return;
+ }
+ /* Rename each of the variables, one by one. (In a correctly constructed
+ system file, this cannot create any intermediate duplicate variable names,
+ because all of the new variable names are longer than any of the old
+ variable names and thus there cannot be any overlaps.) */
+ text = open_text_record (r, record, true);
+ while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
+ {
/* Validate long name. */
- if (!var_is_valid_name (long_name, false))
+ /* XXX need to reencode name to UTF-8 */
+ if (!dict_id_is_valid (dict, long_name, false))
{
- sys_warn (r, _("Long variable mapping from %s to invalid "
- "variable name `%s'."),
+ sys_warn (r, record->pos,
+ _("Long variable mapping from %s to invalid "
+ "variable name `%s'."),
var_get_name (var), long_name);
continue;
}
-
+
/* Identify any duplicates. */
- if (strcasecmp (short_name, long_name)
+ /* A long name that equals the variable's own first short name, ignoring
+ case, is not counted as a duplicate. */
+ if (strcasecmp (var_get_short_name (var, 0), long_name)
&& dict_lookup_var (dict, long_name) != NULL)
{
- sys_warn (r, _("Duplicate long variable name `%s' "
- "within system file."), long_name);
+ sys_warn (r, record->pos,
+ _("Duplicate long variable name `%s'."), long_name);
continue;
}
- /* Set long name. Renaming a variable may clear the short
- name, but we want to retain it, so re-set it
- explicitly. */
- dict_rename_var (dict, var, long_name);
- var_set_short_name (var, short_name);
+ rename_var_and_save_short_names (dict, var, long_name);
}
- close_variable_to_value_map (r, map);
- r->has_long_var_names = true;
+ close_text_record (r, text);
}
/* Reads record type 7, subtype 14, which gives the real length
of each very long string. Rearranges DICT accordingly. */
static void
-read_long_string_map (struct sfm_reader *r, size_t size, size_t count,
- struct dictionary *dict)
+parse_long_string_map (struct sfm_reader *r,
+ const struct sfm_extension_record *record,
+ struct dictionary *dict)
{
- struct variable_to_value_map *map;
+ struct text_record *text;
struct variable *var;
char *length_s;
- int warning_cnt = 0;
- r->has_vls = true;
-
- map = open_variable_to_value_map (r, size * count);
- while (read_variable_to_value_map (r, dict, map, &var, &length_s,
- &warning_cnt))
+ text = open_text_record (r, record, true);
+ while (read_variable_to_value_pair (r, dict, text, &var, &length_s))
{
- long length, remaining_length;
- size_t idx;
+ size_t idx = var_get_dict_index (var);
+ long int length;
+ int segment_cnt;
+ int i;
/* Get length. */
length = strtol (length_s, NULL, 10);
- if (length < MIN_VERY_LONG_STRING || length == LONG_MAX)
+ if (length < 1 || length > MAX_STRING)
{
- sys_warn (r, _("%s listed as string of length %s "
- "in length table."),
+ sys_warn (r, record->pos,
+ _("%s listed as string of invalid length %s "
+ "in very long string record."),
var_get_name (var), length_s);
continue;
}
- /* Group multiple variables into single variable
- and delete all but the first. */
- remaining_length = length;
- for (idx = var_get_dict_index (var); remaining_length > 0; idx++)
- if (idx < dict_get_var_cnt (dict))
- remaining_length -= MIN (var_get_width (dict_get_var (dict, idx)),
- EFFECTIVE_LONG_STRING_LENGTH);
- else
- sys_error (r, _("Very long string %s overflows dictionary."),
- var_get_name (var));
- dict_delete_consecutive_vars (dict,
- var_get_dict_index (var) + 1,
- idx - var_get_dict_index (var) - 1);
-
- /* Assign all the length to the first variable. */
+ /* Check segments. */
+ segment_cnt = sfm_width_to_segments (length);
+ if (segment_cnt == 1)
+ {
+ sys_warn (r, record->pos,
+ _("%s listed in very long string record with width %s, "
+ "which requires only one segment."),
+ var_get_name (var), length_s);
+ continue;
+ }
+ if (idx + segment_cnt > dict_get_var_cnt (dict))
+ sys_error (r, record->pos,
+ _("Very long string %s overflows dictionary."),
+ var_get_name (var));
+
+ /* Get the short names from the segments and check their
+ lengths. */
+ for (i = 0; i < segment_cnt; i++)
+ {
+ struct variable *seg = dict_get_var (dict, idx + i);
+ int alloc_width = sfm_segment_alloc_width (length, i);
+ int width = var_get_width (seg);
+
+ if (i > 0)
+ var_set_short_name (var, i, var_get_short_name (seg, 0));
+ if (ROUND_UP (width, 8) != ROUND_UP (alloc_width, 8))
+ sys_error (r, record->pos,
+ _("Very long string with width %ld has segment %d "
+ "of width %d (expected %d)."),
+ length, i, width, alloc_width);
+ }
+ dict_delete_consecutive_vars (dict, idx + 1, segment_cnt - 1);
var_set_width (var, length);
}
- close_variable_to_value_map (r, map);
+ close_text_record (r, text);
dict_compact_values (dict);
}
-/* Reads value labels from sysfile H and inserts them into the
- associated dictionary. */
static void
-read_value_labels (struct sfm_reader *r,
- struct dictionary *dict, struct variable **var_by_value_idx)
+parse_value_labels (struct sfm_reader *r, struct dictionary *dict,
+ const struct sfm_var_record *var_recs, size_t n_var_recs,
+ const struct sfm_value_label_record *record)
{
- struct pool *subpool;
-
- struct label
- {
- char raw_value[8]; /* Value as uninterpreted bytes. */
- union value value; /* Value. */
- char *label; /* Null-terminated label string. */
- };
+ struct variable **vars;
+ char **utf8_labels;
+ size_t i;
- struct label *labels = NULL;
- int label_cnt; /* Number of labels. */
+ utf8_labels = pool_nmalloc (r->pool, sizeof *utf8_labels, record->n_labels);
+ for (i = 0; i < record->n_labels; i++)
+ utf8_labels[i] = recode_string_pool ("UTF-8", dict_get_encoding (dict),
+ record->labels[i].label, -1,
+ r->pool);
+
+ vars = pool_nmalloc (r->pool, record->n_vars, sizeof *vars);
+ for (i = 0; i < record->n_vars; i++)
+ vars[i] = lookup_var_by_index (r, record->pos,
+ var_recs, n_var_recs, record->vars[i]);
+
+ for (i = 1; i < record->n_vars; i++)
+ if (var_get_type (vars[i]) != var_get_type (vars[0]))
+ sys_error (r, record->pos,
+ _("Variables associated with value label are not all of "
+ "identical type. Variable %s is %s, but variable "
+ "%s is %s."),
+ var_get_name (vars[0]),
+ var_is_numeric (vars[0]) ? _("numeric") : _("string"),
+ var_get_name (vars[i]),
+ var_is_numeric (vars[i]) ? _("numeric") : _("string"));
+
+ for (i = 0; i < record->n_vars; i++)
+ {
+ struct variable *var = vars[i];
+ int width;
+ size_t j;
+
+ width = var_get_width (var);
+ if (width > 8)
+ sys_error (r, record->pos,
+ _("Value labels may not be added to long string "
+ "variables (e.g. %s) using records types 3 and 4."),
+ var_get_name (var));
+
+ for (j = 0; j < record->n_labels; j++)
+ {
+ struct sfm_value_label *label = &record->labels[j];
+ union value value;
- struct variable **var = NULL; /* Associated variables. */
- int var_cnt; /* Number of associated variables. */
+ value_init (&value, width);
+ if (width == 0)
+ value.f = parse_float (r, label->value, 0);
+ else
+ memcpy (value_str_rw (&value, width), label->value, width);
- int i;
+ if (!var_add_value_label (var, &value, utf8_labels[j]))
+ {
+ if (var_is_numeric (var))
+ sys_warn (r, record->pos,
+ _("Duplicate value label for %g on %s."),
+ value.f, var_get_name (var));
+ else
+ sys_warn (r, record->pos,
+ _("Duplicate value label for `%.*s' on %s."),
+ width, value_str (&value, width),
+ var_get_name (var));
+ }
- subpool = pool_create_subpool (r->pool);
+ value_destroy (&value, width);
+ }
+ }
- /* Read the type 3 record and record its contents. We can't do
- much with the data yet because we don't know whether it is
- of numeric or string type. */
+ pool_free (r->pool, vars);
+ for (i = 0; i < record->n_labels; i++)
+ pool_free (r->pool, utf8_labels[i]);
+ pool_free (r->pool, utf8_labels);
+}
- /* Read number of labels. */
- label_cnt = read_int32 (r);
+static struct variable *
+lookup_var_by_index (struct sfm_reader *r, off_t offset,
+ const struct sfm_var_record *var_recs, size_t n_var_recs,
+ int idx)
+{
+ const struct sfm_var_record *rec;
- if (label_cnt >= INT32_MAX / sizeof *labels)
- {
- sys_warn (r, _("Invalid number of labels: %d. Ignoring labels."),
- label_cnt);
- label_cnt = 0;
+ if (idx < 1 || idx > n_var_recs)
+ {
+ sys_error (r, offset,
+ _("Variable index %d not in valid range 1...%zu."),
+ idx, n_var_recs);
+ return NULL;
}
- /* Read each value/label tuple into labels[]. */
- labels = pool_nalloc (subpool, label_cnt, sizeof *labels);
- for (i = 0; i < label_cnt; i++)
+ rec = &var_recs[idx - 1];
+ if (rec->var == NULL)
{
- struct label *label = labels + i;
- unsigned char label_len;
- size_t padded_len;
+ sys_error (r, offset,
+ _("Variable index %d refers to long string continuation."),
+ idx);
+ return NULL;
+ }
- /* Read value. */
- read_bytes (r, label->raw_value, sizeof label->raw_value);
+ return rec->var;
+}
- /* Read label length. */
- read_bytes (r, &label_len, sizeof label_len);
- padded_len = ROUND_UP (label_len + 1, 8);
+/* Parses a set of custom attributes from TEXT into ATTRS.
+ ATTRS may be a null pointer, in which case the attributes are
+ read but discarded. */
+static void
+parse_attributes (struct sfm_reader *r, struct text_record *text,
+ struct attrset *attrs)
+{
+ do
+ {
+ struct attribute *attr;
+ char *key;
+ int index;
- /* Read label, padding. */
- label->label = pool_alloc (subpool, padded_len + 1);
- read_bytes (r, label->label, padded_len - 1);
- label->label[label_len] = 0;
- }
+ /* Parse the key. */
+ key = text_get_token (text, ss_cstr ("("), NULL);
+ if (key == NULL)
+ return;
- /* Now, read the type 4 record that has the list of variables
- to which the value labels are to be applied. */
+ attr = attribute_create (key);
+ for (index = 1; ; index++)
+ {
+ /* Parse the value. */
+ char *value;
+ size_t length;
- /* Read record type of type 4 record. */
- if (read_int32 (r) != 4)
- sys_error (r, _("Variable index record (type 4) does not immediately "
- "follow value label record (type 3) as it should."));
+ value = text_get_token (text, ss_cstr ("\n"), NULL);
+ if (value == NULL)
+ {
+ text_warn (r, text, _("Error parsing attribute value %s[%d]."),
+ key, index);
+ break;
+ }
- /* Read number of variables associated with value label from type 4
- record. */
- var_cnt = read_int32 (r);
- if (var_cnt < 1 || var_cnt > dict_get_var_cnt (dict))
- sys_error (r, _("Number of variables associated with a value label (%d) "
- "is not between 1 and the number of variables (%u)."),
- var_cnt, (unsigned int) dict_get_var_cnt (dict));
-
- /* Read the list of variables. */
- var = pool_nalloc (subpool, var_cnt, sizeof *var);
- for (i = 0; i < var_cnt; i++)
- {
- var[i] = lookup_var_by_value_idx (r, var_by_value_idx, read_int32 (r));
- if (var_is_long_string (var[i]))
- sys_error (r, _("Value labels are not allowed on long string "
- "variables (%s)."), var_get_name (var[i]));
- }
+ length = strlen (value);
+ if (length >= 2 && value[0] == '\'' && value[length - 1] == '\'')
+ {
+ value[length - 1] = '\0';
+ attribute_add_value (attr, value + 1);
+ }
+ else
+ {
+ text_warn (r, text,
+ _("Attribute value %s[%d] is not quoted: %s."),
+ key, index, value);
+ attribute_add_value (attr, value);
+ }
- /* Type check the variables. */
- for (i = 1; i < var_cnt; i++)
- if (var_get_type (var[i]) != var_get_type (var[0]))
- sys_error (r, _("Variables associated with value label are not all of "
- "identical type. Variable %s is %s, but variable "
- "%s is %s."),
- var_get_name (var[0]),
- var_is_numeric (var[0]) ? _("numeric") : _("string"),
- var_get_name (var[i]),
- var_is_numeric (var[i]) ? _("numeric") : _("string"));
-
- /* Fill in labels[].value, now that we know the desired type. */
- for (i = 0; i < label_cnt; i++)
- {
- struct label *label = labels + i;
-
- if (var_is_alpha (var[0]))
- buf_copy_rpad (label->value.s, sizeof label->value.s,
- label->raw_value, sizeof label->raw_value);
+ /* Was this the last value for this attribute? */
+ if (text_match (text, ')'))
+ break;
+ }
+ if (attrs != NULL)
+ attrset_add (attrs, attr);
else
- label->value.f = flt64_to_double (r, (uint8_t *) label->raw_value);
+ attribute_destroy (attr);
}
-
- /* Assign the `value_label's to each variable. */
- for (i = 0; i < var_cnt; i++)
+ while (!text_match (text, '/'));
+}
+
+/* Parses RECORD, an extension record of type 7, subtype 17, whose
+   text lists custom attributes on the data file, passing DICT's
+   attribute set to parse_attributes as the destination. */
+static void
+parse_data_file_attributes (struct sfm_reader *r,
+                            const struct sfm_extension_record *record,
+                            struct dictionary *dict)
+{
+  struct text_record *text;
+  struct attrset *file_attrs;
+
+  /* The record is opened with recoding to UTF-8 requested. */
+  text = open_text_record (r, record, true);
+  file_attrs = dict_get_attributes (dict);
+  parse_attributes (r, text, file_attrs);
+  close_text_record (r, text);
+}
+
+/* Parses RECORD, an extension record of type 7, subtype 18, which
+   lists custom attributes on individual variables.  Each entry is a
+   variable name terminated by ":" followed by that variable's
+   attributes; names are looked up in DICT. */
+static void
+parse_variable_attributes (struct sfm_reader *r,
+                           const struct sfm_extension_record *record,
+                           struct dictionary *dict)
+{
+  struct text_record *text = open_text_record (r, record, true);
+  struct variable *var;
+
+  while (text_read_variable_name (r, dict, text, ss_cstr (":"), &var))
+    {
+      /* With a null attribute set, parse_attributes reads the
+         attributes but discards them. */
+      struct attrset *attrs = NULL;
+
+      if (var != NULL)
+        attrs = var_get_attributes (var);
+      parse_attributes (r, text, attrs);
+    }
+  close_text_record (r, text);
+}
+
+/* Checks that LENGTH bytes starting at offset OFS lie within RECORD's
+   data, which is record->size * record->count bytes long.  A LENGTH
+   greater than or equal to the whole record is also rejected.  On
+   failure, reports the problem with sys_error, positioned at the end
+   of the record. */
+static void
+check_overflow (struct sfm_reader *r,
+                const struct sfm_extension_record *record,
+                size_t ofs, size_t length)
+{
+  const size_t end = record->size * record->count;
+
+  /* Nothing to report when the requested span fits. */
+  if (length < end && ofs + length <= end)
+    return;
+
+  sys_error (r, record->pos + end,
+             _("Long string value label record ends unexpectedly."));
+}
+
+static void
+parse_long_string_value_labels (struct sfm_reader *r,
+ const struct sfm_extension_record *record,
+ struct dictionary *dict)
+{
+ const char *dict_encoding = dict_get_encoding (dict);
+ size_t end = record->size * record->count;
+ size_t ofs = 0;
+
+ while (ofs < end)
{
- struct variable *v = var[i];
- int j;
+ char *var_name;
+ size_t n_labels, i;
+ struct variable *var;
+ union value value;
+ int var_name_len;
+ int width;
+
+ /* Parse variable name length. */
+ check_overflow (r, record, ofs, 4);
+ var_name_len = parse_int (r, record->data, ofs);
+ ofs += 4;
+
+ /* Parse variable name, width, and number of labels. */
+ check_overflow (r, record, ofs, var_name_len + 8);
+ var_name = recode_string_pool ("UTF-8", dict_encoding,
+ (const char *) record->data + ofs,
+ var_name_len, r->pool);
+ width = parse_int (r, record->data, ofs + var_name_len);
+ n_labels = parse_int (r, record->data, ofs + var_name_len + 4);
+ ofs += var_name_len + 8;
+
+ /* Look up 'var' and validate. */
+ var = dict_lookup_var (dict, var_name);
+ if (var == NULL)
+ sys_warn (r, record->pos + ofs,
+ _("Ignoring long string value record for "
+ "unknown variable %s."), var_name);
+ else if (var_is_numeric (var))
+ {
+ sys_warn (r, record->pos + ofs,
+ _("Ignoring long string value record for "
+ "numeric variable %s."), var_name);
+ var = NULL;
+ }
+ else if (width != var_get_width (var))
+ {
+ sys_warn (r, record->pos + ofs,
+ _("Ignoring long string value record for variable %s "
+ "because the record's width (%d) does not match the "
+ "variable's width (%d)."),
+ var_name, width, var_get_width (var));
+ var = NULL;
+ }
- /* Add each label to the variable. */
- for (j = 0; j < label_cnt; j++)
+ /* Parse values. */
+ value_init_pool (r->pool, &value, width);
+ for (i = 0; i < n_labels; i++)
{
- struct label *label = &labels[j];
- if (!var_add_value_label (v, &label->value, label->label))
+ size_t value_length, label_length;
+ bool skip = var == NULL;
+
+ /* Parse value length. */
+ check_overflow (r, record, ofs, 4);
+ value_length = parse_int (r, record->data, ofs);
+ ofs += 4;
+
+ /* Parse value. */
+ check_overflow (r, record, ofs, value_length);
+ if (!skip)
{
- if (var_is_numeric (var[0]))
- sys_warn (r, _("Duplicate value label for %g on %s."),
- label->value.f, var_get_name (v));
+ if (value_length == width)
+ memcpy (value_str_rw (&value, width),
+ (const uint8_t *) record->data + ofs, width);
else
- sys_warn (r, _("Duplicate value label for \"%.*s\" on %s."),
- var_get_width (v), label->value.s,
- var_get_name (v));
+ {
+ sys_warn (r, record->pos + ofs,
+ _("Ignoring long string value %zu for variable "
+ "%s, with width %d, that has bad value "
+ "width %zu."),
+ i, var_get_name (var), width, value_length);
+ skip = true;
+ }
}
- }
- }
+ ofs += value_length;
+
+ /* Parse label length. */
+ check_overflow (r, record, ofs, 4);
+ label_length = parse_int (r, record->data, ofs);
+ ofs += 4;
- pool_destroy (subpool);
+ /* Parse label. */
+ check_overflow (r, record, ofs, label_length);
+ if (!skip)
+ {
+ char *label;
+
+ label = recode_string_pool ("UTF-8", dict_encoding,
+ (const char *) record->data + ofs,
+ label_length, r->pool);
+ if (!var_add_value_label (var, &value, label))
+ sys_warn (r, record->pos + ofs,
+ _("Duplicate value label for `%.*s' on %s."),
+ width, value_str (&value, width),
+ var_get_name (var));
+ pool_free (r->pool, label);
+ }
+ ofs += label_length;
+ }
+ }
}
\f
/* Case reader. */
static void partial_record (struct sfm_reader *r)
NO_RETURN;
+
+static void read_error (struct casereader *, const struct sfm_reader *);
+
static bool read_case_number (struct sfm_reader *, double *);
-static bool read_case_string (struct sfm_reader *, char *, size_t);
+static bool read_case_string (struct sfm_reader *, uint8_t *, size_t);
static int read_opcode (struct sfm_reader *);
static bool read_compressed_number (struct sfm_reader *, double *);
-static bool read_compressed_string (struct sfm_reader *, char *);
-static bool read_whole_strings (struct sfm_reader *, char *, size_t);
-
-/* Reads one case from READER's file into C. Returns true only
- if successful. */
-static bool
-sys_file_casereader_read (struct casereader *reader, void *r_,
- struct ccase *c)
+static bool read_compressed_string (struct sfm_reader *, uint8_t *);
+static bool read_whole_strings (struct sfm_reader *, uint8_t *, size_t);
+static bool skip_whole_strings (struct sfm_reader *, size_t);
+
+/* Reads and returns one case from READER's file. Returns a null
+ pointer if not successful. */
+static struct ccase *
+sys_file_casereader_read (struct casereader *reader, void *r_)
{
struct sfm_reader *r = r_;
+ struct ccase *volatile c;
+ int i;
+
if (r->error)
- return false;
+ return NULL;
- case_create (c, r->value_cnt);
- if (setjmp (r->bail_out))
+ c = case_create (r->proto);
+ if (setjmp (r->bail_out))
{
casereader_force_error (reader);
- case_destroy (c);
- return false;
+ case_unref (c);
+ return NULL;
}
- if (!r->compressed && sizeof (double) == 8 && !r->has_vls)
+ for (i = 0; i < r->sfm_var_cnt; i++)
{
- /* Fast path. Read the whole case directly. */
- if (!try_read_bytes (r, case_data_all_rw (c),
- sizeof (union value) * r->flt64_cnt))
- {
- case_destroy (c);
- return false;
- }
+ struct sfm_var *sv = &r->sfm_vars[i];
+ union value *v = case_data_rw_idx (c, sv->case_index);
- /* Convert floating point numbers to native format if needed. */
- if (r->float_format != FLOAT_NATIVE_DOUBLE)
+ if (sv->var_width == 0)
{
- int i;
-
- for (i = 0; i < r->var_cnt; i++)
- if (r->vars[i].width == 0)
- {
- double *d = &case_data_rw_idx (c, r->vars[i].case_index)->f;
- float_convert (r->float_format, d, FLOAT_NATIVE_DOUBLE, d);
- }
+ if (!read_case_number (r, &v->f))
+ goto eof;
}
- return true;
- }
- else
- {
- /* Slow path. Convert from external to internal format. */
- int i;
-
- for (i = 0; i < r->var_cnt; i++)
+ else
{
- struct sfm_var *sv = &r->vars[i];
- union value *v = case_data_rw_idx (c, sv->case_index);
-
- if (sv->width == 0)
- {
- if (!read_case_number (r, &v->f))
- goto eof;
- }
- else
- {
- /* Read the string data in segments up to 255 bytes
- at a time, packed into 8-byte units. */
- const int max_chunk = MIN_VERY_LONG_STRING - 1;
- int ofs, chunk_size;
- for (ofs = 0; ofs < sv->width; ofs += chunk_size)
- {
- chunk_size = MIN (max_chunk, sv->width - ofs);
- if (!read_case_string (r, v->s + ofs, chunk_size))
- {
- if (ofs)
- partial_record (r);
- goto eof;
- }
- }
-
- /* Very long strings have trailing wasted space
- that we must skip. */
- if (sv->width >= MIN_VERY_LONG_STRING)
- {
- int bytes_read = (sv->width / max_chunk * 256
- + ROUND_UP (sv->width % max_chunk, 8));
- int total_bytes = sfm_width_to_bytes (sv->width);
- int excess_bytes = total_bytes - bytes_read;
-
- while (excess_bytes > 0)
- {
- char buffer[1024];
- size_t chunk = MIN (sizeof buffer, excess_bytes);
- if (!read_whole_strings (r, buffer, chunk))
- partial_record (r);
- excess_bytes -= chunk;
- }
- }
- }
+ uint8_t *s = value_str_rw (v, sv->var_width);
+ if (!read_case_string (r, s + sv->offset, sv->segment_width))
+ goto eof;
+ if (!skip_whole_strings (r, ROUND_DOWN (sv->padding, 8)))
+ partial_record (r);
}
- return true;
-
- eof:
- case_destroy (c);
- if (i != 0)
- partial_record (r);
- return false;
}
+ return c;
+
+eof:
+ if (i != 0)
+ partial_record (r);
+ if (r->case_cnt != -1)
+ read_error (reader, r);
+ case_unref (c);
+ return NULL;
}
/* Issues an error that R ends in a partial record. */
static void
partial_record (struct sfm_reader *r)
{
- sys_error (r, _("File ends in partial case."));
+ sys_error (r, r->pos, _("File ends in partial case."));
+}
+
+/* Issues an error that an unspecified error occurred on SFM, and
+ marks R tainted. */
+static void
+read_error (struct casereader *r, const struct sfm_reader *sfm)
+{
+ msg (ME, _("Error reading case from file %s."), fh_get_name (sfm->fh));
+ casereader_force_error (r);
}
/* Reads a number from R and stores its value in *D.
Returns true if successful, false if end of file is
reached immediately. */
static bool
-read_case_number (struct sfm_reader *r, double *d)
+read_case_number (struct sfm_reader *r, double *d)
{
if (!r->compressed)
{
- uint8_t flt64[8];
- if (!try_read_bytes (r, flt64, sizeof flt64))
+ uint8_t number[8];
+ if (!try_read_bytes (r, number, sizeof number))
return false;
- *d = flt64_to_double (r, flt64);
+ float_convert (r->float_format, number, FLOAT_NATIVE_DOUBLE, d);
return true;
}
else
Returns true if successful, false if end of file is
reached immediately. */
static bool
-read_case_string (struct sfm_reader *r, char *s, size_t length)
+read_case_string (struct sfm_reader *r, uint8_t *s, size_t length)
{
size_t whole = ROUND_DOWN (length, 8);
size_t partial = length % 8;
-
- if (whole)
+
+ if (whole)
{
if (!read_whole_strings (r, s, whole))
return false;
if (partial)
{
- char bounce[8];
+ uint8_t bounce[8];
if (!read_whole_strings (r, bounce, sizeof bounce))
{
if (whole)
partial_record (r);
- return false;
+ return false;
}
memcpy (s + whole, bounce, partial);
}
/* Reads and returns the next compression opcode from R. */
static int
-read_opcode (struct sfm_reader *r)
+read_opcode (struct sfm_reader *r)
{
assert (r->compressed);
for (;;)
{
int opcode;
- if (r->opcode_idx >= sizeof r->opcodes)
+ if (r->opcode_idx >= sizeof r->opcodes)
{
if (!try_read_bytes (r, r->opcodes, sizeof r->opcodes))
return -1;
static bool
read_compressed_number (struct sfm_reader *r, double *d)
{
- int opcode = read_opcode (r);
+ int opcode = read_opcode (r);
switch (opcode)
{
case -1:
return false;
case 253:
- *d = read_flt64 (r);
+ *d = read_float (r);
break;
-
+
case 254:
- sys_error (r, _("Compressed data is corrupt."));
+ float_convert (r->float_format, " ", FLOAT_NATIVE_DOUBLE, d);
+ if (!r->corruption_warning)
+ {
+ r->corruption_warning = true;
+ sys_warn (r, r->pos,
+ _("Possible compressed data corruption: "
+ "compressed spaces appear in numeric field."));
+ }
+ break;
case 255:
*d = SYSMIS;
Returns true if successful, false if end of file is
reached immediately. */
static bool
-read_compressed_string (struct sfm_reader *r, char *dst)
+read_compressed_string (struct sfm_reader *r, uint8_t *dst)
{
- switch (read_opcode (r))
+ int opcode = read_opcode (r);
+ switch (opcode)
{
case -1:
case 252:
break;
default:
- sys_error (r, _("Compressed data is corrupt."));
+ {
+ double value = opcode - r->bias;
+ float_convert (FLOAT_NATIVE_DOUBLE, &value, r->float_format, dst);
+ if (value == 0.0)
+ {
+ /* This has actually been seen "in the wild". The submitter of the
+ file that showed that the contents decoded as spaces, but they
+ were at the end of the field so it's possible that the null
+ bytes just acted as null terminators. */
+ }
+ else if (!r->corruption_warning)
+ {
+ r->corruption_warning = true;
+ sys_warn (r, r->pos,
+ _("Possible compressed data corruption: "
+ "string contains compressed integer (opcode %d)."),
+ opcode);
+ }
+ }
+ break;
}
return true;
Returns true if successful, false if end of file is
reached immediately. */
static bool
-read_whole_strings (struct sfm_reader *r, char *s, size_t length)
+read_whole_strings (struct sfm_reader *r, uint8_t *s, size_t length)
{
assert (length % 8 == 0);
if (!r->compressed)
{
size_t ofs;
for (ofs = 0; ofs < length; ofs += 8)
- if (!read_compressed_string (r, s + ofs))
+ if (!read_compressed_string (r, s + ofs))
{
if (ofs != 0)
partial_record (r);
return true;
}
}
-\f
-/* Creates and returns a table that can be used for translating a value
- index into a case to a "struct variable *" for DICT. Multiple
- system file fields reference variables this way.
-
- This table must be created before processing the very long
- string extension record, because that record causes some
- values to be deleted from the case and the dictionary to be
- compacted. */
-static struct variable **
-make_var_by_value_idx (struct sfm_reader *r, struct dictionary *dict)
+
+/* Skips LENGTH string bytes from R.
+ LENGTH must be a multiple of 8.
+ (LENGTH must also be less than 1024, but that's only because the
+ current caller never needs more than that many bytes.)
+ Returns true if successful, false if end of file is
+ reached immediately. */
+static bool
+skip_whole_strings (struct sfm_reader *r, size_t length)
{
- struct variable **var_by_value_idx;
- int value_idx = 0;
- int i;
+ uint8_t buffer[1024];
+ assert (length < sizeof buffer);
+ return read_whole_strings (r, buffer, length);
+}
+\f
+/* Helpers for reading records that contain structured text
+ strings. */
- var_by_value_idx = pool_nmalloc (r->pool,
- r->flt64_cnt, sizeof *var_by_value_idx);
- for (i = 0; i < dict_get_var_cnt (dict); i++)
- {
- struct variable *v = dict_get_var (dict, i);
- int nv = var_is_numeric (v) ? 1 : DIV_RND_UP (var_get_width (v), 8);
- int j;
+/* Maximum number of warnings to issue for a single text
+ record. */
+#define MAX_TEXT_WARNINGS 5
- var_by_value_idx[value_idx++] = v;
- for (j = 1; j < nv; j++)
- var_by_value_idx[value_idx++] = NULL;
- }
- assert (value_idx == r->flt64_cnt);
+/* State. */
+struct text_record
+ {
+ struct substring buffer; /* Record contents. */
+ off_t start; /* Starting offset in file. */
+ size_t pos; /* Current position in buffer. */
+ int n_warnings; /* Number of warnings issued or suppressed. */
+ bool recoded; /* Recoded into UTF-8? */
+ };
- return var_by_value_idx;
+/* Allocates from R's pool and returns a text_record positioned at
+   the start of RECORD's data (record->size * record->count bytes).
+   If RECODE_TO_UTF8 is true, the contents are passed through
+   recode_substring_pool with "UTF-8" and R's encoding; otherwise
+   the record's own data is used directly. */
+static struct text_record *
+open_text_record (struct sfm_reader *r,
+                  const struct sfm_extension_record *record,
+                  bool recode_to_utf8)
+{
+  struct text_record *text = pool_alloc (r->pool, sizeof *text);
+  struct substring raw = ss_buffer (record->data,
+                                    record->size * record->count);
+
+  text->start = record->pos;
+  if (recode_to_utf8)
+    text->buffer = recode_substring_pool ("UTF-8", r->encoding, raw, r->pool);
+  else
+    text->buffer = raw;
+  text->pos = 0;
+  text->n_warnings = 0;
+  /* Remembered so that close_text_record knows whether to free the
+     buffer. */
+  text->recoded = recode_to_utf8;
+
+  return text;
 }
-/* Returns the "struct variable" corresponding to the given
- 1-basd VALUE_IDX in VAR_BY_VALUE_IDX. Verifies that the index
- is valid. */
-static struct variable *
-lookup_var_by_value_idx (struct sfm_reader *r,
- struct variable **var_by_value_idx, int value_idx)
+/* Closes TEXT, frees its storage, and issues a final warning
+ about suppressed warnings if necessary. */
+static void
+close_text_record (struct sfm_reader *r, struct text_record *text)
{
- struct variable *var;
-
- if (value_idx < 1 || value_idx > r->flt64_cnt)
- sys_error (r, _("Variable index %d not in valid range 1...%d."),
- value_idx, r->flt64_cnt);
-
- var = var_by_value_idx[value_idx - 1];
- if (var == NULL)
- sys_error (r, _("Variable index %d refers to long string "
- "continuation."),
- value_idx);
-
- return var;
+ if (text->n_warnings > MAX_TEXT_WARNINGS)
+ sys_warn (r, -1, _("Suppressed %d additional related warnings."),
+ text->n_warnings - MAX_TEXT_WARNINGS);
+ if (text->recoded)
+ pool_free (r->pool, ss_data (text->buffer));
}
-/* Returns the variable in D with the given SHORT_NAME,
- or a null pointer if there is none. */
-static struct variable *
-lookup_var_by_short_name (struct dictionary *d, const char *short_name)
+/* Reads a variable=value pair from TEXT.
+ Looks up the variable in DICT and stores it into *VAR.
+ Stores a null-terminated value into *VALUE. */
+static bool
+read_variable_to_value_pair (struct sfm_reader *r, struct dictionary *dict,
+ struct text_record *text,
+ struct variable **var, char **value)
{
- struct variable *var;
- size_t var_cnt;
- size_t i;
+ for (;;)
+ {
+ if (!text_read_short_name (r, dict, text, ss_cstr ("="), var))
+ return false;
+
+ *value = text_get_token (text, ss_buffer ("\t\0", 2), NULL);
+ if (*value == NULL)
+ return false;
- /* First try looking up by full name. This often succeeds. */
- var = dict_lookup_var (d, short_name);
- if (var != NULL && !strcasecmp (var_get_short_name (var), short_name))
- return var;
+ text->pos += ss_span (ss_substr (text->buffer, text->pos, SIZE_MAX),
+ ss_buffer ("\t\0", 2));
- /* Iterate through the whole dictionary as a fallback. */
- var_cnt = dict_get_var_cnt (d);
- for (i = 0; i < var_cnt; i++)
- {
- var = dict_get_var (d, i);
- if (!strcasecmp (var_get_short_name (var), short_name))
- return var;
+ if (*var != NULL)
+ return true;
}
+}
- return NULL;
+/* Reads a token ending at one of DELIMITERS from TEXT and looks it
+   up as a variable name in DICT.  Returns true, with the variable
+   stored in *VAR, if the name is found.  Returns false if no token
+   is available (*VAR is then left untouched) or if the name is not
+   in DICT (*VAR is then null and a warning is issued through
+   text_warn). */
+static bool
+text_read_variable_name (struct sfm_reader *r, struct dictionary *dict,
+                         struct text_record *text, struct substring delimiters,
+                         struct variable **var)
+{
+  char *name = text_get_token (text, delimiters, NULL);
+
+  if (name == NULL)
+    return false;
+
+  *var = dict_lookup_var (dict, name);
+  if (*var == NULL)
+    {
+      text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
+                 name);
+      return false;
+    }
+  return true;
 }
-\f
-/* Helpers for reading records that contain "variable=value"
- pairs. */
-/* State. */
-struct variable_to_value_map
- {
- struct substring buffer; /* Record contents. */
- size_t pos; /* Current position in buffer. */
- };
-/* Reads SIZE bytes into a "variable=value" map for R,
- and returns the map. */
-static struct variable_to_value_map *
-open_variable_to_value_map (struct sfm_reader *r, size_t size)
+static bool
+text_read_short_name (struct sfm_reader *r, struct dictionary *dict,
+ struct text_record *text, struct substring delimiters,
+ struct variable **var)
{
- struct variable_to_value_map *map = pool_alloc (r->pool, sizeof *map);
- char *buffer = pool_malloc (r->pool, size + 1);
- read_bytes (r, buffer, size);
- map->buffer = ss_buffer (buffer, size);
- map->pos = 0;
- return map;
+ char *short_name = text_get_token (text, delimiters, NULL);
+ if (short_name == NULL)
+ return false;
+
+ *var = dict_lookup_var (dict, short_name);
+ if (*var == NULL)
+ text_warn (r, text, _("Dictionary record refers to unknown variable %s."),
+ short_name);
+ return true;
}
-/* Closes MAP and frees its storage.
- Not really needed, because the pool will free the map anyway,
- but can be used to free it earlier. */
+/* Displays a warning for the current file position, limiting the
+ number to MAX_TEXT_WARNINGS for TEXT. */
static void
-close_variable_to_value_map (struct sfm_reader *r,
- struct variable_to_value_map *map)
+text_warn (struct sfm_reader *r, struct text_record *text,
+ const char *format, ...)
{
- pool_free (r->pool, ss_data (map->buffer));
+ if (text->n_warnings++ < MAX_TEXT_WARNINGS)
+ {
+ va_list args;
+
+ va_start (args, format);
+ sys_msg (r, text->start + text->pos, MW, format, args);
+ va_end (args);
+ }
}
-/* Reads the next variable=value pair from MAP.
- Looks up the variable in DICT and stores it into *VAR.
- Stores a null-terminated value into *VALUE. */
-static bool
-read_variable_to_value_map (struct sfm_reader *r, struct dictionary *dict,
- struct variable_to_value_map *map,
- struct variable **var, char **value,
- int *warning_cnt)
+/* Extracts the next token from TEXT with ss_tokenize, using
+   DELIMITERS, and returns it null-terminated in place within TEXT's
+   buffer (the caller must not free it).  Returns a null pointer if
+   ss_tokenize reports no token.
+
+   The byte just past the token is overwritten with a null
+   terminator; if DELIMITER is nonnull, the byte's previous value is
+   stored into *DELIMITER first.
+   NOTE(review): this presumes the byte just past the token is
+   writable even when the token runs to the end of the buffer --
+   confirm against how the buffer is allocated. */
+static char *
+text_get_token (struct text_record *text, struct substring delimiters,
+                char *delimiter)
+{
+  struct substring token;
+
+  if (ss_tokenize (text->buffer, delimiters, &text->pos, &token))
+    {
+      char *start = ss_data (token);
+      char *end = &start[ss_length (token)];
+
+      if (delimiter != NULL)
+        *delimiter = *end;
+      *end = '\0';
+      return start;
+    }
+
+  return NULL;
+}
+
+
+/* Reads an integer value expressed in decimal, then a space, then a string that
+ consists of exactly as many bytes as specified by the integer, then a space,
+ from TEXT. Returns the string, null-terminated, as a subset of TEXT's
+ buffer (so the caller should not free the string). */
+static const char *
+text_parse_counted_string (struct sfm_reader *r, struct text_record *text)
{
- int max_warnings = 5;
-
- for (;;)
+ size_t start;
+ size_t n;
+ char *s;
+
+ start = text->pos;
+ n = 0;
+ for (;;)
+ {
+ int c = text->buffer.string[text->pos];
+ if (c < '0' || c > '9')
+ break;
+ n = (n * 10) + (c - '0');
+ text->pos++;
+ }
+ if (start == text->pos)
{
- struct substring short_name_ss, value_ss;
+ sys_warn (r, text->start,
+ _("Expecting digit at offset %zu in MRSETS record."),
+ text->pos);
+ return NULL;
+ }
- if (!ss_tokenize (map->buffer, ss_cstr ("="), &map->pos, &short_name_ss)
- || !ss_tokenize (map->buffer, ss_buffer ("\t\0", 2), &map->pos,
- &value_ss))
- {
- if (*warning_cnt > max_warnings)
- sys_warn (r, _("Suppressed %d additional variable map warnings."),
- *warning_cnt - max_warnings);
- return false;
- }
-
- map->pos += ss_span (ss_substr (map->buffer, map->pos, SIZE_MAX),
- ss_buffer ("\t\0", 2));
+ if (!text_match (text, ' '))
+ {
+ sys_warn (r, text->start,
+ _("Expecting space at offset %zu in MRSETS record."),
+ text->pos);
+ return NULL;
+ }
- ss_data (short_name_ss)[ss_length (short_name_ss)] = '\0';
- *var = lookup_var_by_short_name (dict, ss_data (short_name_ss));
- if (*var == NULL)
- {
- if (++*warning_cnt <= 5)
- sys_warn (r, _("Variable map refers to unknown variable %s."),
- ss_data (short_name_ss));
- continue;
- }
+ if (text->pos + n > text->buffer.length)
+ {
+ sys_warn (r, text->start,
+ _("%zu-byte string starting at offset %zu "
+ "exceeds record length %zu."),
+ n, text->pos, text->buffer.length);
+ return NULL;
+ }
- ss_data (value_ss)[ss_length (value_ss)] = '\0';
- *value = ss_data (value_ss);
+ s = &text->buffer.string[text->pos];
+ if (s[n] != ' ')
+ {
+ sys_warn (r, text->start,
+ _("Expecting space at offset %zu following %zu-byte string."),
+ text->pos + n, n);
+ return NULL;
+ }
+ s[n] = '\0';
+ text->pos += n + 1;
+ return s;
+}
+/* If the byte at TEXT's current position is C, advances past it and
+   returns true.  Otherwise leaves the position unchanged and returns
+   false.
+
+   Also returns false when the current position is at or beyond the
+   end of the buffer, so a truncated or malformed record cannot
+   cause a read past the end of the buffer. */
+static bool
+text_match (struct text_record *text, char c)
+{
+  /* Never inspect bytes outside the record contents. */
+  if (text->pos >= text->buffer.length)
+    return false;
+
+  if (text->buffer.string[text->pos] == c)
+    {
+      text->pos++;
+      return true;
+    }
+  else
+    return false;
+}
+
+/* Returns the current byte offset (as converted to UTF-8, if it was converted)
+ inside the TEXT's string. */
+static size_t
+text_pos (const struct text_record *text)
+{
+ return text->pos;
}
\f
/* Messages. */
/* Displays a corruption message. */
static void
-sys_msg (struct sfm_reader *r, int class, const char *format, va_list args)
+sys_msg (struct sfm_reader *r, off_t offset, /* OFFSET: position to report; negative means none. */
+ int class, const char *format, va_list args)
{
struct msg m;
struct string text;
ds_init_empty (&text);
- ds_put_format (&text, "\"%s\" near offset 0x%lx: ",
- fh_get_file_name (r->fh), (unsigned long) ftell (r->file));
+ if (offset >= 0) /* A negative OFFSET selects the prefix without "near offset". */
+ ds_put_format (&text, _("`%s' near offset 0x%llx: "),
+ fh_get_file_name (r->fh), (long long int) offset);
+ else
+ ds_put_format (&text, _("`%s': "), fh_get_file_name (r->fh));
ds_put_vformat (&text, format, args);
m.category = msg_class_to_category (class);
m.severity = msg_class_to_severity (class);
- m.where.file_name = NULL;
- m.where.line_number = 0;
+ m.file_name = NULL; /* No location is recorded in M: file name NULL, line and column fields zero. */
+ m.first_line = 0;
+ m.last_line = 0;
+ m.first_column = 0;
+ m.last_column = 0;
m.text = ds_cstr (&text);
msg_emit (&m);
}
-/* Displays a warning for the current file position. */
+/* Displays a warning for offset OFFSET in the file.
+ FORMAT and the variable arguments following it are collected into a
+ va_list and forwarded, together with OFFSET and message class MW, to
+ sys_msg(), which builds and emits the message. */
static void
-sys_warn (struct sfm_reader *r, const char *format, ...)
+sys_warn (struct sfm_reader *r, off_t offset, const char *format, ...)
{
va_list args;
-
+
va_start (args, format);
- sys_msg (r, MW, format, args);
+ sys_msg (r, offset, MW, format, args);
va_end (args);
}
marks it as in an error state,
and aborts reading it using longjmp. */
static void
-sys_error (struct sfm_reader *r, const char *format, ...)
+sys_error (struct sfm_reader *r, off_t offset, const char *format, ...)
{
va_list args;
-
+
va_start (args, format);
- sys_msg (r, ME, format, args);
+ sys_msg (r, offset, ME, format, args);
va_end (args);
r->error = true;
void *buf, size_t byte_cnt)
{
size_t bytes_read = fread (buf, 1, byte_cnt, r->file);
+ r->pos += bytes_read;
if (bytes_read == byte_cnt)
return true;
else if (ferror (r->file))
- sys_error (r, _("System error: %s."), strerror (errno));
+ sys_error (r, r->pos, _("System error: %s."), strerror (errno));
else if (!eof_is_ok || bytes_read != 0)
- sys_error (r, _("Unexpected end of file."));
+ sys_error (r, r->pos, _("Unexpected end of file."));
else
return false;
}
/* Reads a 32-bit signed integer from R and returns its value in
host format. */
-static int32_t
-read_int32 (struct sfm_reader *r)
+static int
+read_int (struct sfm_reader *r)
{
- uint8_t int32[4];
- read_bytes (r, int32, sizeof int32);
- return int32_to_native (r, int32);
+ uint8_t integer[4];
+ read_bytes (r, integer, sizeof integer); /* The result of read_bytes() is not examined here. */
+ return integer_get (r->integer_format, integer, sizeof integer); /* Decode the 4 raw bytes according to R's integer_format. */
}
/* Reads a 64-bit floating-point number from R and returns its
value in host format. */
static double
-read_flt64 (struct sfm_reader *r)
+read_float (struct sfm_reader *r)
+{
+ uint8_t number[8];
+ read_bytes (r, number, sizeof number); /* The result of read_bytes() is not examined here. */
+ return float_get_double (r->float_format, number); /* Decode the 8 raw bytes according to R's float_format. */
+}
+
+/* Returns the integer stored in the 4 bytes that start OFS bytes into DATA,
+ decoded with integer_get() according to R's integer_format. Unlike
+ read_int(), this reads from memory the caller already holds, not from
+ R's file. The caller must ensure that DATA has at least OFS + 4 bytes;
+ no bounds check is made here. */
+static int
+parse_int (struct sfm_reader *r, const void *data, size_t ofs)
+{
+ return integer_get (r->integer_format, (const uint8_t *) data + ofs, 4);
+}
+
+/* Returns the floating-point number that starts OFS bytes into DATA,
+ decoded with float_get_double() according to R's float_format. This
+ reads from memory the caller already holds, not from R's file.
+ NOTE(review): presumably 8 bytes are consumed, matching read_float();
+ confirm against float_get_double(). No bounds check is made here. */
+static double
+parse_float (struct sfm_reader *r, const void *data, size_t ofs)
{
- uint8_t flt64[8];
- read_bytes (r, flt64, sizeof flt64);
- return flt64_to_double (r, flt64);
+ return float_get_double (r->float_format, (const uint8_t *) data + ofs);
}
/* Reads exactly SIZE - 1 bytes into BUFFER
and stores a null byte into BUFFER[SIZE - 1]. */
static void
-read_string (struct sfm_reader *r, char *buffer, size_t size)
+read_string (struct sfm_reader *r, char *buffer, size_t size)
{
assert (size > 0);
read_bytes (r, buffer, size - 1);
static void
skip_bytes (struct sfm_reader *r, size_t bytes)
{
- while (bytes > 0)
+ while (bytes > 0)
{
char buffer[1024];
size_t chunk = MIN (sizeof buffer, bytes);
}
}
\f
-/* Returns the value of the 32-bit signed integer at INT32,
- converted from the format used by R to the host format. */
-static int32_t
-int32_to_native (const struct sfm_reader *r, const uint8_t int32[4])
-{
- int32_t x;
- if (r->integer_format == INTEGER_NATIVE)
- memcpy (&x, int32, sizeof x);
- else
- x = integer_get (r->integer_format, int32, sizeof x);
- return x;
-}
-
-/* Returns the value of the 64-bit floating point number at
- FLT64, converted from the format used by R to the host
- format. */
-static double
-flt64_to_double (const struct sfm_reader *r, const uint8_t flt64[8])
-{
- double x;
- if (r->float_format == FLOAT_NATIVE_DOUBLE)
- memcpy (&x, flt64, sizeof x);
- else
- float_convert (r->float_format, flt64, FLOAT_NATIVE_DOUBLE, &x);
- return x;
-}
-\f
-static struct casereader_class sys_file_casereader_class =
+static const struct casereader_class sys_file_casereader_class =
{
sys_file_casereader_read,
sys_file_casereader_destroy,