/* PSPP - a program for statistical analysis.
- Copyright (C) 1997-2000, 2006-2007, 2009-2011 Free Software Foundation, Inc.
+ Copyright (C) 1997-2000, 2006-2007, 2009-2012 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
EXT_LONG_LABELS = 21 /* Value labels for long strings. */
};
+/* Fields from the top-level header record, as stored by read_header() and
+ later consumed by choose_encoding(), parse_header(), and
+ sfm_open_reader(). */
+struct sfm_header_record
+ {
+ char magic[5]; /* First 4 bytes of file, then null. */
+ int weight_idx; /* 0 if unweighted, otherwise a var index. */
+ int nominal_case_size; /* Number of var positions claimed by the header,
+ or -1 if read_header() found the claimed value
+ to be negative or larger than INT_MAX / 16. */
+
+ /* These hold the raw text behind the string members of struct
+ sfm_read_info and the dictionary's file label, still in the system
+ file's encoding. parse_header() recodes them to UTF-8. */
+ char creation_date[10]; /* "dd mmm yy". */
+ char creation_time[9]; /* "hh:mm:ss". */
+ char eye_catcher[61]; /* Eye-catcher string, then product name. */
+ char file_label[65]; /* File label. */
+ };
+
struct sfm_var_record
{
off_t pos;
static const char *choose_encoding (
struct sfm_reader *,
+ const struct sfm_header_record *,
const struct sfm_extension_record *ext_integer,
const struct sfm_extension_record *ext_encoding);
WRITE_FORMAT
};
-static void read_header (struct sfm_reader *, int *weight_idx,
- int *claimed_oct_cnt, struct sfm_read_info *,
- char **file_labelp);
-static void parse_file_label (struct sfm_reader *, const char *file_label,
- struct dictionary *);
+static void read_header (struct sfm_reader *, struct sfm_read_info *,
+ struct sfm_header_record *);
+static void parse_header (struct sfm_reader *,
+ const struct sfm_header_record *,
+ struct sfm_read_info *, struct dictionary *);
static void parse_variable_records (struct sfm_reader *, struct dictionary *,
struct sfm_var_record *, size_t n);
static void parse_format_spec (struct sfm_reader *, off_t pos,
const struct sfm_extension_record *,
struct dictionary *);
-/* Opens the system file designated by file handle FH for
- reading. Reads the system file's dictionary into *DICT.
- If INFO is non-null, then it receives additional info about the
- system file. */
+/* Frees the strings inside INFO (its creation_date, creation_time, and
+ product members). Does not free INFO itself, and leaves the freed
+ pointers unchanged, so INFO must not be destroyed twice. Does nothing if
+ INFO is null. */
+void
+sfm_read_info_destroy (struct sfm_read_info *info)
+{
+ if (info)
+ {
+ free (info->creation_date);
+ free (info->creation_time);
+ free (info->product);
+ }
+}
+
+/* Opens the system file designated by file handle FH for reading. Reads the
+ system file's dictionary into *DICT.
+
+ Ordinarily the reader attempts to automatically detect the character
+ encoding based on the file's contents. This isn't always possible,
+ especially for files written by old versions of SPSS or PSPP, so specifying
+ a nonnull ENCODING overrides the choice of character encoding.
+
+ If INFOP is non-null, then *INFOP receives additional info about the system
+ file, which the caller must eventually free with sfm_read_info_destroy()
+ when it is no longer needed. */
struct casereader *
-sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
- struct sfm_read_info *volatile info)
+sfm_open_reader (struct file_handle *fh, const char *volatile encoding,
+ struct dictionary **dictp, struct sfm_read_info *infop)
{
struct sfm_reader *volatile r = NULL;
- struct sfm_read_info local_info;
+ struct sfm_read_info info;
+
+ struct sfm_header_record header;
struct sfm_var_record *vars;
size_t n_vars, allocated_vars;
struct sfm_extension_record *extensions[32];
- int weight_idx;
- int claimed_oct_cnt;
- char *file_label;
-
struct dictionary *dict = NULL;
size_t i;
r->opcode_idx = sizeof r->opcodes;
r->corruption_warning = false;
+ memset (&info, 0, sizeof info);
+
/* TRANSLATORS: this fragment will be interpolated into
messages in fh_lock() that identify types of files. */
r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
goto error;
}
- /* Initialize info. */
- if (info == NULL)
- info = &local_info;
- memset (info, 0, sizeof *info);
-
if (setjmp (r->bail_out))
goto error;
/* Read header. */
- read_header (r, &weight_idx, &claimed_oct_cnt, info, &file_label);
+ read_header (r, &info, &header);
vars = NULL;
n_vars = allocated_vars = 0;
First, figure out the correct character encoding, because this determines
how the rest of the header data is to be interpreted. */
- dict = dict_create (choose_encoding (r, extensions[EXT_INTEGER],
- extensions[EXT_ENCODING]));
+ dict = dict_create (encoding
+ ? encoding
+ : choose_encoding (r, &header, extensions[EXT_INTEGER],
+ extensions[EXT_ENCODING]));
r->encoding = dict_get_encoding (dict);
/* These records don't use variables at all. */
parse_document (dict, document);
if (extensions[EXT_INTEGER] != NULL)
- parse_machine_integer_info (r, extensions[EXT_INTEGER], info);
+ parse_machine_integer_info (r, extensions[EXT_INTEGER], &info);
if (extensions[EXT_FLOAT] != NULL)
parse_machine_float_info (r, extensions[EXT_FLOAT]);
if (extensions[EXT_FILE_ATTRS] != NULL)
parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict);
- parse_file_label (r, file_label, dict);
+ parse_header (r, &header, &info, dict);
/* Parse the variable records, the basis of almost everything else. */
parse_variable_records (r, dict, vars, n_vars);
before those indexes become invalidated by very long string variables. */
for (i = 0; i < n_labels; i++)
parse_value_labels (r, dict, vars, n_vars, &labels[i]);
- if (weight_idx != 0)
+ if (header.weight_idx != 0)
{
struct variable *weight_var;
- weight_var = lookup_var_by_index (r, 76, vars, n_vars, weight_idx);
+ weight_var = lookup_var_by_index (r, 76, vars, n_vars,
+ header.weight_idx);
if (var_is_numeric (weight_var))
dict_set_weight (dict, weight_var);
else
amount that the header claims. SPSS version 13 gets this
wrong when very long strings are involved, so don't warn in
that case. */
- if (claimed_oct_cnt != -1 && claimed_oct_cnt != n_vars
- && info->version_major != 13)
+ if (header.nominal_case_size != -1 && header.nominal_case_size != n_vars
+ && info.version_major != 13)
sys_warn (r, -1, _("File header claims %d variable positions but "
"%zu were read from file."),
- claimed_oct_cnt, n_vars);
+ header.nominal_case_size, n_vars);
/* Create an index of dictionary variable widths for
sfm_read_case to use. We cannot use the `struct variable's
r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
*dictp = dict;
+ if (infop)
+ *infop = info;
+ else
+ sfm_read_info_destroy (&info);
+
return casereader_create_sequential
(NULL, r->proto,
r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
&sys_file_casereader_class, r);
error:
+ sfm_read_info_destroy (&info);
close_reader (r);
dict_destroy (dict);
*dictp = NULL;
bool
sfm_detect (FILE *file)
{
- char rec_type[5];
+ char magic[5]; /* First 4 bytes of FILE, then a null terminator. */
- if (fread (rec_type, 4, 1, file) != 1)
+ if (fread (magic, 4, 1, file) != 1) /* Could not read one 4-byte item. */
return false;
- rec_type[4] = '\0';
+ magic[4] = '\0'; /* fread() adds no terminator, which the strcmp() calls
+ below require. */
- return !strcmp ("$FL2", rec_type);
+ return !strcmp (ASCII_MAGIC, magic) || !strcmp (EBCDIC_MAGIC, magic);
}
\f
-/* Reads the global header of the system file. Sets *WEIGHT_IDX to 0 if the
- system file is unweighted, or to the value index of the weight variable
- otherwise. Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units) per
- case that the file claims to have (although it is not always correct).
- Initializes INFO with header information. Stores the file label as a string
- in dictionary encoding into *FILE_LABELP. */
+/* Reads the global header of the system file. Initializes *HEADER and *INFO,
+ except for the string fields in *INFO, which parse_header() will initialize
+ later once the file's encoding is known. */
static void
-read_header (struct sfm_reader *r, int *weight_idx,
- int *claimed_oct_cnt, struct sfm_read_info *info,
- char **file_labelp)
+read_header (struct sfm_reader *r, struct sfm_read_info *info,
+ struct sfm_header_record *header)
{
- char rec_type[5];
- char eye_catcher[61];
uint8_t raw_layout_code[4];
uint8_t raw_bias[8];
- char creation_date[10];
- char creation_time[9];
- char file_label[65];
- struct substring product;
- read_string (r, rec_type, sizeof rec_type);
- read_string (r, eye_catcher, sizeof eye_catcher);
+ read_string (r, header->magic, sizeof header->magic);
+ read_string (r, header->eye_catcher, sizeof header->eye_catcher);
- if (strcmp ("$FL2", rec_type) != 0)
+ if (strcmp (ASCII_MAGIC, header->magic)
+ && strcmp (EBCDIC_MAGIC, header->magic))
sys_error (r, 0, _("This is not an SPSS system file."));
/* Identify integer format. */
&& r->integer_format != INTEGER_LSB_FIRST))
sys_error (r, 64, _("This is not an SPSS system file."));
- *claimed_oct_cnt = read_int (r);
- if (*claimed_oct_cnt < 0 || *claimed_oct_cnt > INT_MAX / 16)
- *claimed_oct_cnt = -1;
+ header->nominal_case_size = read_int (r);
+ if (header->nominal_case_size < 0
+ || header->nominal_case_size > INT_MAX / 16)
+ header->nominal_case_size = -1;
r->compressed = read_int (r) != 0;
- *weight_idx = read_int (r);
+ header->weight_idx = read_int (r);
r->case_cnt = read_int (r);
if ( r->case_cnt > INT_MAX / 2)
}
float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
- read_string (r, creation_date, sizeof creation_date);
- read_string (r, creation_time, sizeof creation_time);
- read_string (r, file_label, sizeof file_label);
+ read_string (r, header->creation_date, sizeof header->creation_date);
+ read_string (r, header->creation_time, sizeof header->creation_time);
+ read_string (r, header->file_label, sizeof header->file_label);
skip_bytes (r, 3);
- strcpy (info->creation_date, creation_date);
- strcpy (info->creation_time, creation_time);
info->integer_format = r->integer_format;
info->float_format = r->float_format;
info->compressed = r->compressed;
info->case_cnt = r->case_cnt;
-
- product = ss_cstr (eye_catcher);
- ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
- ss_trim (&product, ss_cstr (" "));
- str_copy_buf_trunc (info->product, sizeof info->product,
- ss_data (product), ss_length (product));
-
- *file_labelp = pool_strdup0 (r->pool, file_label, sizeof file_label - 1);
}
/* Reads a variable (type 2) record from R into RECORD. */
/* Read up to MAX_LABEL_LEN bytes of label. */
read_len = MIN (MAX_LABEL_LEN, len);
- record->label = xmalloc (read_len + 1);
+ record->label = pool_malloc (r->pool, read_len + 1);
read_string (r, record->label, read_len + 1);
/* Skip unread label bytes. */
}
static void
-parse_file_label (struct sfm_reader *r, const char *file_label,
- struct dictionary *dict)
+parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
+ struct sfm_read_info *info, struct dictionary *dict)
{
- char *utf8_file_label;
- size_t file_label_len;
-
- utf8_file_label = recode_string_pool ("UTF-8", dict_get_encoding (dict),
- file_label, -1, r->pool);
- file_label_len = strlen (utf8_file_label);
- while (file_label_len > 0 && utf8_file_label[file_label_len - 1] == ' ')
- file_label_len--;
- utf8_file_label[file_label_len] = '\0';
- dict_set_label (dict, utf8_file_label);
+ const char *dict_encoding = dict_get_encoding (dict);
+ struct substring product;
+ struct substring label;
+
+ /* Convert file label to UTF-8 and put it into DICT. The recoded label is
+ allocated from R's pool. After ss_trim() strips spaces, the label is
+ terminated at its trimmed length so that label.string can be handed to
+ dict_set_label() as a C string. */
+ label = recode_substring_pool ("UTF-8", dict_encoding,
+ ss_cstr (header->file_label), r->pool);
+ ss_trim (&label, ss_cstr (" "));
+ label.string[label.length] = '\0';
+ dict_set_label (dict, label.string);
+
+ /* Put creation date and time in UTF-8 into INFO. These strings are later
+ released with free() by sfm_read_info_destroy(). */
+ info->creation_date = recode_string ("UTF-8", dict_encoding,
+ header->creation_date, -1);
+ info->creation_time = recode_string ("UTF-8", dict_encoding,
+ header->creation_time, -1);
+
+ /* Put product name into INFO, dropping eye-catcher string if present.
+ The recoded text is a temporary allocated from R's pool; the copy stored
+ in INFO is later released with free() by sfm_read_info_destroy(). */
+ product = recode_substring_pool ("UTF-8", dict_encoding,
+ ss_cstr (header->eye_catcher), r->pool);
+ ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
+ ss_trim (&product, ss_cstr (" "));
+ info->product = ss_xstrdup (product);
}
/* Reads a variable (type 2) record from R and adds the
uint8_t w = format >> 8;
uint8_t d = format;
struct fmt_spec f;
-
bool ok;
- if (!fmt_from_io (raw_type, &f.type))
- sys_error (r, pos, _("Unknown variable format %"PRIu8"."), raw_type);
f.w = w;
f.d = d;
msg_disable ();
- ok = fmt_check_output (&f) && fmt_check_width_compat (&f, var_get_width (v));
+ ok = (fmt_from_io (raw_type, &f.type)
+ && fmt_check_output (&f)
+ && fmt_check_width_compat (&f, var_get_width (v)));
msg_enable ();
if (ok)
else
var_set_write_format (v, &f);
}
+ else if (format == 0)
+ {
+ /* Actually observed in the wild. No point in warning about it. */
+ }
else if (++*n_warnings <= max_warnings)
{
- char fmt_string[FMT_STRING_LEN_MAX + 1];
- sys_warn (r, pos, _("%s variable %s has invalid %s format %s."),
- var_is_numeric (v) ? _("Numeric") : _("String"),
- var_get_name (v),
- which == PRINT_FORMAT ? _("print") : _("write"),
- fmt_to_string (&f, fmt_string));
+ if (which == PRINT_FORMAT)
+ sys_warn (r, pos, _("Variable %s with width %d has invalid print "
+ "format 0x%x."),
+ var_get_name (v), var_get_width (v), format);
+ else
+ sys_warn (r, pos, _("Variable %s with width %d has invalid write "
+ "format 0x%x."),
+ var_get_name (v), var_get_width (v), format);
if (*n_warnings == max_warnings)
sys_warn (r, -1, _("Suppressing further invalid format warnings."));
static const char *
choose_encoding (struct sfm_reader *r,
+ const struct sfm_header_record *header,
const struct sfm_extension_record *ext_integer,
const struct sfm_extension_record *ext_encoding)
{
}
}
+ /* If the file magic number is EBCDIC then its character data is too. */
+ if (!strcmp (header->magic, EBCDIC_MAGIC))
+ return "EBCDIC-US";
+
return locale_charset ();
}
start = text->pos;
n = 0;
- for (;;)
+ while (text->pos < text->buffer.length)
{
int c = text->buffer.string[text->pos];
if (c < '0' || c > '9')
n = (n * 10) + (c - '0');
text->pos++;
}
- if (start == text->pos)
+ if (text->pos >= text->buffer.length || start == text->pos)
{
sys_warn (r, text->start,
_("Expecting digit at offset %zu in MRSETS record."),