/* PSPP - a program for statistical analysis.
- Copyright (C) 1997-2000, 2006-2007, 2009-2011 Free Software Foundation, Inc.
+ Copyright (C) 1997-2000, 2006-2007, 2009-2013 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
#include "libpspp/str.h"
#include "libpspp/stringi-set.h"
+#include "gl/c-strtod.h"
#include "gl/c-ctype.h"
#include "gl/inttostr.h"
#include "gl/localcharset.h"
EXT_VAR_ATTRS = 18, /* Variable attributes. */
EXT_MRSETS2 = 19, /* Multiple response sets (extended). */
EXT_ENCODING = 20, /* Character encoding. */
- EXT_LONG_LABELS = 21 /* Value labels for long strings. */
+ EXT_LONG_LABELS = 21, /* Value labels for long strings. */
+ EXT_DATAVIEW = 24 /* "Format properties in dataview table". */
+ };
+
+/* Fields from the top-level header record, as filled in by read_header().
+ The string members are kept exactly as read; parse_header() converts them
+ to UTF-8 once the file's encoding has been chosen. */
+struct sfm_header_record
+ {
+ char magic[5]; /* First 4 bytes of file, then null. */
+ int weight_idx; /* 0 if unweighted, otherwise a var index. */
+ int nominal_case_size; /* Number of var positions, or -1 if the claimed value is out of range. */
+
+ /* These correspond to the members of struct sfm_read_info or a dictionary
+ but in the system file's encoding rather than UTF-8. */
+ char creation_date[10]; /* "dd mmm yy". */
+ char creation_time[9]; /* "hh:mm:ss". */
+ char eye_catcher[61]; /* Eye-catcher string, then product name. */
+ char file_label[65]; /* File label. */
};
struct sfm_var_record
static const char *choose_encoding (
struct sfm_reader *,
+ const struct sfm_header_record *,
const struct sfm_extension_record *ext_integer,
const struct sfm_extension_record *ext_encoding);
WRITE_FORMAT
};
-static void read_header (struct sfm_reader *, int *weight_idx,
- int *claimed_oct_cnt, struct sfm_read_info *,
- char **file_labelp);
-static void parse_file_label (struct sfm_reader *, const char *file_label,
- struct dictionary *);
+static void read_header (struct sfm_reader *, struct sfm_read_info *,
+ struct sfm_header_record *);
+static void parse_header (struct sfm_reader *,
+ const struct sfm_header_record *,
+ struct sfm_read_info *, struct dictionary *);
static void parse_variable_records (struct sfm_reader *, struct dictionary *,
struct sfm_var_record *, size_t n);
static void parse_format_spec (struct sfm_reader *, off_t pos,
static void parse_variable_attributes (struct sfm_reader *,
const struct sfm_extension_record *,
struct dictionary *);
+static void assign_variable_roles (struct sfm_reader *, struct dictionary *);
static void parse_long_string_value_labels (struct sfm_reader *,
const struct sfm_extension_record *,
struct dictionary *);
-/* Opens the system file designated by file handle FH for
- reading. Reads the system file's dictionary into *DICT.
- If INFO is non-null, then it receives additional info about the
- system file. */
+/* Frees the strings inside INFO and resets them to null, so that destroying
+   the same INFO a second time is harmless.  Does not free INFO itself.
+   Does nothing if INFO is null. */
+void
+sfm_read_info_destroy (struct sfm_read_info *info)
+{
+  if (info == NULL)
+    return;
+
+  /* Null out each pointer after freeing it: INFO usually outlives this call
+     (it belongs to the caller), and a dangling pointer left behind would
+     turn a repeated destroy into a double free. */
+  free (info->creation_date);
+  info->creation_date = NULL;
+
+  free (info->creation_time);
+  info->creation_time = NULL;
+
+  free (info->product);
+  info->product = NULL;
+}
+
+/* Opens the system file designated by file handle FH for reading. Reads the
+ system file's dictionary into *DICT.
+
+ Ordinarily the reader attempts to automatically detect the character
+ encoding based on the file's contents. This isn't always possible,
+ especially for files written by old versions of SPSS or PSPP, so specifying
+ a nonnull ENCODING overrides the choice of character encoding.
+
+ If INFO is non-null, then it receives additional info about the system file,
+ which the caller must eventually free with sfm_read_info_destroy() when it
+ is no longer needed. */
struct casereader *
-sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
- struct sfm_read_info *volatile info)
+sfm_open_reader (struct file_handle *fh, const char *volatile encoding,
+ struct dictionary **dictp, struct sfm_read_info *infop)
{
struct sfm_reader *volatile r = NULL;
- struct sfm_read_info local_info;
+ struct sfm_read_info *volatile info;
+
+ struct sfm_header_record header;
struct sfm_var_record *vars;
size_t n_vars, allocated_vars;
struct sfm_extension_record *extensions[32];
- int weight_idx;
- int claimed_oct_cnt;
- char *file_label;
-
- struct dictionary *dict = NULL;
+ struct dictionary *volatile dict = NULL;
size_t i;
/* Create and initialize reader. */
r->opcode_idx = sizeof r->opcodes;
r->corruption_warning = false;
+ info = infop ? infop : xmalloc (sizeof *info);
+ memset (info, 0, sizeof *info);
+
/* TRANSLATORS: this fragment will be interpolated into
messages in fh_lock() that identify types of files. */
r->lock = fh_lock (fh, FH_REF_FILE, N_("system file"), FH_ACC_READ, false);
goto error;
}
- /* Initialize info. */
- if (info == NULL)
- info = &local_info;
- memset (info, 0, sizeof *info);
-
if (setjmp (r->bail_out))
goto error;
/* Read header. */
- read_header (r, &weight_idx, &claimed_oct_cnt, info, &file_label);
+ read_header (r, info, &header);
vars = NULL;
n_vars = allocated_vars = 0;
First, figure out the correct character encoding, because this determines
how the rest of the header data is to be interpreted. */
- dict = dict_create (choose_encoding (r, extensions[EXT_INTEGER],
- extensions[EXT_ENCODING]));
+ dict = dict_create (encoding
+ ? encoding
+ : choose_encoding (r, &header, extensions[EXT_INTEGER],
+ extensions[EXT_ENCODING]));
r->encoding = dict_get_encoding (dict);
/* These records don't use variables at all. */
if (extensions[EXT_FILE_ATTRS] != NULL)
parse_data_file_attributes (r, extensions[EXT_FILE_ATTRS], dict);
- parse_file_label (r, file_label, dict);
+ parse_header (r, &header, info, dict);
/* Parse the variable records, the basis of almost everything else. */
parse_variable_records (r, dict, vars, n_vars);
before those indexes become invalidated by very long string variables. */
for (i = 0; i < n_labels; i++)
parse_value_labels (r, dict, vars, n_vars, &labels[i]);
- if (weight_idx != 0)
+ if (header.weight_idx != 0)
{
struct variable *weight_var;
- weight_var = lookup_var_by_index (r, 76, vars, n_vars, weight_idx);
+ weight_var = lookup_var_by_index (r, 76, vars, n_vars,
+ header.weight_idx);
if (var_is_numeric (weight_var))
dict_set_weight (dict, weight_var);
else
/* The following records use long names, so they need to follow renaming. */
if (extensions[EXT_VAR_ATTRS] != NULL)
- parse_variable_attributes (r, extensions[EXT_VAR_ATTRS], dict);
+ {
+ parse_variable_attributes (r, extensions[EXT_VAR_ATTRS], dict);
+
+ /* Roles use the $@Role attribute. */
+ assign_variable_roles (r, dict);
+ }
if (extensions[EXT_LONG_LABELS] != NULL)
parse_long_string_value_labels (r, extensions[EXT_LONG_LABELS], dict);
amount that the header claims. SPSS version 13 gets this
wrong when very long strings are involved, so don't warn in
that case. */
- if (claimed_oct_cnt != -1 && claimed_oct_cnt != n_vars
+ if (header.nominal_case_size != -1 && header.nominal_case_size != n_vars
&& info->version_major != 13)
sys_warn (r, -1, _("File header claims %d variable positions but "
"%zu were read from file."),
- claimed_oct_cnt, n_vars);
+ header.nominal_case_size, n_vars);
/* Create an index of dictionary variable widths for
sfm_read_case to use. We cannot use the `struct variable's
r->proto = caseproto_ref_pool (dict_get_proto (dict), r->pool);
*dictp = dict;
+ if (infop != info)
+ {
+ sfm_read_info_destroy (info);
+ free (info);
+ }
+
return casereader_create_sequential
(NULL, r->proto,
r->case_cnt == -1 ? CASENUMBER_MAX: r->case_cnt,
&sys_file_casereader_class, r);
error:
+ if (infop != info)
+ {
+ sfm_read_info_destroy (info);
+ free (info);
+ }
+
close_reader (r);
dict_destroy (dict);
*dictp = NULL;
bool
sfm_detect (FILE *file)
{
- char rec_type[5];
+ char magic[5]; /* First 4 bytes of FILE, then null. */
- if (fread (rec_type, 4, 1, file) != 1)
+ if (fread (magic, 4, 1, file) != 1) /* Fewer than 4 bytes could be read. */
return false;
- rec_type[4] = '\0';
+ magic[4] = '\0'; /* Terminate so that strcmp() can be used below. */
- return !strcmp ("$FL2", rec_type);
+ return !strcmp (ASCII_MAGIC, magic) || !strcmp (EBCDIC_MAGIC, magic); /* Either magic number matches. */
}
\f
-/* Reads the global header of the system file. Sets *WEIGHT_IDX to 0 if the
- system file is unweighted, or to the value index of the weight variable
- otherwise. Sets *CLAIMED_OCT_CNT to the number of "octs" (8-byte units) per
- case that the file claims to have (although it is not always correct).
- Initializes INFO with header information. Stores the file label as a string
- in dictionary encoding into *FILE_LABELP. */
+/* Reads the global header of the system file. Initializes *HEADER and *INFO,
+ except for the string fields in *INFO, which parse_header() will initialize
+ later once the file's encoding is known. */
static void
-read_header (struct sfm_reader *r, int *weight_idx,
- int *claimed_oct_cnt, struct sfm_read_info *info,
- char **file_labelp)
+read_header (struct sfm_reader *r, struct sfm_read_info *info,
+ struct sfm_header_record *header)
{
- char rec_type[5];
- char eye_catcher[61];
uint8_t raw_layout_code[4];
uint8_t raw_bias[8];
- char creation_date[10];
- char creation_time[9];
- char file_label[65];
- struct substring product;
- read_string (r, rec_type, sizeof rec_type);
- read_string (r, eye_catcher, sizeof eye_catcher);
+ read_string (r, header->magic, sizeof header->magic);
+ read_string (r, header->eye_catcher, sizeof header->eye_catcher);
- if (strcmp ("$FL2", rec_type) != 0)
+ if (strcmp (ASCII_MAGIC, header->magic)
+ && strcmp (EBCDIC_MAGIC, header->magic))
sys_error (r, 0, _("This is not an SPSS system file."));
/* Identify integer format. */
&& r->integer_format != INTEGER_LSB_FIRST))
sys_error (r, 64, _("This is not an SPSS system file."));
- *claimed_oct_cnt = read_int (r);
- if (*claimed_oct_cnt < 0 || *claimed_oct_cnt > INT_MAX / 16)
- *claimed_oct_cnt = -1;
+ header->nominal_case_size = read_int (r);
+ if (header->nominal_case_size < 0
+ || header->nominal_case_size > INT_MAX / 16)
+ header->nominal_case_size = -1;
r->compressed = read_int (r) != 0;
- *weight_idx = read_int (r);
+ header->weight_idx = read_int (r);
r->case_cnt = read_int (r);
if ( r->case_cnt > INT_MAX / 2)
}
float_convert (r->float_format, raw_bias, FLOAT_NATIVE_DOUBLE, &r->bias);
- read_string (r, creation_date, sizeof creation_date);
- read_string (r, creation_time, sizeof creation_time);
- read_string (r, file_label, sizeof file_label);
+ read_string (r, header->creation_date, sizeof header->creation_date);
+ read_string (r, header->creation_time, sizeof header->creation_time);
+ read_string (r, header->file_label, sizeof header->file_label);
skip_bytes (r, 3);
- strcpy (info->creation_date, creation_date);
- strcpy (info->creation_time, creation_time);
info->integer_format = r->integer_format;
info->float_format = r->float_format;
info->compressed = r->compressed;
info->case_cnt = r->case_cnt;
-
- product = ss_cstr (eye_catcher);
- ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
- ss_trim (&product, ss_cstr (" "));
- str_copy_buf_trunc (info->product, sizeof info->product,
- ss_data (product), ss_length (product));
-
- *file_labelp = pool_strdup0 (r->pool, file_label, sizeof file_label - 1);
}
/* Reads a variable (type 2) record from R into RECORD. */
/* Read up to MAX_LABEL_LEN bytes of label. */
read_len = MIN (MAX_LABEL_LEN, len);
- record->label = xmalloc (read_len + 1);
+ record->label = pool_malloc (r->pool, read_len + 1);
read_string (r, record->label, read_len + 1);
/* Skip unread label bytes. */
{ EXT_VAR_SETS, 0, 0 },
{ EXT_DATE, 0, 0 },
{ EXT_DATA_ENTRY, 0, 0 },
+ { EXT_DATAVIEW, 0, 0 },
};
const struct extension_record_type *type;
}
static void
-parse_file_label (struct sfm_reader *r, const char *file_label,
- struct dictionary *dict)
+parse_header (struct sfm_reader *r, const struct sfm_header_record *header,
+ struct sfm_read_info *info, struct dictionary *dict)
{
- char *utf8_file_label;
- size_t file_label_len;
-
- utf8_file_label = recode_string_pool ("UTF-8", dict_get_encoding (dict),
- file_label, -1, r->pool);
- file_label_len = strlen (utf8_file_label);
- while (file_label_len > 0 && utf8_file_label[file_label_len - 1] == ' ')
- file_label_len--;
- utf8_file_label[file_label_len] = '\0';
- dict_set_label (dict, utf8_file_label);
+ const char *dict_encoding = dict_get_encoding (dict); /* Source encoding for the conversions below. */
+ struct substring product;
+ struct substring label;
+
+ /* Convert file label to UTF-8, trim spaces from it, and put it into DICT.
+ The conversion is handed R's pool, so nothing is freed here. */
+ label = recode_substring_pool ("UTF-8", dict_encoding,
+ ss_cstr (header->file_label), r->pool);
+ ss_trim (&label, ss_cstr (" "));
+ label.string[label.length] = '\0'; /* Terminate at the trimmed length for use as a C string. */
+ dict_set_label (dict, label.string);
+
+ /* Put creation date and time in UTF-8 into INFO. sfm_read_info_destroy()
+ frees both strings. */
+ info->creation_date = recode_string ("UTF-8", dict_encoding,
+ header->creation_date, -1);
+ info->creation_time = recode_string ("UTF-8", dict_encoding,
+ header->creation_time, -1);
+
+ /* Put product name into INFO, dropping eye-catcher string if present.
+ The copy stored in INFO is freed by sfm_read_info_destroy(). */
+ product = recode_substring_pool ("UTF-8", dict_encoding,
+ ss_cstr (header->eye_catcher), r->pool);
+ ss_match_string (&product, ss_cstr ("@(#) SPSS DATA FILE"));
+ ss_trim (&product, ss_cstr (" "));
+ info->product = ss_xstrdup (product);
}
/* Reads a variable (type 2) record from R and adds the
{
double low = parse_float (r, rec->missing, 0);
double high = parse_float (r, rec->missing, 8);
+
+ /* Deal with SPSS 21 change in representation. */
+ if (low == SYSMIS)
+ low = LOWEST;
+
mv_add_range (&mv, low, high);
ofs += 16;
}
static const char *
choose_encoding (struct sfm_reader *r,
+ const struct sfm_header_record *header,
const struct sfm_extension_record *ext_integer,
const struct sfm_extension_record *ext_encoding)
{
}
}
+ /* If the file magic number is EBCDIC then its character data is too. */
+ if (!strcmp (header->magic, EBCDIC_MAGIC))
+ return "EBCDIC-US";
+
return locale_charset ();
}
double lowest = parse_float (r, record->data, 16);
if (sysmis != SYSMIS)
- sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
- sysmis, "SYSMIS");
+ sys_warn (r, record->pos,
+ _("File specifies unexpected value %g (%a) as %s, "
+ "instead of %g (%a)."),
+ sysmis, sysmis, "SYSMIS", SYSMIS, SYSMIS);
if (highest != HIGHEST)
- sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
- highest, "HIGHEST");
-
- if (lowest != LOWEST)
- sys_warn (r, record->pos, _("File specifies unexpected value %g as %s."),
- lowest, "LOWEST");
+ sys_warn (r, record->pos,
+ _("File specifies unexpected value %g (%a) as %s, "
+ "instead of %g (%a)."),
+ highest, highest, "HIGHEST", HIGHEST, HIGHEST);
+
+ /* SPSS before version 21 used a unique value just bigger than SYSMIS as
+ LOWEST. SPSS 21 uses SYSMIS for LOWEST, which is OK because LOWEST only
+ appears in a context (missing values) where SYSMIS cannot. */
+ if (lowest != LOWEST && lowest != SYSMIS)
+ sys_warn (r, record->pos,
+ _("File specifies unexpected value %g (%a) as %s, "
+ "instead of %g (%a) or %g (%a)."),
+ lowest, lowest, "LOWEST", LOWEST, LOWEST, SYSMIS, SYSMIS);
}
/* Parses record type 7, subtype 7 or 19. */
_("MRSET %s has only %zu variables."), mrset->name,
mrset->n_vars);
mrset_destroy (mrset);
+ stringi_set_destroy (&var_names);
continue;
}
mrset->width = width;
value_init (&mrset->counted, width);
if (width == 0)
- mrset->counted.f = strtod (counted, NULL);
+ mrset->counted.f = c_strtod (counted, NULL);
else
value_copy_str_rpad (&mrset->counted, width,
(const uint8_t *) counted, ' ');
align = parse_int (r, record->data, ofs);
ofs += 4;
- /* SPSS 14 sometimes seems to set string variables' measure
- to zero. */
- if (0 == measure && var_is_alpha (v))
+ /* SPSS sometimes seems to set variables' measure to zero. */
+ if (0 == measure)
measure = 1;
if (measure < 1 || measure > 3 || align < 0 || align > 2)
if (record == NULL)
{
- /* Convert variable names to lowercase. */
+ /* There are no long variable names. Use the short variable names,
+ converted to lowercase, as the long variable names. */
size_t i;
for (i = 0; i < dict_get_var_cnt (dict); i++)
struct variable *var = dict_get_var (dict, i);
char *new_name;
- new_name = xstrdup (var_get_name (var));
- str_lowercase (new_name);
-
+ new_name = utf8_to_lower (var_get_name (var));
rename_var_and_save_short_names (dict, var, new_name);
-
free (new_name);
}
while (read_variable_to_value_pair (r, dict, text, &var, &long_name))
{
/* Validate long name. */
- /* XXX need to reencode name to UTF-8 */
if (!dict_id_is_valid (dict, long_name, false))
{
sys_warn (r, record->pos,
}
/* Identify any duplicates. */
- if (strcasecmp (var_get_short_name (var, 0), long_name)
+ if (utf8_strcasecmp (var_get_short_name (var, 0), long_name)
&& dict_lookup_var (dict, long_name) != NULL)
{
sys_warn (r, record->pos,
close_text_record (r, text);
}
+/* Sets the role of each variable in DICT from the first value of its
+   "$@Role" attribute.  Variables without that attribute are left alone.
+
+   The value is parsed as a decimal integer: 0 through 5 select ROLE_NONE,
+   ROLE_INPUT, ROLE_OUTPUT, ROLE_BOTH, ROLE_PARTITION, and ROLE_SPLIT,
+   respectively.  Anything else, including text that does not start with a
+   number, is invalid: the variable gets ROLE_NONE, the first such variable
+   is named in a warning, and one further warning counts the rest. */
+static void
+assign_variable_roles (struct sfm_reader *r, struct dictionary *dict)
+{
+  size_t n_warnings = 0;
+  size_t i;
+
+  for (i = 0; i < dict_get_var_cnt (dict); i++)
+    {
+      struct variable *var = dict_get_var (dict, i);
+      struct attrset *attrs = var_get_attributes (var);
+      const struct attribute *attr = attrset_lookup (attrs, "$@Role");
+      const char *text;
+      long int value;
+      enum var_role role;
+
+      if (attr == NULL)
+        continue;
+
+      /* Use strtol() rather than atoi(): atoi() cannot tell "0" from text
+         that is not a number at all, and its behavior on overflow is
+         undefined.  Only the complete absence of digits is rejected here;
+         text following a number is still ignored, as it was with atoi().
+
+         NOTE(review): the null check assumes attribute_get_value() may
+         return null when the attribute has no values -- confirm. */
+      text = attribute_get_value (attr, 0);
+      value = -1;
+      if (text != NULL)
+        {
+          char *tail;
+          long int parsed = strtol (text, &tail, 10);
+          if (tail != text)
+            value = parsed;
+        }
+
+      switch (value)
+        {
+        case 0:
+          role = ROLE_NONE;
+          break;
+
+        case 1:
+          role = ROLE_INPUT;
+          break;
+
+        case 2:
+          role = ROLE_OUTPUT;
+          break;
+
+        case 3:
+          role = ROLE_BOTH;
+          break;
+
+        case 4:
+          role = ROLE_PARTITION;
+          break;
+
+        case 5:
+          role = ROLE_SPLIT;
+          break;
+
+        default:
+          /* Name only the first offender; the rest are summarized below. */
+          role = ROLE_NONE;
+          if (n_warnings++ == 0)
+            sys_warn (r, 0, _("Invalid role for variable %s."),
+                      var_get_name (var));
+          break;
+        }
+
+      var_set_role (var, role);
+    }
+
+  if (n_warnings > 1)
+    sys_warn (r, 0, _("%zu other variables had invalid roles."),
+              n_warnings - 1);
+}
+
static void
check_overflow (struct sfm_reader *r,
const struct sfm_extension_record *record,
start = text->pos;
n = 0;
- for (;;)
+ while (text->pos < text->buffer.length)
{
int c = text->buffer.string[text->pos];
if (c < '0' || c > '9')
n = (n * 10) + (c - '0');
text->pos++;
}
- if (start == text->pos)
+ if (text->pos >= text->buffer.length || start == text->pos)
{
sys_warn (r, text->start,
_("Expecting digit at offset %zu in MRSETS record."),