/* Fields from the top-level header record. */
struct sfm_header_record
{
+ char magic[5]; /* First 4 bytes of file, then null. */
int weight_idx; /* 0 if unweighted, otherwise a var index. */
int nominal_case_size; /* Number of var positions. */
static const char *choose_encoding (
struct sfm_reader *,
+ const struct sfm_header_record *,
const struct sfm_extension_record *ext_integer,
const struct sfm_extension_record *ext_encoding);
/* Opens the system file designated by file handle FH for reading. Reads the
system file's dictionary into *DICT.
+ Ordinarily the reader attempts to automatically detect the character
+ encoding based on the file's contents. This isn't always possible,
+ especially for files written by old versions of SPSS or PSPP, so specifying
+ a nonnull ENCODING overrides the choice of character encoding.
+
If INFO is non-null, then it receives additional info about the system file,
which the caller must eventually free with sfm_read_info_destroy() when it
is no longer needed. */
struct casereader *
-sfm_open_reader (struct file_handle *fh, struct dictionary **dictp,
- struct sfm_read_info *infop)
+sfm_open_reader (struct file_handle *fh, const char *volatile encoding,
+ struct dictionary **dictp, struct sfm_read_info *infop)
{
struct sfm_reader *volatile r = NULL;
struct sfm_read_info info;
First, figure out the correct character encoding, because this determines
how the rest of the header data is to be interpreted. */
- dict = dict_create (choose_encoding (r, extensions[EXT_INTEGER],
- extensions[EXT_ENCODING]));
+ dict = dict_create (encoding
+ ? encoding
+ : choose_encoding (r, &header, extensions[EXT_INTEGER],
+ extensions[EXT_ENCODING]));
r->encoding = dict_get_encoding (dict);
/* These records don't use variables at all. */
bool
sfm_detect (FILE *file)
{
- char rec_type[5];
+ char magic[5]; /* Holds the 4 bytes read from FILE plus a terminating null. */
- if (fread (rec_type, 4, 1, file) != 1)
+ if (fread (magic, 4, 1, file) != 1) /* Not a match if 4 bytes cannot be read. */
return false;
- rec_type[4] = '\0';
+ magic[4] = '\0'; /* Terminate so the buffer can be compared with strcmp(). */
- return !strcmp ("$FL2", rec_type);
+ return !strcmp (ASCII_MAGIC, magic) || !strcmp (EBCDIC_MAGIC, magic); /* True if the bytes equal either magic string. */
}
\f
/* Reads the global header of the system file. Initializes *HEADER and *INFO,
read_header (struct sfm_reader *r, struct sfm_read_info *info,
struct sfm_header_record *header)
{
- char rec_type[5];
uint8_t raw_layout_code[4];
uint8_t raw_bias[8];
- read_string (r, rec_type, sizeof rec_type);
+ read_string (r, header->magic, sizeof header->magic);
read_string (r, header->eye_catcher, sizeof header->eye_catcher);
- if (strcmp ("$FL2", rec_type) != 0)
+ if (strcmp (ASCII_MAGIC, header->magic)
+ && strcmp (EBCDIC_MAGIC, header->magic))
sys_error (r, 0, _("This is not an SPSS system file."));
/* Identify integer format. */
static const char *
choose_encoding (struct sfm_reader *r,
+ const struct sfm_header_record *header,
const struct sfm_extension_record *ext_integer,
const struct sfm_extension_record *ext_encoding)
{
}
}
+ /* If the file magic number is EBCDIC then its character data is too. */
+ if (!strcmp (header->magic, EBCDIC_MAGIC))
+ return "EBCDIC-US";
+
return locale_charset ();
}
start = text->pos;
n = 0;
- for (;;)
+ while (text->pos < text->buffer.length)
{
int c = text->buffer.string[text->pos];
if (c < '0' || c > '9')
n = (n * 10) + (c - '0');
text->pos++;
}
- if (start == text->pos)
+ if (text->pos >= text->buffer.length || start == text->pos)
{
sys_warn (r, text->start,
_("Expecting digit at offset %zu in MRSETS record."),