/* PSPP - a program for statistical analysis.
- Copyright (C) 1997-2004, 2006 Free Software Foundation, Inc.
+ Copyright (C) 1997-2004, 2006, 2010 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
+#include <sys/stat.h>
#include <data/casereader.h>
#include <data/file-handle-def.h>
#include <language/lexer/lexer.h>
#include <language/prompt.h>
#include <libpspp/assertion.h>
+#include <libpspp/cast.h>
+#include <libpspp/integer-format.h>
#include <libpspp/message.h>
#include <libpspp/str.h>
#include "gettext.h"
#define _(msgid) gettext (msgid)
+#define N_(msgid) (msgid)
/* Flags for DFM readers. */
enum dfm_reader_flags
struct string scratch; /* Extra line buffer. */
enum dfm_reader_flags flags; /* Zero or more of DFM_*. */
FILE *file; /* Associated file. */
+ off_t file_size; /* File size, or -1 if unavailable. */
size_t pos; /* Offset in line of current character. */
unsigned eof_cnt; /* # of attempts to advance past EOF. */
struct lexer *lexer; /* The lexer reading the file */
+
+ /* For FH_MODE_360_VARIABLE and FH_MODE_360_SPANNED files only. */
+ size_t block_left; /* Bytes left in current block. */
};
/* Closes reader R opened by dfm_open_reader(). */
struct dfm_reader *r;
struct fh_lock *lock;
- lock = fh_lock (fh, FH_REF_FILE | FH_REF_INLINE, "data file",
+ /* TRANSLATORS: this fragment will be interpolated into
+ messages in fh_lock() that identify types of files. */
+ lock = fh_lock (fh, FH_REF_FILE | FH_REF_INLINE, N_("data file"),
FH_ACC_READ, false);
if (lock == NULL)
return NULL;
ds_init_empty (&r->scratch);
r->flags = DFM_ADVANCE;
r->eof_cnt = 0;
+ r->block_left = 0;
if (fh_get_referent (fh) != FH_REF_INLINE)
{
- r->where.file_name = fh_get_file_name (fh);
+ struct stat s;
+ r->where.file_name = CONST_CAST (char *, fh_get_file_name (fh));
r->where.line_number = 0;
r->file = fn_open (fh_get_file_name (fh), "rb");
if (r->file == NULL)
{
- msg (ME, _("Could not open \"%s\" for reading as a data file: %s."),
+ msg (ME, _("Could not open `%s' for reading as a data file: %s."),
fh_get_file_name (r->fh), strerror (errno));
fh_unlock (r->lock);
fh_unref (fh);
free (r);
return NULL;
}
+ r->file_size = fstat (fileno (r->file), &s) == 0 ? s.st_size : -1;
}
+ else
+ r->file_size = -1;
fh_lock_set_aux (lock, r);
return r;
if (!lex_get_line_raw (r->lexer))
{
+ lex_discard_line (r->lexer);
msg (SE, _("Unexpected end-of-file while reading data in BEGIN "
"DATA. This probably indicates "
- "a missing or misformatted END DATA command. "
+ "a missing or incorrectly formatted END DATA command. "
"END DATA must appear by itself on a single line "
"with exactly one space between words."));
return false;
return true;
}
+/* Emits a diagnostic for a failed read from R's file: the system
+   error text if the stream's error indicator is set, otherwise
+   an unexpected end-of-file message if its end-of-file indicator
+   is set.  Calling this when neither indicator is set is a
+   programming error. */
+static void
+read_error (struct dfm_reader *r)
+{
+  if (ferror (r->file))
+    {
+      msg (ME, _("Error reading file %s: %s."),
+           fh_get_name (r->fh), strerror (errno));
+      return;
+    }
+
+  if (feof (r->file))
+    {
+      msg (ME, _("Unexpected end of file reading %s."), fh_get_name (r->fh));
+      return;
+    }
+
+  NOT_REACHED ();
+}
+
+/* Emits an error saying that R's file ended in the middle of a
+   record. */
+static void
+partial_record (struct dfm_reader *r)
+{
+  const char *handle_name = fh_get_name (r->fh);
+
+  msg (ME, _("Unexpected end of file in partial record reading %s."),
+       handle_name);
+}
+
+/* Tries to read SIZE bytes from R into BUFFER.  Returns 1 if
+   successful, 0 if end of file was reached before any bytes
+   could be read, and -1 if a read error occurred or if end of
+   file was reached after some, but fewer than SIZE, bytes were
+   read.  Reports an error in the -1 case.
+
+   The stream's error indicator is checked before the byte count
+   is interpreted, so that an I/O failure is never mistaken for a
+   clean end of file (when no bytes arrived) or reported as a
+   truncated record (when some did). */
+static int
+try_to_read_fully (struct dfm_reader *r, void *buffer, size_t size)
+{
+  size_t bytes_read = fread (buffer, 1, size, r->file);
+  if (bytes_read == size)
+    return 1;
+  else if (ferror (r->file))
+    {
+      /* Report right away, while errno still reflects the failed
+         fread. */
+      read_error (r);
+      return -1;
+    }
+  else if (bytes_read == 0)
+    return 0;
+  else
+    {
+      partial_record (r);
+      return -1;
+    }
+}
+
+/* Type of a descriptor word.  Passed to read_descriptor_word() to
+ say which kind of descriptor it is reading. */
+enum descriptor_type
+ {
+ BLOCK, /* Block descriptor word (BDW). */
+ RECORD /* Record descriptor word (RDW). */
+ };
+
+/* Reads the 4-byte block descriptor word (BDW) or record
+   descriptor word (RDW) that comes next in R.  TYPE says which
+   one is expected; it is used only to word the error message.
+
+   Returns 1 if successful, 0 if end of file was reached before
+   any bytes could be read, or -1 if an error occurred, in which
+   case an error has been reported.
+
+   The first two bytes of the descriptor are taken as a
+   big-endian length that includes the descriptor itself.  On
+   success, that length less the descriptor's own 4 bytes is
+   stored into *REMAINING_SIZE.  A length below 4 is rejected as
+   corrupt.  If SEGMENT is nonnull, the third byte, the segment
+   control character (SCC), is stored into *SEGMENT. */
+static int
+read_descriptor_word (struct dfm_reader *r, enum descriptor_type type,
+                      size_t *remaining_size, int *segment)
+{
+  uint8_t dw[4];
+  size_t length;
+  int status = try_to_read_fully (r, dw, sizeof dw);
+
+  if (status <= 0)
+    return status;
+
+  length = (dw[0] << 8) | dw[1];
+  *remaining_size = length;
+  if (segment != NULL)
+    *segment = dw[2];
+
+  if (length >= 4)
+    {
+      *remaining_size = length - 4;
+      return 1;
+    }
+
+  msg (ME,
+       (type == BLOCK
+        ? _("Corrupt block descriptor word at offset 0x%lx in %s.")
+        : _("Corrupt record descriptor word at offset 0x%lx in %s.")),
+       (long) ftello (r->file) - 4, fh_get_name (r->fh));
+  return -1;
+}
+
+/* Emits an error saying that a record size read from R is
+   corrupt.  The offset in the message is 4 bytes before R's
+   current file position. */
+static void
+corrupt_size (struct dfm_reader *r)
+{
+  long offset = (long) ftello (r->file) - 4;
+
+  msg (ME, _("Corrupt record size at offset 0x%lx in %s."),
+       offset, fh_get_name (r->fh));
+}
+
+/* Reads a 32-bit (4-byte) little-endian signed number from R and
+ stores its value into *SIZE_OUT. Returns 1 if successful, 0 if end
+ of file was reached before any bytes could be read, -1 if an
+ error occurred. Reports an error in the latter case. Numbers
+ less than 0 are considered errors. *SIZE_OUT is written only
+ when 1 is returned. */
+static int
+read_size (struct dfm_reader *r, size_t *size_out)
+{
+ int32_t size;
+ int status;
+
+ status = try_to_read_fully (r, &size, sizeof size);
+ if (status <= 0)
+ return status;
+
+ integer_convert (INTEGER_LSB_FIRST, &size, INTEGER_NATIVE, &size,
+ sizeof size); /* In place: SIZE is both source and destination. */
+ if (size < 0)
+ {
+ corrupt_size (r);
+ return -1;
+ }
+
+ *size_out = size;
+ return 1;
+}
+
/* Reads a record from a disk file into R.
- Returns true if successful, false on failure. */
+ Returns true if successful, false on error or at end of file.
+ The record format is selected by the mode of R's file handle. */
static bool
read_file_record (struct dfm_reader *r)
{
assert (r->fh != fh_inline_file ());
+
ds_clear (&r->line);
- if (fh_get_mode (r->fh) == FH_MODE_TEXT)
+ switch (fh_get_mode (r->fh))
{
- if (!ds_read_line (&r->line, r->file))
+ case FH_MODE_TEXT:
+ if (ds_read_line (&r->line, r->file, SIZE_MAX))
+ {
+ ds_chomp (&r->line, '\n');
+ return true;
+ }
+ else
{
if (ferror (r->file))
- msg (ME, _("Error reading file %s: %s."),
- fh_get_name (r->fh), strerror (errno));
+ read_error (r);
return false;
}
- ds_chomp (&r->line, '\n');
- }
- else if (fh_get_mode (r->fh) == FH_MODE_BINARY)
- {
- size_t record_width = fh_get_record_width (r->fh);
- size_t amt = ds_read_stream (&r->line, 1, record_width, r->file);
- if (record_width != amt)
+ return true; /* Not reached: both branches above return. */
+
+ case FH_MODE_FIXED:
+ if (ds_read_stream (&r->line, 1, fh_get_record_width (r->fh), r->file))
+ return true;
+ else
{
if (ferror (r->file))
- msg (ME, _("Error reading file %s: %s."),
- fh_get_name (r->fh), strerror (errno));
- else if (amt != 0)
- msg (ME, _("%s: Partial record at end of file."),
- fh_get_name (r->fh));
+ read_error (r);
+ else if (!ds_is_empty (&r->line))
+ partial_record (r);
+ return false;
+ }
+ return true; /* Not reached: both branches above return. */
+ case FH_MODE_VARIABLE:
+ {
+ size_t leading_size;
+ size_t trailing_size;
+ int status;
+
+ /* Read leading record size. */
+ status = read_size (r, &leading_size);
+ if (status <= 0)
return false;
+
+ /* Read record data. */
+ if (!ds_read_stream (&r->line, leading_size, 1, r->file))
+ {
+ if (ferror (r->file))
+ read_error (r);
+ else
+ partial_record (r);
+ return false;
+ }
+
+ /* Read trailing record size and check that it's the same
+ as the leading record size. */
+ status = read_size (r, &trailing_size);
+ if (status <= 0)
+ {
+ if (status == 0)
+ partial_record (r);
+ return false;
+ }
+ if (leading_size != trailing_size)
+ {
+ corrupt_size (r);
+ return false;
+ }
+
+ return true;
+ }
+
+ case FH_MODE_360_VARIABLE:
+ case FH_MODE_360_SPANNED:
+ for (;;)
+ {
+ size_t record_size;
+ int segment;
+ int status;
+
+ /* If we've exhausted our current block, start another
+ one by reading the new block descriptor word. */
+ if (r->block_left == 0)
+ {
+ status = read_descriptor_word (r, BLOCK, &r->block_left, NULL);
+ if (status < 0)
+ return false;
+ else if (status == 0)
+ return !ds_is_empty (&r->line); /* End of file at a block
+ boundary: succeeds only if R's line already holds data
+ (presumably from earlier segments of a spanned record). */
+ }
+
+ /* Read record descriptor. */
+ if (r->block_left < 4)
+ {
+ partial_record (r);
+ return false;
+ }
+ r->block_left -= 4; /* The RDW itself takes 4 bytes of the block. */
+ status = read_descriptor_word (r, RECORD, &record_size, &segment);
+ if (status <= 0)
+ {
+ if (status == 0)
+ partial_record (r);
+ return false;
+ }
+ if (record_size > r->block_left)
+ {
+ msg (ME, _("Record exceeds remaining block length."));
+ return false;
+ }
+
+ /* Read record data. */
+ if (!ds_read_stream (&r->line, record_size, 1, r->file))
+ {
+ if (ferror (r->file))
+ read_error (r);
+ else
+ partial_record (r);
+ return false;
+ }
+ r->block_left -= record_size;
+
+ /* In variable mode, read only a single record.
+ In spanned mode, a segment value of 0 should
+ designate a whole record without spanning, 1 the
+ first segment in a record, 2 the last segment in a
+ record, and 3 an intermediate segment in a record.
+ For compatibility, though, we actually pay attention
+ only to whether the segment value is even or odd. */
+ if (fh_get_mode (r->fh) == FH_MODE_360_VARIABLE
+ || (segment & 1) == 0)
+ return true;
}
}
- else
- NOT_REACHED ();
-
- r->where.line_number++;
- return true;
+ NOT_REACHED ();
}
/* Reads a record from R, setting the current position to the
static bool
read_record (struct dfm_reader *r)
{
- return (fh_get_referent (r->fh) == FH_REF_FILE
- ? read_file_record (r)
- : read_inline_record (r));
+  bool ok;
+
+  /* Anything that is not a file referent goes to the inline
+     reader, and the line number is left alone. */
+  if (fh_get_referent (r->fh) != FH_REF_FILE)
+    return read_inline_record (r);
+
+  /* Advance the line number only for records actually read. */
+  ok = read_file_record (r);
+  if (ok)
+    r->where.line_number++;
+  return ok;
}
/* Returns the number of attempts, thus far, to advance past
r->flags |= DFM_TABS_EXPANDED;
if (r->fh != fh_inline_file ()
- && (fh_get_mode (r->fh) == FH_MODE_BINARY
+ && (fh_get_mode (r->fh) != FH_MODE_TEXT
|| fh_get_tab_width (r->fh) == 0
|| ds_find_char (&r->line, '\t') == SIZE_MAX))
return;
r->pos = new_pos;
}
+/* Returns the legacy character encoding of data read from
+   READER, as reported by READER's file handle. */
+const char *
+dfm_reader_get_legacy_encoding (const struct dfm_reader *reader)
+{
+  const char *encoding = fh_get_legacy_encoding (reader->fh);
+
+  return encoding;
+}
+
+/* Returns a number between 0 and 100 that approximates the
+   percentage of the data in READER that has already been read,
+   or -1 if this value cannot be estimated.  No estimate is
+   available when the file size is unknown or zero, or when the
+   current file position cannot be obtained.
+
+   ftello is slow in glibc (it flushes the read buffer), so don't
+   call this function unless you need to. */
+int
+dfm_get_percent_read (const struct dfm_reader *reader)
+{
+  /* A size of zero is excluded along with "unknown" (-1).  At
+     position 0 the division below would be 0.0 / 0, which is NaN;
+     NaN passes through both clamps, and converting it to int is
+     undefined behavior. */
+  if (reader->file_size > 0)
+    {
+      off_t position = ftello (reader->file);
+      if (position >= 0)
+        {
+          double p = 100.0 * position / reader->file_size;
+          return p < 0 ? 0 : p > 100 ? 100 : (int) p;
+        }
+    }
+  return -1;
+}
+
/* Causes dfm_get_record() or dfm_get_whole_record() to read in
the next record the next time it is executed on file
HANDLE. */
return ds_pointer_to_position (&r->line, p) + 1;
}
-/* Pushes the file name and line number on the fn/ln stack. */
-void
-dfm_push (struct dfm_reader *r)
+/* Returns the name of the file that R reads, or a null pointer
+   if R's file handle does not refer to a file. */
+const char *
+dfm_get_file_name (const struct dfm_reader *r)
{
- if (r->fh != fh_inline_file ())
- msg_push_msg_locator (&r->where);
+  if (fh_get_referent (r->fh) != FH_REF_FILE)
+    return NULL;
+
+  return r->where.file_name;
}
-/* Pops the file name and line number from the fn/ln stack. */
-void
-dfm_pop (struct dfm_reader *r)
+/* Returns the current line number in the file that R reads, or
+   -1 if R's file handle does not refer to a file. */
+int
+dfm_get_line_number (const struct dfm_reader *r)
{
- if (r->fh != fh_inline_file ())
- msg_pop_msg_locator (&r->where);
+  if (fh_get_referent (r->fh) != FH_REF_FILE)
+    return -1;
+
+  return r->where.line_number;
}
\f
/* BEGIN DATA...END DATA procedure. */