X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?p=pspp-builds.git;a=blobdiff_plain;f=src%2Flanguage%2Fdata-io%2Fdata-reader.c;h=6f620a6a948341a0dfa5db150763d20f51e92284;hp=1899acfb8396b6aa7d180fa229fc4ae968f525f6;hb=9254d30d06a0565c89daccedd93a94c4c6086004;hpb=97d4f38945476834fd7fce612b663f19f2b291f8 diff --git a/src/language/data-io/data-reader.c b/src/language/data-io/data-reader.c index 1899acfb..6f620a6a 100644 --- a/src/language/data-io/data-reader.c +++ b/src/language/data-io/data-reader.c @@ -1,21 +1,18 @@ -/* PSPP - computes sample statistics. +/* PSPP - a program for statistical analysis. Copyright (C) 1997-2004, 2006 Free Software Foundation, Inc. - Written by Ben Pfaff . - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License as - published by the Free Software Foundation; either version 2 of the - License, or (at your option) any later version. + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - 02110-1301, USA. */ + along with this program. If not, see . */ #include @@ -23,32 +20,36 @@ #include #include +#include #include #include +#include +#include #include #include #include #include #include #include -#include -#include +#include #include +#include #include #include #include "minmax.h" -#include "size_max.h" +#include "xalloc.h" #include "gettext.h" #define _(msgid) gettext (msgid) +#define N_(msgid) (msgid) /* Flags for DFM readers. */ enum dfm_reader_flags { DFM_ADVANCE = 002, /* Read next line on dfm_get_record() call? */ - DFM_SAW_BEGIN_DATA = 004, /* For inline_file only, whether we've + DFM_SAW_BEGIN_DATA = 004, /* For inline_file only, whether we've already read a BEGIN DATA line. */ DFM_TABS_EXPANDED = 010, /* Tabs have been expanded. */ }; @@ -57,52 +58,52 @@ enum dfm_reader_flags struct dfm_reader { struct file_handle *fh; /* File handle. */ + struct fh_lock *lock; /* Mutual exclusion lock for file. */ struct msg_locator where; /* Current location in data file. */ struct string line; /* Current line. */ struct string scratch; /* Extra line buffer. */ enum dfm_reader_flags flags; /* Zero or more of DFM_*. */ FILE *file; /* Associated file. */ + off_t file_size; /* File size, or -1 if unavailable. */ size_t pos; /* Offset in line of current character. */ unsigned eof_cnt; /* # of attempts to advance past EOF. */ + struct lexer *lexer; /* The lexer reading the file */ + + /* For FH_MODE_360_VARIABLE and FH_MODE_360_SPANNED files only. */ + size_t block_left; /* Bytes left in current block. */ }; /* Closes reader R opened by dfm_open_reader(). */ void dfm_close_reader (struct dfm_reader *r) { - int still_open; - bool is_inline; - char *file_name; - if (r == NULL) return; - is_inline = r->fh == fh_inline_file (); - file_name = is_inline ? NULL : xstrdup (fh_get_file_name (r->fh)); - still_open = fh_close (r->fh, "data file", "rs"); - if (still_open) + if (fh_unlock (r->lock)) { - free (file_name); - return; + /* File is still locked by another client. */ + return; } - if (!is_inline) - fn_close (file_name, r->file); + /* This was the last client, so close the underlying file. */ + if (fh_get_referent (r->fh) != FH_REF_INLINE) + fn_close (fh_get_file_name (r->fh), r->file); else { /* Skip any remaining data on the inline file. */ - if (r->flags & DFM_SAW_BEGIN_DATA) + if (r->flags & DFM_SAW_BEGIN_DATA) { dfm_reread_record (r, 0); while (!dfm_eof (r)) - dfm_forward_record (r); + dfm_forward_record (r); } } + fh_unref (r->fh); ds_destroy (&r->line); ds_destroy (&r->scratch); free (r); - free (file_name); } /* Opens the file designated by file handle FH for reading as a @@ -111,45 +112,59 @@ dfm_close_reader (struct dfm_reader *r) file between BEGIN FILE and END FILE. Returns a reader if successful, or a null pointer otherwise. */ struct dfm_reader * -dfm_open_reader (struct file_handle *fh) +dfm_open_reader (struct file_handle *fh, struct lexer *lexer) { struct dfm_reader *r; - void **rp; + struct fh_lock *lock; - rp = fh_open (fh, FH_REF_FILE | FH_REF_INLINE, "data file", "rs"); - if (rp == NULL) + /* TRANSLATORS: this fragment will be interpolated into + messages in fh_lock() that identify types of files. */ + lock = fh_lock (fh, FH_REF_FILE | FH_REF_INLINE, N_("data file"), + FH_ACC_READ, false); + if (lock == NULL) return NULL; - if (*rp != NULL) - return *rp; - + + r = fh_lock_get_aux (lock); + if (r != NULL) + return r; + r = xmalloc (sizeof *r); - r->fh = fh; + r->fh = fh_ref (fh); + r->lock = lock; + r->lexer = lexer; ds_init_empty (&r->line); ds_init_empty (&r->scratch); r->flags = DFM_ADVANCE; r->eof_cnt = 0; - if (fh != fh_inline_file ()) + r->block_left = 0; + if (fh_get_referent (fh) != FH_REF_INLINE) { + struct stat s; r->where.file_name = fh_get_file_name (fh); - r->where.line_number = 0; - r->file = fn_open (fh_get_file_name (fh), "rb"); + r->where.line_number = 0; + r->file = fn_open (fh_get_file_name (fh), + fh_get_mode (fh) == FH_MODE_TEXT ? "r" : "rb"); if (r->file == NULL) { msg (ME, _("Could not open \"%s\" for reading as a data file: %s."), fh_get_file_name (r->fh), strerror (errno)); - fh_close (fh,"data file", "rs"); + fh_unlock (r->lock); + fh_unref (fh); free (r); return NULL; } + r->file_size = fstat (fileno (r->file), &s) == 0 ? s.st_size : -1; } - *rp = r; + else + r->file_size = -1; + fh_lock_set_aux (lock, r); return r; } /* Returns true if an I/O error occurred on READER, false otherwise. */ bool -dfm_reader_error (const struct dfm_reader *r) +dfm_reader_error (const struct dfm_reader *r) { return fh_get_referent (r->fh) == FH_REF_FILE && ferror (r->file); } @@ -163,15 +178,16 @@ read_inline_record (struct dfm_reader *r) { r->flags |= DFM_SAW_BEGIN_DATA; - while (token == '.') - lex_get (); - if (!lex_force_match_id ("BEGIN") || !lex_force_match_id ("DATA")) + while (lex_token (r->lexer) == '.') + lex_get (r->lexer); + if (!lex_force_match_id (r->lexer, "BEGIN") || !lex_force_match_id (r->lexer, "DATA")) return false; - getl_set_prompt_style (GETL_PROMPT_DATA); + prompt_set_style (PROMPT_DATA); } - - if (!getl_read_line (NULL)) + + if (!lex_get_line_raw (r->lexer)) { + lex_discard_line (r->lexer); msg (SE, _("Unexpected end-of-file while reading data in BEGIN " "DATA. This probably indicates " "a missing or misformatted END DATA command. " @@ -180,56 +196,280 @@ read_inline_record (struct dfm_reader *r) return false; } - if (ds_length (&getl_buf) >= 8 - && !strncasecmp (ds_cstr (&getl_buf), "end data", 8)) + if (ds_length (lex_entire_line_ds (r->lexer) ) >= 8 + && !strncasecmp (lex_entire_line (r->lexer), "end data", 8)) { - lex_set_prog (ds_end (&getl_buf)); + lex_discard_line (r->lexer); return false; } - ds_assign_string (&r->line, &getl_buf); + ds_assign_string (&r->line, lex_entire_line_ds (r->lexer) ); + return true; } +/* Report a read error or unexpected end-of-file condition on R. */ +static void +read_error (struct dfm_reader *r) +{ + if (ferror (r->file)) + msg (ME, _("Error reading file %s: %s."), + fh_get_name (r->fh), strerror (errno)); + else if (feof (r->file)) + msg (ME, _("Unexpected end of file reading %s."), fh_get_name (r->fh)); + else + NOT_REACHED (); +} + +/* Report a partial read at end of file reading R. */ +static void +partial_record (struct dfm_reader *r) +{ + msg (ME, _("Unexpected end of file in partial record reading %s."), + fh_get_name (r->fh)); +} + +/* Tries to read SIZE bytes from R into BUFFER. Returns 1 if + successful, 0 if end of file was reached before any bytes + could be read, and -1 if some bytes were read but fewer than + SIZE due to end of file or an error mid-read. In the latter + case, reports an error. */ +static int +try_to_read_fully (struct dfm_reader *r, void *buffer, size_t size) +{ + size_t bytes_read = fread (buffer, 1, size, r->file); + if (bytes_read == size) + return 1; + else if (bytes_read == 0) + return 0; + else + { + partial_record (r); + return -1; + } +} + +/* Type of a descriptor word. */ +enum descriptor_type + { + BLOCK, + RECORD + }; + +/* Reads a block descriptor word or record descriptor word + (according to TYPE) from R. Returns 1 if successful, 0 if + end of file was reached before any bytes could be read, -1 if + an error occurred. Reports an error in the latter case. + + If successful, stores the number of remaining bytes in the + block or record (that is, the block or record length, minus + the 4 bytes in the BDW or RDW itself) into *REMAINING_SIZE. + If SEGMENT is nonnull, also stores the segment control + character (SCC) into *SEGMENT. */ +static int +read_descriptor_word (struct dfm_reader *r, enum descriptor_type type, + size_t *remaining_size, int *segment) +{ + uint8_t raw_descriptor[4]; + int status; + + status = try_to_read_fully (r, raw_descriptor, sizeof raw_descriptor); + if (status <= 0) + return status; + + *remaining_size = (raw_descriptor[0] << 8) | raw_descriptor[1]; + if (segment != NULL) + *segment = raw_descriptor[2]; + + if (*remaining_size < 4) + { + msg (ME, + (type == BLOCK + ? _("Corrupt block descriptor word at offset 0x%lx in %s.") + : _("Corrupt record descriptor word at offset 0x%lx in %s.")), + (long) ftello (r->file) - 4, fh_get_name (r->fh)); + return -1; + } + + *remaining_size -= 4; + return 1; +} + +/* Reports that reader R has read a corrupt record size. */ +static void +corrupt_size (struct dfm_reader *r) +{ + msg (ME, _("Corrupt record size at offset 0x%lx in %s."), + (long) ftello (r->file) - 4, fh_get_name (r->fh)); +} + +/* Reads a 32-byte little-endian signed number from R and stores + its value into *SIZE_OUT. Returns 1 if successful, 0 if end + of file was reached before any bytes could be read, -1 if an + error occurred. Reports an error in the latter case. Numbers + less than 0 are considered errors. */ +static int +read_size (struct dfm_reader *r, size_t *size_out) +{ + int32_t size; + int status; + + status = try_to_read_fully (r, &size, sizeof size); + if (status <= 0) + return status; + + integer_convert (INTEGER_LSB_FIRST, &size, INTEGER_NATIVE, &size, + sizeof size); + if (size < 0) + { + corrupt_size (r); + return -1; + } + + *size_out = size; + return 1; +} + /* Reads a record from a disk file into R. - Returns true if successful, false on failure. */ + Returns true if successful, false on error or at end of file. */ static bool read_file_record (struct dfm_reader *r) { assert (r->fh != fh_inline_file ()); + ds_clear (&r->line); - if (fh_get_mode (r->fh) == FH_MODE_TEXT) + switch (fh_get_mode (r->fh)) { - if (!ds_read_line (&r->line, r->file)) + case FH_MODE_TEXT: + if (ds_read_line (&r->line, r->file, SIZE_MAX)) + { + ds_chomp (&r->line, '\n'); + return true; + } + else { if (ferror (r->file)) - msg (ME, _("Error reading file %s: %s."), - fh_get_name (r->fh), strerror (errno)); + read_error (r); return false; } - } - else if (fh_get_mode (r->fh) == FH_MODE_BINARY) - { - size_t record_width = fh_get_record_width (r->fh); - size_t amt = ds_read_stream (&r->line, 1, record_width, r->file); - if (record_width != amt) + return true; + + case FH_MODE_FIXED: + if (ds_read_stream (&r->line, 1, fh_get_record_width (r->fh), r->file)) + return true; + else { if (ferror (r->file)) - msg (ME, _("Error reading file %s: %s."), - fh_get_name (r->fh), strerror (errno)); - else if (amt != 0) - msg (ME, _("%s: Partial record at end of file."), - fh_get_name (r->fh)); + read_error (r); + else if (!ds_is_empty (&r->line)) + partial_record (r); + return false; + } + return true; + case FH_MODE_VARIABLE: + { + size_t leading_size; + size_t trailing_size; + int status; + + /* Read leading record size. */ + status = read_size (r, &leading_size); + if (status <= 0) return false; + + /* Read record data. */ + if (!ds_read_stream (&r->line, leading_size, 1, r->file)) + { + if (ferror (r->file)) + read_error (r); + else + partial_record (r); + return false; + } + + /* Read trailing record size and check that it's the same + as the leading record size. */ + status = read_size (r, &trailing_size); + if (status <= 0) + { + if (status == 0) + partial_record (r); + return false; + } + if (leading_size != trailing_size) + { + corrupt_size (r); + return false; + } + + return true; + } + + case FH_MODE_360_VARIABLE: + case FH_MODE_360_SPANNED: + for (;;) + { + size_t record_size; + int segment; + int status; + + /* If we've exhausted our current block, start another + one by reading the new block descriptor word. */ + if (r->block_left == 0) + { + status = read_descriptor_word (r, BLOCK, &r->block_left, NULL); + if (status < 0) + return false; + else if (status == 0) + return !ds_is_empty (&r->line); + } + + /* Read record descriptor. */ + if (r->block_left < 4) + { + partial_record (r); + return false; + } + r->block_left -= 4; + status = read_descriptor_word (r, RECORD, &record_size, &segment); + if (status <= 0) + { + if (status == 0) + partial_record (r); + return false; + } + if (record_size > r->block_left) + { + msg (ME, _("Record exceeds remaining block length.")); + return false; + } + + /* Read record data. */ + if (!ds_read_stream (&r->line, record_size, 1, r->file)) + { + if (ferror (r->file)) + read_error (r); + else + partial_record (r); + return false; + } + r->block_left -= record_size; + + /* In variable mode, read only a single record. + In spanned mode, a segment value of 0 should + designate a whole record without spanning, 1 the + first segment in a record, 2 the last segment in a + record, and 3 an intermediate segment in a record. + For compatibility, though, we actually pay attention + only to whether the segment value is even or odd. */ + if (fh_get_mode (r->fh) == FH_MODE_360_VARIABLE + || (segment & 1) == 0) + return true; } } - else - NOT_REACHED (); - r->where.line_number++; - - return true; + NOT_REACHED (); } /* Reads a record from R, setting the current position to the @@ -238,9 +478,15 @@ read_file_record (struct dfm_reader *r) static bool read_record (struct dfm_reader *r) { - return (fh_get_referent (r->fh) == FH_REF_FILE - ? read_file_record (r) - : read_inline_record (r)); + if (fh_get_referent (r->fh) == FH_REF_FILE) + { + bool ok = read_file_record (r); + if (ok) + r->where.line_number++; + return ok; + } + else + return read_inline_record (r); } /* Returns the number of attempts, thus far, to advance past @@ -253,16 +499,16 @@ read_record (struct dfm_reader *r) an error message is issued, and the caller should more forcibly abort to avoid an infinite loop. */ unsigned -dfm_eof (struct dfm_reader *r) +dfm_eof (struct dfm_reader *r) { if (r->flags & DFM_ADVANCE) { r->flags &= ~DFM_ADVANCE; - if (r->eof_cnt == 0 && read_record (r)) + if (r->eof_cnt == 0 && read_record (r) ) { r->pos = 0; - return 0; + return 0; } r->eof_cnt++; @@ -296,7 +542,7 @@ dfm_get_record (struct dfm_reader *r) reading from the file is necessary or at end of file, so call dfm_eof() first.*/ void -dfm_expand_tabs (struct dfm_reader *r) +dfm_expand_tabs (struct dfm_reader *r) { size_t ofs, new_pos, tab_width; @@ -308,7 +554,7 @@ dfm_expand_tabs (struct dfm_reader *r) r->flags |= DFM_TABS_EXPANDED; if (r->fh != fh_inline_file () - && (fh_get_mode (r->fh) == FH_MODE_BINARY + && (fh_get_mode (r->fh) != FH_MODE_TEXT || fh_get_tab_width (r->fh) == 0 || ds_find_char (&r->line, '\t') == SIZE_MAX)) return; @@ -321,21 +567,21 @@ dfm_expand_tabs (struct dfm_reader *r) for (ofs = 0; ofs < ds_length (&r->line); ofs++) { unsigned char c; - + if (ofs == r->pos) new_pos = ds_length (&r->scratch); c = ds_data (&r->line)[ofs]; if (c != '\t') ds_put_char (&r->scratch, c); - else + else { do ds_put_char (&r->scratch, ' '); while (ds_length (&r->scratch) % tab_width != 0); } } - if (new_pos == SIZE_MAX) + if (new_pos == SIZE_MAX) { /* Maintain the same relationship between position and line length that we had before. DATA LIST uses a @@ -350,6 +596,34 @@ dfm_expand_tabs (struct dfm_reader *r) r->pos = new_pos; } +/* Returns the legacy character encoding of data read from READER. */ +const char * +dfm_reader_get_legacy_encoding (const struct dfm_reader *reader) +{ + return fh_get_legacy_encoding (reader->fh); +} + +/* Returns a number between 0 and 100 that approximates the + percentage of the data in READER that has already been read, + or -1 if this value cannot be estimated. + + ftello is slow in glibc (it flushes the read buffer), so don't + call this function unless you need to. */ +int +dfm_get_percent_read (const struct dfm_reader *reader) +{ + if (reader->file_size >= 0) + { + off_t position = ftello (reader->file); + if (position >= 0) + { + double p = 100.0 * position / reader->file_size; + return p < 0 ? 0 : p > 100 ? 100 : p; + } + } + return -1; +} + /* Causes dfm_get_record() or dfm_get_whole_record() to read in the next record the next time it is executed on file HANDLE. */ @@ -390,7 +664,7 @@ dfm_column_start (const struct dfm_reader *r) of the line. At or before end-of-line, this is 0; one column after end-of-line, this is 1; and so on. */ size_t -dfm_columns_past_end (const struct dfm_reader *r) +dfm_columns_past_end (const struct dfm_reader *r) { return r->pos < ds_length (&r->line) ? 0 : ds_length (&r->line) - r->pos; } @@ -398,7 +672,7 @@ dfm_columns_past_end (const struct dfm_reader *r) /* Returns the 1-based column within the current line that P designates. */ size_t -dfm_get_column (const struct dfm_reader *r, const char *p) +dfm_get_column (const struct dfm_reader *r, const char *p) { return ds_pointer_to_position (&r->line, p) + 1; } @@ -423,12 +697,12 @@ dfm_pop (struct dfm_reader *r) /* Perform BEGIN DATA...END DATA as a procedure in itself. */ int -cmd_begin_data (void) +cmd_begin_data (struct lexer *lexer, struct dataset *ds) { struct dfm_reader *r; bool ok; - if (!fh_is_open (fh_inline_file ())) + if (!fh_is_locked (fh_inline_file (), FH_ACC_READ)) { msg (SE, _("This command is not valid here since the current " "input program does not access the inline file.")); @@ -436,13 +710,13 @@ cmd_begin_data (void) } /* Open inline file. */ - r = dfm_open_reader (fh_inline_file ()); + r = dfm_open_reader (fh_inline_file (), lexer); r->flags |= DFM_SAW_BEGIN_DATA; /* Input procedure reads from inline file. */ - getl_set_prompt_style (GETL_PROMPT_DATA); - ok = procedure (NULL, NULL); - + prompt_set_style (PROMPT_DATA); + casereader_destroy (proc_open (ds)); + ok = proc_commit (ds); dfm_close_reader (r); return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;