X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fdata-io%2Fdata-reader.c;h=ea95bc983298d54acc24df5c55b62f1495436c07;hb=2814862a2c45a39f9822cf4c64ca3884822d064d;hp=4811ba76ac8164d8f9034142f33273db4472a96e;hpb=01b970b8972e4e457b1d8e3f5af350c325152942;p=pspp diff --git a/src/language/data-io/data-reader.c b/src/language/data-io/data-reader.c index 4811ba76ac..ea95bc9832 100644 --- a/src/language/data-io/data-reader.c +++ b/src/language/data-io/data-reader.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-2004, 2006, 2010 Free Software Foundation, Inc. + Copyright (C) 1997-2004, 2006, 2010, 2011, 2012 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,7 +16,7 @@ #include -#include +#include "language/data-io/data-reader.h" #include #include @@ -25,22 +25,23 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "minmax.h" -#include "xalloc.h" +#include "data/casereader.h" +#include "data/dataset.h" +#include "data/file-handle-def.h" +#include "data/file-name.h" +#include "language/command.h" +#include "language/data-io/file-handle.h" +#include "language/lexer/lexer.h" +#include "libpspp/assertion.h" +#include "libpspp/cast.h" +#include "libpspp/encoding-guesser.h" +#include "libpspp/integer-format.h" +#include "libpspp/line-reader.h" +#include "libpspp/message.h" +#include "libpspp/str.h" + +#include "gl/minmax.h" +#include "gl/xalloc.h" #include "gettext.h" #define _(msgid) gettext (msgid) @@ -53,6 +54,7 @@ enum dfm_reader_flags DFM_SAW_BEGIN_DATA = 004, /* For inline_file only, whether we've already read a BEGIN DATA line. */ DFM_TABS_EXPANDED = 010, /* Tabs have been expanded. */ + DFM_CONSUME = 020 /* read_inline_record() should get a token? */ }; /* Data file reader. */ @@ -60,7 +62,7 @@ struct dfm_reader { struct file_handle *fh; /* File handle. */ struct fh_lock *lock; /* Mutual exclusion lock for file. */ - struct msg_locator where; /* Current location in data file. */ + int line_number; /* Current line or record number. */ struct string line; /* Current line. */ struct string scratch; /* Extra line buffer. */ enum dfm_reader_flags flags; /* Zero or more of DFM_*. */ @@ -69,6 +71,10 @@ struct dfm_reader size_t pos; /* Offset in line of current character. */ unsigned eof_cnt; /* # of attempts to advance past EOF. */ struct lexer *lexer; /* The lexer reading the file */ + char *encoding; /* Current encoding. */ + + /* For FH_MODE_TEXT only. */ + struct line_reader *line_reader; /* For FH_MODE_360_VARIABLE and FH_MODE_360_SPANNED files only. */ size_t block_left; /* Bytes left in current block. */ @@ -101,19 +107,28 @@ dfm_close_reader (struct dfm_reader *r) } } + line_reader_free (r->line_reader); + free (r->encoding); fh_unref (r->fh); ds_destroy (&r->line); ds_destroy (&r->scratch); free (r); } -/* Opens the file designated by file handle FH for reading as a - data file. Providing fh_inline_file() for FH designates the - "inline file", that is, data included inline in the command - file between BEGIN FILE and END FILE. Returns a reader if - successful, or a null pointer otherwise. */ +/* Opens the file designated by file handle FH for reading as a data file. + Returns a reader if successful, or a null pointer otherwise. + + If FH is fh_inline_file() then the new reader reads data included inline in + the command file between BEGIN FILE and END FILE, obtaining data from LEXER. + LEXER must remain valid as long as the new reader is in use. ENCODING is + ignored. + + If FH is not fh_inline_file(), then the encoding of the file read is by + default that of FH itself. If ENCODING is nonnull, then it overrides the + default encoding. LEXER is ignored. */ struct dfm_reader * -dfm_open_reader (struct file_handle *fh, struct lexer *lexer) +dfm_open_reader (struct file_handle *fh, struct lexer *lexer, + const char *encoding) { struct dfm_reader *r; struct fh_lock *lock; @@ -141,17 +156,12 @@ dfm_open_reader (struct file_handle *fh, struct lexer *lexer) if (fh_get_referent (fh) != FH_REF_INLINE) { struct stat s; - r->where.file_name = CONST_CAST (char *, fh_get_file_name (fh)); - r->where.line_number = 0; + r->line_number = 0; r->file = fn_open (fh_get_file_name (fh), "rb"); if (r->file == NULL) { msg (ME, _("Could not open `%s' for reading as a data file: %s."), fh_get_file_name (r->fh), strerror (errno)); - fh_unlock (r->lock); - fh_unref (fh); - free (r); - return NULL; } r->file_size = fstat (fileno (r->file), &s) == 0 ? s.st_size : -1; } @@ -159,14 +169,43 @@ dfm_open_reader (struct file_handle *fh, struct lexer *lexer) r->file_size = -1; fh_lock_set_aux (lock, r); + if (encoding == NULL) + encoding = fh_get_encoding (fh); + if (fh_get_referent (fh) == FH_REF_FILE && fh_get_mode (fh) == FH_MODE_TEXT) + { + r->line_reader = line_reader_for_fd (encoding, fileno (r->file)); + if (r->line_reader == NULL) + { + msg (ME, _("Could not read `%s' as a text file with encoding `%s': " + "%s."), + fh_get_file_name (r->fh), encoding, strerror (errno)); + goto error; + } + r->encoding = xstrdup (line_reader_get_encoding (r->line_reader)); + } + else + { + r->line_reader = NULL; + r->encoding = xstrdup (encoding_guess_parse_encoding (encoding)); + } + return r; + +error: + fh_unlock (r->lock); + fh_unref (fh); + free (r); + return NULL; } /* Returns true if an I/O error occurred on READER, false otherwise. */ bool dfm_reader_error (const struct dfm_reader *r) { - return fh_get_referent (r->fh) == FH_REF_FILE && ferror (r->file); + return (fh_get_referent (r->fh) == FH_REF_FILE + && (r->line_reader != NULL + ? line_reader_error (r->line_reader) != 0 + : ferror (r->file))); } /* Reads a record from the inline file into R. @@ -177,48 +216,47 @@ read_inline_record (struct dfm_reader *r) if ((r->flags & DFM_SAW_BEGIN_DATA) == 0) { r->flags |= DFM_SAW_BEGIN_DATA; + r->flags &= ~DFM_CONSUME; - while (lex_token (r->lexer) == '.') + while (lex_token (r->lexer) == T_ENDCMD) lex_get (r->lexer); - if (!lex_force_match_id (r->lexer, "BEGIN") || !lex_force_match_id (r->lexer, "DATA")) + + if (!lex_force_match_id (r->lexer, "BEGIN") + || !lex_force_match_id (r->lexer, "DATA")) return false; - prompt_set_style (PROMPT_DATA); - } - if (!lex_get_line_raw (r->lexer)) - { - lex_discard_line (r->lexer); - msg (SE, _("Unexpected end-of-file while reading data in BEGIN " - "DATA. This probably indicates " - "a missing or miss-formatted END DATA command. " - "END DATA must appear by itself on a single line " - "with exactly one space between words.")); - return false; + lex_match (r->lexer, T_ENDCMD); } - if (ds_length (lex_entire_line_ds (r->lexer) ) >= 8 - && !strncasecmp (lex_entire_line (r->lexer), "end data", 8)) + if (r->flags & DFM_CONSUME) + lex_get (r->lexer); + + if (!lex_is_string (r->lexer)) { - lex_discard_line (r->lexer); + if (!lex_match_id (r->lexer, "END") || !lex_match_id (r->lexer, "DATA")) + { + msg (SE, _("Missing END DATA while reading inline data. " + "This probably indicates a missing or incorrectly " + "formatted END DATA command. END DATA must appear " + "by itself on a single line with exactly one space " + "between words.")); + lex_discard_rest_of_command (r->lexer); + } return false; } - ds_assign_string (&r->line, lex_entire_line_ds (r->lexer) ); + ds_assign_substring (&r->line, lex_tokss (r->lexer)); + r->flags |= DFM_CONSUME; return true; } -/* Report a read error or unexpected end-of-file condition on R. */ +/* Report a read error on R. */ static void read_error (struct dfm_reader *r) { - if (ferror (r->file)) - msg (ME, _("Error reading file %s: %s."), - fh_get_name (r->fh), strerror (errno)); - else if (feof (r->file)) - msg (ME, _("Unexpected end of file reading %s."), fh_get_name (r->fh)); - else - NOT_REACHED (); + msg (ME, _("Error reading file %s: %s."), + fh_get_name (r->fh), strerror (errno)); } /* Report a partial read at end of file reading R. */ @@ -330,6 +368,34 @@ read_size (struct dfm_reader *r, size_t *size_out) return 1; } +static bool +read_text_record (struct dfm_reader *r) +{ + bool is_auto; + bool ok; + + /* Read a line. If the line reader's encoding changes, update r->encoding to + match. */ + is_auto = line_reader_is_auto (r->line_reader); + ok = line_reader_read (r->line_reader, &r->line, SIZE_MAX); + if (is_auto && !line_reader_is_auto (r->line_reader)) + { + free (r->encoding); + r->encoding = xstrdup (line_reader_get_encoding (r->line_reader)); + } + + /* Detect and report read error. */ + if (!ok) + { + int error = line_reader_error (r->line_reader); + if (error != 0) + msg (ME, _("Error reading file %s: %s."), + fh_get_name (r->fh), strerror (error)); + } + + return ok; +} + /* Reads a record from a disk file into R. Returns true if successful, false on error or at end of file. */ static bool @@ -341,18 +407,7 @@ read_file_record (struct dfm_reader *r) switch (fh_get_mode (r->fh)) { case FH_MODE_TEXT: - if (ds_read_line (&r->line, r->file, SIZE_MAX)) - { - ds_chomp (&r->line, '\n'); - return true; - } - else - { - if (ferror (r->file)) - read_error (r); - return false; - } - return true; + return read_text_record (r); case FH_MODE_FIXED: if (ds_read_stream (&r->line, 1, fh_get_record_width (r->fh), r->file)) @@ -365,7 +420,6 @@ read_file_record (struct dfm_reader *r) partial_record (r); return false; } - return true; case FH_MODE_VARIABLE: { @@ -482,7 +536,7 @@ read_record (struct dfm_reader *r) { bool ok = read_file_record (r); if (ok) - r->where.line_number++; + r->line_number++; return ok; } else @@ -556,7 +610,7 @@ dfm_expand_tabs (struct dfm_reader *r) if (r->fh != fh_inline_file () && (fh_get_mode (r->fh) != FH_MODE_TEXT || fh_get_tab_width (r->fh) == 0 - || ds_find_char (&r->line, '\t') == SIZE_MAX)) + || ds_find_byte (&r->line, '\t') == SIZE_MAX)) return; /* Expand tabs from r->line into r->scratch, and figure out @@ -573,11 +627,11 @@ dfm_expand_tabs (struct dfm_reader *r) c = ds_data (&r->line)[ofs]; if (c != '\t') - ds_put_char (&r->scratch, c); + ds_put_byte (&r->scratch, c); else { do - ds_put_char (&r->scratch, ' '); + ds_put_byte (&r->scratch, ' '); while (ds_length (&r->scratch) % tab_width != 0); } } @@ -596,11 +650,11 @@ dfm_expand_tabs (struct dfm_reader *r) r->pos = new_pos; } -/* Returns the legacy character encoding of data read from READER. */ +/* Returns the character encoding of data read from READER. */ const char * -dfm_reader_get_legacy_encoding (const struct dfm_reader *reader) +dfm_reader_get_encoding (const struct dfm_reader *reader) { - return fh_get_legacy_encoding (reader->fh); + return reader->encoding; } /* Returns a number between 0 and 100 that approximates the @@ -614,7 +668,11 @@ dfm_get_percent_read (const struct dfm_reader *reader) { if (reader->file_size >= 0) { - off_t position = ftello (reader->file); + off_t position; + + position = (reader->line_reader != NULL + ? line_reader_tell (reader->line_reader) + : ftello (reader->file)); if (position >= 0) { double p = 100.0 * position / reader->file_size; @@ -680,13 +738,15 @@ dfm_get_column (const struct dfm_reader *r, const char *p) const char * dfm_get_file_name (const struct dfm_reader *r) { - return fh_get_referent (r->fh) == FH_REF_FILE ? r->where.file_name : NULL; + return (fh_get_referent (r->fh) == FH_REF_FILE + ? fh_get_file_name (r->fh) + : NULL); } int dfm_get_line_number (const struct dfm_reader *r) { - return fh_get_referent (r->fh) == FH_REF_FILE ? r->where.line_number : -1; + return fh_get_referent (r->fh) == FH_REF_FILE ? r->line_number : -1; } /* BEGIN DATA...END DATA procedure. */ @@ -704,13 +764,14 @@ cmd_begin_data (struct lexer *lexer, struct dataset *ds) "input program does not access the inline file.")); return CMD_CASCADING_FAILURE; } + lex_match (lexer, T_ENDCMD); /* Open inline file. */ - r = dfm_open_reader (fh_inline_file (), lexer); + r = dfm_open_reader (fh_inline_file (), lexer, NULL); r->flags |= DFM_SAW_BEGIN_DATA; + r->flags &= ~DFM_CONSUME; /* Input procedure reads from inline file. */ - prompt_set_style (PROMPT_DATA); casereader_destroy (proc_open (ds)); ok = proc_commit (ds); dfm_close_reader (r);