1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2004, 2006, 2010, 2011, 2012 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/data-io/data-reader.h"
28 #include "data/casereader.h"
29 #include "data/dataset.h"
30 #include "data/file-handle-def.h"
31 #include "data/file-name.h"
32 #include "language/command.h"
33 #include "language/data-io/file-handle.h"
34 #include "language/lexer/lexer.h"
35 #include "libpspp/assertion.h"
36 #include "libpspp/cast.h"
37 #include "libpspp/encoding-guesser.h"
38 #include "libpspp/integer-format.h"
39 #include "libpspp/line-reader.h"
40 #include "libpspp/message.h"
41 #include "libpspp/str.h"
43 #include "gl/minmax.h"
44 #include "gl/xalloc.h"
/* Shorthand for translating MSGID at run time by passing it to gettext(). */
47 #define _(msgid) gettext (msgid)
/* Expands to MSGID unchanged, so the string is not translated at this
   point.  In this file it wraps the "data file" fragment handed to
   fh_lock() in dfm_open_reader(). */
48 #define N_(msgid) (msgid)
50 /* Flags for DFM readers.

   Each flag is a distinct bit, written in octal, so that several of them
   can be combined in the `flags' member of struct dfm_reader. */
53 DFM_ADVANCE = 002, /* Read next line on dfm_get_record() call? */
/* In this file DFM_ADVANCE is set by dfm_forward_record() and cleared by
   dfm_reread_record() and dfm_eof(); dfm_eof() is the function that
   calls read_record() when it finds the flag set. */
54 DFM_SAW_BEGIN_DATA = 004, /* For inline_file only, whether we've
55 already read a BEGIN DATA line. */
56 DFM_TABS_EXPANDED = 010, /* Tabs have been expanded. */
/* DFM_TABS_EXPANDED is set by dfm_expand_tabs() so that a second call on
   the same record does nothing. */
57 DFM_CONSUME = 020 /* read_inline_record() should get a token? */
/* DFM_CONSUME is set by read_inline_record() after it has copied the
   current lexer token into the line buffer, and cleared when BEGIN DATA
   is first seen. */
60 /* Data file reader.

   One reader is attached to a file handle lock as its auxiliary data
   (see fh_lock_set_aux() in dfm_open_reader()). */
63 struct file_handle *fh; /* File handle. */
64 struct fh_lock *lock; /* Mutual exclusion lock for file. */
65 int line_number; /* Current line or record number. */
/* `line' holds the record returned by dfm_get_record(); `scratch' is the
   target of tab expansion in dfm_expand_tabs(), after which the two are
   swapped. */
66 struct string line; /* Current line. */
67 struct string scratch; /* Extra line buffer. */
68 enum dfm_reader_flags flags; /* Zero or more of DFM_*. */
69 FILE *file; /* Associated file. */
/* Obtained with fstat() when the file is opened. */
70 off_t file_size; /* File size, or -1 if unavailable. */
/* 0-based: dfm_reread_record() stores its 1-based column minus one here.
   May be at or beyond the end of `line'. */
71 size_t pos; /* Offset in line of current character. */
72 unsigned eof_cnt; /* # of attempts to advance past EOF. */
73 struct lexer *lexer; /* The lexer reading the file */
/* Set from xstrdup() in this file, so the string is owned by the
   reader. */
74 char *encoding; /* Current encoding. */
76 /* For FH_MODE_TEXT only. */
/* Null for readers that are not text-mode files. */
77 struct line_reader *line_reader;
79 /* For FH_MODE_360_VARIABLE and FH_MODE_360_SPANNED files only. */
80 size_t block_left; /* Bytes left in current block. */
83 /* Closes reader R opened by dfm_open_reader().

   Releases R's lock with fh_unlock().  When that reports that the file
   is still locked by another client, nothing else needs to be torn down.
   Otherwise the underlying file is closed with fn_close() for anything
   other than the inline file; for the inline file, any data remaining
   after BEGIN DATA is skipped.  Finally the line reader and both string
   buffers are released.

   NOTE(review): several short statements of this function (embedded
   lines 86-89, 93-95, 105, 111-112 and 115-116) are not visible in this
   listing; confirm null handling and the freeing of R itself there. */
85 dfm_close_reader (struct dfm_reader *r)
90 if (fh_unlock (r->lock))
92 /* File is still locked by another client. */
96 /* This was the last client, so close the underlying file. */
97 if (fh_get_referent (r->fh) != FH_REF_INLINE)
98 fn_close (fh_get_file_name (r->fh), r->file);
101 /* Skip any remaining data on the inline file. */
102 if (r->flags & DFM_SAW_BEGIN_DATA)
/* Clear DFM_ADVANCE and rewind to column 1 before skipping forward. */
104 dfm_reread_record (r, 0);
106 dfm_forward_record (r);
110 line_reader_free (r->line_reader);
113 ds_destroy (&r->line);
114 ds_destroy (&r->scratch);
118 /* Opens the file designated by file handle FH for reading as a data file.
119 Returns a reader if successful, or a null pointer otherwise.
121 If FH is fh_inline_file() then the new reader reads data included inline in
122 the command file between BEGIN DATA and END DATA, obtaining data from LEXER.
123 LEXER must remain valid as long as the new reader is in use. ENCODING is
126 If FH is not fh_inline_file(), then the encoding of the file read is by
127 default that of FH itself. If ENCODING is nonnull, then it overrides the
128 default encoding. LEXER is ignored. */
130 dfm_open_reader (struct file_handle *fh, struct lexer *lexer,
131 const char *encoding)
133 struct dfm_reader *r;
134 struct fh_lock *lock;
136 /* TRANSLATORS: this fragment will be interpolated into
137 messages in fh_lock() that identify types of files. */
138 lock = fh_lock (fh, FH_REF_FILE | FH_REF_INLINE, N_("data file"),
/* Look up any reader already attached to this lock as auxiliary data. */
143 r = fh_lock_get_aux (lock);
/* Otherwise build a new reader.  It starts with DFM_ADVANCE set, so the
   first dfm_eof() call reads the first record. */
147 r = xmalloc (sizeof *r);
151 ds_init_empty (&r->line);
152 ds_init_empty (&r->scratch);
153 r->flags = DFM_ADVANCE;
156 if (fh_get_referent (fh) != FH_REF_INLINE)
/* The file is opened in binary mode for every file mode. */
160 r->file = fn_open (fh_get_file_name (fh), "rb");
163 msg (ME, _("Could not open `%s' for reading as a data file: %s."),
164 fh_get_file_name (r->fh), strerror (errno));
/* Record the size for dfm_get_percent_read(), or -1 if fstat() fails. */
167 r->file_size = fstat (fileno (r->file), &s) == 0 ? s.st_size : -1;
171 fh_lock_set_aux (lock, r);
/* Fall back to FH's own encoding when the caller supplied none. */
173 if (encoding == NULL)
174 encoding = fh_get_encoding (fh);
/* Only text-mode disk files get a line reader, which works on the
   descriptor underlying r->file. */
175 if (fh_get_referent (fh) == FH_REF_FILE && fh_get_mode (fh) == FH_MODE_TEXT)
177 r->line_reader = line_reader_for_fd (encoding, fileno (r->file));
178 if (r->line_reader == NULL)
180 msg (ME, _("Could not read `%s' as a text file with encoding `%s': "
182 fh_get_file_name (r->fh), encoding, strerror (errno));
/* Store the encoding the line reader reports. */
185 r->encoding = xstrdup (line_reader_get_encoding (r->line_reader));
/* Every other kind of reader has no line reader. */
189 r->line_reader = NULL;
190 r->encoding = xstrdup (encoding_guess_parse_encoding (encoding));
202 /* Returns true if an I/O error occurred on reader R, false otherwise.

   Only a reader whose handle refers to a file (FH_REF_FILE) can report
   an error.  A reader with a line reader asks line_reader_error(); any
   other file reader checks ferror() on its stdio stream. */
204 dfm_reader_error (const struct dfm_reader *r)
206 return (fh_get_referent (r->fh) == FH_REF_FILE
207 && (r->line_reader != NULL
208 ? line_reader_error (r->line_reader) != 0
209 : ferror (r->file)));
212 /* Reads a record from the inline file into R.
213 Returns true if successful, false on failure.

   Records come from R's lexer as string tokens.  The first call also
   requires the BEGIN DATA keywords.  A token that is not a string must
   be END DATA; anything else is reported as a missing END DATA.

   NOTE(review): the return statements and the statement guarded by
   DFM_CONSUME (embedded lines 233-234) are not visible in this
   listing. */
215 read_inline_record (struct dfm_reader *r)
/* First call for this reader: BEGIN DATA has not been consumed yet. */
217 if ((r->flags & DFM_SAW_BEGIN_DATA) == 0)
219 r->flags |= DFM_SAW_BEGIN_DATA;
220 r->flags &= ~DFM_CONSUME;
222 while (lex_token (r->lexer) == T_ENDCMD)
225 if (!lex_force_match_id (r->lexer, "BEGIN")
226 || !lex_force_match_id (r->lexer, "DATA"))
229 lex_match (r->lexer, T_ENDCMD);
/* DFM_CONSUME is set once the current token has been copied into
   r->line by an earlier call. */
232 if (r->flags & DFM_CONSUME)
235 if (!lex_is_string (r->lexer))
237 if (!lex_match_id (r->lexer, "END") || !lex_match_id (r->lexer, "DATA"))
239 msg (SE, _("Missing END DATA while reading inline data. "
240 "This probably indicates a missing or incorrectly "
241 "formatted END DATA command. END DATA must appear "
242 "by itself on a single line with exactly one space "
244 lex_discard_rest_of_command (r->lexer);
/* Copy the string token into the line buffer and mark it as used. */
249 ds_assign_substring (&r->line, lex_tokss (r->lexer));
250 r->flags |= DFM_CONSUME;
255 /* Report a read error on R.

   Issues an ME message that names R's file handle (fh_get_name()) and
   includes the strerror() text for the current value of errno, so it
   must be called before anything else can overwrite errno. */
257 read_error (struct dfm_reader *r)
259 msg (ME, _("Error reading file %s: %s."),
260 fh_get_name (r->fh), strerror (errno));
263 /* Report a partial read at end of file reading R.

   Issues an ME message that names R's file handle (fh_get_name()). */
265 partial_record (struct dfm_reader *r)
267 msg (ME, _("Unexpected end of file in partial record reading %s."),
268 fh_get_name (r->fh));
271 /* Tries to read SIZE bytes from R into BUFFER. Returns 1 if
272 successful, 0 if end of file was reached before any bytes
273 could be read, and -1 if some bytes were read but fewer than
274 SIZE due to end of file or an error mid-read. In the latter
275 case, reports an error.

   The data is read from R's stdio stream with a single fread() call.

   NOTE(review): the bodies of the branches (embedded lines 281 and
   283-289) are not visible in this listing. */
277 try_to_read_fully (struct dfm_reader *r, void *buffer, size_t size)
/* With an element size of 1, fread() returns the number of bytes read. */
279 size_t bytes_read = fread (buffer, 1, size, r->file);
280 if (bytes_read == size)
282 else if (bytes_read == 0)
291 /* Type of a descriptor word. */
298 /* Reads a block descriptor word or record descriptor word
299 (according to TYPE) from R. Returns 1 if successful, 0 if
300 end of file was reached before any bytes could be read, -1 if
301 an error occurred. Reports an error in the latter case.
303 If successful, stores the number of remaining bytes in the
304 block or record (that is, the block or record length, minus
305 the 4 bytes in the BDW or RDW itself) into *REMAINING_SIZE.
306 If SEGMENT is nonnull, also stores the segment control
307 character (SCC) into *SEGMENT. */
309 read_descriptor_word (struct dfm_reader *r, enum descriptor_type type,
310 size_t *remaining_size, int *segment)
/* A descriptor word is 4 bytes long. */
312 uint8_t raw_descriptor[4];
315 status = try_to_read_fully (r, raw_descriptor, sizeof raw_descriptor);
/* The length is the first two bytes, most significant byte first. */
319 *remaining_size = (raw_descriptor[0] << 8) | raw_descriptor[1];
/* The third byte is the segment control character. */
321 *segment = raw_descriptor[2];
/* The length includes the descriptor word itself, so any value below 4
   is corrupt. */
323 if (*remaining_size < 4)
327 ? _("Corrupt block descriptor word at offset 0x%lx in %s.")
328 : _("Corrupt record descriptor word at offset 0x%lx in %s.")),
/* The reported offset is where the 4-byte descriptor began. */
329 (long) ftello (r->file) - 4, fh_get_name (r->fh));
333 *remaining_size -= 4;
337 /* Reports that reader R has read a corrupt record size.

   The offset in the message is the current file position minus 4, that
   is, the start of the size field if the 4 bytes just read were that
   field. */
339 corrupt_size (struct dfm_reader *r)
341 msg (ME, _("Corrupt record size at offset 0x%lx in %s."),
342 (long) ftello (r->file) - 4, fh_get_name (r->fh));
345 /* Reads a 32-byte little-endian signed number from R and stores
346 its value into *SIZE_OUT. Returns 1 if successful, 0 if end
347 of file was reached before any bytes could be read, -1 if an
348 error occurred. Reports an error in the latter case. Numbers
349 less than 0 are considered errors.

   NOTE(review): "32-byte" above is presumably a typo for "32-bit"; the
   declaration of `size' (embedded lines 352-355) is not visible in this
   listing to confirm its width. */
351 read_size (struct dfm_reader *r, size_t *size_out)
356 status = try_to_read_fully (r, &size, sizeof size);
/* Convert in place from least-significant-byte-first order to the
   native byte order. */
360 integer_convert (INTEGER_LSB_FIRST, &size, INTEGER_NATIVE, &size,
/* Reads one line from R's line reader into R->line.

   If the line reader was in automatic encoding-detection mode before the
   read and no longer is afterwards, R->encoding is replaced by a copy of
   the encoding the line reader now reports.  A failed read is checked
   with line_reader_error() and, if that reports an error, an ME message
   is issued using its error code.

   NOTE(review): the return statement and the statement just before the
   xstrdup() assignment (embedded line 384) are not visible in this
   listing; confirm that the previous R->encoding is freed there. */
373 read_text_record (struct dfm_reader *r)
378 /* Read a line. If the line reader's encoding changes, update r->encoding to
380 is_auto = line_reader_is_auto (r->line_reader);
381 ok = line_reader_read (r->line_reader, &r->line, SIZE_MAX);
382 if (is_auto && !line_reader_is_auto (r->line_reader))
385 r->encoding = xstrdup (line_reader_get_encoding (r->line_reader));
388 /* Detect and report read error. */
391 int error = line_reader_error (r->line_reader);
393 msg (ME, _("Error reading file %s: %s."),
394 fh_get_name (r->fh), strerror (error));
400 /* Reads a record from a disk file into R.
401 Returns true if successful, false on error or at end of file.

   The record format is chosen by fh_get_mode(): text lines (delegated to
   read_text_record()), records of fh_get_record_width() bytes, records
   framed by a leading and a trailing size field (FH_MODE_VARIABLE), and
   block/record descriptor framing (FH_MODE_360_VARIABLE and
   FH_MODE_360_SPANNED).  Must not be called for the inline file.

   NOTE(review): many short lines of this function (case labels for the
   first two modes, status checks, returns, loop headers) are not visible
   in this listing. */
403 read_file_record (struct dfm_reader *r)
405 assert (r->fh != fh_inline_file ());
408 switch (fh_get_mode (r->fh))
411 return read_text_record (r);
/* Read up to fh_get_record_width() one-byte items into r->line. */
414 if (ds_read_stream (&r->line, 1, fh_get_record_width (r->fh), r->file))
/* The read fell short: tell an I/O error apart from a record that was
   only partly present. */
418 if (ferror (r->file))
420 else if (!ds_is_empty (&r->line))
425 case FH_MODE_VARIABLE:
428 size_t trailing_size;
431 /* Read leading record size. */
432 status = read_size (r, &leading_size);
436 /* Read record data. */
437 if (!ds_read_stream (&r->line, leading_size, 1, r->file))
439 if (ferror (r->file))
446 /* Read trailing record size and check that it's the same
447 as the leading record size. */
448 status = read_size (r, &trailing_size);
455 if (leading_size != trailing_size)
464 case FH_MODE_360_VARIABLE:
465 case FH_MODE_360_SPANNED:
472 /* If we've exhausted our current block, start another
473 one by reading the new block descriptor word. */
474 if (r->block_left == 0)
/* No segment control character is wanted for a block descriptor. */
476 status = read_descriptor_word (r, BLOCK, &r->block_left, NULL);
/* End of file at a block boundary: succeed only if some data has
   already been collected in r->line. */
479 else if (status == 0)
480 return !ds_is_empty (&r->line);
483 /* Read record descriptor. */
/* A record descriptor word needs 4 bytes of the block. */
484 if (r->block_left < 4)
490 status = read_descriptor_word (r, RECORD, &record_size, &segment);
497 if (record_size > r->block_left)
499 msg (ME, _("Record exceeds remaining block length."));
503 /* Read record data. */
504 if (!ds_read_stream (&r->line, record_size, 1, r->file))
506 if (ferror (r->file))
512 r->block_left -= record_size;
514 /* In variable mode, read only a single record.
515 In spanned mode, a segment value of 0 should
516 designate a whole record without spanning, 1 the
517 first segment in a record, 2 the last segment in a
518 record, and 3 an intermediate segment in a record.
519 For compatibility, though, we actually pay attention
520 only to whether the segment value is even or odd. */
521 if (fh_get_mode (r->fh) == FH_MODE_360_VARIABLE
522 || (segment & 1) == 0)
530 /* Reads a record from R, setting the current position to the
531 start of the line. If an error occurs or end-of-file is
532 encountered, the current line is set to null.

   Dispatches on what R's handle refers to: read_file_record() for
   FH_REF_FILE, read_inline_record() otherwise.

   NOTE(review): the statements that follow the read_file_record() call
   (embedded lines 539-542) are not visible in this listing. */
534 read_record (struct dfm_reader *r)
536 if (fh_get_referent (r->fh) == FH_REF_FILE)
538 bool ok = read_file_record (r);
544 return read_inline_record (r);
547 /* Returns the number of attempts, thus far, to advance past
548 end-of-file in reader R. Reads forward in R's file, if
549 necessary, to find out.
551 Normally, the user stops attempting to read from the file the
552 first time EOF is reached (a return value of 1). If the user
553 tries to read past EOF again (a return value of 2 or more),
554 an error message is issued, and the caller should more
555 forcibly abort to avoid an infinite loop. */
557 dfm_eof (struct dfm_reader *r)
/* A pending advance request is what triggers a read here. */
559 if (r->flags & DFM_ADVANCE)
561 r->flags &= ~DFM_ADVANCE;
/* No further read is attempted once end of file has been counted. */
563 if (r->eof_cnt == 0 && read_record (r) )
/* The message depends on whether R reads a named file or inline data. */
572 if (r->fh != fh_inline_file ())
573 msg (ME, _("Attempt to read beyond end-of-file on file %s."),
574 fh_get_name (r->fh));
576 msg (ME, _("Attempt to read beyond END DATA."));
583 /* Returns the current record in the file corresponding to
584 reader R. Aborts if reading from the file is necessary or at
585 end of file, so call dfm_eof() first.

   The result is the part of R's current line that starts at the current
   position R->pos and runs to the end of the line. */
587 dfm_get_record (struct dfm_reader *r)
/* Both preconditions are established by a successful dfm_eof() call. */
589 assert ((r->flags & DFM_ADVANCE) == 0);
590 assert (r->eof_cnt == 0);
592 return ds_substr (&r->line, r->pos, SIZE_MAX);
595 /* Expands tabs in the current line into the equivalent number of
596 spaces, if appropriate for this kind of file. Aborts if
597 reading from the file is necessary or at end of file, so call
/* Replaces each tab in R's current line by spaces up to the next
   multiple of the handle's tab width, and adjusts R->pos so that it
   designates the same character as before.  Does nothing if the record
   was already expanded, or if R reads a file that is not in text mode,
   has a tab width of 0, or whose line contains no tab.

   NOTE(review): for the inline file the tab-width check below is skipped,
   while the expansion loop takes a remainder modulo tab_width; confirm
   that fh_get_tab_width() is nonzero for the inline file handle. */
600 dfm_expand_tabs (struct dfm_reader *r)
602 size_t ofs, new_pos, tab_width;
604 assert ((r->flags & DFM_ADVANCE) == 0);
605 assert (r->eof_cnt == 0);
/* Expand each record at most once. */
607 if (r->flags & DFM_TABS_EXPANDED)
609 r->flags |= DFM_TABS_EXPANDED;
611 if (r->fh != fh_inline_file ()
612 && (fh_get_mode (r->fh) != FH_MODE_TEXT
613 || fh_get_tab_width (r->fh) == 0
614 || ds_find_byte (&r->line, '\t') == SIZE_MAX))
617 /* Expand tabs from r->line into r->scratch, and figure out
618 new value for r->pos. */
619 tab_width = fh_get_tab_width (r->fh);
620 ds_clear (&r->scratch);
622 for (ofs = 0; ofs < ds_length (&r->line); ofs++)
/* The current length of the scratch buffer is the expanded offset of
   the byte about to be copied. */
627 new_pos = ds_length (&r->scratch);
629 c = ds_data (&r->line)[ofs];
631 ds_put_byte (&r->scratch, c);
/* Pad with spaces until the scratch length is a multiple of the tab
   width. */
635 ds_put_byte (&r->scratch, ' ');
636 while (ds_length (&r->scratch) % tab_width != 0);
/* SIZE_MAX here means no position inside the line was matched. */
639 if (new_pos == SIZE_MAX)
641 /* Maintain the same relationship between position and line
642 length that we had before. DATA LIST uses a
643 beyond-the-end position to deal with an empty field at
644 the end of the line. */
645 assert (r->pos >= ds_length (&r->line));
646 new_pos = (r->pos - ds_length (&r->line)) + ds_length (&r->scratch);
649 /* Swap r->line and r->scratch and set new r->pos. */
650 ds_swap (&r->line, &r->scratch);
654 /* Returns the character encoding of data read from READER.

   The result is READER's own `encoding' string, not a copy.  It is
   replaced by read_text_record() when a line reader's encoding
   changes. */
656 dfm_reader_get_encoding (const struct dfm_reader *reader)
658 return reader->encoding;
661 /* Returns a number between 0 and 100 that approximates the
662 percentage of the data in READER that has already been read,
663 or -1 if this value cannot be estimated.
665 ftello is slow in glibc (it flushes the read buffer), so don't
666 call this function unless you need to. */
668 dfm_get_percent_read (const struct dfm_reader *reader)
670 if (reader->file_size >= 0)
674 position = (reader->line_reader != NULL
675 ? line_reader_tell (reader->line_reader)
676 : ftello (reader->file));
679 double p = 100.0 * position / reader->file_size;
680 return p < 0 ? 0 : p > 100 ? 100 : p;
686 /* Causes dfm_get_record() or dfm_get_whole_record() to read in
687 the next record the next time it is executed on file
/* Sets DFM_ADVANCE in R's flags.  Nothing is read here: the next
   dfm_eof() call sees the flag and reads the following record. */
690 dfm_forward_record (struct dfm_reader *r)
692 r->flags |= DFM_ADVANCE;
695 /* Cancels the effect of any previous dfm_forward_record() executed
696 on file HANDLE. Sets the current line to begin in the 1-based
/* Clears DFM_ADVANCE in R's flags and moves the current position to
   1-based COLUMN.  A COLUMN of 0 is treated like 1, so the stored
   0-based offset never wraps around. */
699 dfm_reread_record (struct dfm_reader *r, size_t column)
701 r->flags &= ~DFM_ADVANCE;
/* r->pos is 0-based, hence the subtraction. */
702 r->pos = MAX (column, 1) - 1;
705 /* Sets the current line to begin COLUMNS characters following
706 the current start.

   Implemented through dfm_reread_record(), so it also clears
   DFM_ADVANCE. */
708 dfm_forward_columns (struct dfm_reader *r, size_t columns)
/* r->pos + 1 is the current position as a 1-based column. */
710 dfm_reread_record (r, (r->pos + 1) + columns);
713 /* Returns the 1-based column to which the line pointer in R
714 is set. Unless dfm_reread_record() or dfm_forward_columns()
715 have been called, this is 1.

   NOTE(review): the body of this function (embedded lines 718-720) is
   not visible in this listing. */
717 dfm_column_start (const struct dfm_reader *r)
722 /* Returns the number of columns we are currently beyond the end
723 of the line. At or before end-of-line, this is 0; one column
724 after end-of-line, this is 1; and so on. */
726 dfm_columns_past_end (const struct dfm_reader *r)
728 return r->pos < ds_length (&r->line) ? 0 : ds_length (&r->line) - r->pos;
731 /* Returns the 1-based column within the current line that P
/* Converts pointer P into a 1-based column number: the position that
   ds_pointer_to_position() reports for P in R's current line, plus one. */
734 dfm_get_column (const struct dfm_reader *r, const char *p)
736 return ds_pointer_to_position (&r->line, p) + 1;
/* Returns the name of the file that R reads, as reported by
   fh_get_file_name(), when R's handle refers to a file (FH_REF_FILE).

   NOTE(review): the value returned for other kinds of handle is on a
   line (embedded line 744) that is not visible in this listing. */
740 dfm_get_file_name (const struct dfm_reader *r)
742 return (fh_get_referent (r->fh) == FH_REF_FILE
743 ? fh_get_file_name (r->fh)
/* Returns R's current line or record number when R's handle refers to
   a file (FH_REF_FILE), or -1 for any other kind of handle. */
748 dfm_get_line_number (const struct dfm_reader *r)
750 return fh_get_referent (r->fh) == FH_REF_FILE ? r->line_number : -1;
753 /* BEGIN DATA...END DATA procedure. */
755 /* Perform BEGIN DATA...END DATA as a procedure in itself. */
757 cmd_begin_data (struct lexer *lexer, struct dataset *ds)
759 struct dfm_reader *r;
762 if (!fh_is_locked (fh_inline_file (), FH_ACC_READ))
764 msg (SE, _("This command is not valid here since the current "
765 "input program does not access the inline file."));
766 return CMD_CASCADING_FAILURE;
768 lex_match (lexer, T_ENDCMD);
770 /* Open inline file. */
771 r = dfm_open_reader (fh_inline_file (), lexer, NULL);
772 r->flags |= DFM_SAW_BEGIN_DATA;
773 r->flags &= ~DFM_CONSUME;
775 /* Input procedure reads from inline file. */
776 casereader_destroy (proc_open (ds));
777 ok = proc_commit (ds);
778 dfm_close_reader (r);
780 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;