1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2004, 2006, 2010, 2011, 2012, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/commands/data-reader.h"
27 #include "data/casereader.h"
28 #include "data/dataset.h"
29 #include "data/file-handle-def.h"
30 #include "data/file-name.h"
31 #include "language/command.h"
32 #include "language/commands/file-handle.h"
33 #include "language/lexer/lexer.h"
34 #include "libpspp/assertion.h"
35 #include "libpspp/cast.h"
36 #include "libpspp/encoding-guesser.h"
37 #include "libpspp/integer-format.h"
38 #include "libpspp/line-reader.h"
39 #include "libpspp/message.h"
40 #include "libpspp/str.h"
42 #include "gl/minmax.h"
43 #include "gl/xalloc.h"
46 #define _(msgid) gettext (msgid)
47 #define N_(msgid) (msgid)
49 /* Flags for DFM readers. */
52 DFM_ADVANCE = 002, /* Read next line on dfm_get_record() call? */
53 DFM_SAW_BEGIN_DATA = 004, /* For inline_file only, whether we've
54 already read a BEGIN DATA line. */
55 DFM_TABS_EXPANDED = 010, /* Tabs have been expanded. */
56 DFM_CONSUME = 020 /* read_inline_record() should get a token? */
59 /* Data file reader. */
62 struct file_handle *fh; /* File handle. */
63 struct fh_lock *lock; /* Mutual exclusion lock for file. */
64 int line_number; /* Current line or record number. */
65 struct string line; /* Current line. */
66 struct string scratch; /* Extra line buffer. */
67 enum dfm_reader_flags flags; /* Zero or more of DFM_*. */
68 FILE *file; /* Associated file. */
69 size_t pos; /* Offset in line of current character. */
70 unsigned n_eofs; /* # of attempts to advance past EOF. */
71 struct lexer *lexer; /* The lexer reading the file */
72 char *encoding; /* Current encoding. */
74 /* For FH_MODE_TEXT only. */
75 struct line_reader *line_reader;
77 /* For FH_MODE_360_VARIABLE and FH_MODE_360_SPANNED files only. */
78 size_t block_left; /* Bytes left in current block. */
81 /* Closes reader R opened by dfm_open_reader(). */
83 dfm_close_reader (struct dfm_reader *r)
88 if (fh_unlock (r->lock))
90 /* File is still locked by another client. */
94 /* This was the last client, so close the underlying file. */
95 if (fh_get_referent (r->fh) != FH_REF_INLINE)
96 fn_close (r->fh, r->file);
99 /* Skip any remaining data on the inline file. */
100 if (r->flags & DFM_SAW_BEGIN_DATA)
102 dfm_reread_record (r, 0);
104 dfm_forward_record (r);
108 line_reader_free (r->line_reader);
111 ds_destroy (&r->line);
112 ds_destroy (&r->scratch);
116 /* Opens the file designated by file handle FH for reading as a data file.
117 Returns a reader if successful, or a null pointer otherwise.
119 If FH is fh_inline_file() then the new reader reads data included inline in
120 the command file between BEGIN DATA and END DATA, obtaining data from LEXER.
121 LEXER must remain valid as long as the new reader is in use. ENCODING is
124 If FH is not fh_inline_file(), then the encoding of the file read is by
125 default that of FH itself. If ENCODING is nonnull, then it overrides the
126 default encoding. LEXER is ignored. */
128 dfm_open_reader (struct file_handle *fh, struct lexer *lexer,
129 const char *encoding)
131 struct dfm_reader *r;
132 struct fh_lock *lock;
134 /* TRANSLATORS: this fragment will be interpolated into
135 messages in fh_lock() that identify types of files. */
136 lock = fh_lock (fh, FH_REF_FILE | FH_REF_INLINE, N_("data file"),
141 r = fh_lock_get_aux (lock);
145 r = xmalloc (sizeof *r);
149 ds_init_empty (&r->line);
150 ds_init_empty (&r->scratch);
151 r->flags = DFM_ADVANCE;
154 if (fh_get_referent (fh) != FH_REF_INLINE)
157 r->file = fn_open (fh, "rb");
160 msg (ME, _("Could not open `%s' for reading as a data file: %s."),
161 fh_get_file_name (r->fh), strerror (errno));
165 fh_lock_set_aux (lock, r);
167 if (encoding == NULL)
168 encoding = fh_get_encoding (fh);
169 if (fh_get_referent (fh) == FH_REF_FILE && fh_get_mode (fh) == FH_MODE_TEXT)
171 r->line_reader = line_reader_for_fd (encoding, fileno (r->file));
172 if (r->line_reader == NULL)
174 msg (ME, _("Could not read `%s' as a text file with encoding `%s': "
176 fh_get_file_name (r->fh), encoding, strerror (errno));
179 r->encoding = xstrdup (line_reader_get_encoding (r->line_reader));
183 r->line_reader = NULL;
184 r->encoding = xstrdup (encoding_guess_parse_encoding (encoding));
196 /* Returns true if an I/O error occurred on reader R, false otherwise. */
198 dfm_reader_error (const struct dfm_reader *r)
200 return (fh_get_referent (r->fh) == FH_REF_FILE
201 && (r->line_reader != NULL
202 ? line_reader_error (r->line_reader) != 0
203 : ferror (r->file)));
206 /* Reads a record from the inline file into R.
207 Returns true if successful, false on failure. */
209 read_inline_record (struct dfm_reader *r)
211 if ((r->flags & DFM_SAW_BEGIN_DATA) == 0)
213 r->flags |= DFM_SAW_BEGIN_DATA;
214 r->flags &= ~DFM_CONSUME;
216 while (lex_token (r->lexer) == T_ENDCMD)
219 if (!lex_force_match_phrase (r->lexer, "BEGIN DATA"))
222 lex_match (r->lexer, T_ENDCMD);
225 if (r->flags & DFM_CONSUME)
228 if (!lex_is_string (r->lexer))
230 if (!lex_match_id (r->lexer, "END") || !lex_match_id (r->lexer, "DATA"))
232 msg (SE, _("Missing %s while reading inline data. "
233 "This probably indicates a missing or incorrectly "
234 "formatted %s command. %s must appear "
235 "by itself on a single line with exactly one space "
236 "between words."), "END DATA", "END DATA", "END DATA");
237 lex_discard_rest_of_command (r->lexer);
242 ds_assign_substring (&r->line, lex_tokss (r->lexer));
243 r->flags |= DFM_CONSUME;
248 /* Report a read error on R. */
250 read_error (struct dfm_reader *r)
252 msg (ME, _("Error reading file %s: %s."),
253 fh_get_name (r->fh), strerror (errno));
256 /* Report a partial read at end of file reading R. */
258 partial_record (struct dfm_reader *r)
260 msg (ME, _("Unexpected end of file in partial record reading %s."),
261 fh_get_name (r->fh));
264 /* Tries to read SIZE bytes from R into BUFFER. Returns 1 if
265 successful, 0 if end of file was reached before any bytes
266 could be read, and -1 if some bytes were read but fewer than
267 SIZE due to end of file or an error mid-read. In the latter
268 case, reports an error. */
270 try_to_read_fully (struct dfm_reader *r, void *buffer, size_t size)
272 size_t bytes_read = fread (buffer, 1, size, r->file);
273 if (bytes_read == size)
275 else if (bytes_read == 0)
284 /* Type of a descriptor word. */
291 /* Reads a block descriptor word or record descriptor word
292 (according to TYPE) from R. Returns 1 if successful, 0 if
293 end of file was reached before any bytes could be read, -1 if
294 an error occurred. Reports an error in the latter case.
296 If successful, stores the number of remaining bytes in the
297 block or record (that is, the block or record length, minus
298 the 4 bytes in the BDW or RDW itself) into *REMAINING_SIZE.
299 If SEGMENT is nonnull, also stores the segment control
300 character (SCC) into *SEGMENT. */
302 read_descriptor_word (struct dfm_reader *r, enum descriptor_type type,
303 size_t *remaining_size, int *segment)
305 uint8_t raw_descriptor[4];
308 status = try_to_read_fully (r, raw_descriptor, sizeof raw_descriptor);
312 *remaining_size = (raw_descriptor[0] << 8) | raw_descriptor[1];
314 *segment = raw_descriptor[2];
316 if (*remaining_size < 4)
320 ? _("Corrupt block descriptor word at offset 0x%lx in %s.")
321 : _("Corrupt record descriptor word at offset 0x%lx in %s.")),
322 (long) ftello (r->file) - 4, fh_get_name (r->fh));
326 *remaining_size -= 4;
330 /* Reports that reader R has read a corrupt record size. */
332 corrupt_size (struct dfm_reader *r)
334 msg (ME, _("Corrupt record size at offset 0x%lx in %s."),
335 (long) ftello (r->file) - 4, fh_get_name (r->fh));
338 /* Reads a 32-bit little-endian signed number from R and stores
339 its value into *SIZE_OUT. Returns 1 if successful, 0 if end
340 of file was reached before any bytes could be read, -1 if an
341 error occurred. Reports an error in the latter case. Numbers
342 less than 0 are considered errors. */
344 read_size (struct dfm_reader *r, size_t *size_out)
349 status = try_to_read_fully (r, &size, sizeof size);
353 integer_convert (INTEGER_LSB_FIRST, &size, INTEGER_NATIVE, &size,
366 read_text_record (struct dfm_reader *r)
371 /* Read a line. If the line reader's encoding changes, update r->encoding to
373 is_auto = line_reader_is_auto (r->line_reader);
374 ok = line_reader_read (r->line_reader, &r->line, SIZE_MAX);
375 if (is_auto && !line_reader_is_auto (r->line_reader))
378 r->encoding = xstrdup (line_reader_get_encoding (r->line_reader));
381 /* Detect and report read error. */
384 int error = line_reader_error (r->line_reader);
386 msg (ME, _("Error reading file %s: %s."),
387 fh_get_name (r->fh), strerror (error));
393 /* Reads a record from a disk file into R.
394 Returns true if successful, false on error or at end of file. */
396 read_file_record (struct dfm_reader *r)
398 assert (r->fh != fh_inline_file ());
401 switch (fh_get_mode (r->fh))
404 return read_text_record (r);
407 if (ds_read_stream (&r->line, 1, fh_get_record_width (r->fh), r->file))
411 if (ferror (r->file))
413 else if (!ds_is_empty (&r->line))
418 case FH_MODE_VARIABLE:
421 size_t trailing_size;
424 /* Read leading record size. */
425 status = read_size (r, &leading_size);
429 /* Read record data. */
430 if (!ds_read_stream (&r->line, leading_size, 1, r->file))
432 if (ferror (r->file))
439 /* Read trailing record size and check that it's the same
440 as the leading record size. */
441 status = read_size (r, &trailing_size);
448 if (leading_size != trailing_size)
457 case FH_MODE_360_VARIABLE:
458 case FH_MODE_360_SPANNED:
465 /* If we've exhausted our current block, start another
466 one by reading the new block descriptor word. */
467 if (r->block_left == 0)
469 status = read_descriptor_word (r, BLOCK, &r->block_left, NULL);
472 else if (status == 0)
473 return !ds_is_empty (&r->line);
476 /* Read record descriptor. */
477 if (r->block_left < 4)
483 status = read_descriptor_word (r, RECORD, &record_size, &segment);
490 if (record_size > r->block_left)
492 msg (ME, _("Record exceeds remaining block length."));
496 /* Read record data. */
497 if (!ds_read_stream (&r->line, record_size, 1, r->file))
499 if (ferror (r->file))
505 r->block_left -= record_size;
507 /* In variable mode, read only a single record.
508 In spanned mode, a segment value of 0 should
509 designate a whole record without spanning, 1 the
510 first segment in a record, 2 the last segment in a
511 record, and 3 an intermediate segment in a record.
512 For compatibility, though, we actually pay attention
513 only to whether the segment value is even or odd. */
514 if (fh_get_mode (r->fh) == FH_MODE_360_VARIABLE
515 || (segment & 1) == 0)
523 /* Reads a record from R, setting the current position to the
524 start of the line. If an error occurs or end-of-file is
525 encountered, the current line is set to null. */
527 read_record (struct dfm_reader *r)
529 if (fh_get_referent (r->fh) == FH_REF_FILE)
531 bool ok = read_file_record (r);
537 return read_inline_record (r);
540 /* Returns the number of attempts, thus far, to advance past
541 end-of-file in reader R. Reads forward in R's file, if
542 necessary, to find out.
544 Normally, the user stops attempting to read from the file the
545 first time EOF is reached (a return value of 1). If the user
546 tries to read past EOF again (a return value of 2 or more),
547 an error message is issued, and the caller should more
548 forcibly abort to avoid an infinite loop. */
550 dfm_eof (struct dfm_reader *r)
552 if (r->flags & DFM_ADVANCE)
554 r->flags &= ~DFM_ADVANCE;
556 if (r->n_eofs == 0 && read_record (r))
565 if (r->fh != fh_inline_file ())
566 msg (ME, _("Attempt to read beyond end-of-file on file %s."),
567 fh_get_name (r->fh));
569 msg (ME, _("Attempt to read beyond %s."), "END DATA");
576 /* Returns the current record in the file being read by
577 reader R. Aborts if reading from the file is necessary or at
578 end of file, so call dfm_eof() first. */
580 dfm_get_record (struct dfm_reader *r)
582 assert ((r->flags & DFM_ADVANCE) == 0);
583 assert (r->n_eofs == 0);
585 return ds_substr (&r->line, r->pos, SIZE_MAX);
588 /* Expands tabs in the current line into the equivalent number of
589 spaces, if appropriate for this kind of file. Aborts if
590 reading from the file is necessary or at end of file, so call
593 dfm_expand_tabs (struct dfm_reader *r)
595 size_t ofs, new_pos, tab_width;
597 assert ((r->flags & DFM_ADVANCE) == 0);
598 assert (r->n_eofs == 0);
600 if (r->flags & DFM_TABS_EXPANDED)
602 r->flags |= DFM_TABS_EXPANDED;
604 if (r->fh != fh_inline_file ()
605 && (fh_get_mode (r->fh) != FH_MODE_TEXT
606 || fh_get_tab_width (r->fh) == 0
607 || ds_find_byte (&r->line, '\t') == SIZE_MAX))
610 /* Expand tabs from r->line into r->scratch, and figure out
611 new value for r->pos. */
612 tab_width = fh_get_tab_width (r->fh);
613 ds_clear (&r->scratch);
615 for (ofs = 0; ofs < ds_length (&r->line); ofs++)
620 new_pos = ds_length (&r->scratch);
622 c = ds_data (&r->line)[ofs];
624 ds_put_byte (&r->scratch, c);
628 ds_put_byte (&r->scratch, ' ');
629 while (ds_length (&r->scratch) % tab_width != 0);
632 if (new_pos == SIZE_MAX)
634 /* Maintain the same relationship between position and line
635 length that we had before. DATA LIST uses a
636 beyond-the-end position to deal with an empty field at
637 the end of the line. */
638 assert (r->pos >= ds_length (&r->line));
639 new_pos = (r->pos - ds_length (&r->line)) + ds_length (&r->scratch);
642 /* Swap r->line and r->scratch and set new r->pos. */
643 ds_swap (&r->line, &r->scratch);
647 /* Returns the character encoding of data read from READER. */
649 dfm_reader_get_encoding (const struct dfm_reader *reader)
651 return reader->encoding;
654 /* Causes dfm_get_record() or dfm_get_whole_record() to read in
655 the next record the next time it is executed on file
658 dfm_forward_record (struct dfm_reader *r)
660 r->flags |= DFM_ADVANCE;
663 /* Cancels the effect of any previous dfm_forward_record() executed
664 on reader R. Sets the current line to begin in the 1-based
667 dfm_reread_record (struct dfm_reader *r, size_t column)
669 r->flags &= ~DFM_ADVANCE;
670 r->pos = MAX (column, 1) - 1;
673 /* Sets the current line to begin COLUMNS characters following
674 the current start. */
676 dfm_forward_columns (struct dfm_reader *r, size_t columns)
678 dfm_reread_record (r, (r->pos + 1) + columns);
681 /* Returns the 1-based column to which the line pointer in reader R
682 is set. Unless dfm_reread_record() or dfm_forward_columns()
683 have been called, this is 1. */
685 dfm_column_start (const struct dfm_reader *r)
690 /* Returns the number of columns that the current position in R is
691 beyond the end of the current line. At or before end-of-line, this is
692 0; one column after end-of-line, this is 1; and so on. */
694 dfm_columns_past_end (const struct dfm_reader *r)
696 return r->pos < ds_length (&r->line) ? 0 : r->pos - ds_length (&r->line);
699 /* Returns the 1-based column within the current line that P
702 dfm_get_column (const struct dfm_reader *r, const char *p)
704 return ds_pointer_to_position (&r->line, p) + 1;
708 dfm_get_file_name (const struct dfm_reader *r)
710 enum fh_referent referent = fh_get_referent (r->fh);
711 return (referent == FH_REF_FILE ? fh_get_file_name (r->fh)
712 : referent == FH_REF_INLINE ? lex_get_file_name (r->lexer)
717 dfm_get_line_number (const struct dfm_reader *r)
719 switch (fh_get_referent (r->fh))
722 return r->line_number;
725 return lex_ofs_start_point (r->lexer, lex_ofs (r->lexer)).line;
733 /* BEGIN DATA...END DATA procedure. */
735 /* Perform BEGIN DATA...END DATA as a procedure in itself. */
737 cmd_begin_data (struct lexer *lexer, struct dataset *ds)
739 struct dfm_reader *r;
742 if (!fh_is_locked (fh_inline_file (), FH_ACC_READ))
744 lex_ofs_error (lexer, 0, lex_ofs (lexer) - 1,
745 _("This command is not valid here since the current "
746 "input program does not access the inline file."));
747 return CMD_CASCADING_FAILURE;
749 lex_match (lexer, T_ENDCMD);
751 /* Open inline file. */
752 r = dfm_open_reader (fh_inline_file (), lexer, NULL);
753 r->flags |= DFM_SAW_BEGIN_DATA;
754 r->flags &= ~DFM_CONSUME;
756 /* Input procedure reads from inline file. */
757 casereader_destroy (proc_open (ds));
758 ok = proc_commit (ds);
759 dfm_close_reader (r);
761 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;