1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2004, 2006 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <language/data-io/data-reader.h>
27 #include <data/casereader.h>
28 #include <data/file-handle-def.h>
29 #include <data/file-name.h>
30 #include <data/procedure.h>
31 #include <language/command.h>
32 #include <language/data-io/file-handle.h>
33 #include <language/lexer/lexer.h>
34 #include <language/prompt.h>
35 #include <libpspp/assertion.h>
36 #include <libpspp/integer-format.h>
37 #include <libpspp/message.h>
38 #include <libpspp/str.h>
44 #define _(msgid) gettext (msgid)
45 #define N_(msgid) (msgid)
47 /* Flags for DFM readers.  The values are octal bit masks that are
   OR'ed together in the `flags' member of struct dfm_reader. */
50 DFM_ADVANCE = 002, /* Read the next record on the next dfm_eof() call?
   Set by dfm_forward_record(), cleared by dfm_eof() and
   dfm_reread_record(). */
51 DFM_SAW_BEGIN_DATA = 004, /* For inline_file only, whether we've
52 already read a BEGIN DATA line. */
53 DFM_TABS_EXPANDED = 010, /* Tabs have been expanded. */
56 /* Data file reader.  One of these is attached to a file handle's lock
   (see dfm_open_reader()) and carries the current record together with
   the read position inside it. */
59 struct file_handle *fh; /* File handle. */
60 struct fh_lock *lock; /* Mutual exclusion lock for file. */
61 struct msg_locator where; /* Current location in data file. */
62 struct string line; /* Current line. */
63 struct string scratch; /* Extra line buffer (used by dfm_expand_tabs()). */
64 enum dfm_reader_flags flags; /* Zero or more of DFM_*. */
65 FILE *file; /* Associated file. */
66 size_t pos; /* Offset in line of current character (0-based;
   dfm_reread_record() converts from a 1-based column). */
67 unsigned eof_cnt; /* # of attempts to advance past EOF. */
68 struct lexer *lexer; /* The lexer reading the file */
70 /* For FH_MODE_360_VARIABLE and FH_MODE_360_SPANNED files only. */
71 size_t block_left; /* Bytes left in current block. */
74 /* Closes reader R opened by dfm_open_reader().

   R's lock is released with fh_unlock() first; a nonzero result is
   treated as "another client still holds the file".  For the last
   client, a handle that does not refer to the inline file has its
   stream closed with fn_close().  For the inline file, if BEGIN DATA
   was already seen, the reader is repositioned with
   dfm_reread_record() and advanced with dfm_forward_record() so that
   the remaining inline data is skipped.  The line and scratch buffers
   are destroyed at the end. */
76 dfm_close_reader (struct dfm_reader *r)
81 if (fh_unlock (r->lock))
83 /* File is still locked by another client. */
87 /* This was the last client, so close the underlying file. */
88 if (fh_get_referent (r->fh) != FH_REF_INLINE)
89 fn_close (fh_get_file_name (r->fh), r->file);
92 /* Skip any remaining data on the inline file. */
93 if (r->flags & DFM_SAW_BEGIN_DATA)
95 dfm_reread_record (r, 0);
97 dfm_forward_record (r);
/* Release the two string buffers owned by the reader. */
102 ds_destroy (&r->line);
103 ds_destroy (&r->scratch);
107 /* Opens the file designated by file handle FH for reading as a
108 data file. Providing fh_inline_file() for FH designates the
109 "inline file", that is, data included inline in the command
110 file between BEGIN DATA and END DATA. Returns a reader if
111 successful, or a null pointer otherwise.

   LEXER is the lexer from which inline data is later read (see
   read_inline_record()).  A reader already stored on the handle's
   lock is looked up with fh_lock_get_aux(); a newly created reader is
   stored there with fh_lock_set_aux(). */
113 dfm_open_reader (struct file_handle *fh, struct lexer *lexer)
115 struct dfm_reader *r;
116 struct fh_lock *lock;
118 /* TRANSLATORS: this fragment will be interpolated into
119 messages in fh_lock() that identify types of files. */
120 lock = fh_lock (fh, FH_REF_FILE | FH_REF_INLINE, N_("data file"),
/* Look for a reader that an earlier open attached to this lock. */
125 r = fh_lock_get_aux (lock);
129 r = xmalloc (sizeof *r);
133 ds_init_empty (&r->line);
134 ds_init_empty (&r->scratch);
/* Start in "advance" state so that the first dfm_eof() call reads the
   first record. */
135 r->flags = DFM_ADVANCE;
/* Only handles that do not refer to the inline file have a stream of
   their own to open. */
138 if (fh_get_referent (fh) != FH_REF_INLINE)
140 r->where.file_name = fh_get_file_name (fh);
141 r->where.line_number = 0;
/* Text-mode handles are opened with "r", all other modes with "rb". */
142 r->file = fn_open (fh_get_file_name (fh),
143 fh_get_mode (fh) == FH_MODE_TEXT ? "r" : "rb");
146 msg (ME, _("Could not open \"%s\" for reading as a data file: %s."),
147 fh_get_file_name (r->fh), strerror (errno));
/* Remember the reader on the lock for later opens of the same handle. */
154 fh_lock_set_aux (lock, r);
159 /* Returns true if an I/O error occurred on READER, false otherwise. */
161 dfm_reader_error (const struct dfm_reader *r)
163 return fh_get_referent (r->fh) == FH_REF_FILE && ferror (r->file);
166 /* Reads a record from the inline file into R.
167 Returns true if successful, false on failure.

   On the first call for a reader (DFM_SAW_BEGIN_DATA not yet set) the
   lexer is first required to match the identifiers BEGIN and DATA and
   the prompt style is switched to PROMPT_DATA.  Each call then fetches
   one raw line from the lexer.  A line whose first eight characters
   match "end data" (ignoring case) is discarded rather than stored;
   any other line is copied into R's line buffer. */
169 read_inline_record (struct dfm_reader *r)
171 if ((r->flags & DFM_SAW_BEGIN_DATA) == 0)
173 r->flags |= DFM_SAW_BEGIN_DATA;
/* Loop while the current token is a command terminator ('.'). */
175 while (lex_token (r->lexer) == '.')
177 if (!lex_force_match_id (r->lexer, "BEGIN") || !lex_force_match_id (r->lexer, "DATA"))
179 prompt_set_style (PROMPT_DATA);
/* Fetch the next line verbatim; failure here means the input ended
   before END DATA was seen. */
182 if (!lex_get_line_raw (r->lexer))
184 msg (SE, _("Unexpected end-of-file while reading data in BEGIN "
185 "DATA. This probably indicates "
186 "a missing or misformatted END DATA command. "
187 "END DATA must appear by itself on a single line "
188 "with exactly one space between words."));
/* Only the first 8 characters are compared, case-insensitively, so
   this is a prefix match on "end data". */
192 if (ds_length (lex_entire_line_ds (r->lexer) ) >= 8
193 && !strncasecmp (lex_entire_line (r->lexer), "end data", 8))
195 lex_discard_line (r->lexer);
199 ds_assign_string (&r->line, lex_entire_line_ds (r->lexer) );
204 /* Report a read error or unexpected end-of-file condition on R.

   The stream's error indicator is consulted first, so that a genuine
   I/O error (reported together with strerror (errno)) takes precedence
   over a plain end-of-file condition.  Both messages name the file
   handle via fh_get_name(). */
206 read_error (struct dfm_reader *r)
208 if (ferror (r->file))
209 msg (ME, _("Error reading file %s: %s."),
210 fh_get_name (r->fh), strerror (errno));
211 else if (feof (r->file))
212 msg (ME, _("Unexpected end of file reading %s."), fh_get_name (r->fh));
217 /* Report a partial read at end of file reading R. */
219 partial_record (struct dfm_reader *r)
221 msg (ME, _("Unexpected end of file in partial record reading %s."),
222 fh_get_name (r->fh));
225 /* Tries to read SIZE bytes from R into BUFFER. Returns 1 if
226 successful, 0 if end of file was reached before any bytes
227 could be read, and -1 if some bytes were read but fewer than
228 SIZE due to end of file or an error mid-read. In the latter
229 case, reports an error. */
231 try_to_read_fully (struct dfm_reader *r, void *buffer, size_t size)
/* The element size is 1, so fread()'s return value is a byte count. */
233 size_t bytes_read = fread (buffer, 1, size, r->file);
/* Full read. */
234 if (bytes_read == size)
/* Nothing at all could be read. */
236 else if (bytes_read == 0)
245 /* Type of a descriptor word. */
252 /* Reads a block descriptor word or record descriptor word
253 (according to TYPE) from R. Returns 1 if successful, 0 if
254 end of file was reached before any bytes could be read, -1 if
255 an error occurred. Reports an error in the latter case.
257 If successful, stores the number of remaining bytes in the
258 block or record (that is, the block or record length, minus
259 the 4 bytes in the BDW or RDW itself) into *REMAINING_SIZE.
260 If SEGMENT is nonnull, also stores the segment control
261 character (SCC) into *SEGMENT. */
263 read_descriptor_word (struct dfm_reader *r, enum descriptor_type type,
264 size_t *remaining_size, int *segment)
/* A descriptor word is 4 bytes long. */
266 uint8_t raw_descriptor[4];
269 status = try_to_read_fully (r, raw_descriptor, sizeof raw_descriptor);
/* The length occupies the first two bytes, most significant byte
   first. */
273 *remaining_size = (raw_descriptor[0] << 8) | raw_descriptor[1];
/* The third byte is the segment control character. */
275 *segment = raw_descriptor[2];
/* The stored length includes the descriptor word itself, so anything
   below 4 cannot be valid. */
277 if (*remaining_size < 4)
281 ? _("Corrupt block descriptor word at offset 0x%lx in %s.")
282 : _("Corrupt record descriptor word at offset 0x%lx in %s.")),
/* The stream is already past the descriptor; step back 4 bytes to
   report where it started. */
283 (long) ftello (r->file) - 4, fh_get_name (r->fh));
/* Exclude the descriptor word from the reported size. */
287 *remaining_size -= 4;
291 /* Reports that reader R has read a corrupt record size. */
293 corrupt_size (struct dfm_reader *r)
295 msg (ME, _("Corrupt record size at offset 0x%lx in %s."),
296 (long) ftello (r->file) - 4, fh_get_name (r->fh));
299 /* Reads a 32-bit little-endian signed number from R and stores
300 its value into *SIZE_OUT. Returns 1 if successful, 0 if end
301 of file was reached before any bytes could be read, -1 if an
302 error occurred. Reports an error in the latter case. Numbers
303 less than 0 are considered errors. */
305 read_size (struct dfm_reader *r, size_t *size_out)
/* Read the raw bytes of the size field straight into SIZE. */
310 status = try_to_read_fully (r, &size, sizeof size);
/* Convert in place from least-significant-byte-first order to the
   host's native byte order. */
314 integer_convert (INTEGER_LSB_FIRST, &size, INTEGER_NATIVE, &size,
326 /* Reads a record from a disk file into R.
327 Returns true if successful, false on error or at end of file.

   Must not be called for the inline file.  How a record is delimited
   depends on the handle's mode (fh_get_mode()): a line of text, a
   fixed number of bytes (fh_get_record_width()), a length-prefixed and
   length-suffixed record (FH_MODE_VARIABLE), or descriptor-word framed
   blocks and records (FH_MODE_360_VARIABLE and FH_MODE_360_SPANNED).
   The record is stored in R's line buffer. */
329 read_file_record (struct dfm_reader *r)
331 assert (r->fh != fh_inline_file ());
334 switch (fh_get_mode (r->fh))
/* Line-oriented read: one line, with a trailing newline removed. */
337 if (ds_read_line (&r->line, r->file))
339 ds_chomp (&r->line, '\n');
344 if (ferror (r->file))
/* Fixed-width read: one element-sized byte at a time, record-width
   elements in total. */
351 if (ds_read_stream (&r->line, 1, fh_get_record_width (r->fh), r->file))
355 if (ferror (r->file))
/* No stream error but some data arrived: the record is incomplete. */
357 else if (!ds_is_empty (&r->line))
363 case FH_MODE_VARIABLE:
366 size_t trailing_size;
369 /* Read leading record size. */
370 status = read_size (r, &leading_size);
374 /* Read record data. */
375 if (!ds_read_stream (&r->line, leading_size, 1, r->file))
377 if (ferror (r->file))
384 /* Read trailing record size and check that it's the same
385 as the leading record size. */
386 status = read_size (r, &trailing_size);
393 if (leading_size != trailing_size)
402 case FH_MODE_360_VARIABLE:
403 case FH_MODE_360_SPANNED:
410 /* If we've exhausted our current block, start another
411 one by reading the new block descriptor word. */
412 if (r->block_left == 0)
414 status = read_descriptor_word (r, BLOCK, &r->block_left, NULL);
/* End of file at a block boundary: the read succeeded only if some
   record data has already been accumulated in the line buffer. */
417 else if (status == 0)
418 return !ds_is_empty (&r->line);
421 /* Read record descriptor. */
/* A record descriptor word needs 4 bytes, so fewer than that left in
   the block cannot hold one. */
422 if (r->block_left < 4)
428 status = read_descriptor_word (r, RECORD, &record_size, &segment);
435 if (record_size > r->block_left)
437 msg (ME, _("Record exceeds remaining block length."));
441 /* Read record data. */
442 if (!ds_read_stream (&r->line, record_size, 1, r->file))
444 if (ferror (r->file))
450 r->block_left -= record_size;
452 /* In variable mode, read only a single record.
453 In spanned mode, a segment value of 0 should
454 designate a whole record without spanning, 1 the
455 first segment in a record, 2 the last segment in a
456 record, and 3 an intermediate segment in a record.
457 For compatibility, though, we actually pay attention
458 only to whether the segment value is even or odd. */
459 if (fh_get_mode (r->fh) == FH_MODE_360_VARIABLE
460 || (segment & 1) == 0)
468 /* Reads a record from R, setting the current position to the
469 start of the line. If an error occurs or end-of-file is
470 encountered, the current line is set to null.

   Dispatches on what R's handle refers to: read_file_record() for a
   real file (FH_REF_FILE), read_inline_record() otherwise.  The line
   number in R's message locator is advanced on the file path only. */
472 read_record (struct dfm_reader *r)
474 if (fh_get_referent (r->fh) == FH_REF_FILE)
476 bool ok = read_file_record (r);
478 r->where.line_number++;
482 return read_inline_record (r);
485 /* Returns the number of attempts, thus far, to advance past
486 end-of-file in reader R. Reads forward in R's file, if
487 necessary, to find out.
489 Normally, the user stops attempting to read from the file the
490 first time EOF is reached (a return value of 1). If the user
491 tries to read past EOF again (a return value of 2 or more),
492 an error message is issued, and the caller should more
493 forcibly abort to avoid an infinite loop. */
495 dfm_eof (struct dfm_reader *r)
/* A pending advance request (see dfm_forward_record()) is consumed
   here; this is the point where the next record is actually read. */
497 if (r->flags & DFM_ADVANCE)
499 r->flags &= ~DFM_ADVANCE;
/* Only try to read while end-of-file has never been hit before. */
501 if (r->eof_cnt == 0 && read_record (r) )
/* The message wording depends on whether R reads a file or the inline
   data. */
510 if (r->fh != fh_inline_file ())
511 msg (ME, _("Attempt to read beyond end-of-file on file %s."),
512 fh_get_name (r->fh));
514 msg (ME, _("Attempt to read beyond END DATA."));
521 /* Returns the current record in the file read by reader R,
522 starting at R's current position.  Aborts if reading from the file
523 is necessary or at end of file, so call dfm_eof() first. */
525 dfm_get_record (struct dfm_reader *r)
/* No advance may be pending and end-of-file must not have been hit;
   dfm_eof() establishes both. */
527 assert ((r->flags & DFM_ADVANCE) == 0);
528 assert (r->eof_cnt == 0);
/* Request everything from offset r->pos onward (length SIZE_MAX). */
530 return ds_substr (&r->line, r->pos, SIZE_MAX);
533 /* Expands tabs in the current line into the equivalent number of
534 spaces, if appropriate for this kind of file. Aborts if
535 reading from the file is necessary or at end of file, so call
   dfm_eof() first.

   The DFM_TABS_EXPANDED flag is tested and then set near the top, so
   the flag records that expansion was already attempted for the
   current line.  R's position is translated so that it designates the
   same character in the expanded line. */
538 dfm_expand_tabs (struct dfm_reader *r)
540 size_t ofs, new_pos, tab_width;
542 assert ((r->flags & DFM_ADVANCE) == 0);
543 assert (r->eof_cnt == 0);
545 if (r->flags & DFM_TABS_EXPANDED)
547 r->flags |= DFM_TABS_EXPANDED;
/* For handles other than the inline file, there is nothing to do
   unless the file is in text mode, has a nonzero tab width, and the
   line actually contains a tab.
   NOTE(review): the tab-width test is only applied to non-inline
   handles, while tab_width is used as a divisor below -- confirm that
   the inline file can never report a tab width of 0. */
549 if (r->fh != fh_inline_file ()
550 && (fh_get_mode (r->fh) != FH_MODE_TEXT
551 || fh_get_tab_width (r->fh) == 0
552 || ds_find_char (&r->line, '\t') == SIZE_MAX))
555 /* Expand tabs from r->line into r->scratch, and figure out
556 new value for r->pos. */
557 tab_width = fh_get_tab_width (r->fh);
558 ds_clear (&r->scratch);
560 for (ofs = 0; ofs < ds_length (&r->line); ofs++)
/* Record the output length at this point as the candidate new
   position. */
565 new_pos = ds_length (&r->scratch);
567 c = ds_data (&r->line)[ofs];
569 ds_put_char (&r->scratch, c);
/* Pad with spaces up to the next multiple of tab_width. */
573 ds_put_char (&r->scratch, ' ');
574 while (ds_length (&r->scratch) % tab_width != 0);
/* SIZE_MAX means no position inside the line was matched. */
577 if (new_pos == SIZE_MAX)
579 /* Maintain the same relationship between position and line
580 length that we had before. DATA LIST uses a
581 beyond-the-end position to deal with an empty field at
582 the end of the line. */
583 assert (r->pos >= ds_length (&r->line));
584 new_pos = (r->pos - ds_length (&r->line)) + ds_length (&r->scratch);
587 /* Swap r->line and r->scratch and set new r->pos. */
588 ds_swap (&r->line, &r->scratch);
592 /* Returns the legacy character encoding of data read from READER.
   This simply forwards to fh_get_legacy_encoding() on READER's file
   handle. */
594 dfm_reader_get_legacy_encoding (const struct dfm_reader *reader)
596 return fh_get_legacy_encoding (reader->fh);
599 /* Causes dfm_get_record() or dfm_get_whole_record() to read in
600 the next record the next time it is executed on file
603 dfm_forward_record (struct dfm_reader *r)
605 r->flags |= DFM_ADVANCE;
608 /* Cancels the effect of any previous dfm_fwd_record() executed
609 on file HANDLE. Sets the current line to begin in the 1-based
612 dfm_reread_record (struct dfm_reader *r, size_t column)
614 r->flags &= ~DFM_ADVANCE;
615 r->pos = MAX (column, 1) - 1;
618 /* Sets the current line to begin COLUMNS characters following
619 the current start. */
621 dfm_forward_columns (struct dfm_reader *r, size_t columns)
623 dfm_reread_record (r, (r->pos + 1) + columns);
626 /* Returns the 1-based column to which the line pointer in reader R
627 is set. Unless dfm_reread_record() or dfm_forward_columns()
628 have been called, this is 1. */
630 dfm_column_start (const struct dfm_reader *r)
635 /* Returns the number of columns we are currently beyond the end
636 of the line. At or before end-of-line, this is 0; one column
637 after end-of-line, this is 1; and so on. */
639 dfm_columns_past_end (const struct dfm_reader *r)
641 return r->pos < ds_length (&r->line) ? 0 : ds_length (&r->line) - r->pos;
644 /* Returns the 1-based column within the current line that P
   designates.  P is converted to a 0-based position with
   ds_pointer_to_position() on R's current line, then 1 is added.
   NOTE(review): presumably P must point into R's current line --
   confirm against ds_pointer_to_position(). */
647 dfm_get_column (const struct dfm_reader *r, const char *p)
649 return ds_pointer_to_position (&r->line, p) + 1;
652 /* Pushes the file name and line number on the fn/ln stack. */
654 dfm_push (struct dfm_reader *r)
656 if (r->fh != fh_inline_file ())
657 msg_push_msg_locator (&r->where);
660 /* Pops the file name and line number from the fn/ln stack. */
662 dfm_pop (struct dfm_reader *r)
664 if (r->fh != fh_inline_file ())
665 msg_pop_msg_locator (&r->where);
668 /* BEGIN DATA...END DATA procedure. */
670 /* Perform BEGIN DATA...END DATA as a procedure in itself. */
672 cmd_begin_data (struct lexer *lexer, struct dataset *ds)
674 struct dfm_reader *r;
677 if (!fh_is_locked (fh_inline_file (), FH_ACC_READ))
679 msg (SE, _("This command is not valid here since the current "
680 "input program does not access the inline file."));
681 return CMD_CASCADING_FAILURE;
684 /* Open inline file. */
685 r = dfm_open_reader (fh_inline_file (), lexer);
686 r->flags |= DFM_SAW_BEGIN_DATA;
688 /* Input procedure reads from inline file. */
689 prompt_set_style (PROMPT_DATA);
690 casereader_destroy (proc_open (ds));
691 ok = proc_commit (ds);
692 dfm_close_reader (r);
694 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;