1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2004, 2006 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <language/data-io/data-reader.h>
28 #include <data/casereader.h>
29 #include <data/file-handle-def.h>
30 #include <data/file-name.h>
31 #include <data/procedure.h>
32 #include <language/command.h>
33 #include <language/data-io/file-handle.h>
34 #include <language/lexer/lexer.h>
35 #include <language/prompt.h>
36 #include <libpspp/assertion.h>
37 #include <libpspp/integer-format.h>
38 #include <libpspp/message.h>
39 #include <libpspp/str.h>
45 #define _(msgid) gettext (msgid)
46 #define N_(msgid) (msgid)
48 /* Flags for DFM readers. */
51 DFM_ADVANCE = 002, /* Read next line on dfm_get_record() call? */
52 DFM_SAW_BEGIN_DATA = 004, /* For inline_file only, whether we've
53 already read a BEGIN DATA line. */
54 DFM_TABS_EXPANDED = 010, /* Tabs have been expanded. */
57 /* Data file reader. */
60 struct file_handle *fh; /* File handle. */
61 struct fh_lock *lock; /* Mutual exclusion lock for file. */
62 struct msg_locator where; /* Current location in data file. */
63 struct string line; /* Current line. */
64 struct string scratch; /* Extra line buffer. */
65 enum dfm_reader_flags flags; /* Zero or more of DFM_*. */
66 FILE *file; /* Associated file. */
67 off_t file_size; /* File size, or -1 if unavailable. */
68 size_t pos; /* Offset in line of current character. */
69 unsigned eof_cnt; /* # of attempts to advance past EOF. */
70 struct lexer *lexer; /* The lexer reading the file */
72 /* For FH_MODE_360_VARIABLE and FH_MODE_360_SPANNED files only. */
73 size_t block_left; /* Bytes left in current block. */
76 /* Closes reader R opened by dfm_open_reader(). */
78 dfm_close_reader (struct dfm_reader *r)
83 if (fh_unlock (r->lock))
85 /* File is still locked by another client. */
89 /* This was the last client, so close the underlying file. */
90 if (fh_get_referent (r->fh) != FH_REF_INLINE)
91 fn_close (fh_get_file_name (r->fh), r->file);
94 /* Skip any remaining data on the inline file. */
95 if (r->flags & DFM_SAW_BEGIN_DATA)
97 dfm_reread_record (r, 0);
99 dfm_forward_record (r);
104 ds_destroy (&r->line);
105 ds_destroy (&r->scratch);
109 /* Opens the file designated by file handle FH for reading as a
110 data file. Providing fh_inline_file() for FH designates the
111 "inline file", that is, data included inline in the command
112 file between BEGIN DATA and END DATA. Returns a reader if
113 successful, or a null pointer otherwise. */
115 dfm_open_reader (struct file_handle *fh, struct lexer *lexer)
117 struct dfm_reader *r;
118 struct fh_lock *lock;
120 /* TRANSLATORS: this fragment will be interpolated into
121 messages in fh_lock() that identify types of files. */
122 lock = fh_lock (fh, FH_REF_FILE | FH_REF_INLINE, N_("data file"),
127 r = fh_lock_get_aux (lock);
131 r = xmalloc (sizeof *r);
135 ds_init_empty (&r->line);
136 ds_init_empty (&r->scratch);
137 r->flags = DFM_ADVANCE;
140 if (fh_get_referent (fh) != FH_REF_INLINE)
143 r->where.file_name = fh_get_file_name (fh);
144 r->where.line_number = 0;
145 r->file = fn_open (fh_get_file_name (fh),
146 fh_get_mode (fh) == FH_MODE_TEXT ? "r" : "rb");
149 msg (ME, _("Could not open \"%s\" for reading as a data file: %s."),
150 fh_get_file_name (r->fh), strerror (errno));
156 r->file_size = fstat (fileno (r->file), &s) == 0 ? s.st_size : -1;
160 fh_lock_set_aux (lock, r);
165 /* Returns true if an I/O error occurred on READER, false otherwise. */
167 dfm_reader_error (const struct dfm_reader *r)
169 return fh_get_referent (r->fh) == FH_REF_FILE && ferror (r->file);
172 /* Reads a record from the inline file into R.
173 Returns true if successful, false on failure. */
175 read_inline_record (struct dfm_reader *r)
177 if ((r->flags & DFM_SAW_BEGIN_DATA) == 0)
179 r->flags |= DFM_SAW_BEGIN_DATA;
181 while (lex_token (r->lexer) == '.')
183 if (!lex_force_match_id (r->lexer, "BEGIN") || !lex_force_match_id (r->lexer, "DATA"))
185 prompt_set_style (PROMPT_DATA);
188 if (!lex_get_line_raw (r->lexer))
190 msg (SE, _("Unexpected end-of-file while reading data in BEGIN "
191 "DATA. This probably indicates "
192 "a missing or misformatted END DATA command. "
193 "END DATA must appear by itself on a single line "
194 "with exactly one space between words."));
198 if (ds_length (lex_entire_line_ds (r->lexer) ) >= 8
199 && !strncasecmp (lex_entire_line (r->lexer), "end data", 8))
201 lex_discard_line (r->lexer);
205 ds_assign_string (&r->line, lex_entire_line_ds (r->lexer) );
210 /* Report a read error or unexpected end-of-file condition on R. */
212 read_error (struct dfm_reader *r)
214 if (ferror (r->file))
215 msg (ME, _("Error reading file %s: %s."),
216 fh_get_name (r->fh), strerror (errno));
217 else if (feof (r->file))
218 msg (ME, _("Unexpected end of file reading %s."), fh_get_name (r->fh));
223 /* Report a partial read at end of file reading R. */
225 partial_record (struct dfm_reader *r)
227 msg (ME, _("Unexpected end of file in partial record reading %s."),
228 fh_get_name (r->fh));
231 /* Tries to read SIZE bytes from R into BUFFER. Returns 1 if
232 successful, 0 if end of file was reached before any bytes
233 could be read, and -1 if some bytes were read but fewer than
234 SIZE due to end of file or an error mid-read. In the latter
235 case, reports an error. */
237 try_to_read_fully (struct dfm_reader *r, void *buffer, size_t size)
239 size_t bytes_read = fread (buffer, 1, size, r->file);
240 if (bytes_read == size)
242 else if (bytes_read == 0)
251 /* Type of a descriptor word. */
258 /* Reads a block descriptor word or record descriptor word
259 (according to TYPE) from R. Returns 1 if successful, 0 if
260 end of file was reached before any bytes could be read, -1 if
261 an error occurred. Reports an error in the latter case.
263 If successful, stores the number of remaining bytes in the
264 block or record (that is, the block or record length, minus
265 the 4 bytes in the BDW or RDW itself) into *REMAINING_SIZE.
266 If SEGMENT is nonnull, also stores the segment control
267 character (SCC) into *SEGMENT. */
269 read_descriptor_word (struct dfm_reader *r, enum descriptor_type type,
270 size_t *remaining_size, int *segment)
272 uint8_t raw_descriptor[4];
275 status = try_to_read_fully (r, raw_descriptor, sizeof raw_descriptor);
279 *remaining_size = (raw_descriptor[0] << 8) | raw_descriptor[1];
281 *segment = raw_descriptor[2];
283 if (*remaining_size < 4)
287 ? _("Corrupt block descriptor word at offset 0x%lx in %s.")
288 : _("Corrupt record descriptor word at offset 0x%lx in %s.")),
289 (long) ftello (r->file) - 4, fh_get_name (r->fh));
293 *remaining_size -= 4;
297 /* Reports that reader R has read a corrupt record size. */
299 corrupt_size (struct dfm_reader *r)
301 msg (ME, _("Corrupt record size at offset 0x%lx in %s."),
302 (long) ftello (r->file) - 4, fh_get_name (r->fh));
305 /* Reads a 32-bit little-endian signed number from R and stores
306 its value into *SIZE_OUT. Returns 1 if successful, 0 if end
307 of file was reached before any bytes could be read, -1 if an
308 error occurred. Reports an error in the latter case. Numbers
309 less than 0 are considered errors. */
311 read_size (struct dfm_reader *r, size_t *size_out)
316 status = try_to_read_fully (r, &size, sizeof size);
320 integer_convert (INTEGER_LSB_FIRST, &size, INTEGER_NATIVE, &size,
332 /* Reads a record from a disk file into R.
333 Returns true if successful, false on error or at end of file. */
335 read_file_record (struct dfm_reader *r)
337 assert (r->fh != fh_inline_file ());
340 switch (fh_get_mode (r->fh))
343 if (ds_read_line (&r->line, r->file, SIZE_MAX))
345 ds_chomp (&r->line, '\n');
350 if (ferror (r->file))
357 if (ds_read_stream (&r->line, 1, fh_get_record_width (r->fh), r->file))
361 if (ferror (r->file))
363 else if (!ds_is_empty (&r->line))
369 case FH_MODE_VARIABLE:
372 size_t trailing_size;
375 /* Read leading record size. */
376 status = read_size (r, &leading_size);
380 /* Read record data. */
381 if (!ds_read_stream (&r->line, leading_size, 1, r->file))
383 if (ferror (r->file))
390 /* Read trailing record size and check that it's the same
391 as the leading record size. */
392 status = read_size (r, &trailing_size);
399 if (leading_size != trailing_size)
408 case FH_MODE_360_VARIABLE:
409 case FH_MODE_360_SPANNED:
416 /* If we've exhausted our current block, start another
417 one by reading the new block descriptor word. */
418 if (r->block_left == 0)
420 status = read_descriptor_word (r, BLOCK, &r->block_left, NULL);
423 else if (status == 0)
424 return !ds_is_empty (&r->line);
427 /* Read record descriptor. */
428 if (r->block_left < 4)
434 status = read_descriptor_word (r, RECORD, &record_size, &segment);
441 if (record_size > r->block_left)
443 msg (ME, _("Record exceeds remaining block length."));
447 /* Read record data. */
448 if (!ds_read_stream (&r->line, record_size, 1, r->file))
450 if (ferror (r->file))
456 r->block_left -= record_size;
458 /* In variable mode, read only a single record.
459 In spanned mode, a segment value of 0 should
460 designate a whole record without spanning, 1 the
461 first segment in a record, 2 the last segment in a
462 record, and 3 an intermediate segment in a record.
463 For compatibility, though, we actually pay attention
464 only to whether the segment value is even or odd. */
465 if (fh_get_mode (r->fh) == FH_MODE_360_VARIABLE
466 || (segment & 1) == 0)
474 /* Reads a record from R, setting the current position to the
475 start of the line. If an error occurs or end-of-file is
476 encountered, the current line is set to null. */
478 read_record (struct dfm_reader *r)
480 if (fh_get_referent (r->fh) == FH_REF_FILE)
482 bool ok = read_file_record (r);
484 r->where.line_number++;
488 return read_inline_record (r);
491 /* Returns the number of attempts, thus far, to advance past
492 end-of-file in reader R. Reads forward in R's file, if
493 necessary, to find out.
495 Normally, the user stops attempting to read from the file the
496 first time EOF is reached (a return value of 1). If the user
497 tries to read past EOF again (a return value of 2 or more),
498 an error message is issued, and the caller should more
499 forcibly abort to avoid an infinite loop. */
501 dfm_eof (struct dfm_reader *r)
503 if (r->flags & DFM_ADVANCE)
505 r->flags &= ~DFM_ADVANCE;
507 if (r->eof_cnt == 0 && read_record (r) )
516 if (r->fh != fh_inline_file ())
517 msg (ME, _("Attempt to read beyond end-of-file on file %s."),
518 fh_get_name (r->fh));
520 msg (ME, _("Attempt to read beyond END DATA."));
527 /* Returns the current record in the file corresponding to
528 HANDLE. Aborts if reading from the file is necessary or at
529 end of file, so call dfm_eof() first. */
531 dfm_get_record (struct dfm_reader *r)
533 assert ((r->flags & DFM_ADVANCE) == 0);
534 assert (r->eof_cnt == 0);
536 return ds_substr (&r->line, r->pos, SIZE_MAX);
539 /* Expands tabs in the current line into the equivalent number of
540 spaces, if appropriate for this kind of file. Aborts if
541 reading from the file is necessary or at end of file, so call
544 dfm_expand_tabs (struct dfm_reader *r)
546 size_t ofs, new_pos, tab_width;
548 assert ((r->flags & DFM_ADVANCE) == 0);
549 assert (r->eof_cnt == 0);
551 if (r->flags & DFM_TABS_EXPANDED)
553 r->flags |= DFM_TABS_EXPANDED;
555 if (r->fh != fh_inline_file ()
556 && (fh_get_mode (r->fh) != FH_MODE_TEXT
557 || fh_get_tab_width (r->fh) == 0
558 || ds_find_char (&r->line, '\t') == SIZE_MAX))
561 /* Expand tabs from r->line into r->scratch, and figure out
562 new value for r->pos. */
563 tab_width = fh_get_tab_width (r->fh);
564 ds_clear (&r->scratch);
566 for (ofs = 0; ofs < ds_length (&r->line); ofs++)
571 new_pos = ds_length (&r->scratch);
573 c = ds_data (&r->line)[ofs];
575 ds_put_char (&r->scratch, c);
579 ds_put_char (&r->scratch, ' ');
580 while (ds_length (&r->scratch) % tab_width != 0);
583 if (new_pos == SIZE_MAX)
585 /* Maintain the same relationship between position and line
586 length that we had before. DATA LIST uses a
587 beyond-the-end position to deal with an empty field at
588 the end of the line. */
589 assert (r->pos >= ds_length (&r->line));
590 new_pos = (r->pos - ds_length (&r->line)) + ds_length (&r->scratch);
593 /* Swap r->line and r->scratch and set new r->pos. */
594 ds_swap (&r->line, &r->scratch);
598 /* Returns the legacy character encoding of data read from READER. */
600 dfm_reader_get_legacy_encoding (const struct dfm_reader *reader)
602 return fh_get_legacy_encoding (reader->fh);
605 /* Returns a number between 0 and 100 that approximates the
606 percentage of the data in READER that has already been read,
607 or -1 if this value cannot be estimated.
609 ftello is slow in glibc (it flushes the read buffer), so don't
610 call this function unless you need to. */
612 dfm_get_percent_read (const struct dfm_reader *reader)
614 if (reader->file_size >= 0)
616 off_t position = ftello (reader->file);
619 double p = 100.0 * position / reader->file_size;
620 return p < 0 ? 0 : p > 100 ? 100 : p;
626 /* Causes dfm_get_record() or dfm_get_whole_record() to read in
627 the next record the next time it is executed on file
630 dfm_forward_record (struct dfm_reader *r)
632 r->flags |= DFM_ADVANCE;
635 /* Cancels the effect of any previous dfm_forward_record() executed
636 on file HANDLE. Sets the current line to begin in the 1-based
639 dfm_reread_record (struct dfm_reader *r, size_t column)
641 r->flags &= ~DFM_ADVANCE;
642 r->pos = MAX (column, 1) - 1;
645 /* Sets the current line to begin COLUMNS characters following
646 the current start. */
648 dfm_forward_columns (struct dfm_reader *r, size_t columns)
650 dfm_reread_record (r, (r->pos + 1) + columns);
653 /* Returns the 1-based column to which the line pointer in R
654 is set. Unless dfm_reread_record() or dfm_forward_columns()
655 have been called, this is 1. */
657 dfm_column_start (const struct dfm_reader *r)
662 /* Returns the number of columns we are currently beyond the end
663 of the line. At or before end-of-line, this is 0; one column
664 after end-of-line, this is 1; and so on. */
666 dfm_columns_past_end (const struct dfm_reader *r)
668 return r->pos < ds_length (&r->line) ? 0 : ds_length (&r->line) - r->pos;
671 /* Returns the 1-based column within the current line that P
674 dfm_get_column (const struct dfm_reader *r, const char *p)
676 return ds_pointer_to_position (&r->line, p) + 1;
679 /* Pushes the file name and line number on the fn/ln stack. */
681 dfm_push (struct dfm_reader *r)
683 if (r->fh != fh_inline_file ())
684 msg_push_msg_locator (&r->where);
687 /* Pops the file name and line number from the fn/ln stack. */
689 dfm_pop (struct dfm_reader *r)
691 if (r->fh != fh_inline_file ())
692 msg_pop_msg_locator (&r->where);
695 /* BEGIN DATA...END DATA procedure. */
697 /* Perform BEGIN DATA...END DATA as a procedure in itself. */
699 cmd_begin_data (struct lexer *lexer, struct dataset *ds)
701 struct dfm_reader *r;
704 if (!fh_is_locked (fh_inline_file (), FH_ACC_READ))
706 msg (SE, _("This command is not valid here since the current "
707 "input program does not access the inline file."));
708 return CMD_CASCADING_FAILURE;
711 /* Open inline file. */
712 r = dfm_open_reader (fh_inline_file (), lexer);
713 r->flags |= DFM_SAW_BEGIN_DATA;
715 /* Input procedure reads from inline file. */
716 prompt_set_style (PROMPT_DATA);
717 casereader_destroy (proc_open (ds));
718 ok = proc_commit (ds);
719 dfm_close_reader (r);
721 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;