1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-2004, 2006 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <language/data-io/data-reader.h>
28 #include <data/casereader.h>
29 #include <data/file-handle-def.h>
30 #include <data/file-name.h>
31 #include <data/procedure.h>
32 #include <language/command.h>
33 #include <language/data-io/file-handle.h>
34 #include <language/lexer/lexer.h>
35 #include <language/prompt.h>
36 #include <libpspp/assertion.h>
37 #include <libpspp/cast.h>
38 #include <libpspp/integer-format.h>
39 #include <libpspp/message.h>
40 #include <libpspp/str.h>
46 #define _(msgid) gettext (msgid)
47 #define N_(msgid) (msgid)
49 /* Flags for DFM readers. */
52 DFM_ADVANCE = 002, /* Read next line on dfm_get_record() call? */
53 DFM_SAW_BEGIN_DATA = 004, /* For inline_file only, whether we've
54 already read a BEGIN DATA line. */
55 DFM_TABS_EXPANDED = 010, /* Tabs have been expanded. */
58 /* Data file reader. */
61 struct file_handle *fh; /* File handle. */
62 struct fh_lock *lock; /* Mutual exclusion lock for file. */
63 struct msg_locator where; /* Current location in data file. */
64 struct string line; /* Current line. */
65 struct string scratch; /* Extra line buffer. */
66 enum dfm_reader_flags flags; /* Zero or more of DFM_*. */
67 FILE *file; /* Associated file. */
68 off_t file_size; /* File size, or -1 if unavailable. */
69 size_t pos; /* Offset in line of current character. */
70 unsigned eof_cnt; /* # of attempts to advance past EOF. */
71 struct lexer *lexer; /* The lexer reading the file */
73 /* For FH_MODE_360_VARIABLE and FH_MODE_360_SPANNED files only. */
74 size_t block_left; /* Bytes left in current block. */
77 /* Closes reader R opened by dfm_open_reader(). */
79 dfm_close_reader (struct dfm_reader *r)
84 if (fh_unlock (r->lock))
86 /* File is still locked by another client. */
90 /* This was the last client, so close the underlying file. */
91 if (fh_get_referent (r->fh) != FH_REF_INLINE)
92 fn_close (fh_get_file_name (r->fh), r->file);
95 /* Skip any remaining data on the inline file. */
96 if (r->flags & DFM_SAW_BEGIN_DATA)
98 dfm_reread_record (r, 0);
100 dfm_forward_record (r);
105 ds_destroy (&r->line);
106 ds_destroy (&r->scratch);
110 /* Opens the file designated by file handle FH for reading as a
111 data file. Providing fh_inline_file() for FH designates the
112 "inline file", that is, data included inline in the command
113 file between BEGIN DATA and END DATA. Returns a reader if
114 successful, or a null pointer otherwise. */
116 dfm_open_reader (struct file_handle *fh, struct lexer *lexer)
118 struct dfm_reader *r;
119 struct fh_lock *lock;
121 /* TRANSLATORS: this fragment will be interpolated into
122 messages in fh_lock() that identify types of files. */
123 lock = fh_lock (fh, FH_REF_FILE | FH_REF_INLINE, N_("data file"),
128 r = fh_lock_get_aux (lock);
132 r = xmalloc (sizeof *r);
136 ds_init_empty (&r->line);
137 ds_init_empty (&r->scratch);
138 r->flags = DFM_ADVANCE;
141 if (fh_get_referent (fh) != FH_REF_INLINE)
144 r->where.file_name = CONST_CAST (char *, fh_get_file_name (fh));
145 r->where.line_number = 0;
146 r->file = fn_open (fh_get_file_name (fh),
147 fh_get_mode (fh) == FH_MODE_TEXT ? "r" : "rb");
150 msg (ME, _("Could not open \"%s\" for reading as a data file: %s."),
151 fh_get_file_name (r->fh), strerror (errno));
157 r->file_size = fstat (fileno (r->file), &s) == 0 ? s.st_size : -1;
161 fh_lock_set_aux (lock, r);
166 /* Returns true if an I/O error occurred on READER, false otherwise. */
168 dfm_reader_error (const struct dfm_reader *r)
170 return fh_get_referent (r->fh) == FH_REF_FILE && ferror (r->file);
173 /* Reads a record from the inline file into R.
174 Returns true if successful, false on failure. */
176 read_inline_record (struct dfm_reader *r)
178 if ((r->flags & DFM_SAW_BEGIN_DATA) == 0)
180 r->flags |= DFM_SAW_BEGIN_DATA;
182 while (lex_token (r->lexer) == '.')
184 if (!lex_force_match_id (r->lexer, "BEGIN") || !lex_force_match_id (r->lexer, "DATA"))
186 prompt_set_style (PROMPT_DATA);
189 if (!lex_get_line_raw (r->lexer))
191 lex_discard_line (r->lexer);
192 msg (SE, _("Unexpected end-of-file while reading data in BEGIN "
193 "DATA. This probably indicates "
194 "a missing or misformatted END DATA command. "
195 "END DATA must appear by itself on a single line "
196 "with exactly one space between words."));
200 if (ds_length (lex_entire_line_ds (r->lexer) ) >= 8
201 && !strncasecmp (lex_entire_line (r->lexer), "end data", 8))
203 lex_discard_line (r->lexer);
207 ds_assign_string (&r->line, lex_entire_line_ds (r->lexer) );
212 /* Report a read error or unexpected end-of-file condition on R. */
214 read_error (struct dfm_reader *r)
216 if (ferror (r->file))
217 msg (ME, _("Error reading file %s: %s."),
218 fh_get_name (r->fh), strerror (errno));
219 else if (feof (r->file))
220 msg (ME, _("Unexpected end of file reading %s."), fh_get_name (r->fh));
225 /* Report a partial read at end of file reading R. */
227 partial_record (struct dfm_reader *r)
229 msg (ME, _("Unexpected end of file in partial record reading %s."),
230 fh_get_name (r->fh));
233 /* Tries to read SIZE bytes from R into BUFFER. Returns 1 if
234 successful, 0 if end of file was reached before any bytes
235 could be read, and -1 if some bytes were read but fewer than
236 SIZE due to end of file or an error mid-read. In the latter
237 case, reports an error. */
239 try_to_read_fully (struct dfm_reader *r, void *buffer, size_t size)
241 size_t bytes_read = fread (buffer, 1, size, r->file);
242 if (bytes_read == size)
244 else if (bytes_read == 0)
253 /* Type of a descriptor word. */
260 /* Reads a block descriptor word or record descriptor word
261 (according to TYPE) from R. Returns 1 if successful, 0 if
262 end of file was reached before any bytes could be read, -1 if
263 an error occurred. Reports an error in the latter case.
265 If successful, stores the number of remaining bytes in the
266 block or record (that is, the block or record length, minus
267 the 4 bytes in the BDW or RDW itself) into *REMAINING_SIZE.
268 If SEGMENT is nonnull, also stores the segment control
269 character (SCC) into *SEGMENT. */
271 read_descriptor_word (struct dfm_reader *r, enum descriptor_type type,
272 size_t *remaining_size, int *segment)
274 uint8_t raw_descriptor[4];
277 status = try_to_read_fully (r, raw_descriptor, sizeof raw_descriptor);
281 *remaining_size = (raw_descriptor[0] << 8) | raw_descriptor[1];
283 *segment = raw_descriptor[2];
285 if (*remaining_size < 4)
289 ? _("Corrupt block descriptor word at offset 0x%lx in %s.")
290 : _("Corrupt record descriptor word at offset 0x%lx in %s.")),
291 (long) ftello (r->file) - 4, fh_get_name (r->fh));
295 *remaining_size -= 4;
299 /* Reports that reader R has read a corrupt record size. */
301 corrupt_size (struct dfm_reader *r)
303 msg (ME, _("Corrupt record size at offset 0x%lx in %s."),
304 (long) ftello (r->file) - 4, fh_get_name (r->fh));
307 /* Reads a 32-bit little-endian signed number from R and stores
308 its value into *SIZE_OUT. Returns 1 if successful, 0 if end
309 of file was reached before any bytes could be read, -1 if an
310 error occurred. Reports an error in the latter case. Numbers
311 less than 0 are considered errors. */
313 read_size (struct dfm_reader *r, size_t *size_out)
318 status = try_to_read_fully (r, &size, sizeof size);
322 integer_convert (INTEGER_LSB_FIRST, &size, INTEGER_NATIVE, &size,
334 /* Reads a record from a disk file into R.
335 Returns true if successful, false on error or at end of file. */
337 read_file_record (struct dfm_reader *r)
339 assert (r->fh != fh_inline_file ());
342 switch (fh_get_mode (r->fh))
345 if (ds_read_line (&r->line, r->file, SIZE_MAX))
347 ds_chomp (&r->line, '\n');
352 if (ferror (r->file))
359 if (ds_read_stream (&r->line, 1, fh_get_record_width (r->fh), r->file))
363 if (ferror (r->file))
365 else if (!ds_is_empty (&r->line))
371 case FH_MODE_VARIABLE:
374 size_t trailing_size;
377 /* Read leading record size. */
378 status = read_size (r, &leading_size);
382 /* Read record data. */
383 if (!ds_read_stream (&r->line, leading_size, 1, r->file))
385 if (ferror (r->file))
392 /* Read trailing record size and check that it's the same
393 as the leading record size. */
394 status = read_size (r, &trailing_size);
401 if (leading_size != trailing_size)
410 case FH_MODE_360_VARIABLE:
411 case FH_MODE_360_SPANNED:
418 /* If we've exhausted our current block, start another
419 one by reading the new block descriptor word. */
420 if (r->block_left == 0)
422 status = read_descriptor_word (r, BLOCK, &r->block_left, NULL);
425 else if (status == 0)
426 return !ds_is_empty (&r->line);
429 /* Read record descriptor. */
430 if (r->block_left < 4)
436 status = read_descriptor_word (r, RECORD, &record_size, &segment);
443 if (record_size > r->block_left)
445 msg (ME, _("Record exceeds remaining block length."));
449 /* Read record data. */
450 if (!ds_read_stream (&r->line, record_size, 1, r->file))
452 if (ferror (r->file))
458 r->block_left -= record_size;
460 /* In variable mode, read only a single record.
461 In spanned mode, a segment value of 0 should
462 designate a whole record without spanning, 1 the
463 first segment in a record, 2 the last segment in a
464 record, and 3 an intermediate segment in a record.
465 For compatibility, though, we actually pay attention
466 only to whether the segment value is even or odd. */
467 if (fh_get_mode (r->fh) == FH_MODE_360_VARIABLE
468 || (segment & 1) == 0)
476 /* Reads a record from R, setting the current position to the
477 start of the line. If an error occurs or end-of-file is
478 encountered, the current line is set to null. */
480 read_record (struct dfm_reader *r)
482 if (fh_get_referent (r->fh) == FH_REF_FILE)
484 bool ok = read_file_record (r);
486 r->where.line_number++;
490 return read_inline_record (r);
493 /* Returns the number of attempts, thus far, to advance past
494 end-of-file in reader R. Reads forward in HANDLE's file, if
495 necessary, to find out.
497 Normally, the user stops attempting to read from the file the
498 first time EOF is reached (a return value of 1). If the user
499 tries to read past EOF again (a return value of 2 or more),
500 an error message is issued, and the caller should more
501 forcibly abort to avoid an infinite loop. */
503 dfm_eof (struct dfm_reader *r)
505 if (r->flags & DFM_ADVANCE)
507 r->flags &= ~DFM_ADVANCE;
509 if (r->eof_cnt == 0 && read_record (r) )
518 if (r->fh != fh_inline_file ())
519 msg (ME, _("Attempt to read beyond end-of-file on file %s."),
520 fh_get_name (r->fh));
522 msg (ME, _("Attempt to read beyond END DATA."));
529 /* Returns the current record in the file corresponding to
530 HANDLE. Aborts if reading from the file is necessary or at
531 end of file, so call dfm_eof() first. */
533 dfm_get_record (struct dfm_reader *r)
535 assert ((r->flags & DFM_ADVANCE) == 0);
536 assert (r->eof_cnt == 0);
538 return ds_substr (&r->line, r->pos, SIZE_MAX);
541 /* Expands tabs in the current line into the equivalent number of
542 spaces, if appropriate for this kind of file. Aborts if
543 reading from the file is necessary or at end of file, so call
546 dfm_expand_tabs (struct dfm_reader *r)
548 size_t ofs, new_pos, tab_width;
550 assert ((r->flags & DFM_ADVANCE) == 0);
551 assert (r->eof_cnt == 0);
553 if (r->flags & DFM_TABS_EXPANDED)
555 r->flags |= DFM_TABS_EXPANDED;
557 if (r->fh != fh_inline_file ()
558 && (fh_get_mode (r->fh) != FH_MODE_TEXT
559 || fh_get_tab_width (r->fh) == 0
560 || ds_find_char (&r->line, '\t') == SIZE_MAX))
563 /* Expand tabs from r->line into r->scratch, and figure out
564 new value for r->pos. */
565 tab_width = fh_get_tab_width (r->fh);
566 ds_clear (&r->scratch);
568 for (ofs = 0; ofs < ds_length (&r->line); ofs++)
573 new_pos = ds_length (&r->scratch);
575 c = ds_data (&r->line)[ofs];
577 ds_put_char (&r->scratch, c);
581 ds_put_char (&r->scratch, ' ');
582 while (ds_length (&r->scratch) % tab_width != 0);
585 if (new_pos == SIZE_MAX)
587 /* Maintain the same relationship between position and line
588 length that we had before. DATA LIST uses a
589 beyond-the-end position to deal with an empty field at
590 the end of the line. */
591 assert (r->pos >= ds_length (&r->line));
592 new_pos = (r->pos - ds_length (&r->line)) + ds_length (&r->scratch);
595 /* Swap r->line and r->scratch and set new r->pos. */
596 ds_swap (&r->line, &r->scratch);
600 /* Returns the legacy character encoding of data read from READER. */
602 dfm_reader_get_legacy_encoding (const struct dfm_reader *reader)
604 return fh_get_legacy_encoding (reader->fh);
607 /* Returns a number between 0 and 100 that approximates the
608 percentage of the data in READER that has already been read,
609 or -1 if this value cannot be estimated.
611 ftello is slow in glibc (it flushes the read buffer), so don't
612 call this function unless you need to. */
614 dfm_get_percent_read (const struct dfm_reader *reader)
616 if (reader->file_size >= 0)
618 off_t position = ftello (reader->file);
621 double p = 100.0 * position / reader->file_size;
622 return p < 0 ? 0 : p > 100 ? 100 : p;
628 /* Causes dfm_get_record() or dfm_get_whole_record() to read in
629 the next record the next time it is executed on file
632 dfm_forward_record (struct dfm_reader *r)
634 r->flags |= DFM_ADVANCE;
637 /* Cancels the effect of any previous dfm_fwd_record() executed
638 on file HANDLE. Sets the current line to begin in the 1-based
641 dfm_reread_record (struct dfm_reader *r, size_t column)
643 r->flags &= ~DFM_ADVANCE;
644 r->pos = MAX (column, 1) - 1;
647 /* Sets the current line to begin COLUMNS characters following
648 the current start. */
650 dfm_forward_columns (struct dfm_reader *r, size_t columns)
652 dfm_reread_record (r, (r->pos + 1) + columns);
655 /* Returns the 1-based column to which the line pointer in HANDLE
656 is set. Unless dfm_reread_record() or dfm_forward_columns()
657 have been called, this is 1. */
659 dfm_column_start (const struct dfm_reader *r)
664 /* Returns the number of columns we are currently beyond the end
665 of the line. At or before end-of-line, this is 0; one column
666 after end-of-line, this is 1; and so on. */
668 dfm_columns_past_end (const struct dfm_reader *r)
670 return r->pos < ds_length (&r->line) ? 0 : ds_length (&r->line) - r->pos;
673 /* Returns the 1-based column within the current line that P
676 dfm_get_column (const struct dfm_reader *r, const char *p)
678 return ds_pointer_to_position (&r->line, p) + 1;
681 /* Pushes the file name and line number on the fn/ln stack. */
683 dfm_push (struct dfm_reader *r)
685 if (r->fh != fh_inline_file ())
686 msg_push_msg_locator (&r->where);
689 /* Pops the file name and line number from the fn/ln stack. */
691 dfm_pop (struct dfm_reader *r)
693 if (r->fh != fh_inline_file ())
694 msg_pop_msg_locator (&r->where);
697 /* BEGIN DATA...END DATA procedure. */
699 /* Perform BEGIN DATA...END DATA as a procedure in itself. */
701 cmd_begin_data (struct lexer *lexer, struct dataset *ds)
703 struct dfm_reader *r;
706 if (!fh_is_locked (fh_inline_file (), FH_ACC_READ))
708 msg (SE, _("This command is not valid here since the current "
709 "input program does not access the inline file."));
710 return CMD_CASCADING_FAILURE;
713 /* Open inline file. */
714 r = dfm_open_reader (fh_inline_file (), lexer);
715 r->flags |= DFM_SAW_BEGIN_DATA;
717 /* Input procedure reads from inline file. */
718 prompt_set_style (PROMPT_DATA);
719 casereader_destroy (proc_open (ds));
720 ok = proc_commit (ds);
721 dfm_close_reader (r);
723 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;