1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007, 2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <language/data-io/data-parser.h>
24 #include <data/casereader-provider.h>
25 #include <data/data-in.h>
26 #include <data/dictionary.h>
27 #include <data/format.h>
28 #include <data/file-handle-def.h>
29 #include <data/procedure.h>
30 #include <data/settings.h>
31 #include <language/data-io/data-reader.h>
32 #include <libpspp/message.h>
33 #include <libpspp/str.h>
34 #include <output/table.h>
39 #define _(msgid) gettext (msgid)
41 /* Data parser for textual data like that read by DATA LIST. */
/* Members of the data parser state.  The functions in this file reach them
   through a `struct data_parser *' (parser->type, parser->fields, ...). */
44 enum data_parser_type type; /* Type of data to parse. */
45 int skip_records; /* Records to skip before first real data. */
46 casenumber max_cases; /* Max number of cases to read (-1: no limit). */
47 int percent_cases; /* Approximate percent of cases to read. */
49 struct field *fields; /* Fields to parse. */
50 size_t field_cnt; /* Number of fields. */
51 size_t field_allocated; /* Number of fields space allocated for. */
53 /* DP_DELIMITED parsers only. */
54 bool span; /* May cases span multiple records? */
55 bool empty_line_has_field; /* Does an empty line have an (empty) field? */
56 struct substring quotes; /* Characters that can quote separators. */
57 bool quote_escape; /* Doubled quote acts as escape? */
58 struct substring soft_seps; /* Two soft separators act like just one. */
59 struct substring hard_seps; /* Two hard separators yield empty fields. */
60 struct string any_sep; /* Concatenation of soft_seps and hard_seps. */
62 /* DP_FIXED parsers only. */
63 int records_per_case; /* Number of records in each case. */
66 /* How to parse one variable. */
/* Members describing one field; elements of the parser's `fields' array. */
69 struct fmt_spec format; /* Input format of this field. */
70 int case_idx; /* First value in case. */
/* NAME is a private copy made with xstrdup in add_field and released in
   data_parser_destroy. */
71 char *name; /* Var name for error messages and tables. */
/* Position of the field within a fixed-format case.
   data_parser_add_delimited_field passes 0 for both of these. */
74 int record; /* Record number (1-based). */
75 int first_column; /* First column in record (1-based). */
/* Rebuilds PARSER's any_sep from its soft and hard separators; called
   whenever either separator set changes. */
78 static void set_any_sep (struct data_parser *parser);
80 /* Creates and returns a new data parser. */
/* Allocates a parser with xmalloc and fills in the defaults assigned below. */
82 data_parser_create (void)
84 struct data_parser *parser = xmalloc (sizeof *parser);
/* General defaults: fixed format, nothing skipped, no case limit (-1),
   all cases (100 percent). */
86 parser->type = DP_FIXED;
87 parser->skip_records = 0;
88 parser->max_cases = -1;
89 parser->percent_cases = 100;
/* No fields yet; add_field grows the array on demand. */
91 parser->fields = NULL;
92 parser->field_cnt = 0;
93 parser->field_allocated = 0;
/* Defaults for delimited parsing: double and single quote characters,
   CC_SPACES as soft separators, comma as the hard separator.
   NOTE(review): no assignment to parser->span is visible in this listing;
   confirm that it is initialized before data_parser_parse reads it. */
96 parser->empty_line_has_field = false;
97 ss_alloc_substring (&parser->quotes, ss_cstr ("\"'"));
98 parser->quote_escape = false;
99 ss_alloc_substring (&parser->soft_seps, ss_cstr (CC_SPACES));
100 ss_alloc_substring (&parser->hard_seps, ss_cstr (","));
/* any_sep must be initialized before set_any_sep assigns to it. */
101 ds_init_empty (&parser->any_sep);
102 set_any_sep (parser);
/* Raised later by data_parser_set_records or data_parser_add_fixed_field. */
104 parser->records_per_case = 0;
109 /* Destroys PARSER. */
/* Releases the storage owned by PARSER that is visible here: each field's
   name, the field array, the three separator/quote substrings and any_sep.
   NOTE(review): the handling of a null PARSER and the release of PARSER
   itself are on lines not shown in this listing; confirm. */
111 data_parser_destroy (struct data_parser *parser)
/* Field names were duplicated with xstrdup in add_field. */
117 for (i = 0; i < parser->field_cnt; i++)
118 free (parser->fields[i].name);
119 free (parser->fields);
120 ss_dealloc (&parser->quotes);
121 ss_dealloc (&parser->soft_seps);
122 ss_dealloc (&parser->hard_seps);
123 ds_destroy (&parser->any_sep);
128 /* Returns the type of PARSER (either DP_DELIMITED or DP_FIXED). */
/* Accessor for PARSER's type.  The body is not shown in this listing;
   presumably it returns parser->type (confirm). */
129 enum data_parser_type
130 data_parser_get_type (const struct data_parser *parser)
135 /* Sets the type of PARSER to TYPE (either DP_DELIMITED or DP_FIXED). */
/* Changes the parser type.  The assertions require that no fields have been
   added yet and that TYPE is DP_FIXED or DP_DELIMITED.  The assignment
   itself is on a line not shown in this listing. */
138 data_parser_set_type (struct data_parser *parser, enum data_parser_type type)
140 assert (parser->field_cnt == 0);
141 assert (type == DP_FIXED || type == DP_DELIMITED);
145 /* Configures PARSER to skip the specified number of
146 INITIAL_RECORDS_TO_SKIP before parsing any data. By default,
147 no records are skipped. */
/* Stores the number of leading records to skip.  A negative count fails the
   assertion.  data_parser_parse counts the stored value down to zero as it
   skips records. */
149 data_parser_set_skip (struct data_parser *parser, int initial_records_to_skip)
151 assert (initial_records_to_skip >= 0);
152 parser->skip_records = initial_records_to_skip;
155 /* Sets the maximum number of cases parsed by PARSER to
156 MAX_CASES. The default is -1, meaning no limit. */
/* Stores the case limit without validation.  data_parser_parse treats -1 as
   "no limit" and decrements the stored value as cases are requested. */
158 data_parser_set_case_limit (struct data_parser *parser, casenumber max_cases)
160 parser->max_cases = max_cases;
163 /* Sets the percentage of cases that PARSER should read from the
164 input file to PERCENT_CASES. By default, all cases are read. */
/* Stores the percentage of the input to read; values outside 0...100 fail
   the assertion.  data_parser_parse compares the stored value against
   dfm_get_percent_read only when it is below 100. */
167 data_parser_set_case_percent (struct data_parser *parser, int percent_cases)
169 assert (percent_cases >= 0 && percent_cases <= 100);
170 parser->percent_cases = percent_cases;
173 /* Returns true if PARSER is configured to allow cases to span multiple records. */
/* Accessor for the span setting.  The body is not shown in this listing;
   presumably it returns parser->span (confirm). */
176 data_parser_get_span (const struct data_parser *parser)
181 /* If MAY_CASES_SPAN_RECORDS is true, configures PARSER to allow
182 a single case to span multiple records and multiple cases to
183 occupy a single record. If MAY_CASES_SPAN_RECORDS is false,
184 configures PARSER to require each record to contain exactly one case.
187 This setting affects parsing of DP_DELIMITED files only. */
/* Stores the span flag; no validation is performed. */
189 data_parser_set_span (struct data_parser *parser, bool may_cases_span_records)
191 parser->span = may_cases_span_records;
194 /* If EMPTY_LINE_HAS_FIELD is true, configures PARSER to parse an
195 empty line as an empty field and to treat a hard delimiter
196 followed by end-of-line as an empty field. If
197 EMPTY_LINE_HAS_FIELD is false, PARSER will skip empty lines
198 and hard delimiters at the end of lines without emitting empty fields.
201 This setting affects parsing of DP_DELIMITED files only. */
/* Stores the flag that cut_field consults when it reaches an empty or fully
   consumed line. */
203 data_parser_set_empty_line_has_field (struct data_parser *parser,
204 bool empty_line_has_field)
206 parser->empty_line_has_field = empty_line_has_field;
209 /* Sets the characters that may be used for quoting field
210 contents to QUOTES. If QUOTES is empty, quoting will be disabled.
213 The caller retains ownership of QUOTES.
215 This setting affects parsing of DP_DELIMITED files only. */
/* Replaces the parser's quote characters: the previous set is released and
   a newly allocated substring built from QUOTES is stored in its place. */
217 data_parser_set_quotes (struct data_parser *parser, struct substring quotes)
219 ss_dealloc (&parser->quotes);
220 ss_alloc_substring (&parser->quotes, quotes);
223 /* If ESCAPE is false (the default setting), a character used for
224 quoting cannot itself be embedded within a quoted field. If
225 ESCAPE is true, then a quote character can be embedded within
226 a quoted field by doubling it.
228 This setting affects parsing of DP_DELIMITED files only, and
229 only when at least one quote character has been set (with
230 data_parser_set_quotes). */
/* Stores the flag that cut_field checks after reading a quoted field to
   decide whether a doubled quote continues the field. */
232 data_parser_set_quote_escape (struct data_parser *parser, bool escape)
234 parser->quote_escape = escape;
237 /* Sets PARSER's soft delimiters to DELIMITERS. Soft delimiters
238 separate fields, but consecutive soft delimiters do not yield
239 empty fields. (Ordinarily, only white space characters are
240 appropriate soft delimiters.)
242 The caller retains ownership of DELIMITERS.
244 This setting affects parsing of DP_DELIMITED files only. */
/* Replaces the soft separator set with a newly allocated substring built
   from DELIMITERS, then rebuilds any_sep so that it reflects the change. */
246 data_parser_set_soft_delimiters (struct data_parser *parser,
247 struct substring delimiters)
249 ss_dealloc (&parser->soft_seps);
250 ss_alloc_substring (&parser->soft_seps, delimiters);
251 set_any_sep (parser);
254 /* Sets PARSER's hard delimiters to DELIMITERS. Hard delimiters
255 separate fields. A consecutive pair of hard delimiters yields an empty field.
258 The caller retains ownership of DELIMITERS.
260 This setting affects parsing of DP_DELIMITED files only. */
/* Replaces the hard separator set with a newly allocated substring built
   from DELIMITERS, then rebuilds any_sep so that it reflects the change. */
262 data_parser_set_hard_delimiters (struct data_parser *parser,
263 struct substring delimiters)
265 ss_dealloc (&parser->hard_seps);
266 ss_alloc_substring (&parser->hard_seps, delimiters);
267 set_any_sep (parser);
270 /* Returns the number of records per case. */
/* Returns the stored records_per_case; it is 0 until raised by
   data_parser_set_records or data_parser_add_fixed_field. */
272 data_parser_get_records (const struct data_parser *parser)
274 return parser->records_per_case;
277 /* Sets the number of records per case to RECORDS_PER_CASE.
279 This setting affects parsing of DP_FIXED files only. */
/* Stores the number of records per case.  The assertions require a
   non-negative count that is not lower than the current one, so the value
   can only grow. */
281 data_parser_set_records (struct data_parser *parser, int records_per_case)
283 assert (records_per_case >= 0);
284 assert (records_per_case >= parser->records_per_case);
285 parser->records_per_case = records_per_case;
/* Appends one field to P's field array.  Shared by
   data_parser_add_delimited_field and data_parser_add_fixed_field.
   FORMAT is copied by value and NAME is duplicated, so the caller keeps
   ownership of both. */
289 add_field (struct data_parser *p, const struct fmt_spec *format, int case_idx,
290 const char *name, int record, int first_column)
/* Grow the array through x2nrealloc when every allocated slot is in use. */
294 if (p->field_cnt == p->field_allocated)
295 p->fields = x2nrealloc (p->fields, &p->field_allocated, sizeof *p->fields);
296 field = &p->fields[p->field_cnt++];
297 field->format = *format;
298 field->case_idx = case_idx;
/* The duplicate is released in data_parser_destroy. */
299 field->name = xstrdup (name);
300 field->record = record;
301 field->first_column = first_column;
304 /* Adds a delimited field to the fields parsed by PARSER, which
305 must be configured as a DP_DELIMITED parser. The field is
306 parsed as input format FORMAT. Its data will be stored into case
307 index CASE_IDX. Errors in input data will be reported
308 against variable NAME. */
/* Appends a field to a DP_DELIMITED parser (asserted).  Part of the
   parameter list is on a line not shown in this listing.  Record and first
   column are passed as 0. */
310 data_parser_add_delimited_field (struct data_parser *parser,
311 const struct fmt_spec *format, int case_idx,
314 assert (parser->type == DP_DELIMITED);
315 add_field (parser, format, case_idx, name, 0, 0);
318 /* Adds a fixed field to the field parsed by PARSER, which
319 must be configured as a DP_FIXED parser. The field is
320 parsed as input format FORMAT. Its data will be stored into case
321 index CASE_IDX. Errors in input data will be reported
322 against variable NAME. The field will be drawn from the
323 FORMAT->w columns in 1-based RECORD starting at 1-based column FIRST_COLUMN.
326 RECORD must be at least as great as that of any field already
327 added; that is, fields must be added in increasing order of
328 record number. If RECORD is greater than the current number
329 of records per case, the number of records per case is
330 increased as needed. */
/* Appends a field to a DP_FIXED parser (asserted).  Part of the parameter
   list is on a line not shown in this listing. */
332 data_parser_add_fixed_field (struct data_parser *parser,
333 const struct fmt_spec *format, int case_idx,
335 int record, int first_column)
337 assert (parser->type == DP_FIXED);
/* Fields must arrive in non-decreasing record order; parse_fixed walks the
   field array once, record by record, and depends on this. */
338 assert (parser->field_cnt == 0
339 || record >= parser->fields[parser->field_cnt - 1].record);
/* Extend the case to cover RECORD if it lies beyond the current count. */
340 if (record > parser->records_per_case)
341 parser->records_per_case = record;
342 add_field (parser, format, case_idx, name, record, first_column);
345 /* Returns true if any fields have been added to PARSER, false otherwise. */
/* True once at least one field has been added; data_parser_parse asserts
   this before reading. */
348 data_parser_any_fields (const struct data_parser *parser)
350 return parser->field_cnt > 0;
/* Rebuilds PARSER's any_sep as the soft separators followed by the hard
   separators.  cut_field uses any_sep to find the end of an unquoted
   field. */
354 set_any_sep (struct data_parser *parser)
356 ds_assign_substring (&parser->any_sep, parser->soft_seps);
357 ds_put_substring (&parser->any_sep, parser->hard_seps);
/* Per-format case readers, selected by data_parser_parse according to the
   parser's type and, for delimited data, a condition on a line not shown
   (presumably the span setting). */
360 static bool parse_delimited_span (const struct data_parser *,
361 struct dfm_reader *, struct ccase *);
362 static bool parse_delimited_no_span (const struct data_parser *,
363 struct dfm_reader *, struct ccase *);
364 static bool parse_fixed (const struct data_parser *,
365 struct dfm_reader *, struct ccase *);
367 /* Reads a case from DFM into C, parsing it with PARSER. Returns
368 true if successful, false at end of file or on I/O error.
370 Case C must not be shared. */
/* Reads one case.  Applies the skip count, the case limit and the percent
   limit, then hands off to the reader for the parser's type.  Several lines
   of this function, including its early returns, are not shown in this
   listing. */
372 data_parser_parse (struct data_parser *parser, struct dfm_reader *reader,
377 assert (!case_is_shared (c));
378 assert (data_parser_any_fields (parser));
380 /* Skip the requested number of records before reading the first case. */
/* skip_records is counted down in place, so the skipping happens only until
   the count reaches zero and is not repeated on later calls. */
382 for (; parser->skip_records > 0; parser->skip_records--)
384 if (dfm_eof (reader))
386 dfm_forward_record (reader);
/* Case limit: -1 means unlimited; otherwise the limit is post-decremented
   on each call.
   NOTE(review): when the limit is at 0 the post-decrement leaves it at -1,
   which a later call would read as "no limit"; confirm that callers stop
   after the first unsuccessful return. */
390 if (parser->max_cases != -1 && parser->max_cases-- == 0)
/* Percent limit: consulted only when fewer than 100 percent are wanted. */
392 if (parser->percent_cases < 100
393 && dfm_get_percent_read (reader) >= parser->percent_cases)
/* Dispatch on parser type.  The condition choosing between the two
   delimited readers is on a line not shown in this listing. */
397 if (parser->type == DP_DELIMITED)
400 retval = parse_delimited_span (parser, reader, c);
402 retval = parse_delimited_no_span (parser, reader, c);
405 retval = parse_fixed (parser, reader, c);
411 /* Extracts a delimited field from the current position in the
412 current record according to PARSER, reading data from READER.
414 *FIELD is set to the field content. The caller must not modify or
415 destroy this constant string.
417 After parsing the field, sets the current position in the
418 record to just past the field and any trailing delimiter.
419 Returns 0 on failure or a 1-based column number indicating the
420 beginning of the field on success. */
/* Cuts the next delimited field out of READER's current record.  The field
   text is stored in *FIELD and its column range in *FIRST_COLUMN and
   *LAST_COLUMN.  TMP is scratch space that holds the field text when
   doubled quotes have to be collapsed.  Several lines of this function,
   including its return statements and some conditions, are not shown in
   this listing. */
422 cut_field (const struct data_parser *parser, struct dfm_reader *reader,
423 int *first_column, int *last_column, struct string *tmp,
424 struct substring *field)
/* LINE keeps the unconsumed text as it was on entry; P advances through it.
   The difference in their lengths is the amount consumed. */
426 struct substring line, p;
428 if (dfm_eof (reader))
/* Tabs are expanded only when there are no hard separators. */
430 if (ss_is_empty (parser->hard_seps))
431 dfm_expand_tabs (reader);
432 line = p = dfm_get_record (reader);
434 /* Skip leading soft separators. */
435 ss_ltrim (&p, parser->soft_seps);
437 /* Handle empty or completely consumed lines. */
/* dfm_columns_past_end presumably reports how far an earlier call advanced
   past the end of the record (confirm against the dfm_reader API); the
   visible code relies on it to emit at most one empty field per line. */
440 if (!parser->empty_line_has_field || dfm_columns_past_end (reader) > 0)
445 *first_column = dfm_column_start (reader);
446 *last_column = *first_column + 1;
447 dfm_forward_columns (reader, 1);
452 *first_column = dfm_column_start (reader);
/* A field whose first character is one of the quote characters is read up
   to the matching quote. */
453 if (ss_find_char (parser->quotes, ss_first (p)) != SIZE_MAX)
456 int quote = ss_get_char (&p);
457 if (!ss_get_until (&p, quote, field))
458 msg (SW, _("Quoted string extends beyond end of line."));
/* With quote escaping on, a quote immediately after the closing quote
   continues the field: the pieces are accumulated in TMP, with one quote
   character between them, and *FIELD is pointed at TMP's contents. */
459 if (parser->quote_escape && ss_first (p) == quote)
461 ds_assign_substring (tmp, *field);
462 while (ss_match_char (&p, quote))
465 ds_put_char (tmp, quote);
466 if (!ss_get_until (&p, quote, &ss))
467 msg (SW, _("Quoted string extends beyond end of line."));
468 ds_put_substring (tmp, ss);
470 *field = ds_ss (tmp);
472 *last_column = dfm_column_start (reader);
474 /* Skip trailing soft separator and a single hard separator if present. */
476 ss_ltrim (&p, parser->soft_seps);
478 && ss_find_char (parser->hard_seps, ss_first (p)) != SIZE_MAX)
/* Unquoted field: it extends up to the first soft or hard separator. */
484 ss_get_chars (&p, ss_cspan (p, ds_ss (&parser->any_sep)), field);
485 *last_column = dfm_column_start (reader);
486 if (!ss_ltrim (&p, parser->soft_seps) || ss_is_empty (p))
488 /* Advance past a trailing hard separator,
489 regardless of whether one actually existed. If
490 we "skip" a delimiter that was not actually
491 there, then we will return end-of-line on our
492 next call, which is what we want. */
493 dfm_forward_columns (reader, 1);
/* Advance the reader by the number of characters consumed from the line. */
496 dfm_forward_columns (reader, ss_length (line) - ss_length (p));
501 /* Reads a case from READER into C, parsing it according to
502 fixed-format syntax rules in PARSER.
503 Returns true if successful, false at end of file or on I/O error. */
/* Fixed-format reader: consumes records_per_case records and parses every
   field out of the record it belongs to.  Part of the parameter list, the
   local declarations (including F's initial value) and the return
   statements are on lines not shown in this listing. */
505 parse_fixed (const struct data_parser *parser, struct dfm_reader *reader,
508 enum legacy_encoding encoding = dfm_reader_get_legacy_encoding (reader);
/* End of file before the first record of a case is checked separately from
   end of file in the middle of a case, which draws a warning below. */
512 if (dfm_eof (reader))
516 for (row = 1; row <= parser->records_per_case; row++)
518 struct substring line;
520 if (dfm_eof (reader))
522 msg (SW, _("Partial case of %d of %d records discarded."),
523 row - 1, parser->records_per_case);
526 dfm_expand_tabs (reader);
527 line = dfm_get_record (reader);
/* F is not reset per row: fields are stored in non-decreasing record order
   (asserted in data_parser_add_fixed_field), so a single pass over the
   array pairs each field with its record. */
529 for (; f < &parser->fields[parser->field_cnt] && f->record == row; f++)
530 data_in (ss_substr (line, f->first_column - 1,
532 encoding, f->format.type, f->format.d,
533 f->first_column, f->first_column + f->format.w,
534 case_data_rw_idx (c, f->case_idx),
535 fmt_var_width (&f->format));
537 dfm_forward_record (reader);
543 /* Reads a case from READER into C, parsing it according to
544 free-format syntax rules in PARSER.
545 Returns true if successful, false at end of file or on I/O error. */
/* Delimited reader for cases that may span records: fields are cut one
   after another, moving on to the next record whenever the current one is
   exhausted.  Some declarations and the return statements are on lines not
   shown in this listing. */
547 parse_delimited_span (const struct data_parser *parser,
548 struct dfm_reader *reader, struct ccase *c)
550 enum legacy_encoding encoding = dfm_reader_get_legacy_encoding (reader);
/* Scratch string handed to cut_field for fields with doubled quotes. */
551 struct string tmp = DS_EMPTY_INITIALIZER;
554 for (f = parser->fields; f < &parser->fields[parser->field_cnt]; f++)
557 int first_column, last_column;
559 /* Cut out a field and read in a new record if necessary. */
560 while (!cut_field (parser, reader,
561 &first_column, &last_column, &tmp, &s))
563 if (!dfm_eof (reader))
564 dfm_forward_record (reader);
565 if (dfm_eof (reader))
/* Warn only when the case was partly read; end of file before the first
   field produces no message. */
567 if (f > parser->fields)
568 msg (SW, _("Partial case discarded. The first variable "
569 "missing was %s."), f->name);
575 data_in (s, encoding, f->format.type, 0,
576 first_column, last_column,
577 case_data_rw_idx (c, f->case_idx),
578 fmt_var_width (&f->format));
584 /* Reads a case from READER into C, parsing it according to
585 delimited syntax rules with one case per record in PARSER.
586 Returns true if successful, false at end of file or on I/O error. */
/* Delimited reader for one case per record: fields are cut from the current
   record only.  Some declarations, one message argument and the return
   statements are on lines not shown in this listing. */
588 parse_delimited_no_span (const struct data_parser *parser,
589 struct dfm_reader *reader, struct ccase *c)
591 enum legacy_encoding encoding = dfm_reader_get_legacy_encoding (reader);
/* Scratch string handed to cut_field for fields with doubled quotes. */
592 struct string tmp = DS_EMPTY_INITIALIZER;
596 if (dfm_eof (reader))
599 for (f = parser->fields; f < &parser->fields[parser->field_cnt]; f++)
601 int first_column, last_column;
/* The record ran out of fields: warn if the setting asks for it, then set
   this and every remaining variable to missing. */
602 if (!cut_field (parser, reader, &first_column, &last_column, &tmp, &s))
604 if (settings_get_undefined ())
605 msg (SW, _("Missing value(s) for all variables from %s onward. "
606 "These will be filled with the system-missing value "
607 "or blanks, as appropriate."),
609 for (; f < &parser->fields[parser->field_cnt]; f++)
610 value_set_missing (case_data_rw_idx (c, f->case_idx),
611 fmt_var_width (&f->format));
615 data_in (s, encoding, f->format.type, 0,
616 first_column, last_column,
617 case_data_rw_idx (c, f->case_idx),
618 fmt_var_width (&f->format));
/* Anything besides soft separators left in the record is extra data that
   belongs to no field. */
621 s = dfm_get_record (reader);
622 ss_ltrim (&s, parser->soft_seps);
623 if (!ss_is_empty (s))
624 msg (SW, _("Record ends in data not part of any field."));
627 dfm_forward_record (reader);
632 /* Displays a table giving information on fixed-format variable
633 parsing on DATA LIST. */
/* Builds a four-column table (Variable, Record, Columns, Format) with one
   heading row plus one row per field, titled with the number of records per
   case and the file handle's name.  Local declarations and the final table
   call are on lines not shown in this listing. */
635 dump_fixed_table (const struct data_parser *parser,
636 const struct file_handle *fh)
641 t = tab_create (4, parser->field_cnt + 1, 0);
642 tab_columns (t, TAB_COL_DOWN, 1);
643 tab_headers (t, 0, 0, 1, 0);
644 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Variable"));
645 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Record"));
646 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Columns"));
647 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Format"));
648 tab_box (t, TAL_1, TAL_1, TAL_0, TAL_1, 0, 0, 3, parser->field_cnt);
649 tab_hline (t, TAL_2, 0, 3, 1);
650 tab_dim (t, tab_natural_dimensions);
/* `row' is declared on a line not shown; presumably i + 1, so that row 0
   stays the heading (confirm). */
652 for (i = 0; i < parser->field_cnt; i++)
654 struct field *f = &parser->fields[i];
655 char fmt_string[FMT_STRING_LEN_MAX + 1];
658 tab_text (t, 0, row, TAB_LEFT, f->name);
659 tab_text (t, 1, row, TAT_PRINTF, "%d", f->record);
/* The displayed column range is inclusive: first column through
   first column + width - 1. */
660 tab_text (t, 2, row, TAT_PRINTF, "%3d-%3d",
661 f->first_column, f->first_column + f->format.w - 1);
662 tab_text (t, 3, row, TAB_LEFT | TAB_FIX,
663 fmt_to_string (&f->format, fmt_string));
/* ngettext picks the singular or plural title from records_per_case. */
666 tab_title (t, ngettext ("Reading %d record from %s.",
667 "Reading %d records from %s.",
668 parser->records_per_case),
669 parser->records_per_case, fh_get_name (fh));
673 /* Displays a table giving information on free-format variable parsing on DATA LIST. */
/* Builds a two-column table (Variable, Format) with one heading row plus
   one row per field, titled with the file handle's name.  Local
   declarations and the final table call are on lines not shown in this
   listing. */
676 dump_delimited_table (const struct data_parser *parser,
677 const struct file_handle *fh)
682 t = tab_create (2, parser->field_cnt + 1, 0);
683 tab_columns (t, TAB_COL_DOWN, 1);
684 tab_headers (t, 0, 0, 1, 0);
685 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Variable"));
686 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Format"));
687 tab_box (t, TAL_1, TAL_1, TAL_0, TAL_1, 0, 0, 1, parser->field_cnt);
688 tab_hline (t, TAL_2, 0, 1, 1);
689 tab_dim (t, tab_natural_dimensions);
/* `row' is declared on a line not shown; presumably i + 1, so that row 0
   stays the heading (confirm). */
691 for (i = 0; i < parser->field_cnt; i++)
693 struct field *f = &parser->fields[i];
694 char str[FMT_STRING_LEN_MAX + 1];
697 tab_text (t, 0, row, TAB_LEFT, f->name);
698 tab_text (t, 1, row, TAB_LEFT | TAB_FIX,
699 fmt_to_string (&f->format, str));
702 tab_title (t, _("Reading free-form data from %s."), fh_get_name (fh));
707 /* Displays a table giving information on how PARSER will read data. */
/* Picks the table that matches the parser type: the fixed-format table for
   DP_FIXED, the delimited table otherwise. */
710 data_parser_output_description (struct data_parser *parser,
711 const struct file_handle *fh)
713 if (parser->type == DP_FIXED)
714 dump_fixed_table (parser, fh);
716 dump_delimited_table (parser, fh);
719 /* Data parser input program. */
/* State of the casereader created by data_parser_make_active_file.  The
   parser and the reader are released in data_parser_casereader_destroy. */
720 struct data_parser_casereader
722 struct data_parser *parser; /* Parser. */
723 struct dfm_reader *reader; /* Data file reader. */
724 size_t value_cnt; /* Number of `union value's in case. */
/* Forward declaration; the definition, with the read and destroy callbacks,
   is at the end of the file. */
727 static const struct casereader_class data_parser_casereader_class;
729 /* Replaces DS's active file by an input program that reads data
730 from READER according to the rules in PARSER, using DICT as
731 the underlying dictionary. Ownership of PARSER and READER is
732 transferred to the input program, and ownership of DICT is
733 transferred to the dataset. */
/* Wraps PARSER and READER in a sequential casereader and installs it,
   together with DICT, as DS's active file. */
735 data_parser_make_active_file (struct data_parser *parser, struct dataset *ds,
736 struct dfm_reader *reader,
737 struct dictionary *dict)
739 struct data_parser_casereader *r;
740 struct casereader *casereader;
/* The assignments that store PARSER and READER in R are on lines not shown
   in this listing. */
742 r = xmalloc (sizeof *r);
/* The case width is taken from the dictionary's next value index. */
745 r->value_cnt = dict_get_next_value_idx (dict);
746 casereader = casereader_create_sequential (NULL, r->value_cnt,
748 &data_parser_casereader_class, r);
749 proc_set_active_file (ds, casereader, dict);
/* Casereader read callback: creates a case with value_cnt values and asks
   data_parser_parse to fill it.  The handling of the parse result is on
   lines not shown in this listing. */
752 static struct ccase *
753 data_parser_casereader_read (struct casereader *reader UNUSED, void *r_)
755 struct data_parser_casereader *r = r_;
756 struct ccase *c = case_create (r->value_cnt);
757 if (data_parser_parse (r->parser, r->reader, c))
/* Casereader destroy callback: passes a data file error on to the
   casereader, then releases the parser and closes the data file reader.
   NOTE(review): READER is marked UNUSED but is passed to
   casereader_force_error below.
   NOTE(review): the release of R itself is not visible in this listing;
   confirm. */
767 data_parser_casereader_destroy (struct casereader *reader UNUSED, void *r_)
769 struct data_parser_casereader *r = r_;
770 if (dfm_reader_error (r->reader))
771 casereader_force_error (reader);
772 data_parser_destroy (r->parser);
773 dfm_close_reader (r->reader);
/* Casereader class for the data parser: the read callback followed by the
   destroy callback.  Any further members are on lines not shown in this
   listing. */
777 static const struct casereader_class data_parser_casereader_class =
779 data_parser_casereader_read,
780 data_parser_casereader_destroy,