1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007, 2009, 2010, 2011, 2012, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/data-io/data-parser.h"
24 #include "data/casereader-provider.h"
25 #include "data/data-in.h"
26 #include "data/dataset.h"
27 #include "data/dictionary.h"
28 #include "data/format.h"
29 #include "data/file-handle-def.h"
30 #include "data/settings.h"
31 #include "language/data-io/data-reader.h"
32 #include "libpspp/message.h"
33 #include "libpspp/str.h"
34 #include "output/tab.h"
36 #include "gl/xalloc.h"
/* Shorthand for marking a translatable message string: expands to a call
   to gettext on MSGID. */
39 #define _(msgid) gettext (msgid)
41 /* Data parser for textual data like that read by DATA LIST.

   The members below make up the state of one parser: the dictionary it
   parses for, the fields it extracts, and the options that apply to each
   parser type. */
44 const struct dictionary *dict; /* Dictionary of destination; supplies the output encoding. */
45 enum data_parser_type type; /* Type of data to parse (DP_FIXED or DP_DELIMITED). */
46 int skip_records; /* Records still to skip before first real data. */
48 struct field *fields; /* Fields to parse, in the order they were added. */
49 size_t field_cnt; /* Number of fields. */
50 size_t field_allocated; /* Number of fields that space is allocated for. */
52 /* DP_DELIMITED parsers only. */
53 bool span; /* May cases span multiple records? */
54 bool empty_line_has_field; /* Does an empty line have an (empty) field? */
55 bool warn_missing_fields; /* Warn when a record has too few fields? */
56 struct substring quotes; /* Characters that can quote separators. */
57 bool quote_escape; /* Doubled quote acts as escape? */
58 struct substring soft_seps; /* Two soft separators act like just one. */
59 struct substring hard_seps; /* Two hard separators yield empty fields. */
60 struct string any_sep; /* Concatenation of soft_seps and hard_seps. */
62 /* DP_FIXED parsers only. */
63 int records_per_case; /* Number of records in each case. */
66 /* How to parse one variable.  One of these is stored in the parser's
   field array for every field added to it. */
69 struct fmt_spec format; /* Input format of this field. */
70 int case_idx; /* Index of first value in case. */
71 char *name; /* Var name for error messages and tables (heap-allocated copy). */
/* The remaining members describe the position of a fixed field.
   Delimited fields are added with both of them set to 0. */
74 int record; /* Record number (1-based). */
75 int first_column; /* First column in record (1-based). */
/* Forward declaration: rebuilds PARSER's any_sep from its soft and hard
   separators.  Needed here because data_parser_create calls it before
   its definition. */
78 static void set_any_sep (struct data_parser *parser);
80 /* Creates and returns a new data parser.

   The new parser starts out as a DP_FIXED parser with no fields, no
   records to skip and zero records per case.  The delimited-mode
   defaults set here are: double and single quote as quote characters,
   no quote escaping, the characters in CC_SPACES as soft separators,
   comma as the only hard separator, no field for an empty line, and
   warnings for missing fields.

   NOTE(review): the statements that store DICT and set the default for
   `span' are not among the lines shown here -- confirm against the
   complete file. */
82 data_parser_create (const struct dictionary *dict)
84 struct data_parser *parser = xmalloc (sizeof *parser);
86 parser->type = DP_FIXED;
87 parser->skip_records = 0;
89 parser->fields = NULL;
90 parser->field_cnt = 0;
91 parser->field_allocated = 0;
95 parser->empty_line_has_field = false;
96 parser->warn_missing_fields = true;
/* The parser keeps its own allocated copies of the quote and separator
   sets; data_parser_destroy releases them. */
97 ss_alloc_substring (&parser->quotes, ss_cstr ("\"'"));
98 parser->quote_escape = false;
99 ss_alloc_substring (&parser->soft_seps, ss_cstr (CC_SPACES));
100 ss_alloc_substring (&parser->hard_seps, ss_cstr (","));
/* any_sep has to be initialized before set_any_sep assigns to it. */
101 ds_init_empty (&parser->any_sep);
102 set_any_sep (parser);
104 parser->records_per_case = 0;
109 /* Destroys PARSER, releasing the name of every field, the field array,
   the quote and separator sets, and the combined separator string. */
111 data_parser_destroy (struct data_parser *parser)
/* Each field owns its name: add_field stores a copy made with xstrdup. */
117 for (i = 0; i < parser->field_cnt; i++)
118 free (parser->fields[i].name);
119 free (parser->fields);
120 ss_dealloc (&parser->quotes);
121 ss_dealloc (&parser->soft_seps);
122 ss_dealloc (&parser->hard_seps);
123 ds_destroy (&parser->any_sep);
128 /* Returns the type of PARSER (either DP_DELIMITED or DP_FIXED).
   PARSER is only read. */
129 enum data_parser_type
130 data_parser_get_type (const struct data_parser *parser)
135 /* Sets the type of PARSER to TYPE (either DP_DELIMITED or DP_FIXED). */
/* The type may only be changed while PARSER has no fields, and TYPE must
   be one of the two known parser types; both conditions are asserted. */
138 data_parser_set_type (struct data_parser *parser, enum data_parser_type type)
140 assert (parser->field_cnt == 0);
141 assert (type == DP_FIXED || type == DP_DELIMITED);
145 /* Configures PARSER to skip the specified number of
   INITIAL_RECORDS_TO_SKIP before parsing any data.  By default,
   no records are skipped.

   INITIAL_RECORDS_TO_SKIP must not be negative (asserted).  The count is
   consumed by data_parser_parse, which decrements it as it skips
   records. */
149 data_parser_set_skip (struct data_parser *parser, int initial_records_to_skip)
151 assert (initial_records_to_skip >= 0);
152 parser->skip_records = initial_records_to_skip;
155 /* Returns true if PARSER is configured to allow cases to span multiple records. */
/* Read-only accessor for the setting changed by data_parser_set_span. */
158 data_parser_get_span (const struct data_parser *parser)
163 /* If MAY_CASES_SPAN_RECORDS is true, configures PARSER to allow
   a single case to span multiple records and multiple cases to
   occupy a single record.  If MAY_CASES_SPAN_RECORDS is false,
   configures PARSER to require each record to contain exactly
   one case.

   This setting affects parsing of DP_DELIMITED files only. */
171 data_parser_set_span (struct data_parser *parser, bool may_cases_span_records)
173 parser->span = may_cases_span_records;
176 /* If EMPTY_LINE_HAS_FIELD is true, configures PARSER to parse an
   empty line as an empty field and to treat a hard delimiter
   followed by end-of-line as an empty field.  If
   EMPTY_LINE_HAS_FIELD is false, PARSER will skip empty lines
   and hard delimiters at the end of lines without emitting empty
   fields.

   This setting affects parsing of DP_DELIMITED files only. */
185 data_parser_set_empty_line_has_field (struct data_parser *parser,
186 bool empty_line_has_field)
188 parser->empty_line_has_field = empty_line_has_field;
192 /* If WARN_MISSING_FIELDS is true, configures PARSER to emit a warning
   and cause an error condition when a missing field is encountered.
   If WARN_MISSING_FIELDS is false, PARSER will silently fill such
   fields with the system missing value.

   The flag is consulted by parse_delimited_no_span, where the warning
   additionally depends on settings_get_undefined ().

   This setting affects parsing of DP_DELIMITED files only. */
199 data_parser_set_warn_missing_fields (struct data_parser *parser,
200 bool warn_missing_fields)
202 parser->warn_missing_fields = warn_missing_fields;
206 /* Sets the characters that may be used for quoting field
   contents to QUOTES.  If QUOTES is empty, quoting will be
   disabled.

   The caller retains ownership of QUOTES.

   This setting affects parsing of DP_DELIMITED files only. */
214 data_parser_set_quotes (struct data_parser *parser, struct substring quotes)
/* Release the previous set, then store a private copy of the new one. */
216 ss_dealloc (&parser->quotes);
217 ss_alloc_substring (&parser->quotes, quotes);
220 /* If ESCAPE is false (the default setting), a character used for
   quoting cannot itself be embedded within a quoted field.  If
   ESCAPE is true, then a quote character can be embedded within
   a quoted field by doubling it.

   This setting affects parsing of DP_DELIMITED files only, and
   only when at least one quote character has been set (with
   data_parser_set_quotes). */
229 data_parser_set_quote_escape (struct data_parser *parser, bool escape)
231 parser->quote_escape = escape;
234 /* Sets PARSER's soft delimiters to DELIMITERS.  Soft delimiters
   separate fields, but consecutive soft delimiters do not yield
   empty fields.  (Ordinarily, only white space characters are
   appropriate soft delimiters.)

   The caller retains ownership of DELIMITERS.

   This setting affects parsing of DP_DELIMITED files only. */
243 data_parser_set_soft_delimiters (struct data_parser *parser,
244 struct substring delimiters)
/* Replace the stored copy, then rebuild the combined separator set so
   that it stays in step. */
246 ss_dealloc (&parser->soft_seps);
247 ss_alloc_substring (&parser->soft_seps, delimiters);
248 set_any_sep (parser);
251 /* Sets PARSER's hard delimiters to DELIMITERS.  Hard delimiters
   separate fields.  A consecutive pair of hard delimiters yield
   an empty field.

   The caller retains ownership of DELIMITERS.

   This setting affects parsing of DP_DELIMITED files only. */
259 data_parser_set_hard_delimiters (struct data_parser *parser,
260 struct substring delimiters)
/* Replace the stored copy, then rebuild the combined separator set so
   that it stays in step. */
262 ss_dealloc (&parser->hard_seps);
263 ss_alloc_substring (&parser->hard_seps, delimiters);
264 set_any_sep (parser);
267 /* Returns the number of records per case configured in PARSER.  The
   value is meaningful for DP_FIXED parsers. */
269 data_parser_get_records (const struct data_parser *parser)
271 return parser->records_per_case;
274 /* Sets the number of records per case to RECORDS_PER_CASE.

   RECORDS_PER_CASE must not be negative and must not be less than the
   current number of records per case (both asserted), so the count can
   only grow.

   This setting affects parsing of DP_FIXED files only. */
278 data_parser_set_records (struct data_parser *parser, int records_per_case)
280 assert (records_per_case >= 0);
281 assert (records_per_case >= parser->records_per_case);
282 parser->records_per_case = records_per_case;
/* Appends a field to P's field array, growing the array when it is full.
   FORMAT is copied by value and NAME is duplicated, so the caller keeps
   ownership of both.  CASE_IDX, RECORD and FIRST_COLUMN are stored as
   given. */
286 add_field (struct data_parser *p, const struct fmt_spec *format, int case_idx,
287 const char *name, int record, int first_column)
291 if (p->field_cnt == p->field_allocated)
292 p->fields = x2nrealloc (p->fields, &p->field_allocated, sizeof *p->fields);
/* Claim the next free slot and fill it in. */
293 field = &p->fields[p->field_cnt++];
294 field->format = *format;
295 field->case_idx = case_idx;
296 field->name = xstrdup (name);
297 field->record = record;
298 field->first_column = first_column;
301 /* Adds a delimited field to the fields parsed by PARSER, which
   must be configured as a DP_DELIMITED parser (asserted).  The field is
   parsed as input format FORMAT.  Its data will be stored into case
   index CASE_IDX.  Errors in input data will be reported
   against variable NAME. */
307 data_parser_add_delimited_field (struct data_parser *parser,
308 const struct fmt_spec *format, int case_idx,
311 assert (parser->type == DP_DELIMITED);
/* Delimited fields have no fixed position, so record and first column
   are stored as 0. */
312 add_field (parser, format, case_idx, name, 0, 0);
315 /* Adds a fixed field to the fields parsed by PARSER, which
   must be configured as a DP_FIXED parser (asserted).  The field is
   parsed as input format FORMAT.  Its data will be stored into case
   index CASE_IDX.  Errors in input data will be reported
   against variable NAME.  The field will be drawn from the
   FORMAT->w columns in 1-based RECORD starting at 1-based
   column FIRST_COLUMN.

   RECORD must be at least as great as that of any field already
   added; that is, fields must be added in increasing order of
   record number (asserted).  If RECORD is greater than the current
   number of records per case, the number of records per case is
   increased as needed. */
329 data_parser_add_fixed_field (struct data_parser *parser,
330 const struct fmt_spec *format, int case_idx,
332 int record, int first_column)
334 assert (parser->type == DP_FIXED);
335 assert (parser->field_cnt == 0
336 || record >= parser->fields[parser->field_cnt - 1].record);
/* Grow the case to cover RECORD if necessary. */
337 if (record > parser->records_per_case)
338 parser->records_per_case = record;
339 add_field (parser, format, case_idx, name, record, first_column);
342 /* Returns true if any fields have been added to PARSER, false otherwise. */
/* data_parser_parse asserts this condition before parsing a case. */
345 data_parser_any_fields (const struct data_parser *parser)
347 return parser->field_cnt > 0;
/* Rebuilds PARSER's any_sep as the soft separators followed by the hard
   separators.  Called whenever either separator set changes. */
351 set_any_sep (struct data_parser *parser)
353 ds_assign_substring (&parser->any_sep, parser->soft_seps);
354 ds_put_substring (&parser->any_sep, parser->hard_seps);
/* Forward declarations of the three case parsers that data_parser_parse
   dispatches to.  Each takes the parser, the data file reader and the
   case to fill in. */
357 static bool parse_delimited_span (const struct data_parser *,
358 struct dfm_reader *, struct ccase *);
359 static bool parse_delimited_no_span (const struct data_parser *,
360 struct dfm_reader *, struct ccase *);
361 static bool parse_fixed (const struct data_parser *,
362 struct dfm_reader *, struct ccase *);
364 /* Reads a case from DFM into C, parsing it with PARSER. Returns
365 true if successful, false at end of file or on I/O error.
367 Case C must not be shared. */
369 data_parser_parse (struct data_parser *parser, struct dfm_reader *reader,
374 assert (!case_is_shared (c));
375 assert (data_parser_any_fields (parser));
377 /* Skip the requested number of records before reading the
379 for (; parser->skip_records > 0; parser->skip_records--)
381 if (dfm_eof (reader))
383 dfm_forward_record (reader);
387 if (parser->type == DP_DELIMITED)
390 retval = parse_delimited_span (parser, reader, c);
392 retval = parse_delimited_no_span (parser, reader, c);
395 retval = parse_fixed (parser, reader, c);
400 /* Extracts a delimited field from the current position in the
   current record according to PARSER, reading data from READER.

   *FIELD is set to the field content.  The caller must not modify or
   destroy this constant string.  When doubled quotes are un-escaped,
   the content is assembled in TMP and *FIELD refers to TMP's contents.

   Sets *FIRST_COLUMN to the 1-based column number of the start of
   the extracted field, and *LAST_COLUMN to the end of the extracted
   field.

   Returns true on success, false on failure. */
412 cut_field (const struct data_parser *parser, struct dfm_reader *reader,
413 int *first_column, int *last_column, struct string *tmp,
414 struct substring *field)
416 size_t length_before_separators;
417 struct substring line, p;
/* Nothing can be extracted once READER is at end of file. */
420 if (dfm_eof (reader))
/* Tabs are expanded only when no hard separators are configured. */
422 if (ss_is_empty (parser->hard_seps))
423 dfm_expand_tabs (reader);
/* LINE keeps the record as obtained; P is advanced past whatever gets
   consumed, so their length difference is the number of bytes used. */
424 line = p = dfm_get_record (reader);
426 /* Skip leading soft separators. */
427 ss_ltrim (&p, parser->soft_seps);
429 /* Handle empty or completely consumed lines. */
432 if (!parser->empty_line_has_field || dfm_columns_past_end (reader) > 0)
/* Otherwise the empty line counts as one field, reported as one column
   wide, and READER is moved one column on. */
437 *first_column = dfm_column_start (reader);
438 *last_column = *first_column + 1;
439 dfm_forward_columns (reader, 1);
444 *first_column = dfm_column_start (reader);
/* The field is quoted when its first byte is one of PARSER's quote
   characters. */
445 quoted = ss_find_byte (parser->quotes, ss_first (p)) != SIZE_MAX;
/* Quoted field: take everything up to the matching closing quote,
   warning if the record ends first. */
449 int quote = ss_get_byte (&p);
450 if (!ss_get_until (&p, quote, field))
451 msg (DW, _("Quoted string extends beyond end of line."));
/* With quote escaping on, a quote directly after the closing quote
   continues the field: each one adds a literal quote to TMP, followed
   by the text up to the next quote. */
452 if (parser->quote_escape && ss_first (p) == quote)
454 ds_assign_substring (tmp, *field);
455 while (ss_match_byte (&p, quote))
458 ds_put_byte (tmp, quote);
459 if (!ss_get_until (&p, quote, &ss))
460 msg (DW, _("Quoted string extends beyond end of line."));
461 ds_put_substring (tmp, ss);
463 *field = ds_ss (tmp);
/* The column span covers everything consumed so far, quotes included. */
465 *last_column = *first_column + (ss_length (line) - ss_length (p));
/* Unquoted field: everything up to the next soft or hard separator. */
470 ss_get_bytes (&p, ss_cspan (p, ds_ss (&parser->any_sep)), field);
471 *last_column = *first_column + ss_length (*field);
474 /* Skip trailing soft separator and a single hard separator if present. */
475 length_before_separators = ss_length (p);
476 ss_ltrim (&p, parser->soft_seps);
478 && ss_find_byte (parser->hard_seps, ss_first (p)) != SIZE_MAX)
481 ss_ltrim (&p, parser->soft_seps);
484 dfm_forward_columns (reader, 1);
/* A quoted field with no separator consumed after it draws a warning. */
485 else if (quoted && length_before_separators == ss_length (p))
486 msg (DW, _("Missing delimiter following quoted string."));
/* Advance READER past everything consumed from the record. */
487 dfm_forward_columns (reader, ss_length (line) - ss_length (p));
493 parse_error (const struct dfm_reader *reader, const struct field *field,
494 int first_column, int last_column, char *error)
497 .category = MSG_C_DATA,
498 .severity = MSG_S_WARNING,
499 .file_name = CONST_CAST (char *, dfm_get_file_name (reader)),
500 .first_line = dfm_get_line_number (reader),
501 .last_line = m.first_line + 1,
502 .first_column = first_column,
503 .last_column = last_column,
504 .text = xasprintf (_("Data for variable %s is not valid as format %s: %s"),
505 field->name, fmt_name (field->format.type), error),
512 /* Reads a case from READER into C, parsing it according to
   fixed-format syntax rules in PARSER.
   Returns true if successful, false at end of file or on I/O error.

   Field data is converted from READER's encoding to the encoding of
   PARSER's dictionary. */
516 parse_fixed (const struct data_parser *parser, struct dfm_reader *reader,
519 const char *input_encoding = dfm_reader_get_encoding (reader);
520 const char *output_encoding = dict_get_encoding (parser->dict);
524 if (dfm_eof (reader))
/* A case consists of records_per_case records, numbered from 1. */
528 for (row = 1; row <= parser->records_per_case; row++)
530 struct substring line;
/* Running out of input part-way through a case draws a warning that
   reports how many records of the case had been read. */
532 if (dfm_eof (reader))
534 msg (DW, _("Partial case of %d of %d records discarded."),
535 row - 1, parser->records_per_case);
538 dfm_expand_tabs (reader);
539 line = dfm_get_record (reader);
/* Fields are added in increasing record order, so F only ever moves
   forward: parse every field that belongs to the current record. */
541 for (; f < &parser->fields[parser->field_cnt] && f->record == row; f++)
543 struct substring s = ss_substr (line, f->first_column - 1,
545 union value *value = case_data_rw_idx (c, f->case_idx);
546 char *error = data_in (s, input_encoding, f->format.type,
547 value, fmt_var_width (&f->format),
551 data_in_imply_decimals (s, input_encoding, f->format.type,
/* A conversion error is reported against the field's own columns. */
554 parse_error (reader, f, f->first_column,
555 f->first_column + f->format.w, error);
558 dfm_forward_record (reader);
564 /* Reads a case from READER into C, parsing it according to
   free-format syntax rules in PARSER.
   Returns true if successful, false at end of file or on I/O error.

   Fields are taken one after another wherever they occur, moving on to
   the next record whenever the current one yields no further field. */
568 parse_delimited_span (const struct data_parser *parser,
569 struct dfm_reader *reader, struct ccase *c)
571 const char *output_encoding = dict_get_encoding (parser->dict);
/* Scratch string handed to cut_field for un-escaping quoted fields. */
572 struct string tmp = DS_EMPTY_INITIALIZER;
575 for (f = parser->fields; f < &parser->fields[parser->field_cnt]; f++)
578 int first_column, last_column;
581 /* Cut out a field and read in a new record if necessary. */
582 while (!cut_field (parser, reader,
583 &first_column, &last_column, &tmp, &s))
585 if (!dfm_eof (reader))
586 dfm_forward_record (reader);
/* End of input in the middle of a case: warn only if at least one
   field of this case had already been read. */
587 if (dfm_eof (reader))
589 if (f > parser->fields)
590 msg (DW, _("Partial case discarded. The first variable "
591 "missing was %s."), f->name);
597 const char *input_encoding = dfm_reader_get_encoding (reader);
598 error = data_in (s, input_encoding, f->format.type,
599 case_data_rw_idx (c, f->case_idx),
600 fmt_var_width (&f->format), output_encoding);
602 parse_error (reader, f, first_column, last_column, error);
608 /* Reads a case from READER into C, parsing it according to
   delimited syntax rules with one case per record in PARSER.
   Returns true if successful, false at end of file or on I/O error.

   After the fields have been handled, READER is advanced to the next
   record. */
612 parse_delimited_no_span (const struct data_parser *parser,
613 struct dfm_reader *reader, struct ccase *c)
615 const char *output_encoding = dict_get_encoding (parser->dict);
/* Scratch string handed to cut_field for un-escaping quoted fields. */
616 struct string tmp = DS_EMPTY_INITIALIZER;
618 struct field *f, *end;
620 if (dfm_eof (reader))
623 end = &parser->fields[parser->field_cnt];
624 for (f = parser->fields; f < end; f++)
626 int first_column, last_column;
/* The record ran out of fields.  The warning is issued only when the
   missing field is not the last one, settings_get_undefined () is true
   and PARSER has warnings for missing fields enabled. */
629 if (!cut_field (parser, reader, &first_column, &last_column, &tmp, &s))
631 if (f < end - 1 && settings_get_undefined () && parser->warn_missing_fields)
632 msg (DW, _("Missing value(s) for all variables from %s onward. "
633 "These will be filled with the system-missing value "
634 "or blanks, as appropriate."),
637 value_set_missing (case_data_rw_idx (c, f->case_idx),
638 fmt_var_width (&f->format));
642 const char *input_encoding = dfm_reader_get_encoding (reader);
643 error = data_in (s, input_encoding, f->format.type,
644 case_data_rw_idx (c, f->case_idx),
645 fmt_var_width (&f->format), output_encoding);
647 parse_error (reader, f, first_column, last_column, error);
/* Anything left on the record other than soft separators belongs to no
   field and draws a warning. */
650 s = dfm_get_record (reader);
651 ss_ltrim (&s, parser->soft_seps);
652 if (!ss_is_empty (s))
653 msg (DW, _("Record ends in data not part of any field."));
656 dfm_forward_record (reader);
661 /* Displays a table giving information on fixed-format variable
   parsing on DATA LIST.

   The table has a header row followed by one row per field of PARSER,
   with columns for the variable name, record number, column range and
   input format.  Its title gives the number of records per case and the
   name of file handle FH. */
664 dump_fixed_table (const struct data_parser *parser,
665 const struct file_handle *fh)
/* Four columns; one row per field plus the header row. */
670 t = tab_create (4, parser->field_cnt + 1);
671 tab_headers (t, 0, 0, 1, 0);
672 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Variable"));
673 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Record"));
674 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Columns"));
675 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Format"));
676 tab_box (t, TAL_1, TAL_1, TAL_0, TAL_1, 0, 0, 3, parser->field_cnt);
677 tab_hline (t, TAL_2, 0, 3, 1);
679 for (i = 0; i < parser->field_cnt; i++)
681 struct field *f = &parser->fields[i];
682 char fmt_string[FMT_STRING_LEN_MAX + 1];
685 tab_text (t, 0, row, TAB_LEFT, f->name);
686 tab_text_format (t, 1, row, 0, "%d", f->record);
/* The column range is inclusive: first column through first column plus
   width minus one. */
687 tab_text_format (t, 2, row, 0, "%3d-%3d",
688 f->first_column, f->first_column + f->format.w - 1);
689 tab_text (t, 3, row, TAB_LEFT | TAB_FIX,
690 fmt_to_string (&f->format, fmt_string));
/* ngettext selects the singular or plural title from the record count. */
693 tab_title (t, ngettext ("Reading %d record from %s.",
694 "Reading %d records from %s.",
695 parser->records_per_case),
696 parser->records_per_case, fh_get_name (fh));
700 /* Displays a table giving information on free-format variable parsing on DATA LIST. */
/* The table has a header row followed by one row per field of PARSER,
   with columns for the variable name and input format.  Its title names
   file handle FH. */
703 dump_delimited_table (const struct data_parser *parser,
704 const struct file_handle *fh)
/* Two columns; one row per field plus the header row. */
709 t = tab_create (2, parser->field_cnt + 1);
710 tab_headers (t, 0, 0, 1, 0);
711 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Variable"));
712 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Format"));
713 tab_box (t, TAL_1, TAL_1, TAL_0, TAL_1, 0, 0, 1, parser->field_cnt);
714 tab_hline (t, TAL_2, 0, 1, 1);
716 for (i = 0; i < parser->field_cnt; i++)
718 struct field *f = &parser->fields[i];
719 char str[FMT_STRING_LEN_MAX + 1];
722 tab_text (t, 0, row, TAB_LEFT, f->name);
723 tab_text (t, 1, row, TAB_LEFT | TAB_FIX,
724 fmt_to_string (&f->format, str));
727 tab_title (t, _("Reading free-form data from %s."), fh_get_name (fh));
732 /* Displays a table giving information on how PARSER will read data. */
/* Chooses the table by parser type: the fixed-format table for DP_FIXED
   parsers, the delimited table otherwise.  FH is the file handle named
   in the table's title. */
735 data_parser_output_description (struct data_parser *parser,
736 const struct file_handle *fh)
738 if (parser->type == DP_FIXED)
739 dump_fixed_table (parser, fh);
741 dump_delimited_table (parser, fh);
744 /* Data parser input program.

   Auxiliary data for the casereader created by
   data_parser_make_active_file.  All three members are released by
   data_parser_casereader_destroy. */
745 struct data_parser_casereader
747 struct data_parser *parser; /* Parser. */
748 struct dfm_reader *reader; /* Data file reader. */
749 struct caseproto *proto; /* Format of cases (a reference held by this struct). */
/* Forward declaration; the class is defined after its callbacks, but
   data_parser_make_active_file needs its address first. */
752 static const struct casereader_class data_parser_casereader_class;
754 /* Replaces DS's active dataset by an input program that reads data
   from READER according to the rules in PARSER, using DICT as
   the underlying dictionary.  Ownership of PARSER and READER is
   transferred to the input program, and ownership of DICT is
   transferred to the dataset.

   FUNC, when used, is called with the newly created casereader, DICT and
   UD, and the casereader it returns becomes the dataset's source in
   place of the new one.  NOTE(review): the test that decides whether
   FUNC is called is not among the lines shown here -- presumably FUNC
   may be null; confirm. */
760 data_parser_make_active_file (struct data_parser *parser, struct dataset *ds,
761 struct dfm_reader *reader,
762 struct dictionary *dict,
763 struct casereader* (*func)(struct casereader *,
764 const struct dictionary *,
768 struct data_parser_casereader *r;
769 struct casereader *casereader0;
770 struct casereader *casereader1;
772 r = xmalloc (sizeof *r);
/* R takes its own reference to the dictionary's case prototype; it is
   dropped again in data_parser_casereader_destroy. */
775 r->proto = caseproto_ref (dict_get_proto (dict));
776 casereader0 = casereader_create_sequential (NULL, r->proto,
778 &data_parser_casereader_class, r);
781 casereader1 = func (casereader0, dict, ud);
783 casereader1 = casereader0;
785 dataset_set_dict (ds, dict);
786 dataset_set_source (ds, casereader1);
/* Casereader read callback.  R_ is the struct data_parser_casereader
   supplied when the casereader was created.  A fresh case is created
   from the stored prototype and handed to data_parser_parse to fill
   in. */
790 static struct ccase *
791 data_parser_casereader_read (struct casereader *reader UNUSED, void *r_)
793 struct data_parser_casereader *r = r_;
794 struct ccase *c = case_create (r->proto);
795 if (data_parser_parse (r->parser, r->reader, c))
/* Casereader destroy callback.  R_ is the struct data_parser_casereader
   supplied when the casereader was created.  If the data file reader
   recorded an error, it is forced onto READER first; then the parser,
   the data file reader and the case prototype reference are released.

   NOTE(review): READER is tagged UNUSED although it is passed to
   casereader_force_error below. */
805 data_parser_casereader_destroy (struct casereader *reader UNUSED, void *r_)
807 struct data_parser_casereader *r = r_;
808 if (dfm_reader_error (r->reader))
809 casereader_force_error (reader);
810 data_parser_destroy (r->parser);
811 dfm_close_reader (r->reader);
812 caseproto_unref (r->proto);
/* Casereader class for the data parser input program, listing
   data_parser_casereader_read and data_parser_casereader_destroy as its
   callbacks. */
816 static const struct casereader_class data_parser_casereader_class =
818 data_parser_casereader_read,
819 data_parser_casereader_destroy,