1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007, 2009, 2010, 2011, 2012, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/data-io/data-parser.h"
24 #include "data/casereader-provider.h"
25 #include "data/data-in.h"
26 #include "data/dataset.h"
27 #include "data/dictionary.h"
28 #include "data/format.h"
29 #include "data/file-handle-def.h"
30 #include "data/session.h"
31 #include "data/settings.h"
32 #include "language/data-io/data-reader.h"
33 #include "libpspp/message.h"
34 #include "libpspp/str.h"
35 #include "output/tab.h"
37 #include "gl/xalloc.h"
40 #define _(msgid) gettext (msgid)
42 /* Data parser for textual data like that read by DATA LIST.

   Holds the destination dictionary, the parser type, the list of fields to
   parse, and the settings that apply only to DP_DELIMITED or only to
   DP_FIXED parsing. */
45 const struct dictionary *dict; /* Dictionary of destination. */
46 enum data_parser_type type; /* Type of data to parse. */
47 int skip_records; /* Records to skip before first real data. */
49 struct field *fields; /* Fields to parse. */
50 size_t field_cnt; /* Number of fields. */
51 size_t field_allocated; /* Number of fields that space is allocated for. */
53 /* DP_DELIMITED parsers only. */
54 bool span; /* May cases span multiple records? */
55 bool empty_line_has_field; /* Does an empty line have an (empty) field? */
56 struct substring quotes; /* Characters that can quote separators. */
57 bool quote_escape; /* Doubled quote acts as escape? */
58 struct substring soft_seps; /* Two soft separators act like just one. */
59 struct substring hard_seps; /* Two hard separators yield empty fields. */
60 struct string any_sep; /* Concatenation of soft_seps and hard_seps. */
62 /* DP_FIXED parsers only. */
63 int records_per_case; /* Number of records in each case. */
66 /* How to parse one variable. */
69 struct fmt_spec format; /* Input format of this field. */
70 int case_idx; /* First value in case. */
71 char *name; /* Var name for error messages and tables (owned copy). */
/* The two members below are set to 0 for fields of a DP_DELIMITED parser. */
74 int record; /* Record number (1-based). */
75 int first_column; /* First column in record (1-based). */
78 static void set_any_sep (struct data_parser *parser);
80 /* Creates and returns a new data parser.

   The new parser starts out as a DP_FIXED parser with no fields, no records
   to skip, and zero records per case.  The defaults for delimited parsing
   are: the double and single quote characters as quotes, no quote
   escaping, empty lines not treated as fields, CC_SPACES as the soft
   separators, and a comma as the only hard separator.

   Release the parser with data_parser_destroy. */
82 data_parser_create (const struct dictionary *dict)
84 struct data_parser *parser = xmalloc (sizeof *parser);
86 parser->type = DP_FIXED;
87 parser->skip_records = 0;
89 parser->fields = NULL;
90 parser->field_cnt = 0;
91 parser->field_allocated = 0;
95 parser->empty_line_has_field = false;
96 ss_alloc_substring (&parser->quotes, ss_cstr ("\"'"));
97 parser->quote_escape = false;
98 ss_alloc_substring (&parser->soft_seps, ss_cstr (CC_SPACES));
99 ss_alloc_substring (&parser->hard_seps, ss_cstr (","));
/* any_sep must be initialized before set_any_sep assigns to it. */
100 ds_init_empty (&parser->any_sep);
101 set_any_sep (parser);
103 parser->records_per_case = 0;
108 /* Destroys PARSER: frees the name of every field, the field array, the
   quote and separator strings, and the combined separator string. */
110 data_parser_destroy (struct data_parser *parser)
/* Each field owns a private copy of its name. */
116 for (i = 0; i < parser->field_cnt; i++)
117 free (parser->fields[i].name);
118 free (parser->fields);
119 ss_dealloc (&parser->quotes);
120 ss_dealloc (&parser->soft_seps);
121 ss_dealloc (&parser->hard_seps);
122 ds_destroy (&parser->any_sep);
127 /* Returns the type of PARSER (either DP_DELIMITED or DP_FIXED).
   A newly created parser is DP_FIXED. */
128 enum data_parser_type
129 data_parser_get_type (const struct data_parser *parser)
134 /* Sets the type of PARSER to TYPE (either DP_DELIMITED or
135 DP_FIXED).

   The type may be changed only while PARSER has no fields. */
137 data_parser_set_type (struct data_parser *parser, enum data_parser_type type)
139 assert (parser->field_cnt == 0);
140 assert (type == DP_FIXED || type == DP_DELIMITED);
144 /* Configures PARSER to skip the specified number of
145 INITIAL_RECORDS_TO_SKIP before parsing any data. By default,
146 no records are skipped.

   INITIAL_RECORDS_TO_SKIP must be nonnegative. */
148 data_parser_set_skip (struct data_parser *parser, int initial_records_to_skip)
150 assert (initial_records_to_skip >= 0);
151 parser->skip_records = initial_records_to_skip;
154 /* Returns true if PARSER is configured to allow cases to span
155 multiple records. */
157 data_parser_get_span (const struct data_parser *parser)
162 /* If MAY_CASES_SPAN_RECORDS is true, configures PARSER to allow
163 a single case to span multiple records and multiple cases to
164 occupy a single record. If MAY_CASES_SPAN_RECORDS is false,
165 configures PARSER to require each record to contain exactly
166 one case.

168 This setting affects parsing of DP_DELIMITED files only. */
170 data_parser_set_span (struct data_parser *parser, bool may_cases_span_records)
172 parser->span = may_cases_span_records;
175 /* If EMPTY_LINE_HAS_FIELD is true, configures PARSER to parse an
176 empty line as an empty field and to treat a hard delimiter
177 followed by end-of-line as an empty field. If
178 EMPTY_LINE_HAS_FIELD is false, PARSER will skip empty lines
179 and hard delimiters at the end of lines without emitting empty
180 fields.  A newly created parser has this setting off.

182 This setting affects parsing of DP_DELIMITED files only. */
184 data_parser_set_empty_line_has_field (struct data_parser *parser,
185 bool empty_line_has_field)
187 parser->empty_line_has_field = empty_line_has_field;
190 /* Sets the characters that may be used for quoting field
191 contents to QUOTES. If QUOTES is empty, quoting will be
192 disabled.

194 The caller retains ownership of QUOTES.

196 This setting affects parsing of DP_DELIMITED files only. */
198 data_parser_set_quotes (struct data_parser *parser, struct substring quotes)
/* Release the previous quote set before storing the new one. */
200 ss_dealloc (&parser->quotes);
201 ss_alloc_substring (&parser->quotes, quotes);
204 /* If ESCAPE is false (the default setting), a character used for
205 quoting cannot itself be embedded within a quoted field. If
206 ESCAPE is true, then a quote character can be embedded within
207 a quoted field by doubling it.  For example, with ' as the quote
   character and ESCAPE true, the input 'it''s' yields the field it's.

209 This setting affects parsing of DP_DELIMITED files only, and
210 only when at least one quote character has been set (with
211 data_parser_set_quotes). */
213 data_parser_set_quote_escape (struct data_parser *parser, bool escape)
215 parser->quote_escape = escape;
218 /* Sets PARSER's soft delimiters to DELIMITERS. Soft delimiters
219 separate fields, but consecutive soft delimiters do not yield
220 empty fields. (Ordinarily, only white space characters are
221 appropriate soft delimiters.)

223 The caller retains ownership of DELIMITERS.

225 This setting affects parsing of DP_DELIMITED files only. */
227 data_parser_set_soft_delimiters (struct data_parser *parser,
228 struct substring delimiters)
230 ss_dealloc (&parser->soft_seps);
231 ss_alloc_substring (&parser->soft_seps, delimiters);
/* Keep the combined separator set in step with the new soft delimiters. */
232 set_any_sep (parser);
235 /* Sets PARSER's hard delimiters to DELIMITERS. Hard delimiters
236 separate fields. A consecutive pair of hard delimiters yields
237 an empty field.

239 The caller retains ownership of DELIMITERS.

241 This setting affects parsing of DP_DELIMITED files only. */
243 data_parser_set_hard_delimiters (struct data_parser *parser,
244 struct substring delimiters)
246 ss_dealloc (&parser->hard_seps);
247 ss_alloc_substring (&parser->hard_seps, delimiters);
/* Keep the combined separator set in step with the new hard delimiters. */
248 set_any_sep (parser);
251 /* Returns the number of records per case.  This is 0 for a newly created
   parser and grows as data_parser_set_records or
   data_parser_add_fixed_field raises it. */
253 data_parser_get_records (const struct data_parser *parser)
255 return parser->records_per_case;
258 /* Sets the number of records per case to RECORDS_PER_CASE, which must be
   nonnegative and no smaller than the current number of records per case
   (the count can only stay the same or grow).

260 This setting affects parsing of DP_FIXED files only. */
262 data_parser_set_records (struct data_parser *parser, int records_per_case)
264 assert (records_per_case >= 0);
265 assert (records_per_case >= parser->records_per_case);
266 parser->records_per_case = records_per_case;
/* Appends a field to P with the given FORMAT, CASE_IDX, NAME, RECORD, and
   FIRST_COLUMN, growing the field array first when it is full.  FORMAT is
   copied by value and NAME is duplicated, so the caller keeps ownership of
   both. */
270 add_field (struct data_parser *p, const struct fmt_spec *format, int case_idx,
271 const char *name, int record, int first_column)
275 if (p->field_cnt == p->field_allocated)
276 p->fields = x2nrealloc (p->fields, &p->field_allocated, sizeof *p->fields);
277 field = &p->fields[p->field_cnt++];
278 field->format = *format;
279 field->case_idx = case_idx;
280 field->name = xstrdup (name);
281 field->record = record;
282 field->first_column = first_column;
285 /* Adds a delimited field to the fields parsed by PARSER, which
286 must be configured as a DP_DELIMITED parser. The field is
287 parsed as input format FORMAT. Its data will be stored into case
288 index CASE_IDX. Errors in input data will be reported
289 against variable NAME. */
291 data_parser_add_delimited_field (struct data_parser *parser,
292 const struct fmt_spec *format, int case_idx,
295 assert (parser->type == DP_DELIMITED);
/* A delimited field has no fixed position, so its record and first column
   are recorded as 0. */
296 add_field (parser, format, case_idx, name, 0, 0);
299 /* Adds a fixed field to the fields parsed by PARSER, which
300 must be configured as a DP_FIXED parser. The field is
301 parsed as input format FORMAT. Its data will be stored into case
302 index CASE_IDX. Errors in input data will be reported
303 against variable NAME. The field will be drawn from the
304 FORMAT->w columns in 1-based RECORD starting at 1-based
305 FIRST_COLUMN.

307 RECORD must be at least as great as that of any field already
308 added; that is, fields must be added in increasing order of
309 record number. If RECORD is greater than the current number
310 of records per case, the number of records per case is
311 increased as needed. */
313 data_parser_add_fixed_field (struct data_parser *parser,
314 const struct fmt_spec *format, int case_idx,
316 int record, int first_column)
318 assert (parser->type == DP_FIXED);
/* Only the most recently added field needs checking, because every earlier
   addition passed the same test. */
319 assert (parser->field_cnt == 0
320 || record >= parser->fields[parser->field_cnt - 1].record);
321 if (record > parser->records_per_case)
322 parser->records_per_case = record;
323 add_field (parser, format, case_idx, name, record, first_column);
326 /* Returns true if any fields have been added to PARSER, false
327 otherwise. */
329 data_parser_any_fields (const struct data_parser *parser)
331 return parser->field_cnt > 0;
/* Rebuilds PARSER's any_sep as the soft separators followed by the hard
   separators.  It is called whenever either separator set changes. */
335 set_any_sep (struct data_parser *parser)
337 ds_assign_substring (&parser->any_sep, parser->soft_seps);
338 ds_put_substring (&parser->any_sep, parser->hard_seps);
341 static bool parse_delimited_span (const struct data_parser *,
342 struct dfm_reader *, struct ccase *);
343 static bool parse_delimited_no_span (const struct data_parser *,
344 struct dfm_reader *, struct ccase *);
345 static bool parse_fixed (const struct data_parser *,
346 struct dfm_reader *, struct ccase *);
348 /* Reads a case from READER into C, parsing it with PARSER. Returns
349 true if successful, false at end of file or on I/O error.

351 Case C must not be shared, and PARSER must have at least one field. */
353 data_parser_parse (struct data_parser *parser, struct dfm_reader *reader,
358 assert (!case_is_shared (c));
359 assert (data_parser_any_fields (parser));
361 /* Skip the requested number of records before reading the
362 first case.  skip_records is counted down to zero here, so later
   calls do not skip again. */
363 for (; parser->skip_records > 0; parser->skip_records--)
365 if (dfm_eof (reader))
367 dfm_forward_record (reader);
/* Hand the case to the routine that matches the parser's configuration. */
371 if (parser->type == DP_DELIMITED)
374 retval = parse_delimited_span (parser, reader, c);
376 retval = parse_delimited_no_span (parser, reader, c);
379 retval = parse_fixed (parser, reader, c);
384 /* Extracts a delimited field from the current position in the
385 current record according to PARSER, reading data from READER.

387 *FIELD is set to the field content. The caller must not modify or
388 destroy this constant string.  When a quoted field contains doubled
   quote characters and quote escaping is enabled, the unescaped content is
   assembled in TMP and *FIELD refers to TMP's contents; TMP is caller-provided
   scratch storage.

   *FIRST_COLUMN is set to the column at which the field starts and
   *LAST_COLUMN to the column just past the text consumed for the field.

390 After parsing the field, sets the current position in the
391 record to just past the field and any trailing delimiter.
392 Returns 0 on failure.
   NOTE(review): earlier wording said the return value on success is the
   1-based starting column, but the callers in this file only test the result
   for zero and take the column from *FIRST_COLUMN -- confirm. */
395 cut_field (const struct data_parser *parser, struct dfm_reader *reader,
396 int *first_column, int *last_column, struct string *tmp,
397 struct substring *field)
399 size_t length_before_separators;
400 struct substring line, p;
403 if (dfm_eof (reader))
405 if (ss_is_empty (parser->hard_seps))
406 dfm_expand_tabs (reader);
/* LINE keeps the whole remaining record so that the number of bytes
   consumed can later be computed as the length difference from P. */
407 line = p = dfm_get_record (reader);
409 /* Skip leading soft separators. */
410 ss_ltrim (&p, parser->soft_seps);
412 /* Handle empty or completely consumed lines. */
415 if (!parser->empty_line_has_field || dfm_columns_past_end (reader) > 0)
420 *first_column = dfm_column_start (reader);
421 *last_column = *first_column + 1;
422 dfm_forward_columns (reader, 1);
427 *first_column = dfm_column_start (reader);
/* The field is quoted if its first byte is one of the quote characters. */
428 quoted = ss_find_byte (parser->quotes, ss_first (p)) != SIZE_MAX;
432 int quote = ss_get_byte (&p);
433 if (!ss_get_until (&p, quote, field))
434 msg (DW, _("Quoted string extends beyond end of line."));
435 if (parser->quote_escape && ss_first (p) == quote)
/* Doubled quotes: rebuild the unescaped content in TMP, one quoted
   segment at a time. */
437 ds_assign_substring (tmp, *field);
438 while (ss_match_byte (&p, quote))
441 ds_put_byte (tmp, quote);
442 if (!ss_get_until (&p, quote, &ss))
443 msg (DW, _("Quoted string extends beyond end of line."));
444 ds_put_substring (tmp, ss);
446 *field = ds_ss (tmp);
448 *last_column = *first_column + (ss_length (line) - ss_length (p));
/* Unquoted field: it runs up to the next separator of either kind. */
453 ss_get_bytes (&p, ss_cspan (p, ds_ss (&parser->any_sep)), field);
454 *last_column = *first_column + ss_length (*field);
457 /* Skip trailing soft separator and a single hard separator if present. */
458 length_before_separators = ss_length (p);
459 ss_ltrim (&p, parser->soft_seps);
461 && ss_find_byte (parser->hard_seps, ss_first (p)) != SIZE_MAX)
464 ss_ltrim (&p, parser->soft_seps);
467 dfm_forward_columns (reader, 1);
468 else if (quoted && length_before_separators == ss_length (p))
469 msg (DW, _("Missing delimiter following quoted string."));
470 dfm_forward_columns (reader, ss_length (line) - ss_length (p));
/* Builds a data-category warning message saying that the data for FIELD is
   not valid for its format.  The message is located at READER's file name
   and current line number and at columns FIRST_COLUMN through LAST_COLUMN.
   ERROR is the explanation appended to the message text. */
476 parse_error (const struct dfm_reader *reader, const struct field *field,
477 int first_column, int last_column, char *error)
481 m.category = MSG_C_DATA;
482 m.severity = MSG_S_WARNING;
483 m.file_name = CONST_CAST (char *, dfm_get_file_name (reader));
484 m.first_line = dfm_get_line_number (reader);
/* NOTE(review): presumably last_line is an exclusive bound, which would make
   this a one-line range -- confirm against the definition of struct msg. */
485 m.last_line = m.first_line + 1;
486 m.first_column = first_column;
487 m.last_column = last_column;
488 m.text = xasprintf (_("Data for variable %s is not valid as format %s: %s"),
489 field->name, fmt_name (field->format.type), error);
495 /* Reads a case from READER into C, parsing it according to
496 fixed-format syntax rules in PARSER.
497 Returns true if successful, false at end of file or on I/O error.

   The records of the case are read one after another.  From each record, the
   fields whose record number matches are taken from their fixed column
   ranges.  If the input ends partway through a case, a warning reports the
   partial case as discarded. */
499 parse_fixed (const struct data_parser *parser, struct dfm_reader *reader,
502 const char *input_encoding = dfm_reader_get_encoding (reader);
503 const char *output_encoding = dict_get_encoding (parser->dict);
507 if (dfm_eof (reader))
511 for (row = 1; row <= parser->records_per_case; row++)
513 struct substring line;
515 if (dfm_eof (reader))
517 msg (DW, _("Partial case of %d of %d records discarded."),
518 row - 1, parser->records_per_case);
521 dfm_expand_tabs (reader);
522 line = dfm_get_record (reader);
/* Fields are added in increasing record order, so the fields for this
   record are the next consecutive ones; the loop has no initializer because
   F carries over from the previous record. */
524 for (; f < &parser->fields[parser->field_cnt] && f->record == row; f++)
526 struct substring s = ss_substr (line, f->first_column - 1,
528 union value *value = case_data_rw_idx (c, f->case_idx);
529 char *error = data_in (s, input_encoding, f->format.type,
530 value, fmt_var_width (&f->format),
534 data_in_imply_decimals (s, input_encoding, f->format.type,
537 parse_error (reader, f, f->first_column,
538 f->first_column + f->format.w, error);
541 dfm_forward_record (reader);
547 /* Reads a case from READER into C, parsing it according to
548 free-format syntax rules in PARSER.
549 Returns true if successful, false at end of file or on I/O error.

   One field is cut from the input for each of PARSER's fields, moving on to
   the next record whenever the current one yields no further field, so a
   case may extend over several records.  If the input ends after at least
   one field of a case has been read, a warning names the first variable
   that is missing. */
551 parse_delimited_span (const struct data_parser *parser,
552 struct dfm_reader *reader, struct ccase *c)
554 const char *output_encoding = dict_get_encoding (parser->dict);
/* Scratch string handed to cut_field. */
555 struct string tmp = DS_EMPTY_INITIALIZER;
558 for (f = parser->fields; f < &parser->fields[parser->field_cnt]; f++)
561 int first_column, last_column;
564 /* Cut out a field and read in a new record if necessary. */
565 while (!cut_field (parser, reader,
566 &first_column, &last_column, &tmp, &s))
568 if (!dfm_eof (reader))
569 dfm_forward_record (reader);
570 if (dfm_eof (reader))
/* Warn only if the case was already partly read. */
572 if (f > parser->fields)
573 msg (DW, _("Partial case discarded. The first variable "
574 "missing was %s."), f->name);
580 const char *input_encoding = dfm_reader_get_encoding (reader);
581 error = data_in (s, input_encoding, f->format.type,
582 case_data_rw_idx (c, f->case_idx),
583 fmt_var_width (&f->format), output_encoding);
585 parse_error (reader, f, first_column, last_column, error);
591 /* Reads a case from READER into C, parsing it according to
592 delimited syntax rules with one case per record in PARSER.
593 Returns true if successful, false at end of file or on I/O error.

   If the record runs out of fields early, the value for the field that could
   not be read is set to missing, and a warning may be issued depending on
   settings_get_undefined.  If non-separator text remains after the fields
   have been read, a warning reports that the record ends in data not part of
   any field. */
595 parse_delimited_no_span (const struct data_parser *parser,
596 struct dfm_reader *reader, struct ccase *c)
598 const char *output_encoding = dict_get_encoding (parser->dict);
/* Scratch string handed to cut_field. */
599 struct string tmp = DS_EMPTY_INITIALIZER;
601 struct field *f, *end;
603 if (dfm_eof (reader))
606 end = &parser->fields[parser->field_cnt];
607 for (f = parser->fields; f < end; f++)
609 int first_column, last_column;
612 if (!cut_field (parser, reader, &first_column, &last_column, &tmp, &s))
/* The warning is not issued when only the last field is missing. */
614 if (f < end - 1 && settings_get_undefined ())
615 msg (DW, _("Missing value(s) for all variables from %s onward. "
616 "These will be filled with the system-missing value "
617 "or blanks, as appropriate."),
620 value_set_missing (case_data_rw_idx (c, f->case_idx),
621 fmt_var_width (&f->format));
625 const char *input_encoding = dfm_reader_get_encoding (reader);
626 error = data_in (s, input_encoding, f->format.type,
627 case_data_rw_idx (c, f->case_idx),
628 fmt_var_width (&f->format), output_encoding);
630 parse_error (reader, f, first_column, last_column, error);
/* Anything left in the record other than soft separators is surplus data. */
633 s = dfm_get_record (reader);
634 ss_ltrim (&s, parser->soft_seps);
635 if (!ss_is_empty (s))
636 msg (DW, _("Record ends in data not part of any field."));
639 dfm_forward_record (reader);
644 /* Displays a table giving information on fixed-format variable
645 parsing on DATA LIST.

   The table has one heading row plus one row per field of PARSER, with the
   columns Variable, Record, Columns, and Format.  Its title gives the
   number of records per case and the name of FH. */
647 dump_fixed_table (const struct data_parser *parser,
648 const struct file_handle *fh)
653 t = tab_create (4, parser->field_cnt + 1);
654 tab_headers (t, 0, 0, 1, 0);
655 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Variable"));
656 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Record"));
657 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Columns"));
658 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Format"));
659 tab_box (t, TAL_1, TAL_1, TAL_0, TAL_1, 0, 0, 3, parser->field_cnt);
660 tab_hline (t, TAL_2, 0, 3, 1);
662 for (i = 0; i < parser->field_cnt; i++)
664 struct field *f = &parser->fields[i];
665 char fmt_string[FMT_STRING_LEN_MAX + 1];
668 tab_text (t, 0, row, TAB_LEFT, f->name);
669 tab_text_format (t, 1, row, 0, "%d", f->record);
/* The column range is shown inclusively: first column through the last
   column that the field's width covers. */
670 tab_text_format (t, 2, row, 0, "%3d-%3d",
671 f->first_column, f->first_column + f->format.w - 1);
672 tab_text (t, 3, row, TAB_LEFT | TAB_FIX,
673 fmt_to_string (&f->format, fmt_string));
/* ngettext selects the singular or plural wording from the record count. */
676 tab_title (t, ngettext ("Reading %d record from %s.",
677 "Reading %d records from %s.",
678 parser->records_per_case),
679 parser->records_per_case, fh_get_name (fh));
683 /* Displays a table giving information on free-format variable parsing
684 on DATA LIST.

   The table has one heading row plus one row per field of PARSER, with the
   columns Variable and Format.  Its title names FH. */
686 dump_delimited_table (const struct data_parser *parser,
687 const struct file_handle *fh)
692 t = tab_create (2, parser->field_cnt + 1);
693 tab_headers (t, 0, 0, 1, 0);
694 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Variable"));
695 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Format"));
696 tab_box (t, TAL_1, TAL_1, TAL_0, TAL_1, 0, 0, 1, parser->field_cnt);
697 tab_hline (t, TAL_2, 0, 1, 1);
699 for (i = 0; i < parser->field_cnt; i++)
701 struct field *f = &parser->fields[i];
702 char str[FMT_STRING_LEN_MAX + 1];
705 tab_text (t, 0, row, TAB_LEFT, f->name);
706 tab_text (t, 1, row, TAB_LEFT | TAB_FIX,
707 fmt_to_string (&f->format, str));
710 tab_title (t, _("Reading free-form data from %s."), fh_get_name (fh));
715 /* Displays a table giving information on how PARSER will read
716 data from FH.  A DP_FIXED parser gets the fixed-format table; any other
   parser gets the delimited-format table. */
718 data_parser_output_description (struct data_parser *parser,
719 const struct file_handle *fh)
721 if (parser->type == DP_FIXED)
722 dump_fixed_table (parser, fh);
724 dump_delimited_table (parser, fh);
727 /* Data parser input program: the state of a casereader that produces cases
   by parsing a data file.  The parser, reader, and prototype reference held
   here are all released when the casereader is destroyed. */
728 struct data_parser_casereader
730 struct data_parser *parser; /* Parser. */
731 struct dfm_reader *reader; /* Data file reader. */
732 struct caseproto *proto; /* Format of cases. */
735 static const struct casereader_class data_parser_casereader_class;
737 /* Creates a new unnamed dataset in SESSION. The new dataset has an input
738 program that reads data from READER according to the rules in PARSER, using
739 DICT as the underlying dictionary. Transfers ownership of PARSER and READER
740 to the input program, and ownership of DICT to the dataset.

   The new dataset becomes SESSION's active dataset. */
742 data_parser_make_active_file (struct data_parser *parser,
743 struct session *session,
744 struct dfm_reader *reader,
745 struct dictionary *dict)
747 struct data_parser_casereader *r;
748 struct casereader *casereader;
750 r = xmalloc (sizeof *r);
/* Take a reference to DICT's case prototype for the casereader's use. */
753 r->proto = caseproto_ref (dict_get_proto (dict));
754 casereader = casereader_create_sequential (NULL, r->proto,
756 &data_parser_casereader_class, r);
758 struct dataset *ds = dataset_create (session, "");
759 dataset_set_dict (ds, dict);
760 dataset_set_source (ds, casereader);
761 session_set_active_dataset (session, ds);
/* Casereader "read" callback.  R_ is the struct data_parser_casereader.
   Creates a fresh case from the stored prototype and asks the parser to fill
   it from the data file reader. */
764 static struct ccase *
765 data_parser_casereader_read (struct casereader *reader UNUSED, void *r_)
767 struct data_parser_casereader *r = r_;
768 struct ccase *c = case_create (r->proto);
769 if (data_parser_parse (r->parser, r->reader, c))
/* Casereader "destroy" callback.  R_ is the struct data_parser_casereader.
   Propagates any data file reader error to the casereader, then destroys
   the parser, closes the data file reader, and drops the case prototype
   reference.
   NOTE(review): READER is marked UNUSED in the parameter list but is passed
   to casereader_force_error below; the annotation looks stale. */
779 data_parser_casereader_destroy (struct casereader *reader UNUSED, void *r_)
781 struct data_parser_casereader *r = r_;
/* Check for an error before the data file reader is closed. */
782 if (dfm_reader_error (r->reader))
783 casereader_force_error (reader);
784 data_parser_destroy (r->parser);
785 dfm_close_reader (r->reader);
786 caseproto_unref (r->proto);
/* Casereader class for the data parser input program, listing its read and
   destroy callbacks. */
790 static const struct casereader_class data_parser_casereader_class =
792 data_parser_casereader_read,
793 data_parser_casereader_destroy,