1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007, 2009, 2010, 2011, 2012, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/data-io/data-parser.h"
24 #include "data/casereader-provider.h"
25 #include "data/data-in.h"
26 #include "data/dataset.h"
27 #include "data/dictionary.h"
28 #include "data/format.h"
29 #include "data/file-handle-def.h"
30 #include "data/settings.h"
31 #include "language/data-io/data-reader.h"
32 #include "libpspp/intern.h"
33 #include "libpspp/message.h"
34 #include "libpspp/str.h"
35 #include "output/pivot-table.h"
37 #include "gl/xalloc.h"
/* Message-marking helpers: N_() expands to its argument unchanged, while
   _() passes its argument through gettext(). */
40 #define N_(msgid) msgid
41 #define _(msgid) gettext (msgid)
43 /* Data parser for textual data like that read by DATA LIST.
   Holds the destination dictionary, the list of fields to extract, and
   the settings specific to each parser type. */
46 struct dictionary *dict; /* Destination dictionary (referenced at creation, unreferenced at destruction). */
47 enum data_parser_type type; /* Type of data to parse (DP_FIXED or DP_DELIMITED). */
48 int skip_records; /* Records still to skip before first real data; counted down by data_parser_parse. */
50 struct field *fields; /* Fields to parse, in the order they were added. */
51 size_t field_cnt; /* Number of fields in use. */
52 size_t field_allocated; /* Number of fields that space is allocated for. */
54 /* DP_DELIMITED parsers only. */
55 bool span; /* May cases span multiple records? */
56 bool empty_line_has_field; /* Does an empty line have an (empty) field? */
57 bool warn_missing_fields; /* Warn about missing fields (see data_parser_set_warn_missing_fields)? */
58 struct substring quotes; /* Characters that can quote separators. */
59 bool quote_escape; /* Doubled quote acts as escape? */
60 struct substring soft_seps; /* Two soft separators act like just one. */
61 struct substring hard_seps; /* Two hard separators yield empty fields. */
62 struct string any_sep; /* Concatenation of soft_seps and hard_seps; rebuilt by set_any_sep. */
64 /* DP_FIXED parsers only. */
65 int records_per_case; /* Number of records in each case. */
68 /* How to parse one variable. */
71 struct fmt_spec format; /* Input format of this field. */
72 int case_idx; /* First value in case. */
73 char *name; /* Var name for error messages and tables (copy made by add_field, freed by data_parser_destroy). */
/* Position of the field.  Fixed fields get these from
   data_parser_add_fixed_field; delimited fields are added with 0 for both. */
76 int record; /* Record number (1-based). */
77 int first_column; /* First column in record (1-based). */
/* Forward declaration; set_any_sep rebuilds PARSER->any_sep from the soft
   and hard separators and is needed by data_parser_create below. */
80 static void set_any_sep (struct data_parser *parser);
82 /* Creates and returns a new data parser whose destination dictionary
   is DICT; the parser keeps the result of dict_ref (DICT).

   The new parser is of type DP_FIXED, skips no records, has no fields,
   and has zero records per case.  The delimited-parsing settings get the
   defaults assigned below. */
84 data_parser_create (struct dictionary *dict)
86 struct data_parser *parser = xmalloc (sizeof *parser);
88 parser->type = DP_FIXED;
89 parser->skip_records = 0;
91 parser->fields = NULL;
92 parser->field_cnt = 0;
93 parser->field_allocated = 0;
94 parser->dict = dict_ref (dict);
/* Defaults for DP_DELIMITED parsing: empty lines carry no field, missing
   fields are warned about, both the double and the single quote character
   quote fields, doubled quotes are not an escape, CC_SPACES are the soft
   separators, and the comma is the only hard separator. */
97 parser->empty_line_has_field = false;
98 parser->warn_missing_fields = true;
99 ss_alloc_substring (&parser->quotes, ss_cstr ("\"'"));
100 parser->quote_escape = false;
101 ss_alloc_substring (&parser->soft_seps, ss_cstr (CC_SPACES));
102 ss_alloc_substring (&parser->hard_seps, ss_cstr (","));
/* any_sep must be initialized before set_any_sep assigns to it. */
103 ds_init_empty (&parser->any_sep);
104 set_any_sep (parser);
/* Default for DP_FIXED parsing; raised as fixed fields are added. */
106 parser->records_per_case = 0;
111 /* Destroys PARSER: drops its dictionary reference and frees every
   field name, the field array, the quote and separator substrings, and
   the combined separator string. */
113 data_parser_destroy (struct data_parser *parser)
119 dict_unref (parser->dict);
/* Each field name is a private copy made by add_field. */
120 for (i = 0; i < parser->field_cnt; i++)
121 free (parser->fields[i].name);
122 free (parser->fields);
123 ss_dealloc (&parser->quotes);
124 ss_dealloc (&parser->soft_seps);
125 ss_dealloc (&parser->hard_seps);
126 ds_destroy (&parser->any_sep);
131 /* Returns the type of data that PARSER is configured to parse
   (either DP_DELIMITED or DP_FIXED).  PARSER is taken as const. */
132 enum data_parser_type
133 data_parser_get_type (const struct data_parser *parser)
138 /* Sets the type of PARSER to TYPE (either DP_DELIMITED or DP_FIXED). */
141 data_parser_set_type (struct data_parser *parser, enum data_parser_type type)
/* The type may only be chosen while PARSER has no fields yet. */
143 assert (parser->field_cnt == 0);
/* Only the two known parser types are accepted. */
144 assert (type == DP_FIXED || type == DP_DELIMITED);
148 /* Configures PARSER to skip the specified number of
149 INITIAL_RECORDS_TO_SKIP before parsing any data. By default,
150 no records are skipped.

   INITIAL_RECORDS_TO_SKIP must not be negative.  The stored count is
   consumed by data_parser_parse, which decrements it as it skips. */
152 data_parser_set_skip (struct data_parser *parser, int initial_records_to_skip)
154 assert (initial_records_to_skip >= 0);
155 parser->skip_records = initial_records_to_skip;
158 /* Returns true if PARSER is configured to allow cases to span multiple records. */
/* Counterpart of data_parser_set_span; PARSER is taken as const. */
161 data_parser_get_span (const struct data_parser *parser)
166 /* If MAY_CASES_SPAN_RECORDS is true, configures PARSER to allow
167 a single case to span multiple records and multiple cases to
168 occupy a single record. If MAY_CASES_SPAN_RECORDS is false,
169 configures PARSER to require each record to contain exactly
   one case.

172 This setting affects parsing of DP_DELIMITED files only. */
174 data_parser_set_span (struct data_parser *parser, bool may_cases_span_records)
176 parser->span = may_cases_span_records;
179 /* If EMPTY_LINE_HAS_FIELD is true, configures PARSER to parse an
180 empty line as an empty field and to treat a hard delimiter
181 followed by end-of-line as an empty field. If
182 EMPTY_LINE_HAS_FIELD is false, PARSER will skip empty lines
183 and hard delimiters at the end of lines without emitting empty
   fields.

186 This setting affects parsing of DP_DELIMITED files only. */
188 data_parser_set_empty_line_has_field (struct data_parser *parser,
189 bool empty_line_has_field)
191 parser->empty_line_has_field = empty_line_has_field;
195 /* If WARN_MISSING_FIELDS is true, configures PARSER to emit a warning
196 and cause an error condition when a missing field is encountered.
197 If WARN_MISSING_FIELDS is false, PARSER will silently fill such
198 fields with the system missing value.

   The flag is consulted by parse_delimited_no_span, which also requires
   settings_get_undefined () to be true before it warns.

200 This setting affects parsing of DP_DELIMITED files only. */
202 data_parser_set_warn_missing_fields (struct data_parser *parser,
203 bool warn_missing_fields)
205 parser->warn_missing_fields = warn_missing_fields;
209 /* Sets the characters that may be used for quoting field
210 contents to QUOTES. If QUOTES is empty, quoting will be
   disabled.

213 The caller retains ownership of QUOTES.

215 This setting affects parsing of DP_DELIMITED files only. */
217 data_parser_set_quotes (struct data_parser *parser, struct substring quotes)
/* Release the previous set, then store a private copy of QUOTES. */
219 ss_dealloc (&parser->quotes);
220 ss_alloc_substring (&parser->quotes, quotes);
223 /* If ESCAPE is false (the default setting), a character used for
224 quoting cannot itself be embedded within a quoted field. If
225 ESCAPE is true, then a quote character can be embedded within
226 a quoted field by doubling it.

228 This setting affects parsing of DP_DELIMITED files only, and
229 only when at least one quote character has been set (with
230 data_parser_set_quotes).

   The flag is read by cut_field when it extracts a quoted field. */
232 data_parser_set_quote_escape (struct data_parser *parser, bool escape)
234 parser->quote_escape = escape;
237 /* Sets PARSER's soft delimiters to DELIMITERS. Soft delimiters
238 separate fields, but consecutive soft delimiters do not yield
239 empty fields. (Ordinarily, only white space characters are
240 appropriate soft delimiters.)

242 The caller retains ownership of DELIMITERS.

244 This setting affects parsing of DP_DELIMITED files only. */
246 data_parser_set_soft_delimiters (struct data_parser *parser,
247 struct substring delimiters)
/* Replace the old set with a private copy of DELIMITERS. */
249 ss_dealloc (&parser->soft_seps);
250 ss_alloc_substring (&parser->soft_seps, delimiters);
/* Keep the combined separator string in step with the new set. */
251 set_any_sep (parser);
254 /* Sets PARSER's hard delimiters to DELIMITERS. Hard delimiters
255 separate fields. A consecutive pair of hard delimiters yield
   an empty field.

258 The caller retains ownership of DELIMITERS.

260 This setting affects parsing of DP_DELIMITED files only. */
262 data_parser_set_hard_delimiters (struct data_parser *parser,
263 struct substring delimiters)
/* Replace the old set with a private copy of DELIMITERS. */
265 ss_dealloc (&parser->hard_seps);
266 ss_alloc_substring (&parser->hard_seps, delimiters);
/* Keep the combined separator string in step with the new set. */
267 set_any_sep (parser);
270 /* Returns the number of records per case configured in PARSER.
   The value starts at 0 and is raised by data_parser_set_records and by
   data_parser_add_fixed_field. */
272 data_parser_get_records (const struct data_parser *parser)
274 return parser->records_per_case;
277 /* Sets the number of records per case to RECORDS_PER_CASE.
   RECORDS_PER_CASE must not be negative and must not be less than the
   number of records per case already configured.

279 This setting affects parsing of DP_FIXED files only. */
281 data_parser_set_records (struct data_parser *parser, int records_per_case)
283 assert (records_per_case >= 0);
/* The record count may grow but never shrink. */
284 assert (records_per_case >= parser->records_per_case);
285 parser->records_per_case = records_per_case;
/* Appends a field to P's field array, growing the array when it is full.
   The field gets a copy of *FORMAT, CASE_IDX, a private copy of NAME, and
   the position given by RECORD and FIRST_COLUMN. */
289 add_field (struct data_parser *p, const struct fmt_spec *format, int case_idx,
290 const char *name, int record, int first_column)
/* Enlarge the array only when every allocated slot is in use. */
294 if (p->field_cnt == p->field_allocated)
295 p->fields = x2nrealloc (p->fields, &p->field_allocated, sizeof *p->fields);
/* Claim the next free slot and fill it in. */
296 field = &p->fields[p->field_cnt++];
297 field->format = *format;
298 field->case_idx = case_idx;
/* Duplicated so that the caller keeps ownership of NAME. */
299 field->name = xstrdup (name);
300 field->record = record;
301 field->first_column = first_column;
304 /* Adds a delimited field to the fields parsed by PARSER, which
305 must be configured as a DP_DELIMITED parser. The field is
306 parsed as input format FORMAT. Its data will be stored into case
307 index CASE_IDX. Errors in input data will be reported
308 against variable NAME. */
310 data_parser_add_delimited_field (struct data_parser *parser,
311 const struct fmt_spec *format, int case_idx,
314 assert (parser->type == DP_DELIMITED);
/* Delimited fields have no fixed position, so record and column are 0. */
315 add_field (parser, format, case_idx, name, 0, 0);
318 /* Adds a fixed field to the fields parsed by PARSER, which
319 must be configured as a DP_FIXED parser. The field is
320 parsed as input format FORMAT. Its data will be stored into case
321 index CASE_IDX. Errors in input data will be reported
322 against variable NAME. The field will be drawn from the
323 FORMAT->w columns in 1-based RECORD starting at 1-based
   column FIRST_COLUMN.

326 RECORD must be at least as great as that of any field already
327 added; that is, fields must be added in increasing order of
328 record number. If RECORD is greater than the current number
329 of records per case, the number of records per case is
330 increased as needed. */
332 data_parser_add_fixed_field (struct data_parser *parser,
333 const struct fmt_spec *format, int case_idx,
335 int record, int first_column)
337 assert (parser->type == DP_FIXED);
/* Only the most recently added field needs checking, because every
   earlier addition passed the same test. */
338 assert (parser->field_cnt == 0
339 || record >= parser->fields[parser->field_cnt - 1].record);
/* Extend the case to cover RECORD if it lies beyond the current count. */
340 if (record > parser->records_per_case)
341 parser->records_per_case = record;
342 add_field (parser, format, case_idx, name, record, first_column);
345 /* Returns true if any fields have been added to PARSER, false otherwise. */
348 data_parser_any_fields (const struct data_parser *parser)
/* A nonzero field count means at least one field was added. */
350 return parser->field_cnt > 0;
/* Rebuilds PARSER->any_sep as PARSER's soft separators followed by its
   hard separators.  Called at creation and whenever either set changes. */
354 set_any_sep (struct data_parser *parser)
356 ds_assign_substring (&parser->any_sep, parser->soft_seps);
357 ds_put_substring (&parser->any_sep, parser->hard_seps);
/* Forward declarations of the per-format case readers that
   data_parser_parse dispatches to.  Each takes the parser, the data file
   reader, and the case to fill in. */
360 static bool parse_delimited_span (const struct data_parser *,
361 struct dfm_reader *, struct ccase *);
362 static bool parse_delimited_no_span (const struct data_parser *,
363 struct dfm_reader *, struct ccase *);
364 static bool parse_fixed (const struct data_parser *,
365 struct dfm_reader *, struct ccase *);
367 /* Reads a case from READER into C, parsing it with PARSER. Returns
368 true if successful, false at end of file or on I/O error.

   Before reading, any records still pending from data_parser_set_skip are
   passed over, with end of file checked before each one.  The pending
   count in PARSER is decremented as this happens, so the skip applies
   once.  The case is then read by parse_delimited_span,
   parse_delimited_no_span, or parse_fixed, chosen by PARSER's type (and
   presumably by its span setting for delimited data -- the selecting
   condition is not among the lines shown here).

   PARSER must have at least one field.

370 Case C must not be shared. */
372 data_parser_parse (struct data_parser *parser, struct dfm_reader *reader,
377 assert (!case_is_shared (c));
378 assert (data_parser_any_fields (parser));
380 /* Skip the requested number of records before reading the
382 for (; parser->skip_records > 0; parser->skip_records--)
384 if (dfm_eof (reader))
386 dfm_forward_record (reader);
390 if (parser->type == DP_DELIMITED)
393 retval = parse_delimited_span (parser, reader, c);
395 retval = parse_delimited_no_span (parser, reader, c);
398 retval = parse_fixed (parser, reader, c);
403 /* Extracts a delimited field from the current position in the
404 current record according to PARSER, reading data from READER.

406 *FIELD is set to the field content. The caller must not modify or
407 destroy this constant string.  When doubled-quote escapes are
   processed, the content is assembled in TMP and *FIELD refers to TMP's
   contents; otherwise *FIELD is cut directly out of the record.

409 Sets *FIRST_COLUMN to the 1-based column number of the start of
410 the extracted field, and *LAST_COLUMN to the end of the extracted
   field.

413 Returns true on success, false on failure. */
415 cut_field (const struct data_parser *parser, struct dfm_reader *reader,
416 int *first_column, int *last_column, struct string *tmp,
417 struct substring *field)
419 size_t length_before_separators;
420 struct substring line, p;
423 if (dfm_eof (reader))
/* Tabs are expanded only when no hard separators are configured. */
425 if (ss_is_empty (parser->hard_seps))
426 dfm_expand_tabs (reader);
/* LINE keeps the whole remaining record; P is advanced as input is
   consumed, so the difference in their lengths is the amount consumed. */
427 line = p = dfm_get_record (reader);
429 /* Skip leading soft separators. */
430 ss_ltrim (&p, parser->soft_seps);
432 /* Handle empty or completely consumed lines. */
435 if (!parser->empty_line_has_field || dfm_columns_past_end (reader) > 0)
/* Report an empty field one column wide and step past it. */
440 *first_column = dfm_column_start (reader);
441 *last_column = *first_column + 1;
442 dfm_forward_columns (reader, 1);
447 *first_column = dfm_column_start (reader);
/* The field is quoted if its first byte is one of the quote characters. */
448 quoted = ss_find_byte (parser->quotes, ss_first (p)) != SIZE_MAX;
/* Quoted field: take everything up to the matching quote character. */
452 int quote = ss_get_byte (&p);
453 if (!ss_get_until (&p, quote, field))
454 msg (DW, _("Quoted string extends beyond end of line."));
/* A quote directly after the closing quote is an escaped quote when
   quote escaping is enabled; the pieces are then joined in TMP. */
455 if (parser->quote_escape && ss_first (p) == quote)
457 ds_assign_substring (tmp, *field);
458 while (ss_match_byte (&p, quote))
461 ds_put_byte (tmp, quote);
462 if (!ss_get_until (&p, quote, &ss))
463 msg (DW, _("Quoted string extends beyond end of line."));
464 ds_put_substring (tmp, ss);
466 *field = ds_ss (tmp);
/* For a quoted field the end column is based on the bytes consumed so
   far, which include the quote characters. */
468 *last_column = *first_column + (ss_length (line) - ss_length (p));
/* Unquoted field: take bytes up to the first separator of either kind. */
473 ss_get_bytes (&p, ss_cspan (p, ds_ss (&parser->any_sep)), field);
474 *last_column = *first_column + ss_length (*field);
477 /* Skip trailing soft separator and a single hard separator if present. */
478 length_before_separators = ss_length (p);
479 ss_ltrim (&p, parser->soft_seps);
481 && ss_find_byte (parser->hard_seps, ss_first (p)) != SIZE_MAX)
484 ss_ltrim (&p, parser->soft_seps);
487 dfm_forward_columns (reader, 1);
/* A quoted field followed by no separator at all is worth a warning. */
488 else if (quoted && length_before_separators == ss_length (p))
489 msg (DW, _("Missing delimiter following quoted string."));
/* Advance the reader past everything consumed from the record. */
490 dfm_forward_columns (reader, ss_length (line) - ss_length (p));
/* Builds a data warning (category MSG_C_DATA, severity MSG_S_WARNING)
   saying that the data for FIELD is not valid in FIELD's format, with
   ERROR as the explanation.  The message location is READER's file name
   and current line number together with FIRST_COLUMN and LAST_COLUMN.

   NOTE(review): what happens to the message and to ERROR afterward is not
   among the lines shown here -- confirm ownership of ERROR with the full
   definition. */
496 parse_error (const struct dfm_reader *reader, const struct field *field,
497 int first_column, int last_column, char *error)
499 int line_number = dfm_get_line_number (reader);
500 struct msg_location *location = xmalloc (sizeof *location);
501 *location = (struct msg_location) {
502 .file_name = intern_new (dfm_get_file_name (reader)),
503 .first_line = line_number,
/* One past LINE_NUMBER; presumably the end line is exclusive -- confirm
   against the definition of struct msg_location. */
504 .last_line = line_number + 1,
505 .first_column = first_column,
506 .last_column = last_column,
508 struct msg *m = xmalloc (sizeof *m);
510 .category = MSG_C_DATA,
511 .severity = MSG_S_WARNING,
512 .location = location,
513 .text = xasprintf (_("Data for variable %s is not valid as format %s: %s"),
514 field->name, fmt_name (field->format.type), error),
521 /* Reads a case from READER into C, parsing it according to
522 fixed-format syntax rules in PARSER.
523 Returns true if successful, false at end of file or on I/O error.

   One case consumes PARSER's records-per-case records.  Fields are
   visited in the order they were added, which
   data_parser_add_fixed_field keeps sorted by record number. */
525 parse_fixed (const struct data_parser *parser, struct dfm_reader *reader,
/* Input text is in READER's encoding; values are produced in the
   encoding of the destination dictionary. */
528 const char *input_encoding = dfm_reader_get_encoding (reader);
529 const char *output_encoding = dict_get_encoding (parser->dict);
533 if (dfm_eof (reader))
537 for (row = 1; row <= parser->records_per_case; row++)
539 struct substring line;
/* End of file partway through a case: report how many records were read. */
541 if (dfm_eof (reader))
543 msg (DW, _("Partial case of %d of %d records discarded."),
544 row - 1, parser->records_per_case);
547 dfm_expand_tabs (reader);
548 line = dfm_get_record (reader);
/* Handle every field that belongs to the current record. */
550 for (; f < &parser->fields[parser->field_cnt] && f->record == row; f++)
/* first_column is 1-based, so subtract 1 for the substring offset. */
552 struct substring s = ss_substr (line, f->first_column - 1,
554 union value *value = case_data_rw_idx (c, f->case_idx);
555 char *error = data_in (s, input_encoding, f->format.type,
556 settings_get_fmt_settings (),
557 value, fmt_var_width (&f->format),
561 data_in_imply_decimals (s, input_encoding, f->format.type,
562 f->format.d, settings_get_fmt_settings (),
/* Report the problem against the columns that the field occupies. */
565 parse_error (reader, f, f->first_column,
566 f->first_column + f->format.w, error);
569 dfm_forward_record (reader);
575 /* Reads a case from READER into C, parsing it according to
576 free-format syntax rules in PARSER.
577 Returns true if successful, false at end of file or on I/O error.

   Fields are cut from the input one after another with cut_field.  When
   the current record yields no further field, the next record is read, so
   a case may continue across records. */
579 parse_delimited_span (const struct data_parser *parser,
580 struct dfm_reader *reader, struct ccase *c)
582 const char *output_encoding = dict_get_encoding (parser->dict);
/* Scratch buffer handed to cut_field for fields with escaped quotes. */
583 struct string tmp = DS_EMPTY_INITIALIZER;
586 for (f = parser->fields; f < &parser->fields[parser->field_cnt]; f++)
589 int first_column, last_column;
592 /* Cut out a field and read in a new record if necessary. */
593 while (!cut_field (parser, reader,
594 &first_column, &last_column, &tmp, &s))
596 if (!dfm_eof (reader))
597 dfm_forward_record (reader);
598 if (dfm_eof (reader))
/* The warning is issued only if at least one field of this case was
   already read, i.e. F is not the first field. */
600 if (f > parser->fields)
601 msg (DW, _("Partial case discarded. The first variable "
602 "missing was %s."), f->name);
/* Convert the field text into the case value for this field. */
608 const char *input_encoding = dfm_reader_get_encoding (reader);
609 error = data_in (s, input_encoding, f->format.type,
610 settings_get_fmt_settings (),
611 case_data_rw_idx (c, f->case_idx),
612 fmt_var_width (&f->format), output_encoding);
614 parse_error (reader, f, first_column, last_column, error);
620 /* Reads a case from READER into C, parsing it according to
621 delimited syntax rules with one case per record in PARSER.
622 Returns true if successful, false at end of file or on I/O error.

   Fields that the record does not supply are set to missing.  Data left
   in the record after the last field draws a warning. */
624 parse_delimited_no_span (const struct data_parser *parser,
625 struct dfm_reader *reader, struct ccase *c)
627 const char *output_encoding = dict_get_encoding (parser->dict);
/* Scratch buffer handed to cut_field for fields with escaped quotes. */
628 struct string tmp = DS_EMPTY_INITIALIZER;
630 struct field *f, *end;
632 if (dfm_eof (reader))
635 end = &parser->fields[parser->field_cnt];
636 for (f = parser->fields; f < end; f++)
638 int first_column, last_column;
641 if (!cut_field (parser, reader, &first_column, &last_column, &tmp, &s))
/* No warning is given when only the final field is absent (F is the
   last field), when settings_get_undefined () is false, or when the
   parser was told not to warn about missing fields. */
643 if (f < end - 1 && settings_get_undefined () && parser->warn_missing_fields)
644 msg (DW, _("Missing value(s) for all variables from %s onward. "
645 "These will be filled with the system-missing value "
646 "or blanks, as appropriate."),
649 value_set_missing (case_data_rw_idx (c, f->case_idx),
650 fmt_var_width (&f->format));
/* Convert the field text into the case value for this field. */
654 const char *input_encoding = dfm_reader_get_encoding (reader);
655 error = data_in (s, input_encoding, f->format.type,
656 settings_get_fmt_settings (),
657 case_data_rw_idx (c, f->case_idx),
658 fmt_var_width (&f->format), output_encoding);
660 parse_error (reader, f, first_column, last_column, error);
/* Anything other than soft separators left in the record is surplus. */
663 s = dfm_get_record (reader);
664 ss_ltrim (&s, parser->soft_seps);
665 if (!ss_is_empty (s))
666 msg (DW, _("Record ends in data not part of any field."));
669 dfm_forward_record (reader);
674 /* Displays a table giving information on fixed-format variable
675 parsing on DATA LIST.

   The table has one row per field of PARSER and three columns: the
   field's record number, its column range, and its input format.  The
   title states the number of records per case and the name of FH. */
677 dump_fixed_table (const struct data_parser *parser,
678 const struct file_handle *fh)
680 /* XXX This should not be preformatted. */
681 char *title = xasprintf (ngettext ("Reading %d record from %s.",
682 "Reading %d records from %s.",
683 parser->records_per_case),
684 parser->records_per_case, fh_get_name (fh));
685 struct pivot_table *table = pivot_table_create__ (
686 pivot_value_new_user_text (title, -1), "Fixed Data Records");
/* Column dimension; its categories get indexes 0, 1, and 2 below. */
689 pivot_dimension_create (
690 table, PIVOT_AXIS_COLUMN, N_("Attributes"),
691 N_("Record"), N_("Columns"), N_("Format"));
/* Row dimension; one leaf category is created per field in the loop. */
693 struct pivot_dimension *variables = pivot_dimension_create (
694 table, PIVOT_AXIS_ROW, N_("Variable"));
695 variables->root->show_label = true;
696 for (size_t i = 0; i < parser->field_cnt; i++)
698 struct field *f = &parser->fields[i];
700 /* XXX It would be better to have the actual variable here. */
701 int variable_idx = pivot_category_create_leaf (
702 variables->root, pivot_value_new_user_text (f->name, -1));
704 pivot_table_put2 (table, 0, variable_idx,
705 pivot_value_new_integer (f->record));
/* Inclusive column range: a field of width w starting at column c ends
   at column c + w - 1. */
707 int first_column = f->first_column;
708 int last_column = f->first_column + f->format.w - 1;
709 char *columns = xasprintf ("%d-%d", first_column, last_column);
710 pivot_table_put2 (table, 1, variable_idx,
711 pivot_value_new_user_text (columns, -1));
714 char str[FMT_STRING_LEN_MAX + 1];
715 pivot_table_put2 (table, 2, variable_idx,
716 pivot_value_new_user_text (
717 fmt_to_string (&f->format, str), -1));
721 pivot_table_submit (table);
724 /* Displays a table giving information on free-format variable parsing on DATA LIST. */
727 dump_delimited_table (const struct data_parser *parser,
728 const struct file_handle *fh)
730 struct pivot_table *table = pivot_table_create__ (
731 pivot_value_new_text_format (N_("Reading free-form data from %s."),
733 "Free-Form Data Records");
/* A single column: the input format of each field. */
735 pivot_dimension_create (
736 table, PIVOT_AXIS_COLUMN, N_("Attributes"), N_("Format"));
/* Row dimension; one leaf category is created per field in the loop. */
738 struct pivot_dimension *variables = pivot_dimension_create (
739 table, PIVOT_AXIS_ROW, N_("Variable"));
740 variables->root->show_label = true;
741 for (size_t i = 0; i < parser->field_cnt; i++)
743 struct field *f = &parser->fields[i];
745 /* XXX It would be better to have the actual variable here. */
746 int variable_idx = pivot_category_create_leaf (
747 variables->root, pivot_value_new_user_text (f->name, -1));
749 char str[FMT_STRING_LEN_MAX + 1];
750 pivot_table_put2 (table, 0, variable_idx,
751 pivot_value_new_user_text (
752 fmt_to_string (&f->format, str), -1));
755 pivot_table_submit (table);
758 /* Displays a table giving information on how PARSER will read data from FH. */
761 data_parser_output_description (struct data_parser *parser,
762 const struct file_handle *fh)
/* Fixed-format parsers get the table with records and columns; the other
   table, used otherwise, lists formats only. */
764 if (parser->type == DP_FIXED)
765 dump_fixed_table (parser, fh);
767 dump_delimited_table (parser, fh);
770 /* Data parser input program: the state behind a casereader that
   produces cases by parsing a data file.  All three members are released
   by data_parser_casereader_destroy. */
771 struct data_parser_casereader
773 struct data_parser *parser; /* Parser. */
774 struct dfm_reader *reader; /* Data file reader. */
775 struct caseproto *proto; /* Format of cases. */
/* Forward declaration; the class is defined at the end of the file and is
   needed by data_parser_make_active_file below. */
778 static const struct casereader_class data_parser_casereader_class;
780 /* Replaces DS's active dataset by an input program that reads data
781 from READER according to the rules in PARSER, using DICT as
782 the underlying dictionary. Ownership of PARSER and READER is
783 transferred to the input program, and ownership of DICT is
784 transferred to the dataset.

   FUNC, when used, is called with the new casereader, DICT, and UD, and
   the casereader it returns becomes the dataset's source in place of the
   plain one.  NOTE(review): the condition that selects between the two is
   not among the lines shown here; presumably FUNC is used when it is
   non-null -- confirm. */
786 data_parser_make_active_file (struct data_parser *parser, struct dataset *ds,
787 struct dfm_reader *reader,
788 struct dictionary *dict,
789 struct casereader* (*func)(struct casereader *,
790 const struct dictionary *,
794 struct data_parser_casereader *r;
795 struct casereader *casereader0;
796 struct casereader *casereader1;
798 r = xmalloc (sizeof *r);
/* Cases produced by the reader follow DICT's case prototype. */
801 r->proto = caseproto_ref (dict_get_proto (dict));
802 casereader0 = casereader_create_sequential (NULL, r->proto,
804 &data_parser_casereader_class, r);
807 casereader1 = func (casereader0, dict, ud);
809 casereader1 = casereader0;
811 dataset_set_dict (ds, dict);
812 dataset_set_source (ds, casereader1);
/* Casereader "read" callback.  Creates a new case with the reader's case
   prototype and asks data_parser_parse to fill it in from the data file.
   R_ is the struct data_parser_casereader; READER is unused. */
816 static struct ccase *
817 data_parser_casereader_read (struct casereader *reader UNUSED, void *r_)
819 struct data_parser_casereader *r = r_;
820 struct ccase *c = case_create (r->proto);
821 if (data_parser_parse (r->parser, r->reader, c))
/* Casereader "destroy" callback.  R_ is the struct data_parser_casereader
   whose members are released here. */
831 data_parser_casereader_destroy (struct casereader *reader, void *r_)
833 struct data_parser_casereader *r = r_;
/* Pass an error in the data file reader on to the casereader before the
   data file reader is closed. */
834 if (dfm_reader_error (r->reader))
835 casereader_force_error (reader);
836 dfm_close_reader (r->reader);
837 caseproto_unref (r->proto);
838 data_parser_destroy (r->parser);
/* Casereader class for the data parser input program, listing its read
   and destroy callbacks in that order. */
842 static const struct casereader_class data_parser_casereader_class =
844 data_parser_casereader_read,
845 data_parser_casereader_destroy,