1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007, 2009, 2010, 2011, 2012, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/data-io/data-parser.h"
24 #include "data/casereader-provider.h"
25 #include "data/data-in.h"
26 #include "data/dataset.h"
27 #include "data/dictionary.h"
28 #include "data/format.h"
29 #include "data/file-handle-def.h"
30 #include "data/settings.h"
31 #include "language/data-io/data-reader.h"
32 #include "libpspp/message.h"
33 #include "libpspp/str.h"
34 #include "output/pivot-table.h"
36 #include "gl/xalloc.h"
39 #define N_(msgid) msgid
40 #define _(msgid) gettext (msgid)
42 /* Data parser for textual data like that read by DATA LIST. */
45 const struct dictionary *dict; /*Dictionary of destination */
46 enum data_parser_type type; /* Type of data to parse. */
47 int skip_records; /* Records to skip before first real data. */
49 struct field *fields; /* Fields to parse. */
50 size_t field_cnt; /* Number of fields. */
51 size_t field_allocated; /* Number of fields spaced allocated for. */
53 /* DP_DELIMITED parsers only. */
54 bool span; /* May cases span multiple records? */
55 bool empty_line_has_field; /* Does an empty line have an (empty) field? */
56 bool warn_missing_fields; /* Should missing fields be considered errors? */
57 struct substring quotes; /* Characters that can quote separators. */
58 bool quote_escape; /* Doubled quote acts as escape? */
59 struct substring soft_seps; /* Two soft separators act like just one. */
60 struct substring hard_seps; /* Two hard separators yield empty fields. */
61 struct string any_sep; /* Concatenation of soft_seps and hard_seps. */
63 /* DP_FIXED parsers only. */
64 int records_per_case; /* Number of records in each case. */
67 /* How to parse one variable. */
70 struct fmt_spec format; /* Input format of this field. */
71 int case_idx; /* First value in case. */
72 char *name; /* Var name for error messages and tables. */
75 int record; /* Record number (1-based). */
76 int first_column; /* First column in record (1-based). */
79 static void set_any_sep (struct data_parser *parser);
81 /* Creates and returns a new data parser. */
83 data_parser_create (const struct dictionary *dict)
85 struct data_parser *parser = xmalloc (sizeof *parser);
87 parser->type = DP_FIXED;
88 parser->skip_records = 0;
90 parser->fields = NULL;
91 parser->field_cnt = 0;
92 parser->field_allocated = 0;
96 parser->empty_line_has_field = false;
97 parser->warn_missing_fields = true;
98 ss_alloc_substring (&parser->quotes, ss_cstr ("\"'"));
99 parser->quote_escape = false;
100 ss_alloc_substring (&parser->soft_seps, ss_cstr (CC_SPACES));
101 ss_alloc_substring (&parser->hard_seps, ss_cstr (","));
102 ds_init_empty (&parser->any_sep);
103 set_any_sep (parser);
105 parser->records_per_case = 0;
110 /* Destroys PARSER. */
112 data_parser_destroy (struct data_parser *parser)
118 for (i = 0; i < parser->field_cnt; i++)
119 free (parser->fields[i].name);
120 free (parser->fields);
121 ss_dealloc (&parser->quotes);
122 ss_dealloc (&parser->soft_seps);
123 ss_dealloc (&parser->hard_seps);
124 ds_destroy (&parser->any_sep);
129 /* Returns the type of PARSER (either DP_DELIMITED or DP_FIXED). */
130 enum data_parser_type
131 data_parser_get_type (const struct data_parser *parser)
136 /* Sets the type of PARSER to TYPE (either DP_DELIMITED or
139 data_parser_set_type (struct data_parser *parser, enum data_parser_type type)
141 assert (parser->field_cnt == 0);
142 assert (type == DP_FIXED || type == DP_DELIMITED);
146 /* Configures PARSER to skip the specified number of
147 INITIAL_RECORDS_TO_SKIP before parsing any data. By default,
148 no records are skipped. */
150 data_parser_set_skip (struct data_parser *parser, int initial_records_to_skip)
152 assert (initial_records_to_skip >= 0);
153 parser->skip_records = initial_records_to_skip;
156 /* Returns true if PARSER is configured to allow cases to span
159 data_parser_get_span (const struct data_parser *parser)
164 /* If MAY_CASES_SPAN_RECORDS is true, configures PARSER to allow
165 a single case to span multiple records and multiple cases to
166 occupy a single record. If MAY_CASES_SPAN_RECORDS is false,
167 configures PARSER to require each record to contain exactly
170 This setting affects parsing of DP_DELIMITED files only. */
172 data_parser_set_span (struct data_parser *parser, bool may_cases_span_records)
174 parser->span = may_cases_span_records;
177 /* If EMPTY_LINE_HAS_FIELD is true, configures PARSER to parse an
178 empty line as an empty field and to treat a hard delimiter
179 followed by end-of-line as an empty field. If
180 EMPTY_LINE_HAS_FIELD is false, PARSER will skip empty lines
181 and hard delimiters at the end of lines without emitting empty
184 This setting affects parsing of DP_DELIMITED files only. */
186 data_parser_set_empty_line_has_field (struct data_parser *parser,
187 bool empty_line_has_field)
189 parser->empty_line_has_field = empty_line_has_field;
193 /* If WARN_MISSING_FIELDS is true, configures PARSER to emit a warning
194 and cause an error condition when a missing field is encountered.
195 If WARN_MISSING_FIELDS is false, PARSER will silently fill such
196 fields with the system missing value.
198 This setting affects parsing of DP_DELIMITED files only. */
200 data_parser_set_warn_missing_fields (struct data_parser *parser,
201 bool warn_missing_fields)
203 parser->warn_missing_fields = warn_missing_fields;
207 /* Sets the characters that may be used for quoting field
208 contents to QUOTES. If QUOTES is empty, quoting will be
211 The caller retains ownership of QUOTES.
213 This setting affects parsing of DP_DELIMITED files only. */
215 data_parser_set_quotes (struct data_parser *parser, struct substring quotes)
217 ss_dealloc (&parser->quotes);
218 ss_alloc_substring (&parser->quotes, quotes);
221 /* If ESCAPE is false (the default setting), a character used for
222 quoting cannot itself be embedded within a quoted field. If
223 ESCAPE is true, then a quote character can be embedded within
224 a quoted field by doubling it.
226 This setting affects parsing of DP_DELIMITED files only, and
227 only when at least one quote character has been set (with
228 data_parser_set_quotes). */
230 data_parser_set_quote_escape (struct data_parser *parser, bool escape)
232 parser->quote_escape = escape;
235 /* Sets PARSER's soft delimiters to DELIMITERS. Soft delimiters
236 separate fields, but consecutive soft delimiters do not yield
237 empty fields. (Ordinarily, only white space characters are
238 appropriate soft delimiters.)
240 The caller retains ownership of DELIMITERS.
242 This setting affects parsing of DP_DELIMITED files only. */
244 data_parser_set_soft_delimiters (struct data_parser *parser,
245 struct substring delimiters)
247 ss_dealloc (&parser->soft_seps);
248 ss_alloc_substring (&parser->soft_seps, delimiters);
249 set_any_sep (parser);
252 /* Sets PARSER's hard delimiters to DELIMITERS. Hard delimiters
253 separate fields. A consecutive pair of hard delimiters yield
256 The caller retains ownership of DELIMITERS.
258 This setting affects parsing of DP_DELIMITED files only. */
260 data_parser_set_hard_delimiters (struct data_parser *parser,
261 struct substring delimiters)
263 ss_dealloc (&parser->hard_seps);
264 ss_alloc_substring (&parser->hard_seps, delimiters);
265 set_any_sep (parser);
268 /* Returns the number of records per case. */
270 data_parser_get_records (const struct data_parser *parser)
272 return parser->records_per_case;
275 /* Sets the number of records per case to RECORDS_PER_CASE.
277 This setting affects parsing of DP_FIXED files only. */
279 data_parser_set_records (struct data_parser *parser, int records_per_case)
281 assert (records_per_case >= 0);
282 assert (records_per_case >= parser->records_per_case);
283 parser->records_per_case = records_per_case;
287 add_field (struct data_parser *p, const struct fmt_spec *format, int case_idx,
288 const char *name, int record, int first_column)
292 if (p->field_cnt == p->field_allocated)
293 p->fields = x2nrealloc (p->fields, &p->field_allocated, sizeof *p->fields);
294 field = &p->fields[p->field_cnt++];
295 field->format = *format;
296 field->case_idx = case_idx;
297 field->name = xstrdup (name);
298 field->record = record;
299 field->first_column = first_column;
302 /* Adds a delimited field to the field parsed by PARSER, which
303 must be configured as a DP_DELIMITED parser. The field is
304 parsed as input format FORMAT. Its data will be stored into case
305 index CASE_INDEX. Errors in input data will be reported
306 against variable NAME. */
308 data_parser_add_delimited_field (struct data_parser *parser,
309 const struct fmt_spec *format, int case_idx,
312 assert (parser->type == DP_DELIMITED);
313 add_field (parser, format, case_idx, name, 0, 0);
316 /* Adds a fixed field to the field parsed by PARSER, which
317 must be configured as a DP_FIXED parser. The field is
318 parsed as input format FORMAT. Its data will be stored into case
319 index CASE_INDEX. Errors in input data will be reported
320 against variable NAME. The field will be drawn from the
321 FORMAT->w columns in 1-based RECORD starting at 1-based
324 RECORD must be at least as great as that of any field already
325 added; that is, fields must be added in increasing order of
326 record number. If RECORD is greater than the current number
327 of records per case, the number of records per case are
328 increased as needed. */
330 data_parser_add_fixed_field (struct data_parser *parser,
331 const struct fmt_spec *format, int case_idx,
333 int record, int first_column)
335 assert (parser->type == DP_FIXED);
336 assert (parser->field_cnt == 0
337 || record >= parser->fields[parser->field_cnt - 1].record);
338 if (record > parser->records_per_case)
339 parser->records_per_case = record;
340 add_field (parser, format, case_idx, name, record, first_column);
343 /* Returns true if any fields have been added to PARSER, false
346 data_parser_any_fields (const struct data_parser *parser)
348 return parser->field_cnt > 0;
352 set_any_sep (struct data_parser *parser)
354 ds_assign_substring (&parser->any_sep, parser->soft_seps);
355 ds_put_substring (&parser->any_sep, parser->hard_seps);
358 static bool parse_delimited_span (const struct data_parser *,
359 struct dfm_reader *, struct ccase *);
360 static bool parse_delimited_no_span (const struct data_parser *,
361 struct dfm_reader *, struct ccase *);
362 static bool parse_fixed (const struct data_parser *,
363 struct dfm_reader *, struct ccase *);
365 /* Reads a case from DFM into C, parsing it with PARSER. Returns
366 true if successful, false at end of file or on I/O error.
368 Case C must not be shared. */
370 data_parser_parse (struct data_parser *parser, struct dfm_reader *reader,
375 assert (!case_is_shared (c));
376 assert (data_parser_any_fields (parser));
378 /* Skip the requested number of records before reading the
380 for (; parser->skip_records > 0; parser->skip_records--)
382 if (dfm_eof (reader))
384 dfm_forward_record (reader);
388 if (parser->type == DP_DELIMITED)
391 retval = parse_delimited_span (parser, reader, c);
393 retval = parse_delimited_no_span (parser, reader, c);
396 retval = parse_fixed (parser, reader, c);
401 /* Extracts a delimited field from the current position in the
402 current record according to PARSER, reading data from READER.
404 *FIELD is set to the field content. The caller must not or
405 destroy this constant string.
407 Sets *FIRST_COLUMN to the 1-based column number of the start of
408 the extracted field, and *LAST_COLUMN to the end of the extracted
411 Returns true on success, false on failure. */
413 cut_field (const struct data_parser *parser, struct dfm_reader *reader,
414 int *first_column, int *last_column, struct string *tmp,
415 struct substring *field)
417 size_t length_before_separators;
418 struct substring line, p;
421 if (dfm_eof (reader))
423 if (ss_is_empty (parser->hard_seps))
424 dfm_expand_tabs (reader);
425 line = p = dfm_get_record (reader);
427 /* Skip leading soft separators. */
428 ss_ltrim (&p, parser->soft_seps);
430 /* Handle empty or completely consumed lines. */
433 if (!parser->empty_line_has_field || dfm_columns_past_end (reader) > 0)
438 *first_column = dfm_column_start (reader);
439 *last_column = *first_column + 1;
440 dfm_forward_columns (reader, 1);
445 *first_column = dfm_column_start (reader);
446 quoted = ss_find_byte (parser->quotes, ss_first (p)) != SIZE_MAX;
450 int quote = ss_get_byte (&p);
451 if (!ss_get_until (&p, quote, field))
452 msg (DW, _("Quoted string extends beyond end of line."));
453 if (parser->quote_escape && ss_first (p) == quote)
455 ds_assign_substring (tmp, *field);
456 while (ss_match_byte (&p, quote))
459 ds_put_byte (tmp, quote);
460 if (!ss_get_until (&p, quote, &ss))
461 msg (DW, _("Quoted string extends beyond end of line."));
462 ds_put_substring (tmp, ss);
464 *field = ds_ss (tmp);
466 *last_column = *first_column + (ss_length (line) - ss_length (p));
471 ss_get_bytes (&p, ss_cspan (p, ds_ss (&parser->any_sep)), field);
472 *last_column = *first_column + ss_length (*field);
475 /* Skip trailing soft separator and a single hard separator if present. */
476 length_before_separators = ss_length (p);
477 ss_ltrim (&p, parser->soft_seps);
479 && ss_find_byte (parser->hard_seps, ss_first (p)) != SIZE_MAX)
482 ss_ltrim (&p, parser->soft_seps);
485 dfm_forward_columns (reader, 1);
486 else if (quoted && length_before_separators == ss_length (p))
487 msg (DW, _("Missing delimiter following quoted string."));
488 dfm_forward_columns (reader, ss_length (line) - ss_length (p));
494 parse_error (const struct dfm_reader *reader, const struct field *field,
495 int first_column, int last_column, char *error)
498 .category = MSG_C_DATA,
499 .severity = MSG_S_WARNING,
500 .file_name = CONST_CAST (char *, dfm_get_file_name (reader)),
501 .first_line = dfm_get_line_number (reader),
502 .last_line = m.first_line + 1,
503 .first_column = first_column,
504 .last_column = last_column,
505 .text = xasprintf (_("Data for variable %s is not valid as format %s: %s"),
506 field->name, fmt_name (field->format.type), error),
513 /* Reads a case from READER into C, parsing it according to
514 fixed-format syntax rules in PARSER.
515 Returns true if successful, false at end of file or on I/O error. */
517 parse_fixed (const struct data_parser *parser, struct dfm_reader *reader,
520 const char *input_encoding = dfm_reader_get_encoding (reader);
521 const char *output_encoding = dict_get_encoding (parser->dict);
525 if (dfm_eof (reader))
529 for (row = 1; row <= parser->records_per_case; row++)
531 struct substring line;
533 if (dfm_eof (reader))
535 msg (DW, _("Partial case of %d of %d records discarded."),
536 row - 1, parser->records_per_case);
539 dfm_expand_tabs (reader);
540 line = dfm_get_record (reader);
542 for (; f < &parser->fields[parser->field_cnt] && f->record == row; f++)
544 struct substring s = ss_substr (line, f->first_column - 1,
546 union value *value = case_data_rw_idx (c, f->case_idx);
547 char *error = data_in (s, input_encoding, f->format.type,
548 value, fmt_var_width (&f->format),
552 data_in_imply_decimals (s, input_encoding, f->format.type,
555 parse_error (reader, f, f->first_column,
556 f->first_column + f->format.w, error);
559 dfm_forward_record (reader);
565 /* Reads a case from READER into C, parsing it according to
566 free-format syntax rules in PARSER.
567 Returns true if successful, false at end of file or on I/O error. */
569 parse_delimited_span (const struct data_parser *parser,
570 struct dfm_reader *reader, struct ccase *c)
572 const char *output_encoding = dict_get_encoding (parser->dict);
573 struct string tmp = DS_EMPTY_INITIALIZER;
576 for (f = parser->fields; f < &parser->fields[parser->field_cnt]; f++)
579 int first_column, last_column;
582 /* Cut out a field and read in a new record if necessary. */
583 while (!cut_field (parser, reader,
584 &first_column, &last_column, &tmp, &s))
586 if (!dfm_eof (reader))
587 dfm_forward_record (reader);
588 if (dfm_eof (reader))
590 if (f > parser->fields)
591 msg (DW, _("Partial case discarded. The first variable "
592 "missing was %s."), f->name);
598 const char *input_encoding = dfm_reader_get_encoding (reader);
599 error = data_in (s, input_encoding, f->format.type,
600 case_data_rw_idx (c, f->case_idx),
601 fmt_var_width (&f->format), output_encoding);
603 parse_error (reader, f, first_column, last_column, error);
609 /* Reads a case from READER into C, parsing it according to
610 delimited syntax rules with one case per record in PARSER.
611 Returns true if successful, false at end of file or on I/O error. */
613 parse_delimited_no_span (const struct data_parser *parser,
614 struct dfm_reader *reader, struct ccase *c)
616 const char *output_encoding = dict_get_encoding (parser->dict);
617 struct string tmp = DS_EMPTY_INITIALIZER;
619 struct field *f, *end;
621 if (dfm_eof (reader))
624 end = &parser->fields[parser->field_cnt];
625 for (f = parser->fields; f < end; f++)
627 int first_column, last_column;
630 if (!cut_field (parser, reader, &first_column, &last_column, &tmp, &s))
632 if (f < end - 1 && settings_get_undefined () && parser->warn_missing_fields)
633 msg (DW, _("Missing value(s) for all variables from %s onward. "
634 "These will be filled with the system-missing value "
635 "or blanks, as appropriate."),
638 value_set_missing (case_data_rw_idx (c, f->case_idx),
639 fmt_var_width (&f->format));
643 const char *input_encoding = dfm_reader_get_encoding (reader);
644 error = data_in (s, input_encoding, f->format.type,
645 case_data_rw_idx (c, f->case_idx),
646 fmt_var_width (&f->format), output_encoding);
648 parse_error (reader, f, first_column, last_column, error);
651 s = dfm_get_record (reader);
652 ss_ltrim (&s, parser->soft_seps);
653 if (!ss_is_empty (s))
654 msg (DW, _("Record ends in data not part of any field."));
657 dfm_forward_record (reader);
662 /* Displays a table giving information on fixed-format variable
663 parsing on DATA LIST. */
665 dump_fixed_table (const struct data_parser *parser,
666 const struct file_handle *fh)
668 /* XXX This should not be preformatted. */
669 char *title = xasprintf (ngettext ("Reading %d record from %s.",
670 "Reading %d records from %s.",
671 parser->records_per_case),
672 parser->records_per_case, fh_get_name (fh));
673 struct pivot_table *table = pivot_table_create__ (
674 pivot_value_new_user_text (title, -1), "Fixed Data Records");
677 pivot_dimension_create (
678 table, PIVOT_AXIS_COLUMN, N_("Attributes"),
679 N_("Record"), N_("Columns"), N_("Format"));
681 struct pivot_dimension *variables = pivot_dimension_create (
682 table, PIVOT_AXIS_ROW, N_("Variable"));
683 variables->root->show_label = true;
684 for (size_t i = 0; i < parser->field_cnt; i++)
686 struct field *f = &parser->fields[i];
688 /* XXX It would be better to have the actual variable here. */
689 int variable_idx = pivot_category_create_leaf (
690 variables->root, pivot_value_new_user_text (f->name, -1));
692 pivot_table_put2 (table, 0, variable_idx,
693 pivot_value_new_integer (f->record));
695 int first_column = f->first_column;
696 int last_column = f->first_column + f->format.w - 1;
697 char *columns = xasprintf ("%3d-%3d", first_column, last_column);
698 pivot_table_put2 (table, 1, variable_idx,
699 pivot_value_new_user_text (columns, -1));
702 char str[FMT_STRING_LEN_MAX + 1];
703 pivot_table_put2 (table, 2, variable_idx,
704 pivot_value_new_user_text (
705 fmt_to_string (&f->format, str), -1));
709 pivot_table_submit (table);
712 /* Displays a table giving information on free-format variable parsing
715 dump_delimited_table (const struct data_parser *parser,
716 const struct file_handle *fh)
718 struct pivot_table *table = pivot_table_create__ (
719 pivot_value_new_text_format (N_("Reading free-form data from %s."),
721 "Free-Form Data Records");
723 pivot_dimension_create (
724 table, PIVOT_AXIS_COLUMN, N_("Attributes"), N_("Format"));
726 struct pivot_dimension *variables = pivot_dimension_create (
727 table, PIVOT_AXIS_ROW, N_("Variable"));
728 variables->root->show_label = true;
729 for (size_t i = 0; i < parser->field_cnt; i++)
731 struct field *f = &parser->fields[i];
733 /* XXX It would be better to have the actual variable here. */
734 int variable_idx = pivot_category_create_leaf (
735 variables->root, pivot_value_new_user_text (f->name, -1));
737 char str[FMT_STRING_LEN_MAX + 1];
738 pivot_table_put2 (table, 0, variable_idx,
739 pivot_value_new_user_text (
740 fmt_to_string (&f->format, str), -1));
743 pivot_table_submit (table);
746 /* Displays a table giving information on how PARSER will read
749 data_parser_output_description (struct data_parser *parser,
750 const struct file_handle *fh)
752 if (parser->type == DP_FIXED)
753 dump_fixed_table (parser, fh);
755 dump_delimited_table (parser, fh);
/* Data parser input program: the auxiliary data of a casereader
   that produces cases by parsing a data file. */
struct data_parser_casereader
  {
    struct data_parser *parser; /* Parser (owned). */
    struct dfm_reader *reader;  /* Data file reader (owned). */
    struct caseproto *proto;    /* Format of cases (holds a reference). */
  };
766 static const struct casereader_class data_parser_casereader_class;
768 /* Replaces DS's active dataset by an input program that reads data
769 from READER according to the rules in PARSER, using DICT as
770 the underlying dictionary. Ownership of PARSER and READER is
771 transferred to the input program, and ownership of DICT is
772 transferred to the dataset. */
774 data_parser_make_active_file (struct data_parser *parser, struct dataset *ds,
775 struct dfm_reader *reader,
776 struct dictionary *dict,
777 struct casereader* (*func)(struct casereader *,
778 const struct dictionary *,
782 struct data_parser_casereader *r;
783 struct casereader *casereader0;
784 struct casereader *casereader1;
786 r = xmalloc (sizeof *r);
789 r->proto = caseproto_ref (dict_get_proto (dict));
790 casereader0 = casereader_create_sequential (NULL, r->proto,
792 &data_parser_casereader_class, r);
795 casereader1 = func (casereader0, dict, ud);
797 casereader1 = casereader0;
799 dataset_set_dict (ds, dict);
800 dataset_set_source (ds, casereader1);
804 static struct ccase *
805 data_parser_casereader_read (struct casereader *reader UNUSED, void *r_)
807 struct data_parser_casereader *r = r_;
808 struct ccase *c = case_create (r->proto);
809 if (data_parser_parse (r->parser, r->reader, c))
819 data_parser_casereader_destroy (struct casereader *reader, void *r_)
821 struct data_parser_casereader *r = r_;
822 if (dfm_reader_error (r->reader))
823 casereader_force_error (reader);
824 data_parser_destroy (r->parser);
825 dfm_close_reader (r->reader);
826 caseproto_unref (r->proto);
830 static const struct casereader_class data_parser_casereader_class =
832 data_parser_casereader_read,
833 data_parser_casereader_destroy,