1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007, 2009, 2010, 2011, 2012, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/data-io/data-parser.h"
24 #include "data/casereader-provider.h"
25 #include "data/data-in.h"
26 #include "data/dataset.h"
27 #include "data/dictionary.h"
28 #include "data/format.h"
29 #include "data/file-handle-def.h"
30 #include "data/settings.h"
31 #include "language/data-io/data-reader.h"
32 #include "libpspp/message.h"
33 #include "libpspp/str.h"
34 #include "output/pivot-table.h"
36 #include "gl/xalloc.h"
39 #define N_(msgid) msgid
40 #define _(msgid) gettext (msgid)
42 /* Data parser for textual data like that read by DATA LIST. */
/* Members are grouped by the parser type they apply to: common settings
   first, then DP_DELIMITED-only settings, then DP_FIXED-only settings. */
45 struct dictionary *dict; /* Dictionary of destination (parser holds a reference). */
46 enum data_parser_type type; /* Type of data to parse. */
47 int skip_records; /* Records to skip before first real data. */
49 struct field *fields; /* Fields to parse. */
50 size_t field_cnt; /* Number of fields. */
51 size_t field_allocated; /* Number of fields that space is allocated for. */
53 /* DP_DELIMITED parsers only. */
54 bool span; /* May cases span multiple records? */
55 bool empty_line_has_field; /* Does an empty line have an (empty) field? */
56 bool warn_missing_fields; /* Warn when a record runs out of fields? */
57 struct substring quotes; /* Characters that can quote separators. */
58 bool quote_escape; /* Doubled quote acts as escape? */
59 struct substring soft_seps; /* Two soft separators act like just one. */
60 struct substring hard_seps; /* Two hard separators yield empty fields. */
61 struct string any_sep; /* Concatenation of soft_seps and hard_seps. */
63 /* DP_FIXED parsers only. */
64 int records_per_case; /* Number of records in each case. */
67 /* How to parse one variable. */
70 struct fmt_spec format; /* Input format of this field. */
71 int case_idx; /* First value in case. */
72 char *name; /* Var name for error messages and tables (owned copy). */
/* The two members below locate the field for DP_FIXED parsing;
   data_parser_add_delimited_field stores 0 in both. */
75 int record; /* Record number (1-based). */
76 int first_column; /* First column in record (1-based). */
79 static void set_any_sep (struct data_parser *parser);
81 /* Creates and returns a new data parser. */
/* The new parser takes its own reference to DICT with dict_ref.  It starts
   as a DP_FIXED parser with no fields, no records to skip and zero records
   per case.  The delimited-mode defaults are: double and single quote
   characters as quotes, CC_SPACES as soft separators, a comma as the only
   hard separator, no quote escaping, empty lines yield no field, and
   warnings about missing fields enabled. */
83 data_parser_create (struct dictionary *dict)
85 struct data_parser *parser = xmalloc (sizeof *parser);
87 parser->type = DP_FIXED;
88 parser->skip_records = 0;
90 parser->fields = NULL;
91 parser->field_cnt = 0;
92 parser->field_allocated = 0;
93 parser->dict = dict_ref (dict);
/* NOTE(review): no default for parser->span is visible among these
   initializations -- confirm that it is assigned before first use. */
96 parser->empty_line_has_field = false;
97 parser->warn_missing_fields = true;
98 ss_alloc_substring (&parser->quotes, ss_cstr ("\"'"));
99 parser->quote_escape = false;
100 ss_alloc_substring (&parser->soft_seps, ss_cstr (CC_SPACES));
101 ss_alloc_substring (&parser->hard_seps, ss_cstr (","));
/* any_sep is derived from the two separator sets, so it is built only
   after both have been stored. */
102 ds_init_empty (&parser->any_sep);
103 set_any_sep (parser);
105 parser->records_per_case = 0;
110 /* Destroys PARSER. */
/* Releases everything the parser acquired: the dictionary reference taken
   in data_parser_create, the copied name of every field, the field array
   itself, and the quote and separator strings. */
112 data_parser_destroy (struct data_parser *parser)
118 dict_unref (parser->dict);
/* Each name was duplicated by add_field, so each is freed individually
   before the array that holds the fields. */
119 for (i = 0; i < parser->field_cnt; i++)
120 free (parser->fields[i].name);
121 free (parser->fields);
122 ss_dealloc (&parser->quotes);
123 ss_dealloc (&parser->soft_seps);
124 ss_dealloc (&parser->hard_seps);
125 ds_destroy (&parser->any_sep);
130 /* Returns the type of PARSER (either DP_DELIMITED or DP_FIXED). */
131 enum data_parser_type
132 data_parser_get_type (const struct data_parser *parser)
137 /* Sets the type of PARSER to TYPE (either DP_DELIMITED or DP_FIXED). */
140 data_parser_set_type (struct data_parser *parser, enum data_parser_type type)
/* The type may only be changed while PARSER has no fields, and TYPE must be
   one of the two known parser types; both conditions are asserted. */
142 assert (parser->field_cnt == 0);
143 assert (type == DP_FIXED || type == DP_DELIMITED);
147 /* Configures PARSER to skip the specified number of
148 INITIAL_RECORDS_TO_SKIP before parsing any data. By default,
149 no records are skipped.
   INITIAL_RECORDS_TO_SKIP must not be negative (asserted).
   data_parser_parse counts the stored value down to zero while it skips
   records, so the setting is used up by the first parse. */
151 data_parser_set_skip (struct data_parser *parser, int initial_records_to_skip)
153 assert (initial_records_to_skip >= 0);
154 parser->skip_records = initial_records_to_skip;
157 /* Returns true if PARSER is configured to allow cases to span multiple records. */
160 data_parser_get_span (const struct data_parser *parser)
165 /* If MAY_CASES_SPAN_RECORDS is true, configures PARSER to allow
166 a single case to span multiple records and multiple cases to
167 occupy a single record. If MAY_CASES_SPAN_RECORDS is false,
168 configures PARSER to require each record to contain exactly one case.
171 This setting affects parsing of DP_DELIMITED files only. */
173 data_parser_set_span (struct data_parser *parser, bool may_cases_span_records)
175 parser->span = may_cases_span_records;
178 /* If EMPTY_LINE_HAS_FIELD is true, configures PARSER to parse an
179 empty line as an empty field and to treat a hard delimiter
180 followed by end-of-line as an empty field. If
181 EMPTY_LINE_HAS_FIELD is false, PARSER will skip empty lines
182 and hard delimiters at the end of lines without emitting empty fields.
185 This setting affects parsing of DP_DELIMITED files only. */
187 data_parser_set_empty_line_has_field (struct data_parser *parser,
188 bool empty_line_has_field)
/* The flag is read by cut_field when it handles an empty or completely
   consumed line. */
190 parser->empty_line_has_field = empty_line_has_field;
194 /* If WARN_MISSING_FIELDS is true, configures PARSER to emit a warning
195 and cause an error condition when a missing field is encountered.
196 If WARN_MISSING_FIELDS is false, PARSER will silently fill such
197 fields with the system missing value.
199 This setting affects parsing of DP_DELIMITED files only. */
201 data_parser_set_warn_missing_fields (struct data_parser *parser,
202 bool warn_missing_fields)
/* The flag is read by parse_delimited_no_span, which warns only when this
   flag and settings_get_undefined () are both true. */
204 parser->warn_missing_fields = warn_missing_fields;
208 /* Sets the characters that may be used for quoting field
209 contents to QUOTES. If QUOTES is empty, quoting will be disabled.
212 The caller retains ownership of QUOTES.
214 This setting affects parsing of DP_DELIMITED files only. */
216 data_parser_set_quotes (struct data_parser *parser, struct substring quotes)
/* The previously stored quote set is released before the new one is
   stored with ss_alloc_substring. */
218 ss_dealloc (&parser->quotes);
219 ss_alloc_substring (&parser->quotes, quotes);
222 /* If ESCAPE is false (the default setting), a character used for
223 quoting cannot itself be embedded within a quoted field. If
224 ESCAPE is true, then a quote character can be embedded within
225 a quoted field by doubling it.
227 This setting affects parsing of DP_DELIMITED files only, and
228 only when at least one quote character has been set (with
229 data_parser_set_quotes). */
231 data_parser_set_quote_escape (struct data_parser *parser, bool escape)
/* The flag is read by cut_field right after it has scanned a quoted
   field up to the next quote character. */
233 parser->quote_escape = escape;
236 /* Sets PARSER's soft delimiters to DELIMITERS. Soft delimiters
237 separate fields, but consecutive soft delimiters do not yield
238 empty fields. (Ordinarily, only white space characters are
239 appropriate soft delimiters.)
241 The caller retains ownership of DELIMITERS.
243 This setting affects parsing of DP_DELIMITED files only. */
245 data_parser_set_soft_delimiters (struct data_parser *parser,
246 struct substring delimiters)
248 ss_dealloc (&parser->soft_seps);
249 ss_alloc_substring (&parser->soft_seps, delimiters);
/* any_sep is the concatenation of both separator sets, so it has to be
   rebuilt whenever either set changes. */
250 set_any_sep (parser);
253 /* Sets PARSER's hard delimiters to DELIMITERS. Hard delimiters
254 separate fields. A consecutive pair of hard delimiters yields an empty field.
257 The caller retains ownership of DELIMITERS.
259 This setting affects parsing of DP_DELIMITED files only. */
261 data_parser_set_hard_delimiters (struct data_parser *parser,
262 struct substring delimiters)
264 ss_dealloc (&parser->hard_seps);
265 ss_alloc_substring (&parser->hard_seps, delimiters);
/* any_sep is the concatenation of both separator sets, so it has to be
   rebuilt whenever either set changes. */
266 set_any_sep (parser);
269 /* Returns the number of records per case.
   The value is whatever data_parser_set_records stored last, raised as
   needed by data_parser_add_fixed_field when a field names a later
   record. */
271 data_parser_get_records (const struct data_parser *parser)
273 return parser->records_per_case;
276 /* Sets the number of records per case to RECORDS_PER_CASE.
   RECORDS_PER_CASE must be nonnegative and must not be less than the
   current number of records per case; both conditions are asserted, so
   the count can only stay the same or grow.
278 This setting affects parsing of DP_FIXED files only. */
280 data_parser_set_records (struct data_parser *parser, int records_per_case)
282 assert (records_per_case >= 0);
283 assert (records_per_case >= parser->records_per_case);
284 parser->records_per_case = records_per_case;
288 add_field (struct data_parser *p, const struct fmt_spec *format, int case_idx,
289 const char *name, int record, int first_column)
/* Appends one field description to the fields of P.  *FORMAT is copied by
   value and NAME is duplicated with xstrdup, so the caller keeps ownership
   of both.  CASE_IDX, RECORD and FIRST_COLUMN are stored as given; this
   helper does not validate them. */
/* Enlarge the array with x2nrealloc only when every allocated slot is in
   use. */
293 if (p->field_cnt == p->field_allocated)
294 p->fields = x2nrealloc (p->fields, &p->field_allocated, sizeof *p->fields);
/* Claim the next free slot and count it in the same step. */
295 field = &p->fields[p->field_cnt++];
296 field->format = *format;
297 field->case_idx = case_idx;
298 field->name = xstrdup (name);
299 field->record = record;
300 field->first_column = first_column;
303 /* Adds a delimited field to the fields parsed by PARSER, which
304 must be configured as a DP_DELIMITED parser. The field is
305 parsed as input format FORMAT. Its data will be stored into case
306 index CASE_IDX. Errors in input data will be reported
307 against variable NAME. */
309 data_parser_add_delimited_field (struct data_parser *parser,
310 const struct fmt_spec *format, int case_idx,
313 assert (parser->type == DP_DELIMITED);
/* Delimited fields have no fixed position, so record and first column are
   both stored as 0. */
314 add_field (parser, format, case_idx, name, 0, 0);
317 /* Adds a fixed field to the fields parsed by PARSER, which
318 must be configured as a DP_FIXED parser. The field is
319 parsed as input format FORMAT. Its data will be stored into case
320 index CASE_IDX. Errors in input data will be reported
321 against variable NAME. The field will be drawn from the
322 FORMAT->w columns in 1-based RECORD starting at 1-based column FIRST_COLUMN.
325 RECORD must be at least as great as that of any field already
326 added; that is, fields must be added in increasing order of
327 record number. If RECORD is greater than the current number
328 of records per case, the number of records per case is
329 increased as needed. */
331 data_parser_add_fixed_field (struct data_parser *parser,
332 const struct fmt_spec *format, int case_idx,
334 int record, int first_column)
336 assert (parser->type == DP_FIXED);
/* Only the most recently added field needs to be compared: earlier
   additions passed the same check, so the array is already ordered by
   record.  parse_fixed depends on this ordering. */
337 assert (parser->field_cnt == 0
338 || record >= parser->fields[parser->field_cnt - 1].record);
339 if (record > parser->records_per_case)
340 parser->records_per_case = record;
341 add_field (parser, format, case_idx, name, record, first_column);
344 /* Returns true if any fields have been added to PARSER, false otherwise. */
347 data_parser_any_fields (const struct data_parser *parser)
/* data_parser_parse asserts this condition before it reads any data. */
349 return parser->field_cnt > 0;
353 set_any_sep (struct data_parser *parser)
/* Rebuilds any_sep as the soft separators followed by the hard separators.
   It is called after either separator set is stored, so that cut_field can
   find the end of an unquoted field with a single span over any_sep. */
355 ds_assign_substring (&parser->any_sep, parser->soft_seps);
356 ds_put_substring (&parser->any_sep, parser->hard_seps);
359 static bool parse_delimited_span (const struct data_parser *,
360 struct dfm_reader *, struct ccase *);
361 static bool parse_delimited_no_span (const struct data_parser *,
362 struct dfm_reader *, struct ccase *);
363 static bool parse_fixed (const struct data_parser *,
364 struct dfm_reader *, struct ccase *);
366 /* Reads a case from READER into C, parsing it with PARSER. Returns
367 true if successful, false at end of file or on I/O error.
369 Case C must not be shared.
   PARSER must have at least one field (asserted).  Any records still to be
   skipped are consumed first, decrementing skip_records as each one is
   passed, so the skipping happens only once per parser.  The case is then
   read by parse_delimited_span or parse_delimited_no_span for a
   DP_DELIMITED parser, and by parse_fixed otherwise. */
371 data_parser_parse (struct data_parser *parser, struct dfm_reader *reader,
376 assert (!case_is_shared (c));
377 assert (data_parser_any_fields (parser));
379 /* Skip the requested number of records before reading the
381 for (; parser->skip_records > 0; parser->skip_records--)
383 if (dfm_eof (reader))
385 dfm_forward_record (reader);
389 if (parser->type == DP_DELIMITED)
392 retval = parse_delimited_span (parser, reader, c);
394 retval = parse_delimited_no_span (parser, reader, c);
397 retval = parse_fixed (parser, reader, c);
402 /* Extracts a delimited field from the current position in the
403 current record according to PARSER, reading data from READER.
405 *FIELD is set to the field content. The caller must not modify or
406 destroy this constant string.
   TMP is scratch space supplied by the caller: when a quoted field
   contains doubled quote characters and quote escaping is enabled, the
   unescaped text is assembled in TMP and *FIELD refers to the contents of
   TMP instead of the record.
408 Sets *FIRST_COLUMN to the 1-based column number of the start of
409 the extracted field, and *LAST_COLUMN to the end of the extracted field.
412 Returns true on success, false on failure. */
414 cut_field (const struct data_parser *parser, struct dfm_reader *reader,
415 int *first_column, int *last_column, struct string *tmp,
416 struct substring *field)
418 size_t length_before_separators;
/* LINE keeps the record as it was on entry; P is advanced as input is
   consumed, so ss_length (line) - ss_length (p) is the number of bytes
   consumed so far. */
419 struct substring line, p;
422 if (dfm_eof (reader))
/* Tabs are expanded only when no hard separators are configured.
   NOTE(review): presumably because a tab may itself be configured as a
   hard separator -- confirm. */
424 if (ss_is_empty (parser->hard_seps))
425 dfm_expand_tabs (reader);
426 line = p = dfm_get_record (reader);
428 /* Skip leading soft separators. */
429 ss_ltrim (&p, parser->soft_seps);
431 /* Handle empty or completely consumed lines. */
434 if (!parser->empty_line_has_field || dfm_columns_past_end (reader) > 0)
/* An empty field is reported as one column wide, and the reader is moved
   forward by one column. */
439 *first_column = dfm_column_start (reader);
440 *last_column = *first_column + 1;
441 dfm_forward_columns (reader, 1);
446 *first_column = dfm_column_start (reader);
/* The field is quoted when its first byte is one of the configured quote
   characters. */
447 quoted = ss_find_byte (parser->quotes, ss_first (p)) != SIZE_MAX;
/* Quoted field: the opening quote byte is consumed and the field runs up
   to the next occurrence of that same byte. */
451 int quote = ss_get_byte (&p);
452 if (!ss_get_until (&p, quote, field))
453 msg (DW, _("Quoted string extends beyond end of line."));
/* With quote escaping enabled, a quote byte right after the closing quote
   is taken as an embedded quote.  The pieces are joined in TMP with one
   quote byte between them. */
454 if (parser->quote_escape && ss_first (p) == quote)
456 ds_assign_substring (tmp, *field);
457 while (ss_match_byte (&p, quote))
460 ds_put_byte (tmp, quote);
461 if (!ss_get_until (&p, quote, &ss))
462 msg (DW, _("Quoted string extends beyond end of line."));
463 ds_put_substring (tmp, ss);
465 *field = ds_ss (tmp);
/* For a quoted field the end column is computed from the bytes consumed
   from the record, not from the length of *FIELD. */
467 *last_column = *first_column + (ss_length (line) - ss_length (p));
/* Unquoted field: take every byte up to the first soft or hard
   separator. */
472 ss_get_bytes (&p, ss_cspan (p, ds_ss (&parser->any_sep)), field);
473 *last_column = *first_column + ss_length (*field);
476 /* Skip trailing soft separator and a single hard separator if present. */
477 length_before_separators = ss_length (p);
478 ss_ltrim (&p, parser->soft_seps);
480 && ss_find_byte (parser->hard_seps, ss_first (p)) != SIZE_MAX)
483 ss_ltrim (&p, parser->soft_seps);
486 dfm_forward_columns (reader, 1);
/* If no separator bytes were skipped after a quoted field, the closing
   quote is directly followed by more data. */
487 else if (quoted && length_before_separators == ss_length (p))
488 msg (DW, _("Missing delimiter following quoted string."));
/* Advance the reader past everything consumed from this record. */
489 dfm_forward_columns (reader, ss_length (line) - ss_length (p));
495 parse_error (const struct dfm_reader *reader, const struct field *field,
496 int first_column, int last_column, char *error)
/* Builds a data warning saying that the input for FIELD is not valid in
   its input format.  ERROR is the detail text appended to the message.
   The location recorded is the file name and current line number of
   READER, with columns FIRST_COLUMN through LAST_COLUMN.  Both the
   location and the message are heap-allocated, and the file name is
   duplicated. */
498 int line_number = dfm_get_line_number (reader);
499 struct msg_location *location = xmalloc (sizeof *location);
500 *location = (struct msg_location) {
501 .file_name = xstrdup (dfm_get_file_name (reader)),
502 .first_line = line_number,
/* NOTE(review): last_line is one past first_line, which looks like an
   exclusive end line -- confirm against the definition of struct
   msg_location. */
503 .last_line = line_number + 1,
504 .first_column = first_column,
505 .last_column = last_column,
507 struct msg *m = xmalloc (sizeof *m);
509 .category = MSG_C_DATA,
510 .severity = MSG_S_WARNING,
511 .location = location,
512 .text = xasprintf (_("Data for variable %s is not valid as format %s: %s"),
513 field->name, fmt_name (field->format.type), error),
520 /* Reads a case from READER into C, parsing it according to
521 fixed-format syntax rules in PARSER.
522 Returns true if successful, false at end of file or on I/O error.
   One record is read for each of the records_per_case rows of a case.  If
   the input ends after the first record of a case, a warning reports how
   many records of the partial case were read. */
524 parse_fixed (const struct data_parser *parser, struct dfm_reader *reader,
527 const char *input_encoding = dfm_reader_get_encoding (reader);
528 const char *output_encoding = dict_get_encoding (parser->dict);
532 if (dfm_eof (reader))
536 for (row = 1; row <= parser->records_per_case; row++)
538 struct substring line;
540 if (dfm_eof (reader))
542 msg (DW, _("Partial case of %d of %d records discarded."),
543 row - 1, parser->records_per_case);
546 dfm_expand_tabs (reader);
547 line = dfm_get_record (reader);
/* F is not reset for each row: the loop continues from the field where the
   previous row stopped.  This depends on the fields being ordered by
   record number, which data_parser_add_fixed_field asserts. */
549 for (; f < &parser->fields[parser->field_cnt] && f->record == row; f++)
/* first_column is 1-based, so 1 is subtracted to get the offset into the
   record. */
551 struct substring s = ss_substr (line, f->first_column - 1,
553 union value *value = case_data_rw_idx (c, f->case_idx);
554 char *error = data_in (s, input_encoding, f->format.type,
555 settings_get_fmt_settings (),
556 value, fmt_var_width (&f->format),
/* Implied decimals are applied to the value already stored by data_in;
   otherwise the conversion problem is reported for the columns the field
   occupies. */
560 data_in_imply_decimals (s, input_encoding, f->format.type,
561 f->format.d, settings_get_fmt_settings (),
564 parse_error (reader, f, f->first_column,
565 f->first_column + f->format.w, error);
568 dfm_forward_record (reader);
574 /* Reads a case from READER into C, parsing it according to
575 free-format syntax rules in PARSER.
576 Returns true if successful, false at end of file or on I/O error.
   Fields are taken one after another with cut_field.  When the current
   record has no further field, the next record is read, so one case may
   be spread over several records. */
578 parse_delimited_span (const struct data_parser *parser,
579 struct dfm_reader *reader, struct ccase *c)
581 const char *output_encoding = dict_get_encoding (parser->dict);
/* Scratch string handed to cut_field for unescaping quoted fields. */
582 struct string tmp = DS_EMPTY_INITIALIZER;
585 for (f = parser->fields; f < &parser->fields[parser->field_cnt]; f++)
588 int first_column, last_column;
591 /* Cut out a field and read in a new record if necessary. */
592 while (!cut_field (parser, reader,
593 &first_column, &last_column, &tmp, &s))
595 if (!dfm_eof (reader))
596 dfm_forward_record (reader);
597 if (dfm_eof (reader))
/* Running out of input before the first field is an ordinary end of
   file; the warning is issued only when at least one field of the case
   was already read. */
599 if (f > parser->fields)
600 msg (DW, _("Partial case discarded. The first variable "
601 "missing was %s."), f->name);
607 const char *input_encoding = dfm_reader_get_encoding (reader);
608 error = data_in (s, input_encoding, f->format.type,
609 settings_get_fmt_settings (),
610 case_data_rw_idx (c, f->case_idx),
611 fmt_var_width (&f->format), output_encoding);
613 parse_error (reader, f, first_column, last_column, error);
619 /* Reads a case from READER into C, parsing it according to
620 delimited syntax rules with one case per record in PARSER.
621 Returns true if successful, false at end of file or on I/O error.
   When the record runs out of fields, the field at which parsing stopped
   is set to missing.  When data is left over after the last field, a
   warning is issued. */
623 parse_delimited_no_span (const struct data_parser *parser,
624 struct dfm_reader *reader, struct ccase *c)
626 const char *output_encoding = dict_get_encoding (parser->dict);
/* Scratch string handed to cut_field for unescaping quoted fields. */
627 struct string tmp = DS_EMPTY_INITIALIZER;
629 struct field *f, *end;
631 if (dfm_eof (reader))
634 end = &parser->fields[parser->field_cnt];
635 for (f = parser->fields; f < end; f++)
637 int first_column, last_column;
640 if (!cut_field (parser, reader, &first_column, &last_column, &tmp, &s))
/* The warning is given only when more than the last field is missing and
   when both settings_get_undefined () and warn_missing_fields are true. */
642 if (f < end - 1 && settings_get_undefined () && parser->warn_missing_fields)
643 msg (DW, _("Missing value(s) for all variables from %s onward. "
644 "These will be filled with the system-missing value "
645 "or blanks, as appropriate."),
648 value_set_missing (case_data_rw_idx (c, f->case_idx),
649 fmt_var_width (&f->format));
653 const char *input_encoding = dfm_reader_get_encoding (reader);
654 error = data_in (s, input_encoding, f->format.type,
655 settings_get_fmt_settings (),
656 case_data_rw_idx (c, f->case_idx),
657 fmt_var_width (&f->format), output_encoding);
659 parse_error (reader, f, first_column, last_column, error);
/* Whatever is left of the record after trimming soft separators is data
   that belongs to no field. */
662 s = dfm_get_record (reader);
663 ss_ltrim (&s, parser->soft_seps);
664 if (!ss_is_empty (s))
665 msg (DW, _("Record ends in data not part of any field."));
668 dfm_forward_record (reader);
673 /* Displays a table giving information on fixed-format variable
674 parsing on DATA LIST.
   The table has one row per field of PARSER and three columns: record
   number, column range, and input format.  The title names the file
   handle FH and the number of records per case. */
676 dump_fixed_table (const struct data_parser *parser,
677 const struct file_handle *fh)
679 /* XXX This should not be preformatted. */
680 char *title = xasprintf (ngettext ("Reading %d record from %s.",
681 "Reading %d records from %s.",
682 parser->records_per_case),
683 parser->records_per_case, fh_get_name (fh));
684 struct pivot_table *table = pivot_table_create__ (
685 pivot_value_new_user_text (title, -1), "Fixed Data Records");
/* The order of the categories declared here fixes the column indexes used
   with pivot_table_put2 below: 0 is Record, 1 is Columns, 2 is Format. */
688 pivot_dimension_create (
689 table, PIVOT_AXIS_COLUMN, N_("Attributes"),
690 N_("Record"), N_("Columns"), N_("Format"));
692 struct pivot_dimension *variables = pivot_dimension_create (
693 table, PIVOT_AXIS_ROW, N_("Variable"));
694 variables->root->show_label = true;
695 for (size_t i = 0; i < parser->field_cnt; i++)
697 struct field *f = &parser->fields[i];
699 /* XXX It would be better to have the actual variable here. */
700 int variable_idx = pivot_category_create_leaf (
701 variables->root, pivot_value_new_user_text (f->name, -1));
703 pivot_table_put2 (table, 0, variable_idx,
704 pivot_value_new_integer (f->record));
/* The displayed range is inclusive, so the last column is the first
   column plus the field width minus one. */
706 int first_column = f->first_column;
707 int last_column = f->first_column + f->format.w - 1;
708 char *columns = xasprintf ("%d-%d", first_column, last_column);
709 pivot_table_put2 (table, 1, variable_idx,
710 pivot_value_new_user_text (columns, -1));
/* STR is a stack buffer that fmt_to_string fills with the format text. */
713 char str[FMT_STRING_LEN_MAX + 1];
714 pivot_table_put2 (table, 2, variable_idx,
715 pivot_value_new_user_text (
716 fmt_to_string (&f->format, str), -1));
720 pivot_table_submit (table);
723 /* Displays a table giving information on free-format variable parsing on DATA LIST. */
726 dump_delimited_table (const struct data_parser *parser,
727 const struct file_handle *fh)
/* The table has one row per field of PARSER and a single column that
   shows the input format of the field. */
729 struct pivot_table *table = pivot_table_create__ (
730 pivot_value_new_text_format (N_("Reading free-form data from %s."),
732 "Free-Form Data Records");
/* Format is the only attribute, so it is column index 0 in the call to
   pivot_table_put2 below. */
734 pivot_dimension_create (
735 table, PIVOT_AXIS_COLUMN, N_("Attributes"), N_("Format"));
737 struct pivot_dimension *variables = pivot_dimension_create (
738 table, PIVOT_AXIS_ROW, N_("Variable"));
739 variables->root->show_label = true;
740 for (size_t i = 0; i < parser->field_cnt; i++)
742 struct field *f = &parser->fields[i];
744 /* XXX It would be better to have the actual variable here. */
745 int variable_idx = pivot_category_create_leaf (
746 variables->root, pivot_value_new_user_text (f->name, -1));
/* STR is a stack buffer that fmt_to_string fills with the format text. */
748 char str[FMT_STRING_LEN_MAX + 1];
749 pivot_table_put2 (table, 0, variable_idx,
750 pivot_value_new_user_text (
751 fmt_to_string (&f->format, str), -1));
754 pivot_table_submit (table);
757 /* Displays a table giving information on how PARSER will read data. */
760 data_parser_output_description (struct data_parser *parser,
761 const struct file_handle *fh)
/* A DP_FIXED parser is described by dump_fixed_table; any other type is
   described by dump_delimited_table.  FH is passed to both for the table
   title. */
763 if (parser->type == DP_FIXED)
764 dump_fixed_table (parser, fh);
766 dump_delimited_table (parser, fh);
769 /* Data parser input program. */
/* Auxiliary data of the casereader created by
   data_parser_make_active_file.  All three members are released by
   data_parser_casereader_destroy. */
770 struct data_parser_casereader
772 struct data_parser *parser; /* Parser. */
773 struct dfm_reader *reader; /* Data file reader. */
774 struct caseproto *proto; /* Format of cases (a reference is held). */
777 static const struct casereader_class data_parser_casereader_class;
779 /* Replaces DS's active dataset by an input program that reads data
780 from READER according to the rules in PARSER, using DICT as
781 the underlying dictionary. Ownership of PARSER and READER is
782 transferred to the input program, and ownership of DICT is
783 transferred to the dataset.
   FUNC may wrap the new casereader in another one: it receives the
   casereader, DICT and UD, and the casereader it returns becomes the
   source of DS. */
785 data_parser_make_active_file (struct data_parser *parser, struct dataset *ds,
786 struct dfm_reader *reader,
787 struct dictionary *dict,
788 struct casereader* (*func)(struct casereader *,
789 const struct dictionary *,
793 struct data_parser_casereader *r;
794 struct casereader *casereader0;
795 struct casereader *casereader1;
797 r = xmalloc (sizeof *r);
/* The casereader state keeps its own reference to the case prototype of
   DICT; data_parser_casereader_destroy drops it. */
800 r->proto = caseproto_ref (dict_get_proto (dict));
801 casereader0 = casereader_create_sequential (NULL, r->proto,
803 &data_parser_casereader_class, r);
/* casereader1 is either the result of FUNC or the plain casereader.
   NOTE(review): presumably FUNC is applied only when it is non-null --
   confirm. */
806 casereader1 = func (casereader0, dict, ud);
808 casereader1 = casereader0;
810 dataset_set_dict (ds, dict);
811 dataset_set_source (ds, casereader1);
/* Read callback of data_parser_casereader_class.  R_ is the struct
   data_parser_casereader passed when the casereader was created.  A new
   case is created from the stored prototype and filled in by
   data_parser_parse. */
815 static struct ccase *
816 data_parser_casereader_read (struct casereader *reader UNUSED, void *r_)
818 struct data_parser_casereader *r = r_;
819 struct ccase *c = case_create (r->proto);
820 if (data_parser_parse (r->parser, r->reader, c))
830 data_parser_casereader_destroy (struct casereader *reader, void *r_)
/* Destroy callback of data_parser_casereader_class.  An error recorded on
   the data file reader is passed on to the casereader before the reader
   is closed.  The prototype reference and the parser are then
   released. */
832 struct data_parser_casereader *r = r_;
833 if (dfm_reader_error (r->reader))
834 casereader_force_error (reader);
835 dfm_close_reader (r->reader);
836 caseproto_unref (r->proto);
837 data_parser_destroy (r->parser);
/* Callback table for the casereader created by
   data_parser_make_active_file: the read callback first, then the destroy
   callback. */
841 static const struct casereader_class data_parser_casereader_class =
843 data_parser_casereader_read,
844 data_parser_casereader_destroy,