1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007, 2009, 2010, 2011, 2012, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/data-io/data-parser.h"
24 #include "data/casereader-provider.h"
25 #include "data/data-in.h"
26 #include "data/dataset.h"
27 #include "data/dictionary.h"
28 #include "data/format.h"
29 #include "data/file-handle-def.h"
30 #include "data/settings.h"
31 #include "language/data-io/data-reader.h"
32 #include "libpspp/intern.h"
33 #include "libpspp/message.h"
34 #include "libpspp/str.h"
35 #include "output/pivot-table.h"
37 #include "gl/xalloc.h"
40 #define N_(msgid) msgid
41 #define _(msgid) gettext (msgid)
43 /* Data parser for textual data like that read by DATA LIST. */
46 enum data_parser_type type; /* Type of data to parse. */
47 int skip_records; /* Records still to skip before first real data
(counted down by data_parser_parse). */
49 struct field *fields; /* Fields to parse. */
50 size_t n_fields; /* Number of fields. */
51 size_t field_allocated; /* Number of fields that space is allocated for. */
53 /* DP_DELIMITED parsers only. */
54 bool span; /* May cases span multiple records? */
55 bool empty_line_has_field; /* Does an empty line have an (empty) field? */
56 bool warn_missing_fields; /* Should missing fields be considered errors? */
57 struct substring quotes; /* Characters that can quote separators. */
58 bool quote_escape; /* Doubled quote acts as escape? */
59 struct substring soft_seps; /* Two soft separators act like just one. */
60 struct substring hard_seps; /* Two hard separators yield empty fields. */
61 struct string any_sep; /* Concatenation of soft_seps and hard_seps
(kept up to date by set_any_sep). */
63 /* DP_FIXED parsers only. */
64 int records_per_case; /* Number of records in each case. */
67 /* How to parse one variable. */
70 struct fmt_spec format; /* Input format of this field. */
71 int case_idx; /* First value in case. */
72 char *name; /* Var name for error messages and tables
(owned copy, freed by data_parser_destroy). */
75 int record; /* Record number (1-based); 0 for delimited fields. */
76 int first_column; /* First column in record (1-based); 0 for
delimited fields. */
/* Rebuilds PARSER->any_sep from soft_seps and hard_seps.  Declared here
   because data_parser_create and the delimiter setters call it before
   its definition. */
79 static void set_any_sep (struct data_parser *parser);
81 /* Creates and returns a new data parser.
The new parser is of type DP_FIXED, skips no records, and has no
field storage allocated. */
83 data_parser_create (void)
85 struct data_parser *parser = xmalloc (sizeof *parser);
87 parser->type = DP_FIXED;
88 parser->skip_records = 0;
90 parser->fields = NULL;
92 parser->field_allocated = 0;
95 parser->empty_line_has_field = false;
96 parser->warn_missing_fields = true;
/* Delimited-mode defaults: the double and single quote characters as
   quotes, CC_SPACES as soft separators, and comma as the only hard
   separator.  Each set is stored as a separately allocated copy. */
97 ss_alloc_substring (&parser->quotes, ss_cstr ("\"'"));
98 parser->quote_escape = false;
99 ss_alloc_substring (&parser->soft_seps, ss_cstr (CC_SPACES));
100 ss_alloc_substring (&parser->hard_seps, ss_cstr (","));
/* any_sep has to be an initialized string before set_any_sep assigns
   to it. */
101 ds_init_empty (&parser->any_sep);
102 set_any_sep (parser);
104 parser->records_per_case = 0;
109 /* Destroys PARSER, releasing the field names, the field array, the
quote and separator sets, and the combined separator string. */
111 data_parser_destroy (struct data_parser *parser)
/* Every field name is a private copy made with xstrdup in add_field. */
117 for (i = 0; i < parser->n_fields; i++)
118 free (parser->fields[i].name);
119 free (parser->fields);
120 ss_dealloc (&parser->quotes);
121 ss_dealloc (&parser->soft_seps);
122 ss_dealloc (&parser->hard_seps);
123 ds_destroy (&parser->any_sep);
128 /* Returns the type of PARSER (either DP_DELIMITED or DP_FIXED).
A parser is DP_FIXED when created by data_parser_create. */
129 enum data_parser_type
130 data_parser_get_type (const struct data_parser *parser)
135 /* Sets the type of PARSER to TYPE (either DP_DELIMITED or DP_FIXED). */
138 data_parser_set_type (struct data_parser *parser, enum data_parser_type type)
// The type may only be changed while PARSER has no fields yet, and TYPE
// must be one of the two supported parser types.
140 assert (parser->n_fields == 0);
141 assert (type == DP_FIXED || type == DP_DELIMITED);
145 /* Configures PARSER to skip the specified number of
146 INITIAL_RECORDS_TO_SKIP before parsing any data. By default,
147 no records are skipped. */
149 data_parser_set_skip (struct data_parser *parser, int initial_records_to_skip)
/* A negative count is a caller error. */
151 assert (initial_records_to_skip >= 0);
/* data_parser_parse counts this value down as it skips records, so the
   records are skipped only once. */
152 parser->skip_records = initial_records_to_skip;
155 /* Returns true if PARSER is configured to allow cases to span multiple records. */
158 data_parser_get_span (const struct data_parser *parser)
163 /* If MAY_CASES_SPAN_RECORDS is true, configures PARSER to allow
164 a single case to span multiple records and multiple cases to
165 occupy a single record. If MAY_CASES_SPAN_RECORDS is false,
166 configures PARSER to require each record to contain exactly one case.
169 This setting affects parsing of DP_DELIMITED files only. */
171 data_parser_set_span (struct data_parser *parser, bool may_cases_span_records)
/* Only stored here.  For delimited data, data_parser_parse calls either
   parse_delimited_span or parse_delimited_no_span; presumably this flag
   selects between them (the test itself is not visible in this view). */
173 parser->span = may_cases_span_records;
176 /* If EMPTY_LINE_HAS_FIELD is true, configures PARSER to parse an
177 empty line as an empty field and to treat a hard delimiter
178 followed by end-of-line as an empty field. If
179 EMPTY_LINE_HAS_FIELD is false, PARSER will skip empty lines
180 and hard delimiters at the end of lines without emitting empty fields.
183 This setting affects parsing of DP_DELIMITED files only. */
185 data_parser_set_empty_line_has_field (struct data_parser *parser,
186 bool empty_line_has_field)
/* cut_field tests this flag when it handles empty or completely
   consumed lines. */
188 parser->empty_line_has_field = empty_line_has_field;
192 /* If WARN_MISSING_FIELDS is true, configures PARSER to emit a warning
193 and cause an error condition when a missing field is encountered.
194 If WARN_MISSING_FIELDS is false, PARSER will silently fill such
195 fields with the system missing value.
197 This setting affects parsing of DP_DELIMITED files only. */
199 data_parser_set_warn_missing_fields (struct data_parser *parser,
200 bool warn_missing_fields)
/* parse_delimited_no_span tests this flag before warning that a record
   ran out of fields. */
202 parser->warn_missing_fields = warn_missing_fields;
206 /* Sets the characters that may be used for quoting field
207 contents to QUOTES. If QUOTES is empty, quoting will be disabled.
210 The caller retains ownership of QUOTES.
212 This setting affects parsing of DP_DELIMITED files only. */
214 data_parser_set_quotes (struct data_parser *parser, struct substring quotes)
/* Release the previous set, then store a newly allocated copy so that
   the caller keeps ownership of QUOTES. */
216 ss_dealloc (&parser->quotes);
217 ss_alloc_substring (&parser->quotes, quotes);
220 /* If ESCAPE is false (the default setting), a character used for
221 quoting cannot itself be embedded within a quoted field. If
222 ESCAPE is true, then a quote character can be embedded within
223 a quoted field by doubling it.
225 This setting affects parsing of DP_DELIMITED files only, and
226 only when at least one quote character has been set (with
227 data_parser_set_quotes). */
229 data_parser_set_quote_escape (struct data_parser *parser, bool escape)
/* cut_field tests this flag after reading a quoted field, to decide
   whether a quote that immediately follows continues the field. */
231 parser->quote_escape = escape;
234 /* Sets PARSER's soft delimiters to DELIMITERS. Soft delimiters
235 separate fields, but consecutive soft delimiters do not yield
236 empty fields. (Ordinarily, only white space characters are
237 appropriate soft delimiters.)
239 The caller retains ownership of DELIMITERS.
241 This setting affects parsing of DP_DELIMITED files only. */
243 data_parser_set_soft_delimiters (struct data_parser *parser,
244 struct substring delimiters)
/* Replace the stored set with a newly allocated copy of DELIMITERS. */
246 ss_dealloc (&parser->soft_seps);
247 ss_alloc_substring (&parser->soft_seps, delimiters);
/* any_sep is built from both separator sets, so it must be rebuilt. */
248 set_any_sep (parser);
251 /* Sets PARSER's hard delimiters to DELIMITERS. Hard delimiters
252 separate fields. A consecutive pair of hard delimiters yields an empty field.
255 The caller retains ownership of DELIMITERS.
257 This setting affects parsing of DP_DELIMITED files only. */
259 data_parser_set_hard_delimiters (struct data_parser *parser,
260 struct substring delimiters)
/* Replace the stored set with a newly allocated copy of DELIMITERS. */
262 ss_dealloc (&parser->hard_seps);
263 ss_alloc_substring (&parser->hard_seps, delimiters);
/* any_sep is built from both separator sets, so it must be rebuilt. */
264 set_any_sep (parser);
267 /* Returns the number of records per case. */
269 data_parser_get_records (const struct data_parser *parser)
/* The count is set by data_parser_set_records and is also raised by
   data_parser_add_fixed_field when a field names a later record. */
271 return parser->records_per_case;
274 /* Sets the number of records per case to RECORDS_PER_CASE.
276 This setting affects parsing of DP_FIXED files only. */
278 data_parser_set_records (struct data_parser *parser, int records_per_case)
280 assert (records_per_case >= 0);
/* The count may stay the same or grow, but never shrink. */
281 assert (records_per_case >= parser->records_per_case);
282 parser->records_per_case = records_per_case;
/* Appends a field to P with the given FORMAT, CASE_IDX, NAME, RECORD,
   and FIRST_COLUMN.  *FORMAT is copied by value and NAME is duplicated,
   so the caller keeps ownership of both. */
286 add_field (struct data_parser *p, const struct fmt_spec *format, int case_idx,
287 const char *name, int record, int first_column)
/* Grow the field array when every allocated slot is in use. */
291 if (p->n_fields == p->field_allocated)
292 p->fields = x2nrealloc (p->fields, &p->field_allocated, sizeof *p->fields);
293 field = &p->fields[p->n_fields++];
294 field->format = *format;
295 field->case_idx = case_idx;
296 field->name = xstrdup (name);
297 field->record = record;
298 field->first_column = first_column;
301 /* Adds a delimited field to the fields parsed by PARSER, which
302 must be configured as a DP_DELIMITED parser. The field is
303 parsed as input format FORMAT. Its data will be stored into case
304 index CASE_IDX. Errors in input data will be reported
305 against variable NAME. */
307 data_parser_add_delimited_field (struct data_parser *parser,
308 const struct fmt_spec *format, int case_idx,
311 assert (parser->type == DP_DELIMITED);
/* A delimited field is stored with record and first column both 0. */
312 add_field (parser, format, case_idx, name, 0, 0);
315 /* Adds a fixed field to the fields parsed by PARSER, which
316 must be configured as a DP_FIXED parser. The field is
317 parsed as input format FORMAT. Its data will be stored into case
318 index CASE_IDX. Errors in input data will be reported
319 against variable NAME. The field will be drawn from the
320 FORMAT->w columns in 1-based RECORD starting at 1-based column FIRST_COLUMN.
323 RECORD must be at least as great as that of any field already
324 added; that is, fields must be added in nondecreasing order of
325 record number. If RECORD is greater than the current number
326 of records per case, the number of records per case is
327 increased as needed. */
329 data_parser_add_fixed_field (struct data_parser *parser,
330 const struct fmt_spec *format, int case_idx,
332 int record, int first_column)
334 assert (parser->type == DP_FIXED);
/* RECORD may not be lower than the record of the most recently added
   field.  parse_fixed depends on this ordering: it walks the field array
   once while it advances through the records of a case. */
335 assert (parser->n_fields == 0
336 || record >= parser->fields[parser->n_fields - 1].record);
/* Raise the number of records per case so that it includes RECORD. */
337 if (record > parser->records_per_case)
338 parser->records_per_case = record;
339 add_field (parser, format, case_idx, name, record, first_column);
342 /* Returns true if any fields have been added to PARSER, false otherwise. */
345 data_parser_any_fields (const struct data_parser *parser)
// n_fields is incremented by add_field each time a field is appended.
347 return parser->n_fields > 0;
// Rebuilds PARSER->any_sep as the soft separators followed by the hard
// separators.  cut_field uses any_sep to find the end of an unquoted field.
351 set_any_sep (struct data_parser *parser)
353 ds_assign_substring (&parser->any_sep, parser->soft_seps);
354 ds_put_substring (&parser->any_sep, parser->hard_seps);
// Forward declarations of the case parsers that data_parser_parse calls;
// their definitions follow cut_field and parse_error below.
357 static bool parse_delimited_span (const struct data_parser *,
359 struct dictionary *, struct ccase *);
360 static bool parse_delimited_no_span (const struct data_parser *,
362 struct dictionary *, struct ccase *);
363 static bool parse_fixed (const struct data_parser *, struct dfm_reader *,
364 struct dictionary *, struct ccase *);
366 /* Reads a case from DFM into C, which matches dictionary DICT, parsing it with
367 PARSER. Returns true if successful, false at end of file or on I/O error.
369 Case C must not be shared. */
371 data_parser_parse (struct data_parser *parser, struct dfm_reader *reader,
372 struct dictionary *dict, struct ccase *c)
/* Preconditions: C must not be shared, because its values are written in
   place, and PARSER must have at least one field. */
376 assert (!case_is_shared (c));
377 assert (data_parser_any_fields (parser));
379 /* Skip the requested number of records before reading the
381 for (; parser->skip_records > 0; parser->skip_records--)
383 if (dfm_eof (reader))
385 dfm_forward_record (reader);
389 if (parser->type == DP_DELIMITED)
392 retval = parse_delimited_span (parser, reader, dict, c);
394 retval = parse_delimited_no_span (parser, reader, dict, c);
397 retval = parse_fixed (parser, reader, dict, c);
402 /* Extracts a delimited field from the current position in the
403 current record according to PARSER, reading data from READER.
405 *FIELD is set to the field content. The caller must not modify or
406 destroy this constant string.
408 Sets *FIRST_COLUMN to the 1-based column number of the start of
409 the extracted field, and *LAST_COLUMN to the end of the extracted field.
412 Returns true on success, false on failure. */
414 cut_field (const struct data_parser *parser, struct dfm_reader *reader,
415 int *first_column, int *last_column, struct string *tmp,
416 struct substring *field)
418 size_t length_before_separators;
419 struct substring line, p;
422 if (dfm_eof (reader))
/* Tabs are expanded only when no hard separators are configured.
   NOTE(review): presumably so that a tab can serve as a hard separator;
   confirm against the callers that set hard_seps. */
424 if (ss_is_empty (parser->hard_seps))
425 dfm_expand_tabs (reader);
/* LINE keeps the start of the remaining record text, and P is advanced
   as text is consumed, so their length difference is the number of bytes
   consumed. */
426 line = p = dfm_get_record (reader);
428 /* Skip leading soft separators. */
429 ss_ltrim (&p, parser->soft_seps);
431 /* Handle empty or completely consumed lines. */
434 if (!parser->empty_line_has_field || dfm_columns_past_end (reader) > 0)
439 *first_column = dfm_column_start (reader);
440 *last_column = *first_column + 1;
441 dfm_forward_columns (reader, 1);
446 *first_column = dfm_column_start (reader);
/* The field is quoted when its first byte is one of the configured quote
   characters. */
447 quoted = ss_find_byte (parser->quotes, ss_first (p)) != SIZE_MAX;
451 int quote = ss_get_byte (&p);
452 if (!ss_get_until (&p, quote, field))
453 msg (DW, _("Quoted string extends beyond end of line."));
/* With quote_escape set, a quote character directly after the closing
   quote is a doubled quote.  The field is then rebuilt in TMP, with one
   literal quote character added for each doubled quote. */
454 if (parser->quote_escape && ss_first (p) == quote)
456 ds_assign_substring (tmp, *field);
457 while (ss_match_byte (&p, quote))
460 ds_put_byte (tmp, quote);
461 if (!ss_get_until (&p, quote, &ss))
462 msg (DW, _("Quoted string extends beyond end of line."));
463 ds_put_substring (tmp, ss);
/* From here on *FIELD refers to the contents of TMP. */
465 *field = ds_ss (tmp);
467 *last_column = *first_column + (ss_length (line) - ss_length (p));
/* Unquoted field: take every byte up to the first soft or hard
   separator. */
472 ss_get_bytes (&p, ss_cspan (p, ds_ss (&parser->any_sep)), field);
473 *last_column = *first_column + ss_length (*field);
476 /* Skip trailing soft separator and a single hard separator if present. */
477 length_before_separators = ss_length (p);
478 ss_ltrim (&p, parser->soft_seps);
480 && ss_find_byte (parser->hard_seps, ss_first (p)) != SIZE_MAX)
483 ss_ltrim (&p, parser->soft_seps);
486 dfm_forward_columns (reader, 1);
/* If nothing was skipped after a quoted field, more text follows the
   closing quote with no separator in between. */
487 else if (quoted && length_before_separators == ss_length (p))
488 msg (DW, _("Missing delimiter following quoted string."));
/* Advance the reader past everything consumed from the record. */
489 dfm_forward_columns (reader, ss_length (line) - ss_length (p));
/* Builds a data warning saying that the data for FIELD is not valid in
   its format, with ERROR as the explanation.  The location is the current
   file and line of READER, from FIRST_COLUMN to the column just before
   LAST_COLUMN.
   NOTE(review): emitting the message and any release of ERROR are not
   visible in this view of the function. */
495 parse_error (const struct dfm_reader *reader, const struct field *field,
496 int first_column, int last_column, char *error)
498 int line_number = dfm_get_line_number (reader);
499 struct msg_location *location = xmalloc (sizeof *location);
500 *location = (struct msg_location) {
501 .file_name = intern_new (dfm_get_file_name (reader)),
502 .start = { .line = line_number, .column = first_column },
/* Callers pass LAST_COLUMN as one past the final column of the field
   (first column plus width or length), hence the subtraction. */
503 .end = { .line = line_number, .column = last_column - 1 },
505 struct msg *m = xmalloc (sizeof *m);
507 .category = MSG_C_DATA,
508 .severity = MSG_S_WARNING,
509 .location = location,
510 .text = xasprintf (_("Data for variable %s is not valid as format %s: %s"),
511 field->name, fmt_name (field->format.type), error),
518 /* Reads a case from READER into C, which matches DICT, parsing it according to
519 fixed-format syntax rules in PARSER. Returns true if successful, false at
520 end of file or on I/O error. */
522 parse_fixed (const struct data_parser *parser, struct dfm_reader *reader,
523 struct dictionary *dict, struct ccase *c)
/* Field text is converted from the encoding of the data file to the
   encoding of the dictionary. */
525 const char *input_encoding = dfm_reader_get_encoding (reader);
526 const char *output_encoding = dict_get_encoding (dict);
530 if (dfm_eof (reader))
/* Consume one record for each of the records that make up a case. */
534 for (row = 1; row <= parser->records_per_case; row++)
536 struct substring line;
/* End of input at this point means that only ROW - 1 of the records of
   the case were available. */
538 if (dfm_eof (reader))
540 msg (DW, _("Partial case of %d of %d records discarded."),
541 row - 1, parser->records_per_case);
544 dfm_expand_tabs (reader);
545 line = dfm_get_record (reader);
/* Fields are stored in nondecreasing record order (asserted in
   data_parser_add_fixed_field), so F simply keeps advancing while its
   record number matches ROW. */
547 for (; f < &parser->fields[parser->n_fields] && f->record == row; f++)
/* The 1-based first column becomes the start offset of the substring. */
549 struct substring s = ss_substr (line, f->first_column - 1,
551 union value *value = case_data_rw_idx (c, f->case_idx);
552 char *error = data_in (s, input_encoding, f->format.type,
553 settings_get_fmt_settings (),
554 value, fmt_var_width (&f->format),
558 data_in_imply_decimals (s, input_encoding, f->format.type,
559 f->format.d, settings_get_fmt_settings (),
/* The end column passed here is one past the last column of the field. */
562 parse_error (reader, f, f->first_column,
563 f->first_column + f->format.w, error);
566 dfm_forward_record (reader);
572 /* Reads a case from READER into C, which matches dictionary DICT, parsing it
573 according to free-format syntax rules in PARSER. Returns true if
574 successful, false at end of file or on I/O error. */
576 parse_delimited_span (const struct data_parser *parser,
577 struct dfm_reader *reader,
578 struct dictionary *dict, struct ccase *c)
580 const char *output_encoding = dict_get_encoding (dict);
/* Scratch string in which cut_field rebuilds fields that contain doubled
   quotes. */
581 struct string tmp = DS_EMPTY_INITIALIZER;
584 for (f = parser->fields; f < &parser->fields[parser->n_fields]; f++)
587 int first_column, last_column;
590 /* Cut out a field and read in a new record if necessary. */
591 while (!cut_field (parser, reader,
592 &first_column, &last_column, &tmp, &s))
594 if (!dfm_eof (reader))
595 dfm_forward_record (reader);
596 if (dfm_eof (reader))
/* The partial-case warning is issued only when at least one field of
   this case was already read, that is, when F is past the first field. */
598 if (f > parser->fields)
599 msg (DW, _("Partial case discarded. The first variable "
600 "missing was %s."), f->name);
606 const char *input_encoding = dfm_reader_get_encoding (reader);
607 error = data_in (s, input_encoding, f->format.type,
608 settings_get_fmt_settings (),
609 case_data_rw_idx (c, f->case_idx),
610 fmt_var_width (&f->format), output_encoding);
612 parse_error (reader, f, first_column, last_column, error);
618 /* Reads a case from READER into C, which matches dictionary DICT, parsing it
619 according to delimited syntax rules with one case per record in PARSER.
620 Returns true if successful, false at end of file or on I/O error. */
622 parse_delimited_no_span (const struct data_parser *parser,
623 struct dfm_reader *reader,
624 struct dictionary *dict, struct ccase *c)
626 const char *output_encoding = dict_get_encoding (dict);
/* Scratch string in which cut_field rebuilds fields that contain doubled
   quotes. */
627 struct string tmp = DS_EMPTY_INITIALIZER;
629 struct field *f, *end;
631 if (dfm_eof (reader))
634 end = &parser->fields[parser->n_fields];
635 for (f = parser->fields; f < end; f++)
637 int first_column, last_column;
640 if (!cut_field (parser, reader, &first_column, &last_column, &tmp, &s))
/* The record has no more fields.  The warning is issued only when F is
   not the last field, settings_get_undefined () is true, and PARSER is
   configured to warn about missing fields. */
642 if (f < end - 1 && settings_get_undefined () && parser->warn_missing_fields)
643 msg (DW, _("Missing value(s) for all variables from %s onward. "
644 "These will be filled with the system-missing value "
645 "or blanks, as appropriate."),
/* Store the missing value appropriate to the width of the field. */
648 value_set_missing (case_data_rw_idx (c, f->case_idx),
649 fmt_var_width (&f->format));
653 const char *input_encoding = dfm_reader_get_encoding (reader);
654 error = data_in (s, input_encoding, f->format.type,
655 settings_get_fmt_settings (),
656 case_data_rw_idx (c, f->case_idx),
657 fmt_var_width (&f->format), output_encoding);
659 parse_error (reader, f, first_column, last_column, error);
/* Whatever remains on the record, other than soft separators, belongs to
   no field. */
662 s = dfm_get_record (reader);
663 ss_ltrim (&s, parser->soft_seps);
664 if (!ss_is_empty (s))
665 msg (DW, _("Record ends in data not part of any field."));
/* One case per record: move on to the next record. */
668 dfm_forward_record (reader);
673 /* Displays a table giving information on fixed-format variable
674 parsing on DATA LIST. */
676 dump_fixed_table (const struct data_parser *parser,
677 const struct file_handle *fh)
679 /* XXX This should not be preformatted. */
680 char *title = xasprintf (ngettext ("Reading %d record from %s.",
681 "Reading %d records from %s.",
682 parser->records_per_case),
683 parser->records_per_case, fh_get_name (fh));
684 struct pivot_table *table = pivot_table_create__ (
685 pivot_value_new_user_text (title, -1), "Fixed Data Records");
/* Column categories, in the index order used by the pivot_table_put2
   calls below: 0 is Record, 1 is Columns, 2 is Format. */
688 pivot_dimension_create (
689 table, PIVOT_AXIS_COLUMN, N_("Attributes"),
690 N_("Record"), N_("Columns"), N_("Format"));
692 struct pivot_dimension *variables = pivot_dimension_create (
693 table, PIVOT_AXIS_ROW, N_("Variable"));
694 variables->root->show_label = true;
/* One row per field, labeled with the field name. */
695 for (size_t i = 0; i < parser->n_fields; i++)
697 struct field *f = &parser->fields[i];
699 /* XXX It would be better to have the actual variable here. */
700 int variable_idx = pivot_category_create_leaf (
701 variables->root, pivot_value_new_user_text (f->name, -1));
703 pivot_table_put2 (table, 0, variable_idx,
704 pivot_value_new_integer (f->record));
/* Inclusive 1-based range of columns occupied by the field. */
706 int first_column = f->first_column;
707 int last_column = f->first_column + f->format.w - 1;
708 char *columns = xasprintf ("%d-%d", first_column, last_column);
709 pivot_table_put2 (table, 1, variable_idx,
710 pivot_value_new_user_text (columns, -1));
713 char str[FMT_STRING_LEN_MAX + 1];
714 pivot_table_put2 (table, 2, variable_idx,
715 pivot_value_new_user_text (
716 fmt_to_string (&f->format, str), -1));
720 pivot_table_submit (table);
723 /* Displays a table giving information on free-format variable parsing on DATA LIST. */
726 dump_delimited_table (const struct data_parser *parser,
727 const struct file_handle *fh)
729 struct pivot_table *table = pivot_table_create__ (
730 pivot_value_new_text_format (N_("Reading free-form data from %s."),
732 "Free-Form Data Records");
734 pivot_dimension_create (
735 table, PIVOT_AXIS_COLUMN, N_("Attributes"), N_("Format"));
737 struct pivot_dimension *variables = pivot_dimension_create (
738 table, PIVOT_AXIS_ROW, N_("Variable"));
739 variables->root->show_label = true;
740 for (size_t i = 0; i < parser->n_fields; i++)
742 struct field *f = &parser->fields[i];
744 /* XXX It would be better to have the actual variable here. */
745 int variable_idx = pivot_category_create_leaf (
746 variables->root, pivot_value_new_user_text (f->name, -1));
/* The only attribute column (index 0) shows the input format of the
   field, rendered as a string. */
748 char str[FMT_STRING_LEN_MAX + 1];
749 pivot_table_put2 (table, 0, variable_idx,
750 pivot_value_new_user_text (
751 fmt_to_string (&f->format, str), -1));
754 pivot_table_submit (table);
757 /* Displays a table giving information on how PARSER will read data. */
760 data_parser_output_description (struct data_parser *parser,
761 const struct file_handle *fh)
// A DP_FIXED parser is described by dump_fixed_table.  The call to
// dump_delimited_table is presumably the else branch (not visible here).
763 if (parser->type == DP_FIXED)
764 dump_fixed_table (parser, fh);
766 dump_delimited_table (parser, fh);
769 /* Data parser input program. */
770 struct data_parser_casereader
772 struct data_parser *parser; /* Parser (destroyed along with the casereader). */
773 struct dictionary *dict; /* Dictionary (a reference is held). */
774 struct dfm_reader *reader; /* Data file reader (closed along with the casereader). */
775 struct caseproto *proto; /* Format of cases (a reference is held). */
/* Forward declaration: data_parser_make_active_file refers to this class
   before its definition, which follows the read and destroy callbacks. */
778 static const struct casereader_class data_parser_casereader_class;
780 /* Replaces DS's active dataset by an input program that reads data
781 from READER according to the rules in PARSER, using DICT as
782 the underlying dictionary. Ownership of PARSER and READER is
783 transferred to the input program, and ownership of DICT is
784 transferred to the dataset. */
786 data_parser_make_active_file (struct data_parser *parser, struct dataset *ds,
787 struct dfm_reader *reader,
788 struct dictionary *dict,
789 struct casereader* (*func)(struct casereader *,
790 const struct dictionary *,
794 struct data_parser_casereader *r;
795 struct casereader *casereader0;
796 struct casereader *casereader1;
798 r = xmalloc (sizeof *r);
/* The casereader state takes its own references to DICT and to the case
   prototype of DICT; data_parser_casereader_destroy releases them. */
800 r->dict = dict_ref (dict);
802 r->proto = caseproto_ref (dict_get_proto (dict));
803 casereader0 = casereader_create_sequential (NULL, r->proto,
805 &data_parser_casereader_class, r);
/* casereader1 is either the result of passing the new reader through
   FUNC or the new reader itself. */
808 casereader1 = func (casereader0, dict, ud);
810 casereader1 = casereader0;
812 dataset_set_dict (ds, dict);
813 dataset_set_source (ds, casereader1);
/* Read callback of data_parser_casereader_class.  Creates a case with the
   prototype stored in R_ and asks data_parser_parse to fill it from the
   data file.  READER itself is unused. */
817 static struct ccase *
818 data_parser_casereader_read (struct casereader *reader UNUSED, void *r_)
820 struct data_parser_casereader *r = r_;
/* data_parser_parse asserts that the case it fills is not shared. */
821 struct ccase *c = case_create (r->proto);
822 if (data_parser_parse (r->parser, r->reader, r->dict, c))
/* Destroy callback of data_parser_casereader_class.  Closes the data file
   reader, drops the case prototype and dictionary references, and destroys
   the parser held in R_. */
832 data_parser_casereader_destroy (struct casereader *reader, void *r_)
834 struct data_parser_casereader *r = r_;
/* Pass an error state of the data file on to READER before the file is
   closed. */
835 if (dfm_reader_error (r->reader))
836 casereader_force_error (reader);
837 dfm_close_reader (r->reader);
838 caseproto_unref (r->proto);
839 dict_unref (r->dict);
840 data_parser_destroy (r->parser);
/* Casereader class whose first two members are the read and destroy
   callbacks defined above. */
844 static const struct casereader_class data_parser_casereader_class =
846 data_parser_casereader_read,
847 data_parser_casereader_destroy,