1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007, 2009, 2010, 2011, 2012, 2013, 2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/data-io/data-parser.h"
24 #include "data/casereader-provider.h"
25 #include "data/data-in.h"
26 #include "data/dataset.h"
27 #include "data/dictionary.h"
28 #include "data/format.h"
29 #include "data/file-handle-def.h"
30 #include "data/settings.h"
31 #include "language/data-io/data-reader.h"
32 #include "libpspp/message.h"
33 #include "libpspp/str.h"
34 #include "output/tab.h"
36 #include "gl/xalloc.h"
39 #define _(msgid) gettext (msgid)
41 /* Data parser for textual data like that read by DATA LIST. */
44 const struct dictionary *dict; /* Dictionary of destination. */
45 enum data_parser_type type; /* Type of data to parse. */
46 int skip_records; /* Records still to skip before first real data; counted down by data_parser_parse. */
48 struct field *fields; /* Fields to parse. */
49 size_t field_cnt; /* Number of fields. */
50 size_t field_allocated; /* Number of fields that space is allocated for. */
52 /* DP_DELIMITED parsers only. */
53 bool span; /* May cases span multiple records? */
54 bool empty_line_has_field; /* Does an empty line have an (empty) field? */
55 struct substring quotes; /* Characters that can quote separators. */
56 bool quote_escape; /* Doubled quote acts as escape? */
57 struct substring soft_seps; /* Two soft separators act like just one. */
58 struct substring hard_seps; /* Two hard separators yield empty fields. */
59 struct string any_sep; /* Concatenation of soft_seps and hard_seps; rebuilt by set_any_sep. */
61 /* DP_FIXED parsers only. */
62 int records_per_case; /* Number of records in each case. */
65 /* How to parse one variable. */
68 struct fmt_spec format; /* Input format of this field. */
69 int case_idx; /* First value in case. */
70 char *name; /* Var name for error messages and tables; a private copy made by add_field. */
73 int record; /* Record number (1-based); 0 is stored for delimited fields. */
74 int first_column; /* First column in record (1-based); 0 is stored for delimited fields. */
77 static void set_any_sep (struct data_parser *parser);
79 /* Creates and returns a new data parser.
   The new parser has type DP_FIXED, skips no records, has no fields, and
   has zero records per case.  Its delimited-mode defaults are: an empty
   line carries no field, the double and single quote characters quote
   fields, a doubled quote is not an escape, the CC_SPACES characters are
   soft separators, and comma is the only hard separator. */
81 data_parser_create (const struct dictionary *dict)
83 struct data_parser *parser = xmalloc (sizeof *parser);
85 parser->type = DP_FIXED;
86 parser->skip_records = 0;
88 parser->fields = NULL;
89 parser->field_cnt = 0;
90 parser->field_allocated = 0;
94 parser->empty_line_has_field = false;
95 ss_alloc_substring (&parser->quotes, ss_cstr ("\"'")); /* Private copy; released with ss_dealloc. */
96 parser->quote_escape = false;
97 ss_alloc_substring (&parser->soft_seps, ss_cstr (CC_SPACES));
98 ss_alloc_substring (&parser->hard_seps, ss_cstr (","));
99 ds_init_empty (&parser->any_sep); /* Must be initialized before set_any_sep assigns to it. */
100 set_any_sep (parser);
102 parser->records_per_case = 0;
107 /* Destroys PARSER, freeing every field name, the field array, the quote
   and separator substrings, and the combined separator string. */
109 data_parser_destroy (struct data_parser *parser)
115 for (i = 0; i < parser->field_cnt; i++)
116 free (parser->fields[i].name); /* Each name was duplicated with xstrdup in add_field. */
117 free (parser->fields);
118 ss_dealloc (&parser->quotes);
119 ss_dealloc (&parser->soft_seps);
120 ss_dealloc (&parser->hard_seps);
121 ds_destroy (&parser->any_sep);
126 /* Returns the type of PARSER (either DP_DELIMITED or DP_FIXED). */
127 enum data_parser_type
128 data_parser_get_type (const struct data_parser *parser)
133 /* Sets the type of PARSER to TYPE (either DP_DELIMITED or DP_FIXED). No fields may have been added to PARSER yet. */
136 data_parser_set_type (struct data_parser *parser, enum data_parser_type type)
138 assert (parser->field_cnt == 0);
139 assert (type == DP_FIXED || type == DP_DELIMITED);
143 /* Configures PARSER to skip the specified number of
144 INITIAL_RECORDS_TO_SKIP before parsing any data. By default,
145 no records are skipped. */
147 data_parser_set_skip (struct data_parser *parser, int initial_records_to_skip)
149 assert (initial_records_to_skip >= 0);
150 parser->skip_records = initial_records_to_skip;
153 /* Returns true if PARSER is configured to allow cases to span multiple records. */
156 data_parser_get_span (const struct data_parser *parser)
161 /* If MAY_CASES_SPAN_RECORDS is true, configures PARSER to allow
162 a single case to span multiple records and multiple cases to
163 occupy a single record. If MAY_CASES_SPAN_RECORDS is false,
164 configures PARSER to require each record to contain exactly one case.
167 This setting affects parsing of DP_DELIMITED files only. */
169 data_parser_set_span (struct data_parser *parser, bool may_cases_span_records)
171 parser->span = may_cases_span_records;
174 /* If EMPTY_LINE_HAS_FIELD is true, configures PARSER to parse an
175 empty line as an empty field and to treat a hard delimiter
176 followed by end-of-line as an empty field. If
177 EMPTY_LINE_HAS_FIELD is false, PARSER will skip empty lines
178 and hard delimiters at the end of lines without emitting empty fields.
181 This setting affects parsing of DP_DELIMITED files only. */
183 data_parser_set_empty_line_has_field (struct data_parser *parser,
184 bool empty_line_has_field)
186 parser->empty_line_has_field = empty_line_has_field;
189 /* Sets the characters that may be used for quoting field
190 contents to QUOTES. If QUOTES is empty, quoting will be disabled.
193 The caller retains ownership of QUOTES.
195 This setting affects parsing of DP_DELIMITED files only. */
197 data_parser_set_quotes (struct data_parser *parser, struct substring quotes)
199 ss_dealloc (&parser->quotes);
200 ss_alloc_substring (&parser->quotes, quotes);
203 /* If ESCAPE is false (the default setting), a character used for
204 quoting cannot itself be embedded within a quoted field. If
205 ESCAPE is true, then a quote character can be embedded within
206 a quoted field by doubling it.
208 This setting affects parsing of DP_DELIMITED files only, and
209 only when at least one quote character has been set (with
210 data_parser_set_quotes). */
212 data_parser_set_quote_escape (struct data_parser *parser, bool escape)
214 parser->quote_escape = escape;
217 /* Sets PARSER's soft delimiters to DELIMITERS. Soft delimiters
218 separate fields, but consecutive soft delimiters do not yield
219 empty fields. (Ordinarily, only white space characters are
220 appropriate soft delimiters.)
222 The caller retains ownership of DELIMITERS.
224 This setting affects parsing of DP_DELIMITED files only. */
226 data_parser_set_soft_delimiters (struct data_parser *parser,
227 struct substring delimiters)
229 ss_dealloc (&parser->soft_seps);
230 ss_alloc_substring (&parser->soft_seps, delimiters);
231 set_any_sep (parser);
234 /* Sets PARSER's hard delimiters to DELIMITERS. Hard delimiters
235 separate fields. A consecutive pair of hard delimiters yields an empty field.
238 The caller retains ownership of DELIMITERS.
240 This setting affects parsing of DP_DELIMITED files only. */
242 data_parser_set_hard_delimiters (struct data_parser *parser,
243 struct substring delimiters)
245 ss_dealloc (&parser->hard_seps);
246 ss_alloc_substring (&parser->hard_seps, delimiters);
247 set_any_sep (parser);
250 /* Returns the number of records per case. */
252 data_parser_get_records (const struct data_parser *parser)
254 return parser->records_per_case;
257 /* Sets the number of records per case to RECORDS_PER_CASE.
   RECORDS_PER_CASE must be nonnegative and must not be less than the
   current number of records per case; both conditions are asserted.
259 This setting affects parsing of DP_FIXED files only. */
261 data_parser_set_records (struct data_parser *parser, int records_per_case)
263 assert (records_per_case >= 0);
264 assert (records_per_case >= parser->records_per_case);
265 parser->records_per_case = records_per_case;
/* Appends a new field to P, growing the field array with x2nrealloc when
   it is full.  The new field stores a copy of *FORMAT, CASE_IDX, a
   private copy of NAME, RECORD, and FIRST_COLUMN. */
269 add_field (struct data_parser *p, const struct fmt_spec *format, int case_idx,
270 const char *name, int record, int first_column)
274 if (p->field_cnt == p->field_allocated)
275 p->fields = x2nrealloc (p->fields, &p->field_allocated, sizeof *p->fields);
276 field = &p->fields[p->field_cnt++];
277 field->format = *format;
278 field->case_idx = case_idx;
279 field->name = xstrdup (name); /* Released in data_parser_destroy. */
280 field->record = record;
281 field->first_column = first_column;
284 /* Adds a delimited field to the fields parsed by PARSER, which
285 must be configured as a DP_DELIMITED parser. The field is
286 parsed as input format FORMAT. Its data will be stored into case
287 index CASE_IDX. Errors in input data will be reported
288 against variable NAME. */
290 data_parser_add_delimited_field (struct data_parser *parser,
291 const struct fmt_spec *format, int case_idx,
294 assert (parser->type == DP_DELIMITED);
295 add_field (parser, format, case_idx, name, 0, 0); /* Delimited fields have no fixed record or column, so 0 is stored for both. */
298 /* Adds a fixed field to the fields parsed by PARSER, which
299 must be configured as a DP_FIXED parser. The field is
300 parsed as input format FORMAT. Its data will be stored into case
301 index CASE_IDX. Errors in input data will be reported
302 against variable NAME. The field will be drawn from the
303 FORMAT->w columns in 1-based RECORD starting at 1-based
   column FIRST_COLUMN.
306 RECORD must be at least as great as that of any field already
307 added; that is, fields must be added in nondecreasing order of
308 record number (this is asserted). If RECORD is greater than the
309 current number of records per case, the number of records per case is
310 increased as needed. */
312 data_parser_add_fixed_field (struct data_parser *parser,
313 const struct fmt_spec *format, int case_idx,
315 int record, int first_column)
317 assert (parser->type == DP_FIXED);
318 assert (parser->field_cnt == 0
319 || record >= parser->fields[parser->field_cnt - 1].record);
320 if (record > parser->records_per_case)
321 parser->records_per_case = record;
322 add_field (parser, format, case_idx, name, record, first_column);
325 /* Returns true if any fields have been added to PARSER, false otherwise. */
328 data_parser_any_fields (const struct data_parser *parser)
330 return parser->field_cnt > 0;
/* Rebuilds PARSER->any_sep as the soft separators followed by the hard
   separators.  It is called when the parser is created and whenever
   either separator set is replaced. */
334 set_any_sep (struct data_parser *parser)
336 ds_assign_substring (&parser->any_sep, parser->soft_seps);
337 ds_put_substring (&parser->any_sep, parser->hard_seps);
340 static bool parse_delimited_span (const struct data_parser *,
341 struct dfm_reader *, struct ccase *);
342 static bool parse_delimited_no_span (const struct data_parser *,
343 struct dfm_reader *, struct ccase *);
344 static bool parse_fixed (const struct data_parser *,
345 struct dfm_reader *, struct ccase *);
347 /* Reads a case from DFM into C, parsing it with PARSER. Returns
348 true if successful, false at end of file or on I/O error.
350 Case C must not be shared. */
352 data_parser_parse (struct data_parser *parser, struct dfm_reader *reader,
357 assert (!case_is_shared (c));
358 assert (data_parser_any_fields (parser));
360 /* Skip the requested number of records before reading the first case. */
362 for (; parser->skip_records > 0; parser->skip_records--)
364 if (dfm_eof (reader))
366 dfm_forward_record (reader);
370 if (parser->type == DP_DELIMITED)
373 retval = parse_delimited_span (parser, reader, c);
375 retval = parse_delimited_no_span (parser, reader, c);
378 retval = parse_fixed (parser, reader, c);
383 /* Extracts a delimited field from the current position in the
384 current record according to PARSER, reading data from READER.
386 *FIELD is set to the field content. The caller must not modify or
387 destroy this constant string.
389 After parsing the field, sets the current position in the
390 record to just past the field and any trailing delimiter.
391 Returns zero on failure and nonzero on success. On success, *FIRST_COLUMN is set to the
392 1-based column where the field begins and *LAST_COLUMN to the column just past its end. */
394 cut_field (const struct data_parser *parser, struct dfm_reader *reader,
395 int *first_column, int *last_column, struct string *tmp,
396 struct substring *field)
398 size_t length_before_separators;
399 struct substring line, p;
402 if (dfm_eof (reader))
404 if (ss_is_empty (parser->hard_seps)) /* Tabs are expanded only when no hard separators are configured. NOTE(review): presumably because a tab may itself be a hard separator; confirm. */
405 dfm_expand_tabs (reader);
406 line = p = dfm_get_record (reader); /* LINE keeps the start of the remaining record; P is advanced as bytes are consumed. */
408 /* Skip leading soft separators. */
409 ss_ltrim (&p, parser->soft_seps);
411 /* Handle empty or completely consumed lines. */
414 if (!parser->empty_line_has_field || dfm_columns_past_end (reader) > 0)
419 *first_column = dfm_column_start (reader);
420 *last_column = *first_column + 1;
421 dfm_forward_columns (reader, 1); /* Stepping one column makes dfm_columns_past_end positive, so the test above stops a second empty field from being reported for the same line. */
426 *first_column = dfm_column_start (reader);
427 quoted = ss_find_byte (parser->quotes, ss_first (p)) != SIZE_MAX; /* Quoted if the first byte is one of the configured quote characters. */
431 int quote = ss_get_byte (&p);
432 if (!ss_get_until (&p, quote, field))
433 msg (DW, _("Quoted string extends beyond end of line."));
434 if (parser->quote_escape && ss_first (p) == quote)
436 ds_assign_substring (tmp, *field); /* Escaped quotes force the field to be assembled in TMP. */
437 while (ss_match_byte (&p, quote))
440 ds_put_byte (tmp, quote); /* A doubled quote contributes one literal quote byte. */
441 if (!ss_get_until (&p, quote, &ss))
442 msg (DW, _("Quoted string extends beyond end of line."));
443 ds_put_substring (tmp, ss);
445 *field = ds_ss (tmp); /* The result now refers to the contents of TMP. */
447 *last_column = *first_column + (ss_length (line) - ss_length (p)); /* Width is the number of bytes consumed so far, quotes included. */
452 ss_get_bytes (&p, ss_cspan (p, ds_ss (&parser->any_sep)), field); /* An unquoted field runs up to the first separator of either kind. */
453 *last_column = *first_column + ss_length (*field);
456 /* Skip trailing soft separator and a single hard separator if present. */
457 length_before_separators = ss_length (p);
458 ss_ltrim (&p, parser->soft_seps);
460 && ss_find_byte (parser->hard_seps, ss_first (p)) != SIZE_MAX)
463 ss_ltrim (&p, parser->soft_seps);
466 dfm_forward_columns (reader, 1);
467 else if (quoted && length_before_separators == ss_length (p)) /* Nothing was skipped after the closing quote. */
468 msg (DW, _("Missing delimiter following quoted string."));
469 dfm_forward_columns (reader, ss_length (line) - ss_length (p)); /* Advance the reader past everything consumed from this record. */
/* Fills in a data-category warning message stating that the data for
   FIELD, located between FIRST_COLUMN and LAST_COLUMN on the current
   line of READER, is not valid for the field format.  ERROR supplies
   the detail text appended to the message. */
475 parse_error (const struct dfm_reader *reader, const struct field *field,
476 int first_column, int last_column, char *error)
480 m.category = MSG_C_DATA;
481 m.severity = MSG_S_WARNING;
482 m.file_name = CONST_CAST (char *, dfm_get_file_name (reader));
483 m.first_line = dfm_get_line_number (reader);
484 m.last_line = m.first_line + 1; /* The message covers the single current line. */
485 m.first_column = first_column;
486 m.last_column = last_column;
487 m.text = xasprintf (_("Data for variable %s is not valid as format %s: %s"),
488 field->name, fmt_name (field->format.type), error);
494 /* Reads a case from READER into C, parsing it according to
495 fixed-format syntax rules in PARSER.
   Each case consumes PARSER->records_per_case records.  If the input
   ends partway through a case, a warning reports how many records of
   the partial case had been read.
496 Returns true if successful, false at end of file or on I/O error. */
498 parse_fixed (const struct data_parser *parser, struct dfm_reader *reader,
501 const char *input_encoding = dfm_reader_get_encoding (reader);
502 const char *output_encoding = dict_get_encoding (parser->dict);
506 if (dfm_eof (reader))
510 for (row = 1; row <= parser->records_per_case; row++)
512 struct substring line;
514 if (dfm_eof (reader))
516 msg (DW, _("Partial case of %d of %d records discarded."),
517 row - 1, parser->records_per_case);
520 dfm_expand_tabs (reader);
521 line = dfm_get_record (reader);
523 for (; f < &parser->fields[parser->field_cnt] && f->record == row; f++) /* Relies on fields being stored in record order, as data_parser_add_fixed_field asserts. */
525 struct substring s = ss_substr (line, f->first_column - 1, /* Convert the 1-based column to a 0-based offset. */
527 union value *value = case_data_rw_idx (c, f->case_idx);
528 char *error = data_in (s, input_encoding, f->format.type,
529 value, fmt_var_width (&f->format),
533 data_in_imply_decimals (s, input_encoding, f->format.type,
536 parse_error (reader, f, f->first_column,
537 f->first_column + f->format.w, error); /* The last column passed is one past the end of the field. */
540 dfm_forward_record (reader);
546 /* Reads a case from READER into C, parsing it according to
547 free-format syntax rules in PARSER.
   Fields are taken one after another and a new record is read whenever
   the current one has no more fields, so one case may use several
   records.
548 Returns true if successful, false at end of file or on I/O error. */
550 parse_delimited_span (const struct data_parser *parser,
551 struct dfm_reader *reader, struct ccase *c)
553 const char *output_encoding = dict_get_encoding (parser->dict);
554 struct string tmp = DS_EMPTY_INITIALIZER; /* Scratch buffer that cut_field uses for quoted fields with escapes. */
557 for (f = parser->fields; f < &parser->fields[parser->field_cnt]; f++)
560 int first_column, last_column;
563 /* Cut out a field and read in a new record if necessary. */
564 while (!cut_field (parser, reader,
565 &first_column, &last_column, &tmp, &s))
567 if (!dfm_eof (reader))
568 dfm_forward_record (reader);
569 if (dfm_eof (reader))
571 if (f > parser->fields) /* Warn only if at least one field of this case was already read. */
572 msg (DW, _("Partial case discarded. The first variable "
573 "missing was %s."), f->name);
579 const char *input_encoding = dfm_reader_get_encoding (reader);
580 error = data_in (s, input_encoding, f->format.type,
581 case_data_rw_idx (c, f->case_idx),
582 fmt_var_width (&f->format), output_encoding);
584 parse_error (reader, f, first_column, last_column, error);
590 /* Reads a case from READER into C, parsing it according to
591 delimited syntax rules with one case per record in PARSER.
   When the record runs out of fields early, the affected value is set
   to missing.  Data left in the record after the last field draws a
   warning.
592 Returns true if successful, false at end of file or on I/O error. */
594 parse_delimited_no_span (const struct data_parser *parser,
595 struct dfm_reader *reader, struct ccase *c)
597 const char *output_encoding = dict_get_encoding (parser->dict);
598 struct string tmp = DS_EMPTY_INITIALIZER; /* Scratch buffer that cut_field uses for quoted fields with escapes. */
600 struct field *f, *end;
602 if (dfm_eof (reader))
605 end = &parser->fields[parser->field_cnt];
606 for (f = parser->fields; f < end; f++)
608 int first_column, last_column;
611 if (!cut_field (parser, reader, &first_column, &last_column, &tmp, &s))
613 if (f < end - 1 && settings_get_undefined ()) /* No warning when only the final field is missing or when settings_get_undefined returns false. */
614 msg (DW, _("Missing value(s) for all variables from %s onward. "
615 "These will be filled with the system-missing value "
616 "or blanks, as appropriate."),
619 value_set_missing (case_data_rw_idx (c, f->case_idx),
620 fmt_var_width (&f->format));
624 const char *input_encoding = dfm_reader_get_encoding (reader);
625 error = data_in (s, input_encoding, f->format.type,
626 case_data_rw_idx (c, f->case_idx),
627 fmt_var_width (&f->format), output_encoding);
629 parse_error (reader, f, first_column, last_column, error);
632 s = dfm_get_record (reader); /* Whatever remains of the record after the last field. */
633 ss_ltrim (&s, parser->soft_seps);
634 if (!ss_is_empty (s))
635 msg (DW, _("Record ends in data not part of any field."));
638 dfm_forward_record (reader);
643 /* Displays a table giving information on fixed-format variable
644 parsing on DATA LIST.
   The table has four columns (variable, record, columns, format) and
   one row per field below a header row.  Its title gives the number of
   records per case and the name of FH. */
646 dump_fixed_table (const struct data_parser *parser,
647 const struct file_handle *fh)
652 t = tab_create (4, parser->field_cnt + 1); /* One extra row for the column headings. */
653 tab_headers (t, 0, 0, 1, 0);
654 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Variable"));
655 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Record"));
656 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Columns"));
657 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Format"));
658 tab_box (t, TAL_1, TAL_1, TAL_0, TAL_1, 0, 0, 3, parser->field_cnt);
659 tab_hline (t, TAL_2, 0, 3, 1);
661 for (i = 0; i < parser->field_cnt; i++)
663 struct field *f = &parser->fields[i];
664 char fmt_string[FMT_STRING_LEN_MAX + 1];
667 tab_text (t, 0, row, TAB_LEFT, f->name);
668 tab_text_format (t, 1, row, 0, "%d", f->record);
669 tab_text_format (t, 2, row, 0, "%3d-%3d",
670 f->first_column, f->first_column + f->format.w - 1); /* Inclusive last column of the field. */
671 tab_text (t, 3, row, TAB_LEFT | TAB_FIX,
672 fmt_to_string (&f->format, fmt_string));
675 tab_title (t, ngettext ("Reading %d record from %s.",
676 "Reading %d records from %s.",
677 parser->records_per_case), /* Selects the singular or plural wording. */
678 parser->records_per_case, fh_get_name (fh));
682 /* Displays a table giving information on free-format variable parsing on DATA LIST. */
685 dump_delimited_table (const struct data_parser *parser,
686 const struct file_handle *fh)
691 t = tab_create (2, parser->field_cnt + 1);
692 tab_headers (t, 0, 0, 1, 0);
693 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Variable"));
694 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Format"));
695 tab_box (t, TAL_1, TAL_1, TAL_0, TAL_1, 0, 0, 1, parser->field_cnt);
696 tab_hline (t, TAL_2, 0, 1, 1);
698 for (i = 0; i < parser->field_cnt; i++)
700 struct field *f = &parser->fields[i];
701 char str[FMT_STRING_LEN_MAX + 1];
704 tab_text (t, 0, row, TAB_LEFT, f->name);
705 tab_text (t, 1, row, TAB_LEFT | TAB_FIX,
706 fmt_to_string (&f->format, str));
709 tab_title (t, _("Reading free-form data from %s."), fh_get_name (fh));
714 /* Displays a table giving information on how PARSER will read data from FH. */
717 data_parser_output_description (struct data_parser *parser,
718 const struct file_handle *fh)
720 if (parser->type == DP_FIXED)
721 dump_fixed_table (parser, fh);
723 dump_delimited_table (parser, fh);
726 /* Data parser input program. */
727 struct data_parser_casereader
729 struct data_parser *parser; /* Parser; destroyed by data_parser_casereader_destroy. */
730 struct dfm_reader *reader; /* Data file reader; closed by data_parser_casereader_destroy. */
731 struct caseproto *proto; /* Format of cases; a reference released by data_parser_casereader_destroy. */
734 static const struct casereader_class data_parser_casereader_class;
736 /* Replaces DS's active dataset by an input program that reads data
737 from READER according to the rules in PARSER, using DICT as
738 the underlying dictionary. Ownership of PARSER and READER is
739 transferred to the input program, and ownership of DICT is
740 transferred to the dataset. */
742 data_parser_make_active_file (struct data_parser *parser, struct dataset *ds,
743 struct dfm_reader *reader,
744 struct dictionary *dict)
746 struct data_parser_casereader *r;
747 struct casereader *casereader;
749 r = xmalloc (sizeof *r);
752 r->proto = caseproto_ref (dict_get_proto (dict)); /* Take a reference of our own; it is dropped when the casereader is destroyed. */
753 casereader = casereader_create_sequential (NULL, r->proto,
755 &data_parser_casereader_class, r); /* R becomes the auxiliary data handed to the class callbacks. */
756 dataset_set_dict (ds, dict);
757 dataset_set_source (ds, casereader);
/* Casereader read callback.  R_ is the struct data_parser_casereader for
   this input program.  Creates a new case with the stored prototype and
   asks data_parser_parse to fill it from the data file reader. */
760 static struct ccase *
761 data_parser_casereader_read (struct casereader *reader UNUSED, void *r_)
763 struct data_parser_casereader *r = r_;
764 struct ccase *c = case_create (r->proto);
765 if (data_parser_parse (r->parser, r->reader, c))
/* Casereader destroy callback.  R_ is the struct data_parser_casereader
   for this input program.  If the data file reader reports an error, the
   error is forced onto READER first; then the parser is destroyed, the
   data file reader is closed, and the case prototype reference is
   released.
   NOTE(review): READER is marked UNUSED although it is passed to
   casereader_force_error below. */
775 data_parser_casereader_destroy (struct casereader *reader UNUSED, void *r_)
777 struct data_parser_casereader *r = r_;
778 if (dfm_reader_error (r->reader))
779 casereader_force_error (reader);
780 data_parser_destroy (r->parser);
781 dfm_close_reader (r->reader);
782 caseproto_unref (r->proto);
786 static const struct casereader_class data_parser_casereader_class =
788 data_parser_casereader_read,
789 data_parser_casereader_destroy,