1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "language/data-io/data-parser.h"
24 #include "data/casereader-provider.h"
25 #include "data/data-in.h"
26 #include "data/dataset.h"
27 #include "data/dictionary.h"
28 #include "data/format.h"
29 #include "data/file-handle-def.h"
30 #include "data/settings.h"
31 #include "language/data-io/data-reader.h"
32 #include "libpspp/message.h"
33 #include "libpspp/str.h"
34 #include "output/tab.h"
36 #include "gl/xalloc.h"
39 #define _(msgid) gettext (msgid)
41 /* Data parser for textual data like that read by DATA LIST. */
44 const struct dictionary *dict; /*Dictionary of destination */
45 enum data_parser_type type; /* Type of data to parse. */
46 int skip_records; /* Records to skip before first real data. */
47 casenumber max_cases; /* Max number of cases to read. */
48 int percent_cases; /* Approximate percent of cases to read. */
50 struct field *fields; /* Fields to parse. */
51 size_t field_cnt; /* Number of fields. */
52 size_t field_allocated; /* Number of fields spaced allocated for. */
54 /* DP_DELIMITED parsers only. */
55 bool span; /* May cases span multiple records? */
56 bool empty_line_has_field; /* Does an empty line have an (empty) field? */
57 struct substring quotes; /* Characters that can quote separators. */
58 bool quote_escape; /* Doubled quote acts as escape? */
59 struct substring soft_seps; /* Two soft separators act like just one. */
60 struct substring hard_seps; /* Two hard separators yield empty fields. */
61 struct string any_sep; /* Concatenation of soft_seps and hard_seps. */
63 /* DP_FIXED parsers only. */
64 int records_per_case; /* Number of records in each case. */
67 /* How to parse one variable. */
70 struct fmt_spec format; /* Input format of this field. */
71 int case_idx; /* First value in case. */
72 char *name; /* Var name for error messages and tables. */
75 int record; /* Record number (1-based). */
76 int first_column; /* First column in record (1-based). */
/* Forward declaration; rebuilds PARSER's combined separator set. */
static void set_any_sep (struct data_parser *parser);
81 /* Creates and returns a new data parser. */
83 data_parser_create (const struct dictionary *dict)
85 struct data_parser *parser = xmalloc (sizeof *parser);
87 parser->type = DP_FIXED;
88 parser->skip_records = 0;
89 parser->max_cases = -1;
90 parser->percent_cases = 100;
92 parser->fields = NULL;
93 parser->field_cnt = 0;
94 parser->field_allocated = 0;
98 parser->empty_line_has_field = false;
99 ss_alloc_substring (&parser->quotes, ss_cstr ("\"'"));
100 parser->quote_escape = false;
101 ss_alloc_substring (&parser->soft_seps, ss_cstr (CC_SPACES));
102 ss_alloc_substring (&parser->hard_seps, ss_cstr (","));
103 ds_init_empty (&parser->any_sep);
104 set_any_sep (parser);
106 parser->records_per_case = 0;
111 /* Destroys PARSER. */
113 data_parser_destroy (struct data_parser *parser)
119 for (i = 0; i < parser->field_cnt; i++)
120 free (parser->fields[i].name);
121 free (parser->fields);
122 ss_dealloc (&parser->quotes);
123 ss_dealloc (&parser->soft_seps);
124 ss_dealloc (&parser->hard_seps);
125 ds_destroy (&parser->any_sep);
130 /* Returns the type of PARSER (either DP_DELIMITED or DP_FIXED). */
131 enum data_parser_type
132 data_parser_get_type (const struct data_parser *parser)
137 /* Sets the type of PARSER to TYPE (either DP_DELIMITED or
140 data_parser_set_type (struct data_parser *parser, enum data_parser_type type)
142 assert (parser->field_cnt == 0);
143 assert (type == DP_FIXED || type == DP_DELIMITED);
147 /* Configures PARSER to skip the specified number of
148 INITIAL_RECORDS_TO_SKIP before parsing any data. By default,
149 no records are skipped. */
151 data_parser_set_skip (struct data_parser *parser, int initial_records_to_skip)
153 assert (initial_records_to_skip >= 0);
154 parser->skip_records = initial_records_to_skip;
157 /* Sets the maximum number of cases parsed by PARSER to
158 MAX_CASES. The default is -1, meaning no limit. */
160 data_parser_set_case_limit (struct data_parser *parser, casenumber max_cases)
162 parser->max_cases = max_cases;
165 /* Sets the percentage of cases that PARSER should read from the
166 input file to PERCENT_CASES. By default, all cases are
169 data_parser_set_case_percent (struct data_parser *parser, int percent_cases)
171 assert (percent_cases >= 0 && percent_cases <= 100);
172 parser->percent_cases = percent_cases;
175 /* Returns true if PARSER is configured to allow cases to span
178 data_parser_get_span (const struct data_parser *parser)
183 /* If MAY_CASES_SPAN_RECORDS is true, configures PARSER to allow
184 a single case to span multiple records and multiple cases to
185 occupy a single record. If MAY_CASES_SPAN_RECORDS is false,
186 configures PARSER to require each record to contain exactly
189 This setting affects parsing of DP_DELIMITED files only. */
191 data_parser_set_span (struct data_parser *parser, bool may_cases_span_records)
193 parser->span = may_cases_span_records;
196 /* If EMPTY_LINE_HAS_FIELD is true, configures PARSER to parse an
197 empty line as an empty field and to treat a hard delimiter
198 followed by end-of-line as an empty field. If
199 EMPTY_LINE_HAS_FIELD is false, PARSER will skip empty lines
200 and hard delimiters at the end of lines without emitting empty
203 This setting affects parsing of DP_DELIMITED files only. */
205 data_parser_set_empty_line_has_field (struct data_parser *parser,
206 bool empty_line_has_field)
208 parser->empty_line_has_field = empty_line_has_field;
211 /* Sets the characters that may be used for quoting field
212 contents to QUOTES. If QUOTES is empty, quoting will be
215 The caller retains ownership of QUOTES.
217 This setting affects parsing of DP_DELIMITED files only. */
219 data_parser_set_quotes (struct data_parser *parser, struct substring quotes)
221 ss_dealloc (&parser->quotes);
222 ss_alloc_substring (&parser->quotes, quotes);
225 /* If ESCAPE is false (the default setting), a character used for
226 quoting cannot itself be embedded within a quoted field. If
227 ESCAPE is true, then a quote character can be embedded within
228 a quoted field by doubling it.
230 This setting affects parsing of DP_DELIMITED files only, and
231 only when at least one quote character has been set (with
232 data_parser_set_quotes). */
234 data_parser_set_quote_escape (struct data_parser *parser, bool escape)
236 parser->quote_escape = escape;
239 /* Sets PARSER's soft delimiters to DELIMITERS. Soft delimiters
240 separate fields, but consecutive soft delimiters do not yield
241 empty fields. (Ordinarily, only white space characters are
242 appropriate soft delimiters.)
244 The caller retains ownership of DELIMITERS.
246 This setting affects parsing of DP_DELIMITED files only. */
248 data_parser_set_soft_delimiters (struct data_parser *parser,
249 struct substring delimiters)
251 ss_dealloc (&parser->soft_seps);
252 ss_alloc_substring (&parser->soft_seps, delimiters);
253 set_any_sep (parser);
256 /* Sets PARSER's hard delimiters to DELIMITERS. Hard delimiters
257 separate fields. A consecutive pair of hard delimiters yield
260 The caller retains ownership of DELIMITERS.
262 This setting affects parsing of DP_DELIMITED files only. */
264 data_parser_set_hard_delimiters (struct data_parser *parser,
265 struct substring delimiters)
267 ss_dealloc (&parser->hard_seps);
268 ss_alloc_substring (&parser->hard_seps, delimiters);
269 set_any_sep (parser);
272 /* Returns the number of records per case. */
274 data_parser_get_records (const struct data_parser *parser)
276 return parser->records_per_case;
279 /* Sets the number of records per case to RECORDS_PER_CASE.
281 This setting affects parsing of DP_FIXED files only. */
283 data_parser_set_records (struct data_parser *parser, int records_per_case)
285 assert (records_per_case >= 0);
286 assert (records_per_case >= parser->records_per_case);
287 parser->records_per_case = records_per_case;
291 add_field (struct data_parser *p, const struct fmt_spec *format, int case_idx,
292 const char *name, int record, int first_column)
296 if (p->field_cnt == p->field_allocated)
297 p->fields = x2nrealloc (p->fields, &p->field_allocated, sizeof *p->fields);
298 field = &p->fields[p->field_cnt++];
299 field->format = *format;
300 field->case_idx = case_idx;
301 field->name = xstrdup (name);
302 field->record = record;
303 field->first_column = first_column;
306 /* Adds a delimited field to the field parsed by PARSER, which
307 must be configured as a DP_DELIMITED parser. The field is
308 parsed as input format FORMAT. Its data will be stored into case
309 index CASE_INDEX. Errors in input data will be reported
310 against variable NAME. */
312 data_parser_add_delimited_field (struct data_parser *parser,
313 const struct fmt_spec *format, int case_idx,
316 assert (parser->type == DP_DELIMITED);
317 add_field (parser, format, case_idx, name, 0, 0);
320 /* Adds a fixed field to the field parsed by PARSER, which
321 must be configured as a DP_FIXED parser. The field is
322 parsed as input format FORMAT. Its data will be stored into case
323 index CASE_INDEX. Errors in input data will be reported
324 against variable NAME. The field will be drawn from the
325 FORMAT->w columns in 1-based RECORD starting at 1-based
328 RECORD must be at least as great as that of any field already
329 added; that is, fields must be added in increasing order of
330 record number. If RECORD is greater than the current number
331 of records per case, the number of records per case are
332 increased as needed. */
334 data_parser_add_fixed_field (struct data_parser *parser,
335 const struct fmt_spec *format, int case_idx,
337 int record, int first_column)
339 assert (parser->type == DP_FIXED);
340 assert (parser->field_cnt == 0
341 || record >= parser->fields[parser->field_cnt - 1].record);
342 if (record > parser->records_per_case)
343 parser->records_per_case = record;
344 add_field (parser, format, case_idx, name, record, first_column);
347 /* Returns true if any fields have been added to PARSER, false
350 data_parser_any_fields (const struct data_parser *parser)
352 return parser->field_cnt > 0;
356 set_any_sep (struct data_parser *parser)
358 ds_assign_substring (&parser->any_sep, parser->soft_seps);
359 ds_put_substring (&parser->any_sep, parser->hard_seps);
/* Per-mode case parsers used by data_parser_parse. */
static bool parse_delimited_span (const struct data_parser *,
                                  struct dfm_reader *, struct ccase *);
static bool parse_delimited_no_span (const struct data_parser *,
                                     struct dfm_reader *, struct ccase *);
static bool parse_fixed (const struct data_parser *,
                         struct dfm_reader *, struct ccase *);
369 /* Reads a case from DFM into C, parsing it with PARSER. Returns
370 true if successful, false at end of file or on I/O error.
372 Case C must not be shared. */
374 data_parser_parse (struct data_parser *parser, struct dfm_reader *reader,
379 assert (!case_is_shared (c));
380 assert (data_parser_any_fields (parser));
382 /* Skip the requested number of records before reading the
384 for (; parser->skip_records > 0; parser->skip_records--)
386 if (dfm_eof (reader))
388 dfm_forward_record (reader);
392 if (parser->max_cases != -1 && parser->max_cases-- == 0)
394 if (parser->percent_cases < 100
395 && dfm_get_percent_read (reader) >= parser->percent_cases)
398 if (parser->type == DP_DELIMITED)
401 retval = parse_delimited_span (parser, reader, c);
403 retval = parse_delimited_no_span (parser, reader, c);
406 retval = parse_fixed (parser, reader, c);
411 /* Extracts a delimited field from the current position in the
412 current record according to PARSER, reading data from READER.
414 *FIELD is set to the field content. The caller must not or
415 destroy this constant string.
417 After parsing the field, sets the current position in the
418 record to just past the field and any trailing delimiter.
419 Returns 0 on failure or a 1-based column number indicating the
420 beginning of the field on success. */
422 cut_field (const struct data_parser *parser, struct dfm_reader *reader,
423 int *first_column, int *last_column, struct string *tmp,
424 struct substring *field)
426 struct substring line, p;
428 if (dfm_eof (reader))
430 if (ss_is_empty (parser->hard_seps))
431 dfm_expand_tabs (reader);
432 line = p = dfm_get_record (reader);
434 /* Skip leading soft separators. */
435 ss_ltrim (&p, parser->soft_seps);
437 /* Handle empty or completely consumed lines. */
440 if (!parser->empty_line_has_field || dfm_columns_past_end (reader) > 0)
445 *first_column = dfm_column_start (reader);
446 *last_column = *first_column + 1;
447 dfm_forward_columns (reader, 1);
452 *first_column = dfm_column_start (reader);
453 if (ss_find_byte (parser->quotes, ss_first (p)) != SIZE_MAX)
456 int quote = ss_get_byte (&p);
457 if (!ss_get_until (&p, quote, field))
458 msg (DW, _("Quoted string extends beyond end of line."));
459 if (parser->quote_escape && ss_first (p) == quote)
461 ds_assign_substring (tmp, *field);
462 while (ss_match_byte (&p, quote))
465 ds_put_byte (tmp, quote);
466 if (!ss_get_until (&p, quote, &ss))
467 msg (DW, _("Quoted string extends beyond end of line."));
468 ds_put_substring (tmp, ss);
470 *field = ds_ss (tmp);
472 *last_column = *first_column + (ss_length (line) - ss_length (p));
474 /* Skip trailing soft separator and a single hard separator
476 if (!ss_is_empty (p))
478 size_t n_seps = ss_ltrim (&p, parser->soft_seps);
480 && ss_find_byte (parser->hard_seps, ss_first (p)) != SIZE_MAX)
486 msg (DW, _("Missing delimiter following quoted string."));
492 ss_get_bytes (&p, ss_cspan (p, ds_ss (&parser->any_sep)), field);
493 *last_column = *first_column + ss_length (*field);
495 if (!ss_ltrim (&p, parser->soft_seps) || ss_is_empty (p)
496 || ss_find_byte (parser->hard_seps, p.string[0]) != SIZE_MAX)
498 /* Advance past a trailing hard separator,
499 regardless of whether one actually existed. If
500 we "skip" a delimiter that was not actually
501 there, then we will return end-of-line on our
502 next call, which is what we want. */
503 dfm_forward_columns (reader, 1);
506 dfm_forward_columns (reader, ss_length (line) - ss_length (p));
512 parse_error (const struct dfm_reader *reader, const struct field *field,
513 int first_column, int last_column, char *error)
517 m.category = MSG_C_DATA;
518 m.severity = MSG_S_WARNING;
519 m.file_name = CONST_CAST (char *, dfm_get_file_name (reader));
520 m.first_line = dfm_get_line_number (reader);
521 m.last_line = m.first_line + 1;
522 m.first_column = first_column;
523 m.last_column = last_column;
524 m.text = xasprintf (_("Data for variable %s is not valid as format %s: %s"),
525 field->name, fmt_name (field->format.type), error);
531 /* Reads a case from READER into C, parsing it according to
532 fixed-format syntax rules in PARSER.
533 Returns true if successful, false at end of file or on I/O error. */
535 parse_fixed (const struct data_parser *parser, struct dfm_reader *reader,
538 const char *input_encoding = dfm_reader_get_encoding (reader);
539 const char *output_encoding = dict_get_encoding (parser->dict);
543 if (dfm_eof (reader))
547 for (row = 1; row <= parser->records_per_case; row++)
549 struct substring line;
551 if (dfm_eof (reader))
553 msg (DW, _("Partial case of %d of %d records discarded."),
554 row - 1, parser->records_per_case);
557 dfm_expand_tabs (reader);
558 line = dfm_get_record (reader);
560 for (; f < &parser->fields[parser->field_cnt] && f->record == row; f++)
562 struct substring s = ss_substr (line, f->first_column - 1,
564 union value *value = case_data_rw_idx (c, f->case_idx);
565 char *error = data_in (s, input_encoding, f->format.type,
566 value, fmt_var_width (&f->format),
570 data_in_imply_decimals (s, input_encoding, f->format.type,
573 parse_error (reader, f, f->first_column,
574 f->first_column + f->format.w, error);
577 dfm_forward_record (reader);
583 /* Reads a case from READER into C, parsing it according to
584 free-format syntax rules in PARSER.
585 Returns true if successful, false at end of file or on I/O error. */
587 parse_delimited_span (const struct data_parser *parser,
588 struct dfm_reader *reader, struct ccase *c)
590 const char *input_encoding = dfm_reader_get_encoding (reader);
591 const char *output_encoding = dict_get_encoding (parser->dict);
592 struct string tmp = DS_EMPTY_INITIALIZER;
595 for (f = parser->fields; f < &parser->fields[parser->field_cnt]; f++)
598 int first_column, last_column;
601 /* Cut out a field and read in a new record if necessary. */
602 while (!cut_field (parser, reader,
603 &first_column, &last_column, &tmp, &s))
605 if (!dfm_eof (reader))
606 dfm_forward_record (reader);
607 if (dfm_eof (reader))
609 if (f > parser->fields)
610 msg (DW, _("Partial case discarded. The first variable "
611 "missing was %s."), f->name);
617 error = data_in (s, input_encoding, f->format.type,
618 case_data_rw_idx (c, f->case_idx),
619 fmt_var_width (&f->format), output_encoding);
621 parse_error (reader, f, first_column, last_column, error);
627 /* Reads a case from READER into C, parsing it according to
628 delimited syntax rules with one case per record in PARSER.
629 Returns true if successful, false at end of file or on I/O error. */
631 parse_delimited_no_span (const struct data_parser *parser,
632 struct dfm_reader *reader, struct ccase *c)
634 const char *input_encoding = dfm_reader_get_encoding (reader);
635 const char *output_encoding = dict_get_encoding (parser->dict);
636 struct string tmp = DS_EMPTY_INITIALIZER;
638 struct field *f, *end;
640 if (dfm_eof (reader))
643 end = &parser->fields[parser->field_cnt];
644 for (f = parser->fields; f < end; f++)
646 int first_column, last_column;
649 if (!cut_field (parser, reader, &first_column, &last_column, &tmp, &s))
651 if (f < end - 1 && settings_get_undefined ())
652 msg (DW, _("Missing value(s) for all variables from %s onward. "
653 "These will be filled with the system-missing value "
654 "or blanks, as appropriate."),
657 value_set_missing (case_data_rw_idx (c, f->case_idx),
658 fmt_var_width (&f->format));
662 error = data_in (s, input_encoding, f->format.type,
663 case_data_rw_idx (c, f->case_idx),
664 fmt_var_width (&f->format), output_encoding);
666 parse_error (reader, f, first_column, last_column, error);
669 s = dfm_get_record (reader);
670 ss_ltrim (&s, parser->soft_seps);
671 if (!ss_is_empty (s))
672 msg (DW, _("Record ends in data not part of any field."));
675 dfm_forward_record (reader);
680 /* Displays a table giving information on fixed-format variable
681 parsing on DATA LIST. */
683 dump_fixed_table (const struct data_parser *parser,
684 const struct file_handle *fh)
689 t = tab_create (4, parser->field_cnt + 1);
690 tab_headers (t, 0, 0, 1, 0);
691 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Variable"));
692 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Record"));
693 tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Columns"));
694 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Format"));
695 tab_box (t, TAL_1, TAL_1, TAL_0, TAL_1, 0, 0, 3, parser->field_cnt);
696 tab_hline (t, TAL_2, 0, 3, 1);
698 for (i = 0; i < parser->field_cnt; i++)
700 struct field *f = &parser->fields[i];
701 char fmt_string[FMT_STRING_LEN_MAX + 1];
704 tab_text (t, 0, row, TAB_LEFT, f->name);
705 tab_text_format (t, 1, row, 0, "%d", f->record);
706 tab_text_format (t, 2, row, 0, "%3d-%3d",
707 f->first_column, f->first_column + f->format.w - 1);
708 tab_text (t, 3, row, TAB_LEFT | TAB_FIX,
709 fmt_to_string (&f->format, fmt_string));
712 tab_title (t, ngettext ("Reading %d record from %s.",
713 "Reading %d records from %s.",
714 parser->records_per_case),
715 parser->records_per_case, fh_get_name (fh));
719 /* Displays a table giving information on free-format variable parsing
722 dump_delimited_table (const struct data_parser *parser,
723 const struct file_handle *fh)
728 t = tab_create (2, parser->field_cnt + 1);
729 tab_headers (t, 0, 0, 1, 0);
730 tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Variable"));
731 tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Format"));
732 tab_box (t, TAL_1, TAL_1, TAL_0, TAL_1, 0, 0, 1, parser->field_cnt);
733 tab_hline (t, TAL_2, 0, 1, 1);
735 for (i = 0; i < parser->field_cnt; i++)
737 struct field *f = &parser->fields[i];
738 char str[FMT_STRING_LEN_MAX + 1];
741 tab_text (t, 0, row, TAB_LEFT, f->name);
742 tab_text (t, 1, row, TAB_LEFT | TAB_FIX,
743 fmt_to_string (&f->format, str));
746 tab_title (t, _("Reading free-form data from %s."), fh_get_name (fh));
751 /* Displays a table giving information on how PARSER will read
754 data_parser_output_description (struct data_parser *parser,
755 const struct file_handle *fh)
757 if (parser->type == DP_FIXED)
758 dump_fixed_table (parser, fh);
760 dump_delimited_table (parser, fh);
/* Data parser input program: the state behind a casereader that
   produces cases by parsing a data file. */
struct data_parser_casereader
  {
    struct data_parser *parser; /* Parser. */
    struct dfm_reader *reader;  /* Data file reader. */
    struct caseproto *proto;    /* Format of cases. */
  };
771 static const struct casereader_class data_parser_casereader_class;
773 /* Replaces DS's active dataset by an input program that reads data
774 from READER according to the rules in PARSER, using DICT as
775 the underlying dictionary. Ownership of PARSER and READER is
776 transferred to the input program, and ownership of DICT is
777 transferred to the dataset. */
779 data_parser_make_active_file (struct data_parser *parser, struct dataset *ds,
780 struct dfm_reader *reader,
781 struct dictionary *dict)
783 struct data_parser_casereader *r;
784 struct casereader *casereader;
786 r = xmalloc (sizeof *r);
789 r->proto = caseproto_ref (dict_get_proto (dict));
790 casereader = casereader_create_sequential (NULL, r->proto,
792 &data_parser_casereader_class, r);
793 dataset_set_dict (ds, dict);
794 dataset_set_source (ds, casereader);
797 static struct ccase *
798 data_parser_casereader_read (struct casereader *reader UNUSED, void *r_)
800 struct data_parser_casereader *r = r_;
801 struct ccase *c = case_create (r->proto);
802 if (data_parser_parse (r->parser, r->reader, c))
812 data_parser_casereader_destroy (struct casereader *reader UNUSED, void *r_)
814 struct data_parser_casereader *r = r_;
815 if (dfm_reader_error (r->reader))
816 casereader_force_error (reader);
817 data_parser_destroy (r->parser);
818 dfm_close_reader (r->reader);
819 caseproto_unref (r->proto);
823 static const struct casereader_class data_parser_casereader_class =
825 data_parser_casereader_read,
826 data_parser_casereader_destroy,