X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fdata-io%2Fdata-parser.c;h=2f24b8a23886877be6dbde22f680b209b9d4108f;hb=65d602ed236d685ffec00ad1552a193cf47b2e4d;hp=1dc7c93f7778f9cc266f701a46634be448655e74;hpb=2814862a2c45a39f9822cf4c64ca3884822d064d;p=pspp diff --git a/src/language/data-io/data-parser.c b/src/language/data-io/data-parser.c index 1dc7c93f77..2f24b8a238 100644 --- a/src/language/data-io/data-parser.c +++ b/src/language/data-io/data-parser.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2007, 2009, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 2007, 2009, 2010, 2011, 2012, 2013, 2016 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -31,21 +31,20 @@ #include "language/data-io/data-reader.h" #include "libpspp/message.h" #include "libpspp/str.h" -#include "output/tab.h" +#include "output/pivot-table.h" #include "gl/xalloc.h" #include "gettext.h" +#define N_(msgid) msgid #define _(msgid) gettext (msgid) /* Data parser for textual data like that read by DATA LIST. */ struct data_parser { - const struct dictionary *dict; /*Dictionary of destination */ + struct dictionary *dict; /* Dictionary of destination */ enum data_parser_type type; /* Type of data to parse. */ int skip_records; /* Records to skip before first real data. */ - casenumber max_cases; /* Max number of cases to read. */ - int percent_cases; /* Approximate percent of cases to read. */ struct field *fields; /* Fields to parse. */ size_t field_cnt; /* Number of fields. */ @@ -54,6 +53,7 @@ struct data_parser /* DP_DELIMITED parsers only. */ bool span; /* May cases span multiple records? */ bool empty_line_has_field; /* Does an empty line have an (empty) field? */ + bool warn_missing_fields; /* Should missing fields be considered errors? */ struct substring quotes; /* Characters that can quote separators. */ bool quote_escape; /* Doubled quote acts as escape? */ struct substring soft_seps; /* Two soft separators act like just one. */ @@ -80,22 +80,21 @@ static void set_any_sep (struct data_parser *parser); /* Creates and returns a new data parser. */ struct data_parser * -data_parser_create (const struct dictionary *dict) +data_parser_create (struct dictionary *dict) { struct data_parser *parser = xmalloc (sizeof *parser); parser->type = DP_FIXED; parser->skip_records = 0; - parser->max_cases = -1; - parser->percent_cases = 100; parser->fields = NULL; parser->field_cnt = 0; parser->field_allocated = 0; - parser->dict = dict; + parser->dict = dict_ref (dict); parser->span = true; parser->empty_line_has_field = false; + parser->warn_missing_fields = true; ss_alloc_substring (&parser->quotes, ss_cstr ("\"'")); parser->quote_escape = false; ss_alloc_substring (&parser->soft_seps, ss_cstr (CC_SPACES)); @@ -116,6 +115,7 @@ data_parser_destroy (struct data_parser *parser) { size_t i; + dict_unref (parser->dict); for (i = 0; i < parser->field_cnt; i++) free (parser->fields[i].name); free (parser->fields); @@ -154,24 +154,6 @@ data_parser_set_skip (struct data_parser *parser, int initial_records_to_skip) parser->skip_records = initial_records_to_skip; } -/* Sets the maximum number of cases parsed by PARSER to - MAX_CASES. The default is -1, meaning no limit. */ -void -data_parser_set_case_limit (struct data_parser *parser, casenumber max_cases) -{ - parser->max_cases = max_cases; -} - -/* Sets the percentage of cases that PARSER should read from the - input file to PERCENT_CASES. By default, all cases are - read. */ -void -data_parser_set_case_percent (struct data_parser *parser, int percent_cases) -{ - assert (percent_cases >= 0 && percent_cases <= 100); - parser->percent_cases = percent_cases; -} - /* Returns true if PARSER is configured to allow cases to span multiple records. */ bool @@ -208,6 +190,21 @@ data_parser_set_empty_line_has_field (struct data_parser *parser, parser->empty_line_has_field = empty_line_has_field; } + +/* If WARN_MISSING_FIELDS is true, configures PARSER to emit a warning + and cause an error condition when a missing field is encountered. + If WARN_MISSING_FIELDS is false, PARSER will silently fill such + fields with the system missing value. + + This setting affects parsing of DP_DELIMITED files only. */ +void +data_parser_set_warn_missing_fields (struct data_parser *parser, + bool warn_missing_fields) +{ + parser->warn_missing_fields = warn_missing_fields; +} + + /* Sets the characters that may be used for quoting field contents to QUOTES. If QUOTES is empty, quoting will be disabled. @@ -389,12 +386,6 @@ data_parser_parse (struct data_parser *parser, struct dfm_reader *reader, } /* Limit cases. */ - if (parser->max_cases != -1 && parser->max_cases-- == 0) - return false; - if (parser->percent_cases < 100 - && dfm_get_percent_read (reader) >= parser->percent_cases) - return false; - if (parser->type == DP_DELIMITED) { if (parser->span) @@ -414,16 +405,19 @@ data_parser_parse (struct data_parser *parser, struct dfm_reader *reader, *FIELD is set to the field content. The caller must not or destroy this constant string. - After parsing the field, sets the current position in the - record to just past the field and any trailing delimiter. - Returns 0 on failure or a 1-based column number indicating the - beginning of the field on success. */ + Sets *FIRST_COLUMN to the 1-based column number of the start of + the extracted field, and *LAST_COLUMN to the end of the extracted + field. + + Returns true on success, false on failure. */ static bool cut_field (const struct data_parser *parser, struct dfm_reader *reader, int *first_column, int *last_column, struct string *tmp, struct substring *field) { + size_t length_before_separators; struct substring line, p; + bool quoted; if (dfm_eof (reader)) return false; @@ -450,12 +444,13 @@ cut_field (const struct data_parser *parser, struct dfm_reader *reader, } *first_column = dfm_column_start (reader); - if (ss_find_byte (parser->quotes, ss_first (p)) != SIZE_MAX) + quoted = ss_find_byte (parser->quotes, ss_first (p)) != SIZE_MAX; + if (quoted) { /* Quoted field. */ int quote = ss_get_byte (&p); if (!ss_get_until (&p, quote, field)) - msg (SW, _("Quoted string extends beyond end of line.")); + msg (DW, _("Quoted string extends beyond end of line.")); if (parser->quote_escape && ss_first (p) == quote) { ds_assign_substring (tmp, *field); @@ -464,37 +459,33 @@ cut_field (const struct data_parser *parser, struct dfm_reader *reader, struct substring ss; ds_put_byte (tmp, quote); if (!ss_get_until (&p, quote, &ss)) - msg (SW, _("Quoted string extends beyond end of line.")); + msg (DW, _("Quoted string extends beyond end of line.")); ds_put_substring (tmp, ss); } *field = ds_ss (tmp); } *last_column = *first_column + (ss_length (line) - ss_length (p)); - - /* Skip trailing soft separator and a single hard separator - if present. */ - ss_ltrim (&p, parser->soft_seps); - if (!ss_is_empty (p) - && ss_find_byte (parser->hard_seps, ss_first (p)) != SIZE_MAX) - ss_advance (&p, 1); } else { /* Regular field. */ ss_get_bytes (&p, ss_cspan (p, ds_ss (&parser->any_sep)), field); *last_column = *first_column + ss_length (*field); + } - if (!ss_ltrim (&p, parser->soft_seps) || ss_is_empty (p) - || ss_find_byte (parser->hard_seps, p.string[0]) != SIZE_MAX) - { - /* Advance past a trailing hard separator, - regardless of whether one actually existed. If - we "skip" a delimiter that was not actually - there, then we will return end-of-line on our - next call, which is what we want. */ - dfm_forward_columns (reader, 1); - } + /* Skip trailing soft separator and a single hard separator if present. */ + length_before_separators = ss_length (p); + ss_ltrim (&p, parser->soft_seps); + if (!ss_is_empty (p) + && ss_find_byte (parser->hard_seps, ss_first (p)) != SIZE_MAX) + { + ss_advance (&p, 1); + ss_ltrim (&p, parser->soft_seps); } + if (ss_is_empty (p)) + dfm_forward_columns (reader, 1); + else if (quoted && length_before_separators == ss_length (p)) + msg (DW, _("Missing delimiter following quoted string.")); dfm_forward_columns (reader, ss_length (line) - ss_length (p)); return true; @@ -504,17 +495,17 @@ static void parse_error (const struct dfm_reader *reader, const struct field *field, int first_column, int last_column, char *error) { - struct msg m; - - m.category = MSG_C_DATA; - m.severity = MSG_S_WARNING; - m.file_name = CONST_CAST (char *, dfm_get_file_name (reader)); - m.first_line = dfm_get_line_number (reader); - m.last_line = m.first_line + 1; - m.first_column = first_column; - m.last_column = last_column; - m.text = xasprintf (_("Data for variable %s is not valid as format %s: %s"), - field->name, fmt_name (field->format.type), error); + struct msg m = { + .category = MSG_C_DATA, + .severity = MSG_S_WARNING, + .file_name = CONST_CAST (char *, dfm_get_file_name (reader)), + .first_line = dfm_get_line_number (reader), + .last_line = m.first_line + 1, + .first_column = first_column, + .last_column = last_column, + .text = xasprintf (_("Data for variable %s is not valid as format %s: %s"), + field->name, fmt_name (field->format.type), error), + }; msg_emit (&m); free (error); @@ -542,7 +533,7 @@ parse_fixed (const struct data_parser *parser, struct dfm_reader *reader, if (dfm_eof (reader)) { - msg (SW, _("Partial case of %d of %d records discarded."), + msg (DW, _("Partial case of %d of %d records discarded."), row - 1, parser->records_per_case); return false; } @@ -579,7 +570,6 @@ static bool parse_delimited_span (const struct data_parser *parser, struct dfm_reader *reader, struct ccase *c) { - const char *input_encoding = dfm_reader_get_encoding (reader); const char *output_encoding = dict_get_encoding (parser->dict); struct string tmp = DS_EMPTY_INITIALIZER; struct field *f; @@ -599,13 +589,14 @@ parse_delimited_span (const struct data_parser *parser, if (dfm_eof (reader)) { if (f > parser->fields) - msg (SW, _("Partial case discarded. The first variable " + msg (DW, _("Partial case discarded. The first variable " "missing was %s."), f->name); ds_destroy (&tmp); return false; } } + const char *input_encoding = dfm_reader_get_encoding (reader); error = data_in (s, input_encoding, f->format.type, case_data_rw_idx (c, f->case_idx), fmt_var_width (&f->format), output_encoding); @@ -623,7 +614,6 @@ static bool parse_delimited_no_span (const struct data_parser *parser, struct dfm_reader *reader, struct ccase *c) { - const char *input_encoding = dfm_reader_get_encoding (reader); const char *output_encoding = dict_get_encoding (parser->dict); struct string tmp = DS_EMPTY_INITIALIZER; struct substring s; @@ -640,8 +630,8 @@ parse_delimited_no_span (const struct data_parser *parser, if (!cut_field (parser, reader, &first_column, &last_column, &tmp, &s)) { - if (f < end - 1 && settings_get_undefined ()) - msg (SW, _("Missing value(s) for all variables from %s onward. " + if (f < end - 1 && settings_get_undefined () && parser->warn_missing_fields) + msg (DW, _("Missing value(s) for all variables from %s onward. " "These will be filled with the system-missing value " "or blanks, as appropriate."), f->name); @@ -651,6 +641,7 @@ parse_delimited_no_span (const struct data_parser *parser, goto exit; } + const char *input_encoding = dfm_reader_get_encoding (reader); error = data_in (s, input_encoding, f->format.type, case_data_rw_idx (c, f->case_idx), fmt_var_width (&f->format), output_encoding); @@ -661,7 +652,7 @@ parse_delimited_no_span (const struct data_parser *parser, s = dfm_get_record (reader); ss_ltrim (&s, parser->soft_seps); if (!ss_is_empty (s)) - msg (SW, _("Record ends in data not part of any field.")); + msg (DW, _("Record ends in data not part of any field.")); exit: dfm_forward_record (reader); @@ -675,37 +666,48 @@ static void dump_fixed_table (const struct data_parser *parser, const struct file_handle *fh) { - struct tab_table *t; - size_t i; - - t = tab_create (4, parser->field_cnt + 1); - tab_headers (t, 0, 0, 1, 0); - tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Variable")); - tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Record")); - tab_text (t, 2, 0, TAB_CENTER | TAT_TITLE, _("Columns")); - tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Format")); - tab_box (t, TAL_1, TAL_1, TAL_0, TAL_1, 0, 0, 3, parser->field_cnt); - tab_hline (t, TAL_2, 0, 3, 1); - - for (i = 0; i < parser->field_cnt; i++) + /* XXX This should not be preformatted. */ + char *title = xasprintf (ngettext ("Reading %d record from %s.", + "Reading %d records from %s.", + parser->records_per_case), + parser->records_per_case, fh_get_name (fh)); + struct pivot_table *table = pivot_table_create__ ( + pivot_value_new_user_text (title, -1), "Fixed Data Records"); + free (title); + + pivot_dimension_create ( + table, PIVOT_AXIS_COLUMN, N_("Attributes"), + N_("Record"), N_("Columns"), N_("Format")); + + struct pivot_dimension *variables = pivot_dimension_create ( + table, PIVOT_AXIS_ROW, N_("Variable")); + variables->root->show_label = true; + for (size_t i = 0; i < parser->field_cnt; i++) { struct field *f = &parser->fields[i]; - char fmt_string[FMT_STRING_LEN_MAX + 1]; - int row = i + 1; - - tab_text (t, 0, row, TAB_LEFT, f->name); - tab_text_format (t, 1, row, 0, "%d", f->record); - tab_text_format (t, 2, row, 0, "%3d-%3d", - f->first_column, f->first_column + f->format.w - 1); - tab_text (t, 3, row, TAB_LEFT | TAB_FIX, - fmt_to_string (&f->format, fmt_string)); + + /* XXX It would be better to have the actual variable here. */ + int variable_idx = pivot_category_create_leaf ( + variables->root, pivot_value_new_user_text (f->name, -1)); + + pivot_table_put2 (table, 0, variable_idx, + pivot_value_new_integer (f->record)); + + int first_column = f->first_column; + int last_column = f->first_column + f->format.w - 1; + char *columns = xasprintf ("%d-%d", first_column, last_column); + pivot_table_put2 (table, 1, variable_idx, + pivot_value_new_user_text (columns, -1)); + free (columns); + + char str[FMT_STRING_LEN_MAX + 1]; + pivot_table_put2 (table, 2, variable_idx, + pivot_value_new_user_text ( + fmt_to_string (&f->format, str), -1)); + } - tab_title (t, ngettext ("Reading %d record from %s.", - "Reading %d records from %s.", - parser->records_per_case), - parser->records_per_case, fh_get_name (fh)); - tab_submit (t); + pivot_table_submit (table); } /* Displays a table giving information on free-format variable parsing @@ -714,30 +716,32 @@ static void dump_delimited_table (const struct data_parser *parser, const struct file_handle *fh) { - struct tab_table *t; - size_t i; + struct pivot_table *table = pivot_table_create__ ( + pivot_value_new_text_format (N_("Reading free-form data from %s."), + fh_get_name (fh)), + "Free-Form Data Records"); - t = tab_create (2, parser->field_cnt + 1); - tab_headers (t, 0, 0, 1, 0); - tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Variable")); - tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Format")); - tab_box (t, TAL_1, TAL_1, TAL_0, TAL_1, 0, 0, 1, parser->field_cnt); - tab_hline (t, TAL_2, 0, 1, 1); + pivot_dimension_create ( + table, PIVOT_AXIS_COLUMN, N_("Attributes"), N_("Format")); - for (i = 0; i < parser->field_cnt; i++) + struct pivot_dimension *variables = pivot_dimension_create ( + table, PIVOT_AXIS_ROW, N_("Variable")); + variables->root->show_label = true; + for (size_t i = 0; i < parser->field_cnt; i++) { struct field *f = &parser->fields[i]; - char str[FMT_STRING_LEN_MAX + 1]; - int row = i + 1; - tab_text (t, 0, row, TAB_LEFT, f->name); - tab_text (t, 1, row, TAB_LEFT | TAB_FIX, - fmt_to_string (&f->format, str)); - } + /* XXX It would be better to have the actual variable here. */ + int variable_idx = pivot_category_create_leaf ( + variables->root, pivot_value_new_user_text (f->name, -1)); - tab_title (t, _("Reading free-form data from %s."), fh_get_name (fh)); + char str[FMT_STRING_LEN_MAX + 1]; + pivot_table_put2 (table, 0, variable_idx, + pivot_value_new_user_text ( + fmt_to_string (&f->format, str), -1)); + } - tab_submit (t); + pivot_table_submit (table); } /* Displays a table giving information on how PARSER will read @@ -769,23 +773,35 @@ static const struct casereader_class data_parser_casereader_class; transferred to the dataset. */ void data_parser_make_active_file (struct data_parser *parser, struct dataset *ds, - struct dfm_reader *reader, - struct dictionary *dict) + struct dfm_reader *reader, + struct dictionary *dict, + struct casereader* (*func)(struct casereader *, + const struct dictionary *, + void *), + void *ud) { struct data_parser_casereader *r; - struct casereader *casereader; + struct casereader *casereader0; + struct casereader *casereader1; r = xmalloc (sizeof *r); r->parser = parser; r->reader = reader; r->proto = caseproto_ref (dict_get_proto (dict)); - casereader = casereader_create_sequential (NULL, r->proto, + casereader0 = casereader_create_sequential (NULL, r->proto, CASENUMBER_MAX, &data_parser_casereader_class, r); + + if (func) + casereader1 = func (casereader0, dict, ud); + else + casereader1 = casereader0; + dataset_set_dict (ds, dict); - dataset_set_source (ds, casereader); + dataset_set_source (ds, casereader1); } + static struct ccase * data_parser_casereader_read (struct casereader *reader UNUSED, void *r_) { @@ -801,14 +817,14 @@ data_parser_casereader_read (struct casereader *reader UNUSED, void *r_) } static void -data_parser_casereader_destroy (struct casereader *reader UNUSED, void *r_) +data_parser_casereader_destroy (struct casereader *reader, void *r_) { struct data_parser_casereader *r = r_; if (dfm_reader_error (r->reader)) casereader_force_error (reader); - data_parser_destroy (r->parser); dfm_close_reader (r->reader); caseproto_unref (r->proto); + data_parser_destroy (r->parser); free (r); }