X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fdata-io%2Fdata-parser.c;h=630363af53f6ce2a077028474dd1d0386bde0fd6;hb=81579d9e9f994fb2908f50af41c3eb033d216e58;hp=9a2ea769b26b76cb95fad86852276354ea2f86ce;hpb=9b94efd7513afdb12a6023024e00e50801532fee;p=pspp-builds.git diff --git a/src/language/data-io/data-parser.c b/src/language/data-io/data-parser.c index 9a2ea769..630363af 100644 --- a/src/language/data-io/data-parser.c +++ b/src/language/data-io/data-parser.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2007 Free Software Foundation, Inc. + Copyright (C) 2007, 2009, 2010, 2011 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,24 +16,24 @@ #include -#include +#include "language/data-io/data-parser.h" #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "data/casereader-provider.h" +#include "data/data-in.h" +#include "data/dictionary.h" +#include "data/format.h" +#include "data/file-handle-def.h" +#include "data/procedure.h" +#include "data/settings.h" +#include "language/data-io/data-reader.h" +#include "libpspp/message.h" +#include "libpspp/str.h" +#include "output/tab.h" -#include "xalloc.h" +#include "gl/xalloc.h" #include "gettext.h" #define _(msgid) gettext (msgid) @@ -41,6 +41,7 @@ /* Data parser for textual data like that read by DATA LIST. */ struct data_parser { + const struct dictionary *dict; /*Dictionary of destination */ enum data_parser_type type; /* Type of data to parse. */ int skip_records; /* Records to skip before first real data. */ casenumber max_cases; /* Max number of cases to read. */ @@ -54,6 +55,7 @@ struct data_parser bool span; /* May cases span multiple records? */ bool empty_line_has_field; /* Does an empty line have an (empty) field? */ struct substring quotes; /* Characters that can quote separators. */ + bool quote_escape; /* Doubled quote acts as escape? */ struct substring soft_seps; /* Two soft separators act like just one. */ struct substring hard_seps; /* Two hard separators yield empty fields. */ struct string any_sep; /* Concatenation of soft_seps and hard_seps. */ @@ -78,7 +80,7 @@ static void set_any_sep (struct data_parser *parser); /* Creates and returns a new data parser. */ struct data_parser * -data_parser_create (void) +data_parser_create (const struct dictionary *dict) { struct data_parser *parser = xmalloc (sizeof *parser); @@ -90,10 +92,12 @@ data_parser_create (void) parser->fields = NULL; parser->field_cnt = 0; parser->field_allocated = 0; + parser->dict = dict; parser->span = true; parser->empty_line_has_field = false; ss_alloc_substring (&parser->quotes, ss_cstr ("\"'")); + parser->quote_escape = false; ss_alloc_substring (&parser->soft_seps, ss_cstr (CC_SPACES)); ss_alloc_substring (&parser->hard_seps, ss_cstr (",")); ds_init_empty (&parser->any_sep); @@ -218,6 +222,20 @@ data_parser_set_quotes (struct data_parser *parser, struct substring quotes) ss_alloc_substring (&parser->quotes, quotes); } +/* If ESCAPE is false (the default setting), a character used for + quoting cannot itself be embedded within a quoted field. If + ESCAPE is true, then a quote character can be embedded within + a quoted field by doubling it. + + This setting affects parsing of DP_DELIMITED files only, and + only when at least one quote character has been set (with + data_parser_set_quotes). */ +void +data_parser_set_quote_escape (struct data_parser *parser, bool escape) +{ + parser->quote_escape = escape; +} + /* Sets PARSER's soft delimiters to DELIMITERS. Soft delimiters separate fields, but consecutive soft delimiters do not yield empty fields. (Ordinarily, only white space characters are @@ -348,14 +366,17 @@ static bool parse_delimited_no_span (const struct data_parser *, static bool parse_fixed (const struct data_parser *, struct dfm_reader *, struct ccase *); -/* Reads a case from DFM into C, parsing it with PARSER. - Returns true if successful, false at end of file or on I/O error. */ +/* Reads a case from DFM into C, parsing it with PARSER. Returns + true if successful, false at end of file or on I/O error. + + Case C must not be shared. */ bool data_parser_parse (struct data_parser *parser, struct dfm_reader *reader, struct ccase *c) { bool retval; + assert (!case_is_shared (c)); assert (data_parser_any_fields (parser)); /* Skip the requested number of records before reading the @@ -374,7 +395,6 @@ data_parser_parse (struct data_parser *parser, struct dfm_reader *reader, && dfm_get_percent_read (reader) >= parser->percent_cases) return false; - dfm_push (reader); if (parser->type == DP_DELIMITED) { if (parser->span) @@ -384,7 +404,6 @@ data_parser_parse (struct data_parser *parser, struct dfm_reader *reader, } else retval = parse_fixed (parser, reader, c); - dfm_pop (reader); return retval; } @@ -401,6 +420,7 @@ data_parser_parse (struct data_parser *parser, struct dfm_reader *reader, beginning of the field on success. */ static bool cut_field (const struct data_parser *parser, struct dfm_reader *reader, + int *first_column, int *last_column, struct string *tmp, struct substring *field) { struct substring line, p; @@ -422,29 +442,50 @@ cut_field (const struct data_parser *parser, struct dfm_reader *reader, else { *field = p; + *first_column = dfm_column_start (reader); + *last_column = *first_column + 1; dfm_forward_columns (reader, 1); return true; } } - if (ss_find_char (parser->quotes, ss_first (p)) != SIZE_MAX) + *first_column = dfm_column_start (reader); + if (ss_find_byte (parser->quotes, ss_first (p)) != SIZE_MAX) { /* Quoted field. */ - if (!ss_get_until (&p, ss_get_char (&p), field)) + int quote = ss_get_byte (&p); + if (!ss_get_until (&p, quote, field)) msg (SW, _("Quoted string extends beyond end of line.")); + if (parser->quote_escape && ss_first (p) == quote) + { + ds_assign_substring (tmp, *field); + while (ss_match_byte (&p, quote)) + { + struct substring ss; + ds_put_byte (tmp, quote); + if (!ss_get_until (&p, quote, &ss)) + msg (SW, _("Quoted string extends beyond end of line.")); + ds_put_substring (tmp, ss); + } + *field = ds_ss (tmp); + } + *last_column = *first_column + (ss_length (line) - ss_length (p)); /* Skip trailing soft separator and a single hard separator if present. */ ss_ltrim (&p, parser->soft_seps); if (!ss_is_empty (p) - && ss_find_char (parser->hard_seps, ss_first (p)) != SIZE_MAX) + && ss_find_byte (parser->hard_seps, ss_first (p)) != SIZE_MAX) ss_advance (&p, 1); } else { /* Regular field. */ - ss_get_chars (&p, ss_cspan (p, ds_ss (&parser->any_sep)), field); - if (!ss_ltrim (&p, parser->soft_seps) || ss_is_empty (p)) + ss_get_bytes (&p, ss_cspan (p, ds_ss (&parser->any_sep)), field); + *last_column = *first_column + ss_length (*field); + + if (!ss_ltrim (&p, parser->soft_seps) || ss_is_empty (p) + || ss_find_byte (parser->hard_seps, p.string[0]) != SIZE_MAX) { /* Advance past a trailing hard separator, regardless of whether one actually existed. If @@ -459,6 +500,25 @@ cut_field (const struct data_parser *parser, struct dfm_reader *reader, return true; } +static void +parse_error (const struct dfm_reader *reader, const struct field *field, + int first_column, int last_column, char *error) +{ + struct msg m; + + m.category = MSG_C_DATA; + m.severity = MSG_S_WARNING; + m.where.file_name = CONST_CAST (char *, dfm_get_file_name (reader)); + m.where.line_number = dfm_get_line_number (reader); + m.where.first_column = first_column; + m.where.last_column = last_column; + m.text = xasprintf (_("Data for variable %s is not valid as format %s: %s"), + field->name, fmt_name (field->format.type), error); + msg_emit (&m); + + free (error); +} + /* Reads a case from READER into C, parsing it according to fixed-format syntax rules in PARSER. Returns true if successful, false at end of file or on I/O error. */ @@ -466,7 +526,8 @@ static bool parse_fixed (const struct data_parser *parser, struct dfm_reader *reader, struct ccase *c) { - enum legacy_encoding encoding = dfm_reader_get_legacy_encoding (reader); + const char *input_encoding = dfm_reader_get_legacy_encoding (reader); + const char *output_encoding = dict_get_encoding (parser->dict); struct field *f; int row; @@ -488,11 +549,21 @@ parse_fixed (const struct data_parser *parser, struct dfm_reader *reader, line = dfm_get_record (reader); for (; f < &parser->fields[parser->field_cnt] && f->record == row; f++) - data_in (ss_substr (line, f->first_column - 1, - f->format.w), - encoding, f->format.type, f->format.d, - f->first_column, case_data_rw_idx (c, f->case_idx), - fmt_var_width (&f->format)); + { + struct substring s = ss_substr (line, f->first_column - 1, + f->format.w); + union value *value = case_data_rw_idx (c, f->case_idx); + char *error = data_in (s, input_encoding, f->format.type, + value, fmt_var_width (&f->format), + output_encoding); + + if (error == NULL) + data_in_imply_decimals (s, input_encoding, f->format.type, + f->format.d, value); + else + parse_error (reader, f, f->first_column, + f->first_column + f->format.w, error); + } dfm_forward_record (reader); } @@ -507,15 +578,20 @@ static bool parse_delimited_span (const struct data_parser *parser, struct dfm_reader *reader, struct ccase *c) { - enum legacy_encoding encoding = dfm_reader_get_legacy_encoding (reader); + const char *input_encoding = dfm_reader_get_legacy_encoding (reader); + const char *output_encoding = dict_get_encoding (parser->dict); + struct string tmp = DS_EMPTY_INITIALIZER; struct field *f; for (f = parser->fields; f < &parser->fields[parser->field_cnt]; f++) { struct substring s; + int first_column, last_column; + char *error; /* Cut out a field and read in a new record if necessary. */ - while (!cut_field (parser, reader, &s)) + while (!cut_field (parser, reader, + &first_column, &last_column, &tmp, &s)) { if (!dfm_eof (reader)) dfm_forward_record (reader); @@ -524,15 +600,18 @@ parse_delimited_span (const struct data_parser *parser, if (f > parser->fields) msg (SW, _("Partial case discarded. The first variable " "missing was %s."), f->name); + ds_destroy (&tmp); return false; } } - data_in (s, encoding, f->format.type, 0, - dfm_get_column (reader, ss_data (s)), - case_data_rw_idx (c, f->case_idx), - fmt_var_width (&f->format)); + error = data_in (s, input_encoding, f->format.type, + case_data_rw_idx (c, f->case_idx), + fmt_var_width (&f->format), output_encoding); + if (error != NULL) + parse_error (reader, f, first_column, last_column, error); } + ds_destroy (&tmp); return true; } @@ -543,37 +622,39 @@ static bool parse_delimited_no_span (const struct data_parser *parser, struct dfm_reader *reader, struct ccase *c) { - enum legacy_encoding encoding = dfm_reader_get_legacy_encoding (reader); + const char *input_encoding = dfm_reader_get_legacy_encoding (reader); + const char *output_encoding = dict_get_encoding (parser->dict); + struct string tmp = DS_EMPTY_INITIALIZER; struct substring s; - struct field *f; + struct field *f, *end; if (dfm_eof (reader)) return false; - for (f = parser->fields; f < &parser->fields[parser->field_cnt]; f++) + end = &parser->fields[parser->field_cnt]; + for (f = parser->fields; f < end; f++) { - if (!cut_field (parser, reader, &s)) + int first_column, last_column; + char *error; + + if (!cut_field (parser, reader, &first_column, &last_column, &tmp, &s)) { - if (settings_get_undefined ()) + if (f < end - 1 && settings_get_undefined ()) msg (SW, _("Missing value(s) for all variables from %s onward. " "These will be filled with the system-missing value " "or blanks, as appropriate."), f->name); - for (; f < &parser->fields[parser->field_cnt]; f++) - { - int width = fmt_var_width (&f->format); - if (width == 0) - case_data_rw_idx (c, f->case_idx)->f = SYSMIS; - else - memset (case_data_rw_idx (c, f->case_idx)->s, ' ', width); - } + for (; f < end; f++) + value_set_missing (case_data_rw_idx (c, f->case_idx), + fmt_var_width (&f->format)); goto exit; } - data_in (s, encoding, f->format.type, 0, - dfm_get_column (reader, ss_data (s)), - case_data_rw_idx (c, f->case_idx), - fmt_var_width (&f->format)); + error = data_in (s, input_encoding, f->format.type, + case_data_rw_idx (c, f->case_idx), + fmt_var_width (&f->format), output_encoding); + if (error != NULL) + parse_error (reader, f, first_column, last_column, error); } s = dfm_get_record (reader); @@ -583,6 +664,7 @@ parse_delimited_no_span (const struct data_parser *parser, exit: dfm_forward_record (reader); + ds_destroy (&tmp); return true; } @@ -595,8 +677,7 @@ dump_fixed_table (const struct data_parser *parser, struct tab_table *t; size_t i; - t = tab_create (4, parser->field_cnt + 1, 0); - tab_columns (t, TAB_COL_DOWN, 1); + t = tab_create (4, parser->field_cnt + 1); tab_headers (t, 0, 0, 1, 0); tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Variable")); tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Record")); @@ -604,7 +685,6 @@ dump_fixed_table (const struct data_parser *parser, tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Format")); tab_box (t, TAL_1, TAL_1, TAL_0, TAL_1, 0, 0, 3, parser->field_cnt); tab_hline (t, TAL_2, 0, 3, 1); - tab_dim (t, tab_natural_dimensions); for (i = 0; i < parser->field_cnt; i++) { @@ -613,9 +693,9 @@ dump_fixed_table (const struct data_parser *parser, int row = i + 1; tab_text (t, 0, row, TAB_LEFT, f->name); - tab_text (t, 1, row, TAT_PRINTF, "%d", f->record); - tab_text (t, 2, row, TAT_PRINTF, "%3d-%3d", - f->first_column, f->first_column + f->format.w - 1); + tab_text_format (t, 1, row, 0, "%d", f->record); + tab_text_format (t, 2, row, 0, "%3d-%3d", + f->first_column, f->first_column + f->format.w - 1); tab_text (t, 3, row, TAB_LEFT | TAB_FIX, fmt_to_string (&f->format, fmt_string)); } @@ -636,14 +716,12 @@ dump_delimited_table (const struct data_parser *parser, struct tab_table *t; size_t i; - t = tab_create (2, parser->field_cnt + 1, 0); - tab_columns (t, TAB_COL_DOWN, 1); + t = tab_create (2, parser->field_cnt + 1); tab_headers (t, 0, 0, 1, 0); tab_text (t, 0, 0, TAB_CENTER | TAT_TITLE, _("Variable")); tab_text (t, 1, 0, TAB_CENTER | TAT_TITLE, _("Format")); tab_box (t, TAL_1, TAL_1, TAL_0, TAL_1, 0, 0, 1, parser->field_cnt); tab_hline (t, TAL_2, 0, 1, 1); - tab_dim (t, tab_natural_dimensions); for (i = 0; i < parser->field_cnt; i++) { @@ -678,7 +756,7 @@ struct data_parser_casereader { struct data_parser *parser; /* Parser. */ struct dfm_reader *reader; /* Data file reader. */ - size_t value_cnt; /* Number of `union value's in case. */ + struct caseproto *proto; /* Format of cases. */ }; static const struct casereader_class data_parser_casereader_class; @@ -699,25 +777,25 @@ data_parser_make_active_file (struct data_parser *parser, struct dataset *ds, r = xmalloc (sizeof *r); r->parser = parser; r->reader = reader; - r->value_cnt = dict_get_next_value_idx (dict); - casereader = casereader_create_sequential (NULL, r->value_cnt, + r->proto = caseproto_ref (dict_get_proto (dict)); + casereader = casereader_create_sequential (NULL, r->proto, CASENUMBER_MAX, &data_parser_casereader_class, r); proc_set_active_file (ds, casereader, dict); } -static bool -data_parser_casereader_read (struct casereader *reader UNUSED, void *r_, - struct ccase *c) +static struct ccase * +data_parser_casereader_read (struct casereader *reader UNUSED, void *r_) { struct data_parser_casereader *r = r_; - bool ok; - - case_create (c, r->value_cnt); - ok = data_parser_parse (r->parser, r->reader, c); - if (!ok) - case_destroy (c); - return ok; + struct ccase *c = case_create (r->proto); + if (data_parser_parse (r->parser, r->reader, c)) + return c; + else + { + case_unref (c); + return NULL; + } } static void @@ -728,6 +806,7 @@ data_parser_casereader_destroy (struct casereader *reader UNUSED, void *r_) casereader_force_error (reader); data_parser_destroy (r->parser); dfm_close_reader (r->reader); + caseproto_unref (r->proto); free (r); }