From dacfe37faf6837e1e69b75e0f3791f06a3efa68d Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sun, 10 Feb 2008 08:17:49 +0000 Subject: [PATCH] Add a couple of extensions to GET DATA TYPE=TXT. Patch #6412. Thanks to John Darrington for review. --- doc/files.texi | 39 +++++++++------ src/data/ChangeLog | 20 ++++++++ src/data/data-in.c | 17 ++++--- src/data/data-in.h | 3 +- src/language/data-io/data-parser.c | 66 ++++++++++++++++++++----- src/language/data-io/data-parser.h | 1 + src/language/data-io/get-data.c | 16 +++++- src/language/expressions/operations.def | 2 +- src/language/lexer/range-parser.c | 2 +- src/language/xforms/recode.c | 2 +- src/ui/gui/ChangeLog | 15 ++++++ src/ui/gui/find-dialog.c | 2 +- src/ui/gui/helper.c | 2 +- src/ui/gui/psppire-case-file.c | 2 +- tests/ChangeLog | 6 +++ tests/command/get-data-txt-examples.sh | 26 +++++----- 16 files changed, 165 insertions(+), 56 deletions(-) diff --git a/doc/files.texi b/doc/files.texi index ded40e7b..30a023ae 100644 --- a/doc/files.texi +++ b/doc/files.texi @@ -385,7 +385,7 @@ GET DATA /TYPE=TXT [/IMPORTCASE=@{ALL,FIRST max_cases,PERCENT percent@}] /DELIMITERS="delimiters" - [/QUALIFIER="quote"] + [/QUALIFIER="quotes" [/ESCAPE]] [/DELCASE=@{LINE,VARIABLES n_variables@}] /VARIABLES=del_var [del_var]@dots{} where each del_var takes the form: @@ -417,11 +417,22 @@ delimiter, immediately following @samp{\t}. To read a data file in which each field appears on a separate line, specify the empty string for DELIMITERS. -The optional QUALIFIER subcommand names a character that can be used -to quote values within fields in the input. A field that begins with -the specified quote character ends at the next match quote. -Intervening delimiters become part of the field, instead of -terminating it. +The optional QUALIFIER subcommand names one or more characters that +can be used to quote values within fields in the input. A field that +begins with one of the specified quote characters ends at the next +matching quote. 
Intervening delimiters become part of the field, +instead of terminating it. The ability to specify more than one quote +character is a PSPP extension. + +By default, a character specified on QUALIFIER cannot itself be +embedded within a field that it quotes, because the quote character +always terminates the quoted field. With ESCAPE, however, a doubled +quote character within a quoted field inserts a single instance of the +quote into the field. For example, if @samp{'} is specified on +QUALIFIER, then without ESCAPE @code{'a''b'} specifies a pair of +fields that contain @samp{a} and @samp{b}, but with ESCAPE it +specifies a single field that contains @samp{a'b}. ESCAPE is a PSPP +extension. The DELCASE subcommand controls how data may be broken across lines in the data file. With LINE, the default setting, each line must contain @@ -495,12 +506,12 @@ GET DATA /TYPE=TXT /FILE='cars.data' /DELIMITERS=' ' /FIRSTCASE=2 Consider the following information on animals in a pet store: @example -"Pet Name", "Age", "Color", "Date Received", "Price", "Needs Walking", "Type" +'Pet''s Name', "Age", "Color", "Date Received", "Price", "Height", "Type" , (Years), , , (Dollars), , -"Rover", 4.5, Brown, "12 Feb 2004", 80, True, "Dog" -"Charlie", , Gold, "5 Apr 2007", 12.3, False, "Fish" -"Molly", 2, Black, "12 Dec 2006", 25, False, "Cat" -"Gilly", , White, "10 Apr 2007", 10, False, "Guinea Pig" +"Rover", 4.5, Brown, "12 Feb 2004", 80, '1''4"', "Dog" +"Charlie", , Gold, "5 Apr 2007", 12.3, "3""", "Fish" +"Molly", 2, Black, "12 Dec 2006", 25, '5"', "Cat" +"Gilly", , White, "10 Apr 2007", 10, "3""", "Guinea Pig" @end example @noindent @@ -509,15 +520,15 @@ The following syntax can be used to read the pet store data: @c If you change this example, change the regression test in @c tests/command/get-data-txt-examples.sh to match. 
@example -GET DATA /TYPE=TXT /FILE='pets.data' /DELIMITERS=', ' /QUALIFIER='"' +GET DATA /TYPE=TXT /FILE='pets.data' /DELIMITERS=', ' /QUALIFIER='''"' /ESCAPE /FIRSTCASE=3 /VARIABLES=name A10 age F3.1 color A5 received EDATE10 price F5.2 - needs_walking A5 - type A10. + height a5 + type a10. @end example @node GET DATA /TYPE=TXT /ARRANGEMENT=FIXED diff --git a/src/data/ChangeLog b/src/data/ChangeLog index b3980052..a154bd09 100644 --- a/src/data/ChangeLog +++ b/src/data/ChangeLog @@ -1,3 +1,23 @@ +2008-02-09 Ben Pfaff + + Add a couple of extensions to GET DATA TYPE=TXT. Patch #6412. + Thanks to John Darrington for review. + + * data-in.c (data_in): Add new argument to designate the last + column of the data field being parsed, for use in error messages. + Update all callers. + + * data-parser.c (struct data_parser): New member `quote_escape'. + (data_parser_create): Initialize quote_escape. + (data_parser_set_quote_escape): New function. + (cut_field): Support escaped quotes. + (parse_delimited_span): Ditto. + (parse_delimited_no_span): Ditto. + + * get-data.c (parse_get_txt): Support ESCAPE extension subcommand + in enhanced mode. Only support multiple quote characters in + enhanced mode. + 2008-02-06 John Darrington psql-reader.c psql-reader.h: Read more than one tuple at diff --git a/src/data/data-in.c b/src/data/data-in.c index 3f2f8074..96a6ce01 100644 --- a/src/data/data-in.c +++ b/src/data/data-in.c @@ -90,13 +90,16 @@ static int hexit_value (int c); IMPLIED_DECIMALS decimal places are implied. Specify 0 if no decimal places should be implied. - If FIRST_COLUMN is nonzero, then it should be the 1-based - column number of the first character in INPUT, used in error - messages. */ + If FIRST_COLUMN and LAST_COLUMN are nonzero, then they should + be the 1-based column numbers of the first character and + one past the last character in INPUT, for use in error + messages. 
(LAST_COLUMN cannot always be calculated from + FIRST_COLUMN plus the length of the input because of the + possibility of escaped quotes in strings, etc.) */ bool data_in (struct substring input, enum legacy_encoding encoding, enum fmt_type format, int implied_decimals, - int first_column, union value *output, int width) + int first_column, int last_column, union value *output, int width) { static data_in_parser_func *const handlers[FMT_NUMBER_OF_FORMATS] = { @@ -131,7 +134,7 @@ data_in (struct substring input, enum legacy_encoding encoding, i.width = width; i.first_column = first_column; - i.last_column = first_column + ss_length (input) - 1; + i.last_column = last_column; if (!ss_is_empty (i.input)) { @@ -1167,11 +1170,11 @@ vdata_warning (const struct data_in *i, const char *format, va_list args) ds_put_char (&text, '('); if (i->first_column != 0) { - if (i->first_column == i->last_column) + if (i->first_column == i->last_column - 1) ds_put_format (&text, _("column %d"), i->first_column); else ds_put_format (&text, _("columns %d-%d"), - i->first_column, i->last_column); + i->first_column, i->last_column - 1); ds_put_cstr (&text, ", "); } ds_put_format (&text, _("%s field) "), fmt_name (i->format)); diff --git a/src/data/data-in.h b/src/data/data-in.h index 52301c8c..6ba9a588 100644 --- a/src/data/data-in.h +++ b/src/data/data-in.h @@ -27,7 +27,8 @@ union value; bool data_in (struct substring input, enum legacy_encoding, - enum fmt_type, int implied_decimals, int first_column, + enum fmt_type, int implied_decimals, + int first_column, int last_column, union value *output, int width); #endif /* data/data-in.h */ diff --git a/src/language/data-io/data-parser.c b/src/language/data-io/data-parser.c index 9a2ea769..dfc04be4 100644 --- a/src/language/data-io/data-parser.c +++ b/src/language/data-io/data-parser.c @@ -54,6 +54,7 @@ struct data_parser bool span; /* May cases span multiple records? */ bool empty_line_has_field; /* Does an empty line have an (empty) field? 
*/ struct substring quotes; /* Characters that can quote separators. */ + bool quote_escape; /* Doubled quote acts as escape? */ struct substring soft_seps; /* Two soft separators act like just one. */ struct substring hard_seps; /* Two hard separators yield empty fields. */ struct string any_sep; /* Concatenation of soft_seps and hard_seps. */ @@ -94,6 +95,7 @@ data_parser_create (void) parser->span = true; parser->empty_line_has_field = false; ss_alloc_substring (&parser->quotes, ss_cstr ("\"'")); + parser->quote_escape = false; ss_alloc_substring (&parser->soft_seps, ss_cstr (CC_SPACES)); ss_alloc_substring (&parser->hard_seps, ss_cstr (",")); ds_init_empty (&parser->any_sep); @@ -218,6 +220,20 @@ data_parser_set_quotes (struct data_parser *parser, struct substring quotes) ss_alloc_substring (&parser->quotes, quotes); } +/* If ESCAPE is false (the default setting), a character used for + quoting cannot itself be embedded within a quoted field. If + ESCAPE is true, then a quote character can be embedded within + a quoted field by doubling it. + + This setting affects parsing of DP_DELIMITED files only, and + only when at least one quote character has been set (with + data_parser_set_quotes). */ +void +data_parser_set_quote_escape (struct data_parser *parser, bool escape) +{ + parser->quote_escape = escape; +} + /* Sets PARSER's soft delimiters to DELIMITERS. Soft delimiters separate fields, but consecutive soft delimiters do not yield empty fields. (Ordinarily, only white space characters are @@ -401,6 +417,7 @@ data_parser_parse (struct data_parser *parser, struct dfm_reader *reader, beginning of the field on success. 
*/ static bool cut_field (const struct data_parser *parser, struct dfm_reader *reader, + int *first_column, int *last_column, struct string *tmp, struct substring *field) { struct substring line, p; @@ -422,16 +439,34 @@ cut_field (const struct data_parser *parser, struct dfm_reader *reader, else { *field = p; + *first_column = dfm_column_start (reader); + *last_column = *first_column + 1; dfm_forward_columns (reader, 1); return true; } } + *first_column = dfm_column_start (reader); if (ss_find_char (parser->quotes, ss_first (p)) != SIZE_MAX) { /* Quoted field. */ - if (!ss_get_until (&p, ss_get_char (&p), field)) + int quote = ss_get_char (&p); + if (!ss_get_until (&p, quote, field)) msg (SW, _("Quoted string extends beyond end of line.")); + if (parser->quote_escape && ss_first (p) == quote) + { + ds_assign_substring (tmp, *field); + while (ss_match_char (&p, quote)) + { + struct substring ss; + ds_put_char (tmp, quote); + if (!ss_get_until (&p, quote, &ss)) + msg (SW, _("Quoted string extends beyond end of line.")); + ds_put_substring (tmp, ss); + } + *field = ds_ss (tmp); + } + *last_column = dfm_column_start (reader); /* Skip trailing soft separator and a single hard separator if present. */ @@ -444,6 +479,7 @@ cut_field (const struct data_parser *parser, struct dfm_reader *reader, { /* Regular field. 
*/ ss_get_chars (&p, ss_cspan (p, ds_ss (&parser->any_sep)), field); + *last_column = dfm_column_start (reader); if (!ss_ltrim (&p, parser->soft_seps) || ss_is_empty (p)) { /* Advance past a trailing hard separator, @@ -491,7 +527,8 @@ parse_fixed (const struct data_parser *parser, struct dfm_reader *reader, data_in (ss_substr (line, f->first_column - 1, f->format.w), encoding, f->format.type, f->format.d, - f->first_column, case_data_rw_idx (c, f->case_idx), + f->first_column, f->first_column + f->format.w, + case_data_rw_idx (c, f->case_idx), fmt_var_width (&f->format)); dfm_forward_record (reader); @@ -508,14 +545,17 @@ parse_delimited_span (const struct data_parser *parser, struct dfm_reader *reader, struct ccase *c) { enum legacy_encoding encoding = dfm_reader_get_legacy_encoding (reader); + struct string tmp = DS_EMPTY_INITIALIZER; struct field *f; for (f = parser->fields; f < &parser->fields[parser->field_cnt]; f++) { struct substring s; + int first_column, last_column; /* Cut out a field and read in a new record if necessary. */ - while (!cut_field (parser, reader, &s)) + while (!cut_field (parser, reader, + &first_column, &last_column, &tmp, &s)) { if (!dfm_eof (reader)) dfm_forward_record (reader); @@ -524,15 +564,17 @@ parse_delimited_span (const struct data_parser *parser, if (f > parser->fields) msg (SW, _("Partial case discarded. 
The first variable " "missing was %s."), f->name); + ds_destroy (&tmp); return false; } } data_in (s, encoding, f->format.type, 0, - dfm_get_column (reader, ss_data (s)), + first_column, last_column, case_data_rw_idx (c, f->case_idx), fmt_var_width (&f->format)); } + ds_destroy (&tmp); return true; } @@ -544,6 +586,7 @@ parse_delimited_no_span (const struct data_parser *parser, struct dfm_reader *reader, struct ccase *c) { enum legacy_encoding encoding = dfm_reader_get_legacy_encoding (reader); + struct string tmp = DS_EMPTY_INITIALIZER; struct substring s; struct field *f; @@ -552,7 +595,8 @@ parse_delimited_no_span (const struct data_parser *parser, for (f = parser->fields; f < &parser->fields[parser->field_cnt]; f++) { - if (!cut_field (parser, reader, &s)) + int first_column, last_column; + if (!cut_field (parser, reader, &first_column, &last_column, &tmp, &s)) { if (settings_get_undefined ()) msg (SW, _("Missing value(s) for all variables from %s onward. " @@ -560,18 +604,13 @@ parse_delimited_no_span (const struct data_parser *parser, "or blanks, as appropriate."), f->name); for (; f < &parser->fields[parser->field_cnt]; f++) - { - int width = fmt_var_width (&f->format); - if (width == 0) - case_data_rw_idx (c, f->case_idx)->f = SYSMIS; - else - memset (case_data_rw_idx (c, f->case_idx)->s, ' ', width); - } + value_set_missing (case_data_rw_idx (c, f->case_idx), + fmt_var_width (&f->format)); goto exit; } data_in (s, encoding, f->format.type, 0, - dfm_get_column (reader, ss_data (s)), + first_column, last_column, case_data_rw_idx (c, f->case_idx), fmt_var_width (&f->format)); } @@ -583,6 +622,7 @@ parse_delimited_no_span (const struct data_parser *parser, exit: dfm_forward_record (reader); + ds_destroy (&tmp); return true; } diff --git a/src/language/data-io/data-parser.h b/src/language/data-io/data-parser.h index 3ee1be8c..4976d3f4 100644 --- a/src/language/data-io/data-parser.h +++ b/src/language/data-io/data-parser.h @@ -54,6 +54,7 @@ void 
data_parser_set_span (struct data_parser *, bool may_cases_span_records); void data_parser_set_empty_line_has_field (struct data_parser *, bool empty_line_has_field); void data_parser_set_quotes (struct data_parser *, struct substring); +void data_parser_set_quote_escape (struct data_parser *, bool escape); void data_parser_set_soft_delimiters (struct data_parser *, struct substring); void data_parser_set_hard_delimiters (struct data_parser *, struct substring); diff --git a/src/language/data-io/get-data.c b/src/language/data-io/get-data.c index bc09715b..54bb5653 100644 --- a/src/language/data-io/get-data.c +++ b/src/language/data-io/get-data.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -429,18 +430,29 @@ parse_get_txt (struct lexer *lexer, struct dataset *ds) lex_get (lexer); } - else if (lex_match_id (lexer, "QUALIFIER")) + else if (lex_match_id (lexer, "QUALIFIERS")) { - if (!set_type (parser, "QUALIFIER", DP_DELIMITED, &has_type)) + if (!set_type (parser, "QUALIFIERS", DP_DELIMITED, &has_type)) goto error; lex_match (lexer, '='); if (!lex_force_string (lexer)) goto error; + if (settings_get_syntax () == COMPATIBLE + && ds_length (lex_tokstr (lexer)) != 1) + { + msg (SE, _("In compatible syntax mode, the QUALIFIER string " + "must contain exactly one character.")); + goto error; + } + data_parser_set_quotes (parser, ds_ss (lex_tokstr (lexer))); lex_get (lexer); } + else if (settings_get_syntax () == ENHANCED + && lex_match_id (lexer, "ESCAPE")) + data_parser_set_quote_escape (parser, true); else if (lex_match_id (lexer, "VARIABLES")) break; else diff --git a/src/language/expressions/operations.def b/src/language/expressions/operations.def index 3056b886..2ec03c34 100644 --- a/src/language/expressions/operations.def +++ b/src/language/expressions/operations.def @@ -573,7 +573,7 @@ string function RTRIM (string s, string c) function NUMBER (string s, ni_format f) { union value out; - data_in (ss_head (s, f->w), 
LEGACY_NATIVE, f->type, f->d, 0, &out, 0); + data_in (ss_head (s, f->w), LEGACY_NATIVE, f->type, f->d, 0, 0, &out, 0); return out.f; } diff --git a/src/language/lexer/range-parser.c b/src/language/lexer/range-parser.c index 6a3af097..c07dcdb0 100644 --- a/src/language/lexer/range-parser.c +++ b/src/language/lexer/range-parser.c @@ -99,7 +99,7 @@ parse_number (struct lexer *lexer, double *x, const enum fmt_type *format) { union value v; data_in (ds_ss (lex_tokstr (lexer)), LEGACY_NATIVE, - *format, 0, 0, &v, 0); + *format, 0, 0, 0, &v, 0); lex_get (lexer); *x = v.f; if (*x == SYSMIS) diff --git a/src/language/xforms/recode.c b/src/language/xforms/recode.c index 85fcb9ea..fb02c910 100644 --- a/src/language/xforms/recode.c +++ b/src/language/xforms/recode.c @@ -608,7 +608,7 @@ find_src_string (struct recode_trns *trns, const char *value, int width) msg_disable (); match = data_in (ss_buffer (value, width), LEGACY_NATIVE, - FMT_F, 0, 0, &uv, 0); + FMT_F, 0, 0, 0, &uv, 0); msg_enable (); out->value.f = uv.f; break; diff --git a/src/ui/gui/ChangeLog b/src/ui/gui/ChangeLog index 35ed6d86..ded87d2b 100644 --- a/src/ui/gui/ChangeLog +++ b/src/ui/gui/ChangeLog @@ -1,3 +1,18 @@ +2008-02-09 Ben Pfaff + + Consolidate multiple messages into single message dialog. Patch + #6405. Thanks to John Darrington for review. + + * automake.mk (dist_src_ui_gui_psppire_DATA): Add + message-dialog.glade. + + * helper.c (give_help): Use GtkMessageDialog directly instead of + trying to reuse message-dialog code. + + * message-dialog.c: Rewritten. + + * message-dialog.glade: New file. + 2008-02-08 Jason Stover * crosstabs-dialog.c: New file. diff --git a/src/ui/gui/find-dialog.c b/src/ui/gui/find-dialog.c index 088cac8d..84bfa1be 100644 --- a/src/ui/gui/find-dialog.c +++ b/src/ui/gui/find-dialog.c @@ -600,7 +600,7 @@ value_comparator_create (const struct variable *var, const char *target) if ( ! 
data_in (ss_cstr (target), LEGACY_NATIVE, fmt->type, - 0, 0, + 0, 0, 0, vc->pattern, width) ) { free (vc); diff --git a/src/ui/gui/helper.c b/src/ui/gui/helper.c index bc197e63..c96fdded 100644 --- a/src/ui/gui/helper.c +++ b/src/ui/gui/helper.c @@ -91,7 +91,7 @@ text_to_value (const gchar *text, union value *v, } msg_disable (); - ok = data_in (ss_cstr (text), LEGACY_NATIVE, format.type, 0, 0, + ok = data_in (ss_cstr (text), LEGACY_NATIVE, format.type, 0, 0, 0, v, fmt_var_width (&format)); msg_enable (); diff --git a/src/ui/gui/psppire-case-file.c b/src/ui/gui/psppire-case-file.c index 0e5bfb4e..08503ff9 100644 --- a/src/ui/gui/psppire-case-file.c +++ b/src/ui/gui/psppire-case-file.c @@ -366,7 +366,7 @@ psppire_case_file_data_in (PsppireCaseFile *cf, casenumber casenum, gint idx, width = fmt_var_width (fmt); value = xmalloca (value_cnt_from_width (width) * sizeof *value); ok = (datasheet_get_value (cf->datasheet, casenum, idx, value, width) - && data_in (input, LEGACY_NATIVE, fmt->type, 0, 0, value, width) + && data_in (input, LEGACY_NATIVE, fmt->type, 0, 0, 0, value, width) && datasheet_put_value (cf->datasheet, casenum, idx, value, width)); if (ok) diff --git a/tests/ChangeLog b/tests/ChangeLog index fa30dd50..fce8b1bf 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,3 +1,9 @@ +2008-02-10 Ben Pfaff + + * command/get-data-txt-examples.sh: Update to match changes to + documentation (which were in turn updated to show how the escaped + quote feature works). + 2008-02-02 Ben Pfaff * automake.mk: Add target for dissect-sysfile. diff --git a/tests/command/get-data-txt-examples.sh b/tests/command/get-data-txt-examples.sh index 56cbd3b3..d9fab750 100755 --- a/tests/command/get-data-txt-examples.sh +++ b/tests/command/get-data-txt-examples.sh @@ -75,12 +75,12 @@ if [ $? 
-ne 0 ] ; then no_result ; fi activity="create pets.data" cat > pets.data <<'EOF' -"Pet Name", "Age", "Color", "Date Received", "Price", "Needs Walking", "Type" +'Pet''s Name', "Age", "Color", "Date Received", "Price", "Height", "Type" , (Years), , , (Dollars), , -"Rover", 4.5, Brown, "12 Feb 2004", 80, True, "Dog" -"Charlie", , Gold, "5 Apr 2007", 12.3, False, "Fish" -"Molly", 2, Black, "12 Dec 2006", 25, False, "Cat" -"Gilly", , White, "10 Apr 2007", 10, False, "Guinea Pig" +"Rover", 4.5, Brown, "12 Feb 2004", 80, '1''4"', "Dog" +"Charlie", , Gold, "5 Apr 2007", 12.3, "3""", "Fish" +"Molly", 2, Black, "12 Dec 2006", 25, '5"', "Cat" +"Gilly", , White, "10 Apr 2007", 10, "3""", "Guinea Pig" EOF if [ $? -ne 0 ] ; then no_result ; fi @@ -114,14 +114,14 @@ GET DATA /TYPE=TXT /FILE='cars.data' /ARRANGEMENT=FIXED /FIRSTCASE=2 age 40-47 F. LIST. -GET DATA /TYPE=TXT /FILE='pets.data' /DELIMITERS=', ' /QUALIFIER='"' +GET DATA /TYPE=TXT /FILE='pets.data' /DELIMITERS=', ' /QUALIFIER='''"' /ESCAPE /FIRSTCASE=3 /VARIABLES=name A10 age F3.1 color A5 received EDATE10 price F5.2 - needs_walking a5 + height a5 type a10. LIST. EOF @@ -152,12 +152,12 @@ Civic 2002 29883 15900 Si 2 Civic 2003 13415 15900 EX 1 Civic 1992 107000 3800 n/a 12 Accord 2002 26613 17900 EX 1 - name age color received price needs_walking type ----------- ---- ----- ---------- ------ ------------- ---------- -Rover 4.5 Brown 12.02.2004 80.00 True Dog -Charlie . Gold 05.04.2007 12.30 False Fish -Molly 2.0 Black 12.12.2006 25.00 False Cat -Gilly . White 10.04.2007 10.00 False Guinea Pig + name age color received price height type +---------- ---- ----- ---------- ------ ------ ---------- +Rover 4.5 Brown 12.02.2004 80.00 1'4" Dog +Charlie . Gold 05.04.2007 12.30 3" Fish +Molly 2.0 Black 12.12.2006 25.00 5" Cat +Gilly . White 10.04.2007 10.00 3" Guinea Pig EOF if [ $? -ne 0 ] ; then fail ; fi -- 2.30.2