From: Ben Pfaff Date: Mon, 2 Sep 2013 20:00:30 +0000 (-0700) Subject: data-parser: A hard delimiter at end of line is not an empty field. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=254183e467ae32e6581fafc798a057f55e34773e;p=pspp data-parser: A hard delimiter at end of line is not an empty field. Until now, DATA LIST FREE(',') treated input like this: 1,2, 3,4, as describing six fields, two of which were blank at the end of a line. A bug report and subsequent experiments showed that in fact this is only four fields, none of which is blank. This commit fixes the problem and adds a new test that should avoid regression against this issue. Reported by Samuel Afflerbach. --- diff --git a/src/language/data-io/data-parser.c b/src/language/data-io/data-parser.c index 958326e48d..55c62e08c0 100644 --- a/src/language/data-io/data-parser.c +++ b/src/language/data-io/data-parser.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2007, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. + Copyright (C) 2007, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -423,7 +423,9 @@ cut_field (const struct data_parser *parser, struct dfm_reader *reader, int *first_column, int *last_column, struct string *tmp, struct substring *field) { + size_t length_before_separators; struct substring line, p; + bool quoted; if (dfm_eof (reader)) return false; @@ -450,7 +452,8 @@ cut_field (const struct data_parser *parser, struct dfm_reader *reader, } *first_column = dfm_column_start (reader); - if (ss_find_byte (parser->quotes, ss_first (p)) != SIZE_MAX) + quoted = ss_find_byte (parser->quotes, ss_first (p)) != SIZE_MAX; + if (quoted) { /* Quoted field. */ int quote = ss_get_byte (&p); @@ -470,39 +473,27 @@ cut_field (const struct data_parser *parser, struct dfm_reader *reader, *field = ds_ss (tmp); } *last_column = *first_column + (ss_length (line) - ss_length (p)); - - /* Skip trailing soft separator and a single hard separator - if present. */ - if (!ss_is_empty (p)) - { - size_t n_seps = ss_ltrim (&p, parser->soft_seps); - if (!ss_is_empty (p) - && ss_find_byte (parser->hard_seps, ss_first (p)) != SIZE_MAX) - { - ss_advance (&p, 1); - n_seps++; - } - if (!n_seps) - msg (DW, _("Missing delimiter following quoted string.")); - } } else { /* Regular field. */ ss_get_bytes (&p, ss_cspan (p, ds_ss (&parser->any_sep)), field); *last_column = *first_column + ss_length (*field); + } - if (!ss_ltrim (&p, parser->soft_seps) || ss_is_empty (p) - || ss_find_byte (parser->hard_seps, p.string[0]) != SIZE_MAX) - { - /* Advance past a trailing hard separator, - regardless of whether one actually existed. If - we "skip" a delimiter that was not actually - there, then we will return end-of-line on our - next call, which is what we want. */ - dfm_forward_columns (reader, 1); - } + /* Skip trailing soft separator and a single hard separator if present. */ + length_before_separators = ss_length (p); + ss_ltrim (&p, parser->soft_seps); + if (!ss_is_empty (p) + && ss_find_byte (parser->hard_seps, ss_first (p)) != SIZE_MAX) + { + ss_advance (&p, 1); + ss_ltrim (&p, parser->soft_seps); } + if (ss_is_empty (p)) + dfm_forward_columns (reader, 1); + else if (quoted && length_before_separators == ss_length (p)) + msg (DW, _("Missing delimiter following quoted string.")); dfm_forward_columns (reader, ss_length (line) - ss_length (p)); return true; diff --git a/tests/language/data-io/data-list.at b/tests/language/data-io/data-list.at index 4a59cf795a..b70e3904f5 100644 --- a/tests/language/data-io/data-list.at +++ b/tests/language/data-io/data-list.at @@ -96,7 +96,7 @@ AT_CLEANUP AT_SETUP([DATA LIST LIST with SKIP and tab delimiter]) AT_DATA([data-list.pspp], [dnl -data list free (tab) skip=2/A B C D. +data list list (tab) notable skip=2/A B C D. begin data. # These records # are skipped. @@ -141,6 +141,69 @@ A,B,C,D ]) AT_CLEANUP +dnl Results of this test were confirmed with SPSS 21: +dnl http://lists.gnu.org/archive/html/pspp-dev/2013-09/msg00003.html +AT_SETUP([DATA LIST FREE with explicit delimiter at end of line]) +AT_DATA([data-list.pspp], [dnl +DATA LIST FREE(',')/x y z. +BEGIN DATA. +1,2,3 +4,5,6 +7,8,9 +END DATA. +LIST. + +DATA LIST FREE(',')/x y z. +BEGIN DATA. +11,12,13, +14,15,16, +17,18,19, +END DATA. +LIST. + +DATA LIST FREE(TAB)/x y z. +BEGIN DATA. +21 22 23 +24 25 26 +27 28 29 +END DATA. +LIST. + +DATA LIST FREE(TAB)/x y z. +BEGIN DATA. +31 32 33 @&t@ +34 35 36 @&t@ +37 38 39 @&t@ +END DATA. +LIST. +]) +AT_CHECK([pspp -O format=csv data-list.pspp], [0], [dnl +Table: Data List +x,y,z +1.00,2.00,3.00 +4.00,5.00,6.00 +7.00,8.00,9.00 + +Table: Data List +x,y,z +11.00,12.00,13.00 +14.00,15.00,16.00 +17.00,18.00,19.00 + +Table: Data List +x,y,z +21.00,22.00,23.00 +24.00,25.00,26.00 +27.00,28.00,29.00 + +Table: Data List +x,y,z +31.00,32.00,33.00 +34.00,35.00,36.00 +37.00,38.00,39.00 +]) +AT_CLEANUP + AT_SETUP([DATA LIST FIXED with multiple records per case]) AT_DATA([data-list.pspp], [dnl data list fixed notable diff --git a/tests/language/data-io/get-data-txt.at b/tests/language/data-io/get-data-txt.at index 3ba508c7c4..f18fc978f8 100644 --- a/tests/language/data-io/get-data-txt.at +++ b/tests/language/data-io/get-data-txt.at @@ -36,9 +36,12 @@ begin data. ,4,,5 6 7, + 8 9 0,1,,, + ,,,, + 2 3 @@ -47,7 +50,7 @@ begin data. end data. list. ]) -AT_CHECK([pspp -o pspp.csv get-data.sps]) +AT_CHECK([pspp -o pspp.csv get-data.sps], [0], [ignore]) AT_CHECK([cat pspp.csv], [0], [dnl Table: Data List A,B,C,D @@ -69,21 +72,21 @@ begin data. # These records # are skipped. 1 2 3 4 -1 2 3 @&t@ +1 2 3 4 @&t@ 1 2 4 -1 2 @&t@ +1 2 4 @&t@ 1 3 4 -1 3 @&t@ +1 3 4 @&t@ 1 4 -1 @&t@ +1 4 @&t@ 2 3 4 - 2 3 @&t@ + 2 3 4 @&t@ 2 4 - 2 @&t@ + 2 4 @&t@ 3 4 - 3 @&t@ + 3 4 @&t@ 4 - @&t@ + 4 @&t@ end data. list. ]) @@ -92,21 +95,21 @@ AT_CHECK([cat pspp.csv], [0], [dnl Table: Data List A,B,C,D 1.00,2.00,3.00,4.00 -1.00,2.00,3.00,. @&t@ +1.00,2.00,3.00,4.00 +1.00,2.00,. ,4.00 1.00,2.00,. ,4.00 -1.00,2.00,. ,. @&t@ 1.00,. ,3.00,4.00 -1.00,. ,3.00,. @&t@ +1.00,. ,3.00,4.00 1.00,. ,. ,4.00 -1.00,. ,. ,. @&t@ +1.00,. ,. ,4.00 +. ,2.00,3.00,4.00 . ,2.00,3.00,4.00 -. ,2.00,3.00,. @&t@ . ,2.00,. ,4.00 -. ,2.00,. ,. @&t@ +. ,2.00,. ,4.00 +. ,. ,3.00,4.00 . ,. ,3.00,4.00 -. ,. ,3.00,. @&t@ . ,. ,. ,4.00 -. ,. ,. ,. @&t@ +. ,. ,. ,4.00 ]) AT_CLEANUP