From 453a25eb0d3ef50989735fc68ae3f17e5b318c8c Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Fri, 27 May 2022 18:29:20 -0700 Subject: [PATCH] gui: For text data import, use the same parser for preview as for import. There were odd discrepancies between the preview code in the GUI for text data import and the code in GET DATA /TYPE=TXT that implemented the eventual import. This commit unifies the two parsers, dropping the one that was used only for preview. This should prevent further discrepancies. Bug #61809. Thanks to Friedrich Beckmann for reporting this bug. --- src/language/data-io/data-parser.c | 127 +++++++++++++------- src/language/data-io/data-parser.h | 4 +- src/ui/gui/psppire-delimited-text.c | 178 ++++++++++------------------ src/ui/gui/psppire-delimited-text.h | 7 +- 4 files changed, 158 insertions(+), 158 deletions(-) diff --git a/src/language/data-io/data-parser.c b/src/language/data-io/data-parser.c index e7896f6bbe..3f19fb6262 100644 --- a/src/language/data-io/data-parser.c +++ b/src/language/data-io/data-parser.c @@ -32,6 +32,7 @@ #include "libpspp/intern.h" #include "libpspp/message.h" #include "libpspp/str.h" +#include "libpspp/string-array.h" #include "output/pivot-table.h" #include "gl/xalloc.h" @@ -399,6 +400,54 @@ data_parser_parse (struct data_parser *parser, struct dfm_reader *reader, return retval; } +static void +cut_field__ (const struct data_parser *parser, const struct substring *line, + struct substring *p, size_t *n_columns, + struct string *tmp, struct substring *field) +{ + bool quoted = ss_find_byte (parser->quotes, ss_first (*p)) != SIZE_MAX; + if (quoted) + { + /* Quoted field. */ + int quote = ss_get_byte (p); + if (!ss_get_until (p, quote, field)) + msg (DW, _("Quoted string extends beyond end of line.")); + if (parser->quote_escape && ss_first (*p) == quote) + { + ds_assign_substring (tmp, *field); + while (ss_match_byte (p, quote)) + { + struct substring ss; + ds_put_byte (tmp, quote); + if (!ss_get_until (p, quote, &ss)) + msg (DW, _("Quoted string extends beyond end of line.")); + ds_put_substring (tmp, ss); + } + *field = ds_ss (tmp); + } + *n_columns = ss_length (*line) - ss_length (*p); + } + else + { + /* Regular field. */ + ss_get_bytes (p, ss_cspan (*p, ds_ss (&parser->any_sep)), field); + *n_columns = ss_length (*field); + } + + /* Skip trailing soft separator and a single hard separator if present. */ + size_t length_before_separators = ss_length (*p); + ss_ltrim (p, parser->soft_seps); + if (!ss_is_empty (*p) + && ss_find_byte (parser->hard_seps, ss_first (*p)) != SIZE_MAX) + { + ss_advance (p, 1); + ss_ltrim (p, parser->soft_seps); + } + + if (!ss_is_empty (*p) && quoted && length_before_separators == ss_length (*p)) + msg (DW, _("Missing delimiter following quoted string.")); +} + /* Extracts a delimited field from the current position in the current record according to PARSER, reading data from READER. @@ -415,9 +464,7 @@ cut_field (const struct data_parser *parser, struct dfm_reader *reader, int *first_column, int *last_column, struct string *tmp, struct substring *field) { - size_t length_before_separators; struct substring line, p; - bool quoted; if (dfm_eof (reader)) return false; @@ -443,49 +490,13 @@ cut_field (const struct data_parser *parser, struct dfm_reader *reader, } } + size_t n_columns; + cut_field__ (parser, &line, &p, &n_columns, tmp, field); *first_column = dfm_column_start (reader); - quoted = ss_find_byte (parser->quotes, ss_first (p)) != SIZE_MAX; - if (quoted) - { - /* Quoted field. */ - int quote = ss_get_byte (&p); - if (!ss_get_until (&p, quote, field)) - msg (DW, _("Quoted string extends beyond end of line.")); - if (parser->quote_escape && ss_first (p) == quote) - { - ds_assign_substring (tmp, *field); - while (ss_match_byte (&p, quote)) - { - struct substring ss; - ds_put_byte (tmp, quote); - if (!ss_get_until (&p, quote, &ss)) - msg (DW, _("Quoted string extends beyond end of line.")); - ds_put_substring (tmp, ss); - } - *field = ds_ss (tmp); - } - *last_column = *first_column + (ss_length (line) - ss_length (p)); - } - else - { - /* Regular field. */ - ss_get_bytes (&p, ss_cspan (p, ds_ss (&parser->any_sep)), field); - *last_column = *first_column + ss_length (*field); - } + *last_column = *first_column + n_columns; - /* Skip trailing soft separator and a single hard separator if present. */ - length_before_separators = ss_length (p); - ss_ltrim (&p, parser->soft_seps); - if (!ss_is_empty (p) - && ss_find_byte (parser->hard_seps, ss_first (p)) != SIZE_MAX) - { - ss_advance (&p, 1); - ss_ltrim (&p, parser->soft_seps); - } if (ss_is_empty (p)) dfm_forward_columns (reader, 1); - else if (quoted && length_before_separators == ss_length (p)) - msg (DW, _("Missing delimiter following quoted string.")); dfm_forward_columns (reader, ss_length (line) - ss_length (p)); return true; @@ -569,6 +580,40 @@ parse_fixed (const struct data_parser *parser, struct dfm_reader *reader, return true; } +/* Splits the data line in LINE into individual text fields and returns the + number of fields. If SA is nonnull, appends each field to SA; the caller + retains ownership of SA and its contents. */ +size_t +data_parser_split (const struct data_parser *parser, + struct substring line, struct string_array *sa) +{ + size_t n = 0; + + struct string tmp = DS_EMPTY_INITIALIZER; + for (;;) + { + struct substring p = line; + ss_ltrim (&p, parser->soft_seps); + if (ss_is_empty (p)) + { + ds_destroy (&tmp); + return n; + } + + size_t n_columns; + struct substring field; + + msg_disable (); + cut_field__ (parser, &line, &p, &n_columns, &tmp, &field); + msg_enable (); + + if (sa) + string_array_append_nocopy (sa, ss_xstrdup (field)); + n++; + line = p; + } +} + /* Reads a case from READER into C, which matches dictionary DICT, parsing it according to free-format syntax rules in PARSER. Returns true if successful, false at end of file or on I/O error. */ diff --git a/src/language/data-io/data-parser.h b/src/language/data-io/data-parser.h index caef721d10..4ae25b2cea 100644 --- a/src/language/data-io/data-parser.h +++ b/src/language/data-io/data-parser.h @@ -28,6 +28,7 @@ struct dfm_reader; struct dictionary; struct file_handle; struct fmt_spec; +struct string_array; struct substring; /* Type of data read by a data parser. */ @@ -75,7 +76,8 @@ void data_parser_add_fixed_field (struct data_parser *, bool data_parser_any_fields (const struct data_parser *); bool data_parser_parse (struct data_parser *, struct dfm_reader *, struct dictionary *, struct ccase *); - +size_t data_parser_split (const struct data_parser *, struct substring line, + struct string_array *); /* Uses for a configured parser. */ void data_parser_output_description (struct data_parser *, diff --git a/src/ui/gui/psppire-delimited-text.c b/src/ui/gui/psppire-delimited-text.c index 59b423010e..674bd86eac 100644 --- a/src/ui/gui/psppire-delimited-text.c +++ b/src/ui/gui/psppire-delimited-text.c @@ -21,7 +21,9 @@ #include "psppire-delimited-text.h" #include "psppire-text-file.h" +#include "language/data-io/data-parser.h" #include "libpspp/str.h" +#include "libpspp/string-array.h" #include "libpspp/i18n.h" #include @@ -36,63 +38,73 @@ enum PROP_FIRST_LINE }; +static struct data_parser * +make_data_parser (PsppireDelimitedText *tf) +{ + struct data_parser *parser = data_parser_create (); + data_parser_set_type (parser, DP_DELIMITED); + data_parser_set_span (parser, false); + data_parser_set_quotes (parser, ss_empty ()); + data_parser_set_quote_escape (parser, true); + data_parser_set_empty_line_has_field (parser, true); + + bool space = false; + struct string hard_delimiters = DS_EMPTY_INITIALIZER; + GSList *del; + for (del = tf->delimiters; del; del = g_slist_next (del)) + { + gunichar c = GPOINTER_TO_INT (del->data); + if (c == ' ') + space = true; + else + ds_put_unichar (&hard_delimiters, c); + } + data_parser_set_soft_delimiters (parser, ss_cstr (space ? " " : "")); + data_parser_set_hard_delimiters (parser, ds_ss (&hard_delimiters)); + ds_destroy (&hard_delimiters); + + if (tf->quote) + { + struct string quote = DS_EMPTY_INITIALIZER; + ds_put_unichar ("e, tf->quote); + data_parser_set_quotes (parser, ds_ss ("e)); + ds_destroy ("e); + } + return parser; +} + static void count_delims (PsppireDelimitedText *tf) { if (tf->child == NULL) return; - tf->max_delimiters = 0; + struct data_parser *parser = make_data_parser (tf); + + tf->max_fields = 0; GtkTreeIter iter; gboolean valid; for (valid = gtk_tree_model_get_iter_first (tf->child, &iter); valid; valid = gtk_tree_model_iter_next (tf->child, &iter)) { - gunichar quote = -1; - // FIXME: Box these lines to avoid constant allocation/deallocation gchar *line = NULL; gtk_tree_model_get (tf->child, &iter, 1, &line, -1); - { - char *p; - gint count = 0; - for (p = line; ; p = g_utf8_find_next_char (p, NULL)) - { - const gunichar c = g_utf8_get_char (p); - if (c == 0) - break; - - if (c == quote) - quote = -1; - else if (tf->quote && c == tf->quote) - quote = c; - - if (quote == -1) - { - GSList *del; - for (del = tf->delimiters; del; del = g_slist_next (del)) - { - if (c == GPOINTER_TO_INT (del->data)) - count++; - } - } - } - tf->max_delimiters = MAX (tf->max_delimiters, count); - } + size_t n_fields = data_parser_split (parser, ss_cstr (line), NULL); + if (n_fields > tf->max_fields) + tf->max_fields = n_fields; g_free (line); } + + data_parser_destroy (parser); } static void cache_invalidate (PsppireDelimitedText *tf) { - memset (tf->cache_starts, 0, sizeof tf->cache_starts); - if (tf->const_cache.string) - { - ss_dealloc (&tf->const_cache); - tf->const_cache.string = NULL; - tf->cache_row = -1; - } + tf->cache_row = -1; + data_parser_destroy (tf->parser); + tf->parser = make_data_parser (tf); } static void @@ -295,8 +307,8 @@ __tree_model_get_n_columns (GtkTreeModel *tree_model) { PsppireDelimitedText *tf = PSPPIRE_DELIMITED_TEXT (tree_model); - /* + 1 for the trailing field and +1 for the leading line number column */ - return tf->max_delimiters + 1 + 1; + /* +1 for the leading line number column */ + return tf->max_fields + 1; } @@ -327,80 +339,19 @@ __iter_nth_child (GtkTreeModel *tree_model, return TRUE; } - -static void -nullify_char (struct substring cs) -{ - int char_len = ss_first_mblen (cs); - while (char_len > 0) - { - cs.string[char_len - 1] = '\0'; - char_len--; - } -} - - /* Split row N into it's delimited fields (if it is not already cached) and set this row as the current cache. */ static void split_row_into_fields (PsppireDelimitedText *file, gint n) { if (n == file->cache_row) /* Cache hit */ - { - return; - } - - memset (file->cache_starts, 0, sizeof file->cache_starts); - /* Cache miss */ - if (file->const_cache.string) - { - ss_dealloc (&file->const_cache); - } - ss_alloc_substring_pool (&file->const_cache, - PSPPIRE_TEXT_FILE (file->child)->lines[n], NULL); - struct substring cs = file->const_cache; - int field = 0; - file->cache_starts[0] = cs.string; - gunichar quote = -1; - for (; - UINT32_MAX != ss_first_mb (cs); - ss_get_mb (&cs)) - { - ucs4_t character = ss_first_mb (cs); - gboolean char_is_quote = FALSE; - if (quote == -1) - { - if (file->quote && character == file->quote) - { - quote = character; - char_is_quote = TRUE; - file->cache_starts[field] += ss_first_mblen (cs); - } - } - else if (character == quote) - { - char_is_quote = TRUE; - nullify_char (cs); - quote = -1; - } - - if (quote == -1 && char_is_quote == FALSE) - { - GSList *del; - for (del = file->delimiters; del; del = g_slist_next (del)) - { - if (character == GPOINTER_TO_INT (del->data)) - { - field++; - int char_len = ss_first_mblen (cs); - file->cache_starts[field] = cs.string + char_len; - nullify_char (cs); - break; - } - } - } - } + return; + if (!file->parser) + file->parser = make_data_parser (file); + string_array_clear (&file->cache); + data_parser_split (file->parser, PSPPIRE_TEXT_FILE (file->child)->lines[n], + &file->cache); file->cache_row = n; } @@ -412,7 +363,7 @@ psppire_delimited_text_get_header_title (PsppireDelimitedText *file, gint column split_row_into_fields (file, file->first_line - 1); - return file->cache_starts [column]; + return column < file->cache.n ? file->cache.strings[column] : ""; } static void @@ -439,7 +390,9 @@ __get_value (GtkTreeModel *tree_model, split_row_into_fields (file, n); - g_value_set_string (value, file->cache_starts [column - 1]); + size_t idx = column - 1; + const char *s = idx < file->cache.n ? file->cache.strings[idx] : ""; + g_value_set_string (value, s); } @@ -531,12 +484,11 @@ psppire_delimited_text_init (PsppireDelimitedText *text_file) text_file->first_line = 0; text_file->delimiters = g_slist_prepend (NULL, GINT_TO_POINTER (':')); - text_file->const_cache.string = NULL; - text_file->const_cache.length = 0; text_file->cache_row = -1; - memset (text_file->cache_starts, 0, sizeof text_file->cache_starts); + string_array_init (&text_file->cache); + text_file->parser = NULL; - text_file->max_delimiters = 0; + text_file->max_fields = 0; text_file->quote = 0; @@ -560,8 +512,8 @@ psppire_delimited_text_finalize (GObject *object) PsppireDelimitedText *tf = PSPPIRE_DELIMITED_TEXT (object); g_slist_free (tf->delimiters); - - ss_dealloc (&tf->const_cache); + string_array_destroy (&tf->cache); + data_parser_destroy (tf->parser); /* must chain up */ (* parent_class->finalize) (object); diff --git a/src/ui/gui/psppire-delimited-text.h b/src/ui/gui/psppire-delimited-text.h index b5c919f4fb..c24ce10c93 100644 --- a/src/ui/gui/psppire-delimited-text.h +++ b/src/ui/gui/psppire-delimited-text.h @@ -18,6 +18,7 @@ #define __PSPPIRE_DELIMITED_TEXT_H__ #include "libpspp/str.h" +#include "libpspp/string-array.h" #include #include @@ -59,7 +60,7 @@ struct _PsppireDelimitedText gint first_line; GSList *delimiters; - gint max_delimiters; + gint max_fields; gunichar quote; @@ -68,9 +69,9 @@ struct _PsppireDelimitedText gint stamp; /* caching */ - const char *cache_starts[512]; - struct substring const_cache; int cache_row; + struct string_array cache; + struct data_parser *parser; }; struct _PsppireDelimitedTextClass -- 2.30.2