From 7d15b322392a7bc29c7da540a8e2de2d2cf9c808 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Sat, 29 Apr 2023 18:07:35 -0700 Subject: [PATCH] combine-files: Adopt the encoding of the first input file. Previously, there was no good way to control the output encoding, which was simply the default encoding. One could use SET LOCALE, but it seems more reasonable to use the encoding from one of the input files, and the first one is the most obvious choice. Also, improve the warning about incompatible input encodings. --- src/language/commands/combine-files.c | 63 +++++++++++++++++++++----- tests/language/commands/match-files.at | 36 +++++++++++++++ 2 files changed, 87 insertions(+), 12 deletions(-) diff --git a/src/language/commands/combine-files.c b/src/language/commands/combine-files.c index e3c8dc5093..286f230561 100644 --- a/src/language/commands/combine-files.c +++ b/src/language/commands/combine-files.c @@ -125,6 +125,7 @@ static void close_all_comb_files (struct comb_proc *); static void merge_dictionary (struct comb_proc *, struct comb_file *); static void different_types_error (struct comb_proc *, struct lexer *, const char *var_name); +static void check_encodings (struct comb_proc *, struct lexer *); static void execute_update (struct comb_proc *); static void execute_match_files (struct comb_proc *); @@ -159,7 +160,6 @@ combine_files (enum comb_command_type command, struct lexer *lexer, struct dataset *ds) { struct comb_proc proc = { - .dict = dict_create (get_default_encoding ()), .different_types = STRINGI_SET_INITIALIZER (proc.different_types), }; @@ -178,8 +178,6 @@ combine_files (enum comb_command_type command, int sort_ofs = INT_MAX; size_t allocated_files = 0; - dict_set_case_limit (proc.dict, dict_get_case_limit (dataset_dict (ds))); - lex_match (lexer, T_SLASH); for (;;) { @@ -243,6 +241,13 @@ combine_files (enum comb_command_type command, } file->end_ofs = lex_ofs (lexer) - 1; + if (!proc.dict) + { + proc.dict = dict_create (dict_get_encoding 
(file->dict)); + dict_set_case_limit (proc.dict, + dict_get_case_limit (dataset_dict (ds))); + } + while (lex_match (lexer, T_SLASH)) if (lex_match_id (lexer, "RENAME")) { @@ -410,6 +415,8 @@ combine_files (enum comb_command_type command, goto error; } + check_encodings (&proc, lexer); + if (!saw_by) { if (command == COMB_UPDATE) @@ -543,15 +550,6 @@ merge_dictionary (struct comb_proc *proc, struct comb_file *f) if (dict_get_label (m) == NULL) dict_set_label (m, dict_get_label (d)); - /* FIXME: If the input files have different encodings, then - the result is undefined. - The correct thing to do would be to convert to an encoding - which can cope with all the input files (eg UTF-8). - */ - if (strcmp (dict_get_encoding (f->dict), dict_get_encoding (m))) - msg (MW, _("Combining files with incompatible encodings. String data may " - "not be represented correctly.")); - const struct string_array *d_docs = dict_get_documents (d); const struct string_array *m_docs = dict_get_documents (m); if (d_docs) @@ -631,6 +629,47 @@ different_types_error (struct comb_proc *proc, } } +static void +check_encodings (struct comb_proc *proc, struct lexer *lexer) +{ + /* Warns, once, if any two consecutive input files differ in encoding, + but only when PROC's dictionary has a string variable. + FIXME: In that case the result is undefined. The correct thing to do + would be to convert to an encoding which can cope with all the input + files (eg UTF-8). */ + for (size_t i = 0; i < dict_get_n_vars (proc->dict); i++) + if (var_is_alpha (dict_get_var (proc->dict, i))) + { + for (size_t j = 1; j < proc->n_files; j++) + if (strcmp (dict_get_encoding (proc->files[j - 1].dict), + dict_get_encoding (proc->files[j].dict))) + { + msg (MW, _("Combining files with different encodings. " + "String data (such as in variable `%s') " + "may not be represented correctly."), + var_get_name (dict_get_var (proc->dict, i))); + + for (size_t k = 0; k < proc->n_files; k++) + { + const struct comb_file *ef = &proc->files[k]; + const char *fn = ef->handle ? 
fh_get_name (ef->handle) : "*"; + if (!k) + lex_ofs_msg (lexer, SN, ef->start_ofs, ef->end_ofs, + _("File %s uses encoding %s. The output " + "will use this encoding."), + fn, dict_get_encoding (ef->dict)); + else + lex_ofs_msg (lexer, SN, ef->start_ofs, ef->end_ofs, + _("File %s uses encoding %s."), + fn, dict_get_encoding (ef->dict)); + } + + return; + } + return; + } +} + /* If VAR_NAME is non-NULL, attempts to create a variable named VAR_NAME, with format F1.0, in DICT, and stores a pointer to the variable in *VAR. Returns true if diff --git a/tests/language/commands/match-files.at b/tests/language/commands/match-files.at index b7f5dd30d1..7e7aee352b 100644 --- a/tests/language/commands/match-files.at +++ b/tests/language/commands/match-files.at @@ -413,3 +413,39 @@ match-files.sps:43: error: MATCH FILES: Variable name has different type or widt | ^~~~~~" ]) AT_CLEANUP + +AT_SETUP([MATCH FILES incompatible encoding warning]) +AT_DATA([match-files.sps], [dnl +SET LOCALE='utf-8'. +DATA LIST LIST NOTABLE/name (A6) x. +BEGIN DATA. +al,7 +brad,8 +carl,9 +END DATA. +SAVE OUTFILE='x.sav'. + +SET LOCALE='US-ASCII'. +DATA LIST LIST NOTABLE/name (A6) y. +BEGIN DATA. +al,1 +carl,2 +dan,3 +END DATA. +MATCH FILES/FILE=*/FILE='x.sav'/RENAME(x=y). +]) +AT_CHECK([pspp match-files.sps], [0], [dnl +warning: Combining files with different encodings. String data (such as in +variable `name') may not be represented correctly. + +match-files.sps:17.13-17.18: note: MATCH FILES: File * uses encoding US-ASCII. +The output will use this encoding. + 17 | MATCH FILES/FILE=*/FILE='x.sav'/RENAME(x=y). + | ^~~~~~ + +match-files.sps:17.20-17.31: note: MATCH FILES: File `x.sav' uses encoding +UTF-8. + 17 | MATCH FILES/FILE=*/FILE='x.sav'/RENAME(x=y). + | ^~~~~~~~~~~~ +]) +AT_CLEANUP -- 2.30.2