combine-files: Adopt the encoding of the first input file.

author Ben Pfaff <blp@cs.stanford.edu>

Sun, 30 Apr 2023 01:07:35 +0000 (18:07 -0700)

committer Ben Pfaff <blp@cs.stanford.edu>

Sun, 30 Apr 2023 01:07:35 +0000 (18:07 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Sun, 30 Apr 2023 01:07:35 +0000 (18:07 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Sun, 30 Apr 2023 01:07:35 +0000 (18:07 -0700)
diff --git a/src/language/commands/combine-files.c b/src/language/commands/combine-files.c

index e3c8dc5093545867eab670e0ea32d6de5b80b8f3..286f2305614f21c92d924e889ee67740cb1dd580 100644 (file)
--- a/src/language/commands/combine-files.c
+++ b/src/language/commands/combine-files.c
@@ -125,6 +125,7 @@ static void close_all_comb_files (struct comb_proc *);
  static void merge_dictionary (struct comb_proc *, struct comb_file *);
  static void different_types_error (struct comb_proc *, struct lexer *,
                                     const char *var_name);
+static void check_encodings (struct comb_proc *, struct lexer *);
  
  static void execute_update (struct comb_proc *);
  static void execute_match_files (struct comb_proc *);
@@ -159,7 +160,6 @@ combine_files (enum comb_command_type command,
                 struct lexer *lexer, struct dataset *ds)
  {
    struct comb_proc proc = {
-    .dict = dict_create (get_default_encoding ()),
      .different_types = STRINGI_SET_INITIALIZER (proc.different_types),
    };
  
@@ -178,8 +178,6 @@ combine_files (enum comb_command_type command,
    int sort_ofs = INT_MAX;
    size_t allocated_files = 0;
  
-  dict_set_case_limit (proc.dict, dict_get_case_limit (dataset_dict (ds)));
-
    lex_match (lexer, T_SLASH);
    for (;;)
      {
@@ -243,6 +241,13 @@ combine_files (enum comb_command_type command,
          }
        file->end_ofs = lex_ofs (lexer) - 1;
  
+      if (!proc.dict)
+        {
+          proc.dict = dict_create (dict_get_encoding (file->dict));
+          dict_set_case_limit (proc.dict,
+                               dict_get_case_limit (dataset_dict (ds)));
+        }
+
        while (lex_match (lexer, T_SLASH))
          if (lex_match_id (lexer, "RENAME"))
            {
@@ -410,6 +415,8 @@ combine_files (enum comb_command_type command,
          goto error;
      }
  
+  check_encodings (&proc, lexer);
+
    if (!saw_by)
      {
        if (command == COMB_UPDATE)
@@ -543,15 +550,6 @@ merge_dictionary (struct comb_proc *proc, struct comb_file *f)
    if (dict_get_label (m) == NULL)
      dict_set_label (m, dict_get_label (d));
  
-  /* FIXME: If the input files have different encodings, then
-     the result is undefined.
-     The correct thing to do would be to convert to an encoding
-     which can cope with all the input files (eg UTF-8).
-   */
-  if (strcmp (dict_get_encoding (f->dict), dict_get_encoding (m)))
-    msg (MW, _("Combining files with incompatible encodings. String data may "
-               "not be represented correctly."));
-
    const struct string_array *d_docs = dict_get_documents (d);
    const struct string_array *m_docs = dict_get_documents (m);
    if (d_docs)
@@ -631,6 +629,47 @@ different_types_error (struct comb_proc *proc,
      }
  }
  
+static void
+check_encodings (struct comb_proc *proc, struct lexer *lexer)
+{
+  /* FIXME: If the input files have different encodings, the output
+     uses the first file's encoding, so string data from the other
+     files may not be represented correctly.  The correct fix would be
+     to convert to an encoding that can represent all the input files
+     (e.g. UTF-8).  */
+  for (size_t i = 0; i < dict_get_n_vars (proc->dict); i++)
+    if (var_is_alpha (dict_get_var (proc->dict, i)))
+      {
+        for (size_t j = 1; j < proc->n_files; j++)
+          if (strcmp (dict_get_encoding (proc->files[j - 1].dict),
+                      dict_get_encoding (proc->files[j].dict)))
+            {
+              msg (MW, _("Combining files with different encodings.  "
+                         "String data (such as in variable `%s') "
+                         "may not be represented correctly."),
+                   var_get_name (dict_get_var (proc->dict, i)));
+
+              for (size_t k = 0; k < proc->n_files; k++)
+                {
+                  const struct comb_file *ef = &proc->files[k];
+                  const char *fn = ef->handle ? fh_get_name (ef->handle) : "*";
+                  if (!k)
+                    lex_ofs_msg (lexer, SN, ef->start_ofs, ef->end_ofs,
+                                 _("File %s uses encoding %s.  The output "
+                                   "will use this encoding."),
+                                 fn, dict_get_encoding (ef->dict));
+                  else
+                    lex_ofs_msg (lexer, SN, ef->start_ofs, ef->end_ofs,
+                                 _("File %s uses encoding %s."),
+                                 fn, dict_get_encoding (ef->dict));
+                }
+
+              return;
+            }
+        return;
+      }
+}
+
  /* If VAR_NAME is non-NULL, attempts to create a
     variable named VAR_NAME, with format F1.0, in DICT, and stores
     a pointer to the variable in *VAR.  Returns true if
diff --git a/tests/language/commands/match-files.at b/tests/language/commands/match-files.at

index b7f5dd30d1cf497e86a0e4ec0a8fddd73ce8e716..7e7aee352b154ceba086b8f41497ccbeeaf42217 100644 (file)
--- a/tests/language/commands/match-files.at
+++ b/tests/language/commands/match-files.at
@@ -413,3 +413,39 @@ match-files.sps:43: error: MATCH FILES: Variable name has different type or widt
        |                          ^~~~~~"
  ])
  AT_CLEANUP
+
+AT_SETUP([MATCH FILES incompatible encoding warning])
+AT_DATA([match-files.sps], [dnl
+SET LOCALE='utf-8'.
+DATA LIST LIST NOTABLE/name (A6) x.
+BEGIN DATA.
+al,7
+brad,8
+carl,9
+END DATA.
+SAVE OUTFILE='x.sav'.
+
+SET LOCALE='US-ASCII'.
+DATA LIST LIST NOTABLE/name (A6) y.
+BEGIN DATA.
+al,1
+carl,2
+dan,3
+END DATA.
+MATCH FILES/FILE=*/FILE='x.sav'/RENAME(x=y).
+])
+AT_CHECK([pspp match-files.sps], [0], [dnl
+warning: Combining files with different encodings.  String data (such as in
+variable `name') may not be represented correctly.
+
+match-files.sps:17.13-17.18: note: MATCH FILES: File * uses encoding US-ASCII.
+The output will use this encoding.
+   17 | MATCH FILES/FILE=*/FILE='x.sav'/RENAME(x=y).
+      |             ^~~~~~
+
+match-files.sps:17.20-17.31: note: MATCH FILES: File `x.sav' uses encoding
+UTF-8.
+   17 | MATCH FILES/FILE=*/FILE='x.sav'/RENAME(x=y).
+      |                    ^~~~~~~~~~~~
+])
+AT_CLEANUP
author	Ben Pfaff <blp@cs.stanford.edu>
	Sun, 30 Apr 2023 01:07:35 +0000 (18:07 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Sun, 30 Apr 2023 01:07:35 +0000 (18:07 -0700)
src/language/commands/combine-files.c		patch \| blob \| history
tests/language/commands/match-files.at		patch \| blob \| history