static void merge_dictionary (struct comb_proc *, struct comb_file *);
static void different_types_error (struct comb_proc *, struct lexer *,
const char *var_name);
+static void check_encodings (struct comb_proc *, struct lexer *);
static void execute_update (struct comb_proc *);
static void execute_match_files (struct comb_proc *);
struct lexer *lexer, struct dataset *ds)
{
struct comb_proc proc = {
- .dict = dict_create (get_default_encoding ()),
.different_types = STRINGI_SET_INITIALIZER (proc.different_types),
};
int sort_ofs = INT_MAX;
size_t allocated_files = 0;
- dict_set_case_limit (proc.dict, dict_get_case_limit (dataset_dict (ds)));
-
lex_match (lexer, T_SLASH);
for (;;)
{
}
file->end_ofs = lex_ofs (lexer) - 1;
+ if (!proc.dict)
+ {
+ proc.dict = dict_create (dict_get_encoding (file->dict));
+ dict_set_case_limit (proc.dict,
+ dict_get_case_limit (dataset_dict (ds)));
+ }
+
while (lex_match (lexer, T_SLASH))
if (lex_match_id (lexer, "RENAME"))
{
goto error;
}
+ check_encodings (&proc, lexer);
+
if (!saw_by)
{
if (command == COMB_UPDATE)
if (dict_get_label (m) == NULL)
dict_set_label (m, dict_get_label (d));
- /* FIXME: If the input files have different encodings, then
- the result is undefined.
- The correct thing to do would be to convert to an encoding
- which can cope with all the input files (eg UTF-8).
- */
- if (strcmp (dict_get_encoding (f->dict), dict_get_encoding (m)))
- msg (MW, _("Combining files with incompatible encodings. String data may "
- "not be represented correctly."));
-
const struct string_array *d_docs = dict_get_documents (d);
const struct string_array *m_docs = dict_get_documents (m);
if (d_docs)
}
}
+/* Warns when PROC's output dictionary contains at least one string variable
+   and the input files do not all report the same encoding.
+
+   The warning names the first string variable found in the output
+   dictionary.  It is followed by one note per input file, attached through
+   LEXER to the syntax range recorded for that file, stating the file's
+   encoding; the note for the first file also says that the output will use
+   that encoding.
+
+   Does nothing if there is no string variable or if every file has the same
+   encoding as its predecessor. */
+static void
+check_encodings (struct comb_proc *proc, struct lexer *lexer)
+{
+  /* FIXME: If the input files have different encodings, then
+     the result is undefined.
+     The correct thing to do would be to convert to an encoding
+     which can cope with all the input files (eg UTF-8).
+   */
+
+  /* Find the first string variable.  Only string data is affected by the
+     encoding, so without one there is nothing to report. */
+  const size_t n_vars = dict_get_n_vars (proc->dict);
+  size_t str_idx = 0;
+  while (str_idx < n_vars
+         && !var_is_alpha (dict_get_var (proc->dict, str_idx)))
+    str_idx++;
+  if (str_idx >= n_vars)
+    return;
+
+  /* Compare each file with the one before it.  If every adjacent pair
+     agrees then all of the files agree. */
+  size_t mismatch = 1;
+  while (mismatch < proc->n_files
+         && !strcmp (dict_get_encoding (proc->files[mismatch - 1].dict),
+                     dict_get_encoding (proc->files[mismatch].dict)))
+    mismatch++;
+  if (mismatch >= proc->n_files)
+    return;
+
+  msg (MW, _("Combining files with different encodings. "
+             "String data (such as in variable `%s') "
+             "may not be represented correctly."),
+       var_get_name (dict_get_var (proc->dict, str_idx)));
+
+  /* List the encoding of every input file, not just the mismatching pair. */
+  for (size_t k = 0; k < proc->n_files; k++)
+    {
+      const struct comb_file *file = &proc->files[k];
+      const char *name = file->handle ? fh_get_name (file->handle) : "*";
+      const char *encoding = dict_get_encoding (file->dict);
+
+      if (k > 0)
+        lex_ofs_msg (lexer, SN, file->start_ofs, file->end_ofs,
+                     _("File %s uses encoding %s."),
+                     name, encoding);
+      else
+        lex_ofs_msg (lexer, SN, file->start_ofs, file->end_ofs,
+                     _("File %s uses encoding %s. The output "
+                       "will use this encoding."),
+                     name, encoding);
+    }
+}
+
+
/* If VAR_NAME is non-NULL, attempts to create a
variable named VAR_NAME, with format F1.0, in DICT, and stores
a pointer to the variable in *VAR. Returns true if
| ^~~~~~"
])
AT_CLEANUP
+
+dnl Checks the diagnostics printed when MATCH FILES combines inputs whose
+dnl encodings differ and a string variable ("name") is present.  The script
+dnl saves x.sav while the locale is utf-8, then builds the active dataset
+dnl under US-ASCII and matches the two.  Expected output: one warning that
+dnl names the string variable, then one note per input file giving its
+dnl encoding, with the first file's note also stating that the output uses
+dnl that encoding.  The "17" locations in the expected output refer to the
+dnl MATCH FILES line of the script below, so keep the script's line layout
+dnl unchanged.
+AT_SETUP([MATCH FILES incompatible encoding warning])
+AT_DATA([match-files.sps], [dnl
+SET LOCALE='utf-8'.
+DATA LIST LIST NOTABLE/name (A6) x.
+BEGIN DATA.
+al,7
+brad,8
+carl,9
+END DATA.
+SAVE OUTFILE='x.sav'.
+
+SET LOCALE='US-ASCII'.
+DATA LIST LIST NOTABLE/name (A6) y.
+BEGIN DATA.
+al,1
+carl,2
+dan,3
+END DATA.
+MATCH FILES/FILE=*/FILE='x.sav'/RENAME(x=y).
+])
+AT_CHECK([pspp match-files.sps], [0], [dnl
+warning: Combining files with different encodings. String data (such as in
+variable `name') may not be represented correctly.
+
+match-files.sps:17.13-17.18: note: MATCH FILES: File * uses encoding US-ASCII.
+The output will use this encoding.
+ 17 | MATCH FILES/FILE=*/FILE='x.sav'/RENAME(x=y).
+ | ^~~~~~
+
+match-files.sps:17.20-17.31: note: MATCH FILES: File `x.sav' uses encoding
+UTF-8.
+ 17 | MATCH FILES/FILE=*/FILE='x.sav'/RENAME(x=y).
+ | ^~~~~~~~~~~~
+])
+AT_CLEANUP