From: Ben Pfaff <blp@cs.stanford.edu>
Date: Sat, 6 May 2023 01:01:06 +0000 (-0700)
Subject: combine-files: Allow matching string variables to have different widths.
X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=638bf380e550;p=pspp

combine-files: Allow matching string variables to have different widths.

Requested by Frans Houweling and Alan Mead at
https://lists.gnu.org/archive/html/pspp-users/2015-03/msg00008.html
---

diff --git a/NEWS b/NEWS
index 2ba75571b3..6c15134edf 100644
--- a/NEWS
+++ b/NEWS
@@ -17,6 +17,9 @@ Changes after 1.6.2:
 
    - Break variables are now optional.
 
+ * ADD FILES, MATCH FILES, and UPDATE now allow string variables with
+   the same name to have different widths.
+
  * CROSSTABS now calculates significance of Pearson and Spearman
    correlations in symmetric measures.
 
diff --git a/doc/combining.texi b/doc/combining.texi
index 751612a874..8bb9eafb33 100644
--- a/doc/combining.texi
+++ b/doc/combining.texi
@@ -116,8 +116,10 @@ The variables in the new active dataset are the union of all the input files'
 variables, matched based on their name.  When a single input file
 contains a variable with a given name, the output file will contain
 exactly that variable.  When more than one input file contains a
-variable with a given name, those variables must all have the same
-type (numeric or string) and, for string variables, the same width.
+variable with a given name, those variables must be all string or all numeric.
+If they are string variables, then the result will have the width of the longest
+variable with that name, with narrower values padded on the right with spaces
+to fill the width.
 Variables are matched after renaming with the @subcmd{RENAME} subcommand.
 Thus, @subcmd{RENAME} can be used to resolve conflicts.
 Only variables in the output file can conflict, so @subcmd{DROP} or
diff --git a/src/data/subcase.h b/src/data/subcase.h
index 027a639542..7abbef6d49 100644
--- a/src/data/subcase.h
+++ b/src/data/subcase.h
@@ -84,6 +84,7 @@ static inline size_t subcase_get_n_fields (const struct subcase *);
 
 static inline size_t subcase_get_case_index (const struct subcase *,
                                              size_t idx);
+static inline int subcase_get_width (const struct subcase *, size_t idx);
 static inline enum subcase_direction subcase_get_direction (
   const struct subcase *, size_t idx);
 
@@ -119,6 +120,12 @@ subcase_get_case_index (const struct subcase *sc, size_t idx)
   return sc->fields[idx].case_index;
 }
 
+static inline int
+subcase_get_width (const struct subcase *sc, size_t idx)
+{
+  return sc->fields[idx].width;
+}
+
 static inline enum subcase_direction
 subcase_get_direction (const struct subcase *sc, size_t idx)
 {
diff --git a/src/language/commands/combine-files.c b/src/language/commands/combine-files.c
index ad6914b364..5641328101 100644
--- a/src/language/commands/combine-files.c
+++ b/src/language/commands/combine-files.c
@@ -62,6 +62,21 @@ enum comb_file_type
     COMB_TABLE			/* Specified on TABLE= subcommand. */
   };
 
+/* These commands combine multiple input files into a single master file.  The
+   input files may merge string variables with different widths, right-padding
+   with spaces to the length of the longest.  This data structure allows for
+   that. */
+struct comb_resizer
+  {
+    struct caseproto *output_proto;
+    int *indexes;
+    size_t n_indexes;
+  };
+
+static struct casereader *comb_resize (struct casereader *,
+                                       struct comb_resizer *);
+static void comb_resizer_destroy (struct comb_resizer *);
+
 /* One FILE or TABLE subcommand. */
 struct comb_file
   {
@@ -82,6 +97,7 @@ struct comb_file
     bool is_minimal;            /* Does 'data' have minimum BY values across
                                    all input files? */
     bool is_sorted;             /* Is file presorted on the BY variables? */
+    struct comb_resizer *resizer; /* If necessary. */
 
     /* IN subcommand. */
     char *in_name;
@@ -98,7 +114,7 @@ struct comb_proc
     struct subcase by_vars;     /* BY variables in the output. */
     struct casewriter *output;  /* Destination for output. */
 
-    /* Names of variables whose types or widths different among the files.
+    /* Names of variables whose types differ among the files.
        It's OK if they're all dropped, but not otherwise. */
     struct stringi_set different_types;
 
@@ -277,6 +293,41 @@ combine_files (enum comb_command_type command,
       merge_dictionary (&proc, file);
     }
 
+  for (size_t i = 0; i < dict_get_n_vars (proc.dict); i++)
+    {
+      const struct variable *mv = dict_get_var (proc.dict, i);
+      const char *name = var_get_name (mv);
+      int mw = var_get_width (mv);
+      if (!mw || stringi_set_contains (&proc.different_types, name))
+        continue;
+
+      for (size_t j = 0; j < proc.n_files; j++)
+        {
+          struct comb_file *cf = &proc.files[j];
+          struct variable *dv = dict_lookup_var (cf->dict, name);
+          if (!dv)
+            continue;
+
+          int dw = var_get_width (dv);
+          assert (dw <= mw);
+          if (dw < mw)
+            {
+              struct comb_resizer *r = cf->resizer;
+              if (!r)
+                {
+                  r = xmalloc (sizeof *r);
+                  *r = (struct comb_resizer) {
+                    .output_proto = caseproto_ref (dict_get_proto (proc.dict)),
+                    .indexes = xnmalloc (dict_get_n_vars (cf->dict),
+                                         sizeof *r->indexes),
+                  };
+                  cf->resizer = r;
+                }
+              r->indexes[r->n_indexes++] = var_get_dict_index (dv);
+            }
+        }
+    }
+
   while (lex_token (lexer) != T_ENDCMD)
     {
       if (lex_match (lexer, T_BY))
@@ -316,8 +367,9 @@ combine_files (enum comb_command_type command,
                   const char *name = var_get_name (by_vars[j]);
                   struct variable *var = dict_lookup_var (file->dict, name);
                   if (var != NULL)
-                    subcase_add_var (&file->by_vars, var,
-                                     subcase_get_direction (&proc.by_vars, j));
+                    subcase_add (&file->by_vars, var_get_dict_index (var),
+                                 subcase_get_width (&proc.by_vars, j),
+                                 subcase_get_direction (&proc.by_vars, j));
                   else
                     {
                       const char *fn
@@ -491,6 +543,11 @@ combine_files (enum comb_command_type command,
           else
             file->reader = casereader_clone (active_file);
         }
+      if (file->resizer)
+        {
+          file->reader = comb_resize (file->reader, file->resizer);
+          file->resizer = NULL;
+        }
       if (!file->is_sorted)
         file->reader = sort_execute (file->reader, &file->by_vars);
       taint_propagate (casereader_get_taint (file->reader), taint);
@@ -580,8 +637,11 @@ merge_dictionary (struct comb_proc *proc, struct comb_file *f)
 
       if (!mv)
         mv = dict_clone_var_assert (m, dv);
-      else if (var_get_width (mv) == var_get_width (dv))
+      else if (var_get_type (mv) == var_get_type (dv))
         {
+          if (var_get_width (dv) > var_get_width (mv))
+            var_set_width (mv, var_get_width (dv));
+
           if (var_has_value_labels (dv) && !var_has_value_labels (mv))
             var_set_value_labels (mv, var_get_value_labels (dv));
           if (var_has_missing_values (dv) && !var_has_missing_values (mv))
@@ -598,8 +658,7 @@ static void
 different_types_error (struct comb_proc *proc,
                        struct lexer *lexer, const char *var_name)
 {
-  msg (SE, _("Variable %s has different type or width in different "
-             "files."), var_name);
+  msg (SE, _("Variable %s has different types in different files."), var_name);
   for (size_t i = 0; i < proc->n_files; i++)
     {
       const struct comb_file *ef = &proc->files[i];
@@ -614,8 +673,7 @@ different_types_error (struct comb_proc *proc,
                      fn, var_name);
       else
         lex_ofs_msg (lexer, SN, ef->start_ofs, ef->end_ofs,
-                     _("In file %s, %s is a string with width %d."),
-                     fn, var_name, var_get_width (ev));
+                     _("In file %s, %s is a string."), fn, var_name);
     }
 }
 
@@ -707,6 +765,7 @@ close_all_comb_files (struct comb_proc *proc)
       dict_unref (file->dict);
       casereader_destroy (file->reader);
       case_unref (file->data);
+      comb_resizer_destroy (file->resizer);
       free (file->in_name);
     }
   free (proc->files);
@@ -1000,3 +1059,56 @@ output_buffered_case (struct comb_proc *proc)
       proc->buffered_case = NULL;
     }
 }
+
+static void
+comb_resizer_destroy (struct comb_resizer *r)
+{
+  if (!r)
+    return;
+
+  caseproto_unref (r->output_proto);
+  free (r->indexes);
+  free (r);
+}
+
+static struct ccase *
+comb_resize_translate (struct ccase *c, void *r_)
+{
+  struct comb_resizer *r = r_;
+
+  c = case_unshare (c);
+
+  for (size_t i = 0; i < r->n_indexes; i++)
+    {
+      int idx = r->indexes[i];
+      value_resize (&c->values[idx],
+                    caseproto_get_width (c->proto, idx),
+                    caseproto_get_width (r->output_proto, idx));
+    }
+
+  caseproto_unref (c->proto);
+  c->proto = caseproto_ref (r->output_proto);
+
+  return c;
+}
+
+static bool
+comb_resizer_translate_destroy (void *r_)
+{
+  struct comb_resizer *r = r_;
+
+  comb_resizer_destroy (r);
+  return true;
+}
+
+static struct casereader *
+comb_resize (struct casereader *subreader, struct comb_resizer *r)
+{
+  static struct casereader_translator_class class = {
+    .translate = comb_resize_translate,
+    .destroy = comb_resizer_translate_destroy,
+  };
+
+  return casereader_translate_stateless (subreader, r->output_proto,
+                                         &class, r);
+}
diff --git a/tests/language/commands/match-files.at b/tests/language/commands/match-files.at
index 7e7aee352b..3341ce79f1 100644
--- a/tests/language/commands/match-files.at
+++ b/tests/language/commands/match-files.at
@@ -215,11 +215,7 @@ x,y,j
 ])
 AT_CLEANUP
 
-dnl Tests for a bug that caused MATCH FILES to crash
-dnl with incompatible variables, especially but not
-dnl exclusively when one variable came from the active
-dnl file.
-AT_SETUP([MATCH FILES with incompatible variable types])
+AT_SETUP([MATCH FILES with different string widths])
 AT_DATA([match-files.sps], [dnl
 DATA LIST LIST NOTABLE/name (A6) x.
 BEGIN DATA.
@@ -238,18 +234,13 @@ END DATA.
 MATCH FILES/FILE='x.sav'/FILE=*/BY name.
 LIST.
 ])
-AT_CHECK([pspp -O format=csv match-files.sps], [1], [dnl
-match-files.sps:15: error: MATCH FILES: Variable name has different type or width in different files.
-
-"match-files.sps:15.13-15.24: note: MATCH FILES: In file `x.sav', name is a string with width 6.
-   15 | MATCH FILES/FILE='x.sav'/FILE=*/BY name.
-      |             ^~~~~~~~~~~~"
-
-"match-files.sps:15.26-15.31: note: MATCH FILES: In file *, name is a string with width 7.
-   15 | MATCH FILES/FILE='x.sav'/FILE=*/BY name.
-      |                          ^~~~~~"
-
-match-files.sps:16: error: Stopping syntax file processing here to avoid a cascade of dependent command failures.
+AT_CHECK([pspp -O format=csv match-files.sps], [0], [dnl
+Table: Data List
+name,x,y
+al,7.00,1.00
+brad,8.00,.  @&t@
+carl,9.00,2.00
+dan,.  ,3.00
 ])
 AT_CLEANUP
 
@@ -277,7 +268,7 @@ al,1
 carl,2
 dan,3
 END DATA.
-MATCH FILES/FILE='x.sav'/FILE=*/BY name.
+MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=x)/BY x.
 MATCH FILES/FILE='x.sav'/IN=**.
 MATCH FILES/FILE='x.sav'/IN=x/IN=y.
 MATCH FILES/FILE='x.sav'/BY=x/BY=y.
@@ -299,9 +290,10 @@ MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=name2)/LAST=x.
 MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=name2)/IN=x.
 MATCH FILES/KEEP=**.
 
-* The different 'name' variables are a problem, but not if we drop them.
-MATCH FILES/FILE='x.sav'/FILE=*/RENAME(y=x).
-MATCH FILES/FILE='x.sav'/FILE=*/RENAME(y=x)/DROP=name.
+* It's a problem to have different types and the same name,
+but not if we drop them.
+MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=x).
+MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=x)/DROP=x.
 ])
 AT_CHECK([pspp --testing-mode -O format=csv insert.sps], [1], [dnl
 "match-files.sps:1.18: error: MATCH FILES: Cannot specify the active dataset since none has been defined.
@@ -312,14 +304,14 @@ AT_CHECK([pspp --testing-mode -O format=csv insert.sps], [1], [dnl
    12 | MATCH FILES/FILE=*.
       |                  ^"
 
-match-files.sps:20: error: MATCH FILES: Variable name has different type or width in different files.
+match-files.sps:20: error: MATCH FILES: Variable x has different types in different files.
 
-"match-files.sps:20.13-20.24: note: MATCH FILES: In file `x.sav', name is a string with width 6.
-   20 | MATCH FILES/FILE='x.sav'/FILE=*/BY name.
+"match-files.sps:20.13-20.24: note: MATCH FILES: In file `x.sav', x is numeric.
+   20 | MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=x)/BY x.
       |             ^~~~~~~~~~~~"
 
-"match-files.sps:20.26-20.31: note: MATCH FILES: In file *, name is a string with width 7.
-   20 | MATCH FILES/FILE='x.sav'/FILE=*/BY name.
+"match-files.sps:20.26-20.31: note: MATCH FILES: In file *, x is a string.
+   20 | MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=x)/BY x.
       |                          ^~~~~~"
 
 "match-files.sps:21.29-21.30: error: MATCH FILES: Syntax error expecting identifier.
@@ -402,14 +394,14 @@ match-files.sps:20: error: MATCH FILES: Variable name has different type or widt
    40 | MATCH FILES/KEEP=**.
       |             ^~~~"
 
-match-files.sps:43: error: MATCH FILES: Variable name has different type or width in different files.
+match-files.sps:44: error: MATCH FILES: Variable x has different types in different files.
 
-"match-files.sps:43.13-43.24: note: MATCH FILES: In file `x.sav', name is a string with width 6.
-   43 | MATCH FILES/FILE='x.sav'/FILE=*/RENAME(y=x).
+"match-files.sps:44.13-44.24: note: MATCH FILES: In file `x.sav', x is numeric.
+   44 | MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=x).
       |             ^~~~~~~~~~~~"
 
-"match-files.sps:43.26-43.31: note: MATCH FILES: In file *, name is a string with width 7.
-   43 | MATCH FILES/FILE='x.sav'/FILE=*/RENAME(y=x).
+"match-files.sps:44.26-44.31: note: MATCH FILES: In file *, x is a string.
+   44 | MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=x).
       |                          ^~~~~~"
 ])
 AT_CLEANUP