combine-files: Allow matching string variables to have different widths.

author Ben Pfaff <blp@cs.stanford.edu>

Sat, 6 May 2023 01:01:06 +0000 (18:01 -0700)

committer Ben Pfaff <blp@cs.stanford.edu>

Sat, 6 May 2023 01:01:06 +0000 (18:01 -0700)
author Ben Pfaff <blp@cs.stanford.edu>
Sat, 6 May 2023 01:01:06 +0000 (18:01 -0700)
committer Ben Pfaff <blp@cs.stanford.edu>
Sat, 6 May 2023 01:01:06 +0000 (18:01 -0700)
diff --git a/NEWS b/NEWS

index 2ba75571b30c1ddeec022dcc43e6851430af4612..6c15134edf1499d4295a6c6aea95dcacbba63a51 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -17,6 +17,9 @@ Changes after 1.6.2:
  
     - Break variables are now optional.
  
+ * ADD FILES, MATCH FILES, and UPDATE now allow string variables with
+   the same name to have different widths.
+
   * CROSSTABS now calculates significance of Pearson and Spearman
     correlations in symmetric measures.
  
diff --git a/doc/combining.texi b/doc/combining.texi

index 751612a8741554af8310d03fd911efe393106c80..8bb9eafb3361df23fec3e6ffc11c0e4457009eea 100644 (file)
--- a/doc/combining.texi
+++ b/doc/combining.texi
@@ -116,8 +116,10 @@ The variables in the new active dataset are the union of all the input files'
  variables, matched based on their name.  When a single input file
  contains a variable with a given name, the output file will contain
  exactly that variable.  When more than one input file contains a
-variable with a given name, those variables must all have the same
-type (numeric or string) and, for string variables, the same width.
+variable with a given name, those variables must be all string or all numeric.
+If they are string variables, then the result will have the width of the longest
+variable with that name, with narrower values padded on the right with spaces
+to fill the width.
  Variables are matched after renaming with the @subcmd{RENAME} subcommand.
  Thus, @subcmd{RENAME} can be used to resolve conflicts.
  Only variables in the output file can conflict, so @subcmd{DROP} or
diff --git a/src/data/subcase.h b/src/data/subcase.h

index 027a6395420bf90e52999a496e0c0d4cb14f30f9..7abbef6d49382363041d0ffd15a675729bf30271 100644 (file)
--- a/src/data/subcase.h
+++ b/src/data/subcase.h
@@ -84,6 +84,7 @@ static inline size_t subcase_get_n_fields (const struct subcase *);
  
  static inline size_t subcase_get_case_index (const struct subcase *,
                                               size_t idx);
+static inline int subcase_get_width (const struct subcase *, size_t idx);
  static inline enum subcase_direction subcase_get_direction (
    const struct subcase *, size_t idx);
  
@@ -119,6 +120,12 @@ subcase_get_case_index (const struct subcase *sc, size_t idx)
    return sc->fields[idx].case_index;
  }
  
+static inline int
+subcase_get_width (const struct subcase *sc, size_t idx)
+{
+  return sc->fields[idx].width;
+}
+
  static inline enum subcase_direction
  subcase_get_direction (const struct subcase *sc, size_t idx)
  {
diff --git a/src/language/commands/combine-files.c b/src/language/commands/combine-files.c

index ad6914b364d45a1a2e6b4ac375a24ad6c65b3d04..5641328101edfeb3ad8a0c1a8e0ffed5ee44e51a 100644 (file)
--- a/src/language/commands/combine-files.c
+++ b/src/language/commands/combine-files.c
@@ -62,6 +62,21 @@ enum comb_file_type
      COMB_TABLE                 /* Specified on TABLE= subcommand. */
    };
  
+/* These commands combine multiple input files into a single master file.  The
+   input files may merge string variables with different widths, right-padding
+   with spaces to the length of the longest.  This data structure allows for
+   that. */
+struct comb_resizer
+  {
+    struct caseproto *output_proto;
+    int *indexes;
+    size_t n_indexes;
+  };
+
+static struct casereader *comb_resize (struct casereader *,
+                                       struct comb_resizer *);
+static void comb_resizer_destroy (struct comb_resizer *);
+
  /* One FILE or TABLE subcommand. */
  struct comb_file
    {
@@ -82,6 +97,7 @@ struct comb_file
      bool is_minimal;            /* Does 'data' have minimum BY values across
                                     all input files? */
      bool is_sorted;             /* Is file presorted on the BY variables? */
+    struct comb_resizer *resizer; /* If necessary. */
  
      /* IN subcommand. */
      char *in_name;
@@ -98,7 +114,7 @@ struct comb_proc
      struct subcase by_vars;     /* BY variables in the output. */
      struct casewriter *output;  /* Destination for output. */
  
-    /* Names of variables whose types or widths different among the files.
+    /* Names of variables whose types differ among the files.
         It's OK if they're all dropped, but not otherwise. */
      struct stringi_set different_types;
  
@@ -277,6 +293,41 @@ combine_files (enum comb_command_type command,
        merge_dictionary (&proc, file);
      }
  
+  for (size_t i = 0; i < dict_get_n_vars (proc.dict); i++)
+    {
+      const struct variable *mv = dict_get_var (proc.dict, i);
+      const char *name = var_get_name (mv);
+      int mw = var_get_width (mv);
+      if (!mw || stringi_set_contains (&proc.different_types, name))
+        continue;
+
+      for (size_t j = 0; j < proc.n_files; j++)
+        {
+          struct comb_file *cf = &proc.files[j];
+          struct variable *dv = dict_lookup_var (cf->dict, name);
+          if (!dv)
+            continue;
+
+          int dw = var_get_width (dv);
+          assert (dw <= mw);
+          if (dw < mw)
+            {
+              struct comb_resizer *r = cf->resizer;
+              if (!r)
+                {
+                  r = xmalloc (sizeof *r);
+                  *r = (struct comb_resizer) {
+                    .output_proto = caseproto_ref (dict_get_proto (proc.dict)),
+                    .indexes = xnmalloc (dict_get_n_vars (cf->dict),
+                                         sizeof *r->indexes),
+                  };
+                  cf->resizer = r;
+                }
+              r->indexes[r->n_indexes++] = var_get_dict_index (dv);
+            }
+        }
+    }
+
    while (lex_token (lexer) != T_ENDCMD)
      {
        if (lex_match (lexer, T_BY))
@@ -316,8 +367,9 @@ combine_files (enum comb_command_type command,
                    const char *name = var_get_name (by_vars[j]);
                    struct variable *var = dict_lookup_var (file->dict, name);
                    if (var != NULL)
-                    subcase_add_var (&file->by_vars, var,
-                                     subcase_get_direction (&proc.by_vars, j));
+                    subcase_add (&file->by_vars, var_get_dict_index (var),
+                                 subcase_get_width (&proc.by_vars, j),
+                                 subcase_get_direction (&proc.by_vars, j));
                    else
                      {
                        const char *fn
@@ -491,6 +543,11 @@ combine_files (enum comb_command_type command,
            else
              file->reader = casereader_clone (active_file);
          }
+      if (file->resizer)
+        {
+          file->reader = comb_resize (file->reader, file->resizer);
+          file->resizer = NULL;
+        }
        if (!file->is_sorted)
          file->reader = sort_execute (file->reader, &file->by_vars);
        taint_propagate (casereader_get_taint (file->reader), taint);
@@ -580,8 +637,11 @@ merge_dictionary (struct comb_proc *proc, struct comb_file *f)
  
        if (!mv)
          mv = dict_clone_var_assert (m, dv);
-      else if (var_get_width (mv) == var_get_width (dv))
+      else if (var_get_type (mv) == var_get_type (dv))
          {
+          if (var_get_width (dv) > var_get_width (mv))
+            var_set_width (mv, var_get_width (dv));
+
            if (var_has_value_labels (dv) && !var_has_value_labels (mv))
              var_set_value_labels (mv, var_get_value_labels (dv));
            if (var_has_missing_values (dv) && !var_has_missing_values (mv))
@@ -598,8 +658,7 @@ static void
  different_types_error (struct comb_proc *proc,
                         struct lexer *lexer, const char *var_name)
  {
-  msg (SE, _("Variable %s has different type or width in different "
-             "files."), var_name);
+  msg (SE, _("Variable %s has different types in different files."), var_name);
    for (size_t i = 0; i < proc->n_files; i++)
      {
        const struct comb_file *ef = &proc->files[i];
@@ -614,8 +673,7 @@ different_types_error (struct comb_proc *proc,
                       fn, var_name);
        else
          lex_ofs_msg (lexer, SN, ef->start_ofs, ef->end_ofs,
-                     _("In file %s, %s is a string with width %d."),
-                     fn, var_name, var_get_width (ev));
+                     _("In file %s, %s is a string."), fn, var_name);
      }
  }
  
@@ -707,6 +765,7 @@ close_all_comb_files (struct comb_proc *proc)
        dict_unref (file->dict);
        casereader_destroy (file->reader);
        case_unref (file->data);
+      comb_resizer_destroy (file->resizer);
        free (file->in_name);
      }
    free (proc->files);
@@ -1000,3 +1059,56 @@ output_buffered_case (struct comb_proc *proc)
        proc->buffered_case = NULL;
      }
  }
+\f
+static void
+comb_resizer_destroy (struct comb_resizer *r)
+{
+  if (!r)
+    return;
+
+  caseproto_unref (r->output_proto);
+  free (r->indexes);
+  free (r);
+}
+
+static struct ccase *
+comb_resize_translate (struct ccase *c, void *r_)
+{
+  struct comb_resizer *r = r_;
+
+  c = case_unshare (c);
+
+  for (size_t i = 0; i < r->n_indexes; i++)
+    {
+      int idx = r->indexes[i];
+      value_resize (&c->values[idx],
+                    caseproto_get_width (c->proto, idx),
+                    caseproto_get_width (r->output_proto, idx));
+    }
+
+  caseproto_unref (c->proto);
+  c->proto = caseproto_ref (r->output_proto);
+
+  return c;
+}
+
+static bool
+comb_resizer_translate_destroy (void *r_)
+{
+  struct comb_resizer *r = r_;
+
+  comb_resizer_destroy (r);
+  return true;
+}
+
+static struct casereader *
+comb_resize (struct casereader *subreader, struct comb_resizer *r)
+{
+  static struct casereader_translator_class class = {
+    .translate = comb_resize_translate,
+    .destroy = comb_resizer_translate_destroy,
+  };
+
+  return casereader_translate_stateless (subreader, r->output_proto,
+                                         &class, r);
+}
diff --git a/tests/language/commands/match-files.at b/tests/language/commands/match-files.at

index 7e7aee352b154ceba086b8f41497ccbeeaf42217..3341ce79f1c0909fd6b9a47e4d5a8c377d74e0fa 100644 (file)
--- a/tests/language/commands/match-files.at
+++ b/tests/language/commands/match-files.at
@@ -215,11 +215,7 @@ x,y,j
  ])
  AT_CLEANUP
  
-dnl Tests for a bug that caused MATCH FILES to crash
-dnl with incompatible variables, especially but not
-dnl exclusively when one variable came from the active
-dnl file.
-AT_SETUP([MATCH FILES with incompatible variable types])
+AT_SETUP([MATCH FILES with different string widths])
  AT_DATA([match-files.sps], [dnl
  DATA LIST LIST NOTABLE/name (A6) x.
  BEGIN DATA.
@@ -238,18 +234,13 @@ END DATA.
  MATCH FILES/FILE='x.sav'/FILE=*/BY name.
  LIST.
  ])
-AT_CHECK([pspp -O format=csv match-files.sps], [1], [dnl
-match-files.sps:15: error: MATCH FILES: Variable name has different type or width in different files.
-
-"match-files.sps:15.13-15.24: note: MATCH FILES: In file `x.sav', name is a string with width 6.
-   15 | MATCH FILES/FILE='x.sav'/FILE=*/BY name.
-      |             ^~~~~~~~~~~~"
-
-"match-files.sps:15.26-15.31: note: MATCH FILES: In file *, name is a string with width 7.
-   15 | MATCH FILES/FILE='x.sav'/FILE=*/BY name.
-      |                          ^~~~~~"
-
-match-files.sps:16: error: Stopping syntax file processing here to avoid a cascade of dependent command failures.
+AT_CHECK([pspp -O format=csv match-files.sps], [0], [dnl
+Table: Data List
+name,x,y
+al,7.00,1.00
+brad,8.00,.  @&t@
+carl,9.00,2.00
+dan,.  ,3.00
  ])
  AT_CLEANUP
  
@@ -277,7 +268,7 @@ al,1
  carl,2
  dan,3
  END DATA.
-MATCH FILES/FILE='x.sav'/FILE=*/BY name.
+MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=x)/BY x.
  MATCH FILES/FILE='x.sav'/IN=**.
  MATCH FILES/FILE='x.sav'/IN=x/IN=y.
  MATCH FILES/FILE='x.sav'/BY=x/BY=y.
@@ -299,9 +290,10 @@ MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=name2)/LAST=x.
  MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=name2)/IN=x.
  MATCH FILES/KEEP=**.
  
-* The different 'name' variables are a problem, but not if we drop them.
-MATCH FILES/FILE='x.sav'/FILE=*/RENAME(y=x).
-MATCH FILES/FILE='x.sav'/FILE=*/RENAME(y=x)/DROP=name.
+* It's a problem to have different types and the same name,
+but not if we drop them.
+MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=x).
+MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=x)/DROP=x.
  ])
  AT_CHECK([pspp --testing-mode -O format=csv insert.sps], [1], [dnl
  "match-files.sps:1.18: error: MATCH FILES: Cannot specify the active dataset since none has been defined.
@@ -312,14 +304,14 @@ AT_CHECK([pspp --testing-mode -O format=csv insert.sps], [1], [dnl
     12 | MATCH FILES/FILE=*.
        |                  ^"
  
-match-files.sps:20: error: MATCH FILES: Variable name has different type or width in different files.
+match-files.sps:20: error: MATCH FILES: Variable x has different types in different files.
  
-"match-files.sps:20.13-20.24: note: MATCH FILES: In file `x.sav', name is a string with width 6.
-   20 | MATCH FILES/FILE='x.sav'/FILE=*/BY name.
+"match-files.sps:20.13-20.24: note: MATCH FILES: In file `x.sav', x is numeric.
+   20 | MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=x)/BY x.
        |             ^~~~~~~~~~~~"
  
-"match-files.sps:20.26-20.31: note: MATCH FILES: In file *, name is a string with width 7.
-   20 | MATCH FILES/FILE='x.sav'/FILE=*/BY name.
+"match-files.sps:20.26-20.31: note: MATCH FILES: In file *, x is a string.
+   20 | MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=x)/BY x.
        |                          ^~~~~~"
  
  "match-files.sps:21.29-21.30: error: MATCH FILES: Syntax error expecting identifier.
@@ -402,14 +394,14 @@ match-files.sps:20: error: MATCH FILES: Variable name has different type or widt
     40 | MATCH FILES/KEEP=**.
        |             ^~~~"
  
-match-files.sps:43: error: MATCH FILES: Variable name has different type or width in different files.
+match-files.sps:44: error: MATCH FILES: Variable x has different types in different files.
  
-"match-files.sps:43.13-43.24: note: MATCH FILES: In file `x.sav', name is a string with width 6.
-   43 | MATCH FILES/FILE='x.sav'/FILE=*/RENAME(y=x).
+"match-files.sps:44.13-44.24: note: MATCH FILES: In file `x.sav', x is numeric.
+   44 | MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=x).
        |             ^~~~~~~~~~~~"
  
-"match-files.sps:43.26-43.31: note: MATCH FILES: In file *, name is a string with width 7.
-   43 | MATCH FILES/FILE='x.sav'/FILE=*/RENAME(y=x).
+"match-files.sps:44.26-44.31: note: MATCH FILES: In file *, x is a string.
+   44 | MATCH FILES/FILE='x.sav'/FILE=*/RENAME(name=x).
        |                          ^~~~~~"
  ])
  AT_CLEANUP
author	Ben Pfaff <blp@cs.stanford.edu>
	Sat, 6 May 2023 01:01:06 +0000 (18:01 -0700)
committer	Ben Pfaff <blp@cs.stanford.edu>
	Sat, 6 May 2023 01:01:06 +0000 (18:01 -0700)
NEWS		patch \| blob \| history
doc/combining.texi		patch \| blob \| history
src/data/subcase.h		patch \| blob \| history
src/language/commands/combine-files.c		patch \| blob \| history
tests/language/commands/match-files.at		patch \| blob \| history