X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fdata-io%2Fcombine-files.c;h=b95683ef8c752867cb26dd6bba6f1ac6e98f756f;hb=d4f19dd9241b87b0b330daf674ed90d767b44822;hp=1a82ef3f1bf61bfcc6ce1e9a2559237f37fcc37d;hpb=99e37c4d062ac23f89070b578f28eb6d49eec632;p=pspp

diff --git a/src/language/data-io/combine-files.c b/src/language/data-io/combine-files.c
index 1a82ef3f1b..b95683ef8c 100644
--- a/src/language/data-io/combine-files.c
+++ b/src/language/data-io/combine-files.c
@@ -1,5 +1,5 @@
 /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009 Free Software Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc.
 
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -18,28 +18,31 @@
 
 #include <stdlib.h>
 
-#include <data/any-reader.h>
-#include <data/case-matcher.h>
-#include <data/case.h>
-#include <data/casereader.h>
-#include <data/casewriter.h>
-#include <data/dictionary.h>
-#include <data/format.h>
-#include <data/procedure.h>
-#include <data/subcase.h>
-#include <data/variable.h>
-#include <language/command.h>
-#include <language/data-io/file-handle.h>
-#include <language/data-io/trim.h>
-#include <language/lexer/lexer.h>
-#include <language/lexer/variable-parser.h>
-#include <language/stats/sort-criteria.h>
-#include <libpspp/assertion.h>
-#include <libpspp/message.h>
-#include <libpspp/taint.h>
-#include <math/sort.h>
-
-#include "xalloc.h"
+#include "data/any-reader.h"
+#include "data/case-matcher.h"
+#include "data/case.h"
+#include "data/casereader.h"
+#include "data/casewriter.h"
+#include "data/dataset.h"
+#include "data/dictionary.h"
+#include "data/format.h"
+#include "data/subcase.h"
+#include "data/variable.h"
+#include "language/command.h"
+#include "language/data-io/file-handle.h"
+#include "language/data-io/trim.h"
+#include "language/lexer/lexer.h"
+#include "language/lexer/variable-parser.h"
+#include "language/stats/sort-criteria.h"
+#include "libpspp/assertion.h"
+#include "libpspp/i18n.h"
+#include "libpspp/message.h"
+#include "libpspp/string-array.h"
+#include "libpspp/taint.h"
+#include "math/sort.h"
+
+#include "gl/minmax.h"
+#include "gl/xalloc.h"
 
 #include "gettext.h"
 #define _(msgid) gettext (msgid)
@@ -63,10 +66,12 @@ struct comb_file
   {
     /* Basics. */
     enum comb_file_type type;   /* COMB_FILE or COMB_TABLE. */
+    int start_ofs, end_ofs;     /* Lexer offsets. */
 
     /* Variables. */
     struct subcase by_vars;     /* BY variables in this input file. */
     struct subcase src, dst;    /* Data to copy to output; where to put it. */
+    const struct missing_values **mv; /* Each variable's missing values. */
 
     /* Input files. */
     struct file_handle *handle; /* Input file handle. */
@@ -78,7 +83,8 @@ struct comb_file
     bool is_sorted;             /* Is file presorted on the BY variables? */
 
     /* IN subcommand. */
-    char in_name[VAR_NAME_LEN + 1];
+    char *in_name;
+    int in_ofs;
     struct variable *in_var;
   };
 
@@ -91,6 +97,9 @@ struct comb_proc
     struct subcase by_vars;     /* BY variables in the output. */
     struct casewriter *output;  /* Destination for output. */
 
+    size_t *var_sources;
+    size_t n_var_sources, allocated_var_sources;
+
     struct case_matcher *matcher;
 
     /* FIRST, LAST.
@@ -108,13 +117,15 @@ static int combine_files (enum comb_command_type, struct lexer *,
 static void free_comb_proc (struct comb_proc *);
 
 static void close_all_comb_files (struct comb_proc *);
-static bool merge_dictionary (struct dictionary *const, struct comb_file *);
+static bool merge_dictionary (struct comb_proc *, struct lexer *,
+                              struct comb_file *);
 
 static void execute_update (struct comb_proc *);
 static void execute_match_files (struct comb_proc *);
 static void execute_add_files (struct comb_proc *);
 
-static bool create_flag_var (const char *subcommand_name, const char *var_name,
+static bool create_flag_var (struct lexer *lexer, const char *subcommand_name,
+                             const char *var_name, int var_ofs,
                              struct dictionary *, struct variable **);
 static void output_case (struct comb_proc *, struct ccase *, union value *by);
 static void output_buffered_case (struct comb_proc *);
@@ -141,155 +152,141 @@ static int
 combine_files (enum comb_command_type command,
                struct lexer *lexer, struct dataset *ds)
 {
-  struct comb_proc proc;
+  struct comb_proc proc = {
+    .dict = dict_create (get_default_encoding ()),
+  };
 
   bool saw_by = false;
   bool saw_sort = false;
   struct casereader *active_file = NULL;
 
-  char first_name[VAR_NAME_LEN + 1] = "";
-  char last_name[VAR_NAME_LEN + 1] = "";
+  char *first_name = NULL;
+  int first_ofs = 0;
+  char *last_name = NULL;
+  int last_ofs = 0;
 
   struct taint *taint = NULL;
 
-  size_t n_tables = 0;
+  size_t table_idx = SIZE_MAX;
+  int sort_ofs = INT_MAX;
   size_t allocated_files = 0;
 
-  size_t i;
-
-  proc.files = NULL;
-  proc.n_files = 0;
-  proc.dict = dict_create ();
-  proc.output = NULL;
-  proc.matcher = NULL;
-  subcase_init_empty (&proc.by_vars);
-  proc.first = NULL;
-  proc.last = NULL;
-  proc.buffered_case = NULL;
-  proc.prev_BY = NULL;
-
   dict_set_case_limit (proc.dict, dict_get_case_limit (dataset_dict (ds)));
 
-  lex_match (lexer, '/');
+  lex_match (lexer, T_SLASH);
   for (;;)
     {
-      struct comb_file *file;
+      int start_ofs = lex_ofs (lexer);
       enum comb_file_type type;
-
       if (lex_match_id (lexer, "FILE"))
         type = COMB_FILE;
       else if (command == COMB_MATCH && lex_match_id (lexer, "TABLE"))
         {
           type = COMB_TABLE;
-          n_tables++;
+          table_idx = MIN (table_idx, proc.n_files);
         }
       else
         break;
-      lex_match (lexer, '=');
+      lex_match (lexer, T_EQUALS);
 
       if (proc.n_files >= allocated_files)
         proc.files = x2nrealloc (proc.files, &allocated_files,
                                 sizeof *proc.files);
-      file = &proc.files[proc.n_files++];
-      file->type = type;
-      subcase_init_empty (&file->by_vars);
-      subcase_init_empty (&file->src);
-      subcase_init_empty (&file->dst);
-      file->handle = NULL;
-      file->dict = NULL;
-      file->reader = NULL;
-      file->data = NULL;
-      file->is_sorted = true;
-      file->in_name[0] = '\0';
-      file->in_var = NULL;
-
-      if (lex_match (lexer, '*'))
+      struct comb_file *file = &proc.files[proc.n_files++];
+      *file = (struct comb_file) {
+        .type = type,
+        .start_ofs = start_ofs,
+        .is_sorted = true,
+      };
+
+      if (lex_match (lexer, T_ASTERISK))
         {
-          if (!proc_has_active_file (ds))
+          if (!dataset_has_source (ds))
             {
-              msg (SE, _("Cannot specify the active file since no active "
-                         "file has been defined."));
+              lex_next_error (lexer, -1, -1,
+                              _("Cannot specify the active dataset since none "
+                                "has been defined."));
               goto error;
             }
 
           if (proc_make_temporary_transformations_permanent (ds))
-            msg (SE, _("This command may not be used after TEMPORARY when "
-                       "the active file is an input source.  "
-                       "Temporary transformations will be made permanent."));
+            lex_next_error (lexer, -1, -1,
+                            _("This command may not be used after TEMPORARY "
+                              "when the active dataset is an input source.  "
+                              "Temporary transformations will be made "
+                              "permanent."));
 
           file->dict = dict_clone (dataset_dict (ds));
         }
       else
         {
-          file->handle = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH);
+          file->handle = fh_parse (lexer, FH_REF_FILE, dataset_session (ds));
           if (file->handle == NULL)
             goto error;
 
-          file->reader = any_reader_open (file->handle, &file->dict);
+          file->reader = any_reader_open_and_decode (file->handle, NULL,
+                                                     &file->dict, NULL);
           if (file->reader == NULL)
             goto error;
         }
+      file->end_ofs = lex_ofs (lexer) - 1;
 
-      while (lex_match (lexer, '/'))
+      while (lex_match (lexer, T_SLASH))
         if (lex_match_id (lexer, "RENAME"))
           {
-            if (!parse_dict_rename (lexer, file->dict))
+            if (!parse_dict_rename (lexer, file->dict, false))
               goto error;
           }
         else if (lex_match_id (lexer, "IN"))
           {
-            lex_match (lexer, '=');
-            if (lex_token (lexer) != T_ID)
-              {
-                lex_error (lexer, NULL);
-                goto error;
-              }
+            lex_match (lexer, T_EQUALS);
+            if (!lex_force_id (lexer))
+              goto error;
 
-            if (file->in_name[0])
+            if (file->in_name)
               {
-                msg (SE, _("Multiple IN subcommands for a single FILE or "
-                           "TABLE."));
+                lex_error (lexer, _("Multiple IN subcommands for a single FILE "
+                                    "or TABLE."));
                 goto error;
               }
-            strcpy (file->in_name, lex_tokid (lexer));
+            file->in_name = xstrdup (lex_tokcstr (lexer));
+            file->in_ofs = lex_ofs (lexer);
             lex_get (lexer);
           }
         else if (lex_match_id (lexer, "SORT"))
           {
             file->is_sorted = false;
             saw_sort = true;
+            sort_ofs = MIN (sort_ofs, lex_ofs (lexer) - 1);
           }
 
-      merge_dictionary (proc.dict, file);
+      if (!merge_dictionary (&proc, lexer, file))
+        goto error;
     }
 
-  while (lex_token (lexer) != '.')
+  while (lex_token (lexer) != T_ENDCMD)
     {
       if (lex_match (lexer, T_BY))
 	{
-          const struct variable **by_vars;
-          size_t i;
-          bool ok;
-
-	  if (saw_by)
+          if (saw_by)
 	    {
-              lex_sbc_only_once ("BY");
+              lex_sbc_only_once (lexer, "BY");
 	      goto error;
 	    }
           saw_by = true;
 
-	  lex_match (lexer, '=');
+	  lex_match (lexer, T_EQUALS);
+
+          const struct variable **by_vars;
           if (!parse_sort_criteria (lexer, proc.dict, &proc.by_vars,
                                     &by_vars, NULL))
 	    goto error;
 
-          ok = true;
-          for (i = 0; i < proc.n_files; i++)
+          bool ok = true;
+          for (size_t i = 0; i < proc.n_files; i++)
             {
               struct comb_file *file = &proc.files[i];
-              size_t j;
-
-              for (j = 0; j < subcase_get_n_values (&proc.by_vars); j++)
+              for (size_t j = 0; j < subcase_get_n_fields (&proc.by_vars); j++)
                 {
                   const char *name = var_get_name (by_vars[j]);
                   struct variable *var = dict_lookup_var (file->dict, name);
@@ -298,11 +295,11 @@ combine_files (enum comb_command_type command,
                                      subcase_get_direction (&proc.by_vars, j));
                   else
                     {
-                      if (file->handle != NULL)
-                        msg (SE, _("File %s lacks BY variable %s."),
-                             fh_get_name (file->handle), name);
-                      else
-                        msg (SE, _("Active file lacks BY variable %s."), name);
+                      const char *fn
+                        = file->handle ? fh_get_name (file->handle) : "*";
+                      lex_ofs_error (lexer, file->start_ofs, file->end_ofs,
+                                     _("File %s lacks BY variable %s."),
+                                     fn, name);
                       ok = false;
                     }
                 }
@@ -316,30 +313,32 @@ combine_files (enum comb_command_type command,
 	}
       else if (command != COMB_UPDATE && lex_match_id (lexer, "FIRST"))
         {
-          if (first_name[0] != '\0')
+          if (first_name != NULL)
             {
-              lex_sbc_only_once ("FIRST");
+              lex_sbc_only_once (lexer, "FIRST");
               goto error;
             }
 
-	  lex_match (lexer, '=');
+	  lex_match (lexer, T_EQUALS);
           if (!lex_force_id (lexer))
             goto error;
-          strcpy (first_name, lex_tokid (lexer));
+          first_name = xstrdup (lex_tokcstr (lexer));
+          first_ofs = lex_ofs (lexer);
           lex_get (lexer);
         }
       else if (command != COMB_UPDATE && lex_match_id (lexer, "LAST"))
         {
-          if (last_name[0] != '\0')
+          if (last_name != NULL)
             {
-              lex_sbc_only_once ("LAST");
+              lex_sbc_only_once (lexer, "LAST");
               goto error;
             }
 
-	  lex_match (lexer, '=');
+	  lex_match (lexer, T_EQUALS);
           if (!lex_force_id (lexer))
             goto error;
-          strcpy (last_name, lex_tokid (lexer));
+          last_name = xstrdup (lex_tokcstr (lexer));
+          last_ofs = lex_ofs (lexer);
           lex_get (lexer);
         }
       else if (lex_match_id (lexer, "MAP"))
@@ -362,7 +361,7 @@ combine_files (enum comb_command_type command,
 	  goto error;
 	}
 
-      if (!lex_match (lexer, '/') && lex_token (lexer) != '.')
+      if (!lex_match (lexer, T_SLASH) && lex_token (lexer) != T_ENDCMD)
         {
           lex_end_of_command (lexer);
           goto error;
@@ -373,30 +372,34 @@ combine_files (enum comb_command_type command,
     {
       if (command == COMB_UPDATE)
         {
-          msg (SE, _("The BY subcommand is required."));
+          lex_sbc_missing (lexer, "BY");
           goto error;
         }
-      if (n_tables)
+      if (table_idx != SIZE_MAX)
         {
-          msg (SE, _("BY is required when TABLE is specified."));
+          const struct comb_file *table = &proc.files[table_idx];
+          lex_ofs_error (lexer, table->start_ofs, table->end_ofs,
+                         _("BY is required when %s is specified."), "TABLE");
           goto error;
         }
       if (saw_sort)
         {
-          msg (SE, _("BY is required when SORT is specified."));
+          lex_ofs_error (lexer, sort_ofs, sort_ofs,
+                         _("BY is required when %s is specified."), "SORT");
           goto error;
         }
     }
 
   /* Add IN, FIRST, and LAST variables to master dictionary. */
-  for (i = 0; i < proc.n_files; i++)
+  for (size_t i = 0; i < proc.n_files; i++)
     {
       struct comb_file *file = &proc.files[i];
-      if (!create_flag_var ("IN", file->in_name, proc.dict, &file->in_var))
+      if (!create_flag_var (lexer, "IN", file->in_name, file->in_ofs,
+                            proc.dict, &file->in_var))
         goto error;
     }
-  if (!create_flag_var ("FIRST", first_name, proc.dict, &proc.first)
-      || !create_flag_var ("LAST", last_name, proc.dict, &proc.last))
+  if (!create_flag_var (lexer, "FIRST", first_name, first_ofs, proc.dict, &proc.first)
+      || !create_flag_var (lexer, "LAST", last_name, last_ofs, proc.dict, &proc.last))
     goto error;
 
   dict_delete_scratch_vars (proc.dict);
@@ -404,31 +407,33 @@ combine_files (enum comb_command_type command,
 
   /* Set up mapping from each file's variables to master
      variables. */
-  for (i = 0; i < proc.n_files; i++)
+  for (size_t i = 0; i < proc.n_files; i++)
     {
       struct comb_file *file = &proc.files[i];
-      size_t src_var_cnt = dict_get_var_cnt (file->dict);
-      size_t j;
+      size_t src_n_vars = dict_get_n_vars (file->dict);
 
-      for (j = 0; j < src_var_cnt; j++)
+      file->mv = xnmalloc (src_n_vars, sizeof *file->mv);
+      for (size_t j = 0; j < src_n_vars; j++)
         {
           struct variable *src_var = dict_get_var (file->dict, j);
           struct variable *dst_var = dict_lookup_var (proc.dict,
                                                       var_get_name (src_var));
           if (dst_var != NULL)
             {
+              size_t n = subcase_get_n_fields (&file->src);
+              file->mv[n] = var_get_missing_values (src_var);
               subcase_add_var (&file->src, src_var, SC_ASCEND);
               subcase_add_var (&file->dst, dst_var, SC_ASCEND);
             }
         }
     }
 
-  proc.output = autopaging_writer_create (dict_get_next_value_idx (proc.dict));
+  proc.output = autopaging_writer_create (dict_get_proto (proc.dict));
   taint = taint_clone (casewriter_get_taint (proc.output));
 
   /* Set up case matcher. */
   proc.matcher = case_matcher_create ();
-  for (i = 0; i < proc.n_files; i++)
+  for (size_t i = 0; i < proc.n_files; i++)
     {
       struct comb_file *file = &proc.files[i];
       if (file->reader == NULL)
@@ -436,7 +441,7 @@ combine_files (enum comb_command_type command,
           if (active_file == NULL)
             {
               proc_discard_output (ds);
-              file->reader = active_file = proc_open (ds);
+              file->reader = active_file = proc_open_filtering (ds, false);
             }
           else
             file->reader = casereader_clone (active_file);
@@ -465,12 +470,16 @@ combine_files (enum comb_command_type command,
   if (active_file != NULL)
     proc_commit (ds);
 
-  proc_set_active_file (ds, casewriter_make_reader (proc.output), proc.dict);
+  dataset_set_dict (ds, proc.dict);
+  dataset_set_source (ds, casewriter_make_reader (proc.output));
   proc.dict = NULL;
   proc.output = NULL;
 
   free_comb_proc (&proc);
 
+  free (first_name);
+  free (last_name);
+
   return taint_destroy (taint) ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
 
  error:
@@ -478,52 +487,55 @@ combine_files (enum comb_command_type command,
     proc_commit (ds);
   free_comb_proc (&proc);
   taint_destroy (taint);
+  free (first_name);
+  free (last_name);
   return CMD_CASCADING_FAILURE;
 }
 
-/* Merge the dictionary for file F into master dictionary M. */
+/* Merge the dictionary for file F into master dictionary for PROC. */
 static bool
-merge_dictionary (struct dictionary *const m, struct comb_file *f)
+merge_dictionary (struct comb_proc *proc, struct lexer *lexer,
+                  struct comb_file *f)
 {
+  struct dictionary *m = proc->dict;
   struct dictionary *d = f->dict;
-  const char *d_docs, *m_docs;
-  int i;
-  const char *file_encoding;
 
   if (dict_get_label (m) == NULL)
     dict_set_label (m, dict_get_label (d));
 
-  d_docs = dict_get_documents (d);
-  m_docs = dict_get_documents (m);
-
-
-  /* If the input files have different encodings, then
+  /* FIXME: If the input files have different encodings, then
+     the result is undefined.
+     The correct thing to do would be to convert to an encoding
+     that can cope with all the input files (e.g. UTF-8).
    */
-  file_encoding = dict_get_encoding (f->dict);
-  if ( file_encoding != NULL)
-    {
-      if ( dict_get_encoding (m) == NULL)
-	dict_set_encoding (m, file_encoding);
-      else if ( 0 != strcmp (file_encoding, dict_get_encoding (m)))
-	{
-	  msg (MW,
-	       _("Combining files with incompatible encodings. String data may not be represented correctly."));
-	}
-    }
+  if (strcmp (dict_get_encoding (f->dict), dict_get_encoding (m)))
+    msg (MW, _("Combining files with incompatible encodings. String data may "
+               "not be represented correctly."));
 
-  if (d_docs != NULL)
+  const struct string_array *d_docs = dict_get_documents (d);
+  const struct string_array *m_docs = dict_get_documents (m);
+  if (d_docs)
     {
-      if (m_docs == NULL)
+      if (!m_docs)
         dict_set_documents (m, d_docs);
       else
         {
-          char *new_docs = xasprintf ("%s%s", m_docs, d_docs);
-          dict_set_documents (m, new_docs);
-          free (new_docs);
+          size_t n = m_docs->n + d_docs->n;
+          struct string_array new_docs = {
+            .strings = xmalloc (n * sizeof *new_docs.strings),
+          };
+          for (size_t i = 0; i < m_docs->n; i++)
+            new_docs.strings[new_docs.n++] = m_docs->strings[i];
+          for (size_t i = 0; i < d_docs->n; i++)
+            new_docs.strings[new_docs.n++] = d_docs->strings[i];
+
+          dict_set_documents (m, &new_docs);
+
+          free (new_docs.strings);
         }
     }
 
-  for (i = 0; i < dict_get_var_cnt (d); i++)
+  for (size_t i = 0; i < dict_get_n_vars (d); i++)
     {
       struct variable *dv = dict_get_var (d, i);
       struct variable *mv = dict_lookup_var (m, var_get_name (dv));
@@ -531,36 +543,40 @@ merge_dictionary (struct dictionary *const m, struct comb_file *f)
       if (dict_class_from_id (var_get_name (dv)) == DC_SCRATCH)
         continue;
 
-      if (mv != NULL)
+      if (!mv)
+        {
+          mv = dict_clone_var_assert (m, dv);
+          if (proc->n_var_sources >= proc->allocated_var_sources)
+            proc->var_sources = x2nrealloc (proc->var_sources,
+                                            &proc->allocated_var_sources,
+                                            sizeof *proc->var_sources);
+          proc->var_sources[proc->n_var_sources++] = f - proc->files;
+        }
+      else
         {
           if (var_get_width (mv) != var_get_width (dv))
             {
               const char *var_name = var_get_name (dv);
-              const char *file_name = fh_get_name (f->handle);
-              struct string s = DS_EMPTY_INITIALIZER;
-              ds_put_format (&s,
-                             _("Variable %s in file %s has different "
-                               "type or width from the same variable in "
-                               "earlier file."),
-                             var_name, file_name);
-              ds_put_cstr (&s, "  ");
-              if (var_is_numeric (dv))
-                ds_put_format (&s, _("In file %s, %s is numeric."),
-                               file_name, var_name);
-              else
-                ds_put_format (&s, _("In file %s, %s is a string variable "
-                                     "with width %d."),
-                               file_name, var_name, var_get_width (dv));
-              ds_put_cstr (&s, "  ");
-              if (var_is_numeric (mv))
-                ds_put_format (&s, _("In an earlier file, %s was numeric."),
-                               var_name);
-              else
-                ds_put_format (&s, _("In an earlier file, %s was a string "
-                                     "variable with width %d."),
-                               var_name, var_get_width (mv));
-              msg (SE, ds_cstr (&s));
-              ds_destroy (&s);
+              msg (SE, _("Variable %s has different type or width in different "
+                         "files."), var_name);
+
+              for (size_t j = 0; j < 2; j++)
+                {
+                  const struct variable *ev = !j ? mv : dv;
+                  const struct comb_file *ef
+                    = !j ? &proc->files[proc->var_sources[var_get_dict_index (mv)]] : f;
+                  const char *fn = ef->handle ? fh_get_name (ef->handle) : "*";
+
+                  if (var_is_numeric (ev))
+                    lex_ofs_msg (lexer, SN, ef->start_ofs, ef->end_ofs,
+                                 _("In file %s, %s is numeric."),
+                                 fn, var_name);
+                  else
+                    lex_ofs_msg (lexer, SN, ef->start_ofs, ef->end_ofs,
+                                 _("In file %s, %s is a string with width %d."),
+                                 fn, var_name, var_get_width (ev));
+                }
+
               return false;
             }
 
@@ -571,33 +587,34 @@ merge_dictionary (struct dictionary *const m, struct comb_file *f)
           if (var_get_label (dv) && !var_get_label (mv))
             var_set_label (mv, var_get_label (dv));
         }
-      else
-        mv = dict_clone_var_assert (m, dv, var_get_name (dv));
     }
 
   return true;
 }
 
-/* If VAR_NAME is a non-empty string, attempts to create a
+/* If VAR_NAME is non-NULL, attempts to create a
    variable named VAR_NAME, with format F1.0, in DICT, and stores
    a pointer to the variable in *VAR.  Returns true if
    successful, false if the variable name is a duplicate (in
    which case a message saying that the variable specified on the
-   given SUBCOMMAND is a duplicate is emitted).  Also returns
-   true, without doing anything, if VAR_NAME is null or empty. */
+   given SUBCOMMAND is a duplicate is emitted).
+
+   Does nothing and returns true if VAR_NAME is null. */
 static bool
-create_flag_var (const char *subcommand, const char *var_name,
+create_flag_var (struct lexer *lexer, const char *subcommand,
+                 const char *var_name, int var_ofs,
                  struct dictionary *dict, struct variable **var)
 {
-  if (var_name[0] != '\0')
+  if (var_name != NULL)
     {
       struct fmt_spec format = fmt_for_output (FMT_F, 1, 0);
       *var = dict_create_var (dict, var_name, 0);
       if (*var == NULL)
         {
-          msg (SE, _("Variable name %s specified on %s subcommand "
-                     "duplicates an existing variable name."),
-               subcommand, var_name);
+          lex_ofs_error (lexer, var_ofs, var_ofs,
+                         _("Variable name %s specified on %s subcommand "
+                           "duplicates an existing variable name."),
+                         var_name, subcommand);
           return false;
         }
       var_set_both_formats (*var, &format);
@@ -611,18 +628,18 @@ create_flag_var (const char *subcommand, const char *var_name,
 static void
 close_all_comb_files (struct comb_proc *proc)
 {
-  size_t i;
-
-  for (i = 0; i < proc->n_files; i++)
+  for (size_t i = 0; i < proc->n_files; i++)
     {
       struct comb_file *file = &proc->files[i];
-      subcase_destroy (&file->by_vars);
-      subcase_destroy (&file->src);
-      subcase_destroy (&file->dst);
+      subcase_uninit (&file->by_vars);
+      subcase_uninit (&file->src);
+      subcase_uninit (&file->dst);
+      free (file->mv);
       fh_unref (file->handle);
-      dict_destroy (file->dict);
+      dict_unref (file->dict);
       casereader_destroy (file->reader);
       case_unref (file->data);
+      free (file->in_name);
     }
   free (proc->files);
   proc->files = NULL;
@@ -634,19 +651,25 @@ static void
 free_comb_proc (struct comb_proc *proc)
 {
   close_all_comb_files (proc);
-  dict_destroy (proc->dict);
+  dict_unref (proc->dict);
   casewriter_destroy (proc->output);
   case_matcher_destroy (proc->matcher);
-  subcase_destroy (&proc->by_vars);
+  if (proc->prev_BY)
+    {
+      caseproto_destroy_values (subcase_get_proto (&proc->by_vars),
+                                proc->prev_BY);
+      free (proc->prev_BY);
+    }
+  subcase_uninit (&proc->by_vars);
   case_unref (proc->buffered_case);
-  free (proc->prev_BY);
+  free (proc->var_sources);
 }
 
 static bool scan_table (struct comb_file *, union value by[]);
 static struct ccase *create_output_case (const struct comb_proc *);
 static void apply_case (const struct comb_file *, struct ccase *);
-static void apply_file_case_and_advance (struct comb_file *, struct ccase *,
-                                         union value by[]);
+static void apply_nonmissing_case (const struct comb_file *, struct ccase *);
+static void advance_file (struct comb_file *, union value by[]);
 static void output_case (struct comb_proc *, struct ccase *, union value by[]);
 static void output_buffered_case (struct comb_proc *);
 
@@ -657,20 +680,17 @@ execute_add_files (struct comb_proc *proc)
   union value *by;
 
   while (case_matcher_match (proc->matcher, &by))
-    {
-      size_t i;
-
-      for (i = 0; i < proc->n_files; i++)
-        {
-          struct comb_file *file = &proc->files[i];
-          while (file->is_minimal)
-            {
-              struct ccase *output = create_output_case (proc);
-              apply_file_case_and_advance (file, output, by);
-              output_case (proc, output, by);
-            }
-        }
-    }
+    for (size_t i = 0; i < proc->n_files; i++)
+      {
+        struct comb_file *file = &proc->files[i];
+        while (file->is_minimal)
+          {
+            struct ccase *output = create_output_case (proc);
+            apply_case (file, output);
+            advance_file (file, by);
+            output_case (proc, output, by);
+          }
+      }
   output_buffered_case (proc);
 }
 
@@ -682,17 +702,17 @@ execute_match_files (struct comb_proc *proc)
 
   while (case_matcher_match (proc->matcher, &by))
     {
-      struct ccase *output;
-      size_t i;
-
-      output = create_output_case (proc);
-      for (i = proc->n_files; i-- > 0; )
+      struct ccase *output = create_output_case (proc);
+      for (size_t i = proc->n_files; i-- > 0;)
         {
           struct comb_file *file = &proc->files[i];
           if (file->type == COMB_FILE)
             {
               if (file->is_minimal)
-                apply_file_case_and_advance (file, output, NULL);
+                {
+                  apply_case (file, output);
+                  advance_file (file, NULL);
+                }
             }
           else
             {
@@ -723,7 +743,8 @@ execute_update (struct comb_proc *proc)
       for (first = &proc->files[0]; ; first++)
         if (first->is_minimal)
           break;
-      apply_file_case_and_advance (first, output, by);
+      apply_case (first, output);
+      advance_file (first, by);
 
       /* Read additional cases and update the output case from
          them.  (Don't update the output case from any duplicate
@@ -732,7 +753,10 @@ execute_update (struct comb_proc *proc)
            file < &proc->files[proc->n_files]; file++)
         {
           while (file->is_minimal)
-            apply_file_case_and_advance (file, output, by);
+            {
+              apply_nonmissing_case (file, output);
+              advance_file (file, by);
+            }
         }
       casewriter_write (proc->output, output);
 
@@ -744,7 +768,8 @@ execute_update (struct comb_proc *proc)
           while (first->is_minimal)
             {
               output = create_output_case (proc);
-              apply_file_case_and_advance (first, output, by);
+              apply_case (first, output);
+              advance_file (first, by);
               casewriter_write (proc->output, output);
             }
         }
@@ -782,44 +807,67 @@ scan_table (struct comb_file *file, union value by[])
 static struct ccase *
 create_output_case (const struct comb_proc *proc)
 {
-  size_t n_vars = dict_get_var_cnt (proc->dict);
-  struct ccase *output;
-  size_t i;
-
-  output = case_create (dict_get_next_value_idx (proc->dict));
-  for (i = 0; i < n_vars; i++)
+  size_t n_vars = dict_get_n_vars (proc->dict);
+  struct ccase *output = case_create (dict_get_proto (proc->dict));
+  for (size_t i = 0; i < n_vars; i++)
     {
       struct variable *v = dict_get_var (proc->dict, i);
       value_set_missing (case_data_rw (output, v), var_get_width (v));
     }
-  for (i = 0; i < proc->n_files; i++)
+  for (size_t i = 0; i < proc->n_files; i++)
     {
       struct comb_file *file = &proc->files[i];
       if (file->in_var != NULL)
-        case_data_rw (output, file->in_var)->f = false;
+        *case_num_rw (output, file->in_var) = false;
     }
   return output;
 }
 
+static void
+mark_file_used (const struct comb_file *file, struct ccase *output)
+{
+  if (file->in_var != NULL)     /* Only files given IN have a flag to set. */
+    *case_num_rw (output, file->in_var) = true;
+}
+
 /* Copies the data from FILE's case into output case OUTPUT.
    If FILE has an IN variable, then it is set to 1 in OUTPUT. */
 static void
 apply_case (const struct comb_file *file, struct ccase *output)
 {
   subcase_copy (&file->src, file->data, &file->dst, output);
-  if (file->in_var != NULL)
-    case_data_rw (output, file->in_var)->f = true;
+  mark_file_used (file, output);
+}
+
+/* Copies the data from FILE's case into output case OUTPUT, skipping any
+   value that FILE->mv marks as missing or that is a string of all spaces.
+
+   If FILE has an IN variable, then it is set to 1 in OUTPUT. */
+static void
+apply_nonmissing_case (const struct comb_file *file, struct ccase *output)
+{
+  for (size_t i = 0; i < subcase_get_n_fields (&file->src); i++)
+    {
+      const struct subcase_field *src_field = &file->src.fields[i];
+      const struct subcase_field *dst_field = &file->dst.fields[i];
+      const union value *src_value
+        = case_data_idx (file->data, src_field->case_index);
+      int width = src_field->width;
+
+      if (!mv_is_value_missing (file->mv[i], src_value)
+          && !(width > 0 && value_is_spaces (src_value, width)))
+        value_copy (case_data_rw_idx (output, dst_field->case_index),
+                    src_value, width);
+    }
+  mark_file_used (file, output);
+}
 
-/* Like apply_case() above, but also advances FILE to its next
-   case.  Also, if BY is nonnull, then FILE's is_minimal member
-   is updated based on whether the new case's BY values still
-   match those in BY. */
+/* Advances FILE to its next case.  If BY is nonnull, then FILE's is_minimal
+   member is updated based on whether the new case's BY values still match
+   those in BY. */
 static void
-apply_file_case_and_advance (struct comb_file *file, struct ccase *output,
-                             union value by[])
+advance_file (struct comb_file *file, union value by[])
 {
-  apply_case (file, output);
   case_unref (file->data);
   file->data = casereader_read (file->reader);
   if (by)
@@ -846,7 +894,7 @@ output_case (struct comb_proc *proc, struct ccase *output, union value by[])
         {
           new_BY = !subcase_equal_xx (&proc->by_vars, proc->prev_BY, by);
           if (proc->last != NULL)
-            case_data_rw (proc->buffered_case, proc->last)->f = new_BY;
+            *case_num_rw (proc->buffered_case, proc->last) = new_BY;
           casewriter_write (proc->output, proc->buffered_case);
         }
       else
@@ -854,15 +902,19 @@ output_case (struct comb_proc *proc, struct ccase *output, union value by[])
 
       proc->buffered_case = output;
       if (proc->first != NULL)
-        case_data_rw (proc->buffered_case, proc->first)->f = new_BY;
+        *case_num_rw (proc->buffered_case, proc->first) = new_BY;
 
       if (new_BY)
         {
-          size_t n = (subcase_get_n_values (&proc->by_vars)
-                      * sizeof (union value));
+          size_t n_values = subcase_get_n_fields (&proc->by_vars);
+          const struct caseproto *proto = subcase_get_proto (&proc->by_vars);
           if (proc->prev_BY == NULL)
-            proc->prev_BY = xmalloc (n);
-          memcpy (proc->prev_BY, by, n);
+            {
+              proc->prev_BY = xmalloc (n_values * sizeof *proc->prev_BY);
+              caseproto_init_values (proto, proc->prev_BY);
+            }
+          caseproto_copy (subcase_get_proto (&proc->by_vars), 0, n_values,
+                          proc->prev_BY, by);
         }
     }
 }
@@ -875,7 +927,7 @@ output_buffered_case (struct comb_proc *proc)
   if (proc->prev_BY != NULL)
     {
       if (proc->last != NULL)
-        case_data_rw (proc->buffered_case, proc->last)->f = 1.0;
+        *case_num_rw (proc->buffered_case, proc->last) = 1.0;
       casewriter_write (proc->output, proc->buffered_case);
       proc->buffered_case = NULL;
     }