Change how checking for missing values works.

[pspp] / src / language / data-io / combine-files.c
diff --git a/src/language/data-io/combine-files.c b/src/language/data-io/combine-files.c

index 0f9518890984fc7971c393119782d5acf1f890ea..8322f5c03307f7e82597bcc5975a90621c0c7186 100644 (file)
--- a/src/language/data-io/combine-files.c
+++ b/src/language/data-io/combine-files.c
@@ -1,5 +1,5 @@
  /* PSPP - a program for statistical analysis.
-   Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+   Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -18,28 +18,30 @@
  
  #include <stdlib.h>
  
-#include <data/any-reader.h>
-#include <data/case-matcher.h>
-#include <data/case.h>
-#include <data/casereader.h>
-#include <data/casewriter.h>
-#include <data/dictionary.h>
-#include <data/format.h>
-#include <data/procedure.h>
-#include <data/subcase.h>
-#include <data/variable.h>
-#include <language/command.h>
-#include <language/data-io/file-handle.h>
-#include <language/data-io/trim.h>
-#include <language/lexer/lexer.h>
-#include <language/lexer/variable-parser.h>
-#include <language/stats/sort-criteria.h>
-#include <libpspp/assertion.h>
-#include <libpspp/message.h>
-#include <libpspp/taint.h>
-#include <math/sort.h>
-
-#include "xalloc.h"
+#include "data/any-reader.h"
+#include "data/case-matcher.h"
+#include "data/case.h"
+#include "data/casereader.h"
+#include "data/casewriter.h"
+#include "data/dataset.h"
+#include "data/dictionary.h"
+#include "data/format.h"
+#include "data/subcase.h"
+#include "data/variable.h"
+#include "language/command.h"
+#include "language/data-io/file-handle.h"
+#include "language/data-io/trim.h"
+#include "language/lexer/lexer.h"
+#include "language/lexer/variable-parser.h"
+#include "language/stats/sort-criteria.h"
+#include "libpspp/assertion.h"
+#include "libpspp/i18n.h"
+#include "libpspp/message.h"
+#include "libpspp/string-array.h"
+#include "libpspp/taint.h"
+#include "math/sort.h"
+
+#include "gl/xalloc.h"
  
  #include "gettext.h"
  #define _(msgid) gettext (msgid)
@@ -67,6 +69,7 @@ struct comb_file
      /* Variables. */
      struct subcase by_vars;     /* BY variables in this input file. */
      struct subcase src, dst;    /* Data to copy to output; where to put it. */
+    const struct missing_values **mv; /* Each variable's missing values. */
  
      /* Input files. */
      struct file_handle *handle; /* Input file handle. */
@@ -159,7 +162,7 @@ combine_files (enum comb_command_type command,
  
    proc.files = NULL;
    proc.n_files = 0;
-  proc.dict = dict_create ();
+  proc.dict = dict_create (get_default_encoding ());
    proc.output = NULL;
    proc.matcher = NULL;
    subcase_init_empty (&proc.by_vars);
@@ -195,6 +198,7 @@ combine_files (enum comb_command_type command,
        subcase_init_empty (&file->by_vars);
        subcase_init_empty (&file->src);
        subcase_init_empty (&file->dst);
+      file->mv = NULL;
        file->handle = NULL;
        file->dict = NULL;
        file->reader = NULL;
@@ -205,27 +209,28 @@ combine_files (enum comb_command_type command,
  
        if (lex_match (lexer, T_ASTERISK))
          {
-          if (!proc_has_active_file (ds))
+          if (!dataset_has_source (ds))
              {
-              msg (SE, _("Cannot specify the active file since no active "
-                         "file has been defined."));
+              msg (SE, _("Cannot specify the active dataset since none "
+                         "has been defined."));
                goto error;
              }
  
            if (proc_make_temporary_transformations_permanent (ds))
              msg (SE, _("This command may not be used after TEMPORARY when "
-                       "the active file is an input source.  "
+                       "the active dataset is an input source.  "
                         "Temporary transformations will be made permanent."));
  
            file->dict = dict_clone (dataset_dict (ds));
          }
        else
          {
-          file->handle = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH);
+          file->handle = fh_parse (lexer, FH_REF_FILE, dataset_session (ds));
            if (file->handle == NULL)
              goto error;
  
-          file->reader = any_reader_open (file->handle, &file->dict);
+          file->reader = any_reader_open_and_decode (file->handle, NULL,
+                                                     &file->dict, NULL);
            if (file->reader == NULL)
              goto error;
          }
@@ -233,7 +238,7 @@ combine_files (enum comb_command_type command,
        while (lex_match (lexer, T_SLASH))
          if (lex_match_id (lexer, "RENAME"))
            {
-            if (!parse_dict_rename (lexer, file->dict))
+            if (!parse_dict_rename (lexer, file->dict, false))
                goto error;
            }
          else if (lex_match_id (lexer, "IN"))
@@ -260,7 +265,8 @@ combine_files (enum comb_command_type command,
              saw_sort = true;
            }
  
-      merge_dictionary (proc.dict, file);
+      if (!merge_dictionary (proc.dict, file))
+        goto error;
      }
  
    while (lex_token (lexer) != T_ENDCMD)
@@ -302,7 +308,8 @@ combine_files (enum comb_command_type command,
                          msg (SE, _("File %s lacks BY variable %s."),
                               fh_get_name (file->handle), name);
                        else
-                        msg (SE, _("Active file lacks BY variable %s."), name);
+                        msg (SE, _("Active dataset lacks BY variable %s."),
+                             name);
                        ok = false;
                      }
                  }
@@ -373,7 +380,7 @@ combine_files (enum comb_command_type command,
      {
        if (command == COMB_UPDATE)
          {
-          msg (SE, _("The BY subcommand is required."));
+          lex_sbc_missing ("BY");
            goto error;
          }
        if (n_tables)
@@ -407,16 +414,19 @@ combine_files (enum comb_command_type command,
    for (i = 0; i < proc.n_files; i++)
      {
        struct comb_file *file = &proc.files[i];
-      size_t src_var_cnt = dict_get_var_cnt (file->dict);
+      size_t src_n_vars = dict_get_n_vars (file->dict);
        size_t j;
  
-      for (j = 0; j < src_var_cnt; j++)
+      file->mv = xnmalloc (src_n_vars, sizeof *file->mv);
+      for (j = 0; j < src_n_vars; j++)
          {
            struct variable *src_var = dict_get_var (file->dict, j);
            struct variable *dst_var = dict_lookup_var (proc.dict,
                                                        var_get_name (src_var));
            if (dst_var != NULL)
              {
+              size_t n = subcase_get_n_fields (&file->src);
+              file->mv[n] = var_get_missing_values (src_var);
                subcase_add_var (&file->src, src_var, SC_ASCEND);
                subcase_add_var (&file->dst, dst_var, SC_ASCEND);
              }
@@ -436,7 +446,7 @@ combine_files (enum comb_command_type command,
            if (active_file == NULL)
              {
                proc_discard_output (ds);
-              file->reader = active_file = proc_open (ds);
+              file->reader = active_file = proc_open_filtering (ds, false);
              }
            else
              file->reader = casereader_clone (active_file);
@@ -465,7 +475,8 @@ combine_files (enum comb_command_type command,
    if (active_file != NULL)
      proc_commit (ds);
  
-  proc_set_active_file (ds, casewriter_make_reader (proc.output), proc.dict);
+  dataset_set_dict (ds, proc.dict);
+  dataset_set_source (ds, casewriter_make_reader (proc.output));
    proc.dict = NULL;
    proc.output = NULL;
  
@@ -491,9 +502,8 @@ static bool
  merge_dictionary (struct dictionary *const m, struct comb_file *f)
  {
    struct dictionary *d = f->dict;
-  const char *d_docs, *m_docs;
+  const struct string_array *d_docs, *m_docs;
    int i;
-  const char *file_encoding;
  
    if (dict_get_label (m) == NULL)
      dict_set_label (m, dict_get_label (d));
@@ -507,17 +517,9 @@ merge_dictionary (struct dictionary *const m, struct comb_file *f)
       The correct thing to do would be to convert to an encoding
       which can cope with all the input files (eg UTF-8).
     */
-  file_encoding = dict_get_encoding (f->dict);
-  if ( file_encoding != NULL)
-    {
-      if ( dict_get_encoding (m) == NULL)
-       dict_set_encoding (m, file_encoding);
-      else if ( 0 != strcmp (file_encoding, dict_get_encoding (m)))
-       {
-         msg (MW,
-              _("Combining files with incompatible encodings. String data may not be represented correctly."));
-       }
-    }
+  if (0 != strcmp (dict_get_encoding (f->dict), dict_get_encoding (m)))
+    msg (MW, _("Combining files with incompatible encodings. String data may "
+               "not be represented correctly."));
  
    if (d_docs != NULL)
      {
@@ -525,13 +527,23 @@ merge_dictionary (struct dictionary *const m, struct comb_file *f)
          dict_set_documents (m, d_docs);
        else
          {
-          char *new_docs = xasprintf ("%s%s", m_docs, d_docs);
-          dict_set_documents (m, new_docs);
-          free (new_docs);
+          struct string_array new_docs;
+          size_t i;
+
+          new_docs.n = m_docs->n + d_docs->n;
+          new_docs.strings = xmalloc (new_docs.n * sizeof *new_docs.strings);
+          for (i = 0; i < m_docs->n; i++)
+            new_docs.strings[i] = m_docs->strings[i];
+          for (i = 0; i < d_docs->n; i++)
+            new_docs.strings[m_docs->n + i] = d_docs->strings[i];
+
+          dict_set_documents (m, &new_docs);
+
+          free (new_docs.strings);
          }
      }
  
-  for (i = 0; i < dict_get_var_cnt (d); i++)
+  for (i = 0; i < dict_get_n_vars (d); i++)
      {
        struct variable *dv = dict_get_var (d, i);
        struct variable *mv = dict_lookup_var (m, var_get_name (dv));
@@ -544,8 +556,10 @@ merge_dictionary (struct dictionary *const m, struct comb_file *f)
            if (var_get_width (mv) != var_get_width (dv))
              {
                const char *var_name = var_get_name (dv);
-              const char *file_name = fh_get_name (f->handle);
                struct string s = DS_EMPTY_INITIALIZER;
+              const char *file_name;
+
+              file_name = f->handle ? fh_get_name (f->handle) : "*";
                ds_put_format (&s,
                               _("Variable %s in file %s has different "
                                 "type or width from the same variable in "
@@ -628,8 +642,9 @@ close_all_comb_files (struct comb_proc *proc)
        subcase_destroy (&file->by_vars);
        subcase_destroy (&file->src);
        subcase_destroy (&file->dst);
+      free (file->mv);
        fh_unref (file->handle);
-      dict_destroy (file->dict);
+      dict_unref (file->dict);
        casereader_destroy (file->reader);
        case_unref (file->data);
        free (file->in_name);
@@ -644,7 +659,7 @@ static void
  free_comb_proc (struct comb_proc *proc)
  {
    close_all_comb_files (proc);
-  dict_destroy (proc->dict);
+  dict_unref (proc->dict);
    casewriter_destroy (proc->output);
    case_matcher_destroy (proc->matcher);
    if (proc->prev_BY)
@@ -660,8 +675,8 @@ free_comb_proc (struct comb_proc *proc)
  static bool scan_table (struct comb_file *, union value by[]);
  static struct ccase *create_output_case (const struct comb_proc *);
  static void apply_case (const struct comb_file *, struct ccase *);
-static void apply_file_case_and_advance (struct comb_file *, struct ccase *,
-                                         union value by[]);
+static void apply_nonmissing_case (const struct comb_file *, struct ccase *);
+static void advance_file (struct comb_file *, union value by[]);
  static void output_case (struct comb_proc *, struct ccase *, union value by[]);
  static void output_buffered_case (struct comb_proc *);
  
@@ -681,7 +696,8 @@ execute_add_files (struct comb_proc *proc)
            while (file->is_minimal)
              {
                struct ccase *output = create_output_case (proc);
-              apply_file_case_and_advance (file, output, by);
+              apply_case (file, output);
+              advance_file (file, by);
                output_case (proc, output, by);
              }
          }
@@ -701,13 +717,16 @@ execute_match_files (struct comb_proc *proc)
        size_t i;
  
        output = create_output_case (proc);
-      for (i = proc->n_files; i-- > 0; )
+      for (i = proc->n_files; i-- > 0;)
          {
            struct comb_file *file = &proc->files[i];
            if (file->type == COMB_FILE)
              {
                if (file->is_minimal)
-                apply_file_case_and_advance (file, output, NULL);
+                {
+                  apply_case (file, output);
+                  advance_file (file, NULL);
+                }
              }
            else
              {
@@ -738,7 +757,8 @@ execute_update (struct comb_proc *proc)
        for (first = &proc->files[0]; ; first++)
          if (first->is_minimal)
            break;
-      apply_file_case_and_advance (first, output, by);
+      apply_case (first, output);
+      advance_file (first, by);
  
        /* Read additional cases and update the output case from
           them.  (Don't update the output case from any duplicate
@@ -747,7 +767,10 @@ execute_update (struct comb_proc *proc)
             file < &proc->files[proc->n_files]; file++)
          {
            while (file->is_minimal)
-            apply_file_case_and_advance (file, output, by);
+            {
+              apply_nonmissing_case (file, output);
+              advance_file (file, by);
+            }
          }
        casewriter_write (proc->output, output);
  
@@ -759,7 +782,8 @@ execute_update (struct comb_proc *proc)
            while (first->is_minimal)
              {
                output = create_output_case (proc);
-              apply_file_case_and_advance (first, output, by);
+              apply_case (first, output);
+              advance_file (first, by);
                casewriter_write (proc->output, output);
              }
          }
@@ -797,7 +821,7 @@ scan_table (struct comb_file *file, union value by[])
  static struct ccase *
  create_output_case (const struct comb_proc *proc)
  {
-  size_t n_vars = dict_get_var_cnt (proc->dict);
+  size_t n_vars = dict_get_n_vars (proc->dict);
    struct ccase *output;
    size_t i;
  
@@ -811,30 +835,58 @@ create_output_case (const struct comb_proc *proc)
      {
        struct comb_file *file = &proc->files[i];
        if (file->in_var != NULL)
-        case_data_rw (output, file->in_var)->f = false;
+        *case_num_rw (output, file->in_var) = false;
      }
    return output;
  }
  
+static void
+mark_file_used (const struct comb_file *file, struct ccase *output)
+{
+  if (file->in_var != NULL)
+    *case_num_rw (output, file->in_var) = true;
+}
+
  /* Copies the data from FILE's case into output case OUTPUT.
     If FILE has an IN variable, then it is set to 1 in OUTPUT. */
  static void
  apply_case (const struct comb_file *file, struct ccase *output)
  {
    subcase_copy (&file->src, file->data, &file->dst, output);
-  if (file->in_var != NULL)
-    case_data_rw (output, file->in_var)->f = true;
+  mark_file_used (file, output);
+}
+
+/* Copies the data from FILE's case into output case OUTPUT,
+   skipping values that are missing or all spaces.
+
+   If FILE has an IN variable, then it is set to 1 in OUTPUT. */
+static void
+apply_nonmissing_case (const struct comb_file *file, struct ccase *output)
+{
+  size_t i;
+
+  for (i = 0; i < subcase_get_n_fields (&file->src); i++)
+    {
+      const struct subcase_field *src_field = &file->src.fields[i];
+      const struct subcase_field *dst_field = &file->dst.fields[i];
+      const union value *src_value
+        = case_data_idx (file->data, src_field->case_index);
+      int width = src_field->width;
+
+      if (!mv_is_value_missing (file->mv[i], src_value)
+          && !(width > 0 && value_is_spaces (src_value, width)))
+        value_copy (case_data_rw_idx (output, dst_field->case_index),
+                    src_value, width);
+    }
+  mark_file_used (file, output);
  }
  
-/* Like apply_case() above, but also advances FILE to its next
-   case.  Also, if BY is nonnull, then FILE's is_minimal member
-   is updated based on whether the new case's BY values still
-   match those in BY. */
+/* Advances FILE to its next case.  If BY is nonnull, then FILE's is_minimal
+   member is updated based on whether the new case's BY values still match
+   those in BY. */
  static void
-apply_file_case_and_advance (struct comb_file *file, struct ccase *output,
-                             union value by[])
+advance_file (struct comb_file *file, union value by[])
  {
-  apply_case (file, output);
    case_unref (file->data);
    file->data = casereader_read (file->reader);
    if (by)
@@ -861,7 +913,7 @@ output_case (struct comb_proc *proc, struct ccase *output, union value by[])
          {
            new_BY = !subcase_equal_xx (&proc->by_vars, proc->prev_BY, by);
            if (proc->last != NULL)
-            case_data_rw (proc->buffered_case, proc->last)->f = new_BY;
+            *case_num_rw (proc->buffered_case, proc->last) = new_BY;
            casewriter_write (proc->output, proc->buffered_case);
          }
        else
@@ -869,7 +921,7 @@ output_case (struct comb_proc *proc, struct ccase *output, union value by[])
  
        proc->buffered_case = output;
        if (proc->first != NULL)
-        case_data_rw (proc->buffered_case, proc->first)->f = new_BY;
+        *case_num_rw (proc->buffered_case, proc->first) = new_BY;
  
        if (new_BY)
          {
@@ -894,7 +946,7 @@ output_buffered_case (struct comb_proc *proc)
    if (proc->prev_BY != NULL)
      {
        if (proc->last != NULL)
-        case_data_rw (proc->buffered_case, proc->last)->f = 1.0;
+        *case_num_rw (proc->buffered_case, proc->last) = 1.0;
        casewriter_write (proc->output, proc->buffered_case);
        proc->buffered_case = NULL;
      }