X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fdata-io%2Fcombine-files.c;h=b95683ef8c752867cb26dd6bba6f1ac6e98f756f;hb=d4f19dd9241b87b0b330daf674ed90d767b44822;hp=1a82ef3f1bf61bfcc6ce1e9a2559237f37fcc37d;hpb=99e37c4d062ac23f89070b578f28eb6d49eec632;p=pspp diff --git a/src/language/data-io/combine-files.c b/src/language/data-io/combine-files.c index 1a82ef3f1b..b95683ef8c 100644 --- a/src/language/data-io/combine-files.c +++ b/src/language/data-io/combine-files.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,28 +18,31 @@ #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "xalloc.h" +#include "data/any-reader.h" +#include "data/case-matcher.h" +#include "data/case.h" +#include "data/casereader.h" +#include "data/casewriter.h" +#include "data/dataset.h" +#include "data/dictionary.h" +#include "data/format.h" +#include "data/subcase.h" +#include "data/variable.h" +#include "language/command.h" +#include "language/data-io/file-handle.h" +#include "language/data-io/trim.h" +#include "language/lexer/lexer.h" +#include "language/lexer/variable-parser.h" +#include "language/stats/sort-criteria.h" +#include "libpspp/assertion.h" +#include "libpspp/i18n.h" +#include "libpspp/message.h" +#include "libpspp/string-array.h" +#include "libpspp/taint.h" +#include "math/sort.h" + +#include "gl/minmax.h" +#include "gl/xalloc.h" #include "gettext.h" #define _(msgid) gettext (msgid) @@ -63,10 +66,12 @@ struct comb_file { /* Basics. */ enum comb_file_type type; /* COMB_FILE or COMB_TABLE. */ + int start_ofs, end_ofs; /* Lexer offsets. */ /* Variables. */ struct subcase by_vars; /* BY variables in this input file. */ struct subcase src, dst; /* Data to copy to output; where to put it. */ + const struct missing_values **mv; /* Each variable's missing values. */ /* Input files. */ struct file_handle *handle; /* Input file handle. */ @@ -78,7 +83,8 @@ struct comb_file bool is_sorted; /* Is file presorted on the BY variables? */ /* IN subcommand. */ - char in_name[VAR_NAME_LEN + 1]; + char *in_name; + int in_ofs; struct variable *in_var; }; @@ -91,6 +97,9 @@ struct comb_proc struct subcase by_vars; /* BY variables in the output. */ struct casewriter *output; /* Destination for output. */ + size_t *var_sources; + size_t n_var_sources, allocated_var_sources; + struct case_matcher *matcher; /* FIRST, LAST. @@ -108,13 +117,15 @@ static int combine_files (enum comb_command_type, struct lexer *, static void free_comb_proc (struct comb_proc *); static void close_all_comb_files (struct comb_proc *); -static bool merge_dictionary (struct dictionary *const, struct comb_file *); +static bool merge_dictionary (struct comb_proc *, struct lexer *, + struct comb_file *); static void execute_update (struct comb_proc *); static void execute_match_files (struct comb_proc *); static void execute_add_files (struct comb_proc *); -static bool create_flag_var (const char *subcommand_name, const char *var_name, +static bool create_flag_var (struct lexer *lexer, const char *subcommand_name, + const char *var_name, int var_ofs, struct dictionary *, struct variable **); static void output_case (struct comb_proc *, struct ccase *, union value *by); static void output_buffered_case (struct comb_proc *); @@ -141,155 +152,141 @@ static int combine_files (enum comb_command_type command, struct lexer *lexer, struct dataset *ds) { - struct comb_proc proc; + struct comb_proc proc = { + .dict = dict_create (get_default_encoding ()), + }; bool saw_by = false; bool saw_sort = false; struct casereader *active_file = NULL; - char first_name[VAR_NAME_LEN + 1] = ""; - char last_name[VAR_NAME_LEN + 1] = ""; + char *first_name = NULL; + int first_ofs = 0; + char *last_name = NULL; + int last_ofs = 0; struct taint *taint = NULL; - size_t n_tables = 0; + size_t table_idx = SIZE_MAX; + int sort_ofs = INT_MAX; size_t allocated_files = 0; - size_t i; - - proc.files = NULL; - proc.n_files = 0; - proc.dict = dict_create (); - proc.output = NULL; - proc.matcher = NULL; - subcase_init_empty (&proc.by_vars); - proc.first = NULL; - proc.last = NULL; - proc.buffered_case = NULL; - proc.prev_BY = NULL; - dict_set_case_limit (proc.dict, dict_get_case_limit (dataset_dict (ds))); - lex_match (lexer, '/'); + lex_match (lexer, T_SLASH); for (;;) { - struct comb_file *file; + int start_ofs = lex_ofs (lexer); enum comb_file_type type; - if (lex_match_id (lexer, "FILE")) type = COMB_FILE; else if (command == COMB_MATCH && lex_match_id (lexer, "TABLE")) { type = COMB_TABLE; - n_tables++; + table_idx = MIN (table_idx, proc.n_files); } else break; - lex_match (lexer, '='); + lex_match (lexer, T_EQUALS); if (proc.n_files >= allocated_files) proc.files = x2nrealloc (proc.files, &allocated_files, sizeof *proc.files); - file = &proc.files[proc.n_files++]; - file->type = type; - subcase_init_empty (&file->by_vars); - subcase_init_empty (&file->src); - subcase_init_empty (&file->dst); - file->handle = NULL; - file->dict = NULL; - file->reader = NULL; - file->data = NULL; - file->is_sorted = true; - file->in_name[0] = '\0'; - file->in_var = NULL; - - if (lex_match (lexer, '*')) + struct comb_file *file = &proc.files[proc.n_files++]; + *file = (struct comb_file) { + .type = type, + .start_ofs = start_ofs, + .is_sorted = true, + }; + + if (lex_match (lexer, T_ASTERISK)) { - if (!proc_has_active_file (ds)) + if (!dataset_has_source (ds)) { - msg (SE, _("Cannot specify the active file since no active " - "file has been defined.")); + lex_next_error (lexer, -1, -1, + _("Cannot specify the active dataset since none " + "has been defined.")); goto error; } if (proc_make_temporary_transformations_permanent (ds)) - msg (SE, _("This command may not be used after TEMPORARY when " - "the active file is an input source. " - "Temporary transformations will be made permanent.")); + lex_next_error (lexer, -1, -1, + _("This command may not be used after TEMPORARY " + "when the active dataset is an input source. " + "Temporary transformations will be made " + "permanent.")); file->dict = dict_clone (dataset_dict (ds)); } else { - file->handle = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH); + file->handle = fh_parse (lexer, FH_REF_FILE, dataset_session (ds)); if (file->handle == NULL) goto error; - file->reader = any_reader_open (file->handle, &file->dict); + file->reader = any_reader_open_and_decode (file->handle, NULL, + &file->dict, NULL); if (file->reader == NULL) goto error; } + file->end_ofs = lex_ofs (lexer) - 1; - while (lex_match (lexer, '/')) + while (lex_match (lexer, T_SLASH)) if (lex_match_id (lexer, "RENAME")) { - if (!parse_dict_rename (lexer, file->dict)) + if (!parse_dict_rename (lexer, file->dict, false)) goto error; } else if (lex_match_id (lexer, "IN")) { - lex_match (lexer, '='); - if (lex_token (lexer) != T_ID) - { - lex_error (lexer, NULL); - goto error; - } + lex_match (lexer, T_EQUALS); + if (!lex_force_id (lexer)) + goto error; - if (file->in_name[0]) + if (file->in_name) { - msg (SE, _("Multiple IN subcommands for a single FILE or " - "TABLE.")); + lex_error (lexer, _("Multiple IN subcommands for a single FILE " + "or TABLE.")); goto error; } - strcpy (file->in_name, lex_tokid (lexer)); + file->in_name = xstrdup (lex_tokcstr (lexer)); + file->in_ofs = lex_ofs (lexer); lex_get (lexer); } else if (lex_match_id (lexer, "SORT")) { file->is_sorted = false; saw_sort = true; + sort_ofs = MIN (sort_ofs, lex_ofs (lexer) - 1); } - merge_dictionary (proc.dict, file); + if (!merge_dictionary (&proc, lexer, file)) + goto error; } - while (lex_token (lexer) != '.') + while (lex_token (lexer) != T_ENDCMD) { if (lex_match (lexer, T_BY)) { - const struct variable **by_vars; - size_t i; - bool ok; - - if (saw_by) + if (saw_by) { - lex_sbc_only_once ("BY"); + lex_sbc_only_once (lexer, "BY"); goto error; } saw_by = true; - lex_match (lexer, '='); + lex_match (lexer, T_EQUALS); + + const struct variable **by_vars; if (!parse_sort_criteria (lexer, proc.dict, &proc.by_vars, &by_vars, NULL)) goto error; - ok = true; - for (i = 0; i < proc.n_files; i++) + bool ok = true; + for (size_t i = 0; i < proc.n_files; i++) { struct comb_file *file = &proc.files[i]; - size_t j; - - for (j = 0; j < subcase_get_n_values (&proc.by_vars); j++) + for (size_t j = 0; j < subcase_get_n_fields (&proc.by_vars); j++) { const char *name = var_get_name (by_vars[j]); struct variable *var = dict_lookup_var (file->dict, name); @@ -298,11 +295,11 @@ combine_files (enum comb_command_type command, subcase_get_direction (&proc.by_vars, j)); else { - if (file->handle != NULL) - msg (SE, _("File %s lacks BY variable %s."), - fh_get_name (file->handle), name); - else - msg (SE, _("Active file lacks BY variable %s."), name); + const char *fn + = file->handle ? fh_get_name (file->handle) : "*"; + lex_ofs_error (lexer, file->start_ofs, file->end_ofs, + _("File %s lacks BY variable %s."), + fn, name); ok = false; } } @@ -316,30 +313,32 @@ combine_files (enum comb_command_type command, } else if (command != COMB_UPDATE && lex_match_id (lexer, "FIRST")) { - if (first_name[0] != '\0') + if (first_name != NULL) { - lex_sbc_only_once ("FIRST"); + lex_sbc_only_once (lexer, "FIRST"); goto error; } - lex_match (lexer, '='); + lex_match (lexer, T_EQUALS); if (!lex_force_id (lexer)) goto error; - strcpy (first_name, lex_tokid (lexer)); + first_name = xstrdup (lex_tokcstr (lexer)); + first_ofs = lex_ofs (lexer); lex_get (lexer); } else if (command != COMB_UPDATE && lex_match_id (lexer, "LAST")) { - if (last_name[0] != '\0') + if (last_name != NULL) { - lex_sbc_only_once ("LAST"); + lex_sbc_only_once (lexer, "LAST"); goto error; } - lex_match (lexer, '='); + lex_match (lexer, T_EQUALS); if (!lex_force_id (lexer)) goto error; - strcpy (last_name, lex_tokid (lexer)); + last_name = xstrdup (lex_tokcstr (lexer)); + last_ofs = lex_ofs (lexer); lex_get (lexer); } else if (lex_match_id (lexer, "MAP")) @@ -362,7 +361,7 @@ combine_files (enum comb_command_type command, goto error; } - if (!lex_match (lexer, '/') && lex_token (lexer) != '.') + if (!lex_match (lexer, T_SLASH) && lex_token (lexer) != T_ENDCMD) { lex_end_of_command (lexer); goto error; @@ -373,30 +372,34 @@ combine_files (enum comb_command_type command, { if (command == COMB_UPDATE) { - msg (SE, _("The BY subcommand is required.")); + lex_sbc_missing (lexer, "BY"); goto error; } - if (n_tables) + if (table_idx != SIZE_MAX) { - msg (SE, _("BY is required when TABLE is specified.")); + const struct comb_file *table = &proc.files[table_idx]; + lex_ofs_error (lexer, table->start_ofs, table->end_ofs, + _("BY is required when %s is specified."), "TABLE"); goto error; } if (saw_sort) { - msg (SE, _("BY is required when SORT is specified.")); + lex_ofs_error (lexer, sort_ofs, sort_ofs, + _("BY is required when %s is specified."), "SORT"); goto error; } } /* Add IN, FIRST, and LAST variables to master dictionary. */ - for (i = 0; i < proc.n_files; i++) + for (size_t i = 0; i < proc.n_files; i++) { struct comb_file *file = &proc.files[i]; - if (!create_flag_var ("IN", file->in_name, proc.dict, &file->in_var)) + if (!create_flag_var (lexer, "IN", file->in_name, file->in_ofs, + proc.dict, &file->in_var)) goto error; } - if (!create_flag_var ("FIRST", first_name, proc.dict, &proc.first) - || !create_flag_var ("LAST", last_name, proc.dict, &proc.last)) + if (!create_flag_var (lexer, "FIRST", first_name, first_ofs, proc.dict, &proc.first) + || !create_flag_var (lexer, "LAST", last_name, last_ofs, proc.dict, &proc.last)) goto error; dict_delete_scratch_vars (proc.dict); @@ -404,31 +407,33 @@ combine_files (enum comb_command_type command, /* Set up mapping from each file's variables to master variables. */ - for (i = 0; i < proc.n_files; i++) + for (size_t i = 0; i < proc.n_files; i++) { struct comb_file *file = &proc.files[i]; - size_t src_var_cnt = dict_get_var_cnt (file->dict); - size_t j; + size_t src_n_vars = dict_get_n_vars (file->dict); - for (j = 0; j < src_var_cnt; j++) + file->mv = xnmalloc (src_n_vars, sizeof *file->mv); + for (size_t j = 0; j < src_n_vars; j++) { struct variable *src_var = dict_get_var (file->dict, j); struct variable *dst_var = dict_lookup_var (proc.dict, var_get_name (src_var)); if (dst_var != NULL) { + size_t n = subcase_get_n_fields (&file->src); + file->mv[n] = var_get_missing_values (src_var); subcase_add_var (&file->src, src_var, SC_ASCEND); subcase_add_var (&file->dst, dst_var, SC_ASCEND); } } } - proc.output = autopaging_writer_create (dict_get_next_value_idx (proc.dict)); + proc.output = autopaging_writer_create (dict_get_proto (proc.dict)); taint = taint_clone (casewriter_get_taint (proc.output)); /* Set up case matcher. */ proc.matcher = case_matcher_create (); - for (i = 0; i < proc.n_files; i++) + for (size_t i = 0; i < proc.n_files; i++) { struct comb_file *file = &proc.files[i]; if (file->reader == NULL) @@ -436,7 +441,7 @@ combine_files (enum comb_command_type command, if (active_file == NULL) { proc_discard_output (ds); - file->reader = active_file = proc_open (ds); + file->reader = active_file = proc_open_filtering (ds, false); } else file->reader = casereader_clone (active_file); @@ -465,12 +470,16 @@ combine_files (enum comb_command_type command, if (active_file != NULL) proc_commit (ds); - proc_set_active_file (ds, casewriter_make_reader (proc.output), proc.dict); + dataset_set_dict (ds, proc.dict); + dataset_set_source (ds, casewriter_make_reader (proc.output)); proc.dict = NULL; proc.output = NULL; free_comb_proc (&proc); + free (first_name); + free (last_name); + return taint_destroy (taint) ? CMD_SUCCESS : CMD_CASCADING_FAILURE; error: @@ -478,52 +487,55 @@ combine_files (enum comb_command_type command, proc_commit (ds); free_comb_proc (&proc); taint_destroy (taint); + free (first_name); + free (last_name); return CMD_CASCADING_FAILURE; } -/* Merge the dictionary for file F into master dictionary M. */ +/* Merge the dictionary for file F into master dictionary for PROC. */ static bool -merge_dictionary (struct dictionary *const m, struct comb_file *f) +merge_dictionary (struct comb_proc *proc, struct lexer *lexer, + struct comb_file *f) { + struct dictionary *m = proc->dict; struct dictionary *d = f->dict; - const char *d_docs, *m_docs; - int i; - const char *file_encoding; if (dict_get_label (m) == NULL) dict_set_label (m, dict_get_label (d)); - d_docs = dict_get_documents (d); - m_docs = dict_get_documents (m); - - - /* If the input files have different encodings, then + /* FIXME: If the input files have different encodings, then + the result is undefined. + The correct thing to do would be to convert to an encoding + which can cope with all the input files (eg UTF-8). */ - file_encoding = dict_get_encoding (f->dict); - if ( file_encoding != NULL) - { - if ( dict_get_encoding (m) == NULL) - dict_set_encoding (m, file_encoding); - else if ( 0 != strcmp (file_encoding, dict_get_encoding (m))) - { - msg (MW, - _("Combining files with incompatible encodings. String data may not be represented correctly.")); - } - } + if (strcmp (dict_get_encoding (f->dict), dict_get_encoding (m))) + msg (MW, _("Combining files with incompatible encodings. String data may " + "not be represented correctly.")); - if (d_docs != NULL) + const struct string_array *d_docs = dict_get_documents (d); + const struct string_array *m_docs = dict_get_documents (m); + if (d_docs) { - if (m_docs == NULL) + if (!m_docs) dict_set_documents (m, d_docs); else { - char *new_docs = xasprintf ("%s%s", m_docs, d_docs); - dict_set_documents (m, new_docs); - free (new_docs); + size_t n = m_docs->n + d_docs->n; + struct string_array new_docs = { + .strings = xmalloc (n * sizeof *new_docs.strings), + }; + for (size_t i = 0; i < m_docs->n; i++) + new_docs.strings[new_docs.n++] = m_docs->strings[i]; + for (size_t i = 0; i < d_docs->n; i++) + new_docs.strings[new_docs.n++] = d_docs->strings[i]; + + dict_set_documents (m, &new_docs); + + free (new_docs.strings); } } - for (i = 0; i < dict_get_var_cnt (d); i++) + for (size_t i = 0; i < dict_get_n_vars (d); i++) { struct variable *dv = dict_get_var (d, i); struct variable *mv = dict_lookup_var (m, var_get_name (dv)); @@ -531,36 +543,40 @@ merge_dictionary (struct dictionary *const m, struct comb_file *f) if (dict_class_from_id (var_get_name (dv)) == DC_SCRATCH) continue; - if (mv != NULL) + if (!mv) + { + mv = dict_clone_var_assert (m, dv); + if (proc->n_var_sources >= proc->allocated_var_sources) + proc->var_sources = x2nrealloc (proc->var_sources, + &proc->allocated_var_sources, + sizeof *proc->var_sources); + proc->var_sources[proc->n_var_sources++] = f - proc->files; + } + else { if (var_get_width (mv) != var_get_width (dv)) { const char *var_name = var_get_name (dv); - const char *file_name = fh_get_name (f->handle); - struct string s = DS_EMPTY_INITIALIZER; - ds_put_format (&s, - _("Variable %s in file %s has different " - "type or width from the same variable in " - "earlier file."), - var_name, file_name); - ds_put_cstr (&s, " "); - if (var_is_numeric (dv)) - ds_put_format (&s, _("In file %s, %s is numeric."), - file_name, var_name); - else - ds_put_format (&s, _("In file %s, %s is a string variable " - "with width %d."), - file_name, var_name, var_get_width (dv)); - ds_put_cstr (&s, " "); - if (var_is_numeric (mv)) - ds_put_format (&s, _("In an earlier file, %s was numeric."), - var_name); - else - ds_put_format (&s, _("In an earlier file, %s was a string " - "variable with width %d."), - var_name, var_get_width (mv)); - msg (SE, ds_cstr (&s)); - ds_destroy (&s); + msg (SE, _("Variable %s has different type or width in different " + "files."), var_name); + + for (size_t j = 0; j < 2; j++) + { + const struct variable *ev = !j ? mv : dv; + const struct comb_file *ef + = !j ? &proc->files[proc->var_sources[var_get_dict_index (mv)]] : f; + const char *fn = ef->handle ? fh_get_name (ef->handle) : "*"; + + if (var_is_numeric (ev)) + lex_ofs_msg (lexer, SN, ef->start_ofs, ef->end_ofs, + _("In file %s, %s is numeric."), + fn, var_name); + else + lex_ofs_msg (lexer, SN, ef->start_ofs, ef->end_ofs, + _("In file %s, %s is a string with width %d."), + fn, var_name, var_get_width (ev)); + } + return false; } @@ -571,33 +587,34 @@ merge_dictionary (struct dictionary *const m, struct comb_file *f) if (var_get_label (dv) && !var_get_label (mv)) var_set_label (mv, var_get_label (dv)); } - else - mv = dict_clone_var_assert (m, dv, var_get_name (dv)); } return true; } -/* If VAR_NAME is a non-empty string, attempts to create a +/* If VAR_NAME is non-NULL, attempts to create a variable named VAR_NAME, with format F1.0, in DICT, and stores a pointer to the variable in *VAR. Returns true if successful, false if the variable name is a duplicate (in which case a message saying that the variable specified on the - given SUBCOMMAND is a duplicate is emitted). Also returns - true, without doing anything, if VAR_NAME is null or empty. */ + given SUBCOMMAND is a duplicate is emitted). + + Does nothing and returns true if VAR_NAME is null. */ static bool -create_flag_var (const char *subcommand, const char *var_name, +create_flag_var (struct lexer *lexer, const char *subcommand, + const char *var_name, int var_ofs, struct dictionary *dict, struct variable **var) { - if (var_name[0] != '\0') + if (var_name != NULL) { struct fmt_spec format = fmt_for_output (FMT_F, 1, 0); *var = dict_create_var (dict, var_name, 0); if (*var == NULL) { - msg (SE, _("Variable name %s specified on %s subcommand " - "duplicates an existing variable name."), - subcommand, var_name); + lex_ofs_error (lexer, var_ofs, var_ofs, + _("Variable name %s specified on %s subcommand " + "duplicates an existing variable name."), + var_name, subcommand); return false; } var_set_both_formats (*var, &format); @@ -611,18 +628,18 @@ create_flag_var (const char *subcommand, const char *var_name, static void close_all_comb_files (struct comb_proc *proc) { - size_t i; - - for (i = 0; i < proc->n_files; i++) + for (size_t i = 0; i < proc->n_files; i++) { struct comb_file *file = &proc->files[i]; - subcase_destroy (&file->by_vars); - subcase_destroy (&file->src); - subcase_destroy (&file->dst); + subcase_uninit (&file->by_vars); + subcase_uninit (&file->src); + subcase_uninit (&file->dst); + free (file->mv); fh_unref (file->handle); - dict_destroy (file->dict); + dict_unref (file->dict); casereader_destroy (file->reader); case_unref (file->data); + free (file->in_name); } free (proc->files); proc->files = NULL; @@ -634,19 +651,25 @@ static void free_comb_proc (struct comb_proc *proc) { close_all_comb_files (proc); - dict_destroy (proc->dict); + dict_unref (proc->dict); casewriter_destroy (proc->output); case_matcher_destroy (proc->matcher); - subcase_destroy (&proc->by_vars); + if (proc->prev_BY) + { + caseproto_destroy_values (subcase_get_proto (&proc->by_vars), + proc->prev_BY); + free (proc->prev_BY); + } + subcase_uninit (&proc->by_vars); case_unref (proc->buffered_case); - free (proc->prev_BY); + free (proc->var_sources); } static bool scan_table (struct comb_file *, union value by[]); static struct ccase *create_output_case (const struct comb_proc *); static void apply_case (const struct comb_file *, struct ccase *); -static void apply_file_case_and_advance (struct comb_file *, struct ccase *, - union value by[]); +static void apply_nonmissing_case (const struct comb_file *, struct ccase *); +static void advance_file (struct comb_file *, union value by[]); static void output_case (struct comb_proc *, struct ccase *, union value by[]); static void output_buffered_case (struct comb_proc *); @@ -657,20 +680,17 @@ execute_add_files (struct comb_proc *proc) union value *by; while (case_matcher_match (proc->matcher, &by)) - { - size_t i; - - for (i = 0; i < proc->n_files; i++) - { - struct comb_file *file = &proc->files[i]; - while (file->is_minimal) - { - struct ccase *output = create_output_case (proc); - apply_file_case_and_advance (file, output, by); - output_case (proc, output, by); - } - } - } + for (size_t i = 0; i < proc->n_files; i++) + { + struct comb_file *file = &proc->files[i]; + while (file->is_minimal) + { + struct ccase *output = create_output_case (proc); + apply_case (file, output); + advance_file (file, by); + output_case (proc, output, by); + } + } output_buffered_case (proc); } @@ -682,17 +702,17 @@ execute_match_files (struct comb_proc *proc) while (case_matcher_match (proc->matcher, &by)) { - struct ccase *output; - size_t i; - - output = create_output_case (proc); - for (i = proc->n_files; i-- > 0; ) + struct ccase *output = create_output_case (proc); + for (size_t i = proc->n_files; i-- > 0;) { struct comb_file *file = &proc->files[i]; if (file->type == COMB_FILE) { if (file->is_minimal) - apply_file_case_and_advance (file, output, NULL); + { + apply_case (file, output); + advance_file (file, NULL); + } } else { @@ -723,7 +743,8 @@ execute_update (struct comb_proc *proc) for (first = &proc->files[0]; ; first++) if (first->is_minimal) break; - apply_file_case_and_advance (first, output, by); + apply_case (first, output); + advance_file (first, by); /* Read additional cases and update the output case from them. (Don't update the output case from any duplicate @@ -732,7 +753,10 @@ execute_update (struct comb_proc *proc) file < &proc->files[proc->n_files]; file++) { while (file->is_minimal) - apply_file_case_and_advance (file, output, by); + { + apply_nonmissing_case (file, output); + advance_file (file, by); + } } casewriter_write (proc->output, output); @@ -744,7 +768,8 @@ execute_update (struct comb_proc *proc) while (first->is_minimal) { output = create_output_case (proc); - apply_file_case_and_advance (first, output, by); + apply_case (first, output); + advance_file (first, by); casewriter_write (proc->output, output); } } @@ -782,44 +807,67 @@ scan_table (struct comb_file *file, union value by[]) static struct ccase * create_output_case (const struct comb_proc *proc) { - size_t n_vars = dict_get_var_cnt (proc->dict); - struct ccase *output; - size_t i; - - output = case_create (dict_get_next_value_idx (proc->dict)); - for (i = 0; i < n_vars; i++) + size_t n_vars = dict_get_n_vars (proc->dict); + struct ccase *output = case_create (dict_get_proto (proc->dict)); + for (size_t i = 0; i < n_vars; i++) { struct variable *v = dict_get_var (proc->dict, i); value_set_missing (case_data_rw (output, v), var_get_width (v)); } - for (i = 0; i < proc->n_files; i++) + for (size_t i = 0; i < proc->n_files; i++) { struct comb_file *file = &proc->files[i]; if (file->in_var != NULL) - case_data_rw (output, file->in_var)->f = false; + *case_num_rw (output, file->in_var) = false; } return output; } +static void +mark_file_used (const struct comb_file *file, struct ccase *output) +{ + if (file->in_var != NULL) + *case_num_rw (output, file->in_var) = true; +} + /* Copies the data from FILE's case into output case OUTPUT. If FILE has an IN variable, then it is set to 1 in OUTPUT. */ static void apply_case (const struct comb_file *file, struct ccase *output) { subcase_copy (&file->src, file->data, &file->dst, output); - if (file->in_var != NULL) - case_data_rw (output, file->in_var)->f = true; + mark_file_used (file, output); +} + +/* Copies the data from FILE's case into output case OUTPUT, + skipping values that are missing or all spaces. + + If FILE has an IN variable, then it is set to 1 in OUTPUT. */ +static void +apply_nonmissing_case (const struct comb_file *file, struct ccase *output) +{ + for (size_t i = 0; i < subcase_get_n_fields (&file->src); i++) + { + const struct subcase_field *src_field = &file->src.fields[i]; + const struct subcase_field *dst_field = &file->dst.fields[i]; + const union value *src_value + = case_data_idx (file->data, src_field->case_index); + int width = src_field->width; + + if (!mv_is_value_missing (file->mv[i], src_value) + && !(width > 0 && value_is_spaces (src_value, width))) + value_copy (case_data_rw_idx (output, dst_field->case_index), + src_value, width); + } + mark_file_used (file, output); } -/* Like apply_case() above, but also advances FILE to its next - case. Also, if BY is nonnull, then FILE's is_minimal member - is updated based on whether the new case's BY values still - match those in BY. */ +/* Advances FILE to its next case. If BY is nonnull, then FILE's is_minimal + member is updated based on whether the new case's BY values still match + those in BY. */ static void -apply_file_case_and_advance (struct comb_file *file, struct ccase *output, - union value by[]) +advance_file (struct comb_file *file, union value by[]) { - apply_case (file, output); case_unref (file->data); file->data = casereader_read (file->reader); if (by) @@ -846,7 +894,7 @@ output_case (struct comb_proc *proc, struct ccase *output, union value by[]) { new_BY = !subcase_equal_xx (&proc->by_vars, proc->prev_BY, by); if (proc->last != NULL) - case_data_rw (proc->buffered_case, proc->last)->f = new_BY; + *case_num_rw (proc->buffered_case, proc->last) = new_BY; casewriter_write (proc->output, proc->buffered_case); } else @@ -854,15 +902,19 @@ output_case (struct comb_proc *proc, struct ccase *output, union value by[]) proc->buffered_case = output; if (proc->first != NULL) - case_data_rw (proc->buffered_case, proc->first)->f = new_BY; + *case_num_rw (proc->buffered_case, proc->first) = new_BY; if (new_BY) { - size_t n = (subcase_get_n_values (&proc->by_vars) - * sizeof (union value)); + size_t n_values = subcase_get_n_fields (&proc->by_vars); + const struct caseproto *proto = subcase_get_proto (&proc->by_vars); if (proc->prev_BY == NULL) - proc->prev_BY = xmalloc (n); - memcpy (proc->prev_BY, by, n); + { + proc->prev_BY = xmalloc (n_values * sizeof *proc->prev_BY); + caseproto_init_values (proto, proc->prev_BY); + } + caseproto_copy (subcase_get_proto (&proc->by_vars), 0, n_values, + proc->prev_BY, by); } } } @@ -875,7 +927,7 @@ output_buffered_case (struct comb_proc *proc) if (proc->prev_BY != NULL) { if (proc->last != NULL) - case_data_rw (proc->buffered_case, proc->last)->f = 1.0; + *case_num_rw (proc->buffered_case, proc->last) = 1.0; casewriter_write (proc->output, proc->buffered_case); proc->buffered_case = NULL; }