X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fdata-io%2Fcombine-files.c;h=4e6ae4f8846b911aa243786fa825c95cbb78f60d;hb=53d339111a9f51561cfccc65764874cdf54e501a;hp=58693c62d21d568dd152eb91fd8b0edc86639f46;hpb=9ade26c8349b4434008c46cf09bc7473ec743972;p=pspp diff --git a/src/language/data-io/combine-files.c b/src/language/data-io/combine-files.c index 58693c62d2..4e6ae4f884 100644 --- a/src/language/data-io/combine-files.c +++ b/src/language/data-io/combine-files.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -23,9 +23,9 @@ #include "data/case.h" #include "data/casereader.h" #include "data/casewriter.h" +#include "data/dataset.h" #include "data/dictionary.h" #include "data/format.h" -#include "data/procedure.h" #include "data/subcase.h" #include "data/variable.h" #include "language/command.h" @@ -35,6 +35,7 @@ #include "language/lexer/variable-parser.h" #include "language/stats/sort-criteria.h" #include "libpspp/assertion.h" +#include "libpspp/i18n.h" #include "libpspp/message.h" #include "libpspp/string-array.h" #include "libpspp/taint.h" @@ -68,6 +69,7 @@ struct comb_file /* Variables. */ struct subcase by_vars; /* BY variables in this input file. */ struct subcase src, dst; /* Data to copy to output; where to put it. */ + const struct missing_values **mv; /* Each variable's missing values. */ /* Input files. */ struct file_handle *handle; /* Input file handle. */ @@ -160,7 +162,7 @@ combine_files (enum comb_command_type command, proc.files = NULL; proc.n_files = 0; - proc.dict = dict_create (); + proc.dict = dict_create (get_default_encoding ()); proc.output = NULL; proc.matcher = NULL; subcase_init_empty (&proc.by_vars); @@ -196,6 +198,7 @@ combine_files (enum comb_command_type command, subcase_init_empty (&file->by_vars); subcase_init_empty (&file->src); subcase_init_empty (&file->dst); + file->mv = NULL; file->handle = NULL; file->dict = NULL; file->reader = NULL; @@ -206,27 +209,28 @@ combine_files (enum comb_command_type command, if (lex_match (lexer, T_ASTERISK)) { - if (!proc_has_active_file (ds)) + if (!dataset_has_source (ds)) { - msg (SE, _("Cannot specify the active file since no active " - "file has been defined.")); + msg (SE, _("Cannot specify the active dataset since none " + "has been defined.")); goto error; } if (proc_make_temporary_transformations_permanent (ds)) msg (SE, _("This command may not be used after TEMPORARY when " - "the active file is an input source. " + "the active dataset is an input source. " "Temporary transformations will be made permanent.")); file->dict = dict_clone (dataset_dict (ds)); } else { - file->handle = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH); + file->handle = fh_parse (lexer, FH_REF_FILE, dataset_session (ds)); if (file->handle == NULL) goto error; - file->reader = any_reader_open (file->handle, &file->dict); + file->reader = any_reader_open_and_decode (file->handle, NULL, + &file->dict, NULL); if (file->reader == NULL) goto error; } @@ -234,7 +238,7 @@ combine_files (enum comb_command_type command, while (lex_match (lexer, T_SLASH)) if (lex_match_id (lexer, "RENAME")) { - if (!parse_dict_rename (lexer, file->dict)) + if (!parse_dict_rename (lexer, file->dict, false)) goto error; } else if (lex_match_id (lexer, "IN")) @@ -261,7 +265,8 @@ combine_files (enum comb_command_type command, saw_sort = true; } - merge_dictionary (proc.dict, file); + if (!merge_dictionary (proc.dict, file)) + goto error; } while (lex_token (lexer) != T_ENDCMD) @@ -303,7 +308,8 @@ combine_files (enum comb_command_type command, msg (SE, _("File %s lacks BY variable %s."), fh_get_name (file->handle), name); else - msg (SE, _("Active file lacks BY variable %s."), name); + msg (SE, _("Active dataset lacks BY variable %s."), + name); ok = false; } } @@ -374,7 +380,7 @@ combine_files (enum comb_command_type command, { if (command == COMB_UPDATE) { - msg (SE, _("The BY subcommand is required.")); + lex_sbc_missing ("BY"); goto error; } if (n_tables) @@ -411,6 +417,7 @@ combine_files (enum comb_command_type command, size_t src_var_cnt = dict_get_var_cnt (file->dict); size_t j; + file->mv = xnmalloc (src_var_cnt, sizeof *file->mv); for (j = 0; j < src_var_cnt; j++) { struct variable *src_var = dict_get_var (file->dict, j); @@ -418,6 +425,8 @@ combine_files (enum comb_command_type command, var_get_name (src_var)); if (dst_var != NULL) { + size_t n = subcase_get_n_fields (&file->src); + file->mv[n] = var_get_missing_values (src_var); subcase_add_var (&file->src, src_var, SC_ASCEND); subcase_add_var (&file->dst, dst_var, SC_ASCEND); } @@ -437,7 +446,7 @@ combine_files (enum comb_command_type command, if (active_file == NULL) { proc_discard_output (ds); - file->reader = active_file = proc_open (ds); + file->reader = active_file = proc_open_filtering (ds, false); } else file->reader = casereader_clone (active_file); @@ -466,7 +475,8 @@ combine_files (enum comb_command_type command, if (active_file != NULL) proc_commit (ds); - proc_set_active_file (ds, casewriter_make_reader (proc.output), proc.dict); + dataset_set_dict (ds, proc.dict); + dataset_set_source (ds, casewriter_make_reader (proc.output)); proc.dict = NULL; proc.output = NULL; @@ -494,7 +504,6 @@ merge_dictionary (struct dictionary *const m, struct comb_file *f) struct dictionary *d = f->dict; const struct string_array *d_docs, *m_docs; int i; - const char *file_encoding; if (dict_get_label (m) == NULL) dict_set_label (m, dict_get_label (d)); @@ -508,17 +517,9 @@ merge_dictionary (struct dictionary *const m, struct comb_file *f) The correct thing to do would be to convert to an encoding which can cope with all the input files (eg UTF-8). */ - file_encoding = dict_get_encoding (f->dict); - if ( file_encoding != NULL) - { - if ( dict_get_encoding (m) == NULL) - dict_set_encoding (m, file_encoding); - else if ( 0 != strcmp (file_encoding, dict_get_encoding (m))) - { - msg (MW, - _("Combining files with incompatible encodings. String data may not be represented correctly.")); - } - } + if ( 0 != strcmp (dict_get_encoding (f->dict), dict_get_encoding (m))) + msg (MW, _("Combining files with incompatible encodings. String data may " + "not be represented correctly.")); if (d_docs != NULL) { @@ -555,8 +556,10 @@ merge_dictionary (struct dictionary *const m, struct comb_file *f) if (var_get_width (mv) != var_get_width (dv)) { const char *var_name = var_get_name (dv); - const char *file_name = fh_get_name (f->handle); struct string s = DS_EMPTY_INITIALIZER; + const char *file_name; + + file_name = f->handle ? fh_get_name (f->handle) : "*"; ds_put_format (&s, _("Variable %s in file %s has different " "type or width from the same variable in " @@ -588,7 +591,7 @@ merge_dictionary (struct dictionary *const m, struct comb_file *f) if (var_has_missing_values (dv) && !var_has_missing_values (mv)) var_set_missing_values (mv, var_get_missing_values (dv)); if (var_get_label (dv) && !var_get_label (mv)) - var_set_label (mv, var_get_label (dv), file_encoding, false); + var_set_label (mv, var_get_label (dv)); } else mv = dict_clone_var_assert (m, dv); @@ -639,8 +642,9 @@ close_all_comb_files (struct comb_proc *proc) subcase_destroy (&file->by_vars); subcase_destroy (&file->src); subcase_destroy (&file->dst); + free (file->mv); fh_unref (file->handle); - dict_destroy (file->dict); + dict_unref (file->dict); casereader_destroy (file->reader); case_unref (file->data); free (file->in_name); @@ -655,7 +659,7 @@ static void free_comb_proc (struct comb_proc *proc) { close_all_comb_files (proc); - dict_destroy (proc->dict); + dict_unref (proc->dict); casewriter_destroy (proc->output); case_matcher_destroy (proc->matcher); if (proc->prev_BY) @@ -671,8 +675,8 @@ free_comb_proc (struct comb_proc *proc) static bool scan_table (struct comb_file *, union value by[]); static struct ccase *create_output_case (const struct comb_proc *); static void apply_case (const struct comb_file *, struct ccase *); -static void apply_file_case_and_advance (struct comb_file *, struct ccase *, - union value by[]); +static void apply_nonmissing_case (const struct comb_file *, struct ccase *); +static void advance_file (struct comb_file *, union value by[]); static void output_case (struct comb_proc *, struct ccase *, union value by[]); static void output_buffered_case (struct comb_proc *); @@ -692,7 +696,8 @@ execute_add_files (struct comb_proc *proc) while (file->is_minimal) { struct ccase *output = create_output_case (proc); - apply_file_case_and_advance (file, output, by); + apply_case (file, output); + advance_file (file, by); output_case (proc, output, by); } } @@ -718,7 +723,10 @@ execute_match_files (struct comb_proc *proc) if (file->type == COMB_FILE) { if (file->is_minimal) - apply_file_case_and_advance (file, output, NULL); + { + apply_case (file, output); + advance_file (file, NULL); + } } else { @@ -749,7 +757,8 @@ execute_update (struct comb_proc *proc) for (first = &proc->files[0]; ; first++) if (first->is_minimal) break; - apply_file_case_and_advance (first, output, by); + apply_case (first, output); + advance_file (first, by); /* Read additional cases and update the output case from them. (Don't update the output case from any duplicate @@ -758,7 +767,10 @@ execute_update (struct comb_proc *proc) file < &proc->files[proc->n_files]; file++) { while (file->is_minimal) - apply_file_case_and_advance (file, output, by); + { + apply_nonmissing_case (file, output); + advance_file (file, by); + } } casewriter_write (proc->output, output); @@ -770,7 +782,8 @@ execute_update (struct comb_proc *proc) while (first->is_minimal) { output = create_output_case (proc); - apply_file_case_and_advance (first, output, by); + apply_case (first, output); + advance_file (first, by); casewriter_write (proc->output, output); } } @@ -827,25 +840,53 @@ create_output_case (const struct comb_proc *proc) return output; } +static void +mark_file_used (const struct comb_file *file, struct ccase *output) +{ + if (file->in_var != NULL) + case_data_rw (output, file->in_var)->f = true; +} + /* Copies the data from FILE's case into output case OUTPUT. If FILE has an IN variable, then it is set to 1 in OUTPUT. */ static void apply_case (const struct comb_file *file, struct ccase *output) { subcase_copy (&file->src, file->data, &file->dst, output); - if (file->in_var != NULL) - case_data_rw (output, file->in_var)->f = true; + mark_file_used (file, output); +} + +/* Copies the data from FILE's case into output case OUTPUT, + skipping values that are missing or all spaces. + + If FILE has an IN variable, then it is set to 1 in OUTPUT. */ +static void +apply_nonmissing_case (const struct comb_file *file, struct ccase *output) +{ + size_t i; + + for (i = 0; i < subcase_get_n_fields (&file->src); i++) + { + const struct subcase_field *src_field = &file->src.fields[i]; + const struct subcase_field *dst_field = &file->dst.fields[i]; + const union value *src_value + = case_data_idx (file->data, src_field->case_index); + int width = src_field->width; + + if (!mv_is_value_missing (file->mv[i], src_value, MV_ANY) + && !(width > 0 && value_is_spaces (src_value, width))) + value_copy (case_data_rw_idx (output, dst_field->case_index), + src_value, width); + } + mark_file_used (file, output); } -/* Like apply_case() above, but also advances FILE to its next - case. Also, if BY is nonnull, then FILE's is_minimal member - is updated based on whether the new case's BY values still - match those in BY. */ +/* Advances FILE to its next case. If BY is nonnull, then FILE's is_minimal + member is updated based on whether the new case's BY values still match + those in BY. */ static void -apply_file_case_and_advance (struct comb_file *file, struct ccase *output, - union value by[]) +advance_file (struct comb_file *file, union value by[]) { - apply_case (file, output); case_unref (file->data); file->data = casereader_read (file->reader); if (by)