X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fdata-io%2Fcombine-files.c;h=b4eac56a6b4a4d270f19b2b47c4fc3b21ececaa5;hb=8d6bfdd2a100bf8166b3b0b3f006d46f3e7a59e9;hp=ccbe7679d15a6331c93c7f4054ac168cae9f3c00;hpb=deb4fd96c0c171fc8eb64f7f1e7f5c2af4931416;p=pspp diff --git a/src/language/data-io/combine-files.c b/src/language/data-io/combine-files.c index ccbe7679d1..b4eac56a6b 100644 --- a/src/language/data-io/combine-files.c +++ b/src/language/data-io/combine-files.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,28 +18,30 @@ #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "xalloc.h" +#include "data/any-reader.h" +#include "data/case-matcher.h" +#include "data/case.h" +#include "data/casereader.h" +#include "data/casewriter.h" +#include "data/dataset.h" +#include "data/dictionary.h" +#include "data/format.h" +#include "data/subcase.h" +#include "data/variable.h" +#include "language/command.h" +#include "language/data-io/file-handle.h" +#include "language/data-io/trim.h" +#include "language/lexer/lexer.h" +#include "language/lexer/variable-parser.h" +#include "language/stats/sort-criteria.h" +#include "libpspp/assertion.h" +#include "libpspp/i18n.h" +#include "libpspp/message.h" +#include "libpspp/string-array.h" +#include "libpspp/taint.h" +#include "math/sort.h" + +#include "gl/xalloc.h" #include "gettext.h" #define _(msgid) gettext (msgid) @@ -67,6 +69,7 @@ struct comb_file /* Variables. */ struct subcase by_vars; /* BY variables in this input file. */ struct subcase src, dst; /* Data to copy to output; where to put it. */ + const struct missing_values **mv; /* Each variable's missing values. */ /* Input files. */ struct file_handle *handle; /* Input file handle. */ @@ -78,7 +81,7 @@ struct comb_file bool is_sorted; /* Is file presorted on the BY variables? */ /* IN subcommand. */ - char in_name[VAR_NAME_LEN + 1]; + char *in_name; struct variable *in_var; }; @@ -147,8 +150,8 @@ combine_files (enum comb_command_type command, bool saw_sort = false; struct casereader *active_file = NULL; - char first_name[VAR_NAME_LEN + 1] = ""; - char last_name[VAR_NAME_LEN + 1] = ""; + char *first_name = NULL; + char *last_name = NULL; struct taint *taint = NULL; @@ -159,7 +162,7 @@ combine_files (enum comb_command_type command, proc.files = NULL; proc.n_files = 0; - proc.dict = dict_create (); + proc.dict = dict_create (get_default_encoding ()); proc.output = NULL; proc.matcher = NULL; subcase_init_empty (&proc.by_vars); @@ -170,7 +173,7 @@ combine_files (enum comb_command_type command, dict_set_case_limit (proc.dict, dict_get_case_limit (dataset_dict (ds))); - lex_match (lexer, '/'); + lex_match (lexer, T_SLASH); for (;;) { struct comb_file *file; @@ -185,7 +188,7 @@ combine_files (enum comb_command_type command, } else break; - lex_match (lexer, '='); + lex_match (lexer, T_EQUALS); if (proc.n_files >= allocated_files) proc.files = x2nrealloc (proc.files, &allocated_files, @@ -195,42 +198,44 @@ combine_files (enum comb_command_type command, subcase_init_empty (&file->by_vars); subcase_init_empty (&file->src); subcase_init_empty (&file->dst); + file->mv = NULL; file->handle = NULL; file->dict = NULL; file->reader = NULL; file->data = NULL; file->is_sorted = true; - file->in_name[0] = '\0'; + file->in_name = NULL; file->in_var = NULL; - if (lex_match (lexer, '*')) + if (lex_match (lexer, T_ASTERISK)) { - if (!proc_has_active_file (ds)) + if (!dataset_has_source (ds)) { - msg (SE, _("Cannot specify the active file since no active " - "file has been defined.")); + msg (SE, _("Cannot specify the active dataset since none " + "has been defined.")); goto error; } if (proc_make_temporary_transformations_permanent (ds)) msg (SE, _("This command may not be used after TEMPORARY when " - "the active file is an input source. " + "the active dataset is an input source. " "Temporary transformations will be made permanent.")); file->dict = dict_clone (dataset_dict (ds)); } else { - file->handle = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH); + file->handle = fh_parse (lexer, FH_REF_FILE, dataset_session (ds)); if (file->handle == NULL) goto error; - file->reader = any_reader_open (file->handle, &file->dict); + file->reader = any_reader_open_and_decode (file->handle, NULL, + &file->dict, NULL); if (file->reader == NULL) goto error; } - while (lex_match (lexer, '/')) + while (lex_match (lexer, T_SLASH)) if (lex_match_id (lexer, "RENAME")) { if (!parse_dict_rename (lexer, file->dict)) @@ -238,20 +243,20 @@ combine_files (enum comb_command_type command, } else if (lex_match_id (lexer, "IN")) { - lex_match (lexer, '='); + lex_match (lexer, T_EQUALS); if (lex_token (lexer) != T_ID) { lex_error (lexer, NULL); goto error; } - if (file->in_name[0]) + if (file->in_name) { msg (SE, _("Multiple IN subcommands for a single FILE or " "TABLE.")); goto error; } - strcpy (file->in_name, lex_tokid (lexer)); + file->in_name = xstrdup (lex_tokcstr (lexer)); lex_get (lexer); } else if (lex_match_id (lexer, "SORT")) @@ -260,10 +265,11 @@ combine_files (enum comb_command_type command, saw_sort = true; } - merge_dictionary (proc.dict, file); + if (!merge_dictionary (proc.dict, file)) + goto error; } - while (lex_token (lexer) != '.') + while (lex_token (lexer) != T_ENDCMD) { if (lex_match (lexer, T_BY)) { @@ -278,7 +284,7 @@ combine_files (enum comb_command_type command, } saw_by = true; - lex_match (lexer, '='); + lex_match (lexer, T_EQUALS); if (!parse_sort_criteria (lexer, proc.dict, &proc.by_vars, &by_vars, NULL)) goto error; @@ -289,7 +295,7 @@ combine_files (enum comb_command_type command, struct comb_file *file = &proc.files[i]; size_t j; - for (j = 0; j < subcase_get_n_values (&proc.by_vars); j++) + for (j = 0; j < subcase_get_n_fields (&proc.by_vars); j++) { const char *name = var_get_name (by_vars[j]); struct variable *var = dict_lookup_var (file->dict, name); @@ -302,7 +308,8 @@ combine_files (enum comb_command_type command, msg (SE, _("File %s lacks BY variable %s."), fh_get_name (file->handle), name); else - msg (SE, _("Active file lacks BY variable %s."), name); + msg (SE, _("Active dataset lacks BY variable %s."), + name); ok = false; } } @@ -316,30 +323,30 @@ combine_files (enum comb_command_type command, } else if (command != COMB_UPDATE && lex_match_id (lexer, "FIRST")) { - if (first_name[0] != '\0') + if (first_name != NULL) { lex_sbc_only_once ("FIRST"); goto error; } - lex_match (lexer, '='); + lex_match (lexer, T_EQUALS); if (!lex_force_id (lexer)) goto error; - strcpy (first_name, lex_tokid (lexer)); + first_name = xstrdup (lex_tokcstr (lexer)); lex_get (lexer); } else if (command != COMB_UPDATE && lex_match_id (lexer, "LAST")) { - if (last_name[0] != '\0') + if (last_name != NULL) { lex_sbc_only_once ("LAST"); goto error; } - lex_match (lexer, '='); + lex_match (lexer, T_EQUALS); if (!lex_force_id (lexer)) goto error; - strcpy (last_name, lex_tokid (lexer)); + last_name = xstrdup (lex_tokcstr (lexer)); lex_get (lexer); } else if (lex_match_id (lexer, "MAP")) @@ -362,7 +369,7 @@ combine_files (enum comb_command_type command, goto error; } - if (!lex_match (lexer, '/') && lex_token (lexer) != '.') + if (!lex_match (lexer, T_SLASH) && lex_token (lexer) != T_ENDCMD) { lex_end_of_command (lexer); goto error; @@ -373,17 +380,17 @@ combine_files (enum comb_command_type command, { if (command == COMB_UPDATE) { - msg (SE, _("The BY subcommand is required.")); + lex_sbc_missing ("BY"); goto error; } if (n_tables) { - msg (SE, _("BY is required when TABLE is specified.")); + msg (SE, _("BY is required when %s is specified."), "TABLE"); goto error; } if (saw_sort) { - msg (SE, _("BY is required when SORT is specified.")); + msg (SE, _("BY is required when %s is specified."), "SORT"); goto error; } } @@ -410,6 +417,7 @@ combine_files (enum comb_command_type command, size_t src_var_cnt = dict_get_var_cnt (file->dict); size_t j; + file->mv = xnmalloc (src_var_cnt, sizeof *file->mv); for (j = 0; j < src_var_cnt; j++) { struct variable *src_var = dict_get_var (file->dict, j); @@ -417,13 +425,15 @@ combine_files (enum comb_command_type command, var_get_name (src_var)); if (dst_var != NULL) { + size_t n = subcase_get_n_fields (&file->src); + file->mv[n] = var_get_missing_values (src_var); subcase_add_var (&file->src, src_var, SC_ASCEND); subcase_add_var (&file->dst, dst_var, SC_ASCEND); } } } - proc.output = autopaging_writer_create (dict_get_next_value_idx (proc.dict)); + proc.output = autopaging_writer_create (dict_get_proto (proc.dict)); taint = taint_clone (casewriter_get_taint (proc.output)); /* Set up case matcher. */ @@ -436,7 +446,7 @@ combine_files (enum comb_command_type command, if (active_file == NULL) { proc_discard_output (ds); - file->reader = active_file = proc_open (ds); + file->reader = active_file = proc_open_filtering (ds, false); } else file->reader = casereader_clone (active_file); @@ -465,12 +475,16 @@ combine_files (enum comb_command_type command, if (active_file != NULL) proc_commit (ds); - proc_set_active_file (ds, casewriter_make_reader (proc.output), proc.dict); + dataset_set_dict (ds, proc.dict); + dataset_set_source (ds, casewriter_make_reader (proc.output)); proc.dict = NULL; proc.output = NULL; free_comb_proc (&proc); + free (first_name); + free (last_name); + return taint_destroy (taint) ? CMD_SUCCESS : CMD_CASCADING_FAILURE; error: @@ -478,6 +492,8 @@ combine_files (enum comb_command_type command, proc_commit (ds); free_comb_proc (&proc); taint_destroy (taint); + free (first_name); + free (last_name); return CMD_CASCADING_FAILURE; } @@ -486,7 +502,7 @@ static bool merge_dictionary (struct dictionary *const m, struct comb_file *f) { struct dictionary *d = f->dict; - const char *d_docs, *m_docs; + const struct string_array *d_docs, *m_docs; int i; if (dict_get_label (m) == NULL) @@ -494,15 +510,36 @@ merge_dictionary (struct dictionary *const m, struct comb_file *f) d_docs = dict_get_documents (d); m_docs = dict_get_documents (m); + + + /* FIXME: If the input files have different encodings, then + the result is undefined. + The correct thing to do would be to convert to an encoding + which can cope with all the input files (eg UTF-8). + */ + if ( 0 != strcmp (dict_get_encoding (f->dict), dict_get_encoding (m))) + msg (MW, _("Combining files with incompatible encodings. String data may " + "not be represented correctly.")); + if (d_docs != NULL) { if (m_docs == NULL) dict_set_documents (m, d_docs); else { - char *new_docs = xasprintf ("%s%s", m_docs, d_docs); - dict_set_documents (m, new_docs); - free (new_docs); + struct string_array new_docs; + size_t i; + + new_docs.n = m_docs->n + d_docs->n; + new_docs.strings = xmalloc (new_docs.n * sizeof *new_docs.strings); + for (i = 0; i < m_docs->n; i++) + new_docs.strings[i] = m_docs->strings[i]; + for (i = 0; i < d_docs->n; i++) + new_docs.strings[m_docs->n + i] = d_docs->strings[i]; + + dict_set_documents (m, &new_docs); + + free (new_docs.strings); } } @@ -519,8 +556,10 @@ merge_dictionary (struct dictionary *const m, struct comb_file *f) if (var_get_width (mv) != var_get_width (dv)) { const char *var_name = var_get_name (dv); - const char *file_name = fh_get_name (f->handle); struct string s = DS_EMPTY_INITIALIZER; + const char *file_name; + + file_name = f->handle ? fh_get_name (f->handle) : "*"; ds_put_format (&s, _("Variable %s in file %s has different " "type or width from the same variable in " @@ -542,7 +581,7 @@ merge_dictionary (struct dictionary *const m, struct comb_file *f) ds_put_format (&s, _("In an earlier file, %s was a string " "variable with width %d."), var_name, var_get_width (mv)); - msg (SE, ds_cstr (&s)); + msg (SE, "%s", ds_cstr (&s)); ds_destroy (&s); return false; } @@ -555,24 +594,25 @@ merge_dictionary (struct dictionary *const m, struct comb_file *f) var_set_label (mv, var_get_label (dv)); } else - mv = dict_clone_var_assert (m, dv, var_get_name (dv)); + mv = dict_clone_var_assert (m, dv); } return true; } -/* If VAR_NAME is a non-empty string, attempts to create a +/* If VAR_NAME is non-NULL, attempts to create a variable named VAR_NAME, with format F1.0, in DICT, and stores a pointer to the variable in *VAR. Returns true if successful, false if the variable name is a duplicate (in which case a message saying that the variable specified on the - given SUBCOMMAND is a duplicate is emitted). Also returns - true, without doing anything, if VAR_NAME is null or empty. */ + given SUBCOMMAND is a duplicate is emitted). + + Does nothing and returns true if VAR_NAME is null. */ static bool create_flag_var (const char *subcommand, const char *var_name, struct dictionary *dict, struct variable **var) { - if (var_name[0] != '\0') + if (var_name != NULL) { struct fmt_spec format = fmt_for_output (FMT_F, 1, 0); *var = dict_create_var (dict, var_name, 0); @@ -602,10 +642,12 @@ close_all_comb_files (struct comb_proc *proc) subcase_destroy (&file->by_vars); subcase_destroy (&file->src); subcase_destroy (&file->dst); + free (file->mv); fh_unref (file->handle); - dict_destroy (file->dict); + dict_unref (file->dict); casereader_destroy (file->reader); case_unref (file->data); + free (file->in_name); } free (proc->files); proc->files = NULL; @@ -617,19 +659,24 @@ static void free_comb_proc (struct comb_proc *proc) { close_all_comb_files (proc); - dict_destroy (proc->dict); + dict_unref (proc->dict); casewriter_destroy (proc->output); case_matcher_destroy (proc->matcher); + if (proc->prev_BY) + { + caseproto_destroy_values (subcase_get_proto (&proc->by_vars), + proc->prev_BY); + free (proc->prev_BY); + } subcase_destroy (&proc->by_vars); case_unref (proc->buffered_case); - free (proc->prev_BY); } static bool scan_table (struct comb_file *, union value by[]); static struct ccase *create_output_case (const struct comb_proc *); static void apply_case (const struct comb_file *, struct ccase *); -static void apply_file_case_and_advance (struct comb_file *, struct ccase *, - union value by[]); +static void apply_nonmissing_case (const struct comb_file *, struct ccase *); +static void advance_file (struct comb_file *, union value by[]); static void output_case (struct comb_proc *, struct ccase *, union value by[]); static void output_buffered_case (struct comb_proc *); @@ -649,7 +696,8 @@ execute_add_files (struct comb_proc *proc) while (file->is_minimal) { struct ccase *output = create_output_case (proc); - apply_file_case_and_advance (file, output, by); + apply_case (file, output); + advance_file (file, by); output_case (proc, output, by); } } @@ -675,7 +723,10 @@ execute_match_files (struct comb_proc *proc) if (file->type == COMB_FILE) { if (file->is_minimal) - apply_file_case_and_advance (file, output, NULL); + { + apply_case (file, output); + advance_file (file, NULL); + } } else { @@ -706,7 +757,8 @@ execute_update (struct comb_proc *proc) for (first = &proc->files[0]; ; first++) if (first->is_minimal) break; - apply_file_case_and_advance (first, output, by); + apply_case (first, output); + advance_file (first, by); /* Read additional cases and update the output case from them. (Don't update the output case from any duplicate @@ -715,7 +767,10 @@ execute_update (struct comb_proc *proc) file < &proc->files[proc->n_files]; file++) { while (file->is_minimal) - apply_file_case_and_advance (file, output, by); + { + apply_nonmissing_case (file, output); + advance_file (file, by); + } } casewriter_write (proc->output, output); @@ -727,7 +782,8 @@ execute_update (struct comb_proc *proc) while (first->is_minimal) { output = create_output_case (proc); - apply_file_case_and_advance (first, output, by); + apply_case (first, output); + advance_file (first, by); casewriter_write (proc->output, output); } } @@ -769,7 +825,7 @@ create_output_case (const struct comb_proc *proc) struct ccase *output; size_t i; - output = case_create (dict_get_next_value_idx (proc->dict)); + output = case_create (dict_get_proto (proc->dict)); for (i = 0; i < n_vars; i++) { struct variable *v = dict_get_var (proc->dict, i); @@ -784,25 +840,53 @@ create_output_case (const struct comb_proc *proc) return output; } +static void +mark_file_used (const struct comb_file *file, struct ccase *output) +{ + if (file->in_var != NULL) + case_data_rw (output, file->in_var)->f = true; +} + /* Copies the data from FILE's case into output case OUTPUT. If FILE has an IN variable, then it is set to 1 in OUTPUT. */ static void apply_case (const struct comb_file *file, struct ccase *output) { subcase_copy (&file->src, file->data, &file->dst, output); - if (file->in_var != NULL) - case_data_rw (output, file->in_var)->f = true; + mark_file_used (file, output); +} + +/* Copies the data from FILE's case into output case OUTPUT, + skipping values that are missing or all spaces. + + If FILE has an IN variable, then it is set to 1 in OUTPUT. */ +static void +apply_nonmissing_case (const struct comb_file *file, struct ccase *output) +{ + size_t i; + + for (i = 0; i < subcase_get_n_fields (&file->src); i++) + { + const struct subcase_field *src_field = &file->src.fields[i]; + const struct subcase_field *dst_field = &file->dst.fields[i]; + const union value *src_value + = case_data_idx (file->data, src_field->case_index); + int width = src_field->width; + + if (!mv_is_value_missing (file->mv[i], src_value, MV_ANY) + && !(width > 0 && value_is_spaces (src_value, width))) + value_copy (case_data_rw_idx (output, dst_field->case_index), + src_value, width); + } + mark_file_used (file, output); } -/* Like apply_case() above, but also advances FILE to its next - case. Also, if BY is nonnull, then FILE's is_minimal member - is updated based on whether the new case's BY values still - match those in BY. */ +/* Advances FILE to its next case. If BY is nonnull, then FILE's is_minimal + member is updated based on whether the new case's BY values still match + those in BY. */ static void -apply_file_case_and_advance (struct comb_file *file, struct ccase *output, - union value by[]) +advance_file (struct comb_file *file, union value by[]) { - apply_case (file, output); case_unref (file->data); file->data = casereader_read (file->reader); if (by) @@ -841,11 +925,15 @@ output_case (struct comb_proc *proc, struct ccase *output, union value by[]) if (new_BY) { - size_t n = (subcase_get_n_values (&proc->by_vars) - * sizeof (union value)); + size_t n_values = subcase_get_n_fields (&proc->by_vars); + const struct caseproto *proto = subcase_get_proto (&proc->by_vars); if (proc->prev_BY == NULL) - proc->prev_BY = xmalloc (n); - memcpy (proc->prev_BY, by, n); + { + proc->prev_BY = xmalloc (n_values * sizeof *proc->prev_BY); + caseproto_init_values (proto, proc->prev_BY); + } + caseproto_copy (subcase_get_proto (&proc->by_vars), 0, n_values, + proc->prev_BY, by); } } }