X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fdata-io%2Fcombine-files.c;h=82c36945a2fd803de74a0ea3239c3918440c4ed3;hb=81579d9e9f994fb2908f50af41c3eb033d216e58;hp=1451743dc882e651751cc9b3ccdbc2d1b8f1829e;hpb=2165f59ab9eee5272b4037e45477811627cae078;p=pspp-builds.git diff --git a/src/language/data-io/combine-files.c b/src/language/data-io/combine-files.c index 1451743d..82c36945 100644 --- a/src/language/data-io/combine-files.c +++ b/src/language/data-io/combine-files.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2006, 2007, 2008 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -18,28 +18,28 @@ #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "xalloc.h" +#include "data/any-reader.h" +#include "data/case-matcher.h" +#include "data/case.h" +#include "data/casereader.h" +#include "data/casewriter.h" +#include "data/dictionary.h" +#include "data/format.h" +#include "data/procedure.h" +#include "data/subcase.h" +#include "data/variable.h" +#include "language/command.h" +#include "language/data-io/file-handle.h" +#include "language/data-io/trim.h" +#include "language/lexer/lexer.h" +#include "language/lexer/variable-parser.h" +#include "language/stats/sort-criteria.h" +#include "libpspp/assertion.h" +#include "libpspp/message.h" +#include "libpspp/taint.h" +#include "math/sort.h" + +#include "gl/xalloc.h" #include "gettext.h" #define _(msgid) gettext (msgid) @@ -72,13 +72,13 @@ struct comb_file struct file_handle *handle; /* Input file handle. */ struct dictionary *dict; /* Input file dictionary. */ struct casereader *reader; /* Input data source. */ - struct ccase data; /* The current input case. */ + struct ccase *data; /* The current input case. */ bool is_minimal; /* Does 'data' have minimum BY values across all input files? */ bool is_sorted; /* Is file presorted on the BY variables? */ /* IN subcommand. */ - char in_name[VAR_NAME_LEN + 1]; + char *in_name; struct variable *in_var; }; @@ -98,8 +98,8 @@ struct comb_proc members used. */ struct variable *first; /* Variable specified on FIRST (if any). */ struct variable *last; /* Variable specified on LAST (if any). */ - struct ccase buffered_case; /* Case ready for output except that we don't - know the value for the LAST variable yet. */ + struct ccase *buffered_case; /* Case ready for output except that we don't + know the value for the LAST var yet. */ union value *prev_BY; /* Values of BY vars in buffered_case. */ }; @@ -147,8 +147,8 @@ combine_files (enum comb_command_type command, bool saw_sort = false; struct casereader *active_file = NULL; - char first_name[VAR_NAME_LEN + 1] = ""; - char last_name[VAR_NAME_LEN + 1] = ""; + char *first_name = NULL; + char *last_name = NULL; struct taint *taint = NULL; @@ -165,12 +165,12 @@ combine_files (enum comb_command_type command, subcase_init_empty (&proc.by_vars); proc.first = NULL; proc.last = NULL; - case_nullify (&proc.buffered_case); + proc.buffered_case = NULL; proc.prev_BY = NULL; dict_set_case_limit (proc.dict, dict_get_case_limit (dataset_dict (ds))); - lex_match (lexer, '/'); + lex_match (lexer, T_SLASH); for (;;) { struct comb_file *file; @@ -185,7 +185,7 @@ combine_files (enum comb_command_type command, } else break; - lex_match (lexer, '='); + lex_match (lexer, T_EQUALS); if (proc.n_files >= allocated_files) proc.files = x2nrealloc (proc.files, &allocated_files, @@ -198,12 +198,12 @@ combine_files (enum comb_command_type command, file->handle = NULL; file->dict = NULL; file->reader = NULL; - case_nullify (&file->data); + file->data = NULL; file->is_sorted = true; - file->in_name[0] = '\0'; + file->in_name = NULL; file->in_var = NULL; - if (lex_match (lexer, '*')) + if (lex_match (lexer, T_ASTERISK)) { if (!proc_has_active_file (ds)) { @@ -230,7 +230,7 @@ combine_files (enum comb_command_type command, goto error; } - while (lex_match (lexer, '/')) + while (lex_match (lexer, T_SLASH)) if (lex_match_id (lexer, "RENAME")) { if (!parse_dict_rename (lexer, file->dict)) @@ -238,20 +238,20 @@ combine_files (enum comb_command_type command, } else if (lex_match_id (lexer, "IN")) { - lex_match (lexer, '='); + lex_match (lexer, T_EQUALS); if (lex_token (lexer) != T_ID) { lex_error (lexer, NULL); goto error; } - if (file->in_name[0]) + if (file->in_name) { msg (SE, _("Multiple IN subcommands for a single FILE or " "TABLE.")); goto error; } - strcpy (file->in_name, lex_tokid (lexer)); + file->in_name = xstrdup (lex_tokcstr (lexer)); lex_get (lexer); } else if (lex_match_id (lexer, "SORT")) @@ -263,7 +263,7 @@ combine_files (enum comb_command_type command, merge_dictionary (proc.dict, file); } - while (lex_token (lexer) != '.') + while (lex_token (lexer) != T_ENDCMD) { if (lex_match (lexer, T_BY)) { @@ -278,7 +278,7 @@ combine_files (enum comb_command_type command, } saw_by = true; - lex_match (lexer, '='); + lex_match (lexer, T_EQUALS); if (!parse_sort_criteria (lexer, proc.dict, &proc.by_vars, &by_vars, NULL)) goto error; @@ -289,7 +289,7 @@ combine_files (enum comb_command_type command, struct comb_file *file = &proc.files[i]; size_t j; - for (j = 0; j < subcase_get_n_values (&proc.by_vars); j++) + for (j = 0; j < subcase_get_n_fields (&proc.by_vars); j++) { const char *name = var_get_name (by_vars[j]); struct variable *var = dict_lookup_var (file->dict, name); @@ -316,30 +316,30 @@ combine_files (enum comb_command_type command, } else if (command != COMB_UPDATE && lex_match_id (lexer, "FIRST")) { - if (first_name[0] != '\0') + if (first_name != NULL) { lex_sbc_only_once ("FIRST"); goto error; } - lex_match (lexer, '='); + lex_match (lexer, T_EQUALS); if (!lex_force_id (lexer)) goto error; - strcpy (first_name, lex_tokid (lexer)); + first_name = xstrdup (lex_tokcstr (lexer)); lex_get (lexer); } else if (command != COMB_UPDATE && lex_match_id (lexer, "LAST")) { - if (last_name[0] != '\0') + if (last_name != NULL) { lex_sbc_only_once ("LAST"); goto error; } - lex_match (lexer, '='); + lex_match (lexer, T_EQUALS); if (!lex_force_id (lexer)) goto error; - strcpy (last_name, lex_tokid (lexer)); + last_name = xstrdup (lex_tokcstr (lexer)); lex_get (lexer); } else if (lex_match_id (lexer, "MAP")) @@ -362,7 +362,7 @@ combine_files (enum comb_command_type command, goto error; } - if (!lex_match (lexer, '/') && lex_token (lexer) != '.') + if (!lex_match (lexer, T_SLASH) && lex_token (lexer) != T_ENDCMD) { lex_end_of_command (lexer); goto error; @@ -378,12 +378,12 @@ combine_files (enum comb_command_type command, } if (n_tables) { - msg (SE, _("BY is required when TABLE is specified.")); + msg (SE, _("BY is required when %s is specified."), "TABLE"); goto error; } if (saw_sort) { - msg (SE, _("BY is required when SORT is specified.")); + msg (SE, _("BY is required when %s is specified."), "SORT"); goto error; } } @@ -423,7 +423,7 @@ combine_files (enum comb_command_type command, } } - proc.output = autopaging_writer_create (dict_get_next_value_idx (proc.dict)); + proc.output = autopaging_writer_create (dict_get_proto (proc.dict)); taint = taint_clone (casewriter_get_taint (proc.output)); /* Set up case matcher. */ @@ -444,7 +444,7 @@ combine_files (enum comb_command_type command, if (!file->is_sorted) file->reader = sort_execute (file->reader, &file->by_vars); taint_propagate (casereader_get_taint (file->reader), taint); - casereader_read (file->reader, &file->data); + file->data = casereader_read (file->reader); if (file->type == COMB_FILE) case_matcher_add_input (proc.matcher, &file->by_vars, &file->data, &file->is_minimal); @@ -471,6 +471,9 @@ combine_files (enum comb_command_type command, free_comb_proc (&proc); + free (first_name); + free (last_name); + return taint_destroy (taint) ? CMD_SUCCESS : CMD_CASCADING_FAILURE; error: @@ -478,6 +481,8 @@ combine_files (enum comb_command_type command, proc_commit (ds); free_comb_proc (&proc); taint_destroy (taint); + free (first_name); + free (last_name); return CMD_CASCADING_FAILURE; } @@ -488,12 +493,32 @@ merge_dictionary (struct dictionary *const m, struct comb_file *f) struct dictionary *d = f->dict; const char *d_docs, *m_docs; int i; + const char *file_encoding; if (dict_get_label (m) == NULL) dict_set_label (m, dict_get_label (d)); d_docs = dict_get_documents (d); m_docs = dict_get_documents (m); + + + /* FIXME: If the input files have different encodings, then + the result is undefined. + The correct thing to do would be to convert to an encoding + which can cope with all the input files (eg UTF-8). + */ + file_encoding = dict_get_encoding (f->dict); + if ( file_encoding != NULL) + { + if ( dict_get_encoding (m) == NULL) + dict_set_encoding (m, file_encoding); + else if ( 0 != strcmp (file_encoding, dict_get_encoding (m))) + { + msg (MW, + _("Combining files with incompatible encodings. String data may not be represented correctly.")); + } + } + if (d_docs != NULL) { if (m_docs == NULL) @@ -542,7 +567,7 @@ merge_dictionary (struct dictionary *const m, struct comb_file *f) ds_put_format (&s, _("In an earlier file, %s was a string " "variable with width %d."), var_name, var_get_width (mv)); - msg (SE, ds_cstr (&s)); + msg (SE, "%s", ds_cstr (&s)); ds_destroy (&s); return false; } @@ -555,24 +580,25 @@ merge_dictionary (struct dictionary *const m, struct comb_file *f) var_set_label (mv, var_get_label (dv)); } else - mv = dict_clone_var_assert (m, dv, var_get_name (dv)); + mv = dict_clone_var_assert (m, dv); } return true; } -/* If VAR_NAME is a non-empty string, attempts to create a +/* If VAR_NAME is non-NULL, attempts to create a variable named VAR_NAME, with format F1.0, in DICT, and stores a pointer to the variable in *VAR. Returns true if successful, false if the variable name is a duplicate (in which case a message saying that the variable specified on the - given SUBCOMMAND is a duplicate is emitted). Also returns - true, without doing anything, if VAR_NAME is null or empty. */ + given SUBCOMMAND is a duplicate is emitted). + + Does nothing and returns true if VAR_NAME is null. */ static bool create_flag_var (const char *subcommand, const char *var_name, struct dictionary *dict, struct variable **var) { - if (var_name[0] != '\0') + if (var_name != NULL) { struct fmt_spec format = fmt_for_output (FMT_F, 1, 0); *var = dict_create_var (dict, var_name, 0); @@ -605,7 +631,8 @@ close_all_comb_files (struct comb_proc *proc) fh_unref (file->handle); dict_destroy (file->dict); casereader_destroy (file->reader); - case_destroy (&file->data); + case_unref (file->data); + free (file->in_name); } free (proc->files); proc->files = NULL; @@ -620,13 +647,18 @@ free_comb_proc (struct comb_proc *proc) dict_destroy (proc->dict); casewriter_destroy (proc->output); case_matcher_destroy (proc->matcher); + if (proc->prev_BY) + { + caseproto_destroy_values (subcase_get_proto (&proc->by_vars), + proc->prev_BY); + free (proc->prev_BY); + } subcase_destroy (&proc->by_vars); - case_destroy (&proc->buffered_case); - free (proc->prev_BY); + case_unref (proc->buffered_case); } static bool scan_table (struct comb_file *, union value by[]); -static void create_output_case (const struct comb_proc *, struct ccase *); +static struct ccase *create_output_case (const struct comb_proc *); static void apply_case (const struct comb_file *, struct ccase *); static void apply_file_case_and_advance (struct comb_file *, struct ccase *, union value by[]); @@ -641,7 +673,6 @@ execute_add_files (struct comb_proc *proc) while (case_matcher_match (proc->matcher, &by)) { - struct ccase output; size_t i; for (i = 0; i < proc->n_files; i++) @@ -649,9 +680,9 @@ execute_add_files (struct comb_proc *proc) struct comb_file *file = &proc->files[i]; while (file->is_minimal) { - create_output_case (proc, &output); - apply_file_case_and_advance (file, &output, by); - output_case (proc, &output, by); + struct ccase *output = create_output_case (proc); + apply_file_case_and_advance (file, output, by); + output_case (proc, output, by); } } } @@ -666,25 +697,25 @@ execute_match_files (struct comb_proc *proc) while (case_matcher_match (proc->matcher, &by)) { - struct ccase output; + struct ccase *output; size_t i; - create_output_case (proc, &output); + output = create_output_case (proc); for (i = proc->n_files; i-- > 0; ) { struct comb_file *file = &proc->files[i]; if (file->type == COMB_FILE) { if (file->is_minimal) - apply_file_case_and_advance (file, &output, NULL); + apply_file_case_and_advance (file, output, NULL); } else { if (scan_table (file, by)) - apply_case (file, &output); + apply_case (file, output); } } - output_case (proc, &output, by); + output_case (proc, output, by); } output_buffered_case (proc); } @@ -699,15 +730,15 @@ execute_update (struct comb_proc *proc) while (case_matcher_match (proc->matcher, &by)) { struct comb_file *first, *file; - struct ccase output; + struct ccase *output; /* Find first nonnull case in array and make an output case from it. */ - create_output_case (proc, &output); + output = create_output_case (proc); for (first = &proc->files[0]; ; first++) if (first->is_minimal) break; - apply_file_case_and_advance (first, &output, by); + apply_file_case_and_advance (first, output, by); /* Read additional cases and update the output case from them. (Don't update the output case from any duplicate @@ -716,9 +747,9 @@ execute_update (struct comb_proc *proc) file < &proc->files[proc->n_files]; file++) { while (file->is_minimal) - apply_file_case_and_advance (file, &output, by); + apply_file_case_and_advance (file, output, by); } - casewriter_write (proc->output, &output); + casewriter_write (proc->output, output); /* Write duplicate cases in the master file directly to the output. */ @@ -727,9 +758,9 @@ execute_update (struct comb_proc *proc) n_duplicates++; while (first->is_minimal) { - create_output_case (proc, &output); - apply_file_case_and_advance (first, &output, by); - casewriter_write (proc->output, &output); + output = create_output_case (proc); + apply_file_case_and_advance (first, output, by); + casewriter_write (proc->output, output); } } } @@ -746,13 +777,13 @@ execute_update (struct comb_proc *proc) static bool scan_table (struct comb_file *file, union value by[]) { - while (!case_is_null (&file->data)) + while (file->data != NULL) { - int cmp = subcase_compare_3way_xc (&file->by_vars, by, &file->data); + int cmp = subcase_compare_3way_xc (&file->by_vars, by, file->data); if (cmp > 0) { - case_destroy (&file->data); - casereader_read (file->reader, &file->data); + case_unref (file->data); + file->data = casereader_read (file->reader); } else return cmp == 0; @@ -760,16 +791,17 @@ scan_table (struct comb_file *file, union value by[]) return false; } -/* Creates OUTPUT as an output case for PROC, by initializing each of - its values to system-missing or blanks, except that the values - of IN variables are set to 0. */ -static void -create_output_case (const struct comb_proc *proc, struct ccase *output) +/* Creates and returns an output case for PROC, initializing each + of its values to system-missing or blanks, except that the + values of IN variables are set to 0. */ +static struct ccase * +create_output_case (const struct comb_proc *proc) { size_t n_vars = dict_get_var_cnt (proc->dict); + struct ccase *output; size_t i; - case_create (output, dict_get_next_value_idx (proc->dict)); + output = case_create (dict_get_proto (proc->dict)); for (i = 0; i < n_vars; i++) { struct variable *v = dict_get_var (proc->dict, i); @@ -781,6 +813,7 @@ create_output_case (const struct comb_proc *proc, struct ccase *output) if (file->in_var != NULL) case_data_rw (output, file->in_var)->f = false; } + return output; } /* Copies the data from FILE's case into output case OUTPUT. @@ -788,7 +821,7 @@ create_output_case (const struct comb_proc *proc, struct ccase *output) static void apply_case (const struct comb_file *file, struct ccase *output) { - subcase_copy (&file->src, &file->data, &file->dst, output); + subcase_copy (&file->src, file->data, &file->dst, output); if (file->in_var != NULL) case_data_rw (output, file->in_var)->f = true; } @@ -802,11 +835,11 @@ apply_file_case_and_advance (struct comb_file *file, struct ccase *output, union value by[]) { apply_case (file, output); - case_destroy (&file->data); - casereader_read (file->reader, &file->data); + case_unref (file->data); + file->data = casereader_read (file->reader); if (by) - file->is_minimal = (!case_is_null (&file->data) - && subcase_equal_cx (&file->by_vars, &file->data, by)); + file->is_minimal = (file->data != NULL + && subcase_equal_cx (&file->by_vars, file->data, by)); } /* Writes OUTPUT, whose BY values has been extracted into BY, to @@ -828,23 +861,27 @@ output_case (struct comb_proc *proc, struct ccase *output, union value by[]) { new_BY = !subcase_equal_xx (&proc->by_vars, proc->prev_BY, by); if (proc->last != NULL) - case_data_rw (&proc->buffered_case, proc->last)->f = new_BY; - casewriter_write (proc->output, &proc->buffered_case); + case_data_rw (proc->buffered_case, proc->last)->f = new_BY; + casewriter_write (proc->output, proc->buffered_case); } else new_BY = true; - case_move (&proc->buffered_case, output); + proc->buffered_case = output; if (proc->first != NULL) - case_data_rw (&proc->buffered_case, proc->first)->f = new_BY; + case_data_rw (proc->buffered_case, proc->first)->f = new_BY; if (new_BY) { - size_t n = (subcase_get_n_values (&proc->by_vars) - * sizeof (union value)); + size_t n_values = subcase_get_n_fields (&proc->by_vars); + const struct caseproto *proto = subcase_get_proto (&proc->by_vars); if (proc->prev_BY == NULL) - proc->prev_BY = xmalloc (n); - memcpy (proc->prev_BY, by, n); + { + proc->prev_BY = xmalloc (n_values * sizeof *proc->prev_BY); + caseproto_init_values (proto, proc->prev_BY); + } + caseproto_copy (subcase_get_proto (&proc->by_vars), 0, n_values, + proc->prev_BY, by); } } } @@ -857,8 +894,8 @@ output_buffered_case (struct comb_proc *proc) if (proc->prev_BY != NULL) { if (proc->last != NULL) - case_data_rw (&proc->buffered_case, proc->last)->f = 1.0; - casewriter_write (proc->output, &proc->buffered_case); - case_nullify (&proc->buffered_case); + case_data_rw (proc->buffered_case, proc->last)->f = 1.0; + casewriter_write (proc->output, proc->buffered_case); + proc->buffered_case = NULL; } }