1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21 #include <data/any-reader.h>
22 #include <data/case-matcher.h>
23 #include <data/case.h>
24 #include <data/casereader.h>
25 #include <data/casewriter.h>
26 #include <data/dictionary.h>
27 #include <data/format.h>
28 #include <data/procedure.h>
29 #include <data/subcase.h>
30 #include <data/variable.h>
31 #include <language/command.h>
32 #include <language/data-io/file-handle.h>
33 #include <language/data-io/trim.h>
34 #include <language/lexer/lexer.h>
35 #include <language/lexer/variable-parser.h>
36 #include <language/stats/sort-criteria.h>
37 #include <libpspp/assertion.h>
38 #include <libpspp/message.h>
39 #include <libpspp/taint.h>
40 #include <math/sort.h>
45 #define _(msgid) gettext (msgid)
/* Command and input kinds used throughout this file.
   cmd_add_files, cmd_match_files and cmd_update pass COMB_ADD, COMB_MATCH
   and COMB_UPDATE respectively to combine_files as its comb_command_type
   argument.  COMB_FILE and COMB_TABLE tag each input according to the
   subcommand that named it. */
47 enum comb_command_type
57 COMB_FILE, /* Specified on FILE= subcommand. */
58 COMB_TABLE /* Specified on TABLE= subcommand. */
/* Per-input state.  combine_files appends one of these to proc.files for
   every FILE or TABLE specification it parses.
   combine_files reports a missing BY variable as belonging to the "Active
   file" when 'handle' is null, so a null handle appears to denote the
   active file rather than a named file.
   'src' and 'dst' are parallel: combine_files adds a variable of this
   input's dictionary to 'src' and the same-named variable of the output
   dictionary to 'dst', and apply_case copies between them.
   'in_var' is the flag variable created for an IN subcommand;
   create_output_case resets it to 0 and apply_case sets it to 1. */
61 /* One FILE or TABLE subcommand. */
65 enum comb_file_type type; /* COMB_FILE or COMB_TABLE. */
68 struct subcase by_vars; /* BY variables in this input file. */
69 struct subcase src, dst; /* Data to copy to output; where to put it. */
72 struct file_handle *handle; /* Input file handle. */
73 struct dictionary *dict; /* Input file dictionary. */
74 struct casereader *reader; /* Input data source. */
75 struct ccase *data; /* The current input case. */
76 bool is_minimal; /* Does 'data' have minimum BY values across
78 bool is_sorted; /* Is file presorted on the BY variables? */
82 struct variable *in_var;
/* State for one run of ADD FILES, MATCH FILES or UPDATE: the inputs, the
   output dictionary and writer, and the matcher that groups input cases by
   their BY values (combine_files registers only COMB_FILE inputs with it).
   'buffered_case' and 'prev_BY' form a one-case look-behind used by
   output_case and output_buffered_case to fill in the FIRST and LAST
   variables. */
87 struct comb_file *files; /* All the files being merged. */
88 size_t n_files; /* Number of files. */
90 struct dictionary *dict; /* Dictionary of output file. */
91 struct subcase by_vars; /* BY variables in the output. */
92 struct casewriter *output; /* Destination for output. */
94 struct case_matcher *matcher;
97 Only if "first" or "last" is nonnull are the remaining
99 struct variable *first; /* Variable specified on FIRST (if any). */
100 struct variable *last; /* Variable specified on LAST (if any). */
101 struct ccase *buffered_case; /* Case ready for output except that we don't
102 know the value for the LAST var yet. */
103 union value *prev_BY; /* Values of BY vars in buffered_case. */
106 static int combine_files (enum comb_command_type, struct lexer *,
108 static void free_comb_proc (struct comb_proc *);
110 static void close_all_comb_files (struct comb_proc *);
111 static bool merge_dictionary (struct dictionary *const, struct comb_file *);
113 static void execute_update (struct comb_proc *);
114 static void execute_match_files (struct comb_proc *);
115 static void execute_add_files (struct comb_proc *);
117 static bool create_flag_var (const char *subcommand_name, const char *var_name,
118 struct dictionary *, struct variable **);
119 static void output_case (struct comb_proc *, struct ccase *, union value *by);
120 static void output_buffered_case (struct comb_proc *);
123 cmd_add_files (struct lexer *lexer, struct dataset *ds)
125 return combine_files (COMB_ADD, lexer, ds);
129 cmd_match_files (struct lexer *lexer, struct dataset *ds)
131 return combine_files (COMB_MATCH, lexer, ds);
135 cmd_update (struct lexer *lexer, struct dataset *ds)
137 return combine_files (COMB_UPDATE, lexer, ds);
/* Shared implementation of ADD FILES, MATCH FILES and UPDATE.

   COMMAND selects which of the three is being run, LEXER supplies the
   command text, and DS is the dataset whose active file may be named as an
   input (written "*") and which receives the combined result through
   proc_set_active_file.

   The statements below run in this order:
   - parse each FILE specification (TABLE is accepted for MATCH FILES only)
     together with its RENAME, IN and SORT subcommands, and merge that
     input's dictionary into proc.dict;
   - parse the trailing BY, FIRST, LAST, MAP, DROP and KEEP subcommands
     (FIRST and LAST are not accepted for UPDATE);
   - diagnose a missing BY where UPDATE, TABLE or SORT requires one;
   - create the IN, FIRST and LAST flag variables and build each input's
     src/dst variable mapping;
   - open the inputs, sorting those for which SORT was given, and register
     the COMB_FILE inputs with the case matcher;
   - run the executor for COMMAND and install the output as the active file.

   Returns CMD_SUCCESS when taint_destroy reports success for the taint
   cloned from the output writer (to which every input reader's taint is
   propagated), otherwise CMD_CASCADING_FAILURE.  The cleanup code at the
   end of the function also returns CMD_CASCADING_FAILURE. */
141 combine_files (enum comb_command_type command,
142 struct lexer *lexer, struct dataset *ds)
144 struct comb_proc proc;
147 bool saw_sort = false;
148 struct casereader *active_file = NULL;
150 char *first_name = NULL;
151 char *last_name = NULL;
153 struct taint *taint = NULL;
156 size_t allocated_files = 0;
162 proc.dict = dict_create ();
165 subcase_init_empty (&proc.by_vars);
168 proc.buffered_case = NULL;
171 dict_set_case_limit (proc.dict, dict_get_case_limit (dataset_dict (ds)));
173 lex_match (lexer, T_SLASH);
176 struct comb_file *file;
177 enum comb_file_type type;
179 if (lex_match_id (lexer, "FILE"))
181 else if (command == COMB_MATCH && lex_match_id (lexer, "TABLE"))
188 lex_match (lexer, T_EQUALS);
190 if (proc.n_files >= allocated_files)
191 proc.files = x2nrealloc (proc.files, &allocated_files,
193 file = &proc.files[proc.n_files++];
195 subcase_init_empty (&file->by_vars);
196 subcase_init_empty (&file->src);
197 subcase_init_empty (&file->dst);
202 file->is_sorted = true;
203 file->in_name = NULL;
/* "*" names the active file, whose dictionary is cloned; anything else is
   parsed as a file handle and opened with any_reader_open. */
206 if (lex_match (lexer, T_ASTERISK))
208 if (!proc_has_active_file (ds))
210 msg (SE, _("Cannot specify the active file since no active "
211 "file has been defined."));
215 if (proc_make_temporary_transformations_permanent (ds))
216 msg (SE, _("This command may not be used after TEMPORARY when "
217 "the active file is an input source. "
218 "Temporary transformations will be made permanent."));
220 file->dict = dict_clone (dataset_dict (ds));
224 file->handle = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH);
225 if (file->handle == NULL)
228 file->reader = any_reader_open (file->handle, &file->dict);
229 if (file->reader == NULL)
233 while (lex_match (lexer, T_SLASH))
234 if (lex_match_id (lexer, "RENAME"))
236 if (!parse_dict_rename (lexer, file->dict))
239 else if (lex_match_id (lexer, "IN"))
241 lex_match (lexer, T_EQUALS);
242 if (lex_token (lexer) != T_ID)
244 lex_error (lexer, NULL);
250 msg (SE, _("Multiple IN subcommands for a single FILE or "
254 file->in_name = xstrdup (lex_tokcstr (lexer));
257 else if (lex_match_id (lexer, "SORT"))
259 file->is_sorted = false;
/* NOTE(review): merge_dictionary is declared to return bool, but its result
   is discarded here, so a reported type/width conflict does not stop the
   command at this point -- confirm that this is intended. */
263 merge_dictionary (proc.dict, file);
266 while (lex_token (lexer) != T_ENDCMD)
268 if (lex_match (lexer, T_BY))
270 const struct variable **by_vars;
276 lex_sbc_only_once ("BY");
281 lex_match (lexer, T_EQUALS);
282 if (!parse_sort_criteria (lexer, proc.dict, &proc.by_vars,
287 for (i = 0; i < proc.n_files; i++)
289 struct comb_file *file = &proc.files[i];
292 for (j = 0; j < subcase_get_n_fields (&proc.by_vars); j++)
294 const char *name = var_get_name (by_vars[j]);
295 struct variable *var = dict_lookup_var (file->dict, name);
297 subcase_add_var (&file->by_vars, var,
298 subcase_get_direction (&proc.by_vars, j));
301 if (file->handle != NULL)
302 msg (SE, _("File %s lacks BY variable %s."),
303 fh_get_name (file->handle), name);
305 msg (SE, _("Active file lacks BY variable %s."), name);
309 assert (!ok || subcase_conformable (&file->by_vars,
310 &proc.files[0].by_vars));
317 else if (command != COMB_UPDATE && lex_match_id (lexer, "FIRST"))
319 if (first_name != NULL)
321 lex_sbc_only_once ("FIRST");
325 lex_match (lexer, T_EQUALS);
326 if (!lex_force_id (lexer))
328 first_name = xstrdup (lex_tokcstr (lexer));
331 else if (command != COMB_UPDATE && lex_match_id (lexer, "LAST"))
333 if (last_name != NULL)
335 lex_sbc_only_once ("LAST");
339 lex_match (lexer, T_EQUALS);
340 if (!lex_force_id (lexer))
342 last_name = xstrdup (lex_tokcstr (lexer));
345 else if (lex_match_id (lexer, "MAP"))
349 else if (lex_match_id (lexer, "DROP"))
351 if (!parse_dict_drop (lexer, proc.dict))
354 else if (lex_match_id (lexer, "KEEP"))
356 if (!parse_dict_keep (lexer, proc.dict))
361 lex_error (lexer, NULL);
365 if (!lex_match (lexer, T_SLASH) && lex_token (lexer) != T_ENDCMD)
367 lex_end_of_command (lexer);
374 if (command == COMB_UPDATE)
376 msg (SE, _("The BY subcommand is required."));
381 msg (SE, _("BY is required when %s is specified."), "TABLE");
386 msg (SE, _("BY is required when %s is specified."), "SORT");
391 /* Add IN, FIRST, and LAST variables to master dictionary. */
392 for (i = 0; i < proc.n_files; i++)
394 struct comb_file *file = &proc.files[i];
395 if (!create_flag_var ("IN", file->in_name, proc.dict, &file->in_var))
398 if (!create_flag_var ("FIRST", first_name, proc.dict, &proc.first)
399 || !create_flag_var ("LAST", last_name, proc.dict, &proc.last))
402 dict_delete_scratch_vars (proc.dict);
403 dict_compact_values (proc.dict);
405 /* Set up mapping from each file's variables to master
407 for (i = 0; i < proc.n_files; i++)
409 struct comb_file *file = &proc.files[i];
410 size_t src_var_cnt = dict_get_var_cnt (file->dict);
413 for (j = 0; j < src_var_cnt; j++)
415 struct variable *src_var = dict_get_var (file->dict, j);
416 struct variable *dst_var = dict_lookup_var (proc.dict,
417 var_get_name (src_var));
420 subcase_add_var (&file->src, src_var, SC_ASCEND);
421 subcase_add_var (&file->dst, dst_var, SC_ASCEND);
426 proc.output = autopaging_writer_create (dict_get_proto (proc.dict));
427 taint = taint_clone (casewriter_get_taint (proc.output));
429 /* Set up case matcher. */
430 proc.matcher = case_matcher_create ();
431 for (i = 0; i < proc.n_files; i++)
433 struct comb_file *file = &proc.files[i];
/* A null reader marks an input that reads the active file.  The first such
   input opens the dataset with proc_open.
   NOTE(review): presumably casereader_clone is the alternative branch for
   second and later uses of the active file -- confirm. */
434 if (file->reader == NULL)
436 if (active_file == NULL)
438 proc_discard_output (ds);
439 file->reader = active_file = proc_open (ds);
442 file->reader = casereader_clone (active_file);
444 if (!file->is_sorted)
445 file->reader = sort_execute (file->reader, &file->by_vars);
446 taint_propagate (casereader_get_taint (file->reader), taint);
447 file->data = casereader_read (file->reader);
448 if (file->type == COMB_FILE)
449 case_matcher_add_input (proc.matcher, &file->by_vars,
450 &file->data, &file->is_minimal);
453 if (command == COMB_ADD)
454 execute_add_files (&proc);
455 else if (command == COMB_MATCH)
456 execute_match_files (&proc);
457 else if (command == COMB_UPDATE)
458 execute_update (&proc);
462 case_matcher_destroy (proc.matcher);
464 close_all_comb_files (&proc);
465 if (active_file != NULL)
468 proc_set_active_file (ds, casewriter_make_reader (proc.output), proc.dict);
472 free_comb_proc (&proc);
477 return taint_destroy (taint) ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
480 if (active_file != NULL)
482 free_comb_proc (&proc);
483 taint_destroy (taint);
486 return CMD_CASCADING_FAILURE;
/* Folds the dictionary of input F into the master (output) dictionary M.

   - M takes F's file label if M has none yet.
   - M takes F's encoding if M has none yet; a differing encoding produces
     the "incompatible encodings" message.
   - F's documents are either set on M directly or appended to M's existing
     documents with xasprintf.
   - Each variable of F is looked up in M by name.  Scratch variables are
     tested for first (presumably so that they can be skipped).  A variable
     already in M must have the same width, otherwise an SE message
     describing both definitions is issued; when the widths agree, M's copy
     receives F's value labels, missing values and variable label wherever
     it has none of its own.  dict_clone_var_assert adds F's variable to M
     (presumably when M has no variable of that name).

   NOTE(review): the width-mismatch path calls fh_get_name (f->handle)
   unconditionally, but combine_files treats a null handle as meaning the
   active file.  Confirm that fh_get_name tolerates NULL, or substitute a
   placeholder name for the active file on this path. */
489 /* Merge the dictionary for file F into master dictionary M. */
491 merge_dictionary (struct dictionary *const m, struct comb_file *f)
493 struct dictionary *d = f->dict;
494 const char *d_docs, *m_docs;
496 const char *file_encoding;
498 if (dict_get_label (m) == NULL)
499 dict_set_label (m, dict_get_label (d));
501 d_docs = dict_get_documents (d);
502 m_docs = dict_get_documents (m);
505 /* FIXME: If the input files have different encodings, then
506 the result is undefined.
507 The correct thing to do would be to convert to an encoding
508 which can cope with all the input files (eg UTF-8).
510 file_encoding = dict_get_encoding (f->dict);
511 if ( file_encoding != NULL)
513 if ( dict_get_encoding (m) == NULL)
514 dict_set_encoding (m, file_encoding);
515 else if ( 0 != strcmp (file_encoding, dict_get_encoding (m)))
518 _("Combining files with incompatible encodings. String data may not be represented correctly."));
525 dict_set_documents (m, d_docs);
528 char *new_docs = xasprintf ("%s%s", m_docs, d_docs);
529 dict_set_documents (m, new_docs);
534 for (i = 0; i < dict_get_var_cnt (d); i++)
536 struct variable *dv = dict_get_var (d, i);
537 struct variable *mv = dict_lookup_var (m, var_get_name (dv));
539 if (dict_class_from_id (var_get_name (dv)) == DC_SCRATCH)
544 if (var_get_width (mv) != var_get_width (dv))
546 const char *var_name = var_get_name (dv);
547 const char *file_name = fh_get_name (f->handle);
548 struct string s = DS_EMPTY_INITIALIZER;
550 _("Variable %s in file %s has different "
551 "type or width from the same variable in "
553 var_name, file_name);
554 ds_put_cstr (&s, " ");
555 if (var_is_numeric (dv))
556 ds_put_format (&s, _("In file %s, %s is numeric."),
557 file_name, var_name);
559 ds_put_format (&s, _("In file %s, %s is a string variable "
561 file_name, var_name, var_get_width (dv));
562 ds_put_cstr (&s, " ");
563 if (var_is_numeric (mv))
564 ds_put_format (&s, _("In an earlier file, %s was numeric."),
567 ds_put_format (&s, _("In an earlier file, %s was a string "
568 "variable with width %d."),
569 var_name, var_get_width (mv));
570 msg (SE, "%s", ds_cstr (&s));
575 if (var_has_value_labels (dv) && !var_has_value_labels (mv))
576 var_set_value_labels (mv, var_get_value_labels (dv));
577 if (var_has_missing_values (dv) && !var_has_missing_values (mv))
578 var_set_missing_values (mv, var_get_missing_values (dv));
579 if (var_get_label (dv) && !var_get_label (mv))
580 var_set_label (mv, var_get_label (dv));
583 mv = dict_clone_var_assert (m, dv);
589 /* If VAR_NAME is non-NULL, attempts to create a
590 variable named VAR_NAME, with format F1.0, in DICT, and stores
591 a pointer to the variable in *VAR. Returns true if
592 successful, false if the variable name is a duplicate (in
593 which case a message saying that the variable specified on the
594 given SUBCOMMAND is a duplicate is emitted).
596 Does nothing and returns true if VAR_NAME is null. */
598 create_flag_var (const char *subcommand, const char *var_name,
599 struct dictionary *dict, struct variable **var)
601 if (var_name != NULL)
603 struct fmt_spec format = fmt_for_output (FMT_F, 1, 0);
604 *var = dict_create_var (dict, var_name, 0);
607 msg (SE, _("Variable name %s specified on %s subcommand "
608 "duplicates an existing variable name."),
609 subcommand, var_name);
612 var_set_both_formats (*var, &format);
/* For every input this releases its three subcases (by_vars, src, dst),
   its file handle reference, its dictionary, its reader, its current case
   and its IN variable name.
   combine_files calls this directly and then calls free_comb_proc, which
   calls it again, so it has to be safe to run twice on the same PROC. */
619 /* Closes all the files in PROC and frees their associated data. */
621 close_all_comb_files (struct comb_proc *proc)
625 for (i = 0; i < proc->n_files; i++)
627 struct comb_file *file = &proc->files[i];
628 subcase_destroy (&file->by_vars);
629 subcase_destroy (&file->src);
630 subcase_destroy (&file->dst);
631 fh_unref (file->handle);
632 dict_destroy (file->dict);
633 casereader_destroy (file->reader);
634 case_unref (file->data);
635 free (file->in_name);
/* Releases, in this order: the inputs (via close_all_comb_files), the
   output dictionary, the output writer, the case matcher, the saved BY
   values in prev_BY, the BY subcase, and any case still buffered for
   FIRST/LAST.
   Because proc->dict and proc->output are destroyed here, a caller that
   has handed either of them to another owner must clear the member before
   calling this function. */
642 /* Frees all the data for the procedure. */
644 free_comb_proc (struct comb_proc *proc)
646 close_all_comb_files (proc);
647 dict_destroy (proc->dict);
648 casewriter_destroy (proc->output);
649 case_matcher_destroy (proc->matcher);
652 caseproto_destroy_values (subcase_get_proto (&proc->by_vars),
654 free (proc->prev_BY);
656 subcase_destroy (&proc->by_vars);
657 case_unref (proc->buffered_case);
660 static bool scan_table (struct comb_file *, union value by[]);
661 static struct ccase *create_output_case (const struct comb_proc *);
662 static void apply_case (const struct comb_file *, struct ccase *);
663 static void apply_file_case_and_advance (struct comb_file *, struct ccase *,
665 static void output_case (struct comb_proc *, struct ccase *, union value by[]);
666 static void output_buffered_case (struct comb_proc *);
668 /* Executes the ADD FILES command. */
670 execute_add_files (struct comb_proc *proc)
674 while (case_matcher_match (proc->matcher, &by))
678 for (i = 0; i < proc->n_files; i++)
680 struct comb_file *file = &proc->files[i];
681 while (file->is_minimal)
683 struct ccase *output = create_output_case (proc);
684 apply_file_case_and_advance (file, output, by);
685 output_case (proc, output, by);
689 output_buffered_case (proc);
692 /* Executes the MATCH FILES command. */
694 execute_match_files (struct comb_proc *proc)
698 while (case_matcher_match (proc->matcher, &by))
700 struct ccase *output;
703 output = create_output_case (proc);
704 for (i = proc->n_files; i-- > 0; )
706 struct comb_file *file = &proc->files[i];
707 if (file->type == COMB_FILE)
709 if (file->is_minimal)
710 apply_file_case_and_advance (file, output, NULL);
714 if (scan_table (file, by))
715 apply_case (file, output);
718 output_case (proc, output, by);
720 output_buffered_case (proc);
/* For every group the matcher yields, the first input whose current case
   belongs to the group supplies the base output case.  The cases that the
   later inputs hold in the group are then applied on top of it in file
   order, and the result is written straight to proc->output; output_case
   is not used here (combine_files does not accept FIRST or LAST for
   UPDATE).
   files[0] is the master file: when it supplied the base case, the scan of
   "later inputs" starts at files[1], and any further cases the master has
   in the same group are written out individually afterwards instead of
   being merged.
   The search for the base input has no upper bound, so it relies on the
   matcher having marked at least one input as is_minimal. */
723 /* Executes the UPDATE command. */
725 execute_update (struct comb_proc *proc)
728 size_t n_duplicates = 0;
730 while (case_matcher_match (proc->matcher, &by))
732 struct comb_file *first, *file;
733 struct ccase *output;
735 /* Find first nonnull case in array and make an output case
737 output = create_output_case (proc);
738 for (first = &proc->files[0]; ; first++)
739 if (first->is_minimal)
741 apply_file_case_and_advance (first, output, by);
743 /* Read additional cases and update the output case from
744 them. (Don't update the output case from any duplicate
745 cases in the master file.) */
746 for (file = first + (first == proc->files);
747 file < &proc->files[proc->n_files]; file++)
749 while (file->is_minimal)
750 apply_file_case_and_advance (file, output, by);
752 casewriter_write (proc->output, output);
754 /* Write duplicate cases in the master file directly to the
756 if (first == proc->files && first->is_minimal)
759 while (first->is_minimal)
761 output = create_output_case (proc);
762 apply_file_case_and_advance (first, output, by);
763 casewriter_write (proc->output, output);
769 msg (SW, _("Encountered %zu sets of duplicate cases in the master file."),
/* The comparison is made with subcase_compare_3way_xc between the BY
   values and FILE's current case.  A case that is passed over is
   unreferenced and replaced by the next case read from FILE's reader, so
   FILE->data tracks the table's current position; the loop ends when
   FILE->data becomes NULL. */
773 /* Reads FILE, which must be of type COMB_TABLE, until it
774 encounters a case with BY or greater for its BY variables.
775 Returns true if a case with exactly BY for its BY variables
776 was found, otherwise false. */
778 scan_table (struct comb_file *file, union value by[])
780 while (file->data != NULL)
782 int cmp = subcase_compare_3way_xc (&file->by_vars, by, file->data);
785 case_unref (file->data);
786 file->data = casereader_read (file->reader);
794 /* Creates and returns an output case for PROC, initializing each
795 of its values to system-missing or blanks, except that the
796 values of IN variables are set to 0. */
797 static struct ccase *
798 create_output_case (const struct comb_proc *proc)
800 size_t n_vars = dict_get_var_cnt (proc->dict);
801 struct ccase *output;
804 output = case_create (dict_get_proto (proc->dict));
805 for (i = 0; i < n_vars; i++)
807 struct variable *v = dict_get_var (proc->dict, i);
808 value_set_missing (case_data_rw (output, v), var_get_width (v));
810 for (i = 0; i < proc->n_files; i++)
812 struct comb_file *file = &proc->files[i];
813 if (file->in_var != NULL)
814 case_data_rw (output, file->in_var)->f = false;
819 /* Copies the data from FILE's case into output case OUTPUT.
820 If FILE has an IN variable, then it is set to 1 in OUTPUT. */
822 apply_case (const struct comb_file *file, struct ccase *output)
824 subcase_copy (&file->src, file->data, &file->dst, output);
825 if (file->in_var != NULL)
826 case_data_rw (output, file->in_var)->f = true;
/* The case that was just applied is unreferenced before the next one is
   read from FILE's reader.  When is_minimal is recomputed it becomes false
   if the reader produced no further case (file->data == NULL), and
   otherwise reflects subcase_equal_cx on FILE's BY variables between the
   new case and BY. */
829 /* Like apply_case() above, but also advances FILE to its next
830 case. Also, if BY is nonnull, then FILE's is_minimal member
831 is updated based on whether the new case's BY values still
832 match those in BY. */
834 apply_file_case_and_advance (struct comb_file *file, struct ccase *output,
837 apply_case (file, output);
838 case_unref (file->data);
839 file->data = casereader_read (file->reader);
841 file->is_minimal = (file->data != NULL
842 && subcase_equal_cx (&file->by_vars, file->data, by));
/* With neither FIRST nor LAST in use, OUTPUT is written immediately.
   Otherwise OUTPUT is held in proc->buffered_case and the case buffered by
   the previous call is the one that gets written: its LAST value comes
   from comparing the BY values saved for it in prev_BY against BY.  OUTPUT
   then receives its FIRST value, and prev_BY (allocated and initialised on
   first use) is refreshed with caseproto_copy, presumably from BY.
   The final buffered case is written by output_buffered_case. */
845 /* Writes OUTPUT, whose BY values has been extracted into BY, to
846 PROC's output file, first initializing any FIRST or LAST
847 variables in OUTPUT to the correct values. */
849 output_case (struct comb_proc *proc, struct ccase *output, union value by[])
851 if (proc->first == NULL && proc->last == NULL)
852 casewriter_write (proc->output, output);
855 /* It's harder with LAST, because we can't know whether
856 this case is the last in a group until we've prepared
857 the *next* case also. Thus, we buffer the previous
858 output case until the next one is ready. */
860 if (proc->prev_BY != NULL)
862 new_BY = !subcase_equal_xx (&proc->by_vars, proc->prev_BY, by);
863 if (proc->last != NULL)
864 case_data_rw (proc->buffered_case, proc->last)->f = new_BY;
865 casewriter_write (proc->output, proc->buffered_case);
870 proc->buffered_case = output;
871 if (proc->first != NULL)
872 case_data_rw (proc->buffered_case, proc->first)->f = new_BY;
876 size_t n_values = subcase_get_n_fields (&proc->by_vars);
877 const struct caseproto *proto = subcase_get_proto (&proc->by_vars);
878 if (proc->prev_BY == NULL)
880 proc->prev_BY = xmalloc (n_values * sizeof *proc->prev_BY);
881 caseproto_init_values (proto, proc->prev_BY);
883 caseproto_copy (subcase_get_proto (&proc->by_vars), 0, n_values,
889 /* Writes a trailing buffered case to the output, if FIRST or
892 output_buffered_case (struct comb_proc *proc)
894 if (proc->prev_BY != NULL)
896 if (proc->last != NULL)
897 case_data_rw (proc->buffered_case, proc->last)->f = 1.0;
898 casewriter_write (proc->output, proc->buffered_case);
899 proc->buffered_case = NULL;