1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21 #include "data/any-reader.h"
22 #include "data/case-matcher.h"
23 #include "data/case.h"
24 #include "data/casereader.h"
25 #include "data/casewriter.h"
26 #include "data/dataset.h"
27 #include "data/dictionary.h"
28 #include "data/format.h"
29 #include "data/subcase.h"
30 #include "data/variable.h"
31 #include "language/command.h"
32 #include "language/data-io/file-handle.h"
33 #include "language/data-io/trim.h"
34 #include "language/lexer/lexer.h"
35 #include "language/lexer/variable-parser.h"
36 #include "language/stats/sort-criteria.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/message.h"
39 #include "libpspp/string-array.h"
40 #include "libpspp/taint.h"
41 #include "math/sort.h"
43 #include "gl/xalloc.h"
46 #define _(msgid) gettext (msgid)
/* Enumeration fragment.  NOTE(review): the tag visible here is
   comb_command_type, yet COMB_FILE and COMB_TABLE are stored in
   members declared as enum comb_file_type further down, while
   COMB_ADD, COMB_MATCH and COMB_UPDATE are the values passed where
   enum comb_command_type is expected.  The lines separating the two
   enumerations are not visible in this view -- confirm against the
   full file. */
48 enum comb_command_type
58 COMB_FILE, /* Specified on FILE= subcommand. */
59 COMB_TABLE /* Specified on TABLE= subcommand. */
62 /* One FILE or TABLE subcommand.  Each element of the files array in
   the procedure state describes one input in these terms. */
66 enum comb_file_type type; /* COMB_FILE or COMB_TABLE. */
69 struct subcase by_vars; /* BY variables in this input file. */
70 struct subcase src, dst; /* Data to copy to output; where to put it. */
73 struct file_handle *handle; /* Input file handle. */
74 struct dictionary *dict; /* Input file dictionary. */
75 struct casereader *reader; /* Input data source. */
76 struct ccase *data; /* The current input case. */
77 bool is_minimal; /* Does 'data' have minimum BY values across
79 bool is_sorted; /* Is file presorted on the BY variables? */
83 struct variable *in_var; /* Flag variable made by create_flag_var() for IN; tested against NULL before each use. */
/* Members of the procedure state that combine_files() keeps in a
   local struct comb_proc and hands to the execute_*() routines.
   NOTE(review): the line that opens the struct is not visible in
   this view. */
88 struct comb_file *files; /* All the files being merged. */
89 size_t n_files; /* Number of files. */
91 struct dictionary *dict; /* Dictionary of output file. */
92 struct subcase by_vars; /* BY variables in the output. */
93 struct casewriter *output; /* Destination for output. */
95 struct case_matcher *matcher; /* Matcher given every COMB_FILE input by combine_files(). */
98 Only if "first" or "last" is nonnull are the remaining
100 struct variable *first; /* Variable specified on FIRST (if any). */
101 struct variable *last; /* Variable specified on LAST (if any). */
102 struct ccase *buffered_case; /* Case ready for output except that we don't
103 know the value for the LAST var yet. */
104 union value *prev_BY; /* Values of BY vars in buffered_case. */
/* Forward declarations of the static helpers used by the command
   entry points and by combine_files() below. */
107 static int combine_files (enum comb_command_type, struct lexer *,
109 static void free_comb_proc (struct comb_proc *);
111 static void close_all_comb_files (struct comb_proc *);
112 static bool merge_dictionary (struct dictionary *const, struct comb_file *);
114 static void execute_update (struct comb_proc *);
115 static void execute_match_files (struct comb_proc *);
116 static void execute_add_files (struct comb_proc *);
118 static bool create_flag_var (const char *subcommand_name, const char *var_name,
119 struct dictionary *, struct variable **);
120 static void output_case (struct comb_proc *, struct ccase *, union value *by);
121 static void output_buffered_case (struct comb_proc *);
/* Entry point for ADD FILES: forwards LEXER and DS to combine_files()
   with COMB_ADD and returns its result. */
124 cmd_add_files (struct lexer *lexer, struct dataset *ds)
126 return combine_files (COMB_ADD, lexer, ds);
/* Entry point for MATCH FILES: forwards LEXER and DS to
   combine_files() with COMB_MATCH and returns its result. */
130 cmd_match_files (struct lexer *lexer, struct dataset *ds)
132 return combine_files (COMB_MATCH, lexer, ds);
/* Entry point for UPDATE: forwards LEXER and DS to combine_files()
   with COMB_UPDATE and returns its result. */
136 cmd_update (struct lexer *lexer, struct dataset *ds)
138 return combine_files (COMB_UPDATE, lexer, ds);
/* Shared implementation of the three commands; COMMAND selects which
   execute_*() routine runs once parsing is done.

   As far as this view shows, the function:
   - reads FILE subcommands (and TABLE, for COMB_MATCH only), each
     optionally followed by RENAME, IN and SORT, and merges every
     input dictionary into proc.dict with merge_dictionary();
   - reads BY, FIRST and LAST (the latter two are not accepted for
     COMB_UPDATE), MAP, DROP and KEEP until the end of the command;
   - creates the IN, FIRST and LAST flag variables, builds the src and
     dst subcases that map each input onto the output dictionary, and
     registers every COMB_FILE input with the case matcher;
   - runs the selected command, then installs proc.dict and the
     written cases as the dictionary and source of DS.

   Returns CMD_SUCCESS when taint_destroy() reports success and
   CMD_CASCADING_FAILURE otherwise.

   NOTE(review): many lines of this function (braces, error exits,
   labels, else branches) are not visible in this view, so the exact
   control flow between the statements below must be confirmed against
   the full file. */
142 combine_files (enum comb_command_type command,
143 struct lexer *lexer, struct dataset *ds)
145 struct comb_proc proc;
148 bool saw_sort = false;
149 struct casereader *active_file = NULL;
151 char *first_name = NULL;
152 char *last_name = NULL;
154 struct taint *taint = NULL;
157 size_t allocated_files = 0;
163 proc.dict = dict_create ();
166 subcase_init_empty (&proc.by_vars);
169 proc.buffered_case = NULL;
172 dict_set_case_limit (proc.dict, dict_get_case_limit (dataset_dict (ds)));
174 lex_match (lexer, T_SLASH);
177 struct comb_file *file;
178 enum comb_file_type type;
180 if (lex_match_id (lexer, "FILE"))
182 else if (command == COMB_MATCH && lex_match_id (lexer, "TABLE"))
189 lex_match (lexer, T_EQUALS);
191 if (proc.n_files >= allocated_files)
192 proc.files = x2nrealloc (proc.files, &allocated_files,
194 file = &proc.files[proc.n_files++];
196 subcase_init_empty (&file->by_vars);
197 subcase_init_empty (&file->src);
198 subcase_init_empty (&file->dst);
203 file->is_sorted = true;
204 file->in_name = NULL;
207 if (lex_match (lexer, T_ASTERISK))
209 if (!dataset_has_source (ds))
211 msg (SE, _("Cannot specify the active file since no active "
212 "file has been defined."));
216 if (proc_make_temporary_transformations_permanent (ds))
217 msg (SE, _("This command may not be used after TEMPORARY when "
218 "the active file is an input source. "
219 "Temporary transformations will be made permanent."));
221 file->dict = dict_clone (dataset_dict (ds));
225 file->handle = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH);
226 if (file->handle == NULL)
229 file->reader = any_reader_open (file->handle, &file->dict);
230 if (file->reader == NULL)
234 while (lex_match (lexer, T_SLASH))
235 if (lex_match_id (lexer, "RENAME"))
237 if (!parse_dict_rename (lexer, file->dict))
240 else if (lex_match_id (lexer, "IN"))
242 lex_match (lexer, T_EQUALS);
243 if (lex_token (lexer) != T_ID)
245 lex_error (lexer, NULL);
251 msg (SE, _("Multiple IN subcommands for a single FILE or "
255 file->in_name = xstrdup (lex_tokcstr (lexer));
258 else if (lex_match_id (lexer, "SORT"))
260 file->is_sorted = false;
264 merge_dictionary (proc.dict, file);
267 while (lex_token (lexer) != T_ENDCMD)
269 if (lex_match (lexer, T_BY))
271 const struct variable **by_vars;
277 lex_sbc_only_once ("BY");
282 lex_match (lexer, T_EQUALS);
283 if (!parse_sort_criteria (lexer, proc.dict, &proc.by_vars,
/* Map each output BY variable onto the same-named variable of every
   input dictionary; the messages below report an input that lacks
   one. */
288 for (i = 0; i < proc.n_files; i++)
290 struct comb_file *file = &proc.files[i];
293 for (j = 0; j < subcase_get_n_fields (&proc.by_vars); j++)
295 const char *name = var_get_name (by_vars[j]);
296 struct variable *var = dict_lookup_var (file->dict, name);
298 subcase_add_var (&file->by_vars, var,
299 subcase_get_direction (&proc.by_vars, j));
302 if (file->handle != NULL)
303 msg (SE, _("File %s lacks BY variable %s."),
304 fh_get_name (file->handle), name);
306 msg (SE, _("Active file lacks BY variable %s."), name);
310 assert (!ok || subcase_conformable (&file->by_vars,
311 &proc.files[0].by_vars));
318 else if (command != COMB_UPDATE && lex_match_id (lexer, "FIRST"))
320 if (first_name != NULL)
322 lex_sbc_only_once ("FIRST");
326 lex_match (lexer, T_EQUALS);
327 if (!lex_force_id (lexer))
329 first_name = xstrdup (lex_tokcstr (lexer));
332 else if (command != COMB_UPDATE && lex_match_id (lexer, "LAST"))
334 if (last_name != NULL)
336 lex_sbc_only_once ("LAST");
340 lex_match (lexer, T_EQUALS);
341 if (!lex_force_id (lexer))
343 last_name = xstrdup (lex_tokcstr (lexer));
346 else if (lex_match_id (lexer, "MAP"))
350 else if (lex_match_id (lexer, "DROP"))
352 if (!parse_dict_drop (lexer, proc.dict))
355 else if (lex_match_id (lexer, "KEEP"))
357 if (!parse_dict_keep (lexer, proc.dict))
362 lex_error (lexer, NULL);
366 if (!lex_match (lexer, T_SLASH) && lex_token (lexer) != T_ENDCMD)
368 lex_end_of_command (lexer);
375 if (command == COMB_UPDATE)
377 msg (SE, _("The BY subcommand is required."));
382 msg (SE, _("BY is required when %s is specified."), "TABLE");
387 msg (SE, _("BY is required when %s is specified."), "SORT");
392 /* Add IN, FIRST, and LAST variables to master dictionary. */
393 for (i = 0; i < proc.n_files; i++)
395 struct comb_file *file = &proc.files[i];
396 if (!create_flag_var ("IN", file->in_name, proc.dict, &file->in_var))
399 if (!create_flag_var ("FIRST", first_name, proc.dict, &proc.first)
400 || !create_flag_var ("LAST", last_name, proc.dict, &proc.last))
403 dict_delete_scratch_vars (proc.dict);
404 dict_compact_values (proc.dict);
406 /* Set up mapping from each file's variables to master
408 for (i = 0; i < proc.n_files; i++)
410 struct comb_file *file = &proc.files[i];
411 size_t src_var_cnt = dict_get_var_cnt (file->dict);
414 for (j = 0; j < src_var_cnt; j++)
416 struct variable *src_var = dict_get_var (file->dict, j);
417 struct variable *dst_var = dict_lookup_var (proc.dict,
418 var_get_name (src_var));
421 subcase_add_var (&file->src, src_var, SC_ASCEND);
422 subcase_add_var (&file->dst, dst_var, SC_ASCEND);
427 proc.output = autopaging_writer_create (dict_get_proto (proc.dict));
428 taint = taint_clone (casewriter_get_taint (proc.output));
430 /* Set up case matcher. */
431 proc.matcher = case_matcher_create ();
432 for (i = 0; i < proc.n_files; i++)
434 struct comb_file *file = &proc.files[i];
/* An input without a reader takes its cases from the dataset: the
   first such input opens it with proc_open(); presumably later ones
   take the casereader_clone() below (the else is not visible). */
435 if (file->reader == NULL)
437 if (active_file == NULL)
439 proc_discard_output (ds);
440 file->reader = active_file = proc_open (ds);
443 file->reader = casereader_clone (active_file);
445 if (!file->is_sorted)
446 file->reader = sort_execute (file->reader, &file->by_vars);
447 taint_propagate (casereader_get_taint (file->reader), taint);
448 file->data = casereader_read (file->reader);
449 if (file->type == COMB_FILE)
450 case_matcher_add_input (proc.matcher, &file->by_vars,
451 &file->data, &file->is_minimal);
454 if (command == COMB_ADD)
455 execute_add_files (&proc);
456 else if (command == COMB_MATCH)
457 execute_match_files (&proc);
458 else if (command == COMB_UPDATE)
459 execute_update (&proc);
463 case_matcher_destroy (proc.matcher);
465 close_all_comb_files (&proc);
466 if (active_file != NULL)
469 dataset_set_dict (ds, proc.dict);
470 dataset_set_source (ds, casewriter_make_reader (proc.output));
474 free_comb_proc (&proc);
479 return taint_destroy (taint) ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
482 if (active_file != NULL)
484 free_comb_proc (&proc);
485 taint_destroy (taint);
488 return CMD_CASCADING_FAILURE;
491 /* Merge the dictionary for file F into master dictionary M.

   Visible steps: M takes the label of F when it has none; the
   documents of F are either installed in M directly or appended after
   those already in M; the encoding of M is taken from F when M has
   none, and a differing encoding has its own diagnostic text; finally
   each variable of F is looked up by name in M.  For a variable found
   in M, a width mismatch is reported through msg (SE, ...), and value
   labels, missing values and the variable label are copied when M
   lacks them.  dict_clone_var_assert() adds a variable to M,
   presumably when M has none of that name.

   NOTE(review): the width-mismatch branch calls
   fh_get_name (f->handle), whereas combine_files() treats a null
   handle as the active file when it reports a missing BY variable.
   If a null handle can reach this branch the call needs a guard --
   confirm against fh_get_name() and the full file. */
493 merge_dictionary (struct dictionary *const m, struct comb_file *f)
495 struct dictionary *d = f->dict;
496 const struct string_array *d_docs, *m_docs;
498 const char *file_encoding;
500 if (dict_get_label (m) == NULL)
501 dict_set_label (m, dict_get_label (d));
503 d_docs = dict_get_documents (d);
504 m_docs = dict_get_documents (m);
507 /* FIXME: If the input files have different encodings, then
508 the result is undefined.
509 The correct thing to do would be to convert to an encoding
510 which can cope with all the input files (eg UTF-8).
512 file_encoding = dict_get_encoding (f->dict);
513 if ( file_encoding != NULL)
515 if ( dict_get_encoding (m) == NULL)
516 dict_set_encoding (m, file_encoding);
517 else if ( 0 != strcmp (file_encoding, dict_get_encoding (m)))
520 _("Combining files with incompatible encodings. String data may not be represented correctly."));
527 dict_set_documents (m, d_docs);
530 struct string_array new_docs;
533 new_docs.n = m_docs->n + d_docs->n;
534 new_docs.strings = xmalloc (new_docs.n * sizeof *new_docs.strings);
535 for (i = 0; i < m_docs->n; i++)
536 new_docs.strings[i] = m_docs->strings[i];
537 for (i = 0; i < d_docs->n; i++)
538 new_docs.strings[m_docs->n + i] = d_docs->strings[i];
540 dict_set_documents (m, &new_docs);
542 free (new_docs.strings);
546 for (i = 0; i < dict_get_var_cnt (d); i++)
548 struct variable *dv = dict_get_var (d, i);
549 struct variable *mv = dict_lookup_var (m, var_get_name (dv));
551 if (dict_class_from_id (var_get_name (dv)) == DC_SCRATCH)
556 if (var_get_width (mv) != var_get_width (dv))
558 const char *var_name = var_get_name (dv);
559 const char *file_name = fh_get_name (f->handle);
560 struct string s = DS_EMPTY_INITIALIZER;
562 _("Variable %s in file %s has different "
563 "type or width from the same variable in "
565 var_name, file_name);
566 ds_put_cstr (&s, " ");
567 if (var_is_numeric (dv))
568 ds_put_format (&s, _("In file %s, %s is numeric."),
569 file_name, var_name);
571 ds_put_format (&s, _("In file %s, %s is a string variable "
573 file_name, var_name, var_get_width (dv));
574 ds_put_cstr (&s, " ");
575 if (var_is_numeric (mv))
576 ds_put_format (&s, _("In an earlier file, %s was numeric."),
579 ds_put_format (&s, _("In an earlier file, %s was a string "
580 "variable with width %d."),
581 var_name, var_get_width (mv));
582 msg (SE, "%s", ds_cstr (&s));
587 if (var_has_value_labels (dv) && !var_has_value_labels (mv))
588 var_set_value_labels (mv, var_get_value_labels (dv));
589 if (var_has_missing_values (dv) && !var_has_missing_values (mv))
590 var_set_missing_values (mv, var_get_missing_values (dv));
591 if (var_get_label (dv) && !var_get_label (mv))
592 var_set_label (mv, var_get_label (dv), file_encoding, false);
595 mv = dict_clone_var_assert (m, dv);
601 /* If VAR_NAME is non-NULL, attempts to create a variable named
   VAR_NAME, with format F1.0, in DICT, and stores a pointer to the
   variable in *VAR.  Returns true if successful, false if the
   variable name is a duplicate (in which case a message naming the
   variable and the SUBCOMMAND it was specified on is emitted).

   Does nothing and returns true if VAR_NAME is null.

   NOTE(review): the duplicate test and the return statements sit on
   lines that are not visible in this view. */
610 create_flag_var (const char *subcommand, const char *var_name,
611 struct dictionary *dict, struct variable **var)
613 if (var_name != NULL)
615 struct fmt_spec format = fmt_for_output (FMT_F, 1, 0);
616 *var = dict_create_var (dict, var_name, 0);
/* The first %s is the variable name and the second is the subcommand,
   so the arguments are passed in that order. */
619 msg (SE, _("Variable name %s specified on %s subcommand "
620 "duplicates an existing variable name."),
621 var_name, subcommand);
624 var_set_both_formats (*var, &format);
631 /* Closes all the files in PROC and frees their associated data:
   for each of the n_files entries the three subcases, the file
   handle reference, the dictionary, the reader, the current case and
   the IN name are released.  NOTE(review): combine_files() calls this
   directly and again through free_comb_proc(); whatever makes the
   second call harmless is on lines not visible in this view. */
633 close_all_comb_files (struct comb_proc *proc)
637 for (i = 0; i < proc->n_files; i++)
639 struct comb_file *file = &proc->files[i];
640 subcase_destroy (&file->by_vars);
641 subcase_destroy (&file->src);
642 subcase_destroy (&file->dst);
643 fh_unref (file->handle);
644 dict_destroy (file->dict);
645 casereader_destroy (file->reader);
646 case_unref (file->data);
647 free (file->in_name);
654 /* Frees all the data for the procedure: the input files, the output
   dictionary, the output writer, the case matcher, the saved BY
   values, the BY subcase and the buffered case.  NOTE(review):
   combine_files() calls this after passing proc.dict and proc.output
   to the dataset and after destroying proc.matcher; presumably those
   members are cleared first on lines not visible in this view --
   confirm. */
656 free_comb_proc (struct comb_proc *proc)
658 close_all_comb_files (proc);
659 dict_destroy (proc->dict);
660 casewriter_destroy (proc->output);
661 case_matcher_destroy (proc->matcher);
664 caseproto_destroy_values (subcase_get_proto (&proc->by_vars),
666 free (proc->prev_BY);
668 subcase_destroy (&proc->by_vars);
669 case_unref (proc->buffered_case);
/* Forward declarations of the helpers shared by the execute_*()
   routines.  output_case() and output_buffered_case() are also
   declared in the earlier group of prototypes; these declarations
   repeat them. */
672 static bool scan_table (struct comb_file *, union value by[]);
673 static struct ccase *create_output_case (const struct comb_proc *);
674 static void apply_case (const struct comb_file *, struct ccase *);
675 static void apply_file_case_and_advance (struct comb_file *, struct ccase *,
677 static void output_case (struct comb_proc *, struct ccase *, union value by[]);
678 static void output_buffered_case (struct comb_proc *);
680 /* Executes the ADD FILES command.  For every match reported by
   case_matcher_match(), the files are visited in order and each one
   contributes one output case per input case for as long as its
   is_minimal flag stays set.  output_buffered_case() is called
   afterwards to flush a case held back for FIRST or LAST. */
682 execute_add_files (struct comb_proc *proc)
686 while (case_matcher_match (proc->matcher, &by))
690 for (i = 0; i < proc->n_files; i++)
692 struct comb_file *file = &proc->files[i];
693 while (file->is_minimal)
695 struct ccase *output = create_output_case (proc);
696 apply_file_case_and_advance (file, output, by);
697 output_case (proc, output, by);
701 output_buffered_case (proc);
704 /* Executes the MATCH FILES command.  Each match reported by
   case_matcher_match() yields one output case.  The files are visited
   from the last to the first, so data applied from earlier files is
   written last.  A COMB_FILE input with is_minimal set is applied and
   advanced (with a null BY, so its is_minimal flag is left alone); a
   scan_table() result decides whether the remaining inputs are
   applied.  output_buffered_case() flushes a held-back case at the
   end. */
706 execute_match_files (struct comb_proc *proc)
710 while (case_matcher_match (proc->matcher, &by))
712 struct ccase *output;
715 output = create_output_case (proc);
716 for (i = proc->n_files; i-- > 0; )
718 struct comb_file *file = &proc->files[i];
719 if (file->type == COMB_FILE)
721 if (file->is_minimal)
722 apply_file_case_and_advance (file, output, NULL);
726 if (scan_table (file, by))
727 apply_case (file, output);
730 output_case (proc, output, by);
732 output_buffered_case (proc);
735 /* Executes the UPDATE command.

   For every match reported by case_matcher_match(), the first file
   whose is_minimal flag is set supplies the base of the output case.
   The next loop begins one element later when that file is
   proc->files[0] (the file the warning below calls the master file)
   and at the same file otherwise; every file it visits is applied for
   as long as its is_minimal flag stays set.  Master cases that still
   match afterwards are written as separate output cases.  Output goes
   straight to casewriter_write() rather than through output_case().

   NOTE(review): the statement that ends the search loop and the code
   that counts n_duplicates are not visible in this view. */
737 execute_update (struct comb_proc *proc)
740 size_t n_duplicates = 0;
742 while (case_matcher_match (proc->matcher, &by))
744 struct comb_file *first, *file;
745 struct ccase *output;
747 /* Find first nonnull case in array and make an output case
749 output = create_output_case (proc);
750 for (first = &proc->files[0]; ; first++)
751 if (first->is_minimal)
753 apply_file_case_and_advance (first, output, by);
755 /* Read additional cases and update the output case from
756 them. (Don't update the output case from any duplicate
757 cases in the master file.) */
758 for (file = first + (first == proc->files);
759 file < &proc->files[proc->n_files]; file++)
761 while (file->is_minimal)
762 apply_file_case_and_advance (file, output, by);
764 casewriter_write (proc->output, output);
766 /* Write duplicate cases in the master file directly to the
768 if (first == proc->files && first->is_minimal)
771 while (first->is_minimal)
773 output = create_output_case (proc);
774 apply_file_case_and_advance (first, output, by);
775 casewriter_write (proc->output, output);
781 msg (SW, _("Encountered %zu sets of duplicate cases in the master file."),
785 /* Reads FILE, which must be of type COMB_TABLE, until it
786 encounters a case with BY or greater for its BY variables.
787 Returns true if a case with exactly BY for its BY variables
788 was found, otherwise false.

   The loop stops when FILE has no current case.  A case that is
   passed over is released with case_unref() before the next one is
   read from the reader of FILE. */
790 scan_table (struct comb_file *file, union value by[])
792 while (file->data != NULL)
794 int cmp = subcase_compare_3way_xc (&file->by_vars, by, file->data);
797 case_unref (file->data);
798 file->data = casereader_read (file->reader);
806 /* Creates and returns an output case for PROC, initializing each
807 of its values to system-missing or blanks, except that the
808 values of IN variables are set to 0. */
809 static struct ccase *
810 create_output_case (const struct comb_proc *proc)
812 size_t n_vars = dict_get_var_cnt (proc->dict);
813 struct ccase *output;
816 output = case_create (dict_get_proto (proc->dict));
/* Start every variable of the output dictionary as missing. */
817 for (i = 0; i < n_vars; i++)
819 struct variable *v = dict_get_var (proc->dict, i);
820 value_set_missing (case_data_rw (output, v), var_get_width (v));
/* IN flags start out false; apply_case() sets them to true. */
822 for (i = 0; i < proc->n_files; i++)
824 struct comb_file *file = &proc->files[i];
825 if (file->in_var != NULL)
826 case_data_rw (output, file->in_var)->f = false;
831 /* Copies the data from FILE's case into output case OUTPUT.
832 If FILE has an IN variable, then it is set to 1 in OUTPUT.

   The copy takes the src fields of the current case of FILE and
   stores them in the dst fields of OUTPUT through subcase_copy(). */
834 apply_case (const struct comb_file *file, struct ccase *output)
836 subcase_copy (&file->src, file->data, &file->dst, output);
837 if (file->in_var != NULL)
838 case_data_rw (output, file->in_var)->f = true;
841 /* Like apply_case() above, but also advances FILE to its next
842 case. Also, if BY is nonnull, then FILE's is_minimal member
843 is updated based on whether the new case's BY values still
844 match those in BY.

   The old case is released with case_unref() before the next one is
   read; is_minimal becomes false when the reader yields no further
   case. */
846 apply_file_case_and_advance (struct comb_file *file, struct ccase *output,
849 apply_case (file, output);
850 case_unref (file->data);
851 file->data = casereader_read (file->reader);
853 file->is_minimal = (file->data != NULL
854 && subcase_equal_cx (&file->by_vars, file->data, by));
857 /* Writes OUTPUT, whose BY values have been extracted into BY, to
858 PROC's output file, first initializing any FIRST or LAST
859 variables in OUTPUT to the correct values. */
861 output_case (struct comb_proc *proc, struct ccase *output, union value by[])
/* Without FIRST and LAST there is nothing to defer: write at once. */
863 if (proc->first == NULL && proc->last == NULL)
864 casewriter_write (proc->output, output);
867 /* It's harder with LAST, because we can't know whether
868 this case is the last in a group until we've prepared
869 the *next* case also. Thus, we buffer the previous
870 output case until the next one is ready. */
/* A case is already buffered: compare its saved BY values with the
   new ones, finish its LAST flag accordingly, and write it out. */
872 if (proc->prev_BY != NULL)
874 new_BY = !subcase_equal_xx (&proc->by_vars, proc->prev_BY, by);
875 if (proc->last != NULL)
876 case_data_rw (proc->buffered_case, proc->last)->f = new_BY;
877 casewriter_write (proc->output, proc->buffered_case);
882 proc->buffered_case = output;
883 if (proc->first != NULL)
884 case_data_rw (proc->buffered_case, proc->first)->f = new_BY;
/* prev_BY is allocated and initialized on first use.  The
   caseproto_copy() call that follows presumably stores the new BY
   values in it; its remaining arguments are not visible here. */
888 size_t n_values = subcase_get_n_fields (&proc->by_vars);
889 const struct caseproto *proto = subcase_get_proto (&proc->by_vars);
890 if (proc->prev_BY == NULL)
892 proc->prev_BY = xmalloc (n_values * sizeof *proc->prev_BY);
893 caseproto_init_values (proto, proc->prev_BY);
895 caseproto_copy (subcase_get_proto (&proc->by_vars), 0, n_values,
/* Summary of the visible statements: when prev_BY is set, the LAST
   variable (if any) of the buffered case is set to 1.0, the case is
   written to the output of PROC, and buffered_case is reset to
   NULL. */
901 /* Writes a trailing buffered case to the output, if FIRST or
904 output_buffered_case (struct comb_proc *proc)
906 if (proc->prev_BY != NULL)
908 if (proc->last != NULL)
909 case_data_rw (proc->buffered_case, proc->last)->f = 1.0;
910 casewriter_write (proc->output, proc->buffered_case);
911 proc->buffered_case = NULL;