1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21 #include "data/any-reader.h"
22 #include "data/case-matcher.h"
23 #include "data/case.h"
24 #include "data/casereader.h"
25 #include "data/casewriter.h"
26 #include "data/dataset.h"
27 #include "data/dictionary.h"
28 #include "data/format.h"
29 #include "data/subcase.h"
30 #include "data/variable.h"
31 #include "language/command.h"
32 #include "language/data-io/file-handle.h"
33 #include "language/data-io/trim.h"
34 #include "language/lexer/lexer.h"
35 #include "language/lexer/variable-parser.h"
36 #include "language/stats/sort-criteria.h"
37 #include "libpspp/assertion.h"
38 #include "libpspp/i18n.h"
39 #include "libpspp/message.h"
40 #include "libpspp/string-array.h"
41 #include "libpspp/taint.h"
42 #include "math/sort.h"
44 #include "gl/xalloc.h"
47 #define _(msgid) gettext (msgid)
/* Which of the three file-combining commands is being executed. */
enum comb_command_type
  {
    COMB_ADD,                   /* ADD FILES. */
    COMB_MATCH,                 /* MATCH FILES. */
    COMB_UPDATE                 /* UPDATE. */
  };

/* How an input file was named on the command. */
enum comb_file_type
  {
    COMB_FILE,                  /* Specified on FILE= subcommand. */
    COMB_TABLE                  /* Specified on TABLE= subcommand. */
  };
63 /* One FILE or TABLE subcommand. */
67 enum comb_file_type type; /* COMB_FILE or COMB_TABLE. */
70 struct subcase by_vars; /* BY variables in this input file. */
71 struct subcase src, dst; /* Data to copy to output; where to put it. */
74 struct file_handle *handle; /* Input file handle. */
75 struct dictionary *dict; /* Input file dictionary. */
76 struct casereader *reader; /* Input data source. */
77 struct ccase *data; /* The current input case. */
78 bool is_minimal; /* Does 'data' have minimum BY values across
80 bool is_sorted; /* Is file presorted on the BY variables? */
84 struct variable *in_var;
89 struct comb_file *files; /* All the files being merged. */
90 size_t n_files; /* Number of files. */
92 struct dictionary *dict; /* Dictionary of output file. */
93 struct subcase by_vars; /* BY variables in the output. */
94 struct casewriter *output; /* Destination for output. */
96 struct case_matcher *matcher;
99 Only if "first" or "last" is nonnull are the remaining
101 struct variable *first; /* Variable specified on FIRST (if any). */
102 struct variable *last; /* Variable specified on LAST (if any). */
103 struct ccase *buffered_case; /* Case ready for output except that we don't
104 know the value for the LAST var yet. */
105 union value *prev_BY; /* Values of BY vars in buffered_case. */
108 static int combine_files (enum comb_command_type, struct lexer *,
110 static void free_comb_proc (struct comb_proc *);
112 static void close_all_comb_files (struct comb_proc *);
113 static bool merge_dictionary (struct dictionary *const, struct comb_file *);
115 static void execute_update (struct comb_proc *);
116 static void execute_match_files (struct comb_proc *);
117 static void execute_add_files (struct comb_proc *);
119 static bool create_flag_var (const char *subcommand_name, const char *var_name,
120 struct dictionary *, struct variable **);
121 static void output_case (struct comb_proc *, struct ccase *, union value *by);
122 static void output_buffered_case (struct comb_proc *);
125 cmd_add_files (struct lexer *lexer, struct dataset *ds)
127 return combine_files (COMB_ADD, lexer, ds);
131 cmd_match_files (struct lexer *lexer, struct dataset *ds)
133 return combine_files (COMB_MATCH, lexer, ds);
137 cmd_update (struct lexer *lexer, struct dataset *ds)
139 return combine_files (COMB_UPDATE, lexer, ds);
/* Common implementation of ADD FILES, MATCH FILES, and UPDATE; COMMAND
   says which one is being run.

   Parses the FILE (and, for MATCH FILES, TABLE) subcommands from LEXER,
   each optionally followed by RENAME, IN, and SORT, and then the
   command-wide BY, FIRST, LAST, MAP, DROP, and KEEP subcommands (FIRST
   and LAST are not accepted for UPDATE).  It then builds the output
   dictionary, opens every input, runs the command-specific merge, and
   installs the merged dictionary and cases as DS's dictionary and
   source.

   Returns CMD_SUCCESS or CMD_CASCADING_FAILURE according to the result
   of taint_destroy() on the success path, and CMD_CASCADING_FAILURE on
   the error path. */
143 combine_files (enum comb_command_type command,
144 struct lexer *lexer, struct dataset *ds)
146 struct comb_proc proc;
149 bool saw_sort = false;
150 struct casereader *active_file = NULL;
152 char *first_name = NULL;
153 char *last_name = NULL;
155 struct taint *taint = NULL;
158 size_t allocated_files = 0;
164 proc.dict = dict_create (get_default_encoding ());
167 subcase_init_empty (&proc.by_vars);
170 proc.buffered_case = NULL;
173 dict_set_case_limit (proc.dict, dict_get_case_limit (dataset_dict (ds)));
/* Parse the FILE and TABLE subcommands, one input file at a time. */
175 lex_match (lexer, T_SLASH);
178 struct comb_file *file;
179 enum comb_file_type type;
181 if (lex_match_id (lexer, "FILE"))
183 else if (command == COMB_MATCH && lex_match_id (lexer, "TABLE"))
190 lex_match (lexer, T_EQUALS);
/* Grow the file array if it is full, then claim and initialize the
   next slot. */
192 if (proc.n_files >= allocated_files)
193 proc.files = x2nrealloc (proc.files, &allocated_files,
195 file = &proc.files[proc.n_files++];
197 subcase_init_empty (&file->by_vars);
198 subcase_init_empty (&file->src);
199 subcase_init_empty (&file->dst);
204 file->is_sorted = true;
205 file->in_name = NULL;
/* An asterisk names the active dataset, whose dictionary is cloned;
   anything else is parsed as a file handle and opened for reading. */
208 if (lex_match (lexer, T_ASTERISK))
210 if (!dataset_has_source (ds))
212 msg (SE, _("Cannot specify the active dataset since none "
213 "has been defined."));
217 if (proc_make_temporary_transformations_permanent (ds))
218 msg (SE, _("This command may not be used after TEMPORARY when "
219 "the active dataset is an input source. "
220 "Temporary transformations will be made permanent."));
222 file->dict = dict_clone (dataset_dict (ds));
226 file->handle = fh_parse (lexer, FH_REF_FILE, dataset_session (ds));
227 if (file->handle == NULL)
230 file->reader = any_reader_open (file->handle, NULL, &file->dict);
231 if (file->reader == NULL)
/* Subcommands that apply to this input file only: RENAME, IN, SORT. */
235 while (lex_match (lexer, T_SLASH))
236 if (lex_match_id (lexer, "RENAME"))
238 if (!parse_dict_rename (lexer, file->dict))
241 else if (lex_match_id (lexer, "IN"))
243 lex_match (lexer, T_EQUALS);
244 if (lex_token (lexer) != T_ID)
246 lex_error (lexer, NULL);
252 msg (SE, _("Multiple IN subcommands for a single FILE or "
256 file->in_name = xstrdup (lex_tokcstr (lexer));
259 else if (lex_match_id (lexer, "SORT"))
261 file->is_sorted = false;
/* NOTE(review): merge_dictionary() returns false on a type/width
   conflict, but its result appears to be ignored here -- confirm that
   the command is really meant to continue after that error. */
265 merge_dictionary (proc.dict, file);
/* Parse the subcommands that apply to the command as a whole. */
268 while (lex_token (lexer) != T_ENDCMD)
270 if (lex_match (lexer, T_BY))
272 const struct variable **by_vars;
278 lex_sbc_only_once ("BY");
283 lex_match (lexer, T_EQUALS);
284 if (!parse_sort_criteria (lexer, proc.dict, &proc.by_vars,
/* Look up each BY variable by name in every input file's dictionary
   and record it, with its sort direction, in that file's by_vars. */
289 for (i = 0; i < proc.n_files; i++)
291 struct comb_file *file = &proc.files[i];
294 for (j = 0; j < subcase_get_n_fields (&proc.by_vars); j++)
296 const char *name = var_get_name (by_vars[j]);
297 struct variable *var = dict_lookup_var (file->dict, name);
299 subcase_add_var (&file->by_vars, var,
300 subcase_get_direction (&proc.by_vars, j));
303 if (file->handle != NULL)
304 msg (SE, _("File %s lacks BY variable %s."),
305 fh_get_name (file->handle), name);
307 msg (SE, _("Active dataset lacks BY variable %s."),
312 assert (!ok || subcase_conformable (&file->by_vars,
313 &proc.files[0].by_vars));
320 else if (command != COMB_UPDATE && lex_match_id (lexer, "FIRST"))
322 if (first_name != NULL)
324 lex_sbc_only_once ("FIRST");
328 lex_match (lexer, T_EQUALS);
329 if (!lex_force_id (lexer))
331 first_name = xstrdup (lex_tokcstr (lexer));
334 else if (command != COMB_UPDATE && lex_match_id (lexer, "LAST"))
336 if (last_name != NULL)
338 lex_sbc_only_once ("LAST");
342 lex_match (lexer, T_EQUALS);
343 if (!lex_force_id (lexer))
345 last_name = xstrdup (lex_tokcstr (lexer));
348 else if (lex_match_id (lexer, "MAP"))
352 else if (lex_match_id (lexer, "DROP"))
354 if (!parse_dict_drop (lexer, proc.dict))
357 else if (lex_match_id (lexer, "KEEP"))
359 if (!parse_dict_keep (lexer, proc.dict))
364 lex_error (lexer, NULL);
368 if (!lex_match (lexer, T_SLASH) && lex_token (lexer) != T_ENDCMD)
370 lex_end_of_command (lexer);
/* Diagnose a missing BY: it is reported as required for UPDATE and
   whenever TABLE or SORT was specified. */
377 if (command == COMB_UPDATE)
379 lex_sbc_missing ("BY");
384 msg (SE, _("BY is required when %s is specified."), "TABLE");
389 msg (SE, _("BY is required when %s is specified."), "SORT");
394 /* Add IN, FIRST, and LAST variables to master dictionary. */
395 for (i = 0; i < proc.n_files; i++)
397 struct comb_file *file = &proc.files[i];
398 if (!create_flag_var ("IN", file->in_name, proc.dict, &file->in_var))
401 if (!create_flag_var ("FIRST", first_name, proc.dict, &proc.first)
402 || !create_flag_var ("LAST", last_name, proc.dict, &proc.last))
405 dict_delete_scratch_vars (proc.dict);
406 dict_compact_values (proc.dict);
408 /* Set up mapping from each file's variables to master
410 for (i = 0; i < proc.n_files; i++)
412 struct comb_file *file = &proc.files[i];
413 size_t src_var_cnt = dict_get_var_cnt (file->dict);
416 for (j = 0; j < src_var_cnt; j++)
418 struct variable *src_var = dict_get_var (file->dict, j);
419 struct variable *dst_var = dict_lookup_var (proc.dict,
420 var_get_name (src_var));
423 subcase_add_var (&file->src, src_var, SC_ASCEND);
424 subcase_add_var (&file->dst, dst_var, SC_ASCEND);
429 proc.output = autopaging_writer_create (dict_get_proto (proc.dict));
430 taint = taint_clone (casewriter_get_taint (proc.output));
432 /* Set up case matcher. */
433 proc.matcher = case_matcher_create ();
/* Give every input a reader (the active dataset is opened once and
   cloned for any further references), sort it if SORT was given, and
   read its first case.  Only FILE inputs are registered with the case
   matcher. */
434 for (i = 0; i < proc.n_files; i++)
436 struct comb_file *file = &proc.files[i];
437 if (file->reader == NULL)
439 if (active_file == NULL)
441 proc_discard_output (ds);
442 file->reader = active_file = proc_open (ds);
445 file->reader = casereader_clone (active_file);
447 if (!file->is_sorted)
448 file->reader = sort_execute (file->reader, &file->by_vars);
449 taint_propagate (casereader_get_taint (file->reader), taint);
450 file->data = casereader_read (file->reader);
451 if (file->type == COMB_FILE)
452 case_matcher_add_input (proc.matcher, &file->by_vars,
453 &file->data, &file->is_minimal);
/* Run the merge that belongs to this command. */
456 if (command == COMB_ADD)
457 execute_add_files (&proc);
458 else if (command == COMB_MATCH)
459 execute_match_files (&proc);
460 else if (command == COMB_UPDATE)
461 execute_update (&proc);
465 case_matcher_destroy (proc.matcher);
467 close_all_comb_files (&proc);
468 if (active_file != NULL)
/* Success: the merged dictionary and the written cases become the
   dataset's new dictionary and source. */
471 dataset_set_dict (ds, proc.dict);
472 dataset_set_source (ds, casewriter_make_reader (proc.output));
476 free_comb_proc (&proc);
481 return taint_destroy (taint) ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
/* Error exit: release what has been acquired and report failure. */
484 if (active_file != NULL)
486 free_comb_proc (&proc);
487 taint_destroy (taint);
490 return CMD_CASCADING_FAILURE;
493 /* Merge the dictionary for file F into master dictionary M. */
495 merge_dictionary (struct dictionary *const m, struct comb_file *f)
497 struct dictionary *d = f->dict;
498 const struct string_array *d_docs, *m_docs;
501 if (dict_get_label (m) == NULL)
502 dict_set_label (m, dict_get_label (d));
504 d_docs = dict_get_documents (d);
505 m_docs = dict_get_documents (m);
508 /* FIXME: If the input files have different encodings, then
509 the result is undefined.
510 The correct thing to do would be to convert to an encoding
511 which can cope with all the input files (eg UTF-8).
513 if ( 0 != strcmp (dict_get_encoding (f->dict), dict_get_encoding (m)))
514 msg (MW, _("Combining files with incompatible encodings. String data may "
515 "not be represented correctly."));
520 dict_set_documents (m, d_docs);
523 struct string_array new_docs;
526 new_docs.n = m_docs->n + d_docs->n;
527 new_docs.strings = xmalloc (new_docs.n * sizeof *new_docs.strings);
528 for (i = 0; i < m_docs->n; i++)
529 new_docs.strings[i] = m_docs->strings[i];
530 for (i = 0; i < d_docs->n; i++)
531 new_docs.strings[m_docs->n + i] = d_docs->strings[i];
533 dict_set_documents (m, &new_docs);
535 free (new_docs.strings);
539 for (i = 0; i < dict_get_var_cnt (d); i++)
541 struct variable *dv = dict_get_var (d, i);
542 struct variable *mv = dict_lookup_var (m, var_get_name (dv));
544 if (dict_class_from_id (var_get_name (dv)) == DC_SCRATCH)
549 if (var_get_width (mv) != var_get_width (dv))
551 const char *var_name = var_get_name (dv);
552 const char *file_name = fh_get_name (f->handle);
553 struct string s = DS_EMPTY_INITIALIZER;
555 _("Variable %s in file %s has different "
556 "type or width from the same variable in "
558 var_name, file_name);
559 ds_put_cstr (&s, " ");
560 if (var_is_numeric (dv))
561 ds_put_format (&s, _("In file %s, %s is numeric."),
562 file_name, var_name);
564 ds_put_format (&s, _("In file %s, %s is a string variable "
566 file_name, var_name, var_get_width (dv));
567 ds_put_cstr (&s, " ");
568 if (var_is_numeric (mv))
569 ds_put_format (&s, _("In an earlier file, %s was numeric."),
572 ds_put_format (&s, _("In an earlier file, %s was a string "
573 "variable with width %d."),
574 var_name, var_get_width (mv));
575 msg (SE, "%s", ds_cstr (&s));
580 if (var_has_value_labels (dv) && !var_has_value_labels (mv))
581 var_set_value_labels (mv, var_get_value_labels (dv));
582 if (var_has_missing_values (dv) && !var_has_missing_values (mv))
583 var_set_missing_values (mv, var_get_missing_values (dv));
584 if (var_get_label (dv) && !var_get_label (mv))
585 var_set_label (mv, var_get_label (dv), false);
588 mv = dict_clone_var_assert (m, dv);
594 /* If VAR_NAME is non-NULL, attempts to create a
595 variable named VAR_NAME, with format F1.0, in DICT, and stores
596 a pointer to the variable in *VAR. Returns true if
597 successful, false if the variable name is a duplicate (in
598 which case a message saying that the variable specified on the
599 given SUBCOMMAND is a duplicate is emitted).
601 Does nothing and returns true if VAR_NAME is null. */
603 create_flag_var (const char *subcommand, const char *var_name,
604 struct dictionary *dict, struct variable **var)
606 if (var_name != NULL)
608 struct fmt_spec format = fmt_for_output (FMT_F, 1, 0);
609 *var = dict_create_var (dict, var_name, 0);
612 msg (SE, _("Variable name %s specified on %s subcommand "
613 "duplicates an existing variable name."),
614 subcommand, var_name);
617 var_set_both_formats (*var, &format);
624 /* Closes all the files in PROC and frees their associated data. */
626 close_all_comb_files (struct comb_proc *proc)
630 for (i = 0; i < proc->n_files; i++)
632 struct comb_file *file = &proc->files[i];
633 subcase_destroy (&file->by_vars);
634 subcase_destroy (&file->src);
635 subcase_destroy (&file->dst);
636 fh_unref (file->handle);
637 dict_destroy (file->dict);
638 casereader_destroy (file->reader);
639 case_unref (file->data);
640 free (file->in_name);
647 /* Frees all the data for the procedure. */
649 free_comb_proc (struct comb_proc *proc)
651 close_all_comb_files (proc);
652 dict_destroy (proc->dict);
653 casewriter_destroy (proc->output);
654 case_matcher_destroy (proc->matcher);
657 caseproto_destroy_values (subcase_get_proto (&proc->by_vars),
659 free (proc->prev_BY);
661 subcase_destroy (&proc->by_vars);
662 case_unref (proc->buffered_case);
665 static bool scan_table (struct comb_file *, union value by[]);
666 static struct ccase *create_output_case (const struct comb_proc *);
667 static void apply_case (const struct comb_file *, struct ccase *);
668 static void advance_file (struct comb_file *, union value by[]);
669 static void output_case (struct comb_proc *, struct ccase *, union value by[]);
670 static void output_buffered_case (struct comb_proc *);
672 /* Executes the ADD FILES command. */
674 execute_add_files (struct comb_proc *proc)
678 while (case_matcher_match (proc->matcher, &by))
682 for (i = 0; i < proc->n_files; i++)
684 struct comb_file *file = &proc->files[i];
685 while (file->is_minimal)
687 struct ccase *output = create_output_case (proc);
688 apply_case (file, output);
689 advance_file (file, by);
690 output_case (proc, output, by);
694 output_buffered_case (proc);
697 /* Executes the MATCH FILES command. */
699 execute_match_files (struct comb_proc *proc)
703 while (case_matcher_match (proc->matcher, &by))
705 struct ccase *output;
708 output = create_output_case (proc);
709 for (i = proc->n_files; i-- > 0; )
711 struct comb_file *file = &proc->files[i];
712 if (file->type == COMB_FILE)
714 if (file->is_minimal)
716 apply_case (file, output);
717 advance_file (file, NULL);
722 if (scan_table (file, by))
723 apply_case (file, output);
726 output_case (proc, output, by);
728 output_buffered_case (proc);
731 /* Executes the UPDATE command. */
733 execute_update (struct comb_proc *proc)
736 size_t n_duplicates = 0;
738 while (case_matcher_match (proc->matcher, &by))
740 struct comb_file *first, *file;
741 struct ccase *output;
743 /* Find first nonnull case in array and make an output case
745 output = create_output_case (proc);
746 for (first = &proc->files[0]; ; first++)
747 if (first->is_minimal)
749 apply_case (first, output);
750 advance_file (first, by);
752 /* Read additional cases and update the output case from
753 them. (Don't update the output case from any duplicate
754 cases in the master file.) */
755 for (file = first + (first == proc->files);
756 file < &proc->files[proc->n_files]; file++)
758 while (file->is_minimal)
760 apply_case (file, output);
761 advance_file (file, by);
764 casewriter_write (proc->output, output);
766 /* Write duplicate cases in the master file directly to the
768 if (first == proc->files && first->is_minimal)
771 while (first->is_minimal)
773 output = create_output_case (proc);
774 apply_case (first, output);
775 advance_file (first, by);
776 casewriter_write (proc->output, output);
782 msg (SW, _("Encountered %zu sets of duplicate cases in the master file."),
786 /* Reads FILE, which must be of type COMB_TABLE, until it
787 encounters a case with BY or greater for its BY variables.
788 Returns true if a case with exactly BY for its BY variables
789 was found, otherwise false. */
791 scan_table (struct comb_file *file, union value by[])
793 while (file->data != NULL)
795 int cmp = subcase_compare_3way_xc (&file->by_vars, by, file->data);
798 case_unref (file->data);
799 file->data = casereader_read (file->reader);
807 /* Creates and returns an output case for PROC, initializing each
808 of its values to system-missing or blanks, except that the
809 values of IN variables are set to 0. */
810 static struct ccase *
811 create_output_case (const struct comb_proc *proc)
813 size_t n_vars = dict_get_var_cnt (proc->dict);
814 struct ccase *output;
817 output = case_create (dict_get_proto (proc->dict));
818 for (i = 0; i < n_vars; i++)
820 struct variable *v = dict_get_var (proc->dict, i);
821 value_set_missing (case_data_rw (output, v), var_get_width (v));
823 for (i = 0; i < proc->n_files; i++)
825 struct comb_file *file = &proc->files[i];
826 if (file->in_var != NULL)
827 case_data_rw (output, file->in_var)->f = false;
832 /* Copies the data from FILE's case into output case OUTPUT.
833 If FILE has an IN variable, then it is set to 1 in OUTPUT. */
835 apply_case (const struct comb_file *file, struct ccase *output)
837 subcase_copy (&file->src, file->data, &file->dst, output);
838 if (file->in_var != NULL)
839 case_data_rw (output, file->in_var)->f = true;
842 /* Advances FILE to its next case. If BY is nonnull, then FILE's is_minimal
843 member is updated based on whether the new case's BY values still match
846 advance_file (struct comb_file *file, union value by[])
848 case_unref (file->data);
849 file->data = casereader_read (file->reader);
851 file->is_minimal = (file->data != NULL
852 && subcase_equal_cx (&file->by_vars, file->data, by));
855 /* Writes OUTPUT, whose BY values has been extracted into BY, to
856 PROC's output file, first initializing any FIRST or LAST
857 variables in OUTPUT to the correct values. */
859 output_case (struct comb_proc *proc, struct ccase *output, union value by[])
861 if (proc->first == NULL && proc->last == NULL)
862 casewriter_write (proc->output, output);
865 /* It's harder with LAST, because we can't know whether
866 this case is the last in a group until we've prepared
867 the *next* case also. Thus, we buffer the previous
868 output case until the next one is ready. */
870 if (proc->prev_BY != NULL)
872 new_BY = !subcase_equal_xx (&proc->by_vars, proc->prev_BY, by);
873 if (proc->last != NULL)
874 case_data_rw (proc->buffered_case, proc->last)->f = new_BY;
875 casewriter_write (proc->output, proc->buffered_case);
880 proc->buffered_case = output;
881 if (proc->first != NULL)
882 case_data_rw (proc->buffered_case, proc->first)->f = new_BY;
886 size_t n_values = subcase_get_n_fields (&proc->by_vars);
887 const struct caseproto *proto = subcase_get_proto (&proc->by_vars);
888 if (proc->prev_BY == NULL)
890 proc->prev_BY = xmalloc (n_values * sizeof *proc->prev_BY);
891 caseproto_init_values (proto, proc->prev_BY);
893 caseproto_copy (subcase_get_proto (&proc->by_vars), 0, n_values,
899 /* Writes a trailing buffered case to the output, if FIRST or
902 output_buffered_case (struct comb_proc *proc)
904 if (proc->prev_BY != NULL)
906 if (proc->last != NULL)
907 case_data_rw (proc->buffered_case, proc->last)->f = 1.0;
908 casewriter_write (proc->output, proc->buffered_case);
909 proc->buffered_case = NULL;