1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2007, 2008 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21 #include <data/any-reader.h>
22 #include <data/case-matcher.h>
23 #include <data/case.h>
24 #include <data/casereader.h>
25 #include <data/casewriter.h>
26 #include <data/dictionary.h>
27 #include <data/format.h>
28 #include <data/procedure.h>
29 #include <data/subcase.h>
30 #include <data/variable.h>
31 #include <language/command.h>
32 #include <language/data-io/file-handle.h>
33 #include <language/data-io/trim.h>
34 #include <language/lexer/lexer.h>
35 #include <language/lexer/variable-parser.h>
36 #include <language/stats/sort-criteria.h>
37 #include <libpspp/assertion.h>
38 #include <libpspp/message.h>
39 #include <libpspp/taint.h>
40 #include <math/sort.h>
/* Shorthand used around user-visible message strings: expands to a
   call to gettext() on MSGID. */
45 #define _(msgid) gettext (msgid)
/* Selects which combining command is being run.
   NOTE(review): the enumerator list is not visible here; the cmd_*
   entry points pass COMB_ADD, COMB_MATCH and COMB_UPDATE. */
47 enum comb_command_type
/* NOTE(review): the two enumerators below are the values compared
   against a comb_file's 'type' member, so they presumably belong to
   enum comb_file_type, whose header line is not visible -- confirm. */
57 COMB_FILE, /* Specified on FILE= subcommand. */
58 COMB_TABLE /* Specified on TABLE= subcommand. */
61 /* One FILE or TABLE subcommand. */
65 enum comb_file_type type; /* COMB_FILE or COMB_TABLE. */
68 struct subcase by_vars; /* BY variables in this input file. */
69 struct subcase src, dst; /* Data to copy to output; where to put it. */
72 struct file_handle *handle; /* Input file handle. */
73 struct dictionary *dict; /* Input file dictionary. */
74 struct casereader *reader; /* Input data source. */
75 struct ccase data; /* The current input case. */
76 bool is_minimal; /* Does 'data' have minimum BY values across
78 bool is_sorted; /* Is file presorted on the BY variables? */
81 char in_name[VAR_NAME_LEN + 1];
82 struct variable *in_var;
87 struct comb_file *files; /* All the files being merged. */
88 size_t n_files; /* Number of files. */
90 struct dictionary *dict; /* Dictionary of output file. */
91 struct subcase by_vars; /* BY variables in the output. */
92 struct casewriter *output; /* Destination for output. */
94 struct case_matcher *matcher;
97 Only if "first" or "last" is nonnull are the remaining
99 struct variable *first; /* Variable specified on FIRST (if any). */
100 struct variable *last; /* Variable specified on LAST (if any). */
101 struct ccase buffered_case; /* Case ready for output except that we don't
102 know the value for the LAST variable yet. */
103 union value *prev_BY; /* Values of BY vars in buffered_case. */
/* Forward declarations for the parsing and setup half of the file. */
static int combine_files (enum comb_command_type command,
                          struct lexer *lexer, struct dataset *ds);
static void free_comb_proc (struct comb_proc *proc);

static void close_all_comb_files (struct comb_proc *proc);
static bool merge_dictionary (struct dictionary *const m,
                              struct comb_file *f);

static void execute_update (struct comb_proc *proc);
static void execute_match_files (struct comb_proc *proc);
static void execute_add_files (struct comb_proc *proc);

static bool create_flag_var (const char *subcommand_name,
                             const char *var_name,
                             struct dictionary *dict,
                             struct variable **var);
static void output_case (struct comb_proc *proc, struct ccase *output,
                         union value *by);
static void output_buffered_case (struct comb_proc *proc);
123 cmd_add_files (struct lexer *lexer, struct dataset *ds)
125 return combine_files (COMB_ADD, lexer, ds);
129 cmd_match_files (struct lexer *lexer, struct dataset *ds)
131 return combine_files (COMB_MATCH, lexer, ds);
135 cmd_update (struct lexer *lexer, struct dataset *ds)
137 return combine_files (COMB_UPDATE, lexer, ds);
/* Shared implementation behind cmd_add_files(), cmd_match_files() and
   cmd_update(); COMMAND selects which one is running.  Parses the
   command's subcommands from LEXER, merges each input's dictionary
   into an output dictionary, runs the execute_* routine that matches
   COMMAND, and installs the combined output as DS's active file.
   Returns CMD_SUCCESS or CMD_CASCADING_FAILURE. */
141 combine_files (enum comb_command_type command,
142 struct lexer *lexer, struct dataset *ds)
144 struct comb_proc proc;
147 bool saw_sort = false;
148 struct casereader *active_file = NULL;
150 char first_name[VAR_NAME_LEN + 1] = "";
151 char last_name[VAR_NAME_LEN + 1] = "";
153 struct taint *taint = NULL;
156 size_t allocated_files = 0;
162 proc.dict = dict_create ();
165 subcase_init_empty (&proc.by_vars);
168 case_nullify (&proc.buffered_case);
171 dict_set_case_limit (proc.dict, dict_get_case_limit (dataset_dict (ds)));
173 lex_match (lexer, '/');
176 struct comb_file *file;
177 enum comb_file_type type;
179 if (lex_match_id (lexer, "FILE"))
/* TABLE is accepted only by MATCH FILES. */
181 else if (command == COMB_MATCH && lex_match_id (lexer, "TABLE"))
188 lex_match (lexer, '=');
/* Grow the array of inputs as needed and claim the next slot. */
190 if (proc.n_files >= allocated_files)
191 proc.files = x2nrealloc (proc.files, &allocated_files,
193 file = &proc.files[proc.n_files++];
195 subcase_init_empty (&file->by_vars);
196 subcase_init_empty (&file->src);
197 subcase_init_empty (&file->dst);
201 case_nullify (&file->data);
202 file->is_sorted = true;
203 file->in_name[0] = '\0';
/* "*" names the active file; anything else is parsed as a file handle
   and opened with any_reader_open(). */
206 if (lex_match (lexer, '*'))
208 if (!proc_has_active_file (ds))
210 msg (SE, _("Cannot specify the active file since no active "
211 "file has been defined."));
215 if (proc_make_temporary_transformations_permanent (ds))
216 msg (SE, _("This command may not be used after TEMPORARY when "
217 "the active file is an input source. "
218 "Temporary transformations will be made permanent."));
220 file->dict = dict_clone (dataset_dict (ds));
224 file->handle = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH);
225 if (file->handle == NULL)
228 file->reader = any_reader_open (file->handle, &file->dict);
229 if (file->reader == NULL)
/* Per-input subcommands: RENAME, IN and SORT. */
233 while (lex_match (lexer, '/'))
234 if (lex_match_id (lexer, "RENAME"))
236 if (!parse_dict_rename (lexer, file->dict))
239 else if (lex_match_id (lexer, "IN"))
241 lex_match (lexer, '=');
242 if (lex_token (lexer) != T_ID)
244 lex_error (lexer, NULL);
248 if (file->in_name[0])
250 msg (SE, _("Multiple IN subcommands for a single FILE or "
254 strcpy (file->in_name, lex_tokid (lexer));
257 else if (lex_match_id (lexer, "SORT"))
259 file->is_sorted = false;
/* NOTE(review): merge_dictionary() returns bool, but its result looks
   unused here, so a reported type/width conflict would not stop the
   command -- confirm whether this call should be checked. */
263 merge_dictionary (proc.dict, file);
/* Subcommands that apply to the command as a whole, up to the
   terminating '.'. */
266 while (lex_token (lexer) != '.')
268 if (lex_match (lexer, T_BY))
270 const struct variable **by_vars;
276 lex_sbc_only_once ("BY");
281 lex_match (lexer, '=');
282 if (!parse_sort_criteria (lexer, proc.dict, &proc.by_vars,
/* Look up each BY variable by name in every input's own dictionary. */
287 for (i = 0; i < proc.n_files; i++)
289 struct comb_file *file = &proc.files[i];
292 for (j = 0; j < subcase_get_n_values (&proc.by_vars); j++)
294 const char *name = var_get_name (by_vars[j]);
295 struct variable *var = dict_lookup_var (file->dict, name);
297 subcase_add_var (&file->by_vars, var,
298 subcase_get_direction (&proc.by_vars, j));
301 if (file->handle != NULL)
302 msg (SE, _("File %s lacks BY variable %s."),
303 fh_get_name (file->handle), name);
305 msg (SE, _("Active file lacks BY variable %s."), name);
309 assert (!ok || subcase_conformable (&file->by_vars,
310 &proc.files[0].by_vars));
/* FIRST and LAST are not accepted by UPDATE. */
317 else if (command != COMB_UPDATE && lex_match_id (lexer, "FIRST"))
319 if (first_name[0] != '\0')
321 lex_sbc_only_once ("FIRST");
325 lex_match (lexer, '=');
326 if (!lex_force_id (lexer))
328 strcpy (first_name, lex_tokid (lexer));
331 else if (command != COMB_UPDATE && lex_match_id (lexer, "LAST"))
333 if (last_name[0] != '\0')
335 lex_sbc_only_once ("LAST");
339 lex_match (lexer, '=');
340 if (!lex_force_id (lexer))
342 strcpy (last_name, lex_tokid (lexer));
345 else if (lex_match_id (lexer, "MAP"))
349 else if (lex_match_id (lexer, "DROP"))
351 if (!parse_dict_drop (lexer, proc.dict))
354 else if (lex_match_id (lexer, "KEEP"))
356 if (!parse_dict_keep (lexer, proc.dict))
361 lex_error (lexer, NULL);
365 if (!lex_match (lexer, '/') && lex_token (lexer) != '.')
367 lex_end_of_command (lexer);
/* Diagnostics for a missing BY subcommand.  NOTE(review): the tests
   guarding the second and third messages are not visible here. */
374 if (command == COMB_UPDATE)
376 msg (SE, _("The BY subcommand is required."));
381 msg (SE, _("BY is required when TABLE is specified."));
386 msg (SE, _("BY is required when SORT is specified."));
391 /* Add IN, FIRST, and LAST variables to master dictionary. */
392 for (i = 0; i < proc.n_files; i++)
394 struct comb_file *file = &proc.files[i];
395 if (!create_flag_var ("IN", file->in_name, proc.dict, &file->in_var))
398 if (!create_flag_var ("FIRST", first_name, proc.dict, &proc.first)
399 || !create_flag_var ("LAST", last_name, proc.dict, &proc.last))
402 dict_delete_scratch_vars (proc.dict);
403 dict_compact_values (proc.dict);
405 /* Set up mapping from each file's variables to master
407 for (i = 0; i < proc.n_files; i++)
409 struct comb_file *file = &proc.files[i];
410 size_t src_var_cnt = dict_get_var_cnt (file->dict);
413 for (j = 0; j < src_var_cnt; j++)
415 struct variable *src_var = dict_get_var (file->dict, j);
416 struct variable *dst_var = dict_lookup_var (proc.dict,
417 var_get_name (src_var));
420 subcase_add_var (&file->src, src_var, SC_ASCEND);
421 subcase_add_var (&file->dst, dst_var, SC_ASCEND);
426 proc.output = autopaging_writer_create (dict_get_next_value_idx (proc.dict));
427 taint = taint_clone (casewriter_get_taint (proc.output));
429 /* Set up case matcher. */
430 proc.matcher = case_matcher_create ();
431 for (i = 0; i < proc.n_files; i++)
433 struct comb_file *file = &proc.files[i];
/* An input with no reader yet takes its cases from the active file:
   the first such input opens it with proc_open(), later ones clone
   that reader. */
434 if (file->reader == NULL)
436 if (active_file == NULL)
438 proc_discard_output (ds);
439 file->reader = active_file = proc_open (ds);
442 file->reader = casereader_clone (active_file);
444 if (!file->is_sorted)
445 file->reader = sort_execute (file->reader, &file->by_vars);
446 taint_propagate (casereader_get_taint (file->reader), taint);
447 casereader_read (file->reader, &file->data);
/* Only FILE inputs are registered with the matcher. */
448 if (file->type == COMB_FILE)
449 case_matcher_add_input (proc.matcher, &file->by_vars,
450 &file->data, &file->is_minimal);
/* Run the routine that implements COMMAND. */
453 if (command == COMB_ADD)
454 execute_add_files (&proc);
455 else if (command == COMB_MATCH)
456 execute_match_files (&proc);
457 else if (command == COMB_UPDATE)
458 execute_update (&proc);
462 case_matcher_destroy (proc.matcher);
464 close_all_comb_files (&proc);
465 if (active_file != NULL)
/* The written cases and the merged dictionary become the new active
   file. */
468 proc_set_active_file (ds, casewriter_make_reader (proc.output), proc.dict);
472 free_comb_proc (&proc);
/* The command succeeds only if taint_destroy() reports true. */
474 return taint_destroy (taint) ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
/* Failure exit: release everything and report failure.
   NOTE(review): the label that leads here is not visible. */
477 if (active_file != NULL)
479 free_comb_proc (&proc);
480 taint_destroy (taint);
481 return CMD_CASCADING_FAILURE;
484 /* Merge the dictionary for file F into master dictionary M. */
486 merge_dictionary (struct dictionary *const m, struct comb_file *f)
488 struct dictionary *d = f->dict;
489 const char *d_docs, *m_docs;
492 if (dict_get_label (m) == NULL)
493 dict_set_label (m, dict_get_label (d));
495 d_docs = dict_get_documents (d);
496 m_docs = dict_get_documents (m);
500 dict_set_documents (m, d_docs);
503 char *new_docs = xasprintf ("%s%s", m_docs, d_docs);
504 dict_set_documents (m, new_docs);
509 for (i = 0; i < dict_get_var_cnt (d); i++)
511 struct variable *dv = dict_get_var (d, i);
512 struct variable *mv = dict_lookup_var (m, var_get_name (dv));
514 if (dict_class_from_id (var_get_name (dv)) == DC_SCRATCH)
519 if (var_get_width (mv) != var_get_width (dv))
521 const char *var_name = var_get_name (dv);
522 const char *file_name = fh_get_name (f->handle);
523 struct string s = DS_EMPTY_INITIALIZER;
525 _("Variable %s in file %s has different "
526 "type or width from the same variable in "
528 var_name, file_name);
529 ds_put_cstr (&s, " ");
530 if (var_is_numeric (dv))
531 ds_put_format (&s, _("In file %s, %s is numeric."),
532 file_name, var_name);
534 ds_put_format (&s, _("In file %s, %s is a string variable "
536 file_name, var_name, var_get_width (dv));
537 ds_put_cstr (&s, " ");
538 if (var_is_numeric (mv))
539 ds_put_format (&s, _("In an earlier file, %s was numeric."),
542 ds_put_format (&s, _("In an earlier file, %s was a string "
543 "variable with width %d."),
544 var_name, var_get_width (mv));
545 msg (SE, ds_cstr (&s));
550 if (var_has_value_labels (dv) && !var_has_value_labels (mv))
551 var_set_value_labels (mv, var_get_value_labels (dv));
552 if (var_has_missing_values (dv) && !var_has_missing_values (mv))
553 var_set_missing_values (mv, var_get_missing_values (dv));
554 if (var_get_label (dv) && !var_get_label (mv))
555 var_set_label (mv, var_get_label (dv));
558 mv = dict_clone_var_assert (m, dv, var_get_name (dv));
564 /* If VAR_NAME is a non-empty string, attempts to create a
565 variable named VAR_NAME, with format F1.0, in DICT, and stores
566 a pointer to the variable in *VAR. Returns true if
567 successful, false if the variable name is a duplicate (in
568 which case a message saying that the variable specified on the
569 given SUBCOMMAND is a duplicate is emitted). Also returns
570 true, without doing anything, if VAR_NAME is null or empty. */
572 create_flag_var (const char *subcommand, const char *var_name,
573 struct dictionary *dict, struct variable **var)
575 if (var_name[0] != '\0')
577 struct fmt_spec format = fmt_for_output (FMT_F, 1, 0);
578 *var = dict_create_var (dict, var_name, 0);
581 msg (SE, _("Variable name %s specified on %s subcommand "
582 "duplicates an existing variable name."),
583 subcommand, var_name);
586 var_set_both_formats (*var, &format);
593 /* Closes all the files in PROC and frees their associated data. */
595 close_all_comb_files (struct comb_proc *proc)
599 for (i = 0; i < proc->n_files; i++)
601 struct comb_file *file = &proc->files[i];
602 subcase_destroy (&file->by_vars);
603 subcase_destroy (&file->src);
604 subcase_destroy (&file->dst);
605 fh_unref (file->handle);
606 dict_destroy (file->dict);
607 casereader_destroy (file->reader);
608 case_destroy (&file->data);
615 /* Frees all the data for the procedure. */
617 free_comb_proc (struct comb_proc *proc)
619 close_all_comb_files (proc);
620 dict_destroy (proc->dict);
621 casewriter_destroy (proc->output);
622 case_matcher_destroy (proc->matcher);
623 subcase_destroy (&proc->by_vars);
624 case_destroy (&proc->buffered_case);
625 free (proc->prev_BY);
628 static bool scan_table (struct comb_file *, union value by[]);
629 static void create_output_case (const struct comb_proc *, struct ccase *);
630 static void apply_case (const struct comb_file *, struct ccase *);
631 static void apply_file_case_and_advance (struct comb_file *, struct ccase *,
633 static void output_case (struct comb_proc *, struct ccase *, union value by[]);
634 static void output_buffered_case (struct comb_proc *);
636 /* Executes the ADD FILES command. */
638 execute_add_files (struct comb_proc *proc)
642 while (case_matcher_match (proc->matcher, &by))
647 for (i = 0; i < proc->n_files; i++)
649 struct comb_file *file = &proc->files[i];
650 while (file->is_minimal)
652 create_output_case (proc, &output);
653 apply_file_case_and_advance (file, &output, by);
654 output_case (proc, &output, by);
658 output_buffered_case (proc);
661 /* Executes the MATCH FILES command. */
663 execute_match_files (struct comb_proc *proc)
667 while (case_matcher_match (proc->matcher, &by))
672 create_output_case (proc, &output);
673 for (i = proc->n_files; i-- > 0; )
675 struct comb_file *file = &proc->files[i];
676 if (file->type == COMB_FILE)
678 if (file->is_minimal)
679 apply_file_case_and_advance (file, &output, NULL);
683 if (scan_table (file, by))
684 apply_case (file, &output);
687 output_case (proc, &output, by);
689 output_buffered_case (proc);
692 /* Executes the UPDATE command. */
694 execute_update (struct comb_proc *proc)
697 size_t n_duplicates = 0;
699 while (case_matcher_match (proc->matcher, &by))
701 struct comb_file *first, *file;
704 /* Find first nonnull case in array and make an output case
706 create_output_case (proc, &output);
707 for (first = &proc->files[0]; ; first++)
708 if (first->is_minimal)
710 apply_file_case_and_advance (first, &output, by);
712 /* Read additional cases and update the output case from
713 them. (Don't update the output case from any duplicate
714 cases in the master file.) */
715 for (file = first + (first == proc->files);
716 file < &proc->files[proc->n_files]; file++)
718 while (file->is_minimal)
719 apply_file_case_and_advance (file, &output, by);
721 casewriter_write (proc->output, &output);
723 /* Write duplicate cases in the master file directly to the
725 if (first == proc->files && first->is_minimal)
728 while (first->is_minimal)
730 create_output_case (proc, &output);
731 apply_file_case_and_advance (first, &output, by);
732 casewriter_write (proc->output, &output);
738 msg (SW, _("Encountered %zu sets of duplicate cases in the master file."),
742 /* Reads FILE, which must be of type COMB_TABLE, until it
743 encounters a case with BY or greater for its BY variables.
744 Returns true if a case with exactly BY for its BY variables
745 was found, otherwise false. */
747 scan_table (struct comb_file *file, union value by[])
749 while (!case_is_null (&file->data))
751 int cmp = subcase_compare_3way_xc (&file->by_vars, by, &file->data);
754 case_destroy (&file->data);
755 casereader_read (file->reader, &file->data);
763 /* Creates OUTPUT as an output case for PROC, by initializing each of
764 its values to system-missing or blanks, except that the values
765 of IN variables are set to 0. */
767 create_output_case (const struct comb_proc *proc, struct ccase *output)
769 size_t n_vars = dict_get_var_cnt (proc->dict);
772 case_create (output, dict_get_next_value_idx (proc->dict));
773 for (i = 0; i < n_vars; i++)
775 struct variable *v = dict_get_var (proc->dict, i);
776 value_set_missing (case_data_rw (output, v), var_get_width (v));
778 for (i = 0; i < proc->n_files; i++)
780 struct comb_file *file = &proc->files[i];
781 if (file->in_var != NULL)
782 case_data_rw (output, file->in_var)->f = false;
786 /* Copies the data from FILE's case into output case OUTPUT.
787 If FILE has an IN variable, then it is set to 1 in OUTPUT. */
789 apply_case (const struct comb_file *file, struct ccase *output)
791 subcase_copy (&file->src, &file->data, &file->dst, output);
792 if (file->in_var != NULL)
793 case_data_rw (output, file->in_var)->f = true;
796 /* Like apply_case() above, but also advances FILE to its next
797 case. Also, if BY is nonnull, then FILE's is_minimal member
798 is updated based on whether the new case's BY values still
799 match those in BY. */
801 apply_file_case_and_advance (struct comb_file *file, struct ccase *output,
804 apply_case (file, output);
805 case_destroy (&file->data);
806 casereader_read (file->reader, &file->data);
808 file->is_minimal = (!case_is_null (&file->data)
809 && subcase_equal_cx (&file->by_vars, &file->data, by));
812 /* Writes OUTPUT, whose BY values has been extracted into BY, to
813 PROC's output file, first initializing any FIRST or LAST
814 variables in OUTPUT to the correct values. */
816 output_case (struct comb_proc *proc, struct ccase *output, union value by[])
818 if (proc->first == NULL && proc->last == NULL)
819 casewriter_write (proc->output, output);
822 /* It's harder with LAST, because we can't know whether
823 this case is the last in a group until we've prepared
824 the *next* case also. Thus, we buffer the previous
825 output case until the next one is ready. */
827 if (proc->prev_BY != NULL)
829 new_BY = !subcase_equal_xx (&proc->by_vars, proc->prev_BY, by);
830 if (proc->last != NULL)
831 case_data_rw (&proc->buffered_case, proc->last)->f = new_BY;
832 casewriter_write (proc->output, &proc->buffered_case);
837 case_move (&proc->buffered_case, output);
838 if (proc->first != NULL)
839 case_data_rw (&proc->buffered_case, proc->first)->f = new_BY;
843 size_t n = (subcase_get_n_values (&proc->by_vars)
844 * sizeof (union value));
845 if (proc->prev_BY == NULL)
846 proc->prev_BY = xmalloc (n);
847 memcpy (proc->prev_BY, by, n);
852 /* Writes a trailing buffered case to the output, if FIRST or
855 output_buffered_case (struct comb_proc *proc)
857 if (proc->prev_BY != NULL)
859 if (proc->last != NULL)
860 case_data_rw (&proc->buffered_case, proc->last)->f = 1.0;
861 casewriter_write (proc->output, &proc->buffered_case);
862 case_nullify (&proc->buffered_case);