1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2007, 2008 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21 #include <data/any-reader.h>
22 #include <data/case-matcher.h>
23 #include <data/case.h>
24 #include <data/casereader.h>
25 #include <data/casewriter.h>
26 #include <data/dictionary.h>
27 #include <data/format.h>
28 #include <data/procedure.h>
29 #include <data/subcase.h>
30 #include <data/variable.h>
31 #include <language/command.h>
32 #include <language/data-io/file-handle.h>
33 #include <language/data-io/trim.h>
34 #include <language/lexer/lexer.h>
35 #include <language/lexer/variable-parser.h>
36 #include <language/stats/sort-criteria.h>
37 #include <libpspp/assertion.h>
38 #include <libpspp/message.h>
39 #include <libpspp/taint.h>
40 #include <math/sort.h>
45 #define _(msgid) gettext (msgid)
/* Kind of input, according to the subcommand that named it. */
57 MTF_FILE, /* Specified on FILE= subcommand. */
58 MTF_TABLE /* Specified on TABLE= subcommand. */
61 /* One FILE or TABLE subcommand. */
/* Reader that this input's cases are read from. */
65 struct casereader *reader;
68 struct mtf_variable *vars; /* Variables to copy to output. */
69 size_t var_cnt; /* Number of other variables. */
70 bool is_sorted; /* Is presorted on the BY variables? */
72 struct file_handle *handle; /* Input file handle. */
73 struct dictionary *dict; /* Input file dictionary. */
/* Name given on the IN subcommand; an empty string if IN was not
   specified for this input. */
78 char in_name[VAR_NAME_LEN + 1];
/* Flag variable created for IN in the output dictionary.  It is set
   to false in each new output case and to true once a case from this
   input has been applied to it. */
79 struct variable *in_var;
/* Variable in the input dictionary that values are copied from. */
84 struct variable *in_var;
/* Variable of the same name in the output dictionary that values are
   copied to. */
85 struct variable *out_var;
90 struct mtf_file **files; /* All the files being merged. */
91 size_t n_files; /* Number of files. */
93 struct dictionary *dict; /* Dictionary of output file. */
94 struct casewriter *output; /* Destination for output. */
/* Supplies the cases of the FILE inputs to the process_* functions,
   which fetch them with case_matcher_read(). */
96 struct case_matcher *matcher;
100 Only if "first" or "last" is nonnull are the remaining
102 struct variable *first; /* Variable specified on FIRST (if any). */
103 struct variable *last; /* Variable specified on LAST (if any). */
104 struct ccase buffered_case; /* Case ready for output except that we don't
105 know the value for the LAST variable yet. */
106 union value *prev_BY; /* Values of BY vars in buffered_case. */
/* Forward declarations of the file-local functions defined later in
   this file. */
109 static int combine_files (enum command_type, struct lexer *, struct dataset *);
110 static void mtf_free (struct mtf_proc *);
112 static bool mtf_close_all_files (struct mtf_proc *);
113 static bool mtf_merge_dictionary (struct dictionary *const, struct mtf_file *);
/* One merge loop per command; combine_files() picks the one to run. */
115 static void process_update (struct mtf_proc *);
116 static void process_match_files (struct mtf_proc *);
117 static void process_add_files (struct mtf_proc *);
119 static bool create_flag_var (const char *subcommand_name, const char *var_name,
120 struct dictionary *, struct variable **);
121 static char *var_type_description (struct variable *);
122 static void output_case (struct mtf_proc *, struct ccase *, union value *by);
123 static void output_buffered_case (struct mtf_proc *);
126 cmd_add_files (struct lexer *lexer, struct dataset *ds)
128 return combine_files (ADD_FILES, lexer, ds);
132 cmd_match_files (struct lexer *lexer, struct dataset *ds)
134 return combine_files (MATCH_FILES, lexer, ds);
138 cmd_update (struct lexer *lexer, struct dataset *ds)
140 return combine_files (UPDATE, lexer, ds);
/* Shared implementation of ADD FILES, MATCH FILES, and UPDATE; COMMAND
   says which of the three is being run.  Parses the command from
   LEXER, builds the output dictionary, runs the matching process_*
   function, and installs the result as the active file of DS.
   Returns CMD_SUCCESS when taint_destroy() reports success, otherwise
   CMD_CASCADING_FAILURE. */
144 combine_files (enum command_type command,
145 struct lexer *lexer, struct dataset *ds)
150 bool saw_sort = false;
151 struct casereader *active_file = NULL;
153 char first_name[VAR_NAME_LEN + 1] = "";
154 char last_name[VAR_NAME_LEN + 1] = "";
156 struct taint *taint = NULL;
160 size_t allocated_files = 0;
166 mtf.dict = dict_create ();
169 subcase_init_empty (&mtf.by);
172 case_nullify (&mtf.buffered_case);
175 dict_set_case_limit (mtf.dict, dict_get_case_limit (dataset_dict (ds)));
177 lex_match (lexer, '/');
/* Parse the FILE subcommands and, for MATCH FILES only, the TABLE
   subcommands.  Each one adds an entry to mtf.files. */
180 struct mtf_file *file;
183 if (lex_match_id (lexer, "FILE"))
185 else if (command == MATCH_FILES && lex_match_id (lexer, "TABLE"))
189 lex_match (lexer, '=');
191 if (mtf.n_files >= allocated_files)
192 mtf.files = x2nrealloc (mtf.files, &allocated_files,
194 mtf.files[mtf.n_files++] = file = xmalloc (sizeof *file);
197 subcase_init_empty (&file->by);
/* FILE inputs and TABLE inputs are numbered independently. */
198 file->idx = type == MTF_FILE ? n_files++ : n_tables++;
201 file->is_sorted = true;
204 case_nullify (&file->c);
205 file->in_name[0] = '\0';
/* An asterisk names the active file; anything else is parsed as a
   file handle and opened with any_reader_open(). */
208 if (lex_match (lexer, '*'))
210 if (!proc_has_active_file (ds))
212 msg (SE, _("Cannot specify the active file since no active "
213 "file has been defined."));
217 if (proc_make_temporary_transformations_permanent (ds))
219 _("This command may not be used after TEMPORARY when "
220 "the active file is an input source. "
221 "Temporary transformations will be made permanent."));
223 file->dict = dict_clone (dataset_dict (ds));
227 file->handle = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH);
228 if (file->handle == NULL)
231 file->reader = any_reader_open (file->handle, &file->dict);
232 if (file->reader == NULL)
/* Subcommands that apply to the input just named: RENAME, IN, SORT. */
236 while (lex_match (lexer, '/'))
237 if (lex_match_id (lexer, "RENAME"))
239 if (!parse_dict_rename (lexer, file->dict))
242 else if (lex_match_id (lexer, "IN"))
244 lex_match (lexer, '=');
245 if (lex_token (lexer) != T_ID)
247 lex_error (lexer, NULL);
251 if (file->in_name[0])
253 msg (SE, _("Multiple IN subcommands for a single FILE or "
257 strcpy (file->in_name, lex_tokid (lexer));
260 else if (lex_match_id (lexer, "SORT"))
262 file->is_sorted = false;
/* NOTE(review): mtf_merge_dictionary() is declared to return bool, but
   its result is not used on this line. */
266 mtf_merge_dictionary (mtf.dict, file);
/* Parse the remaining subcommands (BY, FIRST, LAST, MAP, DROP, KEEP)
   until the end of the command. */
269 while (lex_token (lexer) != '.')
271 if (lex_match (lexer, T_BY))
273 const struct variable **by_vars;
279 lex_sbc_only_once ("BY");
284 lex_match (lexer, '=');
285 if (!parse_sort_criteria (lexer, mtf.dict, &mtf.by, &by_vars, NULL))
/* Build the BY subcase of every input by looking up each BY variable
   by name in that input's own dictionary. */
289 for (i = 0; i < mtf.n_files; i++)
291 struct mtf_file *file = mtf.files[i];
294 for (j = 0; j < subcase_get_n_values (&mtf.by); j++)
296 const char *name = var_get_name (by_vars[j]);
297 struct variable *var = dict_lookup_var (file->dict, name);
299 subcase_add_var (&file->by, var,
300 subcase_get_direction (&mtf.by, j));
/* An input without a file handle is the active file. */
303 if (file->handle != NULL)
304 msg (SE, _("File %s lacks BY variable %s."),
305 fh_get_name (file->handle), name);
307 msg (SE, _("Active file lacks BY variable %s."), name);
311 assert (!ok || subcase_conformable (&file->by,
/* FIRST and LAST are not recognized for UPDATE. */
319 else if (command != UPDATE && lex_match_id (lexer, "FIRST"))
321 if (first_name[0] != '\0')
323 lex_sbc_only_once ("FIRST");
327 lex_match (lexer, '=');
328 if (!lex_force_id (lexer))
330 strcpy (first_name, lex_tokid (lexer));
333 else if (command != UPDATE && lex_match_id (lexer, "LAST"))
335 if (last_name[0] != '\0')
337 lex_sbc_only_once ("LAST");
341 lex_match (lexer, '=');
342 if (!lex_force_id (lexer))
344 strcpy (last_name, lex_tokid (lexer));
347 else if (lex_match_id (lexer, "MAP"))
351 else if (lex_match_id (lexer, "DROP"))
353 if (!parse_dict_drop (lexer, mtf.dict))
356 else if (lex_match_id (lexer, "KEEP"))
358 if (!parse_dict_keep (lexer, mtf.dict))
363 lex_error (lexer, NULL);
367 if (!lex_match (lexer, '/') && lex_token (lexer) != '.')
369 lex_end_of_command (lexer);
/* Situations in which a missing BY subcommand is an error. */
376 if (command == UPDATE)
378 msg (SE, _("The BY subcommand is required."));
383 msg (SE, _("BY is required when TABLE is specified."));
388 msg (SE, _("BY is required when SORT is specified."));
393 /* Set up mapping from each file's variables to master
395 for (i = 0; i < mtf.n_files; i++)
397 struct mtf_file *file = mtf.files[i];
398 size_t in_var_cnt = dict_get_var_cnt (file->dict);
401 file->vars = xnmalloc (in_var_cnt, sizeof *file->vars);
403 for (j = 0; j < in_var_cnt; j++)
405 struct variable *in_var = dict_get_var (file->dict, j);
406 struct variable *out_var = dict_lookup_var (mtf.dict,
407 var_get_name (in_var));
411 struct mtf_variable *mv = &file->vars[file->var_cnt++];
413 mv->out_var = out_var;
418 /* Add IN, FIRST, and LAST variables to master dictionary. */
419 for (i = 0; i < mtf.n_files; i++)
421 struct mtf_file *file = mtf.files[i];
422 if (!create_flag_var ("IN", file->in_name, mtf.dict, &file->in_var))
425 if (!create_flag_var ("FIRST", first_name, mtf.dict, &mtf.first)
426 || !create_flag_var ("LAST", last_name, mtf.dict, &mtf.last))
/* Finish the output dictionary, then create the output writer sized
   for it.  TAINT is a clone of the writer taint, and the matcher and
   every reader propagate their taint to it. */
429 dict_delete_scratch_vars (mtf.dict);
430 dict_compact_values (mtf.dict);
431 mtf.output = autopaging_writer_create (dict_get_next_value_idx (mtf.dict));
432 taint = taint_clone (casewriter_get_taint (mtf.output));
434 mtf.matcher = case_matcher_create ();
435 taint_propagate (case_matcher_get_taint (mtf.matcher), taint);
/* Give every input a reader.  An input with no reader yet is the
   active file: the first such input opens it with proc_open() and any
   later one gets a clone of that reader.  Inputs not marked as sorted
   are sorted on their BY variables.  FILE inputs are registered with
   the matcher. */
436 for (i = 0; i < mtf.n_files; i++)
438 struct mtf_file *file = mtf.files[i];
439 if (file->reader == NULL)
441 if (active_file == NULL)
443 proc_discard_output (ds);
444 file->reader = active_file = proc_open (ds);
447 file->reader = casereader_clone (active_file);
449 if (!file->is_sorted)
450 file->reader = sort_execute (file->reader, &file->by);
451 if (file->type == MTF_FILE)
452 case_matcher_add_input (mtf.matcher, file->reader, &file->by);
455 casereader_read (file->reader, &file->c);
456 taint_propagate (casereader_get_taint (file->reader), taint);
/* Run the merge loop that belongs to COMMAND. */
460 if (command == ADD_FILES)
461 process_add_files (&mtf);
462 else if (command == MATCH_FILES)
463 process_match_files (&mtf);
464 else if (command == UPDATE)
465 process_update (&mtf);
/* Release the inputs and make the merged cases, together with the
   output dictionary, the new active file. */
469 case_matcher_destroy (mtf.matcher);
470 mtf_close_all_files (&mtf);
471 if (active_file != NULL)
474 proc_set_active_file (ds, casewriter_make_reader (mtf.output), mtf.dict);
480 return taint_destroy (taint) ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
483 if (active_file != NULL)
486 taint_destroy (taint);
487 return CMD_CASCADING_FAILURE;
490 /* If VAR_NAME is a non-empty string, attempts to create a
491 variable named VAR_NAME, with format F1.0, in DICT, and stores
492 a pointer to the variable in *VAR. Returns true if
493 successful, false if the variable name is a duplicate (in
494 which case a message saying that the variable specified on the
495 given SUBCOMMAND is a duplicate is emitted). Also returns
496 true, without doing anything, if VAR_NAME is null or empty. */
498 create_flag_var (const char *subcommand, const char *var_name,
499 struct dictionary *dict, struct variable **var)
501 if (var_name[0] != '\0')
503 struct fmt_spec format = fmt_for_output (FMT_F, 1, 0);
504 *var = dict_create_var (dict, var_name, 0);
507 msg (SE, _("Variable name %s specified on %s subcommand "
508 "duplicates an existing variable name."),
509 subcommand, var_name);
512 var_set_both_formats (*var, &format);
/* Returns a newly allocated string that describes V: "numeric" for
   a numeric variable, otherwise a phrase that gives the string
   width.  The caller is responsible for freeing the string. */
static char *
var_type_description (struct variable *v)
{
  return (var_is_numeric (v)
          ? xstrdup ("numeric")
          : xasprintf ("string with width %d", var_get_width (v)));
}
530 /* Closes all the files in MTF and frees their associated data.
531 Returns true if successful, false if an I/O error occurred on
any of the files. */
534 mtf_close_all_files (struct mtf_proc *mtf)
/* Release what each input owns: its file handle reference, its
   dictionary, and its BY subcase. */
539 for (i = 0; i < mtf->n_files; i++)
541 struct mtf_file *file = mtf->files[i];
542 fh_unref (file->handle);
543 dict_destroy (file->dict);
544 subcase_destroy (&file->by);
/* Only the readers of TABLE inputs are destroyed here.
   NOTE(review): the readers of FILE inputs were passed to
   case_matcher_add_input() in combine_files(), so the matcher
   presumably owns them -- confirm. */
545 if (file->type == MTF_TABLE)
546 casereader_destroy (file->reader);
557 /* Frees all the data for the procedure. */
559 mtf_free (struct mtf_proc *mtf)
561 mtf_close_all_files (mtf);
562 dict_destroy (mtf->dict);
563 subcase_destroy (&mtf->by);
564 casewriter_destroy (mtf->output);
565 case_destroy (&mtf->buffered_case);
/* Compares BY against the current case of table FILE, using the BY
   subcase of FILE, and reads further cases from the reader of FILE.
   The loop ends at the latest when the current case becomes null.
   NOTE(review): the branches on CMP and the return statements are not
   visible here.  process_match_files() stores the result in its
   include flag, so the result presumably says whether the table has a
   case that matches BY -- confirm. */
570 scan_table (struct mtf_file *file, union value *by)
572 while (!case_is_null (&file->c))
574 int cmp = subcase_compare_3way_xc (&file->by, by, &file->c);
576 casereader_read (file->reader, &file->c);
584 create_output_case (const struct mtf_proc *mtf, struct ccase *c)
588 case_create (c, dict_get_next_value_idx (mtf->dict));
589 for (i = 0; i < dict_get_var_cnt (mtf->dict); i++)
591 struct variable *v = dict_get_var (mtf->dict, i);
592 value_set_missing (case_data_rw (c, v), var_get_width (v));
594 for (i = 0; i < mtf->n_files; i++)
596 struct mtf_file *file = mtf->files[i];
597 if (file->in_var != NULL)
598 case_data_rw (c, file->in_var)->f = false;
603 apply_case (const struct mtf_file *file, struct ccase *file_case,
608 for (j = 0; j < file->var_cnt; j++)
610 const struct mtf_variable *mv = &file->vars[j];
611 const union value *in = case_data (file_case, mv->in_var);
612 union value *out = case_data_rw (c, mv->out_var);
613 value_copy (out, in, var_get_width (mv->in_var));
615 case_destroy (file_case);
616 if (file->in_var != NULL)
617 case_data_rw (c, file->in_var)->f = true;
/* Examines the elements of CASES for one that is not null.
   process_update() uses the result as an index into both CASES and
   the array of input files.
   NOTE(review): the loop header and the return statement are not
   visible here, so the bounds of the search cannot be confirmed. */
621 find_first_match (struct ccase *cases)
625 if (!case_is_null (&cases[i]))
/* Merge loop for the UPDATE command.  Each successful call to
   case_matcher_read() yields one set of cases, CASES, with one
   element per matcher input, and their BY values. */
630 process_update (struct mtf_proc *mtf)
635 while (case_matcher_read (mtf->matcher, &cases, &by))
637 struct mtf_file *min;
/* Start a new output case from the input that find_first_match()
   selects, and advance that input in the matcher. */
642 create_output_case (mtf, &c);
643 min_idx = find_first_match (cases);
644 min = mtf->files[min_idx];
645 apply_case (min, &cases[min_idx], &c);
646 case_matcher_advance (mtf->matcher, min_idx, &cases[min_idx]);
/* Apply every remaining case of the later inputs on top of the same
   output case, in input order.  Input 0 is never revisited here
   because the loop starts at index 1 or higher. */
647 for (i = MAX (1, min_idx); i < mtf->n_files; i++)
648 while (!case_is_null (&cases[i]))
650 apply_case (mtf->files[i], &cases[i], &c);
651 case_matcher_advance (mtf->matcher, i, &cases[i]);
653 casewriter_write (mtf->output, &c);
/* Any cases still left in input 0 are each written as an output case
   of their own and counted in n_dups. */
659 for (n_dups = 0; !case_is_null (&cases[0]); n_dups++)
661 create_output_case (mtf, &c);
662 apply_case (mtf->files[0], &cases[0], &c);
663 case_matcher_advance (mtf->matcher, 0, &cases[0]);
664 casewriter_write (mtf->output, &c);
668 msg (SW, _("Encountered %zu duplicates."), n_dups);
670 /* XXX warn. That's the whole point; otherwise we
671 don't need the 'if' statement at all. */
676 /* Executes MATCH FILES for key-based matches. */
678 process_match_files (struct mtf_proc *mtf)
683 while (case_matcher_read (mtf->matcher, &cases, &by))
688 create_output_case (mtf, &c);
/* Visit the inputs from last to first.  apply_case() overwrites the
   output values, so when several inputs supply the same variable the
   value from the earliest input is the one that remains. */
689 for (i = mtf->n_files; i-- > 0; )
691 struct mtf_file *file = mtf->files[i];
692 struct ccase *file_case;
/* A FILE input takes its case from the set that the matcher returned;
   a null case means the input has nothing for these BY values. */
694 if (file->type == MTF_FILE)
696 file_case = &cases[file->idx];
697 include = !case_is_null (file_case);
699 case_matcher_advance (mtf->matcher, file->idx, NULL);
/* A TABLE input uses its own current case, as positioned by
   scan_table(). */
703 file_case = &file->c;
704 include = scan_table (file, by);
/* NOTE(review): the source and the destination of this clone are the
   same object.  Presumably the intent is to keep the current table
   case alive after apply_case() destroys FILE_CASE -- confirm that
   case_clone() supports being called this way. */
706 case_clone (file_case, file_case);
709 apply_case (file, file_case, &c);
711 output_case (mtf, &c, by);
/* Write out the case that output_case() may still be holding back. */
713 output_buffered_case (mtf);
716 /* Executes ADD FILES.  For each set of cases that the matcher
   returns, writes every case of every input, in input order, as an
   output case of its own, and finally flushes the buffered case. */
718 process_add_files (struct mtf_proc *mtf)
723 while (case_matcher_read (mtf->matcher, &cases, &by))
728 for (i = 0; i < mtf->n_files; i++)
730 struct mtf_file *file = mtf->files[i];
/* Drain input I: each of its cases becomes one output case. */
731 while (!case_is_null (&cases[i]))
733 create_output_case (mtf, &c);
734 apply_case (file, &cases[i], &c);
735 case_matcher_advance (mtf->matcher, i, &cases[i]);
736 output_case (mtf, &c, by);
/* Write out the case that output_case() may still be holding back. */
740 output_buffered_case (mtf);
/* Sends case C, whose BY values are BY, toward the output of MTF.
   When neither FIRST nor LAST was requested, C is written at once.
   Otherwise the previously buffered case, if there is one, is
   written, and C takes its place in the buffer. */
744 output_case (struct mtf_proc *mtf, struct ccase *c, union value *by)
746 if (mtf->first == NULL && mtf->last == NULL)
747 casewriter_write (mtf->output, c);
750 /* It's harder with LAST, because we can't know whether
751 this case is the last in a group until we've prepared
752 the *next* case also. Thus, we buffer the previous
753 output case until the next one is ready. */
/* A case is already buffered.  It was the last of its group exactly
   when the BY values of the new case differ from the saved ones. */
755 if (mtf->prev_BY != NULL)
757 new_BY = !subcase_equal_xx (&mtf->by, mtf->prev_BY, by);
758 if (mtf->last != NULL)
759 case_data_rw (&mtf->buffered_case, mtf->last)->f = new_BY;
760 casewriter_write (mtf->output, &mtf->buffered_case);
/* C becomes the buffered case; its FIRST flag is taken from new_BY. */
765 case_move (&mtf->buffered_case, c);
766 if (mtf->first != NULL)
767 case_data_rw (&mtf->buffered_case, mtf->first)->f = new_BY;
/* Save the BY values of C for the comparison on the next call.  The
   buffer is allocated on first use and reused afterwards. */
771 size_t n = subcase_get_n_values (&mtf->by) * sizeof (union value);
772 if (mtf->prev_BY == NULL)
773 mtf->prev_BY = xmalloc (n);
774 memcpy (mtf->prev_BY, by, n);
780 output_buffered_case (struct mtf_proc *mtf)
782 if (mtf->prev_BY != NULL)
784 if (mtf->last != NULL)
785 case_data_rw (&mtf->buffered_case, mtf->last)->f = 1.0;
786 casewriter_write (mtf->output, &mtf->buffered_case);
787 case_nullify (&mtf->buffered_case);
791 /* Merge the dictionary for file F into master dictionary M. */
793 mtf_merge_dictionary (struct dictionary *const m, struct mtf_file *f)
795 struct dictionary *d = f->dict;
796 const char *d_docs, *m_docs;
799 if (dict_get_label (m) == NULL)
800 dict_set_label (m, dict_get_label (d));
802 d_docs = dict_get_documents (d);
803 m_docs = dict_get_documents (m);
807 dict_set_documents (m, d_docs);
810 char *new_docs = xasprintf ("%s%s", m_docs, d_docs);
811 dict_set_documents (m, new_docs);
816 for (i = 0; i < dict_get_var_cnt (d); i++)
818 struct variable *dv = dict_get_var (d, i);
819 struct variable *mv = dict_lookup_var (m, var_get_name (dv));
821 if (dict_class_from_id (var_get_name (dv)) == DC_SCRATCH)
826 if (var_get_width (mv) != var_get_width (dv))
828 char *dv_description = var_type_description (dv);
829 char *mv_description = var_type_description (mv);
830 msg (SE, _("Variable %s in file %s (%s) has different "
831 "type or width from the same variable in "
832 "earlier file (%s)."),
833 var_get_name (dv), fh_get_name (f->handle),
834 dv_description, mv_description);
835 free (dv_description);
836 free (mv_description);
840 if (var_has_value_labels (dv) && !var_has_value_labels (mv))
841 var_set_value_labels (mv, var_get_value_labels (dv));
842 if (var_has_missing_values (dv) && !var_has_missing_values (mv))
843 var_set_missing_values (mv, var_get_missing_values (dv));
844 if (var_get_label (dv) && !var_get_label (mv))
845 var_set_label (mv, var_get_label (dv));
848 mv = dict_clone_var_assert (m, dv, var_get_name (dv));