1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2006, 2007, 2008 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21 #include <data/any-reader.h>
22 #include <data/case.h>
23 #include <data/casereader.h>
24 #include <data/casewriter.h>
25 #include <data/format.h>
26 #include <data/dictionary.h>
27 #include <data/procedure.h>
28 #include <data/variable.h>
29 #include <language/command.h>
30 #include <language/data-io/file-handle.h>
31 #include <language/data-io/trim.h>
32 #include <language/lexer/lexer.h>
33 #include <language/lexer/variable-parser.h>
34 #include <libpspp/assertion.h>
35 #include <libpspp/message.h>
36 #include <libpspp/taint.h>
41 #define _(msgid) gettext (msgid)
// Enumerators distinguishing the two kinds of input on MATCH FILES.
// They are stored in a file's `type` member and tested when deciding
// how to process that input.  The enclosing enum declaration (its
// name and braces) is not visible in this excerpt.
46 MTF_FILE, /* Specified on FILE= subcommand. */
47 MTF_TABLE /* Specified on TABLE= subcommand. */
50 /* One of the FILEs or TABLEs on MATCH FILES. */
// NOTE: the struct header and some members are elided from this
// excerpt; other code in this file also accesses `type` and
// `sequence` members of this structure.
53 struct ll ll; /* In list of all files and tables. */
// Allocated with one entry per BY variable when BY is parsed; each
// entry is looked up by name in this file's own dictionary.
58 const struct variable **by; /* List of BY variables for this file. */
// `vars` is allocated with room for every variable in `dict`;
// `var_cnt` counts the entries actually filled in.
59 struct mtf_variable *vars; /* Variables to copy to output. */
60 size_t var_cnt; /* Number of other variables. */
// `handle` is tested against NULL before being named in the BY error
// message; presumably it is NULL when the input is the active file.
62 struct file_handle *handle; /* Input file handle. */
63 struct dictionary *dict; /* Input file dictionary. */
// Inputs whose `reader` is still NULL after parsing are given a
// reader on the active file before processing starts.
64 struct casereader *reader; /* Input reader. */
65 struct ccase input; /* Input record (null at end of file). */
// Set from the IN subcommand: `in_name` is NULL unless IN was given,
// and `in_var` is filled in by create_flag_var.
68 char *in_name; /* Variable name. */
69 struct variable *in_var; /* Variable (in master dictionary). */
// One variable to be copied from an input file to the output case
// (struct header elided from this excerpt).  `in_var` is used to
// read the value from the file's input case and supplies the width
// for the copy; `out_var` locates the destination in the output case.
74 struct variable *in_var;
75 struct variable *out_var;
78 /* MATCH FILES procedure. */
// NOTE: the struct header and a few comment lines are elided from
// this excerpt.
81 struct ll_list files; /* List of "struct mtf_file"s. */
// Incremented once per FILE specification during parsing and
// decremented when a read hits end of input; the main loop runs
// while it is positive.
82 int nonempty_files; /* FILEs that are not at end-of-file. */
84 bool ok; /* False if I/O error occurs. */
86 struct dictionary *dict; /* Dictionary of output file. */
87 struct casewriter *output; /* MATCH FILES output. */
89 size_t by_cnt; /* Number of variables on BY subcommand. */
92 Only if "first" or "last" is nonnull are the remaining
94 struct variable *first; /* Variable specified on FIRST (if any). */
95 struct variable *last; /* Variable specified on LAST (if any). */
96 struct ccase buffered_case; /* Case ready for output except that we don't
97 know the value for the LAST variable yet. */
98 struct ccase prev_BY_case; /* Case with values of last set of BY vars. */
// Set to the `by` array of the input whose record was saved in
// prev_BY_case; NULL-tested to detect whether a case is buffered.
99 const struct variable **prev_BY; /* Last set of BY variables. */
// Forward declarations for the file-local helpers defined below.
// (Some declarations are elided from this excerpt.)
102 static void mtf_free (struct mtf_proc *);
104 static bool mtf_close_all_files (struct mtf_proc *);
105 static bool mtf_merge_dictionary (struct dictionary *const, struct mtf_file *);
106 static bool mtf_read_record (struct mtf_proc *mtf, struct mtf_file *);
108 static void mtf_process_case (struct mtf_proc *);
110 static bool create_flag_var (const char *subcommand_name, const char *var_name,
111 struct dictionary *, struct variable **);
112 static char *var_type_description (struct variable *);
114 /* Parse and execute the MATCH FILES command. */
// Visible flow: (1) parse each FILE/TABLE specification together with
// its RENAME and IN subcommands, merging its dictionary into the
// output dictionary; (2) parse the remaining subcommands (BY, FIRST,
// LAST, MAP, DROP, KEEP); (3) build the per-file variable mappings
// and flag variables; (4) merge the inputs case by case into a
// casewriter; (5) install the result as the active file.
// Returns CMD_SUCCESS or CMD_CASCADING_FAILURE.
// NOTE: many lines (braces, error exits, some statements and
// declarations) are elided from this excerpt.
116 cmd_match_files (struct lexer *lexer, struct dataset *ds)
119 struct ll *first_table;
120 struct mtf_file *file, *next;
123 struct casereader *active_file = NULL;
125 char first_name[VAR_NAME_LEN + 1] = "";
126 char last_name[VAR_NAME_LEN + 1] = "";
128 struct taint *taint = NULL;
// Initialize the procedure state: empty file list, fresh output
// dictionary, no FIRST/LAST variables, null buffered cases.
132 ll_init (&mtf.files);
133 mtf.nonempty_files = 0;
134 first_table = ll_null (&mtf.files);
135 mtf.dict = dict_create ();
138 mtf.first = mtf.last = NULL;
139 case_nullify (&mtf.buffered_case);
140 case_nullify (&mtf.prev_BY_case);
143 dict_set_case_limit (mtf.dict, dict_get_case_limit (dataset_dict (ds)));
// One iteration per FILE or TABLE specification.
145 lex_match (lexer, '/');
146 while (lex_token (lexer) == T_ID
147 && (lex_id_match (ss_cstr ("FILE"), ss_cstr (lex_tokid (lexer)))
148 || lex_id_match (ss_cstr ("TABLE"), ss_cstr (lex_tokid (lexer)))))
150 struct mtf_file *file = xmalloc (sizeof *file);
155 file->in_name = NULL;
159 case_nullify (&file->input);
// FILEs are linked in relative to `first_table` and counted in
// nonempty_files; TABLEs are appended at the tail and the first one
// is remembered in `first_table`.  Presumably this keeps every FILE
// ahead of every TABLE in the list -- confirm ll_insert semantics.
161 if (lex_match_id (lexer, "FILE"))
163 file->type = MTF_FILE;
164 ll_insert (first_table, &file->ll);
165 mtf.nonempty_files++;
167 else if (lex_match_id (lexer, "TABLE"))
169 file->type = MTF_TABLE;
170 ll_push_tail (&mtf.files, &file->ll);
171 if (first_table == ll_null (&mtf.files))
172 first_table = &file->ll;
176 lex_match (lexer, '=');
// `*` designates the active file, which must already exist; its
// dictionary is cloned so per-file renames do not touch the dataset's.
178 if (lex_match (lexer, '*'))
180 if (!proc_has_active_file (ds))
182 msg (SE, _("Cannot specify the active file since no active "
183 "file has been defined."));
187 if (proc_make_temporary_transformations_permanent (ds))
189 _("MATCH FILES may not be used after TEMPORARY when "
190 "the active file is an input source. "
191 "Temporary transformations will be made permanent."));
193 file->dict = dict_clone (dataset_dict (ds));
// Otherwise the input is named by a file handle; opening the reader
// also yields the input's dictionary.
197 file->handle = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH);
198 if (file->handle == NULL)
201 file->reader = any_reader_open (file->handle, &file->dict);
202 if (file->reader == NULL)
// Per-file subcommands: RENAME and IN (at most one IN per input).
206 while (lex_match (lexer, '/'))
207 if (lex_match_id (lexer, "RENAME"))
209 if (!parse_dict_rename (lexer, file->dict))
212 else if (lex_match_id (lexer, "IN"))
214 lex_match (lexer, '=');
215 if (lex_token (lexer) != T_ID)
217 lex_error (lexer, NULL);
221 if (file->in_name != NULL)
223 msg (SE, _("Multiple IN subcommands for a single FILE or "
227 file->in_name = xstrdup (lex_tokid (lexer));
// NOTE(review): mtf_merge_dictionary is declared to return bool, but
// its result is not checked in this statement -- confirm that a
// type/width mismatch is meant to be non-fatal here.
232 mtf_merge_dictionary (mtf.dict, file);
// Remaining subcommands, up to the end of the command.
235 while (lex_token (lexer) != '.')
237 if (lex_match (lexer, T_BY))
239 struct mtf_file *file;
240 struct variable **by;
245 lex_sbc_only_once ("BY");
249 lex_match (lexer, '=');
250 if (!parse_variables (lexer, mtf.dict, &by, &mtf.by_cnt,
251 PV_NO_DUPLICATE | PV_NO_SCRATCH))
// Resolve every BY variable by name in each input's own dictionary;
// an input lacking one is reported as an error.
255 ll_for_each (file, struct mtf_file, ll, &mtf.files)
259 file->by = xnmalloc (mtf.by_cnt, sizeof *file->by);
260 for (i = 0; i < mtf.by_cnt; i++)
262 const char *var_name = var_get_name (by[i]);
263 file->by[i] = dict_lookup_var (file->dict, var_name);
264 if (file->by[i] == NULL)
266 if (file->handle != NULL)
267 msg (SE, _("File %s lacks BY variable %s."),
268 fh_get_name (file->handle), var_name);
270 msg (SE, _("Active file lacks BY variable %s."),
// FIRST and LAST only record the variable name here; the variables
// are created later by create_flag_var.
// NOTE(review): the strcpy calls assume the identifier token fits in
// VAR_NAME_LEN + 1 bytes -- confirm the lexer guarantees this.
281 else if (lex_match_id (lexer, "FIRST"))
283 if (first_name[0] != '\0')
285 lex_sbc_only_once ("FIRST");
289 lex_match (lexer, '=');
290 if (!lex_force_id (lexer))
292 strcpy (first_name, lex_tokid (lexer));
295 else if (lex_match_id (lexer, "LAST"))
297 if (last_name[0] != '\0')
299 lex_sbc_only_once ("LAST");
303 lex_match (lexer, '=');
304 if (!lex_force_id (lexer))
306 strcpy (last_name, lex_tokid (lexer));
309 else if (lex_match_id (lexer, "MAP"))
// DROP and KEEP operate on the merged output dictionary.
313 else if (lex_match_id (lexer, "DROP"))
315 if (!parse_dict_drop (lexer, mtf.dict))
318 else if (lex_match_id (lexer, "KEEP"))
320 if (!parse_dict_keep (lexer, mtf.dict))
325 lex_error (lexer, NULL);
329 if (!lex_match (lexer, '/') && lex_token (lexer) != '.')
331 lex_end_of_command (lexer);
// BY is mandatory when any TABLE or IN was given (the enclosing
// conditions are partly elided).
338 if (first_table != ll_null (&mtf.files))
340 msg (SE, _("BY is required when TABLE is specified."));
345 msg (SE, _("BY is required when IN is specified."));
350 /* Set up mapping from each file's variables to master
352 ll_for_each (file, struct mtf_file, ll, &mtf.files)
354 size_t in_var_cnt = dict_get_var_cnt (file->dict);
356 file->vars = xnmalloc (in_var_cnt, sizeof *file->vars);
358 for (i = 0; i < in_var_cnt; i++)
360 struct variable *in_var = dict_get_var (file->dict, i);
361 struct variable *out_var = dict_lookup_var (mtf.dict,
362 var_get_name (in_var));
366 struct mtf_variable *mv = &file->vars[file->var_cnt++];
368 mv->out_var = out_var;
373 /* Add IN, FIRST, and LAST variables to master dictionary. */
374 ll_for_each (file, struct mtf_file, ll, &mtf.files)
375 if (!create_flag_var ("IN", file->in_name, mtf.dict, &file->in_var))
377 if (!create_flag_var ("FIRST", first_name, mtf.dict, &mtf.first)
378 || !create_flag_var ("LAST", last_name, mtf.dict, &mtf.last))
// Finalize the output dictionary, create the output writer sized to
// it, and clone the writer's taint so that input errors can be
// propagated to it and checked at the end.
381 dict_delete_scratch_vars (mtf.dict);
382 dict_compact_values (mtf.dict);
383 mtf.output = autopaging_writer_create (dict_get_next_value_idx (mtf.dict));
384 taint = taint_clone (casewriter_get_taint (mtf.output));
// Inputs that still have no reader get one on the active file: the
// first such input opens it with proc_open, later ones clone it.
386 ll_for_each (file, struct mtf_file, ll, &mtf.files)
388 if (file->reader == NULL)
390 if (active_file == NULL)
392 proc_discard_output (ds);
393 file->reader = active_file = proc_open (ds);
396 file->reader = casereader_clone (active_file);
398 taint_propagate (casereader_get_taint (file->reader), taint);
// Prime every input with its first record, then emit output cases
// for as long as nonempty_files stays positive.
401 ll_for_each_safe (file, next, struct mtf_file, ll, &mtf.files)
402 mtf_read_record (&mtf, file);
403 while (mtf.nonempty_files > 0)
404 mtf_process_case (&mtf);
// Flush the case still buffered for FIRST/LAST handling; being the
// final case, it is marked as the last of its group.
405 if ((mtf.first != NULL || mtf.last != NULL) && mtf.prev_BY != NULL)
407 if (mtf.last != NULL)
408 case_data_rw (&mtf.buffered_case, mtf.last)->f = 1.0;
409 casewriter_write (mtf.output, &mtf.buffered_case);
410 case_nullify (&mtf.buffered_case);
// Success path: close the inputs and make the merged output the new
// active file; the final status depends on taint_destroy's result.
412 mtf_close_all_files (&mtf);
413 if (active_file != NULL)
416 proc_set_active_file (ds, casewriter_make_reader (mtf.output), mtf.dict);
422 return taint_destroy (taint) ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
// Failure path (its label and cleanup calls are elided).
425 if (active_file != NULL)
428 taint_destroy (taint);
429 return CMD_CASCADING_FAILURE;
432 /* If VAR_NAME is a nonnull pointer to a non-empty string,
433 attempts to create a variable named VAR_NAME, with format
434 F1.0, in DICT, and stores a pointer to the variable in *VAR.
435 Returns true if successful, false if the variable name is a
436 duplicate (in which case a message saying that the variable
437 specified on the given SUBCOMMAND is a duplicate is emitted).
438 Also returns true, without doing anything, if VAR_NAME is null
441 create_flag_var (const char *subcommand, const char *var_name,
442 struct dictionary *dict, struct variable **var)
444 if (var_name != NULL && var_name[0] != '\0')
446 struct fmt_spec format = fmt_for_output (FMT_F, 1, 0);
447 *var = dict_create_var (dict, var_name, 0);
450 msg (SE, _("Variable name %s specified on %s subcommand "
451 "duplicates an existing variable name."),
452 subcommand, var_name);
455 var_set_both_formats (*var, &format);
/* Describes V's type in a newly allocated string: "numeric" for a
   numeric variable, otherwise "string with width N" where N is V's
   width.  The caller must free the returned string. */
static char *
var_type_description (struct variable *v)
{
  return (var_is_numeric (v)
          ? xstrdup ("numeric")
          : xasprintf ("string with width %d", var_get_width (v)));
}
// Visible behavior: walks MTF's file list with ll_for_each_preremove
// and, for each entry, releases its file handle reference, reader,
// dictionary, IN variable name, and current input case.
// NOTE: the return type line, braces, return statement, and possibly
// further cleanup statements are elided from this excerpt; the
// forward declaration gives the return type as bool.
473 /* Closes all the files in MTF and frees their associated data.
474 Returns true if successful, false if an I/O error occurred on
477 mtf_close_all_files (struct mtf_proc *mtf)
479 struct mtf_file *file;
482 ll_for_each_preremove (file, struct mtf_file, ll, &mtf->files)
484 fh_unref (file->handle);
485 casereader_destroy (file->reader);
487 dict_destroy (file->dict);
488 free (file->in_name);
489 case_destroy (&file->input);
497 /* Frees all the data for the MATCH FILES procedure. */
499 mtf_free (struct mtf_proc *mtf)
501 mtf_close_all_files (mtf);
502 dict_destroy (mtf->dict);
503 casewriter_destroy (mtf->output);
504 case_destroy (&mtf->buffered_case);
505 case_destroy (&mtf->prev_BY_case);
508 /* Reads the next record into FILE, if possible, and update MTF's
509 nonempty_files count if not. */
511 mtf_read_record (struct mtf_proc *mtf, struct mtf_file *file)
513 case_destroy (&file->input);
514 if (!casereader_read (file->reader, &file->input))
516 mtf->nonempty_files--;
523 /* Compare the BY variables for files A and B; return -1 if A <
524 B, 0 if A == B, 1 if A > B. (If there are no BY variables,
525 then all records are equal.) */
527 mtf_compare_BY_values (struct mtf_proc *mtf,
528 struct mtf_file *a, struct mtf_file *b)
530 return case_compare_2dict (&a->input, &b->input, a->by, b->by, mtf->by_cnt);
533 /* Processes input files and write one case to the output file. */
// Visible steps: (1) scan the inputs to find the FILE records with
// minimal BY values and the TABLE records that match them, tagging
// each input's `sequence`; (2) build an output case from the tagged
// inputs; (3) write it, or buffer it when FIRST/LAST are in use;
// (4) advance the FILEs that contributed.
// NOTE: the return type line, braces, several conditions, and some
// declarations are elided from this excerpt.
535 mtf_process_case (struct mtf_proc *mtf)
538 struct mtf_file *min;
539 struct mtf_file *file;
543 /* Find the set of one or more FILEs whose BY values are
544 minimal, as well as the set of zero or more TABLEs whose BY
545 values equal those of the minimum FILEs.
547 After each iteration of the loop, this invariant holds: the
548 FILEs with minimum BY values thus far have "sequence"
549 members equal to min_sequence, and "min" points to one of
550 the mtf_files whose case has those minimum BY values, and
551 similarly for TABLEs. */
// Inputs at end of file (null input case) are handled first.  For a
// FILE, `cmp` compares the current minimum against this file; with
// no minimum yet it is 1.  The conditions selecting between the two
// `sequence` assignments below are elided.
554 ll_for_each (file, struct mtf_file, ll, &mtf->files)
555 if (case_is_null (&file->input))
557 else if (file->type == MTF_FILE)
559 int cmp = min != NULL ? mtf_compare_BY_values (mtf, min, file) : 1;
561 file->sequence = cmp < 0 ? -1 : min_sequence;
564 file->sequence = ++min_sequence;
// Remaining branch (presumably TABLEs): keep reading while the
// minimum compares greater than this input's record, then tag the
// input as matching only if the BY values are equal.
571 assert (min != NULL);
574 cmp = mtf_compare_BY_values (mtf, min, file);
576 while (cmp > 0 && mtf_read_record (mtf, file));
577 file->sequence = cmp == 0 ? min_sequence : -1;
580 /* Form the output case from the input cases. */
// Every output value starts out as missing.
581 case_create (&c, dict_get_next_value_idx (mtf->dict));
582 for (i = 0; i < dict_get_var_cnt (mtf->dict); i++)
584 struct variable *v = dict_get_var (mtf->dict, i);
585 value_set_missing (case_data_rw (&c, v), var_get_width (v));
// Inputs are visited in reverse list order, so values copied from
// inputs earlier in the list overwrite those copied from later ones.
// The copy loop is presumably guarded by `include_file` (guard
// elided); the IN flag records whether this input contributed.
587 ll_for_each_reverse (file, struct mtf_file, ll, &mtf->files)
589 bool include_file = file->sequence == min_sequence;
591 for (i = 0; i < file->var_cnt; i++)
593 const struct mtf_variable *mv = &file->vars[i];
594 const union value *in = case_data (&file->input, mv->in_var);
595 union value *out = case_data_rw (&c, mv->out_var);
596 value_copy (out, in, var_get_width (mv->in_var));
598 if (file->in_var != NULL)
599 case_data_rw (&c, file->in_var)->f = include_file;
602 /* Write the output case. */
603 if (mtf->first == NULL && mtf->last == NULL)
605 /* With no FIRST or LAST variables, it's trivial. */
606 casewriter_write (mtf->output, &c);
610 /* It's harder with LAST, because we can't know whether
611 this case is the last in a group until we've prepared
612 the *next* case also. Thus, we buffer the previous
613 output case until the next one is ready.
615 We also have to save a copy of one of the previous input
616 cases, so that we can compare the BY variables. We
617 can't compare the BY variables between the current
618 output case and the saved one because the BY variables
619 might not be in the output (the user is allowed to drop
// When a case is already buffered, `new_BY` is the result of
// comparing the current minimum record with the saved BY record; it
// becomes the buffered case's LAST value before that case is written.
622 if (mtf->prev_BY != NULL)
624 new_BY = case_compare_2dict (&min->input, &mtf->prev_BY_case,
625 min->by, mtf->prev_BY,
627 if (mtf->last != NULL)
628 case_data_rw (&mtf->buffered_case, mtf->last)->f = new_BY;
629 casewriter_write (mtf->output, &mtf->buffered_case);
// The new case takes over the buffer, and its FIRST value is set
// from the same `new_BY` flag.
634 case_move (&mtf->buffered_case, &c);
635 if (mtf->first != NULL)
636 case_data_rw (&mtf->buffered_case, mtf->first)->f = new_BY;
// Remember the BY variables and a copy of the record that supplied
// them, for comparison on the next call.
640 mtf->prev_BY = min->by;
641 case_destroy (&mtf->prev_BY_case);
642 case_clone (&mtf->prev_BY_case, &min->input);
646 /* Read another record from each input file FILE with minimum
// Only FILEs tagged with the current min_sequence are advanced; the
// handling of non-FILE entries in this loop is elided.
648 ll_for_each (file, struct mtf_file, ll, &mtf->files)
649 if (file->type == MTF_FILE)
651 if (file->sequence == min_sequence)
652 mtf_read_record (mtf, file);
658 /* Merge the dictionary for file F into master dictionary M. */
660 mtf_merge_dictionary (struct dictionary *const m, struct mtf_file *f)
662 struct dictionary *d = f->dict;
663 const char *d_docs, *m_docs;
666 if (dict_get_label (m) == NULL)
667 dict_set_label (m, dict_get_label (d));
669 d_docs = dict_get_documents (d);
670 m_docs = dict_get_documents (m);
674 dict_set_documents (m, d_docs);
677 char *new_docs = xasprintf ("%s%s", m_docs, d_docs);
678 dict_set_documents (m, new_docs);
683 for (i = 0; i < dict_get_var_cnt (d); i++)
685 struct variable *dv = dict_get_var (d, i);
686 struct variable *mv = dict_lookup_var (m, var_get_name (dv));
688 if (dict_class_from_id (var_get_name (dv)) == DC_SCRATCH)
693 if (var_get_width (mv) != var_get_width (dv))
695 char *dv_description = var_type_description (dv);
696 char *mv_description = var_type_description (mv);
697 msg (SE, _("Variable %s in file %s (%s) has different "
698 "type or width from the same variable in "
699 "earlier file (%s)."),
700 var_get_name (dv), fh_get_name (f->handle),
701 dv_description, mv_description);
702 free (dv_description);
703 free (mv_description);
707 if (var_get_width (dv) == var_get_width (mv))
709 if (var_has_value_labels (dv) && !var_has_value_labels (mv))
710 var_set_value_labels (mv, var_get_value_labels (dv));
711 if (var_has_missing_values (dv) && !var_has_missing_values (mv))
712 var_set_missing_values (mv, var_get_missing_values (dv));
715 if (var_get_label (dv) && !var_get_label (mv))
716 var_set_label (mv, var_get_label (dv));
719 mv = dict_clone_var_assert (m, dv, var_get_name (dv));