1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
24 #include <data/any-reader.h>
25 #include <data/any-writer.h>
26 #include <data/case-sink.h>
27 #include <data/case-source.h>
28 #include <data/case.h>
29 #include <data/casefile.h>
30 #include <data/fastfile.h>
31 #include <data/dictionary.h>
32 #include <data/por-file-writer.h>
33 #include <data/procedure.h>
34 #include <data/settings.h>
35 #include <data/storage-stream.h>
36 #include <data/sys-file-writer.h>
37 #include <data/transformations.h>
38 #include <data/value-labels.h>
39 #include <data/variable.h>
40 #include <language/command.h>
41 #include <language/data-io/file-handle.h>
42 #include <language/lexer/lexer.h>
43 #include <language/lexer/variable-parser.h>
44 #include <libpspp/alloc.h>
45 #include <libpspp/assertion.h>
46 #include <libpspp/compiler.h>
47 #include <libpspp/hash.h>
48 #include <libpspp/message.h>
49 #include <libpspp/message.h>
50 #include <libpspp/misc.h>
51 #include <libpspp/str.h>
54 #define _(msgid) gettext (msgid)
56 /* Rearranging and reducing a dictionary. */
/* Forward declarations only; the definitions of these case-map helpers
   are not visible in this part of the file. */
57 static void start_case_map (struct dictionary *);
58 static struct case_map *finish_case_map (struct dictionary *);
59 static void map_case (const struct case_map *,
60 const struct ccase *, struct ccase *);
61 static void destroy_case_map (struct case_map *);
/* Parser for the dictionary-trimming subcommands (MAP, DROP, KEEP,
   RENAME) used by the read and write commands below. */
63 static bool parse_dict_trim (struct lexer *, struct dictionary *);
65 /* Reading system and portable files. */
67 /* Type of command. */
74 /* Case reader input program. */
/* State shared by the GET/IMPORT case source callbacks: the open reader,
   a case map (a null MAP means cases are read without mapping), and a
   scratch case used when mapping is needed. */
75 struct case_reader_pgm
77 struct any_reader *reader; /* File reader. */
78 struct case_map *map; /* Map from file dict to active file dict. */
79 struct ccase bounce; /* Bounce buffer. */
/* Case source class for this reader; its initializer appears later in
   the file. */
82 static const struct case_source_class case_reader_source_class;
84 static void case_reader_pgm_free (struct case_reader_pgm *);
86 /* Parses a GET or IMPORT command.
   TYPE selects the command being parsed; the TYPE subcommand is accepted
   only when TYPE is IMPORT_CMD.  After the file is opened, the remaining
   subcommands are handed to parse_dict_trim until the end of the command.
   The dataset's dictionary is then replaced by the one read from the file
   and a case source built on case_reader_source_class is created.
   The visible failure path frees PGM and returns CMD_CASCADING_FAILURE. */
88 parse_read_command (struct lexer *lexer, struct dataset *ds, enum reader_command type)
90 struct case_reader_pgm *pgm = NULL;
91 struct file_handle *fh = NULL;
92 struct dictionary *dict = NULL;
96 lex_match (lexer, '/');
/* The FILE keyword is optional: a bare string token is also accepted as
   the start of the file specification. */
98 if (lex_match_id (lexer, "FILE") || lex_token (lexer) == T_STRING)
100 lex_match (lexer, '=');
102 fh = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH);
106 else if (type == IMPORT_CMD && lex_match_id (lexer, "TYPE"))
108 lex_match (lexer, '=');
110 if (lex_match_id (lexer, "COMM"))
112 else if (lex_match_id (lexer, "TAPE"))
116 lex_error (lexer, _("expecting COMM or TAPE"));
126 lex_sbc_missing (lexer, "FILE");
130 discard_variables (ds);
132 pgm = xmalloc (sizeof *pgm);
133 pgm->reader = any_reader_open (fh, &dict);
/* Nullify the bounce case before testing the reader, so that it is in a
   known state when PGM is later passed to case_reader_pgm_free. */
135 case_nullify (&pgm->bounce);
136 if (pgm->reader == NULL)
139 case_create (&pgm->bounce, dict_get_next_value_idx (dict));
/* Dictionary trimming is bracketed by start_case_map/finish_case_map,
   which yields the map stored in PGM. */
141 start_case_map (dict);
143 while (lex_token (lexer) != '.')
145 lex_match (lexer, '/');
146 if (!parse_dict_trim (lexer, dict))
150 pgm->map = finish_case_map (dict);
/* Install the file's dictionary in place of the dataset's old one. */
152 dict_destroy (dataset_dict (ds));
153 dataset_set_dict (ds, dict);
156 create_case_source (&case_reader_source_class, pgm));
161 case_reader_pgm_free (pgm);
164 return CMD_CASCADING_FAILURE;
167 /* Frees a struct case_reader_pgm: closes its reader, destroys its
   case map, and destroys its bounce case. */
169 case_reader_pgm_free (struct case_reader_pgm *pgm)
173 any_reader_close (pgm->reader);
174 destroy_case_map (pgm->map);
175 case_destroy (&pgm->bounce);
180 /* Clears internal state related to case reader input procedure.
   SOURCE->aux holds the struct case_reader_pgm, which is freed here. */
182 case_reader_source_destroy (struct case_source *source)
184 struct case_reader_pgm *pgm = source->aux;
185 case_reader_pgm_free (pgm);
188 /* Reads all the cases from the data file into C and passes them
189 to WRITE_CASE one by one, passing WC_DATA.
190 Returns true if successful, false if an I/O error occurred. */
192 case_reader_source_read (struct case_source *source,
194 write_case_func *write_case, write_case_data wc_data)
196 struct case_reader_pgm *pgm = source->aux;
/* With no case map, read straight into C; otherwise read into the bounce
   case and map it into C. */
202 if (pgm->map == NULL)
203 got_case = any_reader_read (pgm->reader, c);
206 got_case = any_reader_read (pgm->reader, &pgm->bounce);
208 map_case (pgm->map, &pgm->bounce, c);
213 ok = write_case (wc_data);
/* Success requires both that OK is still true and that the reader
   reports no error. */
217 return ok && !any_reader_error (pgm->reader);
/* Case source class used by GET and IMPORT; it lists the read and
   destroy callbacks defined above. */
220 static const struct case_source_class case_reader_source_class =
224 case_reader_source_read,
225 case_reader_source_destroy,
/* GET command: delegates to parse_read_command with type GET_CMD and
   returns its result. */
230 cmd_get (struct lexer *lexer, struct dataset *ds)
232 return parse_read_command (lexer, ds, GET_CMD);
/* IMPORT command: delegates to parse_read_command with type IMPORT_CMD
   and returns its result. */
237 cmd_import (struct lexer *lexer, struct dataset *ds)
239 return parse_read_command (lexer, ds, IMPORT_CMD);
242 /* Writing system and portable files. */
244 /* Type of output file. */
/* SAVE and XSAVE pass SYSFILE_WRITER; EXPORT and XEXPORT pass
   PORFILE_WRITER. */
247 SYSFILE_WRITER, /* System file. */
248 PORFILE_WRITER /* Portable file. */
251 /* Type of a command. */
/* parse_output_trns (XSAVE, XEXPORT) uses XFORM_CMD; parse_output_proc
   (SAVE, EXPORT) uses PROC_CMD. */
254 XFORM_CMD, /* Transformation. */
255 PROC_CMD /* Procedure. */
258 /* File writer plus a case map. */
/* Members of the writer state that parse_write_command builds and
   case_writer_destroy releases. */
261 struct any_writer *writer; /* File writer. */
262 struct case_map *map; /* Map to output file dictionary
263 (null pointer for identity mapping). */
264 struct ccase bounce; /* Bounce buffer for mapping (if needed). */
/* Releases AW's resources: closes the writer (storing the result of
   any_writer_close in OK), destroys the case map, and destroys the bounce
   case. */
269 case_writer_destroy (struct case_writer *aw)
274 ok = any_writer_close (aw->writer);
275 destroy_case_map (aw->map);
276 case_destroy (&aw->bounce);
282 /* Parses SAVE or XSAVE or EXPORT or XEXPORT command.
283 WRITER_TYPE identifies the type of file to write,
284 and COMMAND_TYPE identifies the type of command.
286 On success, returns a writer.
287 For procedures only, sets *RETAIN_UNSELECTED to true if cases
288 that would otherwise be excluded by FILTER or USE should be
   retained.
291 On failure, returns a null pointer. */
292 static struct case_writer *
293 parse_write_command (struct lexer *lexer, struct dataset *ds,
294 enum writer_type writer_type,
295 enum command_type command_type,
296 bool *retain_unselected)
299 struct file_handle *handle; /* Output file. */
300 struct dictionary *dict; /* Dictionary for output file. */
301 struct case_writer *aw; /* Writer. */
303 /* Common options. */
304 bool print_map; /* Print map? TODO. */
305 bool print_short_names; /* Print long-to-short name map. TODO. */
306 struct sfm_write_options sysfile_opts;
307 struct pfm_write_options porfile_opts;
309 assert (writer_type == SYSFILE_WRITER || writer_type == PORFILE_WRITER);
310 assert (command_type == XFORM_CMD || command_type == PROC_CMD);
/* RETAIN_UNSELECTED must be non-null exactly when parsing a procedure. */
311 assert ((retain_unselected != NULL) == (command_type == PROC_CMD));
/* Procedures retain unselected cases unless UNSELECTED=DELETE is given. */
313 if (command_type == PROC_CMD)
314 *retain_unselected = true;
/* Trimming is applied to a clone of the dataset's dictionary. */
317 dict = dict_clone (dataset_dict (ds));
318 aw = xmalloc (sizeof *aw);
321 case_nullify (&aw->bounce);
323 print_short_names = false;
324 sysfile_opts = sfm_writer_default_options ();
325 porfile_opts = pfm_writer_default_options ();
/* Scratch variables are deleted after start_case_map has been called on
   the clone. */
327 start_case_map (dict);
328 dict_delete_scratch_vars (dict);
330 lex_match (lexer, '/');
333 if (lex_match_id (lexer, "OUTFILE"))
337 lex_sbc_only_once ("OUTFILE");
341 lex_match (lexer, '=');
343 handle = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH);
347 else if (lex_match_id (lexer, "NAMES"))
348 print_short_names = true;
349 else if (lex_match_id (lexer, "PERMISSIONS"))
353 lex_match (lexer, '=');
354 if (lex_match_id (lexer, "READONLY"))
356 else if (lex_match_id (lexer, "WRITEABLE"))
360 lex_error (lexer, _("expecting %s or %s"), "READONLY", "WRITEABLE");
/* The same permission setting is stored in both option structures. */
363 sysfile_opts.create_writeable = porfile_opts.create_writeable = cw;
/* UNSELECTED is recognized only for procedures. */
365 else if (command_type == PROC_CMD && lex_match_id (lexer, "UNSELECTED"))
367 lex_match (lexer, '=');
368 if (lex_match_id (lexer, "RETAIN"))
369 *retain_unselected = true;
370 else if (lex_match_id (lexer, "DELETE"))
371 *retain_unselected = false;
374 lex_error (lexer, _("expecting %s or %s"), "RETAIN", "DELETE");
/* The following subcommands are each specific to one writer type. */
378 else if (writer_type == SYSFILE_WRITER && lex_match_id (lexer, "COMPRESSED"))
379 sysfile_opts.compress = true;
380 else if (writer_type == SYSFILE_WRITER && lex_match_id (lexer, "UNCOMPRESSED"))
381 sysfile_opts.compress = false;
382 else if (writer_type == SYSFILE_WRITER && lex_match_id (lexer, "VERSION"))
384 lex_match (lexer, '=');
385 if (!lex_force_int (lexer))
387 sysfile_opts.version = lex_integer (lexer);
390 else if (writer_type == PORFILE_WRITER && lex_match_id (lexer, "TYPE"))
392 lex_match (lexer, '=');
393 if (lex_match_id (lexer, "COMMUNICATIONS"))
394 porfile_opts.type = PFM_COMM;
395 else if (lex_match_id (lexer, "TAPE"))
396 porfile_opts.type = PFM_TAPE;
399 lex_error (lexer, _("expecting %s or %s"), "COMM", "TAPE");
403 else if (writer_type == PORFILE_WRITER && lex_match_id (lexer, "DIGITS"))
405 lex_match (lexer, '=');
406 if (!lex_force_int (lexer))
408 porfile_opts.digits = lex_integer (lexer);
/* Anything else is handed to the dictionary-trimming parser. */
411 else if (!parse_dict_trim (lexer, dict))
414 if (!lex_match (lexer, '/'))
417 if (lex_end_of_command (lexer) != CMD_SUCCESS)
422 lex_sbc_missing (lexer, "OUTFILE");
/* Compact the trimmed dictionary, then finish the case map and size the
   bounce case for the resulting dictionary. */
426 dict_compact_values (dict);
427 aw->map = finish_case_map (dict);
429 case_create (&aw->bounce, dict_get_next_value_idx (dict));
/* For a handle that refers to a file, the writer is opened with the
   type-specific options; any_writer_open is presumably the path for the
   other referent types -- confirm against the full source. */
431 if (fh_get_referent (handle) == FH_REF_FILE)
436 aw->writer = any_writer_from_sfm_writer (
437 sfm_open_writer (handle, dict, sysfile_opts));
440 aw->writer = any_writer_from_pfm_writer (
441 pfm_open_writer (handle, dict, porfile_opts));
446 aw->writer = any_writer_open (handle, dict);
447 if (aw->writer == NULL)
454 case_writer_destroy (aw);
459 /* Writes case C to writer AW.
   AW's case map is used to map C into AW's bounce case; the result of
   any_writer_write is returned. */
461 case_writer_write_case (struct case_writer *aw, const struct ccase *c)
465 map_case (aw->map, c, &aw->bounce);
468 return any_writer_write (aw->writer, c);
471 /* SAVE and EXPORT. */
473 static bool output_proc (const struct ccase *, void *, const struct dataset *);
475 /* Parses and performs the SAVE or EXPORT procedure.
   Returns CMD_SUCCESS if the procedure succeeded and
   CMD_CASCADING_FAILURE otherwise. */
477 parse_output_proc (struct lexer *lexer, struct dataset *ds, enum writer_type writer_type)
479 bool retain_unselected;
480 struct variable *saved_filter_variable;
481 struct case_writer *aw;
484 aw = parse_write_command (lexer, ds, writer_type, PROC_CMD, &retain_unselected);
486 return CMD_CASCADING_FAILURE;
/* When unselected cases are to be retained, the dictionary's filter
   variable is cleared for the duration of the procedure and restored
   afterward. */
488 saved_filter_variable = dict_get_filter (dataset_dict (ds));
489 if (retain_unselected)
490 dict_set_filter (dataset_dict (ds), NULL);
491 ok = procedure (ds, output_proc, aw);
492 dict_set_filter (dataset_dict (ds), saved_filter_variable);
494 case_writer_destroy (aw);
495 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
498 /* Writes case C to file.
   AW_ is the struct case_writer passed to procedure() by
   parse_output_proc; DS is unused. */
500 output_proc (const struct ccase *c, void *aw_, const struct dataset *ds UNUSED)
502 struct case_writer *aw = aw_;
503 return case_writer_write_case (aw, c);
/* SAVE command: runs parse_output_proc with a system-file writer. */
507 cmd_save (struct lexer *lexer, struct dataset *ds)
509 return parse_output_proc (lexer, ds, SYSFILE_WRITER);
/* EXPORT command: runs parse_output_proc with a portable-file writer. */
513 cmd_export (struct lexer *lexer, struct dataset *ds)
515 return parse_output_proc (lexer, ds, PORFILE_WRITER);
518 /* XSAVE and XEXPORT. */
520 /* Transformation. */
/* The transformation's only visible member is the case writer it owns. */
523 struct case_writer *aw; /* Writer. */
/* Callbacks registered with add_transformation by parse_output_trns. */
526 static trns_proc_func output_trns_proc;
527 static trns_free_func output_trns_free;
529 /* Parses the XSAVE or XEXPORT transformation command.
   Allocates the transformation state, parses the command as an XFORM_CMD
   (so no RETAIN_UNSELECTED pointer is passed), and registers the
   transformation with the dataset.  The visible failure path returns
   CMD_CASCADING_FAILURE. */
531 parse_output_trns (struct lexer *lexer, struct dataset *ds, enum writer_type writer_type)
533 struct output_trns *t = xmalloc (sizeof *t);
534 t->aw = parse_write_command (lexer, ds, writer_type, XFORM_CMD, NULL);
538 return CMD_CASCADING_FAILURE;
541 add_transformation (ds, output_trns_proc, output_trns_free, t);
545 /* Writes case C to the system file specified on XSAVE or XEXPORT.
   Always returns TRNS_CONTINUE. */
547 output_trns_proc (void *trns_, struct ccase *c, casenumber case_num UNUSED)
549 struct output_trns *t = trns_;
/* NOTE(review): the bool result of case_writer_write_case is discarded
   here, so a write failure does not stop the transformation chain. */
550 case_writer_write_case (t->aw, c);
551 return TRNS_CONTINUE;
554 /* Frees an XSAVE or XEXPORT transformation.
555 Returns true if successful, false if an I/O error occurred. */
557 output_trns_free (void *trns_)
559 struct output_trns *t = trns_;
/* OK takes the result of destroying the transformation's writer. */
564 ok = case_writer_destroy (t->aw);
/* XSAVE command: runs parse_output_trns with a system-file writer. */
572 cmd_xsave (struct lexer *lexer, struct dataset *ds)
574 return parse_output_trns (lexer, ds, SYSFILE_WRITER);
577 /* XEXPORT command: runs parse_output_trns with a portable-file
   writer. */
579 cmd_xexport (struct lexer *lexer, struct dataset *ds)
581 return parse_output_trns (lexer, ds, PORFILE_WRITER);
/* Parsers for the individual trimming subcommands, defined below. */
584 static bool rename_variables (struct lexer *lexer, struct dictionary *dict);
585 static bool drop_variables (struct lexer *, struct dictionary *dict);
586 static bool keep_variables (struct lexer *, struct dictionary *dict);
588 /* Commands that read and write system files share a great deal
589 of common syntactic structure for rearranging and dropping
590 variables. This function parses this syntax and modifies DICT
591 appropriately. Returns true on success, false on failure. */
593 parse_dict_trim (struct lexer *lexer, struct dictionary *dict)
/* Dispatch on the subcommand keyword: MAP, DROP, KEEP or RENAME.  Any
   other token is reported as an error. */
595 if (lex_match_id (lexer, "MAP"))
600 else if (lex_match_id (lexer, "DROP"))
601 return drop_variables (lexer, dict);
602 else if (lex_match_id (lexer, "KEEP"))
603 return keep_variables (lexer, dict);
604 else if (lex_match_id (lexer, "RENAME"))
605 return rename_variables (lexer, dict);
608 lex_error (lexer, _("expecting a valid subcommand"));
613 /* Parses and performs the RENAME subcommand of GET and SAVE.
   Two syntaxes are handled: a single unparenthesized OLD=NEW pair, and
   one or more parenthesized groups of the form (old... = new...). */
615 rename_variables (struct lexer *lexer, struct dictionary *dict)
628 lex_match (lexer, '=');
/* No opening parenthesis: handle the single-variable form. */
629 if (lex_token (lexer) != '(')
633 v = parse_variable (lexer, dict);
636 if (!lex_force_match (lexer, '=')
637 || !lex_force_id (lexer))
/* Refuse a new name that already exists in the dictionary. */
639 if (dict_lookup_var (dict, lex_tokid (lexer)) != NULL)
641 msg (SE, _("Cannot rename %s as %s because there already exists "
642 "a variable named %s. To rename variables with "
643 "overlapping names, use a single RENAME subcommand "
644 "such as \"/RENAME (A=B)(B=C)(C=A)\", or equivalently, "
645 "\"/RENAME (A B C=B C A)\"."),
646 var_get_name (v), lex_tokid (lexer), lex_tokid (lexer));
650 dict_rename_var (dict, v, lex_tokid (lexer));
/* Parenthesized form: old and new names accumulate across groups
   (PV_APPEND) and are renamed together by dict_rename_vars below. */
659 while (lex_match (lexer, '('))
663 if (!parse_variables (lexer, dict, &v, &nv, PV_NO_DUPLICATE | PV_APPEND))
665 if (!lex_match (lexer, '='))
667 msg (SE, _("`=' expected after variable list."));
670 if (!parse_DATA_LIST_vars (lexer, &new_names, &nn, PV_APPEND | PV_NO_SCRATCH))
/* NOTE(review): the counts are cast to unsigned but printed with %d. */
674 msg (SE, _("Number of variables on left side of `=' (%d) does not "
675 "match number of variables on right side (%d), in "
676 "parenthesized group %d of RENAME subcommand."),
677 (unsigned) (nv - old_nv), (unsigned) (nn - old_nv), group);
680 if (!lex_force_match (lexer, ')'))
685 if (!dict_rename_vars (dict, v, new_names, nv, &err_name))
687 msg (SE, _("Requested renaming duplicates variable name %s."), err_name);
693 for (i = 0; i < nn; i++)
701 /* Parses and performs the DROP subcommand of GET and SAVE.
702 Returns true if successful, false on failure.*/
704 drop_variables (struct lexer *lexer, struct dictionary *dict)
709 lex_match (lexer, '=');
710 if (!parse_variables (lexer, dict, &v, &nv, PV_NONE))
712 dict_delete_vars (dict, v, nv);
/* Dropping every variable is reported as an error. */
715 if (dict_get_var_cnt (dict) == 0)
717 msg (SE, _("Cannot DROP all variables from dictionary."));
723 /* Parses and performs the KEEP subcommand of GET and SAVE.
724 Returns true if successful, false on failure.*/
726 keep_variables (struct lexer *lexer, struct dictionary *dict)
732 lex_match (lexer, '=');
733 if (!parse_variables (lexer, dict, &v, &nv, PV_NONE))
736 /* Move the specified variables to the beginning. */
737 dict_reorder_vars (dict, v, nv);
739 /* Delete the remaining variables.
   V is resized and reused to hold the variables at positions NV and
   beyond, which are then deleted in one call. */
740 v = xnrealloc (v, dict_get_var_cnt (dict) - nv, sizeof *v);
741 for (i = nv; i < dict_get_var_cnt (dict); i++)
742 v[i - nv] = dict_get_var (dict, i);
743 dict_delete_vars (dict, v, dict_get_var_cnt (dict) - nv);
/* Kinds of MATCH FILES input, stored in the TYPE member of struct
   mtf_file. */
754 MTF_FILE, /* Specified on FILE= subcommand. */
755 MTF_TABLE /* Specified on TABLE= subcommand. */
758 /* One of the files on MATCH FILES. */
/* Files are kept in a doubly linked list (NEXT/PREV); NEXT_MIN chains
   the files grouped together by mtf_processing. */
761 struct mtf_file *next, *prev; /* Next, previous in the list of files. */
762 struct mtf_file *next_min; /* Next in the chain of minimums. */
764 int type; /* One of MTF_*. */
765 struct variable **by; /* List of BY variables for this file. */
/* HANDLE stays null for the active file ("*"). */
766 struct file_handle *handle; /* File handle. */
767 struct any_reader *reader; /* File reader. */
768 struct dictionary *dict; /* Dictionary from system file. */
771 char *in_name; /* Variable name. */
772 struct variable *in_var; /* Variable (in master dictionary). */
774 struct ccase input; /* Input record. */
777 /* MATCH FILES procedure. */
/* Overall state for one MATCH FILES command: the list of input files,
   the master (output) dictionary, the output casefile, and the output
   case being assembled. */
780 struct mtf_file *head; /* First file mentioned on FILE or TABLE. */
781 struct mtf_file *tail; /* Last file mentioned on FILE or TABLE. */
783 bool ok; /* False if I/O error occurs. */
785 size_t by_cnt; /* Number of variables on BY subcommand. */
787 /* Names of FIRST, LAST variables. */
788 char first[LONG_NAME_LEN + 1], last[LONG_NAME_LEN + 1];
790 struct dictionary *dict; /* Dictionary of output file. */
791 struct casefile *output; /* MATCH FILES output. */
792 struct ccase mtf_case; /* Case used for output. */
/* SEQ_NUMS[i] is compared against SEQ_NUM in mtf_processing to tell
   whether master variable i has already been stored for the current
   output record. */
794 unsigned seq_num; /* Have we initialized this variable? */
795 unsigned *seq_nums; /* Sequence numbers for each var in dict. */
/* Forward declarations for the MATCH FILES helpers. */
798 static bool mtf_free (struct mtf_proc *);
799 static bool mtf_close_file (struct mtf_file *);
800 static int mtf_merge_dictionary (struct dictionary *const, struct mtf_file *);
801 static bool mtf_delete_file_in_place (struct mtf_proc *, struct mtf_file **);
803 static bool mtf_read_nonactive_records (void *);
804 static bool mtf_processing_finish (void *, const struct dataset *);
805 static bool mtf_processing (const struct ccase *, void *, const struct dataset *);
807 static char *var_type_description (struct variable *);
/* Accessors linking a file variable to its master-dictionary
   counterpart; their definitions are not visible in this part of the
   file. */
809 static void set_master (struct variable *, struct variable *master);
810 static struct variable *get_master (struct variable *);
812 /* Parse and execute the MATCH FILES command.
   Parsing proceeds in two phases: first the FILE and TABLE
   specifications (each with optional RENAME and IN subcommands), whose
   dictionaries are merged into the master dictionary MTF.DICT; then the
   remaining subcommands (BY, FIRST, LAST, MAP, DROP, KEEP).  The merge
   itself is performed by mtf_processing and mtf_processing_finish, and
   on completion the dataset's dictionary and source are replaced by the
   merged output.
   Returns CMD_SUCCESS on success, CMD_CASCADING_FAILURE otherwise. */
814 cmd_match_files (struct lexer *lexer, struct dataset *ds)
817 struct mtf_file *first_table = NULL;
818 struct mtf_file *iter;
820 bool used_active_file = false;
821 bool saw_table = false;
826 mtf.head = mtf.tail = NULL;
830 mtf.dict = dict_create ();
832 case_nullify (&mtf.mtf_case);
835 dict_set_case_limit (mtf.dict, dict_get_case_limit (dataset_dict (ds)));
837 lex_match (lexer, '/');
/* Phase 1: one iteration per FILE or TABLE specification. */
838 while (lex_token (lexer) == T_ID
839 && (lex_id_match (ss_cstr ("FILE"), ss_cstr (lex_tokid (lexer)))
840 || lex_id_match (ss_cstr ("TABLE"), ss_cstr (lex_tokid (lexer)))))
842 struct mtf_file *file = xmalloc (sizeof *file);
844 if (lex_match_id (lexer, "FILE"))
845 file->type = MTF_FILE;
846 else if (lex_match_id (lexer, "TABLE"))
848 file->type = MTF_TABLE;
853 lex_match (lexer, '=');
859 file->in_name = NULL;
861 case_nullify (&file->input);
863 /* FILEs go first, then TABLEs. */
/* A TABLE, or any file seen before the first TABLE, is appended at the
   tail; a FILE seen after a TABLE is inserted just before the first
   TABLE. */
864 if (file->type == MTF_TABLE || first_table == NULL)
867 file->prev = mtf.tail;
869 mtf.tail->next = file;
871 if (mtf.head == NULL)
873 if (file->type == MTF_TABLE && first_table == NULL)
878 assert (file->type == MTF_FILE);
879 file->next = first_table;
880 file->prev = first_table->prev;
881 if (first_table->prev)
882 first_table->prev->next = file;
885 first_table->prev = file;
/* "*" designates the active file, which may be named only once and must
   already exist. */
888 if (lex_match (lexer, '*'))
893 if (used_active_file)
895 msg (SE, _("The active file may not be specified more "
899 used_active_file = true;
901 if (!proc_has_source (ds))
903 msg (SE, _("Cannot specify the active file since no active "
904 "file has been defined."));
908 if (proc_make_temporary_transformations_permanent (ds))
910 _("MATCH FILES may not be used after TEMPORARY when "
911 "the active file is an input source. "
912 "Temporary transformations will be made permanent."));
/* The active file shares the dataset's dictionary rather than owning a
   separate one. */
914 file->dict = dataset_dict (ds);
918 file->handle = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH);
919 if (file->handle == NULL)
922 file->reader = any_reader_open (file->handle, &file->dict);
923 if (file->reader == NULL)
926 case_create (&file->input, dict_get_next_value_idx (file->dict));
/* Per-file subcommands: RENAME and IN. */
929 while (lex_match (lexer, '/'))
930 if (lex_match_id (lexer, "RENAME"))
932 if (!rename_variables (lexer, file->dict))
935 else if (lex_match_id (lexer, "IN"))
937 lex_match (lexer, '=');
938 if (lex_token (lexer) != T_ID)
940 lex_error (lexer, NULL);
944 if (file->in_name != NULL)
946 msg (SE, _("Multiple IN subcommands for a single FILE or "
950 file->in_name = xstrdup (lex_tokid (lexer));
955 mtf_merge_dictionary (mtf.dict, file);
/* Phase 2: subcommands that apply to the command as a whole. */
958 while (lex_token (lexer) != '.')
960 if (lex_match (lexer, T_BY))
962 struct variable **by;
966 msg (SE, _("BY may appear at most once."));
970 lex_match (lexer, '=');
971 if (!parse_variables (lexer, mtf.dict, &by, &mtf.by_cnt,
972 PV_NO_DUPLICATE | PV_NO_SCRATCH))
/* Resolve each BY variable by name in every input file's dictionary. */
975 for (iter = mtf.head; iter != NULL; iter = iter->next)
979 iter->by = xnmalloc (mtf.by_cnt, sizeof *iter->by);
981 for (i = 0; i < mtf.by_cnt; i++)
983 iter->by[i] = dict_lookup_var (iter->dict,
984 var_get_name (by[i]));
985 if (iter->by[i] == NULL)
987 msg (SE, _("File %s lacks BY variable %s."),
988 iter->handle ? fh_get_name (iter->handle) : "*",
989 var_get_name (by[i]));
997 else if (lex_match_id (lexer, "FIRST"))
999 if (mtf.first[0] != '\0')
1001 msg (SE, _("FIRST may appear at most once."));
1005 lex_match (lexer, '=');
1006 if (!lex_force_id (lexer))
1008 strcpy (mtf.first, lex_tokid (lexer));
1011 else if (lex_match_id (lexer, "LAST"))
1013 if (mtf.last[0] != '\0')
1015 msg (SE, _("LAST may appear at most once."));
1019 lex_match (lexer, '=');
1020 if (!lex_force_id (lexer))
1022 strcpy (mtf.last, lex_tokid (lexer));
1025 else if (lex_match_id (lexer, "MAP"))
1029 else if (lex_match_id (lexer, "DROP"))
1031 if (!drop_variables (lexer, mtf.dict))
1034 else if (lex_match_id (lexer, "KEEP"))
1036 if (!keep_variables (lexer, mtf.dict))
1041 lex_error (lexer, NULL);
1045 if (!lex_match (lexer, '/') && lex_token (lexer) != '.')
1047 lex_end_of_command (lexer);
/* Without BY, neither TABLE nor IN may be used. */
1052 if (mtf.by_cnt == 0)
1056 msg (SE, _("BY is required when TABLE is specified."));
1061 msg (SE, _("BY is required when IN is specified."));
1066 /* Set up mapping from each file's variables to master
   variables. */
1068 for (iter = mtf.head; iter != NULL; iter = iter->next)
1070 struct dictionary *d = iter->dict;
1073 for (i = 0; i < dict_get_var_cnt (d); i++)
1075 struct variable *v = dict_get_var (d, i);
1076 struct variable *mv = dict_lookup_var (mtf.dict, var_get_name (v));
1082 /* Add IN variables to master dictionary. */
1083 for (iter = mtf.head; iter != NULL; iter = iter->next)
1084 if (iter->in_name != NULL)
1086 struct fmt_spec format = fmt_for_output (FMT_F, 1, 0);
1087 iter->in_var = dict_create_var (mtf.dict, iter->in_name, 0);
1088 if (iter->in_var == NULL)
/* dict_create_var failed, so iter->in_var is null here and must not be
   dereferenced; report the requested name from iter->in_name instead. */
1090 msg (SE, _("IN variable name %s duplicates an "
1091 "existing variable name."),
1092 iter->in_name);
1095 var_set_both_formats (iter->in_var, &format);
1098 /* MATCH FILES performs an n-way merge on all its input files.
1101 1. Read one input record from every input FILE.
1103 2. If no FILEs are left, stop. Otherwise, proceed to step 3.
1105 3. Find the FILE input record(s) that have minimum BY
1106 values. Store all the values from these input records into
   the output record.
1109 4. For every TABLE, read another record as long as the BY values
1110 on the TABLE's input record are less than the FILEs' BY values.
1111 If an exact match is found, store all the values from the TABLE
1112 input record into the output record.
1114 5. Write the output record.
1116 6. Read another record from each input file FILE and TABLE that
1117 we stored values from above. If we come to the end of one of the
1118 input files, remove it from the list of input files.
1120 7. Repeat from step 2.
1122 Unfortunately, this algorithm can't be implemented in a
1123 straightforward way because there's no function to read a
1124 record from the active file. Instead, it has to be written
   so that the active file's records arrive through the procedure
   callback, mtf_processing.
1127 FIXME: For merging large numbers of files (more than 10?) a
1128 better algorithm would use a heap for finding minimum
   BY values. */
1131 if (!used_active_file)
1132 discard_variables (ds);
1134 dict_compact_values (mtf.dict);
1135 mtf.output = fastfile_create (dict_get_next_value_idx (mtf.dict));
1136 mtf.seq_nums = xcalloc (dict_get_var_cnt (mtf.dict), sizeof *mtf.seq_nums);
1137 case_create (&mtf.mtf_case, dict_get_next_value_idx (mtf.dict));
1139 if (!mtf_read_nonactive_records (&mtf))
/* With the active file as an input, the merge is driven by procedure();
   otherwise mtf_processing_finish performs it alone. */
1142 if (used_active_file)
1145 create_case_sink (&null_sink_class,
1146 dataset_dict (ds), NULL));
1148 ( procedure (ds, mtf_processing, &mtf) &&
1149 mtf_processing_finish (&mtf, ds) );
1152 ok = mtf_processing_finish (&mtf, ds);
/* Replace the dataset's dictionary and source with the merged result. */
1154 discard_variables (ds);
1156 dict_destroy (dataset_dict (ds));
1157 dataset_set_dict (ds, mtf.dict);
1159 proc_set_source (ds, storage_source_create (mtf.output));
1162 if (!mtf_free (&mtf))
1164 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
1168 return CMD_CASCADING_FAILURE;
1171 /* Repeats 2...7 an arbitrary number of times.
   Removes the active file (the entry with a null handle) from the file
   list, then keeps calling mtf_processing with a null case for as long
   as the head of the list is a FILE. */
1173 mtf_processing_finish (void *mtf_, const struct dataset *ds)
1175 struct mtf_proc *mtf = mtf_;
1176 struct mtf_file *iter;
1178 /* Find the active file and delete it. */
1179 for (iter = mtf->head; iter; iter = iter->next)
1180 if (iter->handle == NULL)
1182 if (!mtf_delete_file_in_place (mtf, &iter))
1187 while (mtf->head && mtf->head->type == MTF_FILE)
1188 if (!mtf_processing (NULL, mtf, ds))
1194 /* Return a string in a static buffer describing V's variable type and
   width.  Two static buffers are declared, presumably so that two
   results can be used in a single message (as mtf_merge_dictionary
   does) -- the code that selects the buffer is not visible here. */
1197 var_type_description (struct variable *v)
1199 static char buf[2][32];
1206 if (var_is_numeric (v))
1207 strcpy (s, "numeric");
1209 sprintf (s, "string with width %d", var_get_width (v));
1213 /* Closes FILE and frees its associated data.
1214 Returns true if successful, false if an I/O error
1215 occurred on FILE. */
1217 mtf_close_file (struct mtf_file *file)
/* The error status is sampled before the reader is closed. */
1219 bool ok = file->reader == NULL || !any_reader_error (file->reader);
1221 any_reader_close (file->reader);
/* Only files with a handle own their dictionary; the active file (null
   handle) uses the dataset's dictionary, which is left alone here. */
1222 if (file->handle != NULL)
1223 dict_destroy (file->dict);
1224 case_destroy (&file->input);
1225 free (file->in_name);
1230 /* Free all the data for the MATCH FILES procedure.
1231 Returns true if successful, false if an I/O error
   occurred. */
1234 mtf_free (struct mtf_proc *mtf)
1236 struct mtf_file *iter, *next;
/* Close every remaining input file, then release the master dictionary,
   the output case, and the sequence-number array. */
1239 for (iter = mtf->head; iter; iter = next)
1242 assert (iter->dict != mtf->dict);
1243 if (!mtf_close_file (iter))
1248 dict_destroy (mtf->dict);
1249 case_destroy (&mtf->mtf_case);
1250 free (mtf->seq_nums);
1255 /* Remove *FILE from the mtf_file chain. Make *FILE point to the next
1256 file in the chain, or to NULL if it was the last in the chain.
1257 Returns true if successful, false if an I/O error occurred. */
1259 mtf_delete_file_in_place (struct mtf_proc *mtf, struct mtf_file **file)
1261 struct mtf_file *f = *file;
/* Unlink F from the doubly linked list, updating head and tail as
   needed. */
1265 f->prev->next = f->next;
1267 f->next->prev = f->prev;
1269 mtf->head = f->next;
1271 mtf->tail = f->prev;
/* Reset the removed file's contribution to the output case: its IN
   variable becomes 0 and its string variables are blanked (the value
   stored for numeric variables is on a line not visible here). */
1274 if (f->in_var != NULL)
1275 case_data_rw (&mtf->mtf_case, f->in_var->fv)->f = 0.;
1276 for (i = 0; i < dict_get_var_cnt (f->dict); i++)
1278 struct variable *v = dict_get_var (f->dict, i);
1279 struct variable *mv = get_master (v);
1282 union value *out = case_data_rw (&mtf->mtf_case, mv->fv);
1284 if (var_is_numeric (v))
1287 memset (out->s, ' ', var_get_width (v));
1291 return mtf_close_file (f);
1294 /* Read a record from every input file except the active file.
1295 Returns true if successful, false if an I/O error occurred. */
1297 mtf_read_nonactive_records (void *mtf_)
1299 struct mtf_proc *mtf = mtf_;
1300 struct mtf_file *iter, *next;
/* A file whose read fails is removed from the list; NEXT is kept
   separately because removal changes ITER. */
1303 for (iter = mtf->head; ok && iter != NULL; iter = next)
1306 if (iter->handle && !any_reader_read (iter->reader, &iter->input))
1307 if (!mtf_delete_file_in_place (mtf, &iter))
1313 /* Compare the BY variables for files A and B; return -1 if A < B, 0
1314 if A == B, 1 if A > B.
   For a file whose own input case is null, case C supplies the values
   instead. */
1316 mtf_compare_BY_values (struct mtf_proc *mtf,
1317 struct mtf_file *a, struct mtf_file *b,
1318 const struct ccase *c)
1320 const struct ccase *ca = case_is_null (&a->input) ? c : &a->input;
1321 const struct ccase *cb = case_is_null (&b->input) ? c : &b->input;
/* NOTE(review): this assertion runs after A and B have already been
   dereferenced above, so it cannot guard those accesses. */
1322 assert ((a == NULL) + (b == NULL) + (c == NULL) <= 1);
1323 return case_compare_2dict (ca, cb, a->by, b->by, mtf->by_cnt);
1326 /* Perform one iteration of steps 3...7 above.
1327 Returns true if successful, false if an I/O error occurred.
   C is the current case of the active file; it is used for any file
   whose own input case is null.  The loop repeats until a new record
   from the active file is needed or no FILE remains at the head of the
   list. */
1329 mtf_processing (const struct ccase *c, void *mtf_, const struct dataset *ds UNUSED)
1331 struct mtf_proc *mtf = mtf_;
1333 /* Do we need another record from the active file? */
1334 bool read_active_file;
1336 assert (mtf->head != NULL);
1337 if (mtf->head->type == MTF_TABLE)
1342 struct mtf_file *min_head, *min_tail; /* Files with minimum BY values. */
1343 struct mtf_file *max_head, *max_tail; /* Files with non-minimum BYs. */
1344 struct mtf_file *iter, *next;
1346 read_active_file = false;
1348 /* 3. Find the FILE input record(s) that have minimum BY
1349 values. Store all the values from these input records into
1350 the output record. */
/* Files are partitioned into two chains linked through next_min: the
   "min" chain (current minimum BY values) and the "max" chain
   (everything else). */
1351 min_head = min_tail = mtf->head;
1352 max_head = max_tail = NULL;
1353 for (iter = mtf->head->next; iter && iter->type == MTF_FILE;
1356 int cmp = mtf_compare_BY_values (mtf, min_head, iter, c);
1360 max_tail = max_tail->next_min = iter;
1362 max_head = max_tail = iter;
1365 min_tail = min_tail->next_min = iter;
1370 max_tail->next_min = min_head;
1371 max_tail = min_tail;
1375 max_head = min_head;
1376 max_tail = min_tail;
1378 min_head = min_tail = iter;
1382 /* 4. For every TABLE, read another record as long as the BY
1383 values on the TABLE's input record are less than the FILEs'
1384 BY values. If an exact match is found, store all the values
1385 from the TABLE input record into the output record. */
1386 for (; iter != NULL; iter = next)
1388 assert (iter->type == MTF_TABLE);
1393 int cmp = mtf_compare_BY_values (mtf, min_head, iter, c);
1397 max_tail = max_tail->next_min = iter;
1399 max_head = max_tail = iter;
1402 min_tail = min_tail->next_min = iter;
1405 if (iter->handle == NULL)
1407 if (any_reader_read (iter->reader, &iter->input))
1409 if (!mtf_delete_file_in_place (mtf, &iter))
1416 /* Next sequence number. */
1419 /* Store data to all the records we are using. */
1421 min_tail->next_min = NULL;
1422 for (iter = min_head; iter; iter = iter->next_min)
1426 for (i = 0; i < dict_get_var_cnt (iter->dict); i++)
1428 struct variable *v = dict_get_var (iter->dict, i);
1429 struct variable *mv = get_master (v);
/* A master variable already stamped with the current sequence number
   has been stored for this output record and is not overwritten. */
1431 if (mv != NULL && mtf->seq_nums[mv->index] != mtf->seq_num)
1433 const struct ccase *record
1434 = case_is_null (&iter->input) ? c : &iter->input;
1435 union value *out = case_data_rw (&mtf->mtf_case, mv->fv);
1437 mtf->seq_nums[mv->index] = mtf->seq_num;
1438 if (var_is_numeric (v))
1439 out->f = case_num (record, v->fv);
1441 memcpy (out->s, case_str (record, v->fv),
1445 if (iter->in_var != NULL)
1446 case_data_rw (&mtf->mtf_case, iter->in_var->fv)->f = 1.;
/* A contributing FILE with a null handle is the active file, so its next
   record must come from the caller. */
1448 if (iter->type == MTF_FILE && iter->handle == NULL)
1449 read_active_file = true;
1452 /* Store missing values to all the records we're not
   using. */
1455 max_tail->next_min = NULL;
1456 for (iter = max_head; iter; iter = iter->next_min)
1460 for (i = 0; i < dict_get_var_cnt (iter->dict); i++)
1462 struct variable *v = dict_get_var (iter->dict, i);
1463 struct variable *mv = get_master (v);
1465 if (mv != NULL && mtf->seq_nums[mv->index] != mtf->seq_num)
1467 union value *out = case_data_rw (&mtf->mtf_case, mv->fv);
1468 mtf->seq_nums[mv->index] = mtf->seq_num;
1470 if (var_is_numeric (v))
1473 memset (out->s, ' ', var_get_width (v));
1476 if (iter->in_var != NULL)
1477 case_data_rw (&mtf->mtf_case, iter->in_var->fv)->f = 0.;
1480 /* 5. Write the output record. */
1481 casefile_append (mtf->output, &mtf->mtf_case);
1483 /* 6. Read another record from each input file FILE and TABLE
1484 that we stored values from above. If we come to the end of
1485 one of the input files, remove it from the list of input
   files. */
1487 for (iter = min_head; iter && iter->type == MTF_FILE; iter = next)
1489 next = iter->next_min;
1490 if (iter->reader != NULL
1491 && !any_reader_read (iter->reader, &iter->input))
1492 if (!mtf_delete_file_in_place (mtf, &iter))
1496 while (!read_active_file
1497 && mtf->head != NULL && mtf->head->type == MTF_FILE);
1502 /* Merges the dictionary for file F into master dictionary M.

   From the statements visible here: M receives F's dictionary label
   only when M has no label of its own; F's documents are either
   installed in M as-is or appended after M's existing documents;
   and every variable in F's dictionary is looked up by name in M.
   When a same-named variable in M has a different width, an error
   is reported with msg (SE, ...).  When the widths agree, value
   labels and missing values are copied from F's variable only if
   M's variable has none; the variable label is likewise copied
   only if M's variable has none.  dict_clone_var_assert() is used
   to add F's variable to M under its own name.

   NOTE(review): this listing omits some lines of the function
   (braces, else/continue/return statements, local declarations),
   so the conditions guarding lines 1518, 1524-1528, 1538 and 1569
   are not visible and should be confirmed against the full file. */
1504 mtf_merge_dictionary (struct dictionary *const m, struct mtf_file *f)
1506 struct dictionary *d = f->dict;
1507 const char *d_docs, *m_docs;
1510 if (dict_get_label (m) == NULL) /* M has no label yet... */
1511 dict_set_label (m, dict_get_label (d)); /* ...so it takes D's. */
1513 d_docs = dict_get_documents (d);
1514 m_docs = dict_get_documents (m);
1518 dict_set_documents (m, d_docs); /* Install D's documents unchanged. */
1524 new_len = strlen (m_docs) + strlen (d_docs); /* Combined length of both document strings. */
1525 new_docs = xmalloc (new_len + 1); /* +1 for the terminating null byte. */
1526 strcpy (new_docs, m_docs); /* M's documents come first... */
1527 strcat (new_docs, d_docs); /* ...followed by D's. */
1528 dict_set_documents (m, new_docs);
1533 for (i = 0; i < dict_get_var_cnt (d); i++) /* Visit every variable of D. */
1535 struct variable *dv = dict_get_var (d, i);
1536 struct variable *mv = dict_lookup_var (m, var_get_name (dv)); /* Same-named variable in M, if any. */
1538 if (dict_class_from_id (var_get_name (dv)) == DC_SCRATCH) /* DV is a scratch variable; the guarded statement is not shown here. */
1543 if (var_get_width (mv) != var_get_width (dv)) /* Widths disagree: report the conflict. */
1545 msg (SE, _("Variable %s in file %s (%s) has different "
1546 "type or width from the same variable in "
1547 "earlier file (%s)."),
1548 var_get_name (dv), fh_get_name (f->handle),
1549 var_type_description (dv), var_type_description (mv));
1553 if (var_get_width (dv) == var_get_width (mv)) /* Widths agree: fill in what MV lacks. */
1555 if (val_labs_count (dv->val_labs)
1556 && !val_labs_count (mv->val_labs))
1558 val_labs_destroy (mv->val_labs); /* Release MV's empty label set before replacing it. */
1559 mv->val_labs = val_labs_copy (dv->val_labs);
1561 if (var_has_missing_values (dv) && !var_has_missing_values (mv))
1562 var_set_missing_values (mv, var_get_missing_values (dv));
1565 if (var_get_label (dv) && !var_get_label (mv)) /* Only fill in a label MV does not have. */
1566 var_set_label (mv, var_get_label (dv));
1569 mv = dict_clone_var_assert (m, dv, var_get_name (dv)); /* Clone DV into M under DV's own name. */
1575 /* Marks V's master variable as MASTER.
   The association is recorded by attaching MASTER to V with
   var_attach_aux(), passing a null pointer as the third argument
   (presumably the aux destructor, so that MASTER is not freed
   along with V -- confirm against var_attach_aux()). */
1577 set_master (struct variable *v, struct variable *master)
1579 var_attach_aux (v, master, NULL);
1582 /* Returns the master variable corresponding to V,
1583 as set with set_master().
   NOTE(review): the body of this function is absent from this
   listing; only its return type and parameter list are visible. */
1584 static struct variable *
1585 get_master (struct variable *v)
1592 /* A case map copies data from a case that corresponds to one
1593 dictionary to a case that corresponds to a second dictionary
1594 derived from the first by, optionally, deleting, reordering,
1595 or renaming variables. (No new variables may be created.) */
1601 size_t value_cnt; /* Number of values in map; map_case()
   visits destination indexes 0 through value_cnt - 1. */
1602 int *map; /* For each destination index, the
1603 corresponding source index.  finish_case_map() tests entries
   against -1, which appears to mark a destination index that has
   no source. */
1606 /* Prepares dictionary D for producing a case map. Afterward,
1607 the caller may delete, reorder, or rename variables within D
1608 at will before using finish_case_map() to produce the case
   map.

1611 Uses D's aux members, which must otherwise not be in use.

   Each variable in D gets a freshly allocated int attached as its
   aux data, with var_dtor_free passed as the third argument to
   var_attach_aux().
   NOTE(review): the statement that stores a value into that int is
   absent from this listing; finish_case_map() compares it with the
   variable's fv, so it presumably records the variable's fv at
   this point -- confirm against the full file. */
1613 start_case_map (struct dictionary *d)
1615 size_t var_cnt = dict_get_var_cnt (d);
1618 for (i = 0; i < var_cnt; i++) /* One aux record per variable in D. */
1620 struct variable *v = dict_get_var (d, i);
1621 int *src_fv = xmalloc (sizeof *src_fv); /* Storage for a single int. */
1623 var_attach_aux (v, src_fv, var_dtor_free); /* Hang the record on V's aux member. */
1627 /* Produces a case map from dictionary D, which must have been
1628 previously prepared with start_case_map().
1630 Does not retain any reference to D, and clears the aux members
1631 set up by start_case_map().
1633 Returns the new case map, or a null pointer if no mapping is
1634 required (that is, no data has changed position).

   The map array is allocated with one entry per value index in D,
   as given by dict_get_next_value_idx().  Each variable fills
   var_get_value_cnt() consecutive entries starting at its current
   fv with consecutive source indexes starting at the int that was
   attached as its aux data.

   NOTE(review): several lines of this function are absent from
   this listing (the bodies of the loops at lines 1646 and 1677,
   the statement guarded by line 1657, the condition guarding line
   1673, and the return statements); confirm them against the full
   file. */
1635 static struct case_map *
1636 finish_case_map (struct dictionary *d)
1638 struct case_map *map;
1639 size_t var_cnt = dict_get_var_cnt (d);
1643 map = xmalloc (sizeof *map);
1644 map->value_cnt = dict_get_next_value_idx (d);
1645 map->map = xnmalloc (map->value_cnt, sizeof *map->map); /* One entry per value index. */
1646 for (i = 0; i < map->value_cnt; i++) /* Body not shown; presumably sets each entry to -1, which the assert at line 1665 expects. */
1650 for (i = 0; i < var_cnt; i++) /* Fill in the entries for each variable. */
1652 struct variable *v = dict_get_var (d, i);
1653 size_t value_cnt = var_get_value_cnt (v); /* Number of consecutive values V occupies. */
1654 int *src_fv = (int *) var_detach_aux (v); /* Take back the int attached as V's aux data. */
1657 if (v->fv != *src_fv) /* V's data has changed position; guarded statement not shown. */
1660 for (idx = 0; idx < value_cnt; idx++)
1662 int src_idx = *src_fv + idx;
1663 int dst_idx = v->fv + idx;
1665 assert (map->map[dst_idx] == -1); /* No destination index may be assigned twice. */
1666 map->map[dst_idx] = src_idx;
1673 destroy_case_map (map); /* Presumably the path on which no mapping is required -- confirm. */
1677 while (map->value_cnt > 0 && map->map[map->value_cnt - 1] == -1) /* Trailing entries equal to -1; loop body not shown. */
1683 /* Maps from SRC to DST, applying case map MAP.
   For each destination index I below MAP->value_cnt, the value at
   index MAP->map[I] of SRC is copied to index I of DST.  MAP, SRC
   and DST must be non-null, and SRC and DST must be different
   cases; all four conditions are asserted.
   NOTE(review): line 1698 is absent from this listing, so any
   guard on the copy (for example for entries equal to -1) is not
   visible here -- confirm against the full file. */
1685 map_case (const struct case_map *map,
1686 const struct ccase *src, struct ccase *dst)
1690 assert (map != NULL);
1691 assert (src != NULL);
1692 assert (dst != NULL);
1693 assert (src != dst); /* Mapping a case onto itself is not supported. */
1695 for (dst_idx = 0; dst_idx < map->value_cnt; dst_idx++)
1697 int src_idx = map->map[dst_idx]; /* Where this destination value comes from. */
1699 *case_data_rw (dst, dst_idx) = *case_data (src, src_idx); /* Copy one whole value. */
1703 /* Destroys case map MAP. */
1705 destroy_case_map (struct case_map *map)