1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
24 #include <data/any-reader.h>
25 #include <data/any-writer.h>
26 #include <data/case-sink.h>
27 #include <data/case-source.h>
28 #include <data/case.h>
29 #include <data/casefile.h>
30 #include <data/fastfile.h>
31 #include <data/format.h>
32 #include <data/dictionary.h>
33 #include <data/por-file-writer.h>
34 #include <data/procedure.h>
35 #include <data/settings.h>
36 #include <data/storage-stream.h>
37 #include <data/sys-file-writer.h>
38 #include <data/transformations.h>
39 #include <data/value-labels.h>
40 #include <data/variable.h>
41 #include <language/command.h>
42 #include <language/data-io/file-handle.h>
43 #include <language/lexer/lexer.h>
44 #include <language/lexer/variable-parser.h>
45 #include <libpspp/alloc.h>
46 #include <libpspp/assertion.h>
47 #include <libpspp/compiler.h>
48 #include <libpspp/hash.h>
49 #include <libpspp/message.h>
50 #include <libpspp/message.h>
51 #include <libpspp/misc.h>
52 #include <libpspp/str.h>
55 #define _(msgid) gettext (msgid)
/* Rearranging and reducing a dictionary.

   A case map is started on a dictionary before it is trimmed or
   reordered and finished afterward; the resulting map is then used
   to translate cases from the old layout to the new one. */
static void start_case_map (struct dictionary *dict);
static struct case_map *finish_case_map (struct dictionary *dict);
static void map_case (const struct case_map *map,
                      const struct ccase *src, struct ccase *dst);
static void destroy_case_map (struct case_map *map);

/* Parser for the dictionary-trimming subcommands shared by the
   commands in this file. */
static bool parse_dict_trim (struct lexer *lexer, struct dictionary *dict);
66 /* Reading system and portable files. */
68 /* Type of command. */
75 /* Case reader input program. */
76 struct case_reader_pgm
78 struct any_reader *reader; /* File reader. */
79 struct case_map *map; /* Map from file dict to active file dict. */
80 struct ccase bounce; /* Bounce buffer. */
83 static const struct case_source_class case_reader_source_class;
85 static void case_reader_pgm_free (struct case_reader_pgm *);
87 /* Parses a GET or IMPORT command. */
/* LEXER supplies the tokens, DS is the dataset whose dictionary and
   case source are replaced on success, and TYPE tells GET apart from
   IMPORT (only IMPORT accepts the TYPE subcommand).  The failure path
   visible at the end frees the reader program and returns
   CMD_CASCADING_FAILURE. */
89 parse_read_command (struct lexer *lexer, struct dataset *ds, enum reader_command type)
91 struct case_reader_pgm *pgm = NULL;
92 struct file_handle *fh = NULL;
93 struct dictionary *dict = NULL;
97 lex_match (lexer, '/');
/* The file may be introduced by the FILE keyword or given directly
   as a string token. */
99 if (lex_match_id (lexer, "FILE") || lex_token (lexer) == T_STRING)
101 lex_match (lexer, '=');
103 fh = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH);
/* TYPE=COMM|TAPE is recognized for IMPORT only. */
107 else if (type == IMPORT_CMD && lex_match_id (lexer, "TYPE"))
109 lex_match (lexer, '=');
111 if (lex_match_id (lexer, "COMM"))
113 else if (lex_match_id (lexer, "TAPE"))
117 lex_error (lexer, _("expecting COMM or TAPE"));
/* Reports a missing FILE subcommand.  NOTE(review): the condition
   guarding this call is not visible here; presumably FH is still
   null -- confirm. */
127 lex_sbc_missing (lexer, "FILE");
/* The dataset's current variables are discarded before the new file
   is opened. */
131 discard_variables (ds);
133 pgm = xmalloc (sizeof *pgm);
134 pgm->reader = any_reader_open (fh, &dict);
/* The bounce case is nullified before the open result is checked;
   case_reader_pgm_free destroys it on the failure path. */
136 case_nullify (&pgm->bounce);
137 if (pgm->reader == NULL)
140 case_create (&pgm->bounce, dict_get_next_value_idx (dict));
/* Everything up to the terminating '.' must be a dictionary-trimming
   subcommand.  The case map is started before and finished after
   these edits to the file's dictionary. */
142 start_case_map (dict);
144 while (lex_token (lexer) != '.')
146 lex_match (lexer, '/');
147 if (!parse_dict_trim (lexer, dict))
151 pgm->map = finish_case_map (dict);
/* The file's (trimmed) dictionary replaces the dataset's dictionary,
   and PGM becomes the auxiliary data of the new case source. */
153 dict_destroy (dataset_dict (ds));
154 dataset_set_dict (ds, dict);
157 create_case_source (&case_reader_source_class, pgm));
/* Failure path: release the partially built reader program. */
162 case_reader_pgm_free (pgm);
165 return CMD_CASCADING_FAILURE;
168 /* Frees a struct case_reader_pgm. */
170 case_reader_pgm_free (struct case_reader_pgm *pgm)
174 any_reader_close (pgm->reader);
175 destroy_case_map (pgm->map);
176 case_destroy (&pgm->bounce);
181 /* Clears internal state related to case reader input procedure. */
183 case_reader_source_destroy (struct case_source *source)
185 struct case_reader_pgm *pgm = source->aux;
186 case_reader_pgm_free (pgm);
189 /* Reads all the cases from the data file into C and passes them
190 to WRITE_CASE one by one, passing WC_DATA.
191 Returns true if successful, false if an I/O error occurred. */
193 case_reader_source_read (struct case_source *source,
195 write_case_func *write_case, write_case_data wc_data)
/* SOURCE->aux is the reader program that parse_read_command attached
   when it created the case source. */
197 struct case_reader_pgm *pgm = source->aux;
/* Without a case map the reader fills C directly. */
203 if (pgm->map == NULL)
204 got_case = any_reader_read (pgm->reader, c);
/* With a case map the raw record goes to the bounce buffer first and
   is then rearranged into C. */
207 got_case = any_reader_read (pgm->reader, &pgm->bounce);
209 map_case (pgm->map, &pgm->bounce, c);
/* Hand the case to the caller's callback. */
214 ok = write_case (wc_data);
/* Success requires both a successful callback and an error-free
   reader. */
218 return ok && !any_reader_error (pgm->reader);
/* Case source class used by GET and IMPORT.  The entries visible here
   are the read callback and the destroy callback. */
221 static const struct case_source_class case_reader_source_class =
225 case_reader_source_read,
226 case_reader_source_destroy,
231 cmd_get (struct lexer *lexer, struct dataset *ds)
233 return parse_read_command (lexer, ds, GET_CMD);
238 cmd_import (struct lexer *lexer, struct dataset *ds)
240 return parse_read_command (lexer, ds, IMPORT_CMD);
243 /* Writing system and portable files. */
/* Type of output file. */
enum writer_type
  {
    /* System file. */
    SYSFILE_WRITER,

    /* Portable file. */
    PORFILE_WRITER
  };
/* Type of a command. */
enum command_type
  {
    /* Transformation. */
    XFORM_CMD,

    /* Procedure. */
    PROC_CMD
  };
259 /* File writer plus a case map. */
262 struct any_writer *writer; /* File writer. */
263 struct case_map *map; /* Map to output file dictionary
264 (null pointer for identity mapping). */
265 struct ccase bounce; /* Bounce buffer for mapping (if needed). */
270 case_writer_destroy (struct case_writer *aw)
275 ok = any_writer_close (aw->writer);
276 destroy_case_map (aw->map);
277 case_destroy (&aw->bounce);
283 /* Parses SAVE or XSAVE or EXPORT or XEXPORT command.
284 WRITER_TYPE identifies the type of file to write,
285 and COMMAND_TYPE identifies the type of command.
287 On success, returns a writer.
288 For procedures only, sets *RETAIN_UNSELECTED to true if cases
289 that would otherwise be excluded by FILTER or USE should be
292 On failure, returns a null pointer. */
/* The returned writer is released with case_writer_destroy, which is
   also what the cleanup call at the end of this function uses. */
293 static struct case_writer *
294 parse_write_command (struct lexer *lexer, struct dataset *ds,
295 enum writer_type writer_type,
296 enum command_type command_type,
297 bool *retain_unselected)
300 struct file_handle *handle; /* Output file. */
301 struct dictionary *dict; /* Dictionary for output file. */
302 struct case_writer *aw; /* Writer. */
304 /* Common options. */
305 bool print_map; /* Print map? TODO. */
306 bool print_short_names; /* Print long-to-short name map. TODO. */
307 struct sfm_write_options sysfile_opts;
308 struct pfm_write_options porfile_opts;
/* RETAIN_UNSELECTED must be supplied exactly when a procedure is
   being parsed. */
310 assert (writer_type == SYSFILE_WRITER || writer_type == PORFILE_WRITER);
311 assert (command_type == XFORM_CMD || command_type == PROC_CMD);
312 assert ((retain_unselected != NULL) == (command_type == PROC_CMD));
/* Procedures retain unselected cases unless UNSELECTED=DELETE is
   given. */
314 if (command_type == PROC_CMD)
315 *retain_unselected = true;
/* The subcommands edit a clone, so the dataset's own dictionary is
   left untouched. */
318 dict = dict_clone (dataset_dict (ds));
319 aw = xmalloc (sizeof *aw);
322 case_nullify (&aw->bounce);
324 print_short_names = false;
325 sysfile_opts = sfm_writer_default_options ();
326 porfile_opts = pfm_writer_default_options ();
/* Start recording the rearrangement of the output dictionary, then
   remove scratch variables from it. */
328 start_case_map (dict);
329 dict_delete_scratch_vars (dict);
331 lex_match (lexer, '/');
334 if (lex_match_id (lexer, "OUTFILE"))
/* NOTE(review): the guard for this "only once" report is not visible
   here; presumably HANDLE was already set -- confirm. */
338 lex_sbc_only_once ("OUTFILE");
342 lex_match (lexer, '=');
344 handle = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH);
348 else if (lex_match_id (lexer, "NAMES"))
349 print_short_names = true;
350 else if (lex_match_id (lexer, "PERMISSIONS"))
354 lex_match (lexer, '=');
355 if (lex_match_id (lexer, "READONLY"))
357 else if (lex_match_id (lexer, "WRITEABLE"))
361 lex_error (lexer, _("expecting %s or %s"), "READONLY", "WRITEABLE");
/* The permission setting is recorded in both option sets. */
364 sysfile_opts.create_writeable = porfile_opts.create_writeable = cw;
/* UNSELECTED is accepted for procedures only. */
366 else if (command_type == PROC_CMD && lex_match_id (lexer, "UNSELECTED"))
368 lex_match (lexer, '=');
369 if (lex_match_id (lexer, "RETAIN"))
370 *retain_unselected = true;
371 else if (lex_match_id (lexer, "DELETE"))
372 *retain_unselected = false;
375 lex_error (lexer, _("expecting %s or %s"), "RETAIN", "DELETE");
/* Subcommands accepted only when writing a system file. */
379 else if (writer_type == SYSFILE_WRITER && lex_match_id (lexer, "COMPRESSED"))
380 sysfile_opts.compress = true;
381 else if (writer_type == SYSFILE_WRITER && lex_match_id (lexer, "UNCOMPRESSED"))
382 sysfile_opts.compress = false;
383 else if (writer_type == SYSFILE_WRITER && lex_match_id (lexer, "VERSION"))
385 lex_match (lexer, '=');
386 if (!lex_force_int (lexer))
388 sysfile_opts.version = lex_integer (lexer);
/* Subcommands accepted only when writing a portable file. */
391 else if (writer_type == PORFILE_WRITER && lex_match_id (lexer, "TYPE"))
393 lex_match (lexer, '=');
394 if (lex_match_id (lexer, "COMMUNICATIONS"))
395 porfile_opts.type = PFM_COMM;
396 else if (lex_match_id (lexer, "TAPE"))
397 porfile_opts.type = PFM_TAPE;
400 lex_error (lexer, _("expecting %s or %s"), "COMM", "TAPE");
404 else if (writer_type == PORFILE_WRITER && lex_match_id (lexer, "DIGITS"))
406 lex_match (lexer, '=');
407 if (!lex_force_int (lexer))
409 porfile_opts.digits = lex_integer (lexer);
/* Anything else must be one of the shared dictionary-trimming
   subcommands. */
412 else if (!parse_dict_trim (lexer, dict))
415 if (!lex_match (lexer, '/'))
418 if (lex_end_of_command (lexer) != CMD_SUCCESS)
423 lex_sbc_missing (lexer, "OUTFILE");
/* Compact the output dictionary's values, finish the case map, and
   size the bounce case for the output dictionary. */
427 dict_compact_values (dict);
428 aw->map = finish_case_map (dict);
430 case_create (&aw->bounce, dict_get_next_value_idx (dict));
/* Handles that refer to ordinary files are opened with the system or
   portable file writer and the parsed options.  The any_writer_open
   call below is presumably the branch for other referents -- the
   branch structure is not visible here. */
432 if (fh_get_referent (handle) == FH_REF_FILE)
437 aw->writer = any_writer_from_sfm_writer (
438 sfm_open_writer (handle, dict, sysfile_opts));
441 aw->writer = any_writer_from_pfm_writer (
442 pfm_open_writer (handle, dict, porfile_opts));
447 aw->writer = any_writer_open (handle, dict);
448 if (aw->writer == NULL)
/* Cleanup of the partially constructed writer. */
455 case_writer_destroy (aw);
460 /* Writes case C to writer AW. */
462 case_writer_write_case (struct case_writer *aw, const struct ccase *c)
466 map_case (aw->map, c, &aw->bounce);
469 return any_writer_write (aw->writer, c);
472 /* SAVE and EXPORT. */
/* Per-case callback handed to procedure() by SAVE and EXPORT. */
static bool output_proc (const struct ccase *c, void *aw_,
                         const struct dataset *ds);
476 /* Parses and performs the SAVE or EXPORT procedure. */
478 parse_output_proc (struct lexer *lexer, struct dataset *ds, enum writer_type writer_type)
480 bool retain_unselected;
481 struct variable *saved_filter_variable;
482 struct case_writer *aw;
485 aw = parse_write_command (lexer, ds, writer_type, PROC_CMD, &retain_unselected);
487 return CMD_CASCADING_FAILURE;
489 saved_filter_variable = dict_get_filter (dataset_dict (ds));
490 if (retain_unselected)
491 dict_set_filter (dataset_dict (ds), NULL);
492 ok = procedure (ds, output_proc, aw);
493 dict_set_filter (dataset_dict (ds), saved_filter_variable);
495 case_writer_destroy (aw);
496 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
499 /* Writes case C to file. */
501 output_proc (const struct ccase *c, void *aw_, const struct dataset *ds UNUSED)
503 struct case_writer *aw = aw_;
504 return case_writer_write_case (aw, c);
508 cmd_save (struct lexer *lexer, struct dataset *ds)
510 return parse_output_proc (lexer, ds, SYSFILE_WRITER);
514 cmd_export (struct lexer *lexer, struct dataset *ds)
516 return parse_output_proc (lexer, ds, PORFILE_WRITER);
519 /* XSAVE and XEXPORT. */
/* Transformation state for XSAVE and XEXPORT. */
struct output_trns
  {
    /* Writer. */
    struct case_writer *aw;
  };
/* Callbacks registered with add_transformation by parse_output_trns:
   the per-case function and the function that frees the
   transformation. */
527 static trns_proc_func output_trns_proc;
528 static trns_free_func output_trns_free;
530 /* Parses the XSAVE or XEXPORT transformation command. */
532 parse_output_trns (struct lexer *lexer, struct dataset *ds, enum writer_type writer_type)
534 struct output_trns *t = xmalloc (sizeof *t);
535 t->aw = parse_write_command (lexer, ds, writer_type, XFORM_CMD, NULL);
539 return CMD_CASCADING_FAILURE;
542 add_transformation (ds, output_trns_proc, output_trns_free, t);
546 /* Writes case C to the system file specified on XSAVE or XEXPORT. */
548 output_trns_proc (void *trns_, struct ccase *c, casenumber case_num UNUSED)
550 struct output_trns *t = trns_;
551 case_writer_write_case (t->aw, c);
552 return TRNS_CONTINUE;
555 /* Frees an XSAVE or XEXPORT transformation.
556 Returns true if successful, false if an I/O error occurred. */
558 output_trns_free (void *trns_)
560 struct output_trns *t = trns_;
565 ok = case_writer_destroy (t->aw);
573 cmd_xsave (struct lexer *lexer, struct dataset *ds)
575 return parse_output_trns (lexer, ds, SYSFILE_WRITER);
578 /* XEXPORT command. */
580 cmd_xexport (struct lexer *lexer, struct dataset *ds)
582 return parse_output_trns (lexer, ds, PORFILE_WRITER);
/* Parsers for the individual dictionary-trimming subcommands.  Each
   one edits DICT in place. */
static bool rename_variables (struct lexer *lexer, struct dictionary *dict);
static bool drop_variables (struct lexer *lexer, struct dictionary *dict);
static bool keep_variables (struct lexer *lexer, struct dictionary *dict);
/* Parses one of the dictionary-trimming subcommands shared by the
   commands that read and write system files (MAP, DROP, KEEP,
   RENAME) and applies it to DICT.  MAP is accepted but currently
   does nothing.
   Returns true on success, false on failure (including when the
   next token is none of these subcommands). */
static bool
parse_dict_trim (struct lexer *lexer, struct dictionary *dict)
{
  if (lex_match_id (lexer, "MAP"))
    {
      /* FIXME. */
      return true;
    }

  if (lex_match_id (lexer, "DROP"))
    return drop_variables (lexer, dict);

  if (lex_match_id (lexer, "KEEP"))
    return keep_variables (lexer, dict);

  if (lex_match_id (lexer, "RENAME"))
    return rename_variables (lexer, dict);

  lex_error (lexer, _("expecting a valid subcommand"));
  return false;
}
614 /* Parses and performs the RENAME subcommand of GET and SAVE. */
/* Two syntaxes are handled: a single unparenthesized OLD=NEW pair,
   and one or more parenthesized groups (OLD-LIST=NEW-LIST).  DICT is
   modified in place. */
616 rename_variables (struct lexer *lexer, struct dictionary *dict)
629 lex_match (lexer, '=');
/* No opening parenthesis: parse a single OLD=NEW pair. */
630 if (lex_token (lexer) != '(')
634 v = parse_variable (lexer, dict);
637 if (!lex_force_match (lexer, '=')
638 || !lex_force_id (lexer))
/* The new name must not already exist in DICT. */
640 if (dict_lookup_var (dict, lex_tokid (lexer)) != NULL)
642 msg (SE, _("Cannot rename %s as %s because there already exists "
643 "a variable named %s. To rename variables with "
644 "overlapping names, use a single RENAME subcommand "
645 "such as \"/RENAME (A=B)(B=C)(C=A)\", or equivalently, "
646 "\"/RENAME (A B C=B C A)\"."),
647 var_get_name (v), lex_tokid (lexer), lex_tokid (lexer));
651 dict_rename_var (dict, v, lex_tokid (lexer));
/* Parenthesized groups: old variables and new names from every group
   are appended to V and NEW_NAMES (PV_APPEND); the renaming itself is
   done for all groups together by dict_rename_vars after the loop. */
660 while (lex_match (lexer, '('))
664 if (!parse_variables (lexer, dict, &v, &nv, PV_NO_DUPLICATE | PV_APPEND))
666 if (!lex_match (lexer, '='))
668 msg (SE, _("`=' expected after variable list."));
671 if (!parse_DATA_LIST_vars (lexer, &new_names, &nn, PV_APPEND | PV_NO_SCRATCH))
/* NOTE(review): the %d conversions below receive arguments cast to
   unsigned; confirm the intended conversion/argument types. */
675 msg (SE, _("Number of variables on left side of `=' (%d) does not "
676 "match number of variables on right side (%d), in "
677 "parenthesized group %d of RENAME subcommand."),
678 (unsigned) (nv - old_nv), (unsigned) (nn - old_nv), group);
681 if (!lex_force_match (lexer, ')'))
686 if (!dict_rename_vars (dict, v, new_names, nv, &err_name))
688 msg (SE, _("Requested renaming duplicates variable name %s."), err_name);
/* Walks the NN collected new names; presumably each is freed here
   (loop body not visible -- confirm). */
694 for (i = 0; i < nn; i++)
702 /* Parses and performs the DROP subcommand of GET and SAVE.
703 Returns true if successful, false on failure.*/
705 drop_variables (struct lexer *lexer, struct dictionary *dict)
710 lex_match (lexer, '=');
711 if (!parse_variables (lexer, dict, &v, &nv, PV_NONE))
713 dict_delete_vars (dict, v, nv);
716 if (dict_get_var_cnt (dict) == 0)
718 msg (SE, _("Cannot DROP all variables from dictionary."));
724 /* Parses and performs the KEEP subcommand of GET and SAVE.
725 Returns true if successful, false on failure.*/
727 keep_variables (struct lexer *lexer, struct dictionary *dict)
733 lex_match (lexer, '=');
734 if (!parse_variables (lexer, dict, &v, &nv, PV_NONE))
737 /* Move the specified variables to the beginning. */
738 dict_reorder_vars (dict, v, nv);
740 /* Delete the remaining variables. */
741 v = xnrealloc (v, dict_get_var_cnt (dict) - nv, sizeof *v);
742 for (i = nv; i < dict_get_var_cnt (dict); i++)
743 v[i - nv] = dict_get_var (dict, i);
744 dict_delete_vars (dict, v, dict_get_var_cnt (dict) - nv);
/* File types for MATCH FILES inputs; stored in the `type' member of
   struct mtf_file. */
755 MTF_FILE, /* Specified on FILE= subcommand. */
756 MTF_TABLE /* Specified on TABLE= subcommand. */
759 /* One of the files on MATCH FILES. */
762 struct mtf_file *next, *prev; /* Next, previous in the list of files. */
763 struct mtf_file *next_min; /* Next in the chain of minimums. */
765 int type; /* One of MTF_*. */
766 struct variable **by; /* List of BY variables for this file. */
767 struct file_handle *handle; /* File handle. */
768 struct any_reader *reader; /* File reader. */
769 struct dictionary *dict; /* Dictionary from system file. */
772 char *in_name; /* Variable name. */
773 struct variable *in_var; /* Variable (in master dictionary). */
775 struct ccase input; /* Input record. */
778 /* MATCH FILES procedure. */
781 struct mtf_file *head; /* First file mentioned on FILE or TABLE. */
782 struct mtf_file *tail; /* Last file mentioned on FILE or TABLE. */
784 bool ok; /* False if I/O error occurs. */
786 size_t by_cnt; /* Number of variables on BY subcommand. */
788 /* Names of FIRST, LAST variables. */
789 char first[LONG_NAME_LEN + 1], last[LONG_NAME_LEN + 1];
791 struct dictionary *dict; /* Dictionary of output file. */
792 struct casefile *output; /* MATCH FILES output. */
793 struct ccase mtf_case; /* Case used for output. */
795 unsigned seq_num; /* Have we initialized this variable? */
796 unsigned *seq_nums; /* Sequence numbers for each var in dict. */
/* Lifetime management for MATCH FILES state. */
static bool mtf_free (struct mtf_proc *mtf);
static bool mtf_close_file (struct mtf_file *file);
static int mtf_merge_dictionary (struct dictionary *const m,
                                 struct mtf_file *f);
static bool mtf_delete_file_in_place (struct mtf_proc *mtf,
                                      struct mtf_file **file);

/* Reading and merging records. */
static bool mtf_read_nonactive_records (void *mtf_);
static bool mtf_processing_finish (void *mtf_, const struct dataset *ds);
static bool mtf_processing (const struct ccase *c, void *mtf_,
                            const struct dataset *ds);

/* Describes a variable's type for error messages. */
static char *var_type_description (struct variable *v);

/* Link between a file's variable and its counterpart in the master
   dictionary. */
static void set_master (struct variable *v, struct variable *master);
static struct variable *get_master (struct variable *v);
813 /* Parse and execute the MATCH FILES command. */
/* This first part parses the FILE and TABLE specifications (each with
   optional RENAME and IN subcommands) and then the remaining
   subcommands (BY, FIRST, LAST, MAP, DROP, KEEP).  The state is
   collected in `mtf', whose dictionary becomes the output
   dictionary. */
815 cmd_match_files (struct lexer *lexer, struct dataset *ds)
818 struct mtf_file *first_table = NULL;
819 struct mtf_file *iter;
821 bool used_active_file = false;
822 bool saw_table = false;
827 mtf.head = mtf.tail = NULL;
831 mtf.dict = dict_create ();
833 case_nullify (&mtf.mtf_case);
/* The output dictionary inherits the dataset's case limit. */
836 dict_set_case_limit (mtf.dict, dict_get_case_limit (dataset_dict (ds)));
838 lex_match (lexer, '/');
/* One iteration per FILE or TABLE specification. */
839 while (lex_token (lexer) == T_ID
840 && (lex_id_match (ss_cstr ("FILE"), ss_cstr (lex_tokid (lexer)))
841 || lex_id_match (ss_cstr ("TABLE"), ss_cstr (lex_tokid (lexer)))))
843 struct mtf_file *file = xmalloc (sizeof *file);
845 if (lex_match_id (lexer, "FILE"))
846 file->type = MTF_FILE;
847 else if (lex_match_id (lexer, "TABLE"))
849 file->type = MTF_TABLE;
854 lex_match (lexer, '=');
860 file->in_name = NULL;
862 case_nullify (&file->input);
864 /* FILEs go first, then TABLEs. */
/* A TABLE, or any file seen before the first TABLE, is linked in
   using the current tail of the list. */
865 if (file->type == MTF_TABLE || first_table == NULL)
868 file->prev = mtf.tail;
870 mtf.tail->next = file;
872 if (mtf.head == NULL)
874 if (file->type == MTF_TABLE && first_table == NULL)
/* A FILE that follows a TABLE is inserted just before the first
   TABLE. */
879 assert (file->type == MTF_FILE);
880 file->next = first_table;
881 file->prev = first_table->prev;
882 if (first_table->prev)
883 first_table->prev->next = file;
886 first_table->prev = file;
/* `*' names the active file, which may be used at most once and only
   if an active file exists. */
889 if (lex_match (lexer, '*'))
894 if (used_active_file)
896 msg (SE, _("The active file may not be specified more "
900 used_active_file = true;
902 if (!proc_has_source (ds))
904 msg (SE, _("Cannot specify the active file since no active "
905 "file has been defined."));
909 if (proc_make_temporary_transformations_permanent (ds))
911 _("MATCH FILES may not be used after TEMPORARY when "
912 "the active file is an input source. "
913 "Temporary transformations will be made permanent."));
/* The active file shares the dataset's own dictionary. */
915 file->dict = dataset_dict (ds);
/* Otherwise the input is a file handle opened with its own reader,
   dictionary, and input case. */
919 file->handle = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH);
920 if (file->handle == NULL)
923 file->reader = any_reader_open (file->handle, &file->dict);
924 if (file->reader == NULL)
927 case_create (&file->input, dict_get_next_value_idx (file->dict));
/* Per-file subcommands: RENAME edits this file's dictionary, and IN
   records the name of an indicator variable. */
930 while (lex_match (lexer, '/'))
931 if (lex_match_id (lexer, "RENAME"))
933 if (!rename_variables (lexer, file->dict))
936 else if (lex_match_id (lexer, "IN"))
938 lex_match (lexer, '=');
939 if (lex_token (lexer) != T_ID)
941 lex_error (lexer, NULL);
945 if (file->in_name != NULL)
947 msg (SE, _("Multiple IN subcommands for a single FILE or "
951 file->in_name = xstrdup (lex_tokid (lexer));
/* Fold this file's dictionary into the output dictionary. */
956 mtf_merge_dictionary (mtf.dict, file);
/* Remaining subcommands, up to the terminating '.'. */
959 while (lex_token (lexer) != '.')
961 if (lex_match (lexer, T_BY))
963 struct variable **by;
967 msg (SE, _("BY may appear at most once."));
971 lex_match (lexer, '=');
972 if (!parse_variables (lexer, mtf.dict, &by, &mtf.by_cnt,
973 PV_NO_DUPLICATE | PV_NO_SCRATCH))
/* Resolve each BY variable by name in every input file; a file that
   lacks one is an error. */
976 for (iter = mtf.head; iter != NULL; iter = iter->next)
980 iter->by = xnmalloc (mtf.by_cnt, sizeof *iter->by);
982 for (i = 0; i < mtf.by_cnt; i++)
984 iter->by[i] = dict_lookup_var (iter->dict,
985 var_get_name (by[i]));
986 if (iter->by[i] == NULL)
988 msg (SE, _("File %s lacks BY variable %s."),
989 iter->handle ? fh_get_name (iter->handle) : "*",
990 var_get_name (by[i]));
998 else if (lex_match_id (lexer, "FIRST"))
1000 if (mtf.first[0] != '\0')
1002 msg (SE, _("FIRST may appear at most once."));
1006 lex_match (lexer, '=');
1007 if (!lex_force_id (lexer))
1009 strcpy (mtf.first, lex_tokid (lexer));
1012 else if (lex_match_id (lexer, "LAST"))
1014 if (mtf.last[0] != '\0')
1016 msg (SE, _("LAST may appear at most once."));
1020 lex_match (lexer, '=');
1021 if (!lex_force_id (lexer))
1023 strcpy (mtf.last, lex_tokid (lexer));
1026 else if (lex_match_id (lexer, "MAP"))
/* DROP and KEEP operate on the output dictionary. */
1030 else if (lex_match_id (lexer, "DROP"))
1032 if (!drop_variables (lexer, mtf.dict))
1035 else if (lex_match_id (lexer, "KEEP"))
1037 if (!keep_variables (lexer, mtf.dict))
1042 lex_error (lexer, NULL);
1046 if (!lex_match (lexer, '/') && lex_token (lexer) != '.')
1048 lex_end_of_command (lexer);
/* Without BY, neither TABLE nor IN may be used. */
1053 if (mtf.by_cnt == 0)
1057 msg (SE, _("BY is required when TABLE is specified."));
1062 msg (SE, _("BY is required when IN is specified."));
/* Set up mapping from each file's variables to master variables. */
/* For every variable of every input file, look up the variable with
   the same name in the output dictionary.  What is done with the
   result is not visible here; presumably it is recorded with
   set_master -- confirm. */
1069 for (iter = mtf.head; iter != NULL; iter = iter->next)
1071 struct dictionary *d = iter->dict;
1074 for (i = 0; i < dict_get_var_cnt (d); i++)
1076 struct variable *v = dict_get_var (d, i);
1077 struct variable *mv = dict_lookup_var (mtf.dict, var_get_name (v));
1083 /* Add IN variables to master dictionary. */
1084 for (iter = mtf.head; iter != NULL; iter = iter->next)
1085 if (iter->in_name != NULL)
1087 struct fmt_spec format = fmt_for_output (FMT_F, 1, 0);
1088 iter->in_var = dict_create_var (mtf.dict, iter->in_name, 0);
1089 if (iter->in_var == NULL)
1091 msg (SE, _("IN variable name %s duplicates an "
1092 "existing variable name."),
1093 var_get_name (iter->in_var));
1096 var_set_both_formats (iter->in_var, &format);
/* MATCH FILES performs an n-way merge on all its input files.
   The abstract algorithm is:

   1. Read one input record from every input FILE.

   2. If no FILEs are left, stop.  Otherwise, proceed to step 3.

   3. Find the FILE input record(s) that have minimum BY
   values.  Store all the values from these input records into
   the output record.

   4. For every TABLE, read another record as long as the BY values
   on the TABLE's input record are less than the FILEs' BY values.
   If an exact match is found, store all the values from the TABLE
   input record into the output record.

   5. Write the output record.

   6. Read another record from each input file FILE and TABLE that
   we stored values from above.  If we come to the end of one of the
   input files, remove it from the list of input files.

   7. Repeat from step 2.

   Unfortunately, this algorithm can't be implemented in a
   straightforward way because there's no function to read a
   record from the active file.  Instead, it has to be written
   so that it can be driven one active-file record at a time.

   FIXME: For merging large numbers of files (more than 10?) a
   better algorithm would use a heap for finding minimum
   values. */
/* If the active file is not one of the inputs, its variables are not
   needed and are discarded before the merge. */
1132 if (!used_active_file)
1133 discard_variables (ds);
/* Prepare the output: compact the output dictionary, then create the
   output casefile, the zeroed per-variable sequence numbers, and the
   output case, all sized from that dictionary. */
1135 dict_compact_values (mtf.dict);
1136 mtf.output = fastfile_create (dict_get_next_value_idx (mtf.dict));
1137 mtf.seq_nums = xcalloc (dict_get_var_cnt (mtf.dict), sizeof *mtf.seq_nums);
1138 case_create (&mtf.mtf_case, dict_get_next_value_idx (mtf.dict));
/* Prime every input other than the active file with its first
   record. */
1140 if (!mtf_read_nonactive_records (&mtf))
/* With the active file as an input, the merge is driven by running a
   procedure over it (mtf_processing is called per case) and then
   finished by mtf_processing_finish.  The direct
   mtf_processing_finish call further down is presumably the branch
   taken when the active file is not an input -- confirm. */
1143 if (used_active_file)
1146 create_case_sink (&null_sink_class,
1147 dataset_dict (ds), NULL));
1149 ( procedure (ds, mtf_processing, &mtf) &&
1150 mtf_processing_finish (&mtf, ds) );
1153 ok = mtf_processing_finish (&mtf, ds);
/* Install the result: the output dictionary replaces the dataset's
   dictionary and the output casefile becomes its source. */
1155 discard_variables (ds);
1157 dict_destroy (dataset_dict (ds));
1158 dataset_set_dict (ds, mtf.dict);
1160 proc_set_source (ds, storage_source_create (mtf.output));
/* The result of mtf_free is checked before the final status is
   returned. */
1163 if (!mtf_free (&mtf))
1165 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
1169 return CMD_CASCADING_FAILURE;
1172 /* Repeats 2...7 an arbitrary number of times. */
/* MTF_ is the struct mtf_proc.  The active file, if it is still in
   the list, is removed first; mtf_processing is then called with a
   null case for as long as the list starts with a FILE. */
1174 mtf_processing_finish (void *mtf_, const struct dataset *ds)
1176 struct mtf_proc *mtf = mtf_;
1177 struct mtf_file *iter;
1179 /* Find the active file and delete it. */
/* The active file is the entry whose handle is null. */
1180 for (iter = mtf->head; iter; iter = iter->next)
1181 if (iter->handle == NULL)
1183 if (!mtf_delete_file_in_place (mtf, &iter))
/* Drain the remaining FILE inputs; no active-file case is available
   any more, hence the null case argument. */
1188 while (mtf->head && mtf->head->type == MTF_FILE)
1189 if (!mtf_processing (NULL, mtf, ds))
/* Returns a string in a static buffer describing V's variable type
   and, for string variables, its width.

   Two static buffers are used alternately, so the results of two
   consecutive calls may be used together (for example, as two
   arguments of one message call).  A third call overwrites the
   result of the first.  Writes are bounded by the buffer size. */
static char *
var_type_description (struct variable *v)
{
  static char buf[2][32];
  static int x = 0;
  char *s;

  /* Switch to the buffer that was not used by the previous call. */
  x ^= 1;
  s = buf[x];

  if (var_is_numeric (v))
    snprintf (s, sizeof buf[0], "%s", "numeric");
  else
    snprintf (s, sizeof buf[0], "string with width %d", var_get_width (v));
  return s;
}
1214 /* Closes FILE and frees its associated data.
1215 Returns true if successful, false if an I/O error
1216 occurred on FILE. */
1218 mtf_close_file (struct mtf_file *file)
1220 bool ok = file->reader == NULL || !any_reader_error (file->reader);
1222 any_reader_close (file->reader);
1223 if (file->handle != NULL)
1224 dict_destroy (file->dict);
1225 case_destroy (&file->input);
1226 free (file->in_name);
1231 /* Free all the data for the MATCH FILES procedure.
1232 Returns true if successful, false if an I/O error
1235 mtf_free (struct mtf_proc *mtf)
1237 struct mtf_file *iter, *next;
1240 for (iter = mtf->head; iter; iter = next)
1243 assert (iter->dict != mtf->dict);
1244 if (!mtf_close_file (iter))
1249 dict_destroy (mtf->dict);
1250 case_destroy (&mtf->mtf_case);
1251 free (mtf->seq_nums);
1256 /* Remove *FILE from the mtf_file chain. Make *FILE point to the next
1257 file in the chain, or to NULL if was the last in the chain.
1258 Returns true if successful, false if an I/O error occurred. */
/* Besides unlinking, the values that the removed file contributed to
   the output case are reset, and the file is closed with
   mtf_close_file, whose result is returned. */
1260 mtf_delete_file_in_place (struct mtf_proc *mtf, struct mtf_file **file)
1262 struct mtf_file *f = *file;
/* Unlink F from the doubly linked list, updating the list's head and
   tail.  The conditions guarding these assignments are not visible
   here. */
1266 f->prev->next = f->next;
1268 f->next->prev = f->prev;
1270 mtf->head = f->next;
1272 mtf->tail = f->prev;
/* The file's IN variable becomes 0 in the output case. */
1275 if (f->in_var != NULL)
1276 case_data_rw (&mtf->mtf_case, f->in_var)->f = 0.;
/* Reset the output value of each of F's variables that has a master
   variable.  Strings are filled with spaces; the numeric value stored
   is not visible here. */
1277 for (i = 0; i < dict_get_var_cnt (f->dict); i++)
1279 struct variable *v = dict_get_var (f->dict, i);
1280 struct variable *mv = get_master (v);
1283 union value *out = case_data_rw (&mtf->mtf_case, mv);
1285 if (var_is_numeric (v))
1288 memset (out->s, ' ', var_get_width (v));
1292 return mtf_close_file (f);
1295 /* Read a record from every input file except the active file.
1296 Returns true if successful, false if an I/O error occurred. */
1298 mtf_read_nonactive_records (void *mtf_)
1300 struct mtf_proc *mtf = mtf_;
1301 struct mtf_file *iter, *next;
1304 for (iter = mtf->head; ok && iter != NULL; iter = next)
1307 if (iter->handle && !any_reader_read (iter->reader, &iter->input))
1308 if (!mtf_delete_file_in_place (mtf, &iter))
1314 /* Compare the BY variables for files A and B; return -1 if A < B, 0
1315 if A == B, 1 if A > B. */
1317 mtf_compare_BY_values (struct mtf_proc *mtf,
1318 struct mtf_file *a, struct mtf_file *b,
1319 const struct ccase *c)
1321 const struct ccase *ca = case_is_null (&a->input) ? c : &a->input;
1322 const struct ccase *cb = case_is_null (&b->input) ? c : &b->input;
1323 assert ((a == NULL) + (b == NULL) + (c == NULL) <= 1);
1324 return case_compare_2dict (ca, cb, a->by, b->by, mtf->by_cnt);
1327 /* Perform one iteration of steps 3...7 above.
1328 Returns true if successful, false if an I/O error occurred. */
/* C is the current active-file case; mtf_processing_finish passes a
   null C once the active file is no longer an input.  MTF_ is the
   struct mtf_proc.  A file whose own input case is null takes its
   record from C. */
1330 mtf_processing (const struct ccase *c, void *mtf_, const struct dataset *ds UNUSED)
1332 struct mtf_proc *mtf = mtf_;
1334 /* Do we need another record from the active file? */
1335 bool read_active_file;
/* NOTE(review): the action taken when the list starts with a TABLE
   (no FILE inputs left) is not visible here. */
1337 assert (mtf->head != NULL);
1338 if (mtf->head->type == MTF_TABLE)
/* Files are threaded through next_min into two chains: the "min"
   chain of files used for this output record, and the "max" chain of
   files that are not. */
1343 struct mtf_file *min_head, *min_tail; /* Files with minimum BY values. */
1344 struct mtf_file *max_head, *max_tail; /* Files with non-minimum BYs. */
1345 struct mtf_file *iter, *next;
1347 read_active_file = false;
1349 /* 3. Find the FILE input record(s) that have minimum BY
1350 values. Store all the values from these input records into
1351 the output record. */
/* The first file seeds the min chain; each later FILE is compared
   against the current minimum.  The conditions on CMP that choose
   between the assignments below are not visible here. */
1352 min_head = min_tail = mtf->head;
1353 max_head = max_tail = NULL;
1354 for (iter = mtf->head->next; iter && iter->type == MTF_FILE;
1357 int cmp = mtf_compare_BY_values (mtf, min_head, iter, c);
1361 max_tail = max_tail->next_min = iter;
1363 max_head = max_tail = iter;
1366 min_tail = min_tail->next_min = iter;
1371 max_tail->next_min = min_head;
1372 max_tail = min_tail;
1376 max_head = min_head;
1377 max_tail = min_tail;
1379 min_head = min_tail = iter;
1383 /* 4. For every TABLE, read another record as long as the BY
1384 values on the TABLE's input record are less than the FILEs'
1385 BY values. If an exact match is found, store all the values
1386 from the TABLE input record into the output record. */
/* ITER continues from where the FILE loop stopped, so only TABLEs
   remain. */
1387 for (; iter != NULL; iter = next)
1389 assert (iter->type == MTF_TABLE);
1394 int cmp = mtf_compare_BY_values (mtf, min_head, iter, c);
1398 max_tail = max_tail->next_min = iter;
1400 max_head = max_tail = iter;
1403 min_tail = min_tail->next_min = iter;
/* A TABLE without a handle is the active file and has no reader of
   its own.  Other TABLEs are advanced with any_reader_read, and the
   delete call below removes a TABLE from the list (presumably once
   it has no more records -- confirm). */
1406 if (iter->handle == NULL)
1408 if (any_reader_read (iter->reader, &iter->input))
1410 if (!mtf_delete_file_in_place (mtf, &iter))
1417 /* Next sequence number. */
1420 /* Store data to all the records we are using. */
1422 min_tail->next_min = NULL;
1423 for (iter = min_head; iter; iter = iter->next_min)
1427 for (i = 0; i < dict_get_var_cnt (iter->dict); i++)
1429 struct variable *v = dict_get_var (iter->dict, i);
1430 struct variable *mv = get_master (v);
1431 size_t mv_index = mv ? var_get_dict_index (mv) : 0;
/* seq_nums marks the master variables already stored for this output
   record, so a file later in the chain does not overwrite a value
   written by an earlier one. */
1433 if (mv != NULL && mtf->seq_nums[mv_index] != mtf->seq_num)
1435 const struct ccase *record
1436 = case_is_null (&iter->input) ? c : &iter->input;
1437 union value *out = case_data_rw (&mtf->mtf_case, mv);
1439 mtf->seq_nums[mv_index] = mtf->seq_num;
1440 if (var_is_numeric (v))
1441 out->f = case_num (record, v);
1443 memcpy (out->s, case_str (record, v), var_get_width (v));
/* A file's IN variable is 1 when the file supplied this record. */
1446 if (iter->in_var != NULL)
1447 case_data_rw (&mtf->mtf_case, iter->in_var)->f = 1.;
/* The active file's record was used as a FILE, so another one is
   needed; this ends the loop at the bottom of the function. */
1449 if (iter->type == MTF_FILE && iter->handle == NULL)
1450 read_active_file = true;
1453 /* Store missing values to all the records we're not
1456 max_tail->next_min = NULL;
1457 for (iter = max_head; iter; iter = iter->next_min)
1461 for (i = 0; i < dict_get_var_cnt (iter->dict); i++)
1463 struct variable *v = dict_get_var (iter->dict, i);
1464 struct variable *mv = get_master (v);
1465 size_t mv_index = mv ? var_get_dict_index (mv) : 0;
1467 if (mv != NULL && mtf->seq_nums[mv_index] != mtf->seq_num)
1469 union value *out = case_data_rw (&mtf->mtf_case, mv);
1470 mtf->seq_nums[mv_index] = mtf->seq_num;
1472 if (var_is_numeric (v))
1475 memset (out->s, ' ', var_get_width (v));
1478 if (iter->in_var != NULL)
1479 case_data_rw (&mtf->mtf_case, iter->in_var)->f = 0.;
1482 /* 5. Write the output record. */
1483 casefile_append (mtf->output, &mtf->mtf_case);
1485 /* 6. Read another record from each input file FILE and TABLE
1486 that we stored values from above. If we come to the end of
1487 one of the input files, remove it from the list of input
1489 for (iter = min_head; iter && iter->type == MTF_FILE; iter = next)
1491 next = iter->next_min;
1492 if (iter->reader != NULL
1493 && !any_reader_read (iter->reader, &iter->input))
1494 if (!mtf_delete_file_in_place (mtf, &iter))
1498 while (!read_active_file
1499 && mtf->head != NULL && mtf->head->type == MTF_FILE);
1504 /* Merge the dictionary for file F into master dictionary M. */
1506 mtf_merge_dictionary (struct dictionary *const m, struct mtf_file *f)
1508 struct dictionary *d = f->dict;
1509 const char *d_docs, *m_docs;
1512 if (dict_get_label (m) == NULL)
1513 dict_set_label (m, dict_get_label (d));
1515 d_docs = dict_get_documents (d);
1516 m_docs = dict_get_documents (m);
1520 dict_set_documents (m, d_docs);
1526 new_len = strlen (m_docs) + strlen (d_docs);
1527 new_docs = xmalloc (new_len + 1);
1528 strcpy (new_docs, m_docs);
1529 strcat (new_docs, d_docs);
1530 dict_set_documents (m, new_docs);
1535 for (i = 0; i < dict_get_var_cnt (d); i++)
1537 struct variable *dv = dict_get_var (d, i);
1538 struct variable *mv = dict_lookup_var (m, var_get_name (dv));
1540 if (dict_class_from_id (var_get_name (dv)) == DC_SCRATCH)
1545 if (var_get_width (mv) != var_get_width (dv))
1547 msg (SE, _("Variable %s in file %s (%s) has different "
1548 "type or width from the same variable in "
1549 "earlier file (%s)."),
1550 var_get_name (dv), fh_get_name (f->handle),
1551 var_type_description (dv), var_type_description (mv));
1555 if (var_get_width (dv) == var_get_width (mv))
1557 if (var_has_value_labels (dv) && !var_has_value_labels (mv))
1558 var_set_value_labels (mv, var_get_value_labels (dv));
1559 if (var_has_missing_values (dv) && !var_has_missing_values (mv))
1560 var_set_missing_values (mv, var_get_missing_values (dv));
1563 if (var_get_label (dv) && !var_get_label (mv))
1564 var_set_label (mv, var_get_label (dv));
1567 mv = dict_clone_var_assert (m, dv, var_get_name (dv));
/* Records MASTER as the master variable of V by storing it as V's
   aux data, from which get_master() retrieves it.

   The null third argument is presumably the aux destructor, so
   that nothing is done to MASTER when the aux data is cleared
   (compare var_dtor_free in start_case_map()). */
static void
set_master (struct variable *v, struct variable *master)
{
  var_attach_aux (v, master, NULL);
}
/* Looks up the master variable that set_master() recorded for V
   and returns it.  The value comes straight from V's aux data;
   callers compare it against a null pointer before use. */
static struct variable *
get_master (struct variable *v)
{
  struct variable *master = var_get_aux (v);

  return master;
}
/* A case map copies data from a case that corresponds to one
   dictionary to a case that corresponds to a second dictionary
   derived from the first by, optionally, deleting, reordering,
   or renaming variables.  (No new variables may be created.) */
/* Tells, for each value index in a destination case, which value
   index in a source case supplies its data. */
struct case_map
  {
    size_t value_cnt;   /* Number of entries in MAP. */
    int *map;           /* Source value index for each destination
                           value index, or -1 for a destination
                           value that has no source. */
  };
1604 /* Prepares dictionary D for producing a case map. Afterward,
1605 the caller may delete, reorder, or rename variables within D
1606 at will before using finish_case_map() to produce the case
1609 Uses D's aux members, which must otherwise not be in use. */
1611 start_case_map (struct dictionary *d)
1613 size_t var_cnt = dict_get_var_cnt (d);
1616 for (i = 0; i < var_cnt; i++)
1618 struct variable *v = dict_get_var (d, i);
1619 int *src_fv = xmalloc (sizeof *src_fv);
1620 *src_fv = var_get_case_index (v);
1621 var_attach_aux (v, src_fv, var_dtor_free);
1625 /* Produces a case map from dictionary D, which must have been
1626 previously prepared with start_case_map().
1628 Does not retain any reference to D, and clears the aux members
1629 set up by start_case_map().
1631 Returns the new case map, or a null pointer if no mapping is
1632 required (that is, no data has changed position). */
1633 static struct case_map *
1634 finish_case_map (struct dictionary *d)
1636 struct case_map *map;
1637 size_t var_cnt = dict_get_var_cnt (d);
1641 map = xmalloc (sizeof *map);
1642 map->value_cnt = dict_get_next_value_idx (d);
1643 map->map = xnmalloc (map->value_cnt, sizeof *map->map);
1644 for (i = 0; i < map->value_cnt; i++)
1648 for (i = 0; i < var_cnt; i++)
1650 struct variable *v = dict_get_var (d, i);
1651 size_t value_cnt = var_get_value_cnt (v);
1652 int *src_fv = (int *) var_detach_aux (v);
1655 if (var_get_case_index (v) != *src_fv)
1658 for (idx = 0; idx < value_cnt; idx++)
1660 int src_idx = *src_fv + idx;
1661 int dst_idx = var_get_case_index (v) + idx;
1663 assert (map->map[dst_idx] == -1);
1664 map->map[dst_idx] = src_idx;
1671 destroy_case_map (map);
1675 while (map->value_cnt > 0 && map->map[map->value_cnt - 1] == -1)
1681 /* Maps from SRC to DST, applying case map MAP. */
1683 map_case (const struct case_map *map,
1684 const struct ccase *src, struct ccase *dst)
1688 assert (map != NULL);
1689 assert (src != NULL);
1690 assert (dst != NULL);
1691 assert (src != dst);
1693 for (dst_idx = 0; dst_idx < map->value_cnt; dst_idx++)
1695 int src_idx = map->map[dst_idx];
1697 *case_data_rw_idx (dst, dst_idx) = *case_data_idx (src, src_idx);
1701 /* Destroys case map MAP. */
1703 destroy_case_map (struct case_map *map)