1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
24 #include <data/any-reader.h>
25 #include <data/any-writer.h>
26 #include <data/case-sink.h>
27 #include <data/case-source.h>
28 #include <data/case.h>
29 #include <data/casefile.h>
30 #include <data/fastfile.h>
31 #include <data/dictionary.h>
32 #include <data/por-file-writer.h>
33 #include <data/procedure.h>
34 #include <data/settings.h>
35 #include <data/storage-stream.h>
36 #include <data/sys-file-writer.h>
37 #include <data/transformations.h>
38 #include <data/value-labels.h>
39 #include <data/variable.h>
40 #include <language/command.h>
41 #include <language/data-io/file-handle.h>
42 #include <language/lexer/lexer.h>
43 #include <language/lexer/variable-parser.h>
44 #include <libpspp/alloc.h>
45 #include <libpspp/assertion.h>
46 #include <libpspp/compiler.h>
47 #include <libpspp/hash.h>
48 #include <libpspp/message.h>
49 #include <libpspp/message.h>
50 #include <libpspp/misc.h>
51 #include <libpspp/str.h>
54 #define _(msgid) gettext (msgid)
56 /* Rearranging and reducing a dictionary. */
57 static void start_case_map (struct dictionary *);
58 static struct case_map *finish_case_map (struct dictionary *);
59 static void map_case (const struct case_map *,
60 const struct ccase *, struct ccase *);
61 static void destroy_case_map (struct case_map *);
63 static bool parse_dict_trim (struct lexer *, struct dictionary *);
65 /* Reading system and portable files. */
67 /* Type of command. */
74 /* Case reader input program. */
75 struct case_reader_pgm
77 struct any_reader *reader; /* File reader. */
78 struct case_map *map; /* Map from file dict to active file dict. */
79 struct ccase bounce; /* Bounce buffer. */
82 static const struct case_source_class case_reader_source_class;
84 static void case_reader_pgm_free (struct case_reader_pgm *);
86 /* Parses a GET or IMPORT command. */
88 parse_read_command (struct lexer *lexer, struct dataset *ds, enum reader_command type)
90 struct case_reader_pgm *pgm = NULL;
91 struct file_handle *fh = NULL;
92 struct dictionary *dict = NULL;
96 lex_match (lexer, '/');
98 if (lex_match_id (lexer, "FILE") || lex_token (lexer) == T_STRING)
100 lex_match (lexer, '=');
102 fh = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH);
106 else if (type == IMPORT_CMD && lex_match_id (lexer, "TYPE"))
108 lex_match (lexer, '=');
110 if (lex_match_id (lexer, "COMM"))
112 else if (lex_match_id (lexer, "TAPE"))
116 lex_error (lexer, _("expecting COMM or TAPE"));
126 lex_sbc_missing (lexer, "FILE");
130 discard_variables (ds);
132 pgm = xmalloc (sizeof *pgm);
133 pgm->reader = any_reader_open (fh, &dict);
135 case_nullify (&pgm->bounce);
136 if (pgm->reader == NULL)
139 case_create (&pgm->bounce, dict_get_next_value_idx (dict));
141 start_case_map (dict);
143 while (lex_token (lexer) != '.')
145 lex_match (lexer, '/');
146 if (!parse_dict_trim (lexer, dict))
150 pgm->map = finish_case_map (dict);
152 dict_destroy (dataset_dict (ds));
153 dataset_set_dict (ds, dict);
156 create_case_source (&case_reader_source_class, pgm));
161 case_reader_pgm_free (pgm);
164 return CMD_CASCADING_FAILURE;
167 /* Frees a struct case_reader_pgm. */
169 case_reader_pgm_free (struct case_reader_pgm *pgm)
173 any_reader_close (pgm->reader);
174 destroy_case_map (pgm->map);
175 case_destroy (&pgm->bounce);
180 /* Clears internal state related to case reader input procedure. */
182 case_reader_source_destroy (struct case_source *source)
184 struct case_reader_pgm *pgm = source->aux;
185 case_reader_pgm_free (pgm);
188 /* Reads all the cases from the data file into C and passes them
189 to WRITE_CASE one by one, passing WC_DATA.
190 Returns true if successful, false if an I/O error occurred. */
192 case_reader_source_read (struct case_source *source,
194 write_case_func *write_case, write_case_data wc_data)
196 struct case_reader_pgm *pgm = source->aux;
202 if (pgm->map == NULL)
203 got_case = any_reader_read (pgm->reader, c);
206 got_case = any_reader_read (pgm->reader, &pgm->bounce);
208 map_case (pgm->map, &pgm->bounce, c);
213 ok = write_case (wc_data);
217 return ok && !any_reader_error (pgm->reader);
220 static const struct case_source_class case_reader_source_class =
224 case_reader_source_read,
225 case_reader_source_destroy,
230 cmd_get (struct lexer *lexer, struct dataset *ds)
232 return parse_read_command (lexer, ds, GET_CMD);
237 cmd_import (struct lexer *lexer, struct dataset *ds)
239 return parse_read_command (lexer, ds, IMPORT_CMD);
242 /* Writing system and portable files. */
244 /* Type of output file. */
247 SYSFILE_WRITER, /* System file. */
248 PORFILE_WRITER /* Portable file. */
251 /* Type of a command. */
254 XFORM_CMD, /* Transformation. */
255 PROC_CMD /* Procedure. */
258 /* File writer plus a case map. */
261 struct any_writer *writer; /* File writer. */
262 struct case_map *map; /* Map to output file dictionary
263 (null pointer for identity mapping). */
264 struct ccase bounce; /* Bounce buffer for mapping (if needed). */
269 case_writer_destroy (struct case_writer *aw)
274 ok = any_writer_close (aw->writer);
275 destroy_case_map (aw->map);
276 case_destroy (&aw->bounce);
282 /* Parses SAVE or XSAVE or EXPORT or XEXPORT command.
283 WRITER_TYPE identifies the type of file to write,
284 and COMMAND_TYPE identifies the type of command.
286 On success, returns a writer.
For procedures only, sets *RETAIN_UNSELECTED to true if cases
that would otherwise be excluded by FILTER or USE should be
included in the output file, false if they should be left out.

291 On failure, returns a null pointer. */
292 static struct case_writer *
293 parse_write_command (struct lexer *lexer, struct dataset *ds,
294 enum writer_type writer_type,
295 enum command_type command_type,
296 bool *retain_unselected)
299 struct file_handle *handle; /* Output file. */
300 struct dictionary *dict; /* Dictionary for output file. */
301 struct case_writer *aw; /* Writer. */
303 /* Common options. */
304 bool print_map; /* Print map? TODO. */
305 bool print_short_names; /* Print long-to-short name map. TODO. */
306 struct sfm_write_options sysfile_opts;
307 struct pfm_write_options porfile_opts;
309 assert (writer_type == SYSFILE_WRITER || writer_type == PORFILE_WRITER);
310 assert (command_type == XFORM_CMD || command_type == PROC_CMD);
311 assert ((retain_unselected != NULL) == (command_type == PROC_CMD));
313 if (command_type == PROC_CMD)
314 *retain_unselected = true;
317 dict = dict_clone (dataset_dict (ds));
318 aw = xmalloc (sizeof *aw);
321 case_nullify (&aw->bounce);
323 print_short_names = false;
324 sysfile_opts = sfm_writer_default_options ();
325 porfile_opts = pfm_writer_default_options ();
327 start_case_map (dict);
328 dict_delete_scratch_vars (dict);
330 lex_match (lexer, '/');
333 if (lex_match_id (lexer, "OUTFILE"))
337 lex_sbc_only_once ("OUTFILE");
341 lex_match (lexer, '=');
343 handle = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH);
347 else if (lex_match_id (lexer, "NAMES"))
348 print_short_names = true;
349 else if (lex_match_id (lexer, "PERMISSIONS"))
353 lex_match (lexer, '=');
354 if (lex_match_id (lexer, "READONLY"))
356 else if (lex_match_id (lexer, "WRITEABLE"))
360 lex_error (lexer, _("expecting %s or %s"), "READONLY", "WRITEABLE");
363 sysfile_opts.create_writeable = porfile_opts.create_writeable = cw;
365 else if (command_type == PROC_CMD && lex_match_id (lexer, "UNSELECTED"))
367 lex_match (lexer, '=');
368 if (lex_match_id (lexer, "RETAIN"))
369 *retain_unselected = true;
370 else if (lex_match_id (lexer, "DELETE"))
371 *retain_unselected = false;
374 lex_error (lexer, _("expecting %s or %s"), "RETAIN", "DELETE");
378 else if (writer_type == SYSFILE_WRITER && lex_match_id (lexer, "COMPRESSED"))
379 sysfile_opts.compress = true;
380 else if (writer_type == SYSFILE_WRITER && lex_match_id (lexer, "UNCOMPRESSED"))
381 sysfile_opts.compress = false;
382 else if (writer_type == SYSFILE_WRITER && lex_match_id (lexer, "VERSION"))
384 lex_match (lexer, '=');
385 if (!lex_force_int (lexer))
387 sysfile_opts.version = lex_integer (lexer);
390 else if (writer_type == PORFILE_WRITER && lex_match_id (lexer, "TYPE"))
392 lex_match (lexer, '=');
393 if (lex_match_id (lexer, "COMMUNICATIONS"))
394 porfile_opts.type = PFM_COMM;
395 else if (lex_match_id (lexer, "TAPE"))
396 porfile_opts.type = PFM_TAPE;
399 lex_error (lexer, _("expecting %s or %s"), "COMM", "TAPE");
403 else if (writer_type == PORFILE_WRITER && lex_match_id (lexer, "DIGITS"))
405 lex_match (lexer, '=');
406 if (!lex_force_int (lexer))
408 porfile_opts.digits = lex_integer (lexer);
411 else if (!parse_dict_trim (lexer, dict))
414 if (!lex_match (lexer, '/'))
417 if (lex_end_of_command (lexer) != CMD_SUCCESS)
422 lex_sbc_missing (lexer, "OUTFILE");
426 dict_compact_values (dict);
427 aw->map = finish_case_map (dict);
429 case_create (&aw->bounce, dict_get_next_value_idx (dict));
431 if (fh_get_referent (handle) == FH_REF_FILE)
436 aw->writer = any_writer_from_sfm_writer (
437 sfm_open_writer (handle, dict, sysfile_opts));
440 aw->writer = any_writer_from_pfm_writer (
441 pfm_open_writer (handle, dict, porfile_opts));
446 aw->writer = any_writer_open (handle, dict);
447 if (aw->writer == NULL)
454 case_writer_destroy (aw);
459 /* Writes case C to writer AW. */
461 case_writer_write_case (struct case_writer *aw, const struct ccase *c)
465 map_case (aw->map, c, &aw->bounce);
468 return any_writer_write (aw->writer, c);
471 /* SAVE and EXPORT. */
473 static bool output_proc (const struct ccase *, void *, const struct dataset *);
475 /* Parses and performs the SAVE or EXPORT procedure. */
477 parse_output_proc (struct lexer *lexer, struct dataset *ds, enum writer_type writer_type)
479 bool retain_unselected;
480 struct variable *saved_filter_variable;
481 struct case_writer *aw;
484 aw = parse_write_command (lexer, ds, writer_type, PROC_CMD, &retain_unselected);
486 return CMD_CASCADING_FAILURE;
488 saved_filter_variable = dict_get_filter (dataset_dict (ds));
489 if (retain_unselected)
490 dict_set_filter (dataset_dict (ds), NULL);
491 ok = procedure (ds, output_proc, aw);
492 dict_set_filter (dataset_dict (ds), saved_filter_variable);
494 case_writer_destroy (aw);
495 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
498 /* Writes case C to file. */
500 output_proc (const struct ccase *c, void *aw_, const struct dataset *ds UNUSED)
502 struct case_writer *aw = aw_;
503 return case_writer_write_case (aw, c);
507 cmd_save (struct lexer *lexer, struct dataset *ds)
509 return parse_output_proc (lexer, ds, SYSFILE_WRITER);
513 cmd_export (struct lexer *lexer, struct dataset *ds)
515 return parse_output_proc (lexer, ds, PORFILE_WRITER);
518 /* XSAVE and XEXPORT. */
520 /* Transformation. */
523 struct case_writer *aw; /* Writer. */
526 static trns_proc_func output_trns_proc;
527 static trns_free_func output_trns_free;
529 /* Parses the XSAVE or XEXPORT transformation command. */
531 parse_output_trns (struct lexer *lexer, struct dataset *ds, enum writer_type writer_type)
533 struct output_trns *t = xmalloc (sizeof *t);
534 t->aw = parse_write_command (lexer, ds, writer_type, XFORM_CMD, NULL);
538 return CMD_CASCADING_FAILURE;
541 add_transformation (ds, output_trns_proc, output_trns_free, t);
545 /* Writes case C to the system file specified on XSAVE or XEXPORT. */
547 output_trns_proc (void *trns_, struct ccase *c, casenumber case_num UNUSED)
549 struct output_trns *t = trns_;
550 case_writer_write_case (t->aw, c);
551 return TRNS_CONTINUE;
554 /* Frees an XSAVE or XEXPORT transformation.
555 Returns true if successful, false if an I/O error occurred. */
557 output_trns_free (void *trns_)
559 struct output_trns *t = trns_;
564 ok = case_writer_destroy (t->aw);
572 cmd_xsave (struct lexer *lexer, struct dataset *ds)
574 return parse_output_trns (lexer, ds, SYSFILE_WRITER);
577 /* XEXPORT command. */
579 cmd_xexport (struct lexer *lexer, struct dataset *ds)
581 return parse_output_trns (lexer, ds, PORFILE_WRITER);
584 static bool rename_variables (struct lexer *lexer, struct dictionary *dict);
585 static bool drop_variables (struct lexer *, struct dictionary *dict);
586 static bool keep_variables (struct lexer *, struct dictionary *dict);
588 /* Commands that read and write system files share a great deal
589 of common syntactic structure for rearranging and dropping
590 variables. This function parses this syntax and modifies DICT
591 appropriately. Returns true on success, false on failure. */
593 parse_dict_trim (struct lexer *lexer, struct dictionary *dict)
595 if (lex_match_id (lexer, "MAP"))
600 else if (lex_match_id (lexer, "DROP"))
601 return drop_variables (lexer, dict);
602 else if (lex_match_id (lexer, "KEEP"))
603 return keep_variables (lexer, dict);
604 else if (lex_match_id (lexer, "RENAME"))
605 return rename_variables (lexer, dict);
608 lex_error (lexer, _("expecting a valid subcommand"));
613 /* Parses and performs the RENAME subcommand of GET and SAVE. */
615 rename_variables (struct lexer *lexer, struct dictionary *dict)
628 lex_match (lexer, '=');
629 if (lex_token (lexer) != '(')
633 v = parse_variable (lexer, dict);
636 if (!lex_force_match (lexer, '=')
637 || !lex_force_id (lexer))
639 if (dict_lookup_var (dict, lex_tokid (lexer)) != NULL)
641 msg (SE, _("Cannot rename %s as %s because there already exists "
642 "a variable named %s. To rename variables with "
643 "overlapping names, use a single RENAME subcommand "
644 "such as \"/RENAME (A=B)(B=C)(C=A)\", or equivalently, "
645 "\"/RENAME (A B C=B C A)\"."), v->name, lex_tokid (lexer), lex_tokid (lexer));
649 dict_rename_var (dict, v, lex_tokid (lexer));
658 while (lex_match (lexer, '('))
662 if (!parse_variables (lexer, dict, &v, &nv, PV_NO_DUPLICATE | PV_APPEND))
664 if (!lex_match (lexer, '='))
666 msg (SE, _("`=' expected after variable list."));
669 if (!parse_DATA_LIST_vars (lexer, &new_names, &nn, PV_APPEND | PV_NO_SCRATCH))
673 msg (SE, _("Number of variables on left side of `=' (%d) does not "
674 "match number of variables on right side (%d), in "
675 "parenthesized group %d of RENAME subcommand."),
676 (unsigned) (nv - old_nv), (unsigned) (nn - old_nv), group);
679 if (!lex_force_match (lexer, ')'))
684 if (!dict_rename_vars (dict, v, new_names, nv, &err_name))
686 msg (SE, _("Requested renaming duplicates variable name %s."), err_name);
692 for (i = 0; i < nn; i++)
700 /* Parses and performs the DROP subcommand of GET and SAVE.
701 Returns true if successful, false on failure.*/
703 drop_variables (struct lexer *lexer, struct dictionary *dict)
708 lex_match (lexer, '=');
709 if (!parse_variables (lexer, dict, &v, &nv, PV_NONE))
711 dict_delete_vars (dict, v, nv);
714 if (dict_get_var_cnt (dict) == 0)
716 msg (SE, _("Cannot DROP all variables from dictionary."));
722 /* Parses and performs the KEEP subcommand of GET and SAVE.
723 Returns true if successful, false on failure.*/
725 keep_variables (struct lexer *lexer, struct dictionary *dict)
731 lex_match (lexer, '=');
732 if (!parse_variables (lexer, dict, &v, &nv, PV_NONE))
735 /* Move the specified variables to the beginning. */
736 dict_reorder_vars (dict, v, nv);
738 /* Delete the remaining variables. */
739 v = xnrealloc (v, dict_get_var_cnt (dict) - nv, sizeof *v);
740 for (i = nv; i < dict_get_var_cnt (dict); i++)
741 v[i - nv] = dict_get_var (dict, i);
742 dict_delete_vars (dict, v, dict_get_var_cnt (dict) - nv);
753 MTF_FILE, /* Specified on FILE= subcommand. */
754 MTF_TABLE /* Specified on TABLE= subcommand. */
757 /* One of the files on MATCH FILES. */
760 struct mtf_file *next, *prev; /* Next, previous in the list of files. */
761 struct mtf_file *next_min; /* Next in the chain of minimums. */
763 int type; /* One of MTF_*. */
764 struct variable **by; /* List of BY variables for this file. */
765 struct file_handle *handle; /* File handle. */
766 struct any_reader *reader; /* File reader. */
767 struct dictionary *dict; /* Dictionary from system file. */
770 char *in_name; /* Variable name. */
771 struct variable *in_var; /* Variable (in master dictionary). */
773 struct ccase input; /* Input record. */
776 /* MATCH FILES procedure. */
779 struct mtf_file *head; /* First file mentioned on FILE or TABLE. */
780 struct mtf_file *tail; /* Last file mentioned on FILE or TABLE. */
782 bool ok; /* False if I/O error occurs. */
784 size_t by_cnt; /* Number of variables on BY subcommand. */
786 /* Names of FIRST, LAST variables. */
787 char first[LONG_NAME_LEN + 1], last[LONG_NAME_LEN + 1];
789 struct dictionary *dict; /* Dictionary of output file. */
790 struct casefile *output; /* MATCH FILES output. */
791 struct ccase mtf_case; /* Case used for output. */
793 unsigned seq_num; /* Have we initialized this variable? */
794 unsigned *seq_nums; /* Sequence numbers for each var in dict. */
797 static bool mtf_free (struct mtf_proc *);
798 static bool mtf_close_file (struct mtf_file *);
799 static int mtf_merge_dictionary (struct dictionary *const, struct mtf_file *);
800 static bool mtf_delete_file_in_place (struct mtf_proc *, struct mtf_file **);
802 static bool mtf_read_nonactive_records (void *);
803 static bool mtf_processing_finish (void *, const struct dataset *);
804 static bool mtf_processing (const struct ccase *, void *, const struct dataset *);
806 static char *var_type_description (struct variable *);
808 static void set_master (struct variable *, struct variable *master);
809 static struct variable *get_master (struct variable *);
811 /* Parse and execute the MATCH FILES command. */
813 cmd_match_files (struct lexer *lexer, struct dataset *ds)
816 struct mtf_file *first_table = NULL;
817 struct mtf_file *iter;
819 bool used_active_file = false;
820 bool saw_table = false;
825 mtf.head = mtf.tail = NULL;
829 mtf.dict = dict_create ();
831 case_nullify (&mtf.mtf_case);
834 dict_set_case_limit (mtf.dict, dict_get_case_limit (dataset_dict (ds)));
836 lex_match (lexer, '/');
837 while (lex_token (lexer) == T_ID
838 && (lex_id_match (ss_cstr ("FILE"), ss_cstr (lex_tokid (lexer)))
839 || lex_id_match (ss_cstr ("TABLE"), ss_cstr (lex_tokid (lexer)))))
841 struct mtf_file *file = xmalloc (sizeof *file);
843 if (lex_match_id (lexer, "FILE"))
844 file->type = MTF_FILE;
845 else if (lex_match_id (lexer, "TABLE"))
847 file->type = MTF_TABLE;
852 lex_match (lexer, '=');
858 file->in_name = NULL;
860 case_nullify (&file->input);
862 /* FILEs go first, then TABLEs. */
863 if (file->type == MTF_TABLE || first_table == NULL)
866 file->prev = mtf.tail;
868 mtf.tail->next = file;
870 if (mtf.head == NULL)
872 if (file->type == MTF_TABLE && first_table == NULL)
877 assert (file->type == MTF_FILE);
878 file->next = first_table;
879 file->prev = first_table->prev;
880 if (first_table->prev)
881 first_table->prev->next = file;
884 first_table->prev = file;
887 if (lex_match (lexer, '*'))
892 if (used_active_file)
894 msg (SE, _("The active file may not be specified more "
898 used_active_file = true;
900 if (!proc_has_source (ds))
902 msg (SE, _("Cannot specify the active file since no active "
903 "file has been defined."));
907 if (proc_make_temporary_transformations_permanent (ds))
909 _("MATCH FILES may not be used after TEMPORARY when "
910 "the active file is an input source. "
911 "Temporary transformations will be made permanent."));
913 file->dict = dataset_dict (ds);
917 file->handle = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH);
918 if (file->handle == NULL)
921 file->reader = any_reader_open (file->handle, &file->dict);
922 if (file->reader == NULL)
925 case_create (&file->input, dict_get_next_value_idx (file->dict));
928 while (lex_match (lexer, '/'))
929 if (lex_match_id (lexer, "RENAME"))
931 if (!rename_variables (lexer, file->dict))
934 else if (lex_match_id (lexer, "IN"))
936 lex_match (lexer, '=');
937 if (lex_token (lexer) != T_ID)
939 lex_error (lexer, NULL);
943 if (file->in_name != NULL)
945 msg (SE, _("Multiple IN subcommands for a single FILE or "
949 file->in_name = xstrdup (lex_tokid (lexer));
954 mtf_merge_dictionary (mtf.dict, file);
957 while (lex_token (lexer) != '.')
959 if (lex_match (lexer, T_BY))
961 struct variable **by;
965 msg (SE, _("BY may appear at most once."));
969 lex_match (lexer, '=');
970 if (!parse_variables (lexer, mtf.dict, &by, &mtf.by_cnt,
971 PV_NO_DUPLICATE | PV_NO_SCRATCH))
974 for (iter = mtf.head; iter != NULL; iter = iter->next)
978 iter->by = xnmalloc (mtf.by_cnt, sizeof *iter->by);
980 for (i = 0; i < mtf.by_cnt; i++)
982 iter->by[i] = dict_lookup_var (iter->dict, by[i]->name);
983 if (iter->by[i] == NULL)
985 msg (SE, _("File %s lacks BY variable %s."),
986 iter->handle ? fh_get_name (iter->handle) : "*",
995 else if (lex_match_id (lexer, "FIRST"))
997 if (mtf.first[0] != '\0')
999 msg (SE, _("FIRST may appear at most once."));
1003 lex_match (lexer, '=');
1004 if (!lex_force_id (lexer))
1006 strcpy (mtf.first, lex_tokid (lexer));
1009 else if (lex_match_id (lexer, "LAST"))
1011 if (mtf.last[0] != '\0')
1013 msg (SE, _("LAST may appear at most once."));
1017 lex_match (lexer, '=');
1018 if (!lex_force_id (lexer))
1020 strcpy (mtf.last, lex_tokid (lexer));
1023 else if (lex_match_id (lexer, "MAP"))
1027 else if (lex_match_id (lexer, "DROP"))
1029 if (!drop_variables (lexer, mtf.dict))
1032 else if (lex_match_id (lexer, "KEEP"))
1034 if (!keep_variables (lexer, mtf.dict))
1039 lex_error (lexer, NULL);
1043 if (!lex_match (lexer, '/') && lex_token (lexer) != '.')
1045 lex_end_of_command (lexer);
1050 if (mtf.by_cnt == 0)
1054 msg (SE, _("BY is required when TABLE is specified."));
1059 msg (SE, _("BY is required when IN is specified."));
/* Set up mapping from each file's variables to master
variables. */
1066 for (iter = mtf.head; iter != NULL; iter = iter->next)
1068 struct dictionary *d = iter->dict;
1071 for (i = 0; i < dict_get_var_cnt (d); i++)
1073 struct variable *v = dict_get_var (d, i);
1074 struct variable *mv = dict_lookup_var (mtf.dict, v->name);
1080 /* Add IN variables to master dictionary. */
1081 for (iter = mtf.head; iter != NULL; iter = iter->next)
1082 if (iter->in_name != NULL)
1084 iter->in_var = dict_create_var (mtf.dict, iter->in_name, 0);
1085 if (iter->in_var == NULL)
1087 msg (SE, _("IN variable name %s duplicates an "
1088 "existing variable name."),
1089 iter->in_var->name);
1092 iter->in_var->print = iter->in_var->write
1093 = fmt_for_output (FMT_F, 1, 0);
/* MATCH FILES performs an n-way merge on all its input files.
The algorithm is:

1. Read one input record from every input FILE.

2. If no FILEs are left, stop. Otherwise, proceed to step 3.

3. Find the FILE input record(s) that have minimum BY
values. Store all the values from these input records into
the output record.

4. For every TABLE, read another record as long as the BY values
on the TABLE's input record are less than the FILEs' BY values.
If an exact match is found, store all the values from the TABLE
input record into the output record.

5. Write the output record.

6. Read another record from each input file FILE and TABLE that
we stored values from above. If we come to the end of one of the
input files, remove it from the list of input files.

7. Repeat from step 2.

Unfortunately, this algorithm can't be implemented in a
straightforward way because there's no function to read a
record from the active file. Instead, it has to be written
so that the active file's records are handed to mtf_processing
one at a time by procedure().

FIXME: For merging large numbers of files (more than 10?) a
better algorithm would use a heap for finding minimum
BY values. */
1129 if (!used_active_file)
1130 discard_variables (ds);
1132 dict_compact_values (mtf.dict);
1133 mtf.output = fastfile_create (dict_get_next_value_idx (mtf.dict));
1134 mtf.seq_nums = xcalloc (dict_get_var_cnt (mtf.dict), sizeof *mtf.seq_nums);
1135 case_create (&mtf.mtf_case, dict_get_next_value_idx (mtf.dict));
1137 if (!mtf_read_nonactive_records (&mtf))
1140 if (used_active_file)
1143 create_case_sink (&null_sink_class,
1144 dataset_dict (ds), NULL));
1146 ( procedure (ds, mtf_processing, &mtf) &&
1147 mtf_processing_finish (&mtf, ds) );
1150 ok = mtf_processing_finish (&mtf, ds);
1152 discard_variables (ds);
1154 dict_destroy (dataset_dict (ds));
1155 dataset_set_dict (ds, mtf.dict);
1157 proc_set_source (ds, storage_source_create (mtf.output));
1160 if (!mtf_free (&mtf))
1162 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
1166 return CMD_CASCADING_FAILURE;
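/* Removes the active file, if any, from the list of input files,
then repeats steps 2...7 of the MATCH FILES merge for as long as
a FILE remains at the head of the list. */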
1171 mtf_processing_finish (void *mtf_, const struct dataset *ds)
1173 struct mtf_proc *mtf = mtf_;
1174 struct mtf_file *iter;
1176 /* Find the active file and delete it. */
1177 for (iter = mtf->head; iter; iter = iter->next)
1178 if (iter->handle == NULL)
1180 if (!mtf_delete_file_in_place (mtf, &iter))
1185 while (mtf->head && mtf->head->type == MTF_FILE)
1186 if (!mtf_processing (NULL, mtf, ds))
/* Return a string in a static buffer describing V's variable type and
width. */
1195 var_type_description (struct variable *v)
1197 static char buf[2][32];
1204 if (v->type == NUMERIC)
1205 strcpy (s, "numeric");
1208 assert (v->type == ALPHA);
1209 sprintf (s, "string with width %d", v->width);
1214 /* Closes FILE and frees its associated data.
1215 Returns true if successful, false if an I/O error
1216 occurred on FILE. */
1218 mtf_close_file (struct mtf_file *file)
1220 bool ok = file->reader == NULL || !any_reader_error (file->reader);
1222 any_reader_close (file->reader);
1223 if (file->handle != NULL)
1224 dict_destroy (file->dict);
1225 case_destroy (&file->input);
1226 free (file->in_name);
/* Free all the data for the MATCH FILES procedure.
Returns true if successful, false if an I/O error
occurred. */
1235 mtf_free (struct mtf_proc *mtf)
1237 struct mtf_file *iter, *next;
1240 for (iter = mtf->head; iter; iter = next)
1243 assert (iter->dict != mtf->dict);
1244 if (!mtf_close_file (iter))
1249 dict_destroy (mtf->dict);
1250 case_destroy (&mtf->mtf_case);
1251 free (mtf->seq_nums);
/* Remove *FILE from the mtf_file chain. Make *FILE point to the next
file in the chain, or to NULL if it was the last in the chain.
Returns true if successful, false if an I/O error occurred. */
1260 mtf_delete_file_in_place (struct mtf_proc *mtf, struct mtf_file **file)
1262 struct mtf_file *f = *file;
1266 f->prev->next = f->next;
1268 f->next->prev = f->prev;
1270 mtf->head = f->next;
1272 mtf->tail = f->prev;
1275 if (f->in_var != NULL)
1276 case_data_rw (&mtf->mtf_case, f->in_var->fv)->f = 0.;
1277 for (i = 0; i < dict_get_var_cnt (f->dict); i++)
1279 struct variable *v = dict_get_var (f->dict, i);
1280 struct variable *mv = get_master (v);
1283 union value *out = case_data_rw (&mtf->mtf_case, mv->fv);
1285 if (v->type == NUMERIC)
1288 memset (out->s, ' ', v->width);
1292 return mtf_close_file (f);
1295 /* Read a record from every input file except the active file.
1296 Returns true if successful, false if an I/O error occurred. */
1298 mtf_read_nonactive_records (void *mtf_)
1300 struct mtf_proc *mtf = mtf_;
1301 struct mtf_file *iter, *next;
1304 for (iter = mtf->head; ok && iter != NULL; iter = next)
1307 if (iter->handle && !any_reader_read (iter->reader, &iter->input))
1308 if (!mtf_delete_file_in_place (mtf, &iter))
1314 /* Compare the BY variables for files A and B; return -1 if A < B, 0
1315 if A == B, 1 if A > B. */
1317 mtf_compare_BY_values (struct mtf_proc *mtf,
1318 struct mtf_file *a, struct mtf_file *b,
1319 const struct ccase *c)
1321 const struct ccase *ca = case_is_null (&a->input) ? c : &a->input;
1322 const struct ccase *cb = case_is_null (&b->input) ? c : &b->input;
1323 assert ((a == NULL) + (b == NULL) + (c == NULL) <= 1);
1324 return case_compare_2dict (ca, cb, a->by, b->by, mtf->by_cnt);
1327 /* Perform one iteration of steps 3...7 above.
1328 Returns true if successful, false if an I/O error occurred. */
1330 mtf_processing (const struct ccase *c, void *mtf_, const struct dataset *ds UNUSED)
1332 struct mtf_proc *mtf = mtf_;
1334 /* Do we need another record from the active file? */
1335 bool read_active_file;
1337 assert (mtf->head != NULL);
1338 if (mtf->head->type == MTF_TABLE)
1343 struct mtf_file *min_head, *min_tail; /* Files with minimum BY values. */
1344 struct mtf_file *max_head, *max_tail; /* Files with non-minimum BYs. */
1345 struct mtf_file *iter, *next;
1347 read_active_file = false;
1349 /* 3. Find the FILE input record(s) that have minimum BY
1350 values. Store all the values from these input records into
1351 the output record. */
1352 min_head = min_tail = mtf->head;
1353 max_head = max_tail = NULL;
1354 for (iter = mtf->head->next; iter && iter->type == MTF_FILE;
1357 int cmp = mtf_compare_BY_values (mtf, min_head, iter, c);
1361 max_tail = max_tail->next_min = iter;
1363 max_head = max_tail = iter;
1366 min_tail = min_tail->next_min = iter;
1371 max_tail->next_min = min_head;
1372 max_tail = min_tail;
1376 max_head = min_head;
1377 max_tail = min_tail;
1379 min_head = min_tail = iter;
1383 /* 4. For every TABLE, read another record as long as the BY
1384 values on the TABLE's input record are less than the FILEs'
1385 BY values. If an exact match is found, store all the values
1386 from the TABLE input record into the output record. */
1387 for (; iter != NULL; iter = next)
1389 assert (iter->type == MTF_TABLE);
1394 int cmp = mtf_compare_BY_values (mtf, min_head, iter, c);
1398 max_tail = max_tail->next_min = iter;
1400 max_head = max_tail = iter;
1403 min_tail = min_tail->next_min = iter;
1406 if (iter->handle == NULL)
1408 if (any_reader_read (iter->reader, &iter->input))
1410 if (!mtf_delete_file_in_place (mtf, &iter))
1417 /* Next sequence number. */
1420 /* Store data to all the records we are using. */
1422 min_tail->next_min = NULL;
1423 for (iter = min_head; iter; iter = iter->next_min)
1427 for (i = 0; i < dict_get_var_cnt (iter->dict); i++)
1429 struct variable *v = dict_get_var (iter->dict, i);
1430 struct variable *mv = get_master (v);
1432 if (mv != NULL && mtf->seq_nums[mv->index] != mtf->seq_num)
1434 const struct ccase *record
1435 = case_is_null (&iter->input) ? c : &iter->input;
1436 union value *out = case_data_rw (&mtf->mtf_case, mv->fv);
1438 mtf->seq_nums[mv->index] = mtf->seq_num;
1439 if (v->type == NUMERIC)
1440 out->f = case_num (record, v->fv);
1442 memcpy (out->s, case_str (record, v->fv), v->width);
1445 if (iter->in_var != NULL)
1446 case_data_rw (&mtf->mtf_case, iter->in_var->fv)->f = 1.;
1448 if (iter->type == MTF_FILE && iter->handle == NULL)
1449 read_active_file = true;
/* Store missing values to all the records we're not
using. */
1455 max_tail->next_min = NULL;
1456 for (iter = max_head; iter; iter = iter->next_min)
1460 for (i = 0; i < dict_get_var_cnt (iter->dict); i++)
1462 struct variable *v = dict_get_var (iter->dict, i);
1463 struct variable *mv = get_master (v);
1465 if (mv != NULL && mtf->seq_nums[mv->index] != mtf->seq_num)
1467 union value *out = case_data_rw (&mtf->mtf_case, mv->fv);
1468 mtf->seq_nums[mv->index] = mtf->seq_num;
1470 if (v->type == NUMERIC)
1473 memset (out->s, ' ', v->width);
1476 if (iter->in_var != NULL)
1477 case_data_rw (&mtf->mtf_case, iter->in_var->fv)->f = 0.;
1480 /* 5. Write the output record. */
1481 casefile_append (mtf->output, &mtf->mtf_case);
/* 6. Read another record from each input file FILE and TABLE
that we stored values from above. If we come to the end of
one of the input files, remove it from the list of input
files. */
1487 for (iter = min_head; iter && iter->type == MTF_FILE; iter = next)
1489 next = iter->next_min;
1490 if (iter->reader != NULL
1491 && !any_reader_read (iter->reader, &iter->input))
1492 if (!mtf_delete_file_in_place (mtf, &iter))
1496 while (!read_active_file
1497 && mtf->head != NULL && mtf->head->type == MTF_FILE);
1502 /* Merge the dictionary for file F into master dictionary M. */
/* Three things are merged, in this order: the file label, the documents,
   and then each of F's variables against the variable of the same name
   in M.
   NOTE(review): the return type, the braces and several branch conditions
   of this function are not visible in this view; the comments below
   describe only the statements that are visible and mark every guess. */
1504 mtf_merge_dictionary (struct dictionary *const m, struct mtf_file *f)
1506 struct dictionary *d = f->dict;
1507 const char *d_docs, *m_docs;
/* M keeps its own file label if it has one; otherwise it takes D's. */
1510 if (dict_get_label (m) == NULL)
1511 dict_set_label (m, dict_get_label (d));
1513 d_docs = dict_get_documents (d);
1514 m_docs = dict_get_documents (m);
/* D's documents are installed in M unchanged.
   NOTE(review): the condition selecting this branch is not visible;
   presumably it is taken when M has no documents of its own. */
1518 dict_set_documents (m, d_docs);
/* Build "M's documents followed by D's" in a fresh buffer; the + 1 leaves
   room for the terminating null written by strcpy/strcat.
   NOTE(review): no release of new_docs is visible in this view; confirm
   whether dict_set_documents copies its argument and who frees the buffer. */
1524 new_len = strlen (m_docs) + strlen (d_docs);
1525 new_docs = xmalloc (new_len + 1);
1526 strcpy (new_docs, m_docs);
1527 strcat (new_docs, d_docs);
1528 dict_set_documents (m, new_docs);
/* Pair each variable DV of D with MV, the result of looking up DV's name
   in M. */
1533 for (i = 0; i < dict_get_var_cnt (d); i++)
1535 struct variable *dv = dict_get_var (d, i);
1536 struct variable *mv = dict_lookup_var (m, dv->name);
/* NOTE(review): the statement controlled by this test is not visible;
   presumably scratch variables are skipped. */
1538 if (dict_class_from_id (dv->name) == DC_SCRATCH)
/* A width mismatch between DV and MV is reported with an SE-class message
   that names the variable, the file, and both type descriptions. */
1543 if (mv->width != dv->width)
1545 msg (SE, _("Variable %s in file %s (%s) has different "
1546 "type or width from the same variable in "
1547 "earlier file (%s)."),
1548 dv->name, fh_get_name (f->handle),
1549 var_type_description (dv), var_type_description (mv));
/* With equal widths, MV inherits DV's value labels when DV has some and MV
   has none, and DV's missing values when DV's are non-empty and MV's are
   empty. */
1553 if (dv->width == mv->width)
1555 if (val_labs_count (dv->val_labs)
1556 && !val_labs_count (mv->val_labs))
1558 val_labs_destroy (mv->val_labs);
1559 mv->val_labs = val_labs_copy (dv->val_labs);
1561 if (!mv_is_empty (&dv->miss) && mv_is_empty (&mv->miss))
1562 mv_copy (&mv->miss, &dv->miss);
/* MV receives its own copy of DV's variable label if it lacks one. */
1565 if (dv->label && !mv->label)
1566 mv->label = xstrdup (dv->label);
/* Clone DV into M under the same name.
   NOTE(review): the governing condition is not visible; presumably this is
   the branch taken when M has no variable with DV's name. */
1569 mv = dict_clone_var_assert (m, dv, dv->name);
1575 /* Marks V's master variable as MASTER. */
/* The association is recorded by attaching MASTER as V's aux data.  The
   third argument to var_attach_aux is null here, whereas start_case_map
   passes var_dtor_free in that position; presumably it is a destructor
   and none is wanted because V does not own MASTER -- confirm against
   var_attach_aux. */
1577 set_master (struct variable *v, struct variable *master)
1579 var_attach_aux (v, master, NULL);
1582 /* Returns the master variable corresponding to V,
1583 as set with set_master(). */
/* NOTE(review): the body of this function is not visible in this view.
   set_master() stores MASTER as V's aux data through var_attach_aux(), so
   presumably this reads that aux pointer back; confirm against the full
   source. */
1584 static struct variable *
1585 get_master (struct variable *v)
1594 A case map copies data from a case that corresponds to one
1595 dictionary to a case that corresponds to a second dictionary
1596 derived from the first by, optionally, deleting, reordering,
1597 or renaming variables. (No new variables may be created.)
/* Fields of the case map.  finish_case_map() asserts that a slot of `map'
   holds -1 before it stores a source index there, and it tests trailing
   slots against -1, so -1 denotes a destination index that has no source
   assigned. */
1603 size_t value_cnt; /* Number of values in map. */
1604 int *map; /* For each destination index, the
1605 corresponding source index. */
1608 /* Prepares dictionary D for producing a case map. Afterward,
1609 the caller may delete, reorder, or rename variables within D
1610 at will before using finish_case_map() to produce the case
1613 Uses D's aux members, which must otherwise not be in use. */
1615 start_case_map (struct dictionary *d)
1617 size_t var_cnt = dict_get_var_cnt (d);
/* Every variable in D gets its own heap-allocated int, attached as aux
   data with var_dtor_free as the third argument.
   NOTE(review): the line that stores a value into *src_fv is not visible
   in this view.  finish_case_map() later compares *src_fv with v->fv, so
   presumably the variable's current fv is recorded here; confirm. */
1620 for (i = 0; i < var_cnt; i++)
1622 struct variable *v = dict_get_var (d, i);
1623 int *src_fv = xmalloc (sizeof *src_fv);
1625 var_attach_aux (v, src_fv, var_dtor_free);
1629 /* Produces a case map from dictionary D, which must have been
1630 previously prepared with start_case_map().
1632 Does not retain any reference to D, and clears the aux members
1633 set up by start_case_map().
1635 Returns the new case map, or a null pointer if no mapping is
1636 required (that is, no data has changed position). */
1637 static struct case_map *
1638 finish_case_map (struct dictionary *d)
1640 struct case_map *map;
1641 size_t var_cnt = dict_get_var_cnt (d);
/* The map has one slot per value index currently in use by D, as reported
   by dict_get_next_value_idx. */
1645 map = xmalloc (sizeof *map);
1646 map->value_cnt = dict_get_next_value_idx (d);
1647 map->map = xnmalloc (map->value_cnt, sizeof *map->map);
/* NOTE(review): the body of this loop is not visible.  The assert further
   down expects every slot to hold -1 before it is filled, so presumably
   each slot is set to -1 here. */
1648 for (i = 0; i < map->value_cnt; i++)
1652 for (i = 0; i < var_cnt; i++)
1654 struct variable *v = dict_get_var (d, i);
/* Take back the int that start_case_map attached to this variable. */
1655 int *src_fv = (int *) var_detach_aux (v);
/* NOTE(review): the statement controlled by this test is not visible;
   presumably a variable whose fv differs from the recorded one marks the
   map as needed (not an identity mapping). */
1658 if (v->fv != *src_fv)
/* Each of the variable's nv values maps from (recorded index + idx) to
   (current fv + idx).  The assert checks that no destination slot is
   assigned twice. */
1661 for (idx = 0; idx < v->nv; idx++)
1663 int src_idx = *src_fv + idx;
1664 int dst_idx = v->fv + idx;
1666 assert (map->map[dst_idx] == -1);
1667 map->map[dst_idx] = src_idx;
/* NOTE(review): the condition guarding this call is not visible; given the
   header comment, presumably the map is destroyed and a null pointer
   returned when no data changed position. */
1674 destroy_case_map (map);
/* Examines trailing slots that still hold -1.
   NOTE(review): the loop body is not visible; presumably value_cnt is
   decremented so that unused trailing slots are dropped. */
1678 while (map->value_cnt > 0 && map->map[map->value_cnt - 1] == -1)
1684 /* Maps from SRC to DST, applying case map MAP. */
/* MAP, SRC and DST must all be non-null, and SRC and DST must be distinct
   cases; each of these is checked with assert. */
1686 map_case (const struct case_map *map,
1687 const struct ccase *src, struct ccase *dst)
1691 assert (map != NULL);
1692 assert (src != NULL);
1693 assert (dst != NULL);
1694 assert (src != dst);
/* For each destination index below MAP's value_cnt, look up the source
   index and copy that whole value from SRC into DST.
   NOTE(review): the line between the lookup and the copy is not visible
   in this view.  finish_case_map() uses -1 for slots with no source, so
   confirm that -1 entries are skipped before the copy is made. */
1696 for (dst_idx = 0; dst_idx < map->value_cnt; dst_idx++)
1698 int src_idx = map->map[dst_idx];
1700 *case_data_rw (dst, dst_idx) = *case_data (src, src_idx);
1704 /* Destroys case map MAP. */
1706 destroy_case_map (struct case_map *map)