1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
24 #include <data/any-reader.h>
25 #include <data/any-writer.h>
26 #include <data/case-sink.h>
27 #include <data/case-source.h>
28 #include <data/case.h>
29 #include <data/casefile.h>
30 #include <data/fastfile.h>
31 #include <data/dictionary.h>
32 #include <data/por-file-writer.h>
33 #include <data/procedure.h>
34 #include <data/settings.h>
35 #include <data/storage-stream.h>
36 #include <data/sys-file-writer.h>
37 #include <data/transformations.h>
38 #include <data/value-labels.h>
39 #include <data/variable.h>
40 #include <language/command.h>
41 #include <language/data-io/file-handle.h>
42 #include <language/lexer/lexer.h>
43 #include <language/lexer/variable-parser.h>
44 #include <libpspp/alloc.h>
45 #include <libpspp/assertion.h>
46 #include <libpspp/compiler.h>
47 #include <libpspp/hash.h>
48 #include <libpspp/message.h>
49 #include <libpspp/message.h>
50 #include <libpspp/misc.h>
51 #include <libpspp/str.h>
54 #define _(msgid) gettext (msgid)
56 /* Rearranging and reducing a dictionary. */
57 static void start_case_map (struct dictionary *);
58 static struct case_map *finish_case_map (struct dictionary *);
59 static void map_case (const struct case_map *,
60 const struct ccase *, struct ccase *);
61 static void destroy_case_map (struct case_map *);
63 static bool parse_dict_trim (struct dictionary *);
65 /* Reading system and portable files. */
67 /* Type of command. */
74 /* Case reader input program.
   State for the case source created by parse_read_command().  When MAP
   is null, cases are read directly into the destination case; otherwise
   each case is read into BOUNCE and copied to the destination through
   map_case() (see case_reader_source_read()). */
75 struct case_reader_pgm
77 struct any_reader *reader; /* File reader. */
78 struct case_map *map; /* Map from file dict to active file dict. */
79 struct ccase bounce; /* Bounce buffer. */
82 static const struct case_source_class case_reader_source_class;
84 static void case_reader_pgm_free (struct case_reader_pgm *);
86 /* Parses a GET or IMPORT command, selected by TYPE, for dataset DS.
   The visible syntax is a FILE subcommand (or a bare string token)
   naming the input, plus, for IMPORT only, a TYPE subcommand taking
   COMM or TAPE.  The file is opened with any_reader_open(), which
   also yields its dictionary; dictionary-trimming subcommands are
   then applied and the result becomes DS's dictionary.
   The failure path visible at the end frees PGM and returns
   CMD_CASCADING_FAILURE. */
88 parse_read_command (struct dataset *ds, enum reader_command type)
90 struct case_reader_pgm *pgm = NULL;
91 struct file_handle *fh = NULL;
92 struct dictionary *dict = NULL;
98 if (lex_match_id ("FILE") || token == T_STRING)
102 fh = fh_parse (FH_REF_FILE | FH_REF_SCRATCH);
106 else if (type == IMPORT_CMD && lex_match_id ("TYPE"))
110 if (lex_match_id ("COMM"))
112 else if (lex_match_id ("TAPE"))
116 lex_error (_("expecting COMM or TAPE"));
126 lex_sbc_missing ("FILE");
130 discard_variables (ds);
132 pgm = xmalloc (sizeof *pgm);
133 pgm->reader = any_reader_open (fh, &dict);
135 case_nullify (&pgm->bounce);
136 if (pgm->reader == NULL)
/* The bounce case is sized for the file's dictionary as read, before
   any trimming. */
139 case_create (&pgm->bounce, dict_get_next_value_idx (dict));
/* Everything between start_case_map() and finish_case_map() may
   delete, reorder, or rename variables; finish_case_map() then
   produces the map from file layout to trimmed layout. */
141 start_case_map (dict);
146 if (!parse_dict_trim (dict))
150 pgm->map = finish_case_map (dict);
/* Replace the dataset's dictionary with the one read from the file. */
152 dict_destroy (dataset_dict (ds));
153 dataset_set_dict (ds, dict);
156 create_case_source (&case_reader_source_class, pgm));
161 case_reader_pgm_free (pgm);
164 return CMD_CASCADING_FAILURE;
167 /* Frees a struct case_reader_pgm: closes PGM's reader, destroys its
   case map, and destroys its bounce case. */
169 case_reader_pgm_free (struct case_reader_pgm *pgm)
173 any_reader_close (pgm->reader);
174 destroy_case_map (pgm->map);
175 case_destroy (&pgm->bounce);
180 /* Clears internal state related to case reader input procedure.
   SOURCE's aux pointer is the struct case_reader_pgm, which is freed
   with case_reader_pgm_free(). */
182 case_reader_source_destroy (struct case_source *source)
184 struct case_reader_pgm *pgm = source->aux;
185 case_reader_pgm_free (pgm);
188 /* Reads all the cases from the data file into C and passes them
189 to WRITE_CASE one by one, passing WC_DATA.
190 Returns true if successful, false if an I/O error occurred. */
192 case_reader_source_read (struct case_source *source,
194 write_case_func *write_case, write_case_data wc_data)
196 struct case_reader_pgm *pgm = source->aux;
/* Without a case map the reader fills C directly; with one, the case
   is read into the bounce buffer and then remapped into C. */
202 if (pgm->map == NULL)
203 got_case = any_reader_read (pgm->reader, c);
206 got_case = any_reader_read (pgm->reader, &pgm->bounce);
208 map_case (pgm->map, &pgm->bounce, c);
213 ok = write_case (wc_data);
/* Success requires both that WRITE_CASE succeeded and that the reader
   recorded no error. */
217 return ok && !any_reader_error (pgm->reader);
/* Case source class used by parse_read_command(); the members visible
   here are its read and destroy callbacks. */
220 static const struct case_source_class case_reader_source_class =
224 case_reader_source_read,
225 case_reader_source_destroy,
/* GET command: parses and executes it via parse_read_command() with
   GET_CMD, returning that function's result. */
230 cmd_get (struct dataset *ds)
232 return parse_read_command (ds, GET_CMD);
/* IMPORT command: parses and executes it via parse_read_command() with
   IMPORT_CMD, returning that function's result. */
237 cmd_import (struct dataset *ds)
239 return parse_read_command (ds, IMPORT_CMD);
242 /* Writing system and portable files. */
244 /* Type of output file. */
247 SYSFILE_WRITER, /* System file. */
248 PORFILE_WRITER /* Portable file. */
251 /* Type of a command. */
254 XFORM_CMD, /* Transformation. */
255 PROC_CMD /* Procedure. */
258 /* File writer plus a case map. */
261 struct any_writer *writer; /* File writer. */
262 struct case_map *map; /* Map to output file dictionary
263 (null pointer for identity mapping). */
264 struct ccase bounce; /* Bounce buffer for mapping (if needed). */
/* Destroys case writer AW: closes the underlying file writer, keeping
   any_writer_close()'s result in OK, then destroys AW's case map and
   bounce case.
   NOTE(review): callers use the return value as a success flag (see
   output_trns_free()); presumably OK is what is returned -- confirm. */
269 case_writer_destroy (struct case_writer *aw)
274 ok = any_writer_close (aw->writer);
275 destroy_case_map (aw->map);
276 case_destroy (&aw->bounce);
282 /* Parses SAVE or XSAVE or EXPORT or XEXPORT command.
283 WRITER_TYPE identifies the type of file to write,
284 and COMMAND_TYPE identifies the type of command.
286 On success, returns a writer.
287 For procedures only, sets *RETAIN_UNSELECTED to true if cases
288 that would otherwise be excluded by FILTER or USE should be
retained (the UNSELECTED subcommand; RETAIN is the default).
RETAIN_UNSELECTED must be non-null exactly when COMMAND_TYPE is
PROC_CMD, as asserted below.
291 On failure, returns a null pointer. */
292 static struct case_writer *
293 parse_write_command (struct dataset *ds,
294 enum writer_type writer_type,
295 enum command_type command_type,
296 bool *retain_unselected)
299 struct file_handle *handle; /* Output file. */
300 struct dictionary *dict; /* Dictionary for output file. */
301 struct case_writer *aw; /* Writer. */
303 /* Common options. */
304 bool print_map; /* Print map? TODO. */
305 bool print_short_names; /* Print long-to-short name map. TODO. */
306 struct sfm_write_options sysfile_opts;
307 struct pfm_write_options porfile_opts;
309 assert (writer_type == SYSFILE_WRITER || writer_type == PORFILE_WRITER);
310 assert (command_type == XFORM_CMD || command_type == PROC_CMD);
311 assert ((retain_unselected != NULL) == (command_type == PROC_CMD));
313 if (command_type == PROC_CMD)
314 *retain_unselected = true;
/* Work on a clone so the dataset's own dictionary is not modified. */
317 dict = dict_clone (dataset_dict (ds));
318 aw = xmalloc (sizeof *aw);
321 case_nullify (&aw->bounce);
323 print_short_names = false;
324 sysfile_opts = sfm_writer_default_options ();
325 porfile_opts = pfm_writer_default_options ();
/* Scratch variables are removed from the output dictionary after
   start_case_map(), so the case map accounts for their removal. */
327 start_case_map (dict);
328 dict_delete_scratch_vars (dict);
333 if (lex_match_id ("OUTFILE"))
337 lex_sbc_only_once ("OUTFILE");
343 handle = fh_parse (FH_REF_FILE | FH_REF_SCRATCH);
347 else if (lex_match_id ("NAMES"))
348 print_short_names = true;
349 else if (lex_match_id ("PERMISSIONS"))
354 if (lex_match_id ("READONLY"))
356 else if (lex_match_id ("WRITEABLE"))
360 lex_error (_("expecting %s or %s"), "READONLY", "WRITEABLE");
/* PERMISSIONS applies to both system-file and portable-file options. */
363 sysfile_opts.create_writeable = porfile_opts.create_writeable = cw;
365 else if (command_type == PROC_CMD && lex_match_id ("UNSELECTED"))
368 if (lex_match_id ("RETAIN"))
369 *retain_unselected = true;
370 else if (lex_match_id ("DELETE"))
371 *retain_unselected = false;
374 lex_error (_("expecting %s or %s"), "RETAIN", "DELETE");
/* COMPRESSED, UNCOMPRESSED and VERSION are accepted only for system
   files; TYPE and DIGITS only for portable files. */
378 else if (writer_type == SYSFILE_WRITER && lex_match_id ("COMPRESSED"))
379 sysfile_opts.compress = true;
380 else if (writer_type == SYSFILE_WRITER && lex_match_id ("UNCOMPRESSED"))
381 sysfile_opts.compress = false;
382 else if (writer_type == SYSFILE_WRITER && lex_match_id ("VERSION"))
385 if (!lex_force_int ())
387 sysfile_opts.version = lex_integer ();
390 else if (writer_type == PORFILE_WRITER && lex_match_id ("TYPE"))
393 if (lex_match_id ("COMMUNICATIONS"))
394 porfile_opts.type = PFM_COMM;
395 else if (lex_match_id ("TAPE"))
396 porfile_opts.type = PFM_TAPE;
399 lex_error (_("expecting %s or %s"), "COMM", "TAPE");
403 else if (writer_type == PORFILE_WRITER && lex_match_id ("DIGITS"))
406 if (!lex_force_int ())
408 porfile_opts.digits = lex_integer ();
/* Anything else must be a dictionary-trimming subcommand. */
411 else if (!parse_dict_trim (dict))
414 if (!lex_match ('/'))
417 if (lex_end_of_command () != CMD_SUCCESS)
422 lex_sbc_missing ("OUTFILE");
/* Compact the trimmed dictionary, derive the case map from it, and
   size the bounce case for the output layout. */
426 dict_compact_values (dict);
427 aw->map = finish_case_map (dict);
429 case_create (&aw->bounce, dict_get_next_value_idx (dict));
/* For an FH_REF_FILE handle a system-file or portable-file writer is
   opened with the parsed options; any_writer_open() is the other
   path (presumably for scratch handles -- confirm). */
431 if (fh_get_referent (handle) == FH_REF_FILE)
436 aw->writer = any_writer_from_sfm_writer (
437 sfm_open_writer (handle, dict, sysfile_opts));
440 aw->writer = any_writer_from_pfm_writer (
441 pfm_open_writer (handle, dict, porfile_opts));
446 aw->writer = any_writer_open (handle, dict);
447 if (aw->writer == NULL)
454 case_writer_destroy (aw);
459 /* Writes case C to writer AW.
   The map_case() call copies C into AW's bounce case through AW's
   case map; the value returned is any_writer_write()'s result.
   NOTE(review): map_case() asserts a non-null map, so the call is
   presumably guarded by a test of AW->map not shown here -- confirm. */
461 case_writer_write_case (struct case_writer *aw, const struct ccase *c)
465 map_case (aw->map, c, &aw->bounce);
468 return any_writer_write (aw->writer, c);
471 /* SAVE and EXPORT. */
473 static bool output_proc (const struct ccase *, void *, const struct dataset *);
475 /* Parses and performs the SAVE or EXPORT procedure for dataset DS,
   writing a file of type WRITER_TYPE.
   Returns CMD_SUCCESS if every case was written and the output file
   was closed without error, otherwise CMD_CASCADING_FAILURE. */
477 parse_output_proc (struct dataset *ds, enum writer_type writer_type)
479 bool retain_unselected;
480 struct variable *saved_filter_variable;
481 struct case_writer *aw;
484 aw = parse_write_command (ds, writer_type, PROC_CMD, &retain_unselected);
486 return CMD_CASCADING_FAILURE;
/* To retain unselected cases, the filter variable is cleared for the
   duration of the procedure and restored afterward. */
488 saved_filter_variable = dict_get_filter (dataset_dict (ds));
489 if (retain_unselected)
490 dict_set_filter (dataset_dict (ds), NULL);
491 ok = procedure (ds, output_proc, aw);
492 dict_set_filter (dataset_dict (ds), saved_filter_variable);
/* The writer is always destroyed; a failure while closing it (as
   reported by case_writer_destroy()) also fails the command, matching
   output_trns_free(). */
494 ok = case_writer_destroy (aw) && ok;
495 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
498 /* Writes case C to file.
   Callback passed to procedure() by parse_output_proc(); AW_ is the
   struct case_writer to write to.  Returns the result of
   case_writer_write_case(). */
500 output_proc (const struct ccase *c, void *aw_, const struct dataset *ds UNUSED)
502 struct case_writer *aw = aw_;
503 return case_writer_write_case (aw, c);
/* SAVE command: runs parse_output_proc() with a system-file writer. */
507 cmd_save (struct dataset *ds)
509 return parse_output_proc (ds, SYSFILE_WRITER);
/* EXPORT command: runs parse_output_proc() with a portable-file
   writer. */
513 cmd_export (struct dataset *ds)
515 return parse_output_proc (ds, PORFILE_WRITER);
518 /* XSAVE and XEXPORT. */
520 /* Transformation. */
523 struct case_writer *aw; /* Writer. */
526 static trns_proc_func output_trns_proc;
527 static trns_free_func output_trns_free;
529 /* Parses the XSAVE or XEXPORT transformation command.
   Allocates a struct output_trns, parses the command into a case
   writer with parse_write_command() (as an XFORM_CMD, so no
   RETAIN_UNSELECTED output is requested), and registers the
   transformation on DS with output_trns_proc() and output_trns_free()
   as its callbacks.  CMD_CASCADING_FAILURE is the failure result. */
531 parse_output_trns (struct dataset *ds, enum writer_type writer_type)
533 struct output_trns *t = xmalloc (sizeof *t);
534 t->aw = parse_write_command (ds, writer_type, XFORM_CMD, NULL);
538 return CMD_CASCADING_FAILURE;
541 add_transformation (ds, output_trns_proc, output_trns_free, t);
545 /* Writes case C to the system file specified on XSAVE or XEXPORT.
   TRNS_ is the struct output_trns registered by parse_output_trns().
   Always returns TRNS_CONTINUE.
   NOTE(review): the result of case_writer_write_case() is not
   examined here, so a write failure does not stop the transformation
   chain -- confirm this is intended. */
547 output_trns_proc (void *trns_, struct ccase *c, casenumber case_num UNUSED)
549 struct output_trns *t = trns_;
550 case_writer_write_case (t->aw, c);
551 return TRNS_CONTINUE;
554 /* Frees an XSAVE or XEXPORT transformation.
555 Returns true if successful, false if an I/O error occurred.
   The success flag comes from case_writer_destroy() on the
   transformation's case writer. */
557 output_trns_free (void *trns_)
559 struct output_trns *t = trns_;
564 ok = case_writer_destroy (t->aw);
/* XSAVE command: runs parse_output_trns() with a system-file writer. */
572 cmd_xsave (struct dataset *ds)
574 return parse_output_trns (ds, SYSFILE_WRITER);
577 /* XEXPORT command: runs parse_output_trns() with a portable-file
   writer. */
579 cmd_xexport (struct dataset *ds)
581 return parse_output_trns (ds, PORFILE_WRITER);
584 static bool rename_variables (struct dictionary *dict);
585 static bool drop_variables (struct dictionary *dict);
586 static bool keep_variables (struct dictionary *dict);
588 /* Commands that read and write system files share a great deal
589 of common syntactic structure for rearranging and dropping
590 variables. This function parses this syntax and modifies DICT
591 appropriately. Returns true on success, false on failure.
   The subcommands recognized are MAP, DROP, KEEP and RENAME; DROP,
   KEEP and RENAME are delegated to drop_variables(),
   keep_variables() and rename_variables() respectively.  Any other
   token is reported with lex_error(). */
593 parse_dict_trim (struct dictionary *dict)
595 if (lex_match_id ("MAP"))
600 else if (lex_match_id ("DROP"))
601 return drop_variables (dict);
602 else if (lex_match_id ("KEEP"))
603 return keep_variables (dict);
604 else if (lex_match_id ("RENAME"))
605 return rename_variables (dict);
608 lex_error (_("expecting a valid subcommand"));
613 /* Parses and performs the RENAME subcommand of GET and SAVE.
   Two forms are visible.  The first renames a single variable
   (OLD=NEW) with dict_rename_var() and is rejected if NEW already
   names a variable in DICT.  The second consists of one or more
   parenthesized groups, each a list of existing variables, `=', and
   an equally long list of new names; all groups are applied together
   with dict_rename_vars(), which is what allows overlapping renames
   such as (A=B)(B=C)(C=A). */
615 rename_variables (struct dictionary *dict)
633 v = parse_variable (dict);
636 if (!lex_force_match ('=')
639 if (dict_lookup_var (dict, tokid) != NULL)
641 msg (SE, _("Cannot rename %s as %s because there already exists "
642 "a variable named %s. To rename variables with "
643 "overlapping names, use a single RENAME subcommand "
644 "such as \"/RENAME (A=B)(B=C)(C=A)\", or equivalently, "
645 "\"/RENAME (A B C=B C A)\"."), v->name, tokid, tokid);
649 dict_rename_var (dict, v, tokid);
/* Parenthesized form: PV_APPEND accumulates the old variables in V and
   the new names in NEW_NAMES across all groups. */
658 while (lex_match ('('))
662 if (!parse_variables (dict, &v, &nv, PV_NO_DUPLICATE | PV_APPEND))
664 if (!lex_match ('='))
666 msg (SE, _("`=' expected after variable list."));
669 if (!parse_DATA_LIST_vars (&new_names, &nn, PV_APPEND | PV_NO_SCRATCH))
/* NOTE(review): the counts are cast to unsigned but formatted with %d,
   and the right-hand count is computed relative to OLD_NV; this is
   only right if both lists had equal length before this group --
   confirm.  The format string is a translatable msgid, so changing it
   affects translations. */
673 msg (SE, _("Number of variables on left side of `=' (%d) does not "
674 "match number of variables on right side (%d), in "
675 "parenthesized group %d of RENAME subcommand."),
676 (unsigned) (nv - old_nv), (unsigned) (nn - old_nv), group);
679 if (!lex_force_match (')'))
684 if (!dict_rename_vars (dict, v, new_names, nv, &err_name))
686 msg (SE, _("Requested renaming duplicates variable name %s."), err_name);
692 for (i = 0; i < nn; i++)
700 /* Parses and performs the DROP subcommand of GET and SAVE.
701 Returns true if successful, false on failure.
   The parsed variables are deleted from DICT; dropping every variable
   is reported as an error. */
703 drop_variables (struct dictionary *dict)
709 if (!parse_variables (dict, &v, &nv, PV_NONE))
711 dict_delete_vars (dict, v, nv);
714 if (dict_get_var_cnt (dict) == 0)
716 msg (SE, _("Cannot DROP all variables from dictionary."));
722 /* Parses and performs the KEEP subcommand of GET and SAVE.
723 Returns true if successful, false on failure.
   The kept variables are moved to the front of DICT in the order
   given, and every variable after them is deleted. */
725 keep_variables (struct dictionary *dict)
732 if (!parse_variables (dict, &v, &nv, PV_NONE))
735 /* Move the specified variables to the beginning. */
736 dict_reorder_vars (dict, v, nv);
738 /* Delete the remaining variables.
   V is reused to hold them: it is resized to the number of variables
   beyond the first NV and refilled from DICT positions NV onward. */
739 v = xnrealloc (v, dict_get_var_cnt (dict) - nv, sizeof *v);
740 for (i = nv; i < dict_get_var_cnt (dict); i++)
741 v[i - nv] = dict_get_var (dict, i);
742 dict_delete_vars (dict, v, dict_get_var_cnt (dict) - nv);
753 MTF_FILE, /* Specified on FILE= subcommand. */
754 MTF_TABLE /* Specified on TABLE= subcommand. */
757 /* One of the files on MATCH FILES. */
760 struct mtf_file *next, *prev; /* Next, previous in the list of files. */
761 struct mtf_file *next_min; /* Next in the chain of minimums. */
763 int type; /* One of MTF_*. */
764 struct variable **by; /* List of BY variables for this file. */
765 struct file_handle *handle; /* File handle. */
766 struct any_reader *reader; /* File reader. */
767 struct dictionary *dict; /* Dictionary from system file. */
770 char *in_name; /* Variable name. */
771 struct variable *in_var; /* Variable (in master dictionary). */
773 struct ccase input; /* Input record. */
776 /* MATCH FILES procedure. */
779 struct mtf_file *head; /* First file mentioned on FILE or TABLE. */
780 struct mtf_file *tail; /* Last file mentioned on FILE or TABLE. */
782 bool ok; /* False if I/O error occurs. */
784 size_t by_cnt; /* Number of variables on BY subcommand. */
786 /* Names of FIRST, LAST variables. */
787 char first[LONG_NAME_LEN + 1], last[LONG_NAME_LEN + 1];
789 struct dictionary *dict; /* Dictionary of output file. */
790 struct casefile *output; /* MATCH FILES output. */
791 struct ccase mtf_case; /* Case used for output. */
793 unsigned seq_num; /* Have we initialized this variable? */
794 unsigned *seq_nums; /* Sequence numbers for each var in dict. */
797 static bool mtf_free (struct mtf_proc *);
798 static bool mtf_close_file (struct mtf_file *);
799 static int mtf_merge_dictionary (struct dictionary *const, struct mtf_file *);
800 static bool mtf_delete_file_in_place (struct mtf_proc *, struct mtf_file **);
802 static bool mtf_read_nonactive_records (void *);
803 static bool mtf_processing_finish (void *, const struct dataset *);
804 static bool mtf_processing (const struct ccase *, void *, const struct dataset *);
806 static char *var_type_description (struct variable *);
808 static void set_master (struct variable *, struct variable *master);
809 static struct variable *get_master (struct variable *);
811 /* Parse and execute the MATCH FILES command. */
813 cmd_match_files (struct dataset *ds)
816 struct mtf_file *first_table = NULL;
817 struct mtf_file *iter;
819 bool used_active_file = false;
820 bool saw_table = false;
825 mtf.head = mtf.tail = NULL;
829 mtf.dict = dict_create ();
831 case_nullify (&mtf.mtf_case);
834 dict_set_case_limit (mtf.dict, dict_get_case_limit (dataset_dict (ds)));
838 && (lex_id_match ("FILE", tokid) || lex_id_match ("TABLE", tokid)))
840 struct mtf_file *file = xmalloc (sizeof *file);
842 if (lex_match_id ("FILE"))
843 file->type = MTF_FILE;
844 else if (lex_match_id ("TABLE"))
846 file->type = MTF_TABLE;
857 file->in_name = NULL;
859 case_nullify (&file->input);
861 /* FILEs go first, then TABLEs. */
862 if (file->type == MTF_TABLE || first_table == NULL)
865 file->prev = mtf.tail;
867 mtf.tail->next = file;
869 if (mtf.head == NULL)
871 if (file->type == MTF_TABLE && first_table == NULL)
876 assert (file->type == MTF_FILE);
877 file->next = first_table;
878 file->prev = first_table->prev;
879 if (first_table->prev)
880 first_table->prev->next = file;
883 first_table->prev = file;
891 if (used_active_file)
893 msg (SE, _("The active file may not be specified more "
897 used_active_file = true;
899 if (!proc_has_source (ds))
901 msg (SE, _("Cannot specify the active file since no active "
902 "file has been defined."));
906 if (proc_make_temporary_transformations_permanent (ds))
908 _("MATCH FILES may not be used after TEMPORARY when "
909 "the active file is an input source. "
910 "Temporary transformations will be made permanent."));
912 file->dict = dataset_dict (ds);
916 file->handle = fh_parse (FH_REF_FILE | FH_REF_SCRATCH);
917 if (file->handle == NULL)
920 file->reader = any_reader_open (file->handle, &file->dict);
921 if (file->reader == NULL)
924 case_create (&file->input, dict_get_next_value_idx (file->dict));
927 while (lex_match ('/'))
928 if (lex_match_id ("RENAME"))
930 if (!rename_variables (file->dict))
933 else if (lex_match_id ("IN"))
942 if (file->in_name != NULL)
944 msg (SE, _("Multiple IN subcommands for a single FILE or "
948 file->in_name = xstrdup (tokid);
953 mtf_merge_dictionary (mtf.dict, file);
958 if (lex_match (T_BY))
960 struct variable **by;
964 msg (SE, _("BY may appear at most once."));
969 if (!parse_variables (mtf.dict, &by, &mtf.by_cnt,
970 PV_NO_DUPLICATE | PV_NO_SCRATCH))
973 for (iter = mtf.head; iter != NULL; iter = iter->next)
977 iter->by = xnmalloc (mtf.by_cnt, sizeof *iter->by);
979 for (i = 0; i < mtf.by_cnt; i++)
981 iter->by[i] = dict_lookup_var (iter->dict, by[i]->name);
982 if (iter->by[i] == NULL)
984 msg (SE, _("File %s lacks BY variable %s."),
985 iter->handle ? fh_get_name (iter->handle) : "*",
994 else if (lex_match_id ("FIRST"))
996 if (mtf.first[0] != '\0')
998 msg (SE, _("FIRST may appear at most once."));
1003 if (!lex_force_id ())
1005 strcpy (mtf.first, tokid);
1008 else if (lex_match_id ("LAST"))
1010 if (mtf.last[0] != '\0')
1012 msg (SE, _("LAST may appear at most once."));
1017 if (!lex_force_id ())
1019 strcpy (mtf.last, tokid);
1022 else if (lex_match_id ("MAP"))
1026 else if (lex_match_id ("DROP"))
1028 if (!drop_variables (mtf.dict))
1031 else if (lex_match_id ("KEEP"))
1033 if (!keep_variables (mtf.dict))
1042 if (!lex_match ('/') && token != '.')
1044 lex_end_of_command ();
1049 if (mtf.by_cnt == 0)
1053 msg (SE, _("BY is required when TABLE is specified."));
1058 msg (SE, _("BY is required when IN is specified."));
1063 /* Set up mapping from each file's variables to master
1065 for (iter = mtf.head; iter != NULL; iter = iter->next)
1067 struct dictionary *d = iter->dict;
1070 for (i = 0; i < dict_get_var_cnt (d); i++)
1072 struct variable *v = dict_get_var (d, i);
1073 struct variable *mv = dict_lookup_var (mtf.dict, v->name);
1079 /* Add IN variables to master dictionary. */
1080 for (iter = mtf.head; iter != NULL; iter = iter->next)
1081 if (iter->in_name != NULL)
1083 iter->in_var = dict_create_var (mtf.dict, iter->in_name, 0);
1084 if (iter->in_var == NULL)
1086 msg (SE, _("IN variable name %s duplicates an "
1087 "existing variable name."),
1088 iter->in_var->name);
1091 iter->in_var->print = iter->in_var->write
1092 = fmt_for_output (FMT_F, 1, 0);
1095 /* MATCH FILES performs an n-way merge on all its input files.
1098 1. Read one input record from every input FILE.
1100 2. If no FILEs are left, stop. Otherwise, proceed to step 3.
1102 3. Find the FILE input record(s) that have minimum BY
1103 values. Store all the values from these input records into
1106 4. For every TABLE, read another record as long as the BY values
1107 on the TABLE's input record are less than the FILEs' BY values.
1108 If an exact match is found, store all the values from the TABLE
1109 input record into the output record.
1111 5. Write the output record.
1113 6. Read another record from each input file FILE and TABLE that
1114 we stored values from above. If we come to the end of one of the
1115 input files, remove it from the list of input files.
1117 7. Repeat from step 2.
1119 Unfortunately, this algorithm can't be implemented in a
1120 straightforward way because there's no function to read a
1121 record from the active file. Instead, it has to be written
1124 FIXME: For merging large numbers of files (more than 10?) a
1125 better algorithm would use a heap for finding minimum
1128 if (!used_active_file)
1129 discard_variables (ds);
1131 dict_compact_values (mtf.dict);
1132 mtf.output = fastfile_create (dict_get_next_value_idx (mtf.dict));
1133 mtf.seq_nums = xcalloc (dict_get_var_cnt (mtf.dict), sizeof *mtf.seq_nums);
1134 case_create (&mtf.mtf_case, dict_get_next_value_idx (mtf.dict));
1136 if (!mtf_read_nonactive_records (&mtf))
1139 if (used_active_file)
1142 create_case_sink (&null_sink_class,
1143 dataset_dict (ds), NULL));
1145 ( procedure (ds, mtf_processing, &mtf) &&
1146 mtf_processing_finish (&mtf, ds) );
1149 ok = mtf_processing_finish (&mtf, ds);
1151 discard_variables (ds);
1153 dict_destroy (dataset_dict (ds));
1154 dataset_set_dict (ds, mtf.dict);
1156 proc_set_source (ds, storage_source_create (mtf.output));
1159 if (!mtf_free (&mtf))
1161 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
1165 return CMD_CASCADING_FAILURE;
1168 /* Repeats 2...7 an arbitrary number of times.
   MTF_ is the struct mtf_proc.  The active file, identified by its
   null handle, is first removed from the file list; mtf_processing()
   is then called with a null case for as long as the head of the list
   is an MTF_FILE. */
1170 mtf_processing_finish (void *mtf_, const struct dataset *ds)
1172 struct mtf_proc *mtf = mtf_;
1173 struct mtf_file *iter;
1175 /* Find the active file and delete it. */
1176 for (iter = mtf->head; iter; iter = iter->next)
1177 if (iter->handle == NULL)
1179 if (!mtf_delete_file_in_place (mtf, &iter))
1184 while (mtf->head && mtf->head->type == MTF_FILE)
1185 if (!mtf_processing (NULL, mtf, ds))
1191 /* Return a string in a static buffer describing V's variable type and width. */
/* Formats a description of V into static storage: "numeric" for a
   NUMERIC variable, otherwise (V must be ALPHA) "string with width N".
   NOTE(review): BUF holds two 32-byte entries, presumably used
   alternately so that two descriptions can appear in one msg() call
   as in mtf_merge_dictionary() -- confirm. */
1194 var_type_description (struct variable *v)
1196 static char buf[2][32];
1203 if (v->type == NUMERIC)
1204 strcpy (s, "numeric");
1207 assert (v->type == ALPHA);
1208 sprintf (s, "string with width %d", v->width);
1213 /* Closes FILE and frees its associated data.
1214 Returns true if successful, false if an I/O error
1215 occurred on FILE. */
1217 mtf_close_file (struct mtf_file *file)
/* The error status must be sampled before the reader is closed.  A
   file with no reader counts as successful. */
1219 bool ok = file->reader == NULL || !any_reader_error (file->reader);
1221 any_reader_close (file->reader);
/* Only a file opened through a handle has its dictionary destroyed
   here; with a null handle (the active file) FILE->dict is the
   dataset's own dictionary. */
1222 if (file->handle != NULL)
1223 dict_destroy (file->dict);
1224 case_destroy (&file->input);
1225 free (file->in_name);
1230 /* Free all the data for the MATCH FILES procedure.
1231 Returns true if successful, false if an I/O error occurred. */
/* Releases everything owned by MTF: each file remaining on the list is
   closed with mtf_close_file(), then the master dictionary, the
   output case, and the sequence-number array are freed.  No file's
   dictionary may be the master dictionary itself (asserted). */
1234 mtf_free (struct mtf_proc *mtf)
1236 struct mtf_file *iter, *next;
1239 for (iter = mtf->head; iter; iter = next)
1242 assert (iter->dict != mtf->dict);
1243 if (!mtf_close_file (iter))
1248 dict_destroy (mtf->dict);
1249 case_destroy (&mtf->mtf_case);
1250 free (mtf->seq_nums);
1255 /* Remove *FILE from the mtf_file chain. Make *FILE point to the next
1256 file in the chain, or to NULL if it was the last in the chain.
1257 Returns true if successful, false if an I/O error occurred.
   Before the file is closed, its contribution to the output case is
   blanked: its IN variable (if any) is set to 0 and the master copies
   of its variables are reset (string values are filled with spaces). */
1259 mtf_delete_file_in_place (struct mtf_proc *mtf, struct mtf_file **file)
1261 struct mtf_file *f = *file;
1265 f->prev->next = f->next;
1267 f->next->prev = f->prev;
1269 mtf->head = f->next;
1271 mtf->tail = f->prev;
1274 if (f->in_var != NULL)
1275 case_data_rw (&mtf->mtf_case, f->in_var->fv)->f = 0.;
1276 for (i = 0; i < dict_get_var_cnt (f->dict); i++)
1278 struct variable *v = dict_get_var (f->dict, i);
1279 struct variable *mv = get_master (v);
1282 union value *out = case_data_rw (&mtf->mtf_case, mv->fv);
1284 if (v->type == NUMERIC)
1287 memset (out->s, ' ', v->width);
1291 return mtf_close_file (f);
1294 /* Read a record from every input file except the active file.
1295 Returns true if successful, false if an I/O error occurred.
   MTF_ is the struct mtf_proc.  A file with a handle whose read
   yields no record is removed from the list with
   mtf_delete_file_in_place(); NEXT lets the loop continue past a
   removed file. */
1297 mtf_read_nonactive_records (void *mtf_)
1299 struct mtf_proc *mtf = mtf_;
1300 struct mtf_file *iter, *next;
1303 for (iter = mtf->head; ok && iter != NULL; iter = next)
1306 if (iter->handle && !any_reader_read (iter->reader, &iter->input))
1307 if (!mtf_delete_file_in_place (mtf, &iter))
1313 /* Compare the BY variables for files A and B; return -1 if A < B, 0
1314 if A == B, 1 if A > B.
   A file whose own input case is null is compared using C instead.
   NOTE(review): the assertion tolerates one null argument, yet A and
   B are both dereferenced before it is evaluated -- confirm whether
   the assertion should come first or be tightened. */
1316 mtf_compare_BY_values (struct mtf_proc *mtf,
1317 struct mtf_file *a, struct mtf_file *b,
1318 const struct ccase *c)
1320 const struct ccase *ca = case_is_null (&a->input) ? c : &a->input;
1321 const struct ccase *cb = case_is_null (&b->input) ? c : &b->input;
1322 assert ((a == NULL) + (b == NULL) + (c == NULL) <= 1);
1323 return case_compare_2dict (ca, cb, a->by, b->by, mtf->by_cnt);
1326 /* Perform one iteration of steps 3...7 above.
1327 Returns true if successful, false if an I/O error occurred.
   C is the current case from the active file, used for any file
   whose own input case is null; MTF_ is the struct mtf_proc.  Files
   are threaded through their next_min members into two chains: a
   "min" chain of files holding the minimum BY values, whose data is
   stored into the output case, and a "max" chain of the rest, whose
   variables receive missing values.  The loop ends once a record from
   the active file is needed or no FILE remains at the head. */
1329 mtf_processing (const struct ccase *c, void *mtf_, const struct dataset *ds UNUSED)
1331 struct mtf_proc *mtf = mtf_;
1333 /* Do we need another record from the active file? */
1334 bool read_active_file;
1336 assert (mtf->head != NULL);
1337 if (mtf->head->type == MTF_TABLE)
1342 struct mtf_file *min_head, *min_tail; /* Files with minimum BY values. */
1343 struct mtf_file *max_head, *max_tail; /* Files with non-minimum BYs. */
1344 struct mtf_file *iter, *next;
1346 read_active_file = false;
1348 /* 3. Find the FILE input record(s) that have minimum BY
1349 values. Store all the values from these input records into
1350 the output record. */
1351 min_head = min_tail = mtf->head;
1352 max_head = max_tail = NULL;
1353 for (iter = mtf->head->next; iter && iter->type == MTF_FILE;
1356 int cmp = mtf_compare_BY_values (mtf, min_head, iter, c);
1360 max_tail = max_tail->next_min = iter;
1362 max_head = max_tail = iter;
1365 min_tail = min_tail->next_min = iter;
1370 max_tail->next_min = min_head;
1371 max_tail = min_tail;
1375 max_head = min_head;
1376 max_tail = min_tail;
1378 min_head = min_tail = iter;
1382 /* 4. For every TABLE, read another record as long as the BY
1383 values on the TABLE's input record are less than the FILEs'
1384 BY values. If an exact match is found, store all the values
1385 from the TABLE input record into the output record. */
1386 for (; iter != NULL; iter = next)
1388 assert (iter->type == MTF_TABLE);
1393 int cmp = mtf_compare_BY_values (mtf, min_head, iter, c);
1397 max_tail = max_tail->next_min = iter;
1399 max_head = max_tail = iter;
1402 min_tail = min_tail->next_min = iter;
1405 if (iter->handle == NULL)
1407 if (any_reader_read (iter->reader, &iter->input))
1409 if (!mtf_delete_file_in_place (mtf, &iter))
1416 /* Next sequence number. */
1419 /* Store data to all the records we are using. */
1421 min_tail->next_min = NULL;
1422 for (iter = min_head; iter; iter = iter->next_min)
1426 for (i = 0; i < dict_get_var_cnt (iter->dict); i++)
1428 struct variable *v = dict_get_var (iter->dict, i);
1429 struct variable *mv = get_master (v);
/* A master variable is written at most once per output record: the
   per-variable sequence number records that it was already stored
   during this iteration. */
1431 if (mv != NULL && mtf->seq_nums[mv->index] != mtf->seq_num)
1433 const struct ccase *record
1434 = case_is_null (&iter->input) ? c : &iter->input;
1435 union value *out = case_data_rw (&mtf->mtf_case, mv->fv);
1437 mtf->seq_nums[mv->index] = mtf->seq_num;
1438 if (v->type == NUMERIC)
1439 out->f = case_num (record, v->fv);
1441 memcpy (out->s, case_str (record, v->fv), v->width);
1444 if (iter->in_var != NULL)
1445 case_data_rw (&mtf->mtf_case, iter->in_var->fv)->f = 1.;
1447 if (iter->type == MTF_FILE && iter->handle == NULL)
1448 read_active_file = true;
1451 /* Store missing values to all the records we're not
1454 max_tail->next_min = NULL;
1455 for (iter = max_head; iter; iter = iter->next_min)
1459 for (i = 0; i < dict_get_var_cnt (iter->dict); i++)
1461 struct variable *v = dict_get_var (iter->dict, i);
1462 struct variable *mv = get_master (v);
1464 if (mv != NULL && mtf->seq_nums[mv->index] != mtf->seq_num)
1466 union value *out = case_data_rw (&mtf->mtf_case, mv->fv);
1467 mtf->seq_nums[mv->index] = mtf->seq_num;
1469 if (v->type == NUMERIC)
1472 memset (out->s, ' ', v->width);
1475 if (iter->in_var != NULL)
1476 case_data_rw (&mtf->mtf_case, iter->in_var->fv)->f = 0.;
1479 /* 5. Write the output record. */
1480 casefile_append (mtf->output, &mtf->mtf_case);
1482 /* 6. Read another record from each input file FILE and TABLE
1483 that we stored values from above. If we come to the end of
1484 one of the input files, remove it from the list of input
1486 for (iter = min_head; iter && iter->type == MTF_FILE; iter = next)
1488 next = iter->next_min;
1489 if (iter->reader != NULL
1490 && !any_reader_read (iter->reader, &iter->input))
1491 if (!mtf_delete_file_in_place (mtf, &iter))
1495 while (!read_active_file
1496 && mtf->head != NULL && mtf->head->type == MTF_FILE);
1501 /* Merge the dictionary for file F into master dictionary M.
   M takes F's file label if it has none of its own, and F's
   documents are installed in, or appended to, M's.  Each variable of
   F's dictionary is then looked up by name in M: an existing variable
   must have the same width, and picks up value labels, missing
   values and a variable label from F's variable only where it lacks
   its own; otherwise the variable is cloned into M.  A comparison
   against DC_SCRATCH singles out scratch variables. */
1503 mtf_merge_dictionary (struct dictionary *const m, struct mtf_file *f)
1505 struct dictionary *d = f->dict;
1506 const char *d_docs, *m_docs;
1509 if (dict_get_label (m) == NULL)
1510 dict_set_label (m, dict_get_label (d));
1512 d_docs = dict_get_documents (d);
1513 m_docs = dict_get_documents (m);
1517 dict_set_documents (m, d_docs);
1523 new_len = strlen (m_docs) + strlen (d_docs);
1524 new_docs = xmalloc (new_len + 1);
1525 strcpy (new_docs, m_docs);
1526 strcat (new_docs, d_docs);
1527 dict_set_documents (m, new_docs);
1532 for (i = 0; i < dict_get_var_cnt (d); i++)
1534 struct variable *dv = dict_get_var (d, i);
1535 struct variable *mv = dict_lookup_var (m, dv->name);
1537 if (dict_class_from_id (dv->name) == DC_SCRATCH)
1542 if (mv->width != dv->width)
/* The active file has a null handle; it is reported as "*", the same
   way the BY-variable diagnostic in cmd_match_files() names it. */
1544 msg (SE, _("Variable %s in file %s (%s) has different "
1545 "type or width from the same variable in "
1546 "earlier file (%s)."),
1547 dv->name, f->handle ? fh_get_name (f->handle) : "*",
1548 var_type_description (dv), var_type_description (mv));
1552 if (dv->width == mv->width)
1554 if (val_labs_count (dv->val_labs)
1555 && !val_labs_count (mv->val_labs))
1557 val_labs_destroy (mv->val_labs);
1558 mv->val_labs = val_labs_copy (dv->val_labs);
1560 if (!mv_is_empty (&dv->miss) && mv_is_empty (&mv->miss))
1561 mv_copy (&mv->miss, &dv->miss);
1564 if (dv->label && !mv->label)
1565 mv->label = xstrdup (dv->label);
1568 mv = dict_clone_var_assert (m, dv, dv->name);
1574 /* Marks V's master variable as MASTER.
   The association is stored as V's aux data, attached with a null
   destructor argument. */
1576 set_master (struct variable *v, struct variable *master)
1578 var_attach_aux (v, master, NULL);
1581 /* Returns the master variable corresponding to V,
1582 as set with set_master(). */
1583 static struct variable *
1584 get_master (struct variable *v)
1593 /* A case map copies data from a case that corresponds to one
1594 dictionary to a case that corresponds to a second dictionary
1595 derived from the first by, optionally, deleting, reordering,
1596 or renaming variables. (No new variables may be created.) */
1602 size_t value_cnt; /* Number of values in map. */
1603 int *map; /* For each destination index, the
1604 corresponding source index. */
1607 /* Prepares dictionary D for producing a case map. Afterward,
1608 the caller may delete, reorder, or rename variables within D
1609 at will before using finish_case_map() to produce the case map.
1612 Uses D's aux members, which must otherwise not be in use.
   Each variable gets a heap-allocated int attached as aux data (freed
   through var_dtor_free); finish_case_map() reads it back as the
   variable's source value index. */
1614 start_case_map (struct dictionary *d)
1616 size_t var_cnt = dict_get_var_cnt (d);
1619 for (i = 0; i < var_cnt; i++)
1621 struct variable *v = dict_get_var (d, i);
1622 int *src_fv = xmalloc (sizeof *src_fv);
1624 var_attach_aux (v, src_fv, var_dtor_free);
1628 /* Produces a case map from dictionary D, which must have been
1629 previously prepared with start_case_map().
1631 Does not retain any reference to D, and clears the aux members
1632 set up by start_case_map().
1634 Returns the new case map, or a null pointer if no mapping is
1635 required (that is, no data has changed position). */
1636 static struct case_map *
1637 finish_case_map (struct dictionary *d)
1639 struct case_map *map;
1640 size_t var_cnt = dict_get_var_cnt (d);
/* One map entry per value slot in D's current layout. */
1644 map = xmalloc (sizeof *map);
1645 map->value_cnt = dict_get_next_value_idx (d);
1646 map->map = xnmalloc (map->value_cnt, sizeof *map->map);
1647 for (i = 0; i < map->value_cnt; i++)
1651 for (i = 0; i < var_cnt; i++)
1653 struct variable *v = dict_get_var (d, i);
1654 int *src_fv = (int *) var_detach_aux (v);
/* A variable whose current index differs from its recorded source
   index has moved. */
1657 if (v->fv != *src_fv)
/* Map each of the variable's NV value slots from its source position
   to its current position; the assertion checks that no destination
   slot is assigned twice. */
1660 for (idx = 0; idx < v->nv; idx++)
1662 int src_idx = *src_fv + idx;
1663 int dst_idx = v->fv + idx;
1665 assert (map->map[dst_idx] == -1);
1666 map->map[dst_idx] = src_idx;
1673 destroy_case_map (map);
/* Trailing entries still equal to -1 are trimmed from the map. */
1677 while (map->value_cnt > 0 && map->map[map->value_cnt - 1] == -1)
1683 /* Maps from SRC to DST, applying case map MAP.
   MAP, SRC and DST must all be non-null, and SRC and DST must be
   distinct cases.  For each destination index covered by MAP, the
   value at the corresponding source index of SRC is copied into
   DST. */
1685 map_case (const struct case_map *map,
1686 const struct ccase *src, struct ccase *dst)
1690 assert (map != NULL);
1691 assert (src != NULL);
1692 assert (dst != NULL);
1693 assert (src != dst);
1695 for (dst_idx = 0; dst_idx < map->value_cnt; dst_idx++)
1697 int src_idx = map->map[dst_idx];
1699 *case_data_rw (dst, dst_idx) = *case_data (src, src_idx);
1703 /* Destroys case map MAP. */
1705 destroy_case_map (struct case_map *map)