1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000, 2006 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License as
6 published by the Free Software Foundation; either version 2 of the
7 License, or (at your option) any later version.
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 #include <data/any-reader.h>
24 #include <data/any-writer.h>
25 #include <data/case-sink.h>
26 #include <data/case-source.h>
27 #include <data/case.h>
28 #include <data/casefile.h>
29 #include <data/fastfile.h>
30 #include <data/format.h>
31 #include <data/dictionary.h>
32 #include <data/por-file-writer.h>
33 #include <data/procedure.h>
34 #include <data/settings.h>
35 #include <data/storage-stream.h>
36 #include <data/sys-file-writer.h>
37 #include <data/transformations.h>
38 #include <data/value-labels.h>
39 #include <data/variable.h>
40 #include <language/command.h>
41 #include <language/data-io/file-handle.h>
42 #include <language/lexer/lexer.h>
43 #include <language/lexer/variable-parser.h>
44 #include <libpspp/alloc.h>
45 #include <libpspp/assertion.h>
46 #include <libpspp/compiler.h>
47 #include <libpspp/hash.h>
48 #include <libpspp/message.h>
49 #include <libpspp/message.h>
50 #include <libpspp/misc.h>
51 #include <libpspp/str.h>
54 #define _(msgid) gettext (msgid)
56 /* Rearranging and reducing a dictionary. */
57 static void start_case_map (struct dictionary *);
58 static struct case_map *finish_case_map (struct dictionary *);
59 static void map_case (const struct case_map *,
60 const struct ccase *, struct ccase *);
61 static void destroy_case_map (struct case_map *);
63 static bool parse_dict_trim (struct lexer *, struct dictionary *);
65 /* Reading system and portable files. */
67 /* Type of command. */
74 /* Case reader input program. */
75 struct case_reader_pgm
77 struct any_reader *reader; /* File reader. */
78 struct case_map *map; /* Map from file dict to active file dict. */
79 struct ccase bounce; /* Bounce buffer. */
82 static const struct case_source_class case_reader_source_class;
84 static void case_reader_pgm_free (struct case_reader_pgm *);
86 /* Parses a GET or IMPORT command. */
88 parse_read_command (struct lexer *lexer, struct dataset *ds, enum reader_command type)
90 struct case_reader_pgm *pgm = NULL;
91 struct file_handle *fh = NULL;
92 struct dictionary *dict = NULL;
96 lex_match (lexer, '/');
98 if (lex_match_id (lexer, "FILE") || lex_token (lexer) == T_STRING)
100 lex_match (lexer, '=');
102 fh = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH);
106 else if (type == IMPORT_CMD && lex_match_id (lexer, "TYPE"))
108 lex_match (lexer, '=');
110 if (lex_match_id (lexer, "COMM"))
112 else if (lex_match_id (lexer, "TAPE"))
116 lex_error (lexer, _("expecting COMM or TAPE"));
126 lex_sbc_missing (lexer, "FILE");
130 discard_variables (ds);
132 pgm = xmalloc (sizeof *pgm);
133 pgm->reader = any_reader_open (fh, &dict);
135 case_nullify (&pgm->bounce);
136 if (pgm->reader == NULL)
139 case_create (&pgm->bounce, dict_get_next_value_idx (dict));
141 start_case_map (dict);
143 while (lex_token (lexer) != '.')
145 lex_match (lexer, '/');
146 if (!parse_dict_trim (lexer, dict))
150 pgm->map = finish_case_map (dict);
152 dict_destroy (dataset_dict (ds));
153 dataset_set_dict (ds, dict);
156 create_case_source (&case_reader_source_class, pgm));
161 case_reader_pgm_free (pgm);
164 return CMD_CASCADING_FAILURE;
167 /* Frees a struct case_reader_pgm. */
169 case_reader_pgm_free (struct case_reader_pgm *pgm)
173 any_reader_close (pgm->reader);
174 destroy_case_map (pgm->map);
175 case_destroy (&pgm->bounce);
180 /* Reads one case into C.
181 Returns true if successful, false at end of file or if an
182 I/O error occurred. */
184 case_reader_source_read (struct case_source *source, struct ccase *c)
186 struct case_reader_pgm *pgm = source->aux;
187 if (any_reader_read (pgm->reader, pgm->map == NULL ? c : &pgm->bounce))
189 if (pgm->map != NULL)
190 map_case (pgm->map, &pgm->bounce, c);
197 /* Destroys the source.
198 Returns true if successful read, false if an I/O error occurred
199 during destruction or previously. */
201 case_reader_source_destroy (struct case_source *source)
203 struct case_reader_pgm *pgm = source->aux;
204 bool ok = !any_reader_error (pgm->reader);
205 case_reader_pgm_free (pgm);
209 static const struct case_source_class case_reader_source_class =
213 case_reader_source_read,
214 case_reader_source_destroy,
219 cmd_get (struct lexer *lexer, struct dataset *ds)
221 return parse_read_command (lexer, ds, GET_CMD);
226 cmd_import (struct lexer *lexer, struct dataset *ds)
228 return parse_read_command (lexer, ds, IMPORT_CMD);
231 /* Writing system and portable files. */
233 /* Type of output file. */
236 SYSFILE_WRITER, /* System file. */
237 PORFILE_WRITER /* Portable file. */
240 /* Type of a command. */
243 XFORM_CMD, /* Transformation. */
244 PROC_CMD /* Procedure. */
247 /* File writer plus a case map. */
250 struct any_writer *writer; /* File writer. */
251 struct case_map *map; /* Map to output file dictionary
252 (null pointer for identity mapping). */
253 struct ccase bounce; /* Bounce buffer for mapping (if needed). */
258 case_writer_destroy (struct case_writer *aw)
263 ok = any_writer_close (aw->writer);
264 destroy_case_map (aw->map);
265 case_destroy (&aw->bounce);
271 /* Parses SAVE or XSAVE or EXPORT or XEXPORT command.
272 WRITER_TYPE identifies the type of file to write,
273 and COMMAND_TYPE identifies the type of command.
275 On success, returns a writer.
276 For procedures only, sets *RETAIN_UNSELECTED to true if cases
277 that would otherwise be excluded by FILTER or USE should be
280 On failure, returns a null pointer. */
281 static struct case_writer *
282 parse_write_command (struct lexer *lexer, struct dataset *ds,
283 enum writer_type writer_type,
284 enum command_type command_type,
285 bool *retain_unselected)
288 struct file_handle *handle; /* Output file. */
289 struct dictionary *dict; /* Dictionary for output file. */
290 struct case_writer *aw; /* Writer. */
292 /* Common options. */
293 bool print_map; /* Print map? TODO. */
294 bool print_short_names; /* Print long-to-short name map. TODO. */
295 struct sfm_write_options sysfile_opts;
296 struct pfm_write_options porfile_opts;
298 assert (writer_type == SYSFILE_WRITER || writer_type == PORFILE_WRITER);
299 assert (command_type == XFORM_CMD || command_type == PROC_CMD);
300 assert ((retain_unselected != NULL) == (command_type == PROC_CMD));
302 if (command_type == PROC_CMD)
303 *retain_unselected = true;
306 dict = dict_clone (dataset_dict (ds));
307 aw = xmalloc (sizeof *aw);
310 case_nullify (&aw->bounce);
312 print_short_names = false;
313 sysfile_opts = sfm_writer_default_options ();
314 porfile_opts = pfm_writer_default_options ();
316 start_case_map (dict);
317 dict_delete_scratch_vars (dict);
319 lex_match (lexer, '/');
322 if (lex_match_id (lexer, "OUTFILE"))
326 lex_sbc_only_once ("OUTFILE");
330 lex_match (lexer, '=');
332 handle = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH);
336 else if (lex_match_id (lexer, "NAMES"))
337 print_short_names = true;
338 else if (lex_match_id (lexer, "PERMISSIONS"))
342 lex_match (lexer, '=');
343 if (lex_match_id (lexer, "READONLY"))
345 else if (lex_match_id (lexer, "WRITEABLE"))
349 lex_error (lexer, _("expecting %s or %s"), "READONLY", "WRITEABLE");
352 sysfile_opts.create_writeable = porfile_opts.create_writeable = cw;
354 else if (command_type == PROC_CMD && lex_match_id (lexer, "UNSELECTED"))
356 lex_match (lexer, '=');
357 if (lex_match_id (lexer, "RETAIN"))
358 *retain_unselected = true;
359 else if (lex_match_id (lexer, "DELETE"))
360 *retain_unselected = false;
363 lex_error (lexer, _("expecting %s or %s"), "RETAIN", "DELETE");
367 else if (writer_type == SYSFILE_WRITER && lex_match_id (lexer, "COMPRESSED"))
368 sysfile_opts.compress = true;
369 else if (writer_type == SYSFILE_WRITER && lex_match_id (lexer, "UNCOMPRESSED"))
370 sysfile_opts.compress = false;
371 else if (writer_type == SYSFILE_WRITER && lex_match_id (lexer, "VERSION"))
373 lex_match (lexer, '=');
374 if (!lex_force_int (lexer))
376 sysfile_opts.version = lex_integer (lexer);
379 else if (writer_type == PORFILE_WRITER && lex_match_id (lexer, "TYPE"))
381 lex_match (lexer, '=');
382 if (lex_match_id (lexer, "COMMUNICATIONS"))
383 porfile_opts.type = PFM_COMM;
384 else if (lex_match_id (lexer, "TAPE"))
385 porfile_opts.type = PFM_TAPE;
388 lex_error (lexer, _("expecting %s or %s"), "COMM", "TAPE");
392 else if (writer_type == PORFILE_WRITER && lex_match_id (lexer, "DIGITS"))
394 lex_match (lexer, '=');
395 if (!lex_force_int (lexer))
397 porfile_opts.digits = lex_integer (lexer);
400 else if (!parse_dict_trim (lexer, dict))
403 if (!lex_match (lexer, '/'))
406 if (lex_end_of_command (lexer) != CMD_SUCCESS)
411 lex_sbc_missing (lexer, "OUTFILE");
415 dict_compact_values (dict);
416 aw->map = finish_case_map (dict);
418 case_create (&aw->bounce, dict_get_next_value_idx (dict));
420 if (fh_get_referent (handle) == FH_REF_FILE)
425 aw->writer = any_writer_from_sfm_writer (
426 sfm_open_writer (handle, dict, sysfile_opts));
429 aw->writer = any_writer_from_pfm_writer (
430 pfm_open_writer (handle, dict, porfile_opts));
435 aw->writer = any_writer_open (handle, dict);
436 if (aw->writer == NULL)
443 case_writer_destroy (aw);
448 /* Writes case C to writer AW. */
450 case_writer_write_case (struct case_writer *aw, const struct ccase *c)
454 map_case (aw->map, c, &aw->bounce);
457 return any_writer_write (aw->writer, c);
460 /* SAVE and EXPORT. */
462 /* Parses and performs the SAVE or EXPORT procedure. */
464 parse_output_proc (struct lexer *lexer, struct dataset *ds, enum writer_type writer_type)
466 bool retain_unselected;
467 struct variable *saved_filter_variable;
468 struct case_writer *aw;
472 aw = parse_write_command (lexer, ds, writer_type, PROC_CMD, &retain_unselected);
474 return CMD_CASCADING_FAILURE;
476 saved_filter_variable = dict_get_filter (dataset_dict (ds));
477 if (retain_unselected)
478 dict_set_filter (dataset_dict (ds), NULL);
481 while (ok && proc_read (ds, &c))
482 ok = case_writer_write_case (aw, c);
483 ok = proc_close (ds) && ok;
485 dict_set_filter (dataset_dict (ds), saved_filter_variable);
487 case_writer_destroy (aw);
488 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
492 cmd_save (struct lexer *lexer, struct dataset *ds)
494 return parse_output_proc (lexer, ds, SYSFILE_WRITER);
498 cmd_export (struct lexer *lexer, struct dataset *ds)
500 return parse_output_proc (lexer, ds, PORFILE_WRITER);
503 /* XSAVE and XEXPORT. */
505 /* Transformation. */
508 struct case_writer *aw; /* Writer. */
511 static trns_proc_func output_trns_proc;
512 static trns_free_func output_trns_free;
514 /* Parses the XSAVE or XEXPORT transformation command. */
516 parse_output_trns (struct lexer *lexer, struct dataset *ds, enum writer_type writer_type)
518 struct output_trns *t = xmalloc (sizeof *t);
519 t->aw = parse_write_command (lexer, ds, writer_type, XFORM_CMD, NULL);
523 return CMD_CASCADING_FAILURE;
526 add_transformation (ds, output_trns_proc, output_trns_free, t);
530 /* Writes case C to the system file specified on XSAVE or XEXPORT. */
532 output_trns_proc (void *trns_, struct ccase *c, casenumber case_num UNUSED)
534 struct output_trns *t = trns_;
535 case_writer_write_case (t->aw, c);
536 return TRNS_CONTINUE;
539 /* Frees an XSAVE or XEXPORT transformation.
540 Returns true if successful, false if an I/O error occurred. */
542 output_trns_free (void *trns_)
544 struct output_trns *t = trns_;
549 ok = case_writer_destroy (t->aw);
557 cmd_xsave (struct lexer *lexer, struct dataset *ds)
559 return parse_output_trns (lexer, ds, SYSFILE_WRITER);
562 /* XEXPORT command. */
564 cmd_xexport (struct lexer *lexer, struct dataset *ds)
566 return parse_output_trns (lexer, ds, PORFILE_WRITER);
569 static bool rename_variables (struct lexer *lexer, struct dictionary *dict);
570 static bool drop_variables (struct lexer *, struct dictionary *dict);
571 static bool keep_variables (struct lexer *, struct dictionary *dict);
573 /* Commands that read and write system files share a great deal
574 of common syntactic structure for rearranging and dropping
575 variables. This function parses this syntax and modifies DICT
576 appropriately. Returns true on success, false on failure. */
578 parse_dict_trim (struct lexer *lexer, struct dictionary *dict)
580 if (lex_match_id (lexer, "MAP"))
585 else if (lex_match_id (lexer, "DROP"))
586 return drop_variables (lexer, dict);
587 else if (lex_match_id (lexer, "KEEP"))
588 return keep_variables (lexer, dict);
589 else if (lex_match_id (lexer, "RENAME"))
590 return rename_variables (lexer, dict);
593 lex_error (lexer, _("expecting a valid subcommand"));
598 /* Parses and performs the RENAME subcommand of GET and SAVE. */
600 rename_variables (struct lexer *lexer, struct dictionary *dict)
613 lex_match (lexer, '=');
614 if (lex_token (lexer) != '(')
618 v = parse_variable (lexer, dict);
621 if (!lex_force_match (lexer, '=')
622 || !lex_force_id (lexer))
624 if (dict_lookup_var (dict, lex_tokid (lexer)) != NULL)
626 msg (SE, _("Cannot rename %s as %s because there already exists "
627 "a variable named %s. To rename variables with "
628 "overlapping names, use a single RENAME subcommand "
629 "such as \"/RENAME (A=B)(B=C)(C=A)\", or equivalently, "
630 "\"/RENAME (A B C=B C A)\"."),
631 var_get_name (v), lex_tokid (lexer), lex_tokid (lexer));
635 dict_rename_var (dict, v, lex_tokid (lexer));
644 while (lex_match (lexer, '('))
648 if (!parse_variables (lexer, dict, &v, &nv, PV_NO_DUPLICATE | PV_APPEND))
650 if (!lex_match (lexer, '='))
652 msg (SE, _("`=' expected after variable list."));
655 if (!parse_DATA_LIST_vars (lexer, &new_names, &nn, PV_APPEND | PV_NO_SCRATCH))
659 msg (SE, _("Number of variables on left side of `=' (%d) does not "
660 "match number of variables on right side (%d), in "
661 "parenthesized group %d of RENAME subcommand."),
662 (unsigned) (nv - old_nv), (unsigned) (nn - old_nv), group);
665 if (!lex_force_match (lexer, ')'))
670 if (!dict_rename_vars (dict, v, new_names, nv, &err_name))
672 msg (SE, _("Requested renaming duplicates variable name %s."), err_name);
678 for (i = 0; i < nn; i++)
686 /* Parses and performs the DROP subcommand of GET and SAVE.
687 Returns true if successful, false on failure.*/
689 drop_variables (struct lexer *lexer, struct dictionary *dict)
694 lex_match (lexer, '=');
695 if (!parse_variables (lexer, dict, &v, &nv, PV_NONE))
697 dict_delete_vars (dict, v, nv);
700 if (dict_get_var_cnt (dict) == 0)
702 msg (SE, _("Cannot DROP all variables from dictionary."));
708 /* Parses and performs the KEEP subcommand of GET and SAVE.
709 Returns true if successful, false on failure.*/
711 keep_variables (struct lexer *lexer, struct dictionary *dict)
717 lex_match (lexer, '=');
718 if (!parse_variables (lexer, dict, &v, &nv, PV_NONE))
721 /* Move the specified variables to the beginning. */
722 dict_reorder_vars (dict, v, nv);
724 /* Delete the remaining variables. */
725 v = xnrealloc (v, dict_get_var_cnt (dict) - nv, sizeof *v);
726 for (i = nv; i < dict_get_var_cnt (dict); i++)
727 v[i - nv] = dict_get_var (dict, i);
728 dict_delete_vars (dict, v, dict_get_var_cnt (dict) - nv);
739 MTF_FILE, /* Specified on FILE= subcommand. */
740 MTF_TABLE /* Specified on TABLE= subcommand. */
743 /* One of the files on MATCH FILES. */
746 struct mtf_file *next, *prev; /* Next, previous in the list of files. */
747 struct mtf_file *next_min; /* Next in the chain of minimums. */
749 int type; /* One of MTF_*. */
750 struct variable **by; /* List of BY variables for this file. */
751 struct file_handle *handle; /* File handle. */
752 struct any_reader *reader; /* File reader. */
753 struct dictionary *dict; /* Dictionary from system file. */
756 char *in_name; /* Variable name. */
757 struct variable *in_var; /* Variable (in master dictionary). */
759 struct ccase input_storage; /* Input record storage. */
760 struct ccase *input; /* Input record. */
763 /* MATCH FILES procedure. */
766 struct mtf_file *head; /* First file mentioned on FILE or TABLE. */
767 struct mtf_file *tail; /* Last file mentioned on FILE or TABLE. */
769 bool ok; /* False if I/O error occurs. */
771 size_t by_cnt; /* Number of variables on BY subcommand. */
773 /* Names of FIRST, LAST variables. */
774 char first[LONG_NAME_LEN + 1], last[LONG_NAME_LEN + 1];
776 struct dictionary *dict; /* Dictionary of output file. */
777 struct casefile *output; /* MATCH FILES output. */
778 struct ccase mtf_case; /* Case used for output. */
780 unsigned seq_num; /* Have we initialized this variable? */
781 unsigned *seq_nums; /* Sequence numbers for each var in dict. */
784 static bool mtf_free (struct mtf_proc *);
785 static bool mtf_close_file (struct mtf_file *);
786 static int mtf_merge_dictionary (struct dictionary *const, struct mtf_file *);
787 static bool mtf_read_records (struct mtf_proc *, struct dataset *);
788 static bool mtf_delete_file_in_place (struct mtf_proc *, struct mtf_file **);
790 static bool mtf_processing (struct mtf_proc *, struct dataset *);
792 static char *var_type_description (struct variable *);
794 static void set_master (struct variable *, struct variable *master);
795 static struct variable *get_master (struct variable *);
797 /* Parse and execute the MATCH FILES command. */
/* Overview of the visible flow: parses a list of FILE and TABLE
   inputs (each optionally followed by RENAME and IN), merging each
   input's dictionary into the master dictionary mtf.dict; then
   parses BY, FIRST, LAST, MAP, DROP and KEEP; then merges the
   inputs into a new casefile and installs mtf.dict and that
   casefile as the dataset's dictionary and source. */
799 cmd_match_files (struct lexer *lexer, struct dataset *ds)
802 struct mtf_file *first_table = NULL;
803 struct mtf_file *iter;
805 bool used_active_file = false;
806 bool saw_table = false;
809 mtf.head = mtf.tail = NULL;
813 mtf.dict = dict_create ();
815 case_nullify (&mtf.mtf_case);
818 dict_set_case_limit (mtf.dict, dict_get_case_limit (dataset_dict (ds)));
820 lex_match (lexer, '/');
/* One iteration per FILE or TABLE subcommand. */
821 while (lex_token (lexer) == T_ID
822 && (lex_id_match (ss_cstr ("FILE"), ss_cstr (lex_tokid (lexer)))
823 || lex_id_match (ss_cstr ("TABLE"), ss_cstr (lex_tokid (lexer)))))
825 struct mtf_file *file = xmalloc (sizeof *file);
827 if (lex_match_id (lexer, "FILE"))
828 file->type = MTF_FILE;
829 else if (lex_match_id (lexer, "TABLE"))
831 file->type = MTF_TABLE;
836 lex_match (lexer, '=');
842 file->in_name = NULL;
844 case_nullify (&file->input_storage);
845 file->input = &file->input_storage;
847 /* FILEs go first, then TABLEs. */
848 if (file->type == MTF_TABLE || first_table == NULL)
851 file->prev = mtf.tail;
853 mtf.tail->next = file;
855 if (mtf.head == NULL)
857 if (file->type == MTF_TABLE && first_table == NULL)
862 assert (file->type == MTF_FILE);
863 file->next = first_table;
864 file->prev = first_table->prev;
865 if (first_table->prev)
866 first_table->prev->next = file;
869 first_table->prev = file;
/* `*' designates the active file; anything else is parsed as a
   file handle and opened with a reader. */
872 if (lex_match (lexer, '*'))
877 if (used_active_file)
879 msg (SE, _("The active file may not be specified more "
883 used_active_file = true;
885 if (!proc_has_source (ds))
887 msg (SE, _("Cannot specify the active file since no active "
888 "file has been defined."));
892 if (proc_make_temporary_transformations_permanent (ds))
894 _("MATCH FILES may not be used after TEMPORARY when "
895 "the active file is an input source. "
896 "Temporary transformations will be made permanent."));
/* The active file uses the dataset's own dictionary directly. */
898 file->dict = dataset_dict (ds);
902 file->handle = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH);
903 if (file->handle == NULL)
906 file->reader = any_reader_open (file->handle, &file->dict);
907 if (file->reader == NULL)
910 case_create (&file->input_storage,
911 dict_get_next_value_idx (file->dict));
914 while (lex_match (lexer, '/'))
915 if (lex_match_id (lexer, "RENAME"))
917 if (!rename_variables (lexer, file->dict))
920 else if (lex_match_id (lexer, "IN"))
922 lex_match (lexer, '=');
923 if (lex_token (lexer) != T_ID)
925 lex_error (lexer, NULL);
929 if (file->in_name != NULL)
931 msg (SE, _("Multiple IN subcommands for a single FILE or "
935 file->in_name = xstrdup (lex_tokid (lexer));
/* NOTE(review): mtf_merge_dictionary() is declared to return int,
   but its result is discarded here, so a type/width conflict that
   it reports does not appear to stop the command at this call --
   confirm whether a failure check is intended. */
940 mtf_merge_dictionary (mtf.dict, file);
943 while (lex_token (lexer) != '.')
945 if (lex_match (lexer, T_BY))
947 struct variable **by;
951 msg (SE, _("BY may appear at most once."));
955 lex_match (lexer, '=');
956 if (!parse_variables (lexer, mtf.dict, &by, &mtf.by_cnt,
957 PV_NO_DUPLICATE | PV_NO_SCRATCH))
/* Resolve every BY variable, by name, in each input's own
   dictionary. */
960 for (iter = mtf.head; iter != NULL; iter = iter->next)
964 iter->by = xnmalloc (mtf.by_cnt, sizeof *iter->by);
966 for (i = 0; i < mtf.by_cnt; i++)
968 iter->by[i] = dict_lookup_var (iter->dict,
969 var_get_name (by[i]));
970 if (iter->by[i] == NULL)
972 msg (SE, _("File %s lacks BY variable %s."),
973 iter->handle ? fh_get_name (iter->handle) : "*",
974 var_get_name (by[i]));
982 else if (lex_match_id (lexer, "FIRST"))
984 if (mtf.first[0] != '\0')
986 msg (SE, _("FIRST may appear at most once."));
990 lex_match (lexer, '=');
991 if (!lex_force_id (lexer))
/* NOTE(review): unbounded copy into a LONG_NAME_LEN + 1 byte
   buffer; presumably lex_tokid() never exceeds LONG_NAME_LEN
   bytes -- confirm.  The same applies to LAST below. */
993 strcpy (mtf.first, lex_tokid (lexer));
996 else if (lex_match_id (lexer, "LAST"))
998 if (mtf.last[0] != '\0')
1000 msg (SE, _("LAST may appear at most once."));
1004 lex_match (lexer, '=');
1005 if (!lex_force_id (lexer))
1007 strcpy (mtf.last, lex_tokid (lexer));
1010 else if (lex_match_id (lexer, "MAP"))
1014 else if (lex_match_id (lexer, "DROP"))
1016 if (!drop_variables (lexer, mtf.dict))
1019 else if (lex_match_id (lexer, "KEEP"))
1021 if (!keep_variables (lexer, mtf.dict))
1026 lex_error (lexer, NULL);
1030 if (!lex_match (lexer, '/') && lex_token (lexer) != '.')
1032 lex_end_of_command (lexer);
1037 if (mtf.by_cnt == 0)
1041 msg (SE, _("BY is required when TABLE is specified."));
1046 msg (SE, _("BY is required when IN is specified."));
1051 /* Set up mapping from each file's variables to master
1053 for (iter = mtf.head; iter != NULL; iter = iter->next)
1055 struct dictionary *d = iter->dict;
1058 for (i = 0; i < dict_get_var_cnt (d); i++)
1060 struct variable *v = dict_get_var (d, i);
1061 struct variable *mv = dict_lookup_var (mtf.dict, var_get_name (v));
1067 /* Add IN variables to master dictionary. */
1068 for (iter = mtf.head; iter != NULL; iter = iter->next)
1069 if (iter->in_name != NULL)
1071 struct fmt_spec format = fmt_for_output (FMT_F, 1, 0);
1072 iter->in_var = dict_create_var (mtf.dict, iter->in_name, 0);
/* NOTE(review): in this branch iter->in_var is a null pointer, yet
   it is what gets passed to var_get_name() for the message below.
   The requested name is in iter->in_name; that looks like the
   intended argument. */
1073 if (iter->in_var == NULL)
1075 msg (SE, _("IN variable name %s duplicates an "
1076 "existing variable name."),
1077 var_get_name (iter->in_var));
1080 var_set_both_formats (iter->in_var, &format);
1083 /* MATCH FILES performs an n-way merge on all its input files.
1086 1. Read one input record from every input FILE.
1088 2. If no FILEs are left, stop. Otherwise, proceed to step 3.
1090 3. Find the FILE input record(s) that have minimum BY
1091 values. Store all the values from these input records into
1094 4. For every TABLE, read another record as long as the BY values
1095 on the TABLE's input record are less than the FILEs' BY values.
1096 If an exact match is found, store all the values from the TABLE
1097 input record into the output record.
1099 5. Write the output record.
1101 6. Read another record from each input file FILE and TABLE that
1102 we stored values from above. If we come to the end of one of the
1103 input files, remove it from the list of input files.
1105 7. Repeat from step 2.
1107 FIXME: For merging large numbers of files (more than 10?) a
1108 better algorithm would use a heap for finding minimum
1111 if (used_active_file)
1113 proc_set_sink (ds, create_case_sink (&null_sink_class,
1115 dataset_get_casefile_factory (ds),
1120 discard_variables (ds);
1122 dict_compact_values (mtf.dict);
1123 mtf.output = dataset_get_casefile_factory (ds)->create_casefile
1124 (dataset_get_casefile_factory (ds),
1125 dict_get_next_value_idx (mtf.dict));
1127 mtf.seq_nums = xcalloc (dict_get_var_cnt (mtf.dict), sizeof *mtf.seq_nums);
1128 case_create (&mtf.mtf_case, dict_get_next_value_idx (mtf.dict));
1130 if (!mtf_read_records (&mtf, ds))
1132 while (mtf.head && mtf.head->type == MTF_FILE)
1133 if (!mtf_processing (&mtf, ds))
1135 if (!proc_close (ds))
1138 discard_variables (ds);
1140 dict_destroy (dataset_dict (ds));
1141 dataset_set_dict (ds, mtf.dict);
1143 proc_set_source (ds, storage_source_create (mtf.output));
1146 return mtf_free (&mtf) ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
1151 return CMD_CASCADING_FAILURE;
/* Returns a string describing V's variable type and width:
   "numeric", or "string with width N".

   The string is stored in one of two static buffers that are
   used alternately, so the results of two consecutive calls can
   be used together (for example, as two arguments of a single
   msg() call); a third call overwrites the first result.  The
   caller must not free the returned string. */
static char *
var_type_description (struct variable *v)
{
  static char buf[2][32];
  static int x = 0;
  char *s;

  /* Flip to the buffer not used by the previous call. */
  x ^= 1;
  s = buf[x];

  if (var_is_numeric (v))
    strcpy (s, "numeric");
  else
    /* Bounded, so that no width can overrun the buffer. */
    snprintf (s, sizeof buf[0], "string with width %d", var_get_width (v));
  return s;
}
1173 /* Closes FILE and frees its associated data.
1174 Returns true if successful, false if an I/O error
1175 occurred on FILE. */
1177 mtf_close_file (struct mtf_file *file)
1179 bool ok = file->reader == NULL || !any_reader_error (file->reader);
1181 any_reader_close (file->reader);
1182 if (file->handle != NULL)
1183 dict_destroy (file->dict);
1184 case_destroy (&file->input_storage);
1185 free (file->in_name);
1190 /* Free all the data for the MATCH FILES procedure.
1191 Returns true if successful, false if an I/O error
1194 mtf_free (struct mtf_proc *mtf)
1196 struct mtf_file *iter, *next;
1199 for (iter = mtf->head; iter; iter = next)
1202 assert (iter->dict != mtf->dict);
1203 if (!mtf_close_file (iter))
1208 dict_destroy (mtf->dict);
1209 case_destroy (&mtf->mtf_case);
1210 free (mtf->seq_nums);
1215 /* Remove *FILE from the mtf_file chain. Make *FILE point to the next
1216 file in the chain, or to NULL if it was the last in the chain.
1217 Returns true if successful, false if an I/O error occurred. */
1219 mtf_delete_file_in_place (struct mtf_proc *mtf, struct mtf_file **file)
1221 struct mtf_file *f = *file;
1225 f->prev->next = f->next;
1227 f->next->prev = f->prev;
1229 mtf->head = f->next;
1231 mtf->tail = f->prev;
1234 if (f->in_var != NULL)
1235 case_data_rw (&mtf->mtf_case, f->in_var)->f = 0.;
1236 for (i = 0; i < dict_get_var_cnt (f->dict); i++)
1238 struct variable *v = dict_get_var (f->dict, i);
1239 struct variable *mv = get_master (v);
1242 union value *out = case_data_rw (&mtf->mtf_case, mv);
1244 if (var_is_numeric (v))
1247 memset (out->s, ' ', var_get_width (v));
1251 return mtf_close_file (f);
1254 /* Read a record from every input file.
1255 Returns true if successful, false if an I/O error occurred. */
1257 mtf_read_records (struct mtf_proc *mtf, struct dataset *ds)
1259 struct mtf_file *iter, *next;
1262 for (iter = mtf->head; ok && iter != NULL; iter = next)
1266 ? !any_reader_read (iter->reader, iter->input)
1267 : !proc_read (ds, &iter->input))
1269 if (!mtf_delete_file_in_place (mtf, &iter))
1276 /* Compare the BY variables for files A and B; return -1 if A < B, 0
1277 if A == B, 1 if A > B. */
1279 mtf_compare_BY_values (struct mtf_proc *mtf,
1280 struct mtf_file *a, struct mtf_file *b)
1282 return case_compare_2dict (a->input, b->input, a->by, b->by, mtf->by_cnt);
1285 /* Perform one iteration of steps 3...7 above.
1286 Returns true if successful, false if an I/O error occurred. */
1288 mtf_processing (struct mtf_proc *mtf, struct dataset *ds)
1290 struct mtf_file *min_head, *min_tail; /* Files with minimum BY values. */
1291 struct mtf_file *max_head, *max_tail; /* Files with non-minimum BYs. */
1292 struct mtf_file *iter, *next;
1294 /* 3. Find the FILE input record(s) that have minimum BY
1295 values. Store all the values from these input records into
1296 the output record. */
1297 min_head = min_tail = mtf->head;
1298 max_head = max_tail = NULL;
1299 for (iter = mtf->head->next; iter && iter->type == MTF_FILE;
1302 int cmp = mtf_compare_BY_values (mtf, min_head, iter);
1306 max_tail = max_tail->next_min = iter;
1308 max_head = max_tail = iter;
1311 min_tail = min_tail->next_min = iter;
1316 max_tail->next_min = min_head;
1317 max_tail = min_tail;
1321 max_head = min_head;
1322 max_tail = min_tail;
1324 min_head = min_tail = iter;
1328 /* 4. For every TABLE, read another record as long as the BY
1329 values on the TABLE's input record are less than the FILEs'
1330 BY values. If an exact match is found, store all the values
1331 from the TABLE input record into the output record. */
1332 for (; iter != NULL; iter = next)
1334 assert (iter->type == MTF_TABLE);
1339 int cmp = mtf_compare_BY_values (mtf, min_head, iter);
1343 max_tail = max_tail->next_min = iter;
1345 max_head = max_tail = iter;
1348 min_tail = min_tail->next_min = iter;
1352 ? any_reader_read (iter->reader, iter->input)
1353 : proc_read (ds, &iter->input))
1355 if (!mtf_delete_file_in_place (mtf, &iter))
1362 /* Next sequence number. */
1365 /* Store data to all the records we are using. */
1367 min_tail->next_min = NULL;
1368 for (iter = min_head; iter; iter = iter->next_min)
1372 for (i = 0; i < dict_get_var_cnt (iter->dict); i++)
1374 struct variable *v = dict_get_var (iter->dict, i);
1375 struct variable *mv = get_master (v);
1376 size_t mv_index = mv ? var_get_dict_index (mv) : 0;
1378 if (mv != NULL && mtf->seq_nums[mv_index] != mtf->seq_num)
1380 const struct ccase *record = iter->input;
1381 union value *out = case_data_rw (&mtf->mtf_case, mv);
1383 mtf->seq_nums[mv_index] = mtf->seq_num;
1384 if (var_is_numeric (v))
1385 out->f = case_num (record, v);
1387 memcpy (out->s, case_str (record, v), var_get_width (v));
1390 if (iter->in_var != NULL)
1391 case_data_rw (&mtf->mtf_case, iter->in_var)->f = 1.;
1394 /* Store missing values to all the records we're not using. */
1396 max_tail->next_min = NULL;
1397 for (iter = max_head; iter; iter = iter->next_min)
1401 for (i = 0; i < dict_get_var_cnt (iter->dict); i++)
1403 struct variable *v = dict_get_var (iter->dict, i);
1404 struct variable *mv = get_master (v);
1405 size_t mv_index = mv ? var_get_dict_index (mv) : 0;
1407 if (mv != NULL && mtf->seq_nums[mv_index] != mtf->seq_num)
1409 union value *out = case_data_rw (&mtf->mtf_case, mv);
1410 mtf->seq_nums[mv_index] = mtf->seq_num;
1412 if (var_is_numeric (v))
1415 memset (out->s, ' ', var_get_width (v));
1418 if (iter->in_var != NULL)
1419 case_data_rw (&mtf->mtf_case, iter->in_var)->f = 0.;
1422 /* 5. Write the output record. */
1423 casefile_append (mtf->output, &mtf->mtf_case);
1425 /* 6. Read another record from each input file FILE and TABLE
1426 that we stored values from above. If we come to the end of
1427 one of the input files, remove it from the list of input
1429 for (iter = min_head; iter && iter->type == MTF_FILE; iter = next)
1431 next = iter->next_min;
1432 if (iter->reader != NULL
1433 ? !any_reader_read (iter->reader, iter->input)
1434 : !proc_read (ds, &iter->input))
1435 if (!mtf_delete_file_in_place (mtf, &iter))
1441 /* Merge the dictionary for file F into master dictionary M. */
1443 mtf_merge_dictionary (struct dictionary *const m, struct mtf_file *f)
1445 struct dictionary *d = f->dict;
1446 const char *d_docs, *m_docs;
1449 if (dict_get_label (m) == NULL)
1450 dict_set_label (m, dict_get_label (d));
1452 d_docs = dict_get_documents (d);
1453 m_docs = dict_get_documents (m);
1457 dict_set_documents (m, d_docs);
1463 new_len = strlen (m_docs) + strlen (d_docs);
1464 new_docs = xmalloc (new_len + 1);
1465 strcpy (new_docs, m_docs);
1466 strcat (new_docs, d_docs);
1467 dict_set_documents (m, new_docs);
1472 for (i = 0; i < dict_get_var_cnt (d); i++)
1474 struct variable *dv = dict_get_var (d, i);
1475 struct variable *mv = dict_lookup_var (m, var_get_name (dv));
1477 if (dict_class_from_id (var_get_name (dv)) == DC_SCRATCH)
1482 if (var_get_width (mv) != var_get_width (dv))
1484 msg (SE, _("Variable %s in file %s (%s) has different "
1485 "type or width from the same variable in "
1486 "earlier file (%s)."),
1487 var_get_name (dv), fh_get_name (f->handle),
1488 var_type_description (dv), var_type_description (mv));
1492 if (var_get_width (dv) == var_get_width (mv))
1494 if (var_has_value_labels (dv) && !var_has_value_labels (mv))
1495 var_set_value_labels (mv, var_get_value_labels (dv));
1496 if (var_has_missing_values (dv) && !var_has_missing_values (mv))
1497 var_set_missing_values (mv, var_get_missing_values (dv));
1500 if (var_get_label (dv) && !var_get_label (mv))
1501 var_set_label (mv, var_get_label (dv));
1504 mv = dict_clone_var_assert (m, dv, var_get_name (dv));
1510 /* Marks V's master variable as MASTER. */
1512 set_master (struct variable *v, struct variable *master)
1514 var_attach_aux (v, master, NULL);
1517 /* Returns the master variable corresponding to V,
1518 as set with set_master(). */
1519 static struct variable *
1520 get_master (struct variable *v)
1522 return var_get_aux (v);
1527 A case map copies data from a case that corresponds to one
1528 dictionary to a case that corresponds to a second dictionary
1529 derived from the first by, optionally, deleting, reordering,
1530 or renaming variables. (No new variables may be created.)
1536 size_t value_cnt; /* Number of values in map. */
1537 int *map; /* For each destination index, the
1538 corresponding source index. */
1541 /* Prepares dictionary D for producing a case map. Afterward,
1542 the caller may delete, reorder, or rename variables within D
1543 at will before using finish_case_map() to produce the case
1546 Uses D's aux members, which must otherwise not be in use. */
1548 start_case_map (struct dictionary *d)
1550 size_t var_cnt = dict_get_var_cnt (d);
1553 for (i = 0; i < var_cnt; i++)
1555 struct variable *v = dict_get_var (d, i);
1556 int *src_fv = xmalloc (sizeof *src_fv);
1557 *src_fv = var_get_case_index (v);
1558 var_attach_aux (v, src_fv, var_dtor_free);
1562 /* Produces a case map from dictionary D, which must have been
1563 previously prepared with start_case_map().
1565 Does not retain any reference to D, and clears the aux members
1566 set up by start_case_map().
1568 Returns the new case map, or a null pointer if no mapping is
1569 required (that is, no data has changed position). */
1570 static struct case_map *
1571 finish_case_map (struct dictionary *d)
1573 struct case_map *map;
1574 size_t var_cnt = dict_get_var_cnt (d);
1578 map = xmalloc (sizeof *map);
1579 map->value_cnt = dict_get_next_value_idx (d);
1580 map->map = xnmalloc (map->value_cnt, sizeof *map->map);
1581 for (i = 0; i < map->value_cnt; i++)
1585 for (i = 0; i < var_cnt; i++)
1587 struct variable *v = dict_get_var (d, i);
1588 size_t value_cnt = var_get_value_cnt (v);
1589 int *src_fv = (int *) var_detach_aux (v);
1592 if (var_get_case_index (v) != *src_fv)
1595 for (idx = 0; idx < value_cnt; idx++)
1597 int src_idx = *src_fv + idx;
1598 int dst_idx = var_get_case_index (v) + idx;
1600 assert (map->map[dst_idx] == -1);
1601 map->map[dst_idx] = src_idx;
1608 destroy_case_map (map);
1612 while (map->value_cnt > 0 && map->map[map->value_cnt - 1] == -1)
1618 /* Maps from SRC to DST, applying case map MAP. */
1620 map_case (const struct case_map *map,
1621 const struct ccase *src, struct ccase *dst)
1625 assert (map != NULL);
1626 assert (src != NULL);
1627 assert (dst != NULL);
1628 assert (src != dst);
1630 for (dst_idx = 0; dst_idx < map->value_cnt; dst_idx++)
1632 int src_idx = map->map[dst_idx];
1634 *case_data_rw_idx (dst, dst_idx) = *case_data_idx (src, src_idx);
1638 /* Destroys case map MAP. */
1640 destroy_case_map (struct case_map *map)