From 51755f0fbf79c3e5c756b5b835b0ce15267e02d3 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Mon, 17 Nov 2008 21:36:41 -0800 Subject: [PATCH] Implement ADD FILES and UPDATE. --- doc/automake.mk | 1 + doc/combining.texi | 336 +++++++++++ doc/files.texi | 98 +-- doc/pspp.texinfo | 4 +- src/data/automake.mk | 2 + src/data/case-matcher.c | 152 +++++ src/data/case-matcher.h | 33 + src/language/command.def | 4 +- src/language/data-io/automake.mk | 2 +- src/language/data-io/combine-files.c | 864 +++++++++++++++++++++++++++ src/language/data-io/match-files.c | 595 ++++++++++-------- tests/automake.mk | 2 + tests/command/add-files.sh | 200 +++++++ tests/command/match-files.sh | 43 +- tests/command/update.sh | 172 ++++++ 15 files changed, 2155 insertions(+), 353 deletions(-) create mode 100644 doc/combining.texi create mode 100644 src/data/case-matcher.c create mode 100644 src/data/case-matcher.h create mode 100644 src/language/data-io/combine-files.c create mode 100755 tests/command/add-files.sh create mode 100755 tests/command/update.sh diff --git a/doc/automake.mk b/doc/automake.mk index 3445379a..14359e6d 100644 --- a/doc/automake.mk +++ b/doc/automake.mk @@ -11,6 +11,7 @@ doc_pspp_TEXINFOS = doc/version.texi \ doc/data-selection.texi \ doc/expressions.texi \ doc/files.texi \ + doc/combining.texi \ doc/flow-control.texi \ doc/function-index.texi \ doc/installing.texi \ diff --git a/doc/combining.texi b/doc/combining.texi new file mode 100644 index 00000000..a77c3b07 --- /dev/null +++ b/doc/combining.texi @@ -0,0 +1,336 @@ +@node Combining Data Files +@chapter Combining Data Files + +This chapter describes commands that allow data from system files, +portable files, scratch files, and the active file to be combined to +form a new active file. These commands can combine data files in the +following ways: + +@itemize +@item +@cmd{ADD FILES} interleaves or appends the cases from each input file. 
+It is used with input files that have variables in common, but +distinct sets of cases. + +@item +@cmd{MATCH FILES} adds the data together in cases that match across +multiple input files. It is used with input files that have cases in +common, but different information about each case. + +@item +@cmd{UPDATE} updates a master data file from data in a set of +transaction files. Each case in a transaction data file modifies a +matching case in the master data file, or it adds a new case if no +matching case can be found. +@end itemize + +These commands share the majority of their syntax, which is described +in the following section, followed by one section for each command +that describes its specific syntax and semantics. + +@menu +* Combining Files Common Syntax:: +* ADD FILES:: Interleave cases from multiple files. +* MATCH FILES:: Merge cases from multiple files. +* UPDATE:: Update cases using transactional data. +@end menu + +@node Combining Files Common Syntax +@section Common Syntax + +@display +Per input file: + /FILE=@{*,'file-name'@} + [/RENAME=(src_names=target_names)@dots{}] + [/IN=var_name] + [/SORT] + +Once per command: + /BY var_list[(@{D|A@})] [var_list[(@{D|A@})]]@dots{} + [/DROP=var_list] + [/KEEP=var_list] + [/FIRST=var_name] + [/LAST=var_name] + [/MAP] +@end display + +This section describes the syntactical features in common among the +@cmd{ADD FILES}, @cmd{MATCH FILES}, and @cmd{UPDATE} commands. The +following sections describe details specific to each command. + +Each of these commands reads two or more input files and combines +them. The command's output becomes the new active file. The input +files are not changed on disk. + +The syntax of each command begins with a specification of the files to +be read as input. For each input file, specify FILE with a system, +portable, or scratch file's name as a string or a file handle +(@pxref{File Handles}), or specify an asterisk (@samp{*}) to use the +active file as input. 
Use of portable or scratch files on FILE is a +PSPP extension. + +At least two FILE subcommands must be specified. If the active file +is used as an input source, then @cmd{TEMPORARY} must not be in +effect. + +Each FILE subcommand may be followed by any number of RENAME +subcommands that specify a parenthesized group or groups of variable +names as they appear in the input file, followed by those variables' +new names, separated by an equals sign (@samp{=}), +e.g. @samp{/RENAME=(OLD1=NEW1)(OLD2=NEW2)}. To rename a single +variable, the parentheses may be omitted: @samp{/RENAME=OLD=NEW}. +Within a parenthesized group, variables are renamed simultaneously, so +that @samp{/RENAME=(A B=B A)} exchanges the names of variables A and +B. Otherwise, renaming occurs in left-to-right order. + +Each FILE subcommand may optionally be followed by a single IN +subcommand, which creates a numeric variable with the specified name +and format F1.0. The IN variable takes value 1 in an output case if +the given input file contributed to that output case, and 0 otherwise. +The DROP, KEEP, and RENAME subcommands have no effect on IN variables. + +If BY is used (see below), the SORT keyword must be specified after a +FILE if that input file is not already sorted on the BY variables. +When SORT is specified, PSPP sorts the input file's data on the BY +variables before it applies it to the command. When SORT is used, BY +is required. SORT is a PSPP extension. + +PSPP merges the dictionaries of all of the input files to form the new +active file dictionary, like so: + +@itemize @bullet +@item +The new active file's variables are the union of all the input files' +variables, matched based on their name. When a single input file +contains a variable with a given name, the output file will contain +exactly that variable. 
When more than one input file contains a +variable with a given name, those variables must all have the same +type (numeric or string) and, for string variables, the same width. +Variables are matched after renaming with the RENAME subcommand. +Thus, RENAME can be used to resolve conflicts. + +@item +The variable label for each output variable is taken from the first +specified input file that has a variable label for that variable, and +similarly for value labels and missing values. + +@item +The new active file's file label (@pxref{FILE LABEL}) is that of the +first specified FILE that has a file label. + +@item +The new active file's documents (@pxref{DOCUMENT}) are the +concatenation of all the input files' documents, in the order in which +the FILE subcommands are specified. + +@item +If all of the input files are weighted on the same variable, then the +new active file is weighted on that variable. Otherwise, the new +active file is not weighted. +@end itemize + +The remaining subcommands apply to the output file as a whole, rather +than to individual input files. They must be specified at the end of +the command specification, following all of the FILE and related +subcommands. The most important of these subcommands is BY, which +specifies a set of one or more variables that may be used to find +corresponding cases in each of the input files. The variables +specified on BY must be present in all of the input files. +Furthermore, if any of the input files are not sorted on the BY +variables, then SORT must be specified for those input files. + +The variables listed on BY may include (A) or (D) annotations to +specify ascending or descending sort order. @xref{SORT CASES}, for +more details on this notation. Adding (A) or (D) to the BY subcommand +specification is a PSPP extension. + +The DROP subcommand can be used to specify a list of variables to +exclude from the output. 
By contrast, the KEEP subcommand can be used +to specify variables to include in the output; all variables not +listed are dropped. DROP and KEEP are executed in left-to-right order +and may be repeated any number of times. DROP and KEEP do not affect +variables created by the IN, FIRST, and LAST subcommands, which are +always included in the new active file, but they can be used to drop +BY variables. + +The FIRST and LAST subcommands are optional. They may only be +specified on @cmd{MATCH FILES} and @cmd{ADD FILES}, and only when BY +is used. FIRST and LAST each add a numeric variable to the new +active file, with the name given as the subcommand's argument and F1.0 +print and write formats. The value of the FIRST variable is 1 in the +first output case with a given set of values for the BY variables, and +0 in other cases. Similarly, the LAST variable is 1 in the last case +with a given set of BY values, and 0 in other cases. + +When any of these commands creates an output case, variables that are +only in files that are not present for the current case are set to the +system-missing value for numeric variables or spaces for string +variables. + +@node ADD FILES +@section ADD FILES +@vindex ADD FILES + +@display +ADD FILES + +Per input file: + /FILE=@{*,'file-name'@} + [/RENAME=(src_names=target_names)@dots{}] + [/IN=var_name] + [/SORT] + +Once per command: + [/BY var_list[(@{D|A@})] [var_list[(@{D|A@})]@dots{}]] + [/DROP=var_list] + [/KEEP=var_list] + [/FIRST=var_name] + [/LAST=var_name] + [/MAP] +@end display + +@cmd{ADD FILES} adds cases from multiple input files. The output, +which replaces the active file, consists of all of the cases in all of +the input files. + +ADD FILES shares the bulk of its syntax with other PSPP commands for +combining multiple data files. @xref{Combining Files Common Syntax}, +above, for an explanation of this common syntax. 
+ +When BY is not used, the output of ADD FILES consists of all the cases +from the first input file specified, followed by all the cases from +the second file specified, and so on. When BY is used, the output is +additionally sorted on the BY variables. + +When ADD FILES creates an output case, variables that are not part of +the input file from which the case was drawn are set to the +system-missing value for numeric variables or spaces for string +variables. + +@node MATCH FILES +@section MATCH FILES +@vindex MATCH FILES + +@display +MATCH FILES + +Per input file: + /@{FILE,TABLE@}=@{*,'file-name'@} + [/RENAME=(src_names=target_names)@dots{}] + [/IN=var_name] + [/SORT] + +Once per command: + /BY var_list[(@{D|A@})] [var_list[(@{D|A@})]@dots{}] + [/DROP=var_list] + [/KEEP=var_list] + [/FIRST=var_name] + [/LAST=var_name] + [/MAP] +@end display + +@cmd{MATCH FILES} merges sets of corresponding cases in multiple +input files into single cases in the output, combining their data. + +MATCH FILES shares the bulk of its syntax with other PSPP commands for +combining multiple data files. @xref{Combining Files Common Syntax}, +above, for an explanation of this common syntax. + +How MATCH FILES matches up cases from the input files depends on +whether BY is specified: + +@itemize @bullet +@item +If BY is not used, MATCH FILES combines the first case from each input +file to produce the first output case, then the second case from each +input file for the second output case, and so on. If some input files +have fewer cases than others, then the shorter files do not contribute +to cases output after their input has been exhausted. + +@item +If BY is used, MATCH FILES combines cases from each input file that +have identical values for the BY variables. + +When BY is used, TABLE subcommands may be used to introduce @dfn{table +lookup files}. TABLE has the same syntax as FILE, and the RENAME, IN, and +SORT subcommands may follow a TABLE in the same way as a FILE. 
+Regardless of the number of TABLEs, at least one FILE must be specified. +Table lookup files are treated in the same way as other input files +for most purposes and, in particular, table lookup files must be +sorted on the BY variables or the SORT subcommand must be specified +for that TABLE. + +Cases in table lookup files are not consumed after they have been used +once. This means that data in table lookup files can correspond to +any number of cases in FILE input files. Table lookup files are +analogous to lookup tables in traditional relational database systems. + +If a table lookup file contains more than one case with a given set of +BY variables, only the first case is used. +@end itemize + +When MATCH FILES creates an output case, variables that are only in +files that are not present for the current case are set to the +system-missing value for numeric variables or spaces for string +variables. + +@node UPDATE +@section UPDATE +@vindex UPDATE + +@display +UPDATE + +Per input file: + /FILE=@{*,'file-name'@} + [/RENAME=(src_names=target_names)@dots{}] + [/IN=var_name] + [/SORT] + +Once per command: + /BY var_list[(@{D|A@})] [var_list[(@{D|A@})]]@dots{} + [/DROP=var_list] + [/KEEP=var_list] + [/MAP] +@end display + +@cmd{UPDATE} updates a @dfn{master file} by applying modifications +from one or more @dfn{transaction files}. + +UPDATE shares the bulk of its syntax with other PSPP commands for +combining multiple data files. @xref{Combining Files Common Syntax}, +above, for an explanation of this common syntax. + +At least two FILE subcommands must be specified. The first FILE +subcommand names the master file, and the rest name transaction files. +Every input file must either be sorted on the variables named on the +BY subcommand, or the SORT subcommand must be used just after the FILE +subcommand for that input file. 
+ +UPDATE uses the variables specified on the BY subcommand, which is +required, to attempt to match each case in a transaction file with a +case in the master file: + +@itemize @bullet +@item +When a match is found, then the values of the variables present in the +transaction file replace those variables' values in the new active +file. If there are matching cases in more than one transaction file, +PSPP applies the replacements from the first transaction file, then +from the second transaction file, and so on. Similarly, if a single +transaction file has cases with duplicate BY values, then those are +applied in order to the master file. + +When a variable in a transaction file has a missing value or a string +variable's value is all blanks, that value is never used to update the +master file. + +@item +If a case in the master file has no matching case in any transaction +file, then it is copied unchanged to the output. + +@item +If a case in a transaction file has no matching case in the master +file, then it causes a new case to be added to the output, initialized +from the values in the transaction file. +@end itemize diff --git a/doc/files.texi b/doc/files.texi index ae10ec7a..2fd98927 100644 --- a/doc/files.texi +++ b/doc/files.texi @@ -1,5 +1,5 @@ -@node System and Portable Files -@chapter System Files and Portable Files +@node System and Portable File IO +@chapter System and Portable File I/O The commands in this chapter read, write, and examine system files and portable files. @@ -10,7 +10,6 @@ portable files. * GET:: Read from a system file. * GET DATA:: Read from foreign files. * IMPORT:: Read from a portable file. -* MATCH FILES:: Merge system files. * SAVE:: Write to a system file. * SYSFILE INFO:: Display system file dictionary. * XEXPORT:: Write to a portable file, as a transformation. @@ -651,99 +650,6 @@ data is read later, when a procedure is executed. Use of @cmd{IMPORT} to read a system file or scratch file is a PSPP extension. 
-@node MATCH FILES -@section MATCH FILES -@vindex MATCH FILES - -@display -MATCH FILES - /@{FILE,TABLE@}=@{*,'file-name'@} - /RENAME=(src_names=target_names)@dots{} - /IN=var_name - - /BY=var_list - /DROP=var_list - /KEEP=var_list - /FIRST=var_name - /LAST=var_name - /MAP -@end display - -@cmd{MATCH FILES} merges one or more system, portable, or scratch files, -optionally -including the active file. Cases with the same values for BY -variables are combined into a single case. Cases with different -values are output in order. Thus, multiple sorted files are -combined into a single sorted file based on the value of the BY -variables. The results of the merge become the new active file. - -Specify FILE with a system, portable, or scratch file as a file name -string or file handle -(@pxref{File Handles}), or with an asterisk (@samp{*}) to -indicate the current active file. The files specified on FILE are -merged together based on the BY variables, or combined case-by-case if -BY is not specified. - -Specify TABLE with a file to use it as a @dfn{table -lookup file}. Cases in table lookup files are not used up after -they've been used once. This means that data in table lookup files can -correspond to any number of cases in FILE files. Table lookup files -correspond to lookup tables in traditional relational database systems. -If a table lookup file contains more than one case with a given set of -BY variables, only the first case is used. - -Any number of FILE and TABLE subcommands may be specified. -Ordinarily, at least two FILE subcommands, or one FILE and at least -one TABLE, should be specified. Each instance of FILE or TABLE can be -followed by any sequence of RENAME subcommands. These have the same -form and meaning as the corresponding subcommands of @cmd{GET} -(@pxref{GET}), but apply only to variables in the given file. 
- -Each FILE or TABLE may optionally be followed by an IN subcommand, -which creates a numeric variable with the specified name and format -F1.0. The IN variable takes value 1 in a case if the given file -contributed a row to the merged file, 0 otherwise. The DROP, KEEP, -and RENAME subcommands do not affect IN variables. - -When more than one FILE or TABLE contains a variable with a given -name, those variables must all have the same type (numeric or string) -and, for string variables, the same width. This rules applies to -variable names after renaming with RENAME; thus, RENAME can be used to -resolve conflicts. - -FILE and TABLE must be specified at the beginning of the command, with -any RENAME or IN specifications immediately after the corresponding -FILE or TABLE. These subcommands are followed by BY, DROP, KEEP, -FIRST, LAST, and MAP. - -The BY subcommand specifies a list of variables that are used to match -cases from each of the files. When TABLE or IN is used, BY is -required; otherwise, it is optional. When BY is specified, all the -files named on FILE and TABLE subcommands must be sorted in ascending -order of the BY variables. Variables belonging to files that are not -present for the current case are set to the system-missing value for -numeric variables or spaces for string variables. - -The DROP and KEEP subcommands allow variables to be dropped from or -reordered within the new active file. These subcommands have the same -form and meaning as the corresponding subcommands of @cmd{GET} -(@pxref{GET}). They apply to the new active file as a whole, not to -individual input files. The variable names specified on DROP and KEEP -are those after any renaming with RENAME. - -The optional FIRST and LAST subcommands name variables that @cmd{MATCH -FILES} adds to the active file. The new variables are numeric with -print and write format F1.0. 
The value of the FIRST variable is 1 in -the first case with a given set of values for the BY variables, and 0 -in other cases. Similarly, the LAST variable is 1 in the last case -with a given of BY values, and 0 in other cases. - -@cmd{MATCH FILES} may not be specified following @cmd{TEMPORARY} -(@pxref{TEMPORARY}) if the active file is used as an input source. - -Use of portable or scratch files on @cmd{MATCH FILES} is a PSPP -extension. - @node SAVE @section SAVE @vindex SAVE diff --git a/doc/pspp.texinfo b/doc/pspp.texinfo index 6171df6d..975ee660 100644 --- a/doc/pspp.texinfo +++ b/doc/pspp.texinfo @@ -70,7 +70,8 @@ modify this GNU manual.'' * Expressions:: Numeric and string expression syntax. * Data Input and Output:: Reading data from user files. -* System and Portable Files:: Dealing with system & portable files. +* System and Portable File IO:: Reading and writing system & portable files. +* Combining Data Files:: Combining data from multiple files. * Variable Attributes:: Adjusting and examining variables. * Data Manipulation:: Simple operations on data. * Data Selection:: Select certain cases for analysis. @@ -98,6 +99,7 @@ modify this GNU manual.'' @include expressions.texi @include data-io.texi @include files.texi +@include combining.texi @include variables.texi @include transformation.texi @include data-selection.texi diff --git a/src/data/automake.mk b/src/data/automake.mk index cb407776..4169a57e 100644 --- a/src/data/automake.mk +++ b/src/data/automake.mk @@ -16,6 +16,8 @@ src_data_libdata_la_SOURCES = \ src/data/calendar.h \ src/data/case-map.c \ src/data/case-map.h \ + src/data/case-matcher.c \ + src/data/case-matcher.h \ src/data/case.c \ src/data/casegrouper.c \ src/data/casegrouper.h \ diff --git a/src/data/case-matcher.c b/src/data/case-matcher.c new file mode 100644 index 00000000..9cfc0083 --- /dev/null +++ b/src/data/case-matcher.c @@ -0,0 +1,152 @@ +/* PSPP - a program for statistical analysis. 
+ Copyright (C) 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include + +#include + +#include +#include +#include +#include + +#include "xalloc.h" + +struct case_matcher_input + { + struct subcase by_vars; + const struct ccase *data; + bool *is_minimal; + }; + +struct case_matcher + { + struct case_matcher_input *inputs; + size_t n_inputs, allocated_inputs; + union value *by_values; + }; + +/* Creates and returns a new case matcher. */ +struct case_matcher * +case_matcher_create (void) +{ + struct case_matcher *cm = xmalloc (sizeof *cm); + cm->inputs = NULL; + cm->n_inputs = 0; + cm->allocated_inputs = 0; + cm->by_values = NULL; + return cm; +} + +/* Adds a new input file to case matcher CM. + case_matcher_match() will compare the variables specified in + BY in case DATA and set *IS_MINIMAL appropriately. + + All of the BY subcases provided to this function for a given + CM must be conformable (see subcase_conformable()). 
*/ +void +case_matcher_add_input (struct case_matcher *cm, const struct subcase *by, + const struct ccase *data, bool *is_minimal) +{ + struct case_matcher_input *input; + + if (cm->n_inputs == 0) + cm->by_values = xmalloc (subcase_get_n_values (by) + * sizeof *cm->by_values); + else + assert (subcase_conformable (by, &cm->inputs[0].by_vars)); + + if (cm->n_inputs >= cm->allocated_inputs) + cm->inputs = x2nrealloc (cm->inputs, &cm->allocated_inputs, + sizeof *cm->inputs); + input = &cm->inputs[cm->n_inputs++]; + subcase_clone (&input->by_vars, by); + input->data = data; + input->is_minimal = is_minimal; +} + +/* Destroys case matcher CM, releasing the cloned BY subcase of + every input, the input array, and the BY value buffer handed + out by case_matcher_match(). The cases and IS_MINIMAL flags + registered with case_matcher_add_input() are not freed here. + Does nothing if CM is null. */ +void +case_matcher_destroy (struct case_matcher *cm) +{ + if (cm != NULL) + { + size_t i; + + for (i = 0; i < cm->n_inputs; i++) + { + struct case_matcher_input *input = &cm->inputs[i]; + subcase_destroy (&input->by_vars); + } + /* by_values is allocated by the first case_matcher_add_input() + call and is still null if no input was ever added. */ + free (cm->by_values); + free (cm->inputs); + free (cm); + } +} + +static int +compare_BY_3way (struct case_matcher_input *a, struct case_matcher_input *b) +{ + return subcase_compare_3way (&a->by_vars, a->data, &b->by_vars, b->data); +} + +/* Compares the values of the BY variables in all of the nonnull + cases provided to case_matcher_add_input() for CM, sets + *IS_MINIMAL for each one to true if it has the minimum BY + values among those cases or to false if its BY values are + greater than the minimum. Also sets *IS_MINIMAL to false for + null cases. Sets *BY to the BY values extracted from the + minimum case. (The caller must not free *BY.) + + Returns true if at least one of the cases is nonnull, false + if they are all null.*/ +bool +case_matcher_match (struct case_matcher *cm, union value **by) +{ + struct case_matcher_input *file, *min; + + min = NULL; + for (file = cm->inputs; file < &cm->inputs[cm->n_inputs]; file++) + if (!case_is_null (file->data)) + { + int cmp = min != NULL ? 
compare_BY_3way (min, file) : 1; + if (cmp < 0) + *file->is_minimal = false; + else + { + *file->is_minimal = true; + if (cmp > 0) + min = file; + } + } + else + *file->is_minimal = false; + + if (min != NULL) + { + for (file = cm->inputs; file < min; file++) + *file->is_minimal = false; + subcase_extract (&min->by_vars, min->data, cm->by_values); + *by = cm->by_values; + return true; + } + else + { + *by = NULL; + return false; + } +} diff --git a/src/data/case-matcher.h b/src/data/case-matcher.h new file mode 100644 index 00000000..d172f611 --- /dev/null +++ b/src/data/case-matcher.h @@ -0,0 +1,33 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 2008 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
*/ + +#ifndef DATA_CASE_MATCHER_H +#define DATA_CASE_MATCHER_H 1 + +#include + +struct ccase; +struct subcase; +union value; + +struct case_matcher *case_matcher_create (void); +void case_matcher_add_input (struct case_matcher *, const struct subcase *, + const struct ccase *, bool *is_minimal); +void case_matcher_destroy (struct case_matcher *); + +bool case_matcher_match (struct case_matcher *, union value **by); + +#endif /* data/case-matcher.h */ diff --git a/src/language/command.def b/src/language/command.def index de92c26a..e06261f9 100644 --- a/src/language/command.def +++ b/src/language/command.def @@ -40,12 +40,14 @@ DEF_CMD (S_ANY, 0, "SYSFILE INFO", cmd_sysfile_info) DEF_CMD (S_ANY, F_KEEP_FINAL_TOKEN, "TITLE", cmd_title) /* Commands that define (or replace) the active file. */ +DEF_CMD (S_INITIAL | S_DATA, 0, "ADD FILES", cmd_add_files) DEF_CMD (S_INITIAL | S_DATA | S_INPUT_PROGRAM | S_FILE_TYPE, 0, "DATA LIST", cmd_data_list) DEF_CMD (S_INITIAL | S_DATA, 0, "GET", cmd_get) DEF_CMD (S_INITIAL | S_DATA, 0, "GET DATA", cmd_get_data) DEF_CMD (S_INITIAL | S_DATA, 0, "IMPORT", cmd_import) DEF_CMD (S_INITIAL | S_DATA, 0, "INPUT PROGRAM", cmd_input_program) DEF_CMD (S_INITIAL | S_DATA, 0, "MATCH FILES", cmd_match_files) +DEF_CMD (S_INITIAL | S_DATA, 0, "UPDATE", cmd_update) /* Transformations and utilities that may appear after active file definition or within INPUT PROGRAM. */ @@ -144,7 +146,6 @@ DEF_CMD (S_ANY, F_TESTING, "DEBUG XFORM FAIL", cmd_debug_xform_fail) /* Unimplemented commands. 
*/ UNIMPL_CMD ("2SLS", "Two stage least squares regression") UNIMPL_CMD ("ACF", "Autocorrelation function") -UNIMPL_CMD ("ADD FILES", "Add files to dictionary") UNIMPL_CMD ("ALSCAL", "Multidimensional scaling") UNIMPL_CMD ("ANACOR", "Correspondence analysis") UNIMPL_CMD ("ANOVA", "Factorial analysis of variance") @@ -256,7 +257,6 @@ UNIMPL_CMD ("TSPLOT", "Plot time sequence variables") UNIMPL_CMD ("TWOSTEP CLUSTER", "Cluster observations") UNIMPL_CMD ("UNIANOVA", "Univariate analysis") UNIMPL_CMD ("UNNUMBERED", "obsolete") -UNIMPL_CMD ("UPDATE", "Update working file") UNIMPL_CMD ("VALIDATEDATA", "Identify suspicious cases") UNIMPL_CMD ("VARCOMP", "Estimate variance") UNIMPL_CMD ("VARSTOCASES", "Restructure complex data") diff --git a/src/language/data-io/automake.mk b/src/language/data-io/automake.mk index 312690a3..ca003d1d 100644 --- a/src/language/data-io/automake.mk +++ b/src/language/data-io/automake.mk @@ -5,6 +5,7 @@ src_language_data_io_built_sources = \ src/language/data-io/list.c language_data_io_sources = \ + src/language/data-io/combine-files.c \ src/language/data-io/data-list.c \ src/language/data-io/data-parser.c \ src/language/data-io/data-parser.h \ @@ -17,7 +18,6 @@ language_data_io_sources = \ src/language/data-io/get.c \ src/language/data-io/inpt-pgm.c \ src/language/data-io/inpt-pgm.h \ - src/language/data-io/match-files.c \ src/language/data-io/placement-parser.c \ src/language/data-io/placement-parser.h \ src/language/data-io/print-space.c \ diff --git a/src/language/data-io/combine-files.c b/src/language/data-io/combine-files.c new file mode 100644 index 00000000..1451743d --- /dev/null +++ b/src/language/data-io/combine-files.c @@ -0,0 +1,864 @@ +/* PSPP - a program for statistical analysis. + Copyright (C) 1997-9, 2000, 2006, 2007, 2008 Free Software Foundation, Inc. 
+ + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xalloc.h" + +#include "gettext.h" +#define _(msgid) gettext (msgid) + +enum comb_command_type + { + COMB_ADD, + COMB_MATCH, + COMB_UPDATE + }; + +/* File types. */ +enum comb_file_type + { + COMB_FILE, /* Specified on FILE= subcommand. */ + COMB_TABLE /* Specified on TABLE= subcommand. */ + }; + +/* One FILE or TABLE subcommand. */ +struct comb_file + { + /* Basics. */ + enum comb_file_type type; /* COMB_FILE or COMB_TABLE. */ + + /* Variables. */ + struct subcase by_vars; /* BY variables in this input file. */ + struct subcase src, dst; /* Data to copy to output; where to put it. */ + + /* Input files. */ + struct file_handle *handle; /* Input file handle. */ + struct dictionary *dict; /* Input file dictionary. */ + struct casereader *reader; /* Input data source. */ + struct ccase data; /* The current input case. */ + bool is_minimal; /* Does 'data' have minimum BY values across + all input files? */ + bool is_sorted; /* Is file presorted on the BY variables? */ + + /* IN subcommand. */ + char in_name[VAR_NAME_LEN + 1]; + struct variable *in_var; + }; + +struct comb_proc + { + struct comb_file *files; /* All the files being merged. 
*/ + size_t n_files; /* Number of files. */ + + struct dictionary *dict; /* Dictionary of output file. */ + struct subcase by_vars; /* BY variables in the output. */ + struct casewriter *output; /* Destination for output. */ + + struct case_matcher *matcher; + + /* FIRST, LAST. + Only if "first" or "last" is nonnull are the remaining + members used. */ + struct variable *first; /* Variable specified on FIRST (if any). */ + struct variable *last; /* Variable specified on LAST (if any). */ + struct ccase buffered_case; /* Case ready for output except that we don't + know the value for the LAST variable yet. */ + union value *prev_BY; /* Values of BY vars in buffered_case. */ + }; + +static int combine_files (enum comb_command_type, struct lexer *, + struct dataset *); +static void free_comb_proc (struct comb_proc *); + +static void close_all_comb_files (struct comb_proc *); +static bool merge_dictionary (struct dictionary *const, struct comb_file *); + +static void execute_update (struct comb_proc *); +static void execute_match_files (struct comb_proc *); +static void execute_add_files (struct comb_proc *); + +static bool create_flag_var (const char *subcommand_name, const char *var_name, + struct dictionary *, struct variable **); +static void output_case (struct comb_proc *, struct ccase *, union value *by); +static void output_buffered_case (struct comb_proc *); + +int +cmd_add_files (struct lexer *lexer, struct dataset *ds) +{ + return combine_files (COMB_ADD, lexer, ds); +} + +int +cmd_match_files (struct lexer *lexer, struct dataset *ds) +{ + return combine_files (COMB_MATCH, lexer, ds); +} + +int +cmd_update (struct lexer *lexer, struct dataset *ds) +{ + return combine_files (COMB_UPDATE, lexer, ds); +} + +static int +combine_files (enum comb_command_type command, + struct lexer *lexer, struct dataset *ds) +{ + struct comb_proc proc; + + bool saw_by = false; + bool saw_sort = false; + struct casereader *active_file = NULL; + + char first_name[VAR_NAME_LEN + 1] 
= ""; + char last_name[VAR_NAME_LEN + 1] = ""; + + struct taint *taint = NULL; + + size_t n_tables = 0; + size_t allocated_files = 0; + + size_t i; + + proc.files = NULL; + proc.n_files = 0; + proc.dict = dict_create (); + proc.output = NULL; + proc.matcher = NULL; + subcase_init_empty (&proc.by_vars); + proc.first = NULL; + proc.last = NULL; + case_nullify (&proc.buffered_case); + proc.prev_BY = NULL; + + dict_set_case_limit (proc.dict, dict_get_case_limit (dataset_dict (ds))); + + lex_match (lexer, '/'); + for (;;) + { + struct comb_file *file; + enum comb_file_type type; + + if (lex_match_id (lexer, "FILE")) + type = COMB_FILE; + else if (command == COMB_MATCH && lex_match_id (lexer, "TABLE")) + { + type = COMB_TABLE; + n_tables++; + } + else + break; + lex_match (lexer, '='); + + if (proc.n_files >= allocated_files) + proc.files = x2nrealloc (proc.files, &allocated_files, + sizeof *proc.files); + file = &proc.files[proc.n_files++]; + file->type = type; + subcase_init_empty (&file->by_vars); + subcase_init_empty (&file->src); + subcase_init_empty (&file->dst); + file->handle = NULL; + file->dict = NULL; + file->reader = NULL; + case_nullify (&file->data); + file->is_sorted = true; + file->in_name[0] = '\0'; + file->in_var = NULL; + + if (lex_match (lexer, '*')) + { + if (!proc_has_active_file (ds)) + { + msg (SE, _("Cannot specify the active file since no active " + "file has been defined.")); + goto error; + } + + if (proc_make_temporary_transformations_permanent (ds)) + msg (SE, _("This command may not be used after TEMPORARY when " + "the active file is an input source. 
" + "Temporary transformations will be made permanent.")); + + file->dict = dict_clone (dataset_dict (ds)); + } + else + { + file->handle = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH); + if (file->handle == NULL) + goto error; + + file->reader = any_reader_open (file->handle, &file->dict); + if (file->reader == NULL) + goto error; + } + + while (lex_match (lexer, '/')) + if (lex_match_id (lexer, "RENAME")) + { + if (!parse_dict_rename (lexer, file->dict)) + goto error; + } + else if (lex_match_id (lexer, "IN")) + { + lex_match (lexer, '='); + if (lex_token (lexer) != T_ID) + { + lex_error (lexer, NULL); + goto error; + } + + if (file->in_name[0]) + { + msg (SE, _("Multiple IN subcommands for a single FILE or " + "TABLE.")); + goto error; + } + strcpy (file->in_name, lex_tokid (lexer)); + lex_get (lexer); + } + else if (lex_match_id (lexer, "SORT")) + { + file->is_sorted = false; + saw_sort = true; + } + + merge_dictionary (proc.dict, file); + } + + while (lex_token (lexer) != '.') + { + if (lex_match (lexer, T_BY)) + { + const struct variable **by_vars; + size_t i; + bool ok; + + if (saw_by) + { + lex_sbc_only_once ("BY"); + goto error; + } + saw_by = true; + + lex_match (lexer, '='); + if (!parse_sort_criteria (lexer, proc.dict, &proc.by_vars, + &by_vars, NULL)) + goto error; + + ok = true; + for (i = 0; i < proc.n_files; i++) + { + struct comb_file *file = &proc.files[i]; + size_t j; + + for (j = 0; j < subcase_get_n_values (&proc.by_vars); j++) + { + const char *name = var_get_name (by_vars[j]); + struct variable *var = dict_lookup_var (file->dict, name); + if (var != NULL) + subcase_add_var (&file->by_vars, var, + subcase_get_direction (&proc.by_vars, j)); + else + { + if (file->handle != NULL) + msg (SE, _("File %s lacks BY variable %s."), + fh_get_name (file->handle), name); + else + msg (SE, _("Active file lacks BY variable %s."), name); + ok = false; + } + } + assert (!ok || subcase_conformable (&file->by_vars, + &proc.files[0].by_vars)); + } + free 
(by_vars); + + if (!ok) + goto error; + } + else if (command != COMB_UPDATE && lex_match_id (lexer, "FIRST")) + { + /* FIRST is not accepted on UPDATE. Only the name is recorded + here; the variable is created after parsing is done. */ + if (first_name[0] != '\0') + { + lex_sbc_only_once ("FIRST"); + goto error; + } + + lex_match (lexer, '='); + if (!lex_force_id (lexer)) + goto error; + strcpy (first_name, lex_tokid (lexer)); + lex_get (lexer); + } + else if (command != COMB_UPDATE && lex_match_id (lexer, "LAST")) + { + /* LAST is handled the same way as FIRST. */ + if (last_name[0] != '\0') + { + lex_sbc_only_once ("LAST"); + goto error; + } + + lex_match (lexer, '='); + if (!lex_force_id (lexer)) + goto error; + strcpy (last_name, lex_tokid (lexer)); + lex_get (lexer); + } + else if (lex_match_id (lexer, "MAP")) + { + /* FIXME: MAP is accepted but currently has no effect. */ + } + else if (lex_match_id (lexer, "DROP")) + { + if (!parse_dict_drop (lexer, proc.dict)) + goto error; + } + else if (lex_match_id (lexer, "KEEP")) + { + if (!parse_dict_keep (lexer, proc.dict)) + goto error; + } + else + { + lex_error (lexer, NULL); + goto error; + } + + /* Each subcommand must be followed by a slash or by the end of + the command. */ + if (!lex_match (lexer, '/') && lex_token (lexer) != '.') + { + lex_end_of_command (lexer); + goto error; + } + } + + /* BY is mandatory for UPDATE, and for any command that uses TABLE + or SORT. */ + if (!saw_by) + { + if (command == COMB_UPDATE) + { + msg (SE, _("The BY subcommand is required.")); + goto error; + } + if (n_tables) + { + msg (SE, _("BY is required when TABLE is specified.")); + goto error; + } + if (saw_sort) + { + msg (SE, _("BY is required when SORT is specified.")); + goto error; + } + } + + /* Add IN, FIRST, and LAST variables to master dictionary. + create_flag_var() does nothing but null the pointer for a name + that was never specified (an empty string). */ + for (i = 0; i < proc.n_files; i++) + { + struct comb_file *file = &proc.files[i]; + if (!create_flag_var ("IN", file->in_name, proc.dict, &file->in_var)) + goto error; + } + if (!create_flag_var ("FIRST", first_name, proc.dict, &proc.first) + || !create_flag_var ("LAST", last_name, proc.dict, &proc.last)) + goto error; + + dict_delete_scratch_vars (proc.dict); + dict_compact_values (proc.dict); + + /* Set up mapping from each file's variables to master + variables. 
*/ + for (i = 0; i < proc.n_files; i++) + { + struct comb_file *file = &proc.files[i]; + size_t src_var_cnt = dict_get_var_cnt (file->dict); + size_t j; + + for (j = 0; j < src_var_cnt; j++) + { + struct variable *src_var = dict_get_var (file->dict, j); + struct variable *dst_var = dict_lookup_var (proc.dict, + var_get_name (src_var)); + /* Variables absent from the master dictionary (e.g. removed + by DROP or KEEP) are simply not copied. */ + if (dst_var != NULL) + { + /* "src" and "dst" are only used as parallel lists for + subcase_copy() in apply_case(); the sort direction + given here is presumably irrelevant. */ + subcase_add_var (&file->src, src_var, SC_ASCEND); + subcase_add_var (&file->dst, dst_var, SC_ASCEND); + } + } + } + + proc.output = autopaging_writer_create (dict_get_next_value_idx (proc.dict)); + taint = taint_clone (casewriter_get_taint (proc.output)); + + /* Set up case matcher. */ + proc.matcher = case_matcher_create (); + for (i = 0; i < proc.n_files; i++) + { + struct comb_file *file = &proc.files[i]; + if (file->reader == NULL) + { + /* A null reader marks the active file. It is opened on + first use and cloned for any further uses. */ + if (active_file == NULL) + { + proc_discard_output (ds); + file->reader = active_file = proc_open (ds); + } + else + file->reader = casereader_clone (active_file); + } + if (!file->is_sorted) + file->reader = sort_execute (file->reader, &file->by_vars); + taint_propagate (casereader_get_taint (file->reader), taint); + /* Prime file->data with the file's first case. */ + casereader_read (file->reader, &file->data); + /* Only FILEs take part in matching; TABLEs are looked up + separately by scan_table(). */ + if (file->type == COMB_FILE) + case_matcher_add_input (proc.matcher, &file->by_vars, + &file->data, &file->is_minimal); + } + + if (command == COMB_ADD) + execute_add_files (&proc); + else if (command == COMB_MATCH) + execute_match_files (&proc); + else if (command == COMB_UPDATE) + execute_update (&proc); + else + NOT_REACHED (); + + case_matcher_destroy (proc.matcher); + proc.matcher = NULL; + close_all_comb_files (&proc); + if (active_file != NULL) + proc_commit (ds); + + /* The dictionary and the output are handed over to DS here, so + null them to keep free_comb_proc() from destroying them. */ + proc_set_active_file (ds, casewriter_make_reader (proc.output), proc.dict); + proc.dict = NULL; + proc.output = NULL; + + free_comb_proc (&proc); + + return taint_destroy (taint) ? 
CMD_SUCCESS : CMD_CASCADING_FAILURE; + + error: + if (active_file != NULL) + proc_commit (ds); + free_comb_proc (&proc); + taint_destroy (taint); + return CMD_CASCADING_FAILURE; +} + +/* Merge the dictionary for file F into master dictionary M. */ +static bool +merge_dictionary (struct dictionary *const m, struct comb_file *f) +{ + struct dictionary *d = f->dict; + const char *d_docs, *m_docs; + int i; + + if (dict_get_label (m) == NULL) + dict_set_label (m, dict_get_label (d)); + + d_docs = dict_get_documents (d); + m_docs = dict_get_documents (m); + if (d_docs != NULL) + { + if (m_docs == NULL) + dict_set_documents (m, d_docs); + else + { + char *new_docs = xasprintf ("%s%s", m_docs, d_docs); + dict_set_documents (m, new_docs); + free (new_docs); + } + } + + for (i = 0; i < dict_get_var_cnt (d); i++) + { + struct variable *dv = dict_get_var (d, i); + struct variable *mv = dict_lookup_var (m, var_get_name (dv)); + + if (dict_class_from_id (var_get_name (dv)) == DC_SCRATCH) + continue; + + if (mv != NULL) + { + if (var_get_width (mv) != var_get_width (dv)) + { + const char *var_name = var_get_name (dv); + const char *file_name = fh_get_name (f->handle); + struct string s = DS_EMPTY_INITIALIZER; + ds_put_format (&s, + _("Variable %s in file %s has different " + "type or width from the same variable in " + "earlier file."), + var_name, file_name); + ds_put_cstr (&s, " "); + if (var_is_numeric (dv)) + ds_put_format (&s, _("In file %s, %s is numeric."), + file_name, var_name); + else + ds_put_format (&s, _("In file %s, %s is a string variable " + "with width %d."), + file_name, var_name, var_get_width (dv)); + ds_put_cstr (&s, " "); + if (var_is_numeric (mv)) + ds_put_format (&s, _("In an earlier file, %s was numeric."), + var_name); + else + ds_put_format (&s, _("In an earlier file, %s was a string " + "variable with width %d."), + var_name, var_get_width (mv)); + msg (SE, ds_cstr (&s)); + ds_destroy (&s); + return false; + } + + if (var_has_value_labels (dv) && 
!var_has_value_labels (mv)) + var_set_value_labels (mv, var_get_value_labels (dv)); + if (var_has_missing_values (dv) && !var_has_missing_values (mv)) + var_set_missing_values (mv, var_get_missing_values (dv)); + if (var_get_label (dv) && !var_get_label (mv)) + var_set_label (mv, var_get_label (dv)); + } + else + mv = dict_clone_var_assert (m, dv, var_get_name (dv)); + } + + return true; +} + +/* If VAR_NAME is a non-empty string, attempts to create a + variable named VAR_NAME, with format F1.0, in DICT, and stores + a pointer to the variable in *VAR. Returns true if + successful, false if the variable name is a duplicate (in + which case a message saying that the variable specified on the + given SUBCOMMAND is a duplicate is emitted). Also returns + true, after storing a null pointer in *VAR, if VAR_NAME is + empty. VAR_NAME itself must not be null. */ +static bool +create_flag_var (const char *subcommand, const char *var_name, + struct dictionary *dict, struct variable **var) +{ + if (var_name[0] != '\0') + { + struct fmt_spec format = fmt_for_output (FMT_F, 1, 0); + *var = dict_create_var (dict, var_name, 0); + if (*var == NULL) + { + /* Argument order follows the format: the variable name + first, then the subcommand. */ + msg (SE, _("Variable name %s specified on %s subcommand " + "duplicates an existing variable name."), + var_name, subcommand); + return false; + } + var_set_both_formats (*var, &format); + } + else + *var = NULL; + return true; +} + +/* Closes all the files in PROC and frees their associated data. + Afterward PROC has no files, so that calling this function + again (as free_comb_proc() does) is harmless. */ +static void +close_all_comb_files (struct comb_proc *proc) +{ + size_t i; + + for (i = 0; i < proc->n_files; i++) + { + struct comb_file *file = &proc->files[i]; + subcase_destroy (&file->by_vars); + subcase_destroy (&file->src); + subcase_destroy (&file->dst); + fh_unref (file->handle); + dict_destroy (file->dict); + casereader_destroy (file->reader); + case_destroy (&file->data); + } + free (proc->files); + proc->files = NULL; + proc->n_files = 0; +} + +/* Frees all the data for the procedure. 
*/ +static void +free_comb_proc (struct comb_proc *proc) +{ + close_all_comb_files (proc); + dict_destroy (proc->dict); + casewriter_destroy (proc->output); + case_matcher_destroy (proc->matcher); + subcase_destroy (&proc->by_vars); + case_destroy (&proc->buffered_case); + free (proc->prev_BY); +} + +static bool scan_table (struct comb_file *, union value by[]); +static void create_output_case (const struct comb_proc *, struct ccase *); +static void apply_case (const struct comb_file *, struct ccase *); +static void apply_file_case_and_advance (struct comb_file *, struct ccase *, + union value by[]); +static void output_case (struct comb_proc *, struct ccase *, union value by[]); +static void output_buffered_case (struct comb_proc *); + +/* Executes the ADD FILES command. + + For each set of BY values that the case matcher reports, + writes one output case for every input case that has those BY + values, taking the files in the order they were specified. */ +static void +execute_add_files (struct comb_proc *proc) +{ + union value *by; + + while (case_matcher_match (proc->matcher, &by)) + { + struct ccase output; + size_t i; + + for (i = 0; i < proc->n_files; i++) + { + struct comb_file *file = &proc->files[i]; + /* Drain every case in this file that has the current BY + values. */ + while (file->is_minimal) + { + create_output_case (proc, &output); + apply_file_case_and_advance (file, &output, by); + output_case (proc, &output, by); + } + } + } + output_buffered_case (proc); +} + +/* Executes the MATCH FILES command. + + For each set of BY values that the case matcher reports, + builds a single output case from at most one case per input + file and writes it. */ +static void +execute_match_files (struct comb_proc *proc) +{ + union value *by; + + while (case_matcher_match (proc->matcher, &by)) + { + struct ccase output; + size_t i; + + create_output_case (proc, &output); + /* Apply the files from last to first, so that where several + files supply the same variable, the value from the file + specified earliest is the one left in the output case. */ + for (i = proc->n_files; i-- > 0; ) + { + struct comb_file *file = &proc->files[i]; + if (file->type == COMB_FILE) + { + /* Consume just one case per FILE. BY is passed as null + so that is_minimal is not recomputed here. */ + if (file->is_minimal) + apply_file_case_and_advance (file, &output, NULL); + } + else + { + /* A TABLE case is applied without being consumed, so it + can match any number of output cases. */ + if (scan_table (file, by)) + apply_case (file, &output); + } + } + output_case (proc, &output, by); + } + output_buffered_case (proc); +} + +/* Executes the UPDATE command. 
*/ +static void +execute_update (struct comb_proc *proc) +{ + union value *by; + size_t n_duplicates = 0; + + while (case_matcher_match (proc->matcher, &by)) + { + struct comb_file *first, *file; + struct ccase output; + + /* Find first nonnull case in array and make an output case + from it. (The loop has no bound; it presumably relies on + the matcher having marked at least one file as minimal.) */ + create_output_case (proc, &output); + for (first = &proc->files[0]; ; first++) + if (first->is_minimal) + break; + apply_file_case_and_advance (first, &output, by); + + /* Read additional cases and update the output case from + them. (Don't update the output case from any duplicate + cases in the master file.) + + proc->files[0] is the master file. If FIRST is the + master, start with the file after it; otherwise start + with FIRST itself, to pick up any further cases in that + transaction file. */ + for (file = first + (first == proc->files); + file < &proc->files[proc->n_files]; file++) + { + while (file->is_minimal) + apply_file_case_and_advance (file, &output, by); + } + casewriter_write (proc->output, &output); + + /* Write duplicate cases in the master file directly to the + output. */ + if (first == proc->files && first->is_minimal) + { + n_duplicates++; + while (first->is_minimal) + { + create_output_case (proc, &output); + apply_file_case_and_advance (first, &output, by); + casewriter_write (proc->output, &output); + } + } + } + + if (n_duplicates) + msg (SW, _("Encountered %zu sets of duplicate cases in the master file."), + n_duplicates); +} + +/* Reads FILE, which must be of type COMB_TABLE, until it + encounters a case with BY or greater for its BY variables. + Returns true if a case with exactly BY for its BY variables + was found, otherwise false. 
*/ +static bool +scan_table (struct comb_file *file, union value by[]) +{ + /* A null case in file->data means that FILE is exhausted. */ + while (!case_is_null (&file->data)) + { + int cmp = subcase_compare_3way_xc (&file->by_vars, by, &file->data); + if (cmp > 0) + { + /* The table's case sorts before BY: skip it. */ + case_destroy (&file->data); + casereader_read (file->reader, &file->data); + } + else + return cmp == 0; + } + return false; +} + +/* Creates OUTPUT as an output case for PROC, by initializing each of + its values to system-missing or blanks, except that the values + of IN variables are set to 0. */ +static void +create_output_case (const struct comb_proc *proc, struct ccase *output) +{ + size_t n_vars = dict_get_var_cnt (proc->dict); + size_t i; + + case_create (output, dict_get_next_value_idx (proc->dict)); + for (i = 0; i < n_vars; i++) + { + struct variable *v = dict_get_var (proc->dict, i); + value_set_missing (case_data_rw (output, v), var_get_width (v)); + } + /* This must follow the loop above, which also set the IN + variables to missing. */ + for (i = 0; i < proc->n_files; i++) + { + struct comb_file *file = &proc->files[i]; + if (file->in_var != NULL) + case_data_rw (output, file->in_var)->f = false; + } +} + +/* Copies the data from FILE's case into output case OUTPUT. + If FILE has an IN variable, then it is set to 1 in OUTPUT. */ +static void +apply_case (const struct comb_file *file, struct ccase *output) +{ + subcase_copy (&file->src, &file->data, &file->dst, output); + if (file->in_var != NULL) + case_data_rw (output, file->in_var)->f = true; +} + +/* Like apply_case() above, but also advances FILE to its next + case. Also, if BY is nonnull, then FILE's is_minimal member + is updated based on whether the new case's BY values still + match those in BY. 
*/ +static void +apply_file_case_and_advance (struct comb_file *file, struct ccase *output, + union value by[]) +{ + apply_case (file, output); + case_destroy (&file->data); + casereader_read (file->reader, &file->data); + if (by) + file->is_minimal = (!case_is_null (&file->data) + && subcase_equal_cx (&file->by_vars, &file->data, by)); +} + +/* Writes OUTPUT, whose BY values have been extracted into BY, to + PROC's output file, first initializing any FIRST or LAST + variables in OUTPUT to the correct values. + + When FIRST or LAST is in use, OUTPUT is not written at once + but moved into PROC's buffered_case, and the case buffered by + the previous call is written instead. The caller must finish + with output_buffered_case() to write the final case. */ +static void +output_case (struct comb_proc *proc, struct ccase *output, union value by[]) +{ + if (proc->first == NULL && proc->last == NULL) + casewriter_write (proc->output, output); + else + { + /* It's harder with LAST, because we can't know whether + this case is the last in a group until we've prepared + the *next* case also. Thus, we buffer the previous + output case until the next one is ready. */ + bool new_BY; + if (proc->prev_BY != NULL) + { + /* The buffered case was the last in its group exactly if + this case starts a new group. */ + new_BY = !subcase_equal_xx (&proc->by_vars, proc->prev_BY, by); + if (proc->last != NULL) + case_data_rw (&proc->buffered_case, proc->last)->f = new_BY; + casewriter_write (proc->output, &proc->buffered_case); + } + else + new_BY = true; /* Nothing buffered yet: the very first case. */ + + case_move (&proc->buffered_case, output); + if (proc->first != NULL) + case_data_rw (&proc->buffered_case, proc->first)->f = new_BY; + + if (new_BY) + { + /* Remember this group's BY values; prev_BY is allocated on + first use. */ + size_t n = (subcase_get_n_values (&proc->by_vars) + * sizeof (union value)); + if (proc->prev_BY == NULL) + proc->prev_BY = xmalloc (n); + memcpy (proc->prev_BY, by, n); + } + } +} + +/* Writes a trailing buffered case to the output, if FIRST or + LAST is in use. 
*/ +static void +output_buffered_case (struct comb_proc *proc) +{ + if (proc->prev_BY != NULL) + { + if (proc->last != NULL) + case_data_rw (&proc->buffered_case, proc->last)->f = 1.0; + casewriter_write (proc->output, &proc->buffered_case); + case_nullify (&proc->buffered_case); + } +} diff --git a/src/language/data-io/match-files.c b/src/language/data-io/match-files.c index 8fe878b4..ada14824 100644 --- a/src/language/data-io/match-files.c +++ b/src/language/data-io/match-files.c @@ -19,27 +19,38 @@ #include #include +#include #include #include #include -#include #include +#include #include +#include #include #include #include #include #include #include +#include #include #include #include +#include #include "xalloc.h" #include "gettext.h" #define _(msgid) gettext (msgid) +enum command_type + { + ADD_FILES, + MATCH_FILES, + UPDATE + }; + /* File types. */ enum mtf_type { @@ -47,26 +58,25 @@ enum mtf_type MTF_TABLE /* Specified on TABLE= subcommand. */ }; -/* One of the FILEs or TABLEs on MATCH FILES. */ +/* One FILE or TABLE subcommand. */ struct mtf_file { - struct ll ll; /* In list of all files and tables. */ - enum mtf_type type; - int sequence; - - const struct variable **by; /* List of BY variables for this file. */ + struct casereader *reader; + struct subcase by; + int idx; struct mtf_variable *vars; /* Variables to copy to output. */ size_t var_cnt; /* Number of other variables. */ + bool is_sorted; /* Is presorted on the BY variables? */ struct file_handle *handle; /* Input file handle. */ struct dictionary *dict; /* Input file dictionary. */ - struct casereader *reader; /* Input reader. */ - struct ccase input; /* Input record (null at end of file). */ - /* IN subcommand. */ - char *in_name; /* Variable name. */ - struct variable *in_var; /* Variable (in master dictionary). */ + /* Used by TABLE. 
*/ + struct ccase c; + + char in_name[VAR_NAME_LEN + 1]; + struct variable *in_var; }; struct mtf_variable @@ -75,18 +85,16 @@ struct mtf_variable struct variable *out_var; }; -/* MATCH FILES procedure. */ struct mtf_proc { - struct ll_list files; /* List of "struct mtf_file"s. */ - int nonempty_files; /* FILEs that are not at end-of-file. */ - - bool ok; /* False if I/O error occurs. */ + struct mtf_file **files; /* All the files being merged. */ + size_t n_files; /* Number of files. */ struct dictionary *dict; /* Dictionary of output file. */ - struct casewriter *output; /* MATCH FILES output. */ + struct casewriter *output; /* Destination for output. */ - size_t by_cnt; /* Number of variables on BY subcommand. */ + struct case_matcher *matcher; + struct subcase by; /* FIRST, LAST. Only if "first" or "last" is nonnull are the remaining @@ -95,31 +103,51 @@ struct mtf_proc struct variable *last; /* Variable specified on LAST (if any). */ struct ccase buffered_case; /* Case ready for output except that we don't know the value for the LAST variable yet. */ - struct ccase prev_BY_case; /* Case with values of last set of BY vars. */ - const struct variable **prev_BY; /* Last set of BY variables. */ + union value *prev_BY; /* Values of BY vars in buffered_case. 
*/ }; +static int combine_files (enum command_type, struct lexer *, struct dataset *); static void mtf_free (struct mtf_proc *); static bool mtf_close_all_files (struct mtf_proc *); static bool mtf_merge_dictionary (struct dictionary *const, struct mtf_file *); -static bool mtf_read_record (struct mtf_proc *mtf, struct mtf_file *); -static void mtf_process_case (struct mtf_proc *); +static void process_update (struct mtf_proc *); +static void process_match_files (struct mtf_proc *); +static void process_add_files (struct mtf_proc *); static bool create_flag_var (const char *subcommand_name, const char *var_name, struct dictionary *, struct variable **); static char *var_type_description (struct variable *); +static void output_case (struct mtf_proc *, struct ccase *, union value *by); +static void output_buffered_case (struct mtf_proc *); + +int +cmd_add_files (struct lexer *lexer, struct dataset *ds) +{ + return combine_files (ADD_FILES, lexer, ds); +} -/* Parse and execute the MATCH FILES command. 
*/ int cmd_match_files (struct lexer *lexer, struct dataset *ds) +{ + return combine_files (MATCH_FILES, lexer, ds); +} + +int +cmd_update (struct lexer *lexer, struct dataset *ds) +{ + return combine_files (UPDATE, lexer, ds); +} + +static int +combine_files (enum command_type command, + struct lexer *lexer, struct dataset *ds) { struct mtf_proc mtf; - struct ll *first_table; - struct mtf_file *file, *next; - bool saw_in = false; + bool saw_by = false; + bool saw_sort = false; struct casereader *active_file = NULL; char first_name[VAR_NAME_LEN + 1] = ""; @@ -127,54 +155,56 @@ cmd_match_files (struct lexer *lexer, struct dataset *ds) struct taint *taint = NULL; + size_t n_files = 0; + size_t n_tables = 0; + size_t allocated_files = 0; + size_t i; - ll_init (&mtf.files); - mtf.nonempty_files = 0; - first_table = ll_null (&mtf.files); + mtf.files = NULL; + mtf.n_files = 0; mtf.dict = dict_create (); mtf.output = NULL; - mtf.by_cnt = 0; - mtf.first = mtf.last = NULL; + mtf.matcher = NULL; + subcase_init_empty (&mtf.by); + mtf.first = NULL; + mtf.last = NULL; case_nullify (&mtf.buffered_case); - case_nullify (&mtf.prev_BY_case); mtf.prev_BY = NULL; dict_set_case_limit (mtf.dict, dict_get_case_limit (dataset_dict (ds))); lex_match (lexer, '/'); - while (lex_token (lexer) == T_ID - && (lex_id_match (ss_cstr ("FILE"), ss_cstr (lex_tokid (lexer))) - || lex_id_match (ss_cstr ("TABLE"), ss_cstr (lex_tokid (lexer))))) + for (;;) { - struct mtf_file *file = xmalloc (sizeof *file); - file->by = NULL; - file->handle = NULL; - file->reader = NULL; - file->dict = NULL; - file->in_name = NULL; - file->in_var = NULL; - file->var_cnt = 0; - file->vars = NULL; - case_nullify (&file->input); + struct mtf_file *file; + enum mtf_type type; - if (lex_match_id (lexer, "FILE")) - { - file->type = MTF_FILE; - ll_insert (first_table, &file->ll); - mtf.nonempty_files++; - } - else if (lex_match_id (lexer, "TABLE")) - { - file->type = MTF_TABLE; - ll_push_tail (&mtf.files, &file->ll); - if 
(first_table == ll_null (&mtf.files)) - first_table = &file->ll; - } + if (lex_match_id (lexer, "FILE")) + type = MTF_FILE; + else if (command == MATCH_FILES && lex_match_id (lexer, "TABLE")) + type = MTF_TABLE; else - NOT_REACHED (); + break; lex_match (lexer, '='); + if (mtf.n_files >= allocated_files) + mtf.files = x2nrealloc (mtf.files, &allocated_files, + sizeof *mtf.files); + mtf.files[mtf.n_files++] = file = xmalloc (sizeof *file); + file->type = type; + file->reader = NULL; + subcase_init_empty (&file->by); + file->idx = type == MTF_FILE ? n_files++ : n_tables++; + file->vars = NULL; + file->var_cnt = 0; + file->is_sorted = true; + file->handle = NULL; + file->dict = NULL; + case_nullify (&file->c); + file->in_name[0] = '\0'; + file->in_var = NULL; + if (lex_match (lexer, '*')) { if (!proc_has_active_file (ds)) @@ -186,7 +216,7 @@ cmd_match_files (struct lexer *lexer, struct dataset *ds) if (proc_make_temporary_transformations_permanent (ds)) msg (SE, - _("MATCH FILES may not be used after TEMPORARY when " + _("This command may not be used after TEMPORARY when " "the active file is an input source. 
" "Temporary transformations will be made permanent.")); @@ -218,15 +248,19 @@ cmd_match_files (struct lexer *lexer, struct dataset *ds) goto error; } - if (file->in_name != NULL) + if (file->in_name[0]) { msg (SE, _("Multiple IN subcommands for a single FILE or " "TABLE.")); goto error; } - file->in_name = xstrdup (lex_tokid (lexer)); + strcpy (file->in_name, lex_tokid (lexer)); lex_get (lexer); - saw_in = true; + } + else if (lex_match_id (lexer, "SORT")) + { + file->is_sorted = false; + saw_sort = true; } mtf_merge_dictionary (mtf.dict, file); @@ -236,49 +270,53 @@ cmd_match_files (struct lexer *lexer, struct dataset *ds) { if (lex_match (lexer, T_BY)) { - struct mtf_file *file; - struct variable **by; + const struct variable **by_vars; + size_t i; bool ok; - if (mtf.by_cnt) + if (saw_by) { lex_sbc_only_once ("BY"); goto error; } + saw_by = true; lex_match (lexer, '='); - if (!parse_variables (lexer, mtf.dict, &by, &mtf.by_cnt, - PV_NO_DUPLICATE | PV_NO_SCRATCH)) + if (!parse_sort_criteria (lexer, mtf.dict, &mtf.by, &by_vars, NULL)) goto error; ok = true; - ll_for_each (file, struct mtf_file, ll, &mtf.files) + for (i = 0; i < mtf.n_files; i++) { - size_t i; + struct mtf_file *file = mtf.files[i]; + size_t j; - file->by = xnmalloc (mtf.by_cnt, sizeof *file->by); - for (i = 0; i < mtf.by_cnt; i++) + for (j = 0; j < subcase_get_n_values (&mtf.by); j++) { - const char *var_name = var_get_name (by[i]); - file->by[i] = dict_lookup_var (file->dict, var_name); - if (file->by[i] == NULL) + const char *name = var_get_name (by_vars[j]); + struct variable *var = dict_lookup_var (file->dict, name); + if (var != NULL) + subcase_add_var (&file->by, var, + subcase_get_direction (&mtf.by, j)); + else { if (file->handle != NULL) msg (SE, _("File %s lacks BY variable %s."), - fh_get_name (file->handle), var_name); + fh_get_name (file->handle), name); else - msg (SE, _("Active file lacks BY variable %s."), - var_name); + msg (SE, _("Active file lacks BY variable %s."), name); ok = 
false; } } + assert (!ok || subcase_conformable (&file->by, + &mtf.files[0]->by)); } - free (by); + free (by_vars); if (!ok) goto error; } - else if (lex_match_id (lexer, "FIRST")) + else if (command != UPDATE && lex_match_id (lexer, "FIRST")) { if (first_name[0] != '\0') { @@ -292,7 +330,7 @@ cmd_match_files (struct lexer *lexer, struct dataset *ds) strcpy (first_name, lex_tokid (lexer)); lex_get (lexer); } - else if (lex_match_id (lexer, "LAST")) + else if (command != UPDATE && lex_match_id (lexer, "LAST")) { if (last_name[0] != '\0') { @@ -333,31 +371,38 @@ cmd_match_files (struct lexer *lexer, struct dataset *ds) } } - if (mtf.by_cnt == 0) + if (!saw_by) { - if (first_table != ll_null (&mtf.files)) + if (command == UPDATE) + { + msg (SE, _("The BY subcommand is required.")); + goto error; + } + if (n_tables) { msg (SE, _("BY is required when TABLE is specified.")); goto error; } - if (saw_in) + if (saw_sort) { - msg (SE, _("BY is required when IN is specified.")); + msg (SE, _("BY is required when SORT is specified.")); goto error; } } /* Set up mapping from each file's variables to master variables. */ - ll_for_each (file, struct mtf_file, ll, &mtf.files) + for (i = 0; i < mtf.n_files; i++) { + struct mtf_file *file = mtf.files[i]; size_t in_var_cnt = dict_get_var_cnt (file->dict); + size_t j; file->vars = xnmalloc (in_var_cnt, sizeof *file->vars); file->var_cnt = 0; - for (i = 0; i < in_var_cnt; i++) + for (j = 0; j < in_var_cnt; j++) { - struct variable *in_var = dict_get_var (file->dict, i); + struct variable *in_var = dict_get_var (file->dict, j); struct variable *out_var = dict_lookup_var (mtf.dict, var_get_name (in_var)); @@ -371,9 +416,12 @@ cmd_match_files (struct lexer *lexer, struct dataset *ds) } /* Add IN, FIRST, and LAST variables to master dictionary. 
*/ - ll_for_each (file, struct mtf_file, ll, &mtf.files) - if (!create_flag_var ("IN", file->in_name, mtf.dict, &file->in_var)) - goto error; + for (i = 0; i < mtf.n_files; i++) + { + struct mtf_file *file = mtf.files[i]; + if (!create_flag_var ("IN", file->in_name, mtf.dict, &file->in_var)) + goto error; + } if (!create_flag_var ("FIRST", first_name, mtf.dict, &mtf.first) || !create_flag_var ("LAST", last_name, mtf.dict, &mtf.last)) goto error; @@ -383,8 +431,11 @@ cmd_match_files (struct lexer *lexer, struct dataset *ds) mtf.output = autopaging_writer_create (dict_get_next_value_idx (mtf.dict)); taint = taint_clone (casewriter_get_taint (mtf.output)); - ll_for_each (file, struct mtf_file, ll, &mtf.files) + mtf.matcher = case_matcher_create (); + taint_propagate (case_matcher_get_taint (mtf.matcher), taint); + for (i = 0; i < mtf.n_files; i++) { + struct mtf_file *file = mtf.files[i]; if (file->reader == NULL) { if (active_file == NULL) @@ -395,20 +446,27 @@ cmd_match_files (struct lexer *lexer, struct dataset *ds) else file->reader = casereader_clone (active_file); } - taint_propagate (casereader_get_taint (file->reader), taint); + if (!file->is_sorted) + file->reader = sort_execute (file->reader, &file->by); + if (file->type == MTF_FILE) + case_matcher_add_input (mtf.matcher, file->reader, &file->by); + else + { + casereader_read (file->reader, &file->c); + taint_propagate (casereader_get_taint (file->reader), taint); + } } - ll_for_each_safe (file, next, struct mtf_file, ll, &mtf.files) - mtf_read_record (&mtf, file); - while (mtf.nonempty_files > 0) - mtf_process_case (&mtf); - if ((mtf.first != NULL || mtf.last != NULL) && mtf.prev_BY != NULL) - { - if (mtf.last != NULL) - case_data_rw (&mtf.buffered_case, mtf.last)->f = 1.0; - casewriter_write (mtf.output, &mtf.buffered_case); - case_nullify (&mtf.buffered_case); - } + if (command == ADD_FILES) + process_add_files (&mtf); + else if (command == MATCH_FILES) + process_match_files (&mtf); + else if (command == 
UPDATE) + process_update (&mtf); + else + NOT_REACHED (); + + case_matcher_destroy (mtf.matcher); mtf_close_all_files (&mtf); if (active_file != NULL) proc_commit (ds); @@ -429,19 +487,18 @@ cmd_match_files (struct lexer *lexer, struct dataset *ds) return CMD_CASCADING_FAILURE; } -/* If VAR_NAME is a nonnull pointer to a non-empty string, - attempts to create a variable named VAR_NAME, with format - F1.0, in DICT, and stores a pointer to the variable in *VAR. - Returns true if successful, false if the variable name is a - duplicate (in which case a message saying that the variable - specified on the given SUBCOMMAND is a duplicate is emitted). - Also returns true, without doing anything, if VAR_NAME is null - or empty. */ +/* If VAR_NAME is a non-empty string, attempts to create a + variable named VAR_NAME, with format F1.0, in DICT, and stores + a pointer to the variable in *VAR. Returns true if + successful, false if the variable name is a duplicate (in + which case a message saying that the variable specified on the + given SUBCOMMAND is a duplicate is emitted). Also returns + true, without doing anything, if VAR_NAME is null or empty. 
*/ static bool create_flag_var (const char *subcommand, const char *var_name, struct dictionary *dict, struct variable **var) { - if (var_name != NULL && var_name[0] != '\0') + if (var_name[0] != '\0') { struct fmt_spec format = fmt_for_output (FMT_F, 1, 0); *var = dict_create_var (dict, var_name, 0); @@ -476,154 +533,228 @@ var_type_description (struct variable *v) static bool mtf_close_all_files (struct mtf_proc *mtf) { - struct mtf_file *file; bool ok = true; + size_t i; - ll_for_each_preremove (file, struct mtf_file, ll, &mtf->files) + for (i = 0; i < mtf->n_files; i++) { + struct mtf_file *file = mtf->files[i]; fh_unref (file->handle); - casereader_destroy (file->reader); - free (file->by); dict_destroy (file->dict); - free (file->in_name); - case_destroy (&file->input); + subcase_destroy (&file->by); + if (file->type == MTF_TABLE) + casereader_destroy (file->reader); free (file->vars); free (file); } + free (mtf->files); + mtf->files = NULL; + mtf->n_files = 0; return ok; } -/* Frees all the data for the MATCH FILES procedure. */ +/* Frees all the data for the procedure. */ static void mtf_free (struct mtf_proc *mtf) { mtf_close_all_files (mtf); dict_destroy (mtf->dict); + subcase_destroy (&mtf->by); casewriter_destroy (mtf->output); case_destroy (&mtf->buffered_case); - case_destroy (&mtf->prev_BY_case); + free (mtf->prev_BY); } -/* Reads the next record into FILE, if possible, and update MTF's - nonempty_files count if not. 
*/ static bool -mtf_read_record (struct mtf_proc *mtf, struct mtf_file *file) +scan_table (struct mtf_file *file, union value *by) { - case_destroy (&file->input); - if (!casereader_read (file->reader, &file->input)) + while (!case_is_null (&file->c)) { - mtf->nonempty_files--; - return false; + int cmp = subcase_compare_3way_xc (&file->by, by, &file->c); + if (cmp > 0) + casereader_read (file->reader, &file->c); + else + return cmp == 0; } - else - return true; + return false; } -/* Compare the BY variables for files A and B; return -1 if A < - B, 0 if A == B, 1 if A > B. (If there are no BY variables, - then all records are equal.) */ -static inline int -mtf_compare_BY_values (struct mtf_proc *mtf, - struct mtf_file *a, struct mtf_file *b) +static void +create_output_case (const struct mtf_proc *mtf, struct ccase *c) { - return case_compare_2dict (&a->input, &b->input, a->by, b->by, mtf->by_cnt); + size_t i; + + case_create (c, dict_get_next_value_idx (mtf->dict)); + for (i = 0; i < dict_get_var_cnt (mtf->dict); i++) + { + struct variable *v = dict_get_var (mtf->dict, i); + value_set_missing (case_data_rw (c, v), var_get_width (v)); + } + for (i = 0; i < mtf->n_files; i++) + { + struct mtf_file *file = mtf->files[i]; + if (file->in_var != NULL) + case_data_rw (c, file->in_var)->f = false; + } } -/* Processes input files and write one case to the output file. 
*/ static void -mtf_process_case (struct mtf_proc *mtf) +apply_case (const struct mtf_file *file, struct ccase *file_case, + struct ccase *c) +{ + /* XXX subcases */ + size_t j; + for (j = 0; j < file->var_cnt; j++) + { + const struct mtf_variable *mv = &file->vars[j]; + const union value *in = case_data (file_case, mv->in_var); + union value *out = case_data_rw (c, mv->out_var); + value_copy (out, in, var_get_width (mv->in_var)); + } + case_destroy (file_case); + if (file->in_var != NULL) + case_data_rw (c, file->in_var)->f = true; +} + +static size_t +find_first_match (struct ccase *cases) { - struct ccase c; - struct mtf_file *min; - struct mtf_file *file; - int min_sequence; size_t i; + for (i = 0; ; i++) + if (!case_is_null (&cases[i])) + return i; +} - /* Find the set of one or more FILEs whose BY values are - minimal, as well as the set of zero or more TABLEs whose BY - values equal those of the minimum FILEs. - - After each iteration of the loop, this invariant holds: the - FILEs with minimum BY values thus far have "sequence" - members equal to min_sequence, and "min" points to one of - the mtf_files whose case has those minimum BY values, and - similarly for TABLEs. */ - min_sequence = 0; - min = NULL; - ll_for_each (file, struct mtf_file, ll, &mtf->files) - if (case_is_null (&file->input)) - file->sequence = -1; - else if (file->type == MTF_FILE) - { - int cmp = min != NULL ? mtf_compare_BY_values (mtf, min, file) : 1; - if (cmp <= 0) - file->sequence = cmp < 0 ? 
-1 : min_sequence; - else - { - file->sequence = ++min_sequence; - min = file; - } - } - else - { - int cmp; - assert (min != NULL); - do +static void +process_update (struct mtf_proc *mtf) +{ + struct ccase *cases; + union value *by; + + while (case_matcher_read (mtf->matcher, &cases, &by)) + { + struct mtf_file *min; + struct ccase c; + size_t min_idx; + size_t i; + + create_output_case (mtf, &c); + min_idx = find_first_match (cases); + min = mtf->files[min_idx]; + apply_case (min, &cases[min_idx], &c); + case_matcher_advance (mtf->matcher, min_idx, &cases[min_idx]); + for (i = MAX (1, min_idx); i < mtf->n_files; i++) + while (!case_is_null (&cases[i])) { - cmp = mtf_compare_BY_values (mtf, min, file); + apply_case (mtf->files[i], &cases[i], &c); + case_matcher_advance (mtf->matcher, i, &cases[i]); } - while (cmp > 0 && mtf_read_record (mtf, file)); - file->sequence = cmp == 0 ? min_sequence : -1; - } + casewriter_write (mtf->output, &c); + + if (min_idx == 0) + { + size_t n_dups; - /* Form the output case from the input cases. */ - case_create (&c, dict_get_next_value_idx (mtf->dict)); - for (i = 0; i < dict_get_var_cnt (mtf->dict); i++) - { - struct variable *v = dict_get_var (mtf->dict, i); - value_set_missing (case_data_rw (&c, v), var_get_width (v)); + for (n_dups = 0; !case_is_null (&cases[0]); n_dups++) + { + create_output_case (mtf, &c); + apply_case (mtf->files[0], &cases[0], &c); + case_matcher_advance (mtf->matcher, 0, &cases[0]); + casewriter_write (mtf->output, &c); + } +#if 0 + if (n_dups > 0) + msg (SW, _("Encountered %zu duplicates."), n_dups); +#endif + /* XXX warn. That's the whole point; otherwise we + don't need the 'if' statement at all. */ + } } - ll_for_each_reverse (file, struct mtf_file, ll, &mtf->files) +} + +/* Executes MATCH FILES for key-based matches. 
*/ +static void +process_match_files (struct mtf_proc *mtf) +{ + union value *by; + struct ccase *cases; + + while (case_matcher_read (mtf->matcher, &cases, &by)) { - bool include_file = file->sequence == min_sequence; - if (include_file) - for (i = 0; i < file->var_cnt; i++) - { - const struct mtf_variable *mv = &file->vars[i]; - const union value *in = case_data (&file->input, mv->in_var); - union value *out = case_data_rw (&c, mv->out_var); - value_copy (out, in, var_get_width (mv->in_var)); - } - if (file->in_var != NULL) - case_data_rw (&c, file->in_var)->f = include_file; + struct ccase c; + size_t i; + + create_output_case (mtf, &c); + for (i = mtf->n_files; i-- > 0; ) + { + struct mtf_file *file = mtf->files[i]; + struct ccase *file_case; + bool include; + if (file->type == MTF_FILE) + { + file_case = &cases[file->idx]; + include = !case_is_null (file_case); + if (include) + case_matcher_advance (mtf->matcher, file->idx, NULL); + } + else + { + file_case = &file->c; + include = scan_table (file, by); + if (include) + case_clone (file_case, file_case); + } + if (include) + apply_case (file, file_case, &c); + } + output_case (mtf, &c, by); } + output_buffered_case (mtf); +} - /* Write the output case. */ - if (mtf->first == NULL && mtf->last == NULL) +/* Executes ADD FILES, writing one output case for each input case. */ +static void +process_add_files (struct mtf_proc *mtf) +{ + union value *by; + struct ccase *cases; + + while (case_matcher_read (mtf->matcher, &cases, &by)) { - /* With no FIRST or LAST variables, it's trivial. 
*/ - casewriter_write (mtf->output, &c); + struct ccase c; + size_t i; + + for (i = 0; i < mtf->n_files; i++) + { + struct mtf_file *file = mtf->files[i]; + while (!case_is_null (&cases[i])) + { + create_output_case (mtf, &c); + apply_case (file, &cases[i], &c); + case_matcher_advance (mtf->matcher, i, &cases[i]); + output_case (mtf, &c, by); + } + } } + output_buffered_case (mtf); +} + +static void +output_case (struct mtf_proc *mtf, struct ccase *c, union value *by) +{ + if (mtf->first == NULL && mtf->last == NULL) + casewriter_write (mtf->output, c); else { /* It's harder with LAST, because we can't know whether this case is the last in a group until we've prepared the *next* case also. Thus, we buffer the previous - output case until the next one is ready. - - We also have to save a copy of one of the previous input - cases, so that we can compare the BY variables. We - can't compare the BY variables between the current - output case and the saved one because the BY variables - might not be in the output (the user is allowed to drop - them). */ + output case until the next one is ready. 
*/ bool new_BY; if (mtf->prev_BY != NULL) { - new_BY = case_compare_2dict (&min->input, &mtf->prev_BY_case, - min->by, mtf->prev_BY, - mtf->by_cnt); + new_BY = !subcase_equal_xx (&mtf->by, mtf->prev_BY, by); if (mtf->last != NULL) case_data_rw (&mtf->buffered_case, mtf->last)->f = new_BY; casewriter_write (mtf->output, &mtf->buffered_case); @@ -631,28 +762,30 @@ mtf_process_case (struct mtf_proc *mtf) else new_BY = true; - case_move (&mtf->buffered_case, &c); + case_move (&mtf->buffered_case, c); if (mtf->first != NULL) case_data_rw (&mtf->buffered_case, mtf->first)->f = new_BY; if (new_BY) { - mtf->prev_BY = min->by; - case_destroy (&mtf->prev_BY_case); - case_clone (&mtf->prev_BY_case, &min->input); + size_t n = subcase_get_n_values (&mtf->by) * sizeof (union value); + if (mtf->prev_BY == NULL) + mtf->prev_BY = xmalloc (n); + memcpy (mtf->prev_BY, by, n); } } +} - /* Read another record from each input file FILE with minimum - values. */ - ll_for_each (file, struct mtf_file, ll, &mtf->files) - if (file->type == MTF_FILE) - { - if (file->sequence == min_sequence) - mtf_read_record (mtf, file); - } - else - break; +static void +output_buffered_case (struct mtf_proc *mtf) +{ + if (mtf->prev_BY != NULL) + { + if (mtf->last != NULL) + case_data_rw (&mtf->buffered_case, mtf->last)->f = 1.0; + casewriter_write (mtf->output, &mtf->buffered_case); + case_nullify (&mtf->buffered_case); + } } /* Merge the dictionary for file F into master dictionary M. 
*/ @@ -704,14 +837,10 @@ mtf_merge_dictionary (struct dictionary *const m, struct mtf_file *f) return false; } - if (var_get_width (dv) == var_get_width (mv)) - { - if (var_has_value_labels (dv) && !var_has_value_labels (mv)) - var_set_value_labels (mv, var_get_value_labels (dv)); - if (var_has_missing_values (dv) && !var_has_missing_values (mv)) - var_set_missing_values (mv, var_get_missing_values (dv)); - } - + if (var_has_value_labels (dv) && !var_has_value_labels (mv)) + var_set_value_labels (mv, var_get_value_labels (dv)); + if (var_has_missing_values (dv) && !var_has_missing_values (mv)) + var_set_missing_values (mv, var_get_missing_values (dv)); if (var_get_label (dv) && !var_get_label (mv)) var_set_label (mv, var_get_label (dv)); } diff --git a/tests/automake.mk b/tests/automake.mk index 082889cd..00f3a126 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -9,6 +9,7 @@ TESTS_ENVIRONMENT += CHARSETALIASDIR='$(abs_top_builddir)/gl' TESTS_ENVIRONMENT += LC_ALL=C dist_TESTS = \ + tests/command/add-files.sh \ tests/command/aggregate.sh \ tests/command/attributes.sh \ tests/command/autorecod.sh \ @@ -73,6 +74,7 @@ dist_TESTS = \ tests/command/t-test-pairs.sh \ tests/command/trimmed-mean.sh \ tests/command/tabs.sh \ + tests/command/update.sh \ tests/command/use.sh \ tests/command/variable-display.sh \ tests/command/vector.sh \ diff --git a/tests/command/add-files.sh b/tests/command/add-files.sh new file mode 100755 index 00000000..48232f2e --- /dev/null +++ b/tests/command/add-files.sh @@ -0,0 +1,200 @@ +#!/bin/sh + +# This program tests the ADD FILES procedure + +TEMPDIR=/tmp/pspp-tst-$$ +TESTFILE=$TEMPDIR/add-files.pspp + + +# ensure that top_builddir is absolute +if [ -z "$top_builddir" ] ; then top_builddir=. ; fi +if [ -z "$top_srcdir" ] ; then top_srcdir=. 
; fi +top_builddir=`cd $top_builddir; pwd` +PSPP=$top_builddir/src/ui/terminal/pspp + +# ensure that top_srcdir is absolute +top_srcdir=`cd $top_srcdir; pwd` + + +STAT_CONFIG_PATH=$top_srcdir/config +export STAT_CONFIG_PATH + +cleanup() +{ + if [ x"$PSPP_TEST_NO_CLEANUP" != x ] ; then + echo "NOT cleaning $TEMPDIR" + return ; + fi + cd / + rm -rf $TEMPDIR +} + + +fail() +{ + echo $activity + echo FAILED + cleanup; + exit 1; +} + + +no_result() +{ + echo $activity + echo NO RESULT; + cleanup; + exit 2; +} + +pass() +{ + cleanup; + exit 0; +} + +mkdir -p $TEMPDIR + +cd $TEMPDIR + +activity="data create" +cat > a.data < b.data < concatenate.out < interleave.out < $name.pspp + if [ $? -ne 0 ] ; then no_result ; fi + + activity="run $name.pspp" + $SUPERVISOR $PSPP --testing-mode $name.pspp + if [ $? -ne 0 ] ; then no_result ; fi + + activity="check $name output" + perl -pi -e 's/^\s*$//g' pspp.list + perl -pi -e 's/^\s*$//g' $type.out + diff -u -b -w pspp.list $type.out + if [ $? -ne 0 ] ; then fail ; fi + done +done + +pass; diff --git a/tests/command/match-files.sh b/tests/command/match-files.sh index 965539c8..9726b065 100755 --- a/tests/command/match-files.sh +++ b/tests/command/match-files.sh @@ -59,19 +59,19 @@ cd $TEMPDIR activity="data create" cat > a.data < b.data < a.data < b.data < update.out < $name.pspp + if [ $? -ne 0 ] ; then no_result ; fi + + activity="run $name.pspp" + rm -f errors + $SUPERVISOR $PSPP --testing-mode --error-file=errors $name.pspp + if [ $? -ne 0 ] ; then no_result ; fi + + activity="check $name output" + perl -pi -e 's/^\s*$//g' pspp.list + diff -c -b -w pspp.list update.out + if [ $? -ne 0 ] ; then fail ; fi + diff -c -b -w - errors <