From 73549ddf86afb11f488f9f95265f29aab2ef712e Mon Sep 17 00:00:00 2001 From: John Darrington Date: Mon, 26 Jul 2010 14:34:58 +0200 Subject: [PATCH] AGGREGATE: Add MODE=ADDVARIABLES subcommand. Instead of replacing the current dataset with the aggregated data, this subcommand appends new variables to the data with the aggregated values. --- doc/transformation.texi | 22 +++++- src/language/stats/aggregate.c | 117 +++++++++++++++++++++--------- tests/language/stats/aggregate.at | 38 ++++++++++ 3 files changed, 139 insertions(+), 38 deletions(-) diff --git a/doc/transformation.texi b/doc/transformation.texi index fe08b9cd8f..27181b3cde 100644 --- a/doc/transformation.texi +++ b/doc/transformation.texi @@ -23,7 +23,7 @@ as a rule. @display AGGREGATE - OUTFILE=@{*,'file-name',file_handle@} + OUTFILE=@{*,'file-name',file_handle@} [MODE=@{REPLACE, ADDVARIABLES@}] /PRESORTED /DOCUMENT /MISSING=COLUMNWISE @@ -40,13 +40,29 @@ The OUTFILE subcommand is required and must appear first. Specify a system file, portable file, or scratch file by file name or file handle (@pxref{File Handles}). The aggregated cases are written to this file. If @samp{*} is -specified, then the aggregated cases replace the active file. Use of -OUTFILE to write a portable file or scratch file is a PSPP extension. +specified, then the aggregated cases replace the active file. +Use of OUTFILE to write a portable file or scratch file is a PSPP extension. + +If OUTFILE=@samp{*} is given, then the subcommand MODE may also be +specified. +The mode subcommand has two possible values: ADDVARIABLES or REPLACE. +In REPLACE mode, the entire active file is replaced by a new file +which contains just the break variables and the destination variables. +In this mode, the new file will contain as many cases as there are +unique combinations of the break variables. +In ADDVARIABLES mode, the destination variables will be appended to +the existing active file. 
+Cases which have identical combinations of values in their break +variables will receive identical values for the destination variables. +The number of cases in the active file will remain unchanged. +Note that if ADDVARIABLES is specified, then the data @emph{must} be +sorted on the break variables. By default, the active file will be sorted based on the break variables before aggregation takes place. If the active file is already sorted or otherwise grouped in terms of the break variables, specify PRESORTED to save time. +PRESORTED is assumed if MODE=ADDVARIABLES is used. Specify DOCUMENT to copy the documents from the active file into the aggregate file (@pxref{DOCUMENT}). Otherwise, the aggregate file will diff --git a/src/language/stats/aggregate.c b/src/language/stats/aggregate.c index 42f330d5d2..0d918fb908 100644 --- a/src/language/stats/aggregate.c +++ b/src/language/stats/aggregate.c @@ -146,17 +146,18 @@ struct agr_proc struct subcase sort; /* Sort criteria (break variables). */ const struct variable **break_vars; /* Break variables. */ size_t break_var_cnt; /* Number of break variables. */ - struct ccase *break_case; /* Last values of break variables. */ enum missing_treatment missing; /* How to treat missing values. */ struct agr_var *agr_vars; /* First aggregate variable. */ struct dictionary *dict; /* Aggregate dictionary. */ const struct dictionary *src_dict; /* Dict of the source */ int case_cnt; /* Counts aggregated cases. 
*/ + + bool add_variables; /* True iff the aggregated variables should + be appended to the existing dictionary */ }; -static void initialize_aggregate_info (struct agr_proc *, - const struct ccase *); +static void initialize_aggregate_info (struct agr_proc *); static void accumulate_aggregate_info (struct agr_proc *, const struct ccase *); @@ -164,8 +165,9 @@ static void accumulate_aggregate_info (struct agr_proc *, static bool parse_aggregate_functions (struct lexer *, const struct dictionary *, struct agr_proc *); static void agr_destroy (struct agr_proc *); -static void dump_aggregate_info (struct agr_proc *agr, - struct casewriter *output); +static void dump_aggregate_info (const struct agr_proc *agr, + struct casewriter *output, + const struct ccase *break_case); /* Parsing. */ @@ -187,13 +189,8 @@ cmd_aggregate (struct lexer *lexer, struct dataset *ds) memset(&agr, 0 , sizeof (agr)); agr.missing = ITEMWISE; - agr.break_case = NULL; - - agr.dict = dict_create (); agr.src_dict = dict; subcase_init_empty (&agr.sort); - dict_set_label (agr.dict, dict_get_label (dict)); - dict_set_documents (agr.dict, dict_get_documents (dict)); /* OUTFILE subcommand must be first. */ lex_match (lexer, '/'); @@ -207,6 +204,32 @@ cmd_aggregate (struct lexer *lexer, struct dataset *ds) goto error; } + if (out_file == NULL && lex_match_id (lexer, "MODE")) + { + lex_match (lexer, '='); + if (lex_match_id (lexer, "ADDVARIABLES")) + { + agr.add_variables = true; + + /* presorted is assumed in ADDVARIABLES mode */ + presorted = true; + } + else if (lex_match_id (lexer, "REPLACE")) + { + agr.add_variables = false; + } + else + goto error; + } + + if ( agr.add_variables ) + agr.dict = dict_clone (dict); + else + agr.dict = dict_create (); + + dict_set_label (agr.dict, dict_get_label (dict)); + dict_set_documents (agr.dict, dict_get_documents (dict)); + /* Read most of the subcommands. 
*/ for (;;) { @@ -236,8 +259,9 @@ cmd_aggregate (struct lexer *lexer, struct dataset *ds) goto error; agr.break_var_cnt = subcase_get_n_fields (&agr.sort); - for (i = 0; i < agr.break_var_cnt; i++) - dict_clone_var_assert (agr.dict, agr.break_vars[i]); + if (! agr.add_variables) + for (i = 0; i < agr.break_var_cnt; i++) + dict_clone_var_assert (agr.dict, agr.break_vars[i]); /* BREAK must follow the options. */ break; @@ -295,18 +319,40 @@ cmd_aggregate (struct lexer *lexer, struct dataset *ds) casegrouper_get_next_group (grouper, &group); casereader_destroy (group)) { + struct casereader *placeholder = NULL; struct ccase *c = casereader_peek (group, 0); + if (c == NULL) { casereader_destroy (group); continue; } - initialize_aggregate_info (&agr, c); - case_unref (c); - for (; (c = casereader_read (group)) != NULL; case_unref (c)) - accumulate_aggregate_info (&agr, c); - dump_aggregate_info (&agr, output); + initialize_aggregate_info (&agr); + + if ( agr.add_variables ) + placeholder = casereader_clone (group); + + { + struct ccase *cg; + for (; (cg = casereader_read (group)) != NULL; case_unref (cg)) + accumulate_aggregate_info (&agr, cg); + } + + + if (agr.add_variables) + { + struct ccase *cg; + for (; (cg = casereader_read (placeholder)) != NULL; case_unref (cg)) + dump_aggregate_info (&agr, output, cg); + + casereader_destroy (placeholder); + } + else + { + dump_aggregate_info (&agr, output, c); + case_unref (c); + } } if (!casegrouper_destroy (grouper)) goto error; @@ -694,7 +740,6 @@ agr_destroy (struct agr_proc *agr) subcase_destroy (&agr->sort); free (agr->break_vars); - case_unref (agr->break_case); for (iter = agr->agr_vars; iter; iter = next) { next = iter->next; @@ -920,23 +965,28 @@ accumulate_aggregate_info (struct agr_proc *agr, const struct ccase *input) /* Writes an aggregated record to OUTPUT. 
*/ static void -dump_aggregate_info (struct agr_proc *agr, struct casewriter *output) +dump_aggregate_info (const struct agr_proc *agr, struct casewriter *output, const struct ccase *break_case) { struct ccase *c = case_create (dict_get_proto (agr->dict)); - { - int value_idx = 0; - int i; + if ( agr->add_variables) + { + case_copy (c, 0, break_case, 0, dict_get_var_cnt (agr->src_dict)); + } + else + { + int value_idx = 0; + int i; - for (i = 0; i < agr->break_var_cnt; i++) - { - const struct variable *v = agr->break_vars[i]; - value_copy (case_data_rw_idx (c, value_idx), - case_data (agr->break_case, v), - var_get_width (v)); - value_idx++; - } - } + for (i = 0; i < agr->break_var_cnt; i++) + { + const struct variable *v = agr->break_vars[i]; + value_copy (case_data_rw_idx (c, value_idx), + case_data (break_case, v), + var_get_width (v)); + value_idx++; + } + } { struct agr_var *i; @@ -1070,13 +1120,10 @@ dump_aggregate_info (struct agr_proc *agr, struct casewriter *output) /* Resets the state for all the aggregate functions. */ static void -initialize_aggregate_info (struct agr_proc *agr, const struct ccase *input) +initialize_aggregate_info (struct agr_proc *agr) { struct agr_var *iter; - case_unref (agr->break_case); - agr->break_case = case_ref (input); - for (iter = agr->agr_vars; iter; iter = iter->next) { iter->saw_missing = false; diff --git a/tests/language/stats/aggregate.at b/tests/language/stats/aggregate.at index 5e00965e2d..860c939d7f 100644 --- a/tests/language/stats/aggregate.at +++ b/tests/language/stats/aggregate.at @@ -210,3 +210,41 @@ AGGREGATE /BREAK=x . ]) AT_CHECK([pspp -O format=csv aggregate.sps], [1], [ignore], []) AT_CLEANUP + + +AT_SETUP([AGGREGATE mode=addvariables]) +AT_DATA([addvariables.sps], + [data list notable list /x * cn * y *. +begin data. +1 1 2 +3 2 3 +3 3 4 +5 4 6 +7 5 8 +7 6 9 +7 7 10 +9 8 11 +end data. + +aggregate outfile=* mode=addvariables + /break = x + /sum = sum(y) + /mean = mean (y). + +list. 
+]) + +AT_CHECK([pspp -O format=csv addvariables.sps], [0], + [Table: Data List +x,cn,y,sum,mean +1.00,1.00,2.00,2.00,2.00 +3.00,2.00,3.00,7.00,3.50 +3.00,3.00,4.00,7.00,3.50 +5.00,4.00,6.00,6.00,6.00 +7.00,5.00,8.00,27.00,9.00 +7.00,6.00,9.00,27.00,9.00 +7.00,7.00,10.00,27.00,9.00 +9.00,8.00,11.00,11.00,11.00 +]) + +AT_CLEANUP -- 2.30.2