X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Faggregate.c;h=4edbe22f773444b70ba341bb04c0b6f03e1ca578;hb=9666b94fcb0fe0230371d85a951ed18d1a34d538;hp=d7339d8652c60115a926acfe10ba85ee09dff369;hpb=b401615e6db40bf74394839b96600afe3a868a95;p=pspp diff --git a/src/language/stats/aggregate.c b/src/language/stats/aggregate.c index d7339d8652..4edbe22f77 100644 --- a/src/language/stats/aggregate.c +++ b/src/language/stats/aggregate.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2006, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2006, 2008, 2009, 2010, 2011, 2012, 2014 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -49,6 +49,7 @@ #include "math/sort.h" #include "math/statistic.h" +#include "gl/c-strcase.h" #include "gl/minmax.h" #include "gl/xalloc.h" @@ -56,63 +57,48 @@ #define _(msgid) gettext (msgid) #define N_(msgid) msgid -/* Argument for AGGREGATE function. */ -union agr_argument +/* Argument for AGGREGATE function. + + Only one of the members is used, so this could be a union, but it's simpler + to just have both. */ +struct agr_argument { double f; /* Numeric. */ - char *c; /* Short or long string. */ + struct substring s; /* String. */ }; /* Specifies how to make an aggregate variable. */ struct agr_var { - struct agr_var *next; /* Next in list. */ - /* Collected during parsing. */ const struct variable *src; /* Source variable. */ struct variable *dest; /* Target variable. */ - int function; /* Function. */ + enum agr_function function; /* Function. */ enum mv_class exclude; /* Classes of missing values to exclude. */ - union agr_argument arg[2]; /* Arguments. */ + struct agr_argument arg[2]; /* Arguments. */ /* Accumulated during AGGREGATE execution. */ - double dbl[3]; - int int1, int2; + double dbl; + double W; /* Total non-missing weight. */ + int int1; char *string; bool saw_missing; struct moments1 *moments; - double cc; struct variable *subject; struct variable *weight; struct casewriter *writer; }; - /* Attributes of aggregation functions. */ const struct agr_func agr_func_tab[] = { - {"SUM", N_("Sum of values"), AGR_SV_YES, 0, -1, {FMT_F, 8, 2}}, - {"MEAN", N_("Mean average"), AGR_SV_YES, 0, -1, {FMT_F, 8, 2}}, - {"MEDIAN", N_("Median average"), AGR_SV_YES, 0, -1, {FMT_F, 8, 2}}, - {"SD", N_("Standard deviation"), AGR_SV_YES, 0, -1, {FMT_F, 8, 2}}, - {"MAX", N_("Maximum value"), AGR_SV_YES, 0, VAL_STRING, {-1, -1, -1}}, - {"MIN", N_("Minimum value"), AGR_SV_YES, 0, VAL_STRING, {-1, -1, -1}}, - {"PGT", N_("Percentage greater than"), AGR_SV_YES, 1, VAL_NUMERIC, {FMT_F, 5, 1}}, - {"PLT", N_("Percentage less than"), AGR_SV_YES, 1, VAL_NUMERIC, {FMT_F, 5, 1}}, - {"PIN", N_("Percentage included in range"), AGR_SV_YES, 2, VAL_NUMERIC, {FMT_F, 5, 1}}, - {"POUT", N_("Percentage excluded from range"), AGR_SV_YES, 2, VAL_NUMERIC, {FMT_F, 5, 1}}, - {"FGT", N_("Fraction greater than"), AGR_SV_YES, 1, VAL_NUMERIC, {FMT_F, 5, 3}}, - {"FLT", N_("Fraction less than"), AGR_SV_YES, 1, VAL_NUMERIC, {FMT_F, 5, 3}}, - {"FIN", N_("Fraction included in range"), AGR_SV_YES, 2, VAL_NUMERIC, {FMT_F, 5, 3}}, - {"FOUT", N_("Fraction excluded from range"), AGR_SV_YES, 2, VAL_NUMERIC, {FMT_F, 5, 3}}, - {"N", N_("Number of cases"), AGR_SV_NO, 0, VAL_NUMERIC, {FMT_F, 7, 0}}, - {"NU", N_("Number of cases (unweighted)"), AGR_SV_OPT, 0, VAL_NUMERIC, {FMT_F, 7, 0}}, - {"NMISS", N_("Number of missing values"), AGR_SV_YES, 0, VAL_NUMERIC, {FMT_F, 7, 0}}, - {"NUMISS", N_("Number of missing values (unweighted)"), AGR_SV_YES, 0, VAL_NUMERIC, {FMT_F, 7, 0}}, - {"FIRST", N_("First non-missing value"), AGR_SV_YES, 0, VAL_STRING, {-1, -1, -1}}, - {"LAST", N_("Last non-missing value"), AGR_SV_YES, 0, VAL_STRING, {-1, -1, -1}}, - {NULL, NULL, AGR_SV_NO, 0, -1, {-1, -1, -1}}, +#define AGRF(ENUM, NAME, DESCRIPTION, SRC_VARS, N_ARGS, ALPHA_TYPE, W, D) \ + [ENUM] = { NAME, DESCRIPTION, SRC_VARS, N_ARGS, ALPHA_TYPE, \ + { .type = (W) > 0 ? FMT_F : -1, .w = W, .d = D } }, +AGGREGATE_FUNCTIONS +#undef AGRF + {NULL, NULL, AGR_SV_NO, 0, -1, {-1, -1, -1}}, }; /* Missing value types. */ @@ -128,13 +114,14 @@ struct agr_proc /* Break variables. */ struct subcase sort; /* Sort criteria (break variables). */ const struct variable **break_vars; /* Break variables. */ - size_t break_var_cnt; /* Number of break variables. */ + size_t break_n_vars; /* Number of break variables. */ enum missing_treatment missing; /* How to treat missing values. */ - struct agr_var *agr_vars; /* First aggregate variable. */ + struct agr_var *agr_vars; /* Aggregate variables. */ + size_t n_agr_vars; struct dictionary *dict; /* Aggregate dictionary. */ const struct dictionary *src_dict; /* Dict of the source */ - int case_cnt; /* Counts aggregated cases. */ + int n_cases; /* Counts aggregated cases. */ bool add_variables; /* True iff the aggregated variables should be appended to the existing dictionary */ @@ -159,107 +146,121 @@ int cmd_aggregate (struct lexer *lexer, struct dataset *ds) { struct dictionary *dict = dataset_dict (ds); - struct agr_proc agr; + struct agr_proc agr = { + .missing = ITEMWISE, + .src_dict = dict, + }; struct file_handle *out_file = NULL; - struct casereader *input = NULL, *group; - struct casegrouper *grouper; + struct casereader *input = NULL; struct casewriter *output = NULL; bool copy_documents = false; bool presorted = false; - bool saw_direction; - bool ok; - - memset(&agr, 0 , sizeof (agr)); - agr.missing = ITEMWISE; - agr.src_dict = dict; - subcase_init_empty (&agr.sort); + int addvariables_ofs = 0; /* OUTFILE subcommand must be first. */ - lex_match (lexer, T_SLASH); - if (!lex_force_match_id (lexer, "OUTFILE")) - goto error; - lex_match (lexer, T_EQUALS); - if (!lex_match (lexer, T_ASTERISK)) + if (lex_match_phrase (lexer, "/OUTFILE") || lex_match_id (lexer, "OUTFILE")) { - out_file = fh_parse (lexer, FH_REF_FILE | FH_REF_SCRATCH); - if (out_file == NULL) - goto error; + lex_match (lexer, T_EQUALS); + if (!lex_match (lexer, T_ASTERISK)) + { + out_file = fh_parse (lexer, FH_REF_FILE, dataset_session (ds)); + if (out_file == NULL) + goto error; + } + + if (!out_file && lex_match_id (lexer, "MODE")) + { + lex_match (lexer, T_EQUALS); + if (lex_match_id (lexer, "ADDVARIABLES")) + { + addvariables_ofs = lex_ofs (lexer) - 1; + agr.add_variables = true; + presorted = true; + } + else if (lex_match_id (lexer, "REPLACE")) + agr.add_variables = false; + else + { + lex_error_expecting (lexer, "ADDVARIABLES", "REPLACE"); + goto error; + } + } + } + else + { + agr.add_variables = true; + presorted = true; } - if (out_file == NULL && lex_match_id (lexer, "MODE")) + if (lex_match_phrase (lexer, "/MISSING")) { lex_match (lexer, T_EQUALS); - if (lex_match_id (lexer, "ADDVARIABLES")) - { - agr.add_variables = true; - - /* presorted is assumed in ADDVARIABLES mode */ - presorted = true; - } - else if (lex_match_id (lexer, "REPLACE")) - { - agr.add_variables = false; - } - else - goto error; + if (!lex_match_id (lexer, "COLUMNWISE")) + { + lex_error_expecting (lexer, "COLUMNWISE"); + goto error; + } + agr.missing = COLUMNWISE; } - if ( agr.add_variables ) + int presorted_ofs = 0; + for (;;) + if (lex_match_phrase (lexer, "/DOCUMENT")) + copy_documents = true; + else if (lex_match_phrase (lexer, "/PRESORTED")) + { + presorted = true; + presorted_ofs = lex_ofs (lexer) - 1; + } + else + break; + + if (agr.add_variables) agr.dict = dict_clone (dict); else - agr.dict = dict_create (); + agr.dict = dict_create (dict_get_encoding (dict)); dict_set_label (agr.dict, dict_get_label (dict)); dict_set_documents (agr.dict, dict_get_documents (dict)); - /* Read most of the subcommands. */ - for (;;) + if (lex_match_phrase (lexer, "/BREAK")) { - lex_match (lexer, T_SLASH); - - if (lex_match_id (lexer, "MISSING")) - { - lex_match (lexer, T_EQUALS); - if (!lex_match_id (lexer, "COLUMNWISE")) - { - lex_error (lexer, _("expecting %s"), "COLUMNWISE"); - goto error; - } - agr.missing = COLUMNWISE; - } - else if (lex_match_id (lexer, "DOCUMENT")) - copy_documents = true; - else if (lex_match_id (lexer, "PRESORTED")) - presorted = true; - else if (lex_force_match_id (lexer, "BREAK")) - { - int i; - - lex_match (lexer, T_EQUALS); - if (!parse_sort_criteria (lexer, dict, &agr.sort, &agr.break_vars, - &saw_direction)) - goto error; - agr.break_var_cnt = subcase_get_n_fields (&agr.sort); - - if (! agr.add_variables) - for (i = 0; i < agr.break_var_cnt; i++) - dict_clone_var_assert (agr.dict, agr.break_vars[i]); - - /* BREAK must follow the options. */ - break; - } - else + lex_match (lexer, T_EQUALS); + bool saw_direction; + int break_start = lex_ofs (lexer); + if (!parse_sort_criteria (lexer, dict, &agr.sort, &agr.break_vars, + &saw_direction)) goto error; + int break_end = lex_ofs (lexer) - 1; + agr.break_n_vars = subcase_get_n_fields (&agr.sort); + if (! agr.add_variables) + for (size_t i = 0; i < agr.break_n_vars; i++) + dict_clone_var_assert (agr.dict, agr.break_vars[i]); + + if (presorted && saw_direction) + { + lex_ofs_msg (lexer, SW, break_start, break_end, + _("When the input data is presorted, specifying " + "sorting directions with (A) or (D) has no effect. " + "Output data will be sorted the same way as the " + "input data.")); + if (presorted_ofs) + lex_ofs_msg (lexer, SN, presorted_ofs, presorted_ofs, + _("The PRESORTED subcommand state that the " + "input data is presorted.")); + else if (addvariables_ofs) + lex_ofs_msg (lexer, SN, addvariables_ofs, addvariables_ofs, + _("ADDVARIABLES implies that the input data " + "is presorted.")); + else + msg (SN, _("The input data must be presorted because the " + "OUTFILE subcommand is not specified.")); + } } - if (presorted && saw_direction) - msg (SW, _("When PRESORTED is specified, specifying sorting directions " - "with (A) or (D) has no effect. Output data will be sorted " - "the same way as the input data.")); /* Read in the aggregate functions. */ - lex_match (lexer, T_SLASH); if (!parse_aggregate_functions (lexer, dict, &agr)) goto error; @@ -268,10 +269,10 @@ cmd_aggregate (struct lexer *lexer, struct dataset *ds) dict_clear_documents (agr.dict); /* Cancel SPLIT FILE. */ - dict_set_split_vars (agr.dict, NULL, 0); + dict_clear_split_vars (agr.dict); /* Initialize. */ - agr.case_cnt = 0; + agr.n_cases = 0; if (out_file == NULL) { @@ -295,8 +296,10 @@ cmd_aggregate (struct lexer *lexer, struct dataset *ds) subcase_clear (&agr.sort); } + struct casegrouper *grouper; + struct casereader *group; for (grouper = casegrouper_create_vars (input, agr.break_vars, - agr.break_var_cnt); + agr.break_n_vars); casegrouper_get_next_group (grouper, &group); casereader_destroy (group)) { @@ -311,7 +314,7 @@ cmd_aggregate (struct lexer *lexer, struct dataset *ds) initialize_aggregate_info (&agr); - if ( agr.add_variables ) + if (agr.add_variables) placeholder = casereader_clone (group); { @@ -332,18 +335,16 @@ cmd_aggregate (struct lexer *lexer, struct dataset *ds) else { dump_aggregate_info (&agr, output, c); - case_unref (c); } + case_unref (c); } if (!casegrouper_destroy (grouper)) goto error; - if (!proc_commit (ds)) - { - input = NULL; - goto error; - } + bool ok = proc_commit (ds); input = NULL; + if (!ok) + goto error; if (out_file == NULL) { @@ -376,155 +377,161 @@ error: return CMD_CASCADING_FAILURE; } +static bool +parse_agr_func_name (struct lexer *lexer, int *func_index, + enum mv_class *exclude) +{ + if (lex_token (lexer) != T_ID) + { + lex_error (lexer, _("Syntax error expecting aggregation function.")); + return false; + } + + struct substring name = lex_tokss (lexer); + *exclude = ss_chomp_byte (&name, '.') ? MV_SYSTEM : MV_ANY; + + for (const struct agr_func *f = agr_func_tab; f->name; f++) + if (ss_equals_case (ss_cstr (f->name), name)) + { + *func_index = f - agr_func_tab; + lex_get (lexer); + return true; + } + lex_error (lexer, _("Unknown aggregation function %s."), lex_tokcstr (lexer)); + return false; +} + /* Parse all the aggregate functions. */ static bool parse_aggregate_functions (struct lexer *lexer, const struct dictionary *dict, struct agr_proc *agr) { - struct agr_var *tail; /* Tail of linked list starting at agr->vars. */ + if (!lex_force_match (lexer, T_SLASH)) + return false; + + size_t starting_n_vars = dict_get_n_vars (dict); + size_t allocated_agr_vars = 0; /* Parse everything. */ - tail = NULL; for (;;) { - char **dest; - char **dest_label; - size_t n_dest; - struct string function_name; - - enum mv_class exclude; - const struct agr_func *function; - int func_index; - - union agr_argument arg[2]; - - const struct variable **src; - size_t n_src; + char **dest = NULL; + char **dest_label = NULL; + size_t n_vars = 0; - size_t i; + struct agr_argument arg[2] = { { .f = 0 }, { .f = 0 } }; - dest = NULL; - dest_label = NULL; - n_dest = 0; - src = NULL; - function = NULL; - n_src = 0; - arg[0].c = NULL; - arg[1].c = NULL; - ds_init_empty (&function_name); + const struct variable **src = NULL; /* Parse the list of target variables. */ + int dst_start_ofs = lex_ofs (lexer); while (!lex_match (lexer, T_EQUALS)) { - size_t n_dest_prev = n_dest; + size_t n_vars_prev = n_vars; - if (!parse_DATA_LIST_vars (lexer, dict, &dest, &n_dest, + if (!parse_DATA_LIST_vars (lexer, dict, &dest, &n_vars, (PV_APPEND | PV_SINGLE | PV_NO_SCRATCH | PV_NO_DUPLICATE))) goto error; /* Assign empty labels. */ - { - int j; - - dest_label = xnrealloc (dest_label, n_dest, sizeof *dest_label); - for (j = n_dest_prev; j < n_dest; j++) - dest_label[j] = NULL; - } - - + dest_label = xnrealloc (dest_label, n_vars, sizeof *dest_label); + for (size_t j = n_vars_prev; j < n_vars; j++) + dest_label[j] = NULL; if (lex_is_string (lexer)) { - dest_label[n_dest - 1] = xstrdup (lex_tokcstr (lexer)); + dest_label[n_vars - 1] = xstrdup (lex_tokcstr (lexer)); lex_get (lexer); } } + int dst_end_ofs = lex_ofs (lexer) - 2; /* Get the name of the aggregation function. */ - if (lex_token (lexer) != T_ID) - { - lex_error (lexer, _("expecting aggregation function")); - goto error; - } - - ds_assign_substring (&function_name, lex_tokss (lexer)); - exclude = ds_chomp_byte (&function_name, '.') ? MV_SYSTEM : MV_ANY; - - for (function = agr_func_tab; function->name; function++) - if (!strcasecmp (function->name, ds_cstr (&function_name))) - break; - if (NULL == function->name) - { - msg (SE, _("Unknown aggregation function %s."), - ds_cstr (&function_name)); - goto error; - } - ds_destroy (&function_name); - func_index = function - agr_func_tab; - lex_get (lexer); + int func_index; + enum mv_class exclude; + if (!parse_agr_func_name (lexer, &func_index, &exclude)) + goto error; + const struct agr_func *function = &agr_func_tab[func_index]; /* Check for leading lparen. */ if (!lex_match (lexer, T_LPAREN)) { if (function->src_vars == AGR_SV_YES) { - lex_force_match (lexer, T_LPAREN); + bool ok UNUSED = lex_force_match (lexer, T_LPAREN); goto error; } } else { /* Parse list of source variables. */ - { - int pv_opts = PV_NO_SCRATCH; - - if (func_index == SUM || func_index == MEAN || func_index == SD) - pv_opts |= PV_NUMERIC; - else if (function->n_args) - pv_opts |= PV_SAME_TYPE; - - if (!parse_variables_const (lexer, dict, &src, &n_src, pv_opts)) - goto error; - } + int pv_opts = PV_NO_SCRATCH; + if (func_index == AGRF_SUM || func_index == AGRF_MEAN + || func_index == AGRF_MEDIAN || func_index == AGRF_SD) + pv_opts |= PV_NUMERIC; + else if (function->n_args) + pv_opts |= PV_SAME_TYPE; + + int src_start_ofs = lex_ofs (lexer); + size_t n_src; + if (!parse_variables_const (lexer, dict, &src, &n_src, pv_opts)) + goto error; + int src_end_ofs = lex_ofs (lexer) - 1; /* Parse function arguments, for those functions that require arguments. */ + int args_start_ofs = 0; if (function->n_args != 0) - for (i = 0; i < function->n_args; i++) + for (size_t i = 0; i < function->n_args; i++) { - int type; - lex_match (lexer, T_COMMA); + + enum val_type type; if (lex_is_string (lexer)) - { - arg[i].c = recode_string (dict_get_encoding (agr->dict), - "UTF-8", lex_tokcstr (lexer), - -1); - type = VAL_STRING; - } - else if (lex_is_number (lexer)) - { - arg[i].f = lex_tokval (lexer); - type = VAL_NUMERIC; - } + type = VAL_STRING; + else if (lex_is_number (lexer)) + type = VAL_NUMERIC; else { - msg (SE, _("Missing argument %zu to %s."), - i + 1, function->name); + lex_error (lexer, _("Missing argument %zu to %s."), + i + 1, function->name); goto error; } - lex_get (lexer); - if (type != var_get_type (src[0])) { msg (SE, _("Arguments to %s must be of same type as " "source variables."), function->name); + if (type == VAL_NUMERIC) + { + lex_next_msg (lexer, SN, 0, 0, + _("The argument is numeric.")); + lex_ofs_msg (lexer, SN, src_start_ofs, src_end_ofs, + _("The variables have string type.")); + } + else + { + lex_next_msg (lexer, SN, 0, 0, + _("The argument is a string.")); + lex_ofs_msg (lexer, SN, src_start_ofs, src_end_ofs, + _("The variables are numeric.")); + } goto error; } + + if (i == 0) + args_start_ofs = lex_ofs (lexer); + if (type == VAL_NUMERIC) + arg[i].f = lex_tokval (lexer); + else + arg[i].s = recode_substring_pool (dict_get_encoding (agr->dict), + "UTF-8", lex_tokss (lexer), + NULL); + lex_get (lexer); } + int args_end_ofs = lex_ofs (lexer) - 1; /* Trailing rparen. */ if (!lex_force_match (lexer, T_RPAREN)) @@ -536,137 +543,110 @@ parse_aggregate_functions (struct lexer *lexer, const struct dictionary *dict, message, i.e. `AGGREGATE x=SUM(y t).' will get this error message when a proper message would be more like `unknown variable t'. */ - if (n_src != n_dest) + if (n_src != n_vars) { msg (SE, _("Number of source variables (%zu) does not match " "number of target variables (%zu)."), - n_src, n_dest); + n_src, n_vars); + lex_ofs_msg (lexer, SN, src_start_ofs, src_end_ofs, + _("These are the source variables.")); + lex_ofs_msg (lexer, SN, dst_start_ofs, dst_end_ofs, + _("These are the target variables.")); goto error; } - if ((func_index == PIN || func_index == POUT - || func_index == FIN || func_index == FOUT) + if ((func_index == AGRF_PIN || func_index == AGRF_POUT + || func_index == AGRF_FIN || func_index == AGRF_FOUT) && (var_is_numeric (src[0]) ? arg[0].f > arg[1].f - : str_compare_rpad (arg[0].c, arg[1].c) > 0)) + : buf_compare_rpad (arg[0].s.string, arg[0].s.length, + arg[1].s.string, arg[1].s.length) > 0)) { - union agr_argument t = arg[0]; + struct agr_argument tmp = arg[0]; arg[0] = arg[1]; - arg[1] = t; + arg[1] = tmp; - msg (SW, _("The value arguments passed to the %s function " - "are out-of-order. They will be treated as if " - "they had been specified in the correct order."), - function->name); + lex_ofs_msg (lexer, SW, args_start_ofs, args_end_ofs, + _("The value arguments passed to the %s function " + "are out of order. They will be treated as if " + "they had been specified in the correct order."), + function->name); } } - /* Finally add these to the linked list of aggregation - variables. */ - for (i = 0; i < n_dest; i++) + /* Finally add these to the aggregation variables. */ + for (size_t i = 0; i < n_vars; i++) { - struct agr_var *v = xzalloc (sizeof *v); - - /* Add variable to chain. */ - if (agr->agr_vars != NULL) - tail->next = v; - else - agr->agr_vars = v; - tail = v; - tail->next = NULL; - v->moments = NULL; - - /* Create the target variable in the aggregate - dictionary. */ - { - struct variable *destvar; - - v->function = func_index; - - if (src) - { - v->src = src[i]; - - if (var_is_alpha (src[i])) - { - v->function |= FSTRING; - v->string = xmalloc (var_get_width (src[i])); - } - - if (function->alpha_type == VAL_STRING) - destvar = dict_clone_var_as (agr->dict, v->src, dest[i]); - else - { - assert (var_is_numeric (v->src) - || function->alpha_type == VAL_NUMERIC); - destvar = dict_create_var (agr->dict, dest[i], 0); - if (destvar != NULL) - { - struct fmt_spec f; - if ((func_index == N || func_index == NMISS) - && dict_get_weight (dict) != NULL) - f = fmt_for_output (FMT_F, 8, 2); - else - f = function->format; - var_set_both_formats (destvar, &f); - } - } - } else { - struct fmt_spec f; - v->src = NULL; - destvar = dict_create_var (agr->dict, dest[i], 0); - if (destvar != NULL) - { - if ((func_index == N || func_index == NMISS) - && dict_get_weight (dict) != NULL) - f = fmt_for_output (FMT_F, 8, 2); - else - f = function->format; - var_set_both_formats (destvar, &f); - } - } - - if (!destvar) - { - msg (SE, _("Variable name %s is not unique within the " - "aggregate file dictionary, which contains " - "the aggregate variables and the break " - "variables."), - dest[i]); - goto error; - } - - free (dest[i]); - if (dest_label[i]) - var_set_label (destvar, dest_label[i], - dict_get_encoding (agr->dict), true); + const struct variable *existing_var = dict_lookup_var (agr->dict, + dest[i]); + if (existing_var) + { + if (var_get_dict_index (existing_var) >= starting_n_vars) + lex_ofs_error (lexer, dst_start_ofs, dst_end_ofs, + _("Duplicate target variable name %s."), + dest[i]); + else if (agr->add_variables) + lex_ofs_error (lexer, dst_start_ofs, dst_end_ofs, + _("Variable name %s duplicates the name of a " + "variable in the active file dictionary."), + dest[i]); + else + lex_ofs_error (lexer, dst_start_ofs, dst_end_ofs, + _("Variable name %s duplicates the name of a " + "break variable."), dest[i]); + goto error; + } - v->dest = destvar; - } + /* Add variable. */ + if (agr->n_agr_vars >= allocated_agr_vars) + agr->agr_vars = x2nrealloc (agr->agr_vars, &allocated_agr_vars, + sizeof *agr->agr_vars); + struct agr_var *v = &agr->agr_vars[agr->n_agr_vars++]; + *v = (struct agr_var) { + .exclude = exclude, + .moments = NULL, + .function = func_index, + .src = src ? src[i] : NULL, + }; + + /* Create the target variable in the aggregate dictionary. */ + if (v->src && var_is_alpha (v->src)) + v->string = xmalloc (var_get_width (v->src)); + + if (v->src && function->alpha_type == VAL_STRING) + v->dest = dict_clone_var_as_assert (agr->dict, v->src, dest[i]); + else + { + v->dest = dict_create_var_assert (agr->dict, dest[i], 0); - v->exclude = exclude; + struct fmt_spec f; + if ((func_index == AGRF_N || func_index == AGRF_NMISS) + && dict_get_weight (dict) != NULL) + f = fmt_for_output (FMT_F, 8, 2); + else + f = function->format; + var_set_both_formats (v->dest, &f); + } + if (dest_label[i]) + var_set_label (v->dest, dest_label[i]); if (v->src != NULL) - { - int j; - - if (var_is_numeric (v->src)) - for (j = 0; j < function->n_args; j++) - v->arg[j].f = arg[j].f; - else - for (j = 0; j < function->n_args; j++) - v->arg[j].c = xstrdup (arg[j].c); - } + for (size_t j = 0; j < function->n_args; j++) + v->arg[j] = (struct agr_argument) { + .f = arg[j].f, + .s = arg[j].s.string ? ss_clone (arg[j].s) : ss_empty (), + }; } - if (src != NULL && var_is_alpha (src[0])) - for (i = 0; i < function->n_args; i++) - { - free (arg[i].c); - arg[i].c = NULL; - } + ss_dealloc (&arg[0].s); + ss_dealloc (&arg[1].s); free (src); + for (size_t i = 0; i < n_vars; i++) + { + free (dest[i]); + free (dest_label[i]); + } free (dest); free (dest_label); @@ -675,28 +655,21 @@ parse_aggregate_functions (struct lexer *lexer, const struct dictionary *dict, if (lex_token (lexer) == T_ENDCMD) return true; - lex_error (lexer, "expecting end of command"); + lex_error (lexer, "Syntax error expecting end of command."); return false; } continue; error: - ds_destroy (&function_name); - for (i = 0; i < n_dest; i++) + for (size_t i = 0; i < n_vars; i++) { free (dest[i]); free (dest_label[i]); } free (dest); free (dest_label); - free (arg[0].c); - free (arg[1].c); - if (src && n_src && var_is_alpha (src[0])) - for (i = 0; i < function->n_args; i++) - { - free (arg[i].c); - arg[i].c = NULL; - } + ss_dealloc (&arg[0].s); + ss_dealloc (&arg[1].s); free (src); return false; @@ -707,34 +680,25 @@ parse_aggregate_functions (struct lexer *lexer, const struct dictionary *dict, static void agr_destroy (struct agr_proc *agr) { - struct agr_var *iter, *next; - - subcase_destroy (&agr->sort); + subcase_uninit (&agr->sort); free (agr->break_vars); - for (iter = agr->agr_vars; iter; iter = next) + for (size_t i = 0; i < agr->n_agr_vars; i++) { - next = iter->next; + struct agr_var *av = &agr->agr_vars[i]; - if (iter->function & FSTRING) - { - size_t n_args; - size_t i; + ss_dealloc (&av->arg[0].s); + ss_dealloc (&av->arg[1].s); + free (av->string); - n_args = agr_func_tab[iter->function & FUNC].n_args; - for (i = 0; i < n_args; i++) - free (iter->arg[i].c); - free (iter->string); - } - else if (iter->function == SD) - moments1_destroy (iter->moments); - - dict_destroy_internal_var (iter->subject); - dict_destroy_internal_var (iter->weight); + if (av->function == AGRF_SD) + moments1_destroy (av->moments); - free (iter); + dict_destroy_internal_var (av->subject); + dict_destroy_internal_var (av->weight); } + free (agr->agr_vars); if (agr->dict != NULL) - dict_destroy (agr->dict); + dict_unref (agr->dict); } /* Execution. */ @@ -743,195 +707,212 @@ agr_destroy (struct agr_proc *agr) static void accumulate_aggregate_info (struct agr_proc *agr, const struct ccase *input) { - struct agr_var *iter; - double weight; bool bad_warn = true; + double weight = dict_get_case_weight (agr->src_dict, input, &bad_warn); + for (size_t i = 0; i < agr->n_agr_vars; i++) + { + struct agr_var *av = &agr->agr_vars[i]; + if (av->src) + { + bool is_string = var_is_alpha (av->src); + const union value *v = case_data (input, av->src); + int src_width = var_get_width (av->src); + const struct substring vs = (src_width > 0 + ? value_ss (v, src_width) + : ss_empty ()); + + if (var_is_value_missing (av->src, v) & av->exclude) + { + switch (av->function) + { + case AGRF_NMISS: + av->dbl += weight; + break; + + case AGRF_NUMISS: + av->int1++; + break; + + case AGRF_SUM: + case AGRF_MEAN: + case AGRF_MEDIAN: + case AGRF_SD: + case AGRF_MAX: + case AGRF_MIN: + case AGRF_PGT: + case AGRF_PLT: + case AGRF_PIN: + case AGRF_POUT: + case AGRF_FGT: + case AGRF_FLT: + case AGRF_FIN: + case AGRF_FOUT: + case AGRF_CGT: + case AGRF_CLT: + case AGRF_CIN: + case AGRF_COUT: + case AGRF_N: + case AGRF_NU: + case AGRF_FIRST: + case AGRF_LAST: + break; + } + av->saw_missing = true; + continue; + } - weight = dict_get_case_weight (agr->src_dict, input, &bad_warn); - - for (iter = agr->agr_vars; iter; iter = iter->next) - if (iter->src) - { - const union value *v = case_data (input, iter->src); - int src_width = var_get_width (iter->src); - - if (var_is_value_missing (iter->src, v, iter->exclude)) - { - switch (iter->function) - { - case NMISS: - case NMISS | FSTRING: - iter->dbl[0] += weight; - break; - case NUMISS: - case NUMISS | FSTRING: - iter->int1++; - break; - } - iter->saw_missing = true; - continue; - } - - /* This is horrible. There are too many possibilities. */ - switch (iter->function) - { - case SUM: - iter->dbl[0] += v->f * weight; - iter->int1 = 1; - break; - case MEAN: - iter->dbl[0] += v->f * weight; - iter->dbl[1] += weight; - break; - case MEDIAN: - { - double wv ; - struct ccase *cout; - - cout = case_create (casewriter_get_proto (iter->writer)); - - case_data_rw (cout, iter->subject)->f - = case_data (input, iter->src)->f; - - wv = dict_get_case_weight (agr->src_dict, input, NULL); - - case_data_rw (cout, iter->weight)->f = wv; - - iter->cc += wv; - - casewriter_write (iter->writer, cout); - } - break; - case SD: - moments1_add (iter->moments, v->f, weight); - break; - case MAX: - iter->dbl[0] = MAX (iter->dbl[0], v->f); - iter->int1 = 1; - break; - case MAX | FSTRING: - /* Need to do some kind of Unicode collation thingy here */ - if (memcmp (iter->string, value_str (v, src_width), src_width) < 0) - memcpy (iter->string, value_str (v, src_width), src_width); - iter->int1 = 1; - break; - case MIN: - iter->dbl[0] = MIN (iter->dbl[0], v->f); - iter->int1 = 1; - break; - case MIN | FSTRING: - if (memcmp (iter->string, value_str (v, src_width), src_width) > 0) - memcpy (iter->string, value_str (v, src_width), src_width); - iter->int1 = 1; - break; - case FGT: - case PGT: - if (v->f > iter->arg[0].f) - iter->dbl[0] += weight; - iter->dbl[1] += weight; - break; - case FGT | FSTRING: - case PGT | FSTRING: - if (memcmp (iter->arg[0].c, - value_str (v, src_width), src_width) < 0) - iter->dbl[0] += weight; - iter->dbl[1] += weight; - break; - case FLT: - case PLT: - if (v->f < iter->arg[0].f) - iter->dbl[0] += weight; - iter->dbl[1] += weight; - break; - case FLT | FSTRING: - case PLT | FSTRING: - if (memcmp (iter->arg[0].c, - value_str (v, src_width), src_width) > 0) - iter->dbl[0] += weight; - iter->dbl[1] += weight; - break; - case FIN: - case PIN: - if (iter->arg[0].f <= v->f && v->f <= iter->arg[1].f) - iter->dbl[0] += weight; - iter->dbl[1] += weight; - break; - case FIN | FSTRING: - case PIN | FSTRING: - if (memcmp (iter->arg[0].c, - value_str (v, src_width), src_width) <= 0 - && memcmp (iter->arg[1].c, - value_str (v, src_width), src_width) >= 0) - iter->dbl[0] += weight; - iter->dbl[1] += weight; - break; - case FOUT: - case POUT: - if (iter->arg[0].f > v->f || v->f > iter->arg[1].f) - iter->dbl[0] += weight; - iter->dbl[1] += weight; - break; - case FOUT | FSTRING: - case POUT | FSTRING: - if (memcmp (iter->arg[0].c, - value_str (v, src_width), src_width) > 0 - || memcmp (iter->arg[1].c, - value_str (v, src_width), src_width) < 0) - iter->dbl[0] += weight; - iter->dbl[1] += weight; - break; - case N: - case N | FSTRING: - iter->dbl[0] += weight; - break; - case NU: - case NU | FSTRING: - iter->int1++; - break; - case FIRST: - if (iter->int1 == 0) - { - iter->dbl[0] = v->f; - iter->int1 = 1; - } - break; - case FIRST | FSTRING: - if (iter->int1 == 0) - { - memcpy (iter->string, value_str (v, src_width), src_width); - iter->int1 = 1; - } - break; - case LAST: - iter->dbl[0] = v->f; - iter->int1 = 1; - break; - case LAST | FSTRING: - memcpy (iter->string, value_str (v, src_width), src_width); - iter->int1 = 1; - break; - case NMISS: - case NMISS | FSTRING: - case NUMISS: - case NUMISS | FSTRING: - /* Our value is not missing or it would have been - caught earlier. Nothing to do. */ - break; - default: - NOT_REACHED (); - } - } else { - switch (iter->function) - { - case N: - iter->dbl[0] += weight; - break; - case NU: - iter->int1++; - break; - default: - NOT_REACHED (); - } + /* This is horrible. There are too many possibilities. */ + av->W += weight; + switch (av->function) + { + case AGRF_SUM: + av->dbl += v->f * weight; + av->int1 = 1; + break; + + case AGRF_MEAN: + av->dbl += v->f * weight; + break; + + case AGRF_MEDIAN: + { + struct ccase *cout = case_create (casewriter_get_proto (av->writer)); + *case_num_rw (cout, av->subject) = case_num (input, av->src); + *case_num_rw (cout, av->weight) = weight; + casewriter_write (av->writer, cout); + } + break; + + case AGRF_SD: + moments1_add (av->moments, v->f, weight); + break; + + case AGRF_MAX: + if (!is_string) + av->dbl = MAX (av->dbl, v->f); + else if (memcmp (av->string, v->s, src_width) < 0) + memcpy (av->string, v->s, src_width); + av->int1 = 1; + break; + + case AGRF_MIN: + if (!is_string) + av->dbl = MIN (av->dbl, v->f); + else if (memcmp (av->string, v->s, src_width) > 0) + memcpy (av->string, v->s, src_width); + av->dbl = MIN (av->dbl, v->f); + av->int1 = 1; + break; + + case AGRF_FGT: + case AGRF_PGT: + case AGRF_CGT: + if (is_string + ? ss_compare_rpad (av->arg[0].s, vs) < 0 + : v->f > av->arg[0].f) + av->dbl += weight; + break; + + case AGRF_FLT: + case AGRF_PLT: + case AGRF_CLT: + if (is_string + ? ss_compare_rpad (av->arg[0].s, vs) > 0 + : v->f < av->arg[0].f) + av->dbl += weight; + break; + + case AGRF_FIN: + case AGRF_PIN: + case AGRF_CIN: + if (is_string + ? (ss_compare_rpad (av->arg[0].s, vs) <= 0 + && ss_compare_rpad (av->arg[1].s, vs) >= 0) + : av->arg[0].f <= v->f && v->f <= av->arg[1].f) + av->dbl += weight; + break; + + case AGRF_FOUT: + case AGRF_POUT: + case AGRF_COUT: + if (is_string + ? (ss_compare_rpad (av->arg[0].s, vs) > 0 + || ss_compare_rpad (av->arg[1].s, vs) < 0) + : av->arg[0].f > v->f || v->f > av->arg[1].f) + av->dbl += weight; + break; + + case AGRF_N: + av->dbl += weight; + break; + + case AGRF_NU: + av->int1++; + break; + + case AGRF_FIRST: + if (av->int1 == 0) + { + if (is_string) + memcpy (av->string, v->s, src_width); + else + av->dbl = v->f; + av->int1 = 1; + } + break; + + case AGRF_LAST: + if (is_string) + memcpy (av->string, v->s, src_width); + else + av->dbl = v->f; + av->int1 = 1; + break; + + case AGRF_NMISS: + case AGRF_NUMISS: + /* Our value is not missing or it would have been + caught earlier. Nothing to do. */ + break; + } + } + else + { + av->W += weight; + switch (av->function) + { + case AGRF_N: + break; + + case AGRF_NU: + av->int1++; + break; + + case AGRF_SUM: + case AGRF_MEAN: + case AGRF_MEDIAN: + case AGRF_SD: + case AGRF_MAX: + case AGRF_MIN: + case AGRF_PGT: + case AGRF_PLT: + case AGRF_PIN: + case AGRF_POUT: + case AGRF_FGT: + case AGRF_FLT: + case AGRF_FIN: + case AGRF_FOUT: + case AGRF_CGT: + case AGRF_CLT: + case AGRF_CIN: + case AGRF_COUT: + case AGRF_NMISS: + case AGRF_NUMISS: + case AGRF_FIRST: + case AGRF_LAST: + NOT_REACHED (); + } + } } } @@ -941,16 +922,15 @@ dump_aggregate_info (const struct agr_proc *agr, struct casewriter *output, cons { struct ccase *c = case_create (dict_get_proto (agr->dict)); - if ( agr->add_variables) + if (agr->add_variables) { - case_copy (c, 0, break_case, 0, dict_get_var_cnt (agr->src_dict)); + case_copy (c, 0, break_case, 0, dict_get_n_vars (agr->src_dict)); } else { int value_idx = 0; - int i; - for (i = 0; i < agr->break_var_cnt; i++) + for (size_t i = 0; i < agr->break_n_vars; i++) { const struct variable *v = agr->break_vars[i]; value_copy (case_data_rw_idx (c, value_idx), @@ -960,127 +940,114 @@ dump_aggregate_info (const struct agr_proc *agr, struct casewriter *output, cons } } - { - struct agr_var *i; + for (size_t i = 0; i < agr->n_agr_vars; i++) + { + struct agr_var *av = &agr->agr_vars[i]; + union value *v = case_data_rw (c, av->dest); + int width = var_get_width (av->dest); + + if (agr->missing == COLUMNWISE && av->saw_missing + && av->function != AGRF_N + && av->function != AGRF_NU + && av->function != AGRF_NMISS + && av->function != AGRF_NUMISS) + { + value_set_missing (v, width); + casewriter_destroy (av->writer); + continue; + } - for (i = agr->agr_vars; i; i = i->next) - { - union value *v = case_data_rw (c, i->dest); - int width = var_get_width (i->dest); + switch (av->function) + { + case AGRF_SUM: + v->f = av->int1 ? av->dbl : SYSMIS; + break; - if (agr->missing == COLUMNWISE && i->saw_missing - && (i->function & FUNC) != N && (i->function & FUNC) != NU - && (i->function & FUNC) != NMISS && (i->function & FUNC) != NUMISS) - { - value_set_missing (v, width); - casewriter_destroy (i->writer); - continue; - } + case AGRF_MEAN: + v->f = av->W != 0.0 ? av->dbl / av->W : SYSMIS; + break; - switch (i->function) - { - case SUM: - v->f = i->int1 ? i->dbl[0] : SYSMIS; - break; - case MEAN: - v->f = i->dbl[1] != 0.0 ? i->dbl[0] / i->dbl[1] : SYSMIS; - break; - case MEDIAN: - { - if ( i->writer) - { - struct percentile *median = percentile_create (0.5, i->cc); - struct order_stats *os = &median->parent; - struct casereader *sorted_reader = casewriter_make_reader (i->writer); - i->writer = NULL; - - order_stats_accumulate (&os, 1, - sorted_reader, - i->weight, - i->subject, - i->exclude); - i->dbl[0] = percentile_calculate (median, PC_HAVERAGE); - statistic_destroy (&median->parent.parent); - } - v->f = i->dbl[0]; - } - break; - case SD: - { - double variance; + case AGRF_MEDIAN: + { + if (av->writer) + { + struct percentile *median = percentile_create (0.5, av->W); + struct order_stats *os = &median->parent; + struct casereader *sorted_reader = casewriter_make_reader (av->writer); + av->writer = NULL; + + order_stats_accumulate (&os, 1, + sorted_reader, + av->weight, + av->subject, + av->exclude); + av->dbl = percentile_calculate (median, PC_HAVERAGE); + statistic_destroy (&median->parent.parent); + } + v->f = av->dbl; + } + break; - /* FIXME: we should use two passes. */ - moments1_calculate (i->moments, NULL, NULL, &variance, - NULL, NULL); - if (variance != SYSMIS) - v->f = sqrt (variance); + case AGRF_SD: + { + double variance; + + moments1_calculate (av->moments, NULL, NULL, &variance, + NULL, NULL); + v->f = variance != SYSMIS ? sqrt (variance) : SYSMIS; + } + break; + + case AGRF_MAX: + case AGRF_MIN: + case AGRF_FIRST: + case AGRF_LAST: + if (!width) + v->f = av->int1 ? av->dbl : SYSMIS; + else + { + if (av->int1) + memcpy (v->s, av->string, width); else - v->f = SYSMIS; + value_set_missing (v, width); } - break; - case MAX: - case MIN: - v->f = i->int1 ? i->dbl[0] : SYSMIS; - break; - case MAX | FSTRING: - case MIN | FSTRING: - if (i->int1) - memcpy (value_str_rw (v, width), i->string, width); - else - value_set_missing (v, width); - break; - case FGT: - case FGT | FSTRING: - case FLT: - case FLT | FSTRING: - case FIN: - case FIN | FSTRING: - case FOUT: - case FOUT | FSTRING: - v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] : SYSMIS; - break; - case PGT: - case PGT | FSTRING: - case PLT: - case PLT | FSTRING: - case PIN: - case PIN | FSTRING: - case POUT: - case POUT | FSTRING: - v->f = i->dbl[1] ? i->dbl[0] / i->dbl[1] * 100.0 : SYSMIS; - break; - case N: - case N | FSTRING: - v->f = i->dbl[0]; - break; - case NU: - case NU | FSTRING: - v->f = i->int1; - break; - case FIRST: - case LAST: - v->f = i->int1 ? i->dbl[0] : SYSMIS; - break; - case FIRST | FSTRING: - case LAST | FSTRING: - if (i->int1) - memcpy (value_str_rw (v, width), i->string, width); - else - value_set_missing (v, width); - break; - case NMISS: - case NMISS | FSTRING: - v->f = i->dbl[0]; - break; - case NUMISS: - case NUMISS | FSTRING: - v->f = i->int1; - break; - default: - NOT_REACHED (); - } - } - } + break; + + case AGRF_FGT: + case AGRF_FLT: + case AGRF_FIN: + case AGRF_FOUT: + v->f = av->W ? av->dbl / av->W : SYSMIS; + break; + + case AGRF_PGT: + case AGRF_PLT: + case AGRF_PIN: + case AGRF_POUT: + v->f = av->W ? av->dbl / av->W * 100.0 : SYSMIS; + break; + + case AGRF_CGT: + case AGRF_CLT: + case AGRF_CIN: + case AGRF_COUT: + v->f = av->dbl; + break; + + case AGRF_N: + v->f = av->W; + break; + + case AGRF_NU: + case AGRF_NUMISS: + v->f = av->int1; + break; + + case AGRF_NMISS: + v->f = av->dbl; + break; + } + } casewriter_write (output, c); } @@ -1089,57 +1056,77 @@ dump_aggregate_info (const struct agr_proc *agr, struct casewriter *output, cons static void initialize_aggregate_info (struct agr_proc *agr) { - struct agr_var *iter; - - for (iter = agr->agr_vars; iter; iter = iter->next) + for (size_t i = 0; i < agr->n_agr_vars; i++) { - iter->saw_missing = false; - iter->dbl[0] = iter->dbl[1] = iter->dbl[2] = 0.0; - iter->int1 = iter->int2 = 0; - switch (iter->function) + struct agr_var *av = &agr->agr_vars[i]; + av->saw_missing = false; + av->dbl = av->W = 0.0; + av->int1 = 0; + + int width = av->src ? var_get_width (av->src) : 0; + switch (av->function) { - case MIN: - iter->dbl[0] = DBL_MAX; - break; - case MIN | FSTRING: - memset (iter->string, 255, var_get_width (iter->src)); - break; - case MAX: - iter->dbl[0] = -DBL_MAX; + case AGRF_MIN: + if (!width) + av->dbl = DBL_MAX; + else + memset (av->string, 255, width); break; - case MAX | FSTRING: - memset (iter->string, 0, var_get_width (iter->src)); + + case AGRF_MAX: + if (!width) + av->dbl = -DBL_MAX; + else + memset (av->string, 0, width); break; - case MEDIAN: - { - struct caseproto *proto; - struct subcase ordering; - proto = caseproto_create (); + case AGRF_MEDIAN: + { + struct caseproto *proto = caseproto_create (); proto = caseproto_add_width (proto, 0); proto = caseproto_add_width (proto, 0); - if ( ! iter->subject) - iter->subject = dict_create_internal_var (0, 0); + if (! av->subject) + av->subject = dict_create_internal_var (0, 0); - if ( ! iter->weight) - iter->weight = dict_create_internal_var (1, 0); + if (! av->weight) + av->weight = dict_create_internal_var (1, 0); - subcase_init_var (&ordering, iter->subject, SC_ASCEND); - iter->writer = sort_create_writer (&ordering, proto); - subcase_destroy (&ordering); + struct subcase ordering; + subcase_init_var (&ordering, av->subject, SC_ASCEND); + av->writer = sort_create_writer (&ordering, proto); + subcase_uninit (&ordering); caseproto_unref (proto); - - iter->cc = 0; } break; - case SD: - if (iter->moments == NULL) - iter->moments = moments1_create (MOMENT_VARIANCE); + + case AGRF_SD: + if (av->moments == NULL) + av->moments = moments1_create (MOMENT_VARIANCE); else - moments1_clear (iter->moments); + moments1_clear (av->moments); break; - default: + + case AGRF_SUM: + case AGRF_MEAN: + case AGRF_PGT: + case AGRF_PLT: + case AGRF_PIN: + case AGRF_POUT: + case AGRF_FGT: + case AGRF_FLT: + case AGRF_FIN: + case AGRF_FOUT: + case AGRF_CGT: + case AGRF_CLT: + case AGRF_CIN: + case AGRF_COUT: + case AGRF_N: + case AGRF_NU: + case AGRF_NMISS: + case AGRF_NUMISS: + case AGRF_FIRST: + case AGRF_LAST: break; } }