From: John Darrington Date: Thu, 6 Jun 2019 15:29:46 +0000 (+0200) Subject: Revert "Restart of new means implementation" X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=cad3e2fc1551410ef04224be1c9877600515b816;p=pspp Revert "Restart of new means implementation" This reverts commit ad7ae8bb105c925b951e241fd6a3d1cb93d614a0. I inadvertently pushed this! --- diff --git a/src/language/stats/automake.mk b/src/language/stats/automake.mk index aa385529eb..33d9cd4c62 100644 --- a/src/language/stats/automake.mk +++ b/src/language/stats/automake.mk @@ -54,9 +54,6 @@ language_stats_sources = \ src/language/stats/mann-whitney.c \ src/language/stats/mann-whitney.h \ src/language/stats/means.c \ - src/language/stats/means.h \ - src/language/stats/means-calc.c \ - src/language/stats/means-parser.c \ src/language/stats/mcnemar.c \ src/language/stats/mcnemar.h \ src/language/stats/median.c \ diff --git a/src/language/stats/means-calc.c b/src/language/stats/means-calc.c deleted file mode 100644 index 5e49b1802c..0000000000 --- a/src/language/stats/means-calc.c +++ /dev/null @@ -1,438 +0,0 @@ -/* PSPP - a program for statistical analysis. - Copyright (C) 2011, 2012, 2013, 2019 Free Software Foundation, Inc. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . */ - -#include - -#include "data/case.h" -#include "data/format.h" -#include "data/variable.h" - -#include "libpspp/bt.h" -#include "libpspp/hmap.h" -#include "libpspp/misc.h" -#include "libpspp/pool.h" - -#include "math/moments.h" - -#include - -#include "means.h" - -#include "gettext.h" -#define _(msgid) gettext (msgid) -#define N_(msgid) (msgid) - -struct per_var_data -{ -}; - -struct per_var_data_simple -{ - struct per_var_data parent; - double acc; -}; - -struct per_var_data_moment -{ - struct per_var_data parent; - struct moments1 *mom; -}; - -static struct per_var_data * -default_create (struct pool *pool) -{ - struct per_var_data_moment *pvd = pool_alloc (pool, sizeof *pvd); - - pvd->mom = moments1_create (MOMENT_KURTOSIS); - - return (struct per_var_data *) pvd; -} - -static void -default_update (struct per_var_data *stat, double w, double x) -{ - struct per_var_data_moment *pvd = (struct per_var_data_moment *)stat; - - moments1_add (pvd->mom, x, w); -} - - - -/* HARMONIC MEAN: The reciprocal of the sum of the reciprocals: - 1 / ( 1/(x_0) + 1/(x_1) + ... + 1/(x_{n-1}) ) */ - -struct harmonic_mean -{ - struct per_var_data parent; - double rsum; - double n; -}; - -static struct per_var_data * -harmonic_create (struct pool *pool) -{ - struct harmonic_mean *hm = pool_alloc (pool, sizeof *hm); - - hm->rsum = 0; - hm->n = 0; - - return (struct per_var_data *) hm; -} - - -static void -harmonic_update (struct per_var_data *stat, double w, double x) -{ - struct harmonic_mean *hm = (struct harmonic_mean *)stat; - hm->rsum += w / x; - hm->n += w; -} - - -static double -harmonic_get (const struct per_var_data *pvd) -{ - const struct harmonic_mean *hm = (const struct harmonic_mean *) pvd; - - return hm->n / hm->rsum; -} - - - -/* GEOMETRIC MEAN: The nth root of the product of all n observations - pow ((x_0 * x_1 * ... x_{n - 1}), 1/n) */ -struct geometric_mean -{ - struct per_var_data parent; - double prod; - double n; -}; - -static struct per_var_data * -geometric_create (struct pool *pool) -{ - struct geometric_mean *gm = pool_alloc (pool, sizeof *gm); - - gm->prod = 1.0; - gm->n = 0; - - return (struct per_var_data *) gm; -} - -static void -geometric_update (struct per_var_data *pvd, double w, double x) -{ - struct geometric_mean *gm = (struct geometric_mean *)pvd; - gm->prod *= pow (x, w); - gm->n += w; -} - - -static double -geometric_get (const struct per_var_data *pvd) -{ - const struct geometric_mean *gm = (const struct geometric_mean *)pvd; - return pow (gm->prod, 1.0 / gm->n); -} - - - -static double -sum_get (const struct per_var_data *pvd) -{ - double n, mean; - - moments1_calculate (((struct per_var_data_moment *)pvd)->mom, &n, &mean, 0, 0, 0); - - return mean * n; -} - - -static double -n_get (const struct per_var_data *pvd) -{ - double n; - - moments1_calculate (((struct per_var_data_moment *)pvd)->mom, &n, 0, 0, 0, 0); - - return n; -} - -static double -arithmean_get (const struct per_var_data *pvd) -{ - double n, mean; - - moments1_calculate (((struct per_var_data_moment *)pvd)->mom, &n, &mean, 0, 0, 0); - - return mean; -} - -static double -variance_get (const struct per_var_data *pvd) -{ - double n, mean, variance; - - moments1_calculate (((struct per_var_data_moment *)pvd)->mom, &n, &mean, &variance, 0, 0); - - return variance; -} - - -static double -stddev_get (const struct per_var_data *pvd) -{ - return sqrt (variance_get (pvd)); -} - - - - -static double -skew_get (const struct per_var_data *pvd) -{ - double skew; - - moments1_calculate (((struct per_var_data_moment *)pvd)->mom, NULL, NULL, NULL, &skew, 0); - - return skew; -} - -static double -sekurt_get (const struct per_var_data *pvd) -{ - double n; - - moments1_calculate (((struct per_var_data_moment *)pvd)->mom, &n, NULL, NULL, NULL, NULL); - - return calc_sekurt (n); -} - -static double -seskew_get (const struct per_var_data *pvd) -{ - double n; - - moments1_calculate (((struct per_var_data_moment *)pvd)->mom, &n, NULL, NULL, NULL, NULL); - - return calc_seskew (n); -} - -static double -kurt_get (const struct per_var_data *pvd) -{ - double kurt; - - moments1_calculate (((struct per_var_data_moment *)pvd)->mom, NULL, NULL, NULL, NULL, &kurt); - - return kurt; -} - -static double -semean_get (const struct per_var_data *pvd) -{ - double n, var; - - moments1_calculate (((struct per_var_data_moment *)pvd)->mom, &n, NULL, &var, NULL, NULL); - - return sqrt (var / n); -} - - - -/* MIN: The smallest (closest to minus infinity) value. */ - -static struct per_var_data * -min_create (struct pool *pool) -{ - struct per_var_data_simple *pvd = pool_alloc (pool, sizeof *pvd); - - pvd->acc = DBL_MAX; - - return (struct per_var_data *) pvd; -} - -static void -min_update (struct per_var_data *pvd, double w UNUSED, double x) -{ - double *r = &((struct per_var_data_simple *)pvd)->acc; - - if (x < *r) - *r = x; -} - -static double -min_get (const struct per_var_data *pvd) -{ - double *r = &((struct per_var_data_simple *)pvd)->acc; - - return *r; -} - -/* MAX: The largest (closest to plus infinity) value. */ - -static struct per_var_data * -max_create (struct pool *pool) -{ - struct per_var_data_simple *pvd = pool_alloc (pool, sizeof *pvd); - - pvd->acc = -DBL_MAX; - - return (struct per_var_data *) pvd; -} - -static void -max_update (struct per_var_data *pvd, double w UNUSED, double x) -{ - double *r = &((struct per_var_data_simple *)pvd)->acc; - - if (x > *r) - *r = x; -} - -static double -max_get (const struct per_var_data *pvd) -{ - double *r = &((struct per_var_data_simple *)pvd)->acc; - - return *r; -} - - - -struct range -{ - struct per_var_data parent; - double min; - double max; -}; - -static struct per_var_data * -range_create (struct pool *pool) -{ - struct range *r = pool_alloc (pool, sizeof *r); - - r->min = DBL_MAX; - r->max = -DBL_MAX; - - return (struct per_var_data *) r; -} - -static void -range_update (struct per_var_data *pvd, double w UNUSED, double x) -{ - struct range *r = (struct range *) pvd; - - if (x > r->max) - r->max = x; - - if (x < r->min) - r->min = x; -} - -static double -range_get (const struct per_var_data *pvd) -{ - const struct range *r = (struct range *) pvd; - - return r->max - r->min; -} - - - -/* LAST: The last value (the one closest to the end of the file). */ - -static struct per_var_data * -last_create (struct pool *pool) -{ - struct per_var_data_simple *pvd = pool_alloc (pool, sizeof *pvd); - - return (struct per_var_data *) pvd; -} - -static void -last_update (struct per_var_data *pvd, double w UNUSED, double x) -{ - struct per_var_data_simple *stat = (struct per_var_data_simple *) pvd; - - stat->acc = x; -} - -static double -last_get (const struct per_var_data *pvd) -{ - const struct per_var_data_simple *stat = (struct per_var_data_simple *) pvd; - - return stat->acc; -} - -/* FIRST: The first value (the one closest to the start of the file). */ - -static struct per_var_data * -first_create (struct pool *pool) -{ - struct per_var_data_simple *pvd = pool_alloc (pool, sizeof *pvd); - - pvd->acc = SYSMIS; - - return (struct per_var_data *) pvd; -} - -static void -first_update (struct per_var_data *pvd, double w UNUSED, double x) -{ - struct per_var_data_simple *stat = (struct per_var_data_simple *) pvd; - - if (stat->acc == SYSMIS) - stat->acc = x; -} - -static double -first_get (const struct per_var_data *pvd) -{ - const struct per_var_data_simple *stat = (struct per_var_data_simple *) pvd; - - return stat->acc; -} - -/* Table of cell_specs */ -const struct cell_spec cell_spec[n_MEANS_STATISTICS] = { - {N_("Mean"), "MEAN", default_create, default_update, arithmean_get}, - {N_("N"), "COUNT", default_create, default_update, n_get}, - {N_("Std. Deviation"), "STDDEV", default_create, default_update, stddev_get}, -#if 0 - {N_("Median"), "MEDIAN", default_create, default_update, NULL}, - {N_("Group Median"), "GMEDIAN", default_create, default_update, NULL}, -#endif - {N_("S.E. Mean"), "SEMEAN", default_create, default_update, semean_get}, - {N_("Sum"), "SUM", default_create, default_update, sum_get}, - {N_("Minimum"), "MIN", min_create, min_update, min_get}, - {N_("Maximum"), "MAX", max_create, max_update, max_get}, - {N_("Range"), "RANGE", range_create, range_update, range_get}, - {N_("Variance"), "VARIANCE", default_create, default_update, variance_get}, - {N_("Kurtosis"), "KURT", default_create, default_update, kurt_get}, - {N_("S.E. Kurt"), "SEKURT", default_create, default_update, sekurt_get}, - {N_("Skewness"), "SKEW", default_create, default_update, skew_get}, - {N_("S.E. Skew"), "SESKEW", default_create, default_update, seskew_get}, - {N_("First"), "FIRST", first_create, first_update, first_get}, - {N_("Last"), "LAST", last_create, last_update, last_get}, -#if 0 - {N_("Percent N"), "NPCT", default_create, default_update, NULL}, - {N_("Percent Sum"), "SPCT", default_create, default_update, NULL}, -#endif - {N_("Harmonic Mean"), "HARMONIC", harmonic_create, harmonic_update, harmonic_get}, - {N_("Geom. Mean"), "GEOMETRIC", geometric_create, geometric_update, geometric_get} -}; diff --git a/src/language/stats/means-parser.c b/src/language/stats/means-parser.c deleted file mode 100644 index 73822ecce1..0000000000 --- a/src/language/stats/means-parser.c +++ /dev/null @@ -1,313 +0,0 @@ -/* PSPP - a program for statistical analysis. - Copyright (C) 2011, 2012, 2013, 2019 Free Software Foundation, Inc. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . */ - -#include - -#include "data/case.h" -#include "data/casegrouper.h" -#include "data/casereader.h" -#include "data/dataset.h" -#include "data/dictionary.h" -#include "data/format.h" -#include "data/variable.h" - -#include "language/command.h" -#include "language/lexer/lexer.h" -#include "language/lexer/variable-parser.h" - -#include "libpspp/hmap.h" -#include "libpspp/bt.h" -#include "libpspp/misc.h" -#include "libpspp/pool.h" - -#include "means.h" - -/* Parse the /TABLES stanza of the command. */ -static bool -parse_means_table_syntax (struct lexer *lexer, const struct means *cmd, - struct mtable *table) -{ - table->n_layers = 0; - table->layers = NULL; - - /* Dependent variable (s) */ - if (!parse_variables_const_pool (lexer, cmd->pool, cmd->dict, - &table->dep_vars, &table->n_dep_vars, - PV_NO_DUPLICATE | PV_NUMERIC)) - return false; - - - /* Factor variable (s) */ - while (lex_match (lexer, T_BY)) - { - struct layer *layer = xzalloc (sizeof *layer); - hmap_init (&layer->instances.map); - - table->n_layers++; - table->layers = xrealloc (table->layers, - table->n_layers * sizeof *table->layers); - - table->layers[table->n_layers - 1] = layer; - - if (!parse_variables_const_pool - (lexer, cmd->pool, cmd->dict, - &layer->factor_vars, - &layer->n_factor_vars, - PV_NO_DUPLICATE)) - return false; - } - - return true; -} - -/* Match a variable. - If the match succeeds, the variable will be placed in VAR. - Returns true if successful */ -static bool -lex_is_variable (struct lexer *lexer, const struct dictionary *dict, - int n) -{ - const char *tstr; - if (lex_next_token (lexer, n) != T_ID) - return false; - - tstr = lex_next_tokcstr (lexer, n); - - if (NULL == dict_lookup_var (dict, tstr) ) - return false; - - return true; -} - -static bool -means_parse (struct lexer *lexer, struct means *means) -{ - /* Optional TABLES = */ - if (lex_match_id (lexer, "TABLES")) - { - if (! lex_force_match (lexer, T_EQUALS)) - return false; - } - - bool more_tables = true; - /* Parse the "tables" */ - while (more_tables) - { - means->n_tables ++; - means->table = pool_realloc (means->pool, means->table, means->n_tables * sizeof (*means->table)); - - if (! parse_means_table_syntax (lexer, means, - &means->table[means->n_tables - 1])) - { - return false; - } - - /* Look ahead to see if there are more tables to be parsed */ - more_tables = false; - if ( T_SLASH == lex_next_token (lexer, 0) ) - { - if (lex_is_variable (lexer, means->dict, 1) ) - { - more_tables = true; - lex_match (lexer, T_SLASH); - } - } - } - - /* /MISSING subcommand */ - while (lex_token (lexer) != T_ENDCMD) - { - lex_match (lexer, T_SLASH); - - if (lex_match_id (lexer, "MISSING")) - { - /* - If no MISSING subcommand is specified, each combination of - a dependent variable and categorical variables is handled - separately. - */ - lex_match (lexer, T_EQUALS); - if (lex_match_id (lexer, "INCLUDE")) - { - /* - Use the subcommand "/MISSING=INCLUDE" to include user-missing - values in the analysis. - */ - - means->exclude = MV_SYSTEM; - means->dep_exclude = MV_SYSTEM; - } - else if (lex_match_id (lexer, "TABLE")) - /* - This is the default. (I think). - Every case containing a complete set of variables for a given - table. If any variable, categorical or dependent for in a table - is missing (as defined by what?), then that variable will - be dropped FOR THAT TABLE ONLY. - */ - { - means->listwise_exclude = true; - } - else if (lex_match_id (lexer, "DEPENDENT")) - /* - Use the command "/MISSING=DEPENDENT" to - include user-missing values for the categorical variables, - while excluding them for the dependent variables. - - Cases are dropped only when user-missing values - appear in dependent variables. User-missing - values for categorical variables are treated according to - their face value. - - Cases are ALWAYS dropped when System Missing values appear - in the categorical variables. - */ - { - means->dep_exclude = MV_ANY; - means->exclude = MV_SYSTEM; - } - else - { - lex_error (lexer, NULL); - return false; - } - } - else if (lex_match_id (lexer, "CELLS")) - { - lex_match (lexer, T_EQUALS); - - /* The default values become overwritten */ - means->n_statistics = 0; - free (means->statistics); - means->statistics = 0; - while (lex_token (lexer) != T_ENDCMD - && lex_token (lexer) != T_SLASH) - { - if (lex_match (lexer, T_ALL)) - { - free (means->statistics); - means->statistics = xcalloc (n_MEANS_STATISTICS, sizeof (*means->statistics)); - means->n_statistics = n_MEANS_STATISTICS; - int i; - for (i = 0; i < n_MEANS_STATISTICS; ++i) - { - means->statistics[i] = i; - } - } - else if (lex_match_id (lexer, "NONE")) - { - means->n_statistics = 0; - free (means->statistics); - means->statistics = 0; - } - else if (lex_match_id (lexer, "DEFAULT")) - { - means->n_statistics = 3; - means->statistics = xcalloc (3, sizeof *means->statistics); - means->statistics[0] = MEANS_MEAN; - means->statistics[1] = MEANS_N; - means->statistics[2] = MEANS_STDDEV; - } - else - { - int i; - for (i = 0; i < n_MEANS_STATISTICS; ++i) - { - const struct cell_spec *cs = cell_spec + i; - if (lex_match_id (lexer, cs->keyword)) - { - means->statistics - = xrealloc (means->statistics, - (means->n_statistics + 1) - * sizeof (*means->statistics)); - - means->statistics[means->n_statistics] = i; - means->n_statistics++; - break; - } - } - - if (i >= n_MEANS_STATISTICS) - { - lex_error (lexer, NULL); - return false; - } - } - } - } - else - { - lex_error (lexer, NULL); - return false; - } - } - return true; -} - -int -cmd_means (struct lexer *lexer, struct dataset *ds) -{ - struct means means; - means.pool = pool_create (); - - means.exclude = MV_ANY; - means.dep_exclude = MV_ANY; - means.listwise_exclude = false; - means.table = NULL; - means.n_tables = 0; - - means.dict = dataset_dict (ds); - - means.n_statistics = 3; - means.statistics = xcalloc (3, sizeof *means.statistics); - means.statistics[0] = MEANS_MEAN; - means.statistics[1] = MEANS_N; - means.statistics[2] = MEANS_STDDEV; - - if (! means_parse (lexer, &means)) - goto error; - - { - struct casegrouper *grouper; - struct casereader *group; - bool ok; - - grouper = casegrouper_create_splits (proc_open (ds), means.dict); - while (casegrouper_get_next_group (grouper, &group)) - { - run_means (&means, group, ds); - } - ok = casegrouper_destroy (grouper); - ok = proc_commit (ds) && ok; - } - - for (int t = 0; t < means.n_tables; ++t) - { - const struct mtable *table = means.table + t; - means_shipout (table, &means); - } - - return CMD_SUCCESS; - - error: - - return CMD_FAILURE; -} - - - - - diff --git a/src/language/stats/means.c b/src/language/stats/means.c index aa94784430..7e7642adf1 100644 --- a/src/language/stats/means.c +++ b/src/language/stats/means.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2011, 2012, 2013, 2019 Free Software Foundation, Inc. + Copyright (C) 2011, 2012, 2013 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -24,716 +24,1167 @@ #include "data/format.h" #include "data/variable.h" -#include "libpspp/hmap.h" -#include "libpspp/bt.h" +#include "language/command.h" +#include "language/lexer/lexer.h" +#include "language/lexer/variable-parser.h" -#include "output/pivot-table.h" +#include "libpspp/misc.h" +#include "libpspp/pool.h" + +#include "math/categoricals.h" +#include "math/interaction.h" +#include "math/moments.h" -#include "means.h" +#include "output/pivot-table.h" +#include #include "gettext.h" #define _(msgid) gettext (msgid) #define N_(msgid) (msgid) -/* A "cell" in this procedure represents a distinct value of the - procedure's categorical variables, and a set of summary statistics - of all cases which whose categorical variables have that set of - values. For example, the dataset - v1 v2 cat1 cat2 - 100 202 0 1 - 100 202 0 2 - 100 202 1 0 - 100 202 0 1 +struct means; +struct per_var_data +{ + void **cell_stats; + struct moments1 *mom; +}; - has three cells in layer 0 and two cells in layer 1 in addition - to a "grand summary" cell to which all (non-missing) cases - contribute. - The cells form a n-ary tree structure with the "grand summary" - cell at the root. - */ -struct cell +typedef void *stat_create (struct pool *pool); +typedef void stat_update (void *stat, double w, double x); +typedef double stat_get (const struct per_var_data *, void *aux); + +struct cell_spec { - struct hmap_node hmap_node; /* Element in hash table. */ - struct bt_node bt_node; /* Element in binary tree */ + /* Printable title for output */ + const char *title; - struct cell_container children; + /* Keyword for syntax */ + const char *keyword; - /* The level of the subtotal to which this cell pertains, or - -1 if it does not pertain to a subtotal. */ - int subtotal; + stat_create *sc; + stat_update *su; + stat_get *sd; +}; - /* The statistics to be calculated for the cell. */ - struct per_var_data **stat; +struct harmonic_mean +{ + double rsum; + double n; +}; - /* The parent of this cell, or NULL if this is the root cell. */ - struct cell *parent_cell; +static void * +harmonic_create (struct pool *pool) +{ + struct harmonic_mean *hm = pool_alloc (pool, sizeof *hm); - int n_subtotals; - struct cell_container *subtotals; + hm->rsum = 0; + hm->n = 0; + + return hm; +} - int n_values; - union value values[1]; /* The value(s). */ -}; static void -dump_cell (const struct cell *cell, int level) +harmonic_update (void *stat, double w, double x) +{ + struct harmonic_mean *hm = stat; + hm->rsum += w / x; + hm->n += w; +} + + +static double +harmonic_get (const struct per_var_data *pvd UNUSED, void *stat) +{ + struct harmonic_mean *hm = stat; + + return hm->n / hm->rsum; +} + + + +struct geometric_mean +{ + double prod; + double n; +}; + + +static void * +geometric_create (struct pool *pool) { - printf ("%p: ", cell); - for (int i = 0; i < cell->n_values; ++i) - printf ("%g; ", cell->values[i].f); - // printf ("--- Count: %g", cell->count); - // printf ("--- N Subtotals: %d", cell->n_subtotals); - printf ("--- Level: %d", level); - printf ("--- Subtotal: %d", cell->subtotal); - printf ("--- Parent: %p", cell->parent_cell); - printf ("\n"); + struct geometric_mean *gm = pool_alloc (pool, sizeof *gm); + + gm->prod = 1.0; + gm->n = 0; + + return gm; } + static void -dump_tree (const struct cell *cell, int level) +geometric_update (void *stat, double w, double x) { - struct cell *sub_cell; - BT_FOR_EACH (sub_cell, struct cell, bt_node, &cell->children.bt) - { - dump_tree (sub_cell, level + 1); - } + struct geometric_mean *gm = stat; + gm->prod *= pow (x, w); + gm->n += w; +} - for (int i = 0; i < cell->n_subtotals; ++i) - { - struct cell_container *container = cell->subtotals + i; - struct cell *scell; - BT_FOR_EACH (scell, struct cell, bt_node, &container->bt) - { - dump_cell (scell, level); - } - } - dump_cell (cell, level); +static double +geometric_get (const struct per_var_data *pvd UNUSED, void *stat) +{ + struct geometric_mean *gm = stat; + + return pow (gm->prod, 1.0 / gm->n); } -struct instance + + +static double +sum_get (const struct per_var_data *pvd, void *stat UNUSED) { - struct hmap_node hmap_node; /* Element in hash table. */ - struct bt_node bt_node; /* Element in binary tree */ + double n, mean; - /* A unique, consecutive, zero based index identifying this - instance. */ - int index; + moments1_calculate (pvd->mom, &n, &mean, 0, 0, 0); - /* The top level value of this instance. */ - union value value; -}; + return mean * n; +} -static void -dump_layer (const struct layer *layer) + +static double +n_get (const struct per_var_data *pvd, void *stat UNUSED) { - printf ("Layer: %p; fv0: %s; N %ld\n", layer, - layer->n_factor_vars - ? var_get_name (layer->factor_vars[0]) - : "(null)", - hmap_count (&layer->instances.map) - ); + double n; - struct instance *inst; - BT_FOR_EACH (inst, struct instance, bt_node, &layer->instances.bt) - { - printf ("Val %g; Index %d\n", inst->value.f, inst->index); - } - printf ("\n"); + moments1_calculate (pvd->mom, &n, 0, 0, 0, 0); + + return n; } +static double +arithmean_get (const struct per_var_data *pvd, void *stat UNUSED) +{ + double n, mean; + + moments1_calculate (pvd->mom, &n, &mean, 0, 0, 0); + + return mean; +} -/* Generate a hash based on the values of the N variables in - the array VARS which are taken from the case C. */ -static size_t -generate_hash (const struct ccase *c, int n, - const struct variable * const * vars) +static double +variance_get (const struct per_var_data *pvd, void *stat UNUSED) { - size_t hash = 0; - for (int i = 0; i < n; ++i) - { - const struct variable *var = vars[i]; - const union value *vv = case_data (c, var); - int width = var_get_width (var); - hash = value_hash (vv, width, hash); - } + double n, mean, variance; - return hash; + moments1_calculate (pvd->mom, &n, &mean, &variance, 0, 0); + + return variance; } -/* Create a cell based on the N variables in the array VARS, - which are indeces into the case C. - The caller is responsible for destroying this cell when - no longer needed. */ -static struct cell * -generate_cell (const struct means *means, const struct ccase *c, int n, - const struct variable * const * vars) + +static double +stddev_get (const struct per_var_data *pvd, void *stat) { - struct cell *cell = xzalloc ((sizeof *cell) - + (n - 1) * sizeof (union value)); - cell->subtotal = -1; - cell->n_values = n; - for (int i = 0; i < n; ++i) - { - const struct variable *var = vars[i]; - const union value *vv = case_data (c, var); - int width = var_get_width (var); - value_clone (&cell->values[i], vv, width); - } + return sqrt (variance_get (pvd, stat)); +} - hmap_init (&cell->children.map); - cell->stat = xcalloc (means->n_statistics, sizeof *cell->stat); - for (int stat = 0; stat < means->n_statistics; ++stat) - { - stat_create *sc = cell_spec[means->statistics[stat]].sc; + - cell->stat[stat] = sc (means->pool); - } +static double +skew_get (const struct per_var_data *pvd, void *stat UNUSED) +{ + double skew; - return cell; + moments1_calculate (pvd->mom, NULL, NULL, NULL, &skew, 0); + + return skew; } +static double +sekurt_get (const struct per_var_data *pvd, void *stat UNUSED) +{ + double n; + + moments1_calculate (pvd->mom, &n, NULL, NULL, NULL, NULL); + + return calc_sekurt (n); +} -/* If a cell based on the N variables in the array VARS, - which are indeces into the case C and whose hash is HASH, - exists in HMAP, then return that cell. - Otherwise, return NULL. */ -static struct cell * -lookup_cell (struct hmap *hmap, size_t hash, - const struct ccase *c, - int n, const struct variable * const * vars) +static double +seskew_get (const struct per_var_data *pvd, void *stat UNUSED) { - struct cell *fcol = NULL; - HMAP_FOR_EACH_WITH_HASH (fcol, struct cell, hmap_node, hash, hmap) - { - bool match = true; - for (int i = 0; i < n; ++i) - { - const struct variable *var = vars[i]; - const union value *vv = case_data (c, var); - int width = var_get_width (var); - if (!value_equal (vv, &fcol->values[i], width)) - { - match = false; - break; - } - } - if (match) - return fcol; - } - return NULL; + double n; + + moments1_calculate (pvd->mom, &n, NULL, NULL, NULL, NULL); + + return calc_seskew (n); } +static double +kurt_get (const struct per_var_data *pvd, void *stat UNUSED) +{ + double kurt; + + moments1_calculate (pvd->mom, NULL, NULL, NULL, NULL, &kurt); + + return kurt; +} -/* A comparison function used to sort cells in a binary tree. */ -static int -my_compare_func (const struct bt_node *a, - const struct bt_node *b, - const void *aux) +static double +semean_get (const struct per_var_data *pvd, void *stat UNUSED) { - const struct cell *fa = BT_DATA (a, struct cell, bt_node); - const struct cell *fb = BT_DATA (b, struct cell, bt_node); + double n, var; - const struct variable *const *cv = aux; + moments1_calculate (pvd->mom, &n, NULL, &var, NULL, NULL); - int vidx = fa->n_values - 1; - assert (fa->n_values == fb->n_values); + return sqrt (var / n); +} + + + +static void * +min_create (struct pool *pool) +{ + double *r = pool_alloc (pool, sizeof *r); + + *r = DBL_MAX; - // FIXME: Consider whether other layers need to be compared. - int r = value_compare_3way (&fa->values[vidx], - &fb->values[vidx], - var_get_width (cv[vidx])); return r; } -/* A comparison function used to sort cells in a binary tree. */ -static int -my_other_compare_func (const struct bt_node *a, - const struct bt_node *b, - const void *aux) +static void +min_update (void *stat, double w UNUSED, double x) { - const struct instance *fa = BT_DATA (a, struct instance, bt_node); - const struct instance *fb = BT_DATA (b, struct instance, bt_node); + double *r = stat; - const struct variable *var = aux; + if (x < *r) + *r = x; +} - int r = value_compare_3way (&fa->value, - &fb->value, - var_get_width (var)); - return r; +static double +min_get (const struct per_var_data *pvd UNUSED, void *stat) +{ + double *r = stat; + + return *r; } +static void * +max_create (struct pool *pool) +{ + double *r = pool_alloc (pool, sizeof *r); -static void arrange_cells (struct cell *cell, const struct mtable *table); + *r = -DBL_MAX; + return r; +} -/* Walk the tree starting at CELL, creating and populating the BT for - each cell. Also assigns the "rank", "parent_cell" and "subtotal" members - of each cell.*/ static void -arrange_cell (struct cell_container *container, - struct cell *cell, const struct mtable *table, - int subtotal) +max_update (void *stat, double w UNUSED, double x) { - const struct variable **control_vars = table->control_vars; - struct bt *bt = &container->bt; - struct hmap *map = &container->map; - bt_init (bt, my_compare_func, control_vars); + double *r = stat; - struct cell *scell; - HMAP_FOR_EACH (scell, struct cell, hmap_node, map) - { - scell->parent_cell = cell; - scell->subtotal = subtotal; - bt_insert (bt, &scell->bt_node); - arrange_cells (scell, table); - } + if (x > *r) + *r = x; +} - if (cell->n_values > 0 && cell->subtotal == -1) - { - struct layer *layer = table->layers[cell->n_values - 1]; - - const struct variable *var = control_vars[cell->n_values - 1]; - int width = var_get_width (var); - unsigned int hash - = value_hash (&cell->values[cell->n_values - 1], width, 0); - - - struct instance *inst = NULL; - struct instance *next = NULL; - HMAP_FOR_EACH_WITH_HASH_SAFE (inst, next, struct instance, hmap_node, - hash, &layer->instances.map) - { - if (value_equal (&inst->value, - &cell->values[cell->n_values - 1], - width)) - { - break; - } - } - - if (!inst) - { - inst = xzalloc (sizeof *inst); - inst->index = -1; - value_copy (&inst->value, &cell->values[cell->n_values -1], - width); - hmap_insert (&layer->instances.map, &inst->hmap_node, hash); - } - } +static double +max_get (const struct per_var_data *pvd UNUSED, void *stat) +{ + double *r = stat; + + return *r; +} + + + +struct range +{ + double min; + double max; +}; + +static void * +range_create (struct pool *pool) +{ + struct range *r = pool_alloc (pool, sizeof *r); + + r->min = DBL_MAX; + r->max = -DBL_MAX; + + return r; } -/* Arrange the children and then all the subtotals. */ static void -arrange_cells (struct cell *cell, const struct mtable *table) +range_update (void *stat, double w UNUSED, double x) { - arrange_cell (&cell->children, cell, table, -1); + struct range *r = stat; - for (int s = 0; s < cell->n_subtotals; ++ s) - { - arrange_cell (cell->subtotals + s, cell, table, s); - } + if (x > r->max) + r->max = x; + + if (x < r->min) + r->min = x; } +static double +range_get (const struct per_var_data *pvd UNUSED, void *stat) +{ + struct range *r = stat; + + return r->max - r->min; +} -/* If the top level value in CELL, has an instance in LAYER, then - return that instance. Otherwise return NULL. */ -static const struct instance * -lookup_instance (const struct layer *layer, const struct cell *cell) -{ - const struct variable *var = layer->factor_vars[0]; - const union value *val = cell->values + cell->n_values - 1; - int width = var_get_width (var); - unsigned int hash = value_hash (val, width, 0); - struct instance *inst = NULL; - struct instance *next; - HMAP_FOR_EACH_WITH_HASH_SAFE (inst, next, - struct instance, hmap_node, - hash, &layer->instances.map) +static void * +last_create (struct pool *pool) +{ + double *l = pool_alloc (pool, sizeof *l); + + return l; +} + +static void +last_update (void *stat, double w UNUSED, double x) +{ + double *l = stat; + + *l = x; +} + +static double +last_get (const struct per_var_data *pvd UNUSED, void *stat) +{ + double *l = stat; + + return *l; +} + + +static void * +first_create (struct pool *pool) +{ + double *f = pool_alloc (pool, sizeof *f); + + *f = SYSMIS; + + return f; +} + +static void +first_update (void *stat, double w UNUSED, double x) +{ + double *f = stat; + + if (*f == SYSMIS) + *f = x; +} + +static double +first_get (const struct per_var_data *pvd UNUSED, void *stat) +{ + double *f = stat; + + return *f; +} + +enum + { + MEANS_MEAN = 0, + MEANS_N, + MEANS_STDDEV + }; + +/* Table of cell_specs */ +static const struct cell_spec cell_spec[] = { + {N_("Mean"), "MEAN", NULL, NULL, arithmean_get}, + {N_("N"), "COUNT", NULL, NULL, n_get}, + {N_("Std. Deviation"), "STDDEV", NULL, NULL, stddev_get}, +#if 0 + {N_("Median"), "MEDIAN", NULL, NULL, NULL}, + {N_("Group Median"), "GMEDIAN", NULL, NULL, NULL}, +#endif + {N_("S.E. Mean"), "SEMEAN", NULL, NULL, semean_get}, + {N_("Sum"), "SUM", NULL, NULL, sum_get}, + {N_("Min"), "MIN", min_create, min_update, min_get}, + {N_("Max"), "MAX", max_create, max_update, max_get}, + {N_("Range"), "RANGE", range_create, range_update, range_get}, + {N_("Variance"), "VARIANCE", NULL, NULL, variance_get}, + {N_("Kurtosis"), "KURT", NULL, NULL, kurt_get}, + {N_("S.E. Kurt"), "SEKURT", NULL, NULL, sekurt_get}, + {N_("Skewness"), "SKEW", NULL, NULL, skew_get}, + {N_("S.E. Skew"), "SESKEW", NULL, NULL, seskew_get}, + {N_("First"), "FIRST", first_create, first_update, first_get}, + {N_("Last"), "LAST", last_create, last_update, last_get}, +#if 0 + {N_("Percent N"), "NPCT", NULL, NULL, NULL}, + {N_("Percent Sum"), "SPCT", NULL, NULL, NULL}, +#endif + {N_("Harmonic Mean"), "HARMONIC", harmonic_create, harmonic_update, harmonic_get}, + {N_("Geom. Mean"), "GEOMETRIC", geometric_create, geometric_update, geometric_get} +}; + +#define n_C (sizeof (cell_spec) / sizeof (struct cell_spec)) + + +struct summary +{ + casenumber missing; + casenumber non_missing; +}; + + +struct layer +{ + size_t n_factor_vars; + const struct variable **factor_vars; +}; + +/* The thing parsed after TABLES= */ +struct mtable +{ + size_t n_dep_vars; + const struct variable **dep_vars; + + int n_layers; + struct layer *layers; + + struct interaction **interactions; + struct summary *summary; + + int ii; + + struct categoricals *cats; +}; + +struct means +{ + const struct dictionary *dict; + + struct mtable *table; + size_t n_tables; + + /* Missing value class for categorical variables */ + enum mv_class exclude; + + /* Missing value class for dependent variables */ + enum mv_class dep_exclude; + + bool listwise_exclude; + + /* an array indicating which statistics are to be calculated */ + int *cells; + + /* Size of cells */ + int n_cells; + + /* Pool on which cell functions may allocate data */ + struct pool *pool; +}; + + +static void +run_means (struct means *cmd, struct casereader *input, + const struct dataset *ds); + + + +static bool +parse_means_table_syntax (struct lexer *lexer, const struct means *cmd, struct mtable *table) +{ + table->ii = 0; + table->n_layers = 0; + table->layers = NULL; + table->interactions = NULL; + + /* Dependent variable (s) */ + if (!parse_variables_const_pool (lexer, cmd->pool, cmd->dict, + &table->dep_vars, &table->n_dep_vars, + PV_NO_DUPLICATE | PV_NUMERIC)) + return false; + + /* Factor variable (s) */ + while (lex_match (lexer, T_BY)) { - if (value_equal (val, &inst->value, width)) - break; + table->n_layers++; + table->layers = + pool_realloc (cmd->pool, table->layers, + sizeof (*table->layers) * table->n_layers); + + if (!parse_variables_const_pool + (lexer, cmd->pool, cmd->dict, + &table->layers[table->n_layers - 1].factor_vars, + &table->layers[table->n_layers - 1].n_factor_vars, + PV_NO_DUPLICATE)) + return false; } - return inst; + + /* There is always at least one layer. + However the final layer is the total, and not + normally considered by the user as a + layer. + */ + + table->n_layers++; + table->layers = + pool_realloc (cmd->pool, table->layers, + sizeof (*table->layers) * table->n_layers); + table->layers[table->n_layers - 1].factor_vars = NULL; + table->layers[table->n_layers - 1].n_factor_vars = 0; + + return true; } -static void -populate_table (const struct cell *cell, const struct means *means, - struct pivot_table *table, int level) +/* Match a variable. + If the match succeeds, the variable will be placed in VAR. + Returns true if successful */ +static bool +lex_is_variable (struct lexer *lexer, const struct dictionary *dict, + int n) { - /* It is easier to overallocate this table by 2, than to adjust the - logic when assigning its contents, and it is cheap to do so. */ - size_t *indexes = xcalloc (table->n_dimensions + 2, sizeof *indexes); - for (int s = 0; s < means->n_statistics; ++s) + const char *tstr; + if (lex_next_token (lexer, n) != T_ID) + return false; + + tstr = lex_next_tokcstr (lexer, n); + + if (NULL == dict_lookup_var (dict, tstr) ) + return false; + + return true; +} + + +int +cmd_means (struct lexer *lexer, struct dataset *ds) +{ + int t; + int i; + int l; + struct means means; + bool more_tables = true; + + means.pool = pool_create (); + + means.exclude = MV_ANY; + means.dep_exclude = MV_ANY; + means.listwise_exclude = false; + means.table = NULL; + means.n_tables = 0; + + means.dict = dataset_dict (ds); + + means.n_cells = 3; + means.cells = pool_calloc (means.pool, means.n_cells, sizeof (*means.cells)); + + + /* The first three items (MEAN, COUNT, STDDEV) are the default */ + for (i = 0; i < 3; ++i) + means.cells[i] = i; + + + /* Optional TABLES = */ + if (lex_match_id (lexer, "TABLES")) + { + if (! lex_force_match (lexer, T_EQUALS)) + goto error; + } + + + more_tables = true; + /* Parse the "tables" */ + while (more_tables) { - bool kludge = false; // FIXME: Remove this for production. - indexes[0] = s; - int stat = means->statistics[s]; - if (cell->subtotal != -1) + means.n_tables ++; + means.table = pool_realloc (means.pool, means.table, means.n_tables * sizeof (*means.table)); + + if (! parse_means_table_syntax (lexer, &means, + &means.table[means.n_tables - 1])) { - // for (int i = 1; i < table->n_dimensions; ++i) - { - int i = 1; - const struct layer *layer - = means->table->layers[table->n_dimensions - 1 - i]; - assert (layer); - const struct instance *inst = lookup_instance (layer, cell); - assert (inst); - indexes[i] = inst->index; - } - int i = 2; - const struct layer *layer - = means->table->layers[table->n_dimensions - 1 - i]; - indexes[i] = hmap_count (&layer->instances.map); - kludge = true; + goto error; } - else if (hmap_is_empty (&cell->children.map)) + + /* Look ahead to see if there are more tables to be parsed */ + more_tables = false; + if ( T_SLASH == lex_next_token (lexer, 0) ) { - const struct cell *pc = cell; - for (int i = 1; i < table->n_dimensions; ++i) + if (lex_is_variable (lexer, means.dict, 1) ) { - const struct layer *layer - = means->table->layers[table->n_dimensions - 1 - i]; - - const struct instance *inst = lookup_instance (layer, pc); - assert (inst); - indexes[i] = inst->index; - pc = pc->parent_cell; + more_tables = true; + lex_match (lexer, T_SLASH); } - kludge = true; } - else + } + + /* /MISSING subcommand */ + while (lex_token (lexer) != T_ENDCMD) + { + lex_match (lexer, T_SLASH); + + if (lex_match_id (lexer, "MISSING")) { - const struct layer *layer; - int i = 0; - for (int st = 0; st < cell->n_subtotals; ++st) + /* + If no MISSING subcommand is specified, each combination of + a dependent variable and categorical variables is handled + separately. + */ + lex_match (lexer, T_EQUALS); + if (lex_match_id (lexer, "INCLUDE")) { - layer = means->table->layers[table->n_dimensions - i - 2]; - indexes[++i] = hmap_count (&layer->instances.map); + /* + Use the subcommand "/MISSING=INCLUDE" to include user-missing + values in the analysis. + */ + + means.exclude = MV_SYSTEM; + means.dep_exclude = MV_SYSTEM; } - layer = means->table->layers[table->n_dimensions - i - 2]; - indexes[++i] = hmap_count (&layer->instances.map); - if (++i < table->n_dimensions) + else if (lex_match_id (lexer, "TABLE")) + /* + This is the default. (I think). + Every case containing a complete set of variables for a given + table. If any variable, categorical or dependent for in a table + is missing (as defined by what?), then that variable will + be dropped FOR THAT TABLE ONLY. + */ { - layer = means->table->layers[table->n_dimensions - i - 1]; - const struct instance *inst = lookup_instance (layer, cell); - assert (inst); - indexes[i] = inst->index; + means.listwise_exclude = true; + } + else if (lex_match_id (lexer, "DEPENDENT")) + /* + Use the command "/MISSING=DEPENDENT" to + include user-missing values for the categorical variables, + while excluding them for the dependent variables. + + Cases are dropped only when user-missing values + appear in dependent variables. User-missing + values for categorical variables are treated according to + their face value. + + Cases are ALWAYS dropped when System Missing values appear + in the categorical variables. + */ + { + means.dep_exclude = MV_ANY; + means.exclude = MV_SYSTEM; + } + else + { + lex_error (lexer, NULL); + goto error; } - kludge = true; } + else if (lex_match_id (lexer, "CELLS")) + { + lex_match (lexer, T_EQUALS); - if (kludge) + /* The default values become overwritten */ + means.n_cells = 0; + while (lex_token (lexer) != T_ENDCMD + && lex_token (lexer) != T_SLASH) + { + int k = 0; + if (lex_match (lexer, T_ALL)) + { + int x; + means.cells = + pool_realloc (means.pool, means.cells, + (means.n_cells += n_C) * sizeof (*means.cells)); + + for (x = 0; x < n_C; ++x) + means.cells[means.n_cells - (n_C - 1 - x) - 1] = x; + } + else if (lex_match_id (lexer, "NONE")) + { + /* Do nothing */ + } + else if (lex_match_id (lexer, "DEFAULT")) + { + means.cells = + pool_realloc (means.pool, means.cells, + (means.n_cells += 3) * sizeof (*means.cells)); + + means.cells[means.n_cells - 2 - 1] = MEANS_MEAN; + means.cells[means.n_cells - 1 - 1] = MEANS_N; + means.cells[means.n_cells - 0 - 1] = MEANS_STDDEV; + } + else + { + for (; k < n_C; ++k) + { + if (lex_match_id (lexer, cell_spec[k].keyword)) + { + means.cells = + pool_realloc (means.pool, means.cells, + ++means.n_cells * sizeof (*means.cells)); + + means.cells[means.n_cells - 1] = k; + break; + } + } + } + if (k >= n_C) + { + lex_error (lexer, NULL); + goto error; + } + } + } + else { - stat_get *sg = cell_spec[stat].sd; - pivot_table_put (table, indexes, table->n_dimensions, - pivot_value_new_number (sg (cell->stat[s]))); + lex_error (lexer, NULL); + goto error; } } - free (indexes); - const struct bt *bt = &cell->children.bt; - struct cell *child = NULL; - BT_FOR_EACH (child, struct cell, bt_node, bt) - { - populate_table (child, means, table, level + 1); - } - // printf ("There are %d subtotals\n", cell->n_subtotals); - for (int i = 0; i < cell->n_subtotals; ++i) + + for (t = 0; t < means.n_tables; ++t) { - // printf ("aa\n"); - const struct cell_container *st = cell->subtotals + i; - struct cell *scell; - HMAP_FOR_EACH (scell, struct cell, hmap_node, &st->map) - { - // printf ("%s:%d xxx\n", __FILE__, __LINE__); - /* dump_cell (scell, 0); */ - populate_table (scell, means, table, level + 1); - } - // printf ("zz\n"); - } - // printf ("ooo\n"); -} + struct mtable *table = &means.table[t]; -static void -ann_dim (struct pivot_table *t, int d, size_t *indeces) -{ - if (d < 0) - return; - char label[10]; + table->interactions = + pool_calloc (means.pool, table->n_layers, sizeof (*table->interactions)); - const struct pivot_dimension *dim = t->dimensions[d]; - for (int l = 0; l < dim->n_leaves; ++l) - { - indeces[d] = l; - int x; - for (x = 0; x < t->n_dimensions; ++x) + table->summary = + pool_calloc (means.pool, table->n_dep_vars * table->n_layers, sizeof (*table->summary)); + + for (l = 0; l < table->n_layers; ++l) { - label[x] = '0' + indeces[x]; + int v; + const struct layer *lyr = &table->layers[l]; + const int n_vars = lyr->n_factor_vars; + table->interactions[l] = interaction_create (NULL); + for (v = 0; v < n_vars ; ++v) + { + interaction_add_variable (table->interactions[l], + lyr->factor_vars[v]); + } } - label[x] = '\0'; - pivot_table_put (t, indeces, t->n_dimensions, - pivot_value_new_user_text (label, x)); - ann_dim (t, d - 1, indeces); } -} -static void -annotate_table (struct pivot_table *t) -{ - size_t *indeces = xcalloc (t->n_dimensions, sizeof *indeces); + { + struct casegrouper *grouper; + struct casereader *group; + bool ok; - for (int d = 0; d < t->n_dimensions; ++d) - { - ann_dim (t, d, indeces); - } - free (indeces); -} + grouper = casegrouper_create_splits (proc_open (ds), means.dict); + while (casegrouper_get_next_group (grouper, &group)) + { + run_means (&means, group, ds); + } + ok = casegrouper_destroy (grouper); + ok = proc_commit (ds) && ok; + } -static void -create_table_structure (const struct mtable *mt, struct pivot_table *table) -{ - for (int l = mt->n_layers -1; l >=0 ; --l) + for (t = 0; t < means.n_tables; ++t) { - const struct layer *layer = mt->layers[l]; - assert (layer->n_factor_vars > 0); - const struct variable *var = layer->factor_vars[0]; - struct pivot_dimension *dim_layer - = pivot_dimension_create (table, PIVOT_AXIS_ROW, - var_to_string (var)); - dim_layer->root->show_label = true; - - { - struct instance *inst = NULL; - BT_FOR_EACH (inst, struct instance, bt_node, &layer->instances.bt) + int l; + struct mtable *table = &means.table[t]; + if (table->interactions) + for (l = 0; l < table->n_layers; ++l) { - struct substring space = SS_LITERAL_INITIALIZER ("\t "); - struct string str; - ds_init_empty (&str); - var_append_value_name (var, - &inst->value, - &str); + interaction_destroy (table->interactions[l]); + } + } - ds_ltrim (&str, space); + pool_destroy (means.pool); + return CMD_SUCCESS; - pivot_category_create_leaf - (dim_layer->root, - pivot_value_new_text (ds_cstr (&str))); + error: - ds_destroy (&str); + for (t = 0; t < means.n_tables; ++t) + { + int l; + struct mtable *table = &means.table[t]; + if (table->interactions) + for (l = 0; l < table->n_layers; ++l) + { + interaction_destroy (table->interactions[l]); } - } - - pivot_category_create_leaf - (dim_layer->root, - pivot_value_new_text ("Total")); } + + pool_destroy (means.pool); + return CMD_FAILURE; } -void -means_shipout (const struct mtable *mt, const struct means *means) + +static bool +is_missing (const struct means *cmd, + const struct variable *dvar, + const struct interaction *iact, + const struct ccase *c) { - struct pivot_table *table = pivot_table_create (N_("Report")); + if ( interaction_case_is_missing (iact, c, cmd->exclude) ) + return true; - struct pivot_dimension *dim_cells = - pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics")); - for (int i = 0; i < means->n_statistics; ++i) - { - const struct cell_spec *cs = cell_spec + means->statistics[i]; - pivot_category_create_leaf - (dim_cells->root, - pivot_value_new_text (gettext (cs->title))); - } + if (var_is_value_missing (dvar, + case_data (c, dvar), + cmd->dep_exclude)) + return true; - create_table_structure (mt, table); + return false; +} - populate_table (mt->root_cell, means, table, 0); - // pivot_table_dump (table, 0); - // annotate_table (table); +static void output_case_processing_summary (const struct mtable *, + const struct variable *wv); - pivot_table_submit (table); -} +static void output_report (const struct means *, int, const struct mtable *); - -static bool -missing_for_layer (const struct layer *layer, const struct ccase *c) +struct per_cat_data { - if (layer->n_factor_vars == 0) - return false; - - const struct variable *var = layer->factor_vars[0]; - const union value *vv = case_data (c, var); + struct per_var_data *pvd; - return var_is_value_missing (var, vv, MV_ANY); -} + bool warn; +}; -static bool -missing_for_any_layer (const struct mtable *table, - int startx, - const struct ccase *c) +static void +destroy_n (const void *aux1 UNUSED, void *aux2, void *user_data) { - bool miss = false; - for (int l = startx; l < table->n_layers; ++l) + struct mtable *table = aux2; + int v; + struct per_cat_data *per_cat_data = user_data; + struct per_var_data *pvd = per_cat_data->pvd; + + for (v = 0; v < table->n_dep_vars; ++v) { - miss = missing_for_layer (table->layers[l], c); + struct per_var_data *pp = &pvd[v]; + moments1_destroy (pp->mom); } - - if (miss) - return true; - - return false; } -static struct cell * -update_map_from_data (const struct means *means, - const struct mtable *table, - int startx, - struct hmap *map, - const struct ccase *c, - int start_var, - int n_vars, - double weight) +static void * +create_n (const void *aux1, void *aux2) { - const struct variable **cv = table->control_vars + start_var; - if (! missing_for_any_layer (table, startx, c)) + int i, v; + const struct means *means = aux1; + struct mtable *table = aux2; + struct per_cat_data *per_cat_data = pool_malloc (means->pool, sizeof *per_cat_data); + + struct per_var_data *pvd = pool_calloc (means->pool, table->n_dep_vars, sizeof *pvd); + + for (v = 0; v < table->n_dep_vars; ++v) { - const struct variable *dep_var = table->dep_vars[0]; + enum moment maxmom = MOMENT_KURTOSIS; + struct per_var_data *pp = &pvd[v]; + + pp->cell_stats = pool_calloc (means->pool, means->n_cells, sizeof *pp->cell_stats); - size_t hash = generate_hash (c, n_vars, cv); - struct cell *fcol = lookup_cell (map, hash, - c, n_vars, cv); - if (!fcol) + for (i = 0; i < means->n_cells; ++i) { - fcol = generate_cell (means, c, n_vars, cv); - fcol->n_subtotals = table->n_layers - 2 - start_var; - if (fcol->n_subtotals < 0) - fcol->n_subtotals = 0; - fcol->subtotals = xcalloc (fcol->n_subtotals, - sizeof *fcol->subtotals); - for (int i = 0; i < fcol->n_subtotals; ++i) + int csi = means->cells[i]; + const struct cell_spec *cs = &cell_spec[csi]; + if (cs->sc) { - struct cell_container *c = fcol->subtotals + i; - hmap_init (&c->map); + pp->cell_stats[i] = cs->sc (means->pool); } + } + pp->mom = moments1_create (maxmom); + } + + + per_cat_data->pvd = pvd; + per_cat_data->warn = true; + return per_cat_data; +} - hmap_insert (map, &fcol->hmap_node, hash); +static void +update_n (const void *aux1, void *aux2, void *user_data, const struct ccase *c, double weight) +{ + int i; + int v = 0; + const struct means *means = aux1; + struct mtable *table = aux2; + struct per_cat_data *per_cat_data = user_data; + + for (v = 0; v < table->n_dep_vars; ++v) + { + struct per_var_data *pvd = &per_cat_data->pvd[v]; + + const double x = case_data (c, table->dep_vars[v])->f; + + for (i = 0; i < table->n_layers; ++i) + { + if ( is_missing (means, table->dep_vars[v], + table->interactions[i], c)) + goto end; } - for (int stat = 0; stat < means->n_statistics; ++stat) + for (i = 0; i < means->n_cells; ++i) { - stat_update *su = cell_spec[means->statistics[stat]].su; - su (fcol->stat[stat], weight, case_data (c, dep_var)->f); + const int csi = means->cells[i]; + const struct cell_spec *cs = &cell_spec[csi]; + + + if (cs->su) + cs->su (pvd->cell_stats[i], + weight, x); } - return fcol; + moments1_add (pvd->mom, x, weight); + + end: + continue; } - return NULL; } +static void +calculate_n (const void *aux1, void *aux2, void *user_data) +{ + int i; + int v = 0; + struct per_cat_data *per_cat_data = user_data; + const struct means *means = aux1; + struct mtable *table = aux2; -void + for (v = 0; v < table->n_dep_vars; ++v) + { + struct per_var_data *pvd = &per_cat_data->pvd[v]; + for (i = 0; i < means->n_cells; ++i) + { + int csi = means->cells[i]; + const struct cell_spec *cs = &cell_spec[csi]; + + if (cs->su) + cs->sd (pvd, pvd->cell_stats[i]); + } + } +} + +static void run_means (struct means *cmd, struct casereader *input, const struct dataset *ds UNUSED) { - struct mtable *table = cmd->table + 0; + int t; + const struct variable *wv = dict_get_weight (cmd->dict); struct ccase *c; struct casereader *reader; - table->root_cell = generate_cell (cmd, 0, 0, 0); - table->root_cell->n_subtotals = table->n_layers - 1; - if (table->root_cell->n_subtotals < 0) - table->root_cell->n_subtotals = 0; - table->root_cell->subtotals - = xcalloc (table->root_cell->n_subtotals, - sizeof *table->root_cell->subtotals); - for (int i = 0; i < table->root_cell->n_subtotals; ++i) - { - struct cell_container *c = table->root_cell->subtotals + i; - hmap_init (&c->map); - } + struct payload payload; + payload.create = create_n; + payload.update = update_n; + payload.calculate = calculate_n; + payload.destroy = destroy_n; - table->control_vars - = calloc (table->n_layers, sizeof *table->control_vars); - for (int l = 0; l < table->n_layers; ++l) + for (t = 0; t < cmd->n_tables; ++t) { - const struct layer *layer = table->layers[l]; - if (layer->n_factor_vars > 0) - table->control_vars[l] = layer->factor_vars[0]; + struct mtable *table = &cmd->table[t]; + table->cats + = categoricals_create (table->interactions, + table->n_layers, wv, cmd->exclude); + + categoricals_set_payload (table->cats, &payload, cmd, table); } - struct cell *cell = table->root_cell; - const struct variable *dep_var = table->dep_vars[0]; for (reader = input; (c = casereader_read (reader)) != NULL; case_unref (c)) { - const double weight = dict_get_case_weight (cmd->dict, c, NULL); - if (! missing_for_any_layer (table, 0, c)) + for (t = 0; t < cmd->n_tables; ++t) { - for (int stat = 0; stat < cmd->n_statistics; ++stat) + bool something_missing = false; + int v; + struct mtable *table = &cmd->table[t]; + + for (v = 0; v < table->n_dep_vars; ++v) { - stat_update *su = cell_spec[cmd->statistics[stat]].su; - su (cell->stat[stat], weight, case_data (c, dep_var)->f); + int i; + for (i = 0; i < table->n_layers; ++i) + { + const bool missing = + is_missing (cmd, table->dep_vars[v], + table->interactions[i], c); + if (missing) + { + something_missing = true; + table->summary[v * table->n_layers + i].missing++; + } + else + table->summary[v * table->n_layers + i].non_missing++; + } } + if ( something_missing && cmd->listwise_exclude) + continue; + + categoricals_update (table->cats, c); } + } + casereader_destroy (reader); + + for (t = 0; t < cmd->n_tables; ++t) + { + struct mtable *table = &cmd->table[t]; + + categoricals_done (table->cats); + } + + + for (t = 0; t < cmd->n_tables; ++t) + { + int i; + const struct mtable *table = &cmd->table[t]; + + output_case_processing_summary (table, wv); - for (int i = 0; i < cell->n_subtotals; ++i) + for (i = 0; i < table->n_layers; ++i) { - struct cell_container *container = cell->subtotals + i; - update_map_from_data (cmd, table, 1, &container->map, c, 1, 1, - weight); + output_report (cmd, i, table); } + categoricals_destroy (table->cats); + } - struct hmap *map = &cell->children.map; - for (int l = 0; l < table->n_layers; ++l) - { - struct cell *cell - = update_map_from_data (cmd, table, l, map, c, - 0, l + 1, weight); +} - if (cell) - map = &cell->children.map; + + +static void +output_case_processing_summary (const struct mtable *mt, + const struct variable *wv) +{ + struct pivot_table *table = pivot_table_create ( + N_("Case Processing Summary")); + pivot_table_set_weight_var (table, wv); + + pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"), + N_("N"), PIVOT_RC_COUNT, + N_("Percent"), PIVOT_RC_PERCENT); + pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Cases"), + N_("Included"), N_("Excluded"), N_("Total")) + ->root->show_label = true; + + struct pivot_dimension *tables = pivot_dimension_create ( + table, PIVOT_AXIS_ROW, N_("Tables")); + + for (size_t v = 0; v < mt->n_dep_vars; ++v) + { + const struct variable *var = mt->dep_vars[v]; + for (size_t i = 0; i < mt->n_layers; ++i) + { + const int row = v * mt->n_layers + i; + const struct interaction *iact = mt->interactions[i]; + + struct string str = DS_EMPTY_INITIALIZER; + ds_put_format (&str, "%s: ", var_to_string (var)); + interaction_to_string (iact, &str); + int table_idx = pivot_category_create_leaf ( + tables->root, pivot_value_new_user_text_nocopy ( + ds_steal_cstr (&str))); + + const struct summary *s = &mt->summary[row]; + double n_total = s->missing + s->non_missing; + struct entry + { + int stat_idx; + int case_idx; + double x; + } + entries[] = + { + { 0, 0, s->non_missing }, + { 1, 0, s->non_missing / n_total * 100.0 }, + { 0, 1, s->missing }, + { 1, 1, s->missing / n_total * 100.0 }, + { 0, 2, n_total }, + { 1, 2, 100.0 }, + }; + + for (size_t j = 0; j < sizeof entries / sizeof *entries; j++) + { + const struct entry *e = &entries[j]; + pivot_table_put3 (table, e->stat_idx, e->case_idx, table_idx, + pivot_value_new_number (e->x)); + } } } - casereader_destroy (reader); - arrange_cells (table->root_cell, table); - for (int l = 0; l < table->n_layers; ++l) + pivot_table_submit (table); +} + +static void +create_interaction_dimensions (struct pivot_table *table, + const struct categoricals *cats, + const struct interaction *iact) +{ + for (size_t i = iact->n_vars; i-- > 0; ) + { + const struct variable *var = iact->vars[i]; + struct pivot_dimension *d = pivot_dimension_create__ ( + table, PIVOT_AXIS_ROW, pivot_value_new_variable (var)); + d->root->show_label = true; + + size_t n; + union value *values = categoricals_get_var_values (cats, var, &n); + for (size_t j = 0; j < n; j++) + pivot_category_create_leaf ( + d->root, pivot_value_new_var_value (var, &values[j])); + } +} + +static void +output_report (const struct means *cmd, int iact_idx, + const struct mtable *mt) +{ + struct pivot_table *table = pivot_table_create (N_("Report")); + table->omit_empty = true; + + struct pivot_dimension *statistics = pivot_dimension_create ( + table, PIVOT_AXIS_COLUMN, N_("Statistics")); + for (int i = 0; i < cmd->n_cells; ++i) + pivot_category_create_leaf ( + statistics->root, pivot_value_new_text (cell_spec[cmd->cells[i]].title)); + + const struct interaction *iact = mt->interactions[iact_idx]; + create_interaction_dimensions (table, mt->cats, iact); + + struct pivot_dimension *dep_dim = pivot_dimension_create ( + table, PIVOT_AXIS_ROW, N_("Dependent Variables")); + + size_t *indexes = xnmalloc (table->n_dimensions, sizeof *indexes); + + size_t n_cats = categoricals_n_count (mt->cats, iact_idx); + for (size_t v = 0; v < mt->n_dep_vars; ++v) { - struct layer *layer = table->layers[l]; - bt_init (&layer->instances.bt, my_other_compare_func, - table->control_vars[l]); - - /* Iterate the instance hash table, and insert each instance - into the binary tree BT. */ - struct instance *inst; - HMAP_FOR_EACH (inst, struct instance, hmap_node, - &layer->instances.map) - { - bt_insert (&layer->instances.bt, &inst->bt_node); - } - - /* Iterate the binary tree (in order) and assign the index - member accordingly. */ - int index = 0; - BT_FOR_EACH (inst, struct instance, bt_node, &layer->instances.bt) - { - inst->index = index++; - } + indexes[table->n_dimensions - 1] = pivot_category_create_leaf ( + dep_dim->root, pivot_value_new_variable (mt->dep_vars[v])); + + for (size_t i = 0; i < n_cats; ++i) + { + for (size_t j = 0; j < iact->n_vars; j++) + { + int idx = categoricals_get_value_index_by_category_real ( + mt->cats, iact_idx, i, j); + indexes[table->n_dimensions - 2 - j] = idx; + } + + struct per_cat_data *per_cat_data + = categoricals_get_user_data_by_category_real ( + mt->cats, iact_idx, i); + + const struct per_var_data *pvd = &per_cat_data->pvd[v]; + for (int stat_idx = 0; stat_idx < cmd->n_cells; ++stat_idx) + { + indexes[0] = stat_idx; + const int csi = cmd->cells[stat_idx]; + const struct cell_spec *cs = &cell_spec[csi]; + + double result = cs->sd (pvd, pvd->cell_stats[stat_idx]); + pivot_table_put (table, indexes, table->n_dimensions, + pivot_value_new_number (result)); + } + } } + free (indexes); - /* The root cell should have no parent. */ - assert (table->root_cell->parent_cell == 0); - dump_tree (table->root_cell, 0); + pivot_table_submit (table); } + diff --git a/src/language/stats/means.h b/src/language/stats/means.h deleted file mode 100644 index c449cb0087..0000000000 --- a/src/language/stats/means.h +++ /dev/null @@ -1,134 +0,0 @@ -/* PSPP - a program for statistical analysis. - Copyright (C) 2011, 2012, 2013, 2019 Free Software Foundation, Inc. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . */ - -#ifndef MEANS_H -#define MEANS_H - -#include "libpspp/compiler.h" - -struct cell_container -{ - /* A hash table containing the cells. The table is indexed by a hash - based on the cell's categorical value. */ - struct hmap map; - - /* A binary tree containing the cells. This is - used to sort the elements in order of their categorical - values. */ - struct bt bt; -}; - - - -struct layer -{ - size_t n_factor_vars; - const struct variable **factor_vars; - - /* A container holding the union of all instances of values used - as categories forming this layer. */ - struct cell_container instances; -}; - - -struct per_var_data; - -typedef struct per_var_data *stat_create (struct pool *pool); -typedef void stat_update (struct per_var_data *stat, double w, double x); -typedef double stat_get (const struct per_var_data *); - - -struct cell_spec -{ - /* Printable title for output */ - const char *title; - - /* Keyword for syntax */ - const char *keyword; - - stat_create *sc; - stat_update *su; - stat_get *sd; -}; - - -/* The thing parsed after TABLES= */ -struct mtable -{ - size_t n_dep_vars; - const struct variable **dep_vars; - - const struct variable **control_vars; - - struct layer **layers; - int n_layers; - - struct cell *root_cell; -}; - -/* A structure created by the parser. Contains the definition of the - what the procedure should calculate. */ -struct means -{ - const struct dictionary *dict; - - /* The "tables" (ie, a definition of how the data should - be broken down). */ - struct mtable *table; - size_t n_tables; - - /* Missing value class for categorical variables. */ - enum mv_class exclude; - - /* Missing value class for dependent variables */ - enum mv_class dep_exclude; - - bool listwise_exclude; - - - /* The statistics to be calculated for each cell. */ - int *statistics; - int n_statistics; - - /* Pool on which cell functions may allocate data. */ - struct pool *pool; -}; - - - -#define n_MEANS_STATISTICS 17 -extern const struct cell_spec cell_spec[n_MEANS_STATISTICS]; - -/* This enum must be consistent with the array cell_spec (in means-calc.c). - A bitfield instead of enums would in my opinion be - more elegent. However we want the order of the specified - statistics to be retained in the output. */ -enum - { - MEANS_MEAN = 0, - MEANS_N, - MEANS_STDDEV - }; - - - -struct dataset; -struct casereader; -void run_means (struct means *cmd, struct casereader *input, const struct dataset *ds UNUSED); - -void means_shipout (const struct mtable *mt, const struct means *means); - -#endif