1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2011, 2012, 2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/case.h"
20 #include "data/casegrouper.h"
21 #include "data/casereader.h"
22 #include "data/dataset.h"
23 #include "data/dictionary.h"
24 #include "data/format.h"
25 #include "data/variable.h"
27 #include "language/command.h"
28 #include "language/lexer/lexer.h"
29 #include "language/lexer/variable-parser.h"
31 #include "libpspp/misc.h"
32 #include "libpspp/pool.h"
34 #include "math/categoricals.h"
35 #include "math/interaction.h"
36 #include "math/moments.h"
38 #include "output/pivot-table.h"
43 #define _(msgid) gettext (msgid)
44 #define N_(msgid) (msgid)
56 typedef void *stat_create (struct pool *pool);
57 typedef void stat_update (void *stat, double w, double x);
58 typedef double stat_get (const struct per_var_data *, void *aux);
62 /* Printable title for output */
65 /* Keyword for syntax */
80 harmonic_create (struct pool *pool)
82 struct harmonic_mean *hm = pool_alloc (pool, sizeof *hm);
92 harmonic_update (void *stat, double w, double x)
94 struct harmonic_mean *hm = stat;
101 harmonic_get (const struct per_var_data *pvd UNUSED, void *stat)
103 struct harmonic_mean *hm = stat;
105 return hm->n / hm->rsum;
110 struct geometric_mean
118 geometric_create (struct pool *pool)
120 struct geometric_mean *gm = pool_alloc (pool, sizeof *gm);
130 geometric_update (void *stat, double w, double x)
132 struct geometric_mean *gm = stat;
133 gm->prod *= pow (x, w);
139 geometric_get (const struct per_var_data *pvd UNUSED, void *stat)
141 struct geometric_mean *gm = stat;
143 return pow (gm->prod, 1.0 / gm->n);
149 sum_get (const struct per_var_data *pvd, void *stat UNUSED)
153 moments1_calculate (pvd->mom, &n, &mean, 0, 0, 0);
160 n_get (const struct per_var_data *pvd, void *stat UNUSED)
164 moments1_calculate (pvd->mom, &n, 0, 0, 0, 0);
170 arithmean_get (const struct per_var_data *pvd, void *stat UNUSED)
174 moments1_calculate (pvd->mom, &n, &mean, 0, 0, 0);
180 variance_get (const struct per_var_data *pvd, void *stat UNUSED)
182 double n, mean, variance;
184 moments1_calculate (pvd->mom, &n, &mean, &variance, 0, 0);
191 stddev_get (const struct per_var_data *pvd, void *stat)
193 return sqrt (variance_get (pvd, stat));
200 skew_get (const struct per_var_data *pvd, void *stat UNUSED)
204 moments1_calculate (pvd->mom, NULL, NULL, NULL, &skew, 0);
210 sekurt_get (const struct per_var_data *pvd, void *stat UNUSED)
214 moments1_calculate (pvd->mom, &n, NULL, NULL, NULL, NULL);
216 return calc_sekurt (n);
220 seskew_get (const struct per_var_data *pvd, void *stat UNUSED)
224 moments1_calculate (pvd->mom, &n, NULL, NULL, NULL, NULL);
226 return calc_seskew (n);
230 kurt_get (const struct per_var_data *pvd, void *stat UNUSED)
234 moments1_calculate (pvd->mom, NULL, NULL, NULL, NULL, &kurt);
240 semean_get (const struct per_var_data *pvd, void *stat UNUSED)
244 moments1_calculate (pvd->mom, &n, NULL, &var, NULL, NULL);
246 return sqrt (var / n);
252 min_create (struct pool *pool)
254 double *r = pool_alloc (pool, sizeof *r);
262 min_update (void *stat, double w UNUSED, double x)
271 min_get (const struct per_var_data *pvd UNUSED, void *stat)
279 max_create (struct pool *pool)
281 double *r = pool_alloc (pool, sizeof *r);
289 max_update (void *stat, double w UNUSED, double x)
298 max_get (const struct per_var_data *pvd UNUSED, void *stat)
314 range_create (struct pool *pool)
316 struct range *r = pool_alloc (pool, sizeof *r);
325 range_update (void *stat, double w UNUSED, double x)
327 struct range *r = stat;
337 range_get (const struct per_var_data *pvd UNUSED, void *stat)
339 struct range *r = stat;
341 return r->max - r->min;
347 last_create (struct pool *pool)
349 double *l = pool_alloc (pool, sizeof *l);
355 last_update (void *stat, double w UNUSED, double x)
363 last_get (const struct per_var_data *pvd UNUSED, void *stat)
372 first_create (struct pool *pool)
374 double *f = pool_alloc (pool, sizeof *f);
382 first_update (void *stat, double w UNUSED, double x)
391 first_get (const struct per_var_data *pvd UNUSED, void *stat)
405 /* Table of cell_specs */
406 static const struct cell_spec cell_spec[] = {
407 {N_("Mean"), "MEAN", NULL, NULL, arithmean_get},
408 {N_("N"), "COUNT", NULL, NULL, n_get},
409 {N_("Std. Deviation"), "STDDEV", NULL, NULL, stddev_get},
411 {N_("Median"), "MEDIAN", NULL, NULL, NULL},
412 {N_("Group Median"), "GMEDIAN", NULL, NULL, NULL},
414 {N_("S.E. Mean"), "SEMEAN", NULL, NULL, semean_get},
415 {N_("Sum"), "SUM", NULL, NULL, sum_get},
416 {N_("Min"), "MIN", min_create, min_update, min_get},
417 {N_("Max"), "MAX", max_create, max_update, max_get},
418 {N_("Range"), "RANGE", range_create, range_update, range_get},
419 {N_("Variance"), "VARIANCE", NULL, NULL, variance_get},
420 {N_("Kurtosis"), "KURT", NULL, NULL, kurt_get},
421 {N_("S.E. Kurt"), "SEKURT", NULL, NULL, sekurt_get},
422 {N_("Skewness"), "SKEW", NULL, NULL, skew_get},
423 {N_("S.E. Skew"), "SESKEW", NULL, NULL, seskew_get},
424 {N_("First"), "FIRST", first_create, first_update, first_get},
425 {N_("Last"), "LAST", last_create, last_update, last_get},
427 {N_("Percent N"), "NPCT", NULL, NULL, NULL},
428 {N_("Percent Sum"), "SPCT", NULL, NULL, NULL},
430 {N_("Harmonic Mean"), "HARMONIC", harmonic_create, harmonic_update, harmonic_get},
431 {N_("Geom. Mean"), "GEOMETRIC", geometric_create, geometric_update, geometric_get}
434 #define n_C (sizeof (cell_spec) / sizeof (struct cell_spec))
440 casenumber non_missing;
446 size_t n_factor_vars;
447 const struct variable **factor_vars;
450 /* The thing parsed after TABLES= */
454 const struct variable **dep_vars;
457 struct layer *layers;
459 struct interaction **interactions;
460 struct summary *summary;
464 struct categoricals *cats;
469 const struct dictionary *dict;
471 struct mtable *table;
474 /* Missing value class for categorical variables */
475 enum mv_class exclude;
477 /* Missing value class for dependent variables */
478 enum mv_class dep_exclude;
480 bool listwise_exclude;
482 /* an array indicating which statistics are to be calculated */
488 /* Pool on which cell functions may allocate data */
494 run_means (struct means *cmd, struct casereader *input,
495 const struct dataset *ds);
500 parse_means_table_syntax (struct lexer *lexer, const struct means *cmd, struct mtable *table)
504 table->layers = NULL;
505 table->interactions = NULL;
507 /* Dependent variable (s) */
508 if (!parse_variables_const_pool (lexer, cmd->pool, cmd->dict,
509 &table->dep_vars, &table->n_dep_vars,
510 PV_NO_DUPLICATE | PV_NUMERIC))
513 /* Factor variable (s) */
514 while (lex_match (lexer, T_BY))
518 pool_realloc (cmd->pool, table->layers,
519 sizeof (*table->layers) * table->n_layers);
521 if (!parse_variables_const_pool
522 (lexer, cmd->pool, cmd->dict,
523 &table->layers[table->n_layers - 1].factor_vars,
524 &table->layers[table->n_layers - 1].n_factor_vars,
529 /* There is always at least one layer.
530 However the final layer is the total, and not
531 normally considered by the user as a
537 pool_realloc (cmd->pool, table->layers,
538 sizeof (*table->layers) * table->n_layers);
539 table->layers[table->n_layers - 1].factor_vars = NULL;
540 table->layers[table->n_layers - 1].n_factor_vars = 0;
546 If the match succeeds, the variable will be placed in VAR.
547 Returns true if successful */
549 lex_is_variable (struct lexer *lexer, const struct dictionary *dict,
553 if (lex_next_token (lexer, n) != T_ID)
556 tstr = lex_next_tokcstr (lexer, n);
558 if (NULL == dict_lookup_var (dict, tstr) )
566 cmd_means (struct lexer *lexer, struct dataset *ds)
572 bool more_tables = true;
574 means.pool = pool_create ();
576 means.exclude = MV_ANY;
577 means.dep_exclude = MV_ANY;
578 means.listwise_exclude = false;
582 means.dict = dataset_dict (ds);
585 means.cells = pool_calloc (means.pool, means.n_cells, sizeof (*means.cells));
588 /* The first three items (MEAN, COUNT, STDDEV) are the default */
589 for (i = 0; i < 3; ++i)
593 /* Optional TABLES = */
594 if (lex_match_id (lexer, "TABLES"))
596 if (! lex_force_match (lexer, T_EQUALS))
602 /* Parse the "tables" */
606 means.table = pool_realloc (means.pool, means.table, means.n_tables * sizeof (*means.table));
608 if (! parse_means_table_syntax (lexer, &means,
609 &means.table[means.n_tables - 1]))
614 /* Look ahead to see if there are more tables to be parsed */
616 if ( T_SLASH == lex_next_token (lexer, 0) )
618 if (lex_is_variable (lexer, means.dict, 1) )
621 lex_match (lexer, T_SLASH);
626 /* /MISSING subcommand */
627 while (lex_token (lexer) != T_ENDCMD)
629 lex_match (lexer, T_SLASH);
631 if (lex_match_id (lexer, "MISSING"))
634 If no MISSING subcommand is specified, each combination of
635 a dependent variable and categorical variables is handled
638 lex_match (lexer, T_EQUALS);
639 if (lex_match_id (lexer, "INCLUDE"))
642 Use the subcommand "/MISSING=INCLUDE" to include user-missing
643 values in the analysis.
646 means.exclude = MV_SYSTEM;
647 means.dep_exclude = MV_SYSTEM;
649 else if (lex_match_id (lexer, "TABLE"))
651 This is the default. (I think).
652 Every case containing a complete set of variables for a given
653 table. If any variable, categorical or dependent for in a table
654 is missing (as defined by what?), then that variable will
655 be dropped FOR THAT TABLE ONLY.
658 means.listwise_exclude = true;
660 else if (lex_match_id (lexer, "DEPENDENT"))
662 Use the command "/MISSING=DEPENDENT" to
663 include user-missing values for the categorical variables,
664 while excluding them for the dependent variables.
666 Cases are dropped only when user-missing values
667 appear in dependent variables. User-missing
668 values for categorical variables are treated according to
671 Cases are ALWAYS dropped when System Missing values appear
672 in the categorical variables.
675 means.dep_exclude = MV_ANY;
676 means.exclude = MV_SYSTEM;
680 lex_error (lexer, NULL);
684 else if (lex_match_id (lexer, "CELLS"))
686 lex_match (lexer, T_EQUALS);
688 /* The default values become overwritten */
690 while (lex_token (lexer) != T_ENDCMD
691 && lex_token (lexer) != T_SLASH)
694 if (lex_match (lexer, T_ALL))
698 pool_realloc (means.pool, means.cells,
699 (means.n_cells += n_C) * sizeof (*means.cells));
701 for (x = 0; x < n_C; ++x)
702 means.cells[means.n_cells - (n_C - 1 - x) - 1] = x;
704 else if (lex_match_id (lexer, "NONE"))
708 else if (lex_match_id (lexer, "DEFAULT"))
711 pool_realloc (means.pool, means.cells,
712 (means.n_cells += 3) * sizeof (*means.cells));
714 means.cells[means.n_cells - 2 - 1] = MEANS_MEAN;
715 means.cells[means.n_cells - 1 - 1] = MEANS_N;
716 means.cells[means.n_cells - 0 - 1] = MEANS_STDDEV;
722 if (lex_match_id (lexer, cell_spec[k].keyword))
725 pool_realloc (means.pool, means.cells,
726 ++means.n_cells * sizeof (*means.cells));
728 means.cells[means.n_cells - 1] = k;
735 lex_error (lexer, NULL);
742 lex_error (lexer, NULL);
749 for (t = 0; t < means.n_tables; ++t)
751 struct mtable *table = &means.table[t];
753 table->interactions =
754 pool_calloc (means.pool, table->n_layers, sizeof (*table->interactions));
757 pool_calloc (means.pool, table->n_dep_vars * table->n_layers, sizeof (*table->summary));
759 for (l = 0; l < table->n_layers; ++l)
762 const struct layer *lyr = &table->layers[l];
763 const int n_vars = lyr->n_factor_vars;
764 table->interactions[l] = interaction_create (NULL);
765 for (v = 0; v < n_vars ; ++v)
767 interaction_add_variable (table->interactions[l],
768 lyr->factor_vars[v]);
774 struct casegrouper *grouper;
775 struct casereader *group;
778 grouper = casegrouper_create_splits (proc_open (ds), means.dict);
779 while (casegrouper_get_next_group (grouper, &group))
781 run_means (&means, group, ds);
783 ok = casegrouper_destroy (grouper);
784 ok = proc_commit (ds) && ok;
787 for (t = 0; t < means.n_tables; ++t)
790 struct mtable *table = &means.table[t];
791 if (table->interactions)
792 for (l = 0; l < table->n_layers; ++l)
794 interaction_destroy (table->interactions[l]);
798 pool_destroy (means.pool);
803 for (t = 0; t < means.n_tables; ++t)
806 struct mtable *table = &means.table[t];
807 if (table->interactions)
808 for (l = 0; l < table->n_layers; ++l)
810 interaction_destroy (table->interactions[l]);
814 pool_destroy (means.pool);
820 is_missing (const struct means *cmd,
821 const struct variable *dvar,
822 const struct interaction *iact,
823 const struct ccase *c)
825 if ( interaction_case_is_missing (iact, c, cmd->exclude) )
829 if (var_is_value_missing (dvar,
837 static void output_case_processing_summary (const struct mtable *,
838 const struct variable *wv);
840 static void output_report (const struct means *, int, const struct mtable *);
845 struct per_var_data *pvd;
852 destroy_n (const void *aux1 UNUSED, void *aux2, void *user_data)
854 struct mtable *table = aux2;
856 struct per_cat_data *per_cat_data = user_data;
857 struct per_var_data *pvd = per_cat_data->pvd;
859 for (v = 0; v < table->n_dep_vars; ++v)
861 struct per_var_data *pp = &pvd[v];
862 moments1_destroy (pp->mom);
867 create_n (const void *aux1, void *aux2)
870 const struct means *means = aux1;
871 struct mtable *table = aux2;
872 struct per_cat_data *per_cat_data = pool_malloc (means->pool, sizeof *per_cat_data);
874 struct per_var_data *pvd = pool_calloc (means->pool, table->n_dep_vars, sizeof *pvd);
876 for (v = 0; v < table->n_dep_vars; ++v)
878 enum moment maxmom = MOMENT_KURTOSIS;
879 struct per_var_data *pp = &pvd[v];
881 pp->cell_stats = pool_calloc (means->pool, means->n_cells, sizeof *pp->cell_stats);
884 for (i = 0; i < means->n_cells; ++i)
886 int csi = means->cells[i];
887 const struct cell_spec *cs = &cell_spec[csi];
890 pp->cell_stats[i] = cs->sc (means->pool);
893 pp->mom = moments1_create (maxmom);
897 per_cat_data->pvd = pvd;
898 per_cat_data->warn = true;
903 update_n (const void *aux1, void *aux2, void *user_data, const struct ccase *c, double weight)
907 const struct means *means = aux1;
908 struct mtable *table = aux2;
909 struct per_cat_data *per_cat_data = user_data;
911 for (v = 0; v < table->n_dep_vars; ++v)
913 struct per_var_data *pvd = &per_cat_data->pvd[v];
915 const double x = case_data (c, table->dep_vars[v])->f;
917 for (i = 0; i < table->n_layers; ++i)
919 if ( is_missing (means, table->dep_vars[v],
920 table->interactions[i], c))
924 for (i = 0; i < means->n_cells; ++i)
926 const int csi = means->cells[i];
927 const struct cell_spec *cs = &cell_spec[csi];
931 cs->su (pvd->cell_stats[i],
935 moments1_add (pvd->mom, x, weight);
943 calculate_n (const void *aux1, void *aux2, void *user_data)
947 struct per_cat_data *per_cat_data = user_data;
948 const struct means *means = aux1;
949 struct mtable *table = aux2;
951 for (v = 0; v < table->n_dep_vars; ++v)
953 struct per_var_data *pvd = &per_cat_data->pvd[v];
954 for (i = 0; i < means->n_cells; ++i)
956 int csi = means->cells[i];
957 const struct cell_spec *cs = &cell_spec[csi];
960 cs->sd (pvd, pvd->cell_stats[i]);
966 run_means (struct means *cmd, struct casereader *input,
967 const struct dataset *ds UNUSED)
970 const struct variable *wv = dict_get_weight (cmd->dict);
972 struct casereader *reader;
974 struct payload payload;
975 payload.create = create_n;
976 payload.update = update_n;
977 payload.calculate = calculate_n;
978 payload.destroy = destroy_n;
980 for (t = 0; t < cmd->n_tables; ++t)
982 struct mtable *table = &cmd->table[t];
984 = categoricals_create (table->interactions,
985 table->n_layers, wv, cmd->exclude);
987 categoricals_set_payload (table->cats, &payload, cmd, table);
991 (c = casereader_read (reader)) != NULL; case_unref (c))
993 for (t = 0; t < cmd->n_tables; ++t)
995 bool something_missing = false;
997 struct mtable *table = &cmd->table[t];
999 for (v = 0; v < table->n_dep_vars; ++v)
1002 for (i = 0; i < table->n_layers; ++i)
1004 const bool missing =
1005 is_missing (cmd, table->dep_vars[v],
1006 table->interactions[i], c);
1009 something_missing = true;
1010 table->summary[v * table->n_layers + i].missing++;
1013 table->summary[v * table->n_layers + i].non_missing++;
1016 if ( something_missing && cmd->listwise_exclude)
1019 categoricals_update (table->cats, c);
1022 casereader_destroy (reader);
1024 for (t = 0; t < cmd->n_tables; ++t)
1026 struct mtable *table = &cmd->table[t];
1028 categoricals_done (table->cats);
1032 for (t = 0; t < cmd->n_tables; ++t)
1035 const struct mtable *table = &cmd->table[t];
1037 output_case_processing_summary (table, wv);
1039 for (i = 0; i < table->n_layers; ++i)
1041 output_report (cmd, i, table);
1043 categoricals_destroy (table->cats);
1051 output_case_processing_summary (const struct mtable *mt,
1052 const struct variable *wv)
1054 struct pivot_table *table = pivot_table_create (
1055 N_("Case Processing Summary"));
1056 pivot_table_set_weight_var (table, wv);
1058 pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"),
1059 N_("N"), PIVOT_RC_COUNT,
1060 N_("Percent"), PIVOT_RC_PERCENT);
1061 pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Cases"),
1062 N_("Included"), N_("Excluded"), N_("Total"))
1063 ->root->show_label = true;
1065 struct pivot_dimension *tables = pivot_dimension_create (
1066 table, PIVOT_AXIS_ROW, N_("Tables"));
1068 for (size_t v = 0; v < mt->n_dep_vars; ++v)
1070 const struct variable *var = mt->dep_vars[v];
1071 for (size_t i = 0; i < mt->n_layers; ++i)
1073 const int row = v * mt->n_layers + i;
1074 const struct interaction *iact = mt->interactions[i];
1076 struct string str = DS_EMPTY_INITIALIZER;
1077 ds_put_format (&str, "%s: ", var_to_string (var));
1078 interaction_to_string (iact, &str);
1079 int table_idx = pivot_category_create_leaf (
1080 tables->root, pivot_value_new_user_text_nocopy (
1081 ds_steal_cstr (&str)));
1083 const struct summary *s = &mt->summary[row];
1084 double n_total = s->missing + s->non_missing;
1093 { 0, 0, s->non_missing },
1094 { 1, 0, s->non_missing / n_total * 100.0 },
1095 { 0, 1, s->missing },
1096 { 1, 1, s->missing / n_total * 100.0 },
1101 for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
1103 const struct entry *e = &entries[j];
1104 pivot_table_put3 (table, e->stat_idx, e->case_idx, table_idx,
1105 pivot_value_new_number (e->x));
1110 pivot_table_submit (table);
1114 create_interaction_dimensions (struct pivot_table *table,
1115 const struct categoricals *cats,
1116 const struct interaction *iact)
1118 for (size_t i = iact->n_vars; i-- > 0; )
1120 const struct variable *var = iact->vars[i];
1121 struct pivot_dimension *d = pivot_dimension_create__ (
1122 table, PIVOT_AXIS_ROW, pivot_value_new_variable (var));
1123 d->root->show_label = true;
1126 union value *values = categoricals_get_var_values (cats, var, &n);
1127 for (size_t j = 0; j < n; j++)
1128 pivot_category_create_leaf (
1129 d->root, pivot_value_new_var_value (var, &values[j]));
1134 output_report (const struct means *cmd, int iact_idx,
1135 const struct mtable *mt)
1137 struct pivot_table *table = pivot_table_create (N_("Report"));
1138 table->omit_empty = true;
1140 struct pivot_dimension *statistics = pivot_dimension_create (
1141 table, PIVOT_AXIS_COLUMN, N_("Statistics"));
1142 for (int i = 0; i < cmd->n_cells; ++i)
1143 pivot_category_create_leaf (
1144 statistics->root, pivot_value_new_text (cell_spec[cmd->cells[i]].title));
1146 const struct interaction *iact = mt->interactions[iact_idx];
1147 create_interaction_dimensions (table, mt->cats, iact);
1149 struct pivot_dimension *dep_dim = pivot_dimension_create (
1150 table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
1152 size_t *indexes = xnmalloc (table->n_dimensions, sizeof *indexes);
1154 size_t n_cats = categoricals_n_count (mt->cats, iact_idx);
1155 for (size_t v = 0; v < mt->n_dep_vars; ++v)
1157 indexes[table->n_dimensions - 1] = pivot_category_create_leaf (
1158 dep_dim->root, pivot_value_new_variable (mt->dep_vars[v]));
1160 for (size_t i = 0; i < n_cats; ++i)
1162 for (size_t j = 0; j < iact->n_vars; j++)
1164 int idx = categoricals_get_value_index_by_category_real (
1165 mt->cats, iact_idx, i, j);
1166 indexes[table->n_dimensions - 2 - j] = idx;
1169 struct per_cat_data *per_cat_data
1170 = categoricals_get_user_data_by_category_real (
1171 mt->cats, iact_idx, i);
1173 const struct per_var_data *pvd = &per_cat_data->pvd[v];
1174 for (int stat_idx = 0; stat_idx < cmd->n_cells; ++stat_idx)
1176 indexes[0] = stat_idx;
1177 const int csi = cmd->cells[stat_idx];
1178 const struct cell_spec *cs = &cell_spec[csi];
1180 double result = cs->sd (pvd, pvd->cell_stats[stat_idx]);
1181 pivot_table_put (table, indexes, table->n_dimensions,
1182 pivot_value_new_number (result));
1188 pivot_table_submit (table);