1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2011, 2012 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/case.h"
20 #include "data/casegrouper.h"
21 #include "data/casereader.h"
22 #include "data/dataset.h"
23 #include "data/dictionary.h"
24 #include "data/format.h"
25 #include "data/variable.h"
27 #include "language/command.h"
28 #include "language/lexer/lexer.h"
29 #include "language/lexer/variable-parser.h"
31 #include "libpspp/misc.h"
32 #include "libpspp/pool.h"
34 #include "math/categoricals.h"
35 #include "math/interaction.h"
36 #include "math/moments.h"
38 #include "output/tab.h"
43 #define _(msgid) gettext (msgid)
44 #define N_(msgid) (msgid)
56 typedef void *stat_create (struct pool *pool);
57 typedef void stat_update (void *stat, double w, double x);
58 typedef double stat_get (const struct per_var_data *, void *aux);
62 /* Printable title for output */
65 /* Keyword for syntax */
80 harmonic_create (struct pool *pool)
82 struct harmonic_mean *hm = pool_alloc (pool, sizeof *hm);
92 harmonic_update (void *stat, double w, double x)
94 struct harmonic_mean *hm = stat;
101 harmonic_get (const struct per_var_data *pvd UNUSED, void *stat)
103 struct harmonic_mean *hm = stat;
105 return hm->n / hm->rsum;
110 struct geometric_mean
118 geometric_create (struct pool *pool)
120 struct geometric_mean *gm = pool_alloc (pool, sizeof *gm);
130 geometric_update (void *stat, double w, double x)
132 struct geometric_mean *gm = stat;
133 gm->prod *= pow (x, w);
139 geometric_get (const struct per_var_data *pvd UNUSED, void *stat)
141 struct geometric_mean *gm = stat;
143 return pow (gm->prod, 1.0 / gm->n);
149 sum_get (const struct per_var_data *pvd, void *stat UNUSED)
153 moments1_calculate (pvd->mom, &n, &mean, 0, 0, 0);
160 n_get (const struct per_var_data *pvd, void *stat UNUSED)
164 moments1_calculate (pvd->mom, &n, 0, 0, 0, 0);
170 arithmean_get (const struct per_var_data *pvd, void *stat UNUSED)
174 moments1_calculate (pvd->mom, &n, &mean, 0, 0, 0);
180 variance_get (const struct per_var_data *pvd, void *stat UNUSED)
182 double n, mean, variance;
184 moments1_calculate (pvd->mom, &n, &mean, &variance, 0, 0);
191 stddev_get (const struct per_var_data *pvd, void *stat)
193 return sqrt (variance_get (pvd, stat));
200 skew_get (const struct per_var_data *pvd, void *stat UNUSED)
204 moments1_calculate (pvd->mom, NULL, NULL, NULL, &skew, 0);
210 sekurt_get (const struct per_var_data *pvd, void *stat UNUSED)
214 moments1_calculate (pvd->mom, &n, NULL, NULL, NULL, NULL);
216 return calc_sekurt (n);
220 seskew_get (const struct per_var_data *pvd, void *stat UNUSED)
224 moments1_calculate (pvd->mom, &n, NULL, NULL, NULL, NULL);
226 return calc_seskew (n);
230 kurt_get (const struct per_var_data *pvd, void *stat UNUSED)
234 moments1_calculate (pvd->mom, NULL, NULL, NULL, NULL, &kurt);
240 semean_get (const struct per_var_data *pvd, void *stat UNUSED)
244 moments1_calculate (pvd->mom, &n, NULL, &var, NULL, NULL);
246 return sqrt (var / n);
252 min_create (struct pool *pool)
254 double *r = pool_alloc (pool, sizeof *r);
262 min_update (void *stat, double w UNUSED, double x)
271 min_get (const struct per_var_data *pvd UNUSED, void *stat)
279 max_create (struct pool *pool)
281 double *r = pool_alloc (pool, sizeof *r);
289 max_update (void *stat, double w UNUSED, double x)
298 max_get (const struct per_var_data *pvd UNUSED, void *stat)
314 range_create (struct pool *pool)
316 struct range *r = pool_alloc (pool, sizeof *r);
325 range_update (void *stat, double w UNUSED, double x)
327 struct range *r = stat;
337 range_get (const struct per_var_data *pvd UNUSED, void *stat)
339 struct range *r = stat;
341 return r->max - r->min;
347 last_create (struct pool *pool)
349 double *l = pool_alloc (pool, sizeof *l);
355 last_update (void *stat, double w UNUSED, double x)
363 last_get (const struct per_var_data *pvd UNUSED, void *stat)
372 first_create (struct pool *pool)
374 double *f = pool_alloc (pool, sizeof *f);
382 first_update (void *stat, double w UNUSED, double x)
391 first_get (const struct per_var_data *pvd UNUSED, void *stat)
405 /* Table of cell_specs.
   Each entry supplies, in order: a printable title, the syntax keyword,
   and three function pointers -- create, update and get (called elsewhere
   in this file as cs->sc, cs->su and cs->sd).  The values stored in
   means.cells are indexes into this table, and the first three entries
   (MEAN, COUNT, STDDEV) are the default cells.

   Entries with NULL create/update functions take their result from the
   moments accumulated in per_var_data (pvd->mom); entries that supply
   all three functions keep their own state, allocated from a pool by
   the create function. */
406 static const struct cell_spec cell_spec[] = {
407 {N_("Mean"), "MEAN", NULL, NULL, arithmean_get},
408 {N_("N"), "COUNT", NULL, NULL, n_get},
409 {N_("Std. Deviation"), "STDDEV", NULL, NULL, stddev_get},
/* NOTE(review): MEDIAN and GMEDIAN have a NULL get function; confirm that
   every call through cs->sd is guarded before these keywords are used. */
411 {N_("Median"), "MEDIAN", NULL, NULL, NULL},
412 {N_("Group Median"), "GMEDIAN", NULL, NULL, NULL},
414 {N_("S.E. Mean"), "SEMEAN", NULL, NULL, semean_get},
415 {N_("Sum"), "SUM", NULL, NULL, sum_get},
416 {N_("Min"), "MIN", min_create, min_update, min_get},
417 {N_("Max"), "MAX", max_create, max_update, max_get},
418 {N_("Range"), "RANGE", range_create, range_update, range_get},
419 {N_("Variance"), "VARIANCE", NULL, NULL, variance_get},
420 {N_("Kurtosis"), "KURT", NULL, NULL, kurt_get},
421 {N_("S.E. Kurt"), "SEKURT", NULL, NULL, sekurt_get},
422 {N_("Skewness"), "SKEW", NULL, NULL, skew_get},
423 {N_("S.E. Skew"), "SESKEW", NULL, NULL, seskew_get},
424 {N_("First"), "FIRST", first_create, first_update, first_get},
425 {N_("Last"), "LAST", last_create, last_update, last_get},
/* NOTE(review): NPCT and SPCT likewise have a NULL get function; same
   caveat as for MEDIAN/GMEDIAN -- verify against the callers. */
427 {N_("Percent N"), "NPCT", NULL, NULL, NULL},
428 {N_("Percent Sum"), "SPCT", NULL, NULL, NULL},
430 {N_("Harmonic Mean"), "HARMONIC", harmonic_create, harmonic_update, harmonic_get},
431 {N_("Geom. Mean"), "GEOMETRIC", geometric_create, geometric_update, geometric_get}
434 #define n_C (sizeof (cell_spec) / sizeof (struct cell_spec))
440 casenumber non_missing;
446 size_t n_factor_vars;
447 const struct variable **factor_vars;
450 /* The thing parsed after TABLES= */
454 const struct variable **dep_vars;
457 struct layer *layers;
459 struct interaction **interactions;
460 struct summary *summary;
464 struct categoricals *cats;
469 const struct dictionary *dict;
471 struct mtable *table;
474 /* Missing value class for categorical variables */
475 enum mv_class exclude;
477 /* Missing value class for dependent variables */
478 enum mv_class dep_exclude;
480 bool listwise_exclude;
482 /* an array indicating which statistics are to be calculated */
488 /* Pool on which cell functions may allocate data */
494 run_means (struct means *cmd, struct casereader *input,
495 const struct dataset *ds);
500 parse_means_table_syntax (struct lexer *lexer, const struct means *cmd, struct mtable *table)
504 table->layers = NULL;
506 /* Dependent variable (s) */
507 if (!parse_variables_const (lexer, cmd->dict,
508 &table->dep_vars, &table->n_dep_vars,
509 PV_NO_DUPLICATE | PV_NUMERIC))
512 /* Factor variable (s) */
513 while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH)
515 if (lex_match (lexer, T_BY))
519 xrealloc (table->layers,
520 sizeof (*table->layers) * table->n_layers);
522 if (!parse_variables_const
524 &table->layers[table->n_layers - 1].factor_vars,
525 &table->layers[table->n_layers - 1].n_factor_vars,
532 /* There is always at least one layer.
533 However the final layer is the total, and not
534 normally considered by the user as a
540 xrealloc (table->layers,
541 sizeof (*table->layers) * table->n_layers);
542 table->layers[table->n_layers - 1].factor_vars = NULL;
543 table->layers[table->n_layers - 1].n_factor_vars = 0;
549 Tests whether the token N tokens ahead is an identifier naming a variable in DICT.
550 Returns true if successful */
552 lex_is_variable (struct lexer *lexer, const struct dictionary *dict,
556 if (lex_next_token (lexer, n) != T_ID)
559 tstr = lex_next_tokcstr (lexer, n);
561 if (NULL == dict_lookup_var (dict, tstr) )
569 cmd_means (struct lexer *lexer, struct dataset *ds)
575 bool more_tables = true;
577 means.exclude = MV_ANY;
578 means.dep_exclude = MV_ANY;
579 means.listwise_exclude = false;
583 means.dict = dataset_dict (ds);
586 means.cells = xcalloc (means.n_cells, sizeof (*means.cells));
589 /* The first three items (MEAN, COUNT, STDDEV) are the default */
590 for (i = 0; i < 3; ++i)
594 /* Optional TABLES = */
595 if (lex_match_id (lexer, "TABLES"))
597 lex_force_match (lexer, T_EQUALS);
602 /* Parse the "tables" */
606 means.table = xrealloc (means.table, means.n_tables * sizeof (*means.table));
608 if (! parse_means_table_syntax (lexer, &means,
609 &means.table[means.n_tables - 1]))
614 /* Look ahead to see if there are more tables to be parsed */
616 if ( T_SLASH == lex_next_token (lexer, 0) )
618 if (lex_is_variable (lexer, means.dict, 1) )
621 lex_force_match (lexer, T_SLASH);
626 /* /MISSING subcommand */
627 while (lex_token (lexer) != T_ENDCMD)
629 lex_match (lexer, T_SLASH);
631 if (lex_match_id (lexer, "MISSING"))
634 If no MISSING subcommand is specified, each combination of
635 a dependent variable and categorical variables is handled
638 lex_match (lexer, T_EQUALS);
639 if (lex_match_id (lexer, "INCLUDE"))
642 Use the subcommand "/MISSING=INCLUDE" to include user-missing
643 values in the analysis.
646 means.exclude = MV_SYSTEM;
647 means.dep_exclude = MV_SYSTEM;
649 else if (lex_match_id (lexer, "TABLE"))
651 This is the default. (I think).
652 Every case containing a complete set of variables for a given
653 table. If any variable, categorical or dependent, in a table
654 is missing (as defined by what?), then that variable will
655 be dropped FOR THAT TABLE ONLY.
658 means.listwise_exclude = true;
660 else if (lex_match_id (lexer, "DEPENDENT"))
662 Use the command "/MISSING=DEPENDENT" to
663 include user-missing values for the categorical variables,
664 while excluding them for the dependent variables.
666 Cases are dropped only when user-missing values
667 appear in dependent variables. User-missing
668 values for categorical variables are treated according to
671 Cases are ALWAYS dropped when System Missing values appear
672 in the categorical variables.
675 means.dep_exclude = MV_ANY;
676 means.exclude = MV_SYSTEM;
680 lex_error (lexer, NULL);
684 else if (lex_match_id (lexer, "CELLS"))
686 lex_match (lexer, T_EQUALS);
688 /* The default values are overwritten */
690 while (lex_token (lexer) != T_ENDCMD
691 && lex_token (lexer) != T_SLASH)
694 if (lex_match (lexer, T_ALL))
698 xrealloc (means.cells,
699 (means.n_cells += n_C) * sizeof (*means.cells));
701 for (x = 0; x < n_C; ++x)
702 means.cells[means.n_cells - (n_C - 1 - x) - 1] = x;
704 else if (lex_match_id (lexer, "NONE"))
708 else if (lex_match_id (lexer, "DEFAULT"))
711 xrealloc (means.cells,
712 (means.n_cells += 3) * sizeof (*means.cells));
714 means.cells[means.n_cells - 2 - 1] = MEANS_MEAN;
715 means.cells[means.n_cells - 1 - 1] = MEANS_N;
716 means.cells[means.n_cells - 0 - 1] = MEANS_STDDEV;
722 if (lex_match_id (lexer, cell_spec[k].keyword))
725 xrealloc (means.cells,
726 ++means.n_cells * sizeof (*means.cells));
728 means.cells[means.n_cells - 1] = k;
735 lex_error (lexer, NULL);
742 lex_error (lexer, NULL);
747 means.pool = pool_create ();
750 for (t = 0; t < means.n_tables; ++t)
752 struct mtable *table = &means.table[t];
754 table->interactions =
755 xcalloc (table->n_layers, sizeof (*table->interactions));
758 xcalloc (table->n_dep_vars * table->n_layers, sizeof (*table->summary));
760 for (l = 0; l < table->n_layers; ++l)
763 const struct layer *lyr = &table->layers[l];
764 const int n_vars = lyr->n_factor_vars;
765 table->interactions[l] = interaction_create (NULL);
766 for (v = 0 ; v < n_vars ; ++v)
768 interaction_add_variable (table->interactions[l],
769 lyr->factor_vars[v]);
775 struct casegrouper *grouper;
776 struct casereader *group;
779 grouper = casegrouper_create_splits (proc_open (ds), means.dict);
780 while (casegrouper_get_next_group (grouper, &group))
782 run_means (&means, group, ds);
784 ok = casegrouper_destroy (grouper);
785 ok = proc_commit (ds) && ok;
798 is_missing (const struct means *cmd,
799 const struct variable *dvar,
800 const struct interaction *iact,
801 const struct ccase *c)
803 if ( interaction_case_is_missing (iact, c, cmd->exclude) )
807 if (var_is_value_missing (dvar,
815 static void output_case_processing_summary (const struct mtable *);
817 static void output_report (const struct means *, int, const struct mtable *);
822 struct per_var_data *pvd;
828 create_n (const void *aux1, void *aux2)
831 const struct means *means = aux1;
832 struct mtable *table = aux2;
833 struct per_cat_data *per_cat_data = xmalloc (sizeof *per_cat_data);
835 struct per_var_data *pvd = xcalloc (table->n_dep_vars, sizeof *pvd);
837 for (v = 0; v < table->n_dep_vars; ++v)
839 enum moment maxmom = MOMENT_KURTOSIS;
840 struct per_var_data *pp = &pvd[v];
842 pp->cell_stats = xcalloc (means->n_cells, sizeof *pp->cell_stats);
845 for (i = 0; i < means->n_cells; ++i)
847 int csi = means->cells[i];
848 const struct cell_spec *cs = &cell_spec[csi];
851 pp->cell_stats[i] = cs->sc (means->pool);
854 pp->mom = moments1_create (maxmom);
858 per_cat_data->pvd = pvd;
859 per_cat_data->warn = true;
864 update_n (const void *aux1, void *aux2, void *user_data, const struct ccase *c, double weight)
868 const struct means *means = aux1;
869 struct mtable *table = aux2;
870 struct per_cat_data *per_cat_data = user_data;
872 for (v = 0; v < table->n_dep_vars; ++v)
874 struct per_var_data *pvd = &per_cat_data->pvd[v];
876 const double x = case_data (c, table->dep_vars[v])->f;
878 for (i = 0; i < table->n_layers; ++i)
880 if ( is_missing (means, table->dep_vars[v],
881 table->interactions[i], c))
885 for (i = 0; i < means->n_cells; ++i)
887 const int csi = means->cells[i];
888 const struct cell_spec *cs = &cell_spec[csi];
892 cs->su (pvd->cell_stats[i],
896 moments1_add (pvd->mom, x, weight);
904 calculate_n (const void *aux1, void *aux2, void *user_data)
908 struct per_cat_data *per_cat_data = user_data;
909 const struct means *means = aux1;
910 struct mtable *table = aux2;
912 for (v = 0; v < table->n_dep_vars; ++v)
914 struct per_var_data *pvd = &per_cat_data->pvd[v];
915 for (i = 0; i < means->n_cells; ++i)
917 int csi = means->cells[i];
918 const struct cell_spec *cs = &cell_spec[csi];
921 cs->sd (pvd, pvd->cell_stats[i]);
927 run_means (struct means *cmd, struct casereader *input,
928 const struct dataset *ds UNUSED)
931 const struct variable *wv = dict_get_weight (cmd->dict);
933 struct casereader *reader;
935 struct payload payload;
936 payload.create = create_n;
937 payload.update = update_n;
938 payload.destroy = calculate_n;
940 for (t = 0; t < cmd->n_tables; ++t)
942 struct mtable *table = &cmd->table[t];
944 = categoricals_create (table->interactions,
945 table->n_layers, wv, cmd->exclude);
947 categoricals_set_payload (table->cats, &payload, cmd, table);
950 for (reader = casereader_clone (input);
951 (c = casereader_read (reader)) != NULL; case_unref (c))
953 for (t = 0; t < cmd->n_tables; ++t)
955 bool something_missing = false;
957 struct mtable *table = &cmd->table[t];
959 for (v = 0; v < table->n_dep_vars; ++v)
962 for (i = 0; i < table->n_layers; ++i)
965 is_missing (cmd, table->dep_vars[v],
966 table->interactions[i], c);
969 something_missing = true;
970 table->summary[v * table->n_layers + i].missing++;
973 table->summary[v * table->n_layers + i].non_missing++;
976 if ( something_missing && cmd->listwise_exclude)
979 categoricals_update (table->cats, c);
982 casereader_destroy (reader);
984 for (t = 0; t < cmd->n_tables; ++t)
986 struct mtable *table = &cmd->table[t];
988 categoricals_done (table->cats);
992 for (t = 0; t < cmd->n_tables; ++t)
995 const struct mtable *table = &cmd->table[t];
997 output_case_processing_summary (table);
999 for (i = 0; i < table->n_layers; ++i)
1001 output_report (cmd, i, table);
1003 categoricals_destroy (table->cats);
1011 output_case_processing_summary (const struct mtable *table)
1014 const int heading_columns = 1;
1015 const int heading_rows = 3;
1016 struct tab_table *t;
1018 const int nr = heading_rows + table->n_layers * table->n_dep_vars;
1021 t = tab_create (nc, nr);
1022 tab_title (t, _("Case Processing Summary"));
1024 tab_headers (t, heading_columns, 0, heading_rows, 0);
1026 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, nc - 1, nr - 1);
1028 tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
1029 tab_vline (t, TAL_2, heading_columns, 0, nr - 1);
1032 tab_joint_text (t, heading_columns, 0,
1033 nc - 1, 0, TAB_CENTER | TAT_TITLE, _("Cases"));
1035 tab_joint_text (t, 1, 1, 2, 1, TAB_CENTER | TAT_TITLE, _("Included"));
1036 tab_joint_text (t, 3, 1, 4, 1, TAB_CENTER | TAT_TITLE, _("Excluded"));
1037 tab_joint_text (t, 5, 1, 6, 1, TAB_CENTER | TAT_TITLE, _("Total"));
1039 tab_hline (t, TAL_1, heading_columns, nc - 1, 1);
1040 tab_hline (t, TAL_1, heading_columns, nc - 1, 2);
1043 for (i = 0; i < 3; ++i)
1045 tab_text (t, heading_columns + i * 2, 2, TAB_CENTER | TAT_TITLE,
1047 tab_text (t, heading_columns + i * 2 + 1, 2, TAB_CENTER | TAT_TITLE,
1051 for (v = 0; v < table->n_dep_vars; ++v)
1053 const struct variable *var = table->dep_vars[v];
1054 const char *dv_name = var_to_string (var);
1055 for (i = 0; i < table->n_layers; ++i)
1057 const int row = v * table->n_layers + i;
1058 const struct interaction *iact = table->interactions[i];
1062 ds_init_cstr (&str, dv_name);
1063 ds_put_cstr (&str, ": ");
1065 interaction_to_string (iact, &str);
1067 tab_text (t, 0, row + heading_rows,
1068 TAB_LEFT | TAT_TITLE, ds_cstr (&str));
1071 n_total = table->summary[row].missing +
1072 table->summary[row].non_missing;
1074 tab_double (t, 1, row + heading_rows,
1075 0, table->summary[row].non_missing, &F_8_0);
1077 tab_text_format (t, 2, row + heading_rows,
1079 table->summary[row].non_missing / (double) n_total * 100.0);
1082 tab_double (t, 3, row + heading_rows,
1083 0, table->summary[row].missing, &F_8_0);
1086 tab_text_format (t, 4, row + heading_rows,
1088 table->summary[row].missing / (double) n_total * 100.0);
1091 tab_double (t, 5, row + heading_rows,
1092 0, table->summary[row].missing +
1093 table->summary[row].non_missing, &F_8_0);
1095 tab_text_format (t, 6, row + heading_rows,
1097 n_total / (double) n_total * 100.0);
1109 output_report (const struct means *cmd, int iact_idx,
1110 const struct mtable *table)
1115 const struct interaction *iact = table->interactions[iact_idx];
1117 const int heading_columns = 1 + iact->n_vars;
1118 const int heading_rows = 1;
1119 struct tab_table *t;
1121 const int n_cats = categoricals_n_count (table->cats, iact_idx);
1123 const int nr = n_cats * table->n_dep_vars + heading_rows;
1125 const int nc = heading_columns + cmd->n_cells;
1127 t = tab_create (nc, nr);
1128 tab_title (t, _("Report"));
1130 tab_headers (t, heading_columns, 0, heading_rows, 0);
1132 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, nc - 1, nr - 1);
1134 tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
1135 tab_vline (t, TAL_2, iact->n_vars, 0, nr - 1);
1137 for (i = 0; i < iact->n_vars; ++i)
1139 tab_text (t, 1 + i, 0, TAB_CENTER | TAT_TITLE,
1140 var_to_string (iact->vars[i]));
1143 for (i = 0; i < cmd->n_cells; ++i)
1145 tab_text (t, heading_columns + i, 0,
1146 TAB_CENTER | TAT_TITLE,
1147 gettext (cell_spec[cmd->cells[i]].title));
1151 for (i = 0; i < n_cats; ++i)
1154 const struct ccase *c =
1155 categoricals_get_case_by_category_real (table->cats, iact_idx, i);
1157 for (dv = 0; dv < table->n_dep_vars; ++dv)
1160 heading_rows + dv * n_cats,
1161 TAB_RIGHT | TAT_TITLE,
1162 var_get_name (table->dep_vars[dv])
1166 tab_hline (t, TAL_1, 0, nc - 1, heading_rows + dv * n_cats);
1168 for (v = 0; v < iact->n_vars; ++v)
1170 const struct variable *var = iact->vars[v];
1171 const union value *val = case_data (c, var);
1173 ds_init_empty (&str);
1174 var_append_value_name (var, val, &str);
1176 tab_text (t, 1 + v, heading_rows + dv * n_cats + i,
1177 TAB_RIGHT | TAT_TITLE, ds_cstr (&str));
1184 for (grp = 0; grp < n_cats; ++grp)
1187 struct per_cat_data *per_cat_data =
1188 categoricals_get_user_data_by_category_real (table->cats, iact_idx, grp);
1190 for (dv = 0; dv < table->n_dep_vars; ++dv)
1192 const struct per_var_data *pvd = &per_cat_data->pvd[dv];
1193 for (i = 0; i < cmd->n_cells; ++i)
1195 const int csi = cmd->cells[i];
1196 const struct cell_spec *cs = &cell_spec[csi];
1198 double result = cs->sd (pvd, pvd->cell_stats[i]);
1200 tab_double (t, heading_columns + i,
1201 heading_rows + grp + dv * n_cats,