1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2011, 2012 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/case.h"
20 #include "data/casegrouper.h"
21 #include "data/casereader.h"
22 #include "data/dataset.h"
23 #include "data/dictionary.h"
24 #include "data/format.h"
25 #include "data/variable.h"
27 #include "language/command.h"
28 #include "language/lexer/lexer.h"
29 #include "language/lexer/variable-parser.h"
31 #include "libpspp/misc.h"
32 #include "libpspp/pool.h"
34 #include "math/categoricals.h"
35 #include "math/interaction.h"
36 #include "math/moments.h"
38 #include "output/tab.h"
43 #define _(msgid) gettext (msgid)
44 #define N_(msgid) (msgid)
56 typedef void *stat_create (struct pool *pool);
57 typedef void stat_update (void *stat, double w, double x);
58 typedef double stat_get (const struct per_var_data *, void *aux);
62 /* Printable title for output */
65 /* Keyword for syntax */
80 harmonic_create (struct pool *pool)
82 struct harmonic_mean *hm = pool_alloc (pool, sizeof *hm);
92 harmonic_update (void *stat, double w, double x)
94 struct harmonic_mean *hm = stat;
101 harmonic_get (const struct per_var_data *pvd UNUSED, void *stat)
103 struct harmonic_mean *hm = stat;
105 return hm->n / hm->rsum;
110 struct geometric_mean
118 geometric_create (struct pool *pool)
120 struct geometric_mean *gm = pool_alloc (pool, sizeof *gm);
130 geometric_update (void *stat, double w, double x)
132 struct geometric_mean *gm = stat;
133 gm->prod *= pow (x, w);
139 geometric_get (const struct per_var_data *pvd UNUSED, void *stat)
141 struct geometric_mean *gm = stat;
143 return pow (gm->prod, 1.0 / gm->n);
149 sum_get (const struct per_var_data *pvd, void *stat UNUSED)
153 moments1_calculate (pvd->mom, &n, &mean, 0, 0, 0);
160 n_get (const struct per_var_data *pvd, void *stat UNUSED)
164 moments1_calculate (pvd->mom, &n, 0, 0, 0, 0);
170 arithmean_get (const struct per_var_data *pvd, void *stat UNUSED)
174 moments1_calculate (pvd->mom, &n, &mean, 0, 0, 0);
180 variance_get (const struct per_var_data *pvd, void *stat UNUSED)
182 double n, mean, variance;
184 moments1_calculate (pvd->mom, &n, &mean, &variance, 0, 0);
191 stddev_get (const struct per_var_data *pvd, void *stat)
193 return sqrt (variance_get (pvd, stat));
200 skew_get (const struct per_var_data *pvd, void *stat UNUSED)
204 moments1_calculate (pvd->mom, NULL, NULL, NULL, &skew, 0);
210 sekurt_get (const struct per_var_data *pvd, void *stat UNUSED)
214 moments1_calculate (pvd->mom, &n, NULL, NULL, NULL, NULL);
216 return calc_sekurt (n);
220 seskew_get (const struct per_var_data *pvd, void *stat UNUSED)
224 moments1_calculate (pvd->mom, &n, NULL, NULL, NULL, NULL);
226 return calc_seskew (n);
230 kurt_get (const struct per_var_data *pvd, void *stat UNUSED)
234 moments1_calculate (pvd->mom, NULL, NULL, NULL, NULL, &kurt);
240 semean_get (const struct per_var_data *pvd, void *stat UNUSED)
244 moments1_calculate (pvd->mom, &n, NULL, &var, NULL, NULL);
246 return sqrt (var / n);
252 min_create (struct pool *pool)
254 double *r = pool_alloc (pool, sizeof *r);
262 min_update (void *stat, double w UNUSED, double x)
271 min_get (const struct per_var_data *pvd UNUSED, void *stat)
279 max_create (struct pool *pool)
281 double *r = pool_alloc (pool, sizeof *r);
289 max_update (void *stat, double w UNUSED, double x)
298 max_get (const struct per_var_data *pvd UNUSED, void *stat)
314 range_create (struct pool *pool)
316 struct range *r = pool_alloc (pool, sizeof *r);
325 range_update (void *stat, double w UNUSED, double x)
327 struct range *r = stat;
337 range_get (const struct per_var_data *pvd UNUSED, void *stat)
339 struct range *r = stat;
341 return r->max - r->min;
347 last_create (struct pool *pool)
349 double *l = pool_alloc (pool, sizeof *l);
355 last_update (void *stat, double w UNUSED, double x)
363 last_get (const struct per_var_data *pvd UNUSED, void *stat)
372 first_create (struct pool *pool)
374 double *f = pool_alloc (pool, sizeof *f);
382 first_update (void *stat, double w UNUSED, double x)
391 first_get (const struct per_var_data *pvd UNUSED, void *stat)
405 /* Table of cell_specs */
406 static const struct cell_spec cell_spec[] = {
407 {N_("Mean"), "MEAN", NULL, NULL, arithmean_get},
408 {N_("N"), "COUNT", NULL, NULL, n_get},
409 {N_("Std. Deviation"), "STDDEV", NULL, NULL, stddev_get},
411 {N_("Median"), "MEDIAN", NULL, NULL, NULL},
412 {N_("Group Median"), "GMEDIAN", NULL, NULL, NULL},
414 {N_("S.E. Mean"), "SEMEAN", NULL, NULL, semean_get},
415 {N_("Sum"), "SUM", NULL, NULL, sum_get},
416 {N_("Min"), "MIN", min_create, min_update, min_get},
417 {N_("Max"), "MAX", max_create, max_update, max_get},
418 {N_("Range"), "RANGE", range_create, range_update, range_get},
419 {N_("Variance"), "VARIANCE", NULL, NULL, variance_get},
420 {N_("Kurtosis"), "KURT", NULL, NULL, kurt_get},
421 {N_("S.E. Kurt"), "SEKURT", NULL, NULL, sekurt_get},
422 {N_("Skewness"), "SKEW", NULL, NULL, skew_get},
423 {N_("S.E. Skew"), "SESKEW", NULL, NULL, seskew_get},
424 {N_("First"), "FIRST", first_create, first_update, first_get},
425 {N_("Last"), "LAST", last_create, last_update, last_get},
427 {N_("Percent N"), "NPCT", NULL, NULL, NULL},
428 {N_("Percent Sum"), "SPCT", NULL, NULL, NULL},
430 {N_("Harmonic Mean"), "HARMONIC", harmonic_create, harmonic_update, harmonic_get},
431 {N_("Geom. Mean"), "GEOMETRIC", geometric_create, geometric_update, geometric_get}
434 #define n_C (sizeof (cell_spec) / sizeof (struct cell_spec))
440 casenumber non_missing;
446 size_t n_factor_vars;
447 const struct variable **factor_vars;
450 /* The thing parsed after TABLES= */
454 const struct variable **dep_vars;
457 struct layer *layers;
459 struct interaction **interactions;
460 struct summary *summary;
464 struct categoricals *cats;
469 const struct dictionary *dict;
471 struct mtable *table;
474 /* Missing value class for categorical variables */
475 enum mv_class exclude;
477 /* Missing value class for dependent variables */
478 enum mv_class dep_exclude;
480 bool listwise_exclude;
482 /* an array indicating which statistics are to be calculated */
488 /* Pool on which cell functions may allocate data */
494 run_means (struct means *cmd, struct casereader *input,
495 const struct dataset *ds);
500 parse_means_table_syntax (struct lexer *lexer, const struct means *cmd, struct mtable *table)
504 table->layers = NULL;
506 /* Dependent variable (s) */
507 if (!parse_variables_const (lexer, cmd->dict,
508 &table->dep_vars, &table->n_dep_vars,
509 PV_NO_DUPLICATE | PV_NUMERIC))
512 /* Factor variable (s) */
513 while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH)
515 if (lex_match (lexer, T_BY))
519 pool_realloc (cmd->pool, table->layers,
520 sizeof (*table->layers) * table->n_layers);
522 if (!parse_variables_const
524 &table->layers[table->n_layers - 1].factor_vars,
525 &table->layers[table->n_layers - 1].n_factor_vars,
532 /* There is always at least one layer.
533 However the final layer is the total, and not
534 normally considered by the user as a
540 pool_realloc (cmd->pool, table->layers,
541 sizeof (*table->layers) * table->n_layers);
542 table->layers[table->n_layers - 1].factor_vars = NULL;
543 table->layers[table->n_layers - 1].n_factor_vars = 0;
549 The match succeeds if the token N positions ahead is an identifier naming a variable in DICT.
550 Returns true if successful */
552 lex_is_variable (struct lexer *lexer, const struct dictionary *dict,
556 if (lex_next_token (lexer, n) != T_ID)
559 tstr = lex_next_tokcstr (lexer, n);
561 if (NULL == dict_lookup_var (dict, tstr) )
569 cmd_means (struct lexer *lexer, struct dataset *ds)
575 bool more_tables = true;
577 means.pool = pool_create ();
579 means.exclude = MV_ANY;
580 means.dep_exclude = MV_ANY;
581 means.listwise_exclude = false;
585 means.dict = dataset_dict (ds);
588 means.cells = pool_calloc (means.pool, means.n_cells, sizeof (*means.cells));
591 /* The first three items (MEAN, COUNT, STDDEV) are the default */
592 for (i = 0; i < 3; ++i)
596 /* Optional TABLES = */
597 if (lex_match_id (lexer, "TABLES"))
599 lex_force_match (lexer, T_EQUALS);
604 /* Parse the "tables" */
608 means.table = pool_realloc (means.pool, means.table, means.n_tables * sizeof (*means.table));
610 if (! parse_means_table_syntax (lexer, &means,
611 &means.table[means.n_tables - 1]))
616 /* Look ahead to see if there are more tables to be parsed */
618 if ( T_SLASH == lex_next_token (lexer, 0) )
620 if (lex_is_variable (lexer, means.dict, 1) )
623 lex_force_match (lexer, T_SLASH);
628 /* /MISSING subcommand */
629 while (lex_token (lexer) != T_ENDCMD)
631 lex_match (lexer, T_SLASH);
633 if (lex_match_id (lexer, "MISSING"))
636 If no MISSING subcommand is specified, each combination of
637 a dependent variable and categorical variables is handled
640 lex_match (lexer, T_EQUALS);
641 if (lex_match_id (lexer, "INCLUDE"))
644 Use the subcommand "/MISSING=INCLUDE" to include user-missing
645 values in the analysis.
648 means.exclude = MV_SYSTEM;
649 means.dep_exclude = MV_SYSTEM;
651 else if (lex_match_id (lexer, "TABLE"))
653 This is the default. (I think).
654 Every case containing a complete set of variables for a given
655 table. If any variable, categorical or dependent, in a table
656 is missing (as defined by what?), then that variable will
657 be dropped FOR THAT TABLE ONLY.
660 means.listwise_exclude = true;
662 else if (lex_match_id (lexer, "DEPENDENT"))
664 Use the command "/MISSING=DEPENDENT" to
665 include user-missing values for the categorical variables,
666 while excluding them for the dependent variables.
668 Cases are dropped only when user-missing values
669 appear in dependent variables. User-missing
670 values for categorical variables are treated according to
673 Cases are ALWAYS dropped when System Missing values appear
674 in the categorical variables.
677 means.dep_exclude = MV_ANY;
678 means.exclude = MV_SYSTEM;
682 lex_error (lexer, NULL);
686 else if (lex_match_id (lexer, "CELLS"))
688 lex_match (lexer, T_EQUALS);
690 /* The default values are overwritten */
692 while (lex_token (lexer) != T_ENDCMD
693 && lex_token (lexer) != T_SLASH)
696 if (lex_match (lexer, T_ALL))
700 pool_realloc (means.pool, means.cells,
701 (means.n_cells += n_C) * sizeof (*means.cells));
703 for (x = 0; x < n_C; ++x)
704 means.cells[means.n_cells - (n_C - 1 - x) - 1] = x;
706 else if (lex_match_id (lexer, "NONE"))
710 else if (lex_match_id (lexer, "DEFAULT"))
713 pool_realloc (means.pool, means.cells,
714 (means.n_cells += 3) * sizeof (*means.cells));
716 means.cells[means.n_cells - 2 - 1] = MEANS_MEAN;
717 means.cells[means.n_cells - 1 - 1] = MEANS_N;
718 means.cells[means.n_cells - 0 - 1] = MEANS_STDDEV;
724 if (lex_match_id (lexer, cell_spec[k].keyword))
727 pool_realloc (means.pool, means.cells,
728 ++means.n_cells * sizeof (*means.cells));
730 means.cells[means.n_cells - 1] = k;
737 lex_error (lexer, NULL);
744 lex_error (lexer, NULL);
751 for (t = 0; t < means.n_tables; ++t)
753 struct mtable *table = &means.table[t];
755 table->interactions =
756 pool_calloc (means.pool, table->n_layers, sizeof (*table->interactions));
759 pool_calloc (means.pool, table->n_dep_vars * table->n_layers, sizeof (*table->summary));
761 for (l = 0; l < table->n_layers; ++l)
764 const struct layer *lyr = &table->layers[l];
765 const int n_vars = lyr->n_factor_vars;
766 table->interactions[l] = interaction_create (NULL);
767 for (v = 0 ; v < n_vars ; ++v)
769 interaction_add_variable (table->interactions[l],
770 lyr->factor_vars[v]);
776 struct casegrouper *grouper;
777 struct casereader *group;
780 grouper = casegrouper_create_splits (proc_open (ds), means.dict);
781 while (casegrouper_get_next_group (grouper, &group))
783 run_means (&means, group, ds);
785 ok = casegrouper_destroy (grouper);
786 ok = proc_commit (ds) && ok;
790 pool_destroy (means.pool);
795 pool_destroy (means.pool);
801 is_missing (const struct means *cmd,
802 const struct variable *dvar,
803 const struct interaction *iact,
804 const struct ccase *c)
806 if ( interaction_case_is_missing (iact, c, cmd->exclude) )
810 if (var_is_value_missing (dvar,
818 static void output_case_processing_summary (const struct mtable *);
820 static void output_report (const struct means *, int, const struct mtable *);
825 struct per_var_data *pvd;
831 create_n (const void *aux1, void *aux2)
834 const struct means *means = aux1;
835 struct mtable *table = aux2;
836 struct per_cat_data *per_cat_data = pool_malloc (means->pool, sizeof *per_cat_data);
838 struct per_var_data *pvd = pool_calloc (means->pool, table->n_dep_vars, sizeof *pvd);
840 for (v = 0; v < table->n_dep_vars; ++v)
842 enum moment maxmom = MOMENT_KURTOSIS;
843 struct per_var_data *pp = &pvd[v];
845 pp->cell_stats = pool_calloc (means->pool, means->n_cells, sizeof *pp->cell_stats);
848 for (i = 0; i < means->n_cells; ++i)
850 int csi = means->cells[i];
851 const struct cell_spec *cs = &cell_spec[csi];
854 pp->cell_stats[i] = cs->sc (means->pool);
857 pp->mom = moments1_create (maxmom);
861 per_cat_data->pvd = pvd;
862 per_cat_data->warn = true;
867 update_n (const void *aux1, void *aux2, void *user_data, const struct ccase *c, double weight)
871 const struct means *means = aux1;
872 struct mtable *table = aux2;
873 struct per_cat_data *per_cat_data = user_data;
875 for (v = 0; v < table->n_dep_vars; ++v)
877 struct per_var_data *pvd = &per_cat_data->pvd[v];
879 const double x = case_data (c, table->dep_vars[v])->f;
881 for (i = 0; i < table->n_layers; ++i)
883 if ( is_missing (means, table->dep_vars[v],
884 table->interactions[i], c))
888 for (i = 0; i < means->n_cells; ++i)
890 const int csi = means->cells[i];
891 const struct cell_spec *cs = &cell_spec[csi];
895 cs->su (pvd->cell_stats[i],
899 moments1_add (pvd->mom, x, weight);
907 calculate_n (const void *aux1, void *aux2, void *user_data)
911 struct per_cat_data *per_cat_data = user_data;
912 const struct means *means = aux1;
913 struct mtable *table = aux2;
915 for (v = 0; v < table->n_dep_vars; ++v)
917 struct per_var_data *pvd = &per_cat_data->pvd[v];
918 for (i = 0; i < means->n_cells; ++i)
920 int csi = means->cells[i];
921 const struct cell_spec *cs = &cell_spec[csi];
924 cs->sd (pvd, pvd->cell_stats[i]);
930 run_means (struct means *cmd, struct casereader *input,
931 const struct dataset *ds UNUSED)
934 const struct variable *wv = dict_get_weight (cmd->dict);
936 struct casereader *reader;
938 struct payload payload;
939 payload.create = create_n;
940 payload.update = update_n;
941 payload.destroy = calculate_n;
943 for (t = 0; t < cmd->n_tables; ++t)
945 struct mtable *table = &cmd->table[t];
947 = categoricals_create (table->interactions,
948 table->n_layers, wv, cmd->dep_exclude, cmd->exclude);
950 categoricals_set_payload (table->cats, &payload, cmd, table);
953 for (reader = casereader_clone (input);
954 (c = casereader_read (reader)) != NULL; case_unref (c))
956 for (t = 0; t < cmd->n_tables; ++t)
958 bool something_missing = false;
960 struct mtable *table = &cmd->table[t];
962 for (v = 0; v < table->n_dep_vars; ++v)
965 for (i = 0; i < table->n_layers; ++i)
968 is_missing (cmd, table->dep_vars[v],
969 table->interactions[i], c);
972 something_missing = true;
973 table->summary[v * table->n_layers + i].missing++;
976 table->summary[v * table->n_layers + i].non_missing++;
979 if ( something_missing && cmd->listwise_exclude)
982 categoricals_update (table->cats, c);
985 casereader_destroy (reader);
987 for (t = 0; t < cmd->n_tables; ++t)
989 struct mtable *table = &cmd->table[t];
991 categoricals_done (table->cats);
995 for (t = 0; t < cmd->n_tables; ++t)
998 const struct mtable *table = &cmd->table[t];
1000 output_case_processing_summary (table);
1002 for (i = 0; i < table->n_layers; ++i)
1004 output_report (cmd, i, table);
1006 categoricals_destroy (table->cats);
1014 output_case_processing_summary (const struct mtable *table)
1017 const int heading_columns = 1;
1018 const int heading_rows = 3;
1019 struct tab_table *t;
1021 const int nr = heading_rows + table->n_layers * table->n_dep_vars;
1024 t = tab_create (nc, nr);
1025 tab_title (t, _("Case Processing Summary"));
1027 tab_headers (t, heading_columns, 0, heading_rows, 0);
1029 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, nc - 1, nr - 1);
1031 tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
1032 tab_vline (t, TAL_2, heading_columns, 0, nr - 1);
1035 tab_joint_text (t, heading_columns, 0,
1036 nc - 1, 0, TAB_CENTER | TAT_TITLE, _("Cases"));
1038 tab_joint_text (t, 1, 1, 2, 1, TAB_CENTER | TAT_TITLE, _("Included"));
1039 tab_joint_text (t, 3, 1, 4, 1, TAB_CENTER | TAT_TITLE, _("Excluded"));
1040 tab_joint_text (t, 5, 1, 6, 1, TAB_CENTER | TAT_TITLE, _("Total"));
1042 tab_hline (t, TAL_1, heading_columns, nc - 1, 1);
1043 tab_hline (t, TAL_1, heading_columns, nc - 1, 2);
1046 for (i = 0; i < 3; ++i)
1048 tab_text (t, heading_columns + i * 2, 2, TAB_CENTER | TAT_TITLE,
1050 tab_text (t, heading_columns + i * 2 + 1, 2, TAB_CENTER | TAT_TITLE,
1054 for (v = 0; v < table->n_dep_vars; ++v)
1056 const struct variable *var = table->dep_vars[v];
1057 const char *dv_name = var_to_string (var);
1058 for (i = 0; i < table->n_layers; ++i)
1060 const int row = v * table->n_layers + i;
1061 const struct interaction *iact = table->interactions[i];
1065 ds_init_cstr (&str, dv_name);
1066 ds_put_cstr (&str, ": ");
1068 interaction_to_string (iact, &str);
1070 tab_text (t, 0, row + heading_rows,
1071 TAB_LEFT | TAT_TITLE, ds_cstr (&str));
1074 n_total = table->summary[row].missing +
1075 table->summary[row].non_missing;
1077 tab_double (t, 1, row + heading_rows,
1078 0, table->summary[row].non_missing, &F_8_0);
1080 tab_text_format (t, 2, row + heading_rows,
1082 table->summary[row].non_missing / (double) n_total * 100.0);
1085 tab_double (t, 3, row + heading_rows,
1086 0, table->summary[row].missing, &F_8_0);
1089 tab_text_format (t, 4, row + heading_rows,
1091 table->summary[row].missing / (double) n_total * 100.0);
1094 tab_double (t, 5, row + heading_rows,
1095 0, table->summary[row].missing +
1096 table->summary[row].non_missing, &F_8_0);
1098 tab_text_format (t, 6, row + heading_rows,
1100 n_total / (double) n_total * 100.0);
1112 output_report (const struct means *cmd, int iact_idx,
1113 const struct mtable *table)
1118 const struct interaction *iact = table->interactions[iact_idx];
1120 const int heading_columns = 1 + iact->n_vars;
1121 const int heading_rows = 1;
1122 struct tab_table *t;
1124 const int n_cats = categoricals_n_count (table->cats, iact_idx);
1126 const int nr = n_cats * table->n_dep_vars + heading_rows;
1128 const int nc = heading_columns + cmd->n_cells;
1130 t = tab_create (nc, nr);
1131 tab_title (t, _("Report"));
1133 tab_headers (t, heading_columns, 0, heading_rows, 0);
1135 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, nc - 1, nr - 1);
1137 tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
1138 tab_vline (t, TAL_2, heading_columns, 0, nr - 1);
1140 for (i = 0; i < iact->n_vars; ++i)
1142 tab_text (t, 1 + i, 0, TAB_CENTER | TAT_TITLE,
1143 var_to_string (iact->vars[i]));
1146 for (i = 0; i < cmd->n_cells; ++i)
1148 tab_text (t, heading_columns + i, 0,
1149 TAB_CENTER | TAT_TITLE,
1150 gettext (cell_spec[cmd->cells[i]].title));
1154 for (i = 0; i < n_cats; ++i)
1157 const struct ccase *c =
1158 categoricals_get_case_by_category_real (table->cats, iact_idx, i);
1160 for (dv = 0; dv < table->n_dep_vars; ++dv)
1163 heading_rows + dv * n_cats,
1164 TAB_RIGHT | TAT_TITLE,
1165 var_to_string (table->dep_vars[dv])
1169 tab_hline (t, TAL_1, 0, nc - 1, heading_rows + dv * n_cats);
1171 for (v = 0; v < iact->n_vars; ++v)
1173 const struct variable *var = iact->vars[v];
1174 const union value *val = case_data (c, var);
1176 ds_init_empty (&str);
1177 var_append_value_name (var, val, &str);
1179 tab_text (t, 1 + v, heading_rows + dv * n_cats + i,
1180 TAB_RIGHT | TAT_TITLE, ds_cstr (&str));
1187 for (grp = 0; grp < n_cats; ++grp)
1190 struct per_cat_data *per_cat_data =
1191 categoricals_get_user_data_by_category_real (table->cats, iact_idx, grp);
1193 for (dv = 0; dv < table->n_dep_vars; ++dv)
1195 const struct per_var_data *pvd = &per_cat_data->pvd[dv];
1196 for (i = 0; i < cmd->n_cells; ++i)
1198 const int csi = cmd->cells[i];
1199 const struct cell_spec *cs = &cell_spec[csi];
1201 double result = cs->sd (pvd, pvd->cell_stats[i]);
1203 tab_double (t, heading_columns + i,
1204 heading_rows + grp + dv * n_cats,