1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2011, 2012 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include "data/case.h"
20 #include "data/casegrouper.h"
21 #include "data/casereader.h"
22 #include "data/dataset.h"
23 #include "data/dictionary.h"
24 #include "data/format.h"
25 #include "data/variable.h"
27 #include "language/command.h"
28 #include "language/lexer/lexer.h"
29 #include "language/lexer/variable-parser.h"
31 #include "libpspp/misc.h"
32 #include "libpspp/pool.h"
34 #include "math/categoricals.h"
35 #include "math/interaction.h"
36 #include "math/moments.h"
38 #include "output/tab.h"
/* Translation shorthands.  _() wraps its argument in a gettext() call;
   N_() expands to its argument unchanged, so the static cell_spec table
   can hold plain titles that are passed to gettext() only when the report
   headings are written. */
43 #define _(msgid) gettext (msgid)
44 #define N_(msgid) (msgid)
/* Function types for the per-statistic callbacks held in each cell_spec
   entry (the entries list them in the order create, update, get). */
/* Allocates the private state of one statistic; receives a pool and
   returns an untyped pointer to that state. */
56 typedef void *stat_create (struct pool *pool);
/* Feeds one observation into STAT.  W and X are presumably the case weight
   and the dependent-variable value -- the call site arguments are not
   visible in this view; TODO confirm. */
57 typedef void stat_update (void *stat, double w, double x);
/* Produces the final value of a statistic.  The getters in this file name
   the second parameter "stat" and treat it as the state pointer returned
   by the create callback. */
58 typedef double stat_get (const struct per_var_data *, void *aux);
62 /* Printable title for output */
65 /* Keyword for syntax */
/* Callbacks for the "HARMONIC" entry of the cell_spec table. */
/* Create: the struct harmonic_mean accumulator is allocated from POOL. */
80 harmonic_create (struct pool *pool)
82 struct harmonic_mean *hm = pool_alloc (pool, sizeof *hm);
/* Update: STAT is the accumulator; the statements that fold W and X into
   it are not part of this view. */
92 harmonic_update (void *stat, double w, double x)
94 struct harmonic_mean *hm = stat;
/* Get: PVD is unused; the result is computed purely from the
   accumulator. */
101 harmonic_get (const struct per_var_data *pvd UNUSED, void *stat)
103 struct harmonic_mean *hm = stat;
/* NOTE(review): no guard for rsum == 0 is visible here -- confirm how an
   empty or zero-sum group is meant to be reported. */
105 return hm->n / hm->rsum;
/* Accumulator for the "GEOMETRIC" cell.  Its members are not visible in
   this view; the callbacks below use the fields prod and n. */
110 struct geometric_mean
/* Create: the accumulator is allocated from POOL. */
118 geometric_create (struct pool *pool)
120 struct geometric_mean *gm = pool_alloc (pool, sizeof *gm);
/* Update: multiplies the running product by X raised to the power W. */
130 geometric_update (void *stat, double w, double x)
132 struct geometric_mean *gm = stat;
/* NOTE(review): a raw running product can overflow or underflow on long
   series; confirm this is acceptable for the expected data. */
133 gm->prod *= pow (x, w);
/* Get: PVD is unused; the result is the n-th root of the accumulated
   product. */
139 geometric_get (const struct per_var_data *pvd UNUSED, void *stat)
141 struct geometric_mean *gm = stat;
143 return pow (gm->prod, 1.0 / gm->n);
/* Getter for the "SUM" cell.  STAT is unused; n and mean are requested
   from the moments accumulator in PVD.  The expression that combines them
   into the return value is not part of this view. */
149 sum_get (const struct per_var_data *pvd, void *stat UNUSED)
153 moments1_calculate (pvd->mom, &n, &mean, 0, 0, 0);
/* Getter for the "COUNT" cell.  STAT is unused; only n is requested from
   the moments accumulator, every other output slot is passed as 0. */
160 n_get (const struct per_var_data *pvd, void *stat UNUSED)
164 moments1_calculate (pvd->mom, &n, 0, 0, 0, 0);
/* Getter for the "MEAN" cell.  STAT is unused; n and mean are requested
   from the moments accumulator in PVD. */
170 arithmean_get (const struct per_var_data *pvd, void *stat UNUSED)
174 moments1_calculate (pvd->mom, &n, &mean, 0, 0, 0);
/* Getter for the "VARIANCE" cell; also called by stddev_get.  STAT is
   unused; n, mean and variance are requested from the moments
   accumulator in PVD. */
180 variance_get (const struct per_var_data *pvd, void *stat UNUSED)
182 double n, mean, variance;
184 moments1_calculate (pvd->mom, &n, &mean, &variance, 0, 0);
/* Getter for the "STDDEV" cell: the square root of whatever variance_get
   returns for the same PVD and STAT. */
191 stddev_get (const struct per_var_data *pvd, void *stat)
193 return sqrt (variance_get (pvd, stat));
/* Getter for the "SKEW" cell.  STAT is unused; only the skew output slot
   of moments1_calculate is filled in, the others are NULL or 0. */
200 skew_get (const struct per_var_data *pvd, void *stat UNUSED)
204 moments1_calculate (pvd->mom, NULL, NULL, NULL, &skew, 0);
/* Getter for the "SEKURT" cell.  STAT is unused; n is read from the
   moments accumulator and handed to calc_sekurt, whose result is
   returned. */
210 sekurt_get (const struct per_var_data *pvd, void *stat UNUSED)
214 moments1_calculate (pvd->mom, &n, NULL, NULL, NULL, NULL);
216 return calc_sekurt (n);
/* Getter for the "SESKEW" cell.  STAT is unused; n is read from the
   moments accumulator and handed to calc_seskew, whose result is
   returned. */
220 seskew_get (const struct per_var_data *pvd, void *stat UNUSED)
224 moments1_calculate (pvd->mom, &n, NULL, NULL, NULL, NULL);
226 return calc_seskew (n);
/* Getter for the "KURT" cell.  STAT is unused; only the kurt output slot
   of moments1_calculate is filled in. */
230 kurt_get (const struct per_var_data *pvd, void *stat UNUSED)
234 moments1_calculate (pvd->mom, NULL, NULL, NULL, NULL, &kurt);
/* Getter for the "SEMEAN" cell.  STAT is unused; n and var are read from
   the moments accumulator and the result is sqrt (var / n). */
240 semean_get (const struct per_var_data *pvd, void *stat UNUSED)
244 moments1_calculate (pvd->mom, &n, NULL, &var, NULL, NULL);
/* NOTE(review): no guard for n == 0 is visible here -- TODO confirm. */
246 return sqrt (var / n);
/* Callbacks for the "MIN" entry of the cell_spec table.  The state is a
   single double allocated from POOL; its initial value and the comparison
   performed on update are not part of this view. */
252 min_create (struct pool *pool)
254 double *r = pool_alloc (pool, sizeof *r);
/* The weight W is ignored by this statistic. */
262 min_update (void *stat, double w UNUSED, double x)
/* PVD is ignored; the result comes from STAT alone. */
271 min_get (const struct per_var_data *pvd UNUSED, void *stat)
/* Callbacks for the "MAX" entry of the cell_spec table.  The state is a
   single double allocated from POOL; its initial value and the comparison
   performed on update are not part of this view. */
279 max_create (struct pool *pool)
281 double *r = pool_alloc (pool, sizeof *r);
/* The weight W is ignored by this statistic. */
289 max_update (void *stat, double w UNUSED, double x)
/* PVD is ignored; the result comes from STAT alone. */
298 max_get (const struct per_var_data *pvd UNUSED, void *stat)
/* Callbacks for the "RANGE" entry of the cell_spec table.  The state is a
   struct range allocated from POOL; the getter uses its max and min
   fields. */
314 range_create (struct pool *pool)
316 struct range *r = pool_alloc (pool, sizeof *r);
/* The weight W is ignored; how X updates the two bounds is not part of
   this view. */
325 range_update (void *stat, double w UNUSED, double x)
327 struct range *r = stat;
/* PVD is ignored; the result is the difference of the recorded bounds. */
337 range_get (const struct per_var_data *pvd UNUSED, void *stat)
339 struct range *r = stat;
341 return r->max - r->min;
/* Callbacks for the "LAST" entry of the cell_spec table.  The state is a
   single double allocated from POOL. */
347 last_create (struct pool *pool)
349 double *l = pool_alloc (pool, sizeof *l);
/* The weight W is ignored by this statistic. */
355 last_update (void *stat, double w UNUSED, double x)
/* PVD is ignored; the result comes from STAT alone. */
363 last_get (const struct per_var_data *pvd UNUSED, void *stat)
/* Callbacks for the "FIRST" entry of the cell_spec table.  The state is a
   single double allocated from POOL; how "not yet set" is represented is
   not part of this view. */
372 first_create (struct pool *pool)
374 double *f = pool_alloc (pool, sizeof *f);
/* The weight W is ignored by this statistic. */
382 first_update (void *stat, double w UNUSED, double x)
/* PVD is ignored; the result comes from STAT alone. */
391 first_get (const struct per_var_data *pvd UNUSED, void *stat)
398 /* Table of cell_specs */
/* Each entry gives: the untranslated column title, the CELLS keyword, and
   the create / update / get callbacks.  Entries whose create and update
   callbacks are NULL keep no private state; their getters read the
   moments accumulator instead.

   The order matters: cmd_means selects the first three entries (MEAN,
   COUNT, STDDEV) as the default cells, and stores indexes into this array
   for every keyword given on CELLS. */
399 static const struct cell_spec cell_spec[] = {
400 {N_("Mean"), "MEAN", NULL, NULL, arithmean_get},
401 {N_("N"), "COUNT", NULL, NULL, n_get},
402 {N_("Std. Deviation"), "STDDEV", NULL, NULL, stddev_get},
/* NOTE(review): MEDIAN, GMEDIAN, NPCT and SPCT have no getter.  The
   report code calls the getter of each selected cell, and no NULL check
   is visible at that call -- confirm these keywords cannot reach it. */
404 {N_("Median"), "MEDIAN", NULL, NULL, NULL},
405 {N_("Group Median"), "GMEDIAN", NULL, NULL, NULL},
407 {N_("S.E. Mean"), "SEMEAN", NULL, NULL, semean_get},
408 {N_("Sum"), "SUM", NULL, NULL, sum_get},
409 {N_("Min"), "MIN", min_create, min_update, min_get},
410 {N_("Max"), "MAX", max_create, max_update, max_get},
411 {N_("Range"), "RANGE", range_create, range_update, range_get},
412 {N_("Variance"), "VARIANCE", NULL, NULL, variance_get},
413 {N_("Kurtosis"), "KURT", NULL, NULL, kurt_get},
414 {N_("S.E. Kurt"), "SEKURT", NULL, NULL, sekurt_get},
415 {N_("Skewness"), "SKEW", NULL, NULL, skew_get},
416 {N_("S.E. Skew"), "SESKEW", NULL, NULL, seskew_get},
417 {N_("First"), "FIRST", first_create, first_update, first_get},
418 {N_("Last"), "LAST", last_create, last_update, last_get},
420 {N_("Percent N"), "NPCT", NULL, NULL, NULL},
421 {N_("Percent Sum"), "SPCT", NULL, NULL, NULL},
423 {N_("Harmonic Mean"), "HARMONIC", harmonic_create, harmonic_update, harmonic_get},
424 {N_("Geom. Mean"), "GEOMETRIC", geometric_create, geometric_update, geometric_get}
/* Number of entries in the cell_spec table. */
427 #define n_C (sizeof (cell_spec) / sizeof (struct cell_spec))
/* Per (dependent variable, interaction) case counter, incremented in
   run_means and shown in the "Included" columns of the case processing
   summary.  Presumably counts cases for which is_missing() was false --
   the test itself is not visible in this view. */
433 casenumber non_missing;
437 /* The thing parsed after TABLES= */
/* Dependent variables of this table, filled in by
   parse_means_table_syntax. */
441 const struct variable **dep_vars;
/* Length of the interactions array; cmd_means computes it as the product
   of the per-layer factor variable counts. */
443 size_t n_interactions;
444 struct interaction **interactions;
/* Case counters, indexed by
   dep_var_index * n_interactions + interaction_index. */
445 struct summary *summary;
/* Per-layer arrays: entry L holds the number of factor variables, and the
   factor variables themselves, of layer L. */
447 size_t *n_factor_vars;
448 const struct variable ***factor_vars;
/* Created in run_means from the interactions and destroyed there once
   the reports have been written. */
454 struct categoricals *cats;
/* Dictionary of the active dataset; set from dataset_dict() in
   cmd_means. */
459 const struct dictionary *dict;
/* Array of parsed tables, grown with xrealloc as each table is parsed. */
461 struct mtable *table;
464 /* Missing value class for categorical variables */
465 enum mv_class exclude;
467 /* Missing value class for dependent variables */
468 enum mv_class dep_exclude;
470 /* an array indicating which statistics are to be calculated */
476 /* Pool on which cell functions may allocate data */
/* Forward declaration: cmd_means calls run_means once for every group
   returned by the casegrouper, before the definition appears. */
482 run_means (struct means *cmd, struct casereader *input,
483 const struct dataset *ds);
485 /* Append each variable belonging to LAYER, and recursively those of all
486 subsequent layers, to a copy of IACT, then store every completed
487 interaction in means->interactions.  This is a recursive function.
490 iact_append_factor (struct mtable *means, int layer,
491 const struct interaction *iact)
494 const struct variable **fv;
/* Recursion stops once LAYER is past the last layer. */
496 if (layer >= means->n_layers)
499 fv = means->factor_vars[layer];
/* One branch per factor variable of this layer: clone IACT, extend the
   clone with the variable, then descend into the next layer. */
501 for (v = 0; v < means->n_factor_vars[layer]; ++v)
503 struct interaction *nexti = interaction_clone (iact);
505 interaction_add_variable (nexti, fv[v]);
507 iact_append_factor (means, layer + 1, nexti);
/* Only clones built at the last layer are stored, at the running index
   ii.  NOTE(review): for earlier layers no release of the clone is
   visible in this view -- confirm it is not leaked. */
509 if (layer == means->n_layers - 1)
511 means->interactions[means->ii++] = nexti;
/* Parses one table specification into TABLE: a list of dependent
   variables followed by zero or more BY-introduced layers of factor
   variables.  Variables are looked up in cmd->dict.  The return
   statements are not part of this view; the caller treats a false result
   as failure. */
517 parse_means_table_syntax (struct lexer *lexer, const struct means *cmd, struct mtable *table)
/* Start with empty per-layer arrays so the xrealloc calls below can grow
   them from nothing. */
521 table->factor_vars = NULL;
522 table->n_factor_vars = NULL;
524 /* Dependent variable (s) */
/* Dependent variables must be numeric and may not repeat. */
525 if (!parse_variables_const (lexer, cmd->dict,
526 &table->dep_vars, &table->n_dep_vars,
527 PV_NO_DUPLICATE | PV_NUMERIC))
530 /* Factor variable (s) */
/* A table ends at the end of the command or at the next slash. */
531 while (lex_token (lexer) != T_ENDCMD && lex_token (lexer) != T_SLASH)
533 if (lex_match (lexer, T_BY))
/* Both per-layer arrays are resized to n_layers entries.  Presumably
   n_layers was incremented just before this point -- that statement is
   not visible; TODO confirm. */
537 xrealloc (table->factor_vars,
538 sizeof (*table->factor_vars) * table->n_layers);
540 table->n_factor_vars =
541 xrealloc (table->n_factor_vars,
542 sizeof (*table->n_factor_vars) * table->n_layers);
/* The newest layer (index n_layers - 1) receives the parsed variables. */
544 if (!parse_variables_const (lexer, cmd->dict,
545 &table->factor_vars[table->n_layers - 1],
546 &table->n_factor_vars[table->n_layers -
558 The token is only looked up in DICT; there is no VAR output parameter.
559 Returns true if successful */
/* Tests whether the token N positions ahead in LEXER is an identifier
   that names a variable in DICT.  cmd_means calls it with N == 1 to decide
   whether a slash starts another table. */
561 lex_is_variable (struct lexer *lexer, const struct dictionary *dict,
/* Anything other than an identifier token cannot name a variable. */
565 if (lex_next_token (lexer, n) != T_ID)
568 tstr = lex_next_tokcstr (lexer, n);
/* The identifier must exist in the dictionary. */
570 if (NULL == dict_lookup_var (dict, tstr) )
/* Entry point for the MEANS command.  Visible stages: set defaults, parse
   one or more tables, parse the MISSING and CELLS subcommands, build the
   interactions and summary arrays for every table, then call run_means
   once per group produced by the casegrouper.  Error paths, cleanup and
   the return value are not part of this view. */
578 cmd_means (struct lexer *lexer, struct dataset *ds)
584 bool more_tables = true;
/* Defaults: the same settings as /MISSING=TABLE below. */
586 means.exclude = MV_ANY;
587 means.dep_exclude = MV_ANY;
591 means.dict = dataset_dict (ds);
594 means.cells = xcalloc (means.n_cells, sizeof (*means.cells));
597 /* The first three items (MEAN, COUNT, STDDEV) are the default */
598 for (i = 0; i < 3; ++i)
602 /* Optional TABLES = */
603 if (lex_match_id (lexer, "TABLES"))
/* NOTE(review): the result of lex_force_match is not used on this line --
   confirm whether a missing "=" should abort parsing. */
605 lex_force_match (lexer, T_EQUALS);
610 /* Parse the "tables" */
/* Grow the table array to n_tables entries and parse into the newest
   one. */
614 means.table = xrealloc (means.table, means.n_tables * sizeof (*means.table));
616 if (! parse_means_table_syntax (lexer, &means,
617 &means.table[means.n_tables - 1]))
622 /* Look ahead to see if there are more tables to be parsed */
/* A slash followed by a variable name starts another table; any other
   slash presumably starts a subcommand. */
624 if ( T_SLASH == lex_next_token (lexer, 0) )
626 if (lex_is_variable (lexer, means.dict, 1) )
/* NOTE(review): the result of lex_force_match is not used here either. */
629 lex_force_match (lexer, T_SLASH);
634 /* /MISSING subcommand */
635 while (lex_token (lexer) != T_ENDCMD)
637 lex_match (lexer, T_SLASH);
639 if (lex_match_id (lexer, "MISSING"))
642 If no MISSING subcommand is specified, each combination of
643 a dependent variable and categorical variables is handled
646 lex_match (lexer, T_EQUALS);
647 if (lex_match_id (lexer, "INCLUDE"))
650 Use the subcommand "/MISSING=INCLUDE" to include user-missing
651 values in the analysis.
654 means.exclude = MV_SYSTEM;
655 means.dep_exclude = MV_SYSTEM;
657 else if (lex_match_id (lexer, "TABLE"))
659 This is the default. (I think).
660 Every case containing a complete set of variables for a given
661 table. If any variable, categorical or dependent for in a table
662 is missing (as defined by what?), then that variable will
663 be dropped FOR THAT TABLE ONLY.
666 means.exclude = MV_ANY;
667 means.dep_exclude = MV_ANY;
669 else if (lex_match_id (lexer, "DEPENDENT"))
671 Use the command "/MISSING=DEPENDENT" to
672 include user-missing values for the categorical variables,
673 while excluding them for the dependent variables.
675 Cases are dropped only when user-missing values
676 appear in dependent variables. User-missing
677 values for categorical variables are treated according to
680 Cases are ALWAYS dropped when System Missing values appear
681 in the categorical variables.
684 means.dep_exclude = MV_ANY;
685 means.exclude = MV_SYSTEM;
689 lex_error (lexer, NULL);
693 else if (lex_match_id (lexer, "CELLS"))
695 lex_match (lexer, T_EQUALS);
697 /* The default values become overwritten */
699 while (lex_token (lexer) != T_ENDCMD
700 && lex_token (lexer) != T_SLASH)
/* Try every known keyword; on a match the cells array grows by one and
   the index of the matching cell_spec entry is appended. */
703 for (k = 0; k < n_C; ++k)
705 if (lex_match_id (lexer, cell_spec[k].keyword))
708 xrealloc (means.cells,
709 ++means.n_cells * sizeof (*means.cells));
711 means.cells[means.n_cells - 1] = k;
717 lex_error (lexer, NULL);
724 lex_error (lexer, NULL);
/* Pool handed to the create callbacks of the selected cells. */
729 means.pool = pool_create ();
732 for (t = 0; t < means.n_tables; ++t)
734 struct mtable *table = &means.table[t];
/* The number of interactions is the product of the factor variable
   counts of all layers; it stays 1 when the table has no layers. */
735 table->n_interactions = 1;
736 for (l = 0; l < table->n_layers; ++l)
738 const int n_vars = table->n_factor_vars[l];
739 table->n_interactions *= n_vars;
742 table->interactions =
743 xcalloc (table->n_interactions, sizeof (*table->interactions));
/* One summary slot per (dependent variable, interaction) pair. */
746 xcalloc (table->n_dep_vars * table->n_interactions, sizeof (*table->summary));
/* With layers, the interactions are generated recursively from an empty
   root; without layers, a single empty interaction is stored directly.
   NOTE(review): no release of the root interaction passed to
   iact_append_factor is visible in this view -- TODO confirm. */
749 if (table->n_layers > 0)
750 iact_append_factor (table, 0, interaction_create (NULL));
752 table->interactions[0] = interaction_create (NULL);
758 struct casegrouper *grouper;
759 struct casereader *group;
/* Run the analysis once for every group the casegrouper yields. */
762 grouper = casegrouper_create_splits (proc_open (ds), means.dict);
763 while (casegrouper_get_next_group (grouper, &group))
765 run_means (&means, group, ds);
/* proc_commit is always called; ok ends up true only if both it and the
   grouper teardown succeeded. */
767 ok = casegrouper_destroy (grouper);
768 ok = proc_commit (ds) && ok;
/* Decides whether case C is missing for dependent variable DVAR within
   interaction IACT.  Two tests are visible; what each branch returns is
   not part of this view. */
781 is_missing (const struct means *cmd,
782 const struct variable *dvar,
783 const struct interaction *iact,
784 const struct ccase *c)
/* First test: the categorical variables of IACT, judged with
   cmd->exclude. */
786 if ( interaction_case_is_missing (iact, c, cmd->exclude) )
/* Second test: the value of the dependent variable itself; the remaining
   arguments of this call are not visible. */
790 if (var_is_value_missing (dvar,
/* Forward declarations of the two output routines that run_means calls
   for every table.  The int parameter of output_report is the index of
   the interaction to report on. */
798 static void output_case_processing_summary (const struct mtable *);
800 static void output_report (const struct means *, int, const struct mtable *);
/* Array with one element per dependent variable of the table; create_n
   allocates it with table->n_dep_vars entries. */
805 struct per_var_data *pvd;
/* Payload "create" callback installed by run_means.  AUX1 is the struct
   means and AUX2 the struct mtable that run_means passed to
   categoricals_set_payload.  Builds a per_cat_data holding one
   per_var_data for each dependent variable; the return statement is not
   part of this view. */
811 create_n (const void *aux1, void *aux2)
814 const struct means *means = aux1;
815 struct mtable *table = aux2;
816 struct per_cat_data *per_cat_data = xmalloc (sizeof *per_cat_data);
818 struct per_var_data *pvd = xcalloc (table->n_dep_vars, sizeof *pvd);
820 for (v = 0; v < table->n_dep_vars; ++v)
/* Moments up to kurtosis are always requested, whichever cells were
   selected. */
822 enum moment maxmom = MOMENT_KURTOSIS;
823 struct per_var_data *pp = &pvd[v];
/* One state slot per selected cell; xcalloc leaves unused slots NULL. */
825 pp->cell_stats = xcalloc (means->n_cells, sizeof *pp->cell_stats);
828 for (i = 0; i < means->n_cells; ++i)
830 int csi = means->cells[i];
831 const struct cell_spec *cs = &cell_spec[csi];
/* Presumably guarded by a check that the create callback is non-NULL --
   that line is not visible; TODO confirm. */
834 pp->cell_stats[i] = cs->sc (means->pool);
837 pp->mom = moments1_create (maxmom);
841 per_cat_data->pvd = pvd;
842 per_cat_data->warn = true;
/* Payload "update" callback installed by run_means.  For every dependent
   variable of the table (AUX2) it reads the numeric value from case C and
   feeds it, with WEIGHT, into the selected cell statistics and the
   moments accumulator stored in USER_DATA. */
847 update_n (const void *aux1, void *aux2, void *user_data, const struct ccase *c, double weight)
851 const struct means *means = aux1;
852 struct mtable *table = aux2;
853 struct per_cat_data *per_cat_data = user_data;
855 for (v = 0; v < table->n_dep_vars; ++v)
857 struct per_var_data *pvd = &per_cat_data->pvd[v];
859 const double x = case_data (c, table->dep_vars[v])->f;
/* The case is tested against every interaction of the table; what
   happens on a hit is not visible here (presumably the variable is
   skipped -- TODO confirm). */
861 for (i = 0; i < table->n_interactions; ++i)
863 if ( is_missing (means, table->dep_vars[v], table->interactions[i], c))
867 for (i = 0; i < means->n_cells; ++i)
869 const int csi = means->cells[i];
870 const struct cell_spec *cs = &cell_spec[csi];
/* Update the private state of this cell; the remaining arguments of the
   call are not visible in this view. */
874 cs->su (pvd->cell_stats[i],
878 moments1_add (pvd->mom, x, weight);
/* Installed by run_means as the payload "destroy" callback.  Walks every
   dependent variable and every selected cell of the table and invokes the
   getter of the cell. */
886 calculate_n (const void *aux1, void *aux2, void *user_data)
890 struct per_cat_data *per_cat_data = user_data;
891 const struct means *means = aux1;
892 struct mtable *table = aux2;
894 for (v = 0; v < table->n_dep_vars; ++v)
896 struct per_var_data *pvd = &per_cat_data->pvd[v];
897 for (i = 0; i < means->n_cells; ++i)
899 int csi = means->cells[i];
900 const struct cell_spec *cs = &cell_spec[csi];
/* NOTE(review): the value returned by the getter is not used on this
   line, and nothing is released in the visible part of this function --
   confirm the intent of this callback. */
903 cs->sd (pvd, pvd->cell_stats[i]);
/* Processes one group of cases (INPUT) for every table of CMD: sets up
   the categoricals, reads every case once, finalizes the categoricals and
   writes the output tables.  DS is unused. */
910 run_means (struct means *cmd, struct casereader *input,
911 const struct dataset *ds UNUSED)
914 const struct variable *wv = dict_get_weight (cmd->dict);
916 struct casereader *reader;
/* The same three callbacks serve every table. */
918 struct payload payload;
919 payload.create = create_n;
920 payload.update = update_n;
921 payload.destroy = calculate_n;
923 for (t = 0; t < cmd->n_tables; ++t)
925 struct mtable *table = &cmd->table[t];
927 = categoricals_create (table->interactions,
928 table->n_interactions, wv, cmd->exclude);
/* CMD and TABLE are the values that arrive as aux1 and aux2 in the
   callbacks. */
930 categoricals_set_payload (table->cats, &payload, cmd, table);
/* Cases are read from a clone of INPUT.  NOTE(review): only the clone is
   destroyed in the visible code -- confirm who releases INPUT. */
933 for (reader = casereader_clone (input);
934 (c = casereader_read (reader)) != NULL; case_unref (c))
936 for (t = 0; t < cmd->n_tables; ++t)
939 struct mtable *table = &cmd->table[t];
941 for (v = 0; v < table->n_dep_vars; ++v)
944 for (i = 0; i < table->n_interactions; ++i)
/* Classify the case for this (variable, interaction) pair and bump one
   of the two summary counters; the branch between them is not visible in
   this view. */
947 is_missing (cmd, table->dep_vars[v],
948 table->interactions[i], c);
950 table->summary[v * table->n_interactions + i].missing++;
952 table->summary[v * table->n_interactions + i].non_missing++;
955 categoricals_update (table->cats, c);
958 casereader_destroy (reader);
960 for (t = 0; t < cmd->n_tables; ++t)
962 struct mtable *table = &cmd->table[t];
964 categoricals_done (table->cats);
/* Output: one case processing summary per table, then one report per
   interaction; the categoricals of the table are destroyed afterwards. */
968 for (t = 0; t < cmd->n_tables; ++t)
970 const struct mtable *table = &cmd->table[t];
972 output_case_processing_summary (table);
974 for (i = 0; i < table->n_interactions; ++i)
976 output_report (cmd, i, table);
979 categoricals_destroy (table->cats);
/* Builds the "Case Processing Summary" table for TABLE: three heading
   rows, then one row for every (dependent variable, interaction) pair
   showing included, excluded and total cases as a count and as a
   percentage.  Submission of the finished table is not part of this
   view. */
985 output_case_processing_summary (const struct mtable *table)
988 const int heading_columns = 1;
989 const int heading_rows = 3;
992 const int nr = heading_rows + table->n_interactions * table->n_dep_vars;
995 t = tab_create (nc, nr);
996 tab_title (t, _("Case Processing Summary"));
998 tab_headers (t, heading_columns, 0, heading_rows, 0);
1000 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, nc - 1, nr - 1);
1002 tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
1003 tab_vline (t, TAL_2, heading_columns, 0, nr - 1);
1006 tab_joint_text (t, heading_columns, 0,
1007 nc - 1, 0, TAB_CENTER | TAT_TITLE, _("Cases"));
/* The column pairs 1-2, 3-4 and 5-6 are hard-coded, so nc (declared on a
   line not visible here) must be at least 7. */
1009 tab_joint_text (t, 1, 1, 2, 1, TAB_CENTER | TAT_TITLE, _("Included"));
1010 tab_joint_text (t, 3, 1, 4, 1, TAB_CENTER | TAT_TITLE, _("Excluded"));
1011 tab_joint_text (t, 5, 1, 6, 1, TAB_CENTER | TAT_TITLE, _("Total"));
1013 tab_hline (t, TAL_1, heading_columns, nc - 1, 1);
1014 tab_hline (t, TAL_1, heading_columns, nc - 1, 2);
/* Two sub-headings under each of the three groups above; their texts are
   not visible in this view. */
1017 for (i = 0; i < 3; ++i)
1019 tab_text (t, heading_columns + i * 2, 2, TAB_CENTER | TAT_TITLE,
1021 tab_text (t, heading_columns + i * 2 + 1, 2, TAB_CENTER | TAT_TITLE,
1025 for (v = 0; v < table->n_dep_vars; ++v)
1027 const struct variable *var = table->dep_vars[v];
1028 const char *dv_name = var_to_string (var);
1029 for (i = 0; i < table->n_interactions; ++i)
/* ROW uses the same index formula as the summary counters filled in by
   run_means. */
1031 const int row = v * table->n_interactions + i;
1032 const struct interaction *iact = table->interactions[i];
/* Row label: "<dependent variable>: <interaction>". */
1036 ds_init_cstr (&str, dv_name);
1037 ds_put_cstr (&str, ": ");
1039 interaction_to_string (iact, &str);
1041 tab_text (t, 0, row + heading_rows,
1042 TAB_LEFT | TAT_TITLE, ds_cstr (&str));
/* NOTE(review): n_total is the divisor of all three percentages below and
   no guard for n_total == 0 is visible -- TODO confirm. */
1045 n_total = table->summary[row].missing +
1046 table->summary[row].non_missing;
1048 tab_double (t, 1, row + heading_rows,
1049 0, table->summary[row].non_missing, &F_8_0);
1051 tab_text_format (t, 2, row + heading_rows,
1053 table->summary[row].non_missing / (double) n_total * 100.0);
1056 tab_double (t, 3, row + heading_rows,
1057 0, table->summary[row].missing, &F_8_0);
1060 tab_text_format (t, 4, row + heading_rows,
1062 table->summary[row].missing / (double) n_total * 100.0);
/* The total count repeats the sum already held in n_total. */
1065 tab_double (t, 5, row + heading_rows,
1066 0, table->summary[row].missing +
1067 table->summary[row].non_missing, &F_8_0);
/* n_total divided by itself: 100 whenever n_total is non-zero. */
1069 tab_text_format (t, 6, row + heading_rows,
1071 n_total / (double) n_total * 100.0);
1084 output_report (const struct means *cmd, int iact_idx,
1085 const struct mtable *table)
1090 const struct interaction *iact = table->interactions[iact_idx];
1092 const int heading_columns = 1 + iact->n_vars;
1093 const int heading_rows = 1;
1094 struct tab_table *t;
1096 const int n_cats = categoricals_n_count (table->cats, iact_idx);
1098 const int nr = n_cats * table->n_dep_vars + heading_rows;
1100 const int nc = heading_columns + cmd->n_cells;
1102 t = tab_create (nc, nr);
1103 tab_title (t, _("Report"));
1105 tab_headers (t, heading_columns, 0, heading_rows, 0);
1107 tab_box (t, TAL_2, TAL_2, -1, TAL_1, 0, 0, nc - 1, nr - 1);
1109 tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
1110 tab_vline (t, TAL_2, iact->n_vars, 0, nr - 1);
1112 for (i = 0; i < iact->n_vars; ++i)
1114 tab_text (t, 1 + i, 0, TAB_CENTER | TAT_TITLE,
1115 var_to_string (iact->vars[i]));
1118 for (i = 0; i < cmd->n_cells; ++i)
1120 tab_text (t, heading_columns + i, 0,
1121 TAB_CENTER | TAT_TITLE,
1122 gettext (cell_spec[cmd->cells[i]].title));
1126 for (i = 0; i < n_cats; ++i)
1129 const struct ccase *c =
1130 categoricals_get_case_by_category_real (table->cats, iact_idx, i);
1132 for (dv = 0; dv < table->n_dep_vars; ++dv)
1135 heading_rows + dv * n_cats,
1136 TAB_RIGHT | TAT_TITLE,
1137 var_get_name (table->dep_vars[dv])
1141 tab_hline (t, TAL_1, 0, nc - 1, heading_rows + dv * n_cats);
1143 for (v = 0; v < iact->n_vars; ++v)
1145 const struct variable *var = iact->vars[v];
1146 const union value *val = case_data (c, var);
1148 ds_init_empty (&str);
1149 var_append_value_name (var, val, &str);
1151 tab_text (t, 1 + v, heading_rows + dv * n_cats + i,
1152 TAB_RIGHT | TAT_TITLE, ds_cstr (&str));
1159 for (grp = 0; grp < n_cats; ++grp)
1162 struct per_cat_data *per_cat_data =
1163 categoricals_get_user_data_by_category_real (table->cats, iact_idx, grp);
1165 for (dv = 0; dv < table->n_dep_vars; ++dv)
1167 const struct per_var_data *pvd = &per_cat_data->pvd[dv];
1168 for (i = 0; i < cmd->n_cells; ++i)
1170 const int csi = cmd->cells[i];
1171 const struct cell_spec *cs = &cell_spec[csi];
1173 double result = cs->sd (pvd, pvd->cell_stats[i]);
1175 tab_double (t, heading_columns + i,
1176 heading_rows + grp + dv * n_cats,