2 PSPP - a program for statistical analysis.
3 Copyright (C) 2012 Free Software Foundation, Inc.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
22 #include <gsl/gsl_cdf.h>
24 #include "libpspp/assertion.h"
25 #include "libpspp/message.h"
28 #include "data/dataset.h"
29 #include "data/dictionary.h"
30 #include "data/casegrouper.h"
31 #include "data/casereader.h"
32 #include "data/casewriter.h"
33 #include "data/caseproto.h"
34 #include "data/subcase.h"
37 #include "data/format.h"
39 #include "math/interaction.h"
40 #include "math/box-whisker.h"
41 #include "math/categoricals.h"
42 #include "math/histogram.h"
43 #include "math/moments.h"
45 #include "math/sort.h"
46 #include "math/order-stats.h"
47 #include "math/percentiles.h"
48 #include "math/tukey-hinges.h"
49 #include "math/trimmed-mean.h"
51 #include "output/charts/boxplot.h"
52 #include "output/charts/np-plot.h"
53 #include "output/charts/plot-hist.h"
55 #include "language/command.h"
56 #include "language/lexer/lexer.h"
57 #include "language/lexer/value-parser.h"
58 #include "language/lexer/variable-parser.h"
60 #include "output/tab.h"
63 #define _(msgid) gettext (msgid)
64 #define N_(msgid) msgid
75 const struct variable **dep_vars;
78 struct interaction **iacts;
80 enum mv_class exclude;
82 const struct dictionary *dict;
84 struct categoricals *cats;
86 /* how many extremities to display */
95 /* Test options require that casenumbers are known */
98 /* The case index of the ID value (or -1) if not applicable */
103 size_t n_percentiles;
109 enum bp_mode boxplot_mode;
111 const struct variable *id_var;
113 const struct variable *wv;
118 /* The value of this extremity */
121 /* Either the casenumber or the value of the variable specified
122 by the /ID subcommand which corresponds to this extremity */
129 EX_ID, /* identity */
133 struct exploratory_stats
140 /* Most operations need a sorted reader/writer */
141 struct casewriter *sorted_writer;
142 struct casereader *sorted_reader;
144 struct extremity *minima;
145 struct extremity *maxima;
148 Minimum should alway equal mimima[0].val.
149 Likewise, maximum should alway equal maxima[0].val.
150 This redundancy exists as an optimisation effort.
151 Some statistics (eg histogram) require early calculation
157 struct trimmed_mean *trimmed_mean;
158 struct percentile *quartiles[3];
159 struct percentile **percentiles;
161 struct tukey_hinges *hinges;
163 /* The data for the NP Plots */
166 struct histogram *histogram;
168 /* The data for the box plots */
169 struct box_whisker *box_whisker;
174 /* The minimum weight */
181 xxx0 (const struct interaction *iact)
185 const union value **prev_val = xcalloc (iact->n_vars, sizeof (*prev_val));
187 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
188 prev_val[ivar_idx] = NULL;
194 xxx1 (const struct interaction *iact, const struct ccase *c, const union value **prev_val)
198 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
200 const struct variable *ivar = iact->vars[ivar_idx];
201 const int width = var_get_width (ivar);
202 const union value *val = case_data (c, ivar);
204 if (prev_val[ivar_idx])
205 if (! value_equal (prev_val[ivar_idx], val, width))
212 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
214 const struct variable *ivar = iact->vars[ivar_idx];
215 const union value *val = case_data (c, ivar);
217 prev_val[ivar_idx] = val;
224 show_boxplot_grouped (const struct examine *cmd, int iact_idx)
228 const struct interaction *iact = cmd->iacts[iact_idx];
229 const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
231 for (v = 0; v < cmd->n_dep_vars; ++v)
233 double y_min = DBL_MAX;
234 double y_max = -DBL_MAX;
236 struct boxplot *boxplot;
238 ds_init_empty (&title);
240 if (iact->n_vars > 0)
243 ds_init_empty (&istr);
244 interaction_to_string (iact, &istr);
245 ds_put_format (&title, _("Boxplot of %s vs. %s"),
246 var_to_string (cmd->dep_vars[v]),
251 ds_put_format (&title, _("Boxplot of %s"), var_to_string (cmd->dep_vars[v]));
253 for (grp = 0; grp < n_cats; ++grp)
255 const struct exploratory_stats *es =
256 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
258 if ( y_min > es[v].minimum)
259 y_min = es[v].minimum;
261 if ( y_max < es[v].maximum)
262 y_max = es[v].maximum;
265 boxplot = boxplot_create (y_min, y_max, ds_cstr (&title));
269 for (grp = 0; grp < n_cats; ++grp)
274 const struct ccase *c =
275 categoricals_get_case_by_category_real (cmd->cats, iact_idx, grp);
277 const struct exploratory_stats *es =
278 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
280 ds_init_empty (&label);
281 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
283 const struct variable *ivar = iact->vars[ivar_idx];
284 const union value *val = case_data (c, ivar);
286 ds_put_cstr (&label, var_to_string (ivar));
287 ds_put_cstr (&label, " = ");
288 var_append_value_name (ivar, val, &label);
289 ds_put_cstr (&label, "; ");
292 boxplot_add_box (boxplot, es[v].box_whisker, ds_cstr (&label));
297 boxplot_submit (boxplot);
302 show_boxplot_variabled (const struct examine *cmd, int iact_idx)
305 const struct interaction *iact = cmd->iacts[iact_idx];
306 const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
308 for (grp = 0; grp < n_cats; ++grp)
310 struct boxplot *boxplot;
312 double y_min = DBL_MAX;
313 double y_max = -DBL_MAX;
315 const struct ccase *c =
316 categoricals_get_case_by_category_real (cmd->cats, iact_idx, grp);
319 ds_init_empty (&title);
321 for (v = 0; v < cmd->n_dep_vars; ++v)
323 const struct exploratory_stats *es =
324 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
326 if ( y_min > es[v].minimum)
327 y_min = es[v].minimum;
329 if ( y_max < es[v].maximum)
330 y_max = es[v].maximum;
333 if ( iact->n_vars == 0)
334 ds_put_format (&title, _("Boxplot"));
339 ds_init_empty (&label);
340 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
342 const struct variable *ivar = iact->vars[ivar_idx];
343 const union value *val = case_data (c, ivar);
345 ds_put_cstr (&label, var_to_string (ivar));
346 ds_put_cstr (&label, " = ");
347 var_append_value_name (ivar, val, &label);
348 ds_put_cstr (&label, "; ");
351 ds_put_format (&title, _("Boxplot of %s"),
357 boxplot = boxplot_create (y_min, y_max, ds_cstr (&title));
361 for (v = 0; v < cmd->n_dep_vars; ++v)
363 const struct exploratory_stats *es =
364 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
366 boxplot_add_box (boxplot, es[v].box_whisker,
367 var_to_string (cmd->dep_vars[v]));
370 boxplot_submit (boxplot);
376 show_npplot (const struct examine *cmd, int iact_idx)
378 const struct interaction *iact = cmd->iacts[iact_idx];
379 const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
383 for (v = 0; v < cmd->n_dep_vars; ++v)
386 for (grp = 0; grp < n_cats; ++grp)
388 struct chart_item *npp, *dnpp;
389 struct casereader *reader;
393 const struct ccase *c =
394 categoricals_get_case_by_category_real (cmd->cats,
397 const struct exploratory_stats *es =
398 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
401 ds_init_cstr (&label,
402 var_to_string (cmd->dep_vars[v]));
404 if ( iact->n_vars > 0)
406 ds_put_cstr (&label, " (");
407 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
409 const struct variable *ivar = iact->vars[ivar_idx];
410 const union value *val = case_data (c, ivar);
412 ds_put_cstr (&label, var_to_string (ivar));
413 ds_put_cstr (&label, " = ");
414 var_append_value_name (ivar, val, &label);
415 ds_put_cstr (&label, "; ");
418 ds_put_cstr (&label, ")");
422 reader = casewriter_make_reader (np->writer);
425 npp = np_plot_create (np, reader, ds_cstr (&label));
426 dnpp = dnp_plot_create (np, reader, ds_cstr (&label));
428 if (npp == NULL || dnpp == NULL)
430 msg (MW, _("Not creating NP plot because data set is empty."));
431 chart_item_unref (npp);
432 chart_item_unref (dnpp);
436 chart_item_submit (npp);
437 chart_item_submit (dnpp);
447 show_histogram (const struct examine *cmd, int iact_idx)
449 const struct interaction *iact = cmd->iacts[iact_idx];
450 const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
454 for (v = 0; v < cmd->n_dep_vars; ++v)
457 for (grp = 0; grp < n_cats; ++grp)
461 const struct ccase *c =
462 categoricals_get_case_by_category_real (cmd->cats,
465 const struct exploratory_stats *es =
466 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
469 ds_init_cstr (&label,
470 var_to_string (cmd->dep_vars[v]));
472 if ( iact->n_vars > 0)
474 ds_put_cstr (&label, " (");
475 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
477 const struct variable *ivar = iact->vars[ivar_idx];
478 const union value *val = case_data (c, ivar);
480 ds_put_cstr (&label, var_to_string (ivar));
481 ds_put_cstr (&label, " = ");
482 var_append_value_name (ivar, val, &label);
483 ds_put_cstr (&label, "; ");
486 ds_put_cstr (&label, ")");
490 moments_calculate (es[v].mom, &n, &mean, &var, NULL, NULL);
493 ( histogram_chart_create (es[v].histogram->gsl_hist,
494 ds_cstr (&label), n, mean,
504 percentiles_report (const struct examine *cmd, int iact_idx)
506 const struct interaction *iact = cmd->iacts[iact_idx];
508 const int heading_columns = 1 + iact->n_vars + 1;
509 const int heading_rows = 2;
512 const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
514 const int rows_per_cat = 2;
515 const int rows_per_var = n_cats * rows_per_cat;
517 const int nr = heading_rows + cmd->n_dep_vars * rows_per_var;
518 const int nc = heading_columns + cmd->n_percentiles;
520 t = tab_create (nc, nr);
521 tab_title (t, _("Percentiles"));
523 tab_headers (t, heading_columns, 0, heading_rows, 0);
525 /* Internal Vertical lines */
526 tab_box (t, -1, -1, -1, TAL_1,
527 heading_columns, 0, nc - 1, nr - 1);
530 tab_box (t, TAL_2, TAL_2, -1, -1,
531 0, 0, nc - 1, nr - 1);
533 tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
534 tab_vline (t, TAL_2, heading_columns, 0, nr - 1);
536 tab_joint_text (t, heading_columns, 0,
538 TAT_TITLE | TAB_CENTER,
542 tab_hline (t, TAL_1, heading_columns, nc - 1, 1);
545 for (i = 0; i < cmd->n_percentiles; ++i)
547 tab_text_format (t, heading_columns + i, 1,
548 TAT_TITLE | TAB_CENTER,
549 _("%g"), cmd->ptiles[i]);
552 for (i = 0; i < iact->n_vars; ++i)
557 var_to_string (iact->vars[i])
561 tab_vline (t, TAL_1, heading_columns - 1, heading_rows, nr - 1);
564 for (v = 0; v < cmd->n_dep_vars; ++v)
566 const union value **prev_val = xxx0 (iact);
570 tab_hline (t, TAL_1, 0, nc - 1, heading_rows + v * rows_per_var);
573 0, heading_rows + v * rows_per_var,
574 TAT_TITLE | TAB_LEFT,
575 var_to_string (cmd->dep_vars[v])
578 for (i = 0; i < n_cats; ++i)
580 const struct ccase *c =
581 categoricals_get_case_by_category_real (cmd->cats,
584 const struct exploratory_stats *ess =
585 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, i);
587 const struct exploratory_stats *es = ess + v;
589 int diff_idx = xxx1 (iact, c, prev_val);
594 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
596 const struct variable *ivar = iact->vars[ivar_idx];
597 const union value *val = case_data (c, ivar);
599 if (( diff_idx != -1 && diff_idx <= ivar_idx)
603 ds_init_empty (&str);
604 var_append_value_name (ivar, val, &str);
608 heading_rows + v * rows_per_var + i * rows_per_cat,
609 TAT_TITLE | TAB_LEFT,
617 if ( diff_idx != -1 && diff_idx < iact->n_vars)
619 tab_hline (t, TAL_1, 1 + diff_idx, nc - 1,
620 heading_rows + v * rows_per_var + i * rows_per_cat
624 tab_text (t, heading_columns - 1,
625 heading_rows + v * rows_per_var + i * rows_per_cat,
626 TAT_TITLE | TAB_LEFT,
627 gettext (ptile_alg_desc [cmd->pc_alg]));
629 tukey_hinges_calculate (es->hinges, hinges);
631 for (p = 0; p < cmd->n_percentiles; ++p)
633 tab_double (t, heading_columns + p,
634 heading_rows + v * rows_per_var + i * rows_per_cat,
636 percentile_calculate (es->percentiles[p], cmd->pc_alg),
639 if (cmd->ptiles[p] == 25.0)
641 tab_double (t, heading_columns + p,
642 heading_rows + v * rows_per_var + i * rows_per_cat + 1,
647 else if (cmd->ptiles[p] == 50.0)
649 tab_double (t, heading_columns + p,
650 heading_rows + v * rows_per_var + i * rows_per_cat + 1,
655 else if (cmd->ptiles[p] == 75.0)
657 tab_double (t, heading_columns + p,
658 heading_rows + v * rows_per_var + i * rows_per_cat + 1,
666 tab_text (t, heading_columns - 1,
667 heading_rows + v * rows_per_var + i * rows_per_cat + 1,
668 TAT_TITLE | TAB_LEFT,
669 _("Tukey's Hinges"));
678 descriptives_report (const struct examine *cmd, int iact_idx)
680 const struct interaction *iact = cmd->iacts[iact_idx];
682 const int heading_columns = 1 + iact->n_vars + 2;
683 const int heading_rows = 1;
686 size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
688 const int rows_per_cat = 13;
689 const int rows_per_var = n_cats * rows_per_cat;
691 const int nr = heading_rows + cmd->n_dep_vars * rows_per_var;
692 const int nc = 2 + heading_columns;
694 t = tab_create (nc, nr);
695 tab_title (t, _("Descriptives"));
697 tab_headers (t, heading_columns, 0, heading_rows, 0);
699 /* Internal Vertical lines */
700 tab_box (t, -1, -1, -1, TAL_1,
701 heading_columns, 0, nc - 1, nr - 1);
704 tab_box (t, TAL_2, TAL_2, -1, -1,
705 0, 0, nc - 1, nr - 1);
707 tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
708 tab_vline (t, TAL_2, heading_columns, 0, nr - 1);
711 tab_text (t, heading_columns, 0, TAB_CENTER | TAT_TITLE,
714 tab_text (t, heading_columns + 1, 0, TAB_CENTER | TAT_TITLE,
717 for (i = 0; i < iact->n_vars; ++i)
722 var_to_string (iact->vars[i])
726 for (v = 0; v < cmd->n_dep_vars; ++v)
728 const union value **prev_val = xxx0 (iact);
732 tab_hline (t, TAL_1, 0, nc - 1, heading_rows + v * rows_per_var);
735 0, heading_rows + v * rows_per_var,
736 TAT_TITLE | TAB_LEFT,
737 var_to_string (cmd->dep_vars[v])
740 for (i = 0; i < n_cats; ++i)
742 const struct ccase *c =
743 categoricals_get_case_by_category_real (cmd->cats,
746 const struct exploratory_stats *ess =
747 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, i);
749 const struct exploratory_stats *es = ess + v;
751 const int diff_idx = xxx1 (iact, c, prev_val);
753 double m0, m1, m2, m3, m4;
756 moments_calculate (es->mom, &m0, &m1, &m2, &m3, &m4);
758 tval = gsl_cdf_tdist_Qinv ((1.0 - cmd->conf) / 2.0, m0 - 1.0);
760 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
762 const struct variable *ivar = iact->vars[ivar_idx];
763 const union value *val = case_data (c, ivar);
765 if (( diff_idx != -1 && diff_idx <= ivar_idx)
769 ds_init_empty (&str);
770 var_append_value_name (ivar, val, &str);
774 heading_rows + v * rows_per_var + i * rows_per_cat,
775 TAT_TITLE | TAB_LEFT,
783 if ( diff_idx != -1 && diff_idx < iact->n_vars)
785 tab_hline (t, TAL_1, 1 + diff_idx, nc - 1,
786 heading_rows + v * rows_per_var + i * rows_per_cat
792 heading_rows + v * rows_per_var + i * rows_per_cat,
798 1 + iact->n_vars + 2,
799 heading_rows + v * rows_per_var + i * rows_per_cat,
803 1 + iact->n_vars + 3,
804 heading_rows + v * rows_per_var + i * rows_per_cat,
805 0, calc_semean (m2, m0), 0);
809 heading_rows + v * rows_per_var + i * rows_per_cat + 1,
811 _("%g%% Confidence Interval for Mean"),
816 1 + iact->n_vars + 1,
817 heading_rows + v * rows_per_var + i * rows_per_cat + 1,
823 1 + iact->n_vars + 2,
824 heading_rows + v * rows_per_var + i * rows_per_cat + 1,
825 0, m1 - tval * calc_semean (m2, m0), 0);
829 1 + iact->n_vars + 1,
830 heading_rows + v * rows_per_var + i * rows_per_cat + 2,
836 1 + iact->n_vars + 2,
837 heading_rows + v * rows_per_var + i * rows_per_cat + 2,
838 0, m1 + tval * calc_semean (m2, m0), 0);
843 heading_rows + v * rows_per_var + i * rows_per_cat + 3,
849 1 + iact->n_vars + 2,
850 heading_rows + v * rows_per_var + i * rows_per_cat + 3,
852 trimmed_mean_calculate (es->trimmed_mean),
857 heading_rows + v * rows_per_var + i * rows_per_cat + 4,
863 1 + iact->n_vars + 2,
864 heading_rows + v * rows_per_var + i * rows_per_cat + 4,
866 percentile_calculate (es->quartiles[1], cmd->pc_alg),
872 heading_rows + v * rows_per_var + i * rows_per_cat + 5,
878 1 + iact->n_vars + 2,
879 heading_rows + v * rows_per_var + i * rows_per_cat + 5,
884 heading_rows + v * rows_per_var + i * rows_per_cat + 6,
890 1 + iact->n_vars + 2,
891 heading_rows + v * rows_per_var + i * rows_per_cat + 6,
896 heading_rows + v * rows_per_var + i * rows_per_cat + 7,
902 1 + iact->n_vars + 2,
903 heading_rows + v * rows_per_var + i * rows_per_cat + 7,
910 heading_rows + v * rows_per_var + i * rows_per_cat + 8,
916 1 + iact->n_vars + 2,
917 heading_rows + v * rows_per_var + i * rows_per_cat + 8,
924 heading_rows + v * rows_per_var + i * rows_per_cat + 9,
930 1 + iact->n_vars + 2,
931 heading_rows + v * rows_per_var + i * rows_per_cat + 9,
933 es->maxima[0].val - es->minima[0].val,
938 heading_rows + v * rows_per_var + i * rows_per_cat + 10,
940 _("Interquartile Range")
945 1 + iact->n_vars + 2,
946 heading_rows + v * rows_per_var + i * rows_per_cat + 10,
948 percentile_calculate (es->quartiles[2], cmd->pc_alg) -
949 percentile_calculate (es->quartiles[0], cmd->pc_alg),
957 heading_rows + v * rows_per_var + i * rows_per_cat + 11,
963 1 + iact->n_vars + 2,
964 heading_rows + v * rows_per_var + i * rows_per_cat + 11,
968 1 + iact->n_vars + 3,
969 heading_rows + v * rows_per_var + i * rows_per_cat + 11,
970 0, calc_seskew (m0), 0);
974 heading_rows + v * rows_per_var + i * rows_per_cat + 12,
980 1 + iact->n_vars + 2,
981 heading_rows + v * rows_per_var + i * rows_per_cat + 12,
985 1 + iact->n_vars + 3,
986 heading_rows + v * rows_per_var + i * rows_per_cat + 12,
987 0, calc_sekurt (m0), 0);
995 extremes_report (const struct examine *cmd, int iact_idx)
997 const struct interaction *iact = cmd->iacts[iact_idx];
999 const int heading_columns = 1 + iact->n_vars + 2;
1000 const int heading_rows = 1;
1001 struct tab_table *t;
1003 size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
1005 const int rows_per_cat = 2 * cmd->disp_extremes;
1006 const int rows_per_var = n_cats * rows_per_cat;
1008 const int nr = heading_rows + cmd->n_dep_vars * rows_per_var;
1009 const int nc = 2 + heading_columns;
1011 t = tab_create (nc, nr);
1012 tab_title (t, _("Extreme Values"));
1014 tab_headers (t, heading_columns, 0, heading_rows, 0);
1016 /* Internal Vertical lines */
1017 tab_box (t, -1, -1, -1, TAL_1,
1018 heading_columns, 0, nc - 1, nr - 1);
1020 /* External Frame */
1021 tab_box (t, TAL_2, TAL_2, -1, -1,
1022 0, 0, nc - 1, nr - 1);
1024 tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
1025 tab_vline (t, TAL_2, heading_columns, 0, nr - 1);
1029 tab_text (t, heading_columns, 0, TAB_CENTER | TAT_TITLE,
1030 var_to_string (cmd->id_var));
1032 tab_text (t, heading_columns, 0, TAB_CENTER | TAT_TITLE,
1035 tab_text (t, heading_columns + 1, 0, TAB_CENTER | TAT_TITLE,
1038 for (i = 0; i < iact->n_vars; ++i)
1043 var_to_string (iact->vars[i])
1047 for (v = 0; v < cmd->n_dep_vars; ++v)
1049 const union value **prev_val = xxx0 (iact);
1053 tab_hline (t, TAL_1, 0, nc - 1, heading_rows + v * rows_per_var);
1056 0, heading_rows + v * rows_per_var,
1058 var_to_string (cmd->dep_vars[v])
1061 for (i = 0; i < n_cats; ++i)
1064 const struct ccase *c =
1065 categoricals_get_case_by_category_real (cmd->cats, iact_idx, i);
1067 const struct exploratory_stats *ess =
1068 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, i);
1070 const struct exploratory_stats *es = ess + v;
1072 int diff_idx = xxx1 (iact, c, prev_val);
1074 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
1076 const struct variable *ivar = iact->vars[ivar_idx];
1077 const union value *val = case_data (c, ivar);
1079 if (( diff_idx != -1 && diff_idx <= ivar_idx)
1083 ds_init_empty (&str);
1084 var_append_value_name (ivar, val, &str);
1088 heading_rows + v * rows_per_var + i * rows_per_cat,
1089 TAT_TITLE | TAB_LEFT,
1097 if ( diff_idx != -1 && diff_idx < iact->n_vars)
1099 tab_hline (t, TAL_1, 1 + diff_idx, nc - 1,
1100 heading_rows + v * rows_per_var + i * rows_per_cat
1105 heading_columns - 2,
1106 heading_rows + v * rows_per_var + i * rows_per_cat,
1111 tab_hline (t, TAL_1, heading_columns - 2, nc - 1,
1112 heading_rows + v * rows_per_var + i * rows_per_cat + cmd->disp_extremes
1116 heading_columns - 2,
1117 heading_rows + v * rows_per_var + i * rows_per_cat + cmd->disp_extremes,
1121 for (e = 0 ; e < cmd->disp_extremes; ++e)
1124 heading_columns - 1,
1125 heading_rows + v * rows_per_var + i * rows_per_cat + e,
1130 /* The casenumber */
1133 heading_rows + v * rows_per_var + i * rows_per_cat + e,
1135 es->maxima[e].identity,
1140 heading_columns + 1,
1141 heading_rows + v * rows_per_var + i * rows_per_cat + e,
1149 heading_columns - 1,
1150 heading_rows + v * rows_per_var + i * rows_per_cat + cmd->disp_extremes + e,
1155 /* The casenumber */
1158 heading_rows + v * rows_per_var + i * rows_per_cat + cmd->disp_extremes + e,
1160 es->minima[e].identity,
1164 heading_columns + 1,
1165 heading_rows + v * rows_per_var + i * rows_per_cat + cmd->disp_extremes + e,
1178 summary_report (const struct examine *cmd, int iact_idx)
1180 const struct interaction *iact = cmd->iacts[iact_idx];
1182 const int heading_columns = 1 + iact->n_vars;
1183 const int heading_rows = 3;
1184 struct tab_table *t;
1186 const struct fmt_spec *wfmt = cmd->wv ? var_get_print_format (cmd->wv) : &F_8_0;
1188 size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
1190 const int nr = heading_rows + n_cats * cmd->n_dep_vars;
1191 const int nc = 6 + heading_columns;
1193 t = tab_create (nc, nr);
1194 tab_title (t, _("Case Processing Summary"));
1196 tab_headers (t, heading_columns, 0, heading_rows, 0);
1198 /* Internal Vertical lines */
1199 tab_box (t, -1, -1, -1, TAL_1,
1200 heading_columns, 0, nc - 1, nr - 1);
1202 /* External Frame */
1203 tab_box (t, TAL_2, TAL_2, -1, -1,
1204 0, 0, nc - 1, nr - 1);
1206 tab_hline (t, TAL_2, 0, nc - 1, heading_rows);
1207 tab_vline (t, TAL_2, heading_columns, 0, nr - 1);
1209 tab_joint_text (t, heading_columns, 0,
1210 nc - 1, 0, TAB_CENTER | TAT_TITLE, _("Cases"));
1213 heading_columns + 1, 1,
1214 TAB_CENTER | TAT_TITLE, _("Valid"));
1217 heading_columns + 2, 1,
1218 heading_columns + 3, 1,
1219 TAB_CENTER | TAT_TITLE, _("Missing"));
1222 heading_columns + 4, 1,
1223 heading_columns + 5, 1,
1224 TAB_CENTER | TAT_TITLE, _("Total"));
1226 for (i = 0; i < 3; ++i)
1228 tab_text (t, heading_columns + i * 2, 2, TAB_CENTER | TAT_TITLE,
1230 tab_text (t, heading_columns + i * 2 + 1, 2, TAB_CENTER | TAT_TITLE,
1234 for (i = 0; i < iact->n_vars; ++i)
1239 var_to_string (iact->vars[i])
1244 for (v = 0; v < cmd->n_dep_vars; ++v)
1247 const union value **prev_val = xxx0 (iact);
1250 tab_hline (t, TAL_1, 0, nc - 1, heading_rows + v * n_cats);
1253 0, heading_rows + n_cats * v,
1255 var_to_string (cmd->dep_vars[v])
1259 for (i = 0; i < n_cats; ++i)
1262 const struct exploratory_stats *es;
1264 const struct ccase *c =
1265 categoricals_get_case_by_category_real (cmd->cats,
1269 int diff_idx = xxx1 (iact, c, prev_val);
1271 if ( diff_idx != -1 && diff_idx < iact->n_vars - 1)
1272 tab_hline (t, TAL_1, 1 + diff_idx, nc - 1,
1273 heading_rows + n_cats * v + i
1276 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
1278 const struct variable *ivar = iact->vars[ivar_idx];
1279 const union value *val = case_data (c, ivar);
1281 if (( diff_idx != -1 && diff_idx <= ivar_idx)
1285 ds_init_empty (&str);
1286 var_append_value_name (ivar, val, &str);
1289 1 + ivar_idx, heading_rows + n_cats * v + i,
1290 TAT_TITLE | TAB_LEFT,
1299 es = categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, i);
1302 total = es[v].missing + es[v].non_missing;
1304 heading_columns + 0,
1305 heading_rows + n_cats * v + i,
1312 heading_columns + 1,
1313 heading_rows + n_cats * v + i,
1316 100.0 * es[v].non_missing / total
1321 heading_columns + 2,
1322 heading_rows + n_cats * v + i,
1328 heading_columns + 3,
1329 heading_rows + n_cats * v + i,
1332 100.0 * es[v].missing / total
1335 heading_columns + 4,
1336 heading_rows + n_cats * v + i,
1341 /* This can only be 100% can't it? */
1343 heading_columns + 5,
1344 heading_rows + n_cats * v + i,
1347 100.0 * (es[v].missing + es[v].non_missing)/ total
1352 tab_hline (t, TAL_1, heading_columns, nc - 1, 1);
1353 tab_hline (t, TAL_1, heading_columns, nc - 1, 2);
1359 /* Match a variable.
1360 If the match succeeds, the variable will be placed in VAR.
1361 Returns true if successful */
1363 lex_match_variable (struct lexer *lexer,
1364 const struct dictionary *dict, const struct variable **var)
1366 if (lex_token (lexer) != T_ID)
1370 *var = parse_variable_const (lexer, dict);
1377 /* Attempt to parse an interaction from LEXER */
1378 static struct interaction *
1379 parse_interaction (struct lexer *lexer, struct examine *ex)
1381 const struct variable *v = NULL;
1382 struct interaction *iact = NULL;
1384 if ( lex_match_variable (lexer, ex->dict, &v))
1386 iact = interaction_create (v);
1388 while (lex_match (lexer, T_BY))
1390 if (!lex_match_variable (lexer, ex->dict, &v))
1392 interaction_destroy (iact);
1395 interaction_add_variable (iact, v);
1397 lex_match (lexer, T_COMMA);
1405 create_n (const void *aux1, void *aux2 UNUSED)
1409 const struct examine *examine = aux1;
1410 struct exploratory_stats *es = xcalloc (examine->n_dep_vars, sizeof (*es));
1412 struct caseproto *proto = caseproto_create ();
1413 proto = caseproto_add_width (proto, 0); /* value */
1414 proto = caseproto_add_width (proto, 0); /* id */
1415 proto = caseproto_add_width (proto, 0); /* weight */
1417 for (v = 0; v < examine->n_dep_vars; v++)
1419 struct subcase ordering;
1421 subcase_init (&ordering, 0, 0, SC_ASCEND);
1423 es[v].sorted_writer = sort_create_writer (&ordering, proto);
1424 es[v].sorted_reader = NULL;
1426 es[v].mom = moments_create (MOMENT_KURTOSIS);
1427 es[v].cmin = DBL_MAX;
1429 es[v].maximum = -DBL_MAX;
1430 es[v].minimum = DBL_MAX;
1436 update_n (const void *aux1, void *aux2 UNUSED, void *user_data,
1437 const struct ccase *c, double weight)
1440 const struct examine *examine = aux1;
1441 struct exploratory_stats *es = user_data;
1443 struct caseproto *proto = caseproto_create ();
1444 proto = caseproto_add_width (proto, 0); /* value */
1445 proto = caseproto_add_width (proto, 0); /* id */
1446 proto = caseproto_add_width (proto, 0); /* weight */
1448 for (v = 0; v < examine->n_dep_vars; v++)
1450 struct ccase *outcase = case_create (proto);
1451 const struct variable *var = examine->dep_vars[v];
1452 const double x = case_data (c, var)->f;
1454 if (var_is_value_missing (var, case_data (c, var), examine->exclude))
1456 es[v].missing += weight;
1460 if (x > es[v].maximum)
1463 if (x < es[v].minimum)
1466 es[v].non_missing += weight;
1468 moments_pass_one (es[v].mom, x, weight);
1470 /* Save the value and the casenumber to the writer */
1471 case_data_rw_idx (outcase, EX_VAL)->f = x;
1472 if ( examine->id_idx != -1)
1473 case_data_rw_idx (outcase, EX_ID)->f = case_data_idx (c, examine->id_idx)->f;
1475 case_data_rw_idx (outcase, EX_WT)->f = weight;
1479 if (es[v].cmin > weight)
1480 es[v].cmin = weight;
1482 casewriter_write (es[v].sorted_writer, outcase);
1487 calculate_n (const void *aux1, void *aux2 UNUSED, void *user_data)
1490 const struct examine *examine = aux1;
1491 struct exploratory_stats *es = user_data;
1493 for (v = 0; v < examine->n_dep_vars; v++)
1496 casenumber imin = 0;
1497 double imax = es[v].cc;
1498 struct casereader *reader;
1500 casenumber total_cases;
1502 if (examine->histogram)
1505 histogram_create (10, es[v].minimum, es[v].maximum);
1508 es[v].sorted_reader = casewriter_make_reader (es[v].sorted_writer);
1509 total_cases = casereader_count_cases (casereader_clone (es[v].sorted_reader));
1510 es[v].sorted_writer = NULL;
1512 es[v].maxima = xcalloc (examine->calc_extremes, sizeof (*es[v].maxima));
1513 es[v].minima = xcalloc (examine->calc_extremes, sizeof (*es[v].minima));
1515 for (reader = casereader_clone (es[v].sorted_reader);
1516 (c = casereader_read (reader)) != NULL; case_unref (c))
1518 const double val = case_data_idx (c, EX_VAL)->f;
1519 const double wt = case_data_idx (c, EX_WT)->f; /* FIXME: What about fractional weights ??? */
1521 moments_pass_two (es[v].mom, val, wt);
1523 if (es[v].histogram)
1524 histogram_add (es[v].histogram, val, wt);
1526 if (imin < examine->calc_extremes)
1529 for (x = imin; x < examine->calc_extremes; ++x)
1531 struct extremity *min = &es[v].minima[x];
1533 min->identity = case_data_idx (c, EX_ID)->f;
1539 if (imax < examine->calc_extremes)
1543 for (x = imax; x < imax + wt; ++x)
1545 struct extremity *max;
1547 if (x >= examine->calc_extremes)
1550 max = &es[v].maxima[x];
1552 max->identity = case_data_idx (c, EX_ID)->f;
1556 casereader_destroy (reader);
1558 if (examine->calc_extremes > 0)
1560 assert (es[v].minima[0].val == es[v].minimum);
1561 assert (es[v].maxima[0].val == es[v].maximum);
1565 const int n_os = 5 + examine->n_percentiles;
1566 struct order_stats **os ;
1567 es[v].percentiles = xcalloc (examine->n_percentiles, sizeof (*es[v].percentiles));
1569 es[v].trimmed_mean = trimmed_mean_create (es[v].cc, 0.05);
1571 os = xcalloc (n_os, sizeof *os);
1572 os[0] = &es[v].trimmed_mean->parent;
1574 es[v].quartiles[0] = percentile_create (0.25, es[v].cc);
1575 es[v].quartiles[1] = percentile_create (0.5, es[v].cc);
1576 es[v].quartiles[2] = percentile_create (0.75, es[v].cc);
1578 os[1] = &es[v].quartiles[0]->parent;
1579 os[2] = &es[v].quartiles[1]->parent;
1580 os[3] = &es[v].quartiles[2]->parent;
1582 es[v].hinges = tukey_hinges_create (es[v].cc, es[v].cmin);
1583 os[4] = &es[v].hinges->parent;
1585 for (i = 0; i < examine->n_percentiles; ++i)
1587 es[v].percentiles[i] = percentile_create (examine->ptiles[i] / 100.00, es[v].cc);
1588 os[5 + i] = &es[v].percentiles[i]->parent;
1591 order_stats_accumulate_idx (os, n_os,
1592 casereader_clone (es[v].sorted_reader),
1596 if (examine->boxplot)
1598 struct order_stats *os;
1600 es[v].box_whisker = box_whisker_create (es[v].hinges,
1603 os = &es[v].box_whisker->parent;
1604 order_stats_accumulate_idx (&os, 1,
1605 casereader_clone (es[v].sorted_reader),
1609 if (examine->npplot)
1611 double n, mean, var;
1612 struct order_stats *os;
1614 moments_calculate (es[v].mom, &n, &mean, &var, NULL, NULL);
1616 es[v].np = np_create (n, mean, var);
1618 os = &es[v].np->parent;
1620 order_stats_accumulate_idx (&os, 1,
1621 casereader_clone (es[v].sorted_reader),
1629 run_examine (struct examine *cmd, struct casereader *input)
1633 struct casereader *reader;
1635 struct payload payload;
1636 payload.create = create_n;
1637 payload.update = update_n;
1638 payload.destroy = calculate_n;
1640 cmd->wv = dict_get_weight (cmd->dict);
1644 = categoricals_create (cmd->iacts, cmd->n_iacts,
1645 cmd->wv, cmd->exclude);
1647 categoricals_set_payload (cmd->cats, &payload, cmd, NULL);
1649 if (cmd->casenumbers)
1651 struct ccase *c = casereader_peek (input, 0);
1654 cmd->id_idx = var_get_case_index (cmd->id_var);
1657 cmd->id_idx = case_get_value_cnt (c);
1658 input = casereader_create_arithmetic_sequence (input, 1.0, 1.0);
1664 /* FIXME: Filter out missing factor variables */
1666 /* Remove cases on a listwise basis if requested */
1667 if ( cmd->missing_pw == false)
1668 input = casereader_create_filter_missing (input,
1675 for (reader = casereader_clone (input);
1676 (c = casereader_read (reader)) != NULL; case_unref (c))
1678 categoricals_update (cmd->cats, c);
1680 casereader_destroy (reader);
1681 categoricals_done (cmd->cats);
1683 for (i = 0; i < cmd->n_iacts; ++i)
1685 summary_report (cmd, i);
1687 if (cmd->disp_extremes > 0)
1688 extremes_report (cmd, i);
1690 if (cmd->n_percentiles > 0)
1691 percentiles_report (cmd, i);
1695 switch (cmd->boxplot_mode)
1698 show_boxplot_grouped (cmd, i);
1701 show_boxplot_variabled (cmd, i);
1710 show_histogram (cmd, i);
1713 show_npplot (cmd, i);
1715 if (cmd->descriptives)
1716 descriptives_report (cmd, i);
1721 cmd_examine (struct lexer *lexer, struct dataset *ds)
1723 bool nototals_seen = false;
1724 bool totals_seen = false;
1726 struct interaction **iacts_mem = NULL;
1727 struct examine examine;
1728 bool percentiles_seen = false;
1730 examine.casenumbers = false;
1731 examine.missing_pw = false;
1732 examine.disp_extremes = 0;
1733 examine.calc_extremes = 0;
1734 examine.descriptives = false;
1735 examine.conf = 0.95;
1736 examine.pc_alg = PC_HAVERAGE;
1737 examine.ptiles = NULL;
1738 examine.n_percentiles = 0;
1740 examine.boxplot_mode = BP_GROUPS;
1743 /* Allocate space for the first interaction.
1744 This is interaction is an empty one (for the totals).
1745 If no totals are requested, we will simply ignore this
1748 examine.n_iacts = 1;
1749 examine.iacts = iacts_mem = xzalloc (sizeof (struct interaction *));
1750 examine.iacts[0] = interaction_create (NULL);
1752 examine.exclude = MV_ANY;
1753 examine.histogram = false;
1754 examine.npplot = false;
1755 examine.boxplot = false;
1757 examine.dict = dataset_dict (ds);
1759 /* Accept an optional, completely pointless "/VARIABLES=" */
1760 lex_match (lexer, T_SLASH);
1761 if (lex_match_id (lexer, "VARIABLES"))
1763 if (! lex_force_match (lexer, T_EQUALS) )
1767 if (!parse_variables_const (lexer, examine.dict,
1768 &examine.dep_vars, &examine.n_dep_vars,
1769 PV_NO_DUPLICATE | PV_NUMERIC))
1772 if (lex_match (lexer, T_BY))
1774 struct interaction *iact = NULL;
1777 iact = parse_interaction (lexer, &examine);
1782 xrealloc (iacts_mem,
1783 sizeof (*iacts_mem) * examine.n_iacts);
1785 iacts_mem[examine.n_iacts - 1] = iact;
1792 while (lex_token (lexer) != T_ENDCMD)
1794 lex_match (lexer, T_SLASH);
1796 if (lex_match_id (lexer, "STATISTICS"))
1798 lex_match (lexer, T_EQUALS);
1800 while (lex_token (lexer) != T_ENDCMD
1801 && lex_token (lexer) != T_SLASH)
1803 if (lex_match_id (lexer, "DESCRIPTIVES"))
1805 examine.descriptives = true;
1807 else if (lex_match_id (lexer, "EXTREME"))
1810 if (lex_match (lexer, T_LPAREN))
1812 extr = lex_integer (lexer);
1816 msg (MW, _("%s may not be negative. Using default value (%g)."), "EXTREME", 5.0);
1821 if (! lex_force_match (lexer, T_RPAREN))
1824 examine.disp_extremes = extr;
1826 else if (lex_match_id (lexer, "NONE"))
1829 else if (lex_match (lexer, T_ALL))
1831 if (examine.disp_extremes == 0)
1832 examine.disp_extremes = 5;
1836 lex_error (lexer, NULL);
1841 else if (lex_match_id (lexer, "PERCENTILES"))
1843 percentiles_seen = true;
1844 if (lex_match (lexer, T_LPAREN))
1846 while (lex_is_number (lexer))
1848 double p = lex_number (lexer);
1850 if ( p <= 0 || p >= 100.0)
1853 _("Percentiles must lie in the range (0, 100)"));
1857 examine.n_percentiles++;
1859 xrealloc (examine.ptiles,
1860 sizeof (*examine.ptiles) *
1861 examine.n_percentiles);
1863 examine.ptiles[examine.n_percentiles - 1] = p;
1866 lex_match (lexer, T_COMMA);
1868 if (!lex_force_match (lexer, T_RPAREN))
1872 lex_match (lexer, T_EQUALS);
1874 while (lex_token (lexer) != T_ENDCMD
1875 && lex_token (lexer) != T_SLASH)
1877 if (lex_match_id (lexer, "HAVERAGE"))
1879 examine.pc_alg = PC_HAVERAGE;
1881 else if (lex_match_id (lexer, "WAVERAGE"))
1883 examine.pc_alg = PC_WAVERAGE;
1885 else if (lex_match_id (lexer, "ROUND"))
1887 examine.pc_alg = PC_ROUND;
1889 else if (lex_match_id (lexer, "EMPIRICAL"))
1891 examine.pc_alg = PC_EMPIRICAL;
1893 else if (lex_match_id (lexer, "AEMPIRICAL"))
1895 examine.pc_alg = PC_AEMPIRICAL;
1897 else if (lex_match_id (lexer, "NONE"))
1899 examine.pc_alg = PC_NONE;
1903 lex_error (lexer, NULL);
1908 else if (lex_match_id (lexer, "TOTAL"))
1912 else if (lex_match_id (lexer, "NOTOTAL"))
1914 nototals_seen = true;
1916 else if (lex_match_id (lexer, "MISSING"))
1918 lex_match (lexer, T_EQUALS);
1920 while (lex_token (lexer) != T_ENDCMD
1921 && lex_token (lexer) != T_SLASH)
1923 if (lex_match_id (lexer, "LISTWISE"))
1925 examine.missing_pw = false;
1927 else if (lex_match_id (lexer, "PAIRWISE"))
1929 examine.missing_pw = true;
1931 else if (lex_match_id (lexer, "EXCLUDE"))
1933 examine.exclude = MV_ANY;
1935 else if (lex_match_id (lexer, "INCLUDE"))
1937 examine.exclude = MV_SYSTEM;
1941 lex_error (lexer, NULL);
1946 else if (lex_match_id (lexer, "COMPARE"))
1948 lex_match (lexer, T_EQUALS);
1949 if (lex_match_id (lexer, "VARIABLES"))
1951 examine.boxplot_mode = BP_VARIABLES;
1953 else if (lex_match_id (lexer, "GROUPS"))
1955 examine.boxplot_mode = BP_GROUPS;
1959 lex_error (lexer, NULL);
1963 else if (lex_match_id (lexer, "PLOT"))
1965 lex_match (lexer, T_EQUALS);
1967 while (lex_token (lexer) != T_ENDCMD
1968 && lex_token (lexer) != T_SLASH)
1970 if (lex_match_id (lexer, "BOXPLOT"))
1972 examine.boxplot = true;
1974 else if (lex_match_id (lexer, "NPPLOT"))
1976 examine.npplot = true;
1978 else if (lex_match_id (lexer, "HISTOGRAM"))
1980 examine.histogram = true;
1982 else if (lex_match_id (lexer, "NONE"))
1984 examine.histogram = false;
1985 examine.npplot = false;
1986 examine.boxplot = false;
1988 else if (lex_match (lexer, T_ALL))
1990 examine.histogram = true;
1991 examine.npplot = true;
1992 examine.boxplot = true;
1996 lex_error (lexer, NULL);
1999 lex_match (lexer, T_COMMA);
2002 else if (lex_match_id (lexer, "CINTERVAL"))
2004 if ( !lex_force_num (lexer))
2007 examine.conf = lex_number (lexer);
2010 else if (lex_match_id (lexer, "ID"))
2012 lex_match (lexer, T_EQUALS);
2014 examine.id_var = parse_variable_const (lexer, examine.dict);
2018 lex_error (lexer, NULL);
2023 if ( totals_seen && nototals_seen)
2025 msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
2029 /* If totals have been requested or if there are no factors
2030 in this analysis, then the totals need to be included. */
2031 if ( !nototals_seen || examine.n_iacts == 1)
2033 examine.iacts = &iacts_mem[0];
2038 examine.iacts = &iacts_mem[1];
2042 if (examine.disp_extremes > 0)
2044 examine.calc_extremes = examine.disp_extremes;
2045 examine.casenumbers = true;
2048 if (examine.boxplot)
2050 examine.casenumbers = true;
2054 if (examine.descriptives && examine.calc_extremes == 0)
2056 /* Descriptives always displays the max and min */
2057 examine.calc_extremes = 1;
2060 if (percentiles_seen && examine.n_percentiles == 0)
2062 examine.n_percentiles = 7;
2063 examine.ptiles = xcalloc (examine.n_percentiles,
2064 sizeof (*examine.ptiles));
2066 examine.ptiles[0] = 5;
2067 examine.ptiles[1] = 10;
2068 examine.ptiles[2] = 25;
2069 examine.ptiles[3] = 50;
2070 examine.ptiles[4] = 75;
2071 examine.ptiles[5] = 90;
2072 examine.ptiles[6] = 95;
2075 assert (examine.calc_extremes >= examine.disp_extremes);
2077 struct casegrouper *grouper;
2078 struct casereader *group;
2081 grouper = casegrouper_create_splits (proc_open (ds), examine.dict);
2082 while (casegrouper_get_next_group (grouper, &group))
2083 run_examine (&examine, group);
2084 ok = casegrouper_destroy (grouper);
2085 ok = proc_commit (ds) && ok;