2 PSPP - a program for statistical analysis.
3 Copyright (C) 2012, 2013, 2016, 2019 Free Software Foundation, Inc.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
22 #include <gsl/gsl_cdf.h>
24 #include "libpspp/assertion.h"
25 #include "libpspp/message.h"
26 #include "libpspp/pool.h"
29 #include "data/dataset.h"
30 #include "data/dictionary.h"
31 #include "data/casegrouper.h"
32 #include "data/casereader.h"
33 #include "data/casewriter.h"
34 #include "data/caseproto.h"
35 #include "data/subcase.h"
38 #include "data/format.h"
40 #include "math/interaction.h"
41 #include "math/box-whisker.h"
42 #include "math/categoricals.h"
43 #include "math/chart-geometry.h"
44 #include "math/histogram.h"
45 #include "math/moments.h"
47 #include "math/sort.h"
48 #include "math/order-stats.h"
49 #include "math/percentiles.h"
50 #include "math/shapiro-wilk.h"
51 #include "math/tukey-hinges.h"
52 #include "math/trimmed-mean.h"
54 #include "output/charts/boxplot.h"
55 #include "output/charts/np-plot.h"
56 #include "output/charts/spreadlevel-plot.h"
57 #include "output/charts/plot-hist.h"
59 #include "language/command.h"
60 #include "language/lexer/lexer.h"
61 #include "language/lexer/value-parser.h"
62 #include "language/lexer/variable-parser.h"
64 #include "output/pivot-table.h"
67 #define _(msgid) gettext (msgid)
68 #define N_(msgid) msgid
/* Appends to STR the name of value VAL of variable VAR, as produced by
   var_append_value_name.  When var_is_value_missing reports VAL as missing
   for VAR, the translated suffix (missing) is appended as well. */
71 append_value_name (const struct variable *var, const union value *val, struct string *str)
73 var_append_value_name (var, val, str);
74 if (var_is_value_missing (var, val))
75 ds_put_cstr (str, _(" (missing)"));
85 /* Indices for the ex_proto member (below) */
/* Bit flags tested with & against the plot member of struct examine
   (see calculate_n and run_examine) to select which charts are produced.
   Each flag occupies a distinct bit so that they can be combined. */
94 #define PLOT_HISTOGRAM 0x1
95 #define PLOT_BOXPLOT 0x2
96 #define PLOT_NPPLOT 0x4
97 #define PLOT_SPREADLEVEL 0x8
103 /* A caseproto used to contain the data subsets under examination,
105 struct caseproto *ex_proto;
108 const struct variable **dep_vars;
111 struct interaction **iacts;
113 enum mv_class dep_excl;
114 enum mv_class fctr_excl;
116 const struct dictionary *dict;
118 struct categoricals *cats;
120 /* how many extremities to display */
129 /* The case index of the ID value (or -1) if not applicable */
135 size_t n_percentiles;
140 enum bp_mode boxplot_mode;
142 const struct variable *id_var;
144 const struct variable *wv;
149 /* The value of this extremity */
152 /* Either the casenumber or the value of the variable specified
153 by the /ID subcommand which corresponds to this extremity */
154 union value identity;
157 struct exploratory_stats
164 /* Most operations need a sorted reader/writer */
165 struct casewriter *sorted_writer;
166 struct casereader *sorted_reader;
168 struct extremity *minima;
169 struct extremity *maxima;
172 Minimum should always equal minima[0].val.
173 Likewise, maximum should always equal maxima[0].val.
174 This redundancy exists as an optimisation effort.
175 Some statistics (eg histogram) require early calculation
181 struct trimmed_mean *trimmed_mean;
182 struct percentile *quartiles[3];
183 struct percentile **percentiles;
184 struct shapiro_wilk *shapiro_wilk;
186 struct tukey_hinges *hinges;
188 /* The data for the NP Plots */
191 struct histogram *histogram;
193 /* The data for the box plots */
194 struct box_whisker *box_whisker;
199 /* The minimum weight */
/* Produces one boxplot chart per dependent variable of CMD for the
   interaction with index IACT_IDX.  Each chart holds one box per category
   of that interaction, and its y range spans the smallest minimum and the
   largest maximum found over all categories. */
204 show_boxplot_grouped (const struct examine *cmd, int iact_idx)
208 const struct interaction *iact = cmd->iacts[iact_idx];
209 const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
211 for (v = 0; v < cmd->n_dep_vars; ++v)
/* Start from an inverted range so that any real minimum or maximum
   replaces these sentinels in the comparisons below. */
213 double y_min = DBL_MAX;
214 double y_max = -DBL_MAX;
216 struct boxplot *boxplot;
218 ds_init_empty (&title);
/* The title mentions the interaction only when it has at least one
   variable; otherwise it names the dependent variable alone. */
220 if (iact->n_vars > 0)
223 ds_init_empty (&istr);
224 interaction_to_string (iact, &istr);
225 ds_put_format (&title, _("Boxplot of %s vs. %s"),
226 var_to_string (cmd->dep_vars[v]),
231 ds_put_format (&title, _("Boxplot of %s"), var_to_string (cmd->dep_vars[v]));
/* First pass over the categories: find the overall y range. */
233 for (grp = 0; grp < n_cats; ++grp)
235 const struct exploratory_stats *es =
236 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
238 if (y_min > es[v].minimum)
239 y_min = es[v].minimum;
241 if (y_max < es[v].maximum)
242 y_max = es[v].maximum;
245 boxplot = boxplot_create (y_min, y_max, ds_cstr (&title));
/* Second pass over the categories: add one labelled box for each. */
249 for (grp = 0; grp < n_cats; ++grp)
254 const struct ccase *c =
255 categoricals_get_case_by_category_real (cmd->cats, iact_idx, grp);
257 struct exploratory_stats *es =
258 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
260 ds_init_empty (&label);
/* The label lists the value name of every interaction variable, with a
   semicolon and a space between consecutive entries (none after the
   last one). */
261 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
264 const struct variable *ivar = iact->vars[ivar_idx];
265 const union value *val = case_data (c, ivar);
268 append_value_name (ivar, val, &l);
269 ds_ltrim (&l, ss_cstr (" "));
271 ds_put_substring (&label, l.ss);
272 if (ivar_idx < iact->n_vars - 1)
273 ds_put_cstr (&label, "; ");
278 boxplot_add_box (boxplot, es[v].box_whisker, ds_cstr (&label));
/* NOTE(review): the pointer is cleared right after boxplot_add_box,
   presumably because the chart now owns the box_whisker;
   cleanup_exploratory_stats only destroys a non-null box_whisker.
   Confirm against boxplot_add_box. */
279 es[v].box_whisker = NULL;
284 boxplot_submit (boxplot);
/* Produces one boxplot chart per category of the interaction with index
   IACT_IDX.  Each chart holds one box per dependent variable of CMD,
   labelled with the name of that variable, and its y range spans the
   minimum and maximum over all dependent variables in the category. */
289 show_boxplot_variabled (const struct examine *cmd, int iact_idx)
292 const struct interaction *iact = cmd->iacts[iact_idx];
293 const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
295 for (grp = 0; grp < n_cats; ++grp)
297 struct boxplot *boxplot;
/* Inverted sentinels: the comparisons below replace them with real data. */
299 double y_min = DBL_MAX;
300 double y_max = -DBL_MAX;
302 const struct ccase *c =
303 categoricals_get_case_by_category_real (cmd->cats, iact_idx, grp);
306 ds_init_empty (&title);
/* First pass over the dependent variables: find the overall y range. */
308 for (v = 0; v < cmd->n_dep_vars; ++v)
310 const struct exploratory_stats *es =
311 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
313 if (y_min > es[v].minimum)
314 y_min = es[v].minimum;
316 if (y_max < es[v].maximum)
317 y_max = es[v].maximum;
/* Without interaction variables the chart gets a generic title; otherwise
   a label of the form  name = value;  is built for every interaction
   variable of this category. */
320 if (iact->n_vars == 0)
321 ds_put_format (&title, _("Boxplot"));
326 ds_init_empty (&label);
327 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
329 const struct variable *ivar = iact->vars[ivar_idx];
330 const union value *val = case_data (c, ivar);
332 ds_put_cstr (&label, var_to_string (ivar));
333 ds_put_cstr (&label, " = ");
334 append_value_name (ivar, val, &label);
335 ds_put_cstr (&label, "; ");
338 ds_put_format (&title, _("Boxplot of %s"),
344 boxplot = boxplot_create (y_min, y_max, ds_cstr (&title));
/* Second pass: add one box per dependent variable. */
348 for (v = 0; v < cmd->n_dep_vars; ++v)
350 struct exploratory_stats *es =
351 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
353 boxplot_add_box (boxplot, es[v].box_whisker,
354 var_to_string (cmd->dep_vars[v]));
/* NOTE(review): cleared after the hand-over, presumably because the chart
   now owns the box_whisker; confirm against boxplot_add_box. */
355 es[v].box_whisker = NULL;
358 boxplot_submit (boxplot);
/* For every dependent variable of CMD and every category of the
   interaction with index IACT_IDX, creates a normal probability chart
   (np_plot_create) and a second chart (dnp_plot_create) from the same
   data.  A warning is issued when either chart could not be created. */
364 show_npplot (const struct examine *cmd, int iact_idx)
366 const struct interaction *iact = cmd->iacts[iact_idx];
367 const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
371 for (v = 0; v < cmd->n_dep_vars; ++v)
374 for (grp = 0; grp < n_cats; ++grp)
376 struct chart *npp, *dnpp;
377 struct casereader *reader;
381 const struct ccase *c =
382 categoricals_get_case_by_category_real (cmd->cats,
385 const struct exploratory_stats *es =
386 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
/* The label starts with the dependent variable name.  When the interaction
   has variables, a parenthesised list of  name = value;  pairs for this
   category follows. */
389 ds_init_cstr (&label,
390 var_to_string (cmd->dep_vars[v]));
392 if (iact->n_vars > 0)
394 ds_put_cstr (&label, " (");
395 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
397 const struct variable *ivar = iact->vars[ivar_idx];
398 const union value *val = case_data (c, ivar);
400 ds_put_cstr (&label, var_to_string (ivar));
401 ds_put_cstr (&label, " = ");
402 append_value_name (ivar, val, &label);
403 ds_put_cstr (&label, "; ");
406 ds_put_cstr (&label, ")");
/* Both charts are built from one reader over the data written to
   np->writer. */
410 reader = casewriter_make_reader (np->writer);
413 npp = np_plot_create (np, reader, ds_cstr (&label));
414 dnpp = dnp_plot_create (np, reader, ds_cstr (&label));
416 if (npp == NULL || dnpp == NULL)
418 msg (MW, _("Not creating NP plot because data set is empty."));
427 casereader_destroy (reader);
/* Creates one spread-versus-level plot per dependent variable of CMD for
   the interaction with index IACT_IDX.  Every category of the interaction
   contributes one point made of its interquartile range and its median,
   both obtained with the percentile algorithm cmd->pc_alg.  The plot is
   created with the power cmd->sl_power. */
435 show_spreadlevel (const struct examine *cmd, int iact_idx)
437 const struct interaction *iact = cmd->iacts[iact_idx];
438 const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
442 /* Spreadlevel when there are no levels is not useful */
443 if (iact->n_vars == 0)
446 for (v = 0; v < cmd->n_dep_vars; ++v)
/* Label: dependent variable name, followed by the interaction in
   parentheses when it has variables. */
452 ds_init_cstr (&label,
453 var_to_string (cmd->dep_vars[v]));
455 if (iact->n_vars > 0)
457 ds_put_cstr (&label, " (");
458 interaction_to_string (iact, &label);
459 ds_put_cstr (&label, ")");
462 sl = spreadlevel_plot_create (ds_cstr (&label), cmd->sl_power);
464 for (grp = 0; grp < n_cats; ++grp)
466 const struct exploratory_stats *es =
467 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
/* quartiles[0], [1] and [2] are created in calculate_n for the
   proportions 0.25, 0.5 and 0.75 respectively. */
469 double median = percentile_calculate (es[v].quartiles[1], cmd->pc_alg);
471 double iqr = percentile_calculate (es[v].quartiles[2], cmd->pc_alg) -
472 percentile_calculate (es[v].quartiles[0], cmd->pc_alg);
474 spreadlevel_plot_add (sl, iqr, median);
478 msg (MW, _("Not creating spreadlevel chart for %s"), ds_cstr (&label));
/* For every dependent variable of CMD and every category of the
   interaction with index IACT_IDX, creates a histogram chart from the
   histogram accumulated for that cell.  The chart is labelled with the
   dependent variable and the category values, and is given the count,
   mean and variance computed by moments_calculate. */
488 show_histogram (const struct examine *cmd, int iact_idx)
490 const struct interaction *iact = cmd->iacts[iact_idx];
491 const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
495 for (v = 0; v < cmd->n_dep_vars; ++v)
498 for (grp = 0; grp < n_cats; ++grp)
502 const struct ccase *c =
503 categoricals_get_case_by_category_real (cmd->cats,
506 const struct exploratory_stats *es =
507 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
/* calculate_n only calls histogram_create for cells with a positive
   non_missing weight, so the histogram member can be null here. */
511 if (es[v].histogram == NULL)
514 ds_init_cstr (&label,
515 var_to_string (cmd->dep_vars[v]));
517 if (iact->n_vars > 0)
519 ds_put_cstr (&label, " (");
520 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
522 const struct variable *ivar = iact->vars[ivar_idx];
523 const union value *val = case_data (c, ivar);
525 ds_put_cstr (&label, var_to_string (ivar));
526 ds_put_cstr (&label, " = ");
527 append_value_name (ivar, val, &label);
528 ds_put_cstr (&label, "; ");
531 ds_put_cstr (&label, ")");
535 moments_calculate (es[v].mom, &n, &mean, &var, NULL, NULL);
538 (histogram_chart_create (es[v].histogram->gsl_hist,
539 ds_cstr (&label), n, mean,
/* Creates a pivot value for VALUE of variable VAR with
   pivot_value_new_var_value.  When var_is_value_missing classifies VALUE
   as MV_USER, MISSING_FOOTNOTE is attached to the new pivot value. */
548 static struct pivot_value *
549 new_value_with_missing_footnote (const struct variable *var,
550 const union value *value,
551 struct pivot_footnote *missing_footnote)
553 struct pivot_value *pv = pivot_value_new_var_value (var, value);
554 if (var_is_value_missing (var, value) == MV_USER)
555 pivot_value_add_footnote (pv, missing_footnote);
/* Adds one row-axis dimension to TABLE for every variable of IACT.  Each
   dimension is titled with the variable, shows its label, and receives one
   leaf category per value that categoricals_get_var_values returns for the
   variable from CATS.  The leaves are created through
   new_value_with_missing_footnote.

   The variables are visited from the last one to the first, so the
   dimension of the first interaction variable is created last. */
560 create_interaction_dimensions (struct pivot_table *table,
561 const struct categoricals *cats,
562 const struct interaction *iact,
563 struct pivot_footnote *missing_footnote)
/* The test  i-- > 0  counts an unsigned index down from n_vars - 1 to 0
   without ever comparing a wrapped-around value. */
565 for (size_t i = iact->n_vars; i-- > 0;)
567 const struct variable *var = iact->vars[i];
568 struct pivot_dimension *d = pivot_dimension_create__ (
569 table, PIVOT_AXIS_ROW, pivot_value_new_variable (var));
570 d->root->show_label = true;
573 union value *values = categoricals_get_var_values (cats, var, &n);
574 for (size_t j = 0; j < n; j++)
575 pivot_category_create_leaf (
576 d->root, new_value_with_missing_footnote (var, &values[j],
/* Creates a footnote on TABLE carrying the text for user-missing values
   and returns the result of pivot_table_create_footnote.  The text is
   marked with N_ and handed to pivot_value_new_text. */
581 static struct pivot_footnote *
582 create_missing_footnote (struct pivot_table *table)
584 return pivot_table_create_footnote (
585 table, pivot_value_new_text (N_("User-missing value.")));
/* Builds and submits the Percentiles pivot table for the interaction with
   index IACT_IDX of CMD.

   Dimensions, in creation order: a column dimension with one leaf per
   requested percentile (cmd->ptiles), a row dimension with the two
   statistics (weighted average and Tukey hinges), one row dimension per
   interaction variable, and finally the dependent variables. */
589 percentiles_report (const struct examine *cmd, int iact_idx)
591 struct pivot_table *table = pivot_table_create (N_("Percentiles"));
593 struct pivot_dimension *percentiles = pivot_dimension_create (
594 table, PIVOT_AXIS_COLUMN, N_("Percentiles"));
595 percentiles->root->show_label = true;
596 for (int i = 0; i < cmd->n_percentiles; ++i)
597 pivot_category_create_leaf (
599 pivot_value_new_user_text_nocopy (xasprintf ("%g", cmd->ptiles[i])));
601 pivot_dimension_create (table, PIVOT_AXIS_ROW, N_("Statistics"),
602 N_("Weighted Average"), N_("Tukey's Hinges"));
604 const struct interaction *iact = cmd->iacts[iact_idx];
605 struct pivot_footnote *missing_footnote = create_missing_footnote (table);
606 create_interaction_dimensions (table, cmd->cats, iact, missing_footnote);
608 struct pivot_dimension *dep_dim = pivot_dimension_create (
609 table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
/* One index per dimension; reused for every cell that is put below. */
611 size_t *indexes = xnmalloc (table->n_dimensions, sizeof *indexes);
613 size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
614 for (size_t v = 0; v < cmd->n_dep_vars; ++v)
/* The dependent-variable dimension was created last, hence the last index. */
616 indexes[table->n_dimensions - 1] = pivot_category_create_leaf (
617 dep_dim->root, pivot_value_new_variable (cmd->dep_vars[v]));
619 for (size_t i = 0; i < n_cats; ++i)
/* create_interaction_dimensions creates the dimensions in reverse variable
   order, so (assuming dimensions are indexed in creation order) variable J
   sits J places before the dependent-variable dimension. */
621 for (size_t j = 0; j < iact->n_vars; j++)
623 int idx = categoricals_get_value_index_by_category_real (
624 cmd->cats, iact_idx, i, j);
625 indexes[table->n_dimensions - 2 - j] = idx;
/* The user data of a category is an array with one element per dependent
   variable; pick the element for variable V. */
628 const struct exploratory_stats *ess
629 = categoricals_get_user_data_by_category_real (
630 cmd->cats, iact_idx, i);
631 const struct exploratory_stats *es = ess + v;
634 tukey_hinges_calculate (es->hinges, hinges);
636 for (size_t pc_idx = 0; pc_idx < cmd->n_percentiles; ++pc_idx)
641 double value = percentile_calculate (es->percentiles[pc_idx],
643 pivot_table_put (table, indexes, table->n_dimensions,
644 pivot_value_new_number (value));
/* A hinge is reported only for the percentiles 25, 50 and 75, which map to
   hinges[0], hinges[1] and hinges[2]. */
646 double hinge = (cmd->ptiles[pc_idx] == 25.0 ? hinges[0]
647 : cmd->ptiles[pc_idx] == 50.0 ? hinges[1]
648 : cmd->ptiles[pc_idx] == 75.0 ? hinges[2]
653 pivot_table_put (table, indexes, table->n_dimensions,
654 pivot_value_new_number (hinge));
662 pivot_table_submit (table);
/* Builds and submits the Tests of Normality pivot table for the
   interaction with index IACT_IDX of CMD.  For every dependent variable
   and category it puts three numbers taken from the Shapiro-Wilk
   statistic of that cell: the value returned by shapiro_wilk_calculate,
   the count sw->n, and the result of shapiro_wilk_significance. */
666 normality_report (const struct examine *cmd, int iact_idx)
668 struct pivot_table *table = pivot_table_create (N_("Tests of Normality"));
670 struct pivot_dimension *test =
671 pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Shapiro-Wilk"),
673 N_("df"), PIVOT_RC_COUNT,
676 test->root->show_label = true;
678 const struct interaction *iact = cmd->iacts[iact_idx];
679 struct pivot_footnote *missing_footnote = create_missing_footnote (table);
680 create_interaction_dimensions (table, cmd->cats, iact, missing_footnote);
682 struct pivot_dimension *dep_dim = pivot_dimension_create (
683 table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
/* One index per dimension; reused for every cell that is put below. */
685 size_t *indexes = xnmalloc (table->n_dimensions, sizeof *indexes);
687 size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
688 for (size_t v = 0; v < cmd->n_dep_vars; ++v)
/* The dependent-variable dimension was created last, hence the last index. */
690 indexes[table->n_dimensions - 1] =
691 pivot_category_create_leaf (dep_dim->root, pivot_value_new_variable (cmd->dep_vars[v]));
693 for (size_t i = 0; i < n_cats; ++i)
697 const struct exploratory_stats *es
698 = categoricals_get_user_data_by_category_real (
699 cmd->cats, iact_idx, i);
/* NOTE(review): calculate_n treats shapiro_wilk as possibly null (it
   checks the pointer before using it); confirm that SW is checked before
   the calls below. */
701 struct shapiro_wilk *sw = es[v].shapiro_wilk;
706 double w = shapiro_wilk_calculate (sw);
711 pivot_table_put (table, indexes, table->n_dimensions,
712 pivot_value_new_number (w));
715 pivot_table_put (table, indexes, table->n_dimensions,
716 pivot_value_new_number (sw->n));
719 pivot_table_put (table, indexes, table->n_dimensions,
720 pivot_value_new_number (shapiro_wilk_significance (sw->n, w)));
726 pivot_table_submit (table);
/* Builds and submits the Descriptives pivot table for the interaction
   with index IACT_IDX of CMD.

   Dimension 0 (Aspect) has the columns Statistic and Std. Error.
   Dimension 1 (Statistics) lists the rows: the mean, the bounds of its
   confidence interval, the trimmed mean, median, variance, standard
   deviation, minimum, maximum, range, interquartile range, skewness and
   kurtosis.  The interaction dimensions and the dependent variables
   follow. */
731 descriptives_report (const struct examine *cmd, int iact_idx)
733 struct pivot_table *table = pivot_table_create (N_("Descriptives"));
735 pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Aspect"),
736 N_("Statistic"), N_("Std. Error"));
738 struct pivot_dimension *statistics = pivot_dimension_create (
739 table, PIVOT_AXIS_ROW, N_("Statistics"), N_("Mean"));
740 struct pivot_category *interval = pivot_category_create_group__ (
742 pivot_value_new_text_format (N_("%g%% Confidence Interval for Mean"),
744 pivot_category_create_leaves (interval, N_("Lower Bound"),
746 pivot_category_create_leaves (
747 statistics->root, N_("5% Trimmed Mean"), N_("Median"), N_("Variance"),
748 N_("Std. Deviation"), N_("Minimum"), N_("Maximum"), N_("Range"),
749 N_("Interquartile Range"), N_("Skewness"), N_("Kurtosis"));
751 const struct interaction *iact = cmd->iacts[iact_idx];
752 struct pivot_footnote *missing_footnote = create_missing_footnote (table);
753 create_interaction_dimensions (table, cmd->cats, iact, missing_footnote);
755 struct pivot_dimension *dep_dim = pivot_dimension_create (
756 table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
/* One index per dimension; reused for every cell that is put below. */
758 size_t *indexes = xnmalloc (table->n_dimensions, sizeof *indexes);
760 size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
761 for (size_t v = 0; v < cmd->n_dep_vars; ++v)
/* The dependent-variable dimension was created last, hence the last index. */
763 indexes[table->n_dimensions - 1] = pivot_category_create_leaf (
764 dep_dim->root, pivot_value_new_variable (cmd->dep_vars[v]));
766 for (size_t i = 0; i < n_cats; ++i)
/* create_interaction_dimensions creates the dimensions in reverse variable
   order, so (assuming dimensions are indexed in creation order) variable J
   sits J places before the dependent-variable dimension. */
768 for (size_t j = 0; j < iact->n_vars; j++)
770 int idx = categoricals_get_value_index_by_category_real (
771 cmd->cats, iact_idx, i, j);
772 indexes[table->n_dimensions - 2 - j] = idx;
775 const struct exploratory_stats *ess
776 = categoricals_get_user_data_by_category_real (cmd->cats,
778 const struct exploratory_stats *es = ess + v;
/* Other callers in this file receive count, mean and variance through the
   first three outputs of moments_calculate; presumably M3 and M4 are
   skewness and kurtosis (TODO confirm). */
780 double m0, m1, m2, m3, m4;
781 moments_calculate (es->mom, &m0, &m1, &m2, &m3, &m4);
/* TVAL is the t quantile whose upper tail has probability (1 - conf) / 2
   with m0 - 1 degrees of freedom.  It scales the standard error of the
   mean into the two confidence bounds below.

   NOTE(review): minima[0] and maxima[0] are read unconditionally below, so
   calc_extremes must be at least 1 whenever this report runs; confirm in
   the command parser. */
782 double tval = gsl_cdf_tdist_Qinv ((1.0 - cmd->conf) / 2.0, m0 - 1.0);
792 { 0, 1, calc_semean (m2, m0) },
793 { 1, 0, m1 - tval * calc_semean (m2, m0) },
794 { 2, 0, m1 + tval * calc_semean (m2, m0) },
795 { 3, 0, trimmed_mean_calculate (es->trimmed_mean) },
796 { 4, 0, percentile_calculate (es->quartiles[1], cmd->pc_alg) },
799 { 7, 0, es->minima[0].val },
800 { 8, 0, es->maxima[0].val },
801 { 9, 0, es->maxima[0].val - es->minima[0].val },
802 { 10, 0, (percentile_calculate (es->quartiles[2], cmd->pc_alg) -
803 percentile_calculate (es->quartiles[0], cmd->pc_alg)) },
805 { 11, 1, calc_seskew (m0) },
807 { 12, 1, calc_sekurt (m0) },
/* Put every entry: index 0 selects the Aspect column and index 1 the
   Statistics row; the remaining indexes were set above. */
809 for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
811 const struct entry *e = &entries[j];
812 indexes[0] = e->aspect_idx;
813 indexes[1] = e->stat_idx;
814 pivot_table_put (table, indexes, table->n_dimensions,
815 pivot_value_new_number (e->x));
822 pivot_table_submit (table);
/* Builds and submits the Extreme Values pivot table for the interaction
   with index IACT_IDX of CMD.  For every dependent variable and category
   it lists the cmd->disp_extremes highest and lowest values, each with its
   identity.  The identity is shown as a value of cmd->id_var or, in the
   other branch, as an integer under the Case Number heading. */
827 extremes_report (const struct examine *cmd, int iact_idx)
829 struct pivot_table *table = pivot_table_create (N_("Extreme Values"));
831 struct pivot_dimension *statistics = pivot_dimension_create (
832 table, PIVOT_AXIS_COLUMN, N_("Statistics"));
833 pivot_category_create_leaf (statistics->root,
835 ? pivot_value_new_variable (cmd->id_var)
836 : pivot_value_new_text (N_("Case Number"))));
837 pivot_category_create_leaves (statistics->root, N_("Value"));
/* The Order dimension numbers the extremes from 1 to disp_extremes. */
839 struct pivot_dimension *order = pivot_dimension_create (
840 table, PIVOT_AXIS_ROW, N_("Order"));
841 for (size_t i = 0; i < cmd->disp_extremes; i++)
842 pivot_category_create_leaf (order->root, pivot_value_new_integer (i + 1));
844 pivot_dimension_create (table, PIVOT_AXIS_ROW,
845 /* TRANSLATORS: This is a noun, not an adjective. */
847 N_("Highest"), N_("Lowest"));
849 const struct interaction *iact = cmd->iacts[iact_idx];
850 struct pivot_footnote *missing_footnote = create_missing_footnote (table);
851 create_interaction_dimensions (table, cmd->cats, iact, missing_footnote);
853 struct pivot_dimension *dep_dim = pivot_dimension_create (
854 table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
/* One index per dimension; reused for every cell that is put below. */
856 size_t *indexes = xnmalloc (table->n_dimensions, sizeof *indexes);
858 size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
859 for (size_t v = 0; v < cmd->n_dep_vars; ++v)
/* The dependent-variable dimension was created last, hence the last index. */
861 indexes[table->n_dimensions - 1] = pivot_category_create_leaf (
862 dep_dim->root, pivot_value_new_variable (cmd->dep_vars[v]));
864 for (size_t i = 0; i < n_cats; ++i)
/* create_interaction_dimensions creates the dimensions in reverse variable
   order, so (assuming dimensions are indexed in creation order) variable J
   sits J places before the dependent-variable dimension. */
866 for (size_t j = 0; j < iact->n_vars; j++)
868 int idx = categoricals_get_value_index_by_category_real (
869 cmd->cats, iact_idx, i, j);
870 indexes[table->n_dimensions - 2 - j] = idx;
873 const struct exploratory_stats *ess
874 = categoricals_get_user_data_by_category_real (cmd->cats,
876 const struct exploratory_stats *es = ess + v;
878 for (int e = 0 ; e < cmd->disp_extremes; ++e)
/* J = 0 selects the maxima, matching the Highest category that was
   created first; J = 1 selects the minima, matching Lowest. */
882 for (size_t j = 0; j < 2; j++)
884 const struct extremity *extremity
885 = j ? &es->minima[e] : &es->maxima[e];
890 table, indexes, table->n_dimensions,
892 ? new_value_with_missing_footnote (cmd->id_var,
893 &extremity->identity,
895 : pivot_value_new_integer (extremity->identity.f)));
/* Wrap the extreme in a union value so that it is formatted as a value of
   the dependent variable, with the missing footnote when applicable. */
898 union value val = { .f = extremity->val };
900 table, indexes, table->n_dimensions,
901 new_value_with_missing_footnote (cmd->dep_vars[v], &val,
909 pivot_table_submit (table);
/* Builds and submits the Case Processing Summary pivot table for the
   interaction with index IACT_IDX of CMD.  For every dependent variable
   and category it reports the accumulated non_missing and missing weights,
   each as a count and as a percentage of their sum.  The table weight
   variable is the weight variable of the dictionary of CMD. */
914 summary_report (const struct examine *cmd, int iact_idx)
916 struct pivot_table *table = pivot_table_create (
917 N_("Case Processing Summary"));
918 pivot_table_set_weight_var (table, dict_get_weight (cmd->dict));
920 pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"),
921 N_("N"), PIVOT_RC_COUNT,
922 N_("Percent"), PIVOT_RC_PERCENT);
923 struct pivot_dimension *cases = pivot_dimension_create (
924 table, PIVOT_AXIS_COLUMN, N_("Cases"), N_("Valid"), N_("Missing"),
926 cases->root->show_label = true;
928 const struct interaction *iact = cmd->iacts[iact_idx];
929 struct pivot_footnote *missing_footnote = create_missing_footnote (table);
930 create_interaction_dimensions (table, cmd->cats, iact, missing_footnote);
932 struct pivot_dimension *dep_dim = pivot_dimension_create (
933 table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
/* One index per dimension; reused for every cell that is put below. */
935 size_t *indexes = xnmalloc (table->n_dimensions, sizeof *indexes);
937 size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
938 for (size_t v = 0; v < cmd->n_dep_vars; ++v)
/* The dependent-variable dimension was created last, hence the last index. */
940 indexes[table->n_dimensions - 1] = pivot_category_create_leaf (
941 dep_dim->root, pivot_value_new_variable (cmd->dep_vars[v]));
943 for (size_t i = 0; i < n_cats; ++i)
/* create_interaction_dimensions creates the dimensions in reverse variable
   order, so (assuming dimensions are indexed in creation order) variable J
   sits J places before the dependent-variable dimension. */
945 for (size_t j = 0; j < iact->n_vars; j++)
947 int idx = categoricals_get_value_index_by_category_real (
948 cmd->cats, iact_idx, i, j);
949 indexes[table->n_dimensions - 2 - j] = idx;
952 const struct exploratory_stats *es
953 = categoricals_get_user_data_by_category_real (
954 cmd->cats, iact_idx, i);
/* NOTE(review): TOTAL is the divisor of both percentages below; if a cell
   can have zero accumulated weight the percentages are not finite.
   Confirm whether such cells can occur. */
956 double total = es[v].missing + es[v].non_missing;
964 { 0, 0, es[v].non_missing },
965 { 1, 0, 100.0 * es[v].non_missing / total },
966 { 0, 1, es[v].missing },
967 { 1, 1, 100.0 * es[v].missing / total },
/* Put every entry: index 0 selects the Statistics column (N or Percent)
   and index 1 the Cases column; the remaining indexes were set above. */
971 for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
973 const struct entry *e = &entries[j];
974 indexes[0] = e->stat_idx;
975 indexes[1] = e->case_idx;
976 pivot_table_put (table, indexes, table->n_dimensions,
977 pivot_value_new_number (e->x));
984 pivot_table_submit (table);
987 /* Attempt to parse an interaction from LEXER */
/* Variables are looked up in the dictionary of EX.  The first matched
   variable starts a new interaction; every following BY token must be
   followed by another variable, which is added to the interaction.  When a
   BY token is not followed by a variable, the interaction built so far is
   destroyed.  A comma after the interaction is consumed if present. */
988 static struct interaction *
989 parse_interaction (struct lexer *lexer, struct examine *ex)
991 const struct variable *v = NULL;
992 struct interaction *iact = NULL;
994 if (lex_match_variable (lexer, ex->dict, &v))
996 iact = interaction_create (v);
998 while (lex_match (lexer, T_BY))
1000 if (!lex_match_variable (lexer, ex->dict, &v))
1002 interaction_destroy (iact);
1005 interaction_add_variable (iact, v);
1007 lex_match (lexer, T_COMMA);
/* Payload create callback installed by run_examine.  AUX1 is the struct
   examine of the command; AUX2 is unused.

   Allocates, from the pool of the command, a zeroed array with one
   struct exploratory_stats per dependent variable, and prepares each
   element for accumulation in update_n. */
1015 create_n (const void *aux1, void *aux2 UNUSED)
1019 const struct examine *examine = aux1;
1020 struct exploratory_stats *es = pool_calloc (examine->pool, examine->n_dep_vars, sizeof (*es));
/* Ordering handed to every sort writer created below. */
1021 struct subcase ordering;
1022 subcase_init (&ordering, 0, 0, SC_ASCEND);
1024 for (v = 0; v < examine->n_dep_vars; v++)
1026 es[v].sorted_writer = sort_create_writer (&ordering, examine->ex_proto);
1027 es[v].sorted_reader = NULL;
1029 es[v].mom = moments_create (MOMENT_KURTOSIS);
/* Sentinels: update_n lowers cmin and minimum, and raises maximum, as soon
   as it sees real data. */
1030 es[v].cmin = DBL_MAX;
1032 es[v].maximum = -DBL_MAX;
1033 es[v].minimum = DBL_MAX;
1036 subcase_uninit (&ordering);
/* Payload update callback installed by run_examine.  AUX1 is the struct
   examine of the command, AUX2 is unused, USER_DATA is the array made by
   create_n, C is the case being processed and WEIGHT its weight.

   For every dependent variable whose value in C is not excluded as
   missing, the statistics of that variable are updated and a reduced case
   (value, identity, weight) is written to its sorted writer. */
1041 update_n (const void *aux1, void *aux2 UNUSED, void *user_data,
1042 const struct ccase *c, double weight)
1045 const struct examine *examine = aux1;
1046 struct exploratory_stats *es = user_data;
1048 bool this_case_is_missing = false;
1049 /* LISTWISE missing must be dealt with here */
/* Unless missing_pw is set, every dependent variable is checked up front.
   The weight is added to the missing total of each variable whose value
   is excluded, and the case is flagged. */
1050 if (!examine->missing_pw)
1052 for (v = 0; v < examine->n_dep_vars; v++)
1054 const struct variable *var = examine->dep_vars[v];
1056 if (var_is_value_missing (var, case_data (c, var))
1057 & examine->dep_excl)
1059 es[v].missing += weight;
1060 this_case_is_missing = true;
1065 if (this_case_is_missing)
1068 for (v = 0; v < examine->n_dep_vars; v++)
1070 struct ccase *outcase ;
1071 const struct variable *var = examine->dep_vars[v];
1072 const double x = case_num (c, var);
/* Per-variable check: an excluded value only counts towards the missing
   total of this variable. */
1074 if (var_is_value_missing (var, case_data (c, var)) & examine->dep_excl)
1076 es[v].missing += weight;
1080 outcase = case_create (examine->ex_proto);
1082 if (x > es[v].maximum)
1085 if (x < es[v].minimum)
1088 es[v].non_missing += weight;
1090 moments_pass_one (es[v].mom, x, weight);
1092 /* Save the value and the ID to the writer */
/* EX_VAL, EX_ID and EX_WT index the values of a case built from ex_proto.
   The identity is copied from position id_idx of the input case. */
1093 assert (examine->id_idx != -1);
1094 *case_num_rw_idx (outcase, EX_VAL) = x;
1095 value_copy (case_data_rw_idx (outcase, EX_ID),
1096 case_data_idx (c, examine->id_idx), examine->id_width);
1098 *case_num_rw_idx (outcase, EX_WT) = weight;
/* Track the smallest weight seen; calculate_n passes it to
   tukey_hinges_create. */
1102 if (es[v].cmin > weight)
1103 es[v].cmin = weight;
1105 casewriter_write (es[v].sorted_writer, outcase);
/* Payload calculate callback installed by run_examine.  AUX1 is the
   struct examine of the command, AUX2 is unused and USER_DATA is the array
   made by create_n and filled by update_n.

   For every dependent variable it turns the sorted writer into a reader
   and makes a second moments pass over the data, filling the histogram
   (when one was created) and the extreme values along the way.  It then
   creates the order statistics (trimmed mean, quartiles, Tukey hinges and
   the requested percentiles) and, as needed, the box-whisker, Shapiro-Wilk
   and NP-plot statistics.  Each of these is accumulated from a clone of
   the sorted reader. */
1110 calculate_n (const void *aux1, void *aux2 UNUSED, void *user_data)
1113 const struct examine *examine = aux1;
1114 struct exploratory_stats *es = user_data;
1116 for (v = 0; v < examine->n_dep_vars; v++)
1119 casenumber imin = 0;
1121 struct casereader *reader;
/* A histogram is only prepared when it was requested and the cell has a
   positive non_missing weight.  Its bin width is derived from the data
   range and from 1 + log2 (cc). */
1124 if (examine->plot & PLOT_HISTOGRAM && es[v].non_missing > 0)
1127 double bin_width = fabs (es[v].minimum - es[v].maximum)
1128 / (1 + log2 (es[v].cc))
1132 histogram_create (bin_width, es[v].minimum, es[v].maximum);
/* The writer is converted into the reader; the writer pointer is cleared
   once the reader has been made from it. */
1135 es[v].sorted_reader = casewriter_make_reader (es[v].sorted_writer);
1136 es[v].sorted_writer = NULL;
1138 imax = casereader_get_n_cases (es[v].sorted_reader);
/* Room for calc_extremes entries at each end, with identities of width
   id_width; everything is allocated from the pool of the command. */
1140 es[v].maxima = pool_calloc (examine->pool, examine->calc_extremes, sizeof (*es[v].maxima));
1141 es[v].minima = pool_calloc (examine->pool, examine->calc_extremes, sizeof (*es[v].minima));
1142 for (i = 0; i < examine->calc_extremes; ++i)
1144 value_init_pool (examine->pool, &es[v].maxima[i].identity, examine->id_width) ;
1145 value_init_pool (examine->pool, &es[v].minima[i].identity, examine->id_width) ;
/* Single pass over a clone of the sorted reader. */
1149 for (reader = casereader_clone (es[v].sorted_reader);
1150 (c = casereader_read (reader)) != NULL; case_unref (c))
1152 const double val = case_num_idx (c, EX_VAL);
1153 double wt = case_num_idx (c, EX_WT);
1154 wt = var_force_valid_weight (examine->wv, wt, &warn);
1156 moments_pass_two (es[v].mom, val, wt);
1158 if (es[v].histogram)
1159 histogram_add (es[v].histogram, val, wt);
/* While fewer than calc_extremes cases have been consumed, the current
   case is stored into the remaining minima slots. */
1161 if (imin < examine->calc_extremes)
1164 for (x = imin; x < examine->calc_extremes; ++x)
1166 struct extremity *min = &es[v].minima[x];
1168 value_copy (&min->identity, case_data_idx (c, EX_ID), examine->id_width);
/* Once IMAX is below calc_extremes, the current case fills slot IMAX of
   the maxima. */
1174 if (imax < examine->calc_extremes)
1178 for (x = imax; x < imax + 1; ++x)
1180 struct extremity *max;
1182 if (x >= examine->calc_extremes)
1185 max = &es[v].maxima[x];
1187 value_copy (&max->identity, case_data_idx (c, EX_ID), examine->id_width);
1191 casereader_destroy (reader);
/* Consistency check between the extremes found here and the running
   minimum and maximum kept by update_n. */
1193 if (examine->calc_extremes > 0 && es[v].non_missing > 0)
1195 assert (es[v].minima[0].val == es[v].minimum);
1196 assert (es[v].maxima[0].val == es[v].maximum);
/* Five fixed order statistics (trimmed mean, three quartiles, hinges)
   plus one per requested percentile. */
1200 const int n_os = 5 + examine->n_percentiles;
1201 es[v].percentiles = pool_calloc (examine->pool, examine->n_percentiles, sizeof (*es[v].percentiles));
1203 es[v].trimmed_mean = trimmed_mean_create (es[v].cc, 0.05);
1204 es[v].shapiro_wilk = NULL;
1206 struct order_stats **os = XCALLOC (n_os, struct order_stats *);
1207 os[0] = &es[v].trimmed_mean->parent;
1209 es[v].quartiles[0] = percentile_create (0.25, es[v].cc);
1210 es[v].quartiles[1] = percentile_create (0.5, es[v].cc);
1211 es[v].quartiles[2] = percentile_create (0.75, es[v].cc);
1213 os[1] = &es[v].quartiles[0]->parent;
1214 os[2] = &es[v].quartiles[1]->parent;
1215 os[3] = &es[v].quartiles[2]->parent;
1217 es[v].hinges = tukey_hinges_create (es[v].cc, es[v].cmin);
1218 os[4] = &es[v].hinges->parent;
/* ptiles holds percentages, so they are divided by 100 to obtain the
   proportion expected by percentile_create. */
1220 for (i = 0; i < examine->n_percentiles; ++i)
1222 es[v].percentiles[i] = percentile_create (examine->ptiles[i] / 100.00, es[v].cc);
1223 os[5 + i] = &es[v].percentiles[i]->parent;
1226 order_stats_accumulate_idx (os, n_os,
1227 casereader_clone (es[v].sorted_reader),
/* The box-whisker statistic is built on the hinges computed above. */
1233 if (examine->plot & PLOT_BOXPLOT)
1235 struct order_stats *os;
1237 es[v].box_whisker = box_whisker_create (es[v].hinges,
1238 EX_ID, examine->id_var);
1240 os = &es[v].box_whisker->parent;
1241 order_stats_accumulate_idx (&os, 1,
1242 casereader_clone (es[v].sorted_reader),
/* shapiro_wilk_create may yield a null pointer, hence the check before the
   statistic is accumulated. */
1250 moments_calculate (es[v].mom, NULL, &mean, NULL, NULL, NULL);
1252 es[v].shapiro_wilk = shapiro_wilk_create (es[v].non_missing, mean);
1254 if (es[v].shapiro_wilk)
1256 struct order_stats *os = &es[v].shapiro_wilk->parent;
1257 order_stats_accumulate_idx (&os, 1,
1258 casereader_clone (es[v].sorted_reader),
1263 if (examine->plot & PLOT_NPPLOT)
1265 double n, mean, var;
1266 struct order_stats *os;
1268 moments_calculate (es[v].mom, &n, &mean, &var, NULL, NULL);
1270 es[v].np = np_create (n, mean, var);
1272 os = &es[v].np->parent;
1274 order_stats_accumulate_idx (&os, 1,
1275 casereader_clone (es[v].sorted_reader),
/* Releases the statistics that calculate_n attached to every cell of CMD.
   For each interaction, dependent variable and category it destroys the
   hinges, the three quartiles, the requested percentiles, the
   Shapiro-Wilk statistic (when present), the trimmed mean, the NP data,
   the histogram, the moments, the box-whisker statistic (when still
   present) and the sorted reader. */
1283 cleanup_exploratory_stats (struct examine *cmd)
1286 for (i = 0; i < cmd->n_iacts; ++i)
1289 const size_t n_cats = categoricals_n_count (cmd->cats, i);
1291 for (v = 0; v < cmd->n_dep_vars; ++v)
1294 for (grp = 0; grp < n_cats; ++grp)
1297 const struct exploratory_stats *es =
1298 categoricals_get_user_data_by_category_real (cmd->cats, i, grp);
/* Each order statistic embeds a struct statistic as its parent member;
   destruction goes through the destroy callback of that statistic. */
1300 struct order_stats *os = &es[v].hinges->parent;
1301 struct statistic *stat = &os->parent;
1302 stat->destroy (stat);
1304 for (q = 0; q < 3 ; q++)
1306 os = &es[v].quartiles[q]->parent;
1308 stat->destroy (stat);
1311 for (q = 0; q < cmd->n_percentiles ; q++)
1313 os = &es[v].percentiles[q]->parent;
1315 stat->destroy (stat);
/* shapiro_wilk is optional; calculate_n may leave it null. */
1318 if (es[v].shapiro_wilk)
1320 stat = &es[v].shapiro_wilk->parent.parent;
1321 stat->destroy (stat);
1324 os = &es[v].trimmed_mean->parent;
1326 stat->destroy (stat);
1328 os = &es[v].np->parent;
1332 stat->destroy (stat);
1335 statistic_destroy (&es[v].histogram->parent);
1336 moments_destroy (es[v].mom);
/* The boxplot functions clear box_whisker after adding it to a chart, so
   only a statistic that was never charted is destroyed here. */
1338 if (es[v].box_whisker)
1340 stat = &es[v].box_whisker->parent.parent;
1341 stat->destroy (stat);
1344 casereader_destroy (es[v].sorted_reader);
/* Runs the analysis described by CMD over the cases of INPUT.

   A categoricals object is created for the interactions of CMD with
   create_n, update_n and calculate_n as its payload callbacks.  Every case
   of INPUT is fed to it, and then the reports and charts selected in CMD
   are produced for each interaction.  Finally the per-cell statistics and
   the categoricals object are destroyed. */
1352 run_examine (struct examine *cmd, struct casereader *input)
1356 struct casereader *reader;
/* No destroy callback is installed; the per-cell statistics are released
   by the explicit cleanup_exploratory_stats call at the end. */
1358 struct payload payload;
1359 payload.create = create_n;
1360 payload.update = update_n;
1361 payload.calculate = calculate_n;
1362 payload.destroy = NULL;
1364 cmd->wv = dict_get_weight (cmd->dict);
1367 = categoricals_create (cmd->iacts, cmd->n_iacts, cmd->wv, cmd->fctr_excl);
1369 categoricals_set_payload (cmd->cats, &payload, cmd, NULL);
/* Without an ID variable, id_idx is set to the number of values of an
   input case and INPUT is wrapped in an arithmetic sequence starting at 1
   with step 1.  Presumably this appends a case number at that position
   (TODO confirm against casereader_create_arithmetic_sequence).

   NOTE(review): the result of casereader_peek is used without a null
   check; confirm that INPUT always holds at least one case. */
1371 if (cmd->id_var == NULL)
1373 struct ccase *c = casereader_peek (input, 0);
1375 cmd->id_idx = case_get_n_values (c);
1376 input = casereader_create_arithmetic_sequence (input, 1.0, 1.0);
/* Feed every case to the categoricals object, then finalise it. */
1381 for (reader = input;
1382 (c = casereader_read (reader)) != NULL; case_unref (c))
1384 categoricals_update (cmd->cats, c);
1386 casereader_destroy (reader);
1387 categoricals_done (cmd->cats);
/* Produce the output for every interaction. */
1389 for (i = 0; i < cmd->n_iacts; ++i)
1391 summary_report (cmd, i);
1393 const size_t n_cats = categoricals_n_count (cmd->cats, i);
1397 if (cmd->disp_extremes > 0)
1398 extremes_report (cmd, i);
1400 if (cmd->n_percentiles > 0)
1401 percentiles_report (cmd, i);
1403 if (cmd->plot & PLOT_BOXPLOT)
1405 switch (cmd->boxplot_mode)
1408 show_boxplot_grouped (cmd, i);
1411 show_boxplot_variabled (cmd, i);
1419 if (cmd->plot & PLOT_HISTOGRAM)
1420 show_histogram (cmd, i);
1422 if (cmd->plot & PLOT_NPPLOT)
1423 show_npplot (cmd, i);
1425 if (cmd->plot & PLOT_SPREADLEVEL)
1426 show_spreadlevel (cmd, i);
1428 if (cmd->descriptives)
1429 descriptives_report (cmd, i);
1432 normality_report (cmd, i);
1435 cleanup_exploratory_stats (cmd);
1436 categoricals_destroy (cmd->cats);
/* Parses an EXAMINE command from LEXER and runs it on DS.
   A local struct examine is filled in with defaults, then the dependent
   variables, the optional BY interactions and the subcommands are parsed.
   Finally run_examine is called once for each group produced by the
   splits casegrouper. */
1441 cmd_examine (struct lexer *lexer, struct dataset *ds)
/* Flags recording which subcommands appeared. */
1444 bool nototals_seen = false;
1445 bool totals_seen = false;
1447 struct interaction **iacts_mem = NULL;
1448 struct examine examine;
1449 bool percentiles_seen = false;
/* Default settings. */
1451 examine.missing_pw = false;
1452 examine.disp_extremes = 0;
1453 examine.calc_extremes = 0;
1454 examine.descriptives = false;
1455 examine.conf = 0.95;
1456 examine.pc_alg = PC_HAVERAGE;
1457 examine.ptiles = NULL;
1458 examine.n_percentiles = 0;
1459 examine.id_idx = -1;
1460 examine.id_width = 0;
1461 examine.id_var = NULL;
1462 examine.boxplot_mode = BP_GROUPS;
/* Starts out as an empty case prototype; widths are added after parsing. */
1464 examine.ex_proto = caseproto_create ();
/* Pool that backs the interaction array. */
1466 examine.pool = pool_create ();
1468 /* Allocate space for the first interaction.
1469 This interaction is an empty one (for the totals).
1470 If no totals are requested, we will simply ignore this
1473 examine.n_iacts = 1;
1474 examine.iacts = iacts_mem = pool_zalloc (examine.pool, sizeof (struct interaction *));
1475 examine.iacts[0] = interaction_create (NULL);
1477 examine.dep_excl = MV_ANY;
1478 examine.fctr_excl = MV_ANY;
1480 examine.sl_power = 0;
1481 examine.dep_vars = NULL;
1482 examine.n_dep_vars = 0;
1483 examine.dict = dataset_dict (ds);
1485 /* Accept an optional, completely pointless "/VARIABLES=" */
1486 lex_match (lexer, T_SLASH);
1487 if (lex_match_id (lexer, "VARIABLES"))
1489 if (! lex_force_match (lexer, T_EQUALS))
/* Dependent variables: numeric only, duplicates rejected. */
1493 if (!parse_variables_const (lexer, examine.dict,
1494 &examine.dep_vars, &examine.n_dep_vars,
1495 PV_NO_DUPLICATE | PV_NUMERIC))
/* Factors after BY: each parsed interaction goes into the last slot of
   iacts_mem, which is regrown from the pool. */
1498 if (lex_match (lexer, T_BY))
1500 struct interaction *iact = NULL;
1503 iact = parse_interaction (lexer, &examine);
1508 pool_nrealloc (examine.pool, iacts_mem,
1510 sizeof (*iacts_mem));
1512 iacts_mem[examine.n_iacts - 1] = iact;
/* Subcommands, each optionally introduced by a slash, until the end of
   the command. */
1519 while (lex_token (lexer) != T_ENDCMD)
1521 lex_match (lexer, T_SLASH);
/* STATISTICS: DESCRIPTIVES, EXTREME with an optional parenthesized
   count, NONE or ALL. */
1523 if (lex_match_id (lexer, "STATISTICS"))
1525 lex_match (lexer, T_EQUALS);
1527 while (lex_token (lexer) != T_ENDCMD
1528 && lex_token (lexer) != T_SLASH)
1530 if (lex_match_id (lexer, "DESCRIPTIVES"))
1532 examine.descriptives = true;
1534 else if (lex_match_id (lexer, "EXTREME"))
1537 if (lex_match (lexer, T_LPAREN))
/* The count must be an integer from 0 to INT_MAX. */
1539 if (!lex_force_int_range (lexer, "EXTREME", 0, INT_MAX))
1541 extr = lex_integer (lexer);
1544 if (! lex_force_match (lexer, T_RPAREN))
1547 examine.disp_extremes = extr;
1549 else if (lex_match_id (lexer, "NONE"))
/* ALL: display 5 extremes unless a count has already been set. */
1552 else if (lex_match (lexer, T_ALL))
1554 if (examine.disp_extremes == 0)
1555 examine.disp_extremes = 5;
1559 lex_error (lexer, NULL);
/* PERCENTILES: an optional parenthesized list of numbers, then an
   optional "=" and algorithm keyword. */
1564 else if (lex_match_id (lexer, "PERCENTILES"))
1566 percentiles_seen = true;
1567 if (lex_match (lexer, T_LPAREN))
1569 while (lex_is_number (lexer))
/* Range-check each value against 0 and 100. */
1571 if (!lex_force_num_range_open (lexer, "PERCENTILES", 0, 100))
1573 double p = lex_number (lexer);
/* Grow the percentile array by one element and append P. */
1575 examine.n_percentiles++;
1577 xrealloc (examine.ptiles,
1578 sizeof (*examine.ptiles) *
1579 examine.n_percentiles);
1581 examine.ptiles[examine.n_percentiles - 1] = p;
1584 lex_match (lexer, T_COMMA);
1586 if (!lex_force_match (lexer, T_RPAREN))
1590 lex_match (lexer, T_EQUALS);
/* Percentile algorithm keywords; each one overwrites pc_alg. */
1592 while (lex_token (lexer) != T_ENDCMD
1593 && lex_token (lexer) != T_SLASH)
1595 if (lex_match_id (lexer, "HAVERAGE"))
1597 examine.pc_alg = PC_HAVERAGE;
1599 else if (lex_match_id (lexer, "WAVERAGE"))
1601 examine.pc_alg = PC_WAVERAGE;
1603 else if (lex_match_id (lexer, "ROUND"))
1605 examine.pc_alg = PC_ROUND;
1607 else if (lex_match_id (lexer, "EMPIRICAL"))
1609 examine.pc_alg = PC_EMPIRICAL;
1611 else if (lex_match_id (lexer, "AEMPIRICAL"))
1613 examine.pc_alg = PC_AEMPIRICAL;
1615 else if (lex_match_id (lexer, "NONE"))
1617 examine.pc_alg = PC_NONE;
1621 lex_error (lexer, NULL);
/* TOTAL and NOTOTAL are only recorded here; the conflict between them is
   checked after the subcommand loop. */
1626 else if (lex_match_id (lexer, "TOTAL"))
1630 else if (lex_match_id (lexer, "NOTOTAL"))
1632 nototals_seen = true;
/* MISSING: LISTWISE and PAIRWISE set missing_pw, EXCLUDE and INCLUDE set
   dep_excl, REPORT and NOREPORT set fctr_excl. */
1634 else if (lex_match_id (lexer, "MISSING"))
1636 lex_match (lexer, T_EQUALS);
1638 while (lex_token (lexer) != T_ENDCMD
1639 && lex_token (lexer) != T_SLASH)
1641 if (lex_match_id (lexer, "LISTWISE"))
1643 examine.missing_pw = false;
1645 else if (lex_match_id (lexer, "PAIRWISE"))
1647 examine.missing_pw = true;
1649 else if (lex_match_id (lexer, "EXCLUDE"))
1651 examine.dep_excl = MV_ANY;
1653 else if (lex_match_id (lexer, "INCLUDE"))
1655 examine.dep_excl = MV_SYSTEM;
1657 else if (lex_match_id (lexer, "REPORT"))
1659 examine.fctr_excl = 0;
1661 else if (lex_match_id (lexer, "NOREPORT"))
1663 examine.fctr_excl = MV_ANY;
1667 lex_error (lexer, NULL);
/* COMPARE chooses the box plot layout: VARIABLES or GROUPS. */
1672 else if (lex_match_id (lexer, "COMPARE"))
1674 lex_match (lexer, T_EQUALS);
1675 if (lex_match_id (lexer, "VARIABLES"))
1677 examine.boxplot_mode = BP_VARIABLES;
1679 else if (lex_match_id (lexer, "GROUPS"))
1681 examine.boxplot_mode = BP_GROUPS;
1685 lex_error (lexer, NULL);
/* PLOT: each keyword ORs its flag into examine.plot. */
1689 else if (lex_match_id (lexer, "PLOT"))
1691 lex_match (lexer, T_EQUALS);
1693 while (lex_token (lexer) != T_ENDCMD
1694 && lex_token (lexer) != T_SLASH)
1696 if (lex_match_id (lexer, "BOXPLOT"))
1698 examine.plot |= PLOT_BOXPLOT;
1700 else if (lex_match_id (lexer, "NPPLOT"))
1702 examine.plot |= PLOT_NPPLOT;
1704 else if (lex_match_id (lexer, "HISTOGRAM"))
1706 examine.plot |= PLOT_HISTOGRAM;
/* SPREADLEVEL resets the power to 0, then accepts an optional
   parenthesized number as the new power. */
1708 else if (lex_match_id (lexer, "SPREADLEVEL"))
1710 examine.plot |= PLOT_SPREADLEVEL;
1711 examine.sl_power = 0;
1712 if (lex_match (lexer, T_LPAREN) && lex_force_num (lexer))
1714 examine.sl_power = lex_number (lexer);
1717 if (! lex_force_match (lexer, T_RPAREN))
1721 else if (lex_match_id (lexer, "NONE"))
1725 else if (lex_match (lexer, T_ALL))
1731 lex_error (lexer, NULL);
1734 lex_match (lexer, T_COMMA);
/* CINTERVAL: a number stored directly in examine.conf. */
1737 else if (lex_match_id (lexer, "CINTERVAL"))
1739 if (!lex_force_num (lexer))
1742 examine.conf = lex_number (lexer);
/* ID: the variable whose values identify cases. */
1745 else if (lex_match_id (lexer, "ID"))
1747 lex_match (lexer, T_EQUALS);
1749 examine.id_var = parse_variable_const (lexer, examine.dict);
1753 lex_error (lexer, NULL);
/* Specifying both TOTAL and NOTOTAL is an error. */
1759 if (totals_seen && nototals_seen)
1761 msg (SE, _("%s and %s are mutually exclusive"), "TOTAL", "NOTOTAL");
1765 /* If totals have been requested or if there are no factors
1766 in this analysis, then the totals need to be included. */
1767 if (!nototals_seen || examine.n_iacts == 1)
1769 examine.iacts = &iacts_mem[0];
/* Otherwise start at the second slot and destroy the leading totals
   interaction, which is not used. */
1774 examine.iacts = &iacts_mem[1];
1775 interaction_destroy (iacts_mem[0]);
/* Case index and width of the ID variable. */
1781 examine.id_idx = var_get_case_index (examine.id_var);
1782 examine.id_width = var_get_width (examine.id_var);
/* The prototype gets three values; the id value takes the ID width. */
1785 examine.ex_proto = caseproto_add_width (examine.ex_proto, 0); /* value */
1786 examine.ex_proto = caseproto_add_width (examine.ex_proto, examine.id_width); /* id */
1787 examine.ex_proto = caseproto_add_width (examine.ex_proto, 0); /* weight */
/* Compute at least as many extremes as will be displayed. */
1790 if (examine.disp_extremes > 0)
1792 examine.calc_extremes = examine.disp_extremes;
1795 if (examine.descriptives && examine.calc_extremes == 0)
1797 /* Descriptives always displays the max and min */
1798 examine.calc_extremes = 1;
/* PERCENTILES was given without explicit values: fall back to the default
   set of seven percentiles. */
1801 if (percentiles_seen && examine.n_percentiles == 0)
1803 examine.n_percentiles = 7;
1804 examine.ptiles = xcalloc (examine.n_percentiles, sizeof (*examine.ptiles));
1806 examine.ptiles[0] = 5;
1807 examine.ptiles[1] = 10;
1808 examine.ptiles[2] = 25;
1809 examine.ptiles[3] = 50;
1810 examine.ptiles[4] = 75;
1811 examine.ptiles[5] = 90;
1812 examine.ptiles[6] = 95;
1815 assert (examine.calc_extremes >= examine.disp_extremes);
1817 struct casegrouper *grouper;
1818 struct casereader *group;
/* Run the analysis once for every group the splits casegrouper yields. */
1821 grouper = casegrouper_create_splits (proc_open (ds), examine.dict);
1822 while (casegrouper_get_next_group (grouper, &group))
1823 run_examine (&examine, group);
/* proc_commit is called even when destroying the grouper reported
   failure; OK ends up true only if both succeeded. */
1824 ok = casegrouper_destroy (grouper);
1825 ok = proc_commit (ds) && ok;
/* Release everything this command allocated. */
1828 caseproto_unref (examine.ex_proto);
1830 for (i = 0; i < examine.n_iacts; ++i)
1831 interaction_destroy (examine.iacts[i]);
1832 free (examine.ptiles);
1833 free (examine.dep_vars);
1834 pool_destroy (examine.pool);
/* NOTE(review): second release sequence, presumably the failure exit --
   confirm.  It first points examine.iacts back at iacts_mem so that the
   loop starts from the first slot. */
1839 caseproto_unref (examine.ex_proto);
1840 examine.iacts = iacts_mem;
1841 for (i = 0; i < examine.n_iacts; ++i)
1842 interaction_destroy (examine.iacts[i]);
1843 free (examine.dep_vars);
1844 free (examine.ptiles);
1845 pool_destroy (examine.pool);