2 PSPP - a program for statistical analysis.
3 Copyright (C) 2012, 2013, 2016 Free Software Foundation, Inc.
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
22 #include <gsl/gsl_cdf.h>
24 #include "libpspp/assertion.h"
25 #include "libpspp/message.h"
26 #include "libpspp/pool.h"
29 #include "data/dataset.h"
30 #include "data/dictionary.h"
31 #include "data/casegrouper.h"
32 #include "data/casereader.h"
33 #include "data/casewriter.h"
34 #include "data/caseproto.h"
35 #include "data/subcase.h"
38 #include "data/format.h"
40 #include "math/interaction.h"
41 #include "math/box-whisker.h"
42 #include "math/categoricals.h"
43 #include "math/chart-geometry.h"
44 #include "math/histogram.h"
45 #include "math/moments.h"
47 #include "math/sort.h"
48 #include "math/order-stats.h"
49 #include "math/percentiles.h"
50 #include "math/tukey-hinges.h"
51 #include "math/trimmed-mean.h"
53 #include "output/charts/boxplot.h"
54 #include "output/charts/np-plot.h"
55 #include "output/charts/spreadlevel-plot.h"
56 #include "output/charts/plot-hist.h"
58 #include "language/command.h"
59 #include "language/lexer/lexer.h"
60 #include "language/lexer/value-parser.h"
61 #include "language/lexer/variable-parser.h"
63 #include "output/pivot-table.h"
66 #define _(msgid) gettext (msgid)
67 #define N_(msgid) msgid
70 append_value_name (const struct variable *var, const union value *val, struct string *str)
72 var_append_value_name (var, val, str);
73 if ( var_is_value_missing (var, val, MV_ANY))
74 ds_put_cstr (str, _(" (missing)"));
84 /* Indices for the ex_proto member (below) */
93 #define PLOT_HISTOGRAM 0x1
94 #define PLOT_BOXPLOT 0x2
95 #define PLOT_NPPLOT 0x4
96 #define PLOT_SPREADLEVEL 0x8
103 /* A caseproto used to contain the data subsets under examination,
105 struct caseproto *ex_proto;
108 const struct variable **dep_vars;
111 struct interaction **iacts;
113 enum mv_class dep_excl;
114 enum mv_class fctr_excl;
116 const struct dictionary *dict;
118 struct categoricals *cats;
120 /* how many extremities to display */
129 /* The case index of the ID value (or -1) if not applicable */
135 size_t n_percentiles;
140 enum bp_mode boxplot_mode;
142 const struct variable *id_var;
144 const struct variable *wv;
149 /* The value of this extremity */
152 /* Either the casenumber or the value of the variable specified
153 by the /ID subcommand which corresponds to this extremity */
154 union value identity;
157 struct exploratory_stats
164 /* Most operations need a sorted reader/writer */
165 struct casewriter *sorted_writer;
166 struct casereader *sorted_reader;
168 struct extremity *minima;
169 struct extremity *maxima;
172 Minimum should alway equal mimima[0].val.
173 Likewise, maximum should alway equal maxima[0].val.
174 This redundancy exists as an optimisation effort.
175 Some statistics (eg histogram) require early calculation
181 struct trimmed_mean *trimmed_mean;
182 struct percentile *quartiles[3];
183 struct percentile **percentiles;
185 struct tukey_hinges *hinges;
187 /* The data for the NP Plots */
190 struct histogram *histogram;
192 /* The data for the box plots */
193 struct box_whisker *box_whisker;
198 /* The minimum weight */
203 show_boxplot_grouped (const struct examine *cmd, int iact_idx)
207 const struct interaction *iact = cmd->iacts[iact_idx];
208 const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
210 for (v = 0; v < cmd->n_dep_vars; ++v)
212 double y_min = DBL_MAX;
213 double y_max = -DBL_MAX;
215 struct boxplot *boxplot;
217 ds_init_empty (&title);
219 if (iact->n_vars > 0)
222 ds_init_empty (&istr);
223 interaction_to_string (iact, &istr);
224 ds_put_format (&title, _("Boxplot of %s vs. %s"),
225 var_to_string (cmd->dep_vars[v]),
230 ds_put_format (&title, _("Boxplot of %s"), var_to_string (cmd->dep_vars[v]));
232 for (grp = 0; grp < n_cats; ++grp)
234 const struct exploratory_stats *es =
235 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
237 if ( y_min > es[v].minimum)
238 y_min = es[v].minimum;
240 if ( y_max < es[v].maximum)
241 y_max = es[v].maximum;
244 boxplot = boxplot_create (y_min, y_max, ds_cstr (&title));
248 for (grp = 0; grp < n_cats; ++grp)
253 const struct ccase *c =
254 categoricals_get_case_by_category_real (cmd->cats, iact_idx, grp);
256 struct exploratory_stats *es =
257 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
259 ds_init_empty (&label);
260 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
263 const struct variable *ivar = iact->vars[ivar_idx];
264 const union value *val = case_data (c, ivar);
267 append_value_name (ivar, val, &l);
268 ds_ltrim (&l, ss_cstr (" "));
270 ds_put_substring (&label, l.ss);
271 if (ivar_idx < iact->n_vars - 1)
272 ds_put_cstr (&label, "; ");
277 boxplot_add_box (boxplot, es[v].box_whisker, ds_cstr (&label));
278 es[v].box_whisker = NULL;
283 boxplot_submit (boxplot);
288 show_boxplot_variabled (const struct examine *cmd, int iact_idx)
291 const struct interaction *iact = cmd->iacts[iact_idx];
292 const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
294 for (grp = 0; grp < n_cats; ++grp)
296 struct boxplot *boxplot;
298 double y_min = DBL_MAX;
299 double y_max = -DBL_MAX;
301 const struct ccase *c =
302 categoricals_get_case_by_category_real (cmd->cats, iact_idx, grp);
305 ds_init_empty (&title);
307 for (v = 0; v < cmd->n_dep_vars; ++v)
309 const struct exploratory_stats *es =
310 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
312 if ( y_min > es[v].minimum)
313 y_min = es[v].minimum;
315 if ( y_max < es[v].maximum)
316 y_max = es[v].maximum;
319 if ( iact->n_vars == 0)
320 ds_put_format (&title, _("Boxplot"));
325 ds_init_empty (&label);
326 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
328 const struct variable *ivar = iact->vars[ivar_idx];
329 const union value *val = case_data (c, ivar);
331 ds_put_cstr (&label, var_to_string (ivar));
332 ds_put_cstr (&label, " = ");
333 append_value_name (ivar, val, &label);
334 ds_put_cstr (&label, "; ");
337 ds_put_format (&title, _("Boxplot of %s"),
343 boxplot = boxplot_create (y_min, y_max, ds_cstr (&title));
347 for (v = 0; v < cmd->n_dep_vars; ++v)
349 struct exploratory_stats *es =
350 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
352 boxplot_add_box (boxplot, es[v].box_whisker,
353 var_to_string (cmd->dep_vars[v]));
354 es[v].box_whisker = NULL;
357 boxplot_submit (boxplot);
363 show_npplot (const struct examine *cmd, int iact_idx)
365 const struct interaction *iact = cmd->iacts[iact_idx];
366 const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
370 for (v = 0; v < cmd->n_dep_vars; ++v)
373 for (grp = 0; grp < n_cats; ++grp)
375 struct chart_item *npp, *dnpp;
376 struct casereader *reader;
380 const struct ccase *c =
381 categoricals_get_case_by_category_real (cmd->cats,
384 const struct exploratory_stats *es =
385 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
388 ds_init_cstr (&label,
389 var_to_string (cmd->dep_vars[v]));
391 if ( iact->n_vars > 0)
393 ds_put_cstr (&label, " (");
394 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
396 const struct variable *ivar = iact->vars[ivar_idx];
397 const union value *val = case_data (c, ivar);
399 ds_put_cstr (&label, var_to_string (ivar));
400 ds_put_cstr (&label, " = ");
401 append_value_name (ivar, val, &label);
402 ds_put_cstr (&label, "; ");
405 ds_put_cstr (&label, ")");
409 reader = casewriter_make_reader (np->writer);
412 npp = np_plot_create (np, reader, ds_cstr (&label));
413 dnpp = dnp_plot_create (np, reader, ds_cstr (&label));
415 if (npp == NULL || dnpp == NULL)
417 msg (MW, _("Not creating NP plot because data set is empty."));
418 chart_item_unref (npp);
419 chart_item_unref (dnpp);
423 chart_item_submit (npp);
424 chart_item_submit (dnpp);
426 casereader_destroy (reader);
434 show_spreadlevel (const struct examine *cmd, int iact_idx)
436 const struct interaction *iact = cmd->iacts[iact_idx];
437 const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
441 /* Spreadlevel when there are no levels is not useful */
442 if (iact->n_vars == 0)
445 for (v = 0; v < cmd->n_dep_vars; ++v)
448 struct chart_item *sl;
451 ds_init_cstr (&label,
452 var_to_string (cmd->dep_vars[v]));
454 if (iact->n_vars > 0)
456 ds_put_cstr (&label, " (");
457 interaction_to_string (iact, &label);
458 ds_put_cstr (&label, ")");
461 sl = spreadlevel_plot_create (ds_cstr (&label), cmd->sl_power);
463 for (grp = 0; grp < n_cats; ++grp)
465 const struct exploratory_stats *es =
466 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
468 double median = percentile_calculate (es[v].quartiles[1], cmd->pc_alg);
470 double iqr = percentile_calculate (es[v].quartiles[2], cmd->pc_alg) -
471 percentile_calculate (es[v].quartiles[0], cmd->pc_alg);
473 spreadlevel_plot_add (sl, iqr, median);
477 msg (MW, _("Not creating spreadlevel chart for %s"), ds_cstr (&label));
479 chart_item_submit (sl);
487 show_histogram (const struct examine *cmd, int iact_idx)
489 const struct interaction *iact = cmd->iacts[iact_idx];
490 const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
494 for (v = 0; v < cmd->n_dep_vars; ++v)
497 for (grp = 0; grp < n_cats; ++grp)
501 const struct ccase *c =
502 categoricals_get_case_by_category_real (cmd->cats,
505 const struct exploratory_stats *es =
506 categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp);
510 if (es[v].histogram == NULL)
513 ds_init_cstr (&label,
514 var_to_string (cmd->dep_vars[v]));
516 if ( iact->n_vars > 0)
518 ds_put_cstr (&label, " (");
519 for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx)
521 const struct variable *ivar = iact->vars[ivar_idx];
522 const union value *val = case_data (c, ivar);
524 ds_put_cstr (&label, var_to_string (ivar));
525 ds_put_cstr (&label, " = ");
526 append_value_name (ivar, val, &label);
527 ds_put_cstr (&label, "; ");
530 ds_put_cstr (&label, ")");
534 moments_calculate (es[v].mom, &n, &mean, &var, NULL, NULL);
537 ( histogram_chart_create (es[v].histogram->gsl_hist,
538 ds_cstr (&label), n, mean,
547 static struct pivot_value *
548 new_value_with_missing_footnote (const struct variable *var,
549 const union value *value,
550 struct pivot_footnote *missing_footnote)
552 struct pivot_value *pv = pivot_value_new_var_value (var, value);
553 if (var_is_value_missing (var, value, MV_USER))
554 pivot_value_add_footnote (pv, missing_footnote);
559 create_interaction_dimensions (struct pivot_table *table,
560 const struct categoricals *cats,
561 const struct interaction *iact,
562 struct pivot_footnote *missing_footnote)
564 for (size_t i = iact->n_vars; i-- > 0; )
566 const struct variable *var = iact->vars[i];
567 struct pivot_dimension *d = pivot_dimension_create__ (
568 table, PIVOT_AXIS_ROW, pivot_value_new_variable (var));
569 d->root->show_label = true;
572 union value *values = categoricals_get_var_values (cats, var, &n);
573 for (size_t j = 0; j < n; j++)
574 pivot_category_create_leaf (
575 d->root, new_value_with_missing_footnote (var, &values[j],
580 static struct pivot_footnote *
581 create_missing_footnote (struct pivot_table *table)
583 return pivot_table_create_footnote (
584 table, pivot_value_new_text (N_("User-missing value.")));
588 percentiles_report (const struct examine *cmd, int iact_idx)
590 struct pivot_table *table = pivot_table_create (N_("Percentiles"));
591 table->omit_empty = true;
593 struct pivot_dimension *percentiles = pivot_dimension_create (
594 table, PIVOT_AXIS_COLUMN, N_("Percentiles"));
595 percentiles->root->show_label = true;
596 for (int i = 0; i < cmd->n_percentiles; ++i)
597 pivot_category_create_leaf (
599 pivot_value_new_user_text_nocopy (xasprintf ("%g", cmd->ptiles[i])));
601 pivot_dimension_create (table, PIVOT_AXIS_ROW, N_("Statistics"),
602 N_("Weighted Average"), N_("Tukey's Hinges"));
604 const struct interaction *iact = cmd->iacts[iact_idx];
605 struct pivot_footnote *missing_footnote = create_missing_footnote (table);
606 create_interaction_dimensions (table, cmd->cats, iact, missing_footnote);
608 struct pivot_dimension *dep_dim = pivot_dimension_create (
609 table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
611 size_t *indexes = xnmalloc (table->n_dimensions, sizeof *indexes);
613 size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
614 for (size_t v = 0; v < cmd->n_dep_vars; ++v)
616 indexes[table->n_dimensions - 1] = pivot_category_create_leaf (
617 dep_dim->root, pivot_value_new_variable (cmd->dep_vars[v]));
619 for (size_t i = 0; i < n_cats; ++i)
621 for (size_t j = 0; j < iact->n_vars; j++)
623 int idx = categoricals_get_value_index_by_category_real (
624 cmd->cats, iact_idx, i, j);
625 indexes[table->n_dimensions - 2 - j] = idx;
628 const struct exploratory_stats *ess
629 = categoricals_get_user_data_by_category_real (
630 cmd->cats, iact_idx, i);
631 const struct exploratory_stats *es = ess + v;
634 tukey_hinges_calculate (es->hinges, hinges);
636 for (size_t pc_idx = 0; pc_idx < cmd->n_percentiles; ++pc_idx)
641 double value = percentile_calculate (es->percentiles[pc_idx],
643 pivot_table_put (table, indexes, table->n_dimensions,
644 pivot_value_new_number (value));
646 double hinge = (cmd->ptiles[pc_idx] == 25.0 ? hinges[0]
647 : cmd->ptiles[pc_idx] == 50.0 ? hinges[1]
648 : cmd->ptiles[pc_idx] == 75.0 ? hinges[2]
653 pivot_table_put (table, indexes, table->n_dimensions,
654 pivot_value_new_number (hinge));
662 pivot_table_submit (table);
666 descriptives_report (const struct examine *cmd, int iact_idx)
668 struct pivot_table *table = pivot_table_create (N_("Descriptives"));
669 table->omit_empty = true;
671 pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Aspect"),
672 N_("Statistic"), N_("Std. Error"));
674 struct pivot_dimension *statistics = pivot_dimension_create (
675 table, PIVOT_AXIS_ROW, N_("Statistics"), N_("Mean"));
676 struct pivot_category *interval = pivot_category_create_group__ (
678 pivot_value_new_text_format (N_("%g%% Confidence Interval for Mean"),
680 pivot_category_create_leaves (interval, N_("Lower Bound"),
682 pivot_category_create_leaves (
683 statistics->root, N_("5% Trimmed Mean"), N_("Median"), N_("Variance"),
684 N_("Std. Deviation"), N_("Minimum"), N_("Maximum"), N_("Range"),
685 N_("Interquartile Range"), N_("Skewness"), N_("Kurtosis"));
687 const struct interaction *iact = cmd->iacts[iact_idx];
688 struct pivot_footnote *missing_footnote = create_missing_footnote (table);
689 create_interaction_dimensions (table, cmd->cats, iact, missing_footnote);
691 struct pivot_dimension *dep_dim = pivot_dimension_create (
692 table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
694 size_t *indexes = xnmalloc (table->n_dimensions, sizeof *indexes);
696 size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
697 for (size_t v = 0; v < cmd->n_dep_vars; ++v)
699 indexes[table->n_dimensions - 1] = pivot_category_create_leaf (
700 dep_dim->root, pivot_value_new_variable (cmd->dep_vars[v]));
702 for (size_t i = 0; i < n_cats; ++i)
704 for (size_t j = 0; j < iact->n_vars; j++)
706 int idx = categoricals_get_value_index_by_category_real (
707 cmd->cats, iact_idx, i, j);
708 indexes[table->n_dimensions - 2 - j] = idx;
711 const struct exploratory_stats *ess
712 = categoricals_get_user_data_by_category_real (cmd->cats,
714 const struct exploratory_stats *es = ess + v;
716 double m0, m1, m2, m3, m4;
717 moments_calculate (es->mom, &m0, &m1, &m2, &m3, &m4);
718 double tval = gsl_cdf_tdist_Qinv ((1.0 - cmd->conf) / 2.0, m0 - 1.0);
728 { 0, 1, calc_semean (m2, m0) },
729 { 1, 0, m1 - tval * calc_semean (m2, m0) },
730 { 2, 0, m1 + tval * calc_semean (m2, m0) },
731 { 3, 0, trimmed_mean_calculate (es->trimmed_mean) },
732 { 4, 0, percentile_calculate (es->quartiles[1], cmd->pc_alg) },
735 { 7, 0, es->minima[0].val },
736 { 8, 0, es->maxima[0].val },
737 { 9, 0, es->maxima[0].val - es->minima[0].val },
738 { 10, 0, (percentile_calculate (es->quartiles[2], cmd->pc_alg) -
739 percentile_calculate (es->quartiles[0], cmd->pc_alg)) },
741 { 11, 1, calc_seskew (m0) },
743 { 12, 1, calc_sekurt (m0) },
745 for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
747 const struct entry *e = &entries[j];
748 indexes[0] = e->aspect_idx;
749 indexes[1] = e->stat_idx;
750 pivot_table_put (table, indexes, table->n_dimensions,
751 pivot_value_new_number (e->x));
758 pivot_table_submit (table);
763 extremes_report (const struct examine *cmd, int iact_idx)
765 struct pivot_table *table = pivot_table_create (N_("Extreme Values"));
766 table->omit_empty = true;
768 struct pivot_dimension *statistics = pivot_dimension_create (
769 table, PIVOT_AXIS_COLUMN, N_("Statistics"));
770 pivot_category_create_leaf (statistics->root,
772 ? pivot_value_new_variable (cmd->id_var)
773 : pivot_value_new_text (N_("Case Number"))));
774 pivot_category_create_leaves (statistics->root, N_("Value"));
776 struct pivot_dimension *order = pivot_dimension_create (
777 table, PIVOT_AXIS_ROW, N_("Order"));
778 for (size_t i = 0; i < cmd->disp_extremes; i++)
779 pivot_category_create_leaf (order->root, pivot_value_new_integer (i + 1));
781 pivot_dimension_create (table, PIVOT_AXIS_ROW, N_("Extreme"),
782 N_("Highest"), N_("Lowest"));
784 const struct interaction *iact = cmd->iacts[iact_idx];
785 struct pivot_footnote *missing_footnote = create_missing_footnote (table);
786 create_interaction_dimensions (table, cmd->cats, iact, missing_footnote);
788 struct pivot_dimension *dep_dim = pivot_dimension_create (
789 table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
791 size_t *indexes = xnmalloc (table->n_dimensions, sizeof *indexes);
793 size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
794 for (size_t v = 0; v < cmd->n_dep_vars; ++v)
796 indexes[table->n_dimensions - 1] = pivot_category_create_leaf (
797 dep_dim->root, pivot_value_new_variable (cmd->dep_vars[v]));
799 for (size_t i = 0; i < n_cats; ++i)
801 for (size_t j = 0; j < iact->n_vars; j++)
803 int idx = categoricals_get_value_index_by_category_real (
804 cmd->cats, iact_idx, i, j);
805 indexes[table->n_dimensions - 2 - j] = idx;
808 const struct exploratory_stats *ess
809 = categoricals_get_user_data_by_category_real (cmd->cats,
811 const struct exploratory_stats *es = ess + v;
813 for (int e = 0 ; e < cmd->disp_extremes; ++e)
817 for (size_t j = 0; j < 2; j++)
819 const struct extremity *extremity
820 = j ? &es->minima[e] : &es->maxima[e];
825 table, indexes, table->n_dimensions,
827 ? new_value_with_missing_footnote (cmd->id_var,
828 &extremity->identity,
830 : pivot_value_new_integer (extremity->identity.f)));
833 union value val = { .f = extremity->val };
835 table, indexes, table->n_dimensions,
836 new_value_with_missing_footnote (cmd->dep_vars[v], &val,
844 pivot_table_submit (table);
849 summary_report (const struct examine *cmd, int iact_idx)
851 struct pivot_table *table = pivot_table_create (
852 N_("Case Processing Summary"));
853 table->omit_empty = true;
854 pivot_table_set_weight_var (table, dict_get_weight (cmd->dict));
856 pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"),
857 N_("N"), PIVOT_RC_COUNT,
858 N_("Percent"), PIVOT_RC_PERCENT);
859 struct pivot_dimension *cases = pivot_dimension_create (
860 table, PIVOT_AXIS_COLUMN, N_("Cases"), N_("Valid"), N_("Missing"),
862 cases->root->show_label = true;
864 const struct interaction *iact = cmd->iacts[iact_idx];
865 struct pivot_footnote *missing_footnote = create_missing_footnote (table);
866 create_interaction_dimensions (table, cmd->cats, iact, missing_footnote);
868 struct pivot_dimension *dep_dim = pivot_dimension_create (
869 table, PIVOT_AXIS_ROW, N_("Dependent Variables"));
871 size_t *indexes = xnmalloc (table->n_dimensions, sizeof *indexes);
873 size_t n_cats = categoricals_n_count (cmd->cats, iact_idx);
874 for (size_t v = 0; v < cmd->n_dep_vars; ++v)
876 indexes[table->n_dimensions - 1] = pivot_category_create_leaf (
877 dep_dim->root, pivot_value_new_variable (cmd->dep_vars[v]));
879 for (size_t i = 0; i < n_cats; ++i)
881 for (size_t j = 0; j < iact->n_vars; j++)
883 int idx = categoricals_get_value_index_by_category_real (
884 cmd->cats, iact_idx, i, j);
885 indexes[table->n_dimensions - 2 - j] = idx;
888 const struct exploratory_stats *es
889 = categoricals_get_user_data_by_category_real (
890 cmd->cats, iact_idx, i);
892 double total = es[v].missing + es[v].non_missing;
900 { 0, 0, es[v].non_missing },
901 { 1, 0, 100.0 * es[v].non_missing / total },
902 { 0, 1, es[v].missing },
903 { 1, 1, 100.0 * es[v].missing / total },
907 for (size_t j = 0; j < sizeof entries / sizeof *entries; j++)
909 const struct entry *e = &entries[j];
910 indexes[0] = e->stat_idx;
911 indexes[1] = e->case_idx;
912 pivot_table_put (table, indexes, table->n_dimensions,
913 pivot_value_new_number (e->x));
920 pivot_table_submit (table);
923 /* Attempt to parse an interaction from LEXER */
924 static struct interaction *
925 parse_interaction (struct lexer *lexer, struct examine *ex)
927 const struct variable *v = NULL;
928 struct interaction *iact = NULL;
930 if ( lex_match_variable (lexer, ex->dict, &v))
932 iact = interaction_create (v);
934 while (lex_match (lexer, T_BY))
936 if (!lex_match_variable (lexer, ex->dict, &v))
938 interaction_destroy (iact);
941 interaction_add_variable (iact, v);
943 lex_match (lexer, T_COMMA);
951 create_n (const void *aux1, void *aux2 UNUSED)
955 const struct examine *examine = aux1;
956 struct exploratory_stats *es = pool_calloc (examine->pool, examine->n_dep_vars, sizeof (*es));
957 struct subcase ordering;
958 subcase_init (&ordering, 0, 0, SC_ASCEND);
960 for (v = 0; v < examine->n_dep_vars; v++)
962 es[v].sorted_writer = sort_create_writer (&ordering, examine->ex_proto);
963 es[v].sorted_reader = NULL;
965 es[v].mom = moments_create (MOMENT_KURTOSIS);
966 es[v].cmin = DBL_MAX;
968 es[v].maximum = -DBL_MAX;
969 es[v].minimum = DBL_MAX;
972 subcase_destroy (&ordering);
977 update_n (const void *aux1, void *aux2 UNUSED, void *user_data,
978 const struct ccase *c, double weight)
981 const struct examine *examine = aux1;
982 struct exploratory_stats *es = user_data;
984 bool this_case_is_missing = false;
985 /* LISTWISE missing must be dealt with here */
986 if (!examine->missing_pw)
988 for (v = 0; v < examine->n_dep_vars; v++)
990 const struct variable *var = examine->dep_vars[v];
992 if (var_is_value_missing (var, case_data (c, var), examine->dep_excl))
994 es[v].missing += weight;
995 this_case_is_missing = true;
1000 if (this_case_is_missing)
1003 for (v = 0; v < examine->n_dep_vars; v++)
1005 struct ccase *outcase ;
1006 const struct variable *var = examine->dep_vars[v];
1007 const double x = case_data (c, var)->f;
1009 if (var_is_value_missing (var, case_data (c, var), examine->dep_excl))
1011 es[v].missing += weight;
1015 outcase = case_create (examine->ex_proto);
1017 if (x > es[v].maximum)
1020 if (x < es[v].minimum)
1023 es[v].non_missing += weight;
1025 moments_pass_one (es[v].mom, x, weight);
1027 /* Save the value and the ID to the writer */
1028 assert (examine->id_idx != -1);
1029 case_data_rw_idx (outcase, EX_VAL)->f = x;
1030 value_copy (case_data_rw_idx (outcase, EX_ID),
1031 case_data_idx (c, examine->id_idx), examine->id_width);
1033 case_data_rw_idx (outcase, EX_WT)->f = weight;
1037 if (es[v].cmin > weight)
1038 es[v].cmin = weight;
1040 casewriter_write (es[v].sorted_writer, outcase);
1045 calculate_n (const void *aux1, void *aux2 UNUSED, void *user_data)
1048 const struct examine *examine = aux1;
1049 struct exploratory_stats *es = user_data;
1051 for (v = 0; v < examine->n_dep_vars; v++)
1054 casenumber imin = 0;
1056 struct casereader *reader;
1059 if (examine->plot & PLOT_HISTOGRAM && es[v].non_missing > 0)
1062 double bin_width = fabs (es[v].minimum - es[v].maximum)
1063 / (1 + log2 (es[v].cc))
1067 histogram_create (bin_width, es[v].minimum, es[v].maximum);
1070 es[v].sorted_reader = casewriter_make_reader (es[v].sorted_writer);
1071 es[v].sorted_writer = NULL;
1073 imax = casereader_get_case_cnt (es[v].sorted_reader);
1075 es[v].maxima = pool_calloc (examine->pool, examine->calc_extremes, sizeof (*es[v].maxima));
1076 es[v].minima = pool_calloc (examine->pool, examine->calc_extremes, sizeof (*es[v].minima));
1077 for (i = 0; i < examine->calc_extremes; ++i)
1079 value_init_pool (examine->pool, &es[v].maxima[i].identity, examine->id_width) ;
1080 value_init_pool (examine->pool, &es[v].minima[i].identity, examine->id_width) ;
1084 for (reader = casereader_clone (es[v].sorted_reader);
1085 (c = casereader_read (reader)) != NULL; case_unref (c))
1087 const double val = case_data_idx (c, EX_VAL)->f;
1088 double wt = case_data_idx (c, EX_WT)->f;
1089 wt = var_force_valid_weight (examine->wv, wt, &warn);
1091 moments_pass_two (es[v].mom, val, wt);
1093 if (es[v].histogram)
1094 histogram_add (es[v].histogram, val, wt);
1096 if (imin < examine->calc_extremes)
1099 for (x = imin; x < examine->calc_extremes; ++x)
1101 struct extremity *min = &es[v].minima[x];
1103 value_copy (&min->identity, case_data_idx (c, EX_ID), examine->id_width);
1109 if (imax < examine->calc_extremes)
1113 for (x = imax; x < imax + 1; ++x)
1115 struct extremity *max;
1117 if (x >= examine->calc_extremes)
1120 max = &es[v].maxima[x];
1122 value_copy (&max->identity, case_data_idx (c, EX_ID), examine->id_width);
1126 casereader_destroy (reader);
1128 if (examine->calc_extremes > 0 && es[v].non_missing > 0)
1130 assert (es[v].minima[0].val == es[v].minimum);
1131 assert (es[v].maxima[0].val == es[v].maximum);
1135 const int n_os = 5 + examine->n_percentiles;
1136 struct order_stats **os ;
1137 es[v].percentiles = pool_calloc (examine->pool, examine->n_percentiles, sizeof (*es[v].percentiles));
1139 es[v].trimmed_mean = trimmed_mean_create (es[v].cc, 0.05);
1141 os = xcalloc (n_os, sizeof *os);
1142 os[0] = &es[v].trimmed_mean->parent;
1144 es[v].quartiles[0] = percentile_create (0.25, es[v].cc);
1145 es[v].quartiles[1] = percentile_create (0.5, es[v].cc);
1146 es[v].quartiles[2] = percentile_create (0.75, es[v].cc);
1148 os[1] = &es[v].quartiles[0]->parent;
1149 os[2] = &es[v].quartiles[1]->parent;
1150 os[3] = &es[v].quartiles[2]->parent;
1152 es[v].hinges = tukey_hinges_create (es[v].cc, es[v].cmin);
1153 os[4] = &es[v].hinges->parent;
1155 for (i = 0; i < examine->n_percentiles; ++i)
1157 es[v].percentiles[i] = percentile_create (examine->ptiles[i] / 100.00, es[v].cc);
1158 os[5 + i] = &es[v].percentiles[i]->parent;
1161 order_stats_accumulate_idx (os, n_os,
1162 casereader_clone (es[v].sorted_reader),
1168 if (examine->plot & PLOT_BOXPLOT)
1170 struct order_stats *os;
1172 es[v].box_whisker = box_whisker_create (es[v].hinges,
1173 EX_ID, examine->id_var);
1175 os = &es[v].box_whisker->parent;
1176 order_stats_accumulate_idx (&os, 1,
1177 casereader_clone (es[v].sorted_reader),
1181 if (examine->plot & PLOT_NPPLOT)
1183 double n, mean, var;
1184 struct order_stats *os;
1186 moments_calculate (es[v].mom, &n, &mean, &var, NULL, NULL);
1188 es[v].np = np_create (n, mean, var);
1190 os = &es[v].np->parent;
1192 order_stats_accumulate_idx (&os, 1,
1193 casereader_clone (es[v].sorted_reader),
1201 cleanup_exploratory_stats (struct examine *cmd)
1204 for (i = 0; i < cmd->n_iacts; ++i)
1207 const size_t n_cats = categoricals_n_count (cmd->cats, i);
1209 for (v = 0; v < cmd->n_dep_vars; ++v)
1212 for (grp = 0; grp < n_cats; ++grp)
1215 const struct exploratory_stats *es =
1216 categoricals_get_user_data_by_category_real (cmd->cats, i, grp);
1218 struct order_stats *os = &es[v].hinges->parent;
1219 struct statistic *stat = &os->parent;
1220 stat->destroy (stat);
1222 for (q = 0; q < 3 ; q++)
1224 os = &es[v].quartiles[q]->parent;
1226 stat->destroy (stat);
1229 for (q = 0; q < cmd->n_percentiles ; q++)
1231 os = &es[v].percentiles[q]->parent;
1233 stat->destroy (stat);
1236 os = &es[v].trimmed_mean->parent;
1238 stat->destroy (stat);
1240 os = &es[v].np->parent;
1244 stat->destroy (stat);
1247 statistic_destroy (&es[v].histogram->parent);
1248 moments_destroy (es[v].mom);
1250 if (es[v].box_whisker)
1252 stat = &es[v].box_whisker->parent.parent;
1253 stat->destroy (stat);
1256 casereader_destroy (es[v].sorted_reader);
1264 run_examine (struct examine *cmd, struct casereader *input)
1268 struct casereader *reader;
1270 struct payload payload;
1271 payload.create = create_n;
1272 payload.update = update_n;
1273 payload.calculate = calculate_n;
1274 payload.destroy = NULL;
1276 cmd->wv = dict_get_weight (cmd->dict);
1279 = categoricals_create (cmd->iacts, cmd->n_iacts, cmd->wv, cmd->fctr_excl);
1281 categoricals_set_payload (cmd->cats, &payload, cmd, NULL);
1283 if (cmd->id_var == NULL)
1285 struct ccase *c = casereader_peek (input, 0);
1287 cmd->id_idx = case_get_value_cnt (c);
1288 input = casereader_create_arithmetic_sequence (input, 1.0, 1.0);
1293 for (reader = input;
1294 (c = casereader_read (reader)) != NULL; case_unref (c))
1296 categoricals_update (cmd->cats, c);
1298 casereader_destroy (reader);
1299 categoricals_done (cmd->cats);
1301 for (i = 0; i < cmd->n_iacts; ++i)
1303 summary_report (cmd, i);
1305 const size_t n_cats = categoricals_n_count (cmd->cats, i);
1309 if (cmd->disp_extremes > 0)
1310 extremes_report (cmd, i);
1312 if (cmd->n_percentiles > 0)
1313 percentiles_report (cmd, i);
1315 if (cmd->plot & PLOT_BOXPLOT)
1317 switch (cmd->boxplot_mode)
1320 show_boxplot_grouped (cmd, i);
1323 show_boxplot_variabled (cmd, i);
1331 if (cmd->plot & PLOT_HISTOGRAM)
1332 show_histogram (cmd, i);
1334 if (cmd->plot & PLOT_NPPLOT)
1335 show_npplot (cmd, i);
1337 if (cmd->plot & PLOT_SPREADLEVEL)
1338 show_spreadlevel (cmd, i);
1340 if (cmd->descriptives)
1341 descriptives_report (cmd, i);
1344 cleanup_exploratory_stats (cmd);
1345 categoricals_destroy (cmd->cats);
1350 cmd_examine (struct lexer *lexer, struct dataset *ds)
1353 bool nototals_seen = false;
1354 bool totals_seen = false;
1356 struct interaction **iacts_mem = NULL;
1357 struct examine examine;
1358 bool percentiles_seen = false;
1360 examine.missing_pw = false;
1361 examine.disp_extremes = 0;
1362 examine.calc_extremes = 0;
1363 examine.descriptives = false;
1364 examine.conf = 0.95;
1365 examine.pc_alg = PC_HAVERAGE;
1366 examine.ptiles = NULL;
1367 examine.n_percentiles = 0;
1368 examine.id_idx = -1;
1369 examine.id_width = 0;
1370 examine.id_var = NULL;
1371 examine.boxplot_mode = BP_GROUPS;
1373 examine.ex_proto = caseproto_create ();
1375 examine.pool = pool_create ();
1377 /* Allocate space for the first interaction.
1378 This is interaction is an empty one (for the totals).
1379 If no totals are requested, we will simply ignore this
1382 examine.n_iacts = 1;
1383 examine.iacts = iacts_mem = pool_zalloc (examine.pool, sizeof (struct interaction *));
1384 examine.iacts[0] = interaction_create (NULL);
1386 examine.dep_excl = MV_ANY;
1387 examine.fctr_excl = MV_ANY;
1389 examine.sl_power = 0;
1390 examine.dep_vars = NULL;
1391 examine.n_dep_vars = 0;
1392 examine.dict = dataset_dict (ds);
1394 /* Accept an optional, completely pointless "/VARIABLES=" */
1395 lex_match (lexer, T_SLASH);
1396 if (lex_match_id (lexer, "VARIABLES"))
1398 if (! lex_force_match (lexer, T_EQUALS) )
1402 if (!parse_variables_const (lexer, examine.dict,
1403 &examine.dep_vars, &examine.n_dep_vars,
1404 PV_NO_DUPLICATE | PV_NUMERIC))
1407 if (lex_match (lexer, T_BY))
1409 struct interaction *iact = NULL;
1412 iact = parse_interaction (lexer, &examine);
1417 pool_nrealloc (examine.pool, iacts_mem,
1419 sizeof (*iacts_mem));
1421 iacts_mem[examine.n_iacts - 1] = iact;
1428 while (lex_token (lexer) != T_ENDCMD)
1430 lex_match (lexer, T_SLASH);
1432 if (lex_match_id (lexer, "STATISTICS"))
1434 lex_match (lexer, T_EQUALS);
1436 while (lex_token (lexer) != T_ENDCMD
1437 && lex_token (lexer) != T_SLASH)
1439 if (lex_match_id (lexer, "DESCRIPTIVES"))
1441 examine.descriptives = true;
1443 else if (lex_match_id (lexer, "EXTREME"))
1446 if (lex_match (lexer, T_LPAREN))
1448 if (!lex_force_int (lexer))
1450 extr = lex_integer (lexer);
1454 msg (MW, _("%s may not be negative. Using default value (%g)."), "EXTREME", 5.0);
1459 if (! lex_force_match (lexer, T_RPAREN))
1462 examine.disp_extremes = extr;
1464 else if (lex_match_id (lexer, "NONE"))
1467 else if (lex_match (lexer, T_ALL))
1469 if (examine.disp_extremes == 0)
1470 examine.disp_extremes = 5;
1474 lex_error (lexer, NULL);
1479 else if (lex_match_id (lexer, "PERCENTILES"))
1481 percentiles_seen = true;
1482 if (lex_match (lexer, T_LPAREN))
1484 while (lex_is_number (lexer))
1486 double p = lex_number (lexer);
1488 if ( p <= 0 || p >= 100.0)
1491 _("Percentiles must lie in the range (0, 100)"));
1495 examine.n_percentiles++;
1497 xrealloc (examine.ptiles,
1498 sizeof (*examine.ptiles) *
1499 examine.n_percentiles);
1501 examine.ptiles[examine.n_percentiles - 1] = p;
1504 lex_match (lexer, T_COMMA);
1506 if (!lex_force_match (lexer, T_RPAREN))
1510 lex_match (lexer, T_EQUALS);
1512 while (lex_token (lexer) != T_ENDCMD
1513 && lex_token (lexer) != T_SLASH)
1515 if (lex_match_id (lexer, "HAVERAGE"))
1517 examine.pc_alg = PC_HAVERAGE;
1519 else if (lex_match_id (lexer, "WAVERAGE"))
1521 examine.pc_alg = PC_WAVERAGE;
1523 else if (lex_match_id (lexer, "ROUND"))
1525 examine.pc_alg = PC_ROUND;
1527 else if (lex_match_id (lexer, "EMPIRICAL"))
1529 examine.pc_alg = PC_EMPIRICAL;
1531 else if (lex_match_id (lexer, "AEMPIRICAL"))
1533 examine.pc_alg = PC_AEMPIRICAL;
1535 else if (lex_match_id (lexer, "NONE"))
1537 examine.pc_alg = PC_NONE;
1541 lex_error (lexer, NULL);
1546 else if (lex_match_id (lexer, "TOTAL"))
1550 else if (lex_match_id (lexer, "NOTOTAL"))
1552 nototals_seen = true;
1554 else if (lex_match_id (lexer, "MISSING"))
1556 lex_match (lexer, T_EQUALS);
1558 while (lex_token (lexer) != T_ENDCMD
1559 && lex_token (lexer) != T_SLASH)
1561 if (lex_match_id (lexer, "LISTWISE"))
1563 examine.missing_pw = false;
1565 else if (lex_match_id (lexer, "PAIRWISE"))
1567 examine.missing_pw = true;
1569 else if (lex_match_id (lexer, "EXCLUDE"))
1571 examine.dep_excl = MV_ANY;
1573 else if (lex_match_id (lexer, "INCLUDE"))
1575 examine.dep_excl = MV_SYSTEM;
1577 else if (lex_match_id (lexer, "REPORT"))
1579 examine.fctr_excl = MV_NEVER;
1581 else if (lex_match_id (lexer, "NOREPORT"))
1583 examine.fctr_excl = MV_ANY;
1587 lex_error (lexer, NULL);
1592 else if (lex_match_id (lexer, "COMPARE"))
1594 lex_match (lexer, T_EQUALS);
1595 if (lex_match_id (lexer, "VARIABLES"))
1597 examine.boxplot_mode = BP_VARIABLES;
1599 else if (lex_match_id (lexer, "GROUPS"))
1601 examine.boxplot_mode = BP_GROUPS;
1605 lex_error (lexer, NULL);
1609 else if (lex_match_id (lexer, "PLOT"))
1611 lex_match (lexer, T_EQUALS);
1613 while (lex_token (lexer) != T_ENDCMD
1614 && lex_token (lexer) != T_SLASH)
1616 if (lex_match_id (lexer, "BOXPLOT"))
1618 examine.plot |= PLOT_BOXPLOT;
1620 else if (lex_match_id (lexer, "NPPLOT"))
1622 examine.plot |= PLOT_NPPLOT;
1624 else if (lex_match_id (lexer, "HISTOGRAM"))
1626 examine.plot |= PLOT_HISTOGRAM;
1628 else if (lex_match_id (lexer, "SPREADLEVEL"))
1630 examine.plot |= PLOT_SPREADLEVEL;
1631 examine.sl_power = 0;
1632 if (lex_match (lexer, T_LPAREN) && lex_force_int (lexer))
1634 examine.sl_power = lex_integer (lexer);
1637 if (! lex_force_match (lexer, T_RPAREN))
1641 else if (lex_match_id (lexer, "NONE"))
1645 else if (lex_match (lexer, T_ALL))
1651 lex_error (lexer, NULL);
1654 lex_match (lexer, T_COMMA);
1657 else if (lex_match_id (lexer, "CINTERVAL"))
1659 if ( !lex_force_num (lexer))
1662 examine.conf = lex_number (lexer);
1665 else if (lex_match_id (lexer, "ID"))
1667 lex_match (lexer, T_EQUALS);
1669 examine.id_var = parse_variable_const (lexer, examine.dict);
1673 lex_error (lexer, NULL);
1679 if ( totals_seen && nototals_seen)
1681 msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
1685 /* If totals have been requested or if there are no factors
1686 in this analysis, then the totals need to be included. */
1687 if ( !nototals_seen || examine.n_iacts == 1)
1689 examine.iacts = &iacts_mem[0];
1694 examine.iacts = &iacts_mem[1];
1695 interaction_destroy (iacts_mem[0]);
1699 if ( examine.id_var )
1701 examine.id_idx = var_get_case_index (examine.id_var);
1702 examine.id_width = var_get_width (examine.id_var);
1705 examine.ex_proto = caseproto_add_width (examine.ex_proto, 0); /* value */
1706 examine.ex_proto = caseproto_add_width (examine.ex_proto, examine.id_width); /* id */
1707 examine.ex_proto = caseproto_add_width (examine.ex_proto, 0); /* weight */
1710 if (examine.disp_extremes > 0)
1712 examine.calc_extremes = examine.disp_extremes;
1715 if (examine.descriptives && examine.calc_extremes == 0)
1717 /* Descriptives always displays the max and min */
1718 examine.calc_extremes = 1;
1721 if (percentiles_seen && examine.n_percentiles == 0)
1723 examine.n_percentiles = 7;
1724 examine.ptiles = xcalloc (examine.n_percentiles,
1725 sizeof (*examine.ptiles));
1727 examine.ptiles[0] = 5;
1728 examine.ptiles[1] = 10;
1729 examine.ptiles[2] = 25;
1730 examine.ptiles[3] = 50;
1731 examine.ptiles[4] = 75;
1732 examine.ptiles[5] = 90;
1733 examine.ptiles[6] = 95;
1736 assert (examine.calc_extremes >= examine.disp_extremes);
1738 struct casegrouper *grouper;
1739 struct casereader *group;
1742 grouper = casegrouper_create_splits (proc_open (ds), examine.dict);
1743 while (casegrouper_get_next_group (grouper, &group))
1744 run_examine (&examine, group);
1745 ok = casegrouper_destroy (grouper);
1746 ok = proc_commit (ds) && ok;
1749 caseproto_unref (examine.ex_proto);
1751 for (i = 0; i < examine.n_iacts; ++i)
1752 interaction_destroy (examine.iacts[i]);
1753 free (examine.ptiles);
1754 free (examine.dep_vars);
1755 pool_destroy (examine.pool);
1760 caseproto_unref (examine.ex_proto);
1761 examine.iacts = iacts_mem;
1762 for (i = 0; i < examine.n_iacts; ++i)
1763 interaction_destroy (examine.iacts[i]);
1764 free (examine.dep_vars);
1765 free (examine.ptiles);
1766 pool_destroy (examine.pool);