From 00473f96a19f9b4fc8b9dbe54dc165f2742b1140 Mon Sep 17 00:00:00 2001 From: John Darrington Date: Fri, 16 Mar 2012 20:37:21 +0100 Subject: [PATCH] EXAMINE: Complete re-implementation This change is a complete re-implementation of the EXAMINE command. It includes a few minor changes to the tests, to accommodate slight changes in semantics (to bring it in line with other software). Closes bug #30732 --- src/language/stats/.gitignore | 1 - src/language/stats/automake.mk | 2 +- src/language/stats/examine.c | 2092 +++++++++++++++++++++++++++++++ src/language/stats/examine.q | 2084 ------------------------------ src/math/box-whisker.c | 20 +- src/math/box-whisker.h | 4 +- tests/language/lexer/q2c.at | 2 +- tests/language/stats/examine.at | 41 +- 8 files changed, 2140 insertions(+), 2106 deletions(-) create mode 100644 src/language/stats/examine.c delete mode 100644 src/language/stats/examine.q diff --git a/src/language/stats/.gitignore b/src/language/stats/.gitignore index 79c1832a6b..d550b0d129 100644 --- a/src/language/stats/.gitignore +++ b/src/language/stats/.gitignore @@ -1,4 +1,3 @@ crosstabs.c -examine.c frequencies.c regression.c diff --git a/src/language/stats/automake.mk b/src/language/stats/automake.mk index 583835498b..93a8e2053d 100644 --- a/src/language/stats/automake.mk +++ b/src/language/stats/automake.mk @@ -4,7 +4,6 @@ AM_CPPFLAGS += -I$(top_srcdir)/src/language/stats src_language_stats_built_sources = \ src/language/stats/crosstabs.c \ - src/language/stats/examine.c \ src/language/stats/frequencies.c \ src/language/stats/regression.c @@ -20,6 +19,7 @@ language_stats_sources = \ src/language/stats/cochran.h \ src/language/stats/correlations.c \ src/language/stats/descriptives.c \ + src/language/stats/examine.c \ src/language/stats/factor.c \ src/language/stats/flip.c \ src/language/stats/freq.c \ diff --git a/src/language/stats/examine.c b/src/language/stats/examine.c new file mode 100644 index 0000000000..d2d4987d27 --- /dev/null +++ 
b/src/language/stats/examine.c @@ -0,0 +1,2092 @@ +/* + PSPP - a program for statistical analysis. + Copyright (C) 2012 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ + +#include + +#include +#include + +#include "libpspp/assertion.h" +#include "libpspp/message.h" + + +#include "data/dataset.h" +#include "data/dictionary.h" +#include "data/casegrouper.h" +#include "data/casereader.h" +#include "data/casewriter.h" +#include "data/caseproto.h" +#include "data/subcase.h" + + +#include "data/format.h" + +#include "math/interaction.h" +#include "math/box-whisker.h" +#include "math/categoricals.h" +#include "math/histogram.h" +#include "math/moments.h" +#include "math/np.h" +#include "math/sort.h" +#include "math/order-stats.h" +#include "math/percentiles.h" +#include "math/tukey-hinges.h" +#include "math/trimmed-mean.h" + +#include "output/charts/boxplot.h" +#include "output/charts/np-plot.h" +#include "output/charts/plot-hist.h" + +#include "language/command.h" +#include "language/lexer/lexer.h" +#include "language/lexer/value-parser.h" +#include "language/lexer/variable-parser.h" + +#include "output/tab.h" + +#include "gettext.h" +#define _(msgid) gettext (msgid) +#define N_(msgid) msgid + +enum bp_mode + { + BP_GROUPS, + BP_VARIABLES + }; + +struct examine +{ + size_t n_dep_vars; + const struct variable **dep_vars; + + size_t n_iacts; + struct interaction **iacts; + + enum mv_class 
exclude; + + const struct dictionary *dict; + + struct categoricals *cats; + + /* how many extremities to display */ + int disp_extremes; + int calc_extremes; + bool descriptives; + + double conf; + + bool missing_pw; + + /* Test options require that casenumbers are known */ + bool casenumbers; + + /* The case index of the ID value (or -1) if not applicable */ + size_t id_idx; + + enum pc_alg pc_alg; + double *ptiles; + size_t n_percentiles; + + bool npplot; + bool histogram; + bool boxplot; + + enum bp_mode boxplot_mode; + + const struct variable *id_var; + + const struct variable *wv; +}; + +struct extremity +{ + /* The value of this extremity */ + double val; + + /* Either the casenumber or the value of the variable specified + by the /ID subcommand which corresponds to this extremity */ + double identity; +}; + +enum + { + EX_VAL, /* value */ + EX_ID, /* identity */ + EX_WT /* weight */ + }; + +struct exploratory_stats +{ + double missing; + double non_missing; + + struct moments *mom; + + /* Most operations need a sorted reader/writer */ + struct casewriter *sorted_writer; + struct casereader *sorted_reader; + + struct extremity *minima; + struct extremity *maxima; + + /* + Minimum should always equal minima[0].val. + Likewise, maximum should always equal maxima[0].val. + This redundancy exists as an optimisation effort.
+ Some statistics (eg histogram) require early calculation + of the min and max + */ + double minimum; + double maximum; + + struct trimmed_mean *trimmed_mean; + struct percentile *quartiles[3]; + struct percentile **percentiles; + + struct tukey_hinges *hinges; + + /* The data for the NP Plots */ + struct np *np; + + struct histogram *histogram; + + /* The data for the box plots */ + struct box_whisker *box_whisker; + + /* Total weight */ + double cc; + + /* The minimum weight */ + double cmin; +}; + + +static +const union value ** +xxx0 (const struct interaction *iact) +{ + int ivar_idx; + + const union value **prev_val = xcalloc (iact->n_vars, sizeof (*prev_val)); + + for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx) + prev_val[ivar_idx] = NULL; + + return prev_val; +} + +static int +xxx1 (const struct interaction *iact, const struct ccase *c, const union value **prev_val) +{ + int ivar_idx; + int diff_idx = -1; + for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx) + { + const struct variable *ivar = iact->vars[ivar_idx]; + const int width = var_get_width (ivar); + const union value *val = case_data (c, ivar); + + if (prev_val[ivar_idx]) + if (! 
value_equal (prev_val[ivar_idx], val, width)) + { + diff_idx = ivar_idx; + break; + } + } + + for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx) + { + const struct variable *ivar = iact->vars[ivar_idx]; + const union value *val = case_data (c, ivar); + + prev_val[ivar_idx] = val; + } + return diff_idx; +} + + +static void +show_boxplot_grouped (const struct examine *cmd, int iact_idx) +{ + int v; + + const struct interaction *iact = cmd->iacts[iact_idx]; + const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx); + + for (v = 0; v < cmd->n_dep_vars; ++v) + { + double y_min = DBL_MAX; + double y_max = -DBL_MAX; + int grp; + struct boxplot *boxplot; + struct string title; + ds_init_empty (&title); + + if (iact->n_vars > 0) + { + struct string istr; + ds_init_empty (&istr); + interaction_to_string (iact, &istr); + ds_put_format (&title, _("Boxplot of %s vs. %s"), + var_to_string (cmd->dep_vars[v]), + ds_cstr (&istr)); + ds_destroy (&istr); + } + else + ds_put_format (&title, _("Boxplot of %s"), var_to_string (cmd->dep_vars[v])); + + for (grp = 0; grp < n_cats; ++grp) + { + const struct exploratory_stats *es = + categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp); + + if ( y_min > es[v].minimum) + y_min = es[v].minimum; + + if ( y_max < es[v].maximum) + y_max = es[v].maximum; + } + + boxplot = boxplot_create (y_min, y_max, ds_cstr (&title)); + + ds_destroy (&title); + + for (grp = 0; grp < n_cats; ++grp) + { + int ivar_idx; + struct string label; + + const struct ccase *c = + categoricals_get_case_by_category_real (cmd->cats, iact_idx, grp); + + const struct exploratory_stats *es = + categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp); + + ds_init_empty (&label); + for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx) + { + const struct variable *ivar = iact->vars[ivar_idx]; + const union value *val = case_data (c, ivar); + + ds_put_cstr (&label, var_to_string (ivar)); + ds_put_cstr (&label, " = "); + 
var_append_value_name (ivar, val, &label); + ds_put_cstr (&label, "; "); + } + + boxplot_add_box (boxplot, es[v].box_whisker, ds_cstr (&label)); + + ds_destroy (&label); + } + + boxplot_submit (boxplot); + } +} + +static void +show_boxplot_variabled (const struct examine *cmd, int iact_idx) +{ + int grp; + const struct interaction *iact = cmd->iacts[iact_idx]; + const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx); + + for (grp = 0; grp < n_cats; ++grp) + { + struct boxplot *boxplot; + int v; + double y_min = DBL_MAX; + double y_max = -DBL_MAX; + + const struct ccase *c = + categoricals_get_case_by_category_real (cmd->cats, iact_idx, grp); + + struct string title; + ds_init_empty (&title); + + for (v = 0; v < cmd->n_dep_vars; ++v) + { + const struct exploratory_stats *es = + categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp); + + if ( y_min > es[v].minimum) + y_min = es[v].minimum; + + if ( y_max < es[v].maximum) + y_max = es[v].maximum; + } + + if ( iact->n_vars == 0) + ds_put_format (&title, _("Boxplot")); + else + { + int ivar_idx; + struct string label; + ds_init_empty (&label); + for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx) + { + const struct variable *ivar = iact->vars[ivar_idx]; + const union value *val = case_data (c, ivar); + + ds_put_cstr (&label, var_to_string (ivar)); + ds_put_cstr (&label, " = "); + var_append_value_name (ivar, val, &label); + ds_put_cstr (&label, "; "); + } + + ds_put_format (&title, _("Boxplot of %s"), + ds_cstr (&label)); + + ds_destroy (&label); + } + + boxplot = boxplot_create (y_min, y_max, ds_cstr (&title)); + + ds_destroy (&title); + + for (v = 0; v < cmd->n_dep_vars; ++v) + { + const struct exploratory_stats *es = + categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp); + + boxplot_add_box (boxplot, es[v].box_whisker, + var_to_string (cmd->dep_vars[v])); + } + + boxplot_submit (boxplot); + } +} + + +static void +show_npplot (const struct examine *cmd, int iact_idx) +{ 
+ const struct interaction *iact = cmd->iacts[iact_idx]; + const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx); + + int v; + + for (v = 0; v < cmd->n_dep_vars; ++v) + { + int grp; + for (grp = 0; grp < n_cats; ++grp) + { + struct chart_item *npp, *dnpp; + struct casereader *reader; + struct np *np; + + int ivar_idx; + const struct ccase *c = + categoricals_get_case_by_category_real (cmd->cats, + iact_idx, grp); + + const struct exploratory_stats *es = + categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp); + + struct string label; + ds_init_cstr (&label, + var_to_string (cmd->dep_vars[v])); + + if ( iact->n_vars > 0) + { + ds_put_cstr (&label, " ("); + for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx) + { + const struct variable *ivar = iact->vars[ivar_idx]; + const union value *val = case_data (c, ivar); + + ds_put_cstr (&label, var_to_string (ivar)); + ds_put_cstr (&label, " = "); + var_append_value_name (ivar, val, &label); + ds_put_cstr (&label, "; "); + + } + ds_put_cstr (&label, ")"); + } + + np = es[v].np; + reader = casewriter_make_reader (np->writer); + np->writer = NULL; + + npp = np_plot_create (np, reader, ds_cstr (&label)); + dnpp = dnp_plot_create (np, reader, ds_cstr (&label)); + + if (npp == NULL || dnpp == NULL) + { + msg (MW, _("Not creating NP plot because data set is empty.")); + chart_item_unref (npp); + chart_item_unref (dnpp); + } + else + { + chart_item_submit (npp); + chart_item_submit (dnpp); + } + + ds_destroy (&label); + } + } +} + + +static void +show_histogram (const struct examine *cmd, int iact_idx) +{ + const struct interaction *iact = cmd->iacts[iact_idx]; + const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx); + + int v; + + for (v = 0; v < cmd->n_dep_vars; ++v) + { + int grp; + for (grp = 0; grp < n_cats; ++grp) + { + double n, mean, var; + int ivar_idx; + const struct ccase *c = + categoricals_get_case_by_category_real (cmd->cats, + iact_idx, grp); + + const struct 
exploratory_stats *es = + categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, grp); + + struct string label; + ds_init_cstr (&label, + var_to_string (cmd->dep_vars[v])); + + if ( iact->n_vars > 0) + { + ds_put_cstr (&label, " ("); + for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx) + { + const struct variable *ivar = iact->vars[ivar_idx]; + const union value *val = case_data (c, ivar); + + ds_put_cstr (&label, var_to_string (ivar)); + ds_put_cstr (&label, " = "); + var_append_value_name (ivar, val, &label); + ds_put_cstr (&label, "; "); + + } + ds_put_cstr (&label, ")"); + } + + + moments_calculate (es[v].mom, &n, &mean, &var, NULL, NULL); + + chart_item_submit + ( histogram_chart_create (es[v].histogram->gsl_hist, + ds_cstr (&label), n, mean, + sqrt (var), false)); + + + ds_destroy (&label); + } + } +} + +static void +percentiles_report (const struct examine *cmd, int iact_idx) +{ + const struct interaction *iact = cmd->iacts[iact_idx]; + int i, v; + const int heading_columns = 1 + iact->n_vars + 1; + const int heading_rows = 2; + struct tab_table *t; + + const size_t n_cats = categoricals_n_count (cmd->cats, iact_idx); + + const int rows_per_cat = 2; + const int rows_per_var = n_cats * rows_per_cat; + + const int nr = heading_rows + cmd->n_dep_vars * rows_per_var; + const int nc = heading_columns + cmd->n_percentiles; + + t = tab_create (nc, nr); + tab_title (t, _("Percentiles")); + + tab_headers (t, heading_columns, 0, heading_rows, 0); + + /* Internal Vertical lines */ + tab_box (t, -1, -1, -1, TAL_1, + heading_columns, 0, nc - 1, nr - 1); + + /* External Frame */ + tab_box (t, TAL_2, TAL_2, -1, -1, + 0, 0, nc - 1, nr - 1); + + tab_hline (t, TAL_2, 0, nc - 1, heading_rows); + tab_vline (t, TAL_2, heading_columns, 0, nr - 1); + + tab_joint_text (t, heading_columns, 0, + nc - 1, 0, + TAT_TITLE | TAB_CENTER, + _("Percentiles") + ); + + tab_hline (t, TAL_1, heading_columns, nc - 1, 1); + + + for (i = 0; i < cmd->n_percentiles; ++i) + { + 
tab_text_format (t, heading_columns + i, 1, + TAT_TITLE | TAB_CENTER, + _("%g"), cmd->ptiles[i]); + } + + for (i = 0; i < iact->n_vars; ++i) + { + tab_text (t, + 1 + i, 1, + TAT_TITLE, + var_to_string (iact->vars[i]) + ); + } + + tab_vline (t, TAL_1, heading_columns - 1, heading_rows, nr - 1); + + + for (v = 0; v < cmd->n_dep_vars; ++v) + { + const union value **prev_val = xxx0 (iact); + + int ivar_idx; + if ( v > 0 ) + tab_hline (t, TAL_1, 0, nc - 1, heading_rows + v * rows_per_var); + + tab_text (t, + 0, heading_rows + v * rows_per_var, + TAT_TITLE | TAB_LEFT, + var_to_string (cmd->dep_vars[v]) + ); + + for (i = 0; i < n_cats; ++i) + { + const struct ccase *c = + categoricals_get_case_by_category_real (cmd->cats, + iact_idx, i); + + const struct exploratory_stats *ess = + categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, i); + + const struct exploratory_stats *es = ess + v; + + int diff_idx = xxx1 (iact, c, prev_val); + + double hinges[3]; + int p; + + for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx) + { + const struct variable *ivar = iact->vars[ivar_idx]; + const union value *val = case_data (c, ivar); + + if (( diff_idx != -1 && diff_idx <= ivar_idx) + || i == 0) + { + struct string str; + ds_init_empty (&str); + var_append_value_name (ivar, val, &str); + + tab_text (t, + 1 + ivar_idx, + heading_rows + v * rows_per_var + i * rows_per_cat, + TAT_TITLE | TAB_LEFT, + ds_cstr (&str) + ); + + ds_destroy (&str); + } + } + + if ( diff_idx != -1 && diff_idx < iact->n_vars) + { + tab_hline (t, TAL_1, 1 + diff_idx, nc - 1, + heading_rows + v * rows_per_var + i * rows_per_cat + ); + } + + tab_text (t, heading_columns - 1, + heading_rows + v * rows_per_var + i * rows_per_cat, + TAT_TITLE | TAB_LEFT, + gettext (ptile_alg_desc [cmd->pc_alg])); + + tukey_hinges_calculate (es->hinges, hinges); + + for (p = 0; p < cmd->n_percentiles; ++p) + { + tab_double (t, heading_columns + p, + heading_rows + v * rows_per_var + i * rows_per_cat, + 0, + 
percentile_calculate (es->percentiles[p], cmd->pc_alg), + 0); + + if (cmd->ptiles[p] == 25.0) + { + tab_double (t, heading_columns + p, + heading_rows + v * rows_per_var + i * rows_per_cat + 1, + 0, + hinges[0], + 0); + } + else if (cmd->ptiles[p] == 50.0) + { + tab_double (t, heading_columns + p, + heading_rows + v * rows_per_var + i * rows_per_cat + 1, + 0, + hinges[1], + 0); + } + else if (cmd->ptiles[p] == 75.0) + { + tab_double (t, heading_columns + p, + heading_rows + v * rows_per_var + i * rows_per_cat + 1, + 0, + hinges[2], + 0); + } + } + + + tab_text (t, heading_columns - 1, + heading_rows + v * rows_per_var + i * rows_per_cat + 1, + TAT_TITLE | TAB_LEFT, + _("Tukey's Hinges")); + + } + } + + tab_submit (t); +} + +static void +descriptives_report (const struct examine *cmd, int iact_idx) +{ + const struct interaction *iact = cmd->iacts[iact_idx]; + int i, v; + const int heading_columns = 1 + iact->n_vars + 2; + const int heading_rows = 1; + struct tab_table *t; + + size_t n_cats = categoricals_n_count (cmd->cats, iact_idx); + + const int rows_per_cat = 13; + const int rows_per_var = n_cats * rows_per_cat; + + const int nr = heading_rows + cmd->n_dep_vars * rows_per_var; + const int nc = 2 + heading_columns; + + t = tab_create (nc, nr); + tab_title (t, _("Descriptives")); + + tab_headers (t, heading_columns, 0, heading_rows, 0); + + /* Internal Vertical lines */ + tab_box (t, -1, -1, -1, TAL_1, + heading_columns, 0, nc - 1, nr - 1); + + /* External Frame */ + tab_box (t, TAL_2, TAL_2, -1, -1, + 0, 0, nc - 1, nr - 1); + + tab_hline (t, TAL_2, 0, nc - 1, heading_rows); + tab_vline (t, TAL_2, heading_columns, 0, nr - 1); + + + tab_text (t, heading_columns, 0, TAB_CENTER | TAT_TITLE, + _("Statistic")); + + tab_text (t, heading_columns + 1, 0, TAB_CENTER | TAT_TITLE, + _("Std. 
Error")); + + for (i = 0; i < iact->n_vars; ++i) + { + tab_text (t, + 1 + i, 0, + TAT_TITLE, + var_to_string (iact->vars[i]) + ); + } + + for (v = 0; v < cmd->n_dep_vars; ++v) + { + const union value **prev_val = xxx0 (iact); + + int ivar_idx; + if ( v > 0 ) + tab_hline (t, TAL_1, 0, nc - 1, heading_rows + v * rows_per_var); + + tab_text (t, + 0, heading_rows + v * rows_per_var, + TAT_TITLE | TAB_LEFT, + var_to_string (cmd->dep_vars[v]) + ); + + for (i = 0; i < n_cats; ++i) + { + const struct ccase *c = + categoricals_get_case_by_category_real (cmd->cats, + iact_idx, i); + + const struct exploratory_stats *ess = + categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, i); + + const struct exploratory_stats *es = ess + v; + + const int diff_idx = xxx1 (iact, c, prev_val); + + double m0, m1, m2, m3, m4; + double tval; + + moments_calculate (es->mom, &m0, &m1, &m2, &m3, &m4); + + tval = gsl_cdf_tdist_Qinv ((1.0 - cmd->conf) / 2.0, m0 - 1.0); + + for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx) + { + const struct variable *ivar = iact->vars[ivar_idx]; + const union value *val = case_data (c, ivar); + + if (( diff_idx != -1 && diff_idx <= ivar_idx) + || i == 0) + { + struct string str; + ds_init_empty (&str); + var_append_value_name (ivar, val, &str); + + tab_text (t, + 1 + ivar_idx, + heading_rows + v * rows_per_var + i * rows_per_cat, + TAT_TITLE | TAB_LEFT, + ds_cstr (&str) + ); + + ds_destroy (&str); + } + } + + if ( diff_idx != -1 && diff_idx < iact->n_vars) + { + tab_hline (t, TAL_1, 1 + diff_idx, nc - 1, + heading_rows + v * rows_per_var + i * rows_per_cat + ); + } + + tab_text (t, + 1 + iact->n_vars, + heading_rows + v * rows_per_var + i * rows_per_cat, + TAB_LEFT, + _("Mean") + ); + + tab_double (t, + 1 + iact->n_vars + 2, + heading_rows + v * rows_per_var + i * rows_per_cat, + 0, m1, 0); + + tab_double (t, + 1 + iact->n_vars + 3, + heading_rows + v * rows_per_var + i * rows_per_cat, + 0, calc_semean (m2, m0), 0); + + tab_text_format (t, + 1 + 
iact->n_vars, + heading_rows + v * rows_per_var + i * rows_per_cat + 1, + TAB_LEFT, + _("%g%% Confidence Interval for Mean"), + cmd->conf * 100.0 + ); + + tab_text (t, + 1 + iact->n_vars + 1, + heading_rows + v * rows_per_var + i * rows_per_cat + 1, + TAB_LEFT, + _("Lower Bound") + ); + + tab_double (t, + 1 + iact->n_vars + 2, + heading_rows + v * rows_per_var + i * rows_per_cat + 1, + 0, m1 - tval * calc_semean (m2, m0), 0); + + + tab_text (t, + 1 + iact->n_vars + 1, + heading_rows + v * rows_per_var + i * rows_per_cat + 2, + TAB_LEFT, + _("Upper Bound") + ); + + tab_double (t, + 1 + iact->n_vars + 2, + heading_rows + v * rows_per_var + i * rows_per_cat + 2, + 0, m1 + tval * calc_semean (m2, m0), 0); + + + tab_text (t, + 1 + iact->n_vars, + heading_rows + v * rows_per_var + i * rows_per_cat + 3, + TAB_LEFT, + _("5% Trimmed Mean") + ); + + tab_double (t, + 1 + iact->n_vars + 2, + heading_rows + v * rows_per_var + i * rows_per_cat + 3, + 0, + trimmed_mean_calculate (es->trimmed_mean), + 0); + + tab_text (t, + 1 + iact->n_vars, + heading_rows + v * rows_per_var + i * rows_per_cat + 4, + TAB_LEFT, + _("Median") + ); + + tab_double (t, + 1 + iact->n_vars + 2, + heading_rows + v * rows_per_var + i * rows_per_cat + 4, + 0, + percentile_calculate (es->quartiles[1], cmd->pc_alg), + 0); + + + tab_text (t, + 1 + iact->n_vars, + heading_rows + v * rows_per_var + i * rows_per_cat + 5, + TAB_LEFT, + _("Variance") + ); + + tab_double (t, + 1 + iact->n_vars + 2, + heading_rows + v * rows_per_var + i * rows_per_cat + 5, + 0, m2, 0); + + tab_text (t, + 1 + iact->n_vars, + heading_rows + v * rows_per_var + i * rows_per_cat + 6, + TAB_LEFT, + _("Std. 
Deviation") + ); + + tab_double (t, + 1 + iact->n_vars + 2, + heading_rows + v * rows_per_var + i * rows_per_cat + 6, + 0, sqrt (m2), 0); + + tab_text (t, + 1 + iact->n_vars, + heading_rows + v * rows_per_var + i * rows_per_cat + 7, + TAB_LEFT, + _("Minimum") + ); + + tab_double (t, + 1 + iact->n_vars + 2, + heading_rows + v * rows_per_var + i * rows_per_cat + 7, + 0, + es->minima[0].val, + 0); + + tab_text (t, + 1 + iact->n_vars, + heading_rows + v * rows_per_var + i * rows_per_cat + 8, + TAB_LEFT, + _("Maximum") + ); + + tab_double (t, + 1 + iact->n_vars + 2, + heading_rows + v * rows_per_var + i * rows_per_cat + 8, + 0, + es->maxima[0].val, + 0); + + tab_text (t, + 1 + iact->n_vars, + heading_rows + v * rows_per_var + i * rows_per_cat + 9, + TAB_LEFT, + _("Range") + ); + + tab_double (t, + 1 + iact->n_vars + 2, + heading_rows + v * rows_per_var + i * rows_per_cat + 9, + 0, + es->maxima[0].val - es->minima[0].val, + 0); + + tab_text (t, + 1 + iact->n_vars, + heading_rows + v * rows_per_var + i * rows_per_cat + 10, + TAB_LEFT, + _("Interquartile Range") + ); + + + tab_double (t, + 1 + iact->n_vars + 2, + heading_rows + v * rows_per_var + i * rows_per_cat + 10, + 0, + percentile_calculate (es->quartiles[2], cmd->pc_alg) - + percentile_calculate (es->quartiles[0], cmd->pc_alg), + 0); + + + + + tab_text (t, + 1 + iact->n_vars, + heading_rows + v * rows_per_var + i * rows_per_cat + 11, + TAB_LEFT, + _("Skewness") + ); + + tab_double (t, + 1 + iact->n_vars + 2, + heading_rows + v * rows_per_var + i * rows_per_cat + 11, + 0, m3, 0); + + tab_double (t, + 1 + iact->n_vars + 3, + heading_rows + v * rows_per_var + i * rows_per_cat + 11, + 0, calc_seskew (m0), 0); + + tab_text (t, + 1 + iact->n_vars, + heading_rows + v * rows_per_var + i * rows_per_cat + 12, + TAB_LEFT, + _("Kurtosis") + ); + + tab_double (t, + 1 + iact->n_vars + 2, + heading_rows + v * rows_per_var + i * rows_per_cat + 12, + 0, m4, 0); + + tab_double (t, + 1 + iact->n_vars + 3, + heading_rows + v * 
rows_per_var + i * rows_per_cat + 12, + 0, calc_sekurt (m0), 0); + } + } + tab_submit (t); +} + + +static void +extremes_report (const struct examine *cmd, int iact_idx) +{ + const struct interaction *iact = cmd->iacts[iact_idx]; + int i, v; + const int heading_columns = 1 + iact->n_vars + 2; + const int heading_rows = 1; + struct tab_table *t; + + size_t n_cats = categoricals_n_count (cmd->cats, iact_idx); + + const int rows_per_cat = 2 * cmd->disp_extremes; + const int rows_per_var = n_cats * rows_per_cat; + + const int nr = heading_rows + cmd->n_dep_vars * rows_per_var; + const int nc = 2 + heading_columns; + + t = tab_create (nc, nr); + tab_title (t, _("Extreme Values")); + + tab_headers (t, heading_columns, 0, heading_rows, 0); + + /* Internal Vertical lines */ + tab_box (t, -1, -1, -1, TAL_1, + heading_columns, 0, nc - 1, nr - 1); + + /* External Frame */ + tab_box (t, TAL_2, TAL_2, -1, -1, + 0, 0, nc - 1, nr - 1); + + tab_hline (t, TAL_2, 0, nc - 1, heading_rows); + tab_vline (t, TAL_2, heading_columns, 0, nr - 1); + + + if ( cmd->id_var ) + tab_text (t, heading_columns, 0, TAB_CENTER | TAT_TITLE, + var_to_string (cmd->id_var)); + else + tab_text (t, heading_columns, 0, TAB_CENTER | TAT_TITLE, + _("Case Number")); + + tab_text (t, heading_columns + 1, 0, TAB_CENTER | TAT_TITLE, + _("Value")); + + for (i = 0; i < iact->n_vars; ++i) + { + tab_text (t, + 1 + i, 0, + TAT_TITLE, + var_to_string (iact->vars[i]) + ); + } + + for (v = 0; v < cmd->n_dep_vars; ++v) + { + const union value **prev_val = xxx0 (iact); + + int ivar_idx; + if ( v > 0 ) + tab_hline (t, TAL_1, 0, nc - 1, heading_rows + v * rows_per_var); + + tab_text (t, + 0, heading_rows + v * rows_per_var, + TAT_TITLE, + var_to_string (cmd->dep_vars[v]) + ); + + for (i = 0; i < n_cats; ++i) + { + int e; + const struct ccase *c = + categoricals_get_case_by_category_real (cmd->cats, iact_idx, i); + + const struct exploratory_stats *ess = + categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, i); 
+ + const struct exploratory_stats *es = ess + v; + + int diff_idx = xxx1 (iact, c, prev_val); + + for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx) + { + const struct variable *ivar = iact->vars[ivar_idx]; + const union value *val = case_data (c, ivar); + + if (( diff_idx != -1 && diff_idx <= ivar_idx) + || i == 0) + { + struct string str; + ds_init_empty (&str); + var_append_value_name (ivar, val, &str); + + tab_text (t, + 1 + ivar_idx, + heading_rows + v * rows_per_var + i * rows_per_cat, + TAT_TITLE | TAB_LEFT, + ds_cstr (&str) + ); + + ds_destroy (&str); + } + } + + if ( diff_idx != -1 && diff_idx < iact->n_vars) + { + tab_hline (t, TAL_1, 1 + diff_idx, nc - 1, + heading_rows + v * rows_per_var + i * rows_per_cat + ); + } + + tab_text (t, + heading_columns - 2, + heading_rows + v * rows_per_var + i * rows_per_cat, + TAB_RIGHT, + _("Highest")); + + + tab_hline (t, TAL_1, heading_columns - 2, nc - 1, + heading_rows + v * rows_per_var + i * rows_per_cat + cmd->disp_extremes + ); + + tab_text (t, + heading_columns - 2, + heading_rows + v * rows_per_var + i * rows_per_cat + cmd->disp_extremes, + TAB_RIGHT, + _("Lowest")); + + for (e = 0 ; e < cmd->disp_extremes; ++e) + { + tab_double (t, + heading_columns - 1, + heading_rows + v * rows_per_var + i * rows_per_cat + e, + TAB_RIGHT, + e + 1, + &F_8_0); + + /* The casenumber */ + tab_double (t, + heading_columns, + heading_rows + v * rows_per_var + i * rows_per_cat + e, + 0, + es->maxima[e].identity, + &F_8_0); + + + tab_double (t, + heading_columns + 1, + heading_rows + v * rows_per_var + i * rows_per_cat + e, + 0, + es->maxima[e].val, + 0); + + + + tab_double (t, + heading_columns - 1, + heading_rows + v * rows_per_var + i * rows_per_cat + cmd->disp_extremes + e, + TAB_RIGHT, + e + 1, + &F_8_0); + + /* The casenumber */ + tab_double (t, + heading_columns, + heading_rows + v * rows_per_var + i * rows_per_cat + cmd->disp_extremes + e, + 0, + es->minima[e].identity, + &F_8_0); + + tab_double (t, + heading_columns 
+ 1, + heading_rows + v * rows_per_var + i * rows_per_cat + cmd->disp_extremes + e, + 0, + es->minima[e].val, + 0); + } + } + } + + tab_submit (t); +} + + +static void +summary_report (const struct examine *cmd, int iact_idx) +{ + const struct interaction *iact = cmd->iacts[iact_idx]; + int i, v; + const int heading_columns = 1 + iact->n_vars; + const int heading_rows = 3; + struct tab_table *t; + + const struct fmt_spec *wfmt = cmd->wv ? var_get_print_format (cmd->wv) : &F_8_0; + + size_t n_cats = categoricals_n_count (cmd->cats, iact_idx); + + const int nr = heading_rows + n_cats * cmd->n_dep_vars; + const int nc = 6 + heading_columns; + + t = tab_create (nc, nr); + tab_title (t, _("Case Processing Summary")); + + tab_headers (t, heading_columns, 0, heading_rows, 0); + + /* Internal Vertical lines */ + tab_box (t, -1, -1, -1, TAL_1, + heading_columns, 0, nc - 1, nr - 1); + + /* External Frame */ + tab_box (t, TAL_2, TAL_2, -1, -1, + 0, 0, nc - 1, nr - 1); + + tab_hline (t, TAL_2, 0, nc - 1, heading_rows); + tab_vline (t, TAL_2, heading_columns, 0, nr - 1); + + tab_joint_text (t, heading_columns, 0, + nc - 1, 0, TAB_CENTER | TAT_TITLE, _("Cases")); + tab_joint_text (t, + heading_columns, 1, + heading_columns + 1, 1, + TAB_CENTER | TAT_TITLE, _("Valid")); + + tab_joint_text (t, + heading_columns + 2, 1, + heading_columns + 3, 1, + TAB_CENTER | TAT_TITLE, _("Missing")); + + tab_joint_text (t, + heading_columns + 4, 1, + heading_columns + 5, 1, + TAB_CENTER | TAT_TITLE, _("Total")); + + for (i = 0; i < 3; ++i) + { + tab_text (t, heading_columns + i * 2, 2, TAB_CENTER | TAT_TITLE, + _("N")); + tab_text (t, heading_columns + i * 2 + 1, 2, TAB_CENTER | TAT_TITLE, + _("Percent")); + } + + for (i = 0; i < iact->n_vars; ++i) + { + tab_text (t, + 1 + i, 2, + TAT_TITLE, + var_to_string (iact->vars[i]) + ); + } + + if (n_cats > 0) + for (v = 0; v < cmd->n_dep_vars; ++v) + { + int ivar_idx; + const union value **prev_val = xxx0 (iact); + + if ( v > 0 ) + tab_hline (t, TAL_1, 
0, nc - 1, heading_rows + v * n_cats); + + tab_text (t, + 0, heading_rows + n_cats * v, + TAT_TITLE, + var_to_string (cmd->dep_vars[v]) + ); + + + for (i = 0; i < n_cats; ++i) + { + double total; + const struct exploratory_stats *es; + + const struct ccase *c = + categoricals_get_case_by_category_real (cmd->cats, + iact_idx, i); + if (c) + { + int diff_idx = xxx1 (iact, c, prev_val); + + if ( diff_idx != -1 && diff_idx < iact->n_vars - 1) + tab_hline (t, TAL_1, 1 + diff_idx, nc - 1, + heading_rows + n_cats * v + i + ); + + for (ivar_idx = 0; ivar_idx < iact->n_vars; ++ivar_idx) + { + const struct variable *ivar = iact->vars[ivar_idx]; + const union value *val = case_data (c, ivar); + + if (( diff_idx != -1 && diff_idx <= ivar_idx) + || i == 0) + { + struct string str; + ds_init_empty (&str); + var_append_value_name (ivar, val, &str); + + tab_text (t, + 1 + ivar_idx, heading_rows + n_cats * v + i, + TAT_TITLE | TAB_LEFT, + ds_cstr (&str) + ); + + ds_destroy (&str); + } + } + } + + es = categoricals_get_user_data_by_category_real (cmd->cats, iact_idx, i); + + + total = es[v].missing + es[v].non_missing; + tab_double (t, + heading_columns + 0, + heading_rows + n_cats * v + i, + 0, + es[v].non_missing, + wfmt); + + + tab_text_format (t, + heading_columns + 1, + heading_rows + n_cats * v + i, + 0, + "%g%%", + 100.0 * es[v].non_missing / total + ); + + + tab_double (t, + heading_columns + 2, + heading_rows + n_cats * v + i, + 0, + es[v].missing, + wfmt); + + tab_text_format (t, + heading_columns + 3, + heading_rows + n_cats * v + i, + 0, + "%g%%", + 100.0 * es[v].missing / total + ); + tab_double (t, + heading_columns + 4, + heading_rows + n_cats * v + i, + 0, + total, + wfmt); + + /* This can only be 100% can't it? 
*/ + tab_text_format (t, + heading_columns + 5, + heading_rows + n_cats * v + i, + 0, + "%g%%", + 100.0 * (es[v].missing + es[v].non_missing)/ total + ); + } + } + + tab_hline (t, TAL_1, heading_columns, nc - 1, 1); + tab_hline (t, TAL_1, heading_columns, nc - 1, 2); + + tab_submit (t); +} + + +/* Match a variable. + If the match succeeds, the variable will be placed in VAR. + Returns true if successful */ +static bool +lex_match_variable (struct lexer *lexer, + const struct dictionary *dict, const struct variable **var) +{ + if (lex_token (lexer) != T_ID) + + return false; + + *var = parse_variable_const (lexer, dict); + + if ( *var == NULL) + return false; + return true; +} + +/* Attempt to parse an interaction from LEXER */ +static struct interaction * +parse_interaction (struct lexer *lexer, struct examine *ex) +{ + const struct variable *v = NULL; + struct interaction *iact = NULL; + + if ( lex_match_variable (lexer, ex->dict, &v)) + { + iact = interaction_create (v); + + while (lex_match (lexer, T_BY)) + { + if (!lex_match_variable (lexer, ex->dict, &v)) + { + interaction_destroy (iact); + return NULL; + } + interaction_add_variable (iact, v); + } + lex_match (lexer, T_COMMA); + } + + return iact; +} + + +static void * +create_n (const void *aux1, void *aux2 UNUSED) +{ + int v; + + const struct examine *examine = aux1; + struct exploratory_stats *es = xcalloc (examine->n_dep_vars, sizeof (*es)); + + struct caseproto *proto = caseproto_create (); + proto = caseproto_add_width (proto, 0); /* value */ + proto = caseproto_add_width (proto, 0); /* id */ + proto = caseproto_add_width (proto, 0); /* weight */ + + for (v = 0; v < examine->n_dep_vars; v++) + { + struct subcase ordering; + + subcase_init (&ordering, 0, 0, SC_ASCEND); + + es[v].sorted_writer = sort_create_writer (&ordering, proto); + es[v].sorted_reader = NULL; + + es[v].mom = moments_create (MOMENT_KURTOSIS); + es[v].cmin = DBL_MAX; + + es[v].maximum = -DBL_MAX; + es[v].minimum = DBL_MAX; + } + return 
es; +} + +static void +update_n (const void *aux1, void *aux2 UNUSED, void *user_data, + const struct ccase *c, double weight) +{ + int v; + const struct examine *examine = aux1; + struct exploratory_stats *es = user_data; + + struct caseproto *proto = caseproto_create (); + proto = caseproto_add_width (proto, 0); /* value */ + proto = caseproto_add_width (proto, 0); /* id */ + proto = caseproto_add_width (proto, 0); /* weight */ + + for (v = 0; v < examine->n_dep_vars; v++) + { + struct ccase *outcase = case_create (proto); + const struct variable *var = examine->dep_vars[v]; + const double x = case_data (c, var)->f; + + if (var_is_value_missing (var, case_data (c, var), examine->exclude)) + { + es[v].missing += weight; + continue; + } + + if (x > es[v].maximum) + es[v].maximum = x; + + if (x < es[v].minimum) + es[v].minimum = x; + + es[v].non_missing += weight; + + moments_pass_one (es[v].mom, x, weight); + + /* Save the value and the casenumber to the writer */ + case_data_rw_idx (outcase, EX_VAL)->f = x; + if ( examine->id_idx != -1) + case_data_rw_idx (outcase, EX_ID)->f = case_data_idx (c, examine->id_idx)->f; + + case_data_rw_idx (outcase, EX_WT)->f = weight; + + es[v].cc += weight; + + if (es[v].cmin > weight) + es[v].cmin = weight; + + casewriter_write (es[v].sorted_writer, outcase); + } +} + +static void +calculate_n (const void *aux1, void *aux2 UNUSED, void *user_data) +{ + int v; + const struct examine *examine = aux1; + struct exploratory_stats *es = user_data; + + for (v = 0; v < examine->n_dep_vars; v++) + { + int i; + casenumber imin = 0; + double imax = es[v].cc; + struct casereader *reader; + struct ccase *c; + casenumber total_cases; + + if (examine->histogram) + { + es[v].histogram = + histogram_create (10, es[v].minimum, es[v].maximum); + } + + es[v].sorted_reader = casewriter_make_reader (es[v].sorted_writer); + total_cases = casereader_count_cases (casereader_clone (es[v].sorted_reader)); + es[v].sorted_writer = NULL; + + es[v].maxima = 
xcalloc (examine->calc_extremes, sizeof (*es[v].maxima)); + es[v].minima = xcalloc (examine->calc_extremes, sizeof (*es[v].minima)); + + for (reader = casereader_clone (es[v].sorted_reader); + (c = casereader_read (reader)) != NULL; case_unref (c)) + { + const double val = case_data_idx (c, EX_VAL)->f; + const double wt = case_data_idx (c, EX_WT)->f; /* FIXME: What about fractional weights ??? */ + + moments_pass_two (es[v].mom, val, wt); + + if (es[v].histogram) + histogram_add (es[v].histogram, val, wt); + + if (imin < examine->calc_extremes) + { + int x; + for (x = imin; x < examine->calc_extremes; ++x) + { + struct extremity *min = &es[v].minima[x]; + min->val = val; + min->identity = case_data_idx (c, EX_ID)->f; + } + imin += wt; + } + + imax -= wt; + if (imax < examine->calc_extremes) + { + int x; + + for (x = imax; x < imax + wt; ++x) + { + struct extremity *max; + + if (x >= examine->calc_extremes) + break; + + max = &es[v].maxima[x]; + max->val = val; + max->identity = case_data_idx (c, EX_ID)->f; + } + } + } + casereader_destroy (reader); + + if (examine->calc_extremes > 0) + { + assert (es[v].minima[0].val == es[v].minimum); + assert (es[v].maxima[0].val == es[v].maximum); + } + + { + const int n_os = 5 + examine->n_percentiles; + struct order_stats **os ; + es[v].percentiles = xcalloc (examine->n_percentiles, sizeof (*es[v].percentiles)); + + es[v].trimmed_mean = trimmed_mean_create (es[v].cc, 0.05); + + os = xcalloc (n_os, sizeof *os); + os[0] = &es[v].trimmed_mean->parent; + + es[v].quartiles[0] = percentile_create (0.25, es[v].cc); + es[v].quartiles[1] = percentile_create (0.5, es[v].cc); + es[v].quartiles[2] = percentile_create (0.75, es[v].cc); + + os[1] = &es[v].quartiles[0]->parent; + os[2] = &es[v].quartiles[1]->parent; + os[3] = &es[v].quartiles[2]->parent; + + es[v].hinges = tukey_hinges_create (es[v].cc, es[v].cmin); + os[4] = &es[v].hinges->parent; + + for (i = 0; i < examine->n_percentiles; ++i) + { + es[v].percentiles[i] = 
percentile_create (examine->ptiles[i] / 100.00, es[v].cc); + os[5 + i] = &es[v].percentiles[i]->parent; + } + + order_stats_accumulate_idx (os, n_os, + casereader_clone (es[v].sorted_reader), + EX_WT, EX_VAL); + } + + if (examine->boxplot) + { + struct order_stats *os; + + es[v].box_whisker = box_whisker_create (es[v].hinges, + EX_ID); + + os = &es[v].box_whisker->parent; + order_stats_accumulate_idx (&os, 1, + casereader_clone (es[v].sorted_reader), + EX_WT, EX_VAL); + } + + if (examine->npplot) + { + double n, mean, var; + struct order_stats *os; + + moments_calculate (es[v].mom, &n, &mean, &var, NULL, NULL); + + es[v].np = np_create (n, mean, var); + + os = &es[v].np->parent; + + order_stats_accumulate_idx (&os, 1, + casereader_clone (es[v].sorted_reader), + EX_WT, EX_VAL); + } + + } +} + +static void +run_examine (struct examine *cmd, struct casereader *input) +{ + int i; + struct ccase *c; + struct casereader *reader; + + struct payload payload; + payload.create = create_n; + payload.update = update_n; + payload.destroy = calculate_n; + + cmd->wv = dict_get_weight (cmd->dict); + + cmd->id_idx = -1; + cmd->cats + = categoricals_create (cmd->iacts, cmd->n_iacts, + cmd->wv, cmd->exclude); + + categoricals_set_payload (cmd->cats, &payload, cmd, NULL); + + if (cmd->casenumbers) + { + struct ccase *c = casereader_peek (input, 0); + + if (cmd->id_var) + cmd->id_idx = var_get_case_index (cmd->id_var); + else + { + cmd->id_idx = case_get_value_cnt (c); + input = casereader_create_arithmetic_sequence (input, 1.0, 1.0); + } + + case_unref (c); + } + + /* FIXME: Filter out missing factor variables */ + + /* Remove cases on a listwise basis if requested */ + if ( cmd->missing_pw == false) + input = casereader_create_filter_missing (input, + cmd->dep_vars, + cmd->n_dep_vars, + cmd->exclude, + NULL, + NULL); + + for (reader = casereader_clone (input); + (c = casereader_read (reader)) != NULL; case_unref (c)) + { + categoricals_update (cmd->cats, c); + } + casereader_destroy 
(reader); + categoricals_done (cmd->cats); + + for (i = 0; i < cmd->n_iacts; ++i) + { + summary_report (cmd, i); + + if (cmd->disp_extremes > 0) + extremes_report (cmd, i); + + if (cmd->n_percentiles > 0) + percentiles_report (cmd, i); + + if (cmd->boxplot) + { + switch (cmd->boxplot_mode) + { + case BP_GROUPS: + show_boxplot_grouped (cmd, i); + break; + case BP_VARIABLES: + show_boxplot_variabled (cmd, i); + break; + default: + NOT_REACHED (); + break; + } + } + + if (cmd->histogram) + show_histogram (cmd, i); + + if (cmd->npplot) + show_npplot (cmd, i); + + if (cmd->descriptives) + descriptives_report (cmd, i); + } +} + +int +cmd_examine (struct lexer *lexer, struct dataset *ds) +{ + bool nototals_seen = false; + bool totals_seen = false; + + struct interaction **iacts_mem = NULL; + struct examine examine; + bool percentiles_seen = false; + + examine.casenumbers = false; + examine.missing_pw = false; + examine.disp_extremes = 0; + examine.calc_extremes = 0; + examine.descriptives = false; + examine.conf = 0.95; + examine.pc_alg = PC_HAVERAGE; + examine.ptiles = NULL; + examine.n_percentiles = 0; + examine.id_var = 0; + examine.boxplot_mode = BP_GROUPS; + + + /* Allocate space for the first interaction. + This is interaction is an empty one (for the totals). + If no totals are requested, we will simply ignore this + interaction. + */ + examine.n_iacts = 1; + examine.iacts = iacts_mem = xzalloc (sizeof (struct interaction *)); + examine.iacts[0] = interaction_create (NULL); + + examine.exclude = MV_ANY; + examine.histogram = false; + examine.npplot = false; + examine.boxplot = false; + + examine.dict = dataset_dict (ds); + + /* Accept an optional, completely pointless "/VARIABLES=" */ + lex_match (lexer, T_SLASH); + if (lex_match_id (lexer, "VARIABLES")) + { + if (! 
lex_force_match (lexer, T_EQUALS) ) + goto error; + } + + if (!parse_variables_const (lexer, examine.dict, + &examine.dep_vars, &examine.n_dep_vars, + PV_NO_DUPLICATE | PV_NUMERIC)) + goto error; + + if (lex_match (lexer, T_BY)) + { + struct interaction *iact = NULL; + do + { + iact = parse_interaction (lexer, &examine); + if (iact) + { + examine.n_iacts++; + iacts_mem = + xrealloc (iacts_mem, + sizeof (*iacts_mem) * examine.n_iacts); + + iacts_mem[examine.n_iacts - 1] = iact; + } + } + while (iact); + } + + + while (lex_token (lexer) != T_ENDCMD) + { + lex_match (lexer, T_SLASH); + + if (lex_match_id (lexer, "STATISTICS")) + { + lex_match (lexer, T_EQUALS); + + while (lex_token (lexer) != T_ENDCMD + && lex_token (lexer) != T_SLASH) + { + if (lex_match_id (lexer, "DESCRIPTIVES")) + { + examine.descriptives = true; + } + else if (lex_match_id (lexer, "EXTREME")) + { + int extr = 5; + if (lex_match (lexer, T_LPAREN)) + { + extr = lex_integer (lexer); + + if (extr < 0) + { + msg (MW, _("%s may not be negative. Using default value (%g)."), "EXTREME", 5.0); + extr = 5; + } + + lex_get (lexer); + if (! 
lex_force_match (lexer, T_RPAREN)) + goto error; + } + examine.disp_extremes = extr; + } + else if (lex_match_id (lexer, "NONE")) + { + } + else if (lex_match (lexer, T_ALL)) + { + if (examine.disp_extremes == 0) + examine.disp_extremes = 5; + } + else + { + lex_error (lexer, NULL); + goto error; + } + } + } + else if (lex_match_id (lexer, "PERCENTILES")) + { + percentiles_seen = true; + if (lex_match (lexer, T_LPAREN)) + { + while (lex_is_number (lexer)) + { + double p = lex_number (lexer); + + if ( p <= 0 || p >= 100.0) + { + lex_error (lexer, + _("Percentiles must lie in the range (0, 100)")); + goto error; + } + + examine.n_percentiles++; + examine.ptiles = + xrealloc (examine.ptiles, + sizeof (*examine.ptiles) * + examine.n_percentiles); + + examine.ptiles[examine.n_percentiles - 1] = p; + + lex_get (lexer); + lex_match (lexer, T_COMMA); + } + if (!lex_force_match (lexer, T_RPAREN)) + goto error; + } + + lex_match (lexer, T_EQUALS); + + while (lex_token (lexer) != T_ENDCMD + && lex_token (lexer) != T_SLASH) + { + if (lex_match_id (lexer, "HAVERAGE")) + { + examine.pc_alg = PC_HAVERAGE; + } + else if (lex_match_id (lexer, "WAVERAGE")) + { + examine.pc_alg = PC_WAVERAGE; + } + else if (lex_match_id (lexer, "ROUND")) + { + examine.pc_alg = PC_ROUND; + } + else if (lex_match_id (lexer, "EMPIRICAL")) + { + examine.pc_alg = PC_EMPIRICAL; + } + else if (lex_match_id (lexer, "AEMPIRICAL")) + { + examine.pc_alg = PC_AEMPIRICAL; + } + else if (lex_match_id (lexer, "NONE")) + { + examine.pc_alg = PC_NONE; + } + else + { + lex_error (lexer, NULL); + goto error; + } + } + } + else if (lex_match_id (lexer, "TOTAL")) + { + totals_seen = true; + } + else if (lex_match_id (lexer, "NOTOTAL")) + { + nototals_seen = true; + } + else if (lex_match_id (lexer, "MISSING")) + { + lex_match (lexer, T_EQUALS); + + while (lex_token (lexer) != T_ENDCMD + && lex_token (lexer) != T_SLASH) + { + if (lex_match_id (lexer, "LISTWISE")) + { + examine.missing_pw = false; + } + else if 
(lex_match_id (lexer, "PAIRWISE")) + { + examine.missing_pw = true; + } + else if (lex_match_id (lexer, "EXCLUDE")) + { + examine.exclude = MV_ANY; + } + else if (lex_match_id (lexer, "INCLUDE")) + { + examine.exclude = MV_SYSTEM; + } + else + { + lex_error (lexer, NULL); + goto error; + } + } + } + else if (lex_match_id (lexer, "COMPARE")) + { + lex_match (lexer, T_EQUALS); + if (lex_match_id (lexer, "VARIABLES")) + { + examine.boxplot_mode = BP_VARIABLES; + } + else if (lex_match_id (lexer, "GROUPS")) + { + examine.boxplot_mode = BP_GROUPS; + } + else + { + lex_error (lexer, NULL); + goto error; + } + } + else if (lex_match_id (lexer, "PLOT")) + { + lex_match (lexer, T_EQUALS); + + while (lex_token (lexer) != T_ENDCMD + && lex_token (lexer) != T_SLASH) + { + if (lex_match_id (lexer, "BOXPLOT")) + { + examine.boxplot = true; + } + else if (lex_match_id (lexer, "NPPLOT")) + { + examine.npplot = true; + } + else if (lex_match_id (lexer, "HISTOGRAM")) + { + examine.histogram = true; + } + else if (lex_match_id (lexer, "NONE")) + { + examine.histogram = false; + examine.npplot = false; + examine.boxplot = false; + } + else if (lex_match (lexer, T_ALL)) + { + examine.histogram = true; + examine.npplot = true; + examine.boxplot = true; + } + else + { + lex_error (lexer, NULL); + goto error; + } + lex_match (lexer, T_COMMA); + } + } + else if (lex_match_id (lexer, "CINTERVAL")) + { + if ( !lex_force_num (lexer)) + goto error; + + examine.conf = lex_number (lexer); + lex_get (lexer); + } + else if (lex_match_id (lexer, "ID")) + { + lex_match (lexer, T_EQUALS); + + examine.id_var = parse_variable_const (lexer, examine.dict); + } + else + { + lex_error (lexer, NULL); + goto error; + } + } + + if ( totals_seen && nototals_seen) + { + msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL"); + goto error; + } + + /* If totals have been requested or if there are no factors + in this analysis, then the totals need to be included. 
*/ + if ( !nototals_seen || examine.n_iacts == 1) + { + examine.iacts = &iacts_mem[0]; + } + else + { + examine.n_iacts--; + examine.iacts = &iacts_mem[1]; + } + + + if (examine.disp_extremes > 0) + { + examine.calc_extremes = examine.disp_extremes; + examine.casenumbers = true; + } + + if (examine.boxplot) + { + examine.casenumbers = true; + } + + + if (examine.descriptives && examine.calc_extremes == 0) + { + /* Descriptives always displays the max and min */ + examine.calc_extremes = 1; + } + + if (percentiles_seen && examine.n_percentiles == 0) + { + examine.n_percentiles = 7; + examine.ptiles = xcalloc (examine.n_percentiles, + sizeof (*examine.ptiles)); + + examine.ptiles[0] = 5; + examine.ptiles[1] = 10; + examine.ptiles[2] = 25; + examine.ptiles[3] = 50; + examine.ptiles[4] = 75; + examine.ptiles[5] = 90; + examine.ptiles[6] = 95; + } + + assert (examine.calc_extremes >= examine.disp_extremes); + { + struct casegrouper *grouper; + struct casereader *group; + bool ok; + + grouper = casegrouper_create_splits (proc_open (ds), examine.dict); + while (casegrouper_get_next_group (grouper, &group)) + run_examine (&examine, group); + ok = casegrouper_destroy (grouper); + ok = proc_commit (ds) && ok; + } + + return CMD_SUCCESS; + + error: + return CMD_FAILURE; +} diff --git a/src/language/stats/examine.q b/src/language/stats/examine.q deleted file mode 100644 index 8bf913e92a..0000000000 --- a/src/language/stats/examine.q +++ /dev/null @@ -1,2084 +0,0 @@ -/* PSPP - a program for statistical analysis. - Copyright (C) 2004, 2008, 2009, 2010, 2011 Free Software Foundation, Inc. - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see . */ - -#include - -#include -#include -#include -#include - -#include "data/case.h" -#include "data/casegrouper.h" -#include "data/casereader.h" -#include "data/casewriter.h" -#include "data/dataset.h" -#include "data/dictionary.h" -#include "data/subcase.h" -#include "data/value-labels.h" -#include "data/variable.h" -#include "language/command.h" -#include "language/dictionary/split-file.h" -#include "language/lexer/lexer.h" -#include "libpspp/compiler.h" -#include "libpspp/message.h" -#include "libpspp/misc.h" -#include "libpspp/str.h" -#include "math/box-whisker.h" -#include "math/extrema.h" -#include "math/histogram.h" -#include "math/moments.h" -#include "math/np.h" -#include "math/order-stats.h" -#include "math/percentiles.h" -#include "math/sort.h" -#include "math/trimmed-mean.h" -#include "math/tukey-hinges.h" -#include "output/chart-item.h" -#include "output/charts/boxplot.h" -#include "output/charts/np-plot.h" -#include "output/charts/plot-hist.h" -#include "output/tab.h" - -#include "gl/minmax.h" -#include "gl/xalloc.h" - -#include "gettext.h" -#define _(msgid) gettext (msgid) -#define N_(msgid) msgid - -/* (headers) */ - -/* (specification) - "EXAMINE" (xmn_): - *^variables=custom; - +total=custom; - +nototal=custom; - missing=miss:pairwise/!listwise, - rep:report/!noreport, - incl:include/!exclude; - +compare=cmp:variables/!groups; - +percentiles=custom; - +id=var; - +plot[plt_]=stemleaf,boxplot,npplot,:spreadlevel(*d:n),histogram,all,none; - +cinterval=double; - +statistics[st_]=descriptives,:extreme(*d:n),all,none. 
-*/ - -/* (declarations) */ - -/* (functions) */ - - -static struct cmd_examine cmd; - -static const struct variable **dependent_vars; -static size_t n_dependent_vars; - -/* PERCENTILES */ - -static subc_list_double percentile_list; -static enum pc_alg percentile_algorithm; - -struct factor_metrics -{ - struct moments1 *moments; - - struct percentile **ptl; - size_t n_ptiles; - - struct tukey_hinges *tukey_hinges; - struct box_whisker *box_whisker; - struct trimmed_mean *trimmed_mean; - struct histogram *histogram; - struct np *np; - - /* Three quartiles indexing into PTL */ - struct percentile **quartiles; - - /* A reader sorted in ASCENDING order */ - struct casereader *up_reader; - - /* The minimum value of all the weights */ - double cmin; - - /* Sum of all weights, including those for missing values */ - double n; - - /* Sum of weights of non_missing values */ - double n_valid; - - double mean; - - double variance; - - double skewness; - - double kurtosis; - - double se_mean; - - struct extrema *minima; - struct extrema *maxima; -}; - -struct factor_result -{ - struct ll ll; - - union value value[2]; - - /* An array of factor metrics, one for each variable */ - struct factor_metrics *metrics; -}; - -struct xfactor -{ - /* We need to make a list of this structure */ - struct ll ll; - - /* The independent variable */ - const struct variable const* indep_var[2]; - - /* A list of results for this factor */ - struct ll_list result_list ; -}; - - -static void -factor_destroy (struct xfactor *fctr) -{ - struct ll *ll = ll_head (&fctr->result_list); - while (ll != ll_null (&fctr->result_list)) - { - int v; - struct factor_result *result = - ll_data (ll, struct factor_result, ll); - int i; - - for (v = 0; v < n_dependent_vars; ++v) - { - int i; - moments1_destroy (result->metrics[v].moments); - extrema_destroy (result->metrics[v].minima); - extrema_destroy (result->metrics[v].maxima); - statistic_destroy (&result->metrics[v].trimmed_mean->parent.parent); - 
statistic_destroy (&result->metrics[v].tukey_hinges->parent.parent); - statistic_destroy (&result->metrics[v].box_whisker->parent.parent); - statistic_destroy (&result->metrics[v].histogram->parent); - for (i = 0 ; i < result->metrics[v].n_ptiles; ++i) - statistic_destroy (&result->metrics[v].ptl[i]->parent.parent); - free (result->metrics[v].ptl); - free (result->metrics[v].quartiles); - casereader_destroy (result->metrics[v].up_reader); - } - - for (i = 0; i < 2; i++) - if (fctr->indep_var[i]) - value_destroy (&result->value[i], - var_get_width (fctr->indep_var[i])); - free (result->metrics); - ll = ll_next (ll); - free (result); - } -} - -static struct xfactor level0_factor; -static struct ll_list factor_list; - -/* Parse the clause specifying the factors */ -static int examine_parse_independent_vars (struct lexer *lexer, - const struct dictionary *dict, - struct cmd_examine *cmd); - -/* Output functions */ -static void show_summary (const struct variable **dependent_var, int n_dep_var, - const struct dictionary *dict, - const struct xfactor *f); - - -static void show_descriptives (const struct variable **dependent_var, - int n_dep_var, - const struct xfactor *f); - - -static void show_percentiles (const struct variable **dependent_var, - int n_dep_var, - const struct xfactor *f); - - -static void show_extremes (const struct variable **dependent_var, - int n_dep_var, - const struct xfactor *f); - - - - -/* Per Split function */ -static void run_examine (struct cmd_examine *, struct casereader *, - struct dataset *); - -static void output_examine (const struct dictionary *dict); - - -void factor_calc (const struct ccase *c, int case_no, - double weight, bool case_missing); - - -/* Represent a factor as a string, so it can be - printed in a human readable fashion */ -static void factor_to_string (const struct xfactor *fctr, - const struct factor_result *result, - struct string *str); - -/* Represent a factor as a string, so it can be - printed in a human readable 
fashion, - but sacrificing some readablility for the sake of brevity */ -static void -factor_to_string_concise (const struct xfactor *fctr, - const struct factor_result *result, - struct string *str - ); - - - -/* Categories of missing values to exclude. */ -static enum mv_class exclude_values; - -int -cmd_examine (struct lexer *lexer, struct dataset *ds) -{ - struct casegrouper *grouper; - struct casereader *group; - bool ok; - - subc_list_double_create (&percentile_list); - percentile_algorithm = PC_HAVERAGE; - - ll_init (&factor_list); - - if ( !parse_examine (lexer, ds, &cmd, NULL) ) - { - subc_list_double_destroy (&percentile_list); - return CMD_FAILURE; - } - - /* If /MISSING=INCLUDE is set, then user missing values are ignored */ - exclude_values = cmd.incl == XMN_INCLUDE ? MV_SYSTEM : MV_ANY; - - if ( cmd.st_n == SYSMIS ) - cmd.st_n = 5; - - if ( ! cmd.sbc_cinterval) - cmd.n_cinterval[0] = 95.0; - - /* If descriptives have been requested, make sure the - quartiles are calculated */ - if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] ) - { - subc_list_double_push (&percentile_list, 25); - subc_list_double_push (&percentile_list, 50); - subc_list_double_push (&percentile_list, 75); - } - - grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds)); - - while (casegrouper_get_next_group (grouper, &group)) - { - struct casereader *reader = - casereader_create_arithmetic_sequence (group, 1, 1); - - run_examine (&cmd, reader, ds); - } - - ok = casegrouper_destroy (grouper); - ok = proc_commit (ds) && ok; - - if ( dependent_vars ) - free (dependent_vars); - - subc_list_double_destroy (&percentile_list); - - return ok ? 
CMD_SUCCESS : CMD_CASCADING_FAILURE; -}; - - - - -static void -show_npplot (const struct variable **dependent_var, - int n_dep_var, - const struct xfactor *fctr) -{ - int v; - - for (v = 0; v < n_dep_var; ++v) - { - struct ll *ll; - for (ll = ll_head (&fctr->result_list); - ll != ll_null (&fctr->result_list); - ll = ll_next (ll)) - { - struct string label; - const struct factor_result *result = - ll_data (ll, struct factor_result, ll); - struct chart_item *npp, *dnpp; - struct casereader *reader; - struct np *np; - - ds_init_empty (&label); - ds_put_format (&label, "%s ", var_get_name (dependent_var[v])); - factor_to_string (fctr, result, &label); - - np = result->metrics[v].np; - reader = casewriter_make_reader (np->writer); - npp = np_plot_create (np, reader, ds_cstr (&label)); - dnpp = dnp_plot_create (np, reader, ds_cstr (&label)); - - ds_destroy (&label); - - if (npp == NULL || dnpp == NULL) - { - msg (MW, _("Not creating NP plot because data set is empty.")); - chart_item_unref (npp); - chart_item_unref (dnpp); - } - else - { - chart_item_submit (npp); - chart_item_submit (dnpp); - } - - statistic_destroy (&np->parent.parent); - casereader_destroy (reader); - } - } -} - - -static void -show_histogram (const struct variable **dependent_var, - int n_dep_var, - const struct xfactor *fctr) -{ - int v; - - for (v = 0; v < n_dep_var; ++v) - { - struct ll *ll; - for (ll = ll_head (&fctr->result_list); - ll != ll_null (&fctr->result_list); - ll = ll_next (ll)) - { - struct string str; - const struct factor_result *result = - ll_data (ll, struct factor_result, ll); - struct histogram *histogram; - double mean, var, n; - - histogram = result->metrics[v].histogram; - if (histogram == NULL) - { - /* Probably all values are SYSMIS. 
*/ - continue; - } - - ds_init_empty (&str); - ds_put_format (&str, "%s ", var_get_name (dependent_var[v])); - - factor_to_string (fctr, result, &str); - - moments1_calculate (result->metrics[v].moments, - &n, &mean, &var, NULL, NULL); - chart_item_submit (histogram_chart_create (histogram->gsl_hist, - ds_cstr (&str), n, mean, - sqrt (var), false)); - - ds_destroy (&str); - } - } -} - - - -static void -show_boxplot_groups (const struct variable **dependent_var, - int n_dep_var, - const struct xfactor *fctr) -{ - int v; - - for (v = 0; v < n_dep_var; ++v) - { - const struct factor_result *result; - struct boxplot *boxplot; - double y_min = DBL_MAX; - double y_max = -DBL_MAX; - char *title; - - ll_for_each (result, struct factor_result, ll, &fctr->result_list) - { - struct factor_metrics *metrics = &result->metrics[v]; - const struct ll_list *max_list = extrema_list (metrics->maxima); - const struct ll_list *min_list = extrema_list (metrics->minima); - const struct extremum *max, *min; - - if ( ll_is_empty (max_list)) - { - msg (MW, _("Not creating plot because data set is empty.")); - continue; - } - - max = ll_data (ll_head(max_list), struct extremum, ll); - min = ll_data (ll_head (min_list), struct extremum, ll); - - y_max = MAX (y_max, max->value); - y_min = MIN (y_min, min->value); - } - - if (fctr->indep_var[0]) - title = xasprintf (_("Boxplot of %s vs. 
%s"), - var_to_string (dependent_var[v]), - var_to_string (fctr->indep_var[0])); - else - title = xasprintf (_("Boxplot of %s"), - var_to_string (dependent_var[v])); - boxplot = boxplot_create (y_min, y_max, title); - free (title); - - ll_for_each (result, struct factor_result, ll, &fctr->result_list) - { - struct factor_metrics *metrics = &result->metrics[v]; - struct string str = DS_EMPTY_INITIALIZER; - factor_to_string_concise (fctr, result, &str); - boxplot_add_box (boxplot, metrics->box_whisker, ds_cstr (&str)); - metrics->box_whisker = NULL; - ds_destroy (&str); - } - - boxplot_submit (boxplot); - } -} - - - -static void -show_boxplot_variables (const struct variable **dependent_var, - int n_dep_var, - const struct xfactor *fctr - ) - -{ - const struct factor_result *result; - int v; - - ll_for_each (result, struct factor_result, ll, &fctr->result_list) - { - struct string title; - double y_min = DBL_MAX; - double y_max = -DBL_MAX; - struct boxplot *boxplot; - - for (v = 0; v < n_dep_var; ++v) - { - const struct factor_metrics *metrics = &result->metrics[v]; - const struct ll *max_ll = ll_head (extrema_list (metrics->maxima)); - const struct ll *min_ll = ll_head (extrema_list (metrics->minima)); - const struct extremum *max = ll_data (max_ll, struct extremum, ll); - const struct extremum *min = ll_data (min_ll, struct extremum, ll); - - y_max = MAX (y_max, max->value); - y_min = MIN (y_min, min->value); - } - - ds_init_empty (&title); - factor_to_string (fctr, result, &title); - boxplot = boxplot_create (y_min, y_max, ds_cstr (&title)); - ds_destroy (&title); - - for (v = 0; v < n_dep_var; ++v) - { - struct factor_metrics *metrics = &result->metrics[v]; - boxplot_add_box (boxplot, metrics->box_whisker, - var_get_name (dependent_var[v])); - metrics->box_whisker = NULL; - } - - boxplot_submit (boxplot); - } -} - - -/* Show all the appropriate tables */ -static void -output_examine (const struct dictionary *dict) -{ - struct ll *ll; - - show_summary 
(dependent_vars, n_dependent_vars, dict, &level0_factor); - - if ( cmd.a_statistics[XMN_ST_EXTREME] ) - show_extremes (dependent_vars, n_dependent_vars, &level0_factor); - - if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] ) - show_descriptives (dependent_vars, n_dependent_vars, &level0_factor); - - if ( cmd.sbc_percentiles) - show_percentiles (dependent_vars, n_dependent_vars, &level0_factor); - - if ( cmd.sbc_plot) - { - if (cmd.a_plot[XMN_PLT_BOXPLOT]) - show_boxplot_groups (dependent_vars, n_dependent_vars, &level0_factor); - - if (cmd.a_plot[XMN_PLT_HISTOGRAM]) - show_histogram (dependent_vars, n_dependent_vars, &level0_factor); - - if (cmd.a_plot[XMN_PLT_NPPLOT]) - show_npplot (dependent_vars, n_dependent_vars, &level0_factor); - } - - for (ll = ll_head (&factor_list); - ll != ll_null (&factor_list); ll = ll_next (ll)) - { - struct xfactor *factor = ll_data (ll, struct xfactor, ll); - show_summary (dependent_vars, n_dependent_vars, dict, factor); - - if ( cmd.a_statistics[XMN_ST_EXTREME] ) - show_extremes (dependent_vars, n_dependent_vars, factor); - - if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] ) - show_descriptives (dependent_vars, n_dependent_vars, factor); - - if ( cmd.sbc_percentiles) - show_percentiles (dependent_vars, n_dependent_vars, factor); - - if (cmd.a_plot[XMN_PLT_BOXPLOT]) - { - if (cmd.cmp == XMN_GROUPS) - show_boxplot_groups (dependent_vars, n_dependent_vars, factor); - else if (cmd.cmp == XMN_VARIABLES) - show_boxplot_variables (dependent_vars, n_dependent_vars, factor); - } - - if (cmd.a_plot[XMN_PLT_HISTOGRAM]) - show_histogram (dependent_vars, n_dependent_vars, factor); - - if (cmd.a_plot[XMN_PLT_NPPLOT]) - show_npplot (dependent_vars, n_dependent_vars, factor); - } -} - -/* Parse the PERCENTILES subcommand */ -static int -xmn_custom_percentiles (struct lexer *lexer, struct dataset *ds UNUSED, - struct cmd_examine *p UNUSED, void *aux UNUSED) -{ - lex_match (lexer, T_EQUALS); - - lex_match (lexer, T_LPAREN); - - while ( lex_is_number (lexer) ) - 
{ - subc_list_double_push (&percentile_list, lex_number (lexer)); - - lex_get (lexer); - - lex_match (lexer, T_COMMA) ; - } - lex_match (lexer, T_RPAREN); - - lex_match (lexer, T_EQUALS); - - if ( lex_match_id (lexer, "HAVERAGE")) - percentile_algorithm = PC_HAVERAGE; - - else if ( lex_match_id (lexer, "WAVERAGE")) - percentile_algorithm = PC_WAVERAGE; - - else if ( lex_match_id (lexer, "ROUND")) - percentile_algorithm = PC_ROUND; - - else if ( lex_match_id (lexer, "EMPIRICAL")) - percentile_algorithm = PC_EMPIRICAL; - - else if ( lex_match_id (lexer, "AEMPIRICAL")) - percentile_algorithm = PC_AEMPIRICAL; - - else if ( lex_match_id (lexer, "NONE")) - percentile_algorithm = PC_NONE; - - - if ( 0 == subc_list_double_count (&percentile_list)) - { - subc_list_double_push (&percentile_list, 5); - subc_list_double_push (&percentile_list, 10); - subc_list_double_push (&percentile_list, 25); - subc_list_double_push (&percentile_list, 50); - subc_list_double_push (&percentile_list, 75); - subc_list_double_push (&percentile_list, 90); - subc_list_double_push (&percentile_list, 95); - } - - return 1; -} - -/* TOTAL and NOTOTAL are simple, mutually exclusive flags */ -static int -xmn_custom_total (struct lexer *lexer UNUSED, struct dataset *ds UNUSED, - struct cmd_examine *p, void *aux UNUSED) -{ - if ( p->sbc_nototal ) - { - msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL"); - return 0; - } - - return 1; -} - -static int -xmn_custom_nototal (struct lexer *lexer UNUSED, struct dataset *ds UNUSED, - struct cmd_examine *p, void *aux UNUSED) -{ - if ( p->sbc_total ) - { - msg (SE, _("%s and %s are mutually exclusive"), "TOTAL", "NOTOTAL"); - return 0; - } - - return 1; -} - - - -/* Parser for the variables sub command - Returns 1 on success */ -static int -xmn_custom_variables (struct lexer *lexer, struct dataset *ds, - struct cmd_examine *cmd, - void *aux UNUSED) -{ - const struct dictionary *dict = dataset_dict (ds); - lex_match (lexer, T_EQUALS); - - if ( 
(lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokcstr (lexer)) == NULL) - && lex_token (lexer) != T_ALL) - { - return 2; - } - - if (!parse_variables_const (lexer, dict, &dependent_vars, &n_dependent_vars, - PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) ) - { - free (dependent_vars); - return 0; - } - - assert (n_dependent_vars); - - - if ( lex_match (lexer, T_BY)) - { - int success ; - success = examine_parse_independent_vars (lexer, dict, cmd); - if ( success != 1 ) - { - free (dependent_vars); - } - return success; - } - - return 1; -} - - - -/* Parse the clause specifying the factors */ -static int -examine_parse_independent_vars (struct lexer *lexer, - const struct dictionary *dict, - struct cmd_examine *cmd) -{ - int success; - struct xfactor *sf = xmalloc (sizeof *sf); - - ll_init (&sf->result_list); - - if ( (lex_token (lexer) != T_ID || - dict_lookup_var (dict, lex_tokcstr (lexer)) == NULL) - && lex_token (lexer) != T_ALL) - { - free ( sf ) ; - return 2; - } - - sf->indep_var[0] = parse_variable (lexer, dict); - sf->indep_var[1] = NULL; - - if ( lex_token (lexer) == T_BY ) - { - lex_match (lexer, T_BY); - - if ( (lex_token (lexer) != T_ID || - dict_lookup_var (dict, lex_tokcstr (lexer)) == NULL) - && lex_token (lexer) != T_ALL) - { - free (sf); - return 2; - } - - sf->indep_var[1] = parse_variable (lexer, dict); - - ll_push_tail (&factor_list, &sf->ll); - } - else - ll_push_tail (&factor_list, &sf->ll); - - lex_match (lexer, T_COMMA); - - if ( lex_token (lexer) == T_ENDCMD || lex_token (lexer) == T_SLASH ) - return 1; - - success = examine_parse_independent_vars (lexer, dict, cmd); - - if ( success != 1 ) - free ( sf ) ; - - return success; -} - -static void -examine_group (struct cmd_examine *cmd, struct casereader *reader, int level, - const struct dictionary *dict, struct xfactor *factor) -{ - struct ccase *c; - const struct variable *wv = dict_get_weight (dict); - int v; - int n_extrema = 1; - struct factor_result *result = xzalloc (sizeof 
(*result)); - int i; - - for (i = 0; i < 2; i++) - if (factor->indep_var[i]) - value_init (&result->value[i], var_get_width (factor->indep_var[i])); - - result->metrics = xcalloc (n_dependent_vars, sizeof (*result->metrics)); - - if ( cmd->a_statistics[XMN_ST_EXTREME] ) - n_extrema = cmd->st_n; - - - c = casereader_peek (reader, 0); - if (c != NULL) - { - if ( level > 0) - for (i = 0; i < 2; i++) - if (factor->indep_var[i]) - value_copy (&result->value[i], case_data (c, factor->indep_var[i]), - var_get_width (factor->indep_var[i])); - case_unref (c); - } - - for (v = 0; v < n_dependent_vars; ++v) - { - struct casewriter *writer; - struct casereader *input = casereader_clone (reader); - - result->metrics[v].moments = moments1_create (MOMENT_KURTOSIS); - result->metrics[v].minima = extrema_create (n_extrema, EXTREME_MINIMA); - result->metrics[v].maxima = extrema_create (n_extrema, EXTREME_MAXIMA); - result->metrics[v].cmin = DBL_MAX; - - if (cmd->a_statistics[XMN_ST_DESCRIPTIVES] || - cmd->a_plot[XMN_PLT_BOXPLOT] || - cmd->a_plot[XMN_PLT_NPPLOT] || - cmd->sbc_percentiles) - { - /* In this case, we need to sort the data, so we create a sorting - casewriter */ - struct subcase up_ordering; - subcase_init_var (&up_ordering, dependent_vars[v], SC_ASCEND); - writer = sort_create_writer (&up_ordering, - casereader_get_proto (reader)); - subcase_destroy (&up_ordering); - } - else - { - /* but in this case, sorting is unnecessary, so an ordinary - casewriter is sufficient */ - writer = - autopaging_writer_create (casereader_get_proto (reader)); - } - - - /* Sort or just iterate, whilst calculating moments etc */ - while ((c = casereader_read (input)) != NULL) - { - int n_vals = caseproto_get_n_widths (casereader_get_proto (reader)); - const casenumber loc = case_data_idx (c, n_vals - 1)->f; - - const double weight = wv ? 
case_data (c, wv)->f : 1.0; - const union value *value = case_data (c, dependent_vars[v]); - - if (weight != SYSMIS) - minimize (&result->metrics[v].cmin, weight); - - moments1_add (result->metrics[v].moments, - value->f, - weight); - - result->metrics[v].n += weight; - - if ( ! var_is_value_missing (dependent_vars[v], value, MV_ANY) ) - result->metrics[v].n_valid += weight; - - extrema_add (result->metrics[v].maxima, - value->f, - weight, - loc); - - extrema_add (result->metrics[v].minima, - value->f, - weight, - loc); - - casewriter_write (writer, c); - } - casereader_destroy (input); - result->metrics[v].up_reader = casewriter_make_reader (writer); - } - - /* If percentiles or descriptives have been requested, then a - second pass through the data (which has now been sorted) - is necessary */ - if ( cmd->a_statistics[XMN_ST_DESCRIPTIVES] || - cmd->a_plot[XMN_PLT_BOXPLOT] || - cmd->a_plot[XMN_PLT_NPPLOT] || - cmd->sbc_percentiles) - { - for (v = 0; v < n_dependent_vars; ++v) - { - int i; - int n_os; - struct order_stats **os ; - struct factor_metrics *metric = &result->metrics[v]; - - metric->n_ptiles = percentile_list.n_data; - - metric->ptl = xcalloc (metric->n_ptiles, sizeof *metric->ptl); - - metric->quartiles = xcalloc (3, sizeof (*metric->quartiles)); - - for (i = 0 ; i < metric->n_ptiles; ++i) - { - metric->ptl[i] = percentile_create (percentile_list.data[i] / 100.0, metric->n_valid); - - if ( percentile_list.data[i] == 25) - metric->quartiles[0] = metric->ptl[i]; - else if ( percentile_list.data[i] == 50) - metric->quartiles[1] = metric->ptl[i]; - else if ( percentile_list.data[i] == 75) - metric->quartiles[2] = metric->ptl[i]; - } - - metric->tukey_hinges = tukey_hinges_create (metric->n_valid, metric->cmin); - metric->trimmed_mean = trimmed_mean_create (metric->n_valid, 0.05); - - n_os = metric->n_ptiles + 2; - - if ( cmd->a_plot[XMN_PLT_NPPLOT] ) - { - double n, mean, var; - moments1_calculate (metric->moments, - &n, &mean, &var, NULL, NULL); - - 
metric->np = np_create (n, mean, var); - n_os ++; - } - - os = xcalloc (n_os, sizeof *os); - - for (i = 0 ; i < metric->n_ptiles ; ++i ) - { - os[i] = &metric->ptl[i]->parent; - } - - os[i] = &metric->tukey_hinges->parent; - os[i+1] = &metric->trimmed_mean->parent; - - if (cmd->a_plot[XMN_PLT_NPPLOT]) - os[i+2] = &metric->np->parent; - - order_stats_accumulate (os, n_os, - casereader_clone (metric->up_reader), - wv, dependent_vars[v], MV_ANY); - free (os); - } - } - - /* FIXME: Do this in the above loop */ - if ( cmd->a_plot[XMN_PLT_HISTOGRAM] ) - { - struct ccase *c; - struct casereader *input = casereader_clone (reader); - - for (v = 0; v < n_dependent_vars; ++v) - { - const struct extremum *max, *min; - struct factor_metrics *metric = &result->metrics[v]; - - const struct ll_list *max_list = - extrema_list (result->metrics[v].maxima); - - const struct ll_list *min_list = - extrema_list (result->metrics[v].minima); - - if ( ll_is_empty (max_list)) - { - msg (MW, _("Not creating plot because data set is empty.")); - continue; - } - - assert (! ll_is_empty (min_list)); - - max = (const struct extremum *) - ll_data (ll_head(max_list), struct extremum, ll); - - min = (const struct extremum *) - ll_data (ll_head (min_list), struct extremum, ll); - - metric->histogram = histogram_create (10, min->value, max->value); - } - - while ((c = casereader_read (input)) != NULL) - { - const double weight = wv ? 
case_data (c, wv)->f : 1.0; - - for (v = 0; v < n_dependent_vars; ++v) - { - struct factor_metrics *metric = &result->metrics[v]; - if ( metric->histogram) - histogram_add (metric->histogram, - case_data (c, dependent_vars[v])->f, weight); - } - case_unref (c); - } - casereader_destroy (input); - } - - /* In this case, a third iteration is required */ - if (cmd->a_plot[XMN_PLT_BOXPLOT]) - { - for (v = 0; v < n_dependent_vars; ++v) - { - struct factor_metrics *metric = &result->metrics[v]; - int n_vals = caseproto_get_n_widths (casereader_get_proto ( - metric->up_reader)); - struct order_stats *os; - - metric->box_whisker = - box_whisker_create ( metric->tukey_hinges, cmd->v_id, n_vals - 1); - - os = &metric->box_whisker->parent; - order_stats_accumulate ( &os, 1, - casereader_clone (metric->up_reader), - wv, dependent_vars[v], MV_ANY); - } - } - - ll_push_tail (&factor->result_list, &result->ll); - casereader_destroy (reader); -} - - -static void -run_examine (struct cmd_examine *cmd, struct casereader *input, - struct dataset *ds) -{ - struct ll *ll; - const struct dictionary *dict = dataset_dict (ds); - struct ccase *c; - struct casereader *level0 = casereader_clone (input); - - c = casereader_peek (input, 0); - if (c == NULL) - { - casereader_destroy (input); - return; - } - - output_split_file_values (ds, c); - case_unref (c); - - ll_init (&level0_factor.result_list); - - examine_group (cmd, level0, 0, dict, &level0_factor); - - for (ll = ll_head (&factor_list); - ll != ll_null (&factor_list); - ll = ll_next (ll)) - { - struct xfactor *factor = ll_data (ll, struct xfactor, ll); - - struct casereader *group = NULL; - struct casereader *level1; - struct casegrouper *grouper1 = NULL; - - level1 = casereader_clone (input); - level1 = sort_execute_1var (level1, factor->indep_var[0]); - grouper1 = casegrouper_create_vars (level1, &factor->indep_var[0], 1); - - while (casegrouper_get_next_group (grouper1, &group)) - { - struct casereader *group_copy = casereader_clone 
(group); - - if ( !factor->indep_var[1]) - examine_group (cmd, group_copy, 1, dict, factor); - else - { - int n_groups = 0; - struct casereader *group2 = NULL; - struct casegrouper *grouper2 = NULL; - - group_copy = sort_execute_1var (group_copy, - factor->indep_var[1]); - - grouper2 = casegrouper_create_vars (group_copy, - &factor->indep_var[1], 1); - - while (casegrouper_get_next_group (grouper2, &group2)) - { - examine_group (cmd, group2, 2, dict, factor); - n_groups++; - } - casegrouper_destroy (grouper2); - } - - casereader_destroy (group); - } - casegrouper_destroy (grouper1); - } - - casereader_destroy (input); - - output_examine (dict); - - factor_destroy (&level0_factor); - - { - struct ll *ll; - for (ll = ll_head (&factor_list); - ll != ll_null (&factor_list); - ll = ll_next (ll)) - { - struct xfactor *f = ll_data (ll, struct xfactor, ll); - factor_destroy (f); - } - } - -} - - -static void -show_summary (const struct variable **dependent_var, int n_dep_var, - const struct dictionary *dict, - const struct xfactor *fctr) -{ - const struct variable *wv = dict_get_weight (dict); - const struct fmt_spec *wfmt = wv ? 
var_get_print_format (wv) : & F_8_0; - - static const char *subtitle[]= - { - N_("Valid"), - N_("Missing"), - N_("Total") - }; - - int v, j; - int heading_columns = 1; - int n_cols; - const int heading_rows = 3; - struct tab_table *tbl; - - int n_rows ; - n_rows = n_dep_var; - - assert (fctr); - - if ( fctr->indep_var[0] ) - { - heading_columns = 2; - - if ( fctr->indep_var[1] ) - { - heading_columns = 3; - } - } - - n_rows *= ll_count (&fctr->result_list); - n_rows += heading_rows; - - n_cols = heading_columns + 6; - - tbl = tab_create (n_cols, n_rows); - tab_headers (tbl, heading_columns, 0, heading_rows, 0); - - /* Outline the box */ - tab_box (tbl, - TAL_2, TAL_2, - -1, -1, - 0, 0, - n_cols - 1, n_rows - 1); - - /* Vertical lines for the data only */ - tab_box (tbl, - -1, -1, - -1, TAL_1, - heading_columns, 0, - n_cols - 1, n_rows - 1); - - - tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows ); - tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, 1 ); - tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, heading_rows -1 ); - - tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1); - - - tab_title (tbl, _("Case Processing Summary")); - - tab_joint_text (tbl, heading_columns, 0, - n_cols -1, 0, - TAB_CENTER | TAT_TITLE, - _("Cases")); - - /* Remove lines ... 
*/ - tab_box (tbl, - -1, -1, - TAL_0, TAL_0, - heading_columns, 0, - n_cols - 1, 0); - - for (j = 0 ; j < 3 ; ++j) - { - tab_text (tbl, heading_columns + j * 2 , 2, TAB_CENTER | TAT_TITLE, - _("N")); - - tab_text (tbl, heading_columns + j * 2 + 1, 2, TAB_CENTER | TAT_TITLE, - _("Percent")); - - tab_joint_text (tbl, heading_columns + j * 2 , 1, - heading_columns + j * 2 + 1, 1, - TAB_CENTER | TAT_TITLE, - subtitle[j]); - - tab_box (tbl, -1, -1, - TAL_0, TAL_0, - heading_columns + j * 2, 1, - heading_columns + j * 2 + 1, 1); - } - - - /* Titles for the independent variables */ - if ( fctr->indep_var[0] ) - { - tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE, - var_to_string (fctr->indep_var[0])); - - if ( fctr->indep_var[1] ) - { - tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE, - var_to_string (fctr->indep_var[1])); - } - } - - for (v = 0 ; v < n_dep_var ; ++v) - { - int j = 0; - struct ll *ll; - const union value *last_value = NULL; - - if ( v > 0 ) - tab_hline (tbl, TAL_1, 0, n_cols -1 , - v * ll_count (&fctr->result_list) - + heading_rows); - - tab_text (tbl, - 0, - v * ll_count (&fctr->result_list) + heading_rows, - TAB_LEFT | TAT_TITLE, - var_to_string (dependent_var[v]) - ); - - - for (ll = ll_head (&fctr->result_list); - ll != ll_null (&fctr->result_list); ll = ll_next (ll)) - { - double n; - const struct factor_result *result = - ll_data (ll, struct factor_result, ll); - - if ( fctr->indep_var[0] ) - { - - if ( last_value == NULL || - !value_equal (last_value, &result->value[0], - var_get_width (fctr->indep_var[0]))) - { - struct string str; - - last_value = &result->value[0]; - ds_init_empty (&str); - - var_append_value_name (fctr->indep_var[0], &result->value[0], - &str); - - tab_text (tbl, 1, - heading_rows + j + - v * ll_count (&fctr->result_list), - TAB_LEFT | TAT_TITLE, - ds_cstr (&str)); - - ds_destroy (&str); - - if ( fctr->indep_var[1] && j > 0) - tab_hline (tbl, TAL_1, 1, n_cols - 1, - heading_rows + j + - v * ll_count 
(&fctr->result_list)); - } - - if ( fctr->indep_var[1]) - { - struct string str; - - ds_init_empty (&str); - - var_append_value_name (fctr->indep_var[1], - &result->value[1], &str); - - tab_text (tbl, 2, - heading_rows + j + - v * ll_count (&fctr->result_list), - TAB_LEFT | TAT_TITLE, - ds_cstr (&str)); - - ds_destroy (&str); - } - } - - - moments1_calculate (result->metrics[v].moments, - &n, &result->metrics[v].mean, - &result->metrics[v].variance, - &result->metrics[v].skewness, - &result->metrics[v].kurtosis); - - result->metrics[v].se_mean = sqrt (result->metrics[v].variance / n) ; - - /* Total Valid */ - tab_double (tbl, heading_columns, - heading_rows + j + v * ll_count (&fctr->result_list), - TAB_LEFT, - n, wfmt); - - tab_text_format (tbl, heading_columns + 1, - heading_rows + j + v * ll_count (&fctr->result_list), - TAB_RIGHT, - "%g%%", n * 100.0 / result->metrics[v].n); - - /* Total Missing */ - tab_double (tbl, heading_columns + 2, - heading_rows + j + v * ll_count (&fctr->result_list), - TAB_LEFT, - result->metrics[v].n - n, - wfmt); - - tab_text_format (tbl, heading_columns + 3, - heading_rows + j + v * ll_count (&fctr->result_list), - TAB_RIGHT, - "%g%%", - (result->metrics[v].n - n) * 100.0 / result->metrics[v].n - ); - - /* Total Valid + Missing */ - tab_double (tbl, heading_columns + 4, - heading_rows + j + v * ll_count (&fctr->result_list), - TAB_LEFT, - result->metrics[v].n, - wfmt); - - tab_text_format (tbl, heading_columns + 5, - heading_rows + j + v * ll_count (&fctr->result_list), - TAB_RIGHT, - "%g%%", - ((result->metrics[v].n) * 100.0 - / result->metrics[v].n)); - - ++j; - } - } - - - tab_submit (tbl); -} - -#define DESCRIPTIVE_ROWS 13 - -static void -show_descriptives (const struct variable **dependent_var, - int n_dep_var, - const struct xfactor *fctr) -{ - int v; - int heading_columns = 3; - int n_cols; - const int heading_rows = 1; - struct tab_table *tbl; - - int n_rows ; - n_rows = n_dep_var; - - assert (fctr); - - if ( 
fctr->indep_var[0] ) - { - heading_columns = 4; - - if ( fctr->indep_var[1] ) - { - heading_columns = 5; - } - } - - n_rows *= ll_count (&fctr->result_list) * DESCRIPTIVE_ROWS; - n_rows += heading_rows; - - n_cols = heading_columns + 2; - - tbl = tab_create (n_cols, n_rows); - tab_headers (tbl, heading_columns, 0, heading_rows, 0); - - /* Outline the box */ - tab_box (tbl, - TAL_2, TAL_2, - -1, -1, - 0, 0, - n_cols - 1, n_rows - 1); - - - tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows ); - tab_hline (tbl, TAL_2, 1, n_cols - 1, heading_rows ); - - tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows - 1); - - - if ( fctr->indep_var[0]) - tab_text (tbl, 1, 0, TAT_TITLE, var_to_string (fctr->indep_var[0])); - - if ( fctr->indep_var[1]) - tab_text (tbl, 2, 0, TAT_TITLE, var_to_string (fctr->indep_var[1])); - - for (v = 0 ; v < n_dep_var ; ++v ) - { - struct ll *ll; - int i = 0; - - const int row_var_start = - v * DESCRIPTIVE_ROWS * ll_count(&fctr->result_list); - - tab_text (tbl, - 0, - heading_rows + row_var_start, - TAB_LEFT | TAT_TITLE, - var_to_string (dependent_var[v]) - ); - - for (ll = ll_head (&fctr->result_list); - ll != ll_null (&fctr->result_list); i++, ll = ll_next (ll)) - { - const struct factor_result *result = - ll_data (ll, struct factor_result, ll); - - const double t = - gsl_cdf_tdist_Qinv ((1 - cmd.n_cinterval[0] / 100.0) / 2.0, - result->metrics[v].n - 1); - - if ( i > 0 || v > 0 ) - { - const int left_col = (i == 0) ? 
0 : 1; - tab_hline (tbl, TAL_1, left_col, n_cols - 1, - heading_rows + row_var_start + i * DESCRIPTIVE_ROWS); - } - - if ( fctr->indep_var[0]) - { - struct string vstr; - ds_init_empty (&vstr); - var_append_value_name (fctr->indep_var[0], - &result->value[0], &vstr); - - tab_text (tbl, 1, - heading_rows + row_var_start + i * DESCRIPTIVE_ROWS, - TAB_LEFT, - ds_cstr (&vstr) - ); - - ds_destroy (&vstr); - } - - - tab_text (tbl, n_cols - 4, - heading_rows + row_var_start + i * DESCRIPTIVE_ROWS, - TAB_LEFT, - _("Mean")); - - tab_text_format (tbl, n_cols - 4, - heading_rows + row_var_start + 1 + i * DESCRIPTIVE_ROWS, - TAB_LEFT, - _("%g%% Confidence Interval for Mean"), - cmd.n_cinterval[0]); - - tab_text (tbl, n_cols - 3, - heading_rows + row_var_start + 1 + i * DESCRIPTIVE_ROWS, - TAB_LEFT, - _("Lower Bound")); - - tab_text (tbl, n_cols - 3, - heading_rows + row_var_start + 2 + i * DESCRIPTIVE_ROWS, - TAB_LEFT, - _("Upper Bound")); - - tab_text (tbl, n_cols - 4, - heading_rows + row_var_start + 3 + i * DESCRIPTIVE_ROWS, - TAB_LEFT, _("5% Trimmed Mean")); - - tab_text (tbl, n_cols - 4, - heading_rows + row_var_start + 4 + i * DESCRIPTIVE_ROWS, - TAB_LEFT, - _("Median")); - - tab_text (tbl, n_cols - 4, - heading_rows + row_var_start + 5 + i * DESCRIPTIVE_ROWS, - TAB_LEFT, - _("Variance")); - - tab_text (tbl, n_cols - 4, - heading_rows + row_var_start + 6 + i * DESCRIPTIVE_ROWS, - TAB_LEFT, - _("Std. 
Deviation")); - - tab_text (tbl, n_cols - 4, - heading_rows + row_var_start + 7 + i * DESCRIPTIVE_ROWS, - TAB_LEFT, - _("Minimum")); - - tab_text (tbl, n_cols - 4, - heading_rows + row_var_start + 8 + i * DESCRIPTIVE_ROWS, - TAB_LEFT, - _("Maximum")); - - tab_text (tbl, n_cols - 4, - heading_rows + row_var_start + 9 + i * DESCRIPTIVE_ROWS, - TAB_LEFT, - _("Range")); - - tab_text (tbl, n_cols - 4, - heading_rows + row_var_start + 10 + i * DESCRIPTIVE_ROWS, - TAB_LEFT, - _("Interquartile Range")); - - - tab_text (tbl, n_cols - 4, - heading_rows + row_var_start + 11 + i * DESCRIPTIVE_ROWS, - TAB_LEFT, - _("Skewness")); - - tab_text (tbl, n_cols - 4, - heading_rows + row_var_start + 12 + i * DESCRIPTIVE_ROWS, - TAB_LEFT, - _("Kurtosis")); - - - /* Now the statistics ... */ - - tab_double (tbl, n_cols - 2, - heading_rows + row_var_start + i * DESCRIPTIVE_ROWS, - TAB_CENTER, - result->metrics[v].mean, - NULL); - - tab_double (tbl, n_cols - 1, - heading_rows + row_var_start + i * DESCRIPTIVE_ROWS, - TAB_CENTER, - result->metrics[v].se_mean, - NULL); - - - tab_double (tbl, n_cols - 2, - heading_rows + row_var_start + 1 + i * DESCRIPTIVE_ROWS, - TAB_CENTER, - result->metrics[v].mean - t * - result->metrics[v].se_mean, - NULL); - - tab_double (tbl, n_cols - 2, - heading_rows + row_var_start + 2 + i * DESCRIPTIVE_ROWS, - TAB_CENTER, - result->metrics[v].mean + t * - result->metrics[v].se_mean, - NULL); - - - tab_double (tbl, n_cols - 2, - heading_rows + row_var_start + 3 + i * DESCRIPTIVE_ROWS, - TAB_CENTER, - trimmed_mean_calculate (result->metrics[v].trimmed_mean), - NULL); - - - tab_double (tbl, n_cols - 2, - heading_rows + row_var_start + 4 + i * DESCRIPTIVE_ROWS, - TAB_CENTER, - percentile_calculate (result->metrics[v].quartiles[1], percentile_algorithm), - NULL); - - - tab_double (tbl, n_cols - 2, - heading_rows + row_var_start + 5 + i * DESCRIPTIVE_ROWS, - TAB_CENTER, - result->metrics[v].variance, - NULL); - - tab_double (tbl, n_cols - 2, - heading_rows + 
row_var_start + 6 + i * DESCRIPTIVE_ROWS, - TAB_CENTER, - sqrt (result->metrics[v].variance), - NULL); - - tab_double (tbl, n_cols - 2, - heading_rows + row_var_start + 10 + i * DESCRIPTIVE_ROWS, - TAB_CENTER, - percentile_calculate (result->metrics[v].quartiles[2], - percentile_algorithm) - - percentile_calculate (result->metrics[v].quartiles[0], - percentile_algorithm), - NULL); - - - tab_double (tbl, n_cols - 2, - heading_rows + row_var_start + 11 + i * DESCRIPTIVE_ROWS, - TAB_CENTER, - result->metrics[v].skewness, - NULL); - - tab_double (tbl, n_cols - 2, - heading_rows + row_var_start + 12 + i * DESCRIPTIVE_ROWS, - TAB_CENTER, - result->metrics[v].kurtosis, - NULL); - - tab_double (tbl, n_cols - 1, - heading_rows + row_var_start + 11 + i * DESCRIPTIVE_ROWS, - TAB_CENTER, - calc_seskew (result->metrics[v].n), - NULL); - - tab_double (tbl, n_cols - 1, - heading_rows + row_var_start + 12 + i * DESCRIPTIVE_ROWS, - TAB_CENTER, - calc_sekurt (result->metrics[v].n), - NULL); - - { - struct extremum *minimum, *maximum ; - - struct ll *max_ll = ll_head (extrema_list (result->metrics[v].maxima)); - struct ll *min_ll = ll_head (extrema_list (result->metrics[v].minima)); - - maximum = ll_data (max_ll, struct extremum, ll); - minimum = ll_data (min_ll, struct extremum, ll); - - tab_double (tbl, n_cols - 2, - heading_rows + row_var_start + 7 + i * DESCRIPTIVE_ROWS, - TAB_CENTER, - minimum->value, - NULL); - - tab_double (tbl, n_cols - 2, - heading_rows + row_var_start + 8 + i * DESCRIPTIVE_ROWS, - TAB_CENTER, - maximum->value, - NULL); - - tab_double (tbl, n_cols - 2, - heading_rows + row_var_start + 9 + i * DESCRIPTIVE_ROWS, - TAB_CENTER, - maximum->value - minimum->value, - NULL); - } - } - } - - tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1); - - tab_title (tbl, _("Descriptives")); - - tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, - _("Statistic")); - - tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, - _("Std. 
Error")); - - tab_submit (tbl); -} - - - -static void -show_extremes (const struct variable **dependent_var, - int n_dep_var, - const struct xfactor *fctr) -{ - int v; - int heading_columns = 3; - int n_cols; - const int heading_rows = 1; - struct tab_table *tbl; - - int n_rows ; - n_rows = n_dep_var; - - assert (fctr); - - if ( fctr->indep_var[0] ) - { - heading_columns = 4; - - if ( fctr->indep_var[1] ) - { - heading_columns = 5; - } - } - - n_rows *= ll_count (&fctr->result_list) * cmd.st_n * 2; - n_rows += heading_rows; - - n_cols = heading_columns + 2; - - tbl = tab_create (n_cols, n_rows); - tab_headers (tbl, heading_columns, 0, heading_rows, 0); - - /* Outline the box */ - tab_box (tbl, - TAL_2, TAL_2, - -1, -1, - 0, 0, - n_cols - 1, n_rows - 1); - - - tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows ); - tab_hline (tbl, TAL_2, 1, n_cols - 1, heading_rows ); - tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows - 1); - - if ( fctr->indep_var[0]) - tab_text (tbl, 1, 0, TAT_TITLE, var_to_string (fctr->indep_var[0])); - - if ( fctr->indep_var[1]) - tab_text (tbl, 2, 0, TAT_TITLE, var_to_string (fctr->indep_var[1])); - - for (v = 0 ; v < n_dep_var ; ++v ) - { - struct ll *ll; - int i = 0; - const int row_var_start = v * cmd.st_n * 2 * ll_count(&fctr->result_list); - - tab_text (tbl, - 0, - heading_rows + row_var_start, - TAB_LEFT | TAT_TITLE, - var_to_string (dependent_var[v]) - ); - - for (ll = ll_head (&fctr->result_list); - ll != ll_null (&fctr->result_list); i++, ll = ll_next (ll)) - { - int e ; - struct ll *min_ll; - struct ll *max_ll; - const int row_result_start = i * cmd.st_n * 2; - - const struct factor_result *result = - ll_data (ll, struct factor_result, ll); - - if (i > 0 || v > 0) - tab_hline (tbl, TAL_1, 1, n_cols - 1, - heading_rows + row_var_start + row_result_start); - - tab_hline (tbl, TAL_1, heading_columns - 2, n_cols - 1, - heading_rows + row_var_start + row_result_start + cmd.st_n); - - for ( e = 1; e <= cmd.st_n; ++e ) - { - tab_text_format 
(tbl, n_cols - 3, - heading_rows + row_var_start + row_result_start + e - 1, - TAB_RIGHT, - "%d", e); - - tab_text_format (tbl, n_cols - 3, - heading_rows + row_var_start + row_result_start + cmd.st_n + e - 1, - TAB_RIGHT, - "%d", e); - } - - - min_ll = ll_head (extrema_list (result->metrics[v].minima)); - for (e = 0; e < cmd.st_n;) - { - struct extremum *minimum = ll_data (min_ll, struct extremum, ll); - double weight = minimum->weight; - - while (weight-- > 0 && e < cmd.st_n) - { - tab_double (tbl, n_cols - 1, - heading_rows + row_var_start + row_result_start + cmd.st_n + e, - TAB_RIGHT, - minimum->value, - NULL); - - - tab_fixed (tbl, n_cols - 2, - heading_rows + row_var_start + - row_result_start + cmd.st_n + e, - TAB_RIGHT, - minimum->location, - 10, 0); - ++e; - } - - min_ll = ll_next (min_ll); - } - - max_ll = ll_head (extrema_list (result->metrics[v].maxima)); - for (e = 0; e < cmd.st_n;) - { - struct extremum *maximum = ll_data (max_ll, struct extremum, ll); - double weight = maximum->weight; - - while (weight-- > 0 && e < cmd.st_n) - { - tab_double (tbl, n_cols - 1, - heading_rows + row_var_start + - row_result_start + e, - TAB_RIGHT, - maximum->value, - NULL); - - - tab_fixed (tbl, n_cols - 2, - heading_rows + row_var_start + - row_result_start + e, - TAB_RIGHT, - maximum->location, - 10, 0); - ++e; - } - - max_ll = ll_next (max_ll); - } - - - if ( fctr->indep_var[0]) - { - struct string vstr; - ds_init_empty (&vstr); - var_append_value_name (fctr->indep_var[0], - &result->value[0], &vstr); - - tab_text (tbl, 1, - heading_rows + row_var_start + row_result_start, - TAB_LEFT, - ds_cstr (&vstr) - ); - - ds_destroy (&vstr); - } - - - tab_text (tbl, n_cols - 4, - heading_rows + row_var_start + row_result_start, - TAB_RIGHT, - _("Highest")); - - tab_text (tbl, n_cols - 4, - heading_rows + row_var_start + row_result_start + cmd.st_n, - TAB_RIGHT, - _("Lowest")); - } - } - - tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1); - - - tab_title (tbl, _("Extreme 
Values")); - - - tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, - _("Case Number")); - - - tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, - _("Value")); - - tab_submit (tbl); -} - -#define PERCENTILE_ROWS 2 - -static void -show_percentiles (const struct variable **dependent_var, - int n_dep_var, - const struct xfactor *fctr) -{ - int i; - int v; - int heading_columns = 2; - int n_cols; - const int n_percentiles = subc_list_double_count (&percentile_list); - const int heading_rows = 2; - struct tab_table *tbl; - - int n_rows ; - n_rows = n_dep_var; - - assert (fctr); - - if ( fctr->indep_var[0] ) - { - heading_columns = 3; - - if ( fctr->indep_var[1] ) - { - heading_columns = 4; - } - } - - n_rows *= ll_count (&fctr->result_list) * PERCENTILE_ROWS; - n_rows += heading_rows; - - n_cols = heading_columns + n_percentiles; - - tbl = tab_create (n_cols, n_rows); - tab_headers (tbl, heading_columns, 0, heading_rows, 0); - - /* Outline the box */ - tab_box (tbl, - TAL_2, TAL_2, - -1, -1, - 0, 0, - n_cols - 1, n_rows - 1); - - - tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows ); - tab_hline (tbl, TAL_2, 1, n_cols - 1, heading_rows ); - - if ( fctr->indep_var[0]) - tab_text (tbl, 1, 1, TAT_TITLE, var_to_string (fctr->indep_var[0])); - - if ( fctr->indep_var[1]) - tab_text (tbl, 2, 1, TAT_TITLE, var_to_string (fctr->indep_var[1])); - - for (v = 0 ; v < n_dep_var ; ++v ) - { - double hinges[3]; - struct ll *ll; - int i = 0; - - const int row_var_start = - v * PERCENTILE_ROWS * ll_count(&fctr->result_list); - - tab_text (tbl, - 0, - heading_rows + row_var_start, - TAB_LEFT | TAT_TITLE, - var_to_string (dependent_var[v]) - ); - - for (ll = ll_head (&fctr->result_list); - ll != ll_null (&fctr->result_list); i++, ll = ll_next (ll)) - { - int j; - const struct factor_result *result = - ll_data (ll, struct factor_result, ll); - - if ( i > 0 || v > 0 ) - { - const int left_col = (i == 0) ? 
0 : 1; - tab_hline (tbl, TAL_1, left_col, n_cols - 1, - heading_rows + row_var_start + i * PERCENTILE_ROWS); - } - - if ( fctr->indep_var[0]) - { - struct string vstr; - ds_init_empty (&vstr); - var_append_value_name (fctr->indep_var[0], - &result->value[0], &vstr); - - tab_text (tbl, 1, - heading_rows + row_var_start + i * PERCENTILE_ROWS, - TAB_LEFT, - ds_cstr (&vstr) - ); - - ds_destroy (&vstr); - } - - - tab_text (tbl, n_cols - n_percentiles - 1, - heading_rows + row_var_start + i * PERCENTILE_ROWS, - TAB_LEFT, - ptile_alg_desc [percentile_algorithm]); - - - tab_text (tbl, n_cols - n_percentiles - 1, - heading_rows + row_var_start + 1 + i * PERCENTILE_ROWS, - TAB_LEFT, - _("Tukey's Hinges")); - - - tab_vline (tbl, TAL_1, n_cols - n_percentiles -1, heading_rows, n_rows - 1); - - tukey_hinges_calculate (result->metrics[v].tukey_hinges, hinges); - - for (j = 0; j < n_percentiles; ++j) - { - double hinge = SYSMIS; - tab_double (tbl, n_cols - n_percentiles + j, - heading_rows + row_var_start + i * PERCENTILE_ROWS, - TAB_CENTER, - percentile_calculate (result->metrics[v].ptl[j], - percentile_algorithm), - NULL - ); - - if ( result->metrics[v].ptl[j]->ptile == 0.5) - hinge = hinges[1]; - else if ( result->metrics[v].ptl[j]->ptile == 0.25) - hinge = hinges[0]; - else if ( result->metrics[v].ptl[j]->ptile == 0.75) - hinge = hinges[2]; - - if ( hinge != SYSMIS) - tab_double (tbl, n_cols - n_percentiles + j, - heading_rows + row_var_start + 1 + i * PERCENTILE_ROWS, - TAB_CENTER, - hinge, - NULL - ); - - } - } - } - - tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1); - - tab_title (tbl, _("Percentiles")); - - - for (i = 0 ; i < n_percentiles; ++i ) - { - tab_text_format (tbl, n_cols - n_percentiles + i, 1, - TAB_CENTER | TAT_TITLE, - _("%g"), - subc_list_double_at (&percentile_list, i)); - - - } - - tab_joint_text (tbl, - n_cols - n_percentiles, 0, - n_cols - 1, 0, - TAB_CENTER | TAT_TITLE, - _("Percentiles")); - - /* Vertical lines for the data only */ - tab_box 
(tbl, - -1, -1, - -1, TAL_1, - n_cols - n_percentiles, 1, - n_cols - 1, n_rows - 1); - - tab_hline (tbl, TAL_1, n_cols - n_percentiles, n_cols - 1, 1); - - - tab_submit (tbl); -} - - -static void -factor_to_string_concise (const struct xfactor *fctr, - const struct factor_result *result, - struct string *str - ) -{ - if (fctr->indep_var[0]) - { - var_append_value_name (fctr->indep_var[0], &result->value[0], str); - - if ( fctr->indep_var[1] ) - { - ds_put_cstr (str, ","); - - var_append_value_name (fctr->indep_var[1], &result->value[1], str); - - ds_put_cstr (str, ")"); - } - } -} - - -static void -factor_to_string (const struct xfactor *fctr, - const struct factor_result *result, - struct string *str - ) -{ - if (fctr->indep_var[0]) - { - ds_put_format (str, "(%s = ", var_get_name (fctr->indep_var[0])); - - var_append_value_name (fctr->indep_var[0], &result->value[0], str); - - if ( fctr->indep_var[1] ) - { - ds_put_cstr (str, ","); - ds_put_format (str, "%s = ", var_get_name (fctr->indep_var[1])); - - var_append_value_name (fctr->indep_var[1], &result->value[1], str); - } - ds_put_cstr (str, ")"); - } -} - - - - -/* - Local Variables: - mode: c - End: -*/ diff --git a/src/math/box-whisker.c b/src/math/box-whisker.c index 2e4545906f..fb5c2c62a2 100644 --- a/src/math/box-whisker.c +++ b/src/math/box-whisker.c @@ -82,14 +82,9 @@ acc (struct statistic *s, const struct ccase *cx, o->extreme = extreme; ds_init_empty (&o->label); - if (bw->id_var) - var_append_value_name (bw->id_var, - case_data (cx, bw->id_var), - &o->label); - else - ds_put_format (&o->label, + ds_put_format (&o->label, "%ld", - (casenumber) case_data_idx (cx, bw->casenumber_idx)->f); + (casenumber) case_data_idx (cx, bw->casenumber_idx)->f); ll_push_head (&bw->outliers, &o->ll); } @@ -115,9 +110,17 @@ box_whisker_outliers (const struct box_whisker *bw) return &bw->outliers; } +/* + Create a box_whisker struct, suitable for generating a boxplot. + + TH are the tukey hinges of the dataset. 
+ + Casenumber_idx is the index into the casereader which will be used to label + outliers. +*/ struct box_whisker * box_whisker_create (const struct tukey_hinges *th, - const struct variable *id_var, size_t casenumber_idx) + size_t casenumber_idx) { struct box_whisker *w = xzalloc (sizeof (*w)); struct order_stats *os = &w->parent; @@ -131,7 +134,6 @@ box_whisker_create (const struct tukey_hinges *th, tukey_hinges_calculate (th, w->hinges); w->casenumber_idx = casenumber_idx; - w->id_var = id_var; w->step = (w->hinges[2] - w->hinges[0]) * 1.5; diff --git a/src/math/box-whisker.h b/src/math/box-whisker.h index f6856d1e8a..c9b2bef8d3 100644 --- a/src/math/box-whisker.h +++ b/src/math/box-whisker.h @@ -27,7 +27,6 @@ */ struct tukey_hinges; - struct outlier { double value; @@ -49,11 +48,10 @@ struct box_whisker double step; size_t casenumber_idx; - const struct variable *id_var; }; struct box_whisker * box_whisker_create (const struct tukey_hinges *, - const struct variable *, size_t); + size_t); void box_whisker_whiskers (const struct box_whisker *bw, double whiskers[2]); diff --git a/tests/language/lexer/q2c.at b/tests/language/lexer/q2c.at index e8338ace85..b9bc092aa3 100644 --- a/tests/language/lexer/q2c.at +++ b/tests/language/lexer/q2c.at @@ -14,7 +14,7 @@ ONEWAY. CROSSTABS. ]) AT_CHECK([pspp -O format=csv q2c.sps], [1], [dnl -q2c.sps:8: error: EXAMINE: Required subcommand VARIABLES was not specified. +q2c.sps:8.8: error: EXAMINE: Syntax error at end of command: expecting variable name. q2c.sps:9.7: error: ONEWAY: Syntax error at end of command: expecting variable name. 
diff --git a/tests/language/stats/examine.at b/tests/language/stats/examine.at index 01bebeddbe..40c5c1a966 100644 --- a/tests/language/stats/examine.at +++ b/tests/language/stats/examine.at @@ -75,7 +75,7 @@ Table: Extreme Values ,,,Case Number,Value Breaking Strain,Highest,1,12,7.00 ,,2,16,6.00 -,,3,7,5.00 +,,3,14,5.00 ,Lowest,1,3,1.00 ,,2,3,1.00 ,,3,4,1.00 @@ -106,8 +106,8 @@ Breaking Strain,Aspeger,8.00,100%,.00,0%,8.00,100% Table: Extreme Values ,Manufacturer,,,Case Number,Value -Breaking Strain,Aspeger,Highest,1,5,4.00 -,,,2,6,4.00 +Breaking Strain,Aspeger,Highest,1,6,4.00 +,,,2,5,4.00 ,,,3,1,3.00 ,,Lowest,1,3,1.00 ,,,2,3,1.00 @@ -335,7 +335,9 @@ BEGIN DATA. . 2 END DATA -EXAMINE /x by y. +EXAMINE /x by y + /MISSING = PAIRWISE + . ]) AT_CHECK([pspp -o pspp.csv examine.sps]) AT_CHECK([cat pspp.csv], [0], [dnl @@ -359,6 +361,33 @@ x,1.00,4,100%,0,0%,4,100% ]) AT_CLEANUP + +AT_SETUP([EXAMINE -- user missing values]) +AT_DATA([examine-m.sps], [dnl +DATA LIST notable LIST /x * y *. +BEGIN DATA. +1 2 +9999999999 2 +9999999999 99 +END DATA. + +MISSING VALUES x (9999999999). +MISSING VALUES y (99). + +EXAMINE + /VARIABLES= x y + /MISSING=PAIRWISE. +]) +AT_CHECK([pspp -O format=csv examine-m.sps], [0], [dnl +Table: Case Processing Summary +,Cases,,,,, +,Valid,,Missing,,Total, +,N,Percent,N,Percent,N,Percent +x,1,33.3333%,2,66.6667%,3,100% +y,2,66.6667%,1,33.3333%,3,100% +]) +AT_CLEANUP + AT_SETUP([EXAMINE -- missing values and percentiles]) AT_DATA([examine.sps], [dnl DATA LIST LIST /X *. @@ -509,9 +538,7 @@ END DATA. EXAMINE /x PLOT=HISTOGRAM. ]) -AT_CHECK([pspp -o pspp.csv examine.sps], [0], [dnl -warning: Not creating plot because data set is empty. -]) +AT_CHECK([pspp -o pspp.csv examine.sps], [0], [ignore]) dnl Ignore output -- this is just a no-crash check. AT_CLEANUP -- 2.30.2