1 /* PSPP - EXAMINE data for normality . -*-c-*-
3 Copyright (C) 2004 Free Software Foundation, Inc.
4 Author: John Darrington 2004
6 This program is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 #include <gsl/gsl_cdf.h>
24 #include <libpspp/message.h>
29 #include <data/case.h>
30 #include <data/casefile.h>
31 #include <data/dictionary.h>
32 #include <data/procedure.h>
33 #include <data/value-labels.h>
34 #include <data/variable.h>
35 #include <language/command.h>
36 #include <language/lexer/lexer.h>
37 #include <libpspp/alloc.h>
38 #include <libpspp/compiler.h>
39 #include <libpspp/hash.h>
40 #include <libpspp/magic.h>
41 #include <libpspp/message.h>
42 #include <libpspp/misc.h>
43 #include <libpspp/str.h>
44 #include <math/factor-stats.h>
45 #include <math/moments.h>
46 #include <math/percentiles.h>
47 #include <output/charts/box-whisker.h>
48 #include <output/charts/cartesian.h>
49 #include <output/manager.h>
50 #include <output/table.h>
53 #define _(msgid) gettext (msgid)
54 #define N_(msgid) msgid
57 #include <output/chart.h>
58 #include <output/charts/plot-hist.h>
59 #include <output/charts/plot-chart.h>
66 +missing=miss:pairwise/!listwise,
68 incl:include/!exclude;
69 +compare=cmp:variables/!groups;
72 +plot[plt_]=stemleaf,boxplot,npplot,:spreadlevel(*d:n),histogram,all,none;
74 +statistics[st_]=descriptives,:extreme(*d:n),all,none.
83 static struct cmd_examine cmd;
85 static struct variable **dependent_vars;
87 static size_t n_dependent_vars;
92 /* The independent variable */
93 struct variable *indep_var[2];
96 /* Hash table of factor stats indexed by 2 values */
97 struct hsh_table *fstats;
99 /* The hash table after it has been crunched */
100 struct factor_statistics **fs;
106 /* Linked list of factors */
107 static struct factor *factors=0;
109 static struct metrics *totals=0;
111 /* Parse the clause specifying the factors */
112 static int examine_parse_independent_vars(struct cmd_examine *cmd);
116 /* Output functions */
117 static void show_summary(struct variable **dependent_var, int n_dep_var,
118 const struct factor *f);
120 static void show_extremes(struct variable **dependent_var,
122 const struct factor *factor,
125 static void show_descriptives(struct variable **dependent_var,
127 struct factor *factor);
129 static void show_percentiles(struct variable **dependent_var,
131 struct factor *factor);
136 void np_plot(const struct metrics *m, const char *factorname);
139 void box_plot_group(const struct factor *fctr,
140 const struct variable **vars, int n_vars,
141 const struct variable *id
145 void box_plot_variables(const struct factor *fctr,
146 const struct variable **vars, int n_vars,
147 const struct variable *id
152 /* Per Split function */
153 static bool run_examine(const struct casefile *cf, void *cmd_);
155 static void output_examine(void);
158 void factor_calc(struct ccase *c, int case_no,
159 double weight, int case_missing);
162 /* Represent a factor as a string, so it can be
163 printed in a human readable fashion */
164 const char * factor_to_string(const struct factor *fctr,
165 struct factor_statistics *fs,
166 const struct variable *var);
169 /* Represent a factor as a string, so it can be
170 printed in a human readable fashion,
171 but sacrificing some readablility for the sake of brevity */
172 const char *factor_to_string_concise(const struct factor *fctr,
173 struct factor_statistics *fs);
178 /* Function to use for testing for missing values */
179 static is_missing_func *value_is_missing;
184 static subc_list_double percentile_list;
186 static enum pc_alg percentile_algorithm;
188 static short sbc_percentile;
196 subc_list_double_create(&percentile_list);
197 percentile_algorithm = PC_HAVERAGE;
199 if ( !parse_examine(&cmd) )
202 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
203 if (cmd.incl == XMN_INCLUDE )
204 value_is_missing = mv_is_value_system_missing;
206 value_is_missing = mv_is_value_missing;
208 if ( cmd.st_n == SYSMIS )
211 if ( ! cmd.sbc_cinterval)
212 cmd.n_cinterval[0] = 95.0;
214 /* If descriptives have been requested, make sure the
215 quartiles are calculated */
216 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] )
218 subc_list_double_push(&percentile_list, 25);
219 subc_list_double_push(&percentile_list, 50);
220 subc_list_double_push(&percentile_list, 75);
223 ok = multipass_procedure_with_splits (run_examine, &cmd);
230 if ( dependent_vars )
231 free (dependent_vars);
234 struct factor *f = factors ;
237 struct factor *ff = f;
241 hsh_destroy ( ff->fstats ) ;
247 subc_list_double_destroy(&percentile_list);
249 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
254 /* Show all the appropriate tables */
260 /* Show totals if appropriate */
261 if ( ! cmd.sbc_nototal || factors == 0 )
263 show_summary(dependent_vars, n_dependent_vars, 0);
265 if ( cmd.sbc_statistics )
267 if ( cmd.a_statistics[XMN_ST_EXTREME])
268 show_extremes(dependent_vars, n_dependent_vars, 0, cmd.st_n);
270 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
271 show_descriptives(dependent_vars, n_dependent_vars, 0);
274 if ( sbc_percentile )
275 show_percentiles(dependent_vars, n_dependent_vars, 0);
280 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
282 for ( v = 0 ; v < n_dependent_vars; ++v )
283 np_plot(&totals[v], var_to_string(dependent_vars[v]));
286 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
288 if ( cmd.cmp == XMN_GROUPS )
290 box_plot_group (0, (const struct variable **) dependent_vars,
291 n_dependent_vars, cmd.v_id);
294 box_plot_variables (0,
295 (const struct variable **) dependent_vars,
296 n_dependent_vars, cmd.v_id);
299 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
301 for ( v = 0 ; v < n_dependent_vars; ++v )
303 struct normal_curve normal;
305 normal.N = totals[v].n;
306 normal.mean = totals[v].mean;
307 normal.stddev = totals[v].stddev;
309 histogram_plot(totals[v].histogram,
310 var_to_string(dependent_vars[v]),
320 /* Show grouped statistics as appropriate */
324 show_summary(dependent_vars, n_dependent_vars, fctr);
326 if ( cmd.sbc_statistics )
328 if ( cmd.a_statistics[XMN_ST_EXTREME])
329 show_extremes(dependent_vars, n_dependent_vars, fctr, cmd.st_n);
331 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
332 show_descriptives(dependent_vars, n_dependent_vars, fctr);
335 if ( sbc_percentile )
336 show_percentiles(dependent_vars, n_dependent_vars, fctr);
343 struct factor_statistics **fs = fctr->fs ;
345 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
347 if ( cmd.cmp == XMN_VARIABLES )
348 box_plot_variables (fctr,
349 (const struct variable **) dependent_vars,
350 n_dependent_vars, cmd.v_id);
352 box_plot_group (fctr,
353 (const struct variable **) dependent_vars,
354 n_dependent_vars, cmd.v_id);
357 for ( v = 0 ; v < n_dependent_vars; ++v )
360 for ( fs = fctr->fs ; *fs ; ++fs )
362 const char *s = factor_to_string(fctr, *fs, dependent_vars[v]);
364 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
365 np_plot(&(*fs)->m[v], s);
367 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
369 struct normal_curve normal;
371 normal.N = (*fs)->m[v].n;
372 normal.mean = (*fs)->m[v].mean;
373 normal.stddev = (*fs)->m[v].stddev;
375 histogram_plot((*fs)->m[v].histogram,
379 } /* for ( fs .... */
381 } /* for ( v = 0 ..... */
391 /* Create a hash table of percentiles and their values from the list of
393 static struct hsh_table *
394 list_to_ptile_hash(const subc_list_double *l)
398 struct hsh_table *h ;
400 h = hsh_create(subc_list_double_count(l),
401 (hsh_compare_func *) ptile_compare,
402 (hsh_hash_func *) ptile_hash,
403 (hsh_free_func *) free,
407 for ( i = 0 ; i < subc_list_double_count(l) ; ++i )
409 struct percentile *p = xmalloc (sizeof *p);
411 p->p = subc_list_double_at(l,i);
422 /* Parse the PERCENTILES subcommand */
424 xmn_custom_percentiles(struct cmd_examine *p UNUSED)
432 while ( lex_is_number() )
434 subc_list_double_push(&percentile_list,lex_number());
444 if ( lex_match_id("HAVERAGE"))
445 percentile_algorithm = PC_HAVERAGE;
447 else if ( lex_match_id("WAVERAGE"))
448 percentile_algorithm = PC_WAVERAGE;
450 else if ( lex_match_id("ROUND"))
451 percentile_algorithm = PC_ROUND;
453 else if ( lex_match_id("EMPIRICAL"))
454 percentile_algorithm = PC_EMPIRICAL;
456 else if ( lex_match_id("AEMPIRICAL"))
457 percentile_algorithm = PC_AEMPIRICAL;
459 else if ( lex_match_id("NONE"))
460 percentile_algorithm = PC_NONE;
463 if ( 0 == subc_list_double_count(&percentile_list))
465 subc_list_double_push(&percentile_list, 5);
466 subc_list_double_push(&percentile_list, 10);
467 subc_list_double_push(&percentile_list, 25);
468 subc_list_double_push(&percentile_list, 50);
469 subc_list_double_push(&percentile_list, 75);
470 subc_list_double_push(&percentile_list, 90);
471 subc_list_double_push(&percentile_list, 95);
477 /* TOTAL and NOTOTAL are simple, mutually exclusive flags */
479 xmn_custom_total(struct cmd_examine *p)
481 if ( p->sbc_nototal )
483 msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
491 xmn_custom_nototal(struct cmd_examine *p)
495 msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
504 /* Parser for the variables sub command
505 Returns 1 on success */
507 xmn_custom_variables(struct cmd_examine *cmd )
511 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
517 if (!parse_variables (default_dict, &dependent_vars, &n_dependent_vars,
518 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
520 free (dependent_vars);
524 assert(n_dependent_vars);
526 totals = xnmalloc (n_dependent_vars, sizeof *totals);
528 if ( lex_match(T_BY))
531 success = examine_parse_independent_vars(cmd);
532 if ( success != 1 ) {
533 free (dependent_vars);
544 /* Parse the clause specifying the factors */
546 examine_parse_independent_vars(struct cmd_examine *cmd)
549 struct factor *sf = xmalloc (sizeof *sf);
551 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
559 sf->indep_var[0] = parse_variable();
560 sf->indep_var[1] = 0;
567 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
574 sf->indep_var[1] = parse_variable();
579 sf->fstats = hsh_create(4,
580 (hsh_compare_func *) factor_statistics_compare,
581 (hsh_hash_func *) factor_statistics_hash,
582 (hsh_free_func *) factor_statistics_free,
590 if ( token == '.' || token == '/' )
593 success = examine_parse_independent_vars(cmd);
604 void populate_percentiles(struct tab_table *tbl, int col, int row,
605 const struct metrics *m);
607 void populate_descriptives(struct tab_table *t, int col, int row,
608 const struct metrics *fs);
610 void populate_extremes(struct tab_table *t, int col, int row, int n,
611 const struct metrics *m);
613 void populate_summary(struct tab_table *t, int col, int row,
614 const struct metrics *m);
619 static int bad_weight_warn = 1;
622 /* Perform calculations for the sub factors */
624 factor_calc(struct ccase *c, int case_no, double weight, int case_missing)
627 struct factor *fctr = factors;
631 struct factor_statistics **foo ;
632 union value indep_vals[2] ;
634 indep_vals[0] = * case_data(c, fctr->indep_var[0]->fv);
636 if ( fctr->indep_var[1] )
637 indep_vals[1] = * case_data(c, fctr->indep_var[1]->fv);
639 indep_vals[1].f = SYSMIS;
641 assert(fctr->fstats);
643 foo = ( struct factor_statistics ** )
644 hsh_probe(fctr->fstats, (void *) &indep_vals);
649 *foo = create_factor_statistics(n_dependent_vars,
653 for ( v = 0 ; v < n_dependent_vars ; ++v )
655 metrics_precalc( &(*foo)->m[v] );
660 for ( v = 0 ; v < n_dependent_vars ; ++v )
662 const struct variable *var = dependent_vars[v];
663 const union value *val = case_data (c, var->fv);
665 if ( value_is_missing (&var->miss, val) || case_missing )
668 metrics_calc( &(*foo)->m[v], val, weight, case_no);
679 run_examine(const struct casefile *cf, void *cmd_ )
681 struct casereader *r;
685 const struct cmd_examine *cmd = (struct cmd_examine *) cmd_;
687 /* Make sure we haven't got rubbish left over from a
689 struct factor *fctr = factors;
692 struct factor *next = fctr->next;
694 hsh_clear(fctr->fstats);
703 for ( v = 0 ; v < n_dependent_vars ; ++v )
704 metrics_precalc(&totals[v]);
706 for(r = casefile_get_reader (cf);
707 casereader_read (r, &c) ;
711 const int case_no = casereader_cnum(r);
713 const double weight =
714 dict_get_case_weight(default_dict, &c, &bad_weight_warn);
716 if ( cmd->miss == XMN_LISTWISE )
718 for ( v = 0 ; v < n_dependent_vars ; ++v )
720 const struct variable *var = dependent_vars[v];
721 const union value *val = case_data (&c, var->fv);
723 if ( value_is_missing(&var->miss, val))
729 for ( v = 0 ; v < n_dependent_vars ; ++v )
731 const struct variable *var = dependent_vars[v];
732 const union value *val = case_data (&c, var->fv);
734 if ( value_is_missing(&var->miss, val) || case_missing )
737 metrics_calc(&totals[v], val, weight, case_no);
741 factor_calc(&c, case_no, weight, case_missing);
746 for ( v = 0 ; v < n_dependent_vars ; ++v)
751 struct hsh_iterator hi;
752 struct factor_statistics *fs;
754 for ( fs = hsh_first(fctr->fstats, &hi);
756 fs = hsh_next(fctr->fstats, &hi))
759 fs->m[v].ptile_hash = list_to_ptile_hash(&percentile_list);
760 fs->m[v].ptile_alg = percentile_algorithm;
761 metrics_postcalc(&fs->m[v]);
767 totals[v].ptile_hash = list_to_ptile_hash(&percentile_list);
768 totals[v].ptile_alg = percentile_algorithm;
769 metrics_postcalc(&totals[v]);
773 /* Make sure that the combination of factors are complete */
778 struct hsh_iterator hi;
779 struct hsh_iterator hi0;
780 struct hsh_iterator hi1;
781 struct factor_statistics *fs;
783 struct hsh_table *idh0=0;
784 struct hsh_table *idh1=0;
788 idh0 = hsh_create(4, (hsh_compare_func *) compare_values,
789 (hsh_hash_func *) hash_value,
792 idh1 = hsh_create(4, (hsh_compare_func *) compare_values,
793 (hsh_hash_func *) hash_value,
797 for ( fs = hsh_first(fctr->fstats, &hi);
799 fs = hsh_next(fctr->fstats, &hi))
801 hsh_insert(idh0,(void *) &fs->id[0]);
802 hsh_insert(idh1,(void *) &fs->id[1]);
805 /* Ensure that the factors combination is complete */
806 for ( val0 = hsh_first(idh0, &hi0);
808 val0 = hsh_next(idh0, &hi0))
810 for ( val1 = hsh_first(idh1, &hi1);
812 val1 = hsh_next(idh1, &hi1))
814 struct factor_statistics **ffs;
819 ffs = (struct factor_statistics **)
820 hsh_probe(fctr->fstats, (void *) &key );
824 (*ffs) = create_factor_statistics (n_dependent_vars,
826 for ( i = 0 ; i < n_dependent_vars ; ++i )
827 metrics_precalc( &(*ffs)->m[i]);
835 fctr->fs = (struct factor_statistics **) hsh_sort_copy(fctr->fstats);
846 for ( i = 0 ; i < n_dependent_vars ; ++i )
848 metrics_destroy(&totals[i]);
857 show_summary(struct variable **dependent_var, int n_dep_var,
858 const struct factor *fctr)
860 static const char *subtitle[]=
868 int heading_columns ;
870 const int heading_rows = 3;
871 struct tab_table *tbl;
879 n_factors = hsh_count(fctr->fstats);
880 n_rows = n_dep_var * n_factors ;
882 if ( fctr->indep_var[1] )
891 n_rows += heading_rows;
893 n_cols = heading_columns + 6;
895 tbl = tab_create (n_cols,n_rows,0);
896 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
898 tab_dim (tbl, tab_natural_dimensions);
900 /* Outline the box */
905 n_cols - 1, n_rows - 1);
907 /* Vertical lines for the data only */
912 n_cols - 1, n_rows - 1);
915 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
916 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, 1 );
917 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, heading_rows -1 );
919 tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1);
922 tab_title (tbl, _("Case Processing Summary"));
925 tab_joint_text(tbl, heading_columns, 0,
927 TAB_CENTER | TAT_TITLE,
930 /* Remove lines ... */
937 for ( i = 0 ; i < 3 ; ++i )
939 tab_text (tbl, heading_columns + i*2 , 2, TAB_CENTER | TAT_TITLE,
942 tab_text (tbl, heading_columns + i*2 + 1, 2, TAB_CENTER | TAT_TITLE,
945 tab_joint_text(tbl, heading_columns + i*2 , 1,
946 heading_columns + i*2 + 1, 1,
947 TAB_CENTER | TAT_TITLE,
950 tab_box (tbl, -1, -1,
952 heading_columns + i*2, 1,
953 heading_columns + i*2 + 1, 1);
958 /* Titles for the independent variables */
961 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
962 var_to_string(fctr->indep_var[0]));
964 if ( fctr->indep_var[1] )
966 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
967 var_to_string(fctr->indep_var[1]));
973 for ( i = 0 ; i < n_dep_var ; ++i )
977 n_factors = hsh_count(fctr->fstats);
981 tab_hline(tbl, TAL_1, 0, n_cols -1 , i * n_factors + heading_rows);
984 0, i * n_factors + heading_rows,
985 TAB_LEFT | TAT_TITLE,
986 var_to_string(dependent_var[i])
991 populate_summary(tbl, heading_columns,
992 (i * n_factors) + heading_rows,
998 struct factor_statistics **fs = fctr->fs;
1003 static union value prev;
1005 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1006 fctr->indep_var[0]->width))
1010 (i * n_factors ) + count +
1012 TAB_LEFT | TAT_TITLE,
1013 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1016 if (fctr->indep_var[1] && count > 0 )
1017 tab_hline(tbl, TAL_1, 1, n_cols - 1,
1018 (i * n_factors ) + count + heading_rows);
1022 prev = (*fs)->id[0];
1025 if ( fctr->indep_var[1])
1028 (i * n_factors ) + count +
1030 TAB_LEFT | TAT_TITLE,
1031 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1034 populate_summary(tbl, heading_columns,
1035 (i * n_factors) + count
1050 populate_summary(struct tab_table *t, int col, int row,
1051 const struct metrics *m)
1054 const double total = m->n + m->n_missing ;
1056 tab_float(t, col + 0, row + 0, TAB_RIGHT, m->n, 8, 0);
1057 tab_float(t, col + 2, row + 0, TAB_RIGHT, m->n_missing, 8, 0);
1058 tab_float(t, col + 4, row + 0, TAB_RIGHT, total, 8, 0);
1062 tab_text (t, col + 1, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1063 100.0 * m->n / total );
1065 tab_text (t, col + 3, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1066 100.0 * m->n_missing / total );
1068 /* This seems a bit pointless !!! */
1069 tab_text (t, col + 5, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1070 100.0 * total / total );
1081 show_extremes(struct variable **dependent_var, int n_dep_var,
1082 const struct factor *fctr, int n_extremities)
1085 int heading_columns ;
1087 const int heading_rows = 1;
1088 struct tab_table *tbl;
1095 heading_columns = 2;
1096 n_factors = hsh_count(fctr->fstats);
1098 n_rows = n_dep_var * 2 * n_extremities * n_factors;
1100 if ( fctr->indep_var[1] )
1101 heading_columns = 3;
1105 heading_columns = 1;
1106 n_rows = n_dep_var * 2 * n_extremities;
1109 n_rows += heading_rows;
1111 heading_columns += 2;
1112 n_cols = heading_columns + 2;
1114 tbl = tab_create (n_cols,n_rows,0);
1115 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
1117 tab_dim (tbl, tab_natural_dimensions);
1119 /* Outline the box, No internal lines*/
1124 n_cols - 1, n_rows - 1);
1126 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1128 tab_title (tbl, _("Extreme Values"));
1130 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows -1);
1131 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows -1);
1135 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1136 var_to_string(fctr->indep_var[0]));
1138 if ( fctr->indep_var[1] )
1139 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1140 var_to_string(fctr->indep_var[1]));
1143 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Value"));
1144 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Case Number"));
1146 for ( i = 0 ; i < n_dep_var ; ++i )
1150 tab_hline(tbl, TAL_1, 0, n_cols -1 ,
1151 i * 2 * n_extremities * n_factors + heading_rows);
1154 i * 2 * n_extremities * n_factors + heading_rows,
1155 TAB_LEFT | TAT_TITLE,
1156 var_to_string(dependent_var[i])
1161 populate_extremes(tbl, heading_columns - 2,
1162 i * 2 * n_extremities * n_factors + heading_rows,
1163 n_extremities, &totals[i]);
1167 struct factor_statistics **fs = fctr->fs;
1172 static union value prev ;
1174 const int row = heading_rows + ( 2 * n_extremities ) *
1175 ( ( i * n_factors ) + count );
1178 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1179 fctr->indep_var[0]->width))
1183 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1187 TAB_LEFT | TAT_TITLE,
1188 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1192 prev = (*fs)->id[0];
1194 if (fctr->indep_var[1] && count > 0 )
1195 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
1197 if ( fctr->indep_var[1])
1198 tab_text (tbl, 2, row,
1199 TAB_LEFT | TAT_TITLE,
1200 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1203 populate_extremes(tbl, heading_columns - 2,
1218 /* Fill in the extremities table */
1220 populate_extremes(struct tab_table *t,
1221 int col, int row, int n, const struct metrics *m)
1227 tab_text(t, col, row,
1228 TAB_RIGHT | TAT_TITLE ,
1232 tab_text(t, col, row + n ,
1233 TAB_RIGHT | TAT_TITLE ,
1238 tab_hline(t, TAL_1, col, col + 3, row + n );
1240 for (extremity = 0; extremity < n ; ++extremity )
1243 tab_float(t, col + 1, row + extremity,
1245 extremity + 1, 8, 0);
1249 tab_float(t, col + 1, row + extremity + n,
1251 extremity + 1, 8, 0);
1257 for (idx = 0, extremity = 0; extremity < n && idx < m->n_data ; ++idx )
1260 const struct weighted_value *wv = m->wvp[idx];
1261 struct case_node *cn = wv->case_nos;
1264 for (j = 0 ; j < wv->w ; ++j )
1266 if ( extremity + j >= n )
1269 tab_float(t, col + 3, row + extremity + j + n,
1273 tab_float(t, col + 2, row + extremity + j + n,
1282 extremity += wv->w ;
1287 for (idx = m->n_data - 1, extremity = 0; extremity < n && idx >= 0; --idx )
1290 const struct weighted_value *wv = m->wvp[idx];
1291 struct case_node *cn = wv->case_nos;
1293 for (j = 0 ; j < wv->w ; ++j )
1295 if ( extremity + j >= n )
1298 tab_float(t, col + 3, row + extremity + j,
1302 tab_float(t, col + 2, row + extremity + j,
1311 extremity += wv->w ;
1316 /* Show the descriptives table */
1318 show_descriptives(struct variable **dependent_var,
1320 struct factor *fctr)
1323 int heading_columns ;
1325 const int n_stat_rows = 13;
1327 const int heading_rows = 1;
1329 struct tab_table *tbl;
1336 heading_columns = 4;
1337 n_factors = hsh_count(fctr->fstats);
1339 n_rows = n_dep_var * n_stat_rows * n_factors;
1341 if ( fctr->indep_var[1] )
1342 heading_columns = 5;
1346 heading_columns = 3;
1347 n_rows = n_dep_var * n_stat_rows;
1350 n_rows += heading_rows;
1352 n_cols = heading_columns + 2;
1355 tbl = tab_create (n_cols, n_rows, 0);
1357 tab_headers (tbl, heading_columns + 1, 0, heading_rows, 0);
1359 tab_dim (tbl, tab_natural_dimensions);
1361 /* Outline the box and have no internal lines*/
1366 n_cols - 1, n_rows - 1);
1368 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1370 tab_vline (tbl, TAL_1, heading_columns, 0, n_rows - 1);
1371 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows - 1);
1372 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows - 1);
1374 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Statistic"));
1375 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
1377 tab_title (tbl, _("Descriptives"));
1380 for ( i = 0 ; i < n_dep_var ; ++i )
1382 const int row = heading_rows + i * n_stat_rows * n_factors ;
1385 tab_hline(tbl, TAL_1, 0, n_cols - 1, row );
1388 i * n_stat_rows * n_factors + heading_rows,
1389 TAB_LEFT | TAT_TITLE,
1390 var_to_string(dependent_var[i])
1396 struct factor_statistics **fs = fctr->fs;
1399 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1400 var_to_string(fctr->indep_var[0]));
1403 if ( fctr->indep_var[1])
1404 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1405 var_to_string(fctr->indep_var[1]));
1410 static union value prev ;
1412 const int row = heading_rows + n_stat_rows *
1413 ( ( i * n_factors ) + count );
1416 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1417 fctr->indep_var[0]->width))
1421 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1425 TAB_LEFT | TAT_TITLE,
1426 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1430 prev = (*fs)->id[0];
1432 if (fctr->indep_var[1] && count > 0 )
1433 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
1435 if ( fctr->indep_var[1])
1436 tab_text (tbl, 2, row,
1437 TAB_LEFT | TAT_TITLE,
1438 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1441 populate_descriptives(tbl, heading_columns - 2,
1453 populate_descriptives(tbl, heading_columns - 2,
1454 i * n_stat_rows * n_factors + heading_rows,
1466 /* Fill in the descriptives data */
1468 populate_descriptives(struct tab_table *tbl, int col, int row,
1469 const struct metrics *m)
1472 const double t = gsl_cdf_tdist_Qinv(1 - cmd.n_cinterval[0]/100.0/2.0, \
1478 TAB_LEFT | TAT_TITLE,
1481 tab_float (tbl, col + 2,
1487 tab_float (tbl, col + 3,
1496 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1497 _("%g%% Confidence Interval for Mean"), cmd.n_cinterval[0]);
1500 tab_text (tbl, col + 1,
1502 TAB_LEFT | TAT_TITLE,
1505 tab_float (tbl, col + 2,
1508 m->mean - t * m->se_mean,
1511 tab_text (tbl, col + 1,
1513 TAB_LEFT | TAT_TITLE,
1517 tab_float (tbl, col + 2,
1520 m->mean + t * m->se_mean,
1525 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1526 _("5%% Trimmed Mean"));
1528 tab_float (tbl, col + 2,
1536 TAB_LEFT | TAT_TITLE,
1540 struct percentile *p;
1543 p = hsh_find(m->ptile_hash, &d);
1548 tab_float (tbl, col + 2,
1558 TAB_LEFT | TAT_TITLE,
1561 tab_float (tbl, col + 2,
1570 TAB_LEFT | TAT_TITLE,
1571 _("Std. Deviation"));
1574 tab_float (tbl, col + 2,
1583 TAB_LEFT | TAT_TITLE,
1586 tab_float (tbl, col + 2,
1594 TAB_LEFT | TAT_TITLE,
1597 tab_float (tbl, col + 2,
1606 TAB_LEFT | TAT_TITLE,
1610 tab_float (tbl, col + 2,
1618 TAB_LEFT | TAT_TITLE,
1619 _("Interquartile Range"));
1622 struct percentile *p1;
1623 struct percentile *p2;
1626 p1 = hsh_find(m->ptile_hash, &d);
1629 p2 = hsh_find(m->ptile_hash, &d);
1634 tab_float (tbl, col + 2,
1645 TAB_LEFT | TAT_TITLE,
1649 tab_float (tbl, col + 2,
1655 /* stderr of skewness */
1656 tab_float (tbl, col + 3,
1665 TAB_LEFT | TAT_TITLE,
1669 tab_float (tbl, col + 2,
1675 /* stderr of kurtosis */
1676 tab_float (tbl, col + 3,
1688 box_plot_variables(const struct factor *fctr,
1689 const struct variable **vars, int n_vars,
1690 const struct variable *id)
1694 struct factor_statistics **fs ;
1698 box_plot_group(fctr, vars, n_vars, id);
1702 for ( fs = fctr->fs ; *fs ; ++fs )
1704 double y_min = DBL_MAX;
1705 double y_max = -DBL_MAX;
1706 struct chart *ch = chart_create();
1707 const char *s = factor_to_string(fctr, *fs, 0 );
1709 chart_write_title(ch, s);
1711 for ( i = 0 ; i < n_vars ; ++i )
1713 y_max = max(y_max, (*fs)->m[i].max);
1714 y_min = min(y_min, (*fs)->m[i].min);
1717 boxplot_draw_yscale(ch, y_max, y_min);
1719 for ( i = 0 ; i < n_vars ; ++i )
1722 const double box_width = (ch->data_right - ch->data_left)
1725 const double box_centre = ( i * 2 + 1) * box_width
1728 boxplot_draw_boxplot(ch,
1729 box_centre, box_width,
1731 var_to_string(vars[i]));
1743 /* Do a box plot, grouping all factors into one plot ;
1744 each dependent variable has its own plot.
1747 box_plot_group(const struct factor *fctr,
1748 const struct variable **vars,
1750 const struct variable *id UNUSED)
1755 for ( i = 0 ; i < n_vars ; ++i )
1757 struct factor_statistics **fs ;
1760 ch = chart_create();
1762 boxplot_draw_yscale(ch, totals[i].max, totals[i].min);
1768 for ( fs = fctr->fs ; *fs ; ++fs )
1771 chart_write_title(ch, _("Boxplot of %s vs. %s"),
1772 var_to_string(vars[i]), var_to_string(fctr->indep_var[0]) );
1774 for ( fs = fctr->fs ; *fs ; ++fs )
1777 const char *s = factor_to_string_concise(fctr, *fs);
1779 const double box_width = (ch->data_right - ch->data_left)
1780 / (n_factors * 2.0 ) ;
1782 const double box_centre = ( f++ * 2 + 1) * box_width
1785 boxplot_draw_boxplot(ch,
1786 box_centre, box_width,
1793 const double box_width = (ch->data_right - ch->data_left) / 3.0;
1794 const double box_centre = (ch->data_right + ch->data_left) / 2.0;
1796 chart_write_title(ch, _("Boxplot"));
1798 boxplot_draw_boxplot(ch,
1799 box_centre, box_width,
1801 var_to_string(vars[i]) );
1810 /* Plot the normal and detrended normal plots for m
1811 Label the plots with factorname */
1813 np_plot(const struct metrics *m, const char *factorname)
1816 double yfirst=0, ylast=0;
1819 struct chart *np_chart;
1821 /* Detrended Normal Plot */
1822 struct chart *dnp_chart;
1824 /* The slope and intercept of the ideal normal probability line */
1825 const double slope = 1.0 / m->stddev;
1826 const double intercept = - m->mean / m->stddev;
1828 /* Cowardly refuse to plot an empty data set */
1829 if ( m->n_data == 0 )
1832 np_chart = chart_create();
1833 dnp_chart = chart_create();
1835 if ( !np_chart || ! dnp_chart )
1838 chart_write_title(np_chart, _("Normal Q-Q Plot of %s"), factorname);
1839 chart_write_xlabel(np_chart, _("Observed Value"));
1840 chart_write_ylabel(np_chart, _("Expected Normal"));
1843 chart_write_title(dnp_chart, _("Detrended Normal Q-Q Plot of %s"),
1845 chart_write_xlabel(dnp_chart, _("Observed Value"));
1846 chart_write_ylabel(dnp_chart, _("Dev from Normal"));
1848 yfirst = gsl_cdf_ugaussian_Pinv (m->wvp[0]->rank / ( m->n + 1));
1849 ylast = gsl_cdf_ugaussian_Pinv (m->wvp[m->n_data-1]->rank / ( m->n + 1));
1853 /* Need to make sure that both the scatter plot and the ideal fit into the
1855 double x_lower = min(m->min, (yfirst - intercept) / slope) ;
1856 double x_upper = max(m->max, (ylast - intercept) / slope) ;
1857 double slack = (x_upper - x_lower) * 0.05 ;
1859 chart_write_xscale(np_chart, x_lower - slack, x_upper + slack, 5);
1861 chart_write_xscale(dnp_chart, m->min, m->max, 5);
1865 chart_write_yscale(np_chart, yfirst, ylast, 5);
1868 /* We have to cache the detrended data, beacause we need to
1869 find its limits before we can plot it */
1870 double *d_data = xnmalloc (m->n_data, sizeof *d_data);
1871 double d_max = -DBL_MAX;
1872 double d_min = DBL_MAX;
1873 for ( i = 0 ; i < m->n_data; ++i )
1875 const double ns = gsl_cdf_ugaussian_Pinv (m->wvp[i]->rank / ( m->n + 1));
1877 chart_datum(np_chart, 0, m->wvp[i]->v.f, ns);
1879 d_data[i] = (m->wvp[i]->v.f - m->mean) / m->stddev - ns;
1881 if ( d_data[i] < d_min ) d_min = d_data[i];
1882 if ( d_data[i] > d_max ) d_max = d_data[i];
1884 chart_write_yscale(dnp_chart, d_min, d_max, 5);
1886 for ( i = 0 ; i < m->n_data; ++i )
1887 chart_datum(dnp_chart, 0, m->wvp[i]->v.f, d_data[i]);
1892 chart_line(np_chart, slope, intercept, yfirst, ylast , CHART_DIM_Y);
1893 chart_line(dnp_chart, 0, 0, m->min, m->max , CHART_DIM_X);
1895 chart_submit(np_chart);
1896 chart_submit(dnp_chart);
1902 /* Show the percentiles */
1904 show_percentiles(struct variable **dependent_var,
1906 struct factor *fctr)
1908 struct tab_table *tbl;
1914 struct hsh_table *ptiles ;
1916 int n_heading_columns;
1917 const int n_heading_rows = 2;
1918 const int n_stat_rows = 2;
1924 struct factor_statistics **fs = fctr->fs ;
1925 n_heading_columns = 3;
1926 n_factors = hsh_count(fctr->fstats);
1928 ptiles = (*fs)->m[0].ptile_hash;
1930 if ( fctr->indep_var[1] )
1931 n_heading_columns = 4;
1936 n_heading_columns = 2;
1938 ptiles = totals[0].ptile_hash;
1941 n_ptiles = hsh_count(ptiles);
1943 n_rows = n_heading_rows + n_dep_var * n_stat_rows * n_factors;
1945 n_cols = n_heading_columns + n_ptiles ;
1947 tbl = tab_create (n_cols, n_rows, 0);
1949 tab_headers (tbl, n_heading_columns + 1, 0, n_heading_rows, 0);
1951 tab_dim (tbl, tab_natural_dimensions);
1953 /* Outline the box and have no internal lines*/
1958 n_cols - 1, n_rows - 1);
1960 tab_hline (tbl, TAL_2, 0, n_cols - 1, n_heading_rows );
1962 tab_vline (tbl, TAL_2, n_heading_columns, 0, n_rows - 1);
1965 tab_title (tbl, _("Percentiles"));
1968 tab_hline (tbl, TAL_1, n_heading_columns, n_cols - 1, 1 );
1975 n_heading_columns - 1, n_rows - 1);
1981 n_heading_columns, n_heading_rows - 1,
1982 n_cols - 1, n_rows - 1);
1984 tab_joint_text(tbl, n_heading_columns + 1, 0,
1986 TAB_CENTER | TAT_TITLE ,
1991 /* Put in the percentile break points as headings */
1993 struct percentile **p = (struct percentile **) hsh_sort(ptiles);
1998 tab_float(tbl, n_heading_columns + i++ , 1,
2007 for ( i = 0 ; i < n_dep_var ; ++i )
2009 const int n_stat_rows = 2;
2010 const int row = n_heading_rows + i * n_stat_rows * n_factors ;
2013 tab_hline(tbl, TAL_1, 0, n_cols - 1, row );
2016 i * n_stat_rows * n_factors + n_heading_rows,
2017 TAB_LEFT | TAT_TITLE,
2018 var_to_string(dependent_var[i])
2023 struct factor_statistics **fs = fctr->fs;
2026 tab_text (tbl, 1, n_heading_rows - 1,
2027 TAB_CENTER | TAT_TITLE,
2028 var_to_string(fctr->indep_var[0]));
2031 if ( fctr->indep_var[1])
2032 tab_text (tbl, 2, n_heading_rows - 1, TAB_CENTER | TAT_TITLE,
2033 var_to_string(fctr->indep_var[1]));
2038 static union value prev ;
2040 const int row = n_heading_rows + n_stat_rows *
2041 ( ( i * n_factors ) + count );
2044 if ( 0 != compare_values(&prev, &(*fs)->id[0],
2045 fctr->indep_var[0]->width))
2049 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
2053 TAB_LEFT | TAT_TITLE,
2054 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
2060 prev = (*fs)->id[0];
2062 if (fctr->indep_var[1] && count > 0 )
2063 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
2065 if ( fctr->indep_var[1])
2066 tab_text (tbl, 2, row,
2067 TAB_LEFT | TAT_TITLE,
2068 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
2072 populate_percentiles(tbl, n_heading_columns - 1,
2084 populate_percentiles(tbl, n_heading_columns - 1,
2085 i * n_stat_rows * n_factors + n_heading_rows,
2102 populate_percentiles(struct tab_table *tbl, int col, int row,
2103 const struct metrics *m)
2107 struct percentile **p = (struct percentile **) hsh_sort(m->ptile_hash);
2111 TAB_LEFT | TAT_TITLE,
2112 _("Tukey\'s Hinges")
2117 TAB_LEFT | TAT_TITLE,
2118 ptile_alg_desc[m->ptile_alg]
2125 tab_float(tbl, col + i + 1 , row,
2128 if ( (*p)->p == 25 )
2129 tab_float(tbl, col + i + 1 , row + 1,
2133 if ( (*p)->p == 50 )
2134 tab_float(tbl, col + i + 1 , row + 1,
2138 if ( (*p)->p == 75 )
2139 tab_float(tbl, col + i + 1 , row + 1,
2154 factor_to_string(const struct factor *fctr,
2155 struct factor_statistics *fs,
2156 const struct variable *var)
2159 static char buf1[100];
2165 sprintf(buf1, "%s (",var_to_string(var) );
2168 snprintf(buf2, 100, "%s = %s",
2169 var_to_string(fctr->indep_var[0]),
2170 value_to_string(&fs->id[0],fctr->indep_var[0]));
2174 if ( fctr->indep_var[1] )
2176 sprintf(buf2, "; %s = %s)",
2177 var_to_string(fctr->indep_var[1]),
2178 value_to_string(&fs->id[1],
2179 fctr->indep_var[1]));
2194 factor_to_string_concise(const struct factor *fctr,
2195 struct factor_statistics *fs)
2199 static char buf[100];
2203 snprintf(buf, 100, "%s",
2204 value_to_string(&fs->id[0], fctr->indep_var[0]));
2206 if ( fctr->indep_var[1] )
2208 sprintf(buf2, ",%s)", value_to_string(&fs->id[1], fctr->indep_var[1]) );