1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2004 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <gsl/gsl_cdf.h>
20 #include <libpspp/message.h>
25 #include <data/case.h>
26 #include <data/casegrouper.h>
27 #include <data/casereader.h>
28 #include <data/dictionary.h>
29 #include <data/procedure.h>
30 #include <data/value-labels.h>
31 #include <data/variable.h>
32 #include <language/command.h>
33 #include <language/dictionary/split-file.h>
34 #include <language/lexer/lexer.h>
35 #include <libpspp/compiler.h>
36 #include <libpspp/hash.h>
37 #include <libpspp/message.h>
38 #include <libpspp/misc.h>
39 #include <libpspp/str.h>
40 #include <math/factor-stats.h>
41 #include <math/moments.h>
42 #include <math/percentiles.h>
43 #include <output/charts/box-whisker.h>
44 #include <output/charts/cartesian.h>
45 #include <output/manager.h>
46 #include <output/table.h>
52 #define _(msgid) gettext (msgid)
53 #define N_(msgid) msgid
56 #include <output/chart.h>
57 #include <output/charts/plot-hist.h>
58 #include <output/charts/plot-chart.h>
65 missing=miss:pairwise/!listwise,
67 incl:include/!exclude;
68 +compare=cmp:variables/!groups;
71 +plot[plt_]=stemleaf,boxplot,npplot,:spreadlevel(*d:n),histogram,all,none;
73 +statistics[st_]=descriptives,:extreme(*d:n),all,none.
82 static struct cmd_examine cmd;
84 static const struct variable **dependent_vars;
86 static size_t n_dependent_vars;
91 /* The independent variable */
92 struct variable *indep_var[2];
95 /* Hash table of factor stats indexed by 2 values */
96 struct hsh_table *fstats;
98 /* The hash table after it has been crunched */
99 struct factor_statistics **fs;
105 /* Linked list of factors */
106 static struct factor *factors = 0;
108 static struct metrics *totals = 0;
110 /* Parse the clause specifying the factors */
111 static int examine_parse_independent_vars (struct lexer *lexer, const struct dictionary *dict, struct cmd_examine *cmd);
115 /* Output functions */
116 static void show_summary (const struct variable **dependent_var, int n_dep_var,
117 const struct factor *f);
119 static void show_extremes (const struct variable **dependent_var,
121 const struct factor *factor,
124 static void show_descriptives (const struct variable **dependent_var,
126 struct factor *factor);
128 static void show_percentiles (const struct variable **dependent_var,
130 struct factor *factor);
135 void np_plot (const struct metrics *m, const char *factorname);
138 void box_plot_group (const struct factor *fctr,
139 const struct variable **vars, int n_vars,
140 const struct variable *id
144 void box_plot_variables (const struct factor *fctr,
145 const struct variable **vars, int n_vars,
146 const struct variable *id
151 /* Per Split function */
152 static void run_examine (struct cmd_examine *, struct casereader *,
155 static void output_examine (void);
158 void factor_calc (const struct ccase *c, int case_no,
159 double weight, bool case_missing);
162 /* Represent a factor as a string, so it can be
163 printed in a human readable fashion */
164 static void factor_to_string (const struct factor *fctr,
165 const struct factor_statistics *fs,
166 const struct variable *var,
170 /* Represent a factor as a string, so it can be
171 printed in a human readable fashion,
172 but sacrificing some readablility for the sake of brevity */
173 static void factor_to_string_concise (const struct factor *fctr,
174 const struct factor_statistics *fs,
180 /* Categories of missing values to exclude. */
181 static enum mv_class exclude_values;
185 static subc_list_double percentile_list;
187 static enum pc_alg percentile_algorithm;
189 static short sbc_percentile;
193 cmd_examine (struct lexer *lexer, struct dataset *ds)
195 struct casegrouper *grouper;
196 struct casereader *group;
199 subc_list_double_create (&percentile_list);
200 percentile_algorithm = PC_HAVERAGE;
202 if ( !parse_examine (lexer, ds, &cmd, NULL) )
204 subc_list_double_destroy (&percentile_list);
208 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
209 exclude_values = cmd.incl == XMN_INCLUDE ? MV_SYSTEM : MV_ANY;
211 if ( cmd.st_n == SYSMIS )
214 if ( ! cmd.sbc_cinterval)
215 cmd.n_cinterval[0] = 95.0;
217 /* If descriptives have been requested, make sure the
218 quartiles are calculated */
219 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] )
221 subc_list_double_push (&percentile_list, 25);
222 subc_list_double_push (&percentile_list, 50);
223 subc_list_double_push (&percentile_list, 75);
226 grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
227 while (casegrouper_get_next_group (grouper, &group))
228 run_examine (&cmd, group, ds);
229 ok = casegrouper_destroy (grouper);
230 ok = proc_commit (ds) && ok;
237 if ( dependent_vars )
238 free (dependent_vars);
241 struct factor *f = factors ;
244 struct factor *ff = f;
248 hsh_destroy ( ff->fstats ) ;
254 subc_list_double_destroy (&percentile_list);
256 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
261 /* Show all the appropriate tables */
263 output_examine (void)
267 /* Show totals if appropriate */
268 if ( ! cmd.sbc_nototal || factors == 0 )
270 show_summary (dependent_vars, n_dependent_vars, 0);
272 if ( cmd.sbc_statistics )
274 if ( cmd.a_statistics[XMN_ST_EXTREME])
275 show_extremes (dependent_vars, n_dependent_vars, 0, cmd.st_n);
277 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
278 show_descriptives (dependent_vars, n_dependent_vars, 0);
281 if ( sbc_percentile )
282 show_percentiles (dependent_vars, n_dependent_vars, 0);
287 if ( cmd.a_plot[XMN_PLT_STEMLEAF] )
288 msg (SW, _ ("%s is not currently supported."), "STEMLEAF");
290 if ( cmd.a_plot[XMN_PLT_SPREADLEVEL] )
291 msg (SW, _ ("%s is not currently supported."), "SPREADLEVEL");
293 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
295 for ( v = 0 ; v < n_dependent_vars; ++v )
296 np_plot (&totals[v], var_to_string (dependent_vars[v]));
299 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
301 if ( cmd.cmp == XMN_GROUPS )
303 box_plot_group (0, (const struct variable **) dependent_vars,
304 n_dependent_vars, cmd.v_id);
307 box_plot_variables (0,
308 (const struct variable **) dependent_vars,
309 n_dependent_vars, cmd.v_id);
312 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
314 for ( v = 0 ; v < n_dependent_vars; ++v )
316 struct normal_curve normal;
318 normal.N = totals[v].n;
319 normal.mean = totals[v].mean;
320 normal.stddev = totals[v].stddev;
322 histogram_plot (totals[v].histogram,
323 var_to_string (dependent_vars[v]),
333 /* Show grouped statistics as appropriate */
337 show_summary (dependent_vars, n_dependent_vars, fctr);
339 if ( cmd.sbc_statistics )
341 if ( cmd.a_statistics[XMN_ST_EXTREME])
342 show_extremes (dependent_vars, n_dependent_vars, fctr, cmd.st_n);
344 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
345 show_descriptives (dependent_vars, n_dependent_vars, fctr);
348 if ( sbc_percentile )
349 show_percentiles (dependent_vars, n_dependent_vars, fctr);
356 struct factor_statistics **fs = fctr->fs ;
358 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
360 if ( cmd.cmp == XMN_VARIABLES )
361 box_plot_variables (fctr,
362 (const struct variable **) dependent_vars,
363 n_dependent_vars, cmd.v_id);
365 box_plot_group (fctr,
366 (const struct variable **) dependent_vars,
367 n_dependent_vars, cmd.v_id);
370 for ( v = 0 ; v < n_dependent_vars; ++v )
373 for ( fs = fctr->fs ; *fs ; ++fs )
376 ds_init_empty (&str);
377 factor_to_string (fctr, *fs, dependent_vars[v], &str);
379 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
380 np_plot (& (*fs)->m[v], ds_cstr (&str));
382 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
384 struct normal_curve normal;
386 normal.N = (*fs)->m[v].n;
387 normal.mean = (*fs)->m[v].mean;
388 normal.stddev = (*fs)->m[v].stddev;
390 histogram_plot ((*fs)->m[v].histogram,
391 ds_cstr (&str) , &normal, 0);
396 } /* for ( fs .... */
398 } /* for ( v = 0 ..... */
408 /* Create a hash table of percentiles and their values from the list of
410 static struct hsh_table *
411 list_to_ptile_hash (const subc_list_double *l)
415 struct hsh_table *h ;
417 h = hsh_create (subc_list_double_count (l),
418 (hsh_compare_func *) ptile_compare,
419 (hsh_hash_func *) ptile_hash,
420 (hsh_free_func *) free,
424 for ( i = 0 ; i < subc_list_double_count (l) ; ++i )
426 struct percentile *p = xmalloc (sizeof *p);
428 p->p = subc_list_double_at (l,i);
439 /* Parse the PERCENTILES subcommand */
441 xmn_custom_percentiles (struct lexer *lexer, struct dataset *ds UNUSED,
442 struct cmd_examine *p UNUSED, void *aux UNUSED)
446 lex_match (lexer, '=');
448 lex_match (lexer, '(');
450 while ( lex_is_number (lexer) )
452 subc_list_double_push (&percentile_list, lex_number (lexer));
456 lex_match (lexer, ',') ;
458 lex_match (lexer, ')');
460 lex_match (lexer, '=');
462 if ( lex_match_id (lexer, "HAVERAGE"))
463 percentile_algorithm = PC_HAVERAGE;
465 else if ( lex_match_id (lexer, "WAVERAGE"))
466 percentile_algorithm = PC_WAVERAGE;
468 else if ( lex_match_id (lexer, "ROUND"))
469 percentile_algorithm = PC_ROUND;
471 else if ( lex_match_id (lexer, "EMPIRICAL"))
472 percentile_algorithm = PC_EMPIRICAL;
474 else if ( lex_match_id (lexer, "AEMPIRICAL"))
475 percentile_algorithm = PC_AEMPIRICAL;
477 else if ( lex_match_id (lexer, "NONE"))
478 percentile_algorithm = PC_NONE;
481 if ( 0 == subc_list_double_count (&percentile_list))
483 subc_list_double_push (&percentile_list, 5);
484 subc_list_double_push (&percentile_list, 10);
485 subc_list_double_push (&percentile_list, 25);
486 subc_list_double_push (&percentile_list, 50);
487 subc_list_double_push (&percentile_list, 75);
488 subc_list_double_push (&percentile_list, 90);
489 subc_list_double_push (&percentile_list, 95);
495 /* TOTAL and NOTOTAL are simple, mutually exclusive flags */
497 xmn_custom_total (struct lexer *lexer UNUSED, struct dataset *ds UNUSED, struct cmd_examine *p, void *aux UNUSED)
499 if ( p->sbc_nototal )
501 msg (SE, _ ("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
509 xmn_custom_nototal (struct lexer *lexer UNUSED, struct dataset *ds UNUSED,
510 struct cmd_examine *p, void *aux UNUSED)
514 msg (SE, _ ("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
523 /* Parser for the variables sub command
524 Returns 1 on success */
526 xmn_custom_variables (struct lexer *lexer, struct dataset *ds, struct cmd_examine *cmd, void *aux UNUSED)
528 const struct dictionary *dict = dataset_dict (ds);
529 lex_match (lexer, '=');
531 if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
532 && lex_token (lexer) != T_ALL)
537 if (!parse_variables_const (lexer, dict, &dependent_vars, &n_dependent_vars,
538 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
540 free (dependent_vars);
544 assert (n_dependent_vars);
546 totals = xnmalloc (n_dependent_vars, sizeof *totals);
548 if ( lex_match (lexer, T_BY))
551 success = examine_parse_independent_vars (lexer, dict, cmd);
552 if ( success != 1 ) {
553 free (dependent_vars);
564 /* Parse the clause specifying the factors */
566 examine_parse_independent_vars (struct lexer *lexer, const struct dictionary *dict, struct cmd_examine *cmd)
569 struct factor *sf = xmalloc (sizeof *sf);
571 if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
572 && lex_token (lexer) != T_ALL)
579 sf->indep_var[0] = parse_variable (lexer, dict);
580 sf->indep_var[1] = 0;
582 if ( lex_token (lexer) == T_BY )
585 lex_match (lexer, T_BY);
587 if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
588 && lex_token (lexer) != T_ALL)
594 sf->indep_var[1] = parse_variable (lexer, dict);
599 sf->fstats = hsh_create (4,
600 (hsh_compare_func *) factor_statistics_compare,
601 (hsh_hash_func *) factor_statistics_hash,
602 (hsh_free_func *) factor_statistics_free,
608 lex_match (lexer, ',');
610 if ( lex_token (lexer) == '.' || lex_token (lexer) == '/' )
613 success = examine_parse_independent_vars (lexer, dict, cmd);
624 void populate_percentiles (struct tab_table *tbl, int col, int row,
625 const struct metrics *m);
627 void populate_descriptives (struct tab_table *t, int col, int row,
628 const struct metrics *fs);
630 void populate_extremes (struct tab_table *t, int col, int row, int n,
631 const struct metrics *m);
633 void populate_summary (struct tab_table *t, int col, int row,
634 const struct metrics *m);
639 /* Perform calculations for the sub factors */
641 factor_calc (const struct ccase *c, int case_no, double weight,
645 struct factor *fctr = factors;
649 struct factor_statistics **foo ;
650 union value *indep_vals[2] ;
652 indep_vals[0] = value_dup (
653 case_data (c, fctr->indep_var[0]),
654 var_get_width (fctr->indep_var[0])
657 if ( fctr->indep_var[1] )
658 indep_vals[1] = value_dup (
659 case_data (c, fctr->indep_var[1]),
660 var_get_width (fctr->indep_var[1])
664 const union value sm = {SYSMIS};
665 indep_vals[1] = value_dup (&sm, 0);
668 assert (fctr->fstats);
670 foo = ( struct factor_statistics ** )
671 hsh_probe (fctr->fstats, (void *) &indep_vals);
676 *foo = create_factor_statistics (n_dependent_vars,
680 for ( v = 0 ; v < n_dependent_vars ; ++v )
682 metrics_precalc ( & (*foo)->m[v] );
688 free (indep_vals[0]);
689 free (indep_vals[1]);
692 for ( v = 0 ; v < n_dependent_vars ; ++v )
694 const struct variable *var = dependent_vars[v];
695 union value *val = value_dup (
700 if (case_missing || var_is_value_missing (var, val, exclude_values))
706 metrics_calc ( & (*foo)->m[v], val, weight, case_no);
716 run_examine (struct cmd_examine *cmd, struct casereader *input,
719 struct dictionary *dict = dataset_dict (ds);
727 if (!casereader_peek (input, 0, &c))
729 casereader_destroy (input);
732 output_split_file_values (ds, &c);
735 input = casereader_create_filter_weight (input, dict, NULL, NULL);
736 input = casereader_create_counter (input, &case_no, 0);
738 /* Make sure we haven't got rubbish left over from a
743 struct factor *next = fctr->next;
745 hsh_clear (fctr->fstats);
752 for ( v = 0 ; v < n_dependent_vars ; ++v )
753 metrics_precalc (&totals[v]);
755 for (; casereader_read (input, &c); case_destroy (&c))
757 bool case_missing = false;
758 const double weight = dict_get_case_weight (dict, &c, NULL);
760 if ( cmd->miss == XMN_LISTWISE )
762 for ( v = 0 ; v < n_dependent_vars ; ++v )
764 const struct variable *var = dependent_vars[v];
765 union value *val = value_dup (
770 if ( var_is_value_missing (var, val, exclude_values))
777 for ( v = 0 ; v < n_dependent_vars ; ++v )
779 const struct variable *var = dependent_vars[v];
780 union value *val = value_dup (
785 if ( var_is_value_missing (var, val, exclude_values)
792 metrics_calc (&totals[v], val, weight, case_no);
797 factor_calc (&c, case_no, weight, case_missing);
799 ok = casereader_destroy (input);
801 for ( v = 0 ; v < n_dependent_vars ; ++v)
806 struct hsh_iterator hi;
807 struct factor_statistics *fs;
809 for ( fs = hsh_first (fctr->fstats, &hi);
811 fs = hsh_next (fctr->fstats, &hi))
814 fs->m[v].ptile_hash = list_to_ptile_hash (&percentile_list);
815 fs->m[v].ptile_alg = percentile_algorithm;
816 metrics_postcalc (&fs->m[v]);
822 totals[v].ptile_hash = list_to_ptile_hash (&percentile_list);
823 totals[v].ptile_alg = percentile_algorithm;
824 metrics_postcalc (&totals[v]);
828 /* Make sure that the combination of factors are complete */
833 struct hsh_iterator hi;
834 struct hsh_iterator hi0;
835 struct hsh_iterator hi1;
836 struct factor_statistics *fs;
838 struct hsh_table *idh0=0;
839 struct hsh_table *idh1=0;
843 idh0 = hsh_create (4, (hsh_compare_func *) compare_values,
844 (hsh_hash_func *) hash_value,
847 idh1 = hsh_create (4, (hsh_compare_func *) compare_values,
848 (hsh_hash_func *) hash_value,
852 for ( fs = hsh_first (fctr->fstats, &hi);
854 fs = hsh_next (fctr->fstats, &hi))
856 hsh_insert (idh0, (void *) &fs->id[0]);
857 hsh_insert (idh1, (void *) &fs->id[1]);
860 /* Ensure that the factors combination is complete */
861 for ( val0 = hsh_first (idh0, &hi0);
863 val0 = hsh_next (idh0, &hi0))
865 for ( val1 = hsh_first (idh1, &hi1);
867 val1 = hsh_next (idh1, &hi1))
869 struct factor_statistics **ffs;
874 ffs = (struct factor_statistics **)
875 hsh_probe (fctr->fstats, (void *) &key );
879 (*ffs) = create_factor_statistics (n_dependent_vars,
881 for ( i = 0 ; i < n_dependent_vars ; ++i )
882 metrics_precalc ( & (*ffs)->m[i]);
890 fctr->fs = (struct factor_statistics **) hsh_sort_copy (fctr->fstats);
902 for ( i = 0 ; i < n_dependent_vars ; ++i )
904 metrics_destroy (&totals[i]);
911 show_summary (const struct variable **dependent_var, int n_dep_var,
912 const struct factor *fctr)
914 static const char *subtitle[]=
922 int heading_columns ;
924 const int heading_rows = 3;
925 struct tab_table *tbl;
933 n_factors = hsh_count (fctr->fstats);
934 n_rows = n_dep_var * n_factors ;
936 if ( fctr->indep_var[1] )
945 n_rows += heading_rows;
947 n_cols = heading_columns + 6;
949 tbl = tab_create (n_cols,n_rows,0);
950 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
952 tab_dim (tbl, tab_natural_dimensions);
954 /* Outline the box */
959 n_cols - 1, n_rows - 1);
961 /* Vertical lines for the data only */
966 n_cols - 1, n_rows - 1);
969 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
970 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, 1 );
971 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, heading_rows -1 );
973 tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1);
976 tab_title (tbl, _ ("Case Processing Summary"));
978 tab_joint_text (tbl, heading_columns, 0,
980 TAB_CENTER | TAT_TITLE,
983 /* Remove lines ... */
990 for ( i = 0 ; i < 3 ; ++i )
992 tab_text (tbl, heading_columns + i * 2 , 2, TAB_CENTER | TAT_TITLE,
995 tab_text (tbl, heading_columns + i * 2 + 1, 2, TAB_CENTER | TAT_TITLE,
998 tab_joint_text (tbl, heading_columns + i*2 , 1,
999 heading_columns + i * 2 + 1, 1,
1000 TAB_CENTER | TAT_TITLE,
1003 tab_box (tbl, -1, -1,
1005 heading_columns + i * 2, 1,
1006 heading_columns + i * 2 + 1, 1);
1010 /* Titles for the independent variables */
1013 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1014 var_to_string (fctr->indep_var[0]));
1016 if ( fctr->indep_var[1] )
1018 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1019 var_to_string (fctr->indep_var[1]));
1024 for ( i = 0 ; i < n_dep_var ; ++i )
1028 n_factors = hsh_count (fctr->fstats);
1031 tab_hline (tbl, TAL_1, 0, n_cols -1 , i * n_factors + heading_rows);
1034 0, i * n_factors + heading_rows,
1035 TAB_LEFT | TAT_TITLE,
1036 var_to_string (dependent_var[i])
1040 populate_summary (tbl, heading_columns,
1041 (i * n_factors) + heading_rows,
1045 struct factor_statistics **fs = fctr->fs;
1047 const union value *prev = NULL;
1052 0 != compare_values (prev, (*fs)->id[0],
1053 var_get_width (fctr->indep_var[0])))
1056 ds_init_empty (&vstr);
1057 var_append_value_name (fctr->indep_var[0],
1058 (*fs)->id[0], &vstr);
1062 (i * n_factors ) + count +
1064 TAB_LEFT | TAT_TITLE,
1070 if (fctr->indep_var[1] && count > 0 )
1071 tab_hline (tbl, TAL_1, 1, n_cols - 1,
1072 (i * n_factors ) + count + heading_rows);
1075 prev = (*fs)->id[0];
1077 if ( fctr->indep_var[1])
1080 ds_init_empty (&vstr);
1081 var_append_value_name (fctr->indep_var[1],
1082 (*fs)->id[1], &vstr);
1085 (i * n_factors ) + count +
1087 TAB_LEFT | TAT_TITLE,
1093 populate_summary (tbl, heading_columns,
1094 (i * n_factors) + count
1109 populate_summary (struct tab_table *t, int col, int row,
1110 const struct metrics *m)
1113 const double total = m->n + m->n_missing ;
1115 tab_float (t, col + 0, row + 0, TAB_RIGHT, m->n, 8, 0);
1116 tab_float (t, col + 2, row + 0, TAB_RIGHT, m->n_missing, 8, 0);
1117 tab_float (t, col + 4, row + 0, TAB_RIGHT, total, 8, 0);
1121 tab_text (t, col + 1, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1122 100.0 * m->n / total );
1124 tab_text (t, col + 3, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1125 100.0 * m->n_missing / total );
1127 /* This seems a bit pointless !!! */
1128 tab_text (t, col + 5, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1129 100.0 * total / total );
1136 show_extremes (const struct variable **dependent_var, int n_dep_var,
1137 const struct factor *fctr, int n_extremities)
1140 int heading_columns ;
1142 const int heading_rows = 1;
1143 struct tab_table *tbl;
1150 heading_columns = 2;
1151 n_factors = hsh_count (fctr->fstats);
1153 n_rows = n_dep_var * 2 * n_extremities * n_factors;
1155 if ( fctr->indep_var[1] )
1156 heading_columns = 3;
1160 heading_columns = 1;
1161 n_rows = n_dep_var * 2 * n_extremities;
1164 n_rows += heading_rows;
1166 heading_columns += 2;
1167 n_cols = heading_columns + 2;
1169 tbl = tab_create (n_cols,n_rows,0);
1170 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
1172 tab_dim (tbl, tab_natural_dimensions);
1174 /* Outline the box, No internal lines*/
1179 n_cols - 1, n_rows - 1);
1181 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1183 tab_title (tbl, _ ("Extreme Values"));
1185 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows -1);
1186 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows -1);
1190 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1191 var_to_string (fctr->indep_var[0]));
1193 if ( fctr->indep_var[1] )
1194 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1195 var_to_string (fctr->indep_var[1]));
1198 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _ ("Value"));
1199 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _ ("Case Number"));
1201 for ( i = 0 ; i < n_dep_var ; ++i )
1205 tab_hline (tbl, TAL_1, 0, n_cols -1 ,
1206 i * 2 * n_extremities * n_factors + heading_rows);
1209 i * 2 * n_extremities * n_factors + heading_rows,
1210 TAB_LEFT | TAT_TITLE,
1211 var_to_string (dependent_var[i])
1216 populate_extremes (tbl, heading_columns - 2,
1217 i * 2 * n_extremities * n_factors + heading_rows,
1218 n_extremities, &totals[i]);
1222 struct factor_statistics **fs = fctr->fs;
1224 const union value *prev = NULL;
1228 const int row = heading_rows + ( 2 * n_extremities ) *
1229 ( ( i * n_factors ) + count );
1232 if ( !prev || 0 != compare_values (prev, (*fs)->id[0],
1233 var_get_width (fctr->indep_var[0])))
1236 ds_init_empty (&vstr);
1237 var_append_value_name (fctr->indep_var[0],
1238 (*fs)->id[0], &vstr);
1241 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1245 TAB_LEFT | TAT_TITLE,
1252 prev = (*fs)->id[0];
1254 if (fctr->indep_var[1] && count > 0 )
1255 tab_hline (tbl, TAL_1, 2, n_cols - 1, row);
1257 if ( fctr->indep_var[1])
1260 ds_init_empty (&vstr);
1261 var_append_value_name (fctr->indep_var[1], (*fs)->id[1], &vstr);
1263 tab_text (tbl, 2, row,
1264 TAB_LEFT | TAT_TITLE,
1271 populate_extremes (tbl, heading_columns - 2,
1286 /* Fill in the extremities table */
1288 populate_extremes (struct tab_table *t,
1289 int col, int row, int n, const struct metrics *m)
1295 tab_text (t, col, row,
1296 TAB_RIGHT | TAT_TITLE ,
1300 tab_text (t, col, row + n ,
1301 TAB_RIGHT | TAT_TITLE ,
1306 tab_hline (t, TAL_1, col, col + 3, row + n );
1308 for (extremity = 0; extremity < n ; ++extremity )
1311 tab_float (t, col + 1, row + extremity,
1313 extremity + 1, 8, 0);
1317 tab_float (t, col + 1, row + extremity + n,
1319 extremity + 1, 8, 0);
1325 for (idx = 0, extremity = 0; extremity < n && idx < m->n_data ; ++idx )
1328 const struct weighted_value *wv = m->wvp[idx];
1329 struct case_node *cn = wv->case_nos;
1332 for (j = 0 ; j < wv->w ; ++j )
1334 if ( extremity + j >= n )
1337 tab_float (t, col + 3, row + extremity + j + n,
1341 tab_float (t, col + 2, row + extremity + j + n,
1350 extremity += wv->w ;
1355 for (idx = m->n_data - 1, extremity = 0; extremity < n && idx >= 0; --idx )
1358 const struct weighted_value *wv = m->wvp[idx];
1359 struct case_node *cn = wv->case_nos;
1361 for (j = 0 ; j < wv->w ; ++j )
1363 if ( extremity + j >= n )
1366 tab_float (t, col + 3, row + extremity + j,
1370 tab_float (t, col + 2, row + extremity + j,
1379 extremity += wv->w ;
1384 /* Show the descriptives table */
1386 show_descriptives (const struct variable **dependent_var,
1388 struct factor *fctr)
1391 int heading_columns ;
1393 const int n_stat_rows = 13;
1395 const int heading_rows = 1;
1397 struct tab_table *tbl;
1404 heading_columns = 4;
1405 n_factors = hsh_count (fctr->fstats);
1407 n_rows = n_dep_var * n_stat_rows * n_factors;
1409 if ( fctr->indep_var[1] )
1410 heading_columns = 5;
1414 heading_columns = 3;
1415 n_rows = n_dep_var * n_stat_rows;
1418 n_rows += heading_rows;
1420 n_cols = heading_columns + 2;
1423 tbl = tab_create (n_cols, n_rows, 0);
1425 tab_headers (tbl, heading_columns + 1, 0, heading_rows, 0);
1427 tab_dim (tbl, tab_natural_dimensions);
1429 /* Outline the box and have no internal lines*/
1434 n_cols - 1, n_rows - 1);
1436 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1438 tab_vline (tbl, TAL_1, heading_columns, 0, n_rows - 1);
1439 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows - 1);
1440 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows - 1);
1442 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _ ("Statistic"));
1443 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _ ("Std. Error"));
1445 tab_title (tbl, _ ("Descriptives"));
1448 for ( i = 0 ; i < n_dep_var ; ++i )
1450 const int row = heading_rows + i * n_stat_rows * n_factors ;
1453 tab_hline (tbl, TAL_1, 0, n_cols - 1, row );
1456 i * n_stat_rows * n_factors + heading_rows,
1457 TAB_LEFT | TAT_TITLE,
1458 var_to_string (dependent_var[i])
1464 const union value *prev = NULL;
1466 struct factor_statistics **fs = fctr->fs;
1469 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1470 var_to_string (fctr->indep_var[0]));
1473 if ( fctr->indep_var[1])
1474 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1475 var_to_string (fctr->indep_var[1]));
1479 const int row = heading_rows + n_stat_rows *
1480 ( ( i * n_factors ) + count );
1483 if ( !prev || 0 != compare_values (prev, (*fs)->id[0],
1484 var_get_width (fctr->indep_var[0])))
1487 ds_init_empty (&vstr);
1488 var_append_value_name (fctr->indep_var[0],
1489 (*fs)->id[0], &vstr);
1492 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1496 TAB_LEFT | TAT_TITLE,
1503 prev = (*fs)->id[0];
1505 if (fctr->indep_var[1] && count > 0 )
1506 tab_hline (tbl, TAL_1, 2, n_cols - 1, row);
1508 if ( fctr->indep_var[1])
1511 ds_init_empty (&vstr);
1512 var_append_value_name (fctr->indep_var[1], (*fs)->id[1], &vstr);
1514 tab_text (tbl, 2, row,
1515 TAB_LEFT | TAT_TITLE,
1522 populate_descriptives (tbl, heading_columns - 2,
1523 row, & (*fs)->m[i]);
1534 populate_descriptives (tbl, heading_columns - 2,
1535 i * n_stat_rows * n_factors + heading_rows,
1545 /* Fill in the descriptives data */
1547 populate_descriptives (struct tab_table *tbl, int col, int row,
1548 const struct metrics *m)
1550 const double t = gsl_cdf_tdist_Qinv ((1 - cmd.n_cinterval[0] / 100.0)/2.0,
1555 TAB_LEFT | TAT_TITLE,
1558 tab_float (tbl, col + 2,
1564 tab_float (tbl, col + 3,
1573 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1574 _ ("%g%% Confidence Interval for Mean"), cmd.n_cinterval[0]);
1577 tab_text (tbl, col + 1,
1579 TAB_LEFT | TAT_TITLE,
1582 tab_float (tbl, col + 2,
1585 m->mean - t * m->se_mean,
1588 tab_text (tbl, col + 1,
1590 TAB_LEFT | TAT_TITLE,
1594 tab_float (tbl, col + 2,
1597 m->mean + t * m->se_mean,
1602 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1603 _ ("5%% Trimmed Mean"));
1605 tab_float (tbl, col + 2,
1613 TAB_LEFT | TAT_TITLE,
1617 struct percentile *p;
1620 p = hsh_find (m->ptile_hash, &d);
1625 tab_float (tbl, col + 2,
1635 TAB_LEFT | TAT_TITLE,
1638 tab_float (tbl, col + 2,
1647 TAB_LEFT | TAT_TITLE,
1648 _ ("Std. Deviation"));
1651 tab_float (tbl, col + 2,
1660 TAB_LEFT | TAT_TITLE,
1663 tab_float (tbl, col + 2,
1671 TAB_LEFT | TAT_TITLE,
1674 tab_float (tbl, col + 2,
1683 TAB_LEFT | TAT_TITLE,
1687 tab_float (tbl, col + 2,
1695 TAB_LEFT | TAT_TITLE,
1696 _ ("Interquartile Range"));
1699 struct percentile *p1;
1700 struct percentile *p2;
1703 p1 = hsh_find (m->ptile_hash, &d);
1706 p2 = hsh_find (m->ptile_hash, &d);
1711 tab_float (tbl, col + 2,
1722 TAB_LEFT | TAT_TITLE,
1726 tab_float (tbl, col + 2,
1732 /* stderr of skewness */
1733 tab_float (tbl, col + 3,
1742 TAB_LEFT | TAT_TITLE,
1746 tab_float (tbl, col + 2,
1752 /* stderr of kurtosis */
1753 tab_float (tbl, col + 3,
1765 box_plot_variables (const struct factor *fctr,
1766 const struct variable **vars, int n_vars,
1767 const struct variable *id)
1771 struct factor_statistics **fs ;
1775 box_plot_group (fctr, vars, n_vars, id);
1779 for ( fs = fctr->fs ; *fs ; ++fs )
1782 double y_min = DBL_MAX;
1783 double y_max = -DBL_MAX;
1784 struct chart *ch = chart_create ();
1785 ds_init_empty (&str);
1786 factor_to_string (fctr, *fs, 0, &str );
1788 chart_write_title (ch, ds_cstr (&str));
1790 for ( i = 0 ; i < n_vars ; ++i )
1792 y_max = MAX (y_max, (*fs)->m[i].max);
1793 y_min = MIN (y_min, (*fs)->m[i].min);
1796 boxplot_draw_yscale (ch, y_max, y_min);
1798 for ( i = 0 ; i < n_vars ; ++i )
1801 const double box_width = (ch->data_right - ch->data_left)
1804 const double box_centre = ( i * 2 + 1) * box_width
1807 boxplot_draw_boxplot (ch,
1808 box_centre, box_width,
1810 var_to_string (vars[i]));
1822 /* Do a box plot, grouping all factors into one plot ;
1823 each dependent variable has its own plot.
1826 box_plot_group (const struct factor *fctr,
1827 const struct variable **vars,
1829 const struct variable *id UNUSED)
1834 for ( i = 0 ; i < n_vars ; ++i )
1836 struct factor_statistics **fs ;
1839 ch = chart_create ();
1841 boxplot_draw_yscale (ch, totals[i].max, totals[i].min);
1847 for ( fs = fctr->fs ; *fs ; ++fs )
1850 chart_write_title (ch, _ ("Boxplot of %s vs. %s"),
1851 var_to_string (vars[i]), var_to_string (fctr->indep_var[0]) );
1853 for ( fs = fctr->fs ; *fs ; ++fs )
1856 const double box_width = (ch->data_right - ch->data_left)
1857 / (n_factors * 2.0 ) ;
1859 const double box_centre = ( f++ * 2 + 1) * box_width
1862 ds_init_empty (&str);
1863 factor_to_string_concise (fctr, *fs, &str);
1865 boxplot_draw_boxplot (ch,
1866 box_centre, box_width,
1874 const double box_width = (ch->data_right - ch->data_left) / 3.0;
1875 const double box_centre = (ch->data_right + ch->data_left) / 2.0;
1877 chart_write_title (ch, _ ("Boxplot"));
1879 boxplot_draw_boxplot (ch,
1880 box_centre, box_width,
1882 var_to_string (vars[i]) );
1891 /* Plot the normal and detrended normal plots for m
1892 Label the plots with factorname */
1894 np_plot (const struct metrics *m, const char *factorname)
1897 double yfirst=0, ylast=0;
1900 struct chart *np_chart;
1902 /* Detrended Normal Plot */
1903 struct chart *dnp_chart;
1905 /* The slope and intercept of the ideal normal probability line */
1906 const double slope = 1.0 / m->stddev;
1907 const double intercept = - m->mean / m->stddev;
1909 /* Cowardly refuse to plot an empty data set */
1910 if ( m->n_data == 0 )
1913 np_chart = chart_create ();
1914 dnp_chart = chart_create ();
1916 if ( !np_chart || ! dnp_chart )
1919 chart_write_title (np_chart, _ ("Normal Q-Q Plot of %s"), factorname);
1920 chart_write_xlabel (np_chart, _ ("Observed Value"));
1921 chart_write_ylabel (np_chart, _ ("Expected Normal"));
1924 chart_write_title (dnp_chart, _ ("Detrended Normal Q-Q Plot of %s"),
1926 chart_write_xlabel (dnp_chart, _ ("Observed Value"));
1927 chart_write_ylabel (dnp_chart, _ ("Dev from Normal"));
1929 yfirst = gsl_cdf_ugaussian_Pinv (m->wvp[0]->rank / ( m->n + 1));
1930 ylast = gsl_cdf_ugaussian_Pinv (m->wvp[m->n_data-1]->rank / ( m->n + 1));
1934 /* Need to make sure that both the scatter plot and the ideal fit into the
1936 double x_lower = MIN (m->min, (yfirst - intercept) / slope) ;
1937 double x_upper = MAX (m->max, (ylast - intercept) / slope) ;
1938 double slack = (x_upper - x_lower) * 0.05 ;
1940 chart_write_xscale (np_chart, x_lower - slack, x_upper + slack, 5);
1942 chart_write_xscale (dnp_chart, m->min, m->max, 5);
1946 chart_write_yscale (np_chart, yfirst, ylast, 5);
1949 /* We have to cache the detrended data, beacause we need to
1950 find its limits before we can plot it */
1951 double *d_data = xnmalloc (m->n_data, sizeof *d_data);
1952 double d_max = -DBL_MAX;
1953 double d_min = DBL_MAX;
1954 for ( i = 0 ; i < m->n_data; ++i )
1956 const double ns = gsl_cdf_ugaussian_Pinv (m->wvp[i]->rank / ( m->n + 1));
1958 chart_datum (np_chart, 0, m->wvp[i]->v.f, ns);
1960 d_data[i] = (m->wvp[i]->v.f - m->mean) / m->stddev - ns;
1962 if ( d_data[i] < d_min ) d_min = d_data[i];
1963 if ( d_data[i] > d_max ) d_max = d_data[i];
1965 chart_write_yscale (dnp_chart, d_min, d_max, 5);
1967 for ( i = 0 ; i < m->n_data; ++i )
1968 chart_datum (dnp_chart, 0, m->wvp[i]->v.f, d_data[i]);
1973 chart_line (np_chart, slope, intercept, yfirst, ylast , CHART_DIM_Y);
1974 chart_line (dnp_chart, 0, 0, m->min, m->max , CHART_DIM_X);
1976 chart_submit (np_chart);
1977 chart_submit (dnp_chart);
1983 /* Show the percentiles */
1985 show_percentiles (const struct variable **dependent_var,
1987 struct factor *fctr)
1989 struct tab_table *tbl;
1995 struct hsh_table *ptiles ;
1997 int n_heading_columns;
1998 const int n_heading_rows = 2;
1999 const int n_stat_rows = 2;
2005 struct factor_statistics **fs = fctr->fs ;
2006 n_heading_columns = 3;
2007 n_factors = hsh_count (fctr->fstats);
2009 ptiles = (*fs)->m[0].ptile_hash;
2011 if ( fctr->indep_var[1] )
2012 n_heading_columns = 4;
2017 n_heading_columns = 2;
2019 ptiles = totals[0].ptile_hash;
2022 n_ptiles = hsh_count (ptiles);
2024 n_rows = n_heading_rows + n_dep_var * n_stat_rows * n_factors;
2026 n_cols = n_heading_columns + n_ptiles ;
2028 tbl = tab_create (n_cols, n_rows, 0);
2030 tab_headers (tbl, n_heading_columns + 1, 0, n_heading_rows, 0);
2032 tab_dim (tbl, tab_natural_dimensions);
2034 /* Outline the box and have no internal lines*/
2039 n_cols - 1, n_rows - 1);
2041 tab_hline (tbl, TAL_2, 0, n_cols - 1, n_heading_rows );
2043 tab_vline (tbl, TAL_2, n_heading_columns, 0, n_rows - 1);
2046 tab_title (tbl, _ ("Percentiles"));
2049 tab_hline (tbl, TAL_1, n_heading_columns, n_cols - 1, 1 );
2056 n_heading_columns - 1, n_rows - 1);
2062 n_heading_columns, n_heading_rows - 1,
2063 n_cols - 1, n_rows - 1);
2065 tab_joint_text (tbl, n_heading_columns + 1, 0,
2067 TAB_CENTER | TAT_TITLE ,
2072 /* Put in the percentile break points as headings */
2074 struct percentile **p = (struct percentile **) hsh_sort (ptiles);
2079 tab_float (tbl, n_heading_columns + i++ , 1,
2088 for ( i = 0 ; i < n_dep_var ; ++i )
2090 const int n_stat_rows = 2;
2091 const int row = n_heading_rows + i * n_stat_rows * n_factors ;
2094 tab_hline (tbl, TAL_1, 0, n_cols - 1, row );
2097 i * n_stat_rows * n_factors + n_heading_rows,
2098 TAB_LEFT | TAT_TITLE,
2099 var_to_string (dependent_var[i])
2104 const union value *prev = NULL ;
2105 struct factor_statistics **fs = fctr->fs;
2108 tab_text (tbl, 1, n_heading_rows - 1,
2109 TAB_CENTER | TAT_TITLE,
2110 var_to_string (fctr->indep_var[0]));
2113 if ( fctr->indep_var[1])
2114 tab_text (tbl, 2, n_heading_rows - 1, TAB_CENTER | TAT_TITLE,
2115 var_to_string (fctr->indep_var[1]));
2119 const int row = n_heading_rows + n_stat_rows *
2120 ( ( i * n_factors ) + count );
2123 if ( !prev || 0 != compare_values (prev, (*fs)->id[0],
2124 var_get_width (fctr->indep_var[0])))
2127 ds_init_empty (&vstr);
2128 var_append_value_name (fctr->indep_var[0],
2129 (*fs)->id[0], &vstr);
2133 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
2137 TAB_LEFT | TAT_TITLE,
2144 prev = (*fs)->id[0];
2146 if (fctr->indep_var[1] && count > 0 )
2147 tab_hline (tbl, TAL_1, 2, n_cols - 1, row);
2149 if ( fctr->indep_var[1])
2152 ds_init_empty (&vstr);
2153 var_append_value_name (fctr->indep_var[1], (*fs)->id[1], &vstr);
2155 tab_text (tbl, 2, row,
2156 TAB_LEFT | TAT_TITLE,
2164 populate_percentiles (tbl, n_heading_columns - 1,
2165 row, & (*fs)->m[i]);
2176 populate_percentiles (tbl, n_heading_columns - 1,
2177 i * n_stat_rows * n_factors + n_heading_rows,
2194 populate_percentiles (struct tab_table *tbl, int col, int row,
2195 const struct metrics *m)
2199 struct percentile **p = (struct percentile **) hsh_sort (m->ptile_hash);
2203 TAB_LEFT | TAT_TITLE,
2204 _ ("Tukey\'s Hinges")
2209 TAB_LEFT | TAT_TITLE,
2210 ptile_alg_desc[m->ptile_alg]
2217 tab_float (tbl, col + i + 1 , row,
2220 if ( (*p)->p == 25 )
2221 tab_float (tbl, col + i + 1 , row + 1,
2225 if ( (*p)->p == 50 )
2226 tab_float (tbl, col + i + 1 , row + 1,
2230 if ( (*p)->p == 75 )
2231 tab_float (tbl, col + i + 1 , row + 1,
2244 factor_to_string (const struct factor *fctr,
2245 const struct factor_statistics *fs,
2246 const struct variable *var,
2251 ds_put_format (str, "%s (",var_to_string (var) );
2254 ds_put_format (str, "%s = ",
2255 var_to_string (fctr->indep_var[0]));
2257 var_append_value_name (fctr->indep_var[0], fs->id[0], str);
2259 if ( fctr->indep_var[1] )
2261 ds_put_format (str, "; %s = )",
2262 var_to_string (fctr->indep_var[1]));
2264 var_append_value_name (fctr->indep_var[1], fs->id[1], str);
2269 ds_put_cstr (str, ")");
2275 factor_to_string_concise (const struct factor *fctr,
2276 const struct factor_statistics *fs,
2281 var_append_value_name (fctr->indep_var[0], fs->id[0], str);
2283 if ( fctr->indep_var[1] )
2285 ds_put_cstr (str, ",");
2287 var_append_value_name (fctr->indep_var[1],fs->id[1], str);
2289 ds_put_cstr (str, ")");