1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2004 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <gsl/gsl_cdf.h>
20 #include <libpspp/message.h>
25 #include <data/case.h>
26 #include <data/casegrouper.h>
27 #include <data/casereader.h>
28 #include <data/dictionary.h>
29 #include <data/procedure.h>
30 #include <data/value-labels.h>
31 #include <data/variable.h>
32 #include <language/command.h>
33 #include <language/dictionary/split-file.h>
34 #include <language/lexer/lexer.h>
35 #include <libpspp/compiler.h>
36 #include <libpspp/hash.h>
37 #include <libpspp/message.h>
38 #include <libpspp/misc.h>
39 #include <libpspp/str.h>
40 #include <math/factor-stats.h>
41 #include <math/moments.h>
42 #include <math/percentiles.h>
43 #include <output/charts/box-whisker.h>
44 #include <output/charts/cartesian.h>
45 #include <output/manager.h>
46 #include <output/table.h>
52 #define _(msgid) gettext (msgid)
53 #define N_(msgid) msgid
56 #include <output/chart.h>
57 #include <output/charts/plot-hist.h>
58 #include <output/charts/plot-chart.h>
65 missing=miss:pairwise/!listwise,
67 incl:include/!exclude;
68 +compare=cmp:variables/!groups;
71 +plot[plt_]=stemleaf,boxplot,npplot,:spreadlevel(*d:n),histogram,all,none;
73 +statistics[st_]=descriptives,:extreme(*d:n),all,none.
82 static struct cmd_examine cmd;
84 static const struct variable **dependent_vars;
86 static size_t n_dependent_vars;
91 /* The independent variable */
92 struct variable *indep_var[2];
95 /* Hash table of factor stats indexed by 2 values */
96 struct hsh_table *fstats;
98 /* The hash table after it has been crunched */
99 struct factor_statistics **fs;
105 /* Linked list of factors */
106 static struct factor *factors = 0;
108 static struct metrics *totals = 0;
110 /* Parse the clause specifying the factors */
111 static int examine_parse_independent_vars (struct lexer *lexer, const struct dictionary *dict, struct cmd_examine *cmd);
115 /* Output functions */
116 static void show_summary (const struct variable **dependent_var, int n_dep_var,
117 const struct factor *f);
119 static void show_extremes (const struct variable **dependent_var,
121 const struct factor *factor,
124 static void show_descriptives (const struct variable **dependent_var,
126 struct factor *factor);
128 static void show_percentiles (const struct variable **dependent_var,
130 struct factor *factor);
135 void np_plot (const struct metrics *m, const char *factorname);
138 void box_plot_group (const struct factor *fctr,
139 const struct variable **vars, int n_vars,
140 const struct variable *id
144 void box_plot_variables (const struct factor *fctr,
145 const struct variable **vars, int n_vars,
146 const struct variable *id
151 /* Per Split function */
152 static void run_examine (struct cmd_examine *, struct casereader *,
155 static void output_examine (void);
158 void factor_calc (const struct ccase *c, int case_no,
159 double weight, int case_missing);
162 /* Represent a factor as a string, so it can be
163 printed in a human readable fashion */
164 static void factor_to_string (const struct factor *fctr,
165 const struct factor_statistics *fs,
166 const struct variable *var,
170 /* Represent a factor as a string, so it can be
171 printed in a human readable fashion,
172 but sacrificing some readablility for the sake of brevity */
173 static void factor_to_string_concise (const struct factor *fctr,
174 const struct factor_statistics *fs,
180 /* Categories of missing values to exclude. */
181 static enum mv_class exclude_values;
185 static subc_list_double percentile_list;
187 static enum pc_alg percentile_algorithm;
189 static short sbc_percentile;
193 cmd_examine (struct lexer *lexer, struct dataset *ds)
195 struct casegrouper *grouper;
196 struct casereader *group;
199 subc_list_double_create (&percentile_list);
200 percentile_algorithm = PC_HAVERAGE;
202 if ( !parse_examine (lexer, ds, &cmd, NULL) )
204 subc_list_double_destroy (&percentile_list);
208 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
209 exclude_values = cmd.incl == XMN_INCLUDE ? MV_SYSTEM : MV_ANY;
211 if ( cmd.st_n == SYSMIS )
214 if ( ! cmd.sbc_cinterval)
215 cmd.n_cinterval[0] = 95.0;
217 /* If descriptives have been requested, make sure the
218 quartiles are calculated */
219 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] )
221 subc_list_double_push (&percentile_list, 25);
222 subc_list_double_push (&percentile_list, 50);
223 subc_list_double_push (&percentile_list, 75);
226 grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
227 while (casegrouper_get_next_group (grouper, &group))
228 run_examine (&cmd, group, ds);
229 ok = casegrouper_destroy (grouper);
230 ok = proc_commit (ds) && ok;
237 if ( dependent_vars )
238 free (dependent_vars);
241 struct factor *f = factors ;
244 struct factor *ff = f;
248 hsh_destroy ( ff->fstats ) ;
254 subc_list_double_destroy (&percentile_list);
256 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
261 /* Show all the appropriate tables */
263 output_examine (void)
267 /* Show totals if appropriate */
268 if ( ! cmd.sbc_nototal || factors == 0 )
270 show_summary (dependent_vars, n_dependent_vars, 0);
272 if ( cmd.sbc_statistics )
274 if ( cmd.a_statistics[XMN_ST_EXTREME])
275 show_extremes (dependent_vars, n_dependent_vars, 0, cmd.st_n);
277 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
278 show_descriptives (dependent_vars, n_dependent_vars, 0);
281 if ( sbc_percentile )
282 show_percentiles (dependent_vars, n_dependent_vars, 0);
287 if ( cmd.a_plot[XMN_PLT_STEMLEAF] )
288 msg (SW, _ ("%s is not currently supported."), "STEMLEAF");
290 if ( cmd.a_plot[XMN_PLT_SPREADLEVEL] )
291 msg (SW, _ ("%s is not currently supported."), "SPREADLEVEL");
293 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
295 for ( v = 0 ; v < n_dependent_vars; ++v )
296 np_plot (&totals[v], var_to_string (dependent_vars[v]));
299 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
301 if ( cmd.cmp == XMN_GROUPS )
303 box_plot_group (0, (const struct variable **) dependent_vars,
304 n_dependent_vars, cmd.v_id);
307 box_plot_variables (0,
308 (const struct variable **) dependent_vars,
309 n_dependent_vars, cmd.v_id);
312 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
314 for ( v = 0 ; v < n_dependent_vars; ++v )
316 struct normal_curve normal;
318 normal.N = totals[v].n;
319 normal.mean = totals[v].mean;
320 normal.stddev = totals[v].stddev;
322 histogram_plot (totals[v].histogram,
323 var_to_string (dependent_vars[v]),
333 /* Show grouped statistics as appropriate */
337 show_summary (dependent_vars, n_dependent_vars, fctr);
339 if ( cmd.sbc_statistics )
341 if ( cmd.a_statistics[XMN_ST_EXTREME])
342 show_extremes (dependent_vars, n_dependent_vars, fctr, cmd.st_n);
344 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
345 show_descriptives (dependent_vars, n_dependent_vars, fctr);
348 if ( sbc_percentile )
349 show_percentiles (dependent_vars, n_dependent_vars, fctr);
356 struct factor_statistics **fs = fctr->fs ;
358 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
360 if ( cmd.cmp == XMN_VARIABLES )
361 box_plot_variables (fctr,
362 (const struct variable **) dependent_vars,
363 n_dependent_vars, cmd.v_id);
365 box_plot_group (fctr,
366 (const struct variable **) dependent_vars,
367 n_dependent_vars, cmd.v_id);
370 for ( v = 0 ; v < n_dependent_vars; ++v )
373 for ( fs = fctr->fs ; *fs ; ++fs )
376 ds_init_empty (&str);
377 factor_to_string (fctr, *fs, dependent_vars[v], &str);
379 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
380 np_plot (& (*fs)->m[v], ds_cstr (&str));
382 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
384 struct normal_curve normal;
386 normal.N = (*fs)->m[v].n;
387 normal.mean = (*fs)->m[v].mean;
388 normal.stddev = (*fs)->m[v].stddev;
390 histogram_plot ((*fs)->m[v].histogram,
391 ds_cstr (&str) , &normal, 0);
396 } /* for ( fs .... */
398 } /* for ( v = 0 ..... */
408 /* Create a hash table of percentiles and their values from the list of
410 static struct hsh_table *
411 list_to_ptile_hash (const subc_list_double *l)
415 struct hsh_table *h ;
417 h = hsh_create (subc_list_double_count (l),
418 (hsh_compare_func *) ptile_compare,
419 (hsh_hash_func *) ptile_hash,
420 (hsh_free_func *) free,
424 for ( i = 0 ; i < subc_list_double_count (l) ; ++i )
426 struct percentile *p = xmalloc (sizeof *p);
428 p->p = subc_list_double_at (l,i);
439 /* Parse the PERCENTILES subcommand */
441 xmn_custom_percentiles (struct lexer *lexer, struct dataset *ds UNUSED,
442 struct cmd_examine *p UNUSED, void *aux UNUSED)
446 lex_match (lexer, '=');
448 lex_match (lexer, '(');
450 while ( lex_is_number (lexer) )
452 subc_list_double_push (&percentile_list, lex_number (lexer));
456 lex_match (lexer, ',') ;
458 lex_match (lexer, ')');
460 lex_match (lexer, '=');
462 if ( lex_match_id (lexer, "HAVERAGE"))
463 percentile_algorithm = PC_HAVERAGE;
465 else if ( lex_match_id (lexer, "WAVERAGE"))
466 percentile_algorithm = PC_WAVERAGE;
468 else if ( lex_match_id (lexer, "ROUND"))
469 percentile_algorithm = PC_ROUND;
471 else if ( lex_match_id (lexer, "EMPIRICAL"))
472 percentile_algorithm = PC_EMPIRICAL;
474 else if ( lex_match_id (lexer, "AEMPIRICAL"))
475 percentile_algorithm = PC_AEMPIRICAL;
477 else if ( lex_match_id (lexer, "NONE"))
478 percentile_algorithm = PC_NONE;
481 if ( 0 == subc_list_double_count (&percentile_list))
483 subc_list_double_push (&percentile_list, 5);
484 subc_list_double_push (&percentile_list, 10);
485 subc_list_double_push (&percentile_list, 25);
486 subc_list_double_push (&percentile_list, 50);
487 subc_list_double_push (&percentile_list, 75);
488 subc_list_double_push (&percentile_list, 90);
489 subc_list_double_push (&percentile_list, 95);
495 /* TOTAL and NOTOTAL are simple, mutually exclusive flags */
497 xmn_custom_total (struct lexer *lexer UNUSED, struct dataset *ds UNUSED, struct cmd_examine *p, void *aux UNUSED)
499 if ( p->sbc_nototal )
501 msg (SE, _ ("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
509 xmn_custom_nototal (struct lexer *lexer UNUSED, struct dataset *ds UNUSED,
510 struct cmd_examine *p, void *aux UNUSED)
514 msg (SE, _ ("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
523 /* Parser for the variables sub command
524 Returns 1 on success */
526 xmn_custom_variables (struct lexer *lexer, struct dataset *ds, struct cmd_examine *cmd, void *aux UNUSED)
528 const struct dictionary *dict = dataset_dict (ds);
529 lex_match (lexer, '=');
531 if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
532 && lex_token (lexer) != T_ALL)
537 if (!parse_variables_const (lexer, dict, &dependent_vars, &n_dependent_vars,
538 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
540 free (dependent_vars);
544 assert (n_dependent_vars);
546 totals = xnmalloc (n_dependent_vars, sizeof *totals);
548 if ( lex_match (lexer, T_BY))
551 success = examine_parse_independent_vars (lexer, dict, cmd);
552 if ( success != 1 ) {
553 free (dependent_vars);
564 /* Parse the clause specifying the factors */
566 examine_parse_independent_vars (struct lexer *lexer, const struct dictionary *dict, struct cmd_examine *cmd)
569 struct factor *sf = xmalloc (sizeof *sf);
571 if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
572 && lex_token (lexer) != T_ALL)
579 sf->indep_var[0] = parse_variable (lexer, dict);
580 sf->indep_var[1] = 0;
582 if ( lex_token (lexer) == T_BY )
585 lex_match (lexer, T_BY);
587 if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
588 && lex_token (lexer) != T_ALL)
594 sf->indep_var[1] = parse_variable (lexer, dict);
599 sf->fstats = hsh_create (4,
600 (hsh_compare_func *) factor_statistics_compare,
601 (hsh_hash_func *) factor_statistics_hash,
602 (hsh_free_func *) factor_statistics_free,
608 lex_match (lexer, ',');
610 if ( lex_token (lexer) == '.' || lex_token (lexer) == '/' )
613 success = examine_parse_independent_vars (lexer, dict, cmd);
624 void populate_percentiles (struct tab_table *tbl, int col, int row,
625 const struct metrics *m);
627 void populate_descriptives (struct tab_table *t, int col, int row,
628 const struct metrics *fs);
630 void populate_extremes (struct tab_table *t, int col, int row, int n,
631 const struct metrics *m);
633 void populate_summary (struct tab_table *t, int col, int row,
634 const struct metrics *m);
639 /* Perform calculations for the sub factors */
641 factor_calc (const struct ccase *c, int case_no, double weight,
645 struct factor *fctr = factors;
649 struct factor_statistics **foo ;
650 union value *indep_vals[2] ;
652 indep_vals[0] = value_dup (
653 case_data (c, fctr->indep_var[0]),
654 var_get_width (fctr->indep_var[0])
657 if ( fctr->indep_var[1] )
658 indep_vals[1] = value_dup (
659 case_data (c, fctr->indep_var[1]),
660 var_get_width (fctr->indep_var[1])
664 const union value sm = {SYSMIS};
665 indep_vals[1] = value_dup (&sm, 0);
668 assert (fctr->fstats);
670 foo = ( struct factor_statistics ** )
671 hsh_probe (fctr->fstats, (void *) &indep_vals);
676 *foo = create_factor_statistics (n_dependent_vars,
680 for ( v = 0 ; v < n_dependent_vars ; ++v )
682 metrics_precalc ( & (*foo)->m[v] );
688 free (indep_vals[0]);
689 free (indep_vals[1]);
692 for ( v = 0 ; v < n_dependent_vars ; ++v )
694 const struct variable *var = dependent_vars[v];
695 union value *val = value_dup (
700 if (case_missing || var_is_value_missing (var, val, exclude_values))
706 metrics_calc ( & (*foo)->m[v], val, weight, case_no);
716 run_examine (struct cmd_examine *cmd, struct casereader *input,
719 struct dictionary *dict = dataset_dict (ds);
727 if (!casereader_peek (input, 0, &c))
729 casereader_destroy (input);
732 output_split_file_values (ds, &c);
735 input = casereader_create_filter_weight (input, dict, NULL, NULL);
736 input = casereader_create_counter (input, &case_no, 0);
738 /* Make sure we haven't got rubbish left over from a
743 struct factor *next = fctr->next;
745 hsh_clear (fctr->fstats);
752 for ( v = 0 ; v < n_dependent_vars ; ++v )
753 metrics_precalc (&totals[v]);
755 for (; casereader_read (input, &c); case_destroy (&c))
757 int case_missing = 0;
758 const double weight = dict_get_case_weight (dict, &c, NULL);
760 if ( cmd->miss == XMN_LISTWISE )
762 for ( v = 0 ; v < n_dependent_vars ; ++v )
764 const struct variable *var = dependent_vars[v];
765 union value *val = value_dup (
770 if ( var_is_value_missing (var, val, exclude_values))
777 for ( v = 0 ; v < n_dependent_vars ; ++v )
779 const struct variable *var = dependent_vars[v];
780 union value *val = value_dup (
785 if ( var_is_value_missing (var, val, exclude_values)
792 metrics_calc (&totals[v], val, weight, case_no);
797 factor_calc (&c, case_no, weight, case_missing);
799 ok = casereader_destroy (input);
801 for ( v = 0 ; v < n_dependent_vars ; ++v)
806 struct hsh_iterator hi;
807 struct factor_statistics *fs;
809 for ( fs = hsh_first (fctr->fstats, &hi);
811 fs = hsh_next (fctr->fstats, &hi))
814 fs->m[v].ptile_hash = list_to_ptile_hash (&percentile_list);
815 fs->m[v].ptile_alg = percentile_algorithm;
816 metrics_postcalc (&fs->m[v]);
822 totals[v].ptile_hash = list_to_ptile_hash (&percentile_list);
823 totals[v].ptile_alg = percentile_algorithm;
824 metrics_postcalc (&totals[v]);
828 /* Make sure that the combination of factors are complete */
833 struct hsh_iterator hi;
834 struct hsh_iterator hi0;
835 struct hsh_iterator hi1;
836 struct factor_statistics *fs;
838 struct hsh_table *idh0=0;
839 struct hsh_table *idh1=0;
843 idh0 = hsh_create (4, (hsh_compare_func *) compare_values,
844 (hsh_hash_func *) hash_value,
847 idh1 = hsh_create (4, (hsh_compare_func *) compare_values,
848 (hsh_hash_func *) hash_value,
852 for ( fs = hsh_first (fctr->fstats, &hi);
854 fs = hsh_next (fctr->fstats, &hi))
856 hsh_insert (idh0, (void *) &fs->id[0]);
857 hsh_insert (idh1, (void *) &fs->id[1]);
860 /* Ensure that the factors combination is complete */
861 for ( val0 = hsh_first (idh0, &hi0);
863 val0 = hsh_next (idh0, &hi0))
865 for ( val1 = hsh_first (idh1, &hi1);
867 val1 = hsh_next (idh1, &hi1))
869 struct factor_statistics **ffs;
874 ffs = (struct factor_statistics **)
875 hsh_probe (fctr->fstats, (void *) &key );
879 (*ffs) = create_factor_statistics (n_dependent_vars,
881 for ( i = 0 ; i < n_dependent_vars ; ++i )
882 metrics_precalc ( & (*ffs)->m[i]);
890 fctr->fs = (struct factor_statistics **) hsh_sort_copy (fctr->fstats);
902 for ( i = 0 ; i < n_dependent_vars ; ++i )
904 metrics_destroy (&totals[i]);
911 show_summary (const struct variable **dependent_var, int n_dep_var,
912 const struct factor *fctr)
914 static const char *subtitle[]=
922 int heading_columns ;
924 const int heading_rows = 3;
925 struct tab_table *tbl;
933 n_factors = hsh_count (fctr->fstats);
934 n_rows = n_dep_var * n_factors ;
936 if ( fctr->indep_var[1] )
945 n_rows += heading_rows;
947 n_cols = heading_columns + 6;
949 tbl = tab_create (n_cols,n_rows,0);
950 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
952 tab_dim (tbl, tab_natural_dimensions);
954 /* Outline the box */
959 n_cols - 1, n_rows - 1);
961 /* Vertical lines for the data only */
966 n_cols - 1, n_rows - 1);
969 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
970 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, 1 );
971 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, heading_rows -1 );
973 tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1);
976 tab_title (tbl, _ ("Case Processing Summary"));
979 tab_joint_text (tbl, heading_columns, 0,
981 TAB_CENTER | TAT_TITLE,
984 /* Remove lines ... */
991 for ( i = 0 ; i < 3 ; ++i )
993 tab_text (tbl, heading_columns + i*2 , 2, TAB_CENTER | TAT_TITLE,
996 tab_text (tbl, heading_columns + i*2 + 1, 2, TAB_CENTER | TAT_TITLE,
999 tab_joint_text (tbl, heading_columns + i*2 , 1,
1000 heading_columns + i*2 + 1, 1,
1001 TAB_CENTER | TAT_TITLE,
1004 tab_box (tbl, -1, -1,
1006 heading_columns + i*2, 1,
1007 heading_columns + i*2 + 1, 1);
1012 /* Titles for the independent variables */
1015 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1016 var_to_string (fctr->indep_var[0]));
1018 if ( fctr->indep_var[1] )
1020 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1021 var_to_string (fctr->indep_var[1]));
1027 for ( i = 0 ; i < n_dep_var ; ++i )
1031 n_factors = hsh_count (fctr->fstats);
1035 tab_hline (tbl, TAL_1, 0, n_cols -1 , i * n_factors + heading_rows);
1038 0, i * n_factors + heading_rows,
1039 TAB_LEFT | TAT_TITLE,
1040 var_to_string (dependent_var[i])
1045 populate_summary (tbl, heading_columns,
1046 (i * n_factors) + heading_rows,
1052 struct factor_statistics **fs = fctr->fs;
1054 const union value *prev = NULL;
1059 0 != compare_values (prev, (*fs)->id[0],
1060 var_get_width (fctr->indep_var[0])))
1063 ds_init_empty (&vstr);
1064 var_append_value_name (fctr->indep_var[0],
1065 (*fs)->id[0], &vstr);
1069 (i * n_factors ) + count +
1071 TAB_LEFT | TAT_TITLE,
1077 if (fctr->indep_var[1] && count > 0 )
1078 tab_hline (tbl, TAL_1, 1, n_cols - 1,
1079 (i * n_factors ) + count + heading_rows);
1083 prev = (*fs)->id[0];
1085 if ( fctr->indep_var[1])
1088 ds_init_empty (&vstr);
1089 var_append_value_name (fctr->indep_var[1],
1090 (*fs)->id[1], &vstr);
1093 (i * n_factors ) + count +
1095 TAB_LEFT | TAT_TITLE,
1101 populate_summary (tbl, heading_columns,
1102 (i * n_factors) + count
1117 populate_summary (struct tab_table *t, int col, int row,
1118 const struct metrics *m)
1121 const double total = m->n + m->n_missing ;
1123 tab_float (t, col + 0, row + 0, TAB_RIGHT, m->n, 8, 0);
1124 tab_float (t, col + 2, row + 0, TAB_RIGHT, m->n_missing, 8, 0);
1125 tab_float (t, col + 4, row + 0, TAB_RIGHT, total, 8, 0);
1129 tab_text (t, col + 1, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1130 100.0 * m->n / total );
1132 tab_text (t, col + 3, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1133 100.0 * m->n_missing / total );
1135 /* This seems a bit pointless !!! */
1136 tab_text (t, col + 5, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1137 100.0 * total / total );
1148 show_extremes (const struct variable **dependent_var, int n_dep_var,
1149 const struct factor *fctr, int n_extremities)
1152 int heading_columns ;
1154 const int heading_rows = 1;
1155 struct tab_table *tbl;
1162 heading_columns = 2;
1163 n_factors = hsh_count (fctr->fstats);
1165 n_rows = n_dep_var * 2 * n_extremities * n_factors;
1167 if ( fctr->indep_var[1] )
1168 heading_columns = 3;
1172 heading_columns = 1;
1173 n_rows = n_dep_var * 2 * n_extremities;
1176 n_rows += heading_rows;
1178 heading_columns += 2;
1179 n_cols = heading_columns + 2;
1181 tbl = tab_create (n_cols,n_rows,0);
1182 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
1184 tab_dim (tbl, tab_natural_dimensions);
1186 /* Outline the box, No internal lines*/
1191 n_cols - 1, n_rows - 1);
1193 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1195 tab_title (tbl, _ ("Extreme Values"));
1197 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows -1);
1198 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows -1);
1202 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1203 var_to_string (fctr->indep_var[0]));
1205 if ( fctr->indep_var[1] )
1206 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1207 var_to_string (fctr->indep_var[1]));
1210 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _ ("Value"));
1211 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _ ("Case Number"));
1213 for ( i = 0 ; i < n_dep_var ; ++i )
1217 tab_hline (tbl, TAL_1, 0, n_cols -1 ,
1218 i * 2 * n_extremities * n_factors + heading_rows);
1221 i * 2 * n_extremities * n_factors + heading_rows,
1222 TAB_LEFT | TAT_TITLE,
1223 var_to_string (dependent_var[i])
1228 populate_extremes (tbl, heading_columns - 2,
1229 i * 2 * n_extremities * n_factors + heading_rows,
1230 n_extremities, &totals[i]);
1234 struct factor_statistics **fs = fctr->fs;
1236 const union value *prev = NULL;
1240 const int row = heading_rows + ( 2 * n_extremities ) *
1241 ( ( i * n_factors ) + count );
1244 if ( !prev || 0 != compare_values (prev, (*fs)->id[0],
1245 var_get_width (fctr->indep_var[0])))
1248 ds_init_empty (&vstr);
1249 var_append_value_name (fctr->indep_var[0],
1250 (*fs)->id[0], &vstr);
1253 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1257 TAB_LEFT | TAT_TITLE,
1264 prev = (*fs)->id[0];
1266 if (fctr->indep_var[1] && count > 0 )
1267 tab_hline (tbl, TAL_1, 2, n_cols - 1, row);
1269 if ( fctr->indep_var[1])
1272 ds_init_empty (&vstr);
1273 var_append_value_name (fctr->indep_var[1], (*fs)->id[1], &vstr);
1275 tab_text (tbl, 2, row,
1276 TAB_LEFT | TAT_TITLE,
1283 populate_extremes (tbl, heading_columns - 2,
1298 /* Fill in the extremities table */
1300 populate_extremes (struct tab_table *t,
1301 int col, int row, int n, const struct metrics *m)
1307 tab_text (t, col, row,
1308 TAB_RIGHT | TAT_TITLE ,
1312 tab_text (t, col, row + n ,
1313 TAB_RIGHT | TAT_TITLE ,
1318 tab_hline (t, TAL_1, col, col + 3, row + n );
1320 for (extremity = 0; extremity < n ; ++extremity )
1323 tab_float (t, col + 1, row + extremity,
1325 extremity + 1, 8, 0);
1329 tab_float (t, col + 1, row + extremity + n,
1331 extremity + 1, 8, 0);
1337 for (idx = 0, extremity = 0; extremity < n && idx < m->n_data ; ++idx )
1340 const struct weighted_value *wv = m->wvp[idx];
1341 struct case_node *cn = wv->case_nos;
1344 for (j = 0 ; j < wv->w ; ++j )
1346 if ( extremity + j >= n )
1349 tab_float (t, col + 3, row + extremity + j + n,
1353 tab_float (t, col + 2, row + extremity + j + n,
1362 extremity += wv->w ;
1367 for (idx = m->n_data - 1, extremity = 0; extremity < n && idx >= 0; --idx )
1370 const struct weighted_value *wv = m->wvp[idx];
1371 struct case_node *cn = wv->case_nos;
1373 for (j = 0 ; j < wv->w ; ++j )
1375 if ( extremity + j >= n )
1378 tab_float (t, col + 3, row + extremity + j,
1382 tab_float (t, col + 2, row + extremity + j,
1391 extremity += wv->w ;
1396 /* Show the descriptives table */
1398 show_descriptives (const struct variable **dependent_var,
1400 struct factor *fctr)
1403 int heading_columns ;
1405 const int n_stat_rows = 13;
1407 const int heading_rows = 1;
1409 struct tab_table *tbl;
1416 heading_columns = 4;
1417 n_factors = hsh_count (fctr->fstats);
1419 n_rows = n_dep_var * n_stat_rows * n_factors;
1421 if ( fctr->indep_var[1] )
1422 heading_columns = 5;
1426 heading_columns = 3;
1427 n_rows = n_dep_var * n_stat_rows;
1430 n_rows += heading_rows;
1432 n_cols = heading_columns + 2;
1435 tbl = tab_create (n_cols, n_rows, 0);
1437 tab_headers (tbl, heading_columns + 1, 0, heading_rows, 0);
1439 tab_dim (tbl, tab_natural_dimensions);
1441 /* Outline the box and have no internal lines*/
1446 n_cols - 1, n_rows - 1);
1448 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1450 tab_vline (tbl, TAL_1, heading_columns, 0, n_rows - 1);
1451 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows - 1);
1452 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows - 1);
1454 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _ ("Statistic"));
1455 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _ ("Std. Error"));
1457 tab_title (tbl, _ ("Descriptives"));
1460 for ( i = 0 ; i < n_dep_var ; ++i )
1462 const int row = heading_rows + i * n_stat_rows * n_factors ;
1465 tab_hline (tbl, TAL_1, 0, n_cols - 1, row );
1468 i * n_stat_rows * n_factors + heading_rows,
1469 TAB_LEFT | TAT_TITLE,
1470 var_to_string (dependent_var[i])
1476 const union value *prev = NULL;
1478 struct factor_statistics **fs = fctr->fs;
1481 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1482 var_to_string (fctr->indep_var[0]));
1485 if ( fctr->indep_var[1])
1486 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1487 var_to_string (fctr->indep_var[1]));
1491 const int row = heading_rows + n_stat_rows *
1492 ( ( i * n_factors ) + count );
1495 if ( !prev || 0 != compare_values (prev, (*fs)->id[0],
1496 var_get_width (fctr->indep_var[0])))
1499 ds_init_empty (&vstr);
1500 var_append_value_name (fctr->indep_var[0],
1501 (*fs)->id[0], &vstr);
1504 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1508 TAB_LEFT | TAT_TITLE,
1515 prev = (*fs)->id[0];
1517 if (fctr->indep_var[1] && count > 0 )
1518 tab_hline (tbl, TAL_1, 2, n_cols - 1, row);
1520 if ( fctr->indep_var[1])
1523 ds_init_empty (&vstr);
1524 var_append_value_name (fctr->indep_var[1], (*fs)->id[1], &vstr);
1526 tab_text (tbl, 2, row,
1527 TAB_LEFT | TAT_TITLE,
1534 populate_descriptives (tbl, heading_columns - 2,
1535 row, & (*fs)->m[i]);
1546 populate_descriptives (tbl, heading_columns - 2,
1547 i * n_stat_rows * n_factors + heading_rows,
1557 /* Fill in the descriptives data */
1559 populate_descriptives (struct tab_table *tbl, int col, int row,
1560 const struct metrics *m)
1562 const double t = gsl_cdf_tdist_Qinv ((1 - cmd.n_cinterval[0] / 100.0)/2.0,
1567 TAB_LEFT | TAT_TITLE,
1570 tab_float (tbl, col + 2,
1576 tab_float (tbl, col + 3,
1585 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1586 _ ("%g%% Confidence Interval for Mean"), cmd.n_cinterval[0]);
1589 tab_text (tbl, col + 1,
1591 TAB_LEFT | TAT_TITLE,
1594 tab_float (tbl, col + 2,
1597 m->mean - t * m->se_mean,
1600 tab_text (tbl, col + 1,
1602 TAB_LEFT | TAT_TITLE,
1606 tab_float (tbl, col + 2,
1609 m->mean + t * m->se_mean,
1614 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1615 _ ("5%% Trimmed Mean"));
1617 tab_float (tbl, col + 2,
1625 TAB_LEFT | TAT_TITLE,
1629 struct percentile *p;
1632 p = hsh_find (m->ptile_hash, &d);
1637 tab_float (tbl, col + 2,
1647 TAB_LEFT | TAT_TITLE,
1650 tab_float (tbl, col + 2,
1659 TAB_LEFT | TAT_TITLE,
1660 _ ("Std. Deviation"));
1663 tab_float (tbl, col + 2,
1672 TAB_LEFT | TAT_TITLE,
1675 tab_float (tbl, col + 2,
1683 TAB_LEFT | TAT_TITLE,
1686 tab_float (tbl, col + 2,
1695 TAB_LEFT | TAT_TITLE,
1699 tab_float (tbl, col + 2,
1707 TAB_LEFT | TAT_TITLE,
1708 _ ("Interquartile Range"));
1711 struct percentile *p1;
1712 struct percentile *p2;
1715 p1 = hsh_find (m->ptile_hash, &d);
1718 p2 = hsh_find (m->ptile_hash, &d);
1723 tab_float (tbl, col + 2,
1734 TAB_LEFT | TAT_TITLE,
1738 tab_float (tbl, col + 2,
1744 /* stderr of skewness */
1745 tab_float (tbl, col + 3,
1754 TAB_LEFT | TAT_TITLE,
1758 tab_float (tbl, col + 2,
1764 /* stderr of kurtosis */
1765 tab_float (tbl, col + 3,
1777 box_plot_variables (const struct factor *fctr,
1778 const struct variable **vars, int n_vars,
1779 const struct variable *id)
1783 struct factor_statistics **fs ;
1787 box_plot_group (fctr, vars, n_vars, id);
1791 for ( fs = fctr->fs ; *fs ; ++fs )
1794 double y_min = DBL_MAX;
1795 double y_max = -DBL_MAX;
1796 struct chart *ch = chart_create ();
1797 ds_init_empty (&str);
1798 factor_to_string (fctr, *fs, 0, &str );
1800 chart_write_title (ch, ds_cstr (&str));
1802 for ( i = 0 ; i < n_vars ; ++i )
1804 y_max = MAX (y_max, (*fs)->m[i].max);
1805 y_min = MIN (y_min, (*fs)->m[i].min);
1808 boxplot_draw_yscale (ch, y_max, y_min);
1810 for ( i = 0 ; i < n_vars ; ++i )
1813 const double box_width = (ch->data_right - ch->data_left)
1816 const double box_centre = ( i * 2 + 1) * box_width
1819 boxplot_draw_boxplot (ch,
1820 box_centre, box_width,
1822 var_to_string (vars[i]));
1834 /* Do a box plot, grouping all factors into one plot ;
1835 each dependent variable has its own plot.
1838 box_plot_group (const struct factor *fctr,
1839 const struct variable **vars,
1841 const struct variable *id UNUSED)
1846 for ( i = 0 ; i < n_vars ; ++i )
1848 struct factor_statistics **fs ;
1851 ch = chart_create ();
1853 boxplot_draw_yscale (ch, totals[i].max, totals[i].min);
1859 for ( fs = fctr->fs ; *fs ; ++fs )
1862 chart_write_title (ch, _ ("Boxplot of %s vs. %s"),
1863 var_to_string (vars[i]), var_to_string (fctr->indep_var[0]) );
1865 for ( fs = fctr->fs ; *fs ; ++fs )
1868 const double box_width = (ch->data_right - ch->data_left)
1869 / (n_factors * 2.0 ) ;
1871 const double box_centre = ( f++ * 2 + 1) * box_width
1874 ds_init_empty (&str);
1875 factor_to_string_concise (fctr, *fs, &str);
1877 boxplot_draw_boxplot (ch,
1878 box_centre, box_width,
1886 const double box_width = (ch->data_right - ch->data_left) / 3.0;
1887 const double box_centre = (ch->data_right + ch->data_left) / 2.0;
1889 chart_write_title (ch, _ ("Boxplot"));
1891 boxplot_draw_boxplot (ch,
1892 box_centre, box_width,
1894 var_to_string (vars[i]) );
1903 /* Plot the normal and detrended normal plots for m
1904 Label the plots with factorname */
1906 np_plot (const struct metrics *m, const char *factorname)
1909 double yfirst=0, ylast=0;
1912 struct chart *np_chart;
1914 /* Detrended Normal Plot */
1915 struct chart *dnp_chart;
1917 /* The slope and intercept of the ideal normal probability line */
1918 const double slope = 1.0 / m->stddev;
1919 const double intercept = - m->mean / m->stddev;
1921 /* Cowardly refuse to plot an empty data set */
1922 if ( m->n_data == 0 )
1925 np_chart = chart_create ();
1926 dnp_chart = chart_create ();
1928 if ( !np_chart || ! dnp_chart )
1931 chart_write_title (np_chart, _ ("Normal Q-Q Plot of %s"), factorname);
1932 chart_write_xlabel (np_chart, _ ("Observed Value"));
1933 chart_write_ylabel (np_chart, _ ("Expected Normal"));
1936 chart_write_title (dnp_chart, _ ("Detrended Normal Q-Q Plot of %s"),
1938 chart_write_xlabel (dnp_chart, _ ("Observed Value"));
1939 chart_write_ylabel (dnp_chart, _ ("Dev from Normal"));
1941 yfirst = gsl_cdf_ugaussian_Pinv (m->wvp[0]->rank / ( m->n + 1));
1942 ylast = gsl_cdf_ugaussian_Pinv (m->wvp[m->n_data-1]->rank / ( m->n + 1));
1946 /* Need to make sure that both the scatter plot and the ideal fit into the
1948 double x_lower = MIN (m->min, (yfirst - intercept) / slope) ;
1949 double x_upper = MAX (m->max, (ylast - intercept) / slope) ;
1950 double slack = (x_upper - x_lower) * 0.05 ;
1952 chart_write_xscale (np_chart, x_lower - slack, x_upper + slack, 5);
1954 chart_write_xscale (dnp_chart, m->min, m->max, 5);
1958 chart_write_yscale (np_chart, yfirst, ylast, 5);
1961 /* We have to cache the detrended data, beacause we need to
1962 find its limits before we can plot it */
1963 double *d_data = xnmalloc (m->n_data, sizeof *d_data);
1964 double d_max = -DBL_MAX;
1965 double d_min = DBL_MAX;
1966 for ( i = 0 ; i < m->n_data; ++i )
1968 const double ns = gsl_cdf_ugaussian_Pinv (m->wvp[i]->rank / ( m->n + 1));
1970 chart_datum (np_chart, 0, m->wvp[i]->v.f, ns);
1972 d_data[i] = (m->wvp[i]->v.f - m->mean) / m->stddev - ns;
1974 if ( d_data[i] < d_min ) d_min = d_data[i];
1975 if ( d_data[i] > d_max ) d_max = d_data[i];
1977 chart_write_yscale (dnp_chart, d_min, d_max, 5);
1979 for ( i = 0 ; i < m->n_data; ++i )
1980 chart_datum (dnp_chart, 0, m->wvp[i]->v.f, d_data[i]);
1985 chart_line (np_chart, slope, intercept, yfirst, ylast , CHART_DIM_Y);
1986 chart_line (dnp_chart, 0, 0, m->min, m->max , CHART_DIM_X);
1988 chart_submit (np_chart);
1989 chart_submit (dnp_chart);
1995 /* Show the percentiles */
1997 show_percentiles (const struct variable **dependent_var,
1999 struct factor *fctr)
2001 struct tab_table *tbl;
2007 struct hsh_table *ptiles ;
2009 int n_heading_columns;
2010 const int n_heading_rows = 2;
2011 const int n_stat_rows = 2;
2017 struct factor_statistics **fs = fctr->fs ;
2018 n_heading_columns = 3;
2019 n_factors = hsh_count (fctr->fstats);
2021 ptiles = (*fs)->m[0].ptile_hash;
2023 if ( fctr->indep_var[1] )
2024 n_heading_columns = 4;
2029 n_heading_columns = 2;
2031 ptiles = totals[0].ptile_hash;
2034 n_ptiles = hsh_count (ptiles);
2036 n_rows = n_heading_rows + n_dep_var * n_stat_rows * n_factors;
2038 n_cols = n_heading_columns + n_ptiles ;
2040 tbl = tab_create (n_cols, n_rows, 0);
2042 tab_headers (tbl, n_heading_columns + 1, 0, n_heading_rows, 0);
2044 tab_dim (tbl, tab_natural_dimensions);
2046 /* Outline the box and have no internal lines*/
2051 n_cols - 1, n_rows - 1);
2053 tab_hline (tbl, TAL_2, 0, n_cols - 1, n_heading_rows );
2055 tab_vline (tbl, TAL_2, n_heading_columns, 0, n_rows - 1);
2058 tab_title (tbl, _ ("Percentiles"));
2061 tab_hline (tbl, TAL_1, n_heading_columns, n_cols - 1, 1 );
2068 n_heading_columns - 1, n_rows - 1);
2074 n_heading_columns, n_heading_rows - 1,
2075 n_cols - 1, n_rows - 1);
2077 tab_joint_text (tbl, n_heading_columns + 1, 0,
2079 TAB_CENTER | TAT_TITLE ,
2084 /* Put in the percentile break points as headings */
2086 struct percentile **p = (struct percentile **) hsh_sort (ptiles);
2091 tab_float (tbl, n_heading_columns + i++ , 1,
2100 for ( i = 0 ; i < n_dep_var ; ++i )
2102 const int n_stat_rows = 2;
2103 const int row = n_heading_rows + i * n_stat_rows * n_factors ;
2106 tab_hline (tbl, TAL_1, 0, n_cols - 1, row );
2109 i * n_stat_rows * n_factors + n_heading_rows,
2110 TAB_LEFT | TAT_TITLE,
2111 var_to_string (dependent_var[i])
2116 const union value *prev = NULL ;
2117 struct factor_statistics **fs = fctr->fs;
2120 tab_text (tbl, 1, n_heading_rows - 1,
2121 TAB_CENTER | TAT_TITLE,
2122 var_to_string (fctr->indep_var[0]));
2125 if ( fctr->indep_var[1])
2126 tab_text (tbl, 2, n_heading_rows - 1, TAB_CENTER | TAT_TITLE,
2127 var_to_string (fctr->indep_var[1]));
2131 const int row = n_heading_rows + n_stat_rows *
2132 ( ( i * n_factors ) + count );
2135 if ( !prev || 0 != compare_values (prev, (*fs)->id[0],
2136 var_get_width (fctr->indep_var[0])))
2139 ds_init_empty (&vstr);
2140 var_append_value_name (fctr->indep_var[0],
2141 (*fs)->id[0], &vstr);
2145 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
2149 TAB_LEFT | TAT_TITLE,
2156 prev = (*fs)->id[0];
2158 if (fctr->indep_var[1] && count > 0 )
2159 tab_hline (tbl, TAL_1, 2, n_cols - 1, row);
2161 if ( fctr->indep_var[1])
2164 ds_init_empty (&vstr);
2165 var_append_value_name (fctr->indep_var[1], (*fs)->id[1], &vstr);
2167 tab_text (tbl, 2, row,
2168 TAB_LEFT | TAT_TITLE,
2176 populate_percentiles (tbl, n_heading_columns - 1,
2177 row, & (*fs)->m[i]);
2188 populate_percentiles (tbl, n_heading_columns - 1,
2189 i * n_stat_rows * n_factors + n_heading_rows,
2206 populate_percentiles (struct tab_table *tbl, int col, int row,
2207 const struct metrics *m)
2211 struct percentile **p = (struct percentile **) hsh_sort (m->ptile_hash);
2215 TAB_LEFT | TAT_TITLE,
2216 _ ("Tukey\'s Hinges")
2221 TAB_LEFT | TAT_TITLE,
2222 ptile_alg_desc[m->ptile_alg]
2229 tab_float (tbl, col + i + 1 , row,
2232 if ( (*p)->p == 25 )
2233 tab_float (tbl, col + i + 1 , row + 1,
2237 if ( (*p)->p == 50 )
2238 tab_float (tbl, col + i + 1 , row + 1,
2242 if ( (*p)->p == 75 )
2243 tab_float (tbl, col + i + 1 , row + 1,
2256 factor_to_string (const struct factor *fctr,
2257 const struct factor_statistics *fs,
2258 const struct variable *var,
2263 ds_put_format (str, "%s (",var_to_string (var) );
2266 ds_put_format (str, "%s = ",
2267 var_to_string (fctr->indep_var[0]));
2269 var_append_value_name (fctr->indep_var[0], fs->id[0], str);
2271 if ( fctr->indep_var[1] )
2273 ds_put_format (str, "; %s = )",
2274 var_to_string (fctr->indep_var[1]));
2276 var_append_value_name (fctr->indep_var[1], fs->id[1], str);
2281 ds_put_cstr (str, ")");
2287 factor_to_string_concise (const struct factor *fctr,
2288 const struct factor_statistics *fs,
2293 var_append_value_name (fctr->indep_var[0], fs->id[0], str);
2295 if ( fctr->indep_var[1] )
2297 ds_put_cstr (str, ",");
2299 var_append_value_name (fctr->indep_var[1],fs->id[1], str);
2301 ds_put_cstr (str, ")");