1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2004 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <gsl/gsl_cdf.h>
20 #include <libpspp/message.h>
25 #include <data/case.h>
26 #include <data/casegrouper.h>
27 #include <data/casereader.h>
28 #include <data/dictionary.h>
29 #include <data/procedure.h>
30 #include <data/value-labels.h>
31 #include <data/variable.h>
32 #include <language/command.h>
33 #include <language/dictionary/split-file.h>
34 #include <language/lexer/lexer.h>
35 #include <libpspp/alloc.h>
36 #include <libpspp/compiler.h>
37 #include <libpspp/hash.h>
38 #include <libpspp/magic.h>
39 #include <libpspp/message.h>
40 #include <libpspp/misc.h>
41 #include <libpspp/str.h>
42 #include <math/factor-stats.h>
43 #include <math/moments.h>
44 #include <math/percentiles.h>
45 #include <output/charts/box-whisker.h>
46 #include <output/charts/cartesian.h>
47 #include <output/manager.h>
48 #include <output/table.h>
53 #define _(msgid) gettext (msgid)
54 #define N_(msgid) msgid
57 #include <output/chart.h>
58 #include <output/charts/plot-hist.h>
59 #include <output/charts/plot-chart.h>
66 missing=miss:pairwise/!listwise,
68 incl:include/!exclude;
69 +compare=cmp:variables/!groups;
72 +plot[plt_]=stemleaf,boxplot,npplot,:spreadlevel(*d:n),histogram,all,none;
74 +statistics[st_]=descriptives,:extreme(*d:n),all,none.
83 static struct cmd_examine cmd;
85 static const struct variable **dependent_vars;
87 static size_t n_dependent_vars;
92 /* The independent variable */
93 struct variable *indep_var[2];
96 /* Hash table of factor stats indexed by 2 values */
97 struct hsh_table *fstats;
99 /* The hash table after it has been crunched */
100 struct factor_statistics **fs;
106 /* Linked list of factors */
107 static struct factor *factors = 0;
109 static struct metrics *totals = 0;
111 /* Parse the clause specifying the factors */
112 static int examine_parse_independent_vars (struct lexer *lexer, const struct dictionary *dict, struct cmd_examine *cmd);
116 /* Output functions */
117 static void show_summary (const struct variable **dependent_var, int n_dep_var,
118 const struct factor *f);
120 static void show_extremes (const struct variable **dependent_var,
122 const struct factor *factor,
125 static void show_descriptives (const struct variable **dependent_var,
127 struct factor *factor);
129 static void show_percentiles (const struct variable **dependent_var,
131 struct factor *factor);
136 void np_plot (const struct metrics *m, const char *factorname);
139 void box_plot_group (const struct factor *fctr,
140 const struct variable **vars, int n_vars,
141 const struct variable *id
145 void box_plot_variables (const struct factor *fctr,
146 const struct variable **vars, int n_vars,
147 const struct variable *id
152 /* Per Split function */
153 static void run_examine (struct cmd_examine *, struct casereader *,
156 static void output_examine (void);
159 void factor_calc (const struct ccase *c, int case_no,
160 double weight, int case_missing);
163 /* Represent a factor as a string, so it can be
164 printed in a human readable fashion */
165 const char * factor_to_string (const struct factor *fctr,
166 const struct factor_statistics *fs,
167 const struct variable *var);
170 /* Represent a factor as a string, so it can be
171 printed in a human readable fashion,
172 but sacrificing some readablility for the sake of brevity */
173 const char *factor_to_string_concise (const struct factor *fctr,
174 struct factor_statistics *fs);
179 /* Categories of missing values to exclude. */
180 static enum mv_class exclude_values;
184 static subc_list_double percentile_list;
186 static enum pc_alg percentile_algorithm;
188 static short sbc_percentile;
192 cmd_examine (struct lexer *lexer, struct dataset *ds)
194 struct casegrouper *grouper;
195 struct casereader *group;
198 subc_list_double_create (&percentile_list);
199 percentile_algorithm = PC_HAVERAGE;
201 if ( !parse_examine (lexer, ds, &cmd, NULL) )
203 subc_list_double_destroy (&percentile_list);
207 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
208 exclude_values = cmd.incl == XMN_INCLUDE ? MV_SYSTEM : MV_ANY;
210 if ( cmd.st_n == SYSMIS )
213 if ( ! cmd.sbc_cinterval)
214 cmd.n_cinterval[0] = 95.0;
216 /* If descriptives have been requested, make sure the
217 quartiles are calculated */
218 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] )
220 subc_list_double_push (&percentile_list, 25);
221 subc_list_double_push (&percentile_list, 50);
222 subc_list_double_push (&percentile_list, 75);
225 grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
226 while (casegrouper_get_next_group (grouper, &group))
227 run_examine (&cmd, group, ds);
228 ok = casegrouper_destroy (grouper);
229 ok = proc_commit (ds) && ok;
236 if ( dependent_vars )
237 free (dependent_vars);
240 struct factor *f = factors ;
243 struct factor *ff = f;
247 hsh_destroy ( ff->fstats ) ;
253 subc_list_double_destroy (&percentile_list);
255 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
260 /* Show all the appropriate tables */
262 output_examine (void)
266 /* Show totals if appropriate */
267 if ( ! cmd.sbc_nototal || factors == 0 )
269 show_summary (dependent_vars, n_dependent_vars, 0);
271 if ( cmd.sbc_statistics )
273 if ( cmd.a_statistics[XMN_ST_EXTREME])
274 show_extremes (dependent_vars, n_dependent_vars, 0, cmd.st_n);
276 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
277 show_descriptives (dependent_vars, n_dependent_vars, 0);
280 if ( sbc_percentile )
281 show_percentiles (dependent_vars, n_dependent_vars, 0);
286 if ( cmd.a_plot[XMN_PLT_STEMLEAF] )
287 msg (SW, _ ("%s is not currently supported."), "STEMLEAF");
289 if ( cmd.a_plot[XMN_PLT_SPREADLEVEL] )
290 msg (SW, _ ("%s is not currently supported."), "SPREADLEVEL");
292 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
294 for ( v = 0 ; v < n_dependent_vars; ++v )
295 np_plot (&totals[v], var_to_string (dependent_vars[v]));
298 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
300 if ( cmd.cmp == XMN_GROUPS )
302 box_plot_group (0, (const struct variable **) dependent_vars,
303 n_dependent_vars, cmd.v_id);
306 box_plot_variables (0,
307 (const struct variable **) dependent_vars,
308 n_dependent_vars, cmd.v_id);
311 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
313 for ( v = 0 ; v < n_dependent_vars; ++v )
315 struct normal_curve normal;
317 normal.N = totals[v].n;
318 normal.mean = totals[v].mean;
319 normal.stddev = totals[v].stddev;
321 histogram_plot (totals[v].histogram,
322 var_to_string (dependent_vars[v]),
332 /* Show grouped statistics as appropriate */
336 show_summary (dependent_vars, n_dependent_vars, fctr);
338 if ( cmd.sbc_statistics )
340 if ( cmd.a_statistics[XMN_ST_EXTREME])
341 show_extremes (dependent_vars, n_dependent_vars, fctr, cmd.st_n);
343 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
344 show_descriptives (dependent_vars, n_dependent_vars, fctr);
347 if ( sbc_percentile )
348 show_percentiles (dependent_vars, n_dependent_vars, fctr);
355 struct factor_statistics **fs = fctr->fs ;
357 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
359 if ( cmd.cmp == XMN_VARIABLES )
360 box_plot_variables (fctr,
361 (const struct variable **) dependent_vars,
362 n_dependent_vars, cmd.v_id);
364 box_plot_group (fctr,
365 (const struct variable **) dependent_vars,
366 n_dependent_vars, cmd.v_id);
369 for ( v = 0 ; v < n_dependent_vars; ++v )
372 for ( fs = fctr->fs ; *fs ; ++fs )
374 const char *s = factor_to_string (fctr, *fs, dependent_vars[v]);
376 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
377 np_plot (& (*fs)->m[v], s);
379 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
381 struct normal_curve normal;
383 normal.N = (*fs)->m[v].n;
384 normal.mean = (*fs)->m[v].mean;
385 normal.stddev = (*fs)->m[v].stddev;
387 histogram_plot ((*fs)->m[v].histogram,
391 } /* for ( fs .... */
393 } /* for ( v = 0 ..... */
403 /* Create a hash table of percentiles and their values from the list of
405 static struct hsh_table *
406 list_to_ptile_hash (const subc_list_double *l)
410 struct hsh_table *h ;
412 h = hsh_create (subc_list_double_count (l),
413 (hsh_compare_func *) ptile_compare,
414 (hsh_hash_func *) ptile_hash,
415 (hsh_free_func *) free,
419 for ( i = 0 ; i < subc_list_double_count (l) ; ++i )
421 struct percentile *p = xmalloc (sizeof *p);
423 p->p = subc_list_double_at (l,i);
434 /* Parse the PERCENTILES subcommand */
436 xmn_custom_percentiles (struct lexer *lexer, struct dataset *ds UNUSED,
437 struct cmd_examine *p UNUSED, void *aux UNUSED)
441 lex_match (lexer, '=');
443 lex_match (lexer, '(');
445 while ( lex_is_number (lexer) )
447 subc_list_double_push (&percentile_list, lex_number (lexer));
451 lex_match (lexer, ',') ;
453 lex_match (lexer, ')');
455 lex_match (lexer, '=');
457 if ( lex_match_id (lexer, "HAVERAGE"))
458 percentile_algorithm = PC_HAVERAGE;
460 else if ( lex_match_id (lexer, "WAVERAGE"))
461 percentile_algorithm = PC_WAVERAGE;
463 else if ( lex_match_id (lexer, "ROUND"))
464 percentile_algorithm = PC_ROUND;
466 else if ( lex_match_id (lexer, "EMPIRICAL"))
467 percentile_algorithm = PC_EMPIRICAL;
469 else if ( lex_match_id (lexer, "AEMPIRICAL"))
470 percentile_algorithm = PC_AEMPIRICAL;
472 else if ( lex_match_id (lexer, "NONE"))
473 percentile_algorithm = PC_NONE;
476 if ( 0 == subc_list_double_count (&percentile_list))
478 subc_list_double_push (&percentile_list, 5);
479 subc_list_double_push (&percentile_list, 10);
480 subc_list_double_push (&percentile_list, 25);
481 subc_list_double_push (&percentile_list, 50);
482 subc_list_double_push (&percentile_list, 75);
483 subc_list_double_push (&percentile_list, 90);
484 subc_list_double_push (&percentile_list, 95);
490 /* TOTAL and NOTOTAL are simple, mutually exclusive flags */
492 xmn_custom_total (struct lexer *lexer UNUSED, struct dataset *ds UNUSED, struct cmd_examine *p, void *aux UNUSED)
494 if ( p->sbc_nototal )
496 msg (SE, _ ("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
504 xmn_custom_nototal (struct lexer *lexer UNUSED, struct dataset *ds UNUSED,
505 struct cmd_examine *p, void *aux UNUSED)
509 msg (SE, _ ("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
518 /* Parser for the variables sub command
519 Returns 1 on success */
521 xmn_custom_variables (struct lexer *lexer, struct dataset *ds, struct cmd_examine *cmd, void *aux UNUSED)
523 const struct dictionary *dict = dataset_dict (ds);
524 lex_match (lexer, '=');
526 if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
527 && lex_token (lexer) != T_ALL)
532 if (!parse_variables_const (lexer, dict, &dependent_vars, &n_dependent_vars,
533 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
535 free (dependent_vars);
539 assert (n_dependent_vars);
541 totals = xnmalloc (n_dependent_vars, sizeof *totals);
543 if ( lex_match (lexer, T_BY))
546 success = examine_parse_independent_vars (lexer, dict, cmd);
547 if ( success != 1 ) {
548 free (dependent_vars);
559 /* Parse the clause specifying the factors */
561 examine_parse_independent_vars (struct lexer *lexer, const struct dictionary *dict, struct cmd_examine *cmd)
564 struct factor *sf = xmalloc (sizeof *sf);
566 if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
567 && lex_token (lexer) != T_ALL)
574 sf->indep_var[0] = parse_variable (lexer, dict);
575 sf->indep_var[1] = 0;
577 if ( lex_token (lexer) == T_BY )
580 lex_match (lexer, T_BY);
582 if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
583 && lex_token (lexer) != T_ALL)
589 sf->indep_var[1] = parse_variable (lexer, dict);
594 sf->fstats = hsh_create (4,
595 (hsh_compare_func *) factor_statistics_compare,
596 (hsh_hash_func *) factor_statistics_hash,
597 (hsh_free_func *) factor_statistics_free,
603 lex_match (lexer, ',');
605 if ( lex_token (lexer) == '.' || lex_token (lexer) == '/' )
608 success = examine_parse_independent_vars (lexer, dict, cmd);
619 void populate_percentiles (struct tab_table *tbl, int col, int row,
620 const struct metrics *m);
622 void populate_descriptives (struct tab_table *t, int col, int row,
623 const struct metrics *fs);
625 void populate_extremes (struct tab_table *t, int col, int row, int n,
626 const struct metrics *m);
628 void populate_summary (struct tab_table *t, int col, int row,
629 const struct metrics *m);
634 /* Perform calculations for the sub factors */
636 factor_calc (const struct ccase *c, int case_no, double weight,
640 struct factor *fctr = factors;
644 struct factor_statistics **foo ;
645 union value *indep_vals[2] ;
647 indep_vals[0] = value_dup (
648 case_data (c, fctr->indep_var[0]),
649 var_get_width (fctr->indep_var[0])
652 if ( fctr->indep_var[1] )
653 indep_vals[1] = value_dup (
654 case_data (c, fctr->indep_var[1]),
655 var_get_width (fctr->indep_var[1])
659 const union value sm = {SYSMIS};
660 indep_vals[1] = value_dup (&sm, 0);
663 assert (fctr->fstats);
665 foo = ( struct factor_statistics ** )
666 hsh_probe (fctr->fstats, (void *) &indep_vals);
671 *foo = create_factor_statistics (n_dependent_vars,
675 for ( v = 0 ; v < n_dependent_vars ; ++v )
677 metrics_precalc ( & (*foo)->m[v] );
683 free (indep_vals[0]);
684 free (indep_vals[1]);
687 for ( v = 0 ; v < n_dependent_vars ; ++v )
689 const struct variable *var = dependent_vars[v];
690 union value *val = value_dup (
695 if (case_missing || var_is_value_missing (var, val, exclude_values))
701 metrics_calc ( & (*foo)->m[v], val, weight, case_no);
711 run_examine (struct cmd_examine *cmd, struct casereader *input,
714 struct dictionary *dict = dataset_dict (ds);
722 if (!casereader_peek (input, 0, &c))
724 output_split_file_values (ds, &c);
727 input = casereader_create_filter_weight (input, dict, NULL, NULL);
728 input = casereader_create_counter (input, &case_no, 0);
730 /* Make sure we haven't got rubbish left over from a
735 struct factor *next = fctr->next;
737 hsh_clear (fctr->fstats);
744 for ( v = 0 ; v < n_dependent_vars ; ++v )
745 metrics_precalc (&totals[v]);
747 for (; casereader_read (input, &c); case_destroy (&c))
749 int case_missing = 0;
750 const double weight = dict_get_case_weight (dict, &c, NULL);
752 if ( cmd->miss == XMN_LISTWISE )
754 for ( v = 0 ; v < n_dependent_vars ; ++v )
756 const struct variable *var = dependent_vars[v];
757 union value *val = value_dup (
762 if ( var_is_value_missing (var, val, exclude_values))
769 for ( v = 0 ; v < n_dependent_vars ; ++v )
771 const struct variable *var = dependent_vars[v];
772 union value *val = value_dup (
777 if ( var_is_value_missing (var, val, exclude_values)
784 metrics_calc (&totals[v], val, weight, case_no);
789 factor_calc (&c, case_no, weight, case_missing);
791 ok = casereader_destroy (input);
793 for ( v = 0 ; v < n_dependent_vars ; ++v)
798 struct hsh_iterator hi;
799 struct factor_statistics *fs;
801 for ( fs = hsh_first (fctr->fstats, &hi);
803 fs = hsh_next (fctr->fstats, &hi))
806 fs->m[v].ptile_hash = list_to_ptile_hash (&percentile_list);
807 fs->m[v].ptile_alg = percentile_algorithm;
808 metrics_postcalc (&fs->m[v]);
814 totals[v].ptile_hash = list_to_ptile_hash (&percentile_list);
815 totals[v].ptile_alg = percentile_algorithm;
816 metrics_postcalc (&totals[v]);
820 /* Make sure that the combination of factors are complete */
825 struct hsh_iterator hi;
826 struct hsh_iterator hi0;
827 struct hsh_iterator hi1;
828 struct factor_statistics *fs;
830 struct hsh_table *idh0=0;
831 struct hsh_table *idh1=0;
835 idh0 = hsh_create (4, (hsh_compare_func *) compare_values,
836 (hsh_hash_func *) hash_value,
839 idh1 = hsh_create (4, (hsh_compare_func *) compare_values,
840 (hsh_hash_func *) hash_value,
844 for ( fs = hsh_first (fctr->fstats, &hi);
846 fs = hsh_next (fctr->fstats, &hi))
848 hsh_insert (idh0, (void *) &fs->id[0]);
849 hsh_insert (idh1, (void *) &fs->id[1]);
852 /* Ensure that the factors combination is complete */
853 for ( val0 = hsh_first (idh0, &hi0);
855 val0 = hsh_next (idh0, &hi0))
857 for ( val1 = hsh_first (idh1, &hi1);
859 val1 = hsh_next (idh1, &hi1))
861 struct factor_statistics **ffs;
866 ffs = (struct factor_statistics **)
867 hsh_probe (fctr->fstats, (void *) &key );
871 (*ffs) = create_factor_statistics (n_dependent_vars,
873 for ( i = 0 ; i < n_dependent_vars ; ++i )
874 metrics_precalc ( & (*ffs)->m[i]);
882 fctr->fs = (struct factor_statistics **) hsh_sort_copy (fctr->fstats);
894 for ( i = 0 ; i < n_dependent_vars ; ++i )
896 metrics_destroy (&totals[i]);
903 show_summary (const struct variable **dependent_var, int n_dep_var,
904 const struct factor *fctr)
906 static const char *subtitle[]=
914 int heading_columns ;
916 const int heading_rows = 3;
917 struct tab_table *tbl;
925 n_factors = hsh_count (fctr->fstats);
926 n_rows = n_dep_var * n_factors ;
928 if ( fctr->indep_var[1] )
937 n_rows += heading_rows;
939 n_cols = heading_columns + 6;
941 tbl = tab_create (n_cols,n_rows,0);
942 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
944 tab_dim (tbl, tab_natural_dimensions);
946 /* Outline the box */
951 n_cols - 1, n_rows - 1);
953 /* Vertical lines for the data only */
958 n_cols - 1, n_rows - 1);
961 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
962 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, 1 );
963 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, heading_rows -1 );
965 tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1);
968 tab_title (tbl, _ ("Case Processing Summary"));
971 tab_joint_text (tbl, heading_columns, 0,
973 TAB_CENTER | TAT_TITLE,
976 /* Remove lines ... */
983 for ( i = 0 ; i < 3 ; ++i )
985 tab_text (tbl, heading_columns + i*2 , 2, TAB_CENTER | TAT_TITLE,
988 tab_text (tbl, heading_columns + i*2 + 1, 2, TAB_CENTER | TAT_TITLE,
991 tab_joint_text (tbl, heading_columns + i*2 , 1,
992 heading_columns + i*2 + 1, 1,
993 TAB_CENTER | TAT_TITLE,
996 tab_box (tbl, -1, -1,
998 heading_columns + i*2, 1,
999 heading_columns + i*2 + 1, 1);
1004 /* Titles for the independent variables */
1007 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1008 var_to_string (fctr->indep_var[0]));
1010 if ( fctr->indep_var[1] )
1012 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1013 var_to_string (fctr->indep_var[1]));
1019 for ( i = 0 ; i < n_dep_var ; ++i )
1023 n_factors = hsh_count (fctr->fstats);
1027 tab_hline (tbl, TAL_1, 0, n_cols -1 , i * n_factors + heading_rows);
1030 0, i * n_factors + heading_rows,
1031 TAB_LEFT | TAT_TITLE,
1032 var_to_string (dependent_var[i])
1037 populate_summary (tbl, heading_columns,
1038 (i * n_factors) + heading_rows,
1044 struct factor_statistics **fs = fctr->fs;
1046 const union value *prev = NULL;
1051 0 != compare_values (prev, (*fs)->id[0],
1052 var_get_width (fctr->indep_var[0])))
1056 (i * n_factors ) + count +
1058 TAB_LEFT | TAT_TITLE,
1059 var_get_value_name (fctr->indep_var[0],
1063 if (fctr->indep_var[1] && count > 0 )
1064 tab_hline (tbl, TAL_1, 1, n_cols - 1,
1065 (i * n_factors ) + count + heading_rows);
1069 prev = (*fs)->id[0];
1072 if ( fctr->indep_var[1])
1075 (i * n_factors ) + count +
1077 TAB_LEFT | TAT_TITLE,
1078 var_get_value_name (fctr->indep_var[1], (*fs)->id[1])
1081 populate_summary (tbl, heading_columns,
1082 (i * n_factors) + count
1097 populate_summary (struct tab_table *t, int col, int row,
1098 const struct metrics *m)
1101 const double total = m->n + m->n_missing ;
1103 tab_float (t, col + 0, row + 0, TAB_RIGHT, m->n, 8, 0);
1104 tab_float (t, col + 2, row + 0, TAB_RIGHT, m->n_missing, 8, 0);
1105 tab_float (t, col + 4, row + 0, TAB_RIGHT, total, 8, 0);
1109 tab_text (t, col + 1, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1110 100.0 * m->n / total );
1112 tab_text (t, col + 3, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1113 100.0 * m->n_missing / total );
1115 /* This seems a bit pointless !!! */
1116 tab_text (t, col + 5, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1117 100.0 * total / total );
1128 show_extremes (const struct variable **dependent_var, int n_dep_var,
1129 const struct factor *fctr, int n_extremities)
1132 int heading_columns ;
1134 const int heading_rows = 1;
1135 struct tab_table *tbl;
1142 heading_columns = 2;
1143 n_factors = hsh_count (fctr->fstats);
1145 n_rows = n_dep_var * 2 * n_extremities * n_factors;
1147 if ( fctr->indep_var[1] )
1148 heading_columns = 3;
1152 heading_columns = 1;
1153 n_rows = n_dep_var * 2 * n_extremities;
1156 n_rows += heading_rows;
1158 heading_columns += 2;
1159 n_cols = heading_columns + 2;
1161 tbl = tab_create (n_cols,n_rows,0);
1162 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
1164 tab_dim (tbl, tab_natural_dimensions);
1166 /* Outline the box, No internal lines*/
1171 n_cols - 1, n_rows - 1);
1173 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1175 tab_title (tbl, _ ("Extreme Values"));
1177 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows -1);
1178 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows -1);
1182 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1183 var_to_string (fctr->indep_var[0]));
1185 if ( fctr->indep_var[1] )
1186 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1187 var_to_string (fctr->indep_var[1]));
1190 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _ ("Value"));
1191 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _ ("Case Number"));
1193 for ( i = 0 ; i < n_dep_var ; ++i )
1197 tab_hline (tbl, TAL_1, 0, n_cols -1 ,
1198 i * 2 * n_extremities * n_factors + heading_rows);
1201 i * 2 * n_extremities * n_factors + heading_rows,
1202 TAB_LEFT | TAT_TITLE,
1203 var_to_string (dependent_var[i])
1208 populate_extremes (tbl, heading_columns - 2,
1209 i * 2 * n_extremities * n_factors + heading_rows,
1210 n_extremities, &totals[i]);
1214 struct factor_statistics **fs = fctr->fs;
1216 const union value *prev = NULL;
1220 const int row = heading_rows + ( 2 * n_extremities ) *
1221 ( ( i * n_factors ) + count );
1224 if ( !prev || 0 != compare_values (prev, (*fs)->id[0],
1225 var_get_width (fctr->indep_var[0])))
1229 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1233 TAB_LEFT | TAT_TITLE,
1234 var_get_value_name (fctr->indep_var[0],
1239 prev = (*fs)->id[0];
1241 if (fctr->indep_var[1] && count > 0 )
1242 tab_hline (tbl, TAL_1, 2, n_cols - 1, row);
1244 if ( fctr->indep_var[1])
1245 tab_text (tbl, 2, row,
1246 TAB_LEFT | TAT_TITLE,
1247 var_get_value_name (fctr->indep_var[1], (*fs)->id[1])
1250 populate_extremes (tbl, heading_columns - 2,
1265 /* Fill in the extremities table */
1267 populate_extremes (struct tab_table *t,
1268 int col, int row, int n, const struct metrics *m)
1274 tab_text (t, col, row,
1275 TAB_RIGHT | TAT_TITLE ,
1279 tab_text (t, col, row + n ,
1280 TAB_RIGHT | TAT_TITLE ,
1285 tab_hline (t, TAL_1, col, col + 3, row + n );
1287 for (extremity = 0; extremity < n ; ++extremity )
1290 tab_float (t, col + 1, row + extremity,
1292 extremity + 1, 8, 0);
1296 tab_float (t, col + 1, row + extremity + n,
1298 extremity + 1, 8, 0);
1304 for (idx = 0, extremity = 0; extremity < n && idx < m->n_data ; ++idx )
1307 const struct weighted_value *wv = m->wvp[idx];
1308 struct case_node *cn = wv->case_nos;
1311 for (j = 0 ; j < wv->w ; ++j )
1313 if ( extremity + j >= n )
1316 tab_float (t, col + 3, row + extremity + j + n,
1320 tab_float (t, col + 2, row + extremity + j + n,
1329 extremity += wv->w ;
1334 for (idx = m->n_data - 1, extremity = 0; extremity < n && idx >= 0; --idx )
1337 const struct weighted_value *wv = m->wvp[idx];
1338 struct case_node *cn = wv->case_nos;
1340 for (j = 0 ; j < wv->w ; ++j )
1342 if ( extremity + j >= n )
1345 tab_float (t, col + 3, row + extremity + j,
1349 tab_float (t, col + 2, row + extremity + j,
1358 extremity += wv->w ;
1363 /* Show the descriptives table */
1365 show_descriptives (const struct variable **dependent_var,
1367 struct factor *fctr)
1370 int heading_columns ;
1372 const int n_stat_rows = 13;
1374 const int heading_rows = 1;
1376 struct tab_table *tbl;
1383 heading_columns = 4;
1384 n_factors = hsh_count (fctr->fstats);
1386 n_rows = n_dep_var * n_stat_rows * n_factors;
1388 if ( fctr->indep_var[1] )
1389 heading_columns = 5;
1393 heading_columns = 3;
1394 n_rows = n_dep_var * n_stat_rows;
1397 n_rows += heading_rows;
1399 n_cols = heading_columns + 2;
1402 tbl = tab_create (n_cols, n_rows, 0);
1404 tab_headers (tbl, heading_columns + 1, 0, heading_rows, 0);
1406 tab_dim (tbl, tab_natural_dimensions);
1408 /* Outline the box and have no internal lines*/
1413 n_cols - 1, n_rows - 1);
1415 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1417 tab_vline (tbl, TAL_1, heading_columns, 0, n_rows - 1);
1418 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows - 1);
1419 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows - 1);
1421 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _ ("Statistic"));
1422 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _ ("Std. Error"));
1424 tab_title (tbl, _ ("Descriptives"));
1427 for ( i = 0 ; i < n_dep_var ; ++i )
1429 const int row = heading_rows + i * n_stat_rows * n_factors ;
1432 tab_hline (tbl, TAL_1, 0, n_cols - 1, row );
1435 i * n_stat_rows * n_factors + heading_rows,
1436 TAB_LEFT | TAT_TITLE,
1437 var_to_string (dependent_var[i])
1443 const union value *prev = NULL;
1445 struct factor_statistics **fs = fctr->fs;
1448 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1449 var_to_string (fctr->indep_var[0]));
1452 if ( fctr->indep_var[1])
1453 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1454 var_to_string (fctr->indep_var[1]));
1458 const int row = heading_rows + n_stat_rows *
1459 ( ( i * n_factors ) + count );
1462 if ( !prev || 0 != compare_values (prev, (*fs)->id[0],
1463 var_get_width (fctr->indep_var[0])))
1467 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1471 TAB_LEFT | TAT_TITLE,
1472 var_get_value_name (fctr->indep_var[0],
1477 prev = (*fs)->id[0];
1479 if (fctr->indep_var[1] && count > 0 )
1480 tab_hline (tbl, TAL_1, 2, n_cols - 1, row);
1482 if ( fctr->indep_var[1])
1483 tab_text (tbl, 2, row,
1484 TAB_LEFT | TAT_TITLE,
1485 var_get_value_name (fctr->indep_var[1], (*fs)->id[1])
1488 populate_descriptives (tbl, heading_columns - 2,
1489 row, & (*fs)->m[i]);
1500 populate_descriptives (tbl, heading_columns - 2,
1501 i * n_stat_rows * n_factors + heading_rows,
1511 /* Fill in the descriptives data */
1513 populate_descriptives (struct tab_table *tbl, int col, int row,
1514 const struct metrics *m)
1516 const double t = gsl_cdf_tdist_Qinv ((1 - cmd.n_cinterval[0] / 100.0)/2.0,
1521 TAB_LEFT | TAT_TITLE,
1524 tab_float (tbl, col + 2,
1530 tab_float (tbl, col + 3,
1539 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1540 _ ("%g%% Confidence Interval for Mean"), cmd.n_cinterval[0]);
1543 tab_text (tbl, col + 1,
1545 TAB_LEFT | TAT_TITLE,
1548 tab_float (tbl, col + 2,
1551 m->mean - t * m->se_mean,
1554 tab_text (tbl, col + 1,
1556 TAB_LEFT | TAT_TITLE,
1560 tab_float (tbl, col + 2,
1563 m->mean + t * m->se_mean,
1568 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1569 _ ("5%% Trimmed Mean"));
1571 tab_float (tbl, col + 2,
1579 TAB_LEFT | TAT_TITLE,
1583 struct percentile *p;
1586 p = hsh_find (m->ptile_hash, &d);
1591 tab_float (tbl, col + 2,
1601 TAB_LEFT | TAT_TITLE,
1604 tab_float (tbl, col + 2,
1613 TAB_LEFT | TAT_TITLE,
1614 _ ("Std. Deviation"));
1617 tab_float (tbl, col + 2,
1626 TAB_LEFT | TAT_TITLE,
1629 tab_float (tbl, col + 2,
1637 TAB_LEFT | TAT_TITLE,
1640 tab_float (tbl, col + 2,
1649 TAB_LEFT | TAT_TITLE,
1653 tab_float (tbl, col + 2,
1661 TAB_LEFT | TAT_TITLE,
1662 _ ("Interquartile Range"));
1665 struct percentile *p1;
1666 struct percentile *p2;
1669 p1 = hsh_find (m->ptile_hash, &d);
1672 p2 = hsh_find (m->ptile_hash, &d);
1677 tab_float (tbl, col + 2,
1688 TAB_LEFT | TAT_TITLE,
1692 tab_float (tbl, col + 2,
1698 /* stderr of skewness */
1699 tab_float (tbl, col + 3,
1708 TAB_LEFT | TAT_TITLE,
1712 tab_float (tbl, col + 2,
1718 /* stderr of kurtosis */
1719 tab_float (tbl, col + 3,
1731 box_plot_variables (const struct factor *fctr,
1732 const struct variable **vars, int n_vars,
1733 const struct variable *id)
1737 struct factor_statistics **fs ;
1741 box_plot_group (fctr, vars, n_vars, id);
1745 for ( fs = fctr->fs ; *fs ; ++fs )
1747 double y_min = DBL_MAX;
1748 double y_max = -DBL_MAX;
1749 struct chart *ch = chart_create ();
1750 const char *s = factor_to_string (fctr, *fs, 0 );
1752 chart_write_title (ch, s);
1754 for ( i = 0 ; i < n_vars ; ++i )
1756 y_max = MAX (y_max, (*fs)->m[i].max);
1757 y_min = MIN (y_min, (*fs)->m[i].min);
1760 boxplot_draw_yscale (ch, y_max, y_min);
1762 for ( i = 0 ; i < n_vars ; ++i )
1765 const double box_width = (ch->data_right - ch->data_left)
1768 const double box_centre = ( i * 2 + 1) * box_width
1771 boxplot_draw_boxplot (ch,
1772 box_centre, box_width,
1774 var_to_string (vars[i]));
1786 /* Do a box plot, grouping all factors into one plot ;
1787 each dependent variable has its own plot.
1790 box_plot_group (const struct factor *fctr,
1791 const struct variable **vars,
1793 const struct variable *id UNUSED)
1798 for ( i = 0 ; i < n_vars ; ++i )
1800 struct factor_statistics **fs ;
1803 ch = chart_create ();
1805 boxplot_draw_yscale (ch, totals[i].max, totals[i].min);
1811 for ( fs = fctr->fs ; *fs ; ++fs )
1814 chart_write_title (ch, _ ("Boxplot of %s vs. %s"),
1815 var_to_string (vars[i]), var_to_string (fctr->indep_var[0]) );
1817 for ( fs = fctr->fs ; *fs ; ++fs )
1820 const char *s = factor_to_string_concise (fctr, *fs);
1822 const double box_width = (ch->data_right - ch->data_left)
1823 / (n_factors * 2.0 ) ;
1825 const double box_centre = ( f++ * 2 + 1) * box_width
1828 boxplot_draw_boxplot (ch,
1829 box_centre, box_width,
1836 const double box_width = (ch->data_right - ch->data_left) / 3.0;
1837 const double box_centre = (ch->data_right + ch->data_left) / 2.0;
1839 chart_write_title (ch, _ ("Boxplot"));
1841 boxplot_draw_boxplot (ch,
1842 box_centre, box_width,
1844 var_to_string (vars[i]) );
1853 /* Plot the normal and detrended normal plots for m
1854 Label the plots with factorname */
1856 np_plot (const struct metrics *m, const char *factorname)
1859 double yfirst=0, ylast=0;
1862 struct chart *np_chart;
1864 /* Detrended Normal Plot */
1865 struct chart *dnp_chart;
1867 /* The slope and intercept of the ideal normal probability line */
1868 const double slope = 1.0 / m->stddev;
1869 const double intercept = - m->mean / m->stddev;
1871 /* Cowardly refuse to plot an empty data set */
1872 if ( m->n_data == 0 )
1875 np_chart = chart_create ();
1876 dnp_chart = chart_create ();
1878 if ( !np_chart || ! dnp_chart )
1881 chart_write_title (np_chart, _ ("Normal Q-Q Plot of %s"), factorname);
1882 chart_write_xlabel (np_chart, _ ("Observed Value"));
1883 chart_write_ylabel (np_chart, _ ("Expected Normal"));
1886 chart_write_title (dnp_chart, _ ("Detrended Normal Q-Q Plot of %s"),
1888 chart_write_xlabel (dnp_chart, _ ("Observed Value"));
1889 chart_write_ylabel (dnp_chart, _ ("Dev from Normal"));
1891 yfirst = gsl_cdf_ugaussian_Pinv (m->wvp[0]->rank / ( m->n + 1));
1892 ylast = gsl_cdf_ugaussian_Pinv (m->wvp[m->n_data-1]->rank / ( m->n + 1));
1896 /* Need to make sure that both the scatter plot and the ideal fit into the
1898 double x_lower = MIN (m->min, (yfirst - intercept) / slope) ;
1899 double x_upper = MAX (m->max, (ylast - intercept) / slope) ;
1900 double slack = (x_upper - x_lower) * 0.05 ;
1902 chart_write_xscale (np_chart, x_lower - slack, x_upper + slack, 5);
1904 chart_write_xscale (dnp_chart, m->min, m->max, 5);
1908 chart_write_yscale (np_chart, yfirst, ylast, 5);
1911 /* We have to cache the detrended data, beacause we need to
1912 find its limits before we can plot it */
1913 double *d_data = xnmalloc (m->n_data, sizeof *d_data);
1914 double d_max = -DBL_MAX;
1915 double d_min = DBL_MAX;
1916 for ( i = 0 ; i < m->n_data; ++i )
1918 const double ns = gsl_cdf_ugaussian_Pinv (m->wvp[i]->rank / ( m->n + 1));
1920 chart_datum (np_chart, 0, m->wvp[i]->v.f, ns);
1922 d_data[i] = (m->wvp[i]->v.f - m->mean) / m->stddev - ns;
1924 if ( d_data[i] < d_min ) d_min = d_data[i];
1925 if ( d_data[i] > d_max ) d_max = d_data[i];
1927 chart_write_yscale (dnp_chart, d_min, d_max, 5);
1929 for ( i = 0 ; i < m->n_data; ++i )
1930 chart_datum (dnp_chart, 0, m->wvp[i]->v.f, d_data[i]);
1935 chart_line (np_chart, slope, intercept, yfirst, ylast , CHART_DIM_Y);
1936 chart_line (dnp_chart, 0, 0, m->min, m->max , CHART_DIM_X);
1938 chart_submit (np_chart);
1939 chart_submit (dnp_chart);
1945 /* Show the percentiles */
1947 show_percentiles (const struct variable **dependent_var,
1949 struct factor *fctr)
1951 struct tab_table *tbl;
1957 struct hsh_table *ptiles ;
1959 int n_heading_columns;
1960 const int n_heading_rows = 2;
1961 const int n_stat_rows = 2;
1967 struct factor_statistics **fs = fctr->fs ;
1968 n_heading_columns = 3;
1969 n_factors = hsh_count (fctr->fstats);
1971 ptiles = (*fs)->m[0].ptile_hash;
1973 if ( fctr->indep_var[1] )
1974 n_heading_columns = 4;
1979 n_heading_columns = 2;
1981 ptiles = totals[0].ptile_hash;
1984 n_ptiles = hsh_count (ptiles);
1986 n_rows = n_heading_rows + n_dep_var * n_stat_rows * n_factors;
1988 n_cols = n_heading_columns + n_ptiles ;
1990 tbl = tab_create (n_cols, n_rows, 0);
1992 tab_headers (tbl, n_heading_columns + 1, 0, n_heading_rows, 0);
1994 tab_dim (tbl, tab_natural_dimensions);
1996 /* Outline the box and have no internal lines*/
2001 n_cols - 1, n_rows - 1);
2003 tab_hline (tbl, TAL_2, 0, n_cols - 1, n_heading_rows );
2005 tab_vline (tbl, TAL_2, n_heading_columns, 0, n_rows - 1);
2008 tab_title (tbl, _ ("Percentiles"));
2011 tab_hline (tbl, TAL_1, n_heading_columns, n_cols - 1, 1 );
2018 n_heading_columns - 1, n_rows - 1);
2024 n_heading_columns, n_heading_rows - 1,
2025 n_cols - 1, n_rows - 1);
2027 tab_joint_text (tbl, n_heading_columns + 1, 0,
2029 TAB_CENTER | TAT_TITLE ,
2034 /* Put in the percentile break points as headings */
2036 struct percentile **p = (struct percentile **) hsh_sort (ptiles);
2041 tab_float (tbl, n_heading_columns + i++ , 1,
2050 for ( i = 0 ; i < n_dep_var ; ++i )
2052 const int n_stat_rows = 2;
2053 const int row = n_heading_rows + i * n_stat_rows * n_factors ;
2056 tab_hline (tbl, TAL_1, 0, n_cols - 1, row );
2059 i * n_stat_rows * n_factors + n_heading_rows,
2060 TAB_LEFT | TAT_TITLE,
2061 var_to_string (dependent_var[i])
2066 const union value *prev = NULL ;
2067 struct factor_statistics **fs = fctr->fs;
2070 tab_text (tbl, 1, n_heading_rows - 1,
2071 TAB_CENTER | TAT_TITLE,
2072 var_to_string (fctr->indep_var[0]));
2075 if ( fctr->indep_var[1])
2076 tab_text (tbl, 2, n_heading_rows - 1, TAB_CENTER | TAT_TITLE,
2077 var_to_string (fctr->indep_var[1]));
2081 const int row = n_heading_rows + n_stat_rows *
2082 ( ( i * n_factors ) + count );
2085 if ( !prev || 0 != compare_values (prev, (*fs)->id[0],
2086 var_get_width (fctr->indep_var[0])))
2090 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
2094 TAB_LEFT | TAT_TITLE,
2095 var_get_value_name (fctr->indep_var[0],
2102 prev = (*fs)->id[0];
2104 if (fctr->indep_var[1] && count > 0 )
2105 tab_hline (tbl, TAL_1, 2, n_cols - 1, row);
2107 if ( fctr->indep_var[1])
2108 tab_text (tbl, 2, row,
2109 TAB_LEFT | TAT_TITLE,
2110 var_get_value_name (fctr->indep_var[1], (*fs)->id[1])
2114 populate_percentiles (tbl, n_heading_columns - 1,
2115 row, & (*fs)->m[i]);
2126 populate_percentiles (tbl, n_heading_columns - 1,
2127 i * n_stat_rows * n_factors + n_heading_rows,
2144 populate_percentiles (struct tab_table *tbl, int col, int row,
2145 const struct metrics *m)
2149 struct percentile **p = (struct percentile **) hsh_sort (m->ptile_hash);
2153 TAB_LEFT | TAT_TITLE,
2154 _ ("Tukey\'s Hinges")
2159 TAB_LEFT | TAT_TITLE,
2160 ptile_alg_desc[m->ptile_alg]
2167 tab_float (tbl, col + i + 1 , row,
2170 if ( (*p)->p == 25 )
2171 tab_float (tbl, col + i + 1 , row + 1,
2175 if ( (*p)->p == 50 )
2176 tab_float (tbl, col + i + 1 , row + 1,
2180 if ( (*p)->p == 75 )
2181 tab_float (tbl, col + i + 1 , row + 1,
2196 factor_to_string (const struct factor *fctr,
2197 const struct factor_statistics *fs,
2198 const struct variable *var)
2201 static char buf1[100];
2207 sprintf (buf1, "%s (",var_to_string (var) );
2210 snprintf (buf2, 100, "%s = %s",
2211 var_to_string (fctr->indep_var[0]),
2212 var_get_value_name (fctr->indep_var[0], fs->id[0]));
2214 strcat (buf1, buf2);
2216 if ( fctr->indep_var[1] )
2218 sprintf (buf2, "; %s = %s)",
2219 var_to_string (fctr->indep_var[1]),
2220 var_get_value_name (fctr->indep_var[1], fs->id[1]));
2221 strcat (buf1, buf2);
2235 factor_to_string_concise (const struct factor *fctr,
2236 struct factor_statistics *fs)
2240 static char buf[100];
2244 snprintf (buf, 100, "%s",
2245 var_get_value_name (fctr->indep_var[0], fs->id[0]));
2247 if ( fctr->indep_var[1] )
2249 sprintf (buf2, ",%s)", var_get_value_name (fctr->indep_var[1],