1 /* PSPP - EXAMINE data for normality . -*-c-*-
3 Copyright (C) 2004 Free Software Foundation, Inc.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <libpspp/message.h>
28 #include <data/case.h>
29 #include <data/casefile.h>
30 #include <data/dictionary.h>
31 #include <data/procedure.h>
32 #include <data/value-labels.h>
33 #include <data/variable.h>
34 #include <language/command.h>
35 #include <language/dictionary/split-file.h>
36 #include <language/lexer/lexer.h>
37 #include <libpspp/alloc.h>
38 #include <libpspp/compiler.h>
39 #include <libpspp/hash.h>
40 #include <libpspp/magic.h>
41 #include <libpspp/message.h>
42 #include <libpspp/misc.h>
43 #include <libpspp/str.h>
44 #include <math/factor-stats.h>
45 #include <math/moments.h>
46 #include <math/percentiles.h>
47 #include <output/charts/box-whisker.h>
48 #include <output/charts/cartesian.h>
49 #include <output/manager.h>
50 #include <output/table.h>
55 #define _(msgid) gettext (msgid)
56 #define N_(msgid) msgid
59 #include <output/chart.h>
60 #include <output/charts/plot-hist.h>
61 #include <output/charts/plot-chart.h>
68 missing=miss:pairwise/!listwise,
70 incl:include/!exclude;
71 +compare=cmp:variables/!groups;
74 +plot[plt_]=stemleaf,boxplot,npplot,:spreadlevel(*d:n),histogram,all,none;
76 +statistics[st_]=descriptives,:extreme(*d:n),all,none.
85 static struct cmd_examine cmd;
87 static const struct variable **dependent_vars;
89 static size_t n_dependent_vars;
94 /* The independent variable */
95 struct variable *indep_var[2];
98 /* Hash table of factor stats indexed by 2 values */
99 struct hsh_table *fstats;
101 /* The hash table after it has been crunched */
102 struct factor_statistics **fs;
108 /* Linked list of factors */
109 static struct factor *factors = 0;
111 static struct metrics *totals = 0;
113 /* Parse the clause specifying the factors */
114 static int examine_parse_independent_vars (struct lexer *lexer, const struct dictionary *dict, struct cmd_examine *cmd);
118 /* Output functions */
119 static void show_summary (const struct variable **dependent_var, int n_dep_var,
120 const struct factor *f);
122 static void show_extremes (const struct variable **dependent_var,
124 const struct factor *factor,
127 static void show_descriptives (const struct variable **dependent_var,
129 struct factor *factor);
131 static void show_percentiles (const struct variable **dependent_var,
133 struct factor *factor);
138 void np_plot (const struct metrics *m, const char *factorname);
141 void box_plot_group (const struct factor *fctr,
142 const struct variable **vars, int n_vars,
143 const struct variable *id
147 void box_plot_variables (const struct factor *fctr,
148 const struct variable **vars, int n_vars,
149 const struct variable *id
154 /* Per Split function */
155 static bool run_examine (const struct ccase *,
156 const struct casefile *cf, void *cmd_, const struct dataset *);
158 static void output_examine (void);
161 void factor_calc (const struct ccase *c, int case_no,
162 double weight, int case_missing);
165 /* Represent a factor as a string, so it can be
166 printed in a human readable fashion */
167 const char * factor_to_string (const struct factor *fctr,
168 const struct factor_statistics *fs,
169 const struct variable *var);
172 /* Represent a factor as a string, so it can be
173 printed in a human readable fashion,
174 but sacrificing some readablility for the sake of brevity */
175 const char *factor_to_string_concise (const struct factor *fctr,
176 struct factor_statistics *fs);
181 /* Categories of missing values to exclude. */
182 static enum mv_class exclude_values;
186 static subc_list_double percentile_list;
188 static enum pc_alg percentile_algorithm;
190 static short sbc_percentile;
194 cmd_examine (struct lexer *lexer, struct dataset *ds)
198 subc_list_double_create (&percentile_list);
199 percentile_algorithm = PC_HAVERAGE;
201 if ( !parse_examine (lexer, ds, &cmd, NULL) )
203 subc_list_double_destroy (&percentile_list);
207 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
208 exclude_values = cmd.incl == XMN_INCLUDE ? MV_SYSTEM : MV_ANY;
210 if ( cmd.st_n == SYSMIS )
213 if ( ! cmd.sbc_cinterval)
214 cmd.n_cinterval[0] = 95.0;
216 /* If descriptives have been requested, make sure the
217 quartiles are calculated */
218 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] )
220 subc_list_double_push (&percentile_list, 25);
221 subc_list_double_push (&percentile_list, 50);
222 subc_list_double_push (&percentile_list, 75);
225 ok = multipass_procedure_with_splits (ds, run_examine, &cmd);
232 if ( dependent_vars )
233 free (dependent_vars);
236 struct factor *f = factors ;
239 struct factor *ff = f;
243 hsh_destroy ( ff->fstats ) ;
249 subc_list_double_destroy (&percentile_list);
251 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
256 /* Show all the appropriate tables */
258 output_examine (void)
262 /* Show totals if appropriate */
263 if ( ! cmd.sbc_nototal || factors == 0 )
265 show_summary (dependent_vars, n_dependent_vars, 0);
267 if ( cmd.sbc_statistics )
269 if ( cmd.a_statistics[XMN_ST_EXTREME])
270 show_extremes (dependent_vars, n_dependent_vars, 0, cmd.st_n);
272 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
273 show_descriptives (dependent_vars, n_dependent_vars, 0);
276 if ( sbc_percentile )
277 show_percentiles (dependent_vars, n_dependent_vars, 0);
282 if ( cmd.a_plot[XMN_PLT_STEMLEAF] )
283 msg (SW, _ ("%s is not currently supported."), "STEMLEAF");
285 if ( cmd.a_plot[XMN_PLT_SPREADLEVEL] )
286 msg (SW, _ ("%s is not currently supported."), "SPREADLEVEL");
288 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
290 for ( v = 0 ; v < n_dependent_vars; ++v )
291 np_plot (&totals[v], var_to_string (dependent_vars[v]));
294 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
296 if ( cmd.cmp == XMN_GROUPS )
298 box_plot_group (0, (const struct variable **) dependent_vars,
299 n_dependent_vars, cmd.v_id);
302 box_plot_variables (0,
303 (const struct variable **) dependent_vars,
304 n_dependent_vars, cmd.v_id);
307 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
309 for ( v = 0 ; v < n_dependent_vars; ++v )
311 struct normal_curve normal;
313 normal.N = totals[v].n;
314 normal.mean = totals[v].mean;
315 normal.stddev = totals[v].stddev;
317 histogram_plot (totals[v].histogram,
318 var_to_string (dependent_vars[v]),
328 /* Show grouped statistics as appropriate */
332 show_summary (dependent_vars, n_dependent_vars, fctr);
334 if ( cmd.sbc_statistics )
336 if ( cmd.a_statistics[XMN_ST_EXTREME])
337 show_extremes (dependent_vars, n_dependent_vars, fctr, cmd.st_n);
339 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
340 show_descriptives (dependent_vars, n_dependent_vars, fctr);
343 if ( sbc_percentile )
344 show_percentiles (dependent_vars, n_dependent_vars, fctr);
351 struct factor_statistics **fs = fctr->fs ;
353 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
355 if ( cmd.cmp == XMN_VARIABLES )
356 box_plot_variables (fctr,
357 (const struct variable **) dependent_vars,
358 n_dependent_vars, cmd.v_id);
360 box_plot_group (fctr,
361 (const struct variable **) dependent_vars,
362 n_dependent_vars, cmd.v_id);
365 for ( v = 0 ; v < n_dependent_vars; ++v )
368 for ( fs = fctr->fs ; *fs ; ++fs )
370 const char *s = factor_to_string (fctr, *fs, dependent_vars[v]);
372 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
373 np_plot (& (*fs)->m[v], s);
375 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
377 struct normal_curve normal;
379 normal.N = (*fs)->m[v].n;
380 normal.mean = (*fs)->m[v].mean;
381 normal.stddev = (*fs)->m[v].stddev;
383 histogram_plot ((*fs)->m[v].histogram,
387 } /* for ( fs .... */
389 } /* for ( v = 0 ..... */
399 /* Create a hash table of percentiles and their values from the list of
401 static struct hsh_table *
402 list_to_ptile_hash (const subc_list_double *l)
406 struct hsh_table *h ;
408 h = hsh_create (subc_list_double_count (l),
409 (hsh_compare_func *) ptile_compare,
410 (hsh_hash_func *) ptile_hash,
411 (hsh_free_func *) free,
415 for ( i = 0 ; i < subc_list_double_count (l) ; ++i )
417 struct percentile *p = xmalloc (sizeof *p);
419 p->p = subc_list_double_at (l,i);
430 /* Parse the PERCENTILES subcommand */
432 xmn_custom_percentiles (struct lexer *lexer, struct dataset *ds UNUSED,
433 struct cmd_examine *p UNUSED, void *aux UNUSED)
437 lex_match (lexer, '=');
439 lex_match (lexer, '(');
441 while ( lex_is_number (lexer) )
443 subc_list_double_push (&percentile_list, lex_number (lexer));
447 lex_match (lexer, ',') ;
449 lex_match (lexer, ')');
451 lex_match (lexer, '=');
453 if ( lex_match_id (lexer, "HAVERAGE"))
454 percentile_algorithm = PC_HAVERAGE;
456 else if ( lex_match_id (lexer, "WAVERAGE"))
457 percentile_algorithm = PC_WAVERAGE;
459 else if ( lex_match_id (lexer, "ROUND"))
460 percentile_algorithm = PC_ROUND;
462 else if ( lex_match_id (lexer, "EMPIRICAL"))
463 percentile_algorithm = PC_EMPIRICAL;
465 else if ( lex_match_id (lexer, "AEMPIRICAL"))
466 percentile_algorithm = PC_AEMPIRICAL;
468 else if ( lex_match_id (lexer, "NONE"))
469 percentile_algorithm = PC_NONE;
472 if ( 0 == subc_list_double_count (&percentile_list))
474 subc_list_double_push (&percentile_list, 5);
475 subc_list_double_push (&percentile_list, 10);
476 subc_list_double_push (&percentile_list, 25);
477 subc_list_double_push (&percentile_list, 50);
478 subc_list_double_push (&percentile_list, 75);
479 subc_list_double_push (&percentile_list, 90);
480 subc_list_double_push (&percentile_list, 95);
486 /* TOTAL and NOTOTAL are simple, mutually exclusive flags */
488 xmn_custom_total (struct lexer *lexer UNUSED, struct dataset *ds UNUSED, struct cmd_examine *p, void *aux UNUSED)
490 if ( p->sbc_nototal )
492 msg (SE, _ ("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
500 xmn_custom_nototal (struct lexer *lexer UNUSED, struct dataset *ds UNUSED,
501 struct cmd_examine *p, void *aux UNUSED)
505 msg (SE, _ ("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
514 /* Parser for the variables sub command
515 Returns 1 on success */
517 xmn_custom_variables (struct lexer *lexer, struct dataset *ds, struct cmd_examine *cmd, void *aux UNUSED)
519 const struct dictionary *dict = dataset_dict (ds);
520 lex_match (lexer, '=');
522 if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
523 && lex_token (lexer) != T_ALL)
528 if (!parse_variables_const (lexer, dict, &dependent_vars, &n_dependent_vars,
529 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
531 free (dependent_vars);
535 assert (n_dependent_vars);
537 totals = xnmalloc (n_dependent_vars, sizeof *totals);
539 if ( lex_match (lexer, T_BY))
542 success = examine_parse_independent_vars (lexer, dict, cmd);
543 if ( success != 1 ) {
544 free (dependent_vars);
555 /* Parse the clause specifying the factors */
557 examine_parse_independent_vars (struct lexer *lexer, const struct dictionary *dict, struct cmd_examine *cmd)
560 struct factor *sf = xmalloc (sizeof *sf);
562 if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
563 && lex_token (lexer) != T_ALL)
570 sf->indep_var[0] = parse_variable (lexer, dict);
571 sf->indep_var[1] = 0;
573 if ( lex_token (lexer) == T_BY )
576 lex_match (lexer, T_BY);
578 if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
579 && lex_token (lexer) != T_ALL)
585 sf->indep_var[1] = parse_variable (lexer, dict);
590 sf->fstats = hsh_create (4,
591 (hsh_compare_func *) factor_statistics_compare,
592 (hsh_hash_func *) factor_statistics_hash,
593 (hsh_free_func *) factor_statistics_free,
599 lex_match (lexer, ',');
601 if ( lex_token (lexer) == '.' || lex_token (lexer) == '/' )
604 success = examine_parse_independent_vars (lexer, dict, cmd);
615 void populate_percentiles (struct tab_table *tbl, int col, int row,
616 const struct metrics *m);
618 void populate_descriptives (struct tab_table *t, int col, int row,
619 const struct metrics *fs);
621 void populate_extremes (struct tab_table *t, int col, int row, int n,
622 const struct metrics *m);
624 void populate_summary (struct tab_table *t, int col, int row,
625 const struct metrics *m);
630 static bool bad_weight_warn = true;
633 /* Perform calculations for the sub factors */
635 factor_calc (const struct ccase *c, int case_no, double weight,
639 struct factor *fctr = factors;
643 struct factor_statistics **foo ;
644 union value *indep_vals[2] ;
646 indep_vals[0] = value_dup (
647 case_data (c, fctr->indep_var[0]),
648 var_get_width (fctr->indep_var[0])
651 if ( fctr->indep_var[1] )
652 indep_vals[1] = value_dup (
653 case_data (c, fctr->indep_var[1]),
654 var_get_width (fctr->indep_var[1])
658 const union value sm = {SYSMIS};
659 indep_vals[1] = value_dup (&sm, 0);
662 assert (fctr->fstats);
664 foo = ( struct factor_statistics ** )
665 hsh_probe (fctr->fstats, (void *) &indep_vals);
670 *foo = create_factor_statistics (n_dependent_vars,
674 for ( v = 0 ; v < n_dependent_vars ; ++v )
676 metrics_precalc ( & (*foo)->m[v] );
682 free (indep_vals[0]);
683 free (indep_vals[1]);
686 for ( v = 0 ; v < n_dependent_vars ; ++v )
688 const struct variable *var = dependent_vars[v];
689 union value *val = value_dup (
694 if (case_missing || var_is_value_missing (var, val, exclude_values))
700 metrics_calc ( & (*foo)->m[v], val, weight, case_no);
710 run_examine (const struct ccase *first, const struct casefile *cf,
711 void *cmd_, const struct dataset *ds)
713 struct dictionary *dict = dataset_dict (ds);
714 struct casereader *r;
718 const struct cmd_examine *cmd = (struct cmd_examine *) cmd_;
722 output_split_file_values (ds, first);
724 /* Make sure we haven't got rubbish left over from a
729 struct factor *next = fctr->next;
731 hsh_clear (fctr->fstats);
738 for ( v = 0 ; v < n_dependent_vars ; ++v )
739 metrics_precalc (&totals[v]);
741 for (r = casefile_get_reader (cf, NULL);
742 casereader_read (r, &c) ;
746 const int case_no = casereader_cnum (r);
748 const double weight =
749 dict_get_case_weight (dict, &c, &bad_weight_warn);
751 if ( cmd->miss == XMN_LISTWISE )
753 for ( v = 0 ; v < n_dependent_vars ; ++v )
755 const struct variable *var = dependent_vars[v];
756 union value *val = value_dup (
761 if ( var_is_value_missing (var, val, exclude_values))
768 for ( v = 0 ; v < n_dependent_vars ; ++v )
770 const struct variable *var = dependent_vars[v];
771 union value *val = value_dup (
776 if ( var_is_value_missing (var, val, exclude_values)
783 metrics_calc (&totals[v], val, weight, case_no);
788 factor_calc (&c, case_no, weight, case_missing);
791 for ( v = 0 ; v < n_dependent_vars ; ++v)
796 struct hsh_iterator hi;
797 struct factor_statistics *fs;
799 for ( fs = hsh_first (fctr->fstats, &hi);
801 fs = hsh_next (fctr->fstats, &hi))
804 fs->m[v].ptile_hash = list_to_ptile_hash (&percentile_list);
805 fs->m[v].ptile_alg = percentile_algorithm;
806 metrics_postcalc (&fs->m[v]);
812 totals[v].ptile_hash = list_to_ptile_hash (&percentile_list);
813 totals[v].ptile_alg = percentile_algorithm;
814 metrics_postcalc (&totals[v]);
818 /* Make sure that the combination of factors are complete */
823 struct hsh_iterator hi;
824 struct hsh_iterator hi0;
825 struct hsh_iterator hi1;
826 struct factor_statistics *fs;
828 struct hsh_table *idh0=0;
829 struct hsh_table *idh1=0;
833 idh0 = hsh_create (4, (hsh_compare_func *) compare_values,
834 (hsh_hash_func *) hash_value,
837 idh1 = hsh_create (4, (hsh_compare_func *) compare_values,
838 (hsh_hash_func *) hash_value,
842 for ( fs = hsh_first (fctr->fstats, &hi);
844 fs = hsh_next (fctr->fstats, &hi))
846 hsh_insert (idh0, (void *) &fs->id[0]);
847 hsh_insert (idh1, (void *) &fs->id[1]);
850 /* Ensure that the factors combination is complete */
851 for ( val0 = hsh_first (idh0, &hi0);
853 val0 = hsh_next (idh0, &hi0))
855 for ( val1 = hsh_first (idh1, &hi1);
857 val1 = hsh_next (idh1, &hi1))
859 struct factor_statistics **ffs;
864 ffs = (struct factor_statistics **)
865 hsh_probe (fctr->fstats, (void *) &key );
869 (*ffs) = create_factor_statistics (n_dependent_vars,
871 for ( i = 0 ; i < n_dependent_vars ; ++i )
872 metrics_precalc ( & (*ffs)->m[i]);
880 fctr->fs = (struct factor_statistics **) hsh_sort_copy (fctr->fstats);
891 for ( i = 0 ; i < n_dependent_vars ; ++i )
893 metrics_destroy (&totals[i]);
902 show_summary (const struct variable **dependent_var, int n_dep_var,
903 const struct factor *fctr)
905 static const char *subtitle[]=
913 int heading_columns ;
915 const int heading_rows = 3;
916 struct tab_table *tbl;
924 n_factors = hsh_count (fctr->fstats);
925 n_rows = n_dep_var * n_factors ;
927 if ( fctr->indep_var[1] )
936 n_rows += heading_rows;
938 n_cols = heading_columns + 6;
940 tbl = tab_create (n_cols,n_rows,0);
941 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
943 tab_dim (tbl, tab_natural_dimensions);
945 /* Outline the box */
950 n_cols - 1, n_rows - 1);
952 /* Vertical lines for the data only */
957 n_cols - 1, n_rows - 1);
960 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
961 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, 1 );
962 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, heading_rows -1 );
964 tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1);
967 tab_title (tbl, _ ("Case Processing Summary"));
970 tab_joint_text (tbl, heading_columns, 0,
972 TAB_CENTER | TAT_TITLE,
975 /* Remove lines ... */
982 for ( i = 0 ; i < 3 ; ++i )
984 tab_text (tbl, heading_columns + i*2 , 2, TAB_CENTER | TAT_TITLE,
987 tab_text (tbl, heading_columns + i*2 + 1, 2, TAB_CENTER | TAT_TITLE,
990 tab_joint_text (tbl, heading_columns + i*2 , 1,
991 heading_columns + i*2 + 1, 1,
992 TAB_CENTER | TAT_TITLE,
995 tab_box (tbl, -1, -1,
997 heading_columns + i*2, 1,
998 heading_columns + i*2 + 1, 1);
1003 /* Titles for the independent variables */
1006 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1007 var_to_string (fctr->indep_var[0]));
1009 if ( fctr->indep_var[1] )
1011 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1012 var_to_string (fctr->indep_var[1]));
1018 for ( i = 0 ; i < n_dep_var ; ++i )
1022 n_factors = hsh_count (fctr->fstats);
1026 tab_hline (tbl, TAL_1, 0, n_cols -1 , i * n_factors + heading_rows);
1029 0, i * n_factors + heading_rows,
1030 TAB_LEFT | TAT_TITLE,
1031 var_to_string (dependent_var[i])
1036 populate_summary (tbl, heading_columns,
1037 (i * n_factors) + heading_rows,
1043 struct factor_statistics **fs = fctr->fs;
1045 const union value *prev = NULL;
1050 0 != compare_values (prev, (*fs)->id[0],
1051 var_get_width (fctr->indep_var[0])))
1055 (i * n_factors ) + count +
1057 TAB_LEFT | TAT_TITLE,
1058 var_get_value_name (fctr->indep_var[0],
1062 if (fctr->indep_var[1] && count > 0 )
1063 tab_hline (tbl, TAL_1, 1, n_cols - 1,
1064 (i * n_factors ) + count + heading_rows);
1068 prev = (*fs)->id[0];
1071 if ( fctr->indep_var[1])
1074 (i * n_factors ) + count +
1076 TAB_LEFT | TAT_TITLE,
1077 var_get_value_name (fctr->indep_var[1], (*fs)->id[1])
1080 populate_summary (tbl, heading_columns,
1081 (i * n_factors) + count
1096 populate_summary (struct tab_table *t, int col, int row,
1097 const struct metrics *m)
1100 const double total = m->n + m->n_missing ;
1102 tab_float (t, col + 0, row + 0, TAB_RIGHT, m->n, 8, 0);
1103 tab_float (t, col + 2, row + 0, TAB_RIGHT, m->n_missing, 8, 0);
1104 tab_float (t, col + 4, row + 0, TAB_RIGHT, total, 8, 0);
1108 tab_text (t, col + 1, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1109 100.0 * m->n / total );
1111 tab_text (t, col + 3, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1112 100.0 * m->n_missing / total );
1114 /* This seems a bit pointless !!! */
1115 tab_text (t, col + 5, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1116 100.0 * total / total );
1127 show_extremes (const struct variable **dependent_var, int n_dep_var,
1128 const struct factor *fctr, int n_extremities)
1131 int heading_columns ;
1133 const int heading_rows = 1;
1134 struct tab_table *tbl;
1141 heading_columns = 2;
1142 n_factors = hsh_count (fctr->fstats);
1144 n_rows = n_dep_var * 2 * n_extremities * n_factors;
1146 if ( fctr->indep_var[1] )
1147 heading_columns = 3;
1151 heading_columns = 1;
1152 n_rows = n_dep_var * 2 * n_extremities;
1155 n_rows += heading_rows;
1157 heading_columns += 2;
1158 n_cols = heading_columns + 2;
1160 tbl = tab_create (n_cols,n_rows,0);
1161 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
1163 tab_dim (tbl, tab_natural_dimensions);
1165 /* Outline the box, No internal lines*/
1170 n_cols - 1, n_rows - 1);
1172 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1174 tab_title (tbl, _ ("Extreme Values"));
1176 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows -1);
1177 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows -1);
1181 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1182 var_to_string (fctr->indep_var[0]));
1184 if ( fctr->indep_var[1] )
1185 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1186 var_to_string (fctr->indep_var[1]));
1189 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _ ("Value"));
1190 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _ ("Case Number"));
1192 for ( i = 0 ; i < n_dep_var ; ++i )
1196 tab_hline (tbl, TAL_1, 0, n_cols -1 ,
1197 i * 2 * n_extremities * n_factors + heading_rows);
1200 i * 2 * n_extremities * n_factors + heading_rows,
1201 TAB_LEFT | TAT_TITLE,
1202 var_to_string (dependent_var[i])
1207 populate_extremes (tbl, heading_columns - 2,
1208 i * 2 * n_extremities * n_factors + heading_rows,
1209 n_extremities, &totals[i]);
1213 struct factor_statistics **fs = fctr->fs;
1215 const union value *prev = NULL;
1219 const int row = heading_rows + ( 2 * n_extremities ) *
1220 ( ( i * n_factors ) + count );
1223 if ( !prev || 0 != compare_values (prev, (*fs)->id[0],
1224 var_get_width (fctr->indep_var[0])))
1228 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1232 TAB_LEFT | TAT_TITLE,
1233 var_get_value_name (fctr->indep_var[0],
1238 prev = (*fs)->id[0];
1240 if (fctr->indep_var[1] && count > 0 )
1241 tab_hline (tbl, TAL_1, 2, n_cols - 1, row);
1243 if ( fctr->indep_var[1])
1244 tab_text (tbl, 2, row,
1245 TAB_LEFT | TAT_TITLE,
1246 var_get_value_name (fctr->indep_var[1], (*fs)->id[1])
1249 populate_extremes (tbl, heading_columns - 2,
1264 /* Fill in the extremities table */
1266 populate_extremes (struct tab_table *t,
1267 int col, int row, int n, const struct metrics *m)
1273 tab_text (t, col, row,
1274 TAB_RIGHT | TAT_TITLE ,
1278 tab_text (t, col, row + n ,
1279 TAB_RIGHT | TAT_TITLE ,
1284 tab_hline (t, TAL_1, col, col + 3, row + n );
1286 for (extremity = 0; extremity < n ; ++extremity )
1289 tab_float (t, col + 1, row + extremity,
1291 extremity + 1, 8, 0);
1295 tab_float (t, col + 1, row + extremity + n,
1297 extremity + 1, 8, 0);
1303 for (idx = 0, extremity = 0; extremity < n && idx < m->n_data ; ++idx )
1306 const struct weighted_value *wv = m->wvp[idx];
1307 struct case_node *cn = wv->case_nos;
1310 for (j = 0 ; j < wv->w ; ++j )
1312 if ( extremity + j >= n )
1315 tab_float (t, col + 3, row + extremity + j + n,
1319 tab_float (t, col + 2, row + extremity + j + n,
1328 extremity += wv->w ;
1333 for (idx = m->n_data - 1, extremity = 0; extremity < n && idx >= 0; --idx )
1336 const struct weighted_value *wv = m->wvp[idx];
1337 struct case_node *cn = wv->case_nos;
1339 for (j = 0 ; j < wv->w ; ++j )
1341 if ( extremity + j >= n )
1344 tab_float (t, col + 3, row + extremity + j,
1348 tab_float (t, col + 2, row + extremity + j,
1357 extremity += wv->w ;
1362 /* Show the descriptives table */
1364 show_descriptives (const struct variable **dependent_var,
1366 struct factor *fctr)
1369 int heading_columns ;
1371 const int n_stat_rows = 13;
1373 const int heading_rows = 1;
1375 struct tab_table *tbl;
1382 heading_columns = 4;
1383 n_factors = hsh_count (fctr->fstats);
1385 n_rows = n_dep_var * n_stat_rows * n_factors;
1387 if ( fctr->indep_var[1] )
1388 heading_columns = 5;
1392 heading_columns = 3;
1393 n_rows = n_dep_var * n_stat_rows;
1396 n_rows += heading_rows;
1398 n_cols = heading_columns + 2;
1401 tbl = tab_create (n_cols, n_rows, 0);
1403 tab_headers (tbl, heading_columns + 1, 0, heading_rows, 0);
1405 tab_dim (tbl, tab_natural_dimensions);
1407 /* Outline the box and have no internal lines*/
1412 n_cols - 1, n_rows - 1);
1414 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1416 tab_vline (tbl, TAL_1, heading_columns, 0, n_rows - 1);
1417 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows - 1);
1418 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows - 1);
1420 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _ ("Statistic"));
1421 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _ ("Std. Error"));
1423 tab_title (tbl, _ ("Descriptives"));
1426 for ( i = 0 ; i < n_dep_var ; ++i )
1428 const int row = heading_rows + i * n_stat_rows * n_factors ;
1431 tab_hline (tbl, TAL_1, 0, n_cols - 1, row );
1434 i * n_stat_rows * n_factors + heading_rows,
1435 TAB_LEFT | TAT_TITLE,
1436 var_to_string (dependent_var[i])
1442 const union value *prev = NULL;
1444 struct factor_statistics **fs = fctr->fs;
1447 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1448 var_to_string (fctr->indep_var[0]));
1451 if ( fctr->indep_var[1])
1452 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1453 var_to_string (fctr->indep_var[1]));
1457 const int row = heading_rows + n_stat_rows *
1458 ( ( i * n_factors ) + count );
1461 if ( !prev || 0 != compare_values (prev, (*fs)->id[0],
1462 var_get_width (fctr->indep_var[0])))
1466 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1470 TAB_LEFT | TAT_TITLE,
1471 var_get_value_name (fctr->indep_var[0],
1476 prev = (*fs)->id[0];
1478 if (fctr->indep_var[1] && count > 0 )
1479 tab_hline (tbl, TAL_1, 2, n_cols - 1, row);
1481 if ( fctr->indep_var[1])
1482 tab_text (tbl, 2, row,
1483 TAB_LEFT | TAT_TITLE,
1484 var_get_value_name (fctr->indep_var[1], (*fs)->id[1])
1487 populate_descriptives (tbl, heading_columns - 2,
1488 row, & (*fs)->m[i]);
1499 populate_descriptives (tbl, heading_columns - 2,
1500 i * n_stat_rows * n_factors + heading_rows,
1510 /* Fill in the descriptives data */
1512 populate_descriptives (struct tab_table *tbl, int col, int row,
1513 const struct metrics *m)
1515 const double t = gsl_cdf_tdist_Qinv ((1 - cmd.n_cinterval[0] / 100.0)/2.0,
1520 TAB_LEFT | TAT_TITLE,
1523 tab_float (tbl, col + 2,
1529 tab_float (tbl, col + 3,
1538 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1539 _ ("%g%% Confidence Interval for Mean"), cmd.n_cinterval[0]);
1542 tab_text (tbl, col + 1,
1544 TAB_LEFT | TAT_TITLE,
1547 tab_float (tbl, col + 2,
1550 m->mean - t * m->se_mean,
1553 tab_text (tbl, col + 1,
1555 TAB_LEFT | TAT_TITLE,
1559 tab_float (tbl, col + 2,
1562 m->mean + t * m->se_mean,
1567 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1568 _ ("5%% Trimmed Mean"));
1570 tab_float (tbl, col + 2,
1578 TAB_LEFT | TAT_TITLE,
1582 struct percentile *p;
1585 p = hsh_find (m->ptile_hash, &d);
1590 tab_float (tbl, col + 2,
1600 TAB_LEFT | TAT_TITLE,
1603 tab_float (tbl, col + 2,
1612 TAB_LEFT | TAT_TITLE,
1613 _ ("Std. Deviation"));
1616 tab_float (tbl, col + 2,
1625 TAB_LEFT | TAT_TITLE,
1628 tab_float (tbl, col + 2,
1636 TAB_LEFT | TAT_TITLE,
1639 tab_float (tbl, col + 2,
1648 TAB_LEFT | TAT_TITLE,
1652 tab_float (tbl, col + 2,
1660 TAB_LEFT | TAT_TITLE,
1661 _ ("Interquartile Range"));
1664 struct percentile *p1;
1665 struct percentile *p2;
1668 p1 = hsh_find (m->ptile_hash, &d);
1671 p2 = hsh_find (m->ptile_hash, &d);
1676 tab_float (tbl, col + 2,
1687 TAB_LEFT | TAT_TITLE,
1691 tab_float (tbl, col + 2,
1697 /* stderr of skewness */
1698 tab_float (tbl, col + 3,
1707 TAB_LEFT | TAT_TITLE,
1711 tab_float (tbl, col + 2,
1717 /* stderr of kurtosis */
1718 tab_float (tbl, col + 3,
1730 box_plot_variables (const struct factor *fctr,
1731 const struct variable **vars, int n_vars,
1732 const struct variable *id)
1736 struct factor_statistics **fs ;
1740 box_plot_group (fctr, vars, n_vars, id);
1744 for ( fs = fctr->fs ; *fs ; ++fs )
1746 double y_min = DBL_MAX;
1747 double y_max = -DBL_MAX;
1748 struct chart *ch = chart_create ();
1749 const char *s = factor_to_string (fctr, *fs, 0 );
1751 chart_write_title (ch, s);
1753 for ( i = 0 ; i < n_vars ; ++i )
1755 y_max = MAX (y_max, (*fs)->m[i].max);
1756 y_min = MIN (y_min, (*fs)->m[i].min);
1759 boxplot_draw_yscale (ch, y_max, y_min);
1761 for ( i = 0 ; i < n_vars ; ++i )
1764 const double box_width = (ch->data_right - ch->data_left)
1767 const double box_centre = ( i * 2 + 1) * box_width
1770 boxplot_draw_boxplot (ch,
1771 box_centre, box_width,
1773 var_to_string (vars[i]));
1785 /* Do a box plot, grouping all factors into one plot ;
1786 each dependent variable has its own plot.
1789 box_plot_group (const struct factor *fctr,
1790 const struct variable **vars,
1792 const struct variable *id UNUSED)
1797 for ( i = 0 ; i < n_vars ; ++i )
1799 struct factor_statistics **fs ;
1802 ch = chart_create ();
1804 boxplot_draw_yscale (ch, totals[i].max, totals[i].min);
1810 for ( fs = fctr->fs ; *fs ; ++fs )
1813 chart_write_title (ch, _ ("Boxplot of %s vs. %s"),
1814 var_to_string (vars[i]), var_to_string (fctr->indep_var[0]) );
1816 for ( fs = fctr->fs ; *fs ; ++fs )
1819 const char *s = factor_to_string_concise (fctr, *fs);
1821 const double box_width = (ch->data_right - ch->data_left)
1822 / (n_factors * 2.0 ) ;
1824 const double box_centre = ( f++ * 2 + 1) * box_width
1827 boxplot_draw_boxplot (ch,
1828 box_centre, box_width,
1835 const double box_width = (ch->data_right - ch->data_left) / 3.0;
1836 const double box_centre = (ch->data_right + ch->data_left) / 2.0;
1838 chart_write_title (ch, _ ("Boxplot"));
1840 boxplot_draw_boxplot (ch,
1841 box_centre, box_width,
1843 var_to_string (vars[i]) );
1852 /* Plot the normal and detrended normal plots for m
1853 Label the plots with factorname */
1855 np_plot (const struct metrics *m, const char *factorname)
1858 double yfirst=0, ylast=0;
1861 struct chart *np_chart;
1863 /* Detrended Normal Plot */
1864 struct chart *dnp_chart;
1866 /* The slope and intercept of the ideal normal probability line */
1867 const double slope = 1.0 / m->stddev;
1868 const double intercept = - m->mean / m->stddev;
1870 /* Cowardly refuse to plot an empty data set */
1871 if ( m->n_data == 0 )
1874 np_chart = chart_create ();
1875 dnp_chart = chart_create ();
1877 if ( !np_chart || ! dnp_chart )
1880 chart_write_title (np_chart, _ ("Normal Q-Q Plot of %s"), factorname);
1881 chart_write_xlabel (np_chart, _ ("Observed Value"));
1882 chart_write_ylabel (np_chart, _ ("Expected Normal"));
1885 chart_write_title (dnp_chart, _ ("Detrended Normal Q-Q Plot of %s"),
1887 chart_write_xlabel (dnp_chart, _ ("Observed Value"));
1888 chart_write_ylabel (dnp_chart, _ ("Dev from Normal"));
1890 yfirst = gsl_cdf_ugaussian_Pinv (m->wvp[0]->rank / ( m->n + 1));
1891 ylast = gsl_cdf_ugaussian_Pinv (m->wvp[m->n_data-1]->rank / ( m->n + 1));
1895 /* Need to make sure that both the scatter plot and the ideal fit into the
1897 double x_lower = MIN (m->min, (yfirst - intercept) / slope) ;
1898 double x_upper = MAX (m->max, (ylast - intercept) / slope) ;
1899 double slack = (x_upper - x_lower) * 0.05 ;
1901 chart_write_xscale (np_chart, x_lower - slack, x_upper + slack, 5);
1903 chart_write_xscale (dnp_chart, m->min, m->max, 5);
1907 chart_write_yscale (np_chart, yfirst, ylast, 5);
1910 /* We have to cache the detrended data, beacause we need to
1911 find its limits before we can plot it */
1912 double *d_data = xnmalloc (m->n_data, sizeof *d_data);
1913 double d_max = -DBL_MAX;
1914 double d_min = DBL_MAX;
1915 for ( i = 0 ; i < m->n_data; ++i )
1917 const double ns = gsl_cdf_ugaussian_Pinv (m->wvp[i]->rank / ( m->n + 1));
1919 chart_datum (np_chart, 0, m->wvp[i]->v.f, ns);
1921 d_data[i] = (m->wvp[i]->v.f - m->mean) / m->stddev - ns;
1923 if ( d_data[i] < d_min ) d_min = d_data[i];
1924 if ( d_data[i] > d_max ) d_max = d_data[i];
1926 chart_write_yscale (dnp_chart, d_min, d_max, 5);
1928 for ( i = 0 ; i < m->n_data; ++i )
1929 chart_datum (dnp_chart, 0, m->wvp[i]->v.f, d_data[i]);
1934 chart_line (np_chart, slope, intercept, yfirst, ylast , CHART_DIM_Y);
1935 chart_line (dnp_chart, 0, 0, m->min, m->max , CHART_DIM_X);
1937 chart_submit (np_chart);
1938 chart_submit (dnp_chart);
1944 /* Show the percentiles */
1946 show_percentiles (const struct variable **dependent_var,
1948 struct factor *fctr)
1950 struct tab_table *tbl;
1956 struct hsh_table *ptiles ;
1958 int n_heading_columns;
1959 const int n_heading_rows = 2;
1960 const int n_stat_rows = 2;
1966 struct factor_statistics **fs = fctr->fs ;
1967 n_heading_columns = 3;
1968 n_factors = hsh_count (fctr->fstats);
1970 ptiles = (*fs)->m[0].ptile_hash;
1972 if ( fctr->indep_var[1] )
1973 n_heading_columns = 4;
1978 n_heading_columns = 2;
1980 ptiles = totals[0].ptile_hash;
1983 n_ptiles = hsh_count (ptiles);
1985 n_rows = n_heading_rows + n_dep_var * n_stat_rows * n_factors;
1987 n_cols = n_heading_columns + n_ptiles ;
1989 tbl = tab_create (n_cols, n_rows, 0);
1991 tab_headers (tbl, n_heading_columns + 1, 0, n_heading_rows, 0);
1993 tab_dim (tbl, tab_natural_dimensions);
1995 /* Outline the box and have no internal lines*/
2000 n_cols - 1, n_rows - 1);
2002 tab_hline (tbl, TAL_2, 0, n_cols - 1, n_heading_rows );
2004 tab_vline (tbl, TAL_2, n_heading_columns, 0, n_rows - 1);
2007 tab_title (tbl, _ ("Percentiles"));
2010 tab_hline (tbl, TAL_1, n_heading_columns, n_cols - 1, 1 );
2017 n_heading_columns - 1, n_rows - 1);
2023 n_heading_columns, n_heading_rows - 1,
2024 n_cols - 1, n_rows - 1);
2026 tab_joint_text (tbl, n_heading_columns + 1, 0,
2028 TAB_CENTER | TAT_TITLE ,
2033 /* Put in the percentile break points as headings */
2035 struct percentile **p = (struct percentile **) hsh_sort (ptiles);
2040 tab_float (tbl, n_heading_columns + i++ , 1,
2049 for ( i = 0 ; i < n_dep_var ; ++i )
2051 const int n_stat_rows = 2;
2052 const int row = n_heading_rows + i * n_stat_rows * n_factors ;
2055 tab_hline (tbl, TAL_1, 0, n_cols - 1, row );
2058 i * n_stat_rows * n_factors + n_heading_rows,
2059 TAB_LEFT | TAT_TITLE,
2060 var_to_string (dependent_var[i])
2065 const union value *prev = NULL ;
2066 struct factor_statistics **fs = fctr->fs;
2069 tab_text (tbl, 1, n_heading_rows - 1,
2070 TAB_CENTER | TAT_TITLE,
2071 var_to_string (fctr->indep_var[0]));
2074 if ( fctr->indep_var[1])
2075 tab_text (tbl, 2, n_heading_rows - 1, TAB_CENTER | TAT_TITLE,
2076 var_to_string (fctr->indep_var[1]));
2080 const int row = n_heading_rows + n_stat_rows *
2081 ( ( i * n_factors ) + count );
2084 if ( !prev || 0 != compare_values (prev, (*fs)->id[0],
2085 var_get_width (fctr->indep_var[0])))
2089 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
2093 TAB_LEFT | TAT_TITLE,
2094 var_get_value_name (fctr->indep_var[0],
2101 prev = (*fs)->id[0];
2103 if (fctr->indep_var[1] && count > 0 )
2104 tab_hline (tbl, TAL_1, 2, n_cols - 1, row);
2106 if ( fctr->indep_var[1])
2107 tab_text (tbl, 2, row,
2108 TAB_LEFT | TAT_TITLE,
2109 var_get_value_name (fctr->indep_var[1], (*fs)->id[1])
2113 populate_percentiles (tbl, n_heading_columns - 1,
2114 row, & (*fs)->m[i]);
2125 populate_percentiles (tbl, n_heading_columns - 1,
2126 i * n_stat_rows * n_factors + n_heading_rows,
2143 populate_percentiles (struct tab_table *tbl, int col, int row,
2144 const struct metrics *m)
2148 struct percentile **p = (struct percentile **) hsh_sort (m->ptile_hash);
2152 TAB_LEFT | TAT_TITLE,
2153 _ ("Tukey\'s Hinges")
2158 TAB_LEFT | TAT_TITLE,
2159 ptile_alg_desc[m->ptile_alg]
2166 tab_float (tbl, col + i + 1 , row,
2169 if ( (*p)->p == 25 )
2170 tab_float (tbl, col + i + 1 , row + 1,
2174 if ( (*p)->p == 50 )
2175 tab_float (tbl, col + i + 1 , row + 1,
2179 if ( (*p)->p == 75 )
2180 tab_float (tbl, col + i + 1 , row + 1,
2195 factor_to_string (const struct factor *fctr,
2196 const struct factor_statistics *fs,
2197 const struct variable *var)
2200 static char buf1[100];
2206 sprintf (buf1, "%s (",var_to_string (var) );
2209 snprintf (buf2, 100, "%s = %s",
2210 var_to_string (fctr->indep_var[0]),
2211 var_get_value_name (fctr->indep_var[0], fs->id[0]));
2213 strcat (buf1, buf2);
2215 if ( fctr->indep_var[1] )
2217 sprintf (buf2, "; %s = %s)",
2218 var_to_string (fctr->indep_var[1]),
2219 var_get_value_name (fctr->indep_var[1], fs->id[1]));
2220 strcat (buf1, buf2);
2234 factor_to_string_concise (const struct factor *fctr,
2235 struct factor_statistics *fs)
2239 static char buf[100];
2243 snprintf (buf, 100, "%s",
2244 var_get_value_name (fctr->indep_var[0], fs->id[0]));
2246 if ( fctr->indep_var[1] )
2248 sprintf (buf2, ",%s)", var_get_value_name (fctr->indep_var[1],