1 /* PSPP - EXAMINE data for normality . -*-c-*-
3 Copyright (C) 2004 Free Software Foundation, Inc.
4 Author: John Darrington 2004
6 This program is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 #include <gsl/gsl_cdf.h>
24 #include <libpspp/message.h>
29 #include <data/case.h>
30 #include <data/casefile.h>
31 #include <data/dictionary.h>
32 #include <data/procedure.h>
33 #include <data/value-labels.h>
34 #include <data/variable.h>
35 #include <language/command.h>
36 #include <language/dictionary/split-file.h>
37 #include <language/lexer/lexer.h>
38 #include <libpspp/alloc.h>
39 #include <libpspp/compiler.h>
40 #include <libpspp/hash.h>
41 #include <libpspp/magic.h>
42 #include <libpspp/message.h>
43 #include <libpspp/misc.h>
44 #include <libpspp/str.h>
45 #include <math/factor-stats.h>
46 #include <math/moments.h>
47 #include <math/percentiles.h>
48 #include <output/charts/box-whisker.h>
49 #include <output/charts/cartesian.h>
50 #include <output/manager.h>
51 #include <output/table.h>
54 #define _(msgid) gettext (msgid)
55 #define N_(msgid) msgid
58 #include <output/chart.h>
59 #include <output/charts/plot-hist.h>
60 #include <output/charts/plot-chart.h>
67 missing=miss:pairwise/!listwise,
69 incl:include/!exclude;
70 +compare=cmp:variables/!groups;
73 +plot[plt_]=stemleaf,boxplot,npplot,:spreadlevel(*d:n),histogram,all,none;
75 +statistics[st_]=descriptives,:extreme(*d:n),all,none.
84 static struct cmd_examine cmd;
86 static struct variable **dependent_vars;
88 static size_t n_dependent_vars;
93 /* The independent variable */
94 struct variable *indep_var[2];
97 /* Hash table of factor stats indexed by 2 values */
98 struct hsh_table *fstats;
100 /* The hash table after it has been crunched */
101 struct factor_statistics **fs;
107 /* Linked list of factors */
108 static struct factor *factors=0;
110 static struct metrics *totals=0;
112 /* Parse the clause specifying the factors */
113 static int examine_parse_independent_vars (struct lexer *lexer, const struct dictionary *dict, struct cmd_examine *cmd);
117 /* Output functions */
118 static void show_summary (struct variable **dependent_var, int n_dep_var,
119 const struct factor *f);
121 static void show_extremes (struct variable **dependent_var,
123 const struct factor *factor,
126 static void show_descriptives (struct variable **dependent_var,
128 struct factor *factor);
130 static void show_percentiles (struct variable **dependent_var,
132 struct factor *factor);
137 void np_plot (const struct metrics *m, const char *factorname);
140 void box_plot_group (const struct factor *fctr,
141 const struct variable **vars, int n_vars,
142 const struct variable *id
146 void box_plot_variables (const struct factor *fctr,
147 const struct variable **vars, int n_vars,
148 const struct variable *id
153 /* Per Split function */
154 static bool run_examine (const struct ccase *,
155 const struct casefile *cf, void *cmd_, const struct dataset *);
157 static void output_examine (void);
160 void factor_calc (struct ccase *c, int case_no,
161 double weight, int case_missing);
164 /* Represent a factor as a string, so it can be
165 printed in a human readable fashion */
166 const char * factor_to_string (const struct factor *fctr,
167 struct factor_statistics *fs,
168 const struct variable *var);
171 /* Represent a factor as a string, so it can be
172 printed in a human readable fashion,
173 but sacrificing some readablility for the sake of brevity */
174 const char *factor_to_string_concise (const struct factor *fctr,
175 struct factor_statistics *fs);
180 /* Function to use for testing for missing values */
181 static is_missing_func *value_is_missing;
186 static subc_list_double percentile_list;
188 static enum pc_alg percentile_algorithm;
190 static short sbc_percentile;
194 cmd_examine (struct lexer *lexer, struct dataset *ds)
198 subc_list_double_create (&percentile_list);
199 percentile_algorithm = PC_HAVERAGE;
201 if ( !parse_examine (lexer, ds, &cmd, NULL) )
203 subc_list_double_destroy (&percentile_list);
207 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
208 if (cmd.incl == XMN_INCLUDE )
209 value_is_missing = mv_is_value_system_missing;
211 value_is_missing = mv_is_value_missing;
213 if ( cmd.st_n == SYSMIS )
216 if ( ! cmd.sbc_cinterval)
217 cmd.n_cinterval[0] = 95.0;
219 /* If descriptives have been requested, make sure the
220 quartiles are calculated */
221 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] )
223 subc_list_double_push (&percentile_list, 25);
224 subc_list_double_push (&percentile_list, 50);
225 subc_list_double_push (&percentile_list, 75);
228 ok = multipass_procedure_with_splits (ds, run_examine, &cmd);
235 if ( dependent_vars )
236 free (dependent_vars);
239 struct factor *f = factors ;
242 struct factor *ff = f;
246 hsh_destroy ( ff->fstats ) ;
252 subc_list_double_destroy (&percentile_list);
254 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
259 /* Show all the appropriate tables */
261 output_examine (void)
265 /* Show totals if appropriate */
266 if ( ! cmd.sbc_nototal || factors == 0 )
268 show_summary (dependent_vars, n_dependent_vars, 0);
270 if ( cmd.sbc_statistics )
272 if ( cmd.a_statistics[XMN_ST_EXTREME])
273 show_extremes (dependent_vars, n_dependent_vars, 0, cmd.st_n);
275 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
276 show_descriptives (dependent_vars, n_dependent_vars, 0);
279 if ( sbc_percentile )
280 show_percentiles (dependent_vars, n_dependent_vars, 0);
285 if ( cmd.a_plot[XMN_PLT_STEMLEAF] )
286 msg (SW, _ ("%s is not currently supported."), "STEMLEAF");
288 if ( cmd.a_plot[XMN_PLT_SPREADLEVEL] )
289 msg (SW, _ ("%s is not currently supported."), "SPREADLEVEL");
291 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
293 for ( v = 0 ; v < n_dependent_vars; ++v )
294 np_plot (&totals[v], var_to_string (dependent_vars[v]));
297 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
299 if ( cmd.cmp == XMN_GROUPS )
301 box_plot_group (0, (const struct variable **) dependent_vars,
302 n_dependent_vars, cmd.v_id);
305 box_plot_variables (0,
306 (const struct variable **) dependent_vars,
307 n_dependent_vars, cmd.v_id);
310 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
312 for ( v = 0 ; v < n_dependent_vars; ++v )
314 struct normal_curve normal;
316 normal.N = totals[v].n;
317 normal.mean = totals[v].mean;
318 normal.stddev = totals[v].stddev;
320 histogram_plot (totals[v].histogram,
321 var_to_string (dependent_vars[v]),
331 /* Show grouped statistics as appropriate */
335 show_summary (dependent_vars, n_dependent_vars, fctr);
337 if ( cmd.sbc_statistics )
339 if ( cmd.a_statistics[XMN_ST_EXTREME])
340 show_extremes (dependent_vars, n_dependent_vars, fctr, cmd.st_n);
342 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
343 show_descriptives (dependent_vars, n_dependent_vars, fctr);
346 if ( sbc_percentile )
347 show_percentiles (dependent_vars, n_dependent_vars, fctr);
354 struct factor_statistics **fs = fctr->fs ;
356 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
358 if ( cmd.cmp == XMN_VARIABLES )
359 box_plot_variables (fctr,
360 (const struct variable **) dependent_vars,
361 n_dependent_vars, cmd.v_id);
363 box_plot_group (fctr,
364 (const struct variable **) dependent_vars,
365 n_dependent_vars, cmd.v_id);
368 for ( v = 0 ; v < n_dependent_vars; ++v )
371 for ( fs = fctr->fs ; *fs ; ++fs )
373 const char *s = factor_to_string (fctr, *fs, dependent_vars[v]);
375 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
376 np_plot (& (*fs)->m[v], s);
378 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
380 struct normal_curve normal;
382 normal.N = (*fs)->m[v].n;
383 normal.mean = (*fs)->m[v].mean;
384 normal.stddev = (*fs)->m[v].stddev;
386 histogram_plot ((*fs)->m[v].histogram,
390 } /* for ( fs .... */
392 } /* for ( v = 0 ..... */
402 /* Create a hash table of percentiles and their values from the list of
404 static struct hsh_table *
405 list_to_ptile_hash (const subc_list_double *l)
409 struct hsh_table *h ;
411 h = hsh_create (subc_list_double_count (l),
412 (hsh_compare_func *) ptile_compare,
413 (hsh_hash_func *) ptile_hash,
414 (hsh_free_func *) free,
418 for ( i = 0 ; i < subc_list_double_count (l) ; ++i )
420 struct percentile *p = xmalloc (sizeof *p);
422 p->p = subc_list_double_at (l,i);
433 /* Parse the PERCENTILES subcommand */
435 xmn_custom_percentiles (struct lexer *lexer, struct dataset *ds UNUSED,
436 struct cmd_examine *p UNUSED, void *aux UNUSED)
440 lex_match (lexer, '=');
442 lex_match (lexer, '(');
444 while ( lex_is_number (lexer) )
446 subc_list_double_push (&percentile_list, lex_number (lexer));
450 lex_match (lexer, ',') ;
452 lex_match (lexer, ')');
454 lex_match (lexer, '=');
456 if ( lex_match_id (lexer, "HAVERAGE"))
457 percentile_algorithm = PC_HAVERAGE;
459 else if ( lex_match_id (lexer, "WAVERAGE"))
460 percentile_algorithm = PC_WAVERAGE;
462 else if ( lex_match_id (lexer, "ROUND"))
463 percentile_algorithm = PC_ROUND;
465 else if ( lex_match_id (lexer, "EMPIRICAL"))
466 percentile_algorithm = PC_EMPIRICAL;
468 else if ( lex_match_id (lexer, "AEMPIRICAL"))
469 percentile_algorithm = PC_AEMPIRICAL;
471 else if ( lex_match_id (lexer, "NONE"))
472 percentile_algorithm = PC_NONE;
475 if ( 0 == subc_list_double_count (&percentile_list))
477 subc_list_double_push (&percentile_list, 5);
478 subc_list_double_push (&percentile_list, 10);
479 subc_list_double_push (&percentile_list, 25);
480 subc_list_double_push (&percentile_list, 50);
481 subc_list_double_push (&percentile_list, 75);
482 subc_list_double_push (&percentile_list, 90);
483 subc_list_double_push (&percentile_list, 95);
489 /* TOTAL and NOTOTAL are simple, mutually exclusive flags */
491 xmn_custom_total (struct lexer *lexer UNUSED, struct dataset *ds UNUSED, struct cmd_examine *p, void *aux UNUSED)
493 if ( p->sbc_nototal )
495 msg (SE, _ ("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
503 xmn_custom_nototal (struct lexer *lexer UNUSED, struct dataset *ds UNUSED,
504 struct cmd_examine *p, void *aux UNUSED)
508 msg (SE, _ ("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
517 /* Parser for the variables sub command
518 Returns 1 on success */
520 xmn_custom_variables (struct lexer *lexer, struct dataset *ds, struct cmd_examine *cmd, void *aux UNUSED)
522 const struct dictionary *dict = dataset_dict (ds);
523 lex_match (lexer, '=');
525 if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
526 && lex_token (lexer) != T_ALL)
531 if (!parse_variables (lexer, dict, &dependent_vars, &n_dependent_vars,
532 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
534 free (dependent_vars);
538 assert (n_dependent_vars);
540 totals = xnmalloc (n_dependent_vars, sizeof *totals);
542 if ( lex_match (lexer, T_BY))
545 success = examine_parse_independent_vars (lexer, dict, cmd);
546 if ( success != 1 ) {
547 free (dependent_vars);
558 /* Parse the clause specifying the factors */
560 examine_parse_independent_vars (struct lexer *lexer, const struct dictionary *dict, struct cmd_examine *cmd)
563 struct factor *sf = xmalloc (sizeof *sf);
565 if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
566 && lex_token (lexer) != T_ALL)
573 sf->indep_var[0] = parse_variable (lexer, dict);
574 sf->indep_var[1] = 0;
576 if ( lex_token (lexer) == T_BY )
579 lex_match (lexer, T_BY);
581 if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
582 && lex_token (lexer) != T_ALL)
588 sf->indep_var[1] = parse_variable (lexer, dict);
593 sf->fstats = hsh_create (4,
594 (hsh_compare_func *) factor_statistics_compare,
595 (hsh_hash_func *) factor_statistics_hash,
596 (hsh_free_func *) factor_statistics_free,
602 lex_match (lexer, ',');
604 if ( lex_token (lexer) == '.' || lex_token (lexer) == '/' )
607 success = examine_parse_independent_vars (lexer, dict, cmd);
618 void populate_percentiles (struct tab_table *tbl, int col, int row,
619 const struct metrics *m);
621 void populate_descriptives (struct tab_table *t, int col, int row,
622 const struct metrics *fs);
624 void populate_extremes (struct tab_table *t, int col, int row, int n,
625 const struct metrics *m);
627 void populate_summary (struct tab_table *t, int col, int row,
628 const struct metrics *m);
633 static bool bad_weight_warn = true;
636 /* Perform calculations for the sub factors */
638 factor_calc (struct ccase *c, int case_no, double weight, int case_missing)
641 struct factor *fctr = factors;
645 struct factor_statistics **foo ;
646 union value indep_vals[2] ;
648 indep_vals[0] = * case_data (c, fctr->indep_var[0]->fv);
650 if ( fctr->indep_var[1] )
651 indep_vals[1] = * case_data (c, fctr->indep_var[1]->fv);
653 indep_vals[1].f = SYSMIS;
655 assert (fctr->fstats);
657 foo = ( struct factor_statistics ** )
658 hsh_probe (fctr->fstats, (void *) &indep_vals);
663 *foo = create_factor_statistics (n_dependent_vars,
667 for ( v = 0 ; v < n_dependent_vars ; ++v )
669 metrics_precalc ( & (*foo)->m[v] );
674 for ( v = 0 ; v < n_dependent_vars ; ++v )
676 const struct variable *var = dependent_vars[v];
677 const union value *val = case_data (c, var->fv);
679 if ( value_is_missing (&var->miss, val) || case_missing )
682 metrics_calc ( & (*foo)->m[v], val, weight, case_no);
693 run_examine (const struct ccase *first, const struct casefile *cf,
694 void *cmd_, const struct dataset *ds)
696 struct dictionary *dict = dataset_dict (ds);
697 struct casereader *r;
701 const struct cmd_examine *cmd = (struct cmd_examine *) cmd_;
705 output_split_file_values (ds, first);
707 /* Make sure we haven't got rubbish left over from a
712 struct factor *next = fctr->next;
714 hsh_clear (fctr->fstats);
721 for ( v = 0 ; v < n_dependent_vars ; ++v )
722 metrics_precalc (&totals[v]);
724 for (r = casefile_get_reader (cf, NULL);
725 casereader_read (r, &c) ;
729 const int case_no = casereader_cnum (r);
731 const double weight =
732 dict_get_case_weight (dict, &c, &bad_weight_warn);
734 if ( cmd->miss == XMN_LISTWISE )
736 for ( v = 0 ; v < n_dependent_vars ; ++v )
738 const struct variable *var = dependent_vars[v];
739 const union value *val = case_data (&c, var->fv);
741 if ( value_is_missing (&var->miss, val))
747 for ( v = 0 ; v < n_dependent_vars ; ++v )
749 const struct variable *var = dependent_vars[v];
750 const union value *val = case_data (&c, var->fv);
752 if ( value_is_missing (&var->miss, val) || case_missing )
755 metrics_calc (&totals[v], val, weight, case_no);
759 factor_calc (&c, case_no, weight, case_missing);
764 for ( v = 0 ; v < n_dependent_vars ; ++v)
769 struct hsh_iterator hi;
770 struct factor_statistics *fs;
772 for ( fs = hsh_first (fctr->fstats, &hi);
774 fs = hsh_next (fctr->fstats, &hi))
777 fs->m[v].ptile_hash = list_to_ptile_hash (&percentile_list);
778 fs->m[v].ptile_alg = percentile_algorithm;
779 metrics_postcalc (&fs->m[v]);
785 totals[v].ptile_hash = list_to_ptile_hash (&percentile_list);
786 totals[v].ptile_alg = percentile_algorithm;
787 metrics_postcalc (&totals[v]);
791 /* Make sure that the combination of factors are complete */
796 struct hsh_iterator hi;
797 struct hsh_iterator hi0;
798 struct hsh_iterator hi1;
799 struct factor_statistics *fs;
801 struct hsh_table *idh0=0;
802 struct hsh_table *idh1=0;
806 idh0 = hsh_create (4, (hsh_compare_func *) compare_values,
807 (hsh_hash_func *) hash_value,
810 idh1 = hsh_create (4, (hsh_compare_func *) compare_values,
811 (hsh_hash_func *) hash_value,
815 for ( fs = hsh_first (fctr->fstats, &hi);
817 fs = hsh_next (fctr->fstats, &hi))
819 hsh_insert (idh0, (void *) &fs->id[0]);
820 hsh_insert (idh1, (void *) &fs->id[1]);
823 /* Ensure that the factors combination is complete */
824 for ( val0 = hsh_first (idh0, &hi0);
826 val0 = hsh_next (idh0, &hi0))
828 for ( val1 = hsh_first (idh1, &hi1);
830 val1 = hsh_next (idh1, &hi1))
832 struct factor_statistics **ffs;
837 ffs = (struct factor_statistics **)
838 hsh_probe (fctr->fstats, (void *) &key );
842 (*ffs) = create_factor_statistics (n_dependent_vars,
844 for ( i = 0 ; i < n_dependent_vars ; ++i )
845 metrics_precalc ( & (*ffs)->m[i]);
853 fctr->fs = (struct factor_statistics **) hsh_sort_copy (fctr->fstats);
864 for ( i = 0 ; i < n_dependent_vars ; ++i )
866 metrics_destroy (&totals[i]);
875 show_summary (struct variable **dependent_var, int n_dep_var,
876 const struct factor *fctr)
878 static const char *subtitle[]=
886 int heading_columns ;
888 const int heading_rows = 3;
889 struct tab_table *tbl;
897 n_factors = hsh_count (fctr->fstats);
898 n_rows = n_dep_var * n_factors ;
900 if ( fctr->indep_var[1] )
909 n_rows += heading_rows;
911 n_cols = heading_columns + 6;
913 tbl = tab_create (n_cols,n_rows,0);
914 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
916 tab_dim (tbl, tab_natural_dimensions);
918 /* Outline the box */
923 n_cols - 1, n_rows - 1);
925 /* Vertical lines for the data only */
930 n_cols - 1, n_rows - 1);
933 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
934 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, 1 );
935 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, heading_rows -1 );
937 tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1);
940 tab_title (tbl, _ ("Case Processing Summary"));
943 tab_joint_text (tbl, heading_columns, 0,
945 TAB_CENTER | TAT_TITLE,
948 /* Remove lines ... */
955 for ( i = 0 ; i < 3 ; ++i )
957 tab_text (tbl, heading_columns + i*2 , 2, TAB_CENTER | TAT_TITLE,
960 tab_text (tbl, heading_columns + i*2 + 1, 2, TAB_CENTER | TAT_TITLE,
963 tab_joint_text (tbl, heading_columns + i*2 , 1,
964 heading_columns + i*2 + 1, 1,
965 TAB_CENTER | TAT_TITLE,
968 tab_box (tbl, -1, -1,
970 heading_columns + i*2, 1,
971 heading_columns + i*2 + 1, 1);
976 /* Titles for the independent variables */
979 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
980 var_to_string (fctr->indep_var[0]));
982 if ( fctr->indep_var[1] )
984 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
985 var_to_string (fctr->indep_var[1]));
991 for ( i = 0 ; i < n_dep_var ; ++i )
995 n_factors = hsh_count (fctr->fstats);
999 tab_hline (tbl, TAL_1, 0, n_cols -1 , i * n_factors + heading_rows);
1002 0, i * n_factors + heading_rows,
1003 TAB_LEFT | TAT_TITLE,
1004 var_to_string (dependent_var[i])
1009 populate_summary (tbl, heading_columns,
1010 (i * n_factors) + heading_rows,
1016 struct factor_statistics **fs = fctr->fs;
1021 static union value prev;
1023 if ( 0 != compare_values (&prev, & (*fs)->id[0],
1024 fctr->indep_var[0]->width))
1028 (i * n_factors ) + count +
1030 TAB_LEFT | TAT_TITLE,
1031 value_to_string (& (*fs)->id[0], fctr->indep_var[0])
1034 if (fctr->indep_var[1] && count > 0 )
1035 tab_hline (tbl, TAL_1, 1, n_cols - 1,
1036 (i * n_factors ) + count + heading_rows);
1040 prev = (*fs)->id[0];
1043 if ( fctr->indep_var[1])
1046 (i * n_factors ) + count +
1048 TAB_LEFT | TAT_TITLE,
1049 value_to_string (& (*fs)->id[1], fctr->indep_var[1])
1052 populate_summary (tbl, heading_columns,
1053 (i * n_factors) + count
1068 populate_summary (struct tab_table *t, int col, int row,
1069 const struct metrics *m)
1072 const double total = m->n + m->n_missing ;
1074 tab_float (t, col + 0, row + 0, TAB_RIGHT, m->n, 8, 0);
1075 tab_float (t, col + 2, row + 0, TAB_RIGHT, m->n_missing, 8, 0);
1076 tab_float (t, col + 4, row + 0, TAB_RIGHT, total, 8, 0);
1080 tab_text (t, col + 1, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1081 100.0 * m->n / total );
1083 tab_text (t, col + 3, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1084 100.0 * m->n_missing / total );
1086 /* This seems a bit pointless !!! */
1087 tab_text (t, col + 5, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1088 100.0 * total / total );
1099 show_extremes (struct variable **dependent_var, int n_dep_var,
1100 const struct factor *fctr, int n_extremities)
1103 int heading_columns ;
1105 const int heading_rows = 1;
1106 struct tab_table *tbl;
1113 heading_columns = 2;
1114 n_factors = hsh_count (fctr->fstats);
1116 n_rows = n_dep_var * 2 * n_extremities * n_factors;
1118 if ( fctr->indep_var[1] )
1119 heading_columns = 3;
1123 heading_columns = 1;
1124 n_rows = n_dep_var * 2 * n_extremities;
1127 n_rows += heading_rows;
1129 heading_columns += 2;
1130 n_cols = heading_columns + 2;
1132 tbl = tab_create (n_cols,n_rows,0);
1133 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
1135 tab_dim (tbl, tab_natural_dimensions);
1137 /* Outline the box, No internal lines*/
1142 n_cols - 1, n_rows - 1);
1144 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1146 tab_title (tbl, _ ("Extreme Values"));
1148 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows -1);
1149 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows -1);
1153 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1154 var_to_string (fctr->indep_var[0]));
1156 if ( fctr->indep_var[1] )
1157 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1158 var_to_string (fctr->indep_var[1]));
1161 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _ ("Value"));
1162 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _ ("Case Number"));
1164 for ( i = 0 ; i < n_dep_var ; ++i )
1168 tab_hline (tbl, TAL_1, 0, n_cols -1 ,
1169 i * 2 * n_extremities * n_factors + heading_rows);
1172 i * 2 * n_extremities * n_factors + heading_rows,
1173 TAB_LEFT | TAT_TITLE,
1174 var_to_string (dependent_var[i])
1179 populate_extremes (tbl, heading_columns - 2,
1180 i * 2 * n_extremities * n_factors + heading_rows,
1181 n_extremities, &totals[i]);
1185 struct factor_statistics **fs = fctr->fs;
1190 static union value prev ;
1192 const int row = heading_rows + ( 2 * n_extremities ) *
1193 ( ( i * n_factors ) + count );
1196 if ( 0 != compare_values (&prev, & (*fs)->id[0],
1197 fctr->indep_var[0]->width))
1201 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1205 TAB_LEFT | TAT_TITLE,
1206 value_to_string (& (*fs)->id[0], fctr->indep_var[0])
1210 prev = (*fs)->id[0];
1212 if (fctr->indep_var[1] && count > 0 )
1213 tab_hline (tbl, TAL_1, 2, n_cols - 1, row);
1215 if ( fctr->indep_var[1])
1216 tab_text (tbl, 2, row,
1217 TAB_LEFT | TAT_TITLE,
1218 value_to_string (& (*fs)->id[1], fctr->indep_var[1])
1221 populate_extremes (tbl, heading_columns - 2,
1236 /* Fill in the extremities table */
1238 populate_extremes (struct tab_table *t,
1239 int col, int row, int n, const struct metrics *m)
1245 tab_text (t, col, row,
1246 TAB_RIGHT | TAT_TITLE ,
1250 tab_text (t, col, row + n ,
1251 TAB_RIGHT | TAT_TITLE ,
1256 tab_hline (t, TAL_1, col, col + 3, row + n );
1258 for (extremity = 0; extremity < n ; ++extremity )
1261 tab_float (t, col + 1, row + extremity,
1263 extremity + 1, 8, 0);
1267 tab_float (t, col + 1, row + extremity + n,
1269 extremity + 1, 8, 0);
1275 for (idx = 0, extremity = 0; extremity < n && idx < m->n_data ; ++idx )
1278 const struct weighted_value *wv = m->wvp[idx];
1279 struct case_node *cn = wv->case_nos;
1282 for (j = 0 ; j < wv->w ; ++j )
1284 if ( extremity + j >= n )
1287 tab_float (t, col + 3, row + extremity + j + n,
1291 tab_float (t, col + 2, row + extremity + j + n,
1300 extremity += wv->w ;
1305 for (idx = m->n_data - 1, extremity = 0; extremity < n && idx >= 0; --idx )
1308 const struct weighted_value *wv = m->wvp[idx];
1309 struct case_node *cn = wv->case_nos;
1311 for (j = 0 ; j < wv->w ; ++j )
1313 if ( extremity + j >= n )
1316 tab_float (t, col + 3, row + extremity + j,
1320 tab_float (t, col + 2, row + extremity + j,
1329 extremity += wv->w ;
1334 /* Show the descriptives table */
1336 show_descriptives (struct variable **dependent_var,
1338 struct factor *fctr)
1341 int heading_columns ;
1343 const int n_stat_rows = 13;
1345 const int heading_rows = 1;
1347 struct tab_table *tbl;
1354 heading_columns = 4;
1355 n_factors = hsh_count (fctr->fstats);
1357 n_rows = n_dep_var * n_stat_rows * n_factors;
1359 if ( fctr->indep_var[1] )
1360 heading_columns = 5;
1364 heading_columns = 3;
1365 n_rows = n_dep_var * n_stat_rows;
1368 n_rows += heading_rows;
1370 n_cols = heading_columns + 2;
1373 tbl = tab_create (n_cols, n_rows, 0);
1375 tab_headers (tbl, heading_columns + 1, 0, heading_rows, 0);
1377 tab_dim (tbl, tab_natural_dimensions);
1379 /* Outline the box and have no internal lines*/
1384 n_cols - 1, n_rows - 1);
1386 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1388 tab_vline (tbl, TAL_1, heading_columns, 0, n_rows - 1);
1389 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows - 1);
1390 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows - 1);
1392 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _ ("Statistic"));
1393 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _ ("Std. Error"));
1395 tab_title (tbl, _ ("Descriptives"));
1398 for ( i = 0 ; i < n_dep_var ; ++i )
1400 const int row = heading_rows + i * n_stat_rows * n_factors ;
1403 tab_hline (tbl, TAL_1, 0, n_cols - 1, row );
1406 i * n_stat_rows * n_factors + heading_rows,
1407 TAB_LEFT | TAT_TITLE,
1408 var_to_string (dependent_var[i])
1414 struct factor_statistics **fs = fctr->fs;
1417 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1418 var_to_string (fctr->indep_var[0]));
1421 if ( fctr->indep_var[1])
1422 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1423 var_to_string (fctr->indep_var[1]));
1428 static union value prev ;
1430 const int row = heading_rows + n_stat_rows *
1431 ( ( i * n_factors ) + count );
1434 if ( 0 != compare_values (&prev, & (*fs)->id[0],
1435 fctr->indep_var[0]->width))
1439 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1443 TAB_LEFT | TAT_TITLE,
1444 value_to_string (& (*fs)->id[0], fctr->indep_var[0])
1448 prev = (*fs)->id[0];
1450 if (fctr->indep_var[1] && count > 0 )
1451 tab_hline (tbl, TAL_1, 2, n_cols - 1, row);
1453 if ( fctr->indep_var[1])
1454 tab_text (tbl, 2, row,
1455 TAB_LEFT | TAT_TITLE,
1456 value_to_string (& (*fs)->id[1], fctr->indep_var[1])
1459 populate_descriptives (tbl, heading_columns - 2,
1460 row, & (*fs)->m[i]);
1471 populate_descriptives (tbl, heading_columns - 2,
1472 i * n_stat_rows * n_factors + heading_rows,
1484 /* Fill in the descriptives data */
1486 populate_descriptives (struct tab_table *tbl, int col, int row,
1487 const struct metrics *m)
1490 const double t = gsl_cdf_tdist_Qinv (1 - cmd.n_cinterval[0]/100.0/2.0, \
1496 TAB_LEFT | TAT_TITLE,
1499 tab_float (tbl, col + 2,
1505 tab_float (tbl, col + 3,
1514 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1515 _ ("%g%% Confidence Interval for Mean"), cmd.n_cinterval[0]);
1518 tab_text (tbl, col + 1,
1520 TAB_LEFT | TAT_TITLE,
1523 tab_float (tbl, col + 2,
1526 m->mean - t * m->se_mean,
1529 tab_text (tbl, col + 1,
1531 TAB_LEFT | TAT_TITLE,
1535 tab_float (tbl, col + 2,
1538 m->mean + t * m->se_mean,
1543 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1544 _ ("5%% Trimmed Mean"));
1546 tab_float (tbl, col + 2,
1554 TAB_LEFT | TAT_TITLE,
1558 struct percentile *p;
1561 p = hsh_find (m->ptile_hash, &d);
1566 tab_float (tbl, col + 2,
1576 TAB_LEFT | TAT_TITLE,
1579 tab_float (tbl, col + 2,
1588 TAB_LEFT | TAT_TITLE,
1589 _ ("Std. Deviation"));
1592 tab_float (tbl, col + 2,
1601 TAB_LEFT | TAT_TITLE,
1604 tab_float (tbl, col + 2,
1612 TAB_LEFT | TAT_TITLE,
1615 tab_float (tbl, col + 2,
1624 TAB_LEFT | TAT_TITLE,
1628 tab_float (tbl, col + 2,
1636 TAB_LEFT | TAT_TITLE,
1637 _ ("Interquartile Range"));
1640 struct percentile *p1;
1641 struct percentile *p2;
1644 p1 = hsh_find (m->ptile_hash, &d);
1647 p2 = hsh_find (m->ptile_hash, &d);
1652 tab_float (tbl, col + 2,
1663 TAB_LEFT | TAT_TITLE,
1667 tab_float (tbl, col + 2,
1673 /* stderr of skewness */
1674 tab_float (tbl, col + 3,
1683 TAB_LEFT | TAT_TITLE,
1687 tab_float (tbl, col + 2,
1693 /* stderr of kurtosis */
1694 tab_float (tbl, col + 3,
1706 box_plot_variables (const struct factor *fctr,
1707 const struct variable **vars, int n_vars,
1708 const struct variable *id)
1712 struct factor_statistics **fs ;
1716 box_plot_group (fctr, vars, n_vars, id);
1720 for ( fs = fctr->fs ; *fs ; ++fs )
1722 double y_min = DBL_MAX;
1723 double y_max = -DBL_MAX;
1724 struct chart *ch = chart_create ();
1725 const char *s = factor_to_string (fctr, *fs, 0 );
1727 chart_write_title (ch, s);
1729 for ( i = 0 ; i < n_vars ; ++i )
1731 y_max = max (y_max, (*fs)->m[i].max);
1732 y_min = min (y_min, (*fs)->m[i].min);
1735 boxplot_draw_yscale (ch, y_max, y_min);
1737 for ( i = 0 ; i < n_vars ; ++i )
1740 const double box_width = (ch->data_right - ch->data_left)
1743 const double box_centre = ( i * 2 + 1) * box_width
1746 boxplot_draw_boxplot (ch,
1747 box_centre, box_width,
1749 var_to_string (vars[i]));
1761 /* Do a box plot, grouping all factors into one plot ;
1762 each dependent variable has its own plot.
1765 box_plot_group (const struct factor *fctr,
1766 const struct variable **vars,
1768 const struct variable *id UNUSED)
1773 for ( i = 0 ; i < n_vars ; ++i )
1775 struct factor_statistics **fs ;
1778 ch = chart_create ();
1780 boxplot_draw_yscale (ch, totals[i].max, totals[i].min);
1786 for ( fs = fctr->fs ; *fs ; ++fs )
1789 chart_write_title (ch, _ ("Boxplot of %s vs. %s"),
1790 var_to_string (vars[i]), var_to_string (fctr->indep_var[0]) );
1792 for ( fs = fctr->fs ; *fs ; ++fs )
1795 const char *s = factor_to_string_concise (fctr, *fs);
1797 const double box_width = (ch->data_right - ch->data_left)
1798 / (n_factors * 2.0 ) ;
1800 const double box_centre = ( f++ * 2 + 1) * box_width
1803 boxplot_draw_boxplot (ch,
1804 box_centre, box_width,
1811 const double box_width = (ch->data_right - ch->data_left) / 3.0;
1812 const double box_centre = (ch->data_right + ch->data_left) / 2.0;
1814 chart_write_title (ch, _ ("Boxplot"));
1816 boxplot_draw_boxplot (ch,
1817 box_centre, box_width,
1819 var_to_string (vars[i]) );
1828 /* Plot the normal and detrended normal plots for m
1829 Label the plots with factorname */
1831 np_plot (const struct metrics *m, const char *factorname)
1834 double yfirst=0, ylast=0;
1837 struct chart *np_chart;
1839 /* Detrended Normal Plot */
1840 struct chart *dnp_chart;
1842 /* The slope and intercept of the ideal normal probability line */
1843 const double slope = 1.0 / m->stddev;
1844 const double intercept = - m->mean / m->stddev;
1846 /* Cowardly refuse to plot an empty data set */
1847 if ( m->n_data == 0 )
1850 np_chart = chart_create ();
1851 dnp_chart = chart_create ();
1853 if ( !np_chart || ! dnp_chart )
1856 chart_write_title (np_chart, _ ("Normal Q-Q Plot of %s"), factorname);
1857 chart_write_xlabel (np_chart, _ ("Observed Value"));
1858 chart_write_ylabel (np_chart, _ ("Expected Normal"));
1861 chart_write_title (dnp_chart, _ ("Detrended Normal Q-Q Plot of %s"),
1863 chart_write_xlabel (dnp_chart, _ ("Observed Value"));
1864 chart_write_ylabel (dnp_chart, _ ("Dev from Normal"));
1866 yfirst = gsl_cdf_ugaussian_Pinv (m->wvp[0]->rank / ( m->n + 1));
1867 ylast = gsl_cdf_ugaussian_Pinv (m->wvp[m->n_data-1]->rank / ( m->n + 1));
1871 /* Need to make sure that both the scatter plot and the ideal fit into the
1873 double x_lower = min (m->min, (yfirst - intercept) / slope) ;
1874 double x_upper = max (m->max, (ylast - intercept) / slope) ;
1875 double slack = (x_upper - x_lower) * 0.05 ;
1877 chart_write_xscale (np_chart, x_lower - slack, x_upper + slack, 5);
1879 chart_write_xscale (dnp_chart, m->min, m->max, 5);
1883 chart_write_yscale (np_chart, yfirst, ylast, 5);
1886 /* We have to cache the detrended data, beacause we need to
1887 find its limits before we can plot it */
1888 double *d_data = xnmalloc (m->n_data, sizeof *d_data);
1889 double d_max = -DBL_MAX;
1890 double d_min = DBL_MAX;
1891 for ( i = 0 ; i < m->n_data; ++i )
1893 const double ns = gsl_cdf_ugaussian_Pinv (m->wvp[i]->rank / ( m->n + 1));
1895 chart_datum (np_chart, 0, m->wvp[i]->v.f, ns);
1897 d_data[i] = (m->wvp[i]->v.f - m->mean) / m->stddev - ns;
1899 if ( d_data[i] < d_min ) d_min = d_data[i];
1900 if ( d_data[i] > d_max ) d_max = d_data[i];
1902 chart_write_yscale (dnp_chart, d_min, d_max, 5);
1904 for ( i = 0 ; i < m->n_data; ++i )
1905 chart_datum (dnp_chart, 0, m->wvp[i]->v.f, d_data[i]);
1910 chart_line (np_chart, slope, intercept, yfirst, ylast , CHART_DIM_Y);
1911 chart_line (dnp_chart, 0, 0, m->min, m->max , CHART_DIM_X);
1913 chart_submit (np_chart);
1914 chart_submit (dnp_chart);
1920 /* Show the percentiles */
1922 show_percentiles (struct variable **dependent_var,
1924 struct factor *fctr)
1926 struct tab_table *tbl;
1932 struct hsh_table *ptiles ;
1934 int n_heading_columns;
1935 const int n_heading_rows = 2;
1936 const int n_stat_rows = 2;
1942 struct factor_statistics **fs = fctr->fs ;
1943 n_heading_columns = 3;
1944 n_factors = hsh_count (fctr->fstats);
1946 ptiles = (*fs)->m[0].ptile_hash;
1948 if ( fctr->indep_var[1] )
1949 n_heading_columns = 4;
1954 n_heading_columns = 2;
1956 ptiles = totals[0].ptile_hash;
1959 n_ptiles = hsh_count (ptiles);
1961 n_rows = n_heading_rows + n_dep_var * n_stat_rows * n_factors;
1963 n_cols = n_heading_columns + n_ptiles ;
1965 tbl = tab_create (n_cols, n_rows, 0);
1967 tab_headers (tbl, n_heading_columns + 1, 0, n_heading_rows, 0);
1969 tab_dim (tbl, tab_natural_dimensions);
1971 /* Outline the box and have no internal lines*/
1976 n_cols - 1, n_rows - 1);
1978 tab_hline (tbl, TAL_2, 0, n_cols - 1, n_heading_rows );
1980 tab_vline (tbl, TAL_2, n_heading_columns, 0, n_rows - 1);
1983 tab_title (tbl, _ ("Percentiles"));
1986 tab_hline (tbl, TAL_1, n_heading_columns, n_cols - 1, 1 );
1993 n_heading_columns - 1, n_rows - 1);
1999 n_heading_columns, n_heading_rows - 1,
2000 n_cols - 1, n_rows - 1);
2002 tab_joint_text (tbl, n_heading_columns + 1, 0,
2004 TAB_CENTER | TAT_TITLE ,
2009 /* Put in the percentile break points as headings */
2011 struct percentile **p = (struct percentile **) hsh_sort (ptiles);
2016 tab_float (tbl, n_heading_columns + i++ , 1,
2025 for ( i = 0 ; i < n_dep_var ; ++i )
2027 const int n_stat_rows = 2;
2028 const int row = n_heading_rows + i * n_stat_rows * n_factors ;
2031 tab_hline (tbl, TAL_1, 0, n_cols - 1, row );
2034 i * n_stat_rows * n_factors + n_heading_rows,
2035 TAB_LEFT | TAT_TITLE,
2036 var_to_string (dependent_var[i])
2041 struct factor_statistics **fs = fctr->fs;
2044 tab_text (tbl, 1, n_heading_rows - 1,
2045 TAB_CENTER | TAT_TITLE,
2046 var_to_string (fctr->indep_var[0]));
2049 if ( fctr->indep_var[1])
2050 tab_text (tbl, 2, n_heading_rows - 1, TAB_CENTER | TAT_TITLE,
2051 var_to_string (fctr->indep_var[1]));
2056 static union value prev ;
2058 const int row = n_heading_rows + n_stat_rows *
2059 ( ( i * n_factors ) + count );
2062 if ( 0 != compare_values (&prev, & (*fs)->id[0],
2063 fctr->indep_var[0]->width))
2067 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
2071 TAB_LEFT | TAT_TITLE,
2072 value_to_string (& (*fs)->id[0], fctr->indep_var[0])
2078 prev = (*fs)->id[0];
2080 if (fctr->indep_var[1] && count > 0 )
2081 tab_hline (tbl, TAL_1, 2, n_cols - 1, row);
2083 if ( fctr->indep_var[1])
2084 tab_text (tbl, 2, row,
2085 TAB_LEFT | TAT_TITLE,
2086 value_to_string (& (*fs)->id[1], fctr->indep_var[1])
2090 populate_percentiles (tbl, n_heading_columns - 1,
2091 row, & (*fs)->m[i]);
2102 populate_percentiles (tbl, n_heading_columns - 1,
2103 i * n_stat_rows * n_factors + n_heading_rows,
2120 populate_percentiles (struct tab_table *tbl, int col, int row,
2121 const struct metrics *m)
2125 struct percentile **p = (struct percentile **) hsh_sort (m->ptile_hash);
2129 TAB_LEFT | TAT_TITLE,
2130 _ ("Tukey\'s Hinges")
2135 TAB_LEFT | TAT_TITLE,
2136 ptile_alg_desc[m->ptile_alg]
2143 tab_float (tbl, col + i + 1 , row,
2146 if ( (*p)->p == 25 )
2147 tab_float (tbl, col + i + 1 , row + 1,
2151 if ( (*p)->p == 50 )
2152 tab_float (tbl, col + i + 1 , row + 1,
2156 if ( (*p)->p == 75 )
2157 tab_float (tbl, col + i + 1 , row + 1,
2172 factor_to_string (const struct factor *fctr,
2173 struct factor_statistics *fs,
2174 const struct variable *var)
2177 static char buf1[100];
2183 sprintf (buf1, "%s (",var_to_string (var) );
2186 snprintf (buf2, 100, "%s = %s",
2187 var_to_string (fctr->indep_var[0]),
2188 value_to_string (&fs->id[0],fctr->indep_var[0]));
2190 strcat (buf1, buf2);
2192 if ( fctr->indep_var[1] )
2194 sprintf (buf2, "; %s = %s)",
2195 var_to_string (fctr->indep_var[1]),
2196 value_to_string (&fs->id[1],
2197 fctr->indep_var[1]));
2198 strcat (buf1, buf2);
2212 factor_to_string_concise (const struct factor *fctr,
2213 struct factor_statistics *fs)
2217 static char buf[100];
2221 snprintf (buf, 100, "%s",
2222 value_to_string (&fs->id[0], fctr->indep_var[0]));
2224 if ( fctr->indep_var[1] )
2226 sprintf (buf2, ",%s)", value_to_string (&fs->id[1], fctr->indep_var[1]) );