1 /* PSPP - EXAMINE data for normality . -*-c-*-
3 Copyright (C) 2004 Free Software Foundation, Inc.
4 Author: John Darrington 2004
6 This program is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 #include <gsl/gsl_cdf.h>
24 #include <libpspp/message.h>
29 #include <data/case.h>
30 #include <data/casefile.h>
31 #include <data/dictionary.h>
32 #include <data/procedure.h>
33 #include <data/value-labels.h>
34 #include <data/variable.h>
35 #include <language/command.h>
36 #include <language/dictionary/split-file.h>
37 #include <language/lexer/lexer.h>
38 #include <libpspp/alloc.h>
39 #include <libpspp/compiler.h>
40 #include <libpspp/hash.h>
41 #include <libpspp/magic.h>
42 #include <libpspp/message.h>
43 #include <libpspp/misc.h>
44 #include <libpspp/str.h>
45 #include <math/factor-stats.h>
46 #include <math/moments.h>
47 #include <math/percentiles.h>
48 #include <output/charts/box-whisker.h>
49 #include <output/charts/cartesian.h>
50 #include <output/manager.h>
51 #include <output/table.h>
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
60 #include <output/chart.h>
61 #include <output/charts/plot-hist.h>
62 #include <output/charts/plot-chart.h>
69 missing=miss:pairwise/!listwise,
71 incl:include/!exclude;
72 +compare=cmp:variables/!groups;
75 +plot[plt_]=stemleaf,boxplot,npplot,:spreadlevel(*d:n),histogram,all,none;
77 +statistics[st_]=descriptives,:extreme(*d:n),all,none.
86 static struct cmd_examine cmd;
88 static struct variable **dependent_vars;
90 static size_t n_dependent_vars;
95 /* The independent variable */
96 struct variable *indep_var[2];
99 /* Hash table of factor stats indexed by 2 values */
100 struct hsh_table *fstats;
102 /* The hash table after it has been crunched */
103 struct factor_statistics **fs;
109 /* Linked list of factors */
110 static struct factor *factors=0;
112 static struct metrics *totals=0;
114 /* Parse the clause specifying the factors */
115 static int examine_parse_independent_vars (struct lexer *lexer, const struct dictionary *dict, struct cmd_examine *cmd);
119 /* Output functions */
120 static void show_summary (struct variable **dependent_var, int n_dep_var,
121 const struct factor *f);
123 static void show_extremes (struct variable **dependent_var,
125 const struct factor *factor,
128 static void show_descriptives (struct variable **dependent_var,
130 struct factor *factor);
132 static void show_percentiles (struct variable **dependent_var,
134 struct factor *factor);
139 void np_plot (const struct metrics *m, const char *factorname);
142 void box_plot_group (const struct factor *fctr,
143 const struct variable **vars, int n_vars,
144 const struct variable *id
148 void box_plot_variables (const struct factor *fctr,
149 const struct variable **vars, int n_vars,
150 const struct variable *id
155 /* Per Split function */
156 static bool run_examine (const struct ccase *,
157 const struct casefile *cf, void *cmd_, const struct dataset *);
159 static void output_examine (void);
162 void factor_calc (struct ccase *c, int case_no,
163 double weight, int case_missing);
166 /* Represent a factor as a string, so it can be
167 printed in a human readable fashion */
168 const char * factor_to_string (const struct factor *fctr,
169 struct factor_statistics *fs,
170 const struct variable *var);
173 /* Represent a factor as a string, so it can be
174 printed in a human readable fashion,
175 but sacrificing some readablility for the sake of brevity */
176 const char *factor_to_string_concise (const struct factor *fctr,
177 struct factor_statistics *fs);
182 /* Function to use for testing for missing values */
183 static is_missing_func *value_is_missing;
188 static subc_list_double percentile_list;
190 static enum pc_alg percentile_algorithm;
192 static short sbc_percentile;
196 cmd_examine (struct lexer *lexer, struct dataset *ds)
200 subc_list_double_create (&percentile_list);
201 percentile_algorithm = PC_HAVERAGE;
203 if ( !parse_examine (lexer, ds, &cmd, NULL) )
205 subc_list_double_destroy (&percentile_list);
209 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
210 if (cmd.incl == XMN_INCLUDE )
211 value_is_missing = mv_is_value_system_missing;
213 value_is_missing = mv_is_value_missing;
215 if ( cmd.st_n == SYSMIS )
218 if ( ! cmd.sbc_cinterval)
219 cmd.n_cinterval[0] = 95.0;
221 /* If descriptives have been requested, make sure the
222 quartiles are calculated */
223 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] )
225 subc_list_double_push (&percentile_list, 25);
226 subc_list_double_push (&percentile_list, 50);
227 subc_list_double_push (&percentile_list, 75);
230 ok = multipass_procedure_with_splits (ds, run_examine, &cmd);
237 if ( dependent_vars )
238 free (dependent_vars);
241 struct factor *f = factors ;
244 struct factor *ff = f;
248 hsh_destroy ( ff->fstats ) ;
254 subc_list_double_destroy (&percentile_list);
256 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
261 /* Show all the appropriate tables */
263 output_examine (void)
267 /* Show totals if appropriate */
268 if ( ! cmd.sbc_nototal || factors == 0 )
270 show_summary (dependent_vars, n_dependent_vars, 0);
272 if ( cmd.sbc_statistics )
274 if ( cmd.a_statistics[XMN_ST_EXTREME])
275 show_extremes (dependent_vars, n_dependent_vars, 0, cmd.st_n);
277 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
278 show_descriptives (dependent_vars, n_dependent_vars, 0);
281 if ( sbc_percentile )
282 show_percentiles (dependent_vars, n_dependent_vars, 0);
287 if ( cmd.a_plot[XMN_PLT_STEMLEAF] )
288 msg (SW, _ ("%s is not currently supported."), "STEMLEAF");
290 if ( cmd.a_plot[XMN_PLT_SPREADLEVEL] )
291 msg (SW, _ ("%s is not currently supported."), "SPREADLEVEL");
293 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
295 for ( v = 0 ; v < n_dependent_vars; ++v )
296 np_plot (&totals[v], var_to_string (dependent_vars[v]));
299 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
301 if ( cmd.cmp == XMN_GROUPS )
303 box_plot_group (0, (const struct variable **) dependent_vars,
304 n_dependent_vars, cmd.v_id);
307 box_plot_variables (0,
308 (const struct variable **) dependent_vars,
309 n_dependent_vars, cmd.v_id);
312 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
314 for ( v = 0 ; v < n_dependent_vars; ++v )
316 struct normal_curve normal;
318 normal.N = totals[v].n;
319 normal.mean = totals[v].mean;
320 normal.stddev = totals[v].stddev;
322 histogram_plot (totals[v].histogram,
323 var_to_string (dependent_vars[v]),
333 /* Show grouped statistics as appropriate */
337 show_summary (dependent_vars, n_dependent_vars, fctr);
339 if ( cmd.sbc_statistics )
341 if ( cmd.a_statistics[XMN_ST_EXTREME])
342 show_extremes (dependent_vars, n_dependent_vars, fctr, cmd.st_n);
344 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
345 show_descriptives (dependent_vars, n_dependent_vars, fctr);
348 if ( sbc_percentile )
349 show_percentiles (dependent_vars, n_dependent_vars, fctr);
356 struct factor_statistics **fs = fctr->fs ;
358 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
360 if ( cmd.cmp == XMN_VARIABLES )
361 box_plot_variables (fctr,
362 (const struct variable **) dependent_vars,
363 n_dependent_vars, cmd.v_id);
365 box_plot_group (fctr,
366 (const struct variable **) dependent_vars,
367 n_dependent_vars, cmd.v_id);
370 for ( v = 0 ; v < n_dependent_vars; ++v )
373 for ( fs = fctr->fs ; *fs ; ++fs )
375 const char *s = factor_to_string (fctr, *fs, dependent_vars[v]);
377 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
378 np_plot (& (*fs)->m[v], s);
380 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
382 struct normal_curve normal;
384 normal.N = (*fs)->m[v].n;
385 normal.mean = (*fs)->m[v].mean;
386 normal.stddev = (*fs)->m[v].stddev;
388 histogram_plot ((*fs)->m[v].histogram,
392 } /* for ( fs .... */
394 } /* for ( v = 0 ..... */
404 /* Create a hash table of percentiles and their values from the list of
406 static struct hsh_table *
407 list_to_ptile_hash (const subc_list_double *l)
411 struct hsh_table *h ;
413 h = hsh_create (subc_list_double_count (l),
414 (hsh_compare_func *) ptile_compare,
415 (hsh_hash_func *) ptile_hash,
416 (hsh_free_func *) free,
420 for ( i = 0 ; i < subc_list_double_count (l) ; ++i )
422 struct percentile *p = xmalloc (sizeof *p);
424 p->p = subc_list_double_at (l,i);
435 /* Parse the PERCENTILES subcommand */
437 xmn_custom_percentiles (struct lexer *lexer, struct dataset *ds UNUSED,
438 struct cmd_examine *p UNUSED, void *aux UNUSED)
442 lex_match (lexer, '=');
444 lex_match (lexer, '(');
446 while ( lex_is_number (lexer) )
448 subc_list_double_push (&percentile_list, lex_number (lexer));
452 lex_match (lexer, ',') ;
454 lex_match (lexer, ')');
456 lex_match (lexer, '=');
458 if ( lex_match_id (lexer, "HAVERAGE"))
459 percentile_algorithm = PC_HAVERAGE;
461 else if ( lex_match_id (lexer, "WAVERAGE"))
462 percentile_algorithm = PC_WAVERAGE;
464 else if ( lex_match_id (lexer, "ROUND"))
465 percentile_algorithm = PC_ROUND;
467 else if ( lex_match_id (lexer, "EMPIRICAL"))
468 percentile_algorithm = PC_EMPIRICAL;
470 else if ( lex_match_id (lexer, "AEMPIRICAL"))
471 percentile_algorithm = PC_AEMPIRICAL;
473 else if ( lex_match_id (lexer, "NONE"))
474 percentile_algorithm = PC_NONE;
477 if ( 0 == subc_list_double_count (&percentile_list))
479 subc_list_double_push (&percentile_list, 5);
480 subc_list_double_push (&percentile_list, 10);
481 subc_list_double_push (&percentile_list, 25);
482 subc_list_double_push (&percentile_list, 50);
483 subc_list_double_push (&percentile_list, 75);
484 subc_list_double_push (&percentile_list, 90);
485 subc_list_double_push (&percentile_list, 95);
491 /* TOTAL and NOTOTAL are simple, mutually exclusive flags */
493 xmn_custom_total (struct lexer *lexer UNUSED, struct dataset *ds UNUSED, struct cmd_examine *p, void *aux UNUSED)
495 if ( p->sbc_nototal )
497 msg (SE, _ ("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
505 xmn_custom_nototal (struct lexer *lexer UNUSED, struct dataset *ds UNUSED,
506 struct cmd_examine *p, void *aux UNUSED)
510 msg (SE, _ ("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
519 /* Parser for the variables sub command
520 Returns 1 on success */
522 xmn_custom_variables (struct lexer *lexer, struct dataset *ds, struct cmd_examine *cmd, void *aux UNUSED)
524 const struct dictionary *dict = dataset_dict (ds);
525 lex_match (lexer, '=');
527 if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
528 && lex_token (lexer) != T_ALL)
533 if (!parse_variables (lexer, dict, &dependent_vars, &n_dependent_vars,
534 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
536 free (dependent_vars);
540 assert (n_dependent_vars);
542 totals = xnmalloc (n_dependent_vars, sizeof *totals);
544 if ( lex_match (lexer, T_BY))
547 success = examine_parse_independent_vars (lexer, dict, cmd);
548 if ( success != 1 ) {
549 free (dependent_vars);
560 /* Parse the clause specifying the factors */
562 examine_parse_independent_vars (struct lexer *lexer, const struct dictionary *dict, struct cmd_examine *cmd)
565 struct factor *sf = xmalloc (sizeof *sf);
567 if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
568 && lex_token (lexer) != T_ALL)
575 sf->indep_var[0] = parse_variable (lexer, dict);
576 sf->indep_var[1] = 0;
578 if ( lex_token (lexer) == T_BY )
581 lex_match (lexer, T_BY);
583 if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
584 && lex_token (lexer) != T_ALL)
590 sf->indep_var[1] = parse_variable (lexer, dict);
595 sf->fstats = hsh_create (4,
596 (hsh_compare_func *) factor_statistics_compare,
597 (hsh_hash_func *) factor_statistics_hash,
598 (hsh_free_func *) factor_statistics_free,
604 lex_match (lexer, ',');
606 if ( lex_token (lexer) == '.' || lex_token (lexer) == '/' )
609 success = examine_parse_independent_vars (lexer, dict, cmd);
620 void populate_percentiles (struct tab_table *tbl, int col, int row,
621 const struct metrics *m);
623 void populate_descriptives (struct tab_table *t, int col, int row,
624 const struct metrics *fs);
626 void populate_extremes (struct tab_table *t, int col, int row, int n,
627 const struct metrics *m);
629 void populate_summary (struct tab_table *t, int col, int row,
630 const struct metrics *m);
635 static bool bad_weight_warn = true;
638 /* Perform calculations for the sub factors */
640 factor_calc (struct ccase *c, int case_no, double weight, int case_missing)
643 struct factor *fctr = factors;
647 struct factor_statistics **foo ;
648 union value indep_vals[2] ;
650 indep_vals[0] = * case_data (c, fctr->indep_var[0]->fv);
652 if ( fctr->indep_var[1] )
653 indep_vals[1] = * case_data (c, fctr->indep_var[1]->fv);
655 indep_vals[1].f = SYSMIS;
657 assert (fctr->fstats);
659 foo = ( struct factor_statistics ** )
660 hsh_probe (fctr->fstats, (void *) &indep_vals);
665 *foo = create_factor_statistics (n_dependent_vars,
669 for ( v = 0 ; v < n_dependent_vars ; ++v )
671 metrics_precalc ( & (*foo)->m[v] );
676 for ( v = 0 ; v < n_dependent_vars ; ++v )
678 const struct variable *var = dependent_vars[v];
679 const union value *val = case_data (c, var->fv);
681 if ( value_is_missing (&var->miss, val) || case_missing )
684 metrics_calc ( & (*foo)->m[v], val, weight, case_no);
695 run_examine (const struct ccase *first, const struct casefile *cf,
696 void *cmd_, const struct dataset *ds)
698 struct dictionary *dict = dataset_dict (ds);
699 struct casereader *r;
703 const struct cmd_examine *cmd = (struct cmd_examine *) cmd_;
707 output_split_file_values (ds, first);
709 /* Make sure we haven't got rubbish left over from a
714 struct factor *next = fctr->next;
716 hsh_clear (fctr->fstats);
723 for ( v = 0 ; v < n_dependent_vars ; ++v )
724 metrics_precalc (&totals[v]);
726 for (r = casefile_get_reader (cf, NULL);
727 casereader_read (r, &c) ;
731 const int case_no = casereader_cnum (r);
733 const double weight =
734 dict_get_case_weight (dict, &c, &bad_weight_warn);
736 if ( cmd->miss == XMN_LISTWISE )
738 for ( v = 0 ; v < n_dependent_vars ; ++v )
740 const struct variable *var = dependent_vars[v];
741 const union value *val = case_data (&c, var->fv);
743 if ( value_is_missing (&var->miss, val))
749 for ( v = 0 ; v < n_dependent_vars ; ++v )
751 const struct variable *var = dependent_vars[v];
752 const union value *val = case_data (&c, var->fv);
754 if ( value_is_missing (&var->miss, val) || case_missing )
757 metrics_calc (&totals[v], val, weight, case_no);
761 factor_calc (&c, case_no, weight, case_missing);
766 for ( v = 0 ; v < n_dependent_vars ; ++v)
771 struct hsh_iterator hi;
772 struct factor_statistics *fs;
774 for ( fs = hsh_first (fctr->fstats, &hi);
776 fs = hsh_next (fctr->fstats, &hi))
779 fs->m[v].ptile_hash = list_to_ptile_hash (&percentile_list);
780 fs->m[v].ptile_alg = percentile_algorithm;
781 metrics_postcalc (&fs->m[v]);
787 totals[v].ptile_hash = list_to_ptile_hash (&percentile_list);
788 totals[v].ptile_alg = percentile_algorithm;
789 metrics_postcalc (&totals[v]);
793 /* Make sure that the combination of factors are complete */
798 struct hsh_iterator hi;
799 struct hsh_iterator hi0;
800 struct hsh_iterator hi1;
801 struct factor_statistics *fs;
803 struct hsh_table *idh0=0;
804 struct hsh_table *idh1=0;
808 idh0 = hsh_create (4, (hsh_compare_func *) compare_values,
809 (hsh_hash_func *) hash_value,
812 idh1 = hsh_create (4, (hsh_compare_func *) compare_values,
813 (hsh_hash_func *) hash_value,
817 for ( fs = hsh_first (fctr->fstats, &hi);
819 fs = hsh_next (fctr->fstats, &hi))
821 hsh_insert (idh0, (void *) &fs->id[0]);
822 hsh_insert (idh1, (void *) &fs->id[1]);
825 /* Ensure that the factors combination is complete */
826 for ( val0 = hsh_first (idh0, &hi0);
828 val0 = hsh_next (idh0, &hi0))
830 for ( val1 = hsh_first (idh1, &hi1);
832 val1 = hsh_next (idh1, &hi1))
834 struct factor_statistics **ffs;
839 ffs = (struct factor_statistics **)
840 hsh_probe (fctr->fstats, (void *) &key );
844 (*ffs) = create_factor_statistics (n_dependent_vars,
846 for ( i = 0 ; i < n_dependent_vars ; ++i )
847 metrics_precalc ( & (*ffs)->m[i]);
855 fctr->fs = (struct factor_statistics **) hsh_sort_copy (fctr->fstats);
866 for ( i = 0 ; i < n_dependent_vars ; ++i )
868 metrics_destroy (&totals[i]);
877 show_summary (struct variable **dependent_var, int n_dep_var,
878 const struct factor *fctr)
880 static const char *subtitle[]=
888 int heading_columns ;
890 const int heading_rows = 3;
891 struct tab_table *tbl;
899 n_factors = hsh_count (fctr->fstats);
900 n_rows = n_dep_var * n_factors ;
902 if ( fctr->indep_var[1] )
911 n_rows += heading_rows;
913 n_cols = heading_columns + 6;
915 tbl = tab_create (n_cols,n_rows,0);
916 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
918 tab_dim (tbl, tab_natural_dimensions);
920 /* Outline the box */
925 n_cols - 1, n_rows - 1);
927 /* Vertical lines for the data only */
932 n_cols - 1, n_rows - 1);
935 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
936 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, 1 );
937 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, heading_rows -1 );
939 tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1);
942 tab_title (tbl, _ ("Case Processing Summary"));
945 tab_joint_text (tbl, heading_columns, 0,
947 TAB_CENTER | TAT_TITLE,
950 /* Remove lines ... */
957 for ( i = 0 ; i < 3 ; ++i )
959 tab_text (tbl, heading_columns + i*2 , 2, TAB_CENTER | TAT_TITLE,
962 tab_text (tbl, heading_columns + i*2 + 1, 2, TAB_CENTER | TAT_TITLE,
965 tab_joint_text (tbl, heading_columns + i*2 , 1,
966 heading_columns + i*2 + 1, 1,
967 TAB_CENTER | TAT_TITLE,
970 tab_box (tbl, -1, -1,
972 heading_columns + i*2, 1,
973 heading_columns + i*2 + 1, 1);
978 /* Titles for the independent variables */
981 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
982 var_to_string (fctr->indep_var[0]));
984 if ( fctr->indep_var[1] )
986 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
987 var_to_string (fctr->indep_var[1]));
993 for ( i = 0 ; i < n_dep_var ; ++i )
997 n_factors = hsh_count (fctr->fstats);
1001 tab_hline (tbl, TAL_1, 0, n_cols -1 , i * n_factors + heading_rows);
1004 0, i * n_factors + heading_rows,
1005 TAB_LEFT | TAT_TITLE,
1006 var_to_string (dependent_var[i])
1011 populate_summary (tbl, heading_columns,
1012 (i * n_factors) + heading_rows,
1018 struct factor_statistics **fs = fctr->fs;
1023 static union value prev;
1025 if ( 0 != compare_values (&prev, & (*fs)->id[0],
1026 fctr->indep_var[0]->width))
1030 (i * n_factors ) + count +
1032 TAB_LEFT | TAT_TITLE,
1033 value_to_string (& (*fs)->id[0], fctr->indep_var[0])
1036 if (fctr->indep_var[1] && count > 0 )
1037 tab_hline (tbl, TAL_1, 1, n_cols - 1,
1038 (i * n_factors ) + count + heading_rows);
1042 prev = (*fs)->id[0];
1045 if ( fctr->indep_var[1])
1048 (i * n_factors ) + count +
1050 TAB_LEFT | TAT_TITLE,
1051 value_to_string (& (*fs)->id[1], fctr->indep_var[1])
1054 populate_summary (tbl, heading_columns,
1055 (i * n_factors) + count
1070 populate_summary (struct tab_table *t, int col, int row,
1071 const struct metrics *m)
1074 const double total = m->n + m->n_missing ;
1076 tab_float (t, col + 0, row + 0, TAB_RIGHT, m->n, 8, 0);
1077 tab_float (t, col + 2, row + 0, TAB_RIGHT, m->n_missing, 8, 0);
1078 tab_float (t, col + 4, row + 0, TAB_RIGHT, total, 8, 0);
1082 tab_text (t, col + 1, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1083 100.0 * m->n / total );
1085 tab_text (t, col + 3, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1086 100.0 * m->n_missing / total );
1088 /* This seems a bit pointless !!! */
1089 tab_text (t, col + 5, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1090 100.0 * total / total );
1101 show_extremes (struct variable **dependent_var, int n_dep_var,
1102 const struct factor *fctr, int n_extremities)
1105 int heading_columns ;
1107 const int heading_rows = 1;
1108 struct tab_table *tbl;
1115 heading_columns = 2;
1116 n_factors = hsh_count (fctr->fstats);
1118 n_rows = n_dep_var * 2 * n_extremities * n_factors;
1120 if ( fctr->indep_var[1] )
1121 heading_columns = 3;
1125 heading_columns = 1;
1126 n_rows = n_dep_var * 2 * n_extremities;
1129 n_rows += heading_rows;
1131 heading_columns += 2;
1132 n_cols = heading_columns + 2;
1134 tbl = tab_create (n_cols,n_rows,0);
1135 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
1137 tab_dim (tbl, tab_natural_dimensions);
1139 /* Outline the box, No internal lines*/
1144 n_cols - 1, n_rows - 1);
1146 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1148 tab_title (tbl, _ ("Extreme Values"));
1150 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows -1);
1151 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows -1);
1155 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1156 var_to_string (fctr->indep_var[0]));
1158 if ( fctr->indep_var[1] )
1159 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1160 var_to_string (fctr->indep_var[1]));
1163 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _ ("Value"));
1164 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _ ("Case Number"));
1166 for ( i = 0 ; i < n_dep_var ; ++i )
1170 tab_hline (tbl, TAL_1, 0, n_cols -1 ,
1171 i * 2 * n_extremities * n_factors + heading_rows);
1174 i * 2 * n_extremities * n_factors + heading_rows,
1175 TAB_LEFT | TAT_TITLE,
1176 var_to_string (dependent_var[i])
1181 populate_extremes (tbl, heading_columns - 2,
1182 i * 2 * n_extremities * n_factors + heading_rows,
1183 n_extremities, &totals[i]);
1187 struct factor_statistics **fs = fctr->fs;
1192 static union value prev ;
1194 const int row = heading_rows + ( 2 * n_extremities ) *
1195 ( ( i * n_factors ) + count );
1198 if ( 0 != compare_values (&prev, & (*fs)->id[0],
1199 fctr->indep_var[0]->width))
1203 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1207 TAB_LEFT | TAT_TITLE,
1208 value_to_string (& (*fs)->id[0], fctr->indep_var[0])
1212 prev = (*fs)->id[0];
1214 if (fctr->indep_var[1] && count > 0 )
1215 tab_hline (tbl, TAL_1, 2, n_cols - 1, row);
1217 if ( fctr->indep_var[1])
1218 tab_text (tbl, 2, row,
1219 TAB_LEFT | TAT_TITLE,
1220 value_to_string (& (*fs)->id[1], fctr->indep_var[1])
1223 populate_extremes (tbl, heading_columns - 2,
1238 /* Fill in the extremities table */
1240 populate_extremes (struct tab_table *t,
1241 int col, int row, int n, const struct metrics *m)
1247 tab_text (t, col, row,
1248 TAB_RIGHT | TAT_TITLE ,
1252 tab_text (t, col, row + n ,
1253 TAB_RIGHT | TAT_TITLE ,
1258 tab_hline (t, TAL_1, col, col + 3, row + n );
1260 for (extremity = 0; extremity < n ; ++extremity )
1263 tab_float (t, col + 1, row + extremity,
1265 extremity + 1, 8, 0);
1269 tab_float (t, col + 1, row + extremity + n,
1271 extremity + 1, 8, 0);
1277 for (idx = 0, extremity = 0; extremity < n && idx < m->n_data ; ++idx )
1280 const struct weighted_value *wv = m->wvp[idx];
1281 struct case_node *cn = wv->case_nos;
1284 for (j = 0 ; j < wv->w ; ++j )
1286 if ( extremity + j >= n )
1289 tab_float (t, col + 3, row + extremity + j + n,
1293 tab_float (t, col + 2, row + extremity + j + n,
1302 extremity += wv->w ;
1307 for (idx = m->n_data - 1, extremity = 0; extremity < n && idx >= 0; --idx )
1310 const struct weighted_value *wv = m->wvp[idx];
1311 struct case_node *cn = wv->case_nos;
1313 for (j = 0 ; j < wv->w ; ++j )
1315 if ( extremity + j >= n )
1318 tab_float (t, col + 3, row + extremity + j,
1322 tab_float (t, col + 2, row + extremity + j,
1331 extremity += wv->w ;
1336 /* Show the descriptives table */
1338 show_descriptives (struct variable **dependent_var,
1340 struct factor *fctr)
1343 int heading_columns ;
1345 const int n_stat_rows = 13;
1347 const int heading_rows = 1;
1349 struct tab_table *tbl;
1356 heading_columns = 4;
1357 n_factors = hsh_count (fctr->fstats);
1359 n_rows = n_dep_var * n_stat_rows * n_factors;
1361 if ( fctr->indep_var[1] )
1362 heading_columns = 5;
1366 heading_columns = 3;
1367 n_rows = n_dep_var * n_stat_rows;
1370 n_rows += heading_rows;
1372 n_cols = heading_columns + 2;
1375 tbl = tab_create (n_cols, n_rows, 0);
1377 tab_headers (tbl, heading_columns + 1, 0, heading_rows, 0);
1379 tab_dim (tbl, tab_natural_dimensions);
1381 /* Outline the box and have no internal lines*/
1386 n_cols - 1, n_rows - 1);
1388 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1390 tab_vline (tbl, TAL_1, heading_columns, 0, n_rows - 1);
1391 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows - 1);
1392 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows - 1);
1394 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _ ("Statistic"));
1395 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _ ("Std. Error"));
1397 tab_title (tbl, _ ("Descriptives"));
1400 for ( i = 0 ; i < n_dep_var ; ++i )
1402 const int row = heading_rows + i * n_stat_rows * n_factors ;
1405 tab_hline (tbl, TAL_1, 0, n_cols - 1, row );
1408 i * n_stat_rows * n_factors + heading_rows,
1409 TAB_LEFT | TAT_TITLE,
1410 var_to_string (dependent_var[i])
1416 struct factor_statistics **fs = fctr->fs;
1419 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1420 var_to_string (fctr->indep_var[0]));
1423 if ( fctr->indep_var[1])
1424 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1425 var_to_string (fctr->indep_var[1]));
1430 static union value prev ;
1432 const int row = heading_rows + n_stat_rows *
1433 ( ( i * n_factors ) + count );
1436 if ( 0 != compare_values (&prev, & (*fs)->id[0],
1437 fctr->indep_var[0]->width))
1441 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1445 TAB_LEFT | TAT_TITLE,
1446 value_to_string (& (*fs)->id[0], fctr->indep_var[0])
1450 prev = (*fs)->id[0];
1452 if (fctr->indep_var[1] && count > 0 )
1453 tab_hline (tbl, TAL_1, 2, n_cols - 1, row);
1455 if ( fctr->indep_var[1])
1456 tab_text (tbl, 2, row,
1457 TAB_LEFT | TAT_TITLE,
1458 value_to_string (& (*fs)->id[1], fctr->indep_var[1])
1461 populate_descriptives (tbl, heading_columns - 2,
1462 row, & (*fs)->m[i]);
1473 populate_descriptives (tbl, heading_columns - 2,
1474 i * n_stat_rows * n_factors + heading_rows,
1486 /* Fill in the descriptives data */
1488 populate_descriptives (struct tab_table *tbl, int col, int row,
1489 const struct metrics *m)
1492 const double t = gsl_cdf_tdist_Qinv (1 - cmd.n_cinterval[0]/100.0/2.0, \
1498 TAB_LEFT | TAT_TITLE,
1501 tab_float (tbl, col + 2,
1507 tab_float (tbl, col + 3,
1516 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1517 _ ("%g%% Confidence Interval for Mean"), cmd.n_cinterval[0]);
1520 tab_text (tbl, col + 1,
1522 TAB_LEFT | TAT_TITLE,
1525 tab_float (tbl, col + 2,
1528 m->mean - t * m->se_mean,
1531 tab_text (tbl, col + 1,
1533 TAB_LEFT | TAT_TITLE,
1537 tab_float (tbl, col + 2,
1540 m->mean + t * m->se_mean,
1545 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1546 _ ("5%% Trimmed Mean"));
1548 tab_float (tbl, col + 2,
1556 TAB_LEFT | TAT_TITLE,
1560 struct percentile *p;
1563 p = hsh_find (m->ptile_hash, &d);
1568 tab_float (tbl, col + 2,
1578 TAB_LEFT | TAT_TITLE,
1581 tab_float (tbl, col + 2,
1590 TAB_LEFT | TAT_TITLE,
1591 _ ("Std. Deviation"));
1594 tab_float (tbl, col + 2,
1603 TAB_LEFT | TAT_TITLE,
1606 tab_float (tbl, col + 2,
1614 TAB_LEFT | TAT_TITLE,
1617 tab_float (tbl, col + 2,
1626 TAB_LEFT | TAT_TITLE,
1630 tab_float (tbl, col + 2,
1638 TAB_LEFT | TAT_TITLE,
1639 _ ("Interquartile Range"));
1642 struct percentile *p1;
1643 struct percentile *p2;
1646 p1 = hsh_find (m->ptile_hash, &d);
1649 p2 = hsh_find (m->ptile_hash, &d);
1654 tab_float (tbl, col + 2,
1665 TAB_LEFT | TAT_TITLE,
1669 tab_float (tbl, col + 2,
1675 /* stderr of skewness */
1676 tab_float (tbl, col + 3,
1685 TAB_LEFT | TAT_TITLE,
1689 tab_float (tbl, col + 2,
1695 /* stderr of kurtosis */
1696 tab_float (tbl, col + 3,
1708 box_plot_variables (const struct factor *fctr,
1709 const struct variable **vars, int n_vars,
1710 const struct variable *id)
1714 struct factor_statistics **fs ;
1718 box_plot_group (fctr, vars, n_vars, id);
1722 for ( fs = fctr->fs ; *fs ; ++fs )
1724 double y_min = DBL_MAX;
1725 double y_max = -DBL_MAX;
1726 struct chart *ch = chart_create ();
1727 const char *s = factor_to_string (fctr, *fs, 0 );
1729 chart_write_title (ch, s);
1731 for ( i = 0 ; i < n_vars ; ++i )
1733 y_max = MAX (y_max, (*fs)->m[i].max);
1734 y_min = MIN (y_min, (*fs)->m[i].min);
1737 boxplot_draw_yscale (ch, y_max, y_min);
1739 for ( i = 0 ; i < n_vars ; ++i )
1742 const double box_width = (ch->data_right - ch->data_left)
1745 const double box_centre = ( i * 2 + 1) * box_width
1748 boxplot_draw_boxplot (ch,
1749 box_centre, box_width,
1751 var_to_string (vars[i]));
1763 /* Do a box plot, grouping all factors into one plot ;
1764 each dependent variable has its own plot.
1767 box_plot_group (const struct factor *fctr,
1768 const struct variable **vars,
1770 const struct variable *id UNUSED)
1775 for ( i = 0 ; i < n_vars ; ++i )
1777 struct factor_statistics **fs ;
1780 ch = chart_create ();
1782 boxplot_draw_yscale (ch, totals[i].max, totals[i].min);
1788 for ( fs = fctr->fs ; *fs ; ++fs )
1791 chart_write_title (ch, _ ("Boxplot of %s vs. %s"),
1792 var_to_string (vars[i]), var_to_string (fctr->indep_var[0]) );
1794 for ( fs = fctr->fs ; *fs ; ++fs )
1797 const char *s = factor_to_string_concise (fctr, *fs);
1799 const double box_width = (ch->data_right - ch->data_left)
1800 / (n_factors * 2.0 ) ;
1802 const double box_centre = ( f++ * 2 + 1) * box_width
1805 boxplot_draw_boxplot (ch,
1806 box_centre, box_width,
1813 const double box_width = (ch->data_right - ch->data_left) / 3.0;
1814 const double box_centre = (ch->data_right + ch->data_left) / 2.0;
1816 chart_write_title (ch, _ ("Boxplot"));
1818 boxplot_draw_boxplot (ch,
1819 box_centre, box_width,
1821 var_to_string (vars[i]) );
1830 /* Plot the normal and detrended normal plots for m
1831 Label the plots with factorname */
1833 np_plot (const struct metrics *m, const char *factorname)
1836 double yfirst=0, ylast=0;
1839 struct chart *np_chart;
1841 /* Detrended Normal Plot */
1842 struct chart *dnp_chart;
1844 /* The slope and intercept of the ideal normal probability line */
1845 const double slope = 1.0 / m->stddev;
1846 const double intercept = - m->mean / m->stddev;
1848 /* Cowardly refuse to plot an empty data set */
1849 if ( m->n_data == 0 )
1852 np_chart = chart_create ();
1853 dnp_chart = chart_create ();
1855 if ( !np_chart || ! dnp_chart )
1858 chart_write_title (np_chart, _ ("Normal Q-Q Plot of %s"), factorname);
1859 chart_write_xlabel (np_chart, _ ("Observed Value"));
1860 chart_write_ylabel (np_chart, _ ("Expected Normal"));
1863 chart_write_title (dnp_chart, _ ("Detrended Normal Q-Q Plot of %s"),
1865 chart_write_xlabel (dnp_chart, _ ("Observed Value"));
1866 chart_write_ylabel (dnp_chart, _ ("Dev from Normal"));
1868 yfirst = gsl_cdf_ugaussian_Pinv (m->wvp[0]->rank / ( m->n + 1));
1869 ylast = gsl_cdf_ugaussian_Pinv (m->wvp[m->n_data-1]->rank / ( m->n + 1));
1873 /* Need to make sure that both the scatter plot and the ideal fit into the
1875 double x_lower = MIN (m->min, (yfirst - intercept) / slope) ;
1876 double x_upper = MAX (m->max, (ylast - intercept) / slope) ;
1877 double slack = (x_upper - x_lower) * 0.05 ;
1879 chart_write_xscale (np_chart, x_lower - slack, x_upper + slack, 5);
1881 chart_write_xscale (dnp_chart, m->min, m->max, 5);
1885 chart_write_yscale (np_chart, yfirst, ylast, 5);
1888 /* We have to cache the detrended data, beacause we need to
1889 find its limits before we can plot it */
1890 double *d_data = xnmalloc (m->n_data, sizeof *d_data);
1891 double d_max = -DBL_MAX;
1892 double d_min = DBL_MAX;
1893 for ( i = 0 ; i < m->n_data; ++i )
1895 const double ns = gsl_cdf_ugaussian_Pinv (m->wvp[i]->rank / ( m->n + 1));
1897 chart_datum (np_chart, 0, m->wvp[i]->v.f, ns);
1899 d_data[i] = (m->wvp[i]->v.f - m->mean) / m->stddev - ns;
1901 if ( d_data[i] < d_min ) d_min = d_data[i];
1902 if ( d_data[i] > d_max ) d_max = d_data[i];
1904 chart_write_yscale (dnp_chart, d_min, d_max, 5);
1906 for ( i = 0 ; i < m->n_data; ++i )
1907 chart_datum (dnp_chart, 0, m->wvp[i]->v.f, d_data[i]);
1912 chart_line (np_chart, slope, intercept, yfirst, ylast , CHART_DIM_Y);
1913 chart_line (dnp_chart, 0, 0, m->min, m->max , CHART_DIM_X);
1915 chart_submit (np_chart);
1916 chart_submit (dnp_chart);
1922 /* Show the percentiles */
1924 show_percentiles (struct variable **dependent_var,
1926 struct factor *fctr)
1928 struct tab_table *tbl;
1934 struct hsh_table *ptiles ;
1936 int n_heading_columns;
1937 const int n_heading_rows = 2;
1938 const int n_stat_rows = 2;
1944 struct factor_statistics **fs = fctr->fs ;
1945 n_heading_columns = 3;
1946 n_factors = hsh_count (fctr->fstats);
1948 ptiles = (*fs)->m[0].ptile_hash;
1950 if ( fctr->indep_var[1] )
1951 n_heading_columns = 4;
1956 n_heading_columns = 2;
1958 ptiles = totals[0].ptile_hash;
1961 n_ptiles = hsh_count (ptiles);
1963 n_rows = n_heading_rows + n_dep_var * n_stat_rows * n_factors;
1965 n_cols = n_heading_columns + n_ptiles ;
1967 tbl = tab_create (n_cols, n_rows, 0);
1969 tab_headers (tbl, n_heading_columns + 1, 0, n_heading_rows, 0);
1971 tab_dim (tbl, tab_natural_dimensions);
1973 /* Outline the box and have no internal lines*/
1978 n_cols - 1, n_rows - 1);
1980 tab_hline (tbl, TAL_2, 0, n_cols - 1, n_heading_rows );
1982 tab_vline (tbl, TAL_2, n_heading_columns, 0, n_rows - 1);
1985 tab_title (tbl, _ ("Percentiles"));
1988 tab_hline (tbl, TAL_1, n_heading_columns, n_cols - 1, 1 );
1995 n_heading_columns - 1, n_rows - 1);
2001 n_heading_columns, n_heading_rows - 1,
2002 n_cols - 1, n_rows - 1);
2004 tab_joint_text (tbl, n_heading_columns + 1, 0,
2006 TAB_CENTER | TAT_TITLE ,
2011 /* Put in the percentile break points as headings */
2013 struct percentile **p = (struct percentile **) hsh_sort (ptiles);
2018 tab_float (tbl, n_heading_columns + i++ , 1,
2027 for ( i = 0 ; i < n_dep_var ; ++i )
2029 const int n_stat_rows = 2;
2030 const int row = n_heading_rows + i * n_stat_rows * n_factors ;
2033 tab_hline (tbl, TAL_1, 0, n_cols - 1, row );
2036 i * n_stat_rows * n_factors + n_heading_rows,
2037 TAB_LEFT | TAT_TITLE,
2038 var_to_string (dependent_var[i])
2043 struct factor_statistics **fs = fctr->fs;
2046 tab_text (tbl, 1, n_heading_rows - 1,
2047 TAB_CENTER | TAT_TITLE,
2048 var_to_string (fctr->indep_var[0]));
2051 if ( fctr->indep_var[1])
2052 tab_text (tbl, 2, n_heading_rows - 1, TAB_CENTER | TAT_TITLE,
2053 var_to_string (fctr->indep_var[1]));
2058 static union value prev ;
2060 const int row = n_heading_rows + n_stat_rows *
2061 ( ( i * n_factors ) + count );
2064 if ( 0 != compare_values (&prev, & (*fs)->id[0],
2065 fctr->indep_var[0]->width))
2069 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
2073 TAB_LEFT | TAT_TITLE,
2074 value_to_string (& (*fs)->id[0], fctr->indep_var[0])
2080 prev = (*fs)->id[0];
2082 if (fctr->indep_var[1] && count > 0 )
2083 tab_hline (tbl, TAL_1, 2, n_cols - 1, row);
2085 if ( fctr->indep_var[1])
2086 tab_text (tbl, 2, row,
2087 TAB_LEFT | TAT_TITLE,
2088 value_to_string (& (*fs)->id[1], fctr->indep_var[1])
2092 populate_percentiles (tbl, n_heading_columns - 1,
2093 row, & (*fs)->m[i]);
2104 populate_percentiles (tbl, n_heading_columns - 1,
2105 i * n_stat_rows * n_factors + n_heading_rows,
2122 populate_percentiles (struct tab_table *tbl, int col, int row,
2123 const struct metrics *m)
2127 struct percentile **p = (struct percentile **) hsh_sort (m->ptile_hash);
2131 TAB_LEFT | TAT_TITLE,
2132 _ ("Tukey\'s Hinges")
2137 TAB_LEFT | TAT_TITLE,
2138 ptile_alg_desc[m->ptile_alg]
2145 tab_float (tbl, col + i + 1 , row,
2148 if ( (*p)->p == 25 )
2149 tab_float (tbl, col + i + 1 , row + 1,
2153 if ( (*p)->p == 50 )
2154 tab_float (tbl, col + i + 1 , row + 1,
2158 if ( (*p)->p == 75 )
2159 tab_float (tbl, col + i + 1 , row + 1,
2174 factor_to_string (const struct factor *fctr,
2175 struct factor_statistics *fs,
2176 const struct variable *var)
2179 static char buf1[100];
2185 sprintf (buf1, "%s (",var_to_string (var) );
2188 snprintf (buf2, 100, "%s = %s",
2189 var_to_string (fctr->indep_var[0]),
2190 value_to_string (&fs->id[0],fctr->indep_var[0]));
2192 strcat (buf1, buf2);
2194 if ( fctr->indep_var[1] )
2196 sprintf (buf2, "; %s = %s)",
2197 var_to_string (fctr->indep_var[1]),
2198 value_to_string (&fs->id[1],
2199 fctr->indep_var[1]));
2200 strcat (buf1, buf2);
2214 factor_to_string_concise (const struct factor *fctr,
2215 struct factor_statistics *fs)
2219 static char buf[100];
2223 snprintf (buf, 100, "%s",
2224 value_to_string (&fs->id[0], fctr->indep_var[0]));
2226 if ( fctr->indep_var[1] )
2228 sprintf (buf2, ",%s)", value_to_string (&fs->id[1], fctr->indep_var[1]) );