1 /* PSPP - EXAMINE data for normality . -*-c-*-
3 Copyright (C) 2004 Free Software Foundation, Inc.
4 Author: John Darrington 2004
6 This program is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 #include <gsl/gsl_cdf.h>
24 #include <libpspp/message.h>
29 #include <data/case.h>
30 #include <data/casefile.h>
31 #include <data/dictionary.h>
32 #include <data/procedure.h>
33 #include <data/value-labels.h>
34 #include <data/variable.h>
35 #include <language/command.h>
36 #include <language/dictionary/split-file.h>
37 #include <language/lexer/lexer.h>
38 #include <libpspp/alloc.h>
39 #include <libpspp/compiler.h>
40 #include <libpspp/hash.h>
41 #include <libpspp/magic.h>
42 #include <libpspp/message.h>
43 #include <libpspp/misc.h>
44 #include <libpspp/str.h>
45 #include <math/factor-stats.h>
46 #include <math/moments.h>
47 #include <math/percentiles.h>
48 #include <output/charts/box-whisker.h>
49 #include <output/charts/cartesian.h>
50 #include <output/manager.h>
51 #include <output/table.h>
54 #define _(msgid) gettext (msgid)
55 #define N_(msgid) msgid
58 #include <output/chart.h>
59 #include <output/charts/plot-hist.h>
60 #include <output/charts/plot-chart.h>
67 missing=miss:pairwise/!listwise,
69 incl:include/!exclude;
70 +compare=cmp:variables/!groups;
73 +plot[plt_]=stemleaf,boxplot,npplot,:spreadlevel(*d:n),histogram,all,none;
75 +statistics[st_]=descriptives,:extreme(*d:n),all,none.
84 static struct cmd_examine cmd;
86 static struct variable **dependent_vars;
88 static size_t n_dependent_vars;
93 /* The independent variable */
94 struct variable *indep_var[2];
97 /* Hash table of factor stats indexed by 2 values */
98 struct hsh_table *fstats;
100 /* The hash table after it has been crunched */
101 struct factor_statistics **fs;
107 /* Linked list of factors */
108 static struct factor *factors=0;
110 static struct metrics *totals=0;
112 /* Parse the clause specifying the factors */
113 static int examine_parse_independent_vars (const struct dictionary *dict, struct cmd_examine *cmd);
117 /* Output functions */
118 static void show_summary(struct variable **dependent_var, int n_dep_var,
119 const struct factor *f);
121 static void show_extremes(struct variable **dependent_var,
123 const struct factor *factor,
126 static void show_descriptives(struct variable **dependent_var,
128 struct factor *factor);
130 static void show_percentiles(struct variable **dependent_var,
132 struct factor *factor);
137 void np_plot(const struct metrics *m, const char *factorname);
140 void box_plot_group(const struct factor *fctr,
141 const struct variable **vars, int n_vars,
142 const struct variable *id
146 void box_plot_variables(const struct factor *fctr,
147 const struct variable **vars, int n_vars,
148 const struct variable *id
153 /* Per Split function */
154 static bool run_examine (const struct ccase *,
155 const struct casefile *cf, void *cmd_, const struct dataset *);
157 static void output_examine(void);
160 void factor_calc(struct ccase *c, int case_no,
161 double weight, int case_missing);
164 /* Represent a factor as a string, so it can be
165 printed in a human readable fashion */
166 const char * factor_to_string(const struct factor *fctr,
167 struct factor_statistics *fs,
168 const struct variable *var);
171 /* Represent a factor as a string, so it can be
172 printed in a human readable fashion,
173 but sacrificing some readablility for the sake of brevity */
174 const char *factor_to_string_concise(const struct factor *fctr,
175 struct factor_statistics *fs);
180 /* Function to use for testing for missing values */
181 static is_missing_func *value_is_missing;
186 static subc_list_double percentile_list;
188 static enum pc_alg percentile_algorithm;
190 static short sbc_percentile;
194 cmd_examine (struct dataset *ds)
198 subc_list_double_create(&percentile_list);
199 percentile_algorithm = PC_HAVERAGE;
201 if ( !parse_examine (ds, &cmd, NULL) )
204 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
205 if (cmd.incl == XMN_INCLUDE )
206 value_is_missing = mv_is_value_system_missing;
208 value_is_missing = mv_is_value_missing;
210 if ( cmd.st_n == SYSMIS )
213 if ( ! cmd.sbc_cinterval)
214 cmd.n_cinterval[0] = 95.0;
216 /* If descriptives have been requested, make sure the
217 quartiles are calculated */
218 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] )
220 subc_list_double_push(&percentile_list, 25);
221 subc_list_double_push(&percentile_list, 50);
222 subc_list_double_push(&percentile_list, 75);
225 ok = multipass_procedure_with_splits (ds, run_examine, &cmd);
232 if ( dependent_vars )
233 free (dependent_vars);
236 struct factor *f = factors ;
239 struct factor *ff = f;
243 hsh_destroy ( ff->fstats ) ;
249 subc_list_double_destroy(&percentile_list);
251 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
256 /* Show all the appropriate tables */
262 /* Show totals if appropriate */
263 if ( ! cmd.sbc_nototal || factors == 0 )
265 show_summary(dependent_vars, n_dependent_vars, 0);
267 if ( cmd.sbc_statistics )
269 if ( cmd.a_statistics[XMN_ST_EXTREME])
270 show_extremes(dependent_vars, n_dependent_vars, 0, cmd.st_n);
272 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
273 show_descriptives(dependent_vars, n_dependent_vars, 0);
276 if ( sbc_percentile )
277 show_percentiles(dependent_vars, n_dependent_vars, 0);
282 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
284 for ( v = 0 ; v < n_dependent_vars; ++v )
285 np_plot(&totals[v], var_to_string(dependent_vars[v]));
288 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
290 if ( cmd.cmp == XMN_GROUPS )
292 box_plot_group (0, (const struct variable **) dependent_vars,
293 n_dependent_vars, cmd.v_id);
296 box_plot_variables (0,
297 (const struct variable **) dependent_vars,
298 n_dependent_vars, cmd.v_id);
301 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
303 for ( v = 0 ; v < n_dependent_vars; ++v )
305 struct normal_curve normal;
307 normal.N = totals[v].n;
308 normal.mean = totals[v].mean;
309 normal.stddev = totals[v].stddev;
311 histogram_plot(totals[v].histogram,
312 var_to_string(dependent_vars[v]),
322 /* Show grouped statistics as appropriate */
326 show_summary(dependent_vars, n_dependent_vars, fctr);
328 if ( cmd.sbc_statistics )
330 if ( cmd.a_statistics[XMN_ST_EXTREME])
331 show_extremes(dependent_vars, n_dependent_vars, fctr, cmd.st_n);
333 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
334 show_descriptives(dependent_vars, n_dependent_vars, fctr);
337 if ( sbc_percentile )
338 show_percentiles(dependent_vars, n_dependent_vars, fctr);
345 struct factor_statistics **fs = fctr->fs ;
347 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
349 if ( cmd.cmp == XMN_VARIABLES )
350 box_plot_variables (fctr,
351 (const struct variable **) dependent_vars,
352 n_dependent_vars, cmd.v_id);
354 box_plot_group (fctr,
355 (const struct variable **) dependent_vars,
356 n_dependent_vars, cmd.v_id);
359 for ( v = 0 ; v < n_dependent_vars; ++v )
362 for ( fs = fctr->fs ; *fs ; ++fs )
364 const char *s = factor_to_string(fctr, *fs, dependent_vars[v]);
366 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
367 np_plot(&(*fs)->m[v], s);
369 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
371 struct normal_curve normal;
373 normal.N = (*fs)->m[v].n;
374 normal.mean = (*fs)->m[v].mean;
375 normal.stddev = (*fs)->m[v].stddev;
377 histogram_plot((*fs)->m[v].histogram,
381 } /* for ( fs .... */
383 } /* for ( v = 0 ..... */
393 /* Create a hash table of percentiles and their values from the list of
395 static struct hsh_table *
396 list_to_ptile_hash(const subc_list_double *l)
400 struct hsh_table *h ;
402 h = hsh_create(subc_list_double_count(l),
403 (hsh_compare_func *) ptile_compare,
404 (hsh_hash_func *) ptile_hash,
405 (hsh_free_func *) free,
409 for ( i = 0 ; i < subc_list_double_count(l) ; ++i )
411 struct percentile *p = xmalloc (sizeof *p);
413 p->p = subc_list_double_at(l,i);
424 /* Parse the PERCENTILES subcommand */
426 xmn_custom_percentiles(struct dataset *ds UNUSED,
427 struct cmd_examine *p UNUSED, void *aux UNUSED)
435 while ( lex_is_number() )
437 subc_list_double_push (&percentile_list, lex_number());
447 if ( lex_match_id("HAVERAGE"))
448 percentile_algorithm = PC_HAVERAGE;
450 else if ( lex_match_id("WAVERAGE"))
451 percentile_algorithm = PC_WAVERAGE;
453 else if ( lex_match_id("ROUND"))
454 percentile_algorithm = PC_ROUND;
456 else if ( lex_match_id("EMPIRICAL"))
457 percentile_algorithm = PC_EMPIRICAL;
459 else if ( lex_match_id("AEMPIRICAL"))
460 percentile_algorithm = PC_AEMPIRICAL;
462 else if ( lex_match_id("NONE"))
463 percentile_algorithm = PC_NONE;
466 if ( 0 == subc_list_double_count(&percentile_list))
468 subc_list_double_push (&percentile_list, 5);
469 subc_list_double_push (&percentile_list, 10);
470 subc_list_double_push (&percentile_list, 25);
471 subc_list_double_push (&percentile_list, 50);
472 subc_list_double_push (&percentile_list, 75);
473 subc_list_double_push (&percentile_list, 90);
474 subc_list_double_push (&percentile_list, 95);
480 /* TOTAL and NOTOTAL are simple, mutually exclusive flags */
482 xmn_custom_total (struct dataset *ds UNUSED, struct cmd_examine *p, void *aux UNUSED)
484 if ( p->sbc_nototal )
486 msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
494 xmn_custom_nototal (struct dataset *ds UNUSED,
495 struct cmd_examine *p, void *aux UNUSED)
499 msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
508 /* Parser for the variables sub command
509 Returns 1 on success */
511 xmn_custom_variables(struct dataset *ds, struct cmd_examine *cmd, void *aux UNUSED)
513 const struct dictionary *dict = dataset_dict (ds);
516 if ((token != T_ID || dict_lookup_var (dict, tokid) == NULL)
522 if (!parse_variables (dict, &dependent_vars, &n_dependent_vars,
523 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
525 free (dependent_vars);
529 assert(n_dependent_vars);
531 totals = xnmalloc (n_dependent_vars, sizeof *totals);
533 if ( lex_match(T_BY))
536 success = examine_parse_independent_vars (dict, cmd);
537 if ( success != 1 ) {
538 free (dependent_vars);
549 /* Parse the clause specifying the factors */
551 examine_parse_independent_vars (const struct dictionary *dict, struct cmd_examine *cmd)
554 struct factor *sf = xmalloc (sizeof *sf);
556 if ((token != T_ID || dict_lookup_var (dict, tokid) == NULL)
564 sf->indep_var[0] = parse_variable (dict);
565 sf->indep_var[1] = 0;
572 if ((token != T_ID || dict_lookup_var (dict, tokid) == NULL)
579 sf->indep_var[1] = parse_variable (dict);
584 sf->fstats = hsh_create(4,
585 (hsh_compare_func *) factor_statistics_compare,
586 (hsh_hash_func *) factor_statistics_hash,
587 (hsh_free_func *) factor_statistics_free,
595 if ( token == '.' || token == '/' )
598 success = examine_parse_independent_vars (dict, cmd);
609 void populate_percentiles(struct tab_table *tbl, int col, int row,
610 const struct metrics *m);
612 void populate_descriptives(struct tab_table *t, int col, int row,
613 const struct metrics *fs);
615 void populate_extremes(struct tab_table *t, int col, int row, int n,
616 const struct metrics *m);
618 void populate_summary(struct tab_table *t, int col, int row,
619 const struct metrics *m);
624 static bool bad_weight_warn = true;
627 /* Perform calculations for the sub factors */
629 factor_calc(struct ccase *c, int case_no, double weight, int case_missing)
632 struct factor *fctr = factors;
636 struct factor_statistics **foo ;
637 union value indep_vals[2] ;
639 indep_vals[0] = * case_data(c, fctr->indep_var[0]->fv);
641 if ( fctr->indep_var[1] )
642 indep_vals[1] = * case_data(c, fctr->indep_var[1]->fv);
644 indep_vals[1].f = SYSMIS;
646 assert(fctr->fstats);
648 foo = ( struct factor_statistics ** )
649 hsh_probe(fctr->fstats, (void *) &indep_vals);
654 *foo = create_factor_statistics(n_dependent_vars,
658 for ( v = 0 ; v < n_dependent_vars ; ++v )
660 metrics_precalc( &(*foo)->m[v] );
665 for ( v = 0 ; v < n_dependent_vars ; ++v )
667 const struct variable *var = dependent_vars[v];
668 const union value *val = case_data (c, var->fv);
670 if ( value_is_missing (&var->miss, val) || case_missing )
673 metrics_calc( &(*foo)->m[v], val, weight, case_no);
684 run_examine(const struct ccase *first, const struct casefile *cf,
685 void *cmd_, const struct dataset *ds)
687 struct dictionary *dict = dataset_dict (ds);
688 struct casereader *r;
692 const struct cmd_examine *cmd = (struct cmd_examine *) cmd_;
696 output_split_file_values (ds, first);
698 /* Make sure we haven't got rubbish left over from a
703 struct factor *next = fctr->next;
705 hsh_clear(fctr->fstats);
712 for ( v = 0 ; v < n_dependent_vars ; ++v )
713 metrics_precalc(&totals[v]);
715 for(r = casefile_get_reader (cf);
716 casereader_read (r, &c) ;
720 const int case_no = casereader_cnum(r);
722 const double weight =
723 dict_get_case_weight(dict, &c, &bad_weight_warn);
725 if ( cmd->miss == XMN_LISTWISE )
727 for ( v = 0 ; v < n_dependent_vars ; ++v )
729 const struct variable *var = dependent_vars[v];
730 const union value *val = case_data (&c, var->fv);
732 if ( value_is_missing(&var->miss, val))
738 for ( v = 0 ; v < n_dependent_vars ; ++v )
740 const struct variable *var = dependent_vars[v];
741 const union value *val = case_data (&c, var->fv);
743 if ( value_is_missing(&var->miss, val) || case_missing )
746 metrics_calc(&totals[v], val, weight, case_no);
750 factor_calc(&c, case_no, weight, case_missing);
755 for ( v = 0 ; v < n_dependent_vars ; ++v)
760 struct hsh_iterator hi;
761 struct factor_statistics *fs;
763 for ( fs = hsh_first(fctr->fstats, &hi);
765 fs = hsh_next(fctr->fstats, &hi))
768 fs->m[v].ptile_hash = list_to_ptile_hash(&percentile_list);
769 fs->m[v].ptile_alg = percentile_algorithm;
770 metrics_postcalc(&fs->m[v]);
776 totals[v].ptile_hash = list_to_ptile_hash(&percentile_list);
777 totals[v].ptile_alg = percentile_algorithm;
778 metrics_postcalc(&totals[v]);
782 /* Make sure that the combination of factors are complete */
787 struct hsh_iterator hi;
788 struct hsh_iterator hi0;
789 struct hsh_iterator hi1;
790 struct factor_statistics *fs;
792 struct hsh_table *idh0=0;
793 struct hsh_table *idh1=0;
797 idh0 = hsh_create(4, (hsh_compare_func *) compare_values,
798 (hsh_hash_func *) hash_value,
801 idh1 = hsh_create(4, (hsh_compare_func *) compare_values,
802 (hsh_hash_func *) hash_value,
806 for ( fs = hsh_first(fctr->fstats, &hi);
808 fs = hsh_next(fctr->fstats, &hi))
810 hsh_insert(idh0,(void *) &fs->id[0]);
811 hsh_insert(idh1,(void *) &fs->id[1]);
814 /* Ensure that the factors combination is complete */
815 for ( val0 = hsh_first(idh0, &hi0);
817 val0 = hsh_next(idh0, &hi0))
819 for ( val1 = hsh_first(idh1, &hi1);
821 val1 = hsh_next(idh1, &hi1))
823 struct factor_statistics **ffs;
828 ffs = (struct factor_statistics **)
829 hsh_probe(fctr->fstats, (void *) &key );
833 (*ffs) = create_factor_statistics (n_dependent_vars,
835 for ( i = 0 ; i < n_dependent_vars ; ++i )
836 metrics_precalc( &(*ffs)->m[i]);
844 fctr->fs = (struct factor_statistics **) hsh_sort_copy(fctr->fstats);
855 for ( i = 0 ; i < n_dependent_vars ; ++i )
857 metrics_destroy(&totals[i]);
866 show_summary(struct variable **dependent_var, int n_dep_var,
867 const struct factor *fctr)
869 static const char *subtitle[]=
877 int heading_columns ;
879 const int heading_rows = 3;
880 struct tab_table *tbl;
888 n_factors = hsh_count(fctr->fstats);
889 n_rows = n_dep_var * n_factors ;
891 if ( fctr->indep_var[1] )
900 n_rows += heading_rows;
902 n_cols = heading_columns + 6;
904 tbl = tab_create (n_cols,n_rows,0);
905 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
907 tab_dim (tbl, tab_natural_dimensions);
909 /* Outline the box */
914 n_cols - 1, n_rows - 1);
916 /* Vertical lines for the data only */
921 n_cols - 1, n_rows - 1);
924 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
925 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, 1 );
926 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, heading_rows -1 );
928 tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1);
931 tab_title (tbl, _("Case Processing Summary"));
934 tab_joint_text(tbl, heading_columns, 0,
936 TAB_CENTER | TAT_TITLE,
939 /* Remove lines ... */
946 for ( i = 0 ; i < 3 ; ++i )
948 tab_text (tbl, heading_columns + i*2 , 2, TAB_CENTER | TAT_TITLE,
951 tab_text (tbl, heading_columns + i*2 + 1, 2, TAB_CENTER | TAT_TITLE,
954 tab_joint_text(tbl, heading_columns + i*2 , 1,
955 heading_columns + i*2 + 1, 1,
956 TAB_CENTER | TAT_TITLE,
959 tab_box (tbl, -1, -1,
961 heading_columns + i*2, 1,
962 heading_columns + i*2 + 1, 1);
967 /* Titles for the independent variables */
970 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
971 var_to_string(fctr->indep_var[0]));
973 if ( fctr->indep_var[1] )
975 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
976 var_to_string(fctr->indep_var[1]));
982 for ( i = 0 ; i < n_dep_var ; ++i )
986 n_factors = hsh_count(fctr->fstats);
990 tab_hline(tbl, TAL_1, 0, n_cols -1 , i * n_factors + heading_rows);
993 0, i * n_factors + heading_rows,
994 TAB_LEFT | TAT_TITLE,
995 var_to_string(dependent_var[i])
1000 populate_summary(tbl, heading_columns,
1001 (i * n_factors) + heading_rows,
1007 struct factor_statistics **fs = fctr->fs;
1012 static union value prev;
1014 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1015 fctr->indep_var[0]->width))
1019 (i * n_factors ) + count +
1021 TAB_LEFT | TAT_TITLE,
1022 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1025 if (fctr->indep_var[1] && count > 0 )
1026 tab_hline(tbl, TAL_1, 1, n_cols - 1,
1027 (i * n_factors ) + count + heading_rows);
1031 prev = (*fs)->id[0];
1034 if ( fctr->indep_var[1])
1037 (i * n_factors ) + count +
1039 TAB_LEFT | TAT_TITLE,
1040 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1043 populate_summary(tbl, heading_columns,
1044 (i * n_factors) + count
1059 populate_summary(struct tab_table *t, int col, int row,
1060 const struct metrics *m)
1063 const double total = m->n + m->n_missing ;
1065 tab_float(t, col + 0, row + 0, TAB_RIGHT, m->n, 8, 0);
1066 tab_float(t, col + 2, row + 0, TAB_RIGHT, m->n_missing, 8, 0);
1067 tab_float(t, col + 4, row + 0, TAB_RIGHT, total, 8, 0);
1071 tab_text (t, col + 1, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1072 100.0 * m->n / total );
1074 tab_text (t, col + 3, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1075 100.0 * m->n_missing / total );
1077 /* This seems a bit pointless !!! */
1078 tab_text (t, col + 5, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1079 100.0 * total / total );
1090 show_extremes(struct variable **dependent_var, int n_dep_var,
1091 const struct factor *fctr, int n_extremities)
1094 int heading_columns ;
1096 const int heading_rows = 1;
1097 struct tab_table *tbl;
1104 heading_columns = 2;
1105 n_factors = hsh_count(fctr->fstats);
1107 n_rows = n_dep_var * 2 * n_extremities * n_factors;
1109 if ( fctr->indep_var[1] )
1110 heading_columns = 3;
1114 heading_columns = 1;
1115 n_rows = n_dep_var * 2 * n_extremities;
1118 n_rows += heading_rows;
1120 heading_columns += 2;
1121 n_cols = heading_columns + 2;
1123 tbl = tab_create (n_cols,n_rows,0);
1124 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
1126 tab_dim (tbl, tab_natural_dimensions);
1128 /* Outline the box, No internal lines*/
1133 n_cols - 1, n_rows - 1);
1135 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1137 tab_title (tbl, _("Extreme Values"));
1139 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows -1);
1140 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows -1);
1144 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1145 var_to_string(fctr->indep_var[0]));
1147 if ( fctr->indep_var[1] )
1148 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1149 var_to_string(fctr->indep_var[1]));
1152 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Value"));
1153 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Case Number"));
1155 for ( i = 0 ; i < n_dep_var ; ++i )
1159 tab_hline(tbl, TAL_1, 0, n_cols -1 ,
1160 i * 2 * n_extremities * n_factors + heading_rows);
1163 i * 2 * n_extremities * n_factors + heading_rows,
1164 TAB_LEFT | TAT_TITLE,
1165 var_to_string(dependent_var[i])
1170 populate_extremes(tbl, heading_columns - 2,
1171 i * 2 * n_extremities * n_factors + heading_rows,
1172 n_extremities, &totals[i]);
1176 struct factor_statistics **fs = fctr->fs;
1181 static union value prev ;
1183 const int row = heading_rows + ( 2 * n_extremities ) *
1184 ( ( i * n_factors ) + count );
1187 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1188 fctr->indep_var[0]->width))
1192 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1196 TAB_LEFT | TAT_TITLE,
1197 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1201 prev = (*fs)->id[0];
1203 if (fctr->indep_var[1] && count > 0 )
1204 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
1206 if ( fctr->indep_var[1])
1207 tab_text (tbl, 2, row,
1208 TAB_LEFT | TAT_TITLE,
1209 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1212 populate_extremes(tbl, heading_columns - 2,
1227 /* Fill in the extremities table */
1229 populate_extremes(struct tab_table *t,
1230 int col, int row, int n, const struct metrics *m)
1236 tab_text(t, col, row,
1237 TAB_RIGHT | TAT_TITLE ,
1241 tab_text(t, col, row + n ,
1242 TAB_RIGHT | TAT_TITLE ,
1247 tab_hline(t, TAL_1, col, col + 3, row + n );
1249 for (extremity = 0; extremity < n ; ++extremity )
1252 tab_float(t, col + 1, row + extremity,
1254 extremity + 1, 8, 0);
1258 tab_float(t, col + 1, row + extremity + n,
1260 extremity + 1, 8, 0);
1266 for (idx = 0, extremity = 0; extremity < n && idx < m->n_data ; ++idx )
1269 const struct weighted_value *wv = m->wvp[idx];
1270 struct case_node *cn = wv->case_nos;
1273 for (j = 0 ; j < wv->w ; ++j )
1275 if ( extremity + j >= n )
1278 tab_float(t, col + 3, row + extremity + j + n,
1282 tab_float(t, col + 2, row + extremity + j + n,
1291 extremity += wv->w ;
1296 for (idx = m->n_data - 1, extremity = 0; extremity < n && idx >= 0; --idx )
1299 const struct weighted_value *wv = m->wvp[idx];
1300 struct case_node *cn = wv->case_nos;
1302 for (j = 0 ; j < wv->w ; ++j )
1304 if ( extremity + j >= n )
1307 tab_float(t, col + 3, row + extremity + j,
1311 tab_float(t, col + 2, row + extremity + j,
1320 extremity += wv->w ;
1325 /* Show the descriptives table */
1327 show_descriptives(struct variable **dependent_var,
1329 struct factor *fctr)
1332 int heading_columns ;
1334 const int n_stat_rows = 13;
1336 const int heading_rows = 1;
1338 struct tab_table *tbl;
1345 heading_columns = 4;
1346 n_factors = hsh_count(fctr->fstats);
1348 n_rows = n_dep_var * n_stat_rows * n_factors;
1350 if ( fctr->indep_var[1] )
1351 heading_columns = 5;
1355 heading_columns = 3;
1356 n_rows = n_dep_var * n_stat_rows;
1359 n_rows += heading_rows;
1361 n_cols = heading_columns + 2;
1364 tbl = tab_create (n_cols, n_rows, 0);
1366 tab_headers (tbl, heading_columns + 1, 0, heading_rows, 0);
1368 tab_dim (tbl, tab_natural_dimensions);
1370 /* Outline the box and have no internal lines*/
1375 n_cols - 1, n_rows - 1);
1377 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1379 tab_vline (tbl, TAL_1, heading_columns, 0, n_rows - 1);
1380 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows - 1);
1381 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows - 1);
1383 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Statistic"));
1384 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
1386 tab_title (tbl, _("Descriptives"));
1389 for ( i = 0 ; i < n_dep_var ; ++i )
1391 const int row = heading_rows + i * n_stat_rows * n_factors ;
1394 tab_hline(tbl, TAL_1, 0, n_cols - 1, row );
1397 i * n_stat_rows * n_factors + heading_rows,
1398 TAB_LEFT | TAT_TITLE,
1399 var_to_string(dependent_var[i])
1405 struct factor_statistics **fs = fctr->fs;
1408 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1409 var_to_string(fctr->indep_var[0]));
1412 if ( fctr->indep_var[1])
1413 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1414 var_to_string(fctr->indep_var[1]));
1419 static union value prev ;
1421 const int row = heading_rows + n_stat_rows *
1422 ( ( i * n_factors ) + count );
1425 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1426 fctr->indep_var[0]->width))
1430 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1434 TAB_LEFT | TAT_TITLE,
1435 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1439 prev = (*fs)->id[0];
1441 if (fctr->indep_var[1] && count > 0 )
1442 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
1444 if ( fctr->indep_var[1])
1445 tab_text (tbl, 2, row,
1446 TAB_LEFT | TAT_TITLE,
1447 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1450 populate_descriptives(tbl, heading_columns - 2,
1462 populate_descriptives(tbl, heading_columns - 2,
1463 i * n_stat_rows * n_factors + heading_rows,
1475 /* Fill in the descriptives data */
1477 populate_descriptives(struct tab_table *tbl, int col, int row,
1478 const struct metrics *m)
1481 const double t = gsl_cdf_tdist_Qinv(1 - cmd.n_cinterval[0]/100.0/2.0, \
1487 TAB_LEFT | TAT_TITLE,
1490 tab_float (tbl, col + 2,
1496 tab_float (tbl, col + 3,
1505 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1506 _("%g%% Confidence Interval for Mean"), cmd.n_cinterval[0]);
1509 tab_text (tbl, col + 1,
1511 TAB_LEFT | TAT_TITLE,
1514 tab_float (tbl, col + 2,
1517 m->mean - t * m->se_mean,
1520 tab_text (tbl, col + 1,
1522 TAB_LEFT | TAT_TITLE,
1526 tab_float (tbl, col + 2,
1529 m->mean + t * m->se_mean,
1534 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1535 _("5%% Trimmed Mean"));
1537 tab_float (tbl, col + 2,
1545 TAB_LEFT | TAT_TITLE,
1549 struct percentile *p;
1552 p = hsh_find(m->ptile_hash, &d);
1557 tab_float (tbl, col + 2,
1567 TAB_LEFT | TAT_TITLE,
1570 tab_float (tbl, col + 2,
1579 TAB_LEFT | TAT_TITLE,
1580 _("Std. Deviation"));
1583 tab_float (tbl, col + 2,
1592 TAB_LEFT | TAT_TITLE,
1595 tab_float (tbl, col + 2,
1603 TAB_LEFT | TAT_TITLE,
1606 tab_float (tbl, col + 2,
1615 TAB_LEFT | TAT_TITLE,
1619 tab_float (tbl, col + 2,
1627 TAB_LEFT | TAT_TITLE,
1628 _("Interquartile Range"));
1631 struct percentile *p1;
1632 struct percentile *p2;
1635 p1 = hsh_find(m->ptile_hash, &d);
1638 p2 = hsh_find(m->ptile_hash, &d);
1643 tab_float (tbl, col + 2,
1654 TAB_LEFT | TAT_TITLE,
1658 tab_float (tbl, col + 2,
1664 /* stderr of skewness */
1665 tab_float (tbl, col + 3,
1674 TAB_LEFT | TAT_TITLE,
1678 tab_float (tbl, col + 2,
1684 /* stderr of kurtosis */
1685 tab_float (tbl, col + 3,
1697 box_plot_variables(const struct factor *fctr,
1698 const struct variable **vars, int n_vars,
1699 const struct variable *id)
1703 struct factor_statistics **fs ;
1707 box_plot_group(fctr, vars, n_vars, id);
1711 for ( fs = fctr->fs ; *fs ; ++fs )
1713 double y_min = DBL_MAX;
1714 double y_max = -DBL_MAX;
1715 struct chart *ch = chart_create();
1716 const char *s = factor_to_string(fctr, *fs, 0 );
1718 chart_write_title(ch, s);
1720 for ( i = 0 ; i < n_vars ; ++i )
1722 y_max = max(y_max, (*fs)->m[i].max);
1723 y_min = min(y_min, (*fs)->m[i].min);
1726 boxplot_draw_yscale(ch, y_max, y_min);
1728 for ( i = 0 ; i < n_vars ; ++i )
1731 const double box_width = (ch->data_right - ch->data_left)
1734 const double box_centre = ( i * 2 + 1) * box_width
1737 boxplot_draw_boxplot(ch,
1738 box_centre, box_width,
1740 var_to_string(vars[i]));
1752 /* Do a box plot, grouping all factors into one plot ;
1753 each dependent variable has its own plot.
1756 box_plot_group(const struct factor *fctr,
1757 const struct variable **vars,
1759 const struct variable *id UNUSED)
1764 for ( i = 0 ; i < n_vars ; ++i )
1766 struct factor_statistics **fs ;
1769 ch = chart_create();
1771 boxplot_draw_yscale(ch, totals[i].max, totals[i].min);
1777 for ( fs = fctr->fs ; *fs ; ++fs )
1780 chart_write_title(ch, _("Boxplot of %s vs. %s"),
1781 var_to_string(vars[i]), var_to_string(fctr->indep_var[0]) );
1783 for ( fs = fctr->fs ; *fs ; ++fs )
1786 const char *s = factor_to_string_concise(fctr, *fs);
1788 const double box_width = (ch->data_right - ch->data_left)
1789 / (n_factors * 2.0 ) ;
1791 const double box_centre = ( f++ * 2 + 1) * box_width
1794 boxplot_draw_boxplot(ch,
1795 box_centre, box_width,
1802 const double box_width = (ch->data_right - ch->data_left) / 3.0;
1803 const double box_centre = (ch->data_right + ch->data_left) / 2.0;
1805 chart_write_title(ch, _("Boxplot"));
1807 boxplot_draw_boxplot(ch,
1808 box_centre, box_width,
1810 var_to_string(vars[i]) );
1819 /* Plot the normal and detrended normal plots for m
1820 Label the plots with factorname */
1822 np_plot(const struct metrics *m, const char *factorname)
1825 double yfirst=0, ylast=0;
1828 struct chart *np_chart;
1830 /* Detrended Normal Plot */
1831 struct chart *dnp_chart;
1833 /* The slope and intercept of the ideal normal probability line */
1834 const double slope = 1.0 / m->stddev;
1835 const double intercept = - m->mean / m->stddev;
1837 /* Cowardly refuse to plot an empty data set */
1838 if ( m->n_data == 0 )
1841 np_chart = chart_create();
1842 dnp_chart = chart_create();
1844 if ( !np_chart || ! dnp_chart )
1847 chart_write_title(np_chart, _("Normal Q-Q Plot of %s"), factorname);
1848 chart_write_xlabel(np_chart, _("Observed Value"));
1849 chart_write_ylabel(np_chart, _("Expected Normal"));
1852 chart_write_title(dnp_chart, _("Detrended Normal Q-Q Plot of %s"),
1854 chart_write_xlabel(dnp_chart, _("Observed Value"));
1855 chart_write_ylabel(dnp_chart, _("Dev from Normal"));
1857 yfirst = gsl_cdf_ugaussian_Pinv (m->wvp[0]->rank / ( m->n + 1));
1858 ylast = gsl_cdf_ugaussian_Pinv (m->wvp[m->n_data-1]->rank / ( m->n + 1));
1862 /* Need to make sure that both the scatter plot and the ideal fit into the
1864 double x_lower = min(m->min, (yfirst - intercept) / slope) ;
1865 double x_upper = max(m->max, (ylast - intercept) / slope) ;
1866 double slack = (x_upper - x_lower) * 0.05 ;
1868 chart_write_xscale(np_chart, x_lower - slack, x_upper + slack, 5);
1870 chart_write_xscale(dnp_chart, m->min, m->max, 5);
1874 chart_write_yscale(np_chart, yfirst, ylast, 5);
1877 /* We have to cache the detrended data, beacause we need to
1878 find its limits before we can plot it */
1879 double *d_data = xnmalloc (m->n_data, sizeof *d_data);
1880 double d_max = -DBL_MAX;
1881 double d_min = DBL_MAX;
1882 for ( i = 0 ; i < m->n_data; ++i )
1884 const double ns = gsl_cdf_ugaussian_Pinv (m->wvp[i]->rank / ( m->n + 1));
1886 chart_datum(np_chart, 0, m->wvp[i]->v.f, ns);
1888 d_data[i] = (m->wvp[i]->v.f - m->mean) / m->stddev - ns;
1890 if ( d_data[i] < d_min ) d_min = d_data[i];
1891 if ( d_data[i] > d_max ) d_max = d_data[i];
1893 chart_write_yscale(dnp_chart, d_min, d_max, 5);
1895 for ( i = 0 ; i < m->n_data; ++i )
1896 chart_datum(dnp_chart, 0, m->wvp[i]->v.f, d_data[i]);
1901 chart_line(np_chart, slope, intercept, yfirst, ylast , CHART_DIM_Y);
1902 chart_line(dnp_chart, 0, 0, m->min, m->max , CHART_DIM_X);
1904 chart_submit(np_chart);
1905 chart_submit(dnp_chart);
1911 /* Show the percentiles */
1913 show_percentiles(struct variable **dependent_var,
1915 struct factor *fctr)
1917 struct tab_table *tbl;
1923 struct hsh_table *ptiles ;
1925 int n_heading_columns;
1926 const int n_heading_rows = 2;
1927 const int n_stat_rows = 2;
1933 struct factor_statistics **fs = fctr->fs ;
1934 n_heading_columns = 3;
1935 n_factors = hsh_count(fctr->fstats);
1937 ptiles = (*fs)->m[0].ptile_hash;
1939 if ( fctr->indep_var[1] )
1940 n_heading_columns = 4;
1945 n_heading_columns = 2;
1947 ptiles = totals[0].ptile_hash;
1950 n_ptiles = hsh_count(ptiles);
1952 n_rows = n_heading_rows + n_dep_var * n_stat_rows * n_factors;
1954 n_cols = n_heading_columns + n_ptiles ;
1956 tbl = tab_create (n_cols, n_rows, 0);
1958 tab_headers (tbl, n_heading_columns + 1, 0, n_heading_rows, 0);
1960 tab_dim (tbl, tab_natural_dimensions);
1962 /* Outline the box and have no internal lines*/
1967 n_cols - 1, n_rows - 1);
1969 tab_hline (tbl, TAL_2, 0, n_cols - 1, n_heading_rows );
1971 tab_vline (tbl, TAL_2, n_heading_columns, 0, n_rows - 1);
1974 tab_title (tbl, _("Percentiles"));
1977 tab_hline (tbl, TAL_1, n_heading_columns, n_cols - 1, 1 );
1984 n_heading_columns - 1, n_rows - 1);
1990 n_heading_columns, n_heading_rows - 1,
1991 n_cols - 1, n_rows - 1);
1993 tab_joint_text(tbl, n_heading_columns + 1, 0,
1995 TAB_CENTER | TAT_TITLE ,
2000 /* Put in the percentile break points as headings */
2002 struct percentile **p = (struct percentile **) hsh_sort(ptiles);
2007 tab_float(tbl, n_heading_columns + i++ , 1,
2016 for ( i = 0 ; i < n_dep_var ; ++i )
2018 const int n_stat_rows = 2;
2019 const int row = n_heading_rows + i * n_stat_rows * n_factors ;
2022 tab_hline(tbl, TAL_1, 0, n_cols - 1, row );
2025 i * n_stat_rows * n_factors + n_heading_rows,
2026 TAB_LEFT | TAT_TITLE,
2027 var_to_string(dependent_var[i])
2032 struct factor_statistics **fs = fctr->fs;
2035 tab_text (tbl, 1, n_heading_rows - 1,
2036 TAB_CENTER | TAT_TITLE,
2037 var_to_string(fctr->indep_var[0]));
2040 if ( fctr->indep_var[1])
2041 tab_text (tbl, 2, n_heading_rows - 1, TAB_CENTER | TAT_TITLE,
2042 var_to_string(fctr->indep_var[1]));
2047 static union value prev ;
2049 const int row = n_heading_rows + n_stat_rows *
2050 ( ( i * n_factors ) + count );
2053 if ( 0 != compare_values(&prev, &(*fs)->id[0],
2054 fctr->indep_var[0]->width))
2058 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
2062 TAB_LEFT | TAT_TITLE,
2063 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
2069 prev = (*fs)->id[0];
2071 if (fctr->indep_var[1] && count > 0 )
2072 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
2074 if ( fctr->indep_var[1])
2075 tab_text (tbl, 2, row,
2076 TAB_LEFT | TAT_TITLE,
2077 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
2081 populate_percentiles(tbl, n_heading_columns - 1,
2093 populate_percentiles(tbl, n_heading_columns - 1,
2094 i * n_stat_rows * n_factors + n_heading_rows,
2111 populate_percentiles(struct tab_table *tbl, int col, int row,
2112 const struct metrics *m)
2116 struct percentile **p = (struct percentile **) hsh_sort(m->ptile_hash);
2120 TAB_LEFT | TAT_TITLE,
2121 _("Tukey\'s Hinges")
2126 TAB_LEFT | TAT_TITLE,
2127 ptile_alg_desc[m->ptile_alg]
2134 tab_float(tbl, col + i + 1 , row,
2137 if ( (*p)->p == 25 )
2138 tab_float(tbl, col + i + 1 , row + 1,
2142 if ( (*p)->p == 50 )
2143 tab_float(tbl, col + i + 1 , row + 1,
2147 if ( (*p)->p == 75 )
2148 tab_float(tbl, col + i + 1 , row + 1,
2163 factor_to_string(const struct factor *fctr,
2164 struct factor_statistics *fs,
2165 const struct variable *var)
2168 static char buf1[100];
2174 sprintf(buf1, "%s (",var_to_string(var) );
2177 snprintf(buf2, 100, "%s = %s",
2178 var_to_string(fctr->indep_var[0]),
2179 value_to_string(&fs->id[0],fctr->indep_var[0]));
2183 if ( fctr->indep_var[1] )
2185 sprintf(buf2, "; %s = %s)",
2186 var_to_string(fctr->indep_var[1]),
2187 value_to_string(&fs->id[1],
2188 fctr->indep_var[1]));
2203 factor_to_string_concise(const struct factor *fctr,
2204 struct factor_statistics *fs)
2208 static char buf[100];
2212 snprintf(buf, 100, "%s",
2213 value_to_string(&fs->id[0], fctr->indep_var[0]));
2215 if ( fctr->indep_var[1] )
2217 sprintf(buf2, ",%s)", value_to_string(&fs->id[1], fctr->indep_var[1]) );