1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2004, 2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
19 #include <gsl/gsl_cdf.h>
20 #include <libpspp/message.h>
25 #include <data/case.h>
26 #include <data/casegrouper.h>
27 #include <data/casereader.h>
28 #include <data/dictionary.h>
29 #include <data/procedure.h>
30 #include <data/value-labels.h>
31 #include <data/variable.h>
32 #include <data/format.h>
33 #include <language/command.h>
34 #include <language/dictionary/split-file.h>
35 #include <language/lexer/lexer.h>
36 #include <libpspp/compiler.h>
37 #include <libpspp/hash.h>
38 #include <libpspp/message.h>
39 #include <libpspp/misc.h>
40 #include <libpspp/str.h>
41 #include <math/factor-stats.h>
42 #include <math/moments.h>
43 #include <math/percentiles.h>
44 #include <output/charts/box-whisker.h>
45 #include <output/charts/cartesian.h>
46 #include <output/manager.h>
47 #include <output/table.h>
53 #define _(msgid) gettext (msgid)
54 #define N_(msgid) msgid
57 #include <output/chart.h>
58 #include <output/charts/plot-hist.h>
59 #include <output/charts/plot-chart.h>
66 missing=miss:pairwise/!listwise,
68 incl:include/!exclude;
69 +compare=cmp:variables/!groups;
72 +plot[plt_]=stemleaf,boxplot,npplot,:spreadlevel(*d:n),histogram,all,none;
74 +statistics[st_]=descriptives,:extreme(*d:n),all,none.
83 static struct cmd_examine cmd;
85 static const struct variable **dependent_vars;
87 static size_t n_dependent_vars;
92 /* The independent variable */
93 struct variable *indep_var[2];
96 /* Hash table of factor stats indexed by 2 values */
97 struct hsh_table *fstats;
99 /* The hash table after it has been crunched */
100 struct factor_statistics **fs;
106 /* Linked list of factors */
107 static struct factor *factors = 0;
109 static struct metrics *totals = 0;
111 /* Parse the clause specifying the factors */
112 static int examine_parse_independent_vars (struct lexer *lexer, const struct dictionary *dict, struct cmd_examine *cmd);
116 /* Output functions */
117 static void show_summary (const struct variable **dependent_var, int n_dep_var,
118 const struct dictionary *dict,
119 const struct factor *f);
121 static void show_extremes (const struct variable **dependent_var,
123 const struct factor *factor,
126 static void show_descriptives (const struct variable **dependent_var,
128 struct factor *factor);
130 static void show_percentiles (const struct variable **dependent_var,
132 struct factor *factor);
137 void np_plot (const struct metrics *m, const char *factorname);
140 void box_plot_group (const struct factor *fctr,
141 const struct variable **vars, int n_vars,
142 const struct variable *id
146 void box_plot_variables (const struct factor *fctr,
147 const struct variable **vars, int n_vars,
148 const struct variable *id
153 /* Per Split function */
154 static void run_examine (struct cmd_examine *, struct casereader *,
157 static void output_examine (const struct dictionary *dict);
160 void factor_calc (const struct ccase *c, int case_no,
161 double weight, bool case_missing);
164 /* Represent a factor as a string, so it can be
165 printed in a human readable fashion */
166 static void factor_to_string (const struct factor *fctr,
167 const struct factor_statistics *fs,
168 const struct variable *var,
172 /* Represent a factor as a string, so it can be
173 printed in a human readable fashion,
174 but sacrificing some readablility for the sake of brevity */
175 static void factor_to_string_concise (const struct factor *fctr,
176 const struct factor_statistics *fs,
182 /* Categories of missing values to exclude. */
183 static enum mv_class exclude_values;
187 static subc_list_double percentile_list;
189 static enum pc_alg percentile_algorithm;
191 static short sbc_percentile;
195 cmd_examine (struct lexer *lexer, struct dataset *ds)
197 struct casegrouper *grouper;
198 struct casereader *group;
201 subc_list_double_create (&percentile_list);
202 percentile_algorithm = PC_HAVERAGE;
204 if ( !parse_examine (lexer, ds, &cmd, NULL) )
206 subc_list_double_destroy (&percentile_list);
210 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
211 exclude_values = cmd.incl == XMN_INCLUDE ? MV_SYSTEM : MV_ANY;
213 if ( cmd.st_n == SYSMIS )
216 if ( ! cmd.sbc_cinterval)
217 cmd.n_cinterval[0] = 95.0;
219 /* If descriptives have been requested, make sure the
220 quartiles are calculated */
221 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] )
223 subc_list_double_push (&percentile_list, 25);
224 subc_list_double_push (&percentile_list, 50);
225 subc_list_double_push (&percentile_list, 75);
228 grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds));
229 while (casegrouper_get_next_group (grouper, &group))
230 run_examine (&cmd, group, ds);
231 ok = casegrouper_destroy (grouper);
232 ok = proc_commit (ds) && ok;
239 if ( dependent_vars )
240 free (dependent_vars);
243 struct factor *f = factors ;
246 struct factor *ff = f;
250 hsh_destroy ( ff->fstats ) ;
256 subc_list_double_destroy (&percentile_list);
258 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
263 /* Show all the appropriate tables */
265 output_examine (const struct dictionary *dict)
269 /* Show totals if appropriate */
270 if ( ! cmd.sbc_nototal || factors == 0 )
272 show_summary (dependent_vars, n_dependent_vars, dict, 0);
274 if ( cmd.sbc_statistics )
276 if ( cmd.a_statistics[XMN_ST_EXTREME])
277 show_extremes (dependent_vars, n_dependent_vars, 0, cmd.st_n);
279 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
280 show_descriptives (dependent_vars, n_dependent_vars, 0);
283 if ( sbc_percentile )
284 show_percentiles (dependent_vars, n_dependent_vars, 0);
289 if ( cmd.a_plot[XMN_PLT_STEMLEAF] )
290 msg (SW, _ ("%s is not currently supported."), "STEMLEAF");
292 if ( cmd.a_plot[XMN_PLT_SPREADLEVEL] )
293 msg (SW, _ ("%s is not currently supported."), "SPREADLEVEL");
295 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
297 for ( v = 0 ; v < n_dependent_vars; ++v )
298 np_plot (&totals[v], var_to_string (dependent_vars[v]));
301 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
303 if ( cmd.cmp == XMN_GROUPS )
305 box_plot_group (0, (const struct variable **) dependent_vars,
306 n_dependent_vars, cmd.v_id);
309 box_plot_variables (0,
310 (const struct variable **) dependent_vars,
311 n_dependent_vars, cmd.v_id);
314 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
316 for ( v = 0 ; v < n_dependent_vars; ++v )
318 struct normal_curve normal;
320 normal.N = totals[v].n;
321 normal.mean = totals[v].mean;
322 normal.stddev = totals[v].stddev;
324 histogram_plot (totals[v].histogram,
325 var_to_string (dependent_vars[v]),
335 /* Show grouped statistics as appropriate */
339 show_summary (dependent_vars, n_dependent_vars, dict, fctr);
341 if ( cmd.sbc_statistics )
343 if ( cmd.a_statistics[XMN_ST_EXTREME])
344 show_extremes (dependent_vars, n_dependent_vars, fctr, cmd.st_n);
346 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
347 show_descriptives (dependent_vars, n_dependent_vars, fctr);
350 if ( sbc_percentile )
351 show_percentiles (dependent_vars, n_dependent_vars, fctr);
358 struct factor_statistics **fs = fctr->fs ;
360 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
362 if ( cmd.cmp == XMN_VARIABLES )
363 box_plot_variables (fctr,
364 (const struct variable **) dependent_vars,
365 n_dependent_vars, cmd.v_id);
367 box_plot_group (fctr,
368 (const struct variable **) dependent_vars,
369 n_dependent_vars, cmd.v_id);
372 for ( v = 0 ; v < n_dependent_vars; ++v )
375 for ( fs = fctr->fs ; *fs ; ++fs )
378 ds_init_empty (&str);
379 factor_to_string (fctr, *fs, dependent_vars[v], &str);
381 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
382 np_plot (& (*fs)->m[v], ds_cstr (&str));
384 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
386 struct normal_curve normal;
388 normal.N = (*fs)->m[v].n;
389 normal.mean = (*fs)->m[v].mean;
390 normal.stddev = (*fs)->m[v].stddev;
392 histogram_plot ((*fs)->m[v].histogram,
393 ds_cstr (&str) , &normal, 0);
398 } /* for ( fs .... */
400 } /* for ( v = 0 ..... */
410 /* Create a hash table of percentiles and their values from the list of
412 static struct hsh_table *
413 list_to_ptile_hash (const subc_list_double *l)
417 struct hsh_table *h ;
419 h = hsh_create (subc_list_double_count (l),
420 (hsh_compare_func *) ptile_compare,
421 (hsh_hash_func *) ptile_hash,
422 (hsh_free_func *) free,
426 for ( i = 0 ; i < subc_list_double_count (l) ; ++i )
428 struct percentile *p = xmalloc (sizeof *p);
430 p->p = subc_list_double_at (l,i);
441 /* Parse the PERCENTILES subcommand */
443 xmn_custom_percentiles (struct lexer *lexer, struct dataset *ds UNUSED,
444 struct cmd_examine *p UNUSED, void *aux UNUSED)
448 lex_match (lexer, '=');
450 lex_match (lexer, '(');
452 while ( lex_is_number (lexer) )
454 subc_list_double_push (&percentile_list, lex_number (lexer));
458 lex_match (lexer, ',') ;
460 lex_match (lexer, ')');
462 lex_match (lexer, '=');
464 if ( lex_match_id (lexer, "HAVERAGE"))
465 percentile_algorithm = PC_HAVERAGE;
467 else if ( lex_match_id (lexer, "WAVERAGE"))
468 percentile_algorithm = PC_WAVERAGE;
470 else if ( lex_match_id (lexer, "ROUND"))
471 percentile_algorithm = PC_ROUND;
473 else if ( lex_match_id (lexer, "EMPIRICAL"))
474 percentile_algorithm = PC_EMPIRICAL;
476 else if ( lex_match_id (lexer, "AEMPIRICAL"))
477 percentile_algorithm = PC_AEMPIRICAL;
479 else if ( lex_match_id (lexer, "NONE"))
480 percentile_algorithm = PC_NONE;
483 if ( 0 == subc_list_double_count (&percentile_list))
485 subc_list_double_push (&percentile_list, 5);
486 subc_list_double_push (&percentile_list, 10);
487 subc_list_double_push (&percentile_list, 25);
488 subc_list_double_push (&percentile_list, 50);
489 subc_list_double_push (&percentile_list, 75);
490 subc_list_double_push (&percentile_list, 90);
491 subc_list_double_push (&percentile_list, 95);
497 /* TOTAL and NOTOTAL are simple, mutually exclusive flags */
499 xmn_custom_total (struct lexer *lexer UNUSED, struct dataset *ds UNUSED, struct cmd_examine *p, void *aux UNUSED)
501 if ( p->sbc_nototal )
503 msg (SE, _ ("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
511 xmn_custom_nototal (struct lexer *lexer UNUSED, struct dataset *ds UNUSED,
512 struct cmd_examine *p, void *aux UNUSED)
516 msg (SE, _ ("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
525 /* Parser for the variables sub command
526 Returns 1 on success */
528 xmn_custom_variables (struct lexer *lexer, struct dataset *ds, struct cmd_examine *cmd, void *aux UNUSED)
530 const struct dictionary *dict = dataset_dict (ds);
531 lex_match (lexer, '=');
533 if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
534 && lex_token (lexer) != T_ALL)
539 if (!parse_variables_const (lexer, dict, &dependent_vars, &n_dependent_vars,
540 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
542 free (dependent_vars);
546 assert (n_dependent_vars);
548 totals = xnmalloc (n_dependent_vars, sizeof *totals);
550 if ( lex_match (lexer, T_BY))
553 success = examine_parse_independent_vars (lexer, dict, cmd);
554 if ( success != 1 ) {
555 free (dependent_vars);
566 /* Parse the clause specifying the factors */
568 examine_parse_independent_vars (struct lexer *lexer, const struct dictionary *dict, struct cmd_examine *cmd)
571 struct factor *sf = xmalloc (sizeof *sf);
573 if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
574 && lex_token (lexer) != T_ALL)
581 sf->indep_var[0] = parse_variable (lexer, dict);
582 sf->indep_var[1] = 0;
584 if ( lex_token (lexer) == T_BY )
587 lex_match (lexer, T_BY);
589 if ( (lex_token (lexer) != T_ID || dict_lookup_var (dict, lex_tokid (lexer)) == NULL)
590 && lex_token (lexer) != T_ALL)
596 sf->indep_var[1] = parse_variable (lexer, dict);
601 sf->fstats = hsh_create (4,
602 (hsh_compare_func *) factor_statistics_compare,
603 (hsh_hash_func *) factor_statistics_hash,
604 (hsh_free_func *) factor_statistics_free,
610 lex_match (lexer, ',');
612 if ( lex_token (lexer) == '.' || lex_token (lexer) == '/' )
615 success = examine_parse_independent_vars (lexer, dict, cmd);
626 static void populate_percentiles (struct tab_table *tbl, int col, int row,
627 const struct metrics *m);
629 static void populate_descriptives (struct tab_table *t, int col, int row,
630 const struct variable *,
631 const struct metrics *fs);
633 static void populate_extremes (struct tab_table *t, int col, int row, int n,
634 const struct variable *var,
635 const struct metrics *m);
637 static void populate_summary (struct tab_table *t, int col, int row,
638 const struct dictionary *dict,
639 const struct metrics *m);
644 /* Perform calculations for the sub factors */
646 factor_calc (const struct ccase *c, int case_no, double weight,
650 struct factor *fctr = factors;
654 struct factor_statistics **foo ;
655 union value *indep_vals[2] ;
657 indep_vals[0] = value_dup (
658 case_data (c, fctr->indep_var[0]),
659 var_get_width (fctr->indep_var[0])
662 if ( fctr->indep_var[1] )
663 indep_vals[1] = value_dup (
664 case_data (c, fctr->indep_var[1]),
665 var_get_width (fctr->indep_var[1])
669 const union value sm = {SYSMIS};
670 indep_vals[1] = value_dup (&sm, 0);
673 assert (fctr->fstats);
675 foo = ( struct factor_statistics ** )
676 hsh_probe (fctr->fstats, (void *) &indep_vals);
681 *foo = create_factor_statistics (n_dependent_vars,
685 for ( v = 0 ; v < n_dependent_vars ; ++v )
687 metrics_precalc ( & (*foo)->m[v] );
693 free (indep_vals[0]);
694 free (indep_vals[1]);
697 for ( v = 0 ; v < n_dependent_vars ; ++v )
699 const struct variable *var = dependent_vars[v];
700 union value *val = value_dup (
705 if (case_missing || var_is_value_missing (var, val, exclude_values))
711 metrics_calc ( & (*foo)->m[v], val, weight, case_no);
721 run_examine (struct cmd_examine *cmd, struct casereader *input,
724 struct dictionary *dict = dataset_dict (ds);
732 if (!casereader_peek (input, 0, &c))
734 casereader_destroy (input);
737 output_split_file_values (ds, &c);
740 input = casereader_create_filter_weight (input, dict, NULL, NULL);
741 input = casereader_create_counter (input, &case_no, 0);
743 /* Make sure we haven't got rubbish left over from a
748 struct factor *next = fctr->next;
750 hsh_clear (fctr->fstats);
757 for ( v = 0 ; v < n_dependent_vars ; ++v )
758 metrics_precalc (&totals[v]);
760 for (; casereader_read (input, &c); case_destroy (&c))
762 bool case_missing = false;
763 const double weight = dict_get_case_weight (dict, &c, NULL);
765 if ( cmd->miss == XMN_LISTWISE )
767 for ( v = 0 ; v < n_dependent_vars ; ++v )
769 const struct variable *var = dependent_vars[v];
770 union value *val = value_dup (
775 if ( var_is_value_missing (var, val, exclude_values))
782 for ( v = 0 ; v < n_dependent_vars ; ++v )
784 const struct variable *var = dependent_vars[v];
785 union value *val = value_dup (
790 if ( var_is_value_missing (var, val, exclude_values)
797 metrics_calc (&totals[v], val, weight, case_no);
802 factor_calc (&c, case_no, weight, case_missing);
804 ok = casereader_destroy (input);
806 for ( v = 0 ; v < n_dependent_vars ; ++v)
811 struct hsh_iterator hi;
812 struct factor_statistics *fs;
814 for ( fs = hsh_first (fctr->fstats, &hi);
816 fs = hsh_next (fctr->fstats, &hi))
819 fs->m[v].ptile_hash = list_to_ptile_hash (&percentile_list);
820 fs->m[v].ptile_alg = percentile_algorithm;
821 metrics_postcalc (&fs->m[v]);
827 totals[v].ptile_hash = list_to_ptile_hash (&percentile_list);
828 totals[v].ptile_alg = percentile_algorithm;
829 metrics_postcalc (&totals[v]);
833 /* Make sure that the combination of factors are complete */
838 struct hsh_iterator hi;
839 struct hsh_iterator hi0;
840 struct hsh_iterator hi1;
841 struct factor_statistics *fs;
843 struct hsh_table *idh0 = NULL;
844 struct hsh_table *idh1 = NULL;
848 idh0 = hsh_create (4, (hsh_compare_func *) compare_ptr_values,
849 (hsh_hash_func *) hash_ptr_value,
852 idh1 = hsh_create (4, (hsh_compare_func *) compare_ptr_values,
853 (hsh_hash_func *) hash_ptr_value,
857 for ( fs = hsh_first (fctr->fstats, &hi);
859 fs = hsh_next (fctr->fstats, &hi))
861 hsh_insert (idh0, &fs->id[0]);
862 hsh_insert (idh1, &fs->id[1]);
865 /* Ensure that the factors combination is complete */
866 for ( val0 = hsh_first (idh0, &hi0);
868 val0 = hsh_next (idh0, &hi0))
870 for ( val1 = hsh_first (idh1, &hi1);
872 val1 = hsh_next (idh1, &hi1))
874 struct factor_statistics **ffs;
879 ffs = (struct factor_statistics **)
880 hsh_probe (fctr->fstats, &key );
884 (*ffs) = create_factor_statistics (n_dependent_vars,
886 for ( i = 0 ; i < n_dependent_vars ; ++i )
887 metrics_precalc ( & (*ffs)->m[i]);
895 fctr->fs = (struct factor_statistics **) hsh_sort_copy (fctr->fstats);
901 output_examine (dict);
907 for ( i = 0 ; i < n_dependent_vars ; ++i )
909 metrics_destroy (&totals[i]);
916 show_summary (const struct variable **dependent_var, int n_dep_var,
917 const struct dictionary *dict,
918 const struct factor *fctr)
920 static const char *subtitle[]=
928 int heading_columns ;
930 const int heading_rows = 3;
931 struct tab_table *tbl;
939 n_factors = hsh_count (fctr->fstats);
940 n_rows = n_dep_var * n_factors ;
942 if ( fctr->indep_var[1] )
951 n_rows += heading_rows;
953 n_cols = heading_columns + 6;
955 tbl = tab_create (n_cols,n_rows,0);
956 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
958 tab_dim (tbl, tab_natural_dimensions);
960 /* Outline the box */
965 n_cols - 1, n_rows - 1);
967 /* Vertical lines for the data only */
972 n_cols - 1, n_rows - 1);
975 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
976 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, 1 );
977 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, heading_rows -1 );
979 tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1);
982 tab_title (tbl, _ ("Case Processing Summary"));
984 tab_joint_text (tbl, heading_columns, 0,
986 TAB_CENTER | TAT_TITLE,
989 /* Remove lines ... */
996 for ( i = 0 ; i < 3 ; ++i )
998 tab_text (tbl, heading_columns + i * 2 , 2, TAB_CENTER | TAT_TITLE,
1001 tab_text (tbl, heading_columns + i * 2 + 1, 2, TAB_CENTER | TAT_TITLE,
1004 tab_joint_text (tbl, heading_columns + i*2 , 1,
1005 heading_columns + i * 2 + 1, 1,
1006 TAB_CENTER | TAT_TITLE,
1009 tab_box (tbl, -1, -1,
1011 heading_columns + i * 2, 1,
1012 heading_columns + i * 2 + 1, 1);
1016 /* Titles for the independent variables */
1019 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1020 var_to_string (fctr->indep_var[0]));
1022 if ( fctr->indep_var[1] )
1024 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1025 var_to_string (fctr->indep_var[1]));
1030 for ( i = 0 ; i < n_dep_var ; ++i )
1034 n_factors = hsh_count (fctr->fstats);
1037 tab_hline (tbl, TAL_1, 0, n_cols -1 , i * n_factors + heading_rows);
1040 0, i * n_factors + heading_rows,
1041 TAB_LEFT | TAT_TITLE,
1042 var_to_string (dependent_var[i])
1046 populate_summary (tbl, heading_columns,
1047 (i * n_factors) + heading_rows,
1052 struct factor_statistics **fs = fctr->fs;
1054 const union value *prev = NULL;
1059 0 != compare_values (prev, (*fs)->id[0],
1060 var_get_width (fctr->indep_var[0])))
1063 ds_init_empty (&vstr);
1064 var_append_value_name (fctr->indep_var[0],
1065 (*fs)->id[0], &vstr);
1069 (i * n_factors ) + count +
1071 TAB_LEFT | TAT_TITLE,
1077 if (fctr->indep_var[1] && count > 0 )
1078 tab_hline (tbl, TAL_1, 1, n_cols - 1,
1079 (i * n_factors ) + count + heading_rows);
1082 prev = (*fs)->id[0];
1084 if ( fctr->indep_var[1])
1087 ds_init_empty (&vstr);
1088 var_append_value_name (fctr->indep_var[1],
1089 (*fs)->id[1], &vstr);
1092 (i * n_factors ) + count +
1094 TAB_LEFT | TAT_TITLE,
1100 populate_summary (tbl, heading_columns,
1101 (i * n_factors) + count
1117 populate_summary (struct tab_table *t, int col, int row,
1118 const struct dictionary *dict,
1119 const struct metrics *m)
1122 const double total = m->n + m->n_missing ;
1124 const struct variable *wv = dict_get_weight (dict);
1125 const struct fmt_spec *wfmt = wv ? var_get_print_format (wv) : & F_8_0;
1127 tab_double (t, col + 0, row + 0, TAB_RIGHT, m->n, wfmt);
1129 tab_double (t, col + 2, row + 0, TAB_RIGHT, m->n_missing, wfmt);
1131 tab_double (t, col + 4, row + 0, TAB_RIGHT, total, wfmt);
1135 tab_text (t, col + 1, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1136 100.0 * m->n / total );
1138 tab_text (t, col + 3, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1139 100.0 * m->n_missing / total );
1141 /* This seems a bit pointless !!! */
1142 tab_text (t, col + 5, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1143 100.0 * total / total );
1150 show_extremes (const struct variable **dependent_var, int n_dep_var,
1151 const struct factor *fctr,
1155 int heading_columns ;
1157 const int heading_rows = 1;
1158 struct tab_table *tbl;
1167 heading_columns = 2;
1168 n_factors = hsh_count (fctr->fstats);
1170 n_rows = n_dep_var * 2 * n_extremities * n_factors;
1172 if ( fctr->indep_var[1] )
1173 heading_columns = 3;
1177 heading_columns = 1;
1178 n_rows = n_dep_var * 2 * n_extremities;
1181 n_rows += heading_rows;
1183 heading_columns += 2;
1184 n_cols = heading_columns + 2;
1186 tbl = tab_create (n_cols,n_rows,0);
1187 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
1189 tab_dim (tbl, tab_natural_dimensions);
1191 /* Outline the box, No internal lines*/
1196 n_cols - 1, n_rows - 1);
1198 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1200 tab_title (tbl, _ ("Extreme Values"));
1202 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows -1);
1203 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows -1);
1207 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1208 var_to_string (fctr->indep_var[0]));
1210 if ( fctr->indep_var[1] )
1211 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1212 var_to_string (fctr->indep_var[1]));
1215 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _ ("Value"));
1216 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _ ("Case Number"));
1218 for ( i = 0 ; i < n_dep_var ; ++i )
1222 tab_hline (tbl, TAL_1, 0, n_cols -1 ,
1223 i * 2 * n_extremities * n_factors + heading_rows);
1226 i * 2 * n_extremities * n_factors + heading_rows,
1227 TAB_LEFT | TAT_TITLE,
1228 var_to_string (dependent_var[i])
1233 populate_extremes (tbl, heading_columns - 2,
1234 i * 2 * n_extremities * n_factors + heading_rows,
1240 struct factor_statistics **fs = fctr->fs;
1242 const union value *prev = NULL;
1246 const int row = heading_rows + ( 2 * n_extremities ) *
1247 ( ( i * n_factors ) + count );
1250 if ( !prev || 0 != compare_values (prev, (*fs)->id[0],
1251 var_get_width (fctr->indep_var[0])))
1254 ds_init_empty (&vstr);
1255 var_append_value_name (fctr->indep_var[0],
1256 (*fs)->id[0], &vstr);
1259 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1263 TAB_LEFT | TAT_TITLE,
1270 prev = (*fs)->id[0];
1272 if (fctr->indep_var[1] && count > 0 )
1273 tab_hline (tbl, TAL_1, 2, n_cols - 1, row);
1275 if ( fctr->indep_var[1])
1278 ds_init_empty (&vstr);
1279 var_append_value_name (fctr->indep_var[1], (*fs)->id[1], &vstr);
1281 tab_text (tbl, 2, row,
1282 TAB_LEFT | TAT_TITLE,
1289 populate_extremes (tbl, heading_columns - 2,
1305 /* Fill in the extremities table */
1307 populate_extremes (struct tab_table *t,
1308 int col, int row, int n,
1309 const struct variable *var,
1310 const struct metrics *m)
1315 tab_text (t, col, row,
1316 TAB_RIGHT | TAT_TITLE ,
1320 tab_text (t, col, row + n ,
1321 TAB_RIGHT | TAT_TITLE ,
1326 tab_hline (t, TAL_1, col, col + 3, row + n );
1328 for (extremity = 0; extremity < n ; ++extremity )
1331 tab_fixed (t, col + 1, row + extremity,
1333 extremity + 1, 8, 0);
1337 tab_fixed (t, col + 1, row + extremity + n,
1339 extremity + 1, 8, 0);
1345 for (idx = 0, extremity = 0; extremity < n && idx < m->n_data ; ++idx )
1348 const struct weighted_value *wv = m->wvp[idx];
1349 struct case_node *cn = wv->case_nos;
1352 for (j = 0 ; j < wv->w ; ++j )
1354 if ( extremity + j >= n )
1357 tab_value (t, col + 3, row + extremity + j + n,
1359 &wv->v, var_get_print_format (var));
1361 tab_fixed (t, col + 2, row + extremity + j + n,
1370 extremity += wv->w ;
1375 for (idx = m->n_data - 1, extremity = 0; extremity < n && idx >= 0; --idx )
1378 const struct weighted_value *wv = m->wvp[idx];
1379 struct case_node *cn = wv->case_nos;
1381 for (j = 0 ; j < wv->w ; ++j )
1383 if ( extremity + j >= n )
1386 tab_value (t, col + 3, row + extremity + j,
1388 &wv->v, var_get_print_format (var));
1390 tab_fixed (t, col + 2, row + extremity + j,
1399 extremity += wv->w ;
1404 /* Show the descriptives table */
1406 show_descriptives (const struct variable **dependent_var,
1408 struct factor *fctr)
1411 int heading_columns ;
1413 const int n_stat_rows = 13;
1415 const int heading_rows = 1;
1417 struct tab_table *tbl;
1424 heading_columns = 4;
1425 n_factors = hsh_count (fctr->fstats);
1427 n_rows = n_dep_var * n_stat_rows * n_factors;
1429 if ( fctr->indep_var[1] )
1430 heading_columns = 5;
1434 heading_columns = 3;
1435 n_rows = n_dep_var * n_stat_rows;
1438 n_rows += heading_rows;
1440 n_cols = heading_columns + 2;
1443 tbl = tab_create (n_cols, n_rows, 0);
1445 tab_headers (tbl, heading_columns + 1, 0, heading_rows, 0);
1447 tab_dim (tbl, tab_natural_dimensions);
1449 /* Outline the box and have no internal lines*/
1454 n_cols - 1, n_rows - 1);
1456 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1458 tab_vline (tbl, TAL_1, heading_columns, 0, n_rows - 1);
1459 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows - 1);
1460 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows - 1);
1462 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _ ("Statistic"));
1463 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _ ("Std. Error"));
1465 tab_title (tbl, _ ("Descriptives"));
1468 for ( i = 0 ; i < n_dep_var ; ++i )
1470 const int row = heading_rows + i * n_stat_rows * n_factors ;
1473 tab_hline (tbl, TAL_1, 0, n_cols - 1, row );
1476 i * n_stat_rows * n_factors + heading_rows,
1477 TAB_LEFT | TAT_TITLE,
1478 var_to_string (dependent_var[i])
1484 const union value *prev = NULL;
1486 struct factor_statistics **fs = fctr->fs;
1489 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1490 var_to_string (fctr->indep_var[0]));
1493 if ( fctr->indep_var[1])
1494 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1495 var_to_string (fctr->indep_var[1]));
1499 const int row = heading_rows + n_stat_rows *
1500 ( ( i * n_factors ) + count );
1503 if ( !prev || 0 != compare_values (prev, (*fs)->id[0],
1504 var_get_width (fctr->indep_var[0])))
1507 ds_init_empty (&vstr);
1508 var_append_value_name (fctr->indep_var[0],
1509 (*fs)->id[0], &vstr);
1512 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1516 TAB_LEFT | TAT_TITLE,
1523 prev = (*fs)->id[0];
1525 if (fctr->indep_var[1] && count > 0 )
1526 tab_hline (tbl, TAL_1, 2, n_cols - 1, row);
1528 if ( fctr->indep_var[1])
1531 ds_init_empty (&vstr);
1532 var_append_value_name (fctr->indep_var[1], (*fs)->id[1], &vstr);
1534 tab_text (tbl, 2, row,
1535 TAB_LEFT | TAT_TITLE,
1542 populate_descriptives (tbl, heading_columns - 2,
1556 populate_descriptives (tbl, heading_columns - 2,
1557 i * n_stat_rows * n_factors + heading_rows,
1567 /* Fill in the descriptives data */
1569 populate_descriptives (struct tab_table *tbl, int col, int row,
1570 const struct variable *var,
1571 const struct metrics *m)
1573 const double t = gsl_cdf_tdist_Qinv ((1 - cmd.n_cinterval[0] / 100.0)/2.0,
1578 TAB_LEFT | TAT_TITLE,
1581 tab_double (tbl, col + 2,
1587 tab_double (tbl, col + 3,
1596 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1597 _ ("%g%% Confidence Interval for Mean"), cmd.n_cinterval[0]);
1600 tab_text (tbl, col + 1,
1602 TAB_LEFT | TAT_TITLE,
1605 tab_double (tbl, col + 2,
1608 m->mean - t * m->se_mean,
1611 tab_text (tbl, col + 1,
1613 TAB_LEFT | TAT_TITLE,
1617 tab_double (tbl, col + 2,
1620 m->mean + t * m->se_mean,
1625 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1626 _ ("5%% Trimmed Mean"));
1628 tab_double (tbl, col + 2,
1636 TAB_LEFT | TAT_TITLE,
1640 struct percentile *p;
1643 p = hsh_find (m->ptile_hash, &d);
1648 tab_double (tbl, col + 2,
1658 TAB_LEFT | TAT_TITLE,
1661 tab_double (tbl, col + 2,
1670 TAB_LEFT | TAT_TITLE,
1671 _ ("Std. Deviation"));
1674 tab_double (tbl, col + 2,
1683 TAB_LEFT | TAT_TITLE,
1686 tab_double (tbl, col + 2,
1689 m->min, var_get_print_format (var));
1693 TAB_LEFT | TAT_TITLE,
1696 tab_double (tbl, col + 2,
1699 m->max, var_get_print_format (var));
1703 TAB_LEFT | TAT_TITLE,
1707 tab_double (tbl, col + 2,
1715 TAB_LEFT | TAT_TITLE,
1716 _ ("Interquartile Range"));
1719 struct percentile *p1;
1720 struct percentile *p2;
1723 p1 = hsh_find (m->ptile_hash, &d);
1726 p2 = hsh_find (m->ptile_hash, &d);
1731 tab_double (tbl, col + 2,
1740 TAB_LEFT | TAT_TITLE,
1744 tab_double (tbl, col + 2,
1750 /* stderr of skewness */
1751 tab_double (tbl, col + 3,
1759 TAB_LEFT | TAT_TITLE,
1763 tab_double (tbl, col + 2,
1769 /* stderr of kurtosis */
1770 tab_double (tbl, col + 3,
1780 box_plot_variables (const struct factor *fctr,
1781 const struct variable **vars, int n_vars,
1782 const struct variable *id)
1786 struct factor_statistics **fs ;
1790 box_plot_group (fctr, vars, n_vars, id);
1794 for ( fs = fctr->fs ; *fs ; ++fs )
1797 double y_min = DBL_MAX;
1798 double y_max = -DBL_MAX;
1799 struct chart *ch = chart_create ();
1800 ds_init_empty (&str);
1801 factor_to_string (fctr, *fs, 0, &str );
1803 chart_write_title (ch, "%s", ds_cstr (&str));
1805 for ( i = 0 ; i < n_vars ; ++i )
1807 y_max = MAX (y_max, (*fs)->m[i].max);
1808 y_min = MIN (y_min, (*fs)->m[i].min);
1811 boxplot_draw_yscale (ch, y_max, y_min);
1813 for ( i = 0 ; i < n_vars ; ++i )
1816 const double box_width = (ch->data_right - ch->data_left)
1819 const double box_centre = ( i * 2 + 1) * box_width
1822 boxplot_draw_boxplot (ch,
1823 box_centre, box_width,
1825 var_to_string (vars[i]));
1837 /* Do a box plot, grouping all factors into one plot ;
1838 each dependent variable has its own plot.
1841 box_plot_group (const struct factor *fctr,
1842 const struct variable **vars,
1844 const struct variable *id UNUSED)
1849 for ( i = 0 ; i < n_vars ; ++i )
1851 struct factor_statistics **fs ;
1854 ch = chart_create ();
1856 boxplot_draw_yscale (ch, totals[i].max, totals[i].min);
1862 for ( fs = fctr->fs ; *fs ; ++fs )
1865 chart_write_title (ch, _ ("Boxplot of %s vs. %s"),
1866 var_to_string (vars[i]), var_to_string (fctr->indep_var[0]) );
1868 for ( fs = fctr->fs ; *fs ; ++fs )
1871 const double box_width = (ch->data_right - ch->data_left)
1872 / (n_factors * 2.0 ) ;
1874 const double box_centre = ( f++ * 2 + 1) * box_width
1877 ds_init_empty (&str);
1878 factor_to_string_concise (fctr, *fs, &str);
1880 boxplot_draw_boxplot (ch,
1881 box_centre, box_width,
1889 const double box_width = (ch->data_right - ch->data_left) / 3.0;
1890 const double box_centre = (ch->data_right + ch->data_left) / 2.0;
1892 chart_write_title (ch, _ ("Boxplot"));
1894 boxplot_draw_boxplot (ch,
1895 box_centre, box_width,
1897 var_to_string (vars[i]) );
1906 /* Plot the normal and detrended normal plots for m
1907 Label the plots with factorname */
1909 np_plot (const struct metrics *m, const char *factorname)
1912 double yfirst=0, ylast=0;
1915 struct chart *np_chart;
1917 /* Detrended Normal Plot */
1918 struct chart *dnp_chart;
1920 /* The slope and intercept of the ideal normal probability line */
1921 const double slope = 1.0 / m->stddev;
1922 const double intercept = - m->mean / m->stddev;
1924 /* Cowardly refuse to plot an empty data set */
1925 if ( m->n_data == 0 )
1928 np_chart = chart_create ();
1929 dnp_chart = chart_create ();
1931 if ( !np_chart || ! dnp_chart )
1934 chart_write_title (np_chart, _ ("Normal Q-Q Plot of %s"), factorname);
1935 chart_write_xlabel (np_chart, _ ("Observed Value"));
1936 chart_write_ylabel (np_chart, _ ("Expected Normal"));
1939 chart_write_title (dnp_chart, _ ("Detrended Normal Q-Q Plot of %s"),
1941 chart_write_xlabel (dnp_chart, _ ("Observed Value"));
1942 chart_write_ylabel (dnp_chart, _ ("Dev from Normal"));
1944 yfirst = gsl_cdf_ugaussian_Pinv (m->wvp[0]->rank / ( m->n + 1));
1945 ylast = gsl_cdf_ugaussian_Pinv (m->wvp[m->n_data-1]->rank / ( m->n + 1));
1949 /* Need to make sure that both the scatter plot and the ideal fit into the
1951 double x_lower = MIN (m->min, (yfirst - intercept) / slope) ;
1952 double x_upper = MAX (m->max, (ylast - intercept) / slope) ;
1953 double slack = (x_upper - x_lower) * 0.05 ;
1955 chart_write_xscale (np_chart, x_lower - slack, x_upper + slack, 5);
1957 chart_write_xscale (dnp_chart, m->min, m->max, 5);
1961 chart_write_yscale (np_chart, yfirst, ylast, 5);
1964 /* We have to cache the detrended data, beacause we need to
1965 find its limits before we can plot it */
1966 double *d_data = xnmalloc (m->n_data, sizeof *d_data);
1967 double d_max = -DBL_MAX;
1968 double d_min = DBL_MAX;
1969 for ( i = 0 ; i < m->n_data; ++i )
1971 const double ns = gsl_cdf_ugaussian_Pinv (m->wvp[i]->rank / ( m->n + 1));
1973 chart_datum (np_chart, 0, m->wvp[i]->v.f, ns);
1975 d_data[i] = (m->wvp[i]->v.f - m->mean) / m->stddev - ns;
1977 if ( d_data[i] < d_min ) d_min = d_data[i];
1978 if ( d_data[i] > d_max ) d_max = d_data[i];
1980 chart_write_yscale (dnp_chart, d_min, d_max, 5);
1982 for ( i = 0 ; i < m->n_data; ++i )
1983 chart_datum (dnp_chart, 0, m->wvp[i]->v.f, d_data[i]);
1988 chart_line (np_chart, slope, intercept, yfirst, ylast , CHART_DIM_Y);
1989 chart_line (dnp_chart, 0, 0, m->min, m->max , CHART_DIM_X);
1991 chart_submit (np_chart);
1992 chart_submit (dnp_chart);
1998 /* Show the percentiles */
2000 show_percentiles (const struct variable **dependent_var,
2002 struct factor *fctr)
2004 struct tab_table *tbl;
2010 struct hsh_table *ptiles ;
2012 int n_heading_columns;
2013 const int n_heading_rows = 2;
2014 const int n_stat_rows = 2;
2020 struct factor_statistics **fs = fctr->fs ;
2021 n_heading_columns = 3;
2022 n_factors = hsh_count (fctr->fstats);
2024 ptiles = (*fs)->m[0].ptile_hash;
2026 if ( fctr->indep_var[1] )
2027 n_heading_columns = 4;
2032 n_heading_columns = 2;
2034 ptiles = totals[0].ptile_hash;
2037 n_ptiles = hsh_count (ptiles);
2039 n_rows = n_heading_rows + n_dep_var * n_stat_rows * n_factors;
2041 n_cols = n_heading_columns + n_ptiles ;
2043 tbl = tab_create (n_cols, n_rows, 0);
2045 tab_headers (tbl, n_heading_columns + 1, 0, n_heading_rows, 0);
2047 tab_dim (tbl, tab_natural_dimensions);
2049 /* Outline the box and have no internal lines*/
2054 n_cols - 1, n_rows - 1);
2056 tab_hline (tbl, TAL_2, 0, n_cols - 1, n_heading_rows );
2058 tab_vline (tbl, TAL_2, n_heading_columns, 0, n_rows - 1);
2061 tab_title (tbl, _ ("Percentiles"));
2064 tab_hline (tbl, TAL_1, n_heading_columns, n_cols - 1, 1 );
2071 n_heading_columns - 1, n_rows - 1);
2077 n_heading_columns, n_heading_rows - 1,
2078 n_cols - 1, n_rows - 1);
2080 tab_joint_text (tbl, n_heading_columns + 1, 0,
2082 TAB_CENTER | TAT_TITLE ,
2087 /* Put in the percentile break points as headings */
2089 struct percentile **p = (struct percentile **) hsh_sort (ptiles);
2094 tab_fixed (tbl, n_heading_columns + i++ , 1,
2103 for ( i = 0 ; i < n_dep_var ; ++i )
2105 const int n_stat_rows = 2;
2106 const int row = n_heading_rows + i * n_stat_rows * n_factors ;
2109 tab_hline (tbl, TAL_1, 0, n_cols - 1, row );
2112 i * n_stat_rows * n_factors + n_heading_rows,
2113 TAB_LEFT | TAT_TITLE,
2114 var_to_string (dependent_var[i])
2119 const union value *prev = NULL ;
2120 struct factor_statistics **fs = fctr->fs;
2123 tab_text (tbl, 1, n_heading_rows - 1,
2124 TAB_CENTER | TAT_TITLE,
2125 var_to_string (fctr->indep_var[0]));
2128 if ( fctr->indep_var[1])
2129 tab_text (tbl, 2, n_heading_rows - 1, TAB_CENTER | TAT_TITLE,
2130 var_to_string (fctr->indep_var[1]));
2134 const int row = n_heading_rows + n_stat_rows *
2135 ( ( i * n_factors ) + count );
2138 if ( !prev || 0 != compare_values (prev, (*fs)->id[0],
2139 var_get_width (fctr->indep_var[0])))
2142 ds_init_empty (&vstr);
2143 var_append_value_name (fctr->indep_var[0],
2144 (*fs)->id[0], &vstr);
2148 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
2152 TAB_LEFT | TAT_TITLE,
2159 prev = (*fs)->id[0];
2161 if (fctr->indep_var[1] && count > 0 )
2162 tab_hline (tbl, TAL_1, 2, n_cols - 1, row);
2164 if ( fctr->indep_var[1])
2167 ds_init_empty (&vstr);
2168 var_append_value_name (fctr->indep_var[1], (*fs)->id[1], &vstr);
2170 tab_text (tbl, 2, row,
2171 TAB_LEFT | TAT_TITLE,
2179 populate_percentiles (tbl, n_heading_columns - 1,
2192 populate_percentiles (tbl, n_heading_columns - 1,
2193 i * n_stat_rows * n_factors + n_heading_rows,
2206 populate_percentiles (struct tab_table *tbl, int col, int row,
2207 const struct metrics *m)
2211 struct percentile **p = (struct percentile **) hsh_sort (m->ptile_hash);
2215 TAB_LEFT | TAT_TITLE,
2216 _ ("Tukey\'s Hinges")
2221 TAB_LEFT | TAT_TITLE,
2222 ptile_alg_desc[m->ptile_alg]
2229 tab_double (tbl, col + i + 1 , row,
2234 if ( (*p)->p == 25 )
2235 tab_double (tbl, col + i + 1 , row + 1,
2240 if ( (*p)->p == 50 )
2241 tab_double (tbl, col + i + 1 , row + 1,
2247 if ( (*p)->p == 75 )
2248 tab_double (tbl, col + i + 1 , row + 1,
2258 factor_to_string (const struct factor *fctr,
2259 const struct factor_statistics *fs,
2260 const struct variable *var,
2265 ds_put_format (str, "%s (",var_to_string (var) );
2268 ds_put_format (str, "%s = ",
2269 var_to_string (fctr->indep_var[0]));
2271 var_append_value_name (fctr->indep_var[0], fs->id[0], str);
2273 if ( fctr->indep_var[1] )
2275 ds_put_format (str, "; %s = )",
2276 var_to_string (fctr->indep_var[1]));
2278 var_append_value_name (fctr->indep_var[1], fs->id[1], str);
2283 ds_put_cstr (str, ")");
2289 factor_to_string_concise (const struct factor *fctr,
2290 const struct factor_statistics *fs,
2295 var_append_value_name (fctr->indep_var[0], fs->id[0], str);
2297 if ( fctr->indep_var[1] )
2299 ds_put_cstr (str, ",");
2301 var_append_value_name (fctr->indep_var[1],fs->id[1], str);
2303 ds_put_cstr (str, ")");