1 /* PSPP - EXAMINE data for normality . -*-c-*-
3 Copyright (C) 2004 Free Software Foundation, Inc.
4 Author: John Darrington 2004
6 This program is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
23 #include <libpspp/message.h>
27 #include <libpspp/alloc.h>
28 #include <libpspp/str.h>
29 #include <data/case.h>
30 #include <data/dictionary.h>
31 #include <language/command.h>
32 #include <libpspp/compiler.h>
33 #include <language/lexer/lexer.h>
34 #include <libpspp/message.h>
35 #include <libpspp/magic.h>
36 #include <libpspp/misc.h>
37 #include <output/table.h>
38 #include <output/manager.h>
39 #include <data/value-labels.h>
40 #include <data/variable.h>
41 #include <procedure.h>
42 #include <libpspp/hash.h>
43 #include <data/casefile.h>
44 #include <math/factor-stats.h>
45 #include <math/moments.h>
46 #include <math/percentiles.h>
47 #include <output/charts/box-whisker.h>
48 #include <output/charts/cartesian.h>
51 #define _(msgid) gettext (msgid)
52 #define N_(msgid) msgid
55 #include <output/chart.h>
56 #include <output/charts/plot-hist.h>
57 #include <output/charts/plot-chart.h>
64 +missing=miss:pairwise/!listwise,
66 incl:include/!exclude;
67 +compare=cmp:variables/!groups;
70 +plot[plt_]=stemleaf,boxplot,npplot,:spreadlevel(*d:n),histogram,all,none;
72 +statistics[st_]=descriptives,:extreme(*d:n),all,none.
81 static struct cmd_examine cmd;
83 static struct variable **dependent_vars;
85 static size_t n_dependent_vars;
90 /* The independent variable */
91 struct variable *indep_var[2];
94 /* Hash table of factor stats indexed by 2 values */
95 struct hsh_table *fstats;
97 /* The hash table after it has been crunched */
98 struct factor_statistics **fs;
104 /* Linked list of factors */
105 static struct factor *factors=0;
107 static struct metrics *totals=0;
109 /* Parse the clause specifying the factors */
110 static int examine_parse_independent_vars(struct cmd_examine *cmd);
114 /* Output functions */
115 static void show_summary(struct variable **dependent_var, int n_dep_var,
116 const struct factor *f);
118 static void show_extremes(struct variable **dependent_var,
120 const struct factor *factor,
123 static void show_descriptives(struct variable **dependent_var,
125 struct factor *factor);
127 static void show_percentiles(struct variable **dependent_var,
129 struct factor *factor);
134 void np_plot(const struct metrics *m, const char *factorname);
137 void box_plot_group(const struct factor *fctr,
138 const struct variable **vars, int n_vars,
139 const struct variable *id
143 void box_plot_variables(const struct factor *fctr,
144 const struct variable **vars, int n_vars,
145 const struct variable *id
150 /* Per Split function */
151 static bool run_examine(const struct casefile *cf, void *cmd_);
153 static void output_examine(void);
156 void factor_calc(struct ccase *c, int case_no,
157 double weight, int case_missing);
160 /* Represent a factor as a string, so it can be
161 printed in a human readable fashion */
162 const char * factor_to_string(const struct factor *fctr,
163 struct factor_statistics *fs,
164 const struct variable *var);
167 /* Represent a factor as a string, so it can be
168 printed in a human readable fashion,
169 but sacrificing some readablility for the sake of brevity */
170 const char *factor_to_string_concise(const struct factor *fctr,
171 struct factor_statistics *fs);
176 /* Function to use for testing for missing values */
177 static is_missing_func *value_is_missing;
182 static subc_list_double percentile_list;
184 static enum pc_alg percentile_algorithm;
186 static short sbc_percentile;
194 subc_list_double_create(&percentile_list);
195 percentile_algorithm = PC_HAVERAGE;
197 if ( !parse_examine(&cmd) )
200 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
201 if (cmd.incl == XMN_INCLUDE )
202 value_is_missing = mv_is_value_system_missing;
204 value_is_missing = mv_is_value_missing;
206 if ( cmd.st_n == SYSMIS )
209 if ( ! cmd.sbc_cinterval)
210 cmd.n_cinterval[0] = 95.0;
212 /* If descriptives have been requested, make sure the
213 quartiles are calculated */
214 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] )
216 subc_list_double_push(&percentile_list, 25);
217 subc_list_double_push(&percentile_list, 50);
218 subc_list_double_push(&percentile_list, 75);
221 ok = multipass_procedure_with_splits (run_examine, &cmd);
228 if ( dependent_vars )
229 free (dependent_vars);
232 struct factor *f = factors ;
235 struct factor *ff = f;
239 hsh_destroy ( ff->fstats ) ;
245 subc_list_double_destroy(&percentile_list);
247 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
252 /* Show all the appropriate tables */
258 /* Show totals if appropriate */
259 if ( ! cmd.sbc_nototal || factors == 0 )
261 show_summary(dependent_vars, n_dependent_vars, 0);
263 if ( cmd.sbc_statistics )
265 if ( cmd.a_statistics[XMN_ST_EXTREME])
266 show_extremes(dependent_vars, n_dependent_vars, 0, cmd.st_n);
268 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
269 show_descriptives(dependent_vars, n_dependent_vars, 0);
272 if ( sbc_percentile )
273 show_percentiles(dependent_vars, n_dependent_vars, 0);
278 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
280 for ( v = 0 ; v < n_dependent_vars; ++v )
281 np_plot(&totals[v], var_to_string(dependent_vars[v]));
284 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
286 if ( cmd.cmp == XMN_GROUPS )
288 box_plot_group (0, (const struct variable **) dependent_vars,
289 n_dependent_vars, cmd.v_id);
292 box_plot_variables (0,
293 (const struct variable **) dependent_vars,
294 n_dependent_vars, cmd.v_id);
297 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
299 for ( v = 0 ; v < n_dependent_vars; ++v )
301 struct normal_curve normal;
303 normal.N = totals[v].n;
304 normal.mean = totals[v].mean;
305 normal.stddev = totals[v].stddev;
307 histogram_plot(totals[v].histogram,
308 var_to_string(dependent_vars[v]),
318 /* Show grouped statistics as appropriate */
322 show_summary(dependent_vars, n_dependent_vars, fctr);
324 if ( cmd.sbc_statistics )
326 if ( cmd.a_statistics[XMN_ST_EXTREME])
327 show_extremes(dependent_vars, n_dependent_vars, fctr, cmd.st_n);
329 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
330 show_descriptives(dependent_vars, n_dependent_vars, fctr);
333 if ( sbc_percentile )
334 show_percentiles(dependent_vars, n_dependent_vars, fctr);
341 struct factor_statistics **fs = fctr->fs ;
343 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
345 if ( cmd.cmp == XMN_VARIABLES )
346 box_plot_variables (fctr,
347 (const struct variable **) dependent_vars,
348 n_dependent_vars, cmd.v_id);
350 box_plot_group (fctr,
351 (const struct variable **) dependent_vars,
352 n_dependent_vars, cmd.v_id);
355 for ( v = 0 ; v < n_dependent_vars; ++v )
358 for ( fs = fctr->fs ; *fs ; ++fs )
360 const char *s = factor_to_string(fctr, *fs, dependent_vars[v]);
362 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
363 np_plot(&(*fs)->m[v], s);
365 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
367 struct normal_curve normal;
369 normal.N = (*fs)->m[v].n;
370 normal.mean = (*fs)->m[v].mean;
371 normal.stddev = (*fs)->m[v].stddev;
373 histogram_plot((*fs)->m[v].histogram,
377 } /* for ( fs .... */
379 } /* for ( v = 0 ..... */
389 /* Create a hash table of percentiles and their values from the list of
391 static struct hsh_table *
392 list_to_ptile_hash(const subc_list_double *l)
396 struct hsh_table *h ;
398 h = hsh_create(subc_list_double_count(l),
399 (hsh_compare_func *) ptile_compare,
400 (hsh_hash_func *) ptile_hash,
401 (hsh_free_func *) free,
405 for ( i = 0 ; i < subc_list_double_count(l) ; ++i )
407 struct percentile *p = xmalloc (sizeof *p);
409 p->p = subc_list_double_at(l,i);
420 /* Parse the PERCENTILES subcommand */
422 xmn_custom_percentiles(struct cmd_examine *p UNUSED)
430 while ( lex_is_number() )
432 subc_list_double_push(&percentile_list,lex_number());
442 if ( lex_match_id("HAVERAGE"))
443 percentile_algorithm = PC_HAVERAGE;
445 else if ( lex_match_id("WAVERAGE"))
446 percentile_algorithm = PC_WAVERAGE;
448 else if ( lex_match_id("ROUND"))
449 percentile_algorithm = PC_ROUND;
451 else if ( lex_match_id("EMPIRICAL"))
452 percentile_algorithm = PC_EMPIRICAL;
454 else if ( lex_match_id("AEMPIRICAL"))
455 percentile_algorithm = PC_AEMPIRICAL;
457 else if ( lex_match_id("NONE"))
458 percentile_algorithm = PC_NONE;
461 if ( 0 == subc_list_double_count(&percentile_list))
463 subc_list_double_push(&percentile_list, 5);
464 subc_list_double_push(&percentile_list, 10);
465 subc_list_double_push(&percentile_list, 25);
466 subc_list_double_push(&percentile_list, 50);
467 subc_list_double_push(&percentile_list, 75);
468 subc_list_double_push(&percentile_list, 90);
469 subc_list_double_push(&percentile_list, 95);
475 /* TOTAL and NOTOTAL are simple, mutually exclusive flags */
477 xmn_custom_total(struct cmd_examine *p)
479 if ( p->sbc_nototal )
481 msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
489 xmn_custom_nototal(struct cmd_examine *p)
493 msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
502 /* Parser for the variables sub command
503 Returns 1 on success */
505 xmn_custom_variables(struct cmd_examine *cmd )
509 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
515 if (!parse_variables (default_dict, &dependent_vars, &n_dependent_vars,
516 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
518 free (dependent_vars);
522 assert(n_dependent_vars);
524 totals = xnmalloc (n_dependent_vars, sizeof *totals);
526 if ( lex_match(T_BY))
529 success = examine_parse_independent_vars(cmd);
530 if ( success != 1 ) {
531 free (dependent_vars);
542 /* Parse the clause specifying the factors */
544 examine_parse_independent_vars(struct cmd_examine *cmd)
547 struct factor *sf = xmalloc (sizeof *sf);
549 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
557 sf->indep_var[0] = parse_variable();
558 sf->indep_var[1] = 0;
565 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
572 sf->indep_var[1] = parse_variable();
577 sf->fstats = hsh_create(4,
578 (hsh_compare_func *) factor_statistics_compare,
579 (hsh_hash_func *) factor_statistics_hash,
580 (hsh_free_func *) factor_statistics_free,
588 if ( token == '.' || token == '/' )
591 success = examine_parse_independent_vars(cmd);
602 void populate_percentiles(struct tab_table *tbl, int col, int row,
603 const struct metrics *m);
605 void populate_descriptives(struct tab_table *t, int col, int row,
606 const struct metrics *fs);
608 void populate_extremes(struct tab_table *t, int col, int row, int n,
609 const struct metrics *m);
611 void populate_summary(struct tab_table *t, int col, int row,
612 const struct metrics *m);
617 static int bad_weight_warn = 1;
620 /* Perform calculations for the sub factors */
622 factor_calc(struct ccase *c, int case_no, double weight, int case_missing)
625 struct factor *fctr = factors;
629 struct factor_statistics **foo ;
630 union value indep_vals[2] ;
632 indep_vals[0] = * case_data(c, fctr->indep_var[0]->fv);
634 if ( fctr->indep_var[1] )
635 indep_vals[1] = * case_data(c, fctr->indep_var[1]->fv);
637 indep_vals[1].f = SYSMIS;
639 assert(fctr->fstats);
641 foo = ( struct factor_statistics ** )
642 hsh_probe(fctr->fstats, (void *) &indep_vals);
647 *foo = create_factor_statistics(n_dependent_vars,
651 for ( v = 0 ; v < n_dependent_vars ; ++v )
653 metrics_precalc( &(*foo)->m[v] );
658 for ( v = 0 ; v < n_dependent_vars ; ++v )
660 const struct variable *var = dependent_vars[v];
661 const union value *val = case_data (c, var->fv);
663 if ( value_is_missing (&var->miss, val) || case_missing )
666 metrics_calc( &(*foo)->m[v], val, weight, case_no);
677 run_examine(const struct casefile *cf, void *cmd_ )
679 struct casereader *r;
683 const struct cmd_examine *cmd = (struct cmd_examine *) cmd_;
685 /* Make sure we haven't got rubbish left over from a
687 struct factor *fctr = factors;
690 struct factor *next = fctr->next;
692 hsh_clear(fctr->fstats);
701 for ( v = 0 ; v < n_dependent_vars ; ++v )
702 metrics_precalc(&totals[v]);
704 for(r = casefile_get_reader (cf);
705 casereader_read (r, &c) ;
709 const int case_no = casereader_cnum(r);
711 const double weight =
712 dict_get_case_weight(default_dict, &c, &bad_weight_warn);
714 if ( cmd->miss == XMN_LISTWISE )
716 for ( v = 0 ; v < n_dependent_vars ; ++v )
718 const struct variable *var = dependent_vars[v];
719 const union value *val = case_data (&c, var->fv);
721 if ( value_is_missing(&var->miss, val))
727 for ( v = 0 ; v < n_dependent_vars ; ++v )
729 const struct variable *var = dependent_vars[v];
730 const union value *val = case_data (&c, var->fv);
732 if ( value_is_missing(&var->miss, val) || case_missing )
735 metrics_calc(&totals[v], val, weight, case_no);
739 factor_calc(&c, case_no, weight, case_missing);
744 for ( v = 0 ; v < n_dependent_vars ; ++v)
749 struct hsh_iterator hi;
750 struct factor_statistics *fs;
752 for ( fs = hsh_first(fctr->fstats, &hi);
754 fs = hsh_next(fctr->fstats, &hi))
757 fs->m[v].ptile_hash = list_to_ptile_hash(&percentile_list);
758 fs->m[v].ptile_alg = percentile_algorithm;
759 metrics_postcalc(&fs->m[v]);
765 totals[v].ptile_hash = list_to_ptile_hash(&percentile_list);
766 totals[v].ptile_alg = percentile_algorithm;
767 metrics_postcalc(&totals[v]);
771 /* Make sure that the combination of factors are complete */
776 struct hsh_iterator hi;
777 struct hsh_iterator hi0;
778 struct hsh_iterator hi1;
779 struct factor_statistics *fs;
781 struct hsh_table *idh0=0;
782 struct hsh_table *idh1=0;
786 idh0 = hsh_create(4, (hsh_compare_func *) compare_values,
787 (hsh_hash_func *) hash_value,
790 idh1 = hsh_create(4, (hsh_compare_func *) compare_values,
791 (hsh_hash_func *) hash_value,
795 for ( fs = hsh_first(fctr->fstats, &hi);
797 fs = hsh_next(fctr->fstats, &hi))
799 hsh_insert(idh0,(void *) &fs->id[0]);
800 hsh_insert(idh1,(void *) &fs->id[1]);
803 /* Ensure that the factors combination is complete */
804 for ( val0 = hsh_first(idh0, &hi0);
806 val0 = hsh_next(idh0, &hi0))
808 for ( val1 = hsh_first(idh1, &hi1);
810 val1 = hsh_next(idh1, &hi1))
812 struct factor_statistics **ffs;
817 ffs = (struct factor_statistics **)
818 hsh_probe(fctr->fstats, (void *) &key );
822 (*ffs) = create_factor_statistics (n_dependent_vars,
824 for ( i = 0 ; i < n_dependent_vars ; ++i )
825 metrics_precalc( &(*ffs)->m[i]);
833 fctr->fs = (struct factor_statistics **) hsh_sort_copy(fctr->fstats);
844 for ( i = 0 ; i < n_dependent_vars ; ++i )
846 metrics_destroy(&totals[i]);
855 show_summary(struct variable **dependent_var, int n_dep_var,
856 const struct factor *fctr)
858 static const char *subtitle[]=
866 int heading_columns ;
868 const int heading_rows = 3;
869 struct tab_table *tbl;
877 n_factors = hsh_count(fctr->fstats);
878 n_rows = n_dep_var * n_factors ;
880 if ( fctr->indep_var[1] )
889 n_rows += heading_rows;
891 n_cols = heading_columns + 6;
893 tbl = tab_create (n_cols,n_rows,0);
894 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
896 tab_dim (tbl, tab_natural_dimensions);
898 /* Outline the box */
903 n_cols - 1, n_rows - 1);
905 /* Vertical lines for the data only */
910 n_cols - 1, n_rows - 1);
913 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
914 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, 1 );
915 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, heading_rows -1 );
917 tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1);
920 tab_title (tbl, _("Case Processing Summary"));
923 tab_joint_text(tbl, heading_columns, 0,
925 TAB_CENTER | TAT_TITLE,
928 /* Remove lines ... */
935 for ( i = 0 ; i < 3 ; ++i )
937 tab_text (tbl, heading_columns + i*2 , 2, TAB_CENTER | TAT_TITLE,
940 tab_text (tbl, heading_columns + i*2 + 1, 2, TAB_CENTER | TAT_TITLE,
943 tab_joint_text(tbl, heading_columns + i*2 , 1,
944 heading_columns + i*2 + 1, 1,
945 TAB_CENTER | TAT_TITLE,
948 tab_box (tbl, -1, -1,
950 heading_columns + i*2, 1,
951 heading_columns + i*2 + 1, 1);
956 /* Titles for the independent variables */
959 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
960 var_to_string(fctr->indep_var[0]));
962 if ( fctr->indep_var[1] )
964 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
965 var_to_string(fctr->indep_var[1]));
971 for ( i = 0 ; i < n_dep_var ; ++i )
975 n_factors = hsh_count(fctr->fstats);
979 tab_hline(tbl, TAL_1, 0, n_cols -1 , i * n_factors + heading_rows);
982 0, i * n_factors + heading_rows,
983 TAB_LEFT | TAT_TITLE,
984 var_to_string(dependent_var[i])
989 populate_summary(tbl, heading_columns,
990 (i * n_factors) + heading_rows,
996 struct factor_statistics **fs = fctr->fs;
1001 static union value prev;
1003 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1004 fctr->indep_var[0]->width))
1008 (i * n_factors ) + count +
1010 TAB_LEFT | TAT_TITLE,
1011 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1014 if (fctr->indep_var[1] && count > 0 )
1015 tab_hline(tbl, TAL_1, 1, n_cols - 1,
1016 (i * n_factors ) + count + heading_rows);
1020 prev = (*fs)->id[0];
1023 if ( fctr->indep_var[1])
1026 (i * n_factors ) + count +
1028 TAB_LEFT | TAT_TITLE,
1029 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1032 populate_summary(tbl, heading_columns,
1033 (i * n_factors) + count
1048 populate_summary(struct tab_table *t, int col, int row,
1049 const struct metrics *m)
1052 const double total = m->n + m->n_missing ;
1054 tab_float(t, col + 0, row + 0, TAB_RIGHT, m->n, 8, 0);
1055 tab_float(t, col + 2, row + 0, TAB_RIGHT, m->n_missing, 8, 0);
1056 tab_float(t, col + 4, row + 0, TAB_RIGHT, total, 8, 0);
1060 tab_text (t, col + 1, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1061 100.0 * m->n / total );
1063 tab_text (t, col + 3, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1064 100.0 * m->n_missing / total );
1066 /* This seems a bit pointless !!! */
1067 tab_text (t, col + 5, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1068 100.0 * total / total );
1079 show_extremes(struct variable **dependent_var, int n_dep_var,
1080 const struct factor *fctr, int n_extremities)
1083 int heading_columns ;
1085 const int heading_rows = 1;
1086 struct tab_table *tbl;
1093 heading_columns = 2;
1094 n_factors = hsh_count(fctr->fstats);
1096 n_rows = n_dep_var * 2 * n_extremities * n_factors;
1098 if ( fctr->indep_var[1] )
1099 heading_columns = 3;
1103 heading_columns = 1;
1104 n_rows = n_dep_var * 2 * n_extremities;
1107 n_rows += heading_rows;
1109 heading_columns += 2;
1110 n_cols = heading_columns + 2;
1112 tbl = tab_create (n_cols,n_rows,0);
1113 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
1115 tab_dim (tbl, tab_natural_dimensions);
1117 /* Outline the box, No internal lines*/
1122 n_cols - 1, n_rows - 1);
1124 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1126 tab_title (tbl, _("Extreme Values"));
1128 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows -1);
1129 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows -1);
1133 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1134 var_to_string(fctr->indep_var[0]));
1136 if ( fctr->indep_var[1] )
1137 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1138 var_to_string(fctr->indep_var[1]));
1141 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Value"));
1142 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Case Number"));
1144 for ( i = 0 ; i < n_dep_var ; ++i )
1148 tab_hline(tbl, TAL_1, 0, n_cols -1 ,
1149 i * 2 * n_extremities * n_factors + heading_rows);
1152 i * 2 * n_extremities * n_factors + heading_rows,
1153 TAB_LEFT | TAT_TITLE,
1154 var_to_string(dependent_var[i])
1159 populate_extremes(tbl, heading_columns - 2,
1160 i * 2 * n_extremities * n_factors + heading_rows,
1161 n_extremities, &totals[i]);
1165 struct factor_statistics **fs = fctr->fs;
1170 static union value prev ;
1172 const int row = heading_rows + ( 2 * n_extremities ) *
1173 ( ( i * n_factors ) + count );
1176 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1177 fctr->indep_var[0]->width))
1181 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1185 TAB_LEFT | TAT_TITLE,
1186 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1190 prev = (*fs)->id[0];
1192 if (fctr->indep_var[1] && count > 0 )
1193 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
1195 if ( fctr->indep_var[1])
1196 tab_text (tbl, 2, row,
1197 TAB_LEFT | TAT_TITLE,
1198 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1201 populate_extremes(tbl, heading_columns - 2,
1216 /* Fill in the extremities table */
1218 populate_extremes(struct tab_table *t,
1219 int col, int row, int n, const struct metrics *m)
1225 tab_text(t, col, row,
1226 TAB_RIGHT | TAT_TITLE ,
1230 tab_text(t, col, row + n ,
1231 TAB_RIGHT | TAT_TITLE ,
1236 tab_hline(t, TAL_1, col, col + 3, row + n );
1238 for (extremity = 0; extremity < n ; ++extremity )
1241 tab_float(t, col + 1, row + extremity,
1243 extremity + 1, 8, 0);
1247 tab_float(t, col + 1, row + extremity + n,
1249 extremity + 1, 8, 0);
1255 for (idx = 0, extremity = 0; extremity < n && idx < m->n_data ; ++idx )
1258 const struct weighted_value *wv = m->wvp[idx];
1259 struct case_node *cn = wv->case_nos;
1262 for (j = 0 ; j < wv->w ; ++j )
1264 if ( extremity + j >= n )
1267 tab_float(t, col + 3, row + extremity + j + n,
1271 tab_float(t, col + 2, row + extremity + j + n,
1280 extremity += wv->w ;
1285 for (idx = m->n_data - 1, extremity = 0; extremity < n && idx >= 0; --idx )
1288 const struct weighted_value *wv = m->wvp[idx];
1289 struct case_node *cn = wv->case_nos;
1291 for (j = 0 ; j < wv->w ; ++j )
1293 if ( extremity + j >= n )
1296 tab_float(t, col + 3, row + extremity + j,
1300 tab_float(t, col + 2, row + extremity + j,
1309 extremity += wv->w ;
1314 /* Show the descriptives table */
1316 show_descriptives(struct variable **dependent_var,
1318 struct factor *fctr)
1321 int heading_columns ;
1323 const int n_stat_rows = 13;
1325 const int heading_rows = 1;
1327 struct tab_table *tbl;
1334 heading_columns = 4;
1335 n_factors = hsh_count(fctr->fstats);
1337 n_rows = n_dep_var * n_stat_rows * n_factors;
1339 if ( fctr->indep_var[1] )
1340 heading_columns = 5;
1344 heading_columns = 3;
1345 n_rows = n_dep_var * n_stat_rows;
1348 n_rows += heading_rows;
1350 n_cols = heading_columns + 2;
1353 tbl = tab_create (n_cols, n_rows, 0);
1355 tab_headers (tbl, heading_columns + 1, 0, heading_rows, 0);
1357 tab_dim (tbl, tab_natural_dimensions);
1359 /* Outline the box and have no internal lines*/
1364 n_cols - 1, n_rows - 1);
1366 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1368 tab_vline (tbl, TAL_1, heading_columns, 0, n_rows - 1);
1369 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows - 1);
1370 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows - 1);
1372 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Statistic"));
1373 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
1375 tab_title (tbl, _("Descriptives"));
1378 for ( i = 0 ; i < n_dep_var ; ++i )
1380 const int row = heading_rows + i * n_stat_rows * n_factors ;
1383 tab_hline(tbl, TAL_1, 0, n_cols - 1, row );
1386 i * n_stat_rows * n_factors + heading_rows,
1387 TAB_LEFT | TAT_TITLE,
1388 var_to_string(dependent_var[i])
1394 struct factor_statistics **fs = fctr->fs;
1397 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1398 var_to_string(fctr->indep_var[0]));
1401 if ( fctr->indep_var[1])
1402 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1403 var_to_string(fctr->indep_var[1]));
1408 static union value prev ;
1410 const int row = heading_rows + n_stat_rows *
1411 ( ( i * n_factors ) + count );
1414 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1415 fctr->indep_var[0]->width))
1419 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1423 TAB_LEFT | TAT_TITLE,
1424 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1428 prev = (*fs)->id[0];
1430 if (fctr->indep_var[1] && count > 0 )
1431 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
1433 if ( fctr->indep_var[1])
1434 tab_text (tbl, 2, row,
1435 TAB_LEFT | TAT_TITLE,
1436 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1439 populate_descriptives(tbl, heading_columns - 2,
1451 populate_descriptives(tbl, heading_columns - 2,
1452 i * n_stat_rows * n_factors + heading_rows,
1464 /* Fill in the descriptives data */
1466 populate_descriptives(struct tab_table *tbl, int col, int row,
1467 const struct metrics *m)
1470 const double t = gsl_cdf_tdist_Qinv(1 - cmd.n_cinterval[0]/100.0/2.0, \
1476 TAB_LEFT | TAT_TITLE,
1479 tab_float (tbl, col + 2,
1485 tab_float (tbl, col + 3,
1494 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1495 _("%g%% Confidence Interval for Mean"), cmd.n_cinterval[0]);
1498 tab_text (tbl, col + 1,
1500 TAB_LEFT | TAT_TITLE,
1503 tab_float (tbl, col + 2,
1506 m->mean - t * m->se_mean,
1509 tab_text (tbl, col + 1,
1511 TAB_LEFT | TAT_TITLE,
1515 tab_float (tbl, col + 2,
1518 m->mean + t * m->se_mean,
1523 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1524 _("5%% Trimmed Mean"));
1526 tab_float (tbl, col + 2,
1534 TAB_LEFT | TAT_TITLE,
1538 struct percentile *p;
1541 p = hsh_find(m->ptile_hash, &d);
1546 tab_float (tbl, col + 2,
1556 TAB_LEFT | TAT_TITLE,
1559 tab_float (tbl, col + 2,
1568 TAB_LEFT | TAT_TITLE,
1569 _("Std. Deviation"));
1572 tab_float (tbl, col + 2,
1581 TAB_LEFT | TAT_TITLE,
1584 tab_float (tbl, col + 2,
1592 TAB_LEFT | TAT_TITLE,
1595 tab_float (tbl, col + 2,
1604 TAB_LEFT | TAT_TITLE,
1608 tab_float (tbl, col + 2,
1616 TAB_LEFT | TAT_TITLE,
1617 _("Interquartile Range"));
1620 struct percentile *p1;
1621 struct percentile *p2;
1624 p1 = hsh_find(m->ptile_hash, &d);
1627 p2 = hsh_find(m->ptile_hash, &d);
1632 tab_float (tbl, col + 2,
1643 TAB_LEFT | TAT_TITLE,
1647 tab_float (tbl, col + 2,
1653 /* stderr of skewness */
1654 tab_float (tbl, col + 3,
1663 TAB_LEFT | TAT_TITLE,
1667 tab_float (tbl, col + 2,
1673 /* stderr of kurtosis */
1674 tab_float (tbl, col + 3,
1686 box_plot_variables(const struct factor *fctr,
1687 const struct variable **vars, int n_vars,
1688 const struct variable *id)
1692 struct factor_statistics **fs ;
1696 box_plot_group(fctr, vars, n_vars, id);
1700 for ( fs = fctr->fs ; *fs ; ++fs )
1702 double y_min = DBL_MAX;
1703 double y_max = -DBL_MAX;
1704 struct chart *ch = chart_create();
1705 const char *s = factor_to_string(fctr, *fs, 0 );
1707 chart_write_title(ch, s);
1709 for ( i = 0 ; i < n_vars ; ++i )
1711 y_max = max(y_max, (*fs)->m[i].max);
1712 y_min = min(y_min, (*fs)->m[i].min);
1715 boxplot_draw_yscale(ch, y_max, y_min);
1717 for ( i = 0 ; i < n_vars ; ++i )
1720 const double box_width = (ch->data_right - ch->data_left)
1723 const double box_centre = ( i * 2 + 1) * box_width
1726 boxplot_draw_boxplot(ch,
1727 box_centre, box_width,
1729 var_to_string(vars[i]));
1741 /* Do a box plot, grouping all factors into one plot ;
1742 each dependent variable has its own plot.
1745 box_plot_group(const struct factor *fctr,
1746 const struct variable **vars,
1748 const struct variable *id UNUSED)
1753 for ( i = 0 ; i < n_vars ; ++i )
1755 struct factor_statistics **fs ;
1758 ch = chart_create();
1760 boxplot_draw_yscale(ch, totals[i].max, totals[i].min);
1766 for ( fs = fctr->fs ; *fs ; ++fs )
1769 chart_write_title(ch, _("Boxplot of %s vs. %s"),
1770 var_to_string(vars[i]), var_to_string(fctr->indep_var[0]) );
1772 for ( fs = fctr->fs ; *fs ; ++fs )
1775 const char *s = factor_to_string_concise(fctr, *fs);
1777 const double box_width = (ch->data_right - ch->data_left)
1778 / (n_factors * 2.0 ) ;
1780 const double box_centre = ( f++ * 2 + 1) * box_width
1783 boxplot_draw_boxplot(ch,
1784 box_centre, box_width,
1791 const double box_width = (ch->data_right - ch->data_left) / 3.0;
1792 const double box_centre = (ch->data_right + ch->data_left) / 2.0;
1794 chart_write_title(ch, _("Boxplot"));
1796 boxplot_draw_boxplot(ch,
1797 box_centre, box_width,
1799 var_to_string(vars[i]) );
1808 /* Plot the normal and detrended normal plots for m
1809 Label the plots with factorname */
1811 np_plot(const struct metrics *m, const char *factorname)
1814 double yfirst=0, ylast=0;
1817 struct chart *np_chart;
1819 /* Detrended Normal Plot */
1820 struct chart *dnp_chart;
1822 /* The slope and intercept of the ideal normal probability line */
1823 const double slope = 1.0 / m->stddev;
1824 const double intercept = - m->mean / m->stddev;
1826 /* Cowardly refuse to plot an empty data set */
1827 if ( m->n_data == 0 )
1830 np_chart = chart_create();
1831 dnp_chart = chart_create();
1833 if ( !np_chart || ! dnp_chart )
1836 chart_write_title(np_chart, _("Normal Q-Q Plot of %s"), factorname);
1837 chart_write_xlabel(np_chart, _("Observed Value"));
1838 chart_write_ylabel(np_chart, _("Expected Normal"));
1841 chart_write_title(dnp_chart, _("Detrended Normal Q-Q Plot of %s"),
1843 chart_write_xlabel(dnp_chart, _("Observed Value"));
1844 chart_write_ylabel(dnp_chart, _("Dev from Normal"));
1846 yfirst = gsl_cdf_ugaussian_Pinv (m->wvp[0]->rank / ( m->n + 1));
1847 ylast = gsl_cdf_ugaussian_Pinv (m->wvp[m->n_data-1]->rank / ( m->n + 1));
1851 /* Need to make sure that both the scatter plot and the ideal fit into the
1853 double x_lower = min(m->min, (yfirst - intercept) / slope) ;
1854 double x_upper = max(m->max, (ylast - intercept) / slope) ;
1855 double slack = (x_upper - x_lower) * 0.05 ;
1857 chart_write_xscale(np_chart, x_lower - slack, x_upper + slack, 5);
1859 chart_write_xscale(dnp_chart, m->min, m->max, 5);
1863 chart_write_yscale(np_chart, yfirst, ylast, 5);
1866 /* We have to cache the detrended data, beacause we need to
1867 find its limits before we can plot it */
1868 double *d_data = xnmalloc (m->n_data, sizeof *d_data);
1869 double d_max = -DBL_MAX;
1870 double d_min = DBL_MAX;
1871 for ( i = 0 ; i < m->n_data; ++i )
1873 const double ns = gsl_cdf_ugaussian_Pinv (m->wvp[i]->rank / ( m->n + 1));
1875 chart_datum(np_chart, 0, m->wvp[i]->v.f, ns);
1877 d_data[i] = (m->wvp[i]->v.f - m->mean) / m->stddev - ns;
1879 if ( d_data[i] < d_min ) d_min = d_data[i];
1880 if ( d_data[i] > d_max ) d_max = d_data[i];
1882 chart_write_yscale(dnp_chart, d_min, d_max, 5);
1884 for ( i = 0 ; i < m->n_data; ++i )
1885 chart_datum(dnp_chart, 0, m->wvp[i]->v.f, d_data[i]);
1890 chart_line(np_chart, slope, intercept, yfirst, ylast , CHART_DIM_Y);
1891 chart_line(dnp_chart, 0, 0, m->min, m->max , CHART_DIM_X);
1893 chart_submit(np_chart);
1894 chart_submit(dnp_chart);
1900 /* Show the percentiles */
1902 show_percentiles(struct variable **dependent_var,
1904 struct factor *fctr)
1906 struct tab_table *tbl;
1912 struct hsh_table *ptiles ;
1914 int n_heading_columns;
1915 const int n_heading_rows = 2;
1916 const int n_stat_rows = 2;
1922 struct factor_statistics **fs = fctr->fs ;
1923 n_heading_columns = 3;
1924 n_factors = hsh_count(fctr->fstats);
1926 ptiles = (*fs)->m[0].ptile_hash;
1928 if ( fctr->indep_var[1] )
1929 n_heading_columns = 4;
1934 n_heading_columns = 2;
1936 ptiles = totals[0].ptile_hash;
1939 n_ptiles = hsh_count(ptiles);
1941 n_rows = n_heading_rows + n_dep_var * n_stat_rows * n_factors;
1943 n_cols = n_heading_columns + n_ptiles ;
1945 tbl = tab_create (n_cols, n_rows, 0);
1947 tab_headers (tbl, n_heading_columns + 1, 0, n_heading_rows, 0);
1949 tab_dim (tbl, tab_natural_dimensions);
1951 /* Outline the box and have no internal lines*/
1956 n_cols - 1, n_rows - 1);
1958 tab_hline (tbl, TAL_2, 0, n_cols - 1, n_heading_rows );
1960 tab_vline (tbl, TAL_2, n_heading_columns, 0, n_rows - 1);
1963 tab_title (tbl, _("Percentiles"));
1966 tab_hline (tbl, TAL_1, n_heading_columns, n_cols - 1, 1 );
1973 n_heading_columns - 1, n_rows - 1);
1979 n_heading_columns, n_heading_rows - 1,
1980 n_cols - 1, n_rows - 1);
1982 tab_joint_text(tbl, n_heading_columns + 1, 0,
1984 TAB_CENTER | TAT_TITLE ,
1989 /* Put in the percentile break points as headings */
1991 struct percentile **p = (struct percentile **) hsh_sort(ptiles);
1996 tab_float(tbl, n_heading_columns + i++ , 1,
2005 for ( i = 0 ; i < n_dep_var ; ++i )
2007 const int n_stat_rows = 2;
2008 const int row = n_heading_rows + i * n_stat_rows * n_factors ;
2011 tab_hline(tbl, TAL_1, 0, n_cols - 1, row );
2014 i * n_stat_rows * n_factors + n_heading_rows,
2015 TAB_LEFT | TAT_TITLE,
2016 var_to_string(dependent_var[i])
2021 struct factor_statistics **fs = fctr->fs;
2024 tab_text (tbl, 1, n_heading_rows - 1,
2025 TAB_CENTER | TAT_TITLE,
2026 var_to_string(fctr->indep_var[0]));
2029 if ( fctr->indep_var[1])
2030 tab_text (tbl, 2, n_heading_rows - 1, TAB_CENTER | TAT_TITLE,
2031 var_to_string(fctr->indep_var[1]));
2036 static union value prev ;
2038 const int row = n_heading_rows + n_stat_rows *
2039 ( ( i * n_factors ) + count );
2042 if ( 0 != compare_values(&prev, &(*fs)->id[0],
2043 fctr->indep_var[0]->width))
2047 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
2051 TAB_LEFT | TAT_TITLE,
2052 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
2058 prev = (*fs)->id[0];
2060 if (fctr->indep_var[1] && count > 0 )
2061 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
2063 if ( fctr->indep_var[1])
2064 tab_text (tbl, 2, row,
2065 TAB_LEFT | TAT_TITLE,
2066 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
2070 populate_percentiles(tbl, n_heading_columns - 1,
2082 populate_percentiles(tbl, n_heading_columns - 1,
2083 i * n_stat_rows * n_factors + n_heading_rows,
2100 populate_percentiles(struct tab_table *tbl, int col, int row,
2101 const struct metrics *m)
2105 struct percentile **p = (struct percentile **) hsh_sort(m->ptile_hash);
2109 TAB_LEFT | TAT_TITLE,
2110 _("Tukey\'s Hinges")
2115 TAB_LEFT | TAT_TITLE,
2116 ptile_alg_desc[m->ptile_alg]
2123 tab_float(tbl, col + i + 1 , row,
2126 if ( (*p)->p == 25 )
2127 tab_float(tbl, col + i + 1 , row + 1,
2131 if ( (*p)->p == 50 )
2132 tab_float(tbl, col + i + 1 , row + 1,
2136 if ( (*p)->p == 75 )
2137 tab_float(tbl, col + i + 1 , row + 1,
2152 factor_to_string(const struct factor *fctr,
2153 struct factor_statistics *fs,
2154 const struct variable *var)
2157 static char buf1[100];
2163 sprintf(buf1, "%s (",var_to_string(var) );
2166 snprintf(buf2, 100, "%s = %s",
2167 var_to_string(fctr->indep_var[0]),
2168 value_to_string(&fs->id[0],fctr->indep_var[0]));
2172 if ( fctr->indep_var[1] )
2174 sprintf(buf2, "; %s = %s)",
2175 var_to_string(fctr->indep_var[1]),
2176 value_to_string(&fs->id[1],
2177 fctr->indep_var[1]));
2192 factor_to_string_concise(const struct factor *fctr,
2193 struct factor_statistics *fs)
2197 static char buf[100];
2201 snprintf(buf, 100, "%s",
2202 value_to_string(&fs->id[0], fctr->indep_var[0]));
2204 if ( fctr->indep_var[1] )
2206 sprintf(buf2, ",%s)", value_to_string(&fs->id[1], fctr->indep_var[1]) );