1 /* PSPP - EXAMINE data for normality . -*-c-*-
3 Copyright (C) 2004 Free Software Foundation, Inc.
4 Author: John Darrington 2004
6 This program is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
23 #include <gsl/gsl_cdf.h>
24 #include <libpspp/message.h>
29 #include <data/case.h>
30 #include <data/casefile.h>
31 #include <data/dictionary.h>
32 #include <data/procedure.h>
33 #include <data/value-labels.h>
34 #include <data/variable.h>
35 #include <language/command.h>
36 #include <language/dictionary/split-file.h>
37 #include <language/lexer/lexer.h>
38 #include <libpspp/alloc.h>
39 #include <libpspp/compiler.h>
40 #include <libpspp/hash.h>
41 #include <libpspp/magic.h>
42 #include <libpspp/message.h>
43 #include <libpspp/misc.h>
44 #include <libpspp/str.h>
45 #include <math/factor-stats.h>
46 #include <math/moments.h>
47 #include <math/percentiles.h>
48 #include <output/charts/box-whisker.h>
49 #include <output/charts/cartesian.h>
50 #include <output/manager.h>
51 #include <output/table.h>
54 #define _(msgid) gettext (msgid)
55 #define N_(msgid) msgid
58 #include <output/chart.h>
59 #include <output/charts/plot-hist.h>
60 #include <output/charts/plot-chart.h>
67 missing=miss:pairwise/!listwise,
69 incl:include/!exclude;
70 +compare=cmp:variables/!groups;
73 +plot[plt_]=stemleaf,boxplot,npplot,:spreadlevel(*d:n),histogram,all,none;
75 +statistics[st_]=descriptives,:extreme(*d:n),all,none.
84 static struct cmd_examine cmd;
86 static struct variable **dependent_vars;
88 static size_t n_dependent_vars;
93 /* The independent variable */
94 struct variable *indep_var[2];
97 /* Hash table of factor stats indexed by 2 values */
98 struct hsh_table *fstats;
100 /* The hash table after it has been crunched */
101 struct factor_statistics **fs;
107 /* Linked list of factors */
108 static struct factor *factors=0;
110 static struct metrics *totals=0;
112 /* Parse the clause specifying the factors */
113 static int examine_parse_independent_vars (const struct dictionary *dict, struct cmd_examine *cmd);
117 /* Output functions */
118 static void show_summary(struct variable **dependent_var, int n_dep_var,
119 const struct factor *f);
121 static void show_extremes(struct variable **dependent_var,
123 const struct factor *factor,
126 static void show_descriptives(struct variable **dependent_var,
128 struct factor *factor);
130 static void show_percentiles(struct variable **dependent_var,
132 struct factor *factor);
137 void np_plot(const struct metrics *m, const char *factorname);
140 void box_plot_group(const struct factor *fctr,
141 const struct variable **vars, int n_vars,
142 const struct variable *id
146 void box_plot_variables(const struct factor *fctr,
147 const struct variable **vars, int n_vars,
148 const struct variable *id
153 /* Per Split function */
154 static bool run_examine (const struct ccase *,
155 const struct casefile *cf, void *cmd_, const struct dataset *);
157 static void output_examine(void);
160 void factor_calc(struct ccase *c, int case_no,
161 double weight, int case_missing);
164 /* Represent a factor as a string, so it can be
165 printed in a human readable fashion */
166 const char * factor_to_string(const struct factor *fctr,
167 struct factor_statistics *fs,
168 const struct variable *var);
171 /* Represent a factor as a string, so it can be
172 printed in a human readable fashion,
173 but sacrificing some readablility for the sake of brevity */
174 const char *factor_to_string_concise(const struct factor *fctr,
175 struct factor_statistics *fs);
180 /* Function to use for testing for missing values */
181 static is_missing_func *value_is_missing;
186 static subc_list_double percentile_list;
188 static enum pc_alg percentile_algorithm;
190 static short sbc_percentile;
194 cmd_examine (struct dataset *ds)
198 subc_list_double_create(&percentile_list);
199 percentile_algorithm = PC_HAVERAGE;
201 if ( !parse_examine (ds, &cmd, NULL) )
204 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
205 if (cmd.incl == XMN_INCLUDE )
206 value_is_missing = mv_is_value_system_missing;
208 value_is_missing = mv_is_value_missing;
210 if ( cmd.st_n == SYSMIS )
213 if ( ! cmd.sbc_cinterval)
214 cmd.n_cinterval[0] = 95.0;
216 /* If descriptives have been requested, make sure the
217 quartiles are calculated */
218 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] )
220 subc_list_double_push(&percentile_list, 25);
221 subc_list_double_push(&percentile_list, 50);
222 subc_list_double_push(&percentile_list, 75);
225 ok = multipass_procedure_with_splits (ds, run_examine, &cmd);
232 if ( dependent_vars )
233 free (dependent_vars);
236 struct factor *f = factors ;
239 struct factor *ff = f;
243 hsh_destroy ( ff->fstats ) ;
249 subc_list_double_destroy(&percentile_list);
251 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
256 /* Show all the appropriate tables */
262 /* Show totals if appropriate */
263 if ( ! cmd.sbc_nototal || factors == 0 )
265 show_summary(dependent_vars, n_dependent_vars, 0);
267 if ( cmd.sbc_statistics )
269 if ( cmd.a_statistics[XMN_ST_EXTREME])
270 show_extremes(dependent_vars, n_dependent_vars, 0, cmd.st_n);
272 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
273 show_descriptives(dependent_vars, n_dependent_vars, 0);
276 if ( sbc_percentile )
277 show_percentiles(dependent_vars, n_dependent_vars, 0);
282 if ( cmd.a_plot[XMN_PLT_STEMLEAF] )
283 msg (SW, _("%s is not currently supported."), "STEMLEAF");
285 if ( cmd.a_plot[XMN_PLT_SPREADLEVEL] )
286 msg (SW, _("%s is not currently supported."), "SPREADLEVEL");
288 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
290 for ( v = 0 ; v < n_dependent_vars; ++v )
291 np_plot(&totals[v], var_to_string(dependent_vars[v]));
294 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
296 if ( cmd.cmp == XMN_GROUPS )
298 box_plot_group (0, (const struct variable **) dependent_vars,
299 n_dependent_vars, cmd.v_id);
302 box_plot_variables (0,
303 (const struct variable **) dependent_vars,
304 n_dependent_vars, cmd.v_id);
307 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
309 for ( v = 0 ; v < n_dependent_vars; ++v )
311 struct normal_curve normal;
313 normal.N = totals[v].n;
314 normal.mean = totals[v].mean;
315 normal.stddev = totals[v].stddev;
317 histogram_plot(totals[v].histogram,
318 var_to_string(dependent_vars[v]),
328 /* Show grouped statistics as appropriate */
332 show_summary(dependent_vars, n_dependent_vars, fctr);
334 if ( cmd.sbc_statistics )
336 if ( cmd.a_statistics[XMN_ST_EXTREME])
337 show_extremes(dependent_vars, n_dependent_vars, fctr, cmd.st_n);
339 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
340 show_descriptives(dependent_vars, n_dependent_vars, fctr);
343 if ( sbc_percentile )
344 show_percentiles(dependent_vars, n_dependent_vars, fctr);
351 struct factor_statistics **fs = fctr->fs ;
353 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
355 if ( cmd.cmp == XMN_VARIABLES )
356 box_plot_variables (fctr,
357 (const struct variable **) dependent_vars,
358 n_dependent_vars, cmd.v_id);
360 box_plot_group (fctr,
361 (const struct variable **) dependent_vars,
362 n_dependent_vars, cmd.v_id);
365 for ( v = 0 ; v < n_dependent_vars; ++v )
368 for ( fs = fctr->fs ; *fs ; ++fs )
370 const char *s = factor_to_string(fctr, *fs, dependent_vars[v]);
372 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
373 np_plot(&(*fs)->m[v], s);
375 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
377 struct normal_curve normal;
379 normal.N = (*fs)->m[v].n;
380 normal.mean = (*fs)->m[v].mean;
381 normal.stddev = (*fs)->m[v].stddev;
383 histogram_plot((*fs)->m[v].histogram,
387 } /* for ( fs .... */
389 } /* for ( v = 0 ..... */
399 /* Create a hash table of percentiles and their values from the list of
401 static struct hsh_table *
402 list_to_ptile_hash(const subc_list_double *l)
406 struct hsh_table *h ;
408 h = hsh_create(subc_list_double_count(l),
409 (hsh_compare_func *) ptile_compare,
410 (hsh_hash_func *) ptile_hash,
411 (hsh_free_func *) free,
415 for ( i = 0 ; i < subc_list_double_count(l) ; ++i )
417 struct percentile *p = xmalloc (sizeof *p);
419 p->p = subc_list_double_at(l,i);
430 /* Parse the PERCENTILES subcommand */
432 xmn_custom_percentiles(struct dataset *ds UNUSED,
433 struct cmd_examine *p UNUSED, void *aux UNUSED)
441 while ( lex_is_number() )
443 subc_list_double_push (&percentile_list, lex_number());
453 if ( lex_match_id("HAVERAGE"))
454 percentile_algorithm = PC_HAVERAGE;
456 else if ( lex_match_id("WAVERAGE"))
457 percentile_algorithm = PC_WAVERAGE;
459 else if ( lex_match_id("ROUND"))
460 percentile_algorithm = PC_ROUND;
462 else if ( lex_match_id("EMPIRICAL"))
463 percentile_algorithm = PC_EMPIRICAL;
465 else if ( lex_match_id("AEMPIRICAL"))
466 percentile_algorithm = PC_AEMPIRICAL;
468 else if ( lex_match_id("NONE"))
469 percentile_algorithm = PC_NONE;
472 if ( 0 == subc_list_double_count(&percentile_list))
474 subc_list_double_push (&percentile_list, 5);
475 subc_list_double_push (&percentile_list, 10);
476 subc_list_double_push (&percentile_list, 25);
477 subc_list_double_push (&percentile_list, 50);
478 subc_list_double_push (&percentile_list, 75);
479 subc_list_double_push (&percentile_list, 90);
480 subc_list_double_push (&percentile_list, 95);
486 /* TOTAL and NOTOTAL are simple, mutually exclusive flags */
488 xmn_custom_total (struct dataset *ds UNUSED, struct cmd_examine *p, void *aux UNUSED)
490 if ( p->sbc_nototal )
492 msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
500 xmn_custom_nototal (struct dataset *ds UNUSED,
501 struct cmd_examine *p, void *aux UNUSED)
505 msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
514 /* Parser for the variables sub command
515 Returns 1 on success */
517 xmn_custom_variables(struct dataset *ds, struct cmd_examine *cmd, void *aux UNUSED)
519 const struct dictionary *dict = dataset_dict (ds);
522 if ((token != T_ID || dict_lookup_var (dict, tokid) == NULL)
528 if (!parse_variables (dict, &dependent_vars, &n_dependent_vars,
529 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
531 free (dependent_vars);
535 assert(n_dependent_vars);
537 totals = xnmalloc (n_dependent_vars, sizeof *totals);
539 if ( lex_match(T_BY))
542 success = examine_parse_independent_vars (dict, cmd);
543 if ( success != 1 ) {
544 free (dependent_vars);
555 /* Parse the clause specifying the factors */
557 examine_parse_independent_vars (const struct dictionary *dict, struct cmd_examine *cmd)
560 struct factor *sf = xmalloc (sizeof *sf);
562 if ((token != T_ID || dict_lookup_var (dict, tokid) == NULL)
570 sf->indep_var[0] = parse_variable (dict);
571 sf->indep_var[1] = 0;
578 if ((token != T_ID || dict_lookup_var (dict, tokid) == NULL)
585 sf->indep_var[1] = parse_variable (dict);
590 sf->fstats = hsh_create(4,
591 (hsh_compare_func *) factor_statistics_compare,
592 (hsh_hash_func *) factor_statistics_hash,
593 (hsh_free_func *) factor_statistics_free,
601 if ( token == '.' || token == '/' )
604 success = examine_parse_independent_vars (dict, cmd);
615 void populate_percentiles(struct tab_table *tbl, int col, int row,
616 const struct metrics *m);
618 void populate_descriptives(struct tab_table *t, int col, int row,
619 const struct metrics *fs);
621 void populate_extremes(struct tab_table *t, int col, int row, int n,
622 const struct metrics *m);
624 void populate_summary(struct tab_table *t, int col, int row,
625 const struct metrics *m);
630 static bool bad_weight_warn = true;
633 /* Perform calculations for the sub factors */
635 factor_calc(struct ccase *c, int case_no, double weight, int case_missing)
638 struct factor *fctr = factors;
642 struct factor_statistics **foo ;
643 union value indep_vals[2] ;
645 indep_vals[0] = * case_data(c, fctr->indep_var[0]->fv);
647 if ( fctr->indep_var[1] )
648 indep_vals[1] = * case_data(c, fctr->indep_var[1]->fv);
650 indep_vals[1].f = SYSMIS;
652 assert(fctr->fstats);
654 foo = ( struct factor_statistics ** )
655 hsh_probe(fctr->fstats, (void *) &indep_vals);
660 *foo = create_factor_statistics(n_dependent_vars,
664 for ( v = 0 ; v < n_dependent_vars ; ++v )
666 metrics_precalc( &(*foo)->m[v] );
671 for ( v = 0 ; v < n_dependent_vars ; ++v )
673 const struct variable *var = dependent_vars[v];
674 const union value *val = case_data (c, var->fv);
676 if ( value_is_missing (&var->miss, val) || case_missing )
679 metrics_calc( &(*foo)->m[v], val, weight, case_no);
690 run_examine(const struct ccase *first, const struct casefile *cf,
691 void *cmd_, const struct dataset *ds)
693 struct dictionary *dict = dataset_dict (ds);
694 struct casereader *r;
698 const struct cmd_examine *cmd = (struct cmd_examine *) cmd_;
702 output_split_file_values (ds, first);
704 /* Make sure we haven't got rubbish left over from a
709 struct factor *next = fctr->next;
711 hsh_clear(fctr->fstats);
718 for ( v = 0 ; v < n_dependent_vars ; ++v )
719 metrics_precalc(&totals[v]);
721 for(r = casefile_get_reader (cf, NULL);
722 casereader_read (r, &c) ;
726 const int case_no = casereader_cnum(r);
728 const double weight =
729 dict_get_case_weight(dict, &c, &bad_weight_warn);
731 if ( cmd->miss == XMN_LISTWISE )
733 for ( v = 0 ; v < n_dependent_vars ; ++v )
735 const struct variable *var = dependent_vars[v];
736 const union value *val = case_data (&c, var->fv);
738 if ( value_is_missing(&var->miss, val))
744 for ( v = 0 ; v < n_dependent_vars ; ++v )
746 const struct variable *var = dependent_vars[v];
747 const union value *val = case_data (&c, var->fv);
749 if ( value_is_missing(&var->miss, val) || case_missing )
752 metrics_calc(&totals[v], val, weight, case_no);
756 factor_calc(&c, case_no, weight, case_missing);
761 for ( v = 0 ; v < n_dependent_vars ; ++v)
766 struct hsh_iterator hi;
767 struct factor_statistics *fs;
769 for ( fs = hsh_first(fctr->fstats, &hi);
771 fs = hsh_next(fctr->fstats, &hi))
774 fs->m[v].ptile_hash = list_to_ptile_hash(&percentile_list);
775 fs->m[v].ptile_alg = percentile_algorithm;
776 metrics_postcalc(&fs->m[v]);
782 totals[v].ptile_hash = list_to_ptile_hash(&percentile_list);
783 totals[v].ptile_alg = percentile_algorithm;
784 metrics_postcalc(&totals[v]);
788 /* Make sure that the combination of factors are complete */
793 struct hsh_iterator hi;
794 struct hsh_iterator hi0;
795 struct hsh_iterator hi1;
796 struct factor_statistics *fs;
798 struct hsh_table *idh0=0;
799 struct hsh_table *idh1=0;
803 idh0 = hsh_create(4, (hsh_compare_func *) compare_values,
804 (hsh_hash_func *) hash_value,
807 idh1 = hsh_create(4, (hsh_compare_func *) compare_values,
808 (hsh_hash_func *) hash_value,
812 for ( fs = hsh_first(fctr->fstats, &hi);
814 fs = hsh_next(fctr->fstats, &hi))
816 hsh_insert(idh0,(void *) &fs->id[0]);
817 hsh_insert(idh1,(void *) &fs->id[1]);
820 /* Ensure that the factors combination is complete */
821 for ( val0 = hsh_first(idh0, &hi0);
823 val0 = hsh_next(idh0, &hi0))
825 for ( val1 = hsh_first(idh1, &hi1);
827 val1 = hsh_next(idh1, &hi1))
829 struct factor_statistics **ffs;
834 ffs = (struct factor_statistics **)
835 hsh_probe(fctr->fstats, (void *) &key );
839 (*ffs) = create_factor_statistics (n_dependent_vars,
841 for ( i = 0 ; i < n_dependent_vars ; ++i )
842 metrics_precalc( &(*ffs)->m[i]);
850 fctr->fs = (struct factor_statistics **) hsh_sort_copy(fctr->fstats);
861 for ( i = 0 ; i < n_dependent_vars ; ++i )
863 metrics_destroy(&totals[i]);
872 show_summary(struct variable **dependent_var, int n_dep_var,
873 const struct factor *fctr)
875 static const char *subtitle[]=
883 int heading_columns ;
885 const int heading_rows = 3;
886 struct tab_table *tbl;
894 n_factors = hsh_count(fctr->fstats);
895 n_rows = n_dep_var * n_factors ;
897 if ( fctr->indep_var[1] )
906 n_rows += heading_rows;
908 n_cols = heading_columns + 6;
910 tbl = tab_create (n_cols,n_rows,0);
911 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
913 tab_dim (tbl, tab_natural_dimensions);
915 /* Outline the box */
920 n_cols - 1, n_rows - 1);
922 /* Vertical lines for the data only */
927 n_cols - 1, n_rows - 1);
930 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
931 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, 1 );
932 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, heading_rows -1 );
934 tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1);
937 tab_title (tbl, _("Case Processing Summary"));
940 tab_joint_text(tbl, heading_columns, 0,
942 TAB_CENTER | TAT_TITLE,
945 /* Remove lines ... */
952 for ( i = 0 ; i < 3 ; ++i )
954 tab_text (tbl, heading_columns + i*2 , 2, TAB_CENTER | TAT_TITLE,
957 tab_text (tbl, heading_columns + i*2 + 1, 2, TAB_CENTER | TAT_TITLE,
960 tab_joint_text(tbl, heading_columns + i*2 , 1,
961 heading_columns + i*2 + 1, 1,
962 TAB_CENTER | TAT_TITLE,
965 tab_box (tbl, -1, -1,
967 heading_columns + i*2, 1,
968 heading_columns + i*2 + 1, 1);
973 /* Titles for the independent variables */
976 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
977 var_to_string(fctr->indep_var[0]));
979 if ( fctr->indep_var[1] )
981 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
982 var_to_string(fctr->indep_var[1]));
988 for ( i = 0 ; i < n_dep_var ; ++i )
992 n_factors = hsh_count(fctr->fstats);
996 tab_hline(tbl, TAL_1, 0, n_cols -1 , i * n_factors + heading_rows);
999 0, i * n_factors + heading_rows,
1000 TAB_LEFT | TAT_TITLE,
1001 var_to_string(dependent_var[i])
1006 populate_summary(tbl, heading_columns,
1007 (i * n_factors) + heading_rows,
1013 struct factor_statistics **fs = fctr->fs;
1018 static union value prev;
1020 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1021 fctr->indep_var[0]->width))
1025 (i * n_factors ) + count +
1027 TAB_LEFT | TAT_TITLE,
1028 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1031 if (fctr->indep_var[1] && count > 0 )
1032 tab_hline(tbl, TAL_1, 1, n_cols - 1,
1033 (i * n_factors ) + count + heading_rows);
1037 prev = (*fs)->id[0];
1040 if ( fctr->indep_var[1])
1043 (i * n_factors ) + count +
1045 TAB_LEFT | TAT_TITLE,
1046 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1049 populate_summary(tbl, heading_columns,
1050 (i * n_factors) + count
1065 populate_summary(struct tab_table *t, int col, int row,
1066 const struct metrics *m)
1069 const double total = m->n + m->n_missing ;
1071 tab_float(t, col + 0, row + 0, TAB_RIGHT, m->n, 8, 0);
1072 tab_float(t, col + 2, row + 0, TAB_RIGHT, m->n_missing, 8, 0);
1073 tab_float(t, col + 4, row + 0, TAB_RIGHT, total, 8, 0);
1077 tab_text (t, col + 1, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1078 100.0 * m->n / total );
1080 tab_text (t, col + 3, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1081 100.0 * m->n_missing / total );
1083 /* This seems a bit pointless !!! */
1084 tab_text (t, col + 5, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1085 100.0 * total / total );
1096 show_extremes(struct variable **dependent_var, int n_dep_var,
1097 const struct factor *fctr, int n_extremities)
1100 int heading_columns ;
1102 const int heading_rows = 1;
1103 struct tab_table *tbl;
1110 heading_columns = 2;
1111 n_factors = hsh_count(fctr->fstats);
1113 n_rows = n_dep_var * 2 * n_extremities * n_factors;
1115 if ( fctr->indep_var[1] )
1116 heading_columns = 3;
1120 heading_columns = 1;
1121 n_rows = n_dep_var * 2 * n_extremities;
1124 n_rows += heading_rows;
1126 heading_columns += 2;
1127 n_cols = heading_columns + 2;
1129 tbl = tab_create (n_cols,n_rows,0);
1130 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
1132 tab_dim (tbl, tab_natural_dimensions);
1134 /* Outline the box, No internal lines*/
1139 n_cols - 1, n_rows - 1);
1141 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1143 tab_title (tbl, _("Extreme Values"));
1145 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows -1);
1146 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows -1);
1150 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1151 var_to_string(fctr->indep_var[0]));
1153 if ( fctr->indep_var[1] )
1154 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1155 var_to_string(fctr->indep_var[1]));
1158 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Value"));
1159 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Case Number"));
1161 for ( i = 0 ; i < n_dep_var ; ++i )
1165 tab_hline(tbl, TAL_1, 0, n_cols -1 ,
1166 i * 2 * n_extremities * n_factors + heading_rows);
1169 i * 2 * n_extremities * n_factors + heading_rows,
1170 TAB_LEFT | TAT_TITLE,
1171 var_to_string(dependent_var[i])
1176 populate_extremes(tbl, heading_columns - 2,
1177 i * 2 * n_extremities * n_factors + heading_rows,
1178 n_extremities, &totals[i]);
1182 struct factor_statistics **fs = fctr->fs;
1187 static union value prev ;
1189 const int row = heading_rows + ( 2 * n_extremities ) *
1190 ( ( i * n_factors ) + count );
1193 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1194 fctr->indep_var[0]->width))
1198 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1202 TAB_LEFT | TAT_TITLE,
1203 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1207 prev = (*fs)->id[0];
1209 if (fctr->indep_var[1] && count > 0 )
1210 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
1212 if ( fctr->indep_var[1])
1213 tab_text (tbl, 2, row,
1214 TAB_LEFT | TAT_TITLE,
1215 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1218 populate_extremes(tbl, heading_columns - 2,
1233 /* Fill in the extremities table */
1235 populate_extremes(struct tab_table *t,
1236 int col, int row, int n, const struct metrics *m)
1242 tab_text(t, col, row,
1243 TAB_RIGHT | TAT_TITLE ,
1247 tab_text(t, col, row + n ,
1248 TAB_RIGHT | TAT_TITLE ,
1253 tab_hline(t, TAL_1, col, col + 3, row + n );
1255 for (extremity = 0; extremity < n ; ++extremity )
1258 tab_float(t, col + 1, row + extremity,
1260 extremity + 1, 8, 0);
1264 tab_float(t, col + 1, row + extremity + n,
1266 extremity + 1, 8, 0);
1272 for (idx = 0, extremity = 0; extremity < n && idx < m->n_data ; ++idx )
1275 const struct weighted_value *wv = m->wvp[idx];
1276 struct case_node *cn = wv->case_nos;
1279 for (j = 0 ; j < wv->w ; ++j )
1281 if ( extremity + j >= n )
1284 tab_float(t, col + 3, row + extremity + j + n,
1288 tab_float(t, col + 2, row + extremity + j + n,
1297 extremity += wv->w ;
1302 for (idx = m->n_data - 1, extremity = 0; extremity < n && idx >= 0; --idx )
1305 const struct weighted_value *wv = m->wvp[idx];
1306 struct case_node *cn = wv->case_nos;
1308 for (j = 0 ; j < wv->w ; ++j )
1310 if ( extremity + j >= n )
1313 tab_float(t, col + 3, row + extremity + j,
1317 tab_float(t, col + 2, row + extremity + j,
1326 extremity += wv->w ;
1331 /* Show the descriptives table */
1333 show_descriptives(struct variable **dependent_var,
1335 struct factor *fctr)
1338 int heading_columns ;
1340 const int n_stat_rows = 13;
1342 const int heading_rows = 1;
1344 struct tab_table *tbl;
1351 heading_columns = 4;
1352 n_factors = hsh_count(fctr->fstats);
1354 n_rows = n_dep_var * n_stat_rows * n_factors;
1356 if ( fctr->indep_var[1] )
1357 heading_columns = 5;
1361 heading_columns = 3;
1362 n_rows = n_dep_var * n_stat_rows;
1365 n_rows += heading_rows;
1367 n_cols = heading_columns + 2;
1370 tbl = tab_create (n_cols, n_rows, 0);
1372 tab_headers (tbl, heading_columns + 1, 0, heading_rows, 0);
1374 tab_dim (tbl, tab_natural_dimensions);
1376 /* Outline the box and have no internal lines*/
1381 n_cols - 1, n_rows - 1);
1383 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1385 tab_vline (tbl, TAL_1, heading_columns, 0, n_rows - 1);
1386 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows - 1);
1387 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows - 1);
1389 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Statistic"));
1390 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
1392 tab_title (tbl, _("Descriptives"));
1395 for ( i = 0 ; i < n_dep_var ; ++i )
1397 const int row = heading_rows + i * n_stat_rows * n_factors ;
1400 tab_hline(tbl, TAL_1, 0, n_cols - 1, row );
1403 i * n_stat_rows * n_factors + heading_rows,
1404 TAB_LEFT | TAT_TITLE,
1405 var_to_string(dependent_var[i])
1411 struct factor_statistics **fs = fctr->fs;
1414 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1415 var_to_string(fctr->indep_var[0]));
1418 if ( fctr->indep_var[1])
1419 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1420 var_to_string(fctr->indep_var[1]));
1425 static union value prev ;
1427 const int row = heading_rows + n_stat_rows *
1428 ( ( i * n_factors ) + count );
1431 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1432 fctr->indep_var[0]->width))
1436 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1440 TAB_LEFT | TAT_TITLE,
1441 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1445 prev = (*fs)->id[0];
1447 if (fctr->indep_var[1] && count > 0 )
1448 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
1450 if ( fctr->indep_var[1])
1451 tab_text (tbl, 2, row,
1452 TAB_LEFT | TAT_TITLE,
1453 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1456 populate_descriptives(tbl, heading_columns - 2,
1468 populate_descriptives(tbl, heading_columns - 2,
1469 i * n_stat_rows * n_factors + heading_rows,
1481 /* Fill in the descriptives data */
1483 populate_descriptives(struct tab_table *tbl, int col, int row,
1484 const struct metrics *m)
1487 const double t = gsl_cdf_tdist_Qinv(1 - cmd.n_cinterval[0]/100.0/2.0, \
1493 TAB_LEFT | TAT_TITLE,
1496 tab_float (tbl, col + 2,
1502 tab_float (tbl, col + 3,
1511 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1512 _("%g%% Confidence Interval for Mean"), cmd.n_cinterval[0]);
1515 tab_text (tbl, col + 1,
1517 TAB_LEFT | TAT_TITLE,
1520 tab_float (tbl, col + 2,
1523 m->mean - t * m->se_mean,
1526 tab_text (tbl, col + 1,
1528 TAB_LEFT | TAT_TITLE,
1532 tab_float (tbl, col + 2,
1535 m->mean + t * m->se_mean,
1540 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1541 _("5%% Trimmed Mean"));
1543 tab_float (tbl, col + 2,
1551 TAB_LEFT | TAT_TITLE,
1555 struct percentile *p;
1558 p = hsh_find(m->ptile_hash, &d);
1563 tab_float (tbl, col + 2,
1573 TAB_LEFT | TAT_TITLE,
1576 tab_float (tbl, col + 2,
1585 TAB_LEFT | TAT_TITLE,
1586 _("Std. Deviation"));
1589 tab_float (tbl, col + 2,
1598 TAB_LEFT | TAT_TITLE,
1601 tab_float (tbl, col + 2,
1609 TAB_LEFT | TAT_TITLE,
1612 tab_float (tbl, col + 2,
1621 TAB_LEFT | TAT_TITLE,
1625 tab_float (tbl, col + 2,
1633 TAB_LEFT | TAT_TITLE,
1634 _("Interquartile Range"));
1637 struct percentile *p1;
1638 struct percentile *p2;
1641 p1 = hsh_find(m->ptile_hash, &d);
1644 p2 = hsh_find(m->ptile_hash, &d);
1649 tab_float (tbl, col + 2,
1660 TAB_LEFT | TAT_TITLE,
1664 tab_float (tbl, col + 2,
1670 /* stderr of skewness */
1671 tab_float (tbl, col + 3,
1680 TAB_LEFT | TAT_TITLE,
1684 tab_float (tbl, col + 2,
1690 /* stderr of kurtosis */
1691 tab_float (tbl, col + 3,
1703 box_plot_variables(const struct factor *fctr,
1704 const struct variable **vars, int n_vars,
1705 const struct variable *id)
1709 struct factor_statistics **fs ;
1713 box_plot_group(fctr, vars, n_vars, id);
1717 for ( fs = fctr->fs ; *fs ; ++fs )
1719 double y_min = DBL_MAX;
1720 double y_max = -DBL_MAX;
1721 struct chart *ch = chart_create();
1722 const char *s = factor_to_string(fctr, *fs, 0 );
1724 chart_write_title(ch, s);
1726 for ( i = 0 ; i < n_vars ; ++i )
1728 y_max = max(y_max, (*fs)->m[i].max);
1729 y_min = min(y_min, (*fs)->m[i].min);
1732 boxplot_draw_yscale(ch, y_max, y_min);
1734 for ( i = 0 ; i < n_vars ; ++i )
1737 const double box_width = (ch->data_right - ch->data_left)
1740 const double box_centre = ( i * 2 + 1) * box_width
1743 boxplot_draw_boxplot(ch,
1744 box_centre, box_width,
1746 var_to_string(vars[i]));
1758 /* Do a box plot, grouping all factors into one plot ;
1759 each dependent variable has its own plot.
1762 box_plot_group(const struct factor *fctr,
1763 const struct variable **vars,
1765 const struct variable *id UNUSED)
1770 for ( i = 0 ; i < n_vars ; ++i )
1772 struct factor_statistics **fs ;
1775 ch = chart_create();
1777 boxplot_draw_yscale(ch, totals[i].max, totals[i].min);
1783 for ( fs = fctr->fs ; *fs ; ++fs )
1786 chart_write_title(ch, _("Boxplot of %s vs. %s"),
1787 var_to_string(vars[i]), var_to_string(fctr->indep_var[0]) );
1789 for ( fs = fctr->fs ; *fs ; ++fs )
1792 const char *s = factor_to_string_concise(fctr, *fs);
1794 const double box_width = (ch->data_right - ch->data_left)
1795 / (n_factors * 2.0 ) ;
1797 const double box_centre = ( f++ * 2 + 1) * box_width
1800 boxplot_draw_boxplot(ch,
1801 box_centre, box_width,
1808 const double box_width = (ch->data_right - ch->data_left) / 3.0;
1809 const double box_centre = (ch->data_right + ch->data_left) / 2.0;
1811 chart_write_title(ch, _("Boxplot"));
1813 boxplot_draw_boxplot(ch,
1814 box_centre, box_width,
1816 var_to_string(vars[i]) );
1825 /* Plot the normal and detrended normal plots for m
1826 Label the plots with factorname */
1828 np_plot(const struct metrics *m, const char *factorname)
1831 double yfirst=0, ylast=0;
1834 struct chart *np_chart;
1836 /* Detrended Normal Plot */
1837 struct chart *dnp_chart;
1839 /* The slope and intercept of the ideal normal probability line */
1840 const double slope = 1.0 / m->stddev;
1841 const double intercept = - m->mean / m->stddev;
1843 /* Cowardly refuse to plot an empty data set */
1844 if ( m->n_data == 0 )
1847 np_chart = chart_create();
1848 dnp_chart = chart_create();
1850 if ( !np_chart || ! dnp_chart )
1853 chart_write_title(np_chart, _("Normal Q-Q Plot of %s"), factorname);
1854 chart_write_xlabel(np_chart, _("Observed Value"));
1855 chart_write_ylabel(np_chart, _("Expected Normal"));
1858 chart_write_title(dnp_chart, _("Detrended Normal Q-Q Plot of %s"),
1860 chart_write_xlabel(dnp_chart, _("Observed Value"));
1861 chart_write_ylabel(dnp_chart, _("Dev from Normal"));
1863 yfirst = gsl_cdf_ugaussian_Pinv (m->wvp[0]->rank / ( m->n + 1));
1864 ylast = gsl_cdf_ugaussian_Pinv (m->wvp[m->n_data-1]->rank / ( m->n + 1));
1868 /* Need to make sure that both the scatter plot and the ideal fit into the
1870 double x_lower = min(m->min, (yfirst - intercept) / slope) ;
1871 double x_upper = max(m->max, (ylast - intercept) / slope) ;
1872 double slack = (x_upper - x_lower) * 0.05 ;
1874 chart_write_xscale(np_chart, x_lower - slack, x_upper + slack, 5);
1876 chart_write_xscale(dnp_chart, m->min, m->max, 5);
1880 chart_write_yscale(np_chart, yfirst, ylast, 5);
1883 /* We have to cache the detrended data, beacause we need to
1884 find its limits before we can plot it */
1885 double *d_data = xnmalloc (m->n_data, sizeof *d_data);
1886 double d_max = -DBL_MAX;
1887 double d_min = DBL_MAX;
1888 for ( i = 0 ; i < m->n_data; ++i )
1890 const double ns = gsl_cdf_ugaussian_Pinv (m->wvp[i]->rank / ( m->n + 1));
1892 chart_datum(np_chart, 0, m->wvp[i]->v.f, ns);
1894 d_data[i] = (m->wvp[i]->v.f - m->mean) / m->stddev - ns;
1896 if ( d_data[i] < d_min ) d_min = d_data[i];
1897 if ( d_data[i] > d_max ) d_max = d_data[i];
1899 chart_write_yscale(dnp_chart, d_min, d_max, 5);
1901 for ( i = 0 ; i < m->n_data; ++i )
1902 chart_datum(dnp_chart, 0, m->wvp[i]->v.f, d_data[i]);
1907 chart_line(np_chart, slope, intercept, yfirst, ylast , CHART_DIM_Y);
1908 chart_line(dnp_chart, 0, 0, m->min, m->max , CHART_DIM_X);
1910 chart_submit(np_chart);
1911 chart_submit(dnp_chart);
1917 /* Show the percentiles */
1919 show_percentiles(struct variable **dependent_var,
1921 struct factor *fctr)
1923 struct tab_table *tbl;
1929 struct hsh_table *ptiles ;
1931 int n_heading_columns;
1932 const int n_heading_rows = 2;
1933 const int n_stat_rows = 2;
1939 struct factor_statistics **fs = fctr->fs ;
1940 n_heading_columns = 3;
1941 n_factors = hsh_count(fctr->fstats);
1943 ptiles = (*fs)->m[0].ptile_hash;
1945 if ( fctr->indep_var[1] )
1946 n_heading_columns = 4;
1951 n_heading_columns = 2;
1953 ptiles = totals[0].ptile_hash;
1956 n_ptiles = hsh_count(ptiles);
1958 n_rows = n_heading_rows + n_dep_var * n_stat_rows * n_factors;
1960 n_cols = n_heading_columns + n_ptiles ;
1962 tbl = tab_create (n_cols, n_rows, 0);
1964 tab_headers (tbl, n_heading_columns + 1, 0, n_heading_rows, 0);
1966 tab_dim (tbl, tab_natural_dimensions);
1968 /* Outline the box and have no internal lines*/
1973 n_cols - 1, n_rows - 1);
1975 tab_hline (tbl, TAL_2, 0, n_cols - 1, n_heading_rows );
1977 tab_vline (tbl, TAL_2, n_heading_columns, 0, n_rows - 1);
1980 tab_title (tbl, _("Percentiles"));
1983 tab_hline (tbl, TAL_1, n_heading_columns, n_cols - 1, 1 );
1990 n_heading_columns - 1, n_rows - 1);
1996 n_heading_columns, n_heading_rows - 1,
1997 n_cols - 1, n_rows - 1);
1999 tab_joint_text(tbl, n_heading_columns + 1, 0,
2001 TAB_CENTER | TAT_TITLE ,
2006 /* Put in the percentile break points as headings */
2008 struct percentile **p = (struct percentile **) hsh_sort(ptiles);
2013 tab_float(tbl, n_heading_columns + i++ , 1,
2022 for ( i = 0 ; i < n_dep_var ; ++i )
2024 const int n_stat_rows = 2;
2025 const int row = n_heading_rows + i * n_stat_rows * n_factors ;
2028 tab_hline(tbl, TAL_1, 0, n_cols - 1, row );
2031 i * n_stat_rows * n_factors + n_heading_rows,
2032 TAB_LEFT | TAT_TITLE,
2033 var_to_string(dependent_var[i])
2038 struct factor_statistics **fs = fctr->fs;
2041 tab_text (tbl, 1, n_heading_rows - 1,
2042 TAB_CENTER | TAT_TITLE,
2043 var_to_string(fctr->indep_var[0]));
2046 if ( fctr->indep_var[1])
2047 tab_text (tbl, 2, n_heading_rows - 1, TAB_CENTER | TAT_TITLE,
2048 var_to_string(fctr->indep_var[1]));
2053 static union value prev ;
2055 const int row = n_heading_rows + n_stat_rows *
2056 ( ( i * n_factors ) + count );
2059 if ( 0 != compare_values(&prev, &(*fs)->id[0],
2060 fctr->indep_var[0]->width))
2064 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
2068 TAB_LEFT | TAT_TITLE,
2069 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
2075 prev = (*fs)->id[0];
2077 if (fctr->indep_var[1] && count > 0 )
2078 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
2080 if ( fctr->indep_var[1])
2081 tab_text (tbl, 2, row,
2082 TAB_LEFT | TAT_TITLE,
2083 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
2087 populate_percentiles(tbl, n_heading_columns - 1,
2099 populate_percentiles(tbl, n_heading_columns - 1,
2100 i * n_stat_rows * n_factors + n_heading_rows,
2117 populate_percentiles(struct tab_table *tbl, int col, int row,
2118 const struct metrics *m)
2122 struct percentile **p = (struct percentile **) hsh_sort(m->ptile_hash);
2126 TAB_LEFT | TAT_TITLE,
2127 _("Tukey\'s Hinges")
2132 TAB_LEFT | TAT_TITLE,
2133 ptile_alg_desc[m->ptile_alg]
2140 tab_float(tbl, col + i + 1 , row,
2143 if ( (*p)->p == 25 )
2144 tab_float(tbl, col + i + 1 , row + 1,
2148 if ( (*p)->p == 50 )
2149 tab_float(tbl, col + i + 1 , row + 1,
2153 if ( (*p)->p == 75 )
2154 tab_float(tbl, col + i + 1 , row + 1,
2169 factor_to_string(const struct factor *fctr,
2170 struct factor_statistics *fs,
2171 const struct variable *var)
2174 static char buf1[100];
2180 sprintf(buf1, "%s (",var_to_string(var) );
2183 snprintf(buf2, 100, "%s = %s",
2184 var_to_string(fctr->indep_var[0]),
2185 value_to_string(&fs->id[0],fctr->indep_var[0]));
2189 if ( fctr->indep_var[1] )
2191 sprintf(buf2, "; %s = %s)",
2192 var_to_string(fctr->indep_var[1]),
2193 value_to_string(&fs->id[1],
2194 fctr->indep_var[1]));
2209 factor_to_string_concise(const struct factor *fctr,
2210 struct factor_statistics *fs)
2214 static char buf[100];
2218 snprintf(buf, 100, "%s",
2219 value_to_string(&fs->id[0], fctr->indep_var[0]));
2221 if ( fctr->indep_var[1] )
2223 sprintf(buf2, ",%s)", value_to_string(&fs->id[1], fctr->indep_var[1]) );