1 /* PSPP - EXAMINE data for normality . -*-c-*-
3 Copyright (C) 2004 Free Software Foundation, Inc.
4 Author: John Darrington 2004
6 This program is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
30 #include "dictionary.h"
38 #include "value-labels.h"
40 #include "procedure.h"
43 #include "factor-stats.h"
45 #include "percentiles.h"
46 #include "box-whisker.h"
47 #include "cartesian.h"
50 #define _(msgid) gettext (msgid)
51 #define N_(msgid) msgid
55 #include "plot-hist.h"
56 #include "plot-chart.h"
63 +missing=miss:pairwise/!listwise,
65 incl:include/!exclude;
66 +compare=cmp:variables/!groups;
69 +plot[plt_]=stemleaf,boxplot,npplot,:spreadlevel(*d:n),histogram,all,none;
71 +statistics[st_]=descriptives,:extreme(*d:n),all,none.
80 static struct cmd_examine cmd;
82 static struct variable **dependent_vars;
84 static size_t n_dependent_vars;
89 /* The independent variable */
90 struct variable *indep_var[2];
93 /* Hash table of factor stats indexed by 2 values */
94 struct hsh_table *fstats;
96 /* The hash table after it has been crunched */
97 struct factor_statistics **fs;
103 /* Linked list of factors */
104 static struct factor *factors=0;
106 static struct metrics *totals=0;
108 /* Parse the clause specifying the factors */
109 static int examine_parse_independent_vars(struct cmd_examine *cmd);
113 /* Output functions */
114 static void show_summary(struct variable **dependent_var, int n_dep_var,
115 const struct factor *f);
117 static void show_extremes(struct variable **dependent_var,
119 const struct factor *factor,
122 static void show_descriptives(struct variable **dependent_var,
124 struct factor *factor);
126 static void show_percentiles(struct variable **dependent_var,
128 struct factor *factor);
133 void np_plot(const struct metrics *m, const char *factorname);
136 void box_plot_group(const struct factor *fctr,
137 const struct variable **vars, int n_vars,
138 const struct variable *id
142 void box_plot_variables(const struct factor *fctr,
143 const struct variable **vars, int n_vars,
144 const struct variable *id
149 /* Per Split function */
150 static bool run_examine(const struct casefile *cf, void *cmd_);
152 static void output_examine(void);
155 void factor_calc(struct ccase *c, int case_no,
156 double weight, int case_missing);
159 /* Represent a factor as a string, so it can be
160 printed in a human readable fashion */
161 const char * factor_to_string(const struct factor *fctr,
162 struct factor_statistics *fs,
163 const struct variable *var);
166 /* Represent a factor as a string, so it can be
167 printed in a human readable fashion,
168 but sacrificing some readablility for the sake of brevity */
169 const char *factor_to_string_concise(const struct factor *fctr,
170 struct factor_statistics *fs);
175 /* Function to use for testing for missing values */
176 static is_missing_func *value_is_missing;
181 static subc_list_double percentile_list;
183 static enum pc_alg percentile_algorithm;
185 static short sbc_percentile;
193 subc_list_double_create(&percentile_list);
194 percentile_algorithm = PC_HAVERAGE;
196 if ( !parse_examine(&cmd) )
199 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
200 if (cmd.incl == XMN_INCLUDE )
201 value_is_missing = mv_is_value_system_missing;
203 value_is_missing = mv_is_value_missing;
205 if ( cmd.st_n == SYSMIS )
208 if ( ! cmd.sbc_cinterval)
209 cmd.n_cinterval[0] = 95.0;
211 /* If descriptives have been requested, make sure the
212 quartiles are calculated */
213 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] )
215 subc_list_double_push(&percentile_list, 25);
216 subc_list_double_push(&percentile_list, 50);
217 subc_list_double_push(&percentile_list, 75);
220 ok = multipass_procedure_with_splits (run_examine, &cmd);
227 if ( dependent_vars )
228 free (dependent_vars);
231 struct factor *f = factors ;
234 struct factor *ff = f;
238 hsh_destroy ( ff->fstats ) ;
244 subc_list_double_destroy(&percentile_list);
246 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
251 /* Show all the appropriate tables */
257 /* Show totals if appropriate */
258 if ( ! cmd.sbc_nototal || factors == 0 )
260 show_summary(dependent_vars, n_dependent_vars, 0);
262 if ( cmd.sbc_statistics )
264 if ( cmd.a_statistics[XMN_ST_EXTREME])
265 show_extremes(dependent_vars, n_dependent_vars, 0, cmd.st_n);
267 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
268 show_descriptives(dependent_vars, n_dependent_vars, 0);
271 if ( sbc_percentile )
272 show_percentiles(dependent_vars, n_dependent_vars, 0);
277 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
279 for ( v = 0 ; v < n_dependent_vars; ++v )
280 np_plot(&totals[v], var_to_string(dependent_vars[v]));
283 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
285 if ( cmd.cmp == XMN_GROUPS )
287 box_plot_group(0, dependent_vars, n_dependent_vars,
291 box_plot_variables(0, dependent_vars, n_dependent_vars,
295 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
297 for ( v = 0 ; v < n_dependent_vars; ++v )
299 struct normal_curve normal;
301 normal.N = totals[v].n;
302 normal.mean = totals[v].mean;
303 normal.stddev = totals[v].stddev;
305 histogram_plot(totals[v].histogram,
306 var_to_string(dependent_vars[v]),
316 /* Show grouped statistics as appropriate */
320 show_summary(dependent_vars, n_dependent_vars, fctr);
322 if ( cmd.sbc_statistics )
324 if ( cmd.a_statistics[XMN_ST_EXTREME])
325 show_extremes(dependent_vars, n_dependent_vars, fctr, cmd.st_n);
327 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
328 show_descriptives(dependent_vars, n_dependent_vars, fctr);
331 if ( sbc_percentile )
332 show_percentiles(dependent_vars, n_dependent_vars, fctr);
339 struct factor_statistics **fs = fctr->fs ;
341 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
343 if ( cmd.cmp == XMN_VARIABLES )
344 box_plot_variables(fctr, dependent_vars, n_dependent_vars,
347 box_plot_group(fctr, dependent_vars, n_dependent_vars,
351 for ( v = 0 ; v < n_dependent_vars; ++v )
354 for ( fs = fctr->fs ; *fs ; ++fs )
356 const char *s = factor_to_string(fctr, *fs, dependent_vars[v]);
358 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
359 np_plot(&(*fs)->m[v], s);
361 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
363 struct normal_curve normal;
365 normal.N = (*fs)->m[v].n;
366 normal.mean = (*fs)->m[v].mean;
367 normal.stddev = (*fs)->m[v].stddev;
369 histogram_plot((*fs)->m[v].histogram,
373 } /* for ( fs .... */
375 } /* for ( v = 0 ..... */
385 /* Create a hash table of percentiles and their values from the list of
387 static struct hsh_table *
388 list_to_ptile_hash(const subc_list_double *l)
392 struct hsh_table *h ;
394 h = hsh_create(subc_list_double_count(l),
395 (hsh_compare_func *) ptile_compare,
396 (hsh_hash_func *) ptile_hash,
397 (hsh_free_func *) free,
401 for ( i = 0 ; i < subc_list_double_count(l) ; ++i )
403 struct percentile *p = xmalloc (sizeof *p);
405 p->p = subc_list_double_at(l,i);
416 /* Parse the PERCENTILES subcommand */
418 xmn_custom_percentiles(struct cmd_examine *p UNUSED)
426 while ( lex_is_number() )
428 subc_list_double_push(&percentile_list,lex_number());
438 if ( lex_match_id("HAVERAGE"))
439 percentile_algorithm = PC_HAVERAGE;
441 else if ( lex_match_id("WAVERAGE"))
442 percentile_algorithm = PC_WAVERAGE;
444 else if ( lex_match_id("ROUND"))
445 percentile_algorithm = PC_ROUND;
447 else if ( lex_match_id("EMPIRICAL"))
448 percentile_algorithm = PC_EMPIRICAL;
450 else if ( lex_match_id("AEMPIRICAL"))
451 percentile_algorithm = PC_AEMPIRICAL;
453 else if ( lex_match_id("NONE"))
454 percentile_algorithm = PC_NONE;
457 if ( 0 == subc_list_double_count(&percentile_list))
459 subc_list_double_push(&percentile_list, 5);
460 subc_list_double_push(&percentile_list, 10);
461 subc_list_double_push(&percentile_list, 25);
462 subc_list_double_push(&percentile_list, 50);
463 subc_list_double_push(&percentile_list, 75);
464 subc_list_double_push(&percentile_list, 90);
465 subc_list_double_push(&percentile_list, 95);
471 /* TOTAL and NOTOTAL are simple, mutually exclusive flags */
473 xmn_custom_total(struct cmd_examine *p)
475 if ( p->sbc_nototal )
477 msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
485 xmn_custom_nototal(struct cmd_examine *p)
489 msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
498 /* Parser for the variables sub command
499 Returns 1 on success */
501 xmn_custom_variables(struct cmd_examine *cmd )
505 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
511 if (!parse_variables (default_dict, &dependent_vars, &n_dependent_vars,
512 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
514 free (dependent_vars);
518 assert(n_dependent_vars);
520 totals = xnmalloc (n_dependent_vars, sizeof *totals);
522 if ( lex_match(T_BY))
525 success = examine_parse_independent_vars(cmd);
526 if ( success != 1 ) {
527 free (dependent_vars);
538 /* Parse the clause specifying the factors */
540 examine_parse_independent_vars(struct cmd_examine *cmd)
543 struct factor *sf = xmalloc (sizeof *sf);
545 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
553 sf->indep_var[0] = parse_variable();
554 sf->indep_var[1] = 0;
561 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
568 sf->indep_var[1] = parse_variable();
573 sf->fstats = hsh_create(4,
574 (hsh_compare_func *) factor_statistics_compare,
575 (hsh_hash_func *) factor_statistics_hash,
576 (hsh_free_func *) factor_statistics_free,
584 if ( token == '.' || token == '/' )
587 success = examine_parse_independent_vars(cmd);
598 void populate_percentiles(struct tab_table *tbl, int col, int row,
599 const struct metrics *m);
601 void populate_descriptives(struct tab_table *t, int col, int row,
602 const struct metrics *fs);
604 void populate_extremes(struct tab_table *t, int col, int row, int n,
605 const struct metrics *m);
607 void populate_summary(struct tab_table *t, int col, int row,
608 const struct metrics *m);
613 static int bad_weight_warn = 1;
616 /* Perform calculations for the sub factors */
618 factor_calc(struct ccase *c, int case_no, double weight, int case_missing)
621 struct factor *fctr = factors;
625 struct factor_statistics **foo ;
626 union value indep_vals[2] ;
628 indep_vals[0] = * case_data(c, fctr->indep_var[0]->fv);
630 if ( fctr->indep_var[1] )
631 indep_vals[1] = * case_data(c, fctr->indep_var[1]->fv);
633 indep_vals[1].f = SYSMIS;
635 assert(fctr->fstats);
637 foo = ( struct factor_statistics ** )
638 hsh_probe(fctr->fstats, (void *) &indep_vals);
643 *foo = create_factor_statistics(n_dependent_vars,
647 for ( v = 0 ; v < n_dependent_vars ; ++v )
649 metrics_precalc( &(*foo)->m[v] );
654 for ( v = 0 ; v < n_dependent_vars ; ++v )
656 const struct variable *var = dependent_vars[v];
657 const union value *val = case_data (c, var->fv);
659 if ( value_is_missing (&var->miss, val) || case_missing )
662 metrics_calc( &(*foo)->m[v], val, weight, case_no);
673 run_examine(const struct casefile *cf, void *cmd_ )
675 struct casereader *r;
679 const struct cmd_examine *cmd = (struct cmd_examine *) cmd_;
681 /* Make sure we haven't got rubbish left over from a
683 struct factor *fctr = factors;
686 struct factor *next = fctr->next;
688 hsh_clear(fctr->fstats);
697 for ( v = 0 ; v < n_dependent_vars ; ++v )
698 metrics_precalc(&totals[v]);
700 for(r = casefile_get_reader (cf);
701 casereader_read (r, &c) ;
705 const int case_no = casereader_cnum(r);
707 const double weight =
708 dict_get_case_weight(default_dict, &c, &bad_weight_warn);
710 if ( cmd->miss == XMN_LISTWISE )
712 for ( v = 0 ; v < n_dependent_vars ; ++v )
714 const struct variable *var = dependent_vars[v];
715 const union value *val = case_data (&c, var->fv);
717 if ( value_is_missing(&var->miss, val))
723 for ( v = 0 ; v < n_dependent_vars ; ++v )
725 const struct variable *var = dependent_vars[v];
726 const union value *val = case_data (&c, var->fv);
728 if ( value_is_missing(&var->miss, val) || case_missing )
731 metrics_calc(&totals[v], val, weight, case_no);
735 factor_calc(&c, case_no, weight, case_missing);
740 for ( v = 0 ; v < n_dependent_vars ; ++v)
745 struct hsh_iterator hi;
746 struct factor_statistics *fs;
748 for ( fs = hsh_first(fctr->fstats, &hi);
750 fs = hsh_next(fctr->fstats, &hi))
753 fs->m[v].ptile_hash = list_to_ptile_hash(&percentile_list);
754 fs->m[v].ptile_alg = percentile_algorithm;
755 metrics_postcalc(&fs->m[v]);
761 totals[v].ptile_hash = list_to_ptile_hash(&percentile_list);
762 totals[v].ptile_alg = percentile_algorithm;
763 metrics_postcalc(&totals[v]);
767 /* Make sure that the combination of factors are complete */
772 struct hsh_iterator hi;
773 struct hsh_iterator hi0;
774 struct hsh_iterator hi1;
775 struct factor_statistics *fs;
777 struct hsh_table *idh0=0;
778 struct hsh_table *idh1=0;
782 idh0 = hsh_create(4, (hsh_compare_func *) compare_values,
783 (hsh_hash_func *) hash_value,
786 idh1 = hsh_create(4, (hsh_compare_func *) compare_values,
787 (hsh_hash_func *) hash_value,
791 for ( fs = hsh_first(fctr->fstats, &hi);
793 fs = hsh_next(fctr->fstats, &hi))
795 hsh_insert(idh0,(void *) &fs->id[0]);
796 hsh_insert(idh1,(void *) &fs->id[1]);
799 /* Ensure that the factors combination is complete */
800 for ( val0 = hsh_first(idh0, &hi0);
802 val0 = hsh_next(idh0, &hi0))
804 for ( val1 = hsh_first(idh1, &hi1);
806 val1 = hsh_next(idh1, &hi1))
808 struct factor_statistics **ffs;
813 ffs = (struct factor_statistics **)
814 hsh_probe(fctr->fstats, (void *) &key );
818 (*ffs) = create_factor_statistics (n_dependent_vars,
820 for ( i = 0 ; i < n_dependent_vars ; ++i )
821 metrics_precalc( &(*ffs)->m[i]);
829 fctr->fs = (struct factor_statistics **) hsh_sort_copy(fctr->fstats);
840 for ( i = 0 ; i < n_dependent_vars ; ++i )
842 metrics_destroy(&totals[i]);
851 show_summary(struct variable **dependent_var, int n_dep_var,
852 const struct factor *fctr)
854 static const char *subtitle[]=
862 int heading_columns ;
864 const int heading_rows = 3;
865 struct tab_table *tbl;
873 n_factors = hsh_count(fctr->fstats);
874 n_rows = n_dep_var * n_factors ;
876 if ( fctr->indep_var[1] )
885 n_rows += heading_rows;
887 n_cols = heading_columns + 6;
889 tbl = tab_create (n_cols,n_rows,0);
890 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
892 tab_dim (tbl, tab_natural_dimensions);
894 /* Outline the box */
899 n_cols - 1, n_rows - 1);
901 /* Vertical lines for the data only */
906 n_cols - 1, n_rows - 1);
909 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
910 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, 1 );
911 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, heading_rows -1 );
913 tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1);
916 tab_title (tbl, 0, _("Case Processing Summary"));
919 tab_joint_text(tbl, heading_columns, 0,
921 TAB_CENTER | TAT_TITLE,
924 /* Remove lines ... */
931 for ( i = 0 ; i < 3 ; ++i )
933 tab_text (tbl, heading_columns + i*2 , 2, TAB_CENTER | TAT_TITLE,
936 tab_text (tbl, heading_columns + i*2 + 1, 2, TAB_CENTER | TAT_TITLE,
939 tab_joint_text(tbl, heading_columns + i*2 , 1,
940 heading_columns + i*2 + 1, 1,
941 TAB_CENTER | TAT_TITLE,
944 tab_box (tbl, -1, -1,
946 heading_columns + i*2, 1,
947 heading_columns + i*2 + 1, 1);
952 /* Titles for the independent variables */
955 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
956 var_to_string(fctr->indep_var[0]));
958 if ( fctr->indep_var[1] )
960 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
961 var_to_string(fctr->indep_var[1]));
967 for ( i = 0 ; i < n_dep_var ; ++i )
971 n_factors = hsh_count(fctr->fstats);
975 tab_hline(tbl, TAL_1, 0, n_cols -1 , i * n_factors + heading_rows);
978 0, i * n_factors + heading_rows,
979 TAB_LEFT | TAT_TITLE,
980 var_to_string(dependent_var[i])
985 populate_summary(tbl, heading_columns,
986 (i * n_factors) + heading_rows,
992 struct factor_statistics **fs = fctr->fs;
997 static union value prev;
999 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1000 fctr->indep_var[0]->width))
1004 (i * n_factors ) + count +
1006 TAB_LEFT | TAT_TITLE,
1007 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1010 if (fctr->indep_var[1] && count > 0 )
1011 tab_hline(tbl, TAL_1, 1, n_cols - 1,
1012 (i * n_factors ) + count + heading_rows);
1016 prev = (*fs)->id[0];
1019 if ( fctr->indep_var[1])
1022 (i * n_factors ) + count +
1024 TAB_LEFT | TAT_TITLE,
1025 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1028 populate_summary(tbl, heading_columns,
1029 (i * n_factors) + count
1044 populate_summary(struct tab_table *t, int col, int row,
1045 const struct metrics *m)
1048 const double total = m->n + m->n_missing ;
1050 tab_float(t, col + 0, row + 0, TAB_RIGHT, m->n, 8, 0);
1051 tab_float(t, col + 2, row + 0, TAB_RIGHT, m->n_missing, 8, 0);
1052 tab_float(t, col + 4, row + 0, TAB_RIGHT, total, 8, 0);
1056 tab_text (t, col + 1, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1057 100.0 * m->n / total );
1059 tab_text (t, col + 3, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1060 100.0 * m->n_missing / total );
1062 /* This seems a bit pointless !!! */
1063 tab_text (t, col + 5, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1064 100.0 * total / total );
1075 show_extremes(struct variable **dependent_var, int n_dep_var,
1076 const struct factor *fctr, int n_extremities)
1079 int heading_columns ;
1081 const int heading_rows = 1;
1082 struct tab_table *tbl;
1089 heading_columns = 2;
1090 n_factors = hsh_count(fctr->fstats);
1092 n_rows = n_dep_var * 2 * n_extremities * n_factors;
1094 if ( fctr->indep_var[1] )
1095 heading_columns = 3;
1099 heading_columns = 1;
1100 n_rows = n_dep_var * 2 * n_extremities;
1103 n_rows += heading_rows;
1105 heading_columns += 2;
1106 n_cols = heading_columns + 2;
1108 tbl = tab_create (n_cols,n_rows,0);
1109 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
1111 tab_dim (tbl, tab_natural_dimensions);
1113 /* Outline the box, No internal lines*/
1118 n_cols - 1, n_rows - 1);
1120 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1122 tab_title (tbl, 0, _("Extreme Values"));
1124 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows -1);
1125 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows -1);
1129 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1130 var_to_string(fctr->indep_var[0]));
1132 if ( fctr->indep_var[1] )
1133 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1134 var_to_string(fctr->indep_var[1]));
1137 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Value"));
1138 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Case Number"));
1140 for ( i = 0 ; i < n_dep_var ; ++i )
1144 tab_hline(tbl, TAL_1, 0, n_cols -1 ,
1145 i * 2 * n_extremities * n_factors + heading_rows);
1148 i * 2 * n_extremities * n_factors + heading_rows,
1149 TAB_LEFT | TAT_TITLE,
1150 var_to_string(dependent_var[i])
1155 populate_extremes(tbl, heading_columns - 2,
1156 i * 2 * n_extremities * n_factors + heading_rows,
1157 n_extremities, &totals[i]);
1161 struct factor_statistics **fs = fctr->fs;
1166 static union value prev ;
1168 const int row = heading_rows + ( 2 * n_extremities ) *
1169 ( ( i * n_factors ) + count );
1172 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1173 fctr->indep_var[0]->width))
1177 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1181 TAB_LEFT | TAT_TITLE,
1182 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1186 prev = (*fs)->id[0];
1188 if (fctr->indep_var[1] && count > 0 )
1189 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
1191 if ( fctr->indep_var[1])
1192 tab_text (tbl, 2, row,
1193 TAB_LEFT | TAT_TITLE,
1194 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1197 populate_extremes(tbl, heading_columns - 2,
1212 /* Fill in the extremities table */
1214 populate_extremes(struct tab_table *t,
1215 int col, int row, int n, const struct metrics *m)
1221 tab_text(t, col, row,
1222 TAB_RIGHT | TAT_TITLE ,
1226 tab_text(t, col, row + n ,
1227 TAB_RIGHT | TAT_TITLE ,
1232 tab_hline(t, TAL_1, col, col + 3, row + n );
1234 for (extremity = 0; extremity < n ; ++extremity )
1237 tab_float(t, col + 1, row + extremity,
1239 extremity + 1, 8, 0);
1243 tab_float(t, col + 1, row + extremity + n,
1245 extremity + 1, 8, 0);
1251 for (idx = 0, extremity = 0; extremity < n && idx < m->n_data ; ++idx )
1254 const struct weighted_value *wv = m->wvp[idx];
1255 struct case_node *cn = wv->case_nos;
1258 for (j = 0 ; j < wv->w ; ++j )
1260 if ( extremity + j >= n )
1263 tab_float(t, col + 3, row + extremity + j + n,
1267 tab_float(t, col + 2, row + extremity + j + n,
1276 extremity += wv->w ;
1281 for (idx = m->n_data - 1, extremity = 0; extremity < n && idx >= 0; --idx )
1284 const struct weighted_value *wv = m->wvp[idx];
1285 struct case_node *cn = wv->case_nos;
1287 for (j = 0 ; j < wv->w ; ++j )
1289 if ( extremity + j >= n )
1292 tab_float(t, col + 3, row + extremity + j,
1296 tab_float(t, col + 2, row + extremity + j,
1305 extremity += wv->w ;
1310 /* Show the descriptives table */
1312 show_descriptives(struct variable **dependent_var,
1314 struct factor *fctr)
1317 int heading_columns ;
1319 const int n_stat_rows = 13;
1321 const int heading_rows = 1;
1323 struct tab_table *tbl;
1330 heading_columns = 4;
1331 n_factors = hsh_count(fctr->fstats);
1333 n_rows = n_dep_var * n_stat_rows * n_factors;
1335 if ( fctr->indep_var[1] )
1336 heading_columns = 5;
1340 heading_columns = 3;
1341 n_rows = n_dep_var * n_stat_rows;
1344 n_rows += heading_rows;
1346 n_cols = heading_columns + 2;
1349 tbl = tab_create (n_cols, n_rows, 0);
1351 tab_headers (tbl, heading_columns + 1, 0, heading_rows, 0);
1353 tab_dim (tbl, tab_natural_dimensions);
1355 /* Outline the box and have no internal lines*/
1360 n_cols - 1, n_rows - 1);
1362 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1364 tab_vline (tbl, TAL_1, heading_columns, 0, n_rows - 1);
1365 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows - 1);
1366 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows - 1);
1368 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Statistic"));
1369 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
1371 tab_title (tbl, 0, _("Descriptives"));
1374 for ( i = 0 ; i < n_dep_var ; ++i )
1376 const int row = heading_rows + i * n_stat_rows * n_factors ;
1379 tab_hline(tbl, TAL_1, 0, n_cols - 1, row );
1382 i * n_stat_rows * n_factors + heading_rows,
1383 TAB_LEFT | TAT_TITLE,
1384 var_to_string(dependent_var[i])
1390 struct factor_statistics **fs = fctr->fs;
1393 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1394 var_to_string(fctr->indep_var[0]));
1397 if ( fctr->indep_var[1])
1398 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1399 var_to_string(fctr->indep_var[1]));
1404 static union value prev ;
1406 const int row = heading_rows + n_stat_rows *
1407 ( ( i * n_factors ) + count );
1410 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1411 fctr->indep_var[0]->width))
1415 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1419 TAB_LEFT | TAT_TITLE,
1420 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1424 prev = (*fs)->id[0];
1426 if (fctr->indep_var[1] && count > 0 )
1427 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
1429 if ( fctr->indep_var[1])
1430 tab_text (tbl, 2, row,
1431 TAB_LEFT | TAT_TITLE,
1432 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1435 populate_descriptives(tbl, heading_columns - 2,
1447 populate_descriptives(tbl, heading_columns - 2,
1448 i * n_stat_rows * n_factors + heading_rows,
1460 /* Fill in the descriptives data */
1462 populate_descriptives(struct tab_table *tbl, int col, int row,
1463 const struct metrics *m)
1466 const double t = gsl_cdf_tdist_Qinv(1 - cmd.n_cinterval[0]/100.0/2.0, \
1472 TAB_LEFT | TAT_TITLE,
1475 tab_float (tbl, col + 2,
1481 tab_float (tbl, col + 3,
1490 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1491 _("%g%% Confidence Interval for Mean"), cmd.n_cinterval[0]);
1494 tab_text (tbl, col + 1,
1496 TAB_LEFT | TAT_TITLE,
1499 tab_float (tbl, col + 2,
1502 m->mean - t * m->se_mean,
1505 tab_text (tbl, col + 1,
1507 TAB_LEFT | TAT_TITLE,
1511 tab_float (tbl, col + 2,
1514 m->mean + t * m->se_mean,
1519 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1520 _("5%% Trimmed Mean"));
1522 tab_float (tbl, col + 2,
1530 TAB_LEFT | TAT_TITLE,
1534 struct percentile *p;
1537 p = hsh_find(m->ptile_hash, &d);
1542 tab_float (tbl, col + 2,
1552 TAB_LEFT | TAT_TITLE,
1555 tab_float (tbl, col + 2,
1564 TAB_LEFT | TAT_TITLE,
1565 _("Std. Deviation"));
1568 tab_float (tbl, col + 2,
1577 TAB_LEFT | TAT_TITLE,
1580 tab_float (tbl, col + 2,
1588 TAB_LEFT | TAT_TITLE,
1591 tab_float (tbl, col + 2,
1600 TAB_LEFT | TAT_TITLE,
1604 tab_float (tbl, col + 2,
1612 TAB_LEFT | TAT_TITLE,
1613 _("Interquartile Range"));
1616 struct percentile *p1;
1617 struct percentile *p2;
1620 p1 = hsh_find(m->ptile_hash, &d);
1623 p2 = hsh_find(m->ptile_hash, &d);
1628 tab_float (tbl, col + 2,
1639 TAB_LEFT | TAT_TITLE,
1643 tab_float (tbl, col + 2,
1649 /* stderr of skewness */
1650 tab_float (tbl, col + 3,
1659 TAB_LEFT | TAT_TITLE,
1663 tab_float (tbl, col + 2,
1669 /* stderr of kurtosis */
1670 tab_float (tbl, col + 3,
1682 box_plot_variables(const struct factor *fctr,
1683 const struct variable **vars, int n_vars,
1684 const struct variable *id)
1688 struct factor_statistics **fs ;
1692 box_plot_group(fctr, vars, n_vars, id);
1696 for ( fs = fctr->fs ; *fs ; ++fs )
1698 double y_min = DBL_MAX;
1699 double y_max = -DBL_MAX;
1700 struct chart *ch = chart_create();
1701 const char *s = factor_to_string(fctr, *fs, 0 );
1703 chart_write_title(ch, s);
1705 for ( i = 0 ; i < n_vars ; ++i )
1707 y_max = max(y_max, (*fs)->m[i].max);
1708 y_min = min(y_min, (*fs)->m[i].min);
1711 boxplot_draw_yscale(ch, y_max, y_min);
1713 for ( i = 0 ; i < n_vars ; ++i )
1716 const double box_width = (ch->data_right - ch->data_left)
1719 const double box_centre = ( i * 2 + 1) * box_width
1722 boxplot_draw_boxplot(ch,
1723 box_centre, box_width,
1725 var_to_string(vars[i]));
1737 /* Do a box plot, grouping all factors into one plot ;
1738 each dependent variable has its own plot.
1741 box_plot_group(const struct factor *fctr,
1742 const struct variable **vars,
1744 const struct variable *id UNUSED)
1749 for ( i = 0 ; i < n_vars ; ++i )
1751 struct factor_statistics **fs ;
1754 ch = chart_create();
1756 boxplot_draw_yscale(ch, totals[i].max, totals[i].min);
1762 for ( fs = fctr->fs ; *fs ; ++fs )
1765 chart_write_title(ch, _("Boxplot of %s vs. %s"),
1766 var_to_string(vars[i]), var_to_string(fctr->indep_var[0]) );
1768 for ( fs = fctr->fs ; *fs ; ++fs )
1771 const char *s = factor_to_string_concise(fctr, *fs);
1773 const double box_width = (ch->data_right - ch->data_left)
1774 / (n_factors * 2.0 ) ;
1776 const double box_centre = ( f++ * 2 + 1) * box_width
1779 boxplot_draw_boxplot(ch,
1780 box_centre, box_width,
1787 const double box_width = (ch->data_right - ch->data_left) / 3.0;
1788 const double box_centre = (ch->data_right + ch->data_left) / 2.0;
1790 chart_write_title(ch, _("Boxplot"));
1792 boxplot_draw_boxplot(ch,
1793 box_centre, box_width,
1795 var_to_string(vars[i]) );
1804 /* Plot the normal and detrended normal plots for m
1805 Label the plots with factorname */
1807 np_plot(const struct metrics *m, const char *factorname)
1810 double yfirst=0, ylast=0;
1813 struct chart *np_chart;
1815 /* Detrended Normal Plot */
1816 struct chart *dnp_chart;
1818 /* The slope and intercept of the ideal normal probability line */
1819 const double slope = 1.0 / m->stddev;
1820 const double intercept = - m->mean / m->stddev;
1822 /* Cowardly refuse to plot an empty data set */
1823 if ( m->n_data == 0 )
1826 np_chart = chart_create();
1827 dnp_chart = chart_create();
1829 if ( !np_chart || ! dnp_chart )
1832 chart_write_title(np_chart, _("Normal Q-Q Plot of %s"), factorname);
1833 chart_write_xlabel(np_chart, _("Observed Value"));
1834 chart_write_ylabel(np_chart, _("Expected Normal"));
1837 chart_write_title(dnp_chart, _("Detrended Normal Q-Q Plot of %s"),
1839 chart_write_xlabel(dnp_chart, _("Observed Value"));
1840 chart_write_ylabel(dnp_chart, _("Dev from Normal"));
1842 yfirst = gsl_cdf_ugaussian_Pinv (m->wvp[0]->rank / ( m->n + 1));
1843 ylast = gsl_cdf_ugaussian_Pinv (m->wvp[m->n_data-1]->rank / ( m->n + 1));
1847 /* Need to make sure that both the scatter plot and the ideal fit into the
1849 double x_lower = min(m->min, (yfirst - intercept) / slope) ;
1850 double x_upper = max(m->max, (ylast - intercept) / slope) ;
1851 double slack = (x_upper - x_lower) * 0.05 ;
1853 chart_write_xscale(np_chart, x_lower - slack, x_upper + slack, 5);
1855 chart_write_xscale(dnp_chart, m->min, m->max, 5);
1859 chart_write_yscale(np_chart, yfirst, ylast, 5);
1862 /* We have to cache the detrended data, beacause we need to
1863 find its limits before we can plot it */
1864 double *d_data = xnmalloc (m->n_data, sizeof *d_data);
1865 double d_max = -DBL_MAX;
1866 double d_min = DBL_MAX;
1867 for ( i = 0 ; i < m->n_data; ++i )
1869 const double ns = gsl_cdf_ugaussian_Pinv (m->wvp[i]->rank / ( m->n + 1));
1871 chart_datum(np_chart, 0, m->wvp[i]->v.f, ns);
1873 d_data[i] = (m->wvp[i]->v.f - m->mean) / m->stddev - ns;
1875 if ( d_data[i] < d_min ) d_min = d_data[i];
1876 if ( d_data[i] > d_max ) d_max = d_data[i];
1878 chart_write_yscale(dnp_chart, d_min, d_max, 5);
1880 for ( i = 0 ; i < m->n_data; ++i )
1881 chart_datum(dnp_chart, 0, m->wvp[i]->v.f, d_data[i]);
1886 chart_line(np_chart, slope, intercept, yfirst, ylast , CHART_DIM_Y);
1887 chart_line(dnp_chart, 0, 0, m->min, m->max , CHART_DIM_X);
1889 chart_submit(np_chart);
1890 chart_submit(dnp_chart);
1896 /* Show the percentiles */
1898 show_percentiles(struct variable **dependent_var,
1900 struct factor *fctr)
1902 struct tab_table *tbl;
1908 struct hsh_table *ptiles ;
1910 int n_heading_columns;
1911 const int n_heading_rows = 2;
1912 const int n_stat_rows = 2;
1918 struct factor_statistics **fs = fctr->fs ;
1919 n_heading_columns = 3;
1920 n_factors = hsh_count(fctr->fstats);
1922 ptiles = (*fs)->m[0].ptile_hash;
1924 if ( fctr->indep_var[1] )
1925 n_heading_columns = 4;
1930 n_heading_columns = 2;
1932 ptiles = totals[0].ptile_hash;
1935 n_ptiles = hsh_count(ptiles);
1937 n_rows = n_heading_rows + n_dep_var * n_stat_rows * n_factors;
1939 n_cols = n_heading_columns + n_ptiles ;
1941 tbl = tab_create (n_cols, n_rows, 0);
1943 tab_headers (tbl, n_heading_columns + 1, 0, n_heading_rows, 0);
1945 tab_dim (tbl, tab_natural_dimensions);
1947 /* Outline the box and have no internal lines*/
1952 n_cols - 1, n_rows - 1);
1954 tab_hline (tbl, TAL_2, 0, n_cols - 1, n_heading_rows );
1956 tab_vline (tbl, TAL_2, n_heading_columns, 0, n_rows - 1);
1959 tab_title (tbl, 0, _("Percentiles"));
1962 tab_hline (tbl, TAL_1, n_heading_columns, n_cols - 1, 1 );
1969 n_heading_columns - 1, n_rows - 1);
1975 n_heading_columns, n_heading_rows - 1,
1976 n_cols - 1, n_rows - 1);
1978 tab_joint_text(tbl, n_heading_columns + 1, 0,
1980 TAB_CENTER | TAT_TITLE ,
1985 /* Put in the percentile break points as headings */
1987 struct percentile **p = (struct percentile **) hsh_sort(ptiles);
1992 tab_float(tbl, n_heading_columns + i++ , 1,
2001 for ( i = 0 ; i < n_dep_var ; ++i )
2003 const int n_stat_rows = 2;
2004 const int row = n_heading_rows + i * n_stat_rows * n_factors ;
2007 tab_hline(tbl, TAL_1, 0, n_cols - 1, row );
2010 i * n_stat_rows * n_factors + n_heading_rows,
2011 TAB_LEFT | TAT_TITLE,
2012 var_to_string(dependent_var[i])
2017 struct factor_statistics **fs = fctr->fs;
2020 tab_text (tbl, 1, n_heading_rows - 1,
2021 TAB_CENTER | TAT_TITLE,
2022 var_to_string(fctr->indep_var[0]));
2025 if ( fctr->indep_var[1])
2026 tab_text (tbl, 2, n_heading_rows - 1, TAB_CENTER | TAT_TITLE,
2027 var_to_string(fctr->indep_var[1]));
2032 static union value prev ;
2034 const int row = n_heading_rows + n_stat_rows *
2035 ( ( i * n_factors ) + count );
2038 if ( 0 != compare_values(&prev, &(*fs)->id[0],
2039 fctr->indep_var[0]->width))
2043 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
2047 TAB_LEFT | TAT_TITLE,
2048 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
2054 prev = (*fs)->id[0];
2056 if (fctr->indep_var[1] && count > 0 )
2057 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
2059 if ( fctr->indep_var[1])
2060 tab_text (tbl, 2, row,
2061 TAB_LEFT | TAT_TITLE,
2062 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
2066 populate_percentiles(tbl, n_heading_columns - 1,
2078 populate_percentiles(tbl, n_heading_columns - 1,
2079 i * n_stat_rows * n_factors + n_heading_rows,
2096 populate_percentiles(struct tab_table *tbl, int col, int row,
2097 const struct metrics *m)
2101 struct percentile **p = (struct percentile **) hsh_sort(m->ptile_hash);
2105 TAB_LEFT | TAT_TITLE,
2106 _("Tukey\'s Hinges")
2111 TAB_LEFT | TAT_TITLE,
2112 ptile_alg_desc[m->ptile_alg]
2119 tab_float(tbl, col + i + 1 , row,
2122 if ( (*p)->p == 25 )
2123 tab_float(tbl, col + i + 1 , row + 1,
2127 if ( (*p)->p == 50 )
2128 tab_float(tbl, col + i + 1 , row + 1,
2132 if ( (*p)->p == 75 )
2133 tab_float(tbl, col + i + 1 , row + 1,
2148 factor_to_string(const struct factor *fctr,
2149 struct factor_statistics *fs,
2150 const struct variable *var)
2153 static char buf1[100];
2159 sprintf(buf1, "%s (",var_to_string(var) );
2162 snprintf(buf2, 100, "%s = %s",
2163 var_to_string(fctr->indep_var[0]),
2164 value_to_string(&fs->id[0],fctr->indep_var[0]));
2168 if ( fctr->indep_var[1] )
2170 sprintf(buf2, "; %s = %s)",
2171 var_to_string(fctr->indep_var[1]),
2172 value_to_string(&fs->id[1],
2173 fctr->indep_var[1]));
2188 factor_to_string_concise(const struct factor *fctr,
2189 struct factor_statistics *fs)
2193 static char buf[100];
2197 snprintf(buf, 100, "%s",
2198 value_to_string(&fs->id[0], fctr->indep_var[0]));
2200 if ( fctr->indep_var[1] )
2202 sprintf(buf2, ",%s)", value_to_string(&fs->id[1], fctr->indep_var[1]) );