1 /* PSPP - EXAMINE data for normality . -*-c-*-
3 Copyright (C) 2004 Free Software Foundation, Inc.
4 Author: John Darrington 2004
6 This program is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
30 #include "dictionary.h"
38 #include "value-labels.h"
43 #include "factor_stats.h"
45 #include "percentiles.h"
55 +missing=miss:pairwise/!listwise,
57 incl:include/!exclude;
58 +compare=cmp:variables/!groups;
61 +plot[plt_]=stemleaf,boxplot,npplot,:spreadlevel(*d:n),histogram,all,none;
63 +statistics[st_]=descriptives,:extreme(*d:n),all,none.
72 static struct cmd_examine cmd;
74 static struct variable **dependent_vars;
76 static int n_dependent_vars;
81 /* The independent variable */
82 struct variable *indep_var[2];
85 /* Hash table of factor stats indexed by 2 values */
86 struct hsh_table *fstats;
88 /* The hash table after it has been crunched */
89 struct factor_statistics **fs;
95 /* Linked list of factors */
96 static struct factor *factors=0;
98 static struct metrics *totals=0;
100 /* Parse the clause specifying the factors */
101 static int examine_parse_independent_vars(struct cmd_examine *cmd);
105 /* Output functions */
106 static void show_summary(struct variable **dependent_var, int n_dep_var,
107 const struct factor *f);
109 static void show_extremes(struct variable **dependent_var,
111 const struct factor *factor,
114 static void show_descriptives(struct variable **dependent_var,
116 struct factor *factor);
118 static void show_percentiles(struct variable **dependent_var,
120 struct factor *factor);
125 void np_plot(const struct metrics *m, const char *factorname);
128 void box_plot_group(const struct factor *fctr,
129 const struct variable **vars, int n_vars,
130 const struct variable *id
134 void box_plot_variables(const struct factor *fctr,
135 const struct variable **vars, int n_vars,
136 const struct variable *id
141 /* Per Split function */
142 static void run_examine(const struct casefile *cf, void *cmd_);
144 static void output_examine(void);
147 void factor_calc(struct ccase *c, int case_no,
148 double weight, int case_missing);
151 /* Represent a factor as a string, so it can be
152 printed in a human readable fashion */
153 const char * factor_to_string(const struct factor *fctr,
154 struct factor_statistics *fs,
155 const struct variable *var);
158 /* Represent a factor as a string, so it can be
159 printed in a human readable fashion,
160 but sacrificing some readablility for the sake of brevity */
161 const char *factor_to_string_concise(const struct factor *fctr,
162 struct factor_statistics *fs);
167 /* Function to use for testing for missing values */
168 static is_missing_func value_is_missing;
173 static subc_list_double percentile_list;
175 static enum pc_alg percentile_algorithm;
177 static short sbc_percentile;
184 subc_list_double_create(&percentile_list);
185 percentile_algorithm = PC_HAVERAGE;
187 if ( !parse_examine(&cmd) )
190 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
191 if (cmd.incl == XMN_INCLUDE )
192 value_is_missing = is_system_missing;
194 value_is_missing = is_missing;
196 if ( cmd.st_n == SYSMIS )
199 if ( ! cmd.sbc_cinterval)
200 cmd.n_cinterval[0] = 95.0;
202 /* If descriptives have been requested, make sure the
203 quartiles are calculated */
204 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] )
206 subc_list_double_push(&percentile_list, 25);
207 subc_list_double_push(&percentile_list, 50);
208 subc_list_double_push(&percentile_list, 75);
211 multipass_procedure_with_splits (run_examine, &cmd);
218 if ( dependent_vars )
219 free (dependent_vars);
222 struct factor *f = factors ;
225 struct factor *ff = f;
229 hsh_destroy ( ff->fstats ) ;
234 subc_list_double_destroy(&percentile_list);
241 /* Show all the appropriate tables */
247 /* Show totals if appropriate */
248 if ( ! cmd.sbc_nototal || factors == 0 )
250 show_summary(dependent_vars, n_dependent_vars, 0);
252 if ( cmd.sbc_statistics )
254 if ( cmd.a_statistics[XMN_ST_EXTREME])
255 show_extremes(dependent_vars, n_dependent_vars, 0, cmd.st_n);
257 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
258 show_descriptives(dependent_vars, n_dependent_vars, 0);
261 if ( sbc_percentile )
262 show_percentiles(dependent_vars, n_dependent_vars, 0);
267 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
269 for ( v = 0 ; v < n_dependent_vars; ++v )
270 np_plot(&totals[v], var_to_string(dependent_vars[v]));
273 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
275 if ( cmd.cmp == XMN_GROUPS )
277 box_plot_group(0, dependent_vars, n_dependent_vars,
281 box_plot_variables(0, dependent_vars, n_dependent_vars,
285 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
287 for ( v = 0 ; v < n_dependent_vars; ++v )
289 struct normal_curve normal;
291 normal.N = totals[v].n;
292 normal.mean = totals[v].mean;
293 normal.stddev = totals[v].stddev;
295 histogram_plot(totals[v].histogram,
296 var_to_string(dependent_vars[v]),
306 /* Show grouped statistics as appropriate */
310 show_summary(dependent_vars, n_dependent_vars, fctr);
312 if ( cmd.sbc_statistics )
314 if ( cmd.a_statistics[XMN_ST_EXTREME])
315 show_extremes(dependent_vars, n_dependent_vars, fctr, cmd.st_n);
317 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
318 show_descriptives(dependent_vars, n_dependent_vars, fctr);
321 if ( sbc_percentile )
322 show_percentiles(dependent_vars, n_dependent_vars, fctr);
329 struct factor_statistics **fs = fctr->fs ;
331 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
333 if ( cmd.cmp == XMN_VARIABLES )
334 box_plot_variables(fctr, dependent_vars, n_dependent_vars,
337 box_plot_group(fctr, dependent_vars, n_dependent_vars,
341 for ( v = 0 ; v < n_dependent_vars; ++v )
344 for ( fs = fctr->fs ; *fs ; ++fs )
346 const char *s = factor_to_string(fctr, *fs, dependent_vars[v]);
348 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
349 np_plot(&(*fs)->m[v], s);
351 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
353 struct normal_curve normal;
355 normal.N = (*fs)->m[v].n;
356 normal.mean = (*fs)->m[v].mean;
357 normal.stddev = (*fs)->m[v].stddev;
359 histogram_plot((*fs)->m[v].histogram,
363 } /* for ( fs .... */
365 } /* for ( v = 0 ..... */
375 /* Create a hash table of percentiles and their values from the list of
377 static struct hsh_table *
378 list_to_ptile_hash(const subc_list_double *l)
382 struct hsh_table *h ;
384 h = hsh_create(subc_list_double_count(l),
385 (hsh_compare_func *) ptile_compare,
386 (hsh_hash_func *) ptile_hash,
387 (hsh_free_func *) free,
391 for ( i = 0 ; i < subc_list_double_count(l) ; ++i )
393 struct percentile *p = xmalloc (sizeof (struct percentile));
395 p->p = subc_list_double_at(l,i);
406 /* Parse the PERCENTILES subcommand */
408 xmn_custom_percentiles(struct cmd_examine *p UNUSED)
416 while ( lex_is_number() )
418 subc_list_double_push(&percentile_list,lex_number());
428 if ( lex_match_id("HAVERAGE"))
429 percentile_algorithm = PC_HAVERAGE;
431 else if ( lex_match_id("WAVERAGE"))
432 percentile_algorithm = PC_WAVERAGE;
434 else if ( lex_match_id("ROUND"))
435 percentile_algorithm = PC_ROUND;
437 else if ( lex_match_id("EMPIRICAL"))
438 percentile_algorithm = PC_EMPIRICAL;
440 else if ( lex_match_id("AEMPIRICAL"))
441 percentile_algorithm = PC_AEMPIRICAL;
443 else if ( lex_match_id("NONE"))
444 percentile_algorithm = PC_NONE;
447 if ( 0 == subc_list_double_count(&percentile_list))
449 subc_list_double_push(&percentile_list, 5);
450 subc_list_double_push(&percentile_list, 10);
451 subc_list_double_push(&percentile_list, 25);
452 subc_list_double_push(&percentile_list, 50);
453 subc_list_double_push(&percentile_list, 75);
454 subc_list_double_push(&percentile_list, 90);
455 subc_list_double_push(&percentile_list, 95);
461 /* TOTAL and NOTOTAL are simple, mutually exclusive flags */
463 xmn_custom_total(struct cmd_examine *p)
465 if ( p->sbc_nototal )
467 msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
475 xmn_custom_nototal(struct cmd_examine *p)
479 msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
488 /* Parser for the variables sub command
489 Returns 1 on success */
491 xmn_custom_variables(struct cmd_examine *cmd )
495 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
501 if (!parse_variables (default_dict, &dependent_vars, &n_dependent_vars,
502 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
504 free (dependent_vars);
508 assert(n_dependent_vars);
510 totals = xmalloc( sizeof(struct metrics) * n_dependent_vars);
512 if ( lex_match(T_BY))
515 success = examine_parse_independent_vars(cmd);
516 if ( success != 1 ) {
517 free (dependent_vars);
528 /* Parse the clause specifying the factors */
530 examine_parse_independent_vars(struct cmd_examine *cmd)
533 struct factor *sf = xmalloc(sizeof(struct factor));
535 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
543 sf->indep_var[0] = parse_variable();
544 sf->indep_var[1] = 0;
551 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
558 sf->indep_var[1] = parse_variable();
563 sf->fstats = hsh_create(4,
564 (hsh_compare_func *) factor_statistics_compare,
565 (hsh_hash_func *) factor_statistics_hash,
566 (hsh_free_func *) factor_statistics_free,
574 if ( token == '.' || token == '/' )
577 success = examine_parse_independent_vars(cmd);
588 void populate_percentiles(struct tab_table *tbl, int col, int row,
589 const struct metrics *m);
591 void populate_descriptives(struct tab_table *t, int col, int row,
592 const struct metrics *fs);
594 void populate_extremes(struct tab_table *t, int col, int row, int n,
595 const struct metrics *m);
597 void populate_summary(struct tab_table *t, int col, int row,
598 const struct metrics *m);
603 static int bad_weight_warn = 1;
606 /* Perform calculations for the sub factors */
608 factor_calc(struct ccase *c, int case_no, double weight, int case_missing)
611 struct factor *fctr = factors;
615 struct factor_statistics **foo ;
616 union value indep_vals[2] ;
618 indep_vals[0] = * case_data(c, fctr->indep_var[0]->fv);
620 if ( fctr->indep_var[1] )
621 indep_vals[1] = * case_data(c, fctr->indep_var[1]->fv);
623 indep_vals[1].f = SYSMIS;
625 assert(fctr->fstats);
627 foo = ( struct factor_statistics ** )
628 hsh_probe(fctr->fstats, (void *) &indep_vals);
633 *foo = create_factor_statistics(n_dependent_vars,
637 for ( v = 0 ; v < n_dependent_vars ; ++v )
639 metrics_precalc( &(*foo)->m[v] );
644 for ( v = 0 ; v < n_dependent_vars ; ++v )
646 const struct variable *var = dependent_vars[v];
647 const union value *val = case_data (c, var->fv);
649 if ( value_is_missing(val,var) || case_missing )
652 metrics_calc( &(*foo)->m[v], val, weight, case_no);
666 run_examine(const struct casefile *cf, void *cmd_ )
668 struct casereader *r;
672 const struct cmd_examine *cmd = (struct cmd_examine *) cmd_;
674 /* Make sure we haven't got rubbish left over from a
677 struct factor *fctr = factors;
680 struct factor *next = fctr->next;
682 hsh_clear(fctr->fstats);
691 for ( v = 0 ; v < n_dependent_vars ; ++v )
692 metrics_precalc(&totals[v]);
694 for(r = casefile_get_reader (cf);
695 casereader_read (r, &c) ;
699 const int case_no = casereader_cnum(r);
701 const double weight =
702 dict_get_case_weight(default_dict, &c, &bad_weight_warn);
704 if ( cmd->miss == XMN_LISTWISE )
706 for ( v = 0 ; v < n_dependent_vars ; ++v )
708 const struct variable *var = dependent_vars[v];
709 const union value *val = case_data (&c, var->fv);
711 if ( value_is_missing(val,var))
717 for ( v = 0 ; v < n_dependent_vars ; ++v )
719 const struct variable *var = dependent_vars[v];
720 const union value *val = case_data (&c, var->fv);
722 if ( value_is_missing(val,var) || case_missing )
725 metrics_calc(&totals[v], val, weight, case_no);
729 factor_calc(&c, case_no, weight, case_missing);
734 for ( v = 0 ; v < n_dependent_vars ; ++v)
739 struct hsh_iterator hi;
740 struct factor_statistics *fs;
742 for ( fs = hsh_first(fctr->fstats, &hi);
744 fs = hsh_next(fctr->fstats, &hi))
747 fs->m[v].ptile_hash = list_to_ptile_hash(&percentile_list);
748 fs->m[v].ptile_alg = percentile_algorithm;
749 metrics_postcalc(&fs->m[v]);
755 totals[v].ptile_hash = list_to_ptile_hash(&percentile_list);
756 totals[v].ptile_alg = percentile_algorithm;
757 metrics_postcalc(&totals[v]);
761 /* Make sure that the combination of factors are complete */
766 struct hsh_iterator hi;
767 struct hsh_iterator hi0;
768 struct hsh_iterator hi1;
769 struct factor_statistics *fs;
771 struct hsh_table *idh0=0;
772 struct hsh_table *idh1=0;
776 idh0 = hsh_create(4, (hsh_compare_func *) compare_values,
777 (hsh_hash_func *) hash_value,
780 idh1 = hsh_create(4, (hsh_compare_func *) compare_values,
781 (hsh_hash_func *) hash_value,
785 for ( fs = hsh_first(fctr->fstats, &hi);
787 fs = hsh_next(fctr->fstats, &hi))
789 hsh_insert(idh0,(void *) &fs->id[0]);
790 hsh_insert(idh1,(void *) &fs->id[1]);
793 /* Ensure that the factors combination is complete */
794 for ( val0 = hsh_first(idh0, &hi0);
796 val0 = hsh_next(idh0, &hi0))
798 for ( val1 = hsh_first(idh1, &hi1);
800 val1 = hsh_next(idh1, &hi1))
802 struct factor_statistics **ffs;
807 ffs = (struct factor_statistics **)
808 hsh_probe(fctr->fstats, (void *) &key );
812 (*ffs) = create_factor_statistics (n_dependent_vars,
814 for ( i = 0 ; i < n_dependent_vars ; ++i )
815 metrics_precalc( &(*ffs)->m[i]);
823 fctr->fs = (struct factor_statistics **) hsh_sort_copy(fctr->fstats);
834 for ( i = 0 ; i < n_dependent_vars ; ++i )
836 metrics_destroy(&totals[i]);
844 show_summary(struct variable **dependent_var, int n_dep_var,
845 const struct factor *fctr)
847 static const char *subtitle[]=
855 int heading_columns ;
857 const int heading_rows = 3;
858 struct tab_table *tbl;
866 n_factors = hsh_count(fctr->fstats);
867 n_rows = n_dep_var * n_factors ;
869 if ( fctr->indep_var[1] )
878 n_rows += heading_rows;
880 n_cols = heading_columns + 6;
882 tbl = tab_create (n_cols,n_rows,0);
883 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
885 tab_dim (tbl, tab_natural_dimensions);
887 /* Outline the box */
892 n_cols - 1, n_rows - 1);
894 /* Vertical lines for the data only */
899 n_cols - 1, n_rows - 1);
902 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
903 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, 1 );
904 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, heading_rows -1 );
906 tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1);
909 tab_title (tbl, 0, _("Case Processing Summary"));
912 tab_joint_text(tbl, heading_columns, 0,
914 TAB_CENTER | TAT_TITLE,
917 /* Remove lines ... */
924 for ( i = 0 ; i < 3 ; ++i )
926 tab_text (tbl, heading_columns + i*2 , 2, TAB_CENTER | TAT_TITLE,
929 tab_text (tbl, heading_columns + i*2 + 1, 2, TAB_CENTER | TAT_TITLE,
932 tab_joint_text(tbl, heading_columns + i*2 , 1,
933 heading_columns + i*2 + 1, 1,
934 TAB_CENTER | TAT_TITLE,
937 tab_box (tbl, -1, -1,
939 heading_columns + i*2, 1,
940 heading_columns + i*2 + 1, 1);
945 /* Titles for the independent variables */
948 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
949 var_to_string(fctr->indep_var[0]));
951 if ( fctr->indep_var[1] )
953 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
954 var_to_string(fctr->indep_var[1]));
960 for ( i = 0 ; i < n_dep_var ; ++i )
964 n_factors = hsh_count(fctr->fstats);
968 tab_hline(tbl, TAL_1, 0, n_cols -1 , i * n_factors + heading_rows);
971 0, i * n_factors + heading_rows,
972 TAB_LEFT | TAT_TITLE,
973 var_to_string(dependent_var[i])
978 populate_summary(tbl, heading_columns,
979 (i * n_factors) + heading_rows,
985 struct factor_statistics **fs = fctr->fs;
990 static union value prev;
992 if ( 0 != compare_values(&prev, &(*fs)->id[0],
993 fctr->indep_var[0]->width))
997 (i * n_factors ) + count +
999 TAB_LEFT | TAT_TITLE,
1000 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1003 if (fctr->indep_var[1] && count > 0 )
1004 tab_hline(tbl, TAL_1, 1, n_cols - 1,
1005 (i * n_factors ) + count + heading_rows);
1009 prev = (*fs)->id[0];
1012 if ( fctr->indep_var[1])
1015 (i * n_factors ) + count +
1017 TAB_LEFT | TAT_TITLE,
1018 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1021 populate_summary(tbl, heading_columns,
1022 (i * n_factors) + count
1037 populate_summary(struct tab_table *t, int col, int row,
1038 const struct metrics *m)
1041 const double total = m->n + m->n_missing ;
1043 tab_float(t, col + 0, row + 0, TAB_RIGHT, m->n, 8, 0);
1044 tab_float(t, col + 2, row + 0, TAB_RIGHT, m->n_missing, 8, 0);
1045 tab_float(t, col + 4, row + 0, TAB_RIGHT, total, 8, 0);
1049 tab_text (t, col + 1, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1050 100.0 * m->n / total );
1052 tab_text (t, col + 3, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1053 100.0 * m->n_missing / total );
1055 /* This seems a bit pointless !!! */
1056 tab_text (t, col + 5, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1057 100.0 * total / total );
1068 show_extremes(struct variable **dependent_var, int n_dep_var,
1069 const struct factor *fctr, int n_extremities)
1072 int heading_columns ;
1074 const int heading_rows = 1;
1075 struct tab_table *tbl;
1082 heading_columns = 2;
1083 n_factors = hsh_count(fctr->fstats);
1085 n_rows = n_dep_var * 2 * n_extremities * n_factors;
1087 if ( fctr->indep_var[1] )
1088 heading_columns = 3;
1092 heading_columns = 1;
1093 n_rows = n_dep_var * 2 * n_extremities;
1096 n_rows += heading_rows;
1098 heading_columns += 2;
1099 n_cols = heading_columns + 2;
1101 tbl = tab_create (n_cols,n_rows,0);
1102 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
1104 tab_dim (tbl, tab_natural_dimensions);
1106 /* Outline the box, No internal lines*/
1111 n_cols - 1, n_rows - 1);
1113 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1115 tab_title (tbl, 0, _("Extreme Values"));
1117 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows -1);
1118 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows -1);
1122 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1123 var_to_string(fctr->indep_var[0]));
1125 if ( fctr->indep_var[1] )
1126 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1127 var_to_string(fctr->indep_var[1]));
1130 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Value"));
1131 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Case Number"));
1133 for ( i = 0 ; i < n_dep_var ; ++i )
1137 tab_hline(tbl, TAL_1, 0, n_cols -1 ,
1138 i * 2 * n_extremities * n_factors + heading_rows);
1141 i * 2 * n_extremities * n_factors + heading_rows,
1142 TAB_LEFT | TAT_TITLE,
1143 var_to_string(dependent_var[i])
1148 populate_extremes(tbl, heading_columns - 2,
1149 i * 2 * n_extremities * n_factors + heading_rows,
1150 n_extremities, &totals[i]);
1154 struct factor_statistics **fs = fctr->fs;
1159 static union value prev ;
1161 const int row = heading_rows + ( 2 * n_extremities ) *
1162 ( ( i * n_factors ) + count );
1165 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1166 fctr->indep_var[0]->width))
1170 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1174 TAB_LEFT | TAT_TITLE,
1175 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1179 prev = (*fs)->id[0];
1181 if (fctr->indep_var[1] && count > 0 )
1182 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
1184 if ( fctr->indep_var[1])
1185 tab_text (tbl, 2, row,
1186 TAB_LEFT | TAT_TITLE,
1187 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1190 populate_extremes(tbl, heading_columns - 2,
1205 /* Fill in the extremities table */
1207 populate_extremes(struct tab_table *t,
1208 int col, int row, int n, const struct metrics *m)
1214 tab_text(t, col, row,
1215 TAB_RIGHT | TAT_TITLE ,
1219 tab_text(t, col, row + n ,
1220 TAB_RIGHT | TAT_TITLE ,
1225 tab_hline(t, TAL_1, col, col + 3, row + n );
1227 for (extremity = 0; extremity < n ; ++extremity )
1230 tab_float(t, col + 1, row + extremity,
1232 extremity + 1, 8, 0);
1236 tab_float(t, col + 1, row + extremity + n,
1238 extremity + 1, 8, 0);
1244 for (idx = 0, extremity = 0; extremity < n && idx < m->n_data ; ++idx )
1247 const struct weighted_value *wv = m->wvp[idx];
1248 struct case_node *cn = wv->case_nos;
1251 for (j = 0 ; j < wv->w ; ++j )
1253 if ( extremity + j >= n )
1256 tab_float(t, col + 3, row + extremity + j + n,
1260 tab_float(t, col + 2, row + extremity + j + n,
1269 extremity += wv->w ;
1274 for (idx = m->n_data - 1, extremity = 0; extremity < n && idx >= 0; --idx )
1277 const struct weighted_value *wv = m->wvp[idx];
1278 struct case_node *cn = wv->case_nos;
1280 for (j = 0 ; j < wv->w ; ++j )
1282 if ( extremity + j >= n )
1285 tab_float(t, col + 3, row + extremity + j,
1289 tab_float(t, col + 2, row + extremity + j,
1298 extremity += wv->w ;
1303 /* Show the descriptives table */
1305 show_descriptives(struct variable **dependent_var,
1307 struct factor *fctr)
1310 int heading_columns ;
1312 const int n_stat_rows = 13;
1314 const int heading_rows = 1;
1316 struct tab_table *tbl;
1323 heading_columns = 4;
1324 n_factors = hsh_count(fctr->fstats);
1326 n_rows = n_dep_var * n_stat_rows * n_factors;
1328 if ( fctr->indep_var[1] )
1329 heading_columns = 5;
1333 heading_columns = 3;
1334 n_rows = n_dep_var * n_stat_rows;
1337 n_rows += heading_rows;
1339 n_cols = heading_columns + 2;
1342 tbl = tab_create (n_cols, n_rows, 0);
1344 tab_headers (tbl, heading_columns + 1, 0, heading_rows, 0);
1346 tab_dim (tbl, tab_natural_dimensions);
1348 /* Outline the box and have no internal lines*/
1353 n_cols - 1, n_rows - 1);
1355 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1357 tab_vline (tbl, TAL_1, heading_columns, 0, n_rows - 1);
1358 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows - 1);
1359 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows - 1);
1361 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Statistic"));
1362 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
1364 tab_title (tbl, 0, _("Descriptives"));
1367 for ( i = 0 ; i < n_dep_var ; ++i )
1369 const int row = heading_rows + i * n_stat_rows * n_factors ;
1372 tab_hline(tbl, TAL_1, 0, n_cols - 1, row );
1375 i * n_stat_rows * n_factors + heading_rows,
1376 TAB_LEFT | TAT_TITLE,
1377 var_to_string(dependent_var[i])
1383 struct factor_statistics **fs = fctr->fs;
1386 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1387 var_to_string(fctr->indep_var[0]));
1390 if ( fctr->indep_var[1])
1391 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1392 var_to_string(fctr->indep_var[1]));
1397 static union value prev ;
1399 const int row = heading_rows + n_stat_rows *
1400 ( ( i * n_factors ) + count );
1403 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1404 fctr->indep_var[0]->width))
1408 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1412 TAB_LEFT | TAT_TITLE,
1413 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1417 prev = (*fs)->id[0];
1419 if (fctr->indep_var[1] && count > 0 )
1420 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
1422 if ( fctr->indep_var[1])
1423 tab_text (tbl, 2, row,
1424 TAB_LEFT | TAT_TITLE,
1425 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1428 populate_descriptives(tbl, heading_columns - 2,
1440 populate_descriptives(tbl, heading_columns - 2,
1441 i * n_stat_rows * n_factors + heading_rows,
1453 /* Fill in the descriptives data */
1455 populate_descriptives(struct tab_table *tbl, int col, int row,
1456 const struct metrics *m)
1459 const double t = gsl_cdf_tdist_Qinv(1 - cmd.n_cinterval[0]/100.0/2.0, \
1465 TAB_LEFT | TAT_TITLE,
1468 tab_float (tbl, col + 2,
1474 tab_float (tbl, col + 3,
1483 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1484 _("%g%% Confidence Interval for Mean"), cmd.n_cinterval[0]);
1487 tab_text (tbl, col + 1,
1489 TAB_LEFT | TAT_TITLE,
1492 tab_float (tbl, col + 2,
1495 m->mean - t * m->se_mean,
1498 tab_text (tbl, col + 1,
1500 TAB_LEFT | TAT_TITLE,
1504 tab_float (tbl, col + 2,
1507 m->mean + t * m->se_mean,
1512 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1513 _("5%% Trimmed Mean"));
1515 tab_float (tbl, col + 2,
1523 TAB_LEFT | TAT_TITLE,
1527 struct percentile *p;
1530 p = hsh_find(m->ptile_hash, &d);
1535 tab_float (tbl, col + 2,
1545 TAB_LEFT | TAT_TITLE,
1548 tab_float (tbl, col + 2,
1557 TAB_LEFT | TAT_TITLE,
1558 _("Std. Deviation"));
1561 tab_float (tbl, col + 2,
1570 TAB_LEFT | TAT_TITLE,
1573 tab_float (tbl, col + 2,
1581 TAB_LEFT | TAT_TITLE,
1584 tab_float (tbl, col + 2,
1593 TAB_LEFT | TAT_TITLE,
1597 tab_float (tbl, col + 2,
1605 TAB_LEFT | TAT_TITLE,
1606 _("Interquartile Range"));
1609 struct percentile *p1;
1610 struct percentile *p2;
1613 p1 = hsh_find(m->ptile_hash, &d);
1616 p2 = hsh_find(m->ptile_hash, &d);
1621 tab_float (tbl, col + 2,
1632 TAB_LEFT | TAT_TITLE,
1636 tab_float (tbl, col + 2,
1642 /* stderr of skewness */
1643 tab_float (tbl, col + 3,
1652 TAB_LEFT | TAT_TITLE,
1656 tab_float (tbl, col + 2,
1662 /* stderr of kurtosis */
1663 tab_float (tbl, col + 3,
1675 box_plot_variables(const struct factor *fctr,
1676 const struct variable **vars, int n_vars,
1677 const struct variable *id)
1681 struct factor_statistics **fs ;
1685 box_plot_group(fctr, vars, n_vars, id);
1689 for ( fs = fctr->fs ; *fs ; ++fs )
1691 double y_min = DBL_MAX;
1692 double y_max = -DBL_MAX;
1693 struct chart *ch = chart_create();
1694 const char *s = factor_to_string(fctr, *fs, 0 );
1696 chart_write_title(ch, s);
1698 for ( i = 0 ; i < n_vars ; ++i )
1700 y_max = max(y_max, (*fs)->m[i].max);
1701 y_min = min(y_min, (*fs)->m[i].min);
1704 boxplot_draw_yscale(ch, y_max, y_min);
1706 for ( i = 0 ; i < n_vars ; ++i )
1709 const double box_width = (ch->data_right - ch->data_left)
1712 const double box_centre = ( i * 2 + 1) * box_width
1715 boxplot_draw_boxplot(ch,
1716 box_centre, box_width,
1718 var_to_string(vars[i]));
1730 /* Do a box plot, grouping all factors into one plot ;
1731 each dependent variable has its own plot.
1734 box_plot_group(const struct factor *fctr,
1735 const struct variable **vars,
1737 const struct variable *id UNUSED)
1742 for ( i = 0 ; i < n_vars ; ++i )
1744 struct factor_statistics **fs ;
1747 ch = chart_create();
1749 boxplot_draw_yscale(ch, totals[i].max, totals[i].min);
1755 for ( fs = fctr->fs ; *fs ; ++fs )
1758 chart_write_title(ch, _("Boxplot of %s vs. %s"),
1759 var_to_string(vars[i]), var_to_string(fctr->indep_var[0]) );
1761 for ( fs = fctr->fs ; *fs ; ++fs )
1764 const char *s = factor_to_string_concise(fctr, *fs);
1766 const double box_width = (ch->data_right - ch->data_left)
1767 / (n_factors * 2.0 ) ;
1769 const double box_centre = ( f++ * 2 + 1) * box_width
1772 boxplot_draw_boxplot(ch,
1773 box_centre, box_width,
1780 const double box_width = (ch->data_right - ch->data_left) / 3.0;
1781 const double box_centre = (ch->data_right + ch->data_left) / 2.0;
1783 chart_write_title(ch, _("Boxplot"));
1785 boxplot_draw_boxplot(ch,
1786 box_centre, box_width,
1788 var_to_string(vars[i]) );
1797 /* Plot the normal and detrended normal plots for m
1798 Label the plots with factorname */
1800 np_plot(const struct metrics *m, const char *factorname)
1803 double yfirst=0, ylast=0;
1806 struct chart *np_chart;
1808 /* Detrended Normal Plot */
1809 struct chart *dnp_chart;
1811 /* The slope and intercept of the ideal normal probability line */
1812 const double slope = 1.0 / m->stddev;
1813 const double intercept = - m->mean / m->stddev;
1815 /* Cowardly refuse to plot an empty data set */
1816 if ( m->n_data == 0 )
1819 np_chart = chart_create();
1820 dnp_chart = chart_create();
1822 if ( !np_chart || ! dnp_chart )
1825 chart_write_title(np_chart, _("Normal Q-Q Plot of %s"), factorname);
1826 chart_write_xlabel(np_chart, _("Observed Value"));
1827 chart_write_ylabel(np_chart, _("Expected Normal"));
1830 chart_write_title(dnp_chart, _("Detrended Normal Q-Q Plot of %s"),
1832 chart_write_xlabel(dnp_chart, _("Observed Value"));
1833 chart_write_ylabel(dnp_chart, _("Dev from Normal"));
1835 yfirst = gsl_cdf_ugaussian_Pinv (m->wvp[0]->rank / ( m->n + 1));
1836 ylast = gsl_cdf_ugaussian_Pinv (m->wvp[m->n_data-1]->rank / ( m->n + 1));
1840 /* Need to make sure that both the scatter plot and the ideal fit into the
1842 double x_lower = min(m->min, (yfirst - intercept) / slope) ;
1843 double x_upper = max(m->max, (ylast - intercept) / slope) ;
1844 double slack = (x_upper - x_lower) * 0.05 ;
1846 chart_write_xscale(np_chart, x_lower - slack, x_upper + slack, 5);
1848 chart_write_xscale(dnp_chart, m->min, m->max, 5);
1852 chart_write_yscale(np_chart, yfirst, ylast, 5);
1855 /* We have to cache the detrended data, beacause we need to
1856 find its limits before we can plot it */
1857 double *d_data = xmalloc (m->n_data * sizeof(double));
1858 double d_max = -DBL_MAX;
1859 double d_min = DBL_MAX;
1860 for ( i = 0 ; i < m->n_data; ++i )
1862 const double ns = gsl_cdf_ugaussian_Pinv (m->wvp[i]->rank / ( m->n + 1));
1864 chart_datum(np_chart, 0, m->wvp[i]->v.f, ns);
1866 d_data[i] = (m->wvp[i]->v.f - m->mean) / m->stddev - ns;
1868 if ( d_data[i] < d_min ) d_min = d_data[i];
1869 if ( d_data[i] > d_max ) d_max = d_data[i];
1871 chart_write_yscale(dnp_chart, d_min, d_max, 5);
1873 for ( i = 0 ; i < m->n_data; ++i )
1874 chart_datum(dnp_chart, 0, m->wvp[i]->v.f, d_data[i]);
1879 chart_line(np_chart, slope, intercept, yfirst, ylast , CHART_DIM_Y);
1880 chart_line(dnp_chart, 0, 0, m->min, m->max , CHART_DIM_X);
1882 chart_submit(np_chart);
1883 chart_submit(dnp_chart);
1889 /* Show the percentiles */
1891 show_percentiles(struct variable **dependent_var,
1893 struct factor *fctr)
1895 struct tab_table *tbl;
1901 struct hsh_table *ptiles ;
1903 int n_heading_columns;
1904 const int n_heading_rows = 2;
1905 const int n_stat_rows = 2;
1911 struct factor_statistics **fs = fctr->fs ;
1912 n_heading_columns = 3;
1913 n_factors = hsh_count(fctr->fstats);
1915 ptiles = (*fs)->m[0].ptile_hash;
1917 if ( fctr->indep_var[1] )
1918 n_heading_columns = 4;
1923 n_heading_columns = 2;
1925 ptiles = totals[0].ptile_hash;
1928 n_ptiles = hsh_count(ptiles);
1930 n_rows = n_heading_rows + n_dep_var * n_stat_rows * n_factors;
1932 n_cols = n_heading_columns + n_ptiles ;
1934 tbl = tab_create (n_cols, n_rows, 0);
1936 tab_headers (tbl, n_heading_columns + 1, 0, n_heading_rows, 0);
1938 tab_dim (tbl, tab_natural_dimensions);
1940 /* Outline the box and have no internal lines*/
1945 n_cols - 1, n_rows - 1);
1947 tab_hline (tbl, TAL_2, 0, n_cols - 1, n_heading_rows );
1949 tab_vline (tbl, TAL_2, n_heading_columns, 0, n_rows - 1);
1952 tab_title (tbl, 0, _("Percentiles"));
1955 tab_hline (tbl, TAL_1, n_heading_columns, n_cols - 1, 1 );
1962 n_heading_columns - 1, n_rows - 1);
1968 n_heading_columns, n_heading_rows - 1,
1969 n_cols - 1, n_rows - 1);
1971 tab_joint_text(tbl, n_heading_columns + 1, 0,
1973 TAB_CENTER | TAT_TITLE ,
1978 /* Put in the percentile break points as headings */
1980 struct percentile **p = (struct percentile **) hsh_sort(ptiles);
1985 tab_float(tbl, n_heading_columns + i++ , 1,
1994 for ( i = 0 ; i < n_dep_var ; ++i )
1996 const int n_stat_rows = 2;
1997 const int row = n_heading_rows + i * n_stat_rows * n_factors ;
2000 tab_hline(tbl, TAL_1, 0, n_cols - 1, row );
2003 i * n_stat_rows * n_factors + n_heading_rows,
2004 TAB_LEFT | TAT_TITLE,
2005 var_to_string(dependent_var[i])
2010 struct factor_statistics **fs = fctr->fs;
2013 tab_text (tbl, 1, n_heading_rows - 1,
2014 TAB_CENTER | TAT_TITLE,
2015 var_to_string(fctr->indep_var[0]));
2018 if ( fctr->indep_var[1])
2019 tab_text (tbl, 2, n_heading_rows - 1, TAB_CENTER | TAT_TITLE,
2020 var_to_string(fctr->indep_var[1]));
2025 static union value prev ;
2027 const int row = n_heading_rows + n_stat_rows *
2028 ( ( i * n_factors ) + count );
2031 if ( 0 != compare_values(&prev, &(*fs)->id[0],
2032 fctr->indep_var[0]->width))
2036 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
2040 TAB_LEFT | TAT_TITLE,
2041 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
2047 prev = (*fs)->id[0];
2049 if (fctr->indep_var[1] && count > 0 )
2050 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
2052 if ( fctr->indep_var[1])
2053 tab_text (tbl, 2, row,
2054 TAB_LEFT | TAT_TITLE,
2055 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
2059 populate_percentiles(tbl, n_heading_columns - 1,
2071 populate_percentiles(tbl, n_heading_columns - 1,
2072 i * n_stat_rows * n_factors + n_heading_rows,
2089 populate_percentiles(struct tab_table *tbl, int col, int row,
2090 const struct metrics *m)
2094 struct percentile **p = (struct percentile **) hsh_sort(m->ptile_hash);
2098 TAB_LEFT | TAT_TITLE,
2099 _("Tukey\'s Hinges")
2104 TAB_LEFT | TAT_TITLE,
2105 ptile_alg_desc[m->ptile_alg]
2112 tab_float(tbl, col + i + 1 , row,
2115 if ( (*p)->p == 25 )
2116 tab_float(tbl, col + i + 1 , row + 1,
2120 if ( (*p)->p == 50 )
2121 tab_float(tbl, col + i + 1 , row + 1,
2125 if ( (*p)->p == 75 )
2126 tab_float(tbl, col + i + 1 , row + 1,
2141 factor_to_string(const struct factor *fctr,
2142 struct factor_statistics *fs,
2143 const struct variable *var)
2146 static char buf1[100];
2152 sprintf(buf1, "%s (",var_to_string(var) );
2155 snprintf(buf2, 100, "%s = %s",
2156 var_to_string(fctr->indep_var[0]),
2157 value_to_string(&fs->id[0],fctr->indep_var[0]));
2161 if ( fctr->indep_var[1] )
2163 sprintf(buf2, "; %s = %s)",
2164 var_to_string(fctr->indep_var[1]),
2165 value_to_string(&fs->id[1],
2166 fctr->indep_var[1]));
2181 factor_to_string_concise(const struct factor *fctr,
2182 struct factor_statistics *fs)
2186 static char buf[100];
2190 snprintf(buf, 100, "%s",
2191 value_to_string(&fs->id[0], fctr->indep_var[0]));
2193 if ( fctr->indep_var[1] )
2195 sprintf(buf2, ",%s)", value_to_string(&fs->id[1], fctr->indep_var[1]) );