1 /* PSPP - EXAMINE data for normality . -*-c-*-
3 Copyright (C) 2004 Free Software Foundation, Inc.
4 Author: John Darrington 2004
6 This program is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
22 #include <gsl/gsl_cdf.h>
30 #include "dictionary.h"
38 #include "value-labels.h"
43 #include "factor_stats.h"
45 #include "percentiles.h"
55 +missing=miss:pairwise/!listwise,
57 incl:include/!exclude;
58 +compare=cmp:variables/!groups;
61 +plot[plt_]=stemleaf,boxplot,npplot,:spreadlevel(*d:n),histogram,all,none;
63 +statistics[st_]=descriptives,:extreme(*d:n),all,none.
72 static struct cmd_examine cmd;
74 static struct variable **dependent_vars;
76 static int n_dependent_vars;
81 /* The independent variable */
82 struct variable *indep_var[2];
85 /* Hash table of factor stats indexed by 2 values */
86 struct hsh_table *fstats;
88 /* The hash table after it has been crunched */
89 struct factor_statistics **fs;
95 /* Linked list of factors */
96 static struct factor *factors=0;
98 static struct metrics *totals=0;
100 /* Parse the clause specifying the factors */
101 static int examine_parse_independent_vars(struct cmd_examine *cmd);
105 /* Output functions */
106 static void show_summary(struct variable **dependent_var, int n_dep_var,
107 const struct factor *f);
109 static void show_extremes(struct variable **dependent_var,
111 const struct factor *factor,
114 static void show_descriptives(struct variable **dependent_var,
116 struct factor *factor);
118 static void show_percentiles(struct variable **dependent_var,
120 struct factor *factor);
125 void np_plot(const struct metrics *m, const char *factorname);
128 void box_plot_group(const struct factor *fctr,
129 const struct variable **vars, int n_vars,
130 const struct variable *id
134 void box_plot_variables(const struct factor *fctr,
135 struct variable **vars, int n_vars,
136 const struct variable *id
141 /* Per Split function */
142 static void run_examine(const struct casefile *cf, void *cmd_);
144 static void output_examine(void);
147 void factor_calc(struct ccase *c, int case_no,
148 double weight, int case_missing);
151 /* Represent a factor as a string, so it can be
152 printed in a human readable fashion */
153 const char * factor_to_string(const struct factor *fctr,
154 struct factor_statistics *fs,
155 const struct variable *var);
158 /* Represent a factor as a string, so it can be
159 printed in a human readable fashion,
160 but sacrificing some readablility for the sake of brevity */
161 const char *factor_to_string_concise(const struct factor *fctr,
162 struct factor_statistics *fs);
167 /* Function to use for testing for missing values */
168 static is_missing_func value_is_missing;
173 static subc_list_double percentile_list;
175 static enum pc_alg percentile_algorithm;
177 static short sbc_percentile;
184 subc_list_double_create(&percentile_list);
185 percentile_algorithm = PC_HAVERAGE;
187 if ( !parse_examine(&cmd) )
190 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
191 if (cmd.incl == XMN_INCLUDE )
192 value_is_missing = is_system_missing;
194 value_is_missing = is_missing;
196 if ( cmd.st_n == SYSMIS )
199 if ( ! cmd.sbc_cinterval)
200 cmd.n_cinterval[0] = 95.0;
202 /* If descriptives have been requested, make sure the
203 quartiles are calculated */
204 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] )
206 subc_list_double_push(&percentile_list, 25);
207 subc_list_double_push(&percentile_list, 50);
208 subc_list_double_push(&percentile_list, 75);
211 multipass_procedure_with_splits (run_examine, &cmd);
216 subc_list_double_destroy(&percentile_list);
223 /* Show all the appropriate tables */
229 /* Show totals if appropriate */
230 if ( ! cmd.sbc_nototal || factors == 0 )
232 show_summary(dependent_vars, n_dependent_vars, 0);
234 if ( cmd.sbc_statistics )
236 if ( cmd.a_statistics[XMN_ST_EXTREME])
237 show_extremes(dependent_vars, n_dependent_vars, 0, cmd.st_n);
239 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
240 show_descriptives(dependent_vars, n_dependent_vars, 0);
243 if ( sbc_percentile )
244 show_percentiles(dependent_vars, n_dependent_vars, 0);
249 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
251 for ( v = 0 ; v < n_dependent_vars; ++v )
252 np_plot(&totals[v], var_to_string(dependent_vars[v]));
255 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
257 if ( cmd.cmp == XMN_GROUPS )
259 box_plot_group(0, dependent_vars, n_dependent_vars,
263 box_plot_variables(0, dependent_vars, n_dependent_vars,
268 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
270 for ( v = 0 ; v < n_dependent_vars; ++v )
272 struct normal_curve normal;
274 normal.N = totals[v].n;
275 normal.mean = totals[v].mean;
276 normal.stddev = totals[v].stddev;
278 histogram_plot(totals[v].histogram,
279 var_to_string(dependent_vars[v]),
290 /* Show grouped statistics as appropriate */
294 show_summary(dependent_vars, n_dependent_vars, fctr);
296 if ( cmd.sbc_statistics )
298 if ( cmd.a_statistics[XMN_ST_EXTREME])
299 show_extremes(dependent_vars, n_dependent_vars, fctr, cmd.st_n);
301 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
302 show_descriptives(dependent_vars, n_dependent_vars, fctr);
305 if ( sbc_percentile )
306 show_percentiles(dependent_vars, n_dependent_vars, fctr);
313 struct factor_statistics **fs = fctr->fs ;
315 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
317 if ( cmd.cmp == XMN_VARIABLES )
318 box_plot_variables(fctr, dependent_vars, n_dependent_vars,
321 box_plot_group(fctr, dependent_vars, n_dependent_vars,
325 for ( v = 0 ; v < n_dependent_vars; ++v )
328 for ( fs = fctr->fs ; *fs ; ++fs )
330 const char *s = factor_to_string(fctr, *fs, dependent_vars[v]);
332 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
333 np_plot(&(*fs)->m[v], s);
336 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
338 struct normal_curve normal;
340 normal.N = (*fs)->m[v].n;
341 normal.mean = (*fs)->m[v].mean;
342 normal.stddev = (*fs)->m[v].stddev;
344 histogram_plot((*fs)->m[v].histogram,
349 } /* for ( fs .... */
351 } /* for ( v = 0 ..... */
361 static struct hsh_table *
362 list_to_ptile_hash(const subc_list_double *l)
366 struct hsh_table *h ;
368 h = hsh_create(subc_list_double_count(l),
369 (hsh_compare_func *) ptile_compare,
370 (hsh_hash_func *) ptile_hash,
371 (hsh_free_func *) free,
375 for ( i = 0 ; i < subc_list_double_count(l) ; ++i )
377 struct percentile *p = xmalloc (sizeof (struct percentile));
379 p->p = subc_list_double_at(l,i);
389 /* Parse the PERCENTILES subcommand */
391 xmn_custom_percentiles(struct cmd_examine *p UNUSED)
399 while ( lex_double_p() )
401 subc_list_double_push(&percentile_list,lex_double());
411 if ( lex_match_id("HAVERAGE"))
412 percentile_algorithm = PC_HAVERAGE;
414 else if ( lex_match_id("WAVERAGE"))
415 percentile_algorithm = PC_WAVERAGE;
417 else if ( lex_match_id("ROUND"))
418 percentile_algorithm = PC_ROUND;
420 else if ( lex_match_id("EMPIRICAL"))
421 percentile_algorithm = PC_EMPIRICAL;
423 else if ( lex_match_id("AEMPIRICAL"))
424 percentile_algorithm = PC_AEMPIRICAL;
426 else if ( lex_match_id("NONE"))
427 percentile_algorithm = PC_NONE;
430 if ( 0 == subc_list_double_count(&percentile_list))
432 subc_list_double_push(&percentile_list, 5);
433 subc_list_double_push(&percentile_list, 10);
434 subc_list_double_push(&percentile_list, 25);
435 subc_list_double_push(&percentile_list, 50);
436 subc_list_double_push(&percentile_list, 75);
437 subc_list_double_push(&percentile_list, 90);
438 subc_list_double_push(&percentile_list, 95);
444 /* TOTAL and NOTOTAL are simple, mutually exclusive flags */
446 xmn_custom_total(struct cmd_examine *p)
448 if ( p->sbc_nototal )
450 msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
458 xmn_custom_nototal(struct cmd_examine *p)
462 msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
471 /* Parser for the variables sub command */
473 xmn_custom_variables(struct cmd_examine *cmd )
478 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
482 if (!parse_variables (default_dict, &dependent_vars, &n_dependent_vars,
483 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
485 free (dependent_vars);
489 assert(n_dependent_vars);
491 totals = xmalloc( sizeof(struct metrics) * n_dependent_vars);
493 if ( lex_match(T_BY))
495 return examine_parse_independent_vars(cmd);
503 /* Parse the clause specifying the factors */
505 examine_parse_independent_vars(struct cmd_examine *cmd)
508 struct factor *sf = xmalloc(sizeof(struct factor));
510 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
515 sf->indep_var[0] = parse_variable();
516 sf->indep_var[1] = 0;
523 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
527 sf->indep_var[1] = parse_variable();
532 sf->fstats = hsh_create(4,
533 (hsh_compare_func *) factor_statistics_compare,
534 (hsh_hash_func *) factor_statistics_hash,
535 (hsh_free_func *) factor_statistics_free,
543 if ( token == '.' || token == '/' )
546 return examine_parse_independent_vars(cmd);
552 void populate_percentiles(struct tab_table *tbl, int col, int row,
553 const struct metrics *m);
555 void populate_descriptives(struct tab_table *t, int col, int row,
556 const struct metrics *fs);
558 void populate_extremes(struct tab_table *t, int col, int row, int n,
559 const struct metrics *m);
561 void populate_summary(struct tab_table *t, int col, int row,
562 const struct metrics *m);
567 static int bad_weight_warn = 1;
570 /* Perform calculations for the sub factors */
572 factor_calc(struct ccase *c, int case_no, double weight, int case_missing)
575 struct factor *fctr = factors;
579 union value indep_vals[2] ;
581 indep_vals[0] = * case_data(c, fctr->indep_var[0]->fv);
583 if ( fctr->indep_var[1] )
584 indep_vals[1] = * case_data(c, fctr->indep_var[1]->fv);
586 indep_vals[1].f = SYSMIS;
588 assert(fctr->fstats);
590 struct factor_statistics **foo = ( struct factor_statistics ** )
591 hsh_probe(fctr->fstats, (void *) &indep_vals);
596 *foo = create_factor_statistics(n_dependent_vars,
600 for ( v = 0 ; v < n_dependent_vars ; ++v )
602 metrics_precalc( &(*foo)->m[v] );
607 for ( v = 0 ; v < n_dependent_vars ; ++v )
609 const struct variable *var = dependent_vars[v];
610 const union value *val = case_data (c, var->fv);
612 if ( value_is_missing(val,var) || case_missing )
615 metrics_calc( &(*foo)->m[v], val, weight, case_no);
629 run_examine(const struct casefile *cf, void *cmd_ )
631 struct casereader *r;
635 const struct cmd_examine *cmd = (struct cmd_examine *) cmd_;
637 /* Make sure we haven't got rubbish left over from a
640 struct factor *fctr = factors;
643 struct factor *next = fctr->next;
645 hsh_clear(fctr->fstats);
654 for ( v = 0 ; v < n_dependent_vars ; ++v )
655 metrics_precalc(&totals[v]);
657 for(r = casefile_get_reader (cf);
658 casereader_read (r, &c) ;
662 const int case_no = casereader_cnum(r);
664 const double weight =
665 dict_get_case_weight(default_dict, &c, &bad_weight_warn);
667 if ( cmd->miss == XMN_LISTWISE )
669 for ( v = 0 ; v < n_dependent_vars ; ++v )
671 const struct variable *var = dependent_vars[v];
672 const union value *val = case_data (&c, var->fv);
674 if ( value_is_missing(val,var))
680 for ( v = 0 ; v < n_dependent_vars ; ++v )
682 const struct variable *var = dependent_vars[v];
683 const union value *val = case_data (&c, var->fv);
685 if ( value_is_missing(val,var) || case_missing )
688 metrics_calc(&totals[v], val, weight, case_no);
692 factor_calc(&c, case_no, weight, case_missing);
697 for ( v = 0 ; v < n_dependent_vars ; ++v)
702 struct hsh_iterator hi;
703 struct factor_statistics *fs;
705 for ( fs = hsh_first(fctr->fstats, &hi);
707 fs = hsh_next(fctr->fstats, &hi))
710 fs->m[v].ptile_hash = list_to_ptile_hash(&percentile_list);
711 fs->m[v].ptile_alg = percentile_algorithm;
712 metrics_postcalc(&fs->m[v]);
718 totals[v].ptile_hash = list_to_ptile_hash(&percentile_list);
719 totals[v].ptile_alg = percentile_algorithm;
720 metrics_postcalc(&totals[v]);
724 /* Make sure that the combination of factors are complete */
729 struct hsh_iterator hi;
730 struct hsh_iterator hi0;
731 struct hsh_iterator hi1;
732 struct factor_statistics *fs;
734 struct hsh_table *idh0=0;
735 struct hsh_table *idh1=0;
739 idh0 = hsh_create(4, (hsh_compare_func *) compare_values,
740 (hsh_hash_func *) hash_value,
743 idh1 = hsh_create(4, (hsh_compare_func *) compare_values,
744 (hsh_hash_func *) hash_value,
748 for ( fs = hsh_first(fctr->fstats, &hi);
750 fs = hsh_next(fctr->fstats, &hi))
752 hsh_insert(idh0,(void *) &fs->id[0]);
753 hsh_insert(idh1,(void *) &fs->id[1]);
756 /* Ensure that the factors combination is complete */
757 for ( val0 = hsh_first(idh0, &hi0);
759 val0 = hsh_next(idh0, &hi0))
761 for ( val1 = hsh_first(idh1, &hi1);
763 val1 = hsh_next(idh1, &hi1))
765 struct factor_statistics **ffs;
770 ffs = (struct factor_statistics **)
771 hsh_probe(fctr->fstats, (void *) &key );
775 (*ffs) = create_factor_statistics (n_dependent_vars,
777 for ( i = 0 ; i < n_dependent_vars ; ++i )
778 metrics_precalc( &(*ffs)->m[i]);
786 fctr->fs = (struct factor_statistics **) hsh_sort_copy(fctr->fstats);
793 for ( v = 0 ; v < n_dependent_vars ; ++v )
794 hsh_destroy(totals[v].ordered_data);
800 show_summary(struct variable **dependent_var, int n_dep_var,
801 const struct factor *fctr)
803 static const char *subtitle[]=
811 int heading_columns ;
813 const int heading_rows = 3;
814 struct tab_table *tbl;
822 n_factors = hsh_count(fctr->fstats);
823 n_rows = n_dep_var * n_factors ;
825 if ( fctr->indep_var[1] )
834 n_rows += heading_rows;
836 n_cols = heading_columns + 6;
838 tbl = tab_create (n_cols,n_rows,0);
839 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
841 tab_dim (tbl, tab_natural_dimensions);
843 /* Outline the box */
848 n_cols - 1, n_rows - 1);
850 /* Vertical lines for the data only */
855 n_cols - 1, n_rows - 1);
858 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
859 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, 1 );
860 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, heading_rows -1 );
862 tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1);
865 tab_title (tbl, 0, _("Case Processing Summary"));
868 tab_joint_text(tbl, heading_columns, 0,
870 TAB_CENTER | TAT_TITLE,
873 /* Remove lines ... */
880 for ( i = 0 ; i < 3 ; ++i )
882 tab_text (tbl, heading_columns + i*2 , 2, TAB_CENTER | TAT_TITLE,
885 tab_text (tbl, heading_columns + i*2 + 1, 2, TAB_CENTER | TAT_TITLE,
888 tab_joint_text(tbl, heading_columns + i*2 , 1,
889 heading_columns + i*2 + 1, 1,
890 TAB_CENTER | TAT_TITLE,
893 tab_box (tbl, -1, -1,
895 heading_columns + i*2, 1,
896 heading_columns + i*2 + 1, 1);
901 /* Titles for the independent variables */
904 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
905 var_to_string(fctr->indep_var[0]));
907 if ( fctr->indep_var[1] )
909 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
910 var_to_string(fctr->indep_var[1]));
916 for ( i = 0 ; i < n_dep_var ; ++i )
920 n_factors = hsh_count(fctr->fstats);
924 tab_hline(tbl, TAL_1, 0, n_cols -1 , i * n_factors + heading_rows);
927 0, i * n_factors + heading_rows,
928 TAB_LEFT | TAT_TITLE,
929 var_to_string(dependent_var[i])
934 populate_summary(tbl, heading_columns,
935 (i * n_factors) + heading_rows,
941 struct factor_statistics **fs = fctr->fs;
946 static union value prev;
948 if ( 0 != compare_values(&prev, &(*fs)->id[0],
949 fctr->indep_var[0]->width))
953 (i * n_factors ) + count +
955 TAB_LEFT | TAT_TITLE,
956 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
959 if (fctr->indep_var[1] && count > 0 )
960 tab_hline(tbl, TAL_1, 1, n_cols - 1,
961 (i * n_factors ) + count + heading_rows);
968 if ( fctr->indep_var[1])
971 (i * n_factors ) + count +
973 TAB_LEFT | TAT_TITLE,
974 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
977 populate_summary(tbl, heading_columns,
978 (i * n_factors) + count
993 populate_summary(struct tab_table *t, int col, int row,
994 const struct metrics *m)
997 const double total = m->n + m->n_missing ;
999 tab_float(t, col + 0, row + 0, TAB_RIGHT, m->n, 8, 0);
1000 tab_float(t, col + 2, row + 0, TAB_RIGHT, m->n_missing, 8, 0);
1001 tab_float(t, col + 4, row + 0, TAB_RIGHT, total, 8, 0);
1005 tab_text (t, col + 1, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1006 100.0 * m->n / total );
1008 tab_text (t, col + 3, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1009 100.0 * m->n_missing / total );
1011 /* This seems a bit pointless !!! */
1012 tab_text (t, col + 5, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1013 100.0 * total / total );
1024 show_extremes(struct variable **dependent_var, int n_dep_var,
1025 const struct factor *fctr, int n_extremities)
1028 int heading_columns ;
1030 const int heading_rows = 1;
1031 struct tab_table *tbl;
1038 heading_columns = 2;
1039 n_factors = hsh_count(fctr->fstats);
1041 n_rows = n_dep_var * 2 * n_extremities * n_factors;
1043 if ( fctr->indep_var[1] )
1044 heading_columns = 3;
1048 heading_columns = 1;
1049 n_rows = n_dep_var * 2 * n_extremities;
1052 n_rows += heading_rows;
1054 heading_columns += 2;
1055 n_cols = heading_columns + 2;
1057 tbl = tab_create (n_cols,n_rows,0);
1058 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
1060 tab_dim (tbl, tab_natural_dimensions);
1062 /* Outline the box, No internal lines*/
1067 n_cols - 1, n_rows - 1);
1069 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1071 tab_title (tbl, 0, _("Extreme Values"));
1073 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows -1);
1074 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows -1);
1078 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1079 var_to_string(fctr->indep_var[0]));
1081 if ( fctr->indep_var[1] )
1082 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1083 var_to_string(fctr->indep_var[1]));
1086 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Value"));
1087 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Case Number"));
1089 for ( i = 0 ; i < n_dep_var ; ++i )
1093 tab_hline(tbl, TAL_1, 0, n_cols -1 ,
1094 i * 2 * n_extremities * n_factors + heading_rows);
1097 i * 2 * n_extremities * n_factors + heading_rows,
1098 TAB_LEFT | TAT_TITLE,
1099 var_to_string(dependent_var[i])
1104 populate_extremes(tbl, heading_columns - 2,
1105 i * 2 * n_extremities * n_factors + heading_rows,
1106 n_extremities, &totals[i]);
1110 struct factor_statistics **fs = fctr->fs;
1115 static union value prev ;
1117 const int row = heading_rows + ( 2 * n_extremities ) *
1118 ( ( i * n_factors ) + count );
1121 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1122 fctr->indep_var[0]->width))
1126 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1130 TAB_LEFT | TAT_TITLE,
1131 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1135 prev = (*fs)->id[0];
1137 if (fctr->indep_var[1] && count > 0 )
1138 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
1140 if ( fctr->indep_var[1])
1141 tab_text (tbl, 2, row,
1142 TAB_LEFT | TAT_TITLE,
1143 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1146 populate_extremes(tbl, heading_columns - 2,
1161 /* Fill in the extremities table */
1163 populate_extremes(struct tab_table *t,
1164 int col, int row, int n, const struct metrics *m)
1170 tab_text(t, col, row,
1171 TAB_RIGHT | TAT_TITLE ,
1175 tab_text(t, col, row + n ,
1176 TAB_RIGHT | TAT_TITLE ,
1181 tab_hline(t, TAL_1, col, col + 3, row + n );
1183 for (extremity = 0; extremity < n ; ++extremity )
1186 tab_float(t, col + 1, row + extremity,
1188 extremity + 1, 8, 0);
1192 tab_float(t, col + 1, row + extremity + n,
1194 extremity + 1, 8, 0);
1200 for (idx = 0, extremity = 0; extremity < n && idx < m->n_data ; ++idx )
1203 const struct weighted_value *wv = m->wvp[idx];
1204 struct case_node *cn = wv->case_nos;
1207 for (j = 0 ; j < wv->w ; ++j )
1209 if ( extremity + j >= n )
1212 tab_float(t, col + 3, row + extremity + j + n,
1216 tab_float(t, col + 2, row + extremity + j + n,
1225 extremity += wv->w ;
1230 for (idx = m->n_data - 1, extremity = 0; extremity < n && idx >= 0; --idx )
1233 const struct weighted_value *wv = m->wvp[idx];
1234 struct case_node *cn = wv->case_nos;
1236 for (j = 0 ; j < wv->w ; ++j )
1238 if ( extremity + j >= n )
1241 tab_float(t, col + 3, row + extremity + j,
1245 tab_float(t, col + 2, row + extremity + j,
1254 extremity += wv->w ;
1259 /* Show the descriptives table */
1261 show_descriptives(struct variable **dependent_var,
1263 struct factor *fctr)
1266 int heading_columns ;
1268 const int n_stat_rows = 13;
1270 const int heading_rows = 1;
1272 struct tab_table *tbl;
1279 heading_columns = 4;
1280 n_factors = hsh_count(fctr->fstats);
1282 n_rows = n_dep_var * n_stat_rows * n_factors;
1284 if ( fctr->indep_var[1] )
1285 heading_columns = 5;
1289 heading_columns = 3;
1290 n_rows = n_dep_var * n_stat_rows;
1293 n_rows += heading_rows;
1295 n_cols = heading_columns + 2;
1298 tbl = tab_create (n_cols, n_rows, 0);
1300 tab_headers (tbl, heading_columns + 1, 0, heading_rows, 0);
1302 tab_dim (tbl, tab_natural_dimensions);
1304 /* Outline the box and have no internal lines*/
1309 n_cols - 1, n_rows - 1);
1311 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1313 tab_vline (tbl, TAL_1, heading_columns, 0, n_rows - 1);
1314 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows - 1);
1315 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows - 1);
1317 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Statistic"));
1318 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
1320 tab_title (tbl, 0, _("Descriptives"));
1323 for ( i = 0 ; i < n_dep_var ; ++i )
1325 const int row = heading_rows + i * n_stat_rows * n_factors ;
1328 tab_hline(tbl, TAL_1, 0, n_cols - 1, row );
1331 i * n_stat_rows * n_factors + heading_rows,
1332 TAB_LEFT | TAT_TITLE,
1333 var_to_string(dependent_var[i])
1339 struct factor_statistics **fs = fctr->fs;
1342 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1343 var_to_string(fctr->indep_var[0]));
1346 if ( fctr->indep_var[1])
1347 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1348 var_to_string(fctr->indep_var[1]));
1353 static union value prev ;
1355 const int row = heading_rows + n_stat_rows *
1356 ( ( i * n_factors ) + count );
1359 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1360 fctr->indep_var[0]->width))
1364 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1368 TAB_LEFT | TAT_TITLE,
1369 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1373 prev = (*fs)->id[0];
1375 if (fctr->indep_var[1] && count > 0 )
1376 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
1378 if ( fctr->indep_var[1])
1379 tab_text (tbl, 2, row,
1380 TAB_LEFT | TAT_TITLE,
1381 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1384 populate_descriptives(tbl, heading_columns - 2,
1396 populate_descriptives(tbl, heading_columns - 2,
1397 i * n_stat_rows * n_factors + heading_rows,
1409 /* Fill in the descriptives data */
1411 populate_descriptives(struct tab_table *tbl, int col, int row,
1412 const struct metrics *m)
1415 const double t = gsl_cdf_tdist_Qinv(1 - cmd.n_cinterval[0]/100.0/2.0, \
1421 TAB_LEFT | TAT_TITLE,
1424 tab_float (tbl, col + 2,
1430 tab_float (tbl, col + 3,
1439 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1440 _("%g%% Confidence Interval for Mean"), cmd.n_cinterval[0]);
1443 tab_text (tbl, col + 1,
1445 TAB_LEFT | TAT_TITLE,
1448 tab_float (tbl, col + 2,
1451 m->mean - t * m->se_mean,
1454 tab_text (tbl, col + 1,
1456 TAB_LEFT | TAT_TITLE,
1460 tab_float (tbl, col + 2,
1463 m->mean + t * m->se_mean,
1468 TAB_LEFT | TAT_TITLE,
1469 _("5% Trimmed Mean"));
1471 tab_float (tbl, col + 2,
1479 TAB_LEFT | TAT_TITLE,
1483 struct percentile *p;
1486 p = hsh_find(m->ptile_hash, &d);
1490 tab_float (tbl, col + 2,
1499 TAB_LEFT | TAT_TITLE,
1502 tab_float (tbl, col + 2,
1511 TAB_LEFT | TAT_TITLE,
1512 _("Std. Deviation"));
1515 tab_float (tbl, col + 2,
1524 TAB_LEFT | TAT_TITLE,
1527 tab_float (tbl, col + 2,
1535 TAB_LEFT | TAT_TITLE,
1538 tab_float (tbl, col + 2,
1547 TAB_LEFT | TAT_TITLE,
1551 tab_float (tbl, col + 2,
1559 TAB_LEFT | TAT_TITLE,
1560 _("Interquartile Range"));
1563 struct percentile *p1;
1564 struct percentile *p2;
1567 p1 = hsh_find(m->ptile_hash, &d);
1570 p2 = hsh_find(m->ptile_hash, &d);
1575 tab_float (tbl, col + 2,
1586 TAB_LEFT | TAT_TITLE,
1590 tab_float (tbl, col + 2,
1596 /* stderr of skewness */
1597 tab_float (tbl, col + 3,
1606 TAB_LEFT | TAT_TITLE,
1610 tab_float (tbl, col + 2,
1616 /* stderr of kurtosis */
1617 tab_float (tbl, col + 3,
1629 box_plot_variables(const struct factor *fctr,
1630 struct variable **vars, int n_vars,
1631 const struct variable *id)
1635 struct factor_statistics **fs ;
1639 box_plot_group(fctr, vars, n_vars, id);
1643 for ( fs = fctr->fs ; *fs ; ++fs )
1645 double y_min = DBL_MAX;
1646 double y_max = -DBL_MAX;
1649 chart_initialise(&ch);
1651 const char *s = factor_to_string(fctr, *fs, 0 );
1653 chart_write_title(&ch, s);
1655 for ( i = 0 ; i < n_vars ; ++i )
1657 y_max = max(y_max, (*fs)->m[i].max);
1658 y_min = min(y_min, (*fs)->m[i].min);
1661 boxplot_draw_yscale(&ch, y_max, y_min);
1663 for ( i = 0 ; i < n_vars ; ++i )
1666 const double box_width = (ch.data_right - ch.data_left)
1669 const double box_centre = ( i * 2 + 1) * box_width
1672 boxplot_draw_boxplot(&ch,
1673 box_centre, box_width,
1675 var_to_string(vars[i]));
1680 chart_finalise(&ch);
1688 /* Do a box plot, grouping all factors into one plot ;
1689 each dependent variable has its own plot.
1692 box_plot_group(const struct factor *fctr,
1693 const struct variable **vars,
1695 const struct variable *id)
1700 for ( i = 0 ; i < n_vars ; ++i )
1702 struct factor_statistics **fs ;
1705 chart_initialise(&ch);
1707 boxplot_draw_yscale(&ch, totals[i].max, totals[i].min);
1713 for ( fs = fctr->fs ; *fs ; ++fs )
1716 chart_write_title(&ch, _("Boxplot of %s vs. %s"),
1717 var_to_string(vars[i]), var_to_string(fctr->indep_var[0]) );
1719 for ( fs = fctr->fs ; *fs ; ++fs )
1722 const char *s = factor_to_string_concise(fctr, *fs);
1724 const double box_width = (ch.data_right - ch.data_left)
1725 / (n_factors * 2.0 ) ;
1727 const double box_centre = ( f++ * 2 + 1) * box_width
1730 boxplot_draw_boxplot(&ch,
1731 box_centre, box_width,
1738 const double box_width = (ch.data_right - ch.data_left) / 3.0;
1739 const double box_centre = (ch.data_right + ch.data_left) / 2.0;
1741 chart_write_title(&ch, _("Boxplot"));
1743 boxplot_draw_boxplot(&ch,
1744 box_centre, box_width,
1746 var_to_string(vars[i]) );
1750 chart_finalise(&ch);
1756 /* Plot the normal and detrended normal plots for m
1757 Label the plots with factorname */
1759 np_plot(const struct metrics *m, const char *factorname)
1763 double yfirst=0, ylast=0;
1766 struct chart np_chart;
1768 /* Detrended Normal Plot */
1769 struct chart dnp_chart;
1771 /* The slope and intercept of the ideal normal probability line */
1772 const double slope = 1.0 / m->stddev;
1773 const double intercept = - m->mean / m->stddev;
1775 /* Cowardly refuse to plot an empty data set */
1776 if ( m->n_data == 0 )
1779 chart_initialise(&np_chart);
1780 chart_write_title(&np_chart, _("Normal Q-Q Plot of %s"), factorname);
1781 chart_write_xlabel(&np_chart, _("Observed Value"));
1782 chart_write_ylabel(&np_chart, _("Expected Normal"));
1784 chart_initialise(&dnp_chart);
1785 chart_write_title(&dnp_chart, _("Detrended Normal Q-Q Plot of %s"),
1787 chart_write_xlabel(&dnp_chart, _("Observed Value"));
1788 chart_write_ylabel(&dnp_chart, _("Dev from Normal"));
1790 yfirst = gsl_cdf_ugaussian_Pinv (m->wvp[0]->rank / ( m->n + 1));
1791 ylast = gsl_cdf_ugaussian_Pinv (m->wvp[m->n_data-1]->rank / ( m->n + 1));
1795 /* Need to make sure that both the scatter plot and the ideal fit into the
1797 double x_lower = min(m->min, (yfirst - intercept) / slope) ;
1798 double x_upper = max(m->max, (ylast - intercept) / slope) ;
1799 double slack = (x_upper - x_lower) * 0.05 ;
1801 chart_write_xscale(&np_chart, x_lower - slack, x_upper + slack, 5);
1803 chart_write_xscale(&dnp_chart, m->min, m->max, 5);
1807 chart_write_yscale(&np_chart, yfirst, ylast, 5);
1810 /* We have to cache the detrended data, beacause we need to
1811 find its limits before we can plot it */
1813 d_data = xmalloc (m->n_data * sizeof(double));
1814 double d_max = -DBL_MAX;
1815 double d_min = DBL_MAX;
1816 for ( i = 0 ; i < m->n_data; ++i )
1818 const double ns = gsl_cdf_ugaussian_Pinv (m->wvp[i]->rank / ( m->n + 1));
1820 chart_datum(&np_chart, 0, m->wvp[i]->v.f, ns);
1822 d_data[i] = (m->wvp[i]->v.f - m->mean) / m->stddev - ns;
1824 if ( d_data[i] < d_min ) d_min = d_data[i];
1825 if ( d_data[i] > d_max ) d_max = d_data[i];
1827 chart_write_yscale(&dnp_chart, d_min, d_max, 5);
1829 for ( i = 0 ; i < m->n_data; ++i )
1830 chart_datum(&dnp_chart, 0, m->wvp[i]->v.f, d_data[i]);
1835 chart_line(&np_chart, slope, intercept, yfirst, ylast , CHART_DIM_Y);
1836 chart_line(&dnp_chart, 0, 0, m->min, m->max , CHART_DIM_X);
1838 chart_finalise(&np_chart);
1839 chart_finalise(&dnp_chart);
1846 /* Show the percentiles */
1848 show_percentiles(struct variable **dependent_var,
1850 struct factor *fctr)
1852 struct tab_table *tbl;
1858 struct hsh_table *ptiles ;
1860 int n_heading_columns;
1861 const int n_heading_rows = 2;
1862 const int n_stat_rows = 2;
1868 struct factor_statistics **fs = fctr->fs ;
1869 n_heading_columns = 3;
1870 n_factors = hsh_count(fctr->fstats);
1872 ptiles = (*fs)->m[0].ptile_hash;
1874 if ( fctr->indep_var[1] )
1875 n_heading_columns = 4;
1880 n_heading_columns = 2;
1882 ptiles = totals[0].ptile_hash;
1885 n_ptiles = hsh_count(ptiles);
1887 n_rows = n_heading_rows + n_dep_var * n_stat_rows * n_factors;
1889 n_cols = n_heading_columns + n_ptiles ;
1891 tbl = tab_create (n_cols, n_rows, 0);
1893 tab_headers (tbl, n_heading_columns + 1, 0, n_heading_rows, 0);
1895 tab_dim (tbl, tab_natural_dimensions);
1897 /* Outline the box and have no internal lines*/
1902 n_cols - 1, n_rows - 1);
1904 tab_hline (tbl, TAL_2, 0, n_cols - 1, n_heading_rows );
1906 tab_vline (tbl, TAL_2, n_heading_columns, 0, n_rows - 1);
1909 tab_title (tbl, 0, _("Percentiles"));
1912 tab_hline (tbl, TAL_1, n_heading_columns, n_cols - 1, 1 );
1919 n_heading_columns - 1, n_rows - 1);
1925 n_heading_columns, n_heading_rows - 1,
1926 n_cols - 1, n_rows - 1);
1928 tab_joint_text(tbl, n_heading_columns + 1, 0,
1930 TAB_CENTER | TAT_TITLE ,
1935 /* Put in the percentile break points as headings */
1937 struct percentile **p = (struct percentile **) hsh_sort(ptiles);
1942 tab_float(tbl, n_heading_columns + i++ , 1,
1951 for ( i = 0 ; i < n_dep_var ; ++i )
1953 const int n_stat_rows = 2;
1954 const int row = n_heading_rows + i * n_stat_rows * n_factors ;
1957 tab_hline(tbl, TAL_1, 0, n_cols - 1, row );
1960 i * n_stat_rows * n_factors + n_heading_rows,
1961 TAB_LEFT | TAT_TITLE,
1962 var_to_string(dependent_var[i])
1967 struct factor_statistics **fs = fctr->fs;
1970 tab_text (tbl, 1, n_heading_rows - 1,
1971 TAB_CENTER | TAT_TITLE,
1972 var_to_string(fctr->indep_var[0]));
1975 if ( fctr->indep_var[1])
1976 tab_text (tbl, 2, n_heading_rows - 1, TAB_CENTER | TAT_TITLE,
1977 var_to_string(fctr->indep_var[1]));
1982 static union value prev ;
1984 const int row = n_heading_rows + n_stat_rows *
1985 ( ( i * n_factors ) + count );
1988 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1989 fctr->indep_var[0]->width))
1993 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1997 TAB_LEFT | TAT_TITLE,
1998 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
2004 prev = (*fs)->id[0];
2006 if (fctr->indep_var[1] && count > 0 )
2007 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
2009 if ( fctr->indep_var[1])
2010 tab_text (tbl, 2, row,
2011 TAB_LEFT | TAT_TITLE,
2012 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
2016 populate_percentiles(tbl, n_heading_columns - 1,
2028 populate_percentiles(tbl, n_heading_columns - 1,
2029 i * n_stat_rows * n_factors + n_heading_rows,
2046 populate_percentiles(struct tab_table *tbl, int col, int row,
2047 const struct metrics *m)
2051 struct percentile **p = (struct percentile **) hsh_sort(m->ptile_hash);
2055 TAB_LEFT | TAT_TITLE,
2056 _("Tukey\'s Hinges")
2061 TAB_LEFT | TAT_TITLE,
2062 ptile_alg_desc[m->ptile_alg]
2069 tab_float(tbl, col + i + 1 , row,
2072 if ( (*p)->p == 25 )
2073 tab_float(tbl, col + i + 1 , row + 1,
2077 if ( (*p)->p == 50 )
2078 tab_float(tbl, col + i + 1 , row + 1,
2082 if ( (*p)->p == 75 )
2083 tab_float(tbl, col + i + 1 , row + 1,
2098 factor_to_string(const struct factor *fctr,
2099 struct factor_statistics *fs,
2100 const struct variable *var)
2103 static char buf1[100];
2109 sprintf(buf1, "%s (",var_to_string(var) );
2112 snprintf(buf2, 100, "%s = %s",
2113 var_to_string(fctr->indep_var[0]),
2114 value_to_string(&fs->id[0],fctr->indep_var[0]));
2118 if ( fctr->indep_var[1] )
2120 sprintf(buf2, "; %s = %s)",
2121 var_to_string(fctr->indep_var[1]),
2122 value_to_string(&fs->id[1],
2123 fctr->indep_var[1]));
2138 factor_to_string_concise(const struct factor *fctr,
2139 struct factor_statistics *fs)
2143 static char buf[100];
2147 snprintf(buf, 100, "%s",
2148 value_to_string(&fs->id[0], fctr->indep_var[0]));
2150 if ( fctr->indep_var[1] )
2152 sprintf(buf2, ",%s)", value_to_string(&fs->id[1], fctr->indep_var[1]) );