1 /* PSPP - EXAMINE data for normality . -*-c-*-
3 Copyright (C) 2004 Free Software Foundation, Inc.
4 Author: John Darrington 2004
6 This program is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
22 #include <gsl/gsl_cdf.h>
30 #include "dictionary.h"
38 #include "value-labels.h"
43 #include "factor_stats.h"
45 #include "percentiles.h"
48 #define _(msgid) gettext (msgid)
49 #define N_(msgid) msgid
59 +missing=miss:pairwise/!listwise,
61 incl:include/!exclude;
62 +compare=cmp:variables/!groups;
65 +plot[plt_]=stemleaf,boxplot,npplot,:spreadlevel(*d:n),histogram,all,none;
67 +statistics[st_]=descriptives,:extreme(*d:n),all,none.
76 static struct cmd_examine cmd;
78 static struct variable **dependent_vars;
80 static size_t n_dependent_vars;
85 /* The independent variable */
86 struct variable *indep_var[2];
89 /* Hash table of factor stats indexed by 2 values */
90 struct hsh_table *fstats;
92 /* The hash table after it has been crunched */
93 struct factor_statistics **fs;
99 /* Linked list of factors */
100 static struct factor *factors=0;
102 static struct metrics *totals=0;
104 /* Parse the clause specifying the factors */
105 static int examine_parse_independent_vars(struct cmd_examine *cmd);
109 /* Output functions */
110 static void show_summary(struct variable **dependent_var, int n_dep_var,
111 const struct factor *f);
113 static void show_extremes(struct variable **dependent_var,
115 const struct factor *factor,
118 static void show_descriptives(struct variable **dependent_var,
120 struct factor *factor);
122 static void show_percentiles(struct variable **dependent_var,
124 struct factor *factor);
129 void np_plot(const struct metrics *m, const char *factorname);
132 void box_plot_group(const struct factor *fctr,
133 const struct variable **vars, int n_vars,
134 const struct variable *id
138 void box_plot_variables(const struct factor *fctr,
139 const struct variable **vars, int n_vars,
140 const struct variable *id
145 /* Per Split function */
146 static void run_examine(const struct casefile *cf, void *cmd_);
148 static void output_examine(void);
151 void factor_calc(struct ccase *c, int case_no,
152 double weight, int case_missing);
155 /* Represent a factor as a string, so it can be
156 printed in a human readable fashion */
157 const char * factor_to_string(const struct factor *fctr,
158 struct factor_statistics *fs,
159 const struct variable *var);
162 /* Represent a factor as a string, so it can be
163 printed in a human readable fashion,
164 but sacrificing some readablility for the sake of brevity */
165 const char *factor_to_string_concise(const struct factor *fctr,
166 struct factor_statistics *fs);
171 /* Function to use for testing for missing values */
172 static is_missing_func *value_is_missing;
177 static subc_list_double percentile_list;
179 static enum pc_alg percentile_algorithm;
181 static short sbc_percentile;
188 subc_list_double_create(&percentile_list);
189 percentile_algorithm = PC_HAVERAGE;
191 if ( !parse_examine(&cmd) )
194 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
195 if (cmd.incl == XMN_INCLUDE )
196 value_is_missing = mv_is_value_system_missing;
198 value_is_missing = mv_is_value_missing;
200 if ( cmd.st_n == SYSMIS )
203 if ( ! cmd.sbc_cinterval)
204 cmd.n_cinterval[0] = 95.0;
206 /* If descriptives have been requested, make sure the
207 quartiles are calculated */
208 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] )
210 subc_list_double_push(&percentile_list, 25);
211 subc_list_double_push(&percentile_list, 50);
212 subc_list_double_push(&percentile_list, 75);
215 multipass_procedure_with_splits (run_examine, &cmd);
222 if ( dependent_vars )
223 free (dependent_vars);
226 struct factor *f = factors ;
229 struct factor *ff = f;
233 hsh_destroy ( ff->fstats ) ;
239 subc_list_double_destroy(&percentile_list);
246 /* Show all the appropriate tables */
252 /* Show totals if appropriate */
253 if ( ! cmd.sbc_nototal || factors == 0 )
255 show_summary(dependent_vars, n_dependent_vars, 0);
257 if ( cmd.sbc_statistics )
259 if ( cmd.a_statistics[XMN_ST_EXTREME])
260 show_extremes(dependent_vars, n_dependent_vars, 0, cmd.st_n);
262 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
263 show_descriptives(dependent_vars, n_dependent_vars, 0);
266 if ( sbc_percentile )
267 show_percentiles(dependent_vars, n_dependent_vars, 0);
272 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
274 for ( v = 0 ; v < n_dependent_vars; ++v )
275 np_plot(&totals[v], var_to_string(dependent_vars[v]));
278 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
280 if ( cmd.cmp == XMN_GROUPS )
282 box_plot_group(0, dependent_vars, n_dependent_vars,
286 box_plot_variables(0, dependent_vars, n_dependent_vars,
290 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
292 for ( v = 0 ; v < n_dependent_vars; ++v )
294 struct normal_curve normal;
296 normal.N = totals[v].n;
297 normal.mean = totals[v].mean;
298 normal.stddev = totals[v].stddev;
300 histogram_plot(totals[v].histogram,
301 var_to_string(dependent_vars[v]),
311 /* Show grouped statistics as appropriate */
315 show_summary(dependent_vars, n_dependent_vars, fctr);
317 if ( cmd.sbc_statistics )
319 if ( cmd.a_statistics[XMN_ST_EXTREME])
320 show_extremes(dependent_vars, n_dependent_vars, fctr, cmd.st_n);
322 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
323 show_descriptives(dependent_vars, n_dependent_vars, fctr);
326 if ( sbc_percentile )
327 show_percentiles(dependent_vars, n_dependent_vars, fctr);
334 struct factor_statistics **fs = fctr->fs ;
336 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
338 if ( cmd.cmp == XMN_VARIABLES )
339 box_plot_variables(fctr, dependent_vars, n_dependent_vars,
342 box_plot_group(fctr, dependent_vars, n_dependent_vars,
346 for ( v = 0 ; v < n_dependent_vars; ++v )
349 for ( fs = fctr->fs ; *fs ; ++fs )
351 const char *s = factor_to_string(fctr, *fs, dependent_vars[v]);
353 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
354 np_plot(&(*fs)->m[v], s);
356 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
358 struct normal_curve normal;
360 normal.N = (*fs)->m[v].n;
361 normal.mean = (*fs)->m[v].mean;
362 normal.stddev = (*fs)->m[v].stddev;
364 histogram_plot((*fs)->m[v].histogram,
368 } /* for ( fs .... */
370 } /* for ( v = 0 ..... */
380 /* Create a hash table of percentiles and their values from the list of
382 static struct hsh_table *
383 list_to_ptile_hash(const subc_list_double *l)
387 struct hsh_table *h ;
389 h = hsh_create(subc_list_double_count(l),
390 (hsh_compare_func *) ptile_compare,
391 (hsh_hash_func *) ptile_hash,
392 (hsh_free_func *) free,
396 for ( i = 0 ; i < subc_list_double_count(l) ; ++i )
398 struct percentile *p = xmalloc (sizeof *p);
400 p->p = subc_list_double_at(l,i);
411 /* Parse the PERCENTILES subcommand */
413 xmn_custom_percentiles(struct cmd_examine *p UNUSED)
421 while ( lex_is_number() )
423 subc_list_double_push(&percentile_list,lex_number());
433 if ( lex_match_id("HAVERAGE"))
434 percentile_algorithm = PC_HAVERAGE;
436 else if ( lex_match_id("WAVERAGE"))
437 percentile_algorithm = PC_WAVERAGE;
439 else if ( lex_match_id("ROUND"))
440 percentile_algorithm = PC_ROUND;
442 else if ( lex_match_id("EMPIRICAL"))
443 percentile_algorithm = PC_EMPIRICAL;
445 else if ( lex_match_id("AEMPIRICAL"))
446 percentile_algorithm = PC_AEMPIRICAL;
448 else if ( lex_match_id("NONE"))
449 percentile_algorithm = PC_NONE;
452 if ( 0 == subc_list_double_count(&percentile_list))
454 subc_list_double_push(&percentile_list, 5);
455 subc_list_double_push(&percentile_list, 10);
456 subc_list_double_push(&percentile_list, 25);
457 subc_list_double_push(&percentile_list, 50);
458 subc_list_double_push(&percentile_list, 75);
459 subc_list_double_push(&percentile_list, 90);
460 subc_list_double_push(&percentile_list, 95);
466 /* TOTAL and NOTOTAL are simple, mutually exclusive flags */
468 xmn_custom_total(struct cmd_examine *p)
470 if ( p->sbc_nototal )
472 msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
480 xmn_custom_nototal(struct cmd_examine *p)
484 msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
493 /* Parser for the variables sub command
494 Returns 1 on success */
496 xmn_custom_variables(struct cmd_examine *cmd )
500 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
506 if (!parse_variables (default_dict, &dependent_vars, &n_dependent_vars,
507 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
509 free (dependent_vars);
513 assert(n_dependent_vars);
515 totals = xnmalloc (n_dependent_vars, sizeof *totals);
517 if ( lex_match(T_BY))
520 success = examine_parse_independent_vars(cmd);
521 if ( success != 1 ) {
522 free (dependent_vars);
533 /* Parse the clause specifying the factors */
535 examine_parse_independent_vars(struct cmd_examine *cmd)
538 struct factor *sf = xmalloc (sizeof *sf);
540 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
548 sf->indep_var[0] = parse_variable();
549 sf->indep_var[1] = 0;
556 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
563 sf->indep_var[1] = parse_variable();
568 sf->fstats = hsh_create(4,
569 (hsh_compare_func *) factor_statistics_compare,
570 (hsh_hash_func *) factor_statistics_hash,
571 (hsh_free_func *) factor_statistics_free,
579 if ( token == '.' || token == '/' )
582 success = examine_parse_independent_vars(cmd);
593 void populate_percentiles(struct tab_table *tbl, int col, int row,
594 const struct metrics *m);
596 void populate_descriptives(struct tab_table *t, int col, int row,
597 const struct metrics *fs);
599 void populate_extremes(struct tab_table *t, int col, int row, int n,
600 const struct metrics *m);
602 void populate_summary(struct tab_table *t, int col, int row,
603 const struct metrics *m);
608 static int bad_weight_warn = 1;
611 /* Perform calculations for the sub factors */
613 factor_calc(struct ccase *c, int case_no, double weight, int case_missing)
616 struct factor *fctr = factors;
620 struct factor_statistics **foo ;
621 union value indep_vals[2] ;
623 indep_vals[0] = * case_data(c, fctr->indep_var[0]->fv);
625 if ( fctr->indep_var[1] )
626 indep_vals[1] = * case_data(c, fctr->indep_var[1]->fv);
628 indep_vals[1].f = SYSMIS;
630 assert(fctr->fstats);
632 foo = ( struct factor_statistics ** )
633 hsh_probe(fctr->fstats, (void *) &indep_vals);
638 *foo = create_factor_statistics(n_dependent_vars,
642 for ( v = 0 ; v < n_dependent_vars ; ++v )
644 metrics_precalc( &(*foo)->m[v] );
649 for ( v = 0 ; v < n_dependent_vars ; ++v )
651 const struct variable *var = dependent_vars[v];
652 const union value *val = case_data (c, var->fv);
654 if ( value_is_missing (&var->miss, val) || case_missing )
657 metrics_calc( &(*foo)->m[v], val, weight, case_no);
668 run_examine(const struct casefile *cf, void *cmd_ )
670 struct casereader *r;
674 const struct cmd_examine *cmd = (struct cmd_examine *) cmd_;
676 /* Make sure we haven't got rubbish left over from a
678 struct factor *fctr = factors;
681 struct factor *next = fctr->next;
683 hsh_clear(fctr->fstats);
692 for ( v = 0 ; v < n_dependent_vars ; ++v )
693 metrics_precalc(&totals[v]);
695 for(r = casefile_get_reader (cf);
696 casereader_read (r, &c) ;
700 const int case_no = casereader_cnum(r);
702 const double weight =
703 dict_get_case_weight(default_dict, &c, &bad_weight_warn);
705 if ( cmd->miss == XMN_LISTWISE )
707 for ( v = 0 ; v < n_dependent_vars ; ++v )
709 const struct variable *var = dependent_vars[v];
710 const union value *val = case_data (&c, var->fv);
712 if ( value_is_missing(&var->miss, val))
718 for ( v = 0 ; v < n_dependent_vars ; ++v )
720 const struct variable *var = dependent_vars[v];
721 const union value *val = case_data (&c, var->fv);
723 if ( value_is_missing(&var->miss, val) || case_missing )
726 metrics_calc(&totals[v], val, weight, case_no);
730 factor_calc(&c, case_no, weight, case_missing);
735 for ( v = 0 ; v < n_dependent_vars ; ++v)
740 struct hsh_iterator hi;
741 struct factor_statistics *fs;
743 for ( fs = hsh_first(fctr->fstats, &hi);
745 fs = hsh_next(fctr->fstats, &hi))
748 fs->m[v].ptile_hash = list_to_ptile_hash(&percentile_list);
749 fs->m[v].ptile_alg = percentile_algorithm;
750 metrics_postcalc(&fs->m[v]);
756 totals[v].ptile_hash = list_to_ptile_hash(&percentile_list);
757 totals[v].ptile_alg = percentile_algorithm;
758 metrics_postcalc(&totals[v]);
762 /* Make sure that the combination of factors are complete */
767 struct hsh_iterator hi;
768 struct hsh_iterator hi0;
769 struct hsh_iterator hi1;
770 struct factor_statistics *fs;
772 struct hsh_table *idh0=0;
773 struct hsh_table *idh1=0;
777 idh0 = hsh_create(4, (hsh_compare_func *) compare_values,
778 (hsh_hash_func *) hash_value,
781 idh1 = hsh_create(4, (hsh_compare_func *) compare_values,
782 (hsh_hash_func *) hash_value,
786 for ( fs = hsh_first(fctr->fstats, &hi);
788 fs = hsh_next(fctr->fstats, &hi))
790 hsh_insert(idh0,(void *) &fs->id[0]);
791 hsh_insert(idh1,(void *) &fs->id[1]);
794 /* Ensure that the factors combination is complete */
795 for ( val0 = hsh_first(idh0, &hi0);
797 val0 = hsh_next(idh0, &hi0))
799 for ( val1 = hsh_first(idh1, &hi1);
801 val1 = hsh_next(idh1, &hi1))
803 struct factor_statistics **ffs;
808 ffs = (struct factor_statistics **)
809 hsh_probe(fctr->fstats, (void *) &key );
813 (*ffs) = create_factor_statistics (n_dependent_vars,
815 for ( i = 0 ; i < n_dependent_vars ; ++i )
816 metrics_precalc( &(*ffs)->m[i]);
824 fctr->fs = (struct factor_statistics **) hsh_sort_copy(fctr->fstats);
835 for ( i = 0 ; i < n_dependent_vars ; ++i )
837 metrics_destroy(&totals[i]);
845 show_summary(struct variable **dependent_var, int n_dep_var,
846 const struct factor *fctr)
848 static const char *subtitle[]=
856 int heading_columns ;
858 const int heading_rows = 3;
859 struct tab_table *tbl;
867 n_factors = hsh_count(fctr->fstats);
868 n_rows = n_dep_var * n_factors ;
870 if ( fctr->indep_var[1] )
879 n_rows += heading_rows;
881 n_cols = heading_columns + 6;
883 tbl = tab_create (n_cols,n_rows,0);
884 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
886 tab_dim (tbl, tab_natural_dimensions);
888 /* Outline the box */
893 n_cols - 1, n_rows - 1);
895 /* Vertical lines for the data only */
900 n_cols - 1, n_rows - 1);
903 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
904 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, 1 );
905 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, heading_rows -1 );
907 tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1);
910 tab_title (tbl, 0, _("Case Processing Summary"));
913 tab_joint_text(tbl, heading_columns, 0,
915 TAB_CENTER | TAT_TITLE,
918 /* Remove lines ... */
925 for ( i = 0 ; i < 3 ; ++i )
927 tab_text (tbl, heading_columns + i*2 , 2, TAB_CENTER | TAT_TITLE,
930 tab_text (tbl, heading_columns + i*2 + 1, 2, TAB_CENTER | TAT_TITLE,
933 tab_joint_text(tbl, heading_columns + i*2 , 1,
934 heading_columns + i*2 + 1, 1,
935 TAB_CENTER | TAT_TITLE,
938 tab_box (tbl, -1, -1,
940 heading_columns + i*2, 1,
941 heading_columns + i*2 + 1, 1);
946 /* Titles for the independent variables */
949 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
950 var_to_string(fctr->indep_var[0]));
952 if ( fctr->indep_var[1] )
954 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
955 var_to_string(fctr->indep_var[1]));
961 for ( i = 0 ; i < n_dep_var ; ++i )
965 n_factors = hsh_count(fctr->fstats);
969 tab_hline(tbl, TAL_1, 0, n_cols -1 , i * n_factors + heading_rows);
972 0, i * n_factors + heading_rows,
973 TAB_LEFT | TAT_TITLE,
974 var_to_string(dependent_var[i])
979 populate_summary(tbl, heading_columns,
980 (i * n_factors) + heading_rows,
986 struct factor_statistics **fs = fctr->fs;
991 static union value prev;
993 if ( 0 != compare_values(&prev, &(*fs)->id[0],
994 fctr->indep_var[0]->width))
998 (i * n_factors ) + count +
1000 TAB_LEFT | TAT_TITLE,
1001 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1004 if (fctr->indep_var[1] && count > 0 )
1005 tab_hline(tbl, TAL_1, 1, n_cols - 1,
1006 (i * n_factors ) + count + heading_rows);
1010 prev = (*fs)->id[0];
1013 if ( fctr->indep_var[1])
1016 (i * n_factors ) + count +
1018 TAB_LEFT | TAT_TITLE,
1019 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1022 populate_summary(tbl, heading_columns,
1023 (i * n_factors) + count
1038 populate_summary(struct tab_table *t, int col, int row,
1039 const struct metrics *m)
1042 const double total = m->n + m->n_missing ;
1044 tab_float(t, col + 0, row + 0, TAB_RIGHT, m->n, 8, 0);
1045 tab_float(t, col + 2, row + 0, TAB_RIGHT, m->n_missing, 8, 0);
1046 tab_float(t, col + 4, row + 0, TAB_RIGHT, total, 8, 0);
1050 tab_text (t, col + 1, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1051 100.0 * m->n / total );
1053 tab_text (t, col + 3, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1054 100.0 * m->n_missing / total );
1056 /* This seems a bit pointless !!! */
1057 tab_text (t, col + 5, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1058 100.0 * total / total );
1069 show_extremes(struct variable **dependent_var, int n_dep_var,
1070 const struct factor *fctr, int n_extremities)
1073 int heading_columns ;
1075 const int heading_rows = 1;
1076 struct tab_table *tbl;
1083 heading_columns = 2;
1084 n_factors = hsh_count(fctr->fstats);
1086 n_rows = n_dep_var * 2 * n_extremities * n_factors;
1088 if ( fctr->indep_var[1] )
1089 heading_columns = 3;
1093 heading_columns = 1;
1094 n_rows = n_dep_var * 2 * n_extremities;
1097 n_rows += heading_rows;
1099 heading_columns += 2;
1100 n_cols = heading_columns + 2;
1102 tbl = tab_create (n_cols,n_rows,0);
1103 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
1105 tab_dim (tbl, tab_natural_dimensions);
1107 /* Outline the box, No internal lines*/
1112 n_cols - 1, n_rows - 1);
1114 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1116 tab_title (tbl, 0, _("Extreme Values"));
1118 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows -1);
1119 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows -1);
1123 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1124 var_to_string(fctr->indep_var[0]));
1126 if ( fctr->indep_var[1] )
1127 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1128 var_to_string(fctr->indep_var[1]));
1131 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Value"));
1132 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Case Number"));
1134 for ( i = 0 ; i < n_dep_var ; ++i )
1138 tab_hline(tbl, TAL_1, 0, n_cols -1 ,
1139 i * 2 * n_extremities * n_factors + heading_rows);
1142 i * 2 * n_extremities * n_factors + heading_rows,
1143 TAB_LEFT | TAT_TITLE,
1144 var_to_string(dependent_var[i])
1149 populate_extremes(tbl, heading_columns - 2,
1150 i * 2 * n_extremities * n_factors + heading_rows,
1151 n_extremities, &totals[i]);
1155 struct factor_statistics **fs = fctr->fs;
1160 static union value prev ;
1162 const int row = heading_rows + ( 2 * n_extremities ) *
1163 ( ( i * n_factors ) + count );
1166 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1167 fctr->indep_var[0]->width))
1171 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1175 TAB_LEFT | TAT_TITLE,
1176 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1180 prev = (*fs)->id[0];
1182 if (fctr->indep_var[1] && count > 0 )
1183 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
1185 if ( fctr->indep_var[1])
1186 tab_text (tbl, 2, row,
1187 TAB_LEFT | TAT_TITLE,
1188 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1191 populate_extremes(tbl, heading_columns - 2,
1206 /* Fill in the extremities table */
1208 populate_extremes(struct tab_table *t,
1209 int col, int row, int n, const struct metrics *m)
1215 tab_text(t, col, row,
1216 TAB_RIGHT | TAT_TITLE ,
1220 tab_text(t, col, row + n ,
1221 TAB_RIGHT | TAT_TITLE ,
1226 tab_hline(t, TAL_1, col, col + 3, row + n );
1228 for (extremity = 0; extremity < n ; ++extremity )
1231 tab_float(t, col + 1, row + extremity,
1233 extremity + 1, 8, 0);
1237 tab_float(t, col + 1, row + extremity + n,
1239 extremity + 1, 8, 0);
1245 for (idx = 0, extremity = 0; extremity < n && idx < m->n_data ; ++idx )
1248 const struct weighted_value *wv = m->wvp[idx];
1249 struct case_node *cn = wv->case_nos;
1252 for (j = 0 ; j < wv->w ; ++j )
1254 if ( extremity + j >= n )
1257 tab_float(t, col + 3, row + extremity + j + n,
1261 tab_float(t, col + 2, row + extremity + j + n,
1270 extremity += wv->w ;
1275 for (idx = m->n_data - 1, extremity = 0; extremity < n && idx >= 0; --idx )
1278 const struct weighted_value *wv = m->wvp[idx];
1279 struct case_node *cn = wv->case_nos;
1281 for (j = 0 ; j < wv->w ; ++j )
1283 if ( extremity + j >= n )
1286 tab_float(t, col + 3, row + extremity + j,
1290 tab_float(t, col + 2, row + extremity + j,
1299 extremity += wv->w ;
1304 /* Show the descriptives table */
1306 show_descriptives(struct variable **dependent_var,
1308 struct factor *fctr)
1311 int heading_columns ;
1313 const int n_stat_rows = 13;
1315 const int heading_rows = 1;
1317 struct tab_table *tbl;
1324 heading_columns = 4;
1325 n_factors = hsh_count(fctr->fstats);
1327 n_rows = n_dep_var * n_stat_rows * n_factors;
1329 if ( fctr->indep_var[1] )
1330 heading_columns = 5;
1334 heading_columns = 3;
1335 n_rows = n_dep_var * n_stat_rows;
1338 n_rows += heading_rows;
1340 n_cols = heading_columns + 2;
1343 tbl = tab_create (n_cols, n_rows, 0);
1345 tab_headers (tbl, heading_columns + 1, 0, heading_rows, 0);
1347 tab_dim (tbl, tab_natural_dimensions);
1349 /* Outline the box and have no internal lines*/
1354 n_cols - 1, n_rows - 1);
1356 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1358 tab_vline (tbl, TAL_1, heading_columns, 0, n_rows - 1);
1359 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows - 1);
1360 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows - 1);
1362 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Statistic"));
1363 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
1365 tab_title (tbl, 0, _("Descriptives"));
1368 for ( i = 0 ; i < n_dep_var ; ++i )
1370 const int row = heading_rows + i * n_stat_rows * n_factors ;
1373 tab_hline(tbl, TAL_1, 0, n_cols - 1, row );
1376 i * n_stat_rows * n_factors + heading_rows,
1377 TAB_LEFT | TAT_TITLE,
1378 var_to_string(dependent_var[i])
1384 struct factor_statistics **fs = fctr->fs;
1387 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1388 var_to_string(fctr->indep_var[0]));
1391 if ( fctr->indep_var[1])
1392 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1393 var_to_string(fctr->indep_var[1]));
1398 static union value prev ;
1400 const int row = heading_rows + n_stat_rows *
1401 ( ( i * n_factors ) + count );
1404 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1405 fctr->indep_var[0]->width))
1409 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1413 TAB_LEFT | TAT_TITLE,
1414 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1418 prev = (*fs)->id[0];
1420 if (fctr->indep_var[1] && count > 0 )
1421 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
1423 if ( fctr->indep_var[1])
1424 tab_text (tbl, 2, row,
1425 TAB_LEFT | TAT_TITLE,
1426 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1429 populate_descriptives(tbl, heading_columns - 2,
1441 populate_descriptives(tbl, heading_columns - 2,
1442 i * n_stat_rows * n_factors + heading_rows,
1454 /* Fill in the descriptives data */
1456 populate_descriptives(struct tab_table *tbl, int col, int row,
1457 const struct metrics *m)
1460 const double t = gsl_cdf_tdist_Qinv(1 - cmd.n_cinterval[0]/100.0/2.0, \
1466 TAB_LEFT | TAT_TITLE,
1469 tab_float (tbl, col + 2,
1475 tab_float (tbl, col + 3,
1484 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1485 _("%g%% Confidence Interval for Mean"), cmd.n_cinterval[0]);
1488 tab_text (tbl, col + 1,
1490 TAB_LEFT | TAT_TITLE,
1493 tab_float (tbl, col + 2,
1496 m->mean - t * m->se_mean,
1499 tab_text (tbl, col + 1,
1501 TAB_LEFT | TAT_TITLE,
1505 tab_float (tbl, col + 2,
1508 m->mean + t * m->se_mean,
1513 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1514 _("5%% Trimmed Mean"));
1516 tab_float (tbl, col + 2,
1524 TAB_LEFT | TAT_TITLE,
1528 struct percentile *p;
1531 p = hsh_find(m->ptile_hash, &d);
1536 tab_float (tbl, col + 2,
1546 TAB_LEFT | TAT_TITLE,
1549 tab_float (tbl, col + 2,
1558 TAB_LEFT | TAT_TITLE,
1559 _("Std. Deviation"));
1562 tab_float (tbl, col + 2,
1571 TAB_LEFT | TAT_TITLE,
1574 tab_float (tbl, col + 2,
1582 TAB_LEFT | TAT_TITLE,
1585 tab_float (tbl, col + 2,
1594 TAB_LEFT | TAT_TITLE,
1598 tab_float (tbl, col + 2,
1606 TAB_LEFT | TAT_TITLE,
1607 _("Interquartile Range"));
1610 struct percentile *p1;
1611 struct percentile *p2;
1614 p1 = hsh_find(m->ptile_hash, &d);
1617 p2 = hsh_find(m->ptile_hash, &d);
1622 tab_float (tbl, col + 2,
1633 TAB_LEFT | TAT_TITLE,
1637 tab_float (tbl, col + 2,
1643 /* stderr of skewness */
1644 tab_float (tbl, col + 3,
1653 TAB_LEFT | TAT_TITLE,
1657 tab_float (tbl, col + 2,
1663 /* stderr of kurtosis */
1664 tab_float (tbl, col + 3,
1676 box_plot_variables(const struct factor *fctr,
1677 const struct variable **vars, int n_vars,
1678 const struct variable *id)
1682 struct factor_statistics **fs ;
1686 box_plot_group(fctr, vars, n_vars, id);
1690 for ( fs = fctr->fs ; *fs ; ++fs )
1692 double y_min = DBL_MAX;
1693 double y_max = -DBL_MAX;
1694 struct chart *ch = chart_create();
1695 const char *s = factor_to_string(fctr, *fs, 0 );
1697 chart_write_title(ch, s);
1699 for ( i = 0 ; i < n_vars ; ++i )
1701 y_max = max(y_max, (*fs)->m[i].max);
1702 y_min = min(y_min, (*fs)->m[i].min);
1705 boxplot_draw_yscale(ch, y_max, y_min);
1707 for ( i = 0 ; i < n_vars ; ++i )
1710 const double box_width = (ch->data_right - ch->data_left)
1713 const double box_centre = ( i * 2 + 1) * box_width
1716 boxplot_draw_boxplot(ch,
1717 box_centre, box_width,
1719 var_to_string(vars[i]));
1731 /* Do a box plot, grouping all factors into one plot ;
1732 each dependent variable has its own plot.
1735 box_plot_group(const struct factor *fctr,
1736 const struct variable **vars,
1738 const struct variable *id UNUSED)
1743 for ( i = 0 ; i < n_vars ; ++i )
1745 struct factor_statistics **fs ;
1748 ch = chart_create();
1750 boxplot_draw_yscale(ch, totals[i].max, totals[i].min);
1756 for ( fs = fctr->fs ; *fs ; ++fs )
1759 chart_write_title(ch, _("Boxplot of %s vs. %s"),
1760 var_to_string(vars[i]), var_to_string(fctr->indep_var[0]) );
1762 for ( fs = fctr->fs ; *fs ; ++fs )
1765 const char *s = factor_to_string_concise(fctr, *fs);
1767 const double box_width = (ch->data_right - ch->data_left)
1768 / (n_factors * 2.0 ) ;
1770 const double box_centre = ( f++ * 2 + 1) * box_width
1773 boxplot_draw_boxplot(ch,
1774 box_centre, box_width,
1781 const double box_width = (ch->data_right - ch->data_left) / 3.0;
1782 const double box_centre = (ch->data_right + ch->data_left) / 2.0;
1784 chart_write_title(ch, _("Boxplot"));
1786 boxplot_draw_boxplot(ch,
1787 box_centre, box_width,
1789 var_to_string(vars[i]) );
1798 /* Plot the normal and detrended normal plots for m
1799 Label the plots with factorname */
1801 np_plot(const struct metrics *m, const char *factorname)
1804 double yfirst=0, ylast=0;
1807 struct chart *np_chart;
1809 /* Detrended Normal Plot */
1810 struct chart *dnp_chart;
1812 /* The slope and intercept of the ideal normal probability line */
1813 const double slope = 1.0 / m->stddev;
1814 const double intercept = - m->mean / m->stddev;
1816 /* Cowardly refuse to plot an empty data set */
1817 if ( m->n_data == 0 )
1820 np_chart = chart_create();
1821 dnp_chart = chart_create();
1823 if ( !np_chart || ! dnp_chart )
1826 chart_write_title(np_chart, _("Normal Q-Q Plot of %s"), factorname);
1827 chart_write_xlabel(np_chart, _("Observed Value"));
1828 chart_write_ylabel(np_chart, _("Expected Normal"));
1831 chart_write_title(dnp_chart, _("Detrended Normal Q-Q Plot of %s"),
1833 chart_write_xlabel(dnp_chart, _("Observed Value"));
1834 chart_write_ylabel(dnp_chart, _("Dev from Normal"));
1836 yfirst = gsl_cdf_ugaussian_Pinv (m->wvp[0]->rank / ( m->n + 1));
1837 ylast = gsl_cdf_ugaussian_Pinv (m->wvp[m->n_data-1]->rank / ( m->n + 1));
1841 /* Need to make sure that both the scatter plot and the ideal fit into the
1843 double x_lower = min(m->min, (yfirst - intercept) / slope) ;
1844 double x_upper = max(m->max, (ylast - intercept) / slope) ;
1845 double slack = (x_upper - x_lower) * 0.05 ;
1847 chart_write_xscale(np_chart, x_lower - slack, x_upper + slack, 5);
1849 chart_write_xscale(dnp_chart, m->min, m->max, 5);
1853 chart_write_yscale(np_chart, yfirst, ylast, 5);
1856 /* We have to cache the detrended data, beacause we need to
1857 find its limits before we can plot it */
1858 double *d_data = xnmalloc (m->n_data, sizeof *d_data);
1859 double d_max = -DBL_MAX;
1860 double d_min = DBL_MAX;
1861 for ( i = 0 ; i < m->n_data; ++i )
1863 const double ns = gsl_cdf_ugaussian_Pinv (m->wvp[i]->rank / ( m->n + 1));
1865 chart_datum(np_chart, 0, m->wvp[i]->v.f, ns);
1867 d_data[i] = (m->wvp[i]->v.f - m->mean) / m->stddev - ns;
1869 if ( d_data[i] < d_min ) d_min = d_data[i];
1870 if ( d_data[i] > d_max ) d_max = d_data[i];
1872 chart_write_yscale(dnp_chart, d_min, d_max, 5);
1874 for ( i = 0 ; i < m->n_data; ++i )
1875 chart_datum(dnp_chart, 0, m->wvp[i]->v.f, d_data[i]);
1880 chart_line(np_chart, slope, intercept, yfirst, ylast , CHART_DIM_Y);
1881 chart_line(dnp_chart, 0, 0, m->min, m->max , CHART_DIM_X);
1883 chart_submit(np_chart);
1884 chart_submit(dnp_chart);
1890 /* Show the percentiles */
1892 show_percentiles(struct variable **dependent_var,
1894 struct factor *fctr)
1896 struct tab_table *tbl;
1902 struct hsh_table *ptiles ;
1904 int n_heading_columns;
1905 const int n_heading_rows = 2;
1906 const int n_stat_rows = 2;
1912 struct factor_statistics **fs = fctr->fs ;
1913 n_heading_columns = 3;
1914 n_factors = hsh_count(fctr->fstats);
1916 ptiles = (*fs)->m[0].ptile_hash;
1918 if ( fctr->indep_var[1] )
1919 n_heading_columns = 4;
1924 n_heading_columns = 2;
1926 ptiles = totals[0].ptile_hash;
1929 n_ptiles = hsh_count(ptiles);
1931 n_rows = n_heading_rows + n_dep_var * n_stat_rows * n_factors;
1933 n_cols = n_heading_columns + n_ptiles ;
1935 tbl = tab_create (n_cols, n_rows, 0);
1937 tab_headers (tbl, n_heading_columns + 1, 0, n_heading_rows, 0);
1939 tab_dim (tbl, tab_natural_dimensions);
1941 /* Outline the box and have no internal lines*/
1946 n_cols - 1, n_rows - 1);
1948 tab_hline (tbl, TAL_2, 0, n_cols - 1, n_heading_rows );
1950 tab_vline (tbl, TAL_2, n_heading_columns, 0, n_rows - 1);
1953 tab_title (tbl, 0, _("Percentiles"));
1956 tab_hline (tbl, TAL_1, n_heading_columns, n_cols - 1, 1 );
1963 n_heading_columns - 1, n_rows - 1);
1969 n_heading_columns, n_heading_rows - 1,
1970 n_cols - 1, n_rows - 1);
1972 tab_joint_text(tbl, n_heading_columns + 1, 0,
1974 TAB_CENTER | TAT_TITLE ,
1979 /* Put in the percentile break points as headings */
1981 struct percentile **p = (struct percentile **) hsh_sort(ptiles);
1986 tab_float(tbl, n_heading_columns + i++ , 1,
1995 for ( i = 0 ; i < n_dep_var ; ++i )
1997 const int n_stat_rows = 2;
1998 const int row = n_heading_rows + i * n_stat_rows * n_factors ;
2001 tab_hline(tbl, TAL_1, 0, n_cols - 1, row );
2004 i * n_stat_rows * n_factors + n_heading_rows,
2005 TAB_LEFT | TAT_TITLE,
2006 var_to_string(dependent_var[i])
2011 struct factor_statistics **fs = fctr->fs;
2014 tab_text (tbl, 1, n_heading_rows - 1,
2015 TAB_CENTER | TAT_TITLE,
2016 var_to_string(fctr->indep_var[0]));
2019 if ( fctr->indep_var[1])
2020 tab_text (tbl, 2, n_heading_rows - 1, TAB_CENTER | TAT_TITLE,
2021 var_to_string(fctr->indep_var[1]));
2026 static union value prev ;
2028 const int row = n_heading_rows + n_stat_rows *
2029 ( ( i * n_factors ) + count );
2032 if ( 0 != compare_values(&prev, &(*fs)->id[0],
2033 fctr->indep_var[0]->width))
2037 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
2041 TAB_LEFT | TAT_TITLE,
2042 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
2048 prev = (*fs)->id[0];
2050 if (fctr->indep_var[1] && count > 0 )
2051 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
2053 if ( fctr->indep_var[1])
2054 tab_text (tbl, 2, row,
2055 TAB_LEFT | TAT_TITLE,
2056 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
2060 populate_percentiles(tbl, n_heading_columns - 1,
2072 populate_percentiles(tbl, n_heading_columns - 1,
2073 i * n_stat_rows * n_factors + n_heading_rows,
2090 populate_percentiles(struct tab_table *tbl, int col, int row,
2091 const struct metrics *m)
2095 struct percentile **p = (struct percentile **) hsh_sort(m->ptile_hash);
2099 TAB_LEFT | TAT_TITLE,
2100 _("Tukey\'s Hinges")
2105 TAB_LEFT | TAT_TITLE,
2106 ptile_alg_desc[m->ptile_alg]
2113 tab_float(tbl, col + i + 1 , row,
2116 if ( (*p)->p == 25 )
2117 tab_float(tbl, col + i + 1 , row + 1,
2121 if ( (*p)->p == 50 )
2122 tab_float(tbl, col + i + 1 , row + 1,
2126 if ( (*p)->p == 75 )
2127 tab_float(tbl, col + i + 1 , row + 1,
2142 factor_to_string(const struct factor *fctr,
2143 struct factor_statistics *fs,
2144 const struct variable *var)
2147 static char buf1[100];
2153 sprintf(buf1, "%s (",var_to_string(var) );
2156 snprintf(buf2, 100, "%s = %s",
2157 var_to_string(fctr->indep_var[0]),
2158 value_to_string(&fs->id[0],fctr->indep_var[0]));
2162 if ( fctr->indep_var[1] )
2164 sprintf(buf2, "; %s = %s)",
2165 var_to_string(fctr->indep_var[1]),
2166 value_to_string(&fs->id[1],
2167 fctr->indep_var[1]));
2182 factor_to_string_concise(const struct factor *fctr,
2183 struct factor_statistics *fs)
2187 static char buf[100];
2191 snprintf(buf, 100, "%s",
2192 value_to_string(&fs->id[0], fctr->indep_var[0]));
2194 if ( fctr->indep_var[1] )
2196 sprintf(buf2, ",%s)", value_to_string(&fs->id[1], fctr->indep_var[1]) );