1 /* PSPP - EXAMINE data for normality . -*-c-*-
3 Copyright (C) 2004 Free Software Foundation, Inc.
4 Author: John Darrington 2004
6 This program is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
11 This program is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software
18 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
22 #include <gsl/gsl_cdf.h>
30 #include "dictionary.h"
38 #include "value-labels.h"
43 #include "factor_stats.h"
45 #include "percentiles.h"
55 +missing=miss:pairwise/!listwise,
57 incl:include/!exclude;
58 +compare=cmp:variables/!groups;
61 +plot[plt_]=stemleaf,boxplot,npplot,:spreadlevel(*d:n),histogram,all,none;
63 +statistics[st_]=descriptives,:extreme(*d:n),all,none.
72 static struct cmd_examine cmd;
74 static struct variable **dependent_vars;
76 static int n_dependent_vars;
81 /* The independent variable */
82 struct variable *indep_var[2];
85 /* Hash table of factor stats indexed by 2 values */
86 struct hsh_table *fstats;
88 /* The hash table after it has been crunched */
89 struct factor_statistics **fs;
95 /* Linked list of factors */
96 static struct factor *factors=0;
98 static struct metrics *totals=0;
100 /* Parse the clause specifying the factors */
101 static int examine_parse_independent_vars(struct cmd_examine *cmd);
105 /* Output functions */
106 static void show_summary(struct variable **dependent_var, int n_dep_var,
107 const struct factor *f);
109 static void show_extremes(struct variable **dependent_var,
111 const struct factor *factor,
114 static void show_descriptives(struct variable **dependent_var,
116 struct factor *factor);
118 static void show_percentiles(struct variable **dependent_var,
120 struct factor *factor);
125 void np_plot(const struct metrics *m, const char *factorname);
128 void box_plot_group(const struct factor *fctr,
129 const struct variable **vars, int n_vars,
130 const struct variable *id
134 void box_plot_variables(const struct factor *fctr,
135 struct variable **vars, int n_vars,
136 const struct variable *id
141 /* Per Split function */
142 static void run_examine(const struct casefile *cf, void *cmd_);
144 static void output_examine(void);
147 void factor_calc(struct ccase *c, int case_no,
148 double weight, int case_missing);
151 /* Represent a factor as a string, so it can be
152 printed in a human readable fashion */
153 const char * factor_to_string(const struct factor *fctr,
154 struct factor_statistics *fs,
155 const struct variable *var);
158 /* Represent a factor as a string, so it can be
159 printed in a human readable fashion,
160 but sacrificing some readablility for the sake of brevity */
161 const char *factor_to_string_concise(const struct factor *fctr,
162 struct factor_statistics *fs);
167 /* Function to use for testing for missing values */
168 static is_missing_func value_is_missing;
173 static subc_list_double percentile_list;
175 static enum pc_alg percentile_algorithm;
177 static short sbc_percentile;
184 subc_list_double_create(&percentile_list);
185 percentile_algorithm = PC_HAVERAGE;
187 if ( !parse_examine(&cmd) )
190 /* If /MISSING=INCLUDE is set, then user missing values are ignored */
191 if (cmd.incl == XMN_INCLUDE )
192 value_is_missing = is_system_missing;
194 value_is_missing = is_missing;
196 if ( cmd.st_n == SYSMIS )
199 if ( ! cmd.sbc_cinterval)
200 cmd.n_cinterval[0] = 95.0;
202 /* If descriptives have been requested, make sure the
203 quartiles are calculated */
204 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES] )
206 subc_list_double_push(&percentile_list, 25);
207 subc_list_double_push(&percentile_list, 50);
208 subc_list_double_push(&percentile_list, 75);
211 multipass_procedure_with_splits (run_examine, &cmd);
216 subc_list_double_destroy(&percentile_list);
223 /* Show all the appropriate tables */
229 /* Show totals if appropriate */
230 if ( ! cmd.sbc_nototal || factors == 0 )
232 show_summary(dependent_vars, n_dependent_vars, 0);
234 if ( cmd.sbc_statistics )
236 if ( cmd.a_statistics[XMN_ST_EXTREME])
237 show_extremes(dependent_vars, n_dependent_vars, 0, cmd.st_n);
239 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
240 show_descriptives(dependent_vars, n_dependent_vars, 0);
243 if ( sbc_percentile )
244 show_percentiles(dependent_vars, n_dependent_vars, 0);
249 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
251 for ( v = 0 ; v < n_dependent_vars; ++v )
252 np_plot(&totals[v], var_to_string(dependent_vars[v]));
255 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
257 if ( cmd.cmp == XMN_GROUPS )
259 box_plot_group(0, dependent_vars, n_dependent_vars,
263 box_plot_variables(0, dependent_vars, n_dependent_vars,
267 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
269 for ( v = 0 ; v < n_dependent_vars; ++v )
271 struct normal_curve normal;
273 normal.N = totals[v].n;
274 normal.mean = totals[v].mean;
275 normal.stddev = totals[v].stddev;
277 histogram_plot(totals[v].histogram,
278 var_to_string(dependent_vars[v]),
288 /* Show grouped statistics as appropriate */
292 show_summary(dependent_vars, n_dependent_vars, fctr);
294 if ( cmd.sbc_statistics )
296 if ( cmd.a_statistics[XMN_ST_EXTREME])
297 show_extremes(dependent_vars, n_dependent_vars, fctr, cmd.st_n);
299 if ( cmd.a_statistics[XMN_ST_DESCRIPTIVES])
300 show_descriptives(dependent_vars, n_dependent_vars, fctr);
303 if ( sbc_percentile )
304 show_percentiles(dependent_vars, n_dependent_vars, fctr);
311 struct factor_statistics **fs = fctr->fs ;
313 if ( cmd.a_plot[XMN_PLT_BOXPLOT] )
315 if ( cmd.cmp == XMN_VARIABLES )
316 box_plot_variables(fctr, dependent_vars, n_dependent_vars,
319 box_plot_group(fctr, dependent_vars, n_dependent_vars,
323 for ( v = 0 ; v < n_dependent_vars; ++v )
326 for ( fs = fctr->fs ; *fs ; ++fs )
328 const char *s = factor_to_string(fctr, *fs, dependent_vars[v]);
330 if ( cmd.a_plot[XMN_PLT_NPPLOT] )
331 np_plot(&(*fs)->m[v], s);
333 if ( cmd.a_plot[XMN_PLT_HISTOGRAM] )
335 struct normal_curve normal;
337 normal.N = (*fs)->m[v].n;
338 normal.mean = (*fs)->m[v].mean;
339 normal.stddev = (*fs)->m[v].stddev;
341 histogram_plot((*fs)->m[v].histogram,
345 } /* for ( fs .... */
347 } /* for ( v = 0 ..... */
357 static struct hsh_table *
358 list_to_ptile_hash(const subc_list_double *l)
362 struct hsh_table *h ;
364 h = hsh_create(subc_list_double_count(l),
365 (hsh_compare_func *) ptile_compare,
366 (hsh_hash_func *) ptile_hash,
367 (hsh_free_func *) free,
371 for ( i = 0 ; i < subc_list_double_count(l) ; ++i )
373 struct percentile *p = xmalloc (sizeof (struct percentile));
375 p->p = subc_list_double_at(l,i);
385 /* Parse the PERCENTILES subcommand */
387 xmn_custom_percentiles(struct cmd_examine *p UNUSED)
395 while ( lex_double_p() )
397 subc_list_double_push(&percentile_list,lex_double());
407 if ( lex_match_id("HAVERAGE"))
408 percentile_algorithm = PC_HAVERAGE;
410 else if ( lex_match_id("WAVERAGE"))
411 percentile_algorithm = PC_WAVERAGE;
413 else if ( lex_match_id("ROUND"))
414 percentile_algorithm = PC_ROUND;
416 else if ( lex_match_id("EMPIRICAL"))
417 percentile_algorithm = PC_EMPIRICAL;
419 else if ( lex_match_id("AEMPIRICAL"))
420 percentile_algorithm = PC_AEMPIRICAL;
422 else if ( lex_match_id("NONE"))
423 percentile_algorithm = PC_NONE;
426 if ( 0 == subc_list_double_count(&percentile_list))
428 subc_list_double_push(&percentile_list, 5);
429 subc_list_double_push(&percentile_list, 10);
430 subc_list_double_push(&percentile_list, 25);
431 subc_list_double_push(&percentile_list, 50);
432 subc_list_double_push(&percentile_list, 75);
433 subc_list_double_push(&percentile_list, 90);
434 subc_list_double_push(&percentile_list, 95);
440 /* TOTAL and NOTOTAL are simple, mutually exclusive flags */
442 xmn_custom_total(struct cmd_examine *p)
444 if ( p->sbc_nototal )
446 msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
454 xmn_custom_nototal(struct cmd_examine *p)
458 msg (SE, _("%s and %s are mutually exclusive"),"TOTAL","NOTOTAL");
467 /* Parser for the variables sub command */
469 xmn_custom_variables(struct cmd_examine *cmd )
474 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
478 if (!parse_variables (default_dict, &dependent_vars, &n_dependent_vars,
479 PV_NO_DUPLICATE | PV_NUMERIC | PV_NO_SCRATCH) )
481 free (dependent_vars);
485 assert(n_dependent_vars);
487 totals = xmalloc( sizeof(struct metrics) * n_dependent_vars);
489 if ( lex_match(T_BY))
491 return examine_parse_independent_vars(cmd);
499 /* Parse the clause specifying the factors */
501 examine_parse_independent_vars(struct cmd_examine *cmd)
504 struct factor *sf = xmalloc(sizeof(struct factor));
506 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
511 sf->indep_var[0] = parse_variable();
512 sf->indep_var[1] = 0;
519 if ((token != T_ID || dict_lookup_var (default_dict, tokid) == NULL)
523 sf->indep_var[1] = parse_variable();
528 sf->fstats = hsh_create(4,
529 (hsh_compare_func *) factor_statistics_compare,
530 (hsh_hash_func *) factor_statistics_hash,
531 (hsh_free_func *) factor_statistics_free,
539 if ( token == '.' || token == '/' )
542 return examine_parse_independent_vars(cmd);
548 void populate_percentiles(struct tab_table *tbl, int col, int row,
549 const struct metrics *m);
551 void populate_descriptives(struct tab_table *t, int col, int row,
552 const struct metrics *fs);
554 void populate_extremes(struct tab_table *t, int col, int row, int n,
555 const struct metrics *m);
557 void populate_summary(struct tab_table *t, int col, int row,
558 const struct metrics *m);
563 static int bad_weight_warn = 1;
566 /* Perform calculations for the sub factors */
568 factor_calc(struct ccase *c, int case_no, double weight, int case_missing)
571 struct factor *fctr = factors;
575 union value indep_vals[2] ;
577 indep_vals[0] = * case_data(c, fctr->indep_var[0]->fv);
579 if ( fctr->indep_var[1] )
580 indep_vals[1] = * case_data(c, fctr->indep_var[1]->fv);
582 indep_vals[1].f = SYSMIS;
584 assert(fctr->fstats);
586 struct factor_statistics **foo = ( struct factor_statistics ** )
587 hsh_probe(fctr->fstats, (void *) &indep_vals);
592 *foo = create_factor_statistics(n_dependent_vars,
596 for ( v = 0 ; v < n_dependent_vars ; ++v )
598 metrics_precalc( &(*foo)->m[v] );
603 for ( v = 0 ; v < n_dependent_vars ; ++v )
605 const struct variable *var = dependent_vars[v];
606 const union value *val = case_data (c, var->fv);
608 if ( value_is_missing(val,var) || case_missing )
611 metrics_calc( &(*foo)->m[v], val, weight, case_no);
625 run_examine(const struct casefile *cf, void *cmd_ )
627 struct casereader *r;
631 const struct cmd_examine *cmd = (struct cmd_examine *) cmd_;
633 /* Make sure we haven't got rubbish left over from a
636 struct factor *fctr = factors;
639 struct factor *next = fctr->next;
641 hsh_clear(fctr->fstats);
650 for ( v = 0 ; v < n_dependent_vars ; ++v )
651 metrics_precalc(&totals[v]);
653 for(r = casefile_get_reader (cf);
654 casereader_read (r, &c) ;
658 const int case_no = casereader_cnum(r);
660 const double weight =
661 dict_get_case_weight(default_dict, &c, &bad_weight_warn);
663 if ( cmd->miss == XMN_LISTWISE )
665 for ( v = 0 ; v < n_dependent_vars ; ++v )
667 const struct variable *var = dependent_vars[v];
668 const union value *val = case_data (&c, var->fv);
670 if ( value_is_missing(val,var))
676 for ( v = 0 ; v < n_dependent_vars ; ++v )
678 const struct variable *var = dependent_vars[v];
679 const union value *val = case_data (&c, var->fv);
681 if ( value_is_missing(val,var) || case_missing )
684 metrics_calc(&totals[v], val, weight, case_no);
688 factor_calc(&c, case_no, weight, case_missing);
693 for ( v = 0 ; v < n_dependent_vars ; ++v)
698 struct hsh_iterator hi;
699 struct factor_statistics *fs;
701 for ( fs = hsh_first(fctr->fstats, &hi);
703 fs = hsh_next(fctr->fstats, &hi))
706 fs->m[v].ptile_hash = list_to_ptile_hash(&percentile_list);
707 fs->m[v].ptile_alg = percentile_algorithm;
708 metrics_postcalc(&fs->m[v]);
714 totals[v].ptile_hash = list_to_ptile_hash(&percentile_list);
715 totals[v].ptile_alg = percentile_algorithm;
716 metrics_postcalc(&totals[v]);
720 /* Make sure that the combination of factors are complete */
725 struct hsh_iterator hi;
726 struct hsh_iterator hi0;
727 struct hsh_iterator hi1;
728 struct factor_statistics *fs;
730 struct hsh_table *idh0=0;
731 struct hsh_table *idh1=0;
735 idh0 = hsh_create(4, (hsh_compare_func *) compare_values,
736 (hsh_hash_func *) hash_value,
739 idh1 = hsh_create(4, (hsh_compare_func *) compare_values,
740 (hsh_hash_func *) hash_value,
744 for ( fs = hsh_first(fctr->fstats, &hi);
746 fs = hsh_next(fctr->fstats, &hi))
748 hsh_insert(idh0,(void *) &fs->id[0]);
749 hsh_insert(idh1,(void *) &fs->id[1]);
752 /* Ensure that the factors combination is complete */
753 for ( val0 = hsh_first(idh0, &hi0);
755 val0 = hsh_next(idh0, &hi0))
757 for ( val1 = hsh_first(idh1, &hi1);
759 val1 = hsh_next(idh1, &hi1))
761 struct factor_statistics **ffs;
766 ffs = (struct factor_statistics **)
767 hsh_probe(fctr->fstats, (void *) &key );
771 (*ffs) = create_factor_statistics (n_dependent_vars,
773 for ( i = 0 ; i < n_dependent_vars ; ++i )
774 metrics_precalc( &(*ffs)->m[i]);
782 fctr->fs = (struct factor_statistics **) hsh_sort_copy(fctr->fstats);
789 for ( v = 0 ; v < n_dependent_vars ; ++v )
790 hsh_destroy(totals[v].ordered_data);
796 show_summary(struct variable **dependent_var, int n_dep_var,
797 const struct factor *fctr)
799 static const char *subtitle[]=
807 int heading_columns ;
809 const int heading_rows = 3;
810 struct tab_table *tbl;
818 n_factors = hsh_count(fctr->fstats);
819 n_rows = n_dep_var * n_factors ;
821 if ( fctr->indep_var[1] )
830 n_rows += heading_rows;
832 n_cols = heading_columns + 6;
834 tbl = tab_create (n_cols,n_rows,0);
835 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
837 tab_dim (tbl, tab_natural_dimensions);
839 /* Outline the box */
844 n_cols - 1, n_rows - 1);
846 /* Vertical lines for the data only */
851 n_cols - 1, n_rows - 1);
854 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
855 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, 1 );
856 tab_hline (tbl, TAL_1, heading_columns, n_cols - 1, heading_rows -1 );
858 tab_vline (tbl, TAL_2, heading_columns, 0, n_rows - 1);
861 tab_title (tbl, 0, _("Case Processing Summary"));
864 tab_joint_text(tbl, heading_columns, 0,
866 TAB_CENTER | TAT_TITLE,
869 /* Remove lines ... */
876 for ( i = 0 ; i < 3 ; ++i )
878 tab_text (tbl, heading_columns + i*2 , 2, TAB_CENTER | TAT_TITLE,
881 tab_text (tbl, heading_columns + i*2 + 1, 2, TAB_CENTER | TAT_TITLE,
884 tab_joint_text(tbl, heading_columns + i*2 , 1,
885 heading_columns + i*2 + 1, 1,
886 TAB_CENTER | TAT_TITLE,
889 tab_box (tbl, -1, -1,
891 heading_columns + i*2, 1,
892 heading_columns + i*2 + 1, 1);
897 /* Titles for the independent variables */
900 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
901 var_to_string(fctr->indep_var[0]));
903 if ( fctr->indep_var[1] )
905 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
906 var_to_string(fctr->indep_var[1]));
912 for ( i = 0 ; i < n_dep_var ; ++i )
916 n_factors = hsh_count(fctr->fstats);
920 tab_hline(tbl, TAL_1, 0, n_cols -1 , i * n_factors + heading_rows);
923 0, i * n_factors + heading_rows,
924 TAB_LEFT | TAT_TITLE,
925 var_to_string(dependent_var[i])
930 populate_summary(tbl, heading_columns,
931 (i * n_factors) + heading_rows,
937 struct factor_statistics **fs = fctr->fs;
942 static union value prev;
944 if ( 0 != compare_values(&prev, &(*fs)->id[0],
945 fctr->indep_var[0]->width))
949 (i * n_factors ) + count +
951 TAB_LEFT | TAT_TITLE,
952 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
955 if (fctr->indep_var[1] && count > 0 )
956 tab_hline(tbl, TAL_1, 1, n_cols - 1,
957 (i * n_factors ) + count + heading_rows);
964 if ( fctr->indep_var[1])
967 (i * n_factors ) + count +
969 TAB_LEFT | TAT_TITLE,
970 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
973 populate_summary(tbl, heading_columns,
974 (i * n_factors) + count
989 populate_summary(struct tab_table *t, int col, int row,
990 const struct metrics *m)
993 const double total = m->n + m->n_missing ;
995 tab_float(t, col + 0, row + 0, TAB_RIGHT, m->n, 8, 0);
996 tab_float(t, col + 2, row + 0, TAB_RIGHT, m->n_missing, 8, 0);
997 tab_float(t, col + 4, row + 0, TAB_RIGHT, total, 8, 0);
1001 tab_text (t, col + 1, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1002 100.0 * m->n / total );
1004 tab_text (t, col + 3, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1005 100.0 * m->n_missing / total );
1007 /* This seems a bit pointless !!! */
1008 tab_text (t, col + 5, row + 0, TAB_RIGHT | TAT_PRINTF, "%2.0f%%",
1009 100.0 * total / total );
1020 show_extremes(struct variable **dependent_var, int n_dep_var,
1021 const struct factor *fctr, int n_extremities)
1024 int heading_columns ;
1026 const int heading_rows = 1;
1027 struct tab_table *tbl;
1034 heading_columns = 2;
1035 n_factors = hsh_count(fctr->fstats);
1037 n_rows = n_dep_var * 2 * n_extremities * n_factors;
1039 if ( fctr->indep_var[1] )
1040 heading_columns = 3;
1044 heading_columns = 1;
1045 n_rows = n_dep_var * 2 * n_extremities;
1048 n_rows += heading_rows;
1050 heading_columns += 2;
1051 n_cols = heading_columns + 2;
1053 tbl = tab_create (n_cols,n_rows,0);
1054 tab_headers (tbl, heading_columns, 0, heading_rows, 0);
1056 tab_dim (tbl, tab_natural_dimensions);
1058 /* Outline the box, No internal lines*/
1063 n_cols - 1, n_rows - 1);
1065 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1067 tab_title (tbl, 0, _("Extreme Values"));
1069 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows -1);
1070 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows -1);
1074 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1075 var_to_string(fctr->indep_var[0]));
1077 if ( fctr->indep_var[1] )
1078 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1079 var_to_string(fctr->indep_var[1]));
1082 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Value"));
1083 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Case Number"));
1085 for ( i = 0 ; i < n_dep_var ; ++i )
1089 tab_hline(tbl, TAL_1, 0, n_cols -1 ,
1090 i * 2 * n_extremities * n_factors + heading_rows);
1093 i * 2 * n_extremities * n_factors + heading_rows,
1094 TAB_LEFT | TAT_TITLE,
1095 var_to_string(dependent_var[i])
1100 populate_extremes(tbl, heading_columns - 2,
1101 i * 2 * n_extremities * n_factors + heading_rows,
1102 n_extremities, &totals[i]);
1106 struct factor_statistics **fs = fctr->fs;
1111 static union value prev ;
1113 const int row = heading_rows + ( 2 * n_extremities ) *
1114 ( ( i * n_factors ) + count );
1117 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1118 fctr->indep_var[0]->width))
1122 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1126 TAB_LEFT | TAT_TITLE,
1127 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1131 prev = (*fs)->id[0];
1133 if (fctr->indep_var[1] && count > 0 )
1134 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
1136 if ( fctr->indep_var[1])
1137 tab_text (tbl, 2, row,
1138 TAB_LEFT | TAT_TITLE,
1139 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1142 populate_extremes(tbl, heading_columns - 2,
1157 /* Fill in the extremities table */
1159 populate_extremes(struct tab_table *t,
1160 int col, int row, int n, const struct metrics *m)
1166 tab_text(t, col, row,
1167 TAB_RIGHT | TAT_TITLE ,
1171 tab_text(t, col, row + n ,
1172 TAB_RIGHT | TAT_TITLE ,
1177 tab_hline(t, TAL_1, col, col + 3, row + n );
1179 for (extremity = 0; extremity < n ; ++extremity )
1182 tab_float(t, col + 1, row + extremity,
1184 extremity + 1, 8, 0);
1188 tab_float(t, col + 1, row + extremity + n,
1190 extremity + 1, 8, 0);
1196 for (idx = 0, extremity = 0; extremity < n && idx < m->n_data ; ++idx )
1199 const struct weighted_value *wv = m->wvp[idx];
1200 struct case_node *cn = wv->case_nos;
1203 for (j = 0 ; j < wv->w ; ++j )
1205 if ( extremity + j >= n )
1208 tab_float(t, col + 3, row + extremity + j + n,
1212 tab_float(t, col + 2, row + extremity + j + n,
1221 extremity += wv->w ;
1226 for (idx = m->n_data - 1, extremity = 0; extremity < n && idx >= 0; --idx )
1229 const struct weighted_value *wv = m->wvp[idx];
1230 struct case_node *cn = wv->case_nos;
1232 for (j = 0 ; j < wv->w ; ++j )
1234 if ( extremity + j >= n )
1237 tab_float(t, col + 3, row + extremity + j,
1241 tab_float(t, col + 2, row + extremity + j,
1250 extremity += wv->w ;
1255 /* Show the descriptives table */
1257 show_descriptives(struct variable **dependent_var,
1259 struct factor *fctr)
1262 int heading_columns ;
1264 const int n_stat_rows = 13;
1266 const int heading_rows = 1;
1268 struct tab_table *tbl;
1275 heading_columns = 4;
1276 n_factors = hsh_count(fctr->fstats);
1278 n_rows = n_dep_var * n_stat_rows * n_factors;
1280 if ( fctr->indep_var[1] )
1281 heading_columns = 5;
1285 heading_columns = 3;
1286 n_rows = n_dep_var * n_stat_rows;
1289 n_rows += heading_rows;
1291 n_cols = heading_columns + 2;
1294 tbl = tab_create (n_cols, n_rows, 0);
1296 tab_headers (tbl, heading_columns + 1, 0, heading_rows, 0);
1298 tab_dim (tbl, tab_natural_dimensions);
1300 /* Outline the box and have no internal lines*/
1305 n_cols - 1, n_rows - 1);
1307 tab_hline (tbl, TAL_2, 0, n_cols - 1, heading_rows );
1309 tab_vline (tbl, TAL_1, heading_columns, 0, n_rows - 1);
1310 tab_vline (tbl, TAL_2, n_cols - 2, 0, n_rows - 1);
1311 tab_vline (tbl, TAL_1, n_cols - 1, 0, n_rows - 1);
1313 tab_text (tbl, n_cols - 2, 0, TAB_CENTER | TAT_TITLE, _("Statistic"));
1314 tab_text (tbl, n_cols - 1, 0, TAB_CENTER | TAT_TITLE, _("Std. Error"));
1316 tab_title (tbl, 0, _("Descriptives"));
1319 for ( i = 0 ; i < n_dep_var ; ++i )
1321 const int row = heading_rows + i * n_stat_rows * n_factors ;
1324 tab_hline(tbl, TAL_1, 0, n_cols - 1, row );
1327 i * n_stat_rows * n_factors + heading_rows,
1328 TAB_LEFT | TAT_TITLE,
1329 var_to_string(dependent_var[i])
1335 struct factor_statistics **fs = fctr->fs;
1338 tab_text (tbl, 1, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1339 var_to_string(fctr->indep_var[0]));
1342 if ( fctr->indep_var[1])
1343 tab_text (tbl, 2, heading_rows - 1, TAB_CENTER | TAT_TITLE,
1344 var_to_string(fctr->indep_var[1]));
1349 static union value prev ;
1351 const int row = heading_rows + n_stat_rows *
1352 ( ( i * n_factors ) + count );
1355 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1356 fctr->indep_var[0]->width))
1360 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1364 TAB_LEFT | TAT_TITLE,
1365 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1369 prev = (*fs)->id[0];
1371 if (fctr->indep_var[1] && count > 0 )
1372 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
1374 if ( fctr->indep_var[1])
1375 tab_text (tbl, 2, row,
1376 TAB_LEFT | TAT_TITLE,
1377 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
1380 populate_descriptives(tbl, heading_columns - 2,
1392 populate_descriptives(tbl, heading_columns - 2,
1393 i * n_stat_rows * n_factors + heading_rows,
1405 /* Fill in the descriptives data */
1407 populate_descriptives(struct tab_table *tbl, int col, int row,
1408 const struct metrics *m)
1411 const double t = gsl_cdf_tdist_Qinv(1 - cmd.n_cinterval[0]/100.0/2.0, \
1417 TAB_LEFT | TAT_TITLE,
1420 tab_float (tbl, col + 2,
1426 tab_float (tbl, col + 3,
1435 TAB_LEFT | TAT_TITLE | TAT_PRINTF,
1436 _("%g%% Confidence Interval for Mean"), cmd.n_cinterval[0]);
1439 tab_text (tbl, col + 1,
1441 TAB_LEFT | TAT_TITLE,
1444 tab_float (tbl, col + 2,
1447 m->mean - t * m->se_mean,
1450 tab_text (tbl, col + 1,
1452 TAB_LEFT | TAT_TITLE,
1456 tab_float (tbl, col + 2,
1459 m->mean + t * m->se_mean,
1464 TAB_LEFT | TAT_TITLE,
1465 _("5% Trimmed Mean"));
1467 tab_float (tbl, col + 2,
1475 TAB_LEFT | TAT_TITLE,
1479 struct percentile *p;
1482 p = hsh_find(m->ptile_hash, &d);
1486 tab_float (tbl, col + 2,
1495 TAB_LEFT | TAT_TITLE,
1498 tab_float (tbl, col + 2,
1507 TAB_LEFT | TAT_TITLE,
1508 _("Std. Deviation"));
1511 tab_float (tbl, col + 2,
1520 TAB_LEFT | TAT_TITLE,
1523 tab_float (tbl, col + 2,
1531 TAB_LEFT | TAT_TITLE,
1534 tab_float (tbl, col + 2,
1543 TAB_LEFT | TAT_TITLE,
1547 tab_float (tbl, col + 2,
1555 TAB_LEFT | TAT_TITLE,
1556 _("Interquartile Range"));
1559 struct percentile *p1;
1560 struct percentile *p2;
1563 p1 = hsh_find(m->ptile_hash, &d);
1566 p2 = hsh_find(m->ptile_hash, &d);
1571 tab_float (tbl, col + 2,
1582 TAB_LEFT | TAT_TITLE,
1586 tab_float (tbl, col + 2,
1592 /* stderr of skewness */
1593 tab_float (tbl, col + 3,
1602 TAB_LEFT | TAT_TITLE,
1606 tab_float (tbl, col + 2,
1612 /* stderr of kurtosis */
1613 tab_float (tbl, col + 3,
1625 box_plot_variables(const struct factor *fctr,
1626 struct variable **vars, int n_vars,
1627 const struct variable *id)
1630 struct factor_statistics **fs ;
1634 box_plot_group(fctr, vars, n_vars, id);
1638 for ( fs = fctr->fs ; *fs ; ++fs )
1640 double y_min = DBL_MAX;
1641 double y_max = -DBL_MAX;
1644 chart_initialise(&ch);
1646 const char *s = factor_to_string(fctr, *fs, 0 );
1648 chart_write_title(&ch, s);
1650 for ( i = 0 ; i < n_vars ; ++i )
1652 y_max = max(y_max, (*fs)->m[i].max);
1653 y_min = min(y_min, (*fs)->m[i].min);
1656 boxplot_draw_yscale(&ch, y_max, y_min);
1658 for ( i = 0 ; i < n_vars ; ++i )
1661 const double box_width = (ch.data_right - ch.data_left)
1664 const double box_centre = ( i * 2 + 1) * box_width
1667 boxplot_draw_boxplot(&ch,
1668 box_centre, box_width,
1670 var_to_string(vars[i]));
1675 chart_finalise(&ch);
1683 /* Do a box plot, grouping all factors into one plot ;
1684 each dependent variable has its own plot.
1687 box_plot_group(const struct factor *fctr,
1688 const struct variable **vars,
1690 const struct variable *id)
1694 for ( i = 0 ; i < n_vars ; ++i )
1696 struct factor_statistics **fs ;
1699 chart_initialise(&ch);
1701 boxplot_draw_yscale(&ch, totals[i].max, totals[i].min);
1707 for ( fs = fctr->fs ; *fs ; ++fs )
1710 chart_write_title(&ch, _("Boxplot of %s vs. %s"),
1711 var_to_string(vars[i]), var_to_string(fctr->indep_var[0]) );
1713 for ( fs = fctr->fs ; *fs ; ++fs )
1716 const char *s = factor_to_string_concise(fctr, *fs);
1718 const double box_width = (ch.data_right - ch.data_left)
1719 / (n_factors * 2.0 ) ;
1721 const double box_centre = ( f++ * 2 + 1) * box_width
1724 boxplot_draw_boxplot(&ch,
1725 box_centre, box_width,
1732 const double box_width = (ch.data_right - ch.data_left) / 3.0;
1733 const double box_centre = (ch.data_right + ch.data_left) / 2.0;
1735 chart_write_title(&ch, _("Boxplot"));
1737 boxplot_draw_boxplot(&ch,
1738 box_centre, box_width,
1740 var_to_string(vars[i]) );
1744 chart_finalise(&ch);
1750 /* Plot the normal and detrended normal plots for m
1751 Label the plots with factorname */
1753 np_plot(const struct metrics *m, const char *factorname)
1756 double yfirst=0, ylast=0;
1759 struct chart np_chart;
1761 /* Detrended Normal Plot */
1762 struct chart dnp_chart;
1764 /* The slope and intercept of the ideal normal probability line */
1765 const double slope = 1.0 / m->stddev;
1766 const double intercept = - m->mean / m->stddev;
1768 /* Cowardly refuse to plot an empty data set */
1769 if ( m->n_data == 0 )
1772 chart_initialise(&np_chart);
1773 chart_write_title(&np_chart, _("Normal Q-Q Plot of %s"), factorname);
1774 chart_write_xlabel(&np_chart, _("Observed Value"));
1775 chart_write_ylabel(&np_chart, _("Expected Normal"));
1777 chart_initialise(&dnp_chart);
1778 chart_write_title(&dnp_chart, _("Detrended Normal Q-Q Plot of %s"),
1780 chart_write_xlabel(&dnp_chart, _("Observed Value"));
1781 chart_write_ylabel(&dnp_chart, _("Dev from Normal"));
1783 yfirst = gsl_cdf_ugaussian_Pinv (m->wvp[0]->rank / ( m->n + 1));
1784 ylast = gsl_cdf_ugaussian_Pinv (m->wvp[m->n_data-1]->rank / ( m->n + 1));
1788 /* Need to make sure that both the scatter plot and the ideal fit into the
1790 double x_lower = min(m->min, (yfirst - intercept) / slope) ;
1791 double x_upper = max(m->max, (ylast - intercept) / slope) ;
1792 double slack = (x_upper - x_lower) * 0.05 ;
1794 chart_write_xscale(&np_chart, x_lower - slack, x_upper + slack, 5);
1796 chart_write_xscale(&dnp_chart, m->min, m->max, 5);
1800 chart_write_yscale(&np_chart, yfirst, ylast, 5);
1803 /* We have to cache the detrended data, beacause we need to
1804 find its limits before we can plot it */
1806 d_data = xmalloc (m->n_data * sizeof(double));
1807 double d_max = -DBL_MAX;
1808 double d_min = DBL_MAX;
1809 for ( i = 0 ; i < m->n_data; ++i )
1811 const double ns = gsl_cdf_ugaussian_Pinv (m->wvp[i]->rank / ( m->n + 1));
1813 chart_datum(&np_chart, 0, m->wvp[i]->v.f, ns);
1815 d_data[i] = (m->wvp[i]->v.f - m->mean) / m->stddev - ns;
1817 if ( d_data[i] < d_min ) d_min = d_data[i];
1818 if ( d_data[i] > d_max ) d_max = d_data[i];
1820 chart_write_yscale(&dnp_chart, d_min, d_max, 5);
1822 for ( i = 0 ; i < m->n_data; ++i )
1823 chart_datum(&dnp_chart, 0, m->wvp[i]->v.f, d_data[i]);
1828 chart_line(&np_chart, slope, intercept, yfirst, ylast , CHART_DIM_Y);
1829 chart_line(&dnp_chart, 0, 0, m->min, m->max , CHART_DIM_X);
1831 chart_finalise(&np_chart);
1832 chart_finalise(&dnp_chart);
1839 /* Show the percentiles */
1841 show_percentiles(struct variable **dependent_var,
1843 struct factor *fctr)
1845 struct tab_table *tbl;
1851 struct hsh_table *ptiles ;
1853 int n_heading_columns;
1854 const int n_heading_rows = 2;
1855 const int n_stat_rows = 2;
1861 struct factor_statistics **fs = fctr->fs ;
1862 n_heading_columns = 3;
1863 n_factors = hsh_count(fctr->fstats);
1865 ptiles = (*fs)->m[0].ptile_hash;
1867 if ( fctr->indep_var[1] )
1868 n_heading_columns = 4;
1873 n_heading_columns = 2;
1875 ptiles = totals[0].ptile_hash;
1878 n_ptiles = hsh_count(ptiles);
1880 n_rows = n_heading_rows + n_dep_var * n_stat_rows * n_factors;
1882 n_cols = n_heading_columns + n_ptiles ;
1884 tbl = tab_create (n_cols, n_rows, 0);
1886 tab_headers (tbl, n_heading_columns + 1, 0, n_heading_rows, 0);
1888 tab_dim (tbl, tab_natural_dimensions);
1890 /* Outline the box and have no internal lines*/
1895 n_cols - 1, n_rows - 1);
1897 tab_hline (tbl, TAL_2, 0, n_cols - 1, n_heading_rows );
1899 tab_vline (tbl, TAL_2, n_heading_columns, 0, n_rows - 1);
1902 tab_title (tbl, 0, _("Percentiles"));
1905 tab_hline (tbl, TAL_1, n_heading_columns, n_cols - 1, 1 );
1912 n_heading_columns - 1, n_rows - 1);
1918 n_heading_columns, n_heading_rows - 1,
1919 n_cols - 1, n_rows - 1);
1921 tab_joint_text(tbl, n_heading_columns + 1, 0,
1923 TAB_CENTER | TAT_TITLE ,
1928 /* Put in the percentile break points as headings */
1930 struct percentile **p = (struct percentile **) hsh_sort(ptiles);
1935 tab_float(tbl, n_heading_columns + i++ , 1,
1944 for ( i = 0 ; i < n_dep_var ; ++i )
1946 const int n_stat_rows = 2;
1947 const int row = n_heading_rows + i * n_stat_rows * n_factors ;
1950 tab_hline(tbl, TAL_1, 0, n_cols - 1, row );
1953 i * n_stat_rows * n_factors + n_heading_rows,
1954 TAB_LEFT | TAT_TITLE,
1955 var_to_string(dependent_var[i])
1960 struct factor_statistics **fs = fctr->fs;
1963 tab_text (tbl, 1, n_heading_rows - 1,
1964 TAB_CENTER | TAT_TITLE,
1965 var_to_string(fctr->indep_var[0]));
1968 if ( fctr->indep_var[1])
1969 tab_text (tbl, 2, n_heading_rows - 1, TAB_CENTER | TAT_TITLE,
1970 var_to_string(fctr->indep_var[1]));
1975 static union value prev ;
1977 const int row = n_heading_rows + n_stat_rows *
1978 ( ( i * n_factors ) + count );
1981 if ( 0 != compare_values(&prev, &(*fs)->id[0],
1982 fctr->indep_var[0]->width))
1986 tab_hline (tbl, TAL_1, 1, n_cols - 1, row);
1990 TAB_LEFT | TAT_TITLE,
1991 value_to_string(&(*fs)->id[0], fctr->indep_var[0])
1997 prev = (*fs)->id[0];
1999 if (fctr->indep_var[1] && count > 0 )
2000 tab_hline(tbl, TAL_1, 2, n_cols - 1, row);
2002 if ( fctr->indep_var[1])
2003 tab_text (tbl, 2, row,
2004 TAB_LEFT | TAT_TITLE,
2005 value_to_string(&(*fs)->id[1], fctr->indep_var[1])
2009 populate_percentiles(tbl, n_heading_columns - 1,
2021 populate_percentiles(tbl, n_heading_columns - 1,
2022 i * n_stat_rows * n_factors + n_heading_rows,
2039 populate_percentiles(struct tab_table *tbl, int col, int row,
2040 const struct metrics *m)
2044 struct percentile **p = (struct percentile **) hsh_sort(m->ptile_hash);
2048 TAB_LEFT | TAT_TITLE,
2049 _("Tukey\'s Hinges")
2054 TAB_LEFT | TAT_TITLE,
2055 ptile_alg_desc[m->ptile_alg]
2062 tab_float(tbl, col + i + 1 , row,
2065 if ( (*p)->p == 25 )
2066 tab_float(tbl, col + i + 1 , row + 1,
2070 if ( (*p)->p == 50 )
2071 tab_float(tbl, col + i + 1 , row + 1,
2075 if ( (*p)->p == 75 )
2076 tab_float(tbl, col + i + 1 , row + 1,
2091 factor_to_string(const struct factor *fctr,
2092 struct factor_statistics *fs,
2093 const struct variable *var)
2096 static char buf1[100];
2102 sprintf(buf1, "%s (",var_to_string(var) );
2105 snprintf(buf2, 100, "%s = %s",
2106 var_to_string(fctr->indep_var[0]),
2107 value_to_string(&fs->id[0],fctr->indep_var[0]));
2111 if ( fctr->indep_var[1] )
2113 sprintf(buf2, "; %s = %s)",
2114 var_to_string(fctr->indep_var[1]),
2115 value_to_string(&fs->id[1],
2116 fctr->indep_var[1]));
2131 factor_to_string_concise(const struct factor *fctr,
2132 struct factor_statistics *fs)
2136 static char buf[100];
2140 snprintf(buf, 100, "%s",
2141 value_to_string(&fs->id[0], fctr->indep_var[0]));
2143 if ( fctr->indep_var[1] )
2145 sprintf(buf2, ",%s)", value_to_string(&fs->id[1], fctr->indep_var[1]) );