1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23 * Remember that histograms, bar charts need mean, stddev.
32 #include "bitvector.h"
49 #include "debug-print.h"
54 format=cond:condense/onepage(*n:onepage_limit,"%s>=0")/!standard,
55 table:limit(n:limit,"%s>0")/notable/!table,
56 labels:!labels/nolabels,
57 sort:!avalue/dvalue/afreq/dfreq,
58 spaces:!single/double,
59 paging:newpage/!oldpage;
60 missing=miss:include/!exclude;
61 barchart(ba_)=:minimum(d:min),
63 scale:freq(*n:freq,"%s>0")/percent(*n:pcnt,"%s>0");
64 histogram(hi_)=:minimum(d:min),
66 scale:freq(*n:freq,"%s>0")/percent(*n:pcnt,"%s>0"),
67 norm:!nonormal/normal,
68 incr:increment(d:inc,"%s>0");
69 hbar(hb_)=:minimum(d:min),
71 scale:freq(*n:freq,"%s>0")/percent(*n:pcnt,"%s>0"),
72 norm:!nonormal/normal,
73 incr:increment(d:inc,"%s>0");
77 statistics[st_]=1|mean,2|semean,3|median,4|mode,5|stddev,6|variance,
78 7|kurtosis,8|skewness,9|range,10|minimum,11|maximum,12|sum,
79 13|default,14|seskewness,15|sekurtosis,all,none.
84 /* Description of a statistic. */
87 int st_indx; /* Index into a_statistics[]. */
88 const char *s10; /* Identifying string. */
91 /* Table of statistics, indexed by dsc_*. */
92 static struct frq_info st_name[frq_n_stats + 1] =
94 {FRQ_ST_MEAN, N_("Mean")},
95 {FRQ_ST_SEMEAN, N_("S.E. Mean")},
96 {FRQ_ST_MEDIAN, N_("Median")},
97 {FRQ_ST_MODE, N_("Mode")},
98 {FRQ_ST_STDDEV, N_("Std Dev")},
99 {FRQ_ST_VARIANCE, N_("Variance")},
100 {FRQ_ST_KURTOSIS, N_("Kurtosis")},
101 {FRQ_ST_SEKURTOSIS, N_("S.E. Kurt")},
102 {FRQ_ST_SKEWNESS, N_("Skewness")},
103 {FRQ_ST_SESKEWNESS, N_("S.E. Skew")},
104 {FRQ_ST_RANGE, N_("Range")},
105 {FRQ_ST_MINIMUM, N_("Minimum")},
106 {FRQ_ST_MAXIMUM, N_("Maximum")},
107 {FRQ_ST_SUM, N_("Sum")},
111 /* Percentiles to calculate. */
112 static double *percentiles=0;
113 static double *percentile_values=0;
114 static int n_percentiles=0;
116 /* Groups of statistics. */
118 #define frq_default \
119 (BI (frq_mean) | BI (frq_stddev) | BI (frq_min) | BI (frq_max))
121 (BI (frq_sum) | BI(frq_min) | BI(frq_max) \
122 | BI(frq_mean) | BI(frq_semean) | BI(frq_stddev) \
123 | BI(frq_variance) | BI(frq_kurt) | BI(frq_sekurt) \
124 | BI(frq_skew) | BI(frq_seskew) | BI(frq_range) \
125 | BI(frq_range) | BI(frq_mode) | BI(frq_median))
127 /* Statistics; number of statistics. */
128 static unsigned long stats;
131 /* Types of graphs. */
134 GFT_NONE, /* Don't draw graphs. */
135 GFT_BAR, /* Draw bar charts. */
136 GFT_HIST, /* Draw histograms. */
137 GFT_HBAR /* Draw bar charts or histograms at our discretion. */
140 /* Parsed command. */
141 static struct cmd_frequencies cmd;
143 /* Summary of the barchart, histogram, and hbar subcommands. */
144 static int chart; /* NONE/BAR/HIST/HBAR. */
145 static double min, max; /* Minimum, maximum on y axis. */
146 static int format; /* FREQ/PERCENT: Scaling of y axis. */
147 static double scale, incr; /* FIXME */
148 static int normal; /* FIXME */
150 /* Variables for which to calculate statistics. */
151 static int n_variables;
152 static struct variable **v_variables;
154 /* Arenas used to store semi-permanent storage. */
155 static struct pool *int_pool; /* Integer mode. */
156 static struct pool *gen_pool; /* General mode. */
158 /* Easier access to a_statistics. */
159 #define stat cmd.a_statistics
161 static void determine_charts (void);
163 static void precalc (void);
164 static int calc_weighting (struct ccase *);
165 static int calc_no_weight (struct ccase *);
166 static void postcalc (void);
168 static void postprocess_freq_tab (struct variable *);
169 static void dump_full (struct variable *);
170 static void dump_condensed (struct variable *);
171 static void dump_statistics (struct variable *, int show_varname);
172 static void cleanup_freq_tab (struct variable *);
174 static int compare_value_numeric_a (const void *, const void *, void *);
175 static int compare_value_alpha_a (const void *, const void *, void *);
176 static int compare_value_numeric_d (const void *, const void *, void *);
177 static int compare_value_alpha_d (const void *, const void *, void *);
178 static int compare_freq_numeric_a (const void *, const void *, void *);
179 static int compare_freq_alpha_a (const void *, const void *, void *);
180 static int compare_freq_numeric_d (const void *, const void *, void *);
181 static int compare_freq_alpha_d (const void *, const void *, void *);
183 /* Parser and outline. */
185 static int internal_cmd_frequencies (void);
188 cmd_frequencies (void)
192 int_pool = pool_create ();
193 result = internal_cmd_frequencies ();
194 pool_destroy (int_pool);
196 pool_destroy (gen_pool);
204 internal_cmd_frequencies (void)
206 int (*calc) (struct ccase *);
215 for (i = 0; i < default_dict.nvar; i++)
216 default_dict.var[i]->foo = 0;
218 lex_match_id ("FREQUENCIES");
219 if (!parse_frequencies (&cmd))
222 if (cmd.onepage_limit == NOT_LONG)
223 cmd.onepage_limit = 50;
225 /* Figure out statistics to calculate. */
227 if (stat[FRQ_ST_DEFAULT] || !cmd.sbc_statistics)
228 stats |= frq_default;
229 if (stat[FRQ_ST_ALL])
231 if (cmd.sort != FRQ_AVALUE && cmd.sort != FRQ_DVALUE)
232 stats &= ~frq_median;
233 for (i = 0; i < frq_n_stats; i++)
234 if (stat[st_name[i].st_indx])
235 stats |= BIT_INDEX (i);
236 if (stats & frq_kurt)
238 if (stats & frq_skew)
241 /* Calculate n_stats. */
243 for (i = 0; i < frq_n_stats; i++)
244 if ((stats & BIT_INDEX (i)))
249 if (chart != GFT_NONE || cmd.sbc_ntiles)
250 cmd.sort = FRQ_AVALUE;
253 update_weighting (&default_dict);
254 calc = default_dict.weight_index == -1 ? calc_no_weight : calc_weighting;
255 procedure (precalc, calc, postcalc);
260 /* Figure out which charts the user requested. */
262 determine_charts (void)
264 int count = (!!cmd.sbc_histogram) + (!!cmd.sbc_barchart) + (!!cmd.sbc_hbar);
274 msg (SW, _("At most one of BARCHART, HISTOGRAM, or HBAR should be "
275 "given. HBAR will be assumed. Argument values will be "
276 "given precedence increasing along the order given."));
278 else if (cmd.sbc_histogram)
280 else if (cmd.sbc_barchart)
291 if (cmd.sbc_barchart)
293 if (cmd.ba_min != SYSMIS)
295 if (cmd.ba_max != SYSMIS)
297 if (cmd.ba_scale == FRQ_FREQ)
302 else if (cmd.ba_scale == FRQ_PERCENT)
304 format = FRQ_PERCENT;
309 if (cmd.sbc_histogram)
311 if (cmd.hi_min != SYSMIS)
313 if (cmd.hi_max != SYSMIS)
315 if (cmd.hi_scale == FRQ_FREQ)
320 else if (cmd.hi_scale == FRQ_PERCENT)
322 format = FRQ_PERCENT;
327 if (cmd.hi_incr == FRQ_INCREMENT)
333 if (cmd.hb_min != SYSMIS)
335 if (cmd.hb_max != SYSMIS)
337 if (cmd.hb_scale == FRQ_FREQ)
342 else if (cmd.hb_scale == FRQ_PERCENT)
344 format = FRQ_PERCENT;
349 if (cmd.hb_incr == FRQ_INCREMENT)
353 if (min != SYSMIS && max != SYSMIS && min >= max)
355 msg (SE, _("MAX must be greater than or equal to MIN, if both are "
356 "specified. However, MIN was specified as %g and MAX as %g. "
357 "MIN and MAX will be ignored."), min, max);
362 /* Generate each calc_*(). */
364 #include "frequencies.g"
367 #include "frequencies.g"
369 /* Prepares each variable that is the target of FREQUENCIES by setting
370 up its hash table. */
376 pool_destroy (gen_pool);
377 gen_pool = pool_create ();
379 for (i = 0; i < n_variables; i++)
381 struct variable *v = v_variables[i];
383 if (v->p.frq.tab.mode == FRQM_GENERAL)
385 avl_comparison_func compare;
386 if (v->type == NUMERIC)
387 compare = compare_value_numeric_a;
389 compare = compare_value_alpha_a;
390 v->p.frq.tab.tree = avl_create (gen_pool, compare,
392 v->p.frq.tab.n_missing = 0;
398 for (j = (v->p.frq.tab.max - v->p.frq.tab.min); j >= 0; j--)
399 v->p.frq.tab.vector[j] = 0.0;
400 v->p.frq.tab.out_of_range = 0.0;
401 v->p.frq.tab.sysmis = 0.0;
406 /* Finishes up with the variables after frequencies have been
407 calculated. Displays statistics, percentiles, ... */
413 for (i = 0; i < n_variables; i++)
415 struct variable *v = v_variables[i];
417 int dumped_freq_tab = 1;
419 postprocess_freq_tab (v);
421 /* Frequencies tables. */
422 n_categories = v->p.frq.tab.n_valid + v->p.frq.tab.n_missing;
423 if (cmd.table == FRQ_TABLE
424 || (cmd.table == FRQ_LIMIT && n_categories <= cmd.limit))
434 if (n_categories > cmd.onepage_limit)
447 dump_statistics (v, !dumped_freq_tab);
449 cleanup_freq_tab (v);
453 /* Comparison function called by comparison_helper(). */
454 static avl_comparison_func comparison_func;
456 /* Passed to comparison function by comparison_helper(). */
457 static void *comparison_param;
459 /* Used by postprocess_freq_tab to re-sort frequency tables. */
461 comparison_helper (const void *a, const void *b)
463 return comparison_func (&((struct freq *) a)->v,
464 &((struct freq *) b)->v, comparison_param);
467 /* Used by postprocess_freq_tab to construct the array members valid,
468 missing of freq_tab. */
470 add_freq (void *data, void *param)
472 struct freq *f = data;
473 struct variable *v = param;
475 v->p.frq.tab.total_cases += f->c;
477 if ((v->type == NUMERIC && f->v.f == SYSMIS)
478 || (cmd.miss == FRQ_EXCLUDE && is_user_missing (&f->v, v)))
480 *v->p.frq.tab.missing++ = *f;
481 v->p.frq.tab.valid_cases -= f->c;
484 *v->p.frq.tab.valid++ = *f;
488 postprocess_freq_tab (struct variable * v)
490 avl_comparison_func compare;
492 switch (cmd.sort | (v->type << 16))
494 /* Note that q2c generates tags beginning with 1000. */
495 case FRQ_AVALUE | (NUMERIC << 16):
498 case FRQ_AVALUE | (ALPHA << 16):
501 case FRQ_DVALUE | (NUMERIC << 16):
502 comparison_func = compare_value_numeric_d;
504 case FRQ_DVALUE | (ALPHA << 16):
505 compare = compare_value_alpha_d;
507 case FRQ_AFREQ | (NUMERIC << 16):
508 compare = compare_freq_numeric_a;
510 case FRQ_AFREQ | (ALPHA << 16):
511 compare = compare_freq_alpha_a;
513 case FRQ_DFREQ | (NUMERIC << 16):
514 compare = compare_freq_numeric_d;
516 case FRQ_DFREQ | (ALPHA << 16):
517 compare = compare_freq_alpha_d;
522 comparison_func = compare;
524 if (v->p.frq.tab.mode == FRQM_GENERAL)
527 struct freq_tab *ft = &v->p.frq.tab;
529 total = avl_count (ft->tree);
530 ft->n_valid = total - ft->n_missing;
531 ft->valid = xmalloc (sizeof (struct freq) * total);
532 ft->missing = &ft->valid[ft->n_valid];
533 ft->valid_cases = ft->total_cases = 0.0;
535 avl_walk (ft->tree, add_freq, (void *) v);
537 ft->valid -= ft->n_valid;
538 ft->missing -= ft->n_missing;
539 ft->valid_cases += ft->total_cases;
543 qsort (ft->valid, ft->n_valid, sizeof (struct freq), comparison_helper);
544 qsort (ft->missing, ft->n_missing, sizeof (struct freq), comparison_helper);
552 cleanup_freq_tab (struct variable * v)
554 if (v->p.frq.tab.mode == FRQM_GENERAL)
556 struct freq_tab *ft = &v->p.frq.tab;
564 /* Parses the VARIABLES subcommand, adding to
565 {n_variables,v_variables}. */
567 frq_custom_variables (struct cmd_frequencies *cmd unused)
572 int old_n_variables = n_variables;
576 if (token != T_ALL && (token != T_ID || !is_varname (tokid)))
579 if (!parse_variables (NULL, &v_variables, &n_variables,
580 PV_APPEND | PV_NO_SCRATCH))
583 for (i = old_n_variables; i < n_variables; i++)
584 v_variables[i]->p.frq.tab.mode = FRQM_GENERAL;
586 if (!lex_match ('('))
591 if (!lex_force_int ())
593 min = lex_integer ();
595 if (!lex_force_match (','))
597 if (!lex_force_int ())
599 max = lex_integer ();
601 if (!lex_force_match (')'))
605 msg (SE, _("Upper limit of integer mode value range must be "
606 "greater than lower limit."));
611 for (i = old_n_variables; i < n_variables; i++)
613 struct variable *v = v_variables[i];
617 msg (SE, _("Variable %s specified multiple times on VARIABLES "
618 "subcommand."), v->name);
622 v->foo = 1; /* Used simply as a marker. */
624 v->p.frq.tab.valid = v->p.frq.tab.missing = NULL;
626 if (mode == FRQM_INTEGER)
628 if (v->type != NUMERIC)
630 msg (SE, _("Integer mode specified, but %s is not a numeric "
631 "variable."), v->name);
635 v->p.frq.tab.min = min;
636 v->p.frq.tab.max = max;
637 v->p.frq.tab.vector = pool_alloc (int_pool,
638 sizeof (struct freq) * (max - min + 1));
641 v->p.frq.tab.vector = NULL;
643 v->p.frq.n_groups = 0;
644 v->p.frq.groups = NULL;
649 /* Parses the GROUPED subcommand, setting the frq.{n_grouped,grouped}
650 fields of specified variables. */
652 frq_custom_grouped (struct cmd_frequencies *cmd unused)
655 if ((token == T_ID && is_varname (tokid)) || token == T_ID)
660 /* Max, current size of list; list itself. */
668 if (!parse_variables (NULL, &v, &n, PV_NO_DUPLICATE | PV_NUMERIC))
674 while (token == T_NUM)
679 dl = pool_realloc (int_pool, dl, ml * sizeof (double));
685 /* Note that nl might still be 0 and dl might still be
686 NULL. That's okay. */
687 if (!lex_match (')'))
690 msg (SE, _("`)' expected after GROUPED interval list."));
697 for (i = 0; i < n; i++)
700 msg (SE, _("Variables %s specified on GROUPED but not on "
701 "VARIABLES."), v[i]->name);
702 if (v[i]->p.frq.groups != NULL)
703 msg (SE, _("Variables %s specified multiple times on GROUPED "
704 "subcommand."), v[i]->name);
707 v[i]->p.frq.n_groups = nl;
708 v[i]->p.frq.groups = dl;
712 if (!lex_match ('/'))
714 if ((token != T_ID || !is_varname (tokid)) && token != T_ALL)
724 /* Adds X to the list of percentiles, keeping the list in proper
727 add_percentile (double x)
731 for (i = 0; i < n_percentiles; i++)
732 if (x <= percentiles[i])
734 if (i >= n_percentiles || tokval != percentiles[i])
736 percentiles = pool_realloc (int_pool, percentiles,
737 (n_percentiles + 1) * sizeof (double));
738 percentile_values = pool_realloc (int_pool, percentile_values,
739 (n_percentiles + 1) * sizeof (double));
741 if (i < n_percentiles)
743 memmove (&percentiles[i + 1], &percentiles[i],
744 (n_percentiles - i) * sizeof (double));
745 memmove (&percentile_values[i + 1], &percentile_values[i],
746 (n_percentiles - i) * sizeof (double));
754 /* Parses the PERCENTILES subcommand, adding user-specified
755 percentiles to the list. */
757 frq_custom_percentiles (struct cmd_frequencies *cmd unused)
762 msg (SE, _("Percentile list expected after PERCENTILES."));
768 if (tokval <= 0 || tokval >= 100)
770 msg (SE, _("Percentiles must be greater than "
771 "0 and less than 100."));
775 add_percentile (tokval / 100.0);
779 while (token == T_NUM);
783 /* Parses the NTILES subcommand, adding the percentiles that
784 correspond to the specified evenly-distributed ntiles. */
786 frq_custom_ntiles (struct cmd_frequencies *cmd unused)
791 if (!lex_force_int ())
793 for (i = 1; i < lex_integer (); i++)
794 add_percentile (1.0 / lex_integer () * i);
799 /* Comparison functions. */
801 /* Ascending numeric compare of values. */
803 compare_value_numeric_a (const void *a, const void *b, void *foo unused)
805 return approx_compare (((struct freq *) a)->v.f, ((struct freq *) b)->v.f);
808 /* Ascending string compare of values. */
810 compare_value_alpha_a (const void *a, const void *b, void *len)
812 return memcmp (((struct freq *) a)->v.s, ((struct freq *) b)->v.s, (int) len);
815 /* Descending numeric compare of values. */
817 compare_value_numeric_d (const void *a, const void *b, void *foo unused)
819 return approx_compare (((struct freq *) b)->v.f, ((struct freq *) a)->v.f);
822 /* Descending string compare of values. */
824 compare_value_alpha_d (const void *a, const void *b, void *len)
826 return memcmp (((struct freq *) b)->v.s, ((struct freq *) a)->v.s, (int) len);
829 /* Ascending numeric compare of frequency;
830 secondary key on ascending numeric value. */
832 compare_freq_numeric_a (const void *a, const void *b, void *foo unused)
834 int x = approx_compare (((struct freq *) a)->c, ((struct freq *) b)->c);
835 return x ? x : approx_compare (((struct freq *) a)->v.f, ((struct freq *) b)->v.f);
838 /* Ascending numeric compare of frequency;
839 secondary key on ascending string value. */
841 compare_freq_alpha_a (const void *a, const void *b, void *len)
843 int x = approx_compare (((struct freq *) a)->c, ((struct freq *) b)->c);
844 return x ? x : memcmp (((struct freq *) a)->v.s, ((struct freq *) b)->v.s, (int) len);
847 /* Descending numeric compare of frequency;
848 secondary key on ascending numeric value. */
850 compare_freq_numeric_d (const void *a, const void *b, void *foo unused)
852 int x = approx_compare (((struct freq *) b)->c, ((struct freq *) a)->c);
853 return x ? x : approx_compare (((struct freq *) a)->v.f, ((struct freq *) b)->v.f);
856 /* Descending numeric compare of frequency;
857 secondary key on ascending string value. */
859 compare_freq_alpha_d (const void *a, const void *b, void *len)
861 int x = approx_compare (((struct freq *) b)->c, ((struct freq *) a)->c);
862 return x ? x : memcmp (((struct freq *) a)->v.s, ((struct freq *) b)->v.s, (int) len);
865 /* Frequency table display. */
867 /* Sets the widths of all the columns and heights of all the rows in
868 table T for driver D. */
870 full_dim (struct tab_table *t, struct outp_driver *d)
872 int lab = cmd.labels == FRQ_LABELS;
876 t->w[0] = min (tab_natural_width (t, d, 0), d->prop_em_width * 15);
877 for (i = lab; i < lab + 5; i++)
878 t->w[i] = max (tab_natural_width (t, d, i), d->prop_em_width * 8);
879 for (i = 0; i < t->nr; i++)
880 t->h[i] = d->font_height;
883 /* Displays a full frequency table for variable V. */
885 dump_full (struct variable * v)
891 double cum_percent = 0.0;
892 double cum_freq = 0.0;
902 static struct init vec[] =
907 {2, 1, N_("Frequency")},
908 {3, 1, N_("Percent")},
909 {4, 1, N_("Percent")},
910 {5, 1, N_("Percent")},
918 int lab = cmd.labels == FRQ_LABELS;
920 n_categories = v->p.frq.tab.n_valid + v->p.frq.tab.n_missing;
921 t = tab_create (5 + lab, n_categories + 3, 0);
922 tab_headers (t, 0, 0, 2, 0);
923 tab_dim (t, full_dim);
926 tab_text (t, 0, 1, TAB_CENTER | TAT_TITLE, _("Value Label"));
927 for (p = vec; p->s; p++)
928 tab_text (t, p->c - (p->r ? !lab : 0), p->r,
929 TAB_CENTER | TAT_TITLE, gettext (p->s));
932 for (f = v->p.frq.tab.valid; f < v->p.frq.tab.missing; f++)
934 double percent, valid_percent;
938 percent = f->c / v->p.frq.tab.total_cases * 100.0;
939 valid_percent = f->c / v->p.frq.tab.valid_cases * 100.0;
940 cum_percent += valid_percent;
944 char *label = get_val_lab (v, f->v, 0);
946 tab_text (t, 0, r, TAB_LEFT, label);
949 tab_value (t, 0 + lab, r, TAB_NONE, &f->v, &v->print);
950 tab_float (t, 1 + lab, r, TAB_NONE, f->c, 8, 0);
951 tab_float (t, 2 + lab, r, TAB_NONE, percent, 5, 1);
952 tab_float (t, 3 + lab, r, TAB_NONE, valid_percent, 5, 1);
953 tab_float (t, 4 + lab, r, TAB_NONE, cum_percent, 5, 1);
956 for (; f < &v->p.frq.tab.valid[n_categories]; f++)
962 char *label = get_val_lab (v, f->v, 0);
964 tab_text (t, 0, r, TAB_LEFT, label);
967 tab_value (t, 0 + lab, r, TAB_NONE, &f->v, &v->print);
968 tab_float (t, 1 + lab, r, TAB_NONE, f->c, 8, 0);
969 tab_float (t, 2 + lab, r, TAB_NONE,
970 f->c / v->p.frq.tab.total_cases * 100.0, 5, 1);
971 tab_text (t, 3 + lab, r, TAB_NONE, _("Missing"));
975 tab_box (t, TAL_1, TAL_1,
976 cmd.spaces == FRQ_SINGLE ? -1 : (TAL_1 | TAL_SPACING), TAL_1,
978 tab_hline (t, TAL_2, 0, 4 + lab, 2);
979 tab_hline (t, TAL_2, 0, 4 + lab, r);
980 tab_joint_text (t, 0, r, 0 + lab, r, TAB_RIGHT | TAT_TITLE, _("Total"));
981 tab_vline (t, TAL_0, 1, r, r);
982 tab_float (t, 1 + lab, r, TAB_NONE, cum_freq, 8, 0);
983 tab_float (t, 2 + lab, r, TAB_NONE, 100.0, 5, 1);
984 tab_float (t, 3 + lab, r, TAB_NONE, 100.0, 5, 1);
986 tab_title (t, 1, "%s: %s", v->name, v->label ? v->label : "");
990 /* Sets the widths of all the columns and heights of all the rows in
991 table T for driver D. */
993 condensed_dim (struct tab_table *t, struct outp_driver *d)
995 int cum_w = max (outp_string_width (d, _("Cum")),
996 max (outp_string_width (d, _("Cum")),
997 outp_string_width (d, "000")));
1001 for (i = 0; i < 2; i++)
1002 t->w[i] = max (tab_natural_width (t, d, i), d->prop_em_width * 8);
1003 for (i = 2; i < 4; i++)
1005 for (i = 0; i < t->nr; i++)
1006 t->h[i] = d->font_height;
1009 /* Display condensed frequency table for variable V. */
1011 dump_condensed (struct variable * v)
1015 struct tab_table *t;
1017 double cum_percent = 0.0;
1019 n_categories = v->p.frq.tab.n_valid + v->p.frq.tab.n_missing;
1020 t = tab_create (4, n_categories + 2, 0);
1022 tab_headers (t, 0, 0, 2, 0);
1023 tab_text (t, 0, 1, TAB_CENTER | TAT_TITLE, _("Value"));
1024 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Freq"));
1025 tab_text (t, 2, 1, TAB_CENTER | TAT_TITLE, _("Pct"));
1026 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Cum"));
1027 tab_text (t, 3, 1, TAB_CENTER | TAT_TITLE, _("Pct"));
1028 tab_dim (t, condensed_dim);
1031 for (f = v->p.frq.tab.valid; f < v->p.frq.tab.missing; f++)
1035 percent = f->c / v->p.frq.tab.total_cases * 100.0;
1036 cum_percent += f->c / v->p.frq.tab.valid_cases * 100.0;
1038 tab_value (t, 0, r, TAB_NONE, &f->v, &v->print);
1039 tab_float (t, 1, r, TAB_NONE, f->c, 8, 0);
1040 tab_float (t, 2, r, TAB_NONE, percent, 3, 0);
1041 tab_float (t, 3, r, TAB_NONE, cum_percent, 3, 0);
1044 for (; f < &v->p.frq.tab.valid[n_categories]; f++)
1046 tab_value (t, 0, r, TAB_NONE, &f->v, &v->print);
1047 tab_float (t, 1, r, TAB_NONE, f->c, 8, 0);
1048 tab_float (t, 2, r, TAB_NONE,
1049 f->c / v->p.frq.tab.total_cases * 100.0, 3, 0);
1053 tab_box (t, TAL_1, TAL_1,
1054 cmd.spaces == FRQ_SINGLE ? -1 : (TAL_1 | TAL_SPACING), TAL_1,
1056 tab_hline (t, TAL_2, 0, 3, 2);
1057 tab_title (t, 1, "%s: %s", v->name, v->label ? v->label : "");
1058 tab_columns (t, SOM_COL_DOWN, 1);
1062 /* Statistical display. */
1064 /* Calculates all the pertinent statistics for variable V, putting
1065 them in array D[]. FIXME: This could be made much more optimal. */
1067 calc_stats (struct variable * v, double d[frq_n_stats])
1069 double W = v->p.frq.tab.valid_cases;
1070 double X_bar, X_mode, M2, M3, M4;
1074 double cum_percent=0;
1076 double previous_value=SYSMIS;
1080 /* Calculate the mean and mode */
1084 for (f = v->p.frq.tab.valid; f < v->p.frq.tab.missing; f++)
1088 cum_percent += f->c / v->p.frq.tab.valid_cases ;
1091 for(;i < n_percentiles ; ++i)
1095 if (cum_percent <= percentiles[i])
1098 percentile_values[i]=previous_value;
1104 X_bar += f->v.f * f->c;
1107 if (most_often < f->c )
1112 else if ( most_often == f->c )
1114 /* if there are 2 values , then mode is undefined */
1118 previous_value=f->v.f;
1122 /* Calculate moments about the mean. */
1124 for (f = v->p.frq.tab.valid; f < v->p.frq.tab.missing; f++)
1126 double dev = f->v.f - X_bar;
1136 /* Formulas below are taken from _SPSS Statistical Algorithms_. */
1137 d[frq_min] = v->p.frq.tab.valid[0].v.f;
1138 d[frq_max] = v->p.frq.tab.missing[-1].v.f;
1139 d[frq_mode] = X_mode;
1140 d[frq_range] = d[frq_max] - d[frq_min];
1141 d[frq_median] = SYSMIS;
1142 d[frq_mean] = X_bar;
1143 d[frq_sum] = X_bar * W;
1144 d[frq_variance] = M2 / (W - 1);
1145 d[frq_stddev] = sqrt (d[frq_variance]);
1146 d[frq_semean] = d[frq_stddev] / sqrt (W);
1147 if (W >= 3.0 && d[frq_variance] > 0)
1149 double S = d[frq_stddev];
1150 d[frq_skew] = (W * M3 / ((W - 1.0) * (W - 2.0) * S * S * S));
1151 d[frq_seskew] = sqrt (6.0 * W * (W - 1.0)
1152 / ((W - 2.0) * (W + 1.0) * (W + 3.0)));
1156 d[frq_skew] = d[frq_seskew] = SYSMIS;
1158 if (W >= 4.0 && d[frq_variance] > 0)
1160 double S2 = d[frq_variance];
1161 double SE_g1 = d[frq_seskew];
1163 d[frq_kurt] = ((W * (W + 1.0) * M4 - 3.0 * M2 * M2 * (W - 1.0))
1164 / ((W - 1.0) * (W - 2.0) * (W - 3.0) * S2 * S2));
1165 d[frq_sekurt] = sqrt ((4.0 * (W * W - 1.0) * SE_g1 * SE_g1)
1166 / ((W - 3.0) * (W + 5.0)));
1170 d[frq_kurt] = d[frq_sekurt] = SYSMIS;
1174 /* Displays a table of all the statistics requested for variable V. */
1176 dump_statistics (struct variable * v, int show_varname)
1178 double stat_value[frq_n_stats];
1179 struct tab_table *t;
1182 if (v->type == ALPHA)
1184 if (v->p.frq.tab.n_valid == 0)
1186 msg (SW, _("No valid data for variable %s; statistics not displayed."),
1190 calc_stats (v, stat_value);
1192 t = tab_create (2, n_stats + n_percentiles, 0);
1193 tab_dim (t, tab_natural_dimensions);
1194 tab_vline (t, TAL_1 | TAL_SPACING, 1, 0, n_stats - 1);
1195 for (i = r = 0; i < frq_n_stats; i++)
1196 if (stats & BIT_INDEX (i))
1198 tab_text (t, 0, r, TAB_LEFT | TAT_TITLE,
1199 gettext (st_name[i].s10));
1200 tab_float (t, 1, r, TAB_NONE, stat_value[i], 11, 3);
1204 for ( i=0 ; i < n_percentiles ; ++i,++r ) {
1207 ds_init(gen_pool, &ds, 20 );
1209 ds_printf(&ds,"%s %d",_("Percentile"),(int)(percentiles[i]*100));
1212 tab_text(t,0,r, TAB_LEFT | TAT_TITLE, ds.string);
1213 tab_float(t,1,r,TAB_NONE,percentile_values[i],11,3);
1218 tab_columns (t, SOM_COL_DOWN, 1);
1222 tab_title (t, 1, "%s: %s", v->name, v->label);
1224 tab_title (t, 0, v->name);
1227 tab_flags (t, SOMF_NO_TITLE);
1233 /* Statistical calculation. */
1235 static int degree[6];
1236 static int maxdegree, minmax;
1238 static void stat_func (struct freq *, VISIT, int);
1239 static void calc_stats (int);
1240 static void display_stats (int);
1242 /* mapping of data[]:
1246 * index 3: number of modes found (detects multiple modes)
1247 * index 4: number of nodes processed, for calculation of median
1251 * index 0-3: sum of X**i
1256 * index 8: number of cases, valid and missing
1257 * index 9: number of valid cases
1258 * index 10: maximum frequency found, for calculation of mode
1259 * index 11: maximum frequency
1266 if (cur_var->type == ALPHA)
1268 for (j = 0; j < 8; j++)
1269 cur_var->dbl[j] = 0.;
1270 cur_var->dbl[10] = 0;
1271 cur_var->dbl[4] = DBL_MAX;
1272 cur_var->dbl[5] = -DBL_MAX;
1273 for (j = 2; j < 5; j++)
1274 cur_var->data[j] = 0;
1275 cur_var->p.frq.median_ncases = cur_var->p.frq.t.valid_cases / 2;
1276 avlwalk (cur_var->p.frq.t.f, stat_func, LEFT_TO_RIGHT);
1289 n = v->p.frq.t.valid_cases;
1292 if (n < 2 || (n < 3 && stat[FRQ_ST_7]))
1294 warn (_("only %g case%s for variable %s, statistics not "
1295 "computed"), n, n == 1 ? "" : "s", v->name);
1299 v->res[FRQ_ST_9] = d[5] - d[4];
1300 if (stat[FRQ_ST_10])
1301 v->res[FRQ_ST_10] = d[4];
1302 if (stat[FRQ_ST_11])
1303 v->res[FRQ_ST_11] = d[5];
1304 if (stat[FRQ_ST_12])
1305 v->res[FRQ_ST_12] = d[0];
1306 if (stat[FRQ_ST_1] || stat[FRQ_ST_2] || stat[FRQ_ST_5] || stat[FRQ_ST_6] || stat[FRQ_ST_7])
1308 v->res[FRQ_ST_1] = calc_mean (d, n);
1309 v->res[FRQ_ST_6] = calc_variance (d, n);
1311 if (stat[FRQ_ST_2] || stat[FRQ_ST_5] || stat[FRQ_ST_7])
1312 v->res[FRQ_ST_5] = calc_stddev (v->res[FRQ_ST_6]);
1314 v->res[FRQ_ST_2] = calc_semean (v->res[FRQ_ST_5], n);
1317 v->res[FRQ_ST_7] = calc_kurt (d, n, v->res[FRQ_ST_6]);
1318 v->res[FRQ_ST_14] = calc_sekurt (n);
1322 v->res[FRQ_ST_8] = calc_skew (d, n, v->res[FRQ_ST_5]);
1323 v->res[FRQ_ST_15] = calc_seskew (n);
1325 if (stat[FRQ_ST_MODE])
1327 v->res[FRQ_ST_MODE] = v->dbl[6];
1329 warn (_("The variable %s has %d modes. The lowest of these "
1330 "is the one given in the table."), v->name, v->data[3]);
1332 if (stat[FRQ_ST_MEDIAN])
1333 v->res[FRQ_ST_MEDIAN] = v->dbl[7];
1337 stat_func (struct freq * x, VISIT order, int param)
1341 if (order != INORDER)
1344 cur_var->dbl[0] += (d * x->c);
1349 cur_var->dbl[1] += (f * x->c);
1353 cur_var->dbl[1] += (f * x->c);
1355 cur_var->dbl[2] += (f * x->c);
1359 cur_var->dbl[1] += (f * x->c);
1361 cur_var->dbl[2] += (f * x->c);
1363 cur_var->dbl[3] += (f * x->c);
1368 if (d < cur_var->dbl[4])
1369 cur_var->dbl[4] = d;
1370 if (d > cur_var->dbl[5])
1371 cur_var->dbl[5] = d;
1373 if (x->c > cur_var->dbl[10])
1375 cur_var->data[3] = 1;
1376 cur_var->dbl[10] = x->c;
1377 cur_var->dbl[6] = x->v.f;
1379 else if (x->c == cur_var->dbl[10])
1381 if (cur_var->data[4] < cur_var->p.frq.median_ncases
1382 && cur_var->data[4] + x->c >= cur_var->p.frq.median_ncases)
1383 cur_var->dbl[7] = x->v.f;
1384 cur_var->data[4] += x->c;
1387 /* Statistical display. */
1388 static int column, ncolumns;
1390 static void outstat (char *, double);
1393 display_stats (int i)
1400 ncolumns = (margin_width + 3) / 26;
1403 nlines = sc / ncolumns + (sc % ncolumns > 0);
1404 if (nlines == 2 && sc == 4)
1406 if (nlines == 3 && sc == 9)
1408 if (nlines == 4 && sc == 12)
1411 for (sp = st_name; sp->s != -1; sp++)
1412 if (stat[sp->s] == 1)
1413 outstat (gettext (sp->s10), v->res[sp->s]);
1420 outstat (char *label, double value)
1430 memset (buf, ' ', 3);
1434 n = nsprintf (cp, "%-10s %12.4f", label, value);
1435 while (n > 23 && dw > 0)
1436 n = nsprintf (cp, "%-10s %12.*f", label, --dw, value);
1439 if (column == ncolumns)
1448 static rect pb, gb; /* Page border, graph border. */
1449 static int px, py; /* Page width, height. */
1450 static int ix, iy; /* Inch width, height. */
1452 static void draw_bar_chart (int);
1453 static void draw_histogram (int);
1454 static int scale_dep_axis (int);
1462 if (avlcount (cur_var->p.frq.t.f) < 2
1463 || (chart == HIST && v_variables[i]->type == ALPHA))
1465 if (driver_id && set_highres == 1)
1469 graf_page_size (&px, &py, &ix, &iy);
1472 /* Calculate borders. */
1479 gb.x2 = pb.x2 - ix / 2;
1483 graf_frame_rect (COMPONENTS (pb));
1484 graf_frame_rect (COMPONENTS (gb));
1486 /* Draw axis labels. */
1487 graf_font_size (iy / 4); /* 18-point text */
1488 text = format == PERCENT ? _("Percentage") : _("Frequency");
1489 graf_text (pb.x1 + max (ix, iy) / 4 + max (ix, iy) / 16, gb.y2, text,
1491 text = v->label ? v->label : v->name;
1492 graf_text (gb.x1, pb.y2 - iy / 4, text, UPRIGHT);
1494 /* Draw axes, chart proper. */
1497 && (avlcount (cur_var->p.frq.t.f) || v_variables[i]->type == ALPHA)))
1504 if (set_lowres == 1 || (set_lowres == 2 && (!driver_id || !set_highres)))
1508 /* Do character-based graphs. */
1511 warn (_("low-res graphs not implemented"));
1517 #if __GNUC__ && !__CHECKER__
1518 #define BIG_TYPE long long
1519 #else /* !__GNUC__ */
1520 #define BIG_TYPE double
1521 #endif /* !__GNUC__ */
1524 draw_bar_chart (int i)
1526 int bar_width, bar_spacing;
1531 AVLtraverser *t = NULL;
1533 w = (px - ix * 7 / 2) / avlcount (cur_var->p.frq.t.f);
1534 bar_width = w * 2 / 3;
1535 bar_spacing = w - bar_width;
1537 #if !ALLOW_HUGE_BARS
1538 if (bar_width > ix / 2)
1540 #endif /* !ALLOW_HUGE_BARS */
1542 max = scale_dep_axis (cur_var->p.frq.t.max_freq);
1545 r.x1 = gb.x1 + bar_spacing / 2;
1546 r.x2 = r.x1 + bar_width;
1548 graf_fill_color (255, 0, 0);
1549 for (f = avltrav (cur_var->p.frq.t.f, &t); f;
1550 f = avltrav (cur_var->p.frq.t.f, &t))
1556 if (format == PERCENT)
1557 val = val * 100 / cur_var->p.frq.t.valid_cases;
1558 r.y1 = r.y2 - val * (height (gb) - 1) / max;
1559 graf_fill_rect (COMPONENTS (r));
1560 graf_frame_rect (COMPONENTS (r));
1561 buf = get_val_lab (cur_var, f->v, 0);
1563 if (cur_var->type == ALPHA)
1567 sprintf (buf2, "%g", f->v.f);
1570 graf_text (r.x1 + bar_width / 2,
1571 gb.y2 + iy / 32 + row * iy / 9, buf, TCJUST);
1573 r.x1 += bar_width + bar_spacing;
1574 r.x2 += bar_width + bar_spacing;
1576 graf_fill_color (0, 0, 0);
1579 #define round_down(X, V) \
1580 (floor ((X) / (V)) * (V))
1581 #define round_up(X, V) \
1582 (ceil ((X) / (V)) * (V))
1585 draw_histogram (int i)
1587 double lower, upper, interval;
1588 int bars[MAX_HIST_BARS + 1], top, j;
1589 int err, addend, rem, nbars, row, max_freq;
1593 AVLtraverser *t = NULL;
1595 lower = min == SYSMIS ? cur_var->dbl[4] : min;
1596 upper = max == SYSMIS ? cur_var->dbl[5] : max;
1597 if (upper - lower >= 10)
1601 u = round_up (upper, 5);
1602 l = round_down (lower, 5);
1603 nbars = (u - l) / 5;
1604 if (nbars * 2 + 1 <= MAX_HIST_BARS)
1607 u = round_up (upper, 2.5);
1608 l = round_down (lower, 2.5);
1609 if (l + 1.25 <= lower && u - 1.25 >= upper)
1610 nbars--, lower = l + 1.25, upper = u - 1.25;
1611 else if (l + 1.25 <= lower)
1612 lower = l + 1.25, upper = u + 1.25;
1613 else if (u - 1.25 >= upper)
1614 lower = l - 1.25, upper = u - 1.25;
1616 nbars++, lower = l - 1.25, upper = u + 1.25;
1618 else if (nbars < MAX_HIST_BARS)
1620 if (l + 2.5 <= lower && u - 2.5 >= upper)
1621 nbars--, lower = l + 2.5, upper = u - 2.5;
1622 else if (l + 2.5 <= lower)
1623 lower = l + 2.5, upper = u + 2.5;
1624 else if (u - 2.5 >= upper)
1625 lower = l - 2.5, upper = u - 2.5;
1627 nbars++, lower = l - 2.5, upper = u + 2.5;
1630 nbars = MAX_HIST_BARS;
1634 nbars = avlcount (cur_var->p.frq.t.f);
1635 if (nbars > MAX_HIST_BARS)
1636 nbars = MAX_HIST_BARS;
1638 if (nbars < MIN_HIST_BARS)
1639 nbars = MIN_HIST_BARS;
1640 interval = (upper - lower) / nbars;
1642 memset (bars, 0, sizeof (int[nbars + 1]));
1645 msg (SE, _("Could not make histogram for %s for specified "
1646 "minimum %g and maximum %g; please discard graph."), cur_var->name,
1650 for (f = avltrav (cur_var->p.frq.t.f, &t); f;
1651 f = avltrav (cur_var->p.frq.t.f, &t))
1652 if (f->v.f == upper)
1653 bars[nbars - 1] += f->c;
1654 else if (f->v.f >= lower && f->v.f < upper)
1655 bars[(int) ((f->v.f - lower) / interval)] += f->c;
1656 bars[nbars - 1] += bars[nbars];
1657 for (j = top = 0; j < nbars; j++)
1661 top = scale_dep_axis (top);
1664 addend = width (gb) / nbars;
1665 rem = width (gb) % nbars;
1667 r.x2 = r.x1 + addend;
1670 graf_fill_color (255, 0, 0);
1671 for (j = 0; j < nbars; j++)
1675 r.y1 = r.y2 - (BIG_TYPE) bars[j] * (height (gb) - 1) / top;
1676 graf_fill_rect (COMPONENTS (r));
1677 graf_frame_rect (COMPONENTS (r));
1678 sprintf (buf, "%g", lower + interval / 2 + interval * j);
1679 graf_text (r.x1 + addend / 2,
1680 gb.y2 + iy / 32 + row * iy / 9, buf, TCJUST);
1684 while (err >= addend)
1694 double x, y, variance, mean, step, factor;
1696 variance = cur_var->res[FRQ_ST_VARIANCE];
1697 mean = cur_var->res[FRQ_ST_MEAN];
1698 factor = (1. / (sqrt (2. * PI * variance))
1699 * cur_var->p.frq.t.valid_cases * interval);
1700 graf_polyline_begin ();
1701 for (x = lower, step = (upper - lower) / (POLYLINE_DENSITY);
1702 x <= upper; x += step)
1704 y = factor * exp (-square (x - mean) / (2. * variance));
1705 debug_printf (("(%20.10f, %20.10f)\n", x, y));
1706 graf_polyline_point (gb.x1 + (x - lower) / (upper - lower) * width (gb),
1707 gb.y2 - y * (height (gb) - 1) / top);
1709 graf_polyline_end ();
1711 graf_fill_color (0, 0, 0);
1715 scale_dep_axis (int max)
1717 int j, s, x, y, ty, by;
1721 if (scale != SYSMIS && max < scale)
1722 x = scale, s = scale / 5;
1723 else if (format == PERCENT)
1725 max = ((BIG_TYPE) 100 * cur_var->p.frq.t.max_freq
1726 / cur_var->p.frq.t.valid_cases + 1);
1738 else /* format==FREQ */
1739 /* Uses a progression of 10, 20, 50, 100, 200, 500, ... */
1755 graf_font_size (iy / 9); /* 8-pt text */
1756 for (j = 0; j <= x; j += s)
1758 y = gb.y2 - (BIG_TYPE) j *(height (gb) - 1) / x;
1762 ty += iy / 64, by += iy / 64;
1763 else if (by > gb.y2)
1764 ty -= iy / 64, by -= iy / 64;
1765 graf_fill_rect (gb.x1 - ix / 16, ty, gb.x1, by);
1766 sprintf (buf, "%d", j);
1767 graf_text (gb.x1 - ix / 8, (ty + by) / 2, buf, CRJUST);
1774 static void ungrouped_pcnt (int i);
1775 static int grouped_interval_pcnt (int i);
1776 static void out_pcnt (double, double);
1779 out_percentiles (int i)
1781 if (cur_var->type == ALPHA || !n_percentiles)
1784 outs_line (_("Percentile Value "
1786 "Percentile Value"));
1792 else if (g_var[i] == 1)
1793 grouped_interval_pcnt (i);
1795 else if (g_var[i] == -1)
1798 grouped_boundaries_pcnt (i);
1801 warn (_("this form of percentiles not supported"));
1808 out_pcnt (double pcnt, double value)
1814 out ("%7.2f%13.3f", pcnt * 100., value);
1824 ungrouped_pcnt (int i)
1826 AVLtraverser *t = NULL;
1832 e = &percentiles[n_percentiles];
1834 for (f = avltrav (cur_var->p.frq.t.f, &t);
1835 f && p < e; f = avltrav (cur_var->p.frq.t.f, &t))
1838 while (sum >= p[0] * cur_var->p.frq.t.valid_cases && p < e)
1839 out_pcnt (*p++, f->v.f);
1845 grouped_interval_pcnt (int i)
1847 AVLtraverser * t = NULL;
1848 struct freq * f, *fp;
1853 e = &percentiles[n_percentiles];
1856 for (fp = 0, f = avltrav (cur_var->p.frq.t.f, &t);
1858 fp = f, f = avltrav (cur_var->p.frq.t.f, &t))
1861 if (fabs (f->v.f - fp->v.f) < w)
1865 return msg (SE, _("Difference between %g and %g is "
1866 "too small for grouping interval %g."), f->v.f,
1871 while (sum >= p[0] * cur_var->p.frq.t.valid_cases && p < e)
1873 out_pcnt (p[0], (((p[0] * cur_var->p.frq.t.valid_cases) - psum) * w / f->c
1874 + (f->v.f - w / 2)));