1 /* PSPP - computes sample statistics.
2 Copyright (C) 1997-9, 2000 Free Software Foundation, Inc.
3 Written by Ben Pfaff <blp@gnu.org>.
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23 * Remember that histograms, bar charts need mean, stddev.
31 #include "bitvector.h"
33 #include "dictionary.h"
40 #include "algorithm.h"
47 #include "value-labels.h"
54 #include "debug-print.h"
59 format=cond:condense/onepage(*n:onepage_limit,"%s>=0")/!standard,
60 table:limit(n:limit,"%s>0")/notable/!table,
61 labels:!labels/nolabels,
62 sort:!avalue/dvalue/afreq/dfreq,
63 spaces:!single/double,
64 paging:newpage/!oldpage;
65 missing=miss:include/!exclude;
66 barchart(ba_)=:minimum(d:min),
68 scale:freq(*n:freq,"%s>0")/percent(*n:pcnt,"%s>0");
69 piechart(pie_)=:minimum(d:min),
71 missing:missing/!nomissing;
72 histogram(hi_)=:minimum(d:min),
74 scale:freq(*n:freq,"%s>0")/percent(*n:pcnt,"%s>0"),
75 norm:!nonormal/normal,
76 incr:increment(d:inc,"%s>0");
77 hbar(hb_)=:minimum(d:min),
79 scale:freq(*n:freq,"%s>0")/percent(*n:pcnt,"%s>0"),
80 norm:!nonormal/normal,
81 incr:increment(d:inc,"%s>0");
84 +percentiles = double list;
85 statistics[st_]=1|mean,2|semean,3|median,4|mode,5|stddev,6|variance,
86 7|kurtosis,8|skewness,9|range,10|minimum,11|maximum,12|sum,
87 13|default,14|seskewness,15|sekurtosis,all,none.
95 frq_mean = 0, frq_semean, frq_median, frq_mode, frq_stddev, frq_variance,
96 frq_kurt, frq_sekurt, frq_skew, frq_seskew, frq_range, frq_min, frq_max,
100 /* Description of a statistic. */
103 int st_indx; /* Index into a_statistics[]. */
104 const char *s10; /* Identifying string. */
107 /* Table of statistics, indexed by frq_*. */
108 static struct frq_info st_name[frq_n_stats + 1] =
110 {FRQ_ST_MEAN, N_("Mean")},
111 {FRQ_ST_SEMEAN, N_("S.E. Mean")},
112 {FRQ_ST_MEDIAN, N_("Median")},
113 {FRQ_ST_MODE, N_("Mode")},
114 {FRQ_ST_STDDEV, N_("Std Dev")},
115 {FRQ_ST_VARIANCE, N_("Variance")},
116 {FRQ_ST_KURTOSIS, N_("Kurtosis")},
117 {FRQ_ST_SEKURTOSIS, N_("S.E. Kurt")},
118 {FRQ_ST_SKEWNESS, N_("Skewness")},
119 {FRQ_ST_SESKEWNESS, N_("S.E. Skew")},
120 {FRQ_ST_RANGE, N_("Range")},
121 {FRQ_ST_MINIMUM, N_("Minimum")},
122 {FRQ_ST_MAXIMUM, N_("Maximum")},
123 {FRQ_ST_SUM, N_("Sum")},
127 /* Percentiles to calculate. */
131 double p; /* the %ile to be calculated */
132 double value; /* the %ile's value */
133 double x1; /* The datum value <= the percentile */
134 double x2; /* The datum value >= the percentile */
136 int flag2; /* Set to 1 if this percentile value has been found */
140 static void add_percentile (double x) ;
142 static struct percentile *percentiles;
143 static int n_percentiles;
145 static int implicit_50th ;
147 /* Groups of statistics. */
149 #define frq_default \
150 (BI (frq_mean) | BI (frq_stddev) | BI (frq_min) | BI (frq_max))
152 (BI (frq_sum) | BI(frq_min) | BI(frq_max) \
153 | BI(frq_mean) | BI(frq_semean) | BI(frq_stddev) \
154 | BI(frq_variance) | BI(frq_kurt) | BI(frq_sekurt) \
155 | BI(frq_skew) | BI(frq_seskew) | BI(frq_range) \
156 | BI(frq_range) | BI(frq_mode) | BI(frq_median))
158 /* Statistics; number of statistics. */
159 static unsigned long stats;
162 /* Types of graphs. */
165 GFT_NONE, /* Don't draw graphs. */
166 GFT_BAR, /* Draw bar charts. */
167 GFT_HIST, /* Draw histograms. */
168 GFT_PIE, /* Draw piechart */
169 GFT_HBAR /* Draw bar charts or histograms at our discretion. */
172 /* Parsed command. */
173 static struct cmd_frequencies cmd;
175 /* Summary of the barchart, histogram, and hbar subcommands. */
176 /* FIXME: These should not be mutually exclusive */
177 static int chart; /* NONE/BAR/HIST/HBAR/PIE. */
178 static double min, max; /* Minimum, maximum on y axis. */
179 static int format; /* FREQ/PERCENT: Scaling of y axis. */
180 static double scale, incr; /* FIXME */
181 static int normal; /* FIXME */
183 /* Variables for which to calculate statistics. */
184 static int n_variables;
185 static struct variable **v_variables;
187 /* Arenas used to store semi-permanent storage. */
188 static struct pool *int_pool; /* Integer mode. */
189 static struct pool *gen_pool; /* General mode. */
191 /* Per-variable frequency data. */
194 /* Frequency table. */
195 struct freq_tab tab; /* Frequencies table to use. */
198 int n_groups; /* Number of groups. */
199 double *groups; /* Groups. */
202 double stat[frq_n_stats];
205 static inline struct var_freqs *
206 get_var_freqs (struct variable *v)
209 assert (v->aux != NULL);
213 static void determine_charts (void);
215 static void calc_stats (struct variable *v, double d[frq_n_stats]);
217 static void precalc (void *);
218 static int calc (struct ccase *, void *);
219 static void postcalc (void *);
221 static void postprocess_freq_tab (struct variable *);
222 static void dump_full (struct variable *);
223 static void dump_condensed (struct variable *);
224 static void dump_statistics (struct variable *, int show_varname);
225 static void cleanup_freq_tab (struct variable *);
227 static hsh_hash_func hash_value_numeric, hash_value_alpha;
228 static hsh_compare_func compare_value_numeric_a, compare_value_alpha_a;
229 static hsh_compare_func compare_value_numeric_d, compare_value_alpha_d;
230 static hsh_compare_func compare_freq_numeric_a, compare_freq_alpha_a;
231 static hsh_compare_func compare_freq_numeric_d, compare_freq_alpha_d;
233 /* Parser and outline. */
235 static int internal_cmd_frequencies (void);
238 cmd_frequencies (void)
242 int_pool = pool_create ();
243 result = internal_cmd_frequencies ();
244 pool_destroy (int_pool);
246 pool_destroy (gen_pool);
254 internal_cmd_frequencies (void)
264 if (!parse_frequencies (&cmd))
267 if (cmd.onepage_limit == NOT_LONG)
268 cmd.onepage_limit = 50;
270 /* Figure out statistics to calculate. */
272 if (cmd.a_statistics[FRQ_ST_DEFAULT] || !cmd.sbc_statistics)
273 stats |= frq_default;
274 if (cmd.a_statistics[FRQ_ST_ALL])
276 if (cmd.sort != FRQ_AVALUE && cmd.sort != FRQ_DVALUE)
277 stats &= ~frq_median;
278 for (i = 0; i < frq_n_stats; i++)
279 if (cmd.a_statistics[st_name[i].st_indx])
280 stats |= BIT_INDEX (i);
281 if (stats & frq_kurt)
283 if (stats & frq_skew)
286 /* Calculate n_stats. */
288 for (i = 0; i < frq_n_stats; i++)
289 if ((stats & BIT_INDEX (i)))
294 if (chart != GFT_NONE || cmd.sbc_ntiles)
295 cmd.sort = FRQ_AVALUE;
297 /* Work out what percentiles need to be calculated */
298 if ( cmd.sbc_percentiles )
300 for ( i = 0 ; i < MAXLISTS ; ++i )
303 subc_list_double *ptl_list = &cmd.dl_percentiles[i];
304 for ( pl = 0 ; pl < subc_list_double_count(ptl_list); ++pl)
305 add_percentile(subc_list_double_at(ptl_list,pl) / 100.0 );
308 if ( cmd.sbc_ntiles )
310 for ( i = 0 ; i < cmd.sbc_ntiles ; ++i )
313 for (j = 0; j <= cmd.n_ntiles[i]; ++j )
314 add_percentile(j / (double) cmd.n_ntiles[i]);
320 procedure_with_splits (precalc, calc, postcalc, NULL);
325 /* Figure out which charts the user requested. */
327 determine_charts (void)
329 int count = (!!cmd.sbc_histogram) + (!!cmd.sbc_barchart) +
330 (!!cmd.sbc_hbar) + (!!cmd.sbc_piechart);
340 msg (SW, _("At most one of BARCHART, HISTOGRAM, or HBAR should be "
341 "given. HBAR will be assumed. Argument values will be "
342 "given precedence increasing along the order given."));
344 else if (cmd.sbc_histogram)
346 else if (cmd.sbc_barchart)
348 else if (cmd.sbc_piechart)
359 if (cmd.sbc_barchart)
361 if (cmd.ba_min != SYSMIS)
363 if (cmd.ba_max != SYSMIS)
365 if (cmd.ba_scale == FRQ_FREQ)
370 else if (cmd.ba_scale == FRQ_PERCENT)
372 format = FRQ_PERCENT;
377 if (cmd.sbc_histogram)
379 if (cmd.hi_min != SYSMIS)
381 if (cmd.hi_max != SYSMIS)
383 if (cmd.hi_scale == FRQ_FREQ)
388 else if (cmd.hi_scale == FRQ_PERCENT)
390 format = FRQ_PERCENT;
393 if (cmd.hi_norm != FRQ_NONORMAL )
395 if (cmd.hi_incr == FRQ_INCREMENT)
401 if (cmd.hb_min != SYSMIS)
403 if (cmd.hb_max != SYSMIS)
405 if (cmd.hb_scale == FRQ_FREQ)
410 else if (cmd.hb_scale == FRQ_PERCENT)
412 format = FRQ_PERCENT;
417 if (cmd.hb_incr == FRQ_INCREMENT)
421 if (min != SYSMIS && max != SYSMIS && min >= max)
423 msg (SE, _("MAX must be greater than or equal to MIN, if both are "
424 "specified. However, MIN was specified as %g and MAX as %g. "
425 "MIN and MAX will be ignored."), min, max);
430 /* Add data from case C to the frequency table. */
432 calc (struct ccase *c, void *aux UNUSED)
438 weight = dict_get_case_weight (default_dict, c, &bad_warn);
440 for (i = 0; i < n_variables; i++)
442 struct variable *v = v_variables[i];
443 const union value *val = case_data (c, v->fv);
444 struct freq_tab *ft = &get_var_freqs (v)->tab;
452 struct freq **fpp = (struct freq **) hsh_probe (ft->data, val);
458 struct freq *fp = *fpp = pool_alloc (gen_pool, sizeof *fp);
466 if (val->f == SYSMIS)
467 ft->sysmis += weight;
468 else if (val->f > INT_MIN+1 && val->f < INT_MAX-1)
471 if (i >= ft->min && i <= ft->max)
472 ft->vector[i - ft->min] += weight;
475 ft->out_of_range += weight;
484 /* Prepares each variable that is the target of FREQUENCIES by setting
485 up its hash table. */
487 precalc (void *aux UNUSED)
491 pool_destroy (gen_pool);
492 gen_pool = pool_create ();
494 for (i = 0; i < n_variables; i++)
496 struct variable *v = v_variables[i];
497 struct freq_tab *ft = &get_var_freqs (v)->tab;
499 if (ft->mode == FRQM_GENERAL)
502 hsh_compare_func *compare;
504 if (v->type == NUMERIC)
506 hash = hash_value_numeric;
507 compare = compare_value_numeric_a;
511 hash = hash_value_alpha;
512 compare = compare_value_alpha_a;
514 ft->data = hsh_create (16, compare, hash, NULL, v);
520 for (j = (ft->max - ft->min); j >= 0; j--)
522 ft->out_of_range = 0.0;
528 /* Finishes up with the variables after frequencies have been
529 calculated. Displays statistics, percentiles, ... */
531 postcalc (void *aux UNUSED)
535 for (i = 0; i < n_variables; i++)
537 struct variable *v = v_variables[i];
538 struct var_freqs *vf = get_var_freqs (v);
539 struct freq_tab *ft = &vf->tab;
541 int dumped_freq_tab = 1;
543 postprocess_freq_tab (v);
545 /* Frequencies tables. */
546 n_categories = ft->n_valid + ft->n_missing;
547 if (cmd.table == FRQ_TABLE
548 || (cmd.table == FRQ_LIMIT && n_categories <= cmd.limit))
558 if (n_categories > cmd.onepage_limit)
571 dump_statistics (v, !dumped_freq_tab);
574 if ( chart == GFT_HIST)
577 double d[frq_n_stats];
579 struct normal_curve norm;
580 norm.N = vf->tab.total_cases;
583 norm.mean = d[frq_mean];
584 norm.stddev = d[frq_stddev];
586 chart_initialise(&ch);
587 draw_histogram(&ch, v_variables[i], ft, "HISTOGRAM",&norm,normal);
592 if ( chart == GFT_PIE)
596 chart_initialise(&ch);
598 draw_piechart(&ch, v_variables[i], ft);
604 cleanup_freq_tab (v);
609 /* Returns the comparison function that should be used for
610 sorting a frequency table by FRQ_SORT using VAR_TYPE
612 static hsh_compare_func *
613 get_freq_comparator (int frq_sort, int var_type)
615 /* Note that q2c generates tags beginning with 1000. */
616 switch (frq_sort | (var_type << 16))
618 case FRQ_AVALUE | (NUMERIC << 16): return compare_value_numeric_a;
619 case FRQ_AVALUE | (ALPHA << 16): return compare_value_alpha_a;
620 case FRQ_DVALUE | (NUMERIC << 16): return compare_value_numeric_d;
621 case FRQ_DVALUE | (ALPHA << 16): return compare_value_alpha_d;
622 case FRQ_AFREQ | (NUMERIC << 16): return compare_freq_numeric_a;
623 case FRQ_AFREQ | (ALPHA << 16): return compare_freq_alpha_a;
624 case FRQ_DFREQ | (NUMERIC << 16): return compare_freq_numeric_d;
625 case FRQ_DFREQ | (ALPHA << 16): return compare_freq_alpha_d;
632 /* Returns nonzero iff the value in struct freq F is non-missing
635 not_missing (const void *f_, void *v_)
637 const struct freq *f = f_;
638 struct variable *v = v_;
640 return !is_missing (&f->v, v);
643 /* Summarizes the frequency table data for variable V. */
645 postprocess_freq_tab (struct variable *v)
647 hsh_compare_func *compare;
651 struct freq *freqs, *f;
654 ft = &get_var_freqs (v)->tab;
655 assert (ft->mode == FRQM_GENERAL);
656 compare = get_freq_comparator (cmd.sort, v->type);
658 /* Extract data from hash table. */
659 count = hsh_count (ft->data);
660 data = hsh_data (ft->data);
662 /* Copy dereferenced data into freqs. */
663 freqs = xmalloc (count * sizeof *freqs);
664 for (i = 0; i < count; i++)
666 struct freq *f = data[i];
670 /* Put data into ft. */
672 ft->n_valid = partition (freqs, count, sizeof *freqs, not_missing, v);
673 ft->missing = freqs + ft->n_valid;
674 ft->n_missing = count - ft->n_valid;
677 sort (ft->valid, ft->n_valid, sizeof *ft->valid, compare, v);
678 sort (ft->missing, ft->n_missing, sizeof *ft->missing, compare, v);
680 /* Summary statistics. */
681 ft->valid_cases = 0.0;
682 for(i = 0 ; i < ft->n_valid ; ++i )
685 ft->valid_cases += f->c;
689 ft->total_cases = ft->valid_cases ;
690 for(i = 0 ; i < ft->n_missing ; ++i )
693 ft->total_cases += f->c;
698 /* Frees the frequency table for variable V. */
700 cleanup_freq_tab (struct variable *v)
702 struct freq_tab *ft = &get_var_freqs (v)->tab;
703 assert (ft->mode == FRQM_GENERAL);
705 hsh_destroy (ft->data);
708 /* Parses the VARIABLES subcommand, adding to
709 {n_variables,v_variables}. */
711 frq_custom_variables (struct cmd_frequencies *cmd UNUSED)
714 int min = 0, max = 0;
716 int old_n_variables = n_variables;
720 if (token != T_ALL && (token != T_ID
721 || dict_lookup_var (default_dict, tokid) == NULL))
724 if (!parse_variables (default_dict, &v_variables, &n_variables,
725 PV_APPEND | PV_NO_SCRATCH))
728 if (!lex_match ('('))
733 if (!lex_force_int ())
735 min = lex_integer ();
737 if (!lex_force_match (','))
739 if (!lex_force_int ())
741 max = lex_integer ();
743 if (!lex_force_match (')'))
747 msg (SE, _("Upper limit of integer mode value range must be "
748 "greater than lower limit."));
753 for (i = old_n_variables; i < n_variables; i++)
755 struct variable *v = v_variables[i];
756 struct var_freqs *vf;
760 msg (SE, _("Variable %s specified multiple times on VARIABLES "
761 "subcommand."), v->name);
764 if (mode == FRQM_INTEGER && v->type != NUMERIC)
766 msg (SE, _("Integer mode specified, but %s is not a numeric "
767 "variable."), v->name);
771 vf = var_attach_aux (v, xmalloc (sizeof *vf), var_dtor_free);
773 vf->tab.valid = vf->tab.missing = NULL;
774 if (mode == FRQM_INTEGER)
778 vf->tab.vector = pool_alloc (int_pool,
779 sizeof (struct freq) * (max - min + 1));
782 vf->tab.vector = NULL;
789 /* Parses the GROUPED subcommand, setting the n_groups, groups
790 fields of specified variables. */
792 frq_custom_grouped (struct cmd_frequencies *cmd UNUSED)
795 if ((token == T_ID && dict_lookup_var (default_dict, tokid) != NULL)
801 /* Max, current size of list; list itself. */
809 if (!parse_variables (default_dict, &v, &n,
810 PV_NO_DUPLICATE | PV_NUMERIC))
816 while (token == T_NUM)
821 dl = pool_realloc (int_pool, dl, ml * sizeof (double));
827 /* Note that nl might still be 0 and dl might still be
828 NULL. That's okay. */
829 if (!lex_match (')'))
832 msg (SE, _("`)' expected after GROUPED interval list."));
842 for (i = 0; i < n; i++)
843 if (v[i]->aux == NULL)
844 msg (SE, _("Variables %s specified on GROUPED but not on "
845 "VARIABLES."), v[i]->name);
848 struct var_freqs *vf = get_var_freqs (v[i]);
850 if (vf->groups != NULL)
851 msg (SE, _("Variables %s specified multiple times on GROUPED "
852 "subcommand."), v[i]->name);
860 if (!lex_match ('/'))
862 if ((token != T_ID || dict_lookup_var (default_dict, tokid) != NULL)
873 /* Adds X to the list of percentiles, keeping the list in proper
876 add_percentile (double x)
880 for (i = 0; i < n_percentiles; i++)
882 /* Do nothing if it's already in the list */
883 if ( fabs(x - percentiles[i].p) < DBL_EPSILON )
886 if (x < percentiles[i].p)
890 if (i >= n_percentiles || tokval != percentiles[i].p)
893 = pool_realloc (int_pool, percentiles,
894 (n_percentiles + 1) * sizeof (struct percentile ));
896 if (i < n_percentiles)
897 memmove (&percentiles[i + 1], &percentiles[i],
898 (n_percentiles - i) * sizeof (struct percentile) );
900 percentiles[i].p = x;
905 /* Comparison functions. */
907 /* Hash of numeric values. */
909 hash_value_numeric (const void *value_, void *foo UNUSED)
911 const struct freq *value = value_;
912 return hsh_hash_double (value->v.f);
915 /* Hash of string values. */
917 hash_value_alpha (const void *value_, void *v_)
919 const struct freq *value = value_;
920 struct variable *v = v_;
922 return hsh_hash_bytes (value->v.s, v->width);
925 /* Ascending numeric compare of values. */
927 compare_value_numeric_a (const void *a_, const void *b_, void *foo UNUSED)
929 const struct freq *a = a_;
930 const struct freq *b = b_;
934 else if (a->v.f < b->v.f)
940 /* Ascending string compare of values. */
942 compare_value_alpha_a (const void *a_, const void *b_, void *v_)
944 const struct freq *a = a_;
945 const struct freq *b = b_;
946 const struct variable *v = v_;
948 return memcmp (a->v.s, b->v.s, v->width);
951 /* Descending numeric compare of values. */
953 compare_value_numeric_d (const void *a, const void *b, void *foo UNUSED)
955 return -compare_value_numeric_a (a, b, foo);
958 /* Descending string compare of values. */
960 compare_value_alpha_d (const void *a, const void *b, void *v)
962 return -compare_value_alpha_a (a, b, v);
965 /* Ascending numeric compare of frequency;
966 secondary key on ascending numeric value. */
968 compare_freq_numeric_a (const void *a_, const void *b_, void *foo UNUSED)
970 const struct freq *a = a_;
971 const struct freq *b = b_;
975 else if (a->c < b->c)
980 else if (a->v.f < b->v.f)
986 /* Ascending numeric compare of frequency;
987 secondary key on ascending string value. */
989 compare_freq_alpha_a (const void *a_, const void *b_, void *v_)
991 const struct freq *a = a_;
992 const struct freq *b = b_;
993 const struct variable *v = v_;
997 else if (a->c < b->c)
1000 return memcmp (a->v.s, b->v.s, v->width);
1003 /* Descending numeric compare of frequency;
1004 secondary key on ascending numeric value. */
1006 compare_freq_numeric_d (const void *a_, const void *b_, void *foo UNUSED)
1008 const struct freq *a = a_;
1009 const struct freq *b = b_;
1013 else if (a->c < b->c)
1016 if (a->v.f > b->v.f)
1018 else if (a->v.f < b->v.f)
1024 /* Descending numeric compare of frequency;
1025 secondary key on ascending string value. */
1027 compare_freq_alpha_d (const void *a_, const void *b_, void *v_)
1029 const struct freq *a = a_;
1030 const struct freq *b = b_;
1031 const struct variable *v = v_;
1035 else if (a->c < b->c)
1038 return memcmp (a->v.s, b->v.s, v->width);
1041 /* Frequency table display. */
1043 /* Sets the widths of all the columns and heights of all the rows in
1044 table T for driver D. */
1046 full_dim (struct tab_table *t, struct outp_driver *d)
1048 int lab = cmd.labels == FRQ_LABELS;
1052 t->w[0] = min (tab_natural_width (t, d, 0), d->prop_em_width * 15);
1053 for (i = lab; i < lab + 5; i++)
1054 t->w[i] = max (tab_natural_width (t, d, i), d->prop_em_width * 8);
1055 for (i = 0; i < t->nr; i++)
1056 t->h[i] = d->font_height;
1059 /* Displays a full frequency table for variable V. */
1061 dump_full (struct variable *v)
1064 struct freq_tab *ft;
1066 struct tab_table *t;
1068 double cum_total = 0.0;
1069 double cum_freq = 0.0;
1079 static struct init vec[] =
1081 {4, 0, N_("Valid")},
1083 {1, 1, N_("Value")},
1084 {2, 1, N_("Frequency")},
1085 {3, 1, N_("Percent")},
1086 {4, 1, N_("Percent")},
1087 {5, 1, N_("Percent")},
1095 int lab = cmd.labels == FRQ_LABELS;
1097 ft = &get_var_freqs (v)->tab;
1098 n_categories = ft->n_valid + ft->n_missing;
1099 t = tab_create (5 + lab, n_categories + 3, 0);
1100 tab_headers (t, 0, 0, 2, 0);
1101 tab_dim (t, full_dim);
1104 tab_text (t, 0, 1, TAB_CENTER | TAT_TITLE, _("Value Label"));
1105 for (p = vec; p->s; p++)
1106 tab_text (t, p->c - (p->r ? !lab : 0), p->r,
1107 TAB_CENTER | TAT_TITLE, gettext (p->s));
1110 for (f = ft->valid; f < ft->missing; f++)
1112 double percent, valid_percent;
1116 percent = f->c / ft->total_cases * 100.0;
1117 valid_percent = f->c / ft->valid_cases * 100.0;
1118 cum_total += valid_percent;
1122 const char *label = val_labs_find (v->val_labs, f->v);
1124 tab_text (t, 0, r, TAB_LEFT, label);
1127 tab_value (t, 0 + lab, r, TAB_NONE, &f->v, &v->print);
1128 tab_float (t, 1 + lab, r, TAB_NONE, f->c, 8, 0);
1129 tab_float (t, 2 + lab, r, TAB_NONE, percent, 5, 1);
1130 tab_float (t, 3 + lab, r, TAB_NONE, valid_percent, 5, 1);
1131 tab_float (t, 4 + lab, r, TAB_NONE, cum_total, 5, 1);
1134 for (; f < &ft->valid[n_categories]; f++)
1140 const char *label = val_labs_find (v->val_labs, f->v);
1142 tab_text (t, 0, r, TAB_LEFT, label);
1145 tab_value (t, 0 + lab, r, TAB_NONE, &f->v, &v->print);
1146 tab_float (t, 1 + lab, r, TAB_NONE, f->c, 8, 0);
1147 tab_float (t, 2 + lab, r, TAB_NONE,
1148 f->c / ft->total_cases * 100.0, 5, 1);
1149 tab_text (t, 3 + lab, r, TAB_NONE, _("Missing"));
1153 tab_box (t, TAL_1, TAL_1,
1154 cmd.spaces == FRQ_SINGLE ? -1 : (TAL_1 | TAL_SPACING), TAL_1,
1156 tab_hline (t, TAL_2, 0, 4 + lab, 2);
1157 tab_hline (t, TAL_2, 0, 4 + lab, r);
1158 tab_joint_text (t, 0, r, 0 + lab, r, TAB_RIGHT | TAT_TITLE, _("Total"));
1159 tab_vline (t, TAL_0, 1, r, r);
1160 tab_float (t, 1 + lab, r, TAB_NONE, cum_freq, 8, 0);
1161 tab_float (t, 2 + lab, r, TAB_NONE, 100.0, 5, 1);
1162 tab_float (t, 3 + lab, r, TAB_NONE, 100.0, 5, 1);
1164 tab_title (t, 1, "%s: %s", v->name, v->label ? v->label : "");
1169 /* Sets the widths of all the columns and heights of all the rows in
1170 table T for driver D. */
1172 condensed_dim (struct tab_table *t, struct outp_driver *d)
1174 int cum_w = max (outp_string_width (d, _("Cum")),
1175 max (outp_string_width (d, _("Cum")),
1176 outp_string_width (d, "000")));
1180 for (i = 0; i < 2; i++)
1181 t->w[i] = max (tab_natural_width (t, d, i), d->prop_em_width * 8);
1182 for (i = 2; i < 4; i++)
1184 for (i = 0; i < t->nr; i++)
1185 t->h[i] = d->font_height;
1188 /* Display condensed frequency table for variable V. */
1190 dump_condensed (struct variable *v)
1193 struct freq_tab *ft;
1195 struct tab_table *t;
1197 double cum_total = 0.0;
1199 ft = &get_var_freqs (v)->tab;
1200 n_categories = ft->n_valid + ft->n_missing;
1201 t = tab_create (4, n_categories + 2, 0);
1203 tab_headers (t, 0, 0, 2, 0);
1204 tab_text (t, 0, 1, TAB_CENTER | TAT_TITLE, _("Value"));
1205 tab_text (t, 1, 1, TAB_CENTER | TAT_TITLE, _("Freq"));
1206 tab_text (t, 2, 1, TAB_CENTER | TAT_TITLE, _("Pct"));
1207 tab_text (t, 3, 0, TAB_CENTER | TAT_TITLE, _("Cum"));
1208 tab_text (t, 3, 1, TAB_CENTER | TAT_TITLE, _("Pct"));
1209 tab_dim (t, condensed_dim);
1212 for (f = ft->valid; f < ft->missing; f++)
1216 percent = f->c / ft->total_cases * 100.0;
1217 cum_total += f->c / ft->valid_cases * 100.0;
1219 tab_value (t, 0, r, TAB_NONE, &f->v, &v->print);
1220 tab_float (t, 1, r, TAB_NONE, f->c, 8, 0);
1221 tab_float (t, 2, r, TAB_NONE, percent, 3, 0);
1222 tab_float (t, 3, r, TAB_NONE, cum_total, 3, 0);
1225 for (; f < &ft->valid[n_categories]; f++)
1227 tab_value (t, 0, r, TAB_NONE, &f->v, &v->print);
1228 tab_float (t, 1, r, TAB_NONE, f->c, 8, 0);
1229 tab_float (t, 2, r, TAB_NONE,
1230 f->c / ft->total_cases * 100.0, 3, 0);
1234 tab_box (t, TAL_1, TAL_1,
1235 cmd.spaces == FRQ_SINGLE ? -1 : (TAL_1 | TAL_SPACING), TAL_1,
1237 tab_hline (t, TAL_2, 0, 3, 2);
1238 tab_title (t, 1, "%s: %s", v->name, v->label ? v->label : "");
1239 tab_columns (t, SOM_COL_DOWN, 1);
1243 /* Statistical display. */
1245 /* Calculates all the pertinent statistics for variable V, putting
1246 them in array D[]. FIXME: This could be made much more optimal. */
1248 calc_stats (struct variable *v, double d[frq_n_stats])
1250 struct freq_tab *ft = &get_var_freqs (v)->tab;
1251 double W = ft->valid_cases;
1260 double *median_value;
1262 /* Calculate percentiles. */
1264 /* If the 50th percentile was not explicitly requested then we must
1265 calculate it anyway --- it's the median */
1267 for (i = 0; i < n_percentiles; i++)
1269 if (percentiles[i].p == 0.5)
1271 median_value = &percentiles[i].value;
1276 if ( 0 == median_value )
1278 add_percentile (0.5);
1282 for (i = 0; i < n_percentiles; i++)
1284 percentiles[i].flag = 0;
1285 percentiles[i].flag2 = 0;
1289 for (idx = 0; idx < ft->n_valid; ++idx)
1291 static double prev_value = SYSMIS;
1292 f = &ft->valid[idx];
1294 for (i = 0; i < n_percentiles; i++)
1297 if ( percentiles[i].flag2 ) continue ;
1299 if ( get_algorithm() != COMPATIBLE )
1301 (ft->valid_cases - 1) * percentiles[i].p;
1304 (ft->valid_cases + 1) * percentiles[i].p - 1;
1306 if ( percentiles[i].flag )
1308 percentiles[i].x2 = f->v.f;
1309 percentiles[i].x1 = prev_value;
1310 percentiles[i].flag2 = 1;
1316 if ( f->c > 1 && rank - (f->c - 1) > tp )
1318 percentiles[i].x2 = percentiles[i].x1 = f->v.f;
1319 percentiles[i].flag2 = 1;
1323 percentiles[i].flag=1;
1329 prev_value = f->v.f;
1332 for (i = 0; i < n_percentiles; i++)
1334 /* Catches the case when p == 100% */
1335 if ( ! percentiles[i].flag2 )
1336 percentiles[i].x1 = percentiles[i].x2 = f->v.f;
1339 printf("percentile %d (p==%.2f); X1 = %g; X2 = %g\n",
1340 i,percentiles[i].p,percentiles[i].x1,percentiles[i].x2);
1344 for (i = 0; i < n_percentiles; i++)
1346 struct freq_tab *ft = &get_var_freqs (v)->tab;
1350 if ( get_algorithm() != COMPATIBLE )
1352 s = modf((ft->valid_cases - 1) * percentiles[i].p , &dummy);
1356 s = modf((ft->valid_cases + 1) * percentiles[i].p -1, &dummy);
1359 percentiles[i].value = percentiles[i].x1 +
1360 ( percentiles[i].x2 - percentiles[i].x1) * s ;
1362 if ( percentiles[i].p == 0.50)
1363 median_value = &percentiles[i].value;
1367 /* Calculate the mode. */
1370 for (f = ft->valid; f < ft->missing; f++)
1372 if (most_often < f->c)
1377 else if (most_often == f->c)
1379 /* A duplicate mode is undefined.
1380 FIXME: keep track of *all* the modes. */
1385 /* Calculate moments. */
1386 m = moments_create (MOMENT_KURTOSIS);
1387 for (f = ft->valid; f < ft->missing; f++)
1388 moments_pass_one (m, f->v.f, f->c);
1389 for (f = ft->valid; f < ft->missing; f++)
1390 moments_pass_two (m, f->v.f, f->c);
1391 moments_calculate (m, NULL, &d[frq_mean], &d[frq_variance],
1392 &d[frq_skew], &d[frq_kurt]);
1393 moments_destroy (m);
1395 /* Formulas below are taken from _SPSS Statistical Algorithms_. */
1396 d[frq_min] = ft->valid[0].v.f;
1397 d[frq_max] = ft->valid[ft->n_valid - 1].v.f;
1398 d[frq_mode] = X_mode;
1399 d[frq_range] = d[frq_max] - d[frq_min];
1400 d[frq_median] = *median_value;
1401 d[frq_sum] = d[frq_mean] * W;
1402 d[frq_stddev] = sqrt (d[frq_variance]);
1403 d[frq_semean] = d[frq_stddev] / sqrt (W);
1404 d[frq_seskew] = calc_seskew (W);
1405 d[frq_sekurt] = calc_sekurt (W);
1408 /* Displays a table of all the statistics requested for variable V. */
1410 dump_statistics (struct variable *v, int show_varname)
1412 struct freq_tab *ft;
1413 double stat_value[frq_n_stats];
1414 struct tab_table *t;
1417 int n_explicit_percentiles = n_percentiles;
1419 if ( implicit_50th && n_percentiles > 0 )
1422 if (v->type == ALPHA)
1424 ft = &get_var_freqs (v)->tab;
1425 if (ft->n_valid == 0)
1427 msg (SW, _("No valid data for variable %s; statistics not displayed."),
1431 calc_stats (v, stat_value);
1433 t = tab_create (3, n_stats + n_explicit_percentiles + 2, 0);
1434 tab_dim (t, tab_natural_dimensions);
1436 tab_box (t, TAL_1, TAL_1, -1, -1 , 0 , 0 , 2, tab_nr(t) - 1) ;
1439 tab_vline (t, TAL_1 , 2, 0, tab_nr(t) - 1);
1440 tab_vline (t, TAL_1 | TAL_SPACING , 1, 0, tab_nr(t) - 1 ) ;
1442 r=2; /* N missing and N valid are always dumped */
1444 for (i = 0; i < frq_n_stats; i++)
1445 if (stats & BIT_INDEX (i))
1447 tab_text (t, 0, r, TAB_LEFT | TAT_TITLE,
1448 gettext (st_name[i].s10));
1449 tab_float (t, 2, r, TAB_NONE, stat_value[i], 11, 3);
1453 tab_text (t, 0, 0, TAB_LEFT | TAT_TITLE, _("N"));
1454 tab_text (t, 1, 0, TAB_LEFT | TAT_TITLE, _("Valid"));
1455 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Missing"));
1457 tab_float(t, 2, 0, TAB_NONE, ft->valid_cases, 11, 0);
1458 tab_float(t, 2, 1, TAB_NONE, ft->total_cases - ft->valid_cases, 11, 0);
1461 for (i = 0; i < n_explicit_percentiles; i++, r++)
1465 tab_text (t, 0, r, TAB_LEFT | TAT_TITLE, _("Percentiles"));
1468 tab_float (t, 1, r, TAB_LEFT, percentiles[i].p * 100, 3, 0 );
1469 tab_float (t, 2, r, TAB_NONE, percentiles[i].value, 11, 3);
1473 tab_columns (t, SOM_COL_DOWN, 1);
1477 tab_title (t, 1, "%s: %s", v->name, v->label);
1479 tab_title (t, 0, v->name);
1482 tab_flags (t, SOMF_NO_TITLE);