1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 1997-9, 2000, 2007, 2009, 2010 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
21 #include <gsl/gsl_histogram.h>
23 #include <data/case.h>
24 #include <data/casegrouper.h>
25 #include <data/casereader.h>
26 #include <data/dictionary.h>
27 #include <data/format.h>
28 #include <data/procedure.h>
29 #include <data/settings.h>
30 #include <data/value-labels.h>
31 #include <data/variable.h>
32 #include <language/command.h>
33 #include <language/dictionary/split-file.h>
34 #include <language/lexer/lexer.h>
35 #include <libpspp/array.h>
36 #include <libpspp/bit-vector.h>
37 #include <libpspp/compiler.h>
38 #include <libpspp/hash.h>
39 #include <libpspp/message.h>
40 #include <libpspp/misc.h>
41 #include <libpspp/pool.h>
42 #include <libpspp/str.h>
43 #include <math/histogram.h>
44 #include <math/moments.h>
45 #include <output/chart-item.h>
46 #include <output/charts/piechart.h>
47 #include <output/charts/plot-hist.h>
48 #include <output/tab.h>
56 #define _(msgid) gettext (msgid)
57 #define N_(msgid) msgid
64 +format=table:limit(n:limit,"%s>0")/notable/!table,
65 sort:!avalue/dvalue/afreq/dfreq;
66 missing=miss:include/!exclude;
67 barchart(ba_)=:minimum(d:min),
69 scale:freq(*n:freq,"%s>0")/percent(*n:pcnt,"%s>0");
70 piechart(pie_)=:minimum(d:min),
72 missing:missing/!nomissing,
74 histogram(hi_)=:minimum(d:min),
76 scale:freq(*n:freq,"%s>0")/percent(*n:pcnt,"%s>0"),
77 norm:!nonormal/normal;
80 +percentiles = double list;
81 +statistics[st_]=mean,semean,median,mode,stddev,variance,
82 kurtosis,skewness,range,minimum,maximum,sum,
83 default,seskewness,sekurtosis,all,none.
91 frq_mean = 0, frq_semean, frq_median, frq_mode, frq_stddev, frq_variance,
92 frq_kurt, frq_sekurt, frq_skew, frq_seskew, frq_range, frq_min, frq_max,
96 /* Description of a statistic. */
99 int st_indx; /* Index into a_statistics[]. */
100 const char *s10; /* Identifying string. */
103 /* Table of statistics, indexed by dsc_*. */
104 static const struct frq_info st_name[frq_n_stats + 1] =
106 {FRQ_ST_MEAN, N_("Mean")},
107 {FRQ_ST_SEMEAN, N_("S.E. Mean")},
108 {FRQ_ST_MEDIAN, N_("Median")},
109 {FRQ_ST_MODE, N_("Mode")},
110 {FRQ_ST_STDDEV, N_("Std Dev")},
111 {FRQ_ST_VARIANCE, N_("Variance")},
112 {FRQ_ST_KURTOSIS, N_("Kurtosis")},
113 {FRQ_ST_SEKURTOSIS, N_("S.E. Kurt")},
114 {FRQ_ST_SKEWNESS, N_("Skewness")},
115 {FRQ_ST_SESKEWNESS, N_("S.E. Skew")},
116 {FRQ_ST_RANGE, N_("Range")},
117 {FRQ_ST_MINIMUM, N_("Minimum")},
118 {FRQ_ST_MAXIMUM, N_("Maximum")},
119 {FRQ_ST_SUM, N_("Sum")},
123 /* Percentiles to calculate. */
127 double p; /* the %ile to be calculated */
128 double value; /* the %ile's value */
129 double x1; /* The datum value <= the percentile */
130 double x2; /* The datum value >= the percentile */
132 int flag2; /* Set to 1 if this percentile value has been found */
133 bool show; /* True to show this percentile in the statistics box. */
137 static void add_percentile (double x, bool show);
139 static struct percentile *percentiles;
140 static int n_percentiles, n_show_percentiles;
142 /* Groups of statistics. */
144 #define frq_default \
145 (BI (frq_mean) | BI (frq_stddev) | BI (frq_min) | BI (frq_max))
147 (BI (frq_sum) | BI(frq_min) | BI(frq_max) \
148 | BI(frq_mean) | BI(frq_semean) | BI(frq_stddev) \
149 | BI(frq_variance) | BI(frq_kurt) | BI(frq_sekurt) \
150 | BI(frq_skew) | BI(frq_seskew) | BI(frq_range) \
151 | BI(frq_range) | BI(frq_mode) | BI(frq_median))
153 /* Statistics; number of statistics. */
154 static unsigned long stats;
159 double x_min; /* X axis minimum value. */
160 double x_max; /* X axis maximum value. */
161 int y_scale; /* Y axis scale: FRQ_FREQ or FRQ_PERCENT. */
163 /* Histograms only. */
164 double y_max; /* Y axis maximum value. */
165 bool draw_normal; /* Whether to draw normal curve. */
167 /* Pie charts only. */
168 bool include_missing; /* Whether to include missing values. */
171 /* Histogram and pie chart settings. */
172 static struct frq_chart hist, pie;
174 /* Parsed command. */
175 static struct cmd_frequencies cmd;
177 /* Variables for which to calculate statistics. */
178 static size_t n_variables;
179 static const struct variable **v_variables;
182 static struct pool *data_pool; /* For per-SPLIT FILE group data. */
183 static struct pool *syntax_pool; /* For syntax-related data. */
185 /* Frequency tables. */
187 /* Entire frequency table. */
190 struct hsh_table *data; /* Undifferentiated data. */
191 struct freq_mutable *valid; /* Valid freqs. */
192 int n_valid; /* Number of total freqs. */
193 const struct dictionary *dict; /* The dict from whence entries in the table
196 struct freq_mutable *missing; /* Missing freqs. */
197 int n_missing; /* Number of missing freqs. */
200 double total_cases; /* Sum of weights of all cases. */
201 double valid_cases; /* Sum of weights of valid cases. */
204 /* Per-variable frequency data. */
207 /* Freqency table. */
208 struct freq_tab tab; /* Frequencies table to use. */
211 int n_groups; /* Number of groups. */
212 double *groups; /* Groups. */
215 double stat[frq_n_stats];
217 /* Variable attributes. */
219 struct fmt_spec print;
222 static inline struct var_freqs *
223 get_var_freqs (const struct variable *v)
225 return var_get_aux (v);
228 static void determine_charts (void);
230 static void calc_stats (const struct variable *v, double d[frq_n_stats]);
232 static void precalc (struct casereader *, struct dataset *);
233 static void calc (const struct ccase *, const struct dataset *);
234 static void postcalc (const struct dataset *);
236 static void postprocess_freq_tab (const struct variable *);
237 static void dump_freq_table (const struct variable *, const struct variable *);
238 static void dump_statistics (const struct variable *, const struct variable *);
239 static void cleanup_freq_tab (const struct variable *);
241 static hsh_compare_func compare_value_numeric_a, compare_value_alpha_a;
242 static hsh_compare_func compare_value_numeric_d, compare_value_alpha_d;
243 static hsh_compare_func compare_freq_numeric_a, compare_freq_alpha_a;
244 static hsh_compare_func compare_freq_numeric_d, compare_freq_alpha_d;
247 static void do_piechart(const struct variable *var,
248 const struct freq_tab *frq_tab);
251 freq_tab_to_hist(const struct freq_tab *ft, const struct variable *var);
255 /* Parser and outline. */
257 static int internal_cmd_frequencies (struct lexer *lexer, struct dataset *ds);
260 cmd_frequencies (struct lexer *lexer, struct dataset *ds)
264 syntax_pool = pool_create ();
265 result = internal_cmd_frequencies (lexer, ds);
266 pool_destroy (syntax_pool);
268 pool_destroy (data_pool);
276 internal_cmd_frequencies (struct lexer *lexer, struct dataset *ds)
278 struct casegrouper *grouper;
279 struct casereader *input, *group;
284 n_show_percentiles = 0;
290 if (!parse_frequencies (lexer, ds, &cmd, NULL))
293 /* Figure out statistics to calculate. */
295 if (cmd.a_statistics[FRQ_ST_DEFAULT] || !cmd.sbc_statistics)
296 stats |= frq_default;
297 if (cmd.a_statistics[FRQ_ST_ALL])
299 if (cmd.sort != FRQ_AVALUE && cmd.sort != FRQ_DVALUE)
300 stats &= ~BIT_INDEX (frq_median);
301 for (i = 0; i < frq_n_stats; i++)
302 if (cmd.a_statistics[st_name[i].st_indx])
303 stats |= BIT_INDEX (i);
304 if (stats & frq_kurt)
305 stats |= BIT_INDEX (frq_sekurt);
306 if (stats & frq_skew)
307 stats |= BIT_INDEX (frq_seskew);
309 /* Calculate n_stats. */
311 for (i = 0; i < frq_n_stats; i++)
312 if ((stats & BIT_INDEX (i)))
317 if (cmd.sbc_histogram || cmd.sbc_piechart || cmd.sbc_ntiles)
318 cmd.sort = FRQ_AVALUE;
320 /* Work out what percentiles need to be calculated */
321 if ( cmd.sbc_percentiles )
323 for ( i = 0 ; i < MAXLISTS ; ++i )
326 subc_list_double *ptl_list = &cmd.dl_percentiles[i];
327 for ( pl = 0 ; pl < subc_list_double_count(ptl_list); ++pl)
328 add_percentile (subc_list_double_at(ptl_list, pl) / 100.0, true);
331 if ( cmd.sbc_ntiles )
333 for ( i = 0 ; i < cmd.sbc_ntiles ; ++i )
336 for (j = 0; j <= cmd.n_ntiles[i]; ++j )
337 add_percentile (j / (double) cmd.n_ntiles[i], true);
340 if (stats & BIT_INDEX (frq_median))
342 /* Treat the median as the 50% percentile.
343 We output it in the percentiles table as "50 (Median)." */
344 add_percentile (0.5, true);
345 stats &= ~BIT_INDEX (frq_median);
348 if (cmd.sbc_histogram)
350 add_percentile (0.25, false);
351 add_percentile (0.75, false);
355 input = casereader_create_filter_weight (proc_open (ds), dataset_dict (ds),
357 grouper = casegrouper_create_splits (input, dataset_dict (ds));
358 for (; casegrouper_get_next_group (grouper, &group);
359 casereader_destroy (group))
364 for (; (c = casereader_read (group)) != NULL; case_unref (c))
368 ok = casegrouper_destroy (grouper);
369 ok = proc_commit (ds) && ok;
371 free_frequencies(&cmd);
373 return ok ? CMD_SUCCESS : CMD_CASCADING_FAILURE;
376 /* Figure out which charts the user requested. */
378 determine_charts (void)
380 if (cmd.sbc_barchart)
381 msg (SW, _("Bar charts are not implemented."));
383 if (cmd.sbc_histogram)
385 hist.x_min = cmd.hi_min;
386 hist.x_max = cmd.hi_max;
387 hist.y_scale = cmd.hi_scale;
388 hist.y_max = cmd.hi_scale == FRQ_FREQ ? cmd.hi_freq : cmd.hi_pcnt;
389 hist.draw_normal = cmd.hi_norm != FRQ_NONORMAL;
390 hist.include_missing = false;
392 if (hist.x_min != SYSMIS && hist.x_max != SYSMIS
393 && hist.x_min >= hist.x_max)
395 msg (SE, _("MAX for histogram must be greater than or equal to MIN, "
396 "but MIN was specified as %.15g and MAX as %.15g. "
397 "MIN and MAX will be ignored."), hist.x_min, hist.x_max);
398 hist.x_min = hist.x_max = SYSMIS;
402 if (cmd.sbc_piechart)
404 pie.x_min = cmd.pie_min;
405 pie.x_max = cmd.pie_max;
406 pie.y_scale = cmd.pie_scale;
407 pie.include_missing = cmd.pie_missing == FRQ_MISSING;
409 if (pie.x_min != SYSMIS && pie.x_max != SYSMIS
410 && pie.x_min >= pie.x_max)
412 msg (SE, _("MAX for pie chart must be greater than or equal to MIN, "
413 "but MIN was specified as %.15g and MAX as %.15g. "
414 "MIN and MAX will be ignored."), pie.x_min, pie.x_max);
415 pie.x_min = pie.x_max = SYSMIS;
421 /* Add data from case C to the frequency table. */
423 calc (const struct ccase *c, const struct dataset *ds)
425 double weight = dict_get_case_weight (dataset_dict (ds), c, NULL);
428 for (i = 0; i < n_variables; i++)
430 const struct variable *v = v_variables[i];
431 const union value *val = case_data (c, v);
432 struct var_freqs *vf = get_var_freqs (v);
433 struct freq_tab *ft = &vf->tab;
435 struct freq_mutable target;
436 struct freq_mutable **fpp;
439 fpp = (struct freq_mutable **) hsh_probe (ft->data, &target);
442 (*fpp)->count += weight;
445 struct freq_mutable *fp = pool_alloc (data_pool, sizeof *fp);
447 value_init_pool (data_pool, &fp->value, vf->width);
448 value_copy (&fp->value, val, vf->width);
454 /* Prepares each variable that is the target of FREQUENCIES by setting
455 up its hash table. */
457 precalc (struct casereader *input, struct dataset *ds)
462 c = casereader_peek (input, 0);
465 output_split_file_values (ds, c);
469 pool_destroy (data_pool);
470 data_pool = pool_create ();
472 for (i = 0; i < n_variables; i++)
474 const struct variable *v = v_variables[i];
475 struct freq_tab *ft = &get_var_freqs (v)->tab;
477 ft->data = hsh_create (16, compare_freq, hash_freq, NULL, v);
481 /* Finishes up with the variables after frequencies have been
482 calculated. Displays statistics, percentiles, ... */
484 postcalc (const struct dataset *ds)
486 const struct dictionary *dict = dataset_dict (ds);
487 const struct variable *wv = dict_get_weight (dict);
490 for (i = 0; i < n_variables; i++)
492 const struct variable *v = v_variables[i];
493 struct var_freqs *vf = get_var_freqs (v);
494 struct freq_tab *ft = &vf->tab;
497 postprocess_freq_tab (v);
499 /* Frequencies tables. */
500 n_categories = ft->n_valid + ft->n_missing;
501 if (cmd.table == FRQ_TABLE
502 || (cmd.table == FRQ_LIMIT && n_categories <= cmd.limit))
503 dump_freq_table (v, wv);
507 dump_statistics (v, wv);
509 if (cmd.sbc_histogram && var_is_numeric (v) && ft->n_valid > 0)
511 double d[frq_n_stats];
512 struct histogram *histogram;
516 histogram = freq_tab_to_hist (ft, v);
518 chart_item_submit (histogram_chart_create (
519 histogram->gsl_hist, var_to_string(v),
525 statistic_destroy (&histogram->parent);
528 if (cmd.sbc_piechart)
529 do_piechart(v_variables[i], ft);
531 cleanup_freq_tab (v);
536 /* Returns the comparison function that should be used for
537 sorting a frequency table by FRQ_SORT using VAL_TYPE
539 static hsh_compare_func *
540 get_freq_comparator (int frq_sort, enum val_type val_type)
542 bool is_numeric = val_type == VAL_NUMERIC;
546 return is_numeric ? compare_value_numeric_a : compare_value_alpha_a;
548 return is_numeric ? compare_value_numeric_d : compare_value_alpha_d;
550 return is_numeric ? compare_freq_numeric_a : compare_freq_alpha_a;
552 return is_numeric ? compare_freq_numeric_d : compare_freq_alpha_d;
558 /* Returns true iff the value in struct freq_mutable F is non-missing
561 not_missing (const void *f_, const void *v_)
563 const struct freq_mutable *f = f_;
564 const struct variable *v = v_;
566 return !var_is_value_missing (v, &f->value, MV_ANY);
569 /* Summarizes the frequency table data for variable V. */
571 postprocess_freq_tab (const struct variable *v)
573 hsh_compare_func *compare;
577 struct freq_mutable *freqs, *f;
580 ft = &get_var_freqs (v)->tab;
581 compare = get_freq_comparator (cmd.sort, var_get_type (v));
583 /* Extract data from hash table. */
584 count = hsh_count (ft->data);
585 data = hsh_data (ft->data);
587 /* Copy dereferenced data into freqs. */
588 freqs = xnmalloc (count, sizeof *freqs);
589 for (i = 0; i < count; i++)
591 struct freq_mutable *f = data[i];
595 /* Put data into ft. */
597 ft->n_valid = partition (freqs, count, sizeof *freqs, not_missing, v);
598 ft->missing = freqs + ft->n_valid;
599 ft->n_missing = count - ft->n_valid;
602 sort (ft->valid, ft->n_valid, sizeof *ft->valid, compare, v);
603 sort (ft->missing, ft->n_missing, sizeof *ft->missing, compare, v);
605 /* Summary statistics. */
606 ft->valid_cases = 0.0;
607 for(i = 0 ; i < ft->n_valid ; ++i )
610 ft->valid_cases += f->count;
614 ft->total_cases = ft->valid_cases ;
615 for(i = 0 ; i < ft->n_missing ; ++i )
618 ft->total_cases += f->count;
623 /* Frees the frequency table for variable V. */
625 cleanup_freq_tab (const struct variable *v)
627 struct freq_tab *ft = &get_var_freqs (v)->tab;
629 hsh_destroy (ft->data);
632 /* Parses the VARIABLES subcommand, adding to
633 {n_variables,v_variables}. */
635 frq_custom_variables (struct lexer *lexer, struct dataset *ds, struct cmd_frequencies *cmd UNUSED, void *aux UNUSED)
637 size_t old_n_variables = n_variables;
640 lex_match (lexer, '=');
641 if (lex_token (lexer) != T_ALL && (lex_token (lexer) != T_ID
642 || dict_lookup_var (dataset_dict (ds), lex_tokid (lexer)) == NULL))
645 if (!parse_variables_const (lexer, dataset_dict (ds), &v_variables, &n_variables,
646 PV_APPEND | PV_NO_SCRATCH))
649 for (i = old_n_variables; i < n_variables; i++)
651 const struct variable *v = v_variables[i];
652 struct var_freqs *vf;
654 if (var_get_aux (v) != NULL)
656 msg (SE, _("Variable %s specified multiple times on VARIABLES "
657 "subcommand."), var_get_name (v));
660 vf = var_attach_aux (v, xmalloc (sizeof *vf), var_dtor_free);
661 vf->tab.valid = vf->tab.missing = NULL;
662 vf->tab.dict = dataset_dict (ds);
665 vf->width = var_get_width (v);
666 vf->print = *var_get_print_format (v);
671 /* Parses the GROUPED subcommand, setting the n_grouped, grouped
672 fields of specified variables. */
674 frq_custom_grouped (struct lexer *lexer, struct dataset *ds, struct cmd_frequencies *cmd UNUSED, void *aux UNUSED)
676 lex_match (lexer, '=');
677 if ((lex_token (lexer) == T_ID && dict_lookup_var (dataset_dict (ds), lex_tokid (lexer)) != NULL)
678 || lex_token (lexer) == T_ID)
683 /* Max, current size of list; list itself. */
689 const struct variable **v;
691 if (!parse_variables_const (lexer, dataset_dict (ds), &v, &n,
692 PV_NO_DUPLICATE | PV_NUMERIC))
694 if (lex_match (lexer, '('))
698 while (lex_integer (lexer))
703 dl = pool_nrealloc (syntax_pool, dl, ml, sizeof *dl);
705 dl[nl++] = lex_tokval (lexer);
707 lex_match (lexer, ',');
709 /* Note that nl might still be 0 and dl might still be
710 NULL. That's okay. */
711 if (!lex_match (lexer, ')'))
714 msg (SE, _("`)' expected after GROUPED interval list."));
724 for (i = 0; i < n; i++)
725 if (var_get_aux (v[i]) == NULL)
726 msg (SE, _("Variables %s specified on GROUPED but not on "
727 "VARIABLES."), var_get_name (v[i]));
730 struct var_freqs *vf = get_var_freqs (v[i]);
732 if (vf->groups != NULL)
733 msg (SE, _("Variables %s specified multiple times on GROUPED "
734 "subcommand."), var_get_name (v[i]));
742 if (!lex_match (lexer, '/'))
744 if ((lex_token (lexer) != T_ID || dict_lookup_var (dataset_dict (ds), lex_tokid (lexer)) != NULL)
745 && lex_token (lexer) != T_ALL)
747 lex_put_back (lexer, '/');
755 /* Adds X to the list of percentiles, keeping the list in proper
756 order. If SHOW is true, the percentile will be shown in the statistics
757 box, otherwise it will be hidden. */
759 add_percentile (double x, bool show)
763 for (i = 0; i < n_percentiles; i++)
765 /* Do nothing if it's already in the list */
766 if ( fabs(x - percentiles[i].p) < DBL_EPSILON )
768 if (show && !percentiles[i].show)
770 n_show_percentiles++;
771 percentiles[i].show = true;
776 if (x < percentiles[i].p)
780 if (i >= n_percentiles || x != percentiles[i].p)
782 percentiles = pool_nrealloc (syntax_pool, percentiles,
783 n_percentiles + 1, sizeof *percentiles);
784 insert_element (percentiles, n_percentiles, sizeof *percentiles, i);
785 percentiles[i].p = x;
786 percentiles[i].show = show;
789 n_show_percentiles++;
793 /* Comparison functions. */
795 /* Ascending numeric compare of values. */
797 compare_value_numeric_a (const void *a_, const void *b_, const void *aux UNUSED)
799 const struct freq_mutable *a = a_;
800 const struct freq_mutable *b = b_;
802 if (a->value.f > b->value.f)
804 else if (a->value.f < b->value.f)
810 /* Ascending string compare of values. */
812 compare_value_alpha_a (const void *a_, const void *b_, const void *v_)
814 const struct freq_mutable *a = a_;
815 const struct freq_mutable *b = b_;
816 const struct variable *v = v_;
817 struct var_freqs *vf = get_var_freqs (v);
819 return value_compare_3way (&a->value, &b->value, vf->width);
822 /* Descending numeric compare of values. */
824 compare_value_numeric_d (const void *a, const void *b, const void *aux UNUSED)
826 return -compare_value_numeric_a (a, b, aux);
829 /* Descending string compare of values. */
831 compare_value_alpha_d (const void *a, const void *b, const void *v)
833 return -compare_value_alpha_a (a, b, v);
836 /* Ascending numeric compare of frequency;
837 secondary key on ascending numeric value. */
839 compare_freq_numeric_a (const void *a_, const void *b_, const void *aux UNUSED)
841 const struct freq_mutable *a = a_;
842 const struct freq_mutable *b = b_;
844 if (a->count > b->count)
846 else if (a->count < b->count)
849 if (a->value.f > b->value.f)
851 else if (a->value.f < b->value.f)
857 /* Ascending numeric compare of frequency;
858 secondary key on ascending string value. */
860 compare_freq_alpha_a (const void *a_, const void *b_, const void *v_)
862 const struct freq_mutable *a = a_;
863 const struct freq_mutable *b = b_;
864 const struct variable *v = v_;
865 struct var_freqs *vf = get_var_freqs (v);
867 if (a->count > b->count)
869 else if (a->count < b->count)
872 return value_compare_3way (&a->value, &b->value, vf->width);
875 /* Descending numeric compare of frequency;
876 secondary key on ascending numeric value. */
878 compare_freq_numeric_d (const void *a_, const void *b_, const void *aux UNUSED)
880 const struct freq_mutable *a = a_;
881 const struct freq_mutable *b = b_;
883 if (a->count > b->count)
885 else if (a->count < b->count)
888 if (a->value.f > b->value.f)
890 else if (a->value.f < b->value.f)
896 /* Descending numeric compare of frequency;
897 secondary key on ascending string value. */
899 compare_freq_alpha_d (const void *a_, const void *b_, const void *v_)
901 const struct freq_mutable *a = a_;
902 const struct freq_mutable *b = b_;
903 const struct variable *v = v_;
904 struct var_freqs *vf = get_var_freqs (v);
906 if (a->count > b->count)
908 else if (a->count < b->count)
911 return value_compare_3way (&a->value, &b->value, vf->width);
914 /* Frequency table display. */
916 /* Displays a full frequency table for variable V. */
918 dump_freq_table (const struct variable *v, const struct variable *wv)
920 const struct fmt_spec *wfmt = wv ? var_get_print_format (wv) : &F_8_0;
922 struct var_freqs *vf;
924 struct freq_mutable *f;
927 double cum_total = 0.0;
928 double cum_freq = 0.0;
930 static const char *headings[] = {
939 vf = get_var_freqs (v);
941 n_categories = ft->n_valid + ft->n_missing;
942 t = tab_create (6, n_categories + 2);
943 tab_headers (t, 0, 0, 1, 0);
945 for (x = 0; x < 6; x++)
946 tab_text (t, x, 0, TAB_CENTER | TAT_TITLE, gettext (headings[x]));
949 for (f = ft->valid; f < ft->missing; f++)
952 double percent, valid_percent;
954 cum_freq += f->count;
956 percent = f->count / ft->total_cases * 100.0;
957 valid_percent = f->count / ft->valid_cases * 100.0;
958 cum_total += valid_percent;
960 label = var_lookup_value_label (v, &f->value);
962 tab_text (t, 0, r, TAB_LEFT, label);
964 tab_value (t, 1, r, TAB_NONE, &f->value, ft->dict, &vf->print);
965 tab_double (t, 2, r, TAB_NONE, f->count, wfmt);
966 tab_double (t, 3, r, TAB_NONE, percent, NULL);
967 tab_double (t, 4, r, TAB_NONE, valid_percent, NULL);
968 tab_double (t, 5, r, TAB_NONE, cum_total, NULL);
971 for (; f < &ft->valid[n_categories]; f++)
975 cum_freq += f->count;
977 label = var_lookup_value_label (v, &f->value);
979 tab_text (t, 0, r, TAB_LEFT, label);
981 tab_value (t, 1, r, TAB_NONE, &f->value, ft->dict, &vf->print);
982 tab_double (t, 2, r, TAB_NONE, f->count, wfmt);
983 tab_double (t, 3, r, TAB_NONE,
984 f->count / ft->total_cases * 100.0, NULL);
985 tab_text (t, 4, r, TAB_NONE, _("Missing"));
989 tab_box (t, TAL_1, TAL_1, -1, TAL_1, 0, 0, 5, r);
990 tab_hline (t, TAL_2, 0, 5, 1);
991 tab_hline (t, TAL_2, 0, 5, r);
992 tab_joint_text (t, 0, r, 1, r, TAB_RIGHT | TAT_TITLE, _("Total"));
993 tab_vline (t, TAL_0, 1, r, r);
994 tab_double (t, 2, r, TAB_NONE, cum_freq, wfmt);
995 tab_fixed (t, 3, r, TAB_NONE, 100.0, 5, 1);
996 tab_fixed (t, 4, r, TAB_NONE, 100.0, 5, 1);
998 tab_title (t, "%s", var_to_string (v));
1002 /* Statistical display. */
1004 /* Calculates all the pertinent statistics for variable V, putting them in
1007 calc_stats (const struct variable *v, double d[frq_n_stats])
1009 struct freq_tab *ft = &get_var_freqs (v)->tab;
1010 double W = ft->valid_cases;
1012 struct freq_mutable *f=0;
1020 /* Calculate percentiles. */
1022 assert (ft->n_valid > 0);
1024 for (i = 0; i < n_percentiles; i++)
1026 percentiles[i].flag = 0;
1027 percentiles[i].flag2 = 0;
1031 for (idx = 0; idx < ft->n_valid; ++idx)
1033 static double prev_value = SYSMIS;
1034 f = &ft->valid[idx];
1036 for (i = 0; i < n_percentiles; i++)
1039 if ( percentiles[i].flag2 ) continue ;
1041 if ( settings_get_algorithm () != COMPATIBLE )
1043 (ft->valid_cases - 1) * percentiles[i].p;
1046 (ft->valid_cases + 1) * percentiles[i].p - 1;
1048 if ( percentiles[i].flag )
1050 percentiles[i].x2 = f->value.f;
1051 percentiles[i].x1 = prev_value;
1052 percentiles[i].flag2 = 1;
1058 if ( f->count > 1 && rank - (f->count - 1) > tp )
1060 percentiles[i].x2 = percentiles[i].x1 = f->value.f;
1061 percentiles[i].flag2 = 1;
1065 percentiles[i].flag=1;
1071 prev_value = f->value.f;
1074 for (i = 0; i < n_percentiles; i++)
1076 /* Catches the case when p == 100% */
1077 if ( ! percentiles[i].flag2 )
1078 percentiles[i].x1 = percentiles[i].x2 = f->value.f;
1081 printf("percentile %d (p==%.2f); X1 = %g; X2 = %g\n",
1082 i,percentiles[i].p,percentiles[i].x1,percentiles[i].x2);
1086 for (i = 0; i < n_percentiles; i++)
1088 struct freq_tab *ft = &get_var_freqs (v)->tab;
1092 if ( settings_get_algorithm () != COMPATIBLE )
1094 s = modf((ft->valid_cases - 1) * percentiles[i].p , &dummy);
1098 s = modf((ft->valid_cases + 1) * percentiles[i].p -1, &dummy);
1101 percentiles[i].value = percentiles[i].x1 +
1102 ( percentiles[i].x2 - percentiles[i].x1) * s ;
1106 /* Calculate the mode. */
1109 for (f = ft->valid; f < ft->missing; f++)
1111 if (most_often < f->count)
1113 most_often = f->count;
1114 X_mode = f->value.f;
1116 else if (most_often == f->count)
1118 /* A duplicate mode is undefined.
1119 FIXME: keep track of *all* the modes. */
1124 /* Calculate moments. */
1125 m = moments_create (MOMENT_KURTOSIS);
1126 for (f = ft->valid; f < ft->missing; f++)
1127 moments_pass_one (m, f->value.f, f->count);
1128 for (f = ft->valid; f < ft->missing; f++)
1129 moments_pass_two (m, f->value.f, f->count);
1130 moments_calculate (m, NULL, &d[frq_mean], &d[frq_variance],
1131 &d[frq_skew], &d[frq_kurt]);
1132 moments_destroy (m);
1134 /* Formulas below are taken from _SPSS Statistical Algorithms_. */
1135 d[frq_min] = ft->valid[0].value.f;
1136 d[frq_max] = ft->valid[ft->n_valid - 1].value.f;
1137 d[frq_mode] = X_mode;
1138 d[frq_range] = d[frq_max] - d[frq_min];
1139 d[frq_sum] = d[frq_mean] * W;
1140 d[frq_stddev] = sqrt (d[frq_variance]);
1141 d[frq_semean] = d[frq_stddev] / sqrt (W);
1142 d[frq_seskew] = calc_seskew (W);
1143 d[frq_sekurt] = calc_sekurt (W);
1146 /* Displays a table of all the statistics requested for variable V. */
1148 dump_statistics (const struct variable *v, const struct variable *wv)
1150 const struct fmt_spec *wfmt = wv ? var_get_print_format (wv) : &F_8_0;
1151 struct freq_tab *ft;
1152 double stat_value[frq_n_stats];
1153 struct tab_table *t;
1156 if (var_is_alpha (v))
1158 ft = &get_var_freqs (v)->tab;
1159 if (ft->n_valid == 0)
1161 msg (SW, _("No valid data for variable %s; statistics not displayed."),
1165 calc_stats (v, stat_value);
1167 t = tab_create (3, n_stats + n_show_percentiles + 2);
1169 tab_box (t, TAL_1, TAL_1, -1, -1 , 0 , 0 , 2, tab_nr(t) - 1) ;
1172 tab_vline (t, TAL_1 , 2, 0, tab_nr(t) - 1);
1173 tab_vline (t, TAL_GAP , 1, 0, tab_nr(t) - 1 ) ;
1175 r=2; /* N missing and N valid are always dumped */
1177 for (i = 0; i < frq_n_stats; i++)
1178 if (stats & BIT_INDEX (i))
1180 tab_text (t, 0, r, TAB_LEFT | TAT_TITLE,
1181 gettext (st_name[i].s10));
1182 tab_double (t, 2, r, TAB_NONE, stat_value[i], NULL);
1186 tab_text (t, 0, 0, TAB_LEFT | TAT_TITLE, _("N"));
1187 tab_text (t, 1, 0, TAB_LEFT | TAT_TITLE, _("Valid"));
1188 tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Missing"));
1190 tab_double (t, 2, 0, TAB_NONE, ft->valid_cases, wfmt);
1191 tab_double (t, 2, 1, TAB_NONE, ft->total_cases - ft->valid_cases, wfmt);
1193 for (i = 0; i < n_percentiles; i++, r++)
1195 if (!percentiles[i].show)
1200 tab_text (t, 0, r, TAB_LEFT | TAT_TITLE, _("Percentiles"));
1203 if (percentiles[i].p == 0.5)
1204 tab_text (t, 1, r, TAB_LEFT, _("50 (Median)"));
1206 tab_fixed (t, 1, r, TAB_LEFT, percentiles[i].p * 100, 3, 0);
1207 tab_double (t, 2, r, TAB_NONE, percentiles[i].value,
1208 var_get_print_format (v));
1211 tab_title (t, "%s", var_to_string (v));
1217 calculate_iqr (void)
1223 for (i = 0; i < n_percentiles; i++)
1225 if (fabs (0.25 - percentiles[i].p) < DBL_EPSILON)
1226 q1 = percentiles[i].value;
1227 else if (fabs (0.75 - percentiles[i].p) < DBL_EPSILON)
1228 q3 = percentiles[i].value;
1231 return q1 == SYSMIS || q3 == SYSMIS ? SYSMIS : q3 - q1;
1235 chart_includes_value (const struct frq_chart *chart,
1236 const struct variable *var,
1237 const union value *value)
1239 if (!chart->include_missing && var_is_value_missing (var, value, MV_ANY))
1242 if (var_is_numeric (var)
1243 && ((chart->x_min != SYSMIS && value->f < chart->x_min)
1244 || (chart->x_max != SYSMIS && value->f > chart->x_max)))
1250 /* Create a gsl_histogram from a freq_tab */
1252 freq_tab_to_hist (const struct freq_tab *ft, const struct variable *var)
1254 double x_min, x_max, valid_freq;
1257 struct histogram *histogram;
1261 /* Find out the extremes of the x value, within the range to be included in
1262 the histogram, and sum the total frequency of those values. */
1266 for (i = 0; i < ft->n_valid; i++)
1268 const struct freq_mutable *frq = &ft->valid[i];
1269 if (chart_includes_value (&hist, var, &frq->value))
1271 x_min = MIN (x_min, frq->value.f);
1272 x_max = MAX (x_max, frq->value.f);
1273 valid_freq += frq->count;
1277 /* Freedman-Diaconis' choice of bin width. */
1278 iqr = calculate_iqr ();
1281 double bin_width = 2 * iqr / pow (valid_freq, 1.0 / 3.0);
1282 bins = (x_max - x_min) / bin_width;
1285 else if (bins > 400)
1291 histogram = histogram_create (bins, x_min, x_max);
1292 for (i = 0; i < ft->n_valid; i++)
1294 const struct freq_mutable *frq = &ft->valid[i];
1295 if (chart_includes_value (&hist, var, &frq->value))
1296 histogram_add (histogram, frq->value.f, frq->count);
1303 add_slice (const struct freq_mutable *freq, const struct variable *var,
1304 struct slice *slice)
1306 if (chart_includes_value (&pie, var, &freq->value))
1308 ds_init_empty (&slice->label);
1309 var_append_value_name (var, &freq->value, &slice->label);
1310 slice->magnitude = freq->count;
1317 /* Allocate an array of slices and fill them from the data in frq_tab
1318 n_slices will contain the number of slices allocated.
1319 The caller is responsible for freeing slices
1321 static struct slice *
1322 freq_tab_to_slice_array(const struct freq_tab *frq_tab,
1323 const struct variable *var,
1326 struct slice *slices;
1330 slices = xnmalloc (frq_tab->n_valid + frq_tab->n_missing, sizeof *slices);
1333 for (i = 0; i < frq_tab->n_valid; i++)
1334 n_slices += add_slice (&frq_tab->valid[i], var, &slices[n_slices]);
1335 for (i = 0; i < frq_tab->n_missing; i++)
1336 n_slices += add_slice (&frq_tab->missing[i], var, &slices[n_slices]);
1338 *n_slicesp = n_slices;
1346 do_piechart(const struct variable *var, const struct freq_tab *frq_tab)
1348 struct slice *slices;
1351 slices = freq_tab_to_slice_array (frq_tab, var, &n_slices);
1354 msg (SW, _("Omitting pie chart for %s, which has only %d unique values."),
1355 var_get_name (var), n_slices);
1356 else if (n_slices > 50)
1357 msg (SW, _("Omitting pie chart for %s, which has over 50 unique values."),
1358 var_get_name (var));
1360 chart_item_submit (piechart_create (var_to_string(var), slices, n_slices));
1362 for (i = 0; i < n_slices; i++)
1363 ds_destroy (&slices[i].label);