X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Ffrequencies.c;h=d20d8392c176d4827a2f4f0e28d3f4e53b73bf0f;hb=3facf6314da231910f60a53805916e879b1782e9;hp=e5462d083d2fd44d6e24e2e23960d360c6ad8f6f;hpb=2ba9563cb1fddad9430be0c415dc81456f006281;p=pspp diff --git a/src/language/stats/frequencies.c b/src/language/stats/frequencies.c index e5462d083d..d20d8392c1 100644 --- a/src/language/stats/frequencies.c +++ b/src/language/stats/frequencies.c @@ -45,7 +45,6 @@ #include "libpspp/hmap.h" #include "libpspp/message.h" #include "libpspp/misc.h" -#include "libpspp/pool.h" #include "math/histogram.h" #include "math/moments.h" @@ -69,7 +68,6 @@ struct percentile { double p; /* the %ile to be calculated */ - double value; /* the %ile's value */ bool show; /* True to show this percentile in the statistics box. */ }; @@ -190,12 +188,9 @@ struct var_freqs /* Freqency table. */ struct freq_tab tab; /* Frequencies table to use. */ - /* Percentiles. */ - int n_groups; /* Number of groups. */ - double *groups; /* Groups. */ - /* Statistics. */ double stat[FRQ_ST_count]; + double *percentiles; /* Variable attributes. */ int width; @@ -203,15 +198,13 @@ struct var_freqs struct frq_proc { - struct pool *pool; - struct var_freqs *vars; size_t n_vars; /* Percentiles to calculate and possibly display. */ struct percentile *percentiles; - const struct percentile *median; - int n_percentiles; + size_t median_idx; + size_t n_percentiles; /* Frequency table display. */ long int max_categories; /* Maximum categories to show. */ @@ -249,8 +242,11 @@ static void do_barchart(const struct frq_chart *bar, const struct variable **var, const struct freq_tab *frq_tab); -static void dump_statistics (const struct frq_proc *frq, - const struct variable *wv); +static struct frq_stats_table *frq_stats_table_submit ( + struct frq_stats_table *, const struct frq_proc *, + const struct dictionary *, const struct variable *wv, + const struct ccase *example); +static void frq_stats_table_destroy (struct frq_stats_table *); static int compare_freq (const void *a_, const void *b_, const void *aux_) @@ -272,9 +268,8 @@ compare_freq (const void *a_, const void *b_, const void *aux_) } /* Create a gsl_histogram from a freq_tab */ -static struct histogram * -freq_tab_to_hist (const struct frq_proc *frq, const struct freq_tab *ft, - const struct variable *var); +static struct histogram *freq_tab_to_hist (const struct frq_proc *, + const struct var_freqs *); static void put_freq_row (struct pivot_table *table, int var_idx, @@ -362,20 +357,25 @@ calc_percentile (double p, double valid_cases, double x1, double x2) /* Calculates all of the percentiles for VF within FRQ. */ static void -calc_percentiles (const struct frq_proc *frq, const struct var_freqs *vf) +calc_percentiles (const struct frq_proc *frq, struct var_freqs *vf) { + if (!frq->n_percentiles) + return; + + if (!vf->percentiles) + vf->percentiles = xnmalloc (frq->n_percentiles, sizeof *vf->percentiles); + const struct freq_tab *ft = &vf->tab; - double W = ft->valid_cases; - const struct freq *f; - int percentile_idx = 0; - double rank = 0; + const double W = ft->valid_cases; + size_t idx = 0; - for (f = ft->valid; f < ft->missing; f++) + double rank = 0; + for (const struct freq *f = ft->valid; f < ft->missing; f++) { rank += f->count; - for (; percentile_idx < frq->n_percentiles; percentile_idx++) + for (; idx < frq->n_percentiles; idx++) { - struct percentile *pc = &frq->percentiles[percentile_idx]; + struct percentile *pc = &frq->percentiles[idx]; double tp; tp = (settings_get_algorithm () == ENHANCED @@ -386,18 +386,16 @@ calc_percentiles (const struct frq_proc *frq, const struct var_freqs *vf) break; if (tp + 1 < rank || f + 1 >= ft->missing) - pc->value = f->values[0].f; + vf->percentiles[idx] = f->values[0].f; else - pc->value = calc_percentile (pc->p, W, f->values[0].f, f[1].values[0].f); + vf->percentiles[idx] = calc_percentile (pc->p, W, f->values[0].f, + f[1].values[0].f); } } - for (; percentile_idx < frq->n_percentiles; percentile_idx++) - { - struct percentile *pc = &frq->percentiles[percentile_idx]; - pc->value = (ft->n_valid > 0 - ? ft->valid[ft->n_valid - 1].values[0].f - : SYSMIS); - } + for (; idx < frq->n_percentiles; idx++) + vf->percentiles[idx] = (ft->n_valid > 0 + ? ft->valid[ft->n_valid - 1].values[0].f + : SYSMIS); } /* Returns true iff the value in struct freq F is non-missing @@ -408,7 +406,7 @@ not_missing (const void *f_, const void *v_) const struct freq *f = f_; const struct variable *v = v_; - return !var_is_value_missing (v, f->values, MV_ANY); + return !var_is_value_missing (v, f->values); } @@ -458,14 +456,6 @@ postprocess_freq_tab (const struct frq_proc *frq, struct var_freqs *vf) } -/* Frees the frequency table for variable V. */ -static void -cleanup_freq_tab (struct var_freqs *vf) -{ - free (vf->tab.valid); - freq_hmap_destroy (&vf->tab.data, vf->width); -} - /* Add data from case C to the frequency table. */ static void calc (struct frq_proc *frq, const struct ccase *c, const struct dataset *ds) @@ -488,52 +478,52 @@ calc (struct frq_proc *frq, const struct ccase *c, const struct dataset *ds) } } -/* Prepares each variable that is the target of FREQUENCIES by setting - up its hash table. */ static void -precalc (struct frq_proc *frq, struct casereader *input, struct dataset *ds) +output_splits_once (bool *need_splits, const struct dataset *ds, + const struct ccase *c) { - struct ccase *c; - size_t i; - - c = casereader_peek (input, 0); - if (c != NULL) + if (*need_splits) { output_split_file_values (ds, c); - case_unref (c); + *need_splits = false; } - - for (i = 0; i < frq->n_vars; i++) - hmap_init (&frq->vars[i].tab.data); } /* Finishes up with the variables after frequencies have been calculated. Displays statistics, percentiles, ... */ -static void -postcalc (struct frq_proc *frq, const struct dataset *ds) +static struct frq_stats_table * +postcalc (struct frq_proc *frq, const struct dataset *ds, + struct ccase *example, struct frq_stats_table *fst) { const struct dictionary *dict = dataset_dict (ds); const struct variable *wv = dict_get_weight (dict); - size_t i; - for (i = 0; i < frq->n_vars; i++) + for (size_t i = 0; i < frq->n_vars; i++) { struct var_freqs *vf = &frq->vars[i]; postprocess_freq_tab (frq, vf); calc_percentiles (frq, vf); } + enum split_type st = dict_get_split_type (dict); + bool need_splits = true; if (frq->n_stats) - dump_statistics (frq, wv); + { + if (st != SPLIT_LAYERED) + output_splits_once (&need_splits, ds, example); + fst = frq_stats_table_submit (fst, frq, dict, wv, example); + } - for (i = 0; i < frq->n_vars; i++) + for (size_t i = 0; i < frq->n_vars; i++) { struct var_freqs *vf = &frq->vars[i]; /* Frequencies tables. */ if (vf->tab.n_valid + vf->tab.n_missing <= frq->max_categories) - dump_freq_table (vf, wv); - + { + output_splits_once (&need_splits, ds, example); + dump_freq_table (vf, wv); + } if (frq->hist && var_is_numeric (vf->var) && vf->tab.n_valid > 0) { @@ -542,10 +532,11 @@ postcalc (struct frq_proc *frq, const struct dataset *ds) calc_stats (frq, vf, d); - histogram = freq_tab_to_hist (frq, &vf->tab, vf->var); + histogram = freq_tab_to_hist (frq, vf); if (histogram) { + output_splits_once (&need_splits, ds, example); chart_submit (histogram_chart_create ( histogram->gsl_hist, var_to_string(vf->var), vf->tab.valid_cases, @@ -558,13 +549,49 @@ postcalc (struct frq_proc *frq, const struct dataset *ds) } if (frq->pie) - do_piechart(frq->pie, vf->var, &vf->tab); + { + output_splits_once (&need_splits, ds, example); + do_piechart(frq->pie, vf->var, &vf->tab); + } if (frq->bar) - do_barchart(frq->bar, &vf->var, &vf->tab); + { + output_splits_once (&need_splits, ds, example); + do_barchart(frq->bar, &vf->var, &vf->tab); + } - cleanup_freq_tab (vf); + free (vf->tab.valid); + freq_hmap_destroy (&vf->tab.data, vf->width); } + + return fst; +} + +static void +frq_run (struct frq_proc *frq, struct dataset *ds) +{ + struct frq_stats_table *fst = NULL; + struct casegrouper *grouper = casegrouper_create_splits (proc_open (ds), + dataset_dict (ds)); + struct casereader *group; + while (casegrouper_get_next_group (grouper, &group)) + { + for (size_t i = 0; i < frq->n_vars; i++) + hmap_init (&frq->vars[i].tab.data); + + struct ccase *example = casereader_peek (group, 0); + + struct ccase *c; + for (; (c = casereader_read (group)) != NULL; case_unref (c)) + calc (frq, c, ds); + fst = postcalc (frq, ds, example, fst); + casereader_destroy (group); + + case_unref (example); + } + frq_stats_table_destroy (fst); + casegrouper_destroy (grouper); + proc_commit (ds); } int @@ -593,7 +620,6 @@ cmd_frequencies (struct lexer *lexer, struct dataset *ds) int hi_pcnt = INT_MIN; int hi_norm = FRQ_NONORMAL; - frq.pool = pool_create (); frq.sort = FRQ_AVALUE; frq.vars = NULL; @@ -1208,61 +1234,44 @@ cmd_frequencies (struct lexer *lexer, struct dataset *ds) frq.n_percentiles = o; - frq.median = NULL; + frq.median_idx = SIZE_MAX; for (i = 0; i < frq.n_percentiles; i++) if (frq.percentiles[i].p == 0.5) { - frq.median = &frq.percentiles[i]; + frq.median_idx = i; break; } } - { - struct casegrouper *grouper; - struct casereader *group; - bool ok; - - grouper = casegrouper_create_splits (proc_open (ds), dataset_dict (ds)); - while (casegrouper_get_next_group (grouper, &group)) - { - struct ccase *c; - precalc (&frq, group, ds); - - for (; (c = casereader_read (group)) != NULL; case_unref (c)) - calc (&frq, c, ds); - postcalc (&frq, ds); - casereader_destroy (group); - } - ok = casegrouper_destroy (grouper); - ok = proc_commit (ds) && ok; - } - + frq_run (&frq, ds); free (vars); + for (size_t i = 0; i < frq.n_vars; i++) + free (frq.vars[i].percentiles); free (frq.vars); free (frq.bar); free (frq.pie); free (frq.hist); free (frq.percentiles); - pool_destroy (frq.pool); return CMD_SUCCESS; error: free (vars); + for (size_t i = 0; i < frq.n_vars; i++) + free (frq.vars[i].percentiles); free (frq.vars); free (frq.bar); free (frq.pie); free (frq.hist); free (frq.percentiles); - pool_destroy (frq.pool); return CMD_FAILURE; } static double -calculate_iqr (const struct frq_proc *frq) +calculate_iqr (const struct frq_proc *frq, const struct var_freqs *vf) { double q1 = SYSMIS; double q3 = SYSMIS; @@ -1275,9 +1284,9 @@ calculate_iqr (const struct frq_proc *frq) struct percentile *pc = &frq->percentiles[i]; if (fabs (0.25 - pc->p) < DBL_EPSILON) - q1 = pc->value; + q1 = vf->percentiles[i]; else if (fabs (0.75 - pc->p) < DBL_EPSILON) - q3 = pc->value; + q3 = vf->percentiles[i]; } return q1 == SYSMIS || q3 == SYSMIS ? SYSMIS : q3 - q1; @@ -1288,7 +1297,7 @@ chart_includes_value (const struct frq_chart *chart, const struct variable *var, const union value *value) { - if (!chart->include_missing && var_is_value_missing (var, value, MV_ANY)) + if (!chart->include_missing && var_is_value_missing (var, value)) return false; if (var_is_numeric (var) @@ -1301,24 +1310,17 @@ chart_includes_value (const struct frq_chart *chart, /* Create a gsl_histogram from a freq_tab */ static struct histogram * -freq_tab_to_hist (const struct frq_proc *frq, const struct freq_tab *ft, - const struct variable *var) +freq_tab_to_hist (const struct frq_proc *frq, const struct var_freqs *vf) { - double x_min, x_max, valid_freq; - int i; - double bin_width; - struct histogram *histogram; - double iqr; - /* Find out the extremes of the x value, within the range to be included in the histogram, and sum the total frequency of those values. */ - x_min = DBL_MAX; - x_max = -DBL_MAX; - valid_freq = 0; - for (i = 0; i < ft->n_valid; i++) + double x_min = DBL_MAX; + double x_max = -DBL_MAX; + double valid_freq = 0; + for (int i = 0; i < vf->tab.n_valid; i++) { - const struct freq *f = &ft->valid[i]; - if (chart_includes_value (frq->hist, var, f->values)) + const struct freq *f = &vf->tab.valid[i]; + if (chart_includes_value (frq->hist, vf->var, f->values)) { x_min = MIN (x_min, f->values[0].f); x_max = MAX (x_max, f->values[0].f); @@ -1329,25 +1331,21 @@ freq_tab_to_hist (const struct frq_proc *frq, const struct freq_tab *ft, if (valid_freq <= 0) return NULL; - iqr = calculate_iqr (frq); - - if (iqr > 0) - /* Freedman-Diaconis' choice of bin width. */ - bin_width = 2 * iqr / pow (valid_freq, 1.0 / 3.0); + double iqr = calculate_iqr (frq, vf); - else - /* Sturges Rule */ - bin_width = (x_max - x_min) / (1 + log2 (valid_freq)); - - histogram = histogram_create (bin_width, x_min, x_max); + double bin_width = + (iqr > 0 + ? 2 * iqr / pow (valid_freq, 1.0 / 3.0) /* Freedman-Diaconis. */ + : (x_max - x_min) / (1 + log2 (valid_freq))); /* Sturges */ + struct histogram *histogram = histogram_create (bin_width, x_min, x_max); if (histogram == NULL) return NULL; - for (i = 0; i < ft->n_valid; i++) + for (int i = 0; i < vf->tab.n_valid; i++) { - const struct freq *f = &ft->valid[i]; - if (chart_includes_value (frq->hist, var, f->values)) + const struct freq *f = &vf->tab.valid[i]; + if (chart_includes_value (frq->hist, vf->var, f->values)) histogram_add (histogram, f->values[0].f, f->count); } @@ -1504,21 +1502,14 @@ calc_stats (const struct frq_proc *frq, const struct var_freqs *vf, int most_often = -1; double X_mode = SYSMIS; - /* Calculate the mode. */ + /* Calculate the mode. If there is more than one mode, we take the + smallest. */ for (f = ft->valid; f < ft->missing; f++) - { - if (most_often < f->count) - { - most_often = f->count; - X_mode = f->values[0].f; - } - else if (most_often == f->count) - { - /* A duplicate mode is undefined. - FIXME: keep track of *all* the modes. */ - X_mode = SYSMIS; - } - } + if (most_often < f->count) + { + most_often = f->count; + X_mode = f->values[0].f; + } /* Calculate moments. */ m = moments_create (MOMENT_KURTOSIS); @@ -1549,7 +1540,9 @@ calc_stats (const struct frq_proc *frq, const struct var_freqs *vf, d[FRQ_ST_SEMEAN] = d[FRQ_ST_STDDEV] / sqrt (W); d[FRQ_ST_SESKEWNESS] = calc_seskew (W); d[FRQ_ST_SEKURTOSIS] = calc_sekurt (W); - d[FRQ_ST_MEDIAN] = frq->median ? frq->median->value : SYSMIS; + d[FRQ_ST_MEDIAN] = (frq->median_idx != SIZE_MAX + ? vf->percentiles[frq->median_idx] + : SYSMIS); } static bool @@ -1561,19 +1554,31 @@ all_string_variables (const struct frq_proc *frq) return true; } + +struct frq_stats_table + { + struct pivot_table *table; + struct pivot_splits *splits; + }; /* Displays a table of all the statistics requested. */ -static void -dump_statistics (const struct frq_proc *frq, const struct variable *wv) +static struct frq_stats_table * +frq_stats_table_create (const struct frq_proc *frq, + const struct dictionary *dict, + const struct variable *wv) { if (all_string_variables (frq)) - return; + return NULL; struct pivot_table *table = pivot_table_create (N_("Statistics")); pivot_table_set_weight_var (table, wv); struct pivot_dimension *variables = pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Variables")); + for (size_t i = 0; i < frq->n_vars; i++) + if (!var_is_alpha (frq->vars[i].var)) + pivot_category_create_leaf (variables->root, + pivot_value_new_variable (frq->vars[i].var)); struct pivot_dimension *statistics = pivot_dimension_create ( table, PIVOT_AXIS_ROW, N_("Statistics")); @@ -1601,6 +1606,30 @@ dump_statistics (const struct frq_proc *frq, const struct variable *wv) pc->p * 100.0)); } + struct pivot_splits *splits = pivot_splits_create (table, PIVOT_AXIS_COLUMN, + dict); + + struct frq_stats_table *fst = xmalloc (sizeof *fst); + *fst = (struct frq_stats_table) { .table = table, .splits = splits }; + return fst; +} + +static struct frq_stats_table * +frq_stats_table_submit (struct frq_stats_table *fst, + const struct frq_proc *frq, + const struct dictionary *dict, + const struct variable *wv, + const struct ccase *example) +{ + if (!fst) + { + fst = frq_stats_table_create (frq, dict, wv); + if (!fst) + return NULL; + } + pivot_splits_new_split (fst->splits, example); + + int var_idx = 0; for (size_t i = 0; i < frq->n_vars; i++) { struct var_freqs *vf = &frq->vars[i]; @@ -1609,13 +1638,10 @@ dump_statistics (const struct frq_proc *frq, const struct variable *wv) const struct freq_tab *ft = &vf->tab; - int var_idx = pivot_category_create_leaf ( - variables->root, pivot_value_new_variable (vf->var)); - int row = 0; - pivot_table_put2 (table, var_idx, row++, + pivot_splits_put2 (fst->splits, fst->table, var_idx, row++, pivot_value_new_number (ft->valid_cases)); - pivot_table_put2 (table, var_idx, row++, + pivot_splits_put2 (fst->splits, fst->table, var_idx, row++, pivot_value_new_number ( ft->total_cases - ft->valid_cases)); @@ -1631,7 +1657,7 @@ dump_statistics (const struct frq_proc *frq, const struct variable *wv) = (j == FRQ_ST_MODE || j == FRQ_ST_MINIMUM || j == FRQ_ST_MAXIMUM ? pivot_value_new_var_value (vf->var, &v) : pivot_value_new_number (v.f)); - pivot_table_put2 (table, var_idx, row++, pv); + pivot_splits_put2 (fst->splits, fst->table, var_idx, row++, pv); } for (size_t j = 0; j < frq->n_percentiles; j++) @@ -1640,11 +1666,31 @@ dump_statistics (const struct frq_proc *frq, const struct variable *wv) if (!pc->show) continue; - union value v = { .f = vf->tab.n_valid ? pc->value : SYSMIS }; - pivot_table_put2 (table, var_idx, row++, - pivot_value_new_var_value (vf->var, &v)); + union value v = { + .f = vf->tab.n_valid ? vf->percentiles[j] : SYSMIS + }; + pivot_splits_put2 (fst->splits, fst->table, var_idx, row++, + pivot_value_new_var_value (vf->var, &v)); } + + var_idx++; } - pivot_table_submit (table); + if (!fst->splits) + { + frq_stats_table_destroy (fst); + return NULL; + } + return fst; +} + +static void +frq_stats_table_destroy (struct frq_stats_table *fst) +{ + if (!fst) + return; + + pivot_table_submit (fst->table); + pivot_splits_destroy (fst->splits); + free (fst); }