X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Ffrequencies.c;h=307a869ba2d1e17d723c80443f19b8ba7a7a7240;hb=5e755f453ddba1a9f0c9677d9484950d5c21f4fd;hp=8bd6c600205d3fe75dc6a99dd230c2ab9591146b;hpb=5cab4cf3322f29c0ed7134d23740e07382914f20;p=pspp diff --git a/src/language/stats/frequencies.c b/src/language/stats/frequencies.c index 8bd6c60020..307a869ba2 100644 --- a/src/language/stats/frequencies.c +++ b/src/language/stats/frequencies.c @@ -45,14 +45,12 @@ #include "libpspp/hmap.h" #include "libpspp/message.h" #include "libpspp/misc.h" -#include "libpspp/pool.h" #include "math/histogram.h" #include "math/moments.h" #include "math/chart-geometry.h" -#include "output/chart-item.h" #include "output/charts/barchart.h" #include "output/charts/piechart.h" #include "output/charts/plot-hist.h" @@ -70,7 +68,6 @@ struct percentile { double p; /* the %ile to be calculated */ - double value; /* the %ile's value */ bool show; /* True to show this percentile in the statistics box. */ }; @@ -191,12 +188,9 @@ struct var_freqs /* Freqency table. */ struct freq_tab tab; /* Frequencies table to use. */ - /* Percentiles. */ - int n_groups; /* Number of groups. */ - double *groups; /* Groups. */ - /* Statistics. */ double stat[FRQ_ST_count]; + double *percentiles; /* Variable attributes. */ int width; @@ -204,15 +198,13 @@ struct var_freqs struct frq_proc { - struct pool *pool; - struct var_freqs *vars; size_t n_vars; /* Percentiles to calculate and possibly display. */ struct percentile *percentiles; - const struct percentile *median; - int n_percentiles; + size_t median_idx; + size_t n_percentiles; /* Frequency table display. */ long int max_categories; /* Maximum categories to show. */ @@ -225,6 +217,8 @@ struct frq_proc /* Histogram and pie chart settings. */ struct frq_chart *hist, *pie, *bar; + + bool warn; }; @@ -271,9 +265,8 @@ compare_freq (const void *a_, const void *b_, const void *aux_) } /* Create a gsl_histogram from a freq_tab */ -static struct histogram * -freq_tab_to_hist (const struct frq_proc *frq, const struct freq_tab *ft, - const struct variable *var); +static struct histogram *freq_tab_to_hist (const struct frq_proc *, + const struct var_freqs *); static void put_freq_row (struct pivot_table *table, int var_idx, @@ -294,7 +287,7 @@ dump_freq_table (const struct var_freqs *vf, const struct variable *wv) const struct freq_tab *ft = &vf->tab; struct pivot_table *table = pivot_table_create__ (pivot_value_new_variable ( - vf->var)); + vf->var), "Frequencies"); pivot_table_set_weight_var (table, wv); pivot_dimension_create (table, PIVOT_AXIS_COLUMN, N_("Statistics"), @@ -361,20 +354,24 @@ calc_percentile (double p, double valid_cases, double x1, double x2) /* Calculates all of the percentiles for VF within FRQ. */ static void -calc_percentiles (const struct frq_proc *frq, const struct var_freqs *vf) +calc_percentiles (const struct frq_proc *frq, struct var_freqs *vf) { + if (!frq->n_percentiles) + return; + + vf->percentiles = xnmalloc (frq->n_percentiles, sizeof *vf->percentiles); + const struct freq_tab *ft = &vf->tab; - double W = ft->valid_cases; - const struct freq *f; - int percentile_idx = 0; - double rank = 0; + const double W = ft->valid_cases; + size_t idx = 0; - for (f = ft->valid; f < ft->missing; f++) + double rank = 0; + for (const struct freq *f = ft->valid; f < ft->missing; f++) { rank += f->count; - for (; percentile_idx < frq->n_percentiles; percentile_idx++) + for (; idx < frq->n_percentiles; idx++) { - struct percentile *pc = &frq->percentiles[percentile_idx]; + struct percentile *pc = &frq->percentiles[idx]; double tp; tp = (settings_get_algorithm () == ENHANCED @@ -385,18 +382,16 @@ calc_percentiles (const struct frq_proc *frq, const struct var_freqs *vf) break; if (tp + 1 < rank || f + 1 >= ft->missing) - pc->value = f->values[0].f; + vf->percentiles[idx] = f->values[0].f; else - pc->value = calc_percentile (pc->p, W, f->values[0].f, f[1].values[0].f); + vf->percentiles[idx] = calc_percentile (pc->p, W, f->values[0].f, + f[1].values[0].f); } } - for (; percentile_idx < frq->n_percentiles; percentile_idx++) - { - struct percentile *pc = &frq->percentiles[percentile_idx]; - pc->value = (ft->n_valid > 0 - ? ft->valid[ft->n_valid - 1].values[0].f - : SYSMIS); - } + for (; idx < frq->n_percentiles; idx++) + vf->percentiles[idx] = (ft->n_valid > 0 + ? ft->valid[ft->n_valid - 1].values[0].f + : SYSMIS); } /* Returns true iff the value in struct freq F is non-missing @@ -407,7 +402,7 @@ not_missing (const void *f_, const void *v_) const struct freq *f = f_; const struct variable *v = v_; - return !var_is_value_missing (v, f->values, MV_ANY); + return !var_is_value_missing (v, f->values); } @@ -441,7 +436,7 @@ postprocess_freq_tab (const struct frq_proc *frq, struct var_freqs *vf) /* Summary statistics. */ ft->valid_cases = 0.0; - for(i = 0 ; i < ft->n_valid ; ++i ) + for(i = 0 ; i < ft->n_valid ; ++i) { f = &ft->valid[i]; ft->valid_cases += f->count; @@ -449,7 +444,7 @@ postprocess_freq_tab (const struct frq_proc *frq, struct var_freqs *vf) } ft->total_cases = ft->valid_cases ; - for(i = 0 ; i < ft->n_missing ; ++i ) + for(i = 0 ; i < ft->n_missing ; ++i) { f = &ft->missing[i]; ft->total_cases += f->count; @@ -469,7 +464,7 @@ cleanup_freq_tab (struct var_freqs *vf) static void calc (struct frq_proc *frq, const struct ccase *c, const struct dataset *ds) { - double weight = dict_get_case_weight (dataset_dict (ds), c, NULL); + double weight = dict_get_case_weight (dataset_dict (ds), c, &frq->warn); size_t i; for (i = 0; i < frq->n_vars; i++) @@ -541,16 +536,16 @@ postcalc (struct frq_proc *frq, const struct dataset *ds) calc_stats (frq, vf, d); - histogram = freq_tab_to_hist (frq, &vf->tab, vf->var); + histogram = freq_tab_to_hist (frq, vf); - if ( histogram) + if (histogram) { - chart_item_submit (histogram_chart_create ( - histogram->gsl_hist, var_to_string(vf->var), - vf->tab.valid_cases, - d[FRQ_ST_MEAN], - d[FRQ_ST_STDDEV], - frq->hist->draw_normal)); + chart_submit (histogram_chart_create ( + histogram->gsl_hist, var_to_string(vf->var), + vf->tab.valid_cases, + d[FRQ_ST_MEAN], + d[FRQ_ST_STDDEV], + frq->hist->draw_normal)); statistic_destroy (&histogram->parent); } @@ -592,7 +587,6 @@ cmd_frequencies (struct lexer *lexer, struct dataset *ds) int hi_pcnt = INT_MIN; int hi_norm = FRQ_NONORMAL; - frq.pool = pool_create (); frq.sort = FRQ_AVALUE; frq.vars = NULL; @@ -613,13 +607,14 @@ cmd_frequencies (struct lexer *lexer, struct dataset *ds) frq.hist = NULL; frq.pie = NULL; frq.bar = NULL; + frq.warn = true; /* Accept an optional, completely pointless "/VARIABLES=" */ lex_match (lexer, T_SLASH); if (lex_match_id (lexer, "VARIABLES")) { - if (! lex_force_match (lexer, T_EQUALS) ) + if (! lex_force_match (lexer, T_EQUALS)) goto error; } @@ -629,7 +624,7 @@ cmd_frequencies (struct lexer *lexer, struct dataset *ds) PV_NO_DUPLICATE)) goto error; - frq.vars = xzalloc (frq.n_vars * sizeof (*frq.vars)); + frq.vars = xcalloc (frq.n_vars, sizeof (*frq.vars)); for (i = 0; i < frq.n_vars; ++i) { frq.vars[i].var = vars[i]; @@ -795,7 +790,7 @@ cmd_frequencies (struct lexer *lexer, struct dataset *ds) else if (lex_match_id (lexer, "LIMIT")) { if (!lex_force_match (lexer, T_LPAREN) - || !lex_force_int (lexer)) + || !lex_force_int_range (lexer, "LIMIT", 0, INT_MAX)) goto error; frq.max_categories = lex_integer (lexer); @@ -831,12 +826,11 @@ cmd_frequencies (struct lexer *lexer, struct dataset *ds) { lex_match (lexer, T_EQUALS); - if (lex_force_int (lexer)) + if (lex_force_int_range (lexer, "NTILES", 0, INT_MAX)) { - int i; int n = lex_integer (lexer); lex_get (lexer); - for (i = 0; i < n + 1; ++i) + for (int i = 0; i < n + 1; ++i) { frq.percentiles = xrealloc (frq.percentiles, @@ -894,13 +888,9 @@ cmd_frequencies (struct lexer *lexer, struct dataset *ds) hi_scale = FRQ_FREQ; if (lex_match (lexer, T_LPAREN)) { - if (lex_force_int (lexer)) + if (lex_force_int_range (lexer, "FREQ", 1, INT_MAX)) { hi_freq = lex_integer (lexer); - if (hi_freq <= 0) - { - lex_error (lexer, _("Histogram frequency must be greater than zero.")); - } lex_get (lexer); if (! lex_force_match (lexer, T_RPAREN)) goto error; @@ -912,13 +902,9 @@ cmd_frequencies (struct lexer *lexer, struct dataset *ds) hi_scale = FRQ_PERCENT; if (lex_match (lexer, T_LPAREN)) { - if (lex_force_int (lexer)) + if (lex_force_int_range (lexer, "PERCENT", 1, INT_MAX)) { hi_pcnt = lex_integer (lexer); - if (hi_pcnt <= 0) - { - lex_error (lexer, _("Histogram percentage must be greater than zero.")); - } lex_get (lexer); if (! lex_force_match (lexer, T_RPAREN)) goto error; @@ -1034,7 +1020,7 @@ cmd_frequencies (struct lexer *lexer, struct dataset *ds) } else if (lex_match_id (lexer, "FREQ")) { - if ( lex_match (lexer, T_LPAREN)) + if (lex_match (lexer, T_LPAREN)) { if (lex_force_num (lexer)) { @@ -1048,7 +1034,7 @@ cmd_frequencies (struct lexer *lexer, struct dataset *ds) } else if (lex_match_id (lexer, "PERCENT")) { - if ( lex_match (lexer, T_LPAREN)) + if (lex_match (lexer, T_LPAREN)) { if (lex_force_num (lexer)) { @@ -1215,11 +1201,11 @@ cmd_frequencies (struct lexer *lexer, struct dataset *ds) frq.n_percentiles = o; - frq.median = NULL; + frq.median_idx = SIZE_MAX; for (i = 0; i < frq.n_percentiles; i++) if (frq.percentiles[i].p == 0.5) { - frq.median = &frq.percentiles[i]; + frq.median_idx = i; break; } } @@ -1244,14 +1230,14 @@ cmd_frequencies (struct lexer *lexer, struct dataset *ds) ok = proc_commit (ds) && ok; } - free (vars); + for (size_t i = 0; i < frq.n_vars; i++) + free (frq.vars[i].percentiles); free (frq.vars); free (frq.bar); free (frq.pie); free (frq.hist); free (frq.percentiles); - pool_destroy (frq.pool); return CMD_SUCCESS; @@ -1259,17 +1245,18 @@ cmd_frequencies (struct lexer *lexer, struct dataset *ds) free (vars); free (frq.vars); + for (size_t i = 0; i < frq.n_vars; i++) + free (frq.vars[i].percentiles); free (frq.bar); free (frq.pie); free (frq.hist); free (frq.percentiles); - pool_destroy (frq.pool); return CMD_FAILURE; } static double -calculate_iqr (const struct frq_proc *frq) +calculate_iqr (const struct frq_proc *frq, const struct var_freqs *vf) { double q1 = SYSMIS; double q3 = SYSMIS; @@ -1282,9 +1269,9 @@ calculate_iqr (const struct frq_proc *frq) struct percentile *pc = &frq->percentiles[i]; if (fabs (0.25 - pc->p) < DBL_EPSILON) - q1 = pc->value; + q1 = vf->percentiles[i]; else if (fabs (0.75 - pc->p) < DBL_EPSILON) - q3 = pc->value; + q3 = vf->percentiles[i]; } return q1 == SYSMIS || q3 == SYSMIS ? SYSMIS : q3 - q1; @@ -1295,7 +1282,7 @@ chart_includes_value (const struct frq_chart *chart, const struct variable *var, const union value *value) { - if (!chart->include_missing && var_is_value_missing (var, value, MV_ANY)) + if (!chart->include_missing && var_is_value_missing (var, value)) return false; if (var_is_numeric (var) @@ -1308,24 +1295,17 @@ chart_includes_value (const struct frq_chart *chart, /* Create a gsl_histogram from a freq_tab */ static struct histogram * -freq_tab_to_hist (const struct frq_proc *frq, const struct freq_tab *ft, - const struct variable *var) +freq_tab_to_hist (const struct frq_proc *frq, const struct var_freqs *vf) { - double x_min, x_max, valid_freq; - int i; - double bin_width; - struct histogram *histogram; - double iqr; - /* Find out the extremes of the x value, within the range to be included in the histogram, and sum the total frequency of those values. */ - x_min = DBL_MAX; - x_max = -DBL_MAX; - valid_freq = 0; - for (i = 0; i < ft->n_valid; i++) + double x_min = DBL_MAX; + double x_max = -DBL_MAX; + double valid_freq = 0; + for (int i = 0; i < vf->tab.n_valid; i++) { - const struct freq *f = &ft->valid[i]; - if (chart_includes_value (frq->hist, var, f->values)) + const struct freq *f = &vf->tab.valid[i]; + if (chart_includes_value (frq->hist, vf->var, f->values)) { x_min = MIN (x_min, f->values[0].f); x_max = MAX (x_max, f->values[0].f); @@ -1336,25 +1316,21 @@ freq_tab_to_hist (const struct frq_proc *frq, const struct freq_tab *ft, if (valid_freq <= 0) return NULL; - iqr = calculate_iqr (frq); - - if (iqr > 0) - /* Freedman-Diaconis' choice of bin width. */ - bin_width = 2 * iqr / pow (valid_freq, 1.0 / 3.0); + double iqr = calculate_iqr (frq, vf); - else - /* Sturges Rule */ - bin_width = (x_max - x_min) / (1 + log2 (valid_freq)); + double bin_width = + (iqr > 0 + ? 2 * iqr / pow (valid_freq, 1.0 / 3.0) /* Freedman-Diaconis. */ + : (x_max - x_min) / (1 + log2 (valid_freq))); /* Sturges */ - histogram = histogram_create (bin_width, x_min, x_max); - - if ( histogram == NULL) + struct histogram *histogram = histogram_create (bin_width, x_min, x_max); + if (histogram == NULL) return NULL; - for (i = 0; i < ft->n_valid; i++) + for (int i = 0; i < vf->tab.n_valid; i++) { - const struct freq *f = &ft->valid[i]; - if (chart_includes_value (frq->hist, var, f->values)) + const struct freq *f = &vf->tab.valid[i]; + if (chart_includes_value (frq->hist, vf->var, f->values)) histogram_add (histogram, f->values[0].f, f->count); } @@ -1474,7 +1450,7 @@ do_piechart(const struct frq_chart *pie, const struct variable *var, msg (SW, _("Omitting pie chart for %s, which has over 50 unique values."), var_get_name (var)); else - chart_item_submit (piechart_create (var, slices, n_slices)); + chart_submit (piechart_create (var, slices, n_slices)); free (slices); } @@ -1487,10 +1463,13 @@ do_barchart(const struct frq_chart *bar, const struct variable **var, int n_slices; struct freq **slices = pick_cat_counts_ptr (bar, frq_tab, &n_slices); - chart_item_submit (barchart_create (var, 1, - (bar->y_scale == FRQ_FREQ) ? _("Count") : _("Percent"), - (bar->y_scale == FRQ_PERCENT), - slices, n_slices)); + if (n_slices < 1) + msg (SW, _("Omitting bar chart, which has no values.")); + else + chart_submit (barchart_create (var, 1, + (bar->y_scale == FRQ_FREQ) ? _("Count") : _("Percent"), + (bar->y_scale == FRQ_PERCENT), + slices, n_slices)); free (slices); } @@ -1553,7 +1532,9 @@ calc_stats (const struct frq_proc *frq, const struct var_freqs *vf, d[FRQ_ST_SEMEAN] = d[FRQ_ST_STDDEV] / sqrt (W); d[FRQ_ST_SESKEWNESS] = calc_seskew (W); d[FRQ_ST_SEKURTOSIS] = calc_sekurt (W); - d[FRQ_ST_MEDIAN] = frq->median ? frq->median->value : SYSMIS; + d[FRQ_ST_MEDIAN] = (frq->median_idx != SIZE_MAX + ? vf->percentiles[frq->median_idx] + : SYSMIS); } static bool @@ -1644,7 +1625,9 @@ dump_statistics (const struct frq_proc *frq, const struct variable *wv) if (!pc->show) continue; - union value v = { .f = vf->tab.n_valid ? pc->value : SYSMIS }; + union value v = { + .f = vf->tab.n_valid ? vf->percentiles[j] : SYSMIS + }; pivot_table_put2 (table, var_idx, row++, pivot_value_new_var_value (vf->var, &v)); }