X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Ffrequencies.q;h=a09ecc100759b76d2ecf4d682a050c9999be79f5;hb=37f8a4cc05e3e99f0219ed2765ed242e63464857;hp=9d90e06bd96cd63c346d5d036706a6ce18ba3799;hpb=43b1296aafe7582e7dbe6c2b6a8b478d7d9b0fcf;p=pspp-builds.git diff --git a/src/language/stats/frequencies.q b/src/language/stats/frequencies.q index 9d90e06b..a09ecc10 100644 --- a/src/language/stats/frequencies.q +++ b/src/language/stats/frequencies.q @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2007 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -38,12 +38,10 @@ #include #include #include -#include #include #include #include #include -#include #include #include #include @@ -60,6 +58,7 @@ #include "freq.h" #include "minmax.h" +#include "xalloc.h" #include "gettext.h" #define _(msgid) gettext (msgid) @@ -156,8 +155,6 @@ static void add_percentile (double x) ; static struct percentile *percentiles; static int n_percentiles; -static int implicit_50th ; - /* Groups of statistics. */ #define BI BIT_INDEX #define frq_default \ @@ -198,34 +195,16 @@ static int normal; /* FIXME */ static size_t n_variables; static const struct variable **v_variables; -/* Arenas used to store semi-permanent storage. */ -static struct pool *int_pool; /* Integer mode. */ -static struct pool *gen_pool; /* General mode. */ +/* Pools. */ +static struct pool *data_pool; /* For per-SPLIT FILE group data. */ +static struct pool *syntax_pool; /* For syntax-related data. */ /* Frequency tables. */ -/* Types of frequency tables. */ -enum - { - FRQM_GENERAL, - FRQM_INTEGER - }; - /* Entire frequency table. */ struct freq_tab { - int mode; /* FRQM_GENERAL or FRQM_INTEGER. */ - - /* General mode. */ struct hsh_table *data; /* Undifferentiated data. */ - - /* Integer mode. */ - double *vector; /* Frequencies proper. */ - int min, max; /* The boundaries of the table. */ - double out_of_range; /* Sum of weights of out-of-range values. */ - double sysmis; /* Sum of weights of SYSMIS values. */ - - /* All modes. */ struct freq *valid; /* Valid freqs. */ int n_valid; /* Number of total freqs. */ @@ -272,12 +251,12 @@ static void calc_stats (const struct variable *v, double d[frq_n_stats]); static void precalc (struct casereader *, struct dataset *); static void calc (const struct ccase *, const struct dataset *); -static void postcalc (void); +static void postcalc (const struct dataset *); static void postprocess_freq_tab (const struct variable *); -static void dump_full (const struct variable *); -static void dump_condensed (const struct variable *); -static void dump_statistics (const struct variable *, int show_varname); +static void dump_full ( const struct variable *, const struct variable *); +static void dump_condensed (const struct variable *, const struct variable *); +static void dump_statistics (const struct variable *, bool show_varname, const struct variable *); static void cleanup_freq_tab (const struct variable *); static hsh_compare_func compare_value_numeric_a, compare_value_alpha_a; @@ -303,12 +282,12 @@ cmd_frequencies (struct lexer *lexer, struct dataset *ds) { int result; - int_pool = pool_create (); + syntax_pool = pool_create (); result = internal_cmd_frequencies (lexer, ds); - pool_destroy (int_pool); - int_pool=0; - pool_destroy (gen_pool); - gen_pool=0; + pool_destroy (syntax_pool); + syntax_pool=0; + pool_destroy (data_pool); + data_pool=0; free (v_variables); v_variables=0; return result; @@ -331,7 +310,7 @@ internal_cmd_frequencies (struct lexer *lexer, struct dataset *ds) if (!parse_frequencies (lexer, ds, &cmd, NULL)) return CMD_FAILURE; - if (cmd.onepage_limit == NOT_LONG) + if (cmd.onepage_limit == LONG_MIN) cmd.onepage_limit = 50; /* Figure out statistics to calculate. */ @@ -341,14 +320,14 @@ internal_cmd_frequencies (struct lexer *lexer, struct dataset *ds) if (cmd.a_statistics[FRQ_ST_ALL]) stats |= frq_all; if (cmd.sort != FRQ_AVALUE && cmd.sort != FRQ_DVALUE) - stats &= ~frq_median; + stats &= ~BIT_INDEX (frq_median); for (i = 0; i < frq_n_stats; i++) if (cmd.a_statistics[st_name[i].st_indx]) stats |= BIT_INDEX (i); if (stats & frq_kurt) - stats |= frq_sekurt; + stats |= BIT_INDEX (frq_sekurt); if (stats & frq_skew) - stats |= frq_seskew; + stats |= BIT_INDEX (frq_seskew); /* Calculate n_stats. */ n_stats = 0; @@ -381,7 +360,14 @@ internal_cmd_frequencies (struct lexer *lexer, struct dataset *ds) add_percentile (j / (double) cmd.n_ntiles[i]); } } - + if (stats & BIT_INDEX (frq_median)) + { + /* Treat the median as the 50% percentile. + We output it in the percentiles table as "50 (Median)." */ + add_percentile (0.5); + stats &= ~BIT_INDEX (frq_median); + n_stats--; + } /* Do it! */ input = casereader_create_filter_weight (proc_open (ds), dataset_dict (ds), @@ -395,7 +381,7 @@ internal_cmd_frequencies (struct lexer *lexer, struct dataset *ds) precalc (group, ds); for (; casereader_read (group, &c); case_destroy (&c)) calc (&c, ds); - postcalc (); + postcalc (ds); } ok = casegrouper_destroy (grouper); ok = proc_commit (ds) && ok; @@ -524,46 +510,23 @@ calc (const struct ccase *c, const struct dataset *ds) struct var_freqs *vf = get_var_freqs (v); struct freq_tab *ft = &vf->tab; - switch (ft->mode) - { - case FRQM_GENERAL: - { - /* General mode. */ - struct freq target; - struct freq **fpp; - - target.value = (union value *) val; - fpp = (struct freq **) hsh_probe (ft->data, &target); - - if (*fpp != NULL) - (*fpp)->count += weight; - else - { - struct freq *fp = pool_alloc (gen_pool, sizeof *fp); - fp->count = weight; - fp->value = pool_clone (gen_pool, - val, - MAX (MAX_SHORT_STRING, vf->width)); - *fpp = fp; - } - } - break; - case FRQM_INTEGER: - /* Integer mode. */ - if (val->f == SYSMIS) - ft->sysmis += weight; - else if (val->f > INT_MIN+1 && val->f < INT_MAX-1) - { - int i = val->f; - if (i >= ft->min && i <= ft->max) - ft->vector[i - ft->min] += weight; - } - else - ft->out_of_range += weight; - break; - default: - NOT_REACHED (); - } + struct freq target; + struct freq **fpp; + + target.value = (union value *) val; + fpp = (struct freq **) hsh_probe (ft->data, &target); + + if (*fpp != NULL) + (*fpp)->count += weight; + else + { + struct freq *fp = pool_alloc (data_pool, sizeof *fp); + fp->count = weight; + fp->value = pool_clone (data_pool, + val, + MAX (MAX_SHORT_STRING, vf->width)); + *fpp = fp; + } } } @@ -575,40 +538,31 @@ precalc (struct casereader *input, struct dataset *ds) struct ccase c; size_t i; - if (!casereader_peek (input, 0, &c)) - return; - output_split_file_values (ds, &c); - case_destroy (&c); + if (casereader_peek (input, 0, &c)) + { + output_split_file_values (ds, &c); + case_destroy (&c); + } - pool_destroy (gen_pool); - gen_pool = pool_create (); + pool_destroy (data_pool); + data_pool = pool_create (); for (i = 0; i < n_variables; i++) { const struct variable *v = v_variables[i]; struct freq_tab *ft = &get_var_freqs (v)->tab; - if (ft->mode == FRQM_GENERAL) - { - ft->data = hsh_create (16, compare_freq, hash_freq, NULL, v); - } - else - { - int j; - - for (j = (ft->max - ft->min); j >= 0; j--) - ft->vector[j] = 0.0; - ft->out_of_range = 0.0; - ft->sysmis = 0.0; - } + ft->data = hsh_create (16, compare_freq, hash_freq, NULL, v); } } /* Finishes up with the variables after frequencies have been calculated. Displays statistics, percentiles, ... */ static void -postcalc (void) +postcalc (const struct dataset *ds) { + const struct dictionary *dict = dataset_dict (ds); + const struct variable *wv = dict_get_weight (dict); size_t i; for (i = 0; i < n_variables; i++) @@ -628,16 +582,16 @@ postcalc (void) switch (cmd.cond) { case FRQ_CONDENSE: - dump_condensed (v); + dump_condensed (v, wv); break; case FRQ_STANDARD: - dump_full (v); + dump_full (v, wv); break; case FRQ_ONEPAGE: if (n_categories > cmd.onepage_limit) - dump_condensed (v); + dump_condensed (v, wv); else - dump_full (v); + dump_full (v, wv); break; default: NOT_REACHED (); @@ -647,7 +601,7 @@ postcalc (void) /* Statistics. */ if (n_stats) - dump_statistics (v, !dumped_freq_tab); + dump_statistics (v, !dumped_freq_tab, wv); @@ -685,12 +639,12 @@ postcalc (void) } /* Returns the comparison function that should be used for - sorting a frequency table by FRQ_SORT using VAR_TYPE - variables. */ + sorting a frequency table by FRQ_SORT using VAL_TYPE + values. */ static hsh_compare_func * -get_freq_comparator (int frq_sort, enum var_type var_type) +get_freq_comparator (int frq_sort, enum val_type val_type) { - bool is_numeric = var_type == VAR_NUMERIC; + bool is_numeric = val_type == VAL_NUMERIC; switch (frq_sort) { case FRQ_AVALUE: @@ -729,7 +683,6 @@ postprocess_freq_tab (const struct variable *v) size_t i; ft = &get_var_freqs (v)->tab; - assert (ft->mode == FRQM_GENERAL); compare = get_freq_comparator (cmd.sort, var_get_type (v)); /* Extract data from hash table. */ @@ -777,7 +730,6 @@ static void cleanup_freq_tab (const struct variable *v) { struct freq_tab *ft = &get_var_freqs (v)->tab; - assert (ft->mode == FRQM_GENERAL); free (ft->valid); hsh_destroy (ft->data); } @@ -787,9 +739,6 @@ cleanup_freq_tab (const struct variable *v) static int frq_custom_variables (struct lexer *lexer, struct dataset *ds, struct cmd_frequencies *cmd UNUSED, void *aux UNUSED) { - int mode; - int min = 0, max = 0; - size_t old_n_variables = n_variables; size_t i; @@ -802,31 +751,6 @@ frq_custom_variables (struct lexer *lexer, struct dataset *ds, struct cmd_freque PV_APPEND | PV_NO_SCRATCH)) return 0; - if (!lex_match (lexer, '(')) - mode = FRQM_GENERAL; - else - { - mode = FRQM_INTEGER; - if (!lex_force_int (lexer)) - return 0; - min = lex_integer (lexer); - lex_get (lexer); - if (!lex_force_match (lexer, ',')) - return 0; - if (!lex_force_int (lexer)) - return 0; - max = lex_integer (lexer); - lex_get (lexer); - if (!lex_force_match (lexer, ')')) - return 0; - if (max < min) - { - msg (SE, _("Upper limit of integer mode value range must be " - "greater than lower limit.")); - return 0; - } - } - for (i = old_n_variables; i < n_variables; i++) { const struct variable *v = v_variables[i]; @@ -838,30 +762,13 @@ frq_custom_variables (struct lexer *lexer, struct dataset *ds, struct cmd_freque "subcommand."), var_get_name (v)); return 0; } - if (mode == FRQM_INTEGER && !var_is_numeric (v)) - { - msg (SE, _("Integer mode specified, but %s is not a numeric " - "variable."), var_get_name (v)); - return 0; - } - vf = var_attach_aux (v, xmalloc (sizeof *vf), var_dtor_free); - vf->tab.mode = mode; vf->tab.valid = vf->tab.missing = NULL; - if (mode == FRQM_INTEGER) - { - vf->tab.min = min; - vf->tab.max = max; - vf->tab.vector = pool_nalloc (int_pool, - max - min + 1, sizeof *vf->tab.vector); - } - else - vf->tab.vector = NULL; vf->n_groups = 0; vf->groups = NULL; vf->width = var_get_width (v); vf->print = *var_get_print_format (v); - if (vf->width > MAX_SHORT_STRING && get_algorithm () == COMPATIBLE) + if (vf->width > MAX_SHORT_STRING && settings_get_algorithm () == COMPATIBLE) { enum fmt_type type = var_get_print_format (v)->type; vf->width = MAX_SHORT_STRING; @@ -903,7 +810,7 @@ frq_custom_grouped (struct lexer *lexer, struct dataset *ds, struct cmd_frequenc if (nl >= ml) { ml += 16; - dl = pool_nrealloc (int_pool, dl, ml, sizeof *dl); + dl = pool_nrealloc (syntax_pool, dl, ml, sizeof *dl); } dl[nl++] = lex_tokval (lexer); lex_get (lexer); @@ -974,13 +881,9 @@ add_percentile (double x) if (i >= n_percentiles || x != percentiles[i].p) { - percentiles = pool_nrealloc (int_pool, percentiles, + percentiles = pool_nrealloc (syntax_pool, percentiles, n_percentiles + 1, sizeof *percentiles); - - if (i < n_percentiles) - memmove (&percentiles[i + 1], &percentiles[i], - (n_percentiles - i) * sizeof (struct percentile) ); - + insert_element (percentiles, n_percentiles, sizeof *percentiles, i); percentiles[i].p = x; n_percentiles++; } @@ -1133,8 +1036,9 @@ full_dim (struct tab_table *t, struct outp_driver *d) /* Displays a full frequency table for variable V. */ static void -dump_full (const struct variable *v) +dump_full (const struct variable *v, const struct variable *wv) { + const struct fmt_spec *wfmt = wv ? var_get_print_format (wv) : &F_8_0; int n_categories; struct var_freqs *vf; struct freq_tab *ft; @@ -1203,10 +1107,10 @@ dump_full (const struct variable *v) } tab_value (t, 0 + lab, r, TAB_NONE, f->value, &vf->print); - tab_float (t, 1 + lab, r, TAB_NONE, f->count, 8, 0); - tab_float (t, 2 + lab, r, TAB_NONE, percent, 5, 1); - tab_float (t, 3 + lab, r, TAB_NONE, valid_percent, 5, 1); - tab_float (t, 4 + lab, r, TAB_NONE, cum_total, 5, 1); + tab_double (t, 1 + lab, r, TAB_NONE, f->count, wfmt); + tab_double (t, 2 + lab, r, TAB_NONE, percent, NULL); + tab_double (t, 3 + lab, r, TAB_NONE, valid_percent, NULL); + tab_double (t, 4 + lab, r, TAB_NONE, cum_total, NULL); r++; } for (; f < &ft->valid[n_categories]; f++) @@ -1221,9 +1125,9 @@ dump_full (const struct variable *v) } tab_value (t, 0 + lab, r, TAB_NONE, f->value, &vf->print); - tab_float (t, 1 + lab, r, TAB_NONE, f->count, 8, 0); - tab_float (t, 2 + lab, r, TAB_NONE, - f->count / ft->total_cases * 100.0, 5, 1); + tab_double (t, 1 + lab, r, TAB_NONE, f->count, wfmt); + tab_double (t, 2 + lab, r, TAB_NONE, + f->count / ft->total_cases * 100.0, NULL); tab_text (t, 3 + lab, r, TAB_NONE, _("Missing")); r++; } @@ -1235,9 +1139,9 @@ dump_full (const struct variable *v) tab_hline (t, TAL_2, 0, 4 + lab, r); tab_joint_text (t, 0, r, 0 + lab, r, TAB_RIGHT | TAT_TITLE, _("Total")); tab_vline (t, TAL_0, 1, r, r); - tab_float (t, 1 + lab, r, TAB_NONE, cum_freq, 8, 0); - tab_float (t, 2 + lab, r, TAB_NONE, 100.0, 5, 1); - tab_float (t, 3 + lab, r, TAB_NONE, 100.0, 5, 1); + tab_double (t, 1 + lab, r, TAB_NONE, cum_freq, wfmt); + tab_fixed (t, 2 + lab, r, TAB_NONE, 100.0, 5, 1); + tab_fixed (t, 3 + lab, r, TAB_NONE, 100.0, 5, 1); tab_title (t, "%s", var_to_string (v)); tab_submit (t); @@ -1264,8 +1168,9 @@ condensed_dim (struct tab_table *t, struct outp_driver *d) /* Display condensed frequency table for variable V. */ static void -dump_condensed (const struct variable *v) +dump_condensed (const struct variable *v, const struct variable *wv) { + const struct fmt_spec *wfmt = wv ? var_get_print_format (wv) : &F_8_0; int n_categories; struct var_freqs *vf; struct freq_tab *ft; @@ -1296,17 +1201,17 @@ dump_condensed (const struct variable *v) cum_total += f->count / ft->valid_cases * 100.0; tab_value (t, 0, r, TAB_NONE, f->value, &vf->print); - tab_float (t, 1, r, TAB_NONE, f->count, 8, 0); - tab_float (t, 2, r, TAB_NONE, percent, 3, 0); - tab_float (t, 3, r, TAB_NONE, cum_total, 3, 0); + tab_double (t, 1, r, TAB_NONE, f->count, wfmt); + tab_double (t, 2, r, TAB_NONE, percent, NULL); + tab_double (t, 3, r, TAB_NONE, cum_total, NULL); r++; } for (; f < &ft->valid[n_categories]; f++) { tab_value (t, 0, r, TAB_NONE, f->value, &vf->print); - tab_float (t, 1, r, TAB_NONE, f->count, 8, 0); - tab_float (t, 2, r, TAB_NONE, - f->count / ft->total_cases * 100.0, 3, 0); + tab_double (t, 1, r, TAB_NONE, f->count, wfmt); + tab_double (t, 2, r, TAB_NONE, + f->count / ft->total_cases * 100.0, NULL); r++; } @@ -1336,28 +1241,9 @@ calc_stats (const struct variable *v, double d[frq_n_stats]) double rank; int i = 0; int idx; - double *median_value; /* Calculate percentiles. */ - /* If the 50th percentile was not explicitly requested then we must - calculate it anyway --- it's the median */ - median_value = 0 ; - for (i = 0; i < n_percentiles; i++) - { - if (percentiles[i].p == 0.5) - { - median_value = &percentiles[i].value; - break; - } - } - - if ( 0 == median_value ) - { - add_percentile (0.5); - implicit_50th = 1; - } - for (i = 0; i < n_percentiles; i++) { percentiles[i].flag = 0; @@ -1375,7 +1261,7 @@ calc_stats (const struct variable *v, double d[frq_n_stats]) double tp; if ( percentiles[i].flag2 ) continue ; - if ( get_algorithm() != COMPATIBLE ) + if ( settings_get_algorithm () != COMPATIBLE ) tp = (ft->valid_cases - 1) * percentiles[i].p; else @@ -1426,7 +1312,7 @@ calc_stats (const struct variable *v, double d[frq_n_stats]) double s; double dummy; - if ( get_algorithm() != COMPATIBLE ) + if ( settings_get_algorithm () != COMPATIBLE ) { s = modf((ft->valid_cases - 1) * percentiles[i].p , &dummy); } @@ -1437,9 +1323,6 @@ calc_stats (const struct variable *v, double d[frq_n_stats]) percentiles[i].value = percentiles[i].x1 + ( percentiles[i].x2 - percentiles[i].x1) * s ; - - if ( percentiles[i].p == 0.50) - median_value = &percentiles[i].value; } @@ -1476,7 +1359,6 @@ calc_stats (const struct variable *v, double d[frq_n_stats]) d[frq_max] = ft->valid[ft->n_valid - 1].value[0].f; d[frq_mode] = X_mode; d[frq_range] = d[frq_max] - d[frq_min]; - d[frq_median] = *median_value; d[frq_sum] = d[frq_mean] * W; d[frq_stddev] = sqrt (d[frq_variance]); d[frq_semean] = d[frq_stddev] / sqrt (W); @@ -1486,18 +1368,15 @@ calc_stats (const struct variable *v, double d[frq_n_stats]) /* Displays a table of all the statistics requested for variable V. */ static void -dump_statistics (const struct variable *v, int show_varname) +dump_statistics (const struct variable *v, bool show_varname, + const struct variable *wv) { + const struct fmt_spec *wfmt = wv ? var_get_print_format (wv) : &F_8_0; struct freq_tab *ft; double stat_value[frq_n_stats]; struct tab_table *t; int i, r; - int n_explicit_percentiles = n_percentiles; - - if ( implicit_50th && n_percentiles > 0 ) - --n_percentiles; - if (var_is_alpha (v)) return; ft = &get_var_freqs (v)->tab; @@ -1509,7 +1388,7 @@ dump_statistics (const struct variable *v, int show_varname) } calc_stats (v, stat_value); - t = tab_create (3, n_stats + n_explicit_percentiles + 2, 0); + t = tab_create (3, n_stats + n_percentiles + 2, 0); tab_dim (t, tab_natural_dimensions); tab_box (t, TAL_1, TAL_1, -1, -1 , 0 , 0 , 2, tab_nr(t) - 1) ; @@ -1525,7 +1404,7 @@ dump_statistics (const struct variable *v, int show_varname) { tab_text (t, 0, r, TAB_LEFT | TAT_TITLE, gettext (st_name[i].s10)); - tab_float (t, 2, r, TAB_NONE, stat_value[i], 11, 3); + tab_double (t, 2, r, TAB_NONE, stat_value[i], NULL); r++; } @@ -1533,20 +1412,22 @@ dump_statistics (const struct variable *v, int show_varname) tab_text (t, 1, 0, TAB_LEFT | TAT_TITLE, _("Valid")); tab_text (t, 1, 1, TAB_LEFT | TAT_TITLE, _("Missing")); - tab_float(t, 2, 0, TAB_NONE, ft->valid_cases, 11, 0); - tab_float(t, 2, 1, TAB_NONE, ft->total_cases - ft->valid_cases, 11, 0); - + tab_double (t, 2, 0, TAB_NONE, ft->valid_cases, wfmt); + tab_double (t, 2, 1, TAB_NONE, ft->total_cases - ft->valid_cases, wfmt); - for (i = 0; i < n_explicit_percentiles; i++, r++) + for (i = 0; i < n_percentiles; i++, r++) { if ( i == 0 ) { tab_text (t, 0, r, TAB_LEFT | TAT_TITLE, _("Percentiles")); } - tab_float (t, 1, r, TAB_LEFT, percentiles[i].p * 100, 3, 0 ); - tab_float (t, 2, r, TAB_NONE, percentiles[i].value, 11, 3); - + if (percentiles[i].p == 0.5) + tab_text (t, 1, r, TAB_LEFT, _("50 (Median)")); + else + tab_fixed (t, 1, r, TAB_LEFT, percentiles[i].p * 100, 3, 0); + tab_double (t, 2, r, TAB_NONE, percentiles[i].value, + var_get_print_format (v)); } tab_columns (t, SOM_COL_DOWN, 1); @@ -1623,7 +1504,8 @@ freq_tab_to_slice_array(const struct freq_tab *frq_tab, { const struct freq *frq = &frq_tab->valid[i]; - slices[i].label = var_get_value_name (var, frq->value); + ds_init_empty (&slices[i].label); + var_append_value_name (var, frq->value, &slices[i].label); slices[i].magnetude = frq->count; } @@ -1637,12 +1519,17 @@ static void do_piechart(const struct variable *var, const struct freq_tab *frq_tab) { struct slice *slices; - int n_slices; + int n_slices, i; slices = freq_tab_to_slice_array(frq_tab, var, &n_slices); piechart_plot(var_to_string(var), slices, n_slices); + for (i = 0 ; i < n_slices ; ++i ) + { + ds_destroy (&slices[i].label); + } + free(slices); }