X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=src%2Flanguage%2Fstats%2Ffrequencies.q;h=0c0dc63b3d32148f86adfc663c69779accc4414a;hb=e7d0a9f16192ceeff9243f0ede8e399ee1ef0d44;hp=39882e76c527c8d46e51bb708bcb3f7c5bfb6af2;hpb=a4ae68f966bc574326d429119878e733069ced14;p=pspp-builds.git diff --git a/src/language/stats/frequencies.q b/src/language/stats/frequencies.q index 39882e76..0c0dc63b 100644 --- a/src/language/stats/frequencies.q +++ b/src/language/stats/frequencies.q @@ -58,6 +58,8 @@ #include #include +#include "minmax.h" + #include "gettext.h" #define _(msgid) gettext (msgid) #define N_(msgid) msgid @@ -66,13 +68,13 @@ /* (specification) FREQUENCIES (frq_): - *variables=custom; - format=cond:condense/onepage(*n:onepage_limit,"%s>=0")/!standard, - table:limit(n:limit,"%s>0")/notable/!table, - labels:!labels/nolabels, - sort:!avalue/dvalue/afreq/dfreq, - spaces:!single/double, - paging:newpage/!oldpage; + *+variables=custom; + +format=cond:condense/onepage(*n:onepage_limit,"%s>=0")/!standard, + table:limit(n:limit,"%s>0")/notable/!table, + labels:!labels/nolabels, + sort:!avalue/dvalue/afreq/dfreq, + spaces:!single/double, + paging:newpage/!oldpage; missing=miss:include/!exclude; barchart(ba_)=:minimum(d:min), :maximum(d:max), @@ -90,12 +92,12 @@ scale:freq(*n:freq,"%s>0")/percent(*n:pcnt,"%s>0"), norm:!nonormal/normal, incr:increment(d:inc,"%s>0"); - grouped=custom; - ntiles=integer; + +grouped=custom; + +ntiles=integer; +percentiles = double list; - statistics[st_]=1|mean,2|semean,3|median,4|mode,5|stddev,6|variance, - 7|kurtosis,8|skewness,9|range,10|minimum,11|maximum,12|sum, - 13|default,14|seskewness,15|sekurtosis,all,none. + +statistics[st_]=1|mean,2|semean,3|median,4|mode,5|stddev,6|variance, + 7|kurtosis,8|skewness,9|range,10|minimum,11|maximum,12|sum, + 13|default,14|seskewness,15|sekurtosis,all,none. */ /* (declarations) */ /* (functions) */ @@ -116,7 +118,7 @@ struct frq_info }; /* Table of statistics, indexed by dsc_*. */ -static struct frq_info st_name[frq_n_stats + 1] = +static const struct frq_info st_name[frq_n_stats + 1] = { {FRQ_ST_MEAN, N_("Mean")}, {FRQ_ST_SEMEAN, N_("S.E. Mean")}, @@ -204,7 +206,7 @@ static struct pool *gen_pool; /* General mode. */ /* Frequency table entry. */ struct freq { - union value v; /* The value. */ + union value *v; /* The value. */ double c; /* The number of occurrences of the value. */ }; @@ -254,10 +256,18 @@ struct var_freqs /* Statistics. */ double stat[frq_n_stats]; + + /* Width and format for analysis and display. + This is normally the same as "width" and "print" in struct + variable, but in SPSS-compatible mode only the first + MAX_SHORT_STRING bytes of long string variables are + included. */ + int width; + struct fmt_spec print; }; static inline struct var_freqs * -get_var_freqs (struct variable *v) +get_var_freqs (const struct variable *v) { assert (v != NULL); assert (v->aux != NULL); @@ -268,9 +278,9 @@ static void determine_charts (void); static void calc_stats (struct variable *v, double d[frq_n_stats]); -static void precalc (const struct ccase *, void *); -static bool calc (const struct ccase *, void *); -static void postcalc (void *); +static void precalc (const struct ccase *, void *, const struct dataset *); +static bool calc (const struct ccase *, void *, const struct dataset *); +static bool postcalc (void *, const struct dataset *); static void postprocess_freq_tab (struct variable *); static void dump_full (struct variable *); @@ -295,15 +305,15 @@ freq_tab_to_hist(const struct freq_tab *ft, const struct variable *var); /* Parser and outline. */ -static int internal_cmd_frequencies (void); +static int internal_cmd_frequencies (struct dataset *ds); int -cmd_frequencies (void) +cmd_frequencies (struct dataset *ds) { int result; int_pool = pool_create (); - result = internal_cmd_frequencies (); + result = internal_cmd_frequencies (ds); pool_destroy (int_pool); int_pool=0; pool_destroy (gen_pool); @@ -314,7 +324,7 @@ cmd_frequencies (void) } static int -internal_cmd_frequencies (void) +internal_cmd_frequencies (struct dataset *ds) { int i; bool ok; @@ -325,7 +335,7 @@ internal_cmd_frequencies (void) n_variables = 0; v_variables = NULL; - if (!parse_frequencies (&cmd)) + if (!parse_frequencies (ds, &cmd, NULL)) return CMD_FAILURE; if (cmd.onepage_limit == NOT_LONG) @@ -381,7 +391,7 @@ internal_cmd_frequencies (void) /* Do it! */ - ok = procedure_with_splits (precalc, calc, postcalc, NULL); + ok = procedure_with_splits (ds, precalc, calc, postcalc, NULL); free_frequencies(&cmd); @@ -495,35 +505,41 @@ determine_charts (void) /* Add data from case C to the frequency table. */ static bool -calc (const struct ccase *c, void *aux UNUSED) +calc (const struct ccase *c, void *aux UNUSED, const struct dataset *ds) { double weight; size_t i; - int bad_warn = 1; + bool bad_warn = true; - weight = dict_get_case_weight (default_dict, c, &bad_warn); + weight = dict_get_case_weight (dataset_dict (ds), c, &bad_warn); for (i = 0; i < n_variables; i++) { - struct variable *v = v_variables[i]; + const struct variable *v = v_variables[i]; const union value *val = case_data (c, v->fv); - struct freq_tab *ft = &get_var_freqs (v)->tab; + struct var_freqs *vf = get_var_freqs (v); + struct freq_tab *ft = &vf->tab; switch (ft->mode) { case FRQM_GENERAL: { - /* General mode. */ - struct freq **fpp = (struct freq **) hsh_probe (ft->data, val); + struct freq target; + struct freq **fpp; + + target.v = (union value *) val; + fpp = (struct freq **) hsh_probe (ft->data, &target); if (*fpp != NULL) (*fpp)->c += weight; else { - struct freq *fp = *fpp = pool_alloc (gen_pool, sizeof *fp); - fp->v = *val; - fp->c = weight; + struct freq *fp = pool_alloc (gen_pool, sizeof *fp); + fp->c = weight; + fp->v = pool_clone (gen_pool, + val, MAX (MAX_SHORT_STRING, vf->width)); + *fpp = fp; } } break; @@ -541,7 +557,7 @@ calc (const struct ccase *c, void *aux UNUSED) ft->out_of_range += weight; break; default: - assert (0); + NOT_REACHED (); } } return true; @@ -550,11 +566,11 @@ calc (const struct ccase *c, void *aux UNUSED) /* Prepares each variable that is the target of FREQUENCIES by setting up its hash table. */ static void -precalc (const struct ccase *first, void *aux UNUSED) +precalc (const struct ccase *first, void *aux UNUSED, const struct dataset *ds) { size_t i; - output_split_file_values (first); + output_split_file_values (ds, first); pool_destroy (gen_pool); gen_pool = pool_create (); @@ -595,8 +611,8 @@ precalc (const struct ccase *first, void *aux UNUSED) /* Finishes up with the variables after frequencies have been calculated. Displays statistics, percentiles, ... */ -static void -postcalc (void *aux UNUSED) +static bool +postcalc (void *aux UNUSED, const struct dataset *ds UNUSED) { size_t i; @@ -629,7 +645,7 @@ postcalc (void *aux UNUSED) dump_full (v); break; default: - assert (0); + NOT_REACHED (); } else dumped_freq_tab = 0; @@ -671,6 +687,8 @@ postcalc (void *aux UNUSED) cleanup_freq_tab (v); } + + return true; } /* Returns the comparison function that should be used for @@ -690,21 +708,21 @@ get_freq_comparator (int frq_sort, int var_type) case FRQ_AFREQ | (ALPHA << 16): return compare_freq_alpha_a; case FRQ_DFREQ | (NUMERIC << 16): return compare_freq_numeric_d; case FRQ_DFREQ | (ALPHA << 16): return compare_freq_alpha_d; - default: assert (0); + default: NOT_REACHED (); } return 0; } -/* Returns nonzero iff the value in struct freq F is non-missing +/* Returns true iff the value in struct freq F is non-missing for variable V. */ -static int -not_missing (const void *f_, void *v_) +static bool +not_missing (const void *f_, const void *v_) { const struct freq *f = f_; - struct variable *v = v_; + const struct variable *v = v_; - return !mv_is_value_missing (&v->miss, &f->v); + return !mv_is_value_missing (&v->miss, f->v); } /* Summarizes the frequency table data for variable V. */ @@ -775,7 +793,7 @@ cleanup_freq_tab (struct variable *v) /* Parses the VARIABLES subcommand, adding to {n_variables,v_variables}. */ static int -frq_custom_variables (struct cmd_frequencies *cmd UNUSED) +frq_custom_variables (struct dataset *ds, struct cmd_frequencies *cmd UNUSED, void *aux UNUSED) { int mode; int min = 0, max = 0; @@ -785,10 +803,10 @@ frq_custom_variables (struct cmd_frequencies *cmd UNUSED) lex_match ('='); if (token != T_ALL && (token != T_ID - || dict_lookup_var (default_dict, tokid) == NULL)) + || dict_lookup_var (dataset_dict (ds), tokid) == NULL)) return 2; - if (!parse_variables (default_dict, &v_variables, &n_variables, + if (!parse_variables (dataset_dict (ds), &v_variables, &n_variables, PV_APPEND | PV_NO_SCRATCH)) return 0; @@ -845,10 +863,17 @@ frq_custom_variables (struct cmd_frequencies *cmd UNUSED) vf->tab.vector = pool_nalloc (int_pool, max - min + 1, sizeof *vf->tab.vector); } - else - vf->tab.vector = NULL; + else + vf->tab.vector = NULL; vf->n_groups = 0; vf->groups = NULL; + vf->width = v->width; + vf->print = v->print; + if (vf->width > MAX_SHORT_STRING && get_algorithm () == COMPATIBLE) + { + vf->width = MAX_SHORT_STRING; + vf->print.w = MAX_SHORT_STRING * (v->print.type == FMT_AHEX ? 2 : 1); + } } return 1; } @@ -856,10 +881,10 @@ frq_custom_variables (struct cmd_frequencies *cmd UNUSED) /* Parses the GROUPED subcommand, setting the n_grouped, grouped fields of specified variables. */ static int -frq_custom_grouped (struct cmd_frequencies *cmd UNUSED) +frq_custom_grouped (struct dataset *ds, struct cmd_frequencies *cmd UNUSED, void *aux UNUSED) { lex_match ('='); - if ((token == T_ID && dict_lookup_var (default_dict, tokid) != NULL) + if ((token == T_ID && dict_lookup_var (dataset_dict (ds), tokid) != NULL) || token == T_ID) for (;;) { @@ -873,7 +898,7 @@ frq_custom_grouped (struct cmd_frequencies *cmd UNUSED) size_t n; struct variable **v; - if (!parse_variables (default_dict, &v, &n, + if (!parse_variables (dataset_dict (ds), &v, &n, PV_NO_DUPLICATE | PV_NUMERIC)) return 0; if (lex_match ('(')) @@ -926,7 +951,7 @@ frq_custom_grouped (struct cmd_frequencies *cmd UNUSED) free (v); if (!lex_match ('/')) break; - if ((token != T_ID || dict_lookup_var (default_dict, tokid) != NULL) + if ((token != T_ID || dict_lookup_var (dataset_dict (ds), tokid) != NULL) && token != T_ALL) { lex_put_back ('/'); @@ -972,32 +997,33 @@ add_percentile (double x) /* Hash of numeric values. */ static unsigned -hash_value_numeric (const void *value_, void *foo UNUSED) +hash_value_numeric (const void *value_, const void *aux UNUSED) { const struct freq *value = value_; - return hsh_hash_double (value->v.f); + return hsh_hash_double (value->v[0].f); } /* Hash of string values. */ static unsigned -hash_value_alpha (const void *value_, void *v_) +hash_value_alpha (const void *value_, const void *v_) { const struct freq *value = value_; - struct variable *v = v_; + const struct variable *v = v_; + struct var_freqs *vf = get_var_freqs (v); - return hsh_hash_bytes (value->v.s, v->width); + return hsh_hash_bytes (value->v[0].s, vf->width); } /* Ascending numeric compare of values. */ static int -compare_value_numeric_a (const void *a_, const void *b_, void *foo UNUSED) +compare_value_numeric_a (const void *a_, const void *b_, const void *aux UNUSED) { const struct freq *a = a_; const struct freq *b = b_; - if (a->v.f > b->v.f) + if (a->v[0].f > b->v[0].f) return 1; - else if (a->v.f < b->v.f) + else if (a->v[0].f < b->v[0].f) return -1; else return 0; @@ -1005,25 +1031,26 @@ compare_value_numeric_a (const void *a_, const void *b_, void *foo UNUSED) /* Ascending string compare of values. */ static int -compare_value_alpha_a (const void *a_, const void *b_, void *v_) +compare_value_alpha_a (const void *a_, const void *b_, const void *v_) { const struct freq *a = a_; const struct freq *b = b_; const struct variable *v = v_; + struct var_freqs *vf = get_var_freqs (v); - return memcmp (a->v.s, b->v.s, v->width); + return memcmp (a->v[0].s, b->v[0].s, vf->width); } /* Descending numeric compare of values. */ static int -compare_value_numeric_d (const void *a, const void *b, void *foo UNUSED) +compare_value_numeric_d (const void *a, const void *b, const void *aux UNUSED) { - return -compare_value_numeric_a (a, b, foo); + return -compare_value_numeric_a (a, b, aux); } /* Descending string compare of values. */ static int -compare_value_alpha_d (const void *a, const void *b, void *v) +compare_value_alpha_d (const void *a, const void *b, const void *v) { return -compare_value_alpha_a (a, b, v); } @@ -1031,7 +1058,7 @@ compare_value_alpha_d (const void *a, const void *b, void *v) /* Ascending numeric compare of frequency; secondary key on ascending numeric value. */ static int -compare_freq_numeric_a (const void *a_, const void *b_, void *foo UNUSED) +compare_freq_numeric_a (const void *a_, const void *b_, const void *aux UNUSED) { const struct freq *a = a_; const struct freq *b = b_; @@ -1041,9 +1068,9 @@ compare_freq_numeric_a (const void *a_, const void *b_, void *foo UNUSED) else if (a->c < b->c) return -1; - if (a->v.f > b->v.f) + if (a->v[0].f > b->v[0].f) return 1; - else if (a->v.f < b->v.f) + else if (a->v[0].f < b->v[0].f) return -1; else return 0; @@ -1052,24 +1079,25 @@ compare_freq_numeric_a (const void *a_, const void *b_, void *foo UNUSED) /* Ascending numeric compare of frequency; secondary key on ascending string value. */ static int -compare_freq_alpha_a (const void *a_, const void *b_, void *v_) +compare_freq_alpha_a (const void *a_, const void *b_, const void *v_) { const struct freq *a = a_; const struct freq *b = b_; const struct variable *v = v_; + struct var_freqs *vf = get_var_freqs (v); if (a->c > b->c) return 1; else if (a->c < b->c) return -1; else - return memcmp (a->v.s, b->v.s, v->width); + return memcmp (a->v[0].s, b->v[0].s, vf->width); } /* Descending numeric compare of frequency; secondary key on ascending numeric value. */ static int -compare_freq_numeric_d (const void *a_, const void *b_, void *foo UNUSED) +compare_freq_numeric_d (const void *a_, const void *b_, const void *aux UNUSED) { const struct freq *a = a_; const struct freq *b = b_; @@ -1079,9 +1107,9 @@ compare_freq_numeric_d (const void *a_, const void *b_, void *foo UNUSED) else if (a->c < b->c) return 1; - if (a->v.f > b->v.f) + if (a->v[0].f > b->v[0].f) return 1; - else if (a->v.f < b->v.f) + else if (a->v[0].f < b->v[0].f) return -1; else return 0; @@ -1090,18 +1118,19 @@ compare_freq_numeric_d (const void *a_, const void *b_, void *foo UNUSED) /* Descending numeric compare of frequency; secondary key on ascending string value. */ static int -compare_freq_alpha_d (const void *a_, const void *b_, void *v_) +compare_freq_alpha_d (const void *a_, const void *b_, const void *v_) { const struct freq *a = a_; const struct freq *b = b_; const struct variable *v = v_; + struct var_freqs *vf = get_var_freqs (v); if (a->c > b->c) return -1; else if (a->c < b->c) return 1; else - return memcmp (a->v.s, b->v.s, v->width); + return memcmp (a->v[0].s, b->v[0].s, vf->width); } /* Frequency table display. */ @@ -1127,6 +1156,7 @@ static void dump_full (struct variable *v) { int n_categories; + struct var_freqs *vf; struct freq_tab *ft; struct freq *f; struct tab_table *t; @@ -1140,9 +1170,9 @@ dump_full (struct variable *v) const char *s; }; - struct init *p; + const struct init *p; - static struct init vec[] = + static const struct init vec[] = { {4, 0, N_("Valid")}, {5, 0, N_("Cum")}, @@ -1160,7 +1190,8 @@ dump_full (struct variable *v) int lab = cmd.labels == FRQ_LABELS; - ft = &get_var_freqs (v)->tab; + vf = get_var_freqs (v); + ft = &vf->tab; n_categories = ft->n_valid + ft->n_missing; t = tab_create (5 + lab, n_categories + 3, 0); tab_headers (t, 0, 0, 2, 0); @@ -1185,12 +1216,12 @@ dump_full (struct variable *v) if (lab) { - const char *label = val_labs_find (v->val_labs, f->v); + const char *label = val_labs_find (v->val_labs, f->v[0]); if (label != NULL) tab_text (t, 0, r, TAB_LEFT, label); } - tab_value (t, 0 + lab, r, TAB_NONE, &f->v, &v->print); + tab_value (t, 0 + lab, r, TAB_NONE, f->v, &vf->print); tab_float (t, 1 + lab, r, TAB_NONE, f->c, 8, 0); tab_float (t, 2 + lab, r, TAB_NONE, percent, 5, 1); tab_float (t, 3 + lab, r, TAB_NONE, valid_percent, 5, 1); @@ -1203,12 +1234,12 @@ dump_full (struct variable *v) if (lab) { - const char *label = val_labs_find (v->val_labs, f->v); + const char *label = val_labs_find (v->val_labs, f->v[0]); if (label != NULL) tab_text (t, 0, r, TAB_LEFT, label); } - tab_value (t, 0 + lab, r, TAB_NONE, &f->v, &v->print); + tab_value (t, 0 + lab, r, TAB_NONE, f->v, &vf->print); tab_float (t, 1 + lab, r, TAB_NONE, f->c, 8, 0); tab_float (t, 2 + lab, r, TAB_NONE, f->c / ft->total_cases * 100.0, 5, 1); @@ -1256,13 +1287,15 @@ static void dump_condensed (struct variable *v) { int n_categories; + struct var_freqs *vf; struct freq_tab *ft; struct freq *f; struct tab_table *t; int r; double cum_total = 0.0; - ft = &get_var_freqs (v)->tab; + vf = get_var_freqs (v); + ft = &vf->tab; n_categories = ft->n_valid + ft->n_missing; t = tab_create (4, n_categories + 2, 0); @@ -1282,7 +1315,7 @@ dump_condensed (struct variable *v) percent = f->c / ft->total_cases * 100.0; cum_total += f->c / ft->valid_cases * 100.0; - tab_value (t, 0, r, TAB_NONE, &f->v, &v->print); + tab_value (t, 0, r, TAB_NONE, f->v, &vf->print); tab_float (t, 1, r, TAB_NONE, f->c, 8, 0); tab_float (t, 2, r, TAB_NONE, percent, 3, 0); tab_float (t, 3, r, TAB_NONE, cum_total, 3, 0); @@ -1290,7 +1323,7 @@ dump_condensed (struct variable *v) } for (; f < &ft->valid[n_categories]; f++) { - tab_value (t, 0, r, TAB_NONE, &f->v, &v->print); + tab_value (t, 0, r, TAB_NONE, f->v, &vf->print); tab_float (t, 1, r, TAB_NONE, f->c, 8, 0); tab_float (t, 2, r, TAB_NONE, f->c / ft->total_cases * 100.0, 3, 0); @@ -1371,7 +1404,7 @@ calc_stats (struct variable *v, double d[frq_n_stats]) if ( percentiles[i].flag ) { - percentiles[i].x2 = f->v.f; + percentiles[i].x2 = f->v[0].f; percentiles[i].x1 = prev_value; percentiles[i].flag2 = 1; continue; @@ -1381,7 +1414,7 @@ calc_stats (struct variable *v, double d[frq_n_stats]) { if ( f->c > 1 && rank - (f->c - 1) > tp ) { - percentiles[i].x2 = percentiles[i].x1 = f->v.f; + percentiles[i].x2 = percentiles[i].x1 = f->v[0].f; percentiles[i].flag2 = 1; } else @@ -1392,14 +1425,14 @@ calc_stats (struct variable *v, double d[frq_n_stats]) continue; } } - prev_value = f->v.f; + prev_value = f->v[0].f; } for (i = 0; i < n_percentiles; i++) { /* Catches the case when p == 100% */ if ( ! percentiles[i].flag2 ) - percentiles[i].x1 = percentiles[i].x2 = f->v.f; + percentiles[i].x1 = percentiles[i].x2 = f->v[0].f; /* printf("percentile %d (p==%.2f); X1 = %g; X2 = %g\n", @@ -1438,7 +1471,7 @@ calc_stats (struct variable *v, double d[frq_n_stats]) if (most_often < f->c) { most_often = f->c; - X_mode = f->v.f; + X_mode = f->v[0].f; } else if (most_often == f->c) { @@ -1451,16 +1484,16 @@ calc_stats (struct variable *v, double d[frq_n_stats]) /* Calculate moments. */ m = moments_create (MOMENT_KURTOSIS); for (f = ft->valid; f < ft->missing; f++) - moments_pass_one (m, f->v.f, f->c); + moments_pass_one (m, f->v[0].f, f->c); for (f = ft->valid; f < ft->missing; f++) - moments_pass_two (m, f->v.f, f->c); + moments_pass_two (m, f->v[0].f, f->c); moments_calculate (m, NULL, &d[frq_mean], &d[frq_variance], &d[frq_skew], &d[frq_kurt]); moments_destroy (m); /* Formulas below are taken from _SPSS Statistical Algorithms_. */ - d[frq_min] = ft->valid[0].v.f; - d[frq_max] = ft->valid[ft->n_valid - 1].v.f; + d[frq_min] = ft->valid[0].v[0].f; + d[frq_max] = ft->valid[ft->n_valid - 1].v[0].f; d[frq_mode] = X_mode; d[frq_range] = d[frq_max] - d[frq_min]; d[frq_median] = *median_value; @@ -1570,11 +1603,11 @@ freq_tab_to_hist(const struct freq_tab *ft, const struct variable *var) /* Find out the extremes of the x value */ for ( frq = hsh_first(fh, &hi); frq != 0; frq = hsh_next(fh, &hi) ) { - if ( mv_is_value_missing(&var->miss, &frq->v)) + if ( mv_is_value_missing(&var->miss, frq->v)) continue; - if ( frq->v.f < x_min ) x_min = frq->v.f ; - if ( frq->v.f > x_max ) x_max = frq->v.f ; + if ( frq->v[0].f < x_min ) x_min = frq->v[0].f ; + if ( frq->v[0].f > x_max ) x_max = frq->v[0].f ; } hist = histogram_create(bins, x_min, x_max); @@ -1582,7 +1615,7 @@ freq_tab_to_hist(const struct freq_tab *ft, const struct variable *var) for( i = 0 ; i < ft->n_valid ; ++i ) { frq = &ft->valid[i]; - gsl_histogram_accumulate(hist, frq->v.f, frq->c); + gsl_histogram_accumulate(hist, frq->v[0].f, frq->c); } return hist; @@ -1615,7 +1648,7 @@ freq_tab_to_slice_array(const struct freq_tab *frq_tab, { const struct freq *frq = &frq_tab->valid[i]; - slices[i].label = value_to_string(&frq->v, var); + slices[i].label = value_to_string(frq->v, var); slices[i].magnetude = frq->c; }