From: Ben Pfaff Date: Tue, 23 Mar 2010 04:41:25 +0000 (-0700) Subject: FREQUENCIES: Refactor frequency counting with sorting instead of hashing. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?p=pspp;a=commitdiff_plain;h=cdb5830c023841725228f15725740eb608c5897b FREQUENCIES: Refactor frequency counting with sorting instead of hashing. --- diff --git a/src/language/stats/frequencies.q b/src/language/stats/frequencies.q index 723d26e834..c360713b56 100644 --- a/src/language/stats/frequencies.q +++ b/src/language/stats/frequencies.q @@ -23,10 +23,12 @@ #include "data/case.h" #include "data/casegrouper.h" #include "data/casereader.h" +#include "data/casewriter.h" #include "data/dictionary.h" #include "data/format.h" #include "data/procedure.h" #include "data/settings.h" +#include "data/subcase.h" #include "data/value-labels.h" #include "data/variable.h" #include "language/command.h" @@ -43,6 +45,7 @@ #include "libpspp/str.h" #include "math/histogram.h" #include "math/moments.h" +#include "math/sort.h" #include "output/chart-item.h" #include "output/charts/piechart.h" #include "output/charts/plot-hist.h" @@ -158,9 +161,10 @@ struct frq_chart /* Entire frequency table. */ struct freq_tab { - struct hmap data; /* Hash table for accumulating counts. */ + struct casewriter *sorter; + struct freq *valid; /* Valid freqs. */ - int n_valid; /* Number of total freqs. */ + int n_valid; /* Number of valid freqs. */ const struct dictionary *dict; /* Source of entries in the table. */ struct freq *missing; /* Missing freqs. */ @@ -435,18 +439,27 @@ calc (struct frq_proc *frq, const struct ccase *c, const struct dataset *ds) for (i = 0; i < frq->n_vars; i++) { struct var_freqs *vf = &frq->vars[i]; - const union value *value = case_data (c, vf->var); - size_t hash = value_hash (value, vf->width, 0); - struct freq *f; + struct casewriter *writer = vf->tab.sorter; + struct ccase *f; - f = freq_hmap_search (&vf->tab.data, value, vf->width, hash); - if (f == NULL) - f = freq_hmap_insert (&vf->tab.data, value, vf->width, hash); + f = case_create (casewriter_get_proto (writer)); + value_copy (case_data_rw_idx (f, 0), case_data (c, vf->var), vf->width); + case_data_rw_idx (f, 1)->f = weight; - f->count += weight; + casewriter_write (writer, f); } } +static struct ccase * +combine_freq_cases (struct ccase *a, struct ccase *b, void *aux UNUSED) +{ + a = case_unshare (a); + case_data_rw_idx (a, 1)->f += case_data_idx (b, 1)->f; + case_unref (b); + + return a; +} + /* Prepares each variable that is the target of FREQUENCIES by setting up its hash table. */ static void @@ -463,7 +476,22 @@ precalc (struct frq_proc *frq, struct casereader *input, struct dataset *ds) } for (i = 0; i < frq->n_vars; i++) - hmap_init (&frq->vars[i].tab.data); + { + int width = var_get_width (frq->vars[i].var); + struct caseproto *proto; + struct subcase ordering; + + proto = caseproto_create (); + proto = caseproto_add_width (proto, width); + proto = caseproto_add_width (proto, 0); + + subcase_init (&ordering, 0, width, SC_ASCEND); + frq->vars[i].tab.sorter = sort_distinct_create_writer ( + &ordering, proto, combine_freq_cases, NULL, NULL); + + caseproto_unref (proto); + subcase_destroy (&ordering); + } } /* Finishes up with the variables after frequencies have been @@ -560,13 +588,23 @@ postprocess_freq_tab (const struct frq_proc *frq, struct var_freqs *vf) { struct freq_tab *ft = &vf->tab; struct freq_compare_aux aux; + struct casereader *reader; + struct ccase *c; size_t count; struct freq *freqs, *f; size_t i; /* Extract data from hash table. */ - count = hmap_count (&ft->data); - freqs = freq_hmap_extract (&ft->data); + reader = casewriter_make_reader (ft->sorter); + freqs = xnmalloc (casereader_count_cases (reader), sizeof *freqs); + for (count = 0; (c = casereader_read (reader)) != NULL; count++) + { + struct freq *f = &freqs[count]; + value_clone (&f->value, case_data_idx (c, 0), vf->width); + f->count = case_num_idx (c, 1); + case_unref (c); + } + casereader_destroy (reader); /* Put data into ft. */ ft->valid = freqs; @@ -604,8 +642,14 @@ postprocess_freq_tab (const struct frq_proc *frq, struct var_freqs *vf) static void cleanup_freq_tab (struct var_freqs *vf) { + if (value_needs_init (vf->width)) + { + int i; + + for (i = 0; i < vf->tab.n_valid + vf->tab.n_missing; i++) + value_destroy (&vf->tab.valid[i].value, vf->width); + } free (vf->tab.valid); - freq_hmap_destroy (&vf->tab.data, vf->width); } /* Parses the VARIABLES subcommand. */