From: Ben Pfaff Date: Sun, 21 Mar 2010 19:40:41 +0000 (-0700) Subject: sort: Add support for combining cases with identical sort criteria. X-Git-Url: https://pintos-os.org/cgi-bin/gitweb.cgi?p=pspp;a=commitdiff_plain;h=490ac70d9c9f754f733552d64c23dd6aedced342 sort: Add support for combining cases with identical sort criteria. This makes it possible to efficiently assemble frequency tables and crosstabs for data sets that might be larger than available memory. --- diff --git a/src/math/merge.c b/src/math/merge.c index 5b0704d9c2..20d26c0420 100644 --- a/src/math/merge.c +++ b/src/math/merge.c @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2007, 2009, 2011 Free Software Foundation, Inc. + Copyright (C) 2007, 2009-11, 14 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -45,17 +45,23 @@ struct merge struct merge_input inputs[MAX_MERGE_ORDER]; size_t input_cnt; struct caseproto *proto; + + merge_distinct_combine_func *combine; + void *aux; }; static void do_merge (struct merge *m); struct merge * -merge_create (const struct subcase *ordering, const struct caseproto *proto) +merge_create (const struct subcase *ordering, const struct caseproto *proto, + merge_distinct_combine_func *combine, void *aux) { struct merge *m = xmalloc (sizeof *m); subcase_clone (&m->ordering, ordering); m->input_cnt = 0; m->proto = caseproto_ref (proto); + m->combine = combine; + m->aux = aux; return m; } @@ -128,6 +134,7 @@ static void do_merge (struct merge *m) { struct casewriter *w; + struct ccase *prev_case; size_t i; assert (m->input_cnt > 1); @@ -140,8 +147,11 @@ do_merge (struct merge *m) for (i = 0; i < m->input_cnt; ) if (read_input_case (m, i)) i++; + + prev_case = NULL; while (m->input_cnt > 0) { + struct ccase *min_case; size_t min; min = 0; @@ -150,11 +160,28 @@ do_merge (struct merge *m) &m->ordering, m->inputs[min].c) < 0) min = i; - casewriter_write (w, m->inputs[min].c); + min_case = m->inputs[min].c; + if (m->combine != NULL) + { + if (prev_case == NULL) + prev_case = min_case; + else if (subcase_equal (&m->ordering, min_case, + &m->ordering, prev_case)) + prev_case = m->combine (prev_case, min_case, m->aux); + else + { + casewriter_write (w, prev_case); + prev_case = min_case; + } + } + else + casewriter_write (w, min_case); + read_input_case (m, min); } + if (prev_case != NULL) + casewriter_write (w, prev_case); m->input_cnt = 1; m->inputs[0].reader = casewriter_make_reader (w); } - diff --git a/src/math/merge.h b/src/math/merge.h index 5fdb6fc062..45c8493783 100644 --- a/src/math/merge.h +++ b/src/math/merge.h @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 2007, 2009 Free Software Foundation, Inc. + Copyright (C) 2007, 2009, 2010 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -21,7 +21,12 @@ struct caseproto; struct casereader; struct subcase; -struct merge *merge_create (const struct subcase *, const struct caseproto *); +typedef struct ccase *merge_distinct_combine_func (struct ccase *first, + struct ccase *second, + void *aux); + +struct merge *merge_create (const struct subcase *, const struct caseproto *, + merge_distinct_combine_func *, void *aux); void merge_destroy (struct merge *); void merge_append (struct merge *, struct casereader *); struct casereader *merge_make_reader (struct merge *); diff --git a/src/math/sort.c b/src/math/sort.c index bfc27fd7d8..61256ccdbc 100644 --- a/src/math/sort.c +++ b/src/math/sort.c @@ -47,6 +47,10 @@ struct sort_writer struct merge *merge; struct pqueue *pqueue; + sort_distinct_combine_func *combine; + sort_distinct_destroy_func *destroy; + void *aux; + struct casewriter *run; casenumber run_id; struct ccase *run_end; @@ -55,7 +59,8 @@ struct sort_writer static struct casewriter_class sort_casewriter_class; static struct pqueue *pqueue_create (const struct subcase *, - const struct caseproto *); + const struct caseproto *, + sort_distinct_combine_func *, void *aux); static void pqueue_destroy (struct pqueue *); static bool pqueue_is_full (const struct pqueue *); static bool pqueue_is_empty (const struct pqueue *); @@ -67,14 +72,29 @@ static void output_record (struct sort_writer *); struct casewriter * sort_create_writer (const struct subcase *ordering, const struct caseproto *proto) +{ + return sort_distinct_create_writer (ordering, proto, NULL, NULL, NULL); +} + +struct casewriter * +sort_distinct_create_writer (const struct subcase *ordering, + const struct caseproto *proto, + sort_distinct_combine_func *combine, + sort_distinct_destroy_func *destroy, + void *aux) { struct sort_writer *sort; sort = xmalloc (sizeof *sort); sort->proto = caseproto_ref (proto); subcase_clone (&sort->ordering, ordering); - sort->merge = merge_create (ordering, proto); - sort->pqueue = pqueue_create (ordering, proto); + sort->merge = merge_create (ordering, proto, combine, aux); + sort->pqueue = pqueue_create (ordering, proto, combine, aux); + + sort->combine = combine; + sort->destroy = destroy; + sort->aux = aux; + sort->run = NULL; sort->run_id = 0; sort->run_end = NULL; @@ -103,6 +123,9 @@ sort_casewriter_destroy (struct casewriter *writer UNUSED, void *sort_) { struct sort_writer *sort = sort_; + if (sort->destroy != NULL) + sort->destroy (sort->aux); + subcase_destroy (&sort->ordering); merge_destroy (sort->merge); pqueue_destroy (sort->pqueue); @@ -203,6 +226,9 @@ struct pqueue struct bt bt; size_t record_max; casenumber idx; + + sort_distinct_combine_func *combine; + void *aux; }; struct pqueue_record @@ -218,7 +244,8 @@ static int compare_pqueue_records (const struct bt_node *a, const void *ordering); static struct pqueue * -pqueue_create (const struct subcase *ordering, const struct caseproto *proto) +pqueue_create (const struct subcase *ordering, const struct caseproto *proto, + sort_distinct_combine_func *combine, void *aux) { struct pqueue *pq; @@ -232,6 +259,9 @@ pqueue_create (const struct subcase *ordering, const struct caseproto *proto) bt_init (&pq->bt, compare_pqueue_records, &pq->ordering); pq->idx = 0; + pq->combine = combine; + pq->aux = aux; + return pq; } @@ -275,6 +305,23 @@ pqueue_push (struct pqueue *pq, struct ccase *c, casenumber id) r->c = c; r->idx = pq->idx++; bt_insert (&pq->bt, &r->bt_node); + + if (pq->combine != NULL) + { + struct bt_node *q_ = bt_prev (&pq->bt, &r->bt_node); + if (q_ != NULL) + { + struct pqueue_record *q = bt_data (q_, struct pqueue_record, + bt_node); + if (q->id == r->id && subcase_equal (&pq->ordering, q->c, + &pq->ordering, r->c)) + { + bt_delete (&pq->bt, &r->bt_node); + q->c = pq->combine (q->c, r->c, pq->aux); + free (r); + } + } + } } static struct ccase * diff --git a/src/math/sort.h b/src/math/sort.h index 96ac32cc0b..06227d64ba 100644 --- a/src/math/sort.h +++ b/src/math/sort.h @@ -1,5 +1,5 @@ /* PSPP - a program for statistical analysis. - Copyright (C) 1997-9, 2000, 2006, 2009 Free Software Foundation, Inc. + Copyright (C) 1997-9, 2000, 2006, 2009, 2010 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -30,4 +30,14 @@ struct casereader *sort_execute (struct casereader *, const struct subcase *); struct casereader *sort_execute_1var (struct casereader *, const struct variable *); +typedef struct ccase *sort_distinct_combine_func (struct ccase *first, + struct ccase *second, + void *aux); +typedef void sort_distinct_destroy_func (void *aux); +struct casewriter *sort_distinct_create_writer (const struct subcase *, + const struct caseproto *, + sort_distinct_combine_func *, + sort_distinct_destroy_func *, + void *aux); + #endif /* math/sort.h */