From 6ccbd384363db2e304ffe8cc51fcd2eac0a5349a Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Tue, 15 Mar 2005 06:04:10 +0000 Subject: [PATCH] Make sort stable (PR 12313). --- doc/transformation.texi | 10 +- src/ChangeLog | 64 ++++++- src/algorithm.c | 28 +++ src/algorithm.h | 12 ++ src/cmdline.c | 4 +- src/dictionary.c | 3 +- src/flip.c | 6 +- src/misc.h | 63 ++----- src/sort.c | 386 +++++++++++++++++----------------------- tests/ChangeLog | 8 + tests/Makefile.am | 3 +- tests/command/sort.sh | 101 +++++++---- 12 files changed, 364 insertions(+), 324 deletions(-) diff --git a/doc/transformation.texi b/doc/transformation.texi index 0050f998..9225ca80 100644 --- a/doc/transformation.texi +++ b/doc/transformation.texi @@ -527,11 +527,17 @@ are sorted in ascending order. To override sort order, specify (D) or for ascending order. These apply to the entire list of variables preceding them. +The sort algorithms used by @cmd{SORT CASES} are stable. That is, +records that have equal values of the sort variables will have the +same relative order before and after sorting. As a special case, +re-sorting an already sorted file will not affect the ordering of +cases. + @cmd{SORT CASES} is a procedure. It causes the data to be read. @cmd{SORT CASES} attempts to sort the entire active file in main memory. -If main memory is exhausted, it falls back to a merge sort algorithm that -involves writing and reading numerous temporary files. +If workspace is exhausted, it falls back to a merge sort algorithm that +involves creates numerous temporary files. @cmd{SORT CASES} may not be specified following TEMPORARY. @setfilename ignored diff --git a/src/ChangeLog b/src/ChangeLog index 727e6b09..cb998be2 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,57 @@ +Mon Mar 14 21:52:34 2005 Ben Pfaff + + * misc.h: Remove GCC specializations. + +Mon Mar 14 21:07:23 2005 Ben Pfaff + + Make sort stable (PR 12313). + + * sort.c: Don't need to include some headers anymore. + (static var min_buffers) New variable. + (static var max_buffers) New variable. + (static var allow_internal_sort) New variable. + (cmd_sort_cases) Add test mode. + (sort_execute) Rephrase. + (do_internal_sort) Don't try internal sorting if + allow_internal_sort is set. + (struct external_sort) Renamed `initial_runs' to `runs' and + updated all references. + (macro MIN_BUFFER_TOTAL_SIZE_RECS) Removed. + (macro MIN_BUFFER_SIZE_BYTES) Removed. + (macro MIN_BUFFER_SIZE_RECS) Removed. + (compare_initial_runs) Removed. + (struct record_run) Add member `idx'. + (write_initial_runs) Pass increasing values to process_case() as + index. + (process_case) Add `idx' parameter and use it to initialize new + `idx' member. + (allocate_cases) Limit allocated buffers to max_buffers. + (compare_record_run) Use new `idx' member for last resort + comparison, for stability. + (end_run) Call casefile_sleep() on irs->casefile, to prevent + running out of file descriptors. + (struct merge_state) Removed. + (merge) Don't need to allocate cases. Always use MAX_MERGE_ORDER + unless we have fewer runs left. Always merge consecutive runs, + for stability. Return the final run. + (mod) Removed. + (choose_merge) New function. + (merge_once) Rewritten. + +Mon Mar 14 21:05:42 2005 Ben Pfaff + + * cmdline.c: (static var testing_mode) Move into + parse_command_line(). + +Mon Mar 14 21:05:13 2005 Ben Pfaff + + * algorithm.c: (remove_range) New function. + (remove_element) New function. + + * dictionary.c: (dict_delete_var) Use remove_element(). + + * flip.c: (cmd_flip) Ditto. + Sun Mar 13 22:52:05 2005 Ben Pfaff * file-handle.q: (struct file_handle) `open_mode' should not be @@ -5,7 +59,7 @@ Sun Mar 13 22:52:05 2005 Ben Pfaff Sun Mar 13 22:40:54 2005 Ben Pfaff - First phase of making SORT CASES stable (PR 12035). + First phase of making SORT CASES stable (PR 12313). * sort.c: (struct indexed_case) New structure. (do_internal_sort) Rewrite to make internal sorting stable. @@ -161,7 +215,7 @@ Sun Mar 6 23:25:40 2005 Ben Pfaff Sun Mar 6 19:52:22 2005 Ben Pfaff DATA LIST with free-field formats should not have implied decimal - places (bug #12035). Also clean up data-in.c a bit. + places (PR 12035). Also clean up data-in.c a bit. * data-in.h: (enum) Add DI_IMPLIED_DECIMALS. @@ -366,7 +420,7 @@ Fri Feb 25 21:11:35 WST 2005 John Darrington Sun Feb 13 16:11:13 2005 Ben Pfaff - Fix Bug #11955. + Fix PR 11955. * aggregate.c: (parse_aggregate_functions) Code cleanup. Important part: get rid of spurious copying of function->format to @@ -374,7 +428,7 @@ Sun Feb 13 16:11:13 2005 Ben Pfaff Fri Feb 11 00:08:36 2005 Ben Pfaff - Fix Bug #11916, which was confusing a variable's `index' member + Fix PR 11916, which was confusing a variable's `index' member with the variable's position in a var_set. Although these are usually the same, they are not for array `var_set's. @@ -567,7 +621,7 @@ Wed Jan 5 08:30:48 WST 2005 John Darrington Mon Jan 3 17:44:37 2005 Ben Pfaff * pfm-read.c: (read_variables) Remove direct manipulation of - v->aux, which is no longer needed. Fixes Bug #11483. + v->aux, which is no longer needed. Fixes PR 11483. Sat Jan 1 19:01:16 WST 2005 John Darrington diff --git a/src/algorithm.c b/src/algorithm.c index e6b9132b..3c1aed90 100644 --- a/src/algorithm.c +++ b/src/algorithm.c @@ -365,6 +365,34 @@ copy_if (const void *array, size_t count, size_t size, return nonzero_cnt; } +/* Removes N elements starting at IDX from ARRAY, which consists + of COUNT elements of SIZE bytes each, by shifting the elements + following them, if any, into its position. */ +void +remove_range (void *array_, size_t count, size_t size, + size_t idx, size_t n) +{ + char *array = array_; + + assert (array != NULL); + assert (idx <= count); + assert (idx + n <= count); + + if (idx + n < count) + memmove (array + idx * size, array + (idx + n) * size, + size * (count - idx - n)); +} + +/* Removes element IDX from ARRAY, which consists of COUNT + elements of SIZE bytes each, by shifting the elements + following it, if any, into its position. */ +void +remove_element (void *array, size_t count, size_t size, + size_t idx) +{ + remove_range (array, count, size, idx, 1); +} + /* A predicate and its auxiliary data. */ struct pred_aux { diff --git a/src/algorithm.h b/src/algorithm.h index 707acf28..5482de15 100644 --- a/src/algorithm.h +++ b/src/algorithm.h @@ -95,6 +95,18 @@ size_t copy_if (const void *array, size_t count, size_t size, void *result, algo_predicate_func *predicate, void *aux); +/* Removes N elements starting at IDX from ARRAY, which consists + of COUNT elements of SIZE bytes each, by shifting the elements + following them, if any, into its position. */ +void remove_range (void *array, size_t count, size_t size, + size_t idx, size_t n); + +/* Removes element IDX from ARRAY, which consists of COUNT + elements of SIZE bytes each, by shifting the elements + following it, if any, into its position. */ +void remove_element (void *array, size_t count, size_t size, + size_t idx); + /* Removes elements equal to ELEMENT from ARRAY, which consists of COUNT elements of SIZE bytes each. Returns the number of remaining elements. AUX is passed to COMPARE as auxiliary diff --git a/src/cmdline.c b/src/cmdline.c index 81ec7d80..c280728a 100644 --- a/src/cmdline.c +++ b/src/cmdline.c @@ -43,13 +43,12 @@ static void usage (void); char *subst_vars (char *); -static int testing_mode=0; - /* Parses the command line specified by ARGC and ARGV as received by main(). */ void parse_command_line (int argc, char **argv) { + static int testing_mode = 0; static struct option long_options[] = { {"algorithm", required_argument, NULL, 'a'}, @@ -79,7 +78,6 @@ parse_command_line (int argc, char **argv) int c, i; int cleared_device_defaults = 0; - int no_statrc = 0; for (;;) diff --git a/src/dictionary.c b/src/dictionary.c index 177987b5..99ea5b8b 100644 --- a/src/dictionary.c +++ b/src/dictionary.c @@ -435,9 +435,8 @@ dict_delete_var (struct dictionary *d, struct variable *v) dict_clear_vectors (d); /* Remove V from var array. */ + remove_element (d->var, d->var_cnt, sizeof *d->var, v->index); d->var_cnt--; - memmove (d->var + v->index, d->var + v->index + 1, - (d->var_cnt - v->index) * sizeof *d->var); /* Update index. */ for (i = v->index; i < d->var_cnt; i++) diff --git a/src/flip.c b/src/flip.c index f14338da..9ff81d79 100644 --- a/src/flip.c +++ b/src/flip.c @@ -24,9 +24,7 @@ #include #include #include -#ifdef HAVE_SYS_TYPES_H -#include -#endif +#include "algorithm.h" #include "alloc.h" #include "case.h" #include "command.h" @@ -123,7 +121,7 @@ cmd_flip (void) for (i = 0; i < flip->var_cnt; i++) if (flip->var[i] == flip->new_names) { - memmove (&flip->var[i], &flip->var[i + 1], sizeof *flip->var * (flip->var_cnt - i - 1)); + remove_element (flip->var, flip->var_cnt, sizeof *flip->var, i); flip->var_cnt--; break; } diff --git a/src/misc.h b/src/misc.h index 26b9a526..e7ec8126 100644 --- a/src/misc.h +++ b/src/misc.h @@ -28,83 +28,44 @@ /* HUGE_VAL is traditionally defined as positive infinity, or alternatively, DBL_MAX. */ #if !HAVE_ISINF -#define isinf(X) \ - (fabs (X) == HUGE_VAL) +#define isinf(X) (fabs (X) == HUGE_VAL) #endif /* A Not a Number is not equal to itself. */ #if !HAVE_ISNAN -#define isnan(X) \ - ((X) != (X)) +#define isnan(X) ((X) != (X)) #endif /* Finite numbers are not infinities or NaNs. */ #if !HAVE_FINITE -#define finite(X) \ - (!isinf (X) && !isnan (X)) +#define finite(X) (!isinf (X) && !isnan (X)) #elif HAVE_IEEEFP_H #include /* Declares finite() under Solaris. */ #endif -#if __TURBOC__ -#include /* screwed-up Borland headers define min(), max(), - so we might as well let 'em */ -#endif - #ifndef min -#if __GNUC__ && !__STRICT_ANSI__ -#define min(A, B) \ - ({ \ - int _a = (A), _b = (B); \ - _a < _b ? _a : _b; \ - }) -#else /* !__GNUC__ */ -#define min(A, B) \ - ((A) < (B) ? (A) : (B)) -#endif /* !__GNUC__ */ -#endif /* !min */ +#define min(A, B) ((A) < (B) ? (A) : (B)) +#endif #ifndef max -#if __GNUC__ && !__STRICT_ANSI__ -#define max(A, B) \ - ({ \ - int _a = (A), _b = (B); \ - _a > _b ? _a : _b; \ - }) -#else /* !__GNUC__ */ -#define max(A, B) \ - ((A) > (B) ? (A) : (B)) -#endif /* !__GNUC__ */ -#endif /* !max */ +#define max(A, B) ((A) > (B) ? (A) : (B)) +#endif /* Clamps A to be between B and C. */ -#define range(A, B, C) \ - ((A) < (B) ? (B) : ((A) > (C) ? (C) : (A))) +#define range(A, B, C) ((A) < (B) ? (B) : ((A) > (C) ? (C) : (A))) /* Divides nonnegative X by positive Y, rounding up. */ -#define DIV_RND_UP(X, Y) \ - (((X) + ((Y) - 1)) / (Y)) +#define DIV_RND_UP(X, Y) (((X) + ((Y) - 1)) / (Y)) /* Returns nonnegative difference between {nonnegative X} and {the least multiple of positive Y greater than or equal to X}. */ -#if __GNUC__ && !__STRICT_ANSI__ -#define REM_RND_UP(X, Y) \ - ({ \ - int rem = (X) % (Y); \ - rem ? (Y) - rem : 0; \ - }) -#else -#define REM_RND_UP(X, Y) \ - ((X) % (Y) ? (Y) - (X) % (Y) : 0) -#endif +#define REM_RND_UP(X, Y) ((X) % (Y) ? (Y) - (X) % (Y) : 0) /* Rounds X up to the next multiple of Y. */ -#define ROUND_UP(X, Y) \ - (((X) + ((Y) - 1)) / (Y) * (Y)) +#define ROUND_UP(X, Y) (((X) + ((Y) - 1)) / (Y) * (Y)) /* Rounds X down to the previous multiple of Y. */ -#define ROUND_DOWN(X, Y) \ - ((X) / (Y) * (Y)) +#define ROUND_DOWN(X, Y) ((X) / (Y) * (Y)) int intlog10 (unsigned); diff --git a/src/sort.c b/src/sort.c index c2f48d0b..f71b93c5 100644 --- a/src/sort.c +++ b/src/sort.c @@ -20,6 +20,7 @@ #include #include "sort.h" #include "error.h" +#include #include #include #include @@ -31,6 +32,7 @@ #include "command.h" #include "error.h" #include "expressions/public.h" +#include "glob.h" #include "lexer.h" #include "misc.h" #include "settings.h" @@ -39,18 +41,6 @@ #include "vfm.h" #include "vfmP.h" -#if HAVE_UNISTD_H -#include -#endif - -#if HAVE_SYS_TYPES_H -#include -#endif - -#if HAVE_SYS_STAT_H -#include -#endif - #include "debug-print.h" /* Sort direction. */ @@ -75,6 +65,11 @@ struct sort_criteria size_t crit_cnt; }; +/* These should only be changed for testing purposes. */ +static int min_buffers = 64; +static int max_buffers = INT_MAX; +static bool allow_internal_sort = true; + static int compare_record (const struct ccase *, const struct ccase *, const struct sort_criteria *); static struct casefile *do_internal_sort (struct casereader *, @@ -87,7 +82,7 @@ int cmd_sort_cases (void) { struct sort_criteria *criteria; - int success; + bool success = false; lex_match (T_BY); @@ -95,7 +90,30 @@ cmd_sort_cases (void) if (criteria == NULL) return CMD_FAILURE; + if (test_mode && lex_match ('/')) + { + if (!lex_force_match_id ("BUFFERS") || !lex_match ('=') + || !lex_force_int ()) + goto done; + + min_buffers = max_buffers = lex_integer (); + allow_internal_sort = false; + if (max_buffers < 2) + { + msg (SE, _("Buffer limit must be at least 2.")); + goto done; + } + + lex_get (); + } + success = sort_active_file_in_place (criteria); + + done: + min_buffers = 64; + max_buffers = INT_MAX; + allow_internal_sort = true; + sort_destroy_criteria (criteria); return success ? lex_end_of_command () : CMD_FAILURE; } @@ -252,9 +270,7 @@ sort_destroy_criteria (struct sort_criteria *criteria) struct casefile * sort_execute (struct casereader *reader, const struct sort_criteria *criteria) { - struct casefile *output; - - output = do_internal_sort (reader, criteria); + struct casefile *output = do_internal_sort (reader, criteria); if (output == NULL) output = do_external_sort (reader, criteria); casereader_destroy (reader); @@ -271,7 +287,7 @@ struct indexed_case static int compare_indexed_cases (const void *, const void *, void *); /* If the data is in memory, do an internal sort and return a new - casefile for the data. */ + casefile for the data. Otherwise, return a null pointer. */ static struct casefile * do_internal_sort (struct casereader *reader, const struct sort_criteria *criteria) @@ -280,6 +296,9 @@ do_internal_sort (struct casereader *reader, struct casefile *dst; unsigned long case_cnt; + if (!allow_internal_sort) + return NULL; + src = casereader_get_casefile (reader); if (casefile_get_case_cnt (src) > 1 && !casefile_in_core (src)) return NULL; @@ -335,52 +354,29 @@ compare_indexed_cases (const void *a_, const void *b_, void *criteria_) /* External sort. */ -/* Maximum order of merge. If you increase this, then you should - use a heap for comparing cases during merge. */ -#define MAX_MERGE_ORDER 7 - -/* Minimum total number of records for buffers. */ -#define MIN_BUFFER_TOTAL_SIZE_RECS 64 - -/* Minimum single input buffer size, in bytes and records. */ -#define MIN_BUFFER_SIZE_BYTES 4096 -#define MIN_BUFFER_SIZE_RECS 16 - -#if MIN_BUFFER_SIZE_RECS * 2 + 16 > MIN_BUFFER_TOTAL_SIZE_RECS -#error MIN_BUFFER_SIZE_RECS and MIN_BUFFER_TOTAL_SIZE_RECS do not make sense. -#endif - -/* Sorts initial runs A and B in decending order by length. */ -static int -compare_initial_runs (const void *a_, const void *b_, void *aux UNUSED) -{ - struct casefile *const *a = a_; - struct casefile *const *b = b_; - unsigned long a_case_cnt = casefile_get_case_cnt (*a); - unsigned long b_case_cnt = casefile_get_case_cnt (*b); - - return a_case_cnt > b_case_cnt ? -1 : a_case_cnt < b_case_cnt; -} +/* Maximum order of merge (external sort only). The maximum + reasonable value is about 7. Above that, it would be a good + idea to use a heap in merge_once() to select the minimum. */ +#define MAX_MERGE_ORDER 7 /* Results of an external sort. */ struct external_sort { const struct sort_criteria *criteria; /* Sort criteria. */ size_t value_cnt; /* Size of data in `union value's. */ - struct casefile **initial_runs; /* Array of initial runs. */ + struct casefile **runs; /* Array of initial runs. */ size_t run_cnt, run_cap; /* Number of runs, allocated capacity. */ }; /* Prototypes for helper functions. */ -static int write_initial_runs (struct external_sort *, struct casereader *); -static int merge (struct external_sort *); +static int write_runs (struct external_sort *, struct casereader *); +static struct casefile *merge (struct external_sort *); static void destroy_external_sort (struct external_sort *); -/* Performs an external sort of the active file according to the - specification in SCP. Forms initial runs using a heap as a - reservoir. Determines the optimum merge pattern via Huffman's - method (see Knuth vol. 3, 2nd edition, p. 365-366), and merges - according to that pattern. */ +/* Performs a stable external sort of the active file according + to the specification in SCP. Forms initial runs using a heap + as a reservoir. Merges the initial runs according to a + pattern that assures stability. */ static struct casefile * do_external_sort (struct casereader *reader, const struct sort_criteria *criteria) @@ -394,11 +390,10 @@ do_external_sort (struct casereader *reader, xsrt->value_cnt = casefile_get_value_cnt (casereader_get_casefile (reader)); xsrt->run_cap = 512; xsrt->run_cnt = 0; - xsrt->initial_runs = xmalloc (sizeof *xsrt->initial_runs * xsrt->run_cap); - if (write_initial_runs (xsrt, reader) && merge (xsrt)) + xsrt->runs = xmalloc (sizeof *xsrt->runs * xsrt->run_cap); + if (write_runs (xsrt, reader)) { - struct casefile *output = xsrt->initial_runs[0]; - xsrt->initial_runs[0] = NULL; + struct casefile *output = merge (xsrt); destroy_external_sort (xsrt); return output; } @@ -418,8 +413,8 @@ destroy_external_sort (struct external_sort *xsrt) int i; for (i = 0; i < xsrt->run_cnt; i++) - casefile_destroy (xsrt->initial_runs[i]); - free (xsrt->initial_runs); + casefile_destroy (xsrt->runs[i]); + free (xsrt->runs); free (xsrt); } } @@ -431,6 +426,7 @@ struct record_run { int run; /* Run number of case. */ struct ccase record; /* Case data. */ + size_t idx; /* Case number (for stability). */ }; /* Represents a set of initial runs during an external sort. */ @@ -455,7 +451,8 @@ struct initial_run_state static const struct case_sink_class sort_sink_class; static void destroy_initial_run_state (struct initial_run_state *); -static void process_case (struct initial_run_state *, const struct ccase *); +static void process_case (struct initial_run_state *, const struct ccase *, + size_t); static int allocate_cases (struct initial_run_state *); static void output_record (struct initial_run_state *); static void start_run (struct initial_run_state *); @@ -467,10 +464,11 @@ static int compare_record_run_minheap (const void *, const void *, void *); /* Reads cases from READER and composes initial runs in XSRT. */ static int -write_initial_runs (struct external_sort *xsrt, struct casereader *reader) +write_runs (struct external_sort *xsrt, struct casereader *reader) { struct initial_run_state *irs; struct ccase c; + size_t idx = 0; int success = 0; /* Allocate memory for cases. */ @@ -489,7 +487,7 @@ write_initial_runs (struct external_sort *xsrt, struct casereader *reader) /* Create initial runs. */ start_run (irs); for (; irs->okay && casereader_read (reader, &c); case_destroy (&c)) - process_case (irs, &c); + process_case (irs, &c, idx++); while (irs->okay && irs->record_cnt > 0) output_record (irs); end_run (irs); @@ -504,18 +502,19 @@ write_initial_runs (struct external_sort *xsrt, struct casereader *reader) /* Add a single case to an initial run. */ static void -process_case (struct initial_run_state *irs, const struct ccase *c) +process_case (struct initial_run_state *irs, const struct ccase *c, size_t idx) { - struct record_run *new_record_run; + struct record_run *rr; /* Compose record_run for this run and add to heap. */ assert (irs->record_cnt < irs->record_cap - 1); - new_record_run = irs->records + irs->record_cnt++; - case_copy (&new_record_run->record, 0, c, 0, irs->xsrt->value_cnt); - new_record_run->run = irs->run; + rr = irs->records + irs->record_cnt++; + case_copy (&rr->record, 0, c, 0, irs->xsrt->value_cnt); + rr->run = irs->run; + rr->idx = idx; if (!case_is_null (&irs->last_output) && compare_record (c, &irs->last_output, irs->xsrt->criteria) < 0) - new_record_run->run = irs->run + 1; + rr->run = irs->run + 1; push_heap (irs->records, irs->record_cnt, sizeof *irs->records, compare_record_run_minheap, irs); @@ -557,6 +556,8 @@ allocate_cases (struct initial_run_state *irs) + irs->xsrt->value_cnt * sizeof (union value) + 4 * sizeof (void *)); max_cases = get_max_workspace() / approx_case_cost; + if (max_cases > max_buffers) + max_cases = max_buffers; irs->records = malloc (sizeof *irs->records * max_cases); for (i = 0; i < max_cases; i++) if (!case_try_create (&irs->records[i].record, irs->xsrt->value_cnt)) @@ -567,12 +568,12 @@ allocate_cases (struct initial_run_state *irs) irs->record_cap = max_cases; /* Fail if we didn't allocate an acceptable number of cases. */ - if (irs->records == NULL || max_cases < MIN_BUFFER_TOTAL_SIZE_RECS) + if (irs->records == NULL || max_cases < min_buffers) { msg (SE, _("Out of memory. Could not allocate room for minimum of %d " "cases of %d bytes each. (PSPP workspace is currently " "restricted to a maximum of %d KB.)"), - MIN_BUFFER_TOTAL_SIZE_RECS, approx_case_cost, get_max_workspace() / 1024); + min_buffers, approx_case_cost, get_max_workspace() / 1024); return 0; } return 1; @@ -612,16 +613,18 @@ compare_record (const struct ccase *a, const struct ccase *b, } /* Compares record-run tuples A and B on run number first, then - on the current record according to SCP. */ + on record, then on case index. */ static int compare_record_run (const struct record_run *a, const struct record_run *b, struct initial_run_state *irs) { - if (a->run != b->run) - return a->run > b->run ? 1 : -1; - else - return compare_record (&a->record, &b->record, irs->xsrt->criteria); + int result = a->run < b->run ? -1 : a->run > b->run; + if (result == 0) + result = compare_record (&a->record, &b->record, irs->xsrt->criteria); + if (result == 0) + result = a->idx < b->idx ? -1 : a->idx > b->idx; + return result; } /* Compares record-run tuples A and B on run number first, then @@ -653,14 +656,14 @@ end_run (struct initial_run_state *irs) /* Record initial run. */ if (irs->casefile != NULL) { + casefile_sleep (irs->casefile); if (xsrt->run_cnt >= xsrt->run_cap) { xsrt->run_cap *= 2; - xsrt->initial_runs - = xrealloc (xsrt->initial_runs, - sizeof *xsrt->initial_runs * xsrt->run_cap); + xsrt->runs = xrealloc (xsrt->runs, + sizeof *xsrt->runs * xsrt->run_cap); } - xsrt->initial_runs[xsrt->run_cnt++] = irs->casefile; + xsrt->runs[xsrt->run_cnt++] = irs->casefile; irs->casefile = NULL; } } @@ -705,193 +708,128 @@ output_record (struct initial_run_state *irs) /* Merging. */ -/* State of merging initial runs. */ -struct merge_state - { - struct external_sort *xsrt; /* External sort state. */ - struct ccase *cases; /* Buffers. */ - size_t case_cnt; /* Number of buffers. */ - }; - -struct run; -static struct casefile *merge_once (struct merge_state *, +static int choose_merge (struct casefile *runs[], int run_cnt, int order); +static struct casefile *merge_once (struct external_sort *, struct casefile *[], size_t); -static int mod (int, int); -/* Performs a series of P-way merges of initial runs. */ -static int +/* Repeatedly merges run until only one is left, + and returns the final casefile. */ +static struct casefile * merge (struct external_sort *xsrt) { - struct merge_state mrg; /* State of merge. */ - size_t approx_case_cost; /* Approximate memory cost of one case. */ - int max_order; /* Maximum order of merge. */ - size_t dummy_run_cnt; /* Number of dummy runs to insert. */ - int success = 0; - int i; - - mrg.xsrt = xsrt; - - /* Allocate as many cases as possible into cases. */ - approx_case_cost = (sizeof *mrg.cases - + xsrt->value_cnt * sizeof (union value) - + 4 * sizeof (void *)); - mrg.case_cnt = get_max_workspace() / approx_case_cost; - mrg.cases = malloc (sizeof *mrg.cases * mrg.case_cnt); - if (mrg.cases == NULL) - goto done; - for (i = 0; i < mrg.case_cnt; i++) - if (!case_try_create (&mrg.cases[i], xsrt->value_cnt)) - { - mrg.case_cnt = i; - break; - } - if (mrg.case_cnt < MIN_BUFFER_TOTAL_SIZE_RECS) - { - msg (SE, _("Out of memory. Could not allocate room for minimum of %d " - "cases of %d bytes each. (PSPP workspace is currently " - "restricted to a maximum of %d KB.)"), - MIN_BUFFER_TOTAL_SIZE_RECS, approx_case_cost, get_max_workspace() / 1024); - return 0; - } - - /* Determine maximum order of merge. */ - max_order = MAX_MERGE_ORDER; - if (mrg.case_cnt / max_order < MIN_BUFFER_SIZE_RECS) - max_order = mrg.case_cnt / MIN_BUFFER_SIZE_RECS; - else if (mrg.case_cnt / max_order * xsrt->value_cnt * sizeof (union value) - < MIN_BUFFER_SIZE_BYTES) - max_order = mrg.case_cnt / (MIN_BUFFER_SIZE_BYTES - / (xsrt->value_cnt * sizeof (union value))); - if (max_order < 2) - max_order = 2; - if (max_order > xsrt->run_cnt) - max_order = xsrt->run_cnt; - - /* Repeatedly merge the P shortest existing runs until only one run - is left. */ - make_heap (xsrt->initial_runs, xsrt->run_cnt, sizeof *xsrt->initial_runs, - compare_initial_runs, NULL); - dummy_run_cnt = mod (1 - (int) xsrt->run_cnt, max_order - 1); - - assert (max_order > 0); - assert (max_order <= 2 - || (xsrt->run_cnt + dummy_run_cnt) % (max_order - 1) == 1); while (xsrt->run_cnt > 1) { - struct casefile *output_run; - int order; - int i; - - /* Choose order of merge (max_order after first merge). */ - order = max_order - dummy_run_cnt; - dummy_run_cnt = 0; - - /* Choose runs to merge. */ - assert (xsrt->run_cnt >= order); - for (i = 0; i < order; i++) - pop_heap (xsrt->initial_runs, xsrt->run_cnt--, - sizeof *xsrt->initial_runs, - compare_initial_runs, NULL); - - /* Merge runs. */ - output_run = merge_once (&mrg, - xsrt->initial_runs + xsrt->run_cnt, order); - if (output_run == NULL) - goto done; - - /* Add output run to heap. */ - xsrt->initial_runs[xsrt->run_cnt++] = output_run; - push_heap (xsrt->initial_runs, xsrt->run_cnt, sizeof *xsrt->initial_runs, - compare_initial_runs, NULL); + int order = min (MAX_MERGE_ORDER, xsrt->run_cnt); + int idx = choose_merge (xsrt->runs, xsrt->run_cnt, order); + xsrt->runs[idx] = merge_once (xsrt, xsrt->runs + idx, order); + remove_range (xsrt->runs, xsrt->run_cnt, sizeof *xsrt->runs, + idx + 1, order - 1); + xsrt->run_cnt -= order - 1; } - - /* Exactly one run is left, which contains the entire sorted - file. We could use it to find a total case count. */ assert (xsrt->run_cnt == 1); - - success = 1; - - done: - for (i = 0; i < mrg.case_cnt; i++) - case_destroy (&mrg.cases[i]); - free (mrg.cases); - - return success; + xsrt->run_cnt = 0; + return xsrt->runs[0]; } -/* Modulo function as defined by Knuth. */ +/* Chooses ORDER runs out of the RUN_CNT runs in RUNS to merge, + and returns the index of the first one. + + For stability, we must merge only consecutive runs. For + efficiency, we choose the shortest consecutive sequence of + runs. */ static int -mod (int x, int y) +choose_merge (struct casefile *runs[], int run_cnt, int order) { - if (y == 0) - return x; - else if (x == 0) - return 0; - else if (x > 0 && y > 0) - return x % y; - else if (x < 0 && y > 0) - return y - (-x) % y; + int min_idx, min_sum; + int cur_idx, cur_sum; + int i; + + /* Sum up the length of the first ORDER runs. */ + cur_sum = 0; + for (i = 0; i < order; i++) + cur_sum += casefile_get_case_cnt (runs[i]); + + /* Find the shortest group of ORDER runs, + using a running total for efficiency. */ + min_idx = 0; + min_sum = cur_sum; + for (cur_idx = 1; cur_idx + order <= run_cnt; cur_idx++) + { + cur_sum -= casefile_get_case_cnt (runs[cur_idx - 1]); + cur_sum += casefile_get_case_cnt (runs[cur_idx + order - 1]); + if (cur_sum < min_sum) + { + min_sum = cur_sum; + min_idx = cur_idx; + } + } - abort (); + return min_idx; } -/* Merges the RUN_CNT initial runs specified in INPUT_RUNS into a - new run. Returns nonzero only if successful. Adds an entry - to MRG->xsrt->runs for the output file if and only if the - output file is actually created. Always deletes all the input - files. */ +/* Merges the RUN_CNT initial runs specified in INPUT_FILES into a + new run, and returns the new run. */ static struct casefile * -merge_once (struct merge_state *mrg, - struct casefile *input_runs[], +merge_once (struct external_sort *xsrt, + struct casefile **const input_files, size_t run_cnt) { - struct casereader *input_readers[MAX_MERGE_ORDER]; - struct ccase input_cases[MAX_MERGE_ORDER]; - struct casefile *output_casefile = NULL; + struct run + { + struct casefile *file; + struct casereader *reader; + struct ccase ccase; + } + *runs; + + struct casefile *output = NULL; int i; + /* Open input files. */ + runs = xmalloc (sizeof *runs * run_cnt); for (i = 0; i < run_cnt; i++) { - input_readers[i] = casefile_get_destructive_reader (input_runs[i]); - if (!casereader_read_xfer (input_readers[i], &input_cases[i])) + struct run *r = &runs[i]; + r->file = input_files[i]; + r->reader = casefile_get_destructive_reader (r->file); + if (!casereader_read_xfer (r->reader, &r->ccase)) { run_cnt--; i--; } } - - output_casefile = casefile_create (mrg->xsrt->value_cnt); - casefile_to_disk (output_casefile); + + /* Create output file. */ + output = casefile_create (xsrt->value_cnt); + casefile_to_disk (output); /* Merge. */ while (run_cnt > 0) { - size_t min_idx; - + struct run *min_run, *run; + /* Find minimum. */ - min_idx = 0; - for (i = 1; i < run_cnt; i++) - if (compare_record (&input_cases[i], &input_cases[min_idx], - mrg->xsrt->criteria) < 0) - min_idx = i; + min_run = runs; + for (run = runs + 1; run < runs + run_cnt; run++) + if (compare_record (&run->ccase, &min_run->ccase, xsrt->criteria) < 0) + min_run = run; /* Write minimum to output file. */ - casefile_append_xfer (output_casefile, &input_cases[min_idx]); + casefile_append_xfer (output, &min_run->ccase); - if (!casereader_read_xfer (input_readers[min_idx], - &input_cases[min_idx])) + /* Read another case from minimum run. */ + if (!casereader_read_xfer (min_run->reader, &min_run->ccase)) { - casereader_destroy (input_readers[min_idx]); - casefile_destroy (input_runs[min_idx]); + casereader_destroy (min_run->reader); + casefile_destroy (min_run->file); + remove_element (runs, run_cnt, sizeof *runs, min_run - runs); run_cnt--; - input_runs[min_idx] = input_runs[run_cnt]; - input_readers[min_idx] = input_readers[run_cnt]; - input_cases[min_idx] = input_cases[run_cnt]; } } - casefile_sleep (output_casefile); + casefile_sleep (output); + free (runs); - return output_casefile; + return output; } diff --git a/tests/ChangeLog b/tests/ChangeLog index bbc4a7f2..79c7c673 100644 --- a/tests/ChangeLog +++ b/tests/ChangeLog @@ -1,3 +1,11 @@ +Mon Mar 14 21:58:23 2005 Ben Pfaff + + * Makefile.am: (TESTS_ENVIRONMENT) Add PERL to the test + environment. + + * commands/sort.sh: Rewrite to test more thoroughly and to verify + that the sort is stable. + Sat Mar 12 23:30:37 2005 Ben Pfaff * bugs/agg-crash-2.sh, bugs/big-input-2.sh, command/aggregate.sh: diff --git a/tests/Makefile.am b/tests/Makefile.am index d018e19e..57b819d2 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -1,6 +1,7 @@ ## Process this file with automake to produce Makefile.in -*- makefile -*- -TESTS_ENVIRONMENT=top_srcdir=${top_srcdir} +TESTS_ENVIRONMENT = top_srcdir=${top_srcdir} +TESTS_ENVIRONMENT += PERL='@PERL@' TESTS = \ command/aggregate.sh \ command/autorecod.sh \ diff --git a/tests/command/sort.sh b/tests/command/sort.sh index 44e5fd05..f51881a3 100755 --- a/tests/command/sort.sh +++ b/tests/command/sort.sh @@ -46,38 +46,75 @@ mkdir -p $TEMPDIR cd $TEMPDIR - -activity="generate stat program" -cat > $TESTFILE < gen-data.pl <<'EOF' +use strict; +use warnings; + +# Generate shuffled data. +my (@data); +for my $i (0...$ARGV[0] - 1) { + push (@data, $i) foreach 1...$ARGV[1]; +} +fisher_yates_shuffle (\@data); + +# Output shuffled data. +my (@shuffled) = map ([$data[$_], $_], 0...$#data); +open (SHUFFLED, ">sort.in"); +print SHUFFLED "$data[$_] $_\n" foreach 0...$#data; + +# Output sorted data. +my (@sorted) = sort { $a->[0] <=> $b->[0] || $a->[1] <=> $b->[1] } @shuffled; +open (SORTED, ">sort.exp"); +print SORTED "$_->[0] $_->[1]\n" foreach @sorted; + +# From perlfaq4. +sub fisher_yates_shuffle { + my $deck = shift; # $deck is a reference to an array + my $i = @$deck; + while ($i--) { + my $j = int rand ($i+1); + @$deck[$i,$j] = @$deck[$j,$i]; + } +} EOF -if [ $? -ne 0 ] ; then no_result ; fi - -activity="run program" -$SUPERVISOR $here/../src/pspp -o raw-ascii $TESTFILE -if [ $? -ne 0 ] ; then no_result ; fi - -# Now there should be some sorted data in $TEMPDIR/pspp.list -# We have to do some checks on it. - -# 1. Is it sorted ? - -activity="check sorted" -sort $TEMPDIR/pspp.list > $TEMPDIR/sortsort -if [ $? -ne 0 ] ; then no_result ; fi - -diff -B -b $TEMPDIR/sortsort $TEMPDIR/pspp.list -if [ $? -ne 0 ] ; then fail ; fi - -# 2. It should be six elements wide -activity="check data width" -awk '!/^\ *$/{if (NF!=6) exit 1}' $TEMPDIR/pspp.list -if [ $? -ne 0 ] ; then fail ; fi - +for count_repeat_buffers in \ + "100 5 2" "100 5 3" "100 5 4" "100 5 5" "100 5 10" "100 5 50" "100 5 100" "100 5" \ + "100 10 2" "100 10 3" "100 10 5" "100 10" \ + "1000 5 5" "1000 5 50" "1000 5" \ + "100 100 3" "100 100 5" "100 100" \ + "10000 5 500" \ + "50000 1"; do + set $count_repeat_buffers + count=$1 + repeat=$2 + buffers=$3 + + echo -n . + + activity="generate data for $count_repeat_buffers run" + $PERL gen-data.pl $count $repeat > sort.data + if [ $? -ne 0 ] ; then no_result ; fi + + activity="generate test program for $count_repeat_buffers run" + { + echo "data list list file='sort.in'/x y (f8)." + if test "$buffers" != ""; then + echo "sort by x/buffers=$buffers." + else + echo "sort by x." + fi + echo "print outfile='sort.out'/x y." + echo "execute." + } > sort.pspp || no_result + + activity="run program" + $SUPERVISOR $here/../src/pspp --testing-mode -o raw-ascii sort.pspp + if [ $? -ne 0 ] ; then no_result ; fi + + diff -B -w sort.exp sort.out + if [ $? -ne 0 ] ; then fail ; fi +done +echo pass; -- 2.30.2