1 /* PSPP - a program for statistical analysis.
2 Copyright (C) 2007, 2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 /* Balanced tree (BT) data structure.
19 The client should not need to be aware of the form of
20 balancing applied to the balanced tree, as its operation is
21 fully encapsulated. The current implementation is a scapegoat
22 tree. Scapegoat trees have the advantage over many other
23 forms of balanced trees that they do not store any additional
24 information in each node; thus, they are slightly more
25 space-efficient than, say, red-black or AVL trees. Compared
26 to splay trees, scapegoat trees provide guaranteed logarithmic
27 worst-case search time and do not restructure the tree during
30 For information on scapegoat trees, see Galperin and Rivest,
31 "Scapegoat Trees", Proc. 4th ACM-SIAM Symposium on Discrete
32 Algorithms, or <http://en.wikipedia.org/wiki/Scapegoat_tree>,
33 which includes source code and links to other resources, such
34 as the Galperin and Rivest paper.
36 One potentially tricky part of scapegoat tree design is the
37 choice of alpha, which is a real constant that must be greater
38 than 0.5 and less than 1. We must be able to efficiently
39 calculate h_alpha = floor(log(n)/log(1/alpha)) for integer n >
40 0. One way to do so would be to maintain a table relating
41 h_alpha to the minimum value of n that yields that h_alpha.
42 Then, we can track the value of h_alpha(n) in the number of
43 nodes in the tree n as nodes are inserted and deleted.
45 This implementation uses a different approach. We choose
46 alpha = sqrt(2)/2 = 1/sqrt(2) ~= .707. Then, computing
47 h_alpha is a matter of computing a logarithm in base sqrt(2).
48 This is easy: we simply compute twice the base-2 logarithm,
49 then adjust upward by 1 if necessary. See calculate_h_alpha
52 /* These library routines have no external dependencies other
53 than the standard C library.
55 If you add routines in this file, please add a corresponding
56 test to bt-test.c. This test program should achieve 100%
57 coverage of lines and branches in this code, as reported by
64 #include <libpspp/bt.h>
70 #include <libpspp/cast.h>
72 static void rebalance_subtree (struct bt *, struct bt_node *, size_t);
74 static struct bt_node **down_link (struct bt *, struct bt_node *);
75 static inline struct bt_node *sibling (struct bt_node *p);
76 static size_t count_nodes_in_subtree (const struct bt_node *);
78 static inline int floor_log2 (size_t);
79 static inline int calculate_h_alpha (size_t);
81 /* Initializes BT as an empty BT that uses the given COMPARE
82 function, passing in AUX as auxiliary data. */
84 bt_init (struct bt *bt,
85 bt_compare_func *compare,
89 bt->compare = compare;
95 /* Inserts the given NODE into BT.
96 Returns a null pointer if successful.
97 Returns the existing node already in BT equal to NODE, on
100 bt_insert (struct bt *bt, struct bt_node *node)
104 node->down[0] = NULL;
105 node->down[1] = NULL;
107 if (bt->root == NULL)
114 struct bt_node *p = bt->root;
119 cmp = bt->compare (node, p, bt->aux);
125 if (p->down[dir] == NULL)
136 if (bt->size > bt->max_size)
137 bt->max_size = bt->size;
139 if (depth > calculate_h_alpha (bt->size))
141 /* We use the "alternative" method of finding a scapegoat
142 node described by Galperin and Rivest. */
143 struct bt_node *s = node;
150 size += 1 + count_nodes_in_subtree (sibling (s));
152 if (i > calculate_h_alpha (size))
154 rebalance_subtree (bt, s, size);
160 rebalance_subtree (bt, bt->root, bt->size);
161 bt->max_size = bt->size;
169 /* Deletes P from BT. */
171 bt_delete (struct bt *bt, struct bt_node *p)
173 struct bt_node **q = down_link (bt, p);
174 struct bt_node *r = p->down[1];
181 else if (r->down[0] == NULL)
183 r->down[0] = p->down[0];
186 if (r->down[0] != NULL)
191 struct bt_node *s = r->down[0];
192 while (s->down[0] != NULL)
195 r->down[0] = s->down[1];
196 s->down[0] = p->down[0];
197 s->down[1] = p->down[1];
199 if (s->down[0] != NULL)
203 if (r->down[0] != NULL)
208 /* We approximate .707 as .75 here. This is conservative: it
209 will cause us to do a little more rebalancing than strictly
210 necessary to maintain the scapegoat tree's height
212 if (bt->size < bt->max_size * 3 / 4 && bt->size > 0)
214 rebalance_subtree (bt, bt->root, bt->size);
215 bt->max_size = bt->size;
219 /* Returns the node with minimum value in BT, or a null pointer
222 bt_first (const struct bt *bt)
224 struct bt_node *p = bt->root;
226 while (p->down[0] != NULL)
231 /* Returns the node with maximum value in BT, or a null pointer
234 bt_last (const struct bt *bt)
236 struct bt_node *p = bt->root;
238 while (p->down[1] != NULL)
243 /* Searches BT for a node equal to TARGET.
244 Returns the node if found, or a null pointer otherwise. */
246 bt_find (const struct bt *bt, const struct bt_node *target)
248 const struct bt_node *p;
251 for (p = bt->root; p != NULL; p = p->down[cmp > 0])
253 cmp = bt->compare (target, p, bt->aux);
255 return CONST_CAST (struct bt_node *, p);
261 /* Searches BT for, and returns, the first node in in-order whose
262 value is greater than or equal to TARGET. Returns a null
263 pointer if all nodes in BT are less than TARGET.
265 Another way to look at the return value is that it is the node
266 that would be returned by "bt_next (BT, TARGET)" if TARGET
267 were inserted in BT (assuming that TARGET would not be a
270 bt_find_ge (const struct bt *bt, const struct bt_node *target)
272 const struct bt_node *p = bt->root;
273 const struct bt_node *q = NULL;
276 int cmp = bt->compare (target, p, bt->aux);
288 return CONST_CAST (struct bt_node *, q);
291 /* Searches BT for, and returns, the last node in in-order whose
292 value is less than or equal to TARGET, which should not be in
293 BT. Returns a null pointer if all nodes in BT are greater
296 Another way to look at the return value is that it is the node
297 that would be returned by "bt_prev (BT, TARGET)" if TARGET
298 were inserted in BT (assuming that TARGET would not be a
301 bt_find_le (const struct bt *bt, const struct bt_node *target)
303 const struct bt_node *p = bt->root;
304 const struct bt_node *q = NULL;
307 int cmp = bt->compare (target, p, bt->aux);
319 return CONST_CAST (struct bt_node *, q);
322 /* Returns the node in BT following P in in-order.
323 If P is null, returns the minimum node in BT.
324 Returns a null pointer if P is the maximum node in BT or if P
325 is null and BT is empty. */
327 bt_next (const struct bt *bt, const struct bt_node *p)
330 return bt_first (bt);
331 else if (p->down[1] == NULL)
334 for (q = p->up; ; p = q, q = q->up)
335 if (q == NULL || p == q->down[0])
341 while (p->down[0] != NULL)
343 return CONST_CAST (struct bt_node *, p);
347 /* Returns the node in BT preceding P in in-order.
348 If P is null, returns the maximum node in BT.
349 Returns a null pointer if P is the minimum node in BT or if P
350 is null and BT is empty. */
352 bt_prev (const struct bt *bt, const struct bt_node *p)
356 else if (p->down[0] == NULL)
359 for (q = p->up; ; p = q, q = q->up)
360 if (q == NULL || p == q->down[1])
366 while (p->down[1] != NULL)
368 return CONST_CAST (struct bt_node *, p);
372 /* Moves P around in BT to compensate for its key having
373 changed. Returns a null pointer if successful. If P's new
374 value is equal to that of some other node in BT, returns the
375 other node after removing P from BT.
377 This function is an optimization only if it is likely that P
378 can actually retain its relative position in BT, e.g. its key
379 has only been adjusted slightly. Otherwise, it is more
380 efficient to simply remove P from BT, change its key, and
383 It is not safe to update more than one node's key, then to
384 call this function for each node. Instead, update a single
385 node's key, call this function, update another node's key, and
386 so on. Alternatively, remove all affected nodes from the
387 tree, update their keys, then re-insert all of them. */
389 bt_changed (struct bt *bt, struct bt_node *p)
391 struct bt_node *prev = bt_prev (bt, p);
392 struct bt_node *next = bt_next (bt, p);
394 if ((prev != NULL && bt->compare (prev, p, bt->aux) >= 0)
395 || (next != NULL && bt->compare (p, next, bt->aux) >= 0))
398 return bt_insert (bt, p);
403 /* BT nodes may be moved around in memory as necessary, e.g. as
404 the result of an realloc operation on a block that contains a
405 node. Once this is done, call this function passing node P
406 that was moved and its BT before attempting any other
409 It is not safe to move more than one node, then to call this
410 function for each node. Instead, move a single node, call
411 this function, move another node, and so on. Alternatively,
412 remove all affected nodes from the tree, move them, then
413 re-insert all of them. */
415 bt_moved (struct bt *bt, struct bt_node *p)
419 int d = p->up->down[0] == NULL || bt->compare (p, p->up, bt->aux) > 0;
424 if (p->down[0] != NULL)
426 if (p->down[1] != NULL)
432 This algorithm is from Q. F. Stout and B. L. Warren, "Tree
433 Rebalancing in Optimal Time and Space", CACM 29(1986):9,
434 pp. 902-908. It uses O(N) time and O(1) space to rebalance a
435 subtree that contains N nodes. */
437 static void tree_to_vine (struct bt_node **);
438 static void vine_to_tree (struct bt_node **, size_t count);
440 /* Rebalances the subtree in BT rooted at SUBTREE, which contains
441 exactly COUNT nodes. */
443 rebalance_subtree (struct bt *bt, struct bt_node *subtree, size_t count)
445 struct bt_node *up = subtree->up;
446 struct bt_node **q = down_link (bt, subtree);
448 vine_to_tree (q, count);
452 /* Converts the subtree rooted at *Q into a vine (a binary search
453 tree in which all the right links are null), and updates *Q to
454 point to the new root of the subtree. */
456 tree_to_vine (struct bt_node **q)
458 struct bt_node *p = *q;
460 if (p->down[1] == NULL)
467 struct bt_node *r = p->down[1];
468 p->down[1] = r->down[0];
475 /* Performs a compression transformation COUNT times, starting at
476 *Q, and updates *Q to point to the new root of the subtree. */
478 compress (struct bt_node **q, unsigned long count)
482 struct bt_node *red = *q;
483 struct bt_node *black = red->down[0];
486 red->down[0] = black->down[1];
487 black->down[1] = red;
489 if (red->down[0] != NULL)
490 red->down[0]->up = red;
495 /* Converts the vine rooted at *Q, which contains exactly COUNT
496 nodes, into a balanced tree, and updates *Q to point to the
497 new root of the balanced tree. */
499 vine_to_tree (struct bt_node **q, size_t count)
501 size_t leaf_nodes = count + 1 - ((size_t) 1 << floor_log2 (count + 1));
502 size_t vine_nodes = count - leaf_nodes;
504 compress (q, leaf_nodes);
505 while (vine_nodes > 1)
508 compress (q, vine_nodes);
510 while ((*q)->down[0] != NULL)
512 (*q)->down[0]->up = *q;
517 /* Other binary tree helper functions. */
519 /* Returns the address of the pointer that points down to P
521 static struct bt_node **
522 down_link (struct bt *bt, struct bt_node *p)
524 struct bt_node *q = p->up;
525 return q != NULL ? &q->down[q->down[0] != p] : &bt->root;
528 /* Returns node P's sibling; that is, the other child of its
529 parent. P must not be the root. */
530 static inline struct bt_node *
531 sibling (struct bt_node *p)
533 struct bt_node *q = p->up;
534 return q->down[q->down[0] == p];
537 /* Returns the number of nodes in the given SUBTREE. */
539 count_nodes_in_subtree (const struct bt_node *subtree)
541 /* This is an in-order traversal modified to iterate only the
546 const struct bt_node *p = subtree;
547 while (p->down[0] != NULL)
552 if (p->down[1] != NULL)
555 while (p->down[0] != NULL)
562 const struct bt_node *q;
579 /* Returns the number of high-order 0-bits in X.
580 Undefined if X is zero. */
582 count_leading_zeros (size_t x)
585 #if SIZE_MAX > ULONG_MAX
586 return __builtin_clzll (x);
587 #elif SIZE_MAX > UINT_MAX
588 return __builtin_clzl (x);
590 return __builtin_clz (x);
593 /* This algorithm is from _Hacker's Delight_ section 5.3. */
597 #define COUNT_STEP(BITS) \
605 n = sizeof (size_t) * CHAR_BIT;
606 #if SIZE_MAX >> 31 >> 31 >> 2
609 #if SIZE_MAX >> 31 >> 1
617 return y != 0 ? n - 2 : n - x;
621 /* Returns floor(log2(x)).
622 Undefined if X is zero. */
624 floor_log2 (size_t x)
626 return sizeof (size_t) * CHAR_BIT - 1 - count_leading_zeros (x);
629 /* Returns floor(pow(sqrt(2), x * 2 + 1)).
630 Defined for X from 0 up to the number of bits in size_t minus
635 /* These constants are sqrt(2) multiplied by 2**63 or 2**31,
636 respectively, and then rounded to nearest. */
637 #if SIZE_MAX >> 31 >> 1
638 return (UINT64_C(0xb504f333f9de6484) >> (63 - x)) + 1;
640 return (0xb504f334 >> (31 - x)) + 1;
644 /* Returns floor(log(n)/log(sqrt(2))).
645 Undefined if N is 0. */
647 calculate_h_alpha (size_t n)
649 int log2 = floor_log2 (n);
651 /* The correct answer is either 2 * log2 or one more. So we
652 see if n >= pow(sqrt(2), 2 * log2 + 1) and if so, add 1. */
653 return (2 * log2) + (n >= pow_sqrt2 (log2));