1 /* PSPP - computes sample statistics.
2 Copyright (C) 2007 Free Software Foundation, Inc.
4 This program is free software; you can redistribute it and/or
5 modify it under the terms of the GNU General Public License as
6 published by the Free Software Foundation; either version 2 of the
7 License, or (at your option) any later version.
9 This program is distributed in the hope that it will be useful, but
10 WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
19 /* Balanced tree (BT) data structure.
21 The client should not need to be aware of the form of
22 balancing applied to the balanced tree, as its operation is
23 fully encapsulated. The current implementation is a scapegoat
24 tree. Scapegoat trees have the advantage over many other
25 forms of balanced trees that they do not store any additional
26 information in each node; thus, they are slightly more
27 space-efficient than, say, red-black or AVL trees. Compared
28 to splay trees, scapegoat trees provide guaranteed logarithmic
29 worst-case search time and do not restructure the tree during
32 For information on scapegoat trees, see Galperin and Rivest,
33 "Scapegoat Trees", Proc. 4th ACM-SIAM Symposium on Discrete
34 Algorithms, or <http://en.wikipedia.org/wiki/Scapegoat_tree>,
35 which includes source code and links to other resources, such
36 as the Galperin and Rivest paper.
38 One potentially tricky part of scapegoat tree design is the
39 choice of alpha, which is a real constant that must be greater
40 than 0.5 and less than 1. We must be able to efficiently
41 calculate h_alpha = floor(log(n)/log(1/alpha)) for integer n >
42 0. One way to do so would be to maintain a table relating
43 h_alpha to the minimum value of n that yields that h_alpha.
44 Then, we can track the value of h_alpha(n) in the number of
45 nodes in the tree n as nodes are inserted and deleted.
47 This implementation uses a different approach. We choose
48 alpha = sqrt(2)/2 = 1/sqrt(2) ~= .707. Then, computing
49 h_alpha is a matter of computing a logarithm in base sqrt(2).
50 This is easy: we simply compute twice the base-2 logarithm,
51 then adjust upward by 1 if necessary. See calculate_h_alpha
54 /* These library routines have no external dependencies other
55 than the standard C library.
57 If you add routines in this file, please add a corresponding
58 test to bt-test.c. This test program should achieve 100%
59 coverage of lines and branches in this code, as reported by
66 #include <libpspp/bt.h>
71 static void rebalance_subtree (struct bt *, struct bt_node *, size_t);
73 static struct bt_node **down_link (struct bt *, struct bt_node *);
74 static inline struct bt_node *sibling (struct bt_node *p);
75 static size_t count_nodes_in_subtree (const struct bt_node *);
77 static inline int floor_log2 (size_t);
78 static inline int calculate_h_alpha (size_t);
80 /* Initializes BT as an empty BT that uses the given COMPARE
81 function, passing in AUX as auxiliary data. */
83 bt_init (struct bt *bt,
84 bt_compare_func *compare,
88 bt->compare = compare;
94 /* Inserts the given NODE into BT.
95 Returns a null pointer if successful.
96 Returns the existing node already in BT equal to NODE, on
99 bt_insert (struct bt *bt, struct bt_node *node)
103 node->down[0] = NULL;
104 node->down[1] = NULL;
106 if (bt->root == NULL)
113 struct bt_node *p = bt->root;
118 cmp = bt->compare (node, p, bt->aux);
124 if (p->down[dir] == NULL)
135 if (bt->size > bt->max_size)
136 bt->max_size = bt->size;
138 if (depth > calculate_h_alpha (bt->size))
140 /* We use the "alternative" method of finding a scapegoat
141 node described by Galperin and Rivest. */
142 struct bt_node *s = node;
149 size += 1 + count_nodes_in_subtree (sibling (s));
151 if (i > calculate_h_alpha (size))
153 rebalance_subtree (bt, s, size);
159 rebalance_subtree (bt, bt->root, bt->size);
160 bt->max_size = bt->size;
168 /* Deletes P from BT. */
170 bt_delete (struct bt *bt, struct bt_node *p)
172 struct bt_node **q = down_link (bt, p);
173 struct bt_node *r = p->down[1];
180 else if (r->down[0] == NULL)
182 r->down[0] = p->down[0];
185 if (r->down[0] != NULL)
190 struct bt_node *s = r->down[0];
191 while (s->down[0] != NULL)
194 r->down[0] = s->down[1];
195 s->down[0] = p->down[0];
196 s->down[1] = p->down[1];
198 if (s->down[0] != NULL)
202 if (r->down[0] != NULL)
207 /* We approximate .707 as .75 here. This is conservative: it
208 will cause us to do a little more rebalancing than strictly
209 necessary to maintain the scapegoat tree's height
211 if (bt->size < bt->max_size * 3 / 4 && bt->size > 0)
213 rebalance_subtree (bt, bt->root, bt->size);
214 bt->max_size = bt->size;
218 /* Returns the node with minimum value in BT, or a null pointer
221 bt_first (const struct bt *bt)
223 struct bt_node *p = bt->root;
225 while (p->down[0] != NULL)
230 /* Returns the node with maximum value in BT, or a null pointer
233 bt_last (const struct bt *bt)
235 struct bt_node *p = bt->root;
237 while (p->down[1] != NULL)
242 /* Searches BT for a node equal to TARGET.
243 Returns the node if found, or a null pointer otherwise. */
245 bt_find (const struct bt *bt, const struct bt_node *target)
247 const struct bt_node *p;
250 for (p = bt->root; p != NULL; p = p->down[cmp > 0])
252 cmp = bt->compare (target, p, bt->aux);
254 return (struct bt_node *) p;
260 /* Searches BT for, and returns, the first node in in-order whose
261 value is greater than or equal to TARGET. Returns a null
262 pointer if all nodes in BT are less than TARGET.
264 Another way to look at the return value is that it is the node
265 that would be returned by "bt_next (BT, TARGET)" if TARGET
266 were inserted in BT (assuming that TARGET would not be a
269 bt_find_ge (const struct bt *bt, const struct bt_node *target)
271 const struct bt_node *p = bt->root;
272 const struct bt_node *q = NULL;
275 int cmp = bt->compare (target, p, bt->aux);
287 return (struct bt_node *) q;
290 /* Searches BT for, and returns, the last node in in-order whose
291 value is less than or equal to TARGET, which should not be in
292 BT. Returns a null pointer if all nodes in BT are greater
295 Another way to look at the return value is that it is the node
296 that would be returned by "bt_prev (BT, TARGET)" if TARGET
297 were inserted in BT (assuming that TARGET would not be a
300 bt_find_le (const struct bt *bt, const struct bt_node *target)
302 const struct bt_node *p = bt->root;
303 const struct bt_node *q = NULL;
306 int cmp = bt->compare (target, p, bt->aux);
318 return (struct bt_node *) q;
321 /* Returns the node in BT following P in in-order.
322 If P is null, returns the minimum node in BT.
323 Returns a null pointer if P is the maximum node in BT or if P
324 is null and BT is empty. */
326 bt_next (const struct bt *bt, const struct bt_node *p)
329 return bt_first (bt);
330 else if (p->down[1] == NULL)
333 for (q = p->up; ; p = q, q = q->up)
334 if (q == NULL || p == q->down[0])
340 while (p->down[0] != NULL)
342 return (struct bt_node *) p;
346 /* Returns the node in BT preceding P in in-order.
347 If P is null, returns the maximum node in BT.
348 Returns a null pointer if P is the minimum node in BT or if P
349 is null and BT is empty. */
351 bt_prev (const struct bt *bt, const struct bt_node *p)
355 else if (p->down[0] == NULL)
358 for (q = p->up; ; p = q, q = q->up)
359 if (q == NULL || p == q->down[1])
365 while (p->down[1] != NULL)
367 return (struct bt_node *) p;
371 /* Moves P around in BT to compensate for its key having
372 changed. Returns a null pointer if successful. If P's new
373 value is equal to that of some other node in BT, returns the
374 other node after removing P from BT.
376 This function is an optimization only if it is likely that P
377 can actually retain its relative position in BT, e.g. its key
378 has only been adjusted slightly. Otherwise, it is more
379 efficient to simply remove P from BT, change its key, and
382 It is not safe to update more than one node's key, then to
383 call this function for each node. Instead, update a single
384 node's key, call this function, update another node's key, and
385 so on. Alternatively, remove all affected nodes from the
386 tree, update their keys, then re-insert all of them. */
388 bt_changed (struct bt *bt, struct bt_node *p)
390 struct bt_node *prev = bt_prev (bt, p);
391 struct bt_node *next = bt_next (bt, p);
393 if ((prev != NULL && bt->compare (prev, p, bt->aux) >= 0)
394 || (next != NULL && bt->compare (p, next, bt->aux) >= 0))
397 return bt_insert (bt, p);
402 /* BT nodes may be moved around in memory as necessary, e.g. as
403 the result of an realloc operation on a block that contains a
404 node. Once this is done, call this function passing node P
405 that was moved and its BT before attempting any other
408 It is not safe to move more than one node, then to call this
409 function for each node. Instead, move a single node, call
410 this function, move another node, and so on. Alternatively,
411 remove all affected nodes from the tree, move them, then
412 re-insert all of them. */
414 bt_moved (struct bt *bt, struct bt_node *p)
418 int d = p->up->down[0] == NULL || bt->compare (p, p->up, bt->aux) > 0;
423 if (p->down[0] != NULL)
425 if (p->down[1] != NULL)
431 This algorithm is from Q. F. Stout and B. L. Warren, "Tree
432 Rebalancing in Optimal Time and Space", CACM 29(1986):9,
433 pp. 902-908. It uses O(N) time and O(1) space to rebalance a
434 subtree that contains N nodes. */
436 static void tree_to_vine (struct bt_node **);
437 static void vine_to_tree (struct bt_node **, size_t count);
439 /* Rebalances the subtree in BT rooted at SUBTREE, which contains
440 exactly COUNT nodes. */
442 rebalance_subtree (struct bt *bt, struct bt_node *subtree, size_t count)
444 struct bt_node *up = subtree->up;
445 struct bt_node **q = down_link (bt, subtree);
447 vine_to_tree (q, count);
451 /* Converts the subtree rooted at *Q into a vine (a binary search
452 tree in which all the right links are null), and updates *Q to
453 point to the new root of the subtree. */
455 tree_to_vine (struct bt_node **q)
457 struct bt_node *p = *q;
459 if (p->down[1] == NULL)
466 struct bt_node *r = p->down[1];
467 p->down[1] = r->down[0];
474 /* Performs a compression transformation COUNT times, starting at
475 *Q, and updates *Q to point to the new root of the subtree. */
477 compress (struct bt_node **q, unsigned long count)
481 struct bt_node *red = *q;
482 struct bt_node *black = red->down[0];
485 red->down[0] = black->down[1];
486 black->down[1] = red;
488 if (red->down[0] != NULL)
489 red->down[0]->up = red;
494 /* Converts the vine rooted at *Q, which contains exactly COUNT
495 nodes, into a balanced tree, and updates *Q to point to the
496 new root of the balanced tree. */
498 vine_to_tree (struct bt_node **q, size_t count)
500 size_t leaf_nodes = count + 1 - ((size_t) 1 << floor_log2 (count + 1));
501 size_t vine_nodes = count - leaf_nodes;
503 compress (q, leaf_nodes);
504 while (vine_nodes > 1)
507 compress (q, vine_nodes);
509 while ((*q)->down[0] != NULL)
511 (*q)->down[0]->up = *q;
516 /* Other binary tree helper functions. */
518 /* Returns the address of the pointer that points down to P
520 static struct bt_node **
521 down_link (struct bt *bt, struct bt_node *p)
523 struct bt_node *q = p->up;
524 return q != NULL ? &q->down[q->down[0] != p] : &bt->root;
527 /* Returns node P's sibling; that is, the other child of its
528 parent. P must not be the root. */
529 static inline struct bt_node *
530 sibling (struct bt_node *p)
532 struct bt_node *q = p->up;
533 return q->down[q->down[0] == p];
536 /* Returns the number of nodes in the given SUBTREE. */
538 count_nodes_in_subtree (const struct bt_node *subtree)
540 /* This is an in-order traversal modified to iterate only the
545 const struct bt_node *p = subtree;
546 while (p->down[0] != NULL)
551 if (p->down[1] != NULL)
554 while (p->down[0] != NULL)
561 const struct bt_node *q;
578 /* Returns the number of high-order 0-bits in X.
579 Undefined if X is zero. */
581 count_leading_zeros (size_t x)
584 #if SIZE_MAX > ULONG_MAX
585 return __builtin_clzll (x);
586 #elif SIZE_MAX > UINT_MAX
587 return __builtin_clzl (x);
589 return __builtin_clz (x);
592 /* This algorithm is from _Hacker's Delight_ section 5.3. */
596 #define COUNT_STEP(BITS) \
604 n = sizeof (size_t) * CHAR_BIT;
605 #if SIZE_MAX >> 31 >> 31 >> 2
608 #if SIZE_MAX >> 31 >> 1
616 return y != 0 ? n - 2 : n - x;
620 /* Returns floor(log2(x)).
621 Undefined if X is zero. */
623 floor_log2 (size_t x)
625 return sizeof (size_t) * CHAR_BIT - 1 - count_leading_zeros (x);
628 /* Returns floor(pow(sqrt(2), x * 2 + 1)).
629 Defined for X from 0 up to the number of bits in size_t minus
634 /* These constants are sqrt(2) multiplied by 2**63 or 2**31,
635 respectively, and then rounded to nearest. */
636 #if SIZE_MAX >> 31 >> 1
637 return (UINT64_C(0xb504f333f9de6484) >> (63 - x)) + 1;
639 return (0xb504f334 >> (31 - x)) + 1;
643 /* Returns floor(log(n)/log(sqrt(2))).
644 Undefined if N is 0. */
646 calculate_h_alpha (size_t n)
648 int log2 = floor_log2 (n);
650 /* The correct answer is either 2 * log2 or one more. So we
651 see if n >= pow(sqrt(2), 2 * log2 + 1) and if so, add 1. */
652 return (2 * log2) + (n >= pow_sqrt2 (log2));