-
Notifications
You must be signed in to change notification settings - Fork 60
/
Copy pathbtree.rs
4355 lines (3956 loc) · 188 KB
/
btree.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
//! Disk BTree data structure implementation.
//!
//! See [`BTree`] for details.
use std::{
cmp::{min, Ordering, Reverse},
collections::{BinaryHeap, HashSet, VecDeque},
io::{self, Read, Seek, Write},
mem,
};
use super::{
page::{Cell, OverflowPage, Page, SlotId},
tuple::{byte_length_of_integer_type, utf8_length_prefix_bytes},
};
use crate::{
paging::{
io::FileOps,
pager::{PageNumber, Pager},
},
sql::statement::DataType,
};
/// [`BTree`] key comparator. Entries are stored in binary, so we need a way to
/// determine the correct [`Ordering`]. Whenever two entries need to be
/// compared, the [`BTree`] will call [`BytesCmp::bytes_cmp`] passing both
/// binary slices as parameters. For example, suppose we need to compare the
/// following two slices:
///
/// ```ignore
/// let A = [1, 0, 0, 0];
/// let B = [2, 0, 0, 0, 0, 0, 0, 0];
/// ```
///
/// The [`BTree`] will just call the comparator function with `A` and `B`, it
/// is up to the implementer to determine the [`Ordering`]. The format of `A`
/// and `B` is also defined by the [`BTree`] user, since the [`BTree`] only
/// receives binary buffers as parameters.
///
/// We provide a couple of reusable comparators in this module:
/// - [`FixedSizeMemCmp`]
/// - [`StringCmp`]
pub(crate) trait BytesCmp {
/// Compares two byte arrays and returns the corresponding [`Ordering`].
///
/// At the [`BTree`] level we don't care how that is done. Upper levels can
/// parse the binary to obtain an [`Ordering`] instance, they can store
/// the entry in such a format that they can tell the [`Ordering`] by
/// looking at the bytes, or they can do anything else, we don't care here.
fn bytes_cmp(&self, a: &[u8], b: &[u8]) -> Ordering;
}
/// Compares the first `self.0` number of bytes using the good old `memcmp`.
/// This is more useful than it seems at first glance because if we store
/// integer keys at the beginning of the binary buffer in big endian format,
/// then this is all we need to successfully determine the [`Ordering`].
#[derive(Debug, Clone, Copy, PartialEq)]
pub(crate) struct FixedSizeMemCmp(pub usize);
impl FixedSizeMemCmp {
/// Creates a comparator for a certain type.
///
/// ```ignore
/// let comparator = FixedSizeMemCmp::for_type::<u32>();
///
/// let a = [1, 0, 0, 0, 0, 0, 0];
/// let b = [2, 0, 0, 0];
///
/// assert_eq!(comparator.bytes_cmp(a, b), std::cmp::Ordering::Less);
/// ```
///
/// Using this with structs or other complex data types probably doesn't
/// make sense.
pub fn for_type<T>() -> Self {
Self(mem::size_of::<T>())
}
}
impl BytesCmp for FixedSizeMemCmp {
fn bytes_cmp(&self, a: &[u8], b: &[u8]) -> Ordering {
a[..self.0].cmp(&b[..self.0])
}
}
impl TryFrom<&DataType> for FixedSizeMemCmp {
type Error = ();
fn try_from(data_type: &DataType) -> Result<Self, Self::Error> {
match data_type {
DataType::Varchar(_) | DataType::Bool => Err(()),
fixed => Ok(Self(byte_length_of_integer_type(fixed))),
}
}
}
/// Compares UTF-8 strings.
///
/// Assumes that the buffers have this format:
///
/// ```text
/// self.0 bytes
/// +---------+
/// V V
/// +---------+--------+--------+--------+-------+
/// | STR LEN | BYTE 0 | BYTE 1 | BYTE 2 | ... |
/// +---------+--------+--------+--------+-------+
/// ^ ^
/// +--------------------------+
/// STR LEN amount of string bytes
/// ```
///
/// Then computes the total length of the string in bytes by reading the first
/// `self.0` bytes as a little endian integer and once the total length is known
/// [`str`] instances can be created.
#[derive(Debug, Clone, Copy, PartialEq)]
pub(crate) struct StringCmp(pub usize);
impl BytesCmp for StringCmp {
fn bytes_cmp(&self, a: &[u8], b: &[u8]) -> Ordering {
debug_assert!(
self.0 <= 4,
"strings longer than {} bytes are not supported",
u32::MAX
);
let mut buf = [0; std::mem::size_of::<usize>()];
buf[..self.0].copy_from_slice(&a[..self.0]);
let len_a = usize::from_le_bytes(buf);
buf.fill(0);
buf[..self.0].copy_from_slice(&b[..self.0]);
let len_b = usize::from_le_bytes(buf);
// TODO: Not 100% sure if unwrap() can actually panic here. When we
// insert data we already have a valid [`String`] instance which is
// supposed to be UTF-8 encoded and we call String::as_bytes() to
// serialize it into binary. If unwrap() can't panic then we should
// use the unchecked version of from_utf8() that doesn't loop through
// the entire string to check that all bytes are valid UTF-8.
std::str::from_utf8(&a[self.0..self.0 + len_a])
.unwrap()
.cmp(std::str::from_utf8(&b[self.0..self.0 + len_b]).unwrap())
}
}
impl From<&DataType> for Box<dyn BytesCmp> {
/// Easy way to obtain a [`BytesCmp`] impl at runtime based on SQL data
/// types.
fn from(data_type: &DataType) -> Self {
match data_type {
DataType::Varchar(max_characters) => {
Box::new(StringCmp(utf8_length_prefix_bytes(*max_characters)))
}
fixed => Box::new(FixedSizeMemCmp(byte_length_of_integer_type(fixed))),
}
}
}
impl BytesCmp for &Box<dyn BytesCmp> {
fn bytes_cmp(&self, a: &[u8], b: &[u8]) -> Ordering {
self.as_ref().bytes_cmp(a, b)
}
}
impl BytesCmp for Box<dyn BytesCmp> {
fn bytes_cmp(&self, a: &[u8], b: &[u8]) -> Ordering {
self.as_ref().bytes_cmp(a, b)
}
}
/// [`Box<dyn BytesCmp>`] kinda sucks.
///
/// Jump table based dynamic dispatch for the win. No allocations, easy
/// [`Debug`] impl, [`Copy`], etc.
#[derive(Debug, Clone, Copy, PartialEq)]
pub(crate) enum BTreeKeyComparator {
MemCmp(FixedSizeMemCmp),
StrCmp(StringCmp),
}
impl From<&DataType> for BTreeKeyComparator {
fn from(data_type: &DataType) -> Self {
match data_type {
DataType::Varchar(max_characters) => {
Self::StrCmp(StringCmp(utf8_length_prefix_bytes(*max_characters)))
}
fixed => Self::MemCmp(FixedSizeMemCmp(byte_length_of_integer_type(fixed))),
}
}
}
impl BytesCmp for BTreeKeyComparator {
fn bytes_cmp(&self, a: &[u8], b: &[u8]) -> Ordering {
match self {
Self::MemCmp(mem_cmp) => mem_cmp.bytes_cmp(a, b),
Self::StrCmp(str_cmp) => str_cmp.bytes_cmp(a, b),
}
}
}
/// The result of a search in the [`BTree`] structure.
pub(crate) struct Search {
/// Page number of the node where the search ended.
pub page: PageNumber,
/// Contains an [`Ok`] value with the index where the search matched or an
/// [`Err`] value with the index where the searched key should be located.
pub index: Result<u16, u16>,
}
/// The result of a remove operation on the BTree.
struct Removal {
/// The removed cell.
cell: Box<Cell>,
/// Page number of the leaf node where a substitute was found.
leaf_node: PageNumber,
/// If the removal ocurred on an internal node, then this is its page number.
internal_node: Option<PageNumber>,
}
/// Stores the page number and the index in the parent page entries that points
/// back to this node.
///
/// See [`BTree::load_siblings`] and [`BTree::balance`] for details.
#[derive(Debug, Clone, Copy)]
struct Sibling {
/// Page number of this sibling node.
page: PageNumber,
/// Index of the cell that points to this node in the parent.
index: SlotId,
}
impl Sibling {
fn new(page: PageNumber, index: u16) -> Self {
Self { page, index }
}
}
/// Used to search either the minimum or maximum key of a subtree.
///
/// See [`BTree::search_leaf_key`] for details.
enum LeafKeySearch {
/// Maximum key in a leaf node.
Max,
/// Minimum key in a leaf node.
Min,
}
/// Used for reading payloads stored in the BTree.
///
/// Most of the times the BTree can simply return a reference, but large
/// payloads require reassembly to construct a buffer with contiguous data.
#[derive(Debug, PartialEq)]
pub(crate) enum Payload<'s> {
/// Payload did not need reassembly, so we returned a reference to the
/// slotted page buffer.
PageRef(&'s [u8]),
/// Payload was too large and needed reassembly.
Reassembled(Box<[u8]>),
}
impl<'s> AsRef<[u8]> for Payload<'s> {
fn as_ref(&self) -> &[u8] {
match self {
Self::PageRef(reference) => reference,
Self::Reassembled(boxed) => boxed,
}
}
}
/// B*-Tree implementation inspired by "Art of Computer Programming Volume 3:
/// Sorting and Searching" and SQLite 2.X.X
///
/// # About BTrees
///
/// BTrees are a family of tree data structures that always maintain their
/// balance. Unlike binary trees, which can be turned into a linked list by
/// inserting keys in sequential order like this:
///
/// ```text
/// +---+
/// | 1 |
/// +---+
/// \
/// +---+
/// | 2 |
/// +---+
/// \
/// +---+
/// | 3 |
/// +---+
/// ```
///
/// BTrees will never lose their O(log n) search time, because they cannot
/// be turned into linked lists. This is how a BTree looks like:
///
/// ```text
/// +--------+ Root Node
/// +-----------------| 11 |--------------+
/// / +--------+ \
/// / \
/// +-------+ +----------+ Internal Node
/// +----| 4,8 |----+ +------------| 15,19,23 |------------+
/// / +-------+ \ / +----------+ \
/// / | \ / / \ \
/// +-------+ +-------+ +-------+ +----------+ +----------+ +----------+ +----------+
/// | 1,2,3 | | 5,6,7 | | 9,10 | | 12,13,14 | | 16,17,18 | | 20,21,22 | | 24,25,26 |
/// +-------+ +-------+ +-------+ +----------+ +----------+ +----------+ +----------+
/// Leaf Node
/// ```
///
/// The first difference that can be spotted is the number of keys stored in
/// each node. BTrees are generalization of binary trees, so they can store N
/// keys per node. This greatly reduces the overall height of the tree. The
/// more keys we store, the slower the tree grows in size. Second difference you
/// can spot here is that even though the tree is storing sequential keys, it's
/// still a tree, not a linked list. All this is thanks to [`BTree::balance`].
///
/// Another important distinction is that BTrees are better suited for disk
/// storage, since storage hardware is usually divided in blocks or pages of
/// N bytes each, and we can essentially store one BTree node per page.
///
/// The BTree stores variable size binary keys, but if all the keys have the
/// same size then some nice properties apply.
///
/// **Terminology**:
///
/// `order`: Maximum number of children per node (except root).
///
/// **Properties**:
///
/// - Max children: `order`
/// - Max keys: `order - 1`
///
/// The lower bounds depend on the type of BTree. For normal BTrees, the
/// formulas are:
///
/// - Min children: `(order / 2)` (except root)
/// - Min keys: `(order / 2) - 1` (except root)
///
/// This would keep nodes at least 50% full. There's a variant of the BTree
/// called B*-Tree which attempts to keep nodes 66% (two thirds) full. In that
/// case, the lower bounds would be:
///
/// - Min children: `order * 2 / 3` (except root)
/// - Min keys: `order * 2 / 3 - 1` (except root)
///
/// The algorithms for this kind of BTree are described in "Art of Computer
/// Programming Volume 3: Sorting and Searching". However, since we're storing
/// variable length data, none of this applies. It's important to know for the
/// tests because most of the tests use fixed size data, but databases usually
/// store variable length data.
///
/// In this implementation of the BTree, we attempt to keep nodes above 50%
/// capacity, but we don't know exactly how much "above" 50% because, as
/// mentioned earlier, we store variable length data.
///
/// See the documentation of the [`BTree`] struct methods for more details on
/// the algorithms used.
///
/// # Structure
///
/// The [`BTree`] is made of [`Page`] instances that are stored on disk and
/// cached in memory as required. Each [`Page`] stores a list of [`Cell`]
/// instances using a slot array. Cells are the smallest unit of data that we
/// work with, each cell stores a key (or key-value pair, we don't care) and a
/// pointer to the node that contains keys less than the one stored in the
/// cell. When a [`Cell`] is too big it points to a linked list of
/// [`OverflowPage`] instances. Overall, this is how the BTree really looks
/// like:
///
/// ```text
/// BTREE +---------------+
/// PAGE | |
/// +-------------+----|---------------V----------+
/// | +----+----+ | +----+ +----+---+ |
/// | | RC | NS | | | O1 | -> FS <- | LC | D | |
/// | +----+----+ | +----+ +----+---+ |
/// +---|---------+----------------------|--------+
/// | |
/// +---------------|--------------------------------+
/// | |
/// | +-----------------------------------------------+
/// | |
/// PAGE HEADER SLOT ARRAY V CELLS V
/// +-------------+-----------------------------------------------+ +-------------+-------------------------------------+
/// | +----+----+ | +----+----+ +----+---+ +----+---+ | | +----+----+ | +----+ +----+---+-----+ |
/// | | RC | NS | | | O1 | O2 | -> FS <- | LC | D | | LC | D | | | | RC | NS | | | O1 | -> FS <- | LC | D | OVF | |
/// | +----+----+ | +----+----+ +----+---+ +----+---+ | | +----+----+ | +----+ +----+---+-----+ |
/// +-------------+---|----|--------------------------------------+ +-------------+---|----------------------------|----+
/// | | ^ ^ | ^ |
/// | | | | | | |
/// | +----------------+ | +----------------+ |
/// +--------------------------------+ |
/// +------------------------------------------+
/// |
/// OVERFLOW PAGE V
/// +------+-------------------------------------------+ +------+-------------------------------------------+
/// | NEXT | OVERFLOW PAYLOAD | | NEXT | OVERFLOW PAYLOAD |
/// +------+-------------------------------------------+ +------+-------------------------------------------+
/// ^ |
/// | |
/// +---------------------------------------------------------+
/// ```
///
/// Here's what everything stands for:
///
/// - RC: Right Child
/// - NS: Number of Slots
/// - ON: Offset N
/// - LC: Left Child
/// - FS: Free Space
/// - D: Data
/// - OVF: Overflow
///
/// Right child is stored in the page header and is always the "last child" of
/// a node. Or in other words, the child that has keys grater than any key in
/// the current node. Left child is stored in the cell header and points to
/// the child that has keys less than the one in the cell. For the slotted page
/// details and omitted fields see the documentation of [`super::page`] module.
pub(crate) struct BTree<'p, F, C> {
/// Root page.
root: PageNumber,
/// Pager instance.
///
/// TODO: Should be some owned value instead of a mutable reference but
/// [`std::cell::RefCell`] is anoying to use here because of recursion and
/// calls to functions that need the pager from other functions that also
/// need the pager. We can use it by dropping manually the
/// [`std::cell::RefMut`] instances whenever we know the next function call
/// will borrow the pager but that's just suboptimal. The pager should be
/// fully multithreaded and we could use an Arc or something here.
pager: &'p mut Pager<F>,
/// Bytes comparator used to obtain [`Ordering`] instances from binary data.
comparator: C,
/// Number of siblings to examine at each side when balancing a node.
///
/// See [`Self::load_siblings`] and [`Self::balance`].
balance_siblings_per_side: usize,
/// Forces pages to store at least this number of [`Cell`] instances.
minimum_keys: usize,
}
/// Default value for [`BTree::balance_siblings_per_side`].
pub(crate) const DEFAULT_BALANCE_SIBLINGS_PER_SIDE: usize = 1;
/// Default value for [`BTree::minimum_keys`].
pub(crate) const DEFAULT_MINIMUM_KEYS: usize = 4;
impl<'p, F, C: BytesCmp> BTree<'p, F, C> {
pub fn new(pager: &'p mut Pager<F>, root: PageNumber, comparator: C) -> Self {
Self {
pager,
root,
comparator,
balance_siblings_per_side: DEFAULT_BALANCE_SIBLINGS_PER_SIDE,
minimum_keys: DEFAULT_MINIMUM_KEYS,
}
}
}
impl<'p, F: Seek + Read + Write + FileOps, C: BytesCmp> BTree<'p, F, C> {
/// Returns the value corresponding to the key.
///
/// See [`Self::search`] for details.
pub fn get(&mut self, entry: &[u8]) -> io::Result<Option<Payload>> {
let search = self.search(self.root, entry, &mut Vec::new())?;
match search.index {
Err(_) => Ok(None),
Ok(index) => Ok(Some(reassemble_payload(self.pager, search.page, index)?)),
}
}
/// # BTree Search Algorithm
///
/// 1. Read the subtree root node into memory.
/// 2. Run a binary search on the entries to find the given key.
/// 3. If successful, return the [`Search`] result.
/// 4. If not, the binary search result will tell us which child to pick for
/// recursion.
///
/// # Example
///
/// Find key 9 in this tree, located at page 5:
///
/// ```text
/// PAGE 0
/// +--------+
/// +-------| 11 |-------+
/// / +--------+ \
/// / \
/// +-------+ PAGE 1 +--------+
/// +----| 4,8 |----+ | 14 |
/// / +-------+ \ +--------+
/// / | \ / \
/// +-------+ +-------+ +-------+ +-------+ +-------+
/// | 1,2,3 | | 5,6,7 | | 9,10 | | 12,13 | | 15,16 |
/// +-------+ +-------+ +-------+ +-------+ +-------+
/// PAGE 3 PAGE 4 PAGE 5
/// ```
/// ## First iteration
///
/// 1. Read page 0 into memory.
/// 2. Binary search on [`Page`] results in [`Err(0)`].
/// 3. Read index 0 using [`Page::child`] and recurse into the result.
///
/// ## Second iteration
///
/// 1. Read page 1 into memory.
/// 2. Binary search results in [`Err(3)`].
/// 3. Read child pointer at index 3 and recurse again.
///
/// ## Final iteration.
///
/// 1. Read page 5 into memory.
/// 2. Binary search results in [`Ok(0)`].
/// 3. Done, construct the [`Search`] result and return.
pub fn search(
&mut self,
page: PageNumber,
entry: &[u8],
parents: &mut Vec<PageNumber>,
) -> io::Result<Search> {
// Search key in this node.
let index = self.binary_search(page, entry)?;
let node = self.pager.get(page)?;
// We found the key or we're already at the bottom, stop recursion.
if index.is_ok() || node.is_leaf() {
return Ok(Search { page, index });
}
// No luck, keep recursing downwards.
parents.push(page);
let next_node = node.child(index.unwrap_err());
self.search(next_node, entry, parents)
}
/// Binary search with support for overflow data.
///
/// Returns an [`Ok`] result containing the index where `entry` was found or
/// an [`Err`] result containing the index where `entry` should have been
/// found if present.
fn binary_search(&mut self, page: PageNumber, entry: &[u8]) -> io::Result<Result<u16, u16>> {
let mut size = self.pager.get(page)?.len();
let mut left = 0;
let mut right = size;
while left < right {
let mid = left + size / 2;
let cell = self.pager.get(page)?.cell(mid);
let overflow_buf: Box<[u8]>;
// TODO: Figure out if we actually need to reassemble the payload.
// We could ask the comparator through the [`BytesCmp`] trait if
// it needs the entire buffer or not. When comparing strings for
// example, "a" is always less than "abcdefg", so there's no point
// in reassembling the entire string.
let payload = if cell.header.is_overflow {
match reassemble_payload(self.pager, page, mid)? {
Payload::Reassembled(buf) => overflow_buf = buf,
_ => unreachable!(),
}
&overflow_buf
} else {
&cell.content
};
match self.comparator.bytes_cmp(payload, entry) {
Ordering::Equal => return Ok(Ok(mid)),
Ordering::Greater => right = mid,
Ordering::Less => left = mid + 1,
}
size = right - left;
}
Ok(Err(left))
}
/// Inserts a new entry into the tree or updates the value if the key
/// already exists.
///
/// # Insertion algorithm
///
/// Entries are always inserted at leaf nodes. Internal nodes and the root
/// node can only grow in size when leaf nodes overflow and siblings can't
/// take any load to keep the leaves balanced, causing a split.
///
/// Let's walk through an example. Suppose we have the following [`BTree`]
/// of `order = 4`, which means each node can hold at maximum 3 keys and 4
/// children.
///
/// ```text
/// PAGE 0 (ROOT)
/// +-------+
/// +---| 3,6 |---+
/// / +-------+ \
/// / | \
/// +-------+ +-------+ +-------+
/// | 1,2 | | 4,5 | | 7,8 |
/// +-------+ +-------+ +-------+
/// PAGE 1 PAGE 2 PAGE 3
/// ```
///
/// Now let's say we want to insert key `9`. The insertion algorithm will
/// call [`Self::search`] to find the page and index where the new key
/// should be added, and it will simply insert the key:
///
/// ```text
/// PAGE 0 (ROOT)
/// +-------+
/// +---| 3,6 |---+
/// / +-------+ \
/// / | \
/// +-------+ +-------+ +---------+
/// | 1,2 | | 4,5 | | 7,8,9 |
/// +-------+ +-------+ +---------+
/// PAGE 1 PAGE 2 PAGE 3
/// ```
///
/// Now page 3 has reached the maximum number of keys allowed. We are not
/// going to do anything special yet, the insertion algorithm per se does
/// not care about overflowing, it will always insert keys in the node where
/// they should belong as returned by [`Self::search`].
///
/// Therefore, when the caller wants to insert key `10` the algorithm will
/// do this:
///
/// ```text
/// PAGE 0 (ROOT)
/// +-------+
/// +---| 3,6 |---+
/// / +-------+ \
/// / | \
/// +-------+ +-------+ +----------+
/// | 1,2 | | 4,5 | | 7,8,9,10 |
/// +-------+ +-------+ +----------+
/// PAGE 1 PAGE 2 PAGE 3
/// ```
///
/// Note that this leaves the tree in an unbalanced state and since page 3
/// has overflowed it cannot be saved to disk yet, because the bytes may not
/// fit in the current page size.
///
/// However, before finishing and returning an [`Ok`] result, we call
/// [`Self::balance`], which does all the work needed to maintain the tree
/// in a correct and balanced state. See [`Self::balance`] for details on
/// rearranging keys and splitting nodes after inserts.
pub fn insert(&mut self, entry: Vec<u8>) -> io::Result<()> {
let mut parents = Vec::new();
let search = self.search(self.root, &entry, &mut parents)?;
let mut new_cell = self.alloc_cell(entry)?;
let node = self.pager.get_mut(search.page)?;
match search.index {
// Key found, swap value.
Ok(index) => {
new_cell.header.left_child = node.cell(index).header.left_child;
let old_cell = node.replace(index, new_cell);
free_cell(self.pager, old_cell)?;
}
// Key not found, insert new entry.
Err(index) => node.insert(index, new_cell),
};
self.balance(search.page, &mut parents)
}
/// Same as [`Self::insert`] but doesn't update the key if it already
/// exists.
///
/// It returns its location as an error instead.
pub fn try_insert(&mut self, entry: Vec<u8>) -> io::Result<Result<(), Search>> {
let mut parents = Vec::new();
let search = self.search(self.root, &entry, &mut parents)?;
let Err(index) = search.index else {
return Ok(Err(search));
};
let cell = self.alloc_cell(entry)?;
self.pager.get_mut(search.page)?.insert(index, cell);
self.balance(search.page, &mut parents)?;
Ok(Ok(()))
}
/// Removes the entry corresponding to the given key if it exists.
///
/// # Deletion algorithm
///
/// There are 2 major cases to consider. First, the simplest case, deleting
/// from leaf nodes. Suppose we have the following [`BTree`] of order 4 and
/// we want to delete key `13`:
///
/// ```text
/// PAGE 0
/// +--------+
/// +-------| 4,8,12 |----------+
/// / +--------+ \
/// / / \ \
/// +-------+ +-------+ +---------+ +----------+
/// | 1,2,3 | | 5,6,7 | | 9,10,11 | | 13,14,15 |
/// +-------+ +-------+ +---------+ +----------+
/// PAGE 1 PAGE 2 PAGE 3 PAGE 4
/// ```
///
/// The deletion algorithm will simply remove key 13 from page 4 and it
/// won't care whether the node has underflowed or not:
///
/// ```text
/// PAGE 0
/// +--------+
/// +-------| 4,8,12 |----------+
/// / +--------+ \
/// / / \ \
/// +-------+ +-------+ +---------+ +----------+
/// | 1,2,3 | | 5,6,7 | | 9,10,11 | | 14,15 |
/// +-------+ +-------+ +---------+ +----------+
/// PAGE 1 PAGE 2 PAGE 3 PAGE 4
/// ```
///
/// Later, when calling [`Self::balance`] before returning, underflows will
/// be handled properly.
///
/// The second case to consider is deleting from internal nodes or the root
/// node. The root node is not a special case, it is treated just like any
/// other internal node.
///
/// Suppose we have the following [`BTree`] of order 4 and we want to delete
/// key `11` located at page 0:
///
/// ```text
/// PAGE 0
/// +--------+
/// +-------| 11 |-------+
/// / +--------+ \
/// / \
/// +-------+ PAGE 1 +--------+
/// +----| 4,8 |----+ | 14 |
/// / +-------+ \ +--------+
/// / | \ / \
/// +-------+ +-------+ +-------+ +-------+ +-------+
/// | 1,2,3 | | 5,6,7 | | 9,10 | | 12,13 | | 15,16 |
/// +-------+ +-------+ +-------+ +-------+ +-------+
/// PAGE 3 PAGE 4 PAGE 5
/// ```
///
/// Deleting keys from internal nodes is not allowed because each key needs
/// a corresponding child to the left and one to the right. Therefore, we
/// have to find a key in one of the leaf nodes that can substitute the key
/// we want to delete.
///
/// There are two possible options for that:
/// - *Predecessor*: greatest key in the left subtree.
/// - *Successor*: smallest key in the right subtree.
///
/// Locate either the predecessor or successor and move it up to the current
/// node. The condition for choosing one or another is based on how many
/// children the left child has VS how many children the right child has.
///
/// We'll choose the option with more children since there's less
/// probabilities of merging nodes. See [`Self::balance`] for details on
/// merging. In this case, the left child of page 0 has 2 children while
/// the right child has only 1, so we'll pick the predecessor of key 11,
/// which is key 10 located at page 5. The end result is this:
///
/// ```text
/// PAGE 0
/// +--------+
/// +-------| 10 |-------+
/// / +--------+ \
/// / \
/// +-------+ PAGE 1 +--------+
/// +----| 4,8 |----+ | 14 |
/// / +-------+ \ +--------+
/// / | \ / \
/// +-------+ +-------+ +-------+ +-------+ +-------+
/// | 1,2,3 | | 5,6,7 | | 9 | | 12,13 | | 15,16 |
/// +-------+ +-------+ +-------+ +-------+ +-------+
/// PAGE 3 PAGE 4 PAGE 5
/// ```
///
/// As you can see, page 5 has underflowed below 50%. Underflows are
/// handled by [`Self::balance`]. See also [`Self::remove_entry`] for the
/// actual deletion code, this function is a wrapper that provides a public
/// API and calls [`Self::balance`] at the end.
pub fn remove(&mut self, entry: &[u8]) -> io::Result<Option<Box<Cell>>> {
let mut parents = Vec::new();
let Some(Removal {
cell,
leaf_node,
internal_node,
}) = self.remove_entry(entry, &mut parents)?
else {
return Ok(None);
};
self.balance(leaf_node, &mut parents)?;
// Since data is variable size it could happen that we peek a large
// substitute from a leaf node to replace a small key in an internal
// node, leaving the leaf node balanced and the internal node overflow.
if let Some(node) = internal_node {
// This algorithm is O(n) but the height of the tree grows
// logarithmically so there shoudn't be that many elements to search
// here.
if let Some(index) = parents.iter().position(|n| n == &node) {
parents.drain(index..);
self.balance(node, &mut parents)?;
}
}
Ok(Some(cell))
}
/// Finds the node where `key` is located and removes it from the page.
///
/// The removes [`Cell`] is replaced with either its predecessor or
/// successor in the case of internal nodes. [`Self::balance`] must be
/// called on the leaf node after this operation, and possibly on the
/// internal node if the leaf was balanced. See [`Self::remove`] for more
/// details.
fn remove_entry(
&mut self,
entry: &[u8],
parents: &mut Vec<PageNumber>,
) -> io::Result<Option<Removal>> {
let search = self.search(self.root, entry, parents)?;
let node = self.pager.get(search.page)?;
// Can't remove entry, key not found.
if search.index.is_err() {
return Ok(None);
}
let index = search.index.unwrap();
// Leaf node is the simplest case, remove key and pop off the stack.
if node.is_leaf() {
let cell = self.pager.get_mut(search.page)?.remove(index);
return Ok(Some(Removal {
cell,
leaf_node: search.page,
internal_node: None,
}));
}
// Root or internal nodes require additional work. We need to find a
// suitable substitute for the key in one of the leaf nodes. We'll
// pick either the predecessor (max key in the left subtree) or the
// successor (min key in the right subtree) of the key we want to
// delete.
let left_child = node.child(index);
let right_child = node.child(index + 1);
parents.push(search.page);
let (leaf_node, key_idx) =
if self.pager.get(left_child)?.len() >= self.pager.get(right_child)?.len() {
self.search_max_key(left_child, parents)?
} else {
self.search_min_key(right_child, parents)?
};
let mut substitute = self.pager.get_mut(leaf_node)?.remove(key_idx);
let node = self.pager.get_mut(search.page)?;
substitute.header.left_child = node.child(index);
let cell = node.replace(index, substitute);
Ok(Some(Removal {
cell,
leaf_node,
internal_node: Some(search.page),
}))
}
/// Traverses the tree all the way down to the leaf nodes, following the
/// path specified by [`LeafKeySearch`].
///
/// [`LeafKeySearch::Max`] will always choose the last child for recursion,
/// while [`LeafKeySearch::Min`] will always choose the first child. This
/// function is used to find successors or predecessors of keys in internal
/// nodes.
fn search_leaf_key(
&mut self,
page: PageNumber,
parents: &mut Vec<PageNumber>,
leaf_key_search: LeafKeySearch,
) -> io::Result<(PageNumber, u16)> {
let node = self.pager.get(page)?;
let (key_idx, child_idx) = match leaf_key_search {
LeafKeySearch::Min => (0, 0),
LeafKeySearch::Max => (node.len().saturating_sub(1), node.len()),
};
if node.is_leaf() {
return Ok((page, key_idx));
}
parents.push(page);
let child = node.child(child_idx);
self.search_leaf_key(child, parents, leaf_key_search)
}
/// Returns the page number and slot index in [`Page`] where the greatest
/// key of the given subtree is located.
fn search_max_key(
&mut self,
page: PageNumber,
parents: &mut Vec<PageNumber>,
) -> io::Result<(PageNumber, u16)> {
self.search_leaf_key(page, parents, LeafKeySearch::Max)
}
/// Returns the page number and slot index in [`Page`] where the smallest
/// key of the given subtree is located.
fn search_min_key(
&mut self,
page: PageNumber,
parents: &mut Vec<PageNumber>,
) -> io::Result<(PageNumber, u16)> {
self.search_leaf_key(page, parents, LeafKeySearch::Min)
}
/// Returns the greatest entry in this tree.
pub fn max(&mut self) -> io::Result<Option<Payload>> {
let (page, slot) = self.search_max_key(self.root, &mut Vec::new())?;
if self.pager.get(page)?.is_empty() {
return Ok(None);
}
reassemble_payload(self.pager, page, slot).map(Some)
}
/// B*-Tree balancing algorithm inspired by (or rather stolen from) SQLite
/// 2.X.X. Take a look at the original source code here:
///
/// <https://github.com/antoniosarosi/sqlite2-btree-visualizer/blob/master/src/btree.c#L2171>
///
/// # Algorithm Steps
///
/// This is how it works:
///
/// 1. Check if the current node has overflown or underflown. If not, early
/// return without doing anything.
///
/// 2. Check if the node is the root and has underflown. The root is only
/// considered underflow when it contains 0 cells and has one child:
///
/// ```text
/// +-------+
/// | | Empty root
/// +-------+
/// |
/// v
/// +-------+ Direct root child
/// +----| 4,8 |----+
/// / +-------+ \
/// / | \
/// +-------+ +-------+ +----------+
/// | 1,2,3 | | 5,6,7 | | 12,13,14 | Other pages below
/// +-------+ +-------+ +----------+
/// ```
///
/// In that case, move the child into the root decreasing tree height by
/// one, free the child page and return:
///
/// ```text
/// New root
/// +-------+
/// +----| 4,8 |----+
/// / +-------+ \
/// / | \
/// +-------+ +-------+ +----------+
/// | 1,2,3 | | 5,6,7 | | 12,13,14 | Other pages below
/// +-------+ +-------+ +----------+
/// ```
///