feat: canonicalize indices in take if sufficiently large (#1036)
Co-authored-by: Daniel King <[email protected]>
lwwmanning and danking authored Oct 15, 2024
1 parent 8054958 commit 0154623
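
In short, this commit adds a fast path to BitPackedArray's take: when the number of requested indices is large relative to the array, it decompresses (canonicalizes) the whole array once and delegates to the primitive take, instead of unpacking chunks on demand for each index. A minimal sketch of the cutoff, using a hypothetical free function rather than the actual TakeFn implementation shown in the diff below:

    // Hypothetical helper mirroring the check added in this commit.
    // With UNPACK_CHUNK_THRESHOLD = 8, an array of 10_240 elements switches to
    // the canonicalize-then-take path once more than 1_280 indices are requested.
    fn should_canonicalize(num_indices: usize, array_len: usize) -> bool {
        const UNPACK_CHUNK_THRESHOLD: usize = 8;
        num_indices * UNPACK_CHUNK_THRESHOLD > array_len
    }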
Showing 2 changed files with 50 additions and 10 deletions.
32 changes: 32 additions & 0 deletions encodings/fastlanes/benches/bitpacking_take.rs
@@ -50,6 +50,22 @@ fn bench_take(c: &mut Criterion) {
c.bench_function("take_10K_contiguous", |b| {
b.iter(|| black_box(take(packed.as_ref(), contiguous_indices.as_ref()).unwrap()));
});

let lots_of_indices: PrimitiveArray = (0..200_000)
.map(|i| (i * 42) % values.len() as u64)
.collect::<Vec<_>>()
.into();
c.bench_function("take_200K_dispersed", |b| {
b.iter(|| black_box(take(packed.as_ref(), lots_of_indices.as_ref()).unwrap()));
});

let lots_of_indices: PrimitiveArray = (0..200_000)
.map(|i| ((i * 42) % 1024) as u64)
.collect::<Vec<_>>()
.into();
c.bench_function("take_200K_first_chunk_only", |b| {
b.iter(|| black_box(take(packed.as_ref(), lots_of_indices.as_ref()).unwrap()));
});
}

fn bench_patched_take(c: &mut Criterion) {
@@ -112,6 +128,22 @@ fn bench_patched_take(c: &mut Criterion) {
         b.iter(|| black_box(take(packed.as_ref(), patch_indices.as_ref()).unwrap()));
     });
 
+    let lots_of_indices: PrimitiveArray = (0..200_000)
+        .map(|i| (i * 42) % values.len() as u64)
+        .collect::<Vec<_>>()
+        .into();
+    c.bench_function("patched_take_200K_dispersed", |b| {
+        b.iter(|| black_box(take(packed.as_ref(), lots_of_indices.as_ref()).unwrap()));
+    });
+
+    let lots_of_indices: PrimitiveArray = (0..200_000)
+        .map(|i| ((i * 42) % 1024) as u64)
+        .collect::<Vec<_>>()
+        .into();
+    c.bench_function("patched_take_200K_first_chunk_only", |b| {
+        b.iter(|| black_box(take(packed.as_ref(), lots_of_indices.as_ref()).unwrap()));
+    });
+
     // There are currently 2 magic parameters of note:
     // 1. the threshold at which sparse take will switch from search_sorted to map (currently 128)
     // 2. the threshold at which bitpacked take will switch from bulk patching to per chunk patching (currently 64)
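
For reference, the second threshold above is the new BULK_PATCH_THRESHOLD constant in take.rs below; the 128 threshold for sparse take is not part of this diff. Per-chunk patching only pays off when each touched chunk receives a reasonably large batch of indices, so a rough sketch of that decision, with an illustrative signature that is not the library's API, looks like:

    // Illustrative only: bulk patching wins when the touched chunks average
    // fewer than ~64 requested indices each.
    fn prefer_bulk_patch(chunks_touched: usize, num_indices: usize) -> bool {
        chunks_touched * 64 > num_indices
    }

The new *_200K_dispersed and *_200K_first_chunk_only benchmarks above stress the case where the index count is large relative to the array, which is what the canonicalization cutoff in take.rs targets; assuming the Criterion bench target takes its name from the file, something like cargo bench --bench bitpacking_take should run them.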
28 changes: 18 additions & 10 deletions encodings/fastlanes/src/bitpacking/compute/take.rs
@@ -4,16 +4,31 @@ use fastlanes::BitPacking;
 use itertools::Itertools;
 use vortex::array::{PrimitiveArray, SparseArray};
 use vortex::compute::{slice, take, TakeFn};
-use vortex::{Array, ArrayDType, IntoArray, IntoArrayVariant};
+use vortex::{Array, ArrayDType, IntoArray, IntoArrayVariant, IntoCanonical};
 use vortex_dtype::{
     match_each_integer_ptype, match_each_unsigned_integer_ptype, NativePType, PType,
 };
 use vortex_error::VortexResult;
 
 use crate::{unpack_single_primitive, BitPackedArray};
 
+// assuming the buffer is already allocated (which will happen at most once) then unpacking
+// all 1024 elements takes ~8.8x as long as unpacking a single element on an M2 Macbook Air.
+// see https://github.com/spiraldb/vortex/pull/190#issue-2223752833
+const UNPACK_CHUNK_THRESHOLD: usize = 8;
+const BULK_PATCH_THRESHOLD: usize = 64;
+
 impl TakeFn for BitPackedArray {
     fn take(&self, indices: &Array) -> VortexResult<Array> {
+        // If the indices are large enough, it's faster to flatten and take the primitive array.
+        if indices.len() * UNPACK_CHUNK_THRESHOLD > self.len() {
+            return self
+                .clone()
+                .into_canonical()?
+                .into_primitive()?
+                .take(indices);
+        }
+
         let ptype: PType = self.dtype().try_into()?;
         let validity = self.validity();
         let taken_validity = validity.take(indices)?;
@@ -51,21 +66,14 @@ fn take_primitive<T: NativePType + BitPacking>(
     // if we have a small number of relatively large batches, we gain by slicing and then patching inside the loop
     // if we have a large number of relatively small batches, the overhead isn't worth it, and we're better off with a bulk patch
     // roughly, if we have an average of less than 64 elements per batch, we prefer bulk patching
-    let prefer_bulk_patch = relative_indices.len() * 64 > indices.len();
-
-    // assuming the buffer is already allocated (which will happen at most once)
-    // then unpacking all 1024 elements takes ~8.8x as long as unpacking a single element
-    // see https://github.com/fulcrum-so/vortex/pull/190#issue-2223752833
-    // however, the gap should be smaller with larger registers (e.g., AVX-512) vs the 128 bit
-    // ones on M2 Macbook Air.
-    let unpack_chunk_threshold = 8;
+    let prefer_bulk_patch = relative_indices.len() * BULK_PATCH_THRESHOLD > indices.len();
 
     let mut output = Vec::with_capacity(indices.len());
     let mut unpacked = [T::zero(); 1024];
     for (chunk, offsets) in relative_indices {
         let chunk_size = 128 * bit_width / size_of::<T>();
         let packed_chunk = &packed[chunk * chunk_size..][..chunk_size];
-        if offsets.len() > unpack_chunk_threshold {
+        if offsets.len() > UNPACK_CHUNK_THRESHOLD {
             unsafe {
                 BitPacking::unchecked_unpack(bit_width, packed_chunk, &mut unpacked);
             }
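The per-chunk branch at the end of the diff applies the same reasoning as the new top-level cutoff: per the comment above, unpacking a full 1024-element chunk costs roughly 8.8x a single-element unpack on an M2 MacBook Air, so bulk-unpacking breaks even at around 8 to 9 requested elements per chunk. A condensed sketch of that inner decision, with simplified types (u32 values and closures standing in for the packed-buffer access and the real generics):

    const UNPACK_CHUNK_THRESHOLD: usize = 8;

    // Condensed sketch of take_primitive's inner loop: either bulk-unpack the
    // whole 1024-element chunk and gather from it, or unpack each requested
    // position individually when there are only a few hits in the chunk.
    fn gather_from_chunk(
        unpack_all: impl Fn() -> [u32; 1024],
        unpack_one: impl Fn(usize) -> u32,
        offsets: &[usize],
        out: &mut Vec<u32>,
    ) {
        if offsets.len() > UNPACK_CHUNK_THRESHOLD {
            let unpacked = unpack_all();
            out.extend(offsets.iter().map(|&o| unpacked[o]));
        } else {
            out.extend(offsets.iter().map(|&o| unpack_one(o)));
        }
    }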
