Skip to content

Commit

Permalink
fix: support non-Primitive encodings for views (#1123)
Browse files Browse the repository at this point in the history
Fix issue that would pop up when canonicalizing a Null
ConstantArray<utf8> into VarBinView. We assume that we can immediately
slice the views array. Now we handle the case where the views array is
compressed (in this case, as a ConstantArray(0u8)).
  • Loading branch information
a10y authored Oct 23, 2024
1 parent 3b1079d commit 3277e18
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 27 deletions.
2 changes: 1 addition & 1 deletion vortex-array/src/array/chunked/canonical.rs
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ fn pack_views(
buffers.push(canonical_buffer);
}

for view in canonical_chunk.view_slice() {
for view in canonical_chunk.binary_views()? {
if view.is_inlined() {
// Inlined views can be copied directly into the output
views.push(view.as_u128());
Expand Down
4 changes: 2 additions & 2 deletions vortex-array/src/array/varbin/flatten.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,14 @@ mod test {
assert!(!canonical.is_valid(1));

// First value is inlined (12 bytes)
assert!(canonical.view_at(2).is_inlined());
assert!(canonical.view_at(2).unwrap().is_inlined());
assert_eq!(
canonical.bytes_at(2).unwrap().as_slice(),
"123456789012".as_bytes()
);

// Second value is not inlined (13 bytes)
assert!(!canonical.view_at(3).is_inlined());
assert!(!canonical.view_at(3).unwrap().is_inlined());
assert_eq!(
canonical.bytes_at(3).unwrap().as_slice(),
"1234567890123".as_bytes()
Expand Down
3 changes: 2 additions & 1 deletion vortex-array/src/array/varbinview/accessor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use vortex_error::VortexResult;
use crate::accessor::ArrayAccessor;
use crate::array::primitive::PrimitiveArray;
use crate::array::varbinview::VarBinViewArray;
use crate::array::BinaryView;
use crate::validity::ArrayValidity;
use crate::IntoCanonical;

Expand All @@ -12,10 +13,10 @@ impl ArrayAccessor<[u8]> for VarBinViewArray {
&self,
f: F,
) -> VortexResult<R> {
let views = self.view_slice();
let bytes: Vec<PrimitiveArray> = (0..self.metadata().buffer_lens.len())
.map(|i| self.buffer(i).into_canonical()?.into_primitive())
.try_collect()?;
let views: Vec<BinaryView> = self.binary_views()?.collect();
let validity = self.logical_validity().to_null_buffer()?;

match validity {
Expand Down
74 changes: 51 additions & 23 deletions vortex-array/src/array/varbinview/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use std::fmt::{Debug, Display, Formatter};
use std::slice;
use std::sync::Arc;

use ::serde::{Deserialize, Serialize};
Expand All @@ -9,11 +8,11 @@ use arrow_array::{ArrayRef, BinaryViewArray, GenericByteViewArray, StringViewArr
use arrow_buffer::ScalarBuffer;
use itertools::Itertools;
use static_assertions::{assert_eq_align, assert_eq_size};
use vortex_buffer::Buffer;
use vortex_dtype::{DType, PType};
use vortex_error::{vortex_bail, vortex_err, vortex_panic, VortexExpect, VortexResult};

use crate::array::visitor::{AcceptArrayVisitor, ArrayVisitor};
use crate::array::PrimitiveArray;
use crate::arrow::FromArrowArray;
use crate::compute::slice;
use crate::encoding::ids;
Expand Down Expand Up @@ -96,7 +95,7 @@ impl Ref {
pub union BinaryView {
// Numeric representation. This is logically `u128`, but we split it into the high and low
// bits to preserve the alignment.
num: [u64; 2],
le_bytes: [u8; 16],

// Inlined representation: strings <= 12 bytes
inlined: Inlined,
Expand Down Expand Up @@ -156,13 +155,8 @@ impl BinaryView {
}

pub fn as_u128(&self) -> u128 {
let mut tmp = 0u128;
unsafe {
tmp |= self.num[0] as u128;
tmp |= (self.num[1] as u128) << 64;
}

tmp
// SAFETY: binary view always safe to read as u128 LE bytes
unsafe { u128::from_le_bytes(self.le_bytes) }
}
}

Expand Down Expand Up @@ -276,20 +270,29 @@ impl VarBinViewArray {
///
/// This is useful for iteration over the values, as well as for applying filters that may
/// only require hitting the prefixes or inline strings.
pub fn view_slice(&self) -> &[BinaryView] {
unsafe {
slice::from_raw_parts(
PrimitiveArray::try_from(self.views())
.vortex_expect("Views must be a primitive array")
.maybe_null_slice::<u8>()
.as_ptr() as _,
self.views().len() / VIEW_SIZE_BYTES,
)
}
pub fn binary_views(&self) -> VortexResult<Views> {
Ok(Views {
inner: self.views().into_primitive()?.into_buffer(),
idx: 0,
len: self.len(),
})
}

pub fn view_at(&self, index: usize) -> BinaryView {
self.view_slice()[index]
/// Retrieve a binary view at a specific index.
///
/// If you will be calling this method many times, it's recommended that you instead use the
/// iterator provided by [`binary_views`][Self::binary_views].
pub fn view_at(&self, index: usize) -> VortexResult<BinaryView> {
let start = index * VIEW_SIZE_BYTES;
let stop = (index + 1) * VIEW_SIZE_BYTES;
let view_bytes = slice(self.views(), start, stop)?
.into_primitive()?
.into_buffer();

let mut le_bytes: [u8; 16] = [0u8; 16];
le_bytes.copy_from_slice(view_bytes.as_slice());

Ok(BinaryView { le_bytes })
}

/// Access to the primitive views array.
Expand Down Expand Up @@ -436,7 +439,7 @@ impl VarBinViewArray {

// TODO(aduffy): do we really need to do this with copying?
pub fn bytes_at(&self, index: usize) -> VortexResult<Vec<u8>> {
let view = self.view_at(index);
let view = self.view_at(index)?;
// Expect this to be the common case: strings > 12 bytes.
if !view.is_inlined() {
let view_ref = view.as_view();
Expand All @@ -454,6 +457,31 @@ impl VarBinViewArray {
}
}

pub struct Views {
inner: Buffer,
len: usize,
idx: usize,
}

impl Iterator for Views {
type Item = BinaryView;

fn next(&mut self) -> Option<Self::Item> {
if self.idx >= self.len {
None
} else {
// SAFETY: constructing the BinaryView against u128 value
let mut le_bytes = [0u8; 16];
let start = self.idx * VIEW_SIZE_BYTES;
let stop = start + VIEW_SIZE_BYTES;
le_bytes.copy_from_slice(&self.inner[start..stop]);

self.idx += 1;
Some(BinaryView { le_bytes })
}
}
}

// Generic helper to create an Arrow ByteViewBuilder of the appropriate type.
fn generic_byte_view_builder<B, V, F>(
values: impl Iterator<Item = Option<V>>,
Expand Down

0 comments on commit 3277e18

Please sign in to comment.