diff --git a/vortex-array/src/array/chunked/canonical.rs b/vortex-array/src/array/chunked/canonical.rs index a1dbcd08ab..741d8a86d5 100644 --- a/vortex-array/src/array/chunked/canonical.rs +++ b/vortex-array/src/array/chunked/canonical.rs @@ -211,7 +211,7 @@ fn pack_views( buffers.push(canonical_buffer); } - for view in canonical_chunk.view_slice() { + for view in canonical_chunk.binary_views()? { if view.is_inlined() { // Inlined views can be copied directly into the output views.push(view.as_u128()); diff --git a/vortex-array/src/array/varbin/flatten.rs b/vortex-array/src/array/varbin/flatten.rs index d7c0ba3aab..ebd27da0a2 100644 --- a/vortex-array/src/array/varbin/flatten.rs +++ b/vortex-array/src/array/varbin/flatten.rs @@ -42,14 +42,14 @@ mod test { assert!(!canonical.is_valid(1)); // First value is inlined (12 bytes) - assert!(canonical.view_at(2).is_inlined()); + assert!(canonical.view_at(2).unwrap().is_inlined()); assert_eq!( canonical.bytes_at(2).unwrap().as_slice(), "123456789012".as_bytes() ); // Second value is not inlined (13 bytes) - assert!(!canonical.view_at(3).is_inlined()); + assert!(!canonical.view_at(3).unwrap().is_inlined()); assert_eq!( canonical.bytes_at(3).unwrap().as_slice(), "1234567890123".as_bytes() diff --git a/vortex-array/src/array/varbinview/accessor.rs b/vortex-array/src/array/varbinview/accessor.rs index 00b8c863aa..0474bb211f 100644 --- a/vortex-array/src/array/varbinview/accessor.rs +++ b/vortex-array/src/array/varbinview/accessor.rs @@ -4,6 +4,7 @@ use vortex_error::VortexResult; use crate::accessor::ArrayAccessor; use crate::array::primitive::PrimitiveArray; use crate::array::varbinview::VarBinViewArray; +use crate::array::BinaryView; use crate::validity::ArrayValidity; use crate::IntoCanonical; @@ -12,10 +13,10 @@ impl ArrayAccessor<[u8]> for VarBinViewArray { &self, f: F, ) -> VortexResult { - let views = self.view_slice(); let bytes: Vec = (0..self.metadata().buffer_lens.len()) .map(|i| self.buffer(i).into_canonical()?.into_primitive()) .try_collect()?; + let views: Vec = self.binary_views()?.collect(); let validity = self.logical_validity().to_null_buffer()?; match validity { diff --git a/vortex-array/src/array/varbinview/mod.rs b/vortex-array/src/array/varbinview/mod.rs index 79e04d6c6d..97bedd5be2 100644 --- a/vortex-array/src/array/varbinview/mod.rs +++ b/vortex-array/src/array/varbinview/mod.rs @@ -1,5 +1,4 @@ use std::fmt::{Debug, Display, Formatter}; -use std::slice; use std::sync::Arc; use ::serde::{Deserialize, Serialize}; @@ -9,11 +8,11 @@ use arrow_array::{ArrayRef, BinaryViewArray, GenericByteViewArray, StringViewArr use arrow_buffer::ScalarBuffer; use itertools::Itertools; use static_assertions::{assert_eq_align, assert_eq_size}; +use vortex_buffer::Buffer; use vortex_dtype::{DType, PType}; use vortex_error::{vortex_bail, vortex_err, vortex_panic, VortexExpect, VortexResult}; use crate::array::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use crate::array::PrimitiveArray; use crate::arrow::FromArrowArray; use crate::compute::slice; use crate::encoding::ids; @@ -96,7 +95,7 @@ impl Ref { pub union BinaryView { // Numeric representation. This is logically `u128`, but we split it into the high and low // bits to preserve the alignment. - num: [u64; 2], + le_bytes: [u8; 16], // Inlined representation: strings <= 12 bytes inlined: Inlined, @@ -156,13 +155,8 @@ impl BinaryView { } pub fn as_u128(&self) -> u128 { - let mut tmp = 0u128; - unsafe { - tmp |= self.num[0] as u128; - tmp |= (self.num[1] as u128) << 64; - } - - tmp + // SAFETY: binary view always safe to read as u128 LE bytes + unsafe { u128::from_le_bytes(self.le_bytes) } } } @@ -276,20 +270,29 @@ impl VarBinViewArray { /// /// This is useful for iteration over the values, as well as for applying filters that may /// only require hitting the prefixes or inline strings. - pub fn view_slice(&self) -> &[BinaryView] { - unsafe { - slice::from_raw_parts( - PrimitiveArray::try_from(self.views()) - .vortex_expect("Views must be a primitive array") - .maybe_null_slice::() - .as_ptr() as _, - self.views().len() / VIEW_SIZE_BYTES, - ) - } + pub fn binary_views(&self) -> VortexResult { + Ok(Views { + inner: self.views().into_primitive()?.into_buffer(), + idx: 0, + len: self.len(), + }) } - pub fn view_at(&self, index: usize) -> BinaryView { - self.view_slice()[index] + /// Retrieve a binary view at a specific index. + /// + /// If you will be calling this method many times, it's recommended that you instead use the + /// iterator provided by [`binary_views`][Self::binary_views]. + pub fn view_at(&self, index: usize) -> VortexResult { + let start = index * VIEW_SIZE_BYTES; + let stop = (index + 1) * VIEW_SIZE_BYTES; + let view_bytes = slice(self.views(), start, stop)? + .into_primitive()? + .into_buffer(); + + let mut le_bytes: [u8; 16] = [0u8; 16]; + le_bytes.copy_from_slice(view_bytes.as_slice()); + + Ok(BinaryView { le_bytes }) } /// Access to the primitive views array. @@ -436,7 +439,7 @@ impl VarBinViewArray { // TODO(aduffy): do we really need to do this with copying? pub fn bytes_at(&self, index: usize) -> VortexResult> { - let view = self.view_at(index); + let view = self.view_at(index)?; // Expect this to be the common case: strings > 12 bytes. if !view.is_inlined() { let view_ref = view.as_view(); @@ -454,6 +457,31 @@ impl VarBinViewArray { } } +pub struct Views { + inner: Buffer, + len: usize, + idx: usize, +} + +impl Iterator for Views { + type Item = BinaryView; + + fn next(&mut self) -> Option { + if self.idx >= self.len { + None + } else { + // SAFETY: constructing the BinaryView against u128 value + let mut le_bytes = [0u8; 16]; + let start = self.idx * VIEW_SIZE_BYTES; + let stop = start + VIEW_SIZE_BYTES; + le_bytes.copy_from_slice(&self.inner[start..stop]); + + self.idx += 1; + Some(BinaryView { le_bytes }) + } + } +} + // Generic helper to create an Arrow ByteViewBuilder of the appropriate type. fn generic_byte_view_builder( values: impl Iterator>,