From 84e40dedd72d3fd8720a74f0a04ad40d5aa30ba8 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Tue, 22 Oct 2024 21:23:46 -0400 Subject: [PATCH 1/6] feat: use buffer for VarBinView views Also implements an enhanced DictArray canonicalize that canonicalizes the values then just does a gather over them. --- encodings/dict/Cargo.toml | 4 + encodings/dict/benches/dict_canonical.rs | 34 ++++++++ encodings/dict/src/array.rs | 83 ++++++++++++++++++-- encodings/dict/src/compute.rs | 3 - vortex-array/src/array/chunked/canonical.rs | 12 +-- vortex-array/src/array/constant/canonical.rs | 12 ++- vortex-array/src/array/varbinview/compute.rs | 9 +-- vortex-array/src/array/varbinview/mod.rs | 49 +++++------- 8 files changed, 147 insertions(+), 59 deletions(-) create mode 100644 encodings/dict/benches/dict_canonical.rs diff --git a/encodings/dict/Cargo.toml b/encodings/dict/Cargo.toml index de79f9f5bc..4068ecdb9c 100644 --- a/encodings/dict/Cargo.toml +++ b/encodings/dict/Cargo.toml @@ -36,3 +36,7 @@ simplelog = { workspace = true } [[bench]] name = "dict_compress" harness = false + +[[bench]] +name = "dict_canonical" +harness = false diff --git a/encodings/dict/benches/dict_canonical.rs b/encodings/dict/benches/dict_canonical.rs new file mode 100644 index 0000000000..e5c11ab133 --- /dev/null +++ b/encodings/dict/benches/dict_canonical.rs @@ -0,0 +1,34 @@ +#![allow(clippy::unwrap_used)] + +use criterion::{criterion_group, criterion_main, Criterion}; +use vortex::array::VarBinArray; +use vortex::{IntoArray, IntoCanonical}; +use vortex_dict::{dict_encode_varbin, DictArray}; +use vortex_dtype::{DType, Nullability}; + +fn fixture(len: usize) -> DictArray { + let values = [ + Some("inlined"), + None, + Some("not inlined but repeated often"), + ]; + + let strings = VarBinArray::from_iter( + values.into_iter().cycle().take(len), + DType::Utf8(Nullability::Nullable), + ); + + let (codes, values) = dict_encode_varbin(&strings); + DictArray::try_new(codes.into_array(), values.into_array()).unwrap() +} + +fn bench_canonical(c: &mut Criterion) { + let dict_array = fixture(1024).into_array(); + + c.bench_function("canonical", |b| { + b.iter(|| dict_array.clone().into_canonical().unwrap()) + }); +} + +criterion_group!(bench_dict_canonical, bench_canonical); +criterion_main!(bench_dict_canonical); diff --git a/encodings/dict/src/array.rs b/encodings/dict/src/array.rs index 598aa96e34..e3dbd289b8 100644 --- a/encodings/dict/src/array.rs +++ b/encodings/dict/src/array.rs @@ -1,19 +1,19 @@ use std::fmt::{Debug, Display}; -use arrow_buffer::BooleanBuffer; +use arrow_buffer::{BooleanBuffer, ScalarBuffer}; use serde::{Deserialize, Serialize}; use vortex::array::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use vortex::array::BoolArray; +use vortex::array::{BoolArray, VarBinViewArray}; use vortex::compute::take; -use vortex::compute::unary::scalar_at; +use vortex::compute::unary::{scalar_at, try_cast}; use vortex::encoding::ids; use vortex::stats::StatsSet; -use vortex::validity::{ArrayValidity, LogicalValidity}; +use vortex::validity::{ArrayValidity, LogicalValidity, Validity}; use vortex::{ impl_encoding, Array, ArrayDType, ArrayTrait, Canonical, IntoArray, IntoArrayVariant, IntoCanonical, }; -use vortex_dtype::{match_each_integer_ptype, DType, PType}; +use vortex_dtype::{match_each_integer_ptype, DType, Nullability, PType}; use vortex_error::{vortex_bail, vortex_panic, VortexExpect as _, VortexResult}; impl_encoding!("vortex.dict", ids::DICT, Dict); @@ -41,6 +41,7 @@ impl DictArray { DictMetadata { codes_ptype: PType::try_from(codes.dtype()) .vortex_expect("codes dtype must be uint"), + values_len: values.len(), }, [values, codes].into(), @@ -67,11 +68,79 @@ impl ArrayTrait for DictArray {} impl IntoCanonical for DictArray { fn into_canonical(self) -> VortexResult { - let canonical_values: Array = self.values().into_canonical()?.into(); - take(canonical_values, self.codes())?.into_canonical() + match self.dtype() { + DType::Utf8(nullability) | DType::Binary(nullability) => { + let values = self.values().into_varbinview()?; + let codes = try_cast(self.codes(), PType::U64.into())?.into_primitive()?; + let codes_u64 = codes.maybe_null_slice::(); + + if *nullability == Nullability::NonNullable { + canonicalize_string_nonnull(codes_u64, values) + } else { + canonicalize_string_nullable( + codes_u64, + values, + self.logical_validity().into_validity(), + ) + } + } + _ => canonicalize_primitive(self), + } } } +/// Canonicalize a set of codes and values. +fn canonicalize_string_nonnull(codes: &[u64], values: VarBinViewArray) -> VortexResult { + let value_views = ScalarBuffer::::from(values.views().clone().into_arrow()); + + // Gather the views from value_views into full_views using the dictionary codes. + let full_views: Vec = codes + .iter() + .map(|code| value_views[*code as usize]) + .collect(); + + VarBinViewArray::try_new( + full_views.into(), + values.buffers().collect(), + values.dtype().clone(), + Validity::NonNullable, + ) + .map(Canonical::VarBinView) +} + +fn canonicalize_string_nullable( + codes: &[u64], + values: VarBinViewArray, + validity: Validity, +) -> VortexResult { + let value_views = ScalarBuffer::::from(values.views().clone().into_arrow()); + + // Gather the views from value_views into full_views using the dictionary codes. + let full_views: Vec = codes + .iter() + .map(|code| { + if *code == 0 { + 0u128 + } else { + value_views[(*code - 1) as usize] + } + }) + .collect(); + + VarBinViewArray::try_new( + full_views.into(), + values.buffers().collect(), + values.dtype().clone(), + validity, + ) + .map(Canonical::VarBinView) +} + +fn canonicalize_primitive(array: DictArray) -> VortexResult { + let canonical_values: Array = array.values().into_canonical()?.into(); + take(canonical_values, array.codes())?.into_canonical() +} + impl ArrayValidity for DictArray { fn is_valid(&self, index: usize) -> bool { let values_index = scalar_at(self.codes(), index) diff --git a/encodings/dict/src/compute.rs b/encodings/dict/src/compute.rs index 5bf223630c..391fca8db3 100644 --- a/encodings/dict/src/compute.rs +++ b/encodings/dict/src/compute.rs @@ -42,9 +42,6 @@ impl ScalarAtFn for DictArray { impl TakeFn for DictArray { fn take(&self, indices: &Array) -> VortexResult { - // Dict - // codes: 0 0 1 - // dict: a b c d e f g h let codes = take(self.codes(), indices)?; Self::try_new(codes, self.values()).map(|a| a.into_array()) } diff --git a/vortex-array/src/array/chunked/canonical.rs b/vortex-array/src/array/chunked/canonical.rs index a1dbcd08ab..29979ac452 100644 --- a/vortex-array/src/array/chunked/canonical.rs +++ b/vortex-array/src/array/chunked/canonical.rs @@ -1,4 +1,4 @@ -use arrow_buffer::{BooleanBufferBuilder, Buffer, MutableBuffer, ScalarBuffer}; +use arrow_buffer::{BooleanBufferBuilder, MutableBuffer, ScalarBuffer}; use vortex_dtype::{DType, PType, StructDType}; use vortex_error::{vortex_bail, vortex_err, ErrString, VortexResult}; @@ -179,11 +179,7 @@ fn pack_primitives( buffer.extend_from_slice(chunk.buffer()); } - Ok(PrimitiveArray::new( - Buffer::from(buffer).into(), - ptype, - validity, - )) + Ok(PrimitiveArray::new(buffer.into(), ptype, validity)) } /// Builds a new [VarBinViewArray] by repacking the values from the chunks into a single @@ -231,8 +227,8 @@ fn pack_views( } } - let views_buffer: Buffer = ScalarBuffer::::from(views).into_inner(); - VarBinViewArray::try_new(Array::from(views_buffer), buffers, dtype.clone(), validity) + let views_buffer = ScalarBuffer::::from(views).into_inner(); + VarBinViewArray::try_new(views_buffer.into(), buffers, dtype.clone(), validity) } #[cfg(test)] diff --git a/vortex-array/src/array/constant/canonical.rs b/vortex-array/src/array/constant/canonical.rs index a529f849ab..003f3ef11d 100644 --- a/vortex-array/src/array/constant/canonical.rs +++ b/vortex-array/src/array/constant/canonical.rs @@ -1,9 +1,9 @@ use arrow_array::builder::make_view; -use arrow_buffer::{BooleanBuffer, BufferBuilder}; +use arrow_buffer::{BooleanBuffer, BufferBuilder, MutableBuffer}; use vortex_buffer::Buffer; use vortex_dtype::{match_each_native_ptype, DType, Nullability, PType}; use vortex_error::{vortex_bail, VortexResult}; -use vortex_scalar::{BinaryScalar, BoolScalar, Scalar, Utf8Scalar}; +use vortex_scalar::{BinaryScalar, BoolScalar, Utf8Scalar}; use crate::array::constant::ConstantArray; use crate::array::primitive::PrimitiveArray; @@ -70,10 +70,10 @@ fn canonical_byte_view( ) -> VortexResult { match scalar_bytes { None => { - let views = ConstantArray::new(Scalar::null(dtype.clone()), len); + let views = MutableBuffer::from(Vec::::with_capacity(1)); VarBinViewArray::try_new( - views.into_array(), + views.into(), Vec::new(), dtype.clone(), Validity::AllInvalid, @@ -100,9 +100,7 @@ fn canonical_byte_view( // add u128 PType, see https://github.com/spiraldb/vortex/issues/1110 let mut views = BufferBuilder::::new(len); views.append_n(len, view); - let views = - PrimitiveArray::new(views.finish().into(), PType::U8, Validity::NonNullable) - .into_array(); + let views = views.finish().into(); let validity = if dtype.nullability() == Nullability::NonNullable { Validity::NonNullable diff --git a/vortex-array/src/array/varbinview/compute.rs b/vortex-array/src/array/varbinview/compute.rs index 8631407aca..8846ee81f6 100644 --- a/vortex-array/src/array/varbinview/compute.rs +++ b/vortex-array/src/array/varbinview/compute.rs @@ -14,7 +14,7 @@ use crate::array::varbinview::{VarBinViewArray, VIEW_SIZE_BYTES}; use crate::array::{varbinview_as_arrow, ConstantArray}; use crate::arrow::FromArrowArray; use crate::compute::unary::ScalarAtFn; -use crate::compute::{slice, ArrayCompute, MaybeCompareFn, Operator, SliceFn, TakeFn}; +use crate::compute::{ArrayCompute, MaybeCompareFn, Operator, SliceFn, TakeFn}; use crate::{Array, ArrayDType, IntoArray, IntoCanonical}; impl ArrayCompute for VarBinViewArray { @@ -45,11 +45,8 @@ impl ScalarAtFn for VarBinViewArray { impl SliceFn for VarBinViewArray { fn slice(&self, start: usize, stop: usize) -> VortexResult { Ok(Self::try_new( - slice( - self.views(), - start * VIEW_SIZE_BYTES, - stop * VIEW_SIZE_BYTES, - )?, + self.views() + .slice(start * VIEW_SIZE_BYTES..stop * VIEW_SIZE_BYTES), (0..self.metadata().buffer_lens.len()) .map(|i| self.buffer(i)) .collect::>(), diff --git a/vortex-array/src/array/varbinview/mod.rs b/vortex-array/src/array/varbinview/mod.rs index bfa5aa8fc8..b59dac6383 100644 --- a/vortex-array/src/array/varbinview/mod.rs +++ b/vortex-array/src/array/varbinview/mod.rs @@ -9,11 +9,11 @@ use arrow_array::{ArrayRef, BinaryViewArray, GenericByteViewArray, StringViewArr use arrow_buffer::ScalarBuffer; use itertools::Itertools; use static_assertions::{assert_eq_align, assert_eq_size}; +use vortex_buffer::Buffer; use vortex_dtype::{DType, PType}; use vortex_error::{vortex_bail, vortex_err, vortex_panic, VortexExpect, VortexResult}; use crate::array::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use crate::array::PrimitiveArray; use crate::arrow::FromArrowArray; use crate::compute::slice; use crate::encoding::ids; @@ -21,6 +21,7 @@ use crate::stats::StatsSet; use crate::validity::{ArrayValidity, LogicalValidity, Validity, ValidityMetadata}; use crate::{ impl_encoding, Array, ArrayDType, ArrayTrait, Canonical, IntoArrayVariant, IntoCanonical, + TypedArray, }; mod accessor; @@ -221,15 +222,11 @@ impl_encoding!("vortex.varbinview", ids::VAR_BIN_VIEW, VarBinView); impl VarBinViewArray { pub fn try_new( - views: Array, + views: Buffer, buffers: Vec, dtype: DType, validity: Validity, ) -> VortexResult { - if !matches!(views.dtype(), &DType::BYTES) { - vortex_bail!(MismatchedTypes: "u8", views.dtype()); - } - for d in buffers.iter() { if !matches!(d.dtype(), &DType::BYTES) { vortex_bail!(MismatchedTypes: "u8", d.dtype()); @@ -258,13 +255,21 @@ impl VarBinViewArray { }; let mut children = Vec::with_capacity(buffers.len() + 2); - children.push(views); children.extend(buffers); if let Some(a) = validity.into_array() { children.push(a) } - Self::try_from_parts(dtype, num_views, metadata, children.into(), StatsSet::new()) + Ok(Self { + typed: TypedArray::try_from_parts( + dtype, + num_views, + metadata, + Some(views), + children.into(), + StatsSet::new(), + )?, + }) } /// Number of raw string data buffers held by this array. @@ -279,10 +284,7 @@ impl VarBinViewArray { pub fn view_slice(&self) -> &[BinaryView] { unsafe { slice::from_raw_parts( - PrimitiveArray::try_from(self.views()) - .vortex_expect("Views must be a primitive array") - .maybe_null_slice::() - .as_ptr() as _, + self.views().as_ptr() as _, self.views().len() / VIEW_SIZE_BYTES, ) } @@ -298,10 +300,8 @@ impl VarBinViewArray { /// contain either a pointer into one of the array's owned `buffer`s OR an inlined copy of /// the string (if the string has 12 bytes or fewer). #[inline] - pub fn views(&self) -> Array { - self.as_ref() - .child(0, &DType::BYTES, self.len() * VIEW_SIZE_BYTES) - .vortex_expect("VarBinViewArray: views child") + pub fn views(&self) -> &Buffer { + self.as_ref().buffer().vortex_expect("views buffer") } /// Access one of the backing data buffers. @@ -314,7 +314,7 @@ impl VarBinViewArray { pub fn buffer(&self, idx: usize) -> Array { self.as_ref() .child( - idx + 1, + idx, &DType::BYTES, self.metadata().buffer_lens[idx] as usize, ) @@ -346,7 +346,7 @@ impl VarBinViewArray { self.metadata().validity.to_validity(|| { self.as_ref() .child( - (self.metadata().buffer_lens.len() + 1) as _, + self.metadata().buffer_lens.len(), &Validity::DTYPE, self.len(), ) @@ -488,13 +488,6 @@ impl IntoCanonical for VarBinViewArray { } pub(crate) fn varbinview_as_arrow(var_bin_view: &VarBinViewArray) -> ArrayRef { - // Views should be buffer of u8 - let views = var_bin_view - .views() - .into_primitive() - .vortex_expect("VarBinViewArray: views child must be primitive"); - assert_eq!(views.ptype(), PType::U8); - let nulls = var_bin_view .logical_validity() .to_null_buffer() @@ -518,14 +511,14 @@ pub(crate) fn varbinview_as_arrow(var_bin_view: &VarBinViewArray) -> ArrayRef { match var_bin_view.dtype() { DType::Binary(_) => Arc::new(unsafe { BinaryViewArray::new_unchecked( - ScalarBuffer::::from(views.buffer().clone().into_arrow()), + ScalarBuffer::::from(var_bin_view.views().clone().into_arrow()), data, nulls, ) }), DType::Utf8(_) => Arc::new(unsafe { StringViewArray::new_unchecked( - ScalarBuffer::::from(views.buffer().clone().into_arrow()), + ScalarBuffer::::from(var_bin_view.views().clone().into_arrow()), data, nulls, ) @@ -546,7 +539,7 @@ impl ArrayValidity for VarBinViewArray { impl AcceptArrayVisitor for VarBinViewArray { fn accept(&self, visitor: &mut dyn ArrayVisitor) -> VortexResult<()> { - visitor.visit_child("views", &self.views())?; + visitor.visit_buffer(self.views())?; for i in 0..self.metadata().buffer_lens.len() { visitor.visit_child(format!("bytes_{i}").as_str(), &self.buffer(i))?; } From ccee372bbb193de9a5d5aa54994fa0f80997984f Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Tue, 22 Oct 2024 21:56:16 -0400 Subject: [PATCH 2/6] fix logic (no offset needed) --- encodings/dict/src/array.rs | 56 ++++++------------------------------- 1 file changed, 9 insertions(+), 47 deletions(-) diff --git a/encodings/dict/src/array.rs b/encodings/dict/src/array.rs index e3dbd289b8..cd97ceadd2 100644 --- a/encodings/dict/src/array.rs +++ b/encodings/dict/src/array.rs @@ -8,12 +8,12 @@ use vortex::compute::take; use vortex::compute::unary::{scalar_at, try_cast}; use vortex::encoding::ids; use vortex::stats::StatsSet; -use vortex::validity::{ArrayValidity, LogicalValidity, Validity}; +use vortex::validity::{ArrayValidity, LogicalValidity}; use vortex::{ impl_encoding, Array, ArrayDType, ArrayTrait, Canonical, IntoArray, IntoArrayVariant, IntoCanonical, }; -use vortex_dtype::{match_each_integer_ptype, DType, Nullability, PType}; +use vortex_dtype::{match_each_integer_ptype, DType, PType}; use vortex_error::{vortex_bail, vortex_panic, VortexExpect as _, VortexResult}; impl_encoding!("vortex.dict", ids::DICT, Dict); @@ -69,69 +69,31 @@ impl ArrayTrait for DictArray {} impl IntoCanonical for DictArray { fn into_canonical(self) -> VortexResult { match self.dtype() { - DType::Utf8(nullability) | DType::Binary(nullability) => { - let values = self.values().into_varbinview()?; - let codes = try_cast(self.codes(), PType::U64.into())?.into_primitive()?; - let codes_u64 = codes.maybe_null_slice::(); - - if *nullability == Nullability::NonNullable { - canonicalize_string_nonnull(codes_u64, values) - } else { - canonicalize_string_nullable( - codes_u64, - values, - self.logical_validity().into_validity(), - ) - } - } + DType::Utf8(_) | DType::Binary(_) => canonicalize_string(self), _ => canonicalize_primitive(self), } } } /// Canonicalize a set of codes and values. -fn canonicalize_string_nonnull(codes: &[u64], values: VarBinViewArray) -> VortexResult { - let value_views = ScalarBuffer::::from(values.views().clone().into_arrow()); - - // Gather the views from value_views into full_views using the dictionary codes. - let full_views: Vec = codes - .iter() - .map(|code| value_views[*code as usize]) - .collect(); - - VarBinViewArray::try_new( - full_views.into(), - values.buffers().collect(), - values.dtype().clone(), - Validity::NonNullable, - ) - .map(Canonical::VarBinView) -} +fn canonicalize_string(array: DictArray) -> VortexResult { + let values = array.values().into_varbinview()?; + let codes = try_cast(array.codes(), PType::U64.into())?.into_primitive()?; -fn canonicalize_string_nullable( - codes: &[u64], - values: VarBinViewArray, - validity: Validity, -) -> VortexResult { let value_views = ScalarBuffer::::from(values.views().clone().into_arrow()); // Gather the views from value_views into full_views using the dictionary codes. let full_views: Vec = codes + .maybe_null_slice::() .iter() - .map(|code| { - if *code == 0 { - 0u128 - } else { - value_views[(*code - 1) as usize] - } - }) + .map(|code| value_views[*code as usize]) .collect(); VarBinViewArray::try_new( full_views.into(), values.buffers().collect(), values.dtype().clone(), - validity, + array.logical_validity().into_validity(), ) .map(Canonical::VarBinView) } From 3756acf0a8bb3f8572bdec6e575e221af91cf8fe Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Tue, 22 Oct 2024 22:06:19 -0400 Subject: [PATCH 3/6] dtype --- encodings/dict/src/array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/encodings/dict/src/array.rs b/encodings/dict/src/array.rs index cd97ceadd2..376a07945c 100644 --- a/encodings/dict/src/array.rs +++ b/encodings/dict/src/array.rs @@ -92,7 +92,7 @@ fn canonicalize_string(array: DictArray) -> VortexResult { VarBinViewArray::try_new( full_views.into(), values.buffers().collect(), - values.dtype().clone(), + array.dtype().clone(), array.logical_validity().into_validity(), ) .map(Canonical::VarBinView) From 64e079562f4da4fe6eb5a97557d0f259b8461bb9 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Tue, 22 Oct 2024 22:43:47 -0400 Subject: [PATCH 4/6] validity --- encodings/dict/src/array.rs | 19 ++++++++++--- vortex-array/src/array/varbin/mod.rs | 2 +- vortex-array/src/array/varbinview/compute.rs | 29 ++++++++++++++------ vortex-array/src/array/varbinview/mod.rs | 2 +- 4 files changed, 37 insertions(+), 15 deletions(-) diff --git a/encodings/dict/src/array.rs b/encodings/dict/src/array.rs index 376a07945c..d356b81c16 100644 --- a/encodings/dict/src/array.rs +++ b/encodings/dict/src/array.rs @@ -3,12 +3,12 @@ use std::fmt::{Debug, Display}; use arrow_buffer::{BooleanBuffer, ScalarBuffer}; use serde::{Deserialize, Serialize}; use vortex::array::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use vortex::array::{BoolArray, VarBinViewArray}; -use vortex::compute::take; +use vortex::array::{BoolArray, ConstantArray, VarBinViewArray}; use vortex::compute::unary::{scalar_at, try_cast}; +use vortex::compute::{compare, take, Operator}; use vortex::encoding::ids; use vortex::stats::StatsSet; -use vortex::validity::{ArrayValidity, LogicalValidity}; +use vortex::validity::{ArrayValidity, LogicalValidity, Validity}; use vortex::{ impl_encoding, Array, ArrayDType, ArrayTrait, Canonical, IntoArray, IntoArrayVariant, IntoCanonical, @@ -89,11 +89,22 @@ fn canonicalize_string(array: DictArray) -> VortexResult { .map(|code| value_views[*code as usize]) .collect(); + let validity = if array.dtype().is_nullable() { + // For nullable arrays, a code of 0 indicates null value. + Validity::Array(compare( + codes.as_ref(), + ConstantArray::new(0u64, codes.len()).as_ref(), + Operator::Eq, + )?) + } else { + Validity::NonNullable + }; + VarBinViewArray::try_new( full_views.into(), values.buffers().collect(), array.dtype().clone(), - array.logical_validity().into_validity(), + validity, ) .map(Canonical::VarBinView) } diff --git a/vortex-array/src/array/varbin/mod.rs b/vortex-array/src/array/varbin/mod.rs index df8ed51deb..0df86daa7f 100644 --- a/vortex-array/src/array/varbin/mod.rs +++ b/vortex-array/src/array/varbin/mod.rs @@ -62,7 +62,7 @@ impl VarBinArray { vortex_bail!(MismatchedTypes: "utf8 or binary", dtype); } if dtype.is_nullable() == (validity == Validity::NonNullable) { - vortex_bail!("incorrect validity {:?}", validity); + vortex_bail!("incorrect validity {:?} for {}", validity, dtype); } let length = offsets.len() - 1; diff --git a/vortex-array/src/array/varbinview/compute.rs b/vortex-array/src/array/varbinview/compute.rs index 8846ee81f6..a914068f95 100644 --- a/vortex-array/src/array/varbinview/compute.rs +++ b/vortex-array/src/array/varbinview/compute.rs @@ -3,19 +3,21 @@ use std::sync::Arc; use arrow_array::cast::AsArray; use arrow_array::types::ByteViewType; use arrow_array::{Datum, GenericByteViewArray}; +use arrow_buffer::ScalarBuffer; use arrow_ord::cmp; use arrow_schema::DataType; use vortex_buffer::Buffer; +use vortex_dtype::PType; use vortex_error::{vortex_bail, VortexResult, VortexUnwrap}; use vortex_scalar::Scalar; use crate::array::varbin::varbin_scalar; use crate::array::varbinview::{VarBinViewArray, VIEW_SIZE_BYTES}; -use crate::array::{varbinview_as_arrow, ConstantArray}; +use crate::array::ConstantArray; use crate::arrow::FromArrowArray; -use crate::compute::unary::ScalarAtFn; +use crate::compute::unary::{try_cast, ScalarAtFn}; use crate::compute::{ArrayCompute, MaybeCompareFn, Operator, SliceFn, TakeFn}; -use crate::{Array, ArrayDType, IntoArray, IntoCanonical}; +use crate::{Array, ArrayDType, IntoArray, IntoArrayVariant, IntoCanonical}; impl ArrayCompute for VarBinViewArray { fn scalar_at(&self) -> Option<&dyn ScalarAtFn> { @@ -60,12 +62,21 @@ impl SliceFn for VarBinViewArray { /// Take involves creating a new array that references the old array, just with the given set of views. impl TakeFn for VarBinViewArray { fn take(&self, indices: &Array) -> VortexResult { - let array_ref = varbinview_as_arrow(self); - let indices_arrow = indices.clone().into_canonical()?.into_arrow()?; - - let take_arrow = arrow_select::take::take(&array_ref, &indices_arrow, None)?; - let nullable = take_arrow.is_nullable(); - Ok(Array::from_arrow(take_arrow, nullable)) + let views = ScalarBuffer::::from(self.views().clone().into_arrow()); + + let taken: Vec = try_cast(indices, PType::U64.into())? + .into_primitive()? + .maybe_null_slice::() + .iter() + .map(|idx| views[*idx as usize]) + .collect(); + VarBinViewArray::try_new( + ScalarBuffer::from(taken).into_inner().into(), + self.buffers().collect(), + self.dtype().clone(), + self.validity().take(indices)?, + ) + .map(|a| a.into_array()) } } diff --git a/vortex-array/src/array/varbinview/mod.rs b/vortex-array/src/array/varbinview/mod.rs index b59dac6383..53dd3fdb6a 100644 --- a/vortex-array/src/array/varbinview/mod.rs +++ b/vortex-array/src/array/varbinview/mod.rs @@ -238,7 +238,7 @@ impl VarBinViewArray { } if dtype.is_nullable() == (validity == Validity::NonNullable) { - vortex_bail!("incorrect validity {:?}", validity); + vortex_bail!("incorrect validity {:?} for {}", validity, dtype); } let num_views = views.len() / VIEW_SIZE_BYTES; From 53b9f4d04d61d70f0c5790b719e5b453fbcb5f0c Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Tue, 22 Oct 2024 22:47:42 -0400 Subject: [PATCH 5/6] enforce views buffer is aligned on construction --- vortex-array/src/array/varbinview/mod.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vortex-array/src/array/varbinview/mod.rs b/vortex-array/src/array/varbinview/mod.rs index 53dd3fdb6a..4c90647f4b 100644 --- a/vortex-array/src/array/varbinview/mod.rs +++ b/vortex-array/src/array/varbinview/mod.rs @@ -2,12 +2,12 @@ use std::fmt::{Debug, Display, Formatter}; use std::slice; use std::sync::Arc; -use ::serde::{Deserialize, Serialize}; use arrow_array::builder::{BinaryViewBuilder, GenericByteViewBuilder, StringViewBuilder}; use arrow_array::types::{BinaryViewType, ByteViewType, StringViewType}; use arrow_array::{ArrayRef, BinaryViewArray, GenericByteViewArray, StringViewArray}; use arrow_buffer::ScalarBuffer; use itertools::Itertools; +use ::serde::{Deserialize, Serialize}; use static_assertions::{assert_eq_align, assert_eq_size}; use vortex_buffer::Buffer; use vortex_dtype::{DType, PType}; @@ -227,6 +227,10 @@ impl VarBinViewArray { dtype: DType, validity: Validity, ) -> VortexResult { + if views.as_ptr().align_offset(VIEW_SIZE_BYTES) != 0 { + vortex_bail!("views buffer must be aligned to 16 bytes"); + } + for d in buffers.iter() { if !matches!(d.dtype(), &DType::BYTES) { vortex_bail!(MismatchedTypes: "u8", d.dtype()); From 8d3ed4d24e6f61aea666206c72c6acbdb813f829 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Wed, 23 Oct 2024 10:22:07 -0400 Subject: [PATCH 6/6] remove specialized canonicalize, just use take --- encodings/dict/benches/dict_canonical.rs | 4 +- encodings/dict/src/array.rs | 55 +++------------------ vortex-array/src/array/bool/compute/cast.rs | 35 +++++++++++++ vortex-array/src/array/bool/compute/mod.rs | 7 ++- vortex-array/src/array/varbinview/mod.rs | 2 +- vortex-array/src/compute/compare.rs | 15 ++++++ 6 files changed, 66 insertions(+), 52 deletions(-) create mode 100644 vortex-array/src/array/bool/compute/cast.rs diff --git a/encodings/dict/benches/dict_canonical.rs b/encodings/dict/benches/dict_canonical.rs index e5c11ab133..a6f75eb685 100644 --- a/encodings/dict/benches/dict_canonical.rs +++ b/encodings/dict/benches/dict_canonical.rs @@ -23,10 +23,10 @@ fn fixture(len: usize) -> DictArray { } fn bench_canonical(c: &mut Criterion) { - let dict_array = fixture(1024).into_array(); + let dict_array = fixture(1024 * 1024).into_array(); c.bench_function("canonical", |b| { - b.iter(|| dict_array.clone().into_canonical().unwrap()) + b.iter(|| dict_array.clone().into_canonical()) }); } diff --git a/encodings/dict/src/array.rs b/encodings/dict/src/array.rs index d356b81c16..d4a903fafe 100644 --- a/encodings/dict/src/array.rs +++ b/encodings/dict/src/array.rs @@ -1,14 +1,14 @@ use std::fmt::{Debug, Display}; -use arrow_buffer::{BooleanBuffer, ScalarBuffer}; +use arrow_buffer::BooleanBuffer; use serde::{Deserialize, Serialize}; use vortex::array::visitor::{AcceptArrayVisitor, ArrayVisitor}; -use vortex::array::{BoolArray, ConstantArray, VarBinViewArray}; -use vortex::compute::unary::{scalar_at, try_cast}; -use vortex::compute::{compare, take, Operator}; +use vortex::array::BoolArray; +use vortex::compute::take; +use vortex::compute::unary::scalar_at; use vortex::encoding::ids; use vortex::stats::StatsSet; -use vortex::validity::{ArrayValidity, LogicalValidity, Validity}; +use vortex::validity::{ArrayValidity, LogicalValidity}; use vortex::{ impl_encoding, Array, ArrayDType, ArrayTrait, Canonical, IntoArray, IntoArrayVariant, IntoCanonical, @@ -68,52 +68,11 @@ impl ArrayTrait for DictArray {} impl IntoCanonical for DictArray { fn into_canonical(self) -> VortexResult { - match self.dtype() { - DType::Utf8(_) | DType::Binary(_) => canonicalize_string(self), - _ => canonicalize_primitive(self), - } + let canonical_values: Array = self.values().into_canonical()?.into(); + take(canonical_values, self.codes())?.into_canonical() } } -/// Canonicalize a set of codes and values. -fn canonicalize_string(array: DictArray) -> VortexResult { - let values = array.values().into_varbinview()?; - let codes = try_cast(array.codes(), PType::U64.into())?.into_primitive()?; - - let value_views = ScalarBuffer::::from(values.views().clone().into_arrow()); - - // Gather the views from value_views into full_views using the dictionary codes. - let full_views: Vec = codes - .maybe_null_slice::() - .iter() - .map(|code| value_views[*code as usize]) - .collect(); - - let validity = if array.dtype().is_nullable() { - // For nullable arrays, a code of 0 indicates null value. - Validity::Array(compare( - codes.as_ref(), - ConstantArray::new(0u64, codes.len()).as_ref(), - Operator::Eq, - )?) - } else { - Validity::NonNullable - }; - - VarBinViewArray::try_new( - full_views.into(), - values.buffers().collect(), - array.dtype().clone(), - validity, - ) - .map(Canonical::VarBinView) -} - -fn canonicalize_primitive(array: DictArray) -> VortexResult { - let canonical_values: Array = array.values().into_canonical()?.into(); - take(canonical_values, array.codes())?.into_canonical() -} - impl ArrayValidity for DictArray { fn is_valid(&self, index: usize) -> bool { let values_index = scalar_at(self.codes(), index) diff --git a/vortex-array/src/array/bool/compute/cast.rs b/vortex-array/src/array/bool/compute/cast.rs new file mode 100644 index 0000000000..699589dc6d --- /dev/null +++ b/vortex-array/src/array/bool/compute/cast.rs @@ -0,0 +1,35 @@ +use vortex_dtype::{DType, Nullability}; +use vortex_error::{vortex_bail, VortexResult}; + +use crate::array::BoolArray; +use crate::compute::unary::CastFn; +use crate::validity::Validity; +use crate::{Array, ArrayDType, IntoArray}; + +impl CastFn for BoolArray { + fn cast(&self, dtype: &DType) -> VortexResult { + if !dtype.is_boolean() { + vortex_bail!("Cannot cast BoolArray to non-Bool type"); + } + + match (self.dtype().nullability(), dtype.nullability()) { + // convert to same nullability => no-op + (Nullability::NonNullable, Nullability::NonNullable) + | (Nullability::Nullable, Nullability::Nullable) => Ok(self.clone().into_array()), + + // convert non-nullable to nullable + (Nullability::NonNullable, Nullability::Nullable) => { + Ok(BoolArray::try_new(self.boolean_buffer(), Validity::AllValid)?.into_array()) + } + + // convert nullable to non-nullable, only safe if there are no nulls present. + (Nullability::Nullable, Nullability::NonNullable) => { + if self.validity() != Validity::AllValid { + vortex_bail!("cannot cast bool array with nulls as non-nullable"); + } + + Ok(BoolArray::try_new(self.boolean_buffer(), Validity::NonNullable)?.into_array()) + } + } + } +} diff --git a/vortex-array/src/array/bool/compute/mod.rs b/vortex-array/src/array/bool/compute/mod.rs index 9dbfc0f1dd..3d5355d45e 100644 --- a/vortex-array/src/array/bool/compute/mod.rs +++ b/vortex-array/src/array/bool/compute/mod.rs @@ -1,9 +1,10 @@ use crate::array::BoolArray; -use crate::compute::unary::{FillForwardFn, ScalarAtFn}; +use crate::compute::unary::{CastFn, FillForwardFn, ScalarAtFn}; use crate::compute::{AndFn, ArrayCompute, OrFn, SliceFn, TakeFn}; mod boolean; +mod cast; mod fill; mod filter; mod flatten; @@ -12,6 +13,10 @@ mod slice; mod take; impl ArrayCompute for BoolArray { + fn cast(&self) -> Option<&dyn CastFn> { + Some(self) + } + fn fill_forward(&self) -> Option<&dyn FillForwardFn> { Some(self) } diff --git a/vortex-array/src/array/varbinview/mod.rs b/vortex-array/src/array/varbinview/mod.rs index 4c90647f4b..d2c7b5d1c6 100644 --- a/vortex-array/src/array/varbinview/mod.rs +++ b/vortex-array/src/array/varbinview/mod.rs @@ -2,12 +2,12 @@ use std::fmt::{Debug, Display, Formatter}; use std::slice; use std::sync::Arc; +use ::serde::{Deserialize, Serialize}; use arrow_array::builder::{BinaryViewBuilder, GenericByteViewBuilder, StringViewBuilder}; use arrow_array::types::{BinaryViewType, ByteViewType, StringViewType}; use arrow_array::{ArrayRef, BinaryViewArray, GenericByteViewArray, StringViewArray}; use arrow_buffer::ScalarBuffer; use itertools::Itertools; -use ::serde::{Deserialize, Serialize}; use static_assertions::{assert_eq_align, assert_eq_size}; use vortex_buffer::Buffer; use vortex_dtype::{DType, PType}; diff --git a/vortex-array/src/compute/compare.rs b/vortex-array/src/compute/compare.rs index c2736c3fb1..d8d02fc34d 100644 --- a/vortex-array/src/compute/compare.rs +++ b/vortex-array/src/compute/compare.rs @@ -78,6 +78,21 @@ pub trait MaybeCompareFn { fn maybe_compare(&self, other: &Array, operator: Operator) -> Option>; } +/// Binary comparison operation between two arrays. +/// +/// The result of comparison is a `Bool` typed Array holding `true` where both of the operands +/// satisfy the operation, `false` otherwise. +/// +/// Nullability of the result is the union of the nullabilities of the operands. +/// +/// ## Null semantics +/// +/// All binary comparison operations where one of the operands is `NULL` will result in a `NULL` +/// value being placed in the output. +/// +/// This semantic is derived from [Apache Arrow's handling of nulls]. +/// +/// [Apache Arrow's handling of nulls]: https://arrow.apache.org/rust/arrow/compute/kernels/cmp/fn.eq.html pub fn compare( left: impl AsRef, right: impl AsRef,