From 8467f6499ff1e2fade631a3479a59af775bf5bd2 Mon Sep 17 00:00:00 2001 From: Andrew Duffy Date: Mon, 28 Oct 2024 14:42:20 -0400 Subject: [PATCH] feat: specialized IntoCanonical for DictArray utf8/binary (#1146) Fixes #1041 Change the IntoCanonical implementation for DictArray to do canonicalize-then-take for stringy things, take-then-canonicalize for everything else. It is always the case for strings that canonicalize-then-take will be faster, regardless of the compression of the values array. Overrides #1136 --- encodings/dict/src/array.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/encodings/dict/src/array.rs b/encodings/dict/src/array.rs index 598aa96e34..69b5831c19 100644 --- a/encodings/dict/src/array.rs +++ b/encodings/dict/src/array.rs @@ -67,8 +67,18 @@ impl ArrayTrait for DictArray {} impl IntoCanonical for DictArray { fn into_canonical(self) -> VortexResult { - let canonical_values: Array = self.values().into_canonical()?.into(); - take(canonical_values, self.codes())?.into_canonical() + match self.dtype() { + // NOTE: Utf8 and Binary will decompress into VarBinViewArray, which requires a full + // decompression to construct the views child array. + // For this case, it is *always* faster to decompress the values first and then create + // copies of the view pointers. + DType::Utf8(_) | DType::Binary(_) => { + let canonical_values: Array = self.values().into_canonical()?.into(); + take(canonical_values, self.codes())?.into_canonical() + } + // Non-string case: take and then canonicalize + _ => take(self.values(), self.codes())?.into_canonical(), + } } }