From 5afe635df957c3de8a2d43fc57392fb0b158b337 Mon Sep 17 00:00:00 2001
From: Andrew Duffy
Date: Mon, 28 Oct 2024 11:08:13 -0400
Subject: [PATCH] feat: specialized IntoCanonical for DictArray utf8/binary

---
 encodings/dict/src/array.rs | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/encodings/dict/src/array.rs b/encodings/dict/src/array.rs
index 598aa96e34..69b5831c19 100644
--- a/encodings/dict/src/array.rs
+++ b/encodings/dict/src/array.rs
@@ -67,8 +67,18 @@ impl ArrayTrait for DictArray {}
 
 impl IntoCanonical for DictArray {
     fn into_canonical(self) -> VortexResult<Canonical> {
-        let canonical_values: Array = self.values().into_canonical()?.into();
-        take(canonical_values, self.codes())?.into_canonical()
+        match self.dtype() {
+            // NOTE: Utf8 and Binary will decompress into VarBinViewArray, which requires a full
+            // decompression to construct the views child array.
+            // For this case, it is *always* faster to decompress the values first and then create
+            // copies of the view pointers.
+            DType::Utf8(_) | DType::Binary(_) => {
+                let canonical_values: Array = self.values().into_canonical()?.into();
+                take(canonical_values, self.codes())?.into_canonical()
+            }
+            // Non-string case: take and then canonicalize
+            _ => take(self.values(), self.codes())?.into_canonical(),
+        }
     }
 }
 