Skip to content

Commit

Permalink
feat: add stat for uncompressed size in bytes (#1315)
Browse files Browse the repository at this point in the history
fixes #1237
  • Loading branch information
lwwmanning authored Nov 18, 2024
1 parent 8f0ba91 commit 18986c2
Show file tree
Hide file tree
Showing 25 changed files with 169 additions and 36 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ jobs:
with:
version: v1.0.0
- name: Rust Bench as test
run: cargo bench --bench '*[!noci]' -- --test
run: cargo bench --bench '*[!noci]' --profile benchtest -- --test

generated-files:
name: "Check generated proto/fbs files are up to date"
Expand Down
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -222,3 +222,7 @@ lto = "thin" # attempts to perform optimizations across all crates within t
codegen-units = 16 # default for "release", which "bench" inherits
lto = false # default
debug = true

[profile.benchtest]
inherits = "bench"
debug-assertions = true
6 changes: 5 additions & 1 deletion vortex-array/src/array/bool/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,14 @@ use vortex_error::VortexResult;
use crate::array::BoolArray;
use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet};
use crate::validity::{ArrayValidity, LogicalValidity};
use crate::{ArrayDType, IntoArrayVariant};
use crate::{ArrayDType, ArrayTrait as _, IntoArrayVariant};

impl ArrayStatisticsCompute for BoolArray {
fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
if stat == Stat::UncompressedSizeInBytes {
return Ok(StatsSet::of(stat, self.nbytes()));
}

if self.is_empty() {
return Ok(StatsSet::from_iter([
(Stat::TrueCount, 0.into()),
Expand Down
20 changes: 15 additions & 5 deletions vortex-array/src/array/chunked/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use crate::compute::unary::{scalar_at, scalar_at_unchecked, subtract_scalar, Sub
use crate::compute::{search_sorted, SearchSortedSide};
use crate::encoding::ids;
use crate::iter::{ArrayIterator, ArrayIteratorAdapter};
use crate::stats::StatsSet;
use crate::stats::ArrayStatistics;
use crate::stream::{ArrayStream, ArrayStreamAdapter};
use crate::validity::Validity::NonNullable;
use crate::validity::{ArrayValidity, LogicalValidity, Validity};
Expand Down Expand Up @@ -61,9 +61,19 @@ impl ChunkedArray {
.collect_vec();

let nchunks = chunk_offsets.len() - 1;
let length = *chunk_offsets.last().unwrap_or_else(|| {
unreachable!("Chunk ends is guaranteed to have at least one element")
}) as usize;
let length = *chunk_offsets
.last()
.vortex_expect("Chunk ends is guaranteed to have at least one element")
as usize;

let stats = chunks
.iter()
.map(|chunk| chunk.statistics().to_set())
.reduce(|mut acc, stats| {
acc.merge_ordered(&stats);
acc
})
.unwrap_or_default();

let mut children = Vec::with_capacity(chunks.len() + 1);
children.push(PrimitiveArray::from_vec(chunk_offsets, NonNullable).into_array());
Expand All @@ -74,7 +84,7 @@ impl ChunkedArray {
length,
ChunkedMetadata { nchunks },
children.into(),
StatsSet::default(),
stats,
)
}

Expand Down
2 changes: 2 additions & 0 deletions vortex-array/src/array/chunked/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ use crate::stats::{ArrayStatistics, ArrayStatisticsCompute, Stat, StatsSet};

impl ArrayStatisticsCompute for ChunkedArray {
fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
// For UncompressedSizeInBytes, the result is the sum of the chunks' uncompressed sizes.
// This ignores the `chunk_offsets` child array, so it won't exactly match self.nbytes().
Ok(self
.chunks()
.map(|c| {
Expand Down
6 changes: 5 additions & 1 deletion vortex-array/src/array/null/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,11 @@ impl ArrayValidity for NullArray {
}

impl ArrayStatisticsCompute for NullArray {
fn compute_statistics(&self, _stat: Stat) -> VortexResult<StatsSet> {
fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
if stat == Stat::UncompressedSizeInBytes {
return Ok(StatsSet::of(stat, self.nbytes()));
}

Ok(StatsSet::nulls(self.len(), &DType::Null))
}
}
Expand Down
10 changes: 7 additions & 3 deletions vortex-array/src/array/primitive/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,18 @@ use crate::array::primitive::PrimitiveArray;
use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet};
use crate::validity::{ArrayValidity, LogicalValidity};
use crate::variants::PrimitiveArrayTrait;
use crate::{ArrayDType, IntoArrayVariant};
use crate::{ArrayDType, ArrayTrait as _, IntoArrayVariant};

trait PStatsType: NativePType + Into<Scalar> + BitWidth {}

impl<T: NativePType + Into<Scalar> + BitWidth> PStatsType for T {}

impl ArrayStatisticsCompute for PrimitiveArray {
fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
if stat == Stat::UncompressedSizeInBytes {
return Ok(StatsSet::of(stat, self.nbytes()));
}

let mut stats = match_each_native_ptype!(self.ptype(), |$P| {
match self.logical_validity() {
LogicalValidity::AllValid(_) => self.maybe_null_slice::<$P>().compute_statistics(stat),
Expand Down Expand Up @@ -77,7 +81,7 @@ impl<T: PStatsType> ArrayStatisticsCompute for &[T] {
self.iter().skip(1).for_each(|next| stats.next(*next));
stats.finish()
}
Stat::TrueCount => StatsSet::default(),
Stat::TrueCount | Stat::UncompressedSizeInBytes => StatsSet::default(),
})
}
}
Expand All @@ -87,7 +91,7 @@ struct NullableValues<'a, T: PStatsType>(&'a [T], &'a BooleanBuffer);
impl<T: PStatsType> ArrayStatisticsCompute for NullableValues<'_, T> {
fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
let values = self.0;
if values.is_empty() || stat == Stat::TrueCount {
if values.is_empty() || stat == Stat::TrueCount || stat == Stat::UncompressedSizeInBytes {
return Ok(StatsSet::default());
}

Expand Down
18 changes: 16 additions & 2 deletions vortex-array/src/array/struct_/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use vortex_error::{vortex_bail, vortex_err, vortex_panic, VortexExpect as _, Vor

use crate::array::visitor::{AcceptArrayVisitor, ArrayVisitor};
use crate::encoding::ids;
use crate::stats::{ArrayStatisticsCompute, StatsSet};
use crate::stats::{ArrayStatistics, ArrayStatisticsCompute, Stat, StatsSet};
use crate::validity::{ArrayValidity, LogicalValidity, Validity, ValidityMetadata};
use crate::variants::{ArrayVariants, StructArrayTrait};
use crate::{
Expand Down Expand Up @@ -191,7 +191,21 @@ impl AcceptArrayVisitor for StructArray {
}
}

impl ArrayStatisticsCompute for StructArray {}
impl ArrayStatisticsCompute for StructArray {
    /// Computes the statistics this array answers directly.
    ///
    /// * `UncompressedSizeInBytes`: the sum of the uncompressed sizes reported by every
    ///   item of `self.children()`. The stat is left unset (empty `StatsSet`) when there
    ///   are no children or when any child reports no size.
    /// * `NullCount`: taken from this array's own validity over `self.len()` entries.
    /// * Any other stat yields an empty `StatsSet`.
    fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
        match stat {
            Stat::UncompressedSizeInBytes => {
                let mut child_sizes = self
                    .children()
                    .map(|child| child.statistics().compute_uncompressed_size_in_bytes());

                // No children at all: there is nothing to report.
                let Some(first) = child_sizes.next() else {
                    return Ok(StatsSet::default());
                };

                // Every child is still visited; a single unknown size makes the total unknown.
                let total = child_sizes.fold(first, |running, size| match (running, size) {
                    (Some(a), Some(b)) => Some(a + b),
                    _ => None,
                });

                Ok(match total {
                    Some(bytes) => StatsSet::of(stat, bytes),
                    None => StatsSet::default(),
                })
            }
            Stat::NullCount => {
                let nulls = self.validity().null_count(self.len())?;
                Ok(StatsSet::of(stat, nulls))
            }
            _ => Ok(StatsSet::default()),
        }
    }
}

#[cfg(test)]
mod test {
Expand Down
8 changes: 6 additions & 2 deletions vortex-array/src/array/varbin/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,14 @@ use vortex_error::VortexResult;
use crate::accessor::ArrayAccessor;
use crate::array::varbin::{varbin_scalar, VarBinArray};
use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet};
use crate::ArrayDType;
use crate::{ArrayDType, ArrayTrait as _};

impl ArrayStatisticsCompute for VarBinArray {
fn compute_statistics(&self, _stat: Stat) -> VortexResult<StatsSet> {
fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
if stat == Stat::UncompressedSizeInBytes {
return Ok(StatsSet::of(stat, self.nbytes()));
}

if self.is_empty() {
return Ok(StatsSet::default());
}
Expand Down
9 changes: 7 additions & 2 deletions vortex-array/src/array/varbinview/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,18 @@ use crate::accessor::ArrayAccessor;
use crate::array::varbin::compute_stats;
use crate::array::varbinview::VarBinViewArray;
use crate::stats::{ArrayStatisticsCompute, Stat, StatsSet};
use crate::ArrayDType;
use crate::{ArrayDType, ArrayTrait as _};

impl ArrayStatisticsCompute for VarBinViewArray {
fn compute_statistics(&self, _stat: Stat) -> VortexResult<StatsSet> {
fn compute_statistics(&self, stat: Stat) -> VortexResult<StatsSet> {
if stat == Stat::UncompressedSizeInBytes {
return Ok(StatsSet::of(stat, self.nbytes()));
}

if self.is_empty() {
return Ok(StatsSet::default());
}

self.with_iterator(|iter| compute_stats(iter, self.dtype()))
}
}
16 changes: 13 additions & 3 deletions vortex-array/src/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,15 @@ pub fn check_statistics_unchanged(arr: &ArrayData, compressed: &ArrayData) {
let _ = compressed;
#[cfg(debug_assertions)]
{
for (stat, value) in arr.statistics().to_set().into_iter() {
use crate::stats::Stat;

// RunCount is excluded from this check: merge_ordered assumes that a run is "broken" at every chunk boundary, which is a useful estimate but not guaranteed to be correct.
for (stat, value) in arr
.statistics()
.to_set()
.into_iter()
.filter(|(stat, _)| *stat != Stat::RunCount)
{
debug_assert_eq!(
compressed.statistics().get(stat),
Some(value.clone()),
Expand All @@ -68,7 +76,9 @@ pub fn check_statistics_unchanged(arr: &ArrayData, compressed: &ArrayData) {
}
}

/// Compute pruning stats for an array.
pub fn compute_pruning_stats(arr: &ArrayData) -> VortexResult<()> {
/// Eagerly computes the statistics wanted ahead of compression: `UncompressedSizeInBytes`
/// plus every stat listed in `PRUNING_STATS`.
///
/// Intended to be called by compressors immediately before compression occurs.
///
/// # Errors
///
/// Returns any error produced while computing the pruning stats.
pub fn compute_precompression_stats(arr: &ArrayData) -> VortexResult<()> {
    // The size value itself is not needed here; the call is made only so the stat gets computed.
    let _ = arr.statistics().compute_uncompressed_size_in_bytes();

    arr.statistics().compute_all(PRUNING_STATS)?;
    Ok(())
}
1 change: 1 addition & 0 deletions vortex-array/src/stats/flatbuffers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ impl WriteFlatBuffer for &dyn Statistics {
null_count: self.get_as_cast::<u64>(Stat::NullCount),
bit_width_freq,
trailing_zero_freq,
uncompressed_size_in_bytes: self.get_as_cast::<u64>(Stat::UncompressedSizeInBytes),
};

crate::flatbuffers::ArrayStats::create(fbb, stat_args)
Expand Down
14 changes: 12 additions & 2 deletions vortex-array/src/stats/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ pub enum Stat {
TrueCount,
/// The number of null values in the array
NullCount,
UncompressedSizeInBytes,
}

impl Stat {
Expand All @@ -59,6 +60,7 @@ impl Stat {
| Stat::Min
| Stat::TrueCount
| Stat::NullCount
| Stat::UncompressedSizeInBytes
)
}

Expand All @@ -81,6 +83,7 @@ impl Display for Stat {
Self::RunCount => write!(f, "run_count"),
Self::TrueCount => write!(f, "true_count"),
Self::NullCount => write!(f, "null_count"),
Self::UncompressedSizeInBytes => write!(f, "uncompressed_size_in_bytes"),
}
}
}
Expand All @@ -100,10 +103,13 @@ pub trait Statistics {
/// Compute all of the requested statistics (if not already present)
/// Returns a StatsSet with the requested stats and any additional available stats
fn compute_all(&self, stats: &[Stat]) -> VortexResult<StatsSet> {
let mut stats_set = self.to_set();
for stat in stats {
let _ = self.compute(*stat);
if let Some(s) = self.compute(*stat) {
stats_set.set(*stat, s)
}
}
Ok(self.to_set())
Ok(stats_set)
}
}

Expand Down Expand Up @@ -222,6 +228,10 @@ impl dyn Statistics + '_ {
pub fn compute_trailing_zero_freq(&self) -> Option<Vec<usize>> {
self.compute_as::<Vec<usize>>(Stat::TrailingZeroFreq)
}

/// Computes the `UncompressedSizeInBytes` statistic and returns it as a `usize`.
///
/// Delegates to `compute_as`; yields `None` when that call produces no value.
pub fn compute_uncompressed_size_in_bytes(&self) -> Option<usize> {
self.compute_as(Stat::UncompressedSizeInBytes)
}
}

pub fn trailing_zeros(array: &ArrayData) -> u8 {
Expand Down
11 changes: 10 additions & 1 deletion vortex-array/src/stats/statsset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,10 @@ impl StatsSet {
self.values[stat].as_ref()
}

fn get_as<T: for<'a> TryFrom<&'a Scalar, Error = VortexError>>(&self, stat: Stat) -> Option<T> {
pub fn get_as<T: for<'a> TryFrom<&'a Scalar, Error = VortexError>>(
&self,
stat: Stat,
) -> Option<T> {
self.get(stat).map(|v| {
T::try_from(v).unwrap_or_else(|err| {
vortex_panic!(
Expand Down Expand Up @@ -138,6 +141,7 @@ impl StatsSet {
Stat::RunCount => self.merge_run_count(other),
Stat::TrueCount => self.merge_true_count(other),
Stat::NullCount => self.merge_null_count(other),
Stat::UncompressedSizeInBytes => self.merge_uncompressed_size_in_bytes(other),
}
}

Expand All @@ -161,6 +165,7 @@ impl StatsSet {
Stat::Min => self.merge_min(other),
Stat::TrueCount => self.merge_true_count(other),
Stat::NullCount => self.merge_null_count(other),
Stat::UncompressedSizeInBytes => self.merge_uncompressed_size_in_bytes(other),
_ => vortex_panic!("Unrecognized commutative stat {}", s),
}
}
Expand Down Expand Up @@ -241,6 +246,10 @@ impl StatsSet {
self.merge_sum_stat(other, Stat::NullCount)
}

/// Merges `other`'s `UncompressedSizeInBytes` into `self`.
///
/// Delegates to `merge_sum_stat`, the same helper used for `NullCount`.
fn merge_uncompressed_size_in_bytes(&mut self, other: &Self) {
self.merge_sum_stat(other, Stat::UncompressedSizeInBytes)
}

fn merge_sum_stat(&mut self, other: &Self, stat: Stat) {
match (self.get_as::<usize>(stat), other.get_as::<usize>(stat)) {
(Some(nc1), Some(nc2)) => {
Expand Down
5 changes: 5 additions & 0 deletions vortex-array/src/view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,11 @@ impl Statistics for ViewedArrayData {
.trailing_zero_freq()
.map(|v| v.iter().collect_vec())
.map(|v| v.into()),
Stat::UncompressedSizeInBytes => self
.flatbuffer()
.stats()?
.uncompressed_size_in_bytes()
.map(u64::into),
}
}

Expand Down
1 change: 1 addition & 0 deletions vortex-flatbuffers/flatbuffers/vortex-array/array.fbs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ table ArrayStats {
null_count: uint64 = null;
bit_width_freq: [uint64];
trailing_zero_freq: [uint64];
uncompressed_size_in_bytes: uint64 = null;
}


Expand Down
Loading

0 comments on commit 18986c2

Please sign in to comment.