diff --git a/Cargo.lock b/Cargo.lock index b4f15d27ab..bf54441a92 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3145,6 +3145,7 @@ dependencies = [ "tokio", "vortex-alp", "vortex-array", + "vortex-buffer", "vortex-bytebool", "vortex-datetime-parts", "vortex-dict", diff --git a/docs/index.rst b/docs/index.rst index 3fba86d504..12a0e36a96 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -16,3 +16,4 @@ Vortex is an Apache Arrow-compatible toolkit for working with compressed array d dtype io expr + scalar diff --git a/docs/scalar.rst b/docs/scalar.rst new file mode 100644 index 0000000000..9fb3b26cfc --- /dev/null +++ b/docs/scalar.rst @@ -0,0 +1,6 @@ +Scalar Values +============= + +.. automodule:: vortex.scalar + :members: + :imported-members: diff --git a/pyvortex/Cargo.toml b/pyvortex/Cargo.toml index 4f7ae9dfd6..540740dadc 100644 --- a/pyvortex/Cargo.toml +++ b/pyvortex/Cargo.toml @@ -36,6 +36,7 @@ tokio = { workspace = true, features = ["fs"] } vortex-alp = { workspace = true } vortex-array = { workspace = true } +vortex-buffer = { workspace = true } vortex-bytebool = { workspace = true } vortex-datetime-parts = { workspace = true } vortex-dict = { workspace = true } diff --git a/pyvortex/python/vortex/__init__.py b/pyvortex/python/vortex/__init__.py index f6ec061509..d09266070b 100644 --- a/pyvortex/python/vortex/__init__.py +++ b/pyvortex/python/vortex/__init__.py @@ -1,9 +1,9 @@ from . 
import encoding from ._lib import __doc__ as module_docs -from ._lib import dtype, expr, io +from ._lib import dtype, expr, io, scalar __doc__ = module_docs del module_docs array = encoding.array -__all__ = ["array", dtype, expr, io, encoding] +__all__ = ["array", dtype, expr, io, encoding, scalar] diff --git a/pyvortex/src/array.rs b/pyvortex/src/array.rs index 7dcb9dd751..bff63bada4 100644 --- a/pyvortex/src/array.rs +++ b/pyvortex/src/array.rs @@ -2,15 +2,16 @@ use arrow::array::{Array as ArrowArray, ArrayRef}; use arrow::pyarrow::ToPyArrow; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; -use pyo3::types::{IntoPyDict, PyList}; +use pyo3::types::{IntoPyDict, PyInt, PyList}; use vortex::array::ChunkedArray; -use vortex::compute::unary::fill_forward; +use vortex::compute::unary::{fill_forward, scalar_at}; use vortex::compute::{compare, slice, take, Operator}; use vortex::{Array, ArrayDType, IntoCanonical}; use crate::dtype::PyDType; use crate::error::PyVortexError; use crate::python_repr::PythonRepr; +use crate::scalar::scalar_into_py; #[pyclass(name = "Array", module = "vortex", sequence, subclass)] /// An array of zero or more *rows* each with the same set of *columns*. @@ -326,6 +327,87 @@ impl PyArray { .map(|arr| PyArray { inner: arr }) } + /// Retrieve a row by its index. + /// + /// Parameters + /// ---------- + /// index : :class:`int` + /// The index of interest. Must be greater than or equal to zero and less than the length of + /// this array. + /// + /// Returns + /// ------- + /// one of :class:`int`, :class:`float`, :class:`bool`, :class:`vortex.scalar.Buffer`, :class:`vortex.scalar.BufferString`, :class:`vortex.scalar.VortexList`, :class:`vortex.scalar.VortexStruct` + /// If this array contains numbers or Booleans, this array returns the corresponding + /// primitive Python type, i.e. int, float, and bool. For structures and variable-length + /// data types, a zero-copy view of the underlying data is returned. 
+ /// + /// Examples + /// -------- + /// + /// Retrieve the last element from an array of integers: + /// + /// >>> vortex.encoding.array([10, 42, 999, 1992]).scalar_at(3) + /// 1992 + /// + /// Retrieve the third element from an array of strings: + /// + /// >>> array = vortex.encoding.array(["hello", "goodbye", "it", "is"]) + /// >>> array.scalar_at(2) + /// <vortex.BufferString object at ...> + /// + /// Vortex, by default, returns a view into the array's data. This avoids copying the data, + /// which can be expensive if done repeatedly. :meth:`.BufferString.into_python` forcibly copies + /// the scalar data into a Python data structure. + /// + /// >>> array.scalar_at(2).into_python() + /// 'it' + /// + /// Retrieve an element from an array of structures: + /// + /// >>> array = vortex.encoding.array([ + /// ... {'name': 'Joseph', 'age': 25}, + /// ... {'name': 'Narendra', 'age': 31}, + /// ... {'name': 'Angela', 'age': 33}, + /// ... None, + /// ... {'name': 'Mikhail', 'age': 57}, + /// ... ]) + /// >>> array.scalar_at(2).into_python() + /// {'age': 33, 'name': <vortex.BufferString object at ...>} + /// + /// Notice that :meth:`.VortexStruct.into_python` only copies one "layer" of data into + /// Python. If we want to ensure the entire structure is recursively copied into Python we can + /// specify ``recursive=True``: + /// + /// >>> array.scalar_at(2).into_python(recursive=True) + /// {'age': 33, 'name': 'Angela'} + /// + /// Retrieve a missing element from an array of structures: + /// + /// >>> array.scalar_at(3) is None + /// True + /// + /// Out of bounds accesses are prohibited: + /// + /// >>> vortex.encoding.array([10, 42, 999, 1992]).scalar_at(10) + /// Traceback (most recent call last): + /// ... + /// ValueError: index 10 out of bounds from 0 to 4 + /// ... + /// + /// Unlike Python, negative indices are not supported: + /// + /// >>> vortex.encoding.array([10, 42, 999, 1992]).scalar_at(-2) + /// Traceback (most recent call last): + /// ... 
+ /// OverflowError: can't convert negative int to unsigned + /// + fn scalar_at(&self, index: &Bound) -> PyResult { + scalar_at(&self.inner, index.extract()?) + .map_err(PyVortexError::map_err) + .and_then(|scalar| scalar_into_py(index.py(), scalar, false)) + } + /// Filter, permute, and/or repeat elements by their index. /// /// Parameters diff --git a/pyvortex/src/lib.rs b/pyvortex/src/lib.rs index d8629e6850..750e42aafd 100644 --- a/pyvortex/src/lib.rs +++ b/pyvortex/src/lib.rs @@ -3,6 +3,7 @@ use array::PyArray; use expr::PyExpr; use pyo3::prelude::*; +use scalar::{PyBuffer, PyBufferString, PyVortexList, PyVortexStruct}; mod array; mod compress; @@ -12,6 +13,7 @@ mod error; mod expr; mod io; mod python_repr; +mod scalar; /// Vortex is an Apache Arrow-compatible toolkit for working with compressed array data. #[pymodule] @@ -50,5 +52,13 @@ fn _lib(py: Python, m: &Bound) -> PyResult<()> { expr.add_function(wrap_pyfunction!(expr::column, m)?)?; expr.add_class::()?; + let scalar = PyModule::new_bound(py, "scalar")?; + m.add_submodule(&scalar)?; + + scalar.add_class::()?; + scalar.add_class::()?; + scalar.add_class::()?; + scalar.add_class::()?; + Ok(()) } diff --git a/pyvortex/src/scalar.rs b/pyvortex/src/scalar.rs new file mode 100644 index 0000000000..abee1bf5dc --- /dev/null +++ b/pyvortex/src/scalar.rs @@ -0,0 +1,264 @@ +//! Views into arrays of individual values. +//! +//! Vortex, like Arrow, avoids copying data. The classes in this package are returned by +//! :meth:`.Array.scalar_at`. They represent shared-memory views into individual values of a Vortex +//! array. 
+ +use std::sync::Arc; + +use pyo3::prelude::*; +use pyo3::types::PyDict; +use vortex_buffer::{Buffer, BufferString}; +use vortex_dtype::{DType, StructDType}; +use vortex_error::vortex_panic; +use vortex_scalar::{PValue, Scalar, ScalarValue}; + +pub fn scalar_into_py(py: Python, x: Scalar, copy_into_python: bool) -> PyResult { + let (value, dtype) = x.into_parts(); + scalar_value_into_py(py, value, &dtype, copy_into_python) +} + +pub fn scalar_value_into_py( + py: Python, + x: ScalarValue, + dtype: &DType, + copy_into_python: bool, +) -> PyResult { + match x { + ScalarValue::Bool(x) => Ok(x.into_py(py)), + ScalarValue::Primitive(PValue::U8(x)) => Ok(x.into_py(py)), + ScalarValue::Primitive(PValue::U16(x)) => Ok(x.into_py(py)), + ScalarValue::Primitive(PValue::U32(x)) => Ok(x.into_py(py)), + ScalarValue::Primitive(PValue::U64(x)) => Ok(x.into_py(py)), + ScalarValue::Primitive(PValue::I8(x)) => Ok(x.into_py(py)), + ScalarValue::Primitive(PValue::I16(x)) => Ok(x.into_py(py)), + ScalarValue::Primitive(PValue::I32(x)) => Ok(x.into_py(py)), + ScalarValue::Primitive(PValue::I64(x)) => Ok(x.into_py(py)), + ScalarValue::Primitive(PValue::F16(x)) => Ok(x.to_f32().into_py(py)), + ScalarValue::Primitive(PValue::F32(x)) => Ok(x.into_py(py)), + ScalarValue::Primitive(PValue::F64(x)) => Ok(x.into_py(py)), + ScalarValue::Buffer(x) => { + if copy_into_python { + Ok(x.into_py(py)) + } else { + PyBuffer::new_pyobject(py, x) + } + } + ScalarValue::BufferString(x) => { + if copy_into_python { + Ok(x.into_py(py)) + } else { + PyBufferString::new_pyobject(py, x) + } + } + ScalarValue::List(x) => match dtype { + DType::List(dtype, ..) => { + if copy_into_python { + to_python_list(py, &x, dtype, true) + } else { + PyVortexList::new_pyobject(py, x, dtype.clone()) + } + } + DType::Struct(dtype, ..) 
=> { + if copy_into_python { + to_python_dict(py, &x, dtype, true) + } else { + PyVortexStruct::new_pyobject(py, x, dtype.clone()) + } + } + _ => vortex_panic!("impossible"), + }, + ScalarValue::Null => Ok(py.None()), + } +} + +#[pyclass(name = "Buffer", module = "vortex", sequence, subclass)] +/// A view of binary data from a Vortex array. +pub struct PyBuffer { + inner: Buffer, +} + +impl PyBuffer { + pub fn new(inner: Buffer) -> PyBuffer { + PyBuffer { inner } + } + + pub fn new_bound(py: Python, inner: Buffer) -> PyResult> { + Bound::new(py, Self::new(inner)) + } + + pub fn new_pyobject(py: Python, inner: Buffer) -> PyResult { + let bound = Bound::new(py, Self::new(inner))?; + Ok(bound.into_py(py)) + } + + pub fn unwrap(&self) -> &Buffer { + &self.inner + } +} + +#[pymethods] +impl PyBuffer { + /// Copy this buffer from array memory into a Python bytes. + #[pyo3(signature = (*, recursive = false))] + #[allow(unused_variables)] // we want the same Python name across all methods + pub fn into_python(self_: PyRef, recursive: bool) -> PyResult { + Ok(self_.inner.into_py(self_.py())) + } +} + +#[pyclass(name = "BufferString", module = "vortex", sequence, subclass)] +/// A view of UTF-8 data from a Vortex array. +pub struct PyBufferString { + inner: BufferString, +} + +impl PyBufferString { + pub fn new(inner: BufferString) -> PyBufferString { + PyBufferString { inner } + } + + pub fn new_bound(py: Python, inner: BufferString) -> PyResult> { + Bound::new(py, Self::new(inner)) + } + + pub fn new_pyobject(py: Python, inner: BufferString) -> PyResult { + let bound = Bound::new(py, Self::new(inner))?; + Ok(bound.into_py(py)) + } + + pub fn unwrap(&self) -> &BufferString { + &self.inner + } +} + +#[pymethods] +impl PyBufferString { + /// Copy this buffer string from array memory into a Python str. 
+ #[pyo3(signature = (*, recursive = false))] + #[allow(unused_variables)] // we want the same Python name across all methods + pub fn into_python(self_: PyRef, recursive: bool) -> PyResult { + Ok(self_.inner.into_py(self_.py())) + } +} + +#[pyclass(name = "VortexList", module = "vortex", sequence, subclass)] +/// A view of a variable-length list of data from a Vortex array. +pub struct PyVortexList { + inner: Arc<[ScalarValue]>, + dtype: Arc, +} + +impl PyVortexList { + pub fn new(inner: Arc<[ScalarValue]>, dtype: Arc) -> PyVortexList { + PyVortexList { inner, dtype } + } + + pub fn new_bound( + py: Python, + inner: Arc<[ScalarValue]>, + dtype: Arc, + ) -> PyResult> { + Bound::new(py, Self::new(inner, dtype)) + } + + pub fn new_pyobject( + py: Python, + inner: Arc<[ScalarValue]>, + dtype: Arc, + ) -> PyResult { + let bound = Bound::new(py, Self::new(inner, dtype))?; + Ok(bound.into_py(py)) + } + + pub fn unwrap(&self) -> &Arc<[ScalarValue]> { + &self.inner + } +} + +#[pymethods] +impl PyVortexList { + /// Copy the elements of this list from array memory into a list of Python objects. + #[pyo3(signature = (*, recursive = false))] + pub fn into_python(self_: PyRef, recursive: bool) -> PyResult { + to_python_list(self_.py(), &self_.inner, &self_.dtype, recursive) + } +} + +fn to_python_list( + py: Python, + values: &[ScalarValue], + dtype: &DType, + recursive: bool, +) -> PyResult { + Ok(values + .iter() + .cloned() + .map(|x| scalar_value_into_py(py, x, dtype, recursive)) + .collect::, _>>()? + .into_py(py)) +} + +#[pyclass(name = "VortexStruct", module = "vortex", sequence, subclass)] +/// A view of structured data from a Vortex array. 
+pub struct PyVortexStruct { + inner: Arc<[ScalarValue]>, + dtype: StructDType, +} + +impl PyVortexStruct { + pub fn new(inner: Arc<[ScalarValue]>, dtype: StructDType) -> PyVortexStruct { + PyVortexStruct { inner, dtype } + } + + pub fn new_bound( + py: Python, + inner: Arc<[ScalarValue]>, + dtype: StructDType, + ) -> PyResult> { + Bound::new(py, Self::new(inner, dtype)) + } + + pub fn new_pyobject( + py: Python, + inner: Arc<[ScalarValue]>, + dtype: StructDType, + ) -> PyResult { + let bound = Bound::new(py, Self::new(inner, dtype))?; + Ok(bound.into_py(py)) + } + + pub fn unwrap(&self) -> &Arc<[ScalarValue]> { + &self.inner + } +} + +#[pymethods] +impl PyVortexStruct { + #[pyo3(signature = (*, recursive = false))] + /// Copy the fields of this struct from array memory into a Python dict keyed by field name. + pub fn into_python(self_: PyRef, recursive: bool) -> PyResult { + to_python_dict(self_.py(), &self_.inner, &self_.dtype, recursive) + } +} + +fn to_python_dict( + py: Python, + values: &[ScalarValue], + dtype: &StructDType, + recursive: bool, +) -> PyResult { + let dict = PyDict::new_bound(py); + for ((child, name), dtype) in values + .iter() + .cloned() + .zip(dtype.names().iter()) + .zip(dtype.dtypes().iter()) + { + dict.set_item( + name.to_string(), + scalar_value_into_py(py, child, dtype, recursive)?, + )? + } + Ok(dict.into_py(py)) +} diff --git a/vortex-scalar/src/lib.rs b/vortex-scalar/src/lib.rs index e2a46b1aae..f535cc1112 100644 --- a/vortex-scalar/src/lib.rs +++ b/vortex-scalar/src/lib.rs @@ -60,6 +60,11 @@ impl Scalar { self.value } + #[inline] + pub fn into_parts(self) -> (ScalarValue, DType) { + (self.value, self.dtype) + } + pub fn is_valid(&self) -> bool { !self.value.is_null() }