diff --git a/.github/workflows/python-docs.yml b/.github/workflows/docs.yml
similarity index 92%
rename from .github/workflows/python-docs.yml
rename to .github/workflows/docs.yml
index fc0f0adc1f..4dac3655b2 100644
--- a/.github/workflows/python-docs.yml
+++ b/.github/workflows/docs.yml
@@ -1,4 +1,4 @@
-name: Python docs
+name: Python & Rust docs
on:
push:
@@ -31,6 +31,7 @@ jobs:
built_sha=$(git rev-parse HEAD)
+ rm -rf docs/_build/html/rust/CACHETAG.DIR docs/_build/html/rust/debug
mv docs/_build/html /tmp/html
git fetch origin
diff --git a/docs/Makefile b/docs/Makefile
index 1b1224a5b8..122224bb74 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -18,3 +18,9 @@ help:
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+html: rust-html
+
+.PHONY: rust-html
+rust-html:
+ cargo doc --no-deps --workspace --all-features --target-dir $(BUILDDIR)/html/rust
diff --git a/docs/_static/example.parquet b/docs/_static/example.parquet
new file mode 100644
index 0000000000..46c86166bf
Binary files /dev/null and b/docs/_static/example.parquet differ
diff --git a/docs/_static/example.vortex b/docs/_static/example.vortex
new file mode 100644
index 0000000000..796654f2a7
Binary files /dev/null and b/docs/_static/example.vortex differ
diff --git a/docs/_static/file-format-2024-10-23-1642.svg b/docs/_static/file-format-2024-10-23-1642.svg
new file mode 100644
index 0000000000..46eb3b5e5d
--- /dev/null
+++ b/docs/_static/file-format-2024-10-23-1642.svg
@@ -0,0 +1,10 @@
+
\ No newline at end of file
diff --git a/docs/_static/style.css b/docs/_static/style.css
new file mode 100644
index 0000000000..2d137b32a9
--- /dev/null
+++ b/docs/_static/style.css
@@ -0,0 +1,3 @@
+html .pst-navbar-icon {
+ font-size: 1.5rem;
+}
diff --git a/docs/_static/vortex_spiral_logo.svg b/docs/_static/vortex_spiral_logo.svg
new file mode 100644
index 0000000000..026901c94f
--- /dev/null
+++ b/docs/_static/vortex_spiral_logo.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/docs/_static/vortex_spiral_logo_dark_theme.svg b/docs/_static/vortex_spiral_logo_dark_theme.svg
new file mode 100644
index 0000000000..0c4d52bab2
--- /dev/null
+++ b/docs/_static/vortex_spiral_logo_dark_theme.svg
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file
diff --git a/docs/dataset.rst b/docs/api/dataset.rst
similarity index 77%
rename from docs/dataset.rst
rename to docs/api/dataset.rst
index 848e6592ca..16d564868d 100644
--- a/docs/dataset.rst
+++ b/docs/api/dataset.rst
@@ -6,5 +6,15 @@ query engines like DuckDB and Polars. In particular, Vortex will read data propo
number of rows passing a filter condition and the number of columns in a selection. For most Vortex
encodings, this property holds true even when the filter condition specifies a single row.
+.. autosummary::
+ :nosignatures:
+
+ ~vortex.dataset.VortexDataset
+ ~vortex.dataset.VortexScanner
+
+.. raw:: html
+
+
+
.. automodule:: vortex.dataset
:members:
diff --git a/docs/api/dtype.rst b/docs/api/dtype.rst
new file mode 100644
index 0000000000..4f529feea9
--- /dev/null
+++ b/docs/api/dtype.rst
@@ -0,0 +1,27 @@
+Array Data Types
+================
+
+The logical types of the elements of an Array. Each logical type is implemented by a variety of
+Array encodings, which describe both a representation as bytes and how to apply operations on
+that representation.
+
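+For example, building an array from Python integers yields the non-nullable 64-bit integer type,
+while including a :obj:`None` yields the nullable variant (a minimal illustration mirroring the
+examples in the guide):
+
+.. doctest::
+
+ >>> import vortex
+ >>> vortex.array([1, 2, 3]).dtype
+ int(64, False)
+ >>> vortex.array([1, None, 2, 3]).dtype
+ int(64, True)
+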
+.. autosummary::
+ :nosignatures:
+
+ ~vortex.dtype.DType
+ ~vortex.dtype.binary
+ ~vortex.dtype.bool
+ ~vortex.dtype.float
+ ~vortex.dtype.int
+ ~vortex.dtype.null
+ ~vortex.dtype.uint
+ ~vortex.dtype.utf8
+
+.. raw:: html
+
+
+
+.. automodule:: vortex.dtype
+ :members:
+ :imported-members:
+
diff --git a/docs/api/encoding.rst b/docs/api/encoding.rst
new file mode 100644
index 0000000000..3ec5cb449c
--- /dev/null
+++ b/docs/api/encoding.rst
@@ -0,0 +1,26 @@
+Arrays
+======
+
+A Vortex array is a possibly compressed ordered set of homogeneously typed values. Each array has a
+logical type and a physical encoding. The logical type describes the set of operations applicable to
+the values of this array. The physical encoding describes how this array is realized in memory, on
+disk, and over the wire and how to apply operations to that realization.
+
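+For example, an array of one thousand copies of ``42`` keeps its logical type, 64-bit integers,
+while :func:`~vortex.encoding.compress` swaps its physical encoding for a constant encoding. A
+small sketch reusing the values from the :func:`~vortex.encoding.compress` examples:
+
+.. doctest::
+
+ >>> import vortex
+ >>> a = vortex.array([42 for _ in range(1000)])
+ >>> a.dtype
+ int(64, False)
+ >>> str(vortex.compress(a))
+ 'vortex.constant(0x09)(i64, len=1000)'
+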
+.. autosummary::
+ :nosignatures:
+
+ ~vortex.encoding.array
+ ~vortex.encoding.compress
+ ~vortex.encoding.Array
+
+.. raw:: html
+
+
+
+.. autofunction:: vortex.encoding.array
+
+.. autofunction:: vortex.encoding.compress
+
+.. autoclass:: vortex.encoding.Array
+ :members:
+ :special-members: __len__
diff --git a/docs/api/expr.rst b/docs/api/expr.rst
new file mode 100644
index 0000000000..3fd6ab3390
--- /dev/null
+++ b/docs/api/expr.rst
@@ -0,0 +1,26 @@
+Expressions
+===========
+
+Vortex expressions represent simple filtering conditions on the rows of a Vortex array. For example,
+the following expression represents the set of rows for which the `age` column lies between 23 and
+55:
+
+.. doctest::
+
+ >>> import vortex
+ >>> age = vortex.expr.column("age")
+ >>> (23 < age) & (age < 55) # doctest: +SKIP
+
+.. autosummary::
+ :nosignatures:
+
+ ~vortex.expr.column
+ ~vortex.expr.Expr
+
+.. raw:: html
+
+
+
+.. autofunction:: vortex.expr.column
+
+.. autoclass:: vortex.expr.Expr
diff --git a/docs/api/index.rst b/docs/api/index.rst
new file mode 100644
index 0000000000..b67c96ad6e
--- /dev/null
+++ b/docs/api/index.rst
@@ -0,0 +1,12 @@
+Python API
+==========
+
+.. toctree::
+ :maxdepth: 5
+
+ encoding
+ dtype
+ io
+ dataset
+ expr
+ scalar
diff --git a/docs/api/io.rst b/docs/api/io.rst
new file mode 100644
index 0000000000..1dee8dea5d
--- /dev/null
+++ b/docs/api/io.rst
@@ -0,0 +1,20 @@
+Input and Output
+================
+
+Vortex arrays support reading and writing to local and remote file systems, including plain-old
+HTTP, S3, Google Cloud Storage, and Azure Blob Storage.
+
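+A minimal round trip through the local filesystem, assuming a small struct array and an
+illustrative file name:
+
+.. doctest::
+
+ >>> import vortex
+ >>> vtx = vortex.array([{'x': 1}, {'x': 2}, {'x': 10}])
+ >>> vortex.io.write_path(vtx, "example_io.vortex")
+ >>> vortex.io.read_path("example_io.vortex").to_pylist()
+ [{'x': 1}, {'x': 2}, {'x': 10}]
+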
+.. autosummary::
+ :nosignatures:
+
+ ~vortex.io.read_path
+ ~vortex.io.read_url
+ ~vortex.io.write_path
+
+.. raw:: html
+
+
+
+.. automodule:: vortex.io
+ :members:
+ :imported-members:
diff --git a/docs/api/scalar.rst b/docs/api/scalar.rst
new file mode 100644
index 0000000000..288673a5c1
--- /dev/null
+++ b/docs/api/scalar.rst
@@ -0,0 +1,25 @@
+Scalars
+=======
+
+A scalar is a single atomic value like the integer ``1``, the string ``"hello"``, or the structure
+``{"age": 55, "name": "Angela"}``. The :meth:`.Array.scalar_at` method
+returns a native Python value when the cost of doing so is small. However, for larger values like
+binary data, UTF-8 strings, variable-length lists, and structures, Vortex returns a zero-copy *view*
+of the Array data. The ``into_python`` method of each view will copy the scalar into a native Python
+value.
+
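+For example, an integer element comes back as a plain Python :class:`int`, while a string element
+comes back as a :class:`~vortex.scalar.BufferString` view whose ``into_python`` method copies it
+into a :class:`str` (a brief sketch):
+
+.. doctest::
+
+ >>> import vortex
+ >>> vortex.array([10, 42, 999, 1992]).scalar_at(3)
+ 1992
+ >>> vortex.array(["hello", "goodbye", "it", "is"]).scalar_at(2).into_python()
+ 'it'
+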
+.. autosummary::
+ :nosignatures:
+
+ ~vortex.scalar.Buffer
+ ~vortex.scalar.BufferString
+ ~vortex.scalar.VortexList
+ ~vortex.scalar.VortexStruct
+
+.. raw:: html
+
+
+
+.. automodule:: vortex.scalar
+ :members:
+ :imported-members:
diff --git a/docs/conf.py b/docs/conf.py
index 719854b5e1..0fe652707d 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -15,8 +15,11 @@
extensions = [
"sphinx.ext.autodoc",
- "sphinx.ext.intersphinx",
+ "sphinx.ext.autosummary",
"sphinx.ext.doctest",
+ "sphinx.ext.intersphinx",
+ "sphinx.ext.napoleon",
+ "sphinx_design",
]
templates_path = ["_templates"]
@@ -24,10 +27,10 @@
intersphinx_mapping = {
"python": ("https://docs.python.org/3", None),
- "pyarrow": ("https://arrow.apache.org/docs/", None),
- "pandas": ("https://pandas.pydata.org/docs/", None),
- "numpy": ("https://numpy.org/doc/stable/", None),
- "polars": ("https://docs.pola.rs/api/python/stable/", None),
+ "pyarrow": ("https://arrow.apache.org/docs", None),
+ "pandas": ("https://pandas.pydata.org/docs", None),
+ "numpy": ("https://numpy.org/doc/stable", None),
+ "polars": ("https://docs.pola.rs/api/python/stable", None),
}
nitpicky = True # ensures all :class:, :obj:, etc. links are valid
@@ -38,4 +41,37 @@
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
html_theme = "pydata_sphinx_theme"
-# html_static_path = ['_static'] # no static files yet
+html_static_path = ["_static"]
+html_css_files = ["style.css"] # relative to _static/
+
+# -- Options for PyData Theme ------------------------------------------------
+html_theme_options = {
+ "show_toc_level": 2,
+ "logo": {
+ "alt_text": "The Vortex logo.",
+ "text": "Vortex",
+ "image_light": "_static/vortex_spiral_logo.svg",
+ "image_dark": "_static/vortex_spiral_logo_dark_theme.svg",
+ },
+ "icon_links": [
+ {
+ "name": "GitHub",
+ "url": "https://github.com/spiraldb/vortex",
+ "icon": "fa-brands fa-github",
+ "type": "fontawesome",
+ },
+ {
+ "name": "PyPI",
+ "url": "https://pypi.org/project/vortex-array",
+ "icon": "fa-brands fa-python",
+ "type": "fontawesome",
+ },
+ ],
+ "header_links_before_dropdown": 3,
+}
+html_sidebars = {
+ # hide the primary (left-hand) sidebar on pages without sub-pages
+ "quickstart": [],
+ "guide": [],
+ "file_format": [],
+}
diff --git a/docs/dtype.rst b/docs/dtype.rst
deleted file mode 100644
index 9c30bc80b9..0000000000
--- a/docs/dtype.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Array Data Types
-================
-
-.. automodule:: vortex.dtype
- :members:
- :imported-members:
-
diff --git a/docs/encoding.rst b/docs/encoding.rst
deleted file mode 100644
index 8448777fba..0000000000
--- a/docs/encoding.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-Arrays
-======
-
-.. automodule:: vortex.encoding
- :members:
- :imported-members:
- :special-members: __len__
diff --git a/docs/expr.rst b/docs/expr.rst
deleted file mode 100644
index 854aec35ce..0000000000
--- a/docs/expr.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-Row Filter Expressions
-======================
-
-.. automodule:: vortex.expr
- :members:
- :imported-members:
diff --git a/docs/file_format.rst b/docs/file_format.rst
new file mode 100644
index 0000000000..322bcd503b
--- /dev/null
+++ b/docs/file_format.rst
@@ -0,0 +1,79 @@
+File Format
+===========
+
+Intuition
+---------
+
+The Vortex file format has both *layouts*, which describe how different chunks of columns are stored
+relative to one another, and *encodings*, which describe the byte representation of a contiguous
+sequence of values. A layout describes how to contiguously store one or more arrays as is necessary
+for storing an array on disk or transmitting it over the wire. An encoding defines one binary
+representation for memory, disk, and the wire.
+
+.. _file-format--layouts:
+
+Layouts
+^^^^^^^
+
+Vortex arrays have the same binary representation in-memory, on-disk, and over-the-wire; however,
+all the rows of all the columns are not necessarily contiguously laid out. Vortex has three kinds of
+*layouts* which recursively compose: the *flat layout*, the *column layout*, and the *chunked
+layout*.
+
+The flat layout is a contiguous sequence of bytes. Any Vortex array encoding can be serialized into
+the flat layout.
+
+The column layout lays out each column of a struct-typed array as a separate sequence of bytes. Each
+column may or may not recursively use a chunked layout. Column layouts permit readers to push-down
+column projections.
+
+The chunked layout lays out an array as a sequence of row chunks. Each chunk may have a different
+size. A chunked layout permits readers to push down row filters based on statistics, which we describe
+later. Note that, if the laid out array is a struct array, each column uses the same chunk
+size. This is equivalent to Parquet's row groups.
+
+A few examples of concrete layouts:
+
+1. Chunked of struct of chunked of flat: essentially a Parquet layout with row groups in which each
+ column's values are contiguously stored in pages.
+2. Struct of chunked of flat: eliminates row groups, retaining only pages.
+3. Struct of flat: prevents row filter push-down because each array is, to the layout, an opaque
+ sequence of bytes.
+
+The chunked layout stores, per chunk, metadata necessary for effective row filtering such as
+sortedness, constancy, the minimum value, the maximum value, and the number of null rows. Readers
+consult these metadata tables to avoid reading chunks without relevant data.
+
+.. card::
+
+ .. figure:: _static/file-format-2024-10-23-1642.svg
+ :width: 800px
+ :alt: A schematic of the file format
+
+ +++
+
+ The Vortex file format has five sections: data, statistics, schema, footer, and postscript. The
+ postscript describes the location of the schema and layout, which in turn describe how to
+ interpret the data and metadata. The schema describes the logical type. The metadata contains
+ information necessary for row filtering.
+
+.. _included-codecs:
+
+Encodings
+^^^^^^^^^
+
+- Most of the Arrow encodings.
+- Chunked, a sequence of arrays.
+- Constant, a value and a length.
+- Sparse, a value plus a pair of arrays representing exceptions: an array of indices and of values.
+- FastLanes Frame-of-Reference, BitPacking, and Delta.
+- Fast Static Symbol Table (FSST).
+- Adaptive Lossless Floating Point (ALP).
+- ALP Real Double (ALP-RD).
+- ByteBool, one byte per Boolean value.
+- ZigZag.
+
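+The sampling compressor chooses among these encodings automatically. As a small sketch, reusing the
+examples from :func:`~vortex.encoding.compress`: an array of increasing integers compresses to a
+FastLanes frame-of-reference encoding, while increasing floats with occasional nulls compress to
+ALP:
+
+.. doctest::
+
+ >>> import vortex
+ >>> str(vortex.compress(vortex.array(list(range(1000)))))
+ 'fastlanes.for(0x17)(i64, len=1000)'
+ >>> str(vortex.compress(vortex.array([float(x) if x % 20 != 0 else None for x in range(1000)])))
+ 'vortex.alp(0x11)(f64?, len=1000)'
+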
+Specification
+-------------
+
+TODO!
diff --git a/docs/guide.rst b/docs/guide.rst
new file mode 100644
index 0000000000..068d12e708
--- /dev/null
+++ b/docs/guide.rst
@@ -0,0 +1,173 @@
+Guide
+=====
+
+.. admonition:: Rustaceans
+
+ See the `Vortex Rust documentation <https://spiraldb.github.io/vortex/docs2/rust/doc/vortex>`_ for details on Vortex in Rust.
+
+Python
+------
+
+Construct a Vortex array from lists of simple Python values:
+
+.. doctest::
+
+ >>> import vortex
+ >>> vtx = vortex.array([1, 2, 3, 4])
+ >>> vtx.dtype
+ int(64, False)
+
+Python's :obj:`None` represents a missing or null value and changes the dtype of the array from
+non-nullable 64-bit integers to nullable 64-bit integers:
+
+.. doctest::
+
+ >>> vtx = vortex.array([1, 2, None, 4])
+ >>> vtx.dtype
+ int(64, True)
+
+A list of :class:`dict` is converted to an array of structures. Missing values may appear at any
+level:
+
+.. doctest::
+
+ >>> vtx = vortex.array([
+ ... {'name': 'Joseph', 'age': 25},
+ ... {'name': None, 'age': 31},
+ ... {'name': 'Angela', 'age': None},
+ ... {'name': 'Mikhail', 'age': 57},
+ ... {'name': None, 'age': None},
+ ... None,
+ ... ])
+ >>> vtx.dtype
+ struct({"age": int(64, True), "name": utf8(True)}, True)
+
+:meth:`.Array.to_pylist` converts a Vortex array into a list of Python values.
+
+.. doctest::
+
+ >>> vtx.to_pylist()
+ [{'age': 25, 'name': 'Joseph'}, {'age': 31, 'name': None}, {'age': None, 'name': 'Angela'}, {'age': 57, 'name': 'Mikhail'}, {'age': None, 'name': None}, {'age': None, 'name': None}]
+
+Arrow
+^^^^^
+
+The :func:`~vortex.encoding.array` function constructs a Vortex array from an Arrow one without any
+copies:
+
+.. doctest::
+
+ >>> import pyarrow as pa
+ >>> arrow = pa.array([1, 2, None, 3])
+ >>> arrow.type
+ DataType(int64)
+ >>> vtx = vortex.array(arrow)
+ >>> vtx.dtype
+ int(64, True)
+
+:meth:`.Array.to_arrow_array` converts back to an Arrow array:
+
+.. doctest::
+
+ >>> vtx.to_arrow_array()
+
+ [
+ 1,
+ 2,
+ null,
+ 3
+ ]
+
+If you have a struct array, use :meth:`.Array.to_arrow_table` to construct an Arrow table:
+
+.. doctest::
+
+ >>> struct_vtx = vortex.array([
+ ... {'name': 'Joseph', 'age': 25},
+ ... {'name': 'Narendra', 'age': 31},
+ ... {'name': 'Angela', 'age': 33},
+ ... {'name': 'Mikhail', 'age': 57},
+ ... ])
+ >>> struct_vtx.to_arrow_table()
+ pyarrow.Table
+ age: int64
+ name: string_view
+ ----
+ age: [[25,31,33,57]]
+ name: [["Joseph","Narendra","Angela","Mikhail"]]
+
+Pandas
+^^^^^^
+
+:meth:`.Array.to_pandas_df` converts a Vortex array into a Pandas DataFrame:
+
+.. doctest::
+
+ >>> df = struct_vtx.to_pandas_df()
+ >>> df
+ age name
+ 0 25 Joseph
+ 1 31 Narendra
+ 2 33 Angela
+ 3 57 Mikhail
+
+:func:`~vortex.encoding.array` converts from a Pandas DataFrame into a Vortex array:
+
+ >>> vortex.array(df).to_arrow_table()
+ pyarrow.Table
+ age: int64
+ name: string_view
+ ----
+ age: [[25,31,33,57]]
+ name: [["Joseph","Narendra","Angela","Mikhail"]]
+
+
+.. _query-engine-integration:
+
+Query Engines
+-------------
+
+:class:`~vortex.dataset.VortexDataset` implements the :class:`pyarrow.dataset.Dataset` API which
+enables many Python-based query engines to pushdown row filters and column projections on Vortex
+files.
+
+Polars
+^^^^^^
+
+ >>> import polars as pl
+ >>> ds = vortex.dataset.from_path(
+ ... '_static/example.vortex'
+ ... )
+ >>> lf = pl.scan_pyarrow_dataset(ds)
+ >>> lf = lf.select('tip_amount', 'fare_amount')
+ >>> lf = lf.head(3)
+ >>> lf.collect()
+ shape: (3, 2)
+ ┌────────────┬─────────────┐
+ │ tip_amount ┆ fare_amount │
+ │ --- ┆ --- │
+ │ f64 ┆ f64 │
+ ╞════════════╪═════════════╡
+ │ 0.0 ┆ 61.8 │
+ │ 5.1 ┆ 20.5 │
+ │ 16.54 ┆ 70.0 │
+ └────────────┴─────────────┘
+
+DuckDB
+^^^^^^
+
+ >>> import duckdb
+ >>> ds = vortex.dataset.from_path(
+ ... '_static/example.vortex'
+ ... )
+ >>> duckdb.sql('select ds.tip_amount, ds.fare_amount from ds limit 3').show()
+ ┌────────────┬─────────────┐
+ │ tip_amount │ fare_amount │
+ │ double │ double │
+ ├────────────┼─────────────┤
+ │ 0.0 │ 61.8 │
+ │ 5.1 │ 20.5 │
+ │ 16.54 │ 70.0 │
+ └────────────┴─────────────┘
+
+
diff --git a/docs/index.rst b/docs/index.rst
index c89a19c9a3..2a2d9e9232 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -3,18 +3,68 @@
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
-Vortex documentation
-====================
+Wide, Fast & Compact. Pick Three.
+==================================
-Vortex is an Apache Arrow-compatible toolkit for working with compressed array data.
+.. grid:: 1 1 2 2
+ :gutter: 4 4 4 4
+
+ .. grid-item-card:: The File Format
+ :link: file_format
+ :link-type: doc
+
+ Currently just a schematic. Specification forthcoming.
+
+ .. grid-item-card:: The Rust API
+ :link: https://spiraldb.github.io/vortex/docs2/rust/doc/vortex
+
+ The primary interface to the Vortex toolkit.
+
+ .. grid-item-card:: Quickstart
+ :link: quickstart
+ :link-type: doc
+
+ For end-users looking to read and write Vortex files.
+
+ .. grid-item-card:: The Benchmarks
+ :link: https://bench.vortex.dev/
+
+ Random access, throughput, and TPC-H.
+
+
+Vortex is a fast & extensible columnar file format that is based around state-of-the-art research
+from the database community. It is built around cascading compression with lightweight encodings (no
+block compression), allowing for both efficient random access and extremely fast decompression.
+
+Vortex also includes an accompanying in-memory format for these (recursively) compressed arrays
+that is zero-copy compatible with Apache Arrow in uncompressed form. Taken together, the Vortex
+library is a useful toolkit for working with compressed Arrow data in-memory, on-disk, & over-the-wire.
+
+Vortex aspires to succeed Apache Parquet by pushing the Pareto frontier outwards: 1-2x faster
+writes, 2-10x faster scans, and 100-200x faster random access reads, while preserving the same
+approximate compression ratio as Parquet v2 with zstd.
+
+Its features include:
+
+- A zero-copy data layout for disk, memory, and the wire.
+- Kernels for computing on, filtering, slicing, indexing, and projecting compressed arrays.
+- Builtin state-of-the-art codecs including FastLanes (integer bit-packing), ALP (floating point),
+ and FSST (strings).
+- Support for custom user-implemented codecs.
+- Support for, but no requirement for, row groups.
+- A read sub-system supporting filter and projection pushdown.
+
+Vortex's flexible layout empowers writers to choose the right layout for their setting: fast writes,
+fast reads, small files, few columns, many columns, over-sized columns, etc.
+
+Documentation
+-------------
.. toctree::
:maxdepth: 2
- :caption: Contents:
-
- encoding
- dtype
- io
- dataset
- expr
- scalar
+
+ quickstart
+ guide
+ file_format
+ api/index
+ Rust API <https://spiraldb.github.io/vortex/docs2/rust/doc/vortex>
diff --git a/docs/io.rst b/docs/io.rst
deleted file mode 100644
index f2cc405ce9..0000000000
--- a/docs/io.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-Input and Output
-================
-
-.. automodule:: vortex.io
- :members:
- :imported-members:
diff --git a/docs/pyproject.toml b/docs/pyproject.toml
index 53ceee91e1..a93d7459ed 100644
--- a/docs/pyproject.toml
+++ b/docs/pyproject.toml
@@ -3,7 +3,12 @@ name = "docs"
version = "0.1.0"
description = "Vortex documentation."
authors = []
-dependencies = ["pydata-sphinx-theme>=0.15.4", "sphinx>=8.0.2", "pyvortex"]
+dependencies = [
+ "pydata-sphinx-theme>=0.16.0",
+ "sphinx>=8.0.2",
+ "pyvortex",
+ "sphinx-design>=0.6.1",
+]
requires-python = ">= 3.10"
[tool.uv]
diff --git a/docs/quickstart.rst b/docs/quickstart.rst
new file mode 100644
index 0000000000..65a71cb7c3
--- /dev/null
+++ b/docs/quickstart.rst
@@ -0,0 +1,199 @@
+Quickstart
+==========
+
+The reference implementation exposes both a Rust and Python API. A C API is currently in progress.
+
+- :ref:`Quickstart for Python `
+- :ref:`Quickstart for Rust `
+- :ref:`Quickstart for C `
+
+.. _python-quickstart:
+
+Python
+------
+
+Install
+^^^^^^^
+
+::
+
+ pip install vortex-array
+
+Convert
+^^^^^^^
+
+You can either use your own Parquet file or download the `example used here
+`__.
+
+Use Arrow to read a Parquet file and then use :func:`~vortex.encoding.array` to construct an uncompressed
+Vortex array:
+
+.. doctest::
+
+ >>> import pyarrow.parquet as pq
+ >>> import vortex
+ >>> parquet = pq.read_table("_static/example.parquet")
+ >>> vtx = vortex.array(parquet)
+ >>> vtx.nbytes
+ 141024
+
+Compress
+^^^^^^^^
+
+Use :func:`~vortex.encoding.compress` to compress the Vortex array and check the relative size:
+
+.. doctest::
+
+ >>> cvtx = vortex.compress(vtx)
+ >>> cvtx.nbytes
+ 13970
+ >>> cvtx.nbytes / vtx.nbytes
+ 0.099...
+
+Vortex uses nearly ten times fewer bytes than Arrow. Fewer bytes means more of your data fits in
+cache and RAM.
+
+Write
+^^^^^
+
+Use :func:`~vortex.io.write_path` to write the Vortex array to disk:
+
+.. doctest::
+
+ >>> vortex.io.write_path(cvtx, "example.vortex")
+
+Small Vortex files (this one is just 71KiB) currently have substantial overhead relative to their
+size. This will be addressed shortly. On files with at least tens of megabytes of data, Vortex is
+similar to or smaller than Parquet.
+
+.. doctest::
+
+ >>> from os.path import getsize
+ >>> getsize("example.vortex") / getsize("_static/example.parquet")
+ 2.1...
+
+Read
+^^^^
+
+Use :func:`~vortex.io.read_path` to read the Vortex array from disk:
+
+.. doctest::
+
+ >>> cvtx = vortex.io.read_path("example.vortex")
+
+.. _rust-quickstart:
+
+Rust
+----
+
+Install
+^^^^^^^
+
+Install vortex and all the first-party array encodings::
+
+ cargo add vortex-array vortex-alp vortex-fsst vortex-fastlanes \
+ vortex-bytebool vortex-datetime-dtype vortex-datetime-parts \
+ vortex-dict vortex-runend vortex-runend-bool vortex-zigzag \
+ vortex-sampling-compressor vortex-serde
+
+Convert
+^^^^^^^
+
+You can either use your own Parquet file or download the `example used here
+`__.
+
+Use Arrow to read a Parquet file and then construct an uncompressed Vortex array:
+
+.. code-block:: rust
+
+ use std::fs::File;
+
+ use arrow_array::RecordBatchReader;
+ use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+ use vortex::array::ChunkedArray;
+ use vortex::arrow::FromArrowType;
+ use vortex::{Array, IntoArray};
+ use vortex_dtype::DType;
+
+ let reader =
+ ParquetRecordBatchReaderBuilder::try_new(File::open("_static/example.parquet").unwrap())
+ .unwrap()
+ .build()
+ .unwrap();
+ let dtype = DType::from_arrow(reader.schema());
+ let chunks = reader
+ .map(|x| Array::try_from(x.unwrap()).unwrap())
+ .collect::<Vec<_>>();
+ let vtx = ChunkedArray::try_new(chunks, dtype).unwrap().into_array();
+
+Compress
+^^^^^^^^
+
+Use the sampling compressor to compress the Vortex array and check the relative size:
+
+.. code-block:: rust
+
+ use std::collections::HashSet;
+
+ use vortex_sampling_compressor::{SamplingCompressor, DEFAULT_COMPRESSORS};
+
+ let compressor = SamplingCompressor::new(HashSet::from(*DEFAULT_COMPRESSORS));
+ let cvtx = compressor.compress(&vtx, None).unwrap().into_array();
+ println!("{}", cvtx.nbytes());
+
+Write
+^^^^^
+
+Reading and writing both require an async runtime; in this example, we use Tokio. The LayoutWriter
+knows how to write Vortex arrays to disk:
+
+.. code-block:: rust
+
+ use std::path::Path;
+
+ use tokio::fs::File as TokioFile;
+ use vortex_serde::layouts::LayoutWriter;
+
+ let file = TokioFile::create(Path::new("example.vortex"))
+ .await
+ .unwrap();
+ let writer = LayoutWriter::new(file)
+ .write_array_columns(cvtx.clone())
+ .await
+ .unwrap();
+ writer.finalize().await.unwrap();
+
+Read
+^^^^
+
+.. code-block:: rust
+
+ use futures::TryStreamExt;
+ use vortex_sampling_compressor::ALL_COMPRESSORS_CONTEXT;
+ use vortex_serde::layouts::{LayoutContext, LayoutDeserializer, LayoutReaderBuilder};
+
+ let file = TokioFile::open(Path::new("example.vortex")).await.unwrap();
+ let builder = LayoutReaderBuilder::new(
+ file,
+ LayoutDeserializer::new(
+ ALL_COMPRESSORS_CONTEXT.clone(),
+ LayoutContext::default().into(),
+ ),
+ );
+
+ let stream = builder.build().await.unwrap();
+ let dtype = stream.schema().clone().into();
+ let vecs: Vec<Array> = stream.try_collect().await.unwrap();
+ let cvtx = ChunkedArray::try_new(vecs, dtype)
+ .unwrap()
+ .into_array();
+
+ println!("{}", cvtx.nbytes());
+
+
+.. _c-quickstart:
+
+C
+-
+
+Coming soon!
diff --git a/docs/scalar.rst b/docs/scalar.rst
deleted file mode 100644
index 9fb3b26cfc..0000000000
--- a/docs/scalar.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-Scalar Values
-=============
-
-.. automodule:: vortex.scalar
- :members:
- :imported-members:
diff --git a/pyvortex/pyproject.toml b/pyvortex/pyproject.toml
index f482ce0541..08f78c3ceb 100644
--- a/pyvortex/pyproject.toml
+++ b/pyvortex/pyproject.toml
@@ -41,4 +41,5 @@ features = ["pyo3/extension-module"]
include = [
{ path = "rust-toolchain.toml", format = "sdist" },
{ path = "README.md", format = "sdist" },
+ { path = "python/vortex/py.typed", format = "sdist" },
]
diff --git a/pyvortex/python/vortex/__init__.py b/pyvortex/python/vortex/__init__.py
index b7101a7ccc..6a50c5978b 100644
--- a/pyvortex/python/vortex/__init__.py
+++ b/pyvortex/python/vortex/__init__.py
@@ -5,5 +5,6 @@
__doc__ = module_docs
del module_docs
array = encoding.array
+compress = encoding.compress
__all__ = ["array", dtype, expr, io, encoding, scalar, dataset]
diff --git a/pyvortex/python/vortex/dataset.py b/pyvortex/python/vortex/dataset.py
index d8b3254966..7f9d8d5d3b 100644
--- a/pyvortex/python/vortex/dataset.py
+++ b/pyvortex/python/vortex/dataset.py
@@ -12,7 +12,12 @@
class VortexDataset(pyarrow.dataset.Dataset):
- """Read Vortex files with row filter and column selection pushdown."""
+ """Read Vortex files with row filter and column selection pushdown.
+
+ This class implements the :class:`.pyarrow.dataset.Dataset` interface which enables its use with
+ Polars, DuckDB, Pandas and others.
+
+ """
def __init__(self, dataset):
self._dataset = dataset
@@ -62,6 +67,35 @@ def head(
use_threads: bool | None = None,
memory_pool: pa.MemoryPool = None,
) -> pa.Table:
+ """Load the first `num_rows` of the dataset.
+
+ Parameters
+ ----------
+ num_rows : int
+ The number of rows to load.
+ columns : list of str
+ The columns to keep, identified by name.
+ filter : :class:`.pyarrow.dataset.Expression`
+ Keep only rows for which this expression evaluates to ``True``. Any rows for which
+ this expression evaluates to ``Null`` are removed.
+ batch_size : int
+ The maximum number of rows per batch.
+ batch_readahead : int
+ Not implemented.
+ fragment_readahead : int
+ Not implemented.
+ fragment_scan_options : :class:`.pyarrow.dataset.FragmentScanOptions`
+ Not implemented.
+ use_threads : bool
+ Not implemented.
+ memory_pool : :class:`.pyarrow.MemoryPool`
+ Not implemented.
+
+ Returns
+ -------
+ table : :class:`.pyarrow.Table`
+
+ """
if batch_readahead is not None:
raise ValueError("batch_readahead not supported")
if fragment_readahead is not None:
@@ -114,7 +148,33 @@ def scanner(
use_threads: bool | None = None,
memory_pool: pa.MemoryPool = None,
) -> pa.dataset.Scanner:
- """Not implemented."""
+ """Construct a :class:`.pyarrow.dataset.Scanner`.
+
+ Parameters
+ ----------
+ columns : list of str
+ The columns to keep, identified by name.
+ filter : :class:`.pyarrow.dataset.Expression`
+ Keep only rows for which this expression evaluates to ``True``. Any rows for which
+ this expression evaluates to ``Null`` are removed.
+ batch_size : int
+ The maximum number of rows per batch.
+ batch_readahead : int
+ Not implemented.
+ fragment_readahead : int
+ Not implemented.
+ fragment_scan_options : :class:`.pyarrow.dataset.FragmentScanOptions`
+ Not implemented.
+ use_threads : bool
+ Not implemented.
+ memory_pool : :class:`.pyarrow.MemoryPool`
+ Not implemented.
+
+ Returns
+ -------
+ scanner : :class:`.pyarrow.dataset.Scanner`
+
+ """
return VortexScanner(
self,
columns,
@@ -143,6 +203,35 @@ def take(
use_threads: bool | None = None,
memory_pool: pa.MemoryPool = None,
) -> pa.Table:
+ """Load a subset of rows identified by their absolute indices.
+
+ Parameters
+ ----------
+ indices : :class:`.pyarrow.Array`
+ A numeric array of absolute indices into `self` indicating which rows to keep.
+ columns : list of str
+ The columns to keep, identified by name.
+ filter : :class:`.pyarrow.dataset.Expression`
+ Keep only rows for which this expression evaluates to ``True``. Any rows for which
+ this expression evaluates to ``Null`` are removed.
+ batch_size : int
+ The maximum number of rows per batch.
+ batch_readahead : int
+ Not implemented.
+ fragment_readahead : int
+ Not implemented.
+ fragment_scan_options : :class:`.pyarrow.dataset.FragmentScanOptions`
+ Not implemented.
+ use_threads : bool
+ Not implemented.
+ memory_pool : :class:`.pyarrow.MemoryPool`
+ Not implemented.
+
+ Returns
+ -------
+ table : :class:`.pyarrow.Table`
+
+ """
return (
self._dataset.to_array(columns=columns, batch_size=batch_size, row_filter=filter)
.take(encoding.array(indices))
@@ -160,6 +249,33 @@ def to_record_batch_reader(
use_threads: bool | None = None,
memory_pool: pa.MemoryPool = None,
) -> pa.RecordBatchReader:
+ """Construct a :class:`.pyarrow.RecordBatchReader`.
+
+ Parameters
+ ----------
+ columns : list of str
+ The columns to keep, identified by name.
+ filter : :class:`.pyarrow.dataset.Expression`
+ Keep only rows for which this expression evaluates to ``True``. Any rows for which
+ this expression evaluates to ``Null`` are removed.
+ batch_size : int
+ The maximum number of rows per batch.
+ batch_readahead : int
+ Not implemented.
+ fragment_readahead : int
+ Not implemented.
+ fragment_scan_options : :class:`.pyarrow.dataset.FragmentScanOptions`
+ Not implemented.
+ use_threads : bool
+ Not implemented.
+ memory_pool : :class:`.pyarrow.MemoryPool`
+ Not implemented.
+
+ Returns
+ -------
+ reader : :class:`.pyarrow.RecordBatchReader`
+
+ """
if batch_readahead is not None:
raise ValueError("batch_readahead not supported")
if fragment_readahead is not None:
@@ -186,6 +302,33 @@ def to_batches(
use_threads: bool | None = None,
memory_pool: pa.MemoryPool = None,
) -> Iterator[pa.RecordBatch]:
+ """Construct an iterator of :class:`.pyarrow.RecordBatch`.
+
+ Parameters
+ ----------
+ columns : list of str
+ The columns to keep, identified by name.
+ filter : :class:`.pyarrow.dataset.Expression`
+ Keep only rows for which this expression evaluates to ``True``. Any rows for which
+ this expression evaluates to ``Null`` are removed.
+ batch_size : int
+ The maximum number of rows per batch.
+ batch_readahead : int
+ Not implemented.
+ fragment_readahead : int
+ Not implemented.
+ fragment_scan_options : :class:`.pyarrow.dataset.FragmentScanOptions`
+ Not implemented.
+ use_threads : bool
+ Not implemented.
+ memory_pool : :class:`.pyarrow.MemoryPool`
+ Not implemented.
+
+ Returns
+ -------
+ batches : iterator of :class:`.pyarrow.RecordBatch`
+
+ """
record_batch_reader = self.to_record_batch_reader(
columns,
filter,
@@ -213,6 +356,33 @@ def to_table(
use_threads: bool | None = None,
memory_pool: pa.MemoryPool = None,
) -> pa.Table:
+ """Construct an Arrow :class:`.pyarrow.Table`.
+
+ Parameters
+ ----------
+ columns : list of str
+ The columns to keep, identified by name.
+ filter : :class:`.pyarrow.dataset.Expression`
+ Keep only rows for which this expression evaluates to ``True``. Any rows for which
+ this expression evaluates to ``Null`` are removed.
+ batch_size : int
+ The maximum number of rows per batch.
+ batch_readahead : int
+ Not implemented.
+ fragment_readahead : int
+ Not implemented.
+ fragment_scan_options : :class:`.pyarrow.dataset.FragmentScanOptions`
+ Not implemented.
+ use_threads : bool
+ Not implemented.
+ memory_pool : :class:`.pyarrow.MemoryPool`
+ Not implemented.
+
+ Returns
+ -------
+ table : :class:`.pyarrow.Table`
+
+ """
if batch_readahead is not None:
raise ValueError("batch_readahead not supported")
if fragment_readahead is not None:
@@ -229,8 +399,44 @@ def to_table(
return self._dataset.to_array(columns=columns, batch_size=batch_size, row_filter=filter).to_arrow_table()
+def from_path(path: str) -> VortexDataset:
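+ """Construct a :class:`VortexDataset` from a Vortex file on the local filesystem.
+
+ Parameters
+ ----------
+ path : str
+ The path of a Vortex file on the local filesystem.
+
+ """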
+ return VortexDataset(_lib_dataset.dataset_from_path(path))
+
+
+def from_url(url: str) -> VortexDataset:
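+ """Construct a :class:`VortexDataset` from a URL, such as an HTTP, S3, Google Cloud Storage, or Azure Blob Storage URL.
+
+ Parameters
+ ----------
+ url : str
+ The URL of a Vortex file.
+
+ """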
+ return VortexDataset(_lib_dataset.dataset_from_url(url))
+
+
class VortexScanner(pa.dataset.Scanner):
- """A PyArrow Dataset Scanner that reads from a Vortex Array."""
+ """A PyArrow Dataset Scanner that reads from a Vortex Array.
+
+ Parameters
+ ----------
+ dataset : VortexDataset
+ The dataset to scan.
+ columns : list of str
+ The columns to keep, identified by name.
+ filter : :class:`.pyarrow.dataset.Expression`
+ Keep only rows for which this expression evaluates to ``True``. Any rows for which
+ this expression evaluates to ``Null`` are removed.
+ batch_size : int
+ The maximum number of rows per batch.
+ batch_readahead : int
+ Not implemented.
+ fragment_readahead : int
+ Not implemented.
+ fragment_scan_options : :class:`.pyarrow.dataset.FragmentScanOptions`
+ Not implemented.
+ use_threads : bool
+ Not implemented.
+ memory_pool : :class:`.pyarrow.MemoryPool`
+ Not implemented.
+
+
+ """
def __init__(
self,
@@ -270,6 +476,18 @@ def count_rows(self):
)
def head(self, num_rows: int) -> pa.Table:
+ """Load the first `num_rows` of the dataset.
+
+ Parameters
+ ----------
+ num_rows : int
+ The number of rows to read.
+
+ Returns
+ -------
+ table : :class:`.pyarrow.Table`
+
+ """
return self._dataset.head(
num_rows,
self._columns,
@@ -287,6 +505,13 @@ def scan_batches(self) -> Iterator[pa.dataset.TaggedRecordBatch]:
raise NotImplementedError("scan batches")
def to_batches(self) -> Iterator[pa.RecordBatch]:
+ """Construct an iterator of :class:`.pyarrow.RecordBatch`.
+
+ Returns
+ -------
+ batches : iterator of :class:`.pyarrow.RecordBatch`
+
+ """
return self._dataset.to_batches(
self._columns,
self._filter,
@@ -299,6 +524,14 @@ def to_batches(self) -> Iterator[pa.RecordBatch]:
)
def to_reader(self) -> pa.RecordBatchReader:
+ """Construct a :class:`.pyarrow.RecordBatchReader`.
+
+
+ Returns
+ -------
+ reader : :class:`.pyarrow.RecordBatchReader`
+
+ """
return self._dataset.to_record_batch_reader(
self._columns,
self._filter,
@@ -311,6 +544,14 @@ def to_reader(self) -> pa.RecordBatchReader:
)
def to_table(self) -> pa.Table:
+ """Construct an Arrow :class:`.pyarrow.Table`.
+
+
+ Returns
+ -------
+ table : :class:`.pyarrow.Table`
+
+ """
return self._dataset.to_table(
self._columns,
self._filter,
diff --git a/pyvortex/python/vortex/encoding.py b/pyvortex/python/vortex/encoding.py
index ac522d3750..75eeb5655c 100644
--- a/pyvortex/python/vortex/encoding.py
+++ b/pyvortex/python/vortex/encoding.py
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
import pandas
import pyarrow
@@ -61,7 +61,7 @@ def _Array_to_arrow_table(self: _encoding.Array) -> pyarrow.Table:
Examples
--------
- >>> array = vortex.encoding.array([
+ >>> array = vortex.array([
... {'name': 'Joseph', 'age': 25},
... {'name': 'Narendra', 'age': 31},
... {'name': 'Angela', 'age': 33},
@@ -82,7 +82,7 @@ def _Array_to_arrow_table(self: _encoding.Array) -> pyarrow.Table:
Array.to_arrow_table = _Array_to_arrow_table
-def _Array_to_pandas(self: _encoding.Array) -> "pandas.DataFrame":
+def _Array_to_pandas_df(self: _encoding.Array) -> "pandas.DataFrame":
"""Construct a Pandas dataframe from this Vortex array.
Warning
@@ -99,27 +99,24 @@ def _Array_to_pandas(self: _encoding.Array) -> "pandas.DataFrame":
Construct a dataframe from a Vortex array:
- >>> array = vortex.encoding.array([
+ >>> array = vortex.array([
... {'name': 'Joseph', 'age': 25},
... {'name': 'Narendra', 'age': 31},
... {'name': 'Angela', 'age': 33},
... {'name': 'Mikhail', 'age': 57},
... ])
- >>> array.to_pandas()
+ >>> array.to_pandas_df()
age name
0 25 Joseph
1 31 Narendra
2 33 Angela
3 57 Mikhail
-
- Lift the struct fields to the top-level in the dataframe:
-
"""
return self.to_arrow_table().to_pandas(types_mapper=pandas.ArrowDtype)
-Array.to_pandas = _Array_to_pandas
+Array.to_pandas_df = _Array_to_pandas_df
def _Array_to_polars_dataframe(
@@ -146,7 +143,7 @@ def _Array_to_polars_dataframe(
Examples
--------
- >>> array = vortex.encoding.array([
+ >>> array = vortex.array([
... {'name': 'Joseph', 'age': 25},
... {'name': 'Narendra', 'age': 31},
... {'name': 'Angela', 'age': 33},
@@ -193,7 +190,7 @@ def _Array_to_polars_series(self: _encoding.Array): # -> 'polars.Series': # br
Convert a numeric array with nulls to a Polars Series:
- >>> vortex.encoding.array([1, None, 2, 3]).to_polars_series() # doctest: +NORMALIZE_WHITESPACE
+ >>> vortex.array([1, None, 2, 3]).to_polars_series() # doctest: +NORMALIZE_WHITESPACE
shape: (4,)
Series: '' [i64]
[
@@ -205,7 +202,7 @@ def _Array_to_polars_series(self: _encoding.Array): # -> 'polars.Series': # br
Convert a UTF-8 string array to a Polars Series:
- >>> vortex.encoding.array(['hello, ', 'is', 'it', 'me?']).to_polars_series() # doctest: +NORMALIZE_WHITESPACE
+ >>> vortex.array(['hello, ', 'is', 'it', 'me?']).to_polars_series() # doctest: +NORMALIZE_WHITESPACE
shape: (4,)
Series: '' [str]
[
@@ -217,7 +214,7 @@ def _Array_to_polars_series(self: _encoding.Array): # -> 'polars.Series': # br
Convert a struct array to a Polars Series:
- >>> array = vortex.encoding.array([
+ >>> array = vortex.array([
... {'name': 'Joseph', 'age': 25},
... {'name': 'Narendra', 'age': 31},
... {'name': 'Angela', 'age': 33},
@@ -262,7 +259,7 @@ def _Array_to_numpy(self: _encoding.Array, *, zero_copy_only: bool = True) -> "n
Construct an immutable ndarray from a Vortex array:
- >>> array = vortex.encoding.array([1, 0, 0, 1])
+ >>> array = vortex.array([1, 0, 0, 1])
>>> array.to_numpy()
array([1, 0, 0, 1])
@@ -273,14 +270,39 @@ def _Array_to_numpy(self: _encoding.Array, *, zero_copy_only: bool = True) -> "n
Array.to_numpy = _Array_to_numpy
-def array(obj: pyarrow.Array | list) -> Array:
+def _Array_to_pylist(self: _encoding.Array) -> list[Any]:
+ """Deeply copy an Array into a Python list.
+
+ Returns
+ -------
+ :class:`list`
+
+ Examples
+ --------
+
+ >>> array = vortex.array([
+ ... {'name': 'Joseph', 'age': 25},
+ ... {'name': 'Narendra', 'age': 31},
+ ... {'name': 'Angela', 'age': 33},
+ ... ])
+ >>> array.to_pylist()
+ [{'age': 25, 'name': 'Joseph'}, {'age': 31, 'name': 'Narendra'}, {'age': 33, 'name': 'Angela'}]
+
+ """
+ return self.to_arrow_table().to_pylist()
+
+
+Array.to_pylist = _Array_to_pylist
+
+
+def array(obj: pyarrow.Array | list | Any) -> Array:
"""The main entry point for creating Vortex arrays from other Python objects.
This function is also available as ``vortex.array``.
Parameters
----------
- obj : :class:`pyarrow.Array` or :class:`list`
+ obj : :class:`pyarrow.Array`, :class:`list`, or :class:`pandas.DataFrame`
The elements of this array or list become the elements of the Vortex array.
Returns
@@ -290,9 +312,9 @@ def array(obj: pyarrow.Array | list) -> Array:
Examples
--------
- A Vortex array containing the first three integers.
+ A Vortex array containing the first three integers:
- >>> vortex.encoding.array([1, 2, 3]).to_arrow_array()
+ >>> vortex.array([1, 2, 3]).to_arrow_array()
[
1,
@@ -300,9 +322,9 @@ def array(obj: pyarrow.Array | list) -> Array:
3
]
- The same Vortex array with a null value in the third position.
+ The same Vortex array with a null value in the third position:
- >>> vortex.encoding.array([1, 2, None, 3]).to_arrow_array()
+ >>> vortex.array([1, 2, None, 3]).to_arrow_array()
[
1,
@@ -314,7 +336,7 @@ def array(obj: pyarrow.Array | list) -> Array:
Initialize a Vortex array from an Arrow array:
>>> arrow = pyarrow.array(['Hello', 'it', 'is', 'me'])
- >>> vortex.encoding.array(arrow).to_arrow_array()
+ >>> vortex.array(arrow).to_arrow_array()
[
"Hello",
@@ -323,7 +345,40 @@ def array(obj: pyarrow.Array | list) -> Array:
"me"
]
+ Initialize a Vortex array from a Pandas dataframe:
+
+ >>> import pandas as pd
+ >>> df = pd.DataFrame({
+ ... "Name": ["Braund", "Allen", "Bonnell"],
+ ... "Age": [22, 35, 58],
+ ... })
+ >>> vortex.array(df).to_arrow_array()
+
+ [
+ -- is_valid: all not null
+ -- child 0 type: string_view
+ [
+ "Braund",
+ "Allen",
+ "Bonnell"
+ ]
+ -- child 1 type: int64
+ [
+ 22,
+ 35,
+ 58
+ ]
+ ]
+
"""
+
if isinstance(obj, list):
return _encoding._encode(pyarrow.array(obj))
+ try:
+ import pandas
+
+ if isinstance(obj, pandas.DataFrame):
+ return _encoding._encode(pyarrow.Table.from_pandas(obj))
+ except ImportError:
+ pass
return _encoding._encode(obj)
diff --git a/pyvortex/src/array.rs b/pyvortex/src/array.rs
index 5e2f14d263..5022157117 100644
--- a/pyvortex/src/array.rs
+++ b/pyvortex/src/array.rs
@@ -20,8 +20,8 @@ use crate::scalar::scalar_into_py;
///
/// Arrays support all the standard comparison operations:
///
-/// >>> a = vortex.encoding.array(['dog', None, 'cat', 'mouse', 'fish'])
-/// >>> b = vortex.encoding.array(['doug', 'jennifer', 'casper', 'mouse', 'faust'])
+/// >>> a = vortex.array(['dog', None, 'cat', 'mouse', 'fish'])
+/// >>> b = vortex.array(['doug', 'jennifer', 'casper', 'mouse', 'faust'])
/// >>> (a < b).to_arrow_array()
///
/// [
@@ -106,7 +106,7 @@ impl PyArray {
///
/// Round-trip an Arrow array through a Vortex array:
///
- /// >>> vortex.encoding.array([1, 2, 3]).to_arrow_array()
+ /// >>> vortex.array([1, 2, 3]).to_arrow_array()
///
/// [
/// 1,
@@ -179,19 +179,19 @@ impl PyArray {
/// Examples
/// --------
///
- /// By default, :func:`vortex.encoding.array` uses the largest available bit-width:
+ /// By default, :func:`~vortex.encoding.array` uses the largest available bit-width:
///
- /// >>> vortex.encoding.array([1, 2, 3]).dtype
+ /// >>> vortex.array([1, 2, 3]).dtype
/// int(64, False)
///
/// Including a :obj:`None` forces a nullable type:
///
- /// >>> vortex.encoding.array([1, None, 2, 3]).dtype
+ /// >>> vortex.array([1, None, 2, 3]).dtype
/// int(64, True)
///
/// A UTF-8 string array:
///
- /// >>> vortex.encoding.array(['hello, ', 'is', 'it', 'me?']).dtype
+ /// >>> vortex.array(['hello, ', 'is', 'it', 'me?']).dtype
/// utf8(False)
#[getter]
fn dtype(self_: PyRef) -> PyResult> {
@@ -244,19 +244,19 @@ impl PyArray {
///
/// Parameters
/// ----------
- /// filter : :class:`vortex.encoding.Array`
+ /// filter : :class:`~vortex.encoding.Array`
/// Keep all the rows in ``self`` for which the correspondingly indexed row in `filter` is True.
///
/// Returns
/// -------
- /// :class:`vortex.encoding.Array`
+ /// :class:`~vortex.encoding.Array`
///
/// Examples
/// --------
///
/// Keep only the single digit positive integers.
///
- /// >>> a = vortex.encoding.array([0, 42, 1_000, -23, 10, 9, 5])
+ /// >>> a = vortex.array([0, 42, 1_000, -23, 10, 9, 5])
/// >>> filter = vortex.array([True, False, False, False, False, True, True])
/// >>> a.filter(filter).to_arrow_array()
///
@@ -279,7 +279,7 @@ impl PyArray {
/// Fill forward sensor values over intermediate missing values. Note that leading nulls are
/// replaced with 0.0:
///
- /// >>> a = vortex.encoding.array([
+ /// >>> a = vortex.array([
/// ... None, None, 30.29, 30.30, 30.30, None, None, 30.27, 30.25,
/// ... 30.22, None, None, None, None, 30.12, 30.11, 30.11, 30.11,
/// ... 30.10, 30.08, None, 30.21, 30.03, 30.03, 30.05, 30.07, 30.07,
@@ -334,12 +334,12 @@ impl PyArray {
///
/// Retrieve the last element from an array of integers:
///
- /// >>> vortex.encoding.array([10, 42, 999, 1992]).scalar_at(3)
+ /// >>> vortex.array([10, 42, 999, 1992]).scalar_at(3)
/// 1992
///
/// Retrieve the third element from an array of strings:
///
- /// >>> array = vortex.encoding.array(["hello", "goodbye", "it", "is"])
+ /// >>> array = vortex.array(["hello", "goodbye", "it", "is"])
/// >>> array.scalar_at(2)
///
///
@@ -352,7 +352,7 @@ impl PyArray {
///
/// Retrieve an element from an array of structures:
///
- /// >>> array = vortex.encoding.array([
+ /// >>> array = vortex.array([
/// ... {'name': 'Joseph', 'age': 25},
/// ... {'name': 'Narendra', 'age': 31},
/// ... {'name': 'Angela', 'age': 33},
@@ -376,7 +376,7 @@ impl PyArray {
///
/// Out of bounds accesses are prohibited:
///
- /// >>> vortex.encoding.array([10, 42, 999, 1992]).scalar_at(10)
+ /// >>> vortex.array([10, 42, 999, 1992]).scalar_at(10)
/// Traceback (most recent call last):
/// ...
/// ValueError: index 10 out of bounds from 0 to 4
@@ -384,7 +384,7 @@ impl PyArray {
///
/// Unlike Python, negative indices are not supported:
///
- /// >>> vortex.encoding.array([10, 42, 999, 1992]).scalar_at(-2)
+ /// >>> vortex.array([10, 42, 999, 1992]).scalar_at(-2)
/// Traceback (most recent call last):
/// ...
/// OverflowError: can't convert negative int to unsigned
@@ -398,20 +398,20 @@ impl PyArray {
///
/// Parameters
/// ----------
- /// indices : :class:`vortex.encoding.Array`
+ /// indices : :class:`~vortex.encoding.Array`
/// An array of indices to keep.
///
/// Returns
/// -------
- /// :class:`vortex.encoding.Array`
+ /// :class:`~vortex.encoding.Array`
///
/// Examples
/// --------
///
/// Keep only the first and third elements:
///
- /// >>> a = vortex.encoding.array(['a', 'b', 'c', 'd'])
- /// >>> indices = vortex.encoding.array([0, 2])
+ /// >>> a = vortex.array(['a', 'b', 'c', 'd'])
+ /// >>> indices = vortex.array([0, 2])
/// >>> a.take(indices).to_arrow_array()
///
/// [
@@ -421,8 +421,8 @@ impl PyArray {
///
/// Permute and repeat the first and second elements:
///
- /// >>> a = vortex.encoding.array(['a', 'b', 'c', 'd'])
- /// >>> indices = vortex.encoding.array([0, 1, 1, 0])
+ /// >>> a = vortex.array(['a', 'b', 'c', 'd'])
+ /// >>> indices = vortex.array([0, 1, 1, 0])
/// >>> a.take(indices).to_arrow_array()
///
/// [
@@ -457,14 +457,14 @@ impl PyArray {
///
/// Returns
/// -------
- /// :class:`vortex.encoding.Array`
+ /// :class:`~vortex.encoding.Array`
///
/// Examples
/// --------
///
/// Keep only the second through third elements:
///
- /// >>> a = vortex.encoding.array(['a', 'b', 'c', 'd'])
+ /// >>> a = vortex.array(['a', 'b', 'c', 'd'])
/// >>> a.slice(1, 3).to_arrow_array()
///
/// [
@@ -474,14 +474,14 @@ impl PyArray {
///
/// Keep none of the elements:
///
- /// >>> a = vortex.encoding.array(['a', 'b', 'c', 'd'])
+ /// >>> a = vortex.array(['a', 'b', 'c', 'd'])
/// >>> a.slice(3, 3).to_arrow_array()
///
/// []
///
/// Unlike Python, it is an error to slice outside the bounds of the array:
///
- /// >>> a = vortex.encoding.array(['a', 'b', 'c', 'd'])
+ /// >>> a = vortex.array(['a', 'b', 'c', 'd'])
/// >>> a.slice(2, 10).to_arrow_array()
/// Traceback (most recent call last):
/// ...
@@ -489,7 +489,7 @@ impl PyArray {
///
/// Or to slice with a negative value:
///
- /// >>> a = vortex.encoding.array(['a', 'b', 'c', 'd'])
+ /// >>> a = vortex.array(['a', 'b', 'c', 'd'])
/// >>> a.slice(-2, -1).to_arrow_array()
/// Traceback (most recent call last):
/// ...
@@ -516,7 +516,7 @@ impl PyArray {
///
/// Uncompressed arrays have straightforward encodings:
///
- /// >>> arr = vortex.encoding.array([1, 2, None, 3])
+ /// >>> arr = vortex.array([1, 2, None, 3])
/// >>> print(arr.tree_display())
/// root: vortex.primitive(0x03)(i64?, len=4) nbytes=33 B (100.00%)
/// metadata: PrimitiveMetadata { validity: Array }
diff --git a/pyvortex/src/compress.rs b/pyvortex/src/compress.rs
index 31768fc008..be84bf018d 100644
--- a/pyvortex/src/compress.rs
+++ b/pyvortex/src/compress.rs
@@ -8,7 +8,7 @@ use crate::array::PyArray;
///
/// Parameters
/// ----------
-/// array : :class:`vortex.encoding.Array`
+/// array : :class:`~vortex.encoding.Array`
/// The array.
///
/// Examples
@@ -16,23 +16,23 @@ use crate::array::PyArray;
///
/// Compress a very sparse array of integers:
///
-/// >>> a = vortex.encoding.array([42 for _ in range(1000)])
-/// >>> str(vortex.encoding.compress(a))
+/// >>> a = vortex.array([42 for _ in range(1000)])
+/// >>> str(vortex.compress(a))
/// 'vortex.constant(0x09)(i64, len=1000)'
///
/// Compress an array of increasing integers:
///
-/// >>> a = vortex.encoding.array(list(range(1000)))
-/// >>> str(vortex.encoding.compress(a))
+/// >>> a = vortex.array(list(range(1000)))
+/// >>> str(vortex.compress(a))
/// 'fastlanes.for(0x17)(i64, len=1000)'
///
/// Compress an array of increasing floating-point numbers and a few nulls:
///
-/// >>> a = vortex.encoding.array([
+/// >>> a = vortex.array([
/// ... float(x) if x % 20 != 0 else None
/// ... for x in range(1000)
/// ... ])
-/// >>> str(vortex.encoding.compress(a))
+/// >>> str(vortex.compress(a))
/// 'vortex.alp(0x11)(f64?, len=1000)'
pub fn compress(array: &Bound) -> PyResult {
let compressor = SamplingCompressor::default();
diff --git a/pyvortex/src/dtype.rs b/pyvortex/src/dtype.rs
index 7a9e49a79a..61f419d0f7 100644
--- a/pyvortex/src/dtype.rs
+++ b/pyvortex/src/dtype.rs
@@ -119,7 +119,7 @@ pub fn dtype_bool(py: Python<'_>, nullable: bool) -> PyResult> {
///
/// Parameters
/// ----------
-/// width : one of 8, 16, 32, and 64.
+/// width : Literal[8, 16, 32, 64].
/// The bit width determines the span of valid values. If :obj:`None`, 64 is used.
///
/// nullable : :class:`bool`
@@ -162,7 +162,7 @@ pub fn dtype_int(py: Python<'_>, width: Option, nullable: bool) -> PyResult
///
/// Parameters
/// ----------
-/// width : one of 8, 16, 32, and 64.
+/// width : Literal[8, 16, 32, 64].
/// The bit width determines the span of valid values. If :obj:`None`, 64 is used.
///
/// nullable : :class:`bool`
@@ -205,7 +205,7 @@ pub fn dtype_uint(py: Python<'_>, width: Option, nullable: bool) -> PyResul
///
/// Parameters
/// ----------
-/// width : one of 16, 32, and 64.
+/// width : Literal[16, 32, 64].
/// The bit width determines the range and precision of the floating-point values. If
/// :obj:`None`, 64 is used.
///
diff --git a/pyvortex/src/expr.rs b/pyvortex/src/expr.rs
index 8bac88fe67..3b27fc120a 100644
--- a/pyvortex/src/expr.rs
+++ b/pyvortex/src/expr.rs
@@ -13,12 +13,15 @@ use crate::dtype::PyDType;
/// An expression describes how to filter rows when reading an array from a file.
///
+/// .. seealso::
+/// :func:`.column`
+///
/// Examples
/// ========
///
/// All the examples read the following file.
///
-/// >>> a = vortex.encoding.array([
+/// >>> a = vortex.array([
/// ... {'name': 'Joseph', 'age': 25},
/// ... {'name': None, 'age': 31},
/// ... {'name': 'Angela', 'age': None},
@@ -209,7 +212,8 @@ impl PyExpr {
/// A named column.
///
-/// See :class:`.Expr` for more examples.
+/// .. seealso::
+/// :class:`.Expr`
///
/// Example
/// =======
@@ -219,6 +223,8 @@ impl PyExpr {
/// >>> name = vortex.expr.column("name")
/// >>> filter = name == "Joseph"
///
+/// See :class:`.Expr` for more examples.
+///
#[pyfunction]
pub fn column<'py>(name: &Bound<'py, PyString>) -> PyResult> {
let py = name.py();
diff --git a/pyvortex/src/io.rs b/pyvortex/src/io.rs
index ac32aed647..d93350df95 100644
--- a/pyvortex/src/io.rs
+++ b/pyvortex/src/io.rs
@@ -5,6 +5,7 @@ use pyo3::pyfunction;
use pyo3::types::PyString;
use tokio::fs::File;
use vortex::Array;
+use vortex_sampling_compressor::SamplingCompressor;
use vortex_serde::layouts::LayoutWriter;
use crate::dataset::{ObjectStoreUrlDataset, TokioFileDataset};
@@ -27,7 +28,7 @@ use crate::{PyArray, TOKIO_RUNTIME};
///
/// Read an array with a structured column and nulls at multiple levels and in multiple columns.
///
-/// >>> a = vortex.encoding.array([
+/// >>> a = vortex.array([
/// ... {'name': 'Joseph', 'age': 25},
/// ... {'name': None, 'age': 31},
/// ... {'name': 'Angela', 'age': None},
@@ -111,7 +112,7 @@ use crate::{PyArray, TOKIO_RUNTIME};
///
/// TODO(DK): Top-level nullness does not work.
///
-/// >>> a = vortex.encoding.array([
+/// >>> a = vortex.array([
/// ... {'name': 'Joseph', 'age': 25},
/// ... {'name': None, 'age': 31},
/// ... {'name': 'Angela', 'age': None},
@@ -186,23 +187,25 @@ pub fn read_url(
dataset.to_array(projection, None, row_filter)
}
-#[pyfunction]
/// Write a vortex struct array to the local filesystem.
///
/// Parameters
/// ----------
-/// array : :class:`vortex.encoding.Array`
+/// array : :class:`~vortex.encoding.Array`
/// The array. Must be an array of structures.
///
/// f : :class:`str`
/// The file path.
///
+/// compress : :class:`bool`
+/// Compress the array before writing, defaults to ``True``.
+///
/// Examples
/// --------
///
/// Write the array `a` to the local file `a.vortex`.
///
-/// >>> a = vortex.encoding.array([
+/// >>> a = vortex.array([
/// ... {'x': 1},
/// ... {'x': 2},
/// ... {'x': 10},
@@ -211,7 +214,13 @@ pub fn read_url(
/// ... ])
/// >>> vortex.io.write_path(a, "a.vortex")
///
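+/// Or, skip compression by passing ``compress=False``:
+///
+/// >>> vortex.io.write_path(a, "a.vortex", compress=False)
+///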
-pub fn write_path(array: &Bound<'_, PyArray>, f: &Bound<'_, PyString>) -> PyResult<()> {
+#[pyfunction]
+#[pyo3(signature = (array, f, *, compress=true))]
+pub fn write_path(
+ array: &Bound<'_, PyArray>,
+ f: &Bound<'_, PyString>,
+ compress: bool,
+) -> PyResult<()> {
async fn run(array: &Array, fname: &str) -> PyResult<()> {
let file = File::create(Path::new(fname)).await?;
let mut writer = LayoutWriter::new(file);
@@ -222,7 +231,12 @@ pub fn write_path(array: &Bound<'_, PyArray>, f: &Bound<'_, PyString>) -> PyResu
}
let fname = f.to_str()?; // TODO(dk): support file objects
- let array = array.borrow().unwrap().clone();
+ let mut array = array.borrow().unwrap().clone();
+
+ if compress {
+ let compressor = SamplingCompressor::default();
+ array = compressor.compress(&array, None)?.into_array();
+ }
TOKIO_RUNTIME.block_on(run(&array, fname))
}
diff --git a/pyvortex/src/scalar.rs b/pyvortex/src/scalar.rs
index abee1bf5dc..dbf9a5ed1e 100644
--- a/pyvortex/src/scalar.rs
+++ b/pyvortex/src/scalar.rs
@@ -134,7 +134,7 @@ impl PyBufferString {
#[pymethods]
impl PyBufferString {
- /// Copy this buffer string from array memory into a Python str.
+ /// Copy this buffer string from array memory into a :class:`str`.
#[pyo3(signature = (*, recursive = false))]
#[allow(unused_variables)] // we want the same Python name across all methods
pub fn into_python(self_: PyRef, recursive: bool) -> PyResult {
@@ -178,7 +178,7 @@ impl PyVortexList {
#[pymethods]
impl PyVortexList {
- /// Copy the elements of this list from array memory into a list of Python objects.
+ /// Copy the elements of this list from array memory into a :class:`list`.
#[pyo3(signature = (*, recursive = false))]
pub fn into_python(self_: PyRef, recursive: bool) -> PyResult {
to_python_list(self_.py(), &self_.inner, &self_.dtype, recursive)
@@ -236,7 +236,7 @@ impl PyVortexStruct {
#[pymethods]
impl PyVortexStruct {
#[pyo3(signature = (*, recursive = false))]
- /// Copy the elements of this list from array memory into a list of Python objects.
+ /// Copy the elements of this list from array memory into a :class:`dict`.
pub fn into_python(self_: PyRef, recursive: bool) -> PyResult {
to_python_dict(self_.py(), &self_.inner, &self_.dtype, recursive)
}
diff --git a/requirements-dev.lock b/requirements-dev.lock
index 5a9bc870cc..6df3b41447 100644
--- a/requirements-dev.lock
+++ b/requirements-dev.lock
@@ -72,7 +72,6 @@ numpy==1.26.4
# via xarray
packaging==24.0
# via matplotlib
- # via pydata-sphinx-theme
# via pytest
# via sphinx
# via xarray
@@ -103,7 +102,7 @@ py-cpuinfo==9.0.0
# via pytest-benchmark
pyarrow==17.0.0
# via vortex-array
-pydata-sphinx-theme==0.15.4
+pydata-sphinx-theme==0.16.0
pygments==2.17.2
# via accessible-pygments
# via ipython
@@ -133,6 +132,8 @@ soupsieve==2.6
# via beautifulsoup4
sphinx==8.0.2
# via pydata-sphinx-theme
+ # via sphinx-design
+sphinx-design==0.6.1
sphinxcontrib-applehelp==2.0.0
# via sphinx
sphinxcontrib-devhelp==2.0.0
diff --git a/requirements.lock b/requirements.lock
index 88d7d0993d..d8458d2455 100644
--- a/requirements.lock
+++ b/requirements.lock
@@ -55,7 +55,6 @@ numpy==2.1.2
# via xarray
packaging==24.1
# via matplotlib
- # via pydata-sphinx-theme
# via sphinx
# via xarray
pandas==2.2.3
@@ -70,7 +69,7 @@ protobuf==5.28.2
# via substrait
pyarrow==17.0.0
# via vortex-array
-pydata-sphinx-theme==0.15.4
+pydata-sphinx-theme==0.16.0
pygments==2.18.0
# via accessible-pygments
# via pydata-sphinx-theme
@@ -93,6 +92,8 @@ soupsieve==2.6
# via beautifulsoup4
sphinx==8.1.3
# via pydata-sphinx-theme
+ # via sphinx-design
+sphinx-design==0.6.1
sphinxcontrib-applehelp==2.0.0
# via sphinx
sphinxcontrib-devhelp==2.0.0
diff --git a/uv.lock b/uv.lock
index 519b3c0a95..0e5b0edd34 100644
--- a/uv.lock
+++ b/uv.lock
@@ -242,13 +242,15 @@ dependencies = [
{ name = "pydata-sphinx-theme" },
{ name = "pyvortex" },
{ name = "sphinx" },
+ { name = "sphinx-design" },
]
[package.metadata]
requires-dist = [
- { name = "pydata-sphinx-theme", specifier = ">=0.15.4" },
+ { name = "pydata-sphinx-theme", specifier = ">=0.16.0" },
{ name = "pyvortex" },
{ name = "sphinx", specifier = ">=8.0.2" },
+ { name = "sphinx-design", specifier = ">=0.6.1" },
]
[[package]]
@@ -1189,6 +1191,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/26/60/1ddff83a56d33aaf6f10ec8ce84b4c007d9368b21008876fceda7e7381ef/sphinx-8.1.3-py3-none-any.whl", hash = "sha256:09719015511837b76bf6e03e42eb7595ac8c2e41eeb9c29c5b755c6b677992a2", size = 3487125 },
]
+[[package]]
+name = "sphinx-design"
+version = "0.6.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "sphinx" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/2b/69/b34e0cb5336f09c6866d53b4a19d76c227cdec1bbc7ac4de63ca7d58c9c7/sphinx_design-0.6.1.tar.gz", hash = "sha256:b44eea3719386d04d765c1a8257caca2b3e6f8421d7b3a5e742c0fd45f84e632", size = 2193689 }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/c6/43/65c0acbd8cc6f50195a3a1fc195c404988b15c67090e73c7a41a9f57d6bd/sphinx_design-0.6.1-py3-none-any.whl", hash = "sha256:b11f37db1a802a183d61b159d9a202314d4d2fe29c163437001324fe2f19549c", size = 2215338 },
+]
+
[[package]]
name = "sphinxcontrib-applehelp"
version = "2.0.0"