diff --git a/.github/workflows/python-docs.yml b/.github/workflows/docs.yml similarity index 92% rename from .github/workflows/python-docs.yml rename to .github/workflows/docs.yml index fc0f0adc1f..4dac3655b2 100644 --- a/.github/workflows/python-docs.yml +++ b/.github/workflows/docs.yml @@ -1,4 +1,4 @@ -name: Python docs +name: Python & Rust docs on: push: @@ -31,6 +31,7 @@ jobs: built_sha=$(git rev-parse HEAD) + rm -rf docs/_build/html/rust/CACHETAG.DIR docs/_build/html/rust/debug mv docs/_build/html /tmp/html git fetch origin diff --git a/docs/Makefile b/docs/Makefile index 1b1224a5b8..122224bb74 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -18,3 +18,9 @@ help: # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +html: rust-html + +.PHONY: rust-html +rust-html: + cargo doc --no-deps --workspace --all-features --target-dir $(BUILDDIR)/html/rust diff --git a/docs/_static/example.parquet b/docs/_static/example.parquet new file mode 100644 index 0000000000..46c86166bf Binary files /dev/null and b/docs/_static/example.parquet differ diff --git a/docs/_static/example.vortex b/docs/_static/example.vortex new file mode 100644 index 0000000000..796654f2a7 Binary files /dev/null and b/docs/_static/example.vortex differ diff --git a/docs/_static/file-format-2024-10-23-1642.svg b/docs/_static/file-format-2024-10-23-1642.svg new file mode 100644 index 0000000000..46eb3b5e5d --- /dev/null +++ b/docs/_static/file-format-2024-10-23-1642.svg @@ -0,0 +1,10 @@ + + + + + + + + Statistics"Tables"SchemaFooterPostscriptData.........AABBStruct { names: ["A", "B"]; dtypes: [Primitive { I32; nullable: false }, Utf8 { nullable: false }]; nullable: false }RowOffsetChunkIndex012......MinMaxOne Metadata Table Per ColumnNullCount...TrueCountColumns Chunked{has_metadata=true}Flat { begin: u64, end: u64 }FlatFlatFlat1 Flat Layout (i.e., Byte Offset Range)Per Column ChunkColumn AColumn 
BFirst child contains the byte offsetsfor the column's metadata tableChunked...FlatFlatFlat Chunked{has_metadata=true}Example: a Layout with Row GroupsEOFVersion Info (4 bytes)Magic (4 bytes)EOF: fixed sizeof 8-bytes(forever)Schema OffsetLayout Offset...Postscript: compile-time known size;guaranteed to fitinto initial readLayoutRow Count...Footer: variable-sizemetadata that isnecessary for pruning& pushdown \ No newline at end of file diff --git a/docs/_static/style.css b/docs/_static/style.css new file mode 100644 index 0000000000..2d137b32a9 --- /dev/null +++ b/docs/_static/style.css @@ -0,0 +1,3 @@ +html .pst-navbar-icon { + font-size: 1.5rem; +} diff --git a/docs/_static/vortex_spiral_logo.svg b/docs/_static/vortex_spiral_logo.svg new file mode 100644 index 0000000000..026901c94f --- /dev/null +++ b/docs/_static/vortex_spiral_logo.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/docs/_static/vortex_spiral_logo_dark_theme.svg b/docs/_static/vortex_spiral_logo_dark_theme.svg new file mode 100644 index 0000000000..0c4d52bab2 --- /dev/null +++ b/docs/_static/vortex_spiral_logo_dark_theme.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/docs/dataset.rst b/docs/api/dataset.rst similarity index 77% rename from docs/dataset.rst rename to docs/api/dataset.rst index 848e6592ca..16d564868d 100644 --- a/docs/dataset.rst +++ b/docs/api/dataset.rst @@ -6,5 +6,15 @@ query engines like DuckDB and Polars. In particular, Vortex will read data propo number of rows passing a filter condition and the number of columns in a selection. For most Vortex encodings, this property holds true even when the filter condition specifies a single row. +.. autosummary:: + :nosignatures: + + ~vortex.dataset.VortexDataset + ~vortex.dataset.VortexScanner + +.. raw:: html + +
+ .. automodule:: vortex.dataset :members: diff --git a/docs/api/dtype.rst b/docs/api/dtype.rst new file mode 100644 index 0000000000..4f529feea9 --- /dev/null +++ b/docs/api/dtype.rst @@ -0,0 +1,27 @@ +Array Data Types +================ + +The logical types of the elements of an Array. Each logical type is implemented by a variety of +Array encodings which describe both a representation-as-bytes as well as how to apply operations on +that representation. + +.. autosummary:: + :nosignatures: + + ~vortex.dtype.DType + ~vortex.dtype.binary + ~vortex.dtype.bool + ~vortex.dtype.float + ~vortex.dtype.int + ~vortex.dtype.null + ~vortex.dtype.uint + ~vortex.dtype.utf8 + +.. raw:: html + +
+ +.. automodule:: vortex.dtype + :members: + :imported-members: + diff --git a/docs/api/encoding.rst b/docs/api/encoding.rst new file mode 100644 index 0000000000..3ec5cb449c --- /dev/null +++ b/docs/api/encoding.rst @@ -0,0 +1,26 @@ +Arrays +====== + +A Vortex array is a possibly compressed ordered set of homogeneously typed values. Each array has a +logical type and a physical encoding. The logical type describes the set of operations applicable to +the values of this array. The physical encoding describes how this array is realized in memory, on +disk, and over the wire and how to apply operations to that realization. + +.. autosummary:: + :nosignatures: + + ~vortex.encoding.array + ~vortex.encoding.compress + ~vortex.encoding.Array + +.. raw:: html + +
+ +.. autofunction:: vortex.encoding.array + +.. autofunction:: vortex.encoding.compress + +.. autoclass:: vortex.encoding.Array + :members: + :special-members: __len__ diff --git a/docs/api/expr.rst b/docs/api/expr.rst new file mode 100644 index 0000000000..3fd6ab3390 --- /dev/null +++ b/docs/api/expr.rst @@ -0,0 +1,26 @@ +Expressions +=========== + +Vortex expressions represent simple filtering conditions on the rows of a Vortex array. For example, +the following expression represents the set of rows for which the `age` column lies between 23 and +55: + +.. doctest:: + + >>> import vortex + >>> age = vortex.expr.column("age") + >>> (23 < age) & (age < 55) # doctest: +SKIP + +.. autosummary:: + :nosignatures: + + ~vortex.expr.column + ~vortex.expr.Expr + +.. raw:: html + +
+ +.. autofunction:: vortex.expr.column + +.. autoclass:: vortex.expr.Expr diff --git a/docs/api/index.rst b/docs/api/index.rst new file mode 100644 index 0000000000..b67c96ad6e --- /dev/null +++ b/docs/api/index.rst @@ -0,0 +1,12 @@ +Python API +========== + +.. toctree:: + :maxdepth: 5 + + encoding + dtype + io + dataset + expr + scalar diff --git a/docs/api/io.rst b/docs/api/io.rst new file mode 100644 index 0000000000..1dee8dea5d --- /dev/null +++ b/docs/api/io.rst @@ -0,0 +1,20 @@ +Input and Output +================ + +Vortex arrays support reading and writing to local and remote file systems, including plain-old +HTTP, S3, Google Cloud Storage, and Azure Blob Storage. + +.. autosummary:: + :nosignatures: + + ~vortex.io.read_path + ~vortex.io.read_url + ~vortex.io.write_path + +.. raw:: html + +
+ +.. automodule:: vortex.io + :members: + :imported-members: diff --git a/docs/api/scalar.rst b/docs/api/scalar.rst new file mode 100644 index 0000000000..288673a5c1 --- /dev/null +++ b/docs/api/scalar.rst @@ -0,0 +1,25 @@ +Scalars +======= + +A scalar is a single atomic value like the integer ``1``, the string ``"hello"``, or the structure +``{"age": 55, "name": "Angela"}``. The :meth:`.Array.scalar_at` method +returns a native Python value when the cost of doing so is small. However, for larger values like +binary data, UTF-8 strings, variable-length lists, and structures, Vortex returns a zero-copy *view* +of the Array data. The ``into_python`` method of each view will copy the scalar into a native Python +value. + +.. autosummary:: + :nosignatures: + + ~vortex.scalar.Buffer + ~vortex.scalar.BufferString + ~vortex.scalar.VortexList + ~vortex.scalar.VortexStruct + +.. raw:: html + +
+ +.. automodule:: vortex.scalar + :members: + :imported-members: diff --git a/docs/conf.py b/docs/conf.py index 719854b5e1..0fe652707d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,8 +15,11 @@ extensions = [ "sphinx.ext.autodoc", - "sphinx.ext.intersphinx", + "sphinx.ext.autosummary", "sphinx.ext.doctest", + "sphinx.ext.intersphinx", + "sphinx.ext.napoleon", + "sphinx_design", ] templates_path = ["_templates"] @@ -24,10 +27,10 @@ intersphinx_mapping = { "python": ("https://docs.python.org/3", None), - "pyarrow": ("https://arrow.apache.org/docs/", None), - "pandas": ("https://pandas.pydata.org/docs/", None), - "numpy": ("https://numpy.org/doc/stable/", None), - "polars": ("https://docs.pola.rs/api/python/stable/", None), + "pyarrow": ("https://arrow.apache.org/docs", None), + "pandas": ("https://pandas.pydata.org/docs", None), + "numpy": ("https://numpy.org/doc/stable", None), + "polars": ("https://docs.pola.rs/api/python/stable", None), } nitpicky = True # ensures all :class:, :obj:, etc. 
links are valid @@ -38,4 +41,37 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output html_theme = "pydata_sphinx_theme" -# html_static_path = ['_static'] # no static files yet +html_static_path = ["_static"] +html_css_files = ["style.css"] # relative to _static/ + +# -- Options for PyData Theme ------------------------------------------------ +html_theme_options = { + "show_toc_level": 2, + "logo": { + "alt_text": "The Vortex logo.", + "text": "Vortex", + "image_light": "_static/vortex_spiral_logo.svg", + "image_dark": "_static/vortex_spiral_logo_dark_theme.svg", + }, + "icon_links": [ + { + "name": "GitHub", + "url": "https://github.com/spiraldb/vortex", + "icon": "fa-brands fa-github", + "type": "fontawesome", + }, + { + "name": "PyPI", + "url": "https://pypi.org/project/vortex-array", + "icon": "fa-brands fa-python", + "type": "fontawesome", + }, + ], + "header_links_before_dropdown": 3, +} +html_sidebars = { + # hide the primary (left-hand) sidebar on pages without sub-pages + "quickstart": [], + "guide": [], + "file_format": [], +} diff --git a/docs/dtype.rst b/docs/dtype.rst deleted file mode 100644 index 9c30bc80b9..0000000000 --- a/docs/dtype.rst +++ /dev/null @@ -1,7 +0,0 @@ -Array Data Types -================ - -.. automodule:: vortex.dtype - :members: - :imported-members: - diff --git a/docs/encoding.rst b/docs/encoding.rst deleted file mode 100644 index 8448777fba..0000000000 --- a/docs/encoding.rst +++ /dev/null @@ -1,7 +0,0 @@ -Arrays -====== - -.. automodule:: vortex.encoding - :members: - :imported-members: - :special-members: __len__ diff --git a/docs/expr.rst b/docs/expr.rst deleted file mode 100644 index 854aec35ce..0000000000 --- a/docs/expr.rst +++ /dev/null @@ -1,6 +0,0 @@ -Row Filter Expressions -====================== - -.. 
automodule:: vortex.expr - :members: - :imported-members: diff --git a/docs/file_format.rst b/docs/file_format.rst new file mode 100644 index 0000000000..322bcd503b --- /dev/null +++ b/docs/file_format.rst @@ -0,0 +1,79 @@ +File Format +=========== + +Intuition +--------- + +The Vortex file format has both *layouts*, which describe how different chunks of columns are stored +relative to one another, and *encodings* which describe the byte representation of a contiguous +sequence of values. A layout describes how to contiguously store one or more arrays as is necessary +for storing an array on disk or transmitting it over the wire. An encoding defines one binary +representation for memory, disk, and the wire. + +.. _file-format--layouts: + +Layouts +^^^^^^^ + +Vortex arrays have the same binary representation in-memory, on-disk, and over-the-wire; however, +all the rows of all the columns are not necessarily contiguously laid out. Vortex has three kinds of +*layouts* which recursively compose: the *flat layout*, the *column layout*, and the *chunked +layout*. + +The flat layout is a contiguous sequence of bytes. Any Vortex array encoding can be serialized into +the flat layout. + +The column layout lays out each column of a struct-typed array as a separate sequence of bytes. Each +column may or may not recursively use a chunked layout. Column layouts permit readers to push-down +column projections. + +The chunked layout lays out an array as a sequence of row chunks. Each chunk may have a different +size. A chunked layout permits readers to push-down row filters based on statistics which we describe +later. Note that, if the laid out array is a struct array, each column uses the same chunk +size. This is equivalent to Parquet's row groups. + +A few examples of concrete layouts: + +1. Chunked of struct of chunked of flat: essentially a Parquet layout with row groups in which each + column's values are contiguously stored in pages. +2. 
Struct of chunked of flat: eliminates row groups, retaining only pages. +3. Struct of flat: prevents row filter push-down because each array is, to the layout, an opaque + sequence of bytes. + +The chunked layout stores, per chunk, metadata necessary for effective row filtering such as +sortedness, constancy, the minimum value, the maximum value, and the number of null rows. Readers +consult these metadata tables to avoid reading chunks without relevant data. + +.. card:: + + .. figure:: _static/file-format-2024-10-23-1642.svg + :width: 800px + :alt: A schematic of the file format + + +++ + + The Vortex file format has five sections: data, statistics, schema, footer, and postscript. The + postscript describes the location of the schema and layout which in turn describe how to + interpret the data and metadata. The schema describes the logical type. The metadata contains + information necessary for row filtering. + +.. _included-codecs: + +Encodings +^^^^^^^^^ + +- Most of the Arrow encodings. +- Chunked, a sequence of arrays. +- Constant, a value and a length. +- Sparse, a value plus a pair of arrays representing exceptions: an array of indices and of values. +- FastLanes Frame-of-Reference, BitPacking, and Delta. +- Fast Static Symbol Table (FSST). +- Adaptive Lossless Floating Point (ALP). +- ALP Real Double (ALP-RD). +- ByteBool, one byte per Boolean value. +- ZigZag. + +Specification +------------- + +TODO! diff --git a/docs/guide.rst b/docs/guide.rst new file mode 100644 index 0000000000..068d12e708 --- /dev/null +++ b/docs/guide.rst @@ -0,0 +1,173 @@ +Guide +===== + +.. admonition:: Rustaceans + + See the `Vortex Rust documentation `_, for details on Vortex in Rust. + +Python +------ + +Construct a Vortex array from lists of simple Python values: + +.. 
doctest:: + + >>> import vortex + >>> vtx = vortex.array([1, 2, 3, 4]) + >>> vtx.dtype + int(64, False) + +Python's :obj:`None` represents a missing or null value and changes the dtype of the array from +non-nullable 64-bit integers to nullable 64-bit integers: + +.. doctest:: + + >>> vtx = vortex.array([1, 2, None, 4]) + >>> vtx.dtype + int(64, True) + +A list of :class:`dict` is converted to an array of structures. Missing values may appear at any +level: + +.. doctest:: + + >>> vtx = vortex.array([ + ... {'name': 'Joseph', 'age': 25}, + ... {'name': None, 'age': 31}, + ... {'name': 'Angela', 'age': None}, + ... {'name': 'Mikhail', 'age': 57}, + ... {'name': None, 'age': None}, + ... None, + ... ]) + >>> vtx.dtype + struct({"age": int(64, True), "name": utf8(True)}, True) + +:meth:`.Array.to_pylist` converts a Vortex array into a list of Python values. + +.. doctest:: + + >>> vtx.to_pylist() + [{'age': 25, 'name': 'Joseph'}, {'age': 31, 'name': None}, {'age': None, 'name': 'Angela'}, {'age': 57, 'name': 'Mikhail'}, {'age': None, 'name': None}, {'age': None, 'name': None}] + +Arrow +^^^^^ + +The :func:`~vortex.encoding.array` function constructs a Vortex array from an Arrow one without any +copies: + +.. doctest:: + + >>> import pyarrow as pa + >>> arrow = pa.array([1, 2, None, 3]) + >>> arrow.type + DataType(int64) + >>> vtx = vortex.array(arrow) + >>> vtx.dtype + int(64, True) + +:meth:`.Array.to_arrow_array` converts back to an Arrow array: + +.. doctest:: + + >>> vtx.to_arrow_array() + + [ + 1, + 2, + null, + 3 + ] + +If you have a struct array, use :meth:`.Array.to_arrow_table` to construct an Arrow table: + +.. doctest:: + + >>> struct_vtx = vortex.array([ + ... {'name': 'Joseph', 'age': 25}, + ... {'name': 'Narendra', 'age': 31}, + ... {'name': 'Angela', 'age': 33}, + ... {'name': 'Mikhail', 'age': 57}, + ... 
]) + >>> struct_vtx.to_arrow_table() + pyarrow.Table + age: int64 + name: string_view + ---- + age: [[25,31,33,57]] + name: [["Joseph","Narendra","Angela","Mikhail"]] + +Pandas +^^^^^^ + +:meth:`.Array.to_pandas_df` converts a Vortex array into a Pandas DataFrame: + +.. doctest:: + + >>> df = struct_vtx.to_pandas_df() + >>> df + age name + 0 25 Joseph + 1 31 Narendra + 2 33 Angela + 3 57 Mikhail + +:func:`~vortex.encoding.array` converts from a Pandas DataFrame into a Vortex array: + + >>> vortex.array(df).to_arrow_table() + pyarrow.Table + age: int64 + name: string_view + ---- + age: [[25,31,33,57]] + name: [["Joseph","Narendra","Angela","Mikhail"]] + + +.. _query-engine-integration: + +Query Engines +------------- + +:class:`~vortex.dataset.VortexDataset` implements the :class:`pyarrow.dataset.Dataset` API which +enables many Python-based query engines to pushdown row filters and column projections on Vortex +files. + +Polars +^^^^^^ + + >>> import polars as pl + >>> ds = vortex.dataset.from_path( + ... '_static/example.vortex' + ... ) + >>> lf = pl.scan_pyarrow_dataset(ds) + >>> lf = lf.select('tip_amount', 'fare_amount') + >>> lf = lf.head(3) + >>> lf.collect() + shape: (3, 2) + ┌────────────┬─────────────┐ + │ tip_amount ┆ fare_amount │ + │ --- ┆ --- │ + │ f64 ┆ f64 │ + ╞════════════╪═════════════╡ + │ 0.0 ┆ 61.8 │ + │ 5.1 ┆ 20.5 │ + │ 16.54 ┆ 70.0 │ + └────────────┴─────────────┘ + +DuckDB +^^^^^^ + + >>> import duckdb + >>> ds = vortex.dataset.from_path( + ... '_static/example.vortex' + ... 
) + >>> duckdb.sql('select ds.tip_amount, ds.fare_amount from ds limit 3').show() + ┌────────────┬─────────────┐ + │ tip_amount │ fare_amount │ + │ double │ double │ + ├────────────┼─────────────┤ + │ 0.0 │ 61.8 │ + │ 5.1 │ 20.5 │ + │ 16.54 │ 70.0 │ + └────────────┴─────────────┘ + + diff --git a/docs/index.rst b/docs/index.rst index c89a19c9a3..2a2d9e9232 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,18 +3,68 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Vortex documentation -==================== +Wide, Fast & Compact. Pick Three. +================================== -Vortex is an Apache Arrow-compatible toolkit for working with compressed array data. +.. grid:: 1 1 2 2 + :gutter: 4 4 4 4 + + .. grid-item-card:: The File Format + :link: file_format + :link-type: doc + + Currently just a schematic. Specification forthcoming. + + .. grid-item-card:: The Rust API + :link: https://spiraldb.github.io/vortex/docs2/rust/doc/vortex + + The primary interface to the Vortex toolkit. + + .. grid-item-card:: Quickstart + :link: quickstart + :link-type: doc + + For end-users looking to read and write Vortex files. + + .. grid-item-card:: The Benchmarks + :link: https://bench.vortex.dev/ + + Random access, throughput, and TPC-H. + + +Vortex is a fast & extensible columnar file format that is based around state-of-the-art research +from the database community. It is built around cascading compression with lightweight encodings (no +block compression), allowing for both efficient random access and extremely fast decompression. + +Vortex also includes an accompanying in-memory format for these (recursively) compressed arrays, +that is zero-copy compatible with Apache Arrow in uncompressed form. Taken together, the Vortex +library is a useful toolkit with compressed Arrow data in-memory, on-disk, & over-the-wire. 
+ +Vortex aspires to succeed Apache Parquet by pushing the Pareto frontier outwards: 1-2x faster +writes, 2-10x faster scans, and 100-200x faster random access reads, while preserving the same +approximate compression ratio as Parquet v2 with zstd. + +Its features include: + +- A zero-copy data layout for disk, memory, and the wire. +- Kernels for computing on, filtering, slicing, indexing, and projecting compressed arrays. +- Builtin state-of-the-art codecs including FastLanes (integer bit-packing), ALP (floating point), + and FSST (strings). +- Support for custom user-implemented codecs. +- Support for, but no requirement for, row groups. +- A read sub-system supporting filter and projection pushdown. + +Vortex's flexible layout empowers writers to choose the right layout for their setting: fast writes, +fast reads, small files, few columns, many columns, over-sized columns, etc. + +Documentation +------------- .. toctree:: :maxdepth: 2 - :caption: Contents: - - encoding - dtype - io - dataset - expr - scalar + + quickstart + guide + file_format + api/index + Rust API diff --git a/docs/io.rst b/docs/io.rst deleted file mode 100644 index f2cc405ce9..0000000000 --- a/docs/io.rst +++ /dev/null @@ -1,6 +0,0 @@ -Input and Output -================ - -.. automodule:: vortex.io - :members: - :imported-members: diff --git a/docs/pyproject.toml b/docs/pyproject.toml index 53ceee91e1..a93d7459ed 100644 --- a/docs/pyproject.toml +++ b/docs/pyproject.toml @@ -3,7 +3,12 @@ name = "docs" version = "0.1.0" description = "Vortex documentation." 
authors = [] -dependencies = ["pydata-sphinx-theme>=0.15.4", "sphinx>=8.0.2", "pyvortex"] +dependencies = [ + "pydata-sphinx-theme>=0.16.0", + "sphinx>=8.0.2", + "pyvortex", + "sphinx-design>=0.6.1", +] requires-python = ">= 3.10" [tool.uv] diff --git a/docs/quickstart.rst b/docs/quickstart.rst new file mode 100644 index 0000000000..65a71cb7c3 --- /dev/null +++ b/docs/quickstart.rst @@ -0,0 +1,199 @@ +Quickstart +========== + +The reference implementation exposes both a Rust and Python API. A C API is currently in progress. + +- :ref:`Quickstart for Python <python-quickstart>` +- :ref:`Quickstart for Rust <rust-quickstart>` +- :ref:`Quickstart for C <c-quickstart>` + +.. _python-quickstart: + +Python +------ + +Install +^^^^^^^ + +:: + + pip install vortex-array + +Convert +^^^^^^^ + +You can either use your own Parquet file or download the `example used here +`__. + +Use Arrow to read a Parquet file and then use :func:`~vortex.encoding.array` to construct an uncompressed +Vortex array: + +.. doctest:: + + >>> import pyarrow.parquet as pq + >>> import vortex + >>> parquet = pq.read_table("_static/example.parquet") + >>> vtx = vortex.array(parquet) + >>> vtx.nbytes + 141024 + +Compress +^^^^^^^^ + +Use :func:`~vortex.encoding.compress` to compress the Vortex array and check the relative size: + +.. doctest:: + + >>> cvtx = vortex.compress(vtx) + >>> cvtx.nbytes + 13970 + >>> cvtx.nbytes / vtx.nbytes + 0.099... + +Vortex uses nearly ten times fewer bytes than Arrow. Fewer bytes means more of your data fits in +cache and RAM. + +Write +^^^^^ + +Use :func:`~vortex.io.write_path` to write the Vortex array to disk: + +.. doctest:: + + >>> vortex.io.write_path(cvtx, "example.vortex") + +Small Vortex files (this one is just 71KiB) currently have substantial overhead relative to their +size. This will be addressed shortly. On files with at least tens of megabytes of data, Vortex is +similar to or smaller than Parquet. + +.. 
doctest:: + + >>> from os.path import getsize + >>> getsize("example.vortex") / getsize("_static/example.parquet") + 2.1... + +Read +^^^^ + +Use :func:`~vortex.io.read_path` to read the Vortex array from disk: + +.. doctest:: + + >>> cvtx = vortex.io.read_path("example.vortex") + +.. _rust-quickstart: + +Rust +---- + +Install +^^^^^^^ + +Install vortex and all the first-party array encodings:: + + cargo add vortex-array vortex-alp vortex-fsst vortex-fastlanes \ + vortex-bytebool vortex-datetime-dtype vortex-datetime-parts \ + vortex-dict vortex-runend vortex-runend-bool vortex-zigzag \ + vortex-sampling-compressor vortex-serde + +Convert +^^^^^^^ + +You can either use your own Parquet file or download the `example used here +`__. + +Use Arrow to read a Parquet file and then construct an uncompressed Vortex array: + +.. code-block:: rust + + use std::fs::File; + + use arrow_array::RecordBatchReader; + use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; + use vortex::array::ChunkedArray; + use vortex::arrow::FromArrowType; + use vortex::{Array, IntoArray}; + use vortex_dtype::DType; + + let reader = + ParquetRecordBatchReaderBuilder::try_new(File::open("_static/example.parquet").unwrap()) + .unwrap() + .build() + .unwrap(); + let dtype = DType::from_arrow(reader.schema()); + let chunks = reader + .map(|x| Array::try_from(x.unwrap()).unwrap()) + .collect::<Vec<_>>(); + let vtx = ChunkedArray::try_new(chunks, dtype).unwrap().into_array(); + +Compress +^^^^^^^^ + +Use the sampling compressor to compress the Vortex array and check the relative size: + +.. 
code-block:: rust + + use std::collections::HashSet; + + use vortex_sampling_compressor::{SamplingCompressor, DEFAULT_COMPRESSORS}; + + let compressor = SamplingCompressor::new(HashSet::from(*DEFAULT_COMPRESSORS)); + let cvtx = compressor.compress(&vtx, None).unwrap().into_array(); + println!("{}", cvtx.nbytes()); + +Write +^^^^^ + +Reading and writing both require an async runtime; in this example we use Tokio. 
The LayoutWriter +knows how to write Vortex arrays to disk: + +.. code-block:: rust + + use std::path::Path; + + use tokio::fs::File as TokioFile; + use vortex_serde::layouts::LayoutWriter; + + let file = TokioFile::create(Path::new("example.vortex")) + .await + .unwrap(); + let writer = LayoutWriter::new(file) + .write_array_columns(cvtx.clone()) + .await + .unwrap(); + writer.finalize().await.unwrap(); + +Read +^^^^ + +.. code-block:: rust + + use futures::TryStreamExt; + use vortex_sampling_compressor::ALL_COMPRESSORS_CONTEXT; + use vortex_serde::layouts::{LayoutContext, LayoutDeserializer, LayoutReaderBuilder}; + + let file = TokioFile::open(Path::new("example.vortex")).await.unwrap(); + let builder = LayoutReaderBuilder::new( + file, + LayoutDeserializer::new( + ALL_COMPRESSORS_CONTEXT.clone(), + LayoutContext::default().into(), + ), + ); + + let stream = builder.build().await.unwrap(); + let dtype = stream.schema().clone().into(); + let vecs: Vec<Array> = stream.try_collect().await.unwrap(); + let cvtx = ChunkedArray::try_new(vecs, dtype) + .unwrap() + .into_array(); + + println!("{}", cvtx.nbytes()); + + +.. _c-quickstart: + +C +- + +Coming soon! diff --git a/docs/scalar.rst b/docs/scalar.rst deleted file mode 100644 index 9fb3b26cfc..0000000000 --- a/docs/scalar.rst +++ /dev/null @@ -1,6 +0,0 @@ -Scalar Values -============= - -.. 
automodule:: vortex.scalar - :members: - :imported-members: diff --git a/pyvortex/pyproject.toml b/pyvortex/pyproject.toml index f482ce0541..08f78c3ceb 100644 --- a/pyvortex/pyproject.toml +++ b/pyvortex/pyproject.toml @@ -41,4 +41,5 @@ features = ["pyo3/extension-module"] include = [ { path = "rust-toolchain.toml", format = "sdist" }, { path = "README.md", format = "sdist" }, + { path = "python/vortex/py.typed", format = "sdist" }, ] diff --git a/pyvortex/python/vortex/__init__.py b/pyvortex/python/vortex/__init__.py index b7101a7ccc..6a50c5978b 100644 --- a/pyvortex/python/vortex/__init__.py +++ b/pyvortex/python/vortex/__init__.py @@ -5,5 +5,6 @@ __doc__ = module_docs del module_docs array = encoding.array +compress = encoding.compress __all__ = ["array", dtype, expr, io, encoding, scalar, dataset] diff --git a/pyvortex/python/vortex/dataset.py b/pyvortex/python/vortex/dataset.py index d8b3254966..7f9d8d5d3b 100644 --- a/pyvortex/python/vortex/dataset.py +++ b/pyvortex/python/vortex/dataset.py @@ -12,7 +12,12 @@ class VortexDataset(pyarrow.dataset.Dataset): - """Read Vortex files with row filter and column selection pushdown.""" + """Read Vortex files with row filter and column selection pushdown. + + This class implements the :class:`.pyarrow.dataset.Dataset` interface which enables its use with + Polars, DuckDB, Pandas and others. + + """ def __init__(self, dataset): self._dataset = dataset @@ -62,6 +67,35 @@ def head( use_threads: bool | None = None, memory_pool: pa.MemoryPool = None, ) -> pa.Table: + """Load the first `num_rows` of the dataset. + + Parameters + ---------- + num_rows : int + The number of rows to load. + columns : list of str + The columns to keep, identified by name. + filter : :class:`.pyarrow.dataset.Expression` + Keep only rows for which this expression evaluates to ``True``. Any rows for which + this expression evaluates to ``Null`` are removed. + batch_size : int + The maximum number of rows per batch. 
+ batch_readahead : int + Not implemented. + fragment_readahead : int + Not implemented. + fragment_scan_options : :class:`.pyarrow.dataset.FragmentScanOptions` + Not implemented. + use_threads : bool + Not implemented. + memory_pool : :class:`.pyarrow.MemoryPool` + Not implemented. + + Returns + ------- + table : :class:`.pyarrow.Table` + + """ if batch_readahead is not None: raise ValueError("batch_readahead not supported") if fragment_readahead is not None: @@ -114,7 +148,33 @@ def scanner( use_threads: bool | None = None, memory_pool: pa.MemoryPool = None, ) -> pa.dataset.Scanner: - """Not implemented.""" + """Construct a :class:`.pyarrow.dataset.Scanner`. + + Parameters + ---------- + columns : list of str + The columns to keep, identified by name. + filter : :class:`.pyarrow.dataset.Expression` + Keep only rows for which this expression evaluates to ``True``. Any rows for which + this expression evaluates to ``Null`` are removed. + batch_size : int + The maximum number of rows per batch. + batch_readahead : int + Not implemented. + fragment_readahead : int + Not implemented. + fragment_scan_options : :class:`.pyarrow.dataset.FragmentScanOptions` + Not implemented. + use_threads : bool + Not implemented. + memory_pool : :class:`.pyarrow.MemoryPool` + Not implemented. + + Returns + ------- + scanner : :class:`.pyarrow.dataset.Scanner` + + """ return VortexScanner( self, columns, @@ -143,6 +203,35 @@ def take( use_threads: bool | None = None, memory_pool: pa.MemoryPool = None, ) -> pa.Table: + """Load a subset of rows identified by their absolute indices. + + Parameters + ---------- + indices : :class:`.pyarrow.Array` + A numeric array of absolute indices into `self` indicating which rows to keep. + columns : list of str + The columns to keep, identified by name. + filter : :class:`.pyarrow.dataset.Expression` + Keep only rows for which this expression evaluates to ``True``. Any rows for which + this expression evaluates to ``Null`` are removed. 
+ batch_size : int + The maximum number of rows per batch. + batch_readahead : int + Not implemented. + fragment_readahead : int + Not implemented. + fragment_scan_options : :class:`.pyarrow.dataset.FragmentScanOptions` + Not implemented. + use_threads : bool + Not implemented. + memory_pool : :class:`.pyarrow.MemoryPool` + Not implemented. + + Returns + ------- + table : :class:`.pyarrow.Table` + + """ return ( self._dataset.to_array(columns=columns, batch_size=batch_size, row_filter=filter) .take(encoding.array(indices)) @@ -160,6 +249,33 @@ def to_record_batch_reader( use_threads: bool | None = None, memory_pool: pa.MemoryPool = None, ) -> pa.RecordBatchReader: + """Construct a :class:`.pyarrow.RecordBatchReader`. + + Parameters + ---------- + columns : list of str + The columns to keep, identified by name. + filter : :class:`.pyarrow.dataset.Expression` + Keep only rows for which this expression evaluates to ``True``. Any rows for which + this expression evaluates to ``Null`` are removed. + batch_size : int + The maximum number of rows per batch. + batch_readahead : int + Not implemented. + fragment_readahead : int + Not implemented. + fragment_scan_options : :class:`.pyarrow.dataset.FragmentScanOptions` + Not implemented. + use_threads : bool + Not implemented. + memory_pool : :class:`.pyarrow.MemoryPool` + Not implemented. + + Returns + ------- + reader : :class:`.pyarrow.RecordBatchReader` + + """ if batch_readahead is not None: raise ValueError("batch_readahead not supported") if fragment_readahead is not None: @@ -186,6 +302,33 @@ def to_batches( use_threads: bool | None = None, memory_pool: pa.MemoryPool = None, ) -> Iterator[pa.RecordBatch]: + """Construct an iterator of :class:`.pyarrow.RecordBatch`. + + Parameters + ---------- + columns : list of str + The columns to keep, identified by name. + filter : :class:`.pyarrow.dataset.Expression` + Keep only rows for which this expression evaluates to ``True``. 
Any rows for which + this expression evaluates to ``Null`` is removed. + batch_size : int + The maximum number of rows per batch. + batch_readahead : int + Not implemented. + fragment_readahead : int + Not implemented. + fragment_scan_options : :class:`.pyarrow.dataset.FragmentScanOptions` + Not implemented. + use_threads : bool + Not implemented. + memory_pool : :class:`.pyarrow.MemoryPool` + Not implemented. + + Returns + ------- + table : :class:`.pyarrow.Table` + + """ record_batch_reader = self.to_record_batch_reader( columns, filter, @@ -213,6 +356,33 @@ def to_table( use_threads: bool | None = None, memory_pool: pa.MemoryPool = None, ) -> pa.Table: + """Construct an Arrow :class:`.pyarrow.Table`. + + Parameters + ---------- + columns : list of str + The columns to keep, identified by name. + filter : :class:`.pyarrow.dataset.Expression` + Keep only rows for which this expression evalutes to ``True``. Any rows for which + this expression evaluates to ``Null`` is removed. + batch_size : int + The maximum number of rows per batch. + batch_readahead : int + Not implemented. + fragment_readahead : int + Not implemented. + fragment_scan_options : :class:`.pyarrow.dataset.FragmentScanOptions` + Not implemented. + use_threads : bool + Not implemented. + memory_pool : :class:`.pyarrow.MemoryPool` + Not implemented. 
+ + Returns + ------- + table : :class:`.pyarrow.Table` + + """ if batch_readahead is not None: raise ValueError("batch_readahead not supported") if fragment_readahead is not None: @@ -229,8 +399,44 @@ def to_table( return self._dataset.to_array(columns=columns, batch_size=batch_size, row_filter=filter).to_arrow_table() +def from_path(path: str) -> VortexDataset: + return VortexDataset(_lib_dataset.dataset_from_path(path)) + + +def from_url(url: str) -> VortexDataset: + return VortexDataset(_lib_dataset.dataset_from_url(url)) + + class VortexScanner(pa.dataset.Scanner): - """A PyArrow Dataset Scanner that reads from a Vortex Array.""" + """A PyArrow Dataset Scanner that reads from a Vortex Array. + + Parameters + ---------- + dataset : VortexDataset + The dataset to scan. + columns : list of str + The columns to keep, identified by name. + filter : :class:`.pyarrow.dataset.Expression` + Keep only rows for which this expression evalutes to ``True``. Any rows for which + this expression evaluates to ``Null`` is removed. + batch_size : int + The maximum number of rows per batch. + batch_readahead : int + Not implemented. + fragment_readahead : int + Not implemented. + fragment_scan_options : :class:`.pyarrow.dataset.FragmentScanOptions` + Not implemented. + use_threads : bool + Not implemented. + memory_pool : :class:`.pyarrow.MemoryPool` + Not implemented. + + Returns + ------- + table : :class:`.pyarrow.Table` + + """ def __init__( self, @@ -270,6 +476,18 @@ def count_rows(self): ) def head(self, num_rows: int) -> pa.Table: + """Load the first `num_rows` of the dataset. + + Parameters + ---------- + num_rows : int + The number of rows to read. 
+ + Returns + ------- + table : :class:`.pyarrow.Table` + + """ return self._dataset.head( num_rows, self._columns, @@ -287,6 +505,13 @@ def scan_batches(self) -> Iterator[pa.dataset.TaggedRecordBatch]: raise NotImplementedError("scan batches") def to_batches(self) -> Iterator[pa.RecordBatch]: + """Construct an iterator of :class:`.pyarrow.RecordBatch`. + + Returns + ------- + table : :class:`.pyarrow.Table` + + """ return self._dataset.to_batches( self._columns, self._filter, @@ -299,6 +524,14 @@ def to_batches(self) -> Iterator[pa.RecordBatch]: ) def to_reader(self) -> pa.RecordBatchReader: + """Construct a :class:`.pyarrow.RecordBatchReader`. + + + Returns + ------- + table : :class:`.pyarrow.Table` + + """ return self._dataset.to_record_batch_reader( self._columns, self._filter, @@ -311,6 +544,14 @@ def to_reader(self) -> pa.RecordBatchReader: ) def to_table(self) -> pa.Table: + """Construct an Arrow :class:`.pyarrow.Table`. + + + Returns + ------- + table : :class:`.pyarrow.Table` + + """ return self._dataset.to_table( self._columns, self._filter, diff --git a/pyvortex/python/vortex/encoding.py b/pyvortex/python/vortex/encoding.py index ac522d3750..75eeb5655c 100644 --- a/pyvortex/python/vortex/encoding.py +++ b/pyvortex/python/vortex/encoding.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import pandas import pyarrow @@ -61,7 +61,7 @@ def _Array_to_arrow_table(self: _encoding.Array) -> pyarrow.Table: Examples -------- - >>> array = vortex.encoding.array([ + >>> array = vortex.array([ ... {'name': 'Joseph', 'age': 25}, ... {'name': 'Narendra', 'age': 31}, ... {'name': 'Angela', 'age': 33}, @@ -82,7 +82,7 @@ def _Array_to_arrow_table(self: _encoding.Array) -> pyarrow.Table: Array.to_arrow_table = _Array_to_arrow_table -def _Array_to_pandas(self: _encoding.Array) -> "pandas.DataFrame": +def _Array_to_pandas_df(self: _encoding.Array) -> "pandas.DataFrame": """Construct a Pandas dataframe from this Vortex array. 
Warning @@ -99,27 +99,24 @@ def _Array_to_pandas(self: _encoding.Array) -> "pandas.DataFrame": Construct a dataframe from a Vortex array: - >>> array = vortex.encoding.array([ + >>> array = vortex.array([ ... {'name': 'Joseph', 'age': 25}, ... {'name': 'Narendra', 'age': 31}, ... {'name': 'Angela', 'age': 33}, ... {'name': 'Mikhail', 'age': 57}, ... ]) - >>> array.to_pandas() + >>> array.to_pandas_df() age name 0 25 Joseph 1 31 Narendra 2 33 Angela 3 57 Mikhail - - Lift the struct fields to the top-level in the dataframe: - """ return self.to_arrow_table().to_pandas(types_mapper=pandas.ArrowDtype) -Array.to_pandas = _Array_to_pandas +Array.to_pandas_df = _Array_to_pandas_df def _Array_to_polars_dataframe( @@ -146,7 +143,7 @@ def _Array_to_polars_dataframe( Examples -------- - >>> array = vortex.encoding.array([ + >>> array = vortex.array([ ... {'name': 'Joseph', 'age': 25}, ... {'name': 'Narendra', 'age': 31}, ... {'name': 'Angela', 'age': 33}, @@ -193,7 +190,7 @@ def _Array_to_polars_series(self: _encoding.Array): # -> 'polars.Series': # br Convert a numeric array with nulls to a Polars Series: - >>> vortex.encoding.array([1, None, 2, 3]).to_polars_series() # doctest: +NORMALIZE_WHITESPACE + >>> vortex.array([1, None, 2, 3]).to_polars_series() # doctest: +NORMALIZE_WHITESPACE shape: (4,) Series: '' [i64] [ @@ -205,7 +202,7 @@ def _Array_to_polars_series(self: _encoding.Array): # -> 'polars.Series': # br Convert a UTF-8 string array to a Polars Series: - >>> vortex.encoding.array(['hello, ', 'is', 'it', 'me?']).to_polars_series() # doctest: +NORMALIZE_WHITESPACE + >>> vortex.array(['hello, ', 'is', 'it', 'me?']).to_polars_series() # doctest: +NORMALIZE_WHITESPACE shape: (4,) Series: '' [str] [ @@ -217,7 +214,7 @@ def _Array_to_polars_series(self: _encoding.Array): # -> 'polars.Series': # br Convert a struct array to a Polars Series: - >>> array = vortex.encoding.array([ + >>> array = vortex.array([ ... {'name': 'Joseph', 'age': 25}, ... 
{'name': 'Narendra', 'age': 31}, ... {'name': 'Angela', 'age': 33}, @@ -262,7 +259,7 @@ def _Array_to_numpy(self: _encoding.Array, *, zero_copy_only: bool = True) -> "n Construct an immutable ndarray from a Vortex array: - >>> array = vortex.encoding.array([1, 0, 0, 1]) + >>> array = vortex.array([1, 0, 0, 1]) >>> array.to_numpy() array([1, 0, 0, 1]) @@ -273,14 +270,39 @@ def _Array_to_numpy(self: _encoding.Array, *, zero_copy_only: bool = True) -> "n Array.to_numpy = _Array_to_numpy -def array(obj: pyarrow.Array | list) -> Array: +def _Array_to_pylist(self: _encoding.Array) -> list[Any]: + """Deeply copy an Array into a Python list. + + Returns + ------- + :class:`list` + + Examples + -------- + + >>> array = vortex.array([ + ... {'name': 'Joseph', 'age': 25}, + ... {'name': 'Narendra', 'age': 31}, + ... {'name': 'Angela', 'age': 33}, + ... ]) + >>> array.to_pylist() + [{'age': 25, 'name': 'Joseph'}, {'age': 31, 'name': 'Narendra'}, {'age': 33, 'name': 'Angela'}] + + """ + return self.to_arrow_table().to_pylist() + + +Array.to_pylist = _Array_to_pylist + + +def array(obj: pyarrow.Array | list | Any) -> Array: """The main entry point for creating Vortex arrays from other Python objects. This function is also available as ``vortex.array``. Parameters ---------- - obj : :class:`pyarrow.Array` or :class:`list` + obj : :class:`pyarrow.Array`, :class:`list`, :class:`pandas.DataFrame` The elements of this array or list become the elements of the Vortex array. Returns @@ -290,9 +312,9 @@ def array(obj: pyarrow.Array | list) -> Array: Examples -------- - A Vortex array containing the first three integers. + A Vortex array containing the first three integers: - >>> vortex.encoding.array([1, 2, 3]).to_arrow_array() + >>> vortex.array([1, 2, 3]).to_arrow_array() [ 1, @@ -300,9 +322,9 @@ def array(obj: pyarrow.Array | list) -> Array: 3 ] - The same Vortex array with a null value in the third position. 
+ The same Vortex array with a null value in the third position: - >>> vortex.encoding.array([1, 2, None, 3]).to_arrow_array() + >>> vortex.array([1, 2, None, 3]).to_arrow_array() [ 1, @@ -314,7 +336,7 @@ def array(obj: pyarrow.Array | list) -> Array: Initialize a Vortex array from an Arrow array: >>> arrow = pyarrow.array(['Hello', 'it', 'is', 'me']) - >>> vortex.encoding.array(arrow).to_arrow_array() + >>> vortex.array(arrow).to_arrow_array() [ "Hello", @@ -323,7 +345,40 @@ def array(obj: pyarrow.Array | list) -> Array: "me" ] + Initialize a Vortex array from a Pandas dataframe: + + >>> import pandas as pd + >>> df = pd.DataFrame({ + ... "Name": ["Braund", "Allen", "Bonnell"], + ... "Age": [22, 35, 58], + ... }) + >>> vortex.array(df).to_arrow_array() + + [ + -- is_valid: all not null + -- child 0 type: string_view + [ + "Braund", + "Allen", + "Bonnell" + ] + -- child 1 type: int64 + [ + 22, + 35, + 58 + ] + ] + """ + if isinstance(obj, list): return _encoding._encode(pyarrow.array(obj)) + try: + import pandas + + if isinstance(obj, pandas.DataFrame): + return _encoding._encode(pyarrow.Table.from_pandas(obj)) + except ImportError: + pass return _encoding._encode(obj) diff --git a/pyvortex/src/array.rs b/pyvortex/src/array.rs index 5e2f14d263..5022157117 100644 --- a/pyvortex/src/array.rs +++ b/pyvortex/src/array.rs @@ -20,8 +20,8 @@ use crate::scalar::scalar_into_py; /// /// Arrays support all the standard comparison operations: /// -/// >>> a = vortex.encoding.array(['dog', None, 'cat', 'mouse', 'fish']) -/// >>> b = vortex.encoding.array(['doug', 'jennifer', 'casper', 'mouse', 'faust']) +/// >>> a = vortex.array(['dog', None, 'cat', 'mouse', 'fish']) +/// >>> b = vortex.array(['doug', 'jennifer', 'casper', 'mouse', 'faust']) /// >>> (a < b).to_arrow_array() /// /// [ @@ -106,7 +106,7 @@ impl PyArray { /// /// Round-trip an Arrow array through a Vortex array: /// - /// >>> vortex.encoding.array([1, 2, 3]).to_arrow_array() + /// >>> vortex.array([1, 2, 
3]).to_arrow_array() /// /// [ /// 1, @@ -179,19 +179,19 @@ impl PyArray { /// Examples /// -------- /// - /// By default, :func:`vortex.encoding.array` uses the largest available bit-width: + /// By default, :func:`~vortex.encoding.array` uses the largest available bit-width: /// - /// >>> vortex.encoding.array([1, 2, 3]).dtype + /// >>> vortex.array([1, 2, 3]).dtype /// int(64, False) /// /// Including a :obj:`None` forces a nullable type: /// - /// >>> vortex.encoding.array([1, None, 2, 3]).dtype + /// >>> vortex.array([1, None, 2, 3]).dtype /// int(64, True) /// /// A UTF-8 string array: /// - /// >>> vortex.encoding.array(['hello, ', 'is', 'it', 'me?']).dtype + /// >>> vortex.array(['hello, ', 'is', 'it', 'me?']).dtype /// utf8(False) #[getter] fn dtype(self_: PyRef) -> PyResult> { @@ -244,19 +244,19 @@ impl PyArray { /// /// Parameters /// ---------- - /// filter : :class:`vortex.encoding.Array` + /// filter : :class:`~vortex.encoding.Array` /// Keep all the rows in ``self`` for which the correspondingly indexed row in `filter` is True. /// /// Returns /// ------- - /// :class:`vortex.encoding.Array` + /// :class:`~vortex.encoding.Array` /// /// Examples /// -------- /// /// Keep only the single digit positive integers. /// - /// >>> a = vortex.encoding.array([0, 42, 1_000, -23, 10, 9, 5]) + /// >>> a = vortex.array([0, 42, 1_000, -23, 10, 9, 5]) /// >>> filter = vortex.array([True, False, False, False, False, True, True]) /// >>> a.filter(filter).to_arrow_array() /// @@ -279,7 +279,7 @@ impl PyArray { /// Fill forward sensor values over intermediate missing values. Note that leading nulls are /// replaced with 0.0: /// - /// >>> a = vortex.encoding.array([ + /// >>> a = vortex.array([ /// ... None, None, 30.29, 30.30, 30.30, None, None, 30.27, 30.25, /// ... 30.22, None, None, None, None, 30.12, 30.11, 30.11, 30.11, /// ... 
30.10, 30.08, None, 30.21, 30.03, 30.03, 30.05, 30.07, 30.07, @@ -334,12 +334,12 @@ impl PyArray { /// /// Retrieve the last element from an array of integers: /// - /// >>> vortex.encoding.array([10, 42, 999, 1992]).scalar_at(3) + /// >>> vortex.array([10, 42, 999, 1992]).scalar_at(3) /// 1992 /// /// Retrieve the third element from an array of strings: /// - /// >>> array = vortex.encoding.array(["hello", "goodbye", "it", "is"]) + /// >>> array = vortex.array(["hello", "goodbye", "it", "is"]) /// >>> array.scalar_at(2) /// /// @@ -352,7 +352,7 @@ impl PyArray { /// /// Retrieve an element from an array of structures: /// - /// >>> array = vortex.encoding.array([ + /// >>> array = vortex.array([ /// ... {'name': 'Joseph', 'age': 25}, /// ... {'name': 'Narendra', 'age': 31}, /// ... {'name': 'Angela', 'age': 33}, @@ -376,7 +376,7 @@ impl PyArray { /// /// Out of bounds accesses are prohibited: /// - /// >>> vortex.encoding.array([10, 42, 999, 1992]).scalar_at(10) + /// >>> vortex.array([10, 42, 999, 1992]).scalar_at(10) /// Traceback (most recent call last): /// ... /// ValueError: index 10 out of bounds from 0 to 4 @@ -384,7 +384,7 @@ impl PyArray { /// /// Unlike Python, negative indices are not supported: /// - /// >>> vortex.encoding.array([10, 42, 999, 1992]).scalar_at(-2) + /// >>> vortex.array([10, 42, 999, 1992]).scalar_at(-2) /// Traceback (most recent call last): /// ... /// OverflowError: can't convert negative int to unsigned @@ -398,20 +398,20 @@ impl PyArray { /// /// Parameters /// ---------- - /// indices : :class:`vortex.encoding.Array` + /// indices : :class:`~vortex.encoding.Array` /// An array of indices to keep. 
/// /// Returns /// ------- - /// :class:`vortex.encoding.Array` + /// :class:`~vortex.encoding.Array` /// /// Examples /// -------- /// /// Keep only the first and third elements: /// - /// >>> a = vortex.encoding.array(['a', 'b', 'c', 'd']) - /// >>> indices = vortex.encoding.array([0, 2]) + /// >>> a = vortex.array(['a', 'b', 'c', 'd']) + /// >>> indices = vortex.array([0, 2]) /// >>> a.take(indices).to_arrow_array() /// /// [ @@ -421,8 +421,8 @@ impl PyArray { /// /// Permute and repeat the first and second elements: /// - /// >>> a = vortex.encoding.array(['a', 'b', 'c', 'd']) - /// >>> indices = vortex.encoding.array([0, 1, 1, 0]) + /// >>> a = vortex.array(['a', 'b', 'c', 'd']) + /// >>> indices = vortex.array([0, 1, 1, 0]) /// >>> a.take(indices).to_arrow_array() /// /// [ @@ -457,14 +457,14 @@ impl PyArray { /// /// Returns /// ------- - /// :class:`vortex.encoding.Array` + /// :class:`~vortex.encoding.Array` /// /// Examples /// -------- /// /// Keep only the second through third elements: /// - /// >>> a = vortex.encoding.array(['a', 'b', 'c', 'd']) + /// >>> a = vortex.array(['a', 'b', 'c', 'd']) /// >>> a.slice(1, 3).to_arrow_array() /// /// [ @@ -474,14 +474,14 @@ impl PyArray { /// /// Keep none of the elements: /// - /// >>> a = vortex.encoding.array(['a', 'b', 'c', 'd']) + /// >>> a = vortex.array(['a', 'b', 'c', 'd']) /// >>> a.slice(3, 3).to_arrow_array() /// /// [] /// /// Unlike Python, it is an error to slice outside the bounds of the array: /// - /// >>> a = vortex.encoding.array(['a', 'b', 'c', 'd']) + /// >>> a = vortex.array(['a', 'b', 'c', 'd']) /// >>> a.slice(2, 10).to_arrow_array() /// Traceback (most recent call last): /// ... @@ -489,7 +489,7 @@ impl PyArray { /// /// Or to slice with a negative value: /// - /// >>> a = vortex.encoding.array(['a', 'b', 'c', 'd']) + /// >>> a = vortex.array(['a', 'b', 'c', 'd']) /// >>> a.slice(-2, -1).to_arrow_array() /// Traceback (most recent call last): /// ... 
@@ -516,7 +516,7 @@ impl PyArray { /// /// Uncompressed arrays have straightforward encodings: /// - /// >>> arr = vortex.encoding.array([1, 2, None, 3]) + /// >>> arr = vortex.array([1, 2, None, 3]) /// >>> print(arr.tree_display()) /// root: vortex.primitive(0x03)(i64?, len=4) nbytes=33 B (100.00%) /// metadata: PrimitiveMetadata { validity: Array } diff --git a/pyvortex/src/compress.rs b/pyvortex/src/compress.rs index 31768fc008..be84bf018d 100644 --- a/pyvortex/src/compress.rs +++ b/pyvortex/src/compress.rs @@ -8,7 +8,7 @@ use crate::array::PyArray; /// /// Parameters /// ---------- -/// array : :class:`vortex.encoding.Array` +/// array : :class:`~vortex.encoding.Array` /// The array. /// /// Examples @@ -16,23 +16,23 @@ use crate::array::PyArray; /// /// Compress a very sparse array of integers: /// -/// >>> a = vortex.encoding.array([42 for _ in range(1000)]) -/// >>> str(vortex.encoding.compress(a)) +/// >>> a = vortex.array([42 for _ in range(1000)]) +/// >>> str(vortex.compress(a)) /// 'vortex.constant(0x09)(i64, len=1000)' /// /// Compress an array of increasing integers: /// -/// >>> a = vortex.encoding.array(list(range(1000))) -/// >>> str(vortex.encoding.compress(a)) +/// >>> a = vortex.array(list(range(1000))) +/// >>> str(vortex.compress(a)) /// 'fastlanes.for(0x17)(i64, len=1000)' /// /// Compress an array of increasing floating-point numbers and a few nulls: /// -/// >>> a = vortex.encoding.array([ +/// >>> a = vortex.array([ /// ... float(x) if x % 20 != 0 else None /// ... for x in range(1000) /// ... 
]) -/// >>> str(vortex.encoding.compress(a)) +/// >>> str(vortex.compress(a)) /// 'vortex.alp(0x11)(f64?, len=1000)' pub fn compress(array: &Bound) -> PyResult { let compressor = SamplingCompressor::default(); diff --git a/pyvortex/src/dtype.rs b/pyvortex/src/dtype.rs index 7a9e49a79a..61f419d0f7 100644 --- a/pyvortex/src/dtype.rs +++ b/pyvortex/src/dtype.rs @@ -119,7 +119,7 @@ pub fn dtype_bool(py: Python<'_>, nullable: bool) -> PyResult> { /// /// Parameters /// ---------- -/// width : one of 8, 16, 32, and 64. +/// width : Literal[8, 16, 32, 64]. /// The bit width determines the span of valid values. If :obj:`None`, 64 is used. /// /// nullable : :class:`bool` @@ -162,7 +162,7 @@ pub fn dtype_int(py: Python<'_>, width: Option, nullable: bool) -> PyResult /// /// Parameters /// ---------- -/// width : one of 8, 16, 32, and 64. +/// width : Literal[8, 16, 32, 64]. /// The bit width determines the span of valid values. If :obj:`None`, 64 is used. /// /// nullable : :class:`bool` @@ -205,7 +205,7 @@ pub fn dtype_uint(py: Python<'_>, width: Option, nullable: bool) -> PyResul /// /// Parameters /// ---------- -/// width : one of 16, 32, and 64. +/// width : Literal[16, 32, 64]. /// The bit width determines the range and precision of the floating-point values. If /// :obj:`None`, 64 is used. /// diff --git a/pyvortex/src/expr.rs b/pyvortex/src/expr.rs index 8bac88fe67..3b27fc120a 100644 --- a/pyvortex/src/expr.rs +++ b/pyvortex/src/expr.rs @@ -13,12 +13,15 @@ use crate::dtype::PyDType; /// An expression describes how to filter rows when reading an array from a file. /// +/// .. seealso:: +/// :func:`.column` +/// /// Examples /// ======== /// /// All the examples read the following file. /// -/// >>> a = vortex.encoding.array([ +/// >>> a = vortex.array([ /// ... {'name': 'Joseph', 'age': 25}, /// ... {'name': None, 'age': 31}, /// ... {'name': 'Angela', 'age': None}, @@ -209,7 +212,8 @@ impl PyExpr { /// A named column. /// -/// See :class:`.Expr` for more examples. 
+/// .. seealso:: +/// :class:`.Expr` /// /// Example /// ======= @@ -219,6 +223,8 @@ impl PyExpr { /// >>> name = vortex.expr.column("name") /// >>> filter = name == "Joseph" /// +/// See :class:`.Expr` for more examples. +/// #[pyfunction] pub fn column<'py>(name: &Bound<'py, PyString>) -> PyResult> { let py = name.py(); diff --git a/pyvortex/src/io.rs b/pyvortex/src/io.rs index ac32aed647..d93350df95 100644 --- a/pyvortex/src/io.rs +++ b/pyvortex/src/io.rs @@ -5,6 +5,7 @@ use pyo3::pyfunction; use pyo3::types::PyString; use tokio::fs::File; use vortex::Array; +use vortex_sampling_compressor::SamplingCompressor; use vortex_serde::layouts::LayoutWriter; use crate::dataset::{ObjectStoreUrlDataset, TokioFileDataset}; @@ -27,7 +28,7 @@ use crate::{PyArray, TOKIO_RUNTIME}; /// /// Read an array with a structured column and nulls at multiple levels and in multiple columns. /// -/// >>> a = vortex.encoding.array([ +/// >>> a = vortex.array([ /// ... {'name': 'Joseph', 'age': 25}, /// ... {'name': None, 'age': 31}, /// ... {'name': 'Angela', 'age': None}, @@ -111,7 +112,7 @@ use crate::{PyArray, TOKIO_RUNTIME}; /// /// TODO(DK): Top-level nullness does not work. /// -/// >>> a = vortex.encoding.array([ +/// >>> a = vortex.array([ /// ... {'name': 'Joseph', 'age': 25}, /// ... {'name': None, 'age': 31}, /// ... {'name': 'Angela', 'age': None}, @@ -186,23 +187,25 @@ pub fn read_url( dataset.to_array(projection, None, row_filter) } -#[pyfunction] /// Write a vortex struct array to the local filesystem. /// /// Parameters /// ---------- -/// array : :class:`vortex.encoding.Array` +/// array : :class:`~vortex.encoding.Array` /// The array. Must be an array of structures. /// /// f : :class:`str` /// The file path. /// +/// compress : :class:`bool` +/// Compress the array before writing, defaults to ``True``. +/// /// Examples /// -------- /// /// Write the array `a` to the local file `a.vortex`. /// -/// >>> a = vortex.encoding.array([ +/// >>> a = vortex.array([ /// ... 
{'x': 1}, /// ... {'x': 2}, /// ... {'x': 10}, @@ -211,7 +214,13 @@ pub fn read_url( /// ... ]) /// >>> vortex.io.write_path(a, "a.vortex") /// -pub fn write_path(array: &Bound<'_, PyArray>, f: &Bound<'_, PyString>) -> PyResult<()> { +#[pyfunction] +#[pyo3(signature = (array, f, *, compress=true))] +pub fn write_path( + array: &Bound<'_, PyArray>, + f: &Bound<'_, PyString>, + compress: bool, +) -> PyResult<()> { async fn run(array: &Array, fname: &str) -> PyResult<()> { let file = File::create(Path::new(fname)).await?; let mut writer = LayoutWriter::new(file); @@ -222,7 +231,12 @@ pub fn write_path(array: &Bound<'_, PyArray>, f: &Bound<'_, PyString>) -> PyResu } let fname = f.to_str()?; // TODO(dk): support file objects - let array = array.borrow().unwrap().clone(); + let mut array = array.borrow().unwrap().clone(); + + if compress { + let compressor = SamplingCompressor::default(); + array = compressor.compress(&array, None)?.into_array(); + } TOKIO_RUNTIME.block_on(run(&array, fname)) } diff --git a/pyvortex/src/scalar.rs b/pyvortex/src/scalar.rs index abee1bf5dc..dbf9a5ed1e 100644 --- a/pyvortex/src/scalar.rs +++ b/pyvortex/src/scalar.rs @@ -134,7 +134,7 @@ impl PyBufferString { #[pymethods] impl PyBufferString { - /// Copy this buffer string from array memory into a Python str. + /// Copy this buffer string from array memory into a :class:`str`. #[pyo3(signature = (*, recursive = false))] #[allow(unused_variables)] // we want the same Python name across all methods pub fn into_python(self_: PyRef, recursive: bool) -> PyResult { @@ -178,7 +178,7 @@ impl PyVortexList { #[pymethods] impl PyVortexList { - /// Copy the elements of this list from array memory into a list of Python objects. + /// Copy the elements of this list from array memory into a :class:`list`. 
#[pyo3(signature = (*, recursive = false))] pub fn into_python(self_: PyRef, recursive: bool) -> PyResult { to_python_list(self_.py(), &self_.inner, &self_.dtype, recursive) @@ -236,7 +236,7 @@ impl PyVortexStruct { #[pymethods] impl PyVortexStruct { #[pyo3(signature = (*, recursive = false))] - /// Copy the elements of this list from array memory into a list of Python objects. + /// Copy the elements of this list from array memory into a :class:`dict`. pub fn into_python(self_: PyRef, recursive: bool) -> PyResult { to_python_dict(self_.py(), &self_.inner, &self_.dtype, recursive) } diff --git a/requirements-dev.lock b/requirements-dev.lock index 5a9bc870cc..6df3b41447 100644 --- a/requirements-dev.lock +++ b/requirements-dev.lock @@ -72,7 +72,6 @@ numpy==1.26.4 # via xarray packaging==24.0 # via matplotlib - # via pydata-sphinx-theme # via pytest # via sphinx # via xarray @@ -103,7 +102,7 @@ py-cpuinfo==9.0.0 # via pytest-benchmark pyarrow==17.0.0 # via vortex-array -pydata-sphinx-theme==0.15.4 +pydata-sphinx-theme==0.16.0 pygments==2.17.2 # via accessible-pygments # via ipython @@ -133,6 +132,8 @@ soupsieve==2.6 # via beautifulsoup4 sphinx==8.0.2 # via pydata-sphinx-theme + # via sphinx-design +sphinx-design==0.6.1 sphinxcontrib-applehelp==2.0.0 # via sphinx sphinxcontrib-devhelp==2.0.0 diff --git a/requirements.lock b/requirements.lock index 88d7d0993d..d8458d2455 100644 --- a/requirements.lock +++ b/requirements.lock @@ -55,7 +55,6 @@ numpy==2.1.2 # via xarray packaging==24.1 # via matplotlib - # via pydata-sphinx-theme # via sphinx # via xarray pandas==2.2.3 @@ -70,7 +69,7 @@ protobuf==5.28.2 # via substrait pyarrow==17.0.0 # via vortex-array -pydata-sphinx-theme==0.15.4 +pydata-sphinx-theme==0.16.0 pygments==2.18.0 # via accessible-pygments # via pydata-sphinx-theme @@ -93,6 +92,8 @@ soupsieve==2.6 # via beautifulsoup4 sphinx==8.1.3 # via pydata-sphinx-theme + # via sphinx-design +sphinx-design==0.6.1 sphinxcontrib-applehelp==2.0.0 # via sphinx 
sphinxcontrib-devhelp==2.0.0 diff --git a/uv.lock b/uv.lock index 519b3c0a95..0e5b0edd34 100644 --- a/uv.lock +++ b/uv.lock @@ -242,13 +242,15 @@ dependencies = [ { name = "pydata-sphinx-theme" }, { name = "pyvortex" }, { name = "sphinx" }, + { name = "sphinx-design" }, ] [package.metadata] requires-dist = [ - { name = "pydata-sphinx-theme", specifier = ">=0.15.4" }, + { name = "pydata-sphinx-theme", specifier = ">=0.16.0" }, { name = "pyvortex" }, { name = "sphinx", specifier = ">=8.0.2" }, + { name = "sphinx-design", specifier = ">=0.6.1" }, ] [[package]] @@ -1189,6 +1191,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/26/60/1ddff83a56d33aaf6f10ec8ce84b4c007d9368b21008876fceda7e7381ef/sphinx-8.1.3-py3-none-any.whl", hash = "sha256:09719015511837b76bf6e03e42eb7595ac8c2e41eeb9c29c5b755c6b677992a2", size = 3487125 }, ] +[[package]] +name = "sphinx-design" +version = "0.6.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "sphinx" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/2b/69/b34e0cb5336f09c6866d53b4a19d76c227cdec1bbc7ac4de63ca7d58c9c7/sphinx_design-0.6.1.tar.gz", hash = "sha256:b44eea3719386d04d765c1a8257caca2b3e6f8421d7b3a5e742c0fd45f84e632", size = 2193689 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c6/43/65c0acbd8cc6f50195a3a1fc195c404988b15c67090e73c7a41a9f57d6bd/sphinx_design-0.6.1-py3-none-any.whl", hash = "sha256:b11f37db1a802a183d61b159d9a202314d4d2fe29c163437001324fe2f19549c", size = 2215338 }, +] + [[package]] name = "sphinxcontrib-applehelp" version = "2.0.0"