diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index da76916e..8af0f336 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -49,7 +49,7 @@ jobs: - ec2-macOS - ${{ needs.start-runner.outputs.label }} runs-on: ${{ matrix.os }} - timeout-minutes: 30 + timeout-minutes: 45 steps: - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 @@ -147,7 +147,7 @@ jobs: matrix: python-version: ["3.10", "3.11"] runs-on: ubuntu-22.04 - timeout-minutes: 15 + timeout-minutes: 45 steps: - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 @@ -188,7 +188,7 @@ jobs: - ec2-macOS - ${{ needs.start-runner.outputs.label }} runs-on: ${{ matrix.os }} - timeout-minutes: 15 + timeout-minutes: 45 steps: - name: Create dir for wheels run: | diff --git a/CHANGELOG.md b/CHANGELOG.md index 40d8d97b..31c992ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +## [0.6.0] - 2023-11-10 + +### Added +- A `with_metadata` flag for the filesystem-based connectors that attaches the source file metadata to the table entries. +- Methods `pw.debug.table_from_list_of_batches` and `pw.debug.table_from_list_of_batches_by_workers` for creating tables whose data is inserted over time in predefined batches. + +### Changed +- **BREAKING**: `pw.debug.table_from_pandas` and `pw.debug.table_from_markdown` now create tables in streaming mode, instead of static mode, if the given table definition contains a `_time` column. +- **BREAKING**: Renamed the `keep_queries` parameter of `pw.io.http.rest_connector` to `delete_queries`, with the opposite meaning. This changes the default behavior: it was `keep_queries=False`, and it is now `delete_queries=False`.
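A minimal sketch of the new streaming behavior described above, assuming a toy `value` column and arbitrary even `_time` stamps; only the `_time`/`_worker`/`_diff` handling comes from this release, the rest is illustrative:

```python
import pathway as pw

# With a `_time` column, table_from_markdown now builds a streaming table:
# rows enter the computation at the given timestamps instead of all at once.
# Timestamps are expected to be positive and even; optional `_worker` and
# `_diff` columns assign rows to workers and mark insertions (1) or deletions (-1).
table = pw.debug.table_from_markdown(
    '''
    value | _time
    1     | 2
    2     | 2
    3     | 4
    '''
)
pw.debug.compute_and_print(table)
```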
+ ## [0.5.3] - 2023-10-27 ### Added diff --git a/Cargo.lock b/Cargo.lock index 01d8627c..d792224d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -36,9 +36,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.7.6" +version = "0.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" +checksum = "5a824f2aa7e75a0c98c5a504fceb80649e9c35265d44525b5f94de4771a395cd" dependencies = [ "getrandom", "once_cell", @@ -325,9 +325,9 @@ checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" [[package]] name = "cpufeatures" -version = "0.2.10" +version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fbc60abd742b35f2492f808e1abbb83d45f72db402e14c55057edc9c7b1e9e4" +checksum = "ce420fe07aecd3e67c5f910618fe65e94158f6dcc0adf44e00d69ce2bdfe0fd0" dependencies = [ "libc", ] @@ -664,9 +664,9 @@ dependencies = [ [[package]] name = "futures" -version = "0.3.28" +version = "0.3.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23342abe12aba583913b2e62f22225ff9c950774065e4bfb61a19cd9770fec40" +checksum = "da0290714b38af9b4a7b094b8a37086d1b4e61f2df9122c3cad2577669145335" dependencies = [ "futures-channel", "futures-core", @@ -679,9 +679,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.28" +version = "0.3.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" +checksum = "ff4dd66668b557604244583e3e1e1eada8c5c2e96a6d0d6653ede395b78bbacb" dependencies = [ "futures-core", "futures-sink", @@ -689,15 +689,15 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.28" +version = "0.3.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" +checksum = "eb1d22c66e66d9d72e1758f0bd7d4fd0bee04cad842ee34587d68c07e45d088c" [[package]] name = "futures-executor" -version = "0.3.28" +version = "0.3.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccecee823288125bd88b4d7f565c9e58e41858e47ab72e8ea2d64e93624386e0" +checksum = "0f4fb8693db0cf099eadcca0efe2a5a22e4550f98ed16aba6c48700da29597bc" dependencies = [ "futures-core", "futures-task", @@ -706,15 +706,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.28" +version = "0.3.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" +checksum = "8bf34a163b5c4c52d0478a4d757da8fb65cabef42ba90515efee0f6f9fa45aaa" [[package]] name = "futures-macro" -version = "0.3.28" +version = "0.3.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" +checksum = "53b153fd91e4b0147f4aced87be237c98248656bb01050b96bf3ee89220a8ddb" dependencies = [ "proc-macro2", "quote", @@ -723,21 +723,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.28" +version = "0.3.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f43be4fe21a13b9781a69afa4985b0f6ee0e1afab2c6f454a8cf30e2b2237b6e" +checksum = "e36d3378ee38c2a36ad710c5d30c2911d752cb941c00c72dbabfb786a7970817" [[package]] name = "futures-task" -version = "0.3.28" +version = "0.3.29" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" +checksum = "efd193069b0ddadc69c46389b740bbccdd97203899b48d09c5f7969591d6bae2" [[package]] name = "futures-util" -version = "0.3.28" +version = "0.3.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26b01e40b772d54cf6c6d721c1d1abd0647a0106a12ecaa1c186273392a69533" +checksum = "a19526d624e703a3179b3d322efec918b6246ea0fa51d41124525f00f1cc8104" dependencies = [ "futures-channel", "futures-core", @@ -1022,9 +1022,9 @@ checksum = "e1be380c410bf0595e94992a648ea89db4dd3f3354ba54af206fd2a68cf5ac8e" [[package]] name = "ipnet" -version = "2.8.0" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" +checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" [[package]] name = "itertools" @@ -1194,9 +1194,9 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.8" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" +checksum = "3dce281c5e46beae905d4de1870d8b1509a9142b62eedf18b443b011ca8343d0" dependencies = [ "libc", "wasi", @@ -1359,9 +1359,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-src" -version = "300.1.5+3.1.3" +version = "300.1.6+3.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "559068e4c12950d7dcaa1857a61725c0d38d4fc03ff8e070ab31a75d6e316491" +checksum = "439fac53e092cd7442a3660c85dde4643ab3b5bd39040912388dcdabf6b88085" dependencies = [ "cc", ] @@ -1434,7 +1434,7 @@ dependencies = [ [[package]] name = "pathway" -version = "0.5.3" +version = "0.6.0" dependencies = [ "arc-swap", "arcstr", @@ -1640,9 +1640,9 @@ dependencies = [ [[package]] name = "prometheus-client" -version = "0.21.2" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c99afa9a01501019ac3a14d71d9f94050346f55ca471ce90c799a15c58f61e2" +checksum = "510c4f1c9d81d556458f94c98f857748130ea9737bbd6053da497503b26ea63c" dependencies = [ "dtoa", "itoa", @@ -1844,15 +1844,6 @@ dependencies = [ "bitflags 1.3.2", ] -[[package]] -name = "redox_syscall" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "redox_syscall" version = "0.4.1" @@ -2004,9 +1995,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.20" +version = "0.38.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67ce50cb2e16c2903e30d1cbccfd8387a74b9d4c938b6a4c5ec6cc7556f7a8a0" +checksum = "2b426b0506e5d50a7d8dafcf2e81471400deb602392c7dd110815afb4eaf02a3" dependencies = [ "bitflags 2.4.1", "errno", @@ -2082,18 +2073,18 @@ checksum = "cd0b0ec5f1c1ca621c432a25813d8d60c88abe6d3e08a3eb9cf37d97a0fe3d73" [[package]] name = "serde" -version = "1.0.189" +version = "1.0.190" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e422a44e74ad4001bdc8eede9a4570ab52f71190e9c076d14369f38b9200537" +checksum = "91d3c334ca1ee894a2c6f6ad698fe8c435b76d504b13d436f0685d648d6d96f7" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.189" +version = "1.0.190" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e48d1f918009ce3145511378cf68d613e3b3d9137d67272562080d68a2b32d5" +checksum = "67c5609f394e5c2bd7fc51efda478004ea80ef42fee983d5c67a65e34f32c0e3" dependencies = [ "proc-macro2", "quote", @@ -2312,13 +2303,13 @@ checksum = "14c39fd04924ca3a864207c66fc2cd7d22d7c016007f9ce846cbb9326331930a" [[package]] name = "tempfile" -version = "3.8.0" +version = "3.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb94d2f3cc536af71caac6b6fcebf65860b347e7ce0cc9ebe8f70d3e521054ef" +checksum = "7ef1adac450ad7f4b3c28589471ade84f25f731a7a0fe30d71dfa9f60fd808e5" dependencies = [ "cfg-if", "fastrand", - "redox_syscall 0.3.5", + "redox_syscall 0.4.1", "rustix", "windows-sys", ] @@ -2487,9 +2478,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.9" +version = "0.7.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d68074620f57a0b21594d9735eb2e98ab38b17f80d3fcb189fca266771ca60d" +checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" dependencies = [ "bytes", "futures-core", @@ -2501,9 +2492,9 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "0.6.3" +version = "0.6.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7cda73e2f1397b1262d6dfdcef8aafae14d1de7748d66822d3bfeeb6d03e5e4b" +checksum = "3550f4e9685620ac18a50ed434eb3aec30db8ba93b0287467bca5826ea25baf1" [[package]] name = "toml_edit" diff --git a/Cargo.toml b/Cargo.toml index bc1a4c96..613dc7c4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pathway" -version = "0.5.3" +version = "0.6.0" edition = "2021" publish = false rust-version = "1.72.0" @@ -13,7 +13,7 @@ crate-type = ["cdylib", "lib"] [dev-dependencies] assert_matches = "1.5.0" eyre = "0.6.8" -tempfile = "3.8.0" +tempfile = "3.8.1" [dependencies] arc-swap = "1.6.0" @@ -30,7 +30,7 @@ csv = "1.3.0" derivative = "2.2.0" differential-dataflow = { path = "./external/differential-dataflow" } elasticsearch = "8.5.0-alpha.1" -futures = "0.3.28" +futures = "0.3.29" glob = "0.3.1" hyper = { version = "0.14", features = ["server"] } id-arena = "2.2.1" @@ -38,14 +38,14 @@ itertools = "0.11.0" jemallocator = { version = "0.5.4", features = ["stats", "disable_initial_exec_tls"] } log = { version = "0.4.20", features = ["std"] } ndarray = { version = "0.15.6", features = ["serde"] } -nix = { version = "0.27.1", features = ["fs"] } +nix = { version = "0.27.1", features = ["fs", "user"] } num-integer = "0.1.45" numpy = "0.19.0" once_cell = "1.18.0" ordered-float = { version = "4.1.1", features = ["serde"] } pipe = "0.4.0" postgres = { version = "0.19.7", features = ["with-chrono-0_4", "with-serde_json-1"] } -prometheus-client = "0.21.2" +prometheus-client = "0.22.0" pyo3 = { version = "0.19.2", features = ["abi3-py310", "multiple-pymethods"] } pyo3-asyncio = "0.19.0" pyo3-log = "0.9.0" @@ -54,7 +54,7 @@ rdkafka = { version = "0.34.0", features = ["ssl-vendored", "cmake-build", "zstd rust-s3 = { version = "0.33.0", features = ["sync-native-tls-vendored", "sync-native-tls", "fail-on-err"], default-features = false } scopeguard = "1.2.0" send_wrapper = "0.6.0" -serde = { version = "1.0.189", features = ["derive", "rc"] } +serde = { version = "1.0.190", features = ["derive", "rc"] } serde_json = "1.0" serde_with = "3.4.0" smallvec = { version = "1.11.1", features = ["union", "const_generics"] } diff --git a/integration_tests/kafka/test_backfilling.py 
b/integration_tests/kafka/test_backfilling.py index f82bb3f2..fa4dabeb 100644 --- a/integration_tests/kafka/test_backfilling.py +++ b/integration_tests/kafka/test_backfilling.py @@ -138,9 +138,9 @@ def run_backfilling_program( def test_backfilling_fs_storage( tmp_path: pathlib.Path, kafka_context: KafkaTestContext ): - fs_persistence_config = pw.io.PersistenceConfig.single_backend( - pw.io.PersistentStorageBackend.filesystem(tmp_path / "PStorage"), - refresh_duration_ms=5000, + fs_persistence_config = pw.persistence.Config.simple_config( + pw.persistence.Backend.filesystem(tmp_path / "PStorage"), + snapshot_interval_ms=5000, ) run_backfilling_program(fs_persistence_config, tmp_path, kafka_context) @@ -154,11 +154,11 @@ def test_backfilling_s3_storage( time.time() ) ) - s3_persistence_config = pw.io.PersistenceConfig.single_backend( - pw.io.PersistentStorageBackend.s3( + s3_persistence_config = pw.persistence.Config.simple_config( + pw.persistence.Backend.s3( root_path=pstorage_s3_path, bucket_settings=get_aws_s3_settings(), ), - refresh_duration_ms=2000, + snapshot_interval_ms=2000, ) run_backfilling_program(s3_persistence_config, tmp_path, kafka_context) diff --git a/integration_tests/kafka/test_simple.py b/integration_tests/kafka/test_simple.py index 865ca3b7..f798ab97 100644 --- a/integration_tests/kafka/test_simple.py +++ b/integration_tests/kafka/test_simple.py @@ -213,8 +213,8 @@ def test_kafka_recovery(tmp_path: pathlib.Path, kafka_context: KafkaTestContext) ), 10, kwargs={ - "persistence_config": pw.io.PersistenceConfig.single_backend( - pw.io.PersistentStorageBackend.filesystem(persistent_storage_path), + "persistence_config": pw.persistence.Config.simple_config( + pw.persistence.Backend.filesystem(persistent_storage_path), ), }, ) @@ -255,8 +255,8 @@ def test_kafka_recovery(tmp_path: pathlib.Path, kafka_context: KafkaTestContext) 10, target=pw.run, kwargs={ - "persistence_config": pw.io.PersistenceConfig.single_backend( - pw.io.PersistentStorageBackend.filesystem(persistent_storage_path), + "persistence_config": pw.persistence.Config.simple_config( + pw.persistence.Backend.filesystem(persistent_storage_path), ), }, ) diff --git a/integration_tests/s3/test_s3_interops.py b/integration_tests/s3/test_s3_interops.py index b7ba74e4..aca27ca7 100644 --- a/integration_tests/s3/test_s3_interops.py +++ b/integration_tests/s3/test_s3_interops.py @@ -147,8 +147,8 @@ def test_s3_backfilling(tmp_path: pathlib.Path): pw.io.csv.write(table, str(tmp_path / "output.csv")) pw.run( monitoring_level=MonitoringLevel.NONE, - persistence_config=pw.io.PersistenceConfig.single_backend( - pw.io.PersistentStorageBackend.filesystem(pathway_persistent_storage), + persistence_config=pw.persistence.Config.simple_config( + pw.persistence.Backend.filesystem(pathway_persistent_storage), ), ) G.clear() @@ -166,8 +166,8 @@ def test_s3_backfilling(tmp_path: pathlib.Path): pw.io.csv.write(table, str(tmp_path / "output_backfilled.csv")) pw.run( monitoring_level=MonitoringLevel.NONE, - persistence_config=pw.io.PersistenceConfig.single_backend( - pw.io.PersistentStorageBackend.filesystem(pathway_persistent_storage), + persistence_config=pw.persistence.Config.simple_config( + pw.persistence.Backend.filesystem(pathway_persistent_storage), ), ) G.clear() @@ -189,8 +189,8 @@ def test_s3_backfilling(tmp_path: pathlib.Path): pw.io.csv.write(table, str(output_path)) pw.run( monitoring_level=MonitoringLevel.NONE, - persistence_config=pw.io.PersistenceConfig.single_backend( - 
pw.io.PersistentStorageBackend.filesystem(pathway_persistent_storage), + persistence_config=pw.persistence.Config.simple_config( + pw.persistence.Backend.filesystem(pathway_persistent_storage), ), ) @@ -231,8 +231,8 @@ class InputSchema(pw.Schema): pw.io.jsonlines.write(table, str(output_path)) pw.run( monitoring_level=MonitoringLevel.NONE, - persistence_config=pw.io.PersistenceConfig.single_backend( - pw.io.PersistentStorageBackend.s3( + persistence_config=pw.persistence.Config.simple_config( + pw.persistence.Backend.s3( root_path=pstorage_s3_path, bucket_settings=get_aws_s3_settings(), ), diff --git a/integration_tests/webserver/test_rest_connector.py b/integration_tests/webserver/test_rest_connector.py index a9ae63f9..61ced2f5 100644 --- a/integration_tests/webserver/test_rest_connector.py +++ b/integration_tests/webserver/test_rest_connector.py @@ -151,7 +151,7 @@ def target(): ).raise_for_status() queries, response_writer = pw.io.http.rest_connector( - host="127.0.0.1", port=port, schema=InputSchema, keep_queries=True + host="127.0.0.1", port=port, schema=InputSchema, delete_queries=False ) response_writer(queries.select(query_id=queries.id, result=pw.this.v)) diff --git a/integration_tests/wordcount/pw_wordcount.py b/integration_tests/wordcount/pw_wordcount.py index a4f66f12..4eb503a9 100755 --- a/integration_tests/wordcount/pw_wordcount.py +++ b/integration_tests/wordcount/pw_wordcount.py @@ -19,9 +19,9 @@ os.environ["PATHWAY_THREADS"] = str(args.n_cpus) if args.pstorage_type == "fs": - pstorage_config = pw.io.PersistenceConfig.single_backend( - pw.io.PersistentStorageBackend.filesystem(path=args.pstorage), - refresh_duration_ms=5000, + pstorage_config = pw.persistence.Config.simple_config( + pw.persistence.Backend.filesystem(path=args.pstorage), + snapshot_interval_ms=5000, ) elif args.pstorage_type == "s3": aws_s3_settings = pw.io.s3.AwsS3Settings( @@ -30,12 +30,12 @@ secret_access_key=os.environ["AWS_S3_SECRET_ACCESS_KEY"], region="eu-central-1", ) - pstorage_config = pw.io.PersistenceConfig.single_backend( - pw.io.PersistentStorageBackend.s3( + pstorage_config = pw.persistence.Config.simple_config( + pw.persistence.Backend.s3( root_path=args.pstorage, bucket_settings=aws_s3_settings, ), - refresh_duration_ms=5000, + snapshot_interval_ms=5000, ) else: raise ValueError( diff --git a/python/pathway/__init__.py b/python/pathway/__init__.py index 11155c35..0827002b 100644 --- a/python/pathway/__init__.py +++ b/python/pathway/__init__.py @@ -236,5 +236,4 @@ def __getattr__(name: str): Table.interpolate = statistical.interpolate Table.windowby = temporal.windowby -Table.sort = indexing.sort Table.diff = ordered.diff diff --git a/python/pathway/debug/__init__.py b/python/pathway/debug/__init__.py index ad2d154a..a1f7cb85 100644 --- a/python/pathway/debug/__init__.py +++ b/python/pathway/debug/__init__.py @@ -4,11 +4,15 @@ import functools import io +import itertools import re +from collections.abc import Iterable from os import PathLike +from warnings import warn import pandas as pd +from pathway import persistence from pathway.internals import Json, api, parse_graph from pathway.internals.datasource import DataSourceOptions, PandasDataSource from pathway.internals.decorators import table_from_datasource @@ -18,6 +22,8 @@ from pathway.internals.schema import Schema, schema_from_pandas from pathway.internals.table import Table from pathway.internals.trace import trace_user_frame +from pathway.io._utils import read_schema +from pathway.io.python import ConnectorSubject, read 
@runtime_type_check @@ -116,7 +122,7 @@ def table_to_pandas(table: Table): @runtime_type_check @trace_user_frame -def table_from_pandas( +def _table_from_pandas( df: pd.DataFrame, id_from: list[str] | None = None, unsafe_trusted_ids: bool = False, @@ -199,5 +205,259 @@ def table_to_parquet(table: Table, filename: str | PathLike): return df.to_parquet(filename) -# XXX: clean this up -table_from_markdown = parse_to_table +class _EmptyConnectorSubject(ConnectorSubject): + def run(self): + pass + + +class StreamGenerator: + _persistent_id = itertools.count() + events: dict[tuple[str, int], list[api.SnapshotEvent]] = {} + + def _get_next_persistent_id(self) -> str: + return str(next(self._persistent_id)) + + def _advance_time_for_all_workers( + self, persistent_id: str, workers: Iterable[int], timestamp: int + ): + for worker in workers: + self.events[(persistent_id, worker)].append( + api.SnapshotEvent.advance_time(timestamp) + ) + + def _table_from_dict( + self, + batches: dict[int, dict[int, list[tuple[int, api.Pointer, list[api.Value]]]]], + schema: type[Schema], + ) -> Table: + """ + A function that creates a table from a mapping of timestamps to batches. Each batch + is a mapping from worker id to list of rows processed in this batch by this worker, + and each row is tuple (diff, key, values). + + Note: unless you need to specify timestamps and keys, consider using + `table_from_list_of_batches` and `table_from_list_of_batches_by_workers`. + + Args: + batches: dictionary with specified batches to be put in the table + schema: schema of the table + """ + persistent_id = self._get_next_persistent_id() + workers = set([worker for batch in batches.values() for worker in batch]) + for worker in workers: + self.events[(persistent_id, worker)] = [] + + timestamps = set(batches.keys()) + + if any(timestamp for timestamp in timestamps if timestamp < 0): + raise ValueError("negative timestamp cannot be used") + elif any(timestamp for timestamp in timestamps if timestamp == 0): + warn( + "rows with timestamp 0 are only backfilled and are not processed by output connectors" + ) + + if any(timestamp for timestamp in timestamps if timestamp % 2 == 1): + warn("timestamps are required to be even; all timestamps will be doubled") + batches = {2 * timestamp: batches[timestamp] for timestamp in batches} + + for timestamp in sorted(batches): + self._advance_time_for_all_workers(persistent_id, workers, timestamp) + batch = batches[timestamp] + for worker, changes in batch.items(): + for diff, key, values in changes: + if diff == 1: + event = api.SnapshotEvent.insert(key, values) + self.events[(persistent_id, worker)] += [event] * diff + elif diff == -1: + event = api.SnapshotEvent.delete(key, values) + self.events[(persistent_id, worker)] += [event] * (-diff) + else: + raise ValueError("only diffs of 1 and -1 are supported") + + return read( + _EmptyConnectorSubject(), persistent_id=persistent_id, schema=schema + ) + + def table_from_list_of_batches_by_workers( + self, + batches: list[dict[int, list[dict[str, api.Value]]]], + schema: type[Schema], + ) -> Table: + key = itertools.count() + schema, api_schema = read_schema(schema=schema) + value_fields: list[api.ValueField] = api_schema["value_fields"] + + def next_key() -> api.Pointer: + api_key = api.ref_scalar(next(key)) + return api_key + + def add_diffs_and_keys(list_of_values: list[dict[str, api.Value]]): + return [ + (1, next_key(), [values[field.name] for field in value_fields]) + for values in list_of_values + ] + + formatted_batches: dict[ + int, 
dict[int, list[tuple[int, api.Pointer, list[api.Value]]]] + ] = {} + timestamp = itertools.count(2, 2) + + for batch in batches: + changes = {worker: add_diffs_and_keys(batch[worker]) for worker in batch} + formatted_batches[next(timestamp)] = changes + + return self._table_from_dict(formatted_batches, schema) + + def table_from_list_of_batches( + self, + batches: list[list[dict[str, api.Value]]], + schema: type[Schema], + ) -> Table: + batches_by_worker = [{0: batch} for batch in batches] + return self.table_from_list_of_batches_by_workers(batches_by_worker, schema) + + def table_from_pandas( + self, + df: pd.DataFrame, + id_from: list[str] | None = None, + unsafe_trusted_ids: bool = False, + schema: type[Schema] | None = None, + ) -> Table: + if schema is None: + schema = schema_from_pandas( + df, exclude_columns=["_time", "_diff", "_worker"] + ) + schema, api_schema = read_schema(schema=schema) + value_fields: list[api.ValueField] = api_schema["value_fields"] + + if "_time" not in df: + df["_time"] = [2] * len(df) + if "_worker" not in df: + df["_worker"] = [0] * len(df) + if "_diff" not in df: + df["_diff"] = [1] * len(df) + + persistent_id = self._get_next_persistent_id() + workers = set(df["_worker"]) + for worker in workers: + self.events[(persistent_id, worker)] = [] + + batches: dict[ + int, dict[int, list[tuple[int, api.Pointer, list[api.Value]]]] + ] = {} + + ids = api.ids_from_pandas( + df, api.ConnectorProperties(unsafe_trusted_ids=unsafe_trusted_ids), id_from + ) + + for row_index in range(len(df)): + row = df.iloc[row_index] + time = row["_time"] + key = ids[df.index[row_index]] + worker = row["_worker"] + + if time not in batches: + batches[time] = {} + + if worker not in batches[time]: + batches[time][worker] = [] + + values = [] + for value_field in value_fields: + column = value_field.name + value = api.denumpify(row[column]) + values.append(value) + diff = row["_diff"] + + batches[time][worker].append((diff, key, values)) + + return self._table_from_dict(batches, schema) + + def table_from_markdown( + self, + table: str, + id_from: list[str] | None = None, + unsafe_trusted_ids: bool = False, + schema: type[Schema] | None = None, + ) -> Table: + df = _markdown_to_pandas(table) + return self.table_from_pandas(df, id_from, unsafe_trusted_ids, schema) + + def persistence_config(self) -> persistence.Config | None: + if len(self.events) == 0: + return None + return persistence.Config.simple_config( + persistence.Backend.mock(self.events), + snapshot_access=api.SnapshotAccess.REPLAY, + replay_mode=api.ReplayMode.SPEEDRUN, + ) + + +stream_generator = StreamGenerator() + + +def table_from_list_of_batches_by_workers( + batches: list[dict[int, list[dict[str, api.Value]]]], + schema: type[Schema], +) -> Table: + """ + A function that creates a table from a list of batches, where each batch is a mapping + from worker id to a list of rows processed by this worker in this batch. + Each row is a mapping from column name to a value. + + Args: + batches: list of batches to be put in the table + schema: schema of the table + """ + return stream_generator.table_from_list_of_batches_by_workers(batches, schema) + + +def table_from_list_of_batches( + batches: list[list[dict[str, api.Value]]], + schema: type[Schema], +) -> Table: + """ + A function that creates a table from a list of batches, where each batch is a list of + rows in this batch. Each row is a mapping from column name to a value. 
+ + Args: + batches: list of batches to be put in the table + schema: schema of the table + """ + return stream_generator.table_from_list_of_batches(batches, schema) + + +def table_from_pandas( + df: pd.DataFrame, + id_from: list[str] | None = None, + unsafe_trusted_ids: bool = False, + schema: type[Schema] | None = None, +): + """ + A function for creating a table from a pandas DataFrame. If the DataFrame + contains a column ``_time``, rows will be split into batches with timestamps from ``_time`` column. + Then ``_worker`` column will be interpreted as the id of a worker which will process the row and + ``_diff`` column as an event type with ``1`` treated as inserting row and ``-1`` as removing. + """ + if "_time" in df: + return stream_generator.table_from_pandas( + df, id_from, unsafe_trusted_ids, schema + ) + else: + return _table_from_pandas(df, id_from, unsafe_trusted_ids, schema) + + +def table_from_markdown( + table_def: str, + id_from: list[str] | None = None, + unsafe_trusted_ids: bool = False, + schema: type[Schema] | None = None, +) -> Table: + """ + A function for creating a table from its definition in markdown. If it + contains a column ``_time``, rows will be split into batches with timestamps from ``_time`` column. + Then ``_worker`` column will be interpreted as the id of a worker which will process the row and + ``_diff`` column as an event type - with ``1`` treated as inserting row and ``-1`` as removing. + """ + df = _markdown_to_pandas(table_def) + return table_from_pandas(df, id_from, unsafe_trusted_ids, schema) diff --git a/python/pathway/engine.pyi b/python/pathway/engine.pyi index feb21813..f414166a 100644 --- a/python/pathway/engine.pyi +++ b/python/pathway/engine.pyi @@ -45,6 +45,10 @@ class ReadMethod(Enum): BY_LINE: ReadMethod FULL: ReadMethod +class DebeziumDBType(Enum): + POSTGRES: DebeziumDBType + MONGO_DB: DebeziumDBType + class Universe: pass @@ -586,6 +590,7 @@ class AwsS3Settings: def __init__(self, *args, **kwargs): ... class ValueField: + name: str def __init__(self, *args, **kwargs): ... def set_default(self, *args, **kwargs): ... @@ -611,3 +616,16 @@ class SnapshotAccess(Enum): RECORD: SnapshotAccess REPLAY: SnapshotAccess FULL: SnapshotAccess + +class SnapshotEvent: + @staticmethod + def insert(key: Pointer, values: list[Value]) -> SnapshotEvent: ... + @staticmethod + def delete(key: Pointer, values: list[Value]) -> SnapshotEvent: ... + @staticmethod + def advance_time(timestamp: int) -> SnapshotEvent: ... + FINISHED: SnapshotEvent + +class LocalBinarySnapshotWriter: + def __init__(self, path: str, persistent_id: str, worker_id: int): ... + def write(self, events: list[SnapshotEvent]): ... diff --git a/python/pathway/internals/_io_helpers.py b/python/pathway/internals/_io_helpers.py index 1d3a7f95..6672839b 100644 --- a/python/pathway/internals/_io_helpers.py +++ b/python/pathway/internals/_io_helpers.py @@ -14,6 +14,19 @@ class AwsS3Settings: + """Stores Amazon S3 connection settings. You may also use this class to store + configuration settings for any custom S3 installation, however you will need to + specify the region and the endpoint. + + Args: + bucket_name: Name of S3 bucket. + access_key: Access key for the bucket. + secret_access_key: Secret access key for the bucket. + with_path_style: Whether to use path-style requests. + region: Region of the bucket. + endpoint: Custom endpoint in case of self-hosted storage. 
+ """ + @trace_user_frame def __init__( self, @@ -25,16 +38,6 @@ def __init__( region=None, endpoint=None, ): - """Constructs Amazon S3 connection settings. - - Args: - bucket_name: Name of S3 bucket. - access_key: Access key for the bucket. - secret_access_key: Secret access key for the bucket. - with_path_style: Whether to use path-style requests for the bucket. - region: Region of the bucket. - endpoint: Custom endpoint in case of self-hosted storage. - """ self.settings = api.AwsS3Settings( bucket_name, access_key, @@ -46,6 +49,20 @@ def __init__( @classmethod def new_from_path(cls, s3_path: str): + """ + Constructs settings from S3 path. The engine will look for the credentials in + environment variables and in local AWS profiles. It will also automatically + detect the region of the bucket. + + This method may fail if there are no credentials or they are incorrect. It may + also fail if the bucket does not exist. + + Args: + s3_path: full path to the object in the form ``s3:///``. + + Returns: + Configuration object. + """ starts_with_prefix = s3_path.startswith(S3_PATH_PREFIX) has_extra_chars = len(s3_path) > len(S3_PATH_PREFIX) if not starts_with_prefix or not has_extra_chars: diff --git a/python/pathway/internals/api.py b/python/pathway/internals/api.py index f91191a2..582716d8 100644 --- a/python/pathway/internals/api.py +++ b/python/pathway/internals/api.py @@ -42,12 +42,7 @@ def __call__(self, state: S | None, rows: list[tuple[list[Value], int]]) -> S: ... -def static_table_from_pandas( - scope, - df: pd.DataFrame, - connector_properties: ConnectorProperties | None = None, - id_from: list[str] | None = None, -) -> Table: +def denumpify(x): def denumpify_inner(x): if pd.api.types.is_scalar(x) and pd.isna(x): return None @@ -55,20 +50,34 @@ def denumpify_inner(x): return x.item() return x - def denumpify(x): - v = denumpify_inner(x) - if isinstance(v, str): - return v.encode("utf-8", "ignore").decode("utf-8") - else: - return v + v = denumpify_inner(x) + if isinstance(v, str): + return v.encode("utf-8", "ignore").decode("utf-8") + else: + return v + +def ids_from_pandas( + df: pd.DataFrame, + connector_properties: ConnectorProperties | None, + id_from: list[str] | None, +) -> dict[Any, Pointer]: if id_from is None: if connector_properties is not None and connector_properties.unsafe_trusted_ids: - ids = {k: unsafe_make_pointer(k) for k in df.index} + return {k: unsafe_make_pointer(k) for k in df.index} else: - ids = {k: ref_scalar(k) for k in df.index} + return {k: ref_scalar(k) for k in df.index} else: - ids = {k: ref_scalar(*args) for (k, *args) in df[id_from].itertuples()} + return {k: ref_scalar(*args) for (k, *args) in df[id_from].itertuples()} + + +def static_table_from_pandas( + scope, + df: pd.DataFrame, + connector_properties: ConnectorProperties | None = None, + id_from: list[str] | None = None, +) -> Table: + ids = ids_from_pandas(df, connector_properties, id_from) all_data: list[tuple[Pointer, list[Value]]] = [(key, []) for key in ids.values()] diff --git a/python/pathway/internals/environ.py b/python/pathway/internals/environ.py index 02c5c276..52095f96 100644 --- a/python/pathway/internals/environ.py +++ b/python/pathway/internals/environ.py @@ -3,7 +3,10 @@ import os from pathway.internals import api -from pathway.internals.persistence import PersistenceConfig, PersistentStorageBackend +from pathway.persistence import ( + Backend as PersistentStorageBackend, + Config as PersistenceConfig, +) ignore_asserts = os.environ.get("PATHWAY_IGNORE_ASSERTS", "false").lower() 
in ( "1", @@ -44,7 +47,7 @@ def get_replay_config(): continue_after_replay = bool(os.environ.get("PATHWAY_CONTINUE_AFTER_REPLAY")) data_storage = PersistentStorageBackend.filesystem(replay_storage) - persistence_config = PersistenceConfig.single_backend( + persistence_config = PersistenceConfig.simple_config( data_storage, replay_mode=replay_mode, snapshot_access=snapshot_access, diff --git a/python/pathway/internals/graph_runner/__init__.py b/python/pathway/internals/graph_runner/__init__.py index ee3814c7..b05596d2 100644 --- a/python/pathway/internals/graph_runner/__init__.py +++ b/python/pathway/internals/graph_runner/__init__.py @@ -16,7 +16,7 @@ from pathway.internals.helpers import StableSet from pathway.internals.monitoring import MonitoringLevel, monitor_stats from pathway.internals.operator import ContextualizedIntermediateOperator, Operator -from pathway.internals.persistence import PersistenceConfig +from pathway.persistence import Config as PersistenceConfig class GraphRunner: @@ -37,6 +37,8 @@ def __init__( default_logging: bool = True, persistence_config: PersistenceConfig | None = None, ) -> None: + from pathway.debug import stream_generator + self._graph = input_graph self.debug = debug if ignore_asserts is None: @@ -45,7 +47,11 @@ def __init__( self.monitoring_level = monitoring_level self.with_http_server = with_http_server self.default_logging = default_logging - self.persistence_config = persistence_config or environ.get_replay_config() + self.persistence_config = ( + persistence_config + or environ.get_replay_config() + or stream_generator.persistence_config() + ) def run_tables( self, diff --git a/python/pathway/internals/graph_runner/expression_evaluator.py b/python/pathway/internals/graph_runner/expression_evaluator.py index c69c2e87..d8ef92be 100644 --- a/python/pathway/internals/graph_runner/expression_evaluator.py +++ b/python/pathway/internals/graph_runner/expression_evaluator.py @@ -203,6 +203,8 @@ def run( ) -> api.Table: [input_storage] = input_storages engine_input_table = self.state.get_table(input_storage) + if output_storage.has_only_references: + return engine_input_table expressions = [] eval_state = RowwiseEvalState() diff --git a/python/pathway/internals/graph_runner/path_evaluator.py b/python/pathway/internals/graph_runner/path_evaluator.py index ae5886e3..e72cbb2e 100644 --- a/python/pathway/internals/graph_runner/path_evaluator.py +++ b/python/pathway/internals/graph_runner/path_evaluator.py @@ -108,12 +108,37 @@ class AddNewColumnsPathEvaluator( clmn.GradualBroadcastContext, ], ): + def compute_if_all_new_are_references( + self, + output_columns: Iterable[clmn.Column], + input_storage: Storage, + ) -> Storage | None: + paths = {} + for column in output_columns: + if input_storage.has_column(column): + paths[column] = input_storage.get_path(column) + elif ( + isinstance(column, clmn.ColumnWithReference) + and input_storage.has_column(column.expression._column) + and input_storage.get_path(column.expression._column) != ColumnPath.KEY + ): + paths[column] = input_storage.get_path(column.expression._column) + else: + return None + return Storage(self.context.universe, paths, has_only_references=True) + def compute( self, output_columns: Iterable[clmn.Column], input_storages: dict[Universe, Storage], ) -> Storage: input_storage = input_storages.get(self.context.universe) + if input_storage is not None and isinstance(self.context, clmn.RowwiseContext): + maybe_storage = self.compute_if_all_new_are_references( + output_columns, input_storage + ) + if 
maybe_storage is not None: + return maybe_storage paths = {} counter = itertools.count(start=1) for column in output_columns: diff --git a/python/pathway/internals/graph_runner/path_storage.py b/python/pathway/internals/graph_runner/path_storage.py index 4152531d..f52c92c7 100644 --- a/python/pathway/internals/graph_runner/path_storage.py +++ b/python/pathway/internals/graph_runner/path_storage.py @@ -19,6 +19,7 @@ class Storage: _column_paths: dict[Column, ColumnPath] flattened_inputs: list[Storage] | None = None flattened_output: Storage | None = None + has_only_references: bool = False def get_columns(self) -> Iterable[Column]: return self._column_paths.keys() diff --git a/python/pathway/internals/persistence.py b/python/pathway/internals/persistence.py deleted file mode 100644 index 3e1a800e..00000000 --- a/python/pathway/internals/persistence.py +++ /dev/null @@ -1,100 +0,0 @@ -import os -from dataclasses import KW_ONLY, dataclass - -from pathway.internals import api -from pathway.internals._io_helpers import AwsS3Settings - - -class PersistentStorageBackend: - """ - This class works as a part of a high-level persistence config. User specifies - the persistent storage parameters using one classmethod-marked methods. - - In order to configure persistence in Pathway, you will need two settings of this - kind: one for stream storage and one for snapshot storage. - """ - - def __init__( - self, - engine_data_storage: api.DataStorage, - fs_path: str | os.PathLike[str] | None = None, - ): - self._engine_data_storage = engine_data_storage - self._fs_path = fs_path - - @classmethod - def filesystem(cls, path: str | os.PathLike[str]): - return cls( - api.DataStorage( - storage_type="fs", - path=os.fspath(path), - ), - fs_path=path, - ) - - @classmethod - def s3(cls, root_path: str, bucket_settings: AwsS3Settings): - return cls( - api.DataStorage( - storage_type="s3", - aws_s3_settings=bucket_settings.settings, - path=root_path, - ), - ) - - @property - def engine_data_storage(self): - return self._engine_data_storage - - def store_path_in_env_variable(self): - if self._fs_path: - os.environ["PATHWAY_PERSISTENT_STORAGE"] = os.fspath(self._fs_path) - - -@dataclass(frozen=True) -class PersistenceConfig: - """ - This class aggregates the metadata and stream storage settings. This is the entry - point for persistence configuration and should be used as ``persistence_config`` - parameter in ``pw.run(...)`` command. 
- """ - - _: KW_ONLY - refresh_duration_ms: int = 0 - metadata_storage: PersistentStorageBackend - snapshot_storage: PersistentStorageBackend - snapshot_access: api.SnapshotAccess - replay_mode: api.ReplayMode - continue_after_replay: bool - - @classmethod - def single_backend( - cls, - backend: PersistentStorageBackend, - refresh_duration_ms=0, - snapshot_access=api.SnapshotAccess.FULL, - replay_mode=api.ReplayMode.PERSISTING, - continue_after_replay=True, - ): - return cls( - refresh_duration_ms=refresh_duration_ms, - metadata_storage=backend, - snapshot_storage=backend, - snapshot_access=snapshot_access, - replay_mode=replay_mode, - continue_after_replay=continue_after_replay, - ) - - @property - def engine_config(self): - return api.PersistenceConfig( - refresh_duration_ms=self.refresh_duration_ms, - metadata_storage=self.metadata_storage.engine_data_storage, - stream_storage=self.snapshot_storage.engine_data_storage, - snapshot_access=self.snapshot_access, - replay_mode=self.replay_mode, - continue_after_replay=self.continue_after_replay, - ) - - def on_before_run(self): - self.snapshot_storage.store_path_in_env_variable() diff --git a/python/pathway/internals/run.py b/python/pathway/internals/run.py index 9a041d00..e3b0b763 100644 --- a/python/pathway/internals/run.py +++ b/python/pathway/internals/run.py @@ -4,8 +4,8 @@ from pathway.internals import parse_graph from pathway.internals.graph_runner import GraphRunner from pathway.internals.monitoring import MonitoringLevel -from pathway.internals.persistence import PersistenceConfig from pathway.internals.runtime_type_check import runtime_type_check +from pathway.persistence import Config as PersistenceConfig @runtime_type_check @@ -28,6 +28,8 @@ def run( more in a `tutorial `_ . default_logging: whether to allow pathway to set its own logging handler. Set it to False if you want to set your own logging handler. + persistence_config: the config for persisting the state in case this + persistence is required. """ GraphRunner( parse_graph.G, diff --git a/python/pathway/internals/schema.py b/python/pathway/internals/schema.py index d0e255b3..998b817a 100644 --- a/python/pathway/internals/schema.py +++ b/python/pathway/internals/schema.py @@ -78,6 +78,7 @@ def schema_from_pandas( *, id_from: list[str] | None = None, name: str | None = None, + exclude_columns: list[str] = [], ) -> type[Schema]: if name is None: name = "schema_from_pandas(" + str(dframe.columns) + ")" @@ -86,6 +87,7 @@ def schema_from_pandas( columns: dict[str, ColumnDefinition] = { name: column_definition(dtype=_type_converter(dframe[name])) for name in dframe.columns + if name not in exclude_columns } for name in id_from: columns[name] = dataclasses.replace(columns[name], primary_key=True) diff --git a/python/pathway/internals/table.py b/python/pathway/internals/table.py index 8d20cc17..79a18a3a 100644 --- a/python/pathway/internals/table.py +++ b/python/pathway/internals/table.py @@ -75,7 +75,6 @@ class Table( """ if TYPE_CHECKING: - from pathway.stdlib.indexing import sort # type: ignore[misc] from pathway.stdlib.ordered import diff # type: ignore[misc] from pathway.stdlib.statistical import interpolate # type: ignore[misc] from pathway.stdlib.temporal import ( # type: ignore[misc] @@ -1890,11 +1889,61 @@ def _flatten( @desugar @contextualized_operator @runtime_type_check - def _sort_experimental( + def sort( self, key: expr.ColumnExpression, instance: expr.ColumnExpression | None = None, ) -> Table: + """ + Sorts a table by the specified keys. 
+ + Args: + table : pw.Table + The table to be sorted. + key : ColumnReference + An expression to sort by. + instance : ColumnReference or None + An expression with instance. Rows are sorted within an instance. + ``prev`` and ``next`` columns will only point to rows that have the same instance. + + Returns: + pw.Table: The sorted table. Contains two columns: ``prev`` and ``next``, containing the pointers + to the previous and next rows. + + Example: + + >>> import pathway as pw + >>> table = pw.debug.table_from_markdown(''' + ... name | age | score + ... Alice | 25 | 80 + ... Bob | 20 | 90 + ... Charlie | 30 | 80 + ... ''') + >>> table = table.with_id_from(pw.this.name) + >>> table += table.sort(key=pw.this.age) + >>> pw.debug.compute_and_print(table, include_id=True) + | name | age | score | prev | next + ^GBSDEEW... | Alice | 25 | 80 | ^EDPSSB1... | ^DS9AT95... + ^EDPSSB1... | Bob | 20 | 90 | | ^GBSDEEW... + ^DS9AT95... | Charlie | 30 | 80 | ^GBSDEEW... | + >>> table = pw.debug.table_from_markdown(''' + ... name | age | score + ... Alice | 25 | 80 + ... Bob | 20 | 90 + ... Charlie | 30 | 80 + ... David | 35 | 90 + ... Eve | 15 | 80 + ... ''') + >>> table = table.with_id_from(pw.this.name) + >>> table += table.sort(key=pw.this.age, instance=pw.this.score) + >>> pw.debug.compute_and_print(table, include_id=True) + | name | age | score | prev | next + ^GBSDEEW... | Alice | 25 | 80 | ^T0B95XH... | ^DS9AT95... + ^EDPSSB1... | Bob | 20 | 90 | | ^RT0AZWX... + ^DS9AT95... | Charlie | 30 | 80 | ^GBSDEEW... | + ^RT0AZWX... | David | 35 | 90 | ^EDPSSB1... | + ^T0B95XH... | Eve | 15 | 80 | | ^GBSDEEW... + """ if not isinstance(instance, expr.ColumnExpression): instance = expr.ColumnConstExpression(instance) prev_column = clmn.MaterializedColumn( diff --git a/python/pathway/internals/table_subscription.py b/python/pathway/internals/table_subscription.py index fd481aef..4059b83f 100644 --- a/python/pathway/internals/table_subscription.py +++ b/python/pathway/internals/table_subscription.py @@ -1,16 +1,54 @@ from __future__ import annotations -from collections.abc import Callable from typing import Any, Protocol from pathway.internals import datasink from pathway.internals.api import Pointer +class OnFinishCallback(Protocol): + """ + The callback function to be called when the stream of changes ends. It will be called \ + on each engine worker separately. + """ + + def __call__(self) -> Any: + """ + The callable part of the callback. It will be called without arguments and its + return result won't be used by the engine. + """ + ... + + class OnChangeCallback(Protocol): + """ + The callback to be called on every change in the table. It is required to be + callable and to accept four parameters: the key, the row changed, the time of the + change in milliseconds and the flag stating if the change had been an addition + of the row. + """ + def __call__( self, key: Pointer, row: dict[str, Any], time: int, is_addition: bool ) -> Any: + """ + The callable part of the callback. + + Args: + key: the key of the changed row; + row: the changed row as a dict mapping from the field name to the value; + time: the time of the modification, also can be referred as minibatch ID of \ +the change; + is_addition: boolean value, equals to true if the row is inserted into the \ +table, false otherwise. 
Please note that update is basically two operations: the \ +deletion of the old value and the insertion of a new value, which happen within a single \ +transaction; + + Returns: + None + + The return result of this method will be ignored by the engine. + """ ... @@ -19,7 +57,7 @@ def subscribe( *, skip_persisted_batch: bool, on_change: OnChangeCallback, - on_end: Callable[[], Any] = lambda: None, + on_end: OnFinishCallback = lambda: None, ): """ Calls a callback function on_change on every change happening in table. This method @@ -33,8 +71,8 @@ def subscribe( outputting things twice is required from persistence). However, it can be overridden, which is required by some parts of internal functionality. on_change: the callback function to be called on every change in the table. The - function is required to accept three parameters: the row changed, the time - of the change in microseconds and the flag stating if the change had been an + function is required to accept four parameters: the key, the row changed, the time + of the change in milliseconds and the flag stating if the change had been an addition of the row. These parameters of the callback are expected to have names row, time and is_addition respectively. on_end: the callback function to be called when the stream of changes ends. diff --git a/python/pathway/io/__init__.py b/python/pathway/io/__init__.py index db4187fa..dd6d84dd 100644 --- a/python/pathway/io/__init__.py +++ b/python/pathway/io/__init__.py @@ -1,7 +1,5 @@ # Copyright © 2023 Pathway -from pathway.internals._io_helpers import AwsS3Settings -from pathway.internals.persistence import PersistenceConfig, PersistentStorageBackend from pathway.io import ( csv, debezium, @@ -20,11 +18,10 @@ s3, s3_csv, ) -from pathway.io._subscribe import OnChangeCallback, subscribe +from pathway.io._subscribe import OnChangeCallback, OnFinishCallback, subscribe from pathway.io._utils import CsvParserSettings __all__ = [ - "AwsS3Settings", "csv", "CsvParserSettings", "debezium", @@ -36,12 +33,11 @@ "logstash", "minio", "null", - "PersistenceConfig", - "PersistentStorageBackend", "plaintext", "postgres", "python", "OnChangeCallback", + "OnFinishCallback", "redpanda", "subscribe", "s3", diff --git a/python/pathway/io/_subscribe.py b/python/pathway/io/_subscribe.py index c9fef667..ae86f9e4 100644 --- a/python/pathway/io/_subscribe.py +++ b/python/pathway/io/_subscribe.py @@ -2,29 +2,27 @@ from __future__ import annotations -from collections.abc import Callable -from typing import Any - from pathway.internals.table_subscription import ( OnChangeCallback, + OnFinishCallback, subscribe as internal_subscribe, ) def subscribe( - table, on_change: OnChangeCallback, on_end: Callable[[], Any] = lambda: None + table, on_change: OnChangeCallback, on_end: OnFinishCallback = lambda: None ): """ Calls a callback function on_change on every change happening in table. Args: table: the table to subscribe. - on_change: the callback function to be called on every change in the table. The + on_change: the callback to be called on every change in the table. The function is required to accept three parameters: the row changed, the time of the change in microseconds and the flag stating if the change had been an addition of the row. These parameters of the callback are expected to have names row, time and is_addition respectively. - on_end: the callback function to be called when the stream of changes ends. + on_end: the callback to be called when the stream of changes ends. 
It will be called on each engine worker separately. Returns: None diff --git a/python/pathway/io/_utils.py b/python/pathway/io/_utils.py index fbcbda37..46e31ce5 100644 --- a/python/pathway/io/_utils.py +++ b/python/pathway/io/_utils.py @@ -14,6 +14,8 @@ STREAMING_MODE_NAME = "streaming" SNAPSHOT_MODE_NAME = "streaming_with_deletions" +METADATA_COLUMN_NAME = "_metadata" + _INPUT_MODES_MAPPING = { STATIC_MODE_NAME: ConnectorMode.STATIC, STREAMING_MODE_NAME: ConnectorMode.SIMPLE_STREAMING, @@ -57,6 +59,10 @@ class RawDataSchema(pw.Schema): data: Any +class MetadataSchema(Schema): + _metadata: dict + + def get_data_format_type(format: str, supported_formats: set[str]): if format not in _DATA_FORMAT_MAPPING or format not in supported_formats: raise ValueError(f"data format `{format}` not supported") @@ -98,7 +104,19 @@ def internal_read_method(format: str) -> ReadMethod: class CsvParserSettings: - """Class representing settings for the CSV parser.""" + """ + Class representing settings for the CSV parser. + + Args: + delimiter: Field delimiter to use when parsing CSV. + quote: Quote character to use when parsing CSV. + escape: What character to use for escaping fields in CSV. + enable_double_quote_escapes: Enable escapes of double quotes. + enable_quoting: Enable quoting for the fields. + comment_character: If specified, the lines starting with the comment \ +character will be treated as comments and therefore, will be ignored by \ +parser + """ def __init__( self, @@ -109,18 +127,6 @@ def __init__( enable_quoting=True, comment_character=None, ): - """Constructs the CSV parser settings. - - Args: - delimiter: Field delimiter to use when parsing CSV. - quote: Quote character to use when parsing CSV. - escape: What character to use for escaping fields in CSV. - enable_double_quote_escapes: Enable escapes of double quotes. - enable_quoting: Enable quoting for the fields. 
- comment_character: If specified, the lines starting with the comment - character will be treated as comments and therefore, will be ignored by - parser - """ self.api_settings = api.CsvParserSettings( delimiter, quote, @@ -199,10 +205,10 @@ def _read_schema( def read_schema( *, schema: type[Schema] | None, - value_columns: list[str] | None, - primary_key: list[str] | None, - types: dict[str, api.PathwayType] | None, - default_values: dict[str, Any] | None, + value_columns: list[str] | None = None, + primary_key: list[str] | None = None, + types: dict[str, api.PathwayType] | None = None, + default_values: dict[str, Any] | None = None, ) -> tuple[type[Schema], dict[str, Any]]: schema = _read_schema( schema=schema, @@ -225,6 +231,7 @@ def construct_schema_and_data_format( format: str, *, schema: type[Schema] | None = None, + with_metadata: bool = False, csv_settings: CsvParserSettings | None = None, json_field_paths: dict[str, str] | None = None, value_columns: list[str] | None = None, @@ -248,12 +255,31 @@ def construct_schema_and_data_format( if param in kwargs and kwargs[param] is not None: raise ValueError(f"Unexpected argument for plaintext format: {param}") - return RawDataSchema, api.DataFormat( + schema = RawDataSchema + if with_metadata: + schema |= MetadataSchema + schema, api_schema = read_schema( + schema=schema, + value_columns=None, + primary_key=None, + types=None, + default_values=None, + ) + + return schema, api.DataFormat( format_type=data_format_type, - key_field_names=None, - value_fields=[api.ValueField("data", PathwayType.ANY)], + **api_schema, parse_utf8=(format != "binary"), ) + + if with_metadata: + if schema is not None: + schema |= MetadataSchema + elif value_columns is not None: + value_columns.append(METADATA_COLUMN_NAME) + else: + raise ValueError("Neither schema nor value_columns were specified") + schema, api_schema = read_schema( schema=schema, value_columns=value_columns, diff --git a/python/pathway/io/csv/__init__.py b/python/pathway/io/csv/__init__.py index 3da88746..4e2ac58b 100644 --- a/python/pathway/io/csv/__init__.py +++ b/python/pathway/io/csv/__init__.py @@ -23,6 +23,7 @@ def read( csv_settings: CsvParserSettings | None = None, mode: str = "streaming", object_pattern: str = "*", + with_metadata: bool = False, autocommit_duration_ms: int | None = 1500, persistent_id: str | None = None, debug_data=None, @@ -56,6 +57,13 @@ def read( The default value is "streaming". object_pattern: Unix shell style pattern for filtering only certain files in the \ directory. Ignored in case a path to a single file is specified. + with_metadata: When set to true, the connector will add an additional column \ +named ``_metadata`` to the table. This column will be a JSON field that will contain two \ +optional fields - ``created_at`` and ``modified_at``. These fields will have integral \ +UNIX timestamps for the creation and modification time respectively. Additionally, the \ +column will also have an optional field named ``owner`` that will contain the name of \ +the file owner (applicable only for Un). Finally, the column will also contain a field \ +named ``path`` that will show the full path to the file from where a row was filled. types: Dictionary containing the mapping between the columns and the data types (``pw.Type``) of the values of those columns. This parameter is optional, and if not provided the default type is ``pw.Type.ANY``. 
[will be deprecated soon] @@ -158,6 +166,7 @@ def read( format="csv", mode=mode, object_pattern=object_pattern, + with_metadata=with_metadata, csv_settings=csv_settings, autocommit_duration_ms=autocommit_duration_ms, json_field_paths=None, diff --git a/python/pathway/io/debezium/__init__.py b/python/pathway/io/debezium/__init__.py index 15db32aa..a798ff1a 100644 --- a/python/pathway/io/debezium/__init__.py +++ b/python/pathway/io/debezium/__init__.py @@ -4,6 +4,7 @@ from typing import Any +from pathway.engine import DebeziumDBType from pathway.internals import api, datasource from pathway.internals.api import PathwayType from pathway.internals.decorators import table_from_datasource @@ -20,6 +21,7 @@ def read( rdkafka_settings: dict, topic_name: str, *, + db_type: DebeziumDBType = DebeziumDBType.POSTGRES, schema: type[Schema] | None = None, debug_data=None, autocommit_duration_ms: int | None = 1500, @@ -38,6 +40,7 @@ def read( rdkafka_settings: Connection settings in the format of `librdkafka `_. topic_name: Name of topic in Kafka to which the updates are streamed. + db_type: Type of the database from which events are streamed; schema: Schema of the resulting table. debug_data: Static data replacing original one when debug mode is active. autocommit_duration_ms:the maximum time between two commits. Every @@ -135,7 +138,9 @@ def read( data_source_options = datasource.DataSourceOptions( commit_duration_ms=autocommit_duration_ms ) - data_format = api.DataFormat(format_type="debezium", **data_format_definition) + data_format = api.DataFormat( + format_type="debezium", debezium_db_type=db_type, **data_format_definition + ) return table_from_datasource( datasource.GenericDataSource( datastorage=data_storage, diff --git a/python/pathway/io/fs/__init__.py b/python/pathway/io/fs/__init__.py index 6462b3d3..3f73bd9e 100644 --- a/python/pathway/io/fs/__init__.py +++ b/python/pathway/io/fs/__init__.py @@ -36,6 +36,7 @@ def read( csv_settings: CsvParserSettings | None = None, json_field_paths: dict[str, str] | None = None, object_pattern: str = "*", + with_metadata: bool = False, persistent_id: str | None = None, autocommit_duration_ms: int | None = 1500, debug_data: Any = None, @@ -79,6 +80,13 @@ def read( `JSON Pointer (RFC 6901) `_. object_pattern: Unix shell style pattern for filtering only certain files in the \ directory. Ignored in case a path to a single file is specified. + with_metadata: When set to true, the connector will add an additional column \ +named ``_metadata`` to the table. This column will be a JSON field that will contain two \ +optional fields - ``created_at`` and ``modified_at``. These fields will have integral \ +UNIX timestamps for the creation and modification time respectively. Additionally, the \ +column will also have an optional field named ``owner`` that will contain the name of \ +the file owner (applicable only for Un). Finally, the column will also contain a field \ +named ``path`` that will show the full path to the file from where a row was filled. persistent_id: (unstable) An identifier, under which the state of the table will be persisted or ``None``, if there is no need to persist the state of this table. 
When a program restarts, it restores the state for all input tables according to what @@ -218,6 +226,7 @@ def read( mode=internal_connector_mode(mode), object_pattern=object_pattern, persistent_id=persistent_id, + with_metadata=with_metadata, ) else: data_storage = api.DataStorage( @@ -227,11 +236,13 @@ def read( read_method=internal_read_method(format), object_pattern=object_pattern, persistent_id=persistent_id, + with_metadata=with_metadata, ) schema, data_format = construct_schema_and_data_format( format, schema=schema, + with_metadata=with_metadata, csv_settings=csv_settings, json_field_paths=json_field_paths, value_columns=value_columns, @@ -239,6 +250,7 @@ def read( types=types, default_values=default_values, ) + data_source_options = datasource.DataSourceOptions( commit_duration_ms=autocommit_duration_ms ) diff --git a/python/pathway/io/http/_server.py b/python/pathway/io/http/_server.py index 1875137d..bdff90ad 100644 --- a/python/pathway/io/http/_server.py +++ b/python/pathway/io/http/_server.py @@ -18,7 +18,7 @@ class RestServerSubject(io.python.ConnectorSubject): _host: str _port: int _loop: asyncio.AbstractEventLoop - _keep_queries: bool + _delete_queries: bool def __init__( self, @@ -28,7 +28,7 @@ def __init__( loop: asyncio.AbstractEventLoop, tasks: dict[Any, Any], schema: type[pw.Schema], - keep_queries: bool, + delete_queries: bool, format: str = "raw", ) -> None: super().__init__() @@ -38,7 +38,7 @@ def __init__( self._loop = loop self._tasks = tasks self._schema = schema - self._keep_queries = keep_queries + self._delete_queries = delete_queries self._format = format def run(self): @@ -75,7 +75,7 @@ async def handle(self, request: web.Request): self._add(id, data) response = await self._fetch_response(id, event) - if not self._keep_queries: + if self._delete_queries: self._remove(id, data) return web.json_response(status=200, data=response) @@ -98,7 +98,7 @@ def rest_connector( route: str = "/", schema: type[pw.Schema] | None = None, autocommit_duration_ms=1500, - keep_queries: bool = False, + delete_queries: bool = False, ) -> tuple[pw.Table, Callable]: """ Runs a lightweight HTTP server and inputs a collection from the HTTP endpoint, @@ -116,7 +116,8 @@ def rest_connector( autocommit_duration_ms: the maximum time between two commits. Every autocommit_duration_ms milliseconds, the updates received by the connector are committed and pushed into Pathway's computation graph; - keep_queries: whether to keep queries after processing; defaults to False. + delete_queries: whether to send a deletion entry after the query is processed. + Allows to remove it from the system if it is stored by operators such as ``join`` or ``groupby``; Returns: table: the table read; @@ -140,7 +141,7 @@ def rest_connector( loop=loop, tasks=tasks, schema=schema, - keep_queries=keep_queries, + delete_queries=delete_queries, format=format, ), schema=schema, diff --git a/python/pathway/io/jsonlines/__init__.py b/python/pathway/io/jsonlines/__init__.py index fbad97da..799330b8 100644 --- a/python/pathway/io/jsonlines/__init__.py +++ b/python/pathway/io/jsonlines/__init__.py @@ -22,6 +22,7 @@ def read( mode: str = "streaming", json_field_paths: dict[str, str] | None = None, object_pattern: str = "*", + with_metadata: bool = False, autocommit_duration_ms: int | None = 1500, persistent_id: str | None = None, debug_data=None, @@ -54,6 +55,13 @@ def read( `JSON Pointer (RFC 6901) `_. object_pattern: Unix shell style pattern for filtering only certain files in the \ directory. 
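To illustrate the renamed ``delete_queries`` flag, a minimal sketch of wiring up the REST connector; the host, port and request schema are assumptions, and only the signature visible in this diff (the connector returns a ``(table, callable)`` pair) is relied on:

```python
import pathway as pw


class QuerySchema(pw.Schema):
    # hypothetical payload of an incoming HTTP request
    query: str


# The connector returns the table of incoming requests plus a callback used
# to push responses back.  With delete_queries=True each processed request is
# followed by a deletion entry, so stateful operators downstream
# (join, groupby, ...) do not retain it forever.
queries, response_writer = pw.io.http.rest_connector(
    host="127.0.0.1",
    port=8080,
    schema=QuerySchema,
    delete_queries=True,
)
```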
Ignored in case a path to a single file is specified. + with_metadata: When set to true, the connector will add an additional column \ +named ``_metadata`` to the table. This column will be a JSON field that will contain two \ +optional fields - ``created_at`` and ``modified_at``. These fields will have integral \ +UNIX timestamps for the creation and modification time respectively. Additionally, the \ +column will also have an optional field named ``owner`` that will contain the name of \ +the file owner (applicable only for Un). Finally, the column will also contain a field \ +named ``path`` that will show the full path to the file from where a row was filled. autocommit_duration_ms: the maximum time between two commits. Every autocommit_duration_ms milliseconds, the updates received by the connector are committed and pushed into Pathway's computation graph. @@ -168,6 +176,7 @@ def read( autocommit_duration_ms=autocommit_duration_ms, value_columns=value_columns, object_pattern=object_pattern, + with_metadata=with_metadata, primary_key=primary_key, types=types, default_values=default_values, diff --git a/python/pathway/io/minio/__init__.py b/python/pathway/io/minio/__init__.py index e2c6b4ce..ff99fdfe 100644 --- a/python/pathway/io/minio/__init__.py +++ b/python/pathway/io/minio/__init__.py @@ -9,10 +9,23 @@ from pathway.internals.table import Table from pathway.internals.trace import trace_user_frame from pathway.io._utils import CsvParserSettings -from pathway.io.s3 import AwsS3Settings, read as s3_csv_read +from pathway.io.s3 import AwsS3Settings, read as s3_read class MinIOSettings: + """Stores MinIO bucket connection settings. + + Args: + endpoint: Endpoint for the bucket. + bucket_name: Name of a bucket. + access_key: Access key for the bucket. + secret_access_key: Secret access key for the bucket. + region: Region of the bucket. + with_path_style: Whether to use path-style addresses for bucket access. It defaults \ +to True as this is the most widespread way to access MinIO, but can be overridden in case \ +of a custom configuration. + """ + def __init__( self, endpoint, @@ -23,15 +36,6 @@ def __init__( with_path_style=True, region=None, ): - """Constructs MinIO bucket connection settings. - - Args: - endpoint: Endpoint for the bucket. - bucket_name: Name of a bucket. - access_key: Access key for the bucket. - secret_access_key: Secret access key for the bucket. - region: Region of the bucket. - """ self.endpoint = endpoint self.bucket_name = bucket_name self.access_key = access_key @@ -65,7 +69,7 @@ def read( autocommit_duration_ms: int | None = 1500, debug_data: Any = None, ) -> Table: - """Reads a table from one or several objects in CSV format from S3 bucket in MinIO. + """Reads a table from one or several objects from S3 bucket in MinIO. In case the prefix is specified, and there are several objects lying under this prefix, their order is determined according to their modification times: the smaller @@ -122,7 +126,7 @@ def read( ... 
) """ - return s3_csv_read( + return s3_read( path=path, aws_s3_settings=minio_settings.create_aws_settings(), format=format, diff --git a/python/pathway/io/plaintext/__init__.py b/python/pathway/io/plaintext/__init__.py index f0e6b7c1..fd403f76 100644 --- a/python/pathway/io/plaintext/__init__.py +++ b/python/pathway/io/plaintext/__init__.py @@ -17,6 +17,7 @@ def read( *, mode: str = "streaming", object_pattern: str = "*", + with_metadata: bool = False, persistent_id: str | None = None, autocommit_duration_ms: int | None = 1500, debug_data=None, @@ -41,6 +42,13 @@ def read( The default value is "streaming". object_pattern: Unix shell style pattern for filtering only certain files in the \ directory. Ignored in case a path to a single file is specified. + with_metadata: When set to true, the connector will add an additional column \ +named ``_metadata`` to the table. This column will be a JSON field that will contain two \ +optional fields - ``created_at`` and ``modified_at``. These fields will have integral \ +UNIX timestamps for the creation and modification time respectively. Additionally, the \ +column will also have an optional field named ``owner`` that will contain the name of \ +the file owner (applicable only for Un). Finally, the column will also contain a field \ +named ``path`` that will show the full path to the file from where a row was filled. persistent_id: (unstable) An identifier, under which the state of the table \ will be persisted or ``None``, if there is no need to persist the state of this table. \ When a program restarts, it restores the state for all input tables according to what \ @@ -65,6 +73,7 @@ def read( format="plaintext", mode=mode, object_pattern=object_pattern, + with_metadata=with_metadata, persistent_id=persistent_id, autocommit_duration_ms=autocommit_duration_ms, debug_data=debug_data, diff --git a/python/pathway/io/s3/__init__.py b/python/pathway/io/s3/__init__.py index 3272accf..7f4f5487 100644 --- a/python/pathway/io/s3/__init__.py +++ b/python/pathway/io/s3/__init__.py @@ -18,12 +18,17 @@ internal_connector_mode, ) -S3_PATH_PREFIX = "s3://" -S3_DEFAULT_REGION = "us-east-1" -S3_LOCATION_FIELD = "LocationConstraint" - class DigitalOceanS3Settings: + """Stores Digital Ocean S3 connection settings. + + Args: + bucket_name: Name of Digital Ocean S3 bucket. + access_key: Access key for the bucket. + secret_access_key: Secret access key for the bucket. + region: Region of the bucket. + """ + @trace_user_frame def __init__( self, @@ -33,14 +38,6 @@ def __init__( secret_access_key=None, region=None, ): - """Constructs Digital Ocean S3 connection settings. - - Args: - bucket_name: Name of S3 bucket. - access_key: Access key for the bucket. - secret_access_key: Secret access key for the bucket. - region: Region of the bucket. - """ self.settings = api.AwsS3Settings( bucket_name, access_key, @@ -52,6 +49,15 @@ def __init__( class WasabiS3Settings: + """Stores Wasabi S3 connection settings. + + Args: + bucket_name: Name of Wasabi S3 bucket. + access_key: Access key for the bucket. + secret_access_key: Secret access key for the bucket. + region: Region of the bucket. + """ + @trace_user_frame def __init__( self, @@ -61,14 +67,6 @@ def __init__( secret_access_key=None, region=None, ): - """Constructs Wasabi S3 connection settings. - - Args: - bucket_name: Name of S3 bucket. - access_key: Access key for the bucket. - secret_access_key: Secret access key for the bucket. - region: Region of the bucket. 
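Since the MinIO connector now routes through the generic S3 reader and is no longer CSV-only, a sketch under stated assumptions — the endpoint, bucket, credentials, object prefix and the ``format``/``schema`` combination are placeholders inferred from the generic S3 API rather than taken verbatim from this diff:

```python
import pathway as pw


class InputSchema(pw.Schema):
    # hypothetical columns of the stored objects
    owner: str
    pet: str


settings = pw.io.minio.MinIOSettings(
    endpoint="minio.example.com",
    bucket_name="datasets",
    access_key="MINIO_ACCESS_KEY",
    secret_access_key="MINIO_SECRET_KEY",
    # with_path_style defaults to True, the usual addressing scheme for MinIO
)

# Any format accepted by the S3 connector should work here, not just CSV.
table = pw.io.minio.read(
    "animals/",
    minio_settings=settings,
    format="json",
    schema=InputSchema,
)
```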
- """ self.settings = api.AwsS3Settings( bucket_name, access_key, @@ -421,3 +419,14 @@ def read_from_wasabi( ), debug_datasource=datasource.debug_datasource(debug_data), ) + + +# This is made to force AwsS3Settings documentation +__all__ = [ + "AwsS3Settings", + "DigitalOceanS3Settings", + "WasabiS3Settings", + "read", + "read_from_digital_ocean", + "read_from_wasabi", +] diff --git a/python/pathway/persistence/__init__.py b/python/pathway/persistence/__init__.py new file mode 100644 index 00000000..60d8276f --- /dev/null +++ b/python/pathway/persistence/__init__.py @@ -0,0 +1,149 @@ +import os +from dataclasses import KW_ONLY, dataclass + +from pathway.internals import api +from pathway.internals._io_helpers import AwsS3Settings + + +class Backend: + """ + The settings of a backend, which is used to persist the computation state. There + are two kinds of data backends: metadata backend and snapshot backend. Both are + configurable via this class. + """ + + def __init__( + self, + engine_data_storage: api.DataStorage, + fs_path: str | os.PathLike[str] | None = None, + ): + self._engine_data_storage = engine_data_storage + self._fs_path = fs_path + + @classmethod + def filesystem(cls, path: str | os.PathLike[str]): + """ + Configure the filesystem backend. + + Args: + path: the path to the root directory in the file system, which will be used \ +to store the persisted data. + + Returns: + Class instance denoting the filesystem storage backend with root directory \ +at ``path``. + """ + return cls( + api.DataStorage( + storage_type="fs", + path=os.fspath(path), + ), + fs_path=path, + ) + + @classmethod + def s3(cls, root_path: str, bucket_settings: AwsS3Settings): + """ + Configure the S3 backend. + + Args: + root_path: path to the root in the S3 storage, which will be used to \ +store persisted data; + bucket_settings: the settings for S3 bucket connection in the same format \ +as they are used by S3 connectors. + + Returns: + Class instance denoting the S3 storage backend with root directory as + ``root_path`` and connection settings given by ``bucket_settings``. + """ + return cls( + api.DataStorage( + storage_type="s3", + aws_s3_settings=bucket_settings.settings, + path=root_path, + ), + ) + + @classmethod + def mock(cls, events: dict[tuple[str, int], list[api.SnapshotEvent]]): + return cls(api.DataStorage(storage_type="mock", mock_events=events)) + + @property + def engine_data_storage(self): + return self._engine_data_storage + + def store_path_in_env_variable(self): + if self._fs_path: + os.environ["PATHWAY_PERSISTENT_STORAGE"] = os.fspath(self._fs_path) + + +@dataclass(frozen=True) +class Config: + """ + Configure the data persistence. An instance of this class should be passed as a + parameter to pw.run in case persistence is enabled. + + Please note that if you'd like to use the same backend for both metadata and + snapshot storages, you can use the convenience method ``simple_config``. 
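A short sketch of the new ``pw.persistence`` entry points, mirroring how the tests later in this diff replace ``pw.io.PersistenceConfig.single_backend``; the storage path and snapshot interval are arbitrary:

```python
import pathway as pw

# Use the same filesystem backend for both metadata and snapshots,
# refreshing the persisted snapshot at most every 1000 ms.
backend = pw.persistence.Backend.filesystem("./PStorage")
config = pw.persistence.Config.simple_config(
    backend,
    snapshot_interval_ms=1000,
)

# ... define inputs and transformations here ...

# The config is handed to pw.run(), which consumes it when persistence is enabled.
pw.run(persistence_config=config)
```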
+ + Args: + metadata_storage: metadata backend configuration; + snapshot_storage: snapshots backend configuration; + snapshot_interval_ms: the desired duration between snapshot updates in \ +milliseconds; + """ + + _: KW_ONLY + snapshot_interval_ms: int = 0 + metadata_storage: Backend + snapshot_storage: Backend + snapshot_access: api.SnapshotAccess + replay_mode: api.ReplayMode + continue_after_replay: bool + + @classmethod + def simple_config( + cls, + backend: Backend, + snapshot_interval_ms=0, + snapshot_access=api.SnapshotAccess.FULL, + replay_mode=api.ReplayMode.PERSISTING, + continue_after_replay=True, + ): + """ + Construct config from a single instance of the \ +``Backend`` class, using this backend to persist metadata and \ +snapshot. + + Args: + backend: storage backend settings; + snapshot_interval_ms: the desired freshness of the persisted snapshot in \ +milliseconds. The greater the value is, the more the amount of time that the snapshot \ +may fall behind, and the less computational resources are required. + + Returns: + Persistence config. + """ + + return cls( + snapshot_interval_ms=snapshot_interval_ms, + metadata_storage=backend, + snapshot_storage=backend, + snapshot_access=snapshot_access, + replay_mode=replay_mode, + continue_after_replay=continue_after_replay, + ) + + @property + def engine_config(self): + return api.PersistenceConfig( + snapshot_interval_ms=self.snapshot_interval_ms, + metadata_storage=self.metadata_storage.engine_data_storage, + stream_storage=self.snapshot_storage.engine_data_storage, + snapshot_access=self.snapshot_access, + replay_mode=self.replay_mode, + continue_after_replay=self.continue_after_replay, + ) + + def on_before_run(self): + self.snapshot_storage.store_path_in_env_variable() diff --git a/python/pathway/stdlib/indexing/__init__.py b/python/pathway/stdlib/indexing/__init__.py index 17b160a5..f21b3b78 100644 --- a/python/pathway/stdlib/indexing/__init__.py +++ b/python/pathway/stdlib/indexing/__init__.py @@ -12,7 +12,6 @@ filter_smallest_k, prefix_sum_oracle, retrieve_prev_next_values, - sort, sort_from_index, ) @@ -24,6 +23,5 @@ "filter_smallest_k", "prefix_sum_oracle", "retrieve_prev_next_values", - "sort", "sort_from_index", ] diff --git a/python/pathway/stdlib/indexing/sorting.py b/python/pathway/stdlib/indexing/sorting.py index 303e9691..dd9eb57c 100644 --- a/python/pathway/stdlib/indexing/sorting.py +++ b/python/pathway/stdlib/indexing/sorting.py @@ -150,80 +150,6 @@ def sort_from_index( return _treesort(index=index).index # type: ignore -@runtime_type_check -@trace_user_frame -def sort( - table: pw.Table, - key: pw.ColumnReference | None = None, - instance: pw.ColumnReference | None = None, -) -> pw.Table[PrevNext]: - """ - Sorts a table by the specified keys. - - Args: - table : pw.Table - The table to be sorted. - key : ColumnReference or None - The name of the primary key to sort by. If None, the table is sorted - based on the `key` column as primary key. - instance : ColumnReference or None - The name of the secondary key to sort by. If None, the field "instance" is chosen - if it exists, otherwise only the primary key is used. - - Returns: - pw.Table: The sorted table. Contains two columns: ``prev`` and ``next``, containing the pointers - to the previous and next rows. - - Example: - - >>> import pathway as pw - >>> table = pw.debug.table_from_markdown(''' - ... name | age | score - ... Alice | 25 | 80 - ... Bob | 20 | 90 - ... Charlie | 30 | 80 - ... 
''') - >>> table = table.with_id_from(pw.this.name) - >>> table += sort(table, key=pw.this.age) - >>> pw.debug.compute_and_print(table, include_id=False) - name | age | score | next | prev - Alice | 25 | 80 | ^DS9AT95... | ^EDPSSB1... - Bob | 20 | 90 | ^GBSDEEW... | - Charlie | 30 | 80 | | ^GBSDEEW... - """ - if key is None: - key = table.key - else: - if isinstance(key, pw.ColumnReference): - key = table[key] - else: - if isinstance(key, str): - raise ValueError( - f"sorting.sort(): Invalid column reference for the parameter key, found a string. Did you mean this.{key} instead of {repr(key)}?" # noqa: E501 - ) - raise ValueError( - "sorting.sort(): Invalid column reference for the parameter key." # noqa: E501 - ) - if instance is None: - table = table.select(key=key, instance=0) - else: - if isinstance(instance, pw.ColumnReference): - instance = table[instance] - else: - if isinstance(instance, str): - raise ValueError( - f"sorting.sort(): Invalid column reference for the parameter instance. Did you mean this.{instance} instead of {repr(instance)}?" # noqa: E501 - ) - raise ValueError( - "sorting.sort(): Invalid column reference for the parameter instance." # noqa: E501 - ) - table = table.select( - key=key, - instance=instance, - ) - return sort_from_index(**build_sorted_index(table)) - - @pw.transformer class _treesort: class index(pw.ClassArg, input=LeftRight | Parent, output=PrevNext): diff --git a/python/pathway/stdlib/temporal/_window.py b/python/pathway/stdlib/temporal/_window.py index a068ebbf..e3d96717 100644 --- a/python/pathway/stdlib/temporal/_window.py +++ b/python/pathway/stdlib/temporal/_window.py @@ -345,6 +345,14 @@ def _apply( target = target._buffer( target._pw_window_start + behavior.delay, target._pw_key ) + target = target.with_columns( + _pw_key=pw.if_else( + target._pw_key > target._pw_window_start + behavior.delay, + target._pw_key, + target._pw_window_start + behavior.delay, + ) + ) + if behavior.cutoff is not None: cutoff_threshold = pw.this._pw_window_end + behavior.cutoff target = target._freeze(cutoff_threshold, pw.this._pw_key) diff --git a/python/pathway/tests/cli/replay.py b/python/pathway/tests/cli/replay.py index 78e246f7..a36bc3cb 100644 --- a/python/pathway/tests/cli/replay.py +++ b/python/pathway/tests/cli/replay.py @@ -1,26 +1,14 @@ +import pathlib import sys import pathway as pw - - -class CountDifferentTimestampsCallback(pw.io.OnChangeCallback): - times: set[int] - - def __init__(self, expected): - self.times = set() - self.expected = expected - - def __call__(self, key, row, time: int, is_addition): - self.times.add(time) - - def on_end(self): - assert len(self.times) == self.expected +from pathway.tests.utils import CountDifferentTimestampsCallback def run_graph( - expected_count, + expected_count: int | None, rows_to_generate, -): +) -> int: class InputSchema(pw.Schema): number: int @@ -42,12 +30,25 @@ class InputSchema(pw.Schema): pw.io.subscribe(t, callback, callback.on_end) pw.run() + return len(callback.timestamps) def main(): - expected_count = int(sys.argv[1]) + expected_count: int | None = int(sys.argv[1]) rows_to_generate = int(sys.argv[2]) - run_graph(expected_count, rows_to_generate) + + if len(sys.argv) > 3: + timestamp_file = pathlib.Path(sys.argv[3]) + else: + timestamp_file = None + + # When generating rows, we can't be sure that new rows will have distinct timestamps, + # so we don't check their number + if rows_to_generate > 0: + expected_count = None + n_timestamps = run_graph(expected_count, rows_to_generate) + if 
timestamp_file is not None: + timestamp_file.write_text(str(n_timestamps)) if __name__ == "__main__": diff --git a/python/pathway/tests/cli/test_cli.py b/python/pathway/tests/cli/test_cli.py index fa2bd2ef..d1ea1ec4 100644 --- a/python/pathway/tests/cli/test_cli.py +++ b/python/pathway/tests/cli/test_cli.py @@ -1,13 +1,17 @@ import os import pathlib -import pytest from click.testing import CliRunner from pathway import cli -def run_record(path, expected_count, rows_to_generate): +def run_record( + path, timestamp_file: pathlib.Path, expected_count, rows_to_generate +) -> int: + # timestamp_file is used to get back information about number of distinct + # timestamps - as we are not guaranteed that all timestamps are different, + # during replay we compare with the number obtained during data generation. script_path = os.path.join(os.path.dirname(__file__), "replay.py") runner = CliRunner() result = runner.invoke( @@ -20,10 +24,15 @@ def run_record(path, expected_count, rows_to_generate): script_path, str(expected_count), str(rows_to_generate), + str(timestamp_file), ], ) assert result.exit_code == 0 + n_timestamps = int(timestamp_file.read_text()) + + return n_timestamps + def run_replay( path, mode, expected_count, rows_to_generate=0, continue_after_replay=True @@ -47,31 +56,23 @@ def run_replay( assert result.exit_code == 0 -@pytest.mark.xfail(reason="failing non-deterministically") def test_record_replay_through_cli(tmp_path: pathlib.Path): replay_dir = str(tmp_path / "test_replay") + timestamp_file = tmp_path / "timestamp" # First run to persist data in local storage - run_record(replay_dir, 15, 15) + n_timestamps = run_record(replay_dir, timestamp_file, 15, 15) run_replay(replay_dir, "batch", 1) - run_replay(replay_dir, "speedrun", 15) - - # When we don't continue after replay, we should not generate new rows - run_replay(replay_dir, "speedrun", 15, continue_after_replay=False) + run_replay(replay_dir, "speedrun", n_timestamps) # Generate rows during replay + # expected number will be ignored, as we are generating rows run_replay(replay_dir, "speedrun", 30, rows_to_generate=15) # Check that the rows weren't recorded run_replay(replay_dir, "speedrun", 15) # Without replay (and with empty input connector), there are no rows - run_record(replay_dir, 0, 0) - - # Generate rows and record them (but don't replay saved data) - run_record(replay_dir, 15, 15) - - # Check that the rows were recorded - run_replay(replay_dir, "speedrun", 30) + run_record(replay_dir, timestamp_file, 0, 0) diff --git a/python/pathway/tests/ml/test_index.py b/python/pathway/tests/ml/test_index.py index c34229dd..64d478d3 100644 --- a/python/pathway/tests/ml/test_index.py +++ b/python/pathway/tests/ml/test_index.py @@ -1,19 +1,15 @@ -import os +from typing import Any import numpy as np import pandas as pd -import pytest import pathway as pw from pathway.stdlib.ml.index import KNNIndex -from pathway.tests.utils import ( - assert_table_equality_wo_index, - assert_values_in_stream_consistent, -) +from pathway.tests.utils import assert_table_equality_wo_index class PointSchema(pw.Schema): - coords: np.ndarray + coords: Any is_query: bool @@ -37,11 +33,24 @@ def get_points() -> list[tuple[tuple[int, ...], bool]]: return [(point[:-1], point[-1] == 1) for point in points] +def nn_as_table( + to_table: list[tuple[tuple[int, ...], tuple[tuple[int, ...]]]] +) -> pw.Table: + return pw.debug.table_from_pandas( + pd.DataFrame( + { + "coords": [point[0] for point in to_table], + "nn": [point[1] for point in to_table], + } + ) + 
).update_types(nn=list[tuple[int, ...]]) + + def test_all_at_once(): data = get_points() df = pd.DataFrame( { - "coords": [np.array(point[0]) for point in data], + "coords": [point[0] for point in data], "is_query": [point[1] for point in data], } ) @@ -52,38 +61,26 @@ def test_all_at_once(): result = queries + index.get_nearest_items(queries.coords, k=2).with_universe_of( queries ).select(nn=pw.apply(sort_arrays, pw.this.coords)) - expected = [ - ((0, 0), ((-1, 0), (1, 2))), - ((2, -2), ((1, -4), (3, -2))), - ((-1, 1), ((-3, 1), (-1, 0))), - ((-2, -3), ((-1, 0), (1, -4))), - ] - expected_pw = pw.debug.table_from_pandas( - pd.DataFrame( - { - "coords": [np.array(point[0]) for point in expected], - "nn": [point[1] for point in expected], - } - ) + expected = nn_as_table( + [ + ((0, 0), ((-1, 0), (1, 2))), + ((2, -2), ((1, -4), (3, -2))), + ((-1, 1), ((-3, 1), (-1, 0))), + ((-2, -3), ((-1, 0), (1, -4))), + ] ) - assert_table_equality_wo_index(result, expected_pw) + assert_table_equality_wo_index(result, expected) def stream_points() -> tuple[pw.Table, pw.Table]: """Returns (points, queries).""" points = get_points() - value_functions = { - "coords": lambda i: points[i][0], - "is_query": lambda i: points[i][1], - } - - table = pw.demo.generate_custom_stream( - value_functions, - schema=PointSchema, - nb_rows=10, - autocommit_duration_ms=20, - input_rate=10, - ) + + table = pw.debug.table_from_list_of_batches( + [[{"coords": point[0], "is_query": point[1]}] for point in points], + PointSchema, + ).update_types(coords=tuple[int, ...]) + return ( table.filter(~pw.this.is_query).without(pw.this.is_query), table.filter(pw.this.is_query).without(pw.this.is_query), @@ -96,32 +93,29 @@ def test_update_old(): result = queries + index.get_nearest_items(queries.coords, k=2).with_universe_of( queries ).select(nn=pw.apply(sort_arrays, pw.this.coords)) - expected = [ - ((0, 0), ((-1, 0), (1, 2))), - ((2, -2), ((1, -4), (3, -2))), - ((-1, 1), ((-3, 1), (-1, 0))), - ((-2, -3), ((-1, 0), (1, -4))), - ] - assert_values_in_stream_consistent(result, expected) + expected = nn_as_table( + [ + ((0, 0), ((-1, 0), (1, 2))), + ((2, -2), ((1, -4), (3, -2))), + ((-1, 1), ((-3, 1), (-1, 0))), + ((-2, -3), ((-1, 0), (1, -4))), + ] + ) + assert_table_equality_wo_index(result, expected) -@pytest.mark.xfail(reason="data isn't split into batches as expected") def test_asof_now(): - if ( - os.getenv("PATHWAY_THREADS") is not None - and int(os.getenv("PATHWAY_THREADS")) != 1 # type: ignore[arg-type] - ): - pytest.xfail(reason="Order changes when multiple threads are used.") - # FIXME: waits for proper tool for streaming tests points, queries = stream_points() index = KNNIndex(points.coords, points, n_dimensions=2, n_and=5) result = queries + index.get_nearest_items_asof_now(queries.coords, k=2).select( nn=pw.apply(sort_arrays, pw.this.coords) ) - expected = [ - ((0, 0), ((2, 2), (3, -2))), - ((2, -2), ((-1, 0), (3, -2))), - ((-1, 1), ((-1, 0), (1, 2))), - ((-2, -3), ((-3, 1), (-1, 0))), - ] - assert_values_in_stream_consistent(result, expected) + expected = nn_as_table( + [ + ((0, 0), ((2, 2), (3, -2))), + ((2, -2), ((-1, 0), (3, -2))), + ((-1, 1), ((-1, 0), (1, 2))), + ((-2, -3), ((-3, 1), (-1, 0))), + ] + ) + assert_table_equality_wo_index(result, expected) diff --git a/python/pathway/tests/temporal/test_asof_now_joins.py b/python/pathway/tests/temporal/test_asof_now_joins.py index f870175a..215adb43 100644 --- a/python/pathway/tests/temporal/test_asof_now_joins.py +++ b/python/pathway/tests/temporal/test_asof_now_joins.py @@ 
-2,7 +2,8 @@ import pathway as pw from pathway.tests.utils import ( - assert_values_in_stream_consistent, + T, + assert_table_equality_wo_index, generate_custom_stream_with_deletions, ) @@ -60,22 +61,25 @@ def test_update_old(): result = queries.join(data, pw.left.instance == pw.right.instance).select( query=pw.left.value, ans=pw.right.value ) - expected = [ - (1, 9), - (2, 9), - (3, 9), - (4, 9), - (5, 3), - (6, 9), - (7, 3), - (8, 9), - (9, 3), - ] - assert_values_in_stream_consistent(result, expected) + expected = T( + """ + query | ans + 1 | 9 + 2 | 9 + 3 | 9 + 4 | 9 + 5 | 3 + 6 | 9 + 7 | 3 + 8 | 9 + 9 | 3 + """ + ) + assert_table_equality_wo_index(result, expected) @pytest.mark.parametrize("set_id", [True, False]) -def test_asof_now_inner(set_id): +def test_asof_now_inner(set_id: bool): if set_id: id = pw.left.id else: @@ -84,23 +88,26 @@ def test_asof_now_inner(set_id): result = queries.asof_now_join( data, pw.left.instance == pw.right.instance, id=id ).select(query=pw.left.value, ans=pw.right.value) - expected = [ - (2, 4), - (3, 4), - (4, 5), - (5, 2), - (6, 5), - (7, 2), - (8, 5), - (9, 3), - ] + expected = T( + """ + query | ans + 2 | 4 + 3 | 4 + 4 | 5 + 5 | 2 + 6 | 5 + 7 | 2 + 8 | 5 + 9 | 3 + """ + ) if set_id: assert result._universe.is_subset_of(queries._universe) - assert_values_in_stream_consistent(result, expected) + assert_table_equality_wo_index(result, expected) @pytest.mark.parametrize("set_id", [True, False]) -def test_asof_now_left(set_id): +def test_asof_now_left(set_id: bool): if set_id: id = pw.left.id else: @@ -109,17 +116,20 @@ def test_asof_now_left(set_id): result = queries.asof_now_join_left( data, pw.left.instance == pw.right.instance, id=id ).select(query=pw.left.value, ans=pw.right.value) - expected = [ - (1, None), - (2, 4), - (3, 4), - (4, 5), - (5, 2), - (6, 5), - (7, 2), - (8, 5), - (9, 3), - ] + expected = T( + """ + query | ans + 1 | + 2 | 4 + 3 | 4 + 4 | 5 + 5 | 2 + 6 | 5 + 7 | 2 + 8 | 5 + 9 | 3 + """ + ) if set_id: assert result._universe == queries._universe - assert_values_in_stream_consistent(result, expected) + assert_table_equality_wo_index(result, expected) diff --git a/python/pathway/tests/temporal/test_interval_joins_stream.py b/python/pathway/tests/temporal/test_interval_joins_stream.py index 797705ef..ebfc5e02 100644 --- a/python/pathway/tests/temporal/test_interval_joins_stream.py +++ b/python/pathway/tests/temporal/test_interval_joins_stream.py @@ -1,7 +1,7 @@ import pytest import pathway as pw -from pathway.tests.utils import assert_values_in_stream_consistent +from pathway.tests.utils import T, assert_table_equality_wo_index class TimeInputSchema(pw.Schema): @@ -38,31 +38,37 @@ def test_forgetting(keep_results: bool): behavior=pw.temporal.window_behavior(0, 2, keep_results=keep_results), ).select(left_t=pw.left.t, right_t=pw.right.t) if keep_results: - expected = [ - (0, 0), - (1, 1), - (2, 2), - (3, 3), - (3, 3), - (3, 3), - (3, 3), - (4, 4), - (4, 4), - (4, 4), - (4, 4), - ] + expected = T( + """ + left_t | right_t + 0 | 0 + 1 | 1 + 2 | 2 + 3 | 3 + 3 | 3 + 3 | 3 + 3 | 3 + 4 | 4 + 4 | 4 + 4 | 4 + 4 | 4 + """ + ) else: - expected = [ - (3, 3), - (3, 3), - (3, 3), - (3, 3), - (4, 4), - (4, 4), - (4, 4), - (4, 4), - ] - assert_values_in_stream_consistent(result, expected) + expected = T( + """ + left_t | right_t + 3 | 3 + 3 | 3 + 3 | 3 + 3 | 3 + 4 | 4 + 4 | 4 + 4 | 4 + 4 | 4 + """ + ) + assert_table_equality_wo_index(result, expected) class TimeValueInputSchema(pw.Schema): @@ -101,47 +107,53 @@ def test_forgetting_sharded(keep_results: 
bool): behavior=pw.temporal.window_behavior(0, 2, keep_results=keep_results), ).select(v=pw.this.v, left_t=pw.left.t, right_t=pw.right.t) if keep_results: - expected = [ - (0, 0, 0), - (0, 1, 1), - (0, 2, 2), - (0, 3, 3), - (0, 3, 3), - (0, 3, 3), - (0, 3, 3), - (0, 4, 4), - (0, 4, 4), - (0, 4, 4), - (0, 4, 4), - (1, 0, 0), - (1, 1, 1), - (1, 2, 2), - (1, 3, 3), - (1, 3, 3), - (1, 3, 3), - (1, 3, 3), - (1, 4, 4), - (1, 4, 4), - (1, 4, 4), - (1, 4, 4), - ] + expected = T( + """ + v | left_t | right_t + 0 | 0 | 0 + 0 | 1 | 1 + 0 | 2 | 2 + 0 | 3 | 3 + 0 | 3 | 3 + 0 | 3 | 3 + 0 | 3 | 3 + 0 | 4 | 4 + 0 | 4 | 4 + 0 | 4 | 4 + 0 | 4 | 4 + 1 | 0 | 0 + 1 | 1 | 1 + 1 | 2 | 2 + 1 | 3 | 3 + 1 | 3 | 3 + 1 | 3 | 3 + 1 | 3 | 3 + 1 | 4 | 4 + 1 | 4 | 4 + 1 | 4 | 4 + 1 | 4 | 4 + """ + ) else: - expected = [ - (0, 3, 3), - (0, 3, 3), - (0, 3, 3), - (0, 3, 3), - (0, 4, 4), - (0, 4, 4), - (0, 4, 4), - (0, 4, 4), - (1, 3, 3), - (1, 3, 3), - (1, 3, 3), - (1, 3, 3), - (1, 4, 4), - (1, 4, 4), - (1, 4, 4), - (1, 4, 4), - ] - assert_values_in_stream_consistent(result, expected) + expected = T( + """ + v | left_t | right_t + 0 | 3 | 3 + 0 | 3 | 3 + 0 | 3 | 3 + 0 | 3 | 3 + 0 | 4 | 4 + 0 | 4 | 4 + 0 | 4 | 4 + 0 | 4 | 4 + 1 | 3 | 3 + 1 | 3 | 3 + 1 | 3 | 3 + 1 | 3 | 3 + 1 | 4 | 4 + 1 | 4 | 4 + 1 | 4 | 4 + 1 | 4 | 4 + """ + ) + assert_table_equality_wo_index(result, expected) diff --git a/python/pathway/tests/temporal/test_windows_stream.py b/python/pathway/tests/temporal/test_windows_stream.py index f295713b..92fb58d6 100644 --- a/python/pathway/tests/temporal/test_windows_stream.py +++ b/python/pathway/tests/temporal/test_windows_stream.py @@ -170,7 +170,6 @@ def parametrized_test(duration, hop, delay, cutoff, keep_results): autocommit_duration_ms=5, input_rate=25, ) - gb = t.windowby( t.time, window=pw.temporal.sliding(duration=duration, hop=hop), @@ -206,7 +205,7 @@ def parametrized_test(duration, hop, delay, cutoff, keep_results): order = in_entry["value"] max_value = in_entry["value"] max_window_time = in_entry["time"] - max_global_time = max(max_window_time, max_global_time) + max_global_time = max(max(in_entry["time"], window[1] + delay), max_global_time) old_entry_state = simulated_state.get(entry_id) if old_entry_state is not None: diff --git a/python/pathway/tests/test_common.py b/python/pathway/tests/test_common.py index 0b50d44c..7d5aa5b5 100644 --- a/python/pathway/tests/test_common.py +++ b/python/pathway/tests/test_common.py @@ -2091,15 +2091,15 @@ def inc(x: int) -> int: assert_table_equality( result, expected, - persistence_config=pw.io.PersistenceConfig.single_backend( - pw.io.PersistentStorageBackend.filesystem(cache_dir), + persistence_config=pw.persistence.Config.simple_config( + pw.persistence.Backend.filesystem(cache_dir), ), ) assert_table_equality( result, expected, - persistence_config=pw.io.PersistenceConfig.single_backend( - pw.io.PersistentStorageBackend.filesystem(cache_dir), + persistence_config=pw.persistence.Config.simple_config( + pw.persistence.Backend.filesystem(cache_dir), ), ) assert os.path.exists(cache_dir) @@ -2170,15 +2170,15 @@ async def inc(x: int) -> int: assert_table_equality( result, expected, - persistence_config=pw.io.PersistenceConfig.single_backend( - pw.io.PersistentStorageBackend.filesystem(cache_dir), + persistence_config=pw.persistence.Config.simple_config( + pw.persistence.Backend.filesystem(cache_dir), ), ) assert_table_equality( result, expected, - persistence_config=pw.io.PersistenceConfig.single_backend( - pw.io.PersistentStorageBackend.filesystem(cache_dir), + 
persistence_config=pw.persistence.Config.simple_config( + pw.persistence.Backend.filesystem(cache_dir), ), ) assert os.path.exists(cache_dir) diff --git a/python/pathway/tests/test_gist_event_streaming_time_between_occurrences.py b/python/pathway/tests/test_gist_event_streaming_time_between_occurrences.py index ff72c511..e36da4ce 100644 --- a/python/pathway/tests/test_gist_event_streaming_time_between_occurrences.py +++ b/python/pathway/tests/test_gist_event_streaming_time_between_occurrences.py @@ -12,11 +12,7 @@ # DO NOT MODIFY THIS WITHOUT MODIFYING the following file: # public/website3/content/2.developers/6.tutorials/.event_stream_processing_time_between_occurrences/article.py # noqa E501 def get_differences(events): - sorted_events = pw.indexing.sort_from_index( - **pw.indexing.build_sorted_index( - events + events.select(key=events.timestamp, instance=events.topic_id) - ) - ) + sorted_events = events.sort(key=events.timestamp, instance=events.topic_id) events_with_prev = events.having(sorted_events.prev) differences = events_with_prev.select( @@ -40,8 +36,8 @@ def convert_table(table): result_table = result_table.with_id_from(pw.this.key) result_table = result_table.select( - next=pw.this.next_id, prev=pw.this.prev_id, + next=pw.this.next_id, ) return result_table diff --git a/python/pathway/tests/test_io.py b/python/pathway/tests/test_io.py index 8d908f96..face3c23 100644 --- a/python/pathway/tests/test_io.py +++ b/python/pathway/tests/test_io.py @@ -15,9 +15,11 @@ import pytest import pathway as pw +from pathway.engine import ref_scalar from pathway.internals import api from pathway.internals.parse_graph import G from pathway.tests.utils import ( + CountDifferentTimestampsCallback, CsvLinesNumberChecker, T, assert_table_equality, @@ -680,8 +682,8 @@ async def invoke(self, value: int) -> dict[str, Any]: assert_table_equality( result, expected, - persistence_config=pw.io.PersistenceConfig.single_backend( - pw.io.PersistentStorageBackend.filesystem(cache_dir), + persistence_config=pw.persistence.Config.simple_config( + pw.persistence.Backend.filesystem(cache_dir), ), ) @@ -1046,8 +1048,8 @@ def run_computation(py_connector_input, fs_connector_input): ) pw.io.csv.write(table_joined, output_path) pw.run( - persistence_config=pw.io.PersistenceConfig.single_backend( - pw.io.PersistentStorageBackend.filesystem(persistent_storage_path), + persistence_config=pw.persistence.Config.simple_config( + pw.persistence.Backend.filesystem(persistent_storage_path), ) ) @@ -1094,28 +1096,26 @@ def test_no_pstorage(tmp_path: pathlib.Path): match="persistent metadata backend failed: target object should be a directory", ): pw.run( - persistence_config=pw.io.PersistenceConfig.single_backend( - pw.io.PersistentStorageBackend.filesystem(path), + persistence_config=pw.persistence.Config.simple_config( + pw.persistence.Backend.filesystem(path), ) ) -def test_persistent_id_not_assigned(tmp_path: pathlib.Path): +def test_persistent_id_not_assigned_autogenerate(tmp_path: pathlib.Path): input_path = tmp_path / "input.txt" write_lines(input_path, "test_data") pstorage_path = tmp_path / "PStrorage" - table = pw.io.plaintext.read(input_path) + write_lines(input_path, "test_data") + + table = pw.io.plaintext.read(input_path, mode="static") pw.io.csv.write(table, tmp_path / "output.txt") - with pytest.raises( - ValueError, - match="persistent storage is configured, but persistent id is not assigned for FileSystem reader", - ): - pw.run( - persistence_config=pw.io.PersistenceConfig.single_backend( - 
pw.io.PersistentStorageBackend.filesystem(pstorage_path) - ) + pw.run( + persistence_config=pw.persistence.Config.simple_config( + pw.persistence.Backend.filesystem(pstorage_path) ) + ) def test_no_persistent_storage(tmp_path: pathlib.Path): @@ -1151,8 +1151,8 @@ def test_duplicated_persistent_id(tmp_path: pathlib.Path): match="Persistent ID 'one' used more than once", ): pw.run( - persistence_config=pw.io.PersistenceConfig.single_backend( - pw.io.PersistentStorageBackend.filesystem(pstorage_path) + persistence_config=pw.persistence.Config.simple_config( + pw.persistence.Backend.filesystem(pstorage_path) ) ) @@ -1513,8 +1513,8 @@ def test_persistent_subscribe(tmp_path): root.on_change, root.on_end = on_change, on_end pw.io.subscribe(table, on_change=on_change, on_end=on_end) pw.run( - persistence_config=pw.io.PersistenceConfig.single_backend( - pw.io.PersistentStorageBackend.filesystem(pstorage_dir), + persistence_config=pw.persistence.Config.simple_config( + pw.persistence.Backend.filesystem(pstorage_dir), ), ) @@ -1552,8 +1552,8 @@ def test_persistent_subscribe(tmp_path): root = mock.Mock() pw.io.subscribe(table, on_change=root.on_change, on_end=root.on_end) pw.run( - persistence_config=pw.io.PersistenceConfig.single_backend( - pw.io.PersistentStorageBackend.filesystem(pstorage_dir), + persistence_config=pw.persistence.Config.simple_config( + pw.persistence.Backend.filesystem(pstorage_dir), ), ) root.assert_has_calls( @@ -1653,8 +1653,8 @@ def run_graph( pw.io.subscribe(t, callback, callback.on_end) pw.run( - persistence_config=pw.io.PersistenceConfig.single_backend( - pw.io.PersistentStorageBackend.filesystem(replay_dir), + persistence_config=pw.persistence.Config.simple_config( + pw.persistence.Backend.filesystem(replay_dir), replay_mode=replay_mode, continue_after_replay=continue_after_replay, snapshot_access=snapshot_access, @@ -1709,20 +1709,6 @@ def run_graph( run_graph(api.ReplayMode.SPEEDRUN, expected) -class CountDifferentTimestampsCallback(pw.io.OnChangeCallback): - times: set[int] - - def __init__(self, expected): - self.times = set() - self.expected = expected - - def __call__(self, key, row, time: int, is_addition): - self.times.add(time) - - def on_end(self): - assert len(self.times) == self.expected - - def test_replay_timestamps(tmp_path: pathlib.Path): replay_dir = tmp_path / "test_replay_timestamps" @@ -1737,11 +1723,11 @@ class TimeColumnInputSchema(pw.Schema): def run_graph( replay_mode, - expected_count, + expected_count: int | None = None, generate_rows=0, continue_after_replay=True, snapshot_access=api.SnapshotAccess.FULL, - ): + ) -> int: G.clear() t = pw.demo.generate_custom_stream( @@ -1758,16 +1744,22 @@ def run_graph( pw.io.subscribe(t, callback, callback.on_end) pw.run( - persistence_config=pw.io.PersistenceConfig.single_backend( - pw.io.PersistentStorageBackend.filesystem(replay_dir), + persistence_config=pw.persistence.Config.simple_config( + pw.persistence.Backend.filesystem(replay_dir), replay_mode=replay_mode, continue_after_replay=continue_after_replay, snapshot_access=snapshot_access, ) ) + return len(callback.timestamps) + + # Workaround for demo.generate_custom_stream sometimes putting two rows in the same batch: + # when generating rows we count number of different timestamp, and then during replay in Speedrun mode + # we expect the number of different timestamps to be the same as when generating data. 
+ # First run to persist data in local storage - run_graph(api.ReplayMode.PERSISTING, 15, generate_rows=15) + n_timestamps = run_graph(api.ReplayMode.PERSISTING, generate_rows=15) # In Persistency there should not be any data in output connector run_graph(api.ReplayMode.PERSISTING, 0) @@ -1775,5 +1767,386 @@ def run_graph( # In Batch every row should have the same timestamp run_graph(api.ReplayMode.BATCH, 1) - # In Speedrun every row should have different timestamp - run_graph(api.ReplayMode.SPEEDRUN, 15) + # In Speedrun we should have the same number of timestamps as when generating data + run_graph(api.ReplayMode.SPEEDRUN, n_timestamps) + + +def test_metadata_column_identity(tmp_path: pathlib.Path): + inputs_path = tmp_path / "inputs" + os.mkdir(inputs_path) + + input_contents_1 = "abc\n\ndef\nghi" + input_contents_2 = "ttt\nppp\nqqq" + input_contents_3 = "zzz\nyyy\n\nxxx" + write_lines(inputs_path / "input1.txt", input_contents_1) + write_lines(inputs_path / "input2.txt", input_contents_2) + write_lines(inputs_path / "input3.txt", input_contents_3) + + output_path = tmp_path / "output.json" + table = pw.io.fs.read( + inputs_path, + with_metadata=True, + format="plaintext_by_file", + mode="static", + autocommit_duration_ms=1000, + ) + pw.io.jsonlines.write(table, output_path) + pw.run() + + metadata_file_names = [] + with open(output_path, "r") as f: + for line in f.readlines(): + metadata_file_names.append(json.loads(line)["_metadata"]["path"]) + + assert len(metadata_file_names) == 3, metadata_file_names + metadata_file_names.sort() + assert metadata_file_names[0].endswith("input1.txt") + assert metadata_file_names[1].endswith("input2.txt") + assert metadata_file_names[2].endswith("input3.txt") + + +def test_metadata_column_regular_parser(tmp_path: pathlib.Path): + inputs_path = tmp_path / "inputs" + os.mkdir(inputs_path) + + input_contents_1 = json.dumps({"a": 1, "b": 10}) + input_contents_2 = json.dumps({"a": 2, "b": 20}) + write_lines(inputs_path / "input1.txt", input_contents_1) + write_lines(inputs_path / "input2.txt", input_contents_2) + + class InputSchema(pw.Schema): + a: int + b: int + + output_path = tmp_path / "output.json" + table = pw.io.fs.read( + inputs_path, + with_metadata=True, + schema=InputSchema, + format="json", + mode="static", + autocommit_duration_ms=1000, + ) + pw.io.jsonlines.write(table, output_path) + pw.run() + + metadata_file_names = [] + with open(output_path, "r") as f: + for line in f.readlines(): + metadata_file_names.append(json.loads(line)["_metadata"]["path"]) + + assert len(metadata_file_names) == 2, metadata_file_names + metadata_file_names.sort() + assert metadata_file_names[0].endswith("input1.txt") + assert metadata_file_names[1].endswith("input2.txt") + + +def test_mock_snapshot_reader(): + class InputSchema(pw.Schema): + number: int + + events = { + ("1", 0): [ + api.SnapshotEvent.advance_time(2), + api.SnapshotEvent.insert(ref_scalar(0), [1]), + api.SnapshotEvent.insert(ref_scalar(1), [1]), + api.SnapshotEvent.advance_time(4), + api.SnapshotEvent.insert(ref_scalar(2), [4]), + api.SnapshotEvent.delete(ref_scalar(0), [1]), + api.SnapshotEvent.FINISHED, + ] + } + + t = pw.demo.generate_custom_stream( + {}, + schema=InputSchema, + nb_rows=0, + input_rate=15, + autocommit_duration_ms=50, + persistent_id="1", + ) + + on_change = mock.Mock() + pw.io.subscribe(t, on_change=on_change) + + pw.run( + persistence_config=pw.persistence.Config.simple_config( + pw.persistence.Backend.mock(events), + replay_mode=api.ReplayMode.SPEEDRUN, + 
snapshot_access=api.SnapshotAccess.REPLAY, + ) + ) + + on_change.assert_has_calls( + [ + mock.call.on_change( + key=ref_scalar(0), + row={"number": 1}, + time=2, + is_addition=True, + ), + mock.call.on_change( + key=ref_scalar(1), + row={"number": 1}, + time=2, + is_addition=True, + ), + mock.call.on_change( + key=ref_scalar(2), + row={"number": 4}, + time=4, + is_addition=True, + ), + mock.call.on_change( + key=ref_scalar(0), + row={"number": 1}, + time=4, + is_addition=False, + ), + ], + any_order=True, + ) + assert on_change.call_count == 4 + + +def test_stream_generator_from_list(): + class InputSchema(pw.Schema): + number: int + + events = [ + [{"number": 1}, {"number": 2}, {"number": 5}], + [{"number": 4}, {"number": 4}], + ] + + t = pw.debug.table_from_list_of_batches(events, InputSchema) + on_change = mock.Mock() + pw.io.subscribe(t, on_change=on_change) + + pw.run() + + timestamps = set([call.kwargs["time"] for call in on_change.mock_calls]) + assert len(timestamps) == 2 + + on_change.assert_has_calls( + [ + mock.call.on_change( + key=mock.ANY, + row={"number": 1}, + time=min(timestamps), + is_addition=True, + ), + mock.call.on_change( + key=mock.ANY, + row={"number": 2}, + time=min(timestamps), + is_addition=True, + ), + mock.call.on_change( + key=mock.ANY, + row={"number": 5}, + time=min(timestamps), + is_addition=True, + ), + mock.call.on_change( + key=mock.ANY, + row={"number": 4}, + time=max(timestamps), + is_addition=True, + ), + mock.call.on_change( + key=mock.ANY, + row={"number": 4}, + time=max(timestamps), + is_addition=True, + ), + ], + any_order=True, + ) + assert on_change.call_count == 5 + + +def test_stream_generator_from_list_multiple_workers(monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv("PATHWAY_THREADS", "2") + + class InputSchema(pw.Schema): + number: int + + events = [ + {0: [{"number": 1}, {"number": 2}], 1: [{"number": 5}]}, + {0: [{"number": 4}], 1: [{"number": 4}]}, + ] + + t = pw.debug.table_from_list_of_batches_by_workers(events, InputSchema) + on_change = mock.Mock() + pw.io.subscribe(t, on_change=on_change) + + pw.run() + + timestamps = set([call.kwargs["time"] for call in on_change.mock_calls]) + assert len(timestamps) == 2 + + on_change.assert_has_calls( + [ + mock.call.on_change( + key=mock.ANY, + row={"number": 1}, + time=min(timestamps), + is_addition=True, + ), + mock.call.on_change( + key=mock.ANY, + row={"number": 2}, + time=min(timestamps), + is_addition=True, + ), + mock.call.on_change( + key=mock.ANY, + row={"number": 5}, + time=min(timestamps), + is_addition=True, + ), + mock.call.on_change( + key=mock.ANY, + row={"number": 4}, + time=max(timestamps), + is_addition=True, + ), + mock.call.on_change( + key=mock.ANY, + row={"number": 4}, + time=max(timestamps), + is_addition=True, + ), + ], + any_order=True, + ) + assert on_change.call_count == 5 + + +@pytest.mark.filterwarnings("ignore:timestamps are required to be even") +def test_stream_generator_from_markdown(): + t = pw.debug.table_from_markdown( + """ + | colA | colB | _time + 1 | 1 | 2 | 1 + 5 | 2 | 3 | 1 + 10 | 5 | 1 | 2 + """ + ) + on_change = mock.Mock() + pw.io.subscribe(t, on_change=on_change) + + pw.run() + + on_change.assert_has_calls( + [ + mock.call.on_change( + key=api.ref_scalar(1), + row={"colA": 1, "colB": 2}, + time=2, + is_addition=True, + ), + mock.call.on_change( + key=api.ref_scalar(5), + row={"colA": 2, "colB": 3}, + time=2, + is_addition=True, + ), + mock.call.on_change( + key=api.ref_scalar(10), + row={"colA": 5, "colB": 1}, + time=4, + is_addition=True, + 
), + ], + any_order=True, + ) + assert on_change.call_count == 3 + + +def test_stream_generator_from_markdown_with_diffs(): + t = pw.debug.table_from_markdown( + """ + | colA | colB | _time | _diff + 1 | 1 | 2 | 2 | 1 + 5 | 2 | 3 | 2 | 1 + 1 | 1 | 2 | 4 | -1 + 10 | 5 | 1 | 4 | 1 + 3 | 1 | 1 | 4 | 1 + 10 | 5 | 1 | 8 | -1 + """ + ) + + expected = pw.debug.table_from_markdown( + """ + | colA | colB + 5 | 2 | 3 + 3 | 1 | 1 + """ + ) + + assert_table_equality(t, expected) + + +def test_stream_generator_two_tables_multiple_workers(monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv("PATHWAY_THREADS", "4") + + class InputSchema(pw.Schema): + colA: int + colB: int + + t1 = pw.debug.table_from_markdown( + """ + colA | colB | _time | _worker + 1 | 2 | 2 | 0 + 2 | 3 | 2 | 1 + 5 | 1 | 4 | 2 + 3 | 5 | 6 | 3 + 7 | 4 | 8 | 0 + """ + ) + + t2 = pw.debug.stream_generator._table_from_dict( + { + 2: {0: [(1, api.ref_scalar(0), [1, 4])]}, + 4: {2: [(1, api.ref_scalar(1), [3, 7])]}, + 8: {0: [(1, api.ref_scalar(2), [2, 2])]}, + }, + InputSchema, + ) + + t3 = ( + t1.join(t2, t1.colA == t2.colA) + .select(colA=pw.left.colA, left=pw.left.colB, right=pw.right.colB) + .with_columns(sum=pw.this.left + pw.this.right) + ) + + on_change = mock.Mock() + pw.io.subscribe(t3, on_change=on_change) + + pw.run() + + on_change.assert_has_calls( + [ + mock.call.on_change( + key=mock.ANY, + row={"colA": 1, "left": 2, "right": 4, "sum": 6}, + time=2, + is_addition=True, + ), + mock.call.on_change( + key=mock.ANY, + row={"colA": 3, "left": 5, "right": 7, "sum": 12}, + time=6, + is_addition=True, + ), + mock.call.on_change( + key=mock.ANY, + row={"colA": 2, "left": 3, "right": 2, "sum": 5}, + time=8, + is_addition=True, + ), + ], + any_order=True, + ) + assert on_change.call_count == 3 diff --git a/python/pathway/tests/test_sorting.py b/python/pathway/tests/test_sorting.py index 26f2e9c4..b9b0f160 100644 --- a/python/pathway/tests/test_sorting.py +++ b/python/pathway/tests/test_sorting.py @@ -19,7 +19,6 @@ filter_cmp_helper, filter_smallest_k, prefix_sum_oracle, - sort_from_index, ) from pathway.tests.utils import T, assert_table_equality, assert_table_equality_wo_index @@ -578,7 +577,7 @@ def test_prevnext_single_instance(): 5 | 2 | 42 """ ) - result = sort_from_index(**build_sorted_index(nodes)) + result = nodes.sort(key=nodes.key, instance=nodes.instance) assert_table_equality( result, @@ -592,8 +591,8 @@ def test_prevnext_single_instance(): 5 | 3 | 1 """, ).select( - next=nodes.pointer_from(this.next, optional=True), prev=nodes.pointer_from(this.prev, optional=True), + next=nodes.pointer_from(this.next, optional=True), ), ) @@ -616,7 +615,7 @@ def test_prevnext_many_instance(): 10| 2 | 28 """ ) - result = sort_from_index(**build_sorted_index(nodes)) + result = nodes.sort(key=nodes.key, instance=nodes.instance) assert_table_equality( result, @@ -634,49 +633,6 @@ def test_prevnext_many_instance(): 9 | 5 | 1 10 | 6 | 2 """, - ).select( - next=nodes.pointer_from(this.next, optional=True), - prev=nodes.pointer_from(this.prev, optional=True), - ), - ) - - -def test_prevnext_many_instances_experimental(): - # FIXME: remove when experimental integrated into main sort - nodes = T( - """ - | key - 1 | 11 - 2 | 1 - 3 | 15 - 4 | 5 - 5 | 13 - 6 | 3 - 7 | 18 - 8 | 8 - 9 | 12 - 10| 2 - """ - ) - - result = nodes._sort_experimental(key=nodes.key, instance=None) - - assert_table_equality( - result, - T( - """ - | next | prev - 1 | 9 | 8 - 2 | 10 | - 3 | 7 | 5 - 4 | 8 | 6 - 5 | 3 | 9 - 6 | 4 | 10 - 7 | | 3 - 8 | 1 | 4 - 9 | 5 | 1 - 10 | 6 | 2 
- """, ).select( prev=nodes.pointer_from(this.prev, optional=True), next=nodes.pointer_from(this.next, optional=True), diff --git a/python/pathway/tests/test_streaming_test_utils.py b/python/pathway/tests/test_streaming_test_utils.py index 23045c7e..4cb0258d 100644 --- a/python/pathway/tests/test_streaming_test_utils.py +++ b/python/pathway/tests/test_streaming_test_utils.py @@ -6,12 +6,7 @@ import pathway as pw from pathway import demo from pathway.internals import Schema, api -from pathway.tests.utils import ( - DiffEntry, - assert_key_entries_in_stream_consistent, - assert_values_in_stream_consistent, - run, -) +from pathway.tests.utils import DiffEntry, assert_key_entries_in_stream_consistent, run def test_stream_success(): @@ -189,63 +184,3 @@ class TimeColumnInputSchema(Schema): assert_key_entries_in_stream_consistent(list, gb) with pytest.raises(AssertionError): run() - - -class ValueInputSchema(pw.Schema): - value: int - instance: int - - -def test_values_consistency_checker_1(): - value_functions = {"value": lambda x: 200 - x, "instance": lambda x: x % 3} - - t1 = pw.demo.generate_custom_stream( - value_functions, - schema=ValueInputSchema, - nb_rows=200, - autocommit_duration_ms=10, - input_rate=400, - ) - - res = t1.groupby(pw.this.instance).reduce( - pw.this.instance, min=pw.reducers.min(pw.this.value) - ) - - assert_values_in_stream_consistent(res, [(0, 2), (1, 1), (2, 3)]) - - -def test_values_consistency_checker_2(): - value_functions = {"value": lambda x: x, "instance": lambda x: x % 3} - - t1 = pw.demo.generate_custom_stream( - value_functions, - schema=ValueInputSchema, - nb_rows=200, - autocommit_duration_ms=10, - input_rate=400, - ) - - res = t1.groupby(pw.this.instance).reduce( - pw.this.instance, max=pw.reducers.max(pw.this.value) - ) - - assert_values_in_stream_consistent(res, [(0, 198), (1, 199), (2, 197)]) - - -def test_values_consistency_checker_raises(): - value_functions = {"value": lambda x: 200 - x, "instance": lambda x: x % 3} - - t1 = pw.demo.generate_custom_stream( - value_functions, - schema=ValueInputSchema, - nb_rows=200, - autocommit_duration_ms=10, - input_rate=400, - ) - - res = t1.groupby(pw.this.instance).reduce( - pw.this.instance, min=pw.reducers.min(pw.this.value) - ) - - with pytest.raises(AssertionError): - assert_values_in_stream_consistent(res, [(0, 2), (1, 1), (2, 0)]) diff --git a/python/pathway/tests/utils.py b/python/pathway/tests/utils.py index b36e91ea..5c2abe83 100644 --- a/python/pathway/tests/utils.py +++ b/python/pathway/tests/utils.py @@ -13,7 +13,7 @@ import time from abc import abstractmethod from collections.abc import Callable, Iterable -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Any import numpy as np @@ -136,79 +136,6 @@ def assert_key_entries_in_stream_consistent(expected: list[DiffEntry], table: pw pw.io.subscribe(table, callback, callback.on_end) -@dataclass -class Entry: - last_modified: int = 0 - values: collections.Counter[tuple[api.Value, ...]] = field( - default_factory=collections.Counter - ) - - def validate(self, key: api.Pointer) -> None: - if len(self.values) > 1: - raise AssertionError(f"Multiple entries {self.values} found for key {key}.") - elif len(self.values) == 1: - (value, count) = next(iter(self.values.items())) - assert ( - count == 1 - ), f"Entry {value} with cardinality {count}!=1 found for key {key}." 
- - def empty(self) -> bool: - return len(self.values) == 0 - - def get_element(self, key: api.Pointer) -> tuple[api.Value, ...]: - self.validate(key) - return next(iter(self.values)) - - -class CheckValuesConsistentInStreamCallback(pw.io._subscribe.OnChangeCallback): - data: dict[api.Pointer, Entry] - expected: list[tuple[api.Value, ...]] - - def __init__(self, expected: list[tuple[api.Value, ...]]): - super().__init__() - self.expected = expected - self.data = collections.defaultdict(Entry) - - def __call__( - self, - key: api.Pointer, - row: dict[str, api.Value], - time: int, - is_addition: bool, - ) -> Any: - hashable_row = make_row_hashable(tuple(row.values())) - entry = self.data[key] - if entry.last_modified < time: - entry.validate(key) - entry.last_modified = time - if is_addition: - entry.values[hashable_row] += 1 - else: - entry.values[hashable_row] -= 1 - if entry.values[hashable_row] == 0: - del entry.values[hashable_row] - - def on_end(self): - result = collections.Counter( - ( - entry.get_element(key) - for key, entry in self.data.items() - if not entry.empty() - ) - ) - expected = collections.Counter(make_row_hashable(row) for row in self.expected) - if result != expected: - raise AssertionError( - f"Tables are different, result: {result} vs expected: {expected}." - ) - - -def assert_values_in_stream_consistent(table: pw.Table, expected: list): - callback = CheckValuesConsistentInStreamCallback(expected) - pw.io.subscribe(table, callback, callback.on_end) - run() - - def assert_equal_tables(t0: api.CapturedTable, t1: api.CapturedTable): assert t0 == t1 @@ -451,3 +378,20 @@ def _create_send_row(i): ) return table + + +# Callback class for checking whether number of distinct timestamps of +# rows is equal to expected +class CountDifferentTimestampsCallback(pw.io.OnChangeCallback): + timestamps: set[int] + + def __init__(self, expected: int | None = None): + self.timestamps = set() + self.expected = expected + + def __call__(self, key, row, time: int, is_addition): + self.timestamps.add(time) + + def on_end(self): + if self.expected is not None: + assert len(self.timestamps) == self.expected diff --git a/src/connectors/adaptors.rs b/src/connectors/adaptors.rs new file mode 100644 index 00000000..16972599 --- /dev/null +++ b/src/connectors/adaptors.rs @@ -0,0 +1,174 @@ +use differential_dataflow::input::InputSession; +use differential_dataflow::lattice::Lattice; +use differential_dataflow::operators::arrange::upsert::arrange_from_upsert; +use differential_dataflow::trace::implementations::ord::OrdValBatch; +use differential_dataflow::trace::implementations::spine_fueled::Spine; +use differential_dataflow::Collection; +use timely::dataflow::operators::input::Handle; +use timely::dataflow::operators::Input as TimelyInput; +use timely::order::TotalOrder; +use timely::progress::Timestamp as TimelyTimestamp; + +use crate::engine::dataflow::maybe_total::MaybeTotalScope; +use crate::engine::{Key, Value}; + +use std::rc::Rc; + +pub type GenericValues = Collection; +pub type ValuesSessionAdaptor = Box>; + +#[derive(Clone, Copy, Debug)] +pub enum SessionType { + Native, + Upsert, +} + +impl SessionType { + pub fn new_collection< + Timestamp: TimelyTimestamp + Lattice + TotalOrder, + S: MaybeTotalScope, + >( + &self, + scope: &mut S, + ) -> (ValuesSessionAdaptor, GenericValues) { + match &self { + SessionType::Native => { + let mut input_session = InputSession::new(); + let collection = input_session.to_collection(scope); + (Box::new(input_session), collection) + } + SessionType::Upsert 
=> { + let mut upsert_session = UpsertSession::new(); + let collection = upsert_session.to_collection(scope); + (Box::new(upsert_session), collection) + } + } + } +} + +pub trait InputAdaptor { + fn new() -> Self + where + Self: Sized; + + fn insert(&mut self, key: Key, value: Value); + fn remove(&mut self, key: Key, value: Value); + fn upsert(&mut self, key: Key, value: Option); + + fn advance_to(&mut self, time: Timestamp); + fn time(&self) -> &Timestamp; + + fn flush(&mut self); +} + +#[derive(Default)] +pub struct UpsertSession { + time: Timestamp, + buffer: Vec<(Key, Option, Timestamp)>, + handle: Handle, Timestamp)>, +} + +impl UpsertSession { + pub fn to_collection>( + &mut self, + scope: &mut S, + ) -> Collection { + // We require that any given key is provided only to a single worker. + arrange_from_upsert::>>>( + &scope.input_from(&mut self.handle), + "UpsertSession", + ) + .as_collection(|k, v| (*k, v.clone())) + } +} + +impl InputAdaptor + for UpsertSession +{ + /// The implementation below mostly reuses differetial dataflow's `InputSession` internals. + /// + /// The main difference is the interface of the `to_collection` method and more task-based + /// insert and remove methods. + + fn new() -> Self { + let handle: Handle = Handle::new(); + UpsertSession { + time: handle.time().clone(), + buffer: Vec::new(), + handle, + } + } + + fn flush(&mut self) { + self.handle.send_batch(&mut self.buffer); + if self.handle.epoch().less_than(&self.time) { + self.handle.advance_to(self.time.clone()); + } + } + + fn advance_to(&mut self, time: Timestamp) { + assert!(self.handle.epoch().less_equal(&time)); + assert!(self.time.less_equal(&time)); + self.time = time; + } + + fn insert(&mut self, _key: Key, _value: Value) { + unimplemented!("this type of InputAdaptor doesn't support inserts") + } + + fn remove(&mut self, _key: Key, _value: Value) { + unimplemented!("this type of InputAdaptor doesn't support removals") + } + + fn upsert(&mut self, key: Key, value: Option) { + if self.buffer.len() == self.buffer.capacity() { + if !self.buffer.is_empty() { + self.handle.send_batch(&mut self.buffer); + } + self.buffer.reserve(1024); + } + self.buffer.push((key, value, self.time.clone())); + } + + fn time(&self) -> &Timestamp { + &self.time + } +} + +impl Drop for UpsertSession { + fn drop(&mut self) { + self.flush(); + } +} + +impl InputAdaptor + for InputSession +{ + fn new() -> Self { + Self::new() + } + + fn insert(&mut self, key: Key, value: Value) { + self.insert((key, value)); + } + + fn remove(&mut self, key: Key, value: Value) { + self.remove((key, value)); + } + + fn upsert(&mut self, _key: Key, _value: Option) { + unimplemented!("this type of InputAdaptor doesn't support upserts") + } + + fn flush(&mut self) { + self.flush(); + } + + fn advance_to(&mut self, time: Timestamp) { + self.advance_to(time); + } + + fn time(&self) -> &Timestamp { + self.time() + } +} diff --git a/src/connectors/data_format.rs b/src/connectors/data_format.rs index 6aba5e62..5f71895d 100644 --- a/src/connectors/data_format.rs +++ b/src/connectors/data_format.rs @@ -8,8 +8,9 @@ use std::iter::zip; use std::mem::take; use std::str::{from_utf8, Utf8Error}; +use crate::connectors::metadata::SourceMetadata; use crate::connectors::ReaderContext::{Diff, KeyValue, RawBytes, TokenizedEntries}; -use crate::connectors::{DataEventType, ReaderContext}; +use crate::connectors::{DataEventType, Offset, ReaderContext, SessionType, SnapshotEvent}; use crate::engine::error::DynError; use crate::engine::{Key, Result, Type, Value}; 
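// --- Editor's illustrative sketch, not part of the diff above. ---
// A simplified, self-contained model of the dispatch that the new adaptors.rs
// enables: a parser reports which SessionType it needs, and the connector then
// drives either session through one object-safe trait. The types below are
// hypothetical stand-ins (plain u64 keys, String values, in-memory storage);
// the real engine types are generic over timestamps and backed by
// differential dataflow input handles.
use std::collections::HashMap;

#[derive(Clone, Copy, Debug)]
enum SessionType {
    Native,
    Upsert,
}

trait InputAdaptor {
    fn insert(&mut self, key: u64, value: String);
    fn upsert(&mut self, key: u64, value: Option<String>);
}

#[derive(Default)]
struct NativeSession(Vec<(u64, String)>);

#[derive(Default)]
struct UpsertSession(HashMap<u64, Option<String>>);

impl InputAdaptor for NativeSession {
    fn insert(&mut self, key: u64, value: String) {
        self.0.push((key, value));
    }
    fn upsert(&mut self, _key: u64, _value: Option<String>) {
        unimplemented!("native sessions only take inserts/removals")
    }
}

impl InputAdaptor for UpsertSession {
    fn insert(&mut self, _key: u64, _value: String) {
        unimplemented!("upsert sessions only take upserts")
    }
    fn upsert(&mut self, key: u64, value: Option<String>) {
        self.0.insert(key, value);
    }
}

// Mirrors SessionType::new_collection: the caller gets a boxed adaptor and
// never needs to know which concrete session is behind it.
fn new_session(session_type: SessionType) -> Box<dyn InputAdaptor> {
    match session_type {
        SessionType::Native => Box::new(NativeSession::default()),
        SessionType::Upsert => Box::new(UpsertSession::default()),
    }
}

fn main() {
    // A MongoDB-Debezium-style source reports Upsert; plain append-only sources report Native.
    let mut session = new_session(SessionType::Upsert);
    session.upsert(1, Some("row".to_string()));
    session.upsert(1, None); // a later upsert carrying None acts as a deletion of the key
}
// --- End of editor's sketch; the diff continues below. ---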
@@ -25,10 +26,37 @@ pub enum ParsedEvent { AdvanceTime, Insert((Option>, Vec)), + // None as Vec of values means that the record is removed + Upsert((Option>, Option>)), + // If None, finding the key for the provided values becomes responsibility of the connector Delete((Option>, Vec)), } +impl ParsedEvent { + pub fn key( + &self, + mut values_to_key: impl FnMut(Option<&Vec>, Option<&Offset>) -> Key, + offset: Option<&Offset>, + ) -> Option { + match self { + ParsedEvent::Insert((raw_key, _)) + | ParsedEvent::Upsert((raw_key, _)) + | ParsedEvent::Delete((raw_key, _)) => Some(values_to_key(raw_key.as_ref(), offset)), + ParsedEvent::AdvanceTime => None, + } + } + + pub fn snapshot_event(&self, key: Key) -> Option { + match self { + ParsedEvent::Insert((_, values)) => Some(SnapshotEvent::Insert(key, values.clone())), + ParsedEvent::Upsert((_, values)) => Some(SnapshotEvent::Upsert(key, values.clone())), + ParsedEvent::Delete((_, values)) => Some(SnapshotEvent::Delete(key, values.clone())), + ParsedEvent::AdvanceTime => None, + } + } +} + #[derive(Debug, thiserror::Error)] #[non_exhaustive] pub enum ParseError { @@ -129,12 +157,16 @@ fn prepare_plaintext_string(bytes: &[u8]) -> PrepareStringResult { pub trait Parser: Send { fn parse(&mut self, data: &ReaderContext) -> ParseResult; - fn on_new_source_started(&mut self); + fn on_new_source_started(&mut self, metadata: Option<&SourceMetadata>); fn column_count(&self) -> usize; fn short_description(&self) -> Cow<'static, str> { type_name::().into() } + + fn session_type(&self) -> SessionType { + SessionType::Native + } } #[derive(Debug)] @@ -214,13 +246,20 @@ impl DsvSettings { } } +#[derive(Clone)] +enum DsvColumnIndex { + Index(usize), + Metadata, +} + pub struct DsvParser { settings: DsvSettings, schema: HashMap, header: Vec, - key_column_indices: Option>, - value_column_indices: Vec, + metadata_column_value: Value, + key_column_indices: Option>, + value_column_indices: Vec, indexed_schema: HashMap, dsv_header_read: bool, } @@ -300,12 +339,16 @@ fn parse_with_type( } } +/// "magic field" containing the metadata +const METADATA_FIELD_NAME: &str = "_metadata"; + impl DsvParser { pub fn new(settings: DsvSettings, schema: HashMap) -> DsvParser { DsvParser { settings, schema, + metadata_column_value: Value::None, header: Vec::new(), key_column_indices: None, value_column_indices: Vec::new(), @@ -317,12 +360,16 @@ impl DsvParser { fn column_indices_by_names( tokenized_entries: &[String], sought_names: &[String], - ) -> Result, ParseError> { + ) -> Result, ParseError> { let mut value_indices_found = 0; - let mut column_indices = vec![0; sought_names.len()]; + let mut column_indices = vec![DsvColumnIndex::Metadata; sought_names.len()]; let mut requested_indices = HashMap::>::new(); for (index, field) in sought_names.iter().enumerate() { + if field == METADATA_FIELD_NAME { + value_indices_found += 1; + continue; + } match requested_indices.get_mut(field) { Some(indices) => indices.push(index), None => { @@ -334,7 +381,7 @@ impl DsvParser { for (index, value) in tokenized_entries.iter().enumerate() { if let Some(indices) = requested_indices.get(value) { for requested_index in indices { - column_indices[*requested_index] = index; + column_indices[*requested_index] = DsvColumnIndex::Index(index); value_indices_found += 1; } } @@ -392,15 +439,21 @@ impl DsvParser { } fn values_by_indices( + &self, tokens: &[String], - indices: &[usize], + indices: &[DsvColumnIndex], indexed_schema: &HashMap, header: &[String], ) -> Result, ParseError> { let mut 
parsed_tokens = Vec::with_capacity(indices.len()); for index in indices { - let schema_item = indexed_schema.get(index).unwrap_or_default(); - let token = parse_with_type(&tokens[*index], schema_item, &header[*index])?; + let token = match index { + DsvColumnIndex::Index(index) => { + let schema_item = indexed_schema.get(index).unwrap_or_default(); + parse_with_type(&tokens[*index], schema_item, &header[*index])? + } + DsvColumnIndex::Metadata => self.metadata_column_value.clone(), + }; parsed_tokens.push(token); } Ok(parsed_tokens) @@ -422,15 +475,19 @@ impl DsvParser { let mut line_has_enough_tokens = true; if let Some(indices) = &self.key_column_indices { for index in indices { - line_has_enough_tokens &= index < &tokens.len(); + if let DsvColumnIndex::Index(index) = index { + line_has_enough_tokens &= index < &tokens.len(); + } } } for index in &self.value_column_indices { - line_has_enough_tokens &= index < &tokens.len(); + if let DsvColumnIndex::Index(index) = index { + line_has_enough_tokens &= index < &tokens.len(); + } } if line_has_enough_tokens { let key = match &self.key_column_indices { - Some(indices) => Some(Self::values_by_indices( + Some(indices) => Some(self.values_by_indices( tokens, indices, &self.indexed_schema, @@ -438,7 +495,7 @@ impl DsvParser { )?), None => None, }; - let parsed_tokens = Self::values_by_indices( + let parsed_tokens = self.values_by_indices( tokens, &self.value_column_indices, &self.indexed_schema, @@ -447,6 +504,7 @@ impl DsvParser { let parsed_entry = match event { DataEventType::Insert => ParsedEvent::Insert((key, parsed_tokens)), DataEventType::Delete => ParsedEvent::Delete((key, parsed_tokens)), + DataEventType::Upsert => unreachable!("readers can't send upserts to DsvParser"), }; Ok(vec![parsed_entry]) } else { @@ -470,8 +528,13 @@ impl Parser for DsvParser { } } - fn on_new_source_started(&mut self) { + fn on_new_source_started(&mut self, metadata: Option<&SourceMetadata>) { self.dsv_header_read = false; + if let Some(metadata) = metadata { + let metadata_serialized: JsonValue = + serde_json::to_value(metadata).expect("internal serialization error"); + self.metadata_column_value = metadata_serialized.into(); + } } fn column_count(&self) -> usize { @@ -480,12 +543,18 @@ impl Parser for DsvParser { } pub struct IdentityParser { + value_fields: Vec, parse_utf8: bool, + metadata_column_value: Value, } impl IdentityParser { - pub fn new(parse_utf8: bool) -> IdentityParser { - Self { parse_utf8 } + pub fn new(value_fields: Vec, parse_utf8: bool) -> IdentityParser { + Self { + value_fields, + parse_utf8, + metadata_column_value: Value::None, + } } fn prepare_bytes(&self, bytes: &[u8]) -> Result { @@ -524,19 +593,36 @@ impl Parser for IdentityParser { let event = if is_commit { ParsedEvent::AdvanceTime } else { - let values = vec![value]; + let mut values = Vec::new(); + for field in &self.value_fields { + if field == METADATA_FIELD_NAME { + values.push(self.metadata_column_value.clone()); + } else { + values.push(value.clone()); + } + } match event { DataEventType::Insert => ParsedEvent::Insert((key, values)), DataEventType::Delete => ParsedEvent::Delete((key, values)), + DataEventType::Upsert => { + unreachable!("readers can't send upserts to IdentityParser") + } } }; Ok(vec![event]) } - fn on_new_source_started(&mut self) {} + fn on_new_source_started(&mut self, metadata: Option<&SourceMetadata>) { + if let Some(metadata) = metadata { + let metadata_serialized: JsonValue = + serde_json::to_value(metadata).expect("internal serialization error"); + 
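// --- Editor's illustrative sketch, not part of the diff above. ---
// Shows how a parser can splice the "magic" _metadata column into an output
// row, mirroring the METADATA_FIELD_NAME handling introduced in this file.
// The concrete field values below are made up for illustration; the real
// shape comes from SourceMetadata (created_at, modified_at, owner, path),
// and which fields are populated is platform-dependent.
use serde_json::json;

fn main() {
    let value_fields = vec!["data".to_string(), "_metadata".to_string()];
    let metadata = json!({
        "created_at": 1_699_000_000u64,
        "modified_at": 1_699_000_100u64,
        "owner": "pathway",
        "path": "/input/file.txt",
    });
    let raw_value = "payload bytes as text";

    // Every requested field receives either the raw value or the metadata blob.
    let row: Vec<String> = value_fields
        .iter()
        .map(|field| {
            if field == "_metadata" {
                metadata.to_string()
            } else {
                raw_value.to_string()
            }
        })
        .collect();

    println!("{row:?}");
}
// --- End of editor's sketch; the diff continues below. ---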
self.metadata_column_value = metadata_serialized.into(); + } + } + fn column_count(&self) -> usize { - 1 + self.value_fields.len() } } @@ -607,10 +693,17 @@ impl Formatter for DsvFormatter { } } +#[derive(Clone, Copy, Debug)] +pub enum DebeziumDBType { + Postgres, + MongoDB, +} + pub struct DebeziumMessageParser { key_field_names: Option>, value_field_names: Vec, separator: String, // how key-value pair is separated + db_type: DebeziumDBType, } fn parse_value_from_json(value: &JsonValue) -> Option { @@ -688,6 +781,7 @@ fn values_by_names_from_json( column_paths: &HashMap, field_absence_is_error: bool, schema: &HashMap, + metadata_column_value: &Value, ) -> Result, ParseError> { let mut parsed_values = Vec::with_capacity(field_names.len()); for value_field in field_names { @@ -703,7 +797,9 @@ fn values_by_names_from_json( } }; - let value = if let Some(path) = column_paths.get(value_field) { + let value = if value_field == METADATA_FIELD_NAME { + metadata_column_value.clone() + } else if let Some(path) = column_paths.get(value_field) { if let Some(value) = payload.pointer(path) { match dtype { Type::Json => Value::from(value.clone()), @@ -760,11 +856,13 @@ impl DebeziumMessageParser { key_field_names: Option>, value_field_names: Vec, separator: String, + db_type: DebeziumDBType, ) -> DebeziumMessageParser { DebeziumMessageParser { key_field_names, value_field_names, separator, + db_type, } } @@ -774,57 +872,104 @@ impl DebeziumMessageParser { fn parse_event( &mut self, + key: &JsonValue, value: &JsonValue, event: DataEventType, ) -> Result { + // in case of MongoDB, the message is always string + let prepared_value: JsonValue = { + if let JsonValue::String(serialized_json) = &value { + let Ok(prepared_value) = serde_json::from_str::(serialized_json) else { + return Err(ParseError::FailedToParseJson(serialized_json.to_string())); + }; + prepared_value + } else { + value.clone() + } + }; + let key = match &self.key_field_names { None => None, Some(names) => Some(values_by_names_from_json( - value, + key, names, &HashMap::new(), true, &HashMap::new(), + &Value::None, )?), }; let parsed_values = values_by_names_from_json( - value, + &prepared_value, &self.value_field_names, &HashMap::new(), true, &HashMap::new(), + &Value::None, )?; match event { DataEventType::Insert => Ok(ParsedEvent::Insert((key, parsed_values))), DataEventType::Delete => Ok(ParsedEvent::Delete((key, parsed_values))), + DataEventType::Upsert => Ok(ParsedEvent::Upsert((key, Some(parsed_values)))), } } - fn parse_read_or_create(&mut self, value: &JsonValue) -> ParseResult { - Ok(vec![ - self.parse_event(&value["after"], DataEventType::Insert)? - ]) + fn parse_read_or_create(&mut self, key: &JsonValue, value: &JsonValue) -> ParseResult { + let event = match self.db_type { + DebeziumDBType::Postgres => { + self.parse_event(key, &value["after"], DataEventType::Insert)? + } + DebeziumDBType::MongoDB => { + self.parse_event(key, &value["after"], DataEventType::Upsert)? + } + }; + Ok(vec![event]) } - fn parse_delete(&mut self, value: &JsonValue) -> ParseResult { - Ok(vec![ - self.parse_event(&value["before"], DataEventType::Delete)? - ]) + fn parse_delete(&mut self, key: &JsonValue, value: &JsonValue) -> ParseResult { + let event = match self.db_type { + DebeziumDBType::Postgres => { + self.parse_event(key, &value["before"], DataEventType::Delete)? 
+ } + DebeziumDBType::MongoDB => { + let key = match &self.key_field_names { + None => None, + Some(names) => Some(values_by_names_from_json( + key, + names, + &HashMap::new(), + true, + &HashMap::new(), + &Value::None, + )?), + }; + ParsedEvent::Upsert((key, None)) + } + }; + Ok(vec![event]) } - fn parse_update(&mut self, value: &JsonValue) -> ParseResult { - let event_before = self.parse_event(&value["before"], DataEventType::Delete)?; - let event_after = self.parse_event(&value["after"], DataEventType::Insert)?; - - Ok(vec![event_before, event_after]) + fn parse_update(&mut self, key: &JsonValue, value: &JsonValue) -> ParseResult { + match self.db_type { + DebeziumDBType::Postgres => { + let event_before = + self.parse_event(key, &value["before"], DataEventType::Delete)?; + let event_after = self.parse_event(key, &value["after"], DataEventType::Insert)?; + Ok(vec![event_before, event_after]) + } + DebeziumDBType::MongoDB => { + let event_after = self.parse_event(key, &value["after"], DataEventType::Upsert)?; + Ok(vec![event_after]) + } + } } } impl Parser for DebeziumMessageParser { fn parse(&mut self, data: &ReaderContext) -> ParseResult { - let raw_value_change = match data { + let (raw_key_change, raw_value_change) = match data { RawBytes(event, raw_bytes) => { // We don't use `event` type here, because it's Debezium message parser, // whose messages can only arrive from Kafka. @@ -840,12 +985,19 @@ impl Parser for DebeziumMessageParser { if key_and_value.len() != 2 { return Err(ParseError::KeyValueTokensIncorrect(key_and_value.len())); } - key_and_value[1].to_string() + (key_and_value[0].to_string(), key_and_value[1].to_string()) + } + KeyValue((k, v)) => { + let key = match k { + Some(bytes) => prepare_plaintext_string(bytes)?, + None => return Err(ParseError::EmptyKafkaPayload), + }; + let value = match v { + Some(bytes) => prepare_plaintext_string(bytes)?, + None => return Err(ParseError::EmptyKafkaPayload), + }; + (key, value) } - KeyValue((_k, v)) => match v { - Some(bytes) => prepare_plaintext_string(bytes)?, - None => return Err(ParseError::EmptyKafkaPayload), - }, Diff(_) | TokenizedEntries(_, _) => { return Err(ParseError::UnsupportedReaderContext); } @@ -857,7 +1009,7 @@ impl Parser for DebeziumMessageParser { let change_payload = match value_change { JsonValue::Object(payload_value) => payload_value, - JsonValue::Null => return Ok(Vec::new()), + JsonValue::Null => return Ok(Vec::new()), // tombstone event for kafka: nothing to do for us _ => { return Err(ParseError::DebeziumFormatViolated( DebeziumFormatError::IncorrectJsonRoot, @@ -865,6 +1017,10 @@ impl Parser for DebeziumMessageParser { } }; + let Ok(change_key) = serde_json::from_str::(&raw_key_change) else { + return Err(ParseError::FailedToParseJson(raw_key_change)); + }; + if !change_payload.contains_key("payload") { return Err(ParseError::DebeziumFormatViolated( DebeziumFormatError::NoPayloadAtTopLevel, @@ -873,9 +1029,11 @@ impl Parser for DebeziumMessageParser { match &change_payload["payload"]["op"] { JsonValue::String(op) => match op.as_ref() { - "r" | "c" => self.parse_read_or_create(&change_payload["payload"]), - "u" => self.parse_update(&change_payload["payload"]), - "d" => self.parse_delete(&change_payload["payload"]), + "r" | "c" => { + self.parse_read_or_create(&change_key["payload"], &change_payload["payload"]) + } + "u" => self.parse_update(&change_key["payload"], &change_payload["payload"]), + "d" => self.parse_delete(&change_key["payload"], &change_payload["payload"]), _ => 
Err(ParseError::UnsupportedDebeziumOperation(op.to_string())), }, _ => Err(ParseError::DebeziumFormatViolated( @@ -884,11 +1042,22 @@ impl Parser for DebeziumMessageParser { } } - fn on_new_source_started(&mut self) {} + fn on_new_source_started(&mut self, _metadata: Option<&SourceMetadata>) {} fn column_count(&self) -> usize { self.value_field_names.len() } + + fn session_type(&self) -> SessionType { + match self.db_type { + DebeziumDBType::Postgres => SessionType::Native, + + // MongoDB events don't contain the previous state of the record + // therefore we can only do the upsert with the same key and the + // new value + DebeziumDBType::MongoDB => SessionType::Upsert, + } + } } pub struct JsonLinesParser { @@ -897,6 +1066,7 @@ pub struct JsonLinesParser { column_paths: HashMap, field_absence_is_error: bool, schema: HashMap, + metadata_column_value: Value, } impl JsonLinesParser { @@ -913,6 +1083,7 @@ impl JsonLinesParser { column_paths, field_absence_is_error, schema, + metadata_column_value: Value::None, } } } @@ -962,6 +1133,7 @@ impl Parser for JsonLinesParser { &self.column_paths, self.field_absence_is_error, &self.schema, + &self.metadata_column_value, )?), None => None, // use method from the different PR }); @@ -972,17 +1144,25 @@ impl Parser for JsonLinesParser { &self.column_paths, self.field_absence_is_error, &self.schema, + &self.metadata_column_value, )?; let event = match data_event { DataEventType::Insert => ParsedEvent::Insert((key, values)), DataEventType::Delete => ParsedEvent::Delete((key, values)), + DataEventType::Upsert => unreachable!("readers can't send upserts to JsonLinesParser"), }; Ok(vec![event]) } - fn on_new_source_started(&mut self) {} + fn on_new_source_started(&mut self, metadata: Option<&SourceMetadata>) { + if let Some(metadata) = metadata { + let metadata_serialized: JsonValue = + serde_json::to_value(metadata).expect("internal serialization error"); + self.metadata_column_value = metadata_serialized.into(); + } + } fn column_count(&self) -> usize { self.value_field_names.len() diff --git a/src/connectors/data_storage.rs b/src/connectors/data_storage.rs index be7a7f7b..96c4e1fd 100644 --- a/src/connectors/data_storage.rs +++ b/src/connectors/data_storage.rs @@ -31,6 +31,7 @@ use postgres::types::ToSql; use xxhash_rust::xxh3::Xxh3 as Hasher; use crate::connectors::data_format::FormatterContext; +use crate::connectors::metadata::SourceMetadata; use crate::connectors::{Offset, OffsetKey, OffsetValue}; use crate::deepcopy::DeepCopy; use crate::engine::Value; @@ -121,6 +122,7 @@ pub enum S3CommandName { pub enum DataEventType { Insert, Delete, + Upsert, } #[derive(PartialEq, Eq, Debug)] @@ -159,7 +161,7 @@ impl ReaderContext { #[derive(Debug, Eq, PartialEq)] pub enum ReadResult { Finished, - NewSource, + NewSource(Option), Data(ReaderContext, Offset), } @@ -227,6 +229,7 @@ pub trait Reader { #[allow(clippy::missing_errors_doc)] fn seek(&mut self, frontier: &OffsetAntichain) -> Result<(), ReadError>; + fn update_persistent_id(&mut self, persistent_id: Option); fn persistent_id(&self) -> Option; fn merge_two_frontiers(lhs: &OffsetAntichain, rhs: &OffsetAntichain) -> OffsetAntichain @@ -297,7 +300,7 @@ pub trait ReaderBuilder: Send + 'static { type_name::().into() } - fn name(&self, persistent_id: &Option, id: usize) -> String { + fn name(&self, persistent_id: Option<&ExternalPersistentId>, id: usize) -> String { let desc = self.short_description(); let name = desc.split("::").last().unwrap().replace("Builder", ""); if let Some(id) = persistent_id { @@ -312,6 
+315,7 @@ pub trait ReaderBuilder: Send + 'static { } fn persistent_id(&self) -> Option; + fn update_persistent_id(&mut self, persistent_id: Option); fn storage_type(&self) -> StorageType; } @@ -328,6 +332,10 @@ where Reader::persistent_id(self) } + fn update_persistent_id(&mut self, persistent_id: Option) { + Reader::update_persistent_id(self, persistent_id); + } + fn storage_type(&self) -> StorageType { Reader::storage_type(self) } @@ -437,9 +445,15 @@ impl FilesystemReader { persistent_id: Option, read_method: ReadMethod, object_pattern: &str, + with_metadata: bool, ) -> Result { - let filesystem_scanner = - FilesystemScanner::new(path, persistent_id, streaming_mode, object_pattern)?; + let filesystem_scanner = FilesystemScanner::new( + path, + persistent_id, + streaming_mode, + object_pattern, + with_metadata, + )?; Ok(Self { persistent_id, @@ -528,7 +542,9 @@ impl Reader for FilesystemReader { .as_path(), )?; self.reader = Some(BufReader::new(file)); - return Ok(ReadResult::NewSource); + return Ok(ReadResult::NewSource( + self.filesystem_scanner.maybe_current_object_metadata(), + )); } if self.filesystem_scanner.is_polling_enabled() { @@ -543,6 +559,10 @@ impl Reader for FilesystemReader { self.persistent_id } + fn update_persistent_id(&mut self, persistent_id: Option) { + self.persistent_id = persistent_id; + } + fn storage_type(&self) -> StorageType { StorageType::FileSystem } @@ -649,6 +669,10 @@ impl Reader for KafkaReader { self.persistent_id } + fn update_persistent_id(&mut self, persistent_id: Option) { + self.persistent_id = persistent_id; + } + fn storage_type(&self) -> StorageType { StorageType::Kafka } @@ -690,6 +714,7 @@ struct FilesystemScanner { cached_modify_times: HashMap>, inotify: Option, object_pattern: GlobPattern, + with_metadata: bool, } impl FilesystemScanner { @@ -698,6 +723,7 @@ impl FilesystemScanner { persistent_id: Option, streaming_mode: ConnectorMode, object_pattern: &str, + with_metadata: bool, ) -> Result { let path = std::fs::canonicalize(path.into())?; @@ -740,6 +766,7 @@ impl FilesystemScanner { cached_modify_times: HashMap::new(), inotify, object_pattern: GlobPattern::new(object_pattern)?, + with_metadata, }) } @@ -767,6 +794,19 @@ impl FilesystemScanner { } } + fn maybe_current_object_metadata(&self) -> Option { + if !self.with_metadata { + return None; + } + let path: &Path = match &self.current_action { + Some(PosixScannerAction::Read(path) | PosixScannerAction::Delete(path)) => { + path.as_ref() + } + None => return None, + }; + Some(SourceMetadata::from_fs_path(path)) + } + /// Returns the name of the currently processed file in the input directory fn current_offset_file(&self) -> Option> { match &self.current_action { @@ -1063,9 +1103,15 @@ impl CsvFilesystemReader { streaming_mode: ConnectorMode, persistent_id: Option, object_pattern: &str, + with_metadata: bool, ) -> Result { - let filesystem_scanner = - FilesystemScanner::new(path.into(), persistent_id, streaming_mode, object_pattern)?; + let filesystem_scanner = FilesystemScanner::new( + path.into(), + persistent_id, + streaming_mode, + object_pattern, + with_metadata, + )?; Ok(CsvFilesystemReader { parser_builder, persistent_id, @@ -1180,7 +1226,9 @@ impl Reader for CsvFilesystemReader { .as_path(), )?, ); - return Ok(ReadResult::NewSource); + return Ok(ReadResult::NewSource( + self.filesystem_scanner.maybe_current_object_metadata(), + )); } // The file came to its end, so we should drop the reader self.reader = None; @@ -1196,7 +1244,9 @@ impl Reader for CsvFilesystemReader { .as_path(), 
)?, ); - return Ok(ReadResult::NewSource); + return Ok(ReadResult::NewSource( + self.filesystem_scanner.maybe_current_object_metadata(), + )); } } } @@ -1213,6 +1263,10 @@ impl Reader for CsvFilesystemReader { self.persistent_id } + fn update_persistent_id(&mut self, persistent_id: Option) { + self.persistent_id = persistent_id; + } + fn storage_type(&self) -> StorageType { StorageType::CsvFilesystem } @@ -1267,6 +1321,10 @@ impl ReaderBuilder for PythonReaderBuilder { self.persistent_id } + fn update_persistent_id(&mut self, persistent_id: Option) { + self.persistent_id = persistent_id; + } + fn storage_type(&self) -> StorageType { StorageType::Python } @@ -1334,6 +1392,10 @@ impl Reader for PythonReader { self.persistent_id } + fn update_persistent_id(&mut self, persistent_id: Option) { + self.persistent_id = persistent_id; + } + fn storage_type(&self) -> StorageType { StorageType::Python } @@ -1807,12 +1869,14 @@ impl Reader for S3CsvReader { )); } if self.stream_next_object()? { - return Ok(ReadResult::NewSource); + // No metadata is currently provided by S3 scanner + return Ok(ReadResult::NewSource(None)); } } None => { if self.stream_next_object()? { - return Ok(ReadResult::NewSource); + // No metadata is currently provided by S3 scanner + return Ok(ReadResult::NewSource(None)); } } } @@ -1832,6 +1896,10 @@ impl Reader for S3CsvReader { fn persistent_id(&self) -> Option { self.persistent_id } + + fn update_persistent_id(&mut self, persistent_id: Option) { + self.persistent_id = persistent_id; + } } pub struct KafkaWriter { @@ -2094,12 +2162,14 @@ impl Reader for S3GenericReader { } if self.stream_next_object()? { - return Ok(ReadResult::NewSource); + // No metadata is currently provided by S3 scanner + return Ok(ReadResult::NewSource(None)); } } None => { if self.stream_next_object()? { - return Ok(ReadResult::NewSource); + // No metadata is currently provided by S3 scanner + return Ok(ReadResult::NewSource(None)); } } } @@ -2119,4 +2189,8 @@ impl Reader for S3GenericReader { fn persistent_id(&self) -> Option { self.persistent_id } + + fn update_persistent_id(&mut self, persistent_id: Option) { + self.persistent_id = persistent_id; + } } diff --git a/src/connectors/metadata.rs b/src/connectors/metadata.rs new file mode 100644 index 00000000..8ac65923 --- /dev/null +++ b/src/connectors/metadata.rs @@ -0,0 +1,71 @@ +use log::error; +use std::path::Path; +use std::time::{SystemTime, UNIX_EPOCH}; + +use serde::Serialize; + +/// Basic metadata for a file-like object +#[allow(clippy::module_name_repetitions)] +#[derive(Clone, Debug, Eq, PartialEq, Serialize)] +pub struct SourceMetadata { + // Creation and modification time may not be available at some platforms + // Stored in u64 for easy serialization + created_at: Option, + modified_at: Option, + + // Owner may be unavailable at some platforms and on S3 + owner: Option, + + // Path should always be available. 
We make it String for two reasons: + // * S3 path is denoted as a String + // * This object is directly serialized and passed into a connector row + path: String, +} + +impl SourceMetadata { + pub fn from_fs_path(path: &Path) -> Self { + let (created_at, modified_at, owner) = match std::fs::metadata(path) { + Ok(metadata) => ( + metadata_time_to_unix_timestamp(metadata.created().ok()), + metadata_time_to_unix_timestamp(metadata.modified().ok()), + file_owner::get_owner(&metadata), + ), + Err(e) => { + error!("Failed to get metadata for filesystem object {path:?}, details: {e}"); + (None, None, None) + } + }; + + Self { + created_at, + modified_at, + owner, + path: path.to_string_lossy().to_string(), + } + } +} + +#[cfg(target_os = "linux")] +mod file_owner { + use nix::unistd::User; + use std::os::unix::fs::MetadataExt; + + pub fn get_owner(metadata: &std::fs::Metadata) -> Option { + let uid = metadata.uid(); + let user = User::from_uid(uid.into()).ok()?; + Some(user?.name) + } +} + +#[cfg(not(target_os = "linux"))] +mod file_owner { + pub fn get_owner(metadata: &std::fs::Metadata) -> Option { + None + } +} + +fn metadata_time_to_unix_timestamp(timestamp: Option) -> Option { + timestamp + .and_then(|timestamp| timestamp.duration_since(UNIX_EPOCH).ok()) + .map(|duration| duration.as_secs()) +} diff --git a/src/connectors/mod.rs b/src/connectors/mod.rs index 54f6ecfb..86f5d71a 100644 --- a/src/connectors/mod.rs +++ b/src/connectors/mod.rs @@ -10,22 +10,23 @@ use std::thread; use std::thread::Thread; use std::time::{Duration, SystemTime}; -use differential_dataflow::input::InputSession; use scopeguard::guard; use timely::dataflow::operators::probe::Handle; use timely::progress::Timestamp as TimelyTimestamp; +pub mod adaptors; pub mod data_format; pub mod data_storage; +pub mod metadata; pub mod monitoring; pub mod offset; pub mod snapshot; -pub mod upsert_session; use crate::connectors::monitoring::ConnectorMonitor; use crate::engine::report_error::{ReportError, SpawnWithReporter}; use crate::engine::{Key, Value}; +use crate::connectors::adaptors::InputAdaptor; use crate::connectors::snapshot::Event as SnapshotEvent; use crate::engine::Error as EngineError; use crate::persistence::frontier::OffsetAntichain; @@ -36,6 +37,7 @@ use crate::timestamp::current_unix_timestamp_ms; use data_format::{ParseResult, ParsedEvent, Parser}; use data_storage::{DataEventType, ReadResult, Reader, ReaderBuilder, ReaderContext, WriteError}; +pub use adaptors::SessionType; pub use data_storage::StorageType; pub use offset::{Offset, OffsetKey, OffsetValue}; @@ -171,10 +173,7 @@ where } } - fn advance_time( - &mut self, - input_session: &mut InputSession, - ) -> u64 { + fn advance_time(&mut self, input_session: &mut dyn InputAdaptor) -> u64 { let new_timestamp = u64::try_from(current_unix_timestamp_ms()) .expect("number of milliseconds should fit in 64 bits"); let new_timestamp = (new_timestamp / 2) * 2; //use only even times (required by alt-neu) @@ -214,7 +213,9 @@ where info!("Reached the end of the snapshot. 
Exiting the rewind after {entries_read} entries"); break; } - SnapshotEvent::Insert(_, _) | SnapshotEvent::Delete(_, _) => { + SnapshotEvent::Insert(_, _) + | SnapshotEvent::Delete(_, _) + | SnapshotEvent::Upsert(_, _) => { entries_read += 1; let send_res = sender.send(Entry::Snapshot(entry_read)); if let Err(e) = send_res { @@ -337,13 +338,13 @@ where mut self, reader: Box, mut parser: Box, - mut input_session: InputSession, - mut values_to_key: impl FnMut(Option>, Option<&Offset>) -> Key + 'static, + mut input_session: Box>, + mut values_to_key: impl FnMut(Option<&Vec>, Option<&Offset>) -> Key + 'static, probe: Handle, persistent_storage: Option>>, connector_id: usize, realtime_reader_needed: bool, - external_persistent_id: &Option, + external_persistent_id: Option<&ExternalPersistentId>, replay_mode: ReplayMode, snapshot_access: SnapshotAccess, error_reporter: impl ReportError + 'static, @@ -423,7 +424,7 @@ where self.on_parsed_data( parsed_entries, None, // no key generation for time advancement - &mut input_session, + input_session.as_mut(), &mut values_to_key, &mut snapshot_writer, &mut Some(&mut *connector_monitor.borrow_mut()), @@ -454,7 +455,7 @@ where entry, &mut backfilling_finished, &mut parser, - &mut input_session, + input_session.as_mut(), &mut values_to_key, &mut snapshot_writer, &offsets_by_time_writer, @@ -483,8 +484,8 @@ where entry: Entry, backfilling_finished: &mut bool, parser: &mut Box, - input_session: &mut InputSession, - values_to_key: impl FnMut(Option>, Option<&Offset>) -> Key, + input_session: &mut dyn InputAdaptor, + values_to_key: impl FnMut(Option<&Vec>, Option<&Offset>) -> Key, snapshot_writer: &mut Option, offsets_by_time_writer: &Mutex>, connector_monitor: &mut Option<&mut ConnectorMonitor>, @@ -494,8 +495,8 @@ where match entry { Entry::Realtime(read_result) => match read_result { ReadResult::Finished => {} - ReadResult::NewSource => { - parser.on_new_source_started(); + ReadResult::NewSource(metadata) => { + parser.on_new_source_started(metadata.as_ref()); let parsed_entries = vec![ParsedEvent::AdvanceTime]; self.on_parsed_data( @@ -563,6 +564,9 @@ where SnapshotEvent::Delete(key, value) => { Self::on_remove(key, value, input_session); } + SnapshotEvent::Upsert(key, value) => { + Self::on_upsert(key, value, input_session); + } SnapshotEvent::AdvanceTime(_) | SnapshotEvent::Finished => { unreachable!() } @@ -577,8 +581,8 @@ where pub fn run_with_custom_reader( &mut self, custom_reader: &mut dyn CustomReader, - input_session: &mut InputSession, - mut values_to_key: impl FnMut(Option>, Option<&Offset>) -> Key, + input_session: &mut dyn InputAdaptor, + mut values_to_key: impl FnMut(Option<&Vec>, Option<&Offset>) -> Key, snapshot_writer: &mut Option, ) { loop { @@ -610,8 +614,8 @@ where raw_read_data: &ReaderContext, offset: Option<&Offset>, parser: &mut dyn Parser, - input_session: &mut InputSession, - values_to_key: impl FnMut(Option>, Option<&Offset>) -> Key, + input_session: &mut dyn InputAdaptor, + values_to_key: impl FnMut(Option<&Vec>, Option<&Offset>) -> Key, snapshot_writer: &mut Option, ) { match parser.parse(raw_read_data) { @@ -629,20 +633,20 @@ where } } - fn on_insert( - key: Key, - values: Vec, - input_session: &mut InputSession, - ) { - input_session.insert((key, Value::Tuple(values.into()))); + fn on_insert(key: Key, values: Vec, input_session: &mut dyn InputAdaptor) { + input_session.insert(key, Value::Tuple(values.into())); } - fn on_remove( + fn on_upsert( key: Key, - values: Vec, - input_session: &mut InputSession, + values: Option>, + 
input_session: &mut dyn InputAdaptor, ) { - input_session.remove((key, Value::Tuple(values.into()))); + input_session.upsert(key, values.map(|v| Value::Tuple(v.into()))); + } + + fn on_remove(key: Key, values: Vec, input_session: &mut dyn InputAdaptor) { + input_session.remove(key, Value::Tuple(values.into())); } #[allow(clippy::too_many_arguments)] @@ -650,55 +654,49 @@ where &mut self, parsed_entries: Vec, offset: Option<&Offset>, - input_session: &mut InputSession, - mut values_to_key: impl FnMut(Option>, Option<&Offset>) -> Key, + input_session: &mut dyn InputAdaptor, + mut values_to_key: impl FnMut(Option<&Vec>, Option<&Offset>) -> Key, snapshot_writer: &mut Option, connector_monitor: &mut Option<&mut ConnectorMonitor>, ) { for entry in parsed_entries { + let key = entry.key(&mut values_to_key, offset); + if let Some(key) = key { + // true for Insert, Remove, Upsert + if let Some(ref mut connector_monitor) = connector_monitor { + connector_monitor.increment(); + } + + if let Some(snapshot_writer) = snapshot_writer { + // TODO: if the usage of Mutex+Arc hits the performance, add a buffered accessor here + // It must accumulate the data to the extent of the chunk size, and then unlock the mutex + // once and send the full chunk + let snapshot_event = entry + .snapshot_event(key) + .expect("Snapshot event not constructed"); + if let Err(e) = snapshot_writer.lock().unwrap().write(&snapshot_event) { + error!("Failed to save row ({entry:?}) in persistent buffer. Error: {e}"); + } + } + } + match entry { - ParsedEvent::Insert((raw_key, values)) => { + ParsedEvent::Insert((_, values)) => { if values.len() != self.num_columns { error!("There are {} tokens in the entry, but the expected number of tokens was {}", values.len(), self.num_columns); continue; } - let key = values_to_key(raw_key, offset); - if let Some(snapshot_writer) = snapshot_writer { - // TODO: if the usage of Mutex+Arc hits the performance, add a buffered accessor here - // It must accumulate the data to the extent of the chunk size, and then unlock the mutex - // once and send the full chunk - if let Err(e) = snapshot_writer - .lock() - .unwrap() - .write(&SnapshotEvent::Insert(key, values.clone())) - { - error!("Failed to save row ({key}, {values:?}) in persistent buffer. Error: {e}"); - } - } - Self::on_insert(key, values, input_session); - if let Some(ref mut connector_monitor) = connector_monitor { - connector_monitor.increment(); - } + Self::on_insert(key.expect("No key"), values, input_session); + } + ParsedEvent::Upsert((_, values)) => { + Self::on_upsert(key.expect("No key"), values, input_session); } - ParsedEvent::Delete((raw_key, values)) => { + ParsedEvent::Delete((_, values)) => { if values.len() != self.num_columns { error!("There are {} tokens in the entry, but the expected number of tokens was {}", values.len(), self.num_columns); continue; } - let key = values_to_key(raw_key, offset); - if let Some(snapshot_writer) = snapshot_writer { - if let Err(e) = snapshot_writer - .lock() - .unwrap() - .write(&SnapshotEvent::Delete(key, values.clone())) - { - error!("Failed to save row ({key}, {values:?}) in persistent buffer. 
Error: {e}"); - } - } - Self::on_remove(key, values, input_session); - if let Some(ref mut connector_monitor) = connector_monitor { - connector_monitor.increment(); - } + Self::on_remove(key.expect("No key"), values, input_session); } ParsedEvent::AdvanceTime => { let time_advanced = self.advance_time(input_session); diff --git a/src/connectors/snapshot.rs b/src/connectors/snapshot.rs index 11304a9b..9aa5fb5c 100644 --- a/src/connectors/snapshot.rs +++ b/src/connectors/snapshot.rs @@ -33,6 +33,7 @@ use crate::timestamp::current_unix_timestamp_ms; pub enum Event { Insert(Key, Vec), Delete(Key, Vec), + Upsert(Key, Option>), AdvanceTime(u64), Finished, } @@ -49,6 +50,14 @@ pub trait SnapshotReaderImpl { /// It must ensure that no further data is present in the snapshot so that when it gets appended, /// the unused data is not written next to the non-processed tail. fn truncate(&mut self) -> Result<(), ReadError>; + + /// This method will be called to check, whether snapshot reading should end, when timestamp exceeds + /// threshold from metadata or will the snapshot reader finish rewinding by itself. + /// + /// In the latter case, snapshot reader needs to end by sending `Event::Finished`. + fn check_threshold_from_metadata(&mut self) -> bool { + true + } } #[allow(clippy::module_name_repetitions)] @@ -528,6 +537,36 @@ impl SnapshotWriter for S3SnapshotWriter { } } +pub struct MockSnapshotReader { + events: Box>, +} + +impl MockSnapshotReader { + pub fn new(events: Vec) -> Self { + Self { + events: Box::new(events.into_iter()), + } + } +} + +impl SnapshotReaderImpl for MockSnapshotReader { + fn read(&mut self) -> Result { + if let Some(event) = self.events.next() { + Ok(event) + } else { + Ok(Event::Finished) + } + } + + fn truncate(&mut self) -> Result<(), ReadError> { + Ok(()) + } + + fn check_threshold_from_metadata(&mut self) -> bool { + false + } +} + #[allow(clippy::module_name_repetitions)] pub struct SnapshotReader { reader_impl: Box, @@ -558,7 +597,7 @@ impl SnapshotReader { pub fn read(&mut self) -> Result { let event = self.reader_impl.read()?; if let Event::AdvanceTime(new_time) = event { - if new_time >= self.threshold_time { + if self.reader_impl.check_threshold_from_metadata() && new_time >= self.threshold_time { if let Err(e) = self.reader_impl.truncate() { error!("Failed to truncate the snapshot, the next re-run may provide incorrect results: {e}"); return Err(e); diff --git a/src/connectors/upsert_session.rs b/src/connectors/upsert_session.rs deleted file mode 100644 index 4c842032..00000000 --- a/src/connectors/upsert_session.rs +++ /dev/null @@ -1,92 +0,0 @@ -use differential_dataflow::lattice::Lattice; -use differential_dataflow::operators::arrange::upsert::arrange_from_upsert; -use differential_dataflow::trace::implementations::ord::OrdValBatch; -use differential_dataflow::trace::implementations::spine_fueled::Spine; -use differential_dataflow::Collection; -use timely::dataflow::operators::input::Handle; -use timely::dataflow::operators::Input as TimelyInput; -use timely::order::TotalOrder; -use timely::progress::Timestamp as TimelyTimestamp; - -use crate::engine::dataflow::maybe_total::MaybeTotalScope; -use crate::engine::{Key, Value}; - -use std::rc::Rc; - -pub struct UpsertSession { - time: Timestamp, - buffer: Vec<(Key, Option, Timestamp)>, - handle: Handle, Timestamp)>, -} - -impl UpsertSession { - /* - The implementation below mostly reuses differetial dataflow's InputSession internals. 
- - The main difference is the interface of the `to_collection` method and more task-based - insert and remove methods. - */ - - pub fn new() -> Self { - let handle: Handle = Handle::new(); - UpsertSession { - time: handle.time().clone(), - buffer: Vec::new(), - handle, - } - } - - pub fn to_collection>( - &mut self, - scope: &mut S, - ) -> Collection { - arrange_from_upsert::>>>( - &scope.input_from(&mut self.handle), - "UpsertSession", - ) - .as_collection(|k, v| (*k, v.clone())) - } - - pub fn flush(&mut self) { - self.handle.send_batch(&mut self.buffer); - if self.handle.epoch().less_than(&self.time) { - self.handle.advance_to(self.time.clone()); - } - } - - pub fn advance_to(&mut self, time: Timestamp) { - assert!(self.handle.epoch().less_equal(&time)); - assert!(self.time.less_equal(&time)); - self.time = time; - } - - pub fn update(&mut self, key: Key, value: Option) { - if self.buffer.len() == self.buffer.capacity() { - if !self.buffer.is_empty() { - self.handle.send_batch(&mut self.buffer); - } - self.buffer.reserve(1024); - } - self.buffer.push((key, value, self.time.clone())); - } - - pub fn insert(&mut self, key: Key, value: Value) { - self.update(key, Some(value)); - } - - pub fn remove(&mut self, key: Key) { - self.update(key, None); - } -} - -impl Drop for UpsertSession { - fn drop(&mut self) { - self.flush(); - } -} - -impl Default for UpsertSession { - fn default() -> Self { - Self::new() - } -} diff --git a/src/engine/dataflow.rs b/src/engine/dataflow.rs index e395bb00..44b81133 100644 --- a/src/engine/dataflow.rs +++ b/src/engine/dataflow.rs @@ -5,6 +5,7 @@ pub mod maybe_total; pub mod operators; pub mod shard; +use crate::connectors::adaptors::{GenericValues, ValuesSessionAdaptor}; use crate::connectors::data_format::{Formatter, Parser}; use crate::connectors::data_storage::{ReaderBuilder, Writer}; use crate::connectors::monitoring::{ConnectorMonitor, ConnectorStats, OutputConnectorStats}; @@ -19,7 +20,7 @@ use crate::engine::value::HashInto; use crate::persistence::config::{PersistenceManagerConfig, PersistenceManagerOuterConfig}; use crate::persistence::sync::SharedWorkersPersistenceCoordinator; use crate::persistence::tracker::SingleWorkerPersistentStorage; -use crate::persistence::ExternalPersistentId; +use crate::persistence::{ExternalPersistentId, IntoPersistentId}; use std::any::type_name; use std::borrow::{Borrow, Cow}; @@ -45,7 +46,6 @@ use arcstr::ArcStr; use crossbeam_channel::{bounded, never, select, Receiver, RecvError, Sender}; use derivative::Derivative; use differential_dataflow::collection::concatenate; -use differential_dataflow::input::{Input, InputSession}; use differential_dataflow::lattice::Lattice; use differential_dataflow::operators::arrange::{Arranged, TraceAgent}; use differential_dataflow::operators::iterate::Variable; @@ -137,16 +137,12 @@ type ArrangedBySelf = type ArrangedByKey = Arranged::MaybeTotalTimestamp, R>>>; -type Session = InputSession<::MaybeTotalTimestamp, D, R>; - type Var = Variable; type Keys = Collection; type KeysArranged = ArrangedBySelf; type KeysVar = Var; -type GenericValues = Collection; type ValuesArranged = ArrangedByKey; -type ValuesSession = Session; type ValuesVar = Var; #[derive(Clone)] @@ -1203,8 +1199,11 @@ impl DataflowGraphInner { depth: usize, data: &Arc<[Value]>, ) -> Value { - if paths.len() == 1 && paths.first().unwrap().1.len() == depth { + if !paths.is_empty() && paths.first().unwrap().1.len() == depth { let id = paths.first().unwrap().0; + for (next_id, _path) in &paths[1..] 
{ + assert_eq!(data[id], data[*next_id]); + } return data[id].clone(); } let mut path_prefix = 0; @@ -2593,24 +2592,40 @@ enum OutputEvent { impl> DataflowGraphInner { fn connector_table( &mut self, - reader: Box, + mut reader: Box, parser: Box, commit_duration: Option, parallel_readers: usize, table_properties: Arc, - external_persistent_id: &Option, + external_persistent_id: Option<&ExternalPersistentId>, ) -> Result { let has_persistent_storage = self.worker_persistent_storage.is_some(); if let Some(external_persistent_id) = external_persistent_id { if !has_persistent_storage { return Err(Error::NoPersistentStorage(external_persistent_id.clone())); } - } else if has_persistent_storage && !reader.is_internal() { - return Err(Error::PersistentIdNotAssigned(reader.storage_type())); } - let (input_session, table_values): (ValuesSession, GenericValues) = - self.scope.new_collection(); + let effective_persistent_id = { + if external_persistent_id.is_some() { + external_persistent_id.cloned() + } else if has_persistent_storage && !reader.is_internal() { + let generated_external_id = reader.name(None, self.connector_monitors.len()); + reader + .update_persistent_id(Some(generated_external_id.clone().into_persistent_id())); + info!( + "Persistent ID autogenerated for a {:?} reader: {generated_external_id}", + reader.storage_type() + ); + Some(generated_external_id) + } else { + None + } + }; + + let (input_session, table_values): (ValuesSessionAdaptor, GenericValues) = + parser.session_type().new_collection(&mut self.scope); + let table_values = table_values.reshard(); table_values.probe_with(&mut self.input_probe); @@ -2655,14 +2670,14 @@ impl> DataflowGraphInner { return key; } } - Key::for_values(&values) + Key::for_values(values) } }, self.output_probe.clone(), self.worker_persistent_storage.clone(), self.connector_monitors.len(), realtime_reader_needed, - external_persistent_id, + effective_persistent_id.as_ref(), replay_mode, snapshot_access, self.error_reporter.clone(), @@ -3968,7 +3983,7 @@ where _commit_duration: Option, _parallel_readers: usize, _table_properties: Arc, - _external_persistent_id: &Option, + _external_persistent_id: Option<&ExternalPersistentId>, ) -> Result { Err(Error::IoNotPossible) } @@ -4527,7 +4542,7 @@ impl> Graph for OuterDataflowGraph commit_duration: Option, parallel_readers: usize, table_properties: Arc, - external_persistent_id: &Option, + external_persistent_id: Option<&ExternalPersistentId>, ) -> Result { self.0.borrow_mut().connector_table( reader, diff --git a/src/engine/error.rs b/src/engine/error.rs index b2b06e79..ad4ec6d6 100644 --- a/src/engine/error.rs +++ b/src/engine/error.rs @@ -7,7 +7,6 @@ use super::{Key, Value}; use crate::persistence::metadata_backends::Error as MetadataBackendError; use crate::connectors::data_storage::WriteError; -use crate::connectors::StorageType; use crate::persistence::ExternalPersistentId; #[allow(clippy::module_name_repetitions)] @@ -150,11 +149,6 @@ pub enum Error { #[error("persistent id {0} is assigned, but no persistent storage is configured")] NoPersistentStorage(ExternalPersistentId), - #[error( - "persistent storage is configured, but persistent id is not assigned for {0:?} reader" - )] - PersistentIdNotAssigned(StorageType), - #[error("snapshot writer failed: {0}")] SnapshotWriterError(#[source] WriteError), } @@ -200,7 +194,7 @@ impl From for Error { pub type Result = result::Result; -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq, Eq)] pub enum Trace { Frame { line: String, diff --git 
a/src/engine/graph.rs b/src/engine/graph.rs index 26b85edf..28722504 100644 --- a/src/engine/graph.rs +++ b/src/engine/graph.rs @@ -232,14 +232,14 @@ impl ComplexColumn { } } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq, Eq)] pub struct ColumnProperties { pub dtype: Type, pub append_only: bool, pub trace: Trace, } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq, Eq)] pub enum TableProperties { Table(Arc<[TableProperties]>), Column(Arc), @@ -260,9 +260,18 @@ impl TableProperties { fn produce_nested_tuple( props: &[(Vec, TableProperties)], depth: usize, - ) -> TableProperties { - if props.len() == 1 && props.first().unwrap().0.len() == depth { - return props.first().unwrap().1.clone(); + ) -> Result { + if !props.is_empty() && props.first().unwrap().0.len() == depth { + let first = &props.first().unwrap().1; + for (_path, other) in &props[1..] { + assert_eq!(first, other); + if first != other { + return Err(Error::ValueError( + "Properties of two columns with the same path are not equal".into(), + )); + } + } + return Ok(first.clone()); } let mut prefix = 0; let mut begin = 0; @@ -281,11 +290,11 @@ impl TableProperties { continue; } assert!(begin < end); - result.push(produce_nested_tuple(&props[begin..end], depth + 1)); + result.push(produce_nested_tuple(&props[begin..end], depth + 1)?); begin = end; } - TableProperties::Table(result.as_slice().into()) + Ok(TableProperties::Table(result.as_slice().into())) } let mut properties: Vec<(Vec, TableProperties)> = properties @@ -300,7 +309,7 @@ impl TableProperties { properties.sort_unstable_by(|(left_path, _), (right_path, _)| left_path.cmp(right_path)); - Ok(produce_nested_tuple(properties.as_slice(), 0)) + produce_nested_tuple(properties.as_slice(), 0) } pub fn trace(&self) -> &Trace { @@ -693,7 +702,7 @@ pub trait Graph { commit_duration: Option, parallel_readers: usize, table_properties: Arc, - external_persistent_id: &Option, + external_persistent_id: Option<&ExternalPersistentId>, ) -> Result; fn output_table( @@ -1232,7 +1241,7 @@ impl Graph for ScopedGraph { commit_duration: Option, parallel_readers: usize, table_properties: Arc, - external_persistent_id: &Option, + external_persistent_id: Option<&ExternalPersistentId>, ) -> Result { self.try_with(|g| { g.connector_table( diff --git a/src/engine/value.rs b/src/engine/value.rs index 6876287e..bd31710e 100644 --- a/src/engine/value.rs +++ b/src/engine/value.rs @@ -414,7 +414,7 @@ pub enum SimpleType { Json, } -#[derive(Debug, Default, Clone, Copy)] +#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] pub enum Type { #[default] Any, diff --git a/src/persistence/config.rs b/src/persistence/config.rs index 5e0d20eb..76dacbfa 100644 --- a/src/persistence/config.rs +++ b/src/persistence/config.rs @@ -13,20 +13,24 @@ use s3::bucket::Bucket as S3Bucket; use crate::connectors::data_storage::S3CommandName; use crate::connectors::data_storage::{ReadError, WriteError}; use crate::connectors::snapshot::{ - LocalBinarySnapshotReader, LocalBinarySnapshotWriter, S3SnapshotReader, S3SnapshotWriter, - SnapshotReader, SnapshotReaderImpl, + Event, LocalBinarySnapshotReader, LocalBinarySnapshotWriter, MockSnapshotReader, + S3SnapshotReader, S3SnapshotWriter, SnapshotReader, SnapshotReaderImpl, }; use crate::connectors::{ReplayMode, SnapshotAccess}; use crate::deepcopy::DeepCopy; use crate::fs_helpers::ensure_directory; use crate::persistence::metadata_backends::Error as MetadataBackendError; -use crate::persistence::metadata_backends::{FilesystemKVStorage, MetadataBackend, 
S3KVStorage}; +use crate::persistence::metadata_backends::{ + FilesystemKVStorage, MetadataBackend, MockKVStorage, S3KVStorage, +}; use crate::persistence::state::MetadataAccessor; use crate::persistence::sync::WorkersPersistenceCoordinator; use crate::persistence::{PersistentId, SharedSnapshotWriter}; const STREAMS_DIRECTORY_NAME: &str = "streams"; +pub type ConnectorWorkerPair = (PersistentId, usize); + /// Metadata storage handles the frontier over all persisted data sources. /// When we restart the computation, it will start from the frontier stored /// in the metadata storage @@ -34,6 +38,7 @@ const STREAMS_DIRECTORY_NAME: &str = "streams"; pub enum MetadataStorageConfig { Filesystem(PathBuf), S3 { bucket: S3Bucket, root_path: String }, + Mock, } /// Stream storage handles the snapshot, which will be loaded when Pathway @@ -42,6 +47,7 @@ pub enum MetadataStorageConfig { pub enum StreamStorageConfig { Filesystem(PathBuf), S3 { bucket: S3Bucket, root_path: String }, + Mock(HashMap>), } /// Persistence in Pathway consists of two parts: actual frontier @@ -51,7 +57,7 @@ pub enum StreamStorageConfig { /// which is passed from Python program by user. #[derive(Debug, Clone)] pub struct PersistenceManagerOuterConfig { - refresh_duration: Duration, + snapshot_interval: Duration, metadata_storage: MetadataStorageConfig, stream_storage: StreamStorageConfig, snapshot_access: SnapshotAccess, @@ -61,7 +67,7 @@ pub struct PersistenceManagerOuterConfig { impl PersistenceManagerOuterConfig { pub fn new( - refresh_duration: Duration, + snapshot_interval: Duration, metadata_storage: MetadataStorageConfig, stream_storage: StreamStorageConfig, snapshot_access: SnapshotAccess, @@ -69,7 +75,7 @@ impl PersistenceManagerOuterConfig { continue_after_replay: bool, ) -> Self { Self { - refresh_duration, + snapshot_interval, metadata_storage, stream_storage, snapshot_access, @@ -86,7 +92,7 @@ impl PersistenceManagerOuterConfig { &self, num_workers: usize, ) -> WorkersPersistenceCoordinator { - WorkersPersistenceCoordinator::new(self.refresh_duration, num_workers) + WorkersPersistenceCoordinator::new(self.snapshot_interval, num_workers) } } @@ -128,6 +134,7 @@ impl PersistenceManagerConfig { MetadataStorageConfig::S3 { bucket, root_path } => { Box::new(S3KVStorage::new(bucket.deep_copy(), root_path)) } + MetadataStorageConfig::Mock => Box::new(MockKVStorage {}), }; MetadataAccessor::new(backend, self.worker_id) } @@ -160,6 +167,16 @@ impl PersistenceManagerConfig { } reader_impls } + StreamStorageConfig::Mock(event_map) => { + let mut reader_impls = HashMap::>::new(); + let events = event_map + .get(&(persistent_id, self.worker_id)) + .unwrap_or(&vec![]) + .clone(); + let reader = MockSnapshotReader::new(events); + reader_impls.insert(self.worker_id, Box::new(reader)); + reader_impls + } }; for (worker_id, reader_impl) in reader_impls { @@ -191,6 +208,9 @@ impl PersistenceManagerConfig { &snapshot_path, )))) } + StreamStorageConfig::Mock(_) => { + unreachable!() + } } } diff --git a/src/persistence/metadata_backends/mock.rs b/src/persistence/metadata_backends/mock.rs new file mode 100644 index 00000000..5e2406a9 --- /dev/null +++ b/src/persistence/metadata_backends/mock.rs @@ -0,0 +1,19 @@ +use crate::persistence::metadata_backends::{Error, MetadataBackend}; + +#[derive(Debug)] +#[allow(clippy::module_name_repetitions)] +pub struct MockKVStorage {} + +impl MetadataBackend for MockKVStorage { + fn list_keys(&self) -> Result, Error> { + Ok(vec![]) + } + + fn get_value(&self, _key: &str) -> Result { + unreachable!() 
+ } + + fn put_value(&mut self, _key: &str, _value: &str) -> Result<(), Error> { + Ok(()) + } +} diff --git a/src/persistence/metadata_backends/mod.rs b/src/persistence/metadata_backends/mod.rs index 8ee58dfb..05be5e9d 100644 --- a/src/persistence/metadata_backends/mod.rs +++ b/src/persistence/metadata_backends/mod.rs @@ -6,8 +6,10 @@ use ::s3::error::S3Error; use serde_json::Error as ParseError; pub mod file; +pub mod mock; pub mod s3; pub use file::FilesystemKVStorage; +pub use mock::MockKVStorage; pub use s3::S3KVStorage; #[derive(Debug, thiserror::Error)] diff --git a/src/persistence/mod.rs b/src/persistence/mod.rs index e458e28c..e45a3c08 100644 --- a/src/persistence/mod.rs +++ b/src/persistence/mod.rs @@ -1,5 +1,7 @@ use std::sync::{Arc, Mutex}; +use xxhash_rust::xxh3::Xxh3 as Hasher; + use crate::connectors::snapshot::SnapshotWriter; pub mod config; @@ -12,3 +14,15 @@ pub mod tracker; pub type PersistentId = u128; pub type ExternalPersistentId = String; pub type SharedSnapshotWriter = Arc>; + +pub trait IntoPersistentId { + fn into_persistent_id(self) -> PersistentId; +} + +impl IntoPersistentId for ExternalPersistentId { + fn into_persistent_id(self) -> PersistentId { + let mut hasher = Hasher::default(); + hasher.update(self.as_bytes()); + hasher.digest128() + } +} diff --git a/src/python_api.rs b/src/python_api.rs index fac98c93..830de042 100644 --- a/src/python_api.rs +++ b/src/python_api.rs @@ -46,19 +46,19 @@ use std::os::unix::prelude::*; use std::sync::{Arc, Mutex}; use std::thread; use std::time; -use xxhash_rust::xxh3::Xxh3 as Hasher; use self::threads::PythonThreadState; use crate::connectors::data_format::{ - DebeziumMessageParser, DsvSettings, Formatter, IdentityParser, InnerSchemaField, - JsonLinesFormatter, JsonLinesParser, NullFormatter, Parser, PsqlSnapshotFormatter, - PsqlUpdatesFormatter, + DebeziumDBType, DebeziumMessageParser, DsvSettings, Formatter, IdentityParser, + InnerSchemaField, JsonLinesFormatter, JsonLinesParser, NullFormatter, Parser, + PsqlSnapshotFormatter, PsqlUpdatesFormatter, }; use crate::connectors::data_storage::{ ConnectorMode, CsvFilesystemReader, ElasticSearchWriter, FileWriter, FilesystemReader, KafkaReader, KafkaWriter, NullWriter, PsqlWriter, PythonReaderBuilder, ReadMethod, ReaderBuilder, S3CsvReader, S3GenericReader, Writer, }; +use crate::connectors::snapshot::Event as SnapshotEvent; use crate::connectors::{ReplayMode, SnapshotAccess}; use crate::engine::dataflow::config_from_env; use crate::engine::error::{DynError, DynResult, Trace as EngineTrace}; @@ -81,9 +81,9 @@ use crate::engine::{Expression, IntExpression}; use crate::engine::{FloatExpression, Graph}; use crate::engine::{LegacyTable as EngineLegacyTable, StringExpression}; use crate::persistence::config::{ - MetadataStorageConfig, PersistenceManagerOuterConfig, StreamStorageConfig, + ConnectorWorkerPair, MetadataStorageConfig, PersistenceManagerOuterConfig, StreamStorageConfig, }; -use crate::persistence::{ExternalPersistentId, PersistentId}; +use crate::persistence::{ExternalPersistentId, IntoPersistentId, PersistentId}; use crate::pipe::{pipe, ReaderType, WriterType}; use s3::creds::Credentials as AwsCredentials; @@ -410,6 +410,18 @@ impl IntoPy for ConnectorMode { } } +impl<'source> FromPyObject<'source> for DebeziumDBType { + fn extract(ob: &'source PyAny) -> PyResult { + Ok(ob.extract::>()?.0) + } +} + +impl IntoPy for DebeziumDBType { + fn into_py(self, py: Python<'_>) -> PyObject { + PyDebeziumDBType(self).into_py(py) + } +} + impl<'source> FromPyObject<'source> for 
MonitoringLevel { fn extract(ob: &'source PyAny) -> PyResult<Self> { Ok(ob.extract::<PyRef<PyMonitoringLevel>>()?.0) @@ -443,7 +455,6 @@ impl From<EngineError> for PyErr { EngineError::DivisionByZero => PyZeroDivisionError::type_object(py), EngineError::IterationLimitTooSmall | EngineError::ValueError(_) - | EngineError::PersistentIdNotAssigned(_) | EngineError::NoPersistentStorage(_) | EngineError::ParseError(_) => PyValueError::type_object(py), EngineError::IndexOutOfBounds => PyIndexError::type_object(py), @@ -1256,6 +1267,17 @@ impl PyConnectorMode { pub const STREAMING_WITH_DELETIONS: ConnectorMode = ConnectorMode::StreamingWithDeletions; } +#[pyclass(module = "pathway.engine", frozen, name = "DebeziumDBType")] +pub struct PyDebeziumDBType(DebeziumDBType); + +#[pymethods] +impl PyDebeziumDBType { + #[classattr] + pub const POSTGRES: DebeziumDBType = DebeziumDBType::Postgres; + #[classattr] + pub const MONGO_DB: DebeziumDBType = DebeziumDBType::MongoDB; +} + #[pyclass(module = "pathway.engine", frozen, name = "MonitoringLevel")] pub struct PyMonitoringLevel(MonitoringLevel); @@ -1815,7 +1837,7 @@ impl Scope { .map(time::Duration::from_millis), parallel_readers, Arc::new(EngineTableProperties::flat(column_properties)), - &persistent_id, + persistent_id.as_ref(), )?; Table::new(self_, table_handle) } @@ -2955,6 +2977,8 @@ pub struct DataStorage { persistent_id: Option<ExternalPersistentId>, max_batch_size: Option<usize>, object_pattern: String, + with_metadata: bool, + mock_events: Option<HashMap<(ExternalPersistentId, usize), Vec<SnapshotEvent>>>, } #[pyclass(module = "pathway.engine", frozen, name = "ReplayMode")] @@ -3012,7 +3036,7 @@ impl IntoPy<PyObject> for SnapshotAccess { #[derive(Clone, Debug)] #[pyclass(module = "pathway.engine", frozen)] pub struct PersistenceConfig { - refresh_duration: ::std::time::Duration, + snapshot_interval: ::std::time::Duration, metadata_storage: DataStorage, stream_storage: DataStorage, snapshot_access: SnapshotAccess, @@ -3025,7 +3049,7 @@ impl PersistenceConfig { #[new] #[pyo3(signature = ( *, - refresh_duration_ms, + snapshot_interval_ms, metadata_storage, stream_storage, snapshot_access = SnapshotAccess::Full, @@ -3033,7 +3057,7 @@ impl PersistenceConfig { continue_after_replay = true, ))] fn new( - refresh_duration_ms: u64, + snapshot_interval_ms: u64, metadata_storage: DataStorage, stream_storage: DataStorage, snapshot_access: SnapshotAccess, @@ -3041,7 +3065,7 @@ impl PersistenceConfig { continue_after_replay: bool, ) -> Self { Self { - refresh_duration: ::std::time::Duration::from_millis(refresh_duration_ms), + snapshot_interval: ::std::time::Duration::from_millis(snapshot_interval_ms), metadata_storage, stream_storage, snapshot_access, @@ -3054,7 +3078,7 @@ impl PersistenceConfig { impl PersistenceConfig { fn prepare(self, py: pyo3::Python) -> PyResult<PersistenceManagerOuterConfig> { Ok(PersistenceManagerOuterConfig::new( - self.refresh_duration, + self.snapshot_interval, self.metadata_storage .construct_metadata_storage_config(py)?, self.stream_storage.construct_stream_storage_config(py)?, @@ -3065,6 +3089,39 @@ impl PersistenceConfig { } } +impl<'source> FromPyObject<'source> for SnapshotEvent { + fn extract(ob: &'source PyAny) -> PyResult<Self> { + Ok(ob.extract::<PyRef<PySnapshotEvent>>()?.0.clone()) + } +} + +impl IntoPy<PyObject> for SnapshotEvent { + fn into_py(self, py: Python<'_>) -> PyObject { + PySnapshotEvent(self).into_py(py) + } +} + +#[pyclass(module = "pathway.engine", frozen, name = "SnapshotEvent")] +pub struct PySnapshotEvent(SnapshotEvent); + +#[pymethods] +impl PySnapshotEvent { + #[staticmethod] + pub fn insert(key: Key, values: Vec<Value>) -> SnapshotEvent { + SnapshotEvent::Insert(key, values) + } + #[staticmethod] + pub fn
delete(key: Key, values: Vec) -> SnapshotEvent { + SnapshotEvent::Delete(key, values) + } + #[staticmethod] + pub fn advance_time(timestamp: u64) -> SnapshotEvent { + SnapshotEvent::AdvanceTime(timestamp) + } + #[classattr] + pub const FINISHED: SnapshotEvent = SnapshotEvent::Finished; +} + #[pyclass(module = "pathway.engine", frozen)] #[derive(Clone)] pub struct PythonSubject { @@ -3130,6 +3187,7 @@ pub struct DataFormat { column_paths: Option>, field_absence_is_error: bool, parse_utf8: bool, + debezium_db_type: DebeziumDBType, } #[pymethods] @@ -3151,6 +3209,8 @@ impl DataStorage { persistent_id = None, max_batch_size = None, object_pattern = "*".to_string(), + with_metadata = false, + mock_events = None, ))] #[allow(clippy::too_many_arguments)] fn new( @@ -3169,6 +3229,8 @@ impl DataStorage { persistent_id: Option, max_batch_size: Option, object_pattern: String, + with_metadata: bool, + mock_events: Option>>, ) -> Self { DataStorage { storage_type, @@ -3186,6 +3248,8 @@ impl DataStorage { persistent_id, max_batch_size, object_pattern, + with_metadata, + mock_events, } } } @@ -3203,6 +3267,7 @@ impl DataFormat { column_paths = None, field_absence_is_error = true, parse_utf8 = true, + debezium_db_type = DebeziumDBType::Postgres, ))] #[allow(clippy::too_many_arguments)] fn new( @@ -3214,6 +3279,7 @@ impl DataFormat { column_paths: Option>, field_absence_is_error: bool, parse_utf8: bool, + debezium_db_type: DebeziumDBType, ) -> Self { DataFormat { format_type, @@ -3224,6 +3290,7 @@ impl DataFormat { column_paths, field_absence_is_error, parse_utf8, + debezium_db_type, } } } @@ -3366,11 +3433,9 @@ impl DataStorage { } fn internal_persistent_id(&self) -> Option { - self.persistent_id.clone().map(|external_persistent_id| { - let mut hasher = Hasher::default(); - hasher.update(external_persistent_id.as_bytes()); - hasher.digest128() - }) + self.persistent_id + .clone() + .map(IntoPersistentId::into_persistent_id) } fn construct_reader(&self, py: pyo3::Python) -> PyResult<(Box, usize)> { @@ -3382,6 +3447,7 @@ impl DataStorage { self.internal_persistent_id(), self.read_method, &self.object_pattern, + self.with_metadata, ) .map_err(|e| { PyIOError::new_err(format!("Failed to initialize Filesystem reader: {e}")) @@ -3419,6 +3485,7 @@ impl DataStorage { self.mode, self.internal_persistent_id(), &self.object_pattern, + self.with_metadata, ) .map_err(|e| { PyIOError::new_err(format!("Failed to initialize CsvFilesystem reader: {e}")) @@ -3474,6 +3541,16 @@ impl DataStorage { root_path: path.into(), }) } + "mock" => { + let mut events = HashMap::>::new(); + for ((external_persistent_id, worker_id), es) in self.mock_events.as_ref().unwrap() + { + let internal_persistent_id = + external_persistent_id.clone().into_persistent_id(); + events.insert((internal_persistent_id, *worker_id), es.clone()); + } + Ok(StreamStorageConfig::Mock(events)) + } other => Err(PyValueError::new_err(format!( "Unsupported snapshot storage format: {other:?}" ))), @@ -3494,6 +3571,7 @@ impl DataStorage { root_path: path.into(), }) } + "mock" => Ok(MetadataStorageConfig::Mock), other => Err(PyValueError::new_err(format!( "Unsupported metadata storage format: {other:?}" ))), @@ -3632,6 +3710,7 @@ impl DataFormat { self.key_field_names.clone(), self.value_field_names(py), DebeziumMessageParser::standard_separator(), + self.debezium_db_type, ); Ok(Box::new(parser)) } @@ -3645,7 +3724,10 @@ impl DataFormat { ); Ok(Box::new(parser)) } - "identity" => Ok(Box::new(IdentityParser::new(self.parse_utf8))), + "identity" => 
Ok(Box::new(IdentityParser::new( + self.value_field_names(py), + self.parse_utf8, + ))), _ => Err(PyValueError::new_err("Unknown data format")), } } @@ -3965,6 +4047,7 @@ fn module(_py: Python<'_>, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; @@ -3986,6 +4069,7 @@ fn module(_py: Python<'_>, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/tests/data/jsonlines/one.jsonlines b/tests/data/jsonlines/one.jsonlines new file mode 100644 index 00000000..16124cef --- /dev/null +++ b/tests/data/jsonlines/one.jsonlines @@ -0,0 +1 @@ +{"a": 10, "b": 20} diff --git a/tests/data/jsonlines/two.jsonlines b/tests/data/jsonlines/two.jsonlines new file mode 100644 index 00000000..b6040190 --- /dev/null +++ b/tests/data/jsonlines/two.jsonlines @@ -0,0 +1 @@ +{"a": 20, "b": 30} diff --git a/tests/data/sample_debezium_mongodb.txt b/tests/data/sample_debezium_mongodb.txt new file mode 100644 index 00000000..f7a46ef8 --- /dev/null +++ b/tests/data/sample_debezium_mongodb.txt @@ -0,0 +1,8 @@ +{"schema":{"type":"struct","fields":[{"type":"string","optional":false,"field":"id"}],"optional":false,"name":"dbserver1.inventory.customers.Key"},"payload":{"id":"1001"}} {"schema":{"type":"struct","fields":[{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"before"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"after"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"patch"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"filter"},{"type":"struct","fields":[{"type":"array","items":{"type":"string","optional":false},"optional":true,"field":"removedFields"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"updatedFields"},{"type":"array","items":{"type":"struct","fields":[{"type":"string","optional":false,"field":"field"},{"type":"int32","optional":false,"field":"size"}],"optional":false,"name":"io.debezium.connector.mongodb.changestream.truncatedarray","version":1},"optional":true,"field":"truncatedArrays"}],"optional":true,"name":"io.debezium.connector.mongodb.changestream.updatedescription","version":1,"field":"updateDescription"},{"type":"struct","fields":[{"type":"string","optional":false,"field":"version"},{"type":"string","optional":false,"field":"connector"},{"type":"string","optional":false,"field":"name"},{"type":"int64","optional":false,"field":"ts_ms"},{"type":"string","optional":true,"name":"io.debezium.data.Enum","version":1,"parameters":{"allowed":"true,last,false,incremental"},"default":"false","field":"snapshot"},{"type":"string","optional":false,"field":"db"},{"type":"string","optional":true,"field":"sequence"},{"type":"string","optional":false,"field":"rs"},{"type":"string","optional":false,"field":"collection"},{"type":"int32","optional":false,"field":"ord"},{"type":"string","optional":true,"field":"lsid"},{"type":"int64","optional":true,"field":"txnNumber"}],"optional":false,"name":"io.debezium.connector.mongo.Source","field":"source"},{"type":"string","optional":true,"field":"op"},{"type":"int64","optional":true,"field":"ts_ms"},{"type":"struct","fields":[{"type":"string","optional":false,"field":"id"},{"type":"int64","optional":false,"field":"total_order"},{"type":"int64","optional":false,"field":"data_collect
ion_order"}],"optional":true,"name":"event.block","version":1,"field":"transaction"}],"optional":false,"name":"dbserver1.inventory.customers.Envelope"},"payload":{"before":null,"after":"{\"_id\": {\"$numberLong\": \"1001\"},\"first_name\": \"Sally\",\"last_name\": \"Thomas\",\"email\": \"sally.thomas@acme.com\"}","patch":null,"filter":null,"updateDescription":null,"source":{"version":"2.1.4.Final","connector":"mongodb","name":"dbserver1","ts_ms":1696603750000,"snapshot":"true","db":"inventory","sequence":null,"rs":"rs0","collection":"customers","ord":1,"lsid":null,"txnNumber":null},"op":"r","ts_ms":1696603754156,"transaction":null}} +{"schema":{"type":"struct","fields":[{"type":"string","optional":false,"field":"id"}],"optional":false,"name":"dbserver1.inventory.customers.Key"},"payload":{"id":"1002"}} {"schema":{"type":"struct","fields":[{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"before"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"after"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"patch"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"filter"},{"type":"struct","fields":[{"type":"array","items":{"type":"string","optional":false},"optional":true,"field":"removedFields"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"updatedFields"},{"type":"array","items":{"type":"struct","fields":[{"type":"string","optional":false,"field":"field"},{"type":"int32","optional":false,"field":"size"}],"optional":false,"name":"io.debezium.connector.mongodb.changestream.truncatedarray","version":1},"optional":true,"field":"truncatedArrays"}],"optional":true,"name":"io.debezium.connector.mongodb.changestream.updatedescription","version":1,"field":"updateDescription"},{"type":"struct","fields":[{"type":"string","optional":false,"field":"version"},{"type":"string","optional":false,"field":"connector"},{"type":"string","optional":false,"field":"name"},{"type":"int64","optional":false,"field":"ts_ms"},{"type":"string","optional":true,"name":"io.debezium.data.Enum","version":1,"parameters":{"allowed":"true,last,false,incremental"},"default":"false","field":"snapshot"},{"type":"string","optional":false,"field":"db"},{"type":"string","optional":true,"field":"sequence"},{"type":"string","optional":false,"field":"rs"},{"type":"string","optional":false,"field":"collection"},{"type":"int32","optional":false,"field":"ord"},{"type":"string","optional":true,"field":"lsid"},{"type":"int64","optional":true,"field":"txnNumber"}],"optional":false,"name":"io.debezium.connector.mongo.Source","field":"source"},{"type":"string","optional":true,"field":"op"},{"type":"int64","optional":true,"field":"ts_ms"},{"type":"struct","fields":[{"type":"string","optional":false,"field":"id"},{"type":"int64","optional":false,"field":"total_order"},{"type":"int64","optional":false,"field":"data_collection_order"}],"optional":true,"name":"event.block","version":1,"field":"transaction"}],"optional":false,"name":"dbserver1.inventory.customers.Envelope"},"payload":{"before":null,"after":"{\"_id\": {\"$numberLong\": \"1002\"},\"first_name\": \"George\",\"last_name\": \"Bailey\",\"email\": 
\"gbailey@foobar.com\"}","patch":null,"filter":null,"updateDescription":null,"source":{"version":"2.1.4.Final","connector":"mongodb","name":"dbserver1","ts_ms":1696603750000,"snapshot":"true","db":"inventory","sequence":null,"rs":"rs0","collection":"customers","ord":1,"lsid":null,"txnNumber":null},"op":"r","ts_ms":1696603754156,"transaction":null}} +{"schema":{"type":"struct","fields":[{"type":"string","optional":false,"field":"id"}],"optional":false,"name":"dbserver1.inventory.customers.Key"},"payload":{"id":"1003"}} {"schema":{"type":"struct","fields":[{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"before"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"after"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"patch"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"filter"},{"type":"struct","fields":[{"type":"array","items":{"type":"string","optional":false},"optional":true,"field":"removedFields"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"updatedFields"},{"type":"array","items":{"type":"struct","fields":[{"type":"string","optional":false,"field":"field"},{"type":"int32","optional":false,"field":"size"}],"optional":false,"name":"io.debezium.connector.mongodb.changestream.truncatedarray","version":1},"optional":true,"field":"truncatedArrays"}],"optional":true,"name":"io.debezium.connector.mongodb.changestream.updatedescription","version":1,"field":"updateDescription"},{"type":"struct","fields":[{"type":"string","optional":false,"field":"version"},{"type":"string","optional":false,"field":"connector"},{"type":"string","optional":false,"field":"name"},{"type":"int64","optional":false,"field":"ts_ms"},{"type":"string","optional":true,"name":"io.debezium.data.Enum","version":1,"parameters":{"allowed":"true,last,false,incremental"},"default":"false","field":"snapshot"},{"type":"string","optional":false,"field":"db"},{"type":"string","optional":true,"field":"sequence"},{"type":"string","optional":false,"field":"rs"},{"type":"string","optional":false,"field":"collection"},{"type":"int32","optional":false,"field":"ord"},{"type":"string","optional":true,"field":"lsid"},{"type":"int64","optional":true,"field":"txnNumber"}],"optional":false,"name":"io.debezium.connector.mongo.Source","field":"source"},{"type":"string","optional":true,"field":"op"},{"type":"int64","optional":true,"field":"ts_ms"},{"type":"struct","fields":[{"type":"string","optional":false,"field":"id"},{"type":"int64","optional":false,"field":"total_order"},{"type":"int64","optional":false,"field":"data_collection_order"}],"optional":true,"name":"event.block","version":1,"field":"transaction"}],"optional":false,"name":"dbserver1.inventory.customers.Envelope"},"payload":{"before":null,"after":"{\"_id\": {\"$numberLong\": \"1003\"},\"first_name\": \"Edward\",\"last_name\": \"Walker\",\"email\": \"ed@walker.com\"}","patch":null,"filter":null,"updateDescription":null,"source":{"version":"2.1.4.Final","connector":"mongodb","name":"dbserver1","ts_ms":1696603750000,"snapshot":"true","db":"inventory","sequence":null,"rs":"rs0","collection":"customers","ord":1,"lsid":null,"txnNumber":null},"op":"r","ts_ms":1696603754156,"transaction":null}} +{"schema":{"type":"struct","fields":[{"type":"string","optional":false,"field":"id"}],"optional":false,"name":"dbserver1.inventory.customers.Key"},"payload":{"id":"1004"}} 
{"schema":{"type":"struct","fields":[{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"before"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"after"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"patch"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"filter"},{"type":"struct","fields":[{"type":"array","items":{"type":"string","optional":false},"optional":true,"field":"removedFields"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"updatedFields"},{"type":"array","items":{"type":"struct","fields":[{"type":"string","optional":false,"field":"field"},{"type":"int32","optional":false,"field":"size"}],"optional":false,"name":"io.debezium.connector.mongodb.changestream.truncatedarray","version":1},"optional":true,"field":"truncatedArrays"}],"optional":true,"name":"io.debezium.connector.mongodb.changestream.updatedescription","version":1,"field":"updateDescription"},{"type":"struct","fields":[{"type":"string","optional":false,"field":"version"},{"type":"string","optional":false,"field":"connector"},{"type":"string","optional":false,"field":"name"},{"type":"int64","optional":false,"field":"ts_ms"},{"type":"string","optional":true,"name":"io.debezium.data.Enum","version":1,"parameters":{"allowed":"true,last,false,incremental"},"default":"false","field":"snapshot"},{"type":"string","optional":false,"field":"db"},{"type":"string","optional":true,"field":"sequence"},{"type":"string","optional":false,"field":"rs"},{"type":"string","optional":false,"field":"collection"},{"type":"int32","optional":false,"field":"ord"},{"type":"string","optional":true,"field":"lsid"},{"type":"int64","optional":true,"field":"txnNumber"}],"optional":false,"name":"io.debezium.connector.mongo.Source","field":"source"},{"type":"string","optional":true,"field":"op"},{"type":"int64","optional":true,"field":"ts_ms"},{"type":"struct","fields":[{"type":"string","optional":false,"field":"id"},{"type":"int64","optional":false,"field":"total_order"},{"type":"int64","optional":false,"field":"data_collection_order"}],"optional":true,"name":"event.block","version":1,"field":"transaction"}],"optional":false,"name":"dbserver1.inventory.customers.Envelope"},"payload":{"before":null,"after":"{\"_id\": {\"$numberLong\": \"1004\"},\"first_name\": \"Anne\",\"last_name\": \"Kretchmar\",\"email\": \"annek@noanswer.org\"}","patch":null,"filter":null,"updateDescription":null,"source":{"version":"2.1.4.Final","connector":"mongodb","name":"dbserver1","ts_ms":1696603750000,"snapshot":"last","db":"inventory","sequence":null,"rs":"rs0","collection":"customers","ord":1,"lsid":null,"txnNumber":null},"op":"r","ts_ms":1696603754156,"transaction":null}} +{"schema":{"type":"struct","fields":[{"type":"string","optional":false,"field":"id"}],"optional":false,"name":"dbserver1.inventory.customers.Key"},"payload":{"id":"1005"}} 
{"schema":{"type":"struct","fields":[{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"before"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"after"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"patch"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"filter"},{"type":"struct","fields":[{"type":"array","items":{"type":"string","optional":false},"optional":true,"field":"removedFields"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"updatedFields"},{"type":"array","items":{"type":"struct","fields":[{"type":"string","optional":false,"field":"field"},{"type":"int32","optional":false,"field":"size"}],"optional":false,"name":"io.debezium.connector.mongodb.changestream.truncatedarray","version":1},"optional":true,"field":"truncatedArrays"}],"optional":true,"name":"io.debezium.connector.mongodb.changestream.updatedescription","version":1,"field":"updateDescription"},{"type":"struct","fields":[{"type":"string","optional":false,"field":"version"},{"type":"string","optional":false,"field":"connector"},{"type":"string","optional":false,"field":"name"},{"type":"int64","optional":false,"field":"ts_ms"},{"type":"string","optional":true,"name":"io.debezium.data.Enum","version":1,"parameters":{"allowed":"true,last,false,incremental"},"default":"false","field":"snapshot"},{"type":"string","optional":false,"field":"db"},{"type":"string","optional":true,"field":"sequence"},{"type":"string","optional":false,"field":"rs"},{"type":"string","optional":false,"field":"collection"},{"type":"int32","optional":false,"field":"ord"},{"type":"string","optional":true,"field":"lsid"},{"type":"int64","optional":true,"field":"txnNumber"}],"optional":false,"name":"io.debezium.connector.mongo.Source","field":"source"},{"type":"string","optional":true,"field":"op"},{"type":"int64","optional":true,"field":"ts_ms"},{"type":"struct","fields":[{"type":"string","optional":false,"field":"id"},{"type":"int64","optional":false,"field":"total_order"},{"type":"int64","optional":false,"field":"data_collection_order"}],"optional":true,"name":"event.block","version":1,"field":"transaction"}],"optional":false,"name":"dbserver1.inventory.customers.Envelope"},"payload":{"before":null,"after":"{\"_id\": {\"$numberLong\": \"1005\"},\"first_name\": \"Bob\",\"last_name\": \"Hopper\",\"email\": \"thebob@example.com\",\"unique_id\": {\"$binary\": \"7U4LYfBwQSOh9XODt78boA==\",\"$type\": \"04\"}}","patch":null,"filter":null,"updateDescription":null,"source":{"version":"2.1.4.Final","connector":"mongodb","name":"dbserver1","ts_ms":1696603793000,"snapshot":"false","db":"inventory","sequence":null,"rs":"rs0","collection":"customers","ord":1,"lsid":null,"txnNumber":null},"op":"c","ts_ms":1696603793792,"transaction":null}} +{"schema":{"type":"struct","fields":[{"type":"string","optional":false,"field":"id"}],"optional":false,"name":"dbserver1.inventory.customers.Key"},"payload":{"id":"1003"}} 
{"schema":{"type":"struct","fields":[{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"before"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"after"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"patch"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"filter"},{"type":"struct","fields":[{"type":"array","items":{"type":"string","optional":false},"optional":true,"field":"removedFields"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"updatedFields"},{"type":"array","items":{"type":"struct","fields":[{"type":"string","optional":false,"field":"field"},{"type":"int32","optional":false,"field":"size"}],"optional":false,"name":"io.debezium.connector.mongodb.changestream.truncatedarray","version":1},"optional":true,"field":"truncatedArrays"}],"optional":true,"name":"io.debezium.connector.mongodb.changestream.updatedescription","version":1,"field":"updateDescription"},{"type":"struct","fields":[{"type":"string","optional":false,"field":"version"},{"type":"string","optional":false,"field":"connector"},{"type":"string","optional":false,"field":"name"},{"type":"int64","optional":false,"field":"ts_ms"},{"type":"string","optional":true,"name":"io.debezium.data.Enum","version":1,"parameters":{"allowed":"true,last,false,incremental"},"default":"false","field":"snapshot"},{"type":"string","optional":false,"field":"db"},{"type":"string","optional":true,"field":"sequence"},{"type":"string","optional":false,"field":"rs"},{"type":"string","optional":false,"field":"collection"},{"type":"int32","optional":false,"field":"ord"},{"type":"string","optional":true,"field":"lsid"},{"type":"int64","optional":true,"field":"txnNumber"}],"optional":false,"name":"io.debezium.connector.mongo.Source","field":"source"},{"type":"string","optional":true,"field":"op"},{"type":"int64","optional":true,"field":"ts_ms"},{"type":"struct","fields":[{"type":"string","optional":false,"field":"id"},{"type":"int64","optional":false,"field":"total_order"},{"type":"int64","optional":false,"field":"data_collection_order"}],"optional":true,"name":"event.block","version":1,"field":"transaction"}],"optional":false,"name":"dbserver1.inventory.customers.Envelope"},"payload":{"before":null,"after":"{\"_id\": {\"$numberLong\": \"1003\"},\"first_name\": \"Sergey\"}","patch":null,"filter":null,"updateDescription":null,"source":{"version":"2.1.4.Final","connector":"mongodb","name":"dbserver1","ts_ms":1696603827000,"snapshot":"false","db":"inventory","sequence":null,"rs":"rs0","collection":"customers","ord":1,"lsid":null,"txnNumber":null},"op":"u","ts_ms":1696603827950,"transaction":null}} +{"schema":{"type":"struct","fields":[{"type":"string","optional":false,"field":"id"}],"optional":false,"name":"dbserver1.inventory.customers.Key"},"payload":{"id":"1004"}} 
{"schema":{"type":"struct","fields":[{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"before"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"after"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"patch"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"filter"},{"type":"struct","fields":[{"type":"array","items":{"type":"string","optional":false},"optional":true,"field":"removedFields"},{"type":"string","optional":true,"name":"io.debezium.data.Json","version":1,"field":"updatedFields"},{"type":"array","items":{"type":"struct","fields":[{"type":"string","optional":false,"field":"field"},{"type":"int32","optional":false,"field":"size"}],"optional":false,"name":"io.debezium.connector.mongodb.changestream.truncatedarray","version":1},"optional":true,"field":"truncatedArrays"}],"optional":true,"name":"io.debezium.connector.mongodb.changestream.updatedescription","version":1,"field":"updateDescription"},{"type":"struct","fields":[{"type":"string","optional":false,"field":"version"},{"type":"string","optional":false,"field":"connector"},{"type":"string","optional":false,"field":"name"},{"type":"int64","optional":false,"field":"ts_ms"},{"type":"string","optional":true,"name":"io.debezium.data.Enum","version":1,"parameters":{"allowed":"true,last,false,incremental"},"default":"false","field":"snapshot"},{"type":"string","optional":false,"field":"db"},{"type":"string","optional":true,"field":"sequence"},{"type":"string","optional":false,"field":"rs"},{"type":"string","optional":false,"field":"collection"},{"type":"int32","optional":false,"field":"ord"},{"type":"string","optional":true,"field":"lsid"},{"type":"int64","optional":true,"field":"txnNumber"}],"optional":false,"name":"io.debezium.connector.mongo.Source","field":"source"},{"type":"string","optional":true,"field":"op"},{"type":"int64","optional":true,"field":"ts_ms"},{"type":"struct","fields":[{"type":"string","optional":false,"field":"id"},{"type":"int64","optional":false,"field":"total_order"},{"type":"int64","optional":false,"field":"data_collection_order"}],"optional":true,"name":"event.block","version":1,"field":"transaction"}],"optional":false,"name":"dbserver1.inventory.customers.Envelope"},"payload":{"before":null,"after":null,"patch":null,"filter":null,"updateDescription":null,"source":{"version":"2.1.4.Final","connector":"mongodb","name":"dbserver1","ts_ms":1696603839000,"snapshot":"false","db":"inventory","sequence":null,"rs":"rs0","collection":"customers","ord":1,"lsid":null,"txnNumber":null},"op":"d","ts_ms":1696603839816,"transaction":null}} +{"schema":{"type":"struct","fields":[{"type":"string","optional":false,"field":"id"}],"optional":false,"name":"dbserver1.inventory.customers.Key"},"payload":{"id":"1004"}} null diff --git a/tests/helpers.rs b/tests/helpers.rs index abc37769..985c9d5a 100644 --- a/tests/helpers.rs +++ b/tests/helpers.rs @@ -98,8 +98,8 @@ pub fn full_cycle_read( let key = Key::random(); SnapshotEvent::Insert(key, values.clone()) } - ParsedEvent::Delete((_, _)) => { - todo!("remove isn't supported in this test") + ParsedEvent::Delete((_, _)) | ParsedEvent::Upsert((_, _)) => { + todo!("delete and upsert aren't supported in this test") } ParsedEvent::AdvanceTime => SnapshotEvent::AdvanceTime(1), }; @@ -112,8 +112,8 @@ pub fn full_cycle_read( new_parsed_entries.push(event); } } - Entry::Realtime(ReadResult::NewSource) => { - parser.on_new_source_started(); + 
Entry::Realtime(ReadResult::NewSource(metadata)) => { + parser.on_new_source_started(metadata.as_ref()); } Entry::Snapshot(snapshot_entry) => { snapshot_entries.push(snapshot_entry.clone()); @@ -174,7 +174,7 @@ pub fn read_data_from_reader( panic!("Unexpected erroneous reply: {parse_result:?}"); } } - ReadResult::NewSource => parser.on_new_source_started(), + ReadResult::NewSource(metadata) => parser.on_new_source_started(metadata.as_ref()), ReadResult::Finished => break, } } @@ -249,7 +249,7 @@ pub fn data_parsing_fails( return Ok(true); } } - ReadResult::NewSource => parser.on_new_source_started(), + ReadResult::NewSource(metadata) => parser.on_new_source_started(metadata.as_ref()), ReadResult::Finished => break, } } diff --git a/tests/test_bytes.rs b/tests/test_bytes.rs index 6f679546..fb18343f 100644 --- a/tests/test_bytes.rs +++ b/tests/test_bytes.rs @@ -13,8 +13,9 @@ fn read_bytes_from_path(path: &str) -> eyre::Result> { None, ReadMethod::Full, "*", + false, )?; - let mut parser = IdentityParser::new(false); + let mut parser = IdentityParser::new(vec!["data".to_string()], false); let mut events = Vec::new(); loop { @@ -29,7 +30,7 @@ fn read_bytes_from_path(path: &str) -> eyre::Result> { } } ReadResult::Finished => break, - ReadResult::NewSource => continue, + ReadResult::NewSource(_) => continue, } } diff --git a/tests/test_connector_field_defaults.rs b/tests/test_connector_field_defaults.rs index 8f582bb4..964392e9 100644 --- a/tests/test_connector_field_defaults.rs +++ b/tests/test_connector_field_defaults.rs @@ -30,6 +30,7 @@ fn test_dsv_with_default_end_of_line() -> eyre::Result<()> { ConnectorMode::Static, None, "*", + false, )?; let parser = DsvParser::new( DsvSettings::new( @@ -84,6 +85,7 @@ fn test_dsv_with_default_middle_of_line() -> eyre::Result<()> { ConnectorMode::Static, None, "*", + false, )?; let parser = DsvParser::new( DsvSettings::new( @@ -134,6 +136,7 @@ fn test_dsv_fails_without_default() -> eyre::Result<()> { ConnectorMode::Static, None, "*", + false, )?; let parser = DsvParser::new( DsvSettings::new( @@ -167,6 +170,7 @@ fn test_dsv_with_default_nullable() -> eyre::Result<()> { ConnectorMode::Static, None, "*", + false, )?; let parser = DsvParser::new( DsvSettings::new( @@ -211,6 +215,7 @@ fn test_jsonlines_fails_without_default() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let parser = JsonLinesParser::new( Some(vec!["a".to_string()]), @@ -239,6 +244,7 @@ fn test_jsonlines_with_default() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let parser = JsonLinesParser::new( Some(vec!["a".to_string()]), @@ -291,6 +297,7 @@ fn test_jsonlines_with_default_at_jsonpath() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let parser = JsonLinesParser::new( Some(vec!["a".to_string()]), @@ -337,6 +344,7 @@ fn test_jsonlines_explicit_null_not_overridden() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let parser = JsonLinesParser::new( Some(vec!["a".to_string()]), diff --git a/tests/test_debezium.rs b/tests/test_debezium.rs index f120995c..be4dfdca 100644 --- a/tests/test_debezium.rs +++ b/tests/test_debezium.rs @@ -3,8 +3,13 @@ use helpers::{assert_error_shown_for_raw_data, read_data_from_reader}; use std::path::PathBuf; -use pathway_engine::connectors::data_format::{DebeziumMessageParser, ParsedEvent}; +use assert_matches::assert_matches; + +use pathway_engine::connectors::data_format::{ + DebeziumDBType, DebeziumMessageParser, ParsedEvent, Parser, +}; use 
pathway_engine::connectors::data_storage::{ConnectorMode, FilesystemReader, ReadMethod}; +use pathway_engine::connectors::SessionType; use pathway_engine::engine::Value; #[test] @@ -15,13 +20,17 @@ fn test_debezium_reads_ok() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let parser = DebeziumMessageParser::new( Some(vec!["id".to_string()]), vec!["first_name".to_string()], " ".to_string(), + DebeziumDBType::Postgres, ); + assert_matches!(parser.session_type(), SessionType::Native); + let changelog = read_data_from_reader(Box::new(reader), Box::new(parser))?; let expected_values = vec![ @@ -46,6 +55,7 @@ fn test_debezium_unparsable_json() -> eyre::Result<()> { Some(vec!["id".to_string()]), vec!["first_name".to_string()], " ".to_string(), + DebeziumDBType::Postgres, ); assert_error_shown_for_raw_data( @@ -59,11 +69,12 @@ fn test_debezium_unparsable_json() -> eyre::Result<()> { #[test] fn test_debezium_json_format_incorrect() -> eyre::Result<()> { - let incorrect_json_pair = br#"{"a": "b"} {"c": "d"}"#; + let incorrect_json_pair = br#"{"payload": {}} {"c": "d"}"#; let parser = DebeziumMessageParser::new( Some(vec!["id".to_string()]), vec!["first_name".to_string()], " ".to_string(), + DebeziumDBType::Postgres, ); assert_error_shown_for_raw_data(incorrect_json_pair, Box::new(parser), "received message doesn't comply with debezium format: there is no payload at the top level of value json"); Ok(()) @@ -71,11 +82,12 @@ fn test_debezium_json_format_incorrect() -> eyre::Result<()> { #[test] fn test_debezium_json_no_operation_specified() -> eyre::Result<()> { - let incorrect_json_pair = br#"{"a": "b"} {"payload": "d"}"#; + let incorrect_json_pair = br#"{"payload": {}} {"payload": "d"}"#; let parser = DebeziumMessageParser::new( Some(vec!["id".to_string()]), vec!["first_name".to_string()], " ".to_string(), + DebeziumDBType::Postgres, ); assert_error_shown_for_raw_data(incorrect_json_pair, Box::new(parser), "received message doesn't comply with debezium format: incorrect type of payload.op field or it is missing"); Ok(()) @@ -83,11 +95,12 @@ fn test_debezium_json_no_operation_specified() -> eyre::Result<()> { #[test] fn test_debezium_json_unsupported_operation() -> eyre::Result<()> { - let incorrect_json_pair = br#"{"a": "b"} {"payload": {"op": "a"}}"#; + let incorrect_json_pair = br#"{"payload": {}} {"payload": {"op": "a"}}"#; let parser = DebeziumMessageParser::new( Some(vec!["id".to_string()]), vec!["first_name".to_string()], " ".to_string(), + DebeziumDBType::Postgres, ); assert_error_shown_for_raw_data( incorrect_json_pair, @@ -99,11 +112,12 @@ fn test_debezium_json_unsupported_operation() -> eyre::Result<()> { #[test] fn test_debezium_json_incomplete_data() -> eyre::Result<()> { - let incorrect_json_pair = br#"{"a": "b"} {"payload": {"op": "u"}}"#; + let incorrect_json_pair = br#"{"payload": null} {"payload": {"op": "u"}}"#; let parser = DebeziumMessageParser::new( Some(vec!["id".to_string()]), vec!["first_name".to_string()], " ".to_string(), + DebeziumDBType::Postgres, ); assert_error_shown_for_raw_data( incorrect_json_pair, @@ -120,6 +134,7 @@ fn test_debezium_tokens_amt_mismatch() -> eyre::Result<()> { Some(vec!["id".to_string()]), vec!["first_name".to_string()], " ".to_string(), + DebeziumDBType::Postgres, ); assert_error_shown_for_raw_data( incorrect_json_pair, @@ -132,6 +147,7 @@ fn test_debezium_tokens_amt_mismatch() -> eyre::Result<()> { Some(vec!["id".to_string()]), vec!["first_name".to_string()], " ".to_string(), + DebeziumDBType::Postgres, ); 
assert_error_shown_for_raw_data( incorrect_json_pair, @@ -141,3 +157,56 @@ fn test_debezium_tokens_amt_mismatch() -> eyre::Result<()> { Ok(()) } + +#[test] +fn test_debezium_mongodb_format() -> eyre::Result<()> { + let reader = FilesystemReader::new( + PathBuf::from("tests/data/sample_debezium_mongodb.txt"), + ConnectorMode::Static, + None, + ReadMethod::ByLine, + "*", + false, + )?; + let parser = DebeziumMessageParser::new( + Some(vec!["id".to_string()]), + vec!["first_name".to_string()], + " ".to_string(), + DebeziumDBType::MongoDB, + ); + + assert_matches!(parser.session_type(), SessionType::Upsert); + + let changelog = read_data_from_reader(Box::new(reader), Box::new(parser))?; + + let expected_values = vec![ + ParsedEvent::Upsert(( + Some(vec![Value::from("1001")]), + Some(vec![Value::from("Sally")]), + )), + ParsedEvent::Upsert(( + Some(vec![Value::from("1002")]), + Some(vec![Value::from("George")]), + )), + ParsedEvent::Upsert(( + Some(vec![Value::from("1003")]), + Some(vec![Value::from("Edward")]), + )), + ParsedEvent::Upsert(( + Some(vec![Value::from("1004")]), + Some(vec![Value::from("Anne")]), + )), + ParsedEvent::Upsert(( + Some(vec![Value::from("1005")]), + Some(vec![Value::from("Bob")]), + )), + ParsedEvent::Upsert(( + Some(vec![Value::from("1003")]), + Some(vec![Value::from("Sergey")]), + )), + ParsedEvent::Upsert((Some(vec![Value::from("1004")]), None)), + ]; + assert_eq!(changelog, expected_values); + + Ok(()) +} diff --git a/tests/test_dsv.rs b/tests/test_dsv.rs index 6ca663a7..36917da9 100644 --- a/tests/test_dsv.rs +++ b/tests/test_dsv.rs @@ -21,6 +21,7 @@ fn test_dsv_read_ok() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let mut parser = DsvParser::new( DsvSettings::new(Some(vec!["a".to_string()]), vec!["b".to_string()], ','), @@ -64,6 +65,7 @@ fn test_dsv_column_does_not_exist() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let parser = DsvParser::new( DsvSettings::new(Some(vec!["a".to_string()]), vec!["c".to_string()], ','), @@ -87,6 +89,7 @@ fn test_dsv_rows_parsing_ignore_type() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let mut parser = DsvParser::new( DsvSettings::new(Some(vec!["a".to_string()]), vec!["b".to_string()], ','), @@ -123,6 +126,7 @@ fn test_dsv_not_enough_columns() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let mut parser = DsvParser::new( DsvSettings::new(Some(vec!["a".to_string()]), vec!["b".to_string()], ','), @@ -168,6 +172,7 @@ fn test_dsv_autogenerate_pkey() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let mut parser = DsvParser::new( DsvSettings::new(None, vec!["a".to_string(), "b".to_string()], ','), @@ -195,7 +200,7 @@ fn test_dsv_autogenerate_pkey() -> eyre::Result<()> { } } ReadResult::Finished => break, - ReadResult::NewSource => continue, + ReadResult::NewSource(_) => continue, } } @@ -210,6 +215,7 @@ fn test_dsv_composite_pkey() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let mut parser = DsvParser::new( DsvSettings::new( @@ -240,7 +246,7 @@ fn test_dsv_composite_pkey() -> eyre::Result<()> { } } ReadResult::Finished => break, - ReadResult::NewSource => continue, + ReadResult::NewSource(_) => continue, } } @@ -272,6 +278,7 @@ fn test_dsv_read_schema_ok() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let mut parser = DsvParser::new( DsvSettings::new( @@ -341,6 +348,7 @@ fn test_dsv_read_schema_nonparsable() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let mut parser = 
DsvParser::new( DsvSettings::new( diff --git a/tests/test_dsv_dir.rs b/tests/test_dsv_dir.rs index d3102266..3b96c694 100644 --- a/tests/test_dsv_dir.rs +++ b/tests/test_dsv_dir.rs @@ -20,6 +20,7 @@ fn test_dsv_dir_ok() -> eyre::Result<()> { ConnectorMode::Static, None, "*", + false, )?; let parser = DsvParser::new( DsvSettings::new(Some(vec!["key".to_string()]), vec!["foo".to_string()], ','), @@ -53,6 +54,7 @@ fn test_single_file_ok() -> eyre::Result<()> { ConnectorMode::Static, None, "*", + false, )?; let parser = DsvParser::new( DsvSettings::new(Some(vec!["a".to_string()]), vec!["b".to_string()], ','), @@ -78,6 +80,7 @@ fn test_custom_delimiter() -> eyre::Result<()> { ConnectorMode::Static, None, "*", + false, )?; let parser = DsvParser::new( DsvSettings::new( @@ -105,6 +108,7 @@ fn test_escape_fields() -> eyre::Result<()> { ConnectorMode::Static, None, "*", + false, )?; let parser = DsvParser::new( DsvSettings::new( @@ -151,6 +155,7 @@ fn test_escape_newlines() -> eyre::Result<()> { ConnectorMode::Static, None, "*", + false, )?; let parser = DsvParser::new( DsvSettings::new( @@ -187,6 +192,7 @@ fn test_nonexistent_file() -> eyre::Result<()> { ConnectorMode::Static, None, "*", + false, ); assert!(reader.is_err()); @@ -204,6 +210,7 @@ fn test_special_fields() -> eyre::Result<()> { ConnectorMode::Static, None, "*", + false, )?; let parser = DsvParser::new( DsvSettings::new( diff --git a/tests/test_jsonlines.rs b/tests/test_jsonlines.rs index 8f15b19e..5fd849a7 100644 --- a/tests/test_jsonlines.rs +++ b/tests/test_jsonlines.rs @@ -18,6 +18,7 @@ fn test_jsonlines_ok() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let parser = JsonLinesParser::new( Some(vec!["a".to_string()]), @@ -57,6 +58,7 @@ fn test_jsonlines_incorrect_key() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let parser = JsonLinesParser::new( Some(vec!["a".to_string(), "d".to_string()]), @@ -83,6 +85,7 @@ fn test_jsonlines_incomplete_key_to_null() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let parser = JsonLinesParser::new( Some(vec!["a".to_string(), "d".to_string()]), @@ -106,6 +109,7 @@ fn test_jsonlines_incorrect_values() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let parser = JsonLinesParser::new( Some(vec!["a".to_string()]), @@ -132,6 +136,7 @@ fn test_jsonlines_types_parsing() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let parser = JsonLinesParser::new( Some(vec!["a".to_string()]), @@ -182,6 +187,7 @@ fn test_jsonlines_complex_paths() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let mut routes = HashMap::new(); @@ -238,6 +244,7 @@ fn test_jsonlines_complex_paths_error() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let mut routes = HashMap::new(); @@ -279,6 +286,7 @@ fn test_jsonlines_complex_path_ignore_errors() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let mut routes = HashMap::new(); @@ -317,6 +325,7 @@ fn test_jsonlines_incorrect_key_verbose_error() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let parser = JsonLinesParser::new( Some(vec!["a".to_string(), "d".to_string()]), @@ -346,6 +355,7 @@ fn test_jsonlines_incorrect_jsonpointer_verbose_error() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let parser = JsonLinesParser::new( Some(vec!["a".to_string(), "d".to_string()]), @@ -372,6 +382,7 @@ fn test_jsonlines_failed_to_parse_field() -> eyre::Result<()> { None, ReadMethod::ByLine, "*", + false, )?; let 
parser = JsonLinesParser::new( None, diff --git a/tests/test_metadata.rs b/tests/test_metadata.rs new file mode 100644 index 00000000..94d40412 --- /dev/null +++ b/tests/test_metadata.rs @@ -0,0 +1,239 @@ +mod helpers; +use helpers::read_data_from_reader; + +use std::collections::HashMap; +use std::path::PathBuf; + +use pathway_engine::connectors::data_format::{ + DsvParser, DsvSettings, IdentityParser, JsonLinesParser, ParsedEvent, +}; +use pathway_engine::connectors::data_storage::{ + ConnectorMode, CsvFilesystemReader, FilesystemReader, ReadMethod, +}; +use pathway_engine::engine::Value; + +/// This function requires that _metadata field is the last in the `value_names_list` +fn check_file_name_in_metadata(data_read: &ParsedEvent, name: &str) { + if let ParsedEvent::Insert((_, values)) = data_read { + if let Value::Json(meta) = &values[values.len() - 1] { + let path: String = meta["path"].to_string(); + assert!(path.ends_with(name), "{data_read:?}"); + } else { + panic!("wrong type of metadata field"); + } + } else { + panic!("wrong type of event"); + } +} + +#[test] +fn test_metadata_fs_dir() -> eyre::Result<()> { + let reader = FilesystemReader::new( + PathBuf::from("tests/data/csvdir/"), + ConnectorMode::Static, + None, + ReadMethod::ByLine, + "*", + true, + )?; + let parser = DsvParser::new( + DsvSettings::new( + Some(vec!["key".to_string()]), + vec![ + "key".to_string(), + "foo".to_string(), + "_metadata".to_string(), + ], + ',', + ), + HashMap::new(), + ); + + let data_read = read_data_from_reader(Box::new(reader), Box::new(parser))?; + check_file_name_in_metadata(&data_read[0], "tests/data/csvdir/a.txt\""); + check_file_name_in_metadata(&data_read[2], "tests/data/csvdir/b.txt\""); + check_file_name_in_metadata(&data_read[4], "tests/data/csvdir/c.txt\""); + + Ok(()) +} + +#[test] +fn test_metadata_fs_file() -> eyre::Result<()> { + let reader = FilesystemReader::new( + PathBuf::from("tests/data/minimal.txt"), + ConnectorMode::Static, + None, + ReadMethod::ByLine, + "*", + true, + )?; + let parser = DsvParser::new( + DsvSettings::new( + Some(vec!["key".to_string()]), + vec![ + "key".to_string(), + "foo".to_string(), + "_metadata".to_string(), + ], + ',', + ), + HashMap::new(), + ); + + let data_read = read_data_from_reader(Box::new(reader), Box::new(parser))?; + check_file_name_in_metadata(&data_read[0], "tests/data/minimal.txt\""); + + Ok(()) +} + +#[test] +fn test_metadata_csv_dir() -> eyre::Result<()> { + let mut builder = csv::ReaderBuilder::new(); + builder.has_headers(false); + + let reader = CsvFilesystemReader::new( + PathBuf::from("tests/data/csvdir/"), + builder, + ConnectorMode::Static, + None, + "*", + true, + )?; + let parser = DsvParser::new( + DsvSettings::new( + Some(vec!["key".to_string()]), + vec![ + "key".to_string(), + "foo".to_string(), + "_metadata".to_string(), + ], + ',', + ), + HashMap::new(), + ); + + let data_read = read_data_from_reader(Box::new(reader), Box::new(parser))?; + check_file_name_in_metadata(&data_read[0], "tests/data/csvdir/a.txt\""); + check_file_name_in_metadata(&data_read[2], "tests/data/csvdir/b.txt\""); + check_file_name_in_metadata(&data_read[4], "tests/data/csvdir/c.txt\""); + + Ok(()) +} + +#[test] +fn test_metadata_csv_file() -> eyre::Result<()> { + let mut builder = csv::ReaderBuilder::new(); + builder.has_headers(false); + + let reader = CsvFilesystemReader::new( + PathBuf::from("tests/data/minimal.txt"), + builder, + ConnectorMode::Static, + None, + "*", + true, + )?; + let parser = DsvParser::new( + DsvSettings::new( + 
Some(vec!["key".to_string()]), + vec![ + "key".to_string(), + "foo".to_string(), + "_metadata".to_string(), + ], + ',', + ), + HashMap::new(), + ); + + let data_read = read_data_from_reader(Box::new(reader), Box::new(parser))?; + check_file_name_in_metadata(&data_read[0], "tests/data/minimal.txt\""); + + Ok(()) +} + +#[test] +fn test_metadata_json_file() -> eyre::Result<()> { + let reader = FilesystemReader::new( + PathBuf::from("tests/data/jsonlines.txt"), + ConnectorMode::Static, + None, + ReadMethod::ByLine, + "*", + true, + )?; + let parser = JsonLinesParser::new( + None, + vec!["a".to_string(), "_metadata".to_string()], + HashMap::new(), + false, + HashMap::new(), + ); + + let data_read = read_data_from_reader(Box::new(reader), Box::new(parser))?; + check_file_name_in_metadata(&data_read[0], "tests/data/jsonlines.txt\""); + + Ok(()) +} + +#[test] +fn test_metadata_json_dir() -> eyre::Result<()> { + let reader = FilesystemReader::new( + PathBuf::from("tests/data/jsonlines/"), + ConnectorMode::Static, + None, + ReadMethod::ByLine, + "*", + true, + )?; + let parser = JsonLinesParser::new( + None, + vec!["a".to_string(), "_metadata".to_string()], + HashMap::new(), + false, + HashMap::new(), + ); + + let data_read = read_data_from_reader(Box::new(reader), Box::new(parser))?; + check_file_name_in_metadata(&data_read[0], "tests/data/jsonlines/one.jsonlines\""); + check_file_name_in_metadata(&data_read[1], "tests/data/jsonlines/two.jsonlines\""); + + Ok(()) +} + +#[test] +fn test_metadata_identity_file() -> eyre::Result<()> { + let reader = FilesystemReader::new( + PathBuf::from("tests/data/jsonlines.txt"), + ConnectorMode::Static, + None, + ReadMethod::ByLine, + "*", + true, + )?; + let parser = IdentityParser::new(vec!["data".to_string(), "_metadata".to_string()], false); + + let data_read = read_data_from_reader(Box::new(reader), Box::new(parser))?; + check_file_name_in_metadata(&data_read[0], "tests/data/jsonlines.txt\""); + + Ok(()) +} + +#[test] +fn test_metadata_identity_dir() -> eyre::Result<()> { + let reader = FilesystemReader::new( + PathBuf::from("tests/data/jsonlines/"), + ConnectorMode::Static, + None, + ReadMethod::ByLine, + "*", + true, + )?; + let parser = IdentityParser::new(vec!["data".to_string(), "_metadata".to_string()], false); + + let data_read = read_data_from_reader(Box::new(reader), Box::new(parser))?; + check_file_name_in_metadata(&data_read[0], "tests/data/jsonlines/one.jsonlines\""); + check_file_name_in_metadata(&data_read[1], "tests/data/jsonlines/two.jsonlines\""); + + Ok(()) +} diff --git a/tests/test_parser_errors.rs b/tests/test_parser_errors.rs index 9e4ff863..af6bfabc 100644 --- a/tests/test_parser_errors.rs +++ b/tests/test_parser_errors.rs @@ -1,7 +1,7 @@ mod helpers; use helpers::assert_error_shown_for_reader_context; -use pathway_engine::connectors::data_format::DebeziumMessageParser; +use pathway_engine::connectors::data_format::{DebeziumDBType, DebeziumMessageParser}; use pathway_engine::connectors::data_storage::{DataEventType, ReaderContext}; #[test] @@ -10,6 +10,7 @@ fn test_utf8_decode_error() -> eyre::Result<()> { Some(vec!["id".to_string()]), vec!["first_name".to_string()], " ".to_string(), + DebeziumDBType::Postgres, ); let invalid_utf8_bytes: &[u8] = &[0xC0, 0x80, 0xE0, 0x80, 0x80]; @@ -28,6 +29,7 @@ fn test_empty_payload() -> eyre::Result<()> { Some(vec!["id".to_string()]), vec!["first_name".to_string()], " ".to_string(), + DebeziumDBType::Postgres, ); assert_error_shown_for_reader_context( @@ -45,6 +47,7 @@ fn test_unsupported_context() -> 
eyre::Result<()> { Some(vec!["id".to_string()]), vec!["first_name".to_string()], " ".to_string(), + DebeziumDBType::Postgres, ); assert_error_shown_for_reader_context( diff --git a/tests/test_seek.rs b/tests/test_seek.rs index 64798126..d2884ea2 100644 --- a/tests/test_seek.rs +++ b/tests/test_seek.rs @@ -32,6 +32,7 @@ fn csv_reader_parser_pair(input_path: &Path) -> (Box, Box (Box, Box