diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 585d0601..a9c18d3a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -18,7 +18,7 @@ repos: - id: trailing-whitespace - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.14 + rev: v0.2.0 hooks: - id: ruff args: [--fix] diff --git a/README.md b/README.md index 32733350..5b50808e 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,17 @@ pre-commit install ### Create and Run Tests +Set up the SSL files permissions: + +```bash +chmod 0600 .ssl/*.key +``` + +Start the test databases using Docker Compose: +```bash +docker-compose up -d +``` + Create tests within the `target_postgres/tests` subfolder and then run: @@ -163,7 +174,7 @@ The below table shows how this tap will map between jsonschema datatypes and Pos | UNSUPPORTED | bit varying [ (n) ] | | boolean | boolean | | UNSUPPORTED | box | -| UNSUPPORTED | bytea | +| string with contentEncoding="base16" ([opt-in feature](#content-encoding-support)) | bytea | | UNSUPPORTED | character [ (n) ] | | UNSUPPORTED | character varying [ (n) ] | | UNSUPPORTED | cidr | @@ -204,6 +215,7 @@ The below table shows how this tap will map between jsonschema datatypes and Pos Note that while object types are mapped directly to jsonb, array types are mapped to a jsonb array. If a column has multiple jsonschema types, the following order is using to order Postgres types, from highest priority to lowest priority. +- BYTEA - ARRAY(JSONB) - JSONB - TEXT @@ -216,3 +228,50 @@ If a column has multiple jsonschema types, the following order is using to order - INTEGER - BOOLEAN - NOTYPE + +## Content Encoding Support + +JSON Schema supports the [`contentEncoding` keyword](https://json-schema.org/draft/2020-12/draft-bhutton-json-schema-validation-00#rfc.section.8.3), which can be used to specify the encoding of input string types. + +This target can detect content encoding clues in the schema to determine how to store the data in Postgres in a more efficient way. 
+ +Content encoding interpretation is disabled by default. This is because the default config is meant to be as permissive as possible, and does not make any assumptions about the data that could lead to data loss. + +However, if you know your data respects the advertised content encoding, you can enable this feature to get better performance and storage efficiency. + +To enable it, set the `interpret_content_encoding` option to `True`. + +### base16 + +The string is encoded using the base16 encoding, as defined in [RFC 4648](https://datatracker.ietf.org/doc/html/rfc4648#section-8 +). + +Example schema: +```json +{ + "type": "object", + "properties": { + "my_hex": { + "type": "string", + "contentEncoding": "base16" + } + } +} +``` + +Data will be stored as a `bytea` in the database. + +Example data: +```json +# valid data +{ "my_hex": "01AF" } +{ "my_hex": "01af" } +{ "my_hex": "1af" } +{ "my_hex": "0x1234" } + +# invalid data +{ "my_hex": " 0x1234 " } +{ "my_hex": "House" } +``` + +For convenience, data prefixed with `0x` or containing an odd number of characters is supported although it's not part of the standard. diff --git a/poetry.lock b/poetry.lock index a7965852..917195fd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -688,16 +688,6 @@ files = [ importlib-resources = {version = ">=1.4.0", markers = "python_version < \"3.9\""} referencing = ">=0.31.0" -[[package]] -name = "memoization" -version = "0.4.0" -description = "A powerful caching library for Python, with TTL support and multiple algorithm options. 
(https://github.com/lonelyenvoy/python-memoization)" -optional = false -python-versions = ">=3, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, <4" -files = [ - {file = "memoization-0.4.0.tar.gz", hash = "sha256:fde5e7cd060ef45b135e0310cfec17b2029dc472ccb5bbbbb42a503d4538a135"}, -] - [[package]] name = "mypy" version = "1.8.0" @@ -1145,17 +1135,6 @@ files = [ [package.extras] cli = ["click (>=5.0)"] -[[package]] -name = "pytz" -version = "2023.4" -description = "World timezone definitions, modern and historical" -optional = false -python-versions = "*" -files = [ - {file = "pytz-2023.4-py2.py3-none-any.whl", hash = "sha256:f90ef520d95e7c46951105338d918664ebfd6f1d995bd7d153127ce90efafa6a"}, - {file = "pytz-2023.4.tar.gz", hash = "sha256:31d4583c4ed539cd037956140d695e42c033a19e984bfce9964a3f7d59bc2b40"}, -] - [[package]] name = "pyyaml" version = "6.0.1" @@ -1373,28 +1352,28 @@ files = [ [[package]] name = "ruff" -version = "0.1.15" +version = "0.2.1" description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.1.15-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:5fe8d54df166ecc24106db7dd6a68d44852d14eb0729ea4672bb4d96c320b7df"}, - {file = "ruff-0.1.15-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6f0bfbb53c4b4de117ac4d6ddfd33aa5fc31beeaa21d23c45c6dd249faf9126f"}, - {file = "ruff-0.1.15-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e0d432aec35bfc0d800d4f70eba26e23a352386be3a6cf157083d18f6f5881c8"}, - {file = "ruff-0.1.15-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9405fa9ac0e97f35aaddf185a1be194a589424b8713e3b97b762336ec79ff807"}, - {file = "ruff-0.1.15-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c66ec24fe36841636e814b8f90f572a8c0cb0e54d8b5c2d0e300d28a0d7bffec"}, - {file = "ruff-0.1.15-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:6f8ad828f01e8dd32cc58bc28375150171d198491fc901f6f98d2a39ba8e3ff5"}, - {file = "ruff-0.1.15-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:86811954eec63e9ea162af0ffa9f8d09088bab51b7438e8b6488b9401863c25e"}, - {file = "ruff-0.1.15-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fd4025ac5e87d9b80e1f300207eb2fd099ff8200fa2320d7dc066a3f4622dc6b"}, - {file = "ruff-0.1.15-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b17b93c02cdb6aeb696effecea1095ac93f3884a49a554a9afa76bb125c114c1"}, - {file = "ruff-0.1.15-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:ddb87643be40f034e97e97f5bc2ef7ce39de20e34608f3f829db727a93fb82c5"}, - {file = "ruff-0.1.15-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:abf4822129ed3a5ce54383d5f0e964e7fef74a41e48eb1dfad404151efc130a2"}, - {file = "ruff-0.1.15-py3-none-musllinux_1_2_i686.whl", hash = "sha256:6c629cf64bacfd136c07c78ac10a54578ec9d1bd2a9d395efbee0935868bf852"}, - {file = 
"ruff-0.1.15-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1bab866aafb53da39c2cadfb8e1c4550ac5340bb40300083eb8967ba25481447"}, - {file = "ruff-0.1.15-py3-none-win32.whl", hash = "sha256:2417e1cb6e2068389b07e6fa74c306b2810fe3ee3476d5b8a96616633f40d14f"}, - {file = "ruff-0.1.15-py3-none-win_amd64.whl", hash = "sha256:3837ac73d869efc4182d9036b1405ef4c73d9b1f88da2413875e34e0d6919587"}, - {file = "ruff-0.1.15-py3-none-win_arm64.whl", hash = "sha256:9a933dfb1c14ec7a33cceb1e49ec4a16b51ce3c20fd42663198746efc0427360"}, - {file = "ruff-0.1.15.tar.gz", hash = "sha256:f6dfa8c1b21c913c326919056c390966648b680966febcb796cc9d1aaab8564e"}, + {file = "ruff-0.2.1-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:dd81b911d28925e7e8b323e8d06951554655021df8dd4ac3045d7212ac4ba080"}, + {file = "ruff-0.2.1-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:dc586724a95b7d980aa17f671e173df00f0a2eef23f8babbeee663229a938fec"}, + {file = "ruff-0.2.1-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c92db7101ef5bfc18e96777ed7bc7c822d545fa5977e90a585accac43d22f18a"}, + {file = "ruff-0.2.1-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:13471684694d41ae0f1e8e3a7497e14cd57ccb7dd72ae08d56a159d6c9c3e30e"}, + {file = "ruff-0.2.1-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a11567e20ea39d1f51aebd778685582d4c56ccb082c1161ffc10f79bebe6df35"}, + {file = "ruff-0.2.1-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:00a818e2db63659570403e44383ab03c529c2b9678ba4ba6c105af7854008105"}, + {file = "ruff-0.2.1-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:be60592f9d218b52f03384d1325efa9d3b41e4c4d55ea022cd548547cc42cd2b"}, + {file = "ruff-0.2.1-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fbd2288890b88e8aab4499e55148805b58ec711053588cc2f0196a44f6e3d855"}, + {file = 
"ruff-0.2.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3ef052283da7dec1987bba8d8733051c2325654641dfe5877a4022108098683"}, + {file = "ruff-0.2.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:7022d66366d6fded4ba3889f73cd791c2d5621b2ccf34befc752cb0df70f5fad"}, + {file = "ruff-0.2.1-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0a725823cb2a3f08ee743a534cb6935727d9e47409e4ad72c10a3faf042ad5ba"}, + {file = "ruff-0.2.1-py3-none-musllinux_1_2_i686.whl", hash = "sha256:0034d5b6323e6e8fe91b2a1e55b02d92d0b582d2953a2b37a67a2d7dedbb7acc"}, + {file = "ruff-0.2.1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:e5cb5526d69bb9143c2e4d2a115d08ffca3d8e0fddc84925a7b54931c96f5c02"}, + {file = "ruff-0.2.1-py3-none-win32.whl", hash = "sha256:6b95ac9ce49b4fb390634d46d6ece32ace3acdd52814671ccaf20b7f60adb232"}, + {file = "ruff-0.2.1-py3-none-win_amd64.whl", hash = "sha256:e3affdcbc2afb6f5bd0eb3130139ceedc5e3f28d206fe49f63073cb9e65988e0"}, + {file = "ruff-0.2.1-py3-none-win_arm64.whl", hash = "sha256:efababa8e12330aa94a53e90a81eb6e2d55f348bc2e71adbf17d9cad23c03ee6"}, + {file = "ruff-0.2.1.tar.gz", hash = "sha256:3b42b5d8677cd0c72b99fcaf068ffc62abb5a19e71b4a3b9cfa50658a0af02f1"}, ] [[package]] @@ -1533,13 +1512,13 @@ files = [ [[package]] name = "singer-sdk" -version = "0.34.1" +version = "0.35.0" description = "A framework for building Singer taps" optional = false -python-versions = ">=3.7.1" +python-versions = ">=3.8" files = [ - {file = "singer_sdk-0.34.1-py3-none-any.whl", hash = "sha256:2b9afa40722c2d7288992d86d16298128e38387941306267d908b9cd954227a3"}, - {file = "singer_sdk-0.34.1.tar.gz", hash = "sha256:5da11da3de07cc31cc8c1f3f19a40b8f8be4a0df9b5535fb90880c263e58aee5"}, + {file = "singer_sdk-0.35.0-py3-none-any.whl", hash = "sha256:0f56be58655708dd97df4b892ad86bc74cc7a96c360bf4d22362391d0668b8be"}, + {file = "singer_sdk-0.35.0.tar.gz", hash = "sha256:3e4048a0e2aaef080acff2bca1d27a0dafcf24684f763cc46ed2b1a0c6970fd7"}, ] 
[package.dependencies] @@ -1548,20 +1527,19 @@ backports-datetime-fromisoformat = {version = ">=2.0.1", markers = "python_versi click = ">=8.0,<9.0" cryptography = ">=3.4.6" fs = ">=2.4.16" -importlib-metadata = {version = "<7.0.0", markers = "python_version < \"3.12\""} +importlib-metadata = {version = "<8.0.0", markers = "python_version < \"3.12\""} importlib-resources = {version = ">=5.12.0", markers = "python_version < \"3.9\""} inflection = ">=0.5.1" joblib = ">=1.0.1" jsonpath-ng = ">=1.5.3" -jsonschema = {version = ">=4.16.0", markers = "python_version >= \"3.8\""} -memoization = {version = ">=0.3.2,<0.5.0", markers = "python_version < \"4\""} +jsonschema = ">=4.16.0" packaging = ">=23.1" -pendulum = {version = ">=2.1.0,<4", markers = "python_version >= \"3.8\""} +pendulum = ">=2.1.0,<4" PyJWT = ">=2.4,<3.0" python-dateutil = ">=2.8.2" python-dotenv = ">=0.20" -pytz = ">=2022.2.1" PyYAML = ">=6.0" +referencing = ">=0.30.0" requests = ">=2.25.1" simpleeval = ">=0.9.13" simplejson = ">=3.17.6" @@ -1571,7 +1549,8 @@ urllib3 = ">=1.26,<2" [package.extras] docs = ["furo (>=2022.12.7)", "myst-parser (>=1)", "sphinx (>=4.5)", "sphinx-autobuild (>=2021.3.14)", "sphinx-copybutton (>=0.3.1)", "sphinx-inline-tabs (>=2023.4.21)", "sphinx-notfound-page (>=1.0.0)", "sphinx-reredirects (>=0.1.1)"] -parquet = ["numpy (<1.22)", "numpy (>=1.22)", "numpy (>=1.22,<1.25)", "pyarrow (>=11,<13)", "pyarrow (>=13)"] +faker = ["faker (>=22.5,<23.0)"] +parquet = ["numpy (>=1.22)", "numpy (>=1.22,<1.25)", "pyarrow (>=13)"] s3 = ["fs-s3fs (>=1.1.1)"] testing = ["pytest (>=7.2.1)", "pytest-durations (>=1.2.0)"] @@ -1746,13 +1725,13 @@ referencing = "*" [[package]] name = "types-paramiko" -version = "3.4.0.20240120" +version = "3.4.0.20240205" description = "Typing stubs for paramiko" optional = false python-versions = ">=3.8" files = [ - {file = "types-paramiko-3.4.0.20240120.tar.gz", hash = "sha256:07e5992e995c2266a0803a332a69d912036be7664ca36a1ca01dac1b67643132"}, - {file = 
"types_paramiko-3.4.0.20240120-py3-none-any.whl", hash = "sha256:b0b44764c6c61a3e7629ee5923dc78b1a22b1d029522df321997697501e30d97"}, + {file = "types-paramiko-3.4.0.20240205.tar.gz", hash = "sha256:6f9030b478d6b24ecc0ebae5d9ec9b7838760e1c3e1e47966f98343c3e916675"}, + {file = "types_paramiko-3.4.0.20240205-py3-none-any.whl", hash = "sha256:a0c5b198721dbc72db189e9207804f344cf9e303db742600cf67fba397094397"}, ] [package.dependencies] @@ -1856,4 +1835,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = ">=3.8" -content-hash = "e6edcdd1ad2c91706171bcc872cb95d710968a9e1d70a6d34112e05cbd4073c7" +content-hash = "7ec7381e62d05a6d872f943a737ac7b96bf47026e13973e51b508fcae609a526" diff --git a/pyproject.toml b/pyproject.toml index 439e7e39..4aa7ec14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,7 +37,7 @@ sqlalchemy = "~=2.0" sshtunnel = "0.4.0" [tool.poetry.dependencies.singer-sdk] -version = "~=0.34.0" +version = "~=0.35.0" [tool.poetry.group.dev.dependencies] pytest = ">=7.4.2" diff --git a/target_postgres/connector.py b/target_postgres/connector.py index 819eb507..7114273f 100644 --- a/target_postgres/connector.py +++ b/target_postgres/connector.py @@ -10,6 +10,7 @@ import sys import typing as t from contextlib import contextmanager +from functools import cached_property from os import chmod, path from typing import cast @@ -18,7 +19,7 @@ import sqlalchemy as sa from singer_sdk import SQLConnector from singer_sdk import typing as th -from sqlalchemy.dialects.postgresql import ARRAY, BIGINT, JSONB +from sqlalchemy.dialects.postgresql import ARRAY, BIGINT, BYTEA, JSONB from sqlalchemy.engine import URL from sqlalchemy.engine.url import make_url from sqlalchemy.types import ( @@ -82,11 +83,23 @@ def __init__(self, config: dict) -> None: sqlalchemy_url=url.render_as_string(hide_password=False), ) + @cached_property + def interpret_content_encoding(self) -> bool: + """Whether to 
interpret schema contentEncoding to set the column type. + + It is an opt-in feature because it might result in data loss if the + actual data does not match the schema's advertised encoding. + + Returns: + True if the feature is enabled, False otherwise. + """ + return self.config.get("interpret_content_encoding", False) + def prepare_table( # type: ignore[override] # noqa: PLR0913 self, full_table_name: str, schema: dict, - primary_keys: list[str], + primary_keys: t.Sequence[str], connection: sa.engine.Connection, partition_keys: list[str] | None = None, as_temp_table: bool = False, @@ -206,8 +219,7 @@ def clone_table( # noqa: PLR0913 new_table.create(bind=connection) return new_table - @staticmethod - def to_sql_type(jsonschema_type: dict) -> sa.types.TypeEngine: + def to_sql_type(self, jsonschema_type: dict) -> sa.types.TypeEngine: # type: ignore[override] """Return a JSON Schema representation of the provided type. By default will call `typing.to_sql_type()`. @@ -233,6 +245,8 @@ def to_sql_type(jsonschema_type: dict) -> sa.types.TypeEngine: json_type_dict = {"type": entry} if jsonschema_type.get("format", False): json_type_dict["format"] = jsonschema_type["format"] + if encoding := jsonschema_type.get("contentEncoding", False): + json_type_dict["contentEncoding"] = encoding json_type_array.append(json_type_dict) else: msg = "Invalid format for jsonschema type: not str or list." 
@@ -247,16 +261,13 @@ def to_sql_type(jsonschema_type: dict) -> sa.types.TypeEngine: return NOTYPE() sql_type_array = [] for json_type in json_type_array: - picked_type = PostgresConnector.pick_individual_type( - jsonschema_type=json_type - ) + picked_type = self.pick_individual_type(jsonschema_type=json_type) if picked_type is not None: sql_type_array.append(picked_type) return PostgresConnector.pick_best_sql_type(sql_type_array=sql_type_array) - @staticmethod - def pick_individual_type(jsonschema_type: dict): + def pick_individual_type(self, jsonschema_type: dict): """Select the correct sql type assuming jsonschema_type has only a single type. Args: @@ -273,8 +284,15 @@ def pick_individual_type(jsonschema_type: dict): return JSONB() if "array" in jsonschema_type["type"]: return ARRAY(JSONB()) + + # string formats if jsonschema_type.get("format") == "date-time": return TIMESTAMP() + if ( + self.interpret_content_encoding + and jsonschema_type.get("contentEncoding") == "base16" + ): + return HexByteString() individual_type = th.to_sql_type(jsonschema_type) return TEXT() if isinstance(individual_type, VARCHAR) else individual_type @@ -290,6 +308,7 @@ def pick_best_sql_type(sql_type_array: list): An instance of the best SQL type class based on defined precedence order. """ precedence_order = [ + HexByteString, ARRAY, JSONB, TEXT, @@ -315,7 +334,7 @@ def create_empty_table( # type: ignore[override] # noqa: PLR0913 meta: sa.MetaData, schema: dict, connection: sa.engine.Connection, - primary_keys: list[str] | None = None, + primary_keys: t.Sequence[str] | None = None, partition_keys: list[str] | None = None, as_temp_table: bool = False, ) -> sa.Table: @@ -831,3 +850,41 @@ def python_type(self): def as_generic(self, *args: t.Any, **kwargs: t.Any): """Return the generic type for this column.""" return TEXT() + + +class HexByteString(TypeDecorator): + """Convert Python string representing Hex data to bytes and vice versa. 
+ +    This is used to store binary data in a more efficient format in the database. +    The string is encoded using the base16 encoding, as defined in RFC 4648 +    https://json-schema.org/draft/2020-12/draft-bhutton-json-schema-validation-00#rfc.section.8.3 +    For convenience, data prefixed with `0x` or containing an odd number of characters +    is supported although it's not part of the standard. +    """ + +    impl = BYTEA + +    def process_bind_param(self, value, dialect): +        """Convert hex string to bytes.""" +        if value is None: +            return None + +        if isinstance(value, str): +            if value.startswith("\\x") or value.startswith("0x"): +                value = value[2:] + +            if len(value) % 2: +                value = f"0{value}" + +            try: +                value = bytes.fromhex(value) +            except ValueError as ex: +                raise ValueError(f"Invalid hexadecimal string: {value}") from ex + +        if not isinstance(value, (bytearray, memoryview, bytes)): +            raise TypeError( +                "HexByteString columns support only bytes or hex string values. " +                f"{type(value)} is not supported" +            ) + +        return value diff --git a/target_postgres/sinks.py b/target_postgres/sinks.py index 6f3714c6..0ae09c37 100644 --- a/target_postgres/sinks.py +++ b/target_postgres/sinks.py @@ -1,7 +1,7 @@ """Postgres target sink class, which handles writing streams.""" import uuid -from typing import Any, Dict, Iterable, List, Optional, Union, cast +from typing import Any, Dict, Iterable, List, Optional, Sequence, Union, cast import sqlalchemy as sa from pendulum import now @@ -118,7 +118,7 @@ def bulk_insert_records( # type: ignore[override] # noqa: PLR0913 self, table: sa.Table, schema: dict, records: Iterable[Dict[str, Any]], - primary_keys: List[str], + primary_keys: Sequence[str], connection: sa.engine.Connection, ) -> Optional[int]: """Bulk insert records to an existing destination table. 
@@ -175,7 +175,7 @@ def upsert( # noqa: PLR0913 from_table: sa.Table, to_table: sa.Table, schema: dict, - join_keys: List[str], + join_keys: Sequence[str], connection: sa.engine.Connection, ) -> Optional[int]: """Merge upsert data from one table to another. diff --git a/target_postgres/target.py b/target_postgres/target.py index ed83d6fa..57b45034 100644 --- a/target_postgres/target.py +++ b/target_postgres/target.py @@ -192,6 +192,17 @@ def __init__( + "for more information." ), ), + th.Property( + "interpret_content_encoding", + th.BooleanType, + default=False, + description=( + "If set to true, the target will interpret the content encoding of the " + "schema to determine how to store the data. Using this option may " + "result in a more efficient storage of the data but may also result " + "in an error if the data is not encoded as expected." + ), + ), th.Property( "ssl_enable", th.BooleanType, diff --git a/target_postgres/tests/data_files/base16_content_encoding_interpreted.singer b/target_postgres/tests/data_files/base16_content_encoding_interpreted.singer new file mode 100644 index 00000000..21c624c4 --- /dev/null +++ b/target_postgres/tests/data_files/base16_content_encoding_interpreted.singer @@ -0,0 +1,8 @@ +{"type":"SCHEMA","stream":"test_base_16_encoding_interpreted","schema":{"type":"object","properties":{"id":{"type":"string"},"contract_address":{"type":"string","contentEncoding":"base16"},"raw_event_data":{"type":["string","null"],"contentEncoding":"base16"}},"required":["id","contract_address","raw_event_data"]},"key_properties":["id"]} +{"type":"RECORD","stream":"test_base_16_encoding_interpreted","record":{"id":"test_handle_an_hex_str","contract_address":"0xA1B2C3D4E5F607080910","raw_event_data":"0xA1B2C3D4E5F60708091001020304050607080910010203040506070809100102030405060708091001020304050607080910"},"time_extracted":"2023-09-15T19:33:01.841018+00:00"} 
+{"type":"RECORD","stream":"test_base_16_encoding_interpreted","record":{"id":"empty_0x_str","contract_address":"0x","raw_event_data":"0x"},"time_extracted":"2023-09-15T19:33:01.841018+00:00"} +{"type":"RECORD","stream":"test_base_16_encoding_interpreted","record":{"id":"empty_str","contract_address":"","raw_event_data":""},"time_extracted":"2023-09-15T19:33:01.841018+00:00"} +{"type":"RECORD","stream":"test_base_16_encoding_interpreted","record":{"id":"test_nullable_field","contract_address":"","raw_event_data":null},"time_extracted":"2023-09-15T19:33:01.841018+00:00"} +{"type":"RECORD","stream":"test_base_16_encoding_interpreted","record":{"id":"test_handle_hex_without_the_0x_prefix","contract_address":"A1B2C3D4E5F607080910","raw_event_data":"A1B2C3D4E5F6070809100102030405060"},"time_extracted":"2023-09-15T19:33:01.841018+00:00"} +{"type":"RECORD","stream":"test_base_16_encoding_interpreted","record":{"id":"test_handle_odd_and_even_number_of_chars","contract_address":"0xA1","raw_event_data":"A12"},"time_extracted":"2023-09-15T19:33:01.841018+00:00"} +{"type":"RECORD","stream":"test_base_16_encoding_interpreted","record":{"id":"test_handle_upper_and_lowercase_hex","contract_address":"0xa1","raw_event_data":"A12b"},"time_extracted":"2023-09-15T19:33:01.841018+00:00"} diff --git a/target_postgres/tests/data_files/base16_content_encoding_not_interpreted.singer b/target_postgres/tests/data_files/base16_content_encoding_not_interpreted.singer new file mode 100644 index 00000000..23d5add6 --- /dev/null +++ b/target_postgres/tests/data_files/base16_content_encoding_not_interpreted.singer @@ -0,0 +1,8 @@ +{"type":"SCHEMA","stream":"test_base_16_encoding_not_interpreted","schema":{"type":"object","properties":{"id":{"type":"string"},"contract_address":{"type":"string","contentEncoding":"base16"},"raw_event_data":{"type":["string","null"],"contentEncoding":"base16"}},"required":["id","contract_address","raw_event_data"]},"key_properties":["id"]} 
+{"type":"RECORD","stream":"test_base_16_encoding_not_interpreted","record":{"id":"test_handle_an_hex_str","contract_address":"0xA1B2C3D4E5F607080910","raw_event_data":"0xA1B2C3D4E5F60708091001020304050607080910010203040506070809100102030405060708091001020304050607080910"},"time_extracted":"2023-09-15T19:33:01.841018+00:00"} +{"type":"RECORD","stream":"test_base_16_encoding_not_interpreted","record":{"id":"empty_0x_str","contract_address":"0x","raw_event_data":"0x"},"time_extracted":"2023-09-15T19:33:01.841018+00:00"} +{"type":"RECORD","stream":"test_base_16_encoding_not_interpreted","record":{"id":"empty_str","contract_address":"","raw_event_data":""},"time_extracted":"2023-09-15T19:33:01.841018+00:00"} +{"type":"RECORD","stream":"test_base_16_encoding_not_interpreted","record":{"id":"test_nullable_field","contract_address":"","raw_event_data":null},"time_extracted":"2023-09-15T19:33:01.841018+00:00"} +{"type":"RECORD","stream":"test_base_16_encoding_not_interpreted","record":{"id":"test_handle_hex_without_the_0x_prefix","contract_address":"A1B2C3D4E5F607080910","raw_event_data":"A1B2C3D4E5F6070809100102030405060"},"time_extracted":"2023-09-15T19:33:01.841018+00:00"} +{"type":"RECORD","stream":"test_base_16_encoding_not_interpreted","record":{"id":"test_handle_odd_and_even_number_of_chars","contract_address":"0xA1","raw_event_data":"A12"},"time_extracted":"2023-09-15T19:33:01.841018+00:00"} +{"type":"RECORD","stream":"test_base_16_encoding_not_interpreted","record":{"id":"test_handle_upper_and_lowercase_hex","contract_address":"0xa1","raw_event_data":"A12b"},"time_extracted":"2023-09-15T19:33:01.841018+00:00"} diff --git a/target_postgres/tests/test_target_postgres.py b/target_postgres/tests/test_target_postgres.py index 23ba3133..fb189b7c 100644 --- a/target_postgres/tests/test_target_postgres.py +++ b/target_postgres/tests/test_target_postgres.py @@ -10,7 +10,7 @@ import jsonschema import pytest import sqlalchemy -from singer_sdk.exceptions import 
MissingKeyPropertiesError +from singer_sdk.exceptions import InvalidRecord, MissingKeyPropertiesError from singer_sdk.testing import get_target_test_class, sync_end_to_end from sqlalchemy.dialects.postgresql import ARRAY from sqlalchemy.types import TEXT, TIMESTAMP @@ -126,6 +126,14 @@ def verify_data( result_dict = [ remove_metadata_columns(row._asdict()) for row in result.all() ] + + # bytea columns are returned as memoryview objects + # we need to convert them to bytes to allow comparison with check_data + for row in result_dict: + for col in row: + if isinstance(row[col], memoryview): + row[col] = bytes(row[col]) + assert result_dict == check_data else: raise ValueError("Invalid check_data - not dict or list of dicts") @@ -230,7 +238,7 @@ def test_record_missing_key_property(postgres_target): def test_record_missing_required_property(postgres_target): - with pytest.raises(jsonschema.exceptions.ValidationError): + with pytest.raises(InvalidRecord): file_name = "record_missing_required_property.singer" singer_file_to_target(file_name, postgres_target) @@ -489,6 +497,78 @@ def test_new_array_column(postgres_target): singer_file_to_target(file_name, postgres_target) +def test_base16_content_encoding_not_interpreted(postgres_config_no_ssl): + """Make sure we can insert base16 encoded data into the database without interpretation""" + postgres_config_modified = copy.deepcopy(postgres_config_no_ssl) + postgres_config_modified["interpret_content_encoding"] = False + target = TargetPostgres(config=postgres_config_modified) + + singer_file_to_target("base16_content_encoding_not_interpreted.singer", target) + + rows = [ + {"id": "empty_0x_str", "contract_address": "0x", "raw_event_data": "0x"}, + {"id": "empty_str", "contract_address": "", "raw_event_data": ""}, + { + "id": "test_handle_an_hex_str", + "contract_address": "0xA1B2C3D4E5F607080910", + "raw_event_data": "0xA1B2C3D4E5F60708091001020304050607080910010203040506070809100102030405060708091001020304050607080910", + 
}, + { + "id": "test_handle_hex_without_the_0x_prefix", + "contract_address": "A1B2C3D4E5F607080910", + "raw_event_data": "A1B2C3D4E5F6070809100102030405060", + }, + { + "id": "test_handle_odd_and_even_number_of_chars", + "contract_address": "0xA1", + "raw_event_data": "A12", + }, + { + "id": "test_handle_upper_and_lowercase_hex", + "contract_address": "0xa1", + "raw_event_data": "A12b", + }, + {"id": "test_nullable_field", "contract_address": "", "raw_event_data": None}, + ] + verify_data(target, "test_base_16_encoding_not_interpreted", 7, "id", rows) + + +def test_base16_content_encoding_interpreted(postgres_config_no_ssl): + """Make sure we can insert base16 encoded data into the database with interpretation""" + postgres_config_modified = copy.deepcopy(postgres_config_no_ssl) + postgres_config_modified["interpret_content_encoding"] = True + target = TargetPostgres(config=postgres_config_modified) + + singer_file_to_target("base16_content_encoding_interpreted.singer", target) + + rows = [ + {"id": "empty_0x_str", "contract_address": b"", "raw_event_data": b""}, + {"id": "empty_str", "contract_address": b"", "raw_event_data": b""}, + { + "id": "test_handle_an_hex_str", + "contract_address": b"\xa1\xb2\xc3\xd4\xe5\xf6\x07\x08\x09\x10", + "raw_event_data": b"\xa1\xb2\xc3\xd4\xe5\xf6\x07\x08\x09\x10\x01\x02\x03\x04\x05\x06\x07\x08\x09\x10\x01\x02\x03\x04\x05\x06\x07\x08\x09\x10\x01\x02\x03\x04\x05\x06\x07\x08\x09\x10\x01\x02\x03\x04\x05\x06\x07\x08\x09\x10", + }, + { + "id": "test_handle_hex_without_the_0x_prefix", + "contract_address": b"\xa1\xb2\xc3\xd4\xe5\xf6\x07\x08\x09\x10", + "raw_event_data": b"\x0a\x1b\x2c\x3d\x4e\x5f\x60\x70\x80\x91\x00\x10\x20\x30\x40\x50\x60", + }, + { + "id": "test_handle_odd_and_even_number_of_chars", + "contract_address": b"\xa1", + "raw_event_data": b"\x0a\x12", + }, + { + "id": "test_handle_upper_and_lowercase_hex", + "contract_address": b"\xa1", + "raw_event_data": b"\xa1\x2b", + }, + {"id": "test_nullable_field", 
"contract_address": b"", "raw_event_data": None}, + ] + verify_data(target, "test_base_16_encoding_interpreted", 7, "id", rows) + + def test_activate_version_hard_delete(postgres_config_no_ssl): """Activate Version Hard Delete Test""" table_name = "test_activate_version_hard"