diff --git a/README.md b/README.md index 5b50808e..116750a6 100644 --- a/README.md +++ b/README.md @@ -275,3 +275,41 @@ Example data: ``` For convenience, data prefixed with `0x` or containing an odd number of characters is supported although it's not part of the standard. + +## String enum support + +This target can store columns with the json-schema "enum" type as a custom PostgreSQL ENUM type instead of a TEXT column. This can save space and improve performance. + +It is an opt-in feature because it might result in data loss if the actual data does not match the enum values advertised by the schema. + +Please consider the following downsides before activating this feature: +- it changes the sort behavior of the resulting column +- string operations will not be available +- portability of the data is reduced +- it is not possible to add, remove, or modify the enum values +- enums are not shared across tables; each column gets its own custom type + +To enable it, set the `storage_optimized_enum` option to `True`. + +Example schema: +```json +{ + "type": "object", + "properties": { + "my_enum": { + "type": "string", + "enum": ["foo", "bar", "baz"] + } + } +} +``` + +Data will be stored as a custom enum type in the database. The type name will be `enum_{hash}_{property_name}`. The type will be created in the same schema as the table. 
+ +Example generated SQL: +```sql +CREATE TYPE enum_123456_my_enum AS ENUM ('foo', 'bar', 'baz'); +CREATE TABLE my_table ( + my_enum enum_123456_my_enum +); +``` diff --git a/target_postgres/connector.py b/target_postgres/connector.py index 273f777f..b9ec1639 100644 --- a/target_postgres/connector.py +++ b/target_postgres/connector.py @@ -105,11 +105,11 @@ def storage_optimized_enum(self) -> bool: Please consider these several downsides to take into consideration before activating: - - it changes the sort behavior of the column + - it changes the sort behavior of the resulting column - string operations will not be available - portability of the data is reduced - it is not possible to add remove or modify the enum values - - enums are shared across all tables in the schema + - enums are not shared across tables, each column gets its own custom type Returns: True if the feature is enabled, False otherwise. @@ -363,7 +363,9 @@ def pick_individual_type( ): return HexByteString() if self.storage_optimized_enum and jsonschema_type.get("enum"): - # make sure that the enum name is unique + # make sure that the enum name is unique and that the uniqueness part + # can be determined using the first 71 characters of the enum name + # this is a limitation of postgres type names hasher = hashlib.md5(usedforsecurity=False) hasher.update(f"{schema_name}__{table_name}__{property_name}".encode()) hash_str = hasher.hexdigest() diff --git a/target_postgres/target.py b/target_postgres/target.py index 8587102d..f497c941 100644 --- a/target_postgres/target.py +++ b/target_postgres/target.py @@ -200,6 +200,23 @@ def __init__( "in an error if the data is not encoded as expected." ), ), + th.Property( + "storage_optimized_enum", + th.BooleanType, + default=False, + description=( + "If set to true, the target will store enum values as a custom pg type " + "instead of a text column. This can save space and improve performance," + " but may make the data harder to query and analyze." 
+ "Please consider these several downsides to take into consideration " + "before activating this feature:" + " it changes the sort behavior of the resulting column," + " string operations will not be available," + " portability of the data is reduced," + " it is not possible to add remove or modify the enum values," + " enums are not shared accross tables." + ), + ), th.Property( "ssl_enable", th.BooleanType,