Merge branch 'main' into edgarrmondragon/chore/refactors-typos-cleanup

MeltanoLabs · Feb 6, 2024 · 0fc435f · 0fc435f
2 parents a2ff536 + 557c9da
commit 0fc435f
Show file tree

Hide file tree

Showing 10 changed files with 273 additions and 71 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -18,7 +18,7 @@ repos:
   - id: trailing-whitespace
 
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.1.14
+  rev: v0.2.0
   hooks:
   - id: ruff
     args: [--fix]

diff --git a/README.md b/README.md
@@ -110,6 +110,17 @@ pre-commit install
 
 ### Create and Run Tests
 
+Set up the SSL file permissions:
+
+```bash
+chmod 0600 .ssl/*.key
+```
+
+Start the test databases using Docker Compose:
+```bash
+docker-compose up -d
+```
+
 Create tests within the `target_postgres/tests` subfolder and
   then run:
 
@@ -163,7 +174,7 @@ The below table shows how this tap will map between jsonschema datatypes and Pos
 | UNSUPPORTED                    | bit varying [ (n) ]                     |
 | boolean                        | boolean                                 |
 | UNSUPPORTED                    | box                                     |
-| UNSUPPORTED                    | bytea                                   |
+| string with contentEncoding="base16" ([opt-in feature](#content-encoding-support)) | bytea                                   |
 | UNSUPPORTED                    | character [ (n) ]                       |
 | UNSUPPORTED                    | character varying [ (n) ]               |
 | UNSUPPORTED                    | cidr                                    |
@@ -204,6 +215,7 @@ The below table shows how this tap will map between jsonschema datatypes and Pos
 Note that while object types are mapped directly to jsonb, array types are mapped to a jsonb array.
 
 If a column has multiple jsonschema types, the following order is used to order Postgres types, from highest priority to lowest priority.
+- BYTEA
 - ARRAY(JSONB)
 - JSONB
 - TEXT
@@ -216,3 +228,50 @@ If a column has multiple jsonschema types, the following order is using to order
 - INTEGER
 - BOOLEAN
 - NOTYPE
+
+## Content Encoding Support
+
+JSON Schema supports the [`contentEncoding` keyword](https://json-schema.org/draft/2020-12/draft-bhutton-json-schema-validation-00#rfc.section.8.3), which can be used to specify the encoding of input string types.
+
+This target can detect content encoding clues in the schema to determine how to store the data in Postgres in a more efficient way.
+
+Content encoding interpretation is disabled by default. This is because the default config is meant to be as permissive as possible, and does not make any assumptions about the data that could lead to data loss.
+
+However, if you know your data respects the advertised content encoding, you can enable this feature to get better performance and storage efficiency.
+
+To enable it, set the `interpret_content_encoding` option to `True`.
+
+### base16
+
+The string is encoded using the base16 encoding, as defined in [RFC 4648](https://datatracker.ietf.org/doc/html/rfc4648#section-8).
+
+Example schema:
+```json
+{
+  "type": "object",
+  "properties": {
+    "my_hex": {
+      "type": "string",
+      "contentEncoding": "base16"
+    }
+  }
+}
+```
+
+Data will be stored as a `bytea` in the database.
+
+Example data:
+```json
+# valid data
+{ "my_hex": "01AF" }
+{ "my_hex": "01af" }
+{ "my_hex": "1af" }
+{ "my_hex": "0x1234" }
+
+# invalid data
+{ "my_hex": " 0x1234 " }
+{ "my_hex": "House" }
+```
+
+For convenience, data prefixed with `0x` or containing an odd number of characters is supported although it's not part of the standard.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -37,7 +37,7 @@ sqlalchemy = "~=2.0"
 sshtunnel = "0.4.0"
 
 [tool.poetry.dependencies.singer-sdk]
-version = "~=0.34.0"
+version = "~=0.35.0"
 
 [tool.poetry.group.dev.dependencies]
 pytest = ">=7.4.2"