From cc9a54119b0bb3b726ed1c75310437a78b57b072 Mon Sep 17 00:00:00 2001 From: Gavin Chait Date: Mon, 12 Feb 2024 12:20:52 +0100 Subject: [PATCH] Fixes for special characters in field names Fixes for where source field names include special characters (newlines / tabs) or characters used in scripts. As whyqd is used more widely, this may need a thorough review. --- docs/changelog.md | 6 +++++- pyproject.toml | 2 +- whyqd/VERSION | 2 +- whyqd/parsers/action.py | 5 ++++- whyqd/parsers/category.py | 3 +-- whyqd/parsers/morph.py | 2 +- whyqd/parsers/script.py | 12 +++++++++--- 7 files changed, 22 insertions(+), 10 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index 7ae405a..af0f55b 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -3,11 +3,15 @@ title: Change log summary: Version history, including for legacy versions. authors: - Gavin Chait -date: 2023-12-12 +date: 2024-02-12 tags: wrangling, crosswalks, versions --- # Change log +## Version 1.1.1 (2024-02-12) + +- Fixes for where source field names include special characters (newlines / tabs) or characters used in scripts. As whyqd is used more widely, this may need a thorough review. 
+ ## Version 1.1.0 (2023-12-12) - Fixes to tests diff --git a/pyproject.toml b/pyproject.toml index 93d12f5..f24aa2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "whyqd" -version = "1.1.0" +version = "1.1.1" description = "data wrangling simplicity, complete audit transparency, and at speed" authors = ["Gavin Chait "] license = "BSD-3-Clause" diff --git a/whyqd/VERSION b/whyqd/VERSION index 1cc5f65..8cfbc90 100644 --- a/whyqd/VERSION +++ b/whyqd/VERSION @@ -1 +1 @@ -1.1.0 \ No newline at end of file +1.1.1 \ No newline at end of file diff --git a/whyqd/parsers/action.py b/whyqd/parsers/action.py index b3b2776..3de4203 100644 --- a/whyqd/parsers/action.py +++ b/whyqd/parsers/action.py @@ -127,7 +127,7 @@ def parse( # replace the txt with hx parsed_stack = parsed_stack.replace(f"[{txt}]", hx) i_prsed = [] - for s in self.parser.get_split_terms(script=parsed_stack, by=","): + for s in self.parser.get_split_terms(script=parsed_stack, by=",", maxsplit=-1): splt = self.parser.get_split_terms(script=s, by="<") if len(splt) == 1: i_prsed.extend(self.parser.get_listed_literal(text=s)) @@ -293,6 +293,9 @@ def recover_fields_from_hexed_script( if not isinstance(parsed, list): parsed = [parsed] for term in parsed: + if not term: + # Blank string artifacts can be introduced + continue recovered = None if isinstance(term, str) and term in modifier_names: recovered = action.get_modifier(term=term) diff --git a/whyqd/parsers/category.py b/whyqd/parsers/category.py index 3b5949d..ff198e2 100644 --- a/whyqd/parsers/category.py +++ b/whyqd/parsers/category.py @@ -185,7 +185,6 @@ def set_schema( self.schema_source = schema_source self.schema_destination = schema_destination - def get_schema_field_category(self, *, field: FieldModel, term: str, is_source: bool = True) -> CategoryModel | None: """ Recover a field category model from a string. 
It is possible that source and destination schema category share @@ -281,4 +280,4 @@ def get_assigned_uniques(self, *, text: str) -> list[str]: terms = list(self.parser.generate_contents(text=text)) if len(terms) != 1: raise ValueError(f"Category assignment actions must not be nested. ({text}).") - return [self.parser.get_literal(text=t) for t in self.parser.get_split_terms(script=terms[0][1], by=",")] + return [self.parser.get_literal(text=t) for t in self.parser.get_split_terms(script=terms[0][1], by=",", maxsplit=-1)] diff --git a/whyqd/parsers/morph.py b/whyqd/parsers/morph.py index 74d2a07..399f8fb 100644 --- a/whyqd/parsers/morph.py +++ b/whyqd/parsers/morph.py @@ -206,4 +206,4 @@ def get_morph_struts(self, *, term: str) -> list[str]: return [term] if len(terms) != 1: raise ValueError(f"Morph actions must not be nested. ({term}).") - return self.parser.get_split_terms(script=terms[0][1], by=",") + return self.parser.get_split_terms(script=terms[0][1], by=",", maxsplit=-1) diff --git a/whyqd/parsers/script.py b/whyqd/parsers/script.py index 0d992bb..1577ec6 100644 --- a/whyqd/parsers/script.py +++ b/whyqd/parsers/script.py @@ -146,15 +146,21 @@ def generate_contents(self, *, text) -> list[tuple[int, str]]: start = stack.pop() yield (len(stack), text[start + 1 : i]) - def get_split_terms(self, *, script: str, by: str) -> list[str]: - return [s.strip() for s in script.split(by)] + def get_split_terms(self, *, script: str, by: str, maxsplit: int = 1) -> list[str]: + # https://docs.python.org/3/library/stdtypes.html#str.split + # str.split(sep=None, maxsplit=-1) + return [s.strip() for s in script.split(sep=by, maxsplit=maxsplit)] def get_literal(self, *, text: str) -> str: literal = text try: literal = ast.literal_eval(text) - except ValueError: + except (ValueError, TypeError): pass + except SyntaxError: + # `literal_eval` strips special characters, leading to syntax errors + if text.startswith("'") and text.endswith("'"): + literal = text[1:-1] return literal 
def get_listed_literal(self, *, text: str) -> list[str]: