Skip to content

Commit

Permalink
Fixes for special characters in field names
Browse files Browse the repository at this point in the history
Fixes for cases where source field names include special characters (newlines / tabs) or characters used in scripts. As whyqd is used more widely, this may need a thorough review.
  • Loading branch information
turukawa committed Feb 12, 2024
1 parent 7137e15 commit cc9a541
Show file tree
Hide file tree
Showing 7 changed files with 22 additions and 10 deletions.
6 changes: 5 additions & 1 deletion docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,15 @@ title: Change log
summary: Version history, including for legacy versions.
authors:
- Gavin Chait
date: 2023-12-12
date: 2024-02-12
tags: wrangling, crosswalks, versions
---
# Change log

## Version 1.1.1 (2024-02-12)

- Fixes for cases where source field names include special characters (newlines / tabs) or characters used in scripts. As whyqd is used more widely, this may need a thorough review.

## Version 1.1.0 (2023-12-12)

- Fixes to tests
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "whyqd"
version = "1.1.0"
version = "1.1.1"
description = "data wrangling simplicity, complete audit transparency, and at speed"
authors = ["Gavin Chait <[email protected]>"]
license = "BSD-3-Clause"
Expand Down
2 changes: 1 addition & 1 deletion whyqd/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.1.0
1.1.1
5 changes: 4 additions & 1 deletion whyqd/parsers/action.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def parse(
# replace the txt with hx
parsed_stack = parsed_stack.replace(f"[{txt}]", hx)
i_prsed = []
for s in self.parser.get_split_terms(script=parsed_stack, by=","):
for s in self.parser.get_split_terms(script=parsed_stack, by=",", maxsplit=-1):
splt = self.parser.get_split_terms(script=s, by="<")
if len(splt) == 1:
i_prsed.extend(self.parser.get_listed_literal(text=s))
Expand Down Expand Up @@ -293,6 +293,9 @@ def recover_fields_from_hexed_script(
if not isinstance(parsed, list):
parsed = [parsed]
for term in parsed:
if not term:
# Blank string artifacts can be introduced
continue
recovered = None
if isinstance(term, str) and term in modifier_names:
recovered = action.get_modifier(term=term)
Expand Down
3 changes: 1 addition & 2 deletions whyqd/parsers/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,6 @@ def set_schema(
self.schema_source = schema_source
self.schema_destination = schema_destination


def get_schema_field_category(self, *, field: FieldModel, term: str, is_source: bool = True) -> CategoryModel | None:
"""
Recover a field category model from a string. It is possible that source and destination schema category share
Expand Down Expand Up @@ -281,4 +280,4 @@ def get_assigned_uniques(self, *, text: str) -> list[str]:
terms = list(self.parser.generate_contents(text=text))
if len(terms) != 1:
raise ValueError(f"Category assignment actions must not be nested. ({text}).")
return [self.parser.get_literal(text=t) for t in self.parser.get_split_terms(script=terms[0][1], by=",")]
return [self.parser.get_literal(text=t) for t in self.parser.get_split_terms(script=terms[0][1], by=",", maxsplit=-1)]
2 changes: 1 addition & 1 deletion whyqd/parsers/morph.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,4 +206,4 @@ def get_morph_struts(self, *, term: str) -> list[str]:
return [term]
if len(terms) != 1:
raise ValueError(f"Morph actions must not be nested. ({term}).")
return self.parser.get_split_terms(script=terms[0][1], by=",")
return self.parser.get_split_terms(script=terms[0][1], by=",", maxsplit=-1)
12 changes: 9 additions & 3 deletions whyqd/parsers/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,15 +146,21 @@ def generate_contents(self, *, text) -> list[tuple[int, str]]:
start = stack.pop()
yield (len(stack), text[start + 1 : i])

def get_split_terms(self, *, script: str, by: str) -> list[str]:
return [s.strip() for s in script.split(by)]
def get_split_terms(self, *, script: str, by: str, maxsplit: int = 1) -> list[str]:
    """
    Split `script` on the separator `by` and strip surrounding whitespace from every piece.

    `maxsplit` is passed straight to `str.split`: the default of 1 splits on the first occurrence of `by` only
    (at most two pieces), while -1 splits on every occurrence.
    """
    # https://docs.python.org/3/library/stdtypes.html#str.split
    pieces = script.split(by, maxsplit)
    return list(map(str.strip, pieces))

def get_literal(self, *, text: str) -> str:
    """
    Convert a script term into the Python literal it represents.

    The term is evaluated with `ast.literal_eval`. If it is not a valid literal, the text is returned unchanged,
    except that a term wrapped in matching single or double quotes has that one pair of quotes removed.

    Parameters
    ----------
    text: str
        Term to convert.

    Returns
    -------
    str
        The evaluated literal (whatever type `ast.literal_eval` produces), or the original / unquoted text when
        the term cannot be evaluated.
    """
    try:
        return ast.literal_eval(text)
    except (ValueError, TypeError):
        # Not a literal expression (e.g. a bare name): keep the text exactly as given.
        return text
    except SyntaxError:
        # A raw special character such as a newline inside the quotes makes the term invalid Python source,
        # so remove the enclosing quotes by hand. Require two characters so a lone quote is not emptied.
        if len(text) >= 2:
            for quote in ("'", '"'):
                if text.startswith(quote) and text.endswith(quote):
                    return text[1:-1]
        return text

def get_listed_literal(self, *, text: str) -> list[str]:
Expand Down

0 comments on commit cc9a541

Please sign in to comment.