Commit

pona: o kepeken nasin utf-8 lon ilo open (fix: use UTF-8 encoding in open)
gregdan3 committed Oct 16, 2024
1 parent 9e5ddab commit 9e3c3e4
Showing 3 changed files with 9 additions and 9 deletions.
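
The fix is mechanical: every open() call gains an explicit encoding="utf-8", so reading and writing no longer depend on the host's locale encoding. A minimal standalone sketch of the failure mode this guards against (the file name and sample data below are hypothetical, not from the repository):

import json
import tempfile
from pathlib import Path

# Hypothetical sample standing in for data that contains non-ASCII text.
data = {"toki": {"def": "言葉、言語"}}
path = Path(tempfile.gettempdir()) / "linku_demo.json"

# Without encoding="utf-8", open() falls back to the platform's preferred
# locale encoding (e.g. cp1252 on many Windows installs); writing non-ASCII
# text can then raise UnicodeEncodeError, and reading UTF-8 bytes back can
# mis-decode or raise UnicodeDecodeError.
with open(path, "w", encoding="utf-8") as f:
    f.write(json.dumps(data, ensure_ascii=False))

with open(path, "r", encoding="utf-8") as f:
    assert json.loads(f.read()) == data  # round-trips on any platform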
8 changes: 4 additions & 4 deletions src/sonatoki/__main__.py
@@ -60,11 +60,11 @@ def download_json(url: str) -> Dict[str, Any]:
 
 def regen_linku_data():
     data = download_json(LINKU_WORDS)
-    with open(os.path.join(HERE, "linku.json"), "w") as f:
+    with open(os.path.join(HERE, "linku.json"), "w", encoding="utf-8") as f:
         _ = f.write(json.dumps(data))
 
     data = download_json(LINKU_SANDBOX)
-    with open(os.path.join(HERE, "sandbox.json"), "w") as f:
+    with open(os.path.join(HERE, "sandbox.json"), "w", encoding="utf-8") as f:
         _ = f.write(json.dumps(data))
 
 
@@ -96,11 +96,11 @@ def regen_false_negatives():
            continue
 
     # TODO: include short matches or no?
-    with open(os.path.join(HERE, "syllabic.txt"), "w") as f:
+    with open(os.path.join(HERE, "syllabic.txt"), "w", encoding="utf-8") as f:
         syllabic_final = sorted([word + "\n" for word in syllabic_matches])
         f.writelines(syllabic_final)
 
-    with open(os.path.join(HERE, "alphabetic.txt"), "w") as f:
+    with open(os.path.join(HERE, "alphabetic.txt"), "w", encoding="utf-8") as f:
         alphabetic_final = sorted([word + "\n" for word in alphabetic_matches])
         f.writelines(alphabetic_final)
 
8 changes: 4 additions & 4 deletions src/sonatoki/constants.py
@@ -699,9 +699,9 @@ def linku_data() -> Dict[str, LinkuWord]:
     # NOTE: this does open+read+parse two files each time you construct a filter
     # but i expect users to construct filters only at the start of runtime
     # there is no reason to waste your RAM by leaving the linku data in it
-    with open(LINKU) as f:
+    with open(LINKU, "r", encoding="utf-8") as f:
         linku: Dict[str, LinkuWord] = json.loads(f.read())
-    with open(SANDBOX) as f:
+    with open(SANDBOX, "r", encoding="utf-8") as f:
         sandbox: Dict[str, LinkuWord] = json.loads(f.read())
 
     return {**linku, **sandbox}
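
The NOTE in this hunk explains the design choice: linku_data() re-opens and re-parses both JSON files on every call because filters are expected to be constructed once at startup, so nothing is kept in RAM. A hedged sketch of the opposite trade-off, for a caller that did build filters repeatedly (not part of this commit; it only assumes the linku_data function shown above is importable):

from functools import lru_cache

from sonatoki.constants import linku_data

@lru_cache(maxsize=1)
def cached_linku_data():
    # Parses linku.json and sandbox.json once, then keeps the merged dict in
    # memory for later calls, which is the RAM cost the NOTE above chooses to avoid.
    return linku_data()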
@@ -732,10 +732,10 @@ def words_by_usage(
 NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}
 
 
-# with open(SYLLABICS) as f:
+# with open(SYLLABICS, "r", encoding="utf-8") as f:
 #     FALSE_POS_SYLLABIC = {line.strip() for line in f}
 #
-# with open(ALPHABETICS) as f:
+# with open(ALPHABETICS, "r", encoding="utf-8") as f:
 #     FALSE_POS_ALPHABETIC = {line.strip() for line in f}
 
 __all__ = [
2 changes: 1 addition & 1 deletion tests/test_tokenize.py
@@ -25,7 +25,7 @@ class TokenizerTest(TypedDict):
 
 
 def load_params_from_yaml(json_path: str) -> List[TokenizerTest]:
-    with open(json_path) as f:
+    with open(json_path, "r", encoding="utf-8") as f:
         return yaml.safe_load(f)
 
 
