Commit

pona: o kepeken nasin utf-8 lon ilo open (fix: use UTF-8 encoding in open)
gregdan3 committed Oct 16, 2024
1 parent 9e5ddab commit 9e3c3e4
Showing 3 changed files with 9 additions and 9 deletions.
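
The fix is mechanical: every open() call gains an explicit encoding="utf-8", so reading and writing no longer depend on the host's locale encoding. A minimal standalone sketch of the failure mode this guards against (the file name and sample data below are hypothetical, not from the repository):

import json
import tempfile
from pathlib import Path

# Hypothetical sample standing in for data that contains non-ASCII text.
data = {"toki": {"def": "言葉、言語"}}
path = Path(tempfile.gettempdir()) / "linku_demo.json"

# Without encoding="utf-8", open() falls back to the platform's preferred
# locale encoding (e.g. cp1252 on many Windows installs); writing non-ASCII
# text can then raise UnicodeEncodeError, and reading UTF-8 bytes back can
# mis-decode or raise UnicodeDecodeError.
with open(path, "w", encoding="utf-8") as f:
    f.write(json.dumps(data, ensure_ascii=False))

with open(path, "r", encoding="utf-8") as f:
    assert json.loads(f.read()) == data  # round-trips on any platform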
8 changes: 4 additions & 4 deletions src/sonatoki/__main__.py
@@ -60,11 +60,11 @@ def download_json(url: str) -> Dict[str, Any]:
 
 def regen_linku_data():
     data = download_json(LINKU_WORDS)
-    with open(os.path.join(HERE, "linku.json"), "w") as f:
+    with open(os.path.join(HERE, "linku.json"), "w", encoding="utf-8") as f:
         _ = f.write(json.dumps(data))
 
     data = download_json(LINKU_SANDBOX)
-    with open(os.path.join(HERE, "sandbox.json"), "w") as f:
+    with open(os.path.join(HERE, "sandbox.json"), "w", encoding="utf-8") as f:
         _ = f.write(json.dumps(data))
 
 
@@ -96,11 +96,11 @@ def regen_false_negatives():
            continue
 
     # TODO: include short matches or no?
-    with open(os.path.join(HERE, "syllabic.txt"), "w") as f:
+    with open(os.path.join(HERE, "syllabic.txt"), "w", encoding="utf-8") as f:
         syllabic_final = sorted([word + "\n" for word in syllabic_matches])
         f.writelines(syllabic_final)
 
-    with open(os.path.join(HERE, "alphabetic.txt"), "w") as f:
+    with open(os.path.join(HERE, "alphabetic.txt"), "w", encoding="utf-8") as f:
         alphabetic_final = sorted([word + "\n" for word in alphabetic_matches])
         f.writelines(alphabetic_final)
 
8 changes: 4 additions & 4 deletions src/sonatoki/constants.py
@@ -699,9 +699,9 @@ def linku_data() -> Dict[str, LinkuWord]:
     # NOTE: this does open+read+parse two files each time you construct a filter
     # but i expect users to construct filters only at the start of runtime
     # there is no reason to waste your RAM by leaving the linku data in it
-    with open(LINKU) as f:
+    with open(LINKU, "r", encoding="utf-8") as f:
         linku: Dict[str, LinkuWord] = json.loads(f.read())
-    with open(SANDBOX) as f:
+    with open(SANDBOX, "r", encoding="utf-8") as f:
         sandbox: Dict[str, LinkuWord] = json.loads(f.read())
 
     return {**linku, **sandbox}
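
The NOTE in this hunk explains the design choice: linku_data() re-opens and re-parses both JSON files on every call because filters are expected to be constructed once at startup, so nothing is kept in RAM. A hedged sketch of the opposite trade-off, for a caller that did build filters repeatedly (not part of this commit; it only assumes the linku_data function shown above is importable):

from functools import lru_cache

from sonatoki.constants import linku_data

@lru_cache(maxsize=1)
def cached_linku_data():
    # Parses linku.json and sandbox.json once, then keeps the merged dict in
    # memory for later calls, which is the RAM cost the NOTE above chooses to avoid.
    return linku_data()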
@@ -732,10 +732,10 @@ def words_by_usage(
 NIMI_PU_SYNONYMS = {"namako", "kin", "oko"}
 
 
-# with open(SYLLABICS) as f:
+# with open(SYLLABICS, "r", encoding="utf-8") as f:
 #     FALSE_POS_SYLLABIC = {line.strip() for line in f}
 #
-# with open(ALPHABETICS) as f:
+# with open(ALPHABETICS, "r", encoding="utf-8") as f:
 #     FALSE_POS_ALPHABETIC = {line.strip() for line in f}
 
 __all__ = [
2 changes: 1 addition & 1 deletion tests/test_tokenize.py
@@ -25,7 +25,7 @@ class TokenizerTest(TypedDict):
 
 
 def load_params_from_yaml(json_path: str) -> List[TokenizerTest]:
-    with open(json_path) as f:
+    with open(json_path, "r", encoding="utf-8") as f:
         return yaml.safe_load(f)
 
 
