forked from CanCLID/ToJyutping
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.py
74 lines (60 loc) · 1.72 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from opencc import OpenCC
import os
# Converter function built from OpenCC's 't2s' configuration
# (Traditional Chinese -> Simplified Chinese); used by main() on dictionary keys.
t2s = OpenCC('t2s').convert
# Fetch the rime-cantonese dictionary at a pinned commit (5b6d334) into the
# current directory. `-nc` (no-clobber) makes wget skip the download when the
# file already exists. NOTE(review): the exit status of os.system is ignored,
# so a failed download only surfaces later when the file is opened.
os.system('wget -nc https://raw.githubusercontent.com/rime/rime-cantonese/5b6d334/jyut6ping3.dict.yaml')
def freq_str_to_float(s):
    '''Convert frequency data in the dictionary file to float.

    A trailing ``%`` marks a percentage, which is scaled by 0.01.

    Raises ``ValueError`` when the text is not a valid number; this includes
    the empty string, so callers only need to handle one exception type.

    >>> freq_str_to_float('2')
    2.0
    >>> freq_str_to_float('2%')
    0.02
    '''
    # endswith() is safe on an empty string, unlike s[-1]; the empty case then
    # falls through to float(''), which raises ValueError.
    if s.endswith('%'):
        return float(s[:-1]) * 0.01
    return float(s)
# Frequency assigned to entries that have no explicit frequency column.
DEFAULT_FREQ = 0.07


def build_dict(d, filepath):
    '''Create a dictionary of all the words with jyutping data.

    Reads the rime dictionary at ``filepath`` and fills ``d`` in place with
    ``word -> (jyutping, frequency)``.
    If there are multiple possibilities, the one with higher frequency is used;
    on equal frequency a reading ending in tone 2 replaces one that does not.

    Lines are tab-separated: ``word<TAB>jyutping`` or
    ``word<TAB>jyutping<TAB>frequency``. Comment lines (starting with ``#``),
    lines with any other number of columns, and lines whose frequency cannot
    be parsed are skipped.
    '''
    # The dictionary contains CJK text, so do not rely on the locale encoding.
    with open(filepath, encoding='utf-8') as f:
        # Skip the YAML header, which ends with a line holding only '...'.
        for line in f:
            if line == '...\n':
                break
        # Drop the line immediately after the header terminator; the default
        # avoids StopIteration when nothing follows the header.
        next(f, None)
        for line in f:
            if not line or line[0] == '#':
                continue
            parts = line.rstrip().split('\t')
            if len(parts) == 2:
                word, jyutping = parts
                freq = DEFAULT_FREQ
            elif len(parts) == 3:
                word, jyutping, freq_text = parts
                try:
                    freq = freq_str_to_float(freq_text)
                except ValueError:
                    continue
            else:
                # Blank or malformed line: skip it rather than reusing the
                # values left over from the previous entry.
                continue

            # One syllable per character is expected; a single character is
            # accepted even with a multi-syllable reading (e.g. 瓩).
            has_valid_length = len(word) == jyutping.count(' ') + 1
            if not (has_valid_length or len(word) == 1):
                continue

            existing = d.get(word)
            if not existing:
                d[word] = (jyutping, freq)
                continue

            old_jyutping, old_freq = existing
            # On a frequency tie, prefer the changed-tone (tone 2) reading.
            prefers_tone_2 = (
                freq == old_freq
                and not old_jyutping.endswith('2')
                and jyutping.endswith('2')
            )
            if freq > old_freq or prefers_tone_2:
                d[word] = (jyutping, freq)
def write_dict(d):
    '''Write ``d`` to ``src/ToJyutping/jyut6ping3.simple.dict.yaml``.

    Each entry becomes one ``word<TAB>jyutping`` line, in the dictionary's
    iteration order. The target directory must already exist.
    '''
    # Keys are CJK text; write UTF-8 explicitly instead of the locale encoding.
    with open('src/ToJyutping/jyut6ping3.simple.dict.yaml', 'w', encoding='utf-8') as f:
        f.writelines(f'{word}\t{jyutping}\n' for word, jyutping in d.items())
def main():
    '''Build the simplified lookup table and write it to disk.

    Parses ``jyut6ping3.dict.yaml``, derives Simplified Chinese keys from the
    Traditional ones with ``t2s``, and writes the merged word -> jyutping
    mapping via ``write_dict``.
    '''
    d = {}
    build_dict(d, 'jyut6ping3.dict.yaml')

    # Traditional-script entries: keep only the pronunciation.
    d_t = {word: jyutping for word, (jyutping, _) in d.items()}

    # Several Traditional words can convert to the same Simplified word.
    # Keep the reading with the higher frequency instead of whichever happens
    # to be iterated last; on a tie the later entry still wins, as before.
    best_cn = {}
    for word, (jyutping, freq) in d.items():
        simplified = t2s(word)
        current = best_cn.get(simplified)
        if current is None or freq >= current[1]:
            best_cn[simplified] = (jyutping, freq)
    d_cn = {word: jyutping for word, (jyutping, _) in best_cn.items()}

    # Traditional entries take precedence when a key exists in both tables.
    write_dict({**d_cn, **d_t})
# Unguarded entry point: the whole pipeline runs whenever this file is
# executed (or imported), since there is no `if __name__ == '__main__'` check.
main()