fileformat.py
"""
encapsulates details of file format used for stemming.yaml, and lexicon YAMLs,
providing functions for loading the file and populating StemmingRuleSet or
Lexicon (with form and accent overrides)
"""
from collections import defaultdict
import yaml
from greek_accentuation.characters import strip_length as do_strip_length
from inflexion.lexicon import Lexicon
from inflexion.stemming import StemmingRuleSet
class RefDoesNotExistException(Exception):
pass
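

# A stemming YAML maps each morphological key either to a list of rule
# strings or to {"ref": <other key>} to share another key's rules. A
# minimal illustrative sketch (the keys and rules here are hypothetical,
# not taken from any shipped stemming.yaml):
#
#     PAI.1S:
#         - "ω"
#     PAI.2S:
#         ref: PAI.1S
#
# A rule string may also carry a ";"-separated annotation, e.g. "ω;late".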


def load_stemming(stemming_file, strip_length=False):
    ruleset = StemmingRuleSet()

    with open(stemming_file) as f:
        stemming_dict = yaml.safe_load(f)

    for key, rules in stemming_dict.items():
        # follow "ref" indirections (possibly chained) to the actual rules
        while isinstance(rules, dict) and "ref" in rules:
            if rules["ref"] in stemming_dict:
                rules = stemming_dict[rules["ref"]]
            else:
                raise RefDoesNotExistException(
                    "ref to {} which doesn't exist".format(rules["ref"]))

        for rule in rules:
            if strip_length:
                rule = do_strip_length(rule)
            if ";" in rule:
                # a rule of the form "<rule>;<annotation>"
                rule, annotation = rule.split(";")
                ruleset.add(key, rule, {annotation})
            else:
                ruleset.add(key, rule)

    return ruleset
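

# Typical use (the file name here is an assumption for illustration):
#
#     ruleset = load_stemming("stemming.yaml", strip_length=True)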


def split_stem_tags(stems):
    # stems is a "/"-separated string of stems; each stem may carry a
    # single ";"-separated tag, yielded as a (possibly empty) set
    for stem in stems.split("/"):
        if ";" in stem:
            stem, tag = stem.split(";")
            tag = {tag}
        else:
            tag = set()
        yield stem, tag
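
# For example (hypothetical stem strings):
#
#     >>> list(split_stem_tags("λυ/λυσ;late"))
#     [('λυ', set()), ('λυσ', {'late'})]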


def load_lexicon(lexicon_file, pre_processor=lambda x: x):
    lexicon = Lexicon()

    # maps the principal-part specifiers used in a lexicon entry's "stems"
    # section to regexes over morphological keys
    partnum_to_key_regex = {
        "1-": "P",
        "1-A": "PA",
        "1-M": "PM",
        "1+": "I",
        "2-": "F[AM]",
        "2-A": "FA",
        "2-M": "FM",
        "3-": "A[AM][NPDSO]",
        "3+": "A[AM]I",
        "3+A": "AAI",
        "3+M": "AMI",
        "4-": "XA",
        "4+": "YA",
        "5-": "X[MP]",
        "5+": "Y[MP]",
        "6-": "AP[NPDSO]",
        "6+": "API",
        "7-": "FP",
        "8-": "Z[MP]",
        "M": "..M",
        "F": "..F",
        "N": "..N",
    }
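
    # A lexicon YAML entry has (up to) four sections, whose shapes the
    # loops below rely on; an illustrative sketch (the lemma and values
    # are hypothetical):
    #
    #     λυω:
    #         stems:            # partnum -> "/"-separated stems
    #             1-: λυ
    #             3-: λυσ;1
    #         stem_overrides:   # list of [key regex, stems] pairs
    #             - [AAI.1S, ελυσ]
    #         forms:            # key -> form override
    #             PAI.1S: λύω
    #         accents:          # list of [key regex, form] pairs
    #             - [PAP.NSM, λύων]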

    form_override = {}
    accent_override = defaultdict(list)
    segmented_lemmas = {}

    with open(lexicon_file) as f:
        for lemma, entry in yaml.safe_load(f).items():
            if entry:
                if "-" in lemma:
                    # key everything on the unsegmented lemma but remember
                    # the segmented spelling
                    segmented_lemma = lemma
                    lemma = lemma.replace("-", "")
                    segmented_lemmas[lemma] = segmented_lemma
                for partnum, stems in sorted(
                        (entry.get("stems") or {}).items()):
                    key_regex = partnum_to_key_regex[partnum]
                    for stem, tag in split_stem_tags(stems):
                        lexicon.add(
                            lemma, key_regex, pre_processor(stem), tag)
                for key_regex, stems in entry.get("stem_overrides", []):
                    if stems is None:
                        continue
                    for stem, tag in split_stem_tags(stems):
                        lexicon.add(
                            lemma, key_regex, pre_processor(stem), tag)
                for key, form in entry.get("forms", {}).items():
                    form_override[(lemma, key)] = form
                for key_regex, form in entry.get("accents", []):
                    accent_override[lemma].append((key_regex, form))

    return lexicon, form_override, accent_override, segmented_lemmas
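

# Minimal smoke test; "lexicon.yaml" is an assumed file name for
# illustration, not something this module mandates:

if __name__ == "__main__":
    lexicon, forms, accents, segmented = load_lexicon("lexicon.yaml")
    print("{} form overrides, {} lemmas with accent overrides".format(
        len(forms), len(accents)))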