Skip to content

Commit

Permalink
Updates KVP rules (issues #21 & #22)
Browse files Browse the repository at this point in the history
  • Loading branch information
jerefrer committed Oct 7, 2024
1 parent 75e8e9c commit 9451ae0
Show file tree
Hide file tree
Showing 10 changed files with 438 additions and 89 deletions.
26 changes: 13 additions & 13 deletions bophono/PhonStateKVP.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

class PhonStateKVP:
def __init__(self, options={}, pos=None, endOfSentence=False):
self.position = 0
Expand All @@ -20,21 +22,17 @@ def doCombineCurEnd(self, endofword, nrc='', nextvowel=''): # nrc = next root co
return
# ' from ends.csv should be replaced with a space
self.end = self.end.replace("'", ' ')
# e at the end of a word becomes é
if self.end.endswith("e") and endofword:
if self.accentuateall or self.phon+'e' in self.accentuateWL:
self.end = self.end[:-1]+"é"
# suffix ga is "k" except in the middle of words
if self.end.endswith("k") and not endofword:
self.end = self.end[:-1]+"g"
# nng or ngg -> ng
if self.end.endswith("ng") and nrc.startswith("g"):
self.end = self.end[:-1]
if self.end.endswith("ng") and nrc.startswith("ng"):
self.end = self.end[:-2]
if self.end.endswith("g") and nrc.startswith("g"):
self.end = self.end[:-1]+"k"
if self.end.endswith("n") and nrc.startswith("n"):
self.end = self.end[:-1]
# I suppose? TODO: check
if self.end.endswith("ng") and nrc.startswith("ng"):
self.end = self.end[:-2]
# optional, from Rigpa: kun dga' -> kun-ga
if self.splitNG and self.end.endswith("n") and nrc.startswith("g"):
self.end += "-"
Expand All @@ -44,25 +42,27 @@ def doCombineCurEnd(self, endofword, nrc='', nextvowel=''): # nrc = next root co
self.phon += self.end


def combineWithException(self, exception):
def combineWithException(self, exception, tibetanSyllable):
syllables = exception.split('|')
for syl in syllables:
indexplusminus = syl.find('-')
if indexplusminus == -1:
print("invalid exception syllable: "+syl)
continue
self.combineWith(syl[:indexplusminus], syl[indexplusminus+1:])
self.combineWith(syl[:indexplusminus], syl[indexplusminus+1:], tibetanSyllable)

def combineWith(self, nextroot, nextend):
def combineWith(self, nextroot, nextend, tibetanSyllable):
nextrootconsonant = nextroot
nextvowel = ''
self.doCombineCurEnd(False, nextrootconsonant, nextvowel)
self.position += 1
if nextrootconsonant == "-":
self.phon += ""
elif re.search(r'^བ[ོ]?[རསད]?(འི)?$', tibetanSyllable) and self.position == 1:
self.phon += "w"
elif nextrootconsonant.startswith("dz") and self.position > 1:
self.phon += "z"
elif nextrootconsonant.startswith("tr") and self.position == 1:
elif "གྲ" in tibetanSyllable and nextrootconsonant.startswith("tr") and self.position == 2:
self.phon += "dr"
else:
self.phon += nextrootconsonant
Expand All @@ -72,7 +72,7 @@ def combineWith(self, nextroot, nextend):
self.end = ends[0]
for endsyl in ends[1:]:
# we suppose that roots are always null
self.combineWith(endsyl[:1], endsyl[1:])
self.combineWith(endsyl[:1], endsyl[1:], tibetanSyllable)
else:
self.end = nextend

Expand Down
6 changes: 3 additions & 3 deletions bophono/UnicodeToApi.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def __init__(self, schema="MST", options={}):
self.ends = get_trie_from_file(self.__get_trie_path("ends.csv"), "ends", self.columnIndex)
self.exceptions = get_trie_from_file(self.__get_trie_path(exceptions), "exceptions", 1, self.ends)

self.ignored_chars = {'\u0F35': True, '\u0F37': True}
self.ignored_chars = {'\u0F35': True, '\u0F37': True, '\u0F71': True}

def __get_trie_path(self, name):
return os.path.join(os.path.split(__file__)[0], 'data', name)
Expand Down Expand Up @@ -70,7 +70,7 @@ def __combine_next_syll_phon(self, tibstr, bindex, state, eindex):
return -1
if endinfo['i'] < eindex and self.__is_tib_letter(tibstr[endinfo['i']]) and (tibstr[endinfo['i']] not in self.ignored_chars):
return -1
state.combineWith(rootinfo['d'], endinfo['d'])
state.combineWith(rootinfo['d'], endinfo['d'], tibstr[bindex:eindex])
assert(endinfo['i']>bindex)
return endinfo['i']

Expand All @@ -97,7 +97,7 @@ def get_api(self, tibstr, bindex=0, eindex=-1, pos=None, endOfSentence=False):
# if it starts with '2:' and we're in the first syllable, we ignore it:
if exceptioninfo['d'].startswith('2:'):
exceptioninfo['d'] = exceptioninfo['d'][2:]
state.combineWithException(exceptioninfo['d'])
state.combineWithException(exceptioninfo['d'], tibstr[bindex:eindex])
nextidx = self.__get_next_letter_index(tibstr, exceptioninfo['i']+1, eindex)
if nextidx == -1:
nextidx = eindex
Expand Down
54 changes: 27 additions & 27 deletions bophono/data/ends.csv
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
ལ,äl,ɛl,al,al
འི,äj,ɨ,e,e
འིའོ,a|-i|-o,i|wo,e’o,eo
འོ,a|-o,a|o,a'o,ao
འོ,a|-o,a|o,ao,ao
འང,a|-ang,a|wang,a’ang,ang
འམ,a|-am,a:m,am,am
ར,ar,ər,ar,ar
Expand All @@ -31,31 +31,31 @@
ིམས,im~,im,im,im
ིལ,il,il,il,il
ིའི,i:,i:,i,i
ིའིའོ,i:|-o,,i'o,io
ིའོ,i|-o,,i'o,io
ིའང,i|-ang,,i'ang,iang
ིའམ,i|-am,,i'am,iam
ིའིའོ,i:|-o,,io,io
ིའོ,i|-o,,io,io
ིའང,i|-ang,,iang,iang
ིའམ,i|-am,,iam,iam
ིར,ir,ir,ir,ir
ིས,i',i,i,i
ུ,u,u,u,u
ུག,uk,ɨx,uk,uk
ུགས,uk,ɨx,uk,uk
ུང,ung,ong,ung,ung
ུངས,ung~,ong,ung,ung
ུད,ü',ut,ü
ུན,ün,un,ün,ün
ུད,ü',ut,u
ུན,ün,un,un,ün
ུབ,up,ub,ub,ub
ུབས,up,ub,ub,ub
ུམ,um,um,um,um
ུམས,um~,um,um,um
ུལ,ül,ul,ül,ul
ུའི,üj,ɨ,ü,y
ུའིའོ,u|-i|-o,,ü’o,yo
ུའོ,u|-o,,ü’o,yo
ུལ,ül,ul,ul,ul
ུའི,üj,ɨ,u,y
ུའིའོ,u|-i|-o,,u’o,yo
ུའོ,u|-o,,u’o,yo
ུའང,u|-ang,,u’ang,uang
ུའམ,u|-am,,u’am,uam
ུར,ur,ur,ur,ur
ུས,ü',i,ü,y
ུས,ü',i,u,y
ེ,e,e,e,e
ེག,ek,əx,ek,ek
ེགས,ek,əx,ek,ek
Expand Down Expand Up @@ -96,41 +96,41 @@
ོས,ö',i,ö,ö
འུ,a|-u,-u,a’u,au
འུའི,a|-u|-i,,a’u,au
འུའིའོ,a|-u|-i|-o,,a'u’o,auo
འུའོ,a|-u|-o,,a'u’o,auo
འུའིའོ,a|-u|-i|-o,,au’o,auo
འུའོ,a|-u|-o,,au’o,auo
འུའང,a|-u|-ang,,a’u’ang,auang
འུའམ,a|-u|-am,,a’u’am,auam
འུར,a|-ur,ur,a’ur,aur
འུས,a|-ü',i,a’u,au
ིའུ,i|-u,u,i’u,iu
ིའུའི,i|-u|-i,,i,iy
ིའུའིའོ,i|-u|-i|-o,,i’ü’o,iyo
ིའུའི,i|-u|-i,,i’uu,iy
ིའུའིའོ,i|-u|-i|-o,,i’u’o,iyo
ིའུའོ,i|-u|-o,,i’u’o,iuo
ིའུའང,i|-u|-ang,,i’u’ang,iuang
ིའུའམ,i|-u|-am,,i’u’am,iuam
ིའུར,i|-ur,,i’ur,iur
ིའུས,i|-ü',,i’ü,iy
ིའུས,i|-ü',,i’u,iy
ུའུ,u|-u,u,u’u,uu
ུའུའི,u|-u|-i,u,u’ü,uy
ུའུའིའོ,u|-u|-i|-o,,u’ü’o,uyo
ུའུའི,u|-u|-i,u,u’u,uy
ུའུའིའོ,u|-u|-i|-o,,u’u’o,uyo
ུའུའོ,u|-u|-o,,u’u’o,uuo
ུའུའང,u|-u|-ang,,u’u’ang,uuang
ུའུའམ,u|-u|-am,,u’u’am,uuam
ུའུར,u|-ur,,e’ur,eur
ུའུས,u|-ü',,e’ü,ey
ུའུས,u|-ü',,e’u,ey
ེའུ,e|-u,u,e’u,eu
ེའུའི,e|-u|-i,,e’ü,ey
ེའུའིའོ,e|-u|-i|-o,,e’ü’o,eyo
ེའུའི,e|-u|-i,,e’u,ey
ེའུའིའོ,e|-u|-i|-o,,e’u’o,eyo
ེའུའོ,e|-u|-o,,e’u’o,euo
ེའུའང,e|-u|-ang,,e’u’ang,euang
ེའུའམ,e|-u|-am,,e’u’am,euam
ེའུར,e|-ur,,e’ur,eur
ེའུས,e|-ü',,e’ü,ey
ོའུ,o|-u,u,o'u,ou
ོའུའི,o|-u|-i,,o’ü,oy
ོའུའིའོ,o|-u|-i|-o,,o’ü’o,oyo
ེའུས,e|-ü',,e’u,ey
ོའུ,o|-u,u,ou,ou
ོའུའི,o|-u|-i,,o’u,oy
ོའུའིའོ,o|-u|-i|-o,,o’u’o,oyo
ོའུའོ,o|-u|-o,,o’u’o,ouo
ོའུའང,o|-u|-ang,,o’u’ang,ouang
ོའུའམ,o|-u|-am,,o’u’am,ouam
ོའུར,o|-ur,,o’ur,our
ོའུས,o|-ü',,o’ü,oy
ོའུས,o|-ü',,o’u,oy
28 changes: 22 additions & 6 deletions bophono/data/exceptions-kvp.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# transforming all the time, will be fixed manually
བ/Ca,w-
བོ/Co,w-
2:བ/Ca,w-
2:བོ/Co,w-
# KVP exceptions
བཀྲ་ཤིས,ta-shi
སྤྲུལ་སྐུ/Cu,tulk-
Expand All @@ -10,15 +10,28 @@
ངག་དབང,ngaw-ang
སྤྱན་རས་གཟིགས,chenrez-ig
སྣར་ཐང,narth-ang
བུམ་ཐང་,bumth-ang
ཞེ་ཆེན,shech-en
ཞྭ་ལུ/Cu,shal-u
ཞྭ་ལུ/Cu,shal-
ཞྭ་དམར་,sham-ar
ཞྭ་དམར་པ་,shamarp-a
འཕྲིན་ལས་ ནོར་བུ་,Thin-ley Norbu
གཏོང་ལེན,tongl-en
རིན་པོ་ཆེ/Ce,rinpoch-
འཕོ་བ/Ca,pow-
ཐང་ཀ/Ca,tangkh-
བར་ཆད,barch-é
བར་དུ,bard-u
བར་ཆད,barch-e
བར་དོ/Co,bard-
གཅོད,ch-öd
འཆི་མེད་,chim-é
མདོ་མེད་,dom-é
རིས་མེད་,rim-é
ཨ་མེས་,am-é
ཅོ་ནེ་,chon-é
བལ་ལྡན་,pald-en
གན་ལྡན་,gand-en
བློ་བཟང་,lobz-ang
# Exception from THL (not taking nasalizations)
རྡོ་རྗེ/Ce,dorj-
བླ་བྲང,lab-rang
Expand All @@ -31,6 +44,7 @@
ས་གདན,sabd-en
ཁ་གཅོད,khabch-ö
# dba becomes wa
དབ,w-a
དབ/Cb,w-
# numbers, from NT Annex 1, completed by Drupchen
བཅུ་གཅིག,chugch-ik
Expand All @@ -48,7 +62,8 @@
ང་བཞི/Ci,ngabsh-
དགུ་བཅུ/Cu,gubch-
# Sanskrit (TODO: check)
པདྨ/Ca,padm-
པདྨ/Ca,pem-
པདྨོ/Co,pem-
ཀརྨ/Ca,karm-
# NT Annex 1, nasalizer
ཞབས་འདེགས,shamd-ek
Expand Down Expand Up @@ -89,6 +104,7 @@
ཕྱག་མཛོད,changts-ö
ལྷ་མཛེས,lhandz-e
ལོ་མཆོད,lomch-ö
པཎ་ཆེན་,panch-en
# THL nasalizations
བཀའ་འགྱུར,kangy-ur
ངོས་འཛིན,ngöndz-in
Expand Down Expand Up @@ -152,4 +168,4 @@
འཕྲོ་འདུ/Cu,tront-
སྤྲོ་བསྡུ/Co,tront-
ན་བཟ/Cb,namz-
མ་འགགས,mank-ak
མ་འགགས,mank-ak
4 changes: 2 additions & 2 deletions bophono/data/roots.csv
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@
མཐ,~th+,[m]t+,t,t
འཐ,~th+,[']t+,t,th
ད,th-,t,d,d
དྲ,thr-,tr,d,d
དྲ,thr-,tr,dr,d
གད*,t-,[r]d,d,d
བད*,t-,[b]d,d,d
མད*,~t-,[m]d,d,d
Expand Down Expand Up @@ -217,7 +217,7 @@
ཏྭ,t+,t,t,t
ཐྭ,th+,t+,t,th
དྭ,th-,t,d,d
དྲྭ,thr-,tr,d,d
དྲྭ,thr-,tr,dr,d
ཕྱྭ,ch+,sh,ch,ch
མྭ,m-,m,m,m
ཙྭ,ts+,ts,ts,ts
Expand Down
52 changes: 52 additions & 0 deletions tests/KVP_corrections.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
,output,correct,exceptions,,,
དྲྭ,da,dra,,,,
དྲི་,di,dri,,,,
དྲིན་,din,drin,,,,
དྲངས,dang,drang,,,,
དྲིལ,dil,dril,,,,
དྲན,den,dren,,,,
འཕྲུལ,drul,trul,,Note: two things need to be fixed.,,
རྡོ་རྗེ,doje,dorje,exception,,,
མཁའ་འགྲོ,khadro,khandro,exception,,,
དྲེགས,dek,drek,,Note: two things need to be fixed.,,
སྤྲིན,drin,trin,,,,
སྤྲུལ,drul,trul,,Note: two things need to be fixed.,,
སྤྲས,dre,tre,,,,
དཀྲིགས,drik,trik,,,,
བཀྲེན་,dren,tren,,,,
བཀྲམ,dram,tram,,,,
ཁྲོས,drö,trö,,,,
ཁྲི་,dri,tri,,,,
ཁྲུས,dru,tru,,,,
ཁྲིམས་,drim,trim,,,,
པདྨ,padma,pema,exception,,,
པདྨོ་,padmo,pemo,exception,,,
བར་ཆད,warche,barche,exception,,,
སློབ་དཔོན་,lobpön,lopön,exception,,,
རང་གྲོལ་,rangdrol,rangdröl,,,,
རོལ,rol,röl,,,,
སྩོལ་,tsol,tsöl,,,,
གསོལ་,sol,söl,,,,
གཞོལ་,zhol,zhöl,,,,
འབུལ,bul,bul,,,,
ཡུལ,yul,yul,,,,
བརྟུལ་,tul,tul,,,,
ཚུལ་,tsul,tsul,,,,
ཕུལ་,pul,pul,,,,
རྡུལ,dul,dul,,,,
བསྐུལ,kul,kul,,,,
བློ་བཟང་,lozang,lobzang,exception,,,
བཀྲ་ཤིས་,drashi,tashi,exception,,,"suggestion: If ""ta shi"" is chosen for sādhana phonetics, it should be an exception and the adjective བཀྲ་ should stay as ""tra"""
ནེའུ་,ne u,ne’u,,Note: curved smart quote mark would be ideal,,
བེའུ་,be u,be’u,,Note: curved smart quote mark would be ideal,,
མཆིའོ,chi o,chi’o,,Note: curved smart quote mark would be ideal,,
ཕྲན་,dren,tren,,,,
འཕྲོག་,drok,trok,,,,
འཕྲོ་,dro,tro,,,,
ཕྲག,drak,trak,,Note: two things need to be fixed.,,
ཕྲེང་,dreng,treng,,,,
ཀྲུང་,krung,trung,,,,
ལའང་,lang,la’ang,,Note: curved smart quote mark would be ideal,,
དྭངས,nge,dang,,,,
མ་ཧཱ་ གུ་རུ་,ma guru,maha guru,,,,
བདེ་ ཆེན་ ནཱ་ དའི་ དངོས,de chen de ngö,de chen na de ngö,,,,
Loading

0 comments on commit 9451ae0

Please sign in to comment.