Updates KVP rules (issues #21 & #22)

Esukhia · Oct 7, 2024 · 9451ae0 · 9451ae0
1 parent 75e8e9c
commit 9451ae0
Show file tree

Hide file tree

Showing 10 changed files with 438 additions and 89 deletions.
diff --git a/bophono/PhonStateKVP.py b/bophono/PhonStateKVP.py
@@ -1,3 +1,5 @@
+import re
+
 class PhonStateKVP:
     def __init__(self, options={}, pos=None, endOfSentence=False):
         self.position = 0
@@ -20,21 +22,17 @@ def doCombineCurEnd(self, endofword, nrc='', nextvowel=''): # nrc = next root co
             return
         # ' from ends.csv should be replaced with a space
         self.end = self.end.replace("'", ' ')
-        # e at the end of a word becomes é
-        if self.end.endswith("e") and endofword:
-            if self.accentuateall or self.phon+'e' in self.accentuateWL:
-                self.end = self.end[:-1]+"é"
         # suffix ga is "k" except in the middle of words
         if self.end.endswith("k") and not endofword:
             self.end = self.end[:-1]+"g"
-        # nng or ngg -> ng
+        if self.end.endswith("ng") and nrc.startswith("g"):
+            self.end = self.end[:-1]
+        if self.end.endswith("ng") and nrc.startswith("ng"):
+            self.end = self.end[:-2]
         if self.end.endswith("g") and nrc.startswith("g"):
             self.end = self.end[:-1]+"k"
         if self.end.endswith("n") and nrc.startswith("n"):
             self.end = self.end[:-1]
-        # I suppose? TODO: check
-        if self.end.endswith("ng") and nrc.startswith("ng"):
-            self.end = self.end[:-2]
         # optional, from Rigpa: kun dga' -> kun-ga
         if self.splitNG and self.end.endswith("n") and nrc.startswith("g"):
             self.end += "-"
@@ -44,25 +42,27 @@ def doCombineCurEnd(self, endofword, nrc='', nextvowel=''): # nrc = next root co
         self.phon += self.end
 
 
-    def combineWithException(self, exception):
+    def combineWithException(self, exception, tibetanSyllable):
         syllables = exception.split('|')
         for syl in syllables:
             indexplusminus = syl.find('-')
             if indexplusminus == -1:
                 print("invalid exception syllable: "+syl)
                 continue
-            self.combineWith(syl[:indexplusminus], syl[indexplusminus+1:])
+            self.combineWith(syl[:indexplusminus], syl[indexplusminus+1:], tibetanSyllable)
 
-    def combineWith(self, nextroot, nextend):
+    def combineWith(self, nextroot, nextend, tibetanSyllable):
         nextrootconsonant = nextroot
         nextvowel = ''
         self.doCombineCurEnd(False, nextrootconsonant, nextvowel)
         self.position += 1
         if nextrootconsonant == "-":
             self.phon += ""
+        elif re.search(r'^བ[ོ]?[རསད]?(འི)?$', tibetanSyllable) and self.position == 1:
+            self.phon += "w"
         elif nextrootconsonant.startswith("dz") and self.position > 1:
             self.phon += "z"
-        elif nextrootconsonant.startswith("tr") and self.position == 1:
+        elif "གྲ" in tibetanSyllable and nextrootconsonant.startswith("tr") and self.position == 2:
             self.phon += "dr"
         else:
             self.phon += nextrootconsonant
@@ -72,7 +72,7 @@ def combineWith(self, nextroot, nextend):
             self.end = ends[0]
             for endsyl in ends[1:]:
                 # we suppose that roots are always null
-                self.combineWith(endsyl[:1], endsyl[1:])
+                self.combineWith(endsyl[:1], endsyl[1:], tibetanSyllable)
         else:
             self.end = nextend
 

diff --git a/bophono/UnicodeToApi.py b/bophono/UnicodeToApi.py
@@ -35,7 +35,7 @@ def __init__(self, schema="MST", options={}):
         self.ends = get_trie_from_file(self.__get_trie_path("ends.csv"), "ends", self.columnIndex)
         self.exceptions = get_trie_from_file(self.__get_trie_path(exceptions), "exceptions", 1, self.ends)
 
-        self.ignored_chars = {'\u0F35': True, '\u0F37': True}
+        self.ignored_chars = {'\u0F35': True, '\u0F37': True, '\u0F71': True}
 
     def __get_trie_path(self, name):
         return os.path.join(os.path.split(__file__)[0], 'data', name)
@@ -70,7 +70,7 @@ def __combine_next_syll_phon(self, tibstr, bindex, state, eindex):
             return -1
         if endinfo['i'] < eindex and self.__is_tib_letter(tibstr[endinfo['i']]) and (tibstr[endinfo['i']] not in self.ignored_chars):
             return -1
-        state.combineWith(rootinfo['d'], endinfo['d'])
+        state.combineWith(rootinfo['d'], endinfo['d'], tibstr[bindex:eindex])
         assert(endinfo['i']>bindex)
         return endinfo['i']
 
@@ -97,7 +97,7 @@ def get_api(self, tibstr, bindex=0, eindex=-1, pos=None, endOfSentence=False):
                 # if it starts with '2:' and we're in the first syllable, we ignore it:
                 if exceptioninfo['d'].startswith('2:'):
                     exceptioninfo['d'] = exceptioninfo['d'][2:]
-                state.combineWithException(exceptioninfo['d'])
+                state.combineWithException(exceptioninfo['d'], tibstr[bindex:eindex])
                 nextidx = self.__get_next_letter_index(tibstr, exceptioninfo['i']+1, eindex)
                 if nextidx == -1:
                     nextidx = eindex

diff --git a/bophono/data/ends.csv b/bophono/data/ends.csv
@@ -13,7 +13,7 @@
 ལ,äl,ɛl,al,al 
 འི,äj,ɨ,e,e
 འིའོ,a|-i|-o,i|wo,e’o,eo
-འོ,a|-o,a|o,a'o,ao
+འོ,a|-o,a|o,a’o,ao
 འང,a|-ang,a|wang,a’ang,ang
 འམ,a|-am,a:m,am,am
 ར,ar,ər,ar,ar
@@ -31,31 +31,31 @@
 ིམས,im~,im,im,im
 ིལ,il,il,il,il
 ིའི,i:,i:,i,i
-ིའིའོ,i:|-o,,i'o,io
-ིའོ,i|-o,,i'o,io
-ིའང,i|-ang,,i'ang,iang
-ིའམ,i|-am,,i'am,iam
+ིའིའོ,i:|-o,,i’o,io
+ིའོ,i|-o,,i’o,io
+ིའང,i|-ang,,i’ang,iang
+ིའམ,i|-am,,i’am,iam
 ིར,ir,ir,ir,ir
 ིས,i',i,i,i
 ུ,u,u,u,u
 ུག,uk,ɨx,uk,uk
 ུགས,uk,ɨx,uk,uk
 ུང,ung,ong,ung,ung
 ུངས,ung~,ong,ung,ung
-ུད,ü',ut,ü,ü
-ུན,ün,un,ün,ün
+ུད,ü',ut,u,ü
+ུན,ün,un,un,ün
 ུབ,up,ub,ub,ub
 ུབས,up,ub,ub,ub
 ུམ,um,um,um,um
 ུམས,um~,um,um,um
-ུལ,ül,ul,ül,ul
-ུའི,üj,ɨ,ü,y
-ུའིའོ,u|-i|-o,,ü’o,yo
-ུའོ,u|-o,,ü’o,yo
+ུལ,ül,ul,ul,ul
+ུའི,üj,ɨ,u,y
+ུའིའོ,u|-i|-o,,u’o,yo
+ུའོ,u|-o,,u’o,yo
 ུའང,u|-ang,,u’ang,uang
 ུའམ,u|-am,,u’am,uam
 ུར,ur,ur,ur,ur
-ུས,ü',i,ü,y
+ུས,ü',i,u,y
 ེ,e,e,e,e
 ེག,ek,əx,ek,ek
 ེགས,ek,əx,ek,ek
@@ -96,41 +96,41 @@
 ོས,ö',i,ö,ö
 འུ,a|-u,-u,a’u,au
 འུའི,a|-u|-i,,a’u,au
-འུའིའོ,a|-u|-i|-o,,a'u’o,auo
-འུའོ,a|-u|-o,,a'u’o,auo
+འུའིའོ,a|-u|-i|-o,,a’u’o,auo
+འུའོ,a|-u|-o,,a’u’o,auo
 འུའང,a|-u|-ang,,a’u’ang,auang
 འུའམ,a|-u|-am,,a’u’am,auam
 འུར,a|-ur,ur,a’ur,aur
 འུས,a|-ü',i,a’u,au
 ིའུ,i|-u,u,i’u,iu
-ིའུའི,i|-u|-i,,i'ü,iy
-ིའུའིའོ,i|-u|-i|-o,,i’ü’o,iyo
+ིའུའི,i|-u|-i,,i’u,iy
+ིའུའིའོ,i|-u|-i|-o,,i’u’o,iyo
 ིའུའོ,i|-u|-o,,i’u’o,iuo
 ིའུའང,i|-u|-ang,,i’u’ang,iuang
 ིའུའམ,i|-u|-am,,i’u’am,iuam
 ིའུར,i|-ur,,i’ur,iur
-ིའུས,i|-ü',,i’ü,iy
+ིའུས,i|-ü',,i’u,iy
 ུའུ,u|-u,u,u’u,uu
-ུའུའི,u|-u|-i,u,u’ü,uy
-ུའུའིའོ,u|-u|-i|-o,,u’ü’o,uyo
+ུའུའི,u|-u|-i,u,u’u,uy
+ུའུའིའོ,u|-u|-i|-o,,u’u’o,uyo
 ུའུའོ,u|-u|-o,,u’u’o,uuo
 ུའུའང,u|-u|-ang,,u’u’ang,uuang
 ུའུའམ,u|-u|-am,,u’u’am,uuam
 ུའུར,u|-ur,,e’ur,eur
-ུའུས,u|-ü',,e’ü,ey
+ུའུས,u|-ü',,u’u,uy
 ེའུ,e|-u,u,e’u,eu
-ེའུའི,e|-u|-i,,e’ü,ey
-ེའུའིའོ,e|-u|-i|-o,,e’ü’o,eyo
+ེའུའི,e|-u|-i,,e’u,ey
+ེའུའིའོ,e|-u|-i|-o,,e’u’o,eyo
 ེའུའོ,e|-u|-o,,e’u’o,euo
 ེའུའང,e|-u|-ang,,e’u’ang,euang
 ེའུའམ,e|-u|-am,,e’u’am,euam
 ེའུར,e|-ur,,e’ur,eur
-ེའུས,e|-ü',,e’ü,ey
-ོའུ,o|-u,u,o'u,ou
-ོའུའི,o|-u|-i,,o’ü,oy
-ོའུའིའོ,o|-u|-i|-o,,o’ü’o,oyo
+ེའུས,e|-ü',,e’u,ey
+ོའུ,o|-u,u,o’u,ou
+ོའུའི,o|-u|-i,,o’u,oy
+ོའུའིའོ,o|-u|-i|-o,,o’u’o,oyo
 ོའུའོ,o|-u|-o,,o’u’o,ouo
 ོའུའང,o|-u|-ang,,o’u’ang,ouang
 ོའུའམ,o|-u|-am,,o’u’am,ouam
 ོའུར,o|-ur,,o’ur,our
-ོའུས,o|-ü',,o’ü,oy
+ོའུས,o|-ü',,o’u,oy
diff --git a/bophono/data/exceptions-kvp.csv b/bophono/data/exceptions-kvp.csv
@@ -1,6 +1,6 @@
 # transforming all the time, will be fixed manually
-བ/Ca,w-
-བོ/Co,w-
+2:བ/Ca,w-
+2:བོ/Co,w-
 # KVP exceptions
 བཀྲ་ཤིས,ta-shi
 སྤྲུལ་སྐུ/Cu,tulk-
@@ -10,15 +10,28 @@
 ངག་དབང,ngaw-ang
 སྤྱན་རས་གཟིགས,chenrez-ig
 སྣར་ཐང,narth-ang
+བུམ་ཐང་,bumth-ang
 ཞེ་ཆེན,shech-en
-ཞྭ་ལུ/Cu,shal-u
+ཞྭ་ལུ/Cu,shal-
+ཞྭ་དམར་,sham-ar
+ཞྭ་དམར་པ་,shamarp-a
+འཕྲིན་ལས་ ནོར་བུ་,Thin-ley Norbu
 གཏོང་ལེན,tongl-en
 རིན་པོ་ཆེ/Ce,rinpoch-
 འཕོ་བ/Ca,pow-
 ཐང་ཀ/Ca,tangkh-
-བར་ཆད,barch-é
+བར་དུ,bard-u
+བར་ཆད,barch-e
 བར་དོ/Co,bard-
 གཅོད,ch-öd
+འཆི་མེད་,chim-é
+མདོ་མེད་,dom-é
+རིས་མེད་,rim-é
+ཨ་མེས་,am-é
+ཅོ་ནེ་,chon-é
+བལ་ལྡན་,pald-en
+གན་ལྡན་,gand-en
+བློ་བཟང་,lobz-ang
 # Exception from THL (not taking nasalizations)
 རྡོ་རྗེ/Ce,dorj-
 བླ་བྲང,lab-rang
@@ -31,6 +44,7 @@
 ས་གདན,sabd-en
 ཁ་གཅོད,khabch-ö
 # dba becomes wa
+དབ,w-a
 དབ/Cb,w-
 # numbers, from NT Annex 1, completed by Drupchen
 བཅུ་གཅིག,chugch-ik
@@ -48,7 +62,8 @@
 ང་བཞི/Ci,ngabsh-
 དགུ་བཅུ/Cu,gubch-
 # Sanskrit (TODO: check)
-པདྨ/Ca,padm-
+པདྨ/Ca,pem-
+པདྨོ/Co,pem-
 ཀརྨ/Ca,karm-
 # NT Annex 1, nasalizer
 ཞབས་འདེགས,shamd-ek
@@ -89,6 +104,7 @@
 ཕྱག་མཛོད,changts-ö
 ལྷ་མཛེས,lhandz-e
 ལོ་མཆོད,lomch-ö
+པཎ་ཆེན་,panch-en
 # THL nasalizations
 བཀའ་འགྱུར,kangy-ur
 ངོས་འཛིན,ngöndz-in
@@ -152,4 +168,4 @@
 འཕྲོ་འདུ/Cu,tront-
 སྤྲོ་བསྡུ/Co,tront-
 ན་བཟ/Cb,namz-
-མ་འགགས,mank-ak
+མ་འགགས,mank-ak
diff --git a/bophono/data/roots.csv b/bophono/data/roots.csv
@@ -97,7 +97,7 @@
 མཐ,~th+,[m]t+,t,t
 འཐ,~th+,[']t+,t,th
 ད,th-,t,d,d
-དྲ,thr-,tr,d,d
+དྲ,thr-,tr,dr,d
 གད*,t-,[r]d,d,d
 བད*,t-,[b]d,d,d
 མད*,~t-,[m]d,d,d
@@ -217,7 +217,7 @@
 ཏྭ,t+,t,t,t
 ཐྭ,th+,t+,t,th
 དྭ,th-,t,d,d
-དྲྭ,thr-,tr,d,d
+དྲྭ,thr-,tr,dr,d
 ཕྱྭ,ch+,sh,ch,ch
 མྭ,m-,m,m,m
 ཙྭ,ts+,ts,ts,ts

diff --git a/tests/KVP_corrections.csv b/tests/KVP_corrections.csv
@@ -0,0 +1,52 @@
+,output,correct,exceptions,,,
+དྲྭ,da,dra,,,,
+དྲི་,di,dri,,,,
+དྲིན་,din,drin,,,,
+དྲངས,dang,drang,,,,
+དྲིལ,dil,dril,,,,
+དྲན,den,dren,,,,
+འཕྲུལ,drul,trul,,Note: two things need to be fixed.,,
+རྡོ་རྗེ,doje,dorje,exception,,,
+མཁའ་འགྲོ,khadro,khandro,exception,,,
+དྲེགས,dek,drek,,Note: two things need to be fixed.,,
+སྤྲིན,drin,trin,,,,
+སྤྲུལ,drul,trul,,Note: two things need to be fixed.,,
+སྤྲས,dre,tre,,,,
+དཀྲིགས,drik,trik,,,,
+བཀྲེན་,dren,tren,,,,
+བཀྲམ,dram,tram,,,,
+ཁྲོས,drö,trö,,,,
+ཁྲི་,dri,tri,,,,
+ཁྲུས,dru,tru,,,,
+ཁྲིམས་,drim,trim,,,,
+པདྨ,padma,pema,exception,,,
+པདྨོ་,padmo,pemo,exception,,,
+བར་ཆད,warche,barche,exception,,,
+སློབ་དཔོན་,lobpön,lopön,exception,,,
+རང་གྲོལ་,rangdrol,rangdröl,,,,
+རོལ,rol,röl,,,,
+སྩོལ་,tsol,tsöl,,,,
+གསོལ་,sol,söl,,,,
+གཞོལ་,zhol,zhöl,,,,
+འབུལ,bul,bul,,,,
+ཡུལ,yul,yul,,,,
+བརྟུལ་,tul,tul,,,,
+ཚུལ་,tsul,tsul,,,,
+ཕུལ་,pul,pul,,,,
+རྡུལ,dul,dul,,,,
+བསྐུལ,kul,kul,,,,
+བློ་བཟང་,lozang,lobzang,exception,,,
+བཀྲ་ཤིས་,drashi,tashi,exception,,,"suggestion: If ""ta shi"" is chosen for sādhana phonetics, it should be an exception and the adjective བཀྲ་ should stay as ""tra"""
+ནེའུ་,ne u,ne’u,,Note: curved smart quote mark would be ideal,,
+བེའུ་,be u,be’u,,Note: curved smart quote mark would be ideal,,
+མཆིའོ,chi o,chi’o,,Note: curved smart quote mark would be ideal,,
+ཕྲན་,dren,tren,,,,
+འཕྲོག་,drok,trok,,,,
+འཕྲོ་,dro,tro,,,,
+ཕྲག,drak,trak,,Note: two things need to be fixed.,,
+ཕྲེང་,dreng,treng,,,,
+ཀྲུང་,krung,trung,,,,
+ལའང་,lang,la’ang,,Note: curved smart quote mark would be ideal,,
+དྭངས,nge,dang,,,,
+མ་ཧཱ་ གུ་རུ་,ma guru,maha guru,,,,
+བདེ་ ཆེན་ ནཱ་ དའི་ དངོས,de chen de ngö,de chen na de ngö,,,,