diff --git a/bophono/PhonStateKVP.py b/bophono/PhonStateKVP.py index e2e9c0c..04a6b44 100644 --- a/bophono/PhonStateKVP.py +++ b/bophono/PhonStateKVP.py @@ -1,3 +1,5 @@ +import re + class PhonStateKVP: def __init__(self, options={}, pos=None, endOfSentence=False): self.position = 0 @@ -20,21 +22,17 @@ def doCombineCurEnd(self, endofword, nrc='', nextvowel=''): # nrc = next root co return # ' from ends.csv should be replaced with a space self.end = self.end.replace("'", ' ') - # e at the end of a word becomes é - if self.end.endswith("e") and endofword: - if self.accentuateall or self.phon+'e' in self.accentuateWL: - self.end = self.end[:-1]+"é" # suffix ga is "k" except in the middle of words if self.end.endswith("k") and not endofword: self.end = self.end[:-1]+"g" - # nng or ngg -> ng + if self.end.endswith("ng") and nrc.startswith("g"): + self.end = self.end[:-1] + if self.end.endswith("ng") and nrc.startswith("ng"): + self.end = self.end[:-2] if self.end.endswith("g") and nrc.startswith("g"): self.end = self.end[:-1]+"k" if self.end.endswith("n") and nrc.startswith("n"): self.end = self.end[:-1] - # I suppose? TODO: check - if self.end.endswith("ng") and nrc.startswith("ng"): - self.end = self.end[:-2] # optional, from Rigpa: kun dga' -> kun-ga if self.splitNG and self.end.endswith("n") and nrc.startswith("g"): self.end += "-" @@ -44,25 +42,27 @@ def doCombineCurEnd(self, endofword, nrc='', nextvowel=''): # nrc = next root co self.phon += self.end - def combineWithException(self, exception): + def combineWithException(self, exception, tibetanSyllable): syllables = exception.split('|') for syl in syllables: indexplusminus = syl.find('-') if indexplusminus == -1: print("invalid exception syllable: "+syl) continue - self.combineWith(syl[:indexplusminus], syl[indexplusminus+1:]) + self.combineWith(syl[:indexplusminus], syl[indexplusminus+1:], tibetanSyllable) - def combineWith(self, nextroot, nextend): + def combineWith(self, nextroot, nextend, tibetanSyllable): nextrootconsonant = nextroot nextvowel = '' self.doCombineCurEnd(False, nextrootconsonant, nextvowel) self.position += 1 if nextrootconsonant == "-": self.phon += "" + elif re.search(r'^བ[ོ]?[རསད]?(འི)?$', tibetanSyllable) and self.position == 1: + self.phon += "w" elif nextrootconsonant.startswith("dz") and self.position > 1: self.phon += "z" - elif nextrootconsonant.startswith("tr") and self.position == 1: + elif "གྲ" in tibetanSyllable and nextrootconsonant.startswith("tr") and self.position == 2: self.phon += "dr" else: self.phon += nextrootconsonant @@ -72,7 +72,7 @@ def combineWith(self, nextroot, nextend): self.end = ends[0] for endsyl in ends[1:]: # we suppose that roots are always null - self.combineWith(endsyl[:1], endsyl[1:]) + self.combineWith(endsyl[:1], endsyl[1:], tibetanSyllable) else: self.end = nextend diff --git a/bophono/UnicodeToApi.py b/bophono/UnicodeToApi.py index 5545d76..c7a2de1 100644 --- a/bophono/UnicodeToApi.py +++ b/bophono/UnicodeToApi.py @@ -35,7 +35,7 @@ def __init__(self, schema="MST", options={}): self.ends = get_trie_from_file(self.__get_trie_path("ends.csv"), "ends", self.columnIndex) self.exceptions = get_trie_from_file(self.__get_trie_path(exceptions), "exceptions", 1, self.ends) - self.ignored_chars = {'\u0F35': True, '\u0F37': True} + self.ignored_chars = {'\u0F35': True, '\u0F37': True, '\u0F71': True} def __get_trie_path(self, name): return os.path.join(os.path.split(__file__)[0], 'data', name) @@ -70,7 +70,7 @@ def __combine_next_syll_phon(self, tibstr, bindex, state, eindex): return -1 if endinfo['i'] < eindex and self.__is_tib_letter(tibstr[endinfo['i']]) and (tibstr[endinfo['i']] not in self.ignored_chars): return -1 - state.combineWith(rootinfo['d'], endinfo['d']) + state.combineWith(rootinfo['d'], endinfo['d'], tibstr[bindex:eindex]) assert(endinfo['i']>bindex) return endinfo['i'] @@ -97,7 +97,7 @@ def get_api(self, tibstr, bindex=0, eindex=-1, pos=None, endOfSentence=False): # if it starts with '2:' and we're in the first syllable, we ignore it: if exceptioninfo['d'].startswith('2:'): exceptioninfo['d'] = exceptioninfo['d'][2:] - state.combineWithException(exceptioninfo['d']) + state.combineWithException(exceptioninfo['d'], tibstr[bindex:eindex]) nextidx = self.__get_next_letter_index(tibstr, exceptioninfo['i']+1, eindex) if nextidx == -1: nextidx = eindex diff --git a/bophono/data/ends.csv b/bophono/data/ends.csv index c668c3e..b7297f6 100644 --- a/bophono/data/ends.csv +++ b/bophono/data/ends.csv @@ -13,7 +13,7 @@ ལ,äl,ɛl,al,al འི,äj,ɨ,e,e འིའོ,a|-i|-o,i|wo,e’o,eo -འོ,a|-o,a|o,a'o,ao +འོ,a|-o,a|o,a’o,ao འང,a|-ang,a|wang,a’ang,ang འམ,a|-am,a:m,am,am ར,ar,ər,ar,ar @@ -31,10 +31,10 @@ ིམས,im~,im,im,im ིལ,il,il,il,il ིའི,i:,i:,i,i -ིའིའོ,i:|-o,,i'o,io -ིའོ,i|-o,,i'o,io -ིའང,i|-ang,,i'ang,iang -ིའམ,i|-am,,i'am,iam +ིའིའོ,i:|-o,,i’o,io +ིའོ,i|-o,,i’o,io +ིའང,i|-ang,,i’ang,iang +ིའམ,i|-am,,i’am,iam ིར,ir,ir,ir,ir ིས,i',i,i,i ུ,u,u,u,u @@ -42,20 +42,20 @@ ུགས,uk,ɨx,uk,uk ུང,ung,ong,ung,ung ུངས,ung~,ong,ung,ung -ུད,ü',ut,ü,ü -ུན,ün,un,ün,ün +ུད,ü',ut,u,ü +ུན,ün,un,un,ün ུབ,up,ub,ub,ub ུབས,up,ub,ub,ub ུམ,um,um,um,um ུམས,um~,um,um,um -ུལ,ül,ul,ül,ul -ུའི,üj,ɨ,ü,y -ུའིའོ,u|-i|-o,,ü’o,yo -ུའོ,u|-o,,ü’o,yo +ུལ,ül,ul,ul,ul +ུའི,üj,ɨ,u,y +ུའིའོ,u|-i|-o,,u’o,yo +ུའོ,u|-o,,u’o,yo ུའང,u|-ang,,u’ang,uang ུའམ,u|-am,,u’am,uam ུར,ur,ur,ur,ur -ུས,ü',i,ü,y +ུས,ü',i,u,y ེ,e,e,e,e ེག,ek,əx,ek,ek ེགས,ek,əx,ek,ek @@ -96,41 +96,41 @@ ོས,ö',i,ö,ö འུ,a|-u,-u,a’u,au འུའི,a|-u|-i,,a’u,au -འུའིའོ,a|-u|-i|-o,,a'u’o,auo -འུའོ,a|-u|-o,,a'u’o,auo +འུའིའོ,a|-u|-i|-o,,a’u’o,auo +འུའོ,a|-u|-o,,a’u’o,auo འུའང,a|-u|-ang,,a’u’ang,auang འུའམ,a|-u|-am,,a’u’am,auam འུར,a|-ur,ur,a’ur,aur འུས,a|-ü',i,a’u,au ིའུ,i|-u,u,i’u,iu -ིའུའི,i|-u|-i,,i'ü,iy -ིའུའིའོ,i|-u|-i|-o,,i’ü’o,iyo +ིའུའི,i|-u|-i,,i’uu,iy +ིའུའིའོ,i|-u|-i|-o,,i’u’o,iyo ིའུའོ,i|-u|-o,,i’u’o,iuo ིའུའང,i|-u|-ang,,i’u’ang,iuang ིའུའམ,i|-u|-am,,i’u’am,iuam ིའུར,i|-ur,,i’ur,iur -ིའུས,i|-ü',,i’ü,iy +ིའུས,i|-ü',,i’u,iy ུའུ,u|-u,u,u’u,uu -ུའུའི,u|-u|-i,u,u’ü,uy -ུའུའིའོ,u|-u|-i|-o,,u’ü’o,uyo +ུའུའི,u|-u|-i,u,u’u,uy +ུའུའིའོ,u|-u|-i|-o,,u’u’o,uyo ུའུའོ,u|-u|-o,,u’u’o,uuo ུའུའང,u|-u|-ang,,u’u’ang,uuang ུའུའམ,u|-u|-am,,u’u’am,uuam ུའུར,u|-ur,,e’ur,eur -ུའུས,u|-ü',,e’ü,ey +ུའུས,u|-ü',,e’u,ey ེའུ,e|-u,u,e’u,eu -ེའུའི,e|-u|-i,,e’ü,ey -ེའུའིའོ,e|-u|-i|-o,,e’ü’o,eyo +ེའུའི,e|-u|-i,,e’u,ey +ེའུའིའོ,e|-u|-i|-o,,e’u’o,eyo ེའུའོ,e|-u|-o,,e’u’o,euo ེའུའང,e|-u|-ang,,e’u’ang,euang ེའུའམ,e|-u|-am,,e’u’am,euam ེའུར,e|-ur,,e’ur,eur -ེའུས,e|-ü',,e’ü,ey -ོའུ,o|-u,u,o'u,ou -ོའུའི,o|-u|-i,,o’ü,oy -ོའུའིའོ,o|-u|-i|-o,,o’ü’o,oyo +ེའུས,e|-ü',,e’u,ey +ོའུ,o|-u,u,o’u,ou +ོའུའི,o|-u|-i,,o’u,oy +ོའུའིའོ,o|-u|-i|-o,,o’u’o,oyo ོའུའོ,o|-u|-o,,o’u’o,ouo ོའུའང,o|-u|-ang,,o’u’ang,ouang ོའུའམ,o|-u|-am,,o’u’am,ouam ོའུར,o|-ur,,o’ur,our -ོའུས,o|-ü',,o’ü,oy +ོའུས,o|-ü',,o’u,oy diff --git a/bophono/data/exceptions-kvp.csv b/bophono/data/exceptions-kvp.csv index 54a73b7..0a2a54e 100644 --- a/bophono/data/exceptions-kvp.csv +++ b/bophono/data/exceptions-kvp.csv @@ -1,6 +1,6 @@ # transforming all the time, will be fixed manually -བ/Ca,w- -བོ/Co,w- +2:བ/Ca,w- +2:བོ/Co,w- # KVP exceptions བཀྲ་ཤིས,ta-shi སྤྲུལ་སྐུ/Cu,tulk- @@ -10,15 +10,28 @@ ངག་དབང,ngaw-ang སྤྱན་རས་གཟིགས,chenrez-ig སྣར་ཐང,narth-ang +བུམ་ཐང་,bumth-ang ཞེ་ཆེན,shech-en -ཞྭ་ལུ/Cu,shal-u +ཞྭ་ལུ/Cu,shal- +ཞྭ་དམར་,sham-ar +ཞྭ་དམར་པ་,shamarp-a +འཕྲིན་ལས་ ནོར་བུ་,Thin-ley Norbu གཏོང་ལེན,tongl-en རིན་པོ་ཆེ/Ce,rinpoch- འཕོ་བ/Ca,pow- ཐང་ཀ/Ca,tangkh- -བར་ཆད,barch-é +བར་དུ,bard-u +བར་ཆད,barch-e བར་དོ/Co,bard- གཅོད,ch-öd +འཆི་མེད་,chim-é +མདོ་མེད་,dom-é +རིས་མེད་,rim-é +ཨ་མེས་,am-é +ཅོ་ནེ་,chon-é +བལ་ལྡན་,pald-en +གན་ལྡན་,gand-en +བློ་བཟང་,lobz-ang # Exception from THL (not taking nasalizations) རྡོ་རྗེ/Ce,dorj- བླ་བྲང,lab-rang @@ -31,6 +44,7 @@ ས་གདན,sabd-en ཁ་གཅོད,khabch-ö # dba becomes wa +དབ,w-a དབ/Cb,w- # numbers, from NT Annex 1, completed by Drupchen བཅུ་གཅིག,chugch-ik @@ -48,7 +62,8 @@ ང་བཞི/Ci,ngabsh- དགུ་བཅུ/Cu,gubch- # Sanskrit (TODO: check) -པདྨ/Ca,padm- +པདྨ/Ca,pem- +པདྨོ/Co,pem- ཀརྨ/Ca,karm- # NT Annex 1, nasalizer ཞབས་འདེགས,shamd-ek @@ -89,6 +104,7 @@ ཕྱག་མཛོད,changts-ö ལྷ་མཛེས,lhandz-e ལོ་མཆོད,lomch-ö +པཎ་ཆེན་,panch-en # THL nasalizations བཀའ་འགྱུར,kangy-ur ངོས་འཛིན,ngöndz-in @@ -152,4 +168,4 @@ འཕྲོ་འདུ/Cu,tront- སྤྲོ་བསྡུ/Co,tront- ན་བཟ/Cb,namz- -མ་འགགས,mank-ak +མ་འགགས,mank-ak \ No newline at end of file diff --git a/bophono/data/roots.csv b/bophono/data/roots.csv index d444db9..7aa770f 100644 --- a/bophono/data/roots.csv +++ b/bophono/data/roots.csv @@ -97,7 +97,7 @@ མཐ,~th+,[m]t+,t,t འཐ,~th+,[']t+,t,th ད,th-,t,d,d -དྲ,thr-,tr,d,d +དྲ,thr-,tr,dr,d གད*,t-,[r]d,d,d བད*,t-,[b]d,d,d མད*,~t-,[m]d,d,d @@ -217,7 +217,7 @@ ཏྭ,t+,t,t,t ཐྭ,th+,t+,t,th དྭ,th-,t,d,d -དྲྭ,thr-,tr,d,d +དྲྭ,thr-,tr,dr,d ཕྱྭ,ch+,sh,ch,ch མྭ,m-,m,m,m ཙྭ,ts+,ts,ts,ts diff --git a/tests/KVP_corrections.csv b/tests/KVP_corrections.csv new file mode 100644 index 0000000..5d05ee4 --- /dev/null +++ b/tests/KVP_corrections.csv @@ -0,0 +1,52 @@ +,output,correct,exceptions,,, +དྲྭ,da,dra,,,, +དྲི་,di,dri,,,, +དྲིན་,din,drin,,,, +དྲངས,dang,drang,,,, +དྲིལ,dil,dril,,,, +དྲན,den,dren,,,, +འཕྲུལ,drul,trul,,Note: two things need to be fixed.,, +རྡོ་རྗེ,doje,dorje,exception,,, +མཁའ་འགྲོ,khadro,khandro,exception,,, +དྲེགས,dek,drek,,Note: two things need to be fixed.,, +སྤྲིན,drin,trin,,,, +སྤྲུལ,drul,trul,,Note: two things need to be fixed.,, +སྤྲས,dre,tre,,,, +དཀྲིགས,drik,trik,,,, +བཀྲེན་,dren,tren,,,, +བཀྲམ,dram,tram,,,, +ཁྲོས,drö,trö,,,, +ཁྲི་,dri,tri,,,, +ཁྲུས,dru,tru,,,, +ཁྲིམས་,drim,trim,,,, +པདྨ,padma,pema,exception,,, +པདྨོ་,padmo,pemo,exception,,, +བར་ཆད,warche,barche,exception,,, +སློབ་དཔོན་,lobpön,lopön,exception,,, +རང་གྲོལ་,rangdrol,rangdröl,,,, +རོལ,rol,röl,,,, +སྩོལ་,tsol,tsöl,,,, +གསོལ་,sol,söl,,,, +གཞོལ་,zhol,zhöl,,,, +འབུལ,bul,bul,,,, +ཡུལ,yul,yul,,,, +བརྟུལ་,tul,tul,,,, +ཚུལ་,tsul,tsul,,,, +ཕུལ་,pul,pul,,,, +རྡུལ,dul,dul,,,, +བསྐུལ,kul,kul,,,, +བློ་བཟང་,lozang,lobzang,exception,,, +བཀྲ་ཤིས་,drashi,tashi,exception,,,"suggestion: If ""ta shi"" is chosen for sādhana phonetics, it should be an exception and the adjective བཀྲ་ should stay as ""tra""" +ནེའུ་,ne u,ne’u,,Note: curved smart quote mark would be ideal,, +བེའུ་,be u,be’u,,Note: curved smart quote mark would be ideal,, +མཆིའོ,chi o,chi’o,,Note: curved smart quote mark would be ideal,, +ཕྲན་,dren,tren,,,, +འཕྲོག་,drok,trok,,,, +འཕྲོ་,dro,tro,,,, +ཕྲག,drak,trak,,Note: two things need to be fixed.,, +ཕྲེང་,dreng,treng,,,, +ཀྲུང་,krung,trung,,,, +ལའང་,lang,la’ang,,Note: curved smart quote mark would be ideal,, +དྭངས,nge,dang,,,, +མ་ཧཱ་ གུ་རུ་,ma guru,maha guru,,,, +བདེ་ ཆེན་ ནཱ་ དའི་ དངོས,de chen de ngö,de chen na de ngö,,,, \ No newline at end of file diff --git a/tests/test_KVP_corrections.py b/tests/test_KVP_corrections.py new file mode 100644 index 0000000..b1149be --- /dev/null +++ b/tests/test_KVP_corrections.py @@ -0,0 +1,281 @@ +import sys +import os +import inspect +import pytest + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +import bophono +import csv + +from test_helpers import assert_equal_phonetics + + +### Phonetics Key table + +def test_vowels(): + assert_equal_phonetics("KVP", "ཨ", "a") + assert_equal_phonetics("KVP", "ཨེ", "e") + assert_equal_phonetics("KVP", "ཨོ", "o") + assert_equal_phonetics("KVP", "ཨི", "i") + assert_equal_phonetics("KVP", "ཨུ", "u") + assert_equal_phonetics("KVP", "ཨུས", "u") + assert_equal_phonetics("KVP", "ཨའི", "e") + +def test_ka(): + assert_equal_phonetics("KVP", "ཀ", "ka") + assert_equal_phonetics("KVP", "རྐ", "ka") + assert_equal_phonetics("KVP", "བཀའ", "ka") + +def test_kha(): + assert_equal_phonetics("KVP", "ཁ", "kha") + assert_equal_phonetics("KVP", "མཁའ", "kha") + +def test_ga_consonant(): + assert_equal_phonetics("KVP", "ག", "ga") + assert_equal_phonetics("KVP", "རྒ", "ga") + assert_equal_phonetics("KVP", "དགའ", "ga") + +def test_ga_suffix(): + assert_equal_phonetics("KVP", "བག", "bak") # Final consonant becomes k + assert_equal_phonetics("KVP", "དགེ་ལུགས་", "geluk") # Final consonant becomes k + assert_equal_phonetics("KVP", "དགེ་ལུགས་པ་", "gelugpa") # Not final consonant becomes g + assert_equal_phonetics("KVP", "རིག་པ་", "rigpa") # Not final consonant becomes g + assert_equal_phonetics("KVP", "རྫོགས་ཆེན་", "dzogchen") # Not final consonant becomes g + assert_equal_phonetics("KVP", "སྤྱན་རས་གཟིགས་", "chenrezig") # Exception + assert_equal_phonetics("KVP", "ངག་དབང་", "ngawang") # Exception + assert_equal_phonetics("KVP", "འབྲུག་པ་", "drukpa") # Exception + +def test_nga_consonant(): + assert_equal_phonetics("KVP", "ང", "nga") + assert_equal_phonetics("KVP", "རྔ", "nga") + +def test_cha_consonant(): + assert_equal_phonetics("KVP", "ཅ", "cha") + assert_equal_phonetics("KVP", "ལྕ", "cha") + +def test_cha_consonant(): + assert_equal_phonetics("KVP", "ཆ", "cha") + +def test_ja_consonant(): + assert_equal_phonetics("KVP", "ཇ", "ja") + assert_equal_phonetics("KVP", "རྗ", "ja") + +def test_nya_consonant(): + assert_equal_phonetics("KVP", "ཉ", "nya") + assert_equal_phonetics("KVP", "སྙ", "nya") + +def test_ta_consonant(): + assert_equal_phonetics("KVP", "ཏ", "ta") + assert_equal_phonetics("KVP", "རྟ", "ta") + +def test_ta_consonant(): + assert_equal_phonetics("KVP", "ཐ", "ta") + assert_equal_phonetics("KVP", "སྣར་ཐང་", "narthang") # Exception + assert_equal_phonetics("KVP", "བུམ་ཐང་", "bumthang") # Exception + +def test_da_consonant(): + assert_equal_phonetics("KVP", "ད", "da") + assert_equal_phonetics("KVP", "རྡ", "da") + +def test_na_consonant(): + assert_equal_phonetics("KVP", "ན", "na") + assert_equal_phonetics("KVP", "རྣ", "na") + +def test_pa_consonant(): + assert_equal_phonetics("KVP", "པ", "pa") + assert_equal_phonetics("KVP", "སྤ", "pa") + +def test_pha_consonant(): + assert_equal_phonetics("KVP", "ཕ", "pa") + assert_equal_phonetics("KVP", "འཕ", "pa") + assert_equal_phonetics("KVP", "འཕོ་བ་", "powa") # Exception + assert_equal_phonetics("KVP", "མི་ཕམ་", "mipham") # Exception + +def test_ba_consonant(): + assert_equal_phonetics("KVP", "བ", "wa") + assert_equal_phonetics("KVP", "བར", "war") + assert_equal_phonetics("KVP", "བས", "we") + assert_equal_phonetics("KVP", "བད", "we") + assert_equal_phonetics("KVP", "བའི", "we") + assert_equal_phonetics("KVP", "བར་དུ་", "bardu") + assert_equal_phonetics("KVP", "སྐང་བ་", "kangwa") + assert_equal_phonetics("KVP", "རྦ", "ba") + assert_equal_phonetics("KVP", "སྦ", "ba") + assert_equal_phonetics("KVP", "འབའ", "ba") + +def test_ba_suffix(): + assert_equal_phonetics("KVP", "གང་བ་", "gangwa") + assert_equal_phonetics("KVP", "སློབ་", "lob") + assert_equal_phonetics("KVP", "སློབ་དཔོན་", "lopön") + assert_equal_phonetics("KVP", "ཐུབ་བསྟན་", "tubten") + +def test_ma_consonant(): + assert_equal_phonetics("KVP", "མ", "ma") + assert_equal_phonetics("KVP", "མྲ", "ma") + assert_equal_phonetics("KVP", "སྨྲ་བ་", "mawa") + assert_equal_phonetics("KVP", "མྱ", "nya") + assert_equal_phonetics("KVP", "མྱང་", "nyang") + +def test_tsa_consonant(): + assert_equal_phonetics("KVP", "ཙ", "tsa") + assert_equal_phonetics("KVP", "རྩ", "tsa") + +def test_tsa_consonant(): + assert_equal_phonetics("KVP", "ཚ", "tsa") + +def test_dza_consonant(): + assert_equal_phonetics("KVP", "ཛ", "dza") # dza at the beginning of a word + assert_equal_phonetics("KVP", "འཛི་སྒར་", "tsa") # dza at the beginning of a word + assert_equal_phonetics("KVP", "ར་ཛ་", "raza") # za in the middle of a word + +def test_dza_consonant(): + assert_equal_phonetics("KVP", "ཞ", "zha") + assert_equal_phonetics("KVP", "བཞ", "zha") + assert_equal_phonetics("KVP", "ཞེ་ཆེན་", "shechen") # Exception + assert_equal_phonetics("KVP", "ཞྭ་ལུ", "shalu") # Exception + assert_equal_phonetics("KVP", "ཞྭ་དམར་", "shamar") # Exception + assert_equal_phonetics("KVP", "ཞྭ་དམར་པ་", "shamarpa") # Exception + +def test_za_consonant(): + assert_equal_phonetics("KVP", "ཟ", "za") + assert_equal_phonetics("KVP", "བཟ", "za") + +def test_ya_consonant(): + assert_equal_phonetics("KVP", "ཡ", "ya") + +def test_ra_consonant(): + assert_equal_phonetics("KVP", "ར", "ra") + +def test_la_consonant(): + assert_equal_phonetics("KVP", "ལ", "la") + +def test_lha_consonant(): + assert_equal_phonetics("KVP", "ལྷ", "lha") + +def test_sha_consonant(): + assert_equal_phonetics("KVP", "ཤ", "sha") + +def test_sa_consonant(): + assert_equal_phonetics("KVP", "ས", "sa") + +def test_ha_consonant(): + assert_equal_phonetics("KVP", "ཧ", "ha") + +def test_yatas(): + assert_equal_phonetics("KVP", "ཀྱི", "kyi") + assert_equal_phonetics("KVP", "ཆོས་ཀྱི", "chökyi") + assert_equal_phonetics("KVP", "བྱ", "ja") + assert_equal_phonetics("KVP", "བྱང་ཆུབ་", "jangchub") + assert_equal_phonetics("KVP", "པྱ", "cha") + assert_equal_phonetics("KVP", "ཕྱ", "cha") + assert_equal_phonetics("KVP", "ཕྱག་རྒྱ", "chakgya") + +def test_ratas(): + assert_equal_phonetics("KVP", "ཀྲ", "tra") + assert_equal_phonetics("KVP", "བཀྲ་བ་", "trawa") + assert_equal_phonetics("KVP", "ཀྲོག་ཀྲོག་", "trogtrok") + assert_equal_phonetics("KVP", "བཀྲ་ཤིས་", "tashi") + assert_equal_phonetics("KVP", "གྲ", "tra") # tra at the beginning of a word + assert_equal_phonetics("KVP", "གྲ་པ་", "trapa") + assert_equal_phonetics("KVP", "ལྷུན་གྲུབ་", "lhundrub") # dra if not beginning of a word + assert_equal_phonetics("KVP", "པྲ", "tra") + assert_equal_phonetics("KVP", "ཕྲ", "tra") + assert_equal_phonetics("KVP", "འཕྲིན་ལས་", "trinle") + assert_equal_phonetics("KVP", "སྤྲུལ་སྐུ་", "tulku") + assert_equal_phonetics("KVP", "བྲ", "dra") + assert_equal_phonetics("KVP", "མཉམ་འབྲེལ་", "nyamdrel") + +def test_dao_wa(): + assert_equal_phonetics("KVP", "དབ", "wa") + assert_equal_phonetics("KVP", "དབང", "wang") + assert_equal_phonetics("KVP", "དབུ", "u") + assert_equal_phonetics("KVP", "དབུས", "u") + assert_equal_phonetics("KVP", "དབྱང", "yang") + +### Additional Phonetics Instructions: + +# The ö umlaut is used when the o is followed by d, n, ʼi, l, and s suffixes (in accordance with Tour- nadre’s MST). +def test_o_umlaut(): + assert_equal_phonetics("KVP", "ཨོ", "o") + assert_equal_phonetics("KVP", "ཨོད", "ö") + assert_equal_phonetics("KVP", "ཨོས", "ö") + assert_equal_phonetics("KVP", "ཨོའི", "ö") + assert_equal_phonetics("KVP", "ཨོན", "ön") + assert_equal_phonetics("KVP", "ཨོལ", "öl") + assert_equal_phonetics("KVP", "གོལ", "göl") + +# Do not use the ü umlaut. +def test_no_u_umlaut(): + assert_equal_phonetics("KVP", "ཨུ", "u") + assert_equal_phonetics("KVP", "ཨུད", "u") + assert_equal_phonetics("KVP", "ཨུས", "u") + assert_equal_phonetics("KVP", "ཨུའི", "u") + assert_equal_phonetics("KVP", "ཨུན", "un") + assert_equal_phonetics("KVP", "ཨུལ", "ul") + assert_equal_phonetics("KVP", "གུལ", "gul") + +# The e accent should only be applied to the final e when there is a clear risk of mispronunciation (primarily for English words such as chime and dome). +def test_no_accent_on_e_apart_from_exceptions(): + assert_equal_phonetics("KVP", "མེ་", "me") + assert_equal_phonetics("KVP", "མེད", "me") + assert_equal_phonetics("KVP", "མིག་མེད་", "migme") + assert_equal_phonetics("KVP", "མེད་སྣང་", "menang") + assert_equal_phonetics("KVP", "འཆི་མེད་", "chimé") + assert_equal_phonetics("KVP", "མདོ་མེད་", "domé") + assert_equal_phonetics("KVP", "རིས་མེད་", "rimé") + assert_equal_phonetics("KVP", "ཨ་མེས་", "amé") + assert_equal_phonetics("KVP", "ཅོ་ནེ་", "choné") + +# g→k: If g ends the first syllable and the second syllable begins with a g, then it is spelled kg. For example: Chokgyur. +def test_gg_yields_kg(): + assert_equal_phonetics("KVP", "མཆོག་འགྱུར་", "chokgyur") + assert_equal_phonetics("KVP", "ཕྱག་རྒྱ", "chakgya") + +# ng+g: When a syllable that ends in ng is followed by a syllable starting with g, the second g is dropped. For example: Senge. +def test_ngg_yields_ng(): + assert_equal_phonetics("KVP", "སེང་གེ་", "senge") + assert_equal_phonetics("KVP", "གང་གི་", "gangi") + +# a→e: When followed by an n (but not when followed by an l), unless a conventional spelling in English. For example: Palden, Namgyal, but Panchen. +def test_a_followed_by_n_or_l(): + assert_equal_phonetics("KVP", "འགན་", "gen") + assert_equal_phonetics("KVP", "རྒྱན་", "gyen") + assert_equal_phonetics("KVP", "བལ་ལྡན་", "palden") + assert_equal_phonetics("KVP", "རྣམ་རྒྱལ་", "namgyal") + assert_equal_phonetics("KVP", "པཎ་ཆེན་", "panchen") + +# Names of contemporary masters, places, schools, and words that are commonly spelled in English (such as Dzongsar Khyentse, Drukpa Kagyu, Shechen, Shigatse, tonglen, chöd, rinpoche, and tulku) should be spelled according to convention. +def test_names_and_common_spellings(): + assert_equal_phonetics("KVP", "ཞེ་ཆེན་", "shechen") + assert_equal_phonetics("KVP", "གཏོང་ལེན་", "tonglen") + assert_equal_phonetics("KVP", "གཅོད་", "chöd") + assert_equal_phonetics("KVP", "རིན་པོ་ཆེ་", "rinpoche") + assert_equal_phonetics("KVP", "སྤྲུལ་སྐུ་", "tulku") + assert_equal_phonetics("KVP", "སྡེ་དགེ་", "derge") + assert_equal_phonetics("KVP", "སྤྱན་རས་གཟིགས་", "chenrezig") + assert_equal_phonetics("KVP", "མི་ཕམ་", "mipham") + assert_equal_phonetics("KVP", "སྣར་ཐང་", "narthang") + assert_equal_phonetics("KVP", "ཞྭ་ལུ", "shalu") + assert_equal_phonetics("KVP", "གན་ལྡན་", "ganden") + assert_equal_phonetics("KVP", "པཎ་ཆེན་ བླ་མ་", "panchen lama") + +# The achung should be ignored +def test_achung(): + assert_equal_phonetics("KVP", "མ་ཧཱ་", "maha") + assert_equal_phonetics("KVP", "བདེ་ཆེན་ ནཱ་དའི་ དངོས", "dechen nade ngö") + + +### Checking that things work as expected in KVP_corrections.csv + +def load_corrections(): + csv_file_path = os.path.join(os.path.dirname(__file__), 'KVP_corrections.csv') + with open(csv_file_path, 'r') as file: + reader = csv.reader(file) + next(reader) # Skip the first line (headers) + return [(row[0], row[2]) for row in reader] + +corrections = load_corrections() + +@pytest.mark.parametrize("tibetan, expected", corrections) +def test_phonetics_tool_corrections(tibetan, expected): + assert_equal_phonetics("KVP", tibetan, expected) \ No newline at end of file diff --git a/tests/test_KVP_kangki_lodro.py b/tests/test_KVP_kangki_lodro.py index 8045263..968ff00 100644 --- a/tests/test_KVP_kangki_lodro.py +++ b/tests/test_KVP_kangki_lodro.py @@ -20,14 +20,14 @@ def test_kangki_lodro_one_by_one(): བཅུ་ ཕྲག་ བཅུ་ དང་ བཅུ་ གཉིས་ རྒྱན་ སྤྲས་ བདག་ བློའི་ མུན་ སེལ་ འཇམ་ པའི་ དབྱངས་ ལ་ རབ་ ཏུ་ འདུད།། """ expected = """ - gang gi lo drö drib nyi drin dral nyi tar nam dak rab sal wé - ji nyé dön kün ji zhin zik chir nyi kyi tuk kar lek bam dzin - gang dak si pé tsön rar ma rik mün tum duk ngal gyi zir wé - dro tsok kün la bu chik tar tsé yen lak duk chü yang den sung - druk tar cher drok nyön mong nyi long lé kyi chak drok dröl dzé ching - ma rik mün sel duk ngal nyu gu ji nyé chöd dzé ral dri nam - dö né dak ching sa chü tar sön yön ten lü dzok gyal sé tu wö ku - chu drak chu dang chu nyi gyen dré dak lö mün sel jam pé yang la rab tu dü + gang gi lo trö drib nyi trin dral nyi tar nam dak rab sal we + ji nye dön kun ji zhin zik chir nyi kyi tuk kar lek bam dzin + gang dak si pe tsön rar ma rik mun tum duk ngal gyi zir we + dro tsok kun la bu chik tar tse yen lak druk chu yang den sung + druk tar cher drok nyön mong nyi long le kyi chak drok dröl dze ching + ma rik mun sel duk ngal nyu gu ji nye chöd dze ral tri nam + dö ne dak ching sa chu tar sön yön ten lu dzok gyal se tu wö ku + chu trak chu dang chu nyi gyen tre dak lö mun sel jam pe yang la rab tu du """ assert_equal_phonetics("KVP", tibetan, expected) @@ -43,14 +43,14 @@ def test_kangki_lodro_two_by_two(): བཅུ་ཕྲག་ བཅུ་དང་ བཅུ་གཉིས་ རྒྱན་སྤྲས་ བདག་བློའི་ མུན་སེལ་ འཇམ་པའི་ དབྱངས་ལ་ རབ་ཏུ་འདུད།། """ expected = """ - gankgi lotrö dribnyi drindral nyitar namdak rabsalwé - jinyé dönkün jizhin zigchir nyikyi tugkar legbamzin - gangdak sipé tsönrar marik müntum dugngal gyizirwé - drotsok künla buchik tartsé yenlak dugchü yangdensung + gangi lodrö dribnyi trindral nyitar namdak rabsalwe + jinye dönkun jizhin zigchir nyikyi tugkar legbamzin + gangdak sipe tsönrar marik muntum dugngal gyizirwe + drotsok kunla buchik tartse yenlak drugchu yangdensung drugtar cherdrok nyönmong nyilong lekyi chagdrok drölzeching - marik münsel dugngal nyugu jinyé chödzé raltrinam - döné dagching sachü tarsön yönten lüzok gyalsé tuwöku - chutrak chudang chugnyi gyentré daglö münsel jampé yangla rabtudü + marik munsel dugngal nyugu jinye chödze raldrinam + döne dagching sachu tarsön yönten luzok gyalse tuwöku + chutrak chudang chugnyi gyentre daglö munsel jampe yangla rabtudu """ assert_equal_phonetics("KVP", tibetan, expected) @@ -66,13 +66,13 @@ def test_kangki_lodro_word_by_word(): བཅུ་ཕྲག་ བཅུ་ དང་ བཅུ་གཉིས་ རྒྱན་ སྤྲས་ བདག་ བློའི་ མུན་སེལ་ འཇམ་པའི་ དབྱངས་ ལ་ རབ་ཏུ་ འདུད །། """ expected = """ - gankgi lotrö dribnyi drindral nyi tar namdak rabsalwé - jinyé dön kün jizhin zik chir nyikyi tugkar legbam dzin - gangdak sipé tsönrarmarik mün tum dugngal gyi zirwé - dro tsok kün la bu chik tar tsé yenlak duk chü yangden sung - druk tar cher drok nyönmong nyi long lekyi chagdrok dröl dzé ching - marik münsel dugngal nyugu jinyé chöd dzé raltri nam - dö né dak ching sachü tar sön yönten lüzok gyalsé tuwö ku - chutrak chu dang chugnyi gyen dré dak lö münsel jampé yang la rabtu dü + gangi lodrö dribnyi trindral nyi tar namdak rabsalwe + jinye dön kun jizhin zik chir nyikyi tugkar legbam dzin + gangdak sipe tsönrarmarik mun tum dugngal gyi zirwe + dro tsok kun la bu chik tar tse yenlak druk chu yangden sung + druk tar cher drok nyönmong nyi long lekyi chagdrok dröl dze ching + marik munsel dugngal nyugu jinye chöd dze raldri nam + dö ne dak ching sachu tar sön yönten luzok gyalse tuwö ku + chutrak chu dang chugnyi gyen tre dak lö munsel jampe yang la rabtu du """ assert_equal_phonetics("KVP", tibetan, expected) diff --git a/tests/test_KVP_wasur.py b/tests/test_KVP_wasur.py index 08a8d82..5bd5868 100644 --- a/tests/test_KVP_wasur.py +++ b/tests/test_KVP_wasur.py @@ -20,15 +20,15 @@ def load_wasur_cases(): def test_cases_without_wasur(): - assert_equal_phonetics("KVP", "མངས", "ngé") - assert_equal_phonetics("KVP", "མགས", "gé") - assert_equal_phonetics("KVP", "དབས", "wé") - assert_equal_phonetics("KVP", "དངས", "ngé") - assert_equal_phonetics("KVP", "དགས", "gé") - assert_equal_phonetics("KVP", "དམས", "mé") - assert_equal_phonetics("KVP", "བགས", "gé") - assert_equal_phonetics("KVP", "འབས", "bé") - assert_equal_phonetics("KVP", "འགས", "gé") + assert_equal_phonetics("KVP", "མངས", "nge") + assert_equal_phonetics("KVP", "མགས", "ge") + assert_equal_phonetics("KVP", "དབས", "we") + assert_equal_phonetics("KVP", "དངས", "nge") + assert_equal_phonetics("KVP", "དགས", "ge") + assert_equal_phonetics("KVP", "དམས", "me") + assert_equal_phonetics("KVP", "བགས", "ge") + assert_equal_phonetics("KVP", "འབས", "be") + assert_equal_phonetics("KVP", "འགས", "ge") def test_wasur_cases_with_root_position_change(): assert_equal_phonetics("KVP", "མྭངས", "mang") diff --git a/tests/wasur_cases.csv b/tests/wasur_cases.csv index 171a0ec..e2b9d43 100644 --- a/tests/wasur_cases.csv +++ b/tests/wasur_cases.csv @@ -1,12 +1,12 @@ ཀྭ,ka ཀྭན,ken -ཀྭས,ké +ཀྭས,ke ཁྭ,kha ཁྭག,khak ཁྭགས,khak ཁྭིམས,khim གྭ,ga -གྲྭ,dra +གྲྭ,tra ཉྭ,nya ཏྭ,ta ཏྭམ,tam @@ -19,12 +19,12 @@ དྭང,dang དྭངས,dang དྭོགས,dok -དྲྭ,da +དྲྭ,dra ཕྱྭ,cha བཀྭན,ken བཅྭ,cha བསྭ,sa -བསྭེ,sé +བསྭེ,se བསྭོ,so མྭ,ma ཙྭ,tsa @@ -43,11 +43,11 @@ ལྭོ,lo ལྷྭམ,lham ཤྭ,sha -ཤྭད,shé +ཤྭད,she ཤྭར,shar སྟྭ,ta སྭ,sa -སྭད,sé +སྭད,se སྭོ,so ཧྭ,ha ཧྭག,hak