forked from explosion/spaCy
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Kurdish Kurmanji language (explosion#13561)
* Add Kurdish Kurmanji language * Add lex_attrs
- Loading branch information
Showing
9 changed files
with
239 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
from .lex_attrs import LEX_ATTRS | ||
from ...language import BaseDefaults, Language | ||
from .stop_words import STOP_WORDS | ||
|
||
|
||
class KurmanjiDefaults(BaseDefaults):
    """Language defaults for Kurmanji.

    Supplies the stop-word set and the lexical attribute getters that are
    imported from the sibling ``stop_words`` and ``lex_attrs`` modules.
    """

    lex_attr_getters = LEX_ATTRS
    stop_words = STOP_WORDS
|
||
|
||
class Kurmanji(Language):
    """``Language`` subclass for Kurmanji, using the code ``"kmr"``.

    Its defaults are taken from ``KurmanjiDefaults``.
    """

    Defaults = KurmanjiDefaults
    lang = "kmr"
|
||
# Names exported by this module: only the Kurmanji language class.
__all__ = ["Kurmanji"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
""" | ||
Example sentences to test spaCy and its language models. | ||
>>> from spacy.lang.kmr.examples import sentences | ||
>>> docs = nlp.pipe(sentences) | ||
""" | ||
|
||
# Each example sentence is preceded by its English translation.
sentences = [
    # People's gaze is always on the development of the future
    "Berê mirovan her tim li geşedana pêşerojê ye",
    # Kawa Nemir translated Ulysses into Kurmanji in 14 years.
    "Kawa Nemir di 14 salan de Ulysses wergerand Kurmancî.",
    # Mem Ararat is a famous Kurdish artist
    "Mem Ararat hunermendekî Kurd yê bi nav û deng e.",
    # Firat Ceweri has been writing Kurdish books for 40 years
    "Firat Cewerî 40 sal e pirtûkên Kurdî dinivîsîne.",
    # The young journalist wrote an interesting news article about the
    # economic situation
    "Rojnamegerê ciwan nûçeyeke balkêş li ser rewşa aborî nivîsand",
    # The agricultural sector constitutes an important part of greenhouse
    # gas emissions worldwide
    "Sektora çandiniyê beşeke giring a belavkirina gaza serayê li seranserê cîhanê pêk tîne",
    # Talented students succeeded in the mathematics competition
    "Xwendekarên jêhatî di pêşbaziya matematîkê de serkeftî bûn",
    # Because of poverty, my father didn't send my sister to school, which
    # became a pain and sorrow for me
    "Ji ber ji tunebûnê bavê min xwişkeke min nedan xwendin ew ji min re bû derd û kulek.",
]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
from ...attrs import LIKE_NUM | ||
|
||
|
||
_num_words = [ | ||
"sifir", | ||
"yek", | ||
"du", | ||
"sê", | ||
"çar", | ||
"pênc", | ||
"şeş", | ||
"heft", | ||
"heşt", | ||
"neh", | ||
"deh", | ||
"yazde", | ||
"dazde", | ||
"sêzde", | ||
"çarde", | ||
"pazde", | ||
"şazde", | ||
"hevde", | ||
"hejde", | ||
"nozde", | ||
"bîst", | ||
"sî", | ||
"çil", | ||
"pêncî", | ||
"şêst", | ||
"heftê", | ||
"heştê", | ||
"nod", | ||
"sed", | ||
"hezar", | ||
"milyon", | ||
"milyar", | ||
] | ||
|
||
_ordinal_words = [ | ||
"yekem", | ||
"yekemîn", | ||
"duyem", | ||
"duyemîn", | ||
"sêyem", | ||
"sêyemîn", | ||
"çarem", | ||
"çaremîn", | ||
"pêncem", | ||
"pêncemîn", | ||
"şeşem", | ||
"şeşemîn", | ||
"heftem", | ||
"heftemîn", | ||
"heştem", | ||
"heştemîn", | ||
"nehem", | ||
"nehemîn", | ||
"dehem", | ||
"dehemîn", | ||
"yazdehem", | ||
"yazdehemîn", | ||
"dazdehem", | ||
"dazdehemîn", | ||
"sêzdehem", | ||
"sêzdehemîn", | ||
"çardehem", | ||
"çardehemîn", | ||
"pazdehem", | ||
"pazdehemîn", | ||
"şanzdehem", | ||
"şanzdehemîn", | ||
"hevdehem", | ||
"hevdehemîn", | ||
"hejdehem", | ||
"hejdehemîn", | ||
"nozdehem", | ||
"nozdehemîn", | ||
"bîstem", | ||
"bîstemîn", | ||
"sîyem", | ||
"sîyemîn", | ||
"çilem", | ||
"çilemîn", | ||
"pêncîyem", | ||
"pênciyemîn", | ||
"şêstem", | ||
"şêstemîn", | ||
"heftêyem", | ||
"heftêyemîn", | ||
"heştêyem", | ||
"heştêyemîn", | ||
"notem", | ||
"notemîn", | ||
"sedem", | ||
"sedemîn", | ||
"hezarem", | ||
"hezaremîn", | ||
"milyonem", | ||
"milyonemîn", | ||
"milyarem", | ||
"milyaremîn", | ||
] | ||
|
||
|
||
def like_num(text):
    """Return True if ``text`` looks like a number.

    The following forms are accepted:

    * digit strings, optionally preceded by one of ``+``, ``-``, ``±``, ``~``
      and optionally containing ``,`` or ``.`` separators;
    * simple fractions with exactly one ``/`` and digits on both sides;
    * words listed in ``_num_words`` or ``_ordinal_words`` (case-insensitive);
    * digits followed by an ordinal suffix, as recognised by ``is_digit``.
    """
    # Drop a single leading sign / approximation marker.
    if text.startswith(("+", "-", "±", "~")):
        text = text[1:]

    # Thousands and decimal separators do not affect "number-likeness".
    bare = text.replace(",", "").replace(".", "")
    if bare.isdigit():
        return True

    # A fraction needs exactly one slash; a second slash would end up in
    # the denominator and make its isdigit() check fail.
    numerator, slash, denominator = bare.partition("/")
    if slash and numerator.isdigit() and denominator.isdigit():
        return True

    lowered = bare.lower()
    return (
        lowered in _num_words
        or lowered in _ordinal_words
        or is_digit(lowered)
    )
|
||
|
||
def is_digit(text):
    """Return True if ``text`` is a digit string plus an ordinal suffix.

    The recognised suffixes are ``em``, ``yem``, ``emîn`` and ``yemîn``;
    whatever precedes the suffix must be non-empty and consist of digits
    only (for example ``"100em"`` or ``"50yemîn"``).
    """
    suffixes = ("em", "yem", "emîn", "yemîn")
    return any(
        text.endswith(suffix) and text[: -len(suffix)].isdigit()
        for suffix in suffixes
    )
|
||
|
||
# Lexical attribute getters exported by this module: maps the LIKE_NUM
# attribute to the like_num() function defined above.
LEX_ATTRS = {LIKE_NUM: like_num} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# Stop words. The text is split on whitespace, so every entry is a single
# token and repeated words collapse into one set member.
STOP_WORDS = set(
    """
û li bi di da de ji ku
ew ez tu em hûn ev
min te wî wê me we wan
vê vî va
çi kî kê çawa çima kengî çend çiqas
her hin gelek hemû kes tişt
""".split()
)
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
import pytest | ||
|
||
from spacy.lang.kmr.lex_attrs import like_num | ||
|
||
|
||
@pytest.mark.parametrize(
    "word",
    [
        "yekem",
        "duyemîn",
        "100em",
        "dehem",
        "sedemîn",
        "34em",
        "30yem",
        "20emîn",
        "50yemîn",
    ],
)
def test_kmr_lex_attrs_like_number_for_ordinal(word):
    """Spelled-out ordinals and digit+suffix ordinals are number-like."""
    assert like_num(word)
|
||
|
||
@pytest.mark.parametrize("word", ["deh"])
def test_kmr_lex_attrs_capitals(word):
    """Number words are recognised both as given and in upper case."""
    for form in (word, word.upper()):
        assert like_num(form)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters