Skip to content

Commit

Permalink
Add Kurdish Kurmanji language (explosion#13561)
Browse files Browse the repository at this point in the history
* Add Kurdish Kurmanji language

* Add lex_attrs
  • Loading branch information
cikay authored Sep 9, 2024
1 parent 55db9c2 commit acbf2a4
Show file tree
Hide file tree
Showing 9 changed files with 239 additions and 1 deletion.
15 changes: 15 additions & 0 deletions spacy/lang/kmr/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from .lex_attrs import LEX_ATTRS
from ...language import BaseDefaults, Language
from .stop_words import STOP_WORDS


class KurmanjiDefaults(BaseDefaults):
    """Language-specific defaults for Kurdish Kurmanji (kmr)."""

    # Stop-word set defined in stop_words.py.
    stop_words = STOP_WORDS
    # Lexeme attribute getters (currently only LIKE_NUM, see lex_attrs.py).
    lex_attr_getters = LEX_ATTRS


class Kurmanji(Language):
    """Kurdish Kurmanji language class, registered under the code "kmr"."""

    lang = "kmr"  # language code used by spacy.blank("kmr") etc.
    Defaults = KurmanjiDefaults


__all__ = ["Kurmanji"]
17 changes: 17 additions & 0 deletions spacy/lang/kmr/examples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""
Example sentences to test spaCy and its language models.
>>> from spacy.lang.kmr.examples import sentences
>>> docs = nlp.pipe(sentences)
"""

# Example Kurmanji sentences (English glosses in trailing comments); used for
# smoke-testing the language class — see the module docstring above.
sentences = [
    "Berê mirovan her tim li geşedana pêşerojê ye",  # People's gaze is always on the development of the future
    "Kawa Nemir di 14 salan de Ulysses wergerand Kurmancî.",  # Kawa Nemir translated Ulysses into Kurmanji in 14 years.
    "Mem Ararat hunermendekî Kurd yê bi nav û deng e.",  # Mem Ararat is a famous Kurdish artist
    "Firat Cewerî 40 sal e pirtûkên Kurdî dinivîsîne.",  # Firat Ceweri has been writing Kurdish books for 40 years
    "Rojnamegerê ciwan nûçeyeke balkêş li ser rewşa aborî nivîsand",  # The young journalist wrote an interesting news article about the economic situation
    "Sektora çandiniyê beşeke giring a belavkirina gaza serayê li seranserê cîhanê pêk tîne",  # The agricultural sector constitutes an important part of greenhouse gas emissions worldwide
    "Xwendekarên jêhatî di pêşbaziya matematîkê de serkeftî bûn",  # Talented students succeeded in the mathematics competition
    "Ji ber ji tunebûnê bavê min xwişkeke min nedan xwendin ew ji min re bû derd û kulek.",  # Because of poverty, my father didn't send my sister to school, which became a pain and sorrow for me
]
139 changes: 139 additions & 0 deletions spacy/lang/kmr/lex_attrs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
from ...attrs import LIKE_NUM


# Kurmanji cardinal number words, matched case-insensitively by like_num().
# Ordered: 0-10, the teens, the tens, then large round numbers.
_num_words = [
    "sifir",
    "yek",
    "du",
    "sê",
    "çar",
    "pênc",
    "şeş",
    "heft",
    "heşt",
    "neh",
    "deh",
    "yazde",
    "dazde",
    "sêzde",
    "çarde",
    "pazde",
    "şazde",
    "hevde",
    "hejde",
    "nozde",
    "bîst",
    "sî",
    "çil",
    "pêncî",
    "şêst",
    "heftê",
    "heştê",
    "nod",
    "sed",
    "hezar",
    "milyon",
    "milyar",
]

# Kurmanji ordinal number words, matched case-insensitively by like_num().
# Each number appears in two forms: the plain "-em" ordinal followed by the
# "-emîn" variant.
_ordinal_words = [
    "yekem",
    "yekemîn",
    "duyem",
    "duyemîn",
    "sêyem",
    "sêyemîn",
    "çarem",
    "çaremîn",
    "pêncem",
    "pêncemîn",
    "şeşem",
    "şeşemîn",
    "heftem",
    "heftemîn",
    "heştem",
    "heştemîn",
    "nehem",
    "nehemîn",
    "dehem",
    "dehemîn",
    "yazdehem",
    "yazdehemîn",
    "dazdehem",
    "dazdehemîn",
    "sêzdehem",
    "sêzdehemîn",
    "çardehem",
    "çardehemîn",
    "pazdehem",
    "pazdehemîn",
    # NOTE(review): "şanzdehem" uses a different stem than the cardinal
    # "şazde" above — confirm which spelling is intended.
    "şanzdehem",
    "şanzdehemîn",
    "hevdehem",
    "hevdehemîn",
    "hejdehem",
    "hejdehemîn",
    "nozdehem",
    "nozdehemîn",
    "bîstem",
    "bîstemîn",
    "sîyem",
    "sîyemîn",
    "çilem",
    "çilemîn",
    "pêncîyem",
    # NOTE(review): spelling is inconsistent with the "-em" form directly
    # above ("pêncîyem" with î vs "pênciyemîn" with i) — confirm the
    # intended orthography.
    "pênciyemîn",
    "şêstem",
    "şêstemîn",
    "heftêyem",
    "heftêyemîn",
    "heştêyem",
    "heştêyemîn",
    "notem",
    "notemîn",
    "sedem",
    "sedemîn",
    "hezarem",
    "hezaremîn",
    "milyonem",
    "milyonemîn",
    "milyarem",
    "milyaremîn",
]


def like_num(text):
    """Return True if *text* looks like a number in Kurmanji.

    Accepts plain digit strings (with an optional leading sign and "," or
    "." separators), simple fractions such as "3/4", cardinal and ordinal
    number words, and digit + ordinal-suffix forms such as "100em".
    """
    stripped = text[1:] if text.startswith(("+", "-", "±", "~")) else text
    stripped = stripped.replace(",", "").replace(".", "")
    if stripped.isdigit():
        return True
    if stripped.count("/") == 1:
        numerator, _, denominator = stripped.partition("/")
        if numerator.isdigit() and denominator.isdigit():
            return True
    lowered = stripped.lower()
    return (
        lowered in _num_words
        or lowered in _ordinal_words  # ordinal number words
        or is_digit(lowered)  # digit + ordinal suffix, e.g. "100em"
    )


def is_digit(text):
    """Return True for digit + ordinal-suffix forms like "100em" or "30yem"."""
    suffixes = ("em", "yem", "emîn", "yemîn")
    return any(
        text.endswith(suffix) and text[: -len(suffix)].isdigit()
        for suffix in suffixes
    )


LEX_ATTRS = {LIKE_NUM: like_num}
44 changes: 44 additions & 0 deletions spacy/lang/kmr/stop_words.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Stop words for Kurdish Kurmanji (kmr).
# NOTE: ``.split()`` tokenizes on whitespace, so every entry must be a single
# token. The original list contained the two-word phrase "li ku", which was
# split into "li" and "ku" (both already listed) and could therefore never
# match as a phrase; it is removed here along with a duplicate "ew". The
# resulting set is unchanged.
STOP_WORDS = set(
    """
û
li
bi
di
da
de
ji
ku
ew
ez
tu
em
hûn
ev
min
te
me
we
wan
va
çi
çawa
çima
kengî
çend
çiqas
her
hin
gelek
hemû
kes
tişt
""".split()
)
Empty file.
16 changes: 16 additions & 0 deletions spacy/tests/lang/kmr/test_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import pytest

from spacy.lang.kmr.lex_attrs import like_num


@pytest.mark.parametrize(
    "word",
    [
        "yekem",
        "duyemîn",
        "100em",
        "dehem",
        "sedemîn",
        "34em",
        "30yem",
        "20emîn",
        "50yemîn",
    ],
)
def test_kmr_lex_attrs_like_number_for_ordinal(word):
    """Ordinal words and digit+suffix forms are recognized as number-like."""
    assert like_num(word)


@pytest.mark.parametrize("word", ["deh"])
def test_kmr_lex_attrs_capitals(word):
    """like_num matches number words case-insensitively."""
    assert like_num(word.upper())
    assert like_num(word)
2 changes: 1 addition & 1 deletion spacy/tests/lang/test_initialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
"mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
"si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
"tr", "tt", "uk", "ur", "xx", "yo"]
"tr", "tt", "uk", "ur", "xx", "yo", "kmr"]
# fmt: on


Expand Down
1 change: 1 addition & 0 deletions spacy/tests/tokenizer/test_explain.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
pytest.param("tr", marks=pytest.mark.slow()),
pytest.param("tt", marks=pytest.mark.slow()),
pytest.param("ur", marks=pytest.mark.slow()),
pytest.param("kmr", marks=pytest.mark.slow()),
]


Expand Down
6 changes: 6 additions & 0 deletions website/meta/languages.json
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,12 @@
],
"example": "这是一个用于示例的句子。",
"has_examples": true
},
{
"code": "kmr",
"name": "Kurdish Kurmanji",
"example": "Ev hevokek e",
"has_examples": true
}
],
"licenses": [
Expand Down

0 comments on commit acbf2a4

Please sign in to comment.