Add Kurdish Kurmanji language (explosion#13561)

* Add Kurdish Kurmanji language
* Add lex_attrs
DuyguA · Sep 9, 2024 · acbf2a4 · acbf2a4
1 parent 55db9c2
commit acbf2a4
Show file tree

Hide file tree

Showing 9 changed files with 239 additions and 1 deletion.
diff --git a/spacy/lang/kmr/__init__.py b/spacy/lang/kmr/__init__.py
@@ -0,0 +1,15 @@
+from .lex_attrs import LEX_ATTRS
+from ...language import BaseDefaults, Language
+from .stop_words import STOP_WORDS
+
+
+class KurmanjiDefaults(BaseDefaults):
+    """Defaults for Kurmanji: lexical attribute getters and stop words."""
+
+    lex_attr_getters = LEX_ATTRS
+    stop_words = STOP_WORDS
+
+
+class Kurmanji(Language):
+    """Language subclass for Kurdish Kurmanji (language code "kmr")."""
+
+    Defaults = KurmanjiDefaults
+    lang = "kmr"
+
+
+__all__ = ["Kurmanji"]
diff --git a/spacy/lang/kmr/examples.py b/spacy/lang/kmr/examples.py
@@ -0,0 +1,17 @@
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.kmr.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+# Kurmanji example sentences; the trailing comment on each entry is its
+# English translation.
+sentences = [
+    "Berê mirovan her tim li geşedana pêşerojê ye",  # People's gaze is always on the development of the future
+    "Kawa Nemir di 14 salan de Ulysses wergerand Kurmancî.",  # Kawa Nemir translated Ulysses into Kurmanji in 14 years.
+    "Mem Ararat hunermendekî Kurd yê bi nav û deng e.",  # Mem Ararat is a famous Kurdish artist
+    "Firat Cewerî 40 sal e pirtûkên Kurdî dinivîsîne.",  # Firat Ceweri has been writing Kurdish books for 40 years
+    "Rojnamegerê ciwan nûçeyeke balkêş li ser rewşa aborî nivîsand",  # The young journalist wrote an interesting news article about the economic situation
+    "Sektora çandiniyê beşeke giring a belavkirina gaza serayê li seranserê cîhanê pêk tîne",  # The agricultural sector constitutes an important part of greenhouse gas emissions worldwide
+    "Xwendekarên jêhatî di pêşbaziya matematîkê de serkeftî bûn",  # Talented students succeeded in the mathematics competition
+    "Ji ber ji tunebûnê bavê min xwişkeke min nedan xwendin ew ji min re bû derd û kulek.",  # Because of poverty, my father didn't send my sister to school, which became a pain and sorrow for me
+]
diff --git a/spacy/lang/kmr/lex_attrs.py b/spacy/lang/kmr/lex_attrs.py
@@ -0,0 +1,139 @@
+from ...attrs import LIKE_NUM
+
+
+# Kurmanji cardinal number words, all lowercase. like_num() lowercases its
+# input before testing membership in this list.
+_num_words = [
+    "sifir",
+    "yek",
+    "du",
+    "sê",
+    "çar",
+    "pênc",
+    "şeş",
+    "heft",
+    "heşt",
+    "neh",
+    "deh",
+    "yazde",
+    "dazde",
+    "sêzde",
+    "çarde",
+    "pazde",
+    "şazde",
+    "hevde",
+    "hejde",
+    "nozde",
+    "bîst",
+    "sî",
+    "çil",
+    "pêncî",
+    "şêst",
+    "heftê",
+    "heştê",
+    "nod",
+    "sed",
+    "hezar",
+    "milyon",
+    "milyar",
+]
+
+# Kurmanji ordinal number words, all lowercase. Each ordinal appears in its
+# short ("-em"/"-yem") and long ("-emîn"/"-yemîn") form. like_num() lowercases
+# its input before testing membership in this list.
+_ordinal_words = [
+    "yekem",
+    "yekemîn",
+    "duyem",
+    "duyemîn",
+    "sêyem",
+    "sêyemîn",
+    "çarem",
+    "çaremîn",
+    "pêncem",
+    "pêncemîn",
+    "şeşem",
+    "şeşemîn",
+    "heftem",
+    "heftemîn",
+    "heştem",
+    "heştemîn",
+    "nehem",
+    "nehemîn",
+    "dehem",
+    "dehemîn",
+    "yazdehem",
+    "yazdehemîn",
+    "dazdehem",
+    "dazdehemîn",
+    "sêzdehem",
+    "sêzdehemîn",
+    "çardehem",
+    "çardehemîn",
+    "pazdehem",
+    "pazdehemîn",
+    # Forms matching the cardinal "şazde" used in _num_words; the "şanz-"
+    # variants below are kept for backward compatibility.
+    "şazdehem",
+    "şazdehemîn",
+    "şanzdehem",
+    "şanzdehemîn",
+    "hevdehem",
+    "hevdehemîn",
+    "hejdehem",
+    "hejdehemîn",
+    "nozdehem",
+    "nozdehemîn",
+    "bîstem",
+    "bîstemîn",
+    "sîyem",
+    "sîyemîn",
+    "çilem",
+    "çilemîn",
+    "pêncîyem",
+    # Long form spelled consistently with "pêncîyem"; the original spelling
+    # with a plain "i" is kept so previously matched tokens still match.
+    "pêncîyemîn",
+    "pênciyemîn",
+    "şêstem",
+    "şêstemîn",
+    "heftêyem",
+    "heftêyemîn",
+    "heştêyem",
+    "heştêyemîn",
+    # Regular forms built from the cardinal "nod"; "notem"/"notemîn" are kept
+    # as accepted variants.
+    "nodem",
+    "nodemîn",
+    "notem",
+    "notemîn",
+    "sedem",
+    "sedemîn",
+    "hezarem",
+    "hezaremîn",
+    "milyonem",
+    "milyonemîn",
+    "milyarem",
+    "milyaremîn",
+]
+
+
+def like_num(text):
+    """Return True when ``text`` looks like a number.
+
+    Accepted forms: digit strings (optionally prefixed by one of "+", "-",
+    "±", "~", with "," and "." removed before the check), simple fractions
+    such as "3/4", cardinal and ordinal number words (case-insensitive), and
+    digits followed by an ordinal suffix (see ``is_digit``).
+    """
+    # Drop a single leading sign / approximation marker.
+    if text[:1] in ("+", "-", "±", "~"):
+        text = text[1:]
+    # Separators are removed so "1.000" and "1,000" count as digit strings.
+    cleaned = text.replace(",", "").replace(".", "")
+    if cleaned.isdigit():
+        return True
+    # Exactly one "/" with digits on both sides is treated as a fraction.
+    pieces = cleaned.split("/")
+    if len(pieces) == 2 and pieces[0].isdigit() and pieces[1].isdigit():
+        return True
+    lowered = cleaned.lower()
+    return (
+        lowered in _num_words
+        or lowered in _ordinal_words
+        or is_digit(lowered)
+    )
+
+
+def is_digit(text):
+    """Return True if ``text`` is a digit string plus an ordinal suffix.
+
+    The recognised suffixes are "em", "yem", "emîn" and "yemîn", so "100em"
+    and "30yem" match while a bare suffix or a bare number does not.
+    """
+    return any(
+        text.endswith(suffix) and text[: -len(suffix)].isdigit()
+        for suffix in ("em", "yem", "emîn", "yemîn")
+    )
+
+
+# Maps the LIKE_NUM attribute to the like_num() function defined in this module.
+LEX_ATTRS = {LIKE_NUM: like_num}
diff --git a/spacy/lang/kmr/stop_words.py b/spacy/lang/kmr/stop_words.py
@@ -0,0 +1,44 @@
+# Kurmanji stop words, built by splitting the block below on whitespace.
+# NOTE: because of the whitespace split, the entry "li ku" contributes the two
+# separate words "li" and "ku" (both already listed), and the repeated "ew"
+# collapses to a single element of the set.
+STOP_WORDS = set(
+    """
+û
+li
+bi
+di
+da
+de
+ji
+ku
+ew
+ez
+tu
+em
+hûn
+ew
+ev
+min
+te
+wî
+wê
+me
+we
+wan
+vê
+vî
+va
+çi
+kî
+kê
+çawa
+çima
+kengî
+li ku
+çend
+çiqas
+her
+hin
+gelek
+hemû
+kes
+tişt
+""".split()
+)
diff --git a/spacy/tests/lang/kmr/__init__.py b/spacy/tests/lang/kmr/__init__.py
diff --git a/spacy/tests/lang/kmr/test_text.py b/spacy/tests/lang/kmr/test_text.py
@@ -0,0 +1,16 @@
+import pytest
+
+from spacy.lang.kmr.lex_attrs import like_num
+
+
+@pytest.mark.parametrize(
+    "word",
+    [
+        "yekem",
+        "duyemîn",
+        "100em",
+        "dehem",
+        "sedemîn",
+        "34em",
+        "30yem",
+        "20emîn",
+        "50yemîn",
+    ],
+)
+def test_kmr_lex_attrs_like_number_for_ordinal(word):
+    """Ordinal words and digits with an ordinal suffix are number-like."""
+    assert like_num(word)
+
+
+@pytest.mark.parametrize("word", ["deh"])
+def test_kmr_lex_attrs_capitals(word):
+    """like_num accepts a number word in both lower and upper case."""
+    for variant in (word, word.upper()):
+        assert like_num(variant)
diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py
@@ -10,7 +10,7 @@
              "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
              "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
              "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
-             "tr", "tt", "uk", "ur", "xx", "yo"]
+             "tr", "tt", "uk", "ur", "xx", "yo", "kmr"]
 # fmt: on
 
 

diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py
@@ -57,6 +57,7 @@
     pytest.param("tr", marks=pytest.mark.slow()),
     pytest.param("tt", marks=pytest.mark.slow()),
     pytest.param("ur", marks=pytest.mark.slow()),
+    pytest.param("kmr", marks=pytest.mark.slow()),
 ]
 
 

diff --git a/website/meta/languages.json b/website/meta/languages.json
@@ -480,6 +480,12 @@
             ],
             "example": "这是一个用于示例的句子。",
             "has_examples": true
+        },
+        {
+            "code": "kmr",
+            "name": "Kurdish Kurmanji",
+            "example": "Ev hevokek e",
+            "has_examples": true
         }
     ],
     "licenses": [