From c20574b7e793754506722e37bffc014506682bbb Mon Sep 17 00:00:00 2001
From: Tyler Cipriani <tcipriani@wikimedia.org>
Date: Mon, 11 Nov 2024 10:27:40 -0700
Subject: [PATCH] Fix: translate upenn treebank pos to wordnet pos
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Problem:

The function to translate between a upenn treebank part-of-speech tag
and a wordnet part-of-speech tag, `get_wordnet_pos`, always returned
`wn.NOUN`.

Fix:

`nltk.tag`'s `pos_tag` returns a upenn treebank pos tag; the available
values are listed in `nltk.help.upenn_tagset()`.

Nouns:

- NN  Noun, singular or mass
- NNS Noun, plural
- NNP Proper noun, singular
- NNPS    Proper noun, plural

Verbs:

- VB  Verb, base form
- VBD Verb, past tense
- VBG Verb, gerund or present participle
- VBN Verb, past participle
- VBP Verb, non-3rd person singular present
- VBZ Verb, 3rd person singular present

Adjectives:

- JJ  Adjective
- JJR Adjective, comparative
- JJS Adjective, superlative

Adverbs:

- RB  Adverb
- RBR Adverb, comparative
- RBS Adverb, superlative
- RP  Particle

The first letter of each maps to the wordnet lemmatizer pos.

The fix is to only look at the first letter. This makes a small but
noticeable difference.

Before:

    $ macroetym --showfamilies Latinate,Germanic moby-dick.txt
          moby-dick.txt
    Latinate      63.449849
    Germanic      34.358548

After:

    $ macroetym --showfamilies Latinate,Germanic moby-dick.txt
          moby-dick.txt
    Latinate      62.542085
    Germanic      35.540172

This is because conjugated verbs, e.g., `writhed`, were previously
treated as nouns but are now reduced to their base verb forms, e.g.,
`writhe`, and can therefore be tagged, e.g., `writhe [wriþan (ang)]`.
---
 macroetym/main.py | 54 ++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 44 insertions(+), 10 deletions(-)

diff --git a/macroetym/main.py b/macroetym/main.py
index 411c998..2ef0998 100644
--- a/macroetym/main.py
+++ b/macroetym/main.py
@@ -210,21 +210,32 @@ def tokens(self):
 
     @property
     def clean_tokens(self, remove_stopwords=True):
-        clean = [token for token in self.tokens if token not in punctuation]
-        clean = [token.lower() for token in clean]
-        clean = [token for token in clean if token.isalpha()]
+        clean = [token.lower() for token in self.tokens
+                 if token not in punctuation and token.isalpha()]
         if remove_stopwords:
             clean = self.remove_stopwords(clean)
         return clean
 
     def remove_stopwords(self, tokens):
         """ Remove stopwords from a list of tokens. """
-        available_stopwords = """danish english french hungarian norwegian
-        spanish turkish dutch finnish german italian portuguese russian
-        swedish""".split()
-        stop_dict = {lang[:3]: lang for lang in available_stopwords}
-        stop_dict['fra'] = 'french' # Exception
-        stop_dict['deu'] = 'german' # Another exception
+        stop_dict = {
+            'dan': 'danish',
+            'eng': 'english',
+            'fra': 'french',
+            'hun': 'hungarian',
+            'nor': 'norwegian',
+            'spa': 'spanish',
+            'tur': 'turkish',
+            'dut': 'dutch',
+            'fin': 'finnish',
+            'deu': 'german',
+            'ita': 'italian',
+            'por': 'portuguese',
+            'rus': 'russian',
+            'swe': 'swedish',
+            'ger': 'german',
+            'fre': 'french',
+        }
         if self.lang in stop_dict:
             stops = stopwords.words(stop_dict[self.lang])
             return [token for token in tokens if token not in stops]
@@ -250,7 +261,30 @@ def lemmas(self):
         lemmatizer = WordNetLemmatizer()
 
         def get_wordnet_pos(treebank_tag):
-            """ Translate between treebank tag style and WordNet tag style."""
+            """
+            Translate between treebank tag style and WordNet tag style.
+
+            Here, we map the treebank tag to the wordnet tag by taking the
+            first letter of the treebank tag and mapping it to the wordnet tag.
+
+            Upenn Treebank part-of-speech tags are used by the nltk pos tagger.
+            The possible tags are enumerated by nltk.help.upenn_tagset().
+
+            - Nouns, e.g., are tagged as 'NN', 'NNS', 'NNP', 'NNPS'.
+            - Verbs, e.g., are tagged as 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'.
+            - Adjectives, e.g., are tagged as 'JJ', 'JJR', 'JJS'.
+            - Adverbs, e.g., are tagged as 'RB', 'RBR', 'RBS'.
+
+            Wordnet uses a different part-of-speech tagset.
+
+            - Nouns are 'n'
+            - verbs are 'v'
+            - adjectives are 'a'
+            - adverbs are 'r'.
+
+            If the treebank tag is not in the map, we default to 'n' (noun).
+            """
+            treebank_tag = treebank_tag[0]
             tag_map = {"J": wn.ADJ,
                        "V": wn.VERB,
                        "N": wn.NOUN,