From c20574b7e793754506722e37bffc014506682bbb Mon Sep 17 00:00:00 2001 From: Tyler Cipriani Date: Mon, 11 Nov 2024 10:27:40 -0700 Subject: [PATCH] Fix: translate upenn treebank pos to wordnet pos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Problem: The function to translate between a upenn treebank part-of-speech tag and a wordnet part-of-speech tag, `get_wordnet_pos` universally returned `wn.NOUN`. Fix: `nltk.tag`'s `pos_tag` returns a upenn treebank pos, available values are listed in `nltk.help.upenn_tagset()` Nouns: - NN Noun, singular or mass - NNS Noun, plural - NNP Proper noun, singular - NNPS Proper noun, plural Verbs: - VB Verb, base form - VBD Verb, past tense - VBG Verb, gerund or present participle - VBN Verb, past participle - VBP Verb, non-3rd person singular present - VBZ Verb, 3rd person singular present Adjectives: - JJ Adjective - JJR Adjective, comparative - JJS Adjective, superlative Adverb: - RB Adverb - RBR Adverb, comparative - RBS Adverb, superlative - RP Particle The first letter of each maps to the wordnet lemmatizer pos. The fix is to only look at the first letter. This makes a small but noticeable difference. Before: $ macroetym --showfamilies Latinate,Germanic moby-dick.txt moby-dick.txt Latinate 63.449849 Germanic 34.358548 After: $ macroetym --showfamilies Latinate,Germanic moby-dick.txt moby-dick.txt Latinate 62.542085 Germanic 35.540172 This is because previously conjugated verbs, e.g., `writhed` were treated as nouns but are now recognized as their base verb forms, e.g., `writhe` and able to be tagged, e.g., `writhe [wriþan (ang)]`. 
--- macroetym/main.py | 54 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 10 deletions(-) diff --git a/macroetym/main.py b/macroetym/main.py index 411c998..2ef0998 100644 --- a/macroetym/main.py +++ b/macroetym/main.py @@ -210,21 +210,32 @@ def tokens(self): @property def clean_tokens(self, remove_stopwords=True): - clean = [token for token in self.tokens if token not in punctuation] - clean = [token.lower() for token in clean] - clean = [token for token in clean if token.isalpha()] + clean = [token.lower() for token in self.tokens + if token not in punctuation and token.isalpha()] if remove_stopwords: clean = self.remove_stopwords(clean) return clean def remove_stopwords(self, tokens): """ Remove stopwords from a list of tokens. """ - available_stopwords = """danish english french hungarian norwegian - spanish turkish dutch finnish german italian portuguese russian - swedish""".split() - stop_dict = {lang[:3]: lang for lang in available_stopwords} - stop_dict['fra'] = 'french' # Exception - stop_dict['deu'] = 'german' # Another exception + stop_dict = { + 'dan': 'danish', + 'eng': 'english', + 'fra': 'french', + 'hun': 'hungarian', + 'nor': 'norwegian', + 'spa': 'spanish', + 'tur': 'turkish', + 'dut': 'dutch', + 'fin': 'finnish', + 'deu': 'german', + 'ita': 'italian', + 'por': 'portuguese', + 'rus': 'russian', + 'swe': 'swedish', + 'ger': 'german', + 'fre': 'french', + } if self.lang in stop_dict: stops = stopwords.words(stop_dict[self.lang]) return [token for token in tokens if token not in stops] @@ -250,7 +261,30 @@ def lemmas(self): lemmatizer = WordNetLemmatizer() def get_wordnet_pos(treebank_tag): - """ Translate between treebank tag style and WordNet tag style.""" + """ + Translate between treebank tag style and WordNet tag style. + + Here, we map the treebank tag to the wordnet tag by taking the + first letter of the treebank tag and mapping it to the wordnet tag. 
+ + Upenn Treebank part-of-speech tags are used by the nltk pos tagger. + The possible tags are enumerated by nltk.help.upenn_tagset(). + + - Nouns, e.g., are tagged as 'NN', 'NNS', 'NNP', 'NNPS'. + - Verbs, e.g., are tagged as 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'. + - Adjectives, e.g., are tagged as 'JJ', 'JJR', 'JJS'. + - Adverbs, e.g., are tagged as 'RB', 'RBR', 'RBS'. + + Wordnet uses a different part-of-speech tagset. + + - Nouns are 'n' + - verbs are 'v' + - adjectives are 'a' + - adverbs are 'r'. + + If the treebank tag is not in the map, we default to 'n' (noun). + """ + treebank_tag = treebank_tag[0] tag_map = {"J": wn.ADJ, "V": wn.VERB, "N": wn.NOUN,