Tokenization.py
#!/usr/bin/python
# coding: utf-8
""".. module:: archiveprocessor.Tokenization

.. moduleauthor:: Jan Lehecka <[email protected]>
"""
import os

import nltk

from BaseAlgorithms import BaseProcessAlgorithm
from config import NLTK_DATA_DIR
from metadata import LANGUAGE, URL, ID, PLAINTEXT, SENTENCES, TOKENS
from Record import Record

# Point NLTK at the project's data directory so the Punkt models are found.
nltk.data.path = [NLTK_DATA_DIR]


class WordTokenizer(BaseProcessAlgorithm):
    """Tokenize plain text using the NLTK package.

    If the language of the text is not supported by NLTK or is unknown,
    it is assumed to be Czech.

    Requires:
        NLTK package + Punkt corpus (included in this git):
        $ pip install nltk
    """

    def _init(self) -> None:
        """Class constructor."""
        self.lang_iso2punkt = {
            # Mapping from ISO 639-1 language codes to the Punkt model names
            # used in NLTK, see:
            # https://github.com/joeyespo/gistmail/tree/master/nltk_data/tokenizers/punkt
            'cs': 'czech',
            'en': 'english',
            'de': 'german',
            'pl': 'polish',
            'fr': 'french',
            # sk and ru were not supported at the time of writing
        }
    def _check_NLTK_data(self) -> None:
        """Move NLTK data into the correct directory if necessary.

        NLTK data must be in a specific sub-directory, but Spark puts all
        files in the root of the working directory.

        TODO: tell NLTK where the files are without moving them?
        """
        dir_exists = os.path.isdir(NLTK_DATA_DIR)
        if not dir_exists and "czech.pickle" in os.listdir("."):
            self.logger.debug('Moving NLTK data.')
            dest = f"{NLTK_DATA_DIR}/tokenizers/punkt/PY3"
            os.makedirs(dest)
            for lang in self.lang_iso2punkt.values():
                os.rename(f"{lang}.pickle", f"{dest}/{lang}.pickle")
    def _process(self, record: Record) -> Record:
        """Process the record.

        Args:
            record: Record to be processed.

        Returns:
            record: Processed record.
        """
        lang = record[LANGUAGE] or 'cs'
        plang = self.lang_iso2punkt.get(lang, 'czech')
        text = record[PLAINTEXT] or ''
        self._check_NLTK_data()
        tokens = nltk.tokenize.word_tokenize(text, plang)
        record[TOKENS] = tokens
        self.logger.debug(
            f'Plain text split into {len(tokens)} tokens using '
            f'{plang.capitalize()} tokenizer (orig-lang={lang}, '
            f'URL {record[URL]} and ID="{record[ID]}").'
        )
        return record
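
# Illustrative sketch of the underlying call (not part of the original code;
# assumes the Czech Punkt model is available in NLTK_DATA_DIR):
#
#     >>> nltk.tokenize.word_tokenize('Dobrý den. Jak se máte?', 'czech')
#     ['Dobrý', 'den', '.', 'Jak', 'se', 'máte', '?']
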
class SentenceTokenizer(WordTokenizer):
    """Split plain text into sentences using the NLTK package.

    If the language of the text is not supported by NLTK or is unknown,
    it is assumed to be Czech.

    Requires:
        Same as WordTokenizer.
    """

    def _process(self, record: Record) -> Record:
        """Process the record.

        Args:
            record: Record to be processed.

        Returns:
            record: Processed record.
        """
        lang = record[LANGUAGE] or 'cs'
        plang = self.lang_iso2punkt.get(lang, 'czech')
        text = record[PLAINTEXT] or ''
        self._check_NLTK_data()
        sentences = nltk.tokenize.sent_tokenize(text, plang)
        record[SENTENCES] = sentences
        self.logger.debug(
            f'Plain text split into {len(sentences)} sentences using '
            f'{plang.capitalize()} tokenizer (orig-lang={lang}, URL '
            f'{record[URL]} and ID="{record[ID]}").'
        )
        return record
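
# Illustrative sketch of the underlying sentence split (not part of the
# original code; assumes the Czech Punkt model is available in NLTK_DATA_DIR):
#
#     >>> nltk.tokenize.sent_tokenize('Dobrý den. Jak se máte?', 'czech')
#     ['Dobrý den.', 'Jak se máte?']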