-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path__main__.py
157 lines (128 loc) · 5.57 KB
/
__main__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# -*- coding: utf-8 -*-
"""
nlp_sum - multi-document summarizer.
Usage:
nlp_sum (ilp | kl | lexrank | lsa | nmf | random | submodular | textrank | manifoldrank) [--query=<query>] [--length=<length>] [--language=<lang>] [--stopwords=<file_path>] [--stem] [--format=<format>] [--para=<parameter>] [--output=<file_path>] --file=<file_path>
nlp_sum --help
Options:
--query=<query> query to summarize the text
--length=<length> Length limit of summarized text.
--language=<lang> Natural language of summarized text. [default: english]
--stopwords=<file_path> Path to a file containing a list of stopwords. One word per line in UTF-8 encoding.
If it is not specified default list of stopwords is used according to chosen lang.
--stem If specified, will need stem.
--format=<format> Format of input document. Possible values: plaintext
--file=<file_path> Path to the text file to summarize.(directory of file path)
--output=<file_path> File path to write the result of summarization.
--para=<parameter> parameter to the summarizer in string format such as '0.1 0.2'
--help, -h Displays current application version.
"""
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals
from os.path import isfile, isdir, abspath
from docopt import docopt
from nlp_sum.my_sum.utils import to_string, to_unicode, get_stop_words, read_stop_words
from nlp_sum.my_sum.parse.plaintext import PlaintextParser
from nlp_sum.my_sum.parse.xml_parse import XmlParser
from nlp_sum.my_sum.method.extract_summarizer.conceptILP import conceptILPSummarizer
from nlp_sum.my_sum.method.extract_summarizer.kl import KLSummarizer
from nlp_sum.my_sum.method.extract_summarizer.lexrank import LexRankSummarizer
from nlp_sum.my_sum.method.extract_summarizer.lsa import LsaSummarizer
from nlp_sum.my_sum.method.extract_summarizer.nmf import NmfSummarizer
from nlp_sum.my_sum.method.extract_summarizer.random import RandomSummarizer
from nlp_sum.my_sum.method.extract_summarizer.submodular import SubmodularSummarizer
from nlp_sum.my_sum.method.extract_summarizer.textrank import TextRankSummarizer
from nlp_sum.my_sum.method.query_summarizer.lexrank import LexRank_querySummarizer
from nlp_sum.my_sum.method.query_summarizer.manifoldrank import ManifoldRankSummarizer
# Maps the value accepted by --format to the parser class used to read input.
PARSERS = {
    "plaintext": PlaintextParser,
    "xml": XmlParser,
}
# Maps each command name from the usage string to the summarizer class that is
# used when no --query is given.
METHODS = {
    "ilp": conceptILPSummarizer,
    "kl": KLSummarizer,
    "lexrank": LexRankSummarizer,
    "lsa": LsaSummarizer,
    "nmf": NmfSummarizer,
    "random": RandomSummarizer,
    "submodular": SubmodularSummarizer,
    "textrank": TextRankSummarizer,
}
# Maps each command name to the summarizer class that is used when a --query
# is given.
METHODS_Q = {
    "lexrank": LexRank_querySummarizer,
    "manifoldrank": ManifoldRankSummarizer,
}
def handle_arguments(args):
    """Turn parsed docopt arguments into the objects needed to summarize.

    Args:
        args: Mapping produced by docopt for this module's usage string.

    Returns:
        A tuple ``(document_set, summarizer, language, words_limit, query)``.
        ``query`` is ``None`` when no ``--query`` value was supplied.

    Raises:
        ValueError: If ``--format`` is not a key of ``PARSERS``, if
            ``--length`` is not an integer, if ``--file`` is missing or is
            neither a directory nor a regular file, or if the chosen method
            is not available for the requested (query / non-query) mode.
    """
    document_format = args['--format']
    if document_format is not None and document_format not in PARSERS:
        # Build the message before constructing the exception; calling
        # .format() on the ValueError instance would raise AttributeError.
        raise ValueError(
            "Unsupported input format. Possible values are {0}. Given: {1}.".format(
                ", ".join(PARSERS.keys()),
                document_format
            )
        )
    parser_class = PARSERS[document_format or "plaintext"]

    words_limit = int(args['--length'] or 250)
    language = args['--language'] or "english"
    parser = parser_class(language)

    file_path = args['--file']
    if file_path is None:
        # Fail here rather than with an unbound document_set further down.
        raise ValueError("Input file is invalid")
    file_path = abspath(file_path)
    if isdir(file_path):
        document_set = parser.build_documentSet_from_dir(file_path)
    elif isfile(file_path):
        document_set = parser.build_document_from_file(file_path)
    else:
        raise ValueError("Input file is invalid")

    if args['--stopwords']:
        stop_words = read_stop_words(args['--stopwords'])
    else:
        stop_words = get_stop_words(language)

    stem_or_not = bool(args['--stem'])

    # Only convert a query that was actually given, so the result does not
    # depend on how to_unicode() treats None, and so the mode chosen here is
    # the same one the caller derives from the returned query.
    raw_query = args['--query']
    query = to_unicode(raw_query) if raw_query else None

    available = METHODS_Q if raw_query else METHODS
    summarizer_class = next(
        (cls for name, cls in available.items() if args[name]),
        None
    )
    if summarizer_class is None:
        # Without the default, next() would raise a message-less StopIteration.
        raise ValueError(
            "The selected method is not supported {0} a query. Possible values are {1}.".format(
                "with" if raw_query else "without",
                ", ".join(available.keys())
            )
        )

    summarizer = build_summarizer(summarizer_class, language, stop_words, stem_or_not)
    return document_set, summarizer, language, words_limit, query
def build_summarizer(summarizer_class, language, stop_words, stem_or_not):
    """Instantiate ``summarizer_class`` and attach the stop-word list to it.

    The class is called as ``summarizer_class(language, stem_or_not)``; the
    resulting object gets ``stop_words`` assigned as an attribute and is
    returned.
    """
    instance = summarizer_class(language, stem_or_not)
    instance.stop_words = stop_words
    return instance
def main(args=None):
    """Run the command-line summarizer and emit the summary.

    Args:
        args: Optional list of command-line arguments. When ``None``, docopt
            falls back to ``sys.argv[1:]``.

    Returns:
        ``0`` on success.
    """
    # Forward the caller-supplied argument list instead of silently ignoring it.
    args = docopt(to_string(__doc__), argv=args)
    document_set, summarizer, language, words_limit, query = handle_arguments(args)
    output_path = args['--output']

    if query:
        summary = summarizer(document_set, query, words_limit)
    else:
        summary = summarizer(document_set, words_limit)

    if language.startswith("ch"):
        summary_text = ''.join(sentence._texts + '。' for sentence in summary)
    else:
        # Space-joined output is the fallback for every other language, so
        # summary_text is always bound.
        summary_text = ' '.join(sentence._texts for sentence in summary)

    if output_path:
        output_path = abspath(output_path)
        # Binary mode plus explicit UTF-8 encoding keeps the bytes on disk
        # independent of the platform's default encoding.
        with open(output_path, 'wb') as output_file:
            output_file.write(summary_text.encode("utf8"))
    else:
        print(summary_text)

    return 0
if __name__ == "__main__":
    # Imported here so this entry-point block does not rely on the
    # site-provided ``exit`` helper, which is absent under ``python -S``.
    import sys

    try:
        sys.exit(main())
    except KeyboardInterrupt:
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report the failure on stderr so it does not get
        # mixed into a summary written to stdout, then signal failure.
        print(e, file=sys.stderr)
        sys.exit(1)