# nlp_util.py
import re
import string
import nltk
import spacy
import lemminflect
from nltk.corpus import wordnet
from nltk.tree import Tree, ParentedTree
from utils.constant import STOPWORD_LIST, CONTRACTION_MAP, GLOVE_WORD_LIST, REPLACE_ITEM_LIST, SELF_REPLACE_ITEM, ANTONYM_PREPOSITION_LIST
from allennlp.predictors import Predictor
import networkx as nx
class NLPUtil:
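"""Utility wrapper around spaCy, NLTK/WordNet, lemminflect, networkx, and an
AllenNLP constituency parser for analyzing command-line option descriptions:
splitting combined option/description pairs, normalizing sentences, and
extracting predicates, objects, and clauses."""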
def __init__(self, constituency_model_path):
self.nlp = spacy.load("en_core_web_sm")
self.predictor_constituency = Predictor.from_path(constituency_model_path)
return
def isParallelStructure(self, constituency_tree_1, constituency_tree_2):
relation_tree_1 = self.__getNodes(constituency_tree_1)
relation_tree_2 = self.__getNodes(constituency_tree_2)
if relation_tree_1 == relation_tree_2:
return True
if len(relation_tree_1) == 0 or len(relation_tree_2) == 0:
return False
len_relation_tree_1 = len(relation_tree_1)
len_relation_tree_2 = len(relation_tree_2)
if len_relation_tree_1>len_relation_tree_2:
if relation_tree_1[:len_relation_tree_2-1][:-1] == relation_tree_2[:-1][:-1] and relation_tree_2[:-1][-1] in relation_tree_1[:len_relation_tree_2-1][-1]:
return True
elif len_relation_tree_1<len_relation_tree_2:
if relation_tree_2[:len_relation_tree_1-1][:-1] == relation_tree_1[:-1][:-1] and relation_tree_1[:-1][-1] in relation_tree_2[:len_relation_tree_1-1][-1]:
return True
return False
def findShortestPath(self, edges, source, target):
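"""Return the shortest path between source and target in the undirected
graph built from edges (an iterable of (u, v) pairs), or [] if none."""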
graph = nx.Graph(edges)
try:
shortest_path = nx.shortest_path(graph, source=source, target=target)
except (nx.NetworkXNoPath, nx.NodeNotFound):
shortest_path = []
return shortest_path
def getSynonymsAndAntonyms(self, word):
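"""Collect deduplicated WordNet synonyms and antonyms of word, e.g. the
antonym list for "increase" typically contains "decrease"."""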
synonyms = []
antonyms = []
for syn in wordnet.synsets(word):
for l in syn.lemmas():
synonyms.append(l.name())
if l.antonyms():
antonyms.append(l.antonyms()[0].name())
return list(set(synonyms)), list(set(antonyms))
def predictConstituencyTree(self, sent):
sent = sent.replace("..",".").replace("(","").replace(")","").replace("{","").replace("}","")
tree = self.predictor_constituency.predict(sentence=sent)['trees']
constituency_tree = Tree.fromstring(tree)
return constituency_tree
def removeQuotationsInOptDescDict(self, opt_desc_dict):
new_opt_desc_dict = {}
for opt in opt_desc_dict:
new_opt_desc_dict[opt.replace("\"", "").replace("\'", "").strip()] = opt_desc_dict[opt].replace("\"", "").replace("\'", "").strip()
return new_opt_desc_dict
def splitOptDescDict(self, opt_desc_dict):
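"""Split combined option keys such as "-w, --warning" or "-a and -b" into
one entry per canonical option. Returns (splitted_opt_desc_dict, alias_dict),
where alias_dict maps each canonical option to its recorded aliases."""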
splitted_opt_desc_dict = {}
alias_dict = {}
blank_opt_list = []
for opt in opt_desc_dict:
desc = opt_desc_dict[opt].replace("  ", " ").strip()
# If the desc is empty, we combine this opt with the following opt
if desc == "":
blank_opt_list.append(opt)
continue
doc = self.nlp(desc)
topic_sentence = list(doc.sents)[0].text
# Combine blank options and reset the list
if len(blank_opt_list) > 0:
opt = "%s, %s" % (", ".join(blank_opt_list).strip(), opt)
blank_opt_list = []
# Options combined by "and" are regarded as independent of each other
if " and " in opt:
for splitted_part in opt.split(" and "):
# A, B, C, D, and E
for sub_part in splitted_part.split(","):
splitted_opt_list = re.split(r'\||,|/', sub_part)
if len(splitted_opt_list) > 1:
key = self.removeValueField(splitted_opt_list[0])
if key in splitted_opt_desc_dict:
splitted_opt_desc_dict[key] += " %s" % desc
else:
splitted_opt_desc_dict[key] = desc
tmp_alias_list = []
for alias in splitted_opt_list[1:]:
if not self.removeValueField(alias).startswith("-"):
continue
tmp_alias_list.append(self.removeValueField(alias))
if len(tmp_alias_list) > 0:
alias_dict[key] = tmp_alias_list
else:
key = self.removeValueField(sub_part)
if not key.startswith("-"):
continue
if key in splitted_opt_desc_dict:
splitted_opt_desc_dict[key] += " %s" % desc
else:
splitted_opt_desc_dict[key] = desc
else:
# "or" works like ", "
opt = opt.replace(" or ", ", ")
# replace pattern like "-w --warning"
opt = re.sub("(-{1,2}[a-z0-9A-Z-*=]+) +(?=-{1,2}[a-z0-9A-Z-*=]+)", r"\1, ", opt)
# Several options are put together (, / |)
splitted_opt_list = re.split(r'\||,|/', opt)
if len(splitted_opt_list) > 1:
# If the subject is a plural, we regard these options as independent of each other
if self.__isStartsWithDTPl(topic_sentence):
for splitted_opt in splitted_opt_list:
splitted_opt_desc_dict[self.removeValueField(splitted_opt)] = desc
# Else we take the first option and record other alias
else:
key = self.removeValueField(splitted_opt_list[0])
if key in splitted_opt_desc_dict:
splitted_opt_desc_dict[key] += " %s" % desc
else:
splitted_opt_desc_dict[key] = desc
tmp_alias_list = []
for alias in splitted_opt_list[1:]:
if not self.removeValueField(alias).startswith("-"):
continue
tmp_alias_list.append(self.removeValueField(alias))
if len(tmp_alias_list) > 0:
alias_dict[key] = tmp_alias_list
else:
key = self.removeValueField(opt)
if key in splitted_opt_desc_dict:
splitted_opt_desc_dict[key] += " %s" % desc
else:
splitted_opt_desc_dict[key] = desc
return splitted_opt_desc_dict, alias_dict
def formatOptDescDict(self, program, opt_desc_dict):
formatted_list = []
for opt in opt_desc_dict:
desc = opt_desc_dict[opt].replace("  ", " ").strip()
doc = self.nlp(desc)
for sent in doc.sents:
if sent.text.isspace():
continue
formatted_list.append({"cmd": program, "option": opt, "sent": sent.text})
return formatted_list
# Extract the first sentence (topic sentence), preprocess the topic sentence, and extract predicate and object
def extractTopicSentList(self, program, opt_desc_dict):
topic_sent_list = []
for opt in opt_desc_dict:
desc = opt_desc_dict[opt].replace("  ", " ").strip()
doc = self.nlp(desc)
for sent in doc.sents:
sentence = self.preprocessIDRSentence(sent.text)
if sentence == "":
continue
constituency_tree = self.predictConstituencyTree(sentence)
simplified_tree, topic_sentence, neg_flag = self.__simplifyConstituencyTree(constituency_tree)
predicate, object, prt = self.__getPredAndObj(topic_sentence)
if neg_flag and predicate:
predicate = "non_%s" % predicate
# Several options are put together
splitted_opt_list = re.split(r'\||,', opt)
if len(splitted_opt_list) > 1:
# If the subject is a plural, we regard these options as independent of each other
if self.__isStartsWithDTPl(sentence):
for splitted_opt in splitted_opt_list:
topic_sent_list.append({"cmd": program, "option": splitted_opt.split(" ")[0], "sent": sent.text, "predicate": predicate, "object": object, "prt": prt, "tree": simplified_tree})
# Else we take the first option
else:
topic_sent_list.append({"cmd": program, "option": splitted_opt_list[0].split(" ")[0], "sent": sent.text, "predicate": predicate, "object": object, "prt": prt, "tree": simplified_tree})
else:
topic_sent_list.append({"cmd": program, "option": splitted_opt_list[0].split(" ")[0], "sent": sent.text, "predicate": predicate, "object": object, "prt": prt, "tree": simplified_tree})
break
return topic_sent_list
# preprocessing for model
def preprocessing(self, formatted_list, alias_dict, all_option_list):
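"""Normalize each formatted sentence for model prediction: replace options
with placeholders ("param_current" for the described option, "param_other"
otherwise), expand contractions, lemmatize, drop stopwords, digits, and
punctuation, and keep only tokens in the GloVe vocabulary. The result is
stored under "preprocessed_sent"."""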
preprocessed_list = []
for sent_dict in formatted_list:
# No need to consider '=' in model prediction, so we remove it
sent = sent_dict['sent'].replace("=", " ")
option = sent_dict['option']
alias_list = alias_dict[option] if option in alias_dict else []
preprocessed_sent, option_map = self.replaceOption(sent, all_option_list)
preprocessed_sent = re.sub("^this (switch|flag)", "This ", preprocessed_sent, flags=re.I)
preprocessed_sent = re.sub("^this (?!option)", "This option ", preprocessed_sent, flags=re.I)
# May have multiple "option"
preprocessed_sent = re.sub("(option ?){2,}", "option", preprocessed_sent)
option_map_new = {}
for opt in option_map:
if opt == option or (option in alias_dict and opt in alias_list):
option_map_new[option_map[opt]] = "param_current"
else:
option_map_new[option_map[opt]] = "param_other"
contractions_pattern = re.compile('({})'.format('|'.join(CONTRACTION_MAP.keys())), flags=re.IGNORECASE | re.DOTALL)
expanded_sentence = contractions_pattern.sub(self.__expandMatch, preprocessed_sent)
filtered_sentence = re.sub(r'[?|$|&|*|%|@|(|)|~]', r'', expanded_sentence.strip())
tokens = nltk.word_tokenize(filtered_sentence)
tagged_sent = nltk.pos_tag(tokens)
lemmas_sent = []
for tag in tagged_sent:
word_pos = self.__getPosFromTag(tag[1])
if not self.isReplacedOption(tag[0]) and word_pos is not None:
lemmas_sent.append(lemminflect.getLemma(tag[0], upos=word_pos)[0])
else:
lemmas_sent.append(tag[0])
stopword_list = STOPWORD_LIST
filtered_tokens = [
token for token in lemmas_sent if token.lower() not in stopword_list]
norm_sent = re.sub("--{1,}", "", " ".join(filtered_tokens))
item_list = []
for item in nltk.word_tokenize(norm_sent):
if self.isReplacedOption(item):
item = option_map_new[item]
item_list.append(item)
new_sent = " ".join(item_list).lower()
# remove digit
norm_sent = re.sub(r'[0-9]+', '', new_sent).strip()
# remove extra spaces
norm_sent = re.sub(' +', ' ', norm_sent)
# remove punctuation
norm_sent = re.sub(r"[%s]+" % string.punctuation, "", norm_sent)
# some replace rules
norm_sent = norm_sent.lower().replace('-- ', '--').replace('- ', '-').replace(' ( ', '(').replace(' ) ', ')').strip()
norm_sent_tokens = [token for token in norm_sent.split(' ') if token in GLOVE_WORD_LIST]
sent_dict["preprocessed_sent"] = ' '.join(norm_sent_tokens).replace(" ", " ")
preprocessed_list.append(sent_dict)
return preprocessed_list
# Preprocessing for IDR relationship extraction
def preprocessIDRSentence(self, sentence):
# Remove all parenthesized content
sentence = re.sub(r'\(.*?\)', '', sentence)
# Remove digits (token.like_num could also be used here)
sentence = re.sub(r'[0-9]+', '', sentence)
sentence = re.sub(r' +', ' ', sentence)
sentence = re.sub(r'\.{2,}', '.', sentence)
sentence = re.sub(r'[{}\(\)]', '', sentence)
sentence = re.sub(r'by default', '', sentence, flags=re.I)
sentence = re.sub(r'^note that', '', sentence, flags=re.I)
sentence = sentence.split(":")[0].strip()
# Add subject to topic sentence starting with a verb
sentence = self.__addSubjectToSentence(sentence)
sentence, option_map_dict = self.replaceOption(sentence)
# Avoid mistaken splits by the constituency parser ("non-" -> "non -")
sentence = sentence.replace("-", "_")
return sentence
# Preprocessing for EDR relationship extraction
def preprocessEDRSentence(self, sentence):
sentence = re.sub(r'^by default', '', sentence, flags=re.I)
sentence = re.sub(r'^note that', '', sentence, flags=re.I)
sentence = self.__addSubjectToSentence(sentence)
return sentence.strip()
def splitSentence(self, sent, option_map):
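"""Split a sentence at verbal conjunctions ("and"/"or"/"but" joining
clauses) and return (sub_sent_list, sub_option_map_list), with one option
map per sub-sentence."""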
sent = sent.replace("(","").replace(")","")
sub_sent_list = []
sub_option_map_list = []
doc = self.nlp(sent)
conj_token_index_list = []
# {'-A': 'option1'} -> {'option1': '-A'}
reversed_option_map = {v: k for k, v in option_map.items()}
for token in doc:
if (token.tag_.startswith("V") or token.head.tag_.startswith("V")) and token.dep_ in ["conj"]:
conj_token_index_list.append(token.i)
conj_token_index_list.append(token.i + 1)
start_index = 0
for conj_token_index in conj_token_index_list:
sent_token_list = []
sub_option_map = {}
for i in range(start_index, conj_token_index):
# Skip the last cc
if i == conj_token_index - 1 and doc[i].text in ["and", "or", "but"]:
continue
if doc[i].text in reversed_option_map:
sub_option_map[reversed_option_map[doc[i].text]] = doc[i].text
# Save the original option
sent_token_list.append(doc[i].text)
sub_sent_list.append(" ".join(sent_token_list).strip().strip(",").strip())
sub_option_map_list.append(sub_option_map)
start_index = conj_token_index
return sub_sent_list, sub_option_map_list
def extractConditionClauseList(self, processed_sent):
constituency_tree = self.predictConstituencyTree(processed_sent)
return self.__getSubtrees(0, constituency_tree)
def extractMainClause(self, processed_sent):
condition_clause_list = self.extractConditionClauseList(processed_sent)
main_clause = ""
if len(condition_clause_list) > 0:
# Remove condition clause
for condition_clause in condition_clause_list:
try:
main_clause = processed_sent.replace(condition_clause, "")
except:
pass
else:
main_clause = processed_sent
return main_clause
def getReplaceItem(self):
for item in REPLACE_ITEM_LIST:
yield item
def getSelfReplaceItem(self):
return SELF_REPLACE_ITEM
def replaceOption(self, sent, included_list=[]):
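"""Replace each option in sent with a placeholder from REPLACE_ITEM_LIST
and return (sent, option_map_dict) mapping option -> placeholder. Options
not matching included_list are replaced with the literal word "option".
E.g., assuming REPLACE_ITEM_LIST starts with "option1", "-a prints all"
becomes roughly "option1 prints all" with {"-a": "option1"}."""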
option_map_dict = {}
option_list = self.__findOptionsInSent(sent)
item_generator = self.getReplaceItem()
for option in option_list:
if option not in option_map_dict:
if included_list:
# e.g., --keep-*
r = re.compile("^%s.*$" % re.escape(option[:-1])) if option[-1] == "*" else re.compile("^%s$" % re.escape(option))
# Options that do not belong to this program
if not any(r.match(opt) for opt in included_list):
sent = re.sub(r"%s(?![a-zA-Z0-9\-\*])" % re.escape(option), "option", sent).strip()
continue
replace_item = next(item_generator)
option_map_dict[option] = replace_item
sent = re.sub("%s(?![a-zA-Z0-9\-\*])" % re.escape(option.replace("*", "\*")), replace_item, sent).strip()
r = sorted(option_map_dict.items(), key=lambda kv: len(str(kv[0])), reverse=True)
option_map_dict = {i[0]: i[1] for i in r}
if sent.startswith("(") and sent.endswith(")"):
sent = sent.lstrip("(").rstrip(")").strip()
return sent, option_map_dict
def isReplacedOption(self, text):
target_list = REPLACE_ITEM_LIST + [SELF_REPLACE_ITEM]
return text in target_list
def removeValueField(self, opt):
# -a=xxx, -a[xxxx], -a<xxxx>, -a xxx
return re.split(r"=|\[|<| ", opt.strip())[0]
# Only used to check whether an option belongs to a program
def getOptList(self, opt_desc_dict, alias_dict, alias_flag):
opt_list = []
for key in opt_desc_dict:
if key == "alias":
continue
opt_list.append(key)
if alias_flag:
for opt in alias_dict:
opt_list.extend(alias_dict[opt])
return list(set(opt_list))
def findConjunctsOfToken(self, token):
conj_list = []
cc = None
for conj in token.conjuncts:
# check if the conj is an option
if self.isReplacedOption(conj.text):
conj_list.append(conj)
if conj.i > 0 and conj.nbor(-1).dep_ == "cc":
cc = conj.nbor(-1)
else:
# Check if option in the children
for child in conj.subtree:
if self.isReplacedOption(child.text):
conj_list.append(child)
if child.i > 0 and child.nbor(-1).dep_ == "cc":
cc = child.nbor(-1)
return conj_list, cc
def comparePredicate(self, predicate_1, predicate_2):
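"""Compare two (possibly multi-word) predicates word by word and return a
list containing "synonyms", "antonyms", or "none" per position; [] if the
word counts differ."""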
# Since a predicate can have more than one word, we split it
predicate_1_root = predicate_1.split(" ")
predicate_2_root = predicate_2.split(" ")
if len(predicate_1_root) != len(predicate_2_root):
return []
rel_list = []
for idx in range(len(predicate_1_root)):
predicate1 = predicate_1_root[idx]
predicate2 = predicate_2_root[idx]
synonyms_vb1, antonyms_vb1 = self.getSynonymsAndAntonyms(predicate1)
synonyms_vb2, antonyms_vb2 = self.getSynonymsAndAntonyms(predicate2)
if predicate1 == predicate2 or predicate1 in synonyms_vb2 or predicate2 in synonyms_vb1:
rel_list.append("synonyms")
elif predicate1 in antonyms_vb2 or predicate2 in antonyms_vb1 or ("non_" + predicate1 == predicate2) or ("non_" + predicate2) == predicate1:
rel_list.append("antonyms")
else:
rel_list.append("none")
return rel_list
def comparePrepositions(self, prep_1, prep_2):
# Since a preposition can have more than one word, we split it
prep_1_root = prep_1.split(" ")
prep_2_root = prep_2.split(" ")
if len(prep_1_root) != len(prep_2_root):
return []
rel_list = []
for idx in range(len(prep_1_root)):
prep1 = prep_1_root[idx]
prep2 = prep_2_root[idx]
if prep1 == prep2:
rel_list.append("synonyms")
elif prep1 not in ANTONYM_PREPOSITION_LIST or prep2 not in ANTONYM_PREPOSITION_LIST:
rel_list.append("none")
elif ANTONYM_PREPOSITION_LIST[prep1] == prep2 and ANTONYM_PREPOSITION_LIST[prep2] == prep1:
rel_list.append("antonyms")
else:
rel_list.append("none")
return rel_list
def removeOverlapElements(self, list1, list2):
return [e for e in list1 if e not in list2]
def searchVerbAndAuxOfToken(self, token):
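"""Walk up the dependency tree from token to locate the governing auxiliary
and main verb. Returns (aux_token, verb_token, with_to_flag), where
with_to_flag marks constructions such as "need to"/"have to"/"ought to"."""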
aux_token, verb_token, with_to_flag = None, None, False
tmp_token, root_token = None, None
while True:
if not tmp_token and token.pos_ in ['AUX', 'VERB']:
tmp_token = token
if token.dep_ == "ROOT":
root_token = token
if token == token.head:
break
token = token.head
# If this sentence has no verb, check if the root is misjudged
if not tmp_token:
if root_token:
root_lemma_list = lemminflect.getAllLemmas(root_token.text)
if "VERB" in root_lemma_list:
verb_token = root_token
elif tmp_token.pos_ == 'AUX':
# 'Ought' as a modal verb cannot be conjugated (there is no 'oughts')
if tmp_token.text == "ought":
aux_token = tmp_token
# verb is the head of 'to'
verb_token = aux_token.nbor(1).head
with_to_flag = True
elif tmp_token.lemma_ == "be":
aux_token = tmp_token
verb_token = aux_token.head
with_to_flag = False
elif tmp_token.pos_ == "VERB":
# If the verb is need/have to, regard it as an aux
if tmp_token.lemma_ in ['have', 'need'] and tmp_token.nbor(1).text == 'to':
aux_token = tmp_token
verb_token = aux_token.nbor(1).head
with_to_flag = True
# Normal verb
else:
verb_token = tmp_token
aux_token_list = [child for child in verb_token.children if child.pos_ == 'AUX']
aux_token = None if len(aux_token_list) == 0 else aux_token_list[0]
with_to_flag = False
return aux_token, verb_token, with_to_flag
def skipModifierToken(self, token):
ret_token = token
if token.dep_ in ["nmod", "appos" ,"compound", "amod"]:
for ancestor in token.ancestors:
if ancestor.pos_ == "NOUN" and ancestor.dep_ not in ["nmod", "appos" ,"compound", "amod"]:
ret_token = ancestor
break
return ret_token
def searchMissedKeyword(self, token):
ret_token = None
# e.g., Similar to -A
if token.dep_ == "pobj":
ret_token = token.head
while ret_token != ret_token.head and (ret_token.pos_ != "ADJ"):
ret_token = ret_token.head
return ret_token
# e.g., --keep-*
def expandOptionsInRegex(self, option, all_option_list):
r = re.compile("^%s.*$" % option[:-1])
return list(filter(r.match, all_option_list))
# Some options may be missing their "-"; we attempt to fix them
def fixBrokenOptions(self, sentence, option_list):
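"""Restore missing leading dashes on option names that appear after the
phrase "following options:", using option_list to validate candidates."""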
if "following options:" not in sentence:
return sentence
sentence = sentence.replace("-", "_")
doc = self.nlp(sentence)
end_pattern_list = ["."]
fix_start_flag = False
for token in doc:
if fix_start_flag:
if not token.is_punct:
# Regard as an option if the fixed token is in the option list
if "-%s" % token.text.replace("_", "-") in option_list:
sentence = sentence.replace(token.text, "-%s" % token.text.replace("_", "-"))
elif "--%s" % token.text.replace("_", "-") in option_list:
sentence = sentence.replace(token.text, "--%s" % token.text.replace("_", "-"))
if token.lemma_ == "option" and doc[token.i - 1].text == "following":
fix_start_flag = True
if fix_start_flag and token.text in end_pattern_list:
break
return sentence.replace("_", "-")
def __addSubjectToSentence(self, sentence):
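"""If the sentence starts with a bare verb (common in option descriptions),
prepend a subject: "It" plus the VBZ form for active verbs, "It is" plus
the VBN form for passive ones. E.g., "Print the file name" becomes roughly
"It prints the file name"."""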
word_list = sentence.split(" ")
first_word = word_list[0].lower()
first_word_lemma_list = lemminflect.getAllLemmas(first_word)
if "VERB" in first_word_lemma_list:
doc = self.nlp(sentence)
# If the first word also can be a noun
if "NOUN" in first_word_lemma_list:
second_word = word_list[1].lower()
second_word_lemma_list = lemminflect.getAllLemmas(second_word)
# If the second word is a VBZ verb, the first word is acting as a
# singular noun subject rather than a verb, so no subject needs adding
if "VERB" in second_word_lemma_list and second_word in lemminflect.getInflection(second_word_lemma_list['VERB'][0], tag="VBZ") and doc[1].pos_ in ["VERB", "AUX"]:
sentence = re.sub(' +', ' ', " ".join(word_list))
return sentence
wnl_word = first_word_lemma_list['VERB'][0]
tmp_list = word_list[1:]
isPassive = False
if doc[0].pos_ == "VERB":
if doc[0].tag == "VBN":
isPassive = True
else:
# If the first word can be both VBN and VB, we regard it as VB ("do")
if first_word in lemminflect.getInflection(wnl_word, 'VBN') and first_word not in lemminflect.getInflection(wnl_word, 'VB'):
isPassive = True
prefix_list = ["It is", lemminflect.getInflection(wnl_word, tag='VBN')[0]] if isPassive else ["It", lemminflect.getInflection(wnl_word, tag='VBZ')[0]]
word_list = prefix_list + tmp_list
sentence = re.sub(' +', ' ', " ".join(word_list))
return sentence
def __findOptionsInSent(self, sent):
# Add a space for the regex, and tokenize to separate options and punctuation
sentence = " %s " % (self.__joinMistakenlySplitedToken(" ".join(nltk.word_tokenize(sent))))
option_list = [item.strip() for item in re.findall(r"(?<=\s)-{1,2}.*?(?=\s)", sentence)]
return list(set(option_list))
def __simplifyConstituencyTree(self, tree):
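"""Drop redundant "S" subtrees, trailing commas, ADVP subtrees, and leading
modal/auxiliary material, then re-parse the simplified sentence. Returns
(new_tree, new_sentence, neg_flag), where neg_flag records that a negation
was removed."""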
# Remove redundant "S" subtrees, keeping only the first one
found_subtree = False
redundant_index_list = []
for i, subtree in enumerate(tree):
if subtree.label() == "S" and not found_subtree:
found_subtree = True
elif subtree.label() == "S" and found_subtree:
redundant_index_list.append(i)
# Delete in reverse so earlier indices stay valid
for i in reversed(redundant_index_list):
del tree[i]
if tree[-1].label() == ",":
del tree[-1]
# Remove ADVP subtrees
ptree = ParentedTree.convert(tree)
advp_list = []
for subtree in ptree.subtrees():
if subtree.label() == "ADVP":
advp_list.append(subtree)
for advp_subtree in advp_list:
advp_subtree.parent().remove(advp_subtree)
# Remove aux at beginning
# start with a model verb
neg_flag = False
remove_subject_flag = False
if len(ptree) > 1 and ptree[1].label() == "VP":
if ptree[1][0].label() == "MD" or (ptree[1][0].label().startswith("VB") and ptree[1][0].leaves()[0].lower() in ["does", "oughts"]):
if ptree[1][1].label() == "RB":
ptree[1].remove(ptree[1][1])
neg_flag = True
ptree[1].remove(ptree[1][0])
if ptree[0].leaves()[0].lower() == "it":
ptree[0].parent().remove(ptree[0])
remove_subject_flag = True
new_sentence = self.__addSubjectToSentence(self.__joinMistakenlySplitedToken(' '.join(ptree.leaves())))
if remove_subject_flag:
new_sentence = self.__addSubjectToSentence(new_sentence)
new_tree = self.predictConstituencyTree(new_sentence)
return new_tree, new_sentence, neg_flag
def __joinMistakenlySplitedToken(self, sentence):
return sentence.replace("-- ","--").replace(" - ", "-").replace("- *", "-*")
def __expandMatch(self, contraction):
match = contraction.group(0)
first_char = match[0]
expanded_contraction = CONTRACTION_MAP.get(match)\
if CONTRACTION_MAP.get(match)\
else CONTRACTION_MAP.get(match.lower())
expanded_contraction = first_char+expanded_contraction[1:]
return expanded_contraction
def __getPosFromTag(self, tag):
pos = None
if tag.startswith('J'):
pos = "ADJ"
elif tag.startswith('V'):
pos = "VERB"
elif tag.startswith('N'):
pos = "NOUN"
elif tag.startswith('R'):
pos = "ADV"
return pos
def __getPredAndObj(self, sentence):
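"""Extract (predicate, object, particle) from a topic sentence using the
spaCy dependency parse: copular roots ("be"/"become"/"seem") keep only the
subject, verbal roots may carry a phrasal particle found in WordNet, and
verbless sentences use the root word as the object. All values are
lowercased."""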
doc = self.nlp(sentence)
edge_list = []
verb_candidate_list = []
attr, predicate, object, prt = "", "", "", ""
root_token, object_candidate_token = None, None
find_root_flag, find_verb_flag = False, False
# Traverse and collect key list
for token in doc:
# collect edges of current token for graph
for child in token.children:
edge_list.append((token, child))
if not find_verb_flag and token.tag_.startswith("V"):
find_verb_flag = True
if not find_root_flag and token.dep_ == "ROOT":
root_token = token
find_root_flag = True
# If the root is a word like "be", "become", we only need to check its subject
if root_token.lemma_ in ["be", "become", "seem"] or (root_token.lemma_ == "look" and doc[root_token.i + 1].text == "like"):
target_child_list = [child for child in root_token.children if child.dep_ in ["nsubj", "csubj"]]
if len(target_child_list) > 0:
subject_token = target_child_list[0]
if subject_token.pos_ not in ["PRON"] and subject_token.text not in ["default"]:
predicate = "be"
closest_mod_token = self.__getClosestModToken(subject_token)
pobj_token = None
find_prep_result = [child for child in subject_token.children if child.text == "of"]
if len(find_prep_result) > 0:
find_pobj_result = [child for child in find_prep_result[0].children if child.dep_ == "pobj"]
if len(find_pobj_result) > 0:
pobj_token = find_pobj_result[0]
object = "%s %s" % (closest_mod_token.text, subject_token.text) if closest_mod_token else subject_token.text
if pobj_token:
closest_pobj_mod_token = self.__getClosestModToken(pobj_token)
object = "%s of %s %s" % (object, closest_pobj_mod_token.text, pobj_token.text) if closest_pobj_mod_token else "%s of %s" % (object, pobj_token.text)
return predicate.lower(), object.lower(), prt.lower()
if root_token.tag_.startswith("V"):
predicate = lemminflect.getLemma(root_token.text, upos="VERB")[0]
find_prep_result = [child.text for child in root_token.children if child.dep_ in ["prt", "prep"]]
if len(find_prep_result) > 0:
# if "%predicate_%prep" is a verb phrase if it is in the wordnet dataset
prt = find_prep_result[0] if len(wordnet.synsets("%s_%s" % (predicate, find_prep_result[0]))) > 0 else ""
elif token.head.dep_ == "ROOT" and ((token.dep_ in ['pobj', 'dobj', 'nsubjpass'])):
object_candidate_token = token
# check 'to do'
# elif token.tag_ in ['VBN','VBZ','VB']:
elif find_root_flag and token.tag_ == "TO" and token.head.pos_ == "VERB":
verb_token = token.head
verb_candidate_list.append(verb_token)
# For a topic sentence without a verb, we regard the root word as the object
if not find_verb_flag:
predicate = ""
object = root_token.text
else:
for verb_token in verb_candidate_list:
verb_count = 0
# Find all verbs behind root
for node in self.findShortestPath(edge_list, root_token, verb_token):
if node.pos_ == "VERB":
verb_count += 1
if verb_count == 2:
predicate += " " + lemminflect.getLemma(verb_token.text, upos='VERB')[0]
find_prep_result = [child.text for child in verb_token.children if child.dep_ in ["prt", "prep"]]
if len(find_prep_result) > 0:
# if "%predicate_%prep" is a verb phrase if it is in the wordnet dataset
prt = find_prep_result[0] if len(wordnet.synsets("%s_%s" % (predicate, find_prep_result[0]))) > 0 else ""
object_list = [child.text for child in verb_token.children if child.dep_ in ['nsubj', 'nsubjpass']]
object = object_list[0] if len(object_list) > 0 else ""
attr = " ".join([lemminflect.getLemma(child.text,upos='VERB')[0] for child in verb_token.children if child.dep_ in ['acomp','xcomp']])
if object_candidate_token:
closest_mod_token = None
pobj_token = None
closest_mod_token = self.__getClosestModToken(object_candidate_token)
find_prep_result = [child for child in object_candidate_token.children if child.text == "of"]
if len(find_prep_result) > 0:
find_pobj_result = [child for child in find_prep_result[0].children if child.dep_ == "pobj"]
if len(find_pobj_result) > 0:
pobj_token = find_pobj_result[0]
object = "%s %s" % (closest_mod_token.text, object_candidate_token.text) if closest_mod_token else object_candidate_token.text
if pobj_token:
closest_pobj_mod_token = self.__getClosestModToken(pobj_token)
object = "%s of %s %s" % (object, closest_pobj_mod_token.text, pobj_token.text) if closest_pobj_mod_token else "%s of %s" % (object, pobj_token.text)
# Drop the standalone word "be" (word-boundary match avoids mangling verbs like "become")
predicate = self.__joinMistakenlySplitedToken(re.sub(' +', ' ', re.sub(r'\bbe\b', '', predicate) + " " + attr).strip())
object = self.__joinMistakenlySplitedToken(object)
return predicate.lower(), object.lower(), prt.lower()
def __getClosestModToken(self, token):
closest_mod_token = None
for child in token.children:
# Get the closest compound/nmod
if child.dep_ in ['compound','nmod']:
if not closest_mod_token:
closest_mod_token = child
else:
if abs(child.i - token.i) < abs(closest_mod_token.i - token.i):
closest_mod_token = child
return closest_mod_token
def __getNodes(self, constituency_tree):
productions = []
for child in constituency_tree:
child_productions = child.productions()
for prod in child_productions:
prod = str(prod)
if "'" not in prod:
productions.append(prod)
if len(productions) > 0:
productions[-1] = productions[-1].split(' ')[0]
return productions
def __getSubtrees(self, granularity, tree):
subtrees = []
if granularity == 0:
for subtree in tree.subtrees():
if subtree.label() == "SBAR" and subtree.leaves()[0].lower() in ['if','when','unless','while', 'after', 'only', 'just', 'even']:
subtrees.append(' '.join(subtree.leaves()))
elif granularity == 1:
for subtree in tree.subtrees():
if subtree.label() in ["S","SBAR","FRAG"]:
subtrees.append(' '.join(subtree.leaves()))
return subtrees
def __isStartsWithDTPl(self, sentence):
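"""Return True if the sentence starts with a plural determiner phrase such
as "These options ..." (with no appositive on the noun), which signals that
options listed together are independent of each other."""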
doc = self.nlp(sentence)
for token in doc:
if token.tag_ in ['DT'] and token.text.lower() in ["these"]:
if token.head.tag_ in ['NNS'] and token.head.text.lower() in ["options", "switches", "flags", "commands", "directives", "configures"]:
token = token.head
for child in token.children:
if child.dep_ in ["appos"]:
return False
return True
return False
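# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original module).
# The model path and example sentence below are placeholders; substitute the
# AllenNLP constituency-parser archive you actually use.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    util = NLPUtil("/path/to/constituency-parser.tar.gz")  # placeholder path
    tree = util.predictConstituencyTree("This option prints all entries.")
    print(tree)
    synonyms, antonyms = util.getSynonymsAndAntonyms("increase")
    print(synonyms, antonyms)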