# semeval_data_helper.py
"""
Semeval Data handler
"""
import random
from spacy.en import English
nlp = English()

def convert_raw_x(line, verbose=False):
    """Convert raw line of semeval data into a useable form.

    Convert to a triple of (spacy sentence, e1_token, e2_token).
    """
    if isinstance(line, str):
        line = unicode(line)
    s = line.strip()
    s = s[s.index(u'"')+1: -(s[::-1].index(u'"')+1)]  # get s between first " and last "
    # we will assume that the first token following the <e1>, <e2> tags is the entity word.
    # note this is a big assumption and hopefully phrases will be in subtrees or in heads of the parse trees
    # TODO: this can be addressed by making it a 5-tuple with the endpoints also encoded
    # sometimes the tags are missing spaces in front or behind.
    # handle those cases separately so we don't add extra whitespace and mess up parsing
    # proper whitespacing case
    s = s.replace(u' <e1>', u' e1>')  # make sure there's spacing so it's recognized as a separate token
    s = s.replace(u'</e1> ', u' ')    # drop right tag
    s = s.replace(u' <e2>', u' e2>')
    s = s.replace(u'</e2> ', u' ')
    # if there wasn't proper whitespacing, the previous code didn't run,
    # so fill in the gaps with these corner cases where we add in extra whitespace
    s = s.replace(u'<e1>', u' e1>')  # make sure there's spacing so it's recognized as a separate token
    s = s.replace(u'</e1>', u' ')    # drop right tag
    s = s.replace(u'<e2>', u' e2>')
    s = s.replace(u'</e2>', u' ')
    s = nlp(s)
    tokenized_s = [token.text for token in s]
    for i, token in enumerate(tokenized_s):
        if u'e1>' == token[:3]:
            tokenized_s[i] = token[3:]
            e1_index = i
        elif u'e2>' == token[:3]:
            tokenized_s[i] = token[3:]
            e2_index = i
    s = u' '.join(tokenized_s)
    s = nlp(s)
    e1 = s[e1_index]
    e2 = s[e2_index]
    return (s, e1, e2)
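
# Illustrative sketch (not part of the original file): a training line such as
#   u'1\t"The <e1>dog</e1> chased the <e2>cat</e2>."'
# is stripped to the text between the quotes, the <e1>/<e2> tags are removed,
# and the tagged tokens are returned, roughly:
#   doc, e1, e2 = convert_raw_x(u'1\t"The <e1>dog</e1> chased the <e2>cat</e2>."')
#   # e1.text == u'dog', e2.text == u'cat'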

def smart_token_to_text(token, lower=True):
    """Convert spacy token to lowercase text and simplify numbers and punctuation"""
    text = token.text.lower() if lower else token.text
    if token.is_punct:
        text = u'<PUNCT>'
    if token.like_num:
        text = u'<NUM>'
    return text
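
# Hedged examples of the intended normalization (exact flags depend on the spacy model):
#   smart_token_to_text(nlp(u'Hello')[0])  -> u'hello'
#   smart_token_to_text(nlp(u'42')[0])     -> u'<NUM>'
#   smart_token_to_text(nlp(u'!')[0])      -> u'<PUNCT>'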

def dependency_path_to_root(token):
    """Traverse up the dependency tree. Include the token we are tracing"""
    dep_path = [token]
    while token.head is not token:
        dep_path.append(token.head)
        token = token.head
    # dep_path.append(token.head) # add the root node
    return dep_path
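
# Sketch of the returned ordering (assuming a typical parse): for "The dog barked",
# dependency_path_to_root on the token "dog" yields [dog, barked], i.e. the token
# itself first and the sentence root last.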

def find_common_ancestor(e1_path, e2_path, verbose=False):
    """Loop through both dep paths and return common ancestor"""
    for t1 in e1_path:
        for t2 in e2_path:
            if verbose:
                print(t1, t2)
            if t1.idx == t2.idx:
                if verbose:
                    print("Common found!")
                return t1
    return None
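
# Typically the common ancestor is the verb dominating both entities; e.g. in
# "The dog chased the cat", the paths from "dog" and "cat" should meet at "chased"
# (again assuming a typical parse).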

def convert_nominals_to_sdp(X, Y, include_ends=False, verbose=False):
    X_path = dependency_path_to_root(X)
    Y_path = dependency_path_to_root(Y)
    if verbose:
        print(X.text, X.dep_)
        print(X_path)
        print(Y.text, Y.dep_)
        print(Y_path)
    common = find_common_ancestor(X_path, Y_path, verbose=verbose)
    # # now we don't want nouns for assembly
    # X_path = X_path[1:]
    # Y_path = Y_path[1:]
    # CASE (1): no common ancestor found
    if not common:
        print("Didn't find common ancestor")
        return None
    # CASE (2): one nominal dominates the other
    elif X is common:
        sdp = []
        for token in Y_path:             # looks like (Y <- ... <- X <-) ...
            sdp.append((smart_token_to_text(token), token.dep_, token.pos_))
            if token is common:          # stop after X
                break
        sdp = list(reversed(sdp))        # flip to get ... (-> X -> ... -> Y)
    elif Y is common:
        sdp = []
        for token in X_path:             # looks like (X <- ... <- Y <-) ...
            sdp.append((smart_token_to_text(token), token.dep_, token.pos_))
            if token is common:          # stop after Y
                break
    # CASE (3): the paths meet at a third token Z
    else:
        sdp = []
        for token in X_path:             # looks like (X <- ... <- Z <-) ...
            sdp.append((smart_token_to_text(token), token.dep_, token.pos_))
            if token is common:          # keep Z this time
                break
        ysdp = []                        # keep track separately, then reverse and extend later
        for token in Y_path:             # looks like (Y <- ... <-) Z <- ...
            if token is common:          # don't keep Z from this side
                break
            ysdp.append((smart_token_to_text(token), token.dep_, token.pos_))
        sdp.extend(list(reversed(ysdp)))  # looks like (X <- ... <- Z -> ...) -> Y
    # convert endpoints of the path to placeholder X and Y tokens
    if not include_ends:
        sdp[0] = (u'<X>', sdp[0][1], sdp[0][2])
        sdp[-1] = (u'<Y>', sdp[-1][1], sdp[-1][2])
    # if len(sdp) < min_len or len(sdp) > max_len:
    #     continue # skip ones that are too short or long
    return {'path': sdp, 'target': (X.text.lower(), Y.text.lower())}
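
# Rough shape of the result for "The dog chased the cat" with X=dog, Y=cat
# (illustrative only; labels depend on the parser):
#   {'path': [(u'<X>', u'nsubj', u'NOUN'),
#             (u'chased', u'ROOT', u'VERB'),
#             (u'<Y>', u'dobj', u'NOUN')],
#    'target': (u'dog', u'cat')}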

def convert_nominals_to_sentence(X, Y, sent, include_ends=False, verbose=False):
    sdp = []
    started = False
    # loop through the sentence, start recording when we see X, then stop after we see Y
    for i, token in enumerate(sent):
        if token is X:
            started = True
            sdp.append((smart_token_to_text(token), token.dep_, token.pos_))
        elif token is Y:
            sdp.append((smart_token_to_text(token), token.dep_, token.pos_))
            started = False
            break
        elif started:
            sdp.append((smart_token_to_text(token), token.dep_, token.pos_))
    # convert endpoints of the path to placeholder X and Y tokens
    if not include_ends:
        sdp[0] = (u'<X>', sdp[0][1], sdp[0][2])
        sdp[-1] = (u'<Y>', sdp[-1][1], sdp[-1][2])
    # if len(sdp) < min_len or len(sdp) > max_len:
    #     continue # skip ones that are too short or long
    return {'path': sdp, 'target': (X.text.lower(), Y.text.lower())}

def post_process_sdp(sdp):
    """Filter out unwanted structure from an sdp"""
    if not sdp:
        return sdp
    bad_tokens = set([u'<PUNCT>'])  # set([',', '.', '-', '(', ')', '&', '*', '_', '%', '!', '?', '/', '<', '>', '\\', '[', ']', '{', '}', '"', "'"])
    sdp['path'] = [x for x in sdp['path'] if x[0] not in bad_tokens]
    return sdp

def is_ok_sdp(sdp):  # , int2vocab, oov_percent=75):
    """Helper function to make sure the SDP isn't a poor example.

    Filters once used to identify bad data (currently disabled, see commented code):
    1. Neither target may be OOV
    2. No more than `oov_percent` percent of the path tokens may be OOV
    """
    # oov = int2vocab.keys()[-1]
    # # print(oov, sdp['target'])
    # if sdp['target'][0] == oov or sdp['target'][1] == oov:
    #     return False
    # oov_count = len([t for t in sdp['path'] if t[0] == oov])
    # too_many = int((oov_percent/100.0)*len(sdp['path']))
    # if oov_count > too_many:
    #     return False
    if not sdp or not sdp['path'] or not sdp['target']:
        return False
    return True

def line_to_data(raw_line, include_ends=False, verbose=False, sentence=False, single=False):
    sent = convert_raw_x(raw_line)
    e1 = sent[1]
    e2 = sent[2]
    if sentence:
        sdp = convert_nominals_to_sentence(e1, e2, sent[0], include_ends=include_ends, verbose=verbose)
    else:
        sdp = convert_nominals_to_sdp(e1, e2, include_ends=include_ends, verbose=verbose)
    if not sdp:
        print(raw_line)
        print(sent)
    # post_process_sdp(sdp)
    if is_ok_sdp(sdp):
        if single:  # create a reversed duplicate and replace both ends with <X> or <Y>
            dup = {k: v[:] for k, v in sdp.items()}  # duplicate
            dup['path'] = dup['path'][::-1]  # reverse the path
            dup['path'][-1] = (u'<X>', dup['path'][-1][1], dup['path'][-1][2])  # convert last to a directional token
            sdp['path'][-1] = (u'<Y>', sdp['path'][-1][1], sdp['path'][-1][2])  # same for the original path
            dup['target'] = [dup['target'][0]]  # target is just the other entity, predict X|Y
            sdp['target'] = [sdp['target'][1]]  # Y|X; also other code expects targets as lists
            return [sent]*2, [sdp['path'], dup['path']], [sdp['target'], dup['target']]
        else:
            return sent, sdp['path'], sdp['target']
    else:
        print("Bad sentence: %r" % raw_line)
        print(sent, sdp)
        return None, None, None
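
# Hedged usage sketch (exact tokens and labels depend on the parser):
#   sent, path, target = line_to_data(u'1\t"The <e1>dog</e1> chased the <e2>cat</e2>."')
#   # path   -> [(u'<X>', ...), ..., (u'<Y>', ...)]
#   # target -> (u'dog', u'cat')
# With single=True the function instead returns parallel lists holding the
# forward and reversed paths, each with a one-element target list.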

def create_label2int():
    labels = [
        'Cause-Effect(e1,e2)',
        'Cause-Effect(e2,e1)',
        'Product-Producer(e1,e2)',
        'Product-Producer(e2,e1)',
        'Entity-Origin(e1,e2)',
        'Entity-Origin(e2,e1)',
        'Instrument-Agency(e1,e2)',
        'Instrument-Agency(e2,e1)',
        'Component-Whole(e1,e2)',
        'Component-Whole(e2,e1)',
        'Content-Container(e1,e2)',
        'Content-Container(e2,e1)',
        'Entity-Destination(e1,e2)',
        'Entity-Destination(e2,e1)',
        'Member-Collection(e1,e2)',
        'Member-Collection(e2,e1)',
        'Message-Topic(e1,e2)',
        'Message-Topic(e2,e1)',
        'Other']
    return {label: i for (i, label) in enumerate(labels)}
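
# The label list is fixed so ids are stable across runs, e.g.
#   create_label2int()['Cause-Effect(e1,e2)'] == 0
#   create_label2int()['Other'] == 18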

def line_to_label(raw_label_line, label2int):
    """Convert raw line of semeval labels into a useable form (ints)"""
    # the label list is defined ourselves (in create_label2int) so the ordering is nice
    line = raw_label_line.strip()
    # if line in label2int:
    #     return label2int[line]
    # else:
    #     label2int[line] = len(label2int.keys())
    #     return label2int[line]
    return label2int[line]

def load_semeval_data(shuffle_seed=42, include_ends=False, sentence=False, single=False):
    """Load the SemEval 2010 Task 8 training file and return train/valid/test dicts plus label maps.

    Sentence entries are triples of (spacy(stripped sentence), e1 token, e2 token).
    """
    ### TRAINING AND VALIDATION DATA ###
    training_txt_file = 'SemEval2010_task8_all_data/SemEval2010_task8_training/TRAIN_FILE.TXT'
    validation_index = 8000 - 800  # len data - len valid
    validation_size = 800
    all_ = {'raws': [], 'sents': [], 'sdps': [], 'targets': [], 'labels': [], 'comments': []}
    train = {'raws': [], 'sents': [], 'sdps': [], 'targets': [], 'labels': [], 'comments': []}
    valid = {'raws': [], 'sents': [], 'sdps': [], 'targets': [], 'labels': [], 'comments': []}
    text = open(training_txt_file, 'r').readlines()
    label2int = create_label2int()  # fixed dictionary of labels
    assert len(text) // 4 == 8000
    for cursor in range(len(text) // 4):  # each 4 lines is a datum
        text_line = text[4*cursor]
        label_line = text[4*cursor + 1]
        comment = text[4*cursor + 2]
        if single:  # really just for use by *2sdp for the auxiliary task
            sent, sdp, target = line_to_data(text_line, include_ends=include_ends,
                                             sentence=sentence, single=single)
            # print(sent, sdp, target)
            label = line_to_label(label_line, label2int)
            # print(sent, sdp, target, label)
            if not (sent and sdp and target):
                print("Skipping this one... %r" % text_line)
                print(sent, sdp, target, label)
                continue
            num = len(sent)  # number of copies per datum
            all_['raws'].extend([text_line]*num)
            all_['sents'].extend(list(sent))
            all_['sdps'].extend(list(sdp))
            all_['targets'].extend(list(target))
            all_['labels'].extend([label]*num)
            all_['comments'].extend([comment]*num)
            if cursor < validation_index:
                train['raws'].extend([text_line]*num)
                train['sents'].extend(list(sent))
                train['sdps'].extend(list(sdp))
                train['targets'].extend(list(target))
                train['labels'].extend([label]*num)
                train['comments'].extend([comment]*num)
            else:
                valid['raws'].extend([text_line]*num)
                valid['sents'].extend(list(sent))
                valid['sdps'].extend(list(sdp))
                valid['targets'].extend(list(target))
                valid['labels'].extend([label]*num)
                valid['comments'].extend([comment]*num)
        else:  # otherwise we only get one datum per line
            sent, sdp, target = line_to_data(text_line, include_ends=include_ends,
                                             sentence=sentence, single=single)
            label = line_to_label(label_line, label2int)
            # print(sent, sdp, target, label)
            if not (sent and sdp and target):
                print("Skipping this one... %r" % text_line)
                print(sent, sdp, target, label)
                continue
            all_['raws'].append(text_line)
            all_['sents'].append(sent)
            all_['sdps'].append(sdp)
            all_['targets'].append(target)
            all_['labels'].append(label)
            all_['comments'].append(comment)
            if cursor < validation_index:
                train['raws'].append(text_line)
                train['sents'].append(sent)
                train['sdps'].append(sdp)
                train['targets'].append(target)
                train['labels'].append(label)
                train['comments'].append(comment)
            else:
                valid['raws'].append(text_line)
                valid['sents'].append(sent)
                valid['sdps'].append(sdp)
                valid['targets'].append(target)
                valid['labels'].append(label)
                # valid['comments'].append(comment)
    # shuffle everything and take the last validation_size as validation, the rest as training
    if shuffle_seed:
        random.seed(shuffle_seed)
        zip_all = zip(all_['raws'], all_['sents'], all_['sdps'], all_['targets'], all_['labels'], all_['comments'])
        random.shuffle(zip_all)
        raws, sents, sdps, targets, labels, comments = zip(*zip_all)
        train['raws'], valid['raws'] = raws[:-validation_size], raws[-validation_size:]
        train['sents'], valid['sents'] = sents[:-validation_size], sents[-validation_size:]
        train['sdps'], valid['sdps'] = sdps[:-validation_size], sdps[-validation_size:]
        train['targets'], valid['targets'] = targets[:-validation_size], targets[-validation_size:]
        train['labels'], valid['labels'] = labels[:-validation_size], labels[-validation_size:]
        train['comments'], valid['comments'] = comments[:-validation_size], comments[-validation_size:]
    int2label = {i: label for (label, i) in label2int.items()}
    print("Num training: %i" % len(train['labels']))
    print("Num validation: %i" % len(valid['labels']))
    assert sorted(label2int.values()) == range(19)  # 2 directions for each of the 9 relations plus 1 'Other'
    ### TEST DATA ### (has no labels, and is not used in duplicate form, so don't output that)
    # NOTE: converting test data into duplicate form is not implemented
    test_txt_file = "SemEval2010_task8_all_data/SemEval2010_task8_testing/TEST_FILE.txt"
    test = {'raws': [], 'sents': [], 'sdps': [], 'targets': []}
    text = open(test_txt_file, 'r').readlines()
    for line in text:
        sent, sdp, target = line_to_data(line, include_ends=include_ends, sentence=sentence)
        if not (sent and sdp and target):
            print("Skipping this one... %r" % line)
            print(sent, sdp, target)
            sent = [nlp(u'<OOV>')]*3
            sdp = [[u'<OOV>', u'<OOV>', u'<OOV>']]
            target = [u'<OOV>', u'<OOV>']
        test['raws'].append(line)
        test['sents'].append(sent)
        test['sdps'].append(sdp)
        test['targets'].append(target)
    print("Num testing: %i" % len(test['targets']))
    return train, valid, test, label2int, int2label
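
# Minimal usage sketch (not part of the original module), assuming the
# SemEval2010_task8_all_data directory sits next to this file as in the
# hard-coded paths above:
if __name__ == '__main__':
    train, valid, test, label2int, int2label = load_semeval_data(shuffle_seed=42)
    print("First training SDP: %r" % (train['sdps'][0],))
    print("Its label: %s" % int2label[train['labels'][0]])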