-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsimple.py
298 lines (262 loc) · 9.97 KB
/
simple.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
#######################################################
# Simple Name Cleaner
# produced at Phylotastic-II / NESCent::HIP
#
# TNRS Team: Dan Leehr, Andrew Lenards, Guarav Vaidya
#######################################################
import requests
import codecs
import csv
import time
import json
import sys
import os
import types
from trees import Tree
from optparse import OptionParser
# Base URL of the Taxosaurus TNRS service (names are POSTed to <url>/submit).
taxosaurus_url="http://taxosaurus.org/"
# Global Names Recognition and Discovery endpoint used to extract names from files.
gnrd_url='http://gnrd.globalnames.org/name_finder.json'
# Minimum Taxosaurus match score at which a submitted name is replaced;
# may be overridden at runtime via the --match-threshold option (see main()).
MATCH_THRESHOLD=0.9
# Input-file type flags, set by the -n/--newick and -x/--nexml options.
TYPE_NEWICK=1
TYPE_NEXML=2
def lookup_taxosaurus(names,limit_source):
    """Submit names to the Taxosaurus TNRS service and poll until a result is ready.

    names: list of name strings; they are joined with newlines into one query.
    limit_source: optional source identifier (e.g. 'NCBI') to restrict matching;
        falsy values mean "query all sources".
    Returns the decoded JSON response from Taxosaurus.
    Raises an HTTP error (via raise_for_status) if the final response failed.

    NOTE(review): requests follows redirects by default, so the explicit
    302-polling loop below only triggers if the service reports 302 on the
    final (followed) response — confirm against the live service's behavior.
    """
    print('Calling Taxosaurus'),  # Py2 trailing comma: progress dots stay on one line
    payload={'query': '\n'.join(names)}
    if limit_source:
        payload['source'] = limit_source
    response = requests.post(taxosaurus_url + 'submit',params=payload)
    while response.status_code == 302:
        print('.'),
        sys.stdout.flush()
        time.sleep(0.5)  # brief pause between polls so we don't hammer the service
        response = requests.get(response.url)
    response.raise_for_status()
    print('')
    return response.json()
def get_args():
    """Build the command-line option parser and return (options, args).

    Options mirror the script's features: input file, GNRD skipping, tree
    format flags, a single-source restriction, and the match-score threshold.
    """
    threshold_help = ("the matching score threshold to use, defined as a "
                      "decimal, all matches equal to or greater will be replaced."
                      " The default is %s") % MATCH_THRESHOLD
    parser = OptionParser(
        usage="usage:\n %prog [options] file-input\n or\n %prog [options] --file file-input")
    # Declarative table of (flags, keyword-arguments); registered in order below.
    option_table = [
        (("-f", "--file"),
         dict(dest="filename", metavar="FILE",
              help="the file, FILE, read from...")),
        (("-s", "--skip-gnrd"),
         dict(dest="skip_gnrd", action="store_true", default=False,
              help="Do not lookup names at GNRD. Only valid for a text file or newick tree")),
        (("-n", "--newick"),
         dict(dest="type", action="store_const", const=TYPE_NEWICK,
              help="The file is a newick tree")),
        (("-x", "--nexml"),
         dict(dest="type", action="store_const", const=TYPE_NEXML,
              help="The file is NeXML")),
        (("--source",),
         dict(dest="limit_source", default=None,
              help="Limit taxosaurus to a single source: [MSW3|iPlant|NCBI]")),
        (("--match-threshold",),
         dict(dest="m_threshold", default=MATCH_THRESHOLD,
              metavar="MATCH_SCORE_THRESHOLD", help=threshold_help)),
    ]
    for flags, kwargs in option_table:
        parser.add_option(*flags, **kwargs)
    return parser.parse_args()
def grab_file(options, args):
    """Return the input filename for this run.

    The --file/-f option wins when supplied; otherwise the first positional
    argument is assumed to be the file to operate on.
    """
    return args[0] if options.filename is None else options.filename
def replace_names(names, mapping):
    """Return a copy of *names* with each name replaced by its cleaned form.

    names: the original list of name strings.
    mapping: dict of submitted name -> accepted (clean) name.
    Names absent from the mapping pass through unchanged; the input list is
    never mutated. (Idiom fix: replaces the manual append loop with a
    comprehension using dict.get, avoiding the double lookup via .keys().)
    """
    return [mapping.get(name, name) for name in names]
def replace_names_nexml(filename, mapping):
    """Dangerously replaces the labels in a nexml file.

    Rewrites both OTU labels and node labels using *mapping* and writes the
    result as '<prefix>_clean.xml', where prefix is the filename up to its
    first '.'.

    Bug fix: the original fallback branch referenced an undefined name
    'fname' (NameError if ever taken); it now falls back to *filename*.
    In practice str.split always returns at least one piece, so the
    fallback is unreachable, but it is now at least correct.
    """
    n = Tree(filename, 'nexml')
    n.replace_otu_labels(mapping)
    n.replace_node_labels(mapping)
    pieces = filename.split('.')[:1]
    prefix = pieces[0] if pieces else filename
    report_filename = prefix + '_clean.xml'
    n.write_nexml_tree(report_filename)
def get_best_match(matches):
    """Return the highest-scoring entry of *matches*, or None when empty.

    Each match is a dict whose 'score' value parses as a float. Ties are
    broken in favor of the later element (stable sort, last element taken).
    """
    if not matches:
        return None
    ranked = sorted(matches, key=lambda entry: float(entry['score']))
    return ranked[-1]
def log_record_in(report, name, match, matches):
    """Fill in the report's record for one submitted name (mutates *report*).

    report: dict keyed by submitted name; report[name] must already exist.
    name: the submitted name being recorded.
    match: the chosen best match dict, or a falsy value when nothing matched.
    matches: full match list (currently unused, kept for interface stability).
    """
    record = report[name]
    record['submittedName'] = name
    if match:
        record['accepted'] = match['acceptedName']
        record['sourceId'] = match['sourceId']
        record['uri'] = match['uri']
        record['score'] = match['score']
    else:
        # No usable match: mark the record so the report shows it was seen.
        record['accepted'] = 'none'
def create_name_mapping(names, match_threshold):
    """Return the mapping of input to clean names above the minimum score,
    and a provenance report of all action taken.

    names: Taxosaurus result entries, each a dict with 'submittedName' and
        a 'matches' list.
    match_threshold: minimum float score at which a name is replaced.
    Returns (mapping, prov_report): mapping of submittedName -> acceptedName
    for qualifying matches, and a per-name provenance dict.

    Bug fix: the original's no-match branch passed the loop-carried variable
    'match' to log_record_in — undefined on the first no-match name
    (NameError) or stale from a previous iteration. We now always pass the
    current get_best_match result, which is None when there are no matches.
    """
    mapping = dict()
    prov_report = dict()
    for name in names:
        matches = name['matches']
        submittedName = name['submittedName']
        prov_report[submittedName] = dict()
        match = get_best_match(matches)  # None when matches is empty
        log_record_in(prov_report, submittedName, match, matches)
        if match:
            accepted = match['acceptedName']
            score = float(match['score'])
            # Only replace when Taxosaurus supplied a non-empty accepted
            # name and the match score clears the threshold.
            if accepted != "" and score >= match_threshold:
                mapping[submittedName] = accepted
    return mapping, prov_report
def lookup_gnrd(filename, names):
    """
    Uses GNRD to look up the names in the provided list or file-like object

    filename: the original input path; only its basename is sent, as the
        upload's filename, so GNRD can sniff the file type.
    names: either a newline-joined string of names or an open binary file.
    Returns (names, names_dict): the scientific names found, and a dict
    keyed by scientific name aggregating verbatim strings, identified
    names, and (offsetStart, offsetEnd) pairs for each occurrence.

    NOTE(review): the request is sent with GET while carrying a multipart
    'files' body — GNRD's file-upload endpoint presumably expects POST;
    confirm against the live service before changing.
    """
    files={'file': (os.path.basename(filename), names)}
    params={'unique':'false'}  # 'false' so offsets/verbatims are included per occurrence
    print("Calling Global Names Discovery Service"),  # Py2 trailing comma: keep dots on one line
    response = requests.get(gnrd_url, params=params, files=files)
    # GNRD reports 'status': 303 in the JSON body while results are still
    # being prepared; poll the returned URL until it changes.
    while response.json()['status'] == 303:
        print('.'),
        sys.stdout.flush()
        time.sleep(0.5)
        response = requests.get(response.url)
    response.raise_for_status()
    print('')
    names_dict = {}
    for name in response.json()['names']:
        # response json with unique true
        # {
        #   "identifiedName": "Carnivora",
        #   "scientificName": "Carnivora",
        #   "verbatim": "Carnivora:"
        # }
        #
        # response json with unique false
        # {
        #   "identifiedName": "Halichoerus grypus",
        #   "offsetEnd": 3430,
        #   "offsetStart": 3411,
        #   "scientificName": "Halichoerus grypus",
        #   "verbatim": "(Halichoerus grypus)"
        # }
        scientific_name = name['scientificName']
        name_dict = {} # keyed by scientificName
        if scientific_name in names_dict.keys():
            # Seen before: extend the existing aggregate record.
            name_dict = names_dict[scientific_name]
        else:
            # First occurrence: initialize the aggregate lists.
            name_dict['scientific_name'] = name['scientificName']
            name_dict['verbatims'] = []
            name_dict['identified_names'] = []
            name_dict['offsets'] = []
        name_dict['verbatims'].append(name['verbatim'])
        name_dict['identified_names'].append(name['identifiedName'])
        name_dict['offsets'].append((name['offsetStart'], name['offsetEnd']))
        names_dict[scientific_name] = name_dict
    return (names_dict.keys(), names_dict)
def get_names_from_file(filename,tree_type=None):
"""
Returns a string containing names from the file, or a file-like object to POST
to gnrd
"""
names = None
# needs to be multipart/form-data
if tree_type in [TYPE_NEWICK, TYPE_NEXML]:
type = 'newick' if tree_type == TYPE_NEWICK else 'nexml'
n = Tree(filename, type)
labels = n.get_otu_labels()
labels = labels + n.get_node_labels()
# uniquify
labels = list(set(labels))
if None in labels:
del labels[labels.index(None)]
print "Extracted %d names from %s Tree" % (len(labels), type)
names ='\n'.join(labels)
else:
names = open(filename,'rb') # open in binary in case of PDF or Office document
return names
def main():
    """Drive the whole pipeline: parse args, extract names, clean them, report.

    Reads the input file, optionally extracts names via GNRD, resolves them
    through Taxosaurus, writes a CSV change report, and writes either a
    cleaned NeXML tree or a plain '<file>.clean' name list.
    """
    global MATCH_THRESHOLD  # command line may override the module default
    (options, args) = get_args()
    fname = grab_file(options, args)
    if (options.m_threshold != None and options.m_threshold != MATCH_THRESHOLD):
        MATCH_THRESHOLD = float(options.m_threshold)
    names = None
    names_dict = None
    source_names = get_names_from_file(fname, options.type)
    if(options.skip_gnrd):
        # GNRD skipped: split the names ourselves. A string result came from
        # a tree; anything else is the raw file object (Python 2 only:
        # types.StringTypes does not exist on Python 3).
        if isinstance(source_names, types.StringTypes):
            names = source_names.split('\n')
        else:
            names = [n.rstrip().decode('utf-8') for n in source_names.readlines()]
    else:
        (names, names_dict) = lookup_gnrd(fname,source_names)
    # names_dict contains results of GNRD extraction if performed
    print "Found %d names" % (len(names))
    result = lookup_taxosaurus(names,options.limit_source)
    print "Received %d matches from Taxosaurus" % (len(result['names']))
    # Check for errors in taxosaurus lookup
    for source in result['metadata']['sources']:
        if 'errorMessage' in source.keys():
            print "Error querying %s: %s: %s" % (source['sourceId'], source['status'], source['errorMessage'])
        else:
            print "Queried %s: %s" % (source['sourceId'], source['status'])
    (mapping, prov_report) = create_name_mapping(result['names'], MATCH_THRESHOLD)
    # fields = ['submittedName','accepted','sourceId','uri','score','otherMatches']
    fields = ['submittedName','accepted','sourceId','uri','score']
    # Report filename: everything before the first '.' plus a suffix.
    pieces = fname.split('.')[:1]
    prefix = pieces[0] if len(pieces) >= 1 else fname
    report_filename = prefix + '_change_report.csv'
    print prov_report
    with codecs.open(report_filename, 'w', encoding='utf-8') as report_writer:
        report_writer.write(', '.join(fields) + '\n')
        # NOTE(review): records for unmatched names only carry
        # 'submittedName' and 'accepted' (see log_record_in), so
        # record[field] below looks like it raises KeyError for
        # 'sourceId'/'uri'/'score' — confirm and guard if so.
        for key in prov_report.keys():
            record = prov_report[key]
            report_writer.write(', '.join([record[field] for field in fields]) + '\n')
            # report_writer.write(', '.join(prov_report[key].values()) + '\n')
            # [prov_report[key][field] for field in fields]
    if options.type == TYPE_NEXML:
        replace_names_nexml(fname, mapping)
    else:
        replaced = replace_names(names, mapping)
        # For now, just write the list out to file
        with codecs.open(fname + '.clean', 'w', encoding='utf-8') as dest:
            for item in replaced:
                dest.write(item + '\n')
# Script entry point: run the pipeline only when executed directly, not on import.
if __name__ == "__main__":
    main()