-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrank_genes.py
532 lines (447 loc) · 18.3 KB
/
rank_genes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
"""Extract, transform, and load data into TSV files for to rank genes
Ranked genes are all, at least, mentioned in a study's related publication.
These published genes are then sought among very differentially expressed
genes in the annotation's various labeled groups. The groups in which each
published gene is very DE'd (if any) are noted. Finally, data on each gene's
worldwide popularity, ultimately measured via the Gene Hints pipeline using
either Wikipedia page views or PubMed citation counts, is also included.
The output TSV file contains one gene per row. This is parsed by Image Pipeline
to prioritize which genes to cache.
EXAMPLES
python3 ingest/ingest_pipeline.py --study-id 5d276a50421aa9117c982845 --study-file-id 5dd5ae25421aa910a723a337 rank_genes --study-accession SCP367 --publication="https://doi.org/10.1126/sciadv.adf6251" --rank-genes
"""
import logging
import re
import datetime
import glob
import gzip
import tarfile
import json
import urllib
import urllib.request
import csv
from io import BytesIO
import xml.etree.ElementTree as ET
import requests
from utils import get_scp_api_base, fetch_pmid_pmcid
# Used when importing internally and in tests
from ingest_files import IngestFiles
from monitor import setup_logger
# Timestamp like "2024-01-01T120000"; colons removed so it is URL/file safe
timestamp = datetime.datetime.now().isoformat(sep="T", timespec="seconds")
url_safe_timestamp = timestamp.replace(':', '')
log_name = f"rank_genes_{url_safe_timestamp}_log.txt"
logger = setup_logger(
    __name__, log_name, level=logging.INFO, format="support_configs"
)
def download_gzip(url, output_path, cache=0):
    """Download gzip file, decompress, write to output path; use optional cache

    Cached files can help speed development iterations by > 2x, and some
    development scenarios (e.g. on a train or otherwise without an Internet
    connection) can be impossible without it.

    :param url: URL of the .gz or .tar.gz resource to fetch
    :param output_path: extraction directory (tar.gz) or output file path (gz)
    :param cache: currently unused; reserved for caching behavior — TODO wire up
    """
    is_targz = url.endswith("tar.gz")
    headers = {"Accept-Encoding": "gzip"} if is_targz else {}
    logger.info(f"Requesting {url}")
    request = urllib.request.Request(url, headers=headers)
    # Context managers ensure the HTTP response and tar handle are closed
    # (the original leaked the response object and left the tar open on error)
    with urllib.request.urlopen(request) as response:
        payload = response.read()
    if is_targz:
        # NOTE(review): extractall on an untrusted archive can write outside
        # output_path (path traversal); consider tarfile's `filter="data"`
        # once the runtime is confirmed to be Python >= 3.12.
        with tarfile.open(name=None, fileobj=BytesIO(payload)) as tar:
            tar.extractall(output_path)
    else:
        content = gzip.decompress(payload).decode()
        with open(output_path, "w") as f:
            f.write(content)
def fetch_pmcid_text(pmcid):
    """Get full text for publication, given its PubMed Central ID

    Downloads the PMC Open Access package for `pmcid`, then extracts the
    article body text from the package's .nxml file.

    :param pmcid: PubMed Central ID, e.g. "PMC1234567"
    :raises ValueError: if the article is not in the PMC Open Access subset
    """
    oa_url = f"https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id={pmcid}"
    with urllib.request.urlopen(oa_url) as response:
        data = response.read().decode("utf-8")
    oa_xml = ET.fromstring(data)
    oa_package = oa_xml.find(".//link[@format='tgz']")
    if oa_package is None:
        msg = f"This PMCID is not open access: {pmcid}"
        raise ValueError(msg)
    package_url = oa_package.attrib["href"].replace("ftp://", "https://")
    download_gzip(package_url, ".")
    nxml_path = glob.glob(f"{pmcid}/*.nxml")[0]
    article = ET.parse(nxml_path).getroot()
    # encoding="unicode" returns real text; the original wrapped the default
    # bytes output in str(), yielding a "b'...'" repr with escaped newlines
    text = ET.tostring(article.find("body"), method="text", encoding="unicode")
    return text
def fetch_publication_text(publication):
    """Get full text for a publicly-accessible article given its URL

    :param publication: DOI URL, bioRxiv URL, or local path to a text file
    :return: [text, platform] where platform is 'pmc', 'biorxiv', or 'other'
    """
    # Examples:
    # <Is open access> -- <publication URL> -- <Study accession>
    # 0. Yes -- https://www.biorxiv.org/content/10.1101/2021.11.13.468496v1 -- SCP1671
    # ---
    # 1. Not really, but abstract mentions genes -- https://doi.org/10.1210/en.2018-00750 -- SCP2058
    # 2. Yes -- https://www.biorxiv.org/content/10.1101/2022.07.01.498017v1 -- SCP2053
    # 3. Yes -- https://www.biorxiv.org/content/10.1101/2022.09.27.509354v1 -- SCP2046
    # 4. Yes -- https://www.biorxiv.org/content/10.1101/2022.09.27.509354v1 -- SCP2045
    # 5. Yes -- https://www.biorxiv.org/content/10.1101/2022.09.27.509354v1 -- SCP2011 (summary body)
    # 6. Yes (via PMC) -- https://doi.org/10.1038/s41467-022-28372-y -- SCP1985
    # 7. Yes (via biorxiv) -- https://doi.org/10.1101/2022.01.09.475571 -- SCP1921
    # 8. Yes -- https://www.biorxiv.org/content/10.1101/2022.06.29.496888v1 -- SCP1905
    # 9. Yes -- https://www.biorxiv.org/content/10.1101/2022.06.29.496888v1 -- SCP1903
    # 10. Not really, but abstract mentions genes -- https://doi.org/10.1016/j.immuni.2023.01.002 -- SCP1884
    # 11. Not really, not until 2023-10-12, but abstract mentions genes, and full text available in biorxiv -- https://doi.org/10.1093/toxsci/kfac109 -- SCP1875
    doi_split = publication.split("https://doi.org/")
    if len(doi_split) > 1:
        platform = 'pmc'
        doi = doi_split[1]
        [_, pmcid] = fetch_pmid_pmcid(doi, True)
        logger.info('Converted DOI to PMC ID: ' + pmcid)
        text = fetch_pmcid_text(pmcid)
    elif publication.startswith("https://www.biorxiv.org"):
        platform = 'biorxiv'
        full_text_url = f"{publication}.full.txt"
        # Lazy %-style arg; the original passed the URL as a format arg with
        # no placeholder, which logging reports as a formatting error
        logger.info('full_text_url %s', full_text_url)
        with urllib.request.urlopen(full_text_url) as response:
            raw_text = response.read().decode('utf-8')
        text = raw_text.split('## Reference')[0]  # Skip author names, etc.
    else:
        platform = 'other'
        file_path = publication
        with open(file_path) as f:
            text = f.read()
    return [text, platform]
def fetch_gene_cache(organism):
    """Download and parse the upstream gene cache TSV for an organism

    Returns a list of four dicts, each keyed by gene symbol:
    global interest rank, zeroed mention counts, genomic loci, full names.
    """
    genes_filename = f"{organism}-genes.tsv"  # e.g. homo-sapiens-genes.tsv
    # NOTE(review): "[email protected]" looks like an email-obfuscated npm package
    # spec (name@version) introduced by a web scrape — confirm the real
    # package name before relying on this URL
    base_url = "https://cdn.jsdelivr.net/npm/"
    genes_url = f"{base_url}[email protected]/dist/data/cache/genes/{genes_filename}.gz"
    download_gzip(genes_url, genes_filename)

    # Containers for various name and significance fields
    interest_rank_by_gene = {}
    counts_by_gene = {}
    loci_by_gene = {}
    full_names_by_gene = {}

    rank = 0
    with open(genes_filename) as file:
        for fields in csv.reader(file, delimiter="\t"):
            # Skip comment rows and malformed short rows
            if fields[0] == '#' or len(fields) < 2:
                continue
            symbol = fields[4]
            counts_by_gene[symbol] = 0
            loci_by_gene[symbol] = {
                "chromosome": fields[0],
                "start": fields[1],
                "length": fields[2],
            }
            full_names_by_gene[symbol] = fields[5]
            # Genes in the upstream file are ordered by global popularity
            interest_rank_by_gene[symbol] = rank
            rank += 1

    return [interest_rank_by_gene, counts_by_gene, loci_by_gene, full_names_by_gene]
def screen_false_positive_mentions(publication_text, mentions_by_gene):
    """Screen out hits that are non-gene abbreviations

    Mutates and returns `mentions_by_gene`, removing gene symbols whose hit
    in the publication is likely an abbreviation for a non-gene term.
    """
    # "TF" is a real gene symbol, but in prose it usually abbreviates
    # "transcription factor(s)"
    fp_for_tf = bool(re.search(r"transcription factor(s)? \(TF", publication_text))
    if fp_for_tf and "TF" in mentions_by_gene:
        del mentions_by_gene["TF"]
        logger.info(
            'Deleted false positive gene mention for "TF", ' +
            'defined as "transcription factor" in publication'
        )
    # Detect gene-mentions that have the grammatical structure:
    #
    # the <term> (<"gene">)
    #
    # like "the Arcuate (ARC", which mentions a non-gene.
    # Source context: https://doi.org/10.1126/sciadv.adf6251
    articular_fp_genes = []
    for gene in mentions_by_gene:
        # re.escape guards against gene symbols containing regex metacharacters
        fp_match = \
            re.search(r"the ([A-Za-z ]){0,20} \(" + re.escape(gene), publication_text)
        is_likely_fp = bool(fp_match)
        if is_likely_fp:
            term = fp_match.group(0)
            if (
                "express" in term or  # e.g. "the Oxytocin expressing (OXT"
                gene.lower() not in term.split('(')[0].lower()  # e.g. "the astrocyte (GJA1"
            ):
                continue
            logger.info(
                f'Deleted false positive gene mention for "{gene}", ' +
                f'defined as "{term}" in publication'
            )
            articular_fp_genes.append(gene)
    # Delete after the loop to avoid mutating the dict while iterating it
    for fp_gene in articular_fp_genes:
        del mentions_by_gene[fp_gene]
    return mentions_by_gene
def extract_mentions_and_interest(organism, publication):
    """Get mentions in publication and interest rank for each gene in organism

    :return: [interest_rank_by_gene, mentions_by_gene, loci_by_gene,
        full_names_by_gene], each keyed by gene symbol
    """
    logger.info('Extracting mentions and interest')
    [
        interest_rank_by_gene,
        counts_by_gene,
        loci_by_gene,
        full_names_by_gene,
    ] = fetch_gene_cache(organism)
    [publication_text, platform] = fetch_publication_text(publication)
    # Split on spaces and slashes; surrounding punctuation is stripped below
    publication_words = re.split(r'[ /]', publication_text)
    for word in publication_words:
        raw_word = word.strip('*,.()-+')
        if (
            platform == 'biorxiv' and
            len(raw_word) == 1 and  # "a" and "T" are official mouse gene names, but rarely relevant
            word[0] != '*' and word[-1] != '*'  # Plaintext mark for italics in bioRxiv, suggests gene name
        ):
            logger.info(f'Omitting likely false positive gene mention: "{word}"')
            continue
        if raw_word in counts_by_gene:
            counts_by_gene[raw_word] += 1
    # Keep only genes actually mentioned (the original also had a dead no-op
    # statement `counts_by_gene[gene]` here, now removed)
    mentions_by_gene = {
        gene: count for gene, count in counts_by_gene.items() if count > 0
    }
    mentions_by_gene = \
        screen_false_positive_mentions(publication_text, mentions_by_gene)
    return [
        interest_rank_by_gene,
        mentions_by_gene,
        loci_by_gene,
        full_names_by_gene
    ]
def extract(organism, publication, bucket, de_dict):
    """Data mine genes from publication

    :return: list of five dicts keyed by gene symbol: interest rank,
        publication mention counts, DE entries, genomic loci, full names
    """
    # (Fixed "Exracting" typo in the original log message)
    logger.info(f"Extracting gene names from publication: {publication}")
    [
        interest_rank_by_gene,
        mentions_by_gene,
        loci_by_gene,
        full_names_by_gene
    ] = extract_mentions_and_interest(organism, publication)
    de_by_gene = extract_de(bucket, de_dict)
    return [
        interest_rank_by_gene,
        mentions_by_gene,
        de_by_gene,
        loci_by_gene,
        full_names_by_gene
    ]
def sanitize_string(string):
    """Replace characters unsafe for identifiers (space, '.', '[', ']') with '_'"""
    unsafe_to_underscore = str.maketrans({' ': '_', '.': '_', '[': '_', ']': '_'})
    return string.translate(unsafe_to_underscore)
def extract_de(bucket, de_dict):
    """Fetch differential expression (DE) files, return DE fields by gene

    :param bucket: GCS bucket name holding the study's DE files
    :param de_dict: dict with "de_groups_and_files": list of [group, filename]
    :return: dict mapping gene symbol -> list of DE entry dicts, each list
        sorted by Scanpy `scores` rank
    """
    de_by_gene = {}
    directory = "_scp_internal/differential_expression/"
    for [group, de_file] in de_dict["de_groups_and_files"]:
        de_gsurl = f"gs://{bucket}/{directory}{de_file}"
        ingest_file = IngestFiles(de_gsurl)
        local_path = ingest_file.resolve_path(de_gsurl)[1]
        with open(local_path) as file:
            reader = csv.reader(file, delimiter="\t")
            i = 0
            # Headers in upstream, precomputed DE files:
            # names scores logfoldchanges pvals pvals_adj pct_nz_group pct_nz_reference
            for row in reader:
                if row[0] == "":
                    continue  # Skip novel header / empty lines
                if i >= 500:
                    # 500 is ~2% of 25k-ish genes in Ensembl genome annotation;
                    # the original kept scanning the whole file while ignoring
                    # every later row — stop early instead
                    break
                gene = row[1]
                de_entry = {
                    "group": group,
                    "log2fc": row[3],  # Scanpy `logfoldchanges`
                    "adjusted_pval": row[5],  # Scanpy `pvals_adj`
                    "scores_rank": str(i),  # per Scanpy `scores`
                }
                de_by_gene.setdefault(gene, []).append(de_entry)
                i += 1
    # Sort each gene's entries by Scanpy `scores` rank
    sorted_de_by_gene = {}
    for gene, de_entries in de_by_gene.items():
        sorted_de_by_gene[gene] = sorted(
            de_entries, key=lambda de: int(de["scores_rank"])
        )
    return sorted_de_by_gene
def get_de_column(de_entries, de_meta_keys):
    """Serialize a gene's DE entries into a single TSV column value

    Fields within one group are "!"-delimited; groups are ";"-delimited,
    so all DE data for a gene fits in one TSV column.
    """
    per_group = (
        "!".join(entry[key] for key in de_meta_keys) for entry in de_entries
    )
    return ";".join(per_group)
def get_metainformation(meta, de_meta_keys):
    """Get headers about entire content, and inner column formats

    Metainformation fields are also used in the VCF file specification.

    :param meta: study-level metadata dict; its "annotation" value is a dict
        with "name" and "scope" keys
    :param de_meta_keys: ordered keys used in the differential_expression column
    """
    annot = meta["annotation"]
    # Flatten the annotation into a string on a shallow copy, so the
    # caller's dict is not mutated (the original overwrote meta in place)
    meta = {**meta, "annotation": f"{annot['name']}--group--{annot['scope']}"}
    content_meta = ";".join([f"{k}={v}" for (k, v) in meta.items()])
    de_meta = "differential_expression keys: " + ";".join(de_meta_keys)
    metainformation = (
        "## "
        + "\n## ".join(
            ["Ranked genes - Single Cell Portal", content_meta, de_meta]
        )
        + "\n"
    )
    return metainformation
def sort_genes_by_relevance(gene_dicts):
    """Sort genes by # mentions, then DE score rank, then global interest rank"""
    [
        interest_rank_by_gene,
        mentions_by_gene,
        de_by_gene,
        loci_by_gene,
        full_names_by_gene,
    ] = gene_dicts
    # Sentinel larger than any real Scanpy scores rank, so genes with no DE
    # data sort AFTER differentially expressed genes. The original used -1,
    # which incorrectly ranked no-DE genes ahead of the top DE genes.
    no_de_rank = 10 ** 9
    genes = []
    for gene in interest_rank_by_gene:
        de = de_by_gene.get(gene, '')
        top_de_rank = de[0]["scores_rank"] if de != '' else no_de_rank
        genes.append({
            "top_de_rank": top_de_rank,
            "de": de,
            "mentions": mentions_by_gene.get(gene, 0),
            "interest_rank": interest_rank_by_gene[gene],
            "locus": loci_by_gene[gene],
            "full_name": full_names_by_gene[gene],
            "symbol": gene,
        })
    # Python's sort is stable: apply keys from least to most significant
    genes = sorted(genes, key=lambda gene: int(gene["interest_rank"]))
    genes = sorted(genes, key=lambda gene: int(gene["top_de_rank"]))
    genes = sorted(genes, key=lambda gene: -int(gene["mentions"]))
    return genes
def transform(gene_dicts, meta):
    """Transform extracted dicts into TSV content

    :param gene_dicts: output of `extract`
    :param meta: study-level metadata dict for the metainformation header
    :return: full TSV content string (metainformation + header + data rows)
    """
    num_genes_to_output = 100
    de_meta_keys = ["group", "log2fc", "adjusted_pval", "scores_rank"]
    ranked_genes = sort_genes_by_relevance(gene_dicts)
    rows = []
    # Slice before building rows (the original built every row, sliced at the
    # end, and kept a dead `i` counter)
    for gene in ranked_genes[:num_genes_to_output]:
        locus = gene["locus"]
        row = [
            gene["symbol"],
            locus["chromosome"],
            locus["start"],
            locus["length"],
            gene["full_name"],
            get_de_column(gene["de"], de_meta_keys),
            str(gene["mentions"]),
            str(gene["interest_rank"]),
        ]
        rows.append("\t".join(row))
    rows = "\n".join(rows)
    metainformation = get_metainformation(meta, de_meta_keys)
    header = (
        "# "
        + "\t".join([
            'name',
            'chromosome',
            'start',
            'length',
            'full_name',
            'differential_expression',
            'publication_mentions',
            'interest_rank',
        ])
        + "\n"
    )
    tsv_content = metainformation + header + rows
    return tsv_content
def load(clustering, annotation, tsv_content, bucket_name):
    """Load TSV content into file, write to disk"""
    # TODO: Consider cluster- and annotation-specific gene ranking
    # output_path = f"ranked_genes_{clustering}--{annotation}.tsv"
    output_path = "ranked_genes.tsv"
    with open(output_path, "w") as f:
        f.write(tsv_content)
    logger.info('Wrote content')
    print(tsv_content)
    # Upload the ranked genes TSV into the study bucket
    gs_path = f"_scp_internal/ranked_genes/{output_path}"
    output_gs_url = f"gs://{bucket_name}/{gs_path}"
    IngestFiles.delocalize_file(output_gs_url, output_path, gs_path)
    logger.info("Uploaded gene ranks to bucket")
def fetch_context(accession):
    """Get cluster, annotation, and differential expression data from SCP API

    :param accession: SCP study accession, e.g. "SCP123"
    :return: (bucket_id, organism, de_dict) where de_dict holds clustering,
        annotation, and one-vs-rest DE group/file pairs
    :raises ValueError: if the study has no one-vs-rest SCP-computed DE
    """
    api_base = get_scp_api_base()
    url = f"{api_base}/studies/{accession}/explore"
    # NOTE(review): verify=False disables TLS certificate verification;
    # presumably needed for staging environments — confirm before hardening
    response = requests.get(url, verify=False)
    if response.status_code == 401:
        # Warn before parsing: a 401 response body may not be valid JSON,
        # in which case the original never reached this message
        print(
            '*** Received 401 error from SCP API. ' +
            'Ensure study is public, not private. ***'
        )
    try:
        explore_json = response.json()
    except json.decoder.JSONDecodeError as e:
        if 'staging' in api_base:
            print(
                '*** Error requesting SCP API on staging. ' +
                'Ensure you are connected to VPN. ***'
            )
        raise e
    bucket_id = explore_json["bucketId"]
    raw_organism = explore_json["taxonNames"][0]
    organism = raw_organism.lower().replace(' ', '-')
    try:
        # DE is available; use first eligible clustering and annotation
        # TODO: Consider outputting multiple gene rank lists, 1 per DE annot
        de = explore_json["differentialExpression"][0]
        de_dict = {
            "clustering": de["cluster_name"],
            "annotation": {
                "name": de["annotation_name"],
                "scope": de["annotation_scope"]
            },
            "de_groups_and_files": de["select_options"]["one_vs_rest"]
        }
    except (KeyError, IndexError, TypeError) as e:
        # No DE available; use default clustering and annotation
        # TODO: Add downstream handling for study without DE
        raise ValueError(
            "Rank genes only currently supports studies that have one-vs-rest SCP-computed DE"
        ) from e
    return bucket_id, organism, de_dict
class RankGenes:
    """Orchestrates extract -> transform -> load for a study's gene ranks"""

    def __init__(
        self,
        study_accession: str,
        publication: str
    ):
        """
        :param study_accession Accession of a public SCP study, e.g. "SCP123"
        :param publication URL of the study's publicly-accessible research article, or GS URL or local path to publication text file
        """
        bucket, organism, de_dict = fetch_context(study_accession)
        clustering = de_dict["clustering"]
        annotation = de_dict["annotation"]
        # Key order matters: get_metainformation serializes meta in order
        meta = {
            "accession": study_accession,
            "organism": organism,
            "bucket": bucket,
            "clustering": clustering,
            "annotation": annotation
        }
        tsv_content = transform(
            extract(organism, publication, bucket, de_dict), meta
        )
        load(clustering, annotation, tsv_content, bucket)