Skip to content

Commit

Permalink
Merge pull request #365 from broadinstitute/ew-refine-mini-ontologies
Browse files Browse the repository at this point in the history
Refine ontology cache, for fast metadata validation (SCP-5790)
  • Loading branch information
eweitz authored Sep 13, 2024
2 parents 96979c6 + 39d90d3 commit 91f10c7
Show file tree
Hide file tree
Showing 14 changed files with 42 additions and 19 deletions.
10 changes: 7 additions & 3 deletions .github/workflows/minify_ontologies.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ jobs:
uses: actions/checkout@v4

- name: Copy and decompress ontologies in repo
run: cd ingest/validation; mkdir tmp; cp -r *.min.tsv.gz tmp/; gzip -d tmp/*.min.tsv.gz
run: cd ingest/validation/ontologies; mkdir tmp; cp -r *.min.tsv.gz tmp/; gzip -d tmp/*.min.tsv.gz

- name: Minify newest ontologies
run: cd ingest/validation; python3 minify_ontologies.py; gzip -dkf *.min.tsv.gz
run: cd ingest/validation; python3 minify_ontologies.py; gzip -dkf ontologies/*.min.tsv.gz

- name: Diff and commit changes
run: |
Expand All @@ -30,7 +30,7 @@ jobs:
set +e
# set -x # Enable debugging
cd ingest/validation
cd ingest/validation/ontologies
# Define directories
SOURCE_DIR="."
Expand Down Expand Up @@ -76,6 +76,10 @@ jobs:
done
if [ "$CHANGES_DETECTED" = true ]; then
# Update version to signal downstream caches should update
echo "$(date +%s) # validation cache key" > version.txt
git add version.txt
# Configure Git
git config --global user.name "github-actions"
git config --global user.email "github-actions@github.com"
Expand Down
Binary file removed ingest/validation/efo.min.tsv.gz
Binary file not shown.
38 changes: 26 additions & 12 deletions ingest/validation/minify_ontologies.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,15 @@
PATO_URL = 'https://github.com/pato-ontology/pato/raw/master/pato.json'
NCBITAXON_URL = 'https://github.com/obophenotype/ncbitaxon/releases/latest/download/taxslim.json'
EFO_URL = 'https://github.com/EBISPOT/efo/releases/latest/download/efo.json'
UBERON_URL = 'https://github.com/obophenotype/uberon/releases/latest/download/uberon.json'
CL_URL = 'https://github.com/obophenotype/cell-ontology/releases/latest/download/cl.json'

ONTOLOGY_JSON_URLS = {
'disease': [MONDO_URL, PATO_URL],
'species': [NCBITAXON_URL],
'library_preparation_protocol': [EFO_URL]
'library_preparation_protocol': [EFO_URL],
'organ': [UBERON_URL],
'cell_type': [CL_URL]
}

def fetch(url, use_cache=True):
Expand All @@ -41,7 +45,6 @@ def fetch(url, use_cache=True):
content = f.read()
return [content, filename]


def fetch_ontologies(ontology_json_urls, use_cache=True):
"""Retrieve ontology JSON and JSON filename for required ontology
"""
Expand All @@ -56,8 +59,7 @@ def fetch_ontologies(ontology_json_urls, use_cache=True):
ontologies[annotation].append([ontology_json, filename])
return ontologies


def get_synonyms(node):
def get_synonyms(node, label):
"""Get related and exact synonyms for an ontology node
"""
if 'meta' not in node or 'synonyms' not in node['meta']:
Expand All @@ -69,8 +71,12 @@ def get_synonyms(node):
if 'val' not in synonym_node:
# Handles e.g. incomplete EFO synonym nodes
continue
raw_synonyms.append(synonym_node['val'])
# print('raw_synonyms', raw_synonyms)
raw_synonym = synonym_node['val']
if (
not raw_synonym.startswith('obsolete ') and # Omit obsolete synonyms
raw_synonym != label # Omit synonyms that are redundant with label
):
raw_synonyms.append(raw_synonym)
synonyms = '||'.join(raw_synonyms) # Unambiguously delimit synonyms
return synonyms

Expand All @@ -88,17 +94,25 @@ def minify(ontology_json, filename):
graph_nodes
))

nodes = list(map(
lambda n: '\t'.join(
[n['id'].split('/')[-1], n['lbl'], get_synonyms(n)]
all_nodes = list(map(
lambda n: (
[n['id'].split('/')[-1], n['lbl'], get_synonyms(n, n['lbl'])]
), raw_nodes
))

tsv_content = '\n'.join(nodes)
# Remove obsolete labels
nodes = list(filter(
lambda n: not n[1].startswith('obsolete '),
all_nodes
))

tsv_content = '\n'.join(
map(lambda n: '\t'.join(n), nodes)
)
compressed_tsv_content = gzip.compress(tsv_content.encode())

output_filename = f'{ontology_shortname}.min.tsv.gz'
with open(f'{ontology_shortname}.min.tsv.gz', 'wb') as f:
output_filename = f'ontologies/{ontology_shortname}.min.tsv.gz'
with open(output_filename, 'wb') as f:
f.write(compressed_tsv_content)
print(f'Wrote {output_filename}')

Expand Down
Binary file removed ingest/validation/mondo.min.tsv.gz
Binary file not shown.
Binary file removed ingest/validation/ncbitaxon.min.tsv.gz
Binary file not shown.
Binary file added ingest/validation/ontologies/cl.min.tsv.gz
Binary file not shown.
Binary file added ingest/validation/ontologies/efo.min.tsv.gz
Binary file not shown.
Binary file added ingest/validation/ontologies/mondo.min.tsv.gz
Binary file not shown.
Binary file added ingest/validation/ontologies/ncbitaxon.min.tsv.gz
Binary file not shown.
Binary file added ingest/validation/ontologies/pato.min.tsv.gz
Binary file not shown.
Binary file added ingest/validation/ontologies/uberon.min.tsv.gz
Binary file not shown.
1 change: 1 addition & 0 deletions ingest/validation/ontologies/version.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1726164618 # validation cache key
Binary file removed ingest/validation/pato.min.tsv.gz
Binary file not shown.
12 changes: 8 additions & 4 deletions tests/test_minify_ontologies.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,24 +15,28 @@

class TestOntologyMinifier(unittest.TestCase):

def setup_method(self, test_method):
os.mkdir('ontologies')

def test_mondo_and_pato_minification(self):
OntologyMinifier(['disease'], False)
files = glob.glob('*.tsv.gz')
files = glob.glob('ontologies/*.tsv.gz')
self.assertEqual(len(files), 2, 'Did not find 2 TSV.GZ files')
with gzip.open('mondo.min.tsv.gz', 'rt') as f:
with gzip.open('ontologies/mondo.min.tsv.gz', 'rt') as f:
first_line = f.readline().strip().split('\t')
expected_first_line = [
'MONDO_0000001',
'disease',
'condition||disease||disease or disorder||disease or disorder, non-neoplastic||diseases||diseases and disorders||disorder||disorders||medical condition||other disease'
'condition||disease or disorder||disease or disorder, non-neoplastic||diseases||diseases and disorders||disorder||disorders||medical condition||other disease'
]
error_message = 'Did not get expected first line in mondo.min.tsv.gz'
self.assertEqual(first_line, expected_first_line, error_message)

def teardown_method(self, test_method):
output_files = glob.glob('*.min.tsv.gz')
output_files = glob.glob('ontologies/*.min.tsv.gz')
for file in output_files:
os.remove(file)
os.rmdir('ontologies')



0 comments on commit 91f10c7

Please sign in to comment.