diff --git a/.github/workflows/minify_ontologies.yml b/.github/workflows/minify_ontologies.yml index fef5bf7e..8e878b5c 100644 --- a/.github/workflows/minify_ontologies.yml +++ b/.github/workflows/minify_ontologies.yml @@ -16,10 +16,10 @@ jobs: uses: actions/checkout@v4 - name: Copy and decompress ontologies in repo - run: cd ingest/validation; mkdir tmp; cp -r *.min.tsv.gz tmp/; gzip -d tmp/*.min.tsv.gz + run: cd ingest/validation/ontologies; mkdir tmp; cp -r *.min.tsv.gz tmp/; gzip -d tmp/*.min.tsv.gz - name: Minify newest ontologies - run: cd ingest/validation; python3 minify_ontologies.py; gzip -dkf *.min.tsv.gz + run: cd ingest/validation; python3 minify_ontologies.py; gzip -dkf ontologies/*.min.tsv.gz - name: Diff and commit changes run: | @@ -30,7 +30,7 @@ jobs: set +e # set -x # Enable debugging - cd ingest/validation + cd ingest/validation/ontologies # Define directories SOURCE_DIR="." @@ -76,6 +76,10 @@ jobs: done if [ "$CHANGES_DETECTED" = true ]; then + # Update version to signal downstream caches should update + echo "$(date +%s) # validation cache key" > version.txt + git add version.txt + # Configure Git git config --global user.name "github-actions" git config --global user.email "github-actions@github.com" diff --git a/ingest/validation/efo.min.tsv.gz b/ingest/validation/efo.min.tsv.gz deleted file mode 100644 index ae9e7e9d..00000000 Binary files a/ingest/validation/efo.min.tsv.gz and /dev/null differ diff --git a/ingest/validation/minify_ontologies.py b/ingest/validation/minify_ontologies.py index 54397518..4d377d7c 100644 --- a/ingest/validation/minify_ontologies.py +++ b/ingest/validation/minify_ontologies.py @@ -19,11 +19,15 @@ PATO_URL = 'https://github.com/pato-ontology/pato/raw/master/pato.json' NCBITAXON_URL = 'https://github.com/obophenotype/ncbitaxon/releases/latest/download/taxslim.json' EFO_URL = 'https://github.com/EBISPOT/efo/releases/latest/download/efo.json' +UBERON_URL = 'https://github.com/obophenotype/uberon/releases/latest/download/uberon.json' +CL_URL = 'https://github.com/obophenotype/cell-ontology/releases/latest/download/cl.json' ONTOLOGY_JSON_URLS = { 'disease': [MONDO_URL, PATO_URL], 'species': [NCBITAXON_URL], - 'library_preparation_protocol': [EFO_URL] + 'library_preparation_protocol': [EFO_URL], + 'organ': [UBERON_URL], + 'cell_type': [CL_URL] } def fetch(url, use_cache=True): @@ -41,7 +45,6 @@ def fetch(url, use_cache=True): content = f.read() return [content, filename] - def fetch_ontologies(ontology_json_urls, use_cache=True): """Retrieve ontology JSON and JSON filename for required ontology """ @@ -56,8 +59,7 @@ def fetch_ontologies(ontology_json_urls, use_cache=True): ontologies[annotation].append([ontology_json, filename]) return ontologies - -def get_synonyms(node): +def get_synonyms(node, label): """Get related and exact synonyms for an ontology node """ if 'meta' not in node or 'synonyms' not in node['meta']: @@ -69,8 +71,12 @@ def get_synonyms(node): if 'val' not in synonym_node: # Handles e.g. incomplete EFO synonym nodes continue - raw_synonyms.append(synonym_node['val']) - # print('raw_synonyms', raw_synonyms) + raw_synonym = synonym_node['val'] + if ( + not raw_synonym.startswith('obsolete ') and # Omit obsolete synonyms + raw_synonym != label # Omit synonyms that are redundant with label + ): + raw_synonyms.append(raw_synonym) synonyms = '||'.join(raw_synonyms) # Unambiguously delimit synonyms return synonyms @@ -88,17 +94,25 @@ def minify(ontology_json, filename): graph_nodes )) - nodes = list(map( - lambda n: '\t'.join( - [n['id'].split('/')[-1], n['lbl'], get_synonyms(n)] + all_nodes = list(map( + lambda n: ( + [n['id'].split('/')[-1], n['lbl'], get_synonyms(n, n['lbl'])] ), raw_nodes )) - tsv_content = '\n'.join(nodes) + # Remove obsolete labels + nodes = list(filter( + lambda n: not n[1].startswith('obsolete '), + all_nodes + )) + + tsv_content = '\n'.join( + map(lambda n: '\t'.join(n), nodes) + ) compressed_tsv_content = gzip.compress(tsv_content.encode()) - output_filename = f'{ontology_shortname}.min.tsv.gz' - with open(f'{ontology_shortname}.min.tsv.gz', 'wb') as f: + output_filename = f'ontologies/{ontology_shortname}.min.tsv.gz' + with open(output_filename, 'wb') as f: f.write(compressed_tsv_content) print(f'Wrote {output_filename}') diff --git a/ingest/validation/mondo.min.tsv.gz b/ingest/validation/mondo.min.tsv.gz deleted file mode 100644 index 46858468..00000000 Binary files a/ingest/validation/mondo.min.tsv.gz and /dev/null differ diff --git a/ingest/validation/ncbitaxon.min.tsv.gz b/ingest/validation/ncbitaxon.min.tsv.gz deleted file mode 100644 index 47fdc221..00000000 Binary files a/ingest/validation/ncbitaxon.min.tsv.gz and /dev/null differ diff --git a/ingest/validation/ontologies/cl.min.tsv.gz b/ingest/validation/ontologies/cl.min.tsv.gz new file mode 100644 index 00000000..b249bd8b Binary files /dev/null and b/ingest/validation/ontologies/cl.min.tsv.gz differ diff --git a/ingest/validation/ontologies/efo.min.tsv.gz b/ingest/validation/ontologies/efo.min.tsv.gz new file mode 100644 index 00000000..d21c1439 Binary files /dev/null and b/ingest/validation/ontologies/efo.min.tsv.gz differ diff --git a/ingest/validation/ontologies/mondo.min.tsv.gz b/ingest/validation/ontologies/mondo.min.tsv.gz new file mode 100644 index 00000000..867c58d1 Binary files /dev/null and b/ingest/validation/ontologies/mondo.min.tsv.gz differ diff --git a/ingest/validation/ontologies/ncbitaxon.min.tsv.gz b/ingest/validation/ontologies/ncbitaxon.min.tsv.gz new file mode 100644 index 00000000..213788ef Binary files /dev/null and b/ingest/validation/ontologies/ncbitaxon.min.tsv.gz differ diff --git a/ingest/validation/ontologies/pato.min.tsv.gz b/ingest/validation/ontologies/pato.min.tsv.gz new file mode 100644 index 00000000..5b55f361 Binary files /dev/null and b/ingest/validation/ontologies/pato.min.tsv.gz differ diff --git a/ingest/validation/ontologies/uberon.min.tsv.gz b/ingest/validation/ontologies/uberon.min.tsv.gz new file mode 100644 index 00000000..9f47cf36 Binary files /dev/null and b/ingest/validation/ontologies/uberon.min.tsv.gz differ diff --git a/ingest/validation/ontologies/version.txt b/ingest/validation/ontologies/version.txt new file mode 100644 index 00000000..0e02076d --- /dev/null +++ b/ingest/validation/ontologies/version.txt @@ -0,0 +1 @@ +1726164618 # validation cache key diff --git a/ingest/validation/pato.min.tsv.gz b/ingest/validation/pato.min.tsv.gz deleted file mode 100644 index 0ad06ea7..00000000 Binary files a/ingest/validation/pato.min.tsv.gz and /dev/null differ diff --git a/tests/test_minify_ontologies.py b/tests/test_minify_ontologies.py index 6712c2ed..995ba07b 100644 --- a/tests/test_minify_ontologies.py +++ b/tests/test_minify_ontologies.py @@ -15,24 +15,28 @@ class TestOntologyMinifier(unittest.TestCase): + def setup_method(self, test_method): + os.mkdir('ontologies') + def test_mondo_and_pato_minification(self): OntologyMinifier(['disease'], False) - files = glob.glob('*.tsv.gz') + files = glob.glob('ontologies/*.tsv.gz') self.assertEqual(len(files), 2, 'Did not find 2 TSV.GZ files') - with gzip.open('mondo.min.tsv.gz', 'rt') as f: + with gzip.open('ontologies/mondo.min.tsv.gz', 'rt') as f: first_line = f.readline().strip().split('\t') expected_first_line = [ 'MONDO_0000001', 'disease', - 'condition||disease||disease or disorder||disease or disorder, non-neoplastic||diseases||diseases and disorders||disorder||disorders||medical condition||other disease' + 'condition||disease or disorder||disease or disorder, non-neoplastic||diseases||diseases and disorders||disorder||disorders||medical condition||other disease' ] error_message = 'Did not get expected first line in mondo.min.tsv.gz' self.assertEqual(first_line, expected_first_line, error_message) def teardown_method(self, test_method): - output_files = glob.glob('*.min.tsv.gz') + output_files = glob.glob('ontologies/*.min.tsv.gz') for file in output_files: os.remove(file) + os.rmdir('ontologies')