Skip to content

Commit

Permalink
Merge pull request #31 from Knowledge-Graph-Hub/cat_assign
Browse files Browse the repository at this point in the history
Fix for node categories getting overwritten by those defined by edges
  • Loading branch information
caufieldjh authored Aug 18, 2023
2 parents 7829689 + eedf0e0 commit 5630f19
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 3 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "universalizer"
version = "0.0.7"
version = "0.0.8"
description = "Provides functions for knowledge graph cleanup and identifier normalization."
authors = ["caufieldjh <[email protected]>"]
license = "BSD-3"
Expand Down
4 changes: 4 additions & 0 deletions universalizer/categories.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
"""Categories more broadly used than those defined in SSSOM maps."""

# These categories aren't changed if they're already
# specified in the nodefile.
RETAINED_CAT_LIST = ["biolink:PhenotypicFeature"]

STY_TO_BIOLINK = {
"STY:T001": "biolink:OrganismTaxon",
"STY:T002": "biolink:OrganismTaxon",
Expand Down
11 changes: 9 additions & 2 deletions universalizer/norm.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from sssom.parsers import parse_sssom_table # type: ignore
from sssom.util import MappingSetDataFrame # type: ignore

from universalizer.categories import STY_TO_BIOLINK
from universalizer.categories import RETAINED_CAT_LIST, STY_TO_BIOLINK
from universalizer.oak_utils import get_cats_from_oak


Expand Down Expand Up @@ -332,13 +332,20 @@ def make_cat_maps(

# Examine edges, obtain biolink:category relations
# and those from UMLS semantic types (STY)
# These take precedence over nodelist category assignments
# These take precedence over nodelist category assignments,
# except when the subject node's existing category is listed in
# RETAINED_CAT_LIST; edges with such a subject node are skipped.
with open(input_edges, "r") as edgefile:
edgefile.readline()
for line in edgefile:
splitline = line.rstrip().split("\t")
edge_id = splitline[0]
subj_node_id = splitline[1]
if subj_node_id in id_and_cat_map:
# The membership check is needed because an edge may reference a node
# not mentioned in the nodefile, e.g. when the node ID needs correction.
if id_and_cat_map[subj_node_id] in RETAINED_CAT_LIST:
continue
pred = splitline[2]
obj_node_id = splitline[3]
if pred.lower() == "biolink:category":
Expand Down

0 comments on commit 5630f19

Please sign in to comment.