Skip to content

Commit

Permalink
Merge pull request #372 from broadinstitute/jlc_show_gene_name
Browse files Browse the repository at this point in the history
Show gene name in DE table for CELLxGENE-style AnnData files (SCP-5859)
  • Loading branch information
jlchang authored Dec 3, 2024
2 parents 00109a5 + bfb9260 commit 9652c86
Show file tree
Hide file tree
Showing 13 changed files with 33,431 additions and 36 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/minify_ontologies.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ on:
types: [opened] # Only trigger on PR "opened" event
# push: # Uncomment, update branches to develop / debug
# branches:
# jb-metadata-boolean
# jlc_show_gene_name

jobs:
build:
Expand Down
61 changes: 47 additions & 14 deletions ingest/de.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import scanpy as sc
import re
import glob
from anndata import AnnData

try:
from monitor import setup_logger, log_exception
Expand Down Expand Up @@ -171,7 +172,7 @@ def subset_adata(adata, de_cells):
"subsetting matrix on cells in clustering"
)
matrix_subset_list = np.in1d(adata.obs_names, de_cells)
adata = adata[matrix_subset_list]
adata = adata[matrix_subset_list].copy()
return adata

def execute_de(self):
Expand Down Expand Up @@ -219,8 +220,10 @@ def execute_de(self):
msg,
)
raise ValueError(msg)
except:
raise
except Exception as e:
print(e)



@staticmethod
def get_genes(genes_path):
Expand All @@ -241,7 +244,7 @@ def get_genes(genes_path):
"dev_info: Features file contains duplicate identifiers in column 2"
)
print(warning)
genes_df['new_id'] = genes_df[[0, 1]].agg('|'.join, axis=1)
genes_df['new_id'] = genes_df[[1, 0]].agg('|'.join, axis=1)
if genes_df['new_id'].count() != genes_df['new_id'].nunique():
msg = "Duplicates in features file even after joining gene_id and gene_name"
log_exception(
Expand Down Expand Up @@ -306,10 +309,16 @@ def delimiter_in_gene_name(rank):
@staticmethod
def extract_gene_id_for_out_file(rank):
"""Separate out gene name from gene ID"""
rank['gene_id'] = rank['names'].str.split('|').str[0]
rank['names'] = rank['names'].str.split('|').str[1]
rank['feature_id'] = rank['names'].str.split('|').str[1]
rank['names'] = rank['names'].str.split('|').str[0]

return rank

@staticmethod
def all_match_ensembl_id_regex(adata, col_name):
regex = r'ENS[A-Z]{0,4}\d{11}'
return adata[col_name].astype(str).str.match(regex).all()

@staticmethod
def run_scanpy_de(
cluster,
Expand Down Expand Up @@ -340,23 +349,40 @@ def run_scanpy_de(
elif matrix_file_type == "h5ad":
matrix_object = IngestFiles(matrix_file_path, None)
local_file_path = matrix_object.resolve_path(matrix_file_path)[1]
adata = matrix_object.open_anndata(local_file_path)
orig_adata = matrix_object.open_anndata(local_file_path)
else:
# MTX reconstitution UNTESTED (SCP-4203)
# will want try/except here to catch failed data object composition
adata = DifferentialExpression.adata_from_mtx(
matrix_file_path, genes_path, barcodes_path
)

if not matrix_file_type == "h5ad":
if matrix_file_type == "h5ad":
if orig_adata.raw is not None:
adata = AnnData(
# using .copy() for the AnnData components is good practice
# but we won't be using orig_adata for analyses
# choosing to avoid .copy() for memory efficiency
X = orig_adata.raw.X,
obs = orig_adata.obs,
var = orig_adata.var,
)
else:
msg = f'{matrix_file_path} does not have a .raw attribute'
print(msg)
log_exception(
DifferentialExpression.dev_logger, DifferentialExpression.de_logger, msg
)
raise ValueError(msg)
# AnnData expects gene x cell so dense and mtx matrices require transposition
else:
adata = adata.transpose()

use_raw = True if matrix_file_type == "h5ad" else False

adata = DifferentialExpression.subset_adata(adata, de_cells)

# will need try/except (SCP-4205)
adata.obs = DifferentialExpression.order_annots(de_annots, adata.obs_names)
# h5ad inputs will already have obs data, only non-h5ad need this step
if not matrix_file_type == "h5ad":
adata.obs = DifferentialExpression.order_annots(de_annots, adata.obs_names)

adata = DifferentialExpression.remove_single_sample_data(adata, annotation)

Expand All @@ -369,7 +395,6 @@ def run_scanpy_de(
adata,
annotation,
key_added=rank_key,
use_raw=use_raw,
method=method,
pts=True,
)
Expand All @@ -393,13 +418,21 @@ def run_scanpy_de(
for group in groups:
clean_group = DifferentialExpression.sanitize_string(group)
clean_annotation = DifferentialExpression.sanitize_string(annotation)
DifferentialExpression.de_logger.info(f"Writing DE output for {group}")
rank = sc.get.rank_genes_groups_df(adata, key=rank_key, group=group)
if DifferentialExpression.all_match_ensembl_id_regex(rank, "names"):
feature_name_rank = rank.merge(adata.var['feature_name'], left_on="names", right_index=True, how='left')
feature_name_rank.rename(columns={'names':'feature_id'}, inplace=True)
feature_name_rank.rename(columns={'feature_name':'names'}, inplace=True)
new_column_order = ["names"] + [col for col in feature_name_rank.columns if col not in {"names", "feature_id"}] + ["feature_id"]
feature_name_rank = feature_name_rank[new_column_order]

rank = feature_name_rank
if DifferentialExpression.delimiter_in_gene_name(rank):
DifferentialExpression.extract_gene_id_for_out_file(rank)
out_file = f'{cluster_name}--{clean_annotation}--{clean_group}--{annot_scope}--{method}.tsv'
# Round numbers to 4 significant digits while respecting fixed point
# and scientific notation (note: trailing zeros are removed)
DifferentialExpression.de_logger.info(f"Writing DE output for {clean_group}")
rank.to_csv(out_file, sep='\t', float_format='%.4g')

# Provide h5ad of DE analysis as reference computable object
Expand Down
Binary file modified ingest/validation/ontologies/efo.min.tsv.gz
Binary file not shown.
Binary file modified ingest/validation/ontologies/mondo.min.tsv.gz
Binary file not shown.
Binary file modified ingest/validation/ontologies/ncbitaxon.min.tsv.gz
Binary file not shown.
Binary file modified ingest/validation/ontologies/uberon.min.tsv.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion ingest/validation/ontologies/version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1729700083 # validation cache key
1733232455 # validation cache key
Binary file added tests/data/anndata/compliant_liver.h5ad
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading

0 comments on commit 9652c86

Please sign in to comment.