Merge pull request #372 from broadinstitute/jlc_show_gene_name

Show gene name in DE table for CELLxGENE-style AnnData files (SCP-5859)
broadinstitute · Dec 3, 2024 · 9652c86 · 9652c86
2 parents 00109a5 + bfb9260
commit 9652c86
Show file tree

Hide file tree

Showing 13 changed files with 33,431 additions and 36 deletions.
diff --git a/.github/workflows/minify_ontologies.yml b/.github/workflows/minify_ontologies.yml
@@ -5,7 +5,7 @@ on:
     types: [opened]  # Only trigger on PR "opened" event
 #  push: # Uncomment, update branches to develop / debug
 #    branches:
-#      jb-metadata-boolean
+#      jlc_show_gene_name 
 
 jobs:
   build:

diff --git a/ingest/de.py b/ingest/de.py
@@ -4,6 +4,7 @@
 import scanpy as sc
 import re
 import glob
+from anndata import AnnData
 
 try:
     from monitor import setup_logger, log_exception
@@ -171,7 +172,7 @@ def subset_adata(adata, de_cells):
             "subsetting matrix on cells in clustering"
         )
         matrix_subset_list = np.in1d(adata.obs_names, de_cells)
-        adata = adata[matrix_subset_list]
+        adata = adata[matrix_subset_list].copy()
         return adata
 
     def execute_de(self):
@@ -219,8 +220,10 @@ def execute_de(self):
                     msg,
                 )
                 raise ValueError(msg)
-        except:
-            raise
+        except Exception as e:
+            print(e)
+
+
 
     @staticmethod
     def get_genes(genes_path):
@@ -241,7 +244,7 @@ def get_genes(genes_path):
                     "dev_info: Features file contains duplicate identifiers in column 2"
                 )
                 print(warning)
-                genes_df['new_id'] = genes_df[[0, 1]].agg('|'.join, axis=1)
+                genes_df['new_id'] = genes_df[[1, 0]].agg('|'.join, axis=1)
                 if genes_df['new_id'].count() != genes_df['new_id'].nunique():
                     msg = "Duplicates in features file even after joining gene_id and gene_name"
                     log_exception(
@@ -306,10 +309,16 @@ def delimiter_in_gene_name(rank):
     @staticmethod
     def extract_gene_id_for_out_file(rank):
         """Separate out gene name from gene ID"""
-        rank['gene_id'] = rank['names'].str.split('|').str[0]
-        rank['names'] = rank['names'].str.split('|').str[1]
+        rank['feature_id'] = rank['names'].str.split('|').str[1]
+        rank['names'] = rank['names'].str.split('|').str[0]
+
         return rank
 
+    @staticmethod
+    def all_match_ensembl_id_regex(adata, col_name):
+        regex = r'ENS[A-Z]{0,4}\d{11}'
+        return adata[col_name].astype(str).str.match(regex).all()
+
     @staticmethod
     def run_scanpy_de(
         cluster,
@@ -340,23 +349,40 @@ def run_scanpy_de(
         elif matrix_file_type == "h5ad":
             matrix_object = IngestFiles(matrix_file_path, None)
             local_file_path = matrix_object.resolve_path(matrix_file_path)[1]
-            adata = matrix_object.open_anndata(local_file_path)
+            orig_adata = matrix_object.open_anndata(local_file_path)
         else:
             # MTX reconstitution UNTESTED (SCP-4203)
             # will want try/except here to catch failed data object composition
             adata = DifferentialExpression.adata_from_mtx(
                 matrix_file_path, genes_path, barcodes_path
             )
 
-        if not matrix_file_type == "h5ad":
+        if matrix_file_type == "h5ad":
+            if orig_adata.raw is not None:
+                adata = AnnData(
+                    # using .copy() for the AnnData components is good practice
+                    # but we won't be using orig_adata for analyses
+                    # choosing to avoid .copy() for memory efficiency
+                    X = orig_adata.raw.X,
+                    obs = orig_adata.obs,
+                    var = orig_adata.var,
+                )
+            else:
+                msg = f'{matrix_file_path} does not have a .raw attribute'
+                print(msg)
+                log_exception(
+                    DifferentialExpression.dev_logger, DifferentialExpression.de_logger, msg
+                )
+                raise ValueError(msg)
+        # AnnData expects gene x cell so dense and mtx matrices require transposition
+        else:
             adata = adata.transpose()
 
-        use_raw = True if matrix_file_type == "h5ad" else False
-
         adata = DifferentialExpression.subset_adata(adata, de_cells)
 
-        # will need try/except (SCP-4205)
-        adata.obs = DifferentialExpression.order_annots(de_annots, adata.obs_names)
+        # h5ad inputs will already have obs data, only non-h5ad need this step
+        if not matrix_file_type == "h5ad":
+            adata.obs = DifferentialExpression.order_annots(de_annots, adata.obs_names)
 
         adata = DifferentialExpression.remove_single_sample_data(adata, annotation)
 
@@ -369,7 +395,6 @@ def run_scanpy_de(
                 adata,
                 annotation,
                 key_added=rank_key,
-                use_raw=use_raw,
                 method=method,
                 pts=True,
             )
@@ -393,13 +418,21 @@ def run_scanpy_de(
         for group in groups:
             clean_group = DifferentialExpression.sanitize_string(group)
             clean_annotation = DifferentialExpression.sanitize_string(annotation)
-            DifferentialExpression.de_logger.info(f"Writing DE output for {group}")
             rank = sc.get.rank_genes_groups_df(adata, key=rank_key, group=group)
+            if DifferentialExpression.all_match_ensembl_id_regex(rank, "names"):
+                feature_name_rank = rank.merge(adata.var['feature_name'], left_on="names", right_index=True, how='left')
+                feature_name_rank.rename(columns={'names':'feature_id'}, inplace=True)
+                feature_name_rank.rename(columns={'feature_name':'names'}, inplace=True)
+                new_column_order = ["names"] + [col for col in feature_name_rank.columns if col not in {"names", "feature_id"}] + ["feature_id"]
+                feature_name_rank = feature_name_rank[new_column_order]
+
+                rank = feature_name_rank
             if DifferentialExpression.delimiter_in_gene_name(rank):
                 DifferentialExpression.extract_gene_id_for_out_file(rank)
             out_file = f'{cluster_name}--{clean_annotation}--{clean_group}--{annot_scope}--{method}.tsv'
             # Round numbers to 4 significant digits while respecting fixed point
             # and scientific notation (note: trailing zeros are removed)
+            DifferentialExpression.de_logger.info(f"Writing DE output for {clean_group}")
             rank.to_csv(out_file, sep='\t', float_format='%.4g')
 
         # Provide h5ad of DE analysis as reference computable object

diff --git a/ingest/validation/ontologies/efo.min.tsv.gz b/ingest/validation/ontologies/efo.min.tsv.gz
diff --git a/ingest/validation/ontologies/mondo.min.tsv.gz b/ingest/validation/ontologies/mondo.min.tsv.gz
diff --git a/ingest/validation/ontologies/ncbitaxon.min.tsv.gz b/ingest/validation/ontologies/ncbitaxon.min.tsv.gz
diff --git a/ingest/validation/ontologies/uberon.min.tsv.gz b/ingest/validation/ontologies/uberon.min.tsv.gz
diff --git a/ingest/validation/ontologies/version.txt b/ingest/validation/ontologies/version.txt
@@ -1 +1 @@
-1729700083 # validation cache key
+1733232455 # validation cache key
diff --git a/tests/data/anndata/compliant_liver.h5ad b/tests/data/anndata/compliant_liver.h5ad
diff --git a/tests/data/anndata/compliant_liver_h5ad_frag.cluster.X_umap.tsv.gz b/tests/data/anndata/compliant_liver_h5ad_frag.cluster.X_umap.tsv.gz
diff --git a/tests/data/anndata/compliant_liver_h5ad_frag.metadata.tsv.gz b/tests/data/anndata/compliant_liver_h5ad_frag.metadata.tsv.gz
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		1729700083 # validation cache key
		1733232455 # validation cache key