diff --git a/.github/workflows/minify_ontologies.yml b/.github/workflows/minify_ontologies.yml index 99554502..1c5bc132 100644 --- a/.github/workflows/minify_ontologies.yml +++ b/.github/workflows/minify_ontologies.yml @@ -3,9 +3,9 @@ name: Minify ontologies on: pull_request: types: [opened] # Only trigger on PR "opened" event -# push: # Uncomment, update branches to develop / debug -# branches: -# jlc_show_gene_name +# push: # Uncomment, update branches to develop / debug +# branches: +# jlc_show_de_pairwise jobs: build: diff --git a/ingest/anndata_.py b/ingest/anndata_.py index 6038be5b..5eeba512 100644 --- a/ingest/anndata_.py +++ b/ingest/anndata_.py @@ -6,6 +6,7 @@ import scipy from scipy.io.mmio import MMFile + # scipy.io.mmwrite uses scientific notation by default # https://stackoverflow.com/questions/64748513 class MMFileFixedFormat(MMFile): @@ -72,7 +73,7 @@ def create_cell_data_arrays(self): linear_data_id=self.study_file_id, cluster_name=raw_filename, study_file_id=self.study_file_id, - study_id=self.study_id + study_id=self.study_id, ): data_arrays.append(data_array) diff --git a/ingest/cell_metadata.py b/ingest/cell_metadata.py index b6ce18f5..bca427d9 100644 --- a/ingest/cell_metadata.py +++ b/ingest/cell_metadata.py @@ -7,6 +7,7 @@ PREREQUISITES Must have python 3.6 or higher. """ + import collections import ntpath from collections import defaultdict, OrderedDict @@ -115,6 +116,7 @@ def validate_header_for_coordinate_values(self): "error", msg, "format:cap:metadata-no-coordinates" ) return False + @staticmethod def make_multiindex_name(modality): """From modality, generate column name in multi-index format""" @@ -127,7 +129,9 @@ def create_boolean_modality_metadatum(df, modality): """Translate presence of single modality to boolean for BigQuery""" # check for empty cells (aka. nan) or empty strings modality_multiindex = CellMetadata.make_multiindex_name(modality) - no_modality_info = df[modality_multiindex].isna() | df[modality_multiindex].str.len().eq(0) + no_modality_info = df[modality_multiindex].isna() | df[ + modality_multiindex + ].str.len().eq(0) bool_name = modality + "_bool" bool_multiindex = CellMetadata.make_multiindex_name(bool_name) # store inverse of no_modality_info (ie. 
True = has modality info) @@ -145,12 +149,12 @@ def hide_modality_metadatum(self): m_to_rename[bool_name] = has_m self.modality_urls = self.file.filter(m_to_hide, axis=1) self.file.drop(m_to_hide, axis=1, inplace=True) - self.file.rename(columns= m_to_rename, inplace=True) + self.file.rename(columns=m_to_rename, inplace=True) return def booleanize_modality_metadata(self): """Translate presence of modality data to boolean for BigQuery - If no modality data, self.files is unchanged + If no modality data, self.files is unchanged """ if self.modalities is not None: df = copy.deepcopy(self.file) diff --git a/ingest/cli_parser.py b/ingest/cli_parser.py index f9373fb6..f6675abd 100644 --- a/ingest/cli_parser.py +++ b/ingest/cli_parser.py @@ -274,6 +274,13 @@ def create_parser(): help="Indicates that differential expression analysis should be invoked", ) + parser_differential_expression.add_argument( + "--de-type", + default="rest", + choices=['rest', 'pairwise'], + help="Accepted values: 'pairwise' or 'rest' (default)", + ) + parser_differential_expression.add_argument( "--study-accession", required=True, @@ -336,6 +343,17 @@ def create_parser(): "--gene-file", help="Path to .genes.tsv file" ) + # For pairwise analyses + parser_differential_expression.add_argument( + "--group1", + help="1st annotation label to use for pairwise DE analysis", + ) + + parser_differential_expression.add_argument( + "--group2", + help="2nd annotation label to use for pairwise DE analysis", + ) + parser_ingest_differential_expression = subparsers.add_parser( "ingest_differential_expression", help="Indicates author differential expression analysis processing", @@ -377,40 +395,38 @@ def create_parser(): parser_ingest_differential_expression.add_argument( "--differential-expression-file", required=True, - help="Path to DE file uploaded by author." + help="Path to DE file uploaded by author.", ) parser_ingest_differential_expression.add_argument( "--gene-header", required=True, - help="Header used for gene names / symbols in DE file" + help="Header used for gene names / symbols in DE file", ) parser_ingest_differential_expression.add_argument( - "--group-header", - required=True, - help="Header used for group in DE file" + "--group-header", required=True, help="Header used for group in DE file" ) parser_ingest_differential_expression.add_argument( "--comparison-group-header", required=False, help=( - "Header used for comparison group in DE file. " + - "For pairwise comparisons. Can omit if DE file is in one-vs-rest-only format." - ) + "Header used for comparison group in DE file. " + + "For pairwise comparisons. Can omit if DE file is in one-vs-rest-only format." + ), ) parser_ingest_differential_expression.add_argument( "--size-metric", required=True, - help='Header used as size metric in DE file, e.g. "logfoldchanges", "avg_log2FC", etc.' + help='Header used as size metric in DE file, e.g. "logfoldchanges", "avg_log2FC", etc.', ) parser_ingest_differential_expression.add_argument( "--significance-metric", required=True, - help='Header used as significance metric in DE file, e.g. "pvals_adj", "p_val_adj", etc.' + help='Header used as significance metric in DE file, e.g. 
"pvals_adj", "p_val_adj", etc.', ) # AnnData subparsers @@ -493,7 +509,7 @@ def create_parser(): parser_rank_genes.add_argument( '--publication', help="URL of the study's publicly-accessible research article, or GS URL or local path to publication text file", - required=True + required=True, ) return parser diff --git a/ingest/clusters.py b/ingest/clusters.py index 2289669f..6868dc4b 100644 --- a/ingest/clusters.py +++ b/ingest/clusters.py @@ -136,9 +136,11 @@ def transform(self): { "name": annot_name, "type": annot_type, - "values": self.file[annot_headers].unique().tolist() - if annot_type == "group" - else [], + "values": ( + self.file[annot_headers].unique().tolist() + if annot_type == "group" + else [] + ), } ) Annotations.dev_logger.info(f"Creating model for {self.study_id}") @@ -148,9 +150,11 @@ def transform(self): cell_annotations=cell_annotations, study_file_id=self.study_file_id, study_id=self.study_id, - domain_ranges=DomainRanges(**self.domain_ranges) - if self.domain_ranges is not None - else None, + domain_ranges=( + DomainRanges(**self.domain_ranges) + if self.domain_ranges is not None + else None + ), ) def get_data_array_annot(self, linear_data_id): diff --git a/ingest/de.py b/ingest/de.py index 0ee8a351..b214eab0 100644 --- a/ingest/de.py +++ b/ingest/de.py @@ -59,7 +59,7 @@ def __init__( self.barcodes_path = self.kwargs["barcode_file"] @staticmethod - def assess_annotation(annotation, metadata): + def assess_annotation(annotation, metadata, extra_params): """Check that annotation for DE is not of TYPE numeric""" dtype_info = dict(zip(metadata.headers, metadata.annot_types)) annotation_info = dtype_info.get(annotation, None) @@ -78,8 +78,39 @@ def assess_annotation(annotation, metadata): ) raise KeyError(msg) elif annotation_info == "group": - # DE annotations should be of TYPE group - return None + de_type = extra_params.get("de_type") + group1 = extra_params.get("group1") + group2 = extra_params.get("group2") + if de_type == "pairwise": + metadata.preprocess(False) + values = metadata.file[annotation]['group'].unique() + if group1 and group2: + if group1 not in values or group2 not in values: + msg = ( + f'Provided annotation value(s) group1, \"{group1}\", ' + f'or group2, \"{group2}\", not found in metadata file' + ) + log_exception( + DifferentialExpression.dev_logger, + DifferentialExpression.de_logger, + msg, + ) + raise ValueError(msg) + else: + msg = ( + f'Provided annotation value(s) group1, \"{group1}\", or ' + f'group2, \"{group2}\", were false-y' + ) + log_exception( + DifferentialExpression.dev_logger, + DifferentialExpression.de_logger, + msg, + ) + raise KeyError(msg) + # de_type is "rest" by default, DE annotation TYPE expected to be group + else: + # assessment does not raise error + return None else: msg = f"Error: \"{annotation}\" has unexpected type \"{annotation_info}\"." 
print(msg) @@ -176,40 +207,27 @@ def subset_adata(adata, de_cells): return adata def execute_de(self): - print(f'dev_info: Starting DE for {self.accession}') + DifferentialExpression.de_logger.info( + f'dev_info: Starting DE for {self.accession}' + ) try: # only used in output filename, replacing non-alphanumeric with underscores # except '+' replaced with 'pos' self.cluster_name = DifferentialExpression.sanitize_string( self.kwargs["name"] ) - if self.matrix_file_type == "mtx": - DifferentialExpression.de_logger.info("preparing DE on sparse matrix") - self.run_scanpy_de( - self.cluster, - self.metadata, - self.matrix_file_path, - self.matrix_file_type, - self.annotation, - self.annot_scope, - self.accession, - self.cluster_name, - self.method, - self.genes_path, - self.barcodes_path, + if self.matrix_file_type in ["dense", "mtx", "h5ad"]: + DifferentialExpression.de_logger.info( + f"preparing DE on {self.matrix_file_type} matrix" ) - elif self.matrix_file_type == "dense" or self.matrix_file_type == "h5ad": - DifferentialExpression.de_logger.info(f"preparing DE on {self.matrix_file_type} matrix") self.run_scanpy_de( self.cluster, self.metadata, self.matrix_file_path, self.matrix_file_type, self.annotation, - self.annot_scope, - self.accession, self.cluster_name, - self.method, + self.kwargs, ) else: msg = f"Submitted matrix_file_type should be \"dense\", \"mtx\" or \"h5ad\" not \"{self.matrix_file_type}\"" @@ -220,10 +238,9 @@ msg, ) raise ValueError(msg) - except Exception as e: - print(e) - - + except (TypeError, KeyError, ValueError) as e: + raise ValueError(e) + return @staticmethod def get_genes(genes_path): @@ -319,6 +336,61 @@ def all_match_ensembl_id_regex(adata, col_name): regex = r'ENS[A-Z]{0,4}\d{11}' return adata[col_name].astype(str).str.match(regex).all() + @staticmethod + def write_de_result(adata, group, annotation, rank_key, cluster_name, extra_params): + de_type = extra_params.get("de_type") + method = extra_params.get("method") + annot_scope = extra_params.get("annotation_scope") + clean_annotation = DifferentialExpression.sanitize_string(annotation) + rank = sc.get.rank_genes_groups_df(adata, key=rank_key, group=group) + if DifferentialExpression.all_match_ensembl_id_regex(rank, "names"): + feature_name_rank = rank.merge( + adata.var['feature_name'], left_on="names", right_index=True, how='left' + ) + feature_name_rank.rename(columns={'names': 'feature_id'}, inplace=True) + feature_name_rank.rename(columns={'feature_name': 'names'}, inplace=True) + new_column_order = ( + ["names"] + + [ + col + for col in feature_name_rank.columns + if col not in {"names", "feature_id"} + ] + + ["feature_id"] + ) + feature_name_rank = feature_name_rank[new_column_order] + + rank = feature_name_rank + if DifferentialExpression.delimiter_in_gene_name(rank): + DifferentialExpression.extract_gene_id_for_out_file(rank) + if de_type == "rest": + clean_group = DifferentialExpression.sanitize_string(group) + out_file = f'{cluster_name}--{clean_annotation}--{clean_group}--{annot_scope}--{method}.tsv' + DifferentialExpression.de_logger.info( + f"Writing DE output for {clean_group} vs rest" + ) + elif de_type == "pairwise": + # rank_genes_groups accepts a list. 
For SCP pairwise, should be a list with one item + # converting list to string for incorporation into result filename + group1 = DifferentialExpression.sanitize_string( + ''.join(extra_params["group1"]) + ) + group2 = DifferentialExpression.sanitize_string(extra_params["group2"]) + out_file = f'{cluster_name}--{clean_annotation}--{group1}--{group2}--{annot_scope}--{method}.tsv' + DifferentialExpression.de_logger.info( + f"Writing DE output for {group1} vs {group2} pairwise" + ) + else: + msg = f'Unknown de_type, {de_type}' + print(msg) + log_exception( + DifferentialExpression.dev_logger, DifferentialExpression.de_logger, msg + ) + raise ValueError(msg) + # Round numbers to 4 significant digits while respecting fixed point + # and scientific notation (note: trailing zeros are removed) + rank.to_csv(out_file, sep='\t', float_format='%.4g') + @staticmethod def run_scanpy_de( cluster, @@ -326,17 +398,16 @@ def run_scanpy_de( matrix_file_path, matrix_file_type, annotation, - annot_scope, - study_accession, cluster_name, - method, - genes_path=None, - barcodes_path=None, + extra_params, ): + method = extra_params.get("method") + de_type = extra_params.get("de_type") + try: - DifferentialExpression.assess_annotation(annotation, metadata) + DifferentialExpression.assess_annotation(annotation, metadata, extra_params) except (TypeError, KeyError, ValueError) as e: - raise + raise ValueError(e) de_cells = DifferentialExpression.get_cluster_cells(cluster.file['NAME'].values) de_annots = DifferentialExpression.subset_annots(metadata, de_cells) @@ -349,10 +420,13 @@ def run_scanpy_de( elif matrix_file_type == "h5ad": matrix_object = IngestFiles(matrix_file_path, None) local_file_path = matrix_object.resolve_path(matrix_file_path)[1] - orig_adata = matrix_object.open_anndata(local_file_path) + # orig_adata = matrix_object.open_anndata(local_file_path) + orig_adata = sc.read_h5ad(local_file_path) else: # MTX reconstitution UNTESTED (SCP-4203) # will want try/except here to catch failed data object composition + genes_path = extra_params.get("gene_file") + barcodes_path = extra_params.get("barcode_file") adata = DifferentialExpression.adata_from_mtx( matrix_file_path, genes_path, barcodes_path ) @@ -363,15 +437,17 @@ def run_scanpy_de( # using .copy() for the AnnData components is good practice # but we won't be using orig_adata for analyses # choosing to avoid .copy() for memory efficiency - X = orig_adata.raw.X, - obs = orig_adata.obs, - var = orig_adata.var, + X=orig_adata.raw.X, + obs=orig_adata.obs, + var=orig_adata.var, ) else: msg = f'{matrix_file_path} does not have a .raw attribute' print(msg) log_exception( - DifferentialExpression.dev_logger, DifferentialExpression.de_logger, msg + DifferentialExpression.dev_logger, + DifferentialExpression.de_logger, + msg, ) raise ValueError(msg) # AnnData expects gene x cell so dense and mtx matrices require transposition @@ -390,55 +466,81 @@ def run_scanpy_de( sc.pp.log1p(adata) DifferentialExpression.de_logger.info("calculating DE") rank_key = f"rank.{annotation}.{method}" - try: - sc.tl.rank_genes_groups( + if de_type == "rest": + try: + sc.tl.rank_genes_groups( + adata, + annotation, + key_added=rank_key, + method=method, + pts=True, + ) + except KeyError: + msg = f"Missing expected annotation in metadata: {annotation}, unable to calculate DE." 
+ print(msg) + log_exception( + DifferentialExpression.dev_logger, + DifferentialExpression.de_logger, + msg, + ) + raise KeyError(msg) + # ToDo - detection and handling of annotations with only one sample (SCP-4282) + except ValueError as e: + print(e) + log_exception( + DifferentialExpression.dev_logger, + DifferentialExpression.de_logger, + e, + ) + raise KeyError(e) + + DifferentialExpression.de_logger.info("Gathering DE annotation labels") + groups = np.unique(adata.obs[annotation]).tolist() + for group in groups: + DifferentialExpression.write_de_result( + adata, group, annotation, rank_key, cluster_name, extra_params + ) + + elif de_type == 'pairwise': + group1 = [extra_params["group1"]] + group2 = extra_params["group2"] + try: + sc.tl.rank_genes_groups( + adata, + groupby=annotation, + groups=group1, + reference=group2, + key_added=rank_key, + method=method, + pts=True, + ) + except KeyError: + msg = f"Missing expected annotation in metadata: {annotation}, unable to calculate DE." + print(msg) + log_exception( + DifferentialExpression.dev_logger, + DifferentialExpression.de_logger, + msg, + ) + raise KeyError(msg) + # ToDo - detection and handling of annotations with only one sample (SCP-4282) + except ValueError as e: + print(e) + log_exception( + DifferentialExpression.dev_logger, + DifferentialExpression.de_logger, + e, + ) + raise KeyError(e) + stringified_group1 = ''.join(extra_params["group1"]) + DifferentialExpression.write_de_result( adata, + stringified_group1, annotation, - key_added=rank_key, - method=method, - pts=True, + rank_key, + cluster_name, + extra_params, ) - except KeyError: - msg = f"Missing expected annotation in metadata: {annotation}, unable to calculate DE." - print(msg) - log_exception( - DifferentialExpression.dev_logger, DifferentialExpression.de_logger, msg - ) - raise KeyError(msg) - # ToDo - detection and handling of annotations with only one sample (SCP-4282) - except ValueError as e: - print(e) - log_exception( - DifferentialExpression.dev_logger, DifferentialExpression.de_logger, e - ) - raise KeyError(e) - - DifferentialExpression.de_logger.info("Gathering DE annotation labels") - groups = np.unique(adata.obs[annotation]).tolist() - for group in groups: - clean_group = DifferentialExpression.sanitize_string(group) - clean_annotation = DifferentialExpression.sanitize_string(annotation) - rank = sc.get.rank_genes_groups_df(adata, key=rank_key, group=group) - if DifferentialExpression.all_match_ensembl_id_regex(rank, "names"): - feature_name_rank = rank.merge(adata.var['feature_name'], left_on="names", right_index=True, how='left') - feature_name_rank.rename(columns={'names':'feature_id'}, inplace=True) - feature_name_rank.rename(columns={'feature_name':'names'}, inplace=True) - new_column_order = ["names"] + [col for col in feature_name_rank.columns if col not in {"names", "feature_id"}] + ["feature_id"] - feature_name_rank = feature_name_rank[new_column_order] - - rank = feature_name_rank - if DifferentialExpression.delimiter_in_gene_name(rank): - DifferentialExpression.extract_gene_id_for_out_file(rank) - out_file = f'{cluster_name}--{clean_annotation}--{clean_group}--{annot_scope}--{method}.tsv' - # Round numbers to 4 significant digits while respecting fixed point - # and scientific notation (note: trailing zeros are removed) - DifferentialExpression.de_logger.info(f"Writing DE output for {clean_group}") - rank.to_csv(out_file, sep='\t', float_format='%.4g') - - # Provide h5ad of DE analysis as reference computable object - # 
DifferentialExpression.de_logger.info("Writing DE h5ad file") - # file_name = f'{study_accession}_{cluster_name}_to_DE.h5ad' - # adata.write_h5ad(file_name) DifferentialExpression.de_logger.info("DE processing complete") diff --git a/ingest/ingest_files.py b/ingest/ingest_files.py index db776bd3..a3b57c88 100644 --- a/ingest/ingest_files.py +++ b/ingest/ingest_files.py @@ -3,6 +3,7 @@ DESCRIPTION Module provides extract capabilities for text, CSV, and TSV file types """ + import copy import csv import gzip @@ -91,10 +92,10 @@ class IngestFiles: } def __init__( - self, - file_path: str, - allowed_file_types: list = list(ALLOWED_FILE_EXTENSIONS.keys()) - ): + self, + file_path: str, + allowed_file_types: list = list(ALLOWED_FILE_EXTENSIONS.keys()), + ): """ :param file_path: GS URL or local file path :param allowed_file_types (optional): list of allowed file MIME types diff --git a/ingest/ingest_pipeline.py b/ingest/ingest_pipeline.py index 6300fd65..51af4328 100644 --- a/ingest/ingest_pipeline.py +++ b/ingest/ingest_pipeline.py @@ -69,7 +69,17 @@ # Differential expression analysis (h5ad matrix) python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name louvain --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --matrix-file-type h5ad --annotation-file ../tests/data/anndata/h5ad_frag.metadata.tsv --cluster-file ../tests/data/anndata/h5ad_frag.cluster.X_umap.tsv --cluster-name umap --study-accession SCPdev --differential-expression +# Pairwise differential expression analysis (dense matrix) +python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --de-type pairwise --group1 "['cholinergic neuron']" --group2 "cranial somatomotor neuron" --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/differential_expression/de_dense_matrix.tsv --matrix-file-type dense --annotation-file ../tests/data/differential_expression/de_dense_metadata.tsv --cluster-file ../tests/data/differential_expression/de_dense_cluster.tsv --cluster-name de_integration --study-accession SCPdev --differential-expression + +# Pairwise differential expression analysis (sparse matrix) +python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --de-type pairwise --group1 "['endothelial cell']" --group2 "smooth muscle cell" --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/differential_expression/sparse/sparsemini_matrix.mtx --gene-file ../tests/data/differential_expression/sparse/sparsemini_features.tsv --barcode-file ../tests/data/differential_expression/sparse/sparsemini_barcodes.tsv --matrix-file-type mtx --annotation-file ../tests/data/differential_expression/sparse/sparsemini_metadata.txt --cluster-file ../tests/data/differential_expression/sparse/sparsemini_cluster.txt --cluster-name de_sparse_integration --study-accession SCPsparsemini --differential-expression + +# Pairwise differential expression analysis (h5ad matrix) +python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --de-type pairwise --group1 "['mature B cell']" --group2 "plasma cell" --annotation-type group 
--annotation-scope study --annotation-file ../tests/data/anndata/compliant_liver_h5ad_frag.metadata.tsv.gz --cluster-file ../tests/data/anndata/compliant_liver_h5ad_frag.cluster.X_umap.tsv.gz --cluster-name umap --matrix-file-path ../tests/data/anndata/compliant_liver.h5ad --matrix-file-type h5ad --study-accession SCPdev --differential-expression + """ + import json import logging import os @@ -126,8 +136,15 @@ class IngestPipeline: # array of actions to use when reporting to Mixpanel ACTION_NAMES = [ - 'ingest_cluster', 'ingest_cell_metadata', 'ingest_expression', 'ingest_anndata', 'ingest_subsample', - 'ingest_differential_expression', 'differential_expression', 'render_expression_arrays', 'rank_genes' + 'ingest_cluster', + 'ingest_cell_metadata', + 'ingest_expression', + 'ingest_anndata', + 'ingest_subsample', + 'ingest_differential_expression', + 'differential_expression', + 'render_expression_arrays', + 'rank_genes', ] # Logger provides more details for trouble shooting @@ -391,7 +408,7 @@ def ingest_expression(self) -> int: @custom_metric(config.get_metric_properties) def validate_cell_metadata(self): """If indicated, validate cell metadata against convention - otherwise, just preprocess file + otherwise, just preprocess file """ validate_against_convention = False if self.kwargs["validate_convention"] is not None: @@ -421,7 +438,6 @@ def validate_cell_metadata(self): return 1 return 0 - @custom_metric(config.get_metric_properties) def ingest_cell_metadata(self): for metadata_model in self.cell_metadata.execute_ingest(): @@ -551,7 +567,9 @@ def extract_from_anndata(self): ): self.anndata.generate_processed_matrix(self.anndata.adata) - if self.kwargs.get('extract') and "raw_counts" in self.kwargs.get('extract'): + if self.kwargs.get('extract') and "raw_counts" in self.kwargs.get( + 'extract' + ): self.anndata.ingest_raw_cells() self.report_validation("success") return 0 @@ -590,7 +608,7 @@ def calculate_author_de(self): "group": kwargs["group_header"], "comparison_group": kwargs["comparison_group_header"], "size": kwargs["size_metric"], - "significance": kwargs["significance_metric"] + "significance": kwargs["significance_metric"], } author_de = AuthorDifferentialExpression( @@ -599,9 +617,8 @@ def calculate_author_de(self): kwargs["study_accession"], kwargs["annotation_scope"], kwargs["method"], - kwargs["differential_expression_file"], - header_refmap + header_refmap, ) author_de.execute() @@ -613,7 +630,6 @@ def calculate_author_de(self): # ToDo: surface failed DE for analytics (SCP-4206) return 0 - def render_expression_arrays(self): try: exp_writer = ExpressionWriter( @@ -631,10 +647,7 @@ def render_expression_arrays(self): def rank_genes(self): try: kwargs = self.kwargs - RankGenes( - kwargs["study_accession"], - kwargs["publication"] - ) + RankGenes(kwargs["study_accession"], kwargs["publication"]) except Exception as e: log_exception(IngestPipeline.dev_logger, IngestPipeline.user_logger, e) return 1 @@ -659,12 +672,17 @@ def run_ingest(ingest, arguments, parsed_args): config.set_parent_event_name("ingest-pipeline:cell_metadata:ingest") status_cell_metadata_validation = ingest.validate_cell_metadata() status.append(status_cell_metadata_validation) - if parsed_args.bq_table is not None and status_cell_metadata_validation == 0: + if ( + parsed_args.bq_table is not None + and status_cell_metadata_validation == 0 + ): status_metadata_bq = ingest.upload_metadata_to_bq() status.append(status_metadata_bq) if status_cell_metadata_validation == 0: if ingest.kwargs['has_modality'] is not 
None: - ingest.cell_metadata.file = CellMetadata.restore_modality_metadata(ingest.cell_metadata) + ingest.cell_metadata.file = CellMetadata.restore_modality_metadata( + ingest.cell_metadata + ) status_cell_metadata_ingest = ingest.ingest_cell_metadata() status.append(status_cell_metadata_ingest) elif "ingest_cluster" in arguments: @@ -686,7 +704,6 @@ def run_ingest(ingest, arguments, parsed_args): status_de = ingest.calculate_de() status.append(status_de) print(f"STATUS after DE {status}") - elif "ingest_differential_expression" in arguments: config.set_parent_event_name("ingest-pipeline:differential-expression:ingest") status_de = ingest.calculate_author_de() @@ -725,7 +742,10 @@ def exit_pipeline(ingest, status, status_cell_metadata, arguments): """Logs any errors, then exits Ingest Pipeline with standard OS code""" if len(status) > 0: # for successful DE jobs, need to delocalize results - if ("differential_expression" in arguments or "ingest_differential_expression" in arguments) and all(i < 1 for i in status): + if ( + "differential_expression" in arguments + or "ingest_differential_expression" in arguments + ) and all(i < 1 for i in status): file_path, study_file_id = get_delocalization_info(arguments) print('file_path', file_path) print('study_file_id', study_file_id) @@ -792,11 +812,13 @@ def exit_pipeline(ingest, status, status_cell_metadata, arguments): sys.exit(os.EX_DATAERR) sys.exit(1) + def get_action_from_args(arguments): """Get the action from list of arguments denoting which data type is being ingested/extracted""" action = list(set(IngestPipeline.ACTION_NAMES) & set(arguments)) return action[0] if action else "" + def main() -> None: """Enables running Ingest Pipeline via CLI @@ -821,7 +843,7 @@ def main() -> None: arguments["study_id"], arguments["study_file_id"], arguments["user_metrics_uuid"], - get_action_from_args(arguments) + get_action_from_args(arguments), ) ingest = IngestPipeline(**arguments) status, status_cell_metadata = run_ingest(ingest, arguments, parsed_args) diff --git a/ingest/mongo_connection.py b/ingest/mongo_connection.py index 1a7542f7..e23f72fb 100644 --- a/ingest/mongo_connection.py +++ b/ingest/mongo_connection.py @@ -34,7 +34,7 @@ def __init__(self): password=self.password, authSource=self.db_name, authMechanism="SCRAM-SHA-1", - serverSelectionTimeoutMS=60_000 + serverSelectionTimeoutMS=60_000, ) self._client = client[self.db_name] # Needed to due to lack of database mock library for MongoDB diff --git a/ingest/validation/metadata_validation.py b/ingest/validation/metadata_validation.py index 011d34f6..71642ca5 100644 --- a/ingest/validation/metadata_validation.py +++ b/ingest/validation/metadata_validation.py @@ -61,8 +61,7 @@ def create_parser(): - """Parse command line values for validate_metadata - """ + """Parse command line values for validate_metadata""" parser = argparse.ArgumentParser( description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter ) @@ -122,8 +121,7 @@ def create_parser(): def check_if_old_output(): - """Exit if old output files found - """ + """Exit if old output files found""" output_files = ["bq.json"] old_output = False diff --git a/ingest/validation/validate_metadata.py b/ingest/validation/validate_metadata.py index 3abcc4ee..138ae8c7 100644 --- a/ingest/validation/validate_metadata.py +++ b/ingest/validation/validate_metadata.py @@ -57,8 +57,7 @@ def backoff_handler(details): - """Handler function to log backoff attempts when querying OLS - """ + """Handler function to log backoff attempts when querying 
OLS""" dev_logger.debug( "Backing off {wait:0.1f} seconds after {tries} tries " "calling function {target} with args {args} and kwargs " @@ -92,10 +91,10 @@ def retrieve_ontology_term_label_and_synonyms( if term not in self.cached_terms[property_name]: ontology_urls = get_urls_for_property(convention, property_name) - self.cached_terms[property_name][ - term - ] = self.retrieve_ontology_term_label_remote( - term, property_name, ontology_urls, attribute_type + self.cached_terms[property_name][term] = ( + self.retrieve_ontology_term_label_remote( + term, property_name, ontology_urls, attribute_type + ) ) return self.cached_terms[property_name][term] @@ -103,8 +102,7 @@ def retrieve_ontology_term_label_and_synonyms( def retrieve_ontology_term_label_remote( self, term, property_name, ontology_urls, attribute_type ): - """Look up official ontology term from an id, always checking against the remote - """ + """Look up official ontology term from an id, always checking against the remote""" # organ_region currently uses single a non-OLS ontology, the Allen Institute's Mouse Brain Atlas (MBA) # MBA was originally downloaded April 2020 from Allen Brain Atlas data portal URL # http://api.brain-map.org/api/v2/data/query.csv?criteria=model::Structure,rma::criteria,[ontology_id$eq1],rma::options[order$eq%27structures.graph_order%27][num_rows$eqall] @@ -218,8 +216,7 @@ def retrieve_ols_term(self, ontology_urls, term, property_name, attribute_type): raise RuntimeError(msg) def retrieve_mouse_brain_term(self, term, property_name): - """Determine whether ID is in mouse brain atlas (MBA) file - """ + """Determine whether ID is in mouse brain atlas (MBA) file""" # MBA ID is also the leaf entity of structure_id_path in the MBA file # Entries with short structure_id_path seem to be synonymous with # Uberon terms, suggesting MBA terms could be mapped as extensions of the @@ -233,18 +230,16 @@ def retrieve_mouse_brain_term(self, term, property_name): return {"label": mouse_brain_atlas[MBA_id]} def fetch_allen_mouse_brain_atlas(self): - """Get the mouse brain atlas file, either remote or cached, and parse it into an id -> name dictionary - """ + """Get the mouse brain atlas file, either remote or cached, and parse it into an id -> name dictionary""" if "allen_mouse_brain_atlas" not in self.cached_ontologies: - self.cached_ontologies[ - "allen_mouse_brain_atlas" - ] = fetch_allen_mouse_brain_atlas_remote() + self.cached_ontologies["allen_mouse_brain_atlas"] = ( + fetch_allen_mouse_brain_atlas_remote() + ) return self.cached_ontologies["allen_mouse_brain_atlas"] def parse_organ_region_ontology_id(term): - """Extract term id from valid identifiers or raise a ValueError - """ + """Extract term id from valid identifiers or raise a ValueError""" try: ontology_shortname, term_id = re.split("[_:]", term) if ontology_shortname == "MBA": @@ -261,8 +256,7 @@ def parse_organ_region_ontology_id(term): def fetch_allen_mouse_brain_atlas_remote(): - """Get the mouse brain atlas file from the remote source, and parse it into an id -> name dictionary - """ + """Get the mouse brain atlas file from the remote source, and parse it into an id -> name dictionary""" MBA_url = "https://raw.githubusercontent.com/broadinstitute/scp-ingest-pipeline/58f9308e425c667a34219a3dcadf7209fe09f788/schema/organ_region/mouse_brain_atlas/MouseBrainAtlas.csv" region_dict = {} try: @@ -311,8 +305,7 @@ def encode_term_iri(term_id, base_uri): def extract_terminal_pathname(url): - """Extract the last path segment from a URL - """ + """Extract the last path 
segment from a URL""" return list(filter(None, url.split("/"))).pop() @@ -321,8 +314,7 @@ def get_urls_for_property(convention, property_name): def get_ontology_file_location(ontology): - """Find and format fileLocation URL for running queries against an ontology - """ + """Find and format fileLocation URL for running queries against an ontology""" location = ontology["config"]["fileLocation"] # issue with some ontologies (like GO) having non-standard fileLocation URLs if 'extensions' in location: @@ -331,6 +323,7 @@ def get_ontology_file_location(ontology): location = "http://purl.obolibrary.org/obo/NCBITaxon_" return '/'.join(location.split('/')[:-1]) + '/' + ######################## END ONTOLOGY RETRIVER DEFINITION ######################### # create an OntologyRetriever instance to handle fetching and caching ontology terms @@ -355,8 +348,7 @@ def validate_schema(json, metadata): def is_array_metadata(convention, metadatum): - """Check if metadata is array type from metadata convention - """ + """Check if metadata is array type from metadata convention""" try: type_lookup = convention["properties"][metadatum]["type"] if type_lookup == "array": @@ -368,8 +360,7 @@ def is_array_metadata(convention, metadatum): def is_ontology_metadata(convention, metadatum): - """Check if metadata is ontology from metadata convention - """ + """Check if metadata is ontology from metadata convention""" try: return bool(convention["properties"][metadatum]["ontology"]) except KeyError: @@ -377,8 +368,7 @@ def is_ontology_metadata(convention, metadatum): def is_required_metadata(convention, metadatum): - """Check in metadata convention if metadata is required - """ + """Check in metadata convention if metadata is required""" try: if metadatum in convention["required"]: return True @@ -389,8 +379,7 @@ def is_required_metadata(convention, metadatum): def lookup_metadata_type(convention, metadatum): - """Look up metadata type from metadata convention - """ + """Look up metadata type from metadata convention""" try: if is_array_metadata(convention, metadatum): type_lookup = convention["properties"][metadatum]["items"]["type"] @@ -402,8 +391,7 @@ def lookup_metadata_type(convention, metadatum): def list_duplicates(cells): - """Find duplicates in list for detailed reporting - """ + """Find duplicates in list for detailed reporting""" # reference https://stackoverflow.com/questions/9835762 seen = set() # save add function to avoid repeated lookups @@ -416,8 +404,7 @@ def list_duplicates(cells): def validate_cells_unique(metadata): - """Check all CellID are unique. 
- """ + """Check all CellID are unique.""" valid = False if len(metadata.cells) == len(set(metadata.cells)): valid = True @@ -450,8 +437,10 @@ def insert_array_ontology_label_row_data( for id in row[property_name]: label_lookup = "" try: - label_and_synonyms = retriever.retrieve_ontology_term_label_and_synonyms( - id, property_name, convention, "array" + label_and_synonyms = ( + retriever.retrieve_ontology_term_label_and_synonyms( + id, property_name, convention, "array" + ) ) label_lookup = label_and_synonyms.get('label') reference_ontology = ( @@ -622,8 +611,7 @@ def omitted_values(ontology_label, cell_name): def collect_ontology_data(row_data, metadata, convention): - """Collect unique ontology IDs for ontology validation - """ + """Collect unique ontology IDs for ontology validation""" row_ref = copy.deepcopy(row_data) for entry in row_ref.keys(): if is_ontology_metadata(convention, entry): @@ -642,8 +630,7 @@ def collect_ontology_data(row_data, metadata, convention): def compare_type_annots_to_convention(metadata, convention): - """Check if metadata type annotation is consistent with metadata convention type - """ + """Check if metadata type annotation is consistent with metadata convention type""" metadata_names = metadata.file.columns.get_level_values(0).tolist() type_annots = metadata.file.columns.get_level_values(1).tolist() # if input was TSV metadata file, SCP format requires 'NAME' for the first @@ -697,8 +684,7 @@ def compare_type_annots_to_convention(metadata, convention): def cast_boolean_type(value): - """Cast metadata value as boolean, if castable - """ + """Cast metadata value as boolean, if castable""" if isinstance(value, bool): return value elif str(value).lower() == "true": @@ -727,8 +713,7 @@ def value_is_nan(value): def cast_integer_type(value): - """Cast metadata value as integer - """ + """Cast metadata value as integer""" if value_is_nan(value): # nan indicates missing data, has no valid integer value for casting return value @@ -737,14 +722,12 @@ def cast_integer_type(value): def cast_float_type(value): - """Cast metadata value as float - """ + """Cast metadata value as float""" return float(value) def cast_string_type(value): - """Cast string type per convention where Pandas autodetected a number - """ + """Cast string type per convention where Pandas autodetected a number""" if value_is_nan(value): # nan indicates missing data; by type, nan is a numpy float # so a separate type check is needed for proper handling @@ -756,8 +739,7 @@ def cast_string_type(value): def regularize_ontology_id(value): - """Regularize ontology_ids for storage with underscore format - """ + """Regularize ontology_ids for storage with underscore format""" try: return value.replace(":", "_") except AttributeError: @@ -768,7 +750,7 @@ def regularize_ontology_id(value): def cast_metadata_type(metadatum, value, id_for_error_detail, convention, metadata): """For metadatum, lookup expected type by metadata convention - and cast value as appropriate type for validation + and cast value as appropriate type for validation """ cast_metadata = {} metadata_types = { @@ -1102,8 +1084,10 @@ def validate_collected_ontology_data(metadata, convention): try: attribute_type = convention["properties"][property_name]["type"] # get actual label along with synonyms for more robust matching - label_and_synonyms = retriever.retrieve_ontology_term_label_and_synonyms( - ontology_id, property_name, convention, attribute_type + label_and_synonyms = ( + retriever.retrieve_ontology_term_label_and_synonyms( + 
ontology_id, property_name, convention, attribute_type + ) ) if not is_label_or_synonym(label_and_synonyms, ontology_label): @@ -1213,16 +1197,14 @@ def calculate_organism_age_in_seconds(organism_age, organism_age_unit_label): def serialize_issues(metadata): - """Write collected issues to json file - """ + """Write collected issues to json file""" with open("issues.json", "w") as jsonfile: json.dump(metadata.issues, jsonfile, indent=2) jsonfile.write("\n") # Add newline cause Py JSON does not def review_metadata_names(metadata): - """Check metadata names for disallowed characters - """ + """Check metadata names for disallowed characters""" metadata_names = metadata.file.columns.get_level_values(0).tolist() for name in metadata_names: allowed_char = re.compile("^[A-Za-z0-9_]+$") @@ -1238,7 +1220,7 @@ def review_metadata_names(metadata): def identify_multiply_assigned(list): """Given a list of ontology IDs and their purported labels, - return list of unique multiply-assigned labels + return list of unique multiply-assigned labels """ ontology_tracker = defaultdict(lambda: defaultdict(int)) multiply_assigned = [] @@ -1293,13 +1275,13 @@ def assess_ontology_ids(ids, property_name, metadata): def detect_excel_drag(metadata, convention): - """ Check if ontology IDs submitted have characteristic Excel drag properties - Todo1: "Excel drag" detection of array-based ontologyID data - is lacking (need to track pipe-delimited string) - Todo2: need to bypass EBI OLS queries to "fill in" - missing ontology labels for optional metadata) - Hint: try working with raw metadata.file data, would - allow this check to be moved before collect_jsonschema_errors + """Check if ontology IDs submitted have characteristic Excel drag properties + Todo1: "Excel drag" detection of array-based ontologyID data + is lacking (need to track pipe-delimited string) + Todo2: need to bypass EBI OLS queries to "fill in" + missing ontology labels for optional metadata) + Hint: try working with raw metadata.file data, would + allow this check to be moved before collect_jsonschema_errors """ excel_drag = False for property_name in metadata.ontology.keys(): @@ -1341,13 +1323,12 @@ def detect_excel_drag(metadata, convention): def to_1D(series): - """ Pandas values which are list need unnesting - """ + """Pandas values which are list need unnesting""" return [x for _list in series for x in _list] def replace_single_value_array(df, metadata_name, synonym, label): - """ Synonym replacement (in-place) for single-value array metadata + """Synonym replacement (in-place) for single-value array metadata Pandas doesn't operate well on lists which are potentially non-homogenous # https://stackoverflow.com/questions/53116286/how-to-assign-an-entire-list-to-each-row-of-a-pandas-dataframe """ @@ -1357,9 +1338,9 @@ def replace_single_value_array(df, metadata_name, synonym, label): def replace_synonym_in_multivalue_array(df, metadata_name, substitutions): - """ Synonym replacement (in-place) for multi-value array ontology labels - must identify all affected arrays of labels, construct replacement arrays - then replace old synonym-containing array with an updated array + """Synonym replacement (in-place) for multi-value array ontology labels + must identify all affected arrays of labels, construct replacement arrays + then replace old synonym-containing array with an updated array """ orig_values = list(df[metadata_name].transform(tuple).unique()) matching_synonyms = {} @@ -1416,8 +1397,7 @@ def replace_synonyms(metadata): def 
validate_input_metadata(metadata, convention, bq_json=None): - """Wrapper function to run validation functions - """ + """Wrapper function to run validation functions""" dev_logger.info("Checking metadata content against convention rules") collect_jsonschema_errors(metadata, convention, bq_json) review_metadata_names(metadata) @@ -1434,8 +1414,7 @@ def validate_input_metadata(metadata, convention, bq_json=None): def push_metadata_to_bq(metadata, ndjson, dataset, table): - """Upload local NDJSON to BigQuery - """ + """Upload local NDJSON to BigQuery""" client = bigquery.Client() dataset_ref = client.dataset(dataset) table_ref = dataset_ref.table(table) @@ -1468,16 +1447,14 @@ def push_metadata_to_bq(metadata, ndjson, dataset, table): def write_metadata_to_bq(metadata, bq_dataset, bq_table): - """Wrapper function to gather metadata and write to BigQuery - """ + """Wrapper function to gather metadata and write to BigQuery""" bq_filename = str(metadata.study_file_id) + ".json" push_status = push_metadata_to_bq(metadata, bq_filename, bq_dataset, bq_table) return push_status def check_if_old_output(): - """Exit if old output files found - """ + """Exit if old output files found""" output_files = ["bq.json"] old_output = False @@ -1487,4 +1464,3 @@ def check_if_old_output(): old_output = True if old_output: exit(1) - diff --git a/tests/data/anndata/compliant_liver.h5ad b/tests/data/anndata/compliant_liver.h5ad index 894d467c..b476f602 100644 Binary files a/tests/data/anndata/compliant_liver.h5ad and b/tests/data/anndata/compliant_liver.h5ad differ diff --git a/tests/test_de.py b/tests/test_de.py index 26d8e3f3..44d607a3 100644 --- a/tests/test_de.py +++ b/tests/test_de.py @@ -31,14 +31,28 @@ def get_annotation_labels(metadata, annotation, de_cells): return unique_labels.tolist() -def find_expected_files(labels, cluster_name, annotation, scope, method): +def find_expected_files(labels, cluster_name, test_config): """Check that files were created for all expected annotation labels""" found = [] sanitized_cluster_name = DifferentialExpression.sanitize_string(cluster_name) - sanitized_annotation = DifferentialExpression.sanitize_string(annotation) - for label in labels: - sanitized_label = DifferentialExpression.sanitize_string(label) - expected_file = f"{sanitized_cluster_name}--{sanitized_annotation}--{sanitized_label}--{scope}--{method}.tsv" + sanitized_annotation = DifferentialExpression.sanitize_string( + test_config.get("test_annotation") + ) + de_type = test_config.get("de_type") + method = test_config.get("test_method") + annot_scope = test_config.get("test_scope") + if de_type == "rest": + for label in labels: + sanitized_label = DifferentialExpression.sanitize_string(label) + expected_file = f"{sanitized_cluster_name}--{sanitized_annotation}--{sanitized_label}--{annot_scope}--{method}.tsv" + assert os.path.exists(expected_file) + found.append(expected_file) + elif de_type == "pairwise": + # rank_genes_groups accepts a list. 
For SCP pairwise, should be a list with one item + # converting list to string for incorporation into result filename + group1 = DifferentialExpression.sanitize_string(test_config["group1"]) + group2 = DifferentialExpression.sanitize_string(test_config["group2"]) + expected_file = f'{sanitized_cluster_name}--{sanitized_annotation}--{group1}--{group2}--{annot_scope}--{method}.tsv' assert os.path.exists(expected_file) found.append(expected_file) return found @@ -54,6 +68,7 @@ def run_de(**test_config): cluster_name = test_config["cluster_name"] matrix_path = test_config["matrix_file"] matrix_type = test_config["matrix_type"] + de_type = test_config["de_type"] cm = CellMetadata( annot_path, @@ -69,13 +84,24 @@ def run_de(**test_config): "dec0dedfeed0000000000000", cluster_name, ) - - de_kwargs = { - "study_accession": cm.study_accession, - "name": cluster.name, - "annotation_scope": test_scope, - "method": test_method, - } + if de_type == "pairwise": + de_kwargs = { + "study_accession": cm.study_accession, + "name": cluster.name, + "annotation_scope": test_scope, + "method": test_method, + "de_type": de_type, + "group1": test_config["group1"], + "group2": test_config["group2"], + } + else: + de_kwargs = { + "study_accession": cm.study_accession, + "name": cluster.name, + "annotation_scope": test_scope, + "method": test_method, + "de_type": de_type, + } if "gene_file" in test_config: de_kwargs["gene_file"] = test_config["gene_file"] @@ -91,9 +117,7 @@ def run_de(**test_config): labels = get_annotation_labels(cm, test_annotation, de_cells) # In find_expected_files, checks all files with expected names were created # yields the number of files expected for an external check for file count - found_labels = find_expected_files( - labels, cluster.name, test_annotation, test_scope, test_method - ) + found_labels = find_expected_files(labels, cluster.name, test_config) return found_labels @@ -126,20 +150,35 @@ def test_assess_annotation(self): study_accession="SCPde", tracer=None, ) + extra_params = { + "de_type": "rest", + } # alter metadata so seurat_clusters is TYPE numeric cm.annot_types[21] = 'numeric' self.assertRaises( - TypeError, DifferentialExpression.assess_annotation, test_annotation, cm + TypeError, + DifferentialExpression.assess_annotation, + test_annotation, + cm, + extra_params, ) # alter metadata so seurat_clusters has bad TYPE designation cm.annot_types[21] = 'foo' self.assertRaises( - ValueError, DifferentialExpression.assess_annotation, test_annotation, cm + ValueError, + DifferentialExpression.assess_annotation, + test_annotation, + cm, + extra_params, ) # remove seurat_clusters from metadata cm.headers.pop(21) self.assertRaises( - KeyError, DifferentialExpression.assess_annotation, test_annotation, cm + KeyError, + DifferentialExpression.assess_annotation, + test_annotation, + cm, + extra_params, ) def test_detect_duplicate_gene_names(self): @@ -258,6 +297,7 @@ def test_de_process_dense(self): "cluster_name": "de_integration", "matrix_file": "../tests/data/differential_expression/de_dense_matrix.tsv", "matrix_type": "dense", + "de_type": "rest", } found_labels = run_de(**test_config) @@ -335,6 +375,7 @@ def test_de_process_sparse(self): "matrix_type": "mtx", "gene_file": "../tests/data/differential_expression/sparse/sparsemini_dup_gene_name.tsv", "barcode_file": "../tests/data/differential_expression/sparse/sparsemini_barcodes.tsv", + "de_type": "rest", } found_labels = run_de(**test_config) @@ -372,7 +413,9 @@ def test_de_process_sparse(self): "Did not find expected 
logfoldchange value for Sox17 in DE file.", ) # confirm duplicate gene input generates expected gene_id info in output - self.assertIn('feature_id', content.columns, "Expected feature_id output not found.") + self.assertIn( + 'feature_id', content.columns, "Expected feature_id output not found." + ) # md5 checksum calculated using reference file in tests/data/differential_expression/sparse/reference expected_checksum = "7b13cb24b020aca268015e714ca2d666" @@ -409,9 +452,12 @@ def test_de_process_sparse(self): except: print(f"Error while deleting file : {file}") - def test_de_process_h5ad(self): + def test_de_process_pairwise(self): test_annotation = "cell_type__ontology_label" test_config = { + "de_type": "pairwise", + "group1": "mature B cell", + "group2": "plasma cell", "test_annotation": test_annotation, "test_scope": "study", "test_method": "wilcoxon", @@ -428,16 +474,13 @@ def test_de_process_h5ad(self): self.assertEqual( found_label_count, - 2, - f"expected nine annotation labels for {test_annotation}", + 1, + f"expected one annotation label for pairwise DE with {test_annotation}", ) - expected_file = ( - "umap--cell_type__ontology_label--plasma_cell--study--wilcoxon.tsv" - ) + expected_file = "umap--cell_type__ontology_label--mature_B_cell--plasma_cell--study--wilcoxon.tsv" expected_file_path = f"../tests/{expected_file}" - # confirm expected results filename was generated in found result files self.assertIn( expected_file, found_labels, "Expected filename not in found files list" @@ -447,18 +490,18 @@ def test_de_process_h5ad(self): # confirm expected gene in DE file at expected position self.assertEqual( content.iloc[0, 0], - "MZB1", - "Did not find expected gene, MZB1, at second row in DE file.", + "RPL32", + "Did not find expected gene, RPL32, at second row in DE file.", ) # confirm calculated value has expected significant digits self.assertEqual( content.iloc[0, 2], - 5.329, - "Did not find expected logfoldchange value for MZB1 in DE file.", + 1.975, + "Did not find expected logfoldchange value for RPL32 in DE file.", ) # md5 checksum calculated using reference file in tests/data/differential_expression/reference - expected_checksum = "649e5fd26575255bfca14c7b25d804ba" + expected_checksum = "f47ce72ba097b52c7ba09e4e0da94a05" # running DifferentialExpression via pytest results in output files in the tests dir with open(expected_file_path, "rb") as f: @@ -470,7 +513,9 @@ def test_de_process_h5ad(self): "Generated output file should match expected checksum.", ) - expected_output_match = "umap--cell_type__ontology_label--*--study--wilcoxon.tsv" + expected_output_match = ( + "umap--cell_type__ontology_label--*--study--wilcoxon.tsv" + ) with patch('ingest_files.IngestFiles.delocalize_file'): DifferentialExpression.delocalize_de_files( @@ -479,10 +524,44 @@ def test_de_process_h5ad(self): self.assertEqual( IngestFiles.delocalize_file.call_count, - 2, - "expected 2 calls to delocalize output files", + 1, + "expected one call to delocalize output files", ) + test_config = { + "de_type": "pairwise", + "group1": "NO SUCH GROUP VALUE", + "group2": "plasma cell", + "test_annotation": test_annotation, + "test_scope": "study", + "test_method": "wilcoxon", + "annot_path": "../tests/data/anndata/compliant_liver_h5ad_frag.metadata.tsv.gz", + "study_accession": "SCPh5adde", + "cluster_path": "../tests/data/anndata/compliant_liver_h5ad_frag.cluster.X_umap.tsv.gz", + "cluster_name": "umap", + "matrix_file": "../tests/data/anndata/compliant_liver.h5ad", + "matrix_type": "h5ad", + } + + 
self.assertRaises(ValueError, run_de, **test_config) + + test_config = { + "de_type": "pairwise", + "group1": "", + "group2": "plasma cell", + "test_annotation": test_annotation, + "test_scope": "study", + "test_method": "wilcoxon", + "annot_path": "../tests/data/anndata/compliant_liver_h5ad_frag.metadata.tsv.gz", + "study_accession": "SCPh5adde", + "cluster_path": "../tests/data/anndata/compliant_liver_h5ad_frag.cluster.X_umap.tsv.gz", + "cluster_name": "umap", + "matrix_file": "../tests/data/anndata/compliant_liver.h5ad", + "matrix_type": "h5ad", + } + # Original error is KeyError but all errors passed through assess_annotation become ValueErrors + self.assertRaises(ValueError, run_de, **test_config) + # clean up DE outputs files = glob.glob(expected_output_match) @@ -507,6 +586,7 @@ def test_de_process_na(self): "cluster_name": "de_na", "matrix_file": "../tests/data/differential_expression/de_dense_matrix.tsv", "matrix_type": "dense", + "de_type": "rest", } found_labels = run_de(**test_config) @@ -551,6 +631,7 @@ def test_de_process_sanitize(self): "cluster_name": "UMAP, pre-QC", "matrix_file": "../tests/data/differential_expression/de_dense_matrix.tsv", "matrix_type": "dense", + "de_type": "rest", } found_labels = run_de(**test_config)
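
Note (reviewer sketch, not part of the patch): assess_annotation's new pairwise branch is a pre-flight check that runs before any matrix is loaded: both labels must be truthy and present among the annotation's group values. A condensed Python sketch of that control flow; the function name and the `values` argument are hypothetical stand-ins for the real code's metadata.file[annotation]['group'].unique():

def check_pairwise_groups(values, group1, group2):
    # falsy labels (None, "") fail fast with KeyError, mirroring the patch
    if not (group1 and group2):
        raise KeyError(f'group1, "{group1}", or group2, "{group2}", were falsy')
    # both labels must exist among the annotation's values
    if group1 not in values or group2 not in values:
        raise ValueError(
            f'group1, "{group1}", or group2, "{group2}", not found in metadata file'
        )

check_pairwise_groups(["mature B cell", "plasma cell"], "mature B cell", "plasma cell")

Because execute_de re-raises TypeError, KeyError, and ValueError uniformly as ValueError, both failure modes surface as ValueError, which is what the two bad-group configs in test_de_process_pairwise assert.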
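
Note (reviewer sketch, not part of the patch): for --de-type pairwise, run_scanpy_de wraps group1 in a one-item list for scanpy's `groups` parameter and passes group2 as the plain-string `reference`; write_de_result then pulls the ranked table with sc.get.rank_genes_groups_df. A minimal, self-contained sketch of that call on toy data; the random counts, labels, and normalize_total step are illustrative assumptions (the diff context only shows sc.pp.log1p before ranking):

import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc

rng = np.random.default_rng(0)
# toy 60-cell x 5-gene counts; labels mirror the liver test fixtures
adata = ad.AnnData(
    X=rng.poisson(1.0, size=(60, 5)).astype(np.float32),
    obs=pd.DataFrame(
        {
            "cell_type__ontology_label": pd.Categorical(
                ["mature B cell"] * 30 + ["plasma cell"] * 30
            )
        }
    ),
)
sc.pp.normalize_total(adata, target_sum=1e4)  # illustrative pre-processing
sc.pp.log1p(adata)  # as in run_scanpy_de

annotation = "cell_type__ontology_label"
method = "wilcoxon"
rank_key = f"rank.{annotation}.{method}"  # same key scheme as the patch

# pairwise: `groups` is a one-item list, `reference` a plain string
sc.tl.rank_genes_groups(
    adata,
    groupby=annotation,
    groups=["mature B cell"],
    reference="plasma cell",
    key_added=rank_key,
    method=method,
    pts=True,
)
rank = sc.get.rank_genes_groups_df(adata, key=rank_key, group="mature B cell")
print(rank.head())

write_de_result serializes this frame with float_format='%.4g', which is where four-significant-digit values like the 1.975 logfoldchange asserted in test_de_process_pairwise come from.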
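
Note (reviewer sketch, not part of the patch): the result filename encodes the comparison. One-vs-rest files carry a single sanitized label; pairwise files carry both, group1 before group2. That extra field is why the delocalize glob in the tests, umap--cell_type__ontology_label--*--study--wilcoxon.tsv, now matches one pairwise file (one delocalize call) where the old one-vs-rest h5ad test matched two. A hypothetical reconstruction with illustrative values, field order taken from write_de_result:

cluster_name = "umap"
clean_annotation = "cell_type__ontology_label"
annot_scope, method = "study", "wilcoxon"
group1, group2 = "mature_B_cell", "plasma_cell"  # post-sanitize_string labels

rest_file = f"{cluster_name}--{clean_annotation}--{group1}--{annot_scope}--{method}.tsv"
pairwise_file = (
    f"{cluster_name}--{clean_annotation}--{group1}--{group2}--{annot_scope}--{method}.tsv"
)
# rest_file:     umap--cell_type__ontology_label--mature_B_cell--study--wilcoxon.tsv
# pairwise_file: umap--cell_type__ontology_label--mature_B_cell--plasma_cell--study--wilcoxon.tsv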