Skip to content

Commit

Permalink
Coerce boolean data to categorical annotations
Browse files Browse the repository at this point in the history
  • Loading branch information
bistline committed Oct 23, 2024
1 parent 4190761 commit b3cc27a
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 1 deletion.
4 changes: 3 additions & 1 deletion ingest/anndata_.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,9 @@ def generate_metadata_file(adata, output_name):
headers = adata.obs.columns.tolist()
types = []
for header in headers:
if pd.api.types.is_numeric_dtype(adata.obs[header]):
if pd.api.types.is_bool_dtype(adata.obs[header]):
types.append("GROUP")
elif pd.api.types.is_numeric_dtype(adata.obs[header]):
types.append("NUMERIC")
else:
types.append("GROUP")
Expand Down
Binary file added tests/data/anndata/anndata_boolean_test.h5ad
Binary file not shown.
38 changes: 38 additions & 0 deletions tests/test_anndata.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def setup_class(self):
filepath_dup_cell = "../tests/data/anndata/dup_cell.h5ad"
filepath_nan = "../tests/data/anndata/nan_value.h5ad"
filepath_synthetic = "../tests/data/anndata/anndata_test.h5ad"
filepath_boolean = "../tests/data/anndata/anndata_boolean_test.h5ad"
self.study_id = "addedfeed000000000000000"
self.study_file_id = "dec0dedfeed0000000000000"
self.valid_args = [filepath_valid, self.study_id, self.study_file_id]
Expand All @@ -41,6 +42,7 @@ def setup_class(self):
self.dup_cell_args = [filepath_dup_cell, self.study_id, self.study_file_id]
self.nan_value_args = [filepath_nan, self.study_id, self.study_file_id]
self.synthetic_args = [filepath_synthetic, self.study_id, self.study_file_id]
self.boolean_args = [filepath_boolean, self.study_id, self.study_file_id]
self.cluster_name = 'X_tsne'
self.valid_kwargs = {'obsm_keys': [self.cluster_name]}
self.anndata_ingest = AnnDataIngestor(*self.valid_args, **self.valid_kwargs)
Expand Down Expand Up @@ -181,6 +183,42 @@ def test_generate_metadata_file(self):
expected_types, type_line, 'did not get expected types from metadata body'
)

def test_generate_metadata_with_boolean(self):
    """Check that a boolean obs column is written out as a GROUP annotation.

    Generates a metadata file from the boolean fixture, confirms the
    in-memory "is_primary_data" column has dtype "bool", then reads the
    gzipped output back and verifies the header row, the all-GROUP type
    row, and that every data row carries the string "False" in the
    "is_primary_data" column.
    """
    ingestor = AnnDataIngestor(*self.boolean_args, **self.valid_kwargs)
    adata = ingestor.obtain_adata()
    boolean_filename = "h5ad_frag.metadata_boolean.tsv"
    ingestor.generate_metadata_file(adata, boolean_filename)

    observed_dtype = adata.obs['is_primary_data'].dtype.name
    self.assertEqual(
        'bool',
        observed_dtype,
        'did not correctly get "bool" dtype for "is_primary_data"',
    )

    # The last entry of each row keeps its trailing newline because the
    # lines are split on tabs without being stripped first.
    expected_headers = [
        'NAME',
        'donor_id',
        'biosample_id',
        'sex',
        'species',
        'species__ontology_label',
        'library_preparation_protocol',
        'library_preparation_protocol__ontology_label',
        'organ',
        'organ__ontology_label',
        'disease',
        'disease__ontology_label',
        "is_primary_data\n",
    ]
    expected_types = ['TYPE'] + ['GROUP'] * 11 + ["GROUP\n"]

    compressed_file = boolean_filename + ".gz"
    with gzip.open(compressed_file, "rt", encoding="utf-8-sig") as metadata_body:
        name_line = metadata_body.readline().split("\t")
        self.assertEqual(
            expected_headers,
            name_line,
            'did not get expected headers from metadata body',
        )

        type_line = metadata_body.readline().split("\t")
        self.assertEqual(
            expected_types,
            type_line,
            'did not get expected types from metadata body',
        )

        # Remaining lines are data rows; field 12 is "is_primary_data".
        for row in metadata_body:
            columns = row.split("\t")
            is_primary_data = columns[12].strip()
            self.assertEqual(
                "False",
                is_primary_data,
                'did not correctly read boolean value as string from data',
            )

def test_gene_id_indexed_generate_processed_matrix(self):
"""Tests creating matrix when indexed by Ensembl ID, not gene name
Expand Down

0 comments on commit b3cc27a

Please sign in to comment.