From b3cc27a36023574f5e528f351e3257499fe62c08 Mon Sep 17 00:00:00 2001 From: bistline Date: Wed, 23 Oct 2024 11:35:29 -0400 Subject: [PATCH] Coerce boolean data to categorical annotations --- ingest/anndata_.py | 4 +- tests/data/anndata/anndata_boolean_test.h5ad | Bin 0 -> 51536 bytes tests/test_anndata.py | 38 +++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 tests/data/anndata/anndata_boolean_test.h5ad diff --git a/ingest/anndata_.py b/ingest/anndata_.py index 5c1c357c..c502958d 100644 --- a/ingest/anndata_.py +++ b/ingest/anndata_.py @@ -147,7 +147,9 @@ def generate_metadata_file(adata, output_name): headers = adata.obs.columns.tolist() types = [] for header in headers: - if pd.api.types.is_numeric_dtype(adata.obs[header]): + if pd.api.types.is_bool_dtype(adata.obs[header]): + types.append("GROUP") + elif pd.api.types.is_numeric_dtype(adata.obs[header]): types.append("NUMERIC") else: types.append("GROUP") diff --git a/tests/data/anndata/anndata_boolean_test.h5ad b/tests/data/anndata/anndata_boolean_test.h5ad new file mode 100644 index 0000000000000000000000000000000000000000..3ac902b178a2c8baa59645791bfff743c3c83092 GIT binary patch literal 51536 zcmeHQU36SWk?ye*TO>BI^JgUiG6YBlNX$6#24(*;mMv!j##S5$#IH2ACYFe#8Ah@r z^Ro;;2@Vi2gb?wjx7Ar#j`#PN!|vHR?Add)Z+Y}%9`)$Qoc+sIb=7xot4FP|rD=1V zxqWnRcU4zcSJ&;T>f85@?knX(hj-p`?=6N?U!S?j?6O6i-qAGtpMn95`Y|o!G(WHT z(iSP?h$GkaD}I;q&uSTT)Opnk9vo{~&nyxBMqo)0GvT z*e7`Cck#EY>6R5<(FqPe*ZkdTx+R5|e=qny);Y#}V$)M;?J?not2SuXjBph%v>d_Op}xM87#zn>sT+ z^GK~RUbO?0Dl;?VmAMKa1LJuVyYpobi&ADx^Tug)i~GwFY@@^vfV zd+a%t>zOk|Oi30vDSlAr!>u0u;abgNb|@bC>|BGrJH$0RwceNay^4n%ti0zNmFcSD zZws`8w}RfrCr`~W>Fzdq(fjR+hyELt7cJ(F0R5ece^~RW$&-yrH5lwsI$Y23$=PaU zwu&HOP_Otl4)_7ir0Qs9VuQBs*V(|2o)>ED`OQa#j7AH zR*F}_RV)L4;Kvaak5;6T5cfWi<}P->~8lA5T^0 zs;6s>$y1dnTk}ky-Ej{Oxu5m$ko$z<(NF3x#qyrh`jGse4ex1jT)yq+p8ujgbDE z0lwc+d`SP_4Yd27hll*%_wbPaB@cgcs#as0d|B}!xqslb1Hbc%56S&XpxqBWJmmh7 zhetm@Ry^bzK5)WL{Nnz{?bLbItM}*qPXc^@s`!xn7Xs~m=HVg#&pkZke@*e*G^f@3 zG>61rD2#FWa=#wn_e;fxoTgUg&V%w;~BbTEwm0hb78v<~%*YbuWGT$sx^7L|@+LkGir zFbw&8xT*EVp=EkL7U^I-$_z6fA8s<94z4#&nc!+bF8%C{D5Tv{rZ z0FyjSzqk&}b)Yv&2N-!1?KKD8BJj@jW*p>2JJ6Suo@w|19m-%SY!7%y4zw?O@KYYl_mnO@--5dEkO#A2xb-Cu=37j+ ze$aytdNA)bxb(ElUj_5tf?H3^pj9yMCAjsxhv34zci_UjXW+uTSKz|DN8rLA^xy|P z_(LB2VGsU@2S4b+4|#CGgFouQAM@akd+@^^%zIJpc=mcQ?>)KoyyxV?yZ~}x-eYp% z$2=H&N`&s1wljs!bLrrQQAq~WMFJ&inhWFq6c(vJi z#P?3P=)4CTL;l-{Z|HrZMUAH|qTwX*$OPBHLY19+RRexJg1;(z>oUZl;Vvl(_UEqy zH*0@3&gdXGw#Kl(-NYC2BOF|^0g?MFTMcdW{aJS(=JrVP-$8s4udzNyynD+&tuUsp z7rfK^>tlMej`&!Zx5L*}g_N`NyH^A7A7eUS-qxWm=mX*y_Q$L-Usv=&(}ecB!}M86 za`YzSTJMpD)|*(T;yD*`>q^w0+)sVD>%=ZO#e1r=<nxOO7LIC3W4m1p)WB7+pQC~T;&5o6Af)lQHOUFAQ=#Klu1N<<6QlDhm zxyLks_=b6(*YQv2Jc2stS5ov=Hoae_?-N>pVAzL8j%L9e^VO(-Xenj7zU(s((yX*U zkf@HEe%ejuzIM?>l|K8-;R8pW9U|O`{Oua+$$Og#qUCCwFY9v?-n~PN-cYK_lItj z_+2_EqQ{Hh$EgF@cZq{JrEbOV1?4AP+t2U|E2Z|j4AWTyZag^AnLMFz_c6YhU^^H1 znF?;kPIe|wDBQR^nP58?-0{^rlP47JbD$R!Z0CYIKD^7{xz*}@qnECxfzr5o`SCTh zv)kQjJg;3tJKirFenT4Y9B^Ug>B791@4~z{_J8^hR2a|ce-)2n(A9%o!(}y7+>LCYU@Bs`tFn++$59k4dA7J!{_K?Rq zt?O`#abNl@$(`KSsiDQYf1fxgYKnLN{v34>LrQmp&;rry-Fw6E0uZj}4?dG&(@ zM_cYb#OC+kKTn)t--ew$rEcB7k0?Lk+J1&#SW>^zPSKl~7;)(fq{_svM~~l&(c|}* zXgo3W^kw3U_`UQQnSaabuTuZ9F}rj3Bfb(z{!!waTxStKwQn`-tBv3D>>Mb;P6f)& zy`q6o{9fY4r;}pE?-Cn1rItOraqmWn-xs6D?=jL1;&+RkJf&{M@BR*(yYLGWX@?>} zkDp`2r3&dZ@$1p!_vPsE`xK2QhMvZWFXHzE-{ey|ul}l=@w-ZVW61wJ@lCEX3r{Jr zZw~I1t~P!T@T8#xI~6E9_lgEW@q0q&5%dr%ej7GMN-cZ75##qt^!UxM+`_(_Cq+?d zI4!8=t}bhP|4sRYTEj0C#zGXmiI3se*=HwyJ$n4!bx5vUy#4d9QU@{g^aAlk{4Vg# zIHj`st8T{cDdHPL{%PWyTxS-Z670LklYo+D9rw?d^koOwskF|{y`q6o{4RLE(})$n z-<%Uow^ZS&{dMd9d+pzO)XzzWuoLY*n8CjK*{CVC?DVY$W;Fl_uAj%A8<_P4(w6i%##w_pvf4!JY)l&b^|6(7K>hTdjd^uM5B( z|7Qr_2?0L+Kl$=q5qy6H-#rO^=cLnjP1dV#`)CRYgYUqEzWajjyoA2%;(y0Q-Ey*S z$9TJ)U;FR0ul4tQBhl9vXGkgVYs2hJDRpaov84QjYx^1g@a=kgT`yx|#N)H1&?KHk zU$-=)uUqD6JTdgdw=!6_7~UM9lvh99&2`JyiEj+~zd?LEtw&b4b4sv3fwFV2Xdtv6 zdGp298tC?VME1M4CExY9toQ$j*S@_isW>aT-Ui(T4qB8hs=OBkGJt5Hf?+pzoT}huug3fZ<21HuOb7Ky0yMJul$5-`x*ZAvhI(yQ}iY#M%-glxPI%KOVQUi-=^`z&{LE6 zVtwQL(bRuzY~Rw|m-$X4`M*nilj}@nPKkYUr<5in>$nbDVrGmSsJnfl&Nj(0K$k#ERd=SEYxRc9h!RjS|1jkceK;5lEJfV`F~AeSd*X z6!v}bN5a2b@w>0XKT&~S7-6TAr9=ZU;u2rl>leQZI&b5>FUh7ILr*^;zKGxc{~6w_ z-MRPQKaC{+1@}5rnNwolTuEs{vX1e)@N*%Aoocfz>k$ov;`fNoBd8%({J!!F>7k_^ zrS^BD#P5OV@%v|_8`yXKHBnTz;`g%hQ}wrH@C*GcM3xc_#E47mimzY%9*Z8o*|cNm z>2>0Z`0fAy|I~kMY~Ad}?_Wle{|)Cl7Xte%1)|}Ed@Gfh^lJa_dHDa6*Q5KZ1(Gwy zb^jIdg&!RLYXT_MC8PezwzSdj5b^)7#Q$kzJwJ8u*b#eac1Z_%nGDl$8n4V%o^Mp9 zs{+dReW`r2!o|Ih?k!SztHRqf$9sS`DUABTio6hl_P{@LX5LbwJ@A7Sd7*W?$LDZO zUX+2~-l6rgbB)QF(~nddjf#B|igBPn$a{;@quw8IbIjMb{#M$yltsEPxAymmcjuqfcYjM}|E*^KJG#Mz?=kw4Z07sEus80a)9*&H zZ-ji~-_=_fk};H44JFmf#`dXszYXJITZjJ7tNzpHL-|$xh5-3p5NkzJ@z){B48NWR zV#EjDi$y$I(S7eewdc*wBj$~H#CyLn;>M+aU}@a<`k`6D`)B;$ zL>Y@QW1jwfdf4l(fAnj58;;>``4JG_BgpFlFQ+py-h3S?%ax2bU-_9~41Mu4N5t3V z-^wUf^oDD*@s-z)us61W81nNo&`^ABU6S#%B-8f+jluWbkZX08^|%K7_zM3%q4Nmp z!F}T}5Sz?le}#V*MrG}`(QOWov67WPdBaUris& z{+50x{VZ#L-LyY`$0mmT@jE&p`)mE1jHe}8W`C-OEL&RxA^V%qd4yK6?62^z(nDGM z>!$tjJ8LoQkKcI<+26o_$aq?kW%j3f$g;IH5VF6OKdh$jw*7(oh!z?Z`NvkZld~3c zR^PWJc$yIe+nnIK!Zx3(oU1md$(b2x2RXslGV8)g_FrzM{ToRa^aWw`XYzu-qqboqY|2{ON$&JLqAT@DspqG>2XJ{-v9C#qY?+uq%GY G{{I2330;o> literal 0 HcmV?d00001 diff --git a/tests/test_anndata.py b/tests/test_anndata.py index c669ce4d..ce412b41 100644 --- a/tests/test_anndata.py +++ b/tests/test_anndata.py @@ -29,6 +29,7 @@ def setup_class(self): filepath_dup_cell = "../tests/data/anndata/dup_cell.h5ad" filepath_nan = "../tests/data/anndata/nan_value.h5ad" filepath_synthetic = "../tests/data/anndata/anndata_test.h5ad" + filepath_boolean = "../tests/data/anndata/anndata_boolean_test.h5ad" self.study_id = "addedfeed000000000000000" self.study_file_id = "dec0dedfeed0000000000000" self.valid_args = [filepath_valid, self.study_id, self.study_file_id] @@ -41,6 +42,7 @@ def setup_class(self): self.dup_cell_args = [filepath_dup_cell, self.study_id, self.study_file_id] self.nan_value_args = [filepath_nan, self.study_id, self.study_file_id] self.synthetic_args = [filepath_synthetic, self.study_id, self.study_file_id] + self.boolean_args = [filepath_boolean, self.study_id, self.study_file_id] self.cluster_name = 'X_tsne' self.valid_kwargs = {'obsm_keys': [self.cluster_name]} self.anndata_ingest = AnnDataIngestor(*self.valid_args, **self.valid_kwargs) @@ -181,6 +183,42 @@ def test_generate_metadata_file(self): expected_types, type_line, 'did not get expected types from metadata body' ) + def test_generate_metadata_with_boolean(self): + boolean_ingest = AnnDataIngestor(*self.boolean_args, **self.valid_kwargs) + adata = boolean_ingest.obtain_adata() + boolean_filename = "h5ad_frag.metadata_boolean.tsv" + boolean_ingest.generate_metadata_file( + adata, boolean_filename + ) + self.assertEqual( + 'bool', adata.obs['is_primary_data'].dtype.name, + 'did not correctly get "bool" dtype for "is_primary_data"' + ) + compressed_file = boolean_filename + ".gz" + with gzip.open(compressed_file, "rt", encoding="utf-8-sig") as metadata_body: + name_line = metadata_body.readline().split("\t") + expected_headers = [ + 'NAME', 'donor_id', 'biosample_id', 'sex', 'species', 'species__ontology_label', + 'library_preparation_protocol', 'library_preparation_protocol__ontology_label', 'organ', + 'organ__ontology_label', 'disease', 'disease__ontology_label', "is_primary_data\n" + ] + self.assertEqual( + expected_headers, name_line, 'did not get expected headers from metadata body' + ) + expected_types = [ + 'TYPE', 'GROUP', 'GROUP', 'GROUP', 'GROUP', 'GROUP', 'GROUP', 'GROUP', 'GROUP', 'GROUP', 'GROUP', + 'GROUP', "GROUP\n" + ] + type_line = metadata_body.readline().split("\t") + self.assertEqual( + expected_types, type_line, 'did not get expected types from metadata body' + ) + for line in metadata_body.readlines(): + is_primary_data = line.split("\t")[12].strip() + self.assertEqual( + "False", is_primary_data, 'did not correctly read boolean value as string from data' + ) + def test_gene_id_indexed_generate_processed_matrix(self): """Tests creating matrix when indexed by Ensembl ID, not gene name