From c2fbb0bb5ac34a6760355331efa9469cee345d25 Mon Sep 17 00:00:00 2001 From: Felix Raimundo Date: Wed, 1 May 2024 14:11:12 -0400 Subject: [PATCH 1/6] Fix issue in read_bam implementation, it got the wrong contig name --- bioframe/io/fileops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bioframe/io/fileops.py b/bioframe/io/fileops.py index ac580b8..11ac9ba 100644 --- a/bioframe/io/fileops.py +++ b/bioframe/io/fileops.py @@ -243,7 +243,7 @@ def read_bam(fp, chrom=None, start=None, end=None): ( s.qname, s.flag, - s.rname, + s.reference_name, s.pos, s.mapq, s.cigarstring if s.mapq != 0 else np.nan, From 67b6ab50f7af3f21c616b16eba0351c976e7f49e Mon Sep 17 00:00:00 2001 From: Felix Raimundo Date: Sat, 18 May 2024 16:54:46 -0400 Subject: [PATCH 2/6] WIP tests --- tests/test_data/toy.bam | Bin 0 -> 529 bytes tests/test_data/toy.sam | 14 ++++++++++++++ tests/test_fileops.py | 10 ++++++++++ 3 files changed, 24 insertions(+) create mode 100644 tests/test_data/toy.bam create mode 100644 tests/test_data/toy.sam diff --git a/tests/test_data/toy.bam b/tests/test_data/toy.bam new file mode 100644 index 0000000000000000000000000000000000000000..b4a95223dd42074ef46fdd6a671105348116e1f3 GIT binary patch literal 529 zcmb2|=3rp}f&Xj_PR>jW4GhIaUs8RN6A}tI_@3~5+pOi~`}oVp%|?b#oj4>KU3qrM zFx|1Vja}0vT!L3j9rL=?`y>4=zuEt2-&6Ii{l%LyscF7OA0`=Cf8MiMK7T`B z?xgGczS{n1ay}P3LC^F3-8}xFU!9(UC)G7Rz9KN;E|0o0F%b6u-TxaJW$?n)FB;2{S>Fte;D?dMznayaksgzU=jM_E{5`a2!_H*e+rC%0_+>YurtN^*sTT`yzL2>WeeKmcZxxx~p!* zv6$mt@-CBDGT&AD&Xk&N=A%7h@fKVADuKYbJ1JfZ`-N_+8m0!EWs=>w!0wEO{Yj%v zi3O7~C0G7EBGDhn`)l6L6^%E0>Z0%e@-+ toy.bam` + pass From 4e1545ea093a3b2e022fa0a1aece4befa5d21955 Mon Sep 17 00:00:00 2001 From: Felix Raimundo Date: Sun, 19 May 2024 00:41:42 -0400 Subject: [PATCH 3/6] fix broken json serialization --- bioframe/__init__.py | 2 ++ bioframe/io/__init__.py | 2 ++ bioframe/io/fileops.py | 61 ++++++++++++++++++++++++------------ tests/test_data/toy.bam.bai | Bin 0 -> 176 bytes tests/test_fileops.py | 7 +++-- 5 files changed, 49 insertions(+), 23 deletions(-) create mode 100644 tests/test_data/toy.bam.bai diff --git a/bioframe/__init__.py b/bioframe/__init__.py index 69a23b9..259a81f 100644 --- a/bioframe/__init__.py +++ b/bioframe/__init__.py @@ -45,6 +45,7 @@ "fetch_centromeres", "fetch_chromsizes", "load_fasta", + "read_alignment", "read_bam", "read_bigbed", "read_bigwig", @@ -117,6 +118,7 @@ fetch_centromeres, fetch_chromsizes, load_fasta, + read_alignment, read_bam, read_bigbed, read_bigwig, diff --git a/bioframe/io/__init__.py b/bioframe/io/__init__.py index fe4853f..535a438 100644 --- a/bioframe/io/__init__.py +++ b/bioframe/io/__init__.py @@ -2,6 +2,7 @@ from .bed import to_bed from .fileops import ( load_fasta, + read_alignment, read_bam, read_bigbed, read_bigwig, @@ -23,6 +24,7 @@ "read_tabix", "read_pairix", "read_bam", + "read_alignment", "load_fasta", "read_bigwig", "to_bed", diff --git a/bioframe/io/fileops.py b/bioframe/io/fileops.py index 11ac9ba..4821fdb 100644 --- a/bioframe/io/fileops.py +++ b/bioframe/io/fileops.py @@ -1,3 +1,4 @@ +import array import io import json import os @@ -231,35 +232,55 @@ def read_pairix( return df -def read_bam(fp, chrom=None, start=None, end=None): +def read_alignment(fp, chrom=None, start=None, end=None): """ - Read bam records into a DataFrame. + Read alignemnt records into a DataFrame. """ import pysam - with closing(pysam.AlignmentFile(fp, "rb")) as f: - bam_iter = f.fetch(chrom, start, end) - records = [ - ( - s.qname, - s.flag, - s.reference_name, - s.pos, - s.mapq, - s.cigarstring if s.mapq != 0 else np.nan, - s.rnext, - s.pnext, - s.tlen, - s.seq, - s.qual, - json.dumps(OrderedDict(s.tags)), + ext = os.path.splitext(fp)[1] + if ext == '.sam': + mode = 'r' + elif ext == '.bam': + mode = 'rb' + elif ext == '.cram': + mode = 'rc' + else: + raise ValueError(f'{ext} is not a supported filetype') + + with closing(pysam.AlignmentFile(fp, mode)) as f: + records = [] + for s in f.fetch(chrom, start, end): + # Needed because array.array is not json serializable + tags = [(k, v.tolist() if type(v) == array.array else v) for k, v in s.tags] + records.append( + ( + s.qname, + s.flag, + s.reference_name, + s.pos, + s.mapq, + s.cigarstring if s.mapq != 0 else np.nan, + s.rnext, + s.pnext, + s.tlen, + s.seq, + s.qual, + json.dumps(OrderedDict(tags)), + ) ) - for s in bam_iter - ] df = pd.DataFrame(records, columns=BAM_FIELDS) return df +def read_bam(fp, chrom=None, start=None, end=None): + """ + Deprecated: use `read_alignment` instead. + Read bam file into dataframe, + """ + return read_alignment(fp, chrom, start, end) + + class PysamFastaRecord: def __init__(self, ff, ref): self.ff = ff diff --git a/tests/test_data/toy.bam.bai b/tests/test_data/toy.bam.bai new file mode 100644 index 0000000000000000000000000000000000000000..eeb15475a6e0fe8eb41d2554c1a1888b067959ec GIT binary patch literal 176 zcmZ>A^kigWU|;}YPay^dMj*|=&z>% literal 0 HcmV?d00001 diff --git a/tests/test_fileops.py b/tests/test_fileops.py index 6da00ed..cbca3b8 100644 --- a/tests/test_fileops.py +++ b/tests/test_fileops.py @@ -57,9 +57,10 @@ def test_read_beds(): def test_read_sam(): # SAM file taken from https://github.com/samtools/samtools/blob/develop/examples/toy.sam - pass + bioframe.read_alignment('tests/test_data/toy.sam') def test_read_bam(): - # converted toy.sam via `samtools view -bS toy.sam > toy.bam` - pass + # converted toy.sam via `samtools view -bS toy.sam > toy.bam; + # index file created with `samtools index toy.bam` + bioframe.read_alignment('tests/test_data/toy.bam') From ee36a5804a7d756f83cbc41b763eeeb9ae5116cd Mon Sep 17 00:00:00 2001 From: Felix Raimundo Date: Sun, 19 May 2024 00:44:00 -0400 Subject: [PATCH 4/6] catch return value of read_alignements --- tests/test_fileops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_fileops.py b/tests/test_fileops.py index cbca3b8..c408bcc 100644 --- a/tests/test_fileops.py +++ b/tests/test_fileops.py @@ -57,10 +57,10 @@ def test_read_beds(): def test_read_sam(): # SAM file taken from https://github.com/samtools/samtools/blob/develop/examples/toy.sam - bioframe.read_alignment('tests/test_data/toy.sam') + _ = bioframe.read_alignment('tests/test_data/toy.sam') def test_read_bam(): # converted toy.sam via `samtools view -bS toy.sam > toy.bam; # index file created with `samtools index toy.bam` - bioframe.read_alignment('tests/test_data/toy.bam') + _ = bioframe.read_alignment('tests/test_data/toy.bam') From e17539f255bea446f14dc9ed5c543033017c9e8f Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Sun, 19 May 2024 08:42:47 -0400 Subject: [PATCH 5/6] Fix typo --- bioframe/io/fileops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bioframe/io/fileops.py b/bioframe/io/fileops.py index 4821fdb..13fdf76 100644 --- a/bioframe/io/fileops.py +++ b/bioframe/io/fileops.py @@ -234,7 +234,7 @@ def read_pairix( def read_alignment(fp, chrom=None, start=None, end=None): """ - Read alignemnt records into a DataFrame. + Read alignment records into a DataFrame. """ import pysam From 04646a4fe8bcc7f8747d4c2f1dc7d3cd44719a9b Mon Sep 17 00:00:00 2001 From: Felix Raimundo Date: Sun, 19 May 2024 11:32:13 -0400 Subject: [PATCH 6/6] Use dict instead of defaultdict in alignmnent tag serialization --- bioframe/io/fileops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bioframe/io/fileops.py b/bioframe/io/fileops.py index 13fdf76..b511faa 100644 --- a/bioframe/io/fileops.py +++ b/bioframe/io/fileops.py @@ -266,7 +266,7 @@ def read_alignment(fp, chrom=None, start=None, end=None): s.tlen, s.seq, s.qual, - json.dumps(OrderedDict(tags)), + json.dumps(dict(tags)), ) ) df = pd.DataFrame(records, columns=BAM_FIELDS)