Skip to content

Commit

Permalink
Merge pull request #42 from broadinstitute/dp-scaffold
Browse files Browse the repository at this point in the history
bugfix: make skani dist tolerate empty input
  • Loading branch information
dpark01 authored Mar 19, 2024
2 parents 9c862b9 + f37a5cf commit b986d1a
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 1 deletion.
18 changes: 17 additions & 1 deletion assemble/skani.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import os.path
import shutil
import subprocess
import Bio.SeqIO

TOOL_NAME = "skani"

Expand Down Expand Up @@ -65,6 +66,16 @@ def version(self):
def _get_tool_version(self):
self.tool_version = subprocess.check_output([self.install_and_get_path(), '--version']).decode('UTF-8').strip().split()[1]

def _is_fasta_basically_empty(self, inFasta, min_length=500):
''' Check if a fasta file contains no sequences longer than min_length.
If the fasta contains only sequences < min_length, return True
'''
with open(inFasta, 'r') as inf:
for seq in Bio.SeqIO.parse(inf, 'fasta'):
if len(seq.seq.replace("-","").replace("N","")) >= min_length:
return False
return True

def execute(self, subcommand, args, outfile, threads=None):
''' generic execution of skani
'''
Expand All @@ -88,7 +99,12 @@ def dist(self, query_fasta, ref_fastas, outfile, other_args = (), threads=None):
''' skani dist computes ANI distance between a specified query set of
sequences (MAGs) and reference genomes (database)
'''
self.execute('dist', ['-q', query_fasta, '-r'] + list(ref_fastas) + list(other_args), outfile, threads=threads)
if self._is_fasta_basically_empty(query_fasta):
_log.warning("Query fasta file is empty or contains only very short sequences. Skipping skani dist (which will fail in this scenario).")
with open(outfile, 'w') as outf:
outf.write('\t'.join(['Ref_file','Query_file','ANI','Align_fraction_ref','Align_fraction_query','Ref_name','Query_name','Num_ref_contigs','Num_query_contigs','ANI_5_percentile','ANI_95_percentile','Standard_deviation','Ref_90_ctg_len','Ref_50_ctg_len','Ref_10_ctg_len','Query_90_ctg_len','Query_50_ctg_len','Query_10_ctg_len','Avg_chain_len','Total_bases_covered']) + '\n')
else:
self.execute('dist', ['-q', query_fasta, '-r'] + list(ref_fastas) + list(other_args), outfile, threads=threads)

def find_reference_clusters(self, ref_fastas,
other_args = ('-m', 50, '--no-learned-ani', '--slow', '--robust', '--detailed', '--ci', '--sparse'),
Expand Down
42 changes: 42 additions & 0 deletions test/unit/test_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,48 @@ def test_skani_contigs_to_refs(self):
expected_cluster = set(['ref.lasv.{}.fasta'.format(strain) for strain in ('josiah', 'KGH_G502')])
self.assertEqual(actual_cluster, expected_cluster)

def test_skani_no_big_contigs(self):
'''
Test that skani_dist tolerates empty (or practically empty) input query fasta
'''

inDir = os.path.join(util.file.get_test_input_path(), 'TestOrderAndOrient')
with util.file.tempfnames(('.skani.dist.out', '.skani.dist.filtered', '.clusters.filtered')) \
as (out_skani_dist, out_skani_dist_filtered, out_clusters_filtered):
contigs = os.path.join(inDir, 'contigs.lasv.one_small.fasta')
refs = [os.path.join(inDir, 'ref.ebov.{}.fasta'.format(strain))
for strain in ('lbr', 'sle', 'gin')]

assembly.skani_contigs_to_refs(contigs, refs, out_skani_dist, out_skani_dist_filtered, out_clusters_filtered, threads=1)

with open(out_clusters_filtered, 'r') as inf:
clusters = inf.readlines()
self.assertEqual(len(clusters), 0)
with open(out_skani_dist, 'r') as inf:
lines = inf.readlines()
self.assertEqual(len(lines), 1)

def test_skani_no_matches(self):
'''
Test that skani_dist tolerates empty outputs (no matches)
'''

inDir = os.path.join(util.file.get_test_input_path(), 'TestOrderAndOrient')
with util.file.tempfnames(('.skani.dist.out', '.skani.dist.filtered', '.clusters.filtered')) \
as (out_skani_dist, out_skani_dist_filtered, out_clusters_filtered):
contigs = os.path.join(inDir, 'contigs.lasv.fasta')
refs = [os.path.join(inDir, 'ref.ebov.{}.fasta'.format(strain))
for strain in ('lbr', 'sle', 'gin')]

assembly.skani_contigs_to_refs(contigs, refs, out_skani_dist, out_skani_dist_filtered, out_clusters_filtered, threads=1)

with open(out_clusters_filtered, 'r') as inf:
clusters = inf.readlines()
self.assertEqual(len(clusters), 0)
with open(out_skani_dist, 'r') as inf:
lines = inf.readlines()
self.assertEqual(len(lines), 1)


class TestMutableSequence(unittest.TestCase):
''' Test the MutableSequence class '''
Expand Down

0 comments on commit b986d1a

Please sign in to comment.