From e76674881317baed7e2746421a03ebc500612e77 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Tue, 19 Mar 2024 16:51:44 -0400 Subject: [PATCH 1/2] add test for failure --- test/unit/test_assembly.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/test/unit/test_assembly.py b/test/unit/test_assembly.py index 206a3769..7554a71b 100644 --- a/test/unit/test_assembly.py +++ b/test/unit/test_assembly.py @@ -644,6 +644,27 @@ def test_skani_contigs_to_refs(self): expected_cluster = set(['ref.lasv.{}.fasta'.format(strain) for strain in ('josiah', 'KGH_G502')]) self.assertEqual(actual_cluster, expected_cluster) + def test_skani_no_big_contigs(self): + ''' + Test that skani_dist tolerates empty (or practically empty) input query fasta + ''' + + inDir = os.path.join(util.file.get_test_input_path(), 'TestOrderAndOrient') + with util.file.tempfnames(('.skani.dist.out', '.skani.dist.filtered', '.clusters.filtered')) \ + as (out_skani_dist, out_skani_dist_filtered, out_clusters_filtered): + contigs = os.path.join(inDir, 'contigs.lasv.one_small.fasta') + refs = [os.path.join(inDir, 'ref.ebov.{}.fasta'.format(strain)) + for strain in ('lbr', 'sle', 'gin')] + + assembly.skani_contigs_to_refs(contigs, refs, out_skani_dist, out_skani_dist_filtered, out_clusters_filtered, threads=1) + + with open(out_clusters_filtered, 'r') as inf: + clusters = inf.readlines() + self.assertEqual(len(clusters), 0) + with open(out_skani_dist, 'r') as inf: + lines = inf.readlines() + self.assertEqual(len(lines), 1) + class TestMutableSequence(unittest.TestCase): ''' Test the MutableSequence class ''' From 616dfae1eb511b1cf366f2aba427f99fe2704621 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Tue, 19 Mar 2024 17:34:36 -0400 Subject: [PATCH 2/2] more unit tests and fixes to edge case --- assemble/skani.py | 18 +++++++++++++++++- test/unit/test_assembly.py | 21 +++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/assemble/skani.py b/assemble/skani.py index 06182cc9..dec1842c 100644 --- a/assemble/skani.py +++ b/assemble/skani.py @@ -14,6 +14,7 @@ import os.path import shutil import subprocess +import Bio.SeqIO TOOL_NAME = "skani" @@ -65,6 +66,16 @@ def version(self): def _get_tool_version(self): self.tool_version = subprocess.check_output([self.install_and_get_path(), '--version']).decode('UTF-8').strip().split()[1] + def _is_fasta_basically_empty(self, inFasta, min_length=500): + ''' Check if a fasta file contains no sequences longer than min_length. + If the fasta contains only sequences < min_length, return True + ''' + with open(inFasta, 'r') as inf: + for seq in Bio.SeqIO.parse(inf, 'fasta'): + if len(seq.seq.replace("-","").replace("N","")) >= min_length: + return False + return True + def execute(self, subcommand, args, outfile, threads=None): ''' generic execution of skani ''' @@ -88,7 +99,12 @@ def dist(self, query_fasta, ref_fastas, outfile, other_args = (), threads=None): ''' skani dist computes ANI distance between a specified query set of sequences (MAGs) and reference genomes (database) ''' - self.execute('dist', ['-q', query_fasta, '-r'] + list(ref_fastas) + list(other_args), outfile, threads=threads) + if self._is_fasta_basically_empty(query_fasta): + _log.warning("Query fasta file is empty or contains only very short sequences. Skipping skani dist (which will fail in this scenario).") + with open(outfile, 'w') as outf: + outf.write('\t'.join(['Ref_file','Query_file','ANI','Align_fraction_ref','Align_fraction_query','Ref_name','Query_name','Num_ref_contigs','Num_query_contigs','ANI_5_percentile','ANI_95_percentile','Standard_deviation','Ref_90_ctg_len','Ref_50_ctg_len','Ref_10_ctg_len','Query_90_ctg_len','Query_50_ctg_len','Query_10_ctg_len','Avg_chain_len','Total_bases_covered']) + '\n') + else: + self.execute('dist', ['-q', query_fasta, '-r'] + list(ref_fastas) + list(other_args), outfile, threads=threads) def find_reference_clusters(self, ref_fastas, other_args = ('-m', 50, '--no-learned-ani', '--slow', '--robust', '--detailed', '--ci', '--sparse'), diff --git a/test/unit/test_assembly.py b/test/unit/test_assembly.py index 7554a71b..e9aab3a7 100644 --- a/test/unit/test_assembly.py +++ b/test/unit/test_assembly.py @@ -665,6 +665,27 @@ def test_skani_no_big_contigs(self): lines = inf.readlines() self.assertEqual(len(lines), 1) + def test_skani_no_matches(self): + ''' + Test that skani_dist tolerates empty outputs (no matches) + ''' + + inDir = os.path.join(util.file.get_test_input_path(), 'TestOrderAndOrient') + with util.file.tempfnames(('.skani.dist.out', '.skani.dist.filtered', '.clusters.filtered')) \ + as (out_skani_dist, out_skani_dist_filtered, out_clusters_filtered): + contigs = os.path.join(inDir, 'contigs.lasv.fasta') + refs = [os.path.join(inDir, 'ref.ebov.{}.fasta'.format(strain)) + for strain in ('lbr', 'sle', 'gin')] + + assembly.skani_contigs_to_refs(contigs, refs, out_skani_dist, out_skani_dist_filtered, out_clusters_filtered, threads=1) + + with open(out_clusters_filtered, 'r') as inf: + clusters = inf.readlines() + self.assertEqual(len(clusters), 0) + with open(out_skani_dist, 'r') as inf: + lines = inf.readlines() + self.assertEqual(len(lines), 1) + class TestMutableSequence(unittest.TestCase): ''' Test the MutableSequence class '''