From b509494c931c3efad267060edad37e7ce8dbfc1b Mon Sep 17 00:00:00 2001 From: William Haese-Hill Date: Fri, 4 Oct 2024 12:39:09 +0100 Subject: [PATCH 1/4] ensure ncRNA are reintegrated into output --- peaks2utr/constants.py | 1 + peaks2utr/postprocess.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/peaks2utr/constants.py b/peaks2utr/constants.py index fa3ceef..b0c8e3b 100644 --- a/peaks2utr/constants.py +++ b/peaks2utr/constants.py @@ -15,6 +15,7 @@ class AnnotationColour: class FeatureTypes: Gene = ['gene', 'protein_coding_gene'] + NonCodingGene = ['ncRNA_gene'] FivePrimeUTR = ['five_prime_UTR', 'five_prime_utr'] ThreePrimeUTR = ['three_prime_UTR', 'three_prime_utr'] GffTranscript = ['mRNA'] diff --git a/peaks2utr/postprocess.py b/peaks2utr/postprocess.py index cf16580..7756296 100644 --- a/peaks2utr/postprocess.py +++ b/peaks2utr/postprocess.py @@ -38,7 +38,7 @@ def merge_annotations(db, annotations): db = sqlite3.connect(db, check_same_thread=False) db = FeatureDB(db) - for gene in db.all_features(featuretype=FeatureTypes.Gene): + for gene in db.all_features(featuretype=FeatureTypes.Gene + FeatureTypes.NonCodingGene): if gene.id not in annotations: features = features_dict_for_gene(db, gene) annotations[gene.id] = features From ec0248dc04f5d82566c5da736067486a970f01e6 Mon Sep 17 00:00:00 2001 From: William Haese-Hill Date: Fri, 4 Oct 2024 12:50:00 +0100 Subject: [PATCH 2/4] increment semver --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 8a76beb..c11cba4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = peaks2utr -version = 1.3.2 +version = 1.3.3 author = William Haese-Hill author_email = william.haese-hill@glasgow.ac.uk description = A robust, parallelized Python CLI for annotating three_prime_UTR From 8d65dc23dc3cea4f27ee37bf3ff317a43c13c3b6 Mon Sep 17 00:00:00 2001 From: William Haese-Hill Date: Fri, 4 Oct 2024 15:40:51 +0100 Subject: [PATCH 3/4] non-coding transcript definition and passing unit tests --- peaks2utr/collections.py | 10 ++++++-- peaks2utr/constants.py | 1 + tests/test_output_formatting.py | 41 +++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 2 deletions(-) diff --git a/peaks2utr/collections.py b/peaks2utr/collections.py index 6953131..e346b85 100644 --- a/peaks2utr/collections.py +++ b/peaks2utr/collections.py @@ -44,11 +44,14 @@ def iter_feature_strings(self): @staticmethod def _apply_gff_dialect(feature, attrs): feature.dialect = constants.GFFUTILS_GFF_DIALECT - if feature.featuretype not in constants.FeatureTypes.Gene: + if feature.featuretype not in constants.FeatureTypes.Gene + constants.FeatureTypes.NonCodingGene: if feature.featuretype in constants.FeatureTypes.GtfTranscript: attrs['Parent'] = attrs.pop('gene_id') attrs['ID'] = attrs.pop('transcript_id') feature.featuretype = constants.FeatureTypes.GffTranscript[0] + elif feature.featuretype in constants.FeatureTypes.NonCodingTranscript: + attrs['Parent'] = attrs.pop('gene_id') + attrs['ID'] = attrs.pop('transcript_id') else: attrs.pop('gene_id') attrs['Parent'] = attrs.pop('transcript_id') @@ -59,11 +62,14 @@ def _apply_gff_dialect(feature, attrs): @staticmethod def _apply_gtf_dialect(feature, attrs, gene_id=None): feature.dialect = constants.GFFUTILS_GTF_DIALECT - if feature.featuretype not in constants.FeatureTypes.Gene: + if feature.featuretype not in constants.FeatureTypes.Gene + constants.FeatureTypes.NonCodingGene: if feature.featuretype in constants.FeatureTypes.GffTranscript: attrs['gene_id'] = attrs.pop('Parent') attrs['transcript_id'] = attrs.pop('ID') feature.featuretype = constants.FeatureTypes.GtfTranscript[0] + elif feature.featuretype in constants.FeatureTypes.NonCodingTranscript: + attrs['gene_id'] = attrs.pop('Parent') + attrs['transcript_id'] = attrs.pop('ID') else: attrs['gene_id'] = gene_id attrs['transcript_id'] = attrs.pop('Parent') diff --git a/peaks2utr/constants.py b/peaks2utr/constants.py index b0c8e3b..0fe20a2 100644 --- a/peaks2utr/constants.py +++ b/peaks2utr/constants.py @@ -20,6 +20,7 @@ class FeatureTypes: ThreePrimeUTR = ['three_prime_UTR', 'three_prime_utr'] GffTranscript = ['mRNA'] GtfTranscript = ['transcript'] + NonCodingTranscript = ['ncRNA', 'rRNA', 'snoRNA', 'tRNA'] Exon = ['exon'] diff --git a/tests/test_output_formatting.py b/tests/test_output_formatting.py index 9fb5005..593e687 100644 --- a/tests/test_output_formatting.py +++ b/tests/test_output_formatting.py @@ -7,6 +7,8 @@ from peaks2utr.models import Feature, UTR from peaks2utr.utils import get_output_filename +NCRNA_FEATURETYPE = "tRNA" + class TestOutputFormatting(unittest.TestCase): @@ -16,18 +18,29 @@ def setUp(self): end = 2000 strand = "+" gene_id = "gene1" + ncRNA_gene_id = "ncRNA_gene1" argparser = prepare_argparser() self.args = argparser.parse_args(["", ""]) self.gene_gff = Feature(chr, id=gene_id, featuretype=FeatureTypes.Gene[0], start=start, end=end, strand=strand, attributes={"ID": [gene_id]}) self.transcript_gff = Feature(chr, id="gene1:mRNA", featuretype=FeatureTypes.GffTranscript[0], start=start, end=end, strand=strand, attributes={"ID": ["gene1:mRNA"], "Parent": [gene_id]}) + self.ncRNA_gene_gff = Feature(chr, id=ncRNA_gene_id, featuretype=FeatureTypes.NonCodingGene[0], start=start, end=end, strand=strand, + attributes={"ID": [ncRNA_gene_id]}) + self.ncRNA_feature_gff = Feature(chr, id=ncRNA_gene_id+":t1", featuretype=NCRNA_FEATURETYPE, start=start, end=end, + strand=strand, attributes={"ID": [ncRNA_gene_id+":t1"], "Parent": [ncRNA_gene_id]}) self.gene_gtf = Feature(chr, id=gene_id, featuretype=FeatureTypes.Gene[0], source="gffutils_derived", start=start, end=end, strand=strand, attributes={"gene_id": [gene_id]}, dialect=GFFUTILS_GTF_DIALECT) self.transcript_gtf = Feature(chr, id="gene1.1", featuretype=FeatureTypes.GtfTranscript[0], start=start, end=end, strand=strand, attributes={"transcript_id": ["gene1.1"], "gene_id": [gene_id]}, dialect=GFFUTILS_GTF_DIALECT) + self.ncRNA_gene_gtf = Feature(chr, id=ncRNA_gene_id, featuretype=FeatureTypes.NonCodingGene[0], source="gffutils_derived", + start=start, end=end, strand=strand, attributes={"gene_id": [ncRNA_gene_id]}, + dialect=GFFUTILS_GTF_DIALECT) + self.ncRNA_feature_gtf = Feature(chr, id=ncRNA_gene_id+".t1", featuretype=NCRNA_FEATURETYPE, start=start, end=end, + strand=strand, attributes={"transcript_id": [ncRNA_gene_id+".t1"], "gene_id": [ncRNA_gene_id]}, + dialect=GFFUTILS_GTF_DIALECT) self.utr = UTR(start=start, end=end) self.db = MagicMock() self.db.children = MagicMock(return_value=[Feature(id="utr_1", featuretype=FeatureTypes.FivePrimeUTR[0])]) @@ -116,6 +129,34 @@ def test_gff_output_with_gtf_out_flag(self): output_fn = get_output_filename(self.args) self.assertRegex(cm.output[0], "WARNING") self.assertEqual(output_fn, self.args.output) + + def test_gtf_to_gff_ncRNA_retention(self): + self.args.gtf_in = True + self.args.gtf_out = False + expected_gene = ["chr1", "gffutils_derived", "ncRNA_gene", "1000", "2000", ".", "+", ".", "ID=ncRNA_gene1"] + expected_feature_0 = ["chr1", ".", "tRNA", "1000", "2000", ".", "+", ".", "ID=ncRNA_gene1.t1;Parent=ncRNA_gene1"] + annotations = AnnotationsDict(args=self.args) + annotations.update({ + self.ncRNA_gene_gtf.id: {"gene": self.ncRNA_gene_gtf, "feature_0": self.ncRNA_feature_gtf} + }) + gene, feature_0 = annotations.iter_feature_strings() + self.assertListEqual(gene.strip().split("\t"), expected_gene) + self.assertListEqual(feature_0.strip().split("\t"), expected_feature_0) + + def test_gff_to_gtf_ncRNA_retention(self): + self.args.gtf_in = False + self.args.gtf_out = True + expected_gene = ["chr1", ".", "ncRNA_gene", "1000", "2000", ".", "+", ".", + 'gene_id "ncRNA_gene1";'] + expected_feature_0 = ["chr1", ".", "tRNA", "1000", "2000", ".", "+", ".", + 'gene_id "ncRNA_gene1"; transcript_id "ncRNA_gene1:t1";'] + annotations = AnnotationsDict(args=self.args) + annotations.update({ + self.ncRNA_gene_gff.id: {"gene": self.ncRNA_gene_gff, "feature_0": self.ncRNA_feature_gff} + }) + gene, feature_0 = annotations.iter_feature_strings() + self.assertListEqual(gene.strip().split("\t"), expected_gene) + self.assertListEqual(feature_0.strip().split("\t"), expected_feature_0) if __name__ == '__main__': From 596c494f79b3f8ec0c0b8890ee31f212344ffe24 Mon Sep 17 00:00:00 2001 From: William Haese-Hill Date: Fri, 4 Oct 2024 15:47:49 +0100 Subject: [PATCH 4/4] generalise test featuretype --- tests/test_output_formatting.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/test_output_formatting.py b/tests/test_output_formatting.py index 593e687..e65cba3 100644 --- a/tests/test_output_formatting.py +++ b/tests/test_output_formatting.py @@ -7,8 +7,6 @@ from peaks2utr.models import Feature, UTR from peaks2utr.utils import get_output_filename -NCRNA_FEATURETYPE = "tRNA" - class TestOutputFormatting(unittest.TestCase): @@ -27,7 +25,7 @@ def setUp(self): strand=strand, attributes={"ID": ["gene1:mRNA"], "Parent": [gene_id]}) self.ncRNA_gene_gff = Feature(chr, id=ncRNA_gene_id, featuretype=FeatureTypes.NonCodingGene[0], start=start, end=end, strand=strand, attributes={"ID": [ncRNA_gene_id]}) - self.ncRNA_feature_gff = Feature(chr, id=ncRNA_gene_id+":t1", featuretype=NCRNA_FEATURETYPE, start=start, end=end, + self.ncRNA_feature_gff = Feature(chr, id=ncRNA_gene_id+":t1", featuretype=FeatureTypes.NonCodingTranscript[0], start=start, end=end, strand=strand, attributes={"ID": [ncRNA_gene_id+":t1"], "Parent": [ncRNA_gene_id]}) self.gene_gtf = Feature(chr, id=gene_id, featuretype=FeatureTypes.Gene[0], source="gffutils_derived", start=start, end=end, strand=strand, attributes={"gene_id": [gene_id]}, @@ -38,7 +36,7 @@ def setUp(self): self.ncRNA_gene_gtf = Feature(chr, id=ncRNA_gene_id, featuretype=FeatureTypes.NonCodingGene[0], source="gffutils_derived", start=start, end=end, strand=strand, attributes={"gene_id": [ncRNA_gene_id]}, dialect=GFFUTILS_GTF_DIALECT) - self.ncRNA_feature_gtf = Feature(chr, id=ncRNA_gene_id+".t1", featuretype=NCRNA_FEATURETYPE, start=start, end=end, + self.ncRNA_feature_gtf = Feature(chr, id=ncRNA_gene_id+".t1", featuretype=FeatureTypes.NonCodingTranscript[0], start=start, end=end, strand=strand, attributes={"transcript_id": [ncRNA_gene_id+".t1"], "gene_id": [ncRNA_gene_id]}, dialect=GFFUTILS_GTF_DIALECT) self.utr = UTR(start=start, end=end) @@ -134,7 +132,7 @@ def test_gtf_to_gff_ncRNA_retention(self): self.args.gtf_in = True self.args.gtf_out = False expected_gene = ["chr1", "gffutils_derived", "ncRNA_gene", "1000", "2000", ".", "+", ".", "ID=ncRNA_gene1"] - expected_feature_0 = ["chr1", ".", "tRNA", "1000", "2000", ".", "+", ".", "ID=ncRNA_gene1.t1;Parent=ncRNA_gene1"] + expected_feature_0 = ["chr1", ".", "ncRNA", "1000", "2000", ".", "+", ".", "ID=ncRNA_gene1.t1;Parent=ncRNA_gene1"] annotations = AnnotationsDict(args=self.args) annotations.update({ self.ncRNA_gene_gtf.id: {"gene": self.ncRNA_gene_gtf, "feature_0": self.ncRNA_feature_gtf} @@ -148,7 +146,7 @@ def test_gff_to_gtf_ncRNA_retention(self): self.args.gtf_out = True expected_gene = ["chr1", ".", "ncRNA_gene", "1000", "2000", ".", "+", ".", 'gene_id "ncRNA_gene1";'] - expected_feature_0 = ["chr1", ".", "tRNA", "1000", "2000", ".", "+", ".", + expected_feature_0 = ["chr1", ".", "ncRNA", "1000", "2000", ".", "+", ".", 'gene_id "ncRNA_gene1"; transcript_id "ncRNA_gene1:t1";'] annotations = AnnotationsDict(args=self.args) annotations.update({