Merge pull request #34 from haessar/fix-ncrna

Ensure ncRNA features are reintegrated into the output
haessar · Oct 4, 2024 · 3e34391
2 parents 43d2875 + 596c494
commit 3e34391
Showing 5 changed files with 51 additions and 4 deletions.
diff --git a/peaks2utr/collections.py b/peaks2utr/collections.py
@@ -44,11 +44,14 @@ def iter_feature_strings(self):
     @staticmethod
     def _apply_gff_dialect(feature, attrs):
         feature.dialect = constants.GFFUTILS_GFF_DIALECT
-        if feature.featuretype not in constants.FeatureTypes.Gene:
+        if feature.featuretype not in constants.FeatureTypes.Gene + constants.FeatureTypes.NonCodingGene:
             if feature.featuretype in constants.FeatureTypes.GtfTranscript:
                 attrs['Parent'] = attrs.pop('gene_id')
                 attrs['ID'] = attrs.pop('transcript_id')
                 feature.featuretype = constants.FeatureTypes.GffTranscript[0]
+            elif feature.featuretype in constants.FeatureTypes.NonCodingTranscript:
+                attrs['Parent'] = attrs.pop('gene_id')
+                attrs['ID'] = attrs.pop('transcript_id')
             else:
                 attrs.pop('gene_id')
                 attrs['Parent'] = attrs.pop('transcript_id')
@@ -59,11 +62,14 @@ def _apply_gff_dialect(feature, attrs):
     @staticmethod
     def _apply_gtf_dialect(feature, attrs, gene_id=None):
         feature.dialect = constants.GFFUTILS_GTF_DIALECT
-        if feature.featuretype not in constants.FeatureTypes.Gene:
+        if feature.featuretype not in constants.FeatureTypes.Gene + constants.FeatureTypes.NonCodingGene:
             if feature.featuretype in constants.FeatureTypes.GffTranscript:
                 attrs['gene_id'] = attrs.pop('Parent')
                 attrs['transcript_id'] = attrs.pop('ID')
                 feature.featuretype = constants.FeatureTypes.GtfTranscript[0]
+            elif feature.featuretype in constants.FeatureTypes.NonCodingTranscript:
+                attrs['gene_id'] = attrs.pop('Parent')
+                attrs['transcript_id'] = attrs.pop('ID')
             else:
                 attrs['gene_id'] = gene_id
                 attrs['transcript_id'] = attrs.pop('Parent')

diff --git a/peaks2utr/constants.py b/peaks2utr/constants.py
@@ -15,10 +15,12 @@ class AnnotationColour:
 
 class FeatureTypes:
     Gene = ['gene', 'protein_coding_gene']
+    NonCodingGene = ['ncRNA_gene']
     FivePrimeUTR = ['five_prime_UTR', 'five_prime_utr']
     ThreePrimeUTR = ['three_prime_UTR', 'three_prime_utr']
     GffTranscript = ['mRNA']
     GtfTranscript = ['transcript']
+    NonCodingTranscript = ['ncRNA', 'rRNA', 'snoRNA', 'tRNA']
     Exon = ['exon']
 
 

diff --git a/peaks2utr/postprocess.py b/peaks2utr/postprocess.py
@@ -38,7 +38,7 @@ def merge_annotations(db, annotations):
 
     db = sqlite3.connect(db, check_same_thread=False)
     db = FeatureDB(db)
-    for gene in db.all_features(featuretype=FeatureTypes.Gene):
+    for gene in db.all_features(featuretype=FeatureTypes.Gene + FeatureTypes.NonCodingGene):
         if gene.id not in annotations:
             features = features_dict_for_gene(db, gene)
             annotations[gene.id] = features

diff --git a/setup.cfg b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = peaks2utr
-version = 1.3.2
+version = 1.3.3
 author = William Haese-Hill
 author_email = [email protected]
 description = A robust, parallelized Python CLI for annotating three_prime_UTR

diff --git a/tests/test_output_formatting.py b/tests/test_output_formatting.py
@@ -16,18 +16,29 @@ def setUp(self):
         end = 2000
         strand = "+"
         gene_id = "gene1"
+        ncRNA_gene_id = "ncRNA_gene1"
         argparser = prepare_argparser()
         self.args = argparser.parse_args(["", ""])
         self.gene_gff = Feature(chr, id=gene_id, featuretype=FeatureTypes.Gene[0], start=start, end=end, strand=strand,
                                 attributes={"ID": [gene_id]})
         self.transcript_gff = Feature(chr, id="gene1:mRNA", featuretype=FeatureTypes.GffTranscript[0], start=start, end=end,
                                       strand=strand, attributes={"ID": ["gene1:mRNA"], "Parent": [gene_id]})
+        self.ncRNA_gene_gff = Feature(chr, id=ncRNA_gene_id, featuretype=FeatureTypes.NonCodingGene[0], start=start, end=end, strand=strand,
+                                attributes={"ID": [ncRNA_gene_id]})
+        self.ncRNA_feature_gff = Feature(chr, id=ncRNA_gene_id+":t1", featuretype=FeatureTypes.NonCodingTranscript[0], start=start, end=end,
+                                         strand=strand, attributes={"ID": [ncRNA_gene_id+":t1"], "Parent": [ncRNA_gene_id]})
         self.gene_gtf = Feature(chr, id=gene_id, featuretype=FeatureTypes.Gene[0], source="gffutils_derived",
                                 start=start, end=end, strand=strand, attributes={"gene_id": [gene_id]},
                                 dialect=GFFUTILS_GTF_DIALECT)
         self.transcript_gtf = Feature(chr, id="gene1.1", featuretype=FeatureTypes.GtfTranscript[0], start=start, end=end,
                                       strand=strand, attributes={"transcript_id": ["gene1.1"], "gene_id": [gene_id]},
                                       dialect=GFFUTILS_GTF_DIALECT)
+        self.ncRNA_gene_gtf = Feature(chr, id=ncRNA_gene_id, featuretype=FeatureTypes.NonCodingGene[0], source="gffutils_derived",
+                                start=start, end=end, strand=strand, attributes={"gene_id": [ncRNA_gene_id]},
+                                dialect=GFFUTILS_GTF_DIALECT)
+        self.ncRNA_feature_gtf = Feature(chr, id=ncRNA_gene_id+".t1", featuretype=FeatureTypes.NonCodingTranscript[0], start=start, end=end,
+                                         strand=strand, attributes={"transcript_id": [ncRNA_gene_id+".t1"], "gene_id": [ncRNA_gene_id]},
+                                         dialect=GFFUTILS_GTF_DIALECT)
         self.utr = UTR(start=start, end=end)
         self.db = MagicMock()
         self.db.children = MagicMock(return_value=[Feature(id="utr_1", featuretype=FeatureTypes.FivePrimeUTR[0])])
@@ -116,6 +127,34 @@ def test_gff_output_with_gtf_out_flag(self):
             output_fn = get_output_filename(self.args)
         self.assertRegex(cm.output[0], "WARNING")
         self.assertEqual(output_fn, self.args.output)
+
+    def test_gtf_to_gff_ncRNA_retention(self):
+        self.args.gtf_in = True
+        self.args.gtf_out = False
+        expected_gene = ["chr1", "gffutils_derived", "ncRNA_gene", "1000", "2000", ".", "+", ".", "ID=ncRNA_gene1"]
+        expected_feature_0 = ["chr1", ".", "ncRNA", "1000", "2000", ".", "+", ".", "ID=ncRNA_gene1.t1;Parent=ncRNA_gene1"]
+        annotations = AnnotationsDict(args=self.args)
+        annotations.update({
+            self.ncRNA_gene_gtf.id: {"gene": self.ncRNA_gene_gtf, "feature_0": self.ncRNA_feature_gtf}
+        })
+        gene, feature_0 = annotations.iter_feature_strings()
+        self.assertListEqual(gene.strip().split("\t"), expected_gene)
+        self.assertListEqual(feature_0.strip().split("\t"), expected_feature_0)
+
+    def test_gff_to_gtf_ncRNA_retention(self):
+        self.args.gtf_in = False
+        self.args.gtf_out = True
+        expected_gene = ["chr1", ".", "ncRNA_gene", "1000", "2000", ".", "+", ".",
+                               'gene_id "ncRNA_gene1";']
+        expected_feature_0 = ["chr1", ".", "ncRNA", "1000", "2000", ".", "+", ".",
+                               'gene_id "ncRNA_gene1"; transcript_id "ncRNA_gene1:t1";']
+        annotations = AnnotationsDict(args=self.args)
+        annotations.update({
+            self.ncRNA_gene_gff.id: {"gene": self.ncRNA_gene_gff, "feature_0": self.ncRNA_feature_gff}
+        })
+        gene, feature_0 = annotations.iter_feature_strings()
+        self.assertListEqual(gene.strip().split("\t"), expected_gene)
+        self.assertListEqual(feature_0.strip().split("\t"), expected_feature_0)
 
 
 if __name__ == '__main__':