From b509494c931c3efad267060edad37e7ce8dbfc1b Mon Sep 17 00:00:00 2001
From: William Haese-Hill <wahhill@gmail.com>
Date: Fri, 4 Oct 2024 12:39:09 +0100
Subject: [PATCH 1/4] ensure ncRNA genes are reintegrated into output

---
 peaks2utr/constants.py   | 1 +
 peaks2utr/postprocess.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/peaks2utr/constants.py b/peaks2utr/constants.py
index fa3ceef..b0c8e3b 100644
--- a/peaks2utr/constants.py
+++ b/peaks2utr/constants.py
@@ -15,6 +15,7 @@ class AnnotationColour:
 
 class FeatureTypes:
     Gene = ['gene', 'protein_coding_gene']
+    NonCodingGene = ['ncRNA_gene']
     FivePrimeUTR = ['five_prime_UTR', 'five_prime_utr']
     ThreePrimeUTR = ['three_prime_UTR', 'three_prime_utr']
     GffTranscript = ['mRNA']
diff --git a/peaks2utr/postprocess.py b/peaks2utr/postprocess.py
index cf16580..7756296 100644
--- a/peaks2utr/postprocess.py
+++ b/peaks2utr/postprocess.py
@@ -38,7 +38,7 @@ def merge_annotations(db, annotations):
 
     db = sqlite3.connect(db, check_same_thread=False)
     db = FeatureDB(db)
-    for gene in db.all_features(featuretype=FeatureTypes.Gene):
+    for gene in db.all_features(featuretype=FeatureTypes.Gene + FeatureTypes.NonCodingGene):
         if gene.id not in annotations:
             features = features_dict_for_gene(db, gene)
             annotations[gene.id] = features

From ec0248dc04f5d82566c5da736067486a970f01e6 Mon Sep 17 00:00:00 2001
From: William Haese-Hill <wahhill@gmail.com>
Date: Fri, 4 Oct 2024 12:50:00 +0100
Subject: [PATCH 2/4] bump version to 1.3.3

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index 8a76beb..c11cba4 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = peaks2utr
-version = 1.3.2
+version = 1.3.3
 author = William Haese-Hill
 author_email = william.haese-hill@glasgow.ac.uk
 description = A robust, parallelized Python CLI for annotating three_prime_UTR

From 8d65dc23dc3cea4f27ee37bf3ff317a43c13c3b6 Mon Sep 17 00:00:00 2001
From: William Haese-Hill <wahhill@gmail.com>
Date: Fri, 4 Oct 2024 15:40:51 +0100
Subject: [PATCH 3/4] define non-coding transcript types and add unit tests

---
 peaks2utr/collections.py        | 10 ++++++--
 peaks2utr/constants.py          |  1 +
 tests/test_output_formatting.py | 41 +++++++++++++++++++++++++++++++++
 3 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/peaks2utr/collections.py b/peaks2utr/collections.py
index 6953131..e346b85 100644
--- a/peaks2utr/collections.py
+++ b/peaks2utr/collections.py
@@ -44,11 +44,14 @@ def iter_feature_strings(self):
     @staticmethod
     def _apply_gff_dialect(feature, attrs):
         feature.dialect = constants.GFFUTILS_GFF_DIALECT
-        if feature.featuretype not in constants.FeatureTypes.Gene:
+        if feature.featuretype not in constants.FeatureTypes.Gene + constants.FeatureTypes.NonCodingGene:
             if feature.featuretype in constants.FeatureTypes.GtfTranscript:
                 attrs['Parent'] = attrs.pop('gene_id')
                 attrs['ID'] = attrs.pop('transcript_id')
                 feature.featuretype = constants.FeatureTypes.GffTranscript[0]
+            elif feature.featuretype in constants.FeatureTypes.NonCodingTranscript:
+                attrs['Parent'] = attrs.pop('gene_id')
+                attrs['ID'] = attrs.pop('transcript_id')
             else:
                 attrs.pop('gene_id')
                 attrs['Parent'] = attrs.pop('transcript_id')
@@ -59,11 +62,14 @@ def _apply_gff_dialect(feature, attrs):
     @staticmethod
     def _apply_gtf_dialect(feature, attrs, gene_id=None):
         feature.dialect = constants.GFFUTILS_GTF_DIALECT
-        if feature.featuretype not in constants.FeatureTypes.Gene:
+        if feature.featuretype not in constants.FeatureTypes.Gene + constants.FeatureTypes.NonCodingGene:
             if feature.featuretype in constants.FeatureTypes.GffTranscript:
                 attrs['gene_id'] = attrs.pop('Parent')
                 attrs['transcript_id'] = attrs.pop('ID')
                 feature.featuretype = constants.FeatureTypes.GtfTranscript[0]
+            elif feature.featuretype in constants.FeatureTypes.NonCodingTranscript:
+                attrs['gene_id'] = attrs.pop('Parent')
+                attrs['transcript_id'] = attrs.pop('ID')
             else:
                 attrs['gene_id'] = gene_id
                 attrs['transcript_id'] = attrs.pop('Parent')
diff --git a/peaks2utr/constants.py b/peaks2utr/constants.py
index b0c8e3b..0fe20a2 100644
--- a/peaks2utr/constants.py
+++ b/peaks2utr/constants.py
@@ -20,6 +20,7 @@ class FeatureTypes:
     ThreePrimeUTR = ['three_prime_UTR', 'three_prime_utr']
     GffTranscript = ['mRNA']
     GtfTranscript = ['transcript']
+    NonCodingTranscript = ['ncRNA', 'rRNA', 'snoRNA', 'tRNA']
     Exon = ['exon']
 
 
diff --git a/tests/test_output_formatting.py b/tests/test_output_formatting.py
index 9fb5005..593e687 100644
--- a/tests/test_output_formatting.py
+++ b/tests/test_output_formatting.py
@@ -7,6 +7,8 @@
 from peaks2utr.models import Feature, UTR
 from peaks2utr.utils import get_output_filename
 
+NCRNA_FEATURETYPE = "tRNA"
+
 
 class TestOutputFormatting(unittest.TestCase):
 
@@ -16,18 +18,29 @@ def setUp(self):
         end = 2000
         strand = "+"
         gene_id = "gene1"
+        ncRNA_gene_id = "ncRNA_gene1"
         argparser = prepare_argparser()
         self.args = argparser.parse_args(["", ""])
         self.gene_gff = Feature(chr, id=gene_id, featuretype=FeatureTypes.Gene[0], start=start, end=end, strand=strand,
                                 attributes={"ID": [gene_id]})
         self.transcript_gff = Feature(chr, id="gene1:mRNA", featuretype=FeatureTypes.GffTranscript[0], start=start, end=end,
                                       strand=strand, attributes={"ID": ["gene1:mRNA"], "Parent": [gene_id]})
+        self.ncRNA_gene_gff = Feature(chr, id=ncRNA_gene_id, featuretype=FeatureTypes.NonCodingGene[0], start=start, end=end, strand=strand,
+                                attributes={"ID": [ncRNA_gene_id]})
+        self.ncRNA_feature_gff = Feature(chr, id=ncRNA_gene_id+":t1", featuretype=NCRNA_FEATURETYPE, start=start, end=end,
+                                         strand=strand, attributes={"ID": [ncRNA_gene_id+":t1"], "Parent": [ncRNA_gene_id]})
         self.gene_gtf = Feature(chr, id=gene_id, featuretype=FeatureTypes.Gene[0], source="gffutils_derived",
                                 start=start, end=end, strand=strand, attributes={"gene_id": [gene_id]},
                                 dialect=GFFUTILS_GTF_DIALECT)
         self.transcript_gtf = Feature(chr, id="gene1.1", featuretype=FeatureTypes.GtfTranscript[0], start=start, end=end,
                                       strand=strand, attributes={"transcript_id": ["gene1.1"], "gene_id": [gene_id]},
                                       dialect=GFFUTILS_GTF_DIALECT)
+        self.ncRNA_gene_gtf = Feature(chr, id=ncRNA_gene_id, featuretype=FeatureTypes.NonCodingGene[0], source="gffutils_derived",
+                                start=start, end=end, strand=strand, attributes={"gene_id": [ncRNA_gene_id]},
+                                dialect=GFFUTILS_GTF_DIALECT)
+        self.ncRNA_feature_gtf = Feature(chr, id=ncRNA_gene_id+".t1", featuretype=NCRNA_FEATURETYPE, start=start, end=end,
+                                         strand=strand, attributes={"transcript_id": [ncRNA_gene_id+".t1"], "gene_id": [ncRNA_gene_id]},
+                                         dialect=GFFUTILS_GTF_DIALECT)
         self.utr = UTR(start=start, end=end)
         self.db = MagicMock()
         self.db.children = MagicMock(return_value=[Feature(id="utr_1", featuretype=FeatureTypes.FivePrimeUTR[0])])
@@ -116,6 +129,34 @@ def test_gff_output_with_gtf_out_flag(self):
             output_fn = get_output_filename(self.args)
         self.assertRegex(cm.output[0], "WARNING")
         self.assertEqual(output_fn, self.args.output)
+    
+    def test_gtf_to_gff_ncRNA_retention(self):
+        self.args.gtf_in = True
+        self.args.gtf_out = False
+        expected_gene = ["chr1", "gffutils_derived", "ncRNA_gene", "1000", "2000", ".", "+", ".", "ID=ncRNA_gene1"]
+        expected_feature_0 = ["chr1", ".", "tRNA", "1000", "2000", ".", "+", ".", "ID=ncRNA_gene1.t1;Parent=ncRNA_gene1"]
+        annotations = AnnotationsDict(args=self.args)
+        annotations.update({
+            self.ncRNA_gene_gtf.id: {"gene": self.ncRNA_gene_gtf, "feature_0": self.ncRNA_feature_gtf}
+        })
+        gene, feature_0 = annotations.iter_feature_strings()
+        self.assertListEqual(gene.strip().split("\t"), expected_gene)
+        self.assertListEqual(feature_0.strip().split("\t"), expected_feature_0)
+    
+    def test_gff_to_gtf_ncRNA_retention(self):
+        self.args.gtf_in = False
+        self.args.gtf_out = True
+        expected_gene = ["chr1", ".", "ncRNA_gene", "1000", "2000", ".", "+", ".",
+                               'gene_id "ncRNA_gene1";']
+        expected_feature_0 = ["chr1", ".", "tRNA", "1000", "2000", ".", "+", ".",
+                               'gene_id "ncRNA_gene1"; transcript_id "ncRNA_gene1:t1";']
+        annotations = AnnotationsDict(args=self.args)
+        annotations.update({
+            self.ncRNA_gene_gff.id: {"gene": self.ncRNA_gene_gff, "feature_0": self.ncRNA_feature_gff}
+        })
+        gene, feature_0 = annotations.iter_feature_strings()
+        self.assertListEqual(gene.strip().split("\t"), expected_gene)
+        self.assertListEqual(feature_0.strip().split("\t"), expected_feature_0)
 
 
 if __name__ == '__main__':

From 596c494f79b3f8ec0c0b8890ee31f212344ffe24 Mon Sep 17 00:00:00 2001
From: William Haese-Hill <wahhill@gmail.com>
Date: Fri, 4 Oct 2024 15:47:49 +0100
Subject: [PATCH 4/4] generalise test featuretype

---
 tests/test_output_formatting.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/tests/test_output_formatting.py b/tests/test_output_formatting.py
index 593e687..e65cba3 100644
--- a/tests/test_output_formatting.py
+++ b/tests/test_output_formatting.py
@@ -7,8 +7,6 @@
 from peaks2utr.models import Feature, UTR
 from peaks2utr.utils import get_output_filename
 
-NCRNA_FEATURETYPE = "tRNA"
-
 
 class TestOutputFormatting(unittest.TestCase):
 
@@ -27,7 +25,7 @@ def setUp(self):
                                       strand=strand, attributes={"ID": ["gene1:mRNA"], "Parent": [gene_id]})
         self.ncRNA_gene_gff = Feature(chr, id=ncRNA_gene_id, featuretype=FeatureTypes.NonCodingGene[0], start=start, end=end, strand=strand,
                                 attributes={"ID": [ncRNA_gene_id]})
-        self.ncRNA_feature_gff = Feature(chr, id=ncRNA_gene_id+":t1", featuretype=NCRNA_FEATURETYPE, start=start, end=end,
+        self.ncRNA_feature_gff = Feature(chr, id=ncRNA_gene_id+":t1", featuretype=FeatureTypes.NonCodingTranscript[0], start=start, end=end,
                                          strand=strand, attributes={"ID": [ncRNA_gene_id+":t1"], "Parent": [ncRNA_gene_id]})
         self.gene_gtf = Feature(chr, id=gene_id, featuretype=FeatureTypes.Gene[0], source="gffutils_derived",
                                 start=start, end=end, strand=strand, attributes={"gene_id": [gene_id]},
@@ -38,7 +36,7 @@ def setUp(self):
         self.ncRNA_gene_gtf = Feature(chr, id=ncRNA_gene_id, featuretype=FeatureTypes.NonCodingGene[0], source="gffutils_derived",
                                 start=start, end=end, strand=strand, attributes={"gene_id": [ncRNA_gene_id]},
                                 dialect=GFFUTILS_GTF_DIALECT)
-        self.ncRNA_feature_gtf = Feature(chr, id=ncRNA_gene_id+".t1", featuretype=NCRNA_FEATURETYPE, start=start, end=end,
+        self.ncRNA_feature_gtf = Feature(chr, id=ncRNA_gene_id+".t1", featuretype=FeatureTypes.NonCodingTranscript[0], start=start, end=end,
                                          strand=strand, attributes={"transcript_id": [ncRNA_gene_id+".t1"], "gene_id": [ncRNA_gene_id]},
                                          dialect=GFFUTILS_GTF_DIALECT)
         self.utr = UTR(start=start, end=end)
@@ -134,7 +132,7 @@ def test_gtf_to_gff_ncRNA_retention(self):
         self.args.gtf_in = True
         self.args.gtf_out = False
         expected_gene = ["chr1", "gffutils_derived", "ncRNA_gene", "1000", "2000", ".", "+", ".", "ID=ncRNA_gene1"]
-        expected_feature_0 = ["chr1", ".", "tRNA", "1000", "2000", ".", "+", ".", "ID=ncRNA_gene1.t1;Parent=ncRNA_gene1"]
+        expected_feature_0 = ["chr1", ".", "ncRNA", "1000", "2000", ".", "+", ".", "ID=ncRNA_gene1.t1;Parent=ncRNA_gene1"]
         annotations = AnnotationsDict(args=self.args)
         annotations.update({
             self.ncRNA_gene_gtf.id: {"gene": self.ncRNA_gene_gtf, "feature_0": self.ncRNA_feature_gtf}
@@ -148,7 +146,7 @@ def test_gff_to_gtf_ncRNA_retention(self):
         self.args.gtf_out = True
         expected_gene = ["chr1", ".", "ncRNA_gene", "1000", "2000", ".", "+", ".",
                                'gene_id "ncRNA_gene1";']
-        expected_feature_0 = ["chr1", ".", "tRNA", "1000", "2000", ".", "+", ".",
+        expected_feature_0 = ["chr1", ".", "ncRNA", "1000", "2000", ".", "+", ".",
                                'gene_id "ncRNA_gene1"; transcript_id "ncRNA_gene1:t1";']
         annotations = AnnotationsDict(args=self.args)
         annotations.update({