Skip to content

Commit

Permalink
Merge pull request #34 from haessar/fix-ncrna
Browse files: browse the repository at this point in the history
Ensure ncRNA are reintegrated into output
haessar authored Oct 4, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
2 parents 43d2875 + 596c494 commit 3e34391
Showing 5 changed files with 51 additions and 4 deletions.
10 changes: 8 additions & 2 deletions peaks2utr/collections.py
Original file line number Diff line number Diff line change
@@ -44,11 +44,14 @@ def iter_feature_strings(self):
@staticmethod
def _apply_gff_dialect(feature, attrs):
feature.dialect = constants.GFFUTILS_GFF_DIALECT
if feature.featuretype not in constants.FeatureTypes.Gene:
if feature.featuretype not in constants.FeatureTypes.Gene + constants.FeatureTypes.NonCodingGene:
if feature.featuretype in constants.FeatureTypes.GtfTranscript:
attrs['Parent'] = attrs.pop('gene_id')
attrs['ID'] = attrs.pop('transcript_id')
feature.featuretype = constants.FeatureTypes.GffTranscript[0]
elif feature.featuretype in constants.FeatureTypes.NonCodingTranscript:
attrs['Parent'] = attrs.pop('gene_id')
attrs['ID'] = attrs.pop('transcript_id')
else:
attrs.pop('gene_id')
attrs['Parent'] = attrs.pop('transcript_id')
@@ -59,11 +62,14 @@ def _apply_gff_dialect(feature, attrs):
@staticmethod
def _apply_gtf_dialect(feature, attrs, gene_id=None):
feature.dialect = constants.GFFUTILS_GTF_DIALECT
if feature.featuretype not in constants.FeatureTypes.Gene:
if feature.featuretype not in constants.FeatureTypes.Gene + constants.FeatureTypes.NonCodingGene:
if feature.featuretype in constants.FeatureTypes.GffTranscript:
attrs['gene_id'] = attrs.pop('Parent')
attrs['transcript_id'] = attrs.pop('ID')
feature.featuretype = constants.FeatureTypes.GtfTranscript[0]
elif feature.featuretype in constants.FeatureTypes.NonCodingTranscript:
attrs['gene_id'] = attrs.pop('Parent')
attrs['transcript_id'] = attrs.pop('ID')
else:
attrs['gene_id'] = gene_id
attrs['transcript_id'] = attrs.pop('Parent')
2 changes: 2 additions & 0 deletions peaks2utr/constants.py
Original file line number Diff line number Diff line change
@@ -15,10 +15,12 @@ class AnnotationColour:

class FeatureTypes:
    """Feature-type names, grouped by the role they play in an annotation.

    Every group is a plain list so that groups can be joined with ``+``
    (for example ``Gene + NonCodingGene``) and tested with ``in``.
    """

    # Gene-level records.
    Gene = ["gene", "protein_coding_gene"]
    NonCodingGene = ["ncRNA_gene"]

    # UTR records; both capitalisations of "UTR" are listed.
    FivePrimeUTR = ["five_prime_UTR", "five_prime_utr"]
    ThreePrimeUTR = ["three_prime_UTR", "three_prime_utr"]

    # Transcript-level records.
    GffTranscript = ["mRNA"]
    GtfTranscript = ["transcript"]
    NonCodingTranscript = ["ncRNA", "rRNA", "snoRNA", "tRNA"]

    Exon = ["exon"]


2 changes: 1 addition & 1 deletion peaks2utr/postprocess.py
Original file line number Diff line number Diff line change
@@ -38,7 +38,7 @@ def merge_annotations(db, annotations):

db = sqlite3.connect(db, check_same_thread=False)
db = FeatureDB(db)
for gene in db.all_features(featuretype=FeatureTypes.Gene):
for gene in db.all_features(featuretype=FeatureTypes.Gene + FeatureTypes.NonCodingGene):
if gene.id not in annotations:
features = features_dict_for_gene(db, gene)
annotations[gene.id] = features
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = peaks2utr
version = 1.3.2
version = 1.3.3
author = William Haese-Hill
author_email = [email protected]
description = A robust, parallelized Python CLI for annotating three_prime_UTR
39 changes: 39 additions & 0 deletions tests/test_output_formatting.py
Original file line number Diff line number Diff line change
@@ -16,18 +16,29 @@ def setUp(self):
end = 2000
strand = "+"
gene_id = "gene1"
ncRNA_gene_id = "ncRNA_gene1"
argparser = prepare_argparser()
self.args = argparser.parse_args(["", ""])
self.gene_gff = Feature(chr, id=gene_id, featuretype=FeatureTypes.Gene[0], start=start, end=end, strand=strand,
attributes={"ID": [gene_id]})
self.transcript_gff = Feature(chr, id="gene1:mRNA", featuretype=FeatureTypes.GffTranscript[0], start=start, end=end,
strand=strand, attributes={"ID": ["gene1:mRNA"], "Parent": [gene_id]})
self.ncRNA_gene_gff = Feature(chr, id=ncRNA_gene_id, featuretype=FeatureTypes.NonCodingGene[0], start=start, end=end, strand=strand,
attributes={"ID": [ncRNA_gene_id]})
self.ncRNA_feature_gff = Feature(chr, id=ncRNA_gene_id+":t1", featuretype=FeatureTypes.NonCodingTranscript[0], start=start, end=end,
strand=strand, attributes={"ID": [ncRNA_gene_id+":t1"], "Parent": [ncRNA_gene_id]})
self.gene_gtf = Feature(chr, id=gene_id, featuretype=FeatureTypes.Gene[0], source="gffutils_derived",
start=start, end=end, strand=strand, attributes={"gene_id": [gene_id]},
dialect=GFFUTILS_GTF_DIALECT)
self.transcript_gtf = Feature(chr, id="gene1.1", featuretype=FeatureTypes.GtfTranscript[0], start=start, end=end,
strand=strand, attributes={"transcript_id": ["gene1.1"], "gene_id": [gene_id]},
dialect=GFFUTILS_GTF_DIALECT)
self.ncRNA_gene_gtf = Feature(chr, id=ncRNA_gene_id, featuretype=FeatureTypes.NonCodingGene[0], source="gffutils_derived",
start=start, end=end, strand=strand, attributes={"gene_id": [ncRNA_gene_id]},
dialect=GFFUTILS_GTF_DIALECT)
self.ncRNA_feature_gtf = Feature(chr, id=ncRNA_gene_id+".t1", featuretype=FeatureTypes.NonCodingTranscript[0], start=start, end=end,
strand=strand, attributes={"transcript_id": [ncRNA_gene_id+".t1"], "gene_id": [ncRNA_gene_id]},
dialect=GFFUTILS_GTF_DIALECT)
self.utr = UTR(start=start, end=end)
self.db = MagicMock()
self.db.children = MagicMock(return_value=[Feature(id="utr_1", featuretype=FeatureTypes.FivePrimeUTR[0])])
@@ -116,6 +127,34 @@ def test_gff_output_with_gtf_out_flag(self):
output_fn = get_output_filename(self.args)
self.assertRegex(cm.output[0], "WARNING")
self.assertEqual(output_fn, self.args.output)

def test_gtf_to_gff_ncRNA_retention(self):
    """With GTF input and non-GTF output, the ncRNA gene and its child
    feature are both emitted, carrying ``ID=`` / ``Parent=`` attributes."""
    self.args.gtf_in, self.args.gtf_out = True, False
    nc_gene = self.ncRNA_gene_gtf
    want_gene = ["chr1", "gffutils_derived", "ncRNA_gene", "1000", "2000", ".", "+", ".", "ID=ncRNA_gene1"]
    want_child = ["chr1", ".", "ncRNA", "1000", "2000", ".", "+", ".", "ID=ncRNA_gene1.t1;Parent=ncRNA_gene1"]

    annotations = AnnotationsDict(args=self.args)
    entry = {"gene": nc_gene, "feature_0": self.ncRNA_feature_gtf}
    annotations.update({nc_gene.id: entry})

    # Exactly two output lines are expected: the gene, then its child.
    gene_line, child_line = annotations.iter_feature_strings()
    for line, want in ((gene_line, want_gene), (child_line, want_child)):
        self.assertListEqual(line.strip().split("\t"), want)

def test_gff_to_gtf_ncRNA_retention(self):
    """With non-GTF input and GTF output, the ncRNA gene and its child
    feature are both emitted, carrying ``gene_id`` / ``transcript_id``."""
    self.args.gtf_in, self.args.gtf_out = False, True
    nc_gene = self.ncRNA_gene_gff
    want_gene = ["chr1", ".", "ncRNA_gene", "1000", "2000", ".", "+", ".", 'gene_id "ncRNA_gene1";']
    want_child = [
        "chr1", ".", "ncRNA", "1000", "2000", ".", "+", ".",
        'gene_id "ncRNA_gene1"; transcript_id "ncRNA_gene1:t1";',
    ]

    annotations = AnnotationsDict(args=self.args)
    entry = {"gene": nc_gene, "feature_0": self.ncRNA_feature_gff}
    annotations.update({nc_gene.id: entry})

    # Exactly two output lines are expected: the gene, then its child.
    gene_line, child_line = annotations.iter_feature_strings()
    for line, want in ((gene_line, want_gene), (child_line, want_child)):
        self.assertListEqual(line.strip().split("\t"), want)


if __name__ == '__main__':

0 comments on commit 3e34391

Please sign in to comment.