From 305a724e4738aa302c32ea20935373c439a67029 Mon Sep 17 00:00:00 2001 From: Redmar van den Berg Date: Fri, 29 Jul 2022 10:10:18 +0200 Subject: [PATCH] GTF attribute values can contain spaces --- bin/fuma-gencode-gtf-to-bed | 5 ++++- tests/data/gencode_hg19.subset.bed | 1 + tests/data/gencode_hg19.subset.gtf | 1 + tests/data/gencode_hg19.subset.sorted.bed | 1 + 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/bin/fuma-gencode-gtf-to-bed b/bin/fuma-gencode-gtf-to-bed index d3443ea..b8eff79 100755 --- a/bin/fuma-gencode-gtf-to-bed +++ b/bin/fuma-gencode-gtf-to-bed @@ -26,7 +26,10 @@ def attribute_to_dict(line): # The last 'field' is empty, as GTF lines are ; terminated if not entry: continue - key, value = entry.split() + # value can contain spaces + key, *value = entry.split() + # Put spaces back into value + value = ' '.join(value) data[key] = value.replace('"','') return data diff --git a/tests/data/gencode_hg19.subset.bed b/tests/data/gencode_hg19.subset.bed index 307f345..fb9e959 100644 --- a/tests/data/gencode_hg19.subset.bed +++ b/tests/data/gencode_hg19.subset.bed @@ -5,6 +5,7 @@ chr1 30266 31109 ENSG00000243485.2 chr1 30365 30503 ENSG00000243485.2 chr1 34553 36081 ENSG00000237613.2 chr1 35244 36073 ENSG00000237613.2 +chr1 685678 686673 ENSG00000284662 chr1 69090 70008 ENSG00000186092.4 chr1 89294 120932 ENSG00000238009.2 chr1 92229 129217 ENSG00000238009.2 diff --git a/tests/data/gencode_hg19.subset.gtf b/tests/data/gencode_hg19.subset.gtf index 4a1f69b..f8e1701 100644 --- a/tests/data/gencode_hg19.subset.gtf +++ b/tests/data/gencode_hg19.subset.gtf @@ -26,3 +26,4 @@ chr1 HAVANA exon 110953 111357 . - . gene_id "ENSG00000238009.2"; transcript_id chr1 HAVANA exon 133374 133566 . - . gene_id "ENSG00000238009.2"; transcript_id "ENST00000453576.2"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-004"; exon_number 1; exon_id "ENSE00001737600.2"; level 2; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003689.1"; chr1 HAVANA exon 129081 129223 . - . gene_id "ENSG00000238009.2"; transcript_id "ENST00000453576.2"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-004"; exon_number 2; exon_id "ENSE00001827073.1"; level 2; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003689.1"; chr1 HAVANA gene 11869 14409 . + . gene_id "ENSG00000223972.5"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; hgnc_id "HGNC:37102"; havana_gene "OTTHUMG00000000961.2"; +chr1 ensembl_havana transcript 685679 686673 . - . gene_id "ENSG00000284662"; gene_version "1"; transcript_id "ENST00000332831"; transcript_version "4"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F16-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS41221"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)"; diff --git a/tests/data/gencode_hg19.subset.sorted.bed b/tests/data/gencode_hg19.subset.sorted.bed index 0f47317..2f1c582 100644 --- a/tests/data/gencode_hg19.subset.sorted.bed +++ b/tests/data/gencode_hg19.subset.sorted.bed @@ -8,3 +8,4 @@ chr1 89294 120932 ENSG00000238009.2 chr1 92229 129217 ENSG00000238009.2 chr1 110952 129173 ENSG00000238009.2 chr1 129080 133566 ENSG00000238009.2 +chr1 685678 686673 ENSG00000284662