Skip to content

Commit

Permalink
GTF attribute values can contain spaces
Browse files: browse the repository at this point in the history
  • Loading branch information
Redmar-van-den-Berg committed Apr 3, 2024
1 parent 7a79fc2 commit 305a724
Show file tree
Hide file tree
Showing 4 changed files with 7 additions and 1 deletion.
5 changes: 4 additions & 1 deletion bin/fuma-gencode-gtf-to-bed
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@ def attribute_to_dict(line):
# The last 'field' is empty, as GTF lines are ; terminated
if not entry:
continue
key, value = entry.split()
# value can contain spaces
key, *value = entry.split()
# Put spaces back into value
value = ' '.join(value)
data[key] = value.replace('"','')
return data

Expand Down
1 change: 1 addition & 0 deletions tests/data/gencode_hg19.subset.bed
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ chr1 30266 31109 ENSG00000243485.2
chr1 30365 30503 ENSG00000243485.2
chr1 34553 36081 ENSG00000237613.2
chr1 35244 36073 ENSG00000237613.2
chr1 685678 686673 ENSG00000284662
chr1 69090 70008 ENSG00000186092.4
chr1 89294 120932 ENSG00000238009.2
chr1 92229 129217 ENSG00000238009.2
1 change: 1 addition & 0 deletions tests/data/gencode_hg19.subset.gtf
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ chr1 HAVANA exon 110953 111357 . - . gene_id "ENSG00000238009.2"; transcript_id
chr1 HAVANA exon 133374 133566 . - . gene_id "ENSG00000238009.2"; transcript_id "ENST00000453576.2"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-004"; exon_number 1; exon_id "ENSE00001737600.2"; level 2; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003689.1";
chr1 HAVANA exon 129081 129223 . - . gene_id "ENSG00000238009.2"; transcript_id "ENST00000453576.2"; gene_type "lincRNA"; gene_status "NOVEL"; gene_name "RP11-34P13.7"; transcript_type "lincRNA"; transcript_status "KNOWN"; transcript_name "RP11-34P13.7-004"; exon_number 2; exon_id "ENSE00001827073.1"; level 2; havana_gene "OTTHUMG00000001096.2"; havana_transcript "OTTHUMT00000003689.1";
chr1 HAVANA gene 11869 14409 . + . gene_id "ENSG00000223972.5"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; hgnc_id "HGNC:37102"; havana_gene "OTTHUMG00000000961.2";
chr1 ensembl_havana transcript 685679 686673 . - . gene_id "ENSG00000284662"; gene_version "1"; transcript_id "ENST00000332831"; transcript_version "4"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F16-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS41221"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)";
1 change: 1 addition & 0 deletions tests/data/gencode_hg19.subset.sorted.bed
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ chr1 89294 120932 ENSG00000238009.2
chr1 92229 129217 ENSG00000238009.2
chr1 110952 129173 ENSG00000238009.2
chr1 129080 133566 ENSG00000238009.2
chr1 685678 686673 ENSG00000284662

0 comments on commit 305a724

Please sign in to comment.