-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfasta_parsing.py
66 lines (58 loc) · 2.46 KB
/
fasta_parsing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import logging
import re
from typing import Dict, List
# These functions are (slightly) modified from AntiSMASH v.7.0.1 (antismash/antismash/common/fasta.py)
def write_fasta(headers: List[str], seqs: List[str], filename: str, id_only: bool = False) -> None:
    """ Writes header/sequence pairs to file in FASTA format

        Arguments:
            headers: a list of sequence headers, one per sequence
            seqs: a list of sequences as strings, in the same order as headers
            filename: the filename to write the FASTA formatted data to
            id_only: if true, each header is shortened before it is written:
                     every "tr|" and "sp|" marker is removed, the remainder is
                     split on any of the characters "_ , ( ) |" or a space, and
                     only the first field is kept. Headers containing "WP_" or
                     "XP_" keep their first two fields, rejoined with "_".
                     If false, headers are written unchanged.

        Returns:
            None
    """
    # Raw-string patterns (a plain "\|" is an invalid string escape), built
    # once here instead of on every iteration of the loop below.
    db_marker = re.compile(r"tr\||sp\|")
    field_separator = re.compile(r"[_,()| ]")

    with open(filename, "w", encoding="utf-8") as out_file:
        # zip() stops at the shorter of the two lists, so any unpaired
        # trailing headers or sequences are not written.
        for header, seq in zip(headers, seqs):
            if id_only:
                header = db_marker.sub("", header)
                fields = field_separator.split(header)
                if "WP_" in header or "XP_" in header:
                    # "_" is itself a separator, so an identifier such as
                    # "WP_012345.1" is split in two; join the halves back up.
                    name = "_".join(fields[:2])
                else:
                    name = fields[0]
            else:
                name = header
            out_file.write(f">{name}\n{seq}\n")
def read_fasta(filename: str) -> Dict[str, str]:
    """ Parses a FASTA file into a mapping of identifier to sequence

        Blank lines are skipped. Spaces in a header line are replaced with
        underscores to form the identifier, and the lines of a multi-line
        sequence are concatenated. If an identifier occurs more than once,
        the sequence of its last occurrence is the one returned.

        Arguments:
            filename: the path to the FASTA file to read

        Returns:
            a dictionary mapping sequence ID to sequence

        Raises:
            ValueError: if sequence data appears before the first header, if a
                        sequence line contains anything other than letters and
                        "-", if any header has no sequence, or if the file
                        holds no records at all
    """
    names: List[str] = []
    records: List[str] = []
    pieces: List[str] = []  # lines of the record currently being collected

    with open(filename, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not text:
                continue

            if text.startswith(">"):
                names.append(text[1:].replace(" ", "_"))
                # A new header closes whatever record was being collected.
                if pieces:
                    records.append("".join(pieces))
                    pieces = []
                continue

            if not names:
                raise ValueError("Sequence before identifier in fasta file")
            # Gap characters are allowed, so swap them for a letter before
            # checking that the line is purely alphabetic.
            if not text.replace("-", "z").isalpha():
                raise ValueError("Sequence contains non-alphabetic characters")
            pieces.append(text)

    # The last record has no following header to close it.
    if pieces:
        records.append("".join(pieces))

    # A header without any sequence lines leaves the two lists out of step.
    if len(names) != len(records):
        raise ValueError("Fasta files contains different counts of sequences and ids")
    if not names:
        logging.debug("Fasta file %s contains no sequences", filename)
        raise ValueError("Fasta file contains no sequences")

    return dict(zip(names, records))