-
Notifications
You must be signed in to change notification settings - Fork 25
/
Copy pathfilterMethylatedReads.py
executable file
·151 lines (126 loc) · 4.38 KB
/
filterMethylatedReads.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/env python
import sys
import pysam
import argparse
parser = argparse.ArgumentParser(description= """
DESCRIPTION
Filter out reads from a bam file if they contain more than a threshold level of
methylated reads in non-CpG context.
Input is SAM or BAM file, typically produced by bismark, where the XM tag can
be extracted (e.g: XM:Z:......z....xz...h..h.)
USAGE
filterMethylatedReads.py -i <myreads.bam> -F 2 > filtered.bam
MEMO: Bismark codes for methylation are:
---------------------------------
z unmethylated C in CpG context
Z methylated C in CpG context
x unmethylated C in CHG context
X methylated C in CHG context
h unmethylated C in CHH context
H methylated C in CHH context
---------------------------------
""", formatter_class= argparse.RawTextHelpFormatter)
parser.add_argument('--input', '-i',
required= True,
help='''Input bam or sam file or - to read from stdin
''')
parser.add_argument('--isam', '-S',
required= False,
action= 'store_true',
help='''Input stream is SAM. Default is BAM.
''')
parser.add_argument('--outbam', '-b',
required= False,
action= 'store_true',
help='''Write to stdout as BAM. Defualt is to write SAM.
''')
parser.add_argument('--filter', '-F',
required= True,
type= str,
help='''Filter reads containing methylated calls in non-CpG
context. If int, reads with more non-CpG calls than F will be filtered out.
If float between 0 and 1, reads with percentage of non-CpG calls greater than
F will be removed.
''')
args = parser.parse_args()
# -----------------------------------------------------------------------------
def getTag(tagList, tag):
"""Return the index associated to to the tag from the list
of tags in pysam.AlignedRead.tags
E.g. getTag([('AS', -8), ('XN', 0), ('XM', 0)], 'AS') >>> 0
"""
i= 0
for t, v in tagList:
if t == tag:
return(i)
i += 1
return(None)
def countMethylation(xm):
"""Produce a dictionary of counts for each character in the XM tag
countMethylation('......z....xz...h..h.') >>> {'z': 2, 'h': 2, 'X': 0, 'H': 0, 'x': 1, 'Z': 0}
"""
metDict= {'z': xm.count('z'),
'Z': xm.count('Z'),
'x': xm.count('x'),
'X': xm.count('X'),
'h': xm.count('h'),
'H': xm.count('H')}
return(metDict)
def filterType(filter):
"""Return True if string 'filter' is int, False if filetr is a float
between 0 and 1, None otherwise
filterType('1') ## True
filterType('0') ## True
filterType('1.0') ## False
filterType('0.0') ## False
filterType('1.1') ## None
filterType('foo') ## None
filterType('-1') ## None
"""
if filter.isdigit():
if int(filter) < 0:
return(None)
else:
return(True)
else:
try:
filter= float(filter)
except ValueError:
return(None)
if filter >= 0 and filter <= 1:
return(False)
else:
return(None)
# -----------------------------------------------------------------------------
if args.input.endswith('.sam'):
samfile = pysam.Samfile(args.input, "r")
elif args.input.endswith('.bam'):
samfile = pysam.Samfile(args.input, "rb")
elif args.input == '-' and args.isam:
samfile = pysam.Samfile("-", "r")
elif args.input == '-' and not args.isam:
samfile = pysam.Samfile("-", "rb")
else:
sys.exit('Extension of input file must be .sam or .bam')
if args.outbam:
outfile = pysam.Samfile( "-", "wb", template = samfile)
else:
outfile = pysam.Samfile( "-", "w", template = samfile)
isint= filterType(args.filter)
if isint is None:
sys.exit('Invalid value for filter: "%s"' %(args.filter))
for read in samfile:
xmidx= getTag(read.tags, 'XM')
xm= read.tags[xmidx][1]
cntDict= countMethylation(xm)
cntXH= cntDict['X'] + cntDict['H']
## print(cntDict, cntXH)
if isint:
if cntXH <= int(args.filter):
outfile.write(read)
else:
if (float(cntXH) / len(xm)) <= float(args.filter):
outfile.write(read)
outfile.close()
samfile.close()
sys.exit()