-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #43 from DKFZ-ODCF/unidirectional-wgbs-read-reorde…
…ring Unidirectional wgbs read reordering
- Loading branch information
Showing
13 changed files
with
827 additions
and
179 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,42 @@ | ||
# Quality control workflows collection for Roddy | ||
# Alignment and Quality Control Plugin for Roddy | ||
|
||
This plugins contains a series of alignment and quality control related workflows: | ||
- PanCancer alignment workflow for whole genome and exome | ||
- Postmerge QC workflow for whole genome and exome | ||
- Bisulfite core workflow | ||
This plugins contains alignment and quality control related [Roddy](https://github.com/TheRoddyWMS/Roddy) workflows. | ||
|
||
This is the branch for 1.2.73 maintenance branch with only minimal documentation. Please see the master branch for details. | ||
|
||
## Read-Reordering with Unidirectional WGBS Data | ||
|
||
By setting `reorderUnidirectionalWGBSReadPairs` the read-reordering script will be run that decides based on the relative frequencies of TC and AG dinucleotides in both reads, what is the most likely correct orientations of the reads, and may swap the two reads. | ||
|
||
Note that after the swapping, the read-numbers are reversed. What was R1 in the input FASTQ will be R2 in the output BAM, and vice versa. | ||
|
||
Furthermore, not all reads can be unambiguously classified. These unclassified reads are currently dropped. | ||
|
||
The original script with a documentation of the underlying ideas can be found [here](https://github.com/cimbusch/TWGBS.git). | ||
|
||
## Change Logs | ||
|
||
* 1.2.73-3 (branch-specific change) | ||
- Updated unidirectional WGBS read-reordering script from [here](https://github.com/cimbusch/TWGBS.git) | ||
- Actually include the WGBS read-reordering script | ||
|
||
* 1.2.73-2 (branch-specific changes) | ||
- Improved error checking and reporting for BWA and surrounding pipe | ||
|
||
* 1.2.73-1 (branch-specific changes) | ||
- Lifted to Roddy 3.0 release (official LSF-capable release) | ||
- Bugfix with wrong Bash function export | ||
|
||
* 1.2.73 | ||
- Lifted 1.1.73 to Roddy 2.4 (development-only release) | ||
- Fingerprinting support also for WGBS | ||
- sambamba 0.5.9 for sorting and viewing BAMS | ||
- BAM termination sequence check | ||
|
||
* 1.1.73 | ||
- Bugfix mergeOnly step WGBS | ||
- Substituted sambamba-based compression by samtools compression for improved stability, time, and memory consumption | ||
- Tuning (tee -> mbuffer) | ||
- Node-local scratch by default | ||
- Fingerprinting (not for WGBS, yet) | ||
- Bugfix affecting CLIP_INDEX in configuration |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,10 @@ | ||
#!/usr/bin/python | ||
#!/usr/bin/env python | ||
# | ||
# Copyright (c) 2018 German Cancer Research Center (DKFZ). | ||
# | ||
# Distributed under the MIT License (license terms are at https://github.com/cimbusch/TWGBS). | ||
# Commit: a202ae5d9b19c46b348c58523a17a4a7c66d116f | ||
# | ||
""" | ||
Code to reconstruct correct R1-R2 reads relation from base ratio: | ||
|
@@ -8,22 +14,21 @@ | |
read pairs which don't fall into these categories are ignored | ||
Charles Imbusch ([email protected]), G200 | ||
""" | ||
|
||
import gzip | ||
import sys | ||
import argparse | ||
|
||
parser = argparse.ArgumentParser(description='Program trying to reconstruct correct R1-R2 reads relation from base ratio in WGBS data.') | ||
parser = argparse.ArgumentParser(description='Alignment-free program to reconstruct correct R1-R2 reads relation from T/C and A/G base ratio in TWGBS data.') | ||
parser.add_argument('--R1_in', help='R1 input file', required=True) | ||
parser.add_argument('--R2_in', help='R2 input file', required=True) | ||
parser.add_argument('--gzip_input', help='Set if input fastq files are compressed', action='store_true', required=False, default=False) | ||
parser.add_argument('--input_ascii', help='Set if input fastq files are *not* compressed', action='store_true', required=False, default=False) | ||
parser.add_argument('--R1_out', help='R1 output file', required=True) | ||
parser.add_argument('--R2_out', help='R2 output file', required=True) | ||
parser.add_argument('--R1_unassigned', help='Filename R1 for unassigned read pairs', required=True) | ||
parser.add_argument('--R2_unassigned', help='Filename R2 for unassigned read pairs', required=True) | ||
parser.add_argument('--gzip_output', help='Set if output fastq files should be compressed', action='store_true', required=False, default=False) | ||
parser.add_argument('--output_ascii', help='Set if output fastq files should be *not* compressed', action='store_true', required=False, default=False) | ||
parser.add_argument('--log', help='Write basic statistics to this file', required=False, default=False) | ||
parser.add_argument('--debug', help='Debug and write more output to console', action='store_true', required=False, default=False) | ||
|
||
|
@@ -42,15 +47,15 @@ | |
R2_unassigned = None | ||
|
||
# file handlers for input files | ||
if args_dict['gzip_input'] == True: | ||
f = open(args_dict['R1_in'], 'r') | ||
f2 = open(args_dict['R2_in'], 'r') | ||
else: | ||
if args_dict['input_ascii'] == False: | ||
f = gzip.open(args_dict['R1_in'], 'r') | ||
f2 = gzip.open(args_dict['R2_in'], 'r') | ||
else: | ||
f = open(args_dict['R1_in'], 'r') | ||
f2 = open(args_dict['R2_in'], 'r') | ||
|
||
# file handlers for output files | ||
if args_dict['gzip_output'] == True: | ||
if args_dict['output_ascii'] == False: | ||
f_R1_out = gzip.open(args_dict['R1_out'], 'wb') | ||
f_R2_out = gzip.open(args_dict['R2_out'], 'wb') | ||
R1_unassigned = gzip.open(args_dict['R1_unassigned'], 'wb') | ||
|
@@ -71,22 +76,11 @@ | |
|
||
### count A/T/C/G content in a given sequence | ||
def getContent(DNAsequence): | ||
A = 0 | ||
T = 0 | ||
C = 0 | ||
G = 0 | ||
N = 0 | ||
for c in DNAsequence: | ||
if c == 'A': | ||
A+=1 | ||
if c == 'T': | ||
T+=1 | ||
if c == 'C': | ||
C+=1 | ||
if c == 'G': | ||
G+=1 | ||
if c == 'N': | ||
N+=1 | ||
A = DNAsequence.count("A") | ||
T = DNAsequence.count("T") | ||
C = DNAsequence.count("C") | ||
G = DNAsequence.count("G") | ||
N = DNAsequence.count("N") | ||
return (A, T, C, G, N) | ||
|
||
### count base content masking CG/TG content | ||
|
@@ -98,9 +92,9 @@ def getContentR1(DNAsequence): | |
|
||
### count base content masking GC/GT content | ||
def getContentR2(DNAsequence): | ||
DNAsequence = DNAsequence.replace("GC", "") | ||
DNAsequence = DNAsequence.replace("GT", "") | ||
return getContent(DNAsequence) | ||
DNAsequence = DNAsequence.replace("GC", "") | ||
DNAsequence = DNAsequence.replace("GT", "") | ||
return getContent(DNAsequence) | ||
|
||
|
||
if args_dict['debug'] == True: | ||
|
@@ -134,15 +128,8 @@ def getContentR2(DNAsequence): | |
print "==" | ||
print "R1:R2\t" + str(R1_content) + "\t" + str(R2_content) + "\tR1:T:C|R1:A:G\t" + str(R1_T_C) + "\t" +str(R1_A_G) | ||
print "R2:R1\t" + str(R2_content) + "\t" + str(R1_content) + "\tR2:T:C|R2:A:G\t" + str(R2_T_C) + "\t" +str(R2_A_G) | ||
# third idea | ||
#R1R2_layout = ( (R1_T_C > R2_T_C) or (R1_T_C > R1_A_G) ) and ( (R2_A_G > R1_A_G) or (R2_A_G > R2_T_C) ) | ||
#R2R1_layout = ( (R1_T_C < R2_T_C) or (R1_T_C < R1_A_G) ) and ( (R2_A_G < R1_A_G) or (R2_A_G < R2_T_C) ) | ||
# second idea | ||
R1R2_layout = (R1_T_C > R2_T_C) and (R2_A_G > R1_A_G) | ||
R2R1_layout = (R1_T_C < R2_T_C) and (R2_A_G < R1_A_G) | ||
# first idea | ||
#R1R2_layout = (R1_T_C > R1_A_G) and (R2_A_G > R2_T_C) | ||
#R2R1_layout = (R1_T_C < R1_A_G) and (R2_A_G < R2_T_C) | ||
if R1R2_layout: | ||
R1_output = header1 + "\n" + sequence1 + "\n" + spacer1 + "\n" + qual_scores1 + "\n" | ||
R2_output = header2 + "\n" + sequence2 + "\n" + spacer2 + "\n" + qual_scores2 + "\n" | ||
|
Oops, something went wrong.