Merge pull request #43 from DKFZ-ODCF/unidirectional-wgbs-read-reorde…

…ring Unidirectional wgbs read reordering
DKFZ-ODCF · Feb 18, 2019 · fba38f0 · fba38f0
2 parents 3207dfc + 42e2b1e
commit fba38f0
Show file tree

Hide file tree

Showing 13 changed files with 827 additions and 179 deletions.
diff --git a/README.md b/README.md
@@ -1,8 +1,42 @@
-# Quality control workflows collection for Roddy
+# Alignment and Quality Control Plugin for Roddy
 
-This plugins contains a series of alignment and quality control related workflows:
-- PanCancer alignment workflow for whole genome and exome
-- Postmerge QC workflow for whole genome and exome
-- Bisulfite core workflow
+This plugins contains alignment and quality control related [Roddy](https://github.com/TheRoddyWMS/Roddy) workflows.
 
+This is the branch for 1.2.73 maintenance branch with only minimal documentation. Please see the master branch for details.
 
+## Read-Reordering with Unidirectional WGBS Data
+
+By setting `reorderUnidirectionalWGBSReadPairs` the read-reordering script will be run that decides based on the relative frequencies of TC and AG dinucleotides in both reads, what is the most likely correct orientations of the reads, and may swap the two reads.
+
+Note that after the swapping, the read-numbers are reversed. What was R1 in the input FASTQ will be R2 in the output BAM, and vice versa.
+
+Furthermore, not all reads can be unambiguously classified. These unclassified reads are currently dropped. 
+
+The original script with a documentation of the underlying ideas can be found [here](https://github.com/cimbusch/TWGBS.git).
+
+## Change Logs
+
+* 1.2.73-3 (branch-specific change)
+  - Updated unidirectional WGBS read-reordering script from [here](https://github.com/cimbusch/TWGBS.git)
+  - Actually include the WGBS read-reordering script
+
+* 1.2.73-2 (branch-specific changes)
+  - Improved error checking and reporting for BWA and surrounding pipe
+
+* 1.2.73-1 (branch-specific changes)
+  - Lifted to Roddy 3.0 release (official LSF-capable release)
+  - Bugfix with wrong Bash function export
+
+* 1.2.73
+  - Lifted 1.1.73 to Roddy 2.4 (development-only release)
+  - Fingerprinting support also for WGBS
+  - sambamba 0.5.9 for sorting and viewing BAMS
+  - BAM termination sequence check
+
+* 1.1.73
+  - Bugfix mergeOnly step WGBS
+  - Substituted sambamba-based compression by samtools compression for improved stability, time, and memory consumption
+  - Tuning (tee -> mbuffer)
+  - Node-local scratch by default
+  - Fingerprinting (not for WGBS, yet)
+  - Bugfix affecting CLIP_INDEX in configuration 
diff --git a/resources/analysisTools/bisulfiteWorkflow/WGBS_read_pair_reconstruction.py b/resources/analysisTools/bisulfiteWorkflow/WGBS_read_pair_reconstruction.py
@@ -1,4 +1,10 @@
-#!/usr/bin/python
+#!/usr/bin/env python
+#
+# Copyright (c) 2018 German Cancer Research Center (DKFZ).
+#
+# Distributed under the MIT License (license terms are at https://github.com/cimbusch/TWGBS).
+# Commit: a202ae5d9b19c46b348c58523a17a4a7c66d116f
+#
 """
 Code to reconstruct correct R1-R2 reads relation from base ratio:
 
@@ -8,22 +14,21 @@
 read pairs which don't fall into these categories are ignored
 
 Charles Imbusch ([email protected]), G200
-
 """
 
 import gzip
 import sys
 import argparse
 
-parser = argparse.ArgumentParser(description='Program trying to reconstruct correct R1-R2 reads relation from base ratio in WGBS data.')
+parser = argparse.ArgumentParser(description='Alignment-free program to reconstruct correct R1-R2 reads relation from T/C and A/G base ratio in TWGBS data.')
 parser.add_argument('--R1_in', help='R1 input file', required=True)
 parser.add_argument('--R2_in', help='R2 input file', required=True)
-parser.add_argument('--gzip_input', help='Set if input fastq files are compressed', action='store_true', required=False, default=False)
+parser.add_argument('--input_ascii', help='Set if input fastq files are *not* compressed', action='store_true', required=False, default=False)
 parser.add_argument('--R1_out', help='R1 output file', required=True)
 parser.add_argument('--R2_out', help='R2 output file', required=True)
 parser.add_argument('--R1_unassigned', help='Filename R1 for unassigned read pairs', required=True)
 parser.add_argument('--R2_unassigned', help='Filename R2 for unassigned read pairs', required=True)
-parser.add_argument('--gzip_output', help='Set if output fastq files should be compressed', action='store_true', required=False, default=False)
+parser.add_argument('--output_ascii', help='Set if output fastq files should be *not* compressed', action='store_true', required=False, default=False)
 parser.add_argument('--log', help='Write basic statistics to this file', required=False, default=False)
 parser.add_argument('--debug', help='Debug and write more output to console', action='store_true', required=False, default=False)
 
@@ -42,15 +47,15 @@
 R2_unassigned = None
 
 # file handlers for input files
-if args_dict['gzip_input'] == True:
-    f = open(args_dict['R1_in'], 'r')
-    f2 = open(args_dict['R2_in'], 'r')
-else:
+if args_dict['input_ascii'] == False:
     f = gzip.open(args_dict['R1_in'], 'r')
     f2 = gzip.open(args_dict['R2_in'], 'r')
+else:
+    f = open(args_dict['R1_in'], 'r')
+    f2 = open(args_dict['R2_in'], 'r')
 
 # file handlers for output files
-if args_dict['gzip_output'] == True:
+if args_dict['output_ascii'] == False:
     f_R1_out = gzip.open(args_dict['R1_out'], 'wb')
     f_R2_out = gzip.open(args_dict['R2_out'], 'wb')
     R1_unassigned = gzip.open(args_dict['R1_unassigned'], 'wb')
@@ -71,22 +76,11 @@
 
 ### count A/T/C/G content in a given sequence
 def getContent(DNAsequence):
-    A = 0
-    T = 0
-    C = 0
-    G = 0
-    N = 0
-    for c in DNAsequence:
-        if c == 'A':
-            A+=1
-        if c == 'T':
-            T+=1
-        if c == 'C':
-            C+=1
-        if c == 'G':
-            G+=1
-        if c == 'N':
-            N+=1
+    A = DNAsequence.count("A")
+    T = DNAsequence.count("T")
+    C = DNAsequence.count("C")
+    G = DNAsequence.count("G")
+    N = DNAsequence.count("N")
     return (A, T, C, G, N)
 
 ### count base content masking CG/TG content
@@ -98,9 +92,9 @@ def getContentR1(DNAsequence):
 
 ### count base content masking GC/GT content
 def getContentR2(DNAsequence):
-        DNAsequence = DNAsequence.replace("GC", "")
-        DNAsequence = DNAsequence.replace("GT", "")
-        return getContent(DNAsequence)
+    DNAsequence = DNAsequence.replace("GC", "")
+    DNAsequence = DNAsequence.replace("GT", "")
+    return getContent(DNAsequence)
 
 
 if args_dict['debug'] == True:
@@ -134,15 +128,8 @@ def getContentR2(DNAsequence):
         print "=="
         print "R1:R2\t" + str(R1_content) + "\t" + str(R2_content) + "\tR1:T:C|R1:A:G\t" + str(R1_T_C) + "\t" +str(R1_A_G)
         print "R2:R1\t" + str(R2_content) + "\t" + str(R1_content) + "\tR2:T:C|R2:A:G\t" + str(R2_T_C) + "\t" +str(R2_A_G)
-    # third idea
-    #R1R2_layout = ( (R1_T_C > R2_T_C) or (R1_T_C > R1_A_G) ) and ( (R2_A_G > R1_A_G) or (R2_A_G > R2_T_C) )
-    #R2R1_layout = ( (R1_T_C < R2_T_C) or (R1_T_C < R1_A_G) ) and ( (R2_A_G < R1_A_G) or (R2_A_G < R2_T_C) )
-    # second idea
     R1R2_layout = (R1_T_C > R2_T_C) and (R2_A_G > R1_A_G)
     R2R1_layout = (R1_T_C < R2_T_C) and (R2_A_G < R1_A_G)
-    # first idea
-    #R1R2_layout = (R1_T_C > R1_A_G) and (R2_A_G > R2_T_C)
-    #R2R1_layout = (R1_T_C < R1_A_G) and (R2_A_G < R2_T_C)
     if R1R2_layout:
         R1_output = header1 + "\n" + sequence1 + "\n" + spacer1 + "\n" + qual_scores1 + "\n"
         R2_output = header2 + "\n" + sequence2 + "\n" + spacer2 + "\n" + qual_scores2 + "\n"