
Commit

Merge pull request #27 from metagenlab/single_pair_new_try
Single pair new try
valscherz authored Jul 6, 2020
2 parents 728a7dd + 892c3c2 commit dd52352
Showing 18 changed files with 341 additions and 31 deletions.
2 changes: 1 addition & 1 deletion Snakefile
@@ -11,7 +11,7 @@ if "--use-singularity" in sys.argv:
### Bind the directory of the database to the singularity containers.
workflow.singularity_args += f' -B {config["tax_DB_path"]}:{config["tax_DB_path"]}'
#### Load a dictionary of singularity containers that will be called from each rule
singularity_envs = yaml.safe_load(open(os.path.join(workflow.basedir, "envs/singularity/sing_envs.yml"), 'r'))
singularity_envs = yaml.safe_load(open(os.path.join(workflow.basedir, "envs/singularity/sing_envs.yml"), 'r'))


## Include the pipeline rules
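For orientation, the singularity_envs mapping loaded above is indexed by tool name in the rules below (for example singularity_envs["fastqc"]). A minimal sketch of the expected YAML structure, with invented placeholder container URIs rather than the repository's actual envs/singularity/sing_envs.yml:

import yaml

# Hypothetical sing_envs.yml content: tool name -> container URI (placeholders only).
example_sing_envs = """
fastqc: docker://quay.io/biocontainers/fastqc:0.11.9--0
multiqc: docker://quay.io/biocontainers/multiqc:1.9--py_1
"""

singularity_envs = yaml.safe_load(example_sing_envs)
print(singularity_envs["fastqc"])  # container URI a rule would run in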
@@ -37,6 +37,6 @@ create_phyloseq_fct <- function(physeq_name = "physeq", melted_df_name = "physeq
save(physeq_obj, file = "physeq_object")

### Write this table to a tsv file since it is the basis for downstream analyses and slow to compute
write.table(x = physeq_df, file = "physeq_df.tsv", append = F, quote = F, sep = "\t", eol = "\n", row.names = F, col.names = T )
write.table(x = physeq_df, file = "physeq_df.tsv", append = F, sep = "\t", eol = "\n", row.names = F, col.names = T )

}
37 changes: 24 additions & 13 deletions rules/0_preprocessing/QC_raw_reads.rules
@@ -1,18 +1,16 @@
## Generate MultiQC reports

rule assess_quality_paired_raw_reads_with_fastqc:
rule assess_quality_raw_reads_with_fastqc:
conda:
"../../envs/FastQC.yml"
container:
singularity_envs["fastqc"]
input:
"raw_reads/{sample}_R1.fastq.gz",
"raw_reads/{sample}_R2.fastq.gz",
"raw_reads/{sample}_{suffix}.fastq.gz"
output:
"QC/FastQC/raw_reads/{RUN}/{sample}_R1_fastqc.zip",
"QC/FastQC/raw_reads/{RUN}/{sample}_R2_fastqc.zip",
"QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip"
log:
logging_folder + "QC/FastQC/raw_reads/{RUN}/{sample}_fastqc.txt"
logging_folder + "QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.txt"
threads:
1
shell:
@@ -21,14 +19,24 @@ rule assess_quality_paired_raw_reads_with_fastqc:
"""


rule create_per_run_raw_reads_multiqc_report:
def sample_list_run_QC(RUN):
file_list = []
samples = list(all_samples.index.values[all_samples[config["run_column"]] == RUN])
for s in samples:
suffix_s = reads_ext[s]
combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = RUN, sample = s, suffix = suffix_s)
file_list = file_list + combined_values

return(file_list)


rule create_per_run_multiqc_report:
conda:
"../../envs/MultiQC.yml"
container:
singularity_envs["multiqc"]
input:
lambda wildcards: expand("QC/FastQC/raw_reads/{RUN}/{sample}_R1_fastqc.zip", sample = all_samples.index.values[all_samples[config["run_column"]] == wildcards.RUN], RUN = wildcards.RUN),
lambda wildcards: expand("QC/FastQC/raw_reads/{RUN}/{sample}_R2_fastqc.zip", sample = all_samples.index.values[all_samples[config["run_column"]] == wildcards.RUN], RUN = wildcards.RUN),
lambda wildcards: sample_list_run_QC(wildcards.RUN)
output:
report("QC/{RUN}_multiqc_raw_reads_report.html", caption="report/MultiQC.rst", category="MultiQC", subcategory="{RUN}"),
"QC/{RUN}_multiqc_raw_reads_report_data/multiqc_general_stats.txt",
@@ -41,13 +49,16 @@ rule create_per_run_raw_reads_multiqc_report:
multiqc --interactive -f -n {output[0]} $(dirname {input} | tr "\n" " ") --cl_config "max_table_rows: 100000" &> {log}
"""

def sample_list():
def sample_list_overall_QC():

file_list = []

for i in set(all_samples[config['run_column']]):
combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_R{read_number}_fastqc.zip", RUN = i, sample = all_samples.index.values[all_samples[config["run_column"]] == i], read_number = ["1","2"])
file_list = file_list + combined_values
samples = list(all_samples.index.values[all_samples[config["run_column"]] == i])
for s in samples:
suffix_s = reads_ext[s]
combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = i, sample = s, suffix = suffix_s)
file_list = file_list + combined_values

return(file_list)

@@ -58,7 +69,7 @@ rule create_overall_raw_reads_multiqc_report:
container:
singularity_envs["multiqc"]
input:
sample_list()
sample_list_overall_QC()
output:
report("QC/multiqc_raw_reads_report.html", caption="report/MultiQC.rst", category="MultiQC", subcategory="All run"),
"QC/multiqc_raw_reads_report_data/multiqc_general_stats.txt",
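For reference, the per-run and overall MultiQC inputs are now assembled from the per-sample suffix map instead of hard-coded R1/R2 patterns. A small, self-contained sketch of that behaviour in plain Python (sample names, run labels and the loop standing in for snakemake's expand() are illustrative only):

# Hypothetical per-sample read suffixes and run assignments.
reads_ext = {"sampleA": ["R1", "R2"], "sampleB": ["single"]}
run_of_sample = {"sampleA": "run1", "sampleB": "run1"}

def sample_list_run_qc(run):
    # Collect one FastQC zip per sample and per read suffix for the given run.
    targets = []
    for sample, suffixes in reads_ext.items():
        if run_of_sample[sample] != run:
            continue
        for suffix in suffixes:
            targets.append(f"QC/FastQC/raw_reads/{run}/{sample}_{suffix}_fastqc.zip")
    return targets

print(sample_list_run_qc("run1"))
# ['QC/FastQC/raw_reads/run1/sampleA_R1_fastqc.zip',
#  'QC/FastQC/raw_reads/run1/sampleA_R2_fastqc.zip',
#  'QC/FastQC/raw_reads/run1/sampleB_single_fastqc.zip']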
27 changes: 27 additions & 0 deletions rules/0_preprocessing/get_reads.rules
@@ -43,6 +43,7 @@ reads_sra = {}

reads_local = {}
original_names = {}
reads_ext = {}

if "local_samples" not in config.keys() and "sra_samples" not in config.keys():
raise ValueError("No samples defined in the config file")
@@ -66,6 +67,18 @@ if "local_samples" in config.keys():
sample = str(sorted(list(original_names.keys()))[match.index(True)])
original_correct[i] = original_names[sample]
read_correct[i] = reads_local[sample]

if "LibraryLayout" in list(local_data):
if local_data.loc[i, "LibraryLayout"].lower()=="paired":
reads_ext[i]=["R1", "R2"]
elif local_data.loc[i, "LibraryLayout"].lower()=="single":
reads_ext[i]=["single"]
else:
raise ValueError("Problem in the sample sheet, LibraryLayout badly defined")
else:
reads_ext[i]=["R1", "R2"]


else:
for i in list(local_data["OldSampleName"]):
regex = re.compile(r'%s([^a-zA-Z0-9]|$)' % i)
@@ -76,9 +89,21 @@ if "local_samples" in config.keys():
sample=str(local_data.index[local_data['OldSampleName'] == i][0])
original_correct[sample] = original_names[old_sample_name]
read_correct[sample] = reads_local[old_sample_name]

if "LibraryLayout" in list(local_data):
if local_data.loc[sample, "LibraryLayout"].lower()=="paired":
reads_ext[sample]=["R1", "R2"]
elif local_data.loc[sample, "LibraryLayout"].lower()=="single":
reads_ext[sample]=["single"]
else:
raise ValueError("Problem in the sample sheet, LibraryLayout badly defined")
else:
reads_ext[sample]=["R1", "R2"]

original_names = original_correct
reads_local = read_correct
all_samples=local_data
reads_ext = reads_ext

if "sra_samples" in config.keys():
sra_data = pandas.read_csv(config["sra_samples"], sep="\t", index_col=0).drop_duplicates()
@@ -92,8 +117,10 @@ if "sra_samples" in config.keys():
reads_sra[sample_name]=str(i)
if sra_data.loc[i, "LibraryLayout"].lower()=="paired":
sras_ext[sample_name]=["1.fastq.gz", "2.fastq.gz"]
reads_ext[sample_name]=["R1", "R2"]
elif sra_data.loc[i, "LibraryLayout"].lower()=="single":
sras_ext[sample_name] = ["fastq.gz"]
reads_ext[sample_name]=["single"]
else:
raise ValueError("Problem in the sra file, LibraryLayout badly defined")
all_samples=sra_data
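The new reads_ext dictionary maps each sample to either ["R1", "R2"] or ["single"], driven by an optional LibraryLayout column in the sample sheet and defaulting to paired-end when that column is absent. A self-contained sketch of that mapping with a toy pandas table (sample names and layouts are made up):

import pandas as pd

local_data = pd.DataFrame(
    {"LibraryLayout": ["paired", "single"]},
    index=["sampleA", "sampleB"],
)

reads_ext = {}
for sample in local_data.index:
    if "LibraryLayout" in list(local_data):  # is the column present?
        layout = local_data.loc[sample, "LibraryLayout"].lower()
        if layout == "paired":
            reads_ext[sample] = ["R1", "R2"]
        elif layout == "single":
            reads_ext[sample] = ["single"]
        else:
            raise ValueError("LibraryLayout badly defined")
    else:  # no layout information: assume paired-end
        reads_ext[sample] = ["R1", "R2"]

print(reads_ext)  # {'sampleA': ['R1', 'R2'], 'sampleB': ['single']}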
1 change: 1 addition & 0 deletions rules/0_preprocessing/get_sras.rules
@@ -38,6 +38,7 @@ rule sra_convert_to_fastq_paired:
fastq-dump --split-3 --gzip --log-level 1 --disable-multithreading --minReadLen 0 --outdir $(dirname {output[0]}) {input[0]} &> {log}
"""


rule sra_convert_to_fastq_single:
conda:
"../../envs/sra-tools.yml"
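The paired-end conversion rule above shells out to fastq-dump; the same call rebuilt as a hedged Python sketch, with the flags copied from the rule's shell command and a placeholder accession (the single-end rule is not shown here and may differ):

import subprocess

sra_file = "SRR000001.sra"  # hypothetical accession file
cmd = [
    "fastq-dump", "--split-3", "--gzip",
    "--log-level", "1",
    "--disable-multithreading",
    "--minReadLen", "0",
    "--outdir", "raw_reads",
    sra_file,
]
# --split-3 writes _1/_2 files for paired layouts and a single file otherwise,
# which is why the pipeline keeps separate paired and single conversion rules.
subprocess.run(cmd, check=True)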
25 changes: 23 additions & 2 deletions rules/1_2_DADA2_ASVs/1_cutadapt_trim.rules
@@ -1,6 +1,6 @@
## Remove primers from sequences

rule cutadapt_trim_for_DADA2:
rule cutadapt_trim_paired_for_DADA2:
conda:
"../../envs/cutadapt.yml"
container:
@@ -20,4 +20,25 @@ rule cutadapt_trim_for_DADA2:
threads:
1
script:
"scripts/1_cutadapt.py"
"scripts/1_cutadapt_paired.py"


rule cutadapt_trim_single_for_DADA2:
conda:
"../../envs/cutadapt.yml"
container:
singularity_envs["cutadapt"]
input:
R1_raw_reads = "raw_reads/{sample}_single.fastq.gz",
output:
R1_trimmed_reads = temp("{denoiser}/1a_trimmed_primers/{sample}_trimmed_single.fastq.gz")
log:
logging_folder + "{denoiser}/1a_trimmed_primers/{sample}_single_trimmed.txt"
params:
amplicon_type = config["ITS_or_16S"],
forward_primer = config["forward_primer"],
reverse_primer = config["reverse_primer"],
threads:
1
script:
"scripts/1_cutadapt_single.py"
100 changes: 90 additions & 10 deletions rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules
@@ -1,22 +1,28 @@
## Process trimmed sequences with DADA2 to correct errors and generate ASVs. Follows the "big data" protocol (https://benjjneb.github.io/dada2/bigdata.html)

## Define trimmed or untrimmed input (for reads that were already trimmed)

def DADA2_trim_input():
def DADA2_trim_input_paired():
if config['Trim_primers'] == True :
return ["DADA2/1a_trimmed_primers/{sample}_trimmed_R1.fastq.gz", "DADA2/1a_trimmed_primers/{sample}_trimmed_R2.fastq.gz"]
else :
return ["raw_reads/{sample}_R1.fastq.gz", "raw_reads/{sample}_R2.fastq.gz"]


def DADA2_trim_input_single():
if config['Trim_primers'] == True :
return ["DADA2/1a_trimmed_primers/{sample}_trimmed_single.fastq.gz"]
else :
return ["raw_reads/{sample}_single.fastq.gz"]


### Remove low-quality and too-short reads
rule DADA2_q_filtering:
rule DADA2_q_filtering_paired:
conda:
"../../envs/DADA2_in_R.yml"
container:
singularity_envs["dada2"]
input:
DADA2_trim_input()
DADA2_trim_input_paired()
output:
q_score_filtered_F = temp("DADA2/1b_q_score_filtered_paired/{sample}_filtered_R1.fastq.gz"),
q_score_filtered_R = temp("DADA2/1b_q_score_filtered_paired/{sample}_filtered_R2.fastq.gz"),
@@ -32,10 +38,34 @@ rule DADA2_q_filtering:
threads:
1
script:
"scripts/1_DADA2_q_filtering.R"
"scripts/1_DADA2_q_filtering_paired.R"


rule DADA2_q_filtering_single:
conda:
"../../envs/DADA2_in_R.yml"
container:
singularity_envs["dada2"]
input:
DADA2_trim_input_single()
output:
q_score_filtered_F = temp("DADA2/1b_q_score_filtered_paired/{sample}_filtered_single.fastq.gz"),
q_filtering_stats = "DADA2/1b_q_score_filtered_paired/{sample}_q_score_filtering_stats.rds"
log:
logging_folder + "DADA2/1b_q_score_filtered/{sample}_DADA2_1b_q_score_filtering.txt"
params:
F_reads_length_trim = config["DADA2_F_reads_length_trim"],
F_reads_extected_error = config["F_extected_error"],
sample_name = lambda wildcards: wildcards.sample,
threads:
1
script:
"scripts/1_DADA2_q_filtering_single.R"



### Learn error profile of the runs
rule DADA2_learn_errors:
rule DADA2_learn_errors_paired:
conda:
"../../envs/DADA2_in_R.yml"
container:
@@ -53,10 +83,31 @@ rule DADA2_learn_errors:
threads:
8
script:
"scripts/2a_big_data_DADA2_learn_errors.R"
"scripts/2a_big_data_DADA2_learn_errors_paired.R"

### Infer error free sequences from the filtered reads and the runs error profiles
rule DADA2_infer_ASV:


rule DADA2_learn_errors_single:
conda:
"../../envs/DADA2_in_R.yml"
container:
singularity_envs["dada2"]
input:
q_score_filtered_F = lambda wildcards: expand("DADA2/1b_q_score_filtered_paired/{sample}_filtered_single.fastq.gz", sample = all_samples.index.values[all_samples[config["run_column"]] == wildcards.RUN]),
output:
error_profile_F = "DADA2/2_denoised/{RUN}/error_profile_single.rds",
error_profile_F_plot = report("DADA2/2_denoised/{RUN}/error_profile_single_plot.png", caption="report/DADA2_error_profile.rst", category="DADA2", subcategory="Error profile"),
log:
logging_folder + "DADA2/2_denoised/{RUN}/DADA2_learn_errors.txt"
threads:
8
script:
"scripts/2a_big_data_DADA2_learn_errors_single.R"



## Correct errors based on the error profile of the run
rule DADA2_infer_paired_ASV:
conda:
"../../envs/DADA2_in_R.yml"
container:
@@ -79,7 +130,35 @@ rule DADA2_infer_ASV:
threads:
4
script:
"scripts/2b_big_data_DADA2_infer_ASV.R"
"scripts/2b_big_data_DADA2_infer_ASV_paired.R"



rule DADA2_infer_single_ASV:
conda:
"../../envs/DADA2_in_R.yml"
container:
singularity_envs["dada2"]
input:
q_score_filtered_F = "DADA2/1b_q_score_filtered_paired/{sample}_filtered_single.fastq.gz",
error_profile_F = "DADA2/2_denoised/{RUN}/error_profile_single.rds",
output:
infer_stats = "DADA2/2_denoised/{RUN}/{sample}_infer_stats.rds",
sample_seq_tab = "DADA2/2_denoised/{RUN}/{sample}_infer_seq_tab.rds",
params:
sample_name = lambda wildcards: wildcards.sample,
run = lambda wildcards: wildcards.RUN,
x_column_value = lambda wildcards: all_samples[config["sample_label"]][wildcards.sample],
min_overlap = config["min_overlap"]
log:
logging_folder + "DADA2/2_denoised/{RUN}/{sample}_DADA2_2_infer_ASV.txt"
threads:
4
script:
"scripts/2b_big_data_DADA2_infer_ASV_single.R"




### Merge the ASVs determined from the different samples
rule DADA2_merge_sample_ASV:
@@ -100,6 +179,7 @@ rule DADA2_merge_sample_ASV:
script:
"scripts/2c_big_data_DADA2_merge_ASV.R"


### Merge data from all runs, filter out chimeras and remove too-short merged sequences
rule DADA2_merge_runs_filter_chim:
conda:
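The DADA2_trim_input_paired() and DADA2_trim_input_single() helpers at the top of this file share one pattern: point DADA2 at the primer-trimmed reads when Trim_primers is set, otherwise at the raw reads. Reproduced as a stand-alone sketch (the config value is a made-up example):

config = {"Trim_primers": True}  # illustrative value

def DADA2_trim_input_single():
    # Same switch as in the rules file: trimmed reads if primers were removed,
    # raw reads otherwise.
    if config["Trim_primers"]:
        return ["DADA2/1a_trimmed_primers/{sample}_trimmed_single.fastq.gz"]
    return ["raw_reads/{sample}_single.fastq.gz"]

print(DADA2_trim_input_single())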
@@ -33,7 +33,7 @@
### Reads are filtered based on the number of errors expected for the read (integrating the q-score and the read length). Reads with uncalled nucleotides (N) and any remaining phiX reads are removed as well. Finally, reads are cut to the length set in the config.

### Filter and trim. The filtered reads are written directly while the filtering stats are saved for later compilation.
filtering_stats <- filterAndTrim(fwd = fnFs, filt = q_score_filtered_F, rev = fnRs, filt.rev = q_score_filtered_R, truncLen=c(F_length,R_length), maxN=0, maxEE=c(F_extected_error,R_extected_error), truncQ=2, rm.phix=TRUE, compress=TRUE, multithread=snakemake@threads)
filtering_stats <- filterAndTrim(fwd = fnFs, filt = q_score_filtered_F, rev = fnRs, filt.rev = q_score_filtered_R, truncLen=c(F_length,R_length), maxN=0, maxEE=c(F_extected_error,R_extected_error), truncQ=2, rm.phix=TRUE, compress=TRUE, multithread=snakemake@threads, verbose = TRUE)
filtering_stats <- as.data.frame(filtering_stats)
filtering_stats$Sample <- sample_name

37 changes: 37 additions & 0 deletions rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_single.R
@@ -0,0 +1,37 @@
# Title : DADA2 - Q-score filtering
# Objective : Filter reads based on q-score and length
# Created by: valentinscherz
# Created on: 06.06.19
# Modified from: https://benjjneb.github.io/dada2/bigdata_paired.html

## Redirect R output to the log file
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

## Input
fnFs <- snakemake@input[[1]]

## Output
q_score_filtered_F <- snakemake@output[["q_score_filtered_F"]]
q_filtering_stats_path <- snakemake@output[["q_filtering_stats"]]

## Parameters
F_length <- snakemake@params[["F_reads_length_trim"]]
F_extected_error <- snakemake@params[["F_reads_extected_error"]]
sample_name <- snakemake@params[["sample_name"]]


## Load needed libraries
library(dada2);packageVersion("dada2")

## Filter and trim
### Reads are filtered based on the number of errors expected for the read (integrating the q-score and the read length). Reads with uncalled nucleotides (N) and any remaining phiX reads are removed as well. Finally, reads are cut to the length set in the config.

### Filter and trim. The filtered reads are written directly while the filtering stats are saved for later compilation.
filtering_stats <- filterAndTrim(fwd = fnFs, filt = q_score_filtered_F, truncLen=c(F_length), maxN=0, maxEE=c(F_extected_error), truncQ=2, rm.phix=TRUE, compress=TRUE, multithread=snakemake@threads, verbose = TRUE)
filtering_stats <- as.data.frame(filtering_stats)
filtering_stats$Sample <- sample_name

### Save the stats for this sample in a R object
saveRDS(filtering_stats, file = q_filtering_stats_path)
File renamed without changes.