From 0300154acac2a09a6032dcfa18c48848e0ca150b Mon Sep 17 00:00:00 2001 From: Valentin Date: Mon, 25 May 2020 16:58:58 +0200 Subject: [PATCH 01/24] WIP --- rules/0_preprocessing/QC_raw_reads.rules | 10 ++++------ rules/0_preprocessing/get_sras.rules | 3 ++- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/rules/0_preprocessing/QC_raw_reads.rules b/rules/0_preprocessing/QC_raw_reads.rules index cb0e25e4..98ca63a0 100644 --- a/rules/0_preprocessing/QC_raw_reads.rules +++ b/rules/0_preprocessing/QC_raw_reads.rules @@ -1,18 +1,16 @@ ## Generate MultiQC reports -rule assess_quality_paired_raw_reads_with_fastqc: +rule assess_quality_raw_reads_with_fastqc: conda: "../../envs/FastQC.yml" container: singularity_envs["fastqc"] input: - "raw_reads/{sample}_R1.fastq.gz", - "raw_reads/{sample}_R2.fastq.gz", + "raw_reads/{sample}_{suffix}.fastq.gz" output: - "QC/FastQC/raw_reads/{RUN}/{sample}_R1_fastqc.zip", - "QC/FastQC/raw_reads/{RUN}/{sample}_R2_fastqc.zip", + "QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip" log: - logging_folder + "QC/FastQC/raw_reads/{RUN}/{sample}_fastqc.txt" + logging_folder + "QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.txt" threads: 1 shell: diff --git a/rules/0_preprocessing/get_sras.rules b/rules/0_preprocessing/get_sras.rules index 7b0f3d50..84e07044 100644 --- a/rules/0_preprocessing/get_sras.rules +++ b/rules/0_preprocessing/get_sras.rules @@ -38,6 +38,7 @@ rule sra_convert_to_fastq_paired: fastq-dump --split-3 --gzip --log-level 1 --disable-multithreading --minReadLen 0 --outdir $(dirname {output[0]}) {input[0]} &> {log} """ + rule sra_convert_to_fastq_single: conda: "../../envs/sra-tools.yml" @@ -46,7 +47,7 @@ rule sra_convert_to_fastq_single: input: "ncbi/public/sra/{sample}.sra" output: - temp(link_directory + "{sample,[A-Za-z0-9]+}.fastq.gz"), + temp(link_directory + "{sample,[A-Za-z0-9]+}_fastq.gz"), log: logging_folder + "logs/sra_download/{sample}_dump.txt" shell: From b02253aed380c034fd7240ebce6e420e3e1220a5 Mon Sep 17 00:00:00 2001 From: valscherz Date: Mon, 25 May 2020 17:28:26 +0200 Subject: [PATCH 02/24] WIP --- rules/0_preprocessing/get_sras.rules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rules/0_preprocessing/get_sras.rules b/rules/0_preprocessing/get_sras.rules index 84e07044..c24f4277 100644 --- a/rules/0_preprocessing/get_sras.rules +++ b/rules/0_preprocessing/get_sras.rules @@ -47,7 +47,7 @@ rule sra_convert_to_fastq_single: input: "ncbi/public/sra/{sample}.sra" output: - temp(link_directory + "{sample,[A-Za-z0-9]+}_fastq.gz"), + temp(link_directory + "{sample,[A-Za-z0-9]+}.fastq.gz"), log: logging_folder + "logs/sra_download/{sample}_dump.txt" shell: From fe9278e1975963373e2beb2b254774c6261f3d26 Mon Sep 17 00:00:00 2001 From: valscherz Date: Mon, 25 May 2020 17:53:48 +0200 Subject: [PATCH 03/24] WIP --- rules/0_preprocessing/QC_raw_reads.rules | 26 ++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/rules/0_preprocessing/QC_raw_reads.rules b/rules/0_preprocessing/QC_raw_reads.rules index 98ca63a0..d5e8cef6 100644 --- a/rules/0_preprocessing/QC_raw_reads.rules +++ b/rules/0_preprocessing/QC_raw_reads.rules @@ -19,7 +19,29 @@ rule assess_quality_raw_reads_with_fastqc: """ -rule create_per_run_raw_reads_multiqc_report: + +rule create_per_run_single_raw_reads_multiqc_report: + conda: + "../../envs/MultiQC.yml" + container: + singularity_envs["multiqc"] + input: + lambda wildcards: expand("QC/FastQC/raw_reads/{RUN}/{sample}_single_fastqc.zip", sample = 
all_samples.index.values[all_samples[config["run_column"]] == wildcards.RUN], RUN = wildcards.RUN), + output: + report("QC/{RUN}_multiqc_raw_reads_report.html", caption="report/MultiQC.rst", category="MultiQC", subcategory="{RUN}"), + "QC/{RUN}_multiqc_raw_reads_report_data/multiqc_general_stats.txt", + log: + logging_folder + "QC/{RUN}_multiqc_raw_reads_report.txt" + priority: + 100 + shell: + """ + multiqc --interactive -f -n {output[0]} $(dirname {input} | tr "\n" " ") --cl_config "max_table_rows: 100000" &> {log} + """ + + + +rule create_per_run_paired_raw_reads_multiqc_report: conda: "../../envs/MultiQC.yml" container: @@ -44,7 +66,7 @@ def sample_list(): file_list = [] for i in set(all_samples[config['run_column']]): - combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_R{read_number}_fastqc.zip", RUN = i, sample = all_samples.index.values[all_samples[config["run_column"]] == i], read_number = ["1","2"]) + combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = i, sample = all_samples.index.values[all_samples[config["run_column"]] == i], suffix = read_naming[all_samples.index.values[all_samples[config["run_column"]] == i]][0]) file_list = file_list + combined_values return(file_list) From b6a2dd1468a243e2560c004b9fe1610b944da8cb Mon Sep 17 00:00:00 2001 From: valscherz Date: Mon, 25 May 2020 17:56:29 +0200 Subject: [PATCH 04/24] WIP --- rules/0_preprocessing/QC_raw_reads.rules | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/rules/0_preprocessing/QC_raw_reads.rules b/rules/0_preprocessing/QC_raw_reads.rules index d5e8cef6..98edbdd0 100644 --- a/rules/0_preprocessing/QC_raw_reads.rules +++ b/rules/0_preprocessing/QC_raw_reads.rules @@ -66,7 +66,10 @@ def sample_list(): file_list = [] for i in set(all_samples[config['run_column']]): - combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = i, sample = all_samples.index.values[all_samples[config["run_column"]] == i], suffix = read_naming[all_samples.index.values[all_samples[config["run_column"]] == i]][0]) + RUN = i + samples = list(all_samples.index.values[all_samples[config["run_column"]] == i]) + suffix = read_naming[samples[0][0]][0] + combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = RUN, sample = samples, suffix = suffix) file_list = file_list + combined_values return(file_list) From 148f1ebf800dd5fc6b2be3100ee99de4847eddfd Mon Sep 17 00:00:00 2001 From: valscherz Date: Mon, 25 May 2020 18:06:01 +0200 Subject: [PATCH 05/24] WIP --- rules/0_preprocessing/QC_raw_reads.rules | 2 +- rules/0_preprocessing/get_reads.rules | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/rules/0_preprocessing/QC_raw_reads.rules b/rules/0_preprocessing/QC_raw_reads.rules index 98edbdd0..e49d8a6b 100644 --- a/rules/0_preprocessing/QC_raw_reads.rules +++ b/rules/0_preprocessing/QC_raw_reads.rules @@ -68,7 +68,7 @@ def sample_list(): for i in set(all_samples[config['run_column']]): RUN = i samples = list(all_samples.index.values[all_samples[config["run_column"]] == i]) - suffix = read_naming[samples[0][0]][0] + suffix = reads_ext[samples[0]] combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = RUN, sample = samples, suffix = suffix) file_list = file_list + combined_values diff --git a/rules/0_preprocessing/get_reads.rules b/rules/0_preprocessing/get_reads.rules index efe20cc8..a62184dd 100644 --- a/rules/0_preprocessing/get_reads.rules +++ 
b/rules/0_preprocessing/get_reads.rules @@ -92,8 +92,10 @@ if "sra_samples" in config.keys(): reads_sra[sample_name]=str(i) if sra_data.loc[i, "LibraryLayout"].lower()=="paired": sras_ext[sample_name]=["1.fastq.gz", "2.fastq.gz"] + reads_ext[sample_name]=["R1", "R2"] elif sra_data.loc[i, "LibraryLayout"].lower()=="single": sras_ext[sample_name] = ["fastq.gz"] + reads_ext[sample_name]=["single"] else: raise ValueError("Problem in the sra file, LibraryLayout badly defined") all_samples=sra_data From 65c894ccb689d1cdee6b81800b8e084be7aaa33c Mon Sep 17 00:00:00 2001 From: Valentin Date: Mon, 25 May 2020 18:03:18 +0200 Subject: [PATCH 06/24] WIP --- rules/0_preprocessing/get_reads.rules | 1 + 1 file changed, 1 insertion(+) diff --git a/rules/0_preprocessing/get_reads.rules b/rules/0_preprocessing/get_reads.rules index a62184dd..f9b924ce 100644 --- a/rules/0_preprocessing/get_reads.rules +++ b/rules/0_preprocessing/get_reads.rules @@ -43,6 +43,7 @@ reads_sra = {} reads_local = {} original_names = {} +reads_ext = {} if "local_samples" not in config.keys() and "sra_samples" not in config.keys(): raise ValueError("No samples defined in the config file") From 3e2ebba85d7ca1e4e56663e3fa1cb057d0e88851 Mon Sep 17 00:00:00 2001 From: valscherz Date: Mon, 25 May 2020 18:44:53 +0200 Subject: [PATCH 07/24] WIP --- rules/0_preprocessing/QC_raw_reads.rules | 44 +++++++++--------------- 1 file changed, 16 insertions(+), 28 deletions(-) diff --git a/rules/0_preprocessing/QC_raw_reads.rules b/rules/0_preprocessing/QC_raw_reads.rules index e49d8a6b..058af863 100644 --- a/rules/0_preprocessing/QC_raw_reads.rules +++ b/rules/0_preprocessing/QC_raw_reads.rules @@ -19,36 +19,24 @@ rule assess_quality_raw_reads_with_fastqc: """ +def sample_list_run_QC(): + file_list = [] + samples = list(all_samples.index.values[all_samples[config["run_column"]] == wilcards.RUN]) + for s in samples: + suffix_s = reads_ext[s] + combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = wilcards.RUN, sample = s, suffix = suffix_s) + file_list = file_list + combined_values -rule create_per_run_single_raw_reads_multiqc_report: - conda: - "../../envs/MultiQC.yml" - container: - singularity_envs["multiqc"] - input: - lambda wildcards: expand("QC/FastQC/raw_reads/{RUN}/{sample}_single_fastqc.zip", sample = all_samples.index.values[all_samples[config["run_column"]] == wildcards.RUN], RUN = wildcards.RUN), - output: - report("QC/{RUN}_multiqc_raw_reads_report.html", caption="report/MultiQC.rst", category="MultiQC", subcategory="{RUN}"), - "QC/{RUN}_multiqc_raw_reads_report_data/multiqc_general_stats.txt", - log: - logging_folder + "QC/{RUN}_multiqc_raw_reads_report.txt" - priority: - 100 - shell: - """ - multiqc --interactive -f -n {output[0]} $(dirname {input} | tr "\n" " ") --cl_config "max_table_rows: 100000" &> {log} - """ - + return(file_list) -rule create_per_run_paired_raw_reads_multiqc_report: +rule create_per_run_multiqc_report: conda: "../../envs/MultiQC.yml" container: singularity_envs["multiqc"] input: - lambda wildcards: expand("QC/FastQC/raw_reads/{RUN}/{sample}_R1_fastqc.zip", sample = all_samples.index.values[all_samples[config["run_column"]] == wildcards.RUN], RUN = wildcards.RUN), - lambda wildcards: expand("QC/FastQC/raw_reads/{RUN}/{sample}_R2_fastqc.zip", sample = all_samples.index.values[all_samples[config["run_column"]] == wildcards.RUN], RUN = wildcards.RUN), + sample_list_run_QC() output: report("QC/{RUN}_multiqc_raw_reads_report.html", caption="report/MultiQC.rst", category="MultiQC", 
subcategory="{RUN}"), "QC/{RUN}_multiqc_raw_reads_report_data/multiqc_general_stats.txt", @@ -61,16 +49,16 @@ rule create_per_run_paired_raw_reads_multiqc_report: multiqc --interactive -f -n {output[0]} $(dirname {input} | tr "\n" " ") --cl_config "max_table_rows: 100000" &> {log} """ -def sample_list(): +def sample_list_overall_QC(): file_list = [] for i in set(all_samples[config['run_column']]): - RUN = i samples = list(all_samples.index.values[all_samples[config["run_column"]] == i]) - suffix = reads_ext[samples[0]] - combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = RUN, sample = samples, suffix = suffix) - file_list = file_list + combined_values + for s in samples: + suffix_s = reads_ext[s] + combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = i, sample = s, suffix = suffix_s) + file_list = file_list + combined_values return(file_list) @@ -81,7 +69,7 @@ rule create_overall_raw_reads_multiqc_report: container: singularity_envs["multiqc"] input: - sample_list() + sample_list_overall_QC() output: report("QC/multiqc_raw_reads_report.html", caption="report/MultiQC.rst", category="MultiQC", subcategory="All run"), "QC/multiqc_raw_reads_report_data/multiqc_general_stats.txt", From 196a064d48ad4e6c151ecd5a30730bc604c57d91 Mon Sep 17 00:00:00 2001 From: valscherz Date: Mon, 25 May 2020 18:47:09 +0200 Subject: [PATCH 08/24] WIP --- rules/0_preprocessing/QC_raw_reads.rules | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/rules/0_preprocessing/QC_raw_reads.rules b/rules/0_preprocessing/QC_raw_reads.rules index 058af863..c212b5e9 100644 --- a/rules/0_preprocessing/QC_raw_reads.rules +++ b/rules/0_preprocessing/QC_raw_reads.rules @@ -22,11 +22,10 @@ rule assess_quality_raw_reads_with_fastqc: def sample_list_run_QC(): file_list = [] samples = list(all_samples.index.values[all_samples[config["run_column"]] == wilcards.RUN]) - for s in samples: - suffix_s = reads_ext[s] - combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = wilcards.RUN, sample = s, suffix = suffix_s) - file_list = file_list + combined_values - + for s in samples: + suffix_s = reads_ext[s] + combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = wilcards.RUN, sample = s, suffix = suffix_s) + file_list = file_list + combined_values return(file_list) From 9fa0249bcd78aa906091aecfbea56040a3c400fb Mon Sep 17 00:00:00 2001 From: Valentin Date: Mon, 25 May 2020 18:50:15 +0200 Subject: [PATCH 09/24] WIP --- rules/0_preprocessing/QC_raw_reads.rules | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/rules/0_preprocessing/QC_raw_reads.rules b/rules/0_preprocessing/QC_raw_reads.rules index c212b5e9..aaad26b2 100644 --- a/rules/0_preprocessing/QC_raw_reads.rules +++ b/rules/0_preprocessing/QC_raw_reads.rules @@ -19,13 +19,14 @@ rule assess_quality_raw_reads_with_fastqc: """ -def sample_list_run_QC(): +def sample_list_run_QC(RUN): file_list = [] - samples = list(all_samples.index.values[all_samples[config["run_column"]] == wilcards.RUN]) + samples = list(all_samples.index.values[all_samples[config["run_column"]] == RUN]) for s in samples: suffix_s = reads_ext[s] - combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = wilcards.RUN, sample = s, suffix = suffix_s) + combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = RUN, sample = s, suffix = suffix_s) file_list = file_list + 
combined_values + return(file_list) @@ -35,7 +36,7 @@ rule create_per_run_multiqc_report: container: singularity_envs["multiqc"] input: - sample_list_run_QC() + lambda wildcards: sample_list_run_QC(wildcards.RUN) output: report("QC/{RUN}_multiqc_raw_reads_report.html", caption="report/MultiQC.rst", category="MultiQC", subcategory="{RUN}"), "QC/{RUN}_multiqc_raw_reads_report_data/multiqc_general_stats.txt", From 849ad6f55726d8a9df398d18421dcbde49996c8f Mon Sep 17 00:00:00 2001 From: Valentin Date: Mon, 25 May 2020 21:02:09 +0200 Subject: [PATCH 10/24] WIP --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index ff3908dd..be7c49f9 100644 --- a/Snakefile +++ b/Snakefile @@ -11,7 +11,7 @@ if "--use-singularity" in sys.argv: ### Bind the directory of the database to the singularity containers. workflow.singularity_args += f' -B {config["tax_DB_path"]}:{config["tax_DB_path"]}' #### Load a dictionnary of singularity containers that will be called from each rule - singularity_envs = yaml.safe_load(open(os.path.join(workflow.basedir, "envs/singularity/sing_envs.yml"), 'r')) +singularity_envs = yaml.safe_load(open(os.path.join(workflow.basedir, "envs/singularity/sing_envs.yml"), 'r')) ## Include the pipeline rules From 3bcff07ac562e93d7dfa15dec84ed54df8d31299 Mon Sep 17 00:00:00 2001 From: valscherz Date: Mon, 25 May 2020 21:06:40 +0200 Subject: [PATCH 11/24] get sing env even when not using it --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index ff3908dd..be7c49f9 100644 --- a/Snakefile +++ b/Snakefile @@ -11,7 +11,7 @@ if "--use-singularity" in sys.argv: ### Bind the directory of the database to the singularity containers. workflow.singularity_args += f' -B {config["tax_DB_path"]}:{config["tax_DB_path"]}' #### Load a dictionnary of singularity containers that will be called from each rule - singularity_envs = yaml.safe_load(open(os.path.join(workflow.basedir, "envs/singularity/sing_envs.yml"), 'r')) +singularity_envs = yaml.safe_load(open(os.path.join(workflow.basedir, "envs/singularity/sing_envs.yml"), 'r')) ## Include the pipeline rules From 62557629471a7d91c9f8aaba944e2931e220ff53 Mon Sep 17 00:00:00 2001 From: valscherz Date: Mon, 25 May 2020 21:16:10 +0200 Subject: [PATCH 12/24] WIP --- rules/0_preprocessing/QC_raw_reads.rules | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rules/0_preprocessing/QC_raw_reads.rules b/rules/0_preprocessing/QC_raw_reads.rules index aaad26b2..dacf4e06 100644 --- a/rules/0_preprocessing/QC_raw_reads.rules +++ b/rules/0_preprocessing/QC_raw_reads.rules @@ -21,9 +21,12 @@ rule assess_quality_raw_reads_with_fastqc: def sample_list_run_QC(RUN): file_list = [] + print(RUN) samples = list(all_samples.index.values[all_samples[config["run_column"]] == RUN]) for s in samples: + print(s) suffix_s = reads_ext[s] + print(suffix_s) combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = RUN, sample = s, suffix = suffix_s) file_list = file_list + combined_values From 3528be940dd145be0e4b621f737c0b5b2828d653 Mon Sep 17 00:00:00 2001 From: valscherz Date: Mon, 25 May 2020 21:20:09 +0200 Subject: [PATCH 13/24] WIP --- rules/0_preprocessing/QC_raw_reads.rules | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/rules/0_preprocessing/QC_raw_reads.rules b/rules/0_preprocessing/QC_raw_reads.rules index dacf4e06..b038af33 100644 --- a/rules/0_preprocessing/QC_raw_reads.rules +++ b/rules/0_preprocessing/QC_raw_reads.rules @@ 
-21,15 +21,13 @@ rule assess_quality_raw_reads_with_fastqc: def sample_list_run_QC(RUN): file_list = [] - print(RUN) samples = list(all_samples.index.values[all_samples[config["run_column"]] == RUN]) for s in samples: - print(s) suffix_s = reads_ext[s] - print(suffix_s) combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = RUN, sample = s, suffix = suffix_s) file_list = file_list + combined_values + print(file_list) return(file_list) From 8bebbb1b9473743b2174124b5ad98ade93ab8c24 Mon Sep 17 00:00:00 2001 From: valscherz Date: Mon, 25 May 2020 21:54:20 +0200 Subject: [PATCH 14/24] WIP --- rules/1_2_DADA2_ASVs/1_cutadapt_trim.rules | 25 +++- rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules | 111 +++++++++++++++++- ...ltering.R => 1_DADA2_q_filtering_paired.R} | 0 .../scripts/1_DADA2_q_filtering_single.R | 37 ++++++ .../{1_cutadapt.py => 1_cutadapt_paired.py} | 0 .../scripts/1_cutadapt_single.py | 39 ++++++ 6 files changed, 204 insertions(+), 8 deletions(-) rename rules/1_2_DADA2_ASVs/scripts/{1_DADA2_q_filtering.R => 1_DADA2_q_filtering_paired.R} (100%) create mode 100644 rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_single.R rename rules/1_2_DADA2_ASVs/scripts/{1_cutadapt.py => 1_cutadapt_paired.py} (100%) create mode 100644 rules/1_2_DADA2_ASVs/scripts/1_cutadapt_single.py diff --git a/rules/1_2_DADA2_ASVs/1_cutadapt_trim.rules b/rules/1_2_DADA2_ASVs/1_cutadapt_trim.rules index a1cd91f8..0dd7edb4 100644 --- a/rules/1_2_DADA2_ASVs/1_cutadapt_trim.rules +++ b/rules/1_2_DADA2_ASVs/1_cutadapt_trim.rules @@ -1,6 +1,6 @@ ## Remove primers from sequences -rule cutadapt_trim_for_DADA2: +rule cutadapt_trim_paired_for_DADA2: conda: "../../envs/cutadapt.yml" container: @@ -20,4 +20,25 @@ rule cutadapt_trim_for_DADA2: threads: 1 script: - "scripts/1_cutadapt.py" + "scripts/1_cutadapt_paired.py" + + +rule cutadapt_trim_single_for_DADA2: + conda: + "../../envs/cutadapt.yml" + container: + singularity_envs["cutadapt"] + input: + R1_raw_reads = "raw_reads/{sample}_single.fastq.gz", + output: + R1_trimmed_reads = temp("{denoiser}/1a_trimmed_primers/{sample}_trimmed_single.fastq.gz") + log: + logging_folder + "{denoiser}/1a_trimmed_primers/{sample}_single_trimmed.txt" + params: + amplicon_type = config["ITS_or_16S"], + forward_primer = config["forward_primer"], + reverse_primer = config["reverse_primer"], + threads: + 1 + script: + "scripts/1_cutadapt_single.py" \ No newline at end of file diff --git a/rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules b/rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules index 715fd571..22ee3c89 100644 --- a/rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules +++ b/rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules @@ -2,21 +2,28 @@ ## Define trimmed or not trimmed input (for read already trimmed) -def DADA2_trim_input(): +def DADA2_trim_input_paired(): if config['Trim_primers'] == True : return ["DADA2/1a_trimmed_primers/{sample}_trimmed_R1.fastq.gz", "DADA2/1a_trimmed_primers/{sample}_trimmed_R2.fastq.gz"] else : return ["raw_reads/{sample}_R1.fastq.gz", "raw_reads/{sample}_R2.fastq.gz"] +def DADA2_trim_input_single(): + if config['Trim_primers'] == True : + return ["DADA2/1a_trimmed_primers/{sample}_trimmed_single.fastq.gz"] + else : + return ["raw_reads/{sample}_single.fastq.gz"] + + ### Remove low quality and too short reads -rule DADA2_q_filtering: +rule DADA2_q_filtering_paired: conda: "../../envs/DADA2_in_R.yml" container: singularity_envs["dada2"] input: - DADA2_trim_input() + DADA2_trim_input_paired() output: q_score_filtered_F = 
temp("DADA2/1b_q_score_filtered_paired/{sample}_filtered_R1.fastq.gz"), q_score_filtered_R = temp("DADA2/1b_q_score_filtered_paired/{sample}_filtered_R2.fastq.gz"), @@ -32,10 +39,35 @@ rule DADA2_q_filtering: threads: 1 script: - "scripts/1_DADA2_q_filtering.R" + "scripts/1_DADA2_q_filtering_paired.R" + + +### Remove low quality and too short reads +rule DADA2_q_filtering_single: + conda: + "../../envs/DADA2_in_R.yml" + container: + singularity_envs["dada2"] + input: + DADA2_trim_input_single() + output: + q_score_filtered_F = temp("DADA2/1b_q_score_filtered_paired/{sample}_filtered_single.fastq.gz"), + q_filtering_stats = "DADA2/1b_q_score_filtered_paired/{sample}_q_score_filtering_stats.rds" + log: + logging_folder + "DADA2/1b_q_score_filtered/{sample}_DADA2_1b_q_score_filtering.txt" + params: + F_reads_length_trim = config["DADA2_F_reads_length_trim"], + F_reads_extected_error = config["F_extected_error"], + sample_name = lambda wildcards: wildcards.sample, + threads: + 1 + script: + "scripts/1_DADA2_q_filtering_single.R" + + ### Learn error profile of the runs -rule DADA2_learn_errors: +rule DADA2_learn_errors_paired: conda: "../../envs/DADA2_in_R.yml" container: @@ -55,8 +87,45 @@ rule DADA2_learn_errors: script: "scripts/2a_big_data_DADA2_learn_errors.R" + + +### Learn error profile of the runs +rule DADA2_learn_errors_single: + conda: + "../../envs/DADA2_in_R.yml" + container: + singularity_envs["dada2"] + input: + q_score_filtered_F = lambda wildcards: expand("DADA2/1b_q_score_filtered_paired/{sample}_filtered_single.fastq.gz", sample = all_samples.index.values[all_samples[config["run_column"]] == wildcards.RUN]), + output: + error_profile_F = "DADA2/2_denoised/{RUN}/error_profile_F.rds", + error_profile_R = "DADA2/2_denoised/{RUN}/error_profile_R.rds", + error_profile_F_plot = report("DADA2/2_denoised/{RUN}/error_profile_F_plot.png", caption="report/DADA2_error_profile.rst", category="DADA2", subcategory="Error profile"), + error_profile_R_plot = report("DADA2/2_denoised/{RUN}/error_profile_R_plot.png", caption="report/DADA2_error_profile.rst", category="DADA2", subcategory="Error profile"), + log: + logging_folder + "DADA2/2_denoised/{RUN}/DADA2_learn_errors.txt" + threads: + 8 + script: + "scripts/2a_big_data_DADA2_learn_errors.R" + + + ### Infer error free sequences from the filtered reads and the runs error profiles -rule DADA2_infer_ASV: +def sample_list_run_QC(RUN): + file_list = [] + samples = list(all_samples.index.values[all_samples[config["run_column"]] == RUN]) + for s in samples: + suffix_s = reads_ext[s] + combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = RUN, sample = s, suffix = suffix_s) + file_list = file_list + combined_values + + print(file_list) + return(file_list) + + + +rule DADA2_infer_paired_ASV: conda: "../../envs/DADA2_in_R.yml" container: @@ -81,6 +150,35 @@ rule DADA2_infer_ASV: script: "scripts/2b_big_data_DADA2_infer_ASV.R" + + +rule DADA2_infer_single_ASV: + conda: + "../../envs/DADA2_in_R.yml" + container: + singularity_envs["dada2"] + input: + q_score_filtered_F = "DADA2/1b_q_score_filtered_paired/{sample}_filtered_single.fastq.gz", + error_profile_F = "DADA2/2_denoised/{RUN}/error_profile_single.rds", + output: + infer_stats = "DADA2/2_denoised/{RUN}/{sample}_infer_stats.rds", + sample_seq_tab = "DADA2/2_denoised/{RUN}/{sample}_infer_seq_tab.rds", + params: + sample_name = lambda wildcards: wildcards.sample, + run = lambda wildcards: wildcards.RUN, + x_column_value = lambda wildcards: 
all_samples[config["sample_label"]][wildcards.sample], + min_overlap = config["min_overlap"] + log: + logging_folder + "DADA2/2_denoised/{RUN}/{sample}_DADA2_2_infer_ASV.txt" + threads: + 4 + script: + "scripts/2b_big_data_DADA2_infer_ASV.R" + + + + + ### Merge the ASVs determined from the different samples rule DADA2_merge_sample_ASV: conda: @@ -100,6 +198,7 @@ rule DADA2_merge_sample_ASV: script: "scripts/2c_big_data_DADA2_merge_ASV.R" + ### Merge data from all run, filter out chimera and remove too short merged sequences rule DADA2_merge_runs_filter_chim: conda: diff --git a/rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering.R b/rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_paired.R similarity index 100% rename from rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering.R rename to rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_paired.R diff --git a/rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_single.R b/rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_single.R new file mode 100644 index 00000000..7bd86b9f --- /dev/null +++ b/rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_single.R @@ -0,0 +1,37 @@ +# Title : DADA2 - Q-score filtering +# Objective : Filter reads based on q-score and length +# Created by: valentinscherz +# Created on: 06.06.19 +# Modified from :https://benjjneb.github.io/dada2/bigdata_paired.html + +## Redirect R output to the log file + log <- file(snakemake@log[[1]], open="wt") + sink(log) + sink(log, type="message") + +## Input + fnFs <- snakemake@input[[1]] + +## Output + q_score_filtered_F <- snakemake@output[["q_score_filtered_F"]] + q_filtering_stats_path <- snakemake@output[["q_filtering_stats"]] + +## Parameters + F_length <- snakemake@params[["F_reads_length_trim"]] + F_extected_error <- snakemake@params[["F_reads_extected_error"]] + sample_name <- snakemake@params[["sample_name"]] + + +## Load needed libraries + library(dada2);packageVersion("dada2") + +## Filter and trim +### Reads are filtered based on the number of errors expected for the read (integration of the qscore and the length of the read). All reads with uncalled nucleotide (N) are removed too. Remaining phiX reads will be removed too. Finally, reads are cut at a length set in config. + +### Filter and trim. The filtered reads are directly written while the filtering stats are being save for later compilation. 
+ filtering_stats <- filterAndTrim(fwd = fnFs, filt = q_score_filtered_F, truncLen=c(F_length), maxN=0, maxEE=c(F_extected_error), truncQ=2, rm.phix=TRUE, compress=TRUE, multithread=snakemake@threads) + filtering_stats <- as.data.frame(filtering_stats) + filtering_stats$Sample <- sample_name + +### Save the stats for this sample in a R object + saveRDS(filtering_stats, file = q_filtering_stats_path) diff --git a/rules/1_2_DADA2_ASVs/scripts/1_cutadapt.py b/rules/1_2_DADA2_ASVs/scripts/1_cutadapt_paired.py similarity index 100% rename from rules/1_2_DADA2_ASVs/scripts/1_cutadapt.py rename to rules/1_2_DADA2_ASVs/scripts/1_cutadapt_paired.py diff --git a/rules/1_2_DADA2_ASVs/scripts/1_cutadapt_single.py b/rules/1_2_DADA2_ASVs/scripts/1_cutadapt_single.py new file mode 100644 index 00000000..54059f31 --- /dev/null +++ b/rules/1_2_DADA2_ASVs/scripts/1_cutadapt_single.py @@ -0,0 +1,39 @@ +from Bio.Seq import Seq +from Bio.Alphabet import IUPAC +from snakemake import shell + +if snakemake.params['amplicon_type'] == "ITS": + print("ITS Trimming") + forward_primer_compl = Seq.reverse_complement(Seq(snakemake.params['forward_primer'], IUPAC.ambiguous_dna)) + reverse_primer_compl = Seq.reverse_complement(Seq(snakemake.params['reverse_primer'], IUPAC.ambiguous_dna)) + shell("""cutadapt \ + --cores {snakemake.threads} \ + --error-rate 0.1 \ + --times 2 \ + --overlap 3 \ + -o {snakemake.output[R1_trimmed_reads]} \ + -g '{snakemake.params[forward_primer]}' \ + -a '{reverse_primer_compl}' \ + -G '{snakemake.params[reverse_primer]}' \ + -A '{forward_primer_compl}' \ + --match-read-wildcards \ + --discard-untrimmed \ + {snakemake.input[R1_raw_reads]} >> {snakemake.log[0]}""") + +elif snakemake.params['amplicon_type'] == "16S": + print("16S Trimming") + shell("""cutadapt \ + --cores {snakemake.threads} \ + --error-rate 0.1 \ + --times 1 \ + --overlap 3 \ + -o {snakemake.output[R1_trimmed_reads]} \ + -g '{snakemake.params[forward_primer]}' \ + -G '{snakemake.params[reverse_primer]}' \ + --match-read-wildcards \ + --discard-untrimmed \ + {snakemake.input[R1_raw_reads]} >> {snakemake.log[0]}""") + + + + From fa63cb4522aa89bc13cc4940e2fbcdc811df1da2 Mon Sep 17 00:00:00 2001 From: valscherz Date: Mon, 25 May 2020 21:55:26 +0200 Subject: [PATCH 15/24] WIP --- rules/0_preprocessing/QC_raw_reads.rules | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/rules/0_preprocessing/QC_raw_reads.rules b/rules/0_preprocessing/QC_raw_reads.rules index b038af33..38056331 100644 --- a/rules/0_preprocessing/QC_raw_reads.rules +++ b/rules/0_preprocessing/QC_raw_reads.rules @@ -25,9 +25,8 @@ def sample_list_run_QC(RUN): for s in samples: suffix_s = reads_ext[s] combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = RUN, sample = s, suffix = suffix_s) - file_list = file_list + combined_values - - print(file_list) + file_list = file_list + combined_values + return(file_list) From 8e6e9efa30c22330c4ea721633fdf109e5161231 Mon Sep 17 00:00:00 2001 From: valscherz Date: Mon, 25 May 2020 21:56:12 +0200 Subject: [PATCH 16/24] WIP --- rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules b/rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules index 22ee3c89..993a9f7e 100644 --- a/rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules +++ b/rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules @@ -119,8 +119,7 @@ def sample_list_run_QC(RUN): suffix_s = reads_ext[s] combined_values = 
expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = RUN, sample = s, suffix = suffix_s) file_list = file_list + combined_values - - print(file_list) + return(file_list) From 5db23c9e83bebb3fc52eb6ac3201b9dea276b646 Mon Sep 17 00:00:00 2001 From: valscherz Date: Mon, 25 May 2020 21:58:09 +0200 Subject: [PATCH 17/24] WIP --- rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules b/rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules index 993a9f7e..f3faf90b 100644 --- a/rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules +++ b/rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules @@ -98,10 +98,8 @@ rule DADA2_learn_errors_single: input: q_score_filtered_F = lambda wildcards: expand("DADA2/1b_q_score_filtered_paired/{sample}_filtered_single.fastq.gz", sample = all_samples.index.values[all_samples[config["run_column"]] == wildcards.RUN]), output: - error_profile_F = "DADA2/2_denoised/{RUN}/error_profile_F.rds", - error_profile_R = "DADA2/2_denoised/{RUN}/error_profile_R.rds", - error_profile_F_plot = report("DADA2/2_denoised/{RUN}/error_profile_F_plot.png", caption="report/DADA2_error_profile.rst", category="DADA2", subcategory="Error profile"), - error_profile_R_plot = report("DADA2/2_denoised/{RUN}/error_profile_R_plot.png", caption="report/DADA2_error_profile.rst", category="DADA2", subcategory="Error profile"), + error_profile_F = "DADA2/2_denoised/{RUN}/error_profile_single.rds", + error_profile_F_plot = report("DADA2/2_denoised/{RUN}/error_profile_single_plot.png", caption="report/DADA2_error_profile.rst", category="DADA2", subcategory="Error profile"), log: logging_folder + "DADA2/2_denoised/{RUN}/DADA2_learn_errors.txt" threads: From 10c536f5cba4fcb853f0120e911d1589a2578090 Mon Sep 17 00:00:00 2001 From: valscherz Date: Mon, 25 May 2020 22:13:47 +0200 Subject: [PATCH 18/24] WIP --- rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules | 26 ++------ ... 2a_big_data_DADA2_learn_errors _paired.R} | 0 .../2a_big_data_DADA2_learn_errors_single.R | 33 ++++++++++ ...R => 2b_big_data_DADA2_infer_ASV_paired.R} | 0 .../2b_big_data_DADA2_infer_ASV_single.R | 64 +++++++++++++++++++ 5 files changed, 102 insertions(+), 21 deletions(-) rename rules/1_2_DADA2_ASVs/scripts/{2a_big_data_DADA2_learn_errors.R => 2a_big_data_DADA2_learn_errors _paired.R} (100%) create mode 100644 rules/1_2_DADA2_ASVs/scripts/2a_big_data_DADA2_learn_errors_single.R rename rules/1_2_DADA2_ASVs/scripts/{2b_big_data_DADA2_infer_ASV.R => 2b_big_data_DADA2_infer_ASV_paired.R} (100%) create mode 100644 rules/1_2_DADA2_ASVs/scripts/2b_big_data_DADA2_infer_ASV_single.R diff --git a/rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules b/rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules index f3faf90b..f5224c79 100644 --- a/rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules +++ b/rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules @@ -1,7 +1,6 @@ ## Process trimmed sequences with DADA2 to correct errors and generate ASVs. 
Follows the "big data" protocol (https://benjjneb.github.io/dada2/bigdata.html) ## Define trimmed or not trimmed input (for read already trimmed) - def DADA2_trim_input_paired(): if config['Trim_primers'] == True : return ["DADA2/1a_trimmed_primers/{sample}_trimmed_R1.fastq.gz", "DADA2/1a_trimmed_primers/{sample}_trimmed_R2.fastq.gz"] @@ -42,7 +41,6 @@ rule DADA2_q_filtering_paired: "scripts/1_DADA2_q_filtering_paired.R" -### Remove low quality and too short reads rule DADA2_q_filtering_single: conda: "../../envs/DADA2_in_R.yml" @@ -85,11 +83,10 @@ rule DADA2_learn_errors_paired: threads: 8 script: - "scripts/2a_big_data_DADA2_learn_errors.R" + "scripts/2a_big_data_DADA2_learn_errors_paired.R" -### Learn error profile of the runs rule DADA2_learn_errors_single: conda: "../../envs/DADA2_in_R.yml" @@ -105,23 +102,11 @@ rule DADA2_learn_errors_single: threads: 8 script: - "scripts/2a_big_data_DADA2_learn_errors.R" + "scripts/2a_big_data_DADA2_learn_errors_single.R" -### Infer error free sequences from the filtered reads and the runs error profiles -def sample_list_run_QC(RUN): - file_list = [] - samples = list(all_samples.index.values[all_samples[config["run_column"]] == RUN]) - for s in samples: - suffix_s = reads_ext[s] - combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = RUN, sample = s, suffix = suffix_s) - file_list = file_list + combined_values - - return(file_list) - - - +## Correct errors based on profile error of the run rule DADA2_infer_paired_ASV: conda: "../../envs/DADA2_in_R.yml" @@ -145,7 +130,7 @@ rule DADA2_infer_paired_ASV: threads: 4 script: - "scripts/2b_big_data_DADA2_infer_ASV.R" + "scripts/2b_big_data_DADA2_infer_ASV_paired.R" @@ -170,8 +155,7 @@ rule DADA2_infer_single_ASV: threads: 4 script: - "scripts/2b_big_data_DADA2_infer_ASV.R" - + "scripts/2b_big_data_DADA2_infer_ASV_single.R" diff --git a/rules/1_2_DADA2_ASVs/scripts/2a_big_data_DADA2_learn_errors.R b/rules/1_2_DADA2_ASVs/scripts/2a_big_data_DADA2_learn_errors _paired.R similarity index 100% rename from rules/1_2_DADA2_ASVs/scripts/2a_big_data_DADA2_learn_errors.R rename to rules/1_2_DADA2_ASVs/scripts/2a_big_data_DADA2_learn_errors _paired.R diff --git a/rules/1_2_DADA2_ASVs/scripts/2a_big_data_DADA2_learn_errors_single.R b/rules/1_2_DADA2_ASVs/scripts/2a_big_data_DADA2_learn_errors_single.R new file mode 100644 index 00000000..d6eec62e --- /dev/null +++ b/rules/1_2_DADA2_ASVs/scripts/2a_big_data_DADA2_learn_errors_single.R @@ -0,0 +1,33 @@ +# Title : DADA2 - Learn run errors profile +# Objective : Generate an error profile for each run +# Created by: valentinscherz +# Created on: 06.06.19 +# Modified from :https://benjjneb.github.io/dada2/bigdata_paired.html + +## Redirect R output + log <- file(snakemake@log[[1]], open="wt") + sink(log) + sink(log, type="message") + +## Input + q_score_filtered_F <- snakemake@input[["q_score_filtered_F"]] + +## Output + error_profile_F <- snakemake@output[["error_profile_F"]] + error_plot_F <- snakemake@output[["error_profile_F_plot"]] + +## Load needed libraries + library(dada2); packageVersion("dada2") + + +## Learn error rates + errF <- learnErrors(q_score_filtered_F, nbases=1e8, multithread = snakemake@threads, verbose = 1) + +## Write these error profiles + saveRDS(object = errF, file = error_profile_F) + +## Write error_plots + png(error_plot_F) + plotErrors(errF, nominalQ=TRUE) + + dev.off() diff --git a/rules/1_2_DADA2_ASVs/scripts/2b_big_data_DADA2_infer_ASV.R b/rules/1_2_DADA2_ASVs/scripts/2b_big_data_DADA2_infer_ASV_paired.R 
similarity index 100% rename from rules/1_2_DADA2_ASVs/scripts/2b_big_data_DADA2_infer_ASV.R rename to rules/1_2_DADA2_ASVs/scripts/2b_big_data_DADA2_infer_ASV_paired.R diff --git a/rules/1_2_DADA2_ASVs/scripts/2b_big_data_DADA2_infer_ASV_single.R b/rules/1_2_DADA2_ASVs/scripts/2b_big_data_DADA2_infer_ASV_single.R new file mode 100644 index 00000000..df2a7c41 --- /dev/null +++ b/rules/1_2_DADA2_ASVs/scripts/2b_big_data_DADA2_infer_ASV_single.R @@ -0,0 +1,64 @@ +# Title : DADA2 - infer ASVs +# Objective : Infers ASVs from the reads, based on error profile of the run +# Created by: valentinscherz +# Created on: 06.06.2019 +# Modified from :https://benjjneb.github.io/dada2/bigdata_paired.html + +## Redirect R output + log <- file(snakemake@log[[1]], open="wt") + sink(log) + sink(log, type="message") + +## Input + q_score_filtered_F <- snakemake@input[["q_score_filtered_F"]] + error_profile_F <- snakemake@input[["error_profile_F"]] + +## Output + infer_stats <- snakemake@output[["infer_stats"]] + sample_seq_tab <- snakemake@output[["sample_seq_tab"]] + +## Parameters + sample_name <- snakemake@params[["sample_name"]] + run <- snakemake@params[["run"]] + x_column_value <- snakemake@params[["x_column_value"]] + min_overlap <- snakemake@params[["min_overlap"]] + print(paste("min_overlap is :", min_overlap)) + +## Load needed libraries + library(dada2); packageVersion("dada2") + +## Create a useful function to count the number of sequences + getN <- function(x) sum(getUniques(x)) + +## File renaming + names(q_score_filtered_F) <- sample_name + +## Read error rates + errF <- readRDS(error_profile_F) + +## Prepare named vector + mergers <- vector("list", 1) + names(mergers) <- sample_name + +## Sample inference and merger of paired-end reads + cat("Processing:", sample_name, "\n") + ## Forward + derepF <- derepFastq(q_score_filtered_F, verbose = TRUE) + ddF <- dada(derepF, err=errF, multithread = snakemake@threads, verbose = 1, pool = FALSE, selfConsist = TRUE) + mergers[[sample_name]] <- ddF + +## Save the dereplicated, corrected and merged sequences for this sample + saveRDS(mergers, file = sample_seq_tab) + +## For statistics record the number of reads + ### Write the statistics in a dataframe + infer <- data.frame(denoisedF = getN(ddF)) + infer$denoisedR <- NULL + infer$merged_pairs <- infer$denoisedF + infer$Sample <- sample_name + infer$label <- x_column_value + infer$RUN <- run + + ### Save the sequences stats for this sample + saveRDS(infer, file = infer_stats) + From df0a93c037a8f32823ceb94ff2fef128916142f7 Mon Sep 17 00:00:00 2001 From: valscherz Date: Mon, 25 May 2020 22:18:14 +0200 Subject: [PATCH 19/24] WIP --- rules/1_2_DADA2_ASVs/scripts/1_cutadapt_single.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/rules/1_2_DADA2_ASVs/scripts/1_cutadapt_single.py b/rules/1_2_DADA2_ASVs/scripts/1_cutadapt_single.py index 54059f31..d3cd84ac 100644 --- a/rules/1_2_DADA2_ASVs/scripts/1_cutadapt_single.py +++ b/rules/1_2_DADA2_ASVs/scripts/1_cutadapt_single.py @@ -14,8 +14,6 @@ -o {snakemake.output[R1_trimmed_reads]} \ -g '{snakemake.params[forward_primer]}' \ -a '{reverse_primer_compl}' \ - -G '{snakemake.params[reverse_primer]}' \ - -A '{forward_primer_compl}' \ --match-read-wildcards \ --discard-untrimmed \ {snakemake.input[R1_raw_reads]} >> {snakemake.log[0]}""") @@ -29,7 +27,6 @@ --overlap 3 \ -o {snakemake.output[R1_trimmed_reads]} \ -g '{snakemake.params[forward_primer]}' \ - -G '{snakemake.params[reverse_primer]}' \ --match-read-wildcards \ --discard-untrimmed \ 
{snakemake.input[R1_raw_reads]} >> {snakemake.log[0]}""") From 439330fa386627df929f424c8db66df1bb9cdc78 Mon Sep 17 00:00:00 2001 From: Valentin Date: Fri, 29 May 2020 18:45:37 +0200 Subject: [PATCH 20/24] Generalize paired/single for local too --- rules/0_preprocessing/get_reads.rules | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/rules/0_preprocessing/get_reads.rules b/rules/0_preprocessing/get_reads.rules index f9b924ce..4f884ade 100644 --- a/rules/0_preprocessing/get_reads.rules +++ b/rules/0_preprocessing/get_reads.rules @@ -67,6 +67,18 @@ if "local_samples" in config.keys(): sample = str(sorted(list(original_names.keys()))[match.index(True)]) original_correct[i] = original_names[sample] read_correct[i] = reads_local[sample] + + if "LibraryLayout" in list(local_data): + if local_data.loc[i, "LibraryLayout"].lower()=="paired": + reads_ext[sample]=["R1", "R2"] + elif sra_data.loc[i, "LibraryLayout"].lower()=="single": + reads_ext[sample]=["single"] + else: + raise ValueError("Problem in the sra file, LibraryLayout badly defined") + else: + reads_ext[sample]=["R1", "R2"] + + else: for i in list(local_data["OldSampleName"]): regex = re.compile(r'%s([^a-zA-Z0-9]|$)' % i) @@ -77,6 +89,17 @@ if "local_samples" in config.keys(): sample=str(local_data.index[local_data['OldSampleName'] == i][0]) original_correct[sample] = original_names[old_sample_name] read_correct[sample] = reads_local[old_sample_name] + + if "LibraryLayout" in list(local_data): + if local_data.loc[i, "LibraryLayout"].lower()=="paired": + reads_ext[sample]=["R1", "R2"] + elif sra_data.loc[i, "LibraryLayout"].lower()=="single": + reads_ext[sample]=["single"] + else: + raise ValueError("Problem in the sra file, LibraryLayout badly defined") + else: + reads_ext[sample]=["R1", "R2"] + original_names = original_correct reads_local = read_correct all_samples=local_data From e05e2aa575ea3a920048db92f3d4f1ed3a400642 Mon Sep 17 00:00:00 2001 From: Valentin Date: Fri, 29 May 2020 18:46:00 +0200 Subject: [PATCH 21/24] Adapt scripts to single and paired --- rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_paired.R | 2 +- rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_single.R | 2 +- ...errors _paired.R => 2a_big_data_DADA2_learn_errors_paired.R} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename rules/1_2_DADA2_ASVs/scripts/{2a_big_data_DADA2_learn_errors _paired.R => 2a_big_data_DADA2_learn_errors_paired.R} (100%) diff --git a/rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_paired.R b/rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_paired.R index 6f690985..809ca1ae 100644 --- a/rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_paired.R +++ b/rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_paired.R @@ -33,7 +33,7 @@ ### Reads are filtered based on the number of errors expected for the read (integration of the qscore and the length of the read). All reads with uncalled nucleotide (N) are removed too. Remaining phiX reads will be removed too. Finally, reads are cut at a length set in config. ### Filter and trim. The filtered reads are directly written while the filtering stats are being save for later compilation. 
- filtering_stats <- filterAndTrim(fwd = fnFs, filt = q_score_filtered_F, rev = fnRs, filt.rev = q_score_filtered_R, truncLen=c(F_length,R_length), maxN=0, maxEE=c(F_extected_error,R_extected_error), truncQ=2, rm.phix=TRUE, compress=TRUE, multithread=snakemake@threads) + filtering_stats <- filterAndTrim(fwd = fnFs, filt = q_score_filtered_F, rev = fnRs, filt.rev = q_score_filtered_R, truncLen=c(F_length,R_length), maxN=0, maxEE=c(F_extected_error,R_extected_error), truncQ=2, rm.phix=TRUE, compress=TRUE, multithread=snakemake@threads, verbose = TRUE) filtering_stats <- as.data.frame(filtering_stats) filtering_stats$Sample <- sample_name diff --git a/rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_single.R b/rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_single.R index 7bd86b9f..c378b72a 100644 --- a/rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_single.R +++ b/rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_single.R @@ -29,7 +29,7 @@ ### Reads are filtered based on the number of errors expected for the read (integration of the qscore and the length of the read). All reads with uncalled nucleotide (N) are removed too. Remaining phiX reads will be removed too. Finally, reads are cut at a length set in config. ### Filter and trim. The filtered reads are directly written while the filtering stats are being save for later compilation. - filtering_stats <- filterAndTrim(fwd = fnFs, filt = q_score_filtered_F, truncLen=c(F_length), maxN=0, maxEE=c(F_extected_error), truncQ=2, rm.phix=TRUE, compress=TRUE, multithread=snakemake@threads) + filtering_stats <- filterAndTrim(fwd = fnFs, filt = q_score_filtered_F, truncLen=c(F_length), maxN=0, maxEE=c(F_extected_error), truncQ=2, rm.phix=TRUE, compress=TRUE, multithread=snakemake@threads, verbose = TRUE) filtering_stats <- as.data.frame(filtering_stats) filtering_stats$Sample <- sample_name diff --git a/rules/1_2_DADA2_ASVs/scripts/2a_big_data_DADA2_learn_errors _paired.R b/rules/1_2_DADA2_ASVs/scripts/2a_big_data_DADA2_learn_errors_paired.R similarity index 100% rename from rules/1_2_DADA2_ASVs/scripts/2a_big_data_DADA2_learn_errors _paired.R rename to rules/1_2_DADA2_ASVs/scripts/2a_big_data_DADA2_learn_errors_paired.R From d18b701173c80448115409b771f102e26dfdbfdb Mon Sep 17 00:00:00 2001 From: Valentin Date: Fri, 12 Jun 2020 10:53:54 +0200 Subject: [PATCH 22/24] correct index for local samples --- rules/0_preprocessing/QC_raw_reads.rules | 1 + rules/0_preprocessing/get_reads.rules | 13 +++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/rules/0_preprocessing/QC_raw_reads.rules b/rules/0_preprocessing/QC_raw_reads.rules index 38056331..994adac5 100644 --- a/rules/0_preprocessing/QC_raw_reads.rules +++ b/rules/0_preprocessing/QC_raw_reads.rules @@ -56,6 +56,7 @@ def sample_list_overall_QC(): for i in set(all_samples[config['run_column']]): samples = list(all_samples.index.values[all_samples[config["run_column"]] == i]) for s in samples: + print(reads_ext) suffix_s = reads_ext[s] combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = i, sample = s, suffix = suffix_s) file_list = file_list + combined_values diff --git a/rules/0_preprocessing/get_reads.rules b/rules/0_preprocessing/get_reads.rules index 4f884ade..770de26b 100644 --- a/rules/0_preprocessing/get_reads.rules +++ b/rules/0_preprocessing/get_reads.rules @@ -70,13 +70,13 @@ if "local_samples" in config.keys(): if "LibraryLayout" in list(local_data): if local_data.loc[i, "LibraryLayout"].lower()=="paired": - reads_ext[sample]=["R1", 
"R2"] + reads_ext[i]=["R1", "R2"] elif sra_data.loc[i, "LibraryLayout"].lower()=="single": - reads_ext[sample]=["single"] + reads_ext[i]=["single"] else: raise ValueError("Problem in the sra file, LibraryLayout badly defined") else: - reads_ext[sample]=["R1", "R2"] + reads_ext[i]=["R1", "R2"] else: @@ -92,17 +92,18 @@ if "local_samples" in config.keys(): if "LibraryLayout" in list(local_data): if local_data.loc[i, "LibraryLayout"].lower()=="paired": - reads_ext[sample]=["R1", "R2"] + reads_ext[i]=["R1", "R2"] elif sra_data.loc[i, "LibraryLayout"].lower()=="single": - reads_ext[sample]=["single"] + reads_ext[i]=["single"] else: raise ValueError("Problem in the sra file, LibraryLayout badly defined") else: - reads_ext[sample]=["R1", "R2"] + reads_ext[i]=["R1", "R2"] original_names = original_correct reads_local = read_correct all_samples=local_data + reads_ext = reads_ext if "sra_samples" in config.keys(): sra_data = pandas.read_csv(config["sra_samples"], sep="\t", index_col=0).drop_duplicates() From 51641fd8a6d6caa1fe82f22f3be9a379a73d1466 Mon Sep 17 00:00:00 2001 From: Valentin Date: Mon, 6 Jul 2020 10:13:45 +0200 Subject: [PATCH 23/24] correct quote in metadata --- .../r_scripts/importation/create_phyloseq_from_metadata.R | 2 +- rules/4_post_processing/scripts/physeq_export.R | 2 +- rules/4_post_processing/scripts/physeq_melt_table.R | 2 +- .../scripts/transpose_and_add_meta_count_table.R | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ressources/r_scripts/importation/create_phyloseq_from_metadata.R b/ressources/r_scripts/importation/create_phyloseq_from_metadata.R index b2ae24ce..28ad3602 100755 --- a/ressources/r_scripts/importation/create_phyloseq_from_metadata.R +++ b/ressources/r_scripts/importation/create_phyloseq_from_metadata.R @@ -37,6 +37,6 @@ create_phyloseq_fct <- function(physeq_name = "physeq", melted_df_name = "physeq save(physeq_obj, file = "physeq_object") ### Write this table in a tsv file since it is ground for coming analysis and slow to compute - write.table(x = physeq_df, file = "physeq_df.tsv", append = F, quote = F, sep = "\t", eol = "\n", row.names = F, col.names = T ) + write.table(x = physeq_df, file = "physeq_df.tsv", append = F, sep = "\t", eol = "\n", row.names = F, col.names = T ) } \ No newline at end of file diff --git a/rules/4_post_processing/scripts/physeq_export.R b/rules/4_post_processing/scripts/physeq_export.R index 38d100e2..da9d9966 100644 --- a/rules/4_post_processing/scripts/physeq_export.R +++ b/rules/4_post_processing/scripts/physeq_export.R @@ -41,7 +41,7 @@ ape::write.tree(tree, tree_path) metadata <- (sample_data(phyloseq_obj)) metadata$Sample <- rownames(metadata) metadata <- metadata %>% select(Sample, everything()) -write.table(metadata, meta_path , sep="\t", quote=F, row.names = FALSE) +write.table(metadata, meta_path , sep="\t", row.names = FALSE) ## Write the taxonomy table of the phyloseq object taxa_df<- as.data.frame(tax_table(phyloseq_obj), stringsAsFactors = F) diff --git a/rules/4_post_processing/scripts/physeq_melt_table.R b/rules/4_post_processing/scripts/physeq_melt_table.R index 798984d8..e4fcbc6a 100644 --- a/rules/4_post_processing/scripts/physeq_melt_table.R +++ b/rules/4_post_processing/scripts/physeq_melt_table.R @@ -29,4 +29,4 @@ phyloseq_obj <- prune_taxa(taxa_sums(phyloseq_obj) > 0, phyloseq_obj) ## Removes physeq_df <- psmelt(phyloseq_obj) ### Write this table in a tsv format -write.table(x = physeq_df, file = phyloseq_melted_table, append = F, quote = F, sep = "\t", eol = "\n", row.names = F, 
col.names = T ) +write.table(x = physeq_df, file = phyloseq_melted_table, append = F, sep = "\t", eol = "\n", row.names = F, col.names = T ) diff --git a/rules/4_post_processing/scripts/transpose_and_add_meta_count_table.R b/rules/4_post_processing/scripts/transpose_and_add_meta_count_table.R index 519ddb59..6ef31de6 100644 --- a/rules/4_post_processing/scripts/transpose_and_add_meta_count_table.R +++ b/rules/4_post_processing/scripts/transpose_and_add_meta_count_table.R @@ -20,7 +20,7 @@ ## Load the phyloseq phyloseq_object table <- read.table(count_table, sep = "\t", header = TRUE) - meta <- read.table(meta, sep = "\t", header = TRUE) + meta <- read.table(meta, sep = "\t", header = TRUE, na.strings = "NA") ## Tranpose table table_t <- t(table) From 892c3c2eb00b3332930053a3d62a8a052ecc3cb6 Mon Sep 17 00:00:00 2001 From: Valentin Date: Mon, 6 Jul 2020 10:23:19 +0200 Subject: [PATCH 24/24] Correct index for R1-2/single for oldsamplename --- rules/0_preprocessing/QC_raw_reads.rules | 1 - rules/0_preprocessing/get_reads.rules | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/rules/0_preprocessing/QC_raw_reads.rules b/rules/0_preprocessing/QC_raw_reads.rules index 994adac5..38056331 100644 --- a/rules/0_preprocessing/QC_raw_reads.rules +++ b/rules/0_preprocessing/QC_raw_reads.rules @@ -56,7 +56,6 @@ def sample_list_overall_QC(): for i in set(all_samples[config['run_column']]): samples = list(all_samples.index.values[all_samples[config["run_column"]] == i]) for s in samples: - print(reads_ext) suffix_s = reads_ext[s] combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = i, sample = s, suffix = suffix_s) file_list = file_list + combined_values diff --git a/rules/0_preprocessing/get_reads.rules b/rules/0_preprocessing/get_reads.rules index 770de26b..1090643a 100644 --- a/rules/0_preprocessing/get_reads.rules +++ b/rules/0_preprocessing/get_reads.rules @@ -91,14 +91,14 @@ if "local_samples" in config.keys(): read_correct[sample] = reads_local[old_sample_name] if "LibraryLayout" in list(local_data): - if local_data.loc[i, "LibraryLayout"].lower()=="paired": - reads_ext[i]=["R1", "R2"] + if local_data.loc[sample, "LibraryLayout"].lower()=="paired": + reads_ext[sample]=["R1", "R2"] elif sra_data.loc[i, "LibraryLayout"].lower()=="single": - reads_ext[i]=["single"] + reads_ext[sample]=["single"] else: raise ValueError("Problem in the sra file, LibraryLayout badly defined") else: - reads_ext[i]=["R1", "R2"] + reads_ext[sample]=["R1", "R2"] original_names = original_correct reads_local = read_correct
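
Series note: the common thread of patches 05 through 24 above is generalizing the pipeline from paired-end-only to mixed paired/single-end input. A reads_ext dict maps each sample to its read suffixes (["R1", "R2"] or ["single"]), driven by the LibraryLayout column when present and defaulting to paired-end, and input functions then expand the FastQC/DADA2 targets per sample from that dict. Below is a minimal, self-contained sketch of that pattern, not code from the repository: the DataFrame contents and the "Run" column name are illustrative stand-ins for all_samples and config["run_column"].

    # Sketch of the per-sample suffix dispatch used across this series.
    # Assumes a pandas DataFrame indexed by sample name, with an optional
    # "LibraryLayout" column; all names and values here are illustrative.
    import pandas as pd

    all_samples = pd.DataFrame(
        {"Run": ["run1", "run1", "run2"],
         "LibraryLayout": ["PAIRED", "SINGLE", "PAIRED"]},
        index=["S1", "S2", "S3"])

    reads_ext = {}
    for sample in all_samples.index:
        if "LibraryLayout" in all_samples.columns:
            layout = all_samples.loc[sample, "LibraryLayout"].lower()
            if layout == "paired":
                reads_ext[sample] = ["R1", "R2"]
            elif layout == "single":
                reads_ext[sample] = ["single"]
            else:
                raise ValueError("LibraryLayout badly defined")
        else:
            # No layout information: fall back to paired-end naming.
            reads_ext[sample] = ["R1", "R2"]

    def sample_list_run_QC(run):
        # Expand the per-run FastQC targets with each sample's own
        # suffixes, mirroring the input function fixed in patches 07-09.
        targets = []
        for s in all_samples.index[all_samples["Run"] == run]:
            for suffix in reads_ext[s]:
                targets.append(
                    f"QC/FastQC/raw_reads/{run}/{s}_{suffix}_fastqc.zip")
        return targets

    print(sample_list_run_QC("run1"))
    # ['QC/FastQC/raw_reads/run1/S1_R1_fastqc.zip',
    #  'QC/FastQC/raw_reads/run1/S1_R2_fastqc.zip',
    #  'QC/FastQC/raw_reads/run1/S2_single_fastqc.zip']

Because the target list carries the right suffixes per sample, a single MultiQC rule can serve runs that mix layouts, which is why patch 07 collapses the separate single and paired report rules into one create_per_run_multiqc_report.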