diff --git a/Snakefile b/Snakefile
index ff3908dd..be7c49f9 100644
--- a/Snakefile
+++ b/Snakefile
@@ -11,7 +11,7 @@ if "--use-singularity" in sys.argv:
     ### Bind the directory of the database to the singularity containers.
     workflow.singularity_args += f' -B {config["tax_DB_path"]}:{config["tax_DB_path"]}'
     #### Load a dictionnary of singularity containers that will be called from each rule
-    singularity_envs = yaml.safe_load(open(os.path.join(workflow.basedir, "envs/singularity/sing_envs.yml"), 'r'))
+singularity_envs = yaml.safe_load(open(os.path.join(workflow.basedir, "envs/singularity/sing_envs.yml"), 'r'))


 ## Include the pipeline rules
diff --git a/ressources/r_scripts/importation/create_phyloseq_from_metadata.R b/ressources/r_scripts/importation/create_phyloseq_from_metadata.R
index b2ae24ce..28ad3602 100755
--- a/ressources/r_scripts/importation/create_phyloseq_from_metadata.R
+++ b/ressources/r_scripts/importation/create_phyloseq_from_metadata.R
@@ -37,6 +37,6 @@ create_phyloseq_fct <- function(physeq_name = "physeq", melted_df_name = "physeq
   save(physeq_obj, file = "physeq_object")

   ### Write this table in a tsv file since it is ground for coming analysis and slow to compute
-  write.table(x = physeq_df, file = "physeq_df.tsv", append = F, quote = F, sep = "\t", eol = "\n", row.names = F, col.names = T )
+  write.table(x = physeq_df, file = "physeq_df.tsv", append = F, sep = "\t", eol = "\n", row.names = F, col.names = T )
 }
\ No newline at end of file
diff --git a/rules/0_preprocessing/QC_raw_reads.rules b/rules/0_preprocessing/QC_raw_reads.rules
index cb0e25e4..38056331 100644
--- a/rules/0_preprocessing/QC_raw_reads.rules
+++ b/rules/0_preprocessing/QC_raw_reads.rules
@@ -1,18 +1,16 @@
 ## Generate MultiQC reports
-rule assess_quality_paired_raw_reads_with_fastqc:
+rule assess_quality_raw_reads_with_fastqc:
     conda:
         "../../envs/FastQC.yml"
     container:
         singularity_envs["fastqc"]
     input:
-        "raw_reads/{sample}_R1.fastq.gz",
-        "raw_reads/{sample}_R2.fastq.gz",
+        "raw_reads/{sample}_{suffix}.fastq.gz"
     output:
-        "QC/FastQC/raw_reads/{RUN}/{sample}_R1_fastqc.zip",
-        "QC/FastQC/raw_reads/{RUN}/{sample}_R2_fastqc.zip",
+        "QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip"
     log:
-        logging_folder + "QC/FastQC/raw_reads/{RUN}/{sample}_fastqc.txt"
+        logging_folder + "QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.txt"
     threads:
         1
     shell:
         """
@@ -21,14 +19,24 @@ rule assess_quality_paired_raw_reads_with_fastqc:
         """


-rule create_per_run_raw_reads_multiqc_report:
+def sample_list_run_QC(RUN):
+    file_list = []
+    samples = list(all_samples.index.values[all_samples[config["run_column"]] == RUN])
+    for s in samples:
+        suffix_s = reads_ext[s]
+        combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = RUN, sample = s, suffix = suffix_s)
+        file_list = file_list + combined_values
+
+    return(file_list)
+
+
+rule create_per_run_multiqc_report:
     conda:
         "../../envs/MultiQC.yml"
     container:
         singularity_envs["multiqc"]
     input:
-        lambda wildcards: expand("QC/FastQC/raw_reads/{RUN}/{sample}_R1_fastqc.zip", sample = all_samples.index.values[all_samples[config["run_column"]] == wildcards.RUN], RUN = wildcards.RUN),
-        lambda wildcards: expand("QC/FastQC/raw_reads/{RUN}/{sample}_R2_fastqc.zip", sample = all_samples.index.values[all_samples[config["run_column"]] == wildcards.RUN], RUN = wildcards.RUN),
+        lambda wildcards: sample_list_run_QC(wildcards.RUN)
     output:
         report("QC/{RUN}_multiqc_raw_reads_report.html", caption="report/MultiQC.rst", category="MultiQC", subcategory="{RUN}"),
         "QC/{RUN}_multiqc_raw_reads_report_data/multiqc_general_stats.txt",
@@ -41,13 +49,16 @@ rule create_per_run_raw_reads_multiqc_report:
         multiqc --interactive -f -n {output[0]} $(dirname {input} | tr "\n" " ") --cl_config "max_table_rows: 100000" &> {log}
         """

-def sample_list():
+def sample_list_overall_QC():
     file_list = []
     for i in set(all_samples[config['run_column']]):
-        combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_R{read_number}_fastqc.zip", RUN = i, sample = all_samples.index.values[all_samples[config["run_column"]] == i], read_number = ["1","2"])
-        file_list = file_list + combined_values
+        samples = list(all_samples.index.values[all_samples[config["run_column"]] == i])
+        for s in samples:
+            suffix_s = reads_ext[s]
+            combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = i, sample = s, suffix = suffix_s)
+            file_list = file_list + combined_values

     return(file_list)

@@ -58,7 +69,7 @@ rule create_overall_raw_reads_multiqc_report:
     container:
         singularity_envs["multiqc"]
     input:
-        sample_list()
+        sample_list_overall_QC()
     output:
         report("QC/multiqc_raw_reads_report.html", caption="report/MultiQC.rst", category="MultiQC", subcategory="All run"),
         "QC/multiqc_raw_reads_report_data/multiqc_general_stats.txt",
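The per-run helper above derives FastQC targets from reads_ext instead of assuming R1/R2 files. A minimal sketch of the expected expansion, using invented sample names and toy stand-ins for the pipeline globals (pandas and snakemake assumed installed):

    import pandas
    from snakemake.io import expand

    # Toy stand-ins for the pipeline's globals (names are illustrative only).
    all_samples = pandas.DataFrame({"run_column": ["run1", "run1"]}, index=["sampleA", "sampleB"])
    config = {"run_column": "run_column"}
    reads_ext = {"sampleA": ["R1", "R2"], "sampleB": ["single"]}

    def sample_list_run_QC(RUN):
        file_list = []
        samples = list(all_samples.index.values[all_samples[config["run_column"]] == RUN])
        for s in samples:
            # One FastQC zip per read file: two for paired samples, one for single-end.
            file_list += expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip",
                                RUN=RUN, sample=s, suffix=reads_ext[s])
        return file_list

    print(sample_list_run_QC("run1"))
    # ['QC/FastQC/raw_reads/run1/sampleA_R1_fastqc.zip',
    #  'QC/FastQC/raw_reads/run1/sampleA_R2_fastqc.zip',
    #  'QC/FastQC/raw_reads/run1/sampleB_single_fastqc.zip']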
"QC/{RUN}_multiqc_raw_reads_report_data/multiqc_general_stats.txt", @@ -41,13 +49,16 @@ rule create_per_run_raw_reads_multiqc_report: multiqc --interactive -f -n {output[0]} $(dirname {input} | tr "\n" " ") --cl_config "max_table_rows: 100000" &> {log} """ -def sample_list(): +def sample_list_overall_QC(): file_list = [] for i in set(all_samples[config['run_column']]): - combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_R{read_number}_fastqc.zip", RUN = i, sample = all_samples.index.values[all_samples[config["run_column"]] == i], read_number = ["1","2"]) - file_list = file_list + combined_values + samples = list(all_samples.index.values[all_samples[config["run_column"]] == i]) + for s in samples: + suffix_s = reads_ext[s] + combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = i, sample = s, suffix = suffix_s) + file_list = file_list + combined_values return(file_list) @@ -58,7 +69,7 @@ rule create_overall_raw_reads_multiqc_report: container: singularity_envs["multiqc"] input: - sample_list() + sample_list_overall_QC() output: report("QC/multiqc_raw_reads_report.html", caption="report/MultiQC.rst", category="MultiQC", subcategory="All run"), "QC/multiqc_raw_reads_report_data/multiqc_general_stats.txt", diff --git a/rules/0_preprocessing/get_reads.rules b/rules/0_preprocessing/get_reads.rules index efe20cc8..1090643a 100644 --- a/rules/0_preprocessing/get_reads.rules +++ b/rules/0_preprocessing/get_reads.rules @@ -43,6 +43,7 @@ reads_sra = {} reads_local = {} original_names = {} +reads_ext = {} if "local_samples" not in config.keys() and "sra_samples" not in config.keys(): raise ValueError("No samples defined in the config file") @@ -66,6 +67,18 @@ if "local_samples" in config.keys(): sample = str(sorted(list(original_names.keys()))[match.index(True)]) original_correct[i] = original_names[sample] read_correct[i] = reads_local[sample] + + if "LibraryLayout" in list(local_data): + if local_data.loc[i, "LibraryLayout"].lower()=="paired": + reads_ext[i]=["R1", "R2"] + elif sra_data.loc[i, "LibraryLayout"].lower()=="single": + reads_ext[i]=["single"] + else: + raise ValueError("Problem in the sra file, LibraryLayout badly defined") + else: + reads_ext[i]=["R1", "R2"] + + else: for i in list(local_data["OldSampleName"]): regex = re.compile(r'%s([^a-zA-Z0-9]|$)' % i) @@ -76,9 +89,21 @@ if "local_samples" in config.keys(): sample=str(local_data.index[local_data['OldSampleName'] == i][0]) original_correct[sample] = original_names[old_sample_name] read_correct[sample] = reads_local[old_sample_name] + + if "LibraryLayout" in list(local_data): + if local_data.loc[sample, "LibraryLayout"].lower()=="paired": + reads_ext[sample]=["R1", "R2"] + elif sra_data.loc[i, "LibraryLayout"].lower()=="single": + reads_ext[sample]=["single"] + else: + raise ValueError("Problem in the sra file, LibraryLayout badly defined") + else: + reads_ext[sample]=["R1", "R2"] + original_names = original_correct reads_local = read_correct all_samples=local_data + reads_ext = reads_ext if "sra_samples" in config.keys(): sra_data = pandas.read_csv(config["sra_samples"], sep="\t", index_col=0).drop_duplicates() @@ -92,8 +117,10 @@ if "sra_samples" in config.keys(): reads_sra[sample_name]=str(i) if sra_data.loc[i, "LibraryLayout"].lower()=="paired": sras_ext[sample_name]=["1.fastq.gz", "2.fastq.gz"] + reads_ext[sample_name]=["R1", "R2"] elif sra_data.loc[i, "LibraryLayout"].lower()=="single": sras_ext[sample_name] = ["fastq.gz"] + reads_ext[sample_name]=["single"] else: raise 
ValueError("Problem in the sra file, LibraryLayout badly defined") all_samples=sra_data diff --git a/rules/0_preprocessing/get_sras.rules b/rules/0_preprocessing/get_sras.rules index 7b0f3d50..c24f4277 100644 --- a/rules/0_preprocessing/get_sras.rules +++ b/rules/0_preprocessing/get_sras.rules @@ -38,6 +38,7 @@ rule sra_convert_to_fastq_paired: fastq-dump --split-3 --gzip --log-level 1 --disable-multithreading --minReadLen 0 --outdir $(dirname {output[0]}) {input[0]} &> {log} """ + rule sra_convert_to_fastq_single: conda: "../../envs/sra-tools.yml" diff --git a/rules/1_2_DADA2_ASVs/1_cutadapt_trim.rules b/rules/1_2_DADA2_ASVs/1_cutadapt_trim.rules index a1cd91f8..0dd7edb4 100644 --- a/rules/1_2_DADA2_ASVs/1_cutadapt_trim.rules +++ b/rules/1_2_DADA2_ASVs/1_cutadapt_trim.rules @@ -1,6 +1,6 @@ ## Remove primers from sequences -rule cutadapt_trim_for_DADA2: +rule cutadapt_trim_paired_for_DADA2: conda: "../../envs/cutadapt.yml" container: @@ -20,4 +20,25 @@ rule cutadapt_trim_for_DADA2: threads: 1 script: - "scripts/1_cutadapt.py" + "scripts/1_cutadapt_paired.py" + + +rule cutadapt_trim_single_for_DADA2: + conda: + "../../envs/cutadapt.yml" + container: + singularity_envs["cutadapt"] + input: + R1_raw_reads = "raw_reads/{sample}_single.fastq.gz", + output: + R1_trimmed_reads = temp("{denoiser}/1a_trimmed_primers/{sample}_trimmed_single.fastq.gz") + log: + logging_folder + "{denoiser}/1a_trimmed_primers/{sample}_single_trimmed.txt" + params: + amplicon_type = config["ITS_or_16S"], + forward_primer = config["forward_primer"], + reverse_primer = config["reverse_primer"], + threads: + 1 + script: + "scripts/1_cutadapt_single.py" \ No newline at end of file diff --git a/rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules b/rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules index 715fd571..f5224c79 100644 --- a/rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules +++ b/rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules @@ -1,22 +1,28 @@ ## Process trimmed sequences with DADA2 to correct errors and generate ASVs. 
Follows the "big data" protocol (https://benjjneb.github.io/dada2/bigdata.html) ## Define trimmed or not trimmed input (for read already trimmed) - -def DADA2_trim_input(): +def DADA2_trim_input_paired(): if config['Trim_primers'] == True : return ["DADA2/1a_trimmed_primers/{sample}_trimmed_R1.fastq.gz", "DADA2/1a_trimmed_primers/{sample}_trimmed_R2.fastq.gz"] else : return ["raw_reads/{sample}_R1.fastq.gz", "raw_reads/{sample}_R2.fastq.gz"] +def DADA2_trim_input_single(): + if config['Trim_primers'] == True : + return ["DADA2/1a_trimmed_primers/{sample}_trimmed_single.fastq.gz"] + else : + return ["raw_reads/{sample}_single.fastq.gz"] + + ### Remove low quality and too short reads -rule DADA2_q_filtering: +rule DADA2_q_filtering_paired: conda: "../../envs/DADA2_in_R.yml" container: singularity_envs["dada2"] input: - DADA2_trim_input() + DADA2_trim_input_paired() output: q_score_filtered_F = temp("DADA2/1b_q_score_filtered_paired/{sample}_filtered_R1.fastq.gz"), q_score_filtered_R = temp("DADA2/1b_q_score_filtered_paired/{sample}_filtered_R2.fastq.gz"), @@ -32,10 +38,34 @@ rule DADA2_q_filtering: threads: 1 script: - "scripts/1_DADA2_q_filtering.R" + "scripts/1_DADA2_q_filtering_paired.R" + + +rule DADA2_q_filtering_single: + conda: + "../../envs/DADA2_in_R.yml" + container: + singularity_envs["dada2"] + input: + DADA2_trim_input_single() + output: + q_score_filtered_F = temp("DADA2/1b_q_score_filtered_paired/{sample}_filtered_single.fastq.gz"), + q_filtering_stats = "DADA2/1b_q_score_filtered_paired/{sample}_q_score_filtering_stats.rds" + log: + logging_folder + "DADA2/1b_q_score_filtered/{sample}_DADA2_1b_q_score_filtering.txt" + params: + F_reads_length_trim = config["DADA2_F_reads_length_trim"], + F_reads_extected_error = config["F_extected_error"], + sample_name = lambda wildcards: wildcards.sample, + threads: + 1 + script: + "scripts/1_DADA2_q_filtering_single.R" + + ### Learn error profile of the runs -rule DADA2_learn_errors: +rule DADA2_learn_errors_paired: conda: "../../envs/DADA2_in_R.yml" container: @@ -53,10 +83,31 @@ rule DADA2_learn_errors: threads: 8 script: - "scripts/2a_big_data_DADA2_learn_errors.R" + "scripts/2a_big_data_DADA2_learn_errors_paired.R" -### Infer error free sequences from the filtered reads and the runs error profiles -rule DADA2_infer_ASV: + + +rule DADA2_learn_errors_single: + conda: + "../../envs/DADA2_in_R.yml" + container: + singularity_envs["dada2"] + input: + q_score_filtered_F = lambda wildcards: expand("DADA2/1b_q_score_filtered_paired/{sample}_filtered_single.fastq.gz", sample = all_samples.index.values[all_samples[config["run_column"]] == wildcards.RUN]), + output: + error_profile_F = "DADA2/2_denoised/{RUN}/error_profile_single.rds", + error_profile_F_plot = report("DADA2/2_denoised/{RUN}/error_profile_single_plot.png", caption="report/DADA2_error_profile.rst", category="DADA2", subcategory="Error profile"), + log: + logging_folder + "DADA2/2_denoised/{RUN}/DADA2_learn_errors.txt" + threads: + 8 + script: + "scripts/2a_big_data_DADA2_learn_errors_single.R" + + + +## Correct errors based on profile error of the run +rule DADA2_infer_paired_ASV: conda: "../../envs/DADA2_in_R.yml" container: @@ -79,7 +130,35 @@ rule DADA2_infer_ASV: threads: 4 script: - "scripts/2b_big_data_DADA2_infer_ASV.R" + "scripts/2b_big_data_DADA2_infer_ASV_paired.R" + + + +rule DADA2_infer_single_ASV: + conda: + "../../envs/DADA2_in_R.yml" + container: + singularity_envs["dada2"] + input: + q_score_filtered_F = 
"DADA2/1b_q_score_filtered_paired/{sample}_filtered_single.fastq.gz", + error_profile_F = "DADA2/2_denoised/{RUN}/error_profile_single.rds", + output: + infer_stats = "DADA2/2_denoised/{RUN}/{sample}_infer_stats.rds", + sample_seq_tab = "DADA2/2_denoised/{RUN}/{sample}_infer_seq_tab.rds", + params: + sample_name = lambda wildcards: wildcards.sample, + run = lambda wildcards: wildcards.RUN, + x_column_value = lambda wildcards: all_samples[config["sample_label"]][wildcards.sample], + min_overlap = config["min_overlap"] + log: + logging_folder + "DADA2/2_denoised/{RUN}/{sample}_DADA2_2_infer_ASV.txt" + threads: + 4 + script: + "scripts/2b_big_data_DADA2_infer_ASV_single.R" + + + ### Merge the ASVs determined from the different samples rule DADA2_merge_sample_ASV: @@ -100,6 +179,7 @@ rule DADA2_merge_sample_ASV: script: "scripts/2c_big_data_DADA2_merge_ASV.R" + ### Merge data from all run, filter out chimera and remove too short merged sequences rule DADA2_merge_runs_filter_chim: conda: diff --git a/rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering.R b/rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_paired.R similarity index 98% rename from rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering.R rename to rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_paired.R index 6f690985..809ca1ae 100644 --- a/rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering.R +++ b/rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_paired.R @@ -33,7 +33,7 @@ ### Reads are filtered based on the number of errors expected for the read (integration of the qscore and the length of the read). All reads with uncalled nucleotide (N) are removed too. Remaining phiX reads will be removed too. Finally, reads are cut at a length set in config. ### Filter and trim. The filtered reads are directly written while the filtering stats are being save for later compilation. 
diff --git a/rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_single.R b/rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_single.R
new file mode 100644
index 00000000..c378b72a
--- /dev/null
+++ b/rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_single.R
@@ -0,0 +1,37 @@
+# Title : DADA2 - Q-score filtering
+# Objective : Filter reads based on q-score and length
+# Created by: valentinscherz
+# Created on: 06.06.19
+# Modified from :https://benjjneb.github.io/dada2/bigdata_paired.html
+
+## Redirect R output to the log file
+  log <- file(snakemake@log[[1]], open="wt")
+  sink(log)
+  sink(log, type="message")
+
+## Input
+  fnFs <- snakemake@input[[1]]
+
+## Output
+  q_score_filtered_F <- snakemake@output[["q_score_filtered_F"]]
+  q_filtering_stats_path <- snakemake@output[["q_filtering_stats"]]
+
+## Parameters
+  F_length <- snakemake@params[["F_reads_length_trim"]]
+  F_extected_error <- snakemake@params[["F_reads_extected_error"]]
+  sample_name <- snakemake@params[["sample_name"]]
+
+
+## Load needed libraries
+  library(dada2);packageVersion("dada2")
+
+## Filter and trim
+### Reads are filtered based on the number of errors expected for the read (integration of the qscore and the length of the read). All reads with uncalled nucleotide (N) are removed too. Remaining phiX reads will be removed too. Finally, reads are cut at a length set in config.
+
+### Filter and trim. The filtered reads are directly written while the filtering stats are being saved for later compilation.
+  filtering_stats <- filterAndTrim(fwd = fnFs, filt = q_score_filtered_F, truncLen=c(F_length), maxN=0, maxEE=c(F_extected_error), truncQ=2, rm.phix=TRUE, compress=TRUE, multithread=snakemake@threads, verbose = TRUE)
+  filtering_stats <- as.data.frame(filtering_stats)
+  filtering_stats$Sample <- sample_name
+
+### Save the stats for this sample in a R object
+  saveRDS(filtering_stats, file = q_filtering_stats_path)
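Both q-score filtering scripts drop reads on DADA2's expected-error criterion: the value compared against F_extected_error is EE = sum(10^(-Q/10)) over a read's Phred scores after truncation. A toy check of that arithmetic:

    # Expected errors of a read from its Phred+33 quality string (toy values).
    quality_string = "IIIIIIIIII??????????"          # ten Q40 bases followed by ten Q30 bases
    phred = [ord(c) - 33 for c in quality_string]
    expected_errors = sum(10 ** (-q / 10) for q in phred)
    print(round(expected_errors, 4))                 # 0.011, far below a typical maxEE of 2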
diff --git a/rules/1_2_DADA2_ASVs/scripts/1_cutadapt.py b/rules/1_2_DADA2_ASVs/scripts/1_cutadapt_paired.py
similarity index 100%
rename from rules/1_2_DADA2_ASVs/scripts/1_cutadapt.py
rename to rules/1_2_DADA2_ASVs/scripts/1_cutadapt_paired.py
diff --git a/rules/1_2_DADA2_ASVs/scripts/1_cutadapt_single.py b/rules/1_2_DADA2_ASVs/scripts/1_cutadapt_single.py
new file mode 100644
index 00000000..d3cd84ac
--- /dev/null
+++ b/rules/1_2_DADA2_ASVs/scripts/1_cutadapt_single.py
@@ -0,0 +1,36 @@
+from Bio.Seq import Seq
+from Bio.Alphabet import IUPAC
+from snakemake import shell
+
+if snakemake.params['amplicon_type'] == "ITS":
+    print("ITS Trimming")
+    forward_primer_compl = Seq.reverse_complement(Seq(snakemake.params['forward_primer'], IUPAC.ambiguous_dna))
+    reverse_primer_compl = Seq.reverse_complement(Seq(snakemake.params['reverse_primer'], IUPAC.ambiguous_dna))
+    shell("""cutadapt \
+        --cores {snakemake.threads} \
+        --error-rate 0.1 \
+        --times 2 \
+        --overlap 3 \
+        -o {snakemake.output[R1_trimmed_reads]} \
+        -g '{snakemake.params[forward_primer]}' \
+        -a '{reverse_primer_compl}' \
+        --match-read-wildcards \
+        --discard-untrimmed \
+        {snakemake.input[R1_raw_reads]} >> {snakemake.log[0]}""")
+
+elif snakemake.params['amplicon_type'] == "16S":
+    print("16S Trimming")
+    shell("""cutadapt \
+        --cores {snakemake.threads} \
+        --error-rate 0.1 \
+        --times 1 \
+        --overlap 3 \
+        -o {snakemake.output[R1_trimmed_reads]} \
+        -g '{snakemake.params[forward_primer]}' \
+        --match-read-wildcards \
+        --discard-untrimmed \
+        {snakemake.input[R1_raw_reads]} >> {snakemake.log[0]}""")
+
+
+
+
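The new single-end trimming script above imports Bio.Alphabet, a module Biopython removed in release 1.78, so it presumably relies on an older Biopython shipped with the cutadapt environment. A version-tolerant sketch of the same reverse-complement step (the primer string is only an example):

    from Bio.Seq import Seq

    def reverse_complement_primer(primer):
        """Reverse complement an IUPAC-ambiguous primer on old and new Biopython."""
        try:
            from Bio.Alphabet import IUPAC           # Biopython < 1.78 only
            return str(Seq(primer, IUPAC.ambiguous_dna).reverse_complement())
        except ImportError:                          # Biopython >= 1.78: alphabets are gone
            return str(Seq(primer).reverse_complement())

    print(reverse_complement_primer("GTGYCAGCMGCCGCGGTAA"))  # TTACCGCGGCKGCTGRCAC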
diff --git a/rules/1_2_DADA2_ASVs/scripts/2a_big_data_DADA2_learn_errors.R b/rules/1_2_DADA2_ASVs/scripts/2a_big_data_DADA2_learn_errors_paired.R
similarity index 100%
rename from rules/1_2_DADA2_ASVs/scripts/2a_big_data_DADA2_learn_errors.R
rename to rules/1_2_DADA2_ASVs/scripts/2a_big_data_DADA2_learn_errors_paired.R
diff --git a/rules/1_2_DADA2_ASVs/scripts/2a_big_data_DADA2_learn_errors_single.R b/rules/1_2_DADA2_ASVs/scripts/2a_big_data_DADA2_learn_errors_single.R
new file mode 100644
index 00000000..d6eec62e
--- /dev/null
+++ b/rules/1_2_DADA2_ASVs/scripts/2a_big_data_DADA2_learn_errors_single.R
@@ -0,0 +1,33 @@
+# Title : DADA2 - Learn run errors profile
+# Objective : Generate an error profile for each run
+# Created by: valentinscherz
+# Created on: 06.06.19
+# Modified from :https://benjjneb.github.io/dada2/bigdata_paired.html
+
+## Redirect R output
+  log <- file(snakemake@log[[1]], open="wt")
+  sink(log)
+  sink(log, type="message")
+
+## Input
+  q_score_filtered_F <- snakemake@input[["q_score_filtered_F"]]
+
+## Output
+  error_profile_F <- snakemake@output[["error_profile_F"]]
+  error_plot_F <- snakemake@output[["error_profile_F_plot"]]
+
+## Load needed libraries
+  library(dada2); packageVersion("dada2")
+
+
+## Learn error rates
+  errF <- learnErrors(q_score_filtered_F, nbases=1e8, multithread = snakemake@threads, verbose = 1)
+
+## Write these error profiles
+  saveRDS(object = errF, file = error_profile_F)
+
+## Write error_plots
+  png(error_plot_F)
+  plotErrors(errF, nominalQ=TRUE)
+
+  dev.off()
diff --git a/rules/1_2_DADA2_ASVs/scripts/2b_big_data_DADA2_infer_ASV.R b/rules/1_2_DADA2_ASVs/scripts/2b_big_data_DADA2_infer_ASV_paired.R
similarity index 100%
rename from rules/1_2_DADA2_ASVs/scripts/2b_big_data_DADA2_infer_ASV.R
rename to rules/1_2_DADA2_ASVs/scripts/2b_big_data_DADA2_infer_ASV_paired.R
diff --git a/rules/1_2_DADA2_ASVs/scripts/2b_big_data_DADA2_infer_ASV_single.R b/rules/1_2_DADA2_ASVs/scripts/2b_big_data_DADA2_infer_ASV_single.R
new file mode 100644
index 00000000..df2a7c41
--- /dev/null
+++ b/rules/1_2_DADA2_ASVs/scripts/2b_big_data_DADA2_infer_ASV_single.R
@@ -0,0 +1,64 @@
+# Title : DADA2 - infer ASVs
+# Objective : Infers ASVs from the reads, based on error profile of the run
+# Created by: valentinscherz
+# Created on: 06.06.2019
+# Modified from :https://benjjneb.github.io/dada2/bigdata_paired.html
+
+## Redirect R output
+  log <- file(snakemake@log[[1]], open="wt")
+  sink(log)
+  sink(log, type="message")
+
+## Input
+  q_score_filtered_F <- snakemake@input[["q_score_filtered_F"]]
+  error_profile_F <- snakemake@input[["error_profile_F"]]
+
+## Output
+  infer_stats <- snakemake@output[["infer_stats"]]
+  sample_seq_tab <- snakemake@output[["sample_seq_tab"]]
+
+## Parameters
+  sample_name <- snakemake@params[["sample_name"]]
+  run <- snakemake@params[["run"]]
+  x_column_value <- snakemake@params[["x_column_value"]]
+  min_overlap <- snakemake@params[["min_overlap"]]
+  print(paste("min_overlap is :", min_overlap))
+
+## Load needed libraries
+  library(dada2); packageVersion("dada2")
+
+## Create a useful function to count the number of sequences
+  getN <- function(x) sum(getUniques(x))
+
+## File renaming
+  names(q_score_filtered_F) <- sample_name
+
+## Read error rates
+  errF <- readRDS(error_profile_F)
+
+## Prepare named vector
+  mergers <- vector("list", 1)
+  names(mergers) <- sample_name
+
+## Sample inference
+  cat("Processing:", sample_name, "\n")
+  ## Forward
+  derepF <- derepFastq(q_score_filtered_F, verbose = TRUE)
+  ddF <- dada(derepF, err=errF, multithread = snakemake@threads, verbose = 1, pool = FALSE, selfConsist = TRUE)
+  mergers[[sample_name]] <- ddF
+
+## Save the dereplicated, corrected and merged sequences for this sample
+  saveRDS(mergers, file = sample_seq_tab)
+
+## For statistics record the number of reads
+  ### Write the statistics in a dataframe
+  infer <- data.frame(denoisedF = getN(ddF))
+  infer$denoisedR <- NULL
+  infer$merged_pairs <- infer$denoisedF
+  infer$Sample <- sample_name
+  infer$label <- x_column_value
+  infer$RUN <- run
+
+  ### Save the sequences stats for this sample
+  saveRDS(infer, file = infer_stats)
+
diff --git a/rules/4_post_processing/scripts/physeq_export.R b/rules/4_post_processing/scripts/physeq_export.R
index 38d100e2..da9d9966 100644
--- a/rules/4_post_processing/scripts/physeq_export.R
+++ b/rules/4_post_processing/scripts/physeq_export.R
@@ -41,7 +41,7 @@ ape::write.tree(tree, tree_path)
 metadata <- (sample_data(phyloseq_obj))
 metadata$Sample <- rownames(metadata)
 metadata <- metadata %>% select(Sample, everything())
-write.table(metadata, meta_path , sep="\t", quote=F, row.names = FALSE)
+write.table(metadata, meta_path , sep="\t", row.names = FALSE)

 ## Write the taxonomy table of the phyloseq object
 taxa_df<- as.data.frame(tax_table(phyloseq_obj), stringsAsFactors = F)
diff --git a/rules/4_post_processing/scripts/physeq_melt_table.R b/rules/4_post_processing/scripts/physeq_melt_table.R
index 798984d8..e4fcbc6a 100644
--- a/rules/4_post_processing/scripts/physeq_melt_table.R
+++ b/rules/4_post_processing/scripts/physeq_melt_table.R
@@ -29,4 +29,4 @@ phyloseq_obj <- prune_taxa(taxa_sums(phyloseq_obj) > 0, phyloseq_obj) ## Removes
 physeq_df <- psmelt(phyloseq_obj)

 ### Write this table in a tsv format
-write.table(x = physeq_df, file = phyloseq_melted_table, append = F, quote = F, sep = "\t", eol = "\n", row.names = F, col.names = T )
+write.table(x = physeq_df, file = phyloseq_melted_table, append = F, sep = "\t", eol = "\n", row.names = F, col.names = T )
diff --git a/rules/4_post_processing/scripts/transpose_and_add_meta_count_table.R b/rules/4_post_processing/scripts/transpose_and_add_meta_count_table.R
index 519ddb59..6ef31de6 100644
--- a/rules/4_post_processing/scripts/transpose_and_add_meta_count_table.R
+++ b/rules/4_post_processing/scripts/transpose_and_add_meta_count_table.R
@@ -20,7 +20,7 @@

 ## Load the phyloseq phyloseq_object table
  table <- read.table(count_table, sep = "\t", header = TRUE)
- meta <- read.table(meta, sep = "\t", header = TRUE)
+ meta <- read.table(meta, sep = "\t", header = TRUE, na.strings = "NA")

 ## Tranpose table
 table_t <- t(table)
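With quote=F dropped from these write.table calls, character columns in the exported TSVs are now double-quoted (write.table's default), and the transpose script now reads "NA" strings explicitly. A quote-aware pandas sketch for anyone consuming physeq_df.tsv or the metadata table outside R:

    import pandas

    # physeq_df.tsv is one of the affected outputs; quoted fields and "NA" cells
    # round-trip cleanly through an ordinary quote-aware, NA-aware reader.
    physeq_df = pandas.read_csv("physeq_df.tsv", sep="\t", quotechar='"', na_values="NA")
    print(physeq_df.head())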