
Commit

Merge pull request #27 from metagenlab/single_pair_new_try
Single pair new try
valscherz authored Jul 6, 2020
2 parents 728a7dd + 892c3c2 commit dd52352
Showing 18 changed files with 341 additions and 31 deletions.
2 changes: 1 addition & 1 deletion Snakefile
@@ -11,7 +11,7 @@ if "--use-singularity" in sys.argv:
### Bind the directory of the database to the singularity containers.
workflow.singularity_args += f' -B {config["tax_DB_path"]}:{config["tax_DB_path"]}'
#### Load a dictionary of singularity containers that will be called from each rule
singularity_envs = yaml.safe_load(open(os.path.join(workflow.basedir, "envs/singularity/sing_envs.yml"), 'r'))
singularity_envs = yaml.safe_load(open(os.path.join(workflow.basedir, "envs/singularity/sing_envs.yml"), 'r'))


## Include the pipeline rules
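For orientation, the singularity_envs mapping loaded above is indexed by tool name in the rules below (for example singularity_envs["fastqc"]). A minimal sketch of the expected YAML structure, with invented placeholder container URIs rather than the repository's actual envs/singularity/sing_envs.yml:

import yaml

# Hypothetical sing_envs.yml content: tool name -> container URI (placeholders only).
example_sing_envs = """
fastqc: docker://quay.io/biocontainers/fastqc:0.11.9--0
multiqc: docker://quay.io/biocontainers/multiqc:1.9--py_1
"""

singularity_envs = yaml.safe_load(example_sing_envs)
print(singularity_envs["fastqc"])  # container URI a rule would run in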
@@ -37,6 +37,6 @@ create_phyloseq_fct <- function(physeq_name = "physeq", melted_df_name = "physeq
save(physeq_obj, file = "physeq_object")

### Write this table to a tsv file since it is the basis for downstream analyses and slow to compute
write.table(x = physeq_df, file = "physeq_df.tsv", append = F, quote = F, sep = "\t", eol = "\n", row.names = F, col.names = T )
write.table(x = physeq_df, file = "physeq_df.tsv", append = F, sep = "\t", eol = "\n", row.names = F, col.names = T )

}
37 changes: 24 additions & 13 deletions rules/0_preprocessing/QC_raw_reads.rules
@@ -1,18 +1,16 @@
## Generate MultiQC reports

rule assess_quality_paired_raw_reads_with_fastqc:
rule assess_quality_raw_reads_with_fastqc:
conda:
"../../envs/FastQC.yml"
container:
singularity_envs["fastqc"]
input:
"raw_reads/{sample}_R1.fastq.gz",
"raw_reads/{sample}_R2.fastq.gz",
"raw_reads/{sample}_{suffix}.fastq.gz"
output:
"QC/FastQC/raw_reads/{RUN}/{sample}_R1_fastqc.zip",
"QC/FastQC/raw_reads/{RUN}/{sample}_R2_fastqc.zip",
"QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip"
log:
logging_folder + "QC/FastQC/raw_reads/{RUN}/{sample}_fastqc.txt"
logging_folder + "QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.txt"
threads:
1
shell:
@@ -21,14 +19,24 @@ rule assess_quality_paired_raw_reads_with_fastqc:
"""


rule create_per_run_raw_reads_multiqc_report:
def sample_list_run_QC(RUN):
file_list = []
samples = list(all_samples.index.values[all_samples[config["run_column"]] == RUN])
for s in samples:
suffix_s = reads_ext[s]
combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = RUN, sample = s, suffix = suffix_s)
file_list = file_list + combined_values

return(file_list)


rule create_per_run_multiqc_report:
conda:
"../../envs/MultiQC.yml"
container:
singularity_envs["multiqc"]
input:
lambda wildcards: expand("QC/FastQC/raw_reads/{RUN}/{sample}_R1_fastqc.zip", sample = all_samples.index.values[all_samples[config["run_column"]] == wildcards.RUN], RUN = wildcards.RUN),
lambda wildcards: expand("QC/FastQC/raw_reads/{RUN}/{sample}_R2_fastqc.zip", sample = all_samples.index.values[all_samples[config["run_column"]] == wildcards.RUN], RUN = wildcards.RUN),
lambda wildcards: sample_list_run_QC(wildcards.RUN)
output:
report("QC/{RUN}_multiqc_raw_reads_report.html", caption="report/MultiQC.rst", category="MultiQC", subcategory="{RUN}"),
"QC/{RUN}_multiqc_raw_reads_report_data/multiqc_general_stats.txt",
@@ -41,13 +49,16 @@ rule create_per_run_raw_reads_multiqc_report:
multiqc --interactive -f -n {output[0]} $(dirname {input} | tr "\n" " ") --cl_config "max_table_rows: 100000" &> {log}
"""

def sample_list():
def sample_list_overall_QC():

file_list = []

for i in set(all_samples[config['run_column']]):
combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_R{read_number}_fastqc.zip", RUN = i, sample = all_samples.index.values[all_samples[config["run_column"]] == i], read_number = ["1","2"])
file_list = file_list + combined_values
samples = list(all_samples.index.values[all_samples[config["run_column"]] == i])
for s in samples:
suffix_s = reads_ext[s]
combined_values = expand("QC/FastQC/raw_reads/{RUN}/{sample}_{suffix}_fastqc.zip", RUN = i, sample = s, suffix = suffix_s)
file_list = file_list + combined_values

return(file_list)

@@ -58,7 +69,7 @@ rule create_overall_raw_reads_multiqc_report:
container:
singularity_envs["multiqc"]
input:
sample_list()
sample_list_overall_QC()
output:
report("QC/multiqc_raw_reads_report.html", caption="report/MultiQC.rst", category="MultiQC", subcategory="All run"),
"QC/multiqc_raw_reads_report_data/multiqc_general_stats.txt",
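For reference, the per-run and overall MultiQC inputs are now assembled from the per-sample suffix map instead of hard-coded R1/R2 patterns. A small, self-contained sketch of that behaviour in plain Python (sample names, run labels and the loop standing in for snakemake's expand() are illustrative only):

# Hypothetical per-sample read suffixes and run assignments.
reads_ext = {"sampleA": ["R1", "R2"], "sampleB": ["single"]}
run_of_sample = {"sampleA": "run1", "sampleB": "run1"}

def sample_list_run_qc(run):
    # Collect one FastQC zip per sample and per read suffix for the given run.
    targets = []
    for sample, suffixes in reads_ext.items():
        if run_of_sample[sample] != run:
            continue
        for suffix in suffixes:
            targets.append(f"QC/FastQC/raw_reads/{run}/{sample}_{suffix}_fastqc.zip")
    return targets

print(sample_list_run_qc("run1"))
# ['QC/FastQC/raw_reads/run1/sampleA_R1_fastqc.zip',
#  'QC/FastQC/raw_reads/run1/sampleA_R2_fastqc.zip',
#  'QC/FastQC/raw_reads/run1/sampleB_single_fastqc.zip']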
27 changes: 27 additions & 0 deletions rules/0_preprocessing/get_reads.rules
@@ -43,6 +43,7 @@ reads_sra = {}

reads_local = {}
original_names = {}
reads_ext = {}

if "local_samples" not in config.keys() and "sra_samples" not in config.keys():
raise ValueError("No samples defined in the config file")
@@ -66,6 +67,18 @@ if "local_samples" in config.keys():
sample = str(sorted(list(original_names.keys()))[match.index(True)])
original_correct[i] = original_names[sample]
read_correct[i] = reads_local[sample]

if "LibraryLayout" in list(local_data):
if local_data.loc[i, "LibraryLayout"].lower()=="paired":
reads_ext[i]=["R1", "R2"]
elif local_data.loc[i, "LibraryLayout"].lower()=="single":
reads_ext[i]=["single"]
else:
raise ValueError("Problem in the sample sheet, LibraryLayout badly defined")
else:
reads_ext[i]=["R1", "R2"]


else:
for i in list(local_data["OldSampleName"]):
regex = re.compile(r'%s([^a-zA-Z0-9]|$)' % i)
@@ -76,9 +89,21 @@ if "local_samples" in config.keys():
sample=str(local_data.index[local_data['OldSampleName'] == i][0])
original_correct[sample] = original_names[old_sample_name]
read_correct[sample] = reads_local[old_sample_name]

if "LibraryLayout" in list(local_data):
if local_data.loc[sample, "LibraryLayout"].lower()=="paired":
reads_ext[sample]=["R1", "R2"]
elif local_data.loc[sample, "LibraryLayout"].lower()=="single":
reads_ext[sample]=["single"]
else:
raise ValueError("Problem in the sample sheet, LibraryLayout badly defined")
else:
reads_ext[sample]=["R1", "R2"]

original_names = original_correct
reads_local = read_correct
all_samples=local_data
reads_ext = reads_ext

if "sra_samples" in config.keys():
sra_data = pandas.read_csv(config["sra_samples"], sep="\t", index_col=0).drop_duplicates()
@@ -92,8 +117,10 @@ if "sra_samples" in config.keys():
reads_sra[sample_name]=str(i)
if sra_data.loc[i, "LibraryLayout"].lower()=="paired":
sras_ext[sample_name]=["1.fastq.gz", "2.fastq.gz"]
reads_ext[sample_name]=["R1", "R2"]
elif sra_data.loc[i, "LibraryLayout"].lower()=="single":
sras_ext[sample_name] = ["fastq.gz"]
reads_ext[sample_name]=["single"]
else:
raise ValueError("Problem in the sra file, LibraryLayout badly defined")
all_samples=sra_data
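The new reads_ext dictionary maps each sample to either ["R1", "R2"] or ["single"], driven by an optional LibraryLayout column in the sample sheet and defaulting to paired-end when that column is absent. A self-contained sketch of that mapping with a toy pandas table (sample names and layouts are made up):

import pandas as pd

local_data = pd.DataFrame(
    {"LibraryLayout": ["paired", "single"]},
    index=["sampleA", "sampleB"],
)

reads_ext = {}
for sample in local_data.index:
    if "LibraryLayout" in list(local_data):  # is the column present?
        layout = local_data.loc[sample, "LibraryLayout"].lower()
        if layout == "paired":
            reads_ext[sample] = ["R1", "R2"]
        elif layout == "single":
            reads_ext[sample] = ["single"]
        else:
            raise ValueError("LibraryLayout badly defined")
    else:  # no layout information: assume paired-end
        reads_ext[sample] = ["R1", "R2"]

print(reads_ext)  # {'sampleA': ['R1', 'R2'], 'sampleB': ['single']}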
1 change: 1 addition & 0 deletions rules/0_preprocessing/get_sras.rules
@@ -38,6 +38,7 @@ rule sra_convert_to_fastq_paired:
fastq-dump --split-3 --gzip --log-level 1 --disable-multithreading --minReadLen 0 --outdir $(dirname {output[0]}) {input[0]} &> {log}
"""


rule sra_convert_to_fastq_single:
conda:
"../../envs/sra-tools.yml"
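The paired-end conversion rule above shells out to fastq-dump; the same call rebuilt as a hedged Python sketch, with the flags copied from the rule's shell command and a placeholder accession (the single-end rule is not shown here and may differ):

import subprocess

sra_file = "SRR000001.sra"  # hypothetical accession file
cmd = [
    "fastq-dump", "--split-3", "--gzip",
    "--log-level", "1",
    "--disable-multithreading",
    "--minReadLen", "0",
    "--outdir", "raw_reads",
    sra_file,
]
# --split-3 writes _1/_2 files for paired layouts and a single file otherwise,
# which is why the pipeline keeps separate paired and single conversion rules.
subprocess.run(cmd, check=True)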
25 changes: 23 additions & 2 deletions rules/1_2_DADA2_ASVs/1_cutadapt_trim.rules
@@ -1,6 +1,6 @@
## Remove primers from sequences

rule cutadapt_trim_for_DADA2:
rule cutadapt_trim_paired_for_DADA2:
conda:
"../../envs/cutadapt.yml"
container:
@@ -20,4 +20,25 @@ rule cutadapt_trim_for_DADA2:
threads:
1
script:
"scripts/1_cutadapt.py"
"scripts/1_cutadapt_paired.py"


rule cutadapt_trim_single_for_DADA2:
conda:
"../../envs/cutadapt.yml"
container:
singularity_envs["cutadapt"]
input:
R1_raw_reads = "raw_reads/{sample}_single.fastq.gz",
output:
R1_trimmed_reads = temp("{denoiser}/1a_trimmed_primers/{sample}_trimmed_single.fastq.gz")
log:
logging_folder + "{denoiser}/1a_trimmed_primers/{sample}_single_trimmed.txt"
params:
amplicon_type = config["ITS_or_16S"],
forward_primer = config["forward_primer"],
reverse_primer = config["reverse_primer"],
threads:
1
script:
"scripts/1_cutadapt_single.py"
100 changes: 90 additions & 10 deletions rules/1_2_DADA2_ASVs/2_DADA2_denoising.rules
@@ -1,22 +1,28 @@
## Process trimmed sequences with DADA2 to correct errors and generate ASVs. Follows the "big data" protocol (https://benjjneb.github.io/dada2/bigdata.html)

## Define trimmed or untrimmed input (for reads that were already trimmed)

def DADA2_trim_input():
def DADA2_trim_input_paired():
if config['Trim_primers'] == True :
return ["DADA2/1a_trimmed_primers/{sample}_trimmed_R1.fastq.gz", "DADA2/1a_trimmed_primers/{sample}_trimmed_R2.fastq.gz"]
else :
return ["raw_reads/{sample}_R1.fastq.gz", "raw_reads/{sample}_R2.fastq.gz"]


def DADA2_trim_input_single():
if config['Trim_primers'] == True :
return ["DADA2/1a_trimmed_primers/{sample}_trimmed_single.fastq.gz"]
else :
return ["raw_reads/{sample}_single.fastq.gz"]


### Remove low-quality and too-short reads
rule DADA2_q_filtering:
rule DADA2_q_filtering_paired:
conda:
"../../envs/DADA2_in_R.yml"
container:
singularity_envs["dada2"]
input:
DADA2_trim_input()
DADA2_trim_input_paired()
output:
q_score_filtered_F = temp("DADA2/1b_q_score_filtered_paired/{sample}_filtered_R1.fastq.gz"),
q_score_filtered_R = temp("DADA2/1b_q_score_filtered_paired/{sample}_filtered_R2.fastq.gz"),
@@ -32,10 +38,34 @@ rule DADA2_q_filtering:
threads:
1
script:
"scripts/1_DADA2_q_filtering.R"
"scripts/1_DADA2_q_filtering_paired.R"


rule DADA2_q_filtering_single:
conda:
"../../envs/DADA2_in_R.yml"
container:
singularity_envs["dada2"]
input:
DADA2_trim_input_single()
output:
q_score_filtered_F = temp("DADA2/1b_q_score_filtered_paired/{sample}_filtered_single.fastq.gz"),
q_filtering_stats = "DADA2/1b_q_score_filtered_paired/{sample}_q_score_filtering_stats.rds"
log:
logging_folder + "DADA2/1b_q_score_filtered/{sample}_DADA2_1b_q_score_filtering.txt"
params:
F_reads_length_trim = config["DADA2_F_reads_length_trim"],
F_reads_extected_error = config["F_extected_error"],
sample_name = lambda wildcards: wildcards.sample,
threads:
1
script:
"scripts/1_DADA2_q_filtering_single.R"



### Learn error profile of the runs
rule DADA2_learn_errors:
rule DADA2_learn_errors_paired:
conda:
"../../envs/DADA2_in_R.yml"
container:
@@ -53,10 +83,31 @@ rule DADA2_learn_errors:
threads:
8
script:
"scripts/2a_big_data_DADA2_learn_errors.R"
"scripts/2a_big_data_DADA2_learn_errors_paired.R"

### Infer error free sequences from the filtered reads and the runs error profiles
rule DADA2_infer_ASV:


rule DADA2_learn_errors_single:
conda:
"../../envs/DADA2_in_R.yml"
container:
singularity_envs["dada2"]
input:
q_score_filtered_F = lambda wildcards: expand("DADA2/1b_q_score_filtered_paired/{sample}_filtered_single.fastq.gz", sample = all_samples.index.values[all_samples[config["run_column"]] == wildcards.RUN]),
output:
error_profile_F = "DADA2/2_denoised/{RUN}/error_profile_single.rds",
error_profile_F_plot = report("DADA2/2_denoised/{RUN}/error_profile_single_plot.png", caption="report/DADA2_error_profile.rst", category="DADA2", subcategory="Error profile"),
log:
logging_folder + "DADA2/2_denoised/{RUN}/DADA2_learn_errors.txt"
threads:
8
script:
"scripts/2a_big_data_DADA2_learn_errors_single.R"



## Correct errors based on the error profile of the run
rule DADA2_infer_paired_ASV:
conda:
"../../envs/DADA2_in_R.yml"
container:
@@ -79,7 +130,35 @@ rule DADA2_infer_ASV:
threads:
4
script:
"scripts/2b_big_data_DADA2_infer_ASV.R"
"scripts/2b_big_data_DADA2_infer_ASV_paired.R"



rule DADA2_infer_single_ASV:
conda:
"../../envs/DADA2_in_R.yml"
container:
singularity_envs["dada2"]
input:
q_score_filtered_F = "DADA2/1b_q_score_filtered_paired/{sample}_filtered_single.fastq.gz",
error_profile_F = "DADA2/2_denoised/{RUN}/error_profile_single.rds",
output:
infer_stats = "DADA2/2_denoised/{RUN}/{sample}_infer_stats.rds",
sample_seq_tab = "DADA2/2_denoised/{RUN}/{sample}_infer_seq_tab.rds",
params:
sample_name = lambda wildcards: wildcards.sample,
run = lambda wildcards: wildcards.RUN,
x_column_value = lambda wildcards: all_samples[config["sample_label"]][wildcards.sample],
min_overlap = config["min_overlap"]
log:
logging_folder + "DADA2/2_denoised/{RUN}/{sample}_DADA2_2_infer_ASV.txt"
threads:
4
script:
"scripts/2b_big_data_DADA2_infer_ASV_single.R"




### Merge the ASVs determined from the different samples
rule DADA2_merge_sample_ASV:
@@ -100,6 +179,7 @@ rule DADA2_merge_sample_ASV:
script:
"scripts/2c_big_data_DADA2_merge_ASV.R"


### Merge data from all runs, filter out chimeras and remove too-short merged sequences
rule DADA2_merge_runs_filter_chim:
conda:
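The DADA2_trim_input_paired() and DADA2_trim_input_single() helpers at the top of this file share one pattern: point DADA2 at the primer-trimmed reads when Trim_primers is set, otherwise at the raw reads. Reproduced as a stand-alone sketch (the config value is a made-up example):

config = {"Trim_primers": True}  # illustrative value

def DADA2_trim_input_single():
    # Same switch as in the rules file: trimmed reads if primers were removed,
    # raw reads otherwise.
    if config["Trim_primers"]:
        return ["DADA2/1a_trimmed_primers/{sample}_trimmed_single.fastq.gz"]
    return ["raw_reads/{sample}_single.fastq.gz"]

print(DADA2_trim_input_single())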
@@ -33,7 +33,7 @@
### Reads are filtered based on the number of errors expected for the read (integrating the q-score and the read length). Reads with uncalled nucleotides (N) and any remaining phiX reads are removed as well. Finally, reads are cut to the length set in the config.

### Filter and trim. The filtered reads are written directly while the filtering stats are saved for later compilation.
filtering_stats <- filterAndTrim(fwd = fnFs, filt = q_score_filtered_F, rev = fnRs, filt.rev = q_score_filtered_R, truncLen=c(F_length,R_length), maxN=0, maxEE=c(F_extected_error,R_extected_error), truncQ=2, rm.phix=TRUE, compress=TRUE, multithread=snakemake@threads)
filtering_stats <- filterAndTrim(fwd = fnFs, filt = q_score_filtered_F, rev = fnRs, filt.rev = q_score_filtered_R, truncLen=c(F_length,R_length), maxN=0, maxEE=c(F_extected_error,R_extected_error), truncQ=2, rm.phix=TRUE, compress=TRUE, multithread=snakemake@threads, verbose = TRUE)
filtering_stats <- as.data.frame(filtering_stats)
filtering_stats$Sample <- sample_name

37 changes: 37 additions & 0 deletions rules/1_2_DADA2_ASVs/scripts/1_DADA2_q_filtering_single.R
@@ -0,0 +1,37 @@
# Title : DADA2 - Q-score filtering
# Objective : Filter reads based on q-score and length
# Created by: valentinscherz
# Created on: 06.06.19
# Modified from: https://benjjneb.github.io/dada2/bigdata_paired.html

## Redirect R output to the log file
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

## Input
fnFs <- snakemake@input[[1]]

## Output
q_score_filtered_F <- snakemake@output[["q_score_filtered_F"]]
q_filtering_stats_path <- snakemake@output[["q_filtering_stats"]]

## Parameters
F_length <- snakemake@params[["F_reads_length_trim"]]
F_extected_error <- snakemake@params[["F_reads_extected_error"]]
sample_name <- snakemake@params[["sample_name"]]


## Load needed libraries
library(dada2);packageVersion("dada2")

## Filter and trim
### Reads are filtered based on the number of errors expected for the read (integrating the q-score and the read length). Reads with uncalled nucleotides (N) and any remaining phiX reads are removed as well. Finally, reads are cut to the length set in the config.

### Filter and trim. The filtered reads are written directly while the filtering stats are saved for later compilation.
filtering_stats <- filterAndTrim(fwd = fnFs, filt = q_score_filtered_F, truncLen=c(F_length), maxN=0, maxEE=c(F_extected_error), truncQ=2, rm.phix=TRUE, compress=TRUE, multithread=snakemake@threads, verbose = TRUE)
filtering_stats <- as.data.frame(filtering_stats)
filtering_stats$Sample <- sample_name

### Save the stats for this sample in a R object
saveRDS(filtering_stats, file = q_filtering_stats_path)
File renamed without changes.