Skip to content

Commit

Permalink
v1.1.0: rRNA plots and updated nextflow version (#45)
Browse files Browse the repository at this point in the history
* Update DSL2 enable

* Update minimum Nextflow version

* Uppercase workflow and process names according to DSL2 convention

* Remove unsupported publish directive

* Add channel element structure comments

* Add comment

* Fix lowercase process names

* Fix unqualified input value declaration

* Plot rRNA statistics from FastQScreen output (#44)

* extract rRNA numbers from fastq_screen and plot with MultiQC

* fix syntax

* custom rrna plots into main multiqc config

* refactored rrna extraction

* refactored combining rrna data

* rename channel

* added explanatory comments

* Update nextflow.config

update version to 1.1.0-rc1

* Bump version to 1.1.0 to make production release

Co-authored-by: Mahesh Binzer-Panchal <[email protected]>
Co-authored-by: Pontus Larsson <[email protected]>
Co-authored-by: alvaannett <[email protected]>
  • Loading branch information
4 people authored Sep 1, 2021
1 parent 2cbbeba commit aa60649
Show file tree
Hide file tree
Showing 6 changed files with 209 additions and 58 deletions.
4 changes: 2 additions & 2 deletions config/compute_resources.config
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
process {
withName: 'fastq_screen' {
withName: 'FASTQ_SCREEN' {
memory = '4G'
}
withName: 'get_QC_thresholds' {
withName: 'GET_QC_THRESHOLDS' {
errorStrategy = 'ignore'
}
withLabel: 'high_memory' {
Expand Down
1 change: 1 addition & 0 deletions config/multiqc_flowcell_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@ top_modules:
- 'custom_content'
- 'bcl2fastq'
- 'interop'

128 changes: 128 additions & 0 deletions config/multiqc_main_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,131 @@ custom_plot_config:

remove_sections:
- 'fastqc_sequence_counts'

# Custom-content sections showing the rRNA mapping statistics extracted from the FastQScreen output.
# Two sections are defined (a bar graph and a table); both are grouped under the parent section "rrna".
custom_data:
# Bar graph section; its input file is matched by the sp.rrna_plot pattern in this file (rrna_plot.tsv).
rrna_plot:
id: "rrna_plot"
section_name: "Ribosomal RNA - plot"
parent_id: "rrna"
parent_name: "Ribosomal RNA"
title: "rRNA mapping statistics extracted from FastQScreen output"
description: "shows the FastQScreen mapping statistics for the rRNA genome. The statistics have been extracted from the full FastQScreen output shown elsewhere in this report in order to highlight the rRNA contents."
file_format: "tsv"
plot_type: "bargraph"
# Columns of the tsv file that are plotted as bar categories.
categories:
- "#Unmapped"
- "#One_hit_one_genome"
- "#Multiple_hits_one_genome"
- "#One_hit_multiple_genomes"
# NOTE(review): this category has no leading '#', unlike the four above and unlike the
# "#Multiple_hits_multiple_genomes" key under rrna_table headers — confirm which spelling
# matches the column name in the FastQScreen header line and align the two.
- "Multiple_hits_multiple_genomes"
pconfig:
hide_zero_cats: False
cpswitch_c_active: False
title: "Reads mapped to rRNA genome"
# Table section; its input file is matched by the sp.rrna_table pattern in this file (rrna_table.tsv).
rrna_table:
id: "rrna_table"
parent_id: "rrna"
parent_name: "Ribosomal RNA"
section_name: "Ribosomal RNA - table"
file_format: "tsv"
plot_type: "table"
description: "shows the FastQScreen mapping statistics for the rRNA genome. The statistics have been extracted from the full FastQScreen output shown elsewhere in this report in order to highlight the rRNA contents."
pconfig:
sortRows: True
table_title: "rRNA mapping statistics extracted from FastQScreen output"
# Per-column settings. "Genome" and the '#' count columns other than "#Reads_processed" carry hidden: True;
# every '%' column is given a 0-100 range and a colour scale.
headers:
"Genome":
title: 'Genome'
description: screened genome
hidden: True
"#Reads_processed":
namespace: 'rRNA number'
title: 'Reads_processed'
format: '{:,.0f}'
description: number of sampled reads for the screen
"#Unmapped":
title: 'Unmapped'
namespace: 'rRNA number'
hidden: True
format: '{:,.0f}'
description: reads with no hits in any of the screened genomes
"%Unmapped":
namespace: 'rRNA percentage'
title: 'Unmapped'
suffix: '%'
max: 100
min: 0
ceiling: 100
floor: 0
scale: 'RdYlGn'
description: reads with no hits in any of the screened genomes
"#One_hit_one_genome":
namespace: 'rRNA number'
title: 'One_hit_one_genome'
hidden: True
format: '{:,.0f}'
description: reads with a unique hit only in the specified genome
"%One_hit_one_genome":
namespace: 'rRNA percentage'
title: 'One_hit_one_genome'
suffix: '%'
max: 100
min: 0
ceiling: 100
floor: 0
scale: 'Reds'
description: reads with a unique hit only in the specified genome
"#Multiple_hits_one_genome":
namespace: 'rRNA number'
title: 'Multiple_hits_one_genome'
hidden: True
format: '{:,.0f}'
description: reads with multiple hits only in the specified genome
"%Multiple_hits_one_genome":
namespace: 'rRNA percentage'
title: 'Multiple_hits_one_genome'
suffix: '%'
max: 100
min: 0
ceiling: 100
floor: 0
scale: 'Reds'
description: reads with multiple hits only in the specified genome
"#One_hit_multiple_genomes":
namespace: 'rRNA number'
title: 'One_hit_multiple_genomes'
hidden: True
format: '{:,.0f}'
description: reads with a unique hit in multiple screened genomes
"%One_hit_multiple_genomes":
namespace: 'rRNA percentage'
title: 'One_hit_multiple_genomes'
suffix: '%'
max: 100
min: 0
ceiling: 100
floor: 0
scale: 'Reds'
description: reads with a unique hit in multiple screened genomes
# NOTE(review): this key is '#'-prefixed, while the corresponding rrna_plot category is written
# without '#' — confirm against the actual column name in the tsv header.
"#Multiple_hits_multiple_genomes":
namespace: 'rRNA number'
title: 'Multiple_hits_multiple_genomes'
hidden: True
format: '{:,.0f}'
description: reads with multiple hits in multiple screened genomes
"%Multiple_hits_multiple_genomes":
namespace: 'rRNA percentage'
title: 'Multiple_hits_multiple_genomes'
suffix: '%'
max: 100
min: 0
ceiling: 100
floor: 0
scale: 'Reds'
description: reads with multiple hits in multiple screened genomes

# Search patterns: one entry per custom_data section id (rrna_table, rrna_plot), each naming
# the file that feeds that section. The two file names differ so each section gets its own file.
sp:
rrna_table:
fn: "rrna_table.tsv"
rrna_plot:
fn: "rrna_plot.tsv"
1 change: 0 additions & 1 deletion config/multiqc_project_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,3 @@ table_columns_visible:

top_modules:
- 'custom_content'

111 changes: 67 additions & 44 deletions main.nf
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#! /usr/bin/env nextflow

nextflow.preview.dsl=2
nextflow.enable.dsl=2
/* ####################################################
seqreports: SNP & SEQ Run folder QC pipeline
Expand Down Expand Up @@ -80,11 +80,7 @@ workflow {
Channel.fromPath(params.run_folder,checkIfExists:true)
.ifEmpty { "Error: No run folder (--run_folder) given."; exit 1 }
.set {run_folder}
check_run_quality(run_folder)

publish:
check_run_quality.out.projectqc to: "${params.result_dir}/projects", mode: 'copy', overwrite: true
check_run_quality.out.flowcellqc to: "${params.result_dir}/flowcell_report", mode: 'copy', overwrite: true
CHECK_RUN_QUALITY(run_folder)

}

Expand All @@ -106,13 +102,20 @@ def get_project_and_reads(run_folder) {

}

// Combines the per-project FastQC, FastQ Screen and rRNA result channels into one element per project.
// NOTE(review): this is a diff rendering — the two-argument signature and the mix/groupTuple line
// appear to be the removed version, the three-argument signature and the join chain the added one.
def combine_results_by_project (fastqc_results,fastq_screen_results) {
def combine_results_by_project (fastqc_results,fastq_screen_results,rrna_results) {
// fastqc_results // [Project, [fqcfiles1, fqcfiles2, fqcfiles3]]
// fastq_screen_results // [Project, [fqsfiles1, fqsfiles2, fqsfiles3]]
// rrna_results // [Project, [rrnafiles1, rrnafiles2, rrnafiles3]]

fastqc_results.mix(fastq_screen_results).groupTuple().map { it -> tuple(it[0],it[1][0].flatten(),it[1][1].flatten()) }
// Join on the first tuple element (the project name).
fastqc_results.join(fastq_screen_results)
.join(
// Collect the rRNA tsv content into one file per project, named "<Project>_rrna_table.tsv"
// (keepHeader/skip:1 are set so the header line is not repeated for every collected file).
rrna_results.collectFile(keepHeader:true,skip:1,sort:true) { it -> ["${it[0]}_rrna_table.tsv", it[1]] }
// Strip the "_rrna_table.tsv" suffix from the file name to recover the project key for the join.
.map { it -> tuple((it.name - ~/_rrna_table.tsv/), [it]) })
// [Project, [fqcfiles1,fqcfiles2,fqcfiles3],[fqsfiles1,fqsfiles2,fqsfiles3],[Project_rrna_table.tsv]]

}

workflow check_run_quality {
workflow CHECK_RUN_QUALITY {

/* Workflow Graph
Expand All @@ -128,46 +131,46 @@ workflow check_run_quality {
run_folder

main:
interop_summary(run_folder)
get_QC_thresholds(run_folder)
get_metadata(run_folder)
INTEROP_SUMMARY(run_folder)
GET_QC_THRESHOLDS(run_folder)
GET_METADATA(run_folder)
project_and_reads = get_project_and_reads(params.run_folder)
fastqc(project_and_reads)
fastq_screen(project_and_reads,
FASTQC(project_and_reads)
FASTQ_SCREEN(project_and_reads,
params.config_dir,
params.fastqscreen_databases)
multiqc_per_flowcell( params.run_folder,
fastqc.out.map{ it[1] }.collect(),
fastq_screen.out.map{ it[1] }.collect(),
interop_summary.out.collect(),
get_QC_thresholds.out.collect().ifEmpty([]),
get_metadata.out.collect(),
MULTIQC_PER_FLOWCELL( params.run_folder,
FASTQC.out.map{ it[1] }.collect(),
FASTQ_SCREEN.out.results.map{ it[1] }.collect(),
FASTQ_SCREEN.out.tsv.map{ it[1] }.collectFile(keepHeader:true,skip:1,sort:true),
INTEROP_SUMMARY.out.collect(),
GET_QC_THRESHOLDS.out.collect().ifEmpty([]),
GET_METADATA.out.collect(),
Channel.fromPath("${params.run_folder}/${params.bcl2fastq_outdir}/Stats/Stats.json").collect().ifEmpty([]),
params.assets_dir,
params.config_dir)
multiqc_per_project( params.run_folder,
combine_results_by_project(fastqc.out.groupTuple(),fastq_screen.out.groupTuple()),
get_metadata.out.collect(),
MULTIQC_PER_PROJECT( params.run_folder,
combine_results_by_project(
FASTQC.out.groupTuple(),
FASTQ_SCREEN.out.results.groupTuple(),
FASTQ_SCREEN.out.tsv),
GET_METADATA.out.collect(),
params.assets_dir,
params.config_dir)

emit:
flowcellqc = multiqc_per_flowcell.out
projectqc = multiqc_per_project.out

}

// ---------------------------------------------------
// Processes
// ---------------------------------------------------

process fastqc {

process FASTQC {

input:
tuple project, path(fastq_file)
tuple val(project), path(fastq_file)

output:
tuple project, path("*_results")
tuple val(project), path("*_results")

script:
"""
Expand All @@ -176,17 +179,20 @@ process fastqc {
"""
}

process fastq_screen {
process FASTQ_SCREEN {

input:
tuple project, path(fastq_file)
tuple val(project), path(fastq_file)
path config_dir
path fastqscreen_databases

output:
tuple project, path("*_results")
tuple val(project), path("*_results"), emit: results
tuple val(project), path("rrna.tsv"), emit: tsv

script:
outdir = fastq_file + "_fastq_screen_results"
sample_name = (fastq_file.name =~ /^(.*_S\d+_L\d{3}_R\d+).*/)[0][1]
"""
sed -E 's/^(THREADS[[:blank:]]+)[[:digit:]]+/\1${task.cpus}/' \\
${config_dir}/fastq_screen.conf > fastq_screen.conf
Expand All @@ -195,12 +201,18 @@ process fastq_screen {
elif [ "${fastqscreen_databases}" != "${fastqscreen_default_databases}" ]; then
sed -i 's#${fastqscreen_default_databases}#${fastqscreen_databases}#' fastq_screen.conf
fi
mkdir -p $fastq_file"_fastq_screen_results"
fastq_screen --conf fastq_screen.conf --outdir $fastq_file"_fastq_screen_results" $fastq_file
mkdir -p $outdir
fastq_screen --conf fastq_screen.conf --outdir $outdir $fastq_file
# extract rRNA numbers for custom plotting with MultiQC
printf \"Sample\\t\" > rrna.tsv
grep -e '^Genome' -m1 -h $outdir/*_screen.txt >> rrna.tsv
printf \"$sample_name\\t\" >> rrna.tsv
grep -e '^rRNA' -h $outdir/*_screen.txt >> rrna.tsv
"""
}

process get_QC_thresholds {
process GET_QC_THRESHOLDS {

input:
path runfolder
Expand All @@ -220,7 +232,7 @@ process get_QC_thresholds {
"""
}

process get_metadata {
process GET_METADATA {

input:
path runfolder
Expand All @@ -240,7 +252,7 @@ process get_metadata {
"""
}

process interop_summary {
process INTEROP_SUMMARY {

input:
path runfolder
Expand All @@ -254,26 +266,32 @@ process interop_summary {
"""
}

process multiqc_per_flowcell {
process MULTIQC_PER_FLOWCELL {

publishDir "${params.result_dir}/flowcell_report", mode: 'copy', overwrite: true
label 'high_memory'

input:
val runfolder_name // Run folder name
path ('FastQC/*') // Fastqc logs
path ('FastqScreen/*') // Fastq screen logs
path ('rRNA/rrna_table.tsv') // Extracted rRNA values
path ('Interop_summary/*') // Interop log
path qc_thresholds // Quality check thresholds (optional)
path sequencing_metadata // Sequencing meta data ( custom content data )
path bcl2fastq_stats // Bcl2Fastq logs
path assets // Staged copy of assets folder
path config_dir
path config_dir // Staged copy of config folder

output:
tuple path("*multiqc_report.html"), path("*_data.zip")

script:
threshold_parameter = qc_thresholds ? "-c ${qc_thresholds}" : ""
"""
# making a separate file to use for plotting in MultiQC since custom content can only have one plot per section
# as described here: https://multiqc.info/docs/#introduction-1
cp rRNA/rrna_table.tsv rRNA/rrna_plot.tsv
RUNFOLDER=\$( basename ${runfolder_name} )
multiqc \\
--title "Flowcell report for \${RUNFOLDER}" \\
Expand All @@ -286,21 +304,26 @@ process multiqc_per_flowcell {

}

process multiqc_per_project {
process MULTIQC_PER_PROJECT {

publishDir "${params.result_dir}/projects", mode: 'copy', overwrite: true
label 'high_memory'

input:
val runfolder_name
tuple project, path("FastQC/*"), path("FastqScreen/*")
tuple val(project), path("FastQC/*"), path("FastqScreen/*"), path("rRNA/rrna_table.tsv")
path sequencing_metadata
path assets // Staged copy of assets folder
path config_dir
path config_dir // Staged copy of config folder

output:
tuple path("${project}/*multiqc_report.html"), path("${project}/*_data.zip")

script:
"""
# making a separate file to use for plotting in MultiQC since custom content can only have one plot per section
# as described here: https://multiqc.info/docs/#introduction-1
cp rRNA/rrna_table.tsv rRNA/rrna_plot.tsv
RUNFOLDER=\$( basename ${runfolder_name} )
multiqc \\
--title "Report for project ${project} on runfolder \${RUNFOLDER}" \\
Expand Down
Loading

0 comments on commit aa60649

Please sign in to comment.