v1.1.0: rRNA plots and updated nextflow version (#45)

* Update DSL2 enable * Update minimum Nextflow version * Uppercase workflow and process names according to DSL2 convention * Remove unsupported publish directive * Add channel element structure comments * Add comment * Fix lowercase process names * Fix unqualified input value declaration * Plot rRNA statistics from FastQScreen output (#44) * extract rRNA numbers from fastq_screen and plot with MultiQC * fix syntax * custom rrna plots into main multiqc config * refactored rrna extraction * refactored combining rrna data * rename channel * added explanatory comments * Update nextflow.config update version to 1.1.0-rc1 * Bump version to 1.1.0 to make production release Co-authored-by: Mahesh Binzer-Panchal <[email protected]> Co-authored-by: Pontus Larsson <[email protected]> Co-authored-by: alvaannett <[email protected]>
Molmed · Sep 1, 2021 · aa60649 · aa60649
1 parent 2cbbeba
commit aa60649
Show file tree

Hide file tree

Showing 6 changed files with 209 additions and 58 deletions.
diff --git a/config/compute_resources.config b/config/compute_resources.config
@@ -1,8 +1,8 @@
 process {
-    withName: 'fastq_screen' {
+    withName: 'FASTQ_SCREEN' {
         memory = '4G'
     }
-    withName: 'get_QC_thresholds' {
+    withName: 'GET_QC_THRESHOLDS' {
         errorStrategy = 'ignore'
     }
     withLabel: 'high_memory' {

diff --git a/config/multiqc_flowcell_config.yaml b/config/multiqc_flowcell_config.yaml
@@ -17,3 +17,4 @@ top_modules:
     - 'custom_content'
     - 'bcl2fastq'
     - 'interop'
+
diff --git a/config/multiqc_main_config.yaml b/config/multiqc_main_config.yaml
@@ -23,3 +23,131 @@ custom_plot_config:
 
 remove_sections:
     - 'fastqc_sequence_counts'
+
+custom_data:
+  rrna_plot:
+    id: "rrna_plot"
+    section_name: "Ribosomal RNA - plot"
+    parent_id: "rrna"
+    parent_name: "Ribosomal RNA"
+    title: "rRNA mapping statistics extracted from FastQScreen output"
+    description: "shows the FastQScreen mapping statistics for the rRNA genome. The statistics have been extracted from the full FastQScreen output shown elsewhere in this report in order to highlight the rRNA contents."
+    file_format: "tsv"
+    plot_type: "bargraph"
+    categories:
+        - "#Unmapped"
+        - "#One_hit_one_genome"
+        - "#Multiple_hits_one_genome"
+        - "#One_hit_multiple_genomes"
+        - "Multiple_hits_multiple_genomes"
+    pconfig:
+      hide_zero_cats: False
+      cpswitch_c_active: False
+      title: "Reads mapped to rRNA genome"
+  rrna_table:
+    id: "rrna_table"
+    parent_id: "rrna"
+    parent_name: "Ribosomal RNA"
+    section_name: "Ribosomal RNA - table"
+    file_format: "tsv"
+    plot_type: "table"
+    description: "shows the FastQScreen mapping statistics for the rRNA genome. The statistics have been extracted from the full FastQScreen output shown elsewhere in this report in order to highlight the rRNA contents."
+    pconfig:
+      sortRows: True
+      table_title: "rRNA mapping statistics extracted from FastQScreen output"
+    headers:
+      "Genome":
+        title: 'Genome'
+        description: screened genome
+        hidden: True
+      "#Reads_processed":
+        namespace: 'rRNA number'
+        title: 'Reads_processed'
+        format: '{:,.0f}'
+        description: number of sampled reads for the screen
+      "#Unmapped":
+        title: 'Unmapped'
+        namespace: 'rRNA number'
+        hidden: True
+        format: '{:,.0f}'
+        description: reads with no hits in any of the screened genomes
+      "%Unmapped":
+        namespace: 'rRNA percentage'
+        title: 'Unmapped'
+        suffix: '%'
+        max: 100
+        min: 0
+        ceiling: 100
+        floor: 0
+        scale: 'RdYlGn'
+        description: reads with no hits in any of the screened genomes
+      "#One_hit_one_genome":
+        namespace: 'rRNA number'
+        title: 'One_hit_one_genome'
+        hidden: True
+        format: '{:,.0f}'
+        description: reads with a unique hit only in the specified genome
+      "%One_hit_one_genome":
+        namespace: 'rRNA percentage'
+        title: 'One_hit_one_genome'
+        suffix: '%'
+        max: 100
+        min: 0
+        ceiling: 100
+        floor: 0
+        scale: 'Reds'
+        description: reads with a unique hit only in the specified genome
+      "#Multiple_hits_one_genome":
+        namespace: 'rRNA number'
+        title: 'Multiple_hits_one_genome'
+        hidden: True
+        format: '{:,.0f}'
+        description: reads with multiple hits only in the specified genome
+      "%Multiple_hits_one_genome":
+        namespace: 'rRNA percentage'
+        title: 'Multiple_hits_one_genome'
+        suffix: '%'
+        max: 100
+        min: 0
+        ceiling: 100
+        floor: 0
+        scale: 'Reds'
+        description: reads with multiple hits only in the specified genome
+      "#One_hit_multiple_genomes":
+        namespace: 'rRNA number'
+        title: 'One_hit_multiple_genomes'
+        hidden: True
+        format: '{:,.0f}'
+        description: reads with a unique hit in multiple screened genomes
+      "%One_hit_multiple_genomes":
+        namespace: 'rRNA percentage'
+        title: 'One_hit_multiple_genomes'
+        suffix: '%'
+        max: 100
+        min: 0
+        ceiling: 100
+        floor: 0
+        scale: 'Reds'
+        description: reads with a unique hit in multiple screened genomes
+      "#Multiple_hits_multiple_genomes":
+        namespace: 'rRNA number'
+        title: 'Multiple_hits_multiple_genomes'
+        hidden: True
+        format: '{:,.0f}'
+        description: reads with multiple hits in multiple screened genomes
+      "%Multiple_hits_multiple_genomes":
+        namespace: 'rRNA percentage'
+        title: 'Multiple_hits_multiple_genomes'
+        suffix: '%'
+        max: 100
+        min: 0
+        ceiling: 100
+        floor: 0
+        scale: 'Reds'
+        description: reads with multiple hits in multiple screened genomes
+
+sp:
+  rrna_table:
+    fn: "rrna_table.tsv"
+  rrna_plot:
+    fn: "rrna_plot.tsv"
diff --git a/config/multiqc_project_config.yaml b/config/multiqc_project_config.yaml
@@ -11,4 +11,3 @@ table_columns_visible:
 
 top_modules:
     - 'custom_content'
-
diff --git a/main.nf b/main.nf
@@ -1,6 +1,6 @@
 #! /usr/bin/env nextflow
 
-nextflow.preview.dsl=2
+nextflow.enable.dsl=2
 /* ####################################################
 
    seqreports: SNP & SEQ Run folder QC pipeline
@@ -80,11 +80,7 @@ workflow {
     Channel.fromPath(params.run_folder,checkIfExists:true)
         .ifEmpty { "Error: No run folder (--run_folder) given."; exit 1 }
         .set {run_folder}
-    check_run_quality(run_folder)
-
-    publish:
-    check_run_quality.out.projectqc            to: "${params.result_dir}/projects", mode: 'copy', overwrite: true
-    check_run_quality.out.flowcellqc           to: "${params.result_dir}/flowcell_report", mode: 'copy', overwrite: true
+    CHECK_RUN_QUALITY(run_folder)
 
 }
 
@@ -106,13 +102,20 @@ def get_project_and_reads(run_folder) {
 
 }
 
-def combine_results_by_project (fastqc_results,fastq_screen_results) {
+def combine_results_by_project (fastqc_results,fastq_screen_results,rrna_results) {
+    // fastqc_results           // [Project, [fqcfiles1, fqcfiles2, fqcfiles3]]
+    // fastq_screen_results     // [Project, [fqsfiles1, fqsfiles2, fqsfiles3]]
+    // rrna_results             // [Project, [rrnafiles1, rrnafiles2, rrnafiles3]]
 
-    fastqc_results.mix(fastq_screen_results).groupTuple().map { it -> tuple(it[0],it[1][0].flatten(),it[1][1].flatten()) }
+    fastqc_results.join(fastq_screen_results)
+        .join(
+            rrna_results.collectFile(keepHeader:true,skip:1,sort:true) { it -> ["${it[0]}_rrna_table.tsv", it[1]] }
+            .map { it -> tuple((it.name - ~/_rrna_table.tsv/), [it]) })
+    // [Project, [fqcfiles1,fqcfiles2,fqcfiles3],[fqsfiles1,fqsfiles2,fqsfiles3],[Project_rrna_table.tsv]]
 
 }
 
-workflow check_run_quality {
+workflow CHECK_RUN_QUALITY {
 
     /* Workflow Graph
 
@@ -128,46 +131,46 @@ workflow check_run_quality {
         run_folder
 
     main:
-        interop_summary(run_folder)
-        get_QC_thresholds(run_folder)
-        get_metadata(run_folder)
+        INTEROP_SUMMARY(run_folder)
+        GET_QC_THRESHOLDS(run_folder)
+        GET_METADATA(run_folder)
         project_and_reads = get_project_and_reads(params.run_folder)
-        fastqc(project_and_reads)
-        fastq_screen(project_and_reads,
+        FASTQC(project_and_reads)
+        FASTQ_SCREEN(project_and_reads,
 		     params.config_dir,
 		     params.fastqscreen_databases)
-        multiqc_per_flowcell( params.run_folder,
-            fastqc.out.map{ it[1] }.collect(),
-            fastq_screen.out.map{ it[1] }.collect(),
-            interop_summary.out.collect(),
-            get_QC_thresholds.out.collect().ifEmpty([]),
-            get_metadata.out.collect(),
+        MULTIQC_PER_FLOWCELL( params.run_folder,
+            FASTQC.out.map{ it[1] }.collect(),
+            FASTQ_SCREEN.out.results.map{ it[1] }.collect(),
+            FASTQ_SCREEN.out.tsv.map{ it[1] }.collectFile(keepHeader:true,skip:1,sort:true),
+            INTEROP_SUMMARY.out.collect(),
+            GET_QC_THRESHOLDS.out.collect().ifEmpty([]),
+            GET_METADATA.out.collect(),
             Channel.fromPath("${params.run_folder}/${params.bcl2fastq_outdir}/Stats/Stats.json").collect().ifEmpty([]),
             params.assets_dir,
             params.config_dir)
-        multiqc_per_project( params.run_folder,
-            combine_results_by_project(fastqc.out.groupTuple(),fastq_screen.out.groupTuple()),
-            get_metadata.out.collect(),
+        MULTIQC_PER_PROJECT( params.run_folder,
+            combine_results_by_project(
+                FASTQC.out.groupTuple(),
+                FASTQ_SCREEN.out.results.groupTuple(),
+                FASTQ_SCREEN.out.tsv),
+            GET_METADATA.out.collect(),
             params.assets_dir,
             params.config_dir)
-
-    emit:
-        flowcellqc = multiqc_per_flowcell.out
-        projectqc = multiqc_per_project.out
-
 }
 
 // ---------------------------------------------------
 // Processes
 // ---------------------------------------------------
 
-process fastqc {
+
+process FASTQC {
 
     input:
-    tuple project, path(fastq_file)
+    tuple val(project), path(fastq_file)
 
     output:
-    tuple project, path("*_results")
+    tuple val(project), path("*_results")
 
     script:
     """
@@ -176,17 +179,20 @@ process fastqc {
     """
 }
 
-process fastq_screen {
+process FASTQ_SCREEN {
 
     input:
-    tuple project, path(fastq_file)
+    tuple val(project), path(fastq_file)
     path config_dir
     path fastqscreen_databases
 
     output:
-    tuple project, path("*_results")
+    tuple val(project), path("*_results"), emit: results
+    tuple val(project), path("rrna.tsv"), emit: tsv
 
     script:
+    outdir = fastq_file + "_fastq_screen_results"
+    sample_name = (fastq_file.name =~ /^(.*_S\d+_L\d{3}_R\d+).*/)[0][1]
     """
     sed -E 's/^(THREADS[[:blank:]]+)[[:digit:]]+/\1${task.cpus}/' \\
         ${config_dir}/fastq_screen.conf > fastq_screen.conf
@@ -195,12 +201,18 @@ process fastq_screen {
     elif [ "${fastqscreen_databases}" != "${fastqscreen_default_databases}" ]; then
         sed -i 's#${fastqscreen_default_databases}#${fastqscreen_databases}#' fastq_screen.conf
     fi
-    mkdir -p $fastq_file"_fastq_screen_results"
-    fastq_screen --conf fastq_screen.conf --outdir $fastq_file"_fastq_screen_results" $fastq_file
+    mkdir -p $outdir
+    fastq_screen --conf fastq_screen.conf --outdir $outdir $fastq_file
+    
+    # extract rRNA numbers for custom plotting with MultiQC
+    printf \"Sample\\t\" > rrna.tsv
+    grep -e '^Genome' -m1 -h $outdir/*_screen.txt >> rrna.tsv
+    printf \"$sample_name\\t\" >> rrna.tsv
+    grep -e '^rRNA' -h $outdir/*_screen.txt >> rrna.tsv
     """
 }
 
-process get_QC_thresholds {
+process GET_QC_THRESHOLDS {
 
     input:
     path runfolder
@@ -220,7 +232,7 @@ process get_QC_thresholds {
     """
 }
 
-process get_metadata {
+process GET_METADATA {
 
     input:
     path runfolder
@@ -240,7 +252,7 @@ process get_metadata {
     """
 }
 
-process interop_summary {
+process INTEROP_SUMMARY {
 
     input:
     path runfolder
@@ -254,26 +266,32 @@ process interop_summary {
     """
 }
 
-process multiqc_per_flowcell {
+process MULTIQC_PER_FLOWCELL {
+
+    publishDir "${params.result_dir}/flowcell_report", mode: 'copy', overwrite: true
     label 'high_memory'
 
     input:
     val runfolder_name              // Run folder name
     path ('FastQC/*')               // Fastqc logs
     path ('FastqScreen/*')          // Fastq screen logs
+    path ('rRNA/rrna_table.tsv')    // Extracted rRNA values
     path ('Interop_summary/*')      // Interop log
     path qc_thresholds              // Quality check thresholds (optional)
     path sequencing_metadata        // Sequencing meta data ( custom content data )
     path bcl2fastq_stats            // Bcl2Fastq logs
     path assets                     // Staged copy of assets folder
-    path config_dir    
+    path config_dir                 // Staged copy of config folder
 
     output:
     tuple path("*multiqc_report.html"), path("*_data.zip")
 
     script:
     threshold_parameter = qc_thresholds ? "-c ${qc_thresholds}" : ""
     """
+    # making a separate file to use for plotting in MultiQC since custom content can only have one plot per section
+    # as described here: https://multiqc.info/docs/#introduction-1
+    cp rRNA/rrna_table.tsv rRNA/rrna_plot.tsv
     RUNFOLDER=\$( basename ${runfolder_name} )
     multiqc \\
         --title "Flowcell report for \${RUNFOLDER}" \\
@@ -286,21 +304,26 @@ process multiqc_per_flowcell {
 
 }
 
-process multiqc_per_project {
+process MULTIQC_PER_PROJECT {
+
+    publishDir "${params.result_dir}/projects", mode: 'copy', overwrite: true
     label 'high_memory'
 
     input:
     val runfolder_name
-    tuple project, path("FastQC/*"), path("FastqScreen/*")
+    tuple val(project), path("FastQC/*"), path("FastqScreen/*"), path("rRNA/rrna_table.tsv")
     path sequencing_metadata
     path assets                     // Staged copy of assets folder
-    path config_dir
+    path config_dir                 // Staged copy of config folder
 
     output:
     tuple path("${project}/*multiqc_report.html"), path("${project}/*_data.zip")
 
     script:
     """
+    # making a separate file to use for plotting in MultiQC since custom content can only have one plot per section
+    # as described here: https://multiqc.info/docs/#introduction-1
+    cp rRNA/rrna_table.tsv rRNA/rrna_plot.tsv
     RUNFOLDER=\$( basename ${runfolder_name} )
     multiqc \\
         --title "Report for project ${project} on runfolder \${RUNFOLDER}" \\
Original file line number	Diff line number	Diff line change
Expand Up		@@ -17,3 +17,4 @@ top_modules:
		- 'custom_content'
		- 'bcl2fastq'
		- 'interop'
Original file line number	Diff line number	Diff line change
Expand Up		@@ -11,4 +11,3 @@ table_columns_visible:

		top_modules:
		- 'custom_content'