diff --git a/conf/hgi.config b/conf/hgi.config new file mode 100644 index 00000000..3a645842 --- /dev/null +++ b/conf/hgi.config @@ -0,0 +1,76 @@ +process { + cache = 'lenient' + shell = ['/bin/bash', '-euo', 'pipefail'] + + errorStrategy = { task.attempt <= 3 ? 'retry' : 'finish' } + maxRetries = 3 + + withName: 'GATK4_MARKDUPLICATES|GATK4_MARKDUPLICATESSPARK' { + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 8.GB * task.attempt, 'memory' ) } + } + + withName: 'GATK4_GENOMICSDBIMPORT' { + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 8.GB * task.attempt, 'memory' ) } + clusterOptions = { "-R \"rusage[tmp=20G]\"" } + ext.args = "--batch-size 50 --reader-threads 1 -ip 500" + } + + withName: 'GATK4_GENOTYPEGVCFS' { + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 8.GB * task.attempt, 'memory' ) } + clusterOptions = { "-R \"rusage[tmp=20G]\"" } + ext.args = { "-G StandardAnnotation -G AS_StandardAnnotation" } + } + + withName:'GATK4_APPLYBQSR|GATK4_APPLYBQSR_SPARK|GATK4_BASERECALIBRATOR|GATK4_BASERECALIBRATOR_SPARK|GATK4_GATHERBQSRREPORTS'{ + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 8.GB * task.attempt, 'memory' ) } + } + + withName:'GATK4_MERGEVCFS'{ + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 8.GB * task.attempt, 'memory' ) } + } + + withName: 'GATK4_HAPLOTYPECALLER' { + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 8.GB * task.attempt, 'memory' ) } + } + + withName: 'MERGE_HAPLOTYPECALLER' { + memory = { check_max( 2.GB * task.attempt, 'memory' ) } + } + + withName: 'TABIX_BGZIPTABIX_INTERVAL_SPLIT' { + memory = { check_max( 256.MB * task.attempt, 'memory' ) } + } + + withName: 'BCFTOOLS_STATS' { + memory = { check_max( 128.MB * task.attempt, 'memory' ) } + } + + withName: 'VCFTOOLS_TSTV_COUNT|VCFTOOLS_TSTV_QUAL|VCFTOOLS_SUMMARY' { + memory = { check_max( 128.MB * task.attempt, 'memory' ) } + } + + 
withName: CUSTOM_DUMPSOFTWAREVERSIONS { + memory = { check_max( 128.MB * task.attempt, 'memory' ) } + } + + withName: 'MULTIQC' { + cpus = { check_max( 2 * task.attempt, 'cpus' ) } + memory = { check_max( 2.GB * task.attempt, 'memory' ) } + } +} + +singularity { + autoMounts = true + enabled = true + cacheDir = '/nfs/hgi/singularityContainers/' +} + +executor { + queueSize = 4000 +} diff --git a/conf/modules/haplotypecaller.config b/conf/modules/haplotypecaller.config index be159c41..99ea65fc 100644 --- a/conf/modules/haplotypecaller.config +++ b/conf/modules/haplotypecaller.config @@ -16,9 +16,9 @@ process { withName: 'GATK4_HAPLOTYPECALLER' { - ext.args = { params.joint_germline ? "-ERC GVCF" : "" } + ext.args = { params.joint_germline ? "-ERC GVCF -G StandardAnnotation -G AS_StandardAnnotation -G StandardHCAnnotation" : "" } ext.prefix = { meta.num_intervals <= 1 ? ( params.joint_germline ? "${meta.id}.haplotypecaller.g" : "${meta.id}.haplotypecaller" ) : ( params.joint_germline ? "${meta.id}.haplotypecaller.${intervals.simpleName}.g" :"${meta.id}.haplotypecaller.${intervals.simpleName}" ) } - ext.when = { params.tools && params.tools.split(',').contains('haplotypecaller') } + ext.when = { params.tools && params.tools.split(',').any{ it.startsWith('haplotypecaller') } } publishDir = [ mode: params.publish_dir_mode, path: { "${params.outdir}/variant_calling/"}, diff --git a/conf/modules/joint_germline.config b/conf/modules/joint_germline.config index 5905c482..cb2a5a4f 100644 --- a/conf/modules/joint_germline.config +++ b/conf/modules/joint_germline.config @@ -51,7 +51,7 @@ process { withName: 'VARIANTRECALIBRATOR_INDEL' { ext.prefix = { "${meta.id}_INDEL" } - ext.args = "-an QD -an MQRankSum -an ReadPosRankSum -an FS -an SOR -an DP -mode INDEL" + ext.args = "-AS -an QD -an MQRankSum -an ReadPosRankSum -an FS -an SOR -an DP -mode INDEL" publishDir = [ enabled: false ] @@ -59,7 +59,7 @@ process { withName: 'VARIANTRECALIBRATOR_SNP' { ext.prefix = { 
"${meta.id}_SNP" } - ext.args = "-an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR -mode SNP" + ext.args = "-AS -an QD -an MQ -an MQRankSum -an ReadPosRankSum -an FS -an SOR -mode SNP" publishDir = [ enabled: false ] diff --git a/modules/local/create_intervals_bed/main.nf b/modules/local/create_intervals_bed/main.nf index 4ade5d8b..87f3696b 100644 --- a/modules/local/create_intervals_bed/main.nf +++ b/modules/local/create_intervals_bed/main.nf @@ -28,11 +28,12 @@ process CREATE_INTERVALS_BED { # no runtime estimate in this row, assume default value t = (\$3 - \$2) / ${params.nucleotides_per_second} } - if (name == "" || (chunk > 600 && (chunk + t) > longest * 1.05)) { + if (name == "" || (chunk > 600 && (chunk + t) > longest * 1.00) || \$1 != chr) { # start a new chunk name = sprintf("%s_%d-%d.bed", \$1, \$2+1, \$3) chunk = 0 longest = 0 + chr = \$1 } if (t > longest) longest = t diff --git a/modules/nf-core/gatk4/applybqsr/main.nf b/modules/nf-core/gatk4/applybqsr/main.nf index e5e6bf99..a52bc3e7 100644 --- a/modules/nf-core/gatk4/applybqsr/main.nf +++ b/modules/nf-core/gatk4/applybqsr/main.nf @@ -33,7 +33,7 @@ process GATK4_APPLYBQSR { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ ApplyBQSR \\ --input $input \\ --output ${prefix}.${input.getExtension()} \\ diff --git a/modules/nf-core/gatk4/applyvqsr/main.nf b/modules/nf-core/gatk4/applyvqsr/main.nf index 8413f2bb..bf6f0f42 100644 --- a/modules/nf-core/gatk4/applyvqsr/main.nf +++ b/modules/nf-core/gatk4/applyvqsr/main.nf @@ -33,7 +33,7 @@ process GATK4_APPLYVQSR { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ ApplyVQSR \\ --variant ${vcf} \\ --output ${prefix}.vcf.gz \\ diff --git 
a/modules/nf-core/gatk4/baserecalibrator/main.nf b/modules/nf-core/gatk4/baserecalibrator/main.nf index 5375289a..77baaff2 100644 --- a/modules/nf-core/gatk4/baserecalibrator/main.nf +++ b/modules/nf-core/gatk4/baserecalibrator/main.nf @@ -35,7 +35,7 @@ process GATK4_BASERECALIBRATOR { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ BaseRecalibrator \\ --input $input \\ --output ${prefix}.table \\ diff --git a/modules/nf-core/gatk4/calculatecontamination/main.nf b/modules/nf-core/gatk4/calculatecontamination/main.nf index 9dd961be..2ce606f4 100644 --- a/modules/nf-core/gatk4/calculatecontamination/main.nf +++ b/modules/nf-core/gatk4/calculatecontamination/main.nf @@ -30,7 +30,7 @@ process GATK4_CALCULATECONTAMINATION { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ CalculateContamination \\ --input $pileup \\ --output ${prefix}.contamination.table \\ diff --git a/modules/nf-core/gatk4/cnnscorevariants/main.nf b/modules/nf-core/gatk4/cnnscorevariants/main.nf index 71efe9b1..e8e52ae4 100644 --- a/modules/nf-core/gatk4/cnnscorevariants/main.nf +++ b/modules/nf-core/gatk4/cnnscorevariants/main.nf @@ -40,7 +40,7 @@ process GATK4_CNNSCOREVARIANTS { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ CNNScoreVariants \\ --variant $vcf \\ --output ${prefix}.cnn.vcf.gz \\ diff --git a/modules/nf-core/gatk4/createsequencedictionary/main.nf b/modules/nf-core/gatk4/createsequencedictionary/main.nf index 3e4efdd9..bdcf9d6c 100644 --- a/modules/nf-core/gatk4/createsequencedictionary/main.nf +++ b/modules/nf-core/gatk4/createsequencedictionary/main.nf @@ 
-27,7 +27,7 @@ process GATK4_CREATESEQUENCEDICTIONARY { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ CreateSequenceDictionary \\ --REFERENCE $fasta \\ --URI $fasta \\ diff --git a/modules/nf-core/gatk4/estimatelibrarycomplexity/main.nf b/modules/nf-core/gatk4/estimatelibrarycomplexity/main.nf index 81fc8351..9a2c201b 100644 --- a/modules/nf-core/gatk4/estimatelibrarycomplexity/main.nf +++ b/modules/nf-core/gatk4/estimatelibrarycomplexity/main.nf @@ -32,7 +32,7 @@ process GATK4_ESTIMATELIBRARYCOMPLEXITY { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ EstimateLibraryComplexity \\ $input_list \\ --OUTPUT ${prefix}.metrics \\ diff --git a/modules/nf-core/gatk4/filtermutectcalls/main.nf b/modules/nf-core/gatk4/filtermutectcalls/main.nf index 623b91ae..0bcc6c83 100644 --- a/modules/nf-core/gatk4/filtermutectcalls/main.nf +++ b/modules/nf-core/gatk4/filtermutectcalls/main.nf @@ -38,7 +38,7 @@ process GATK4_FILTERMUTECTCALLS { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ FilterMutectCalls \\ --variant $vcf \\ --output ${prefix}.vcf.gz \\ diff --git a/modules/nf-core/gatk4/filtervarianttranches/main.nf b/modules/nf-core/gatk4/filtervarianttranches/main.nf index 90cbf5f0..550d1ca8 100644 --- a/modules/nf-core/gatk4/filtervarianttranches/main.nf +++ b/modules/nf-core/gatk4/filtervarianttranches/main.nf @@ -36,7 +36,7 @@ process GATK4_FILTERVARIANTTRANCHES { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData 
-XX:+UseSerialGC" \\ FilterVariantTranches \\ --variant $vcf \\ $resources \\ diff --git a/modules/nf-core/gatk4/gatherbqsrreports/main.nf b/modules/nf-core/gatk4/gatherbqsrreports/main.nf index 3eeca5ad..e1f39ef7 100644 --- a/modules/nf-core/gatk4/gatherbqsrreports/main.nf +++ b/modules/nf-core/gatk4/gatherbqsrreports/main.nf @@ -29,7 +29,7 @@ process GATK4_GATHERBQSRREPORTS { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ GatherBQSRReports \\ $input_list \\ --output ${prefix}.table \\ diff --git a/modules/nf-core/gatk4/gatherpileupsummaries/main.nf b/modules/nf-core/gatk4/gatherpileupsummaries/main.nf index f315e1af..575fa9ee 100644 --- a/modules/nf-core/gatk4/gatherpileupsummaries/main.nf +++ b/modules/nf-core/gatk4/gatherpileupsummaries/main.nf @@ -31,7 +31,7 @@ process GATK4_GATHERPILEUPSUMMARIES { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ GatherPileupSummaries \\ $input_list \\ --O ${prefix}.pileups.table \\ diff --git a/modules/nf-core/gatk4/genomicsdbimport/main.nf b/modules/nf-core/gatk4/genomicsdbimport/main.nf index a8725d3f..aa1afb21 100644 --- a/modules/nf-core/gatk4/genomicsdbimport/main.nf +++ b/modules/nf-core/gatk4/genomicsdbimport/main.nf @@ -1,6 +1,6 @@ process GATK4_GENOMICSDBIMPORT { tag "$meta.id" - label 'process_medium' + //label 'process_medium' conda "bioconda::gatk4=4.4.0.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? @@ -29,19 +29,22 @@ process GATK4_GENOMICSDBIMPORT { // settings for running default create gendb mode input_command = input_map ? 
"--sample-name-map ${vcf[0]}" : vcf.collect(){"--variant $it"}.join(' ') - genomicsdb_command = "--genomicsdb-workspace-path ${prefix}" + //genomicsdb_command = "--genomicsdb-workspace-path ${prefix}" + genomicsdb_command = "--genomicsdb-workspace-path" interval_command = interval_file ? "--intervals ${interval_file}" : "--intervals ${interval_value}" updated_db = "" // settings changed for running get intervals list mode if run_intlist is true if (run_intlist) { - genomicsdb_command = "--genomicsdb-update-workspace-path ${wspace}" + //genomicsdb_command = "--genomicsdb-update-workspace-path ${wspace}" + genomicsdb_command = "--genomicsdb-update-workspace-path" interval_command = "--output-interval-list-to-file ${prefix}.interval_list" } // settings changed for running update gendb mode. input_command same as default, update_db forces module to emit the updated gendb if (run_updatewspace) { - genomicsdb_command = "--genomicsdb-update-workspace-path ${wspace}" + //genomicsdb_command = "--genomicsdb-update-workspace-path ${wspace}" + genomicsdb_command = "--genomicsdb-update-workspace-path" interval_command = '' updated_db = "${wspace}" } @@ -53,14 +56,18 @@ process GATK4_GENOMICSDBIMPORT { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + declare WORKSPACE="\$(TMPDIR="/tmp" mktemp -du)" + trap 'rm -rf "\$WORKSPACE"' EXIT + + gatk --java-options "-Xmx${avail_mem}M -XX:+UseSerialGC -XX:-UsePerfData" \\ GenomicsDBImport \\ $input_command \\ - $genomicsdb_command \\ + $genomicsdb_command \$WORKSPACE \\ $interval_command \\ - --tmp-dir . \\ $args + tar cf "${prefix}" -C "\$WORKSPACE" . 
+ cat <<-END_VERSIONS > versions.yml "${task.process}": gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') diff --git a/modules/nf-core/gatk4/genotypegvcfs/main.nf b/modules/nf-core/gatk4/genotypegvcfs/main.nf index a3e3129f..63cf29a3 100644 --- a/modules/nf-core/gatk4/genotypegvcfs/main.nf +++ b/modules/nf-core/gatk4/genotypegvcfs/main.nf @@ -1,6 +1,6 @@ process GATK4_GENOTYPEGVCFS { tag "$meta.id" - label 'process_high' + //label 'process_high' conda "bioconda::gatk4=4.4.0.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? @@ -37,14 +37,16 @@ process GATK4_GENOTYPEGVCFS { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + declare WORKSPACE="\$(TMPDIR="/tmp" mktemp -d)" + trap 'rm -rf "\$WORKSPACE"' EXIT + tar xf "${gvcf}" -C "\$WORKSPACE" + gatk --java-options "-Xmx${avail_mem}M -XX:+UseSerialGC -XX:-UsePerfData" \\ GenotypeGVCFs \\ - --variant $gvcf_command \\ + --variant gendb://\$WORKSPACE \\ --output ${prefix}.vcf.gz \\ --reference $fasta \\ $interval_command \\ $dbsnp_command \\ - --tmp-dir . 
\\ $args cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/gatk4/getpileupsummaries/main.nf b/modules/nf-core/gatk4/getpileupsummaries/main.nf index f7d0f294..faa5b693 100644 --- a/modules/nf-core/gatk4/getpileupsummaries/main.nf +++ b/modules/nf-core/gatk4/getpileupsummaries/main.nf @@ -35,7 +35,7 @@ process GATK4_GETPILEUPSUMMARIES { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ GetPileupSummaries \\ --input $input \\ --variant $variants \\ diff --git a/modules/nf-core/gatk4/haplotypecaller/main.nf b/modules/nf-core/gatk4/haplotypecaller/main.nf index 9ac87518..bdcda702 100644 --- a/modules/nf-core/gatk4/haplotypecaller/main.nf +++ b/modules/nf-core/gatk4/haplotypecaller/main.nf @@ -39,7 +39,7 @@ process GATK4_HAPLOTYPECALLER { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ HaplotypeCaller \\ --input $input \\ --output ${prefix}.vcf.gz \\ diff --git a/modules/nf-core/gatk4/intervallisttobed/main.nf b/modules/nf-core/gatk4/intervallisttobed/main.nf index 2537f0aa..b43df669 100644 --- a/modules/nf-core/gatk4/intervallisttobed/main.nf +++ b/modules/nf-core/gatk4/intervallisttobed/main.nf @@ -28,7 +28,7 @@ process GATK4_INTERVALLISTTOBED { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ IntervalListToBed \\ --INPUT $intervals \\ --OUTPUT ${prefix}.bed \\ diff --git a/modules/nf-core/gatk4/learnreadorientationmodel/main.nf b/modules/nf-core/gatk4/learnreadorientationmodel/main.nf index 89a6ae77..c1d9b9dc 100644 --- a/modules/nf-core/gatk4/learnreadorientationmodel/main.nf +++ 
b/modules/nf-core/gatk4/learnreadorientationmodel/main.nf @@ -29,7 +29,7 @@ process GATK4_LEARNREADORIENTATIONMODEL { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ LearnReadOrientationModel \\ $input_list \\ --output ${prefix}.tar.gz \\ diff --git a/modules/nf-core/gatk4/markduplicates/main.nf b/modules/nf-core/gatk4/markduplicates/main.nf index e4c01f9a..5b8fcbc1 100644 --- a/modules/nf-core/gatk4/markduplicates/main.nf +++ b/modules/nf-core/gatk4/markduplicates/main.nf @@ -43,7 +43,7 @@ process GATK4_MARKDUPLICATES { // Using samtools and not Markduplicates to compress to CRAM speeds up computation: // https://medium.com/@acarroll.dna/looking-at-trade-offs-in-compression-levels-for-genomics-tools-eec2834e8b94 """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ MarkDuplicates \\ $input_list \\ --OUTPUT ${prefix_bam} \\ diff --git a/modules/nf-core/gatk4/mergemutectstats/main.nf b/modules/nf-core/gatk4/mergemutectstats/main.nf index 269721cb..ec2a9c92 100644 --- a/modules/nf-core/gatk4/mergemutectstats/main.nf +++ b/modules/nf-core/gatk4/mergemutectstats/main.nf @@ -29,7 +29,7 @@ process GATK4_MERGEMUTECTSTATS { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ MergeMutectStats \\ $input_list \\ --output ${prefix}.vcf.gz.stats \\ diff --git a/modules/nf-core/gatk4/mergevcfs/main.nf b/modules/nf-core/gatk4/mergevcfs/main.nf index 29c08e16..76593998 100644 --- a/modules/nf-core/gatk4/mergevcfs/main.nf +++ b/modules/nf-core/gatk4/mergevcfs/main.nf @@ -32,7 +32,7 @@ process GATK4_MERGEVCFS { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M 
-XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ MergeVcfs \\ $input_list \\ --OUTPUT ${prefix}.vcf.gz \\ diff --git a/modules/nf-core/gatk4/mutect2/main.nf b/modules/nf-core/gatk4/mutect2/main.nf index 4e353979..93f1204a 100644 --- a/modules/nf-core/gatk4/mutect2/main.nf +++ b/modules/nf-core/gatk4/mutect2/main.nf @@ -42,7 +42,7 @@ process GATK4_MUTECT2 { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ Mutect2 \\ $inputs \\ --output ${prefix}.vcf.gz \\ diff --git a/modules/nf-core/gatk4/variantrecalibrator/main.nf b/modules/nf-core/gatk4/variantrecalibrator/main.nf index fa262e4a..15fe43a8 100644 --- a/modules/nf-core/gatk4/variantrecalibrator/main.nf +++ b/modules/nf-core/gatk4/variantrecalibrator/main.nf @@ -39,7 +39,7 @@ process GATK4_VARIANTRECALIBRATOR { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ VariantRecalibrator \\ --variant $vcf \\ --output ${prefix}.recal \\ diff --git a/nextflow.config b/nextflow.config index 123a58ae..67310e63 100644 --- a/nextflow.config +++ b/nextflow.config @@ -242,6 +242,8 @@ profiles { executor.memory = 8.GB executor.name = 'local' } + // Custom profile + hgi { includeConfig 'conf/hgi.config' } // Basic test profile for CI test { includeConfig 'conf/test.config' } test_cache { includeConfig 'conf/test/cache.config' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 51850b6f..7581ddd6 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,33 +10,11 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "help_text": "Specify input samplesheet, step and output folder.", "required": 
["step", "outdir"], "properties": { - "input": { - "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "A design file with information about the samples in your experiment. Use this parameter to specify the location of the input files. It has to be a comma-separated file with a header row. See [usage docs](https://nf-co.re/sarek/usage#input).\n\nIf no input file is specified, sarek will attempt to locate one in the `{outdir}` directory. If no input should be supplied, i.e. when --step is supplied or --build_from_index, then set --input false", - "fa_icon": "fas fa-file-csv", - "schema": "assets/schema_input.json", - "type": "string", - "format": "file-path", - "exists": true, - "mimetype": "text/csv", - "pattern": "^\\S+\\.csv$" - }, - "input_restart": { - "type": "string", - "description": "Automatic retrieval for restart", - "format": "file-path", - "exists": true, - "mimetype": "text/csv", - "pattern": "^\\S+\\.csv$", - "hidden": true, - "schema": "assets/schema_input.json" - }, "step": { "type": "string", - "default": "mapping", + "default": "variant_calling", "fa_icon": "fas fa-play", "description": "Starting step", "help_text": "The pipeline starts from this step and then runs through the possible subsequent steps.", @@ -46,16 +24,28 @@ "prepare_recalibration", "recalibrate", "variant_calling", + "joint_calling", "annotate" ] }, + "input": { + "type": "string", + "format": "file-path", + "mimetype": "text/csv", + "pattern": "\\.csv$", + "schema": "assets/schema_input.json", + "description": "Path to comma-separated file containing information about the samples in the experiment.", + "help_text": "A design file with information about the samples in your experiment. Use this parameter to specify the location of the input files. It has to be a comma-separated file with a header row. 
See [usage docs](https://nf-co.re/sarek/usage#input).\n\nIf no input file is specified, sarek will attempt to locate one in the `{outdir}` directory.", + "fa_icon": "fas fa-file-csv" + }, "outdir": { "type": "string", "format": "directory-path", "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open" } - } + }, + "help_text": "" }, "main_options": { "title": "Main options", @@ -79,7 +69,7 @@ "intervals": { "type": "string", "fa_icon": "fas fa-file-alt", - "help_text": "To speed up preprocessing and variant calling processes, the execution is parallelized across a reference chopped into smaller pieces.\n\nParts of preprocessing and variant calling are done by these intervals, the different resulting files are then merged.\nThis can parallelize processes, and push down wall clock time significantly.\n\nWe are aligning to the whole genome, and then run Base Quality Score Recalibration and Variant Calling on the supplied regions.\n\n**Whole Genome Sequencing:**\n\nThe (provided) intervals are chromosomes cut at their centromeres (so each chromosome arm processed separately) also additional unassigned contigs.\n\nWe are ignoring the `hs37d5` contig that contains concatenated decoy sequences.\n\nThe calling intervals can be defined using a .list or a BED file.\nA .list file contains one interval per line in the format `chromosome:start-end` (1-based coordinates).\nA BED file must be a tab-separated text file with one interval per line.\nThere must be at least three columns: chromosome, start, and end (0-based coordinates).\nAdditionally, the score column of the BED file can be used to provide an estimate of how many seconds it will take to call variants on that interval.\nThe fourth column remains unused.\n\n```\n|chr1|10000|207666|NA|47.3|\n```\nThis indicates that variant calling on the interval chr1:10001-207666 takes approximately 47.3 seconds.\n\nThe 
runtime estimate is used in two different ways.\nFirst, when there are multiple consecutive intervals in the file that take little time to compute, they are processed as a single job, thus reducing the number of processes that needs to be spawned.\nSecond, the jobs with largest processing time are started first, which reduces wall-clock time.\nIf no runtime is given, a time of 200000 nucleotides per second is assumed. See `--nucleotides_per_second` on how to customize this.\nActual figures vary from 2 nucleotides/second to 30000 nucleotides/second.\nIf you prefer, you can specify the full path to your reference genome when you run the pipeline:\n\n> **NB** If none provided, will be generated automatically from the FASTA reference\n> **NB** Use --no_intervals to disable automatic generation.\n\n**Targeted Sequencing:**\n\nThe recommended flow for targeted sequencing data is to use the workflow as it is, but also provide a `BED` file containing targets for all steps using the `--intervals` option. In addition, the parameter `--wes` should be set.\nIt is advised to pad the variant calling regions (exons or target) to some extent before submitting to the workflow.\n\nThe procedure is similar to whole genome sequencing, except that only BED file are accepted. See above for formatting description.\nAdding every exon as an interval in case of `WES` can generate >200K processes or jobs, much more forks, and similar number of directories in the Nextflow work directory. These are appropriately grouped together to reduce number of processes run in parallel (see above and `--nucleotides_per_second` for details). 
\nFurthermore, primers and/or baits are not 100% specific, (certainly not for MHC and KIR, etc.), quite likely there going to be reads mapping to multiple locations.\nIf you are certain that the target is unique for your genome (all the reads will certainly map to only one location), and aligning to the whole genome is an overkill, it is actually better to change the reference itself.", + "help_text": "To speed up preprocessing and variant calling processes, the execution is parallelized across a reference chopped into smaller pieces.\n\nParts of preprocessing and variant calling are done by these intervals, the different resulting files are then merged.\nThis can parallelize processes, and push down wall clock time significantly.\n\nWe are aligning to the whole genome, and then run Base Quality Score Recalibration and Variant Calling on the supplied regions.\n\n**NOTE:** Intervals are not split so the largest interval in the interval_list or bed file will be the largest shard to process if that size is greater than the average shard size. 
Therefore, use a file which has lots of small intervals rather than one with a few very large intervals.\n\n**Whole Genome Sequencing:**\n\nThe (provided) intervals are chromosomes cut at their centromeres (so each chromosome arm processed separately) also additional unassigned contigs.\n\nWe are ignoring the `hs37d5` contig that contains concatenated decoy sequences.\n\nThe calling intervals can be defined using a .list or a BED file.\nA .list file contains one interval per line in the format `chromosome:start-end` (1-based coordinates).\nA BED file must be a tab-separated text file with one interval per line.\nThere must be at least three columns: chromosome, start, and end (0-based coordinates).\nAdditionally, the score column of the BED file can be used to provide an estimate of how many seconds it will take to call variants on that interval.\nThe fourth column remains unused.\n\n```\n|chr1|10000|207666|NA|47.3|\n```\nThis indicates that variant calling on the interval chr1:10001-207666 takes approximately 47.3 seconds.\n\nThe runtime estimate is used in two different ways.\nFirst, when there are multiple consecutive intervals in the file that take little time to compute, they are processed as a single job, thus reducing the number of processes that needs to be spawned.\nSecond, the jobs with largest processing time are started first, which reduces wall-clock time.\nIf no runtime is given, a time of 40000 nucleotides per second is assumed. 
See `--nucleotides_per_second` on how to customize this.\nActual figures vary from 2 nucleotides/second to 30000 nucleotides/second.\nIf you prefer, you can specify the full path to your reference genome when you run the pipeline:\n\n> **NB** If none provided, will be generated automatically from the FASTA reference\n> **NB** Use --no_intervals to disable automatic generation.\n\n**Targeted Sequencing:**\n\nThe recommended flow for targeted sequencing data is to use the workflow as it is, but also provide a `BED` file containing targets for all steps using the `--intervals` option. In addition, the parameter `--wes` should be set.\nIt is advised to pad the variant calling regions (exons or target) to some extent before submitting to the workflow.\n\nThe procedure is similar to whole genome sequencing, except that only BED files are accepted. See above for formatting description.\nAdding every exon as an interval in case of `WES` can generate >200K processes or jobs, much more forks, and similar number of directories in the Nextflow work directory. These are appropriately grouped together to reduce number of processes run in parallel (see above and `--nucleotides_per_second` for details). \nFurthermore, primers and/or baits are not 100% specific, (certainly not for MHC and KIR, etc.), quite likely there are going to be reads mapping to multiple locations.\nIf you are certain that the target is unique for your genome (all the reads will certainly map to only one location), and aligning to the whole genome is an overkill, it is actually better to change the reference itself.", "description": "Path to target bed file in case of whole exome or targeted sequencing or intervals file." }, "nucleotides_per_second": { @@ -87,8 +77,8 @@ "fa_icon": "fas fa-clock", "description": "Estimate interval size.", "help_text": "Intervals are parts of the chopped up genome used to speed up preprocessing and variant calling. See `--intervals` for more info. 
\n\nChanging this parameter, changes the number of intervals that are grouped and processed together. Bed files from target sequencing can contain thousands or small intervals. Spinning up a new process for each can be quite resource intensive. Instead it can be desired to process small intervals together on larger nodes. \nIn order to make use of this parameter, no runtime estimate can be present in the bed file (column 5). ", - "default": 200000 - }, + "default": 40000 + }, "no_intervals": { "type": "boolean", "fa_icon": "fas fa-ban", @@ -98,16 +88,16 @@ "tools": { "type": "string", "fa_icon": "fas fa-toolbox", - "description": "Tools to use for duplicate marking, variant calling and/or for annotation.", - "help_text": "Multiple tools separated with commas.\n\n**Variant Calling:**\n\nGermline variant calling can currently be performed with the following variant callers:\n- SNPs/Indels: DeepVariant, FreeBayes, GATK HaplotypeCaller, mpileup, Sentieon Haplotyper, Strelka\n- Structural Variants: Manta, TIDDIT\n- Copy-number: CNVKit\n\nTumor-only somatic variant calling can currently be performed with the following variant callers:\n- SNPs/Indels: FreeBayes, mpileup, Mutect2, Strelka\n- Structural Variants: Manta, TIDDIT\n- Copy-number: CNVKit, ControlFREEC\n\nSomatic variant calling can currently only be performed with the following variant callers:\n- SNPs/Indels: FreeBayes, Mutect2, Strelka2\n- Structural variants: Manta, TIDDIT\n- Copy-Number: ASCAT, CNVKit, Control-FREEC\n- Microsatellite Instability: MSIsensorpro\n\n> **NB** Mutect2 for somatic variant calling cannot be combined with `--no_intervals`\n\n**Annotation:**\n \n- snpEff, VEP, merge (both consecutively).\n\n> **NB** As Sarek will use bgzip and tabix to compress and index VCF files annotated, it expects VCF files to be sorted when starting from `--step annotate`.", - "pattern": 
"^((ascat|cnvkit|controlfreec|deepvariant|freebayes|haplotypecaller|sentieon_haplotyper|manta|merge|mpileup|msisensorpro|mutect2|sentieon_dedup|snpeff|strelka|tiddit|vep)?,?)*(? **NB** Mutect2 for somatic variant calling cannot be combined with `--no_intervals`\n\n**Annotation:**\n \n- snpEff, VEP, merge (both consecutively).\n\n> **NB** As Sarek will use bgzip and tabix to compress and index VCF files annotated, it expects VCF files to be sorted when starting from `--step annotate`.", + "pattern": "^((ascat|cnvkit|controlfreec|deepvariant|freebayes|haplotypecaller|manta|merge|mpileup|msisensorpro|mutect2|snpeff|strelka|tiddit|vep)?,?)*[^,]+$" }, "skip_tools": { "type": "string", "fa_icon": "fas fa-forward", "description": "Disable specified tools.", "help_text": "Multiple tools can be specified, separated by commas.\n\n> **NB** `--skip_tools baserecalibrator_report` is actually just not saving the reports.\n> **NB** `--skip_tools markduplicates_report` does not skip `MarkDuplicates` but prevent the collection of duplicate metrics that slows down performance.", - "pattern": "^((baserecalibrator|baserecalibrator_report|bcftools|documentation|fastqc|haplotypecaller_filter|haplotyper_filter|markduplicates|markduplicates_report|mosdepth|multiqc|samtools|vcftools|versions)?,?)*(? The GATK4 Base Quality Score recalibration tools `Baserecalibrator` and `ApplyBQSR` are currently available as Beta release. Use with caution!", - "pattern": "^((baserecalibrator|markduplicates)?,?)*(? **NB** If none provided, will be generated automatically from the FASTA reference. 
Combine with `--save_reference` to save for future runs.", - "hidden": true - }, "dragmap": { "type": "string", "fa_icon": "fas fa-copy", @@ -644,9 +584,8 @@ "fasta": { "type": "string", "format": "file-path", - "exists": true, "mimetype": "text/plain", - "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", + "pattern": "\\.fn?a(sta)?(\\.gz)?$", "description": "Path to FASTA genome file.", "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\nThis parameter is *mandatory* if `--genome` is not specified.", "fa_icon": "far fa-file-code" @@ -657,6 +596,12 @@ "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\n> **NB** If none provided, will be generated automatically from the FASTA reference. Combine with `--save_reference` to save for future runs.", "description": "Path to FASTA reference index." }, + "dict": { + "type": "string", + "fa_icon": "fas fa-file", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\n> **NB** If none provided, will be generated automatically from the FASTA reference. Combine with `--save_reference` to save for future runs.", + "description": "Path to FASTA reference dictionary." + }, "germline_resource": { "type": "string", "fa_icon": "fas fa-file", @@ -717,31 +662,50 @@ "type": "string", "fa_icon": "fas fa-database", "description": "snpEff DB version.", - "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the database to be use to annotate with.\nAlternatively databases' names can be listed with the `snpEff databases`." 
+ "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the database to be use to annotate with.\nAlternatively databases' names can be listed with the `snpEff databases`.", + "hidden": true }, "snpeff_genome": { "type": "string", "fa_icon": "fas fa-microscope", "description": "snpEff genome.", - "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the genome when using the container with pre-downloaded cache." + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the genome when using the container with pre-downloaded cache.", + "hidden": true + }, + "snpeff_version": { + "type": "string", + "fa_icon": "fas fa-tag", + "description": "snpEff version.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the snpeff version when using the container with pre-downloaded cache.", + "hidden": true }, "vep_genome": { "type": "string", "fa_icon": "fas fa-microscope", "description": "VEP genome.", - "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the genome when using the container with pre-downloaded cache." + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the genome when using the container with pre-downloaded cache.", + "hidden": true }, "vep_species": { "type": "string", "fa_icon": "fas fa-microscope", "description": "VEP species.", - "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nAlternatively species listed in Ensembl Genomes caches can be used." 
+ "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nAlternatively species listed in Ensembl Genomes caches can be used.", + "hidden": true }, "vep_cache_version": { "type": "number", "fa_icon": "fas fa-tag", "description": "VEP cache version.", - "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nAlternatively cache version can be use to specify the correct Ensembl Genomes version number as these differ from the concurrent Ensembl/VEP version numbers" + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nAlternatively cache version can be use to specify the correct Ensembl Genomes version number as these differ from the concurrent Ensembl/VEP version numbers", + "hidden": true + }, + "vep_version": { + "type": "string", + "fa_icon": "fas fa-tag", + "description": "VEP version.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the VEP version when using the container with pre-downloaded cache.", + "hidden": true }, "save_reference": { "type": "boolean", @@ -749,18 +713,6 @@ "description": "Save built references.", "help_text": "Set this parameter, if you wish to save all computed reference files. This is useful to avoid re-computation on future runs." }, - "build_only_index": { - "type": "boolean", - "fa_icon": "fas fa-download", - "description": "Only built references.", - "help_text": "Set this parameter, if you wish to compute and save all computed reference files. No alignment or any other downstream steps will be performed." - }, - "download_cache": { - "type": "boolean", - "fa_icon": "fas fa-download", - "description": "Download annotation cache.", - "help_text": "Set this parameter, if you wish to download annotation cache." 
- }, "igenomes_base": { "type": "string", "format": "directory-path", @@ -823,13 +775,6 @@ "hidden": true, "fa_icon": "fas fa-users-cog" }, - "test_data_base": { - "type": "string", - "default": "https://raw.githubusercontent.com/nf-core/test-datasets/sarek3", - "description": "Base path / URL for data used in the test profiles", - "help_text": "Warning: The `-profile test` samplesheet file itself contains remote paths. Setting this parameter does not alter the contents of that file.", - "hidden": true - }, "seq_center": { "type": "string", "fa_icon": "fas fa-university", @@ -875,7 +820,7 @@ "description": "Maximum amount of time that can be requested for any single job.", "default": "240.h", "fa_icon": "far fa-clock", - "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$", + "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", "hidden": true, "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`." } @@ -894,12 +839,6 @@ "fa_icon": "fas fa-question-circle", "hidden": true }, - "version": { - "type": "boolean", - "description": "Display version and exit.", - "fa_icon": "fas fa-question-circle", - "hidden": true - }, "publish_dir_mode": { "type": "string", "default": "copy", @@ -951,22 +890,17 @@ }, "multiqc_config": { "type": "string", - "format": "file-path", "description": "Custom config file to supply to MultiQC.", "fa_icon": "fas fa-cog", "hidden": true }, - "multiqc_logo": { + "tracedir": { "type": "string", - "description": "Custom logo file to supply to MultiQC. 
File name must also be set in the MultiQC config file", - "fa_icon": "fas fa-image", + "description": "Directory to keep pipeline Nextflow logs and reports.", + "default": "${params.outdir}/pipeline_info", + "fa_icon": "fas fa-cogs", "hidden": true }, - "multiqc_methods_description": { - "type": "string", - "description": "Custom MultiQC yaml file containing HTML including a methods description.", - "fa_icon": "fas fa-cog" - }, "validate_params": { "type": "boolean", "description": "Boolean whether to validate parameters against the schema at runtime", @@ -974,33 +908,18 @@ "fa_icon": "fas fa-check-square", "hidden": true }, - "validationShowHiddenParams": { + "show_hidden_params": { "type": "boolean", "fa_icon": "far fa-eye-slash", "description": "Show all params when using `--help`", "hidden": true, "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." }, - "validationFailUnrecognisedParams": { + "enable_conda": { "type": "boolean", - "fa_icon": "far fa-check-circle", - "description": "Validation of parameters fails when an unrecognised parameter is found.", + "description": "Run this workflow with Conda. You can also use '-profile conda' instead of providing this parameter.", "hidden": true, - "help_text": "By default, when an unrecognised parameter is found, it returns a warinig." - }, - "validationLenientMode": { - "type": "boolean", - "fa_icon": "far fa-check-circle", - "description": "Validation of parameters in lenient more.", - "hidden": true, - "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)." 
- }, - "hook_url": { - "type": "string", - "description": "Incoming hook URL for messaging service", - "fa_icon": "fas fa-people-group", - "help_text": "Incoming hook URL for messaging service. Currently, MS Teams and Slack are supported.", - "hidden": true + "fa_icon": "fas fa-bacon" } } } diff --git a/subworkflows/local/bam_variant_calling_germline_all/main.nf b/subworkflows/local/bam_variant_calling_germline_all/main.nf index 666c7c7b..e599cfee 100644 --- a/subworkflows/local/bam_variant_calling_germline_all/main.nf +++ b/subworkflows/local/bam_variant_calling_germline_all/main.nf @@ -167,6 +167,58 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { } } + // HAPLOTYPECALLER [variant calling step only] + if (tools.split(',').contains('haplotypecaller_vc')) { + if (! joint_germline) { + error("params.joint_germline must be true when using haplotypecaller_vc") + } + + BAM_VARIANT_CALLING_HAPLOTYPECALLER( + cram, + fasta, + fasta_fai, + dict, + dbsnp, + dbsnp_tbi, + dbsnp_vqsr, + intervals + ) + + vcf_haplotypecaller = BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.vcf + versions = versions.mix(BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.versions) + } + + // HAPLOTYPECALLER [joint calling step only] + if (tools.split(',').contains('haplotypecaller_jc')) { + if (! 
joint_germline) { + error("params.joint_germline must be true when using haplotypecaller_jc") + } + + gvcf_tbi_intervals = cram + .map{ meta, gvcf -> [ meta, gvcf, gvcf + '.tbi' ] } + .combine(intervals) + .map{ meta, gvcf, tbi, intervals, num_intervals -> [ meta + [ interval_name:intervals.simpleName, num_intervals:num_intervals ], gvcf, tbi, intervals ] } + + BAM_JOINT_CALLING_GERMLINE_GATK( + gvcf_tbi_intervals, + fasta, + fasta_fai, + dict, + dbsnp, + dbsnp_tbi, + dbsnp_vqsr, + known_sites_indels, + known_sites_indels_tbi, + known_indels_vqsr, + known_sites_snps, + known_sites_snps_tbi, + known_snps_vqsr + ) + + vcf_haplotypecaller = BAM_JOINT_CALLING_GERMLINE_GATK.out.genotype_vcf + versions = versions.mix(BAM_JOINT_CALLING_GERMLINE_GATK.out.versions) + } + // MANTA if (tools.split(',').contains('manta')) { BAM_VARIANT_CALLING_GERMLINE_MANTA ( diff --git a/workflows/sarek.nf b/workflows/sarek.nf index 5ae80fba..88b31d9b 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -165,7 +165,8 @@ input_sample = ch_from_samplesheet } else if (vcf) { meta = meta + [id: meta.sample, data_type: 'vcf', variantcaller: variantcaller ?: ''] - if (params.step == 'annotate') return [ meta - meta.subMap('lane'), vcf ] + if (params.step == 'joint_calling') return [ meta - meta.subMap('lane'), vcf ] + else if (params.step == 'annotate') return [ meta - meta.subMap('lane'), vcf ] else { error("Samplesheet contains vcf files but step is `$params.step`. Please check your samplesheet or adjust the step parameter.\nhttps://nf-co.re/sarek/usage#input-samplesheet-configurations") } @@ -259,7 +260,7 @@ if (!params.dbsnp && !params.known_indels) { log.warn "If GATK's Haplotypecaller or Sentieon's Haplotyper is specified, without `--dbsnp` or `--known_indels no filtering will be done. 
For filtering, please provide at least one of `--dbsnp` or `--known_indels`.\nFor more information see FilterVariantTranches (single-sample, default): https://gatk.broadinstitute.org/hc/en-us/articles/5358928898971-FilterVariantTranches\nFor more information see VariantRecalibration (--joint_germline): https://gatk.broadinstitute.org/hc/en-us/articles/5358906115227-VariantRecalibrator\nFor more information on GATK Best practice germline variant calling: https://gatk.broadinstitute.org/hc/en-us/articles/360035535932-Germline-short-variant-discovery-SNPs-Indels-" } } -if (params.joint_germline && (!params.tools || !(params.tools.split(',').contains('haplotypecaller') || params.tools.split(',').contains('sentieon_haplotyper')))) { +if (params.joint_germline && (!params.tools || !(params.tools.split(',').any{ it.startsWith('haplotypecaller') } || params.tools.split(',').contains('sentieon_haplotyper')))) { error("The GATK's Haplotypecaller or Sentieon's Haplotyper should be specified as one of the tools when doing joint germline variant calling.) ") } @@ -1077,6 +1078,7 @@ workflow SAREK { if (params.tools) { if (params.step == 'annotate') cram_variant_calling = Channel.empty() + if (params.step == 'joint_calling') cram_variant_calling = Channel.empty() // // Logic to separate germline samples, tumor samples with no matched normal, and combine tumor-normal pairs @@ -1139,6 +1141,8 @@ workflow SAREK { [ meta, normal[2], normal[3], tumor[2], tumor[3] ] } + if (params.step == 'joint_calling') cram_variant_calling_status_normal = input_sample + // GERMLINE VARIANT CALLING BAM_VARIANT_CALLING_GERMLINE_ALL( params.tools,