diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c66c4c23c8..ccdc590cb6 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -123,6 +123,24 @@ jobs:
       - name: Disk space cleanup
         uses: jlumbroso/free-disk-space@v1.3.1
 
+      - name: create references
+        id: references
+        run: |
+          mkdir tmp/
+          nextflow run nf-core/references -profile ${{ matrix.profile }} \
+            -r 8112ae8 \
+            --input https://raw.githubusercontent.com/nf-core/references-assets/41545a3631addaf491d22751b17607149b8512ac/assets/test/sarek/GRCh38_chr22.yml \
+            --tools bwamem1,createsequencedictionary,faidx,intervals,tabix \
+            --outdir ../references/
+          cd ..
+          rm -rf tmp/
+
+      - name: use references
+        id: references-use
+        run: |
+          nextflow run . -profile test,${{ matrix.profile }} --outdir results \
+            -params-file https://raw.githubusercontent.com/nf-core/references-assets/41545a3631addaf491d22751b17607149b8512ac/assets/test/sarek/GRCh38_chr22_params.yml
+
       - name: Start summary
         id: print-test
         run: |
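The two new CI steps replace Sarek's in-pipeline reference building: `create references` runs nf-core/references (pinned to revision 8112ae8) on a GRCh38 chr22 test manifest, and `use references` then launches Sarek against the pre-built output through `-params-file`. A minimal sketch of the contract between the two steps, written in `nextflow.config` style; the keys and paths below are illustrative assumptions, not the literal contents of `GRCh38_chr22_params.yml`:

```nextflow
// Illustrative sketch only: each key points at a file produced by the
// `create references` step (--outdir ../references/). The real mapping
// lives in GRCh38_chr22_params.yml, fetched by -params-file above.
params {
    fasta     = 'references/genome/genome.fasta'     // assumed layout
    fasta_fai = 'references/genome/genome.fasta.fai' // assumed layout
    dict      = 'references/genome/genome.dict'      // assumed layout
    bwa       = 'references/genome/bwamem1/'         // assumed layout
}
```

Pinning both the pipeline revision (`-r 8112ae8`) and the asset commit keeps this CI job reproducible even as nf-core/references evolves.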
diff --git a/main.nf b/main.nf
index effa97ef16..968b5fd176 100755
--- a/main.nf
+++ b/main.nf
@@ -73,7 +73,6 @@ include { ANNOTATION_CACHE_INITIALISATION } from './subworkflows/local/annotati
 include { DOWNLOAD_CACHE_SNPEFF_VEP       } from './subworkflows/local/download_cache_snpeff_vep'
 include { PIPELINE_COMPLETION             } from './subworkflows/local/utils_nfcore_sarek_pipeline'
 include { PIPELINE_INITIALISATION         } from './subworkflows/local/utils_nfcore_sarek_pipeline'
-include { PREPARE_GENOME                  } from './subworkflows/local/prepare_genome'
 include { PREPARE_INTERVALS               } from './subworkflows/local/prepare_intervals'
 include { PREPARE_REFERENCE_CNVKIT        } from './subworkflows/local/prepare_reference_cnvkit'
 
@@ -92,6 +91,12 @@ known_snps = params.known_snps ? Channel.fromPath(para
 mappability             = params.mappability             ? Channel.fromPath(params.mappability).collect()             : Channel.value([])
 pon                     = params.pon                     ? Channel.fromPath(params.pon).collect()                     : Channel.value([]) // PON is optional for Mutect2 (but highly recommended)
 sentieon_dnascope_model = params.sentieon_dnascope_model ? Channel.fromPath(params.sentieon_dnascope_model).collect() : Channel.value([])
+msisensorpro_scan       = params.msisensorpro_scan       ? Channel.fromPath(params.msisensorpro_scan).collect()       : Channel.empty()
+allele_files            = params.allele_files            ? Channel.fromPath(params.allele_files).collect()            : Channel.empty()
+chr_files               = params.chr_files               ? Channel.fromPath(params.chr_files).collect()               : Channel.empty()
+gc_file                 = params.gc_file                 ? Channel.fromPath(params.gc_file).collect()                 : Channel.empty()
+loci_files              = params.loci_files              ? Channel.fromPath(params.loci_files).collect()              : Channel.empty()
+rt_file                 = params.rt_file                 ? Channel.fromPath(params.rt_file).collect()                 : Channel.empty()
 
 // Initialize value channels based on params, defined in the params.genomes[params.genome] scope
 ascat_genome = params.ascat_genome ?: Channel.empty()
@@ -104,6 +109,7 @@ vep_cache_version = params.vep_cache_version ?: Channel.empty()
 vep_genome        = params.vep_genome        ?: Channel.empty()
 vep_species       = params.vep_species       ?: Channel.empty()
 
+
 vep_extra_files = []
 
 if (params.dbnsfp && params.dbnsfp_tbi) {
@@ -132,56 +138,26 @@ workflow NFCORE_SAREK {
     main:
     versions = Channel.empty()
 
-    // build indexes if needed
-    PREPARE_GENOME(
-        params.ascat_alleles,
-        params.ascat_loci,
-        params.ascat_loci_gc,
-        params.ascat_loci_rt,
-        bcftools_annotations,
-        params.chr_dir,
-        dbsnp,
-        fasta,
-        germline_resource,
-        known_indels,
-        known_snps,
-        pon)
-
     // Gather built indices or get them from the params
     // Built from the fasta file:
-    dict      = params.dict      ? Channel.fromPath(params.dict).map{ it -> [ [id:'dict'], it ] }.collect()
-                                 : PREPARE_GENOME.out.dict
-    fasta_fai = params.fasta_fai ? Channel.fromPath(params.fasta_fai).map{ it -> [ [id:'fai'], it ] }.collect()
-                                 : PREPARE_GENOME.out.fasta_fai
-    bwa       = params.bwa       ? Channel.fromPath(params.bwa).map{ it -> [ [id:'bwa'], it ] }.collect()
-                                 : PREPARE_GENOME.out.bwa
-    bwamem2   = params.bwamem2   ? Channel.fromPath(params.bwamem2).map{ it -> [ [id:'bwamem2'], it ] }.collect()
-                                 : PREPARE_GENOME.out.bwamem2
-    dragmap   = params.dragmap   ? Channel.fromPath(params.dragmap).map{ it -> [ [id:'dragmap'], it ] }.collect()
-                                 : PREPARE_GENOME.out.hashtable
+    dict      = params.dict      ? Channel.fromPath(params.dict).map{ it -> [ [id:'dict'], it ] }.collect()       : Channel.value([])
+    fasta_fai = params.fasta_fai ? Channel.fromPath(params.fasta_fai).map{ it -> [ [id:'fai'], it ] }.collect()   : Channel.value([])
+    bwa       = params.bwa       ? Channel.fromPath(params.bwa).map{ it -> [ [id:'bwa'], it ] }.collect()         : Channel.empty()
+    bwamem2   = params.bwamem2   ? Channel.fromPath(params.bwamem2).map{ it -> [ [id:'bwamem2'], it ] }.collect() : Channel.empty()
+    dragmap   = params.dragmap   ? Channel.fromPath(params.dragmap).map{ it -> [ [id:'dragmap'], it ] }.collect() : Channel.empty()
 
     // Gather index for mapping given the chosen aligner
     index_alignment = (aligner == "bwa-mem" || aligner == "sentieon-bwamem") ? bwa :
         aligner == "bwa-mem2" ? bwamem2 :
         dragmap
 
-    // TODO: add a params for msisensorpro_scan
-    msisensorpro_scan = PREPARE_GENOME.out.msisensorpro_scan
-
-    // For ASCAT, extracted from zip or tar.gz files
-    allele_files = PREPARE_GENOME.out.allele_files
-    chr_files    = PREPARE_GENOME.out.chr_files
-    gc_file      = PREPARE_GENOME.out.gc_file
-    loci_files   = PREPARE_GENOME.out.loci_files
-    rt_file      = PREPARE_GENOME.out.rt_file
-
     // Tabix indexed vcf files
-    bcftools_annotations_tbi = params.bcftools_annotations ? params.bcftools_annotations_tbi ? Channel.fromPath(params.bcftools_annotations_tbi).collect() : PREPARE_GENOME.out.bcftools_annotations_tbi : Channel.value([])
-    dbsnp_tbi                = params.dbsnp ? params.dbsnp_tbi ? Channel.fromPath(params.dbsnp_tbi).collect() : PREPARE_GENOME.out.dbsnp_tbi : Channel.value([])
-    germline_resource_tbi    = params.germline_resource ? params.germline_resource_tbi ? Channel.fromPath(params.germline_resource_tbi).collect() : PREPARE_GENOME.out.germline_resource_tbi : [] //do not change to Channel.value([]), the check for its existence then fails for Getpileupsumamries
-    known_indels_tbi         = params.known_indels ? params.known_indels_tbi ? Channel.fromPath(params.known_indels_tbi).collect() : PREPARE_GENOME.out.known_indels_tbi : Channel.value([])
-    known_snps_tbi           = params.known_snps ? params.known_snps_tbi ? Channel.fromPath(params.known_snps_tbi).collect() : PREPARE_GENOME.out.known_snps_tbi : Channel.value([])
-    pon_tbi                  = params.pon ? params.pon_tbi ? Channel.fromPath(params.pon_tbi).collect() : PREPARE_GENOME.out.pon_tbi : Channel.value([])
+    bcftools_annotations_tbi = params.bcftools_annotations && params.bcftools_annotations_tbi ? Channel.fromPath(params.bcftools_annotations_tbi).collect() : Channel.value([])
+    dbsnp_tbi                = params.dbsnp && params.dbsnp_tbi ? Channel.fromPath(params.dbsnp_tbi).collect() : Channel.value([])
+    germline_resource_tbi    = params.germline_resource && params.germline_resource_tbi ? Channel.fromPath(params.germline_resource_tbi).collect() : [] // do not change to Channel.value([]), the check for its existence then fails for GetPileupSummaries
+    known_indels_tbi         = params.known_indels && params.known_indels_tbi ? Channel.fromPath(params.known_indels_tbi).collect() : Channel.value([])
+    known_snps_tbi           = params.known_snps && params.known_snps_tbi ? Channel.fromPath(params.known_snps_tbi).collect() : Channel.value([])
+    pon_tbi                  = params.pon && params.pon_tbi ? Channel.fromPath(params.pon_tbi).collect() : Channel.value([])
 
     // known_sites is made by grouping both the dbsnp and the known snps/indels resources
     // Which can either or both be optional
@@ -224,8 +200,8 @@ workflow NFCORE_SAREK {
     } else {
         cnvkit_reference = Channel.value([])
     }
+
     // Gather used softwares versions
-    versions = versions.mix(PREPARE_GENOME.out.versions)
     versions = versions.mix(PREPARE_INTERVALS.out.versions)
 
     vep_fasta = (params.vep_include_fasta) ? fasta.map{ fasta -> [ [ id:fasta.baseName ], fasta ] } : [[id: 'null'], []]
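The main.nf changes swap every `PREPARE_GENOME.out.*` fallback for either `Channel.value([])` or `Channel.empty()`, and the choice between the two is behavioural, not cosmetic (as the `germline_resource_tbi` comment hints). A minimal standalone sketch of the difference:

```nextflow
// Channel.value([]) is a value channel emitting a single empty list, so a
// process that takes it as an optional input still runs; Channel.empty()
// emits nothing, so anything wired only to it is silently skipped.
workflow {
    present = Channel.value([]) // emits []      -> downstream runs once
    absent  = Channel.empty()   // emits nothing -> downstream never runs

    present.view { v -> "ran with optional input: ${v}" }
    absent.view { v -> "never printed" }
}
```

Hence the aligner indices, which only matter for the chosen aligner, can become `Channel.empty()`, while `dict` and `fasta_fai` stay `Channel.value([])` so downstream processes still receive an explicit, possibly empty, optional input.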
diff --git a/subworkflows/local/prepare_genome/main.nf b/subworkflows/local/prepare_genome/main.nf
deleted file mode 100644
index 772af47b37..0000000000
--- a/subworkflows/local/prepare_genome/main.nf
+++ /dev/null
@@ -1,138 +0,0 @@
-//
-// PREPARE GENOME
-//
-
-// Initialize channels based on params or indices that were just built
-// For all modules here:
-// A when clause condition is defined in the conf/modules.config to determine if the module should be run
-// Condition is based on params.step and params.tools
-// If and extra condition exists, it's specified in comments
-
-include { BWA_INDEX as BWAMEM1_INDEX                } from '../../../modules/nf-core/bwa/index/main'
-include { BWAMEM2_INDEX                             } from '../../../modules/nf-core/bwamem2/index/main'
-include { DRAGMAP_HASHTABLE                         } from '../../../modules/nf-core/dragmap/hashtable/main'
-include { GATK4_CREATESEQUENCEDICTIONARY            } from '../../../modules/nf-core/gatk4/createsequencedictionary/main'
-include { MSISENSORPRO_SCAN                         } from '../../../modules/nf-core/msisensorpro/scan/main'
-include { SAMTOOLS_FAIDX                            } from '../../../modules/nf-core/samtools/faidx/main'
-include { TABIX_TABIX as TABIX_BCFTOOLS_ANNOTATIONS } from '../../../modules/nf-core/tabix/tabix/main'
-include { TABIX_TABIX as TABIX_DBSNP                } from '../../../modules/nf-core/tabix/tabix/main'
-include { TABIX_TABIX as TABIX_GERMLINE_RESOURCE    } from '../../../modules/nf-core/tabix/tabix/main'
-include { TABIX_TABIX as TABIX_KNOWN_INDELS         } from '../../../modules/nf-core/tabix/tabix/main'
-include { TABIX_TABIX as TABIX_KNOWN_SNPS           } from '../../../modules/nf-core/tabix/tabix/main'
-include { TABIX_TABIX as TABIX_PON                  } from '../../../modules/nf-core/tabix/tabix/main'
-include { UNTAR as UNTAR_CHR_DIR                    } from '../../../modules/nf-core/untar/main'
-include { UNZIP as UNZIP_ALLELES                    } from '../../../modules/nf-core/unzip/main'
-include { UNZIP as UNZIP_GC                         } from '../../../modules/nf-core/unzip/main'
-include { UNZIP as UNZIP_LOCI                       } from '../../../modules/nf-core/unzip/main'
-include { UNZIP as UNZIP_RT                         } from '../../../modules/nf-core/unzip/main'
-
-workflow PREPARE_GENOME {
-    take:
-    ascat_alleles        // params.ascat_alleles
-    ascat_loci           // params.ascat_loci
-    ascat_loci_gc        // params.ascat_loci_gc
-    ascat_loci_rt        // params.ascat_loci_rt
-    bcftools_annotations // channel: [optional] bcftools annotations file
-    chr_dir              // params.chr_dir
-    dbsnp                // channel: [optional] dbsnp
-    fasta                // channel: [mandatory] fasta
-    germline_resource    // channel: [optional] germline_resource
-    known_indels         // channel: [optional] known_indels
-    known_snps           // channel: [optional] known_snps
-    pon                  // channel: [optional] pon
-
-
-    main:
-    versions = Channel.empty()
-
-    BWAMEM1_INDEX(fasta)     // If aligner is bwa-mem
-    BWAMEM2_INDEX(fasta)     // If aligner is bwa-mem2
-    DRAGMAP_HASHTABLE(fasta) // If aligner is dragmap
-
-    GATK4_CREATESEQUENCEDICTIONARY(fasta)
-    MSISENSORPRO_SCAN(fasta)
-    SAMTOOLS_FAIDX(fasta, [ [ id:'no_fai' ], [] ] )
-
-    // the following are flattened and mapped in case the user supplies more than one value for the param
-    // written for KNOWN_INDELS, but preemptively applied to the rest
-    // [ file1, file2 ] becomes [ [ meta1, file1 ], [ meta2, file2 ] ]
-    // outputs are collected to maintain a single channel for relevant TBI files
-    TABIX_BCFTOOLS_ANNOTATIONS(bcftools_annotations.flatten().map{ it -> [ [ id:it.baseName ], it ] })
-    TABIX_DBSNP(dbsnp.flatten().map{ it -> [ [ id:it.baseName ], it ] })
-    TABIX_GERMLINE_RESOURCE(germline_resource.flatten().map{ it -> [ [ id:it.baseName ], it ] })
-    TABIX_KNOWN_SNPS(known_snps.flatten().map{ it -> [ [ id:it.baseName ], it ] } )
-    TABIX_KNOWN_INDELS(known_indels.flatten().map{ it -> [ [ id:it.baseName ], it ] } )
-    TABIX_PON(pon.flatten().map{ it -> [ [ id:it.baseName ], it ] })
-
-    // prepare ascat and controlfreec reference files
-    if (!ascat_alleles) allele_files = Channel.empty()
-    else if (ascat_alleles.endsWith(".zip")) {
-        UNZIP_ALLELES(Channel.fromPath(file(ascat_alleles)).collect().map{ it -> [ [ id:it[0].baseName ], it ] })
-        allele_files = UNZIP_ALLELES.out.unzipped_archive.map{ it[1] }
-        versions = versions.mix(UNZIP_ALLELES.out.versions)
-    } else allele_files = Channel.fromPath(ascat_alleles).collect()
-
-    if (!ascat_loci) loci_files = Channel.empty()
-    else if (ascat_loci.endsWith(".zip")) {
-        UNZIP_LOCI(Channel.fromPath(file(ascat_loci)).collect().map{ it -> [ [ id:it[0].baseName ], it ] })
-        loci_files = UNZIP_LOCI.out.unzipped_archive.map{ it[1] }
-        versions = versions.mix(UNZIP_LOCI.out.versions)
-    } else loci_files = Channel.fromPath(ascat_loci).collect()
-
-    if (!ascat_loci_gc) gc_file = Channel.value([])
-    else if (ascat_loci_gc.endsWith(".zip")) {
-        UNZIP_GC(Channel.fromPath(file(ascat_loci_gc)).collect().map{ it -> [ [ id:it[0].baseName ], it ] })
-        gc_file = UNZIP_GC.out.unzipped_archive.map{ it[1] }
-        versions = versions.mix(UNZIP_GC.out.versions)
-    } else gc_file = Channel.fromPath(ascat_loci_gc).collect()
-
-    if (!ascat_loci_rt) rt_file = Channel.value([])
-    else if (ascat_loci_rt.endsWith(".zip")) {
-        UNZIP_RT(Channel.fromPath(file(ascat_loci_rt)).collect().map{ it -> [ [ id:it[0].baseName ], it ] })
-        rt_file = UNZIP_RT.out.unzipped_archive.map{ it[1] }
-        versions = versions.mix(UNZIP_RT.out.versions)
-    } else rt_file = Channel.fromPath(ascat_loci_rt).collect()
-
-    if (!chr_dir) chr_files = Channel.value([])
-    else if (chr_dir.endsWith(".tar.gz")) {
-        UNTAR_CHR_DIR(Channel.fromPath(file(chr_dir)).collect().map{ it -> [ [ id:it[0].baseName ], it ] })
-        chr_files = UNTAR_CHR_DIR.out.untar.map{ it[1] }
-        versions = versions.mix(UNTAR_CHR_DIR.out.versions)
-    } else chr_files = Channel.fromPath(chr_dir).collect()
-
-    // Gather versions of all tools used
-    versions = versions.mix(BWAMEM1_INDEX.out.versions)
-    versions = versions.mix(BWAMEM2_INDEX.out.versions)
-    versions = versions.mix(DRAGMAP_HASHTABLE.out.versions)
-    versions = versions.mix(GATK4_CREATESEQUENCEDICTIONARY.out.versions)
-    versions = versions.mix(MSISENSORPRO_SCAN.out.versions)
-    versions = versions.mix(SAMTOOLS_FAIDX.out.versions)
-    versions = versions.mix(TABIX_BCFTOOLS_ANNOTATIONS.out.versions)
-    versions = versions.mix(TABIX_DBSNP.out.versions)
-    versions = versions.mix(TABIX_GERMLINE_RESOURCE.out.versions)
-    versions = versions.mix(TABIX_KNOWN_INDELS.out.versions)
-    versions = versions.mix(TABIX_KNOWN_SNPS.out.versions)
-    versions = versions.mix(TABIX_PON.out.versions)
-
-    emit:
-    bcftools_annotations_tbi = TABIX_BCFTOOLS_ANNOTATIONS.out.tbi.map{ meta, tbi -> [tbi] }.collect() // path: bcftools_annotations.vcf.gz.tbi
-    bwa                      = BWAMEM1_INDEX.out.index.collect()                                      // path: bwa/*
-    bwamem2                  = BWAMEM2_INDEX.out.index.collect()                                      // path: bwamem2/*
-    hashtable                = DRAGMAP_HASHTABLE.out.hashmap.collect()                                // path: dragmap/*
-    dbsnp_tbi                = TABIX_DBSNP.out.tbi.map{ meta, tbi -> [tbi] }.collect()                // path: dbsnb.vcf.gz.tbi
-    dict                     = GATK4_CREATESEQUENCEDICTIONARY.out.dict.collect()                      // path: genome.fasta.dict
-    fasta_fai                = SAMTOOLS_FAIDX.out.fai.collect()                                       // path: genome.fasta.fai
-    germline_resource_tbi    = TABIX_GERMLINE_RESOURCE.out.tbi.map{ meta, tbi -> [tbi] }.collect()    // path: germline_resource.vcf.gz.tbi
-    known_snps_tbi           = TABIX_KNOWN_SNPS.out.tbi.map{ meta, tbi -> [tbi] }.collect()           // path: {known_indels*}.vcf.gz.tbi
-    known_indels_tbi         = TABIX_KNOWN_INDELS.out.tbi.map{ meta, tbi -> [tbi] }.collect()         // path: {known_indels*}.vcf.gz.tbi
-    msisensorpro_scan        = MSISENSORPRO_SCAN.out.list.map{ meta, list -> [list] }                 // path: genome_msi.list
-    pon_tbi                  = TABIX_PON.out.tbi.map{ meta, tbi -> [tbi] }.collect()                  // path: pon.vcf.gz.tbi
-
-    allele_files // path: allele_files
-    chr_files    // path: chr_files
-    gc_file      // path: gc_file
-    loci_files   // path: loci_files
-    rt_file      // path: rt_file
-
-    versions // channel: [ versions.yml ]
-}
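With this deletion, everything PREPARE_GENOME used to build at runtime (aligner indices, the sequence dictionary, the `.fai`, the tabix indices, and the unzipped or untarred ASCAT and Control-FREEC files) is expected to exist before Sarek starts, typically produced by nf-core/references as in the CI job above. If a single index is missing, a one-off mini-workflow can recreate it with the same nf-core module the deleted code called; a hedged sketch, assuming the module is vendored at the usual local path:

```nextflow
// One-off .tbi generation, reusing the deleted subworkflow's own pattern:
// [ file1, file2 ] becomes [ [id:file1.baseName], file1 ], ... so a
// multi-value param such as --known_indels is indexed file by file.
include { TABIX_TABIX } from './modules/nf-core/tabix/tabix/main' // assumed path

workflow {
    vcfs = Channel.fromPath(params.known_indels)
        .flatten()
        .map { it -> [ [ id: it.baseName ], it ] }
    TABIX_TABIX(vcfs)
}
```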
diff --git a/subworkflows/local/prepare_intervals/main.nf b/subworkflows/local/prepare_intervals/main.nf
index 27c4e9c145..9b7783ed42 100644
--- a/subworkflows/local/prepare_intervals/main.nf
+++ b/subworkflows/local/prepare_intervals/main.nf
@@ -6,11 +6,10 @@
 // For all modules here:
 // A when clause condition is defined in the conf/modules.config to determine if the module should be run
 
-include { CREATE_INTERVALS_BED                                   } from '../../../modules/local/create_intervals_bed/main'
-include { GATK4_INTERVALLISTTOBED                                } from '../../../modules/nf-core/gatk4/intervallisttobed/main'
-include { GAWK as BUILD_INTERVALS                                } from '../../../modules/nf-core/gawk/main'
-include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_SPLIT    } from '../../../modules/nf-core/tabix/bgziptabix/main'
-include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_COMBINED } from '../../../modules/nf-core/tabix/bgziptabix/main'
+include { CREATE_INTERVALS_BED                                   } from '../../../modules/local/create_intervals_bed'
+include { GATK4_INTERVALLISTTOBED                                } from '../../../modules/nf-core/gatk4/intervallisttobed'
+include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_SPLIT    } from '../../../modules/nf-core/tabix/bgziptabix'
+include { TABIX_BGZIPTABIX as TABIX_BGZIPTABIX_INTERVAL_COMBINED } from '../../../modules/nf-core/tabix/bgziptabix'
 
 workflow PREPARE_INTERVALS {
     take:
@@ -38,31 +37,18 @@ workflow PREPARE_INTERVALS {
         intervals_combined = Channel.fromPath(file("${outdir}/no_intervals.bed")).map{ it -> [ [ id:it.simpleName ], it ] }
     } else if (step != 'annotate' && step != 'controlfreec') {
         // If no interval/target file is provided, then generated intervals from FASTA file
-        if (!intervals) {
-            BUILD_INTERVALS(fasta_fai, [])
+        intervals_combined = Channel.fromPath(file(intervals)).map{it -> [ [ id:it.baseName ], it ] }
+        CREATE_INTERVALS_BED(file(intervals), nucleotides_per_second)
 
-            intervals_combined = BUILD_INTERVALS.out.output
+        intervals_bed = CREATE_INTERVALS_BED.out.bed
 
-            CREATE_INTERVALS_BED(intervals_combined.map{ meta, path -> path }, nucleotides_per_second)
+        versions = versions.mix(CREATE_INTERVALS_BED.out.versions)
 
-            intervals_bed = CREATE_INTERVALS_BED.out.bed
-
-            versions = versions.mix(BUILD_INTERVALS.out.versions)
-            versions = versions.mix(CREATE_INTERVALS_BED.out.versions)
-        } else {
-            intervals_combined = Channel.fromPath(file(intervals)).map{it -> [ [ id:it.baseName ], it ] }
-            CREATE_INTERVALS_BED(file(intervals), nucleotides_per_second)
-
-            intervals_bed = CREATE_INTERVALS_BED.out.bed
-
-            versions = versions.mix(CREATE_INTERVALS_BED.out.versions)
-
-            // If interval file is not provided as .bed, but e.g. as .interval_list then convert to BED format
-            if (intervals.endsWith(".interval_list")) {
-                GATK4_INTERVALLISTTOBED(intervals_combined)
-                intervals_combined = GATK4_INTERVALLISTTOBED.out.bed
-                versions = versions.mix(GATK4_INTERVALLISTTOBED.out.versions)
-            }
+        // If interval file is not provided as .bed, but e.g. as .interval_list then convert to BED format
+        if (intervals.endsWith(".interval_list")) {
+            GATK4_INTERVALLISTTOBED(intervals_combined)
+            intervals_combined = GATK4_INTERVALLISTTOBED.out.bed
+            versions = versions.mix(GATK4_INTERVALLISTTOBED.out.versions)
         }
 
     // Now for the intervals.bed the following operations are done:
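The removed branch is the GAWK-based `BUILD_INTERVALS` fallback, which derived a genome-wide interval file from the FASTA `.fai` index whenever `--intervals` was absent; after this change an interval file has to be supplied up front, in the CI case via the intervals built by nf-core/references (`--tools ...,intervals,...`). A hedged stand-in if that BED now needs to be created outside the pipeline, assuming the fallback emitted one `chrom<TAB>0<TAB>length` line per contig:

```nextflow
// Rebuild the whole-genome BED the removed fallback used to produce.
// .fai columns are: name, length, offset, linebases, linewidth.
workflow {
    Channel.fromPath(params.fasta_fai)
        .splitCsv(sep: '\t')
        .map { row -> "${row[0]}\t0\t${row[1]}" } // chrom, 0, length
        .collectFile(name: 'genome.bed', newLine: true)
        .view { bed -> "wrote ${bed}" }
}
```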