diff --git a/CHANGELOG.md b/CHANGELOG.md index f251b17d20..058f5c97f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ A set of connecting glaciers. ### Added +- [1613](https://github.com/nf-core/sarek/pull/1613) - Add `indexcov` - [1638](https://github.com/nf-core/sarek/pull/1638) - Added additional documentation detailing ASCAT WES usage. - [1640](https://github.com/nf-core/sarek/pull/1620) - Add `lofreq` as a tumor-only variant caller - [1642](https://github.com/nf-core/sarek/pull/1642) - Back to dev diff --git a/README.md b/README.md index 33892e0a6b..c4f7b5443b 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,7 @@ Depending on the options and samples provided, the pipeline can currently perfor - `freebayes` - `GATK HaplotypeCaller` - `Manta` + - `indexcov` - `mpileup` - `MSIsensor-pro` - `Mutect2` @@ -171,6 +172,7 @@ We thank the following people for their extensive assistance in the development - [pallolason](https://github.com/pallolason) - [Paul Cantalupo](https://github.com/pcantalupo) - [Phil Ewels](https://github.com/ewels) +- [Pierre Lindenbaum](https://github.com/lindenb) - [Sabrina Krakau](https://github.com/skrakau) - [Sam Minot](https://github.com/sminot) - [Sebastian-D](https://github.com/Sebastian-D) diff --git a/conf/modules/indexcov.config b/conf/modules/indexcov.config new file mode 100644 index 0000000000..082ea3b7cc --- /dev/null +++ b/conf/modules/indexcov.config @@ -0,0 +1,21 @@ + +// INDEXCOV + +process { + if (params.tools && params.tools.split(',').contains('indexcov')) { + + withName: 'SAMTOOLS_REINDEX_BAM' { + ext.args = { ' -F 3844 -q 30 ' } // MAPQ >= 30; exclude unmapped, secondary, QC-fail, duplicate and supplementary reads (flag mask 3844) + } + + withName: 'GOLEFT_INDEXCOV' { + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/indexcov/" } + ] + + } + + } + +} diff --git a/docs/images/sarek_subway.png b/docs/images/sarek_subway.png index f2f20ffc01..a381343500 100644 Binary files a/docs/images/sarek_subway.png and 
b/docs/images/sarek_subway.png differ diff --git a/docs/images/sarek_subway.svg b/docs/images/sarek_subway.svg index 62544b70fc..6d8d172652 100644 --- a/docs/images/sarek_subway.svg +++ b/docs/images/sarek_subway.svg @@ -32,12 +32,12 @@ inkscape:pagecheckerboard="false" inkscape:document-units="mm" showgrid="true" - inkscape:zoom="0.61695405" - inkscape:cx="709.12898" - inkscape:cy="403.59569" - inkscape:window-width="1440" - inkscape:window-height="847" - inkscape:window-x="0" + inkscape:zoom="1.8101934" + inkscape:cx="659.04562" + inkscape:cy="459.61941" + inkscape:window-width="2560" + inkscape:window-height="1027" + inkscape:window-x="1512" inkscape:window-y="25" inkscape:window-maximized="1" inkscape:current-layer="layer4" @@ -795,13 +795,27 @@ id="rect6693-4" style="display:inline;fill:#e6e6e6;fill-opacity:1;stroke:none;stroke-width:4.00201;stroke-linecap:butt;stroke-linejoin:bevel;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;paint-order:normal" />indexcovdeepvariantfreebayeshaplotypecallerstrelka2tidditmutect2freebayesmantalofreqlofreqExample analysis pathwaysmpileupmpileupSentieon haplotyperSentieon dnascopeSNPs & IndelsMSI + 
style="color:#000000;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:medium;line-height:normal;font-family:sans-serif;font-variant-ligatures:normal;font-variant-position:normal;font-variant-caps:normal;font-variant-numeric:normal;font-variant-alternates:normal;font-feature-settings:normal;text-indent:0;text-align:start;text-decoration:none;text-decoration-line:none;text-decoration-style:solid;text-decoration-color:#000000;letter-spacing:normal;word-spacing:normal;text-transform:none;writing-mode:lr-tb;direction:ltr;text-orientation:mixed;dominant-baseline:auto;baseline-shift:baseline;text-anchor:start;white-space:normal;shape-padding:0;clip-rule:nonzero;display:inline;overflow:visible;visibility:visible;opacity:1;isolation:auto;mix-blend-mode:normal;color-interpolation:sRGB;color-interpolation-filters:linearRGB;solid-color:#000000;solid-opacity:1;vector-effect:none;fill:#000000;fill-opacity:1;fill-rule:nonzero;stroke:none;stroke-width:4.00004;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-dashoffset:0;stroke-opacity:1;color-rendering:auto;image-rendering:auto;shape-rendering:auto;text-rendering:auto;enable-background:accumulate" />strelka2 diff --git a/docs/output.md b/docs/output.md index 6204ada6a5..6d723ba03b 100644 --- a/docs/output.md +++ b/docs/output.md @@ -45,6 +45,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [Strelka](#strelka) - [Lofreq](#lofreq) - [Structural Variants](#structural-variants) + - [Indexcov](#indexcov) - [Manta](#manta) - [TIDDIT](#tiddit) - [Sample heterogeneity, ploidy and CNVs](#sample-heterogeneity-ploidy-and-cnvs) @@ -592,6 +593,30 @@ For further downstream analysis, take a look [here](https://github.com/Illumina/ ### Structural Variants +#### indexcov + +[indexcov](https://github.com/brentp/goleft/tree/master/indexcov) quickly estimates coverage from a whole-genome bam or cram index. 
+A bam index has 16KB resolution and it is used as a coverage estimate. +The output is scaled to around 1. So a long stretch with values of 1.5 would be a heterozygous duplication. This is useful as a quick QC to get coverage values across the genome. + +**Output directory: `{outdir}/variant_calling/indexcov/`** + +In addition to the interactive HTML files, `indexcov` outputs a number of text files: + +- `-indexcov.ped`: a .ped/.fam file with the inferred sex in the appropriate column if the sex chromosomes were found. + the CNX and CNY columns indicating the floating-point estimate of copy-number for those chromosomes. + `bins.out`: how many bins had a coverage value outside of (0.85, 1.15). High values can indicate high-bias samples. + `bins.lo`: number of bins with value < 0.15. High values indicate missing data. + `bins.hi`: number of bins with value > 1.15. + `bins.in`: number of bins with value inside of (0.85, 1.15) + `p.out`: `bins.out/bins.in` + `PC1...PC5`: PCA projections calculated with depth of autosomes. + +- `-indexcov.roc`: tab-delimited columns of chrom, scaled coverage cutoff, and $n_samples columns where each indicates the + proportion of 16KB blocks at or above that scaled coverage value. +- `-indexcov.bed.gz`: a bed file with columns of chrom, start, end, and a column per sample where the values indicate the + scaled coverage for that sample in that 16KB chunk. + #### Manta [Manta](https://github.com/Illumina/manta) calls structural variants (SVs) and indels from mapped paired-end sequencing reads. 
diff --git a/docs/usage.md b/docs/usage.md index bedacabd11..0a279cd6c5 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -585,6 +585,7 @@ This list is by no means exhaustive and it will depend on the specific analysis | [mpileup](https://www.htslib.org/doc/samtools-mpileup.html) | x | x | x | x | x | - | | [Strelka](https://github.com/Illumina/strelka) | x | x | x | x | - | x | | [Manta](https://github.com/Illumina/manta) | x | x | x | x | x | x | +| [indexcov](https://github.com/brentp/goleft/tree/master/indexcov) | x | - | - | x | - | x | | [TIDDIT](https://github.com/SciLifeLab/TIDDIT) | x | x | x | x | x | x | | [ASCAT](https://github.com/VanLoo-lab/ascat) | x | x | - | - | - | x | | [CNVKit](https://cnvkit.readthedocs.io/en/stable/) | x | x | - | x | x | x | @@ -921,30 +922,30 @@ nextflow run nf-core/sarek --known_indels false --genome GRCh38.GATK For GATK.GRCh38 the links for each reference file and the corresponding processes that use them is listed below. For GATK.GRCh37 the files originate from the same sources: -| File | Tools | Origin | Docs | -| :-------------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :-------------------------------------------------------------------------------------------------------------------- | :----------------------------------------------------------------------------------- | -| ascat_alleles | ASCAT | https://www.dropbox.com/s/uouszfktzgoqfy7/G1000_alleles_hg38.zip | https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS | -| ascat_loci | ASCAT | https://www.dropbox.com/s/80cq0qgao8l1inj/G1000_loci_hg38.zip | 
https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS | -| ascat_loci_gc | ASCAT | https://www.dropbox.com/s/80cq0qgao8l1inj/G1000_loci_hg38.zip | https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS | -| ascat_loci_rt | ASCAT | https://www.dropbox.com/s/xlp99uneqh6nh6p/RT_G1000_hg38.zip | https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS | -| bwa | bwa-mem | `bwa index -p bwa/${fasta.baseName} $fasta` | | -| bwamem2 | bwa-mem2 | `bwa-mem2 index -p bwamem2/${fasta} $fasta` | | -| dragmap | DragMap | `dragen-os --build-hash-table true --ht-reference $fasta --output-directory dragmap` | | -| dbsnp | Baserecalibrator, ControlFREEC, GenotypeGVCF, HaplotypeCaller | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle | -| dbsnp_tbi | Baserecalibrator, ControlFREEC, GenotypeGVCF, HaplotypeCaller | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | | -| dict | Baserecalibrator(Spark), CNNScoreVariant, EstimateLibraryComplexity, FilterMutectCalls, FilterVariantTranches, GatherPileupSummaries,GenotypeGVCF, GetPileupSummaries, HaplotypeCaller, MarkDulpicates(Spark), MergeVCFs, Mutect2, Variantrecalibrator | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle | -| fasta | ApplyBQSR(Spark), ApplyVQSR, ASCAT, Baserecalibrator(Spark), BWA, BWAMem2, CNNScoreVariant, CNVKit, ControlFREEC, DragMap, DEEPVariant, EnsemblVEP, EstimateLibraryComplexity, FilterMutectCalls, FilterVariantTranches, FreeBayes, GatherPileupSummaries,GenotypeGVCF, GetPileupSummaries, HaplotypeCaller, interval building, Manta, MarkDuplicates(Spark),MergeVCFs,MSISensorPro, Mutect2, Samtools, SnpEff, Strelka, 
Tiddit, Variantrecalibrator | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle | -| fasta_fai | ApplyBQSR(Spark), ApplyVQSR, ASCAT, Baserecalibrator(Spark), BWA, BWAMem2, CNNScoreVariant, CNVKit, ControlFREEC, DragMap, DEEPVariant, EnsemblVEP, EstimateLibraryComplexity, FilterMutectCalls, FilterVariantTranches, FreeBayes, GatherPileupSummaries,GenotypeGVCF, GetPileupSummaries, HaplotypeCaller, interval building, Manta, MarkDuplicates(Spark),MergeVCFs,MSISensorPro, Mutect2, Samtools, SnpEff, Strelka, Tiddit, Variantrecalibrator | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle | -| germline_resource | GetPileupsummaries,Mutect2 | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | | -| germline_resource_tbi | GetPileupsummaries,Mutect2 | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | | -| intervals | ApplyBQSR(Spark), ASCAT, Baserecalibrator(Spark), BCFTools, CNNScoreVariants, ControlFREEC, Deepvariant, FilterVariantTranches, FreeBayes, GenotypeGVCF, GetPileupSummaries, HaplotypeCaller, Strelka, mpileup, MSISensorPro, Mutect2, VCFTools | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | | -| known_indels | BaseRecalibrator(Spark), FilterVariantTranches | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | | -| known_indels_tbi | BaseRecalibrator(Spark), FilterVariantTranches | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | | 
-| known_snps | BaseRecalibrator(Spark), FilterVariantTranches, VariantRecalibrator | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | | -| known_snps_tbi | BaseRecalibrator(Spark), FilterVariantTranches, VariantRecalibrator | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | -| mappability | ControlFREEC | http://xfer.curie.fr/get/vyIi4w8EONl/out100m2_hg38.zip | http://boevalab.inf.ethz.ch/FREEC/tutorial.html | -| pon | Mutect2 | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON- | -| pon_tbi | Mutect2 | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON- | +| File | Tools | Origin | Docs | +| :-------------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :-------------------------------------------------------------------------------------------------------------------- | :----------------------------------------------------------------------------------- | +| ascat_alleles | ASCAT | https://www.dropbox.com/s/uouszfktzgoqfy7/G1000_alleles_hg38.zip | https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS | +| ascat_loci | ASCAT | https://www.dropbox.com/s/80cq0qgao8l1inj/G1000_loci_hg38.zip | 
https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS | +| ascat_loci_gc | ASCAT | https://www.dropbox.com/s/80cq0qgao8l1inj/G1000_loci_hg38.zip | https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS | +| ascat_loci_rt | ASCAT | https://www.dropbox.com/s/xlp99uneqh6nh6p/RT_G1000_hg38.zip | https://github.com/VanLoo-lab/ascat/tree/master/ReferenceFiles/WGS | +| bwa | bwa-mem | `bwa index -p bwa/${fasta.baseName} $fasta` | | +| bwamem2 | bwa-mem2 | `bwa-mem2 index -p bwamem2/${fasta} $fasta` | | +| dragmap | DragMap | `dragen-os --build-hash-table true --ht-reference $fasta --output-directory dragmap` | | +| dbsnp | Baserecalibrator, ControlFREEC, GenotypeGVCF, HaplotypeCaller | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle | +| dbsnp_tbi | Baserecalibrator, ControlFREEC, GenotypeGVCF, HaplotypeCaller | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | | +| dict | Baserecalibrator(Spark), CNNScoreVariant, EstimateLibraryComplexity, FilterMutectCalls, FilterVariantTranches, GatherPileupSummaries,GenotypeGVCF, GetPileupSummaries, HaplotypeCaller, MarkDulpicates(Spark), MergeVCFs, Mutect2, Variantrecalibrator | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle | +| fasta | ApplyBQSR(Spark), ApplyVQSR, ASCAT, Baserecalibrator(Spark), BWA, BWAMem2, CNNScoreVariant, CNVKit, ControlFREEC, DragMap, DEEPVariant, EnsemblVEP, EstimateLibraryComplexity, FilterMutectCalls, FilterVariantTranches, FreeBayes, GatherPileupSummaries,GenotypeGVCF, GetPileupSummaries, HaplotypeCaller, indexcov, interval building, Manta, MarkDuplicates(Spark),MergeVCFs,MSISensorPro, Mutect2, Samtools, SnpEff, 
Strelka, Tiddit, Variantrecalibrator | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle | +| fasta_fai | ApplyBQSR(Spark), ApplyVQSR, ASCAT, Baserecalibrator(Spark), BWA, BWAMem2, CNNScoreVariant, CNVKit, ControlFREEC, DragMap, DEEPVariant, EnsemblVEP, EstimateLibraryComplexity, FilterMutectCalls, FilterVariantTranches, FreeBayes, GatherPileupSummaries,GenotypeGVCF, GetPileupSummaries, HaplotypeCaller, indexcov, interval building, Manta, MarkDuplicates(Spark),MergeVCFs,MSISensorPro, Mutect2, Samtools, SnpEff, Strelka, Tiddit, Variantrecalibrator | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle | +| germline_resource | GetPileupsummaries,Mutect2 | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | | +| germline_resource_tbi | GetPileupsummaries,Mutect2 | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | | +| intervals | ApplyBQSR(Spark), ASCAT, Baserecalibrator(Spark), BCFTools, CNNScoreVariants, ControlFREEC, Deepvariant, FilterVariantTranches, FreeBayes, GenotypeGVCF, GetPileupSummaries, HaplotypeCaller, Strelka, mpileup, MSISensorPro, Mutect2, VCFTools | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | | +| known_indels | BaseRecalibrator(Spark), FilterVariantTranches | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | | +| known_indels_tbi | BaseRecalibrator(Spark), FilterVariantTranches | 
[GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | | +| known_snps | BaseRecalibrator(Spark), FilterVariantTranches, VariantRecalibrator | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | | +| known_snps_tbi | BaseRecalibrator(Spark), FilterVariantTranches, VariantRecalibrator | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | +| mappability | ControlFREEC | http://xfer.curie.fr/get/vyIi4w8EONl/out100m2_hg38.zip | http://boevalab.inf.ethz.ch/FREEC/tutorial.html | +| pon | Mutect2 | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON- | +| pon_tbi | Mutect2 | [GATKBundle](https://console.cloud.google.com/storage/browser/_details/genomics-public-data/resources/broad/hg38/v0/) | https://gatk.broadinstitute.org/hc/en-us/articles/360035890631-Panel-of-Normals-PON- | ## How to customise SnpEff and VEP annotation diff --git a/modules.json b/modules.json index 1b00aac7e4..ad4fd57616 100644 --- a/modules.json +++ b/modules.json @@ -310,6 +310,11 @@ "git_sha": "97321eded31a12598837a476d3615300af413bb7", "installed_by": ["modules"] }, + "goleft/indexcov": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] + }, "lofreq/callparallel": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", diff --git a/modules/local/samtools/reindex_bam/environment.yml b/modules/local/samtools/reindex_bam/environment.yml new file mode 100644 index 0000000000..da2df5e43a --- /dev/null +++ b/modules/local/samtools/reindex_bam/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::samtools=1.20 + - 
bioconda::htslib=1.20 diff --git a/modules/local/samtools/reindex_bam/main.nf b/modules/local/samtools/reindex_bam/main.nf new file mode 100644 index 0000000000..153f9093d6 --- /dev/null +++ b/modules/local/samtools/reindex_bam/main.nf @@ -0,0 +1,57 @@ +/** + * The aim of this process is to re-index the BAM file without duplicate, supplementary, unmapped reads etc., for goleft/indexcov + * It creates a BAM containing only a header (so indexcov can get the sample name) and a BAM index where low-quality reads, supplementary alignments etc. have been removed + */ +process SAMTOOLS_REINDEX_BAM { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.20--h50ea8bc_0' : + 'biocontainers/samtools:1.20--h50ea8bc_0' }" + + input: + tuple val(meta), path(input), path(input_index) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + + output: + tuple val(meta), path("${meta.id}.reindex.bam"), path("${meta.id}.reindex.bam.bai"),emit: output + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def reference = fasta ? "--reference ${fasta}" : "" + """ + # write header only + samtools \\ + view \\ + --header-only \\ + --threads ${task.cpus} \\ + -O BAM \\ + -o "${meta.id}.reindex.bam" \\ + ${reference} \\ + ${input} + + # write BAM index only, remove unmapped, supplementary, etc... 
+ samtools \\ + view \\ + --uncompressed \\ + --write-index \\ + --threads ${task.cpus} \\ + -O BAM \\ + -o "/dev/null##idx##${meta.id}.reindex.bam.bai" \\ + ${reference} \\ + ${args} \\ + ${input} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/goleft/indexcov/environment.yml b/modules/nf-core/goleft/indexcov/environment.yml new file mode 100644 index 0000000000..813146929c --- /dev/null +++ b/modules/nf-core/goleft/indexcov/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::goleft=0.2.4 + - bioconda::htslib=1.12 diff --git a/modules/nf-core/goleft/indexcov/main.nf b/modules/nf-core/goleft/indexcov/main.nf new file mode 100644 index 0000000000..5d0ed5dfb0 --- /dev/null +++ b/modules/nf-core/goleft/indexcov/main.nf @@ -0,0 +1,65 @@ +process GOLEFT_INDEXCOV { + tag "${meta.id}" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/goleft:0.2.4--h9ee0642_1': + 'biocontainers/goleft:0.2.4--h9ee0642_1' }" + + input: + tuple val(meta), path(bams), path(indexes) + tuple val(meta2), path(fai) + + output: + tuple val(meta), path("${prefix}/*") , emit: output + tuple val(meta), path("${prefix}/*ped") , emit: ped , optional: true + tuple val(meta), path("${prefix}/*bed.gz") , emit: bed , optional: true + tuple val(meta), path("${prefix}/*bed.gz.tbi"), emit: bed_index , optional: true + tuple val(meta), path("${prefix}/*roc") , emit: roc , optional: true + tuple val(meta), path("${prefix}/*html") , emit: html, optional: true + tuple val(meta), path("${prefix}/*png") , emit: png , optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + // indexcov uses BAM files or CRAI + def input_files = bams.findAll{it.name.endsWith(".bam")} + indexes.findAll{it.name.endsWith(".crai")} + def extranormalize = input_files.any{it.name.endsWith(".crai")} ? 
" --extranormalize " : "" + """ + goleft indexcov \\ + --fai ${fai} \\ + --directory ${prefix} \\ + ${extranormalize} \\ + $args \\ + ${input_files.join(" ")} + + if [ -f "${prefix}/${prefix}-indexcov.bed.gz" ] ; then + tabix -p bed "${prefix}/${prefix}-indexcov.bed.gz" + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + goleft: \$(goleft --version 2>&1 | head -n 1 | sed 's/^.*goleft Version: //') + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir "${prefix}" + echo "" | gzip > "${prefix}/${prefix}-indexcov.bed.gz" + touch "${prefix}/${prefix}-indexcov.bed.gz.tbi" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + goleft: \$(goleft --version 2>&1 | head -n 1 | sed 's/^.*goleft Version: //') + tabix: \$(echo \$(tabix -h 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/goleft/indexcov/meta.yml b/modules/nf-core/goleft/indexcov/meta.yml new file mode 100644 index 0000000000..1619caf32d --- /dev/null +++ b/modules/nf-core/goleft/indexcov/meta.yml @@ -0,0 +1,122 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "goleft_indexcov" +description: Quickly estimate coverage from a whole-genome bam or cram index. A bam + index has 16KB resolution so that's what this gives, but it provides what appears + to be a high-quality coverage estimate in seconds per genome. 
+keywords: + - coverage + - cnv + - genomics + - depth +tools: + - "goleft": + description: "goleft is a collection of bioinformatics tools distributed under + MIT license in a single static binary" + homepage: "https://github.com/brentp/goleft" + documentation: "https://github.com/brentp/goleft" + tool_dev_url: "https://github.com/brentp/goleft" + doi: "10.1093/gigascience/gix090" + licence: ["MIT"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false] + - bams: + type: file + description: Sorted BAM/CRAM/SAM files + pattern: "*.{bam,cram,sam}" + - indexes: + type: file + description: BAI/CRAI files + pattern: "*.{bai,crai}" + - - meta2: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false] + - fai: + type: file + description: FASTA index + pattern: "*.{fai}" +output: + - output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ${prefix}/*: + type: file + description: Files generated by indexcov + - ped: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ${prefix}/*ped: + type: file + description: ped files + pattern: "*ped" + - bed: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ${prefix}/*bed.gz: + type: file + description: bed files + pattern: "*bed.gz" + - bed_index: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ${prefix}/*bed.gz.tbi: + type: file + description: bed index files + pattern: "*bed.gz.tbi" + - roc: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - ${prefix}/*roc: + type: file + description: roc files + pattern: "*roc" + - html: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ${prefix}/*html: + type: file + description: html files + pattern: "*html" + - png: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ${prefix}/*png: + type: file + description: png files + pattern: "*png" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@lindenb" +maintainers: + - "@lindenb" diff --git a/modules/nf-core/goleft/indexcov/tests/main.nf.test b/modules/nf-core/goleft/indexcov/tests/main.nf.test new file mode 100644 index 0000000000..1296c644cd --- /dev/null +++ b/modules/nf-core/goleft/indexcov/tests/main.nf.test @@ -0,0 +1,131 @@ +nextflow_process { + + name "Test Process GOLEFT_INDEXCOV" + script "../main.nf" + process "GOLEFT_INDEXCOV" + + tag "modules" + tag "modules_nfcore" + tag "goleft" + tag "goleft/indexcov" + + test("sarscov2 - bam") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test' ], // meta map + [ + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.single_end.sorted.bam", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam", checkIfExists: true) + ], + [ + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.single_end.sorted.bam.bai", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam.bai", checkIfExists: true) + ], + ]) + + input[1] = Channel.of( + [ + [:], + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.fasta.fai", checkIfExists: true) + ] + ) + """ + } + } + + then { + assertAll( + { assert 
process.success }, + { assert snapshot( + process.out.ped, + process.out.bed, + file(process.out.bed_index[0][1]).name, + process.out.roc, + process.out.html, + process.out.png, + process.out.versions + ).match() } + ) + } + + } + + + test("sarscov2 - crai") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test' ], // meta map + [ + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram", checkIfExists: true) + ], + [ + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/cram/test.paired_end.markduplicates.sorted.cram.crai", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram.crai", checkIfExists: true) + ] + ]) + + input[1] = Channel.of( + [ + [:], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/genome/chr21/sequence/genome.fasta.fai", checkIfExists: true) + ] + ) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.ped, + process.out.bed, + file(process.out.bed_index[0][1]).name, + process.out.roc, + process.out.html, + process.out.png, + process.out.versions + ).match() } + ) + } + + } + + test("sarscov2 - stub") { + + options "-stub" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test' ], // meta map + [], + [] + ]) + + input[1] = Channel.of([ + [:], + [] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/goleft/indexcov/tests/main.nf.test.snap b/modules/nf-core/goleft/indexcov/tests/main.nf.test.snap new file mode 100644 index 0000000000..1c79232db0 --- /dev/null +++ b/modules/nf-core/goleft/indexcov/tests/main.nf.test.snap @@ -0,0 +1,205 @@ +{ + 
"sarscov2 - crai": { + "content": [ + [ + [ + { + "id": "test" + }, + "test-indexcov.ped:md5,8737714b6ea160e06d5282391f89f791" + ] + ], + [ + [ + { + "id": "test" + }, + "test-indexcov.bed.gz:md5,04aa3637cffca5d99316df7741c06589" + ] + ], + "test-indexcov.bed.gz.tbi", + [ + [ + { + "id": "test" + }, + "test-indexcov.roc:md5,548b76fdf16e97768b0c9b8ecbfd5bef" + ] + ], + [ + [ + { + "id": "test" + }, + [ + "index.html:md5,41840ede180b20cdf6074c431269929e", + "test-indexcov-depth-chr21.html:md5,4c839b03f2f41e3fdca5642903c35008", + "test-indexcov-roc-chr21.html:md5,f84b547328a23196f16f71d093eb7450" + ] + ] + ], + [ + [ + { + "id": "test" + }, + [ + "test-indexcov-depth-chr21.png:md5,1999b0bf1cd0680f6d107d438e7257cf", + "test-indexcov-roc-chr21.png:md5,41f1460535b255fff053da59fcccf698" + ] + ] + ], + [ + "versions.yml:md5,f9c06c1c05a2a31854b4e04e449a24c5" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-08-22T06:40:17.142801459" + }, + "sarscov2 - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + [ + "test-indexcov.bed.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "test-indexcov.bed.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test" + }, + "test-indexcov.bed.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "3": [ + [ + { + "id": "test" + }, + "test-indexcov.bed.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "4": [ + + ], + "5": [ + + ], + "6": [ + + ], + "7": [ + "versions.yml:md5,f9c06c1c05a2a31854b4e04e449a24c5" + ], + "bed": [ + [ + { + "id": "test" + }, + "test-indexcov.bed.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "bed_index": [ + [ + { + "id": "test" + }, + "test-indexcov.bed.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "html": [ + + ], + "output": [ + [ + { + "id": "test" + }, + [ + "test-indexcov.bed.gz:md5,68b329da9893e34099c7d8ad5cb9c940", + "test-indexcov.bed.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + ], 
+ "ped": [ + + ], + "png": [ + + ], + "roc": [ + + ], + "versions": [ + "versions.yml:md5,f9c06c1c05a2a31854b4e04e449a24c5" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-08-22T06:44:59.203730744" + }, + "sarscov2 - bam": { + "content": [ + [ + [ + { + "id": "test" + }, + "test-indexcov.ped:md5,da2bd9882474d2f00f8ad2ab20b140c9" + ] + ], + [ + [ + { + "id": "test" + }, + "test-indexcov.bed.gz:md5,eab7a78287e261d600c06def12a33029" + ] + ], + "test-indexcov.bed.gz.tbi", + [ + [ + { + "id": "test" + }, + "test-indexcov.roc:md5,3f460308bb86203d1ada71b7c84d995d" + ] + ], + [ + [ + { + "id": "test" + }, + "index.html:md5,d1cc28023cd827446e0f9c905c94fe3e" + ] + ], + [ + + ], + [ + "versions.yml:md5,f9c06c1c05a2a31854b4e04e449a24c5" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-08-22T06:39:48.470187823" + } +} \ No newline at end of file diff --git a/modules/nf-core/goleft/indexcov/tests/tags.yml b/modules/nf-core/goleft/indexcov/tests/tags.yml new file mode 100644 index 0000000000..c27c4b9d5e --- /dev/null +++ b/modules/nf-core/goleft/indexcov/tests/tags.yml @@ -0,0 +1,2 @@ +goleft/indexcov: + - "modules/nf-core/goleft/indexcov/**" diff --git a/nextflow.config b/nextflow.config index d3baa4fdb4..1268aab63a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -460,6 +460,7 @@ includeConfig 'conf/modules/controlfreec.config' includeConfig 'conf/modules/deepvariant.config' includeConfig 'conf/modules/freebayes.config' includeConfig 'conf/modules/haplotypecaller.config' +includeConfig 'conf/modules/indexcov.config' includeConfig 'conf/modules/joint_germline.config' includeConfig 'conf/modules/manta.config' includeConfig 'conf/modules/mpileup.config' diff --git a/nextflow_schema.json b/nextflow_schema.json index 2e66ccdf53..5cdf35d555 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -111,8 +111,8 @@ "type": "string", "fa_icon": "fas fa-toolbox", "description": "Tools 
to use for duplicate marking, variant calling and/or for annotation.", - "help_text": "Multiple tools separated with commas.\n\n**Variant Calling:**\n\nGermline variant calling can currently be performed with the following variant callers:\n- SNPs/Indels: DeepVariant, FreeBayes, GATK HaplotypeCaller, mpileup, Sentieon Haplotyper, Strelka\n- Structural Variants: Manta, TIDDIT\n- Copy-number: CNVKit\n\nTumor-only somatic variant calling can currently be performed with the following variant callers:\n- SNPs/Indels: FreeBayes, mpileup, Mutect2, Lofreq, Strelka\n- Structural Variants: Manta, TIDDIT\n- Copy-number: CNVKit, ControlFREEC\n\nSomatic variant calling can currently only be performed with the following variant callers:\n- SNPs/Indels: FreeBayes, Mutect2, Strelka\n- Structural variants: Manta, TIDDIT\n- Copy-Number: ASCAT, CNVKit, Control-FREEC\n- Microsatellite Instability: MSIsensorpro\n\n> **NB** Mutect2 for somatic variant calling cannot be combined with `--no_intervals`\n\n**Annotation:**\n \n- snpEff, VEP, merge (both consecutively), and bcftools annotate (needs `--bcftools_annotation`).\n\n> **NB** As Sarek will use bgzip and tabix to compress and index VCF files annotated, it expects VCF files to be sorted when starting from `--step annotate`.", - "pattern": "^((ascat|bcfann|cnvkit|controlfreec|deepvariant|freebayes|haplotypecaller|lofreq|sentieon_dnascope|sentieon_haplotyper|manta|merge|mpileup|msisensorpro|mutect2|ngscheckmate|sentieon_dedup|snpeff|strelka|tiddit|vep)?,?)*(? 
**NB** Mutect2 for somatic variant calling cannot be combined with `--no_intervals`\n\n**Annotation:**\n \n- snpEff, VEP, merge (both consecutively), and bcftools annotate (needs `--bcftools_annotation`).\n\n> **NB** As Sarek will use bgzip and tabix to compress and index VCF files annotated, it expects VCF files to be sorted when starting from `--step annotate`.", + "pattern": "^((ascat|bcfann|cnvkit|controlfreec|deepvariant|freebayes|haplotypecaller|lofreq|sentieon_dnascope|sentieon_haplotyper|manta|indexcov|merge|mpileup|msisensorpro|mutect2|ngscheckmate|sentieon_dedup|snpeff|strelka|tiddit|vep)?,?)*(? [ counts ] }) reports = reports.mix(VCF_QC_BCFTOOLS_VCFTOOLS.out.vcftools_tstv_qual.collect{ meta, qual -> [ qual ] }) reports = reports.mix(VCF_QC_BCFTOOLS_VCFTOOLS.out.vcftools_filter_summary.collect{ meta, summary -> [ summary ] }) + reports = reports.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.out_indexcov.collect{ meta, indexcov -> indexcov.flatten() }) + reports = reports.mix(BAM_VARIANT_CALLING_SOMATIC_ALL.out.out_indexcov.collect{ meta, indexcov -> indexcov.flatten() }) CHANNEL_VARIANT_CALLING_CREATE_CSV(vcf_to_annotate, params.outdir)