From da274a53409b576e8a3d8e5cdc4ff2d3fb11a7b8 Mon Sep 17 00:00:00 2001 From: Allan Daly Date: Fri, 16 Feb 2024 11:13:33 +0000 Subject: [PATCH 01/12] Changed config order --- conf/hgi.config | 4 ++-- nextflow.config | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/conf/hgi.config b/conf/hgi.config index 7d86e56e..b44878eb 100644 --- a/conf/hgi.config +++ b/conf/hgi.config @@ -29,8 +29,8 @@ process { } withName: 'GATK4_HAPLOTYPECALLER' { - cpus = { check_max( 2 * task.attempt, 'cpus' ) } - memory = { check_max( 12.GB * task.attempt, 'memory' ) } + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 8.GB * task.attempt, 'memory' ) } ext.args = { params.joint_germline ? "-ERC GVCF -G StandardAnnotation -G AS_StandardAnnotation -G StandardHCAnnotation" : "" } } diff --git a/nextflow.config b/nextflow.config index 67310e63..95ef967e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -242,8 +242,6 @@ profiles { executor.memory = 8.GB executor.name = 'local' } - // Custom profile - hgi { includeConfig 'conf/hgi.config' } // Basic test profile for CI test { includeConfig 'conf/test.config' } test_cache { includeConfig 'conf/test/cache.config' } @@ -389,6 +387,9 @@ includeConfig 'conf/modules/post_variant_calling.config' //annotate includeConfig 'conf/modules/annotate.config' +// Custom profile +includeConfig 'conf/hgi.config' + // Function to ensure that resource requirements don't go beyond // a maximum limit def check_max(obj, type) { From f68a1084ed3fae344c95e81754cefc6044631834 Mon Sep 17 00:00:00 2001 From: Allan Daly Date: Fri, 16 Feb 2024 12:31:35 +0000 Subject: [PATCH 02/12] Added improved schema --- nextflow_schema.json | 245 ++++++++++++++----------------------------- 1 file changed, 81 insertions(+), 164 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 7a469024..b26f8e72 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,33 +10,11 @@ "type": "object", "fa_icon": "fas 
fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "help_text": "Specify input samplesheet, step and output folder.", "required": ["step", "outdir"], "properties": { - "input": { - "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "A design file with information about the samples in your experiment. Use this parameter to specify the location of the input files. It has to be a comma-separated file with a header row. See [usage docs](https://nf-co.re/sarek/usage#input).\n\nIf no input file is specified, sarek will attempt to locate one in the `{outdir}` directory. If no input should be supplied, i.e. when --step is supplied or --build_from_index, then set --input false", - "fa_icon": "fas fa-file-csv", - "schema": "assets/schema_input.json", - "type": "string", - "format": "file-path", - "exists": true, - "mimetype": "text/csv", - "pattern": "^\\S+\\.csv$" - }, - "input_restart": { - "type": "string", - "description": "Automatic retrieval for restart", - "format": "file-path", - "exists": true, - "mimetype": "text/csv", - "pattern": "^\\S+\\.csv$", - "hidden": true, - "schema": "assets/schema_input.json" - }, "step": { "type": "string", - "default": "mapping", + "default": "variant_calling", "fa_icon": "fas fa-play", "description": "Starting step", "help_text": "The pipeline starts from this step and then runs through the possible subsequent steps.", @@ -46,17 +24,27 @@ "prepare_recalibration", "recalibrate", "variant_calling", - "joint_calling", "annotate" ] }, + "input": { + "type": "string", + "format": "file-path", + "mimetype": "text/csv", + "pattern": "\\.csv$", + "schema": "assets/schema_input.json", + "description": "Path to comma-separated file containing information about the samples in the experiment.", + "help_text": "A design file with information about the samples in your experiment. 
Use this parameter to specify the location of the input files. It has to be a comma-separated file with a header row. See [usage docs](https://nf-co.re/sarek/usage#input).\n\nIf no input file is specified, sarek will attempt to locate one in the `{outdir}` directory.", + "fa_icon": "fas fa-file-csv" + }, "outdir": { "type": "string", "format": "directory-path", "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", "fa_icon": "fas fa-folder-open" } - } + }, + "help_text": "" }, "main_options": { "title": "Main options", @@ -80,7 +68,7 @@ "intervals": { "type": "string", "fa_icon": "fas fa-file-alt", - "help_text": "To speed up preprocessing and variant calling processes, the execution is parallelized across a reference chopped into smaller pieces.\n\nParts of preprocessing and variant calling are done by these intervals, the different resulting files are then merged.\nThis can parallelize processes, and push down wall clock time significantly.\n\nWe are aligning to the whole genome, and then run Base Quality Score Recalibration and Variant Calling on the supplied regions.\n\n**Whole Genome Sequencing:**\n\nThe (provided) intervals are chromosomes cut at their centromeres (so each chromosome arm processed separately) also additional unassigned contigs.\n\nWe are ignoring the `hs37d5` contig that contains concatenated decoy sequences.\n\nThe calling intervals can be defined using a .list or a BED file.\nA .list file contains one interval per line in the format `chromosome:start-end` (1-based coordinates).\nA BED file must be a tab-separated text file with one interval per line.\nThere must be at least three columns: chromosome, start, and end (0-based coordinates).\nAdditionally, the score column of the BED file can be used to provide an estimate of how many seconds it will take to call variants on that interval.\nThe fourth column remains 
unused.\n\n```\n|chr1|10000|207666|NA|47.3|\n```\nThis indicates that variant calling on the interval chr1:10001-207666 takes approximately 47.3 seconds.\n\nThe runtime estimate is used in two different ways.\nFirst, when there are multiple consecutive intervals in the file that take little time to compute, they are processed as a single job, thus reducing the number of processes that needs to be spawned.\nSecond, the jobs with largest processing time are started first, which reduces wall-clock time.\nIf no runtime is given, a time of 200000 nucleotides per second is assumed. See `--nucleotides_per_second` on how to customize this.\nActual figures vary from 2 nucleotides/second to 30000 nucleotides/second.\nIf you prefer, you can specify the full path to your reference genome when you run the pipeline:\n\n> **NB** If none provided, will be generated automatically from the FASTA reference\n> **NB** Use --no_intervals to disable automatic generation.\n\n**Targeted Sequencing:**\n\nThe recommended flow for targeted sequencing data is to use the workflow as it is, but also provide a `BED` file containing targets for all steps using the `--intervals` option. In addition, the parameter `--wes` should be set.\nIt is advised to pad the variant calling regions (exons or target) to some extent before submitting to the workflow.\n\nThe procedure is similar to whole genome sequencing, except that only BED file are accepted. See above for formatting description.\nAdding every exon as an interval in case of `WES` can generate >200K processes or jobs, much more forks, and similar number of directories in the Nextflow work directory. These are appropriately grouped together to reduce number of processes run in parallel (see above and `--nucleotides_per_second` for details). 
\nFurthermore, primers and/or baits are not 100% specific, (certainly not for MHC and KIR, etc.), quite likely there going to be reads mapping to multiple locations.\nIf you are certain that the target is unique for your genome (all the reads will certainly map to only one location), and aligning to the whole genome is an overkill, it is actually better to change the reference itself.", + "help_text": "To speed up preprocessing and variant calling processes, the execution is parallelized across a reference chopped into smaller pieces.\n\nParts of preprocessing and variant calling are done by these intervals, the different resulting files are then merged.\nThis can parallelize processes, and push down wall clock time significantly.\n\nWe are aligning to the whole genome, and then run Base Quality Score Recalibration and Variant Calling on the supplied regions.\n\n**NOTE:** Intervals are not split so the largest interval in the interval_list or bed file will be the largest shard to process if that size is greater than the average shard size. 
Therefore, use a file which has lots of small intervals rather than one with a few very large intervals.\n\n**Whole Genome Sequencing:**\n\nThe (provided) intervals are chromosomes cut at their centromeres (so each chromosome arm processed separately) also additional unassigned contigs.\n\nWe are ignoring the `hs37d5` contig that contains concatenated decoy sequences.\n\nThe calling intervals can be defined using a .list or a BED file.\nA .list file contains one interval per line in the format `chromosome:start-end` (1-based coordinates).\nA BED file must be a tab-separated text file with one interval per line.\nThere must be at least three columns: chromosome, start, and end (0-based coordinates).\nAdditionally, the score column of the BED file can be used to provide an estimate of how many seconds it will take to call variants on that interval.\nThe fourth column remains unused.\n\n```\n|chr1|10000|207666|NA|47.3|\n```\nThis indicates that variant calling on the interval chr1:10001-207666 takes approximately 47.3 seconds.\n\nThe runtime estimate is used in two different ways.\nFirst, when there are multiple consecutive intervals in the file that take little time to compute, they are processed as a single job, thus reducing the number of processes that needs to be spawned.\nSecond, the jobs with largest processing time are started first, which reduces wall-clock time.\nIf no runtime is given, a time of 1000 nucleotides per second is assumed. 
See `--nucleotides_per_second` on how to customize this.\nActual figures vary from 2 nucleotides/second to 30000 nucleotides/second.\nIf you prefer, you can specify the full path to your reference genome when you run the pipeline:\n\n> **NB** If none provided, will be generated automatically from the FASTA reference\n> **NB** Use --no_intervals to disable automatic generation.\n\n**Targeted Sequencing:**\n\nThe recommended flow for targeted sequencing data is to use the workflow as it is, but also provide a `BED` file containing targets for all steps using the `--intervals` option. In addition, the parameter `--wes` should be set.\nIt is advised to pad the variant calling regions (exons or target) to some extent before submitting to the workflow.\n\nThe procedure is similar to whole genome sequencing, except that only BED file are accepted. See above for formatting description.\nAdding every exon as an interval in case of `WES` can generate >200K processes or jobs, much more forks, and similar number of directories in the Nextflow work directory. These are appropriately grouped together to reduce number of processes run in parallel (see above and `--nucleotides_per_second` for details). \nFurthermore, primers and/or baits are not 100% specific, (certainly not for MHC and KIR, etc.), quite likely there going to be reads mapping to multiple locations.\nIf you are certain that the target is unique for your genome (all the reads will certainly map to only one location), and aligning to the whole genome is an overkill, it is actually better to change the reference itself.", "description": "Path to target bed file in case of whole exome or targeted sequencing or intervals file." }, "nucleotides_per_second": { @@ -88,8 +76,8 @@ "fa_icon": "fas fa-clock", "description": "Estimate interval size.", "help_text": "Intervals are parts of the chopped up genome used to speed up preprocessing and variant calling. See `--intervals` for more info. 
\n\nChanging this parameter, changes the number of intervals that are grouped and processed together. Bed files from target sequencing can contain thousands or small intervals. Spinning up a new process for each can be quite resource intensive. Instead it can be desired to process small intervals together on larger nodes. \nIn order to make use of this parameter, no runtime estimate can be present in the bed file (column 5). ", - "default": 200000 - }, + "default": 40000 + }, "no_intervals": { "type": "boolean", "fa_icon": "fas fa-ban", @@ -99,16 +87,16 @@ "tools": { "type": "string", "fa_icon": "fas fa-toolbox", - "description": "Tools to use for duplicate marking, variant calling and/or for annotation.", - "help_text": "Multiple tools separated with commas.\n\n**Variant Calling:**\n\nGermline variant calling can currently be performed with the following variant callers:\n- SNPs/Indels: DeepVariant, FreeBayes, GATK HaplotypeCaller, mpileup, Sentieon Haplotyper, Strelka\n- Structural Variants: Manta, TIDDIT\n- Copy-number: CNVKit\n\nTumor-only somatic variant calling can currently be performed with the following variant callers:\n- SNPs/Indels: FreeBayes, mpileup, Mutect2, Strelka\n- Structural Variants: Manta, TIDDIT\n- Copy-number: CNVKit, ControlFREEC\n\nSomatic variant calling can currently only be performed with the following variant callers:\n- SNPs/Indels: FreeBayes, Mutect2, Strelka2\n- Structural variants: Manta, TIDDIT\n- Copy-Number: ASCAT, CNVKit, Control-FREEC\n- Microsatellite Instability: MSIsensorpro\n\n> **NB** Mutect2 for somatic variant calling cannot be combined with `--no_intervals`\n\n**Annotation:**\n \n- snpEff, VEP, merge (both consecutively).\n\n> **NB** As Sarek will use bgzip and tabix to compress and index VCF files annotated, it expects VCF files to be sorted when starting from `--step annotate`.", - "pattern": 
"^((ascat|cnvkit|controlfreec|deepvariant|freebayes|haplotypecaller(|_vc|_jc)|sentieon_haplotyper|manta|merge|mpileup|msisensorpro|mutect2|sentieon_dedup|snpeff|strelka|tiddit|vep)?,?)*(? **NB** Mutect2 for somatic variant calling cannot be combined with `--no_intervals`\n\n**Annotation:**\n \n- snpEff, VEP, merge (both consecutively).\n\n> **NB** As Sarek will use bgzip and tabix to compress and index VCF files annotated, it expects VCF files to be sorted when starting from `--step annotate`.", + "pattern": "^((ascat|cnvkit|controlfreec|deepvariant|freebayes|haplotypecaller|manta|merge|mpileup|msisensorpro|mutect2|snpeff|strelka|tiddit|vep)?,?)*[^,]+$" }, "skip_tools": { "type": "string", "fa_icon": "fas fa-forward", "description": "Disable specified tools.", "help_text": "Multiple tools can be specified, separated by commas.\n\n> **NB** `--skip_tools baserecalibrator_report` is actually just not saving the reports.\n> **NB** `--skip_tools markduplicates_report` does not skip `MarkDuplicates` but prevent the collection of duplicate metrics that slows down performance.", - "pattern": "^((baserecalibrator|baserecalibrator_report|bcftools|documentation|fastqc|haplotypecaller_filter|haplotyper_filter|markduplicates|markduplicates_report|mosdepth|multiqc|samtools|vcftools|versions)?,?)*(? The GATK4 Base Quality Score recalibration tools `Baserecalibrator` and `ApplyBQSR` are currently available as Beta release. Use with caution!", - "pattern": "^((baserecalibrator|markduplicates)?,?)*(? **NB** If none provided, will be generated automatically from the FASTA reference. 
Combine with `--save_reference` to save for future runs.", - "hidden": true - }, "dragmap": { "type": "string", "fa_icon": "fas fa-copy", @@ -645,9 +583,8 @@ "fasta": { "type": "string", "format": "file-path", - "exists": true, "mimetype": "text/plain", - "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", + "pattern": "\\.fn?a(sta)?(\\.gz)?$", "description": "Path to FASTA genome file.", "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\nThis parameter is *mandatory* if `--genome` is not specified.", "fa_icon": "far fa-file-code" @@ -658,6 +595,12 @@ "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\n> **NB** If none provided, will be generated automatically from the FASTA reference. Combine with `--save_reference` to save for future runs.", "description": "Path to FASTA reference index." }, + "dict": { + "type": "string", + "fa_icon": "fas fa-file", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\n\n> **NB** If none provided, will be generated automatically from the FASTA reference. Combine with `--save_reference` to save for future runs.", + "description": "Path to FASTA reference dictionary." + }, "germline_resource": { "type": "string", "fa_icon": "fas fa-file", @@ -718,31 +661,50 @@ "type": "string", "fa_icon": "fas fa-database", "description": "snpEff DB version.", - "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the database to be use to annotate with.\nAlternatively databases' names can be listed with the `snpEff databases`." 
+ "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the database to be use to annotate with.\nAlternatively databases' names can be listed with the `snpEff databases`.", + "hidden": true }, "snpeff_genome": { "type": "string", "fa_icon": "fas fa-microscope", "description": "snpEff genome.", - "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the genome when using the container with pre-downloaded cache." + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the genome when using the container with pre-downloaded cache.", + "hidden": true + }, + "snpeff_version": { + "type": "string", + "fa_icon": "fas fa-tag", + "description": "snpEff version.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the snpeff version when using the container with pre-downloaded cache.", + "hidden": true }, "vep_genome": { "type": "string", "fa_icon": "fas fa-microscope", "description": "VEP genome.", - "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the genome when using the container with pre-downloaded cache." + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the genome when using the container with pre-downloaded cache.", + "hidden": true }, "vep_species": { "type": "string", "fa_icon": "fas fa-microscope", "description": "VEP species.", - "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nAlternatively species listed in Ensembl Genomes caches can be used." 
+ "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nAlternatively species listed in Ensembl Genomes caches can be used.", + "hidden": true }, "vep_cache_version": { "type": "number", "fa_icon": "fas fa-tag", "description": "VEP cache version.", - "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nAlternatively cache version can be use to specify the correct Ensembl Genomes version number as these differ from the concurrent Ensembl/VEP version numbers" + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nAlternatively cache version can be use to specify the correct Ensembl Genomes version number as these differ from the concurrent Ensembl/VEP version numbers", + "hidden": true + }, + "vep_version": { + "type": "string", + "fa_icon": "fas fa-tag", + "description": "VEP version.", + "help_text": "If you use AWS iGenomes, this has already been set for you appropriately.\nThis is used to specify the VEP version when using the container with pre-downloaded cache.", + "hidden": true }, "save_reference": { "type": "boolean", @@ -750,18 +712,6 @@ "description": "Save built references.", "help_text": "Set this parameter, if you wish to save all computed reference files. This is useful to avoid re-computation on future runs." }, - "build_only_index": { - "type": "boolean", - "fa_icon": "fas fa-download", - "description": "Only built references.", - "help_text": "Set this parameter, if you wish to compute and save all computed reference files. No alignment or any other downstream steps will be performed." - }, - "download_cache": { - "type": "boolean", - "fa_icon": "fas fa-download", - "description": "Download annotation cache.", - "help_text": "Set this parameter, if you wish to download annotation cache." 
- }, "igenomes_base": { "type": "string", "format": "directory-path", @@ -824,13 +774,6 @@ "hidden": true, "fa_icon": "fas fa-users-cog" }, - "test_data_base": { - "type": "string", - "default": "https://raw.githubusercontent.com/nf-core/test-datasets/sarek3", - "description": "Base path / URL for data used in the test profiles", - "help_text": "Warning: The `-profile test` samplesheet file itself contains remote paths. Setting this parameter does not alter the contents of that file.", - "hidden": true - }, "seq_center": { "type": "string", "fa_icon": "fas fa-university", @@ -876,7 +819,7 @@ "description": "Maximum amount of time that can be requested for any single job.", "default": "240.h", "fa_icon": "far fa-clock", - "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$", + "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", "hidden": true, "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`." } @@ -895,12 +838,6 @@ "fa_icon": "fas fa-question-circle", "hidden": true }, - "version": { - "type": "boolean", - "description": "Display version and exit.", - "fa_icon": "fas fa-question-circle", - "hidden": true - }, "publish_dir_mode": { "type": "string", "default": "copy", @@ -952,22 +889,17 @@ }, "multiqc_config": { "type": "string", - "format": "file-path", "description": "Custom config file to supply to MultiQC.", "fa_icon": "fas fa-cog", "hidden": true }, - "multiqc_logo": { + "tracedir": { "type": "string", - "description": "Custom logo file to supply to MultiQC. 
File name must also be set in the MultiQC config file", - "fa_icon": "fas fa-image", + "description": "Directory to keep pipeline Nextflow logs and reports.", + "default": "${params.outdir}/pipeline_info", + "fa_icon": "fas fa-cogs", "hidden": true }, - "multiqc_methods_description": { - "type": "string", - "description": "Custom MultiQC yaml file containing HTML including a methods description.", - "fa_icon": "fas fa-cog" - }, "validate_params": { "type": "boolean", "description": "Boolean whether to validate parameters against the schema at runtime", @@ -975,33 +907,18 @@ "fa_icon": "fas fa-check-square", "hidden": true }, - "validationShowHiddenParams": { + "show_hidden_params": { "type": "boolean", "fa_icon": "far fa-eye-slash", "description": "Show all params when using `--help`", "hidden": true, "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." }, - "validationFailUnrecognisedParams": { + "enable_conda": { "type": "boolean", - "fa_icon": "far fa-check-circle", - "description": "Validation of parameters fails when an unrecognised parameter is found.", + "description": "Run this workflow with Conda. You can also use '-profile conda' instead of providing this parameter.", "hidden": true, - "help_text": "By default, when an unrecognised parameter is found, it returns a warinig." - }, - "validationLenientMode": { - "type": "boolean", - "fa_icon": "far fa-check-circle", - "description": "Validation of parameters in lenient more.", - "hidden": true, - "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)." 
- }, - "hook_url": { - "type": "string", - "description": "Incoming hook URL for messaging service", - "fa_icon": "fas fa-people-group", - "help_text": "Incoming hook URL for messaging service. Currently, MS Teams and Slack are supported.", - "hidden": true + "fa_icon": "fas fa-bacon" } } } From 90892096c51da703cba43d2b02a5ad2c55bebd34 Mon Sep 17 00:00:00 2001 From: Allan Daly Date: Fri, 16 Feb 2024 13:23:17 +0000 Subject: [PATCH 03/12] changed interval split to not span chromosomes --- modules/local/create_intervals_bed/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/create_intervals_bed/main.nf b/modules/local/create_intervals_bed/main.nf index 4ade5d8b..08d67574 100644 --- a/modules/local/create_intervals_bed/main.nf +++ b/modules/local/create_intervals_bed/main.nf @@ -28,7 +28,7 @@ process CREATE_INTERVALS_BED { # no runtime estimate in this row, assume default value t = (\$3 - \$2) / ${params.nucleotides_per_second} } - if (name == "" || (chunk > 600 && (chunk + t) > longest * 1.05)) { + if (name == "" || (chunk > 600 && (chunk + t) > longest * 1.05) || \$1 != chr) { # start a new chunk name = sprintf("%s_%d-%d.bed", \$1, \$2+1, \$3) chunk = 0 From 961c27218808d374f8c84ef39475b891aa4b8cae Mon Sep 17 00:00:00 2001 From: Allan Daly Date: Fri, 16 Feb 2024 14:30:26 +0000 Subject: [PATCH 04/12] Added LSF specifics to GenomicsDBImport and GenotypeGVCFs --- conf/hgi.config | 13 +++++++------ modules/nf-core/gatk4/genomicsdbimport/main.nf | 11 ++++++++--- modules/nf-core/gatk4/genotypegvcfs/main.nf | 9 ++++++--- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/conf/hgi.config b/conf/hgi.config index b44878eb..db417c50 100644 --- a/conf/hgi.config +++ b/conf/hgi.config @@ -8,13 +8,14 @@ process { } withName: 'GATK4_GENOMICSDBIMPORT' { - cpus = { check_max( 2 * task.attempt, 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 8.GB * 
task.attempt, 'memory' ) } + ext.args = "--batch-size 50 --reader-threads 1 -ip 500" } withName: 'GATK4_GENOTYPEGVCFS' { - cpus = { check_max( 2 * task.attempt, 'cpus' ) } - memory = { check_max( 16.GB * task.attempt, 'memory' ) } + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 8.GB * task.attempt, 'memory' ) } ext.args = { "-G StandardAnnotation -G AS_StandardAnnotation" } } @@ -24,8 +25,8 @@ process { } withName:'GATK4_MERGEVCFS'{ - cpus = { check_max( 2 * task.attempt, 'cpus' ) } - memory = { check_max( 4.GB * task.attempt, 'memory' ) } + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 8.GB * task.attempt, 'memory' ) } } withName: 'GATK4_HAPLOTYPECALLER' { diff --git a/modules/nf-core/gatk4/genomicsdbimport/main.nf b/modules/nf-core/gatk4/genomicsdbimport/main.nf index a8725d3f..cacb99ed 100644 --- a/modules/nf-core/gatk4/genomicsdbimport/main.nf +++ b/modules/nf-core/gatk4/genomicsdbimport/main.nf @@ -1,6 +1,6 @@ process GATK4_GENOMICSDBIMPORT { tag "$meta.id" - label 'process_medium' + //label 'process_medium' conda "bioconda::gatk4=4.4.0.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? @@ -53,14 +53,19 @@ process GATK4_GENOMICSDBIMPORT { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + declare WORKSPACE="\$(TMPDIR="/tmp" mktemp -du)" + trap 'rm -rf "\$WORKSPACE"' EXIT + + gatk --java-options "-Xmx${avail_mem}M -XX:+UseSerialGC -XX:-UsePerfData" \\ GenomicsDBImport \\ $input_command \\ $genomicsdb_command \\ $interval_command \\ - --tmp-dir . \\ + --tmp-dir "\$WORKSPACE" \\ $args + tar cf "${prefix}" -C "\$WORKSPACE" . 
+ cat <<-END_VERSIONS > versions.yml "${task.process}": gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') diff --git a/modules/nf-core/gatk4/genotypegvcfs/main.nf b/modules/nf-core/gatk4/genotypegvcfs/main.nf index a3e3129f..5e14e111 100644 --- a/modules/nf-core/gatk4/genotypegvcfs/main.nf +++ b/modules/nf-core/gatk4/genotypegvcfs/main.nf @@ -1,6 +1,6 @@ process GATK4_GENOTYPEGVCFS { tag "$meta.id" - label 'process_high' + //label 'process_high' conda "bioconda::gatk4=4.4.0.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? @@ -37,9 +37,12 @@ process GATK4_GENOTYPEGVCFS { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + declare WORKSPACE="\$(TMPDIR="/tmp" mktemp -d)" + trap 'rm -rf "\$WORKSPACE"' EXIT + tar xf "${gvcf}" -C "\$WORKSPACE" + gatk --java-options "-Xmx${avail_mem}M -XX:+UseSerialGC -XX:-UsePerfData" \\ GenotypeGVCFs \\ - --variant $gvcf_command \\ + --variant gendb://\$WORKSPACE \\ --output ${prefix}.vcf.gz \\ --reference $fasta \\ $interval_command \\ From 0f41fb63bb78d3692c8a8e6af758ba1c5bad7c96 Mon Sep 17 00:00:00 2001 From: Allan Daly Date: Fri, 16 Feb 2024 15:20:18 +0000 Subject: [PATCH 05/12] Added -XX:+UseSerialGC to gatk modules and lsf config update --- modules/nf-core/gatk4/applybqsr/main.nf | 2 +- modules/nf-core/gatk4/applyvqsr/main.nf | 2 +- modules/nf-core/gatk4/baserecalibrator/main.nf | 2 +- modules/nf-core/gatk4/calculatecontamination/main.nf | 2 +- modules/nf-core/gatk4/cnnscorevariants/main.nf | 2 +- modules/nf-core/gatk4/createsequencedictionary/main.nf | 2 +- modules/nf-core/gatk4/estimatelibrarycomplexity/main.nf | 2 +- modules/nf-core/gatk4/filtermutectcalls/main.nf | 2 +- modules/nf-core/gatk4/filtervarianttranches/main.nf | 2 +- modules/nf-core/gatk4/gatherbqsrreports/main.nf | 2 +- modules/nf-core/gatk4/gatherpileupsummaries/main.nf | 2 +- 
modules/nf-core/gatk4/getpileupsummaries/main.nf | 2 +- modules/nf-core/gatk4/haplotypecaller/main.nf | 2 +- modules/nf-core/gatk4/intervallisttobed/main.nf | 2 +- modules/nf-core/gatk4/learnreadorientationmodel/main.nf | 2 +- modules/nf-core/gatk4/markduplicates/main.nf | 2 +- modules/nf-core/gatk4/mergemutectstats/main.nf | 2 +- modules/nf-core/gatk4/mergevcfs/main.nf | 2 +- modules/nf-core/gatk4/mutect2/main.nf | 2 +- modules/nf-core/gatk4/variantrecalibrator/main.nf | 2 +- 20 files changed, 20 insertions(+), 20 deletions(-) diff --git a/modules/nf-core/gatk4/applybqsr/main.nf b/modules/nf-core/gatk4/applybqsr/main.nf index e5e6bf99..a52bc3e7 100644 --- a/modules/nf-core/gatk4/applybqsr/main.nf +++ b/modules/nf-core/gatk4/applybqsr/main.nf @@ -33,7 +33,7 @@ process GATK4_APPLYBQSR { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ ApplyBQSR \\ --input $input \\ --output ${prefix}.${input.getExtension()} \\ diff --git a/modules/nf-core/gatk4/applyvqsr/main.nf b/modules/nf-core/gatk4/applyvqsr/main.nf index 8413f2bb..bf6f0f42 100644 --- a/modules/nf-core/gatk4/applyvqsr/main.nf +++ b/modules/nf-core/gatk4/applyvqsr/main.nf @@ -33,7 +33,7 @@ process GATK4_APPLYVQSR { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ ApplyVQSR \\ --variant ${vcf} \\ --output ${prefix}.vcf.gz \\ diff --git a/modules/nf-core/gatk4/baserecalibrator/main.nf b/modules/nf-core/gatk4/baserecalibrator/main.nf index 5375289a..77baaff2 100644 --- a/modules/nf-core/gatk4/baserecalibrator/main.nf +++ b/modules/nf-core/gatk4/baserecalibrator/main.nf @@ -35,7 +35,7 @@ process GATK4_BASERECALIBRATOR { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + 
gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ BaseRecalibrator \\ --input $input \\ --output ${prefix}.table \\ diff --git a/modules/nf-core/gatk4/calculatecontamination/main.nf b/modules/nf-core/gatk4/calculatecontamination/main.nf index 9dd961be..2ce606f4 100644 --- a/modules/nf-core/gatk4/calculatecontamination/main.nf +++ b/modules/nf-core/gatk4/calculatecontamination/main.nf @@ -30,7 +30,7 @@ process GATK4_CALCULATECONTAMINATION { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ CalculateContamination \\ --input $pileup \\ --output ${prefix}.contamination.table \\ diff --git a/modules/nf-core/gatk4/cnnscorevariants/main.nf b/modules/nf-core/gatk4/cnnscorevariants/main.nf index 71efe9b1..e8e52ae4 100644 --- a/modules/nf-core/gatk4/cnnscorevariants/main.nf +++ b/modules/nf-core/gatk4/cnnscorevariants/main.nf @@ -40,7 +40,7 @@ process GATK4_CNNSCOREVARIANTS { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ CNNScoreVariants \\ --variant $vcf \\ --output ${prefix}.cnn.vcf.gz \\ diff --git a/modules/nf-core/gatk4/createsequencedictionary/main.nf b/modules/nf-core/gatk4/createsequencedictionary/main.nf index 3e4efdd9..bdcf9d6c 100644 --- a/modules/nf-core/gatk4/createsequencedictionary/main.nf +++ b/modules/nf-core/gatk4/createsequencedictionary/main.nf @@ -27,7 +27,7 @@ process GATK4_CREATESEQUENCEDICTIONARY { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ CreateSequenceDictionary \\ --REFERENCE $fasta \\ --URI $fasta \\ diff --git a/modules/nf-core/gatk4/estimatelibrarycomplexity/main.nf 
b/modules/nf-core/gatk4/estimatelibrarycomplexity/main.nf index 81fc8351..9a2c201b 100644 --- a/modules/nf-core/gatk4/estimatelibrarycomplexity/main.nf +++ b/modules/nf-core/gatk4/estimatelibrarycomplexity/main.nf @@ -32,7 +32,7 @@ process GATK4_ESTIMATELIBRARYCOMPLEXITY { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ EstimateLibraryComplexity \\ $input_list \\ --OUTPUT ${prefix}.metrics \\ diff --git a/modules/nf-core/gatk4/filtermutectcalls/main.nf b/modules/nf-core/gatk4/filtermutectcalls/main.nf index 623b91ae..0bcc6c83 100644 --- a/modules/nf-core/gatk4/filtermutectcalls/main.nf +++ b/modules/nf-core/gatk4/filtermutectcalls/main.nf @@ -38,7 +38,7 @@ process GATK4_FILTERMUTECTCALLS { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ FilterMutectCalls \\ --variant $vcf \\ --output ${prefix}.vcf.gz \\ diff --git a/modules/nf-core/gatk4/filtervarianttranches/main.nf b/modules/nf-core/gatk4/filtervarianttranches/main.nf index 90cbf5f0..550d1ca8 100644 --- a/modules/nf-core/gatk4/filtervarianttranches/main.nf +++ b/modules/nf-core/gatk4/filtervarianttranches/main.nf @@ -36,7 +36,7 @@ process GATK4_FILTERVARIANTTRANCHES { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ FilterVariantTranches \\ --variant $vcf \\ $resources \\ diff --git a/modules/nf-core/gatk4/gatherbqsrreports/main.nf b/modules/nf-core/gatk4/gatherbqsrreports/main.nf index 3eeca5ad..e1f39ef7 100644 --- a/modules/nf-core/gatk4/gatherbqsrreports/main.nf +++ b/modules/nf-core/gatk4/gatherbqsrreports/main.nf @@ -29,7 +29,7 @@ process GATK4_GATHERBQSRREPORTS { avail_mem = 
(task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ GatherBQSRReports \\ $input_list \\ --output ${prefix}.table \\ diff --git a/modules/nf-core/gatk4/gatherpileupsummaries/main.nf b/modules/nf-core/gatk4/gatherpileupsummaries/main.nf index f315e1af..575fa9ee 100644 --- a/modules/nf-core/gatk4/gatherpileupsummaries/main.nf +++ b/modules/nf-core/gatk4/gatherpileupsummaries/main.nf @@ -31,7 +31,7 @@ process GATK4_GATHERPILEUPSUMMARIES { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ GatherPileupSummaries \\ $input_list \\ --O ${prefix}.pileups.table \\ diff --git a/modules/nf-core/gatk4/getpileupsummaries/main.nf b/modules/nf-core/gatk4/getpileupsummaries/main.nf index f7d0f294..faa5b693 100644 --- a/modules/nf-core/gatk4/getpileupsummaries/main.nf +++ b/modules/nf-core/gatk4/getpileupsummaries/main.nf @@ -35,7 +35,7 @@ process GATK4_GETPILEUPSUMMARIES { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ GetPileupSummaries \\ --input $input \\ --variant $variants \\ diff --git a/modules/nf-core/gatk4/haplotypecaller/main.nf b/modules/nf-core/gatk4/haplotypecaller/main.nf index 9ac87518..bdcda702 100644 --- a/modules/nf-core/gatk4/haplotypecaller/main.nf +++ b/modules/nf-core/gatk4/haplotypecaller/main.nf @@ -39,7 +39,7 @@ process GATK4_HAPLOTYPECALLER { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ HaplotypeCaller \\ --input $input \\ --output ${prefix}.vcf.gz \\ diff --git 
a/modules/nf-core/gatk4/intervallisttobed/main.nf b/modules/nf-core/gatk4/intervallisttobed/main.nf index 2537f0aa..b43df669 100644 --- a/modules/nf-core/gatk4/intervallisttobed/main.nf +++ b/modules/nf-core/gatk4/intervallisttobed/main.nf @@ -28,7 +28,7 @@ process GATK4_INTERVALLISTTOBED { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ IntervalListToBed \\ --INPUT $intervals \\ --OUTPUT ${prefix}.bed \\ diff --git a/modules/nf-core/gatk4/learnreadorientationmodel/main.nf b/modules/nf-core/gatk4/learnreadorientationmodel/main.nf index 89a6ae77..c1d9b9dc 100644 --- a/modules/nf-core/gatk4/learnreadorientationmodel/main.nf +++ b/modules/nf-core/gatk4/learnreadorientationmodel/main.nf @@ -29,7 +29,7 @@ process GATK4_LEARNREADORIENTATIONMODEL { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ LearnReadOrientationModel \\ $input_list \\ --output ${prefix}.tar.gz \\ diff --git a/modules/nf-core/gatk4/markduplicates/main.nf b/modules/nf-core/gatk4/markduplicates/main.nf index e4c01f9a..5b8fcbc1 100644 --- a/modules/nf-core/gatk4/markduplicates/main.nf +++ b/modules/nf-core/gatk4/markduplicates/main.nf @@ -43,7 +43,7 @@ process GATK4_MARKDUPLICATES { // Using samtools and not Markduplicates to compress to CRAM speeds up computation: // https://medium.com/@acarroll.dna/looking-at-trade-offs-in-compression-levels-for-genomics-tools-eec2834e8b94 """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ MarkDuplicates \\ $input_list \\ --OUTPUT ${prefix_bam} \\ diff --git a/modules/nf-core/gatk4/mergemutectstats/main.nf b/modules/nf-core/gatk4/mergemutectstats/main.nf index 269721cb..ec2a9c92 100644 --- 
a/modules/nf-core/gatk4/mergemutectstats/main.nf +++ b/modules/nf-core/gatk4/mergemutectstats/main.nf @@ -29,7 +29,7 @@ process GATK4_MERGEMUTECTSTATS { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ MergeMutectStats \\ $input_list \\ --output ${prefix}.vcf.gz.stats \\ diff --git a/modules/nf-core/gatk4/mergevcfs/main.nf b/modules/nf-core/gatk4/mergevcfs/main.nf index 29c08e16..76593998 100644 --- a/modules/nf-core/gatk4/mergevcfs/main.nf +++ b/modules/nf-core/gatk4/mergevcfs/main.nf @@ -32,7 +32,7 @@ process GATK4_MERGEVCFS { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ MergeVcfs \\ $input_list \\ --OUTPUT ${prefix}.vcf.gz \\ diff --git a/modules/nf-core/gatk4/mutect2/main.nf b/modules/nf-core/gatk4/mutect2/main.nf index 4e353979..93f1204a 100644 --- a/modules/nf-core/gatk4/mutect2/main.nf +++ b/modules/nf-core/gatk4/mutect2/main.nf @@ -42,7 +42,7 @@ process GATK4_MUTECT2 { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ Mutect2 \\ $inputs \\ --output ${prefix}.vcf.gz \\ diff --git a/modules/nf-core/gatk4/variantrecalibrator/main.nf b/modules/nf-core/gatk4/variantrecalibrator/main.nf index fa262e4a..15fe43a8 100644 --- a/modules/nf-core/gatk4/variantrecalibrator/main.nf +++ b/modules/nf-core/gatk4/variantrecalibrator/main.nf @@ -39,7 +39,7 @@ process GATK4_VARIANTRECALIBRATOR { avail_mem = (task.memory.mega*0.8).intValue() } """ - gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData -XX:+UseSerialGC" \\ VariantRecalibrator \\ --variant $vcf \\ --output ${prefix}.recal 
\\ From 69668ea379f9bf11efd675edc63de65b304d9727 Mon Sep 17 00:00:00 2001 From: Allan Daly Date: Fri, 16 Feb 2024 15:34:58 +0000 Subject: [PATCH 06/12] typo in genomicsdbimport --- conf/base.config | 10 +++++----- conf/hgi.config | 14 ++++++++++++++ modules/nf-core/gatk4/genomicsdbimport/main.nf | 2 +- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/conf/base.config b/conf/base.config index db117587..dfe61345 100644 --- a/conf/base.config +++ b/conf/base.config @@ -16,7 +16,7 @@ process { // memory errors which should be retried. otherwise error out errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } - maxRetries = 1 + maxRetries = 3 maxErrors = '-1' // Process-specific resource requirements @@ -71,12 +71,12 @@ process { memory = { check_max( 30.GB * task.attempt, 'memory' ) } } withName: 'GATK4_MARKDUPLICATES|GATK4_MARKDUPLICATESSPARK' { - cpus = { check_max( 6 * task.attempt, 'cpus' ) } - memory = { check_max( 30.GB * task.attempt, 'memory' ) } + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 8.GB * task.attempt, 'memory' ) } } withName:'GATK4_APPLYBQSR|GATK4_APPLYBQSR_SPARK|GATK4_BASERECALIBRATOR|GATK4_BASERECALIBRATOR_SPARK|GATK4_GATHERBQSRREPORTS'{ - cpus = { check_max( 2 * task.attempt, 'cpus' ) } - memory = { check_max( 4.GB * task.attempt, 'memory' ) } + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 8.GB * task.attempt, 'memory' ) } } withName:'MOSDEPTH'{ cpus = { check_max( 4 * task.attempt, 'cpus' ) } diff --git a/conf/hgi.config b/conf/hgi.config index db417c50..d53bcfcb 100644 --- a/conf/hgi.config +++ b/conf/hgi.config @@ -49,3 +49,17 @@ singularity { enabled = true cacheDir = '/nfs/hgi/singularityContainers/' } + + executor { + name = 'lsf' + queueSize = 4000 + poolSize = 4 + submitRateLimit = '10 sec' + killBatchSize = 50 + pollInterval = '10 sec' + queueStatInterval = '20 sec' + dumpInterval = '10 sec' + exitReadTimeout= '10 sec' + perJobMemLimit=true + } 
+ diff --git a/modules/nf-core/gatk4/genomicsdbimport/main.nf b/modules/nf-core/gatk4/genomicsdbimport/main.nf index cacb99ed..1783b992 100644 --- a/modules/nf-core/gatk4/genomicsdbimport/main.nf +++ b/modules/nf-core/gatk4/genomicsdbimport/main.nf @@ -1,6 +1,6 @@ process GATK4_GENOMICSDBIMPORT { tag "$meta.id" - \\label 'process_medium' + //label 'process_medium' conda "bioconda::gatk4=4.4.0.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? From 8c230fd403cc62a0ef1e6a41b5972fa5294cbece Mon Sep 17 00:00:00 2001 From: Allan Daly Date: Fri, 16 Feb 2024 15:39:54 +0000 Subject: [PATCH 07/12] typo in genotypegvcfs --- modules/nf-core/gatk4/genotypegvcfs/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/nf-core/gatk4/genotypegvcfs/main.nf b/modules/nf-core/gatk4/genotypegvcfs/main.nf index 5e14e111..1ebd0c82 100644 --- a/modules/nf-core/gatk4/genotypegvcfs/main.nf +++ b/modules/nf-core/gatk4/genotypegvcfs/main.nf @@ -1,6 +1,6 @@ process GATK4_GENOTYPEGVCFS { tag "$meta.id" - \\label 'process_high' + //label 'process_high' conda "bioconda::gatk4=4.4.0.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
From 46900382941750cfcff089d9e488ec814f17f513 Mon Sep 17 00:00:00 2001 From: Allan Daly Date: Fri, 16 Feb 2024 15:53:46 +0000 Subject: [PATCH 08/12] Corrected bed split script --- modules/local/create_intervals_bed/main.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modules/local/create_intervals_bed/main.nf b/modules/local/create_intervals_bed/main.nf index 08d67574..87f3696b 100644 --- a/modules/local/create_intervals_bed/main.nf +++ b/modules/local/create_intervals_bed/main.nf @@ -28,11 +28,12 @@ process CREATE_INTERVALS_BED { # no runtime estimate in this row, assume default value t = (\$3 - \$2) / ${params.nucleotides_per_second} } - if (name == "" || (chunk > 600 && (chunk + t) > longest * 1.05) || \$1 != chr) { + if (name == "" || (chunk > 600 && (chunk + t) > longest * 1.00) || \$1 != chr) { # start a new chunk name = sprintf("%s_%d-%d.bed", \$1, \$2+1, \$3) chunk = 0 longest = 0 + chr = \$1 } if (t > longest) longest = t From 3ee04f030d654e138c027169a2ddd4c49d070efe Mon Sep 17 00:00:00 2001 From: Allan Daly Date: Mon, 19 Feb 2024 09:48:00 +0000 Subject: [PATCH 09/12] Made everything retry --- conf/base.config | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index dfe61345..62937a4e 100644 --- a/conf/base.config +++ b/conf/base.config @@ -15,7 +15,8 @@ process { shell = ['/bin/bash', '-euo', 'pipefail'] // memory errors which should be retried. otherwise error out - errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } + //errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } + errorStrategy = { task.attempt <= 3 ? 
'retry' : 'ignore' } maxRetries = 3 maxErrors = '-1' From 2c05a90cc623f6baf1566654f327071f47150e36 Mon Sep 17 00:00:00 2001 From: Allan Daly Date: Mon, 19 Feb 2024 20:43:43 +0000 Subject: [PATCH 10/12] updated schema --- nextflow_schema.json | 1 + 1 file changed, 1 insertion(+) diff --git a/nextflow_schema.json b/nextflow_schema.json index b26f8e72..7581ddd6 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -24,6 +24,7 @@ "prepare_recalibration", "recalibrate", "variant_calling", + "joint_calling", "annotate" ] }, From 790afe40d058fd7bad96540cb6e99814b0590ec2 Mon Sep 17 00:00:00 2001 From: Allan Daly Date: Tue, 20 Feb 2024 10:29:57 +0000 Subject: [PATCH 11/12] Removed tmp_dir from import and genotype plus added lsf tmp resource --- conf/hgi.config | 2 ++ modules/nf-core/gatk4/genomicsdbimport/main.nf | 1 - modules/nf-core/gatk4/genotypegvcfs/main.nf | 1 - 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/hgi.config b/conf/hgi.config index d53bcfcb..b4c4e338 100644 --- a/conf/hgi.config +++ b/conf/hgi.config @@ -10,12 +10,14 @@ process { withName: 'GATK4_GENOMICSDBIMPORT' { cpus = { check_max( 1 * task.attempt, 'cpus' ) } memory = { check_max( 8.GB * task.attempt, 'memory' ) } + clusterOptions = { "-R \"rusage[tmp=20G]\"" } ext.args = "--batch-size 50 --reader-threads 1 -ip 500" } withName: 'GATK4_GENOTYPEGVCFS' { cpus = { check_max( 1 * task.attempt, 'cpus' ) } memory = { check_max( 8.GB * task.attempt, 'memory' ) } + clusterOptions = { "-R \"rusage[tmp=20G]\"" } ext.args = { "-G StandardAnnotation -G AS_StandardAnnotation" } } diff --git a/modules/nf-core/gatk4/genomicsdbimport/main.nf b/modules/nf-core/gatk4/genomicsdbimport/main.nf index 1783b992..26310ae4 100644 --- a/modules/nf-core/gatk4/genomicsdbimport/main.nf +++ b/modules/nf-core/gatk4/genomicsdbimport/main.nf @@ -61,7 +61,6 @@ process GATK4_GENOMICSDBIMPORT { $input_command \\ $genomicsdb_command \\ $interval_command \\ - --tmp-dir "\$WORKSPACE" \\ $args tar cf "${prefix}" 
-C "\$WORKSPACE" . diff --git a/modules/nf-core/gatk4/genotypegvcfs/main.nf b/modules/nf-core/gatk4/genotypegvcfs/main.nf index 1ebd0c82..63cf29a3 100644 --- a/modules/nf-core/gatk4/genotypegvcfs/main.nf +++ b/modules/nf-core/gatk4/genotypegvcfs/main.nf @@ -47,7 +47,6 @@ process GATK4_GENOTYPEGVCFS { --reference $fasta \\ $interval_command \\ $dbsnp_command \\ - --tmp-dir . \\ $args cat <<-END_VERSIONS > versions.yml From 38a103da8a1c6aa56716c7f24bb29330b0ab4c73 Mon Sep 17 00:00:00 2001 From: Allan Daly Date: Tue, 20 Feb 2024 12:17:44 +0000 Subject: [PATCH 12/12] Corrected genomicDBimport --- modules/nf-core/gatk4/genomicsdbimport/main.nf | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/modules/nf-core/gatk4/genomicsdbimport/main.nf b/modules/nf-core/gatk4/genomicsdbimport/main.nf index 26310ae4..aa1afb21 100644 --- a/modules/nf-core/gatk4/genomicsdbimport/main.nf +++ b/modules/nf-core/gatk4/genomicsdbimport/main.nf @@ -29,19 +29,22 @@ process GATK4_GENOMICSDBIMPORT { // settings for running default create gendb mode input_command = input_map ? "--sample-name-map ${vcf[0]}" : vcf.collect(){"--variant $it"}.join(' ') - genomicsdb_command = "--genomicsdb-workspace-path ${prefix}" + //genomicsdb_command = "--genomicsdb-workspace-path ${prefix}" + genomicsdb_command = "--genomicsdb-workspace-path" interval_command = interval_file ? "--intervals ${interval_file}" : "--intervals ${interval_value}" updated_db = "" // settings changed for running get intervals list mode if run_intlist is true if (run_intlist) { - genomicsdb_command = "--genomicsdb-update-workspace-path ${wspace}" + //genomicsdb_command = "--genomicsdb-update-workspace-path ${wspace}" + genomicsdb_command = "--genomicsdb-update-workspace-path" interval_command = "--output-interval-list-to-file ${prefix}.interval_list" } // settings changed for running update gendb mode. 
input_command same as default, update_db forces module to emit the updated gendb if (run_updatewspace) { - genomicsdb_command = "--genomicsdb-update-workspace-path ${wspace}" + //genomicsdb_command = "--genomicsdb-update-workspace-path ${wspace}" + genomicsdb_command = "--genomicsdb-update-workspace-path" interval_command = '' updated_db = "${wspace}" } @@ -59,7 +62,7 @@ process GATK4_GENOMICSDBIMPORT { gatk --java-options "-Xmx${avail_mem}M -XX:+UseSerialGC -XX:-UsePerfData" \\ GenomicsDBImport \\ $input_command \\ - $genomicsdb_command \\ + $genomicsdb_command \$WORKSPACE \\ $interval_command \\ $args