From 9c6c96e7d3c9f7295b6dde427f48141a53624801 Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Wed, 16 Aug 2023 11:53:14 -0700 Subject: [PATCH 01/17] Calculate total bases input for each parent to set yak params on the fly --- .../de_novo_assembly_trio.wdl | 121 +++++++++++++++++- 1 file changed, 120 insertions(+), 1 deletion(-) diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index b5c9bcc..d223029 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -41,10 +41,28 @@ workflow de_novo_assembly_trio { } } + # For yak, we need to know the total input coverage so we can set cloud memory resources accordingly + scatter (fasta in samtools_fasta_father.reads_fasta) { + call fasta_basecount as fasta_bc_father { + input: + reads_fasta = fasta, + runtime_attributes = default_runtime_attributes + } + } + + call get_total_bp as get_total_bp_father { + input: + sample_id = father.sample_id, + fasta_totals = fasta_bc_father.read_total_bp, + runtime_attributes = default_runtime_attributes + } + call yak_count as yak_count_father { input: sample_id = father.sample_id, reads_fastas = samtools_fasta_father.reads_fasta, + sample_total_bp = get_total_bp_father.sample_total_bp, + runtime_attributes = default_runtime_attributes } @@ -56,10 +74,29 @@ workflow de_novo_assembly_trio { } } + # For yak, we need to know the total input coverage so we can set cloud memory resources accordingly + scatter (fasta in samtools_fasta_mother.reads_fasta) { + call fasta_basecount as fasta_bc_mother { + input: + reads_fasta = fasta, + runtime_attributes = default_runtime_attributes + } + } + + call get_total_bp as get_total_bp_mother { + input: + sample_id = mother.sample_id, + fasta_totals = fasta_bc_mother.read_total_bp, + runtime_attributes = default_runtime_attributes + } + + call yak_count as yak_count_mother { input: sample_id = 
mother.sample_id, reads_fastas = samtools_fasta_mother.reads_fasta, + sample_total_bp = get_total_bp_father.sample_total_bp, + runtime_attributes = default_runtime_attributes } @@ -149,15 +186,18 @@ task yak_count { input { String sample_id Array[File] reads_fastas + Int sample_total_bp RuntimeAttributes runtime_attributes } - Int threads = 10 # Usage up to 140 GB @ 10 threads for Revio samples Int mem_gb = 16 * threads Int disk_size = ceil(size(reads_fastas, "GB") * 2 + 20) + + # if sample is less than 15X (3.2Gb * 15) use -b37 bloom filter parameter + String yak_options = if sample_total_bp > 48 then "-b37" else "" command <<< set -euo pipefail @@ -165,6 +205,7 @@ task yak_count { yak count \ -t ~{threads} \ -o ~{sample_id}.yak \ + ~{yak_options} \ ~{sep=' ' reads_fastas} >>> @@ -185,3 +226,81 @@ task yak_count { zones: runtime_attributes.zones } } + +task fasta_basecount { + input { + File reads_fasta + String reads_fasta_basename = basename(reads_fasta) + + RuntimeAttributes runtime_attributes + } + + Int threads = 1 + Int mem_gb = 4 * threads + + Int disk_size = ceil(size(reads_fasta, "GB") * 2 + 20) + + command <<< + set -euo pipefail + + grep -v "^>" ~{reads_fasta} | tr -d '\n' | wc -c > ~{reads_fasta_basename}.total + >>> + + output { + File read_total_bp = "~{reads_fasta_basename}.total" + } + + runtime { + docker: "~{runtime_attributes.container_registry}/python@sha256:e4d921e252c3c19fe64097aa619c369c50cc862768d5fcb5e19d2877c55cfdd2" + cpu: threads + memory: mem_gb + " GB" + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries + queueArn: runtime_attributes.queue_arn + zones: runtime_attributes.zones + } +} + +task get_total_bp { + input { + String sample_id + Array[File] fasta_totals + + RuntimeAttributes runtime_attributes + } + + Int threads = 1 + Int mem_gb = 4 * threads + + Int 
disk_size = ceil(size(fasta_totals[0], "GB") * 2 + 20) + + command <<< + set -euo pipefail + + cat ~{sep=' ' fasta_totals} | awk '{sum+=$1}END{print sum/1000000000}' > ~{sample_id}.total + + >>> + + output { + Int sample_total_bp = round(read_float("~{sample_id}.total")) +# File sample_total_bp = "~{sample_id}.total" + + } + + runtime { + docker: "~{runtime_attributes.container_registry}/python@sha256:e4d921e252c3c19fe64097aa619c369c50cc862768d5fcb5e19d2877c55cfdd2" + cpu: threads + memory: mem_gb + " GB" + disk: disk_size + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: runtime_attributes.preemptible_tries + maxRetries: runtime_attributes.max_retries + awsBatchRetryAttempts: runtime_attributes.max_retries + queueArn: runtime_attributes.queue_arn + zones: runtime_attributes.zones + } + +} From c9f709291d7b94716555cdbec2b9b5741a989014 Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Wed, 16 Aug 2023 11:55:54 -0700 Subject: [PATCH 02/17] less than not greater than --- workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index d223029..2d62021 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -197,7 +197,7 @@ task yak_count { Int disk_size = ceil(size(reads_fastas, "GB") * 2 + 20) # if sample is less than 15X (3.2Gb * 15) use -b37 bloom filter parameter - String yak_options = if sample_total_bp > 48 then "-b37" else "" + String yak_options = if sample_total_bp < 48 then "-b37" else "" command <<< set -euo pipefail From 56833d4077f502bcadea5a77b87088cda0f2047e Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Fri, 22 Sep 2023 15:18:47 -0700 Subject: [PATCH 03/17] Adding multi-reference alignment option --- backends/aws/inputs.aws.json | 11 ++-- backends/azure/inputs.azure.json | 
11 ++-- backends/gcp/inputs.gcp.json | 11 ++-- backends/hpc/inputs.hpc.json | 11 ++-- wdl-ci.config.json | 2 +- workflows/assemble_genome/assemble_genome.wdl | 37 +++++++----- .../de_novo_assembly_sample.wdl | 58 +++++++++++-------- .../de_novo_assembly_trio.wdl | 9 +-- workflows/input_template.json | 17 +++--- workflows/main.wdl | 15 ++--- 10 files changed, 105 insertions(+), 77 deletions(-) diff --git a/backends/aws/inputs.aws.json b/backends/aws/inputs.aws.json index bd89ebf..44c4a52 100644 --- a/backends/aws/inputs.aws.json +++ b/backends/aws/inputs.aws.json @@ -13,13 +13,14 @@ ], "run_de_novo_assembly_trio": "Boolean" }, - "de_novo_assembly.reference": { - "name": "GRCh38", + "de_novo_assembly.references": [ + { + "name": "String", "fasta": { - "data": "s3://dnastack-resources/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", - "data_index": "s3://dnastack-resources/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" + "data": "File", + "data_index": "File" } - }, + ], "de_novo_assembly.backend": "AWS", "de_novo_assembly.zones": "us-east-2a us-east-2b us-east-2c", "de_novo_assembly.aws_spot_queue_arn": "", diff --git a/backends/azure/inputs.azure.json b/backends/azure/inputs.azure.json index 64a1911..5d63fa6 100644 --- a/backends/azure/inputs.azure.json +++ b/backends/azure/inputs.azure.json @@ -13,13 +13,14 @@ ], "run_de_novo_assembly_trio": "Boolean" }, - "de_novo_assembly.reference": { - "name": "GRCh38", + "de_novo_assembly.references": [ + { + "name": "String", "fasta": { - "data": "/datasetpbrarediseases/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", - "data_index": "/datasetpbrarediseases/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" + "data": "File", + "data_index": "File" } - }, + ], "de_novo_assembly.backend": "Azure", "de_novo_assembly.preemptible": "Boolean" } diff --git a/backends/gcp/inputs.gcp.json b/backends/gcp/inputs.gcp.json index f4cc3c5..723742a 100644 --- a/backends/gcp/inputs.gcp.json +++ 
b/backends/gcp/inputs.gcp.json @@ -13,13 +13,14 @@ ], "run_de_novo_assembly_trio": "Boolean" }, - "de_novo_assembly.reference": { - "name": "GRCh38", + "de_novo_assembly.references": [ + { + "name": "String", "fasta": { - "data": "gs:///dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", - "data_index": "gs:///dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" + "data": "File", + "data_index": "File" } - }, + ], "de_novo_assembly.backend": "GCP", "de_novo_assembly.zones": "String", "de_novo_assembly.preemptible": "Boolean" diff --git a/backends/hpc/inputs.hpc.json b/backends/hpc/inputs.hpc.json index 338e58d..e0979de 100644 --- a/backends/hpc/inputs.hpc.json +++ b/backends/hpc/inputs.hpc.json @@ -13,13 +13,14 @@ ], "run_de_novo_assembly_trio": "Boolean" }, - "de_novo_assembly.reference": { - "name": "GRCh38", + "de_novo_assembly.references": [ + { + "name": "String", "fasta": { - "data": "/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta", - "data_index": "/dataset/GRCh38/human_GRCh38_no_alt_analysis_set.fasta.fai" + "data": "File", + "data_index": "File" } - }, + ], "de_novo_assembly.backend": "HPC", "de_novo_assembly.preemptible": false } diff --git a/wdl-ci.config.json b/wdl-ci.config.json index 97c0734..2e5bd10 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -361,4 +361,4 @@ } } } -} \ No newline at end of file +} diff --git a/workflows/assemble_genome/assemble_genome.wdl b/workflows/assemble_genome/assemble_genome.wdl index 96a733d..66acedf 100644 --- a/workflows/assemble_genome/assemble_genome.wdl +++ b/workflows/assemble_genome/assemble_genome.wdl @@ -9,7 +9,7 @@ workflow assemble_genome { String sample_id Array[File] reads_fastas - ReferenceData reference + Array[ReferenceData] references String? hifiasm_extra_params File? 
father_yak @@ -38,26 +38,36 @@ workflow assemble_genome { call gfa2fa { input: gfa = gfa, - reference_index = reference.fasta.data_index, - runtime_attributes = default_runtime_attributes + runtime_attributes = default_runtime_attributes } } + + scatter (ref in references) { + call align_hifiasm { + input: + sample_id = sample_id, + query_sequences = gfa2fa.zipped_fasta, + reference = ref.fasta.data, + reference_name = ref.name, + runtime_attributes = default_runtime_attributes + } - call align_hifiasm { - input: - sample_id = sample_id, - query_sequences = gfa2fa.zipped_fasta, - reference = reference.fasta.data, - reference_name = reference.name, - runtime_attributes = default_runtime_attributes + IndexData sample_aligned_bam = { + "data": align_hifiasm.asm_bam, + "data_index": align_hifiasm.asm_bam_index + } + + Pair[ReferenceData,IndexData] align_data = (ref, sample_aligned_bam) } + output { Array[File] assembly_noseq_gfas = hifiasm_assemble.assembly_noseq_gfas Array[File] assembly_lowQ_beds = hifiasm_assemble.assembly_lowQ_beds Array[File] zipped_assembly_fastas = gfa2fa.zipped_fasta Array[File] assembly_stats = gfa2fa.assembly_stats - IndexData asm_bam = {"data": align_hifiasm.asm_bam, "data_index": align_hifiasm.asm_bam_index} + Array[IndexData] asm_bams = sample_aligned_bam + Array[Pair[ReferenceData,IndexData]] alignments = align_data } parameter_meta { @@ -132,8 +142,6 @@ task gfa2fa { input { File gfa - File reference_index - RuntimeAttributes runtime_attributes } @@ -157,11 +165,12 @@ task gfa2fa { # Calculate assembly stats k8 \ /opt/calN50/calN50.js \ - -f ~{reference_index} \ ~{gfa_basename}.fasta.gz \ > ~{gfa_basename}.fasta.stats.txt >>> + + output { File zipped_fasta = "~{gfa_basename}.fasta.gz" File assembly_stats = "~{gfa_basename}.fasta.stats.txt" diff --git a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl index 72f7957..6481d5f 100644 --- 
a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl +++ b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl @@ -12,7 +12,7 @@ workflow de_novo_assembly_sample { input { Sample sample - ReferenceData reference + Array[ReferenceData] references String backend RuntimeAttributes default_runtime_attributes @@ -31,43 +31,53 @@ workflow de_novo_assembly_sample { input: sample_id = sample.sample_id, reads_fastas = samtools_fasta.reads_fasta, - reference = reference, + references = references, hifiasm_extra_params = "", backend = backend, default_runtime_attributes = default_runtime_attributes, on_demand_runtime_attributes = on_demand_runtime_attributes } - call htsbox { - input: - bam = assemble_genome.asm_bam.data, - bam_index = assemble_genome.asm_bam.data_index, - reference = reference.fasta.data, - runtime_attributes = default_runtime_attributes - } + scatter (aln in assemble_genome.alignments) { + ReferenceData ref = aln.left + IndexData bam = aln.right + call htsbox { + input: + bam = bam.data, + bam_index = bam.data_index, + reference = ref.fasta.data, + runtime_attributes = default_runtime_attributes + } - call ZipIndexVcf.zip_index_vcf { - input: - vcf = htsbox.htsbox_vcf, - runtime_attributes = default_runtime_attributes - } + call ZipIndexVcf.zip_index_vcf { + input: + vcf = htsbox.htsbox_vcf, + runtime_attributes = default_runtime_attributes + } - call BcftoolsStats.bcftools_stats { - input: - vcf = zip_index_vcf.zipped_vcf, - params = "--samples ~{basename(assemble_genome.asm_bam.data)}", - reference = reference.fasta.data, - runtime_attributes = default_runtime_attributes - } + IndexData htsbox_vcf = { + "data": zip_index_vcf.zipped_vcf, + "data_index": zip_index_vcf.zipped_vcf_index + } + call BcftoolsStats.bcftools_stats { + input: + vcf = zip_index_vcf.zipped_vcf, + params = "--samples ~{basename(bam.data)}", + reference = ref.fasta.data, + runtime_attributes = default_runtime_attributes + } + + } output { Array[File] 
assembly_noseq_gfas = assemble_genome.assembly_noseq_gfas Array[File] assembly_lowQ_beds = assemble_genome.assembly_lowQ_beds Array[File] zipped_assembly_fastas = assemble_genome.zipped_assembly_fastas Array[File] assembly_stats = assemble_genome.assembly_stats - IndexData asm_bam = assemble_genome.asm_bam - IndexData htsbox_vcf = {"data": zip_index_vcf.zipped_vcf, "data_index": zip_index_vcf.zipped_vcf_index} - File htsbox_vcf_stats = bcftools_stats.stats + Array[IndexData] asm_bams = assemble_genome.asm_bams + + Array[IndexData] htsbox_vcfs = htsbox_vcf + Array[File] htsbox_vcf_stats = bcftools_stats.stats } parameter_meta { diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index 2d62021..b9b6502 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -11,7 +11,7 @@ workflow de_novo_assembly_trio { input { Cohort cohort - ReferenceData reference + Array[ReferenceData] references String backend RuntimeAttributes default_runtime_attributes @@ -121,7 +121,7 @@ workflow de_novo_assembly_trio { input: sample_id = "~{cohort.cohort_id}.~{child.sample_id}", reads_fastas = samtools_fasta_child.reads_fasta, - reference = reference, + references = references, hifiasm_extra_params = "-c1 -d1", father_yak = yak_count_father.yak, mother_yak = yak_count_mother.yak, @@ -138,12 +138,13 @@ workflow de_novo_assembly_trio { Array[Array[File]] assembly_lowQ_beds = flatten(assemble_genome.assembly_lowQ_beds) Array[Array[File]] zipped_assembly_fastas = flatten(assemble_genome.zipped_assembly_fastas) Array[Array[File]] assembly_stats = flatten(assemble_genome.assembly_stats) - Array[IndexData] asm_bams = flatten(assemble_genome.asm_bam) + Array[Array[IndexData]] asm_bams = flatten(assemble_genome.asm_bams) + } parameter_meta { cohort: {help: "Sample information for the cohort"} - reference: {help: "Reference genome data"} + 
references: {help: "List of reference genome data"} default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"} on_demand_runtime_attributes: {help: "RuntimeAttributes for tasks that require dedicated instances"} } diff --git a/workflows/input_template.json b/workflows/input_template.json index 148b817..e97ef8d 100644 --- a/workflows/input_template.json +++ b/workflows/input_template.json @@ -13,16 +13,19 @@ ], "run_de_novo_assembly_trio": "Boolean" }, - "de_novo_assembly.reference": { + "de_novo_assembly.references": [ + { "name": "String", "fasta": { "data": "File", "data_index": "File" } - }, - "de_novo_assembly.backend": "String ['GCP', 'Azure', 'AWS', or 'HPC']", - "de_novo_assembly.zones": "String? (optional); required if backend is set to 'GCP' or 'AWS'", + ], + "de_novo_assembly.zones": "String? (optional); required if backend is set to 'AWS'", "de_novo_assembly.aws_spot_queue_arn": "String? (optional); required if backend is set to 'AWS'", - "de_novo_assembly.aws_on_demand_queue_arn": "String? (optional); required if backend is set to 'AWS'", - "de_novo_assembly.preemptible": "Boolean" -} \ No newline at end of file + "de_novo_assembly.aws_on_demand_queue_arn": "String? (optional)", + "de_novo_assembly.preemptible": "Boolean", + "de_novo_assembly.backend": "String ['GCP', 'Azure', 'AWS', or 'HPC']", + "de_novo_assembly.container_registry": "String? 
(optional)", + } +} diff --git a/workflows/main.wdl b/workflows/main.wdl index bafe340..7be4319 100644 --- a/workflows/main.wdl +++ b/workflows/main.wdl @@ -10,7 +10,7 @@ workflow de_novo_assembly { input { Cohort cohort - ReferenceData reference + Array[ReferenceData] references # Backend configuration String backend @@ -38,7 +38,7 @@ workflow de_novo_assembly { call DeNovoAssemblySample.de_novo_assembly_sample { input: sample = sample, - reference = reference, + references = references, backend = backend, default_runtime_attributes = default_runtime_attributes, on_demand_runtime_attributes = backend_configuration.on_demand_runtime_attributes @@ -51,7 +51,7 @@ workflow de_novo_assembly { call DeNovoAssemblyTrio.de_novo_assembly_trio { input: cohort = cohort, - reference = reference, + references = references, backend = backend, default_runtime_attributes = default_runtime_attributes, on_demand_runtime_attributes = backend_configuration.on_demand_runtime_attributes @@ -65,9 +65,9 @@ workflow de_novo_assembly { Array[Array[File]?] assembly_lowQ_beds = de_novo_assembly_sample.assembly_lowQ_beds Array[Array[File]?] zipped_assembly_fastas = de_novo_assembly_sample.zipped_assembly_fastas Array[Array[File]?] assembly_stats = de_novo_assembly_sample.assembly_stats - Array[IndexData?] asm_bam = de_novo_assembly_sample.asm_bam - Array[IndexData?] htsbox_vcf = de_novo_assembly_sample.htsbox_vcf - Array[File?] htsbox_vcf_stats = de_novo_assembly_sample.htsbox_vcf_stats + Array[Array[IndexData]?] asm_bam = de_novo_assembly_sample.asm_bams + Array[Array[IndexData]?] htsbox_vcf = de_novo_assembly_sample.htsbox_vcfs + Array[Array[File]?] htsbox_vcf_stats = de_novo_assembly_sample.htsbox_vcf_stats # de_novo_assembly_trio output Array[Map[String, String]]? haplotype_key = de_novo_assembly_trio.haplotype_key @@ -75,7 +75,8 @@ workflow de_novo_assembly { Array[Array[File]]? trio_assembly_lowQ_beds = de_novo_assembly_trio.assembly_lowQ_beds Array[Array[File]]? 
trio_zipped_assembly_fastas = de_novo_assembly_trio.zipped_assembly_fastas Array[Array[File]]? trio_assembly_stats = de_novo_assembly_trio.assembly_stats - Array[IndexData]? trio_asm_bams = de_novo_assembly_trio.asm_bams +## Array[IndexData]? trio_asm_bams = de_novo_assembly_trio.asm_bams + Array[Array[IndexData]]? trio_asm_bams = de_novo_assembly_trio.asm_bams } parameter_meta { From a40d2a2ca8cb01b77fb6072a95aebd6fff59842c Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Fri, 22 Sep 2023 15:48:11 -0700 Subject: [PATCH 04/17] add yak bloom filter condition --- workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index b9b6502..fcade9e 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -197,8 +197,8 @@ task yak_count { Int mem_gb = 16 * threads Int disk_size = ceil(size(reads_fastas, "GB") * 2 + 20) - # if sample is less than 15X (3.2Gb * 15) use -b37 bloom filter parameter - String yak_options = if sample_total_bp < 48 then "-b37" else "" + # Use bloom filter (-b37) to conserve on resources unless input coverage is low (<15X) + String yak_options = if sample_total_bp < 48000000000 then "" else "-b37" command <<< set -euo pipefail @@ -287,7 +287,6 @@ task get_total_bp { output { Int sample_total_bp = round(read_float("~{sample_id}.total")) -# File sample_total_bp = "~{sample_id}.total" } From fcf9ffe23d40eeaf6dfc8d07a4f6b8c02688257d Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Fri, 22 Sep 2023 16:11:34 -0700 Subject: [PATCH 05/17] fix coverage --- wdl-ci.config.json | 162 +++++++++++++++++- .../de_novo_assembly_trio.wdl | 21 +-- 2 files changed, 172 insertions(+), 11 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index 2e5bd10..f90b9a4 100644 --- a/wdl-ci.config.json 
+++ b/wdl-ci.config.json @@ -163,6 +163,16 @@ } } ] + }, + "fasta_basecount": { + "key": "fasta_basecount", + "digest": "", + "tests": [] + }, + "get_total_gbp": { + "key": "get_total_gbp", + "digest": "", + "tests": [] } } }, @@ -325,6 +335,156 @@ "name": "", "description": "", "tasks": {} + }, + "workflows/wdl-common/wdl/tasks/glnexus.wdl": { + "key": "workflows/wdl-common/wdl/tasks/glnexus.wdl", + "name": "", + "description": "", + "tasks": { + "glnexus": { + "key": "glnexus", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/tasks/mosdepth.wdl": { + "key": "workflows/wdl-common/wdl/tasks/mosdepth.wdl", + "name": "", + "description": "", + "tasks": { + "mosdepth": { + "key": "mosdepth", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/tasks/pbsv_call.wdl": { + "key": "workflows/wdl-common/wdl/tasks/pbsv_call.wdl", + "name": "", + "description": "", + "tasks": { + "pbsv_call": { + "key": "pbsv_call", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/tasks/pbsv_discover.wdl": { + "key": "workflows/wdl-common/wdl/tasks/pbsv_discover.wdl", + "name": "", + "description": "", + "tasks": { + "pbsv_discover": { + "key": "pbsv_discover", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/tasks/pharmcat.wdl": { + "key": "workflows/wdl-common/wdl/tasks/pharmcat.wdl", + "name": "", + "description": "", + "tasks": { + "pangu_cyp2d6": { + "key": "pangu_cyp2d6", + "digest": "", + "tests": [] + }, + "pharmcat_preprocess": { + "key": "pharmcat_preprocess", + "digest": "", + "tests": [] + }, + "filter_preprocessed_vcf": { + "key": "filter_preprocessed_vcf", + "digest": "", + "tests": [] + }, + "run_pharmcat": { + "key": "run_pharmcat", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/tasks/whatshap_haplotag.wdl": { + "key": "workflows/wdl-common/wdl/tasks/whatshap_haplotag.wdl", + "name": "", + "description": "", + "tasks": { + "whatshap_haplotag": { + "key": 
"whatshap_haplotag", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/tasks/whatshap_phase.wdl": { + "key": "workflows/wdl-common/wdl/tasks/whatshap_phase.wdl", + "name": "", + "description": "", + "tasks": { + "whatshap_phase": { + "key": "whatshap_phase", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/tasks/whatshap_stats.wdl": { + "key": "workflows/wdl-common/wdl/tasks/whatshap_stats.wdl", + "name": "", + "description": "", + "tasks": { + "whatshap_stats": { + "key": "whatshap_stats", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/workflows/deepvariant/deepvariant.wdl": { + "key": "workflows/wdl-common/wdl/workflows/deepvariant/deepvariant.wdl", + "name": "", + "description": "", + "tasks": { + "deepvariant_make_examples": { + "key": "deepvariant_make_examples", + "digest": "", + "tests": [] + }, + "deepvariant_call_variants": { + "key": "deepvariant_call_variants", + "digest": "", + "tests": [] + }, + "deepvariant_postprocess_variants": { + "key": "deepvariant_postprocess_variants", + "digest": "", + "tests": [] + } + } + }, + "workflows/wdl-common/wdl/workflows/phase_vcf/phase_vcf.wdl": { + "key": "workflows/wdl-common/wdl/workflows/phase_vcf/phase_vcf.wdl", + "name": "", + "description": "", + "tasks": { + "split_vcf": { + "key": "split_vcf", + "digest": "", + "tests": [] + }, + "bcftools_concat": { + "key": "bcftools_concat", + "digest": "", + "tests": [] + } + } } }, "engines": { @@ -361,4 +521,4 @@ } } } -} +} \ No newline at end of file diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index fcade9e..15a1807 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -50,7 +50,7 @@ workflow de_novo_assembly_trio { } } - call get_total_bp as get_total_bp_father { + call get_total_gbp as get_total_bp_father { input: sample_id = 
father.sample_id, fasta_totals = fasta_bc_father.read_total_bp, @@ -61,7 +61,7 @@ workflow de_novo_assembly_trio { input: sample_id = father.sample_id, reads_fastas = samtools_fasta_father.reads_fasta, - sample_total_bp = get_total_bp_father.sample_total_bp, + sample_total_gbp = get_total_bp_father.sample_total_gbp, runtime_attributes = default_runtime_attributes } @@ -83,7 +83,7 @@ workflow de_novo_assembly_trio { } } - call get_total_bp as get_total_bp_mother { + call get_total_gbp as get_total_bp_mother { input: sample_id = mother.sample_id, fasta_totals = fasta_bc_mother.read_total_bp, @@ -95,7 +95,7 @@ workflow de_novo_assembly_trio { input: sample_id = mother.sample_id, reads_fastas = samtools_fasta_mother.reads_fasta, - sample_total_bp = get_total_bp_father.sample_total_bp, + sample_total_gbp = get_total_bp_father.sample_total_gbp, runtime_attributes = default_runtime_attributes } @@ -187,7 +187,7 @@ task yak_count { input { String sample_id Array[File] reads_fastas - Int sample_total_bp + Int sample_total_gbp RuntimeAttributes runtime_attributes } @@ -197,8 +197,9 @@ task yak_count { Int mem_gb = 16 * threads Int disk_size = ceil(size(reads_fastas, "GB") * 2 + 20) - # Use bloom filter (-b37) to conserve on resources unless input coverage is low (<15X) - String yak_options = if sample_total_bp < 48000000000 then "" else "-b37" + # Use bloom filter (-b37) to conserve resources unless input coverage + # is low ( <15X; (3.2Gb*15=48)) + String yak_options = if sample_total_gbp < 48 then "" else "-b37" command <<< set -euo pipefail @@ -265,7 +266,7 @@ task fasta_basecount { } } -task get_total_bp { +task get_total_gbp { input { String sample_id Array[File] fasta_totals @@ -286,8 +287,8 @@ task get_total_bp { >>> output { - Int sample_total_bp = round(read_float("~{sample_id}.total")) - + Int sample_total_gbp = round(read_float("~{sample_id}.total")) + #Int sample_total_cov = round(sample_total_bp / 3200000000) } runtime { From 
d46292426d939977eef43ad8f78b9ac47f1c715e Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Mon, 25 Sep 2023 11:03:44 -0700 Subject: [PATCH 06/17] determine yak settings for both parents rather than independently --- wdl-ci.config.json | 5 ++ .../de_novo_assembly_trio.wdl | 61 ++++++++++++++----- 2 files changed, 51 insertions(+), 15 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index f90b9a4..59817e7 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -173,6 +173,11 @@ "key": "get_total_gbp", "digest": "", "tests": [] + }, + "determine_yak_options": { + "key": "determine_yak_options", + "digest": "", + "tests": [] } } }, diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index 15a1807..0b3c178 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -50,22 +50,13 @@ workflow de_novo_assembly_trio { } } - call get_total_gbp as get_total_bp_father { + call get_total_gbp as get_total_gbp_father { input: sample_id = father.sample_id, fasta_totals = fasta_bc_father.read_total_bp, runtime_attributes = default_runtime_attributes } - call yak_count as yak_count_father { - input: - sample_id = father.sample_id, - reads_fastas = samtools_fasta_father.reads_fasta, - sample_total_gbp = get_total_bp_father.sample_total_gbp, - - runtime_attributes = default_runtime_attributes - } - scatter (movie_bam in mother.movie_bams) { call SamtoolsFasta.samtools_fasta as samtools_fasta_mother { input: @@ -83,19 +74,35 @@ workflow de_novo_assembly_trio { } } - call get_total_gbp as get_total_bp_mother { + call get_total_gbp as get_total_gbp_mother { input: sample_id = mother.sample_id, fasta_totals = fasta_bc_mother.read_total_bp, runtime_attributes = default_runtime_attributes } + call determine_yak_options { + input: + father_total_gbp = get_total_gbp_father.sample_total_gbp, + mother_total_gbp = 
get_total_gbp_mother.sample_total_gbp, + } + + call yak_count as yak_count_father { + input: + sample_id = father.sample_id, + reads_fastas = samtools_fasta_father.reads_fasta, + yak_options = determine_yak_options.yak_options, +# sample_total_gbp = get_total_gbp_father.sample_total_gbp, + + runtime_attributes = default_runtime_attributes + } call yak_count as yak_count_mother { input: sample_id = mother.sample_id, reads_fastas = samtools_fasta_mother.reads_fasta, - sample_total_gbp = get_total_bp_father.sample_total_gbp, + yak_options = determine_yak_options.yak_options, +# sample_total_gbp = get_total_gbp_mother.sample_total_gbp, runtime_attributes = default_runtime_attributes } @@ -183,11 +190,32 @@ task parse_families { } } +task determine_yak_options { + input { + Int mother_total_gbp + Int father_total_gbp + } + + command { + set -e + if [ ~{father_total_gbp} -lt 48 ] && [ ~{mother_total_gbp} -lt 48 ]; then + options="" + else + options="-b37" + fi + echo $options + } + output { + String yak_options = read_string(stdout()) + } +} + task yak_count { input { String sample_id Array[File] reads_fastas - Int sample_total_gbp + #Int sample_total_gbp + String yak_options RuntimeAttributes runtime_attributes } @@ -199,7 +227,7 @@ task yak_count { # Use bloom filter (-b37) to conserve resources unless input coverage # is low ( <15X; (3.2Gb*15=48)) - String yak_options = if sample_total_gbp < 48 then "" else "-b37" + #String yak_options = if sample_total_gbp < 48 then "" else "-b37" command <<< set -euo pipefail @@ -282,7 +310,7 @@ task get_total_gbp { command <<< set -euo pipefail - cat ~{sep=' ' fasta_totals} | awk '{sum+=$1}END{print sum/1000000000}' > ~{sample_id}.total + awk '{sum+=$1}END{print sum/1000000000}' ~{sep=' ' fasta_totals} > ~{sample_id}.total >>> @@ -305,3 +333,6 @@ task get_total_gbp { } } + +# cat ~{sep=' ' fasta_totals} | awk '{sum+=$1}END{print sum/1000000000}' > ~{sample_id}.total + From af88a08171ce731758aac2b921dcd1d95aec822a Mon Sep 17 
00:00:00 2001 From: gconcepcion Date: Mon, 25 Sep 2023 11:29:33 -0700 Subject: [PATCH 07/17] fix tests and remove some debug comments I missed --- wdl-ci.config.json | 1 + .../de_novo_assembly_trio/de_novo_assembly_trio.wdl | 13 ------------- workflows/main.wdl | 1 - 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index 59817e7..86c24dd 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -150,6 +150,7 @@ "${resources_file_path}/m64017_200108_232219.hifi_reads.fasta", "${resources_file_path}/m64017_200112_090459.hifi_reads.fasta" ], + "yak_options": "-b37", "runtime_attributes": "${default_runtime_attributes}" }, "output_tests": { diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index 0b3c178..0f155ef 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -92,8 +92,6 @@ workflow de_novo_assembly_trio { sample_id = father.sample_id, reads_fastas = samtools_fasta_father.reads_fasta, yak_options = determine_yak_options.yak_options, -# sample_total_gbp = get_total_gbp_father.sample_total_gbp, - runtime_attributes = default_runtime_attributes } @@ -102,8 +100,6 @@ workflow de_novo_assembly_trio { sample_id = mother.sample_id, reads_fastas = samtools_fasta_mother.reads_fasta, yak_options = determine_yak_options.yak_options, -# sample_total_gbp = get_total_gbp_mother.sample_total_gbp, - runtime_attributes = default_runtime_attributes } @@ -214,7 +210,6 @@ task yak_count { input { String sample_id Array[File] reads_fastas - #Int sample_total_gbp String yak_options RuntimeAttributes runtime_attributes @@ -225,10 +220,6 @@ task yak_count { Int mem_gb = 16 * threads Int disk_size = ceil(size(reads_fastas, "GB") * 2 + 20) - # Use bloom filter (-b37) to conserve resources unless input coverage - # is low ( <15X; (3.2Gb*15=48)) - #String yak_options = if 
sample_total_gbp < 48 then "" else "-b37" - command <<< set -euo pipefail @@ -316,7 +307,6 @@ task get_total_gbp { output { Int sample_total_gbp = round(read_float("~{sample_id}.total")) - #Int sample_total_cov = round(sample_total_bp / 3200000000) } runtime { @@ -333,6 +323,3 @@ task get_total_gbp { } } - -# cat ~{sep=' ' fasta_totals} | awk '{sum+=$1}END{print sum/1000000000}' > ~{sample_id}.total - diff --git a/workflows/main.wdl b/workflows/main.wdl index 7be4319..7647f34 100644 --- a/workflows/main.wdl +++ b/workflows/main.wdl @@ -75,7 +75,6 @@ workflow de_novo_assembly { Array[Array[File]]? trio_assembly_lowQ_beds = de_novo_assembly_trio.assembly_lowQ_beds Array[Array[File]]? trio_zipped_assembly_fastas = de_novo_assembly_trio.zipped_assembly_fastas Array[Array[File]]? trio_assembly_stats = de_novo_assembly_trio.assembly_stats -## Array[IndexData]? trio_asm_bams = de_novo_assembly_trio.asm_bams Array[Array[IndexData]]? trio_asm_bams = de_novo_assembly_trio.asm_bams } From 137d6a94b002236f627619651cd2092d500d927a Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 25 Sep 2023 18:43:32 +0000 Subject: [PATCH 08/17] update wdl-ci config file after successful tests --- wdl-ci.config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index 86c24dd..fffe292 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -246,7 +246,7 @@ }, "gfa2fa": { "key": "gfa2fa", - "digest": "liyb2m4cbkovxctcgaxwunqkn5az77ev", + "digest": "es7l5kyje3fiy5vxjnnsqg4fw6sitmdo", "tests": [ { "inputs": { From 6c0232749f49dc4451c67a1911b2b5166e958b51 Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 25 Sep 2023 21:26:56 +0000 Subject: [PATCH 09/17] update wdl-ci config file after successful tests --- wdl-ci.config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl-ci.config.json 
b/wdl-ci.config.json index fffe292..b2e5d4f 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -137,7 +137,7 @@ }, "yak_count": { "key": "yak_count", - "digest": "2ovi7jh4btl4sb7xr23ga6mxtd7nlq4s", + "digest": "qysjdjudeldfcf6pm2unping3zkh4qve", "tests": [ { "inputs": { From 49acf5ccc5c2c1b42d8827b690a6a4e98ebcdecf Mon Sep 17 00:00:00 2001 From: William Rowell Date: Thu, 28 Sep 2023 11:10:47 -0700 Subject: [PATCH 10/17] Take a stab at estimating depth based on filesize. - updated parameter_meta - updated inputs.json - cleaned up some whitespace - added comments - using fasta filesize to estimate depth rather than a separate task; based on Greg's experiments, an uncompressed 10x FASTA is ~60GB --- workflows/assemble_genome/assemble_genome.wdl | 11 +- .../de_novo_assembly_sample.wdl | 3 +- .../de_novo_assembly_trio.wdl | 164 +++--------------- workflows/input_template.json | 58 +++---- 4 files changed, 57 insertions(+), 179 deletions(-) diff --git a/workflows/assemble_genome/assemble_genome.wdl b/workflows/assemble_genome/assemble_genome.wdl index 66acedf..1e6807f 100644 --- a/workflows/assemble_genome/assemble_genome.wdl +++ b/workflows/assemble_genome/assemble_genome.wdl @@ -73,7 +73,7 @@ workflow assemble_genome { parameter_meta { sample_id: {help: "Sample ID; used for naming files"} reads_fastas: {help: "Reads in fasta format to be used for assembly; one for each movie bam to be used in assembly. 
Reads fastas from one or more sample may be combined to use in the assembly"} - reference: {help: "Reference genome data"} + references: {help: "Array of Reference genomes data"} hiiasm_extra_params: {help: "[OPTIONAL] Additional parameters to pass to hifiasm assembly"} father_yak: {help: "[OPTIONAL] kmer counts for the father; required if running trio-based assembly"} mother_yak: {help: "[OPTIONAL] kmer counts for the mother; required if running trio-based assembly"} @@ -98,7 +98,7 @@ task hifiasm_assemble { String prefix = "~{sample_id}.asm" Int threads = 48 Int mem_gb = threads * 6 - Int disk_size = ceil((size(reads_fastas[0], "GB") * length(reads_fastas)) * 4 + 20) + Int disk_size = ceil(size(reads_fastas, "GB") * 4 + 20) command <<< set -euo pipefail @@ -202,7 +202,8 @@ task align_hifiasm { } Int threads = 16 - Int disk_size = ceil((size(query_sequences[0], "GB") * length(query_sequences) + size(reference, "GB")) * 2 + 20) + Int mem_gb = threads * 8 + Int disk_size = ceil((size(query_sequences, "GB") + size(reference, "GB")) * 2 + 20) command <<< set -euo pipefail @@ -218,7 +219,7 @@ task align_hifiasm { ~{reference} \ ~{sep=' ' query_sequences} \ | samtools sort \ - -@ 4 \ + -@ 3 \ -T ./TMP \ -m 8G \ -O BAM \ @@ -235,7 +236,7 @@ task align_hifiasm { runtime { docker: "~{runtime_attributes.container_registry}/align_hifiasm@sha256:3968cb152a65163005ffed46297127536701ec5af4c44e8f3e7051f7b01f80fe" cpu: threads - memory: "128 GB" + memory: mem_gb + " GB" disk: disk_size + " GB" disks: "local-disk " + disk_size + " HDD" preemptible: runtime_attributes.preemptible_tries diff --git a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl index 6481d5f..43354fc 100644 --- a/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl +++ b/workflows/de_novo_assembly_sample/de_novo_assembly_sample.wdl @@ -32,7 +32,6 @@ workflow de_novo_assembly_sample { sample_id = sample.sample_id, reads_fastas = 
samtools_fasta.reads_fasta, references = references, - hifiasm_extra_params = "", backend = backend, default_runtime_attributes = default_runtime_attributes, on_demand_runtime_attributes = on_demand_runtime_attributes @@ -82,7 +81,7 @@ workflow de_novo_assembly_sample { parameter_meta { sample: {help: "Sample information and associated data files"} - reference: {help: "Reference genome data"} + references: {help: "Array of Reference genomes data"} default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"} on_demand_runtime_attributes: {help: "RuntimeAttributes for tasks that require dedicated instances"} } diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index 0f155ef..f06513c 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -41,22 +41,6 @@ workflow de_novo_assembly_trio { } } - # For yak, we need to know the total input coverage so we can set cloud memory resources accordingly - scatter (fasta in samtools_fasta_father.reads_fasta) { - call fasta_basecount as fasta_bc_father { - input: - reads_fasta = fasta, - runtime_attributes = default_runtime_attributes - } - } - - call get_total_gbp as get_total_gbp_father { - input: - sample_id = father.sample_id, - fasta_totals = fasta_bc_father.read_total_bp, - runtime_attributes = default_runtime_attributes - } - scatter (movie_bam in mother.movie_bams) { call SamtoolsFasta.samtools_fasta as samtools_fasta_mother { input: @@ -65,33 +49,23 @@ workflow de_novo_assembly_trio { } } - # For yak, we need to know the total input coverage so we can set cloud memory resources accordingly - scatter (fasta in samtools_fasta_mother.reads_fasta) { - call fasta_basecount as fasta_bc_mother { - input: - reads_fasta = fasta, - runtime_attributes = default_runtime_attributes - } - } - - call get_total_gbp as 
get_total_gbp_mother { - input: - sample_id = mother.sample_id, - fasta_totals = fasta_bc_mother.read_total_bp, - runtime_attributes = default_runtime_attributes - } + # if parental coverage is low (<15x), keep singleton kmers from parents and use them to bin child reads + # if parental coverage is high (>=15x), use bloom filter and require that a kmer occur >= 5 times in + # one parent and <2 times in the other parent to be used for binning + # 60GB uncompressed FASTA ~= 10x coverage + # memory for 24 threads is 48GB with bloom filter (<=50x coverage) and 65GB without bloom filter (<=30x coverage) + Boolean bloom_filter = if ((size(samtools_fasta_father.reads_fasta, "GB") < 90) && (size(samtools_fasta_mother.reads_fasta, "GB") < 90)) then true else false - call determine_yak_options { - input: - father_total_gbp = get_total_gbp_father.sample_total_gbp, - mother_total_gbp = get_total_gbp_mother.sample_total_gbp, - } + String yak_params = if (bloom_filter) then "-b37" else "" + Int yak_mem_gb = if (bloom_filter) then 50 else 70 + String hifiasm_extra_params = if (bloom_filter) then "" else "-c1 -d1" call yak_count as yak_count_father { input: sample_id = father.sample_id, reads_fastas = samtools_fasta_father.reads_fasta, - yak_options = determine_yak_options.yak_options, + yak_params = yak_params, + mem_gb = yak_mem_gb, runtime_attributes = default_runtime_attributes } @@ -99,7 +73,8 @@ workflow de_novo_assembly_trio { input: sample_id = mother.sample_id, reads_fastas = samtools_fasta_mother.reads_fasta, - yak_options = determine_yak_options.yak_options, + yak_params = yak_params, + mem_gb = yak_mem_gb, runtime_attributes = default_runtime_attributes } @@ -125,7 +100,7 @@ workflow de_novo_assembly_trio { sample_id = "~{cohort.cohort_id}.~{child.sample_id}", reads_fastas = samtools_fasta_child.reads_fasta, references = references, - hifiasm_extra_params = "-c1 -d1", + hifiasm_extra_params = hifiasm_extra_params, father_yak = yak_count_father.yak, mother_yak = 
yak_count_mother.yak, backend = backend, @@ -142,12 +117,11 @@ workflow de_novo_assembly_trio { Array[Array[File]] zipped_assembly_fastas = flatten(assemble_genome.zipped_assembly_fastas) Array[Array[File]] assembly_stats = flatten(assemble_genome.assembly_stats) Array[Array[IndexData]] asm_bams = flatten(assemble_genome.asm_bams) - } parameter_meta { cohort: {help: "Sample information for the cohort"} - references: {help: "List of reference genome data"} + references: {help: "Array of Reference genomes data"} default_runtime_attributes: {help: "Default RuntimeAttributes; spot if preemptible was set to true, otherwise on_demand"} on_demand_runtime_attributes: {help: "RuntimeAttributes for tasks that require dedicated instances"} } @@ -186,47 +160,27 @@ task parse_families { } } -task determine_yak_options { - input { - Int mother_total_gbp - Int father_total_gbp - } - - command { - set -e - if [ ~{father_total_gbp} -lt 48 ] && [ ~{mother_total_gbp} -lt 48 ]; then - options="" - else - options="-b37" - fi - echo $options - } - output { - String yak_options = read_string(stdout()) - } -} - task yak_count { input { String sample_id Array[File] reads_fastas - String yak_options + + String yak_params + String mem_gb RuntimeAttributes runtime_attributes } - Int threads = 10 - # Usage up to 140 GB @ 10 threads for Revio samples - Int mem_gb = 16 * threads + Int threads = 24 Int disk_size = ceil(size(reads_fastas, "GB") * 2 + 20) - + command <<< set -euo pipefail yak count \ -t ~{threads} \ -o ~{sample_id}.yak \ - ~{yak_options} \ + ~{yak_params} \ ~{sep=' ' reads_fastas} >>> @@ -247,79 +201,3 @@ task yak_count { zones: runtime_attributes.zones } } - -task fasta_basecount { - input { - File reads_fasta - String reads_fasta_basename = basename(reads_fasta) - - RuntimeAttributes runtime_attributes - } - - Int threads = 1 - Int mem_gb = 4 * threads - - Int disk_size = ceil(size(reads_fasta, "GB") * 2 + 20) - - command <<< - set -euo pipefail - - grep -v "^>" ~{reads_fasta} | 
tr -d '\n' | wc -c > ~{reads_fasta_basename}.total - >>> - - output { - File read_total_bp = "~{reads_fasta_basename}.total" - } - - runtime { - docker: "~{runtime_attributes.container_registry}/python@sha256:e4d921e252c3c19fe64097aa619c369c50cc862768d5fcb5e19d2877c55cfdd2" - cpu: threads - memory: mem_gb + " GB" - disk: disk_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries - queueArn: runtime_attributes.queue_arn - zones: runtime_attributes.zones - } -} - -task get_total_gbp { - input { - String sample_id - Array[File] fasta_totals - - RuntimeAttributes runtime_attributes - } - - Int threads = 1 - Int mem_gb = 4 * threads - - Int disk_size = ceil(size(fasta_totals[0], "GB") * 2 + 20) - - command <<< - set -euo pipefail - - awk '{sum+=$1}END{print sum/1000000000}' ~{sep=' ' fasta_totals} > ~{sample_id}.total - - >>> - - output { - Int sample_total_gbp = round(read_float("~{sample_id}.total")) - } - - runtime { - docker: "~{runtime_attributes.container_registry}/python@sha256:e4d921e252c3c19fe64097aa619c369c50cc862768d5fcb5e19d2877c55cfdd2" - cpu: threads - memory: mem_gb + " GB" - disk: disk_size + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: runtime_attributes.preemptible_tries - maxRetries: runtime_attributes.max_retries - awsBatchRetryAttempts: runtime_attributes.max_retries - queueArn: runtime_attributes.queue_arn - zones: runtime_attributes.zones - } - -} diff --git a/workflows/input_template.json b/workflows/input_template.json index e97ef8d..64e5d62 100644 --- a/workflows/input_template.json +++ b/workflows/input_template.json @@ -1,31 +1,31 @@ { - "de_novo_assembly.cohort": { - "cohort_id": "String", - "samples": [ - { - "sample_id": "String", - "movie_bams": "Array[File]", - "sex": "String?", - "father_id": "String?", - "mother_id": "String?", - "run_de_novo_assembly": "Boolean" 
- } + "de_novo_assembly.cohort": { + "cohort_id": "String", + "samples": [ + { + "sample_id": "String", + "movie_bams": "Array[File]", + "sex": "String?", + "father_id": "String?", + "mother_id": "String?", + "run_de_novo_assembly": "Boolean" + } + ], + "run_de_novo_assembly_trio": "Boolean" + }, + "de_novo_assembly.references": [ + { + "name": "String", + "fasta": { + "data": "File", + "data_index": "File" + } + } ], - "run_de_novo_assembly_trio": "Boolean" - }, - "de_novo_assembly.references": [ - { - "name": "String", - "fasta": { - "data": "File", - "data_index": "File" - } - ], - "de_novo_assembly.zones": "String? (optional); required if backend is set to 'AWS'", - "de_novo_assembly.aws_spot_queue_arn": "String? (optional); required if backend is set to 'AWS'", - "de_novo_assembly.aws_on_demand_queue_arn": "String? (optional)", - "de_novo_assembly.preemptible": "Boolean", - "de_novo_assembly.backend": "String ['GCP', 'Azure', 'AWS', or 'HPC']", - "de_novo_assembly.container_registry": "String? (optional)", - } -} + "de_novo_assembly.zones": "String? (optional); required if backend is set to 'AWS'", + "de_novo_assembly.aws_spot_queue_arn": "String? (optional); required if backend is set to 'AWS'", + "de_novo_assembly.aws_on_demand_queue_arn": "String? (optional)", + "de_novo_assembly.preemptible": "Boolean", + "de_novo_assembly.backend": "String ['GCP', 'Azure', 'AWS', or 'HPC']", + "de_novo_assembly.container_registry": "String? 
(optional)" +} \ No newline at end of file From f24e1e6e95ab4329744252a4dcd6b98d3a3aae80 Mon Sep 17 00:00:00 2001 From: William Rowell Date: Thu, 28 Sep 2023 11:14:50 -0700 Subject: [PATCH 11/17] changed flag name and fixed my flipped logic --- workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index f06513c..d3aa695 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -54,11 +54,11 @@ workflow de_novo_assembly_trio { # one parent and <2 times in the other parent to be used for binning # 60GB uncompressed FASTA ~= 10x coverage # memory for 24 threads is 48GB with bloom filter (<=50x coverage) and 65GB without bloom filter (<=30x coverage) - Boolean bloom_filter = if ((size(samtools_fasta_father.reads_fasta, "GB") < 90) && (size(samtools_fasta_mother.reads_fasta, "GB") < 90)) then true else false + Boolean low_depth = if ((size(samtools_fasta_father.reads_fasta, "GB") < 90) && (size(samtools_fasta_mother.reads_fasta, "GB") < 90)) then true else false - String yak_params = if (bloom_filter) then "-b37" else "" - Int yak_mem_gb = if (bloom_filter) then 50 else 70 - String hifiasm_extra_params = if (bloom_filter) then "" else "-c1 -d1" + String yak_params = if (low_depth) then "" else "-b37" + Int yak_mem_gb = if (low_depth) then 70 else 50 + String hifiasm_extra_params = if (low_depth) then "-c1 -d1" else "" call yak_count as yak_count_father { input: From 42d0d847d5c06f4ea16fbc8d9890f29a408ef45a Mon Sep 17 00:00:00 2001 From: William Rowell Date: Thu, 28 Sep 2023 11:23:58 -0700 Subject: [PATCH 12/17] Adjusted tests. 
--- wdl-ci.config.json | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index b2e5d4f..792f307 100644 --- a/wdl-ci.config.json +++ b/wdl-ci.config.json @@ -150,7 +150,8 @@ "${resources_file_path}/m64017_200108_232219.hifi_reads.fasta", "${resources_file_path}/m64017_200112_090459.hifi_reads.fasta" ], - "yak_options": "-b37", + "yak_params": "-b37", + "mem_gb": 70, "runtime_attributes": "${default_runtime_attributes}" }, "output_tests": { @@ -164,21 +165,6 @@ } } ] - }, - "fasta_basecount": { - "key": "fasta_basecount", - "digest": "", - "tests": [] - }, - "get_total_gbp": { - "key": "get_total_gbp", - "digest": "", - "tests": [] - }, - "determine_yak_options": { - "key": "determine_yak_options", - "digest": "", - "tests": [] } } }, From c8a9b8d9d3a628d10e3355608acabb355cf78225 Mon Sep 17 00:00:00 2001 From: William Rowell Date: Thu, 28 Sep 2023 11:31:20 -0700 Subject: [PATCH 13/17] Memory is an int. --- workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index d3aa695..2d93adb 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -166,7 +166,7 @@ task yak_count { Array[File] reads_fastas String yak_params - String mem_gb + Int mem_gb RuntimeAttributes runtime_attributes } From 42735154d3c3c8736697dbb89001c4ced04d61f0 Mon Sep 17 00:00:00 2001 From: github-actions <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 28 Sep 2023 18:51:21 +0000 Subject: [PATCH 14/17] update wdl-ci config file after successful tests --- wdl-ci.config.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/wdl-ci.config.json b/wdl-ci.config.json index 792f307..081474a 100644 --- a/wdl-ci.config.json +++ 
b/wdl-ci.config.json @@ -137,7 +137,7 @@ }, "yak_count": { "key": "yak_count", - "digest": "qysjdjudeldfcf6pm2unping3zkh4qve", + "digest": "6hlh6n3b3cqohtmjweg57of626he4c4v", "tests": [ { "inputs": { @@ -175,7 +175,7 @@ "tasks": { "hifiasm_assemble": { "key": "hifiasm_assemble", - "digest": "r4ikydzmdaed4hzsmc3t7efh6mz5e4mx", + "digest": "vhkzwee3f754jcjksog22uyps3j6myow", "tests": [ { "inputs": { @@ -264,7 +264,7 @@ }, "align_hifiasm": { "key": "align_hifiasm", - "digest": "77gs34t4c2i6epsg2epukfoaign2fmnt", + "digest": "4qf5jeepfn3jv3g2socql6xh7vmd4b7s", "tests": [ { "inputs": { From 2dc27601fd4183bcbead838d8359c5db9af59668 Mon Sep 17 00:00:00 2001 From: William Rowell Date: Thu, 28 Sep 2023 12:16:53 -0700 Subject: [PATCH 15/17] added warning comment about estimating depth by filesize --- workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index 2d93adb..5db4c29 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -52,7 +52,7 @@ workflow de_novo_assembly_trio { # if parental coverage is low (<15x), keep singleton kmers from parents and use them to bin child reads # if parental coverage is high (>=15x), use bloom filter and require that a kmer occur >= 5 times in # one parent and <2 times in the other parent to be used for binning - # 60GB uncompressed FASTA ~= 10x coverage + # 60GB uncompressed FASTA ~= 10x coverage (this is not robust to big changes in mean read length) # memory for 24 threads is 48GB with bloom filter (<=50x coverage) and 65GB without bloom filter (<=30x coverage) Boolean low_depth = if ((size(samtools_fasta_father.reads_fasta, "GB") < 90) && (size(samtools_fasta_mother.reads_fasta, "GB") < 90)) then true else false From ebe6afeb0d5ec2409d6b31aaf5aa838976672bdc Mon Sep 17 
00:00:00 2001 From: William Rowell Date: Fri, 29 Sep 2023 13:35:14 -0700 Subject: [PATCH 16/17] Explicitly pass default `yak count` and `hifiasm` params. --- workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl index 5db4c29..aa07ffc 100644 --- a/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl +++ b/workflows/de_novo_assembly_trio/de_novo_assembly_trio.wdl @@ -56,9 +56,9 @@ workflow de_novo_assembly_trio { # memory for 24 threads is 48GB with bloom filter (<=50x coverage) and 65GB without bloom filter (<=30x coverage) Boolean low_depth = if ((size(samtools_fasta_father.reads_fasta, "GB") < 90) && (size(samtools_fasta_mother.reads_fasta, "GB") < 90)) then true else false - String yak_params = if (low_depth) then "" else "-b37" + String yak_params = if (low_depth) then "-b0" else "-b37" Int yak_mem_gb = if (low_depth) then 70 else 50 - String hifiasm_extra_params = if (low_depth) then "-c1 -d1" else "" + String hifiasm_extra_params = if (low_depth) then "-c1 -d1" else "-c2 -d5" call yak_count as yak_count_father { input: From 05312fe686c22bee0c2106d3ab5087d1b7480079 Mon Sep 17 00:00:00 2001 From: gconcepcion Date: Thu, 26 Oct 2023 11:02:26 -0700 Subject: [PATCH 17/17] update README to reflect array of reference inputs as well as arrays of aligned bam outputs --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index c031286..bb97dbf 100644 --- a/README.md +++ b/README.md @@ -184,7 +184,7 @@ These files are hosted publicly in each of the cloud backends; see `backends/${b | Type | Name | Description | Notes | | :- | :- | :- | :- | | String | name | Reference name; used to name outputs (e.g., "GRCh38") | | -| [IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl) | fasta | 
Reference genome and index | | +| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)] | fastas | Reference genomes and associated index | | ## Other inputs @@ -209,9 +209,9 @@ These files will be output if `cohort.samples[sample]` is set to `true` for any | Array[Array[File]?] | assembly_noseq_gfas | Assembly graphs in [GFA format](https://github.com/chhylp123/hifiasm/blob/master/docs/source/interpreting-output.rst). | | | Array[Array[File]?] | assembly_lowQ_beds | Coordinates of low quality regions in BED format. | | | Array[Array[File]?] | assembly_stats | Assembly size and NG50 stats generated by [calN50](https://github.com/lh3/calN50). | | -| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)?] | asm_bam | [minimap2](https://github.com/lh3/minimap2) alignment of assembly to reference. | | -| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)?] | htsbox_vcf | Naive pileup variant calling of assembly against reference with [`htsbox`](https://github.com/lh3/htsbox) | | -| Array[File?] | htsbox_vcf_stats | [`bcftools stats`](https://samtools.github.io/bcftools/bcftools.html#stats) summary statistics for `htsbox` variant calls | | +| Array[Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)?]] | asm_bam | [minimap2](https://github.com/lh3/minimap2) alignment of assembly to reference.
| | +| Array[Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)?]] | htsbox_vcf | Naive pileup variant calling of assembly against reference with [`htsbox`](https://github.com/lh3/htsbox) | | +| Array[Array[File?]] | htsbox_vcf_stats | [`bcftools stats`](https://samtools.github.io/bcftools/bcftools.html#stats) summary statistics for `htsbox` variant calls | | ## De novo assembly - trio @@ -223,7 +223,7 @@ These files will be output if `cohort.de_novo_assembly_trio` is set to `true` an | Array[Array[File]]? | trio_assembly_noseq_gfas | Assembly graphs in [GFA format](https://github.com/chhylp123/hifiasm/blob/master/docs/source/interpreting-output.rst). | | | Array[Array[File]]? | trio_assembly_lowQ_beds | Coordinates of low quality regions in BED format. | | | Array[Array[File]]? | trio_assembly_stats | Assembly size and NG50 stats generated by [calN50](https://github.com/lh3/calN50). | | -| Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)]? | trio_asm_bams | [minimap2](https://github.com/lh3/minimap2) alignment of assembly to reference. | | +| Array[Array[[IndexData](https://github.com/PacificBiosciences/wdl-common/blob/main/wdl/structs.wdl)]]? | trio_asm_bams | [minimap2](https://github.com/lh3/minimap2) alignment of assembly to reference. | | | Array[Map[String, String]]? | haplotype_key | Indication of which haplotype (`hap1`/`hap2`) corresponds to which parent. | | # Tool versions and Docker images