Several new WDLs and a gated MLP bug fix (#160)

* WDL combining gathering of plain-text data with preprocessing
* WDL to generate train and test datasets with M2, make a UDA dataset and train on it, and run Permutect filtering
* Bug fix for NaNs in base model output features introduced by the recent mixed-count base batch PR
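The NaN fix itself is in the Python model code, which this commit page does not display. As a hedged illustration only: with mixed-count batches, a datum that contributes zero reads can turn a sum-divided-by-count pooling step into 0/0 = NaN, and the NaNs then propagate through every downstream layer, including a gated MLP's elementwise gate * value product. The sketch below assumes PyTorch; masked_mean and both tensor layouts are hypothetical, not Permutect's actual code.

import torch

def masked_mean(read_features: torch.Tensor, read_counts: torch.Tensor) -> torch.Tensor:
    # Mean-pool per-read features for each datum of a mixed-count batch.
    # read_features: (total_reads, feature_dim), rows grouped by datum
    # read_counts:   (num_data,) reads contributed by each datum; may contain zeros
    segment_ids = torch.repeat_interleave(torch.arange(read_counts.numel()), read_counts)
    sums = torch.zeros(read_counts.numel(), read_features.shape[1])
    sums.index_add_(0, segment_ids, read_features)
    # A zero count would make sum / count = 0 / 0 = NaN.  Clamping the
    # denominator leaves empty groups as all-zero feature rows instead.
    return sums / read_counts.clamp(min=1).unsqueeze(1).float()

counts = torch.tensor([3, 0, 2])          # the middle datum has no reads of this type
features = torch.randn(5, 8)              # 3 + 0 + 2 = 5 reads in total
pooled = masked_mean(features, counts)    # row 1 is zeros, not NaN
assert not torch.isnan(pooled).any()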
1 parent 1f0b980 · commit dc8aa47 · 9 changed files with 472 additions and 28 deletions.
version 1.0

# run Mutect2 to get both training AND test datasets. The training dataset is preprocessed and combined with
# high-quality labeled data to make a UDA dataset, which is then used to train an artifact model. The test
# dataset is used for the posterior model and filtering.
# note that the artifact model can be trained before the Mutect2 workflow runs FilterMutectCalls
import "https://api.firecloud.org/ga4gh/v1/tools/davidben:mutect2/versions/18/plain-WDL/descriptor" as m2
import "https://api.firecloud.org/ga4gh/v1/tools/davidben:permutect-uda-dataset/versions/3/plain-WDL/descriptor" as uda
import "https://api.firecloud.org/ga4gh/v1/tools/davidben:permutect-train-artifact-model/versions/12/plain-WDL/descriptor" as training
import "https://api.firecloud.org/ga4gh/v1/tools/davidben:permutect-call-variants/versions/18/plain-WDL/descriptor" as calling

workflow CallVariantsWithUDA {
    input {
        # basic inputs for Mutect2
        File? intervals
        File? masked_intervals
        File ref_fasta
        File ref_fai
        File ref_dict
        File primary_bam
        File primary_bai
        File? control_bam
        File? control_bai
        File? gnomad
        File? gnomad_idx
        String? m2_extra_args
        File? dragstr_model
        Boolean make_bamout = false
        Boolean compress_vcfs = false

        # Mutect2 filtering
        Boolean skip_m2_filtering
        File? variants_for_contamination
        File? variants_for_contamination_idx
        File? realignment_index_bundle
        String? realignment_extra_args
        Boolean? run_orientation_bias_mixture_model_filter

        # preprocessing arguments
        Int chunk_size

        # training arguments for both artifact model and posterior model
        Boolean use_gpu = true
        Int batch_size
        Int inference_batch_size
        Int num_workers
        Int? gpu_count
        Int? training_mem

        # UDA training arguments
        File base_model
        File source_train_tar
        String source_edit_type = "keep_everything"
        String target_edit_type = "unlabel_everything"
        Int num_epochs
        Int num_calibration_epochs
        Float dropout_p
        Array[Int] aggregation_layers
        Array[Int] calibration_layers
        String? training_extra_args
        Boolean learn_artifact_spectra
        Float? genomic_span

        # Permutect filtering / posterior model
        File? test_dataset_truth_vcf        # used for evaluation
        File? test_dataset_truth_vcf_idx
        Int? num_spectrum_iterations
        Float? spectrum_learning_rate
        String? permutect_filtering_extra_args
        String bcftools_docker = "us.gcr.io/broad-dsde-methods/davidben/bcftools"
        File? obscene_hack_leave_unset

        # runtime
        String gatk_docker
        String permutect_docker
        File? gatk_override
        String basic_bash_docker = "ubuntu:16.04"
        Int scatter_count
        Int preemptible = 2
        Int max_retries = 1
        Int small_task_cpu = 2
        Int small_task_mem = 4
        Int small_task_disk = 100
        Int boot_disk_size = 12
        Int learn_read_orientation_mem = 8000
        Int filter_alignment_artifacts_mem = 9000
        String? gcs_project_for_requester_pays

        # use as a last resort to increase the disk given to every task in case of ill-behaving data
        Int emergency_extra_disk = 0
    }

    # note: we make both training and test datasets
    # note: for speed we may skip filtering in order to begin UDA artifact model training immediately;
    # the only M2 filtering we may need is contamination, and that may be skipped
    call m2.Mutect2 {
        input:
            intervals = intervals,
            masked_intervals = masked_intervals,
            ref_fasta = ref_fasta,
            ref_fai = ref_fai,
            ref_dict = ref_dict,
            tumor_reads = primary_bam,
            tumor_reads_index = primary_bai,
            normal_reads = control_bam,
            normal_reads_index = control_bai,
            gnomad = gnomad,
            gnomad_idx = gnomad_idx,
            variants_for_contamination = variants_for_contamination,
            variants_for_contamination_idx = variants_for_contamination_idx,
            realignment_index_bundle = realignment_index_bundle,
            realignment_extra_args = realignment_extra_args,
            run_orientation_bias_mixture_model_filter = run_orientation_bias_mixture_model_filter,
            m2_extra_args = m2_extra_args,
            dragstr_model = dragstr_model,
            make_bamout = make_bamout,
            make_permutect_training_dataset = true,
            make_permutect_test_dataset = true,
            permutect_test_dataset_truth_vcf = test_dataset_truth_vcf,
            permutect_test_dataset_truth_vcf_idx = test_dataset_truth_vcf_idx,
            skip_filtering = skip_m2_filtering,
            gatk_docker = gatk_docker,
            gatk_override = gatk_override,
            scatter_count = scatter_count,
            preemptible = preemptible,
            max_retries = max_retries,
            small_task_cpu = small_task_cpu,
            small_task_mem = small_task_mem,
            small_task_disk = small_task_disk,
            boot_disk_size = boot_disk_size,
            gcs_project_for_requester_pays = gcs_project_for_requester_pays,
            emergency_extra_disk = emergency_extra_disk
    }

    # preprocess the training data from Mutect2
    call Preprocess {
        input:
            training_dataset = select_first([Mutect2.permutect_training_dataset]),
            chunk_size = chunk_size,
            permutect_docker = permutect_docker
    }

    # combine source_train_tar and the preprocessed training data into a UDA dataset
    call uda.PermutectUDADataset {
        input:
            source_train_tar = source_train_tar,
            target_train_tar = Preprocess.train_tar,
            source_edit_type = source_edit_type,
            target_edit_type = target_edit_type,
            chunk_size = chunk_size,
            permutect_docker = permutect_docker,
            preemptible = 0,
            max_retries = 0
    }

    # train an artifact model on the UDA dataset
    call training.TrainPermutect {
        input:
            train_tar = PermutectUDADataset.uda_train_tar,
            base_model = base_model,
            num_epochs = num_epochs,
            num_calibration_epochs = num_calibration_epochs,
            batch_size = batch_size,
            inference_batch_size = inference_batch_size,
            num_workers = num_workers,
            mem = training_mem,
            use_gpu = use_gpu,
            gpu_count = gpu_count,
            dropout_p = dropout_p,
            aggregation_layers = aggregation_layers,
            calibration_layers = calibration_layers,
            train_m3_extra_args = training_extra_args,
            learn_artifact_spectra = learn_artifact_spectra,
            genomic_span = genomic_span,
            permutect_docker = permutect_docker,
            preemptible = 0,
            max_retries = 0
    }

    # we already ran M2, so we don't need the entire calling workflow, just the post-M2 parts of it
    call calling.SplitMultiallelics {
        input:
            input_vcf = Mutect2.output_vcf,
            input_vcf_idx = Mutect2.output_vcf_idx,
            ref_fasta = ref_fasta,
            ref_fai = ref_fai,
            ref_dict = ref_dict,
            bcftools_docker = bcftools_docker
    }

    call calling.IndexVCF as IndexAfterSplitting {
        input:
            unindexed_vcf = SplitMultiallelics.output_vcf,
            gatk_docker = gatk_docker
    }

    if (use_gpu) {
        call calling.PermutectFilteringGPU {
            input:
                mutect2_vcf = IndexAfterSplitting.vcf,
                mutect2_vcf_idx = IndexAfterSplitting.vcf_index,
                permutect_model = TrainPermutect.artifact_model,
                test_dataset = select_first([Mutect2.permutect_test_dataset]),
                contigs_table = Mutect2.permutect_contigs_table,
                maf_segments = Mutect2.maf_segments,
                mutect_stats = Mutect2.mutect_stats,
                batch_size = batch_size,
                num_workers = num_workers,
                gpu_count = gpu_count,
                num_spectrum_iterations = num_spectrum_iterations,
                spectrum_learning_rate = spectrum_learning_rate,
                chunk_size = chunk_size,
                permutect_filtering_extra_args = permutect_filtering_extra_args,
                permutect_docker = permutect_docker
        }
    }

    if (!use_gpu) {
        call calling.PermutectFilteringCPU {
            input:
                mutect2_vcf = IndexAfterSplitting.vcf,
                mutect2_vcf_idx = IndexAfterSplitting.vcf_index,
                permutect_model = TrainPermutect.artifact_model,
                test_dataset = select_first([Mutect2.permutect_test_dataset]),
                contigs_table = Mutect2.permutect_contigs_table,
                maf_segments = Mutect2.maf_segments,
                mutect_stats = Mutect2.mutect_stats,
                batch_size = batch_size,
                num_workers = num_workers,
                num_spectrum_iterations = num_spectrum_iterations,
                spectrum_learning_rate = spectrum_learning_rate,
                chunk_size = chunk_size,
                permutect_filtering_extra_args = permutect_filtering_extra_args,
                permutect_docker = permutect_docker
        }
    }

    call calling.IndexVCF as IndexAfterFiltering {
        input:
            unindexed_vcf = select_first([PermutectFilteringGPU.output_vcf, PermutectFilteringCPU.output_vcf]),
            gatk_docker = gatk_docker
    }

    output {
        File? bamout = Mutect2.bamout
        File? bamout_index = Mutect2.bamout_index
        File mutect_stats = Mutect2.mutect_stats
        File permutect_contigs_table = Mutect2.permutect_contigs_table
        File permutect_read_groups_table = Mutect2.permutect_read_groups_table
        File train_tar = Preprocess.train_tar
        File training_tensorboard_tar = TrainPermutect.training_tensorboard_tar
        File output_vcf = IndexAfterFiltering.vcf
        File output_vcf_idx = IndexAfterFiltering.vcf_index
        File calling_tensorboard_tar = select_first([PermutectFilteringGPU.tensorboard_report, PermutectFilteringCPU.tensorboard_report])
    }
}

task Preprocess {
    input {
        File training_dataset
        Int chunk_size
        Int? source_label

        String permutect_docker
        Int? preemptible
        Int? max_retries
        Int? disk_space
        Int? cpu
        Int? mem
        Boolean use_ssd = true
    }

    # mem is given in GB, but the command and runtime memory values below are in MB
    # select_first unwraps the optional Int so the arithmetic is valid under strict WDL parsers
    Int machine_mem = if defined(mem) then select_first([mem]) * 1000 else 16000
    Int command_mem = machine_mem - 500

    command <<<
        set -e

        preprocess_dataset --training_datasets ~{training_dataset} --chunk_size ~{chunk_size} ~{"--sources " + source_label} --output train.tar
    >>>

    runtime {
        docker: permutect_docker
        bootDiskSizeGb: 12
        memory: machine_mem + " MB"
        disks: "local-disk " + select_first([disk_space, 100]) + (if use_ssd then " SSD" else " HDD")
        preemptible: select_first([preemptible, 2])
        maxRetries: select_first([max_retries, 0])
        cpu: select_first([cpu, 1])
    }

    output {
        File train_tar = "train.tar"
    }
}
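
For orientation (not part of this commit), the workflow's required inputs can be assembled into a Cromwell-style inputs file. A minimal sketch in Python follows; every path, numeric value, and docker tag is a placeholder, and optional inputs such as intervals, gnomad, and the contamination resources are omitted.

import json

# Hypothetical minimal inputs for CallVariantsWithUDA; input names come from
# the workflow's input block, values are placeholders only.
inputs = {
    "CallVariantsWithUDA.ref_fasta": "gs://my-bucket/ref.fasta",
    "CallVariantsWithUDA.ref_fai": "gs://my-bucket/ref.fasta.fai",
    "CallVariantsWithUDA.ref_dict": "gs://my-bucket/ref.dict",
    "CallVariantsWithUDA.primary_bam": "gs://my-bucket/tumor.bam",
    "CallVariantsWithUDA.primary_bai": "gs://my-bucket/tumor.bai",
    # skip M2 filtering so artifact model training can begin immediately
    "CallVariantsWithUDA.skip_m2_filtering": True,
    "CallVariantsWithUDA.chunk_size": 100000,
    "CallVariantsWithUDA.batch_size": 64,
    "CallVariantsWithUDA.inference_batch_size": 64,
    "CallVariantsWithUDA.num_workers": 2,
    "CallVariantsWithUDA.base_model": "gs://my-bucket/base_model.pt",
    "CallVariantsWithUDA.source_train_tar": "gs://my-bucket/source_train.tar",
    "CallVariantsWithUDA.num_epochs": 10,
    "CallVariantsWithUDA.num_calibration_epochs": 2,
    "CallVariantsWithUDA.dropout_p": 0.0,
    "CallVariantsWithUDA.aggregation_layers": [64, 64],
    "CallVariantsWithUDA.calibration_layers": [16],
    "CallVariantsWithUDA.learn_artifact_spectra": True,
    "CallVariantsWithUDA.scatter_count": 50,
    "CallVariantsWithUDA.gatk_docker": "us.gcr.io/my-project/gatk",           # placeholder tag
    "CallVariantsWithUDA.permutect_docker": "us.gcr.io/my-project/permutect", # placeholder tag
}

with open("uda_inputs.json", "w") as f:
    json.dump(inputs, f, indent=2)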