From aea7312905dedd0b88ba8229b1a02a9bd37793f5 Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Wed, 13 Dec 2023 13:16:18 -0800 Subject: [PATCH 01/31] change to Dataset as in brain segmentation bundle, add support for amp, add validate in train_autoencoder.json Signed-off-by: Can-Zhao --- .../configs/train_autoencoder.json | 135 ++++++++++++++---- .../scripts/ldm_trainer.py | 8 +- .../scripts/prepare_datalist.py | 72 ++++++++++ 3 files changed, 186 insertions(+), 29 deletions(-) create mode 100644 models/brats_mri_generative_diffusion/scripts/prepare_datalist.py diff --git a/models/brats_mri_generative_diffusion/configs/train_autoencoder.json b/models/brats_mri_generative_diffusion/configs/train_autoencoder.json index ddaa8c41..e7be6686 100644 --- a/models/brats_mri_generative_diffusion/configs/train_autoencoder.json +++ b/models/brats_mri_generative_diffusion/configs/train_autoencoder.json @@ -2,21 +2,29 @@ "imports": [ "$import functools", "$import glob", - "$import scripts" + "$import scripts", + "$import generative" ], "bundle_root": ".", "device": "$torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')", "ckpt_dir": "$@bundle_root + '/models'", "tf_dir": "$@bundle_root + '/eval'", - "dataset_dir": "/workspace/data/medical", + "data_list_file_path": "$@bundle_root + '/configs/datalist.json'", + "dataset_dir": "/datasets/brats18", + "train_datalist": "$monai.data.load_decathlon_datalist(@data_list_file_path, data_list_key='training', base_dir=@dataset_dir)", + "val_datalist": "$monai.data.load_decathlon_datalist(@data_list_file_path, data_list_key='validation', base_dir=@dataset_dir)", "pretrained": false, "perceptual_loss_model_weights_path": null, "train_batch_size": 2, - "lr": 1e-05, + "val_batch_size": 2, + "epochs": 4000, + "val_interval": 10, + "lr": 1e-04, + "amp": true, "train_patch_size": [ - 112, 128, - 80 + 128, + 128 ], "channel": 0, "spacing": [ @@ -26,7 +34,7 @@ ], "spatial_dims": 3, "image_channels": 1, - "latent_channels": 8, + "latent_channels": 4, "discriminator_def": { "_target_": "generative.networks.nets.PatchDiscriminator", "spatial_dims": "@spatial_dims", @@ -56,7 +64,9 @@ false ], "with_encoder_nonlocal_attn": false, - "with_decoder_nonlocal_attn": false + "with_decoder_nonlocal_attn": false, + "use_checkpointing": true, + "use_convtranspose": false }, "perceptual_loss_def": { "_target_": "generative.losses.PerceptualLoss", @@ -140,13 +150,8 @@ "transforms": "$@preprocessing_transforms + @train#crop_transforms + @final_transforms" }, "dataset": { - "_target_": "monai.apps.DecathlonDataset", - "root_dir": "@dataset_dir", - "task": "Task01_BrainTumour", - "section": "training", - "cache_rate": 1.0, - "num_workers": 8, - "download": false, + "_target_": "Dataset", + "data": "@train_datalist", "transform": "@train#preprocessing" }, "dataloader": { @@ -158,32 +163,33 @@ }, "handlers": [ { - "_target_": "CheckpointSaver", - "save_dir": "@ckpt_dir", - "save_dict": { - "model": "@gnetwork" - }, - "save_interval": 0, - "save_final": true, + "_target_": "ValidationHandler", + "validator": "@validate#evaluator", "epoch_level": true, - "final_filename": "model_autoencoder.pt" + "interval": "@val_interval" }, { "_target_": "StatsHandler", "tag_name": "train_loss", - "output_transform": "$lambda x: monai.handlers.from_engine(['g_loss'], first=True)(x)[0]" + "output_transform": "$lambda x: monai.handlers.from_engine(['g_loss'], first=True)(x)[0]+monai.handlers.from_engine(['d_loss'], first=True)(x)[0]" }, { "_target_": "TensorBoardStatsHandler", "log_dir": "@tf_dir", - "tag_name": "train_loss", + "tag_name": "train_generator_loss", "output_transform": "$lambda x: monai.handlers.from_engine(['g_loss'], first=True)(x)[0]" + }, + { + "_target_": "TensorBoardStatsHandler", + "log_dir": "@tf_dir", + "tag_name": "train_discriminator_loss", + "output_transform": "$lambda x: monai.handlers.from_engine(['d_loss'], first=True)(x)[0]" } ], "trainer": { "_target_": "scripts.ldm_trainer.VaeGanTrainer", "device": "@device", - "max_epochs": 1500, + "max_epochs": "@epochs", "train_data_loader": "@train#dataloader", "g_network": "@gnetwork", "g_optimizer": "@goptimizer", @@ -195,7 +201,82 @@ "g_update_latents": true, "latent_shape": "@latent_channels", "key_train_metric": "$None", - "train_handlers": "@train#handlers" + "train_handlers": "@train#handlers", + "amp": "@amp" + } + }, + "validate": { + "crop_transforms": [ + { + "_target_": "DivisiblePadd", + "keys": "image", + "k": 16 + } + ], + "preprocessing": { + "_target_": "Compose", + "transforms": "$@preprocessing_transforms + @validate#crop_transforms + @final_transforms" + }, + "dataset": { + "_target_": "Dataset", + "data": "@val_datalist", + "transform": "@validate#preprocessing" + }, + "dataloader": { + "_target_": "DataLoader", + "dataset": "@validate#dataset", + "batch_size": "@val_batch_size", + "shuffle": false, + "num_workers": 4 + }, + "postprocessing": { + "_target_": "Compose", + "transforms": [ + { + "_target_": "Lambdad", + "keys": "pred", + "func": "$lambda x: x[0]" + } + ] + }, + "handlers": [ + { + "_target_": "StatsHandler", + "iteration_log": false + }, + { + "_target_": "TensorBoardStatsHandler", + "log_dir": "@tf_dir", + "iteration_log": false + }, + { + "_target_": "CheckpointSaver", + "save_dir": "@ckpt_dir", + "save_dict": { + "model": "@gnetwork" + }, + "save_interval": 0, + "save_final": true, + "epoch_level": true, + "final_filename": "model_autoencoder.pt" + } + ], + "key_metric": { + "val_mean_l2": { + "_target_": "MeanSquaredError", + "output_transform": "$monai.handlers.from_engine(['pred', 'image'])" + } + }, + "evaluator": { + "_target_": "SupervisedEvaluator", + "device": "@device", + "val_data_loader": "@validate#dataloader", + "network": "@gnetwork", + "postprocessing": "@validate#postprocessing", + "key_val_metric": "$@validate#key_metric", + "metric_cmp_fn": "$lambda current_metric,prev_best: current_metric < prev_best", + "val_handlers": "@validate#handlers", + "amp": "@amp" } }, "initialize": [ @@ -204,4 +285,4 @@ "run": [ "$@train#trainer.run()" ] -} +} \ No newline at end of file diff --git a/models/brats_mri_generative_diffusion/scripts/ldm_trainer.py b/models/brats_mri_generative_diffusion/scripts/ldm_trainer.py index c1a21bfa..04952923 100644 --- a/models/brats_mri_generative_diffusion/scripts/ldm_trainer.py +++ b/models/brats_mri_generative_diffusion/scripts/ldm_trainer.py @@ -11,14 +11,14 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Callable, Iterable, Sequence +from typing import TYPE_CHECKING, Any, Callable, Sequence import torch from monai.config import IgniteInfo from monai.engines.utils import IterationEvents, default_metric_cmp_fn, default_prepare_batch from monai.inferers import Inferer, SimpleInferer from monai.transforms import Transform -from monai.utils import min_version, optional_import +from monai.utils import GanKeys, min_version, optional_import from monai.utils.enums import CommonKeys, GanKeys from torch.optim.optimizer import Optimizer from torch.utils.data import DataLoader @@ -81,6 +81,7 @@ class VaeGanTrainer(Trainer): `best_metric` and `best_metric_epoch` with current metric and epoch, default to `greater than`. train_handlers: every handler is a set of Ignite Event-Handlers, must have `attach` function, like: CheckpointHandler, StatsHandler, etc. + amp: whether to enable auto-mixed-precision training, default is False. decollate: whether to decollate the batch-first data to a list of data after model computation, recommend `decollate=True` when `postprocessing` uses components from `monai.transforms`. default to `True`. @@ -118,6 +119,7 @@ def __init__( additional_metrics: dict[str, Metric] | None = None, metric_cmp_fn: Callable = default_metric_cmp_fn, train_handlers: Sequence | None = None, + amp: bool = False, decollate: bool = True, optim_set_to_none: bool = False, to_kwargs: dict | None = None, @@ -139,6 +141,7 @@ def __init__( additional_metrics=additional_metrics, metric_cmp_fn=metric_cmp_fn, handlers=train_handlers, + amp=amp, postprocessing=postprocessing, decollate=decollate, to_kwargs=to_kwargs, @@ -173,6 +176,7 @@ def _iteration( raise ValueError("must provide batch data for current iteration.") d_input = engine.prepare_batch(batchdata, engine.state.device, engine.non_blocking, **engine.to_kwargs)[0] + batch_size = engine.data_loader.batch_size # type: ignore g_input = d_input g_output, z_mu, z_sigma = engine.g_inferer(g_input, engine.g_network) diff --git a/models/brats_mri_generative_diffusion/scripts/prepare_datalist.py b/models/brats_mri_generative_diffusion/scripts/prepare_datalist.py new file mode 100644 index 00000000..e48edbb9 --- /dev/null +++ b/models/brats_mri_generative_diffusion/scripts/prepare_datalist.py @@ -0,0 +1,72 @@ +import argparse +import glob +import json +import os + +import monai +from sklearn.model_selection import train_test_split + + +def produce_sample_dict(line: str): + names = os.listdir(line) + seg, t1ce, t1, t2, flair = [], [], [], [], [] + for name in names: + name = os.path.join(line, name) + if "_seg.nii" in name: + seg.append(name) + elif "_t1ce.nii" in name: + t1ce.append(name) + elif "_t1.nii" in name: + t1.append(name) + elif "_t2.nii" in name: + t2.append(name) + elif "_flair.nii" in name: + flair.append(name) + + return {"label": seg[0], "image": t1ce + t1 + t2 + flair} + + +def produce_datalist(dataset_dir: str, train_size: int = 200): + """ + This function is used to split the dataset. + It will produce "train_size" number of samples for training, and the other samples + are divided equally into val and test sets. + """ + + samples = sorted(glob.glob(os.path.join(dataset_dir, "*", "*"), recursive=True)) + datalist = [] + for line in samples: + datalist.append(produce_sample_dict(line)) + train_list, other_list = train_test_split(datalist, train_size=train_size) + val_list, test_list = train_test_split(other_list, train_size=0.5) + + return {"training": train_list, "validation": val_list, "testing": test_list} + + +def main(args): + """ + split the dataset and output the data list into a json file. + """ + data_file_base_dir = os.path.join(os.path.abspath(args.path), "training") + # produce deterministic data splits + monai.utils.set_determinism(seed=123) + datalist = produce_datalist(dataset_dir=data_file_base_dir, train_size=args.train_size) + with open(args.output, "w") as f: + json.dump(datalist, f) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="") + parser.add_argument( + "--path", + type=str, + default="/workspace/data/medical/brats2018challenge", + help="root path of brats 2018 dataset.", + ) + parser.add_argument( + "--output", type=str, default="configs/datalist.json", help="relative path of output datalist json file." + ) + parser.add_argument("--train_size", type=int, default=200, help="number of training samples.") + args = parser.parse_args() + + main(args) From bc2be5af03348adbbfb43dfab6680e63b2576dd4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 13 Dec 2023 21:17:37 +0000 Subject: [PATCH 02/31] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../configs/train_autoencoder.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models/brats_mri_generative_diffusion/configs/train_autoencoder.json b/models/brats_mri_generative_diffusion/configs/train_autoencoder.json index e7be6686..dc27eede 100644 --- a/models/brats_mri_generative_diffusion/configs/train_autoencoder.json +++ b/models/brats_mri_generative_diffusion/configs/train_autoencoder.json @@ -19,7 +19,7 @@ "val_batch_size": 2, "epochs": 4000, "val_interval": 10, - "lr": 1e-04, + "lr": 0.0001, "amp": true, "train_patch_size": [ 128, @@ -285,4 +285,4 @@ "run": [ "$@train#trainer.run()" ] -} \ No newline at end of file +} From 54bf0d29665ce126c339a08d074daf0941e458c6 Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Wed, 13 Dec 2023 13:20:06 -0800 Subject: [PATCH 03/31] typo Signed-off-by: Can-Zhao --- .../configs/train_autoencoder.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models/brats_mri_generative_diffusion/configs/train_autoencoder.json b/models/brats_mri_generative_diffusion/configs/train_autoencoder.json index e7be6686..5cd49721 100644 --- a/models/brats_mri_generative_diffusion/configs/train_autoencoder.json +++ b/models/brats_mri_generative_diffusion/configs/train_autoencoder.json @@ -206,7 +206,7 @@ } }, "validate": { - "crop_transforms": [ + "croppad_transforms": [ { "_target_": "DivisiblePadd", "keys": "image", @@ -215,7 +215,7 @@ ], "preprocessing": { "_target_": "Compose", - "transforms": "$@preprocessing_transforms + @validate#crop_transforms + @final_transforms" + "transforms": "$@preprocessing_transforms + @validate#croppad_transforms + @final_transforms" }, "dataset": { "_target_": "Dataset", From 9d2d2f976aca97b86c4fe9e0eaa51a9b0310dd84 Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Wed, 13 Dec 2023 14:38:43 -0800 Subject: [PATCH 04/31] update train-diffusion.json Signed-off-by: Can-Zhao --- .../configs/train_diffusion.json | 73 +++++++------------ 1 file changed, 28 insertions(+), 45 deletions(-) diff --git a/models/brats_mri_generative_diffusion/configs/train_diffusion.json b/models/brats_mri_generative_diffusion/configs/train_diffusion.json index 85c8ca8a..441b20ee 100644 --- a/models/brats_mri_generative_diffusion/configs/train_diffusion.json +++ b/models/brats_mri_generative_diffusion/configs/train_diffusion.json @@ -1,55 +1,42 @@ { "ckpt_dir": "$@bundle_root + '/models'", "train_batch_size": 4, - "lr": 1e-05, "train_patch_size": [ - 144, - 176, - 112 + 192, + 192, + 128 ], "latent_shape": [ "@latent_channels", - 36, - 44, - 28 + 48, + 48, + 32 ], "load_autoencoder_path": "$@bundle_root + '/models/model_autoencoder.pt'", "load_autoencoder": "$@autoencoder_def.load_state_dict(torch.load(@load_autoencoder_path))", "autoencoder": "$@autoencoder_def.to(@device)", - "network_def": { + "diffusion_def": { "_target_": "generative.networks.nets.DiffusionModelUNet", "spatial_dims": "@spatial_dims", "in_channels": "@latent_channels", "out_channels": "@latent_channels", - "num_channels": [ - 256, - 256, - 512 - ], - "attention_levels": [ - false, - true, - true - ], - "num_head_channels": [ - 0, - 64, - 64 - ], - "num_res_blocks": 2 + "num_channels":[128, 256, 512], + "attention_levels":[false, true, true], + "num_head_channels":[0, 32, 32], + "num_res_blocks": 2, + "use_flash_attention": true }, - "diffusion": "$@network_def.to(@device)", + "diffusion": "$@diffusion_def.to(@device)", "optimizer": { "_target_": "torch.optim.Adam", "params": "$@diffusion.parameters()", - "lr": "@lr" + "lr": 1e-04 }, "lr_scheduler": { "_target_": "torch.optim.lr_scheduler.MultiStepLR", "optimizer": "@optimizer", "milestones": [ - 100, - 1000 + 5000 ], "gamma": 0.1 }, @@ -59,20 +46,20 @@ "_requires_": [ "@load_autoencoder" ], - "schedule": "scaled_linear_beta", + "schedule": "linear_beta", "num_train_timesteps": 1000, "beta_start": 0.0015, "beta_end": 0.0195 }, + "inferer": { + "_target_": "generative.inferers.LatentDiffusionInferer", + "scheduler": "@noise_scheduler", + "scale_factor": "@scale_factor" + }, "loss": { "_target_": "torch.nn.MSELoss" }, "train": { - "inferer": { - "_target_": "generative.inferers.LatentDiffusionInferer", - "scheduler": "@noise_scheduler", - "scale_factor": "@scale_factor" - }, "crop_transforms": [ { "_target_": "CenterSpatialCropd", @@ -85,13 +72,8 @@ "transforms": "$@preprocessing_transforms + @train#crop_transforms + @final_transforms" }, "dataset": { - "_target_": "monai.apps.DecathlonDataset", - "root_dir": "@dataset_dir", - "task": "Task01_BrainTumour", - "section": "training", - "cache_rate": 1.0, - "num_workers": 8, - "download": false, + "_target_": "Dataset", + "data": "@train_datalist", "transform": "@train#preprocessing" }, "dataloader": { @@ -116,7 +98,7 @@ "save_interval": 0, "save_final": true, "epoch_level": true, - "final_filename": "model.pt" + "final_filename": "model_ldm.pt" }, { "_target_": "StatsHandler", @@ -127,20 +109,21 @@ "_target_": "TensorBoardStatsHandler", "log_dir": "@tf_dir", "tag_name": "train_diffusion_loss", - "output_transform": "$lambda x: monai.handlers.from_engine(['loss'], first=True)(x)" + "output_transform": "$lambda x: monai.handlers.from_engine(['loss'], first=True)(x)", + "iteration_log":false } ], "trainer": { "_target_": "scripts.ldm_trainer.LDMTrainer", "device": "@device", - "max_epochs": 5000, + "max_epochs": 10000, "train_data_loader": "@train#dataloader", "network": "@diffusion", "autoencoder_model": "@autoencoder", "optimizer": "@optimizer", "loss_function": "@loss", "latent_shape": "@latent_shape", - "inferer": "@train#inferer", + "inferer": "@inferer", "key_train_metric": "$None", "train_handlers": "@train#handlers" } From d51159e36452773b726589b2ed37d372d14f0658 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 13 Dec 2023 22:40:04 +0000 Subject: [PATCH 05/31] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../configs/train_diffusion.json | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/models/brats_mri_generative_diffusion/configs/train_diffusion.json b/models/brats_mri_generative_diffusion/configs/train_diffusion.json index 441b20ee..8e82141d 100644 --- a/models/brats_mri_generative_diffusion/configs/train_diffusion.json +++ b/models/brats_mri_generative_diffusion/configs/train_diffusion.json @@ -20,9 +20,21 @@ "spatial_dims": "@spatial_dims", "in_channels": "@latent_channels", "out_channels": "@latent_channels", - "num_channels":[128, 256, 512], - "attention_levels":[false, true, true], - "num_head_channels":[0, 32, 32], + "num_channels": [ + 128, + 256, + 512 + ], + "attention_levels": [ + false, + true, + true + ], + "num_head_channels": [ + 0, + 32, + 32 + ], "num_res_blocks": 2, "use_flash_attention": true }, @@ -30,7 +42,7 @@ "optimizer": { "_target_": "torch.optim.Adam", "params": "$@diffusion.parameters()", - "lr": 1e-04 + "lr": 0.0001 }, "lr_scheduler": { "_target_": "torch.optim.lr_scheduler.MultiStepLR", @@ -110,7 +122,7 @@ "log_dir": "@tf_dir", "tag_name": "train_diffusion_loss", "output_transform": "$lambda x: monai.handlers.from_engine(['loss'], first=True)(x)", - "iteration_log":false + "iteration_log": false } ], "trainer": { From 18033f752792fa2ca553807557d8878d255c528e Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Wed, 13 Dec 2023 14:52:38 -0800 Subject: [PATCH 06/31] update train-diffusion.json Signed-off-by: Can-Zhao --- .../configs/train_diffusion.json | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/models/brats_mri_generative_diffusion/configs/train_diffusion.json b/models/brats_mri_generative_diffusion/configs/train_diffusion.json index 8e82141d..bffaec5c 100644 --- a/models/brats_mri_generative_diffusion/configs/train_diffusion.json +++ b/models/brats_mri_generative_diffusion/configs/train_diffusion.json @@ -1,6 +1,7 @@ { "ckpt_dir": "$@bundle_root + '/models'", "train_batch_size": 4, + "lr": 5e-05, "train_patch_size": [ 192, 192, @@ -15,7 +16,7 @@ "load_autoencoder_path": "$@bundle_root + '/models/model_autoencoder.pt'", "load_autoencoder": "$@autoencoder_def.load_state_dict(torch.load(@load_autoencoder_path))", "autoencoder": "$@autoencoder_def.to(@device)", - "diffusion_def": { + "network_def": { "_target_": "generative.networks.nets.DiffusionModelUNet", "spatial_dims": "@spatial_dims", "in_channels": "@latent_channels", @@ -38,11 +39,11 @@ "num_res_blocks": 2, "use_flash_attention": true }, - "diffusion": "$@diffusion_def.to(@device)", + "diffusion": "$@network_def.to(@device)", "optimizer": { "_target_": "torch.optim.Adam", "params": "$@diffusion.parameters()", - "lr": 0.0001 + "lr": "@lr" }, "lr_scheduler": { "_target_": "torch.optim.lr_scheduler.MultiStepLR", @@ -63,15 +64,15 @@ "beta_start": 0.0015, "beta_end": 0.0195 }, - "inferer": { - "_target_": "generative.inferers.LatentDiffusionInferer", - "scheduler": "@noise_scheduler", - "scale_factor": "@scale_factor" - }, "loss": { "_target_": "torch.nn.MSELoss" }, "train": { + "inferer": { + "_target_": "generative.inferers.LatentDiffusionInferer", + "scheduler": "@noise_scheduler", + "scale_factor": "@scale_factor" + }, "crop_transforms": [ { "_target_": "CenterSpatialCropd", @@ -110,7 +111,7 @@ "save_interval": 0, "save_final": true, "epoch_level": true, - "final_filename": "model_ldm.pt" + "final_filename": "model.pt" }, { "_target_": "StatsHandler", @@ -121,8 +122,7 @@ "_target_": "TensorBoardStatsHandler", "log_dir": "@tf_dir", "tag_name": "train_diffusion_loss", - "output_transform": "$lambda x: monai.handlers.from_engine(['loss'], first=True)(x)", - "iteration_log": false + "output_transform": "$lambda x: monai.handlers.from_engine(['loss'], first=True)(x)" } ], "trainer": { @@ -135,9 +135,10 @@ "optimizer": "@optimizer", "loss_function": "@loss", "latent_shape": "@latent_shape", - "inferer": "@inferer", + "inferer": "@train#inferer", "key_train_metric": "$None", - "train_handlers": "@train#handlers" + "train_handlers": "@train#handlers", + "amp": "@amp" } }, "initialize": [ @@ -149,4 +150,4 @@ "$print('scale factor:',@scale_factor)", "$@train#trainer.run()" ] -} +} \ No newline at end of file From 466c3c95a486778a13764a227f44176c8fc3994e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 13 Dec 2023 22:54:05 +0000 Subject: [PATCH 07/31] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../brats_mri_generative_diffusion/configs/train_diffusion.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/brats_mri_generative_diffusion/configs/train_diffusion.json b/models/brats_mri_generative_diffusion/configs/train_diffusion.json index bffaec5c..08828bec 100644 --- a/models/brats_mri_generative_diffusion/configs/train_diffusion.json +++ b/models/brats_mri_generative_diffusion/configs/train_diffusion.json @@ -150,4 +150,4 @@ "$print('scale factor:',@scale_factor)", "$@train#trainer.run()" ] -} \ No newline at end of file +} From ad7a3a5d6b2526ebde4da8d7db2bf3f6c512fc13 Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Wed, 13 Dec 2023 15:04:05 -0800 Subject: [PATCH 08/31] update train_autoencoder.json and inference.json Signed-off-by: Can-Zhao --- .../configs/inference.json | 31 ++++++++++--------- .../configs/train_autoencoder.json | 17 ++++++---- .../configs/train_diffusion.json | 2 +- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/models/brats_mri_generative_diffusion/configs/inference.json b/models/brats_mri_generative_diffusion/configs/inference.json index 22767e98..1f2c9e57 100644 --- a/models/brats_mri_generative_diffusion/configs/inference.json +++ b/models/brats_mri_generative_diffusion/configs/inference.json @@ -12,12 +12,12 @@ "output_postfix": "$datetime.now().strftime('sample_%Y%m%d_%H%M%S')", "spatial_dims": 3, "image_channels": 1, - "latent_channels": 8, - "latent_shape": [ - 8, - 36, - 44, - 28 + "latent_channels": 4, + "latent_shape": [ + "@latent_channels", + 48, + 48, + 32 ], "autoencoder_def": { "_target_": "generative.networks.nets.AutoencoderKL", @@ -39,7 +39,9 @@ false ], "with_encoder_nonlocal_attn": false, - "with_decoder_nonlocal_attn": false + "with_decoder_nonlocal_attn": false, + "use_checkpointing": true, + "use_convtranspose": false }, "network_def": { "_target_": "generative.networks.nets.DiffusionModelUNet", @@ -47,7 +49,7 @@ "in_channels": "@latent_channels", "out_channels": "@latent_channels", "num_channels": [ - 256, + 128, 256, 512 ], @@ -58,10 +60,11 @@ ], "num_head_channels": [ 0, - 64, - 64 + 32, + 32 ], - "num_res_blocks": 2 + "num_res_blocks": 2, + "use_flash_attention": true }, "load_autoencoder_path": "$@bundle_root + '/models/model_autoencoder.pt'", "load_autoencoder": "$@autoencoder_def.load_state_dict(torch.load(@load_autoencoder_path))", @@ -70,15 +73,15 @@ "load_diffusion": "$@network_def.load_state_dict(torch.load(@load_diffusion_path))", "diffusion": "$@network_def.to(@device)", "noise_scheduler": { - "_target_": "generative.networks.schedulers.DDIMScheduler", + "_target_": "generative.networks.schedulers.DDPMScheduler", "_requires_": [ "@load_diffusion", "@load_autoencoder" ], + "schedule": "scaled_linear_beta", "num_train_timesteps": 1000, "beta_start": 0.0015, "beta_end": 0.0195, - "schedule": "scaled_linear_beta", "clip_sample": false }, "noise": "$torch.randn([1]+@latent_shape).to(@device)", @@ -98,4 +101,4 @@ "run": [ "$@saver(@generated_image[0])" ] -} +} \ No newline at end of file diff --git a/models/brats_mri_generative_diffusion/configs/train_autoencoder.json b/models/brats_mri_generative_diffusion/configs/train_autoencoder.json index 840dc360..6e326a04 100644 --- a/models/brats_mri_generative_diffusion/configs/train_autoencoder.json +++ b/models/brats_mri_generative_diffusion/configs/train_autoencoder.json @@ -19,13 +19,18 @@ "val_batch_size": 2, "epochs": 4000, "val_interval": 10, - "lr": 0.0001, + "lr": 1e-04, "amp": true, "train_patch_size": [ 128, 128, 128 ], + "val_patch_size": [ + 192, + 192, + 128 + ], "channel": 0, "spacing": [ 1.1, @@ -206,16 +211,16 @@ } }, "validate": { - "croppad_transforms": [ + "crop_transforms": [ { - "_target_": "DivisiblePadd", + "_target_": "CenterSpatialCropd", "keys": "image", - "k": 16 + "roi_size": "@val_patch_size" } ], "preprocessing": { "_target_": "Compose", - "transforms": "$@preprocessing_transforms + @validate#croppad_transforms + @final_transforms" + "transforms": "$@preprocessing_transforms + @validate#crop_transforms + @final_transforms" }, "dataset": { "_target_": "Dataset", @@ -285,4 +290,4 @@ "run": [ "$@train#trainer.run()" ] -} +} \ No newline at end of file diff --git a/models/brats_mri_generative_diffusion/configs/train_diffusion.json b/models/brats_mri_generative_diffusion/configs/train_diffusion.json index bffaec5c..41e4b402 100644 --- a/models/brats_mri_generative_diffusion/configs/train_diffusion.json +++ b/models/brats_mri_generative_diffusion/configs/train_diffusion.json @@ -59,7 +59,7 @@ "_requires_": [ "@load_autoencoder" ], - "schedule": "linear_beta", + "schedule": "scaled_linear_beta", "num_train_timesteps": 1000, "beta_start": 0.0015, "beta_end": 0.0195 From e8b6a029f41bd21b0846a8e2162d30b3de937520 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 13 Dec 2023 23:04:35 +0000 Subject: [PATCH 09/31] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- models/brats_mri_generative_diffusion/configs/inference.json | 4 ++-- .../configs/train_autoencoder.json | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/models/brats_mri_generative_diffusion/configs/inference.json b/models/brats_mri_generative_diffusion/configs/inference.json index 1f2c9e57..811d51fd 100644 --- a/models/brats_mri_generative_diffusion/configs/inference.json +++ b/models/brats_mri_generative_diffusion/configs/inference.json @@ -13,7 +13,7 @@ "spatial_dims": 3, "image_channels": 1, "latent_channels": 4, - "latent_shape": [ + "latent_shape": [ "@latent_channels", 48, 48, @@ -101,4 +101,4 @@ "run": [ "$@saver(@generated_image[0])" ] -} \ No newline at end of file +} diff --git a/models/brats_mri_generative_diffusion/configs/train_autoencoder.json b/models/brats_mri_generative_diffusion/configs/train_autoencoder.json index 6e326a04..ab179c95 100644 --- a/models/brats_mri_generative_diffusion/configs/train_autoencoder.json +++ b/models/brats_mri_generative_diffusion/configs/train_autoencoder.json @@ -19,7 +19,7 @@ "val_batch_size": 2, "epochs": 4000, "val_interval": 10, - "lr": 1e-04, + "lr": 0.0001, "amp": true, "train_patch_size": [ 128, @@ -290,4 +290,4 @@ "run": [ "$@train#trainer.run()" ] -} \ No newline at end of file +} From 4c8646b1653feb13d77031c33f1556c24d0cdc98 Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Wed, 13 Dec 2023 15:05:09 -0800 Subject: [PATCH 10/31] typo Signed-off-by: Can-Zhao --- models/brats_mri_generative_diffusion/configs/inference.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/brats_mri_generative_diffusion/configs/inference.json b/models/brats_mri_generative_diffusion/configs/inference.json index 1f2c9e57..aa3f3659 100644 --- a/models/brats_mri_generative_diffusion/configs/inference.json +++ b/models/brats_mri_generative_diffusion/configs/inference.json @@ -13,7 +13,7 @@ "spatial_dims": 3, "image_channels": 1, "latent_channels": 4, - "latent_shape": [ + "latent_shape": [ "@latent_channels", 48, 48, From ce9bdcddf6e55ea764184ae2a22aa88b65e07e7a Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Wed, 13 Dec 2023 15:13:17 -0800 Subject: [PATCH 11/31] typo Signed-off-by: Can-Zhao --- .../brats_mri_generative_diffusion/configs/train_diffusion.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/brats_mri_generative_diffusion/configs/train_diffusion.json b/models/brats_mri_generative_diffusion/configs/train_diffusion.json index 00f43153..5bb55a7c 100644 --- a/models/brats_mri_generative_diffusion/configs/train_diffusion.json +++ b/models/brats_mri_generative_diffusion/configs/train_diffusion.json @@ -1,7 +1,7 @@ { "ckpt_dir": "$@bundle_root + '/models'", "train_batch_size": 4, - "lr": 5e-05, + "lr": 2e-05, "train_patch_size": [ 192, 192, From a6d7d851012778f6a14c3f10534f5921e7b83880 Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Wed, 13 Dec 2023 15:14:25 -0800 Subject: [PATCH 12/31] typo Signed-off-by: Can-Zhao --- .../configs/train_autoencoder.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models/brats_mri_generative_diffusion/configs/train_autoencoder.json b/models/brats_mri_generative_diffusion/configs/train_autoencoder.json index ab179c95..8e4f8864 100644 --- a/models/brats_mri_generative_diffusion/configs/train_autoencoder.json +++ b/models/brats_mri_generative_diffusion/configs/train_autoencoder.json @@ -19,7 +19,7 @@ "val_batch_size": 2, "epochs": 4000, "val_interval": 10, - "lr": 0.0001, + "lr": 2e-05, "amp": true, "train_patch_size": [ 128, @@ -290,4 +290,4 @@ "run": [ "$@train#trainer.run()" ] -} +} \ No newline at end of file From c94a50a755208e3591fb83b02acf4a88de12ce70 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 13 Dec 2023 23:14:35 +0000 Subject: [PATCH 13/31] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../configs/train_autoencoder.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/brats_mri_generative_diffusion/configs/train_autoencoder.json b/models/brats_mri_generative_diffusion/configs/train_autoencoder.json index 8e4f8864..4526bb6b 100644 --- a/models/brats_mri_generative_diffusion/configs/train_autoencoder.json +++ b/models/brats_mri_generative_diffusion/configs/train_autoencoder.json @@ -290,4 +290,4 @@ "run": [ "$@train#trainer.run()" ] -} \ No newline at end of file +} From 2d4f8a7d232147582b7704e1e662cf0962dd18ec Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Wed, 13 Dec 2023 15:15:32 -0800 Subject: [PATCH 14/31] typo Signed-off-by: Can-Zhao --- .../configs/train_autoencoder.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/brats_mri_generative_diffusion/configs/train_autoencoder.json b/models/brats_mri_generative_diffusion/configs/train_autoencoder.json index 8e4f8864..033e7fb9 100644 --- a/models/brats_mri_generative_diffusion/configs/train_autoencoder.json +++ b/models/brats_mri_generative_diffusion/configs/train_autoencoder.json @@ -17,7 +17,7 @@ "perceptual_loss_model_weights_path": null, "train_batch_size": 2, "val_batch_size": 2, - "epochs": 4000, + "epochs": 8000, "val_interval": 10, "lr": 2e-05, "amp": true, From 3536210a86df40407dbf19981280a07fe251431d Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Wed, 13 Dec 2023 15:18:31 -0800 Subject: [PATCH 15/31] typo Signed-off-by: Can-Zhao --- .../brats_mri_generative_diffusion/configs/inference.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/models/brats_mri_generative_diffusion/configs/inference.json b/models/brats_mri_generative_diffusion/configs/inference.json index 811d51fd..a81aeb0d 100644 --- a/models/brats_mri_generative_diffusion/configs/inference.json +++ b/models/brats_mri_generative_diffusion/configs/inference.json @@ -73,15 +73,15 @@ "load_diffusion": "$@network_def.load_state_dict(torch.load(@load_diffusion_path))", "diffusion": "$@network_def.to(@device)", "noise_scheduler": { - "_target_": "generative.networks.schedulers.DDPMScheduler", + "_target_": "generative.networks.schedulers.DDIMScheduler", "_requires_": [ "@load_diffusion", "@load_autoencoder" - ], - "schedule": "scaled_linear_beta", + ], "num_train_timesteps": 1000, "beta_start": 0.0015, "beta_end": 0.0195, + "schedule": "scaled_linear_beta", "clip_sample": false }, "noise": "$torch.randn([1]+@latent_shape).to(@device)", @@ -101,4 +101,4 @@ "run": [ "$@saver(@generated_image[0])" ] -} +} \ No newline at end of file From 29fc950f4f695e80cb52e11a77da8fa81b333a13 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 13 Dec 2023 23:18:48 +0000 Subject: [PATCH 16/31] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- models/brats_mri_generative_diffusion/configs/inference.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models/brats_mri_generative_diffusion/configs/inference.json b/models/brats_mri_generative_diffusion/configs/inference.json index a81aeb0d..d4952061 100644 --- a/models/brats_mri_generative_diffusion/configs/inference.json +++ b/models/brats_mri_generative_diffusion/configs/inference.json @@ -77,7 +77,7 @@ "_requires_": [ "@load_diffusion", "@load_autoencoder" - ], + ], "num_train_timesteps": 1000, "beta_start": 0.0015, "beta_end": 0.0195, @@ -101,4 +101,4 @@ "run": [ "$@saver(@generated_image[0])" ] -} \ No newline at end of file +} From e6af5f828b79b9caf878cf56eba4bcb6834a15d5 Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Wed, 13 Dec 2023 15:19:08 -0800 Subject: [PATCH 17/31] typo Signed-off-by: Can-Zhao --- models/brats_mri_generative_diffusion/configs/inference.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/brats_mri_generative_diffusion/configs/inference.json b/models/brats_mri_generative_diffusion/configs/inference.json index a81aeb0d..328e9f7e 100644 --- a/models/brats_mri_generative_diffusion/configs/inference.json +++ b/models/brats_mri_generative_diffusion/configs/inference.json @@ -77,7 +77,7 @@ "_requires_": [ "@load_diffusion", "@load_autoencoder" - ], + ], "num_train_timesteps": 1000, "beta_start": 0.0015, "beta_end": 0.0195, From 703e92dc2d591a5593f8ac7b9fca6d9e16040600 Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Wed, 13 Dec 2023 15:25:48 -0800 Subject: [PATCH 18/31] update readme Signed-off-by: Can-Zhao --- .../docs/README.md | 43 +++++++++++-------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/models/brats_mri_generative_diffusion/docs/README.md b/models/brats_mri_generative_diffusion/docs/README.md index 1c01d861..444db62e 100644 --- a/models/brats_mri_generative_diffusion/docs/README.md +++ b/models/brats_mri_generative_diffusion/docs/README.md @@ -1,11 +1,11 @@ # Model Overview A pre-trained model for volumetric (3D) Brats MRI 3D Latent Diffusion Generative Model. -This model is trained on BraTS 2016 and 2017 data from [Medical Decathlon](http://medicaldecathlon.com/), using the Latent diffusion model [1]. +This model is trained based on BraTS 2018 data from [Multimodal Brain Tumor Segmentation Challenge (BraTS) 2018](https://www.med.upenn.edu/sbia/brats2018.html), using the Latent diffusion model [1]. ![model workflow](https://developer.download.nvidia.com/assets/Clara/Images/monai_brain_image_gen_ldm3d_network.png) -This model is a generator for creating images like the Flair MRIs based on BraTS 2016 and 2017 data. It was trained as a 3d latent diffusion model and accepts Gaussian random noise as inputs to produce an image output. The `train_autoencoder.json` file describes the training process of the variational autoencoder with GAN loss. The `train_diffusion.json` file describes the training process of the 3D latent diffusion model. +This model is a generator for creating images like the T1CE MRIs based on BraTS 2018 data. It was trained as a 3d latent diffusion model and accepts Gaussian random noise as inputs to produce an image output. The `train_autoencoder.json` file describes the training process of the variational autoencoder with GAN loss. The `train_diffusion.json` file describes the training process of the 3D latent diffusion model. In this bundle, the autoencoder uses perceptual loss, which is based on ResNet50 with pre-trained weights (the network is frozen and will not be trained in the bundle). In default, the `pretrained` parameter is specified as `False` in `train_autoencoder.json`. To ensure correct training, changing the default settings is necessary. There are two ways to utilize pretrained weights: 1. if set `pretrained` to `True`, ImageNet pretrained weights from [torchvision](https://pytorch.org/vision/stable/_modules/torchvision/models/resnet.html#ResNet50_Weights) will be used. However, the weights are for non-commercial use only. @@ -20,12 +20,21 @@ An example result from inference is shown below: **This is a demonstration network meant to just show the training process for this sort of network with MONAI. To achieve better performance, users need to use larger dataset like [Brats 2021](https://www.synapse.org/#!Synapse:syn25829067/wiki/610865) and have GPU with memory larger than 32G to enable larger networks and attention layers.** ## Data -The training data is BraTS 2016 and 2017 from the Medical Segmentation Decathalon. Users can find more details on the dataset (`Task01_BrainTumour`) at http://medicaldecathlon.com/. +The training data is from the [Multimodal Brain Tumor Segmentation Challenge (BraTS) 2018](https://www.med.upenn.edu/sbia/brats2018.html). - Target: Image Generation - Task: Synthesis - Modality: MRI -- Size: 388 3D volumes (1 channel used) +- Size: 285 3D volumes (1 channel used) + +The provided labelled data was partitioned, based on our own split, into training (200 studies), validation (42 studies) and testing (43 studies) datasets. + +### Preprocessing +The data list/split can be created with the script `scripts/prepare_datalist.py`. + +``` +python scripts/prepare_datalist.py --path your-brats18-dataset-path +``` ## Training Configuration If you have a GPU with less than 32G of memory, you may need to decrease the batch size when training. To do so, modify the `train_batch_size` parameter in the [configs/train_autoencoder.json](../configs/train_autoencoder.json) and [configs/train_diffusion.json](../configs/train_diffusion.json) configuration files. @@ -34,46 +43,42 @@ If you have a GPU with less than 32G of memory, you may need to decrease the bat The autoencoder was trained using the following configuration: - GPU: at least 32GB GPU memory -- Actual Model Input: 112 x 128 x 80 +- Actual Model Input: 128 x 128 x 128 - AMP: False - Optimizer: Adam -- Learning Rate: 1e-5 +- Learning Rate: 2e-5 - Loss: L1 loss, perceptual loss, KL divergence loss, adversarial loss, GAN BCE loss #### Input -1 channel 3D MRI Flair patches +1 channel 3D MRI T1CE patches #### Output - 1 channel 3D MRI reconstructed patches -- 8 channel mean of latent features -- 8 channel standard deviation of latent features +- 4 channel mean of latent features +- 4 channel standard deviation of latent features ### Training Configuration of Diffusion Model The latent diffusion model was trained using the following configuration: - GPU: at least 32GB GPU memory -- Actual Model Input: 36 x 44 x 28 +- Actual Model Input: 48 x 48 x 32 - AMP: False - Optimizer: Adam -- Learning Rate: 1e-5 +- Learning Rate: 2e-5 - Loss: MSE loss #### Training Input -- 8 channel noisy latent features +- 4 channel noisy latent features - a long int that indicates the time step #### Training Output -8 channel predicted added noise +4 channel predicted added noise #### Inference Input -8 channel noise +4 channel noise #### Inference Output -8 channel denoised latent features - -### Memory Consumption Warning - -If you face memory issues with data loading, you can lower the caching rate `cache_rate` in the configurations within range [0, 1] to minimize the System RAM requirements. +4 channel denoised latent features ## Performance From bdbfa100ae99f279969638dfd0e8849f6056f607 Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Wed, 13 Dec 2023 15:29:07 -0800 Subject: [PATCH 19/31] update loss weights Signed-off-by: Can-Zhao --- models/brats_mri_generative_diffusion/scripts/losses.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/brats_mri_generative_diffusion/scripts/losses.py b/models/brats_mri_generative_diffusion/scripts/losses.py index 43536067..ec95ce77 100644 --- a/models/brats_mri_generative_diffusion/scripts/losses.py +++ b/models/brats_mri_generative_diffusion/scripts/losses.py @@ -15,7 +15,7 @@ adv_loss = PatchAdversarialLoss(criterion="least_squares") adv_weight = 0.1 -perceptual_weight = 0.1 +perceptual_weight = 0.3 # kl_weight: important hyper-parameter. # If too large, decoder cannot recon good results from latent space. # If too small, latent space will not be regularized enough for the diffusion model From c40bd4f0f363f3c97ee7b282fd773ddbf4948d40 Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Wed, 13 Dec 2023 15:35:05 -0800 Subject: [PATCH 20/31] update readme Signed-off-by: Can-Zhao --- models/brats_mri_generative_diffusion/docs/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models/brats_mri_generative_diffusion/docs/README.md b/models/brats_mri_generative_diffusion/docs/README.md index 444db62e..f014184e 100644 --- a/models/brats_mri_generative_diffusion/docs/README.md +++ b/models/brats_mri_generative_diffusion/docs/README.md @@ -111,7 +111,7 @@ python -m monai.bundle run --config_file configs/train_autoencoder.json --datase To train with multiple GPUs, use the following command, which requires scaling up the learning rate according to the number of GPUs. ``` -torchrun --standalone --nnodes=1 --nproc_per_node=8 -m monai.bundle run --config_file "['configs/train_autoencoder.json','configs/multi_gpu_train_autoencoder.json']" --lr 8e-5 +torchrun --standalone --nnodes=1 --nproc_per_node=8 -m monai.bundle run --config_file "['configs/train_autoencoder.json','configs/multi_gpu_train_autoencoder.json']" --lr 1e-4 ``` #### Check the Autoencoder Training result @@ -139,7 +139,7 @@ python -m monai.bundle run --config_file "['configs/train_autoencoder.json','con To train with multiple GPUs, use the following command, which requires scaling up the learning rate according to the number of GPUs. ``` -torchrun --standalone --nnodes=1 --nproc_per_node=8 -m monai.bundle run --config_file "['configs/train_autoencoder.json','configs/train_diffusion.json','configs/multi_gpu_train_autoencoder.json','configs/multi_gpu_train_diffusion.json']" --lr 8e-5 +torchrun --standalone --nnodes=1 --nproc_per_node=8 -m monai.bundle run --config_file "['configs/train_autoencoder.json','configs/train_diffusion.json','configs/multi_gpu_train_autoencoder.json','configs/multi_gpu_train_diffusion.json']" --lr 1e-4 ``` #### Execute inference From 95e4ddfe0e29d96a8cbf2cba069d632cdc95dbfb Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Wed, 13 Dec 2023 15:37:39 -0800 Subject: [PATCH 21/31] flake Signed-off-by: Can-Zhao --- models/brats_mri_generative_diffusion/scripts/ldm_trainer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/models/brats_mri_generative_diffusion/scripts/ldm_trainer.py b/models/brats_mri_generative_diffusion/scripts/ldm_trainer.py index 04952923..a7bbd6c8 100644 --- a/models/brats_mri_generative_diffusion/scripts/ldm_trainer.py +++ b/models/brats_mri_generative_diffusion/scripts/ldm_trainer.py @@ -11,14 +11,14 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Callable, Sequence +from typing import TYPE_CHECKING, Any, Callable, Iterable, Sequence import torch from monai.config import IgniteInfo from monai.engines.utils import IterationEvents, default_metric_cmp_fn, default_prepare_batch from monai.inferers import Inferer, SimpleInferer from monai.transforms import Transform -from monai.utils import GanKeys, min_version, optional_import +from monai.utils import min_version, optional_import from monai.utils.enums import CommonKeys, GanKeys from torch.optim.optimizer import Optimizer from torch.utils.data import DataLoader @@ -176,7 +176,6 @@ def _iteration( raise ValueError("must provide batch data for current iteration.") d_input = engine.prepare_batch(batchdata, engine.state.device, engine.non_blocking, **engine.to_kwargs)[0] - batch_size = engine.data_loader.batch_size # type: ignore g_input = d_input g_output, z_mu, z_sigma = engine.g_inferer(g_input, engine.g_network) From a24788c111bf5e9bb56e97fd8655635699888627 Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Thu, 14 Dec 2023 17:05:04 -0800 Subject: [PATCH 22/31] update readme Signed-off-by: Can-Zhao --- models/brats_mri_generative_diffusion/docs/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/models/brats_mri_generative_diffusion/docs/README.md b/models/brats_mri_generative_diffusion/docs/README.md index f014184e..3e2df2a2 100644 --- a/models/brats_mri_generative_diffusion/docs/README.md +++ b/models/brats_mri_generative_diffusion/docs/README.md @@ -37,6 +37,12 @@ python scripts/prepare_datalist.py --path your-brats18-dataset-path ``` ## Training Configuration +We need to install the required packages. +``` +pip install git+https://github.com/Project-MONAI/GenerativeModels.git +pip install lpips +``` + If you have a GPU with less than 32G of memory, you may need to decrease the batch size when training. To do so, modify the `train_batch_size` parameter in the [configs/train_autoencoder.json](../configs/train_autoencoder.json) and [configs/train_diffusion.json](../configs/train_diffusion.json) configuration files. ### Training Configuration of Autoencoder From 46f97151b54ed0a3e0c40eead5e8271839e49cb3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 15 Dec 2023 01:06:22 +0000 Subject: [PATCH 23/31] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- models/brats_mri_generative_diffusion/docs/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/brats_mri_generative_diffusion/docs/README.md b/models/brats_mri_generative_diffusion/docs/README.md index 3e2df2a2..831cc1a2 100644 --- a/models/brats_mri_generative_diffusion/docs/README.md +++ b/models/brats_mri_generative_diffusion/docs/README.md @@ -39,7 +39,7 @@ python scripts/prepare_datalist.py --path your-brats18-dataset-path ## Training Configuration We need to install the required packages. ``` -pip install git+https://github.com/Project-MONAI/GenerativeModels.git +pip install git+https://github.com/Project-MONAI/GenerativeModels.git pip install lpips ``` From 424cbe204b61366c21e18f549a83a8be4c0e3160 Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Thu, 14 Dec 2023 17:08:59 -0800 Subject: [PATCH 24/31] update readme Signed-off-by: Can-Zhao --- models/brats_mri_generative_diffusion/docs/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/brats_mri_generative_diffusion/docs/README.md b/models/brats_mri_generative_diffusion/docs/README.md index 831cc1a2..75eb7a0d 100644 --- a/models/brats_mri_generative_diffusion/docs/README.md +++ b/models/brats_mri_generative_diffusion/docs/README.md @@ -107,7 +107,7 @@ For more details usage instructions, visit the [MONAI Bundle Configuration Page] python -m monai.bundle run --config_file configs/train_autoencoder.json ``` -Please note that if the default dataset path is not modified with the actual path (it should be the path that contains `Task01_BrainTumour`) in the bundle config files, you can also override it by using `--dataset_dir`: +Please note that if the default dataset path is not modified with the actual path in the bundle config files, you can also override it by using `--dataset_dir`: ``` python -m monai.bundle run --config_file configs/train_autoencoder.json --dataset_dir From a52db12d52fea07556d2122bae87ab4a3b81d2cc Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Thu, 14 Dec 2023 17:15:10 -0800 Subject: [PATCH 25/31] update inference_autoencoder.json Signed-off-by: Can-Zhao --- .../configs/inference_autoencoder.json | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/models/brats_mri_generative_diffusion/configs/inference_autoencoder.json b/models/brats_mri_generative_diffusion/configs/inference_autoencoder.json index eb66dee2..a8f4f096 100644 --- a/models/brats_mri_generative_diffusion/configs/inference_autoencoder.json +++ b/models/brats_mri_generative_diffusion/configs/inference_autoencoder.json @@ -2,11 +2,14 @@ "imports": [ "$import torch", "$from datetime import datetime", - "$from pathlib import Path" + "$from pathlib import Path", + "$import generative" ], "bundle_root": ".", "model_dir": "$@bundle_root + '/models'", - "dataset_dir": "/workspace/data/medical", + "data_list_file_path": "$@bundle_root + '/configs/datalist.json'", + "dataset_dir": "/datasets/brats18", + "test_datalist": "$monai.data.load_decathlon_datalist(@data_list_file_path, data_list_key='testing', base_dir=@dataset_dir)", "output_dir": "$@bundle_root + '/output'", "create_output_dir": "$Path(@output_dir).mkdir(exist_ok=True)", "device": "$torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')", @@ -20,11 +23,11 @@ ], "spatial_dims": 3, "image_channels": 1, - "latent_channels": 8, + "latent_channels": 4, "infer_patch_size": [ - 144, - 176, - 112 + 192, + 192, + 128 ], "autoencoder_def": { "_target_": "generative.networks.nets.AutoencoderKL", @@ -46,7 +49,9 @@ false ], "with_encoder_nonlocal_attn": false, - "with_decoder_nonlocal_attn": false + "with_decoder_nonlocal_attn": false, + "use_checkpointing": true, + "use_convtranspose": false }, "load_autoencoder_path": "$@bundle_root + '/models/model_autoencoder.pt'", "load_autoencoder": "$@autoencoder_def.load_state_dict(torch.load(@load_autoencoder_path))", @@ -108,15 +113,10 @@ "transforms": "$@preprocessing_transforms + @crop_transforms + @final_transforms" }, "dataset": { - "_target_": "monai.apps.DecathlonDataset", - "root_dir": "@dataset_dir", - "task": "Task01_BrainTumour", - "section": "validation", - "cache_rate": 0.0, - "num_workers": 8, - "download": false, - "transform": "@preprocessing" - }, + "_target_": "Dataset", + "data": "@test_datalist", + "transform": "@preprocessing" + }, "dataloader": { "_target_": "DataLoader", "dataset": "@dataset", From ace8223dc17e4310c83564d7c313da11fb83196c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 15 Dec 2023 01:16:26 +0000 Subject: [PATCH 26/31] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../configs/inference_autoencoder.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/models/brats_mri_generative_diffusion/configs/inference_autoencoder.json b/models/brats_mri_generative_diffusion/configs/inference_autoencoder.json index a8f4f096..0bbffffc 100644 --- a/models/brats_mri_generative_diffusion/configs/inference_autoencoder.json +++ b/models/brats_mri_generative_diffusion/configs/inference_autoencoder.json @@ -113,10 +113,10 @@ "transforms": "$@preprocessing_transforms + @crop_transforms + @final_transforms" }, "dataset": { - "_target_": "Dataset", - "data": "@test_datalist", - "transform": "@preprocessing" - }, + "_target_": "Dataset", + "data": "@test_datalist", + "transform": "@preprocessing" + }, "dataloader": { "_target_": "DataLoader", "dataset": "@dataset", From 3618cc8de04410ed4af0939584d86b8fe0648b11 Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Thu, 14 Dec 2023 17:19:35 -0800 Subject: [PATCH 27/31] update readme Signed-off-by: Can-Zhao --- models/brats_mri_generative_diffusion/docs/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/models/brats_mri_generative_diffusion/docs/README.md b/models/brats_mri_generative_diffusion/docs/README.md index 75eb7a0d..b8616473 100644 --- a/models/brats_mri_generative_diffusion/docs/README.md +++ b/models/brats_mri_generative_diffusion/docs/README.md @@ -41,6 +41,7 @@ We need to install the required packages. ``` pip install git+https://github.com/Project-MONAI/GenerativeModels.git pip install lpips +pip install xformers ``` If you have a GPU with less than 32G of memory, you may need to decrease the batch size when training. To do so, modify the `train_batch_size` parameter in the [configs/train_autoencoder.json](../configs/train_autoencoder.json) and [configs/train_diffusion.json](../configs/train_diffusion.json) configuration files. From 09427c365eaac6d4c7589e4764cdcb10cfe8fc11 Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Thu, 14 Dec 2023 17:33:34 -0800 Subject: [PATCH 28/31] add cache Signed-off-by: Can-Zhao --- .../configs/train_autoencoder.json | 10 ++++++---- .../configs/train_diffusion.json | 5 +++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/models/brats_mri_generative_diffusion/configs/train_autoencoder.json b/models/brats_mri_generative_diffusion/configs/train_autoencoder.json index e4bf0669..3086aa44 100644 --- a/models/brats_mri_generative_diffusion/configs/train_autoencoder.json +++ b/models/brats_mri_generative_diffusion/configs/train_autoencoder.json @@ -155,9 +155,10 @@ "transforms": "$@preprocessing_transforms + @train#crop_transforms + @final_transforms" }, "dataset": { - "_target_": "Dataset", + "_target_": "CacheDataset", "data": "@train_datalist", - "transform": "@train#preprocessing" + "transform": "@train#preprocessing", + "cache_rate": 1.0 }, "dataloader": { "_target_": "DataLoader", @@ -223,9 +224,10 @@ "transforms": "$@preprocessing_transforms + @validate#crop_transforms + @final_transforms" }, "dataset": { - "_target_": "Dataset", + "_target_": "CacheDataset", "data": "@val_datalist", - "transform": "@validate#preprocessing" + "transform": "@validate#preprocessing", + "cache_rate": 1.0 }, "dataloader": { "_target_": "DataLoader", diff --git a/models/brats_mri_generative_diffusion/configs/train_diffusion.json b/models/brats_mri_generative_diffusion/configs/train_diffusion.json index 5bb55a7c..e699fa50 100644 --- a/models/brats_mri_generative_diffusion/configs/train_diffusion.json +++ b/models/brats_mri_generative_diffusion/configs/train_diffusion.json @@ -85,9 +85,10 @@ "transforms": "$@preprocessing_transforms + @train#crop_transforms + @final_transforms" }, "dataset": { - "_target_": "Dataset", + "_target_": "CacheDataset", "data": "@train_datalist", - "transform": "@train#preprocessing" + "transform": "@train#preprocessing", + "cache_rate": 1.0 }, "dataloader": { "_target_": "DataLoader", From 026152aaac498cb91a438c50392f9cd2fab0ba6e Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Thu, 14 Dec 2023 18:13:56 -0800 Subject: [PATCH 29/31] maximize batch size Signed-off-by: Can-Zhao --- .../configs/train_autoencoder.json | 30 ++++++++----------- .../configs/train_diffusion.json | 16 ++-------- 2 files changed, 15 insertions(+), 31 deletions(-) diff --git a/models/brats_mri_generative_diffusion/configs/train_autoencoder.json b/models/brats_mri_generative_diffusion/configs/train_autoencoder.json index 3086aa44..02e3cb2f 100644 --- a/models/brats_mri_generative_diffusion/configs/train_autoencoder.json +++ b/models/brats_mri_generative_diffusion/configs/train_autoencoder.json @@ -15,16 +15,16 @@ "val_datalist": "$monai.data.load_decathlon_datalist(@data_list_file_path, data_list_key='validation', base_dir=@dataset_dir)", "pretrained": false, "perceptual_loss_model_weights_path": null, - "train_batch_size": 2, - "val_batch_size": 2, + "train_batch_size": 4, + "val_batch_size": 3, "epochs": 8000, "val_interval": 10, "lr": 2e-05, "amp": true, "train_patch_size": [ - 128, - 128, - 128 + 112, + 112, + 80 ], "val_patch_size": [ 192, @@ -129,9 +129,12 @@ "keys": "image", "pixdim": "@spacing", "mode": "bilinear" - } - ], - "final_transforms": [ + }, + { + "_target_": "CenterSpatialCropd", + "keys": "image", + "roi_size": "@val_patch_size" + }, { "_target_": "ScaleIntensityRangePercentilesd", "keys": "image", @@ -152,7 +155,7 @@ ], "preprocessing": { "_target_": "Compose", - "transforms": "$@preprocessing_transforms + @train#crop_transforms + @final_transforms" + "transforms": "$@preprocessing_transforms + @train#crop_transforms" }, "dataset": { "_target_": "CacheDataset", @@ -212,16 +215,9 @@ } }, "validate": { - "crop_transforms": [ - { - "_target_": "CenterSpatialCropd", - "keys": "image", - "roi_size": "@val_patch_size" - } - ], "preprocessing": { "_target_": "Compose", - "transforms": "$@preprocessing_transforms + @validate#crop_transforms + @final_transforms" + "transforms": "$@preprocessing_transforms" }, "dataset": { "_target_": "CacheDataset", diff --git a/models/brats_mri_generative_diffusion/configs/train_diffusion.json b/models/brats_mri_generative_diffusion/configs/train_diffusion.json index e699fa50..ff241dbc 100644 --- a/models/brats_mri_generative_diffusion/configs/train_diffusion.json +++ b/models/brats_mri_generative_diffusion/configs/train_diffusion.json @@ -1,12 +1,7 @@ { "ckpt_dir": "$@bundle_root + '/models'", - "train_batch_size": 4, + "train_batch_size": 5, "lr": 2e-05, - "train_patch_size": [ - 192, - 192, - 128 - ], "latent_shape": [ "@latent_channels", 48, @@ -73,16 +68,9 @@ "scheduler": "@noise_scheduler", "scale_factor": "@scale_factor" }, - "crop_transforms": [ - { - "_target_": "CenterSpatialCropd", - "keys": "image", - "roi_size": "@train_patch_size" - } - ], "preprocessing": { "_target_": "Compose", - "transforms": "$@preprocessing_transforms + @train#crop_transforms + @final_transforms" + "transforms": "$@preprocessing_transforms" }, "dataset": { "_target_": "CacheDataset", From 3ddf7d2c774efcc73620a6a02abf6233ef005037 Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Thu, 14 Dec 2023 18:31:31 -0800 Subject: [PATCH 30/31] reduce epoch num Signed-off-by: Can-Zhao --- .../configs/train_autoencoder.json | 4 ++-- .../configs/train_diffusion.json | 6 +++--- models/brats_mri_generative_diffusion/docs/README.md | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/models/brats_mri_generative_diffusion/configs/train_autoencoder.json b/models/brats_mri_generative_diffusion/configs/train_autoencoder.json index 02e3cb2f..ab955068 100644 --- a/models/brats_mri_generative_diffusion/configs/train_autoencoder.json +++ b/models/brats_mri_generative_diffusion/configs/train_autoencoder.json @@ -17,9 +17,9 @@ "perceptual_loss_model_weights_path": null, "train_batch_size": 4, "val_batch_size": 3, - "epochs": 8000, + "epochs": 3000, "val_interval": 10, - "lr": 2e-05, + "lr": 5e-05, "amp": true, "train_patch_size": [ 112, diff --git a/models/brats_mri_generative_diffusion/configs/train_diffusion.json b/models/brats_mri_generative_diffusion/configs/train_diffusion.json index ff241dbc..fc723495 100644 --- a/models/brats_mri_generative_diffusion/configs/train_diffusion.json +++ b/models/brats_mri_generative_diffusion/configs/train_diffusion.json @@ -1,7 +1,7 @@ { "ckpt_dir": "$@bundle_root + '/models'", "train_batch_size": 5, - "lr": 2e-05, + "lr": 5e-05, "latent_shape": [ "@latent_channels", 48, @@ -44,7 +44,7 @@ "_target_": "torch.optim.lr_scheduler.MultiStepLR", "optimizer": "@optimizer", "milestones": [ - 5000 + 2000 ], "gamma": 0.1 }, @@ -117,7 +117,7 @@ "trainer": { "_target_": "scripts.ldm_trainer.LDMTrainer", "device": "@device", - "max_epochs": 10000, + "max_epochs": 5000, "train_data_loader": "@train#dataloader", "network": "@diffusion", "autoencoder_model": "@autoencoder", diff --git a/models/brats_mri_generative_diffusion/docs/README.md b/models/brats_mri_generative_diffusion/docs/README.md index b8616473..11662dd7 100644 --- a/models/brats_mri_generative_diffusion/docs/README.md +++ b/models/brats_mri_generative_diffusion/docs/README.md @@ -53,7 +53,7 @@ The autoencoder was trained using the following configuration: - Actual Model Input: 128 x 128 x 128 - AMP: False - Optimizer: Adam -- Learning Rate: 2e-5 +- Learning Rate: 5e-5 - Loss: L1 loss, perceptual loss, KL divergence loss, adversarial loss, GAN BCE loss #### Input @@ -71,7 +71,7 @@ The latent diffusion model was trained using the following configuration: - Actual Model Input: 48 x 48 x 32 - AMP: False - Optimizer: Adam -- Learning Rate: 2e-5 +- Learning Rate: 5e-5 - Loss: MSE loss #### Training Input @@ -118,7 +118,7 @@ python -m monai.bundle run --config_file configs/train_autoencoder.json --datase To train with multiple GPUs, use the following command, which requires scaling up the learning rate according to the number of GPUs. ``` -torchrun --standalone --nnodes=1 --nproc_per_node=8 -m monai.bundle run --config_file "['configs/train_autoencoder.json','configs/multi_gpu_train_autoencoder.json']" --lr 1e-4 +torchrun --standalone --nnodes=1 --nproc_per_node=8 -m monai.bundle run --config_file "['configs/train_autoencoder.json','configs/multi_gpu_train_autoencoder.json']" --lr 2e-4 ``` #### Check the Autoencoder Training result @@ -146,7 +146,7 @@ python -m monai.bundle run --config_file "['configs/train_autoencoder.json','con To train with multiple GPUs, use the following command, which requires scaling up the learning rate according to the number of GPUs. ``` -torchrun --standalone --nnodes=1 --nproc_per_node=8 -m monai.bundle run --config_file "['configs/train_autoencoder.json','configs/train_diffusion.json','configs/multi_gpu_train_autoencoder.json','configs/multi_gpu_train_diffusion.json']" --lr 1e-4 +torchrun --standalone --nnodes=1 --nproc_per_node=8 -m monai.bundle run --config_file "['configs/train_autoencoder.json','configs/train_diffusion.json','configs/multi_gpu_train_autoencoder.json','configs/multi_gpu_train_diffusion.json']" --lr 2e-4 ``` #### Execute inference From 483917461e6cee40801304f1eabe34309dbb7d03 Mon Sep 17 00:00:00 2001 From: Can-Zhao Date: Thu, 14 Dec 2023 18:33:13 -0800 Subject: [PATCH 31/31] update readme Signed-off-by: Can-Zhao --- models/brats_mri_generative_diffusion/docs/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/brats_mri_generative_diffusion/docs/README.md b/models/brats_mri_generative_diffusion/docs/README.md index 11662dd7..81500cbb 100644 --- a/models/brats_mri_generative_diffusion/docs/README.md +++ b/models/brats_mri_generative_diffusion/docs/README.md @@ -50,7 +50,7 @@ If you have a GPU with less than 32G of memory, you may need to decrease the bat The autoencoder was trained using the following configuration: - GPU: at least 32GB GPU memory -- Actual Model Input: 128 x 128 x 128 +- Actual Model Input: 112 x 112 x 80 - AMP: False - Optimizer: Adam - Learning Rate: 5e-5