ft42 committed on
Commit
599a397
·
verified ·
1 Parent(s): c196078

Upload 63 files

Browse files

Added inference code, demo data, config, and Slurm script

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +13 -0
  2. NoMAISI_logo.png +3 -0
  3. configs/config_maisi3d-rflow.json +150 -0
  4. configs/infr_config_NoMAISI_controlnet.json +17 -0
  5. configs/infr_env_NoMAISI_DLCSD24_demo.json +11 -0
  6. data/DLCS_1419_seg_sh.nii.gz +3 -0
  7. data/infr_NoMAISI_DLCSD24_demo_512xy_256z_771p25m_dataset.json +32 -0
  8. doc/images/DLCS_1419_ann0_slice134_triple.png +3 -0
  9. doc/images/DLCS_1419_ann1_slice204_triple.png +3 -0
  10. doc/images/DLCS_1443_ann1_slice125_triple.png +3 -0
  11. doc/images/DLCS_1446_ann0_slice122_triple.png +3 -0
  12. doc/images/DLCS_1447_ann0_slice206_triple.png +3 -0
  13. doc/images/DLCS_1453_ann0_slice204_triple.png +3 -0
  14. doc/images/DLCS_1508_ann0_slice46_triple.png +3 -0
  15. doc/images/DLCS_1519_ann3_slice155_triple.png +3 -0
  16. doc/images/GanAI_fid_scatter_marker_legend.png +3 -0
  17. doc/images/NoMAISI_train_and_infer.png +3 -0
  18. doc/images/TaskCls.png +3 -0
  19. doc/images/workflow.png +3 -0
  20. inference.sub +26 -0
  21. logs/NoMAISI-infr-log-38612.out +18 -0
  22. scripts/__init__.py +10 -0
  23. scripts/__pycache__/__init__.cpython-310.pyc +0 -0
  24. scripts/__pycache__/augmentation.cpython-310.pyc +0 -0
  25. scripts/__pycache__/diff_model_create_training_data.cpython-310.pyc +0 -0
  26. scripts/__pycache__/diff_model_setting.cpython-310.pyc +0 -0
  27. scripts/__pycache__/find_masks.cpython-310.pyc +0 -0
  28. scripts/__pycache__/infer_controlnet.cpython-310.pyc +0 -0
  29. scripts/__pycache__/infer_testV2_controlnet.cpython-310.pyc +0 -0
  30. scripts/__pycache__/infer_test_controlnet.cpython-310.pyc +0 -0
  31. scripts/__pycache__/inference.cpython-310.pyc +0 -0
  32. scripts/__pycache__/quality_check.cpython-310.pyc +0 -0
  33. scripts/__pycache__/rectified_flow.cpython-310.pyc +0 -0
  34. scripts/__pycache__/sample.cpython-310.pyc +0 -0
  35. scripts/__pycache__/train_controlnet.cpython-310.pyc +0 -0
  36. scripts/__pycache__/utils.cpython-310.pyc +0 -0
  37. scripts/__pycache__/utils_plot.cpython-310.pyc +0 -0
  38. scripts/augmentation.py +373 -0
  39. scripts/compute_fid_2-5d_ct.py +747 -0
  40. scripts/diff_model_create_training_data.py +231 -0
  41. scripts/diff_model_infer.py +358 -0
  42. scripts/diff_model_setting.py +92 -0
  43. scripts/diff_model_train.py +499 -0
  44. scripts/find_masks.py +157 -0
  45. scripts/infer_controlnet.py +222 -0
  46. scripts/infer_testV2_controlnet.py +220 -0
  47. scripts/infer_test_controlnet.py +220 -0
  48. scripts/inference.py +299 -0
  49. scripts/quality_check.py +149 -0
  50. scripts/rectified_flow.py +322 -0
.gitattributes CHANGED
@@ -33,3 +33,16 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ doc/images/DLCS_1419_ann0_slice134_triple.png filter=lfs diff=lfs merge=lfs -text
37
+ doc/images/DLCS_1419_ann1_slice204_triple.png filter=lfs diff=lfs merge=lfs -text
38
+ doc/images/DLCS_1443_ann1_slice125_triple.png filter=lfs diff=lfs merge=lfs -text
39
+ doc/images/DLCS_1446_ann0_slice122_triple.png filter=lfs diff=lfs merge=lfs -text
40
+ doc/images/DLCS_1447_ann0_slice206_triple.png filter=lfs diff=lfs merge=lfs -text
41
+ doc/images/DLCS_1453_ann0_slice204_triple.png filter=lfs diff=lfs merge=lfs -text
42
+ doc/images/DLCS_1508_ann0_slice46_triple.png filter=lfs diff=lfs merge=lfs -text
43
+ doc/images/DLCS_1519_ann3_slice155_triple.png filter=lfs diff=lfs merge=lfs -text
44
+ doc/images/GanAI_fid_scatter_marker_legend.png filter=lfs diff=lfs merge=lfs -text
45
+ doc/images/NoMAISI_train_and_infer.png filter=lfs diff=lfs merge=lfs -text
46
+ doc/images/TaskCls.png filter=lfs diff=lfs merge=lfs -text
47
+ doc/images/workflow.png filter=lfs diff=lfs merge=lfs -text
48
+ NoMAISI_logo.png filter=lfs diff=lfs merge=lfs -text
NoMAISI_logo.png ADDED

Git LFS Details

  • SHA256: 59e28b561fa2a934150fa912146fc81f75aa8b526defd5c698c46cac09995c94
  • Pointer size: 131 Bytes
  • Size of remote file: 186 kB
configs/config_maisi3d-rflow.json ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "spatial_dims": 3,
3
+ "image_channels": 1,
4
+ "latent_channels": 4,
5
+ "include_body_region": false,
6
+ "mask_generation_latent_shape": [
7
+ 4,
8
+ 64,
9
+ 64,
10
+ 64
11
+ ],
12
+ "autoencoder_def": {
13
+ "_target_": "monai.apps.generation.maisi.networks.autoencoderkl_maisi.AutoencoderKlMaisi",
14
+ "spatial_dims": "@spatial_dims",
15
+ "in_channels": "@image_channels",
16
+ "out_channels": "@image_channels",
17
+ "latent_channels": "@latent_channels",
18
+ "num_channels": [
19
+ 64,
20
+ 128,
21
+ 256
22
+ ],
23
+ "num_res_blocks": [2,2,2],
24
+ "norm_num_groups": 32,
25
+ "norm_eps": 1e-06,
26
+ "attention_levels": [
27
+ false,
28
+ false,
29
+ false
30
+ ],
31
+ "with_encoder_nonlocal_attn": false,
32
+ "with_decoder_nonlocal_attn": false,
33
+ "use_checkpointing": false,
34
+ "use_convtranspose": false,
35
+ "norm_float16": true,
36
+ "num_splits": 4,
37
+ "dim_split": 1
38
+ },
39
+ "diffusion_unet_def": {
40
+ "_target_": "monai.apps.generation.maisi.networks.diffusion_model_unet_maisi.DiffusionModelUNetMaisi",
41
+ "spatial_dims": "@spatial_dims",
42
+ "in_channels": "@latent_channels",
43
+ "out_channels": "@latent_channels",
44
+ "num_channels": [64, 128, 256, 512],
45
+ "attention_levels": [
46
+ false,
47
+ false,
48
+ true,
49
+ true
50
+ ],
51
+ "num_head_channels": [
52
+ 0,
53
+ 0,
54
+ 32,
55
+ 32
56
+ ],
57
+ "num_res_blocks": 2,
58
+ "use_flash_attention": true,
59
+ "include_top_region_index_input": "@include_body_region",
60
+ "include_bottom_region_index_input": "@include_body_region",
61
+ "include_spacing_input": true,
62
+ "num_class_embeds": 128,
63
+ "resblock_updown": true,
64
+ "include_fc": true
65
+ },
66
+ "controlnet_def": {
67
+ "_target_": "monai.apps.generation.maisi.networks.controlnet_maisi.ControlNetMaisi",
68
+ "spatial_dims": "@spatial_dims",
69
+ "in_channels": "@latent_channels",
70
+ "num_channels": [64, 128, 256, 512],
71
+ "attention_levels": [
72
+ false,
73
+ false,
74
+ true,
75
+ true
76
+ ],
77
+ "num_head_channels": [
78
+ 0,
79
+ 0,
80
+ 32,
81
+ 32
82
+ ],
83
+ "num_res_blocks": 2,
84
+ "use_flash_attention": true,
85
+ "conditioning_embedding_in_channels": 8,
86
+ "conditioning_embedding_num_channels": [8, 32, 64],
87
+ "num_class_embeds": 128,
88
+ "resblock_updown": true,
89
+ "include_fc": true
90
+ },
91
+ "mask_generation_autoencoder_def": {
92
+ "_target_": "monai.apps.generation.maisi.networks.autoencoderkl_maisi.AutoencoderKlMaisi",
93
+ "spatial_dims": "@spatial_dims",
94
+ "in_channels": 8,
95
+ "out_channels": 125,
96
+ "latent_channels": "@latent_channels",
97
+ "num_channels": [
98
+ 32,
99
+ 64,
100
+ 128
101
+ ],
102
+ "num_res_blocks": [1, 2, 2],
103
+ "norm_num_groups": 32,
104
+ "norm_eps": 1e-06,
105
+ "attention_levels": [
106
+ false,
107
+ false,
108
+ false
109
+ ],
110
+ "with_encoder_nonlocal_attn": false,
111
+ "with_decoder_nonlocal_attn": false,
112
+ "use_flash_attention": false,
113
+ "use_checkpointing": true,
114
+ "use_convtranspose": true,
115
+ "norm_float16": true,
116
+ "num_splits": 8,
117
+ "dim_split": 1
118
+ },
119
+ "mask_generation_diffusion_def": {
120
+ "_target_": "monai.networks.nets.diffusion_model_unet.DiffusionModelUNet",
121
+ "spatial_dims": "@spatial_dims",
122
+ "in_channels": "@latent_channels",
123
+ "out_channels": "@latent_channels",
124
+ "channels":[64, 128, 256, 512],
125
+ "attention_levels":[false, false, true, true],
126
+ "num_head_channels":[0, 0, 32, 32],
127
+ "num_res_blocks": 2,
128
+ "use_flash_attention": true,
129
+ "with_conditioning": true,
130
+ "upcast_attention": true,
131
+ "cross_attention_dim": 10
132
+ },
133
+ "mask_generation_scale_factor": 1.0055984258651733,
134
+ "noise_scheduler": {
135
+ "_target_": "monai.networks.schedulers.rectified_flow.RFlowScheduler",
136
+ "num_train_timesteps": 1000,
137
+ "use_discrete_timesteps": false,
138
+ "use_timestep_transform": true,
139
+ "sample_method": "uniform",
140
+ "scale":1.4
141
+ },
142
+ "mask_generation_noise_scheduler": {
143
+ "_target_": "monai.networks.schedulers.ddpm.DDPMScheduler",
144
+ "num_train_timesteps": 1000,
145
+ "beta_start": 0.0015,
146
+ "beta_end": 0.0195,
147
+ "schedule": "scaled_linear_beta",
148
+ "clip_sample": false
149
+ }
150
+ }
configs/infr_config_NoMAISI_controlnet.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "controlnet_train": {
3
+ "batch_size": 2,
4
+ "cache_rate": 0.0,
5
+ "fold": 1,
6
+ "lr": 1e-5,
7
+ "n_epochs": 500,
8
+ "weighted_loss_label": [23],
9
+ "weighted_loss": 100
10
+ },
11
+ "controlnet_infer": {
12
+ "num_inference_steps": 30,
13
+ "autoencoder_sliding_window_infer_size": [80, 80, 64],
14
+ "autoencoder_sliding_window_infer_overlap": 0.25,
15
+ "modality": 1
16
+ }
17
+ }
configs/infr_env_NoMAISI_DLCSD24_demo.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_dir": "./models/",
3
+ "output_dir": "./outputs/NoMAISI_DLCSD24_demo_512xy_256z_771p25m",
4
+ "tfevent_path": "./outputs/tfevent",
5
+ "trained_autoencoder_path": "./models/autoencoder.pt",
6
+ "trained_diffusion_path": "./models/diffusion_unet.pt",
7
+ "trained_controlnet_path": "./models/Experiments_NoMAISI_512xy_256z_771p25m_finetune_500epoch_best.pt",
8
+ "exp_name": "NoMAISI_DLCSD24_demo_512xy_256z_771p25m",
9
+ "data_base_dir": ["/home/ft42/NoMAISI/data"],
10
+ "json_data_list": ["/home/ft42/NoMAISI/data/infr_NoMAISI_DLCSD24_demo_512xy_256z_771p25m_dataset.json"]
11
+ }
data/DLCS_1419_seg_sh.nii.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83da8dbf3b165023f3ffcec571fe5766177b65aabfa143f3a0bef5be41af757b
3
+ size 2265286
data/infr_NoMAISI_DLCSD24_demo_512xy_256z_771p25m_dataset.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "NoMAISI_DLCSD24_demo_512xy_256z_771p25m",
3
+ "numTest": 1,
4
+ "testing": [
5
+ {
6
+
7
+ "label": "DLCS_1419_seg_sh.nii.gz",
8
+ "fold": 0,
9
+ "dim": [
10
+ 512,
11
+ 512,
12
+ 256
13
+ ],
14
+ "spacing": [
15
+ 0.703125,
16
+ 0.703125,
17
+ 1.25
18
+ ],
19
+ "top_region_index": [
20
+ 0,
21
+ 1,
22
+ 0,
23
+ 0
24
+ ],
25
+ "bottom_region_index": [
26
+ 0,
27
+ 0,
28
+ 1,
29
+ 0
30
+ ]
31
+ }]
32
+ }
doc/images/DLCS_1419_ann0_slice134_triple.png ADDED

Git LFS Details

  • SHA256: 9729e15104e9f3b6ae675f57bf7d5f9f1aec3e191a4d7a68209bde4a3d148363
  • Pointer size: 132 Bytes
  • Size of remote file: 1.24 MB
doc/images/DLCS_1419_ann1_slice204_triple.png ADDED

Git LFS Details

  • SHA256: 5bbcd3ddca8a3623f38764984fed7f9a36c92d8e2c98336c4b3e5e0aadb29e0a
  • Pointer size: 132 Bytes
  • Size of remote file: 1.13 MB
doc/images/DLCS_1443_ann1_slice125_triple.png ADDED

Git LFS Details

  • SHA256: e6336851a8174aeedd990f169f4dfa1ec8f2524adbbfd048f1d491ba0973ae72
  • Pointer size: 132 Bytes
  • Size of remote file: 1.11 MB
doc/images/DLCS_1446_ann0_slice122_triple.png ADDED

Git LFS Details

  • SHA256: 29706ad025325e95dd9ad6cc56e52ea9481866a23d97fb3033198f76a5b65a13
  • Pointer size: 131 Bytes
  • Size of remote file: 955 kB
doc/images/DLCS_1447_ann0_slice206_triple.png ADDED

Git LFS Details

  • SHA256: ad5d313eee8c53edb67c8240963c29c340f2e4456db8cdfc538f7c10fcbf7f2f
  • Pointer size: 131 Bytes
  • Size of remote file: 893 kB
doc/images/DLCS_1453_ann0_slice204_triple.png ADDED

Git LFS Details

  • SHA256: c1cb674c92523eab8a008367b658561cfb94ebc3dfbc84b6f666d609097f2863
  • Pointer size: 132 Bytes
  • Size of remote file: 1.2 MB
doc/images/DLCS_1508_ann0_slice46_triple.png ADDED

Git LFS Details

  • SHA256: 5d3f245b13e4495d01e8585058239c02f2cbc17b72557d8306b58bce23747334
  • Pointer size: 132 Bytes
  • Size of remote file: 1.64 MB
doc/images/DLCS_1519_ann3_slice155_triple.png ADDED

Git LFS Details

  • SHA256: a0a7db06ba28e1412d546d2ef917c50f04dd2cff9f06c5d83d611c406185fd13
  • Pointer size: 132 Bytes
  • Size of remote file: 1.36 MB
doc/images/GanAI_fid_scatter_marker_legend.png ADDED

Git LFS Details

  • SHA256: 60c1e2e2be297fd13de2600aa2559c853db277d9ef3238da7a166c1e3472a237
  • Pointer size: 131 Bytes
  • Size of remote file: 179 kB
doc/images/NoMAISI_train_and_infer.png ADDED

Git LFS Details

  • SHA256: ffc762231f799865c8a36898ae6e23434f0f188edd45fec1be88bbd9f582a3f4
  • Pointer size: 131 Bytes
  • Size of remote file: 457 kB
doc/images/TaskCls.png ADDED

Git LFS Details

  • SHA256: 8d23c4d5110aab51b39e9772122eb98edaa5d260e1fcc3de24ff486fb5feaa06
  • Pointer size: 131 Bytes
  • Size of remote file: 280 kB
doc/images/workflow.png ADDED

Git LFS Details

  • SHA256: 3bfeafa6ca6729ce6808c39e13afaa222d7ce102277b0f5fcb7d3eb29148ef93
  • Pointer size: 131 Bytes
  • Size of remote file: 610 kB
inference.sub ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ #SBATCH --job-name=nomaisi
4
+ #SBATCH --mail-type=END,FAIL
5
+ #SBATCH --mail-user=ft42@duke.edu
6
+ #SBATCH -p vram48
7
+ #SBATCH --ntasks=1 #
8
+ #SBATCH --gpus=1 # 2 GPU per task, chose more if model is capable of multi gpu training
9
+ #SBATCH --cpus-per-task=16 # More if it is CPU intensive job too NNUNET demands lot of CPU
10
+
11
+ ## Make sure logs directory is present on current directory (same as this script)
12
+ #SBATCH --output=logs/NoMAISI-infr-log-%j.out
13
+ #SBATCH --error=logs/NoMAISI-infr-log-%j.out
14
+
15
+
16
+
17
+ echo "Job starting"
18
+ echo "GPUs Given: $CUDA_VISIBLE_DEVICES"
19
+ module load miniconda/py39_4.12.0
20
+ source activate monai-auto3dseg
21
+
22
+
23
+ # Add the correct path to PYTHONPATH
24
+ export MONAI_DATA_DIRECTORY=/home/ft42/NoMAISI/
25
+
26
+ python -m scripts.infer_testV2_controlnet -c ./configs/config_maisi3d-rflow.json -e ./configs/infr_env_NoMAISI_DLCSD24_demo.json -t ./configs/infr_config_NoMAISI_controlnet.json
logs/NoMAISI-infr-log-38612.out ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
0
  0%| | 0/30 [00:00<?, ?it/s]
1
  3%|▎ | 1/30 [00:00<00:23, 1.22it/s]
2
  7%|▋ | 2/30 [00:01<00:14, 1.93it/s]
3
  10%|█ | 3/30 [00:01<00:12, 2.17it/s]
4
  13%|█▎ | 4/30 [00:01<00:11, 2.30it/s]
5
  17%|█▋ | 5/30 [00:02<00:10, 2.39it/s]
6
  20%|██ | 6/30 [00:02<00:09, 2.44it/s]
7
  23%|██▎ | 7/30 [00:03<00:09, 2.47it/s]
8
  27%|██▋ | 8/30 [00:03<00:08, 2.49it/s]
9
  30%|███ | 9/30 [00:03<00:08, 2.51it/s]
10
  33%|███▎ | 10/30 [00:04<00:07, 2.52it/s]
11
  37%|███▋ | 11/30 [00:04<00:07, 2.53it/s]
12
  40%|████ | 12/30 [00:05<00:07, 2.53it/s]
13
  43%|████▎ | 13/30 [00:05<00:06, 2.53it/s]
14
  47%|████▋ | 14/30 [00:05<00:06, 2.54it/s]
15
  50%|█████ | 15/30 [00:06<00:05, 2.54it/s]
16
  53%|█████▎ | 16/30 [00:06<00:05, 2.54it/s]
17
  57%|█████▋ | 17/30 [00:07<00:05, 2.54it/s]
18
  60%|██████ | 18/30 [00:07<00:04, 2.54it/s]
19
  63%|██████▎ | 19/30 [00:07<00:04, 2.54it/s]
20
  67%|██████▋ | 20/30 [00:08<00:03, 2.54it/s]
21
  70%|███████ | 21/30 [00:08<00:03, 2.54it/s]
22
  73%|███████▎ | 22/30 [00:08<00:03, 2.54it/s]
23
  77%|███████▋ | 23/30 [00:09<00:02, 2.53it/s]
24
  80%|████████ | 24/30 [00:09<00:02, 2.54it/s]
25
  83%|████████▎ | 25/30 [00:10<00:01, 2.53it/s]
26
  87%|████████▋ | 26/30 [00:10<00:01, 2.53it/s]
27
  90%|█████████ | 27/30 [00:10<00:01, 2.53it/s]
28
  93%|█████████▎| 28/30 [00:11<00:00, 2.53it/s]
29
  97%|█████████▋| 29/30 [00:11<00:00, 2.53it/s]
 
 
 
30
  0%| | 0/4 [00:00<?, ?it/s]
31
  25%|██▌ | 1/4 [00:04<00:13, 4.36s/it]
32
  50%|█████ | 2/4 [00:08<00:07, 4.00s/it]
33
  75%|███████▌ | 3/4 [00:11<00:03, 3.79s/it]
 
 
 
 
1
+ Job starting
2
+ GPUs Given: 0
3
+ [2025-09-24 13:42:58.511][ INFO](maisi.controlnet.infer) - Number of GPUs: 1
4
+ [2025-09-24 13:42:58.512][ INFO](maisi.controlnet.infer) - World_size: 1
5
+ [2025-09-24 13:42:59.541][ INFO](maisi.controlnet.infer) - Load trained diffusion model from ./models/autoencoder.pt.
6
+ [2025-09-24 13:43:03.285][ INFO](maisi.controlnet.infer) - Load trained diffusion model from ./models/diffusion_unet.pt.
7
+ [2025-09-24 13:43:03.287][ INFO](maisi.controlnet.infer) - loaded scale_factor from diffusion model ckpt -> 1.0311251878738403.
8
+ 2025-09-24 13:43:03,824 - INFO - 'dst' model updated: 180 of 231 variables.
9
+ [2025-09-24 13:43:04.077][ INFO](maisi.controlnet.infer) - load trained controlnet model from ./models/Experiments_NoMAISI_512xy_256z_771p25m_finetune_500epoch_best.pt
10
+ [2025-09-24 13:43:07.130][ INFO](root) - `controllable_anatomy_size` is not provided.
11
+ [2025-09-24 13:43:07.133][ INFO](root) - ---- Start generating latent features... ----
12
+
13
  0%| | 0/30 [00:00<?, ?it/s]
14
  3%|▎ | 1/30 [00:00<00:23, 1.22it/s]
15
  7%|▋ | 2/30 [00:01<00:14, 1.93it/s]
16
  10%|█ | 3/30 [00:01<00:12, 2.17it/s]
17
  13%|█▎ | 4/30 [00:01<00:11, 2.30it/s]
18
  17%|█▋ | 5/30 [00:02<00:10, 2.39it/s]
19
  20%|██ | 6/30 [00:02<00:09, 2.44it/s]
20
  23%|██▎ | 7/30 [00:03<00:09, 2.47it/s]
21
  27%|██▋ | 8/30 [00:03<00:08, 2.49it/s]
22
  30%|███ | 9/30 [00:03<00:08, 2.51it/s]
23
  33%|███▎ | 10/30 [00:04<00:07, 2.52it/s]
24
  37%|███▋ | 11/30 [00:04<00:07, 2.53it/s]
25
  40%|████ | 12/30 [00:05<00:07, 2.53it/s]
26
  43%|████▎ | 13/30 [00:05<00:06, 2.53it/s]
27
  47%|████▋ | 14/30 [00:05<00:06, 2.54it/s]
28
  50%|█████ | 15/30 [00:06<00:05, 2.54it/s]
29
  53%|█████▎ | 16/30 [00:06<00:05, 2.54it/s]
30
  57%|█████▋ | 17/30 [00:07<00:05, 2.54it/s]
31
  60%|██████ | 18/30 [00:07<00:04, 2.54it/s]
32
  63%|██████▎ | 19/30 [00:07<00:04, 2.54it/s]
33
  67%|██████▋ | 20/30 [00:08<00:03, 2.54it/s]
34
  70%|███████ | 21/30 [00:08<00:03, 2.54it/s]
35
  73%|███████▎ | 22/30 [00:08<00:03, 2.54it/s]
36
  77%|███████▋ | 23/30 [00:09<00:02, 2.53it/s]
37
  80%|████████ | 24/30 [00:09<00:02, 2.54it/s]
38
  83%|████████▎ | 25/30 [00:10<00:01, 2.53it/s]
39
  87%|████████▋ | 26/30 [00:10<00:01, 2.53it/s]
40
  90%|█████████ | 27/30 [00:10<00:01, 2.53it/s]
41
  93%|█████████▎| 28/30 [00:11<00:00, 2.53it/s]
42
  97%|█████████▋| 29/30 [00:11<00:00, 2.53it/s]
43
+ [2025-09-24 13:43:19.446][ INFO](root) - ---- DM/ControlNet Latent features generation time: 12.313125371932983 seconds ----
44
+ [2025-09-24 13:43:20.016][ INFO](root) - ---- Start decoding latent features into images... ----
45
+
46
  0%| | 0/4 [00:00<?, ?it/s]
47
  25%|██▌ | 1/4 [00:04<00:13, 4.36s/it]
48
  50%|█████ | 2/4 [00:08<00:07, 4.00s/it]
49
  75%|███████▌ | 3/4 [00:11<00:03, 3.79s/it]
50
+ [2025-09-24 13:43:35.252][ INFO](root) - ---- Image VAE decoding time: 15.23531699180603 seconds ----
51
+ 2025-09-24 13:43:37,053 INFO image_writer.py:197 - writing: outputs/NoMAISI_DLCSD24_demo_512xy_256z_771p25m/DLCS_1419_seg_sh_image.nii.gz
52
+ 2025-09-24 13:43:41,437 INFO image_writer.py:197 - writing: outputs/NoMAISI_DLCSD24_demo_512xy_256z_771p25m/DLCS_1419_seg_sh_label.nii.gz
scripts/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) MONAI Consortium
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # You may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an "AS IS" BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
scripts/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (140 Bytes). View file
 
scripts/__pycache__/augmentation.cpython-310.pyc ADDED
Binary file (6.81 kB). View file
 
scripts/__pycache__/diff_model_create_training_data.cpython-310.pyc ADDED
Binary file (7.38 kB). View file
 
scripts/__pycache__/diff_model_setting.cpython-310.pyc ADDED
Binary file (2.47 kB). View file
 
scripts/__pycache__/find_masks.cpython-310.pyc ADDED
Binary file (4.48 kB). View file
 
scripts/__pycache__/infer_controlnet.cpython-310.pyc ADDED
Binary file (5.72 kB). View file
 
scripts/__pycache__/infer_testV2_controlnet.cpython-310.pyc ADDED
Binary file (5.76 kB). View file
 
scripts/__pycache__/infer_test_controlnet.cpython-310.pyc ADDED
Binary file (5.75 kB). View file
 
scripts/__pycache__/inference.cpython-310.pyc ADDED
Binary file (7.62 kB). View file
 
scripts/__pycache__/quality_check.cpython-310.pyc ADDED
Binary file (4.39 kB). View file
 
scripts/__pycache__/rectified_flow.cpython-310.pyc ADDED
Binary file (10.9 kB). View file
 
scripts/__pycache__/sample.cpython-310.pyc ADDED
Binary file (31.4 kB). View file
 
scripts/__pycache__/train_controlnet.cpython-310.pyc ADDED
Binary file (8.01 kB). View file
 
scripts/__pycache__/utils.cpython-310.pyc ADDED
Binary file (26.5 kB). View file
 
scripts/__pycache__/utils_plot.cpython-310.pyc ADDED
Binary file (6.66 kB). View file
 
scripts/augmentation.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) MONAI Consortium
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an "AS IS" BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+
12
+ import numpy as np
13
+ import torch
14
+ import torch.nn.functional as F
15
+ from monai.transforms import Rand3DElastic, RandAffine, RandZoom
16
+ from monai.utils import ensure_tuple_rep
17
+
18
+
19
+ def erode3d(input_tensor, erosion=3):
20
+ # Define the structuring element
21
+ erosion = ensure_tuple_rep(erosion, 3)
22
+ structuring_element = torch.ones(1, 1, erosion[0], erosion[1], erosion[2]).to(input_tensor.device)
23
+
24
+ # Pad the input tensor to handle border pixels
25
+ input_padded = F.pad(
26
+ input_tensor.float().unsqueeze(0).unsqueeze(0),
27
+ (erosion[0] // 2, erosion[0] // 2, erosion[1] // 2, erosion[1] // 2, erosion[2] // 2, erosion[2] // 2),
28
+ mode="constant",
29
+ value=1.0,
30
+ )
31
+
32
+ # Apply erosion operation
33
+ output = F.conv3d(input_padded, structuring_element, padding=0)
34
+
35
+ # Set output values based on the minimum value within the structuring element
36
+ output = torch.where(output == torch.sum(structuring_element), 1.0, 0.0)
37
+
38
+ return output.squeeze(0).squeeze(0)
39
+
40
+
41
+ def dilate3d(input_tensor, erosion=3):
42
+ # Define the structuring element
43
+ erosion = ensure_tuple_rep(erosion, 3)
44
+ structuring_element = torch.ones(1, 1, erosion[0], erosion[1], erosion[2]).to(input_tensor.device)
45
+
46
+ # Pad the input tensor to handle border pixels
47
+ input_padded = F.pad(
48
+ input_tensor.float().unsqueeze(0).unsqueeze(0),
49
+ (erosion[0] // 2, erosion[0] // 2, erosion[1] // 2, erosion[1] // 2, erosion[2] // 2, erosion[2] // 2),
50
+ mode="constant",
51
+ value=1.0,
52
+ )
53
+
54
+ # Apply erosion operation
55
+ output = F.conv3d(input_padded, structuring_element, padding=0)
56
+
57
+ # Set output values based on the minimum value within the structuring element
58
+ output = torch.where(output > 0, 1.0, 0.0)
59
+
60
+ return output.squeeze(0).squeeze(0)
61
+
62
+
63
+ def augmentation_tumor_bone(pt_nda, output_size, random_seed=None):
64
+ volume = pt_nda.squeeze(0)
65
+ real_l_volume_ = torch.zeros_like(volume)
66
+ real_l_volume_[volume == 128] = 1
67
+ real_l_volume_ = real_l_volume_.to(torch.uint8)
68
+
69
+ elastic = RandAffine(
70
+ mode="nearest",
71
+ prob=1.0,
72
+ translate_range=(5, 5, 0),
73
+ rotate_range=(0, 0, 0.1),
74
+ scale_range=(0.15, 0.15, 0),
75
+ padding_mode="zeros",
76
+ )
77
+ elastic.set_random_state(seed=random_seed)
78
+
79
+ tumor_szie = torch.sum((real_l_volume_ > 0).float())
80
+ ###########################
81
+ # remove pred in pseudo_label in real lesion region
82
+ volume[real_l_volume_ > 0] = 200
83
+ ###########################
84
+ if tumor_szie > 0:
85
+ # get organ mask
86
+ organ_mask = (
87
+ torch.logical_and(33 <= volume, volume <= 56).float()
88
+ + torch.logical_and(63 <= volume, volume <= 97).float()
89
+ + (volume == 127).float()
90
+ + (volume == 114).float()
91
+ + real_l_volume_
92
+ )
93
+ organ_mask = (organ_mask > 0).float()
94
+ cnt = 0
95
+ while True:
96
+ threshold = 0.8 if cnt < 40 else 0.75
97
+ real_l_volume = real_l_volume_
98
+ # random distor mask
99
+ distored_mask = elastic((real_l_volume > 0).cuda(), spatial_size=tuple(output_size)).as_tensor()
100
+ real_l_volume = distored_mask * organ_mask
101
+ cnt += 1
102
+ print(torch.sum(real_l_volume), "|", tumor_szie * threshold)
103
+ if torch.sum(real_l_volume) >= tumor_szie * threshold:
104
+ real_l_volume = dilate3d(real_l_volume.squeeze(0), erosion=5)
105
+ real_l_volume = erode3d(real_l_volume, erosion=5).unsqueeze(0).to(torch.uint8)
106
+ break
107
+ else:
108
+ real_l_volume = real_l_volume_
109
+
110
+ volume[real_l_volume == 1] = 128
111
+
112
+ pt_nda = volume.unsqueeze(0)
113
+ return pt_nda
114
+
115
+
116
+ def augmentation_tumor_liver(pt_nda, output_size, random_seed=None):
117
+ volume = pt_nda.squeeze(0)
118
+ real_l_volume_ = torch.zeros_like(volume)
119
+ real_l_volume_[volume == 1] = 1
120
+ real_l_volume_[volume == 26] = 2
121
+ real_l_volume_ = real_l_volume_.to(torch.uint8)
122
+
123
+ elastic = Rand3DElastic(
124
+ mode="nearest",
125
+ prob=1.0,
126
+ sigma_range=(5, 8),
127
+ magnitude_range=(100, 200),
128
+ translate_range=(10, 10, 10),
129
+ rotate_range=(np.pi / 36, np.pi / 36, np.pi / 36),
130
+ scale_range=(0.2, 0.2, 0.2),
131
+ padding_mode="zeros",
132
+ )
133
+ elastic.set_random_state(seed=random_seed)
134
+
135
+ tumor_szie = torch.sum(real_l_volume_ == 2)
136
+ ###########################
137
+ # remove pred organ labels
138
+ volume[volume == 1] = 0
139
+ volume[volume == 26] = 0
140
+ # before move tumor maks, full the original location by organ labels
141
+ volume[real_l_volume_ == 1] = 1
142
+ volume[real_l_volume_ == 2] = 1
143
+ ###########################
144
+ while True:
145
+ real_l_volume = real_l_volume_
146
+ # random distor mask
147
+ real_l_volume = elastic((real_l_volume == 2).cuda(), spatial_size=tuple(output_size)).as_tensor()
148
+ # get organ mask
149
+ organ_mask = (real_l_volume_ == 1).float() + (real_l_volume_ == 2).float()
150
+
151
+ organ_mask = dilate3d(organ_mask.squeeze(0), erosion=5)
152
+ organ_mask = erode3d(organ_mask, erosion=5).unsqueeze(0)
153
+ real_l_volume = real_l_volume * organ_mask
154
+ print(torch.sum(real_l_volume), "|", tumor_szie * 0.80)
155
+ if torch.sum(real_l_volume) >= tumor_szie * 0.80:
156
+ real_l_volume = dilate3d(real_l_volume.squeeze(0), erosion=5)
157
+ real_l_volume = erode3d(real_l_volume, erosion=5).unsqueeze(0)
158
+ break
159
+
160
+ volume[real_l_volume == 1] = 26
161
+
162
+ pt_nda = volume.unsqueeze(0)
163
+ return pt_nda
164
+
165
+
166
+ def augmentation_tumor_lung(pt_nda, output_size, random_seed=None):
167
+ volume = pt_nda.squeeze(0)
168
+ real_l_volume_ = torch.zeros_like(volume)
169
+ real_l_volume_[volume == 23] = 1
170
+ real_l_volume_ = real_l_volume_.to(torch.uint8)
171
+
172
+ elastic = Rand3DElastic(
173
+ mode="nearest",
174
+ prob=1.0,
175
+ sigma_range=(5, 8),
176
+ magnitude_range=(100, 200),
177
+ translate_range=(20, 20, 20),
178
+ rotate_range=(np.pi / 36, np.pi / 36, np.pi),
179
+ scale_range=(0.15, 0.15, 0.15),
180
+ padding_mode="zeros",
181
+ )
182
+ elastic.set_random_state(seed=random_seed)
183
+
184
+ tumor_szie = torch.sum(real_l_volume_)
185
+ # before move lung tumor maks, full the original location by lung labels
186
+ new_real_l_volume_ = dilate3d(real_l_volume_.squeeze(0), erosion=3)
187
+ new_real_l_volume_ = new_real_l_volume_.unsqueeze(0)
188
+ new_real_l_volume_[real_l_volume_ > 0] = 0
189
+ new_real_l_volume_[volume < 28] = 0
190
+ new_real_l_volume_[volume > 32] = 0
191
+ tmp = volume[(volume * new_real_l_volume_).nonzero(as_tuple=True)].view(-1)
192
+
193
+ mode = torch.mode(tmp, 0)[0].item()
194
+ print(mode)
195
+ assert 28 <= mode <= 32
196
+ volume[real_l_volume_.bool()] = mode
197
+ ###########################
198
+ if tumor_szie > 0:
199
+ # aug
200
+ while True:
201
+ real_l_volume = real_l_volume_
202
+ # random distor mask
203
+ real_l_volume = elastic(real_l_volume, spatial_size=tuple(output_size)).as_tensor()
204
+ # get lung mask v2 (133 order)
205
+ lung_mask = (
206
+ (volume == 28).float()
207
+ + (volume == 29).float()
208
+ + (volume == 30).float()
209
+ + (volume == 31).float()
210
+ + (volume == 32).float()
211
+ )
212
+
213
+ lung_mask = dilate3d(lung_mask.squeeze(0), erosion=5)
214
+ lung_mask = erode3d(lung_mask, erosion=5).unsqueeze(0)
215
+ real_l_volume = real_l_volume * lung_mask
216
+ print(torch.sum(real_l_volume), "|", tumor_szie * 0.85)
217
+ if torch.sum(real_l_volume) >= tumor_szie * 0.85:
218
+ real_l_volume = dilate3d(real_l_volume.squeeze(0), erosion=5)
219
+ real_l_volume = erode3d(real_l_volume, erosion=5).unsqueeze(0).to(torch.uint8)
220
+ break
221
+ else:
222
+ real_l_volume = real_l_volume_
223
+
224
+ volume[real_l_volume == 1] = 23
225
+
226
+ pt_nda = volume.unsqueeze(0)
227
+ return pt_nda
228
+
229
+
230
+ def augmentation_tumor_pancreas(pt_nda, output_size, random_seed=None):
231
+ volume = pt_nda.squeeze(0)
232
+ real_l_volume_ = torch.zeros_like(volume)
233
+ real_l_volume_[volume == 4] = 1
234
+ real_l_volume_[volume == 24] = 2
235
+ real_l_volume_ = real_l_volume_.to(torch.uint8)
236
+
237
+ elastic = Rand3DElastic(
238
+ mode="nearest",
239
+ prob=1.0,
240
+ sigma_range=(5, 8),
241
+ magnitude_range=(100, 200),
242
+ translate_range=(15, 15, 15),
243
+ rotate_range=(np.pi / 36, np.pi / 36, np.pi / 36),
244
+ scale_range=(0.1, 0.1, 0.1),
245
+ padding_mode="zeros",
246
+ )
247
+ elastic.set_random_state(seed=random_seed)
248
+
249
+ tumor_szie = torch.sum(real_l_volume_ == 2)
250
+ ###########################
251
+ # remove pred organ labels
252
+ volume[volume == 24] = 0
253
+ volume[volume == 4] = 0
254
+ # before move tumor maks, full the original location by organ labels
255
+ volume[real_l_volume_ == 1] = 4
256
+ volume[real_l_volume_ == 2] = 4
257
+ ###########################
258
+ while True:
259
+ real_l_volume = real_l_volume_
260
+ # random distor mask
261
+ real_l_volume = elastic((real_l_volume == 2).cuda(), spatial_size=tuple(output_size)).as_tensor()
262
+ # get organ mask
263
+ organ_mask = (real_l_volume_ == 1).float() + (real_l_volume_ == 2).float()
264
+
265
+ organ_mask = dilate3d(organ_mask.squeeze(0), erosion=5)
266
+ organ_mask = erode3d(organ_mask, erosion=5).unsqueeze(0)
267
+ real_l_volume = real_l_volume * organ_mask
268
+ print(torch.sum(real_l_volume), "|", tumor_szie * 0.80)
269
+ if torch.sum(real_l_volume) >= tumor_szie * 0.80:
270
+ real_l_volume = dilate3d(real_l_volume.squeeze(0), erosion=5)
271
+ real_l_volume = erode3d(real_l_volume, erosion=5).unsqueeze(0)
272
+ break
273
+
274
+ volume[real_l_volume == 1] = 24
275
+
276
+ pt_nda = volume.unsqueeze(0)
277
+ return pt_nda
278
+
279
+
280
+ def augmentation_tumor_colon(pt_nda, output_size, random_seed=None):
281
+ volume = pt_nda.squeeze(0)
282
+ real_l_volume_ = torch.zeros_like(volume)
283
+ real_l_volume_[volume == 27] = 1
284
+ real_l_volume_ = real_l_volume_.to(torch.uint8)
285
+
286
+ elastic = Rand3DElastic(
287
+ mode="nearest",
288
+ prob=1.0,
289
+ sigma_range=(5, 8),
290
+ magnitude_range=(100, 200),
291
+ translate_range=(5, 5, 5),
292
+ rotate_range=(np.pi / 36, np.pi / 36, np.pi / 36),
293
+ scale_range=(0.1, 0.1, 0.1),
294
+ padding_mode="zeros",
295
+ )
296
+ elastic.set_random_state(seed=random_seed)
297
+
298
+ tumor_szie = torch.sum(real_l_volume_)
299
+ ###########################
300
+ # before move tumor maks, full the original location by organ labels
301
+ volume[real_l_volume_.bool()] = 62
302
+ ###########################
303
+ if tumor_szie > 0:
304
+ # get organ mask
305
+ organ_mask = (volume == 62).float()
306
+ organ_mask = dilate3d(organ_mask.squeeze(0), erosion=5)
307
+ organ_mask = erode3d(organ_mask, erosion=5).unsqueeze(0)
308
+ # cnt = 0
309
+ cnt = 0
310
+ while True:
311
+ threshold = 0.8
312
+ real_l_volume = real_l_volume_
313
+ if cnt < 20:
314
+ # random distor mask
315
+ distored_mask = elastic((real_l_volume == 1).cuda(), spatial_size=tuple(output_size)).as_tensor()
316
+ real_l_volume = distored_mask * organ_mask
317
+ elif 20 <= cnt < 40:
318
+ threshold = 0.75
319
+ else:
320
+ break
321
+
322
+ real_l_volume = real_l_volume * organ_mask
323
+ print(torch.sum(real_l_volume), "|", tumor_szie * threshold)
324
+ cnt += 1
325
+ if torch.sum(real_l_volume) >= tumor_szie * threshold:
326
+ real_l_volume = dilate3d(real_l_volume.squeeze(0), erosion=5)
327
+ real_l_volume = erode3d(real_l_volume, erosion=5).unsqueeze(0).to(torch.uint8)
328
+ break
329
+ else:
330
+ real_l_volume = real_l_volume_
331
+ # break
332
+ volume[real_l_volume == 1] = 27
333
+
334
+ pt_nda = volume.unsqueeze(0)
335
+ return pt_nda
336
+
337
+
338
+ def augmentation_body(pt_nda, random_seed=None):
339
+ volume = pt_nda.squeeze(0)
340
+
341
+ zoom = RandZoom(min_zoom=0.99, max_zoom=1.01, mode="nearest", align_corners=None, prob=1.0)
342
+ zoom.set_random_state(seed=random_seed)
343
+
344
+ volume = zoom(volume)
345
+
346
+ pt_nda = volume.unsqueeze(0)
347
+ return pt_nda
348
+
349
+
350
+ def augmentation(pt_nda, output_size, random_seed=None):
351
+ label_list = torch.unique(pt_nda)
352
+ label_list = list(label_list.cpu().numpy())
353
+
354
+ if 128 in label_list:
355
+ print("augmenting bone lesion/tumor")
356
+ pt_nda = augmentation_tumor_bone(pt_nda, output_size, random_seed)
357
+ elif 26 in label_list:
358
+ print("augmenting liver tumor")
359
+ pt_nda = augmentation_tumor_liver(pt_nda, output_size, random_seed)
360
+ elif 23 in label_list:
361
+ print("augmenting lung tumor")
362
+ pt_nda = augmentation_tumor_lung(pt_nda, output_size, random_seed)
363
+ elif 24 in label_list:
364
+ print("augmenting pancreas tumor")
365
+ pt_nda = augmentation_tumor_pancreas(pt_nda, output_size, random_seed)
366
+ elif 27 in label_list:
367
+ print("augmenting colon tumor")
368
+ pt_nda = augmentation_tumor_colon(pt_nda, output_size, random_seed)
369
+ else:
370
+ print("augmenting body")
371
+ pt_nda = augmentation_body(pt_nda, random_seed)
372
+
373
+ return pt_nda
scripts/compute_fid_2-5d_ct.py ADDED
@@ -0,0 +1,747 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) MONAI Consortium
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at:
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an
9
+ # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
10
+ # either express or implied.
11
+ # See the License for the specific language governing permissions
12
+ # and limitations under the License.
13
+
14
+ """
15
+ Compute 2.5D FID using distributed GPU processing.
16
+
17
+ SHELL Usage Example:
18
+ -------------------
19
+ #!/bin/bash
20
+
21
+ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6
22
+ NUM_GPUS=7
23
+
24
+ torchrun --nproc_per_node=${NUM_GPUS} compute_fid_2-5d_ct.py \
25
+ --model_name "radimagenet_resnet50" \
26
+ --real_dataset_root "path/to/datasetA" \
27
+ --real_filelist "path/to/filelistA.txt" \
28
+ --real_features_dir "datasetA" \
29
+ --synth_dataset_root "path/to/datasetB" \
30
+ --synth_filelist "path/to/filelistB.txt" \
31
+ --synth_features_dir "datasetB" \
32
+ --enable_center_slices_ratio 0.4 \
33
+ --enable_padding True \
34
+ --enable_center_cropping True \
35
+ --enable_resampling_spacing "1.0x1.0x1.0" \
36
+ --ignore_existing True \
37
+ --num_images 100 \
38
+ --output_root "./features/features-512x512x512" \
39
+ --target_shape "512x512x512"
40
+
41
+ This script loads two datasets (real vs. synthetic) in 3D medical format (NIfTI)
42
+ and extracts feature maps via a 2.5D approach. It then computes the Frechet
43
+ Inception Distance (FID) across three orthogonal planes. Data parallelism
44
+ is implemented using torch.distributed with an NCCL backend.
45
+
46
+ Function Arguments (main):
47
+ --------------------------
48
+ real_dataset_root (str):
49
+ Root folder for the real dataset.
50
+
51
+ real_filelist (str):
52
+ Text file listing 3D images for the real dataset.
53
+
54
+ real_features_dir (str):
55
+ Subdirectory (under `output_root`) in which to store feature files
56
+ extracted from the real dataset.
57
+
58
+ synth_dataset_root (str):
59
+ Root folder for the synthetic dataset.
60
+
61
+ synth_filelist (str):
62
+ Text file listing 3D images for the synthetic dataset.
63
+
64
+ synth_features_dir (str):
65
+ Subdirectory (under `output_root`) in which to store feature files
66
+ extracted from the synthetic dataset.
67
+
68
+ enable_center_slices_ratio (float or None):
69
+ - If not None, only slices around the specified center ratio will be used
70
+ (analogous to "enable_center_slices=True" with that ratio).
71
+ - If None, no center-slice selection is performed
72
+ (analogous to "enable_center_slices=False").
73
+
74
+ enable_padding (bool):
75
+ Whether to pad images to `target_shape`.
76
+
77
+ enable_center_cropping (bool):
78
+ Whether to center-crop images to `target_shape`.
79
+
80
+ enable_resampling_spacing (str or None):
81
+ - If not None, resample images to the specified voxel spacing (e.g. "1.0x1.0x1.0")
82
+ (analogous to "enable_resampling=True" with that spacing).
83
+ - If None, resampling is skipped
84
+ (analogous to "enable_resampling=False").
85
+
86
+ ignore_existing (bool):
87
+ If True, ignore any existing .pt feature files and force re-extraction.
88
+
89
+ model_name (str):
90
+ Model identifier. Typically "radimagenet_resnet50" or "squeezenet1_1".
91
+
92
+ num_images (int):
93
+ Max number of images to process from each dataset (truncate if more are present).
94
+
95
+ output_root (str):
96
+ Folder where extracted .pt feature files, logs, and results are saved.
97
+
98
+ target_shape (str):
99
+ Target shape as "XxYxZ" for padding, cropping, or resampling operations.
100
+ """
101
+
102
+
103
+ from __future__ import annotations
104
+
105
+ import os
106
+ import sys
107
+ import torch
108
+ import fire
109
+ import monai
110
+ import re
111
+ import torch.distributed as dist
112
+ import torch.nn.functional as F
113
+
114
+ from datetime import timedelta
115
+ from pathlib import Path
116
+ from monai.metrics.fid import FIDMetric
117
+ from monai.transforms import Compose
118
+
119
+ import logging
120
+
121
+ # ------------------------------------------------------------------------------
122
+ # Create logger
123
+ # ------------------------------------------------------------------------------
124
# Module-level logger shared by every function in this script.
logger = logging.getLogger("fid_2-5d_ct")
if not logger.handlers:
    # Configure logger only if it has no handlers (avoid reconfiguring in multi-rank scenarios)
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    logger.setLevel(logging.INFO)
129
+
130
+
131
def drop_empty_slice(slices, empty_threshold: float):
    """
    Decide which 2D slices to keep: a slice is considered "empty" when its
    maximum intensity is below `empty_threshold`.

    Args:
        slices (tuple or list of Tensors): Each element is (B, C, H, W).
        empty_threshold (float): If the slice's maximum value is below this
            threshold, the slice is dropped.

    Returns:
        list[bool]: Keep-flag per slice (False means "empty", drop it).
    """
    # Guard against an empty input, which would otherwise divide by zero below.
    if not slices:
        return []

    # Fetch the module logger by name so the function is self-contained;
    # logging.getLogger returns the same logger object as the module-level one.
    log = logging.getLogger("fid_2-5d_ct")

    # torch.max(s) equals max(torch.unique(s)) without the O(n log n) unique pass.
    keep_flags = [bool(torch.max(s) >= empty_threshold) for s in slices]

    n_drop = len(keep_flags) - sum(keep_flags)
    log.info(f"Empty slice drop rate {round((n_drop / len(slices)) * 100, 1)}%")
    return keep_flags
156
+
157
+
158
def subtract_mean(x: torch.Tensor) -> torch.Tensor:
    """
    Subtract per-channel means [0.406, 0.456, 0.485] (ImageNet means in BGR
    order) from a 4D or 5D tensor shaped (B, C, H, W) or (B, C, H, W, D).

    NOTE: mutates `x` in place and returns it.
    """
    for channel, channel_mean in enumerate((0.406, 0.456, 0.485)):
        x[:, channel, ...] -= channel_mean
    return x
169
+
170
+
171
def spatial_average(x: torch.Tensor, keepdim: bool = True) -> torch.Tensor:
    """
    Mean-reduce the spatial axes of a feature map to produce a feature vector.

    For 3D/4D/5D inputs (B, C, ...spatial...), every axis after the channel
    axis is averaged; 2D inputs (and any other rank) pass through unchanged.

    Args:
        x (torch.Tensor): Input tensor.
        keepdim (bool): Whether reduced axes are kept as size-1 dims.

    Returns:
        torch.Tensor: Tensor with spatial dimensions averaged out.
    """
    ndim = len(x.shape)
    if 3 <= ndim <= 5:
        # Axes 2..ndim-1 are the spatial ones (after batch and channel).
        spatial_axes = list(range(2, ndim))
        return x.mean(spatial_axes, keepdim=keepdim)
    return x
198
+
199
+
200
def medicalnet_intensity_normalisation(volume: torch.Tensor) -> torch.Tensor:
    """
    MedicalNet-style intensity normalization: (volume - mean) / (std + 1e-5)
    computed over the spatial axes.

    Expects (B, C, H, W) or (B, C, H, W, D); any other rank is returned
    unchanged.
    """
    spatial_axes = {4: [2, 3], 5: [2, 3, 4]}.get(len(volume.shape))
    if spatial_axes is None:
        return volume
    mean = volume.mean(spatial_axes, keepdim=True)
    std = volume.std(spatial_axes, keepdim=True)
    return (volume - mean) / (std + 1e-5)
216
+
217
+
218
def radimagenet_intensity_normalisation(volume: torch.Tensor, norm2d: bool = False) -> torch.Tensor:
    """
    Intensity normalization for radimagenet_resnet: min-max scale to [0, 1],
    then subtract the ImageNet channel means (via `subtract_mean`).

    Args:
        volume (torch.Tensor): Input (B, C, H, W) or (B, C, H, W, D).
        norm2d (bool): If True (4D input only), each (H, W) slice is scaled
            to [0, 1] independently per sample/channel; otherwise a single
            global min/max over the whole tensor is used.
    """
    logger.info(f"norm2d: {norm2d}")
    ndim = len(volume.shape)

    if ndim == 4 and norm2d:
        # Per-slice scaling: reduce over H and W only, keeping dims so the
        # result broadcasts back over the input.
        hi = volume.amax(dim=(2, 3), keepdim=True)
        lo = volume.amin(dim=(2, 3), keepdim=True)
        scaled = (volume - lo) / (hi - lo + 1e-10)
        return subtract_mean(scaled)

    if ndim in (4, 5):
        # Global scaling over the entire tensor.
        lo = torch.min(volume)
        hi = torch.max(volume)
        scaled = (volume - lo) / (hi - lo + 1e-10)
        return subtract_mean(scaled)

    # Any other rank passes through untouched.
    return volume
251
+
252
+
253
def _plane_features_2p5d(
    image: torch.Tensor,
    feature_network: torch.nn.Module,
    axis: int,
    center_slices: bool,
    center_slices_ratio: float,
    sample_every_k: int,
    drop_empty: bool,
    empty_threshold: float,
) -> torch.Tensor:
    """Slice a 5D image along `axis`, run the 2D slices through
    `feature_network`, and return spatially-averaged per-slice features."""
    if center_slices:
        # Keep only the central `center_slices_ratio` fraction of the axis,
        # subsampled every `sample_every_k` slices.
        length = image.shape[axis]
        start = int((1.0 - center_slices_ratio) / 2.0 * length)
        end = int((1.0 + center_slices_ratio) / 2.0 * length)
        index = [slice(None)] * image.dim()
        index[axis] = slice(start, end, sample_every_k)
        slices = torch.unbind(image[tuple(index)], dim=axis)
    else:
        # NOTE: matches the original behavior — sample_every_k is only
        # applied when center_slices is enabled.
        slices = torch.unbind(image, dim=axis)

    if drop_empty:
        mapping_index = drop_empty_slice(slices, empty_threshold)
    else:
        mapping_index = [True] * len(slices)

    images_2d = torch.cat(slices, dim=0)
    images_2d = radimagenet_intensity_normalisation(images_2d)
    # Boolean-mask out empty slices. Assumes batch size 1 so slice order in
    # the concatenation matches the mask — TODO confirm for B > 1.
    images_2d = images_2d[mapping_index]

    features = feature_network.forward(images_2d)
    return spatial_average(features, keepdim=False)


def get_features_2p5d(
    image: torch.Tensor,
    feature_network: torch.nn.Module,
    center_slices: bool = False,
    center_slices_ratio: float = 1.0,
    sample_every_k: int = 1,
    xy_only: bool = True,
    drop_empty: bool = False,
    empty_threshold: float = -700,
) -> tuple[torch.Tensor | None, torch.Tensor | None, torch.Tensor | None]:
    """
    Extract 2.5D features from a 3D image by slicing it along XY, YZ, ZX planes.

    The per-plane work (center-window selection, empty-slice dropping,
    normalization, feature extraction, spatial averaging) is shared in
    `_plane_features_2p5d`; only the slicing axis differs between planes.

    Args:
        image (torch.Tensor): Input 5D tensor in shape (B, C, H, W, D).
        feature_network (torch.nn.Module): Model that processes 2D slices (C,H,W).
        center_slices (bool): Whether to slice only the center portion of each axis.
        center_slices_ratio (float): Ratio of slices to keep in the center if
            `center_slices` is True.
        sample_every_k (int): Downsampling factor along each axis when slicing
            (applied only when `center_slices` is True).
        xy_only (bool): If True, return only the XY-plane features.
        drop_empty (bool): Drop slices that are deemed "empty" below `empty_threshold`.
        empty_threshold (float): Threshold to decide emptiness of slices.

    Returns:
        tuple of torch.Tensor or None: (XY_features, YZ_features, ZX_features);
        YZ and ZX are None when `xy_only` is True.
    """
    logger.info(f"center_slices: {center_slices}, ratio: {center_slices_ratio}")

    # If there's only 1 channel, replicate to 3 channels
    if image.shape[1] == 1:
        image = image.repeat(1, 3, 1, 1, 1)

    # Convert from 'RGB'→(R,G,B) to (B,G,R)
    image = image[:, [2, 1, 0], ...]

    plane_kwargs = dict(
        center_slices=center_slices,
        center_slices_ratio=center_slices_ratio,
        sample_every_k=sample_every_k,
        drop_empty=drop_empty,
        empty_threshold=empty_threshold,
    )
    with torch.no_grad():
        # XY plane: slice along D (axis 4).
        feature_image_xy = _plane_features_2p5d(image, feature_network, 4, **plane_kwargs)
        if xy_only:
            return feature_image_xy, None, None

        # YZ plane: slice along H (axis 2).
        feature_image_yz = _plane_features_2p5d(image, feature_network, 2, **plane_kwargs)
        # ZX plane: slice along W (axis 3).
        feature_image_zx = _plane_features_2p5d(image, feature_network, 3, **plane_kwargs)

    return feature_image_xy, feature_image_yz, feature_image_zx
353
+
354
+
355
def pad_to_max_size(tensor: torch.Tensor, max_size: int, padding_value: float = 0.0) -> torch.Tensor:
    """
    Pad a tensor along its first dimension up to `max_size`, filling new
    rows with `padding_value`.

    Args:
        tensor (torch.Tensor): The feature tensor to pad.
        max_size (int): Desired size along dim 0.
        padding_value (float): Fill value for the padded rows.

    Returns:
        torch.Tensor: Tensor whose dim 0 has length `max_size`.
    """
    trailing_dims = len(tensor.shape) - 1
    # F.pad consumes (before, after) pairs from the LAST dim backwards;
    # only the leading dim grows, everything else stays (0, 0).
    pad_spec = [0, 0] * trailing_dims + [0, max_size - tensor.shape[0]]
    return F.pad(tensor, pad_spec, "constant", padding_value)
369
+
370
+
371
def main(
    real_dataset_root: str = "path/to/datasetA",
    real_filelist: str = "path/to/filelistA.txt",
    real_features_dir: str = "datasetA",
    synth_dataset_root: str = "path/to/datasetB",
    synth_filelist: str = "path/to/filelistB.txt",
    synth_features_dir: str = "datasetB",
    enable_center_slices_ratio: float | None = None,
    enable_padding: bool = True,
    enable_center_cropping: bool = True,
    enable_resampling_spacing: str | None = None,
    ignore_existing: bool = False,
    model_name: str = "radimagenet_resnet50",
    num_images: int = 100,
    output_root: str = "./features/features-512x512x512",
    target_shape: str = "512x512x512",
):
    """
    Compute 2.5D FID between a real and a synthetic 3D CT dataset with
    distributed GPU processing (torch.distributed, NCCL backend).

    Each rank loads its partition of both datasets (NIfTI volumes), extracts
    2.5D features along the XY/YZ/ZX planes (cached as .pt files), then all
    ranks gather features and rank 0 computes per-plane FID and their average.

    Args:
        real_dataset_root (str): Root folder for the real dataset.
        real_filelist (str): Text file listing NIfTI paths (one per line),
            joined onto `real_dataset_root`.
        real_features_dir (str): Subdirectory under `output_root` for the real
            dataset's cached .pt feature files.
        synth_dataset_root (str): Root folder for the synthetic dataset.
        synth_filelist (str): Text file listing NIfTI paths, joined onto
            `synth_dataset_root`.
        synth_features_dir (str): Subdirectory under `output_root` for the
            synthetic dataset's cached .pt feature files.
        enable_center_slices_ratio (float | None): If not None, only this
            central fraction of slices per axis is used; None disables
            center-slice selection.
        enable_padding (bool): Pad images to `target_shape`.
        enable_center_cropping (bool): Center-crop images to `target_shape`.
        enable_resampling_spacing (str | None): If not None, resample to this
            voxel spacing, e.g. "1.0x1.0x1.0"; None skips resampling.
        ignore_existing (bool): If True, recompute features even when a cached
            .pt file already exists.
        model_name (str): "radimagenet_resnet50", or anything else to fall
            back to torchvision squeezenet1_1.
        num_images (int): Max number of images used from each dataset.
        output_root (str): Folder where extracted .pt files and logs are saved.
        target_shape (str): Target shape "XxYxZ" for padding/cropping.

    Returns:
        None
    """
    # -------------------------------------------------------------------------
    # Initialize Process Group (Distributed)
    # -------------------------------------------------------------------------
    # Long timeout (2h): feature extraction of large volumes can keep ranks
    # apart for a while before the first collective.
    dist.init_process_group(backend="nccl", init_method="env://", timeout=timedelta(seconds=7200))

    local_rank = int(os.environ["LOCAL_RANK"])
    world_size = int(dist.get_world_size())
    device = torch.device("cuda", local_rank)
    torch.cuda.set_device(device)
    logger.info(f"[INFO] Running process on {device} of total {world_size} ranks.")

    # Convert potential string bools to actual bools (if using Fire or similar)
    if not isinstance(enable_padding, bool):
        enable_padding = enable_padding.lower() == "true"
    if not isinstance(enable_center_cropping, bool):
        enable_center_cropping = enable_center_cropping.lower() == "true"
    if not isinstance(ignore_existing, bool):
        ignore_existing = ignore_existing.lower() == "true"

    # Merge logic for center slices
    enable_center_slices = enable_center_slices_ratio is not None

    # Merge logic for resampling
    enable_resampling = enable_resampling_spacing is not None

    # Print out some flags on rank 0
    if local_rank == 0:
        logger.info(f"Real dataset root: {real_dataset_root}")
        logger.info(f"Synth dataset root: {synth_dataset_root}")
        logger.info(f"enable_center_slices_ratio: {enable_center_slices_ratio}")
        logger.info(f"enable_center_slices: {enable_center_slices}")
        logger.info(f"enable_padding: {enable_padding}")
        logger.info(f"enable_center_cropping: {enable_center_cropping}")
        logger.info(f"enable_resampling_spacing: {enable_resampling_spacing}")
        logger.info(f"enable_resampling: {enable_resampling}")
        logger.info(f"ignore_existing: {ignore_existing}")

    # -------------------------------------------------------------------------
    # Load feature extraction model
    # -------------------------------------------------------------------------
    if model_name == "radimagenet_resnet50":
        feature_network = torch.hub.load(
            "Warvito/radimagenet-models", model="radimagenet_resnet50", verbose=True, trust_repo=True
        )
        suffix = "radimagenet_resnet50"
    else:
        import torchvision

        feature_network = torchvision.models.squeezenet1_1(pretrained=True)
        suffix = "squeezenet1_1"
    # NOTE(review): `suffix` is assigned but never used below — presumably a
    # leftover from an earlier naming scheme for output files; confirm.

    feature_network.to(device)
    feature_network.eval()

    # -------------------------------------------------------------------------
    # Parse shape/spacings
    # -------------------------------------------------------------------------
    t_shape = [int(x) for x in target_shape.split("x")]
    target_shape_tuple = tuple(t_shape)

    # If not None, parse the resampling spacing
    if enable_resampling:
        rs_spacing = [float(x) for x in enable_resampling_spacing.split("x")]
        rs_spacing_tuple = tuple(rs_spacing)
        if local_rank == 0:
            logger.info(f"Resampling spacing: {rs_spacing_tuple}")
    else:
        rs_spacing_tuple = (1.0, 1.0, 1.0)

    # Use the ratio if provided, otherwise 1.0
    center_slices_ratio_final = enable_center_slices_ratio if enable_center_slices else 1.0
    if local_rank == 0:
        logger.info(f"center_slices_ratio: {center_slices_ratio_final}")

    # -------------------------------------------------------------------------
    # Prepare Real Dataset
    # -------------------------------------------------------------------------
    output_root_real = os.path.join(output_root, real_features_dir)
    with open(real_filelist, "r") as rf:
        real_lines = [l.strip() for l in rf.readlines()]
    # Sort before truncation so every rank sees the same deterministic subset.
    real_lines.sort()
    real_lines = real_lines[:num_images]

    real_filenames = [{"image": os.path.join(real_dataset_root, f)} for f in real_lines]
    # Each rank keeps only its own shard of the file list.
    real_filenames = monai.data.partition_dataset(
        data=real_filenames, shuffle=False, num_partitions=world_size, even_divisible=False
    )[local_rank]

    # -------------------------------------------------------------------------
    # Prepare Synthetic Dataset
    # -------------------------------------------------------------------------
    output_root_synth = os.path.join(output_root, synth_features_dir)
    with open(synth_filelist, "r") as sf:
        synth_lines = [l.strip() for l in sf.readlines()]
    synth_lines.sort()
    synth_lines = synth_lines[:num_images]

    synth_filenames = [{"image": os.path.join(synth_dataset_root, f)} for f in synth_lines]
    synth_filenames = monai.data.partition_dataset(
        data=synth_filenames, shuffle=False, num_partitions=world_size, even_divisible=False
    )[local_rank]

    # -------------------------------------------------------------------------
    # Build MONAI transforms
    # -------------------------------------------------------------------------
    transform_list = [
        monai.transforms.LoadImaged(keys=["image"]),
        monai.transforms.EnsureChannelFirstd(keys=["image"]),
        monai.transforms.Orientationd(keys=["image"], axcodes="RAS"),
    ]

    if enable_resampling:
        transform_list.append(monai.transforms.Spacingd(keys=["image"], pixdim=rs_spacing_tuple, mode=["bilinear"]))

    if enable_padding:
        # -1000 HU ≈ air, so padding does not introduce tissue-like intensities.
        transform_list.append(
            monai.transforms.SpatialPadd(keys=["image"], spatial_size=target_shape_tuple, mode="constant", value=-1000)
        )

    if enable_center_cropping:
        transform_list.append(monai.transforms.CenterSpatialCropd(keys=["image"], roi_size=target_shape_tuple))

    # Clip to [-1000, 1000] HU without rescaling (b_min/b_max equal a_min/a_max).
    transform_list.append(
        monai.transforms.ScaleIntensityRanged(
            keys=["image"], a_min=-1000, a_max=1000, b_min=-1000, b_max=1000, clip=True
        )
    )
    transforms = Compose(transform_list)

    # -------------------------------------------------------------------------
    # Create DataLoaders
    # -------------------------------------------------------------------------
    real_ds = monai.data.Dataset(data=real_filenames, transform=transforms)
    real_loader = monai.data.DataLoader(real_ds, num_workers=6, batch_size=1, shuffle=False)

    synth_ds = monai.data.Dataset(data=synth_filenames, transform=transforms)
    synth_loader = monai.data.DataLoader(synth_ds, num_workers=6, batch_size=1, shuffle=False)

    # -------------------------------------------------------------------------
    # Extract features for Real Dataset
    # -------------------------------------------------------------------------
    real_features_xy, real_features_yz, real_features_zx = [], [], []
    for idx, batch_data in enumerate(real_loader, start=1):
        img = batch_data["image"].to(device)
        fn = img.meta["filename_or_obj"][0]
        logger.info(f"[Rank {local_rank}] Real data {idx}/{len(real_filenames)}: {fn}")

        # Mirror the dataset tree under the features dir, swapping the extension.
        out_fp = fn.replace(real_dataset_root, output_root_real).replace(".nii.gz", ".pt")
        out_fp = Path(out_fp)
        out_fp.parent.mkdir(parents=True, exist_ok=True)

        # Reuse the cached features unless recomputation was requested.
        if (not ignore_existing) and os.path.isfile(out_fp):
            feats = torch.load(out_fp, weights_only=True)
        else:
            img_t = img.as_tensor()
            logger.info(f"image shape: {tuple(img_t.shape)}")

            feats = get_features_2p5d(
                img_t,
                feature_network,
                center_slices=enable_center_slices,
                center_slices_ratio=center_slices_ratio_final,
                xy_only=False,
            )
            logger.info(f"feats shapes: {feats[0].shape}, {feats[1].shape}, {feats[2].shape}")
            torch.save(feats, out_fp)

        real_features_xy.append(feats[0])
        real_features_yz.append(feats[1])
        real_features_zx.append(feats[2])

    real_features_xy = torch.vstack(real_features_xy)
    real_features_yz = torch.vstack(real_features_yz)
    real_features_zx = torch.vstack(real_features_zx)
    logger.info(
        f"Real feature shapes: {real_features_xy.shape}, " f"{real_features_yz.shape}, {real_features_zx.shape}"
    )

    # -------------------------------------------------------------------------
    # Extract features for Synthetic Dataset
    # -------------------------------------------------------------------------
    synth_features_xy, synth_features_yz, synth_features_zx = [], [], []
    for idx, batch_data in enumerate(synth_loader, start=1):
        img = batch_data["image"].to(device)
        fn = img.meta["filename_or_obj"][0]
        logger.info(f"[Rank {local_rank}] Synth data {idx}/{len(synth_filenames)}: {fn}")

        out_fp = fn.replace(synth_dataset_root, output_root_synth).replace(".nii.gz", ".pt")
        out_fp = Path(out_fp)
        out_fp.parent.mkdir(parents=True, exist_ok=True)

        if (not ignore_existing) and os.path.isfile(out_fp):
            feats = torch.load(out_fp, weights_only=True)
        else:
            img_t = img.as_tensor()
            logger.info(f"image shape: {tuple(img_t.shape)}")

            feats = get_features_2p5d(
                img_t,
                feature_network,
                center_slices=enable_center_slices,
                center_slices_ratio=center_slices_ratio_final,
                xy_only=False,
            )
            logger.info(f"feats shapes: {feats[0].shape}, {feats[1].shape}, {feats[2].shape}")
            torch.save(feats, out_fp)

        synth_features_xy.append(feats[0])
        synth_features_yz.append(feats[1])
        synth_features_zx.append(feats[2])

    synth_features_xy = torch.vstack(synth_features_xy)
    synth_features_yz = torch.vstack(synth_features_yz)
    synth_features_zx = torch.vstack(synth_features_zx)
    logger.info(
        f"Synth feature shapes: {synth_features_xy.shape}, " f"{synth_features_yz.shape}, {synth_features_zx.shape}"
    )

    # -------------------------------------------------------------------------
    # All-reduce / gather features across ranks
    # -------------------------------------------------------------------------
    features = [
        real_features_xy,
        real_features_yz,
        real_features_zx,
        synth_features_xy,
        synth_features_yz,
        synth_features_zx,
    ]

    # 1) Gather local feature sizes across ranks
    # Ranks may hold different numbers of slices, so sizes are exchanged first.
    local_sizes = []
    for ft_idx in range(len(features)):
        local_size = torch.tensor([features[ft_idx].shape[0]], dtype=torch.int64, device=device)
        local_sizes.append(local_size)

    all_sizes = []
    for ft_idx in range(len(features)):
        rank_sizes = [torch.tensor([0], dtype=torch.int64, device=device) for _ in range(world_size)]
        dist.all_gather(rank_sizes, local_sizes[ft_idx])
        all_sizes.append(rank_sizes)

    # 2) Pad and gather all features
    # all_gather requires equal shapes, so each tensor is padded to the max
    # rank size, gathered, then trimmed back to its true length.
    all_tensors_list = []
    for ft_idx, ft in enumerate(features):
        max_size = max(all_sizes[ft_idx]).item()
        ft_padded = pad_to_max_size(ft, max_size)

        gather_list = [torch.empty_like(ft_padded) for _ in range(world_size)]
        dist.all_gather(gather_list, ft_padded)

        # Trim each gather back to the real size
        for rk in range(world_size):
            gather_list[rk] = gather_list[rk][: all_sizes[ft_idx][rk], :]

        all_tensors_list.append(gather_list)

    # On rank 0, compute FID
    if local_rank == 0:
        real_xy = torch.vstack(all_tensors_list[0])
        real_yz = torch.vstack(all_tensors_list[1])
        real_zx = torch.vstack(all_tensors_list[2])

        synth_xy = torch.vstack(all_tensors_list[3])
        synth_yz = torch.vstack(all_tensors_list[4])
        synth_zx = torch.vstack(all_tensors_list[5])

        logger.info(f"Final Real shapes: {real_xy.shape}, {real_yz.shape}, {real_zx.shape}")
        logger.info(f"Final Synth shapes: {synth_xy.shape}, {synth_yz.shape}, {synth_zx.shape}")

        fid = FIDMetric()
        logger.info(f"Computing FID for: {output_root_real} | {output_root_synth}")
        fid_res_xy = fid(synth_xy, real_xy)
        fid_res_yz = fid(synth_yz, real_yz)
        fid_res_zx = fid(synth_zx, real_zx)

        logger.info(f"FID XY: {fid_res_xy}")
        logger.info(f"FID YZ: {fid_res_yz}")
        logger.info(f"FID ZX: {fid_res_zx}")
        fid_avg = (fid_res_xy + fid_res_yz + fid_res_zx) / 3.0
        logger.info(f"FID Avg: {fid_avg}")

    dist.destroy_process_group()
744
+
745
+
746
if __name__ == "__main__":
    # python-fire maps CLI flags (e.g. --num_images) onto main()'s keyword args.
    fire.Fire(main)
scripts/diff_model_create_training_data.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) MONAI Consortium
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an "AS IS" BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import json
16
+ import logging
17
+ import os
18
+ from pathlib import Path
19
+
20
+ import monai
21
+ import nibabel as nib
22
+ import numpy as np
23
+ import torch
24
+ import torch.distributed as dist
25
+ from monai.transforms import Compose
26
+ from monai.utils import set_determinism
27
+
28
+ from .diff_model_setting import initialize_distributed, load_config, setup_logging
29
+ from .utils import define_instance
30
+
31
+ # Set the random seed for reproducibility
32
+ set_determinism(seed=0)
33
+
34
+
35
+ def create_transforms(dim: tuple = None) -> Compose:
36
+ """
37
+ Create a set of MONAI transforms for preprocessing.
38
+
39
+ Args:
40
+ dim (tuple, optional): New dimensions for resizing. Defaults to None.
41
+
42
+ Returns:
43
+ Compose: Composed MONAI transforms.
44
+ """
45
+ if dim:
46
+ return Compose(
47
+ [
48
+ monai.transforms.LoadImaged(keys="image"),
49
+ monai.transforms.EnsureChannelFirstd(keys="image"),
50
+ monai.transforms.Orientationd(keys="image", axcodes="RAS"),
51
+ monai.transforms.EnsureTyped(keys="image", dtype=torch.float32),
52
+ monai.transforms.ScaleIntensityRanged(
53
+ keys="image", a_min=-1000, a_max=1000, b_min=0, b_max=1, clip=True
54
+ ),
55
+ monai.transforms.Resized(keys="image", spatial_size=dim, mode="trilinear"),
56
+ ]
57
+ )
58
+ else:
59
+ return Compose(
60
+ [
61
+ monai.transforms.LoadImaged(keys="image"),
62
+ monai.transforms.EnsureChannelFirstd(keys="image"),
63
+ monai.transforms.Orientationd(keys="image", axcodes="RAS"),
64
+ ]
65
+ )
66
+
67
+
68
+ def round_number(number: int, base_number: int = 128) -> int:
69
+ """
70
+ Round the number to the nearest multiple of the base number, with a minimum value of the base number.
71
+
72
+ Args:
73
+ number (int): Number to be rounded.
74
+ base_number (int): Number to be common divisor.
75
+
76
+ Returns:
77
+ int: Rounded number.
78
+ """
79
+ new_number = max(round(float(number) / float(base_number)), 1.0) * float(base_number)
80
+ return int(new_number)
81
+
82
+
83
+ def load_filenames(data_list_path: str) -> list:
84
+ """
85
+ Load filenames from the JSON data list.
86
+
87
+ Args:
88
+ data_list_path (str): Path to the JSON data list file.
89
+
90
+ Returns:
91
+ list: List of filenames.
92
+ """
93
+ with open(data_list_path, "r") as file:
94
+ json_data = json.load(file)
95
+ filenames_raw = json_data["training"]
96
+ return [_item["image"] for _item in filenames_raw]
97
+
98
+
99
+ def process_file(
100
+ filepath: str,
101
+ args: argparse.Namespace,
102
+ autoencoder: torch.nn.Module,
103
+ device: torch.device,
104
+ plain_transforms: Compose,
105
+ new_transforms: Compose,
106
+ logger: logging.Logger,
107
+ ) -> None:
108
+ """
109
+ Process a single file to create training data.
110
+
111
+ Args:
112
+ filepath (str): Path to the file to be processed.
113
+ args (argparse.Namespace): Configuration arguments.
114
+ autoencoder (torch.nn.Module): Autoencoder model.
115
+ device (torch.device): Device to process the file on.
116
+ plain_transforms (Compose): Plain transforms.
117
+ new_transforms (Compose): New transforms.
118
+ logger (logging.Logger): Logger for logging information.
119
+ """
120
+ out_filename_base = filepath.replace(".gz", "").replace(".nii", "")
121
+ out_filename_base = os.path.join(args.embedding_base_dir, out_filename_base)
122
+ out_filename = out_filename_base + "_emb.nii.gz"
123
+
124
+ if os.path.isfile(out_filename):
125
+ return
126
+
127
+ test_data = {"image": os.path.join(args.data_base_dir, filepath)}
128
+ transformed_data = plain_transforms(test_data)
129
+ nda = transformed_data["image"]
130
+
131
+ dim = [int(nda.meta["dim"][_i]) for _i in range(1, 4)]
132
+ spacing = [float(nda.meta["pixdim"][_i]) for _i in range(1, 4)]
133
+
134
+ logger.info(f"old dim: {dim}, old spacing: {spacing}")
135
+
136
+ new_data = new_transforms(test_data)
137
+ nda_image = new_data["image"]
138
+
139
+ new_affine = nda_image.meta["affine"].numpy()
140
+ nda_image = nda_image.numpy().squeeze()
141
+
142
+ logger.info(f"new dim: {nda_image.shape}, new affine: {new_affine}")
143
+
144
+ try:
145
+ out_path = Path(out_filename)
146
+ out_path.parent.mkdir(parents=True, exist_ok=True)
147
+ logger.info(f"out_filename: {out_filename}")
148
+
149
+ with torch.amp.autocast("cuda"):
150
+ pt_nda = torch.from_numpy(nda_image).float().to(device).unsqueeze(0).unsqueeze(0)
151
+ z = autoencoder.encode_stage_2_inputs(pt_nda)
152
+ logger.info(f"z: {z.size()}, {z.dtype}")
153
+
154
+ out_nda = z.squeeze().cpu().detach().numpy().transpose(1, 2, 3, 0)
155
+ out_img = nib.Nifti1Image(np.float32(out_nda), affine=new_affine)
156
+ nib.save(out_img, out_filename)
157
+ except Exception as e:
158
+ logger.error(f"Error processing {filepath}: {e}")
159
+
160
+
161
+ @torch.inference_mode()
162
+ def diff_model_create_training_data(
163
+ env_config_path: str, model_config_path: str, model_def_path: str, num_gpus: int
164
+ ) -> None:
165
+ """
166
+ Create training data for the diffusion model.
167
+
168
+ Args:
169
+ env_config_path (str): Path to the environment configuration file.
170
+ model_config_path (str): Path to the model configuration file.
171
+ model_def_path (str): Path to the model definition file.
172
+ """
173
+ args = load_config(env_config_path, model_config_path, model_def_path)
174
+ local_rank, world_size, device = initialize_distributed(num_gpus=num_gpus)
175
+ logger = setup_logging("creating training data")
176
+ logger.info(f"Using device {device}")
177
+
178
+ autoencoder = define_instance(args, "autoencoder_def").to(device)
179
+ try:
180
+ checkpoint_autoencoder = torch.load(args.trained_autoencoder_path, weights_only=True)
181
+ autoencoder.load_state_dict(checkpoint_autoencoder)
182
+ except Exception:
183
+ logger.error("The trained_autoencoder_path does not exist!")
184
+
185
+ Path(args.embedding_base_dir).mkdir(parents=True, exist_ok=True)
186
+
187
+ filenames_raw = load_filenames(args.json_data_list)
188
+ logger.info(f"filenames_raw: {filenames_raw}")
189
+
190
+ plain_transforms = create_transforms(dim=None)
191
+
192
+ for _iter in range(len(filenames_raw)):
193
+ if _iter % world_size != local_rank:
194
+ continue
195
+
196
+ filepath = filenames_raw[_iter]
197
+ new_dim = tuple(
198
+ round_number(
199
+ int(plain_transforms({"image": os.path.join(args.data_base_dir, filepath)})["image"].meta["dim"][_i])
200
+ )
201
+ for _i in range(1, 4)
202
+ )
203
+ new_transforms = create_transforms(new_dim)
204
+
205
+ process_file(filepath, args, autoencoder, device, plain_transforms, new_transforms, logger)
206
+
207
+ if dist.is_initialized():
208
+ dist.destroy_process_group()
209
+
210
+
211
+ if __name__ == "__main__":
212
+ parser = argparse.ArgumentParser(description="Diffusion Model Training Data Creation")
213
+ parser.add_argument(
214
+ "--env_config",
215
+ type=str,
216
+ default="./configs/environment_maisi_diff_model_train.json",
217
+ help="Path to environment configuration file",
218
+ )
219
+ parser.add_argument(
220
+ "--model_config",
221
+ type=str,
222
+ default="./configs/config_maisi_diff_model_train.json",
223
+ help="Path to model training/inference configuration",
224
+ )
225
+ parser.add_argument(
226
+ "--model_def", type=str, default="./configs/config_maisi.json", help="Path to model definition file"
227
+ )
228
+ parser.add_argument("--num_gpus", type=int, default=1, help="Number of GPUs to use for distributed training")
229
+
230
+ args = parser.parse_args()
231
+ diff_model_create_training_data(args.env_config, args.model_config, args.model_def, args.num_gpus)
scripts/diff_model_infer.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) MONAI Consortium
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an "AS IS" BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import logging
16
+ import os
17
+ import random
18
+ from datetime import datetime
19
+
20
+ import nibabel as nib
21
+ import numpy as np
22
+ import torch
23
+ import torch.distributed as dist
24
+ from monai.inferers import sliding_window_inference
25
+ from monai.inferers.inferer import SlidingWindowInferer
26
+ from monai.networks.schedulers import RFlowScheduler
27
+ from monai.utils import set_determinism
28
+ from tqdm import tqdm
29
+
30
+ from .diff_model_setting import initialize_distributed, load_config, setup_logging
31
+ from .sample import ReconModel, check_input
32
+ from .utils import define_instance, dynamic_infer
33
+
34
+
35
+ def set_random_seed(seed: int) -> int:
36
+ """
37
+ Set random seed for reproducibility.
38
+
39
+ Args:
40
+ seed (int): Random seed.
41
+
42
+ Returns:
43
+ int: Set random seed.
44
+ """
45
+ random_seed = random.randint(0, 99999) if seed is None else seed
46
+ set_determinism(random_seed)
47
+ return random_seed
48
+
49
+
50
+ def load_models(args: argparse.Namespace, device: torch.device, logger: logging.Logger) -> tuple:
51
+ """
52
+ Load the autoencoder and UNet models.
53
+
54
+ Args:
55
+ args (argparse.Namespace): Configuration arguments.
56
+ device (torch.device): Device to load models on.
57
+ logger (logging.Logger): Logger for logging information.
58
+
59
+ Returns:
60
+ tuple: Loaded autoencoder, UNet model, and scale factor.
61
+ """
62
+ autoencoder = define_instance(args, "autoencoder_def").to(device)
63
+ try:
64
+ checkpoint_autoencoder = torch.load(args.trained_autoencoder_path, weights_only=True)
65
+ autoencoder.load_state_dict(checkpoint_autoencoder)
66
+ except Exception:
67
+ logger.error("The trained_autoencoder_path does not exist!")
68
+
69
+ unet = define_instance(args, "diffusion_unet_def").to(device)
70
+ checkpoint = torch.load(f"{args.model_dir}/{args.model_filename}", map_location=device, weights_only=False)
71
+ unet.load_state_dict(checkpoint["unet_state_dict"], strict=True)
72
+ logger.info(f"checkpoints {args.model_dir}/{args.model_filename} loaded.")
73
+
74
+ scale_factor = checkpoint["scale_factor"]
75
+ logger.info(f"scale_factor -> {scale_factor}.")
76
+
77
+ return autoencoder, unet, scale_factor
78
+
79
+
80
+ def prepare_tensors(args: argparse.Namespace, device: torch.device) -> tuple:
81
+ """
82
+ Prepare necessary tensors for inference.
83
+
84
+ Args:
85
+ args (argparse.Namespace): Configuration arguments.
86
+ device (torch.device): Device to load tensors on.
87
+
88
+ Returns:
89
+ tuple: Prepared top_region_index_tensor, bottom_region_index_tensor, and spacing_tensor.
90
+ """
91
+ top_region_index_tensor = np.array(args.diffusion_unet_inference["top_region_index"]).astype(float) * 1e2
92
+ bottom_region_index_tensor = np.array(args.diffusion_unet_inference["bottom_region_index"]).astype(float) * 1e2
93
+ spacing_tensor = np.array(args.diffusion_unet_inference["spacing"]).astype(float) * 1e2
94
+
95
+ top_region_index_tensor = torch.from_numpy(top_region_index_tensor[np.newaxis, :]).half().to(device)
96
+ bottom_region_index_tensor = torch.from_numpy(bottom_region_index_tensor[np.newaxis, :]).half().to(device)
97
+ spacing_tensor = torch.from_numpy(spacing_tensor[np.newaxis, :]).half().to(device)
98
+ modality_tensor = args.diffusion_unet_inference["modality"] * torch.ones(
99
+ (len(spacing_tensor)), dtype=torch.long
100
+ ).to(device)
101
+
102
+ return top_region_index_tensor, bottom_region_index_tensor, spacing_tensor, modality_tensor
103
+
104
+
105
+ def run_inference(
106
+ args: argparse.Namespace,
107
+ device: torch.device,
108
+ autoencoder: torch.nn.Module,
109
+ unet: torch.nn.Module,
110
+ scale_factor: float,
111
+ top_region_index_tensor: torch.Tensor,
112
+ bottom_region_index_tensor: torch.Tensor,
113
+ spacing_tensor: torch.Tensor,
114
+ modality_tensor: torch.Tensor,
115
+ output_size: tuple,
116
+ divisor: int,
117
+ logger: logging.Logger,
118
+ ) -> np.ndarray:
119
+ """
120
+ Run the inference to generate synthetic images.
121
+
122
+ Args:
123
+ args (argparse.Namespace): Configuration arguments.
124
+ device (torch.device): Device to run inference on.
125
+ autoencoder (torch.nn.Module): Autoencoder model.
126
+ unet (torch.nn.Module): UNet model.
127
+ scale_factor (float): Scale factor for the model.
128
+ top_region_index_tensor (torch.Tensor): Top region index tensor.
129
+ bottom_region_index_tensor (torch.Tensor): Bottom region index tensor.
130
+ spacing_tensor (torch.Tensor): Spacing tensor.
131
+ modality_tensor (torch.Tensor): Modality tensor.
132
+ output_size (tuple): Output size of the synthetic image.
133
+ divisor (int): Divisor for downsample level.
134
+ logger (logging.Logger): Logger for logging information.
135
+
136
+ Returns:
137
+ np.ndarray: Generated synthetic image data.
138
+ """
139
+ include_body_region = unet.include_top_region_index_input
140
+ include_modality = unet.num_class_embeds is not None
141
+
142
+ noise = torch.randn(
143
+ (
144
+ 1,
145
+ args.latent_channels,
146
+ output_size[0] // divisor,
147
+ output_size[1] // divisor,
148
+ output_size[2] // divisor,
149
+ ),
150
+ device=device,
151
+ )
152
+ logger.info(f"noise: {noise.device}, {noise.dtype}, {type(noise)}")
153
+
154
+ image = noise
155
+ noise_scheduler = define_instance(args, "noise_scheduler")
156
+ if isinstance(noise_scheduler, RFlowScheduler):
157
+ noise_scheduler.set_timesteps(
158
+ num_inference_steps=args.diffusion_unet_inference["num_inference_steps"],
159
+ input_img_size_numel=torch.prod(torch.tensor(noise.shape[2:])),
160
+ )
161
+ else:
162
+ noise_scheduler.set_timesteps(num_inference_steps=args.diffusion_unet_inference["num_inference_steps"])
163
+
164
+ recon_model = ReconModel(autoencoder=autoencoder, scale_factor=scale_factor).to(device)
165
+ autoencoder.eval()
166
+ unet.eval()
167
+
168
+ all_timesteps = noise_scheduler.timesteps
169
+ all_next_timesteps = torch.cat((all_timesteps[1:], torch.tensor([0], dtype=all_timesteps.dtype)))
170
+ progress_bar = tqdm(
171
+ zip(all_timesteps, all_next_timesteps),
172
+ total=min(len(all_timesteps), len(all_next_timesteps)),
173
+ )
174
+ with torch.amp.autocast("cuda", enabled=True):
175
+ for t, next_t in progress_bar:
176
+ # Create a dictionary to store the inputs
177
+ unet_inputs = {
178
+ "x": image,
179
+ "timesteps": torch.Tensor((t,)).to(device),
180
+ "spacing_tensor": spacing_tensor,
181
+ }
182
+
183
+ # Add extra arguments if include_body_region is True
184
+ if include_body_region:
185
+ unet_inputs.update(
186
+ {
187
+ "top_region_index_tensor": top_region_index_tensor,
188
+ "bottom_region_index_tensor": bottom_region_index_tensor,
189
+ }
190
+ )
191
+
192
+ if include_modality:
193
+ unet_inputs.update(
194
+ {
195
+ "class_labels": modality_tensor,
196
+ }
197
+ )
198
+ model_output = unet(**unet_inputs)
199
+ if not isinstance(noise_scheduler, RFlowScheduler):
200
+ image, _ = noise_scheduler.step(model_output, t, image) # type: ignore
201
+ else:
202
+ image, _ = noise_scheduler.step(model_output, t, image, next_t) # type: ignore
203
+
204
+ inferer = SlidingWindowInferer(
205
+ roi_size=[80, 80, 80],
206
+ sw_batch_size=1,
207
+ progress=True,
208
+ mode="gaussian",
209
+ overlap=0.4,
210
+ sw_device=device,
211
+ device=device,
212
+ )
213
+ synthetic_images = dynamic_infer(inferer, recon_model, image)
214
+ data = synthetic_images.squeeze().cpu().detach().numpy()
215
+ a_min, a_max, b_min, b_max = -1000, 1000, 0, 1
216
+ data = (data - b_min) / (b_max - b_min) * (a_max - a_min) + a_min
217
+ data = np.clip(data, a_min, a_max)
218
+ return np.int16(data)
219
+
220
+
221
+ def save_image(
222
+ data: np.ndarray,
223
+ output_size: tuple,
224
+ out_spacing: tuple,
225
+ output_path: str,
226
+ logger: logging.Logger,
227
+ ) -> None:
228
+ """
229
+ Save the generated synthetic image to a file.
230
+
231
+ Args:
232
+ data (np.ndarray): Synthetic image data.
233
+ output_size (tuple): Output size of the image.
234
+ out_spacing (tuple): Spacing of the output image.
235
+ output_path (str): Path to save the output image.
236
+ logger (logging.Logger): Logger for logging information.
237
+ """
238
+ out_affine = np.eye(4)
239
+ for i in range(3):
240
+ out_affine[i, i] = out_spacing[i]
241
+
242
+ new_image = nib.Nifti1Image(data, affine=out_affine)
243
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
244
+ nib.save(new_image, output_path)
245
+ logger.info(f"Saved {output_path}.")
246
+
247
+
248
+ @torch.inference_mode()
249
+ def diff_model_infer(env_config_path: str, model_config_path: str, model_def_path: str, num_gpus: int) -> None:
250
+ """
251
+ Main function to run the diffusion model inference.
252
+
253
+ Args:
254
+ env_config_path (str): Path to the environment configuration file.
255
+ model_config_path (str): Path to the model configuration file.
256
+ model_def_path (str): Path to the model definition file.
257
+ """
258
+ args = load_config(env_config_path, model_config_path, model_def_path)
259
+ local_rank, world_size, device = initialize_distributed(num_gpus)
260
+ logger = setup_logging("inference")
261
+ random_seed = set_random_seed(
262
+ args.diffusion_unet_inference["random_seed"] + local_rank
263
+ if args.diffusion_unet_inference["random_seed"]
264
+ else None
265
+ )
266
+ logger.info(f"Using {device} of {world_size} with random seed: {random_seed}")
267
+
268
+ output_size = tuple(args.diffusion_unet_inference["dim"])
269
+ out_spacing = tuple(args.diffusion_unet_inference["spacing"])
270
+ output_prefix = args.output_prefix
271
+ ckpt_filepath = f"{args.model_dir}/{args.model_filename}"
272
+
273
+ if local_rank == 0:
274
+ logger.info(f"[config] ckpt_filepath -> {ckpt_filepath}.")
275
+ logger.info(f"[config] random_seed -> {random_seed}.")
276
+ logger.info(f"[config] output_prefix -> {output_prefix}.")
277
+ logger.info(f"[config] output_size -> {output_size}.")
278
+ logger.info(f"[config] out_spacing -> {out_spacing}.")
279
+
280
+ check_input(None, None, None, output_size, out_spacing, None)
281
+
282
+ autoencoder, unet, scale_factor = load_models(args, device, logger)
283
+ num_downsample_level = max(
284
+ 1,
285
+ (
286
+ len(args.diffusion_unet_def["num_channels"])
287
+ if isinstance(args.diffusion_unet_def["num_channels"], list)
288
+ else len(args.diffusion_unet_def["attention_levels"])
289
+ ),
290
+ )
291
+ divisor = 2 ** (num_downsample_level - 2)
292
+ logger.info(f"num_downsample_level -> {num_downsample_level}, divisor -> {divisor}.")
293
+
294
+ top_region_index_tensor, bottom_region_index_tensor, spacing_tensor, modality_tensor = prepare_tensors(args, device)
295
+ data = run_inference(
296
+ args,
297
+ device,
298
+ autoencoder,
299
+ unet,
300
+ scale_factor,
301
+ top_region_index_tensor,
302
+ bottom_region_index_tensor,
303
+ spacing_tensor,
304
+ modality_tensor,
305
+ output_size,
306
+ divisor,
307
+ logger,
308
+ )
309
+
310
+ timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
311
+ output_path = "{0}/{1}_seed{2}_size{3:d}x{4:d}x{5:d}_spacing{6:.2f}x{7:.2f}x{8:.2f}_{9}_rank{10}.nii.gz".format(
312
+ args.output_dir,
313
+ output_prefix,
314
+ random_seed,
315
+ output_size[0],
316
+ output_size[1],
317
+ output_size[2],
318
+ out_spacing[0],
319
+ out_spacing[1],
320
+ out_spacing[2],
321
+ timestamp,
322
+ local_rank,
323
+ )
324
+ save_image(data, output_size, out_spacing, output_path, logger)
325
+
326
+ if dist.is_initialized():
327
+ dist.destroy_process_group()
328
+
329
+
330
+ if __name__ == "__main__":
331
+ parser = argparse.ArgumentParser(description="Diffusion Model Inference")
332
+ parser.add_argument(
333
+ "--env_config",
334
+ type=str,
335
+ default="./configs/environment_maisi_diff_model_train.json",
336
+ help="Path to environment configuration file",
337
+ )
338
+ parser.add_argument(
339
+ "--model_config",
340
+ type=str,
341
+ default="./configs/config_maisi_diff_model_train.json",
342
+ help="Path to model training/inference configuration",
343
+ )
344
+ parser.add_argument(
345
+ "--model_def",
346
+ type=str,
347
+ default="./configs/config_maisi.json",
348
+ help="Path to model definition file",
349
+ )
350
+ parser.add_argument(
351
+ "--num_gpus",
352
+ type=int,
353
+ default=1,
354
+ help="Number of GPUs to use for distributed inference",
355
+ )
356
+
357
+ args = parser.parse_args()
358
+ diff_model_infer(args.env_config, args.model_config, args.model_def, args.num_gpus)
scripts/diff_model_setting.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) MONAI Consortium
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an "AS IS" BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import json
16
+ import logging
17
+
18
+ import torch
19
+ import torch.distributed as dist
20
+ from monai.utils import RankFilter
21
+
22
+
23
+ def setup_logging(logger_name: str = "") -> logging.Logger:
24
+ """
25
+ Setup the logging configuration.
26
+
27
+ Args:
28
+ logger_name (str): logger name.
29
+
30
+ Returns:
31
+ logging.Logger: Configured logger.
32
+ """
33
+ logger = logging.getLogger(logger_name)
34
+ if dist.is_initialized():
35
+ logger.addFilter(RankFilter())
36
+ logging.basicConfig(
37
+ level=logging.INFO,
38
+ format="[%(asctime)s.%(msecs)03d][%(levelname)5s](%(name)s) - %(message)s",
39
+ datefmt="%Y-%m-%d %H:%M:%S",
40
+ )
41
+ return logger
42
+
43
+
44
+ def load_config(env_config_path: str, model_config_path: str, model_def_path: str) -> argparse.Namespace:
45
+ """
46
+ Load configuration from JSON files.
47
+
48
+ Args:
49
+ env_config_path (str): Path to the environment configuration file.
50
+ model_config_path (str): Path to the model configuration file.
51
+ model_def_path (str): Path to the model definition file.
52
+
53
+ Returns:
54
+ argparse.Namespace: Loaded configuration.
55
+ """
56
+ args = argparse.Namespace()
57
+
58
+ with open(env_config_path, "r") as f:
59
+ env_config = json.load(f)
60
+ for k, v in env_config.items():
61
+ setattr(args, k, v)
62
+
63
+ with open(model_config_path, "r") as f:
64
+ model_config = json.load(f)
65
+ for k, v in model_config.items():
66
+ setattr(args, k, v)
67
+
68
+ with open(model_def_path, "r") as f:
69
+ model_def = json.load(f)
70
+ for k, v in model_def.items():
71
+ setattr(args, k, v)
72
+
73
+ return args
74
+
75
+
76
+ def initialize_distributed(num_gpus: int) -> tuple:
77
+ """
78
+ Initialize distributed training.
79
+
80
+ Returns:
81
+ tuple: local_rank, world_size, and device.
82
+ """
83
+ if torch.cuda.is_available() and num_gpus > 1:
84
+ dist.init_process_group(backend="nccl", init_method="env://")
85
+ local_rank = dist.get_rank()
86
+ world_size = dist.get_world_size()
87
+ else:
88
+ local_rank = 0
89
+ world_size = 1
90
+ device = torch.device("cuda", local_rank)
91
+ torch.cuda.set_device(device)
92
+ return local_rank, world_size, device
scripts/diff_model_train.py ADDED
@@ -0,0 +1,499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) MONAI Consortium
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an "AS IS" BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import json
16
+ import logging
17
+ import os
18
+ from datetime import datetime
19
+ from pathlib import Path
20
+
21
+ import monai
22
+ import torch
23
+ import torch.distributed as dist
24
+ from monai.data import DataLoader, partition_dataset
25
+ from monai.networks.schedulers import RFlowScheduler
26
+ from monai.networks.schedulers.ddpm import DDPMPredictionType
27
+ from monai.transforms import Compose
28
+ from monai.utils import first
29
+ from torch.amp import GradScaler, autocast
30
+ from torch.nn.parallel import DistributedDataParallel
31
+
32
+ from .diff_model_setting import initialize_distributed, load_config, setup_logging
33
+ from .utils import define_instance
34
+
35
+
36
+ def load_filenames(data_list_path: str) -> list:
37
+ """
38
+ Load filenames from the JSON data list.
39
+
40
+ Args:
41
+ data_list_path (str): Path to the JSON data list file.
42
+
43
+ Returns:
44
+ list: List of filenames.
45
+ """
46
+ with open(data_list_path, "r") as file:
47
+ json_data = json.load(file)
48
+ filenames_train = json_data["training"]
49
+ return [_item["image"].replace(".nii.gz", "_emb.nii.gz") for _item in filenames_train]
50
+
51
+
52
+ def prepare_data(
53
+ train_files: list,
54
+ device: torch.device,
55
+ cache_rate: float,
56
+ num_workers: int = 2,
57
+ batch_size: int = 1,
58
+ include_body_region: bool = False,
59
+ ) -> DataLoader:
60
+ """
61
+ Prepare training data.
62
+
63
+ Args:
64
+ train_files (list): List of training files.
65
+ device (torch.device): Device to use for training.
66
+ cache_rate (float): Cache rate for dataset.
67
+ num_workers (int): Number of workers for data loading.
68
+ batch_size (int): Mini-batch size.
69
+ include_body_region (bool): Whether to include body region in data
70
+
71
+ Returns:
72
+ DataLoader: Data loader for training.
73
+ """
74
+
75
+ def _load_data_from_file(file_path, key):
76
+ with open(file_path) as f:
77
+ return torch.FloatTensor(json.load(f)[key])
78
+
79
+ train_transforms_list = [
80
+ monai.transforms.LoadImaged(keys=["image"]),
81
+ monai.transforms.EnsureChannelFirstd(keys=["image"]),
82
+ monai.transforms.Lambdad(keys="spacing", func=lambda x: _load_data_from_file(x, "spacing")),
83
+ monai.transforms.Lambdad(keys="spacing", func=lambda x: x * 1e2),
84
+ ]
85
+ if include_body_region:
86
+ train_transforms_list += [
87
+ monai.transforms.Lambdad(
88
+ keys="top_region_index", func=lambda x: _load_data_from_file(x, "top_region_index")
89
+ ),
90
+ monai.transforms.Lambdad(
91
+ keys="bottom_region_index", func=lambda x: _load_data_from_file(x, "bottom_region_index")
92
+ ),
93
+ monai.transforms.Lambdad(keys="top_region_index", func=lambda x: x * 1e2),
94
+ monai.transforms.Lambdad(keys="bottom_region_index", func=lambda x: x * 1e2),
95
+ ]
96
+ train_transforms = Compose(train_transforms_list)
97
+
98
+ train_ds = monai.data.CacheDataset(
99
+ data=train_files, transform=train_transforms, cache_rate=cache_rate, num_workers=num_workers
100
+ )
101
+
102
+ return DataLoader(train_ds, num_workers=6, batch_size=batch_size, shuffle=True)
103
+
104
+
105
+ def load_unet(args: argparse.Namespace, device: torch.device, logger: logging.Logger) -> torch.nn.Module:
106
+ """
107
+ Load the UNet model.
108
+
109
+ Args:
110
+ args (argparse.Namespace): Configuration arguments.
111
+ device (torch.device): Device to load the model on.
112
+ logger (logging.Logger): Logger for logging information.
113
+
114
+ Returns:
115
+ torch.nn.Module: Loaded UNet model.
116
+ """
117
+ unet = define_instance(args, "diffusion_unet_def").to(device)
118
+ unet = torch.nn.SyncBatchNorm.convert_sync_batchnorm(unet)
119
+
120
+ if dist.is_initialized():
121
+ unet = DistributedDataParallel(unet, device_ids=[device], find_unused_parameters=True)
122
+
123
+ if args.existing_ckpt_filepath is None:
124
+ logger.info("Training from scratch.")
125
+ else:
126
+ checkpoint_unet = torch.load(f"{args.existing_ckpt_filepath}", map_location=device, weights_only=False)
127
+ if dist.is_initialized():
128
+ unet.module.load_state_dict(checkpoint_unet["unet_state_dict"], strict=True)
129
+ else:
130
+ unet.load_state_dict(checkpoint_unet["unet_state_dict"], strict=True)
131
+ logger.info(f"Pretrained checkpoint {args.existing_ckpt_filepath} loaded.")
132
+
133
+ return unet
134
+
135
+
136
def calculate_scale_factor(train_loader: DataLoader, device: torch.device, logger: logging.Logger) -> torch.Tensor:
    """
    Calculate the scaling factor for the dataset.

    The factor is the reciprocal of the latent standard deviation of the first batch,
    averaged across ranks when running distributed.

    Args:
        train_loader (DataLoader): Data loader for training.
        device (torch.device): Device to use for calculation.
        logger (logging.Logger): Logger for logging information.

    Returns:
        torch.Tensor: Calculated scaling factor.
    """
    sample_batch = first(train_loader)
    latent = sample_batch["image"].to(device)
    scale_factor = 1 / torch.std(latent)
    logger.info(f"Scaling factor set to {scale_factor}.")

    if dist.is_initialized():
        dist.barrier()
        # average the per-rank estimates so all ranks share one factor
        dist.all_reduce(scale_factor, op=torch.distributed.ReduceOp.AVG)
    logger.info(f"scale_factor -> {scale_factor}.")
    return scale_factor
158
+
159
+
160
def create_optimizer(model: torch.nn.Module, lr: float) -> torch.optim.Optimizer:
    """
    Create optimizer for training.

    Args:
        model (torch.nn.Module): Model to optimize.
        lr (float): Learning rate.

    Returns:
        torch.optim.Optimizer: Adam optimizer over all of ``model``'s parameters.
    """
    trainable_params = model.parameters()
    return torch.optim.Adam(params=trainable_params, lr=lr)
172
+
173
+
174
def create_lr_scheduler(optimizer: torch.optim.Optimizer, total_steps: int) -> torch.optim.lr_scheduler.PolynomialLR:
    """
    Create learning rate scheduler.

    Args:
        optimizer (torch.optim.Optimizer): Optimizer to schedule.
        total_steps (int): Total number of training steps.

    Returns:
        torch.optim.lr_scheduler.PolynomialLR: Polynomial decay scheduler (power 2.0).
    """
    scheduler = torch.optim.lr_scheduler.PolynomialLR(optimizer, total_iters=total_steps, power=2.0)
    return scheduler
186
+
187
+
188
def train_one_epoch(
    epoch: int,
    unet: torch.nn.Module,
    train_loader: DataLoader,
    optimizer: torch.optim.Optimizer,
    lr_scheduler: torch.optim.lr_scheduler.PolynomialLR,
    loss_pt: torch.nn.L1Loss,
    scaler: GradScaler,
    scale_factor: torch.Tensor,
    noise_scheduler: torch.nn.Module,
    num_images_per_batch: int,
    num_train_timesteps: int,
    device: torch.device,
    logger: logging.Logger,
    local_rank: int,
    amp: bool = True,
) -> torch.Tensor:
    """
    Train the model for one epoch.

    Args:
        epoch (int): Current epoch number.
        unet (torch.nn.Module): UNet model.
        train_loader (DataLoader): Data loader for training.
        optimizer (torch.optim.Optimizer): Optimizer.
        lr_scheduler (torch.optim.lr_scheduler.PolynomialLR): Learning rate scheduler.
        loss_pt (torch.nn.L1Loss): Loss function.
        scaler (GradScaler): Gradient scaler for mixed precision training.
        scale_factor (torch.Tensor): Scaling factor.
        noise_scheduler (torch.nn.Module): Noise scheduler.
        num_images_per_batch (int): Number of images per batch.
        num_train_timesteps (int): Number of training timesteps.
        device (torch.device): Device to use for training.
        logger (logging.Logger): Logger for logging information.
        local_rank (int): Local rank for distributed training.
        amp (bool): Use automatic mixed precision training.

    Returns:
        torch.Tensor: Two-element tensor: [0] summed loss over iterations, [1] iteration count
        (summed across ranks when distributed).
    """
    # NOTE(review): `num_images_per_batch` is never used in this function body — confirm whether
    # it can be dropped from the signature or was meant to drive the modality tensor size.
    include_body_region = unet.include_top_region_index_input
    include_modality = unet.num_class_embeds is not None

    if local_rank == 0:
        current_lr = optimizer.param_groups[0]["lr"]
        logger.info(f"Epoch {epoch + 1}, lr {current_lr}.")

    _iter = 0
    # loss_torch[0] accumulates the loss, loss_torch[1] counts iterations
    loss_torch = torch.zeros(2, dtype=torch.float, device=device)

    unet.train()
    for train_data in train_loader:
        current_lr = optimizer.param_groups[0]["lr"]

        _iter += 1
        images = train_data["image"].to(device)
        # latents are pre-scaled so their std is ~1 (see calculate_scale_factor)
        images = images * scale_factor

        if include_body_region:
            top_region_index_tensor = train_data["top_region_index"].to(device)
            bottom_region_index_tensor = train_data["bottom_region_index"].to(device)
        # We trained with only CT in this version
        if include_modality:
            modality_tensor = torch.ones((len(images),), dtype=torch.long).to(device)
        spacing_tensor = train_data["spacing"].to(device)

        optimizer.zero_grad(set_to_none=True)

        with autocast("cuda", enabled=amp):
            noise = torch.randn_like(images)

            # RFlowScheduler samples its own (possibly non-uniform) timestep distribution
            if isinstance(noise_scheduler, RFlowScheduler):
                timesteps = noise_scheduler.sample_timesteps(images)
            else:
                timesteps = torch.randint(0, num_train_timesteps, (images.shape[0],), device=images.device).long()

            noisy_latent = noise_scheduler.add_noise(original_samples=images, noise=noise, timesteps=timesteps)

            # Create a dictionary to store the inputs
            unet_inputs = {
                "x": noisy_latent,
                "timesteps": timesteps,
                "spacing_tensor": spacing_tensor,
            }
            # Add extra arguments if include_body_region is True
            if include_body_region:
                unet_inputs.update(
                    {
                        "top_region_index_tensor": top_region_index_tensor,
                        "bottom_region_index_tensor": bottom_region_index_tensor,
                    }
                )
            if include_modality:
                unet_inputs.update(
                    {
                        "class_labels": modality_tensor,
                    }
                )
            model_output = unet(**unet_inputs)

            # select the regression target according to the scheduler's prediction type
            if noise_scheduler.prediction_type == DDPMPredictionType.EPSILON:
                # predict noise
                model_gt = noise
            elif noise_scheduler.prediction_type == DDPMPredictionType.SAMPLE:
                # predict sample
                model_gt = images
            elif noise_scheduler.prediction_type == DDPMPredictionType.V_PREDICTION:
                # predict velocity
                model_gt = images - noise
            else:
                raise ValueError(
                    "noise scheduler prediction type has to be chosen from ",
                    f"[{DDPMPredictionType.EPSILON},{DDPMPredictionType.SAMPLE},{DDPMPredictionType.V_PREDICTION}]",
                )

            loss = loss_pt(model_output.float(), model_gt.float())

        if amp:
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            optimizer.step()

        # scheduler steps once per iteration, not per epoch
        lr_scheduler.step()

        loss_torch[0] += loss.item()
        loss_torch[1] += 1.0

        if local_rank == 0:
            logger.info(
                "[{0}] epoch {1}, iter {2}/{3}, loss: {4:.4f}, lr: {5:.12f}.".format(
                    str(datetime.now())[:19], epoch + 1, _iter, len(train_loader), loss.item(), current_lr
                )
            )

    if dist.is_initialized():
        dist.all_reduce(loss_torch, op=torch.distributed.ReduceOp.SUM)

    return loss_torch
329
+
330
+
331
def save_checkpoint(
    epoch: int,
    unet: torch.nn.Module,
    loss_torch_epoch: float,
    num_train_timesteps: int,
    scale_factor: torch.Tensor,
    ckpt_folder: str,
    args: argparse.Namespace,
) -> None:
    """
    Save checkpoint.

    Args:
        epoch (int): Current epoch number.
        unet (torch.nn.Module): UNet model.
        loss_torch_epoch (float): Training loss for the epoch.
        num_train_timesteps (int): Number of training timesteps.
        scale_factor (torch.Tensor): Scaling factor.
        ckpt_folder (str): Checkpoint folder path.
        args (argparse.Namespace): Configuration arguments.
    """
    # unwrap the DDP container so the saved state_dict has no "module." prefixes
    if dist.is_initialized():
        unet_state_dict = unet.module.state_dict()
    else:
        unet_state_dict = unet.state_dict()

    payload = {
        "epoch": epoch + 1,
        "loss": loss_torch_epoch,
        "num_train_timesteps": num_train_timesteps,
        "scale_factor": scale_factor,
        "unet_state_dict": unet_state_dict,
    }
    torch.save(payload, f"{ckpt_folder}/{args.model_filename}")
363
+
364
+
365
def diff_model_train(
    env_config_path: str, model_config_path: str, model_def_path: str, num_gpus: int, amp: bool = True
) -> None:
    """
    Main function to train a diffusion model.

    Args:
        env_config_path (str): Path to the environment configuration file.
        model_config_path (str): Path to the model configuration file.
        model_def_path (str): Path to the model definition file.
        num_gpus (int): Number of GPUs to use for training.
        amp (bool): Use automatic mixed precision training.
    """
    args = load_config(env_config_path, model_config_path, model_def_path)
    local_rank, world_size, device = initialize_distributed(num_gpus)
    logger = setup_logging("training")

    logger.info(f"Using {device} of {world_size}")

    # rank 0 logs the effective configuration once
    if local_rank == 0:
        logger.info(f"[config] ckpt_folder -> {args.model_dir}.")
        logger.info(f"[config] data_root -> {args.embedding_base_dir}.")
        logger.info(f"[config] data_list -> {args.json_data_list}.")
        logger.info(f"[config] lr -> {args.diffusion_unet_train['lr']}.")
        logger.info(f"[config] num_epochs -> {args.diffusion_unet_train['n_epochs']}.")
        logger.info(f"[config] num_train_timesteps -> {args.noise_scheduler['num_train_timesteps']}.")

    Path(args.model_dir).mkdir(parents=True, exist_ok=True)

    unet = load_unet(args, device, logger)
    noise_scheduler = define_instance(args, "noise_scheduler")
    include_body_region = unet.include_top_region_index_input

    filenames_train = load_filenames(args.json_data_list)
    if local_rank == 0:
        logger.info(f"num_files_train: {len(filenames_train)}")

    # build the data dicts; embeddings missing on disk are silently skipped
    train_files = []
    for _i in range(len(filenames_train)):
        str_img = os.path.join(args.embedding_base_dir, filenames_train[_i])
        if not os.path.exists(str_img):
            continue

        # the sidecar .json next to each embedding carries spacing (and region indices)
        str_info = os.path.join(args.embedding_base_dir, filenames_train[_i]) + ".json"
        train_files_i = {"image": str_img, "spacing": str_info}
        if include_body_region:
            train_files_i["top_region_index"] = str_info
            train_files_i["bottom_region_index"] = str_info
        train_files.append(train_files_i)
    if dist.is_initialized():
        # shard the file list across ranks; even_divisible may duplicate a few items
        train_files = partition_dataset(
            data=train_files, shuffle=True, num_partitions=dist.get_world_size(), even_divisible=True
        )[local_rank]

    train_loader = prepare_data(
        train_files,
        device,
        args.diffusion_unet_train["cache_rate"],
        batch_size=args.diffusion_unet_train["batch_size"],
        include_body_region=include_body_region,
    )

    scale_factor = calculate_scale_factor(train_loader, device, logger)
    optimizer = create_optimizer(unet, args.diffusion_unet_train["lr"])

    # total optimizer steps over all epochs; NOTE(review): this is a float and the loader is
    # already batched, so this approximates n_epochs * iters_per_epoch — confirm intended.
    total_steps = (args.diffusion_unet_train["n_epochs"] * len(train_loader.dataset)) / args.diffusion_unet_train[
        "batch_size"
    ]
    lr_scheduler = create_lr_scheduler(optimizer, total_steps)
    loss_pt = torch.nn.L1Loss()
    scaler = GradScaler("cuda")

    torch.set_float32_matmul_precision("highest")
    logger.info("torch.set_float32_matmul_precision -> highest.")

    for epoch in range(args.diffusion_unet_train["n_epochs"]):
        loss_torch = train_one_epoch(
            epoch,
            unet,
            train_loader,
            optimizer,
            lr_scheduler,
            loss_pt,
            scaler,
            scale_factor,
            noise_scheduler,
            args.diffusion_unet_train["batch_size"],
            args.noise_scheduler["num_train_timesteps"],
            device,
            logger,
            local_rank,
            amp=amp,
        )

        loss_torch = loss_torch.tolist()
        # only rank 0 (or single-GPU) logs and writes the checkpoint
        if torch.cuda.device_count() == 1 or local_rank == 0:
            loss_torch_epoch = loss_torch[0] / loss_torch[1]
            logger.info(f"epoch {epoch + 1} average loss: {loss_torch_epoch:.4f}.")

            save_checkpoint(
                epoch,
                unet,
                loss_torch_epoch,
                args.noise_scheduler["num_train_timesteps"],
                scale_factor,
                args.model_dir,
                args,
            )

    if dist.is_initialized():
        dist.destroy_process_group()
476
+
477
+
478
if __name__ == "__main__":
    # CLI entry point: parse config paths / GPU count and launch training.
    parser = argparse.ArgumentParser(description="Diffusion Model Training")
    parser.add_argument(
        "--env_config",
        type=str,
        default="./configs/environment_maisi_diff_model.json",
        help="Path to environment configuration file",
    )
    parser.add_argument(
        "--model_config",
        type=str,
        default="./configs/config_maisi_diff_model.json",
        help="Path to model training/inference configuration",
    )
    parser.add_argument(
        "--model_def", type=str, default="./configs/config_maisi.json", help="Path to model definition file"
    )
    parser.add_argument("--num_gpus", type=int, default=1, help="Number of GPUs to use for training")
    # store_false: AMP is on by default, --no_amp disables it
    parser.add_argument("--no_amp", dest="amp", action="store_false", help="Disable automatic mixed precision training")

    args = parser.parse_args()
    diff_model_train(args.env_config, args.model_config, args.model_def, args.num_gpus, args.amp)
scripts/find_masks.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) MONAI Consortium
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an "AS IS" BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+
12
+
13
+ import json
14
+ import os
15
+ from typing import Sequence
16
+
17
+ from monai.apps.utils import extractall
18
+ from monai.utils import ensure_tuple_rep
19
+
20
+
21
+ def convert_body_region(body_region: str | Sequence[str]) -> Sequence[int]:
22
+ """
23
+ Convert body region string to body region index.
24
+ Args:
25
+ body_region: list of input body region string. If single str, will be converted to list of str.
26
+ Return:
27
+ body_region_indices, list of input body region index.
28
+ """
29
+ if type(body_region) is str:
30
+ body_region = [body_region]
31
+
32
+ # body region mapping for maisi
33
+ region_mapping_maisi = {
34
+ "head": 0,
35
+ "chest": 1,
36
+ "thorax": 1,
37
+ "chest/thorax": 1,
38
+ "abdomen": 2,
39
+ "pelvis": 3,
40
+ "lower": 3,
41
+ "pelvis/lower": 3,
42
+ }
43
+
44
+ # perform mapping
45
+ body_region_indices = []
46
+ for region in body_region:
47
+ normalized_region = region.lower() # norm str to lower case
48
+ if normalized_region not in region_mapping_maisi:
49
+ raise ValueError(f"Invalid region: {normalized_region}")
50
+ body_region_indices.append(region_mapping_maisi[normalized_region])
51
+
52
+ return body_region_indices
53
+
54
+
55
def find_masks(
    body_region: str | Sequence[str],
    anatomy_list: int | Sequence[int],
    spacing: Sequence[float] | float = 1.0,
    output_size: Sequence[int] = (512, 512, 512),
    check_spacing_and_output_size: bool = False,
    database_filepath: str = "./configs/database.json",
    mask_foldername: str = "./datasets/masks/",
):
    """
    Find candidate masks that fulfill all the requirements.

    They should contain all the body regions in `body_region` and all the anatomies in `anatomy_list`.
    If there is no tumor specified in `anatomy_list`, we also expect the candidate masks to be tumor free.
    If check_spacing_and_output_size is True, the candidate masks need to have the expected `spacing` and `output_size`.

    Args:
        body_region: list of input body region strings. If a single str, will be converted to a list of str.
            The found candidate masks will include these body regions.
        anatomy_list: list of input anatomy labels. The found candidate masks will include these anatomies.
        spacing: list of three floats, voxel spacing. If providing a single number, it is used for all three dimensions.
        output_size: sequence of three ints, expected candidate mask spatial size.
            Default is an immutable tuple to avoid the shared-mutable-default pitfall.
        check_spacing_and_output_size: whether we expect candidate masks to have spatial size `output_size`
            and voxel size `spacing`.
        database_filepath: path for the json file that stores the information of all the candidate masks.
        mask_foldername: directory that saves all the candidate masks.

    Returns:
        candidate_masks, list of dicts; each dict contains the information of one candidate mask
        that fulfills all the requirements.

    Raises:
        ValueError: if the mask zip/database files are missing, or if no candidate is found
            (only when ``check_spacing_and_output_size`` is False).
    """
    # check and preprocess input
    body_region = convert_body_region(body_region)

    if isinstance(anatomy_list, int):
        anatomy_list = [anatomy_list]

    spacing = ensure_tuple_rep(spacing, 3)

    # lazily extract the mask archive on first use
    if not os.path.exists(mask_foldername):
        zip_file_path = mask_foldername + ".zip"

        if not os.path.isfile(zip_file_path):
            raise ValueError(f"Please download {zip_file_path} following the instruction in ./datasets/README.md.")

        print(f"Extracting {zip_file_path} to {os.path.dirname(zip_file_path)}")
        extractall(filepath=zip_file_path, output_dir=os.path.dirname(zip_file_path), file_type="zip")
        print(f"Unzipped {zip_file_path} to {mask_foldername}.")

    if not os.path.isfile(database_filepath):
        raise ValueError(f"Please download {database_filepath} following the instruction in ./datasets/README.md.")
    with open(database_filepath, "r") as f:
        db = json.load(f)

    # select candidate_masks
    candidate_masks = []
    for _item in db:
        # candidate must contain every requested anatomy label
        if not set(anatomy_list).issubset(_item["label_list"]):
            continue

        # whether to keep this mask, default to be True.
        keep_mask = True

        # extract region indices (top_index and bottom_index) for candidate mask
        include_body_region = "top_region_index" in _item.keys()
        if include_body_region:
            # region index vectors are one-hot-ish; the nonzero position encodes the region
            top_index = [index for index, element in enumerate(_item["top_region_index"]) if element != 0]
            top_index = top_index[0]
            bottom_index = [index for index, element in enumerate(_item["bottom_region_index"]) if element != 0]
            bottom_index = bottom_index[0]

            # if candidate mask does not contain all the body_region, skip it
            for _idx in body_region:
                if _idx > bottom_index or _idx < top_index:
                    keep_mask = False

        for tumor_label in [23, 24, 26, 27, 128]:
            # we skip those masks with tumors if users do not provide a tumor label in anatomy_list
            if tumor_label not in anatomy_list and tumor_label in _item["label_list"]:
                keep_mask = False

        if check_spacing_and_output_size:
            # if the output_size and spacing are different from the user's input, skip it
            for axis in range(3):
                if _item["dim"][axis] != output_size[axis] or _item["spacing"][axis] != spacing[axis]:
                    keep_mask = False

        if keep_mask:
            # if we decide to keep this mask, pack its information and add it to the final output.
            candidate = {
                "pseudo_label": os.path.join(mask_foldername, _item["pseudo_label_filename"]),
                "spacing": _item["spacing"],
                "dim": _item["dim"],
            }
            if include_body_region:
                candidate["top_region_index"] = _item["top_region_index"]
                candidate["bottom_region_index"] = _item["bottom_region_index"]

            # Conditionally add the label to the candidate dictionary
            if "label_filename" in _item:
                candidate["label"] = os.path.join(mask_foldername, _item["label_filename"])

            candidate_masks.append(candidate)

    if len(candidate_masks) == 0 and not check_spacing_and_output_size:
        raise ValueError("Cannot find body region with given anatomy list.")

    return candidate_masks
scripts/infer_controlnet.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) MONAI Consortium
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an "AS IS" BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+
12
+ import argparse
13
+ import json
14
+ import logging
15
+ import os
16
+ import sys
17
+ from datetime import datetime
18
+
19
+ import torch
20
+ import torch.distributed as dist
21
+ from monai.data import MetaTensor, decollate_batch
22
+ from monai.networks.utils import copy_model_state
23
+ from monai.transforms import SaveImage
24
+ from monai.utils import RankFilter
25
+
26
+ from .sample import check_input, ldm_conditional_sample_one_image
27
+ from .utils import define_instance, prepare_maisi_controlnet_json_dataloader, setup_ddp
28
+
29
+
30
+ @torch.inference_mode()
31
+ def main():
32
+ parser = argparse.ArgumentParser(description="maisi.controlnet.infer")
33
+ parser.add_argument(
34
+ "-e",
35
+ "--environment-file",
36
+ default="./configs/environment_maisi_controlnet_train.json",
37
+ help="environment json file that stores environment path",
38
+ )
39
+ parser.add_argument(
40
+ "-c",
41
+ "--config-file",
42
+ default="./configs/config_maisi.json",
43
+ help="config json file that stores network hyper-parameters",
44
+ )
45
+ parser.add_argument(
46
+ "-t",
47
+ "--training-config",
48
+ default="./configs/config_maisi_controlnet_train.json",
49
+ help="config json file that stores training hyper-parameters",
50
+ )
51
+ parser.add_argument("-g", "--gpus", default=1, type=int, help="number of gpus per node")
52
+
53
+ args = parser.parse_args()
54
+
55
+ # Step 0: configuration
56
+ logger = logging.getLogger("maisi.controlnet.infer")
57
+ # whether to use distributed data parallel
58
+ use_ddp = args.gpus > 1
59
+ if use_ddp:
60
+ rank = int(os.environ["LOCAL_RANK"])
61
+ world_size = int(os.environ["WORLD_SIZE"])
62
+ device = setup_ddp(rank, world_size)
63
+ logger.addFilter(RankFilter())
64
+ else:
65
+ rank = 0
66
+ world_size = 1
67
+ device = torch.device(f"cuda:{rank}")
68
+
69
+ torch.cuda.set_device(device)
70
+ logger.info(f"Number of GPUs: {torch.cuda.device_count()}")
71
+ logger.info(f"World_size: {world_size}")
72
+
73
+ with open(args.environment_file, "r") as env_file:
74
+ env_dict = json.load(env_file)
75
+ with open(args.config_file, "r") as config_file:
76
+ config_dict = json.load(config_file)
77
+ with open(args.training_config, "r") as training_config_file:
78
+ training_config_dict = json.load(training_config_file)
79
+
80
+ for k, v in env_dict.items():
81
+ setattr(args, k, v)
82
+ for k, v in config_dict.items():
83
+ setattr(args, k, v)
84
+ for k, v in training_config_dict.items():
85
+ setattr(args, k, v)
86
+
87
+ # Step 1: set data loader
88
+ _, val_loader = prepare_maisi_controlnet_json_dataloader(
89
+ json_data_list=args.json_data_list,
90
+ data_base_dir=args.data_base_dir,
91
+ rank=rank,
92
+ world_size=world_size,
93
+ batch_size=args.controlnet_train["batch_size"],
94
+ cache_rate=args.controlnet_train["cache_rate"],
95
+ fold=args.controlnet_train["fold"],
96
+ )
97
+
98
+ # Step 2: define AE, diffusion model and controlnet
99
+ # define AE
100
+ autoencoder = define_instance(args, "autoencoder_def").to(device)
101
+ # load trained autoencoder model
102
+ if args.trained_autoencoder_path is not None:
103
+ if not os.path.exists(args.trained_autoencoder_path):
104
+ raise ValueError("Please download the autoencoder checkpoint.")
105
+ autoencoder_ckpt = torch.load(args.trained_autoencoder_path, weights_only=True)
106
+ autoencoder.load_state_dict(autoencoder_ckpt)
107
+ logger.info(f"Load trained diffusion model from {args.trained_autoencoder_path}.")
108
+ else:
109
+ logger.info("trained autoencoder model is not loaded.")
110
+
111
+ # define diffusion Model
112
+ unet = define_instance(args, "diffusion_unet_def").to(device)
113
+ include_body_region = unet.include_top_region_index_input
114
+ include_modality = unet.num_class_embeds is not None
115
+
116
+ # load trained diffusion model
117
+ if args.trained_diffusion_path is not None:
118
+ if not os.path.exists(args.trained_diffusion_path):
119
+ raise ValueError("Please download the trained diffusion unet checkpoint.")
120
+ diffusion_model_ckpt = torch.load(args.trained_diffusion_path, map_location=device, weights_only=False)
121
+ unet.load_state_dict(diffusion_model_ckpt["unet_state_dict"])
122
+ # load scale factor from diffusion model checkpoint
123
+ scale_factor = diffusion_model_ckpt["scale_factor"]
124
+ logger.info(f"Load trained diffusion model from {args.trained_diffusion_path}.")
125
+ logger.info(f"loaded scale_factor from diffusion model ckpt -> {scale_factor}.")
126
+ else:
127
+ logger.info("trained diffusion model is not loaded.")
128
+ scale_factor = 1.0
129
+ logger.info(f"set scale_factor -> {scale_factor}.")
130
+
131
+ # define ControlNet
132
+ controlnet = define_instance(args, "controlnet_def").to(device)
133
+ # copy weights from the DM to the controlnet
134
+ copy_model_state(controlnet, unet.state_dict())
135
+ # load trained controlnet model if it is provided
136
+ if args.trained_controlnet_path is not None:
137
+ if not os.path.exists(args.trained_controlnet_path):
138
+ raise ValueError("Please download the trained ControlNet checkpoint.")
139
+ controlnet.load_state_dict(
140
+ torch.load(args.trained_controlnet_path, map_location=device, weights_only=False)["controlnet_state_dict"]
141
+ )
142
+ logger.info(f"load trained controlnet model from {args.trained_controlnet_path}")
143
+ else:
144
+ logger.info("trained controlnet is not loaded.")
145
+
146
+ noise_scheduler = define_instance(args, "noise_scheduler")
147
+
148
+ # Step 3: inference
149
+ autoencoder.eval()
150
+ controlnet.eval()
151
+ unet.eval()
152
+
153
+ for batch in val_loader:
154
+
155
+ # get label mask
156
+ labels = batch["label"].to(device)
157
+ # get corresponding conditions
158
+ if include_body_region:
159
+ top_region_index_tensor = batch["top_region_index"].to(device)
160
+ bottom_region_index_tensor = batch["bottom_region_index"].to(device)
161
+ else:
162
+ top_region_index_tensor = None
163
+ bottom_region_index_tensor = None
164
+ spacing_tensor = batch["spacing"].to(device)
165
+ modality_tensor = args.controlnet_infer["modality"] * torch.ones((len(labels),), dtype=torch.long).to(device)
166
+ out_spacing = tuple((batch["spacing"].squeeze().numpy() / 100).tolist())
167
+ # get target dimension
168
+ dim = batch["dim"]
169
+ output_size = (dim[0].item(), dim[1].item(), dim[2].item())
170
+ latent_shape = (args.latent_channels, output_size[0] // 4, output_size[1] // 4, output_size[2] // 4)
171
+ # check if output_size and out_spacing are valid.
172
+ check_input(None, None, None, output_size, out_spacing, None)
173
+ # generate a single synthetic image using a latent diffusion model with controlnet.
174
+ synthetic_images, _ = ldm_conditional_sample_one_image(
175
+ autoencoder=autoencoder,
176
+ diffusion_unet=unet,
177
+ controlnet=controlnet,
178
+ noise_scheduler=noise_scheduler,
179
+ scale_factor=scale_factor,
180
+ device=device,
181
+ combine_label_or=labels,
182
+ top_region_index_tensor=top_region_index_tensor,
183
+ bottom_region_index_tensor=bottom_region_index_tensor,
184
+ spacing_tensor=spacing_tensor,
185
+ modality_tensor=modality_tensor,
186
+ latent_shape=latent_shape,
187
+ output_size=output_size,
188
+ noise_factor=1.0,
189
+ num_inference_steps=args.controlnet_infer["num_inference_steps"],
190
+ autoencoder_sliding_window_infer_size=args.controlnet_infer["autoencoder_sliding_window_infer_size"],
191
+ autoencoder_sliding_window_infer_overlap=args.controlnet_infer["autoencoder_sliding_window_infer_overlap"],
192
+ )
193
+ # save image/label pairs
194
+ labels = decollate_batch(batch)[0]["label"]
195
+ real_object_name = labels.meta.get("filename_or_obj", "default_name.nii.gz")
196
+ labels.meta["filename_or_obj"] = real_object_name
197
+ output_postfix = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
198
+ synthetic_images = MetaTensor(synthetic_images.squeeze(0), meta=labels.meta)
199
+ img_saver = SaveImage(
200
+ output_dir=args.output_dir,
201
+ output_postfix="image",
202
+ separate_folder=False,
203
+ )
204
+ img_saver(synthetic_images)
205
+ label_saver = SaveImage(
206
+ output_dir=args.output_dir,
207
+ output_postfix="label",
208
+ separate_folder=False,
209
+ )
210
+ label_saver(labels)
211
+ if use_ddp:
212
+ dist.destroy_process_group()
213
+
214
+
215
if __name__ == "__main__":
    # configure root logging to stdout before running inference
    logging.basicConfig(
        stream=sys.stdout,
        level=logging.INFO,
        format="[%(asctime)s.%(msecs)03d][%(levelname)5s](%(name)s) - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    main()
scripts/infer_testV2_controlnet.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) MONAI Consortium
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an "AS IS" BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+
12
+ import argparse
13
+ import json
14
+ import logging
15
+ import os
16
+ import sys
17
+ from datetime import datetime
18
+
19
+ import torch
20
+ import torch.distributed as dist
21
+ from monai.data import MetaTensor, decollate_batch
22
+ from monai.networks.utils import copy_model_state
23
+ from monai.transforms import SaveImage
24
+ from monai.utils import RankFilter
25
+
26
+ from .sample import check_input, ldm_conditional_sample_one_image
27
+ from .utils import define_instance, prepare_maisi_controlnet_json_dataloader, setup_ddp, prepare_maisi_controlnet_test_dataloader
28
+
29
+
30
@torch.inference_mode()
def main():
    """Run ControlNet-conditioned MAISI inference over the test split.

    For each batch, a synthetic 3D image is generated from the label mask with
    the latent diffusion model + ControlNet, then the image/label pair is saved
    to ``args.output_dir``. Supports optional multi-GPU (DDP) execution.
    """
    parser = argparse.ArgumentParser(description="maisi.controlnet.infer")
    parser.add_argument(
        "-e",
        "--environment-file",
        default="./configs/environment_maisi_controlnet_train.json",
        help="environment json file that stores environment path",
    )
    parser.add_argument(
        "-c",
        "--config-file",
        default="./configs/config_maisi.json",
        help="config json file that stores network hyper-parameters",
    )
    parser.add_argument(
        "-t",
        "--training-config",
        default="./configs/config_maisi_controlnet_train.json",
        help="config json file that stores training hyper-parameters",
    )
    parser.add_argument("-g", "--gpus", default=1, type=int, help="number of gpus per node")
    args = parser.parse_args()

    # Step 0: configuration
    logger = logging.getLogger("maisi.controlnet.infer")
    # whether to use distributed data parallel
    use_ddp = args.gpus > 1
    if use_ddp:
        rank = int(os.environ["LOCAL_RANK"])
        world_size = int(os.environ["WORLD_SIZE"])
        device = setup_ddp(rank, world_size)
        logger.addFilter(RankFilter())
    else:
        rank = 0
        world_size = 1
        device = torch.device(f"cuda:{rank}")

    torch.cuda.set_device(device)
    logger.info(f"Number of GPUs: {torch.cuda.device_count()}")
    logger.info(f"World_size: {world_size}")

    # Merge environment / network / training configs into `args`
    # (later files overwrite earlier keys, matching the original precedence).
    for config_path in (args.environment_file, args.config_file, args.training_config):
        with open(config_path, "r") as config_file:
            for k, v in json.load(config_file).items():
                setattr(args, k, v)

    # Step 1: set data loader (test split)
    val_loader = prepare_maisi_controlnet_test_dataloader(
        json_data_list=args.json_data_list,
        data_base_dir=args.data_base_dir,
        batch_size=args.controlnet_train["batch_size"],
        cache_rate=args.controlnet_train["cache_rate"],
        rank=rank,
        world_size=world_size,
    )

    # Step 2: define AE, diffusion model and controlnet
    # define AE
    autoencoder = define_instance(args, "autoencoder_def").to(device)
    # load trained autoencoder model
    if args.trained_autoencoder_path is not None:
        if not os.path.exists(args.trained_autoencoder_path):
            raise ValueError("Please download the autoencoder checkpoint.")
        autoencoder_ckpt = torch.load(args.trained_autoencoder_path, weights_only=True)
        autoencoder.load_state_dict(autoencoder_ckpt)
        # fixed: message previously claimed the "diffusion model" was loaded here
        logger.info(f"Load trained autoencoder model from {args.trained_autoencoder_path}.")
    else:
        logger.info("trained autoencoder model is not loaded.")

    # define diffusion Model
    unet = define_instance(args, "diffusion_unet_def").to(device)
    # whether the UNet expects body-region index conditioning inputs
    include_body_region = unet.include_top_region_index_input

    # load trained diffusion model
    if args.trained_diffusion_path is not None:
        if not os.path.exists(args.trained_diffusion_path):
            raise ValueError("Please download the trained diffusion unet checkpoint.")
        diffusion_model_ckpt = torch.load(args.trained_diffusion_path, map_location=device, weights_only=False)
        unet.load_state_dict(diffusion_model_ckpt["unet_state_dict"])
        # the latent scale factor is stored alongside the diffusion weights
        scale_factor = diffusion_model_ckpt["scale_factor"]
        logger.info(f"Load trained diffusion model from {args.trained_diffusion_path}.")
        logger.info(f"loaded scale_factor from diffusion model ckpt -> {scale_factor}.")
    else:
        logger.info("trained diffusion model is not loaded.")
        scale_factor = 1.0
        logger.info(f"set scale_factor -> {scale_factor}.")

    # define ControlNet
    controlnet = define_instance(args, "controlnet_def").to(device)
    # initialize ControlNet from the diffusion UNet weights before loading its own checkpoint
    copy_model_state(controlnet, unet.state_dict())
    if args.trained_controlnet_path is not None:
        if not os.path.exists(args.trained_controlnet_path):
            raise ValueError("Please download the trained ControlNet checkpoint.")
        controlnet.load_state_dict(
            torch.load(args.trained_controlnet_path, map_location=device, weights_only=False)["controlnet_state_dict"]
        )
        logger.info(f"load trained controlnet model from {args.trained_controlnet_path}")
    else:
        logger.info("trained controlnet is not loaded.")

    noise_scheduler = define_instance(args, "noise_scheduler")

    # Step 3: inference
    autoencoder.eval()
    controlnet.eval()
    unet.eval()

    for batch in val_loader:
        # get label mask
        labels = batch["label"].to(device)
        # get corresponding conditions
        if include_body_region:
            top_region_index_tensor = batch["top_region_index"].to(device)
            bottom_region_index_tensor = batch["bottom_region_index"].to(device)
        else:
            top_region_index_tensor = None
            bottom_region_index_tensor = None
        spacing_tensor = batch["spacing"].to(device)
        modality_tensor = args.controlnet_infer["modality"] * torch.ones((len(labels),), dtype=torch.long).to(device)
        # NOTE(review): spacing appears to be stored scaled by 100 in the dataloader output — confirm upstream
        out_spacing = tuple((batch["spacing"].squeeze().numpy() / 100).tolist())
        # get target dimension
        dim = batch["dim"]
        output_size = (dim[0].item(), dim[1].item(), dim[2].item())
        # latent resolution is 1/4 of the image resolution per axis
        latent_shape = (args.latent_channels, output_size[0] // 4, output_size[1] // 4, output_size[2] // 4)
        # check if output_size and out_spacing are valid.
        check_input(None, None, None, output_size, out_spacing, None)
        # generate a single synthetic image using a latent diffusion model with controlnet.
        synthetic_images, _ = ldm_conditional_sample_one_image(
            autoencoder=autoencoder,
            diffusion_unet=unet,
            controlnet=controlnet,
            noise_scheduler=noise_scheduler,
            scale_factor=scale_factor,
            device=device,
            combine_label_or=labels,
            top_region_index_tensor=top_region_index_tensor,
            bottom_region_index_tensor=bottom_region_index_tensor,
            spacing_tensor=spacing_tensor,
            modality_tensor=modality_tensor,
            latent_shape=latent_shape,
            output_size=output_size,
            noise_factor=1.0,
            num_inference_steps=args.controlnet_infer["num_inference_steps"],
            autoencoder_sliding_window_infer_size=args.controlnet_infer["autoencoder_sliding_window_infer_size"],
            autoencoder_sliding_window_infer_overlap=args.controlnet_infer["autoencoder_sliding_window_infer_overlap"],
        )
        # save image/label pairs; reuse the label metadata so filename/affine carry over
        labels = decollate_batch(batch)[0]["label"]
        labels.meta.setdefault("filename_or_obj", "default_name.nii.gz")
        synthetic_images = MetaTensor(synthetic_images.squeeze(0), meta=labels.meta)
        img_saver = SaveImage(
            output_dir=args.output_dir,
            output_postfix="image",
            separate_folder=False,
        )
        img_saver(synthetic_images)
        label_saver = SaveImage(
            output_dir=args.output_dir,
            output_postfix="label",
            separate_folder=False,
        )
        label_saver(labels)
    if use_ddp:
        dist.destroy_process_group()


if __name__ == "__main__":
    logging.basicConfig(
        stream=sys.stdout,
        level=logging.INFO,
        format="[%(asctime)s.%(msecs)03d][%(levelname)5s](%(name)s) - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    main()
scripts/infer_test_controlnet.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) MONAI Consortium
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an "AS IS" BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+
12
+ import argparse
13
+ import json
14
+ import logging
15
+ import os
16
+ import sys
17
+ from datetime import datetime
18
+
19
+ import torch
20
+ import torch.distributed as dist
21
+ from monai.data import MetaTensor, decollate_batch
22
+ from monai.networks.utils import copy_model_state
23
+ from monai.transforms import SaveImage
24
+ from monai.utils import RankFilter
25
+
26
+ from .sample import check_input, ldm_conditional_sample_one_image
27
+ from .utils import define_instance, prepare_maisi_controlnet_json_dataloader, setup_ddp, prepare_maisi_controlnet_infer_dataloader
28
+
29
+
30
@torch.inference_mode()
def main():
    """Run ControlNet-conditioned MAISI inference over the inference split.

    For each batch, a synthetic 3D image is generated from the label mask with
    the latent diffusion model + ControlNet, then the image/label pair is saved
    to ``args.output_dir``. Supports optional multi-GPU (DDP) execution.
    """
    parser = argparse.ArgumentParser(description="maisi.controlnet.infer")
    parser.add_argument(
        "-e",
        "--environment-file",
        default="./configs/environment_maisi_controlnet_train.json",
        help="environment json file that stores environment path",
    )
    parser.add_argument(
        "-c",
        "--config-file",
        default="./configs/config_maisi.json",
        help="config json file that stores network hyper-parameters",
    )
    parser.add_argument(
        "-t",
        "--training-config",
        default="./configs/config_maisi_controlnet_train.json",
        help="config json file that stores training hyper-parameters",
    )
    parser.add_argument("-g", "--gpus", default=1, type=int, help="number of gpus per node")
    args = parser.parse_args()

    # Step 0: configuration
    logger = logging.getLogger("maisi.controlnet.infer")
    # whether to use distributed data parallel
    use_ddp = args.gpus > 1
    if use_ddp:
        rank = int(os.environ["LOCAL_RANK"])
        world_size = int(os.environ["WORLD_SIZE"])
        device = setup_ddp(rank, world_size)
        logger.addFilter(RankFilter())
    else:
        rank = 0
        world_size = 1
        device = torch.device(f"cuda:{rank}")

    torch.cuda.set_device(device)
    logger.info(f"Number of GPUs: {torch.cuda.device_count()}")
    logger.info(f"World_size: {world_size}")

    # Merge environment / network / training configs into `args`
    # (later files overwrite earlier keys, matching the original precedence).
    for config_path in (args.environment_file, args.config_file, args.training_config):
        with open(config_path, "r") as config_file:
            for k, v in json.load(config_file).items():
                setattr(args, k, v)

    # Step 1: set data loader (inference split)
    val_loader = prepare_maisi_controlnet_infer_dataloader(
        json_data_list=args.json_data_list,
        data_base_dir=args.data_base_dir,
        batch_size=args.controlnet_train["batch_size"],
        cache_rate=args.controlnet_train["cache_rate"],
        rank=rank,
        world_size=world_size,
    )

    # Step 2: define AE, diffusion model and controlnet
    # define AE
    autoencoder = define_instance(args, "autoencoder_def").to(device)
    # load trained autoencoder model
    if args.trained_autoencoder_path is not None:
        if not os.path.exists(args.trained_autoencoder_path):
            raise ValueError("Please download the autoencoder checkpoint.")
        autoencoder_ckpt = torch.load(args.trained_autoencoder_path, weights_only=True)
        autoencoder.load_state_dict(autoencoder_ckpt)
        # fixed: message previously claimed the "diffusion model" was loaded here
        logger.info(f"Load trained autoencoder model from {args.trained_autoencoder_path}.")
    else:
        logger.info("trained autoencoder model is not loaded.")

    # define diffusion Model
    unet = define_instance(args, "diffusion_unet_def").to(device)
    # whether the UNet expects body-region index conditioning inputs
    include_body_region = unet.include_top_region_index_input

    # load trained diffusion model
    if args.trained_diffusion_path is not None:
        if not os.path.exists(args.trained_diffusion_path):
            raise ValueError("Please download the trained diffusion unet checkpoint.")
        diffusion_model_ckpt = torch.load(args.trained_diffusion_path, map_location=device, weights_only=False)
        unet.load_state_dict(diffusion_model_ckpt["unet_state_dict"])
        # the latent scale factor is stored alongside the diffusion weights
        scale_factor = diffusion_model_ckpt["scale_factor"]
        logger.info(f"Load trained diffusion model from {args.trained_diffusion_path}.")
        logger.info(f"loaded scale_factor from diffusion model ckpt -> {scale_factor}.")
    else:
        logger.info("trained diffusion model is not loaded.")
        scale_factor = 1.0
        logger.info(f"set scale_factor -> {scale_factor}.")

    # define ControlNet
    controlnet = define_instance(args, "controlnet_def").to(device)
    # initialize ControlNet from the diffusion UNet weights before loading its own checkpoint
    copy_model_state(controlnet, unet.state_dict())
    if args.trained_controlnet_path is not None:
        if not os.path.exists(args.trained_controlnet_path):
            raise ValueError("Please download the trained ControlNet checkpoint.")
        controlnet.load_state_dict(
            torch.load(args.trained_controlnet_path, map_location=device, weights_only=False)["controlnet_state_dict"]
        )
        logger.info(f"load trained controlnet model from {args.trained_controlnet_path}")
    else:
        logger.info("trained controlnet is not loaded.")

    noise_scheduler = define_instance(args, "noise_scheduler")

    # Step 3: inference
    autoencoder.eval()
    controlnet.eval()
    unet.eval()

    for batch in val_loader:
        # get label mask
        labels = batch["label"].to(device)
        # get corresponding conditions
        if include_body_region:
            top_region_index_tensor = batch["top_region_index"].to(device)
            bottom_region_index_tensor = batch["bottom_region_index"].to(device)
        else:
            top_region_index_tensor = None
            bottom_region_index_tensor = None
        spacing_tensor = batch["spacing"].to(device)
        modality_tensor = args.controlnet_infer["modality"] * torch.ones((len(labels),), dtype=torch.long).to(device)
        # NOTE(review): spacing appears to be stored scaled by 100 in the dataloader output — confirm upstream
        out_spacing = tuple((batch["spacing"].squeeze().numpy() / 100).tolist())
        # get target dimension
        dim = batch["dim"]
        output_size = (dim[0].item(), dim[1].item(), dim[2].item())
        # latent resolution is 1/4 of the image resolution per axis
        latent_shape = (args.latent_channels, output_size[0] // 4, output_size[1] // 4, output_size[2] // 4)
        # check if output_size and out_spacing are valid.
        check_input(None, None, None, output_size, out_spacing, None)
        # generate a single synthetic image using a latent diffusion model with controlnet.
        synthetic_images, _ = ldm_conditional_sample_one_image(
            autoencoder=autoencoder,
            diffusion_unet=unet,
            controlnet=controlnet,
            noise_scheduler=noise_scheduler,
            scale_factor=scale_factor,
            device=device,
            combine_label_or=labels,
            top_region_index_tensor=top_region_index_tensor,
            bottom_region_index_tensor=bottom_region_index_tensor,
            spacing_tensor=spacing_tensor,
            modality_tensor=modality_tensor,
            latent_shape=latent_shape,
            output_size=output_size,
            noise_factor=1.0,
            num_inference_steps=args.controlnet_infer["num_inference_steps"],
            autoencoder_sliding_window_infer_size=args.controlnet_infer["autoencoder_sliding_window_infer_size"],
            autoencoder_sliding_window_infer_overlap=args.controlnet_infer["autoencoder_sliding_window_infer_overlap"],
        )
        # save image/label pairs; reuse the label metadata so filename/affine carry over
        labels = decollate_batch(batch)[0]["label"]
        labels.meta.setdefault("filename_or_obj", "default_name.nii.gz")
        synthetic_images = MetaTensor(synthetic_images.squeeze(0), meta=labels.meta)
        img_saver = SaveImage(
            output_dir=args.output_dir,
            output_postfix="image",
            separate_folder=False,
        )
        img_saver(synthetic_images)
        label_saver = SaveImage(
            output_dir=args.output_dir,
            output_postfix="label",
            separate_folder=False,
        )
        label_saver(labels)
    if use_ddp:
        dist.destroy_process_group()


if __name__ == "__main__":
    logging.basicConfig(
        stream=sys.stdout,
        level=logging.INFO,
        format="[%(asctime)s.%(msecs)03d][%(levelname)5s](%(name)s) - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    main()
scripts/inference.py ADDED
@@ -0,0 +1,299 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) MONAI Consortium
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # &nbsp;&nbsp;&nbsp;&nbsp;http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an "AS IS" BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ #
12
+ # # MAISI Inference Script
13
+ import argparse
14
+ import json
15
+ import logging
16
+ import os
17
+ import sys
18
+ import tempfile
19
+
20
+ import monai
21
+ import torch
22
+ from monai.apps import download_url
23
+ from monai.config import print_config
24
+ from monai.transforms import LoadImage, Orientation
25
+ from monai.utils import set_determinism
26
+
27
+ from scripts.sample import LDMSampler, check_input
28
+ from scripts.utils import define_instance
29
+ from scripts.utils_plot import find_label_center_loc, get_xyz_plot, show_image
30
+
31
+
32
+ def main():
33
+ parser = argparse.ArgumentParser(description="maisi.controlnet.training")
34
+ parser.add_argument(
35
+ "-e",
36
+ "--environment-file",
37
+ default="./configs/environment.json",
38
+ help="environment json file that stores environment path",
39
+ )
40
+ parser.add_argument(
41
+ "-c",
42
+ "--config-file",
43
+ default="./configs/config_maisi.json",
44
+ help="config json file that stores network hyper-parameters",
45
+ )
46
+ parser.add_argument(
47
+ "-i",
48
+ "--inference-file",
49
+ default="./configs/config_infer.json",
50
+ help="config json file that stores inference hyper-parameters",
51
+ )
52
+ parser.add_argument(
53
+ "-x",
54
+ "--extra-config-file",
55
+ default=None,
56
+ help="config json file that stores inference extra parameters",
57
+ )
58
+ parser.add_argument(
59
+ "-s",
60
+ "--random-seed",
61
+ default=None,
62
+ help="random seed, can be None or int",
63
+ )
64
+ parser.add_argument(
65
+ "--version",
66
+ default="maisi3d-rflow",
67
+ type=str,
68
+ help="maisi_version, choose from ['maisi3d-ddpm', 'maisi3d-rflow']",
69
+ )
70
+ args = parser.parse_args()
71
+ # Step 0: configuration
72
+ logger = logging.getLogger("maisi.inference")
73
+
74
+ maisi_version = args.version
75
+
76
+ # ## Set deterministic training for reproducibility
77
+ if args.random_seed is not None:
78
+ set_determinism(seed=args.random_seed)
79
+
80
+ # ## Setup data directory
81
+ # You can specify a directory with the `MONAI_DATA_DIRECTORY` environment variable.
82
+ # This allows you to save results and reuse downloads.
83
+ # If not specified a temporary directory will be used.
84
+
85
+ directory = os.environ.get("MONAI_DATA_DIRECTORY")
86
+ if directory is not None:
87
+ os.makedirs(directory, exist_ok=True)
88
+ root_dir = tempfile.mkdtemp() if directory is None else directory
89
+ print(root_dir)
90
+
91
+ # TODO: remove the `files` after the files are uploaded to the NGC
92
+ files = [
93
+ {
94
+ "path": "models/autoencoder_epoch273.pt",
95
+ "url": "https://developer.download.nvidia.com/assets/Clara/monai/tutorials"
96
+ "/model_zoo/model_maisi_autoencoder_epoch273_alternative.pt",
97
+ },
98
+ {
99
+ "path": "models/mask_generation_autoencoder.pt",
100
+ "url": "https://developer.download.nvidia.com/assets/Clara/monai"
101
+ "/tutorials/mask_generation_autoencoder.pt",
102
+ },
103
+ {
104
+ "path": "models/mask_generation_diffusion_unet.pt",
105
+ "url": "https://developer.download.nvidia.com/assets/Clara/monai"
106
+ "/tutorials/model_zoo/model_maisi_mask_generation_diffusion_unet_v2.pt",
107
+ },
108
+ {
109
+ "path": "configs/all_anatomy_size_condtions.json",
110
+ "url": "https://developer.download.nvidia.com/assets/Clara/monai/tutorials/all_anatomy_size_condtions.json",
111
+ },
112
+ {
113
+ "path": "datasets/all_masks_flexible_size_and_spacing_4000.zip",
114
+ "url": "https://developer.download.nvidia.com/assets/Clara/monai"
115
+ "/tutorials/all_masks_flexible_size_and_spacing_4000.zip",
116
+ },
117
+ ]
118
+
119
+ if maisi_version == "maisi3d-ddpm":
120
+ files += [
121
+ {
122
+ "path": "models/diff_unet_3d_ddpm.pt",
123
+ "url": "https://developer.download.nvidia.com/assets/Clara/monai/tutorials/model_zoo"
124
+ "/model_maisi_input_unet3d_data-all_steps1000size512ddpm_random_current_inputx_v1_alternative.pt",
125
+ },
126
+ {
127
+ "path": "models/controlnet_3d_ddpm.pt",
128
+ "url": "https://developer.download.nvidia.com/assets/Clara/monai/tutorials/model_zoo"
129
+ "/model_maisi_controlnet-20datasets-e20wl100fold0bc_noi_dia_fsize_current_alternative.pt",
130
+ },
131
+ {
132
+ "path": "configs/candidate_masks_flexible_size_and_spacing_3000.json",
133
+ "url": "https://developer.download.nvidia.com/assets/Clara/monai"
134
+ "/tutorials/candidate_masks_flexible_size_and_spacing_3000.json",
135
+ },
136
+ ]
137
+ elif maisi_version == "maisi3d-rflow":
138
+ files += [
139
+ {
140
+ "path": "models/diff_unet_3d_rflow.pt",
141
+ "url": "https://developer.download.nvidia.com/assets/Clara/monai/tutorials/"
142
+ "diff_unet_ckpt_rflow_epoch19350.pt",
143
+ },
144
+ {
145
+ "path": "models/controlnet_3d_rflow.pt",
146
+ "url": "https://developer.download.nvidia.com/assets/Clara/monai/tutorials/"
147
+ "controlnet_rflow_epoch60.pt",
148
+ },
149
+ {
150
+ "path": "configs/candidate_masks_flexible_size_and_spacing_4000.json",
151
+ "url": "https://developer.download.nvidia.com/assets/Clara/monai"
152
+ "/tutorials/candidate_masks_flexible_size_and_spacing_4000.json",
153
+ },
154
+ ]
155
+ else:
156
+ raise ValueError(
157
+ f"maisi_version has to be chosen from ['maisi3d-ddpm', 'maisi3d-rflow'], yet got {maisi_version}."
158
+ )
159
+
160
+ for file in files:
161
+ file["path"] = file["path"] if "datasets/" not in file["path"] else os.path.join(root_dir, file["path"])
162
+ download_url(url=file["url"], filepath=file["path"])
163
+
164
+ # ## Read in environment setting, including data directory, model directory, and output directory
165
+ # The information for data directory, model directory, and output directory are saved in ./configs/environment.json
166
+ env_dict = json.load(open(args.environment_file, "r"))
167
+ for k, v in env_dict.items():
168
+ # Update the path to the downloaded dataset in MONAI_DATA_DIRECTORY
169
+ val = v if "datasets/" not in v else os.path.join(root_dir, v)
170
+ setattr(args, k, val)
171
+ print(f"{k}: {val}")
172
+ print("Global config variables have been loaded.")
173
+
174
+ # ## Read in configuration setting, including network definition, body region and anatomy to generate, etc.
175
+ #
176
+ # The information for the inference input, like body region and anatomy to generate, is stored in "./configs/config_infer.json".
177
+ # Please refer to README.md for the details.
178
+ config_dict = json.load(open(args.config_file, "r"))
179
+ for k, v in config_dict.items():
180
+ setattr(args, k, v)
181
+
182
+ # check the format of inference inputs
183
+ config_infer_dict = json.load(open(args.inference_file, "r"))
184
+ # override num_split if asked
185
+ if "autoencoder_tp_num_splits" in config_infer_dict:
186
+ args.autoencoder_def["num_splits"] = config_infer_dict["autoencoder_tp_num_splits"]
187
+ args.mask_generation_autoencoder_def["num_splits"] = config_infer_dict["autoencoder_tp_num_splits"]
188
+ for k, v in config_infer_dict.items():
189
+ setattr(args, k, v)
190
+ print(f"{k}: {v}")
191
+
192
+ #
193
+ # ## Read in optional extra configuration setting - typically acceleration options (TRT)
194
+ #
195
+ #
196
+ if args.extra_config_file is not None:
197
+ extra_config_dict = json.load(open(args.extra_config_file, "r"))
198
+ for k, v in extra_config_dict.items():
199
+ setattr(args, k, v)
200
+ print(f"{k}: {v}")
201
+
202
+ check_input(
203
+ args.body_region,
204
+ args.anatomy_list,
205
+ args.label_dict_json,
206
+ args.output_size,
207
+ args.spacing,
208
+ args.controllable_anatomy_size,
209
+ )
210
+ latent_shape = [args.latent_channels, args.output_size[0] // 4, args.output_size[1] // 4, args.output_size[2] // 4]
211
+ print("Network definition and inference inputs have been loaded.")
212
+
213
+ # ## Initialize networks and noise scheduler, then load the trained model weights.
214
+ # The networks and noise scheduler are defined in `config_file`. We will read them in and load the model weights.
215
+ noise_scheduler = define_instance(args, "noise_scheduler")
216
+ mask_generation_noise_scheduler = define_instance(args, "mask_generation_noise_scheduler")
217
+
218
+ device = torch.device("cuda")
219
+
220
+ autoencoder = define_instance(args, "autoencoder").to(device)
221
+ checkpoint_autoencoder = torch.load(args.trained_autoencoder_path, weights_only=True)
222
+ autoencoder.load_state_dict(checkpoint_autoencoder)
223
+
224
+ diffusion_unet = define_instance(args, "diffusion_unet").to(device)
225
+ checkpoint_diffusion_unet = torch.load(args.trained_diffusion_path, weights_only=False)
226
+ diffusion_unet.load_state_dict(checkpoint_diffusion_unet["unet_state_dict"], strict=True)
227
+ scale_factor = checkpoint_diffusion_unet["scale_factor"].to(device)
228
+
229
+ controlnet = define_instance(args, "controlnet").to(device)
230
+ checkpoint_controlnet = torch.load(args.trained_controlnet_path, weights_only=False)
231
+ monai.networks.utils.copy_model_state(controlnet, diffusion_unet.state_dict())
232
+ controlnet.load_state_dict(checkpoint_controlnet["controlnet_state_dict"], strict=True)
233
+
234
+ mask_generation_autoencoder = define_instance(args, "mask_generation_autoencoder").to(device)
235
+ checkpoint_mask_generation_autoencoder = torch.load(
236
+ args.trained_mask_generation_autoencoder_path, weights_only=True
237
+ )
238
+ mask_generation_autoencoder.load_state_dict(checkpoint_mask_generation_autoencoder)
239
+
240
+ mask_generation_diffusion_unet = define_instance(args, "mask_generation_diffusion").to(device)
241
+ checkpoint_mask_generation_diffusion_unet = torch.load(
242
+ args.trained_mask_generation_diffusion_path, weights_only=False
243
+ )
244
+ mask_generation_diffusion_unet.load_state_dict(checkpoint_mask_generation_diffusion_unet["unet_state_dict"])
245
+ mask_generation_scale_factor = checkpoint_mask_generation_diffusion_unet["scale_factor"]
246
+
247
+ print("All the trained model weights have been loaded.")
248
+
249
+ # ## Define the LDM Sampler, which contains functions that will perform the inference.
250
+ ldm_sampler = LDMSampler(
251
+ args.body_region,
252
+ args.anatomy_list,
253
+ args.all_mask_files_json,
254
+ args.all_anatomy_size_conditions_json,
255
+ args.all_mask_files_base_dir,
256
+ args.label_dict_json,
257
+ args.label_dict_remap_json,
258
+ autoencoder,
259
+ diffusion_unet,
260
+ controlnet,
261
+ noise_scheduler,
262
+ scale_factor,
263
+ mask_generation_autoencoder,
264
+ mask_generation_diffusion_unet,
265
+ mask_generation_scale_factor,
266
+ mask_generation_noise_scheduler,
267
+ device,
268
+ latent_shape,
269
+ args.mask_generation_latent_shape,
270
+ args.output_size,
271
+ args.output_dir,
272
+ args.controllable_anatomy_size,
273
+ image_output_ext=args.image_output_ext,
274
+ label_output_ext=args.label_output_ext,
275
+ spacing=args.spacing,
276
+ modality=args.modality,
277
+ num_inference_steps=args.num_inference_steps,
278
+ mask_generation_num_inference_steps=args.mask_generation_num_inference_steps,
279
+ random_seed=args.random_seed,
280
+ autoencoder_sliding_window_infer_size=args.autoencoder_sliding_window_infer_size,
281
+ autoencoder_sliding_window_infer_overlap=args.autoencoder_sliding_window_infer_overlap,
282
+ )
283
+
284
+ print(f"The generated image/mask pairs will be saved in {args.output_dir}.")
285
+ output_filenames = ldm_sampler.sample_multiple_images(args.num_output_samples)
286
+ print("MAISI image/mask generation finished")
287
+
288
+
289
+ if __name__ == "__main__":
290
+ logging.basicConfig(
291
+ stream=sys.stdout,
292
+ level=logging.INFO,
293
+ format="[%(asctime)s.%(msecs)03d][%(levelname)5s](%(name)s) - %(message)s",
294
+ datefmt="%Y-%m-%d %H:%M:%S",
295
+ )
296
+ torch.cuda.reset_peak_memory_stats()
297
+ main()
298
+ peak_memory_gb = torch.cuda.max_memory_allocated() / (1024**3) # Convert to GB
299
+ print(f"Peak GPU memory usage: {peak_memory_gb:.2f} GB")
scripts/quality_check.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) MONAI Consortium
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an "AS IS" BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+
12
+ import numpy as np
13
+
14
+
15
+ def get_masked_data(label_data, image_data, labels):
16
+ """
17
+ Extracts and returns the image data corresponding to specified labels within a 3D volume.
18
+
19
+ This function efficiently masks the `image_data` array based on the provided `labels` in the `label_data` array.
20
+ The function handles cases with both a large and small number of labels, optimizing performance accordingly.
21
+
22
+ Args:
23
+ label_data (np.ndarray): A NumPy array containing label data, representing different anatomical
24
+ regions or classes in a 3D medical image.
25
+ image_data (np.ndarray): A NumPy array containing the image data from which the relevant regions
26
+ will be extracted.
27
+ labels (list of int): A list of integers representing the label values to be used for masking.
28
+
29
+ Returns:
30
+ np.ndarray: A NumPy array containing the elements of `image_data` that correspond to the specified
31
+ labels in `label_data`. If no labels are provided, an empty array is returned.
32
+
33
+ Raises:
34
+ ValueError: If `image_data` and `label_data` do not have the same shape.
35
+
36
+ Example:
37
+ label_int_dict = {"liver": [1], "kidney": [5, 14]}
38
+ masked_data = get_masked_data(label_data, image_data, label_int_dict["kidney"])
39
+ """
40
+
41
+ # Check if the shapes of image_data and label_data match
42
+ if image_data.shape != label_data.shape:
43
+ raise ValueError(
44
+ f"Shape mismatch: image_data has shape {image_data.shape}, "
45
+ f"but label_data has shape {label_data.shape}. They must be the same."
46
+ )
47
+
48
+ if not labels:
49
+ return np.array([]) # Return an empty array if no labels are provided
50
+
51
+ labels = list(set(labels)) # remove duplicate items
52
+
53
+ # Optimize performance based on the number of labels
54
+ num_label_acceleration_thresh = 3
55
+ if len(labels) >= num_label_acceleration_thresh:
56
+ # if many labels, np.isin is faster
57
+ mask = np.isin(label_data, labels)
58
+ else:
59
+ # Use logical OR to combine masks if the number of labels is small
60
+ mask = np.zeros_like(label_data, dtype=bool)
61
+ for label in labels:
62
+ mask = np.logical_or(mask, label_data == label)
63
+
64
+ # Retrieve the masked data
65
+ masked_data = image_data[mask.astype(bool)]
66
+
67
+ return masked_data
68
+
69
+
70
def is_outlier(statistics, image_data, label_data, label_int_dict):
    """
    Perform a quality check on the generated image by comparing its statistics with precomputed thresholds.

    Args:
        statistics (dict): Dictionary containing precomputed statistics. Each entry must
            provide the keys "sigma_6_low", "sigma_6_high", "percentile_0_5", and
            "percentile_99_5" (the code takes the wider of the sigma and percentile bands).
        image_data (np.ndarray): The image data to be checked, typically a 3D NumPy array.
        label_data (np.ndarray): The label data corresponding to the image, used for masking regions of interest.
        label_int_dict (dict): Dictionary mapping label names to their corresponding integer lists.
            e.g., label_int_dict = {"liver": [1], "kidney": [5, 14]}

    Returns:
        dict: A dictionary with labels as keys, each containing the quality check result,
        including whether it's an outlier, the median value, and the thresholds used.
        If no data is found for a label, the median value will be `None` and `is_outlier` will be `False`.

    Example:
        # Example input data (note: percentile keys are required alongside the sigma keys)
        statistics = {
            "liver": {
                "sigma_6_low": -21.596463547885904,
                "sigma_6_high": 156.27881534763367,
                "percentile_0_5": -10.0,
                "percentile_99_5": 150.0,
            },
            "kidney": {
                "sigma_6_low": -15.0,
                "sigma_6_high": 120.0,
                "percentile_0_5": -10.0,
                "percentile_99_5": 110.0,
            },
        }
        label_int_dict = {
            "liver": [1],
            "kidney": [5, 14]
        }
        image_data = np.random.rand(100, 100, 100)  # Replace with actual image data
        label_data = np.zeros((100, 100, 100))  # Replace with actual label data
        label_data[40:60, 40:60, 40:60] = 1  # Example region for liver
        label_data[70:90, 70:90, 70:90] = 5  # Example region for kidney
        result = is_outlier(statistics, image_data, label_data, label_int_dict)
    """
    outlier_results = {}

    for label_name, stats in statistics.items():
        # Take the wider of the 6-sigma band and the robust percentile band so
        # borderline medians are not flagged spuriously.
        low_thresh = min(stats["sigma_6_low"], stats["percentile_0_5"])  # or "sigma_12_low" depending on your needs
        high_thresh = max(stats["sigma_6_high"], stats["percentile_99_5"])  # or "sigma_12_high" depending on your needs

        # Bone intensities legitimately reach high values; cap the upper bound
        # instead of trusting the precomputed statistics.
        if label_name == "bone":
            high_thresh = 1000.0

        # Retrieve the corresponding label integers (missing names yield no voxels).
        labels = label_int_dict.get(label_name, [])
        masked_data = get_masked_data(label_data, image_data, labels)
        masked_data = masked_data[~np.isnan(masked_data)]

        # No voxels for this label: report a neutral (non-outlier) result.
        if masked_data.size == 0:
            outlier_results[label_name] = {
                "is_outlier": False,
                "median_value": None,
                "low_thresh": low_thresh,
                "high_thresh": high_thresh,
            }
            continue

        # Compute the median of the masked region.
        median_value = np.nanmedian(masked_data)

        if np.isnan(median_value):
            median_value = None
            outlier_flag = False
        else:
            # Determine if the median value falls outside the accepted band.
            outlier_flag = median_value < low_thresh or median_value > high_thresh

        outlier_results[label_name] = {
            "is_outlier": outlier_flag,
            "median_value": median_value,
            "low_thresh": low_thresh,
            "high_thresh": high_thresh,
        }

    return outlier_results
scripts/rectified_flow.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) MONAI Consortium
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an "AS IS" BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ #
12
+ # =========================================================================
13
+ # Adapted from https://github.com/hpcaitech/Open-Sora/blob/main/opensora/schedulers/rf/rectified_flow.py
14
+ # which has the following license:
15
+ # https://github.com/hpcaitech/Open-Sora/blob/main/LICENSE
16
+ # Licensed under the Apache License, Version 2.0 (the "License");
17
+ # you may not use this file except in compliance with the License.
18
+ # You may obtain a copy of the License at
19
+ #
20
+ # http://www.apache.org/licenses/LICENSE-2.0
21
+ #
22
+ # Unless required by applicable law or agreed to in writing, software
23
+ # distributed under the License is distributed on an "AS IS" BASIS,
24
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25
+ # See the License for the specific language governing permissions and
26
+ # limitations under the License.
27
+ # =========================================================================
28
+
29
+ from __future__ import annotations
30
+
31
+ from typing import Union
32
+
33
+ import numpy as np
34
+ import torch
35
+ from torch.distributions import LogisticNormal
36
+
37
+ from monai.utils import StrEnum
38
+
39
+ from .ddpm import DDPMPredictionType
40
+ from .scheduler import Scheduler
41
+
42
+
43
class RFlowPredictionType(StrEnum):
    """
    Set of valid prediction type names for the RFlow scheduler's `prediction_type` argument.

    v_prediction: velocity prediction, see section 2.4 https://imagen.research.google/video/paper.pdf
    """

    # Rectified flow is formulated purely in terms of velocity, so this is the
    # only member; it aliases DDPM's v-prediction value for API compatibility.
    V_PREDICTION = DDPMPredictionType.V_PREDICTION
51
+
52
+
53
def timestep_transform(
    t, input_img_size_numel, base_img_size_numel=32 * 32 * 32, scale=1.0, num_train_timesteps=1000, spatial_dim=3
):
    """
    Rescale timestep(s) according to the ratio between the input image volume
    and a reference volume, so that larger images get more steps at the noisy end.

    Args:
        t (torch.Tensor): The original timestep(s).
        input_img_size_numel (torch.Tensor): The input image's size (H * W * D).
        base_img_size_numel (int): reference H*W*D size, usually smaller than input_img_size_numel.
        scale (float): Scaling factor for the transformation.
        num_train_timesteps (int): Total number of training timesteps.
        spatial_dim (int): Number of spatial dimensions in the image.

    Returns:
        torch.Tensor: Transformed timestep(s), in the same range as the input.
    """
    # Work in normalized time on [0, 1].
    normalized_t = t / num_train_timesteps
    # Per-axis resolution ratio relative to the reference volume, optionally scaled.
    ratio = ((input_img_size_numel / base_img_size_numel) ** (1.0 / spatial_dim)) * scale
    # Warp toward t=1 when ratio > 1; identity when ratio == 1.
    warped_t = (ratio * normalized_t) / (1 + (ratio - 1) * normalized_t)
    # Map back to the [0, num_train_timesteps] scale.
    return warped_t * num_train_timesteps
78
+
79
+
80
class RFlowScheduler(Scheduler):
    """
    A rectified flow scheduler for guiding the diffusion process in a generative model.

    Supports uniform and logit-normal sampling methods, timestep transformation for
    different resolutions, and noise addition during diffusion.

    Args:
        num_train_timesteps (int): Total number of training timesteps.
        use_discrete_timesteps (bool): Whether to use discrete timesteps.
        sample_method (str): Training time step sampling method ('uniform' or 'logit-normal').
        loc (float): Location parameter for logit-normal distribution, used only if sample_method='logit-normal'.
        scale (float): Scale parameter for logit-normal distribution, used only if sample_method='logit-normal'.
        use_timestep_transform (bool): Whether to apply timestep transformation.
            If true, there will be more inference timesteps at early(noisy) stages for larger image volumes.
        transform_scale (float): Scaling factor for timestep transformation, used only if use_timestep_transform=True.
        steps_offset (int): Offset added to computed timesteps, used only if use_timestep_transform=True.
        base_img_size_numel (int): Reference image volume size for scaling, used only if use_timestep_transform=True.
        spatial_dim (int): 2 or 3, indicating 2D or 3D images, used only if use_timestep_transform=True.

    Example:

        .. code-block:: python

            # define a scheduler
            noise_scheduler = RFlowScheduler(
                num_train_timesteps = 1000,
                use_discrete_timesteps = True,
                sample_method = 'logit-normal',
                use_timestep_transform = True,
                base_img_size_numel = 32 * 32 * 32,
                spatial_dim = 3
            )

            # during training
            inputs = torch.ones(2,4,64,64,32)
            noise = torch.randn_like(inputs)
            timesteps = noise_scheduler.sample_timesteps(inputs)
            noisy_inputs = noise_scheduler.add_noise(original_samples=inputs, noise=noise, timesteps=timesteps)
            predicted_velocity = diffusion_unet(
                x=noisy_inputs,
                timesteps=timesteps
            )
            loss = loss_l1(predicted_velocity, (inputs - noise))

            # during inference
            noisy_inputs = torch.randn(2,4,64,64,32)
            input_img_size_numel = torch.prod(torch.tensor(noisy_inputs.shape[-3:]))
            noise_scheduler.set_timesteps(
                num_inference_steps=30, input_img_size_numel=input_img_size_numel
            )
            all_next_timesteps = torch.cat(
                (noise_scheduler.timesteps[1:], torch.tensor([0], dtype=noise_scheduler.timesteps.dtype))
            )
            for t, next_t in tqdm(
                zip(noise_scheduler.timesteps, all_next_timesteps),
                total=min(len(noise_scheduler.timesteps), len(all_next_timesteps)),
            ):
                predicted_velocity = diffusion_unet(
                    x=noisy_inputs,
                    timesteps=timesteps
                )
                noisy_inputs, _ = noise_scheduler.step(predicted_velocity, t, noisy_inputs, next_t)
            final_output = noisy_inputs
    """

    def __init__(
        self,
        num_train_timesteps: int = 1000,
        use_discrete_timesteps: bool = True,
        sample_method: str = "uniform",
        loc: float = 0.0,
        scale: float = 1.0,
        use_timestep_transform: bool = False,
        transform_scale: float = 1.0,
        steps_offset: int = 0,
        base_img_size_numel: int = 32 * 32 * 32,
        spatial_dim: int = 3,
    ):
        # rectified flow only accepts velocity prediction
        self.prediction_type = RFlowPredictionType.V_PREDICTION

        self.num_train_timesteps = num_train_timesteps
        self.use_discrete_timesteps = use_discrete_timesteps
        self.base_img_size_numel = base_img_size_numel
        self.spatial_dim = spatial_dim

        # sample method
        if sample_method not in ["uniform", "logit-normal"]:
            raise ValueError(
                f"sample_method = {sample_method}, which has to be chosen from ['uniform', 'logit-normal']."
            )
        self.sample_method = sample_method
        if sample_method == "logit-normal":
            # NOTE: self.distribution / self.sample_t only exist in
            # 'logit-normal' mode; 'uniform' mode never touches them.
            self.distribution = LogisticNormal(torch.tensor([loc]), torch.tensor([scale]))
            self.sample_t = lambda x: self.distribution.sample((x.shape[0],))[:, 0].to(x.device)

        # timestep transform
        self.use_timestep_transform = use_timestep_transform
        self.transform_scale = transform_scale
        self.steps_offset = steps_offset

    def add_noise(self, original_samples: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor:
        """
        Add noise to the original samples by linear interpolation (rectified flow).

        Args:
            original_samples: original samples
            noise: noise to add to samples
            timesteps: timesteps tensor with shape of (N,), indicating the timestep to be computed for each sample.

        Returns:
            noisy_samples: sample with added noise
        """
        # Normalize timesteps to [0, 1], then flip so that timepoint 1 means
        # "clean sample" and 0 means "pure noise".
        timepoints: torch.Tensor = timesteps.float() / self.num_train_timesteps
        timepoints = 1 - timepoints  # [1,1/1000]

        # expand timepoint to noise shape (5D = 3D images, 4D = 2D images)
        if noise.ndim == 5:
            timepoints = timepoints[..., None, None, None, None].expand(-1, *noise.shape[1:])
        elif noise.ndim == 4:
            timepoints = timepoints[..., None, None, None].expand(-1, *noise.shape[1:])
        else:
            raise ValueError(f"noise tensor has to be 4D or 5D tensor, yet got shape of {noise.shape}")

        # Straight-line interpolation between data and noise.
        noisy_samples: torch.Tensor = timepoints * original_samples + (1 - timepoints) * noise

        return noisy_samples

    def set_timesteps(
        self,
        num_inference_steps: int,
        device: str | torch.device | None = None,
        input_img_size_numel: int | None = None,
    ) -> None:
        """
        Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference.

        Args:
            num_inference_steps: number of diffusion steps used when generating samples with a pre-trained model.
            device: target device to put the data.
            input_img_size_numel: int, H*W*D of the image, used with self.use_timestep_transform is True.
        """
        if num_inference_steps > self.num_train_timesteps or num_inference_steps < 1:
            raise ValueError(
                f"`num_inference_steps`: {num_inference_steps} should be at least 1, "
                "and cannot be larger than `self.num_train_timesteps`:"
                f" {self.num_train_timesteps} as the unet model trained with this scheduler can only handle"
                f" maximal {self.num_train_timesteps} timesteps."
            )

        self.num_inference_steps = num_inference_steps
        # prepare timesteps: evenly spaced, descending from num_train_timesteps toward 0
        timesteps = [
            (1.0 - i / self.num_inference_steps) * self.num_train_timesteps for i in range(self.num_inference_steps)
        ]
        if self.use_discrete_timesteps:
            timesteps = [int(round(t)) for t in timesteps]
        if self.use_timestep_transform:
            # Warp the schedule so larger volumes spend more steps at the noisy end.
            timesteps = [
                timestep_transform(
                    t,
                    input_img_size_numel=input_img_size_numel,
                    base_img_size_numel=self.base_img_size_numel,
                    num_train_timesteps=self.num_train_timesteps,
                    spatial_dim=self.spatial_dim,
                )
                for t in timesteps
            ]
        # NOTE(review): float16 limits timestep precision for values above 2048
        # (inherited from upstream) — confirm acceptable for num_train_timesteps > 1000.
        timesteps_np = np.array(timesteps).astype(np.float16)
        if self.use_discrete_timesteps:
            timesteps_np = timesteps_np.astype(np.int64)
        self.timesteps = torch.from_numpy(timesteps_np).to(device)
        # NOTE(review): steps_offset is applied unconditionally here, although the
        # class docstring says it is used only with use_timestep_transform=True.
        self.timesteps += self.steps_offset

    def sample_timesteps(self, x_start):
        """
        Randomly samples training timesteps using the chosen sampling method.

        Args:
            x_start (torch.Tensor): The input tensor for sampling.

        Returns:
            torch.Tensor: Sampled timesteps, one per batch element.
        """
        # sample_method was validated in __init__, so exactly one branch runs.
        if self.sample_method == "uniform":
            t = torch.rand((x_start.shape[0],), device=x_start.device) * self.num_train_timesteps
        elif self.sample_method == "logit-normal":
            t = self.sample_t(x_start) * self.num_train_timesteps

        if self.use_discrete_timesteps:
            t = t.long()

        if self.use_timestep_transform:
            # Derive the spatial volume from the input's spatial dims.
            input_img_size_numel = torch.prod(torch.tensor(x_start.shape[2:]))
            t = timestep_transform(
                t,
                input_img_size_numel=input_img_size_numel,
                base_img_size_numel=self.base_img_size_numel,
                num_train_timesteps=self.num_train_timesteps,
                spatial_dim=len(x_start.shape) - 2,
            )

        return t

    def step(
        self, model_output: torch.Tensor, timestep: int, sample: torch.Tensor, next_timestep: Union[int, None] = None
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Predicts the next sample in the diffusion process.

        Args:
            model_output (torch.Tensor): Output from the trained diffusion model (velocity prediction).
            timestep (int): Current timestep in the diffusion chain.
            sample (torch.Tensor): Current sample in the process.
            next_timestep (Union[int, None]): Optional next timestep.

        Returns:
            tuple[torch.Tensor, torch.Tensor]: Predicted sample at the next step and additional info.
        """
        # Ensure num_inference_steps exists and is a valid integer
        if not hasattr(self, "num_inference_steps") or not isinstance(self.num_inference_steps, int):
            raise AttributeError(
                "num_inference_steps is missing or not an integer in the class."
                "Please run self.set_timesteps(num_inference_steps,device,input_img_size_numel) to set it."
            )

        v_pred = model_output

        # Integration step size: exact gap to next_timestep when given,
        # otherwise a uniform step of the inference schedule.
        if next_timestep is not None:
            next_timestep = int(next_timestep)
            dt: float = (
                float(timestep - next_timestep) / self.num_train_timesteps
            )  # Now next_timestep is guaranteed to be int
        else:
            dt = (
                1.0 / float(self.num_inference_steps) if self.num_inference_steps > 0 else 0.0
            )  # Avoid division by zero

        # Euler step along the predicted velocity; also extrapolate all the
        # way to t=0 for the "predicted original sample".
        pred_post_sample = sample + v_pred * dt
        pred_original_sample = sample + v_pred * timestep / self.num_train_timesteps

        return pred_post_sample, pred_original_sample