diaoquesang committed on
Commit 6434535 · verified · 1 Parent(s): c2101b4

Upload 29 files

.gitattributes CHANGED
@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/BS/0.png filter=lfs diff=lfs merge=lfs -text
+data/BS/1.png filter=lfs diff=lfs merge=lfs -text
+data/BS/2.png filter=lfs diff=lfs merge=lfs -text
+data/CXR/0.png filter=lfs diff=lfs merge=lfs -text
+data/CXR/1.png filter=lfs diff=lfs merge=lfs -text
+data/CXR/2.png filter=lfs diff=lfs merge=lfs -text
+images/ablation.png filter=lfs diff=lfs merge=lfs -text
+images/comparison.png filter=lfs diff=lfs merge=lfs -text
+images/framework.png filter=lfs diff=lfs merge=lfs -text
+images/GL-LCM_gif.gif filter=lfs diff=lfs merge=lfs -text
codes/batch_lcm_eval.py ADDED
@@ -0,0 +1,155 @@
+from config import config
+import numpy as np
+
+from dataset import myC2BDataset
+from transform import myTransform
+from torch.utils.data import DataLoader
+from diffusers import LCMScheduler
+
+from tqdm import tqdm
+import cv2 as cv
+import torch
+import time
+import os
+
+from monai.utils import set_determinism
+
+set_determinism(42)
+
+
+def eval():
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # select the device
+    output_path = os.path.join("lcm_output_bs", "BS")
+    masked_output_path = os.path.join("lcm_output_bs", "Masked_BS")
+    fusion_output_path = os.path.join("lcm_output_bs", "Fusion_BS")
+
+    cxr_path = os.path.join("SZCH-X-Rays", "CXR")
+    masked_cxr_path = os.path.join("SZCH-X-Rays", "Masked_CXR")
+    mask_path = os.path.join("SZCH-X-Rays", "Mask")
+
+    model = torch.load("masked_lcm-600-2024-12-19-myModel.pth").to(device).eval()
+    VQGAN = torch.load("2024-12-12-Mask-SZCH-VQGAN.pth").to(device).eval()
+    testset_list = "SZCH.txt"
+    myTestSet = myC2BDataset(testset_list, cxr_path, masked_cxr_path, myTransform['testTransform'])
+    myTestLoader = DataLoader(myTestSet, batch_size=1, shuffle=False)
+    # set up the noise scheduler
+    noise_scheduler = LCMScheduler(num_train_timesteps=config.num_train_timesteps,
+                                   clip_sample=config.clip_sample,
+                                   clip_sample_range=config.initial_clip_sample_range_g)
+    noise_scheduler.set_timesteps(config.num_infer_timesteps)
+    with torch.no_grad():
+        progress_bar = tqdm(enumerate(myTestLoader), total=len(myTestLoader), ncols=100)
+        total_start = time.time()
+        for step, batch in progress_bar:
+            cxr = batch[0].to(device=device, non_blocking=True).float()
+            masked_cxr = batch[1].to(device=device, non_blocking=True).float()
+            filename = batch[2][0]
+            cxr_copy = np.array(cxr.detach().to("cpu"))
+            cxr_copy = np.squeeze(cxr_copy)  # HW
+            cxr_copy = cxr_copy * 0.5 + 0.5
+            cxr_copy *= 255
+            cxr_copy = cxr_copy.astype(np.uint8)  # uint8 keeps the 0-255 range for the background mask below
+
+            cxr = VQGAN.encode_stage_2_inputs(cxr)
+            masked_cxr = VQGAN.encode_stage_2_inputs(masked_cxr)
+
+            noise = torch.randn_like(cxr).to(device)
+            sample = torch.cat((noise, cxr), dim=1).to(device)  # BCHW
+            masked_sample = torch.cat((noise, masked_cxr), dim=1).to(device)  # BCHW
+
+            for j, t in tqdm(enumerate(noise_scheduler.timesteps)):
+                residual = model(sample, torch.Tensor((t,)).to(device).long()).to(device)
+                masked_residual = model(masked_sample, torch.Tensor((t,)).to(device).long()).to(device)
+                # masked_residual = (1 - config.alpha) * residual + config.alpha * masked_residual
+                masked_residual = config.alpha * masked_residual + (1 - config.alpha) * torch.randn_like(
+                    masked_residual).to(device) / torch.std(masked_residual)
+
+                noise_scheduler = LCMScheduler(num_train_timesteps=config.num_train_timesteps,
+                                               clip_sample=config.clip_sample,
+                                               clip_sample_range=
+                                               config.initial_clip_sample_range_g
+                                               + config.clip_rate * j
+                                               )
+                noise_scheduler.set_timesteps(config.num_infer_timesteps)
+                sample = noise_scheduler.step(residual, t, sample).prev_sample
+
+                noise_scheduler = LCMScheduler(num_train_timesteps=config.num_train_timesteps,
+                                               clip_sample=config.clip_sample,
+                                               clip_sample_range=
+                                               config.initial_clip_sample_range_l
+                                               + config.clip_rate * j
+                                               )
+                noise_scheduler.set_timesteps(config.num_infer_timesteps)
+                masked_sample = noise_scheduler.step(masked_residual, t, masked_sample).prev_sample
+
+            sample = torch.cat((sample[:, :4], cxr), dim=1)  # BCHW
+            masked_sample = torch.cat((masked_sample[:, :4], masked_cxr), dim=1).to(device)  # BCHW
+            if config.output_feature_map:
+                bs_show = np.array(sample[:, 0].detach().to("cpu"))
+                bs_show = np.squeeze(bs_show)  # HW
+                bs_show = bs_show * 0.5 + 0.5
+                bs_show = np.clip(bs_show, 0, 1)
+
+                masked_bs_show = np.array(masked_sample[:, 0].detach().to("cpu"))
+                masked_bs_show = np.squeeze(masked_bs_show)  # HW
+                masked_bs_show = masked_bs_show * 0.5 + 0.5
+                masked_bs_show = np.clip(masked_bs_show, 0, 1)
+
+                if not config.use_server:
+                    cv.imshow("win1", bs_show)
+                    cv.imshow("win2", masked_bs_show)
+                    cv.waitKey(1)
+
+            mask = cv.imread(os.path.join(mask_path, filename), 0)
+            mask[mask < 255] = 0
+
+            bs = VQGAN.decode((sample[:, :4]))
+            bs = np.array(bs.detach().to("cpu"))
+            bs = np.squeeze(bs)  # HW
+            bs = bs * 0.5 + 0.5
+            bs[cxr_copy == 0] = 0
+
+            masked_bs = VQGAN.decode((masked_sample[:, :4]))
+            masked_bs = np.array(masked_bs.detach().to("cpu"))
+            masked_bs = np.squeeze(masked_bs)  # HW
+            masked_bs = masked_bs * 0.5 + 0.5
+            masked_bs[mask > 0] = masked_bs[mask > 0] + np.mean(bs[mask > 0]) - np.mean(masked_bs[mask > 0])
+            masked_bs[cxr_copy == 0] = 0
+            if not config.use_server:
+                cv.imshow("win3", bs)
+                cv.imshow("win4", masked_bs)
+                cv.waitKey(1)
+
+            bs *= 255
+            cv.imwrite(os.path.join(output_path, filename), bs)
+            masked_bs *= 255
+            cv.imwrite(os.path.join(masked_output_path, filename), masked_bs)
+
+
+            num_labels, labels, stats, _ = cv.connectedComponentsWithStats(mask)
+            min_area = 100
+            for i in range(1, num_labels):
+                if stats[i, cv.CC_STAT_AREA] < min_area:
+                    labels[labels == i] = 0
+            mask[labels == 0] = 0
+
+            br = cv.boundingRect(mask)
+            p = (br[0] + br[2] // 2, br[1] + br[3] // 2)
+
+            masked_bs = np.clip(masked_bs, 0, 255)
+            masked_bs = cv.cvtColor(masked_bs, cv.COLOR_GRAY2BGR).astype(np.uint8)
+            bs = np.clip(bs, 0, 255)
+            bs = cv.cvtColor(bs, cv.COLOR_GRAY2BGR).astype(np.uint8)
+
+            fusion_bs = cv.seamlessClone(masked_bs, bs, mask, p, cv.MONOCHROME_TRANSFER)
+            # cv.rectangle(fusion_bs, br, (0, 255, 0), 2)
+            # fusion_bs[mask==255]=(255, 0, 0)
+
+            cv.imwrite(os.path.join(fusion_output_path, filename), fusion_bs)
+
+        total_time = time.time() - total_start
+        print(f"Total time: {total_time}.")
+
+
+if __name__ == "__main__":
+    eval()
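The fusion step at the end of eval() pastes the locally refined result (masked_bs) back into the globally generated image (bs) with OpenCV's Poisson blending. A minimal, self-contained sketch of that call on synthetic arrays (the sizes, values, and region below are illustrative only, not the SZCH-X-Rays data):

import cv2 as cv
import numpy as np

# Synthetic stand-ins for the two decoded outputs: a "global" image and a
# "local" image that differs inside a rectangular lung-field region.
bs = np.full((256, 256), 128, dtype=np.uint8)    # global result
masked_bs = bs.copy()
masked_bs[64:192, 64:192] = 180                  # locally refined region

# Binary mask of the region to transplant and its centre point, mirroring the
# boundingRect / centre computation in the script above.
mask = np.zeros((256, 256), dtype=np.uint8)
mask[64:192, 64:192] = 255
x, y, w, h = cv.boundingRect(mask)
centre = (x + w // 2, y + h // 2)

# seamlessClone expects 8-bit 3-channel inputs; MONOCHROME_TRANSFER is the same
# cloning mode used in the script above.
src = cv.cvtColor(masked_bs, cv.COLOR_GRAY2BGR)
dst = cv.cvtColor(bs, cv.COLOR_GRAY2BGR)
fused = cv.seamlessClone(src, dst, mask, centre, cv.MONOCHROME_TRANSFER)
print(fused.shape, fused.dtype)  # (256, 256, 3) uint8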
codes/config.py ADDED
@@ -0,0 +1,43 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class config():
+    use_server = True
+    test_epoch_interval = 10
+    image_size = 1024
+    r = 8
+
+    # VQGAN
+    vae_epoch_number = 300
+    vae_batch_size = 4
+    milestones_g = [200]
+    milestones_d = [200]
+    initial_learning_rate_g = 1e-4
+    initial_learning_rate_d = 5e-4
+
+    # lcm
+    batch_size = 4
+    epoch_number = 600
+    initial_learning_rate = 1e-4
+    milestones = [300, 400, 500]
+    num_train_timesteps = 1000
+    beta_start = 0.00085
+    beta_end = 0.012
+    offset_noise = True
+    offset_noise_coefficient = 0.1
+    output_feature_map = True
+    clip_sample = True
+    num_infer_timesteps = 50
+    alpha = 3
+    video = False
+
+    # SZCH
+    clip_rate = 0.025
+    initial_clip_sample_range_g = 2
+    initial_clip_sample_range_l = 3.5
+    # JSRT
+    # clip_rate = 0.025
+    # initial_clip_sample_range_g = 1.7
+    # initial_clip_sample_range_l = 3
+
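The two initial_clip_sample_range_* values and clip_rate define the per-step clipping schedule that batch_lcm_eval.py applies: at inference step j the scheduler is rebuilt with clip_sample_range = initial_clip_sample_range + clip_rate * j, so the allowed latent range widens as sampling proceeds, and the global and local paths start from different initial ranges. A small sketch of that schedule using the SZCH values above (the helper function is illustrative, not part of the repository):

# Hypothetical helper reproducing the clip-range schedule used in batch_lcm_eval.py.
def clip_range(initial: float, clip_rate: float, step: int) -> float:
    """Clip range applied at 0-based inference step `step`."""
    return initial + clip_rate * step

for j in (0, 10, 49):                             # 50 inference steps in total
    print(j,
          round(clip_range(2.0, 0.025, j), 3),    # global path, initial_clip_sample_range_g
          round(clip_range(3.5, 0.025, j), 3))    # local path, initial_clip_sample_range_l
# 0 2.0 3.5
# 10 2.25 3.75
# 49 3.225 4.725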
codes/dataset.py ADDED
@@ -0,0 +1,74 @@
+from torch.utils.data import Dataset
+import pandas as pd
+import cv2 as cv
+import os
+from config import config
+import torch
+import numpy as np
+
+
+class mySingleDataset(Dataset):  # dataset of single images
+    def __init__(self, filelist, img_dir, transform=None):  # arguments: file list, image directory, transform
+        self.img_dir = img_dir  # image directory
+        self.transform = transform  # image transform
+        self.filelist = pd.read_csv(filelist, sep="\t", header=None)  # read the list of filenames
+
+    def __len__(self):
+        return len(self.filelist)  # the number of filenames is the dataset length
+
+    def __getitem__(self, idx):  # fetch one sample
+        img_path = self.img_dir  # image folder path
+
+        file = self.filelist.iloc[idx, 0]  # read the filename
+        image = cv.imread(os.path.join(img_path, file))  # read the image with OpenCV's imread
+
+        if self.transform:
+            image = self.transform(image)  # apply the transform
+        return image, file  # return the image and its filename
+
+
+class myDataset(Dataset):  # dataset of paired CXR / bone-suppressed images
+    def __init__(self, filelist, cxr_dir, bs_dir,
+                 transform=None):  # arguments: file list, CXR directory, BS directory, transform
+        self.cxr_dir = cxr_dir  # CXR directory
+        self.bs_dir = bs_dir  # BS directory
+
+        self.transform = transform  # image transform
+        self.filelist = pd.read_csv(filelist, sep="\t", header=None)  # read the list of filenames
+
+    def __len__(self):
+        return len(self.filelist)  # the number of filenames is the dataset length
+
+    def __getitem__(self, idx):  # fetch one sample
+        file = self.filelist.iloc[idx, 0]  # read the filename
+        cxr = cv.imread(os.path.join(self.cxr_dir, file))  # read the CXR with OpenCV's imread
+        bs = cv.imread(os.path.join(self.bs_dir, file))  # read the BS image with OpenCV's imread
+
+        if self.transform:
+            cxr = self.transform(cxr)  # apply the transform
+            bs = self.transform(bs)  # apply the transform
+
+        return cxr, bs, file  # return the image, its label and the filename
+
+class myC2BDataset(Dataset):  # dataset of paired CXR / masked-CXR images
+    def __init__(self, filelist, cxr_dir, masked_cxr_dir,
+                 transform=None):  # arguments: file list, CXR directory, masked-CXR directory, transform
+        self.cxr_dir = cxr_dir  # CXR directory
+        self.masked_cxr_dir = masked_cxr_dir  # masked-CXR directory
+
+        self.transform = transform  # image transform
+        self.filelist = pd.read_csv(filelist, sep="\t", header=None)  # read the list of filenames
+
+    def __len__(self):
+        return len(self.filelist)  # the number of filenames is the dataset length
+
+    def __getitem__(self, idx):  # fetch one sample
+        file = self.filelist.iloc[idx, 0]  # read the filename
+        cxr = cv.imread(os.path.join(self.cxr_dir, file))  # read the CXR with OpenCV's imread
+        masked_cxr = cv.imread(os.path.join(self.masked_cxr_dir, file))  # read the masked CXR with OpenCV's imread
+
+        if self.transform:
+            cxr = self.transform(cxr)  # apply the transform
+            masked_cxr = self.transform(masked_cxr)  # apply the transform
+
+        return cxr, masked_cxr, file  # return the images and the filename
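All three dataset classes read the file list with pd.read_csv(filelist, sep="\t", header=None), i.e. a plain text file with one image filename per line, and load images with OpenCV. A minimal usage sketch that builds a toy list plus dummy images and iterates myC2BDataset (every path and file below is a placeholder, not the repository's actual data):

import os
import cv2 as cv
import numpy as np
from torch.utils.data import DataLoader
from dataset import myC2BDataset

# Write a toy file list and matching dummy images (placeholders only).
os.makedirs("demo/CXR", exist_ok=True)
os.makedirs("demo/Masked_CXR", exist_ok=True)
with open("demo_list.txt", "w") as f:
    for name in ("0.png", "1.png"):
        f.write(name + "\n")
        cv.imwrite(os.path.join("demo/CXR", name), np.zeros((64, 64), np.uint8))
        cv.imwrite(os.path.join("demo/Masked_CXR", name), np.zeros((64, 64), np.uint8))

# transform=None yields raw OpenCV arrays; the real scripts pass myTransform[...].
ds = myC2BDataset("demo_list.txt", "demo/CXR", "demo/Masked_CXR", transform=None)
loader = DataLoader(ds, batch_size=1, shuffle=False)
for cxr, masked_cxr, name in loader:
    print(name[0], cxr.shape)  # "0.png" and a [1, 64, 64, 3] tensor (imread returns BGR)
    break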
codes/datasetSegmentation.py ADDED
@@ -0,0 +1,45 @@
+import random
+import os
+
+
+def traverse_directory(directory, txt_name):
+    # create an empty list to store the filenames
+    file_names = []
+    # walk through all files and subdirectories
+    for root, dirs, files in os.walk(directory):
+        for file in files:
+            file_names.append(file)
+    # sort by filename (optional)
+    file_names.sort()
+    # write the filenames to a new txt file
+    with open(txt_name, 'w') as f:
+        for file_name in file_names:
+            f.write(file_name + '\n')
+
+
+def split_dataset(file_path, train_ratio=0.8, val_ratio=0.1, former=None):
+    # read the dataset
+    with open(file_path, 'r') as f:
+        data = f.readlines()
+    # shuffle the dataset
+    random.shuffle(data)
+
+    train_size = int(len(data) * train_ratio)
+    val_size = int(len(data) * val_ratio)
+
+    train_set = data[:train_size]
+    val_set = data[train_size:train_size + val_size]
+    test_set = data[train_size + val_size:]
+    with open(former + '_trainset.txt', 'w') as f:
+        f.writelines(train_set)
+    with open(former + '_valset.txt', 'w') as f:
+        f.writelines(val_set)
+    with open(former + '_testset.txt', 'w') as f:
+        f.writelines(test_set)
+    print("Finished.")
+
+
+if __name__ == "__main__":
+    traverse_directory('JSRTnew1024-241/CXR', "JSRT.txt")
+
+    split_dataset('JSRT.txt', 0.8, 0.1, "JSRT")
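With the default ratios, split_dataset keeps 80% of the shuffled lines for training and 10% for validation; because both sizes are truncated with int(), the test split receives whatever remains, including rounding leftovers. A quick arithmetic check of the resulting counts (the file count below is only an assumed example):

# Assumed example count; the real number comes from the lines in JSRT.txt.
N = 241
train_size = int(N * 0.8)               # 192
val_size = int(N * 0.1)                 # 24
test_size = N - train_size - val_size   # 25, absorbing the rounding remainder
print(train_size, val_size, test_size)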
codes/lcm_train.py ADDED
@@ -0,0 +1,180 @@
+from matplotlib import pyplot as plt
+from config import config
+from dataset import myDataset
+from transform import myTransform
+from torch.utils.data import DataLoader
+from model import myUnet, myVQGANModel
+from diffusers import LCMScheduler, DDPMScheduler
+from torch.optim.lr_scheduler import MultiStepLR
+from tqdm import tqdm
+from datetime import date
+
+import torch.nn.functional as F
+import torch
+import time
+from monai.utils import set_determinism
+
+set_determinism(42)
+
+
+def train():
+    if config.use_server:
+        file = open('log.txt', 'w')  # where the log is saved
+    else:
+        file = None  # no log file; print to stdout
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # select the device
+
+    train_file_list = "JSRT_trainset.txt"  # text file listing the training filenames
+    test_file_list = "JSRT_valset.txt"  # text file listing the validation filenames
+
+    cxr_path = "/mntcephfs/med_dataset/SYF/JSRTnew1024-241/CXR"  # image folder path
+    bs_path = "/mntcephfs/med_dataset/SYF/JSRTnew1024-241/BS"  # image folder path
+    masked_cxr_path = "/mntcephfs/med_dataset/SYF/JSRTnew1024-241/Masked_CXR"  # image folder path
+    masked_bs_path = "/mntcephfs/med_dataset/SYF/JSRTnew1024-241/Masked_BS"  # image folder path
+
+    myTrainSet = myDataset(train_file_list, cxr_path, bs_path,
+                           myTransform['trainTransform']) + myDataset(train_file_list, masked_cxr_path, masked_bs_path,
+                                                                      myTransform['trainTransform'])
+    myTestSet = myDataset(test_file_list, cxr_path, bs_path,
+                          myTransform['testTransform']) + myDataset(test_file_list, masked_cxr_path, masked_bs_path,
+                                                                    myTransform['testTransform'])
+
+    myTrainLoader = DataLoader(myTrainSet, batch_size=config.batch_size, shuffle=True)
+    myTestLoader = DataLoader(myTestSet, batch_size=config.batch_size, shuffle=True)
+
+    print("Number of batches in train set:", len(myTrainLoader))  # number of training batches
+    print("Train set size:", len(myTrainSet))  # training set size
+    print("Number of batches in test set:", len(myTestLoader))  # number of test batches
+    print("Test set size:", len(myTestSet))  # test set size
+
+    model = myUnet.to(device).train()
+
+    # set up the noise scheduler
+    noise_scheduler = LCMScheduler(num_train_timesteps=config.num_train_timesteps)
+    noise_scheduler.set_timesteps(config.num_infer_timesteps)
+    # set up the optimizer and learning-rate schedule
+    optimizer = torch.optim.AdamW(model.parameters(), lr=config.initial_learning_rate, eps=1e-6)
+    milestones = [x * len(myTrainLoader) for x in config.milestones]
+    optimizer_scheduler = MultiStepLR(optimizer, milestones=milestones, gamma=0.1)
+
+    train_losses = []
+    test_losses = []
+    plt_train_loss_epoch = []
+    plt_test_loss_epoch = []
+    train_epoch_list = list(range(0, config.epoch_number))
+    test_epoch_list = list(range(0, int(config.epoch_number / config.test_epoch_interval)))
+
+    VQGAN = torch.load("2025-02-04-Mask-JSRT-VQGAN.pth").to(device).eval()
+    print(time.strftime("%H:%M:%S", time.localtime()), "----------Begin Training----------", file=file)
+    for epoch in range(config.epoch_number):
+        model.train()
+        print(time.strftime("%H:%M:%S", time.localtime()),
+              f"Epoch:{epoch},learning rate:{optimizer.param_groups[0]['lr']}", file=file)
+        for i, batch in tqdm(enumerate(myTrainLoader)):
+            cxr_i, bs_i = batch[0].to(device), batch[1].to(device)
+
+            with torch.no_grad():
+                cxr = VQGAN.encode_stage_2_inputs(cxr_i)
+                bs = VQGAN.encode_stage_2_inputs(bs_i)
+
+            cat = torch.cat((bs, cxr), dim=-3)
+
+            # add noise to the images
+            if config.offset_noise:
+                noise = torch.randn_like(cxr).to(device) + config.offset_noise_coefficient * torch.randn(
+                    cxr.shape[0], cxr.shape[1], 1,
+                    1).to(device)
+            else:
+                noise = torch.randn_like(cxr).to(device)
+
+            blank = torch.zeros_like(cxr).to(device)
+            noise = torch.cat((noise, blank), dim=-3)
+
+            # sample a random timestep for each image
+            timesteps = torch.randint(0, config.num_train_timesteps, (cxr.shape[0],), device=device).long()
+
+            # add noise to the clean images according to the noise magnitude at each timestep
+            noisy_images = noise_scheduler.add_noise(cat, noise, timesteps)
+
+            # get the model prediction
+            noise_pred = model(noisy_images, timesteps)
+
+            # compute the loss
+            loss = F.mse_loss(noise_pred[:, :4].float(), noise[:, :4].float())
+
+            loss.backward()
+            train_losses.append(loss.item())
+
+            # update the model parameters
+            optimizer.step()
+            optimizer.zero_grad()
+            optimizer_scheduler.step()
+
+        train_loss_epoch = sum(train_losses[-len(myTrainLoader):]) / len(myTrainLoader)
+        print(time.strftime("%H:%M:%S", time.localtime()), f"Epoch:{epoch},train losses:{train_loss_epoch}", file=file)
+        plt_train_loss_epoch.append(train_loss_epoch)
+
+        if (epoch + 1) % config.test_epoch_interval == 0:
+            model.eval()
+            print(time.strftime("%H:%M:%S", time.localtime()), "----------Stop Training----------", file=file)
+            print(time.strftime("%H:%M:%S", time.localtime()), "----------Begin Testing----------", file=file)
+            with torch.no_grad():
+                for i, batch in tqdm(enumerate(myTestLoader)):
+                    cxr_i, bs_i = batch[0].to(device), batch[1].to(device)
+
+                    with torch.no_grad():
+                        cxr = VQGAN.encode_stage_2_inputs(cxr_i)
+                        bs = VQGAN.encode_stage_2_inputs(bs_i)
+
+                    cat = torch.cat((bs, cxr), dim=-3)
+
+                    # add noise to the images
+                    if config.offset_noise:
+                        noise = torch.randn_like(cxr).to(device) + config.offset_noise_coefficient * torch.randn(
+                            cxr.shape[0],
+                            cxr.shape[1], 1,
+                            1).to(device)
+                    else:
+                        noise = torch.randn_like(cxr).to(device)
+
+                    blank = torch.zeros_like(cxr).to(device)
+                    noise = torch.cat((noise, blank), dim=-3)
+
+                    # sample a random timestep for each image
+                    timesteps = torch.randint(0, config.num_train_timesteps, (cxr.shape[0],),
+                                              device=device).long()
+
+                    # add noise to the clean images according to the noise magnitude at each timestep
+                    noisy_images = noise_scheduler.add_noise(cat, noise, timesteps)
+
+                    # get the model prediction
+                    noise_pred = model(noisy_images, timesteps)
+
+                    # compute the loss
+                    loss = F.mse_loss(noise_pred[:, :4].float(), noise[:, :4].float())
+
+                    test_losses.append(loss.item())
+
+                test_loss_epoch = sum(test_losses[-len(myTestLoader):]) / len(myTestLoader)
+                print(time.strftime("%H:%M:%S", time.localtime()), f"Epoch:{epoch},test losses:{test_loss_epoch}",
+                      file=file)
+                plt_test_loss_epoch.append(test_loss_epoch)
+                print(time.strftime("%H:%M:%S", time.localtime()), "----------End Validation----------", file=file)
+                print(time.strftime("%H:%M:%S", time.localtime()), "----------Continue to Train----------",
+                      file=file)
+    print(time.strftime("%H:%M:%S", time.localtime()), "----------End Training Normally----------", file=file)
+    # plot the loss curves
+    f, ([ax1, ax2]) = plt.subplots(1, 2)
+    ax1.plot(train_epoch_list, plt_train_loss_epoch, color="red")  # plot the curve
+    ax1.set_title('Train loss')  # add a title
+    ax2.plot(test_epoch_list, plt_test_loss_epoch, color="blue")  # plot the curve
+    ax2.set_title('Test loss')  # add a title
+    plt.savefig("./loss.png")  # save the loss curves
+    if not config.use_server:
+        plt.show()  # show the loss curves
+    torch.save(model, "masked_lcm-600JSRT-" + str(date.today()) + "-myModel.pth")
+
+
+if __name__ == "__main__":
+    train()
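Training operates on an 8-channel latent: the first 4 channels hold the VQGAN latent of the bone-suppressed image (the only part the loss sees via noise_pred[:, :4]), and the last 4 channels carry the CXR latent as conditioning, which is why zeros are concatenated onto the noise. A compact sketch of that layout and of the offset-noise trick, with random tensors standing in for VQGAN latents:

import torch

B, C, H, W = 2, 4, 128, 128            # latent shape; placeholders for VQGAN outputs
bs_latent = torch.randn(B, C, H, W)    # stands in for VQGAN.encode_stage_2_inputs(bs)
cxr_latent = torch.randn(B, C, H, W)   # stands in for the CXR conditioning latent

cat = torch.cat((bs_latent, cxr_latent), dim=-3)  # 8-channel tensor the UNet denoises

# Offset noise: a per-(sample, channel) constant shift on top of white noise,
# with the coefficient 0.1 matching offset_noise_coefficient in config.py.
noise = torch.randn_like(bs_latent) + 0.1 * torch.randn(B, C, 1, 1)
noise = torch.cat((noise, torch.zeros_like(cxr_latent)), dim=-3)  # conditioning stays clean

print(cat.shape, noise.shape)  # torch.Size([2, 8, 128, 128]) for both
# The loss in lcm_train.py then compares only the first 4 channels:
#   F.mse_loss(noise_pred[:, :4], noise[:, :4])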
codes/lungSegmentation.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
codes/metrics.py ADDED
@@ -0,0 +1,140 @@
+import cv2 as cv
+import lpips
+import numpy as np
+import os
+from config import config
+from openpyxl import Workbook
+
+import torch
+from math import log10, sqrt
+import torchvision.transforms as transforms
+
+def cal_BSR(cxr_path, gt_path, bs_path):
+    cxr = cv.imread(cxr_path, 0)
+    gt = cv.imread(gt_path, 0)
+    bs = cv.imread(bs_path, 0)
+
+    cxr = cxr / 255
+    gt = gt / 255
+    bs = bs / 255
+
+    bone = cv.subtract(cxr, gt)
+
+
+    gt = cv.resize(gt, (config.image_size, config.image_size))
+    bs = cv.resize(bs, (config.image_size, config.image_size))
+    bone = cv.resize(bone, (config.image_size, config.image_size))
+
+    bs += np.average(cv.subtract(gt, bs))
+
+    bias = cv.subtract(gt, bs)
+    bias[bias < 0] = 0
+
+    BSR = 1 - np.sum(bias ** 2) / np.sum(bone ** 2)
+    return BSR
+
+
+def cal_MSE(gt_path, bs_path):
+    gt = cv.imread(gt_path, 0)
+    bs = cv.imread(bs_path, 0)
+
+
+
+    gt = cv.resize(gt, (config.image_size, config.image_size))
+    bs = cv.resize(bs, (config.image_size, config.image_size))
+
+    # MSE = np.mean((gt - bs) ** 2)  # superseded by the normalised value below
+    MSE = 2 * lpips.l2(gt, bs)  # lpips.l2 normalises by 255 by default, so MSE is on a [0, 1] scale
+    return MSE
+
+
+
+
+def cal_PSNR(gt_path, bs_path):
+    mse = cal_MSE(gt_path, bs_path)
+    max_pixel = 1
+
+    PSNR = 20 * log10(max_pixel / sqrt(mse))
+    return PSNR
+
+
+
+
+def cal_LPIPS(gt_path, bs_path):
+    lpips_model = lpips.LPIPS()
+
+    gt = cv.imread(gt_path, 0)
+    bs = cv.imread(bs_path, 0)
+
+
+    gt = cv.resize(gt, (config.image_size, config.image_size))
+    bs = cv.resize(bs, (config.image_size, config.image_size))
+
+    gt = transforms.ToTensor()(gt)
+    bs = transforms.ToTensor()(bs)
+
+    gt = torch.unsqueeze(gt, dim=0)
+    bs = torch.unsqueeze(bs, dim=0)
+
+    LPIPS = lpips_model(gt, bs).item()
+
+    return LPIPS
+
+
+if __name__ == "__main__":
+    wb = Workbook()
+
+    ws = wb.active
+
+    CXR_path = "SZCH-X-Rays/CXR"
+    GT_path = "SZCH-X-Rays/BS"
+    BS_path = "lcm_output_bs/Fusion_BS"
+
+    BSR_list = []
+    MSE_list = []
+    PSNR_list = []
+    LPIPS_list = []
+    ws.append(["Filename", "BSR", "MSE", "PSNR", "LPIPS"])
+    txt = 'SZCH_testset.txt'
+    with open(txt, 'r', encoding='utf-8') as file:
+        lines = file.readlines()
+        file_names = [line.strip() for line in lines]
+
+    for filename in os.listdir(BS_path):
+        # only evaluate files that belong to the test split
+        if filename not in file_names:
+            continue
+
+        cxr_path = os.path.join(CXR_path, filename)
+        gt_path = os.path.join(GT_path, filename)
+        bs_path = os.path.join(BS_path, filename)
+
+        BSR = cal_BSR(cxr_path, gt_path, bs_path)
+        MSE = cal_MSE(gt_path, bs_path)
+        PSNR = cal_PSNR(gt_path, bs_path)
+        LPIPS = cal_LPIPS(gt_path, bs_path)
+
+        BSR_list.append(BSR)
+        MSE_list.append(MSE)
+        PSNR_list.append(PSNR)
+        LPIPS_list.append(LPIPS)
+        print(f"{filename} BSR: {BSR} MSE: {MSE} PSNR: {PSNR} LPIPS: {LPIPS}")
+        ws.append([filename, BSR, MSE, PSNR, LPIPS])
+
+    ws.append(["Mean",
+               np.mean(np.array(BSR_list)),
+               np.mean(np.array(MSE_list)),
+               np.mean(np.array(PSNR_list)),
+               np.mean(np.array(LPIPS_list))])
+    ws.append(["Std",
+               np.std(np.array(BSR_list)),
+               np.std(np.array(MSE_list)),
+               np.std(np.array(PSNR_list)),
+               np.std(np.array(LPIPS_list))])
+    print("Average BSR:", np.mean(np.array(BSR_list)), "Std:", np.std(np.array(BSR_list)))
+    print("Average MSE:", np.mean(np.array(MSE_list)), "Std:", np.std(np.array(MSE_list)))
+    print("Average PSNR:", np.mean(np.array(PSNR_list)), "Std:", np.std(np.array(PSNR_list)))
+    print("Average LPIPS:", np.mean(np.array(LPIPS_list)), "Std:", np.std(np.array(LPIPS_list)))
+
+    # save the workbook to a file
+    wb.save("sample.xlsx")
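Since cal_MSE returns an error on the normalised [0, 1] intensity scale, cal_PSNR uses max_pixel = 1 in the standard formula PSNR = 20 * log10(max_pixel / sqrt(MSE)). A worked example with an assumed MSE value (not a measured result):

from math import log10, sqrt

mse = 0.001                    # assumed normalised MSE, for illustration only
max_pixel = 1
psnr = 20 * log10(max_pixel / sqrt(mse))
print(round(psnr, 2))          # 30.0 -> an MSE of 1e-3 on a [0, 1] scale is 30 dB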
codes/model.py ADDED
@@ -0,0 +1,37 @@
+
+from modules.unet import UNetModel
+from generative.networks.nets import VQVAE
+from config import config
+
+myUnet = UNetModel(
+    image_size=config.image_size / config.r,
+    model_channels=128,
+    in_channels=8,
+    out_channels=8,
+    num_res_blocks=8,
+    num_heads=8,
+    attention_resolutions=(64, 32, 16, 8),
+    num_heads_upsample=-1,
+    num_head_channels=-1,
+    resblock_updown=True,
+    channel_mult=(1, 1, 2, 2, 4, 4),
+    use_scale_shift_norm=True,
+    use_new_attention_order=True
+)
+
+myVQGANModel = VQVAE(
+    spatial_dims=2,
+    in_channels=1,
+    out_channels=1,
+    num_channels=(128, 256, 512),
+    num_res_channels=512,
+    num_res_layers=2,
+    downsample_parameters=((2, 4, 1, 1), (2, 4, 1, 1), (2, 4, 1, 1),),
+    upsample_parameters=((2, 4, 1, 1, 0), (2, 4, 1, 1, 0), (2, 4, 1, 1, 0)),
+    num_embeddings=1024,
+    embedding_dim=4,
+)
+
+if __name__ == "__main__":
+    print("Number of UNet parameters:", sum([p.numel() for p in myUnet.parameters()]))
+    print("Number of VQGAN parameters:", sum([p.numel() for p in myVQGANModel.parameters()]))
codes/modules/__init__.py ADDED
@@ -0,0 +1,3 @@
+"""
+Codebase for "Bone Suppression via conditional diffusion model".
+"""
codes/modules/diffusion_model_unet.py ADDED
@@ -0,0 +1,2099 @@
1
+ # Copyright (c) MONAI Consortium
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ # Unless required by applicable law or agreed to in writing, software
7
+ # distributed under the License is distributed on an "AS IS" BASIS,
8
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9
+ # See the License for the specific language governing permissions and
10
+ # limitations under the License.
11
+ #
12
+ # =========================================================================
13
+ # Adapted from https://github.com/huggingface/diffusers
14
+ # which has the following license:
15
+ # https://github.com/huggingface/diffusers/blob/main/LICENSE
16
+ #
17
+ # Copyright 2022 UC Berkeley Team and The HuggingFace Team. All rights reserved.
18
+ #
19
+ # Licensed under the Apache License, Version 2.0 (the "License");
20
+ # you may not use this file except in compliance with the License.
21
+ # You may obtain a copy of the License at
22
+ #
23
+ # http://www.apache.org/licenses/LICENSE-2.0
24
+ #
25
+ # Unless required by applicable law or agreed to in writing, software
26
+ # distributed under the License is distributed on an "AS IS" BASIS,
27
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
28
+ # See the License for the specific language governing permissions and
29
+ # limitations under the License.
30
+ # =========================================================================
31
+
32
+ from __future__ import annotations
33
+
34
+ import importlib.util
35
+ import math
36
+ from collections.abc import Sequence
37
+
38
+ import torch
39
+ import torch.nn.functional as F
40
+ from monai.networks.blocks import Convolution, MLPBlock
41
+ from monai.networks.layers.factories import Pool
42
+ from monai.utils import ensure_tuple_rep
43
+ from torch import nn
44
+
45
+ # To install xformers, use pip install xformers==0.0.16rc401
46
+ if importlib.util.find_spec("xformers") is not None:
47
+ import xformers
48
+ import xformers.ops
49
+
50
+ has_xformers = True
51
+ else:
52
+ xformers = None
53
+ has_xformers = False
54
+
55
+
56
+ # TODO: Use MONAI's optional_import
57
+ # from monai.utils import optional_import
58
+ # xformers, has_xformers = optional_import("xformers.ops", name="xformers")
59
+
60
+ __all__ = ["DiffusionModelUNet"]
61
+
62
+
63
+ def zero_module(module: nn.Module) -> nn.Module:
64
+ """
65
+ Zero out the parameters of a module and return it.
66
+ """
67
+ for p in module.parameters():
68
+ p.detach().zero_()
69
+ return module
70
+
71
+
72
+ class CrossAttention(nn.Module):
73
+ """
74
+ A cross attention layer.
75
+
76
+ Args:
77
+ query_dim: number of channels in the query.
78
+ cross_attention_dim: number of channels in the context.
79
+ num_attention_heads: number of heads to use for multi-head attention.
80
+ num_head_channels: number of channels in each head.
81
+ dropout: dropout probability to use.
82
+ upcast_attention: if True, upcast attention operations to full precision.
83
+ use_flash_attention: if True, use flash attention for a memory efficient attention mechanism.
84
+ """
85
+
86
+ def __init__(
87
+ self,
88
+ query_dim: int,
89
+ cross_attention_dim: int | None = None,
90
+ num_attention_heads: int = 8,
91
+ num_head_channels: int = 64,
92
+ dropout: float = 0.0,
93
+ upcast_attention: bool = False,
94
+ use_flash_attention: bool = False,
95
+ ) -> None:
96
+ super().__init__()
97
+ self.use_flash_attention = use_flash_attention
98
+ inner_dim = num_head_channels * num_attention_heads
99
+ cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
100
+
101
+ self.scale = 1 / math.sqrt(num_head_channels)
102
+ self.num_heads = num_attention_heads
103
+
104
+ self.upcast_attention = upcast_attention
105
+
106
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
107
+ self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias=False)
108
+ self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias=False)
109
+
110
+ self.to_out = nn.Sequential(nn.Linear(inner_dim, query_dim), nn.Dropout(dropout))
111
+
112
+ def reshape_heads_to_batch_dim(self, x: torch.Tensor) -> torch.Tensor:
113
+ """
114
+ Divide hidden state dimension to the multiple attention heads and reshape their input as instances in the batch.
115
+ """
116
+ batch_size, seq_len, dim = x.shape
117
+ x = x.reshape(batch_size, seq_len, self.num_heads, dim // self.num_heads)
118
+ x = x.permute(0, 2, 1, 3).reshape(batch_size * self.num_heads, seq_len, dim // self.num_heads)
119
+ return x
120
+
121
+ def reshape_batch_dim_to_heads(self, x: torch.Tensor) -> torch.Tensor:
122
+ """Combine the output of the attention heads back into the hidden state dimension."""
123
+ batch_size, seq_len, dim = x.shape
124
+ x = x.reshape(batch_size // self.num_heads, self.num_heads, seq_len, dim)
125
+ x = x.permute(0, 2, 1, 3).reshape(batch_size // self.num_heads, seq_len, dim * self.num_heads)
126
+ return x
127
+
128
+ def _memory_efficient_attention_xformers(
129
+ self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
130
+ ) -> torch.Tensor:
131
+ query = query.contiguous()
132
+ key = key.contiguous()
133
+ value = value.contiguous()
134
+ x = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=None)
135
+ return x
136
+
137
+ def _attention(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor:
138
+ dtype = query.dtype
139
+ if self.upcast_attention:
140
+ query = query.float()
141
+ key = key.float()
142
+
143
+ attention_scores = torch.baddbmm(
144
+ torch.empty(query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device),
145
+ query,
146
+ key.transpose(-1, -2),
147
+ beta=0,
148
+ alpha=self.scale,
149
+ )
150
+ attention_probs = attention_scores.softmax(dim=-1)
151
+ attention_probs = attention_probs.to(dtype=dtype)
152
+
153
+ x = torch.bmm(attention_probs, value)
154
+ return x
155
+
156
+ def forward(self, x: torch.Tensor, context: torch.Tensor | None = None) -> torch.Tensor:
157
+ query = self.to_q(x)
158
+ context = context if context is not None else x
159
+ key = self.to_k(context)
160
+ value = self.to_v(context)
161
+
162
+ # Multi-Head Attention
163
+ query = self.reshape_heads_to_batch_dim(query)
164
+ key = self.reshape_heads_to_batch_dim(key)
165
+ value = self.reshape_heads_to_batch_dim(value)
166
+
167
+ if self.use_flash_attention:
168
+ x = self._memory_efficient_attention_xformers(query, key, value)
169
+ else:
170
+ x = self._attention(query, key, value)
171
+
172
+ x = self.reshape_batch_dim_to_heads(x)
173
+ x = x.to(query.dtype)
174
+
175
+ return self.to_out(x)
176
+
177
+
178
+ class BasicTransformerBlock(nn.Module):
179
+ """
180
+ A basic Transformer block.
181
+
182
+ Args:
183
+ num_channels: number of channels in the input and output.
184
+ num_attention_heads: number of heads to use for multi-head attention.
185
+ num_head_channels: number of channels in each attention head.
186
+ dropout: dropout probability to use.
187
+ cross_attention_dim: size of the context vector for cross attention.
188
+ upcast_attention: if True, upcast attention operations to full precision.
189
+ use_flash_attention: if True, use flash attention for a memory efficient attention mechanism.
190
+ """
191
+
192
+ def __init__(
193
+ self,
194
+ num_channels: int,
195
+ num_attention_heads: int,
196
+ num_head_channels: int,
197
+ dropout: float = 0.0,
198
+ cross_attention_dim: int | None = None,
199
+ upcast_attention: bool = False,
200
+ use_flash_attention: bool = False,
201
+ ) -> None:
202
+ super().__init__()
203
+ self.attn1 = CrossAttention(
204
+ query_dim=num_channels,
205
+ num_attention_heads=num_attention_heads,
206
+ num_head_channels=num_head_channels,
207
+ dropout=dropout,
208
+ upcast_attention=upcast_attention,
209
+ use_flash_attention=use_flash_attention,
210
+ ) # is a self-attention
211
+ self.ff = MLPBlock(hidden_size=num_channels, mlp_dim=num_channels * 4, act="GEGLU", dropout_rate=dropout)
212
+ self.attn2 = CrossAttention(
213
+ query_dim=num_channels,
214
+ cross_attention_dim=cross_attention_dim,
215
+ num_attention_heads=num_attention_heads,
216
+ num_head_channels=num_head_channels,
217
+ dropout=dropout,
218
+ upcast_attention=upcast_attention,
219
+ use_flash_attention=use_flash_attention,
220
+ ) # is a self-attention if context is None
221
+ self.norm1 = nn.LayerNorm(num_channels)
222
+ self.norm2 = nn.LayerNorm(num_channels)
223
+ self.norm3 = nn.LayerNorm(num_channels)
224
+
225
+ def forward(self, x: torch.Tensor, context: torch.Tensor | None = None) -> torch.Tensor:
226
+ # 1. Self-Attention
227
+ x = self.attn1(self.norm1(x)) + x
228
+
229
+ # 2. Cross-Attention
230
+ x = self.attn2(self.norm2(x), context=context) + x
231
+
232
+ # 3. Feed-forward
233
+ x = self.ff(self.norm3(x)) + x
234
+ return x
235
+
236
+
237
+ class SpatialTransformer(nn.Module):
238
+ """
239
+ Transformer block for image-like data. First, project the input (aka embedding) and reshape to b, t, d. Then apply
240
+ standard transformer action. Finally, reshape to image.
241
+
242
+ Args:
243
+ spatial_dims: number of spatial dimensions.
244
+ in_channels: number of channels in the input and output.
245
+ num_attention_heads: number of heads to use for multi-head attention.
246
+ num_head_channels: number of channels in each attention head.
247
+ num_layers: number of layers of Transformer blocks to use.
248
+ dropout: dropout probability to use.
249
+ norm_num_groups: number of groups for the normalization.
250
+ norm_eps: epsilon for the normalization.
251
+ cross_attention_dim: number of context dimensions to use.
252
+ upcast_attention: if True, upcast attention operations to full precision.
253
+ use_flash_attention: if True, use flash attention for a memory efficient attention mechanism.
254
+ """
255
+
256
+ def __init__(
257
+ self,
258
+ spatial_dims: int,
259
+ in_channels: int,
260
+ num_attention_heads: int,
261
+ num_head_channels: int,
262
+ num_layers: int = 1,
263
+ dropout: float = 0.0,
264
+ norm_num_groups: int = 32,
265
+ norm_eps: float = 1e-6,
266
+ cross_attention_dim: int | None = None,
267
+ upcast_attention: bool = False,
268
+ use_flash_attention: bool = False,
269
+ ) -> None:
270
+ super().__init__()
271
+ self.spatial_dims = spatial_dims
272
+ self.in_channels = in_channels
273
+ inner_dim = num_attention_heads * num_head_channels
274
+
275
+ self.norm = nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=norm_eps, affine=True)
276
+
277
+ self.proj_in = Convolution(
278
+ spatial_dims=spatial_dims,
279
+ in_channels=in_channels,
280
+ out_channels=inner_dim,
281
+ strides=1,
282
+ kernel_size=1,
283
+ padding=0,
284
+ conv_only=True,
285
+ )
286
+
287
+ self.transformer_blocks = nn.ModuleList(
288
+ [
289
+ BasicTransformerBlock(
290
+ num_channels=inner_dim,
291
+ num_attention_heads=num_attention_heads,
292
+ num_head_channels=num_head_channels,
293
+ dropout=dropout,
294
+ cross_attention_dim=cross_attention_dim,
295
+ upcast_attention=upcast_attention,
296
+ use_flash_attention=use_flash_attention,
297
+ )
298
+ for _ in range(num_layers)
299
+ ]
300
+ )
301
+
302
+ self.proj_out = zero_module(
303
+ Convolution(
304
+ spatial_dims=spatial_dims,
305
+ in_channels=inner_dim,
306
+ out_channels=in_channels,
307
+ strides=1,
308
+ kernel_size=1,
309
+ padding=0,
310
+ conv_only=True,
311
+ )
312
+ )
313
+
314
+ def forward(self, x: torch.Tensor, context: torch.Tensor | None = None) -> torch.Tensor:
315
+ # note: if no context is given, cross-attention defaults to self-attention
316
+ batch = channel = height = width = depth = -1
317
+ if self.spatial_dims == 2:
318
+ batch, channel, height, width = x.shape
319
+ if self.spatial_dims == 3:
320
+ batch, channel, height, width, depth = x.shape
321
+
322
+ residual = x
323
+ x = self.norm(x)
324
+ x = self.proj_in(x)
325
+
326
+ inner_dim = x.shape[1]
327
+
328
+ if self.spatial_dims == 2:
329
+ x = x.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
330
+ if self.spatial_dims == 3:
331
+ x = x.permute(0, 2, 3, 4, 1).reshape(batch, height * width * depth, inner_dim)
332
+
333
+ for block in self.transformer_blocks:
334
+ x = block(x, context=context)
335
+
336
+ if self.spatial_dims == 2:
337
+ x = x.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
338
+ if self.spatial_dims == 3:
339
+ x = x.reshape(batch, height, width, depth, inner_dim).permute(0, 4, 1, 2, 3).contiguous()
340
+
341
+ x = self.proj_out(x)
342
+ return x + residual
343
+
344
+
345
+ class AttentionBlock(nn.Module):
346
+ """
347
+ An attention block that allows spatial positions to attend to each other. Uses three q, k, v linear layers to
348
+ compute attention.
349
+
350
+ Args:
351
+ spatial_dims: number of spatial dimensions.
352
+ num_channels: number of input channels.
353
+ num_head_channels: number of channels in each attention head.
354
+ norm_num_groups: number of groups involved for the group normalisation layer. Ensure that your number of
355
+ channels is divisible by this number.
356
+ norm_eps: epsilon value to use for the normalisation.
357
+ use_flash_attention: if True, use flash attention for a memory efficient attention mechanism.
358
+ """
359
+
360
+ def __init__(
361
+ self,
362
+ spatial_dims: int,
363
+ num_channels: int,
364
+ num_head_channels: int | None = None,
365
+ norm_num_groups: int = 32,
366
+ norm_eps: float = 1e-6,
367
+ use_flash_attention: bool = False,
368
+ ) -> None:
369
+ super().__init__()
370
+ self.use_flash_attention = use_flash_attention
371
+ self.spatial_dims = spatial_dims
372
+ self.num_channels = num_channels
373
+
374
+ self.num_heads = num_channels // num_head_channels if num_head_channels is not None else 1
375
+ self.scale = 1 / math.sqrt(num_channels / self.num_heads)
376
+
377
+ self.norm = nn.GroupNorm(num_groups=norm_num_groups, num_channels=num_channels, eps=norm_eps, affine=True)
378
+
379
+ self.to_q = nn.Linear(num_channels, num_channels)
380
+ self.to_k = nn.Linear(num_channels, num_channels)
381
+ self.to_v = nn.Linear(num_channels, num_channels)
382
+
383
+ self.proj_attn = nn.Linear(num_channels, num_channels)
384
+
385
+ def reshape_heads_to_batch_dim(self, x: torch.Tensor) -> torch.Tensor:
386
+ batch_size, seq_len, dim = x.shape
387
+ x = x.reshape(batch_size, seq_len, self.num_heads, dim // self.num_heads)
388
+ x = x.permute(0, 2, 1, 3).reshape(batch_size * self.num_heads, seq_len, dim // self.num_heads)
389
+ return x
390
+
391
+ def reshape_batch_dim_to_heads(self, x: torch.Tensor) -> torch.Tensor:
392
+ batch_size, seq_len, dim = x.shape
393
+ x = x.reshape(batch_size // self.num_heads, self.num_heads, seq_len, dim)
394
+ x = x.permute(0, 2, 1, 3).reshape(batch_size // self.num_heads, seq_len, dim * self.num_heads)
395
+ return x
396
+
397
+ def _memory_efficient_attention_xformers(
398
+ self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
399
+ ) -> torch.Tensor:
400
+ query = query.contiguous()
401
+ key = key.contiguous()
402
+ value = value.contiguous()
403
+ x = xformers.ops.memory_efficient_attention(query, key, value, attn_bias=None)
404
+ return x
405
+
406
+ def _attention(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor:
407
+ attention_scores = torch.baddbmm(
408
+ torch.empty(query.shape[0], query.shape[1], key.shape[1], dtype=query.dtype, device=query.device),
409
+ query,
410
+ key.transpose(-1, -2),
411
+ beta=0,
412
+ alpha=self.scale,
413
+ )
414
+ attention_probs = attention_scores.softmax(dim=-1)
415
+ x = torch.bmm(attention_probs, value)
416
+ return x
417
+
418
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
419
+ residual = x
420
+
421
+ batch = channel = height = width = depth = -1
422
+ if self.spatial_dims == 2:
423
+ batch, channel, height, width = x.shape
424
+ if self.spatial_dims == 3:
425
+ batch, channel, height, width, depth = x.shape
426
+
427
+ # norm
428
+ x = self.norm(x)
429
+
430
+ if self.spatial_dims == 2:
431
+ x = x.view(batch, channel, height * width).transpose(1, 2)
432
+ if self.spatial_dims == 3:
433
+ x = x.view(batch, channel, height * width * depth).transpose(1, 2)
434
+
435
+ # proj to q, k, v
436
+ query = self.to_q(x)
437
+ key = self.to_k(x)
438
+ value = self.to_v(x)
439
+
440
+ # Multi-Head Attention
441
+ query = self.reshape_heads_to_batch_dim(query)
442
+ key = self.reshape_heads_to_batch_dim(key)
443
+ value = self.reshape_heads_to_batch_dim(value)
444
+
445
+ if self.use_flash_attention:
446
+ x = self._memory_efficient_attention_xformers(query, key, value)
447
+ else:
448
+ x = self._attention(query, key, value)
449
+
450
+ x = self.reshape_batch_dim_to_heads(x)
451
+ x = x.to(query.dtype)
452
+
453
+ if self.spatial_dims == 2:
454
+ x = x.transpose(-1, -2).reshape(batch, channel, height, width)
455
+ if self.spatial_dims == 3:
456
+ x = x.transpose(-1, -2).reshape(batch, channel, height, width, depth)
457
+
458
+ return x + residual
459
+
460
+
461
+ def get_timestep_embedding(timesteps: torch.Tensor, embedding_dim: int, max_period: int = 10000) -> torch.Tensor:
462
+ """
463
+ Create sinusoidal timestep embeddings following the implementation in Ho et al. "Denoising Diffusion Probabilistic
464
+ Models" https://arxiv.org/abs/2006.11239.
465
+
466
+ Args:
467
+ timesteps: a 1-D Tensor of N indices, one per batch element.
468
+ embedding_dim: the dimension of the output.
469
+ max_period: controls the minimum frequency of the embeddings.
470
+ """
471
+ if timesteps.ndim != 1:
472
+ raise ValueError("Timesteps should be a 1d-array")
473
+
474
+ half_dim = embedding_dim // 2
475
+ exponent = -math.log(max_period) * torch.arange(start=0, end=half_dim, dtype=torch.float32, device=timesteps.device)
476
+ freqs = torch.exp(exponent / half_dim)
477
+
478
+ args = timesteps[:, None].float() * freqs[None, :]
479
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
480
+
481
+ # zero pad
482
+ if embedding_dim % 2 == 1:
483
+ embedding = torch.nn.functional.pad(embedding, (0, 1, 0, 0))
484
+
485
+ return embedding
486
+
487
+
488
+ class Downsample(nn.Module):
489
+ """
490
+ Downsampling layer.
491
+
492
+ Args:
493
+ spatial_dims: number of spatial dimensions.
494
+ num_channels: number of input channels.
495
+ use_conv: if True uses Convolution instead of Pool average to perform downsampling. In case that use_conv is
496
+ False, the number of output channels must be the same as the number of input channels.
497
+ out_channels: number of output channels.
498
+ padding: controls the amount of implicit zero-paddings on both sides for padding number of points
499
+ for each dimension.
500
+ """
501
+
502
+ def __init__(
503
+ self, spatial_dims: int, num_channels: int, use_conv: bool, out_channels: int | None = None, padding: int = 1
504
+ ) -> None:
505
+ super().__init__()
506
+ self.num_channels = num_channels
507
+ self.out_channels = out_channels or num_channels
508
+ self.use_conv = use_conv
509
+ if use_conv:
510
+ self.op = Convolution(
511
+ spatial_dims=spatial_dims,
512
+ in_channels=self.num_channels,
513
+ out_channels=self.out_channels,
514
+ strides=2,
515
+ kernel_size=3,
516
+ padding=padding,
517
+ conv_only=True,
518
+ )
519
+ else:
520
+ if self.num_channels != self.out_channels:
521
+ raise ValueError("num_channels and out_channels must be equal when use_conv=False")
522
+ self.op = Pool[Pool.AVG, spatial_dims](kernel_size=2, stride=2)
523
+
524
+ def forward(self, x: torch.Tensor, emb: torch.Tensor | None = None) -> torch.Tensor:
525
+ del emb
526
+ if x.shape[1] != self.num_channels:
527
+ raise ValueError(
528
+ f"Input number of channels ({x.shape[1]}) is not equal to expected number of channels "
529
+ f"({self.num_channels})"
530
+ )
531
+ return self.op(x)
532
+
533
+
534
+ class Upsample(nn.Module):
535
+ """
536
+ Upsampling layer with an optional convolution.
537
+
538
+ Args:
539
+ spatial_dims: number of spatial dimensions.
540
+ num_channels: number of input channels.
541
+ use_conv: if True uses Convolution instead of Pool average to perform downsampling.
542
+ out_channels: number of output channels.
543
+ padding: controls the amount of implicit zero-paddings on both sides for padding number of points for each
544
+ dimension.
545
+ """
546
+
547
+ def __init__(
548
+ self, spatial_dims: int, num_channels: int, use_conv: bool, out_channels: int | None = None, padding: int = 1
549
+ ) -> None:
550
+ super().__init__()
551
+ self.num_channels = num_channels
552
+ self.out_channels = out_channels or num_channels
553
+ self.use_conv = use_conv
554
+ if use_conv:
555
+ self.conv = Convolution(
556
+ spatial_dims=spatial_dims,
557
+ in_channels=self.num_channels,
558
+ out_channels=self.out_channels,
559
+ strides=1,
560
+ kernel_size=3,
561
+ padding=padding,
562
+ conv_only=True,
563
+ )
564
+ else:
565
+ self.conv = None
566
+
567
+ def forward(self, x: torch.Tensor, emb: torch.Tensor | None = None) -> torch.Tensor:
568
+ del emb
569
+ if x.shape[1] != self.num_channels:
570
+ raise ValueError("Input channels should be equal to num_channels")
571
+
572
+ # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16
573
+ # https://github.com/pytorch/pytorch/issues/86679
574
+ dtype = x.dtype
575
+ if dtype == torch.bfloat16:
576
+ x = x.to(torch.float32)
577
+
578
+ x = F.interpolate(x, scale_factor=2.0, mode="nearest")
579
+
580
+ # If the input is bfloat16, we cast back to bfloat16
581
+ if dtype == torch.bfloat16:
582
+ x = x.to(dtype)
583
+
584
+ if self.use_conv:
585
+ x = self.conv(x)
586
+ return x
587
+
588
+
589
+ class ResnetBlock(nn.Module):
590
+ """
591
+ Residual block with timestep conditioning.
592
+
593
+ Args:
594
+ spatial_dims: The number of spatial dimensions.
595
+ in_channels: number of input channels.
596
+ temb_channels: number of timestep embedding channels.
597
+ out_channels: number of output channels.
598
+ up: if True, performs upsampling.
599
+ down: if True, performs downsampling.
600
+ norm_num_groups: number of groups for the group normalization.
601
+ norm_eps: epsilon for the group normalization.
602
+ """
603
+
604
+ def __init__(
605
+ self,
606
+ spatial_dims: int,
607
+ in_channels: int,
608
+ temb_channels: int,
609
+ out_channels: int | None = None,
610
+ up: bool = False,
611
+ down: bool = False,
612
+ norm_num_groups: int = 32,
613
+ norm_eps: float = 1e-6,
614
+ ) -> None:
615
+ super().__init__()
616
+ self.spatial_dims = spatial_dims
617
+ self.channels = in_channels
618
+ self.emb_channels = temb_channels
619
+ self.out_channels = out_channels or in_channels
620
+ self.up = up
621
+ self.down = down
622
+
623
+ self.norm1 = nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=norm_eps, affine=True)
624
+ self.nonlinearity = nn.SiLU()
625
+ self.conv1 = Convolution(
626
+ spatial_dims=spatial_dims,
627
+ in_channels=in_channels,
628
+ out_channels=self.out_channels,
629
+ strides=1,
630
+ kernel_size=3,
631
+ padding=1,
632
+ conv_only=True,
633
+ dilation=3
634
+ )
635
+
636
+ self.upsample = self.downsample = None
637
+ if self.up:
638
+ self.upsample = Upsample(spatial_dims, in_channels, use_conv=False)
639
+ elif down:
640
+ self.downsample = Downsample(spatial_dims, in_channels, use_conv=False)
641
+
642
+ self.time_emb_proj = nn.Linear(temb_channels, self.out_channels)
643
+
644
+ self.norm2 = nn.GroupNorm(num_groups=norm_num_groups, num_channels=self.out_channels, eps=norm_eps, affine=True)
645
+ self.conv2 = zero_module(
646
+ Convolution(
647
+ spatial_dims=spatial_dims,
648
+ in_channels=self.out_channels,
649
+ out_channels=self.out_channels,
650
+ strides=1,
651
+ kernel_size=3,
652
+ padding=1,
653
+ conv_only=True,
654
+ dilation=2
655
+ )
656
+ )
657
+
658
+ if self.out_channels == in_channels:
659
+ self.skip_connection = nn.Identity()
660
+ else:
661
+ self.skip_connection = Convolution(
662
+ spatial_dims=spatial_dims,
663
+ in_channels=in_channels,
664
+ out_channels=self.out_channels,
665
+ strides=1,
666
+ kernel_size=1,
667
+ padding=0,
668
+ conv_only=True,
669
+ dilation=1
670
+ )
671
+
672
+ def forward(self, x: torch.Tensor, emb: torch.Tensor) -> torch.Tensor:
673
+ h = x
674
+ h = self.norm1(h)
675
+ h = self.nonlinearity(h)
676
+
677
+ if self.upsample is not None:
678
+ if h.shape[0] >= 64:
679
+ x = x.contiguous()
680
+ h = h.contiguous()
681
+ x = self.upsample(x)
682
+ h = self.upsample(h)
683
+ elif self.downsample is not None:
684
+ x = self.downsample(x)
685
+ h = self.downsample(h)
686
+
687
+ h = self.conv1(h)
688
+
689
+ if self.spatial_dims == 2:
690
+ temb = self.time_emb_proj(self.nonlinearity(emb))[:, :, None, None]
691
+ else:
692
+ temb = self.time_emb_proj(self.nonlinearity(emb))[:, :, None, None, None]
693
+ h = h + temb
694
+
695
+ h = self.norm2(h)
696
+ h = self.nonlinearity(h)
697
+ h = self.conv2(h)
698
+
699
+ return self.skip_connection(x) + h
700
+
701
+
702
+ class DownBlock(nn.Module):
703
+ """
704
+ Unet's down block containing resnet and downsamplers blocks.
705
+
706
+ Args:
707
+ spatial_dims: The number of spatial dimensions.
708
+ in_channels: number of input channels.
709
+ out_channels: number of output channels.
710
+ temb_channels: number of timestep embedding channels.
711
+ num_res_blocks: number of residual blocks.
712
+ norm_num_groups: number of groups for the group normalization.
713
+ norm_eps: epsilon for the group normalization.
714
+ add_downsample: if True add downsample block.
715
+ resblock_updown: if True use residual blocks for downsampling.
716
+ downsample_padding: padding used in the downsampling block.
717
+ """
718
+
719
+ def __init__(
720
+ self,
721
+ spatial_dims: int,
722
+ in_channels: int,
723
+ out_channels: int,
724
+ temb_channels: int,
725
+ num_res_blocks: int = 1,
726
+ norm_num_groups: int = 32,
727
+ norm_eps: float = 1e-6,
728
+ add_downsample: bool = True,
729
+ resblock_updown: bool = False,
730
+ downsample_padding: int = 1,
731
+ ) -> None:
732
+ super().__init__()
733
+ self.resblock_updown = resblock_updown
734
+
735
+ resnets = []
736
+
737
+ for i in range(num_res_blocks):
738
+ in_channels = in_channels if i == 0 else out_channels
739
+ resnets.append(
740
+ ResnetBlock(
741
+ spatial_dims=spatial_dims,
742
+ in_channels=in_channels,
743
+ out_channels=out_channels,
744
+ temb_channels=temb_channels,
745
+ norm_num_groups=norm_num_groups,
746
+ norm_eps=norm_eps,
747
+ )
748
+ )
749
+
750
+ self.resnets = nn.ModuleList(resnets)
751
+
752
+ if add_downsample:
753
+ if resblock_updown:
754
+ self.downsampler = ResnetBlock(
755
+ spatial_dims=spatial_dims,
756
+ in_channels=out_channels,
757
+ out_channels=out_channels,
758
+ temb_channels=temb_channels,
759
+ norm_num_groups=norm_num_groups,
760
+ norm_eps=norm_eps,
761
+ down=True,
762
+ )
763
+ else:
764
+ self.downsampler = Downsample(
765
+ spatial_dims=spatial_dims,
766
+ num_channels=out_channels,
767
+ use_conv=True,
768
+ out_channels=out_channels,
769
+ padding=downsample_padding,
770
+ )
771
+ else:
772
+ self.downsampler = None
773
+
774
+ def forward(
775
+ self, hidden_states: torch.Tensor, temb: torch.Tensor, context: torch.Tensor | None = None
776
+ ) -> tuple[torch.Tensor, list[torch.Tensor]]:
777
+ del context
778
+ output_states = []
779
+
780
+ for resnet in self.resnets:
781
+ hidden_states = resnet(hidden_states, temb)
782
+ output_states.append(hidden_states)
783
+
784
+ if self.downsampler is not None:
785
+ hidden_states = self.downsampler(hidden_states, temb)
786
+ output_states.append(hidden_states)
787
+
788
+ return hidden_states, output_states
789
+
790
+
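A usage sketch of DownBlock with illustrative (assumed) sizes; it returns both the downsampled features and the per-resnet outputs that later feed the up path as skip connections:

import torch

block = DownBlock(spatial_dims=2, in_channels=64, out_channels=128,
                  temb_channels=256, num_res_blocks=2)
x = torch.randn(1, 64, 32, 32)
temb = torch.randn(1, 256)
h, skips = block(x, temb)
# h is halved in resolution by the final Downsample; skips holds the two resnet
# outputs plus the downsampled tensor, consumed later by the matching up block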
791
+ class AttnDownBlock(nn.Module):
792
+ """
793
+ Unet's down block containing resnet, downsamplers and self-attention blocks.
794
+
795
+ Args:
796
+ spatial_dims: The number of spatial dimensions.
797
+ in_channels: number of input channels.
798
+ out_channels: number of output channels.
799
+ temb_channels: number of timestep embedding channels.
800
+ num_res_blocks: number of residual blocks.
801
+ norm_num_groups: number of groups for the group normalization.
802
+ norm_eps: epsilon for the group normalization.
803
+ add_downsample: if True add downsample block.
804
+ resblock_updown: if True use residual blocks for downsampling.
805
+ downsample_padding: padding used in the downsampling block.
806
+ num_head_channels: number of channels in each attention head.
807
+ use_flash_attention: if True, use flash attention for a memory efficient attention mechanism.
808
+ """
809
+
810
+ def __init__(
811
+ self,
812
+ spatial_dims: int,
813
+ in_channels: int,
814
+ out_channels: int,
815
+ temb_channels: int,
816
+ num_res_blocks: int = 1,
817
+ norm_num_groups: int = 32,
818
+ norm_eps: float = 1e-6,
819
+ add_downsample: bool = True,
820
+ resblock_updown: bool = False,
821
+ downsample_padding: int = 1,
822
+ num_head_channels: int = 1,
823
+ use_flash_attention: bool = False,
824
+ ) -> None:
825
+ super().__init__()
826
+ self.resblock_updown = resblock_updown
827
+
828
+ resnets = []
829
+ attentions = []
830
+
831
+ for i in range(num_res_blocks):
832
+ in_channels = in_channels if i == 0 else out_channels
833
+ resnets.append(
834
+ ResnetBlock(
835
+ spatial_dims=spatial_dims,
836
+ in_channels=in_channels,
837
+ out_channels=out_channels,
838
+ temb_channels=temb_channels,
839
+ norm_num_groups=norm_num_groups,
840
+ norm_eps=norm_eps,
841
+ )
842
+ )
843
+ attentions.append(
844
+ AttentionBlock(
845
+ spatial_dims=spatial_dims,
846
+ num_channels=out_channels,
847
+ num_head_channels=num_head_channels,
848
+ norm_num_groups=norm_num_groups,
849
+ norm_eps=norm_eps,
850
+ use_flash_attention=use_flash_attention,
851
+ )
852
+ )
853
+
854
+ self.attentions = nn.ModuleList(attentions)
855
+ self.resnets = nn.ModuleList(resnets)
856
+
857
+ if add_downsample:
858
+ if resblock_updown:
859
+ self.downsampler = ResnetBlock(
860
+ spatial_dims=spatial_dims,
861
+ in_channels=out_channels,
862
+ out_channels=out_channels,
863
+ temb_channels=temb_channels,
864
+ norm_num_groups=norm_num_groups,
865
+ norm_eps=norm_eps,
866
+ down=True,
867
+ )
868
+ else:
869
+ self.downsampler = Downsample(
870
+ spatial_dims=spatial_dims,
871
+ num_channels=out_channels,
872
+ use_conv=True,
873
+ out_channels=out_channels,
874
+ padding=downsample_padding,
875
+ )
876
+ else:
877
+ self.downsampler = None
878
+
879
+ def forward(
880
+ self, hidden_states: torch.Tensor, temb: torch.Tensor, context: torch.Tensor | None = None
881
+ ) -> tuple[torch.Tensor, list[torch.Tensor]]:
882
+ del context
883
+ output_states = []
884
+
885
+ for resnet, attn in zip(self.resnets, self.attentions):
886
+ hidden_states = resnet(hidden_states, temb)
887
+ hidden_states = attn(hidden_states)
888
+ output_states.append(hidden_states)
889
+
890
+ if self.downsampler is not None:
891
+ hidden_states = self.downsampler(hidden_states, temb)
892
+ output_states.append(hidden_states)
893
+
894
+ return hidden_states, output_states
895
+
896
+
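AttnDownBlock is used the same way; the extra num_head_channels argument must divide out_channels (the values below are illustrative assumptions):

import torch

block = AttnDownBlock(spatial_dims=2, in_channels=64, out_channels=64,
                      temb_channels=256, num_head_channels=32)
x = torch.randn(1, 64, 32, 32)
temb = torch.randn(1, 256)
h, skips = block(x, temb)   # each resnet output is passed through self-attention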
897
+ class CrossAttnDownBlock(nn.Module):
898
+ """
899
+ Unet's down block containing resnet, downsamplers and cross-attention blocks.
900
+
901
+ Args:
902
+ spatial_dims: number of spatial dimensions.
903
+ in_channels: number of input channels.
904
+ out_channels: number of output channels.
905
+ temb_channels: number of timestep embedding channels.
906
+ num_res_blocks: number of residual blocks.
907
+ norm_num_groups: number of groups for the group normalization.
908
+ norm_eps: epsilon for the group normalization.
909
+ add_downsample: if True add downsample block.
910
+ resblock_updown: if True use residual blocks for downsampling.
911
+ downsample_padding: padding used in the downsampling block.
912
+ num_head_channels: number of channels in each attention head.
913
+ transformer_num_layers: number of layers of Transformer blocks to use.
914
+ cross_attention_dim: number of context dimensions to use.
915
+ upcast_attention: if True, upcast attention operations to full precision.
916
+ use_flash_attention: if True, use flash attention for a memory efficient attention mechanism.
917
+ """
918
+
919
+ def __init__(
920
+ self,
921
+ spatial_dims: int,
922
+ in_channels: int,
923
+ out_channels: int,
924
+ temb_channels: int,
925
+ num_res_blocks: int = 1,
926
+ norm_num_groups: int = 32,
927
+ norm_eps: float = 1e-6,
928
+ add_downsample: bool = True,
929
+ resblock_updown: bool = False,
930
+ downsample_padding: int = 1,
931
+ num_head_channels: int = 1,
932
+ transformer_num_layers: int = 1,
933
+ cross_attention_dim: int | None = None,
934
+ upcast_attention: bool = False,
935
+ use_flash_attention: bool = False,
936
+ ) -> None:
937
+ super().__init__()
938
+ self.resblock_updown = resblock_updown
939
+
940
+ resnets = []
941
+ attentions = []
942
+
943
+ for i in range(num_res_blocks):
944
+ in_channels = in_channels if i == 0 else out_channels
945
+ resnets.append(
946
+ ResnetBlock(
947
+ spatial_dims=spatial_dims,
948
+ in_channels=in_channels,
949
+ out_channels=out_channels,
950
+ temb_channels=temb_channels,
951
+ norm_num_groups=norm_num_groups,
952
+ norm_eps=norm_eps,
953
+ )
954
+ )
955
+
956
+ attentions.append(
957
+ SpatialTransformer(
958
+ spatial_dims=spatial_dims,
959
+ in_channels=out_channels,
960
+ num_attention_heads=out_channels // num_head_channels,
961
+ num_head_channels=num_head_channels,
962
+ num_layers=transformer_num_layers,
963
+ norm_num_groups=norm_num_groups,
964
+ norm_eps=norm_eps,
965
+ cross_attention_dim=cross_attention_dim,
966
+ upcast_attention=upcast_attention,
967
+ use_flash_attention=use_flash_attention,
968
+ )
969
+ )
970
+
971
+ self.attentions = nn.ModuleList(attentions)
972
+ self.resnets = nn.ModuleList(resnets)
973
+
974
+ if add_downsample:
975
+ if resblock_updown:
976
+ self.downsampler = ResnetBlock(
977
+ spatial_dims=spatial_dims,
978
+ in_channels=out_channels,
979
+ out_channels=out_channels,
980
+ temb_channels=temb_channels,
981
+ norm_num_groups=norm_num_groups,
982
+ norm_eps=norm_eps,
983
+ down=True,
984
+ )
985
+ else:
986
+ self.downsampler = Downsample(
987
+ spatial_dims=spatial_dims,
988
+ num_channels=out_channels,
989
+ use_conv=True,
990
+ out_channels=out_channels,
991
+ padding=downsample_padding,
992
+ )
993
+ else:
994
+ self.downsampler = None
995
+
996
+ def forward(
997
+ self, hidden_states: torch.Tensor, temb: torch.Tensor, context: torch.Tensor | None = None
998
+ ) -> tuple[torch.Tensor, list[torch.Tensor]]:
999
+ output_states = []
1000
+
1001
+ for resnet, attn in zip(self.resnets, self.attentions):
1002
+ hidden_states = resnet(hidden_states, temb)
1003
+ hidden_states = attn(hidden_states, context=context)
1004
+ output_states.append(hidden_states)
1005
+
1006
+ if self.downsampler is not None:
1007
+ hidden_states = self.downsampler(hidden_states, temb)
1008
+ output_states.append(hidden_states)
1009
+
1010
+ return hidden_states, output_states
1011
+
1012
+
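A sketch of the cross-attention variant; the context tensor follows the (N, sequence_length, cross_attention_dim) convention of SpatialTransformer, and all dimensions below are illustrative assumptions:

import torch

block = CrossAttnDownBlock(spatial_dims=2, in_channels=64, out_channels=128,
                           temb_channels=256, num_head_channels=32,
                           cross_attention_dim=512)
x = torch.randn(1, 64, 32, 32)
temb = torch.randn(1, 256)
context = torch.randn(1, 1, 512)        # one conditioning token per sample
h, skips = block(x, temb, context=context)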
1013
+ class AttnMidBlock(nn.Module):
1014
+ """
1015
+ Unet's mid block containing resnet and self-attention blocks.
1016
+
1017
+ Args:
1018
+ spatial_dims: The number of spatial dimensions.
1019
+ in_channels: number of input channels.
1020
+ temb_channels: number of timestep embedding channels.
1021
+ norm_num_groups: number of groups for the group normalization.
1022
+ norm_eps: epsilon for the group normalization.
1023
+ num_head_channels: number of channels in each attention head.
1024
+ use_flash_attention: if True, use flash attention for a memory efficient attention mechanism.
1025
+ """
1026
+
1027
+ def __init__(
1028
+ self,
1029
+ spatial_dims: int,
1030
+ in_channels: int,
1031
+ temb_channels: int,
1032
+ norm_num_groups: int = 32,
1033
+ norm_eps: float = 1e-6,
1034
+ num_head_channels: int = 1,
1035
+ use_flash_attention: bool = False,
1036
+ ) -> None:
1037
+ super().__init__()
1038
+ self.attention = None
1039
+
1040
+ self.resnet_1 = ResnetBlock(
1041
+ spatial_dims=spatial_dims,
1042
+ in_channels=in_channels,
1043
+ out_channels=in_channels,
1044
+ temb_channels=temb_channels,
1045
+ norm_num_groups=norm_num_groups,
1046
+ norm_eps=norm_eps,
1047
+ )
1048
+ self.attention = AttentionBlock(
1049
+ spatial_dims=spatial_dims,
1050
+ num_channels=in_channels,
1051
+ num_head_channels=num_head_channels,
1052
+ norm_num_groups=norm_num_groups,
1053
+ norm_eps=norm_eps,
1054
+ use_flash_attention=use_flash_attention,
1055
+ )
1056
+
1057
+ self.resnet_2 = ResnetBlock(
1058
+ spatial_dims=spatial_dims,
1059
+ in_channels=in_channels,
1060
+ out_channels=in_channels,
1061
+ temb_channels=temb_channels,
1062
+ norm_num_groups=norm_num_groups,
1063
+ norm_eps=norm_eps,
1064
+ )
1065
+
1066
+ def forward(
1067
+ self, hidden_states: torch.Tensor, temb: torch.Tensor, context: torch.Tensor | None = None
1068
+ ) -> torch.Tensor:
1069
+ del context
1070
+ hidden_states = self.resnet_1(hidden_states, temb)
1071
+ hidden_states = self.attention(hidden_states)
1072
+ hidden_states = self.resnet_2(hidden_states, temb)
1073
+
1074
+ return hidden_states
1075
+
1076
+
1077
+ class CrossAttnMidBlock(nn.Module):
1078
+ """
1079
+ Unet's mid block containing resnet and cross-attention blocks.
1080
+
1081
+ Args:
1082
+ spatial_dims: The number of spatial dimensions.
1083
+ in_channels: number of input channels.
1084
+ temb_channels: number of timestep embedding channels
1085
+ norm_num_groups: number of groups for the group normalization.
1086
+ norm_eps: epsilon for the group normalization.
1087
+ num_head_channels: number of channels in each attention head.
1088
+ transformer_num_layers: number of layers of Transformer blocks to use.
1089
+ cross_attention_dim: number of context dimensions to use.
1090
+ upcast_attention: if True, upcast attention operations to full precision.
1091
+ use_flash_attention: if True, use flash attention for a memory efficient attention mechanism.
1092
+ """
1093
+
1094
+ def __init__(
1095
+ self,
1096
+ spatial_dims: int,
1097
+ in_channels: int,
1098
+ temb_channels: int,
1099
+ norm_num_groups: int = 32,
1100
+ norm_eps: float = 1e-6,
1101
+ num_head_channels: int = 1,
1102
+ transformer_num_layers: int = 1,
1103
+ cross_attention_dim: int | None = None,
1104
+ upcast_attention: bool = False,
1105
+ use_flash_attention: bool = False,
1106
+ ) -> None:
1107
+ super().__init__()
1108
+ self.attention = None
1109
+
1110
+ self.resnet_1 = ResnetBlock(
1111
+ spatial_dims=spatial_dims,
1112
+ in_channels=in_channels,
1113
+ out_channels=in_channels,
1114
+ temb_channels=temb_channels,
1115
+ norm_num_groups=norm_num_groups,
1116
+ norm_eps=norm_eps,
1117
+ )
1118
+ self.attention = SpatialTransformer(
1119
+ spatial_dims=spatial_dims,
1120
+ in_channels=in_channels,
1121
+ num_attention_heads=in_channels // num_head_channels,
1122
+ num_head_channels=num_head_channels,
1123
+ num_layers=transformer_num_layers,
1124
+ norm_num_groups=norm_num_groups,
1125
+ norm_eps=norm_eps,
1126
+ cross_attention_dim=cross_attention_dim,
1127
+ upcast_attention=upcast_attention,
1128
+ use_flash_attention=use_flash_attention,
1129
+ )
1130
+ self.resnet_2 = ResnetBlock(
1131
+ spatial_dims=spatial_dims,
1132
+ in_channels=in_channels,
1133
+ out_channels=in_channels,
1134
+ temb_channels=temb_channels,
1135
+ norm_num_groups=norm_num_groups,
1136
+ norm_eps=norm_eps,
1137
+ )
1138
+
1139
+ def forward(
1140
+ self, hidden_states: torch.Tensor, temb: torch.Tensor, context: torch.Tensor | None = None
1141
+ ) -> torch.Tensor:
1142
+ hidden_states = self.resnet_1(hidden_states, temb)
1143
+ hidden_states = self.attention(hidden_states, context=context)
1144
+ hidden_states = self.resnet_2(hidden_states, temb)
1145
+
1146
+ return hidden_states
1147
+
1148
+
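The mid blocks keep the channel count and resolution fixed (resnet, attention, resnet); a sketch of the cross-attention variant with illustrative sizes:

import torch

mid = CrossAttnMidBlock(spatial_dims=2, in_channels=128, temb_channels=256,
                        num_head_channels=32, cross_attention_dim=512)
h = torch.randn(1, 128, 16, 16)
temb = torch.randn(1, 256)
context = torch.randn(1, 1, 512)
out = mid(h, temb, context=context)     # same shape as h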
1149
+ class UpBlock(nn.Module):
1150
+ """
1151
+    Unet's up block containing resnet and upsampler blocks.
1152
+
1153
+ Args:
1154
+ spatial_dims: The number of spatial dimensions.
1155
+ in_channels: number of input channels.
1156
+ prev_output_channel: number of channels from residual connection.
1157
+ out_channels: number of output channels.
1158
+ temb_channels: number of timestep embedding channels.
1159
+ num_res_blocks: number of residual blocks.
1160
+ norm_num_groups: number of groups for the group normalization.
1161
+ norm_eps: epsilon for the group normalization.
1162
+        add_upsample: if True add upsample block.
1163
+ resblock_updown: if True use residual blocks for upsampling.
1164
+ """
1165
+
1166
+ def __init__(
1167
+ self,
1168
+ spatial_dims: int,
1169
+ in_channels: int,
1170
+ prev_output_channel: int,
1171
+ out_channels: int,
1172
+ temb_channels: int,
1173
+ num_res_blocks: int = 1,
1174
+ norm_num_groups: int = 32,
1175
+ norm_eps: float = 1e-6,
1176
+ add_upsample: bool = True,
1177
+ resblock_updown: bool = False,
1178
+ ) -> None:
1179
+ super().__init__()
1180
+ self.resblock_updown = resblock_updown
1181
+ resnets = []
1182
+
1183
+ for i in range(num_res_blocks):
1184
+ res_skip_channels = in_channels if (i == num_res_blocks - 1) else out_channels
1185
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
1186
+
1187
+ resnets.append(
1188
+ ResnetBlock(
1189
+ spatial_dims=spatial_dims,
1190
+ in_channels=resnet_in_channels + res_skip_channels,
1191
+ out_channels=out_channels,
1192
+ temb_channels=temb_channels,
1193
+ norm_num_groups=norm_num_groups,
1194
+ norm_eps=norm_eps,
1195
+ )
1196
+ )
1197
+
1198
+ self.resnets = nn.ModuleList(resnets)
1199
+
1200
+ if add_upsample:
1201
+ if resblock_updown:
1202
+ self.upsampler = ResnetBlock(
1203
+ spatial_dims=spatial_dims,
1204
+ in_channels=out_channels,
1205
+ out_channels=out_channels,
1206
+ temb_channels=temb_channels,
1207
+ norm_num_groups=norm_num_groups,
1208
+ norm_eps=norm_eps,
1209
+ up=True,
1210
+ )
1211
+ else:
1212
+ self.upsampler = Upsample(
1213
+ spatial_dims=spatial_dims, num_channels=out_channels, use_conv=True, out_channels=out_channels
1214
+ )
1215
+ else:
1216
+ self.upsampler = None
1217
+
1218
+ def forward(
1219
+ self,
1220
+ hidden_states: torch.Tensor,
1221
+ res_hidden_states_list: list[torch.Tensor],
1222
+ temb: torch.Tensor,
1223
+ context: torch.Tensor | None = None,
1224
+ ) -> torch.Tensor:
1225
+ del context
1226
+ for resnet in self.resnets:
1227
+ # pop res hidden states
1228
+ res_hidden_states = res_hidden_states_list[-1]
1229
+ res_hidden_states_list = res_hidden_states_list[:-1]
1230
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1231
+
1232
+ hidden_states = resnet(hidden_states, temb)
1233
+
1234
+ if self.upsampler is not None:
1235
+ hidden_states = self.upsampler(hidden_states, temb)
1236
+
1237
+ return hidden_states
1238
+
1239
+
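UpBlock consumes the skip tensors saved by the matching down block, popping one from the end of the list per resnet; a sketch with illustrative (assumed) sizes:

import torch

up = UpBlock(spatial_dims=2, in_channels=64, prev_output_channel=128,
             out_channels=64, temb_channels=256, num_res_blocks=1)
h = torch.randn(1, 128, 16, 16)           # output of the deeper block
skips = [torch.randn(1, 64, 16, 16)]      # one skip tensor per resnet
temb = torch.randn(1, 256)
out = up(h, skips, temb)                  # doubled in resolution by the Upsample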
1240
+ class AttnUpBlock(nn.Module):
1241
+ """
1242
+ Unet's up block containing resnet, upsamplers, and self-attention blocks.
1243
+
1244
+ Args:
1245
+ spatial_dims: The number of spatial dimensions.
1246
+ in_channels: number of input channels.
1247
+ prev_output_channel: number of channels from residual connection.
1248
+ out_channels: number of output channels.
1249
+ temb_channels: number of timestep embedding channels.
1250
+ num_res_blocks: number of residual blocks.
1251
+ norm_num_groups: number of groups for the group normalization.
1252
+ norm_eps: epsilon for the group normalization.
1253
+        add_upsample: if True add upsample block.
1254
+ resblock_updown: if True use residual blocks for upsampling.
1255
+ num_head_channels: number of channels in each attention head.
1256
+ use_flash_attention: if True, use flash attention for a memory efficient attention mechanism.
1257
+ """
1258
+
1259
+ def __init__(
1260
+ self,
1261
+ spatial_dims: int,
1262
+ in_channels: int,
1263
+ prev_output_channel: int,
1264
+ out_channels: int,
1265
+ temb_channels: int,
1266
+ num_res_blocks: int = 1,
1267
+ norm_num_groups: int = 32,
1268
+ norm_eps: float = 1e-6,
1269
+ add_upsample: bool = True,
1270
+ resblock_updown: bool = False,
1271
+ num_head_channels: int = 1,
1272
+ use_flash_attention: bool = False,
1273
+ ) -> None:
1274
+ super().__init__()
1275
+ self.resblock_updown = resblock_updown
1276
+
1277
+ resnets = []
1278
+ attentions = []
1279
+
1280
+ for i in range(num_res_blocks):
1281
+ res_skip_channels = in_channels if (i == num_res_blocks - 1) else out_channels
1282
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
1283
+
1284
+ resnets.append(
1285
+ ResnetBlock(
1286
+ spatial_dims=spatial_dims,
1287
+ in_channels=resnet_in_channels + res_skip_channels,
1288
+ out_channels=out_channels,
1289
+ temb_channels=temb_channels,
1290
+ norm_num_groups=norm_num_groups,
1291
+ norm_eps=norm_eps,
1292
+ )
1293
+ )
1294
+ attentions.append(
1295
+ AttentionBlock(
1296
+ spatial_dims=spatial_dims,
1297
+ num_channels=out_channels,
1298
+ num_head_channels=num_head_channels,
1299
+ norm_num_groups=norm_num_groups,
1300
+ norm_eps=norm_eps,
1301
+ use_flash_attention=use_flash_attention,
1302
+ )
1303
+ )
1304
+
1305
+ self.resnets = nn.ModuleList(resnets)
1306
+ self.attentions = nn.ModuleList(attentions)
1307
+
1308
+ if add_upsample:
1309
+ if resblock_updown:
1310
+ self.upsampler = ResnetBlock(
1311
+ spatial_dims=spatial_dims,
1312
+ in_channels=out_channels,
1313
+ out_channels=out_channels,
1314
+ temb_channels=temb_channels,
1315
+ norm_num_groups=norm_num_groups,
1316
+ norm_eps=norm_eps,
1317
+ up=True,
1318
+ )
1319
+ else:
1320
+ self.upsampler = Upsample(
1321
+ spatial_dims=spatial_dims, num_channels=out_channels, use_conv=True, out_channels=out_channels
1322
+ )
1323
+ else:
1324
+ self.upsampler = None
1325
+
1326
+ def forward(
1327
+ self,
1328
+ hidden_states: torch.Tensor,
1329
+ res_hidden_states_list: list[torch.Tensor],
1330
+ temb: torch.Tensor,
1331
+ context: torch.Tensor | None = None,
1332
+ ) -> torch.Tensor:
1333
+ del context
1334
+ for resnet, attn in zip(self.resnets, self.attentions):
1335
+ # pop res hidden states
1336
+ res_hidden_states = res_hidden_states_list[-1]
1337
+ res_hidden_states_list = res_hidden_states_list[:-1]
1338
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1339
+
1340
+ hidden_states = resnet(hidden_states, temb)
1341
+ hidden_states = attn(hidden_states)
1342
+
1343
+ if self.upsampler is not None:
1344
+ hidden_states = self.upsampler(hidden_states, temb)
1345
+
1346
+ return hidden_states
1347
+
1348
+
1349
+ class CrossAttnUpBlock(nn.Module):
1350
+ """
1351
+    Unet's up block containing resnet, upsamplers, and cross-attention blocks.
1352
+
1353
+ Args:
1354
+ spatial_dims: The number of spatial dimensions.
1355
+ in_channels: number of input channels.
1356
+ prev_output_channel: number of channels from residual connection.
1357
+ out_channels: number of output channels.
1358
+ temb_channels: number of timestep embedding channels.
1359
+ num_res_blocks: number of residual blocks.
1360
+ norm_num_groups: number of groups for the group normalization.
1361
+ norm_eps: epsilon for the group normalization.
1362
+        add_upsample: if True add upsample block.
1363
+ resblock_updown: if True use residual blocks for upsampling.
1364
+ num_head_channels: number of channels in each attention head.
1365
+ transformer_num_layers: number of layers of Transformer blocks to use.
1366
+ cross_attention_dim: number of context dimensions to use.
1367
+ upcast_attention: if True, upcast attention operations to full precision.
1368
+ use_flash_attention: if True, use flash attention for a memory efficient attention mechanism.
1369
+ """
1370
+
1371
+ def __init__(
1372
+ self,
1373
+ spatial_dims: int,
1374
+ in_channels: int,
1375
+ prev_output_channel: int,
1376
+ out_channels: int,
1377
+ temb_channels: int,
1378
+ num_res_blocks: int = 1,
1379
+ norm_num_groups: int = 32,
1380
+ norm_eps: float = 1e-6,
1381
+ add_upsample: bool = True,
1382
+ resblock_updown: bool = False,
1383
+ num_head_channels: int = 1,
1384
+ transformer_num_layers: int = 1,
1385
+ cross_attention_dim: int | None = None,
1386
+ upcast_attention: bool = False,
1387
+ use_flash_attention: bool = False,
1388
+ ) -> None:
1389
+ super().__init__()
1390
+ self.resblock_updown = resblock_updown
1391
+
1392
+ resnets = []
1393
+ attentions = []
1394
+
1395
+ for i in range(num_res_blocks):
1396
+ res_skip_channels = in_channels if (i == num_res_blocks - 1) else out_channels
1397
+ resnet_in_channels = prev_output_channel if i == 0 else out_channels
1398
+
1399
+ resnets.append(
1400
+ ResnetBlock(
1401
+ spatial_dims=spatial_dims,
1402
+ in_channels=resnet_in_channels + res_skip_channels,
1403
+ out_channels=out_channels,
1404
+ temb_channels=temb_channels,
1405
+ norm_num_groups=norm_num_groups,
1406
+ norm_eps=norm_eps,
1407
+ )
1408
+ )
1409
+ attentions.append(
1410
+ SpatialTransformer(
1411
+ spatial_dims=spatial_dims,
1412
+ in_channels=out_channels,
1413
+ num_attention_heads=out_channels // num_head_channels,
1414
+ num_head_channels=num_head_channels,
1415
+ norm_num_groups=norm_num_groups,
1416
+ norm_eps=norm_eps,
1417
+ num_layers=transformer_num_layers,
1418
+ cross_attention_dim=cross_attention_dim,
1419
+ upcast_attention=upcast_attention,
1420
+ use_flash_attention=use_flash_attention,
1421
+ )
1422
+ )
1423
+
1424
+ self.attentions = nn.ModuleList(attentions)
1425
+ self.resnets = nn.ModuleList(resnets)
1426
+
1427
+ if add_upsample:
1428
+ if resblock_updown:
1429
+ self.upsampler = ResnetBlock(
1430
+ spatial_dims=spatial_dims,
1431
+ in_channels=out_channels,
1432
+ out_channels=out_channels,
1433
+ temb_channels=temb_channels,
1434
+ norm_num_groups=norm_num_groups,
1435
+ norm_eps=norm_eps,
1436
+ up=True,
1437
+ )
1438
+ else:
1439
+ self.upsampler = Upsample(
1440
+ spatial_dims=spatial_dims, num_channels=out_channels, use_conv=True, out_channels=out_channels
1441
+ )
1442
+ else:
1443
+ self.upsampler = None
1444
+
1445
+ def forward(
1446
+ self,
1447
+ hidden_states: torch.Tensor,
1448
+ res_hidden_states_list: list[torch.Tensor],
1449
+ temb: torch.Tensor,
1450
+ context: torch.Tensor | None = None,
1451
+ ) -> torch.Tensor:
1452
+ for resnet, attn in zip(self.resnets, self.attentions):
1453
+ # pop res hidden states
1454
+ res_hidden_states = res_hidden_states_list[-1]
1455
+ res_hidden_states_list = res_hidden_states_list[:-1]
1456
+ hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
1457
+
1458
+ hidden_states = resnet(hidden_states, temb)
1459
+ hidden_states = attn(hidden_states, context=context)
1460
+
1461
+ if self.upsampler is not None:
1462
+ hidden_states = self.upsampler(hidden_states, temb)
1463
+
1464
+ return hidden_states
1465
+
1466
+
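The cross-attention up block adds a SpatialTransformer after each resnet; usage mirrors UpBlock plus a context tensor (all sizes below are illustrative assumptions):

import torch

up = CrossAttnUpBlock(spatial_dims=2, in_channels=64, prev_output_channel=128,
                      out_channels=64, temb_channels=256, num_res_blocks=1,
                      num_head_channels=32, cross_attention_dim=512)
h = torch.randn(1, 128, 16, 16)
skips = [torch.randn(1, 64, 16, 16)]
temb = torch.randn(1, 256)
context = torch.randn(1, 1, 512)
out = up(h, skips, temb, context=context)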
1467
+ def get_down_block(
1468
+ spatial_dims: int,
1469
+ in_channels: int,
1470
+ out_channels: int,
1471
+ temb_channels: int,
1472
+ num_res_blocks: int,
1473
+ norm_num_groups: int,
1474
+ norm_eps: float,
1475
+ add_downsample: bool,
1476
+ resblock_updown: bool,
1477
+ with_attn: bool,
1478
+ with_cross_attn: bool,
1479
+ num_head_channels: int,
1480
+ transformer_num_layers: int,
1481
+ cross_attention_dim: int | None,
1482
+ upcast_attention: bool = False,
1483
+ use_flash_attention: bool = False,
1484
+ ) -> nn.Module:
1485
+ if with_attn:
1486
+ return AttnDownBlock(
1487
+ spatial_dims=spatial_dims,
1488
+ in_channels=in_channels,
1489
+ out_channels=out_channels,
1490
+ temb_channels=temb_channels,
1491
+ num_res_blocks=num_res_blocks,
1492
+ norm_num_groups=norm_num_groups,
1493
+ norm_eps=norm_eps,
1494
+ add_downsample=add_downsample,
1495
+ resblock_updown=resblock_updown,
1496
+ num_head_channels=num_head_channels,
1497
+ use_flash_attention=use_flash_attention,
1498
+ )
1499
+ elif with_cross_attn:
1500
+ return CrossAttnDownBlock(
1501
+ spatial_dims=spatial_dims,
1502
+ in_channels=in_channels,
1503
+ out_channels=out_channels,
1504
+ temb_channels=temb_channels,
1505
+ num_res_blocks=num_res_blocks,
1506
+ norm_num_groups=norm_num_groups,
1507
+ norm_eps=norm_eps,
1508
+ add_downsample=add_downsample,
1509
+ resblock_updown=resblock_updown,
1510
+ num_head_channels=num_head_channels,
1511
+ transformer_num_layers=transformer_num_layers,
1512
+ cross_attention_dim=cross_attention_dim,
1513
+ upcast_attention=upcast_attention,
1514
+ use_flash_attention=use_flash_attention,
1515
+ )
1516
+ else:
1517
+ return DownBlock(
1518
+ spatial_dims=spatial_dims,
1519
+ in_channels=in_channels,
1520
+ out_channels=out_channels,
1521
+ temb_channels=temb_channels,
1522
+ num_res_blocks=num_res_blocks,
1523
+ norm_num_groups=norm_num_groups,
1524
+ norm_eps=norm_eps,
1525
+ add_downsample=add_downsample,
1526
+ resblock_updown=resblock_updown,
1527
+ )
1528
+
1529
+
1530
+ def get_mid_block(
1531
+ spatial_dims: int,
1532
+ in_channels: int,
1533
+ temb_channels: int,
1534
+ norm_num_groups: int,
1535
+ norm_eps: float,
1536
+ with_conditioning: bool,
1537
+ num_head_channels: int,
1538
+ transformer_num_layers: int,
1539
+ cross_attention_dim: int | None,
1540
+ upcast_attention: bool = False,
1541
+ use_flash_attention: bool = False,
1542
+ ) -> nn.Module:
1543
+ if with_conditioning:
1544
+ return CrossAttnMidBlock(
1545
+ spatial_dims=spatial_dims,
1546
+ in_channels=in_channels,
1547
+ temb_channels=temb_channels,
1548
+ norm_num_groups=norm_num_groups,
1549
+ norm_eps=norm_eps,
1550
+ num_head_channels=num_head_channels,
1551
+ transformer_num_layers=transformer_num_layers,
1552
+ cross_attention_dim=cross_attention_dim,
1553
+ upcast_attention=upcast_attention,
1554
+ use_flash_attention=use_flash_attention,
1555
+ )
1556
+ else:
1557
+ return AttnMidBlock(
1558
+ spatial_dims=spatial_dims,
1559
+ in_channels=in_channels,
1560
+ temb_channels=temb_channels,
1561
+ norm_num_groups=norm_num_groups,
1562
+ norm_eps=norm_eps,
1563
+ num_head_channels=num_head_channels,
1564
+ use_flash_attention=use_flash_attention,
1565
+ )
1566
+
1567
+
1568
+ def get_up_block(
1569
+ spatial_dims: int,
1570
+ in_channels: int,
1571
+ prev_output_channel: int,
1572
+ out_channels: int,
1573
+ temb_channels: int,
1574
+ num_res_blocks: int,
1575
+ norm_num_groups: int,
1576
+ norm_eps: float,
1577
+ add_upsample: bool,
1578
+ resblock_updown: bool,
1579
+ with_attn: bool,
1580
+ with_cross_attn: bool,
1581
+ num_head_channels: int,
1582
+ transformer_num_layers: int,
1583
+ cross_attention_dim: int | None,
1584
+ upcast_attention: bool = False,
1585
+ use_flash_attention: bool = False,
1586
+ ) -> nn.Module:
1587
+ if with_attn:
1588
+ return AttnUpBlock(
1589
+ spatial_dims=spatial_dims,
1590
+ in_channels=in_channels,
1591
+ prev_output_channel=prev_output_channel,
1592
+ out_channels=out_channels,
1593
+ temb_channels=temb_channels,
1594
+ num_res_blocks=num_res_blocks,
1595
+ norm_num_groups=norm_num_groups,
1596
+ norm_eps=norm_eps,
1597
+ add_upsample=add_upsample,
1598
+ resblock_updown=resblock_updown,
1599
+ num_head_channels=num_head_channels,
1600
+ use_flash_attention=use_flash_attention,
1601
+ )
1602
+ elif with_cross_attn:
1603
+ return CrossAttnUpBlock(
1604
+ spatial_dims=spatial_dims,
1605
+ in_channels=in_channels,
1606
+ prev_output_channel=prev_output_channel,
1607
+ out_channels=out_channels,
1608
+ temb_channels=temb_channels,
1609
+ num_res_blocks=num_res_blocks,
1610
+ norm_num_groups=norm_num_groups,
1611
+ norm_eps=norm_eps,
1612
+ add_upsample=add_upsample,
1613
+ resblock_updown=resblock_updown,
1614
+ num_head_channels=num_head_channels,
1615
+ transformer_num_layers=transformer_num_layers,
1616
+ cross_attention_dim=cross_attention_dim,
1617
+ upcast_attention=upcast_attention,
1618
+ use_flash_attention=use_flash_attention,
1619
+ )
1620
+ else:
1621
+ return UpBlock(
1622
+ spatial_dims=spatial_dims,
1623
+ in_channels=in_channels,
1624
+ prev_output_channel=prev_output_channel,
1625
+ out_channels=out_channels,
1626
+ temb_channels=temb_channels,
1627
+ num_res_blocks=num_res_blocks,
1628
+ norm_num_groups=norm_num_groups,
1629
+ norm_eps=norm_eps,
1630
+ add_upsample=add_upsample,
1631
+ resblock_updown=resblock_updown,
1632
+ )
1633
+
1634
+
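The three factories only dispatch on the attention flags, so one call site can build plain, self-attention, or cross-attention blocks; a sketch of get_down_block with illustrative arguments:

down = get_down_block(
    spatial_dims=2, in_channels=64, out_channels=128, temb_channels=256,
    num_res_blocks=2, norm_num_groups=32, norm_eps=1e-6,
    add_downsample=True, resblock_updown=False,
    with_attn=True, with_cross_attn=False,
    num_head_channels=32, transformer_num_layers=1, cross_attention_dim=None,
)
# with_attn=True -> AttnDownBlock; with_cross_attn=True -> CrossAttnDownBlock;
# both False -> plain DownBlock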
1635
+ class DiffusionModelUNet(nn.Module):
1636
+ """
1637
+ Unet network with timestep embedding and attention mechanisms for conditioning based on
1638
+ Rombach et al. "High-Resolution Image Synthesis with Latent Diffusion Models" https://arxiv.org/abs/2112.10752
1639
+ and Pinaya et al. "Brain Imaging Generation with Latent Diffusion Models" https://arxiv.org/abs/2209.07162
1640
+
1641
+ Args:
1642
+ spatial_dims: number of spatial dimensions.
1643
+ in_channels: number of input channels.
1644
+ out_channels: number of output channels.
1645
+ num_res_blocks: number of residual blocks (see ResnetBlock) per level.
1646
+ num_channels: tuple of block output channels.
1647
+ attention_levels: list of levels to add attention.
1648
+ norm_num_groups: number of groups for the normalization.
1649
+ norm_eps: epsilon for the normalization.
1650
+ resblock_updown: if True use residual blocks for up/downsampling.
1651
+ num_head_channels: number of channels in each attention head.
1652
+ with_conditioning: if True add spatial transformers to perform conditioning.
1653
+ transformer_num_layers: number of layers of Transformer blocks to use.
1654
+ cross_attention_dim: number of context dimensions to use.
1655
+ num_class_embeds: if specified (as an int), then this model will be class-conditional with `num_class_embeds`
1656
+ classes.
1657
+ upcast_attention: if True, upcast attention operations to full precision.
1658
+ use_flash_attention: if True, use flash attention for a memory efficient attention mechanism.
1659
+ """
1660
+
1661
+ def __init__(
1662
+ self,
1663
+ spatial_dims: int,
1664
+ in_channels: int,
1665
+ out_channels: int,
1666
+ num_res_blocks: Sequence[int] | int = (2, 2, 2, 2),
1667
+ num_channels: Sequence[int] = (32, 64, 64, 64),
1668
+ attention_levels: Sequence[bool] = (False, False, True, True),
1669
+ norm_num_groups: int = 32,
1670
+ norm_eps: float = 1e-6,
1671
+ resblock_updown: bool = False,
1672
+ num_head_channels: int | Sequence[int] = 8,
1673
+ with_conditioning: bool = False,
1674
+ transformer_num_layers: int = 1,
1675
+ cross_attention_dim: int | None = None,
1676
+ num_class_embeds: int | None = None,
1677
+ upcast_attention: bool = False,
1678
+ use_flash_attention: bool = False,
1679
+ ) -> None:
1680
+ super().__init__()
1681
+ if with_conditioning is True and cross_attention_dim is None:
1682
+ raise ValueError(
1683
+ "DiffusionModelUNet expects dimension of the cross-attention conditioning (cross_attention_dim) "
1684
+ "when using with_conditioning."
1685
+ )
1686
+ if cross_attention_dim is not None and with_conditioning is False:
1687
+ raise ValueError(
1688
+ "DiffusionModelUNet expects with_conditioning=True when specifying the cross_attention_dim."
1689
+ )
1690
+
1691
+ # All number of channels should be multiple of num_groups
1692
+ if any((out_channel % norm_num_groups) != 0 for out_channel in num_channels):
1693
+ raise ValueError("DiffusionModelUNet expects all num_channels being multiple of norm_num_groups")
1694
+
1695
+ if len(num_channels) != len(attention_levels):
1696
+ raise ValueError("DiffusionModelUNet expects num_channels being same size of attention_levels")
1697
+
1698
+ if isinstance(num_head_channels, int):
1699
+ num_head_channels = ensure_tuple_rep(num_head_channels, len(attention_levels))
1700
+
1701
+ if len(num_head_channels) != len(attention_levels):
1702
+ raise ValueError(
1703
+ "num_head_channels should have the same length as attention_levels. For the i levels without attention,"
1704
+ " i.e. `attention_level[i]=False`, the num_head_channels[i] will be ignored."
1705
+ )
1706
+
1707
+ if isinstance(num_res_blocks, int):
1708
+ num_res_blocks = ensure_tuple_rep(num_res_blocks, len(num_channels))
1709
+
1710
+ if len(num_res_blocks) != len(num_channels):
1711
+ raise ValueError(
1712
+ "`num_res_blocks` should be a single integer or a tuple of integers with the same length as "
1713
+ "`num_channels`."
1714
+ )
1715
+
1716
+ if use_flash_attention and not has_xformers:
1717
+ raise ValueError("use_flash_attention is True but xformers is not installed.")
1718
+
1719
+ if use_flash_attention is True and not torch.cuda.is_available():
1720
+ raise ValueError(
1721
+ "torch.cuda.is_available() should be True but is False. Flash attention is only available for GPU."
1722
+ )
1723
+
1724
+ self.in_channels = in_channels
1725
+ self.block_out_channels = num_channels
1726
+ self.out_channels = out_channels
1727
+ self.num_res_blocks = num_res_blocks
1728
+ self.attention_levels = attention_levels
1729
+ self.num_head_channels = num_head_channels
1730
+ self.with_conditioning = with_conditioning
1731
+
1732
+ # input
1733
+ self.conv_in = Convolution(
1734
+ spatial_dims=spatial_dims,
1735
+ in_channels=in_channels,
1736
+ out_channels=num_channels[0],
1737
+ strides=1,
1738
+ kernel_size=3,
1739
+ padding=1,
1740
+ conv_only=True,
1741
+ )
1742
+
1743
+ # time
1744
+ time_embed_dim = num_channels[0] * 4
1745
+ self.time_embed = nn.Sequential(
1746
+ nn.Linear(num_channels[0], time_embed_dim), nn.SiLU(), nn.Linear(time_embed_dim, time_embed_dim)
1747
+ )
1748
+
1749
+ # class embedding
1750
+ self.num_class_embeds = num_class_embeds
1751
+ if num_class_embeds is not None:
1752
+ self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
1753
+
1754
+ # down
1755
+ self.down_blocks = nn.ModuleList([])
1756
+ output_channel = num_channels[0]
1757
+ for i in range(len(num_channels)):
1758
+ input_channel = output_channel
1759
+ output_channel = num_channels[i]
1760
+ is_final_block = i == len(num_channels) - 1
1761
+
1762
+ down_block = get_down_block(
1763
+ spatial_dims=spatial_dims,
1764
+ in_channels=input_channel,
1765
+ out_channels=output_channel,
1766
+ temb_channels=time_embed_dim,
1767
+ num_res_blocks=num_res_blocks[i],
1768
+ norm_num_groups=norm_num_groups,
1769
+ norm_eps=norm_eps,
1770
+ add_downsample=not is_final_block,
1771
+ resblock_updown=resblock_updown,
1772
+ with_attn=(attention_levels[i] and not with_conditioning),
1773
+ with_cross_attn=(attention_levels[i] and with_conditioning),
1774
+ num_head_channels=num_head_channels[i],
1775
+ transformer_num_layers=transformer_num_layers,
1776
+ cross_attention_dim=cross_attention_dim,
1777
+ upcast_attention=upcast_attention,
1778
+ use_flash_attention=use_flash_attention,
1779
+ )
1780
+
1781
+ self.down_blocks.append(down_block)
1782
+
1783
+ # mid
1784
+ self.middle_block = get_mid_block(
1785
+ spatial_dims=spatial_dims,
1786
+ in_channels=num_channels[-1],
1787
+ temb_channels=time_embed_dim,
1788
+ norm_num_groups=norm_num_groups,
1789
+ norm_eps=norm_eps,
1790
+ with_conditioning=with_conditioning,
1791
+ num_head_channels=num_head_channels[-1],
1792
+ transformer_num_layers=transformer_num_layers,
1793
+ cross_attention_dim=cross_attention_dim,
1794
+ upcast_attention=upcast_attention,
1795
+ use_flash_attention=use_flash_attention,
1796
+ )
1797
+
1798
+ # up
1799
+ self.up_blocks = nn.ModuleList([])
1800
+ reversed_block_out_channels = list(reversed(num_channels))
1801
+ reversed_num_res_blocks = list(reversed(num_res_blocks))
1802
+ reversed_attention_levels = list(reversed(attention_levels))
1803
+ reversed_num_head_channels = list(reversed(num_head_channels))
1804
+ output_channel = reversed_block_out_channels[0]
1805
+ for i in range(len(reversed_block_out_channels)):
1806
+ prev_output_channel = output_channel
1807
+ output_channel = reversed_block_out_channels[i]
1808
+ input_channel = reversed_block_out_channels[min(i + 1, len(num_channels) - 1)]
1809
+
1810
+ is_final_block = i == len(num_channels) - 1
1811
+
1812
+ up_block = get_up_block(
1813
+ spatial_dims=spatial_dims,
1814
+ in_channels=input_channel,
1815
+ prev_output_channel=prev_output_channel,
1816
+ out_channels=output_channel,
1817
+ temb_channels=time_embed_dim,
1818
+ num_res_blocks=reversed_num_res_blocks[i] + 1,
1819
+ norm_num_groups=norm_num_groups,
1820
+ norm_eps=norm_eps,
1821
+ add_upsample=not is_final_block,
1822
+ resblock_updown=resblock_updown,
1823
+ with_attn=(reversed_attention_levels[i] and not with_conditioning),
1824
+ with_cross_attn=(reversed_attention_levels[i] and with_conditioning),
1825
+ num_head_channels=reversed_num_head_channels[i],
1826
+ transformer_num_layers=transformer_num_layers,
1827
+ cross_attention_dim=cross_attention_dim,
1828
+ upcast_attention=upcast_attention,
1829
+ use_flash_attention=use_flash_attention,
1830
+ )
1831
+
1832
+ self.up_blocks.append(up_block)
1833
+
1834
+ # out
1835
+ self.out = nn.Sequential(
1836
+ nn.GroupNorm(num_groups=norm_num_groups, num_channels=num_channels[0], eps=norm_eps, affine=True),
1837
+ nn.SiLU(),
1838
+ zero_module(
1839
+ Convolution(
1840
+ spatial_dims=spatial_dims,
1841
+ in_channels=num_channels[0],
1842
+ out_channels=out_channels,
1843
+ strides=1,
1844
+ kernel_size=3,
1845
+                    padding=2,  # 'same' padding for the dilation=2 conv below
1846
+ conv_only=True,
1847
+ dilation=2
1848
+ )
1849
+ ),
1850
+ )
1851
+
1852
+ def forward(
1853
+ self,
1854
+ x: torch.Tensor,
1855
+ timesteps: torch.Tensor,
1856
+ context: torch.Tensor | None = None,
1857
+ class_labels: torch.Tensor | None = None,
1858
+ down_block_additional_residuals: tuple[torch.Tensor] | None = None,
1859
+ mid_block_additional_residual: torch.Tensor | None = None,
1860
+ ) -> torch.Tensor:
1861
+ """
1862
+ Args:
1863
+ x: input tensor (N, C, SpatialDims).
1864
+ timesteps: timestep tensor (N,).
1865
+ context: context tensor (N, 1, ContextDim).
1866
+            class_labels: class label tensor (N,).
1867
+ down_block_additional_residuals: additional residual tensors for down blocks (N, C, FeatureMapsDims).
1868
+ mid_block_additional_residual: additional residual tensor for mid block (N, C, FeatureMapsDims).
1869
+ """
1870
+ # 1. time
1871
+ t_emb = get_timestep_embedding(timesteps, self.block_out_channels[0])
1872
+
1873
+ # timesteps does not contain any weights and will always return f32 tensors
1874
+ # but time_embedding might actually be running in fp16. so we need to cast here.
1875
+ # there might be better ways to encapsulate this.
1876
+ t_emb = t_emb.to(dtype=x.dtype)
1877
+ emb = self.time_embed(t_emb)
1878
+
1879
+ # 2. class
1880
+ if self.num_class_embeds is not None:
1881
+ if class_labels is None:
1882
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
1883
+ class_emb = self.class_embedding(class_labels)
1884
+ class_emb = class_emb.to(dtype=x.dtype)
1885
+ emb = emb + class_emb
1886
+
1887
+ # 3. initial convolution
1888
+ h = self.conv_in(x)
1889
+
1890
+ # 4. down
1891
+ if context is not None and self.with_conditioning is False:
1892
+ raise ValueError("model should have with_conditioning = True if context is provided")
1893
+ down_block_res_samples: list[torch.Tensor] = [h]
1894
+ for downsample_block in self.down_blocks:
1895
+ h, res_samples = downsample_block(hidden_states=h, temb=emb, context=context)
1896
+ for residual in res_samples:
1897
+ down_block_res_samples.append(residual)
1898
+
1899
+        # Additional residual connections for ControlNets
1900
+ if down_block_additional_residuals is not None:
1901
+ new_down_block_res_samples = ()
1902
+ for down_block_res_sample, down_block_additional_residual in zip(
1903
+ down_block_res_samples, down_block_additional_residuals
1904
+ ):
1905
+ down_block_res_sample = down_block_res_sample + down_block_additional_residual
1906
+ new_down_block_res_samples += (down_block_res_sample,)
1907
+
1908
+ down_block_res_samples = new_down_block_res_samples
1909
+
1910
+ # 5. mid
1911
+ h = self.middle_block(hidden_states=h, temb=emb, context=context)
1912
+
1913
+        # Additional residual connections for ControlNets
1914
+ if mid_block_additional_residual is not None:
1915
+ h = h + mid_block_additional_residual
1916
+
1917
+ # 6. up
1918
+ for upsample_block in self.up_blocks:
1919
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
1920
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
1921
+ h = upsample_block(hidden_states=h, res_hidden_states_list=res_samples, temb=emb, context=context)
1922
+
1923
+ # 7. output block
1924
+ h = self.out(h)
1925
+
1926
+ return h
1927
+
1928
+
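A small end-to-end sketch of the UNet as a latent denoiser; the channel layout is an illustrative assumption and every entry of num_channels must be divisible by norm_num_groups:

import torch

unet = DiffusionModelUNet(
    spatial_dims=2,
    in_channels=4,               # latent channels, e.g. from a VQ/KL autoencoder
    out_channels=4,
    num_res_blocks=1,
    num_channels=(32, 64, 64),
    attention_levels=(False, False, True),
    num_head_channels=8,
)
latents = torch.randn(1, 4, 64, 64)
timesteps = torch.randint(0, 1000, (1,)).long()
noise_pred = unet(latents, timesteps)    # same shape as the input latents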
1929
+ class DiffusionModelEncoder(nn.Module):
1930
+ """
1931
+ Classification Network based on the Encoder of the Diffusion Model, followed by fully connected layers. This network is based on
1932
+ Wolleb et al. "Diffusion Models for Medical Anomaly Detection" (https://arxiv.org/abs/2203.04306).
1933
+
1934
+ Args:
1935
+ spatial_dims: number of spatial dimensions.
1936
+ in_channels: number of input channels.
1937
+ out_channels: number of output channels.
1938
+ num_res_blocks: number of residual blocks (see ResnetBlock) per level.
1939
+ num_channels: tuple of block output channels.
1940
+ attention_levels: list of levels to add attention.
1941
+ norm_num_groups: number of groups for the normalization.
1942
+ norm_eps: epsilon for the normalization.
1943
+ resblock_updown: if True use residual blocks for downsampling.
1944
+ num_head_channels: number of channels in each attention head.
1945
+ with_conditioning: if True add spatial transformers to perform conditioning.
1946
+ transformer_num_layers: number of layers of Transformer blocks to use.
1947
+ cross_attention_dim: number of context dimensions to use.
1948
+ num_class_embeds: if specified (as an int), then this model will be class-conditional with `num_class_embeds` classes.
1949
+ upcast_attention: if True, upcast attention operations to full precision.
1950
+ """
1951
+
1952
+ def __init__(
1953
+ self,
1954
+ spatial_dims: int,
1955
+ in_channels: int,
1956
+ out_channels: int,
1957
+ num_res_blocks: Sequence[int] | int = (2, 2, 2, 2),
1958
+ num_channels: Sequence[int] = (32, 64, 64, 64),
1959
+ attention_levels: Sequence[bool] = (False, False, True, True),
1960
+ norm_num_groups: int = 32,
1961
+ norm_eps: float = 1e-6,
1962
+ resblock_updown: bool = False,
1963
+ num_head_channels: int | Sequence[int] = 8,
1964
+ with_conditioning: bool = False,
1965
+ transformer_num_layers: int = 1,
1966
+ cross_attention_dim: int | None = None,
1967
+ num_class_embeds: int | None = None,
1968
+ upcast_attention: bool = False,
1969
+ ) -> None:
1970
+ super().__init__()
1971
+ if with_conditioning is True and cross_attention_dim is None:
1972
+ raise ValueError(
1973
+ "DiffusionModelEncoder expects dimension of the cross-attention conditioning (cross_attention_dim) "
1974
+ "when using with_conditioning."
1975
+ )
1976
+ if cross_attention_dim is not None and with_conditioning is False:
1977
+ raise ValueError(
1978
+ "DiffusionModelEncoder expects with_conditioning=True when specifying the cross_attention_dim."
1979
+ )
1980
+
1981
+ # All number of channels should be multiple of num_groups
1982
+ if any((out_channel % norm_num_groups) != 0 for out_channel in num_channels):
1983
+ raise ValueError("DiffusionModelEncoder expects all num_channels being multiple of norm_num_groups")
1984
+ if len(num_channels) != len(attention_levels):
1985
+ raise ValueError("DiffusionModelEncoder expects num_channels being same size of attention_levels")
1986
+
1987
+ if isinstance(num_head_channels, int):
1988
+ num_head_channels = ensure_tuple_rep(num_head_channels, len(attention_levels))
1989
+
1990
+ if len(num_head_channels) != len(attention_levels):
1991
+ raise ValueError(
1992
+ "num_head_channels should have the same length as attention_levels. For the i levels without attention,"
1993
+ " i.e. `attention_level[i]=False`, the num_head_channels[i] will be ignored."
1994
+ )
1995
+
1996
+ self.in_channels = in_channels
1997
+ self.block_out_channels = num_channels
1998
+ self.out_channels = out_channels
1999
+ self.num_res_blocks = num_res_blocks
2000
+ self.attention_levels = attention_levels
2001
+ self.num_head_channels = num_head_channels
2002
+ self.with_conditioning = with_conditioning
2003
+
2004
+ # input
2005
+ self.conv_in = Convolution(
2006
+ spatial_dims=spatial_dims,
2007
+ in_channels=in_channels,
2008
+ out_channels=num_channels[0],
2009
+ strides=1,
2010
+ kernel_size=3,
2011
+ padding=1,
2012
+ conv_only=True,
2013
+ )
2014
+
2015
+ # time
2016
+ time_embed_dim = num_channels[0] * 4
2017
+ self.time_embed = nn.Sequential(
2018
+ nn.Linear(num_channels[0], time_embed_dim), nn.SiLU(), nn.Linear(time_embed_dim, time_embed_dim)
2019
+ )
2020
+
2021
+ # class embedding
2022
+ self.num_class_embeds = num_class_embeds
2023
+ if num_class_embeds is not None:
2024
+ self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
2025
+
2026
+ # down
2027
+ self.down_blocks = nn.ModuleList([])
2028
+ output_channel = num_channels[0]
2029
+ for i in range(len(num_channels)):
2030
+ input_channel = output_channel
2031
+ output_channel = num_channels[i]
2032
+ is_final_block = i == len(num_channels) # - 1
2033
+
2034
+ down_block = get_down_block(
2035
+ spatial_dims=spatial_dims,
2036
+ in_channels=input_channel,
2037
+ out_channels=output_channel,
2038
+ temb_channels=time_embed_dim,
2039
+ num_res_blocks=num_res_blocks[i],
2040
+ norm_num_groups=norm_num_groups,
2041
+ norm_eps=norm_eps,
2042
+ add_downsample=not is_final_block,
2043
+ resblock_updown=resblock_updown,
2044
+ with_attn=(attention_levels[i] and not with_conditioning),
2045
+ with_cross_attn=(attention_levels[i] and with_conditioning),
2046
+ num_head_channels=num_head_channels[i],
2047
+ transformer_num_layers=transformer_num_layers,
2048
+ cross_attention_dim=cross_attention_dim,
2049
+ upcast_attention=upcast_attention,
2050
+ )
2051
+
2052
+ self.down_blocks.append(down_block)
2053
+
2054
+ self.out = nn.Sequential(nn.Linear(4096, 512), nn.ReLU(), nn.Dropout(0.1), nn.Linear(512, self.out_channels))
2055
+
2056
+ def forward(
2057
+ self,
2058
+ x: torch.Tensor,
2059
+ timesteps: torch.Tensor,
2060
+ context: torch.Tensor | None = None,
2061
+ class_labels: torch.Tensor | None = None,
2062
+ ) -> torch.Tensor:
2063
+ """
2064
+ Args:
2065
+ x: input tensor (N, C, SpatialDims).
2066
+ timesteps: timestep tensor (N,).
2067
+ context: context tensor (N, 1, ContextDim).
2068
+            class_labels: class label tensor (N,).
2069
+ """
2070
+ # 1. time
2071
+ t_emb = get_timestep_embedding(timesteps, self.block_out_channels[0])
2072
+
2073
+ # timesteps does not contain any weights and will always return f32 tensors
2074
+ # but time_embedding might actually be running in fp16. so we need to cast here.
2075
+ # there might be better ways to encapsulate this.
2076
+ t_emb = t_emb.to(dtype=x.dtype)
2077
+ emb = self.time_embed(t_emb)
2078
+
2079
+ # 2. class
2080
+ if self.num_class_embeds is not None:
2081
+ if class_labels is None:
2082
+ raise ValueError("class_labels should be provided when num_class_embeds > 0")
2083
+ class_emb = self.class_embedding(class_labels)
2084
+ class_emb = class_emb.to(dtype=x.dtype)
2085
+ emb = emb + class_emb
2086
+
2087
+ # 3. initial convolution
2088
+ h = self.conv_in(x)
2089
+
2090
+ # 4. down
2091
+ if context is not None and self.with_conditioning is False:
2092
+ raise ValueError("model should have with_conditioning = True if context is provided")
2093
+ for downsample_block in self.down_blocks:
2094
+ h, _ = downsample_block(hidden_states=h, temb=emb, context=context)
2095
+
2096
+ h = h.reshape(h.shape[0], -1)
2097
+ output = self.out(h)
2098
+
2099
+ return output
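A sketch of the encoder used as a classifier; with the default num_channels=(32, 64, 64, 64) every level downsamples, so a 128x128 single-channel input yields the 64*8*8 = 4096 features expected by the Linear(4096, 512) head (the input resolution is therefore not free; batch size and class count below are illustrative):

import torch

encoder = DiffusionModelEncoder(spatial_dims=2, in_channels=1, out_channels=2)
x = torch.randn(2, 1, 128, 128)
t = torch.randint(0, 1000, (2,)).long()
logits = encoder(x, t)    # (2, 2) class logits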
codes/modules/fp16_util.py ADDED
@@ -0,0 +1,237 @@
1
+ """
2
+ Helpers to train with 16-bit precision.
3
+ """
4
+
5
+ import numpy as np
6
+ import torch as th
7
+ import torch.nn as nn
8
+ from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
9
+
10
+ from . import logger
11
+
12
+ INITIAL_LOG_LOSS_SCALE = 20.0
13
+
14
+
15
+ def convert_module_to_f16(l):
16
+ """
17
+ Convert primitive modules to float16.
18
+ """
19
+ if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
20
+ l.weight.data = l.weight.data.half()
21
+ if l.bias is not None:
22
+ l.bias.data = l.bias.data.half()
23
+
24
+
25
+ def convert_module_to_f32(l):
26
+ """
27
+ Convert primitive modules to float32, undoing convert_module_to_f16().
28
+ """
29
+ if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
30
+ l.weight.data = l.weight.data.float()
31
+ if l.bias is not None:
32
+ l.bias.data = l.bias.data.float()
33
+
34
+
35
+ def make_master_params(param_groups_and_shapes):
36
+ """
37
+ Copy model parameters into a (differently-shaped) list of full-precision
38
+ parameters.
39
+ """
40
+ master_params = []
41
+ for param_group, shape in param_groups_and_shapes:
42
+ master_param = nn.Parameter(
43
+ _flatten_dense_tensors(
44
+ [param.detach().float() for (_, param) in param_group]
45
+ ).view(shape)
46
+ )
47
+ master_param.requires_grad = True
48
+ master_params.append(master_param)
49
+ return master_params
50
+
51
+
52
+ def model_grads_to_master_grads(param_groups_and_shapes, master_params):
53
+ """
54
+ Copy the gradients from the model parameters into the master parameters
55
+ from make_master_params().
56
+ """
57
+ for master_param, (param_group, shape) in zip(
58
+ master_params, param_groups_and_shapes
59
+ ):
60
+ master_param.grad = _flatten_dense_tensors(
61
+ [param_grad_or_zeros(param) for (_, param) in param_group]
62
+ ).view(shape)
63
+
64
+
65
+ def master_params_to_model_params(param_groups_and_shapes, master_params):
66
+ """
67
+ Copy the master parameter data back into the model parameters.
68
+ """
69
+ # Without copying to a list, if a generator is passed, this will
70
+ # silently not copy any parameters.
71
+ for master_param, (param_group, _) in zip(master_params, param_groups_and_shapes):
72
+ for (_, param), unflat_master_param in zip(
73
+ param_group, unflatten_master_params(param_group, master_param.view(-1))
74
+ ):
75
+ param.detach().copy_(unflat_master_param)
76
+
77
+
78
+ def unflatten_master_params(param_group, master_param):
79
+ return _unflatten_dense_tensors(master_param, [param for (_, param) in param_group])
80
+
81
+
82
+ def get_param_groups_and_shapes(named_model_params):
83
+ named_model_params = list(named_model_params)
84
+ scalar_vector_named_params = (
85
+ [(n, p) for (n, p) in named_model_params if p.ndim <= 1],
86
+ (-1),
87
+ )
88
+ matrix_named_params = (
89
+ [(n, p) for (n, p) in named_model_params if p.ndim > 1],
90
+ (1, -1),
91
+ )
92
+ return [scalar_vector_named_params, matrix_named_params]
93
+
94
+
95
+ def master_params_to_state_dict(
96
+ model, param_groups_and_shapes, master_params, use_fp16
97
+ ):
98
+ if use_fp16:
99
+ state_dict = model.state_dict()
100
+ for master_param, (param_group, _) in zip(
101
+ master_params, param_groups_and_shapes
102
+ ):
103
+ for (name, _), unflat_master_param in zip(
104
+ param_group, unflatten_master_params(param_group, master_param.view(-1))
105
+ ):
106
+ assert name in state_dict
107
+ state_dict[name] = unflat_master_param
108
+ else:
109
+ state_dict = model.state_dict()
110
+ for i, (name, _value) in enumerate(model.named_parameters()):
111
+ assert name in state_dict
112
+ state_dict[name] = master_params[i]
113
+ return state_dict
114
+
115
+
116
+ def state_dict_to_master_params(model, state_dict, use_fp16):
117
+ if use_fp16:
118
+ named_model_params = [
119
+ (name, state_dict[name]) for name, _ in model.named_parameters()
120
+ ]
121
+ param_groups_and_shapes = get_param_groups_and_shapes(named_model_params)
122
+ master_params = make_master_params(param_groups_and_shapes)
123
+ else:
124
+ master_params = [state_dict[name] for name, _ in model.named_parameters()]
125
+ return master_params
126
+
127
+
128
+ def zero_master_grads(master_params):
129
+ for param in master_params:
130
+ param.grad = None
131
+
132
+
133
+ def zero_grad(model_params):
134
+ for param in model_params:
135
+ # Taken from https://pytorch.org/docs/stable/_modules/torch/optim/optimizer.html#Optimizer.add_param_group
136
+ if param.grad is not None:
137
+ param.grad.detach_()
138
+ param.grad.zero_()
139
+
140
+
141
+ def param_grad_or_zeros(param):
142
+ if param.grad is not None:
143
+ return param.grad.data.detach()
144
+ else:
145
+ return th.zeros_like(param)
146
+
147
+
148
+ class MixedPrecisionTrainer:
149
+ def __init__(
150
+ self,
151
+ *,
152
+ model,
153
+ use_fp16=False,
154
+ fp16_scale_growth=1e-3,
155
+ initial_lg_loss_scale=INITIAL_LOG_LOSS_SCALE,
156
+ ):
157
+ self.model = model
158
+ self.use_fp16 = use_fp16
159
+ self.fp16_scale_growth = fp16_scale_growth
160
+
161
+ self.model_params = list(self.model.parameters())
162
+ self.master_params = self.model_params
163
+ self.param_groups_and_shapes = None
164
+ self.lg_loss_scale = initial_lg_loss_scale
165
+
166
+ if self.use_fp16:
167
+ self.param_groups_and_shapes = get_param_groups_and_shapes(
168
+ self.model.named_parameters()
169
+ )
170
+ self.master_params = make_master_params(self.param_groups_and_shapes)
171
+ self.model.convert_to_fp16()
172
+
173
+ def zero_grad(self):
174
+ zero_grad(self.model_params)
175
+
176
+ def backward(self, loss: th.Tensor):
177
+ if self.use_fp16:
178
+ loss_scale = 2 ** self.lg_loss_scale
179
+ (loss * loss_scale).backward()
180
+ else:
181
+ loss.backward()
182
+
183
+ def optimize(self, opt: th.optim.Optimizer):
184
+ if self.use_fp16:
185
+ return self._optimize_fp16(opt)
186
+ else:
187
+ return self._optimize_normal(opt)
188
+
189
+ def _optimize_fp16(self, opt: th.optim.Optimizer):
190
+ logger.logkv_mean("lg_loss_scale", self.lg_loss_scale)
191
+ model_grads_to_master_grads(self.param_groups_and_shapes, self.master_params)
192
+ grad_norm, param_norm = self._compute_norms(grad_scale=2 ** self.lg_loss_scale)
193
+ if check_overflow(grad_norm):
194
+ self.lg_loss_scale -= 1
195
+ logger.log(f"Found NaN, decreased lg_loss_scale to {self.lg_loss_scale}")
196
+ zero_master_grads(self.master_params)
197
+ return False
198
+
199
+ logger.logkv_mean("grad_norm", grad_norm)
200
+ logger.logkv_mean("param_norm", param_norm)
201
+
202
+ for p in self.master_params:
203
+ p.grad.mul_(1.0 / (2 ** self.lg_loss_scale))
204
+ opt.step()
205
+ zero_master_grads(self.master_params)
206
+ master_params_to_model_params(self.param_groups_and_shapes, self.master_params)
207
+ self.lg_loss_scale += self.fp16_scale_growth
208
+ return True
209
+
210
+ def _optimize_normal(self, opt: th.optim.Optimizer):
211
+ grad_norm, param_norm = self._compute_norms()
212
+ logger.logkv_mean("grad_norm", grad_norm)
213
+ logger.logkv_mean("param_norm", param_norm)
214
+ opt.step()
215
+ return True
216
+
217
+ def _compute_norms(self, grad_scale=1.0):
218
+ grad_norm = 0.0
219
+ param_norm = 0.0
220
+ for p in self.master_params:
221
+ with th.no_grad():
222
+ param_norm += th.norm(p, p=2, dtype=th.float32).item() ** 2
223
+ if p.grad is not None:
224
+ grad_norm += th.norm(p.grad, p=2, dtype=th.float32).item() ** 2
225
+ return np.sqrt(grad_norm) / grad_scale, np.sqrt(param_norm)
226
+
227
+ def master_params_to_state_dict(self, master_params):
228
+ return master_params_to_state_dict(
229
+ self.model, self.param_groups_and_shapes, master_params, self.use_fp16
230
+ )
231
+
232
+ def state_dict_to_master_params(self, state_dict):
233
+ return state_dict_to_master_params(self.model, state_dict, self.use_fp16)
234
+
235
+
236
+ def check_overflow(value):
237
+ return (value == float("inf")) or (value == -float("inf")) or (value != value)
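
A minimal usage sketch of the MixedPrecisionTrainer defined above; the model, loss, data, and optimizer names are placeholders and are not part of this commit:

# Hypothetical training step using MixedPrecisionTrainer (illustrative only).
# Assumes `model` is a module that provides convert_to_fp16(), and that
# `loss_fn`, `batch`, and `target` are placeholders defined elsewhere.
import torch as th

trainer = MixedPrecisionTrainer(model=model, use_fp16=True, fp16_scale_growth=1e-3)
opt = th.optim.AdamW(trainer.master_params, lr=1e-4)

trainer.zero_grad()
loss = loss_fn(model(batch), target)
trainer.backward(loss)             # scales the loss by 2**lg_loss_scale in fp16 mode
took_step = trainer.optimize(opt)  # unscales grads, steps, copies master -> model weights
if not took_step:
    pass                           # overflow: loss scale was lowered and grads discarded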
codes/modules/logger.py ADDED
@@ -0,0 +1,495 @@
1
+ """
2
+ Logger copied from OpenAI baselines to avoid extra RL-based dependencies:
3
+ https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/logger.py
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import shutil
9
+ import os.path as osp
10
+ import json
11
+ import time
12
+ import datetime
13
+ import tempfile
14
+ import warnings
15
+ from collections import defaultdict
16
+ from contextlib import contextmanager
17
+
18
+ DEBUG = 10
19
+ INFO = 20
20
+ WARN = 30
21
+ ERROR = 40
22
+
23
+ DISABLED = 50
24
+
25
+
26
+ class KVWriter(object):
27
+ def writekvs(self, kvs):
28
+ raise NotImplementedError
29
+
30
+
31
+ class SeqWriter(object):
32
+ def writeseq(self, seq):
33
+ raise NotImplementedError
34
+
35
+
36
+ class HumanOutputFormat(KVWriter, SeqWriter):
37
+ def __init__(self, filename_or_file):
38
+ if isinstance(filename_or_file, str):
39
+ self.file = open(filename_or_file, "wt")
40
+ self.own_file = True
41
+ else:
42
+ assert hasattr(filename_or_file, "write"), (
43
+ "expected file or str, got %s" % filename_or_file
44
+ )
45
+ self.file = filename_or_file
46
+ self.own_file = False
47
+
48
+ def writekvs(self, kvs):
49
+ # Create strings for printing
50
+ key2str = {}
51
+ for (key, val) in sorted(kvs.items()):
52
+ if hasattr(val, "__float__"):
53
+ valstr = "%-8.3g" % val
54
+ else:
55
+ valstr = str(val)
56
+ key2str[self._truncate(key)] = self._truncate(valstr)
57
+
58
+ # Find max widths
59
+ if len(key2str) == 0:
60
+ print("WARNING: tried to write empty key-value dict")
61
+ return
62
+ else:
63
+ keywidth = max(map(len, key2str.keys()))
64
+ valwidth = max(map(len, key2str.values()))
65
+
66
+ # Write out the data
67
+ dashes = "-" * (keywidth + valwidth + 7)
68
+ lines = [dashes]
69
+ for (key, val) in sorted(key2str.items(), key=lambda kv: kv[0].lower()):
70
+ lines.append(
71
+ "| %s%s | %s%s |"
72
+ % (key, " " * (keywidth - len(key)), val, " " * (valwidth - len(val)))
73
+ )
74
+ lines.append(dashes)
75
+ self.file.write("\n".join(lines) + "\n")
76
+
77
+ # Flush the output to the file
78
+ self.file.flush()
79
+
80
+ def _truncate(self, s):
81
+ maxlen = 30
82
+ return s[: maxlen - 3] + "..." if len(s) > maxlen else s
83
+
84
+ def writeseq(self, seq):
85
+ seq = list(seq)
86
+ for (i, elem) in enumerate(seq):
87
+ self.file.write(elem)
88
+ if i < len(seq) - 1: # add space unless this is the last one
89
+ self.file.write(" ")
90
+ self.file.write("\n")
91
+ self.file.flush()
92
+
93
+ def close(self):
94
+ if self.own_file:
95
+ self.file.close()
96
+
97
+
98
+ class JSONOutputFormat(KVWriter):
99
+ def __init__(self, filename):
100
+ self.file = open(filename, "wt")
101
+
102
+ def writekvs(self, kvs):
103
+ for k, v in sorted(kvs.items()):
104
+ if hasattr(v, "dtype"):
105
+ kvs[k] = float(v)
106
+ self.file.write(json.dumps(kvs) + "\n")
107
+ self.file.flush()
108
+
109
+ def close(self):
110
+ self.file.close()
111
+
112
+
113
+ class CSVOutputFormat(KVWriter):
114
+ def __init__(self, filename):
115
+ self.file = open(filename, "w+t")
116
+ self.keys = []
117
+ self.sep = ","
118
+
119
+ def writekvs(self, kvs):
120
+ # Add our current row to the history
121
+ extra_keys = list(kvs.keys() - self.keys)
122
+ extra_keys.sort()
123
+ if extra_keys:
124
+ self.keys.extend(extra_keys)
125
+ self.file.seek(0)
126
+ lines = self.file.readlines()
127
+ self.file.seek(0)
128
+ for (i, k) in enumerate(self.keys):
129
+ if i > 0:
130
+ self.file.write(",")
131
+ self.file.write(k)
132
+ self.file.write("\n")
133
+ for line in lines[1:]:
134
+ self.file.write(line[:-1])
135
+ self.file.write(self.sep * len(extra_keys))
136
+ self.file.write("\n")
137
+ for (i, k) in enumerate(self.keys):
138
+ if i > 0:
139
+ self.file.write(",")
140
+ v = kvs.get(k)
141
+ if v is not None:
142
+ self.file.write(str(v))
143
+ self.file.write("\n")
144
+ self.file.flush()
145
+
146
+ def close(self):
147
+ self.file.close()
148
+
149
+
150
+ class TensorBoardOutputFormat(KVWriter):
151
+ """
152
+ Dumps key/value pairs into TensorBoard's numeric format.
153
+ """
154
+
155
+ def __init__(self, dir):
156
+ os.makedirs(dir, exist_ok=True)
157
+ self.dir = dir
158
+ self.step = 1
159
+ prefix = "events"
160
+ path = osp.join(osp.abspath(dir), prefix)
161
+ import tensorflow as tf
162
+ from tensorflow.python import pywrap_tensorflow
163
+ from tensorflow.core.util import event_pb2
164
+ from tensorflow.python.util import compat
165
+
166
+ self.tf = tf
167
+ self.event_pb2 = event_pb2
168
+ self.pywrap_tensorflow = pywrap_tensorflow
169
+ self.writer = pywrap_tensorflow.EventsWriter(compat.as_bytes(path))
170
+
171
+ def writekvs(self, kvs):
172
+ def summary_val(k, v):
173
+ kwargs = {"tag": k, "simple_value": float(v)}
174
+ return self.tf.Summary.Value(**kwargs)
175
+
176
+ summary = self.tf.Summary(value=[summary_val(k, v) for k, v in kvs.items()])
177
+ event = self.event_pb2.Event(wall_time=time.time(), summary=summary)
178
+ event.step = (
179
+ self.step
180
+ ) # is there any reason why you'd want to specify the step?
181
+ self.writer.WriteEvent(event)
182
+ self.writer.Flush()
183
+ self.step += 1
184
+
185
+ def close(self):
186
+ if self.writer:
187
+ self.writer.Close()
188
+ self.writer = None
189
+
190
+
191
+ def make_output_format(format, ev_dir, log_suffix=""):
192
+ os.makedirs(ev_dir, exist_ok=True)
193
+ if format == "stdout":
194
+ return HumanOutputFormat(sys.stdout)
195
+ elif format == "log":
196
+ return HumanOutputFormat(osp.join(ev_dir, "log%s.txt" % log_suffix))
197
+ elif format == "json":
198
+ return JSONOutputFormat(osp.join(ev_dir, "progress%s.json" % log_suffix))
199
+ elif format == "csv":
200
+ return CSVOutputFormat(osp.join(ev_dir, "progress%s.csv" % log_suffix))
201
+ elif format == "tensorboard":
202
+ return TensorBoardOutputFormat(osp.join(ev_dir, "tb%s" % log_suffix))
203
+ else:
204
+ raise ValueError("Unknown format specified: %s" % (format,))
205
+
206
+
207
+ # ================================================================
208
+ # API
209
+ # ================================================================
210
+
211
+
212
+ def logkv(key, val):
213
+ """
214
+ Log a value of some diagnostic
215
+ Call this once for each diagnostic quantity, each iteration
216
+ If called many times, last value will be used.
217
+ """
218
+ get_current().logkv(key, val)
219
+
220
+
221
+ def logkv_mean(key, val):
222
+ """
223
+ The same as logkv(), but if called many times, values averaged.
224
+ """
225
+ get_current().logkv_mean(key, val)
226
+
227
+
228
+ def logkvs(d):
229
+ """
230
+ Log a dictionary of key-value pairs
231
+ """
232
+ for (k, v) in d.items():
233
+ logkv(k, v)
234
+
235
+
236
+ def dumpkvs():
237
+ """
238
+ Write all of the diagnostics from the current iteration
239
+ """
240
+ return get_current().dumpkvs()
241
+
242
+
243
+ def getkvs():
244
+ return get_current().name2val
245
+
246
+
247
+ def log(*args, level=INFO):
248
+ """
249
+ Write the sequence of args, with no separators, to the console and output files (if you've configured an output file).
250
+ """
251
+ get_current().log(*args, level=level)
252
+
253
+
254
+ def debug(*args):
255
+ log(*args, level=DEBUG)
256
+
257
+
258
+ def info(*args):
259
+ log(*args, level=INFO)
260
+
261
+
262
+ def warn(*args):
263
+ log(*args, level=WARN)
264
+
265
+
266
+ def error(*args):
267
+ log(*args, level=ERROR)
268
+
269
+
270
+ def set_level(level):
271
+ """
272
+ Set logging threshold on current logger.
273
+ """
274
+ get_current().set_level(level)
275
+
276
+
277
+ def set_comm(comm):
278
+ get_current().set_comm(comm)
279
+
280
+
281
+ def get_dir():
282
+ """
283
+ Get directory that log files are being written to.
284
+ will be None if there is no output directory (i.e., if you didn't call start)
285
+ """
286
+ return get_current().get_dir()
287
+
288
+
289
+ record_tabular = logkv
290
+ dump_tabular = dumpkvs
291
+
292
+
293
+ @contextmanager
294
+ def profile_kv(scopename):
295
+ logkey = "wait_" + scopename
296
+ tstart = time.time()
297
+ try:
298
+ yield
299
+ finally:
300
+ get_current().name2val[logkey] += time.time() - tstart
301
+
302
+
303
+ def profile(n):
304
+ """
305
+ Usage:
306
+ @profile("my_func")
307
+ def my_func(): code
308
+ """
309
+
310
+ def decorator_with_name(func):
311
+ def func_wrapper(*args, **kwargs):
312
+ with profile_kv(n):
313
+ return func(*args, **kwargs)
314
+
315
+ return func_wrapper
316
+
317
+ return decorator_with_name
318
+
319
+
320
+ # ================================================================
321
+ # Backend
322
+ # ================================================================
323
+
324
+
325
+ def get_current():
326
+ if Logger.CURRENT is None:
327
+ _configure_default_logger()
328
+
329
+ return Logger.CURRENT
330
+
331
+
332
+ class Logger(object):
333
+ DEFAULT = None # A logger with no output files. (See right below class definition)
334
+ # So that you can still log to the terminal without setting up any output files
335
+ CURRENT = None # Current logger being used by the free functions above
336
+
337
+ def __init__(self, dir, output_formats, comm=None):
338
+ self.name2val = defaultdict(float) # values this iteration
339
+ self.name2cnt = defaultdict(int)
340
+ self.level = INFO
341
+ self.dir = dir
342
+ self.output_formats = output_formats
343
+ self.comm = comm
344
+
345
+ # Logging API, forwarded
346
+ # ----------------------------------------
347
+ def logkv(self, key, val):
348
+ self.name2val[key] = val
349
+
350
+ def logkv_mean(self, key, val):
351
+ oldval, cnt = self.name2val[key], self.name2cnt[key]
352
+ self.name2val[key] = oldval * cnt / (cnt + 1) + val / (cnt + 1)
353
+ self.name2cnt[key] = cnt + 1
354
+
355
+ def dumpkvs(self):
356
+ if self.comm is None:
357
+ d = self.name2val
358
+ else:
359
+ d = mpi_weighted_mean(
360
+ self.comm,
361
+ {
362
+ name: (val, self.name2cnt.get(name, 1))
363
+ for (name, val) in self.name2val.items()
364
+ },
365
+ )
366
+ if self.comm.rank != 0:
367
+ d["dummy"] = 1 # so we don't get a warning about empty dict
368
+ out = d.copy() # Return the dict for unit testing purposes
369
+ for fmt in self.output_formats:
370
+ if isinstance(fmt, KVWriter):
371
+ fmt.writekvs(d)
372
+ self.name2val.clear()
373
+ self.name2cnt.clear()
374
+ return out
375
+
376
+ def log(self, *args, level=INFO):
377
+ if self.level <= level:
378
+ self._do_log(args)
379
+
380
+ # Configuration
381
+ # ----------------------------------------
382
+ def set_level(self, level):
383
+ self.level = level
384
+
385
+ def set_comm(self, comm):
386
+ self.comm = comm
387
+
388
+ def get_dir(self):
389
+ return self.dir
390
+
391
+ def close(self):
392
+ for fmt in self.output_formats:
393
+ fmt.close()
394
+
395
+ # Misc
396
+ # ----------------------------------------
397
+ def _do_log(self, args):
398
+ for fmt in self.output_formats:
399
+ if isinstance(fmt, SeqWriter):
400
+ fmt.writeseq(map(str, args))
401
+
402
+
403
+ def get_rank_without_mpi_import():
404
+ # check environment variables here instead of importing mpi4py
405
+ # to avoid calling MPI_Init() when this module is imported
406
+ for varname in ["PMI_RANK", "OMPI_COMM_WORLD_RANK"]:
407
+ if varname in os.environ:
408
+ return int(os.environ[varname])
409
+ return 0
410
+
411
+
412
+ def mpi_weighted_mean(comm, local_name2valcount):
413
+ """
414
+ Copied from: https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/common/mpi_util.py#L110
415
+ Perform a weighted average over dicts that are each on a different node
416
+ Input: local_name2valcount: dict mapping key -> (value, count)
417
+ Returns: key -> mean
418
+ """
419
+ all_name2valcount = comm.gather(local_name2valcount)
420
+ if comm.rank == 0:
421
+ name2sum = defaultdict(float)
422
+ name2count = defaultdict(float)
423
+ for n2vc in all_name2valcount:
424
+ for (name, (val, count)) in n2vc.items():
425
+ try:
426
+ val = float(val)
427
+ except ValueError:
428
+ if comm.rank == 0:
429
+ warnings.warn(
430
+ "WARNING: tried to compute mean on non-float {}={}".format(
431
+ name, val
432
+ )
433
+ )
434
+ else:
435
+ name2sum[name] += val * count
436
+ name2count[name] += count
437
+ return {name: name2sum[name] / name2count[name] for name in name2sum}
438
+ else:
439
+ return {}
440
+
441
+
442
+ def configure(dir=None, format_strs=None, comm=None, log_suffix=""):
443
+ """
444
+ If comm is provided, average all numerical stats across that comm
445
+ """
446
+ if dir is None:
447
+ dir = os.getenv("OPENAI_LOGDIR")
448
+ if dir is None:
449
+ dir = osp.join(
450
+ tempfile.gettempdir(),
451
+ datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"),
452
+ )
453
+ assert isinstance(dir, str)
454
+ dir = os.path.expanduser(dir)
455
+ os.makedirs(os.path.expanduser(dir), exist_ok=True)
456
+
457
+ rank = get_rank_without_mpi_import()
458
+ if rank > 0:
459
+ log_suffix = log_suffix + "-rank%03i" % rank
460
+
461
+ if format_strs is None:
462
+ if rank == 0:
463
+ format_strs = os.getenv("OPENAI_LOG_FORMAT", "stdout,log,csv").split(",")
464
+ else:
465
+ format_strs = os.getenv("OPENAI_LOG_FORMAT_MPI", "log").split(",")
466
+ format_strs = filter(None, format_strs)
467
+ output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs]
468
+
469
+ Logger.CURRENT = Logger(dir=dir, output_formats=output_formats, comm=comm)
470
+ if output_formats:
471
+ log("Logging to %s" % dir)
472
+
473
+
474
+ def _configure_default_logger():
475
+ configure()
476
+ Logger.DEFAULT = Logger.CURRENT
477
+
478
+
479
+ def reset():
480
+ if Logger.CURRENT is not Logger.DEFAULT:
481
+ Logger.CURRENT.close()
482
+ Logger.CURRENT = Logger.DEFAULT
483
+ log("Reset logger")
484
+
485
+
486
+ @contextmanager
487
+ def scoped_configure(dir=None, format_strs=None, comm=None):
488
+ prevlogger = Logger.CURRENT
489
+ configure(dir=dir, format_strs=format_strs, comm=comm)
490
+ try:
491
+ yield
492
+ finally:
493
+ Logger.CURRENT.close()
494
+ Logger.CURRENT = prevlogger
495
+
codes/modules/nn.py ADDED
@@ -0,0 +1,170 @@
1
+ """
2
+ Various utilities for neural networks.
3
+ """
4
+
5
+ import math
6
+
7
+ import torch as th
8
+ import torch.nn as nn
9
+
10
+
11
+ # PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
12
+ class SiLU(nn.Module):
13
+ def forward(self, x):
14
+ return x * th.sigmoid(x)
15
+
16
+
17
+ class GroupNorm32(nn.GroupNorm):
18
+ def forward(self, x):
19
+ return super().forward(x.float()).type(x.dtype)
20
+
21
+
22
+ def conv_nd(dims, *args, **kwargs):
23
+ """
24
+ Create a 1D, 2D, or 3D convolution module.
25
+ """
26
+ if dims == 1:
27
+ return nn.Conv1d(*args, **kwargs)
28
+ elif dims == 2:
29
+ return nn.Conv2d(*args, **kwargs)
30
+ elif dims == 3:
31
+ return nn.Conv3d(*args, **kwargs)
32
+ raise ValueError(f"unsupported dimensions: {dims}")
33
+
34
+
35
+ def linear(*args, **kwargs):
36
+ """
37
+ Create a linear module.
38
+ """
39
+ return nn.Linear(*args, **kwargs)
40
+
41
+
42
+ def avg_pool_nd(dims, *args, **kwargs):
43
+ """
44
+ Create a 1D, 2D, or 3D average pooling module.
45
+ """
46
+ if dims == 1:
47
+ return nn.AvgPool1d(*args, **kwargs)
48
+ elif dims == 2:
49
+ return nn.AvgPool2d(*args, **kwargs)
50
+ elif dims == 3:
51
+ return nn.AvgPool3d(*args, **kwargs)
52
+ raise ValueError(f"unsupported dimensions: {dims}")
53
+
54
+
55
+ def update_ema(target_params, source_params, rate=0.99):
56
+ """
57
+ Update target parameters to be closer to those of source parameters using
58
+ an exponential moving average.
59
+
60
+ :param target_params: the target parameter sequence.
61
+ :param source_params: the source parameter sequence.
62
+ :param rate: the EMA rate (closer to 1 means slower).
63
+ """
64
+ for targ, src in zip(target_params, source_params):
65
+ targ.detach().mul_(rate).add_(src, alpha=1 - rate)
66
+
67
+
68
+ def zero_module(module):
69
+ """
70
+ Zero out the parameters of a module and return it.
71
+ """
72
+ for p in module.parameters():
73
+ p.detach().zero_()
74
+ return module
75
+
76
+
77
+ def scale_module(module, scale):
78
+ """
79
+ Scale the parameters of a module and return it.
80
+ """
81
+ for p in module.parameters():
82
+ p.detach().mul_(scale)
83
+ return module
84
+
85
+
86
+ def mean_flat(tensor):
87
+ """
88
+ Take the mean over all non-batch dimensions.
89
+ """
90
+ return tensor.mean(dim=list(range(1, len(tensor.shape))))
91
+
92
+
93
+ def normalization(channels):
94
+ """
95
+ Make a standard normalization layer.
96
+
97
+ :param channels: number of input channels.
98
+ :return: an nn.Module for normalization.
99
+ """
100
+ return GroupNorm32(32, channels)
101
+
102
+
103
+ def timestep_embedding(timesteps, dim, max_period=10000):
104
+ """
105
+ Create sinusoidal timestep embeddings.
106
+
107
+ :param timesteps: a 1-D Tensor of N indices, one per batch element.
108
+ These may be fractional.
109
+ :param dim: the dimension of the output.
110
+ :param max_period: controls the minimum frequency of the embeddings.
111
+ :return: an [N x dim] Tensor of positional embeddings.
112
+ """
113
+ half = dim // 2
114
+ freqs = th.exp(
115
+ -math.log(max_period) * th.arange(start=0, end=half, dtype=th.float32) / half
116
+ ).to(device=timesteps.device)
117
+ args = timesteps[:, None].float() * freqs[None]
118
+ embedding = th.cat([th.cos(args), th.sin(args)], dim=-1)
119
+ if dim % 2:
120
+ embedding = th.cat([embedding, th.zeros_like(embedding[:, :1])], dim=-1)
121
+ return embedding
122
+
123
+
124
+ def checkpoint(func, inputs, params, flag):
125
+ """
126
+ Evaluate a function without caching intermediate activations, allowing for
127
+ reduced memory at the expense of extra compute in the backward pass.
128
+
129
+ :param func: the function to evaluate.
130
+ :param inputs: the argument sequence to pass to `func`.
131
+ :param params: a sequence of parameters `func` depends on but does not
132
+ explicitly take as arguments.
133
+ :param flag: if False, disable gradient checkpointing.
134
+ """
135
+ if flag:
136
+ args = tuple(inputs) + tuple(params)
137
+ return CheckpointFunction.apply(func, len(inputs), *args)
138
+ else:
139
+ return func(*inputs)
140
+
141
+
142
+ class CheckpointFunction(th.autograd.Function):
143
+ @staticmethod
144
+ def forward(ctx, run_function, length, *args):
145
+ ctx.run_function = run_function
146
+ ctx.input_tensors = list(args[:length])
147
+ ctx.input_params = list(args[length:])
148
+ with th.no_grad():
149
+ output_tensors = ctx.run_function(*ctx.input_tensors)
150
+ return output_tensors
151
+
152
+ @staticmethod
153
+ def backward(ctx, *output_grads):
154
+ ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
155
+ with th.enable_grad():
156
+ # Fixes a bug where the first op in run_function modifies the
157
+ # Tensor storage in place, which is not allowed for detach()'d
158
+ # Tensors.
159
+ shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
160
+ output_tensors = ctx.run_function(*shallow_copies)
161
+ input_grads = th.autograd.grad(
162
+ output_tensors,
163
+ ctx.input_tensors + ctx.input_params,
164
+ output_grads,
165
+ allow_unused=True,
166
+ )
167
+ del ctx.input_tensors
168
+ del ctx.input_params
169
+ del output_tensors
170
+ return (None, None) + input_grads
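
A small sketch exercising the helpers above; the shapes and the model are illustrative placeholders:

# Illustrative use of timestep_embedding and update_ema (placeholder model).
import torch as th

t = th.tensor([0, 10, 999])              # one timestep per batch element
emb = timestep_embedding(t, dim=128)     # -> [3, 128] sinusoidal embeddings

ema_params = [p.clone().detach() for p in model.parameters()]  # `model` is a placeholder
update_ema(ema_params, model.parameters(), rate=0.999)         # EMA tracks the model slowly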
codes/modules/unet.py ADDED
@@ -0,0 +1,894 @@
1
+ from abc import abstractmethod
2
+
3
+ import math
4
+
5
+ import numpy as np
6
+ import torch as th
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+
10
+ from .fp16_util import convert_module_to_f16, convert_module_to_f32
11
+ from .nn import (
12
+ checkpoint,
13
+ conv_nd,
14
+ linear,
15
+ avg_pool_nd,
16
+ zero_module,
17
+ normalization,
18
+ timestep_embedding,
19
+ )
20
+
21
+
22
+ class AttentionPool2d(nn.Module):
23
+ """
24
+ Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py
25
+ """
26
+
27
+ def __init__(
28
+ self,
29
+ spacial_dim: int,
30
+ embed_dim: int,
31
+ num_heads_channels: int,
32
+ output_dim: int = None,
33
+ ):
34
+ super().__init__()
35
+ self.positional_embedding = nn.Parameter(
36
+ th.randn(embed_dim, spacial_dim ** 2 + 1) / embed_dim ** 0.5
37
+ )
38
+ self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
39
+ self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
40
+ self.num_heads = embed_dim // num_heads_channels
41
+ self.attention = QKVAttention(self.num_heads)
42
+
43
+ def forward(self, x):
44
+ b, c, *_spatial = x.shape
45
+ x = x.reshape(b, c, -1) # NC(HW)
46
+ x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1) # NC(HW+1)
47
+ x = x + self.positional_embedding[None, :, :].to(x.dtype) # NC(HW+1)
48
+ x = self.qkv_proj(x)
49
+ x = self.attention(x)
50
+ x = self.c_proj(x)
51
+ return x[:, :, 0]
52
+
53
+
54
+ class TimestepBlock(nn.Module):
55
+ """
56
+ Any module where forward() takes timestep embeddings as a second argument.
57
+ """
58
+
59
+ @abstractmethod
60
+ def forward(self, x, emb):
61
+ """
62
+ Apply the module to `x` given `emb` timestep embeddings.
63
+ """
64
+
65
+
66
+ class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
67
+ """
68
+ A sequential module that passes timestep embeddings to the children that
69
+ support it as an extra input.
70
+ """
71
+
72
+ def forward(self, x, emb):
73
+ for layer in self:
74
+ if isinstance(layer, TimestepBlock):
75
+ x = layer(x, emb)
76
+ else:
77
+ x = layer(x)
78
+ return x
79
+
80
+
81
+ class Upsample(nn.Module):
82
+ """
83
+ An upsampling layer with an optional convolution.
84
+
85
+ :param channels: channels in the inputs and outputs.
86
+ :param use_conv: a bool determining if a convolution is applied.
87
+ :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
88
+ upsampling occurs in the inner-two dimensions.
89
+ """
90
+
91
+ def __init__(self, channels, use_conv, dims=2, out_channels=None):
92
+ super().__init__()
93
+ self.channels = channels
94
+ self.out_channels = out_channels or channels
95
+ self.use_conv = use_conv
96
+ self.dims = dims
97
+ if use_conv:
98
+ self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=1)
99
+
100
+ def forward(self, x):
101
+ assert x.shape[1] == self.channels
102
+ if self.dims == 3:
103
+ x = F.interpolate(
104
+ x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest"
105
+ )
106
+ else:
107
+ x = F.interpolate(x, scale_factor=2, mode="nearest")
108
+ if self.use_conv:
109
+ x = self.conv(x)
110
+ return x
111
+
112
+
113
+ class Downsample(nn.Module):
114
+ """
115
+ A downsampling layer with an optional convolution.
116
+
117
+ :param channels: channels in the inputs and outputs.
118
+ :param use_conv: a bool determining if a convolution is applied.
119
+ :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
120
+ downsampling occurs in the inner-two dimensions.
121
+ """
122
+
123
+ def __init__(self, channels, use_conv, dims=2, out_channels=None):
124
+ super().__init__()
125
+ self.channels = channels
126
+ self.out_channels = out_channels or channels
127
+ self.use_conv = use_conv
128
+ self.dims = dims
129
+ stride = 2 if dims != 3 else (1, 2, 2)
130
+ if use_conv:
131
+ self.op = conv_nd(
132
+ dims, self.channels, self.out_channels, 3, stride=stride, padding=1
133
+ )
134
+ else:
135
+ assert self.channels == self.out_channels
136
+ self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
137
+
138
+ def forward(self, x):
139
+ assert x.shape[1] == self.channels
140
+ return self.op(x)
141
+
142
+
143
+ class ResBlock(TimestepBlock):
144
+ """
145
+ A residual block that can optionally change the number of channels.
146
+
147
+ :param channels: the number of input channels.
148
+ :param emb_channels: the number of timestep embedding channels.
149
+ :param dropout: the rate of dropout.
150
+ :param out_channels: if specified, the number of out channels.
151
+ :param use_conv: if True and out_channels is specified, use a spatial
152
+ convolution instead of a smaller 1x1 convolution to change the
153
+ channels in the skip connection.
154
+ :param dims: determines if the signal is 1D, 2D, or 3D.
155
+ :param use_checkpoint: if True, use gradient checkpointing on this module.
156
+ :param up: if True, use this block for upsampling.
157
+ :param down: if True, use this block for downsampling.
158
+ """
159
+
160
+ def __init__(
161
+ self,
162
+ channels,
163
+ emb_channels,
164
+ dropout,
165
+ out_channels=None,
166
+ use_conv=False,
167
+ use_scale_shift_norm=False,
168
+ dims=2,
169
+ use_checkpoint=False,
170
+ up=False,
171
+ down=False,
172
+ ):
173
+ super().__init__()
174
+ self.channels = channels
175
+ self.emb_channels = emb_channels
176
+ self.dropout = dropout
177
+ self.out_channels = out_channels or channels
178
+ self.use_conv = use_conv
179
+ self.use_checkpoint = use_checkpoint
180
+ self.use_scale_shift_norm = use_scale_shift_norm
181
+
182
+ self.in_layers = nn.Sequential(
183
+ normalization(channels),
184
+ nn.SiLU(),
185
+ conv_nd(dims, channels, self.out_channels, 3, padding=1),
186
+ )
187
+
188
+ self.updown = up or down
189
+
190
+ if up:
191
+ self.h_upd = Upsample(channels, False, dims)
192
+ self.x_upd = Upsample(channels, False, dims)
193
+ elif down:
194
+ self.h_upd = Downsample(channels, False, dims)
195
+ self.x_upd = Downsample(channels, False, dims)
196
+ else:
197
+ self.h_upd = self.x_upd = nn.Identity()
198
+
199
+ self.emb_layers = nn.Sequential(
200
+ nn.SiLU(),
201
+ linear(
202
+ emb_channels,
203
+ 2 * self.out_channels if use_scale_shift_norm else self.out_channels,
204
+ ),
205
+ )
206
+ self.out_layers = nn.Sequential(
207
+ normalization(self.out_channels),
208
+ nn.SiLU(),
209
+ nn.Dropout(p=dropout),
210
+ zero_module(
211
+ conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)
212
+ ),
213
+ )
214
+
215
+ if self.out_channels == channels:
216
+ self.skip_connection = nn.Identity()
217
+ elif use_conv:
218
+ self.skip_connection = conv_nd(
219
+ dims, channels, self.out_channels, 3, padding=1
220
+ )
221
+ else:
222
+ self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
223
+
224
+ def forward(self, x, emb):
225
+ """
226
+ Apply the block to a Tensor, conditioned on a timestep embedding.
227
+
228
+ :param x: an [N x C x ...] Tensor of features.
229
+ :param emb: an [N x emb_channels] Tensor of timestep embeddings.
230
+ :return: an [N x C x ...] Tensor of outputs.
231
+ """
232
+ return checkpoint(
233
+ self._forward, (x, emb), self.parameters(), self.use_checkpoint
234
+ )
235
+
236
+ def _forward(self, x, emb):
237
+ if self.updown:
238
+ in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
239
+ h = in_rest(x)
240
+ h = self.h_upd(h)
241
+ x = self.x_upd(x)
242
+ h = in_conv(h)
243
+ else:
244
+ h = self.in_layers(x)
245
+ emb_out = self.emb_layers(emb).type(h.dtype)
246
+ while len(emb_out.shape) < len(h.shape):
247
+ emb_out = emb_out[..., None]
248
+ if self.use_scale_shift_norm:
249
+ out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
250
+ scale, shift = th.chunk(emb_out, 2, dim=1)
251
+ h = out_norm(h) * (1 + scale) + shift
252
+ h = out_rest(h)
253
+ else:
254
+ h = h + emb_out
255
+ h = self.out_layers(h)
256
+ return self.skip_connection(x) + h
257
+
258
+
259
+ class AttentionBlock(nn.Module):
260
+ """
261
+ An attention block that allows spatial positions to attend to each other.
262
+
263
+ Originally ported from here, but adapted to the N-d case.
264
+ https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
265
+ """
266
+
267
+ def __init__(
268
+ self,
269
+ channels,
270
+ num_heads=1,
271
+ num_head_channels=-1,
272
+ use_checkpoint=False,
273
+ use_new_attention_order=False,
274
+ ):
275
+ super().__init__()
276
+ self.channels = channels
277
+ if num_head_channels == -1:
278
+ self.num_heads = num_heads
279
+ else:
280
+ assert (
281
+ channels % num_head_channels == 0
282
+ ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
283
+ self.num_heads = channels // num_head_channels
284
+ self.use_checkpoint = use_checkpoint
285
+ self.norm = normalization(channels)
286
+ self.qkv = conv_nd(1, channels, channels * 3, 1)
287
+ if use_new_attention_order:
288
+ # split qkv before split heads
289
+ self.attention = QKVAttention(self.num_heads)
290
+ else:
291
+ # split heads before split qkv
292
+ self.attention = QKVAttentionLegacy(self.num_heads)
293
+
294
+ self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
295
+
296
+ def forward(self, x):
297
+ return checkpoint(self._forward, (x,), self.parameters(), True)
298
+
299
+ def _forward(self, x):
300
+ b, c, *spatial = x.shape
301
+ x = x.reshape(b, c, -1)
302
+ qkv = self.qkv(self.norm(x))
303
+ h = self.attention(qkv)
304
+ h = self.proj_out(h)
305
+ return (x + h).reshape(b, c, *spatial)
306
+
307
+
308
+ def count_flops_attn(model, _x, y):
309
+ """
310
+ A counter for the `thop` package to count the operations in an
311
+ attention operation.
312
+ Meant to be used like:
313
+ macs, params = thop.profile(
314
+ model,
315
+ inputs=(inputs, timestamps),
316
+ custom_ops={QKVAttention: QKVAttention.count_flops},
317
+ )
318
+ """
319
+ b, c, *spatial = y[0].shape
320
+ num_spatial = int(np.prod(spatial))
321
+ # We perform two matmuls with the same number of ops.
322
+ # The first computes the weight matrix, the second computes
323
+ # the combination of the value vectors.
324
+ matmul_ops = 2 * b * (num_spatial ** 2) * c
325
+ model.total_ops += th.DoubleTensor([matmul_ops])
326
+
327
+
328
+ class QKVAttentionLegacy(nn.Module):
329
+ """
330
+ A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping
331
+ """
332
+
333
+ def __init__(self, n_heads):
334
+ super().__init__()
335
+ self.n_heads = n_heads
336
+
337
+ def forward(self, qkv):
338
+ """
339
+ Apply QKV attention.
340
+
341
+ :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
342
+ :return: an [N x (H * C) x T] tensor after attention.
343
+ """
344
+ bs, width, length = qkv.shape
345
+ assert width % (3 * self.n_heads) == 0
346
+ ch = width // (3 * self.n_heads)
347
+ q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
348
+ scale = 1 / math.sqrt(math.sqrt(ch))
349
+ weight = th.einsum(
350
+ "bct,bcs->bts", q * scale, k * scale
351
+ ) # More stable with f16 than dividing afterwards
352
+ weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
353
+ a = th.einsum("bts,bcs->bct", weight, v)
354
+ return a.reshape(bs, -1, length)
355
+
356
+ @staticmethod
357
+ def count_flops(model, _x, y):
358
+ return count_flops_attn(model, _x, y)
359
+
360
+
361
+ class QKVAttention(nn.Module):
362
+ """
363
+ A module which performs QKV attention and splits in a different order.
364
+ """
365
+
366
+ def __init__(self, n_heads):
367
+ super().__init__()
368
+ self.n_heads = n_heads
369
+
370
+ def forward(self, qkv):
371
+ """
372
+ Apply QKV attention.
373
+
374
+ :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
375
+ :return: an [N x (H * C) x T] tensor after attention.
376
+ """
377
+ bs, width, length = qkv.shape
378
+ assert width % (3 * self.n_heads) == 0
379
+ ch = width // (3 * self.n_heads)
380
+ q, k, v = qkv.chunk(3, dim=1)
381
+ scale = 1 / math.sqrt(math.sqrt(ch))
382
+ weight = th.einsum(
383
+ "bct,bcs->bts",
384
+ (q * scale).view(bs * self.n_heads, ch, length),
385
+ (k * scale).view(bs * self.n_heads, ch, length),
386
+ ) # More stable with f16 than dividing afterwards
387
+ weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
388
+ a = th.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length))
389
+ return a.reshape(bs, -1, length)
390
+
391
+ @staticmethod
392
+ def count_flops(model, _x, y):
393
+ return count_flops_attn(model, _x, y)
394
+
395
+
396
+ class UNetModel(nn.Module):
397
+ """
398
+ The full UNet model with attention and timestep embedding.
399
+
400
+ :param in_channels: channels in the input Tensor.
401
+ :param model_channels: base channel count for the model.
402
+ :param out_channels: channels in the output Tensor.
403
+ :param num_res_blocks: number of residual blocks per downsample.
404
+ :param attention_resolutions: a collection of downsample rates at which
405
+ attention will take place. May be a set, list, or tuple.
406
+ For example, if this contains 4, then at 4x downsampling, attention
407
+ will be used.
408
+ :param dropout: the dropout probability.
409
+ :param channel_mult: channel multiplier for each level of the UNet.
410
+ :param conv_resample: if True, use learned convolutions for upsampling and
411
+ downsampling.
412
+ :param dims: determines if the signal is 1D, 2D, or 3D.
413
+ :param num_classes: if specified (as an int), then this model will be
414
+ class-conditional with `num_classes` classes.
415
+ :param use_checkpoint: use gradient checkpointing to reduce memory usage.
416
+ :param num_heads: the number of attention heads in each attention layer.
417
+ :param num_head_channels: if specified, ignore num_heads and instead use
418
+ a fixed channel width per attention head.
419
+ :param num_heads_upsample: works with num_heads to set a different number
420
+ of heads for upsampling. Deprecated.
421
+ :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
422
+ :param resblock_updown: use residual blocks for up/downsampling.
423
+ :param use_new_attention_order: use a different attention pattern for potentially
424
+ increased efficiency.
425
+ """
426
+
427
+ def __init__(
428
+ self,
429
+ image_size,
430
+ in_channels,
431
+ model_channels,
432
+ out_channels,
433
+ num_res_blocks,
434
+ attention_resolutions,
435
+ dropout=0,
436
+ channel_mult=(1, 2, 4, 8),
437
+ conv_resample=True,
438
+ dims=2,
439
+ num_classes=None,
440
+ use_checkpoint=False,
441
+ use_fp16=False,
442
+ num_heads=1,
443
+ num_head_channels=-1,
444
+ num_heads_upsample=-1,
445
+ use_scale_shift_norm=False,
446
+ resblock_updown=False,
447
+ use_new_attention_order=False,
448
+ ):
449
+ super().__init__()
450
+
451
+ if num_heads_upsample == -1:
452
+ num_heads_upsample = num_heads
453
+
454
+ self.image_size = image_size
455
+ self.in_channels = in_channels
456
+ self.model_channels = model_channels
457
+ self.out_channels = out_channels
458
+ self.num_res_blocks = num_res_blocks
459
+ self.attention_resolutions = attention_resolutions
460
+ self.dropout = dropout
461
+ self.channel_mult = channel_mult
462
+ self.conv_resample = conv_resample
463
+ self.num_classes = num_classes
464
+ self.use_checkpoint = use_checkpoint
465
+ self.dtype = th.float16 if use_fp16 else th.float32
466
+ self.num_heads = num_heads
467
+ self.num_head_channels = num_head_channels
468
+ self.num_heads_upsample = num_heads_upsample
469
+
470
+ time_embed_dim = model_channels * 4
471
+ self.time_embed = nn.Sequential(
472
+ linear(model_channels, time_embed_dim),
473
+ nn.SiLU(),
474
+ linear(time_embed_dim, time_embed_dim),
475
+ )
476
+
477
+ if self.num_classes is not None:
478
+ self.label_emb = nn.Embedding(num_classes, time_embed_dim)
479
+
480
+ ch = input_ch = int(channel_mult[0] * model_channels)
481
+ self.input_blocks = nn.ModuleList(
482
+ [TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))]
483
+ )
484
+ self._feature_size = ch
485
+ input_block_chans = [ch]
486
+ ds = 1
487
+ for level, mult in enumerate(channel_mult):
488
+ for _ in range(num_res_blocks):
489
+ layers = [
490
+ ResBlock(
491
+ ch,
492
+ time_embed_dim,
493
+ dropout,
494
+ out_channels=int(mult * model_channels),
495
+ dims=dims,
496
+ use_checkpoint=use_checkpoint,
497
+ use_scale_shift_norm=use_scale_shift_norm,
498
+ )
499
+ ]
500
+ ch = int(mult * model_channels)
501
+ if ds in attention_resolutions:
502
+ layers.append(
503
+ AttentionBlock(
504
+ ch,
505
+ use_checkpoint=use_checkpoint,
506
+ num_heads=num_heads,
507
+ num_head_channels=num_head_channels,
508
+ use_new_attention_order=use_new_attention_order,
509
+ )
510
+ )
511
+ self.input_blocks.append(TimestepEmbedSequential(*layers))
512
+ self._feature_size += ch
513
+ input_block_chans.append(ch)
514
+ if level != len(channel_mult) - 1:
515
+ out_ch = ch
516
+ self.input_blocks.append(
517
+ TimestepEmbedSequential(
518
+ ResBlock(
519
+ ch,
520
+ time_embed_dim,
521
+ dropout,
522
+ out_channels=out_ch,
523
+ dims=dims,
524
+ use_checkpoint=use_checkpoint,
525
+ use_scale_shift_norm=use_scale_shift_norm,
526
+ down=True,
527
+ )
528
+ if resblock_updown
529
+ else Downsample(
530
+ ch, conv_resample, dims=dims, out_channels=out_ch
531
+ )
532
+ )
533
+ )
534
+ ch = out_ch
535
+ input_block_chans.append(ch)
536
+ ds *= 2
537
+ self._feature_size += ch
538
+
539
+ self.middle_block = TimestepEmbedSequential(
540
+ ResBlock(
541
+ ch,
542
+ time_embed_dim,
543
+ dropout,
544
+ dims=dims,
545
+ use_checkpoint=use_checkpoint,
546
+ use_scale_shift_norm=use_scale_shift_norm,
547
+ ),
548
+ AttentionBlock(
549
+ ch,
550
+ use_checkpoint=use_checkpoint,
551
+ num_heads=num_heads,
552
+ num_head_channels=num_head_channels,
553
+ use_new_attention_order=use_new_attention_order,
554
+ ),
555
+ ResBlock(
556
+ ch,
557
+ time_embed_dim,
558
+ dropout,
559
+ dims=dims,
560
+ use_checkpoint=use_checkpoint,
561
+ use_scale_shift_norm=use_scale_shift_norm,
562
+ ),
563
+ )
564
+ self._feature_size += ch
565
+
566
+ self.output_blocks = nn.ModuleList([])
567
+ for level, mult in list(enumerate(channel_mult))[::-1]:
568
+ for i in range(num_res_blocks + 1):
569
+ ich = input_block_chans.pop()
570
+ layers = [
571
+ ResBlock(
572
+ ch + ich,
573
+ time_embed_dim,
574
+ dropout,
575
+ out_channels=int(model_channels * mult),
576
+ dims=dims,
577
+ use_checkpoint=use_checkpoint,
578
+ use_scale_shift_norm=use_scale_shift_norm,
579
+ )
580
+ ]
581
+ ch = int(model_channels * mult)
582
+ if ds in attention_resolutions:
583
+ layers.append(
584
+ AttentionBlock(
585
+ ch,
586
+ use_checkpoint=use_checkpoint,
587
+ num_heads=num_heads_upsample,
588
+ num_head_channels=num_head_channels,
589
+ use_new_attention_order=use_new_attention_order,
590
+ )
591
+ )
592
+ if level and i == num_res_blocks:
593
+ out_ch = ch
594
+ layers.append(
595
+ ResBlock(
596
+ ch,
597
+ time_embed_dim,
598
+ dropout,
599
+ out_channels=out_ch,
600
+ dims=dims,
601
+ use_checkpoint=use_checkpoint,
602
+ use_scale_shift_norm=use_scale_shift_norm,
603
+ up=True,
604
+ )
605
+ if resblock_updown
606
+ else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
607
+ )
608
+ ds //= 2
609
+ self.output_blocks.append(TimestepEmbedSequential(*layers))
610
+ self._feature_size += ch
611
+
612
+ self.out = nn.Sequential(
613
+ normalization(ch),
614
+ nn.SiLU(),
615
+ zero_module(conv_nd(dims, input_ch, out_channels, 3, padding=1)),
616
+ )
617
+
618
+ def convert_to_fp16(self):
619
+ """
620
+ Convert the torso of the model to float16.
621
+ """
622
+ self.input_blocks.apply(convert_module_to_f16)
623
+ self.middle_block.apply(convert_module_to_f16)
624
+ self.output_blocks.apply(convert_module_to_f16)
625
+
626
+ def convert_to_fp32(self):
627
+ """
628
+ Convert the torso of the model to float32.
629
+ """
630
+ self.input_blocks.apply(convert_module_to_f32)
631
+ self.middle_block.apply(convert_module_to_f32)
632
+ self.output_blocks.apply(convert_module_to_f32)
633
+
634
+ def forward(self, x, timesteps, y=None):
635
+ """
636
+ Apply the model to an input batch.
637
+
638
+ :param x: an [N x C x ...] Tensor of inputs.
639
+ :param timesteps: a 1-D batch of timesteps.
640
+ :param y: an [N] Tensor of labels, if class-conditional.
641
+ :return: an [N x C x ...] Tensor of outputs.
642
+ """
643
+ assert (y is not None) == (
644
+ self.num_classes is not None
645
+ ), "must specify y if and only if the model is class-conditional"
646
+
647
+ hs = []
648
+ emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
649
+
650
+ if self.num_classes is not None:
651
+ assert y.shape == (x.shape[0],)
652
+ emb = emb + self.label_emb(y)
653
+
654
+ h = x.type(self.dtype)
655
+ for module in self.input_blocks:
656
+ h = module(h, emb)
657
+ hs.append(h)
658
+ h = self.middle_block(h, emb)
659
+ for module in self.output_blocks:
660
+ h = th.cat([h, hs.pop()], dim=1)
661
+ h = module(h, emb)
662
+ h = h.type(x.dtype)
663
+ return self.out(h)
664
+
665
+
666
+ class SuperResModel(UNetModel):
667
+ """
668
+ A UNetModel that performs super-resolution.
669
+
670
+ Expects an extra kwarg `low_res` to condition on a low-resolution image.
671
+ """
672
+
673
+ def __init__(self, image_size, in_channels, *args, **kwargs):
674
+ super().__init__(image_size, in_channels * 2, *args, **kwargs)
675
+
676
+ def forward(self, x, timesteps, low_res=None, **kwargs):
677
+ _, _, new_height, new_width = x.shape
678
+ upsampled = F.interpolate(low_res, (new_height, new_width), mode="bilinear")
679
+ x = th.cat([x, upsampled], dim=1)
680
+ return super().forward(x, timesteps, **kwargs)
681
+
682
+
683
+ class EncoderUNetModel(nn.Module):
684
+ """
685
+ The half UNet model with attention and timestep embedding.
686
+
687
+ For usage, see UNet.
688
+ """
689
+
690
+ def __init__(
691
+ self,
692
+ image_size,
693
+ in_channels,
694
+ model_channels,
695
+ out_channels,
696
+ num_res_blocks,
697
+ attention_resolutions,
698
+ dropout=0,
699
+ channel_mult=(1, 2, 4, 8),
700
+ conv_resample=True,
701
+ dims=2,
702
+ use_checkpoint=False,
703
+ use_fp16=False,
704
+ num_heads=1,
705
+ num_head_channels=-1,
706
+ num_heads_upsample=-1,
707
+ use_scale_shift_norm=False,
708
+ resblock_updown=False,
709
+ use_new_attention_order=False,
710
+ pool="adaptive",
711
+ ):
712
+ super().__init__()
713
+
714
+ if num_heads_upsample == -1:
715
+ num_heads_upsample = num_heads
716
+
717
+ self.in_channels = in_channels
718
+ self.model_channels = model_channels
719
+ self.out_channels = out_channels
720
+ self.num_res_blocks = num_res_blocks
721
+ self.attention_resolutions = attention_resolutions
722
+ self.dropout = dropout
723
+ self.channel_mult = channel_mult
724
+ self.conv_resample = conv_resample
725
+ self.use_checkpoint = use_checkpoint
726
+ self.dtype = th.float16 if use_fp16 else th.float32
727
+ self.num_heads = num_heads
728
+ self.num_head_channels = num_head_channels
729
+ self.num_heads_upsample = num_heads_upsample
730
+
731
+ time_embed_dim = model_channels * 4
732
+ self.time_embed = nn.Sequential(
733
+ linear(model_channels, time_embed_dim),
734
+ nn.SiLU(),
735
+ linear(time_embed_dim, time_embed_dim),
736
+ )
737
+
738
+ ch = int(channel_mult[0] * model_channels)
739
+ self.input_blocks = nn.ModuleList(
740
+ [TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))]
741
+ )
742
+ self._feature_size = ch
743
+ input_block_chans = [ch]
744
+ ds = 1
745
+ for level, mult in enumerate(channel_mult):
746
+ for _ in range(num_res_blocks):
747
+ layers = [
748
+ ResBlock(
749
+ ch,
750
+ time_embed_dim,
751
+ dropout,
752
+ out_channels=int(mult * model_channels),
753
+ dims=dims,
754
+ use_checkpoint=use_checkpoint,
755
+ use_scale_shift_norm=use_scale_shift_norm,
756
+ )
757
+ ]
758
+ ch = int(mult * model_channels)
759
+ if ds in attention_resolutions:
760
+ layers.append(
761
+ AttentionBlock(
762
+ ch,
763
+ use_checkpoint=use_checkpoint,
764
+ num_heads=num_heads,
765
+ num_head_channels=num_head_channels,
766
+ use_new_attention_order=use_new_attention_order,
767
+ )
768
+ )
769
+ self.input_blocks.append(TimestepEmbedSequential(*layers))
770
+ self._feature_size += ch
771
+ input_block_chans.append(ch)
772
+ if level != len(channel_mult) - 1:
773
+ out_ch = ch
774
+ self.input_blocks.append(
775
+ TimestepEmbedSequential(
776
+ ResBlock(
777
+ ch,
778
+ time_embed_dim,
779
+ dropout,
780
+ out_channels=out_ch,
781
+ dims=dims,
782
+ use_checkpoint=use_checkpoint,
783
+ use_scale_shift_norm=use_scale_shift_norm,
784
+ down=True,
785
+ )
786
+ if resblock_updown
787
+ else Downsample(
788
+ ch, conv_resample, dims=dims, out_channels=out_ch
789
+ )
790
+ )
791
+ )
792
+ ch = out_ch
793
+ input_block_chans.append(ch)
794
+ ds *= 2
795
+ self._feature_size += ch
796
+
797
+ self.middle_block = TimestepEmbedSequential(
798
+ ResBlock(
799
+ ch,
800
+ time_embed_dim,
801
+ dropout,
802
+ dims=dims,
803
+ use_checkpoint=use_checkpoint,
804
+ use_scale_shift_norm=use_scale_shift_norm,
805
+ ),
806
+ AttentionBlock(
807
+ ch,
808
+ use_checkpoint=use_checkpoint,
809
+ num_heads=num_heads,
810
+ num_head_channels=num_head_channels,
811
+ use_new_attention_order=use_new_attention_order,
812
+ ),
813
+ ResBlock(
814
+ ch,
815
+ time_embed_dim,
816
+ dropout,
817
+ dims=dims,
818
+ use_checkpoint=use_checkpoint,
819
+ use_scale_shift_norm=use_scale_shift_norm,
820
+ ),
821
+ )
822
+ self._feature_size += ch
823
+ self.pool = pool
824
+ if pool == "adaptive":
825
+ self.out = nn.Sequential(
826
+ normalization(ch),
827
+ nn.SiLU(),
828
+ nn.AdaptiveAvgPool2d((1, 1)),
829
+ zero_module(conv_nd(dims, ch, out_channels, 1)),
830
+ nn.Flatten(),
831
+ )
832
+ elif pool == "attention":
833
+ assert num_head_channels != -1
834
+ self.out = nn.Sequential(
835
+ normalization(ch),
836
+ nn.SiLU(),
837
+ AttentionPool2d(
838
+ (image_size // ds), ch, num_head_channels, out_channels
839
+ ),
840
+ )
841
+ elif pool == "spatial":
842
+ self.out = nn.Sequential(
843
+ nn.Linear(self._feature_size, 2048),
844
+ nn.ReLU(),
845
+ nn.Linear(2048, self.out_channels),
846
+ )
847
+ elif pool == "spatial_v2":
848
+ self.out = nn.Sequential(
849
+ nn.Linear(self._feature_size, 2048),
850
+ normalization(2048),
851
+ nn.SiLU(),
852
+ nn.Linear(2048, self.out_channels),
853
+ )
854
+ else:
855
+ raise NotImplementedError(f"Unexpected {pool} pooling")
856
+
857
+ def convert_to_fp16(self):
858
+ """
859
+ Convert the torso of the model to float16.
860
+ """
861
+ self.input_blocks.apply(convert_module_to_f16)
862
+ self.middle_block.apply(convert_module_to_f16)
863
+
864
+ def convert_to_fp32(self):
865
+ """
866
+ Convert the torso of the model to float32.
867
+ """
868
+ self.input_blocks.apply(convert_module_to_f32)
869
+ self.middle_block.apply(convert_module_to_f32)
870
+
871
+ def forward(self, x, timesteps):
872
+ """
873
+ Apply the model to an input batch.
874
+
875
+ :param x: an [N x C x ...] Tensor of inputs.
876
+ :param timesteps: a 1-D batch of timesteps.
877
+ :return: an [N x K] Tensor of outputs.
878
+ """
879
+ emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
880
+
881
+ results = []
882
+ h = x.type(self.dtype)
883
+ for module in self.input_blocks:
884
+ h = module(h, emb)
885
+ if self.pool.startswith("spatial"):
886
+ results.append(h.type(x.dtype).mean(dim=(2, 3)))
887
+ h = self.middle_block(h, emb)
888
+ if self.pool.startswith("spatial"):
889
+ results.append(h.type(x.dtype).mean(dim=(2, 3)))
890
+ h = th.cat(results, axis=-1)
891
+ return self.out(h)
892
+ else:
893
+ h = h.type(x.dtype)
894
+ return self.out(h)
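
An illustrative instantiation of the UNetModel defined above; the hyperparameters are placeholders rather than the settings used elsewhere in this repository:

# Illustrative UNetModel construction and forward pass (hyperparameters are placeholders).
import torch as th

unet = UNetModel(
    image_size=64,
    in_channels=3,
    model_channels=128,
    out_channels=3,
    num_res_blocks=2,
    attention_resolutions=(8,),   # attend once the feature map is downsampled 8x
    channel_mult=(1, 2, 4, 8),
    num_heads=4,
)
x = th.randn(2, 3, 64, 64)        # noisy input batch
t = th.randint(0, 1000, (2,))     # one timestep per sample
out = unet(x, t)                  # -> [2, 3, 64, 64]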
codes/modules/unet_2d.py ADDED
@@ -0,0 +1,334 @@
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from dataclasses import dataclass
15
+ from typing import Optional, Tuple, Union
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+
20
+ from ..configuration_utils import ConfigMixin, register_to_config
21
+ from ..utils import BaseOutput
22
+ from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps
23
+ from .modeling_utils import ModelMixin
24
+ from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block
25
+
26
+
27
+ @dataclass
28
+ class UNet2DOutput(BaseOutput):
29
+ """
30
+ The output of [`UNet2DModel`].
31
+
32
+ Args:
33
+ sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
34
+ The hidden states output from the last layer of the model.
35
+ """
36
+
37
+ sample: torch.FloatTensor
38
+
39
+
40
+ class UNet2DModel(ModelMixin, ConfigMixin):
41
+ r"""
42
+ A 2D UNet model that takes a noisy sample and a timestep and returns a sample shaped output.
43
+
44
+ This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
45
+ for all models (such as downloading or saving).
46
+
47
+ Parameters:
48
+ sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
49
+ Height and width of input/output sample. Dimensions must be a multiple of `2 ** (len(block_out_channels) -
50
+ 1)`.
51
+ in_channels (`int`, *optional*, defaults to 3): Number of channels in the input sample.
52
+ out_channels (`int`, *optional*, defaults to 3): Number of channels in the output.
53
+ center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
54
+ time_embedding_type (`str`, *optional*, defaults to `"positional"`): Type of time embedding to use.
55
+ freq_shift (`int`, *optional*, defaults to 0): Frequency shift for Fourier time embedding.
56
+ flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
57
+ Whether to flip sin to cos for Fourier time embedding.
58
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D")`):
59
+ Tuple of downsample block types.
60
+ mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2D"`):
61
+ Block type for middle of UNet, it can be either `UNetMidBlock2D` or `UnCLIPUNetMidBlock2D`.
62
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D")`):
63
+ Tuple of upsample block types.
64
+ block_out_channels (`Tuple[int]`, *optional*, defaults to `(224, 448, 672, 896)`):
65
+ Tuple of block output channels.
66
+ layers_per_block (`int`, *optional*, defaults to `2`): The number of layers per block.
67
+ mid_block_scale_factor (`float`, *optional*, defaults to `1`): The scale factor for the mid block.
68
+ downsample_padding (`int`, *optional*, defaults to `1`): The padding for the downsample convolution.
69
+ downsample_type (`str`, *optional*, defaults to `conv`):
70
+ The downsample type for downsampling layers. Choose between "conv" and "resnet"
71
+ upsample_type (`str`, *optional*, defaults to `conv`):
72
+ The upsample type for upsampling layers. Choose between "conv" and "resnet"
73
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
74
+ act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
75
+ attention_head_dim (`int`, *optional*, defaults to `8`): The attention head dimension.
76
+ norm_num_groups (`int`, *optional*, defaults to `32`): The number of groups for normalization.
77
+ norm_eps (`float`, *optional*, defaults to `1e-5`): The epsilon for normalization.
78
+ resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
79
+ for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
80
+ class_embed_type (`str`, *optional*, defaults to `None`):
81
+ The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
82
+ `"timestep"`, or `"identity"`.
83
+ num_class_embeds (`int`, *optional*, defaults to `None`):
84
+ Input dimension of the learnable embedding matrix to be projected to `time_embed_dim` when performing class
85
+ conditioning with `class_embed_type` equal to `None`.
86
+ """
87
+
88
+ @register_to_config
89
+ def __init__(
90
+ self,
91
+ sample_size: Optional[Union[int, Tuple[int, int]]] = None,
92
+ in_channels: int = 3,
93
+ out_channels: int = 3,
94
+ center_input_sample: bool = False,
95
+ time_embedding_type: str = "positional",
96
+ freq_shift: int = 0,
97
+ flip_sin_to_cos: bool = True,
98
+ down_block_types: Tuple[str] = ("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D"),
99
+ up_block_types: Tuple[str] = ("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D"),
100
+ block_out_channels: Tuple[int] = (224, 448, 672, 896),
101
+ layers_per_block: int = 2,
102
+ mid_block_scale_factor: float = 1,
103
+ downsample_padding: int = 1,
104
+ downsample_type: str = "conv",
105
+ upsample_type: str = "conv",
106
+ dropout: float = 0.0,
107
+ act_fn: str = "silu",
108
+ attention_head_dim: Optional[int] = 8,
109
+ norm_num_groups: int = 32,
110
+ norm_eps: float = 1e-5,
111
+ resnet_time_scale_shift: str = "default",
112
+ add_attention: bool = True,
113
+ class_embed_type: Optional[str] = None,
114
+ num_class_embeds: Optional[int] = None,
115
+ ):
116
+ super().__init__()
117
+
118
+ self.sample_size = sample_size
119
+ time_embed_dim = block_out_channels[0] * 4
120
+
121
+ # Check inputs
122
+ if len(down_block_types) != len(up_block_types):
123
+ raise ValueError(
124
+ f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
125
+ )
126
+
127
+ if len(block_out_channels) != len(down_block_types):
128
+ raise ValueError(
129
+ f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
130
+ )
131
+
132
+ # input
133
+ self.conv_in = nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))
134
+
135
+ # time
136
+ if time_embedding_type == "fourier":
137
+ self.time_proj = GaussianFourierProjection(embedding_size=block_out_channels[0], scale=16)
138
+ timestep_input_dim = 2 * block_out_channels[0]
139
+ elif time_embedding_type == "positional":
140
+ self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
141
+ timestep_input_dim = block_out_channels[0]
142
+
143
+ self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
144
+
145
+ # class embedding
146
+ if class_embed_type is None and num_class_embeds is not None:
147
+ self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
148
+ elif class_embed_type == "timestep":
149
+ self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
150
+ elif class_embed_type == "identity":
151
+ self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
152
+ else:
153
+ self.class_embedding = None
154
+
155
+ self.down_blocks = nn.ModuleList([])
156
+ self.mid_block = None
157
+ self.up_blocks = nn.ModuleList([])
158
+
159
+ # down
160
+ output_channel = block_out_channels[0]
161
+ for i, down_block_type in enumerate(down_block_types):
162
+ input_channel = output_channel
163
+ output_channel = block_out_channels[i]
164
+ is_final_block = i == len(block_out_channels) - 1
165
+
166
+ down_block = get_down_block(
167
+ down_block_type,
168
+ num_layers=layers_per_block,
169
+ in_channels=input_channel,
170
+ out_channels=output_channel,
171
+ temb_channels=time_embed_dim,
172
+ add_downsample=not is_final_block,
173
+ resnet_eps=norm_eps,
174
+ resnet_act_fn=act_fn,
175
+ resnet_groups=norm_num_groups,
176
+ attention_head_dim=attention_head_dim if attention_head_dim is not None else output_channel,
177
+ downsample_padding=downsample_padding,
178
+ resnet_time_scale_shift=resnet_time_scale_shift,
179
+ downsample_type=downsample_type,
180
+ dropout=dropout,
181
+ )
182
+ self.down_blocks.append(down_block)
183
+
184
+ # mid
185
+ self.mid_block = UNetMidBlock2D(
186
+ in_channels=block_out_channels[-1],
187
+ temb_channels=time_embed_dim,
188
+ dropout=dropout,
189
+ resnet_eps=norm_eps,
190
+ resnet_act_fn=act_fn,
191
+ output_scale_factor=mid_block_scale_factor,
192
+ resnet_time_scale_shift=resnet_time_scale_shift,
193
+ attention_head_dim=attention_head_dim if attention_head_dim is not None else block_out_channels[-1],
194
+ resnet_groups=norm_num_groups,
195
+ add_attention=add_attention,
196
+ )
197
+
198
+ # up
199
+ reversed_block_out_channels = list(reversed(block_out_channels))
200
+ output_channel = reversed_block_out_channels[0]
201
+ for i, up_block_type in enumerate(up_block_types):
202
+ prev_output_channel = output_channel
203
+ output_channel = reversed_block_out_channels[i]
204
+ input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
205
+
206
+ is_final_block = i == len(block_out_channels) - 1
207
+
208
+ up_block = get_up_block(
209
+ up_block_type,
210
+ num_layers=layers_per_block + 1,
211
+ in_channels=input_channel,
212
+ out_channels=output_channel,
213
+ prev_output_channel=prev_output_channel,
214
+ temb_channels=time_embed_dim,
215
+ add_upsample=not is_final_block,
216
+ resnet_eps=norm_eps,
217
+ resnet_act_fn=act_fn,
218
+ resnet_groups=norm_num_groups,
219
+ attention_head_dim=attention_head_dim if attention_head_dim is not None else output_channel,
220
+ resnet_time_scale_shift=resnet_time_scale_shift,
221
+ upsample_type=upsample_type,
222
+ dropout=dropout,
223
+ )
224
+ self.up_blocks.append(up_block)
225
+ prev_output_channel = output_channel
226
+
227
+ # out
228
+ num_groups_out = norm_num_groups if norm_num_groups is not None else min(block_out_channels[0] // 4, 32)
229
+ self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=num_groups_out, eps=norm_eps)
230
+ self.conv_act = nn.SiLU()
231
+ self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, kernel_size=3, padding=1)
232
+
233
+ def forward(
234
+ self,
235
+ sample: torch.FloatTensor,
236
+ timestep: Union[torch.Tensor, float, int],
237
+ class_labels: Optional[torch.Tensor] = None,
238
+ return_dict: bool = True,
239
+ ) -> Union[UNet2DOutput, Tuple]:
240
+ r"""
241
+ The [`UNet2DModel`] forward method.
242
+
243
+ Args:
244
+ sample (`torch.FloatTensor`):
245
+ The noisy input tensor with the following shape `(batch, channel, height, width)`.
246
+ timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
247
+ class_labels (`torch.FloatTensor`, *optional*, defaults to `None`):
248
+ Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
249
+ return_dict (`bool`, *optional*, defaults to `True`):
250
+ Whether or not to return a [`~models.unet_2d.UNet2DOutput`] instead of a plain tuple.
251
+
252
+ Returns:
253
+ [`~models.unet_2d.UNet2DOutput`] or `tuple`:
254
+ If `return_dict` is True, an [`~models.unet_2d.UNet2DOutput`] is returned, otherwise a `tuple` is
255
+ returned where the first element is the sample tensor.
256
+ """
257
+ # 0. center input if necessary
258
+ if self.config.center_input_sample:
259
+ sample = 2 * sample - 1.0
260
+
261
+ # 1. time
262
+ timesteps = timestep
263
+ if not torch.is_tensor(timesteps):
264
+ timesteps = torch.tensor([timesteps], dtype=torch.long, device=sample.device)
265
+ elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0:
266
+ timesteps = timesteps[None].to(sample.device)
267
+
268
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
269
+ timesteps = timesteps * torch.ones(sample.shape[0], dtype=timesteps.dtype, device=timesteps.device)
270
+
271
+ t_emb = self.time_proj(timesteps)
272
+
273
+ # timesteps does not contain any weights and will always return f32 tensors
274
+ # but time_embedding might actually be running in fp16. so we need to cast here.
275
+ # there might be better ways to encapsulate this.
276
+ t_emb = t_emb.to(dtype=self.dtype)
277
+ emb = self.time_embedding(t_emb)
278
+
279
+ if self.class_embedding is not None:
280
+ if class_labels is None:
281
+ raise ValueError("class_labels should be provided when doing class conditioning")
282
+
283
+ if self.config.class_embed_type == "timestep":
284
+ class_labels = self.time_proj(class_labels)
285
+
286
+ class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
287
+ emb = emb + class_emb
288
+
289
+ # 2. pre-process
290
+ skip_sample = sample
291
+ sample = self.conv_in(sample)
292
+
293
+ # 3. down
294
+ down_block_res_samples = (sample,)
295
+ for downsample_block in self.down_blocks:
296
+ if hasattr(downsample_block, "skip_conv"):
297
+ sample, res_samples, skip_sample = downsample_block(
298
+ hidden_states=sample, temb=emb, skip_sample=skip_sample
299
+ )
300
+ else:
301
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
302
+
303
+ down_block_res_samples += res_samples
304
+
305
+ # 4. mid
306
+ sample = self.mid_block(sample, emb)
307
+
308
+ # 5. up
309
+ skip_sample = None
310
+ for upsample_block in self.up_blocks:
311
+ res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
312
+ down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
313
+
314
+ if hasattr(upsample_block, "skip_conv"):
315
+ sample, skip_sample = upsample_block(sample, res_samples, emb, skip_sample)
316
+ else:
317
+ sample = upsample_block(sample, res_samples, emb)
318
+
319
+ # 6. post-process
320
+ sample = self.conv_norm_out(sample)
321
+ sample = self.conv_act(sample)
322
+ sample = self.conv_out(sample)
323
+
324
+ if skip_sample is not None:
325
+ sample += skip_sample
326
+
327
+ if self.config.time_embedding_type == "fourier":
328
+ timesteps = timesteps.reshape((sample.shape[0], *([1] * len(sample.shape[1:]))))
329
+ sample = sample / timesteps
330
+
331
+ if not return_dict:
332
+ return (sample,)
333
+
334
+ return UNet2DOutput(sample=sample)
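For orientation, a minimal sketch of how this vendored UNet2DModel can be instantiated and called. The import path, sample size, and channel counts below are illustrative assumptions, not the GL-LCM training configuration.

import torch
from unet_2d import UNet2DModel  # assumed module name for this vendored file

# Small illustrative config; sample_size must be divisible by 2 ** (len(block_out_channels) - 1).
model = UNet2DModel(
    sample_size=64,
    in_channels=3,
    out_channels=3,
    block_out_channels=(32, 64, 128, 256),
    layers_per_block=2,
)

sample = torch.randn(1, 3, 64, 64)    # (batch, channel, height, width), e.g. a VQ-GAN latent
timestep = torch.tensor([10])         # current denoising timestep
out = model(sample, timestep).sample  # UNet2DOutput.sample, same shape as the input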
codes/pytorch_msssim.py ADDED
@@ -0,0 +1,156 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from math import exp
4
+ import numpy as np
5
+
6
+ """
7
+ CODE SOURCE:
8
+ https://github.com/jorge-pessoa/pytorch-msssim/blob/master/pytorch_msssim/__init__.py
9
+ License:
10
+ MIT
11
+
12
+ Original Paper:
13
+ DOI: 10.1109/ACSSC.2003.1292216
14
+ """
15
+
16
+
17
+ def gaussian(window_size, sigma):
18
+ gauss = torch.Tensor([exp(-(x - window_size // 2) ** 2 / float(2 * sigma ** 2)) for x in range(window_size)])
19
+ return gauss / gauss.sum()
20
+
21
+
22
+ def create_window(window_size, channel=1):
23
+ _1D_window = gaussian(window_size, 1.5).unsqueeze(1)
24
+ _2D_window = _1D_window.mm(_1D_window.t()).float().unsqueeze(0).unsqueeze(0)
25
+ window = _2D_window.expand(channel, 1, window_size, window_size).contiguous()
26
+ return window
27
+
28
+
29
+ def ssim(img1, img2, window_size=11, window=None, size_average=True, full=False, val_range=None):
30
+ # Value range can be different from 255. Other common ranges are 1 (sigmoid) and 2 (tanh).
31
+ if val_range is None:
32
+ if torch.max(img1) > 128:
33
+ max_val = 255
34
+ else:
35
+ max_val = 1
36
+
37
+ if torch.min(img1) < -0.5:
38
+ min_val = -1
39
+ else:
40
+ min_val = 0
41
+ L = max_val - min_val
42
+ else:
43
+ L = val_range
44
+
45
+ padd = 0
46
+ (_, channel, height, width) = img1.size()
47
+ if window is None:
48
+ real_size = min(window_size, height, width)
49
+ window = create_window(real_size, channel=channel).to(img1.device)
50
+
51
+ mu1 = F.conv2d(img1, window, padding=padd, groups=channel)
52
+ mu2 = F.conv2d(img2, window, padding=padd, groups=channel)
53
+
54
+ mu1_sq = mu1.pow(2)
55
+ mu2_sq = mu2.pow(2)
56
+ mu1_mu2 = mu1 * mu2
57
+
58
+ sigma1_sq = F.conv2d(img1 * img1, window, padding=padd, groups=channel) - mu1_sq
59
+ sigma2_sq = F.conv2d(img2 * img2, window, padding=padd, groups=channel) - mu2_sq
60
+ sigma12 = F.conv2d(img1 * img2, window, padding=padd, groups=channel) - mu1_mu2
61
+
62
+ C1 = (0.01 * L) ** 2
63
+ C2 = (0.03 * L) ** 2
64
+
65
+ v1 = 2.0 * sigma12 + C2
66
+ v2 = sigma1_sq + sigma2_sq + C2
67
+ cs = v1 / v2 # contrast sensitivity
68
+
69
+ ssim_map = ((2 * mu1_mu2 + C1) * v1) / ((mu1_sq + mu2_sq + C1) * v2)
70
+
71
+ if size_average:
72
+ cs = cs.mean()
73
+ ret = ssim_map.mean()
74
+ else:
75
+ cs = cs.mean(1).mean(1).mean(1)
76
+ ret = ssim_map.mean(1).mean(1).mean(1)
77
+
78
+ if full:
79
+ return ret, cs
80
+ return ret
81
+
82
+
83
+ def msssim(img1, img2, window_size=11, size_average=True, val_range=None, normalize=None):
84
+ device = img1.device
85
+ weights = torch.FloatTensor([0.0448, 0.2856, 0.3001, 0.2363, 0.1333]).to(device)
86
+ levels = weights.size()[0]
87
+ ssims = []
88
+ mcs = []
89
+ for _ in range(levels):
90
+ sim, cs = ssim(img1, img2, window_size=window_size, size_average=size_average, full=True, val_range=val_range)
91
+
92
+ # Relu normalize (not compliant with original definition)
93
+ if normalize == "relu":
94
+ ssims.append(torch.relu(sim))
95
+ mcs.append(torch.relu(cs))
96
+ else:
97
+ ssims.append(sim)
98
+ mcs.append(cs)
99
+
100
+ img1 = F.avg_pool2d(img1, (2, 2))
101
+ img2 = F.avg_pool2d(img2, (2, 2))
102
+
103
+ ssims = torch.stack(ssims)
104
+ mcs = torch.stack(mcs)
105
+
106
+ # Simple normalize (not compliant with original definition)
107
+ # TODO: remove support for normalize == True (kept for backward support)
108
+ if normalize == "simple" or normalize == True:
109
+ ssims = (ssims + 1) / 2
110
+ mcs = (mcs + 1) / 2
111
+
112
+ pow1 = mcs ** weights
113
+ pow2 = ssims ** weights
114
+
115
+ # From Matlab implementation https://ece.uwaterloo.ca/~z70wang/research/iwssim/
116
+ output = torch.prod(pow1[:-1]) * pow2[-1]
117
+ return output
118
+
119
+
120
+ # Classes to re-use window
121
+ class SSIM(torch.nn.Module):
122
+ def __init__(self, window_size=11, size_average=True, val_range=None):
123
+ super(SSIM, self).__init__()
124
+ self.window_size = window_size
125
+ self.size_average = size_average
126
+ self.val_range = val_range
127
+
128
+ # Assume 1 channel for SSIM
129
+ self.channel = 1
130
+ self.window = create_window(window_size)
131
+
132
+ def forward(self, img1, img2):
133
+ (_, channel, _, _) = img1.size()
134
+
135
+ if channel == self.channel and self.window.dtype == img1.dtype:
136
+ window = self.window
137
+ else:
138
+ window = create_window(self.window_size, channel).to(img1.device).type(img1.dtype)
139
+ self.window = window
140
+ self.channel = channel
141
+
142
+ return ssim(img1, img2, window=window, window_size=self.window_size, size_average=self.size_average)
143
+
144
+
145
+ class MSSSIM(torch.nn.Module):
146
+ def __init__(self, window_size=11, size_average=True, channel=3, normalize=None):
147
+ super(MSSSIM, self).__init__()
148
+ self.window_size = window_size
149
+ self.size_average = size_average
150
+ self.channel = channel
151
+ self.normalize = normalize
152
+
153
+ def forward(self, img1, img2):
154
+ # TODO: store window between calls if possible
155
+ return msssim(img1, img2, window_size=self.window_size, size_average=self.size_average,
156
+ normalize=self.normalize)
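A short usage sketch for the metrics above, assuming two grayscale batches already scaled to [0, 1]; passing val_range explicitly bypasses the heuristic range detection in ssim().

import torch
from pytorch_msssim import ssim, msssim, SSIM

pred = torch.rand(1, 1, 256, 256)    # e.g. a generated bone-suppressed image in [0, 1]
target = torch.rand(1, 1, 256, 256)  # e.g. the reference image in [0, 1]

print(ssim(pred, target, val_range=1).item())                      # single-scale SSIM
print(msssim(pred, target, val_range=1, normalize="relu").item())  # multi-scale SSIM; "relu" guards against negative cs values

ssim_metric = SSIM(window_size=11, val_range=1)  # module wrapper that caches the Gaussian window
print(ssim_metric(pred, target).item())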
codes/transform.py ADDED
@@ -0,0 +1,28 @@
1
+ from config import config
2
+ from torchvision import transforms
3
+ import cv2 as cv
4
+
5
+
6
+
7
+ class myTransformMethod():
8
+ def __call__(self, img):
9
+
10
+ img = cv.resize(img, (config.image_size, config.image_size))
11
+ if img.shape[-1] == 3: # HWC
12
+ img = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
13
+ return img
14
+
15
+
16
+ myTransform = {
17
+ 'trainTransform': transforms.Compose([
18
+ myTransformMethod(),
19
+ transforms.ToTensor(),
20
+ transforms.Normalize([0.5], [0.5])
21
+ ]),
22
+ 'testTransform': transforms.Compose([
23
+ myTransformMethod(),
24
+ transforms.ToTensor(),
25
+ transforms.Normalize([0.5], [0.5])
26
+ ]),
27
+
28
+ }
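Usage sketch: the transform takes an OpenCV image (uint8, HW or HWC), resizes it to config.image_size, converts BGR to grayscale when needed, and returns a 1xHxW tensor normalized to [-1, 1]. The sample path points at the bundled data/CXR images and assumes codes/ is on the Python path and the repository root is the working directory.

import cv2 as cv
from transform import myTransform

img = cv.imread("data/CXR/0.png")      # BGR, HWC, uint8
x = myTransform['testTransform'](img)  # (1, config.image_size, config.image_size), values in [-1, 1]
x = x.unsqueeze(0)                     # add batch dim -> (1, 1, H, W), as the models expect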
codes/vq-gan_eval.py ADDED
@@ -0,0 +1,60 @@
1
+ from config import config
2
+ from transform import myTransform
3
+
4
+ import torch
5
+ import os
6
+ import cv2 as cv
7
+ import numpy as np
8
+ from tqdm import tqdm
9
+
10
+
11
+ def eval():
12
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # select the compute device
13
+
14
+ source_path = "./test_eval" # folder of input images
15
+ recon_output_path = "./vq-gan_recon"
16
+ compress_output_path = "./vq-gan_compress"
+ os.makedirs(recon_output_path, exist_ok=True) # create the output folders up front; cv.imwrite fails silently when the folder is missing
+ os.makedirs(compress_output_path, exist_ok=True)
17
+
18
+
19
+ model = torch.load("2025-02-04-Mask-JSRT-VQGAN.pth").to(device).eval()
20
+
21
+ with torch.no_grad():
22
+ for filename in tqdm(os.listdir(source_path)):
23
+ img_path = os.path.join(source_path, filename)
24
+
25
+
26
+ img = cv.imread(img_path, 0)
27
+
28
+ img = myTransform["testTransform"](img).to(device) # CHW
29
+ img = torch.unsqueeze(img, dim=0).to(device) # BCHW
30
+
31
+ recon, _ = model(img)
32
+
33
+ recon = np.array(recon.detach().to("cpu")) # BCHW
34
+ recon = np.squeeze(recon) # HW
35
+ recon = recon * 0.5 + 0.5
36
+ recon = np.clip(recon, 0, 1)
37
+
38
+ if not config.use_server:
39
+ cv.imshow("win", recon)
40
+ cv.waitKey(0)
41
+
42
+ recon *= 255
43
+ cv.imwrite(os.path.join(recon_output_path, filename), recon)
44
+
45
+ if config.output_feature_map:
46
+ compress = model.encode_stage_2_inputs(img).cpu().detach().numpy()
47
+ compress = np.transpose(np.squeeze(compress)[1:], (1, 2, 0))
48
+ compress = compress * 0.5 + 0.5
49
+ compress = np.clip(compress, 0, 1)
50
+ if not config.use_server:
51
+ cv.imshow("win", compress)
52
+ cv.waitKey(0)
53
+
54
+ compress *= 255
55
+ cv.imwrite(os.path.join(compress_output_path, filename), compress)
56
+
57
+
58
+
59
+ if __name__ == "__main__":
60
+ eval()
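For a quick quantitative check of the reconstructions, the bundled pytorch_msssim module can be reused. A hedged sketch (folder names follow the assumptions above; this is not the paper's evaluation protocol):

import os
import cv2 as cv
import torch
from pytorch_msssim import ssim

source_path = "./test_eval"
recon_path = "./vq-gan_recon"

def to_tensor(img):
    # uint8 HW -> float 1x1xHxW in [0, 1]
    return torch.from_numpy(img).float().unsqueeze(0).unsqueeze(0) / 255.0

scores = []
for filename in os.listdir(recon_path):
    original = cv.imread(os.path.join(source_path, filename), 0)
    recon = cv.imread(os.path.join(recon_path, filename), 0)
    recon = cv.resize(recon, (original.shape[1], original.shape[0]))  # match sizes before comparing
    scores.append(ssim(to_tensor(original), to_tensor(recon), val_range=1).item())

print("mean SSIM:", sum(scores) / len(scores))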
data/BS/0.png ADDED

Git LFS Details

  • SHA256: 596d2da0059a04d0d22248c4f4c14bcde03b6898a229343315a026fce5fda31c
  • Pointer size: 131 Bytes
  • Size of remote file: 455 kB
data/BS/1.png ADDED

Git LFS Details

  • SHA256: 743a74220b96fae88c0a3c1bd029fd81b7bb2a007c2f0b871940521bbbbaf7de
  • Pointer size: 131 Bytes
  • Size of remote file: 487 kB
data/BS/2.png ADDED

Git LFS Details

  • SHA256: 7ea0c94ae904f9f28aa85a413e54b2cb65f386529e9df0ebe23b7c56ff837114
  • Pointer size: 131 Bytes
  • Size of remote file: 424 kB
data/CXR/0.png ADDED

Git LFS Details

  • SHA256: 2ff0dbefdee19fcd6a63f5d8e21b25a1fa085b5ef3a96fce3240faa90696d9b7
  • Pointer size: 131 Bytes
  • Size of remote file: 490 kB
data/CXR/1.png ADDED

Git LFS Details

  • SHA256: b0f49c3985a2a610fbee7f27d372c088fbcf55b73c8198bd5f154dd713484ea3
  • Pointer size: 131 Bytes
  • Size of remote file: 526 kB
data/CXR/2.png ADDED

Git LFS Details

  • SHA256: 38d2a3ce3e0f4afe21047e5f1e7ea8a3c2f6b55b07c15621d8da14044671b674
  • Pointer size: 131 Bytes
  • Size of remote file: 511 kB
images/GL-LCM_gif.gif ADDED

Git LFS Details

  • SHA256: 75a83d8e3dfd78addc45f695e73846ae35661138f56085f483c454a7d9022585
  • Pointer size: 132 Bytes
  • Size of remote file: 8.76 MB
images/ablation.png ADDED

Git LFS Details

  • SHA256: e9457e6032368320af09bc2632c169abb43cec6e9cf6281c29732c0369f381ae
  • Pointer size: 132 Bytes
  • Size of remote file: 1.02 MB
images/comparison.png ADDED

Git LFS Details

  • SHA256: 397bb20751ca9d2a81df6694dd0569c5ff476bd47947f9b0af1b8934b04dc855
  • Pointer size: 133 Bytes
  • Size of remote file: 13.6 MB
images/framework.png ADDED

Git LFS Details

  • SHA256: 0bee5e8f15b3fdee26bb24abe444c4982025e6a88c67a87241d79584bc7f5f09
  • Pointer size: 132 Bytes
  • Size of remote file: 2.72 MB
requirements.txt ADDED
@@ -0,0 +1,20 @@
1
+ diffusers~=0.27.2
2
+ lpips~=0.1.4
3
+ matplotlib~=3.7.2
4
+ matplotlib-inline~=0.1.6
5
+ monai~=1.2.0
6
+ monai-generative~=0.2.2
7
+ numpy~=1.26.4
8
+ opencv-python~=4.8.1.78
9
+ pandas~=2.0.3
10
+ Pillow~=9.3.0
11
+ scikit-image~=0.22.0
12
+ scikit-learn~=1.3.1
13
+ scipy~=1.11.3
14
+ seaborn~=0.13.0
15
+ torch~=2.0.1+cu117
16
+ torch-ema~=0.3
17
+ torchaudio~=2.0.2+cu117
18
+ torchsummary~=1.5.1
19
+ torchvision~=0.15.2+cu117
20
+ tqdm~=4.66.1
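A quick sanity check that the pinned environment resolved as expected (version numbers follow the constraints above):

import torch, diffusers, monai
print(torch.__version__)      # expected ~2.0.1+cu117
print(diffusers.__version__)  # expected ~0.27.x
print(monai.__version__)      # expected ~1.2.x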