Skip to content

Commit 4da7bf2

Browse files
michal2409 authored and nv-kkudrynski committed
[nnUNet/PyT] Update container to 22.11, channel last conv, nvFuser InstanceNorm, multi-gpu binding script
1 parent bf00fe1 commit 4da7bf2

14 files changed

Lines changed: 545 additions & 267 deletions

File tree

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,17 @@
1-
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.11-py3
1+
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:22.11-py3
22
FROM ${FROM_IMAGE_NAME}
33

44
ADD ./requirements.txt .
55
RUN pip install --disable-pip-version-check -r requirements.txt
6-
RUN pip install monai==0.8.1 --no-dependencies
7-
RUN pip uninstall -y torchtext
6+
RUN pip install monai==1.0.0 --no-dependencies
87
RUN pip install numpy --upgrade
9-
RUN pip install --extra-index-url https://developer.download.nvidia.com/compute/redist/ nvidia-dali-cuda110==1.16.0
108

119
RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
1210
RUN unzip -qq awscliv2.zip
1311
RUN ./aws/install
1412
RUN rm -rf awscliv2.zip aws
1513

14+
ENV OMP_NUM_THREADS=2
1615
WORKDIR /workspace/nnunet_pyt
1716
ADD . /workspace/nnunet_pyt
17+
RUN cp utils/instance_norm.py /usr/local/lib/python3.8/dist-packages/apex/normalization

PyTorch/Segmentation/nnUNet/README.md

Lines changed: 170 additions & 145 deletions
Large diffs are not rendered by default.

PyTorch/Segmentation/nnUNet/data_loading/dali_loader.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,12 @@ def __init__(self, batch_size, num_threads, device_id, **kwargs):
3636
self.kwargs = kwargs
3737
self.dim = kwargs["dim"]
3838
self.device = device_id
39+
self.layout = kwargs["layout"]
3940
self.patch_size = kwargs["patch_size"]
4041
self.load_to_gpu = kwargs["load_to_gpu"]
4142
self.input_x = self.get_reader(kwargs["imgs"])
4243
self.input_y = self.get_reader(kwargs["lbls"]) if kwargs["lbls"] is not None else None
44+
self.cdhw2dhwc = ops.Transpose(device="gpu", perm=[1, 2, 3, 0])
4345

4446
def get_reader(self, data):
4547
return ops.readers.Numpy(
@@ -67,6 +69,10 @@ def load_data(self):
6769
return img, lbl
6870
return img
6971

72+
def make_dhwc_layout(self, img, lbl):
73+
img, lbl = self.cdhw2dhwc(img), self.cdhw2dhwc(lbl)
74+
return img, lbl
75+
7076
def crop(self, data):
7177
return fn.crop(data, crop=self.patch_size, out_of_bounds_policy="pad")
7278

@@ -154,6 +160,8 @@ def define_graph(self):
154160
img = self.contrast_fn(img)
155161
if self.dim == 2:
156162
img, lbl = self.transpose_fn(img, lbl)
163+
if self.layout == "NDHWC" and self.dim == 3:
164+
img, lbl = self.make_dhwc_layout(img, lbl)
157165
return img, lbl
158166

159167

@@ -171,6 +179,8 @@ def define_graph(self):
171179
meta = self.input_meta(name="ReaderM")
172180
orig_lbl = self.input_orig_y(name="ReaderO")
173181
return img, lbl, meta, orig_lbl
182+
if self.layout == "NDHWC" and self.dim == 3:
183+
img, lbl = self.make_dhwc_layout(img, lbl)
174184
return img, lbl
175185

176186

@@ -204,6 +214,8 @@ def define_graph(self):
204214
img, lbl = self.crop_fn(img, lbl)
205215
if self.dim == 2:
206216
img, lbl = self.transpose_fn(img, lbl)
217+
if self.layout == "NDHWC" and self.dim == 3:
218+
img, lbl = self.make_dhwc_layout(img, lbl)
207219
return img, lbl
208220

209221

@@ -250,6 +262,10 @@ def fetch_dali_loader(imgs, lbls, batch_size, mode, **kwargs):
250262
pipe_kwargs.update({"patch_size": [batch_size_2d] + kwargs["patch_size"]})
251263

252264
rank = int(os.getenv("LOCAL_RANK", "0"))
265+
if mode == "eval": # We sharded the data for evaluation manually.
266+
rank = 0
267+
pipe_kwargs["gpus"] = 1
268+
253269
pipe = pipeline(batch_size, kwargs["num_workers"], rank, **pipe_kwargs)
254270
return LightningWrapper(
255271
pipe,

PyTorch/Segmentation/nnUNet/data_loading/data_module.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ def __init__(self, args):
3434
"seed": self.args.seed,
3535
"gpus": self.args.gpus,
3636
"nvol": self.args.nvol,
37+
"layout": self.args.layout,
3738
"overlap": self.args.overlap,
3839
"benchmark": self.args.benchmark,
3940
"num_workers": self.args.num_workers,
@@ -57,6 +58,11 @@ def setup(self, stage=None):
5758
self.kwargs.update({"orig_lbl": orig_lbl, "meta": meta})
5859
self.train_imgs, self.train_lbls = get_split(imgs, train_idx), get_split(lbls, train_idx)
5960
self.val_imgs, self.val_lbls = get_split(imgs, val_idx), get_split(lbls, val_idx)
61+
62+
if self.args.gpus > 1:
63+
rank = int(os.getenv("LOCAL_RANK", "0"))
64+
self.val_imgs = self.val_imgs[rank :: self.args.gpus]
65+
self.val_lbls = self.val_lbls[rank :: self.args.gpus]
6066
else:
6167
self.kwargs.update({"meta": test_meta})
6268
print0(f"{len(self.train_imgs)} training, {len(self.val_imgs)} validation, {len(self.test_imgs)} test examples")

PyTorch/Segmentation/nnUNet/main.py

Lines changed: 47 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -14,29 +14,64 @@
1414

1515
import os
1616

17+
import torch
1718
from pytorch_lightning import Trainer, seed_everything
1819
from pytorch_lightning.callbacks import ModelCheckpoint, ModelSummary, RichProgressBar
19-
from pytorch_lightning.loggers import TensorBoardLogger
20+
from pytorch_lightning.plugins.io import AsyncCheckpointIO
21+
from pytorch_lightning.strategies import DDPStrategy
2022

2123
from data_loading.data_module import DataModule
2224
from nnunet.nn_unet import NNUnet
2325
from utils.args import get_main_args
2426
from utils.logger import LoggingCallback
2527
from utils.utils import make_empty_dir, set_cuda_devices, set_granularity, verify_ckpt_path
2628

27-
if __name__ == "__main__":
29+
torch.backends.cuda.matmul.allow_tf32 = True
30+
torch.backends.cudnn.allow_tf32 = True
31+
32+
33+
def get_trainer(args, callbacks):
34+
return Trainer(
35+
logger=False,
36+
default_root_dir=args.results,
37+
benchmark=True,
38+
deterministic=False,
39+
max_epochs=args.epochs,
40+
precision=16 if args.amp else 32,
41+
gradient_clip_val=args.gradient_clip_val,
42+
enable_checkpointing=args.save_ckpt,
43+
callbacks=callbacks,
44+
num_sanity_val_steps=0,
45+
accelerator="gpu",
46+
devices=args.gpus,
47+
num_nodes=args.nodes,
48+
plugins=[AsyncCheckpointIO()],
49+
strategy=DDPStrategy(
50+
find_unused_parameters=False,
51+
static_graph=True,
52+
gradient_as_bucket_view=True,
53+
),
54+
limit_train_batches=1.0 if args.train_batches == 0 else args.train_batches,
55+
limit_val_batches=1.0 if args.test_batches == 0 else args.test_batches,
56+
limit_test_batches=1.0 if args.test_batches == 0 else args.test_batches,
57+
)
58+
59+
60+
def main():
2861
args = get_main_args()
29-
set_granularity() # Increase maximum fetch granularity of L2 to 128 bytes
62+
set_granularity()
3063
set_cuda_devices(args)
3164
if args.seed is not None:
3265
seed_everything(args.seed)
3366
data_module = DataModule(args)
3467
data_module.setup()
3568
ckpt_path = verify_ckpt_path(args)
3669

37-
model = NNUnet(args)
70+
if ckpt_path is not None:
71+
model = NNUnet.load_from_checkpoint(ckpt_path, strict=False, args=args)
72+
else:
73+
model = NNUnet(args)
3874
callbacks = [RichProgressBar(), ModelSummary(max_depth=2)]
39-
logger = False
4075
if args.benchmark:
4176
batch_size = args.batch_size if args.exec_mode == "train" else args.val_batch_size
4277
filnename = args.logname if args.logname is not None else "perf.json"
@@ -51,13 +86,6 @@
5186
)
5287
)
5388
elif args.exec_mode == "train":
54-
if args.tb_logs:
55-
logger = TensorBoardLogger(
56-
save_dir=f"{args.results}/tb_logs",
57-
name=f"task={args.task}_dim={args.dim}_fold={args.fold}_precision={16 if args.amp else 32}",
58-
default_hp_metric=False,
59-
version=0,
60-
)
6189
if args.save_ckpt:
6290
callbacks.append(
6391
ModelCheckpoint(
@@ -69,26 +97,7 @@
6997
)
7098
)
7199

72-
trainer = Trainer(
73-
logger=logger,
74-
default_root_dir=args.results,
75-
benchmark=True,
76-
deterministic=False,
77-
max_epochs=args.epochs,
78-
precision=16 if args.amp else 32,
79-
gradient_clip_val=args.gradient_clip_val,
80-
enable_checkpointing=args.save_ckpt,
81-
callbacks=callbacks,
82-
num_sanity_val_steps=0,
83-
accelerator="gpu",
84-
devices=args.gpus,
85-
num_nodes=args.nodes,
86-
strategy="ddp" if args.gpus > 1 else None,
87-
limit_train_batches=1.0 if args.train_batches == 0 else args.train_batches,
88-
limit_val_batches=1.0 if args.test_batches == 0 else args.test_batches,
89-
limit_test_batches=1.0 if args.test_batches == 0 else args.test_batches,
90-
)
91-
100+
trainer = get_trainer(args, callbacks)
92101
if args.benchmark:
93102
if args.exec_mode == "train":
94103
trainer.fit(model, train_dataloaders=data_module.train_dataloader())
@@ -99,7 +108,7 @@
99108
model.start_benchmark = 1
100109
trainer.test(model, dataloaders=data_module.test_dataloader(), verbose=False)
101110
elif args.exec_mode == "train":
102-
trainer.fit(model, datamodule=data_module, ckpt_path=ckpt_path)
111+
trainer.fit(model, datamodule=data_module)
103112
elif args.exec_mode == "evaluate":
104113
trainer.validate(model, dataloaders=data_module.val_dataloader())
105114
elif args.exec_mode == "predict":
@@ -113,4 +122,8 @@
113122
model.save_dir = save_dir
114123
make_empty_dir(save_dir)
115124
model.args = args
116-
trainer.test(model, dataloaders=data_module.test_dataloader(), ckpt_path=ckpt_path)
125+
trainer.test(model, dataloaders=data_module.test_dataloader())
126+
127+
128+
if __name__ == "__main__":
129+
main()

PyTorch/Segmentation/nnUNet/nnunet/metrics.py

Lines changed: 39 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -13,45 +13,63 @@
1313
# limitations under the License.
1414

1515
import torch
16-
from monai.metrics import compute_meandice, do_metric_reduction
17-
from monai.networks.utils import one_hot
1816
from torchmetrics import Metric
1917

2018

2119
class Dice(Metric):
20+
full_state_update = False
21+
2222
def __init__(self, n_class, brats):
2323
super().__init__(dist_sync_on_step=False)
2424
self.n_class = n_class
2525
self.brats = brats
26-
self.add_state("loss", default=torch.zeros(1), dist_reduce_fx="sum")
2726
self.add_state("steps", default=torch.zeros(1), dist_reduce_fx="sum")
2827
self.add_state("dice", default=torch.zeros((n_class,)), dist_reduce_fx="sum")
28+
self.add_state("loss", default=torch.zeros(1), dist_reduce_fx="sum")
2929

3030
def update(self, p, y, l):
31-
if self.brats:
32-
p = (torch.sigmoid(p) > 0.5).int()
33-
y_wt, y_tc, y_et = y > 0, ((y == 1) + (y == 3)) > 0, y == 3
34-
y = torch.stack([y_wt, y_tc, y_et], dim=1)
35-
else:
36-
p, y = self.ohe(torch.argmax(p, dim=1)), self.ohe(y)
37-
3831
self.steps += 1
32+
self.dice += self.compute_stats_brats(p, y) if self.brats else self.compute_stats(p, y)
3933
self.loss += l
40-
self.dice += self.compute_metric(p, y, compute_meandice, 1, 0)
4134

4235
def compute(self):
4336
return 100 * self.dice / self.steps, self.loss / self.steps
4437

45-
def ohe(self, x):
46-
return one_hot(x.unsqueeze(1), num_classes=self.n_class + 1, dim=1)
47-
48-
def compute_metric(self, p, y, metric_fn, best_metric, worst_metric):
49-
metric = metric_fn(p, y, include_background=self.brats)
50-
metric = torch.nan_to_num(metric, nan=worst_metric, posinf=worst_metric, neginf=worst_metric)
51-
metric = do_metric_reduction(metric, "mean_batch")[0]
38+
def compute_stats_brats(self, p, y):
39+
scores = torch.zeros(self.n_class, device=p.device, dtype=torch.float32)
40+
p = (torch.sigmoid(p) > 0.5).int()
41+
y_wt, y_tc, y_et = y > 0, ((y == 1) + (y == 3)) > 0, y == 3
42+
y = torch.stack([y_wt, y_tc, y_et], dim=1)
5243

5344
for i in range(self.n_class):
54-
if (y[:, i] != 1).all():
55-
metric[i - 1] += best_metric if (p[:, i] != 1).all() else worst_metric
45+
p_i, y_i = p[:, i], y[:, i]
46+
if (y_i != 1).all():
47+
# no foreground class
48+
scores[i - 1] += 1 if (p_i != 1).all() else 0
49+
continue
50+
tp, fn, fp = self.get_stats(p_i, y_i, 1)
51+
denom = (2 * tp + fp + fn).to(torch.float)
52+
score_cls = (2 * tp).to(torch.float) / denom if torch.is_nonzero(denom) else 0.0
53+
scores[i - 1] += score_cls
54+
return scores
55+
56+
def compute_stats(self, p, y):
57+
scores = torch.zeros(self.n_class, device=p.device, dtype=torch.float32)
58+
p = torch.argmax(p, dim=1)
59+
for i in range(1, self.n_class + 1):
60+
if (y != i).all():
61+
# no foreground class
62+
scores[i - 1] += 1 if (p != i).all() else 0
63+
continue
64+
tp, fn, fp = self.get_stats(p, y, i)
65+
denom = (2 * tp + fp + fn).to(torch.float)
66+
score_cls = (2 * tp).to(torch.float) / denom if torch.is_nonzero(denom) else 0.0
67+
scores[i - 1] += score_cls
68+
return scores
5669

57-
return metric
70+
@staticmethod
71+
def get_stats(p, y, c):
72+
tp = torch.logical_and(p == c, y == c).sum()
73+
fn = torch.logical_and(p != c, y == c).sum()
74+
fp = torch.logical_and(p == c, y != c).sum()
75+
return tp, fn, fp

0 commit comments

Comments
 (0)