Skip to content

Commit 09f9fe6

Browse files
michal2409 and nv-kkudrynski
authored and committed
[nnUNet/TF2] Update container to 22.11, fix XLA+channel last conv, multi-gpu binding script
1 parent b1fc3c4 commit 09f9fe6

16 files changed

Lines changed: 321 additions & 480 deletions

File tree

TensorFlow2/Segmentation/nnUNet/Dockerfile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:22.04-tf2-py3
1+
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:22.11-tf2-py3
22
FROM ${FROM_IMAGE_NAME}
33

44
RUN pip install nvidia-pyindex
@@ -13,6 +13,7 @@ RUN unzip -qq awscliv2.zip
1313
RUN ./aws/install
1414
RUN rm -rf awscliv2.zip aws
1515

16+
ENV OMP_NUM_THREADS=2
1617
ENV TF_CPP_MIN_LOG_LEVEL 3
1718
ENV OMPI_MCA_coll_hcoll_enable 0
1819
ENV HCOLL_ENABLE_MCAST 0

TensorFlow2/Segmentation/nnUNet/README.md

Lines changed: 163 additions & 162 deletions
Large diffs are not rendered by default.

TensorFlow2/Segmentation/nnUNet/data_loading/dali_loader.py

Lines changed: 18 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717
import horovod.tensorflow as hvd
1818
import numpy as np
1919
import nvidia.dali.fn as fn
20-
import nvidia.dali.math as math
2120
import nvidia.dali.ops as ops
2221
import nvidia.dali.plugin.tf as dali_tf
2322
import nvidia.dali.types as types
@@ -57,7 +56,6 @@ def __init__(
5756
shuffle_input=True,
5857
input_x_files=None,
5958
input_y_files=None,
60-
use_cpu=False,
6159
):
6260
super().__init__(
6361
batch_size=batch_size,
@@ -85,19 +83,12 @@ def __init__(
8583

8684
self.dim = dim
8785
self.internal_seed = seed
88-
self.use_cpu = use_cpu
89-
90-
def mark_pipeline_start(self, x, y):
91-
if not self.use_cpu:
92-
x, y = x.gpu(), y.gpu()
93-
return x, y
9486

9587

9688
class TrainPipeline(GenericPipeline):
97-
def __init__(self, imgs, lbls, oversampling, patch_size, read_roi=False, batch_size_2d=None, **kwargs):
89+
def __init__(self, imgs, lbls, oversampling, patch_size, batch_size_2d=None, **kwargs):
9890
super().__init__(input_x_files=imgs, input_y_files=lbls, shuffle_input=True, **kwargs)
9991
self.oversampling = oversampling
100-
self.read_roi = read_roi
10192
self.patch_size = patch_size
10293
if self.dim == 2 and batch_size_2d is not None:
10394
self.patch_size = [batch_size_2d] + self.patch_size
@@ -129,7 +120,7 @@ def biased_crop_fn(self, img, lbl):
129120
roi_end=roi_end,
130121
crop_shape=[*self.patch_size, 1],
131122
)
132-
anchor = fn.slice(anchor, 0, 3, axes=[0]) # drop channel from anchor
123+
anchor = fn.slice(anchor, 0, 3, axes=[0])
133124
img, lbl = fn.slice(
134125
[img, lbl],
135126
anchor,
@@ -138,40 +129,7 @@ def biased_crop_fn(self, img, lbl):
138129
out_of_bounds_policy="pad",
139130
device="cpu",
140131
)
141-
142-
return img.gpu(), lbl.gpu()
143-
144-
def load_roi(self):
145-
lbl = self.input_y(name="ReaderY")
146-
lbl = fn.reshape(lbl, layout="DHWC")
147-
roi_start, roi_end = fn.segmentation.random_object_bbox(
148-
lbl,
149-
format="start_end",
150-
foreground_prob=self.oversampling,
151-
k_largest=2,
152-
device="cpu",
153-
cache_objects=True,
154-
)
155-
anchor = fn.roi_random_crop(lbl, roi_start=roi_start, roi_end=roi_end, crop_shape=[1, *self.patch_size])
156-
anchor = fn.slice(anchor, 1, 3, axes=[0]) # drop channel from anchor
157-
lbl = fn.slice(
158-
lbl,
159-
anchor,
160-
self.crop_shape,
161-
axis_names="DHW",
162-
out_of_bounds_policy="pad",
163-
device="cpu",
164-
)
165-
166-
img = self.input_x(
167-
name="ReaderX",
168-
roi_start=fn.cast(anchor, dtype=types.INT32),
169-
roi_axes=[1, 2, 3],
170-
roi_shape=self.patch_size,
171-
out_of_bounds_policy="pad",
172-
)
173-
img = fn.reshape(img, layout="DHWC")
174-
132+
img, lbl = img.gpu(), lbl.gpu()
175133
return img, lbl
176134

177135
def zoom_fn(self, img, lbl):
@@ -189,22 +147,18 @@ def zoom_fn(self, img, lbl):
189147
return img, lbl
190148

191149
def noise_fn(self, img):
192-
img_noised = img + fn.random.normal(img, stddev=fn.random.uniform(range=(0.0, 0.33)))
150+
img_noised = fn.noise.gaussian(img, stddev=fn.random.uniform(range=(0.0, 0.3)))
193151
return random_augmentation(0.15, img_noised, img)
194152

195153
def blur_fn(self, img):
196154
img_blurred = fn.gaussian_blur(img, sigma=fn.random.uniform(range=(0.5, 1.5)))
197155
return random_augmentation(0.15, img_blurred, img)
198156

199-
def brightness_fn(self, img):
200-
brightness_scale = random_augmentation(0.15, fn.random.uniform(range=(0.7, 1.3)), 1.0)
201-
return img * brightness_scale
202-
203-
def contrast_fn(self, img):
204-
min_, max_ = fn.reductions.min(img), fn.reductions.max(img)
205-
scale = random_augmentation(0.15, fn.random.uniform(range=(0.65, 1.5)), 1.0)
206-
img = math.clamp(img * scale, min_, max_)
207-
return img
157+
def brightness_contrast_fn(self, img):
158+
img_transformed = fn.brightness_contrast(
159+
img, brightness=fn.random.uniform(range=(0.7, 1.3)), contrast=fn.random.uniform(range=(0.65, 1.5))
160+
)
161+
return random_augmentation(0.15, img_transformed, img)
208162

209163
def flips_fn(self, img, lbl):
210164
kwargs = {
@@ -216,16 +170,13 @@ def flips_fn(self, img, lbl):
216170
return fn.flip(img, **kwargs), fn.flip(lbl, **kwargs)
217171

218172
def define_graph(self):
219-
if self.read_roi:
220-
img, lbl = self.load_roi()
221-
else:
222-
img, lbl = self.load_data()
223-
img, lbl = self.biased_crop_fn(img, lbl)
224-
img, lbl = img.gpu(), lbl.gpu()
173+
img, lbl = self.load_data()
174+
img, lbl = self.biased_crop_fn(img, lbl)
225175
img, lbl = self.zoom_fn(img, lbl)
226176
img, lbl = self.flips_fn(img, lbl)
227-
img = self.brightness_fn(img)
228-
img = self.contrast_fn(img)
177+
img = self.noise_fn(img)
178+
img = self.blur_fn(img)
179+
img = self.brightness_contrast_fn(img)
229180
return img, lbl
230181

231182

@@ -251,12 +202,11 @@ def define_graph(self):
251202

252203

253204
class BenchmarkPipeline(GenericPipeline):
254-
def __init__(self, imgs, lbls, patch_size, batch_size_2d=None, sw_benchmark=False, **kwargs):
205+
def __init__(self, imgs, lbls, patch_size, batch_size_2d=None, **kwargs):
255206
super().__init__(input_x_files=imgs, input_y_files=lbls, shuffle_input=False, **kwargs)
256207
self.patch_size = patch_size
257208
if self.dim == 2 and batch_size_2d is not None:
258209
self.patch_size = [batch_size_2d] + self.patch_size
259-
self.crop = not sw_benchmark
260210

261211
def crop_fn(self, img, lbl):
262212
img = fn.crop(img, crop=self.patch_size, out_of_bounds_policy="pad")
@@ -265,9 +215,8 @@ def crop_fn(self, img, lbl):
265215

266216
def define_graph(self):
267217
img, lbl = self.input_x(name="ReaderX").gpu(), self.input_y(name="ReaderY").gpu()
218+
img, lbl = self.crop_fn(img, lbl)
268219
img, lbl = fn.reshape(img, layout="DHWC"), fn.reshape(lbl, layout="DHWC")
269-
if self.crop:
270-
img, lbl = self.crop_fn(img, lbl)
271220
return img, lbl
272221

273222

@@ -293,7 +242,6 @@ def fetch_dali_loader(imgs, lbls, batch_size, mode, **kwargs):
293242
"batch_size": batch_size,
294243
"num_threads": kwargs["num_workers"],
295244
"shard_id": device_id,
296-
"use_cpu": kwargs["use_cpu"],
297245
}
298246
if kwargs["dim"] == 2:
299247
if kwargs["benchmark"]:
@@ -308,13 +256,9 @@ def fetch_dali_loader(imgs, lbls, batch_size, mode, **kwargs):
308256

309257
output_dtypes = (tf.float32, tf.uint8)
310258
if kwargs["benchmark"]:
311-
pipeline = BenchmarkPipeline(
312-
imgs, lbls, kwargs["patch_size"], sw_benchmark=kwargs["sw_benchmark"], **pipe_kwargs
313-
)
259+
pipeline = BenchmarkPipeline(imgs, lbls, kwargs["patch_size"], **pipe_kwargs)
314260
elif mode == "train":
315-
pipeline = TrainPipeline(
316-
imgs, lbls, kwargs["oversampling"], kwargs["patch_size"], kwargs["read_roi"], **pipe_kwargs
317-
)
261+
pipeline = TrainPipeline(imgs, lbls, kwargs["oversampling"], kwargs["patch_size"], **pipe_kwargs)
318262
elif mode == "eval":
319263
pipeline = EvalPipeline(imgs, lbls, kwargs["patch_size"], **pipe_kwargs)
320264
else:

TensorFlow2/Segmentation/nnUNet/data_loading/data_module.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,6 @@ def __init__(self, args):
4444
"nvol": self.args.nvol,
4545
"bench_steps": self.args.bench_steps,
4646
"meta": load_data(self.data_path, "*_meta.npy"),
47-
"read_roi": self.args.read_roi,
48-
"use_cpu": self.args.dali_use_cpu,
49-
"sw_benchmark": self.args.sw_benchmark,
5047
}
5148

5249
def setup(self, stage=None):

TensorFlow2/Segmentation/nnUNet/main.py

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,6 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
import ctypes
16-
import os
17-
1815
from data_loading.data_module import DataModule
1916
from models.nn_unet import NNUnet
2017
from runtime.args import get_main_args
@@ -25,17 +22,6 @@
2522

2623

2724
def main(args):
28-
os.environ["TF_GPU_THREAD_MODE"] = "gpu_private"
29-
os.environ["TF_GPU_THREAD_COUNT"] = "1"
30-
31-
_libcudart = ctypes.CDLL("libcudart.so")
32-
# Set device limit on the current device
33-
# cudaLimitMaxL2FetchGranularity = 0x05
34-
pValue = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
35-
_libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128))
36-
_libcudart.cudaDeviceGetLimit(pValue, ctypes.c_int(0x05))
37-
assert pValue.contents.value == 128
38-
3925
hvd_init()
4026
if args.seed is not None:
4127
set_seed(args.seed)

TensorFlow2/Segmentation/nnUNet/models/layers.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import nv_norms
1516
import tensorflow as tf
1617
import tensorflow_addons as tfa
1718

@@ -26,7 +27,7 @@
2627
class KaimingNormal(tf.keras.initializers.VarianceScaling):
2728
def __init__(self, negative_slope, seed=None):
2829
super().__init__(
29-
scale=2.0 / (1 + negative_slope ** 2), mode="fan_in", distribution="untruncated_normal", seed=seed
30+
scale=2.0 / (1 + negative_slope**2), mode="fan_in", distribution="untruncated_normal", seed=seed
3031
)
3132

3233
def get_config(self):
@@ -38,6 +39,8 @@ def get_norm(name):
3839
return tfa.layers.GroupNormalization(32, axis=-1, center=True, scale=True)
3940
elif "batch" in name:
4041
return tf.keras.layers.BatchNormalization(axis=-1, center=True, scale=True)
42+
elif "atex_instance" in name:
43+
return nv_norms.InstanceNormalization(axis=-1)
4144
elif "instance" in name:
4245
return tfa.layers.InstanceNormalization(axis=-1, center=True, scale=True)
4346
elif "none" in name:

TensorFlow2/Segmentation/nnUNet/models/nn_unet.py

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from runtime.utils import get_config_file, get_tta_flips, is_main_process
2020
from skimage.transform import resize
2121

22-
from models.sliding_window import sliding_window_inference
22+
from models.sliding_window import get_importance_kernel, sliding_window_inference
2323
from models.unet import UNet
2424

2525

@@ -41,6 +41,8 @@ def wrapped_model(inputs, *args, **kwargs):
4141

4242
self.model = wrapped_model
4343
else:
44+
if not self.args.xla and self.args.norm == "instance":
45+
self.args.norm = "atex_instance"
4446
self.model = UNet(
4547
input_shape=input_shape,
4648
n_class=n_class,
@@ -54,11 +56,28 @@ def wrapped_model(inputs, *args, **kwargs):
5456
if is_main_process():
5557
print(f"Filters: {self.model.filters},\nKernels: {kernels}\nStrides: {strides}")
5658
self.tta_flips = get_tta_flips(self.args.dim)
59+
if self.args.dim == 3:
60+
self.predictor = self.sw_inference
61+
elif self.args.benchmark:
62+
self.predictor = self.call
63+
else:
64+
self.predictor = self.call_2d
65+
66+
if args.dim == 3:
67+
importance_kernel = get_importance_kernel(self.patch_size, args.blend_mode, 0.125)
68+
self.importance_map = tf.tile(
69+
tf.reshape(importance_kernel, shape=[1, *self.patch_size, 1]),
70+
multiples=[1, 1, 1, 1, n_class],
71+
)
5772

58-
@tf.function(experimental_relax_shapes=True)
73+
@tf.function
5974
def call(self, *args, **kwargs):
6075
return self.model(*args, **kwargs)
6176

77+
@tf.function(reduce_retracing=True)
78+
def call_2d(self, *args, **kwargs):
79+
return self.model(*args, **kwargs)
80+
6281
@tf.function
6382
def compute_loss(self, loss_fn, label, preds):
6483
if self.args.deep_supervision:
@@ -77,21 +96,19 @@ def sw_inference(self, img, **kwargs):
7796
return sliding_window_inference(
7897
inputs=img,
7998
roi_size=self.patch_size,
80-
sw_batch_size=self.args.sw_batch_size,
8199
model=self.model,
82100
overlap=self.args.overlap,
83101
n_class=self.n_class,
84-
blend_mode=self.args.blend_mode,
102+
importance_map=self.importance_map,
85103
**kwargs,
86104
)
87105

88106
def inference(self, img):
89-
predictor = self.call if self.args.dim == 2 else self.sw_inference
90-
pred = predictor(img, training=False)
107+
pred = self.predictor(img, training=False)
91108
if self.args.tta:
92109
for flip_axes in self.tta_flips:
93110
flipped_img = tf.reverse(img, axis=flip_axes)
94-
flipped_pred = predictor(flipped_img, training=False)
111+
flipped_pred = self.predictor(flipped_img, training=False)
95112
pred = pred + tf.reverse(flipped_pred, axis=flip_axes)
96113
pred = pred / (len(self.tta_flips) + 1)
97114
return pred

0 commit comments

Comments (0)