[SSD/PyT] New release with 22.10 base image

ahmadki · nv-kkudrynski · commit 04099020ae68 · 2023-01-30T05:34:08.000-08:00
diff --git a/PyTorch/Detection/SSD/Dockerfile b/PyTorch/Detection/SSD/Dockerfile
@@ -1,20 +1,15 @@
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.07-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:22.10-py3
 FROM ${FROM_IMAGE_NAME}
 
 # Set working directory
 WORKDIR /workspace/ssd
 
-# Install nv-cocoapi
-ENV COCOAPI_VERSION=2.0+nv0.6.0
-RUN export COCOAPI_TAG=$(echo ${COCOAPI_VERSION} | sed 's/^.*+n//') \
- && pip install --no-cache-dir pybind11                             \
- && pip install --no-cache-dir git+https://github.com/NVIDIA/cocoapi.git@${COCOAPI_TAG}#subdirectory=PythonAPI
-# Install dllogger
-RUN pip install --no-cache-dir git+https://github.com/NVIDIA/dllogger.git#egg=dllogger
-
-# Install requirements
+# Install python requirements before copying the sources so this layer stays cached
 COPY requirements.txt .
-RUN pip install -r requirements.txt
-RUN python3 -m pip install pycocotools==2.0.0
+RUN pip install --no-cache-dir -r requirements.txt
 
+# Copy the model files
 COPY . .
+
+ENV CUDNN_V8_API_ENABLED=1
+ENV TORCH_CUDNN_V8_API_ENABLED=1
diff --git a/PyTorch/Detection/SSD/README.md b/PyTorch/Detection/SSD/README.md
diff --git a/PyTorch/Detection/SSD/examples/SSD300_A100_FP16_1GPU.sh b/PyTorch/Detection/SSD/examples/SSD300_A100_FP16_1GPU.sh
@@ -1,4 +1,4 @@
 # This script launches SSD300 training in FP16 on 1 GPUs using 256 batch size
 # Usage bash SSD300_FP16_1GPU.sh <path to this repository> <path to dataset> <additional flags>
 
-python $1/main.py --backbone resnet50 --warmup 300 --bs 256 --amp --data $2 ${@:3}
+python $1/main.py --backbone resnet50 --warmup 300 --bs 256 --data $2 ${@:3}
diff --git a/PyTorch/Detection/SSD/examples/SSD300_A100_FP16_4GPU.sh b/PyTorch/Detection/SSD/examples/SSD300_A100_FP16_4GPU.sh
@@ -1,4 +1,4 @@
 # This script launches SSD300 training in FP16 on 4 GPUs using 1024 batch size (256 per GPU)
 # Usage ./SSD300_FP16_4GPU.sh <path to this repository> <path to dataset> <additional flags>
 
-python -m torch.distributed.launch --nproc_per_node=4 $1/main.py --backbone resnet50 --learning-rate 2.7e-3 --warmup 1200 --bs 256 --amp --data $2 ${@:3}
+torchrun --nproc_per_node=4 $1/main.py --backbone resnet50 --learning-rate 2.7e-3 --warmup 1200 --bs 256 --data $2 ${@:3}
diff --git a/PyTorch/Detection/SSD/examples/SSD300_A100_FP16_8GPU.sh b/PyTorch/Detection/SSD/examples/SSD300_A100_FP16_8GPU.sh
@@ -1,4 +1,4 @@
 # This script launches SSD300 training in FP16 on 8 GPUs using 1024 batch size (128 per GPU)
 # Usage ./SSD300_FP16_8GPU.sh <path to this repository> <path to dataset> <additional flags>
 
-python -m torch.distributed.launch --nproc_per_node=8 $1/main.py --backbone resnet50 --learning-rate 2.7e-3 --warmup 1200 --bs 128 --amp --data $2 ${@:3}
+torchrun --nproc_per_node=8 $1/main.py --backbone resnet50 --learning-rate 2.7e-3 --warmup 1200 --bs 128 --data $2 ${@:3}
diff --git a/PyTorch/Detection/SSD/examples/SSD300_A100_FP32_8GPU.sh b/PyTorch/Detection/SSD/examples/SSD300_A100_FP32_8GPU.sh
@@ -1,4 +1,4 @@
 # This script launches SSD300 training in FP32 on 8 GPUs using 1024 batch size (128 per GPU)
 # Usage ./SSD300_FP32_8GPU.sh <path to this repository> <path to dataset> <additional flags>
 
-python -m torch.distributed.launch --nproc_per_node=8 $1/main.py --backbone resnet50 --learning-rate 2.7e-3 --warmup 1200 --bs 128 --data $2 ${@:3}
+torchrun --nproc_per_node=8 $1/main.py --backbone resnet50 --learning-rate 2.7e-3 --warmup 1200 --bs 128 --no-amp --data $2 ${@:3}
diff --git a/PyTorch/Detection/SSD/examples/SSD300_FP16_1GPU.sh b/PyTorch/Detection/SSD/examples/SSD300_FP16_1GPU.sh
@@ -1,4 +1,4 @@
 # This script launches SSD300 training in FP16 on 1 GPUs using 64 batch size
 # Usage bash SSD300_FP16_1GPU.sh <path to this repository> <path to dataset> <additional flags>
 
-python $1/main.py --backbone resnet50 --warmup 300 --bs 64 --amp --data $2 ${@:3}
+python $1/main.py --backbone resnet50 --warmup 300 --bs 64 --data $2 ${@:3}
diff --git a/PyTorch/Detection/SSD/examples/SSD300_FP16_4GPU.sh b/PyTorch/Detection/SSD/examples/SSD300_FP16_4GPU.sh
@@ -1,4 +1,4 @@
 # This script launches SSD300 training in FP16 on 4 GPUs using 256 batch size (64 per GPU)
 # Usage ./SSD300_FP16_4GPU.sh <path to this repository> <path to dataset> <additional flags>
 
-python -m torch.distributed.launch --nproc_per_node=4 $1/main.py --backbone resnet50 --warmup 300 --bs 64 --amp --data $2 ${@:3}
+torchrun --nproc_per_node=4 $1/main.py --backbone resnet50 --warmup 300 --bs 64 --data $2 ${@:3}
diff --git a/PyTorch/Detection/SSD/examples/SSD300_FP16_8GPU.sh b/PyTorch/Detection/SSD/examples/SSD300_FP16_8GPU.sh
@@ -1,4 +1,4 @@
 # This script launches SSD300 training in FP16 on 8 GPUs using 512 batch size (64 per GPU)
 # Usage ./SSD300_FP16_8GPU.sh <path to this repository> <path to dataset> <additional flags>
 
-python -m torch.distributed.launch --nproc_per_node=8 $1/main.py --backbone resnet50 --warmup 300 --bs 64 --amp --data $2 ${@:3}
+torchrun --nproc_per_node=8 $1/main.py --backbone resnet50 --warmup 300 --bs 64 --data $2 ${@:3}
diff --git a/PyTorch/Detection/SSD/examples/SSD300_FP16_EVAL.sh b/PyTorch/Detection/SSD/examples/SSD300_FP16_EVAL.sh
@@ -1,4 +1,4 @@
 # This script evaluates SSD300 model in FP16 using 32 batch size on 1 GPU
 # Usage: ./SSD300_FP16_EVAL.sh <path to this repository> <path to dataset> <path to checkpoint> <additional flags>
 
-python $1/main.py --backbone resnet50 --amp --ebs 32 --data $2 --mode evaluation --checkpoint $3 ${@:4}
+python $1/main.py --backbone resnet50 --ebs 32 --data $2 --mode evaluation --checkpoint $3 ${@:4}
diff --git a/PyTorch/Detection/SSD/examples/SSD300_FP16_INFERENCE_BENCHMARK.sh b/PyTorch/Detection/SSD/examples/SSD300_FP16_INFERENCE_BENCHMARK.sh
@@ -1,4 +1,4 @@
 # This script launches SSD300 inference benchmark in FP16 on 1 GPU with 64 batch size
 # Usage bash SSD300_FP16_INFERENCE_BENCHMARK.sh <path to this repository> <path to dataset> <additional flags>
 
-python $1/main.py --backbone resnet50 --mode benchmark-inference --bs 64 --amp --data $2 ${@:3}
+python $1/main.py --backbone resnet50 --mode benchmark-inference --bs 64 --data $2 ${@:3}
diff --git a/PyTorch/Detection/SSD/examples/SSD300_FP32_1GPU.sh b/PyTorch/Detection/SSD/examples/SSD300_FP32_1GPU.sh
@@ -1,4 +1,4 @@
 # This script launches SSD300 training in FP32 on 1 GPUs using 32 batch size
 # Usage ./SSD300_FP32_1GPU.sh <path to this repository> <path to dataset> <additional flags>
 
-python $1/main.py --backbone resnet50 --bs 32 --warmup 300 --data $2 ${@:3}
+python $1/main.py --backbone resnet50 --bs 32 --warmup 300 --no-amp --data-layout channels_first --data $2 ${@:3}
diff --git a/PyTorch/Detection/SSD/examples/SSD300_FP32_4GPU.sh b/PyTorch/Detection/SSD/examples/SSD300_FP32_4GPU.sh
@@ -1,4 +1,4 @@
 # This script launches SSD300 training in FP32 on 4 GPUs using 128 batch size (32 per GPU)
 # Usage ./SSD300_FP32_4GPU.sh <path to this repository> <path to dataset> <additional flags>
 
-python -m torch.distributed.launch --nproc_per_node=4 $1/main.py --backbone resnet50 --warmup 300 --bs 32 --data $2 ${@:3}
+torchrun --nproc_per_node=4 $1/main.py --backbone resnet50 --warmup 300 --bs 32 --no-amp --data-layout channels_first --data $2 ${@:3}
diff --git a/PyTorch/Detection/SSD/examples/SSD300_FP32_8GPU.sh b/PyTorch/Detection/SSD/examples/SSD300_FP32_8GPU.sh
@@ -1,4 +1,4 @@
 # This script launches SSD300 training in FP32 on 8 GPUs using 256 batch size (32 per GPU)
 # Usage ./SSD300_FP32_8GPU.sh <path to this repository> <path to dataset> <additional flags>
 
-python -m torch.distributed.launch --nproc_per_node=8 $1/main.py --backbone resnet50 --warmup 300 --bs 32 --data $2 ${@:3}
+torchrun --nproc_per_node=8 $1/main.py --backbone resnet50 --warmup 300 --bs 32 --no-amp --data-layout channels_first --data $2 ${@:3}
diff --git a/PyTorch/Detection/SSD/examples/SSD300_FP32_EVAL.sh b/PyTorch/Detection/SSD/examples/SSD300_FP32_EVAL.sh
@@ -1,4 +1,4 @@
 # This script evaluates SSD300 model in FP32 using 32 batch size on 1 GPU
 # Usage: ./SSD300_FP32_EVAL.sh <path to this repository> <path to dataset> <path to checkpoint> <additional flags>
 
-python $1/main.py --backbone resnet50 --ebs 32 --data $2 --mode evaluation --checkpoint $3 ${@:4}
+python $1/main.py --backbone resnet50 --ebs 32 --data $2 --mode evaluation --no-amp --data-layout channels_first --checkpoint $3 ${@:4}
diff --git a/PyTorch/Detection/SSD/examples/SSD300_FP32_INFERENCE_BENCHMARK.sh b/PyTorch/Detection/SSD/examples/SSD300_FP32_INFERENCE_BENCHMARK.sh
@@ -1,4 +1,4 @@
 # This script launches SSD300 inference benchmark in FP32 on 1 GPU with 64 batch size
 # Usage bash SSD300_FP32_INFERENCE_BENCHMARK.sh <path to this repository> <path to dataset> <additional flags>
 
-python $1/main.py --backbone resnet50 --warmup 300 --mode benchmark-inference --bs 32 --data $2 ${@:3}
+python $1/main.py --backbone resnet50 --warmup 300 --mode benchmark-inference --bs 32 --no-amp --data-layout channels_first --data $2 ${@:3}
diff --git a/PyTorch/Detection/SSD/examples/SSD300_inference.py b/PyTorch/Detection/SSD/examples/SSD300_inference.py
@@ -28,7 +28,7 @@ def load_checkpoint(model, model_file):
 
 
 def build_predictor(model_file, backbone='resnet50'):
-    ssd300 = SSD300(backbone=ResNet(backbone))
+    ssd300 = SSD300(backbone=ResNet(backbone=backbone))
     load_checkpoint(ssd300, model_file)
 
     return ssd300
diff --git a/PyTorch/Detection/SSD/main.py b/PyTorch/Detection/SSD/main.py
@@ -67,6 +67,9 @@ def make_parser():
                         help='manually set random seed for torch')
     parser.add_argument('--checkpoint', type=str, default=None,
                         help='path to model checkpoint file')
+    parser.add_argument('--torchvision-weights-version', type=str, default="IMAGENET1K_V2",
+                        choices=['IMAGENET1K_V1', 'IMAGENET1K_V2', 'DEFAULT'],
+                        help='The torchvision weights version to use when --backbone-path is not specified')
     parser.add_argument('--save', type=str, default=None,
                         help='save model checkpoints in the specified directory')
     parser.add_argument('--mode', type=str, default='training',
@@ -97,9 +100,19 @@ def make_parser():
                              ' backbone model declared with the --backbone argument.'
                              ' When it is not provided, pretrained model from torchvision'
                              ' will be downloaded.')
-    parser.add_argument('--num-workers', type=int, default=4)
-    parser.add_argument('--amp', action='store_true',
-                        help='Whether to enable AMP ops. When false, uses TF32 on A100 and FP32 on V100 GPUS.')
+    parser.add_argument('--num-workers', type=int, default=8)
+    parser.add_argument("--amp", dest='amp', action="store_true",
+                        help="Enable Automatic Mixed Precision (AMP).")
+    parser.add_argument("--no-amp", dest='amp', action="store_false",
+                        help="Disable Automatic Mixed Precision (AMP).")
+    parser.set_defaults(amp=True)
+    parser.add_argument("--allow-tf32", dest='allow_tf32', action="store_true",
+                        help="Allow TF32 computations on supported GPUs.")
+    parser.add_argument("--no-allow-tf32", dest='allow_tf32', action="store_false",
+                        help="Disable TF32 computations.")
+    parser.set_defaults(allow_tf32=True)
+    parser.add_argument('--data-layout', default="channels_last", choices=['channels_first', 'channels_last'],
+                        help="Model data layout. It's recommended to use channels_first with --no-amp")
     parser.add_argument('--log-interval', type=int, default=20,
                         help='Logging interval.')
     parser.add_argument('--json-summary', type=str, default=None,
@@ -150,7 +163,9 @@ def train(train_loop_func, logger, args):
     val_dataset = get_val_dataset(args)
     val_dataloader = get_val_dataloader(val_dataset, args)
 
-    ssd300 = SSD300(backbone=ResNet(args.backbone, args.backbone_path))
+    ssd300 = SSD300(backbone=ResNet(backbone=args.backbone,
+                                    backbone_path=args.backbone_path,
+                                    weights=args.torchvision_weights_version))
     args.learning_rate = args.learning_rate * args.N_gpu * (args.batch_size / 32)
     start_epoch = 0
     iteration = 0
@@ -223,6 +238,7 @@ def train(train_loop_func, logger, args):
                 obj['model'] = ssd300.module.state_dict()
             else:
                 obj['model'] = ssd300.state_dict()
+            os.makedirs(args.save, exist_ok=True)
             save_path = os.path.join(args.save, f'epoch_{epoch}.pt')
             torch.save(obj, save_path)
             logger.log('model path', save_path)
@@ -261,6 +277,8 @@ def log_params(logger, args):
     if args.local_rank == 0:
         os.makedirs('./models', exist_ok=True)
 
+    torch.backends.cuda.matmul.allow_tf32 = args.allow_tf32
+    torch.backends.cudnn.allow_tf32 = args.allow_tf32
     torch.backends.cudnn.benchmark = True
 
     # write json only on the main thread
diff --git a/PyTorch/Detection/SSD/requirements.txt b/PyTorch/Detection/SSD/requirements.txt
@@ -1,3 +1,6 @@
-Cython>=0.28.4
-scikit-image>=0.15.0
-ujson>=4.0.2
+Cython>=0.29.32
+scikit-image>=0.19.3
+ujson>=5.5.0
+pybind11>=2.10.0
+git+https://github.com/NVIDIA/cocoapi.git@v0.7.3#subdirectory=PythonAPI
+git+https://github.com/NVIDIA/dllogger.git#egg=dllogger
diff --git a/PyTorch/Detection/SSD/ssd/coco_pipeline.py b/PyTorch/Detection/SSD/ssd/coco_pipeline.py
@@ -21,6 +21,7 @@
 # DALI imports
 import nvidia.dali as dali
 from nvidia.dali.pipeline import Pipeline
+from nvidia.dali.types import to_numpy_type
 
 
 class COCOPipeline(Pipeline):
@@ -124,14 +125,14 @@ def define_graph(self):
         return (images, bboxes.gpu(), labels.gpu())
 
 to_torch_type = {
-    np.dtype(np.float32) : torch.float32,
-    np.dtype(np.float64) : torch.float64,
-    np.dtype(np.float16) : torch.float16,
-    np.dtype(np.uint8)   : torch.uint8,
-    np.dtype(np.int8)    : torch.int8,
-    np.dtype(np.int16)   : torch.int16,
-    np.dtype(np.int32)   : torch.int32,
-    np.dtype(np.int64)   : torch.int64
+    np.float32 : torch.float32,
+    np.float64 : torch.float64,
+    np.float16 : torch.float16,
+    np.uint8   : torch.uint8,
+    np.int8    : torch.int8,
+    np.int16   : torch.int16,
+    np.int32   : torch.int32,
+    np.int64   : torch.int64
 }
 
 def feed_ndarray(dali_tensor, arr):
@@ -242,9 +243,9 @@ def __next__(self):
                     labels_shape[j].append(lshape)
 
             # We always need to alocate new memory as bboxes and labels varies in shape
-            images_torch_type = to_torch_type[np.dtype(images[0].dtype())]
-            bboxes_torch_type = to_torch_type[np.dtype(bboxes[0][0].dtype())]
-            labels_torch_type = to_torch_type[np.dtype(labels[0][0].dtype())]
+            images_torch_type = to_torch_type[to_numpy_type(images[0].dtype)]
+            bboxes_torch_type = to_torch_type[to_numpy_type(bboxes[0][0].dtype)]
+            labels_torch_type = to_torch_type[to_numpy_type(labels[0][0].dtype)]
 
             torch_gpu_device = torch.device('cuda', dev_id)
             torch_cpu_device = torch.device('cpu')
diff --git a/PyTorch/Detection/SSD/ssd/evaluate.py b/PyTorch/Detection/SSD/ssd/evaluate.py
@@ -52,10 +52,8 @@ def evaluate(model, coco, cocoGt, encoder, inv_map, args):
 
                 try:
                     result = encoder.decode_batch(ploc_i, plabel_i, 0.50, 200)[0]
-                except:
-                    # raise
-                    print("")
-                    print("No object detected in idx: {}".format(idx))
+                except Exception as e:  # best-effort: report the failure and move on to the next sample
+                    print("Skipping idx {}, failed to decode with message: {}".format(idx, e))
                     continue
 
                 htot, wtot = img_size[0][idx].item(), img_size[1][idx].item()
diff --git a/PyTorch/Detection/SSD/ssd/model.py b/PyTorch/Detection/SSD/ssd/model.py
@@ -18,22 +18,22 @@
 
 
 class ResNet(nn.Module):
-    def __init__(self, backbone='resnet50', backbone_path=None):
+    def __init__(self, backbone='resnet50', backbone_path=None, weights="IMAGENET1K_V1"):
         super().__init__()
         if backbone == 'resnet18':
-            backbone = resnet18(pretrained=not backbone_path)
+            backbone = resnet18(weights=None if backbone_path else weights)
             self.out_channels = [256, 512, 512, 256, 256, 128]
         elif backbone == 'resnet34':
-            backbone = resnet34(pretrained=not backbone_path)
+            backbone = resnet34(weights=None if backbone_path else weights)
             self.out_channels = [256, 512, 512, 256, 256, 256]
         elif backbone == 'resnet50':
-            backbone = resnet50(pretrained=not backbone_path)
+            backbone = resnet50(weights=None if backbone_path else weights)
             self.out_channels = [1024, 512, 512, 256, 256, 256]
         elif backbone == 'resnet101':
-            backbone = resnet101(pretrained=not backbone_path)
+            backbone = resnet101(weights=None if backbone_path else weights)
             self.out_channels = [1024, 512, 512, 256, 256, 256]
         else:  # backbone == 'resnet152':
-            backbone = resnet152(pretrained=not backbone_path)
+            backbone = resnet152(weights=None if backbone_path else weights)
             self.out_channels = [1024, 512, 512, 256, 256, 256]
         if backbone_path:
             backbone.load_state_dict(torch.load(backbone_path))
@@ -108,7 +108,7 @@ def _init_weights(self):
     def bbox_view(self, src, loc, conf):
         ret = []
         for s, l, c in zip(src, loc, conf):
-            ret.append((l(s).view(s.size(0), 4, -1), c(s).view(s.size(0), self.label_num, -1)))
+            ret.append((l(s).reshape(s.size(0), 4, -1), c(s).reshape(s.size(0), self.label_num, -1)))
 
         locs, confs = list(zip(*ret))
         locs, confs = torch.cat(locs, 2).contiguous(), torch.cat(confs, 2).contiguous()
diff --git a/PyTorch/Detection/SSD/ssd/train.py b/PyTorch/Detection/SSD/ssd/train.py
@@ -44,6 +44,8 @@ def train_loop(model, loss_func, scaler, epoch, optim, train_dataloader, val_dat
         label = label.view(N, M)
 
         with torch.cuda.amp.autocast(enabled=args.amp):
+            if args.data_layout == 'channels_last':
+                img = img.to(memory_format=torch.channels_last)
             ploc, plabel = model(img)
 
             ploc, plabel = ploc.float(), plabel.float()
@@ -101,6 +103,8 @@ def benchmark_train_loop(model, loss_func, scaler, epoch, optim, train_dataloade
         label = label.view(N, M)
 
         with torch.cuda.amp.autocast(enabled=args.amp):
+            if args.data_layout == 'channels_last':
+                img = img.to(memory_format=torch.channels_last)
             ploc, plabel = model(img)
 
             ploc, plabel = ploc.float(), plabel.float()
diff --git a/PyTorch/Detection/SSD/ssd/utils.py b/PyTorch/Detection/SSD/ssd/utils.py
@@ -217,7 +217,7 @@ def decode_single(self, bboxes_in, scores_in, criteria, max_output, max_num=200)
 
 
         _, max_ids = scores_out.sort(dim=0)
-        max_ids = max_ids[-max_output:]
+        max_ids = max_ids[-max_output:].to("cpu")
         return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids]