Skip to content

Commit 0409902

Browse files
ahmadkinv-kkudrynski
authored and committed
[SSD/PyT] New release with 22.10 base image
1 parent 35feabc commit 0409902

24 files changed

Lines changed: 167 additions & 127 deletions

PyTorch/Detection/SSD/Dockerfile

Lines changed: 7 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1,20 +1,14 @@
1-
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.07-py3
1+
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:22.10-py3
22
FROM ${FROM_IMAGE_NAME}
33

44
# Set working directory
55
WORKDIR /workspace/ssd
66

7-
# Install nv-cocoapi
8-
ENV COCOAPI_VERSION=2.0+nv0.6.0
9-
RUN export COCOAPI_TAG=$(echo ${COCOAPI_VERSION} | sed 's/^.*+n//') \
10-
&& pip install --no-cache-dir pybind11 \
11-
&& pip install --no-cache-dir git+https://github.com/NVIDIA/cocoapi.git@${COCOAPI_TAG}#subdirectory=PythonAPI
12-
# Install dllogger
13-
RUN pip install --no-cache-dir git+https://github.com/NVIDIA/dllogger.git#egg=dllogger
7+
# Copy the model files
8+
COPY . .
149

15-
# Install requirements
16-
COPY requirements.txt .
17-
RUN pip install -r requirements.txt
18-
RUN python3 -m pip install pycocotools==2.0.0
10+
# Install python requirements
11+
RUN pip install --no-cache-dir -r requirements.txt
1912

20-
COPY . .
13+
ENV CUDNN_V8_API_ENABLED=1
14+
ENV TORCH_CUDNN_V8_API_ENABLED=1

PyTorch/Detection/SSD/README.md

Lines changed: 91 additions & 69 deletions
Large diffs are not rendered by default.
Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
# This script launches SSD300 training in FP16 on 1 GPU using 256 batch size
22
# Usage bash SSD300_FP16_1GPU.sh <path to this repository> <path to dataset> <additional flags>
33

4-
python $1/main.py --backbone resnet50 --warmup 300 --bs 256 --amp --data $2 ${@:3}
4+
python $1/main.py --backbone resnet50 --warmup 300 --bs 256 --data $2 ${@:3}
Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
# This script launches SSD300 training in FP16 on 4 GPUs using 1024 batch size (256 per GPU)
22
# Usage ./SSD300_FP16_4GPU.sh <path to this repository> <path to dataset> <additional flags>
33

4-
python -m torch.distributed.launch --nproc_per_node=4 $1/main.py --backbone resnet50 --learning-rate 2.7e-3 --warmup 1200 --bs 256 --amp --data $2 ${@:3}
4+
torchrun --nproc_per_node=4 $1/main.py --backbone resnet50 --learning-rate 2.7e-3 --warmup 1200 --bs 256 --data $2 ${@:3}
Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
# This script launches SSD300 training in FP16 on 8 GPUs using 1024 batch size (128 per GPU)
22
# Usage ./SSD300_FP16_8GPU.sh <path to this repository> <path to dataset> <additional flags>
33

4-
python -m torch.distributed.launch --nproc_per_node=8 $1/main.py --backbone resnet50 --learning-rate 2.7e-3 --warmup 1200 --bs 128 --amp --data $2 ${@:3}
4+
torchrun --nproc_per_node=8 $1/main.py --backbone resnet50 --learning-rate 2.7e-3 --warmup 1200 --bs 128 --data $2 ${@:3}
Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
# This script launches SSD300 training in FP32 on 8 GPUs using 1024 batch size (128 per GPU)
22
# Usage ./SSD300_FP32_8GPU.sh <path to this repository> <path to dataset> <additional flags>
33

4-
python -m torch.distributed.launch --nproc_per_node=8 $1/main.py --backbone resnet50 --learning-rate 2.7e-3 --warmup 1200 --bs 128 --data $2 ${@:3}
4+
torchrun --nproc_per_node=8 $1/main.py --backbone resnet50 --learning-rate 2.7e-3 --warmup 1200 --bs 128 --no-amp --data $2 ${@:3}
Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
# This script launches SSD300 training in FP16 on 1 GPU using 64 batch size
22
# Usage bash SSD300_FP16_1GPU.sh <path to this repository> <path to dataset> <additional flags>
33

4-
python $1/main.py --backbone resnet50 --warmup 300 --bs 64 --amp --data $2 ${@:3}
4+
python $1/main.py --backbone resnet50 --warmup 300 --bs 64 --data $2 ${@:3}
Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
# This script launches SSD300 training in FP16 on 4 GPUs using 256 batch size (64 per GPU)
22
# Usage ./SSD300_FP16_4GPU.sh <path to this repository> <path to dataset> <additional flags>
33

4-
python -m torch.distributed.launch --nproc_per_node=4 $1/main.py --backbone resnet50 --warmup 300 --bs 64 --amp --data $2 ${@:3}
4+
torchrun --nproc_per_node=4 $1/main.py --backbone resnet50 --warmup 300 --bs 64 --data $2 ${@:3}
Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
# This script launches SSD300 training in FP16 on 8 GPUs using 512 batch size (64 per GPU)
22
# Usage ./SSD300_FP16_8GPU.sh <path to this repository> <path to dataset> <additional flags>
33

4-
python -m torch.distributed.launch --nproc_per_node=8 $1/main.py --backbone resnet50 --warmup 300 --bs 64 --amp --data $2 ${@:3}
4+
torchrun --nproc_per_node=8 $1/main.py --backbone resnet50 --warmup 300 --bs 64 --data $2 ${@:3}
Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
# This script evaluates SSD300 model in FP16 using 32 batch size on 1 GPU
22
# Usage: ./SSD300_FP16_EVAL.sh <path to this repository> <path to dataset> <path to checkpoint> <additional flags>
33

4-
python $1/main.py --backbone resnet50 --amp --ebs 32 --data $2 --mode evaluation --checkpoint $3 ${@:4}
4+
python $1/main.py --backbone resnet50 --ebs 32 --data $2 --mode evaluation --checkpoint $3 ${@:4}

0 commit comments

Comments
 (0)