Skip to content

Commit a2f02eb

Browse files
alancucki authored and nv-kkudrynski committed
[FastPitch/PyT] Fix off-by-one in grad acc
1 parent 63cbbdc commit a2f02eb

5 files changed

Lines changed: 13 additions & 10 deletions

File tree

PyTorch/SpeechSynthesis/FastPitch/inference.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -514,7 +514,7 @@ def generate_audio(mel):
514514
audio[-fade_len:] *= fade_w.to(audio.device)
515515

516516
audio = audio / torch.max(torch.abs(audio))
517-
fname = b['output'][i] if 'output' in b else f'audio_{i}.wav'
517+
fname = b['output'][i] if 'output' in b else f'audio_{all_utterances + i}.wav'
518518
audio_path = Path(args.output, fname)
519519
write(audio_path, args.sampling_rate, audio.cpu().numpy())
520520

PyTorch/SpeechSynthesis/FastPitch/scripts/inference_benchmark.sh

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,13 @@
22

33
set -a
44

5-
: ${PHRASES:="phrases/benchmark_8_128.tsv"}
6-
: ${OUTPUT_DIR:="./output/audio_$(basename ${PHRASES} .tsv)"}
5+
: ${FILELIST:="phrases/benchmark_8_128.tsv"}
6+
: ${OUTPUT_DIR:="./output/audio_$(basename ${FILELIST} .tsv)"}
77
: ${TORCHSCRIPT:=true}
8-
: ${REPEATS:=100}
98
: ${BS_SEQUENCE:="1 4 8"}
10-
: ${WARMUP:=100}
9+
: ${WARMUP:=64}
10+
: ${REPEATS:=500}
11+
: ${AMP:=false}
1112

1213
for BATCH_SIZE in $BS_SEQUENCE ; do
1314
LOG_FILE="$OUTPUT_DIR"/perf-infer_amp-${AMP}_bs${BATCH_SIZE}.json

PyTorch/SpeechSynthesis/FastPitch/scripts/inference_example.sh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env bash
22

3-
export CUDNN_V8_API_ENABLED=1
3+
export CUDNN_V8_API_ENABLED=1 # Keep the flag for older containers
4+
export TORCH_CUDNN_V8_API_ENABLED=1
45

56
: ${DATASET_DIR:="data/LJSpeech-1.1"}
67
: ${BATCH_SIZE:=32}
@@ -55,7 +56,7 @@ echo -e "\nAMP=$AMP, batch_size=$BATCH_SIZE\n"
5556

5657
ARGS=""
5758
ARGS+=" --cuda"
58-
ARGS+=" --cudnn-benchmark"
59+
# ARGS+=" --cudnn-benchmark" # Enable for benchmarking or long operation
5960
ARGS+=" --dataset-path $DATASET_DIR"
6061
ARGS+=" -i $FILELIST"
6162
ARGS+=" -o $OUTPUT_DIR"

PyTorch/SpeechSynthesis/FastPitch/scripts/train_benchmark.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ set -a
1111
for NUM_GPUS in $NUM_GPUS_SEQUENCE ; do
1212
GRAD_ACCUMULATION=$((256 / $BATCH_SIZE / $NUM_GPUS ))
1313
LOG_FILE=$OUTPUT_DIR/perf-train_amp-${AMP}_${NUM_GPUS}x${BATCH_SIZE}x${GRAD_ACCUMULATION}.json
14-
bash scripts/train.sh "$@"
14+
BMARK_EPOCHS=$((EPOCHS * 2 / 3 * $NUM_GPUS / 8)) # 2/3 of EPOCHS
15+
EPOCHS=$((EPOCHS * $NUM_GPUS / 8)) bash scripts/train.sh "$@" --benchmark-epochs-num $BMARK_EPOCHS
1516
rm -f $OUTPUT_DIR/FastPitch*.pt
1617
done

PyTorch/SpeechSynthesis/FastPitch/train.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -417,8 +417,8 @@ def main():
417417

418418
epoch_iter = 1
419419
for batch, accum_step in zip(train_loader,
420-
cycle(range(args.grad_accumulation))):
421-
if accum_step == 0:
420+
cycle(range(1, args.grad_accumulation + 1))):
421+
if accum_step == 1:
422422
adjust_learning_rate(total_iter, optimizer, args.learning_rate,
423423
args.warmup_steps)
424424

0 commit comments

Comments (0)