@@ -27,6 +27,8 @@
 
 from typing import Optional
 
+import numpy as np
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -52,7 +54,7 @@ def regulate_len(durations, enc_out, pace: float = 1.0,
                                dim=1)[:, None, :]
     reps_cumsum = reps_cumsum.to(dtype)
 
-    range_ = torch.arange(max_len).to(enc_out.device)[None, :, None]
+    range_ = torch.arange(max_len, device=enc_out.device)[None, :, None]
     mult = ((reps_cumsum[:, :, :-1] <= range_) &
             (reps_cumsum[:, :, 1:] > range_))
     mult = mult.to(dtype)
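The `regulate_len` change is a small device-placement fix: `torch.arange(max_len).to(device)` materializes the range on the CPU and then copies it to the target device, while passing `device=` allocates it there in one step. A minimal sketch of the equivalence (the device string is illustrative):

    import torch

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Old: allocate on CPU, then copy host -> device
    slow = torch.arange(1000).to(device)
    # New: allocate directly on the target device, no extra copy
    fast = torch.arange(1000, device=device)
    assert torch.equal(slow, fast)  # same values, cheaper construction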
@@ -218,13 +220,17 @@ def binarize_attention(self, attn, in_lens, out_lens):
         """
         b_size = attn.shape[0]
         with torch.no_grad():
-            attn_cpu = attn.data.cpu().numpy()
-            attn_out = torch.zeros_like(attn)
+            attn_out_cpu = np.zeros(attn.data.shape, dtype=np.float32)
+            log_attn_cpu = torch.log(attn.data).to(device='cpu', dtype=torch.float32)
+            log_attn_cpu = log_attn_cpu.numpy()
+            out_lens_cpu = out_lens.cpu()
+            in_lens_cpu = in_lens.cpu()
             for ind in range(b_size):
                 hard_attn = mas_width1(
-                    attn_cpu[ind, 0, :out_lens[ind], :in_lens[ind]])
-                attn_out[ind, 0, :out_lens[ind], :in_lens[ind]] = torch.tensor(
-                    hard_attn, device=attn.get_device())
+                    log_attn_cpu[ind, 0, :out_lens_cpu[ind], :in_lens_cpu[ind]])
+                attn_out_cpu[ind, 0, :out_lens_cpu[ind], :in_lens_cpu[ind]] = hard_attn
+            attn_out = torch.tensor(
+                attn_out_cpu, device=attn.get_device(), dtype=attn.dtype)
         return attn_out
 
     def binarize_attention_parallel(self, attn, in_lens, out_lens):
@@ -235,8 +241,8 @@ def binarize_attention_parallel(self, attn, in_lens, out_lens):
         attn: B x 1 x max_mel_len x max_text_len
         """
         with torch.no_grad():
-            attn_cpu = attn.data.cpu().numpy()
-            attn_out = b_mas(attn_cpu, in_lens.cpu().numpy(),
+            log_attn_cpu = torch.log(attn.data).cpu().numpy()
+            attn_out = b_mas(log_attn_cpu, in_lens.cpu().numpy(),
                              out_lens.cpu().numpy(), width=1)
         return torch.from_numpy(attn_out).to(attn.get_device())
 
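Both binarizers now hand `mas_width1`/`b_mas` log-probabilities instead of raw attention weights. Monotonic alignment search is a Viterbi-style dynamic program that maximizes the sum of scores along a monotonic path, so the scores need to be in the log domain for the sum to correspond to a path probability; presumably the `log` previously lived inside the helpers and has been hoisted to the call sites here. A hedged sketch of the width-1 recurrence (not the repo's numba implementation, just the dynamic program it computes):

    import numpy as np

    def mas_width1_sketch(log_attn):
        """Width-1 monotonic alignment search over a [T_mel, T_text]
        matrix of log-probabilities. Each mel frame selects one text
        index, and the index never decreases from frame to frame."""
        T_mel, T_text = log_attn.shape
        value = np.full((T_mel, T_text), -np.inf, dtype=log_attn.dtype)
        value[0, 0] = log_attn[0, 0]  # every path starts at (0, 0)
        for i in range(1, T_mel):
            for j in range(min(i + 1, T_text)):
                stay = value[i - 1, j]
                advance = value[i - 1, j - 1] if j > 0 else -np.inf
                # best monotonic path ending at (i, j)
                value[i, j] = log_attn[i, j] + max(stay, advance)
        # backtrack from the last text token at the last mel frame
        opt = np.zeros_like(log_attn)
        j = T_text - 1
        for i in range(T_mel - 1, -1, -1):
            opt[i, j] = 1.0
            if i > 0 and j > 0 and value[i - 1, j - 1] >= value[i - 1, j]:
                j -= 1
        return opt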
@@ -245,6 +251,7 @@ def forward(self, inputs, use_gt_pitch=True, pace=1.0, max_duration=75):
         (inputs, input_lens, mel_tgt, mel_lens, pitch_dense, energy_dense,
          speaker, attn_prior, audiopaths) = inputs
 
+        text_max_len = inputs.size(1)
         mel_max_len = mel_tgt.size(2)
 
         # Calculate speaker embedding
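The new `text_max_len` exists to pin the attention-mask width in the hunk below: with its default `max_len`, a helper like `mask_from_lens` typically sizes the mask from `input_lens.max()`, which can be narrower than the padded text dimension of the batch and make shapes disagree inside the attention. A sketch of the behavior this guards against (the helper body is assumed from its call sites, not copied from the repo):

    import torch

    def mask_from_lens_sketch(lens, max_len=None):
        if max_len is None:
            max_len = lens.max()  # old call: width follows longest item
        ids = torch.arange(max_len, device=lens.device)
        return ids < lens.unsqueeze(1)  # [B, max_len] bool mask

    lens = torch.tensor([3, 5])
    # if the batch is padded to width 7, the default mask is too narrow
    print(mask_from_lens_sketch(lens).shape)             # torch.Size([2, 5])
    print(mask_from_lens_sketch(lens, max_len=7).shape)  # torch.Size([2, 7])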
@@ -257,33 +264,32 @@ def forward(self, inputs, use_gt_pitch=True, pace=1.0, max_duration=75):
         # Input FFT
         enc_out, enc_mask = self.encoder(inputs, conditioning=spk_emb)
 
+        # Predict durations
+        log_dur_pred = self.duration_predictor(enc_out, enc_mask).squeeze(-1)
+        dur_pred = torch.clamp(torch.exp(log_dur_pred) - 1, 0, max_duration)
+
+        # Predict pitch
+        pitch_pred = self.pitch_predictor(enc_out, enc_mask).permute(0, 2, 1)
+
         # Alignment
         text_emb = self.encoder.word_emb(inputs)
 
         # make sure to do the alignments before folding
-        attn_mask = mask_from_lens(input_lens)[..., None] == 0
+        attn_mask = mask_from_lens(input_lens, max_len=text_max_len)
+        attn_mask = attn_mask[..., None] == 0
         # attn_mask should be 1 for unused timesteps in the text_enc_w_spkvec tensor
 
         attn_soft, attn_logprob = self.attention(
             mel_tgt, text_emb.permute(0, 2, 1), mel_lens, attn_mask,
             key_lens=input_lens, keys_encoded=enc_out, attn_prior=attn_prior)
 
-        attn_hard = self.binarize_attention_parallel(
-            attn_soft, input_lens, mel_lens)
+        attn_hard = self.binarize_attention(attn_soft, input_lens, mel_lens)
 
         # Viterbi --> durations
         attn_hard_dur = attn_hard.sum(2)[:, 0, :]
         dur_tgt = attn_hard_dur
-
         assert torch.all(torch.eq(dur_tgt.sum(dim=1), mel_lens))
 
-        # Predict durations
-        log_dur_pred = self.duration_predictor(enc_out, enc_mask).squeeze(-1)
-        dur_pred = torch.clamp(torch.exp(log_dur_pred) - 1, 0, max_duration)
-
-        # Predict pitch
-        pitch_pred = self.pitch_predictor(enc_out, enc_mask).permute(0, 2, 1)
-
         # Average pitch over characters
         pitch_tgt = average_pitch(pitch_dense, dur_tgt)
 
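Moving the duration and pitch predictors ahead of the alignment block does not change what they compute, since both depend only on `enc_out`/`enc_mask`; it just groups the prediction heads together before the alignment-derived targets are built. The duration head itself works in log space: if it is trained toward `log(1 + dur_tgt)`, then `exp(...) - 1` plus the clamp recovers non-negative durations bounded by `max_duration`. A small round-trip sketch under that assumption:

    import torch

    max_duration = 75
    dur_tgt = torch.tensor([1., 3., 0., 7.])  # per-token frame counts
    log_dur_tgt = torch.log(dur_tgt + 1)      # assumed training target
    log_dur_pred = log_dur_tgt                # pretend a perfect predictor
    dur_pred = torch.clamp(torch.exp(log_dur_pred) - 1, 0, max_duration)
    assert torch.allclose(dur_pred, dur_tgt)  # inversion is exact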