33import noisereduce as nr
44import numpy as np
55from fairseq import checkpoint_utils
6- import librosa ,torch ,parselmouth ,faiss ,time ,threading
6+ import librosa ,torch ,parselmouth ,faiss ,time ,threading , math
77import torch .nn .functional as F
8+ import torchaudio .transforms as tat
9+
810#import matplotlib.pyplot as plt
911from infer_pack .models import SynthesizerTrnMs256NSFsid , SynthesizerTrnMs256NSFsid_nono
1012from webui_locale import I18nAuto
@@ -85,7 +87,7 @@ def infer(self,audio:np.ndarray,sampling_rate:int) -> np.ndarray:
8587 audio = librosa .to_mono (audio .transpose (1 , 0 ))
8688 if sampling_rate != 16000 :
8789 audio = librosa .resample (audio , orig_sr = sampling_rate , target_sr = 16000 )
88- print ('test:audio:' + str (audio .shape ))
90+ # print('test:audio:'+str(audio.shape))
8991 '''padding'''
9092
9193
@@ -147,7 +149,8 @@ def __init__(self) -> None:
147149 self .threhold :int = - 30
148150 self .crossfade_time :float = 0.08
149151 self .extra_time :float = 0.04
150- self .noise_reduce = False
152+ self .I_noise_reduce = False
153+ self .O_noise_reduce = False
151154
152155class GUI :
153156 def __init__ (self ) -> None :
@@ -162,6 +165,7 @@ def launcher(self):
162165 layout = [
163166 [
164167 sg .Frame (title = i18n ('加载模型' ),layout = [
168+ [sg .Input (default_text = 'TEMP\\ hubert_base.pt' ,key = 'hubert_path' ),sg .FileBrowse (i18n ('Hubert File' ))],
165169 [sg .Input (default_text = 'TEMP\\ atri.pth' ,key = 'pth_path' ),sg .FileBrowse (i18n ('选择.pth文件' ))],
166170 [sg .Input (default_text = 'TEMP\\ added_IVF512_Flat_atri_baseline_src_feat.index' ,key = 'index_path' ),sg .FileBrowse (i18n ('选择.index文件' ))],
167171 [sg .Input (default_text = 'TEMP\\ big_src_feature_atri.npy' ,key = 'npy_path' ),sg .FileBrowse (i18n ('选择.npy文件' ))]
@@ -183,10 +187,10 @@ def launcher(self):
183187 [sg .Text (i18n ("采样长度" )),sg .Slider (range = (0.1 ,3.0 ),key = 'block_time' ,resolution = 0.1 ,orientation = 'h' ,default_value = 1.0 )],
184188 [sg .Text (i18n ("淡入淡出长度" )),sg .Slider (range = (0.01 ,0.15 ),key = 'crossfade_length' ,resolution = 0.01 ,orientation = 'h' ,default_value = 0.08 )],
185189 [sg .Text (i18n ("额外推理时长" )),sg .Slider (range = (0.05 ,3.00 ),key = 'extra_time' ,resolution = 0.01 ,orientation = 'h' ,default_value = 0.05 )],
186- [sg .Checkbox (i18n ('输出降噪/ Output Noisereduce' ),key = 'noise_reduce ' )]
190+ [sg .Checkbox (i18n ('Input Noisereduce' ), key = 'I_noise_reduce' ), sg . Checkbox ( i18n ( ' Output Noisereduce' ),key = 'O_noise_reduce ' )]
187191 ],title = i18n ("性能设置" ))
188192 ],
189- [sg .Button (i18n ("开始音频转换" ),key = 'start_vc' ),sg .Button (i18n ("停止音频转换" ),key = 'stop_vc' )]
193+ [sg .Button (i18n ("开始音频转换" ),key = 'start_vc' ),sg .Button (i18n ("停止音频转换" ),key = 'stop_vc' ), sg . Text ( i18n ( "Infer Time(ms):" )), sg . Text ( "0" , key = 'infer_time' ) ]
190194 ]
191195
192196 self .window = sg .Window ("RVC - GUI" ,layout = layout )
@@ -219,23 +223,29 @@ def set_values(self,values):
219223 self .config .block_time = values ['block_time' ]
220224 self .config .crossfade_time = values ['crossfade_length' ]
221225 self .config .extra_time = values ['extra_time' ]
222- self .config .noise_reduce = values ['noise_reduce' ]
226+ self .config .I_noise_reduce = values ['I_noise_reduce' ]
227+ self .config .O_noise_reduce = values ['O_noise_reduce' ]
223228
224229 def start_vc (self ):
225230 torch .cuda .empty_cache ()
226231 self .flag_vc = True
232+ self .RMS_threhold = math .e ** (float (self .config .threhold )/ 10 )
227233 self .block_frame = int (self .config .block_time * self .config .samplerate )
228234 self .crossfade_frame = int (self .config .crossfade_time * self .config .samplerate )
229235 self .sola_search_frame = int (0.012 * self .config .samplerate )
230236 self .delay_frame = int (0.02 * self .config .samplerate )#往前预留0.02s
231237 self .extra_frame = int (self .config .extra_time * self .config .samplerate )#往后预留0.04s
232238 self .rvc = None
233239 self .rvc = RVC (self .config .pitch ,self .config .pth_path ,self .config .index_path ,self .config .npy_path )
234- self .input_wav :np .ndarray = np .zeros (self .extra_frame + self .crossfade_frame + self .sola_search_frame + self .block_frame )
235- self .output_wav :np .ndarray = np .zeros (self .block_frame )
236- self .sola_buffer :np .ndarray = np .zeros (self .crossfade_frame ,dtype = 'float32' )
237- self .fade_in_window :np .ndarray = np .linspace (0 , 1 , self .crossfade_frame )
238- self .fade_out_window :np .ndarray = 1 - self .fade_in_window
240+ self .input_wav :np .ndarray = np .zeros (self .extra_frame + self .crossfade_frame + self .sola_search_frame + self .block_frame ,dtype = 'float32' )
241+ self .output_wav :torch .Tensor = torch .zeros (self .block_frame ,device = device ,dtype = torch .float32 )
242+ #self.sola_buffer:np.ndarray=np.zeros(self.crossfade_frame,dtype='float32')
243+ self .sola_buffer :torch .Tensor = torch .zeros (self .crossfade_frame ,device = device ,dtype = torch .float32 )
244+ #self.fade_in_window:np.ndarray = np.linspace(0, 1, self.crossfade_frame)
245+ self .fade_in_window :torch .Tensor = torch .linspace (0.0 ,1.0 ,steps = self .crossfade_frame ,device = device ,dtype = torch .float32 )
246+ self .fade_out_window :torch .Tensor = 1 - self .fade_in_window
247+ self .resampler = tat .Resample (orig_freq = 40000 ,new_freq = self .config .samplerate ,dtype = torch .float32 )
248+ self .RMS = lambda y :torch .sqrt (torch .mean (torch .square (y ))).item ()#RMS calculator
239249 thread_vc = threading .Thread (target = self .soundinput )
240250 thread_vc .start ()
241251
@@ -257,46 +267,48 @@ def audio_callback(self,indata:np.ndarray,outdata:np.ndarray, frames, times, sta
257267 '''
258268 start_time = time .perf_counter ()
259269 indata = librosa .to_mono (indata .T )
260- self .input_wav [:] = np . roll ( self . input_wav , - self . block_frame )
261-
262- #TODO:Convert all numpy calculation to torch
270+ if self .config . I_noise_reduce :
271+ indata [:] = nr . reduce_noise ( y = indata , sr = self . config . samplerate )
272+
263273 '''noise gate'''
264- frame_length = 1024
265- hop_length = 512
274+ frame_length = 2048
275+ hop_length = 1024
266276 rms = librosa .feature .rms (y = indata ,frame_length = frame_length ,hop_length = hop_length )
267277 db_threhold = librosa .amplitude_to_db (rms ,ref = 1.0 )[0 ]< self .config .threhold
268278 #print(rms.shape,db.shape,db)
269279 for i in range (db_threhold .shape [0 ]):
270280 if db_threhold [i ]:
271281 indata [i * hop_length :(i + 1 )* hop_length ]= 0
272- self .input_wav [- self . block_frame :]= indata [:]
282+ self .input_wav [:]= np . append ( self . input_wav [ self . block_frame :], indata )
273283
274284 #infer
275285 print ('input_wav:' + str (self .input_wav .shape ))
276- infer_wav = librosa .resample (y = self .rvc .infer (self .input_wav [:],self .config .samplerate ),orig_sr = 40000 ,target_sr = self .config .samplerate )[- self .crossfade_frame - self .sola_search_frame - self .block_frame :]
277- print ('infered_wav:' + str (infer_wav .shape ))
278-
286+ #print('infered_wav:'+str(infer_wav.shape))
287+ infer_wav :torch .Tensor = self .resampler (torch .from_numpy (self .rvc .infer (self .input_wav ,self .config .samplerate )))[- self .crossfade_frame - self .sola_search_frame - self .block_frame :].to (device )
288+ print ('infer_wav:' + str (infer_wav .shape ))
289+
279290 # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC
280- cor_nom = np . convolve (infer_wav [ : self .crossfade_frame + self .sola_search_frame ], np . flip ( self .sola_buffer ), 'valid' )
281- cor_den = np .sqrt (np . convolve (infer_wav [ : self .crossfade_frame + self .sola_search_frame ] ** 2 , np .ones (self .crossfade_frame ), 'valid' ) + 1e-3 )
282- sola_offset = np .argmax ( cor_nom / cor_den )
283- print ('sola offset: ' + str (sola_offset ))
291+ cor_nom = F . conv1d (infer_wav [None , None ,: self .crossfade_frame + self .sola_search_frame ],self .sola_buffer [ None , None ,:] )
292+ cor_den = torch .sqrt (F . conv1d (infer_wav [None , None ,: self .crossfade_frame + self .sola_search_frame ]** 2 , torch .ones (1 , 1 , self .crossfade_frame , device = device )) + 1e-8 )
293+ sola_offset = torch .argmax ( cor_nom [ 0 , 0 ] / cor_den [ 0 , 0 ] )
294+ print ('sola offset: ' + str (int ( sola_offset ) ))
284295
285296 # crossfade
286297 self .output_wav [:]= infer_wav [sola_offset : sola_offset + self .block_frame ]
287298 self .output_wav [:self .crossfade_frame ] *= self .fade_in_window
288299 self .output_wav [:self .crossfade_frame ] += self .sola_buffer [:]
289-
290300 if sola_offset < self .sola_search_frame :
291301 self .sola_buffer [:] = infer_wav [- self .sola_search_frame - self .crossfade_frame + sola_offset : - self .sola_search_frame + sola_offset ]* self .fade_out_window
292302 else :
293303 self .sola_buffer [:] = infer_wav [- self .crossfade_frame :]* self .fade_out_window
294304
295- if self .config .noise_reduce :
296- self .output_wav [:]= nr .reduce_noise (y = self .output_wav ,sr = self .config .samplerate )
297-
298- outdata [:]= np .array ([self .output_wav ,self .output_wav ]).T
299- print ('infer time:' + str (time .perf_counter ()- start_time ))
305+ if self .config .O_noise_reduce :
306+ outdata [:]= np .tile (nr .reduce_noise (y = self .output_wav [:].cpu ().numpy (),sr = self .config .samplerate ),(2 ,1 )).T
307+ else :
308+ outdata [:]= self .output_wav [:].repeat (2 , 1 ).t ().cpu ().numpy ()
309+ total_time = time .perf_counter ()- start_time
310+ print ('infer time:' + str (total_time ))
311+ self .window ['infer_time' ].update (int (total_time * 1000 ))
300312
301313 def get_devices (self ,update : bool = True ):
302314 '''获取设备列表'''
0 commit comments