33import noisereduce as nr
44import numpy as np
55from fairseq import checkpoint_utils
6- import librosa ,torch ,parselmouth ,faiss ,time ,threading , math
6+ import librosa ,torch ,parselmouth ,faiss ,time ,threading
77import torch .nn .functional as F
88import torchaudio .transforms as tat
99
1515device = torch .device ("cuda" if torch .cuda .is_available () else "cpu" )
1616
1717class RVC :
18- def __init__ (self ,key ,pth_path ,index_path ,npy_path ) -> None :
18+ def __init__ (self ,key ,hubert_path , pth_path ,index_path ,npy_path , index_rate ) -> None :
1919 '''
2020 初始化
2121 '''
@@ -26,8 +26,10 @@ def __init__(self,key,pth_path,index_path,npy_path) -> None:
2626 self .f0_mel_min = 1127 * np .log (1 + self .f0_min / 700 )
2727 self .f0_mel_max = 1127 * np .log (1 + self .f0_max / 700 )
2828 self .index = faiss .read_index (index_path )
29+ self .index_rate = index_rate
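30+ '''index_rate: weight given to the index-retrieved features when they are blended with the model features in infer(); 0 skips retrieval'''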
2931 self .big_npy = np .load (npy_path )
30- model_path = "TEMP \\ hubert_base.pt"
32+ model_path = hubert_path
3133 print ("load model(s) from {}" .format (model_path ))
3234 models , saved_cfg , task = checkpoint_utils .load_model_ensemble_and_task (
3335 [model_path ],
@@ -75,25 +77,11 @@ def get_f0(self,x, p_len,f0_up_key=0):
7577 return f0_coarse , f0bak
7678
7779
78- def infer (self ,audio : np . ndarray , sampling_rate : int ) -> np .ndarray :
80+ def infer (self ,feats : torch . Tensor ) -> np .ndarray :
7981 '''
80- 推理函数。
81- :param audio: ndarray(n,2)
82- :sampling_rate: 采样率
82+ 推理函数
8383 '''
84-
85- # f0_up_key=12
86- if len (audio .shape ) > 1 :
87- audio = librosa .to_mono (audio .transpose (1 , 0 ))
88- if sampling_rate != 16000 :
89- audio = librosa .resample (audio , orig_sr = sampling_rate , target_sr = 16000 )
90- #print('test:audio:'+str(audio.shape))
91- '''padding'''
92-
93-
94- feats = torch .from_numpy (audio ).float ()
95- if feats .dim () == 2 : # double channels
96- feats = feats .mean (- 1 )
84+ audio = feats .clone ().cpu ().numpy ()
9785 assert feats .dim () == 1 , feats .dim ()
9886 feats = feats .view (1 , - 1 )
9987 padding_mask = torch .BoolTensor (feats .shape ).fill_ (False )
@@ -108,17 +96,17 @@ def infer(self,audio:np.ndarray,sampling_rate:int) -> np.ndarray:
10896 feats = self .model .final_proj (logits [0 ])
10997
11098 ####索引优化
111- npy = feats [0 ].cpu ().numpy ().astype ("float32" )
112- D , I = self .index .search (npy , 1 )
113- # feats = torch.from_numpy(big_npy[I.squeeze()].astype("float16")).unsqueeze(0).to(device)
114- index_rate = 0.5
115- feats = torch .from_numpy (npy ).unsqueeze (0 ).to (device ) * index_rate + (1 - index_rate ) * feats
116- feats = feats .half ()
99+ if (isinstance (self .index ,type (None ))== False and isinstance (self .big_npy ,type (None ))== False and self .index_rate != 0 ):
100+ npy = feats [0 ].cpu ().numpy ().astype ("float32" )
101+ _ , I = self .index .search (npy , 1 )
102+ npy = self .big_npy [I .squeeze ()].astype ("float16" )
103+ feats = torch .from_numpy (npy ).unsqueeze (0 ).to (device )* self .index_rate + (1 - self .index_rate )* feats
117104
118105 feats = F .interpolate (feats .permute (0 ,2 ,1 ),scale_factor = 2 ).permute (0 ,2 ,1 )
119106 torch .cuda .synchronize ()
120107 # p_len = min(feats.shape[1],10000,pitch.shape[0])#太大了爆显存
121108 p_len = min (feats .shape [1 ],12000 )#
109+ print (feats .shape )
122110 pitch , pitchf = self .get_f0 (audio , p_len ,self .f0_up_key )
123111 p_len = min (feats .shape [1 ],12000 ,pitch .shape [0 ])#太大了爆显存
124112 torch .cuda .synchronize ()
@@ -132,13 +120,14 @@ def infer(self,audio:np.ndarray,sampling_rate:int) -> np.ndarray:
132120 ii = 0 #sid
133121 sid = torch .LongTensor ([ii ]).to (device )
134122 with torch .no_grad ():
135- audio = self .net_g .infer (feats , p_len ,pitch ,pitchf ,sid )[0 ][0 , 0 ].data .cpu ().float (). numpy ()#nsf
123+ infered_audio = self .net_g .infer (feats , p_len ,pitch ,pitchf ,sid )[0 ][0 , 0 ].data .cpu ().float ()#nsf
136124 torch .cuda .synchronize ()
137- return audio
125+ return infered_audio
138126
139127
140128class Config :
141129 def __init__ (self ) -> None :
130+ self .hubert_path :str = ''
142131 self .pth_path :str = ''
143132 self .index_path :str = ''
144133 self .npy_path :str = ''
@@ -151,6 +140,7 @@ def __init__(self) -> None:
151140 self .extra_time :float = 0.04
152141 self .I_noise_reduce = False
153142 self .O_noise_reduce = False
143+ self .index_rate = 0.3
154144
155145class GUI :
156146 def __init__ (self ) -> None :
@@ -180,8 +170,8 @@ def launcher(self):
180170 [
181171 sg .Frame (layout = [
182172 [sg .Text (i18n ("响应阈值" )),sg .Slider (range = (- 60 ,0 ),key = 'threhold' ,resolution = 1 ,orientation = 'h' ,default_value = - 30 )],
183- [sg .Text (i18n ("音调设置" )),sg .Slider (range = (- 24 ,24 ),key = 'pitch' ,resolution = 1 ,orientation = 'h' ,default_value = 12 )]
184-
173+ [sg .Text (i18n ("音调设置" )),sg .Slider (range = (- 24 ,24 ),key = 'pitch' ,resolution = 1 ,orientation = 'h' ,default_value = 12 )],
174+ [ sg . Text ( i18n ( 'Index Rate' )), sg . Slider ( range = ( 0.0 , 1.0 ), key = 'index_rate' , resolution = 0.01 , orientation = 'h' , default_value = 0.5 )]
185175 ],title = i18n ("常规设置" )),
186176 sg .Frame (layout = [
187177 [sg .Text (i18n ("采样长度" )),sg .Slider (range = (0.1 ,3.0 ),key = 'block_time' ,resolution = 0.1 ,orientation = 'h' ,default_value = 1.0 )],
@@ -204,9 +194,7 @@ def event_handler(self):
204194 exit ()
205195 if event == 'start_vc' and self .flag_vc == False :
206196 self .set_values (values )
207- print ('pth_path:' + self .config .pth_path )
208- print ('index_path:' + self .config .index_path )
209- print ('npy_path:' + self .config .npy_path )
197+ print (str (self .config .__dict__ ))
210198 print ('using_cuda:' + str (torch .cuda .is_available ()))
211199 self .start_vc ()
212200 if event == 'stop_vc' and self .flag_vc == True :
@@ -215,6 +203,7 @@ def event_handler(self):
215203
216204 def set_values (self ,values ):
217205 self .set_devices (values ["sg_input_device" ],values ['sg_output_device' ])
206+ self .config .hubert_path = values ['hubert_path' ]
218207 self .config .pth_path = values ['pth_path' ]
219208 self .config .index_path = values ['index_path' ]
220209 self .config .npy_path = values ['npy_path' ]
@@ -225,27 +214,25 @@ def set_values(self,values):
225214 self .config .extra_time = values ['extra_time' ]
226215 self .config .I_noise_reduce = values ['I_noise_reduce' ]
227216 self .config .O_noise_reduce = values ['O_noise_reduce' ]
217+ self .config .index_rate = values ['index_rate' ]
228218
229219 def start_vc (self ):
230220 torch .cuda .empty_cache ()
231221 self .flag_vc = True
232- self .RMS_threhold = math .e ** (float (self .config .threhold )/ 10 )
233222 self .block_frame = int (self .config .block_time * self .config .samplerate )
234223 self .crossfade_frame = int (self .config .crossfade_time * self .config .samplerate )
235224 self .sola_search_frame = int (0.012 * self .config .samplerate )
236225 self .delay_frame = int (0.02 * self .config .samplerate )#往前预留0.02s
237226 self .extra_frame = int (self .config .extra_time * self .config .samplerate )#往后预留0.04s
238227 self .rvc = None
239- self .rvc = RVC (self .config .pitch ,self .config .pth_path ,self .config .index_path ,self .config .npy_path )
228+ self .rvc = RVC (self .config .pitch ,self .config .hubert_path , self . config . pth_path ,self .config .index_path ,self .config .npy_path , self . config . index_rate )
240229 self .input_wav :np .ndarray = np .zeros (self .extra_frame + self .crossfade_frame + self .sola_search_frame + self .block_frame ,dtype = 'float32' )
241230 self .output_wav :torch .Tensor = torch .zeros (self .block_frame ,device = device ,dtype = torch .float32 )
242- #self.sola_buffer:np.ndarray=np.zeros(self.crossfade_frame,dtype='float32')
243231 self .sola_buffer :torch .Tensor = torch .zeros (self .crossfade_frame ,device = device ,dtype = torch .float32 )
244- #self.fade_in_window:np.ndarray = np.linspace(0, 1, self.crossfade_frame)
245232 self .fade_in_window :torch .Tensor = torch .linspace (0.0 ,1.0 ,steps = self .crossfade_frame ,device = device ,dtype = torch .float32 )
246233 self .fade_out_window :torch .Tensor = 1 - self .fade_in_window
247- self .resampler = tat .Resample (orig_freq = 40000 , new_freq = self .config .samplerate ,dtype = torch .float32 )
248- self .RMS = lambda y : torch . sqrt ( torch . mean ( torch .square ( y ))). item () #RMS calculator
234+ self .resampler1 = tat .Resample (orig_freq = self .config .samplerate , new_freq = 16000 ,dtype = torch .float32 )
235+ self .resampler2 = tat . Resample ( orig_freq = 40000 , new_freq = self . config . samplerate , dtype = torch .float32 )
249236 thread_vc = threading .Thread (target = self .soundinput )
250237 thread_vc .start ()
251238
@@ -284,7 +271,7 @@ def audio_callback(self,indata:np.ndarray,outdata:np.ndarray, frames, times, sta
284271 #infer
285272 print ('input_wav:' + str (self .input_wav .shape ))
286273 #print('infered_wav:'+str(infer_wav.shape))
287- infer_wav :torch .Tensor = self .resampler ( torch . from_numpy ( self .rvc .infer (self .input_wav , self .config . samplerate )))[- self .crossfade_frame - self .sola_search_frame - self .block_frame :].to (device )
274+ infer_wav :torch .Tensor = self .resampler2 ( self .rvc .infer (self .resampler1 ( torch . from_numpy ( self .input_wav ) )))[- self .crossfade_frame - self .sola_search_frame - self .block_frame :].to (device )
288275 print ('infer_wav:' + str (infer_wav .shape ))
289276
290277 # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC
0 commit comments