mirror of https://github.com/coqui-ai/TTS.git
spec_gain fft_size and stft_pad_mode parameters for audio_processor
This commit is contained in:
parent 70c83671e6
commit 2f3f43844e
train.py
@@ -473,8 +473,6 @@ def main(args): # pylint: disable=redefined-outer-name
 
     model = setup_model(num_chars, num_speakers, c)
 
-    print(" | > Num output units : {}".format(ap.num_freq), flush=True)
-
     params = set_weight_decay(model, c.wd)
     optimizer = RAdam(params, lr=c.lr, weight_decay=0)
     if c.stopnet and c.separate_stopnet:
@@ -542,7 +540,6 @@ def main(args): # pylint: disable=redefined-outer-name
         if c.bidirectional_decoder:
             model.decoder_backward.set_r(r)
         print("\n > Number of output frames:", model.decoder.r)
-
         train_avg_loss_dict, global_step = train(model, criterion, optimizer,
                                                  optimizer_st, scheduler, ap,
                                                  global_step, epoch)
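
The removed log line reported ap.num_freq; the same number of linear output units can still be derived from fft_size if a script wants to log it. A small hypothetical snippet, using the expression setup_model adopts later in this commit:

    # Hypothetical replacement for the removed log line; `c.audio` stands in as a plain dict here.
    c_audio = {'fft_size': 1024}
    num_output_units = int(c_audio['fft_size'] / 2 + 1)   # 513, the value num_freq used to hold
    print(" | > Num output units : {}".format(num_output_units), flush=True)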
@@ -17,7 +17,7 @@ class AudioProcessor(object):
                  hop_length=None,
                  win_length=None,
                  ref_level_db=None,
-                 num_freq=None,
+                 fft_size=1024,
                  power=None,
                  preemphasis=0.0,
                  signal_norm=None,
@@ -25,6 +25,8 @@ class AudioProcessor(object):
                  max_norm=None,
                  mel_fmin=None,
                  mel_fmax=None,
+                 spec_gain=20,
+                 stft_pad_mode='reflect',
                  clip_norm=True,
                  griffin_lim_iters=None,
                  do_trim_silence=False,
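
The constructor now takes fft_size in place of num_freq, plus spec_gain and stft_pad_mode with defaults matching the previous hard-coded behaviour. A hedged sketch of mapping an older config onto the new keyword arguments (values and the commented constructor call are illustrative; the import path is not shown in this diff):

    old_num_freq = 513
    audio_kwargs = {
        'fft_size': (old_num_freq - 1) * 2,   # 1024, formerly derived inside AudioProcessor
        'spec_gain': 20,                      # default keeps the old 20 * log10 conversion
        'stft_pad_mode': 'reflect',           # padding mode handed to librosa.stft
    }
    # ap = AudioProcessor(sample_rate=22050, num_mels=80, **audio_kwargs)  # assumed import path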
@@ -41,7 +43,7 @@ class AudioProcessor(object):
         self.frame_shift_ms = frame_shift_ms
         self.frame_length_ms = frame_length_ms
         self.ref_level_db = ref_level_db
-        self.num_freq = num_freq
+        self.fft_size = fft_size
         self.power = power
         self.preemphasis = preemphasis
         self.griffin_lim_iters = griffin_lim_iters
@@ -49,6 +51,8 @@ class AudioProcessor(object):
         self.symmetric_norm = symmetric_norm
         self.mel_fmin = mel_fmin or 0
         self.mel_fmax = mel_fmax
+        self.spec_gain = float(spec_gain)
+        self.stft_pad_mode = 'reflect'
         self.max_norm = 1.0 if max_norm is None else float(max_norm)
         self.clip_norm = clip_norm
         self.do_trim_silence = do_trim_silence
@@ -58,12 +62,11 @@ class AudioProcessor(object):
         # setup stft parameters
         if hop_length is None:
             # compute stft parameters from given time values
-            self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
+            self.hop_length, self.win_length = self._stft_parameters()
         else:
             # use stft parameters from config file
             self.hop_length = hop_length
             self.win_length = win_length
-            self.n_fft = (self.num_freq - 1) * 2
         assert min_level_db != 0.0, " [!] min_level_db is 0"
         members = vars(self)
         for key, value in members.items():
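
With this change the FFT size is no longer derived inside __init__; _stft_parameters() only returns hop and window lengths, while fft_size is taken from the argument in both branches. A standalone sketch of the same branch logic, using illustrative values:

    sample_rate = 22050
    fft_size = 1024
    hop_length, win_length = None, None            # as if not set in the config
    frame_shift_ms, frame_length_ms = 12.5, 50.0

    if hop_length is None:
        # mirrors _stft_parameters(): derive lengths from the time values
        factor = frame_length_ms / frame_shift_ms
        assert factor.is_integer(), " [!] frame_shift_ms should divide frame_length_ms"
        hop_length = int(frame_shift_ms / 1000.0 * sample_rate)   # 275
        win_length = int(hop_length * factor)                     # 1100
    # fft_size stays 1024 either way; the old (num_freq - 1) * 2 fallback is gone
    print(fft_size, hop_length, win_length)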
@@ -86,19 +89,18 @@ class AudioProcessor(object):
             assert self.mel_fmax <= self.sample_rate // 2
         return librosa.filters.mel(
             self.sample_rate,
-            self.n_fft,
+            self.fft_size,
             n_mels=self.num_mels,
             fmin=self.mel_fmin,
             fmax=self.mel_fmax)
 
     def _stft_parameters(self, ):
         """Compute necessary stft parameters with given time values"""
-        n_fft = (self.num_freq - 1) * 2
         factor = self.frame_length_ms / self.frame_shift_ms
         assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms"
         hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
         win_length = int(hop_length * factor)
-        return n_fft, hop_length, win_length
+        return hop_length, win_length
 
     ### normalization ###
     def _normalize(self, S):
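
The mel filterbank is now built from fft_size directly. A minimal standalone example (assumes librosa is installed; keyword arguments are used so it also works on newer librosa releases):

    import librosa

    mel_basis = librosa.filters.mel(sr=22050, n_fft=1024, n_mels=80, fmin=0, fmax=8000)
    print(mel_basis.shape)  # (80, 513) -> (num_mels, fft_size // 2 + 1)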
@@ -110,7 +112,7 @@ class AudioProcessor(object):
             if hasattr(self, 'mel_scaler'):
                 if S.shape[0] == self.num_mels:
                     return self.mel_scaler.transform(S.T).T
-                elif S.shape[0] == self.n_fft / 2:
+                elif S.shape[0] == self.fft_size / 2:
                     return self.linear_scaler.transform(S.T).T
                 else:
                     raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.')
@@ -139,7 +141,7 @@ class AudioProcessor(object):
             if hasattr(self, 'mel_scaler'):
                 if S_denorm.shape[0] == self.num_mels:
                     return self.mel_scaler.inverse_transform(S_denorm.T).T
-                elif S_denorm.shape[0] == self.n_fft / 2:
+                elif S_denorm.shape[0] == self.fft_size / 2:
                     return self.linear_scaler.inverse_transform(S_denorm.T).T
                 else:
                     raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.')
@@ -184,11 +186,11 @@ class AudioProcessor(object):
     ### DB and AMP conversion ###
     # pylint: disable=no-self-use
     def _amp_to_db(self, x):
-        return 20 * np.log10(np.maximum(1e-5, x))
+        return self.spec_gain * np.log10(np.maximum(1e-5, x))
 
     # pylint: disable=no-self-use
     def _db_to_amp(self, x):
-        return np.power(10.0, x * 0.05)
+        return np.power(10.0, x / self.spec_gain)
 
     ### Preemphasis ###
     def apply_preemphasis(self, x):
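
With the default spec_gain=20, _amp_to_db and _db_to_amp are numerically identical to the old hard-coded 20 * log10(x) and 10 ** (x * 0.05) pair; other gains simply rescale the dB range. A standalone numpy check:

    import numpy as np

    spec_gain = 20.0
    x = np.array([1e-6, 0.01, 0.5, 1.0])

    db = spec_gain * np.log10(np.maximum(1e-5, x))     # _amp_to_db
    amp = np.power(10.0, db / spec_gain)               # _db_to_amp
    old_amp = np.power(10.0, db * 0.05)                # previous hard-coded inverse

    print(np.allclose(amp, old_amp))                   # True for spec_gain == 20
    print(np.allclose(amp, np.maximum(1e-5, x)))       # round trip up to the 1e-5 floor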
@@ -254,10 +256,10 @@ class AudioProcessor(object):
     def _stft(self, y):
         return librosa.stft(
             y=y,
-            n_fft=self.n_fft,
+            n_fft=self.fft_size,
             hop_length=self.hop_length,
             win_length=self.win_length,
-            pad_mode='constant'
+            pad_mode=self.stft_pad_mode,
         )
 
     def _istft(self, y):
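
_stft now reads both the FFT size and the padding mode from the processor attributes instead of hard-coding n_fft and pad_mode='constant'. A standalone sketch with a dummy signal (assumes numpy and librosa are installed; hop and window values are illustrative):

    import numpy as np
    import librosa

    y = np.random.randn(22050).astype(np.float32)  # 1 s of dummy audio
    D = librosa.stft(y=y,
                     n_fft=1024,           # self.fft_size
                     hop_length=256,
                     win_length=1024,
                     pad_mode='reflect')   # self.stft_pad_mode (was 'constant')
    print(D.shape)  # (513, 1 + 22050 // 256) -> (fft_size // 2 + 1, num_frames)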
@@ -146,7 +146,7 @@ def setup_model(num_chars, num_speakers, c):
         model = MyModel(num_chars=num_chars,
                         num_speakers=num_speakers,
                         r=c.r,
-                        postnet_output_dim=c.audio['num_freq'],
+                        postnet_output_dim=int(c.audio['fft_size'] / 2 + 1),
                         decoder_output_dim=c.audio['num_mels'],
                         gst=c.use_gst,
                         memory_size=c.memory_size,
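
The postnet output dimension is the number of linear spectrogram bins, fft_size // 2 + 1, which for the default fft_size=1024 is 513, the value the old num_freq setting carried. A small arithmetic check with a stand-in config:

    c_audio = {'fft_size': 1024, 'num_mels': 80}            # stand-in for c.audio
    postnet_output_dim = int(c_audio['fft_size'] / 2 + 1)   # 513 linear spectrogram bins
    decoder_output_dim = c_audio['num_mels']                 # 80 mel bins
    print(postnet_output_dim, decoder_output_dim)            # 513 80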
@@ -252,7 +252,7 @@ def check_config(c):
 
     # audio processing parameters
     _check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056)
-    _check_argument('num_freq', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058)
+    _check_argument('fft_size', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058)
     _check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000)
     _check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000, alternative='win_length')
    _check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000, alternative='hop_length')
@@ -278,6 +278,7 @@ def check_config(c):
     _check_argument('clip_norm', c['audio'], restricted=True, val_type=bool)
     _check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000)
     _check_argument('mel_fmax', c['audio'], restricted=True, val_type=float, min_val=500.0)
+    _check_argument('spec_gain', c['audio'], restricted=True, val_type=float, min_val=1, max_val=100)
     _check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool)
     _check_argument('trim_db', c['audio'], restricted=True, val_type=int)
 
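
check_config now validates fft_size instead of num_freq and additionally requires a float spec_gain entry in the audio section. A hedged, partial example of an audio section that satisfies these particular checks (all other required keys are omitted here):

    audio_section = {
        'num_mels': 80,
        'fft_size': 1024,          # was 'num_freq'; must be an int in [128, 4058]
        'sample_rate': 22050,
        'frame_length_ms': 50.0,   # or set win_length directly
        'frame_shift_ms': 12.5,    # or set hop_length directly
        'spec_gain': 20.0,         # new float check, allowed range [1, 100]
        'mel_fmin': 0.0,
        'mel_fmax': 8000.0,
        'clip_norm': True,
        'do_trim_silence': True,
        'trim_db': 60,
    }
    # check_config({'audio': audio_section})  # remaining top-level sections omitted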