mirror of https://github.com/coqui-ai/TTS.git
Add spec_gain, fft_size and stft_pad_mode parameters for audio_processor
This commit is contained in:
parent
70c83671e6
commit
2f3f43844e
3
train.py
3
train.py
|
@ -473,8 +473,6 @@ def main(args): # pylint: disable=redefined-outer-name
|
||||||
|
|
||||||
model = setup_model(num_chars, num_speakers, c)
|
model = setup_model(num_chars, num_speakers, c)
|
||||||
|
|
||||||
print(" | > Num output units : {}".format(ap.num_freq), flush=True)
|
|
||||||
|
|
||||||
params = set_weight_decay(model, c.wd)
|
params = set_weight_decay(model, c.wd)
|
||||||
optimizer = RAdam(params, lr=c.lr, weight_decay=0)
|
optimizer = RAdam(params, lr=c.lr, weight_decay=0)
|
||||||
if c.stopnet and c.separate_stopnet:
|
if c.stopnet and c.separate_stopnet:
|
||||||
|
@ -542,7 +540,6 @@ def main(args): # pylint: disable=redefined-outer-name
|
||||||
if c.bidirectional_decoder:
|
if c.bidirectional_decoder:
|
||||||
model.decoder_backward.set_r(r)
|
model.decoder_backward.set_r(r)
|
||||||
print("\n > Number of output frames:", model.decoder.r)
|
print("\n > Number of output frames:", model.decoder.r)
|
||||||
|
|
||||||
train_avg_loss_dict, global_step = train(model, criterion, optimizer,
|
train_avg_loss_dict, global_step = train(model, criterion, optimizer,
|
||||||
optimizer_st, scheduler, ap,
|
optimizer_st, scheduler, ap,
|
||||||
global_step, epoch)
|
global_step, epoch)
|
||||||
|
|
|
@ -17,7 +17,7 @@ class AudioProcessor(object):
|
||||||
hop_length=None,
|
hop_length=None,
|
||||||
win_length=None,
|
win_length=None,
|
||||||
ref_level_db=None,
|
ref_level_db=None,
|
||||||
num_freq=None,
|
fft_size=1024,
|
||||||
power=None,
|
power=None,
|
||||||
preemphasis=0.0,
|
preemphasis=0.0,
|
||||||
signal_norm=None,
|
signal_norm=None,
|
||||||
|
@ -25,6 +25,8 @@ class AudioProcessor(object):
|
||||||
max_norm=None,
|
max_norm=None,
|
||||||
mel_fmin=None,
|
mel_fmin=None,
|
||||||
mel_fmax=None,
|
mel_fmax=None,
|
||||||
|
spec_gain=20,
|
||||||
|
stft_pad_mode='reflect',
|
||||||
clip_norm=True,
|
clip_norm=True,
|
||||||
griffin_lim_iters=None,
|
griffin_lim_iters=None,
|
||||||
do_trim_silence=False,
|
do_trim_silence=False,
|
||||||
|
@ -41,7 +43,7 @@ class AudioProcessor(object):
|
||||||
self.frame_shift_ms = frame_shift_ms
|
self.frame_shift_ms = frame_shift_ms
|
||||||
self.frame_length_ms = frame_length_ms
|
self.frame_length_ms = frame_length_ms
|
||||||
self.ref_level_db = ref_level_db
|
self.ref_level_db = ref_level_db
|
||||||
self.num_freq = num_freq
|
self.fft_size = fft_size
|
||||||
self.power = power
|
self.power = power
|
||||||
self.preemphasis = preemphasis
|
self.preemphasis = preemphasis
|
||||||
self.griffin_lim_iters = griffin_lim_iters
|
self.griffin_lim_iters = griffin_lim_iters
|
||||||
|
@ -49,6 +51,8 @@ class AudioProcessor(object):
|
||||||
self.symmetric_norm = symmetric_norm
|
self.symmetric_norm = symmetric_norm
|
||||||
self.mel_fmin = mel_fmin or 0
|
self.mel_fmin = mel_fmin or 0
|
||||||
self.mel_fmax = mel_fmax
|
self.mel_fmax = mel_fmax
|
||||||
|
self.spec_gain = float(spec_gain)
|
||||||
|
self.stft_pad_mode = 'reflect'
|
||||||
self.max_norm = 1.0 if max_norm is None else float(max_norm)
|
self.max_norm = 1.0 if max_norm is None else float(max_norm)
|
||||||
self.clip_norm = clip_norm
|
self.clip_norm = clip_norm
|
||||||
self.do_trim_silence = do_trim_silence
|
self.do_trim_silence = do_trim_silence
|
||||||
|
@ -58,12 +62,11 @@ class AudioProcessor(object):
|
||||||
# setup stft parameters
|
# setup stft parameters
|
||||||
if hop_length is None:
|
if hop_length is None:
|
||||||
# compute stft parameters from given time values
|
# compute stft parameters from given time values
|
||||||
self.n_fft, self.hop_length, self.win_length = self._stft_parameters()
|
self.hop_length, self.win_length = self._stft_parameters()
|
||||||
else:
|
else:
|
||||||
# use stft parameters from config file
|
# use stft parameters from config file
|
||||||
self.hop_length = hop_length
|
self.hop_length = hop_length
|
||||||
self.win_length = win_length
|
self.win_length = win_length
|
||||||
self.n_fft = (self.num_freq - 1) * 2
|
|
||||||
assert min_level_db != 0.0, " [!] min_level_db is 0"
|
assert min_level_db != 0.0, " [!] min_level_db is 0"
|
||||||
members = vars(self)
|
members = vars(self)
|
||||||
for key, value in members.items():
|
for key, value in members.items():
|
||||||
|
@ -86,19 +89,18 @@ class AudioProcessor(object):
|
||||||
assert self.mel_fmax <= self.sample_rate // 2
|
assert self.mel_fmax <= self.sample_rate // 2
|
||||||
return librosa.filters.mel(
|
return librosa.filters.mel(
|
||||||
self.sample_rate,
|
self.sample_rate,
|
||||||
self.n_fft,
|
self.fft_size,
|
||||||
n_mels=self.num_mels,
|
n_mels=self.num_mels,
|
||||||
fmin=self.mel_fmin,
|
fmin=self.mel_fmin,
|
||||||
fmax=self.mel_fmax)
|
fmax=self.mel_fmax)
|
||||||
|
|
||||||
def _stft_parameters(self, ):
|
def _stft_parameters(self, ):
|
||||||
"""Compute necessary stft parameters with given time values"""
|
"""Compute necessary stft parameters with given time values"""
|
||||||
n_fft = (self.num_freq - 1) * 2
|
|
||||||
factor = self.frame_length_ms / self.frame_shift_ms
|
factor = self.frame_length_ms / self.frame_shift_ms
|
||||||
assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms"
|
assert (factor).is_integer(), " [!] frame_shift_ms should divide frame_length_ms"
|
||||||
hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
|
hop_length = int(self.frame_shift_ms / 1000.0 * self.sample_rate)
|
||||||
win_length = int(hop_length * factor)
|
win_length = int(hop_length * factor)
|
||||||
return n_fft, hop_length, win_length
|
return hop_length, win_length
|
||||||
|
|
||||||
### normalization ###
|
### normalization ###
|
||||||
def _normalize(self, S):
|
def _normalize(self, S):
|
||||||
|
@ -110,7 +112,7 @@ class AudioProcessor(object):
|
||||||
if hasattr(self, 'mel_scaler'):
|
if hasattr(self, 'mel_scaler'):
|
||||||
if S.shape[0] == self.num_mels:
|
if S.shape[0] == self.num_mels:
|
||||||
return self.mel_scaler.transform(S.T).T
|
return self.mel_scaler.transform(S.T).T
|
||||||
elif S.shape[0] == self.n_fft / 2:
|
elif S.shape[0] == self.fft_size / 2:
|
||||||
return self.linear_scaler.transform(S.T).T
|
return self.linear_scaler.transform(S.T).T
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.')
|
raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.')
|
||||||
|
@ -139,7 +141,7 @@ class AudioProcessor(object):
|
||||||
if hasattr(self, 'mel_scaler'):
|
if hasattr(self, 'mel_scaler'):
|
||||||
if S_denorm.shape[0] == self.num_mels:
|
if S_denorm.shape[0] == self.num_mels:
|
||||||
return self.mel_scaler.inverse_transform(S_denorm.T).T
|
return self.mel_scaler.inverse_transform(S_denorm.T).T
|
||||||
elif S_denorm.shape[0] == self.n_fft / 2:
|
elif S_denorm.shape[0] == self.fft_size / 2:
|
||||||
return self.linear_scaler.inverse_transform(S_denorm.T).T
|
return self.linear_scaler.inverse_transform(S_denorm.T).T
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.')
|
raise RuntimeError(' [!] Mean-Var stats does not match the given feature dimensions.')
|
||||||
|
@ -184,11 +186,11 @@ class AudioProcessor(object):
|
||||||
### DB and AMP conversion ###
|
### DB and AMP conversion ###
|
||||||
# pylint: disable=no-self-use
|
# pylint: disable=no-self-use
|
||||||
def _amp_to_db(self, x):
|
def _amp_to_db(self, x):
|
||||||
return 20 * np.log10(np.maximum(1e-5, x))
|
return self.spec_gain * np.log10(np.maximum(1e-5, x))
|
||||||
|
|
||||||
# pylint: disable=no-self-use
|
# pylint: disable=no-self-use
|
||||||
def _db_to_amp(self, x):
|
def _db_to_amp(self, x):
|
||||||
return np.power(10.0, x * 0.05)
|
return np.power(10.0, x / self.spec_gain)
|
||||||
|
|
||||||
### Preemphasis ###
|
### Preemphasis ###
|
||||||
def apply_preemphasis(self, x):
|
def apply_preemphasis(self, x):
|
||||||
|
@ -254,10 +256,10 @@ class AudioProcessor(object):
|
||||||
def _stft(self, y):
|
def _stft(self, y):
|
||||||
return librosa.stft(
|
return librosa.stft(
|
||||||
y=y,
|
y=y,
|
||||||
n_fft=self.n_fft,
|
n_fft=self.fft_size,
|
||||||
hop_length=self.hop_length,
|
hop_length=self.hop_length,
|
||||||
win_length=self.win_length,
|
win_length=self.win_length,
|
||||||
pad_mode='constant'
|
pad_mode=self.stft_pad_mode,
|
||||||
)
|
)
|
||||||
|
|
||||||
def _istft(self, y):
|
def _istft(self, y):
|
||||||
|
|
|
@ -146,7 +146,7 @@ def setup_model(num_chars, num_speakers, c):
|
||||||
model = MyModel(num_chars=num_chars,
|
model = MyModel(num_chars=num_chars,
|
||||||
num_speakers=num_speakers,
|
num_speakers=num_speakers,
|
||||||
r=c.r,
|
r=c.r,
|
||||||
postnet_output_dim=c.audio['num_freq'],
|
postnet_output_dim=int(c.audio['fft_size'] / 2 + 1),
|
||||||
decoder_output_dim=c.audio['num_mels'],
|
decoder_output_dim=c.audio['num_mels'],
|
||||||
gst=c.use_gst,
|
gst=c.use_gst,
|
||||||
memory_size=c.memory_size,
|
memory_size=c.memory_size,
|
||||||
|
@ -252,7 +252,7 @@ def check_config(c):
|
||||||
|
|
||||||
# audio processing parameters
|
# audio processing parameters
|
||||||
_check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056)
|
_check_argument('num_mels', c['audio'], restricted=True, val_type=int, min_val=10, max_val=2056)
|
||||||
_check_argument('num_freq', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058)
|
_check_argument('fft_size', c['audio'], restricted=True, val_type=int, min_val=128, max_val=4058)
|
||||||
_check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000)
|
_check_argument('sample_rate', c['audio'], restricted=True, val_type=int, min_val=512, max_val=100000)
|
||||||
_check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000, alternative='win_length')
|
_check_argument('frame_length_ms', c['audio'], restricted=True, val_type=float, min_val=10, max_val=1000, alternative='win_length')
|
||||||
_check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000, alternative='hop_length')
|
_check_argument('frame_shift_ms', c['audio'], restricted=True, val_type=float, min_val=1, max_val=1000, alternative='hop_length')
|
||||||
|
@ -278,6 +278,7 @@ def check_config(c):
|
||||||
_check_argument('clip_norm', c['audio'], restricted=True, val_type=bool)
|
_check_argument('clip_norm', c['audio'], restricted=True, val_type=bool)
|
||||||
_check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000)
|
_check_argument('mel_fmin', c['audio'], restricted=True, val_type=float, min_val=0.0, max_val=1000)
|
||||||
_check_argument('mel_fmax', c['audio'], restricted=True, val_type=float, min_val=500.0)
|
_check_argument('mel_fmax', c['audio'], restricted=True, val_type=float, min_val=500.0)
|
||||||
|
_check_argument('spec_gain', c['audio'], restricted=True, val_type=float, min_val=1, max_val=100)
|
||||||
_check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool)
|
_check_argument('do_trim_silence', c['audio'], restricted=True, val_type=bool)
|
||||||
_check_argument('trim_db', c['audio'], restricted=True, val_type=int)
|
_check_argument('trim_db', c['audio'], restricted=True, val_type=int)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue