Merge remote-tracking branch 'TTS/dev' into dev

Author: thllwg
Date:   2020-07-13 09:22:32 +02:00
Commit: 9530a6bdaf

15 changed files with 239 additions and 69 deletions

View File

@@ -1,15 +1,20 @@
-numpy>=1.16.0
 torch>=1.5
-librosa>=0.5.1
-Unidecode>=0.4.20
-tensorboard
+tensorflow>=2.2
+numpy>=1.16.0
+scipy>=0.19.0
+numba==0.48
+librosa==0.7.2
+phonemizer>=2.2.0
+unidecode==0.4.20
+attrdict
 tensorboardX
 matplotlib
 Pillow
 flask
-scipy
 tqdm
-soundfile
-phonemizer
 inflect
 bokeh==1.4.0
+soundfile
+nose==1.3.7
+cardboardlint==1.3.0
+pylint==2.5.3

View File

@@ -1,18 +1,19 @@
+torch>=1.5
+tensorflow==2.3rc
 numpy>=1.16.0
+scipy>=0.19.0
 numba==0.48
-torch>=0.4.1
-tensorflow>=2.2
-librosa>=0.5.1
-Unidecode>=0.4.20
-tensorboard
+librosa==0.7.2
+phonemizer>=2.2.0
+unidecode==0.4.20
+attrdict
 tensorboardX
 matplotlib
 Pillow
 flask
-scipy
 tqdm
-soundfile
 inflect
-phonemizer
 bokeh==1.4.0
-nose
+soundfile
+nose==1.3.7
+cardboardlint==1.3.0

View File

@@ -15,12 +15,9 @@ def create_argparser():
     parser.add_argument('--tts_config', type=str, help='path to TTS config.json file')
     parser.add_argument('--tts_speakers', type=str, help='path to JSON file containing speaker ids, if speaker ids are used in the model')
     parser.add_argument('--wavernn_lib_path', type=str, default=None, help='path to WaveRNN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.')
-    parser.add_argument('--wavernn_file', type=str, default=None, help='path to WaveRNN checkpoint file.')
+    parser.add_argument('--wavernn_checkpoint', type=str, default=None, help='path to WaveRNN checkpoint file.')
     parser.add_argument('--wavernn_config', type=str, default=None, help='path to WaveRNN config file.')
     parser.add_argument('--is_wavernn_batched', type=convert_boolean, default=False, help='true to use batched WaveRNN.')
-    parser.add_argument('--pwgan_lib_path', type=str, default=None, help='path to ParallelWaveGAN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.')
-    parser.add_argument('--pwgan_file', type=str, default=None, help='path to ParallelWaveGAN checkpoint file.')
-    parser.add_argument('--pwgan_config', type=str, default=None, help='path to ParallelWaveGAN config file.')
     parser.add_argument('--vocoder_config', type=str, default=None, help='path to TTS.vocoder config file.')
     parser.add_argument('--vocoder_checkpoint', type=str, default=None, help='path to TTS.vocoder checkpoint file.')
     parser.add_argument('--port', type=int, default=5002, help='port to listen on.')

@@ -46,10 +43,6 @@ embedded_wavernn_folder = os.path.join(embedded_models_folder, 'wavernn')
 wavernn_checkpoint_file = os.path.join(embedded_wavernn_folder, 'checkpoint.pth.tar')
 wavernn_config_file = os.path.join(embedded_wavernn_folder, 'config.json')

-embedded_pwgan_folder = os.path.join(embedded_models_folder, 'pwgan')
-pwgan_checkpoint_file = os.path.join(embedded_pwgan_folder, 'checkpoint.pkl')
-pwgan_config_file = os.path.join(embedded_pwgan_folder, 'config.yml')
-
 args = create_argparser().parse_args()

 # If these were not specified in the CLI args, use default values with embedded model files

@@ -57,19 +50,16 @@ if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file):
     args.tts_checkpoint = tts_checkpoint_file
 if not args.tts_config and os.path.isfile(tts_config_file):
     args.tts_config = tts_config_file
-if not args.vocoder_checkpoint and os.path.isfile(tts_checkpoint_file):
-    args.tts_checkpoint = tts_checkpoint_file
-if not args.vocoder_config and os.path.isfile(tts_config_file):
-    args.tts_config = tts_config_file
-if not args.wavernn_file and os.path.isfile(wavernn_checkpoint_file):
-    args.wavernn_file = wavernn_checkpoint_file
+if not args.vocoder_checkpoint and os.path.isfile(vocoder_checkpoint_file):
+    args.vocoder_checkpoint = vocoder_checkpoint_file
+if not args.vocoder_config and os.path.isfile(vocoder_config_file):
+    args.vocoder_config = vocoder_config_file
+if not args.wavernn_checkpoint and os.path.isfile(wavernn_checkpoint_file):
+    args.wavernn_checkpoint = wavernn_checkpoint_file
 if not args.wavernn_config and os.path.isfile(wavernn_config_file):
     args.wavernn_config = wavernn_config_file
-if not args.pwgan_file and os.path.isfile(pwgan_checkpoint_file):
-    args.pwgan_file = pwgan_checkpoint_file
-if not args.pwgan_config and os.path.isfile(pwgan_config_file):
-    args.pwgan_config = pwgan_config_file

 synthesizer = Synthesizer(args)
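Besides dropping the pwgan flags, this hunk fixes a copy-paste bug: the old fallback tested the vocoder flags but then assigned the TTS checkpoint and config. A minimal sketch of the corrected pattern (the helper name is hypothetical, not in the commit):

```python
import os

def fallback_to_embedded(cli_value, embedded_path):
    # Keep the value given on the CLI; otherwise fall back to the embedded
    # model file, but only when that file actually exists on disk.
    if cli_value:
        return cli_value
    return embedded_path if os.path.isfile(embedded_path) else None

# e.g. args.vocoder_checkpoint = fallback_to_embedded(
#          args.vocoder_checkpoint, vocoder_checkpoint_file)
```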

View File

@@ -31,15 +31,16 @@ class Synthesizer(object):
         self.wavernn = None
         self.vocoder_model = None
         self.config = config
+        print(config)
         self.use_cuda = self.config.use_cuda
         if self.use_cuda:
             assert torch.cuda.is_available(), "CUDA is not availabe on this machine."
         self.load_tts(self.config.tts_checkpoint, self.config.tts_config,
                       self.config.use_cuda)
-        if self.config.vocoder_file:
+        if self.config.vocoder_checkpoint:
             self.load_vocoder(self.config.vocoder_checkpoint, self.config.vocoder_config, self.config.use_cuda)
         if self.config.wavernn_lib_path:
-            self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_file,
+            self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_checkpoint,
                               self.config.wavernn_config, self.config.use_cuda)

     def load_tts(self, tts_checkpoint, tts_config, use_cuda):

View File

@@ -69,6 +69,41 @@ if 'bdist_wheel' in unknown_args and args.checkpoint and args.model_config:
     shutil.copy(args.model_config, embedded_config_path)
     package_data.extend([embedded_checkpoint_path, embedded_config_path])

+
+def pip_install(package_name):
+    subprocess.call(
+        [sys.executable, '-m', 'pip', 'install', package_name]
+    )
+
+
+requirements = {
+    'install_requires':[
+        "torch>=1.5",
+        "numpy>=1.16.0",
+        "numba==0.48",
+        "scipy>=0.19.0",
+        "librosa==0.7.2",
+        "unidecode==0.4.20",
+        "attrdict",
+        "tensorboardX",
+        "matplotlib",
+        "Pillow",
+        "flask",
+        "tqdm",
+        "inflect",
+        "bokeh==1.4.0",
+        "soundfile",
+        "phonemizer>=2.2.0",
+        "nose==1.3.7",
+        "cardboardlint==1.3.0",
+        "pylint==2.5.3",
+    ],
+    'pip_install':[
+        'tensorflow>=2.2.0',
+    ]
+}
+
 setup(
     name='TTS',
     version=version,

@@ -95,24 +130,23 @@ setup(
         'build_py': build_py,
         'develop': develop,
     },
-    install_requires=[
-        "scipy>=0.19.0",
-        "torch>=1.5",
-        "numpy>=1.16.0",
-        "librosa==0.6.2",
-        "unidecode==0.4.20",
-        "attrdict",
-        "tensorboardX",
-        "matplotlib",
-        "Pillow",
-        "flask",
-        "tqdm",
-        "inflect",
-        "bokeh==1.4.0",
-        "soundfile",
-        "phonemizer @ https://github.com/bootphon/phonemizer/tarball/master",
-    ],
-    dependency_links=[
-        "http://github.com/bootphon/phonemizer/tarball/master#egg=phonemizer-1.0.1"
-    ]
+    install_requires=requirements['install_requires'],
+    python_requires='>=3.6.0',
+    classifiers=[
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+        'Development Status :: 3 - Alpha',
+        "Intended Audience :: Science/Research :: Developers",
+        "Operating System :: POSIX :: Linux",
+        'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)',
+        "Topic :: Software Development :: Libraries :: Python Modules :: Speech :: Sound/Audio :: Multimedia :: Artificial Intelligence",
+    ]
 )
+
+# for some reason having tensorflow in 'install_requires'
+# breaks some of the dependencies.
+for module in requirements['pip_install']:
+    pip_install(module)
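The `pip_install` helper runs pip through `sys.executable`, which guarantees the package lands in the same interpreter that is executing `setup.py`, including inside a virtualenv. The same pattern in isolation:

```python
import subprocess
import sys

# Install into the interpreter running this script, not whichever
# `pip` binary happens to be first on PATH.
subprocess.call([sys.executable, '-m', 'pip', 'install', 'tensorflow>=2.2.0'])
```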

Binary file not shown.

View File

@ -6,7 +6,7 @@
"wavernn_file": null, // wavernn checkpoint file name "wavernn_file": null, // wavernn checkpoint file name
"wavernn_config": null, // wavernn config file "wavernn_config": null, // wavernn config file
"vocoder_config":null, "vocoder_config":null,
"vocoder_file": null, "vocoder_checkpoint": null,
"is_wavernn_batched":true, "is_wavernn_batched":true,
"port": 5002, "port": 5002,
"use_cuda": false, "use_cuda": false,

View File

@@ -14,6 +14,9 @@ rm -f dist/*.whl
 python setup.py --quiet bdist_wheel --checkpoint tests/outputs/checkpoint_10.pth.tar --model_config tests/outputs/dummy_model_config.json
 pip install --quiet dist/TTS*.whl

+# this is related to https://github.com/librosa/librosa/issues/1160
+pip install numba==0.48
+
 python -m TTS.server.server &
 SERVER_PID=$!
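As I read the linked librosa issue, numba releases after 0.48 broke librosa 0.7.x imports, so the freshly installed wheel needs the older numba pinned back before the server starts. A quick sanity check one could run after the install (a sketch, not part of this commit):

```python
# Verify the pin took effect and that librosa still imports under it.
import numba
assert numba.__version__.startswith('0.48'), numba.__version__

import librosa  # should import cleanly with numba 0.48
print(librosa.__version__)
```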

tf/__init__.py (new empty file)

View File

tf/tests/__init__.py (new file, 1 line added)

View File

@@ -0,0 +1 @@
+
View File

View File

@@ -0,0 +1,134 @@
import os
import torch
import unittest
import numpy as np
import tensorflow as tf

tf.get_logger().setLevel('INFO')

from TTS.utils.io import load_config
from TTS.tf.models.tacotron2 import Tacotron2
from TTS.tf.utils.tflite import convert_tacotron2_to_tflite, load_tflite_model

#pylint: disable=unused-variable

torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

file_path = os.path.dirname(os.path.realpath(__file__)).replace('/tf/', '/')
c = load_config(os.path.join(file_path, 'test_config.json'))


class TacotronTFTrainTest(unittest.TestCase):

    @staticmethod
    def generate_dummy_inputs():
        chars_seq = torch.randint(0, 24, (8, 128)).long().to(device)
        chars_seq_lengths = torch.randint(100, 128, (8, )).long().to(device)
        chars_seq_lengths = torch.sort(chars_seq_lengths, descending=True)[0]
        mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
        mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
        stop_targets = torch.zeros(8, 30, 1).float().to(device)
        speaker_ids = torch.randint(0, 5, (8, )).long().to(device)
        chars_seq = tf.convert_to_tensor(chars_seq.cpu().numpy())
        chars_seq_lengths = tf.convert_to_tensor(chars_seq_lengths.cpu().numpy())
        mel_spec = tf.convert_to_tensor(mel_spec.cpu().numpy())
        return chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
            stop_targets, speaker_ids

    def test_train_step(self):
        ''' test forward pass '''
        chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
            stop_targets, speaker_ids = self.generate_dummy_inputs()

        for idx in mel_lengths:
            stop_targets[:, int(idx.item()):, 0] = 1.0

        stop_targets = stop_targets.view(chars_seq.shape[0],
                                         stop_targets.size(1) // c.r, -1)
        stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5)
        # training pass
        output = model(chars_seq, chars_seq_lengths, mel_spec, training=True)

        # check model output shapes
        assert np.all(output[0].shape == mel_spec.shape)
        assert np.all(output[1].shape == mel_spec.shape)
        assert output[2].shape[2] == chars_seq.shape[1]
        assert output[2].shape[1] == (mel_spec.shape[1] // model.decoder.r)
        assert output[3].shape[1] == (mel_spec.shape[1] // model.decoder.r)

        # inference pass
        output = model(chars_seq, training=False)

    def test_forward_attention(self,):
        chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
            stop_targets, speaker_ids = self.generate_dummy_inputs()

        for idx in mel_lengths:
            stop_targets[:, int(idx.item()):, 0] = 1.0

        stop_targets = stop_targets.view(chars_seq.shape[0],
                                         stop_targets.size(1) // c.r, -1)
        stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, forward_attn=True)
        # training pass
        output = model(chars_seq, chars_seq_lengths, mel_spec, training=True)

        # check model output shapes
        assert np.all(output[0].shape == mel_spec.shape)
        assert np.all(output[1].shape == mel_spec.shape)
        assert output[2].shape[2] == chars_seq.shape[1]
        assert output[2].shape[1] == (mel_spec.shape[1] // model.decoder.r)
        assert output[3].shape[1] == (mel_spec.shape[1] // model.decoder.r)

        # inference pass
        output = model(chars_seq, training=False)

    def test_tflite_conversion(self, ):  #pylint:disable=no-self-use
        model = Tacotron2(num_chars=24,
                          num_speakers=0,
                          r=3,
                          postnet_output_dim=80,
                          decoder_output_dim=80,
                          attn_type='original',
                          attn_win=False,
                          attn_norm='sigmoid',
                          prenet_type='original',
                          prenet_dropout=True,
                          forward_attn=False,
                          trans_agent=False,
                          forward_attn_mask=False,
                          location_attn=True,
                          attn_K=0,
                          separate_stopnet=True,
                          bidirectional_decoder=False,
                          enable_tflite=True)
        model.build_inference()
        convert_tacotron2_to_tflite(model, output_path='test_tacotron2.tflite', experimental_converter=True)
        # init tflite model
        tflite_model = load_tflite_model('test_tacotron2.tflite')
        # fake input
        inputs = tf.random.uniform([1, 4], maxval=10, dtype=tf.int32)  #pylint:disable=unexpected-keyword-arg
        # run inference
        # get input and output details
        input_details = tflite_model.get_input_details()
        output_details = tflite_model.get_output_details()
        # reshape input tensor for the new input shape
        tflite_model.resize_tensor_input(input_details[0]['index'], inputs.shape)  #pylint:disable=unexpected-keyword-arg
        tflite_model.allocate_tensors()
        detail = input_details[0]
        input_shape = detail['shape']
        tflite_model.set_tensor(detail['index'], inputs)
        # run the tflite_model
        tflite_model.invoke()
        # collect outputs
        decoder_output = tflite_model.get_tensor(output_details[0]['index'])
        postnet_output = tflite_model.get_tensor(output_details[1]['index'])
        # remove tflite binary
        os.remove('test_tacotron2.tflite')
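Assuming `load_tflite_model` wraps `tf.lite.Interpreter` (the interpreter methods used above are the stock TF Lite API), the conversion test boils down to this inference sequence:

```python
import tensorflow as tf

# Load the converted model and run one inference with a dynamic input length.
interpreter = tf.lite.Interpreter(model_path='test_tacotron2.tflite')
inputs = tf.random.uniform([1, 4], maxval=10, dtype=tf.int32)

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Character sequences vary in length, so resize the input tensor first.
interpreter.resize_tensor_input(input_details[0]['index'], inputs.shape)
interpreter.allocate_tensors()
interpreter.set_tensor(input_details[0]['index'], inputs)
interpreter.invoke()

decoder_output = interpreter.get_tensor(output_details[0]['index'])
postnet_output = interpreter.get_tensor(output_details[1]['index'])
```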

View File

@@ -16,23 +16,23 @@ You can see here an example (Soon)[Colab Notebook]() training MelGAN with LJSpee

 In order to train a new model, you need to collect all your wav files under a common parent folder and give this path to the `data_path` field in `config.json`.

-You need to define other relevant parameters in your `config.json` and then start training with the following command from the Mozilla TTS root path.
+You need to define other relevant parameters in your `config.json` and then start training with the following command from the Mozilla TTS root path, where '0' is the id of the GPU you wish to use.

-```CUDA_VISIBLE_DEVICES='1' python vocoder/train.py --config_path path/to/config.json```
+```CUDA_VISIBLE_DEVICES='0' python vocoder/train.py --config_path path/to/config.json```

 Example config files can be found under the `vocoder/configs/` folder.

 You can continue a previous training run with the following command.

-```CUDA_VISIBLE_DEVICES='1' python vocoder/train.py --continue_path path/to/your/model/folder```
+```CUDA_VISIBLE_DEVICES='0' python vocoder/train.py --continue_path path/to/your/model/folder```

 You can fine-tune a pre-trained model with the following command.

-```CUDA_VISIBLE_DEVICES='1' python vocoder/train.py --restore_path path/to/your/model.pth.tar```
+```CUDA_VISIBLE_DEVICES='0' python vocoder/train.py --restore_path path/to/your/model.pth.tar```

 Restoring a model starts a new training run in a different output folder; it only restores model weights from the given checkpoint file. Continuing a training run, by contrast, picks up from the same conditions the previous run left off at.

 You can also follow your training runs on Tensorboard as you do with our TTS models.

 ## Acknowledgement
 Thanks to @kan-bayashi for his [repository](https://github.com/kan-bayashi/ParallelWaveGAN), which was the starting point of our work.

View File

@@ -87,6 +87,11 @@ class GANDataset(Dataset):
                 audio, mel = self.cache[idx]
             else:
                 audio = self.ap.load_wav(wavpath)
+
+                if len(audio) < self.seq_len + self.pad_short:
+                    audio = np.pad(audio, (0, self.seq_len + self.pad_short - len(audio)), \
+                                   mode='constant', constant_values=0.0)
+
                 mel = self.ap.melspectrogram(audio)
         else:

@@ -99,10 +104,6 @@ class GANDataset(Dataset):
             audio = self.ap.load_wav(wavpath)
             mel = np.load(feat_path)

-            if len(audio) < self.seq_len + self.pad_short:
-                audio = np.pad(audio, (0, self.seq_len + self.pad_short - len(audio)), \
-                        mode='constant', constant_values=0.0)
-
         # correct the audio length wrt padding applied in stft
         audio = np.pad(audio, (0, self.hop_len), mode="edge")
         audio = audio[:mel.shape[-1] * self.hop_len]
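Taken together, the two hunks move the short-clip padding into the branch that computes the mel from the raw audio, so the spectrogram is taken over the already-padded signal; in the branch that loads precomputed features, padding the audio afterwards would only desynchronize it from the stored mel. A standalone sketch of the padding step (the `seq_len` and `pad_short` values here are illustrative, not from the config):

```python
import numpy as np

seq_len, pad_short = 16384, 2000            # illustrative values
audio = np.zeros(12000, dtype=np.float32)   # a clip shorter than seq_len + pad_short

# Pad *before* computing the mel, so audio and mel lengths stay consistent.
if len(audio) < seq_len + pad_short:
    audio = np.pad(audio, (0, seq_len + pad_short - len(audio)),
                   mode='constant', constant_values=0.0)

assert len(audio) == seq_len + pad_short    # now long enough to slice a training segment
```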

View File

@@ -59,9 +59,9 @@ def gan_dataset_case(batch_size, seq_len, hop_len, conv_pad, return_segments, us
                 audio = wav1[idx].squeeze()
                 feat = feat1[idx]
                 mel = ap.melspectrogram(audio)
-                # the first 2 and the last frame is skipped due to the padding
-                # applied in spec. computation.
-                assert (feat - mel[:, :feat1.shape[-1]])[:, 2:-1].sum() == 0, f' [!] {(feat - mel[:, :feat1.shape[-1]])[:, 2:-1].sum()}'
+                # the first 2 and the last 2 frames are skipped due to the padding
+                # differences in stft
+                assert (feat - mel[:, :feat1.shape[-1]])[:, 2:-2].sum() <= 0, f' [!] {(feat - mel[:, :feat1.shape[-1]])[:, 2:-2].sum()}'

                 count_iter += 1
                 # if count_iter == max_iter: