mirror of https://github.com/coqui-ai/TTS.git
commit 95b6a16d65
@@ -1,15 +1,20 @@
-numpy>=1.16.0
 torch>=1.5
-librosa>=0.5.1
-Unidecode>=0.4.20
-tensorboard
+tensorflow>=2.2
+numpy>=1.16.0
+scipy>=0.19.0
+numba==0.48
+librosa==0.7.2
+phonemizer>=2.2.0
+unidecode==0.4.20
+attrdict
 tensorboardX
 matplotlib
 Pillow
 flask
-scipy
 tqdm
-soundfile
-phonemizer
-bokeh==1.4.0
 inflect
+bokeh==1.4.0
+soundfile
+nose==1.3.7
+cardboardlint==1.3.0
+pylint==2.5.3
@@ -1,18 +1,19 @@
+torch>=1.5
+tensorflow==2.3rc
 numpy>=1.16.0
+scipy>=0.19.0
 numba==0.48
-torch>=0.4.1
-tensorflow>=2.2
-librosa>=0.5.1
-Unidecode>=0.4.20
-tensorboard
+librosa==0.7.2
+phonemizer>=2.2.0
+unidecode==0.4.20
+attrdict
 tensorboardX
 matplotlib
 Pillow
 flask
-scipy
 tqdm
-soundfile
 inflect
-phonemizer
 bokeh==1.4.0
-nose
+soundfile
+nose==1.3.7
+cardboardlint==1.3.0
@@ -15,12 +15,9 @@ def create_argparser():
     parser.add_argument('--tts_config', type=str, help='path to TTS config.json file')
     parser.add_argument('--tts_speakers', type=str, help='path to JSON file containing speaker ids, if speaker ids are used in the model')
     parser.add_argument('--wavernn_lib_path', type=str, default=None, help='path to WaveRNN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.')
-    parser.add_argument('--wavernn_file', type=str, default=None, help='path to WaveRNN checkpoint file.')
+    parser.add_argument('--wavernn_checkpoint', type=str, default=None, help='path to WaveRNN checkpoint file.')
     parser.add_argument('--wavernn_config', type=str, default=None, help='path to WaveRNN config file.')
     parser.add_argument('--is_wavernn_batched', type=convert_boolean, default=False, help='true to use batched WaveRNN.')
-    parser.add_argument('--pwgan_lib_path', type=str, default=None, help='path to ParallelWaveGAN project folder to be imported. If this is not passed, model uses Griffin-Lim for synthesis.')
-    parser.add_argument('--pwgan_file', type=str, default=None, help='path to ParallelWaveGAN checkpoint file.')
-    parser.add_argument('--pwgan_config', type=str, default=None, help='path to ParallelWaveGAN config file.')
+    parser.add_argument('--vocoder_config', type=str, default=None, help='path to TTS.vocoder config file.')
+    parser.add_argument('--vocoder_checkpoint', type=str, default=None, help='path to TTS.vocoder checkpoint file.')
    parser.add_argument('--port', type=int, default=5002, help='port to listen on.')
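For context, a minimal usage sketch (not part of the diff) of the reworked options; all paths are hypothetical:

    # The pwgan-specific flags are gone; the generic --vocoder_* flags select
    # any TTS.vocoder model, and --wavernn_file is now --wavernn_checkpoint.
    args = create_argparser().parse_args([
        '--tts_config', 'tts_model/config.json',           # hypothetical path
        '--vocoder_checkpoint', 'vocoder/checkpoint.pkl',  # hypothetical path
        '--vocoder_config', 'vocoder/config.json',         # hypothetical path
    ])
    print(args.vocoder_checkpoint)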
@@ -46,10 +43,6 @@ embedded_wavernn_folder = os.path.join(embedded_models_folder, 'wavernn')
 wavernn_checkpoint_file = os.path.join(embedded_wavernn_folder, 'checkpoint.pth.tar')
 wavernn_config_file = os.path.join(embedded_wavernn_folder, 'config.json')

-embedded_pwgan_folder = os.path.join(embedded_models_folder, 'pwgan')
-pwgan_checkpoint_file = os.path.join(embedded_pwgan_folder, 'checkpoint.pkl')
-pwgan_config_file = os.path.join(embedded_pwgan_folder, 'config.yml')
-
 args = create_argparser().parse_args()

 # If these were not specified in the CLI args, use default values with embedded model files
@@ -57,19 +50,16 @@ if not args.tts_checkpoint and os.path.isfile(tts_checkpoint_file):
     args.tts_checkpoint = tts_checkpoint_file
 if not args.tts_config and os.path.isfile(tts_config_file):
     args.tts_config = tts_config_file

-if not args.wavernn_file and os.path.isfile(wavernn_checkpoint_file):
-    args.wavernn_file = wavernn_checkpoint_file
+if not args.vocoder_checkpoint and os.path.isfile(vocoder_checkpoint_file):
+    args.vocoder_checkpoint = vocoder_checkpoint_file
+if not args.vocoder_config and os.path.isfile(vocoder_config_file):
+    args.vocoder_config = vocoder_config_file
+
+if not args.wavernn_checkpoint and os.path.isfile(wavernn_checkpoint_file):
+    args.wavernn_checkpoint = wavernn_checkpoint_file
 if not args.wavernn_config and os.path.isfile(wavernn_config_file):
     args.wavernn_config = wavernn_config_file
-if not args.pwgan_file and os.path.isfile(pwgan_checkpoint_file):
-    args.pwgan_file = pwgan_checkpoint_file
-if not args.pwgan_config and os.path.isfile(pwgan_config_file):
-    args.pwgan_config = pwgan_config_file

 synthesizer = Synthesizer(args)
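The block above applies the same rule once per model asset; a hedged helper-style restatement (not in the commit), for clarity only:

    import os

    def default_to_embedded(cli_value, embedded_path):
        # Prefer the path given on the CLI; otherwise fall back to the model
        # file embedded in the package, if one exists.
        if not cli_value and os.path.isfile(embedded_path):
            return embedded_path
        return cli_value

    # e.g. args.vocoder_checkpoint = default_to_embedded(args.vocoder_checkpoint,
    #                                                    vocoder_checkpoint_file)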
@@ -31,15 +31,16 @@ class Synthesizer(object):
         self.wavernn = None
+        self.vocoder_model = None
         self.config = config
         print(config)
         self.use_cuda = self.config.use_cuda
         if self.use_cuda:
             assert torch.cuda.is_available(), "CUDA is not available on this machine."
         self.load_tts(self.config.tts_checkpoint, self.config.tts_config,
                       self.config.use_cuda)
-        if self.config.vocoder_file:
+        if self.config.vocoder_checkpoint:
             self.load_vocoder(self.config.vocoder_checkpoint, self.config.vocoder_config, self.config.use_cuda)
         if self.config.wavernn_lib_path:
-            self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_file,
+            self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_checkpoint,
                               self.config.wavernn_config, self.config.use_cuda)

     def load_tts(self, tts_checkpoint, tts_config, use_cuda):
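A hedged construction sketch (not from the diff) showing the renamed config keys the constructor now reads; the AttrDict wrapper and all paths are assumptions:

    from attrdict import AttrDict
    from TTS.server.synthesizer import Synthesizer

    config = AttrDict({
        'use_cuda': False,
        'tts_checkpoint': 'tts/checkpoint.pth.tar',  # hypothetical path
        'tts_config': 'tts/config.json',             # hypothetical path
        'vocoder_checkpoint': None,  # falsy, so load_vocoder() is skipped
        'vocoder_config': None,
        'wavernn_lib_path': None,    # falsy, so load_wavernn() is skipped
    })
    synthesizer = Synthesizer(config)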
setup.py (72 changed lines)
@@ -69,6 +69,41 @@ if 'bdist_wheel' in unknown_args and args.checkpoint and args.model_config:
     shutil.copy(args.model_config, embedded_config_path)
     package_data.extend([embedded_checkpoint_path, embedded_config_path])

+
+def pip_install(package_name):
+    subprocess.call(
+        [sys.executable, '-m', 'pip', 'install', package_name]
+    )
+
+
+requirements = {
+    'install_requires':[
+        "torch>=1.5",
+        "numpy>=1.16.0",
+        "numba==0.48",
+        "scipy>=0.19.0",
+        "librosa==0.7.2",
+        "unidecode==0.4.20",
+        "attrdict",
+        "tensorboardX",
+        "matplotlib",
+        "Pillow",
+        "flask",
+        "tqdm",
+        "inflect",
+        "bokeh==1.4.0",
+        "soundfile",
+        "phonemizer>=2.2.0",
+        "nose==1.3.7",
+        "cardboardlint==1.3.0",
+        "pylint==2.5.3",
+    ],
+    'pip_install':[
+        'tensorflow>=2.2.0',
+    ]
+}
+
+
 setup(
     name='TTS',
     version=version,
@@ -95,24 +130,23 @@ setup(
         'build_py': build_py,
         'develop': develop,
     },
-    install_requires=[
-        "scipy>=0.19.0",
-        "torch>=1.5",
-        "numpy>=1.16.0",
-        "librosa==0.6.2",
-        "unidecode==0.4.20",
-        "attrdict",
-        "tensorboardX",
-        "matplotlib",
-        "Pillow",
-        "flask",
-        "tqdm",
-        "inflect",
-        "bokeh==1.4.0",
-        "soundfile",
-        "phonemizer @ https://github.com/bootphon/phonemizer/tarball/master",
-    ],
-    dependency_links=[
-        "http://github.com/bootphon/phonemizer/tarball/master#egg=phonemizer-1.0.1"
-    ],
+    install_requires=requirements['install_requires'],
     python_requires='>=3.6.0',
     classifiers=[
         "Programming Language :: Python",
         "Programming Language :: Python :: 3",
         "Programming Language :: Python :: 3.6",
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
         'Development Status :: 3 - Alpha',
         "Intended Audience :: Science/Research :: Developers",
         "Operating System :: POSIX :: Linux",
         'License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)',
         "Topic :: Software Development :: Libraries :: Python Modules :: Speech :: Sound/Audio :: Multimedia :: Artificial Intelligence",
     ]
 )
+
+# for some reason having tensorflow in 'install_requires'
+# breaks some of the dependencies.
+for module in requirements['pip_install']:
+    pip_install(module)
Binary file not shown.
@@ -6,7 +6,7 @@
     "wavernn_file": null,       // wavernn checkpoint file name
     "wavernn_config": null,     // wavernn config file
     "vocoder_config":null,
-    "vocoder_file": null,
+    "vocoder_checkpoint": null,
     "is_wavernn_batched":true,
     "port": 5002,
     "use_cuda": false,
@@ -14,6 +14,9 @@ rm -f dist/*.whl
 python setup.py --quiet bdist_wheel --checkpoint tests/outputs/checkpoint_10.pth.tar --model_config tests/outputs/dummy_model_config.json
 pip install --quiet dist/TTS*.whl

+# this is related to https://github.com/librosa/librosa/issues/1160
+pip install numba==0.48
+
 python -m TTS.server.server &
 SERVER_PID=$!
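The pin works around librosa/librosa#1160: numba 0.49 removed internals that librosa 0.7.2 still imports, so the wheel's environment is forced back to 0.48. A hedged post-install check, for illustration:

    import numba

    # After the pin above this should report 0.48.x; on numba>=0.49,
    # importing librosa 0.7.2 fails instead.
    print(numba.__version__)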
@@ -0,0 +1 @@
+
@@ -0,0 +1,134 @@
import os
import torch
import unittest
import numpy as np
import tensorflow as tf
tf.get_logger().setLevel('INFO')

from TTS.utils.io import load_config
from TTS.tf.models.tacotron2 import Tacotron2
from TTS.tf.utils.tflite import convert_tacotron2_to_tflite, load_tflite_model

#pylint: disable=unused-variable

torch.manual_seed(1)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

file_path = os.path.dirname(os.path.realpath(__file__)).replace('/tf/', '/')
c = load_config(os.path.join(file_path, 'test_config.json'))


class TacotronTFTrainTest(unittest.TestCase):

    @staticmethod
    def generate_dummy_inputs():
        chars_seq = torch.randint(0, 24, (8, 128)).long().to(device)
        chars_seq_lengths = torch.randint(100, 128, (8, )).long().to(device)
        chars_seq_lengths = torch.sort(chars_seq_lengths, descending=True)[0]
        mel_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
        mel_postnet_spec = torch.rand(8, 30, c.audio['num_mels']).to(device)
        mel_lengths = torch.randint(20, 30, (8, )).long().to(device)
        stop_targets = torch.zeros(8, 30, 1).float().to(device)
        speaker_ids = torch.randint(0, 5, (8, )).long().to(device)

        chars_seq = tf.convert_to_tensor(chars_seq.cpu().numpy())
        chars_seq_lengths = tf.convert_to_tensor(chars_seq_lengths.cpu().numpy())
        mel_spec = tf.convert_to_tensor(mel_spec.cpu().numpy())
        return chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
            stop_targets, speaker_ids

    def test_train_step(self):
        ''' test forward pass '''
        chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
            stop_targets, speaker_ids = self.generate_dummy_inputs()

        for idx in mel_lengths:
            stop_targets[:, int(idx.item()):, 0] = 1.0

        stop_targets = stop_targets.view(chars_seq.shape[0],
                                         stop_targets.size(1) // c.r, -1)
        stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5)
        # training pass
        output = model(chars_seq, chars_seq_lengths, mel_spec, training=True)

        # check model output shapes
        assert np.all(output[0].shape == mel_spec.shape)
        assert np.all(output[1].shape == mel_spec.shape)
        assert output[2].shape[2] == chars_seq.shape[1]
        assert output[2].shape[1] == (mel_spec.shape[1] // model.decoder.r)
        assert output[3].shape[1] == (mel_spec.shape[1] // model.decoder.r)

        # inference pass
        output = model(chars_seq, training=False)

    def test_forward_attention(self,):
        chars_seq, chars_seq_lengths, mel_spec, mel_postnet_spec, mel_lengths,\
            stop_targets, speaker_ids = self.generate_dummy_inputs()

        for idx in mel_lengths:
            stop_targets[:, int(idx.item()):, 0] = 1.0

        stop_targets = stop_targets.view(chars_seq.shape[0],
                                         stop_targets.size(1) // c.r, -1)
        stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze()

        model = Tacotron2(num_chars=24, r=c.r, num_speakers=5, forward_attn=True)
        # training pass
        output = model(chars_seq, chars_seq_lengths, mel_spec, training=True)

        # check model output shapes
        assert np.all(output[0].shape == mel_spec.shape)
        assert np.all(output[1].shape == mel_spec.shape)
        assert output[2].shape[2] == chars_seq.shape[1]
        assert output[2].shape[1] == (mel_spec.shape[1] // model.decoder.r)
        assert output[3].shape[1] == (mel_spec.shape[1] // model.decoder.r)

        # inference pass
        output = model(chars_seq, training=False)

    def test_tflite_conversion(self, ):  #pylint:disable=no-self-use
        model = Tacotron2(num_chars=24,
                          num_speakers=0,
                          r=3,
                          postnet_output_dim=80,
                          decoder_output_dim=80,
                          attn_type='original',
                          attn_win=False,
                          attn_norm='sigmoid',
                          prenet_type='original',
                          prenet_dropout=True,
                          forward_attn=False,
                          trans_agent=False,
                          forward_attn_mask=False,
                          location_attn=True,
                          attn_K=0,
                          separate_stopnet=True,
                          bidirectional_decoder=False,
                          enable_tflite=True)
        model.build_inference()
        convert_tacotron2_to_tflite(model, output_path='test_tacotron2.tflite', experimental_converter=True)
        # init tflite model
        tflite_model = load_tflite_model('test_tacotron2.tflite')
        # fake input
        inputs = tf.random.uniform([1, 4], maxval=10, dtype=tf.int32)  #pylint:disable=unexpected-keyword-arg
        # run inference
        # get input and output details
        input_details = tflite_model.get_input_details()
        output_details = tflite_model.get_output_details()
        # reshape input tensor for the new input shape
        tflite_model.resize_tensor_input(input_details[0]['index'], inputs.shape)  #pylint:disable=unexpected-keyword-arg
        tflite_model.allocate_tensors()
        detail = input_details[0]
        input_shape = detail['shape']
        tflite_model.set_tensor(detail['index'], inputs)
        # run the tflite_model
        tflite_model.invoke()
        # collect outputs
        decoder_output = tflite_model.get_tensor(output_details[0]['index'])
        postnet_output = tflite_model.get_tensor(output_details[1]['index'])
        # remove tflite binary
        os.remove('test_tacotron2.tflite')
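Outside the test, the same helpers can drive an export; a hedged sketch, assuming `model` is a Tacotron2 built with enable_tflite=True and build_inference() already called, as above:

    from TTS.tf.utils.tflite import convert_tacotron2_to_tflite, load_tflite_model

    # Export the model, then reload it as a TF Lite interpreter.
    convert_tacotron2_to_tflite(model, output_path='tacotron2.tflite',
                                experimental_converter=True)
    interpreter = load_tflite_model('tacotron2.tflite')
    interpreter.allocate_tensors()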
@@ -59,9 +59,9 @@ def gan_dataset_case(batch_size, seq_len, hop_len, conv_pad, return_segments, us
             audio = wav1[idx].squeeze()
             feat = feat1[idx]
             mel = ap.melspectrogram(audio)
-            # the first 2 and the last frame is skipped due to the padding
-            # applied in spec. computation.
-            assert (feat - mel[:, :feat1.shape[-1]])[:, 2:-1].sum() == 0, f' [!] {(feat - mel[:, :feat1.shape[-1]])[:, 2:-1].sum()}'
+            # the first 2 and the last 2 frames are skipped due to the padding
+            # differences in stft
+            assert (feat - mel[:, :feat1.shape[-1]])[:, 2:-2].sum() <= 0, f' [!] {(feat - mel[:, :feat1.shape[-1]])[:, 2:-2].sum()}'

             count_iter += 1
             # if count_iter == max_iter:
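For clarity, a hedged illustration (not in the diff) of the trimming the new assert performs; shapes are hypothetical:

    import numpy as np

    feat = np.zeros((80, 50))  # hypothetical cached features
    mel = np.zeros((80, 53))   # hypothetical freshly computed mel
    # Align lengths, then drop the first 2 and last 2 frames, where the two
    # stft padding schemes disagree, before comparing.
    trimmed = (feat - mel[:, :feat.shape[-1]])[:, 2:-2]
    assert trimmed.sum() <= 0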