coqui-tts/notebooks/Tortoise.ipynb

In [ ]:
#@title # Setup
# Imports used through the rest of the notebook.
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F

import IPython

from TTS.tts.models.tortoise import TextToSpeech
from TTS.tts.layers.tortoise.audio_utils import load_audio, load_voice, load_voices

# This will download all the models used by Tortoise from the HuggingFace hub.
tts = TextToSpeech()
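In [ ]:
#@markdown (Optional) Quick sanity check of the compute device. Tortoise is much slower on CPU,
#@markdown so a CUDA GPU is recommended if one is available. This is just a minimal convenience
#@markdown sketch using standard PyTorch calls.
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Report the first visible GPU.
    print(f"GPU: {torch.cuda.get_device_name(0)}")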
In [ ]:
# This is the text that will be spoken.
text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?" #@param {type:"string"}
#@markdown Show code for multiline text input
# Here's something for the poetically inclined... (uncomment to use it as the input text)
# text = """
# Then took the other, as just as fair,
# And having perhaps the better claim,
# Because it was grassy and wanted wear;
# Though as for that the passing there
# Had worn them really about the same,"""

# Pick a "preset mode" to determine quality. Options: {"ultra_fast", "fast" (default), "standard", "high_quality"}. See docs in api.py
preset = "fast" #@param ["ultra_fast", "fast", "standard", "high_quality"]
In [ ]:
%ls ../TTS/tts/utils/assets/tortoise/voices/
import IPython
IPython.display.Audio(filename='../TTS/tts/utils/assets/tortoise/voices/lj/1.wav')
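In [ ]:
#@markdown (Optional) The same listing done in plain Python. A minimal sketch that assumes the
#@markdown notebook is run from the `notebooks/` directory, so the voices live at the relative
#@markdown path used above.
import os

voices_dir = '../TTS/tts/utils/assets/tortoise/voices/'
print(sorted(os.listdir(voices_dir)))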
In [ ]:
#@markdown Pick one of the voices from the output above
voice = 'lj' #@param {type:"string"}

#@markdown Load it and send it through Tortoise.
voice_samples, conditioning_latents = load_voice(voice)
gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents, 
                          preset=preset)
torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)
IPython.display.Audio('generated.wav')
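In [ ]:
#@markdown (Optional) `load_voices` (imported in the setup cell) combines several voices into a
#@markdown single conditioning signal. A hedged sketch: the voice names below are assumptions,
#@markdown so pick any two folders shown by the listing above.
voices = ['lj', 'pat']  # assumed voice folders; adjust to match the voices listed earlier
voice_samples, conditioning_latents = load_voices(voices)
gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents,
                          preset=preset)
torchaudio.save('generated_combined.wav', gen.squeeze(0).cpu(), 24000)
IPython.display.Audio('generated_combined.wav')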