mirror of https://github.com/coqui-ai/TTS.git
In [ ]:
#@title # Setup
# Imports used through the rest of the notebook.
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F

import IPython

from TTS.tts.models.tortoise import TextToSpeech
from TTS.tts.layers.tortoise.audio_utils import load_audio, load_voice, load_voices

# This will download all the models used by Tortoise from the HuggingFace hub.
tts = TextToSpeech()
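Tortoise is slow even with the "fast" preset, so it is worth confirming that PyTorch can see a GPU before generating anything. The cell below is a minimal optional check using only standard torch calls; it does not change how the model was loaded above.

In [ ]:
# Optional sanity check: Tortoise runs far faster on a GPU than on CPU.
if torch.cuda.is_available():
    print("GPU available:", torch.cuda.get_device_name(0))
else:
    print("No GPU detected; generation will run on the CPU and be much slower.")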
In [ ]:
# This is the text that will be spoken.
text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?" #@param {type:"string"}

#@markdown Show code for multiline text input
# Here's something for the poetically inclined.. (set text=)
"""
Then took the other, as just as fair,
And having perhaps the better claim,
Because it was grassy and wanted wear;
Though as for that the passing there
Had worn them really about the same,"""

# Pick a "preset mode" to determine quality. Options: {"ultra_fast", "fast" (default), "standard", "high_quality"}. See docs in api.py
preset = "fast" #@param ["ultra_fast", "fast", "standard", "high_quality"]
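To synthesize the poem instead of the one-line prompt, assign the multi-line string to text yourself. The cell below is just a sketch of that assignment, reusing the lines already shown above.

In [ ]:
# Optional: speak the poem instead of the single-sentence prompt.
text = """Then took the other, as just as fair,
And having perhaps the better claim,
Because it was grassy and wanted wear;
Though as for that the passing there
Had worn them really about the same,"""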
In [ ]:
%ls ../TTS/tts/utils/assets/tortoise/voices/

import IPython
IPython.display.Audio(filename='../TTS/tts/utils/assets/tortoise/voices/lj/1.wav')
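You can also load a reference clip as a tensor to check its length. The cell below is a small sketch assuming load_audio (imported in the setup cell) takes a file path and a target sampling rate; 22050 Hz is assumed here as the rate Tortoise uses for conditioning audio.

In [ ]:
# Sketch: load one reference clip as a tensor (assumes load_audio(path, sampling_rate)).
clip = load_audio('../TTS/tts/utils/assets/tortoise/voices/lj/1.wav', 22050)
print("shape:", tuple(clip.shape), "| duration: %.2f s" % (clip.shape[-1] / 22050))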
In [ ]:
#@markdown Pick one of the voices from the output above
voice = 'lj' #@param {type:"string"}

#@markdown Load it and send it through Tortoise.
voice_samples, conditioning_latents = load_voice(voice)
gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents,
                          preset=preset)
torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)
IPython.display.Audio('generated.wav')
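Tortoise can also condition on more than one voice at a time via load_voices (imported in the setup cell), which takes a list of voice names. The cell below is a sketch of that; the repeated voice name is a placeholder, so substitute any two folders from the voice listing above.

In [ ]:
#@markdown Sketch: blend two voices. The names below are placeholders; pick any
#@markdown two folders from the %ls listing above.
voice_samples, conditioning_latents = load_voices(['lj', 'lj'])  # replace with two distinct voices
gen = tts.tts_with_preset(text, voice_samples=voice_samples, conditioning_latents=conditioning_latents,
                          preset=preset)
torchaudio.save('generated_combined.wav', gen.squeeze(0).cpu(), 24000)
IPython.display.Audio('generated_combined.wav')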