mirror of https://github.com/coqui-ai/TTS.git
Fix style tests
This commit is contained in:
parent
0ae1e0248c
commit
3435bc8fca
|
@ -4,6 +4,7 @@ import os
|
||||||
import pathlib
|
import pathlib
|
||||||
|
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from TTS.utils.vad import get_vad_model_and_utils, remove_silence
|
from TTS.utils.vad import get_vad_model_and_utils, remove_silence
|
||||||
|
|
||||||
|
|
||||||
|
@ -16,7 +17,13 @@ def adjust_path_and_remove_silence(audio_path):
|
||||||
# create all directory structure
|
# create all directory structure
|
||||||
pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
|
pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
|
||||||
# remove the silence and save the audio
|
# remove the silence and save the audio
|
||||||
output_path = remove_silence(model_and_utils, audio_path, output_path, trim_just_beginning_and_end=args.trim_just_beginning_and_end, use_cuda=args.use_cuda)
|
output_path = remove_silence(
|
||||||
|
model_and_utils,
|
||||||
|
audio_path,
|
||||||
|
output_path,
|
||||||
|
trim_just_beginning_and_end=args.trim_just_beginning_and_end,
|
||||||
|
use_cuda=args.use_cuda,
|
||||||
|
)
|
||||||
|
|
||||||
return output_path
|
return output_path
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
|
|
||||||
|
|
||||||
def read_audio(path):
|
def read_audio(path):
|
||||||
wav, sr = torchaudio.load(path)
|
wav, sr = torchaudio.load(path)
|
||||||
|
|
||||||
|
@ -9,39 +10,42 @@ def read_audio(path):
|
||||||
|
|
||||||
return wav.squeeze(0), sr
|
return wav.squeeze(0), sr
|
||||||
|
|
||||||
|
|
||||||
def resample_wav(wav, sr, new_sr):
|
def resample_wav(wav, sr, new_sr):
|
||||||
wav = wav.unsqueeze(0)
|
wav = wav.unsqueeze(0)
|
||||||
transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=new_sr)
|
transform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=new_sr)
|
||||||
wav = transform(wav)
|
wav = transform(wav)
|
||||||
return wav.squeeze(0)
|
return wav.squeeze(0)
|
||||||
|
|
||||||
|
|
||||||
def map_timestamps_to_new_sr(vad_sr, new_sr, timestamps, just_begging_end=False):
|
def map_timestamps_to_new_sr(vad_sr, new_sr, timestamps, just_begging_end=False):
|
||||||
factor = new_sr / vad_sr
|
factor = new_sr / vad_sr
|
||||||
new_timestamps = []
|
new_timestamps = []
|
||||||
if just_begging_end and timestamps:
|
if just_begging_end and timestamps:
|
||||||
# get just the start and end timestamps
|
# get just the start and end timestamps
|
||||||
new_dict = {'start': int(timestamps[0]['start']*factor), 'end': int(timestamps[-1]['end']*factor)}
|
new_dict = {"start": int(timestamps[0]["start"] * factor), "end": int(timestamps[-1]["end"] * factor)}
|
||||||
new_timestamps.append(new_dict)
|
new_timestamps.append(new_dict)
|
||||||
else:
|
else:
|
||||||
for ts in timestamps:
|
for ts in timestamps:
|
||||||
# map to the new SR
|
# map to the new SR
|
||||||
new_dict = {'start': int(ts['start']*factor), 'end': int(ts['end']*factor)}
|
new_dict = {"start": int(ts["start"] * factor), "end": int(ts["end"] * factor)}
|
||||||
new_timestamps.append(new_dict)
|
new_timestamps.append(new_dict)
|
||||||
|
|
||||||
return new_timestamps
|
return new_timestamps
|
||||||
|
|
||||||
|
|
||||||
def get_vad_model_and_utils(use_cuda=False):
|
def get_vad_model_and_utils(use_cuda=False):
|
||||||
model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
|
model, utils = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=True, onnx=False)
|
||||||
model='silero_vad',
|
|
||||||
force_reload=True,
|
|
||||||
onnx=False)
|
|
||||||
if use_cuda:
|
if use_cuda:
|
||||||
model = model.cuda()
|
model = model.cuda()
|
||||||
|
|
||||||
get_speech_timestamps, save_audio, _, _, collect_chunks = utils
|
get_speech_timestamps, save_audio, _, _, collect_chunks = utils
|
||||||
return model, get_speech_timestamps, save_audio, collect_chunks
|
return model, get_speech_timestamps, save_audio, collect_chunks
|
||||||
|
|
||||||
def remove_silence(model_and_utils, audio_path, out_path, vad_sample_rate=8000, trim_just_beginning_and_end=True, use_cuda=False):
|
|
||||||
|
def remove_silence(
|
||||||
|
model_and_utils, audio_path, out_path, vad_sample_rate=8000, trim_just_beginning_and_end=True, use_cuda=False
|
||||||
|
):
|
||||||
|
|
||||||
# get the VAD model and utils functions
|
# get the VAD model and utils functions
|
||||||
model, get_speech_timestamps, save_audio, collect_chunks = model_and_utils
|
model, get_speech_timestamps, save_audio, collect_chunks = model_and_utils
|
||||||
|
@ -62,7 +66,9 @@ def remove_silence(model_and_utils, audio_path, out_path, vad_sample_rate=8000,
|
||||||
speech_timestamps = get_speech_timestamps(wav_vad, model, sampling_rate=vad_sample_rate, window_size_samples=768)
|
speech_timestamps = get_speech_timestamps(wav_vad, model, sampling_rate=vad_sample_rate, window_size_samples=768)
|
||||||
|
|
||||||
# map the current speech_timestamps to the sample rate of the ground truth audio
|
# map the current speech_timestamps to the sample rate of the ground truth audio
|
||||||
new_speech_timestamps = map_timestamps_to_new_sr(vad_sample_rate, gt_sample_rate, speech_timestamps, trim_just_beginning_and_end)
|
new_speech_timestamps = map_timestamps_to_new_sr(
|
||||||
|
vad_sample_rate, gt_sample_rate, speech_timestamps, trim_just_beginning_and_end
|
||||||
|
)
|
||||||
|
|
||||||
# if have speech timestamps else save the wav
|
# if have speech timestamps else save the wav
|
||||||
if new_speech_timestamps:
|
if new_speech_timestamps:
|
||||||
|
|
Loading…
Reference in New Issue