mirror of https://github.com/coqui-ai/TTS.git
Update VAD for silence trimming. (#2604)
* Update vad for mp3 and fault tolerance
* Make style
* Remove import
* Remove stupid defaults
This commit is contained in:
parent 5c89c621ca
commit 9b5822d625
@@ -16,7 +16,7 @@ def adjust_path_and_remove_silence(audio_path):
     output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
     # ignore if the file exists
     if os.path.exists(output_path) and not args.force:
-        return output_path
+        return output_path, False
 
     # create all directory structure
     pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
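The early exit for already-processed files now returns the same two-value shape as the regular path, (output_path, is_speech), reporting False for the speech flag, so the caller can always unpack a pair. A tiny sketch of the pattern with a hypothetical helper (not part of the script):

import os

def skip_if_exists(output_path, force=False):
    """Hypothetical stand-in: every exit yields the same (path, is_speech) pair."""
    if os.path.exists(output_path) and not force:
        return output_path, False  # skipped: no claim about speech content
    # ... full VAD-based silence removal would run here ...
    return output_path, True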
@@ -28,7 +28,6 @@ def adjust_path_and_remove_silence(audio_path):
         trim_just_beginning_and_end=args.trim_just_beginning_and_end,
         use_cuda=args.use_cuda,
     )
 
     return output_path, is_speech
 
-
@@ -70,7 +69,7 @@ def preprocess_audios():
         # write files that do not have speech
         with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
             for file in filtered_files:
-                f.write(file + "\n")
+                f.write(str(file) + "\n")
     else:
         print("> No files Found !")
 
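Wrapping the entry in str() keeps the report write from raising when the filtered list contains something other than a plain string; with the fault-tolerant read path added further down in this commit, a file that cannot be decoded likely ends up in the list as None, which the old `file + "\n"` concatenation would reject. A small illustrative snippet (file names are made up):

# Illustrative only: a None entry (failed read) no longer breaks the report.
filtered_files = ["wav48_silence_trimmed/p225/p225_001_mic1.flac", None]
with open("filtered_files.txt", "w", encoding="utf-8") as f:
    for file in filtered_files:
        f.write(str(file) + "\n")  # str() tolerates None and Path objects alike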
@@ -79,10 +78,8 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end True"
     )
-    parser.add_argument("-i", "--input_dir", type=str, default="../VCTK-Corpus", help="Dataset root dir")
-    parser.add_argument(
-        "-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir"
-    )
+    parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
+    parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
     parser.add_argument("-f", "--force", default=False, action="store_true", help="Force the replace of exists files")
     parser.add_argument(
         "-g",
@@ -118,6 +115,10 @@ if __name__ == "__main__":
         help="Number of processes to use",
     )
     args = parser.parse_args()
+
+    if args.output_dir == "":
+        args.output_dir = args.input_dir
+
     # load the model and utils
-    model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda)
+    model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda, use_onnx=args.use_onnx)
     preprocess_audios()
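--input_dir is now required and --output_dir defaults to an empty string, which the new fallback resolves to the input directory, so the dataset is processed in place unless an explicit output directory is given. A short, self-contained sketch of how the arguments resolve (the "VCTK-Corpus/" value is just an example):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
args = parser.parse_args(["-i", "VCTK-Corpus/"])

if args.output_dir == "":
    args.output_dir = args.input_dir  # no -o given: write next to the inputs

print(args.input_dir, args.output_dir)  # VCTK-Corpus/ VCTK-Corpus/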
@@ -1,4 +1,3 @@
-import soundfile as sf
 import torch
 import torchaudio
 
@@ -35,8 +34,10 @@ def map_timestamps_to_new_sr(vad_sr, new_sr, timestamps, just_begging_end=False)
     return new_timestamps
 
 
-def get_vad_model_and_utils(use_cuda=False):
-    model, utils = torch.hub.load(repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=True, onnx=False)
+def get_vad_model_and_utils(use_cuda=False, use_onnx=False):
+    model, utils = torch.hub.load(
+        repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=True, onnx=use_onnx, force_onnx_cpu=True
+    )
     if use_cuda:
         model = model.cuda()
 
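The VAD utility module's get_vad_model_and_utils now exposes a use_onnx switch that is forwarded to the Silero VAD hub entry point; with onnx=True the model runs through the ONNX build, and force_onnx_cpu=True keeps that backend on the CPU. A hedged sketch of loading the model directly; the five-element utils unpacking follows the silero-vad README and should be treated as an assumption here:

import torch

# Sketch: load Silero VAD via torch.hub; set onnx=True for the ONNX build.
model, utils = torch.hub.load(
    repo_or_dir="snakers4/silero-vad",
    model="silero_vad",
    onnx=False,
)
# Assumed unpacking order, as documented upstream:
get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks = utils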
@@ -51,7 +52,11 @@ def remove_silence(
     model, get_speech_timestamps, _, collect_chunks = model_and_utils
 
     # read ground truth wav and resample the audio for the VAD
-    wav, gt_sample_rate = read_audio(audio_path)
+    try:
+        wav, gt_sample_rate = read_audio(audio_path)
+    except:
+        print(f"> ❗ Failed to read {audio_path}")
+        return None, False
+
 
     # if needed, resample the audio for the VAD model
     if gt_sample_rate != vad_sample_rate:
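Reading is now wrapped in a try/except, so an unreadable or corrupt file (the commit message cites mp3 support and fault tolerance) is reported and skipped with (None, False) instead of aborting the whole run. A self-contained sketch of the same read-then-resample idea using torchaudio; the helper name and target rate are illustrative, not the script's API:

import torchaudio

def safe_read_for_vad(path, vad_sample_rate=8000):
    """Illustrative helper: tolerate bad files and resample for the VAD model."""
    try:
        wav, sr = torchaudio.load(path)
    except Exception:
        print(f"> ❗ Failed to read {path}")
        return None, False
    if sr != vad_sample_rate:
        wav = torchaudio.functional.resample(wav, orig_freq=sr, new_freq=vad_sample_rate)
    return wav, True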
@@ -78,6 +83,6 @@ def remove_silence(
         print(f"> The file {audio_path} probably does not have speech please check it !!")
         is_speech = False
 
-    # save audio
-    sf.write(out_path, wav, gt_sample_rate, subtype="PCM_16")
+    # save
+    torchaudio.save(out_path, wav[None, :], gt_sample_rate)
     return out_path, is_speech
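With the soundfile import gone, writing also goes through torchaudio. torchaudio.save expects a 2-D (channels, frames) tensor, hence the wav[None, :] that adds a channel axis to the 1-D waveform, and the subtype is no longer pinned to 16-bit PCM. A minimal, self-contained sketch with made-up example data:

import torch
import torchaudio

# One second of a 440 Hz tone at 16 kHz, purely as example data.
sr = 16000
t = torch.arange(sr) / sr
wav = 0.1 * torch.sin(2 * torch.pi * 440 * t)   # 1-D waveform, shape (16000,)

# torchaudio.save wants (channels, frames); wav[None, :] gives shape (1, 16000).
torchaudio.save("tone.wav", wav[None, :], sr)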