From 5f5d441ee587f50a62160ef1f08839319fb21619 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Eren=20G=C3=B6lge?= Date: Thu, 6 Oct 2022 13:25:54 +0200 Subject: [PATCH] Write non-speech files in a TXT (#2048) * Write non-speech files in a txt * Save 16-bit wav out of vad --- TTS/bin/remove_silence_using_vad.py | 16 ++++++++++++---- TTS/utils/vad.py | 9 ++++++--- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py index 7d88ae91..352628bb 100755 --- a/TTS/bin/remove_silence_using_vad.py +++ b/TTS/bin/remove_silence_using_vad.py @@ -17,7 +17,7 @@ def adjust_path_and_remove_silence(audio_path): # create all directory structure pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True) # remove the silence and save the audio - output_path = remove_silence( + output_path, is_speech = remove_silence( model_and_utils, audio_path, output_path, @@ -25,26 +25,34 @@ def adjust_path_and_remove_silence(audio_path): use_cuda=args.use_cuda, ) - return output_path + return output_path, is_speech def preprocess_audios(): files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True)) print("> Number of files: ", len(files)) if not args.force: - print("> Ignoring files that already exist in the output directory.") + print("> Ignoring files that already exist in the output directory.") if args.trim_just_beginning_and_end: print("> Trimming just the beginning and the end with nonspeech parts.") else: print("> Trimming all nonspeech parts.") + filtered_files = [] if files: # create threads # num_threads = multiprocessing.cpu_count() # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15) for f in tqdm(files): - adjust_path_and_remove_silence(f) + output_path, is_speech = adjust_path_and_remove_silence(f) + if not is_speech: + filtered_files.append(output_path) + + # write files that do not have speech + with open(os.path.join(args.output_dir, 
"filtered_files.txt"), "w", encoding="utf-8") as f: + for file in filtered_files: + f.write(file + "\n") else: print("> No files Found !") diff --git a/TTS/utils/vad.py b/TTS/utils/vad.py index 033b911a..c978c837 100644 --- a/TTS/utils/vad.py +++ b/TTS/utils/vad.py @@ -1,3 +1,4 @@ +import soundfile as sf import torch import torchaudio @@ -48,7 +49,7 @@ def remove_silence( ): # get the VAD model and utils functions - model, get_speech_timestamps, save_audio, collect_chunks = model_and_utils + model, get_speech_timestamps, _, collect_chunks = model_and_utils # read ground truth wav and resample the audio for the VAD wav, gt_sample_rate = read_audio(audio_path) @@ -73,9 +74,11 @@ def remove_silence( # if have speech timestamps else save the wav if new_speech_timestamps: wav = collect_chunks(new_speech_timestamps, wav) + is_speech = True else: print(f"> The file {audio_path} probably does not have speech please check it !!") + is_speech = False # save audio - save_audio(out_path, wav, sampling_rate=gt_sample_rate) - return out_path + sf.write(out_path, wav, gt_sample_rate, subtype="PCM_16") + return out_path, is_speech