mirror of https://github.com/coqui-ai/TTS.git
Write non-speech files in a TXT (#2048)
* Write non-speech files in a txt * Save 16-bit wav out of vad
This commit is contained in:
parent
d6ad9a05b4
commit
5f5d441ee5
|
@ -17,7 +17,7 @@ def adjust_path_and_remove_silence(audio_path):
|
|||
# create all directory structure
|
||||
pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
|
||||
# remove the silence and save the audio
|
||||
output_path = remove_silence(
|
||||
output_path, is_speech = remove_silence(
|
||||
model_and_utils,
|
||||
audio_path,
|
||||
output_path,
|
||||
|
@ -25,26 +25,34 @@ def adjust_path_and_remove_silence(audio_path):
|
|||
use_cuda=args.use_cuda,
|
||||
)
|
||||
|
||||
return output_path
|
||||
return output_path, is_speech
|
||||
|
||||
|
||||
def preprocess_audios():
    """Remove silence from every audio file matched by ``args.glob`` under ``args.input_dir``.

    Each file is processed by ``adjust_path_and_remove_silence``, which returns the
    output path and whether any speech was detected.  Files in which the VAD found
    no speech at all are recorded in ``<args.output_dir>/filtered_files.txt`` so
    they can be inspected (and possibly discarded) later.
    """
    files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
    print("> Number of files: ", len(files))
    if not args.force:
        # Fixed typo in the user-facing message: "idrectory" -> "directory".
        print("> Ignoring files that already exist in the output directory.")

    if args.trim_just_beginning_and_end:
        print("> Trimming just the beginning and the end with nonspeech parts.")
    else:
        print("> Trimming all nonspeech parts.")

    filtered_files = []
    if files:
        # NOTE(review): multiprocessing was deliberately left disabled here;
        # processing is sequential. Kept for reference:
        # num_threads = multiprocessing.cpu_count()
        # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
        for audio_file in tqdm(files):
            output_path, is_speech = adjust_path_and_remove_silence(audio_file)
            if not is_speech:
                filtered_files.append(output_path)

        # Write the list of files that do not contain speech.  Renamed the file
        # handle (was `f`, shadowing the loop variable name above) for clarity.
        with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as txt_file:
            for filtered_file in filtered_files:
                txt_file.write(filtered_file + "\n")
    else:
        print("> No files Found !")
|
|
@ -1,3 +1,4 @@
|
|||
import soundfile as sf
|
||||
import torch
|
||||
import torchaudio
|
||||
|
||||
|
@ -48,7 +49,7 @@ def remove_silence(
|
|||
):
|
||||
|
||||
# get the VAD model and utils functions
|
||||
model, get_speech_timestamps, save_audio, collect_chunks = model_and_utils
|
||||
model, get_speech_timestamps, _, collect_chunks = model_and_utils
|
||||
|
||||
# read ground truth wav and resample the audio for the VAD
|
||||
wav, gt_sample_rate = read_audio(audio_path)
|
||||
|
@ -73,9 +74,11 @@ def remove_silence(
|
|||
# if have speech timestamps else save the wav
|
||||
if new_speech_timestamps:
|
||||
wav = collect_chunks(new_speech_timestamps, wav)
|
||||
is_speech = True
|
||||
else:
|
||||
print(f"> The file {audio_path} probably does not have speech please check it !!")
|
||||
is_speech = False
|
||||
|
||||
# save audio
|
||||
save_audio(out_path, wav, sampling_rate=gt_sample_rate)
|
||||
return out_path
|
||||
sf.write(out_path, wav, gt_sample_rate, subtype="PCM_16")
|
||||
return out_path, is_speech
|
||||
|
|
Loading…
Reference in New Issue