From 352aa69ecab16fb76815ff3f0c9c349ff1a83330 Mon Sep 17 00:00:00 2001
From: Edresson
Date: Sun, 21 Nov 2021 12:20:35 -0300
Subject: [PATCH] Create a module for the VAD script

---
 TTS/bin/remove_silence_using_vad.py | 164 +++-------------------------
 TTS/utils/vad.py                    | 142 ++++++++++++++++++++++++
 2 files changed, 160 insertions(+), 146 deletions(-)
 create mode 100644 TTS/utils/vad.py

diff --git a/TTS/bin/remove_silence_using_vad.py b/TTS/bin/remove_silence_using_vad.py
index 8951662b..a32f0f45 100755
--- a/TTS/bin/remove_silence_using_vad.py
+++ b/TTS/bin/remove_silence_using_vad.py
@@ -1,162 +1,31 @@
-# This code is adpated from: https://github.com/wiseman/py-webrtcvad/blob/master/example.py
-import argparse
-import collections
-import contextlib
-import glob
-import multiprocessing
 import os
+import glob
 import pathlib
-import wave
-import webrtcvad
+import argparse
+import multiprocessing
+
 from tqdm.contrib.concurrent import process_map
-
-def read_wave(path):
-    """Reads a .wav file.
-
-    Takes the path, and returns (PCM audio data, sample rate).
-    """
-    with contextlib.closing(wave.open(path, "rb")) as wf:
-        num_channels = wf.getnchannels()
-        assert num_channels == 1
-        sample_width = wf.getsampwidth()
-        assert sample_width == 2
-        sample_rate = wf.getframerate()
-        assert sample_rate in (8000, 16000, 32000, 48000)
-        pcm_data = wf.readframes(wf.getnframes())
-        return pcm_data, sample_rate
-
-
-def write_wave(path, audio, sample_rate):
-    """Writes a .wav file.
-
-    Takes path, PCM audio data, and sample rate.
-    """
-    with contextlib.closing(wave.open(path, "wb")) as wf:
-        wf.setnchannels(1)
-        wf.setsampwidth(2)
-        wf.setframerate(sample_rate)
-        wf.writeframes(audio)
-
-
-class Frame(object):
-    """Represents a "frame" of audio data."""
-
-    def __init__(self, _bytes, timestamp, duration):
-        self.bytes =_bytes
-        self.timestamp = timestamp
-        self.duration = duration
-
-
-def frame_generator(frame_duration_ms, audio, sample_rate):
-    """Generates audio frames from PCM audio data.
-
-    Takes the desired frame duration in milliseconds, the PCM data, and
-    the sample rate.
-
-    Yields Frames of the requested duration.
-    """
-    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
-    offset = 0
-    timestamp = 0.0
-    duration = (float(n) / sample_rate) / 2.0
-    while offset + n < len(audio):
-        yield Frame(audio[offset : offset + n], timestamp, duration)
-        timestamp += duration
-        offset += n
-
-
-def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
-    """Filters out non-voiced audio frames.
-
-    Given a webrtcvad.Vad and a source of audio frames, yields only
-    the voiced audio.
-
-    Uses a padded, sliding window algorithm over the audio frames.
-    When more than 90% of the frames in the window are voiced (as
-    reported by the VAD), the collector triggers and begins yielding
-    audio frames. Then the collector waits until 90% of the frames in
-    the window are unvoiced to detrigger.
-
-    The window is padded at the front and back to provide a small
-    amount of silence or the beginnings/endings of speech around the
-    voiced frames.
-
-    Arguments:
-
-    sample_rate - The audio sample rate, in Hz.
-    frame_duration_ms - The frame duration in milliseconds.
-    padding_duration_ms - The amount to pad the window, in milliseconds.
-    vad - An instance of webrtcvad.Vad.
-    frames - a source of audio frames (sequence or generator).
-
-    Returns: A generator that yields PCM audio data.
-    """
-    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
-    # We use a deque for our sliding window/ring buffer.
-    ring_buffer = collections.deque(maxlen=num_padding_frames)
-    # We have two states: TRIGGERED and NOTTRIGGERED. We start in the
-    # NOTTRIGGERED state.
-    triggered = False
-
-    voiced_frames = []
-    for frame in frames:
-        is_speech = vad.is_speech(frame.bytes, sample_rate)
-
-        # sys.stdout.write('1' if is_speech else '0')
-        if not triggered:
-            ring_buffer.append((frame, is_speech))
-            num_voiced = len([f for f, speech in ring_buffer if speech])
-            # If we're NOTTRIGGERED and more than 90% of the frames in
-            # the ring buffer are voiced frames, then enter the
-            # TRIGGERED state.
-            if num_voiced > 0.9 * ring_buffer.maxlen:
-                triggered = True
-                # sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,))
-                # We want to yield all the audio we see from now until
-                # we are NOTTRIGGERED, but we have to start with the
-                # audio that's already in the ring buffer.
-                for f, _ in ring_buffer:
-                    voiced_frames.append(f)
-                ring_buffer.clear()
-        else:
-            # We're in the TRIGGERED state, so collect the audio data
-            # and add it to the ring buffer.
-            voiced_frames.append(frame)
-            ring_buffer.append((frame, is_speech))
-            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
-            # If more than 90% of the frames in the ring buffer are
-            # unvoiced, then enter NOTTRIGGERED and yield whatever
-            # audio we've collected.
-            if num_unvoiced > 0.9 * ring_buffer.maxlen:
-                # sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
-                triggered = False
-                yield b"".join([f.bytes for f in voiced_frames])
-                ring_buffer.clear()
-                voiced_frames = []
-    # If we have any leftover voiced audio when we run out of input,
-    # yield it.
-    if voiced_frames:
-        yield b"".join([f.bytes for f in voiced_frames])
-
+from TTS.utils.vad import read_wave, write_wave, get_vad_speech_segments
 
 def remove_silence(filepath):
     output_path = filepath.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
     # ignore if the file exists
     if os.path.exists(output_path) and not args.force:
         return
+
     # create all directory structure
     pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
-    padding_duration_ms = 300  # default 300
+    # load wave
     audio, sample_rate = read_wave(filepath)
-    vad = webrtcvad.Vad(int(args.aggressiveness))
-    frames = frame_generator(30, audio, sample_rate)
-    frames = list(frames)
-    segments = vad_collector(sample_rate, 30, padding_duration_ms, vad, frames)
-    flag = False
+
+    # get speech segments
+    segments = get_vad_speech_segments(audio, sample_rate, aggressiveness=args.aggressiveness)
+    segments = list(segments)
     num_segments = len(segments)
-
+    flag = False
+    # create the output wave
     if num_segments != 0:
         for i, segment in reversed(list(enumerate(segments))):
             if i >= 1:
@@ -168,8 +37,8 @@ def remove_silence(filepath):
         else:
             if flag:
                 segment = segment + concat_segment
+            # print("Saving: ", output_path)
             write_wave(output_path, segment, sample_rate)
-            print(output_path)
             return
     else:
         print("> Just Copying the file to:", output_path)
@@ -200,7 +69,10 @@ if __name__ == "__main__":
     parser.add_argument(
         "-o", "--output_dir", type=str, default="../VCTK-Corpus-removed-silence", help="Output Dataset dir"
    )
-    parser.add_argument("-f", "--force", type=bool, default=True, help="Force the replace of exists files")
+    parser.add_argument("-f", "--force",
+                        default=False,
+                        action='store_true',
+                        help='Force overwriting existing output files')
     parser.add_argument(
         "-g",
         "--glob",
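With this refactor, the script delegates all VAD work to the new TTS.utils.vad module and keeps only the file handling. The core flow reduces to the following minimal sketch (the file paths are placeholders; read_wave asserts that the input is 16-bit mono PCM at 8, 16, 32, or 48 kHz):

    from TTS.utils.vad import read_wave, write_wave, get_vad_speech_segments

    # load 16-bit mono PCM audio (read_wave asserts the format)
    audio, sample_rate = read_wave("input.wav")

    # each yielded segment is the raw PCM bytes of one voiced region
    segments = list(get_vad_speech_segments(audio, sample_rate, aggressiveness=2))

    # keep only the voiced regions, dropping the silence between them
    if segments:
        write_wave("output.wav", b"".join(segments), sample_rate)
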
diff --git a/TTS/utils/vad.py b/TTS/utils/vad.py
new file mode 100644
index 00000000..4e61f490
--- /dev/null
+++ b/TTS/utils/vad.py
@@ -0,0 +1,142 @@
+# This code is adapted from: https://github.com/wiseman/py-webrtcvad/blob/master/example.py
+import wave
+import webrtcvad
+import contextlib
+import collections
+
+
+def read_wave(path):
+    """Reads a .wav file.
+
+    Takes the path, and returns (PCM audio data, sample rate).
+    """
+    with contextlib.closing(wave.open(path, "rb")) as wf:
+        num_channels = wf.getnchannels()
+        assert num_channels == 1
+        sample_width = wf.getsampwidth()
+        assert sample_width == 2
+        sample_rate = wf.getframerate()
+        assert sample_rate in (8000, 16000, 32000, 48000)
+        pcm_data = wf.readframes(wf.getnframes())
+        return pcm_data, sample_rate
+
+
+def write_wave(path, audio, sample_rate):
+    """Writes a .wav file.
+
+    Takes path, PCM audio data, and sample rate.
+    """
+    with contextlib.closing(wave.open(path, "wb")) as wf:
+        wf.setnchannels(1)
+        wf.setsampwidth(2)
+        wf.setframerate(sample_rate)
+        wf.writeframes(audio)
+
+
+class Frame(object):
+    """Represents a "frame" of audio data."""
+
+    def __init__(self, _bytes, timestamp, duration):
+        self.bytes = _bytes
+        self.timestamp = timestamp
+        self.duration = duration
+
+
+def frame_generator(frame_duration_ms, audio, sample_rate):
+    """Generates audio frames from PCM audio data.
+
+    Takes the desired frame duration in milliseconds, the PCM data, and
+    the sample rate.
+
+    Yields Frames of the requested duration.
+    """
+    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
+    offset = 0
+    timestamp = 0.0
+    duration = (float(n) / sample_rate) / 2.0
+    while offset + n < len(audio):
+        yield Frame(audio[offset : offset + n], timestamp, duration)
+        timestamp += duration
+        offset += n
+
+
+def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames):
+    """Filters out non-voiced audio frames.
+
+    Given a webrtcvad.Vad and a source of audio frames, yields only
+    the voiced audio.
+
+    Uses a padded, sliding window algorithm over the audio frames.
+    When more than 90% of the frames in the window are voiced (as
+    reported by the VAD), the collector triggers and begins yielding
+    audio frames. Then the collector waits until 90% of the frames in
+    the window are unvoiced to detrigger.
+
+    The window is padded at the front and back to provide a small
+    amount of silence or the beginnings/endings of speech around the
+    voiced frames.
+
+    Arguments:
+
+    sample_rate - The audio sample rate, in Hz.
+    frame_duration_ms - The frame duration in milliseconds.
+    padding_duration_ms - The amount to pad the window, in milliseconds.
+    vad - An instance of webrtcvad.Vad.
+    frames - a source of audio frames (sequence or generator).
+
+    Returns: A generator that yields PCM audio data.
+    """
+    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
+    # We use a deque for our sliding window/ring buffer.
+    ring_buffer = collections.deque(maxlen=num_padding_frames)
+    # We have two states: TRIGGERED and NOTTRIGGERED. We start in the
+    # NOTTRIGGERED state.
+    triggered = False
+
+    voiced_frames = []
+    for frame in frames:
+        is_speech = vad.is_speech(frame.bytes, sample_rate)
+
+        # sys.stdout.write('1' if is_speech else '0')
+        if not triggered:
+            ring_buffer.append((frame, is_speech))
+            num_voiced = len([f for f, speech in ring_buffer if speech])
+            # If we're NOTTRIGGERED and more than 90% of the frames in
+            # the ring buffer are voiced frames, then enter the
+            # TRIGGERED state.
+            if num_voiced > 0.9 * ring_buffer.maxlen:
+                triggered = True
+                # sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,))
+                # We want to yield all the audio we see from now until
+                # we are NOTTRIGGERED, but we have to start with the
+                # audio that's already in the ring buffer.
+                for f, _ in ring_buffer:
+                    voiced_frames.append(f)
+                ring_buffer.clear()
+        else:
+            # We're in the TRIGGERED state, so collect the audio data
+            # and add it to the ring buffer.
+            voiced_frames.append(frame)
+            ring_buffer.append((frame, is_speech))
+            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
+            # If more than 90% of the frames in the ring buffer are
+            # unvoiced, then enter NOTTRIGGERED and yield whatever
+            # audio we've collected.
+            if num_unvoiced > 0.9 * ring_buffer.maxlen:
+                # sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
+                triggered = False
+                yield b"".join([f.bytes for f in voiced_frames])
+                ring_buffer.clear()
+                voiced_frames = []
+    # If we have any leftover voiced audio when we run out of input,
+    # yield it.
+    if voiced_frames:
+        yield b"".join([f.bytes for f in voiced_frames])
+
+
+def get_vad_speech_segments(audio, sample_rate, aggressiveness=2, padding_duration_ms=300):
+    vad = webrtcvad.Vad(int(aggressiveness))
+    frames = list(frame_generator(30, audio, sample_rate))
+    segments = vad_collector(sample_rate, 30, padding_duration_ms, vad, frames)
+
+    return segments
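With the defaults above (30 ms frames, padding_duration_ms=300), vad_collector's ring buffer holds 10 frames, so it triggers once more than 9 of the last 10 frames are voiced and detriggers on the mirror condition for unvoiced frames. Since get_vad_speech_segments yields one PCM chunk per voiced region, the module can also split a long recording into separate utterances. A minimal sketch ("recording.wav" and the output naming are placeholders):

    from TTS.utils.vad import read_wave, write_wave, get_vad_speech_segments

    audio, sample_rate = read_wave("recording.wav")

    # webrtcvad aggressiveness ranges from 0 (least) to 3 (most aggressive filtering)
    for i, segment in enumerate(get_vad_speech_segments(audio, sample_rate, aggressiveness=3)):
        # each segment is a bytes object holding one contiguous voiced region
        write_wave("segment_%03d.wav" % i, segment, sample_rate)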