mirror of https://github.com/coqui-ai/TTS.git
Update TTSDataset
parent 750903d2ba
commit ff23dce081

@@ -37,10 +37,10 @@ def noise_augment_audio(wav):
 class TTSDataset(Dataset):
     def __init__(
         self,
-        outputs_per_step: int,
-        compute_linear_spec: bool,
-        ap: AudioProcessor,
-        samples: List[Dict],
+        outputs_per_step: int = 1,
+        compute_linear_spec: bool = False,
+        ap: AudioProcessor = None,
+        samples: List[Dict] = None,
         tokenizer: "TTSTokenizer" = None,
         compute_f0: bool = False,
         f0_cache_path: str = None,
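
With these defaults in place, every constructor argument of TTSDataset becomes optional, so the dataset can be built from keyword arguments alone and anything omitted falls back to the values above. Below is a minimal, hypothetical sketch of such a call; it reuses the setup pattern from the commented-out example deleted at the end of this diff, the dataset path is a placeholder, and the import paths are assumed from the repo layout rather than shown in this commit.

from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig
from TTS.tts.datasets import TTSDataset, load_tts_samples
from TTS.utils.audio import AudioProcessor

# Gather sample dicts and an AudioProcessor, mirroring the removed example block.
dataset_config = BaseDatasetConfig(
    name="ljspeech",
    meta_file_train="metadata.csv",
    path="/path/to/LJSpeech-1.1",  # placeholder path
)
train_samples, _ = load_tts_samples(dataset_config, eval_split=True)
ap = AudioProcessor.init_from_config(BaseAudioConfig(sample_rate=22050))

# Only the arguments we care about are passed; outputs_per_step, compute_linear_spec,
# etc. fall back to the defaults introduced above.  A real setup would also pass a
# TTSTokenizer instead of leaving the new tokenizer=None default in place.
dataset = TTSDataset(samples=train_samples, ap=ap, compute_f0=False)
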
@@ -118,7 +118,6 @@ class TTSDataset(Dataset):
         self.batch_group_size = batch_group_size
         self._samples = samples
         self.outputs_per_step = outputs_per_step
-        self.sample_rate = ap.sample_rate
         self.compute_linear_spec = compute_linear_spec
         self.return_wav = return_wav
         self.compute_f0 = compute_f0
@@ -153,6 +152,15 @@ class TTSDataset(Dataset):
         if self.verbose:
             self.print_logs()

+    @property
+    def lengths(self):
+        lens = []
+        for item in self.samples:
+            _, wav_file, *_ = _parse_sample(item)
+            audio_len = os.path.getsize(wav_file) / 16 * 8  # assuming 16bit audio
+            lens.append(audio_len)
+        return lens
+
     @property
     def samples(self):
         return self._samples
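
The new lengths property derives a per-clip length estimate straight from the file size on disk: os.path.getsize(wav_file) / 16 * 8 turns a byte count into a 16-bit sample count (bytes × 8 bits ÷ 16 bits per sample) without decoding any audio, so WAV header bytes are counted too and the value is only an approximation. Below is a hypothetical sketch of how such a property could be used to pre-filter clips by duration; the dataset variable, the 22050 Hz sample rate and the thresholds are assumptions for illustration, not part of this commit.

# Hypothetical usage sketch: drop clips estimated to be shorter than 0.5 s
# or longer than 10 s, using the cheap byte-size based `lengths` estimate.
sample_rate = 22050                                    # assumed sample rate
min_len, max_len = 0.5 * sample_rate, 10.0 * sample_rate

kept = [
    sample
    for sample, length in zip(dataset.samples, dataset.lengths)
    if min_len <= length <= max_len                    # `length` is an estimated sample count
]
print(f"kept {len(kept)} of {len(dataset.samples)} clips")
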
@@ -763,80 +771,3 @@ class F0Dataset:
         print(f"{indent}| > Number of instances : {len(self.samples)}")


-# if __name__ == "__main__":
-#     from torch.utils.data import DataLoader

-#     from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig
-#     from TTS.tts.datasets import load_tts_samples
-#     from TTS.tts.utils.text.characters import IPAPhonemes
-#     from TTS.tts.utils.text.phonemizers import ESpeak

-#     dataset_config = BaseDatasetConfig(
-#         name="ljspeech",
-#         meta_file_train="metadata.csv",
-#         path="/Users/erengolge/Projects/TTS/recipes/ljspeech/LJSpeech-1.1",
-#     )
-#     train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True)
-#     samples = train_samples + eval_samples

-#     phonemizer = ESpeak(language="en-us")
-#     tokenizer = TTSTokenizer(use_phonemes=True, characters=IPAPhonemes(), phonemizer=phonemizer)
-#     # ph_dataset = PhonemeDataset(samples, tokenizer, phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests")
-#     # ph_dataset.precompute(num_workers=4)

-#     # dataloader = DataLoader(ph_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=ph_dataset.collate_fn)
-#     # for batch in dataloader:
-#     #     print(batch)
-#     #     break

-#     audio_config = BaseAudioConfig(
-#         sample_rate=22050,
-#         win_length=1024,
-#         hop_length=256,
-#         num_mels=80,
-#         preemphasis=0.0,
-#         ref_level_db=20,
-#         log_func="np.log",
-#         do_trim_silence=True,
-#         trim_db=45,
-#         mel_fmin=0,
-#         mel_fmax=8000,
-#         spec_gain=1.0,
-#         signal_norm=False,
-#         do_amp_to_db_linear=False,
-#     )

-#     ap = AudioProcessor.init_from_config(audio_config)

-#     # f0_dataset = F0Dataset(samples, ap, cache_path="/Users/erengolge/Projects/TTS/f0_tests", verbose=False, precompute_num_workers=4)

-#     # dataloader = DataLoader(f0_dataset, batch_size=4, shuffle=False, num_workers=4, collate_fn=f0_dataset.collate_fn)
-#     # for batch in dataloader:
-#     #     print(batch)
-#     #     breakpoint()
-#     #     break

-#     dataset = TTSDataset(
-#         outputs_per_step=1,
-#         compute_linear_spec=False,
-#         samples=samples,
-#         ap=ap,
-#         return_wav=False,
-#         batch_group_size=0,
-#         min_seq_len=0,
-#         max_seq_len=500,
-#         use_noise_augment=False,
-#         verbose=True,
-#         speaker_id_mapping=None,
-#         d_vector_mapping=None,
-#         compute_f0=True,
-#         f0_cache_path="/Users/erengolge/Projects/TTS/f0_tests",
-#         tokenizer=tokenizer,
-#         phoneme_cache_path="/Users/erengolge/Projects/TTS/phonemes_tests",
-#         precompute_num_workers=4,
-#     )

-#     dataloader = DataLoader(dataset, batch_size=4, shuffle=False, num_workers=0, collate_fn=dataset.collate_fn)
-#     for batch in dataloader:
-#         print(batch)
-#         break