Handle wider range of sentence splits

2020-07-13 02:05:52 +01:00 · 2020-07-13 02:05:52 +01:00 · ce2481d9cd
parent 95b6a16d65
commit ce2481d9cd
2 changed files with 32 additions and 38 deletions
--- a/server/synthesizer.py
+++ b/server/synthesizer.py
@ -6,6 +6,7 @@ import time
 import numpy as np
 import torch
 import yaml
+import pysbd

 from TTS.utils.audio import AudioProcessor
 from TTS.utils.io import load_config
@ -18,13 +19,6 @@ from TTS.utils.synthesis import *

 from TTS.utils.text import make_symbols, phonemes, symbols

-alphabets = r"([A-Za-z])"
-prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
-suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
-starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
-acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
-websites = r"[.](com|net|org|io|gov)"
-

 class Synthesizer(object):
    def __init__(self, config):
@ -32,6 +26,7 @@ class Synthesizer(object):
        self.vocoder_model = None
        self.config = config
        print(config)
+        self.seg = self.get_segmenter("en")
        self.use_cuda = self.config.use_cuda
        if self.use_cuda:
            assert torch.cuda.is_available(), "CUDA is not availabe on this machine."
@ -43,6 +38,10 @@ class Synthesizer(object):
            self.load_wavernn(self.config.wavernn_lib_path, self.config.wavernn_checkpoint,
                              self.config.wavernn_config, self.config.use_cuda)

+    @staticmethod
+    def get_segmenter(lang):  # lang: language code handed to pysbd (callers in this change pass "en")
+        return pysbd.Segmenter(language=lang, clean=True)  # NOTE(review): clean=True presumably makes pysbd tidy the text before splitting -- confirm against the pysbd docs
+
    def load_tts(self, tts_checkpoint, tts_config, use_cuda):
        # pylint: disable=global-statement
        global symbols, phonemes
@ -132,37 +131,8 @@ class Synthesizer(object):
        wav = np.array(wav)
        self.ap.save_wav(wav, path)

-    @staticmethod
-    def split_into_sentences(text):
-        text = " " + text + "  <stop>"
-        text = text.replace("\n", " ")
-        text = re.sub(prefixes, "\\1<prd>", text)
-        text = re.sub(websites, "<prd>\\1", text)
-        if "Ph.D" in text:
-            text = text.replace("Ph.D.", "Ph<prd>D<prd>")
-        text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
-        text = re.sub(acronyms+" "+starters, "\\1<stop> \\2", text)
-        text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
-        text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
-        text = re.sub(" "+suffixes+"[.] "+starters, " \\1<stop> \\2", text)
-        text = re.sub(" "+suffixes+"[.]", " \\1<prd>", text)
-        text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
-        if "”" in text:
-            text = text.replace(".”", "”.")
-        if "\"" in text:
-            text = text.replace(".\"", "\".")
-        if "!" in text:
-            text = text.replace("!\"", "\"!")
-        if "?" in text:
-            text = text.replace("?\"", "\"?")
-        text = text.replace(".", ".<stop>")
-        text = text.replace("?", "?<stop>")
-        text = text.replace("!", "!<stop>")
-        text = text.replace("<prd>", ".")
-        sentences = text.split("<stop>")
-        sentences = sentences[:-1]
-        sentences = list(filter(None, [s.strip() for s in sentences])) # remove empty sentences
-        return sentences
+    def split_into_sentences(self, text):  # instance method: relies on the segmenter stored in self.seg
+        return self.seg.segment(text)  # delegates the split to self.seg; the tests expect a list of sentence strings

    def tts(self, text, speaker_id=None):
        start_time = time.time()
--- a/tests/test_demo_server.py
+++ b/tests/test_demo_server.py
@ -32,3 +32,27 @@ class DemoServerTest(unittest.TestCase):
        config['tts_config'] = os.path.join(tts_root_path, config['tts_config'])
        synthesizer = Synthesizer(config)
        synthesizer.tts("Better this test works!!")
+
+    def test_split_into_sentences(self):
+        """Check that the demo server splits text into sentences as expected."""
+        print("\n > Testing demo server sentence splitting")
+        # pylint: disable=attribute-defined-outside-init
+        self.seg = Synthesizer.get_segmenter("en")  # give the test case the one attribute split_into_sentences reads
+        sis = Synthesizer.split_into_sentences  # plain function; the test case is passed as self below, so no Synthesizer is constructed
+        assert sis(self, 'Hello. Two sentences') == ['Hello.', 'Two sentences']
+        assert sis(self, 'He went to meet the adviser from Scott, Waltman & Co. next morning.') == ['He went to meet the adviser from Scott, Waltman & Co. next morning.']
+        assert sis(self, 'Let\'s run it past Sarah and co. They\'ll want to see this.') == ['Let\'s run it past Sarah and co.', 'They\'ll want to see this.']
+        assert sis(self, 'Where is Bobby Jr.\'s rabbit?') == ['Where is Bobby Jr.\'s rabbit?']
+        assert sis(self, 'Please inform the U.K. authorities right away.') == ['Please inform the U.K. authorities right away.']
+        assert sis(self, 'Were David and co. at the event?') == ['Were David and co. at the event?']
+        assert sis(self, 'paging dr. green, please come to theatre four immediately.') == ['paging dr. green, please come to theatre four immediately.']
+        assert sis(self, 'The email format is Firstname.Lastname@example.com. I think you reversed them.') == ['The email format is Firstname.Lastname@example.com.', 'I think you reversed them.']
+        assert sis(self, 'The demo site is: https://top100.example.com/subsection/latestnews.html. Please send us your feedback.') == ['The demo site is: https://top100.example.com/subsection/latestnews.html.', 'Please send us your feedback.']
+        assert sis(self, 'Scowling at him, \'You are not done yet!\' she yelled.') == ['Scowling at him, \'You are not done yet!\' she yelled.'] # with the final lowercase "she" we see it's all one sentence
+        assert sis(self, 'Hey!! So good to see you.') == ['Hey!!', 'So good to see you.']
+        assert sis(self, 'He went to Yahoo! but I don\'t know the division.') == ['He went to Yahoo! but I don\'t know the division.']
+        assert sis(self, 'If you can\'t remember a quote, “at least make up a memorable one that\'s plausible..."') == ['If you can\'t remember a quote, “at least make up a memorable one that\'s plausible..."']
+        assert sis(self, 'The address is not google.com.') == ['The address is not google.com.']
+        assert sis(self, '1.) The first item 2.) The second item') == ['1.) The first item', '2.) The second item']
+        assert sis(self, '1) The first item 2) The second item') == ['1) The first item', '2) The second item']
+        assert sis(self, 'a. The first item b. The second item c. The third list item') == ['a. The first item', 'b. The second item', 'c. The third list item']