fix sentence boundary

This commit is contained in:
rany
2022-03-01 21:42:01 +02:00
parent 43ee535752
commit 16d5dc273c
5 changed files with 112 additions and 52 deletions

2
lint.sh Executable file
View File

@@ -0,0 +1,2 @@
find src -name '*.py' | xargs black
find src -name '*.py' | xargs isort

View File

@@ -18,16 +18,14 @@ def main():
with tempfile.NamedTemporaryFile() as media: with tempfile.NamedTemporaryFile() as media:
with tempfile.NamedTemporaryFile() as subtitle: with tempfile.NamedTemporaryFile() as subtitle:
print() print()
print(f"Media file {media.name}") print(f"Media file: {media.name}")
print(f"Subtitle file {subtitle.name}\n") print(f"Subtitle file: {subtitle.name}\n")
with subprocess.Popen( with subprocess.Popen(
[ [
"edge-tts", "edge-tts",
"-w", "--boundary-type=2",
"--write-media", f"--write-media={media.name}",
media.name, f"--write-subtitles={subtitle.name}",
"--write-subtitles",
subtitle.name,
] ]
+ sys.argv[1:] + sys.argv[1:]
) as process: ) as process:

View File

@@ -207,8 +207,7 @@ class Communicate: # pylint: disable=too-few-public-methods
async def run( async def run(
self, self,
messages, messages,
sentence_boundary=False, boundary_type=0,
word_boundary=False,
codec="audio-24khz-48kbitrate-mono-mp3", codec="audio-24khz-48kbitrate-mono-mp3",
voice="Microsoft Server Speech Text to Speech Voice (en-US, AriaNeural)", voice="Microsoft Server Speech Text to Speech Voice (en-US, AriaNeural)",
pitch="+0Hz", pitch="+0Hz",
@@ -221,8 +220,7 @@ class Communicate: # pylint: disable=too-few-public-methods
Args: Args:
messages (str or list): A list of SSML strings or a single text. messages (str or list): A list of SSML strings or a single text.
sentence_boundary (bool): Whether to use sentence boundary. boundary_type (int): The type of boundary to use. 0 for none, 1 for word_boundary, 2 for sentence_boundary.
word_boundary (bool): Whether to use word boundary.
codec (str): The codec to use. codec (str): The codec to use.
voice (str): The voice to use (only applicable to non-customspeak). voice (str): The voice to use (only applicable to non-customspeak).
pitch (str): The pitch to use (only applicable to non-customspeak). pitch (str): The pitch to use (only applicable to non-customspeak).
@@ -234,11 +232,19 @@ class Communicate: # pylint: disable=too-few-public-methods
tuple: The subtitle offset, subtitle, and audio data. tuple: The subtitle offset, subtitle, and audio data.
""" """
word_boundary = False
sentence_boundary = False
if boundary_type > 0:
word_boundary = True
if boundary_type > 1:
sentence_boundary = True
sentence_boundary = str(sentence_boundary).lower() sentence_boundary = str(sentence_boundary).lower()
word_boundary = str(word_boundary).lower() word_boundary = str(word_boundary).lower()
if not customspeak: if not customspeak:
websocket_max_size = 2 ** 16 websocket_max_size = 2**16
overhead_per_message = ( overhead_per_message = (
len( len(
ssml_headers_plus_data( ssml_headers_plus_data(
@@ -312,6 +318,9 @@ class Communicate: # pylint: disable=too-few-public-methods
# Begin listening for the response. # Begin listening for the response.
download = False download = False
current_subtitle = ""
first_offset = None
last_offset = None
async for received in websocket: async for received in websocket:
if received.type in ( if received.type in (
aiohttp.WSMsgType.CLOSED, aiohttp.WSMsgType.CLOSED,
@@ -337,13 +346,53 @@ class Communicate: # pylint: disable=too-few-public-methods
and parameters["Path"] == "audio.metadata" and parameters["Path"] == "audio.metadata"
): ):
metadata = json.loads(data) metadata = json.loads(data)
text = metadata["Metadata"][0]["Data"]["text"]["Text"] metadata_type = metadata["Metadata"][0]["Type"]
offset = metadata["Metadata"][0]["Data"]["Offset"] metadata_offset = metadata["Metadata"][0]["Data"][
yield ( "Offset"
offset, ]
text, try:
None, metadata_duration = metadata["Metadata"][0]["Data"][
) "Duration"
]
except KeyError:
metadata_duration = 0
metadata_text = metadata["Metadata"][0]["Data"]["text"][
"Text"
]
if boundary_type == 1:
yield (
[
metadata_offset,
metadata_duration,
],
metadata_text,
None,
)
else:
if metadata_type == "WordBoundary":
if current_subtitle:
current_subtitle += " "
current_subtitle += metadata_text
if first_offset is None:
first_offset = metadata_offset
last_offset = [
metadata_offset,
metadata_duration,
]
elif metadata_type == "SentenceBoundary":
if current_subtitle:
yield (
[
first_offset,
sum(last_offset) - first_offset,
],
current_subtitle,
None,
)
current_subtitle = ""
first_offset = None
last_offset = None
elif received.type == aiohttp.WSMsgType.BINARY: elif received.type == aiohttp.WSMsgType.BINARY:
if download: if download:
yield ( yield (
@@ -353,4 +402,10 @@ class Communicate: # pylint: disable=too-few-public-methods
received.data.split(b"Path:audio\r\n")[1:] received.data.split(b"Path:audio\r\n")[1:]
), ),
) )
if current_subtitle:
yield (
[first_offset, sum(last_offset) - first_offset],
current_subtitle,
None,
)
await websocket.close() await websocket.close()

View File

@@ -28,9 +28,9 @@ def mktimestamp(time_unit):
Returns: Returns:
str: The timecode of the subtitle. str: The timecode of the subtitle.
""" """
hour = math.floor(time_unit / 10 ** 7 / 3600) hour = math.floor(time_unit / 10**7 / 3600)
minute = math.floor((time_unit / 10 ** 7 / 60) % 60) minute = math.floor((time_unit / 10**7 / 60) % 60)
seconds = (time_unit / 10 ** 7) % 60 seconds = (time_unit / 10**7) % 60
return f"{hour:02d}:{minute:02d}:{seconds:06.3f}" return f"{hour:02d}:{minute:02d}:{seconds:06.3f}"
@@ -49,7 +49,7 @@ class SubMaker:
""" """
self.subs_and_offset = [] self.subs_and_offset = []
self.broken_offset = [] self.broken_offset = []
self.overlapping = overlapping * (10 ** 7) self.overlapping = overlapping * (10**7)
def create_sub(self, timestamp, text): def create_sub(self, timestamp, text):
""" """
@@ -57,16 +57,19 @@ class SubMaker:
and adds it to the list of subtitles and adds it to the list of subtitles
Args: Args:
timestamp (int): The timestamp of the subtitle. timestamp (tuple): The offset and duration of the subtitle.
text (str): The text of the subtitle. text (str): The text of the subtitle.
Returns: Returns:
None None
""" """
timestamp[1] += timestamp[0]
if len(self.subs_and_offset) >= 2: if len(self.subs_and_offset) >= 2:
if self.subs_and_offset[-2] >= timestamp + sum(self.broken_offset): if self.subs_and_offset[-2][-1] >= timestamp[1] + sum(self.broken_offset):
self.broken_offset.append(self.subs_and_offset[-2]) self.broken_offset.append(self.subs_and_offset[-2][1])
timestamp = timestamp + sum(self.broken_offset) timestamp[0] += sum(self.broken_offset)
timestamp[1] += sum(self.broken_offset)
self.subs_and_offset.append(timestamp) self.subs_and_offset.append(timestamp)
self.subs_and_offset.append(text) self.subs_and_offset.append(text)
@@ -80,19 +83,27 @@ class SubMaker:
""" """
if len(self.subs_and_offset) >= 2: if len(self.subs_and_offset) >= 2:
data = "WEBVTT\r\n\r\n" data = "WEBVTT\r\n\r\n"
old_time_stamp = None
old_sub_data = None
for offset, subs in zip( for offset, subs in zip(
self.subs_and_offset[::2], self.subs_and_offset[1::2] self.subs_and_offset[::2], self.subs_and_offset[1::2]
): ):
if old_time_stamp is not None and old_sub_data is not None: subs = [subs[i : i + 79] for i in range(0, len(subs), 79)]
data += formatter(
old_time_stamp, offset + self.overlapping, old_sub_data for i in range(len(subs) - 1):
) sub = subs[i]
old_time_stamp = offset split_at_word = True
old_sub_data = subs if sub[-1] == " ":
data += formatter( subs[i] = sub[:-1]
old_time_stamp, old_time_stamp + ((10 ** 7) * 10), old_sub_data split_at_word = False
)
if sub[0] == " ":
subs[i] = sub[1:]
split_at_word = False
if split_at_word:
subs[i] += "-"
subs = "\r\n".join(subs)
data += formatter(offset[0], offset[1] + self.overlapping, subs)
return data return data
return "" return ""

View File

@@ -32,8 +32,7 @@ async def _tts(args):
media_file = open(args.write_media, "wb") # pylint: disable=consider-using-with media_file = open(args.write_media, "wb") # pylint: disable=consider-using-with
async for i in tts.run( async for i in tts.run(
args.text, args.text,
args.enable_sentence_boundary, args.boundary_type,
args.enable_word_boundary,
args.codec, args.codec,
args.voice, args.voice,
args.pitch, args.pitch,
@@ -108,18 +107,6 @@ async def _main():
help="set TTS volume. Default +0%%. For more info check https://bit.ly/3eAE5Nx", help="set TTS volume. Default +0%%. For more info check https://bit.ly/3eAE5Nx",
default="+0%", default="+0%",
) )
parser.add_argument(
"-s",
"--enable-sentence-boundary",
help="enable sentence boundary",
action="store_true",
)
parser.add_argument(
"-w",
"--enable-word-boundary",
help="enable word boundary",
action="store_true",
)
parser.add_argument( parser.add_argument(
"-O", "-O",
"--overlapping", "--overlapping",
@@ -127,6 +114,13 @@ async def _main():
default=5, default=5,
type=float, type=float,
) )
parser.add_argument(
"-b",
"--boundary-type",
help="set boundary type for subtitles. Default 0 for none. Set 1 for word_boundary, 2 for sentence_boundary",
default=0,
type=int,
)
parser.add_argument( parser.add_argument(
"--write-media", help="instead of stdout, send media output to provided file" "--write-media", help="instead of stdout, send media output to provided file"
) )