From 16d5dc273cdf0dd96301dabbfd2cff1fe85adcc3 Mon Sep 17 00:00:00 2001
From: rany <ranygh@riseup.net>
Date: Tue, 1 Mar 2022 21:42:01 +0200
Subject: [PATCH] fix sentence boundary

---
 lint.sh                       |  2 +
 src/edge_playback/__init__.py | 12 +++---
 src/edge_tts/communicate.py   | 79 +++++++++++++++++++++++++++++------
 src/edge_tts/submaker.py      | 49 +++++++++++++---------
 src/edge_tts/util.py          | 22 ++++------
 5 files changed, 112 insertions(+), 52 deletions(-)
 create mode 100755 lint.sh

diff --git a/lint.sh b/lint.sh
new file mode 100755
index 0000000..08d10a1
--- /dev/null
+++ b/lint.sh
@@ -0,0 +1,2 @@
+find src -name '*.py' | xargs black
+find src -name '*.py' | xargs isort
diff --git a/src/edge_playback/__init__.py b/src/edge_playback/__init__.py
index 45c8716..59ccfd1 100644
--- a/src/edge_playback/__init__.py
+++ b/src/edge_playback/__init__.py
@@ -18,16 +18,14 @@ def main():
         with tempfile.NamedTemporaryFile() as media:
             with tempfile.NamedTemporaryFile() as subtitle:
                 print()
-                print(f"Media file      {media.name}")
-                print(f"Subtitle file   {subtitle.name}\n")
+                print(f"Media file: {media.name}")
+                print(f"Subtitle file: {subtitle.name}\n")
                 with subprocess.Popen(
                     [
                         "edge-tts",
-                        "-w",
-                        "--write-media",
-                        media.name,
-                        "--write-subtitles",
-                        subtitle.name,
+                        "--boundary-type=2",
+                        f"--write-media={media.name}",
+                        f"--write-subtitles={subtitle.name}",
                     ]
                     + sys.argv[1:]
                 ) as process:
diff --git a/src/edge_tts/communicate.py b/src/edge_tts/communicate.py
index fe3be42..819efe1 100644
--- a/src/edge_tts/communicate.py
+++ b/src/edge_tts/communicate.py
@@ -207,8 +207,7 @@ class Communicate:  # pylint: disable=too-few-public-methods
     async def run(
         self,
         messages,
-        sentence_boundary=False,
-        word_boundary=False,
+        boundary_type=0,
         codec="audio-24khz-48kbitrate-mono-mp3",
         voice="Microsoft Server Speech Text to Speech Voice (en-US, AriaNeural)",
         pitch="+0Hz",
@@ -221,8 +220,7 @@ class Communicate:  # pylint: disable=too-few-public-methods
 
         Args:
             messages (str or list): A list of SSML strings or a single text.
-            sentence_boundary (bool): Whether to use sentence boundary.
-            word_boundary (bool): Whether to use word boundary.
+            boundary_type (int): The type of boundary to use. 0 for none, 1 for word_boundary, 2 for sentence_boundary.
             codec (str): The codec to use.
             voice (str): The voice to use (only applicable to non-customspeak).
             pitch (str): The pitch to use (only applicable to non-customspeak).
@@ -234,11 +232,19 @@ class Communicate:  # pylint: disable=too-few-public-methods
             tuple: The subtitle offset, subtitle, and audio data.
         """
 
+        word_boundary = False
+        sentence_boundary = False
+
+        if boundary_type > 0:
+            word_boundary = True
+        if boundary_type > 1:
+            sentence_boundary = True
+
         sentence_boundary = str(sentence_boundary).lower()
         word_boundary = str(word_boundary).lower()
 
         if not customspeak:
-            websocket_max_size = 2 ** 16
+            websocket_max_size = 2**16
             overhead_per_message = (
                 len(
                     ssml_headers_plus_data(
@@ -312,6 +318,9 @@ class Communicate:  # pylint: disable=too-few-public-methods
 
                     # Begin listening for the response.
                     download = False
+                    current_subtitle = ""
+                    first_offset = None
+                    last_offset = None
                     async for received in websocket:
                         if received.type in (
                             aiohttp.WSMsgType.CLOSED,
@@ -337,13 +346,53 @@ class Communicate:  # pylint: disable=too-few-public-methods
                                 and parameters["Path"] == "audio.metadata"
                             ):
                                 metadata = json.loads(data)
-                                text = metadata["Metadata"][0]["Data"]["text"]["Text"]
-                                offset = metadata["Metadata"][0]["Data"]["Offset"]
-                                yield (
-                                    offset,
-                                    text,
-                                    None,
-                                )
+                                metadata_type = metadata["Metadata"][0]["Type"]
+                                metadata_offset = metadata["Metadata"][0]["Data"][
+                                    "Offset"
+                                ]
+                                try:
+                                    metadata_duration = metadata["Metadata"][0]["Data"][
+                                        "Duration"
+                                    ]
+                                except KeyError:
+                                    metadata_duration = 0
+                                metadata_text = metadata["Metadata"][0]["Data"]["text"][
+                                    "Text"
+                                ]
+                                if boundary_type == 1:
+                                    yield (
+                                        [
+                                            metadata_offset,
+                                            metadata_duration,
+                                        ],
+                                        metadata_text,
+                                        None,
+                                    )
+                                else:
+                                    if metadata_type == "WordBoundary":
+                                        if current_subtitle:
+                                            current_subtitle += " "
+                                        current_subtitle += metadata_text
+                                        if first_offset is None:
+                                            first_offset = metadata_offset
+                                        last_offset = [
+                                            metadata_offset,
+                                            metadata_duration,
+                                        ]
+                                    elif metadata_type == "SentenceBoundary":
+                                        if current_subtitle:
+                                            yield (
+                                                [
+                                                    first_offset,
+                                                    sum(last_offset) - first_offset,
+                                                ],
+                                                current_subtitle,
+                                                None,
+                                            )
+                                        current_subtitle = ""
+                                        first_offset = None
+                                        last_offset = None
+
                         elif received.type == aiohttp.WSMsgType.BINARY:
                             if download:
                                 yield (
@@ -353,4 +402,10 @@ class Communicate:  # pylint: disable=too-few-public-methods
                                         received.data.split(b"Path:audio\r\n")[1:]
                                     ),
                                 )
+                if current_subtitle:
+                    yield (
+                        [first_offset, sum(last_offset) - first_offset],
+                        current_subtitle,
+                        None,
+                    )
                 await websocket.close()
diff --git a/src/edge_tts/submaker.py b/src/edge_tts/submaker.py
index fe80b61..55b1413 100644
--- a/src/edge_tts/submaker.py
+++ b/src/edge_tts/submaker.py
@@ -28,9 +28,9 @@ def mktimestamp(time_unit):
     Returns:
         str: The timecode of the subtitle.
     """
-    hour = math.floor(time_unit / 10 ** 7 / 3600)
-    minute = math.floor((time_unit / 10 ** 7 / 60) % 60)
-    seconds = (time_unit / 10 ** 7) % 60
+    hour = math.floor(time_unit / 10**7 / 3600)
+    minute = math.floor((time_unit / 10**7 / 60) % 60)
+    seconds = (time_unit / 10**7) % 60
     return f"{hour:02d}:{minute:02d}:{seconds:06.3f}"
 
 
@@ -49,7 +49,7 @@ class SubMaker:
         """
         self.subs_and_offset = []
         self.broken_offset = []
-        self.overlapping = overlapping * (10 ** 7)
+        self.overlapping = overlapping * (10**7)
 
     def create_sub(self, timestamp, text):
         """
@@ -57,16 +57,19 @@ class SubMaker:
         and adds it to the list of subtitles
 
         Args:
-            timestamp (int): The timestamp of the subtitle.
+            timestamp (list): The [offset, duration] pair of the subtitle; modified in place.
             text (str): The text of the subtitle.
 
         Returns:
             None
         """
+        timestamp[1] += timestamp[0]
+
         if len(self.subs_and_offset) >= 2:
-            if self.subs_and_offset[-2] >= timestamp + sum(self.broken_offset):
-                self.broken_offset.append(self.subs_and_offset[-2])
-            timestamp = timestamp + sum(self.broken_offset)
+            if self.subs_and_offset[-2][-1] >= timestamp[1] + sum(self.broken_offset):
+                self.broken_offset.append(self.subs_and_offset[-2][1])
+            timestamp[0] += sum(self.broken_offset)
+            timestamp[1] += sum(self.broken_offset)
 
         self.subs_and_offset.append(timestamp)
         self.subs_and_offset.append(text)
@@ -80,19 +83,27 @@ class SubMaker:
         """
         if len(self.subs_and_offset) >= 2:
             data = "WEBVTT\r\n\r\n"
-            old_time_stamp = None
-            old_sub_data = None
             for offset, subs in zip(
                 self.subs_and_offset[::2], self.subs_and_offset[1::2]
             ):
-                if old_time_stamp is not None and old_sub_data is not None:
-                    data += formatter(
-                        old_time_stamp, offset + self.overlapping, old_sub_data
-                    )
-                old_time_stamp = offset
-                old_sub_data = subs
-            data += formatter(
-                old_time_stamp, old_time_stamp + ((10 ** 7) * 10), old_sub_data
-            )
+                subs = [subs[i : i + 79] for i in range(0, len(subs), 79)]
+
+                for i in range(len(subs) - 1):
+                    sub = subs[i]
+                    split_at_word = True
+                    if sub[-1] == " ":
+                        subs[i] = sub[:-1]
+                        split_at_word = False
+
+                    if sub[0] == " ":
+                        subs[i] = sub[1:]
+                        split_at_word = False
+
+                    if split_at_word:
+                        subs[i] += "-"
+
+                subs = "\r\n".join(subs)
+
+                data += formatter(offset[0], offset[1] + self.overlapping, subs)
             return data
         return ""
diff --git a/src/edge_tts/util.py b/src/edge_tts/util.py
index 8d64863..98b4aee 100644
--- a/src/edge_tts/util.py
+++ b/src/edge_tts/util.py
@@ -32,8 +32,7 @@ async def _tts(args):
         media_file = open(args.write_media, "wb")  # pylint: disable=consider-using-with
     async for i in tts.run(
         args.text,
-        args.enable_sentence_boundary,
-        args.enable_word_boundary,
+        args.boundary_type,
         args.codec,
         args.voice,
         args.pitch,
@@ -108,18 +107,6 @@ async def _main():
         help="set TTS volume. Default +0%%. For more info check https://bit.ly/3eAE5Nx",
         default="+0%",
     )
-    parser.add_argument(
-        "-s",
-        "--enable-sentence-boundary",
-        help="enable sentence boundary",
-        action="store_true",
-    )
-    parser.add_argument(
-        "-w",
-        "--enable-word-boundary",
-        help="enable word boundary",
-        action="store_true",
-    )
     parser.add_argument(
         "-O",
         "--overlapping",
@@ -127,6 +114,13 @@ async def _main():
         default=5,
         type=float,
     )
+    parser.add_argument(
+        "-b",
+        "--boundary-type",
+        help="set boundary type for subtitles. Default 0 for none. Set 1 for word_boundary, 2 for sentence_boundary",
+        default=0,
+        type=int,
+    )
     parser.add_argument(
         "--write-media", help="instead of stdout, send media output to provided file"
     )