Drop words_in_cue code for SubMaker and switch to SRT (#323)

WebVTT isn't a very common format in the first place, and attempting to make WordBoundary play nicely with the input text is very hard. Instead, we now just display the word that the TTS is saying at a given time.

In the future, we could try to enable SentenceBoundary, but there is a risk that it will be banned by Microsoft, as it is not used by Microsoft Edge itself.

Closes: https://github.com/rany2/edge-tts/issues/118
Closes: https://github.com/rany2/edge-tts/issues/171
Closes: https://github.com/rany2/edge-tts/issues/229
Closes: https://github.com/rany2/edge-tts/issues/234
Signed-off-by: rany <rany2@riseup.net>
2024-11-22 20:58:47 +02:00
parent 5a2674cd03
commit 4f5d79ed57
6 changed files with 38 additions and 123 deletions
--- a/src/edge_tts/submaker.py
+++ b/src/edge_tts/submaker.py
@@ -1,33 +1,8 @@
 """SubMaker module is used to generate subtitles from WordBoundary events."""

-import math
 from typing import List, Tuple
-from xml.sax.saxutils import escape, unescape

-
-def formatter(start_time: float, end_time: float, subdata: str) -> str:
-    """
-    formatter returns the timecode and the text of the subtitle.
-    """
-    return (
-        f"{mktimestamp(start_time)} --> {mktimestamp(end_time)}\r\n"
-        f"{escape(subdata)}\r\n\r\n"
-    )
-
-
-def mktimestamp(time_unit: float) -> str:
-    """
-    mktimestamp returns the timecode of the subtitle.
-
-    The timecode is in the format of 00:00:00.000.
-
-    Returns:
-        str: The timecode of the subtitle.
-    """
-    hour = math.floor(time_unit / 10**7 / 3600)
-    minute = math.floor((time_unit / 10**7 / 60) % 60)
-    seconds = (time_unit / 10**7) % 60
-    return f"{hour:02d}:{minute:02d}:{seconds:06.3f}"
+import srt  # type: ignore


 class SubMaker:
@@ -36,19 +11,11 @@ class SubMaker:
    """

    def __init__(self) -> None:
-        """
-        SubMaker constructor initializes the list of subtitles and the list of offsets.
+        self.cues: List[srt.Subtitle] = []  # type: ignore

-        Returns:
-            None
+    def add_cue(self, timestamp: Tuple[float, float], text: str) -> None:
        """
-        self.offset: List[Tuple[float, float]] = []
-        self.subs: List[str] = []
-
-    def create_sub(self, timestamp: Tuple[float, float], text: str) -> None:
-        """
-        create_sub creates a subtitle from the given timestamp and text,
-        and appends it to the list of subtitles.
+        Add a subtitle part to the SubMaker object.

        Args:
            timestamp (tuple): The offset and duration of the subtitle.
@@ -57,67 +24,20 @@ class SubMaker:
        Returns:
            None
        """
-        self.offset.append((timestamp[0], timestamp[0] + timestamp[1]))
-        self.subs.append(text)
+        self.cues.append(
+            srt.Subtitle(
+                index=len(self.cues) + 1,
+                start=srt.timedelta(microseconds=timestamp[0] / 10),
+                end=srt.timedelta(microseconds=sum(timestamp) / 10),
+                content=text,
+            )
+        )

-    def generate_subs(self, words_in_cue: int = 10) -> str:
+    def get_srt(self) -> str:
        """
-        generate_subs generates the complete subtitle file.
-
-        Args:
-            words_in_cue (int): defines the number of words in a given cue
+        Get the SRT formatted subtitles from the SubMaker object.

        Returns:
-            str: The complete subtitle file.
+            str: The SRT formatted subtitles.
        """
-        if len(self.subs) != len(self.offset):
-            raise ValueError("subs and offset are not of the same length")
-
-        if words_in_cue <= 0:
-            raise ValueError("words_in_cue must be greater than 0")
-
-        data = "WEBVTT\r\n\r\n"
-        sub_state_count = 0
-        sub_state_start = -1.0
-        sub_state_subs = ""
-        for idx, (offset, subs) in enumerate(zip(self.offset, self.subs)):
-            start_time, end_time = offset
-            subs = unescape(subs)
-
-            # wordboundary is guaranteed not to contain whitespace
-            if len(sub_state_subs) > 0:
-                sub_state_subs += " "
-            sub_state_subs += subs
-
-            if sub_state_start == -1.0:
-                sub_state_start = start_time
-            sub_state_count += 1
-
-            if sub_state_count == words_in_cue or idx == len(self.offset) - 1:
-                subs = sub_state_subs
-                split_subs: List[str] = [
-                    subs[i : i + 79] for i in range(0, len(subs), 79)
-                ]
-                for i in range(len(split_subs) - 1):
-                    sub = split_subs[i]
-                    split_at_word = True
-                    if sub[-1] == " ":
-                        split_subs[i] = sub[:-1]
-                        split_at_word = False
-
-                    if sub[0] == " ":
-                        split_subs[i] = sub[1:]
-                        split_at_word = False
-
-                    if split_at_word:
-                        split_subs[i] += "-"
-
-                data += formatter(
-                    start_time=sub_state_start,
-                    end_time=end_time,
-                    subdata="\r\n".join(split_subs),
-                )
-                sub_state_count = 0
-                sub_state_start = -1
-                sub_state_subs = ""
-        return data
+        return srt.compose(self.cues)  # type: ignore