Add option to configure SentenceBoundary (#348)
* Create FUNDING.yml
* Add SentenceBoundary support, which emits sentence-level boundary events and is friendlier to Chinese users
---------
Co-authored-by: Rany <ranygh@riseup.net>
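In short, the commit adds an optional Boundary keyword to edge_tts.Communicate that switches the service's metadata events from per-word to per-sentence boundaries, and SubMaker now accepts both event types when building subtitles. A minimal usage sketch, adapted from the new sync example added by this commit (the output file names are illustrative, not part of the commit):

import edge_tts

TEXT = "君不见,黄河之水天上来,奔流到海不复回。"
VOICE = "zh-CN-YunjianNeural"

# Request sentence-level boundary events instead of the default word-level ones.
communicate = edge_tts.Communicate(TEXT, VOICE, Boundary="SentenceBoundary")
submaker = edge_tts.SubMaker()

with open("output.mp3", "wb") as audio_file:  # illustrative output path
    for chunk in communicate.stream_sync():
        if chunk["type"] == "audio":
            audio_file.write(chunk["data"])
        elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
            submaker.feed(chunk)

with open("output.srt", "w", encoding="utf-8") as srt_file:  # illustrative output path
    srt_file.write(submaker.get_srt())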
.github/FUNDING.yml (vendored, new file, 1 line added)
@@ -0,0 +1 @@
+github: rany2
.gitignore (vendored, 1 line added)
@@ -162,3 +162,4 @@ cython_debug/
 # Edge-TTS specific ignores
 *.mp3
 *.srt
+/.idea/
@@ -21,7 +21,7 @@ async def amain() -> None:
         async for chunk in communicate.stream():
             if chunk["type"] == "audio":
                 file.write(chunk["data"])
-            elif chunk["type"] == "WordBoundary":
+            elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
                 submaker.feed(chunk)

     with open(SRT_FILE, "w", encoding="utf-8") as file:
@@ -20,7 +20,7 @@ def main() -> None:
         for chunk in communicate.stream_sync():
             if chunk["type"] == "audio":
                 file.write(chunk["data"])
-            elif chunk["type"] == "WordBoundary":
+            elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
                 submaker.feed(chunk)

     with open(SRT_FILE, "w", encoding="utf-8") as file:
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+
+"""Sync variant of the async .stream() method to
+get audio chunks and feed them to SubMaker to
+generate subtitles"""
+import sys
+
+import edge_tts
+
+TEXT = """君不见,黄河之水天上来,奔流到海不复回。
+君不见,高堂明镜悲白发,朝如青丝暮成雪。
+人生得意须尽欢,莫使金樽空对月。
+天生我材必有用,千金散尽还复来。
+烹羊宰牛且为乐,会须一饮三百杯。
+岑夫子,丹丘生,将进酒,杯莫停。
+与君歌一曲,请君为我倾耳听。
+钟鼓馔玉不足贵,但愿长醉不复醒。
+古来圣贤皆寂寞,惟有饮者留其名。
+陈王昔时宴平乐,斗酒十千恣欢谑。
+主人何为言少钱,径须沽取对君酌。
+五花马,千金裘,呼儿将出换美酒,与尔同销万古愁。"""
+VOICE = "zh-CN-YunjianNeural"
+
+
+def main() -> None:
+    """Main function"""
+    communicate = edge_tts.Communicate(TEXT, VOICE, Boundary="SentenceBoundary")
+    submaker = edge_tts.SubMaker()
+    stdout = sys.stdout
+    audio_bytes = []
+    for chunk in communicate.stream_sync():
+        if chunk["type"] == "audio":
+            audio_bytes.append(chunk["data"])
+        elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
+            submaker.feed(chunk)
+
+    stdout.write(f"audio file length: {len(audio_bytes)}")
+    stdout.write(submaker.get_srt())
+
+
+if __name__ == "__main__":
+    main()
@@ -16,14 +16,17 @@ from typing import (
     Dict,
     Generator,
     List,
+    Literal,
     Optional,
     Tuple,
+    TypedDict,
     Union,
 )
 from xml.sax.saxutils import escape, unescape

 import aiohttp
 import certifi
+from typing_extensions import NotRequired, Unpack

 from .constants import DEFAULT_VOICE, SEC_MS_GEC_VERSION, WSS_HEADERS, WSS_URL
 from .data_classes import TTSConfig
@@ -328,6 +331,14 @@ def calc_max_mesg_size(tts_config: TTSConfig) -> int:
     return websocket_max_size - overhead_per_message


+class CommunicateRequest(TypedDict):
+    """
+    A class to communicate with the service.
+    """
+
+    Boundary: NotRequired[Literal["WordBoundary", "SentenceBoundary"]]
+
+
 class Communicate:
     """
     Communicate with the service.
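CommunicateRequest exists purely for static typing: paired with Unpack in the updated __init__ signature below, it lets type checkers validate the optional Boundary= keyword without changing the positional API. A small self-contained sketch of the pattern, using illustrative names not taken from the library:

from typing import Literal

from typing_extensions import NotRequired, TypedDict, Unpack


class RequestOptions(TypedDict):
    # NotRequired marks the key as optional for callers.
    Boundary: NotRequired[Literal["WordBoundary", "SentenceBoundary"]]


def make_request(text: str, **kwargs: Unpack[RequestOptions]) -> str:
    # kwargs.get() supplies the default when the caller omits the key.
    return f"{text}: {kwargs.get('Boundary', 'WordBoundary')}"


print(make_request("hello"))                             # hello: WordBoundary
print(make_request("你好", Boundary="SentenceBoundary"))  # 你好: SentenceBoundary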
@@ -345,9 +356,21 @@ class Communicate:
         proxy: Optional[str] = None,
         connect_timeout: Optional[int] = 10,
         receive_timeout: Optional[int] = 60,
+        **kwargs: Unpack[CommunicateRequest],
     ):
+        """
+        Args:
+            boundary (str): The boundary to use for the TTS.
+                Defaults to "WordBoundary".
+                Valid values are "WordBoundary" and "SentenceBoundary".
+                If "WordBoundary", the TTS will return a word boundary for each word.
+                If "SentenceBoundary", the TTS will return a sentence boundary for each sentence.
+                Which is more friendly to Chinese users.
+        """
+
         # Validate TTS settings and store the TTSConfig object.
-        self.tts_config = TTSConfig(voice, rate, volume, pitch)
+        boundary = kwargs.get("Boundary", "WordBoundary")
+        self.tts_config = TTSConfig(voice, rate, volume, pitch, boundary)

         # Validate the text parameter.
         if not isinstance(text, str):
@@ -392,7 +415,7 @@ class Communicate:
     def __parse_metadata(self, data: bytes) -> TTSChunk:
         for meta_obj in json.loads(data)["Metadata"]:
             meta_type = meta_obj["Type"]
-            if meta_type == "WordBoundary":
+            if meta_type in ("WordBoundary", "SentenceBoundary"):
                 current_offset = (
                     meta_obj["Data"]["Offset"] + self.state["offset_compensation"]
                 )
@@ -411,12 +434,16 @@ class Communicate:
     async def __stream(self) -> AsyncGenerator[TTSChunk, None]:
         async def send_command_request() -> None:
             """Sends the command request to the service."""
+            word_boundary = self.tts_config.boundary == "WordBoundary"
+            wd = "true" if word_boundary else "false"
+            sq = "true" if not word_boundary else "false"
             await websocket.send_str(
                 f"X-Timestamp:{date_to_string()}\r\n"
                 "Content-Type:application/json; charset=utf-8\r\n"
                 "Path:speech.config\r\n\r\n"
                 '{"context":{"synthesis":{"audio":{"metadataoptions":{'
-                '"sentenceBoundaryEnabled":"false","wordBoundaryEnabled":"true"},'
+                f'"sentenceBoundaryEnabled":"{sq}","wordBoundaryEnabled":"{wd}"'
+                "},"
                 '"outputFormat":"audio-24khz-48kbitrate-mono-mp3"'
                 "}}}}\r\n"
             )
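For reference, the speech.config body that send_command_request builds resolves to the structure sketched below; with the default WordBoundary setting the two flags read "false"/"true", and with SentenceBoundary they are simply swapped. The real message is sent as a single line over the websocket; this is a pretty-printed Python rendering:

# Illustrative: the payload produced for Boundary="SentenceBoundary".
speech_config = {
    "context": {
        "synthesis": {
            "audio": {
                "metadataoptions": {
                    "sentenceBoundaryEnabled": "true",
                    "wordBoundaryEnabled": "false",
                },
                "outputFormat": "audio-24khz-48kbitrate-mono-mp3",
            }
        }
    }
}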
@@ -603,9 +630,9 @@ class Communicate:
             async for message in self.stream():
                 if message["type"] == "audio":
                     audio.write(message["data"])
-                elif (
-                    isinstance(metadata, TextIOWrapper)
-                    and message["type"] == "WordBoundary"
+                elif isinstance(metadata, TextIOWrapper) and message["type"] in (
+                    "WordBoundary",
+                    "SentenceBoundary",
                 ):
                     json.dump(message, metadata)
                     metadata.write("\n")
@@ -17,6 +17,7 @@ class TTSConfig:
     rate: str
     volume: str
     pitch: str
+    boundary: str

     @staticmethod
     def validate_string_param(param_name: str, param_value: str, pattern: str) -> str:
@@ -1,4 +1,4 @@
-"""SubMaker module is used to generate subtitles from WordBoundary events."""
+"""SubMaker module is used to generate subtitles from WordBoundary and SentenceBoundary events."""

 from typing import List

@@ -9,7 +9,7 @@ from .typing import TTSChunk

 class SubMaker:
     """
-    SubMaker is used to generate subtitles from WordBoundary messages.
+    SubMaker is used to generate subtitles from WordBoundary and SentenceBoundary messages.
     """

     def __init__(self) -> None:
@@ -17,15 +17,15 @@ class SubMaker:

     def feed(self, msg: TTSChunk) -> None:
         """
-        Feed a WordBoundary message to the SubMaker object.
+        Feed a WordBoundary or SentenceBoundary message to the SubMaker object.

         Args:
-            msg (dict): The WordBoundary message.
+            msg (dict): The WordBoundary or SentenceBoundary message.

         Returns:
             None
         """
-        if msg["type"] != "WordBoundary":
+        if msg["type"] not in ("WordBoundary", "SentenceBoundary"):
             raise ValueError("Invalid message type, expected 'WordBoundary'")

         self.cues.append(
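With the relaxed type check, feed() accepts sentence-level chunks directly. A minimal sketch with a hand-built chunk; the offset and duration values are made up and only illustrate the fields that Communicate.stream() yields:

import edge_tts

submaker = edge_tts.SubMaker()
# Hypothetical SentenceBoundary chunk; real values come from Communicate.stream().
submaker.feed(
    {
        "type": "SentenceBoundary",
        "offset": 0.0,
        "duration": 25_000_000.0,
        "text": "君不见,黄河之水天上来,奔流到海不复回。",
    }
)
print(submaker.get_srt())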
@@ -10,11 +10,11 @@ from typing_extensions import Literal, NotRequired, TypedDict
 class TTSChunk(TypedDict):
     """TTS chunk data."""

-    type: Literal["audio", "WordBoundary"]
+    type: Literal["audio", "WordBoundary", "SentenceBoundary"]
     data: NotRequired[bytes]  # only for audio
-    duration: NotRequired[float]  # only for WordBoundary
-    offset: NotRequired[float]  # only for WordBoundary
-    text: NotRequired[str]  # only for WordBoundary
+    duration: NotRequired[float]  # only for WordBoundary and SentenceBoundary
+    offset: NotRequired[float]  # only for WordBoundary and SentenceBoundary
+    text: NotRequired[str]  # only for WordBoundary and SentenceBoundary


 class VoiceTag(TypedDict):
@@ -72,7 +72,7 @@ async def _run_tts(args: UtilArgs) -> None:
         async for chunk in communicate.stream():
             if chunk["type"] == "audio":
                 audio_file.write(chunk["data"])
-            elif chunk["type"] == "WordBoundary":
+            elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
                 submaker.feed(chunk)

     if args.words_in_cue > 0: