Add option to configure SentenceBoundary (#348)

* Create FUNDING.yml * add setence boundary which is more friendly to Chinese users --------- Co-authored-by: Rany <ranygh@riseup.net>
2025-08-05 18:06:03 +08:00
parent ffe009af9a
commit f78c8ece0a
10 changed files with 90 additions and 18 deletions
--- a/src/edge_tts/communicate.py
+++ b/src/edge_tts/communicate.py
@@ -16,14 +16,17 @@ from typing import (
    Dict,
    Generator,
    List,
+    Literal,
    Optional,
    Tuple,
+    TypedDict,
    Union,
 )
 from xml.sax.saxutils import escape, unescape

 import aiohttp
 import certifi
+from typing_extensions import NotRequired, Unpack

 from .constants import DEFAULT_VOICE, SEC_MS_GEC_VERSION, WSS_HEADERS, WSS_URL
 from .data_classes import TTSConfig
@@ -328,6 +331,14 @@ def calc_max_mesg_size(tts_config: TTSConfig) -> int:
    return websocket_max_size - overhead_per_message


+class CommunicateRequest(TypedDict):
+    """
+    A class to communicate with the service.
+    """
+
+    Boundary: NotRequired[Literal["WordBoundary", "SentenceBoundary"]]
+
+
 class Communicate:
    """
    Communicate with the service.
@@ -345,9 +356,21 @@ class Communicate:
        proxy: Optional[str] = None,
        connect_timeout: Optional[int] = 10,
        receive_timeout: Optional[int] = 60,
+        **kwargs: Unpack[CommunicateRequest],
    ):
+        """
+        Args:
+            boundary (str): The boundary to use for the TTS.
+                Defaults to "WordBoundary".
+                Valid values are "WordBoundary" and "SentenceBoundary".
+                If "WordBoundary", the TTS will return a word boundary for each word.
+                If "SentenceBoundary", the TTS will return a sentence boundary for each sentence.
+                    Which is more friendly to Chinese users.
+        """
+
        # Validate TTS settings and store the TTSConfig object.
-        self.tts_config = TTSConfig(voice, rate, volume, pitch)
+        boundary = kwargs.get("Boundary", "WordBoundary")
+        self.tts_config = TTSConfig(voice, rate, volume, pitch, boundary)

        # Validate the text parameter.
        if not isinstance(text, str):
@@ -392,7 +415,7 @@ class Communicate:
    def __parse_metadata(self, data: bytes) -> TTSChunk:
        for meta_obj in json.loads(data)["Metadata"]:
            meta_type = meta_obj["Type"]
-            if meta_type == "WordBoundary":
+            if meta_type in ("WordBoundary", "SentenceBoundary"):
                current_offset = (
                    meta_obj["Data"]["Offset"] + self.state["offset_compensation"]
                )
@@ -411,12 +434,16 @@ class Communicate:
    async def __stream(self) -> AsyncGenerator[TTSChunk, None]:
        async def send_command_request() -> None:
            """Sends the command request to the service."""
+            word_boundary = self.tts_config.boundary == "WordBoundary"
+            wd = "true" if word_boundary else "false"
+            sq = "true" if not word_boundary else "false"
            await websocket.send_str(
                f"X-Timestamp:{date_to_string()}\r\n"
                "Content-Type:application/json; charset=utf-8\r\n"
                "Path:speech.config\r\n\r\n"
                '{"context":{"synthesis":{"audio":{"metadataoptions":{'
-                '"sentenceBoundaryEnabled":"false","wordBoundaryEnabled":"true"},'
+                f'"sentenceBoundaryEnabled":"{sq}","wordBoundaryEnabled":"{wd}"'
+                "},"
                '"outputFormat":"audio-24khz-48kbitrate-mono-mp3"'
                "}}}}\r\n"
            )
@@ -603,9 +630,9 @@ class Communicate:
            async for message in self.stream():
                if message["type"] == "audio":
                    audio.write(message["data"])
-                elif (
-                    isinstance(metadata, TextIOWrapper)
-                    and message["type"] == "WordBoundary"
+                elif isinstance(metadata, TextIOWrapper) and message["type"] in (
+                    "WordBoundary",
+                    "SentenceBoundary",
                ):
                    json.dump(message, metadata)
                    metadata.write("\n")