Cleanup SentenceBoundary support (#396)

- Default to SentenceBoundary.
- Rename the Boundary argument to lowercase boundary to match the other options (see the migration sketch below).
- Drop merge_cues support, as SentenceBoundary renders it obsolete.
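
The change for callers, as a minimal sketch (not part of this diff): TEXT, VOICE, and
the output paths are placeholders, the streaming loop mirrors the example and util.py
changes below, and audio chunks are assumed to carry their bytes under the "data" key
as in the library's examples.

import asyncio

import edge_tts

TEXT = "Hello, world! This is a test."
VOICE = "zh-CN-YunjianNeural"


async def main() -> None:
    # Before: edge_tts.Communicate(TEXT, VOICE, Boundary="SentenceBoundary")
    # After:  lowercase keyword; SentenceBoundary is now the default anyway.
    communicate = edge_tts.Communicate(TEXT, VOICE, boundary="SentenceBoundary")
    submaker = edge_tts.SubMaker()

    with open("output.mp3", "wb") as audio_file:
        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                audio_file.write(chunk["data"])
            elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
                submaker.feed(chunk)

    # merge_cues() no longer exists; SentenceBoundary already yields one cue per sentence.
    with open("output.srt", "w", encoding="utf-8") as srt_file:
        srt_file.write(submaker.get_srt())


if __name__ == "__main__":
    asyncio.run(main())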

Signed-off-by: rany <rany2@riseup.net>
Author: rany2
Committed by: GitHub
Date: 2025-08-05 14:30:30 +03:00
Parent: 645c207cfd
Commit: c78e49d28e
5 changed files with 17 additions and 61 deletions

View File

@@ -24,7 +24,7 @@ VOICE = "zh-CN-YunjianNeural"
 def main() -> None:
     """Main function"""
-    communicate = edge_tts.Communicate(TEXT, VOICE, Boundary="SentenceBoundary")
+    communicate = edge_tts.Communicate(TEXT, VOICE, boundary="SentenceBoundary")
     submaker = edge_tts.SubMaker()
     stdout = sys.stdout
     audio_bytes = []

View File

@@ -19,14 +19,12 @@ from typing import (
     Literal,
     Optional,
     Tuple,
-    TypedDict,
     Union,
 )
 from xml.sax.saxutils import escape, unescape

 import aiohttp
 import certifi
-from typing_extensions import NotRequired, Unpack

 from .constants import DEFAULT_VOICE, SEC_MS_GEC_VERSION, WSS_HEADERS, WSS_URL
 from .data_classes import TTSConfig
@@ -311,19 +309,12 @@ def ssml_headers_plus_data(request_id: str, timestamp: str, ssml: str) -> str:
     )


-class CommunicateRequest(TypedDict):
-    """
-    A class to communicate with the service.
-    """
-
-    Boundary: NotRequired[Literal["WordBoundary", "SentenceBoundary"]]
-
-
 class Communicate:
     """
     Communicate with the service.
     """

+    # pylint: disable=too-many-arguments
     def __init__(
         self,
         text: str,
@@ -332,24 +323,13 @@ class Communicate:
         rate: str = "+0%",
         volume: str = "+0%",
         pitch: str = "+0Hz",
+        boundary: Literal["WordBoundary", "SentenceBoundary"] = "SentenceBoundary",
         connector: Optional[aiohttp.BaseConnector] = None,
         proxy: Optional[str] = None,
         connect_timeout: Optional[int] = 10,
         receive_timeout: Optional[int] = 60,
-        **kwargs: Unpack[CommunicateRequest],
     ):
-        """
-        Args:
-            boundary (str): The boundary to use for the TTS.
-                Defaults to "WordBoundary".
-                Valid values are "WordBoundary" and "SentenceBoundary".
-                If "WordBoundary", the TTS will return a word boundary for each word.
-                If "SentenceBoundary", the TTS will return a sentence boundary for each sentence.
-                Which is more friendly to Chinese users.
-        """
         # Validate TTS settings and store the TTSConfig object.
-        boundary = kwargs.get("Boundary", "WordBoundary")
         self.tts_config = TTSConfig(voice, rate, volume, pitch, boundary)

         # Validate the text parameter.

View File

@@ -5,6 +5,7 @@
 import argparse
 import re
 from dataclasses import dataclass
+from typing import Literal


 @dataclass
@@ -17,7 +18,7 @@ class TTSConfig:
     rate: str
     volume: str
     pitch: str
-    boundary: str
+    boundary: Literal["WordBoundary", "SentenceBoundary"]

     @staticmethod
     def validate_string_param(param_name: str, param_value: str, pattern: str) -> str:

View File

@@ -1,7 +1,7 @@
"""SubMaker module is used to generate subtitles from WordBoundary and SentenceBoundary events.""" """SubMaker module is used to generate subtitles from WordBoundary and SentenceBoundary events."""
from datetime import timedelta from datetime import timedelta
from typing import List from typing import List, Optional
from .srt_composer import Subtitle, compose from .srt_composer import Subtitle, compose
from .typing import TTSChunk from .typing import TTSChunk
@@ -14,6 +14,7 @@ class SubMaker:
     def __init__(self) -> None:
         self.cues: List[Subtitle] = []
+        self.type: Optional[str] = None

     def feed(self, msg: TTSChunk) -> None:
         """
@@ -26,7 +27,16 @@ class SubMaker:
             None
         """
         if msg["type"] not in ("WordBoundary", "SentenceBoundary"):
-            raise ValueError("Invalid message type, expected 'WordBoundary'")
+            raise ValueError(
+                "Invalid message type, expected 'WordBoundary' or 'SentenceBoundary'."
+            )
+
+        if self.type is None:
+            self.type = msg["type"]
+        elif self.type != msg["type"]:
+            raise ValueError(
+                f"Expected message type '{self.type}', but got '{msg['type']}'."
+            )

         self.cues.append(
             Subtitle(
@@ -37,38 +47,6 @@ class SubMaker:
             )
         )

-    def merge_cues(self, words: int) -> None:
-        """
-        Merge cues to reduce the number of cues.
-
-        Args:
-            words (int): The number of words to merge.
-
-        Returns:
-            None
-        """
-        if words <= 0:
-            raise ValueError("Invalid number of words to merge, expected > 0")
-
-        if len(self.cues) == 0:
-            return
-
-        new_cues: List[Subtitle] = []
-        current_cue: Subtitle = self.cues[0]
-        for cue in self.cues[1:]:
-            if len(current_cue.content.split()) < words:
-                current_cue = Subtitle(
-                    index=current_cue.index,
-                    start=current_cue.start,
-                    end=cue.end,
-                    content=f"{current_cue.content} {cue.content}",
-                )
-            else:
-                new_cues.append(current_cue)
-                current_cue = cue
-
-        new_cues.append(current_cue)
-        self.cues = new_cues
-
     def get_srt(self) -> str:
         """
         Get the SRT formatted subtitles from the SubMaker object.
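
Not part of the diff, but worth noting: feed() now latches onto the first boundary type
it sees and rejects a mix. A small illustrative sketch, assuming boundary chunks carry
"offset" and "duration" in 100-nanosecond units plus a "text" field, which is what
SubMaker reads when building cues:

from edge_tts import SubMaker

submaker = SubMaker()
submaker.feed(
    {"type": "SentenceBoundary", "offset": 0, "duration": 15_000_000, "text": "Hello there."}
)

# Feeding a different boundary type into the same SubMaker now raises ValueError,
# because self.type was set to "SentenceBoundary" by the first cue.
try:
    submaker.feed(
        {"type": "WordBoundary", "offset": 15_000_000, "duration": 5_000_000, "text": "Hi"}
    )
except ValueError as err:
    print(err)  # Expected message type 'SentenceBoundary', but got 'WordBoundary'.

print(submaker.get_srt())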

View File

@@ -75,9 +75,6 @@ async def _run_tts(args: UtilArgs) -> None:
             elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
                 submaker.feed(chunk)

-        if args.words_in_cue > 0:
-            submaker.merge_cues(args.words_in_cue)
-
         if sub_file is not None:
             sub_file.write(submaker.get_srt())
     finally: