diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
new file mode 100644
index 0000000..bcaae1a
--- /dev/null
+++ b/.github/FUNDING.yml
@@ -0,0 +1 @@
+github: rany2
diff --git a/.gitignore b/.gitignore
index 19e9b0a..8e093ea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,3 +162,4 @@ cython_debug/
 # Edge-TTS specific ignores
 *.mp3
 *.srt
+/.idea/
diff --git a/examples/async_audio_streaming_with_predefined_voice_and_subtitles.py b/examples/async_audio_streaming_with_predefined_voice_and_subtitles.py
index 3ed2579..fc8b444 100644
--- a/examples/async_audio_streaming_with_predefined_voice_and_subtitles.py
+++ b/examples/async_audio_streaming_with_predefined_voice_and_subtitles.py
@@ -21,7 +21,7 @@ async def amain() -> None:
         async for chunk in communicate.stream():
             if chunk["type"] == "audio":
                 file.write(chunk["data"])
-            elif chunk["type"] == "WordBoundary":
+            elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
                 submaker.feed(chunk)
 
     with open(SRT_FILE, "w", encoding="utf-8") as file:
diff --git a/examples/sync_audio_streaming_with_predefined_voice_subtitles.py b/examples/sync_audio_streaming_with_predefined_voice_subtitles.py
index 8ef0a5a..f802521 100644
--- a/examples/sync_audio_streaming_with_predefined_voice_subtitles.py
+++ b/examples/sync_audio_streaming_with_predefined_voice_subtitles.py
@@ -20,7 +20,7 @@ def main() -> None:
         for chunk in communicate.stream_sync():
             if chunk["type"] == "audio":
                 file.write(chunk["data"])
-            elif chunk["type"] == "WordBoundary":
+            elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
                 submaker.feed(chunk)
 
     with open(SRT_FILE, "w", encoding="utf-8") as file:
diff --git a/examples/sync_audio_streaming_with_predefined_voice_subtitles_print2stdout.py b/examples/sync_audio_streaming_with_predefined_voice_subtitles_print2stdout.py
new file mode 100644
index 0000000..7c7890d
--- /dev/null
+++ b/examples/sync_audio_streaming_with_predefined_voice_subtitles_print2stdout.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+
+"""Sync variant of the async .stream() method: collect audio
+chunks in memory, feed boundary events to SubMaker, and print
+the generated SRT subtitles to stdout."""
+import sys
+
+import edge_tts
+
+TEXT = """君不见,黄河之水天上来,奔流到海不复回。
+君不见,高堂明镜悲白发,朝如青丝暮成雪。
+人生得意须尽欢,莫使金樽空对月。
+天生我材必有用,千金散尽还复来。
+烹羊宰牛且为乐,会须一饮三百杯。
+岑夫子,丹丘生,将进酒,杯莫停。
+与君歌一曲,请君为我倾耳听。
+钟鼓馔玉不足贵,但愿长醉不复醒。
+古来圣贤皆寂寞,惟有饮者留其名。
+陈王昔时宴平乐,斗酒十千恣欢谑。
+主人何为言少钱,径须沽取对君酌。
+五花马,千金裘,呼儿将出换美酒,与尔同销万古愁。"""
+VOICE = "zh-CN-YunjianNeural"
+
+
+def main() -> None:
+    """Main function"""
+    communicate = edge_tts.Communicate(TEXT, VOICE, Boundary="SentenceBoundary")
+    submaker = edge_tts.SubMaker()
+    stdout = sys.stdout
+    audio_bytes = []
+    for chunk in communicate.stream_sync():
+        if chunk["type"] == "audio":
+            audio_bytes.append(chunk["data"])
+        elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
+            submaker.feed(chunk)
+
+    stdout.write(f"audio length: {sum(len(chunk) for chunk in audio_bytes)} bytes\n")
+    stdout.write(submaker.get_srt())
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/edge_tts/communicate.py b/src/edge_tts/communicate.py
index aa45165..8a8a9fe 100644
--- a/src/edge_tts/communicate.py
+++ b/src/edge_tts/communicate.py
@@ -16,14 +16,17 @@ from typing import (
     Dict,
     Generator,
     List,
+    Literal,
     Optional,
     Tuple,
+    TypedDict,
     Union,
 )
 from xml.sax.saxutils import escape, unescape
 
 import aiohttp
 import certifi
+from typing_extensions import NotRequired, Unpack
 
 from .constants import DEFAULT_VOICE, SEC_MS_GEC_VERSION, WSS_HEADERS, WSS_URL
 from .data_classes import TTSConfig
@@ -328,6 +331,14 @@ def calc_max_mesg_size(tts_config: TTSConfig) -> int:
     return websocket_max_size - overhead_per_message
 
 
+class CommunicateRequest(TypedDict):
+    """
+    Optional keyword arguments accepted by Communicate.
+    """
+
+    Boundary: NotRequired[Literal["WordBoundary", "SentenceBoundary"]]
+
+
 class Communicate:
     """
     Communicate with the service.
@@ -345,9 +356,21 @@ class Communicate:
         proxy: Optional[str] = None,
         connect_timeout: Optional[int] = 10,
         receive_timeout: Optional[int] = 60,
+        **kwargs: Unpack[CommunicateRequest],
     ):
+        """
+        Args:
+            Boundary (str): The boundary type to request from the service.
+                Defaults to "WordBoundary".
+                Valid values are "WordBoundary" and "SentenceBoundary".
+                With "WordBoundary", the service emits one boundary event per word;
+                with "SentenceBoundary", one per sentence, which is better suited
+                to languages such as Chinese.
+        """
+
         # Validate TTS settings and store the TTSConfig object.
-        self.tts_config = TTSConfig(voice, rate, volume, pitch)
+        boundary = kwargs.get("Boundary", "WordBoundary")
+        self.tts_config = TTSConfig(voice, rate, volume, pitch, boundary)
 
         # Validate the text parameter.
         if not isinstance(text, str):
@@ -392,7 +415,7 @@ class Communicate:
     def __parse_metadata(self, data: bytes) -> TTSChunk:
         for meta_obj in json.loads(data)["Metadata"]:
             meta_type = meta_obj["Type"]
-            if meta_type == "WordBoundary":
+            if meta_type in ("WordBoundary", "SentenceBoundary"):
                 current_offset = (
                     meta_obj["Data"]["Offset"] + self.state["offset_compensation"]
                 )
@@ -411,12 +434,16 @@ class Communicate:
     async def __stream(self) -> AsyncGenerator[TTSChunk, None]:
         async def send_command_request() -> None:
             """Sends the command request to the service."""
+            word_boundary = self.tts_config.boundary == "WordBoundary"
+            wd = "true" if word_boundary else "false"
+            sb = "false" if word_boundary else "true"
             await websocket.send_str(
                 f"X-Timestamp:{date_to_string()}\r\n"
                 "Content-Type:application/json; charset=utf-8\r\n"
                 "Path:speech.config\r\n\r\n"
                 '{"context":{"synthesis":{"audio":{"metadataoptions":{'
-                '"sentenceBoundaryEnabled":"false","wordBoundaryEnabled":"true"},'
+                f'"sentenceBoundaryEnabled":"{sb}","wordBoundaryEnabled":"{wd}"'
+                "},"
                 '"outputFormat":"audio-24khz-48kbitrate-mono-mp3"'
                 "}}}}\r\n"
             )
@@ -603,9 +630,9 @@ class Communicate:
             async for message in self.stream():
                 if message["type"] == "audio":
                     audio.write(message["data"])
-                elif (
-                    isinstance(metadata, TextIOWrapper)
-                    and message["type"] == "WordBoundary"
+                elif isinstance(metadata, TextIOWrapper) and message["type"] in (
+                    "WordBoundary",
+                    "SentenceBoundary",
                 ):
                     json.dump(message, metadata)
                     metadata.write("\n")
diff --git a/src/edge_tts/data_classes.py b/src/edge_tts/data_classes.py
index 6fa0083..ec80194 100644
--- a/src/edge_tts/data_classes.py
+++ b/src/edge_tts/data_classes.py
@@ -17,6 +17,7 @@ class TTSConfig:
     rate: str
     volume: str
     pitch: str
+    boundary: str
 
     @staticmethod
     def validate_string_param(param_name: str, param_value: str, pattern: str) -> str:
diff --git a/src/edge_tts/submaker.py b/src/edge_tts/submaker.py
index 30da42a..56d2a24 100644
--- a/src/edge_tts/submaker.py
+++ b/src/edge_tts/submaker.py
@@ -1,4 +1,4 @@
-"""SubMaker module is used to generate subtitles from WordBoundary events."""
+"""SubMaker module is used to generate subtitles from WordBoundary and SentenceBoundary events."""
 
 from typing import List
 
@@ -9,7 +9,7 @@ from .typing import TTSChunk
 
 class SubMaker:
     """
-    SubMaker is used to generate subtitles from WordBoundary messages.
+    SubMaker is used to generate subtitles from WordBoundary and SentenceBoundary messages.
""" def __init__(self) -> None: @@ -17,15 +17,15 @@ class SubMaker: def feed(self, msg: TTSChunk) -> None: """ - Feed a WordBoundary message to the SubMaker object. + Feed a WordBoundary or SentenceBoundary message to the SubMaker object. Args: - msg (dict): The WordBoundary message. + msg (dict): The WordBoundary or SentenceBoundary message. Returns: None """ - if msg["type"] != "WordBoundary": + if msg["type"] not in ("WordBoundary", "SentenceBoundary"): raise ValueError("Invalid message type, expected 'WordBoundary'") self.cues.append( diff --git a/src/edge_tts/typing.py b/src/edge_tts/typing.py index 225293d..df92e87 100644 --- a/src/edge_tts/typing.py +++ b/src/edge_tts/typing.py @@ -10,11 +10,11 @@ from typing_extensions import Literal, NotRequired, TypedDict class TTSChunk(TypedDict): """TTS chunk data.""" - type: Literal["audio", "WordBoundary"] + type: Literal["audio", "WordBoundary", "SentenceBoundary"] data: NotRequired[bytes] # only for audio - duration: NotRequired[float] # only for WordBoundary - offset: NotRequired[float] # only for WordBoundary - text: NotRequired[str] # only for WordBoundary + duration: NotRequired[float] # only for WordBoundary and SentenceBoundary + offset: NotRequired[float] # only for WordBoundary and SentenceBoundary + text: NotRequired[str] # only for WordBoundary and SentenceBoundary class VoiceTag(TypedDict): diff --git a/src/edge_tts/util.py b/src/edge_tts/util.py index 38acdb1..180e262 100644 --- a/src/edge_tts/util.py +++ b/src/edge_tts/util.py @@ -72,7 +72,7 @@ async def _run_tts(args: UtilArgs) -> None: async for chunk in communicate.stream(): if chunk["type"] == "audio": audio_file.write(chunk["data"]) - elif chunk["type"] == "WordBoundary": + elif chunk["type"] in ("WordBoundary", "SentenceBoundary"): submaker.feed(chunk) if args.words_in_cue > 0: